team parse fixes

This commit is contained in:
Robin Appelman 2025-04-14 16:14:01 +02:00
commit ddbac7dc79
16 changed files with 4505 additions and 295 deletions

1
Cargo.lock generated
View file

@ -1884,6 +1884,7 @@ version = "0.5.0"
dependencies = [ dependencies = [
"insta", "insta",
"main_error", "main_error",
"regex",
"reqwest", "reqwest",
"scraper", "scraper",
"steamid-ng", "steamid-ng",

View file

@ -15,6 +15,7 @@ thiserror = "2.0.3"
time = { version = "0.3.41", features = ["parsing", "macros"] } time = { version = "0.3.41", features = ["parsing", "macros"] }
steamid-ng = "1.0.0" steamid-ng = "1.0.0"
ugc-scraper-types = { version = "0.2.0", path = "./types" } ugc-scraper-types = { version = "0.2.0", path = "./types" }
regex = "1.11.1"
[dev-dependencies] [dev-dependencies]
tokio = { version = "1.44.2", features = ["macros", "rt-multi-thread", "rt"] } tokio = { version = "1.44.2", features = ["macros", "rt-multi-thread", "rt"] }

583
api-server/Cargo.lock generated

File diff suppressed because it is too large Load diff

View file

@ -7,7 +7,8 @@ edition = "2021"
tokio = { version = "1.44.2", features = ["macros", "rt-multi-thread", "rt", "signal"] } tokio = { version = "1.44.2", features = ["macros", "rt-multi-thread", "rt", "signal"] }
main_error = "0.1.2" main_error = "0.1.2"
#ugc-scraper = { version = "*", path = ".." } #ugc-scraper = { version = "*", path = ".." }
ugc-scraper = "0.4.4" ugc-scraper = { version = "0.5.0", git = "https://github.com/icewind1991/ugc-scaper" }
#ugc-scraper = "0.5.0"
axum = "0.8.3" axum = "0.8.3"
steamid-ng = "1.0.0" steamid-ng = "1.0.0"
thiserror = "2.0.12" thiserror = "2.0.12"

View file

@ -62,15 +62,15 @@ async fn main() -> MainResult {
// build our application with a route // build our application with a route
let app = Router::new() let app = Router::new()
.route("/", get(handler)) .route("/", get(handler))
.route("/player/:id", get(player)) .route("/player/{id}", get(player))
.route("/player/:id/history", get(player_history)) .route("/player/{id}/history", get(player_history))
.route("/teams/:format", get(teams)) .route("/teams/{format}", get(teams))
.route("/transactions/:format", get(transactions)) .route("/transactions/{format}", get(transactions))
.route("/team/:id", get(team)) .route("/team/{id}", get(team))
.route("/team/:id/roster", get(team_roster)) .route("/team/{id}/roster", get(team_roster))
.route("/team/:id/matches", get(team_matches)) .route("/team/{id}/matches", get(team_matches))
.route("/match/:id", get(match_page)) .route("/match/{id}", get(match_page))
.route("/maps/:format", get(map_history)) .route("/maps/{format}", get(map_history))
.with_state(AppState::default()); .with_state(AppState::default());
let listener = TcpListener::bind((Ipv4Addr::new(127, 0, 0, 1), port)).await?; let listener = TcpListener::bind((Ipv4Addr::new(127, 0, 0, 1), port)).await?;

View file

@ -29,5 +29,8 @@ rustPlatform.buildRustPackage rec {
cargoLock = { cargoLock = {
lockFile = ./api-server/Cargo.lock; lockFile = ./api-server/Cargo.lock;
outputHashes = {
"ugc-scraper-0.5.0" = "sha256-xuvuhNLKCgI/wPhMXPxBlgZGdkn6qnpxCV17TCNg/xM=";
};
}; };
} }

View file

@ -58,7 +58,7 @@ fn select_last_text<'a>(el: ElementRef<'a>, selector: &Selector) -> Option<&'a s
const DATE_FORMAT: &[FormatItem<'static>] = const DATE_FORMAT: &[FormatItem<'static>] =
format_description!("[month padding:none]/[day padding:none]/[year]"); format_description!("[month padding:none]/[day padding:none]/[year]");
const MEMBER_DATE_FORMAT: &[FormatItem<'static>] = format_description!( const MEMBER_DATE_FORMAT: &[FormatItem<'static>] = format_description!(
"[month repr:short] [day padding:none], [year]\n/\n[hour padding:none]:[minute] [period]\n(ET)" "[month repr:short] [day padding:none], [year] / [hour padding:none]:[minute] [period] (ET)"
); );
const MEMBER_DATE_ALT_FORMAT: &[FormatItem<'static>] = const MEMBER_DATE_ALT_FORMAT: &[FormatItem<'static>] =
format_description!("[month repr:short] [day padding:none], [year]"); format_description!("[month repr:short] [day padding:none], [year]");

View file

@ -4,8 +4,10 @@ use crate::parser::{
select_text, steam_id_from_link, DATE_FORMAT, MEMBER_DATE_ALT_FORMAT, MEMBER_DATE_FORMAT, select_text, steam_id_from_link, DATE_FORMAT, MEMBER_DATE_ALT_FORMAT, MEMBER_DATE_FORMAT,
}; };
use crate::{ParseError, Result, ScrapeError}; use crate::{ParseError, Result, ScrapeError};
use regex::Regex;
use scraper::{Html, Selector}; use scraper::{Html, Selector};
use std::str::FromStr; use std::str::FromStr;
use std::sync::OnceLock;
use time::{Date, PrimitiveDateTime, Time, UtcOffset}; use time::{Date, PrimitiveDateTime, Time, UtcOffset};
use ugc_scraper_types::{GameMode, Region}; use ugc_scraper_types::{GameMode, Region};
@ -112,10 +114,14 @@ impl TeamParser {
} }
} }
static WHITESPACE_REGEX: OnceLock<Regex> = OnceLock::new();
impl Parser for TeamParser { impl Parser for TeamParser {
type Output = Team; type Output = Team;
fn parse(&self, document: &str) -> Result<Self::Output> { fn parse(&self, document: &str) -> Result<Self::Output> {
let whitespace_regex = WHITESPACE_REGEX.get_or_init(|| Regex::new("[\n\t ]+").unwrap());
let document = Html::parse_document(document); let document = Html::parse_document(document);
let root = document.root_element(); let root = document.root_element();
let mut name = select_text(root, &self.selector_name) let mut name = select_text(root, &self.selector_name)
@ -175,6 +181,7 @@ impl Parser for TeamParser {
let region = division let region = division
.split(' ') .split(' ')
.find_map(|part| Region::from_str(part).ok()) .find_map(|part| Region::from_str(part).ok())
.or_else(|| Region::from_str(&division).ok())
.ok_or_else(|| ParseError::InvalidText { .ok_or_else(|| ParseError::InvalidText {
text: division.clone(), text: division.clone(),
role: "team region", role: "team region",
@ -279,7 +286,7 @@ impl Parser for TeamParser {
}, },
)?; )?;
let role = role.trim().to_string(); let role = role.trim().to_string();
let since = since.trim(); let since = whitespace_regex.replace_all(since.trim(), " ");
let since = if since.starts_with('(') { let since = if since.starts_with('(') {
let part = since let part = since
.split_once('-') .split_once('-')
@ -295,7 +302,7 @@ impl Parser for TeamParser {
})?; })?;
PrimitiveDateTime::new(date, Time::MIDNIGHT).assume_offset(UtcOffset::UTC) PrimitiveDateTime::new(date, Time::MIDNIGHT).assume_offset(UtcOffset::UTC)
} else { } else {
PrimitiveDateTime::parse(since, MEMBER_DATE_FORMAT) PrimitiveDateTime::parse(&since, MEMBER_DATE_FORMAT)
.map_err(|_| ParseError::InvalidDate { .map_err(|_| ParseError::InvalidDate {
role: "member join date", role: "member join date",
date: since.to_string(), date: since.to_string(),

4080
tests/data/team_29228.html Normal file

File diff suppressed because it is too large Load diff

View file

@ -34,6 +34,7 @@ fn test_parse_player_details_html(input: &str, name: &str) {
#[test_case("team_8157.html", "team_no_tz")] #[test_case("team_8157.html", "team_no_tz")]
#[test_case("team_6929.html", "team_changed_name")] #[test_case("team_6929.html", "team_changed_name")]
#[test_case("team_32437.html", "team_empty_name_change")] #[test_case("team_32437.html", "team_empty_name_change")]
#[test_case("team_29228.html", "team_newlines_join_date")]
#[cfg(feature = "serde")] #[cfg(feature = "serde")]
fn test_parse_team_html(input: &str, name: &str) { fn test_parse_team_html(input: &str, name: &str) {
let body = read_to_string(format!("tests/data/{input}")).unwrap(); let body = read_to_string(format!("tests/data/{input}")).unwrap();

View file

@ -7,7 +7,7 @@ expression: parsed
"tag": "Melting Pot", "tag": "Melting Pot",
"image": "https://steamcdn-a.akamaihd.net/steamcommunity/public/images/avatars/f7/f75809d7774c917be9883370d772d3099bfe457d_full.jpg", "image": "https://steamcdn-a.akamaihd.net/steamcommunity/public/images/avatars/f7/f75809d7774c917be9883370d772d3099bfe457d_full.jpg",
"format": "9v9", "format": "9v9",
"region": "Euro", "region": "Europe",
"timezone": "West-Euro", "timezone": "West-Euro",
"steam_group": "http://steamcommunity.com/groups/Melintongpotsss", "steam_group": "http://steamcommunity.com/groups/Melintongpotsss",
"division": "Euro Platinum", "division": "Euro Platinum",

View file

@ -7,6 +7,7 @@ expression: parsed
"tag": "Europe", "tag": "Europe",
"image": "https://steamcdn-a.akamaihd.net/steamcommunity/public/images/avatars/09/096a30b1025c586f9d41c686077129f6e86998d0_full.jpg", "image": "https://steamcdn-a.akamaihd.net/steamcommunity/public/images/avatars/09/096a30b1025c586f9d41c686077129f6e86998d0_full.jpg",
"format": "6v6", "format": "6v6",
"region": "Europe",
"timezone": "West-Euro", "timezone": "West-Euro",
"steam_group": null, "steam_group": null,
"division": "Europe", "division": "Europe",

View file

@ -7,6 +7,7 @@ expression: parsed
"tag": "PNKTSU", "tag": "PNKTSU",
"image": "clan_avatars/32437_thumbnail.jpg", "image": "clan_avatars/32437_thumbnail.jpg",
"format": "9v9", "format": "9v9",
"region": "NorthAmerica",
"timezone": null, "timezone": null,
"steam_group": "https://steamcommunity.com/groups/xyxxx-", "steam_group": "https://steamcommunity.com/groups/xyxxx-",
"division": "North America", "division": "North America",

View file

@ -7,7 +7,7 @@ expression: parsed
"tag": "-Xe-", "tag": "-Xe-",
"image": "https://steamcdn-a.akamaihd.net/steamcommunity/public/images/avatars/db/dbabbd8bab7ccf6d27a9d4ca2e73a76e085bb201_full.jpg", "image": "https://steamcdn-a.akamaihd.net/steamcommunity/public/images/avatars/db/dbabbd8bab7ccf6d27a9d4ca2e73a76e085bb201_full.jpg",
"format": "9v9", "format": "9v9",
"region": "Euro", "region": "Europe",
"timezone": "West-Euro", "timezone": "West-Euro",
"steam_group": "https://steamcommunity.com/groups/XenonxTF2", "steam_group": "https://steamcommunity.com/groups/XenonxTF2",
"division": "Euro Platinum", "division": "Euro Platinum",

View file

@ -0,0 +1,87 @@
---
source: tests/snapshot.rs
expression: parsed
---
{
"name": "#1 Intercollegiate TF2 Team",
"tag": "#1TF2Team",
"image": "https://steamcdn-a.akamaihd.net/steamcommunity/public/images/avatars/b9/b9cc1e4949f5e6d0364f9ee59a13cc931fd3aab1_full.jpg",
"format": "9v9",
"region": "NorthAmerica",
"timezone": "East",
"steam_group": "http://steamcommunity.com/groups/NumberOneTF2Team",
"division": "Main NA",
"description": "We're uncontested #1. (pls contact copycat for any team stuff)",
"titles": [],
"members": [
{
"name": "copycat",
"steam_id": "76561198072397106",
"role": "Leader",
"since": "+002019-01-22T11:04:00.000000000-05:00"
},
{
"name": "Mei",
"steam_id": "76561198134574357",
"role": "Member",
"since": "+002019-01-22T11:07:00.000000000-05:00"
},
{
"name": "melstrom",
"steam_id": "76561198068890768",
"role": "Member",
"since": "+002019-01-22T11:07:00.000000000-05:00"
},
{
"name": "Otter Speaking Eng",
"steam_id": "76561198073466450",
"role": "Member",
"since": "+002019-01-22T11:13:00.000000000-05:00"
},
{
"name": "DarkSlayer415",
"steam_id": "76561198053913751",
"role": "Member",
"since": "+002019-01-22T05:54:00.000000000-05:00"
},
{
"name": "corn face",
"steam_id": "76561198066113821",
"role": "Member",
"since": "+002019-01-22T06:59:00.000000000-05:00"
},
{
"name": "erin",
"steam_id": "76561198307572837",
"role": "Member",
"since": "+002019-01-22T07:17:00.000000000-05:00"
},
{
"name": "Naps",
"steam_id": "76561198061524698",
"role": "Member",
"since": "+002019-01-23T04:16:00.000000000-05:00"
},
{
"name": "java",
"steam_id": "76561198158291013",
"role": "Member",
"since": "+002019-01-23T08:52:00.000000000-05:00"
}
],
"results": [
{
"season": 43,
"division": "Main NA",
"wins": 0,
"losses": 0
},
{
"season": 27,
"division": "Main NA",
"wins": 2,
"losses": 4
}
],
"name_changes": []
}

View file

@ -359,7 +359,7 @@ pub struct InvalidRegion {
#[derive(Serialize, Deserialize, Copy, Clone, Debug)] #[derive(Serialize, Deserialize, Copy, Clone, Debug)]
pub enum Region { pub enum Region {
Euro, Europe,
NorthAmerica, NorthAmerica,
SouthAmerica, SouthAmerica,
Asia, Asia,
@ -371,11 +371,13 @@ impl FromStr for Region {
fn from_str(s: &str) -> Result<Self, Self::Err> { fn from_str(s: &str) -> Result<Self, Self::Err> {
match s { match s {
"Euro" => Ok(Region::Euro), "Euro" => Ok(Region::Europe),
"EU" => Ok(Region::Euro), "Europe" => Ok(Region::Europe),
"EU" => Ok(Region::Europe),
"Asia" => Ok(Region::Asia), "Asia" => Ok(Region::Asia),
"ASIA" => Ok(Region::Asia), "ASIA" => Ok(Region::Asia),
"NA" => Ok(Region::NorthAmerica), "NA" => Ok(Region::NorthAmerica),
"North America" => Ok(Region::NorthAmerica),
"South American" => Ok(Region::SouthAmerica), "South American" => Ok(Region::SouthAmerica),
"SA" => Ok(Region::SouthAmerica), "SA" => Ok(Region::SouthAmerica),
"AUS" => Ok(Region::Australia), "AUS" => Ok(Region::Australia),