mirror of
https://codeberg.org/icewind/ugc-scaper.git
synced 2026-06-03 10:14:11 +02:00
383 lines
16 KiB
Rust
383 lines
16 KiB
Rust
use super::{ElementExt, Parser, select_text_empty, whitespace_regex};
|
|
use crate::data::{Membership, NameChange, Record, Team};
|
|
use crate::parser::{
|
|
DATE_FORMAT, MEMBER_DATE_ALT_FORMAT, MEMBER_DATE_FORMAT, select_text, steam_id_from_link,
|
|
};
|
|
use crate::{ParseError, Result, ScrapeError};
|
|
use scraper::{Html, Selector};
|
|
use std::str::FromStr;
|
|
use time::{Date, PrimitiveDateTime, Time, UtcOffset};
|
|
use ugc_scraper_types::{GameMode, MembershipRole, Region};
|
|
|
|
const SELECTOR_TEAM_NAME: &str = ".container .col-md-12 h1 > b";
|
|
const SELECTOR_TEAM_TAG: &str = ".container .col-md-12 h1 > span";
|
|
const SELECTOR_TEAM_IMAGE: &str = ".container .col-md-12 a > img";
|
|
|
|
const SELECTOR_TEAM_FORMAT: &str = ".container .col-md-3 .white-row-small h5 .text-danger b";
|
|
const SELECTOR_TEAM_DIVISION: &str = ".container .col-md-3 .white-row-small h5 > b";
|
|
const SELECTOR_TEAM_TIMEZONE: &str = ".container .col-md-3 .white-row-small p > small > b";
|
|
const SELECTOR_TEAM_DESCRIPTION: &str =
|
|
".container .col-md-3 .white-row-small p:nth-child(4) > small";
|
|
const SELECTOR_TEAM_TITLES: &str = ".container .col-md-3 .white-row-small p > .text-warning";
|
|
|
|
const SELECTOR_TEAM_MEMBER_ROW: &str =
|
|
".container .white-row-small > .row-fluid > .col-md-12 > .white-row-light-small";
|
|
const SELECTOR_TEAM_MEMBER_LINK: &str = "b > a[href^=\"players_page\"]";
|
|
const SELECTOR_TEAM_MEMBER_STEAM_LINK: &str = "div > a[href*=\"profiles/\"]";
|
|
const SELECTOR_TEAM_MEMBER_ROLE: &str = ".tinytext";
|
|
const SELECTOR_TEAM_MEMBER_SINCE: &str = ".tinytext > em";
|
|
|
|
const SELECTOR_TEAM_RECORDS: &str =
|
|
".container .col-md-3 .white-row-small .table-responsive > table tbody tr";
|
|
const SELECTOR_TEAM_RECORD_SEASON: &str = "td:nth-child(1) small span b";
|
|
const SELECTOR_TEAM_RECORD_DIVISION: &str = "td:nth-child(2) small";
|
|
const SELECTOR_TEAM_RECORD_RESULT: &str = "td:nth-child(3)";
|
|
|
|
const SELECTOR_TEAM_NAME_CHANGE: &str =
|
|
".white-row-small:nth-child(3) .table-responsive table tbody tr";
|
|
const SELECTOR_TEAM_NAME_FROM_TAG: &str = "td:nth-child(1) small";
|
|
const SELECTOR_TEAM_NAME_FROM_NAME: &str = "td:nth-child(2) small";
|
|
const SELECTOR_TEAM_NAME_TO_TAG: &str = "td:nth-child(3) small";
|
|
const SELECTOR_TEAM_NAME_TO_NAME: &str = "td:nth-child(4) small";
|
|
const SELECTOR_TEAM_NAME_DATE: &str = "td:nth-child(5) small";
|
|
|
|
const SELECTOR_STEAM: &str = r#"a.btn.btn-xs.btn-default[href*="//steamcommunity.com/groups"]"#;
|
|
|
|
pub struct TeamParser {
|
|
selector_name: Selector,
|
|
selector_tag: Selector,
|
|
selector_image: Selector,
|
|
|
|
selector_team_format: Selector,
|
|
selector_team_division: Selector,
|
|
selector_team_timezone: Selector,
|
|
selector_team_description: Selector,
|
|
selector_team_titles: Selector,
|
|
selector_steam_group: Selector,
|
|
|
|
selector_team_member_row: Selector,
|
|
selector_team_member_link: Selector,
|
|
selector_team_member_steam_link: Selector,
|
|
selector_team_member_role: Selector,
|
|
selector_team_member_since: Selector,
|
|
|
|
selector_team_records: Selector,
|
|
selector_team_record_season: Selector,
|
|
selector_team_record_division: Selector,
|
|
selector_team_record_result: Selector,
|
|
|
|
selector_team_name_item: Selector,
|
|
selector_team_name_from_tag: Selector,
|
|
selector_team_name_from_name: Selector,
|
|
selector_team_name_to_tag: Selector,
|
|
selector_team_name_to_name: Selector,
|
|
selector_team_name_date: Selector,
|
|
}
|
|
|
|
impl Default for TeamParser {
|
|
fn default() -> Self {
|
|
Self::new()
|
|
}
|
|
}
|
|
|
|
impl TeamParser {
|
|
pub fn new() -> Self {
|
|
TeamParser {
|
|
selector_name: Selector::parse(SELECTOR_TEAM_NAME).unwrap(),
|
|
selector_tag: Selector::parse(SELECTOR_TEAM_TAG).unwrap(),
|
|
selector_image: Selector::parse(SELECTOR_TEAM_IMAGE).unwrap(),
|
|
|
|
selector_team_format: Selector::parse(SELECTOR_TEAM_FORMAT).unwrap(),
|
|
selector_team_division: Selector::parse(SELECTOR_TEAM_DIVISION).unwrap(),
|
|
selector_team_timezone: Selector::parse(SELECTOR_TEAM_TIMEZONE).unwrap(),
|
|
selector_team_description: Selector::parse(SELECTOR_TEAM_DESCRIPTION).unwrap(),
|
|
selector_team_titles: Selector::parse(SELECTOR_TEAM_TITLES).unwrap(),
|
|
selector_steam_group: Selector::parse(SELECTOR_STEAM).unwrap(),
|
|
|
|
selector_team_member_row: Selector::parse(SELECTOR_TEAM_MEMBER_ROW).unwrap(),
|
|
selector_team_member_link: Selector::parse(SELECTOR_TEAM_MEMBER_LINK).unwrap(),
|
|
selector_team_member_steam_link: Selector::parse(SELECTOR_TEAM_MEMBER_STEAM_LINK)
|
|
.unwrap(),
|
|
selector_team_member_role: Selector::parse(SELECTOR_TEAM_MEMBER_ROLE).unwrap(),
|
|
selector_team_member_since: Selector::parse(SELECTOR_TEAM_MEMBER_SINCE).unwrap(),
|
|
|
|
selector_team_records: Selector::parse(SELECTOR_TEAM_RECORDS).unwrap(),
|
|
selector_team_record_season: Selector::parse(SELECTOR_TEAM_RECORD_SEASON).unwrap(),
|
|
selector_team_record_division: Selector::parse(SELECTOR_TEAM_RECORD_DIVISION).unwrap(),
|
|
selector_team_record_result: Selector::parse(SELECTOR_TEAM_RECORD_RESULT).unwrap(),
|
|
|
|
selector_team_name_item: Selector::parse(SELECTOR_TEAM_NAME_CHANGE).unwrap(),
|
|
selector_team_name_from_tag: Selector::parse(SELECTOR_TEAM_NAME_FROM_TAG).unwrap(),
|
|
selector_team_name_from_name: Selector::parse(SELECTOR_TEAM_NAME_FROM_NAME).unwrap(),
|
|
selector_team_name_to_tag: Selector::parse(SELECTOR_TEAM_NAME_TO_TAG).unwrap(),
|
|
selector_team_name_to_name: Selector::parse(SELECTOR_TEAM_NAME_TO_NAME).unwrap(),
|
|
selector_team_name_date: Selector::parse(SELECTOR_TEAM_NAME_DATE).unwrap(),
|
|
}
|
|
}
|
|
}
|
|
|
|
impl Parser for TeamParser {
|
|
type Output = Team;
|
|
|
|
fn parse(&self, document: &str) -> Result<Self::Output> {
|
|
let whitespace_regex = whitespace_regex();
|
|
|
|
let document = Html::parse_document(document);
|
|
let root = document.root_element();
|
|
let mut name = select_text(root, &self.selector_name)
|
|
.unwrap_or_default()
|
|
.to_string();
|
|
|
|
let tag = select_text(root, &self.selector_tag)
|
|
.unwrap_or_default()
|
|
.to_string();
|
|
|
|
let image = document.select(&self.selector_image).next();
|
|
|
|
match (tag.as_str(), name.as_str(), image.is_some()) {
|
|
("", "", false) => return Err(ScrapeError::NotFound),
|
|
(_, "", true) => name = tag.clone(),
|
|
_ => {}
|
|
};
|
|
|
|
let image = image.and_then(|image| {
|
|
image
|
|
.attr("data-cfsrc")
|
|
.or_else(|| image.attr("src"))
|
|
.filter(|image| !image.ends_with("team_avatar_placeholder.png"))
|
|
.map(String::from)
|
|
});
|
|
|
|
let format_text =
|
|
select_text(root, &self.selector_team_format).ok_or(ParseError::ElementNotFound {
|
|
selector: SELECTOR_TEAM_FORMAT,
|
|
role: "team format",
|
|
})?;
|
|
let format = format_text
|
|
.parse::<GameMode>()
|
|
.map_err(|e| ParseError::InvalidText {
|
|
text: e.text,
|
|
role: "team game mode",
|
|
})?;
|
|
|
|
let steam_group = root
|
|
.select(&self.selector_steam_group)
|
|
.next()
|
|
.and_then(|link| {
|
|
link.attr("href")
|
|
.map(|group| group.replace("http://http", "http"))
|
|
});
|
|
|
|
let division = select_text(root, &self.selector_team_division)
|
|
.ok_or(ParseError::ElementNotFound {
|
|
selector: SELECTOR_TEAM_DIVISION,
|
|
role: "team division",
|
|
})?
|
|
.to_string();
|
|
|
|
let mut region = division
|
|
.split(' ')
|
|
.find_map(|part| Region::from_str(part).ok())
|
|
.or_else(|| Region::from_str(division.trim_end_matches("New Teams").trim()).ok())
|
|
.or_else(|| Region::from_str(&division).ok());
|
|
|
|
let timezone = select_text(root, &self.selector_team_timezone).map(String::from);
|
|
|
|
if let Some(timezone) = timezone.as_deref()
|
|
&& region.is_none()
|
|
{
|
|
region = Region::from_str(timezone).ok();
|
|
}
|
|
|
|
if region.is_none() {
|
|
region = format_text
|
|
.split(' ')
|
|
.find_map(|part| Region::from_str(part).ok());
|
|
}
|
|
|
|
let description = select_text(root, &self.selector_team_description)
|
|
.unwrap_or_default()
|
|
.replace('\n', " ");
|
|
|
|
let titles = document
|
|
.select(&self.selector_team_titles)
|
|
.next()
|
|
.map(|el| {
|
|
el.text()
|
|
.map(str::trim)
|
|
.filter(|s| !s.is_empty())
|
|
.map(String::from)
|
|
.collect()
|
|
})
|
|
.unwrap_or_default();
|
|
|
|
let results = document
|
|
.select(&self.selector_team_records)
|
|
.map(|record| {
|
|
let season = select_text(record, &self.selector_team_record_season).ok_or(
|
|
ParseError::ElementNotFound {
|
|
selector: SELECTOR_TEAM_RECORD_SEASON,
|
|
role: "team record season",
|
|
},
|
|
)?;
|
|
let division = select_text(record, &self.selector_team_record_division)
|
|
.ok_or(ParseError::ElementNotFound {
|
|
selector: SELECTOR_TEAM_RECORD_DIVISION,
|
|
role: "team record division",
|
|
})?
|
|
.to_string();
|
|
let result = select_text(record, &self.selector_team_record_result).ok_or(
|
|
ParseError::ElementNotFound {
|
|
selector: SELECTOR_TEAM_RECORD_RESULT,
|
|
role: "team record result",
|
|
},
|
|
)?;
|
|
|
|
let (wins, losses) =
|
|
result
|
|
.split_once('-')
|
|
.ok_or_else(|| ParseError::InvalidText {
|
|
text: result.to_string(),
|
|
role: "team record result",
|
|
})?;
|
|
|
|
Ok(Record {
|
|
season: season.parse().map_err(|_| ParseError::InvalidText {
|
|
text: season.to_string(),
|
|
role: "team record season",
|
|
})?,
|
|
division,
|
|
wins: wins.parse().map_err(|_| ParseError::InvalidText {
|
|
text: wins.to_string(),
|
|
role: "team record wins",
|
|
})?,
|
|
losses: losses.parse().map_err(|_| ParseError::InvalidText {
|
|
text: losses.to_string(),
|
|
role: "team record losses",
|
|
})?,
|
|
})
|
|
})
|
|
.collect::<Result<Vec<_>>>()?;
|
|
|
|
let members = document
|
|
.select(&self.selector_team_member_row)
|
|
.map(|row| {
|
|
let link = row
|
|
.select(&self.selector_team_member_link)
|
|
.next()
|
|
.or_else(|| row.select(&self.selector_team_member_steam_link).next())
|
|
.ok_or(ParseError::ElementNotFound {
|
|
selector: SELECTOR_TEAM_MEMBER_LINK,
|
|
role: "team member link",
|
|
})?;
|
|
let name = link.first_text().unwrap_or_default().to_string();
|
|
let link = link.attr("href").unwrap_or_default();
|
|
|
|
let role = select_text(row, &self.selector_team_member_role)
|
|
.ok_or(ParseError::ElementNotFound {
|
|
selector: SELECTOR_TEAM_MEMBER_ROLE,
|
|
role: "team member role",
|
|
})?
|
|
.split('\n')
|
|
.next()
|
|
.unwrap()
|
|
.parse::<MembershipRole>()
|
|
.map_err(|err| ParseError::InvalidText {
|
|
role: "member role",
|
|
text: err.text,
|
|
})?;
|
|
|
|
let since = select_text(row, &self.selector_team_member_since).ok_or(
|
|
ParseError::ElementNotFound {
|
|
selector: SELECTOR_TEAM_MEMBER_SINCE,
|
|
role: "team member since",
|
|
},
|
|
)?;
|
|
let since = whitespace_regex.replace_all(since.trim(), " ");
|
|
let since = if since.starts_with('(') {
|
|
let part = since
|
|
.split_once('-')
|
|
.unwrap_or_default()
|
|
.0
|
|
.trim()
|
|
.trim_start_matches('(');
|
|
let date = Date::parse(part, MEMBER_DATE_ALT_FORMAT).map_err(|_| {
|
|
ParseError::InvalidDate {
|
|
role: "member join date (alternate format)",
|
|
date: since.to_string(),
|
|
}
|
|
})?;
|
|
PrimitiveDateTime::new(date, Time::MIDNIGHT).assume_offset(UtcOffset::UTC)
|
|
} else {
|
|
PrimitiveDateTime::parse(&since, MEMBER_DATE_FORMAT)
|
|
.map_err(|_| ParseError::InvalidDate {
|
|
role: "member join date",
|
|
date: since.to_string(),
|
|
})?
|
|
.assume_offset(UtcOffset::from_hms(-5, 0, 0).unwrap())
|
|
};
|
|
|
|
Ok(Membership {
|
|
name,
|
|
steam_id: steam_id_from_link(link)?,
|
|
role,
|
|
since,
|
|
})
|
|
})
|
|
.collect::<Result<Vec<_>>>()?;
|
|
|
|
let name_changes = document
|
|
.select(&self.selector_team_name_item)
|
|
.map(|row| {
|
|
let from_tag =
|
|
select_text(row, &self.selector_team_name_from_tag).unwrap_or_default();
|
|
let from_name = select_text_empty(row, &self.selector_team_name_from_name).ok_or(
|
|
ParseError::ElementNotFound {
|
|
selector: SELECTOR_TEAM_NAME_FROM_NAME,
|
|
role: "team name change from name",
|
|
},
|
|
)?;
|
|
let to_tag = select_text(row, &self.selector_team_name_to_tag).unwrap_or_default();
|
|
let to_name = select_text_empty(row, &self.selector_team_name_to_name).ok_or(
|
|
ParseError::ElementNotFound {
|
|
selector: SELECTOR_TEAM_NAME_TO_NAME,
|
|
role: "team name change to name",
|
|
},
|
|
)?;
|
|
let date = select_text(row, &self.selector_team_name_date).ok_or(
|
|
ParseError::ElementNotFound {
|
|
selector: SELECTOR_TEAM_NAME_DATE,
|
|
role: "team name change date",
|
|
},
|
|
)?;
|
|
let date = Date::parse(date, DATE_FORMAT).map_err(|_| ParseError::InvalidDate {
|
|
date: date.to_string(),
|
|
role: "team name change date",
|
|
})?;
|
|
Ok(NameChange {
|
|
from_tag: from_tag.to_string(),
|
|
from: from_name.to_string(),
|
|
to_tag: to_tag.to_string(),
|
|
to: to_name.to_string(),
|
|
date,
|
|
})
|
|
})
|
|
.collect::<Result<_>>()?;
|
|
|
|
Ok(Team {
|
|
name,
|
|
description,
|
|
division,
|
|
timezone,
|
|
format,
|
|
steam_group,
|
|
image,
|
|
tag,
|
|
titles,
|
|
results,
|
|
members,
|
|
name_changes,
|
|
region,
|
|
})
|
|
}
|
|
}
|