player page parsing

This commit is contained in:
Robin Appelman 2023-11-15 23:26:40 +01:00
commit 7a1b207d66
5 changed files with 380 additions and 13 deletions

192
Cargo.lock generated
View file

@ -30,6 +30,15 @@ dependencies = [
"zerocopy",
]
[[package]]
name = "aho-corasick"
version = "1.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0"
dependencies = [
"memchr",
]
[[package]]
name = "autocfg"
version = "1.1.0"
@ -141,6 +150,15 @@ dependencies = [
"syn 2.0.39",
]
[[package]]
name = "deranged"
version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0f32d04922c60427da6f9fef14d042d9edddef64cb9d4ce0d64d0685fbeb1fd3"
dependencies = [
"powerfmt",
]
[[package]]
name = "derive_more"
version = "0.99.17"
@ -182,6 +200,15 @@ dependencies = [
"cfg-if",
]
[[package]]
name = "enum_primitive"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "be4551092f4d519593039259a9ed8daedf0da12e5109c5280338073eaeb81180"
dependencies = [
"num-traits 0.1.43",
]
[[package]]
name = "equivalent"
version = "1.0.1"
@ -614,6 +641,91 @@ version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e4a24736216ec316047a1fc4252e27dabb04218aa4a3f37c6e7ddbf1f9782b54"
[[package]]
name = "num"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8b7a8e9be5e039e2ff869df49155f1c06bd01ade2117ec783e56ab0932b67a8f"
dependencies = [
"num-bigint",
"num-complex",
"num-integer",
"num-iter",
"num-rational",
"num-traits 0.2.17",
]
[[package]]
name = "num-bigint"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5f6f7833f2cbf2360a6cfd58cd41a53aa7a90bd4c202f5b1c7dd2ed73c57b2c3"
dependencies = [
"autocfg",
"num-integer",
"num-traits 0.2.17",
]
[[package]]
name = "num-complex"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "747d632c0c558b87dbabbe6a82f3b4ae03720d0646ac5b7b4dae89394be5f2c5"
dependencies = [
"num-traits 0.2.17",
]
[[package]]
name = "num-integer"
version = "0.1.45"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "225d3389fb3509a24c93f5c29eb6bde2586b98d9f016636dff58d7c6f7569cd9"
dependencies = [
"autocfg",
"num-traits 0.2.17",
]
[[package]]
name = "num-iter"
version = "0.1.43"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7d03e6c028c5dc5cac6e2dec0efda81fc887605bb3d884578bb6d6bf7514e252"
dependencies = [
"autocfg",
"num-integer",
"num-traits 0.2.17",
]
[[package]]
name = "num-rational"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "12ac428b1cb17fce6f731001d307d351ec70a6d202fc2e60f7d4c5e42d8f4f07"
dependencies = [
"autocfg",
"num-bigint",
"num-integer",
"num-traits 0.2.17",
]
[[package]]
name = "num-traits"
version = "0.1.43"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "92e5113e9fd4cc14ded8e499429f396a20f98c772a47cc8622a736e1ec843c31"
dependencies = [
"num-traits 0.2.17",
]
[[package]]
name = "num-traits"
version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "39e3200413f237f41ab11ad6d161bc7239c84dcb631773ccd7de3dfe4b5c267c"
dependencies = [
"autocfg",
]
[[package]]
name = "num_cpus"
version = "1.16.0"
@ -810,6 +922,12 @@ version = "0.3.27"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964"
[[package]]
name = "powerfmt"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391"
[[package]]
name = "ppv-lite86"
version = "0.2.17"
@ -879,6 +997,35 @@ dependencies = [
"bitflags 1.3.2",
]
[[package]]
name = "regex"
version = "1.10.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "380b951a9c5e80ddfd6136919eef32310721aa4aacd4889a8d39124b026ab343"
dependencies = [
"aho-corasick",
"memchr",
"regex-automata",
"regex-syntax",
]
[[package]]
name = "regex-automata"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5f804c7828047e88b2d32e2d7fe5a105da8ee3264f01902f796c8e067dc2483f"
dependencies = [
"aho-corasick",
"memchr",
"regex-syntax",
]
[[package]]
name = "regex-syntax"
version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f"
[[package]]
name = "reqwest"
version = "0.11.22"
@ -1114,6 +1261,21 @@ version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3"
[[package]]
name = "steamid-ng"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ffb049f8faa2cba570c5366dbaf88ee5849725b16edb771848639fac92e33673"
dependencies = [
"enum_primitive",
"lazy_static",
"num",
"regex",
"serde",
"serde_derive",
"thiserror",
]
[[package]]
name = "string_cache"
version = "0.8.7"
@ -1227,6 +1389,34 @@ dependencies = [
"syn 2.0.39",
]
[[package]]
name = "time"
version = "0.3.30"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c4a34ab300f2dee6e562c10a046fc05e358b29f9bf92277f30c3c8d82275f6f5"
dependencies = [
"deranged",
"powerfmt",
"serde",
"time-core",
"time-macros",
]
[[package]]
name = "time-core"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3"
[[package]]
name = "time-macros"
version = "0.2.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ad70d68dba9e1f8aceda7aa6711965dfec1cac869f311a51bd08b3a2ccbce20"
dependencies = [
"time-core",
]
[[package]]
name = "tinyvec"
version = "1.6.0"
@ -1333,7 +1523,9 @@ dependencies = [
"miette",
"reqwest",
"scraper",
"steamid-ng",
"thiserror",
"time",
"tokio",
]

View file

@ -11,3 +11,5 @@ scraper = "0.18.1"
miette = "5.10.0"
thiserror = "1.0.50"
main_error = "0.1.2"
time = { version = "0.3.30", features = ["parsing", "macros"] }
steamid-ng = "1.0.0"

View file

@ -1,4 +1,30 @@
#[derive(Debug)]
use steamid_ng::SteamID;
use time::Date;
/// A player profile as produced by the player page parser.
#[derive(Debug, Clone)]
pub struct Player {
/// Player name text taken from the page.
pub name: String,
/// Steam id of the player.
pub steam_id: SteamID,
/// Honors (medals) listed for the player.
pub honors: Vec<Honors>,
/// Team memberships listed for the player.
pub teams: Vec<TeamMemberShip>,
}
/// A single honor (medal) entry from a player page.
#[derive(Debug, Clone)]
pub struct Honors {
/// Heading of the group the entry was listed under; the player parser
/// strips a trailing `" Medals"` from it.
pub format: String,
/// Season / league text of the entry.
pub season: String,
/// Team text of the entry.
pub team: String,
}
/// A player's membership in a team.
#[derive(Debug, Clone)]
pub struct TeamMemberShip {
/// The team the player belongs to.
pub team: TeamRef,
/// League text shown for the team.
pub league: String,
/// Date shown for the membership. The player parser stores `Date::MIN`
/// here when no date could be parsed from the page.
pub since: Date,
}
/// Lightweight reference to a team: its display name plus numeric id.
#[derive(Debug, Clone)]
pub struct TeamRef {
/// Team name text.
pub name: String,
/// Numeric team id. The player parser reads it from the part of the team
/// link after the last `=` and stores `0` when that is not a number.
pub id: u32,
}

View file

@ -1,4 +1,5 @@
use crate::Result;
use scraper::ElementRef;
mod player;
@ -8,3 +9,21 @@ pub trait Parser {
type Output;
fn parse(&self, document: &str) -> Result<Self::Output>;
}
/// Helpers for pulling text out of an element while ignoring text nodes
/// that consist only of whitespace.
trait ElementExt<'a> {
/// Returns the first text node that is not purely whitespace.
/// The returned text is not trimmed.
fn first_text(&'a self) -> Option<&'a str>;
/// Returns the `n`-th text node that is not purely whitespace, trimmed.
/// `n` is 1-based: `nth_text(1)` refers to the first such node.
fn nth_text(&'a self, n: usize) -> Option<&'a str>;
}
impl<'a> ElementExt<'a> for ElementRef<'a> {
    /// Returns the first descendant text node that is not purely whitespace.
    ///
    /// The text is returned as-is (not trimmed); `None` when the element has
    /// no such text node.
    fn first_text(&self) -> Option<&'a str> {
        self.text().find(|s| !s.trim().is_empty())
    }

    /// Returns the `n`-th (1-based) descendant text node that is not purely
    /// whitespace, with surrounding whitespace trimmed.
    ///
    /// Returns `None` when there are fewer than `n` such nodes, and also for
    /// `n == 0`, which has no 1-based meaning and previously underflowed
    /// `n - 1` (a panic in debug builds).
    fn nth_text(&self, n: usize) -> Option<&'a str> {
        let index = n.checked_sub(1)?;
        self.text()
            .filter(|s| !s.trim().is_empty())
            .nth(index)
            .map(str::trim)
    }
}

View file

@ -1,35 +1,163 @@
use scraper::{Html, Selector};
use super::Parser;
use super::{ElementExt, Parser};
use crate::data::{Honors, Player, TeamMemberShip, TeamRef};
use crate::{ParseError, Result};
use crate::data::Player;
use scraper::{ElementRef, Html, Selector};
use std::iter::repeat;
use steamid_ng::SteamID;
use time::{macros::format_description, Date};
// CSS selectors used to locate data on a player page.
//
// The stale pre-change definition of `SELECTOR_PLAYER_NAME`
// (".col-md-4 > h3 > b") has been dropped: defining the same constant twice
// does not compile, and the `.container`-scoped version is the current one.

/// Element holding the player name.
const SELECTOR_PLAYER_NAME: &str = ".container .col-md-4 > h3 > b";
/// Paragraph whose text nodes include the player's steam id.
const SELECTOR_PLAYER_ID: &str = ".container .col-md-4 > p.nomargin";
/// One honors group, inside the `.col-md-6` that is the second child.
const SELECTOR_PLAYER_HONORS_GROUP: &str =
    ".container .col-md-6:nth-child(2) .white-row-small .row-fluid";
/// Heading of an honors group (relative to the group).
const SELECTOR_PLAYER_HONORS_HEADER: &str = "h5";
/// League / season entries of an honors group (relative to the group).
const SELECTOR_PLAYER_HONORS_LEAGUE: &str = "li div";
/// Team entries of an honors group (relative to the group).
const SELECTOR_PLAYER_HONORS_TEAM: &str = "li small";
/// One team group, inside the `.col-md-6` that is the first child.
const SELECTOR_PLAYER_TEAM_GROUP: &str =
    ".container .col-md-6:nth-child(1) .white-row-small .row-fluid";
/// Link to the team page (relative to the team group).
const SELECTOR_PLAYER_TEAM_LINK: &str = "p a";
/// Team name (relative to the team group).
const SELECTOR_PLAYER_TEAM_NAME: &str = "span.text-primary b";
/// Element containing the league text (relative to the team group).
const SELECTOR_PLAYER_TEAM_LEAGUE: &str = "small";
/// Element containing the "since" date. This is the same element as
/// `SELECTOR_PLAYER_TEAM_LEAGUE`; the parser reads a different text node
/// of it for each value.
const SELECTOR_PLAYER_TEAM_SINCE: &str = "small";
/// Parser for player pages.
///
/// Holds one pre-compiled `Selector` per `SELECTOR_PLAYER_*` constant so the
/// selectors are compiled once in `PlayerParser::new` rather than on every
/// `parse` call.
pub struct PlayerParser {
// Compiled from SELECTOR_PLAYER_NAME.
selector_name: Selector,
// Compiled from SELECTOR_PLAYER_ID.
selector_id: Selector,
// Compiled from SELECTOR_PLAYER_HONORS_HEADER.
selector_honors_header: Selector,
// Compiled from SELECTOR_PLAYER_HONORS_GROUP.
selector_honors_group: Selector,
// Compiled from SELECTOR_PLAYER_HONORS_LEAGUE.
selector_honors_league: Selector,
// Compiled from SELECTOR_PLAYER_HONORS_TEAM.
selector_honors_team: Selector,
// Compiled from SELECTOR_PLAYER_TEAM_GROUP.
selector_team_group: Selector,
// Compiled from SELECTOR_PLAYER_TEAM_LINK.
selector_team_link: Selector,
// Compiled from SELECTOR_PLAYER_TEAM_NAME.
selector_team_name: Selector,
// Compiled from SELECTOR_PLAYER_TEAM_LEAGUE.
selector_team_league: Selector,
// Compiled from SELECTOR_PLAYER_TEAM_SINCE.
selector_team_since: Selector,
}
impl PlayerParser {
pub fn new() -> Self {
PlayerParser {
selector_name: Selector::parse(SELECTOR_PLAYER_NAME).unwrap(),
selector_id: Selector::parse(SELECTOR_PLAYER_ID).unwrap(),
selector_honors_header: Selector::parse(SELECTOR_PLAYER_HONORS_HEADER).unwrap(),
selector_honors_group: Selector::parse(SELECTOR_PLAYER_HONORS_GROUP).unwrap(),
selector_honors_league: Selector::parse(SELECTOR_PLAYER_HONORS_LEAGUE).unwrap(),
selector_honors_team: Selector::parse(SELECTOR_PLAYER_HONORS_TEAM).unwrap(),
selector_team_group: Selector::parse(SELECTOR_PLAYER_TEAM_GROUP).unwrap(),
selector_team_link: Selector::parse(SELECTOR_PLAYER_TEAM_LINK).unwrap(),
selector_team_name: Selector::parse(SELECTOR_PLAYER_TEAM_NAME).unwrap(),
selector_team_league: Selector::parse(SELECTOR_PLAYER_TEAM_LEAGUE).unwrap(),
selector_team_since: Selector::parse(SELECTOR_PLAYER_TEAM_SINCE).unwrap(),
}
}
}
/// Returns the trimmed first non-whitespace text node of the first element
/// under `el` that matches `selector`.
///
/// Falls back to the trimmed `default` when nothing matches or the matched
/// element has no non-whitespace text.
fn select_text<'a>(el: ElementRef<'a>, selector: &Selector, default: &'static str) -> &'a str {
    let fallback = default.trim();
    let Some(node) = el.select(selector).next() else {
        return fallback;
    };
    node.text()
        .find(|chunk| !chunk.trim().is_empty())
        .map_or(fallback, |chunk| chunk.trim())
}
/// Returns the trimmed last text node of the first element under `el` that
/// matches `selector`.
///
/// Unlike `select_text`, whitespace-only text nodes are not skipped. Falls
/// back to the trimmed `default` when nothing matches or the matched element
/// has no text nodes at all.
fn select_last_text<'a>(el: ElementRef<'a>, selector: &Selector, default: &'static str) -> &'a str {
    let mut matches = el.select(selector);
    let last_chunk = matches.next().and_then(|node| node.text().last());
    match last_chunk {
        Some(chunk) => chunk.trim(),
        None => default.trim(),
    }
}
impl Parser for PlayerParser {
    type Output = Player;

    /// Parses the HTML of a player page into a [`Player`].
    ///
    /// # Errors
    ///
    /// Returns `ParseError::ElementNotFound` when the player name element or
    /// the steam id element is missing from the document.
    ///
    /// Everything else is best-effort and falls back to a placeholder value
    /// instead of failing:
    /// - an unparsable steam id becomes `SteamID::default()`,
    /// - an unparsable team id becomes `0`,
    /// - an unparsable "since" date becomes `Date::MIN`,
    /// - missing honor / team texts become a fixed placeholder string.
    fn parse(&self, document: &str) -> Result<Self::Output> {
        let document = Html::parse_document(document);

        let name = document
            .select(&self.selector_name)
            .next()
            .ok_or(ParseError::ElementNotFound {
                selector: SELECTOR_PLAYER_NAME,
                role: "player name",
            })?
            .first_text()
            .unwrap_or_default()
            .to_string();

        // The steam3 id is the third non-whitespace text node of the element.
        let id = document
            .select(&self.selector_id)
            .next()
            .ok_or(ParseError::ElementNotFound {
                selector: SELECTOR_PLAYER_ID,
                role: "player steam id",
            })?
            .nth_text(3)
            .unwrap_or_default()
            .to_string();

        let honors = document
            .select(&self.selector_honors_group)
            .flat_map(|group| {
                let format =
                    select_text(group, &self.selector_honors_header, "format not detected")
                        .trim_end_matches(" Medals");
                let leagues = group.select(&self.selector_honors_league);
                let teams = group.select(&self.selector_honors_team);
                // Pair every league entry of the group with the team entry at
                // the same position, tagging each pair with the group heading.
                repeat(format).zip(leagues).zip(teams)
            })
            .map(|((format, season), team)| Honors {
                format: format.to_string(),
                season: season.text().next().unwrap_or_default().trim().to_string(),
                team: team.text().next().unwrap_or_default().trim().to_string(),
            })
            .collect();

        // Loop-invariant, so built once rather than per team.
        let since_format = format_description!("[month padding:none]/[day padding:none]/[year]");

        let teams = document
            .select(&self.selector_team_group)
            .filter_map(|item| {
                // Groups without a team link are not team entries; skip them.
                let link = item.select(&self.selector_team_link).next()?;
                let href = link.attr("href").unwrap_or("=0");

                // The team id is whatever follows the last `=` in the link.
                let id = href
                    .rsplit_once('=')
                    .and_then(|(_, id)| id.parse::<u32>().ok())
                    .unwrap_or_default();

                let name = select_text(item, &self.selector_team_name, "failed to find name");
                let league = select_text(item, &self.selector_team_league, "failed to find league");

                // The date is the last line of the element's last text node.
                let since = select_last_text(item, &self.selector_team_since, "")
                    .rsplit_once('\n')
                    .and_then(|(_, date)| Date::parse(date, &since_format).ok())
                    .unwrap_or(Date::MIN);

                Some(TeamMemberShip {
                    team: TeamRef {
                        name: name.to_string(),
                        id,
                    },
                    league: league.to_string(),
                    since,
                })
            })
            .collect();

        Ok(Player {
            name,
            steam_id: SteamID::from_steam3(&id).unwrap_or_default(),
            honors,
            teams,
        })
    }
}