mirror of
https://codeberg.org/icewind/ugc-scaper.git
synced 2026-06-03 18:24:10 +02:00
player page parsing
This commit is contained in:
parent
0c58410f6a
commit
7a1b207d66
5 changed files with 380 additions and 13 deletions
192
Cargo.lock
generated
192
Cargo.lock
generated
|
|
@ -30,6 +30,15 @@ dependencies = [
|
|||
"zerocopy",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "aho-corasick"
|
||||
version = "1.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "autocfg"
|
||||
version = "1.1.0"
|
||||
|
|
@ -141,6 +150,15 @@ dependencies = [
|
|||
"syn 2.0.39",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "deranged"
|
||||
version = "0.3.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0f32d04922c60427da6f9fef14d042d9edddef64cb9d4ce0d64d0685fbeb1fd3"
|
||||
dependencies = [
|
||||
"powerfmt",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "derive_more"
|
||||
version = "0.99.17"
|
||||
|
|
@ -182,6 +200,15 @@ dependencies = [
|
|||
"cfg-if",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "enum_primitive"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "be4551092f4d519593039259a9ed8daedf0da12e5109c5280338073eaeb81180"
|
||||
dependencies = [
|
||||
"num-traits 0.1.43",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "equivalent"
|
||||
version = "1.0.1"
|
||||
|
|
@ -614,6 +641,91 @@ version = "1.0.4"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e4a24736216ec316047a1fc4252e27dabb04218aa4a3f37c6e7ddbf1f9782b54"
|
||||
|
||||
[[package]]
|
||||
name = "num"
|
||||
version = "0.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8b7a8e9be5e039e2ff869df49155f1c06bd01ade2117ec783e56ab0932b67a8f"
|
||||
dependencies = [
|
||||
"num-bigint",
|
||||
"num-complex",
|
||||
"num-integer",
|
||||
"num-iter",
|
||||
"num-rational",
|
||||
"num-traits 0.2.17",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num-bigint"
|
||||
version = "0.3.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5f6f7833f2cbf2360a6cfd58cd41a53aa7a90bd4c202f5b1c7dd2ed73c57b2c3"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"num-integer",
|
||||
"num-traits 0.2.17",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num-complex"
|
||||
version = "0.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "747d632c0c558b87dbabbe6a82f3b4ae03720d0646ac5b7b4dae89394be5f2c5"
|
||||
dependencies = [
|
||||
"num-traits 0.2.17",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num-integer"
|
||||
version = "0.1.45"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "225d3389fb3509a24c93f5c29eb6bde2586b98d9f016636dff58d7c6f7569cd9"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"num-traits 0.2.17",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num-iter"
|
||||
version = "0.1.43"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7d03e6c028c5dc5cac6e2dec0efda81fc887605bb3d884578bb6d6bf7514e252"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"num-integer",
|
||||
"num-traits 0.2.17",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num-rational"
|
||||
version = "0.3.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "12ac428b1cb17fce6f731001d307d351ec70a6d202fc2e60f7d4c5e42d8f4f07"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"num-bigint",
|
||||
"num-integer",
|
||||
"num-traits 0.2.17",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num-traits"
|
||||
version = "0.1.43"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "92e5113e9fd4cc14ded8e499429f396a20f98c772a47cc8622a736e1ec843c31"
|
||||
dependencies = [
|
||||
"num-traits 0.2.17",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num-traits"
|
||||
version = "0.2.17"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "39e3200413f237f41ab11ad6d161bc7239c84dcb631773ccd7de3dfe4b5c267c"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num_cpus"
|
||||
version = "1.16.0"
|
||||
|
|
@ -810,6 +922,12 @@ version = "0.3.27"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964"
|
||||
|
||||
[[package]]
|
||||
name = "powerfmt"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391"
|
||||
|
||||
[[package]]
|
||||
name = "ppv-lite86"
|
||||
version = "0.2.17"
|
||||
|
|
@ -879,6 +997,35 @@ dependencies = [
|
|||
"bitflags 1.3.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex"
|
||||
version = "1.10.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "380b951a9c5e80ddfd6136919eef32310721aa4aacd4889a8d39124b026ab343"
|
||||
dependencies = [
|
||||
"aho-corasick",
|
||||
"memchr",
|
||||
"regex-automata",
|
||||
"regex-syntax",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex-automata"
|
||||
version = "0.4.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5f804c7828047e88b2d32e2d7fe5a105da8ee3264f01902f796c8e067dc2483f"
|
||||
dependencies = [
|
||||
"aho-corasick",
|
||||
"memchr",
|
||||
"regex-syntax",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex-syntax"
|
||||
version = "0.8.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f"
|
||||
|
||||
[[package]]
|
||||
name = "reqwest"
|
||||
version = "0.11.22"
|
||||
|
|
@ -1114,6 +1261,21 @@ version = "1.2.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3"
|
||||
|
||||
[[package]]
|
||||
name = "steamid-ng"
|
||||
version = "1.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ffb049f8faa2cba570c5366dbaf88ee5849725b16edb771848639fac92e33673"
|
||||
dependencies = [
|
||||
"enum_primitive",
|
||||
"lazy_static",
|
||||
"num",
|
||||
"regex",
|
||||
"serde",
|
||||
"serde_derive",
|
||||
"thiserror",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "string_cache"
|
||||
version = "0.8.7"
|
||||
|
|
@ -1227,6 +1389,34 @@ dependencies = [
|
|||
"syn 2.0.39",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "time"
|
||||
version = "0.3.30"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c4a34ab300f2dee6e562c10a046fc05e358b29f9bf92277f30c3c8d82275f6f5"
|
||||
dependencies = [
|
||||
"deranged",
|
||||
"powerfmt",
|
||||
"serde",
|
||||
"time-core",
|
||||
"time-macros",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "time-core"
|
||||
version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3"
|
||||
|
||||
[[package]]
|
||||
name = "time-macros"
|
||||
version = "0.2.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4ad70d68dba9e1f8aceda7aa6711965dfec1cac869f311a51bd08b3a2ccbce20"
|
||||
dependencies = [
|
||||
"time-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tinyvec"
|
||||
version = "1.6.0"
|
||||
|
|
@ -1333,7 +1523,9 @@ dependencies = [
|
|||
"miette",
|
||||
"reqwest",
|
||||
"scraper",
|
||||
"steamid-ng",
|
||||
"thiserror",
|
||||
"time",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
|
|
|
|||
|
|
@ -11,3 +11,5 @@ scraper = "0.18.1"
|
|||
miette = "5.10.0"
|
||||
thiserror = "1.0.50"
|
||||
main_error = "0.1.2"
|
||||
time = { version = "0.3.30", features = ["parsing", "macros"] }
|
||||
steamid-ng = "1.0.0"
|
||||
28
src/data.rs
28
src/data.rs
|
|
@ -1,4 +1,30 @@
|
|||
#[derive(Debug)]
|
||||
use steamid_ng::SteamID;
|
||||
use time::Date;
|
||||
|
||||
/// A player as read from a player page: display name, Steam id, honors and team memberships.
#[derive(Debug, Clone)]
|
||||
pub struct Player {
|
||||
/// Display name of the player.
pub name: String,
|
||||
/// Steam id of the player.
pub steam_id: SteamID,
|
||||
/// Honors listed for the player, one entry per listed honor.
pub honors: Vec<Honors>,
|
||||
/// Team memberships listed for the player.
pub teams: Vec<TeamMemberShip>,
|
||||
}
|
||||
|
||||
/// A single honor entry shown on a player page.
#[derive(Debug, Clone)]
|
||||
pub struct Honors {
|
||||
/// Game format the honor belongs to, stored as plain text.
pub format: String,
|
||||
/// Season the honor belongs to, stored as plain text.
pub season: String,
|
||||
/// Name of the team associated with the honor.
pub team: String,
|
||||
}
|
||||
|
||||
/// A player's membership in one team.
#[derive(Debug, Clone)]
|
||||
pub struct TeamMemberShip {
|
||||
/// The team the player is a member of.
pub team: TeamRef,
|
||||
/// League text shown for the team, stored as plain text.
pub league: String,
|
||||
/// Calendar date associated with the membership.
// NOTE(review): presumably the join date — confirm against the page markup.
pub since: Date,
|
||||
}
|
||||
|
||||
/// A lightweight reference to a team: its name plus numeric id.
#[derive(Debug, Clone)]
|
||||
pub struct TeamRef {
|
||||
/// Name of the team.
pub name: String,
|
||||
/// Numeric id of the team.
pub id: u32,
|
||||
}
|
||||
|
|
@ -1,4 +1,5 @@
|
|||
use crate::Result;
|
||||
use scraper::ElementRef;
|
||||
|
||||
mod player;
|
||||
|
||||
|
|
@ -8,3 +9,21 @@ pub trait Parser {
|
|||
type Output;
|
||||
fn parse(&self, document: &str) -> Result<Self::Output>;
|
||||
}
|
||||
|
||||
trait ElementExt<'a> {
|
||||
fn first_text(&'a self) -> Option<&'a str>;
|
||||
fn nth_text(&'a self, n: usize) -> Option<&'a str>;
|
||||
}
|
||||
|
||||
impl<'a> ElementExt<'a> for ElementRef<'a> {
|
||||
fn first_text(&self) -> Option<&'a str> {
|
||||
self.text().filter(|s| !s.trim().is_empty()).next()
|
||||
}
|
||||
fn nth_text(&self, n: usize) -> Option<&'a str> {
|
||||
self.text()
|
||||
.filter(|s| !s.trim().is_empty())
|
||||
.skip(n - 1)
|
||||
.next()
|
||||
.map(|s| s.trim())
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,35 +1,163 @@
|
|||
use scraper::{Html, Selector};
|
||||
use super::Parser;
|
||||
use super::{ElementExt, Parser};
|
||||
use crate::data::{Honors, Player, TeamMemberShip, TeamRef};
|
||||
use crate::{ParseError, Result};
|
||||
use crate::data::Player;
|
||||
use scraper::{ElementRef, Html, Selector};
|
||||
use std::iter::repeat;
|
||||
use steamid_ng::SteamID;
|
||||
use time::{macros::format_description, Date};
|
||||
|
||||
// CSS selectors used by the player page parser.
// NOTE(review): this listing is a rendered diff, so `SELECTOR_PLAYER_NAME` appears twice
// (presumably the replaced line followed by its replacement) — confirm against the real file.
const SELECTOR_PLAYER_NAME: &str = ".col-md-4 > h3 > b";
|
||||
const SELECTOR_PLAYER_NAME: &str = ".container .col-md-4 > h3 > b";
|
||||
// Element whose text nodes include the player's Steam id.
const SELECTOR_PLAYER_ID: &str = ".container .col-md-4 > p.nomargin";
|
||||
|
||||
// One match per honors group; the HEADER/LEAGUE/TEAM selectors below are applied
// relative to each matched group.
const SELECTOR_PLAYER_HONORS_GROUP: &str =
|
||||
".container .col-md-6:nth-child(2) .white-row-small .row-fluid";
|
||||
const SELECTOR_PLAYER_HONORS_HEADER: &str = "h5";
|
||||
const SELECTOR_PLAYER_HONORS_LEAGUE: &str = "li div";
|
||||
const SELECTOR_PLAYER_HONORS_TEAM: &str = "li small";
|
||||
|
||||
// One match per team entry; the LINK/NAME/LEAGUE/SINCE selectors below are applied
// relative to each matched entry.
const SELECTOR_PLAYER_TEAM_GROUP: &str =
|
||||
".container .col-md-6:nth-child(1) .white-row-small .row-fluid";
|
||||
const SELECTOR_PLAYER_TEAM_LINK: &str = "p a";
|
||||
const SELECTOR_PLAYER_TEAM_NAME: &str = "span.text-primary b";
|
||||
// League and "since" use the same selector; the parser reads the first non-blank text
// node for the league and the last text node for the date.
const SELECTOR_PLAYER_TEAM_LEAGUE: &str = "small";
|
||||
const SELECTOR_PLAYER_TEAM_SINCE: &str = "small";
|
||||
|
||||
/// Parser for player pages that keeps its CSS selectors pre-compiled, so they are
/// parsed once in `new` rather than on every `parse` call.
pub struct PlayerParser {
|
||||
// Selectors applied to the whole document.
selector_name: Selector,
|
||||
selector_id: Selector,
|
||||
|
||||
// Honors: `selector_honors_group` is applied to the document, the other three to
// each matched group.
selector_honors_header: Selector,
|
||||
selector_honors_group: Selector,
|
||||
selector_honors_league: Selector,
|
||||
selector_honors_team: Selector,
|
||||
|
||||
// Teams: `selector_team_group` is applied to the document, the other four to each
// matched group.
selector_team_group: Selector,
|
||||
selector_team_link: Selector,
|
||||
selector_team_name: Selector,
|
||||
selector_team_league: Selector,
|
||||
selector_team_since: Selector,
|
||||
}
|
||||
|
||||
impl PlayerParser {
|
||||
pub fn new() -> Self {
|
||||
PlayerParser {
|
||||
selector_name: Selector::parse(SELECTOR_PLAYER_NAME).unwrap(),
|
||||
selector_id: Selector::parse(SELECTOR_PLAYER_ID).unwrap(),
|
||||
|
||||
selector_honors_header: Selector::parse(SELECTOR_PLAYER_HONORS_HEADER).unwrap(),
|
||||
selector_honors_group: Selector::parse(SELECTOR_PLAYER_HONORS_GROUP).unwrap(),
|
||||
selector_honors_league: Selector::parse(SELECTOR_PLAYER_HONORS_LEAGUE).unwrap(),
|
||||
selector_honors_team: Selector::parse(SELECTOR_PLAYER_HONORS_TEAM).unwrap(),
|
||||
|
||||
selector_team_group: Selector::parse(SELECTOR_PLAYER_TEAM_GROUP).unwrap(),
|
||||
selector_team_link: Selector::parse(SELECTOR_PLAYER_TEAM_LINK).unwrap(),
|
||||
selector_team_name: Selector::parse(SELECTOR_PLAYER_TEAM_NAME).unwrap(),
|
||||
selector_team_league: Selector::parse(SELECTOR_PLAYER_TEAM_LEAGUE).unwrap(),
|
||||
selector_team_since: Selector::parse(SELECTOR_PLAYER_TEAM_SINCE).unwrap(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the first non-blank text node of the first element under `el` that matches
/// `selector`, trimmed.
///
/// Falls back to `default` (also trimmed) when nothing matches or when the matched
/// element has no non-blank text node.
fn select_text<'a>(el: ElementRef<'a>, selector: &Selector, default: &'static str) -> &'a str {
    let Some(item) = el.select(selector).next() else {
        return default.trim();
    };

    match item.text().find(|s| !s.trim().is_empty()) {
        Some(text) => text.trim(),
        None => default.trim(),
    }
}
|
||||
|
||||
/// Returns the last text node of the first element under `el` that matches `selector`,
/// trimmed.
///
/// Unlike `select_text`, blank text nodes are not skipped, so the result may be empty.
/// Falls back to `default` (also trimmed) when nothing matches or when the matched
/// element has no text node at all.
fn select_last_text<'a>(el: ElementRef<'a>, selector: &Selector, default: &'static str) -> &'a str {
    let Some(item) = el.select(selector).next() else {
        return default.trim();
    };

    match item.text().last() {
        Some(text) => text.trim(),
        None => default.trim(),
    }
}
|
||||
|
||||
// Player page parser: turns the HTML of a player page into a `Player`.
// NOTE(review): this listing is a rendered diff, so some statements appear twice
// (presumably the replaced version followed by its replacement), e.g. the two
// `let name` statements and `name` / `name,` below — confirm against the real file.
impl Parser for PlayerParser {
|
||||
type Output = Player;
|
||||
|
||||
/// Parses `document` (the HTML source of a player page) into a `Player`.
///
/// Returns `ParseError::ElementNotFound` when the player name element or the Steam id
/// element is missing. Every other missing or malformed piece falls back to a default
/// value instead of failing.
fn parse(&self, document: &str) -> Result<Self::Output> {
|
||||
let document = Html::parse_document(&document);
|
||||
|
||||
let name = document.select(&self.selector_name).next().ok_or(ParseError::ElementNotFound {
|
||||
selector: SELECTOR_PLAYER_NAME,
|
||||
role: "player name",
|
||||
})?.text().next().unwrap_or_default().to_string();
|
||||
let name = document
|
||||
.select(&self.selector_name)
|
||||
.next()
|
||||
.ok_or(ParseError::ElementNotFound {
|
||||
selector: SELECTOR_PLAYER_NAME,
|
||||
role: "player name",
|
||||
})?
|
||||
.first_text()
|
||||
.unwrap_or_default()
|
||||
.to_string();
|
||||
|
||||
// The Steam id text is taken from the third non-blank text node of the id element;
// it stays an empty string when there are fewer than three.
let id = document
|
||||
.select(&self.selector_id)
|
||||
.next()
|
||||
.ok_or(ParseError::ElementNotFound {
|
||||
selector: SELECTOR_PLAYER_ID,
|
||||
role: "player steam id",
|
||||
})?
|
||||
.nth_text(3)
|
||||
.unwrap_or_default()
|
||||
.to_string();
|
||||
|
||||
// Each honors group contributes one `Honors` per (league, team) pair; the group's
// header text, minus a trailing " Medals", is repeated as the format of every pair.
let honors = document
|
||||
.select(&self.selector_honors_group)
|
||||
.flat_map(|group| {
|
||||
let format =
|
||||
select_text(group, &self.selector_honors_header, "format not detected")
|
||||
.trim_end_matches(" Medals");
|
||||
let leagues = group.select(&self.selector_honors_league);
|
||||
let teams = group.select(&self.selector_honors_team);
|
||||
// `zip` stops at the shorter input, so a league without a matching team element
// (or the reverse) is dropped.
repeat(format).zip(leagues).zip(teams)
|
||||
})
|
||||
.map(|((format, season), team)| Honors {
|
||||
format: format.to_string(),
|
||||
season: season.text().next().unwrap_or_default().trim().to_string(),
|
||||
team: team.text().next().unwrap_or_default().trim().to_string(),
|
||||
})
|
||||
.collect();
|
||||
|
||||
// Only groups that contain a team link are treated as team entries.
let teams = document
|
||||
.select(&self.selector_team_group)
|
||||
.filter(|item| item.select(&self.selector_team_link).next().is_some())
|
||||
.map(|item| {
|
||||
// A link without an `href` falls back to "=0", which yields team id 0 below.
let link = item
|
||||
.select(&self.selector_team_link)
|
||||
.next()
|
||||
.and_then(|link| link.attr("href"))
|
||||
.unwrap_or("=0");
|
||||
let name = select_text(item, &self.selector_team_name, "failed to find name");
|
||||
let league = select_text(item, &self.selector_team_league, "failed to find league");
|
||||
let since = select_last_text(item, &self.selector_team_since, "");
|
||||
|
||||
// The team id is whatever follows the last `=` in the link; 0 when it is absent or
// does not parse.
let id = match link.rsplit_once("=") {
|
||||
Some((_, id)) => id.parse().unwrap_or_default(),
|
||||
_ => 0,
|
||||
};
|
||||
let format = format_description!("[month padding:none]/[day padding:none]/[year]");
|
||||
// The date is read from the part after the last newline of the text; a text without
// a newline, or a part that does not parse, becomes `Date::MIN`.
let since = match since.rsplit_once("\n") {
|
||||
Some((_, since)) => Date::parse(since, &format).unwrap_or(Date::MIN),
|
||||
_ => Date::MIN,
|
||||
};
|
||||
|
||||
TeamMemberShip {
|
||||
team: TeamRef {
|
||||
name: name.to_string(),
|
||||
id,
|
||||
},
|
||||
league: league.to_string(),
|
||||
since,
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
|
||||
Ok(Player {
|
||||
name
|
||||
name,
|
||||
// Text that is not a valid steam3 id results in the default `SteamID`.
steam_id: SteamID::from_steam3(&id).unwrap_or_default(),
|
||||
honors,
|
||||
teams,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue