mirror of
https://codeberg.org/icewind/palantir.git
synced 2026-06-03 10:14:09 +02:00
gpu usage bits
This commit is contained in:
parent
d549f17da7
commit
2dff136ee6
11 changed files with 184 additions and 40 deletions
8
Cargo.lock
generated
8
Cargo.lock
generated
|
|
@ -306,10 +306,10 @@ dependencies = [
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "dotenv"
|
name = "dotenvy"
|
||||||
version = "0.15.0"
|
version = "0.15.7"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "77c90badedccf4105eca100756a0b1289e191f6fcbdadd3cee1d2f614f97da8f"
|
checksum = "1aaf95b3e5c8f23aa320147307562d361db0ae0d51242340f558153b4eb2439b"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "eyre"
|
name = "eyre"
|
||||||
|
|
@ -848,7 +848,7 @@ dependencies = [
|
||||||
"bollard",
|
"bollard",
|
||||||
"color-eyre",
|
"color-eyre",
|
||||||
"ctrlc",
|
"ctrlc",
|
||||||
"dotenv",
|
"dotenvy",
|
||||||
"futures-util",
|
"futures-util",
|
||||||
"hostname",
|
"hostname",
|
||||||
"iai",
|
"iai",
|
||||||
|
|
|
||||||
|
|
@ -9,7 +9,7 @@ color-eyre = "0.6.1"
|
||||||
warp = "0.3.2"
|
warp = "0.3.2"
|
||||||
tokio = { version = "1.19.2", features = ["macros", "rt-multi-thread"] }
|
tokio = { version = "1.19.2", features = ["macros", "rt-multi-thread"] }
|
||||||
ctrlc = { version = "3.2.2", features = ["termination"] }
|
ctrlc = { version = "3.2.2", features = ["termination"] }
|
||||||
dotenv = "0.15.0"
|
dotenvy = "0.15.7"
|
||||||
regex = { version = "1.5.6", default-features = false, features = ["std"] }
|
regex = { version = "1.5.6", default-features = false, features = ["std"] }
|
||||||
once_cell = "1.12.0"
|
once_cell = "1.12.0"
|
||||||
hostname = "0.3.1"
|
hostname = "0.3.1"
|
||||||
|
|
|
||||||
48
flake.lock
generated
48
flake.lock
generated
|
|
@ -5,11 +5,11 @@
|
||||||
"nixpkgs": "nixpkgs"
|
"nixpkgs": "nixpkgs"
|
||||||
},
|
},
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1671096816,
|
"lastModified": 1679567394,
|
||||||
"narHash": "sha256-ezQCsNgmpUHdZANDCILm3RvtO1xH8uujk/+EqNvzIOg=",
|
"narHash": "sha256-ZvLuzPeARDLiQUt6zSZFGOs+HZmE+3g4QURc8mkBsfM=",
|
||||||
"owner": "nix-community",
|
"owner": "nix-community",
|
||||||
"repo": "naersk",
|
"repo": "naersk",
|
||||||
"rev": "d998160d6a076cfe8f9741e56aeec7e267e3e114",
|
"rev": "88cd22380154a2c36799fe8098888f0f59861a15",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
|
|
@ -20,10 +20,10 @@
|
||||||
},
|
},
|
||||||
"nixpkgs": {
|
"nixpkgs": {
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1674407282,
|
"lastModified": 1682669017,
|
||||||
"narHash": "sha256-2qwc8mrPINSFdWffPK+ji6nQ9aGnnZyHSItVcYDZDlk=",
|
"narHash": "sha256-Vi+p4y3wnl0/4gcwTdmCO398kKlDaUrNROtf3GOD2NY=",
|
||||||
"path": "/nix/store/47v7isgz6w8zgb1224d46lwvwkdd69bm-source",
|
"path": "/nix/store/wm2cdd01f8jqbxpw817nv5j3sw6p93g8-source",
|
||||||
"rev": "ab1254087f4cdf4af74b552d7fc95175d9bdbb49",
|
"rev": "7449971a3ecf857b4a554cf79b1d9dcc1a4647d8",
|
||||||
"type": "path"
|
"type": "path"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
|
|
@ -33,10 +33,10 @@
|
||||||
},
|
},
|
||||||
"nixpkgs_2": {
|
"nixpkgs_2": {
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1674407282,
|
"lastModified": 1682669017,
|
||||||
"narHash": "sha256-2qwc8mrPINSFdWffPK+ji6nQ9aGnnZyHSItVcYDZDlk=",
|
"narHash": "sha256-Vi+p4y3wnl0/4gcwTdmCO398kKlDaUrNROtf3GOD2NY=",
|
||||||
"path": "/nix/store/47v7isgz6w8zgb1224d46lwvwkdd69bm-source",
|
"path": "/nix/store/wm2cdd01f8jqbxpw817nv5j3sw6p93g8-source",
|
||||||
"rev": "ab1254087f4cdf4af74b552d7fc95175d9bdbb49",
|
"rev": "7449971a3ecf857b4a554cf79b1d9dcc1a4647d8",
|
||||||
"type": "path"
|
"type": "path"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
|
|
@ -51,13 +51,31 @@
|
||||||
"utils": "utils"
|
"utils": "utils"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"utils": {
|
"systems": {
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1667395993,
|
"lastModified": 1681028828,
|
||||||
"narHash": "sha256-nuEHfE/LcWyuSWnS8t12N1wc105Qtau+/OdUAjtQ0rA=",
|
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
|
||||||
|
"owner": "nix-systems",
|
||||||
|
"repo": "default",
|
||||||
|
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
|
||||||
|
"type": "github"
|
||||||
|
},
|
||||||
|
"original": {
|
||||||
|
"owner": "nix-systems",
|
||||||
|
"repo": "default",
|
||||||
|
"type": "github"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"utils": {
|
||||||
|
"inputs": {
|
||||||
|
"systems": "systems"
|
||||||
|
},
|
||||||
|
"locked": {
|
||||||
|
"lastModified": 1681202837,
|
||||||
|
"narHash": "sha256-H+Rh19JDwRtpVPAWp64F+rlEtxUWBAQW28eAi3SRSzg=",
|
||||||
"owner": "numtide",
|
"owner": "numtide",
|
||||||
"repo": "flake-utils",
|
"repo": "flake-utils",
|
||||||
"rev": "5aed5285a952e0b949eb3ba02c12fa4fcfef535f",
|
"rev": "cfacdce06f30d2b68473a46042957675eebb3401",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
|
|
|
||||||
|
|
@ -106,6 +106,7 @@
|
||||||
path = lib.optional cfg.zfs pkgs.zfs;
|
path = lib.optional cfg.zfs pkgs.zfs;
|
||||||
environment = {
|
environment = {
|
||||||
PORT = "${toString cfg.port}";
|
PORT = "${toString cfg.port}";
|
||||||
|
LD_LIBRARY_PATH = "/run/opengl-driver/lib/"; # needed for nvidia
|
||||||
} // (if (cfg.mdns == false) then {
|
} // (if (cfg.mdns == false) then {
|
||||||
DISABLE_MDNS = "true";
|
DISABLE_MDNS = "true";
|
||||||
} else {});
|
} else {});
|
||||||
|
|
|
||||||
83
src/gpu/mod.rs
Normal file
83
src/gpu/mod.rs
Normal file
|
|
@ -0,0 +1,83 @@
|
||||||
|
use crate::sensors::Memory;
|
||||||
|
use std::fmt::Write;
|
||||||
|
use std::fs::read_to_string;
|
||||||
|
use std::str::FromStr;
|
||||||
|
|
||||||
|
pub mod nvidia;
|
||||||
|
|
||||||
|
pub fn gpu_metrics<W: Write>(mut out: W, hostname: &str) {
|
||||||
|
if let Some(memory) = memory() {
|
||||||
|
writeln!(
|
||||||
|
&mut out,
|
||||||
|
"gpu_memory_total{{host=\"{}\"}} {}",
|
||||||
|
hostname, memory.total
|
||||||
|
)
|
||||||
|
.ok();
|
||||||
|
writeln!(
|
||||||
|
&mut out,
|
||||||
|
"gpu_memory_free{{host=\"{}\"}} {}",
|
||||||
|
hostname, memory.free
|
||||||
|
)
|
||||||
|
.ok();
|
||||||
|
}
|
||||||
|
|
||||||
|
for usage in utilization() {
|
||||||
|
usage.write(&mut out, hostname);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Reads the file at `path` and parses its whitespace-trimmed contents as `T`.
///
/// Returns `None` when the file cannot be read or the contents do not parse.
fn read_num<T: FromStr>(path: &str) -> Option<T> {
    let contents = read_to_string(path).ok()?;
    contents.trim().parse::<T>().ok()
}
|
||||||
|
|
||||||
|
pub fn memory() -> Option<Memory> {
|
||||||
|
if let Some(nv_mem) = nvidia::memory() {
|
||||||
|
return Some(nv_mem);
|
||||||
|
}
|
||||||
|
// 1 gpu should be enough for everyone
|
||||||
|
let used = read_num::<u64>("/sys/class/drm/card0/device/mem_info_vram_used")?;
|
||||||
|
let total = read_num("/sys/class/drm/card0/device/mem_info_vram_total")?;
|
||||||
|
Some(Memory {
|
||||||
|
total,
|
||||||
|
free: total - used,
|
||||||
|
available: total - used,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A single GPU utilization reading for one subsystem.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct GpuUsage {
    /// Label emitted as the `system` tag of the metric (e.g. `"compute"`).
    pub system: &'static str,
    /// Utilization value, written to the metric line unchanged.
    pub usage: u32,
}

impl GpuUsage {
    /// Writes this reading to `w` as one `gpu_usage{host=..., system=...} <value>` line.
    ///
    /// Formatting errors are discarded, matching the other metric writers.
    pub fn write<W: Write>(&self, mut w: W, hostname: &str) {
        // `usage` is an integer; a precision specifier is ignored for integral
        // types, so plain `{}` produces the same output without implying
        // fractional digits.
        writeln!(
            &mut w,
            r#"gpu_usage{{host="{}", system="{}"}} {}"#,
            hostname, self.system, self.usage,
        )
        .ok();
    }
}
|
||||||
|
|
||||||
|
pub fn utilization() -> impl Iterator<Item = GpuUsage> {
|
||||||
|
let nv_usage = nvidia::utilization();
|
||||||
|
|
||||||
|
let sources = [
|
||||||
|
(
|
||||||
|
"memory",
|
||||||
|
read_num("/sys/class/drm/card0/device/mem_busy_percent"),
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"compute",
|
||||||
|
read_num("/sys/class/drm/card0/device/gpu_busy_percent"),
|
||||||
|
),
|
||||||
|
];
|
||||||
|
let drm = sources.into_iter().flat_map(|(system, usage)| {
|
||||||
|
Some(GpuUsage {
|
||||||
|
system,
|
||||||
|
usage: usage?,
|
||||||
|
})
|
||||||
|
});
|
||||||
|
drm.chain(nv_usage)
|
||||||
|
}
|
||||||
55
src/gpu/nvidia.rs
Normal file
55
src/gpu/nvidia.rs
Normal file
|
|
@ -0,0 +1,55 @@
|
||||||
|
use crate::gpu::GpuUsage;
|
||||||
|
use crate::sensors::Memory;
|
||||||
|
use nvml_wrapper::enum_wrappers::device::TemperatureSensor;
|
||||||
|
use nvml_wrapper::{Device, Nvml};
|
||||||
|
use once_cell::sync::Lazy;
|
||||||
|
|
||||||
|
/// Shared NVML handle, created lazily; holds `None` when `Nvml::init` fails.
static NVIDIA: Lazy<Option<Nvml>> = Lazy::new(|| Nvml::init().ok());
|
||||||
|
|
||||||
|
fn device() -> Option<Device<'static>> {
|
||||||
|
NVIDIA.as_ref()?.device_by_index(0).ok()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn temperature() -> Option<f32> {
|
||||||
|
let temp = device()?.temperature(TemperatureSensor::Gpu).ok()?;
|
||||||
|
Some(temp as f32)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn power() -> Option<u64> {
|
||||||
|
device()?.total_energy_consumption().ok()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn memory() -> Option<Memory> {
|
||||||
|
let mem = device()?.memory_info().ok()?;
|
||||||
|
Some(Memory {
|
||||||
|
total: mem.total,
|
||||||
|
free: mem.free,
|
||||||
|
available: mem.free,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn utilization() -> impl Iterator<Item = GpuUsage> {
|
||||||
|
let sources = if let Some(device) = device() {
|
||||||
|
let utilization = device.utilization_rates().ok();
|
||||||
|
[
|
||||||
|
("compute", utilization.as_ref().map(|u| u.gpu)),
|
||||||
|
("memory", utilization.as_ref().map(|u| u.gpu)),
|
||||||
|
(
|
||||||
|
"encode",
|
||||||
|
device.encoder_utilization().ok().map(|u| u.utilization),
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"decode",
|
||||||
|
device.decoder_utilization().ok().map(|u| u.utilization),
|
||||||
|
),
|
||||||
|
]
|
||||||
|
} else {
|
||||||
|
[("", None); 4]
|
||||||
|
};
|
||||||
|
sources.into_iter().flat_map(|(system, usage)| {
|
||||||
|
Some(GpuUsage {
|
||||||
|
system,
|
||||||
|
usage: usage?,
|
||||||
|
})
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
pub mod disk;
|
pub mod disk;
|
||||||
pub mod docker;
|
pub mod docker;
|
||||||
pub mod nvidia;
|
pub mod gpu;
|
||||||
pub mod power;
|
pub mod power;
|
||||||
pub mod sensors;
|
pub mod sensors;
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -6,6 +6,7 @@ use libmdns::Responder;
|
||||||
use palantir::disk::zfs::arcstats;
|
use palantir::disk::zfs::arcstats;
|
||||||
use palantir::docker::{get_docker, stat, Container};
|
use palantir::docker::{get_docker, stat, Container};
|
||||||
use palantir::get_metrics;
|
use palantir::get_metrics;
|
||||||
|
use palantir::gpu::gpu_metrics;
|
||||||
use palantir::power::power_usage;
|
use palantir::power::power_usage;
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
use tokio::runtime::Handle;
|
use tokio::runtime::Handle;
|
||||||
|
|
@ -44,6 +45,8 @@ async fn serve_inner(docker: Option<Docker>) -> Result<String> {
|
||||||
if let Some(arc) = arcstats()? {
|
if let Some(arc) = arcstats()? {
|
||||||
arc.write(&mut metrics, &hostname);
|
arc.write(&mut metrics, &hostname);
|
||||||
}
|
}
|
||||||
|
gpu_metrics(&mut metrics, &hostname);
|
||||||
|
|
||||||
Ok(metrics)
|
Ok(metrics)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -58,13 +61,13 @@ async fn serve_metrics(docker: Option<Docker>) -> Result<String, Rejection> {
|
||||||
async fn main() -> Result<()> {
|
async fn main() -> Result<()> {
|
||||||
tracing_subscriber::fmt::init();
|
tracing_subscriber::fmt::init();
|
||||||
|
|
||||||
let host_port: u16 = dotenv::var("PORT")
|
let host_port: u16 = dotenvy::var("PORT")
|
||||||
.ok()
|
.ok()
|
||||||
.map(|port| port.parse())
|
.map(|port| port.parse())
|
||||||
.transpose()?
|
.transpose()?
|
||||||
.unwrap_or(80);
|
.unwrap_or(80);
|
||||||
|
|
||||||
let mdns = dotenv::var("DISABLE_MDNS").is_ok();
|
let mdns = dotenvy::var("DISABLE_MDNS").is_ok();
|
||||||
|
|
||||||
ctrlc::set_handler(move || {
|
ctrlc::set_handler(move || {
|
||||||
std::process::exit(0);
|
std::process::exit(0);
|
||||||
|
|
|
||||||
|
|
@ -1,16 +0,0 @@
|
||||||
use nvml_wrapper::enum_wrappers::device::TemperatureSensor;
|
|
||||||
use nvml_wrapper::Nvml;
|
|
||||||
use once_cell::sync::Lazy;
|
|
||||||
|
|
||||||
static NVIDIA: Lazy<Option<Nvml>> = Lazy::new(|| Nvml::init().ok());
|
|
||||||
|
|
||||||
pub fn temperature() -> Option<f32> {
|
|
||||||
let device = NVIDIA.as_ref()?.device_by_index(0).ok()?;
|
|
||||||
let temp = device.temperature(TemperatureSensor::Gpu).ok()?;
|
|
||||||
Some(temp as f32)
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn power() -> Option<u64> {
|
|
||||||
let device = NVIDIA.as_ref()?.device_by_index(0).ok()?;
|
|
||||||
device.total_energy_consumption().ok()
|
|
||||||
}
|
|
||||||
|
|
@ -84,7 +84,7 @@ pub fn power_usage() -> Result<Option<PowerUsage>> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if let Some(nvidia_power) = crate::nvidia::power() {
|
if let Some(nvidia_power) = crate::gpu::nvidia::power() {
|
||||||
usage.gpu_uj = nvidia_power;
|
usage.gpu_uj = nvidia_power;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -88,7 +88,7 @@ pub fn temperatures() -> Result<Temperatures> {
|
||||||
temps.cpu = core_total / cores_found
|
temps.cpu = core_total / cores_found
|
||||||
}
|
}
|
||||||
|
|
||||||
if let Some(nvidia_temperature) = crate::nvidia::temperature() {
|
if let Some(nvidia_temperature) = crate::gpu::nvidia::temperature() {
|
||||||
temps.gpu = nvidia_temperature;
|
temps.gpu = nvidia_temperature;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue