mirror of
https://codeberg.org/icewind/palantir.git
synced 2026-06-03 18:24:08 +02:00
gpu usage bits
This commit is contained in:
parent
d549f17da7
commit
2dff136ee6
11 changed files with 184 additions and 40 deletions
83
src/gpu/mod.rs
Normal file
83
src/gpu/mod.rs
Normal file
|
|
@ -0,0 +1,83 @@
|
|||
use crate::sensors::Memory;
|
||||
use std::fmt::Write;
|
||||
use std::fs::read_to_string;
|
||||
use std::str::FromStr;
|
||||
|
||||
pub mod nvidia;
|
||||
|
||||
pub fn gpu_metrics<W: Write>(mut out: W, hostname: &str) {
|
||||
if let Some(memory) = memory() {
|
||||
writeln!(
|
||||
&mut out,
|
||||
"gpu_memory_total{{host=\"{}\"}} {}",
|
||||
hostname, memory.total
|
||||
)
|
||||
.ok();
|
||||
writeln!(
|
||||
&mut out,
|
||||
"gpu_memory_free{{host=\"{}\"}} {}",
|
||||
hostname, memory.free
|
||||
)
|
||||
.ok();
|
||||
}
|
||||
|
||||
for usage in utilization() {
|
||||
usage.write(&mut out, hostname);
|
||||
}
|
||||
}
|
||||
|
||||
/// Reads the file at `path` and parses its whitespace-trimmed contents as `T`.
///
/// Returns `None` when the file cannot be read or its contents fail to parse.
fn read_num<T: FromStr>(path: &str) -> Option<T> {
    let contents = read_to_string(path).ok()?;
    contents.trim().parse::<T>().ok()
}
|
||||
|
||||
pub fn memory() -> Option<Memory> {
|
||||
if let Some(nv_mem) = nvidia::memory() {
|
||||
return Some(nv_mem);
|
||||
}
|
||||
// 1 gpu should be enough for everyone
|
||||
let used = read_num::<u64>("/sys/class/drm/card0/device/mem_info_vram_used")?;
|
||||
let total = read_num("/sys/class/drm/card0/device/mem_info_vram_total")?;
|
||||
Some(Memory {
|
||||
total,
|
||||
free: total - used,
|
||||
available: total - used,
|
||||
})
|
||||
}
|
||||
|
||||
/// A single GPU utilisation reading.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct GpuUsage {
    /// Name of the GPU subsystem the reading belongs to; emitted as the
    /// `system` label.
    pub system: &'static str,
    /// Utilisation value as reported by the source.
    // NOTE(review): the sysfs sources are `*_busy_percent` files, so this is
    // presumably a 0-100 percentage - confirm for the NVML sources.
    pub usage: u32,
}

impl GpuUsage {
    /// Writes this reading to `w` as one `gpu_usage{host=..., system=...} value`
    /// line. Formatting errors reported by `w` are deliberately discarded.
    pub fn write<W: Write>(&self, mut w: W, hostname: &str) {
        // `usage` is an integer; a `.N` precision is ignored for integral
        // types, so none is specified here.
        writeln!(
            &mut w,
            r#"gpu_usage{{host="{}", system="{}"}} {}"#,
            hostname, self.system, self.usage,
        )
        .ok();
    }
}
|
||||
|
||||
pub fn utilization() -> impl Iterator<Item = GpuUsage> {
|
||||
let nv_usage = nvidia::utilization();
|
||||
|
||||
let sources = [
|
||||
(
|
||||
"memory",
|
||||
read_num("/sys/class/drm/card0/device/mem_busy_percent"),
|
||||
),
|
||||
(
|
||||
"compute",
|
||||
read_num("/sys/class/drm/card0/device/gpu_busy_percent"),
|
||||
),
|
||||
];
|
||||
let drm = sources.into_iter().flat_map(|(system, usage)| {
|
||||
Some(GpuUsage {
|
||||
system,
|
||||
usage: usage?,
|
||||
})
|
||||
});
|
||||
drm.chain(nv_usage)
|
||||
}
|
||||
55
src/gpu/nvidia.rs
Normal file
55
src/gpu/nvidia.rs
Normal file
|
|
@ -0,0 +1,55 @@
|
|||
use crate::gpu::GpuUsage;
|
||||
use crate::sensors::Memory;
|
||||
use nvml_wrapper::enum_wrappers::device::TemperatureSensor;
|
||||
use nvml_wrapper::{Device, Nvml};
|
||||
use once_cell::sync::Lazy;
|
||||
|
||||
/// Shared NVML handle, created lazily on first access; holds `None` when
/// `Nvml::init()` fails.
static NVIDIA: Lazy<Option<Nvml>> = Lazy::new(|| Nvml::init().ok());
|
||||
|
||||
fn device() -> Option<Device<'static>> {
|
||||
NVIDIA.as_ref()?.device_by_index(0).ok()
|
||||
}
|
||||
|
||||
pub fn temperature() -> Option<f32> {
|
||||
let temp = device()?.temperature(TemperatureSensor::Gpu).ok()?;
|
||||
Some(temp as f32)
|
||||
}
|
||||
|
||||
pub fn power() -> Option<u64> {
|
||||
device()?.total_energy_consumption().ok()
|
||||
}
|
||||
|
||||
pub fn memory() -> Option<Memory> {
|
||||
let mem = device()?.memory_info().ok()?;
|
||||
Some(Memory {
|
||||
total: mem.total,
|
||||
free: mem.free,
|
||||
available: mem.free,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn utilization() -> impl Iterator<Item = GpuUsage> {
|
||||
let sources = if let Some(device) = device() {
|
||||
let utilization = device.utilization_rates().ok();
|
||||
[
|
||||
("compute", utilization.as_ref().map(|u| u.gpu)),
|
||||
("memory", utilization.as_ref().map(|u| u.gpu)),
|
||||
(
|
||||
"encode",
|
||||
device.encoder_utilization().ok().map(|u| u.utilization),
|
||||
),
|
||||
(
|
||||
"decode",
|
||||
device.decoder_utilization().ok().map(|u| u.utilization),
|
||||
),
|
||||
]
|
||||
} else {
|
||||
[("", None); 4]
|
||||
};
|
||||
sources.into_iter().flat_map(|(system, usage)| {
|
||||
Some(GpuUsage {
|
||||
system,
|
||||
usage: usage?,
|
||||
})
|
||||
})
|
||||
}
|
||||
|
|
@ -1,6 +1,6 @@
|
|||
pub mod disk;
|
||||
pub mod docker;
|
||||
pub mod nvidia;
|
||||
pub mod gpu;
|
||||
pub mod power;
|
||||
pub mod sensors;
|
||||
|
||||
|
|
|
|||
|
|
@ -6,6 +6,7 @@ use libmdns::Responder;
|
|||
use palantir::disk::zfs::arcstats;
|
||||
use palantir::docker::{get_docker, stat, Container};
|
||||
use palantir::get_metrics;
|
||||
use palantir::gpu::gpu_metrics;
|
||||
use palantir::power::power_usage;
|
||||
use std::time::Duration;
|
||||
use tokio::runtime::Handle;
|
||||
|
|
@ -44,6 +45,8 @@ async fn serve_inner(docker: Option<Docker>) -> Result<String> {
|
|||
if let Some(arc) = arcstats()? {
|
||||
arc.write(&mut metrics, &hostname);
|
||||
}
|
||||
gpu_metrics(&mut metrics, &hostname);
|
||||
|
||||
Ok(metrics)
|
||||
}
|
||||
|
||||
|
|
@ -58,13 +61,13 @@ async fn serve_metrics(docker: Option<Docker>) -> Result<String, Rejection> {
|
|||
async fn main() -> Result<()> {
|
||||
tracing_subscriber::fmt::init();
|
||||
|
||||
let host_port: u16 = dotenv::var("PORT")
|
||||
let host_port: u16 = dotenvy::var("PORT")
|
||||
.ok()
|
||||
.map(|port| port.parse())
|
||||
.transpose()?
|
||||
.unwrap_or(80);
|
||||
|
||||
let mdns = dotenv::var("DISABLE_MDNS").is_ok();
|
||||
let mdns = dotenvy::var("DISABLE_MDNS").is_ok();
|
||||
|
||||
ctrlc::set_handler(move || {
|
||||
std::process::exit(0);
|
||||
|
|
|
|||
|
|
@ -1,16 +0,0 @@
|
|||
use nvml_wrapper::enum_wrappers::device::TemperatureSensor;
|
||||
use nvml_wrapper::Nvml;
|
||||
use once_cell::sync::Lazy;
|
||||
|
||||
static NVIDIA: Lazy<Option<Nvml>> = Lazy::new(|| Nvml::init().ok());
|
||||
|
||||
pub fn temperature() -> Option<f32> {
|
||||
let device = NVIDIA.as_ref()?.device_by_index(0).ok()?;
|
||||
let temp = device.temperature(TemperatureSensor::Gpu).ok()?;
|
||||
Some(temp as f32)
|
||||
}
|
||||
|
||||
pub fn power() -> Option<u64> {
|
||||
let device = NVIDIA.as_ref()?.device_by_index(0).ok()?;
|
||||
device.total_energy_consumption().ok()
|
||||
}
|
||||
|
|
@ -84,7 +84,7 @@ pub fn power_usage() -> Result<Option<PowerUsage>> {
|
|||
}
|
||||
}
|
||||
|
||||
if let Some(nvidia_power) = crate::nvidia::power() {
|
||||
if let Some(nvidia_power) = crate::gpu::nvidia::power() {
|
||||
usage.gpu_uj = nvidia_power;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -88,7 +88,7 @@ pub fn temperatures() -> Result<Temperatures> {
|
|||
temps.cpu = core_total / cores_found
|
||||
}
|
||||
|
||||
if let Some(nvidia_temperature) = crate::nvidia::temperature() {
|
||||
if let Some(nvidia_temperature) = crate::gpu::nvidia::temperature() {
|
||||
temps.gpu = nvidia_temperature;
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue