multi gpu

This commit is contained in:
Robin Appelman 2026-04-06 23:50:41 +02:00
commit 217933c1a1
15 changed files with 234 additions and 129 deletions

View file

@ -60,6 +60,7 @@ impl SensorData for Memory {
/// Memory figures for a single GPU, written out as the `gpu_memory_total`
/// and `gpu_memory_free` metrics.
#[derive(Debug, Clone, Default)]
pub struct GpuMemory {
/// Card identifier; emitted as the `gpu` label on both metrics.
pub card: u32,
/// Total memory of the card (presumably bytes; confirm against the producers).
pub total: u64,
/// Memory not currently in use, in the same unit as `total`.
pub free: u64,
}
@ -68,14 +69,14 @@ impl SensorData for GpuMemory {
fn write<W: Write>(&self, mut w: W, hostname: &str) {
writeln!(
&mut w,
"gpu_memory_total{{host=\"{}\"}} {}",
hostname, self.total
"gpu_memory_total{{host=\"{}\", gpu=\"{}\"}} {}",
hostname, self.card, self.total
)
.ok();
writeln!(
&mut w,
"gpu_memory_free{{host=\"{}\"}} {}",
hostname, self.free
"gpu_memory_free{{host=\"{}\", gpu=\"{}\"}} {}",
hostname, self.card, self.free
)
.ok();
}
@ -116,6 +117,7 @@ impl SensorData for NetStats {
}
/// Utilization reading for one subsystem of one GPU, written out as the
/// `gpu_usage` metric.
pub struct GpuUsage {
/// Card identifier; emitted as the `gpu` label.
pub card: u32,
/// Name of the measured subsystem (for example `"memory"` or `"compute"`);
/// emitted as the `system` label.
pub system: Cow<'static, str>,
/// Usage value for that subsystem; presumably a percentage (TODO confirm
/// for every producer).
pub usage: u32,
}
@ -124,8 +126,8 @@ impl GpuUsage {
pub fn write<W: Write>(&self, mut w: W, hostname: &str) {
writeln!(
&mut w,
r#"gpu_usage{{host="{}", system="{}"}} {:.3}"#,
hostname, self.system, self.usage,
r#"gpu_usage{{host="{}", system="{}", gpu="{}"}} {:.3}"#,
hostname, self.system, self.card, self.usage,
)
.ok();
}
@ -213,6 +215,7 @@ impl SensorData for CpuPowerUsage {
/// Accumulated energy counter for a single GPU.
///
/// Written out as the `total_power` metric with `device="gpu"`; nothing is
/// emitted while the counter is zero.
#[derive(Debug, Default)]
pub struct GpuPowerUsage {
/// Card identifier; emitted as the `gpu` label.
pub card: u32,
/// Energy counter; the writer divides it by 1_000_000 before output
/// (presumably microjoules converted to joules; confirm).
pub gpu_uj: u64,
}
@ -221,8 +224,9 @@ impl SensorData for GpuPowerUsage {
if self.gpu_uj > 0 {
writeln!(
&mut w,
r#"total_power{{host="{}", device="gpu"}} {:.3}"#,
r#"total_power{{host="{}", device="gpu", gpu="{}"}} {:.3}"#,
hostname,
self.card,
self.gpu_uj as f64 / 1_000_000.0
)
.ok();

View file

@ -15,9 +15,9 @@ pub mod linux;
pub mod win;
#[cfg(not(target_os = "windows"))]
pub use linux::{get_metrics, Sensors};
pub use linux::{Sensors, get_metrics};
#[cfg(target_os = "windows")]
pub use win::{get_metrics, Sensors};
pub use win::{Sensors, get_metrics};
#[derive(Debug, thiserror::Error)]
pub enum Error {
@ -82,11 +82,8 @@ pub trait SensorSource {
pub trait MultiSensorSource {
type Data: SensorData;
type Iter<'a>: Iterator<Item = Result<Self::Data>>
where
Self: 'a;
fn read(&mut self) -> Result<Self::Iter<'_>>;
fn read(&mut self) -> Result<impl Iterator<Item = Result<Self::Data>>>;
}
pub fn hostname() -> Result<String> {

View file

@ -29,9 +29,8 @@ impl DiskStatSource {
impl MultiSensorSource for DiskStatSource {
type Data = DiskStats;
type Iter<'a> = DiskStatParser<'a>;
fn read(&mut self) -> Result<Self::Iter<'_>> {
fn read(&mut self) -> Result<impl Iterator<Item = Result<Self::Data>>> {
self.buff.clear();
self.source.rewind().context("error rewinding disk stats")?;
self.source
@ -93,9 +92,8 @@ impl DiskUsageSource {
impl MultiSensorSource for DiskUsageSource {
type Data = DiskUsage;
type Iter<'a> = DiskUsageParser<'a>;
fn read(&mut self) -> Result<Self::Iter<'_>> {
fn read(&mut self) -> Result<impl Iterator<Item = Result<Self::Data>>> {
self.buff.clear();
self.source.rewind().context("error rewinding mounts")?;
self.source

View file

@ -1,7 +1,9 @@
use crate::data::{GpuMemory, GpuUsage};
use crate::data::{GpuMemory, GpuPowerUsage, GpuUsage};
use crate::linux::hwmon::FileSource;
use either::Either;
use std::borrow::Cow;
use std::fs::{read_dir, read_to_string};
use std::iter::empty;
use std::path::PathBuf;
use std::str::FromStr;
use std::sync::atomic::{AtomicU64, Ordering};
@ -12,46 +14,91 @@ use tracing::{info, warn};
pub mod nvidia;
fn read_num<T: FromStr>(path: &str) -> Option<T> {
read_to_string(path).ok()?.trim().parse().ok()
/// A GPU discovered under `/sys/class/drm`.
struct Card {
/// Numeric suffix of the `cardN` directory name.
id: u32,
/// The card's `device` directory, i.e. `/sys/class/drm/cardN/device`.
path: PathBuf,
}
pub fn memory() -> Option<GpuMemory> {
if let Some(nv_mem) = nvidia::memory() {
return Some(nv_mem);
impl Card {
/// Reads the file `name` inside this card's directory and parses its
/// trimmed contents as `T`.
///
/// Returns `None` when the file cannot be read or its contents do not parse.
fn read_num<T: FromStr>(&self, name: &str) -> Option<T> {
    let raw = read_to_string(self.path.join(name)).ok()?;
    raw.trim().parse().ok()
}
// 1 gpu should be enough for everyone
let used = read_num::<u64>("/sys/class/drm/card0/device/mem_info_vram_used")?;
let total = read_num("/sys/class/drm/card0/device/mem_info_vram_total")?;
Some(GpuMemory {
total,
free: total - used,
})
}
/// Enumerates the GPUs listed under `/sys/class/drm`.
///
/// Only entries named `card<N>` with `N` parsing as a `u32` are kept; any
/// other entry is skipped. Yields nothing when the directory itself cannot
/// be read.
fn cards() -> impl Iterator<Item = Card> {
    match read_dir("/sys/class/drm") {
        Err(_) => Either::Left(empty()),
        Ok(entries) => Either::Right(entries.flatten().filter_map(|entry| {
            let mut device_dir = entry.path();
            let suffix = device_dir.file_name()?.to_str()?.strip_prefix("card")?;
            let id: u32 = suffix.parse().ok()?;
            // Per-card attributes live in the `device` sub-directory.
            device_dir.push("device");
            Some(Card {
                id,
                path: device_dir,
            })
        })),
    }
}
/// Reports memory totals for every detected GPU.
///
/// When `nvidia::memory()` returns devices, only those are reported.
/// Otherwise every card from `cards()` that exposes both
/// `mem_info_vram_used` and `mem_info_vram_total` is reported; cards missing
/// either file are skipped.
pub fn memory() -> impl Iterator<Item = GpuMemory> {
    if let Some(nv_mem) = nvidia::memory() {
        return Either::Left(nv_mem);
    }

    Either::Right(cards().filter_map(|card| {
        let used = card.read_num::<u64>("mem_info_vram_used")?;
        let total = card.read_num::<u64>("mem_info_vram_total")?;
        Some(GpuMemory {
            card: card.id,
            total,
            // The two values come from separate reads, so clamp at zero
            // rather than underflowing if `used` ever exceeds `total`.
            free: total.saturating_sub(used),
        })
    }))
}
pub fn utilization() -> impl Iterator<Item = GpuUsage> {
let nv_usage = nvidia::utilization();
cards().flat_map(|card| {
let nv_usage = nvidia::utilization();
let sources = [
(
"memory",
read_num("/sys/class/drm/card0/device/mem_busy_percent"),
),
(
"compute",
read_num("/sys/class/drm/card0/device/gpu_busy_percent"),
),
];
let drm = sources.into_iter().flat_map(|(system, usage)| {
Some(GpuUsage {
system: Cow::Borrowed(system),
usage: usage?,
})
});
drm.chain(nv_usage)
let sources = [
("memory", card.read_num("mem_busy_percent")),
("compute", card.read_num("gpu_busy_percent")),
];
let drm = sources.into_iter().flat_map(move |(system, usage)| {
Some(GpuUsage {
card: card.id,
system: Cow::Borrowed(system),
usage: usage?,
})
});
drm.chain(nv_usage.into_iter().flatten())
})
}
static GPU_POWER_UJ: AtomicU64 = AtomicU64::new(0);
/// Per-GPU accumulated energy counters, one slot per card index.
///
/// Every slot starts at zero. `AtomicU64` is not `Copy`, so the array is
/// built by repeating an inline `const` block instead of spelling out each
/// initializer by hand.
static GPU_POWER_UJ: [AtomicU64; 16] = [const { AtomicU64::new(0) }; 16];
static GPU_POWER_LAST_READ: Mutex<Option<Instant>> = Mutex::new(None);
fn get_gpu_power_elapsed() -> Option<Duration> {
@ -62,33 +109,40 @@ fn get_gpu_power_elapsed() -> Option<Duration> {
elapsed
}
fn find_gpu_sensor() -> Option<PathBuf> {
read_dir("/sys/class/drm/card0/device/hwmon")
.ok()?
.flatten()
.find_map(|hwmon| {
let path = hwmon.path().join("power1_average");
path.exists().then_some(path)
})
/// Locates a `power1_average` file for each card.
///
/// For every card, the entries of its `hwmon` directory are scanned and the
/// first one containing an existing `power1_average` file is yielded. Cards
/// without a readable `hwmon` directory, or without such a file, contribute
/// nothing.
fn find_gpu_sensor() -> impl Iterator<Item = PathBuf> {
    cards().filter_map(|card| {
        let hwmon_entries = read_dir(card.path.join("hwmon")).ok()?;
        hwmon_entries
            .flatten()
            .map(|entry| entry.path().join("power1_average"))
            .find(|candidate| candidate.exists())
    })
}
pub fn update_gpu_power() {
if let Some(Ok(mut file)) = find_gpu_sensor().map(FileSource::open) {
let mut sensors = find_gpu_sensor()
.flat_map(FileSource::open)
.collect::<Vec<_>>();
if !sensors.is_empty() {
loop {
if let Some(elapsed) = get_gpu_power_elapsed() {
let current_power: u64 = match file.read() {
Ok(current_power) => current_power,
Err(_) => {
warn!("failed to read gpu power sensor");
return;
}
};
for (card, sensor) in sensors.iter_mut().enumerate().take(16) {
let current_power: u64 = match sensor.read() {
Ok(current_power) => current_power,
Err(_) => {
warn!("failed to read gpu power sensor");
return;
}
};
let elapsed_milli = elapsed.as_millis() as u64;
let elapsed_milli = elapsed.as_millis() as u64;
let power = current_power * elapsed_milli / 1000;
let power = current_power * elapsed_milli / 1000;
GPU_POWER_UJ.fetch_add(power, Ordering::SeqCst);
GPU_POWER_UJ[card].fetch_add(power, Ordering::SeqCst);
}
}
sleep(Duration::from_millis(500));
}
@ -96,6 +150,14 @@ pub fn update_gpu_power() {
info!("no gpu sensor");
}
pub fn gpu_power() -> u64 {
GPU_POWER_UJ.load(Ordering::SeqCst)
/// Snapshots the per-card energy counters.
///
/// Yields one `GpuPowerUsage` for every slot of `GPU_POWER_UJ` whose current
/// value is non-zero, using the slot's position as the card number.
pub fn gpu_power() -> impl Iterator<Item = GpuPowerUsage> {
    GPU_POWER_UJ
        .iter()
        .enumerate()
        .filter_map(|(slot, counter)| {
            let gpu_uj = counter.load(Ordering::SeqCst);
            (gpu_uj > 0).then_some(GpuPowerUsage {
                card: slot as u32,
                gpu_uj,
            })
        })
}

View file

@ -1,4 +1,4 @@
use crate::data::{GpuMemory, GpuUsage};
use crate::data::{GpuMemory, GpuPowerUsage, GpuUsage};
use nvml_wrapper::enum_wrappers::device::TemperatureSensor;
use nvml_wrapper::{Device, Nvml};
use once_cell::sync::Lazy;
@ -6,32 +6,50 @@ use std::borrow::Cow;
static NVIDIA: Lazy<Option<Nvml>> = Lazy::new(|| Nvml::init().ok());
fn device() -> Option<Device<'static>> {
NVIDIA.as_ref()?.device_by_index(0).ok()
/// Iterates over every NVML device that can be opened.
///
/// Returns `None` when NVML failed to initialise. A failed device-count
/// query is treated as zero devices, and indices whose lookup fails are
/// skipped.
fn devices() -> Option<impl Iterator<Item = Device<'static>>> {
    let nvml = NVIDIA.as_ref()?;
    let total = nvml.device_count().unwrap_or_default();
    Some((0..total).filter_map(device))
}
pub fn temperature() -> Option<f32> {
let temp = device()?.temperature(TemperatureSensor::Gpu).ok()?;
Some(temp as f32)
/// Looks up the NVML device at `index`.
///
/// Returns `None` when NVML failed to initialise or the lookup reports an
/// error.
fn device(index: u32) -> Option<Device<'static>> {
    let nvml = NVIDIA.as_ref()?;
    nvml.device_by_index(index).ok()
}
pub fn power() -> Option<u64> {
device()?
.total_energy_consumption()
.ok()
.map(|mj| mj * 1_000)
/// Reads the GPU temperature sensor of every NVML device.
///
/// Returns `None` when NVML is unavailable; devices whose reading fails are
/// left out of the iterator.
pub fn temperature() -> Option<impl Iterator<Item = f32>> {
    let gpus = devices()?;
    Some(gpus.filter_map(|gpu| {
        let reading = gpu.temperature(TemperatureSensor::Gpu).ok()?;
        Some(reading as f32)
    }))
}
pub fn memory() -> Option<GpuMemory> {
let mem = device()?.memory_info().ok()?;
Some(GpuMemory {
total: mem.total,
free: mem.free,
})
/// Reads the total energy counter of every NVML device.
///
/// Returns `None` when NVML is unavailable; devices whose counter cannot be
/// read are left out. The raw NVML value is multiplied by 1_000 before being
/// stored in `gpu_uj`. If a device's index cannot be queried it is reported
/// as card 0.
pub fn power() -> Option<impl Iterator<Item = GpuPowerUsage>> {
    let gpus = devices()?;
    Some(gpus.filter_map(|gpu| {
        let raw_energy = gpu.total_energy_consumption().ok()?;
        let gpu_uj = raw_energy * 1_000;
        Some(GpuPowerUsage {
            card: gpu.index().unwrap_or_default(),
            gpu_uj,
        })
    }))
}
pub fn utilization() -> impl Iterator<Item = GpuUsage> {
let sources = if let Some(device) = device() {
/// Reads total and free memory of every NVML device.
///
/// Returns `None` when NVML is unavailable; devices whose memory info cannot
/// be read are left out. If a device's index cannot be queried it is
/// reported as card 0.
pub fn memory() -> Option<impl Iterator<Item = GpuMemory>> {
    let gpus = devices()?;
    Some(gpus.filter_map(|gpu| {
        let info = gpu.memory_info().ok()?;
        let card = gpu.index().unwrap_or_default();
        Some(GpuMemory {
            card,
            total: info.total,
            free: info.free,
        })
    }))
}
pub fn utilization() -> Option<impl Iterator<Item = GpuUsage>> {
let sources = devices()?.flat_map(|device| {
let utilization = device.utilization_rates().ok();
[
("compute", utilization.as_ref().map(|u| u.gpu)),
@ -45,13 +63,12 @@ pub fn utilization() -> impl Iterator<Item = GpuUsage> {
device.decoder_utilization().ok().map(|u| u.utilization),
),
]
} else {
[("", None); 4]
};
sources.into_iter().flat_map(|(system, usage)| {
});
Some(sources.into_iter().flat_map(|(system, usage)| {
Some(GpuUsage {
card: 0,
system: Cow::Borrowed(system),
usage: usage?,
})
})
}))
}

View file

@ -1,4 +1,5 @@
use std::fs::{File, read_dir, read_to_string};
use std::fmt::Debug;
use std::fs::{read_dir, read_to_string, File};
use std::io;
use std::io::{ErrorKind, Read, Seek};
use std::path::{Path, PathBuf};
@ -11,13 +12,20 @@ fn read_to_string_trimmed(path: &Path) -> io::Result<String> {
s.truncate(len);
Ok(s)
}
/// Handle to a file on disk, created with `FileSource::open`.
pub struct FileSource {
/// Path of the file; the only field included in the `Debug` output.
path: PathBuf,
/// String buffer; presumably scratch space reused between reads (TODO confirm).
buff: String,
/// The underlying file handle.
file: File,
}
impl Debug for FileSource {
    /// Formats the source as a non-exhaustive struct showing only `path`;
    /// the buffer and the file handle are left out.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut repr = f.debug_struct("FileSource");
        repr.field("path", &self.path);
        repr.finish_non_exhaustive()
    }
}
impl FileSource {
#[instrument(skip_all, fields(path = ?path.as_ref()))]
pub fn open<P: AsRef<Path>>(path: P) -> io::Result<FileSource> {

View file

@ -12,7 +12,7 @@ use crate::linux::disk::zfs::arcstats;
use crate::linux::gpu::{update_gpu_power, utilization};
use crate::linux::power::{CpuPowerSource, GpuPowerSource};
use crate::linux::proc::ProcSource;
use crate::{hostname, Error, MultiSensorSource, Result, SensorData, SensorSource};
use crate::{Error, MultiSensorSource, Result, SensorData, SensorSource, hostname};
use std::fmt::Write;
use std::sync::Mutex;
use sysconf::SysconfError;
@ -65,7 +65,7 @@ pub fn get_metrics(sensors: &Sensors) -> Result<String> {
let memory = sensors.mem.lock().unwrap().read()?;
let temperatures = sensors.temp.lock().unwrap().read()?;
let cpu_power = sensors.cpu_power.lock().unwrap().read()?;
let gpu_power = sensors.gpu_power.lock().unwrap().read()?;
let mut gpu_power = sensors.gpu_power.lock().unwrap();
let mut net = sensors.net.lock().unwrap();
let mut proc = sensors.proc.lock().unwrap();
let networks = net.read()?;
@ -111,11 +111,14 @@ pub fn get_metrics(sensors: &Sensors) -> Result<String> {
}
cpu_power.write(&mut result, &sensors.hostname);
gpu_power.write(&mut result, &sensors.hostname);
for gpu_power in gpu_power.read()? {
gpu_power?.write(&mut result, &sensors.hostname);
}
if let Some(arc) = arcstats() {
arc.write(&mut result, &sensors.hostname);
}
if let Some(memory) = gpu::memory() {
for memory in gpu::memory() {
memory.write(&mut result, &sensors.hostname)
}

View file

@ -1,7 +1,9 @@
use either::Either;
use crate::data::{CpuPowerUsage, GpuPowerUsage};
use crate::linux::gpu::gpu_power;
use crate::linux::hwmon::FileSource;
use crate::{IoResultExt, Result, SensorSource};
use crate::{IoResultExt, MultiSensorSource, Result, SensorSource};
use std::fs::read_dir;
#[derive(Default)]
@ -49,11 +51,13 @@ impl SensorSource for CpuPowerSource {
#[derive(Default)]
pub struct GpuPowerSource;
impl SensorSource for GpuPowerSource {
impl MultiSensorSource for GpuPowerSource {
type Data = GpuPowerUsage;
fn read(&mut self) -> Result<Self::Data> {
let gpu_uj = crate::linux::gpu::nvidia::power().unwrap_or_else(gpu_power);
Ok(GpuPowerUsage { gpu_uj })
fn read(&mut self) -> Result<impl Iterator<Item = Result<Self::Data>>> {
Ok(crate::linux::gpu::nvidia::power()
.map(Either::Left)
.unwrap_or_else(|| Either::Right(gpu_power()))
.map(Ok))
}
}

View file

@ -3,7 +3,6 @@ use crate::linux::sensors::MemorySource;
use crate::{MultiSensorSource, Result, SensorSource};
use procfs::page_size;
use procfs::process::all_processes;
use std::vec::IntoIter;
#[derive(Default)]
pub struct ProcSource {
@ -26,9 +25,8 @@ impl ProcSource {
impl MultiSensorSource for ProcSource {
type Data = ProcData;
type Iter<'a> = IntoIter<Result<ProcData>>;
fn read(&mut self) -> Result<Self::Iter<'_>> {
fn read(&mut self) -> Result<impl Iterator<Item = Result<Self::Data>>> {
Ok(all_processes()?
.flatten()
.flat_map(|proc| proc.stat())

View file

@ -4,7 +4,7 @@ use crate::{Error, IoResultExt, MultiSensorSource, Result, SensorSource};
use std::fs::File;
use std::io;
use std::io::{BufRead, BufReader, ErrorKind, Read, Seek};
use sysconf::{sysconf, SysconfVariable};
use sysconf::{SysconfVariable, sysconf};
pub struct TemperatureSource {
cpu_sensors: Vec<FileSource>,
@ -70,11 +70,19 @@ impl SensorSource for TemperatureSource {
fn read(&mut self) -> Result<Self::Data> {
let mut result = Temperatures {
cpu: average_sensors(&mut self.cpu_sensors) / 1000.0,
gpu: average_sensors(&mut self.gpu_sensors) / 1000.0,
gpu: self
.gpu_sensors
.iter_mut()
.flat_map(|sensor| sensor.read::<f32>())
.max_by(f32::total_cmp)
.unwrap_or_default()
/ 1000.0,
};
if let Some(gpu) = super::gpu::nvidia::temperature() {
result.gpu = gpu;
if let Some(gpu) = super::gpu::nvidia::temperature()
&& let Some(temp) = gpu.max_by(f32::total_cmp)
{
result.gpu = temp
}
Ok(result)
@ -224,9 +232,8 @@ impl NetworkSource {
impl MultiSensorSource for NetworkSource {
type Data = NetStats;
type Iter<'a> = NetworkStatParser<'a>;
fn read(&mut self) -> Result<Self::Iter<'_>> {
fn read(&mut self) -> Result<impl Iterator<Item = Result<Self::Data>>> {
self.buff.clear();
let mut source = File::open("/proc/net/dev").context("error opening netdev")?;
source

View file

@ -52,9 +52,9 @@ pub fn get_metrics(sensors: &Sensors) -> Result<String> {
let mut disks = sensors.disks.lock().unwrap();
system.refresh_all();
networks.refresh();
components.refresh();
disks.refresh();
networks.refresh(true);
components.refresh(true);
disks.refresh(true);
let hostname = &sensors.hostname;
let mut result = String::with_capacity(256);
@ -86,6 +86,7 @@ pub fn get_metrics(sensors: &Sensors) -> Result<String> {
let gpu_mem_used = WMI.with(|wmi| wmi.gpu_mem())?;
let gpu_mem = GpuMemory {
card: 0,
total: sensors.gpu_mem_total,
free: sensors.gpu_mem_total - gpu_mem_used,
};
@ -94,6 +95,7 @@ pub fn get_metrics(sensors: &Sensors) -> Result<String> {
let gpu_engines = WMI.with(|wmi| wmi.gpu_usage())?;
for (name, usage) in gpu_engines.into_iter() {
let gpu_usage = GpuUsage {
card: 0,
system: Cow::Owned(name),
usage,
};

View file

@ -6,7 +6,7 @@ use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Mutex;
use std::thread::sleep;
use std::time::{Duration, Instant};
use wmi::{COMLibrary, WMIConnection};
use wmi::WMIConnection;
pub struct WmiSensor {
wmi_con: WMIConnection,
@ -15,10 +15,8 @@ pub struct WmiSensor {
impl WmiSensor {
pub fn new() -> Result<Self> {
let com_con = COMLibrary::new()?;
let wmi_con = WMIConnection::new(com_con)?;
let wmi_hwmon_con =
WMIConnection::with_namespace_path("ROOT\\LibreHardwareMonitor", com_con).ok();
let wmi_con = WMIConnection::new()?;
let wmi_hwmon_con = WMIConnection::with_namespace_path("ROOT\\LibreHardwareMonitor").ok();
Ok(WmiSensor {
wmi_con,
@ -155,10 +153,7 @@ fn get_sensor(sensors: &[Sensor], ty: &str, name: &str) -> Option<f32> {
}
pub fn update_power() {
let Ok(com_con) = COMLibrary::new() else {
return;
};
if let Ok(wmi_con) = WMIConnection::with_namespace_path("ROOT\\LibreHardwareMonitor", com_con) {
if let Ok(wmi_con) = WMIConnection::with_namespace_path("ROOT\\LibreHardwareMonitor") {
loop {
if let Some(elapsed) = get_power_elapsed() {
let Ok(sensors) = wmi_con.query::<Sensor>() else {
@ -194,6 +189,7 @@ pub fn cpu_power() -> CpuPowerUsage {
/// Snapshots the accumulated GPU energy counter.
///
/// The reading is always attributed to card 0.
pub fn gpu_power() -> GpuPowerUsage {
    let gpu_uj = GPU_POWER_UJ.load(Ordering::SeqCst);
    GpuPowerUsage { card: 0, gpu_uj }
}