multi gpu

This commit is contained in:
Robin Appelman 2026-04-06 23:50:41 +02:00
commit 217933c1a1
15 changed files with 234 additions and 129 deletions

View file

@ -29,9 +29,8 @@ impl DiskStatSource {
impl MultiSensorSource for DiskStatSource {
type Data = DiskStats;
type Iter<'a> = DiskStatParser<'a>;
fn read(&mut self) -> Result<Self::Iter<'_>> {
fn read(&mut self) -> Result<impl Iterator<Item = Result<Self::Data>>> {
self.buff.clear();
self.source.rewind().context("error rewinding disk stats")?;
self.source
@ -93,9 +92,8 @@ impl DiskUsageSource {
impl MultiSensorSource for DiskUsageSource {
type Data = DiskUsage;
type Iter<'a> = DiskUsageParser<'a>;
fn read(&mut self) -> Result<Self::Iter<'_>> {
fn read(&mut self) -> Result<impl Iterator<Item = Result<Self::Data>>> {
self.buff.clear();
self.source.rewind().context("error rewinding mounts")?;
self.source

View file

@ -1,7 +1,9 @@
use crate::data::{GpuMemory, GpuUsage};
use crate::data::{GpuMemory, GpuPowerUsage, GpuUsage};
use crate::linux::hwmon::FileSource;
use either::Either;
use std::borrow::Cow;
use std::fs::{read_dir, read_to_string};
use std::iter::empty;
use std::path::PathBuf;
use std::str::FromStr;
use std::sync::atomic::{AtomicU64, Ordering};
@ -12,46 +14,91 @@ use tracing::{info, warn};
pub mod nvidia;
fn read_num<T: FromStr>(path: &str) -> Option<T> {
read_to_string(path).ok()?.trim().parse().ok()
/// One DRM card discovered under `/sys/class/drm` by `cards()`.
struct Card {
// Numeric suffix parsed from the `cardN` directory name.
id: u32,
// The card's `device` subdirectory; attribute files are read relative to it.
path: PathBuf,
}
pub fn memory() -> Option<GpuMemory> {
if let Some(nv_mem) = nvidia::memory() {
return Some(nv_mem);
impl Card {
/// Reads the attribute file `name` inside this card's device directory and
/// parses its whitespace-trimmed contents as `T`.
///
/// Returns `None` when the file cannot be read or its contents do not parse.
fn read_num<T: FromStr>(&self, name: &str) -> Option<T> {
    let raw = read_to_string(self.path.join(name)).ok()?;
    raw.trim().parse().ok()
}
// 1 gpu should be enough for everyone
let used = read_num::<u64>("/sys/class/drm/card0/device/mem_info_vram_used")?;
let total = read_num("/sys/class/drm/card0/device/mem_info_vram_total")?;
Some(GpuMemory {
total,
free: total - used,
})
}
/// Enumerates the DRM cards present under `/sys/class/drm`.
///
/// Only entries whose name is exactly `card` followed by a `u32` are kept, so
/// names with any other suffix are skipped. Unreadable directory entries are
/// ignored, and an unreadable `/sys/class/drm` yields an empty iterator.
fn cards() -> impl Iterator<Item = Card> {
    match read_dir("/sys/class/drm") {
        Err(_) => Either::Left(empty()),
        Ok(entries) => Either::Right(entries.filter_map(Result::ok).filter_map(|entry| {
            let card_dir = entry.path();
            let name = card_dir.file_name()?.to_str()?;
            let id = name.strip_prefix("card")?.parse::<u32>().ok()?;
            Some(Card {
                id,
                path: card_dir.join("device"),
            })
        })),
    }
}
/// Reports video-memory totals for every GPU that can be queried.
///
/// When `nvidia::memory()` returns `Some`, its results are used exclusively.
/// Otherwise each card found by `cards()` is probed for `mem_info_vram_used`
/// and `mem_info_vram_total`; a card is skipped when either file is missing
/// or does not parse.
pub fn memory() -> impl Iterator<Item = GpuMemory> {
    if let Some(nv_mem) = nvidia::memory() {
        return Either::Left(nv_mem);
    }
    Either::Right(cards().filter_map(|card| {
        let used = card.read_num::<u64>("mem_info_vram_used")?;
        let total: u64 = card.read_num("mem_info_vram_total")?;
        Some(GpuMemory {
            card: card.id,
            total,
            // The two values come from separate, non-atomic file reads; clamp
            // at zero rather than panicking (debug) or wrapping (release) if
            // `used` ever exceeds `total`.
            free: total.saturating_sub(used),
        })
    }))
}
pub fn utilization() -> impl Iterator<Item = GpuUsage> {
let nv_usage = nvidia::utilization();
cards().flat_map(|card| {
let nv_usage = nvidia::utilization();
let sources = [
(
"memory",
read_num("/sys/class/drm/card0/device/mem_busy_percent"),
),
(
"compute",
read_num("/sys/class/drm/card0/device/gpu_busy_percent"),
),
];
let drm = sources.into_iter().flat_map(|(system, usage)| {
Some(GpuUsage {
system: Cow::Borrowed(system),
usage: usage?,
})
});
drm.chain(nv_usage)
let sources = [
("memory", card.read_num("mem_busy_percent")),
("compute", card.read_num("gpu_busy_percent")),
];
let drm = sources.into_iter().flat_map(move |(system, usage)| {
Some(GpuUsage {
card: card.id,
system: Cow::Borrowed(system),
usage: usage?,
})
});
drm.chain(nv_usage.into_iter().flatten())
})
}
static GPU_POWER_UJ: AtomicU64 = AtomicU64::new(0);
/// Sixteen zero-initialised counters, one slot per GPU power sensor.
///
/// `update_gpu_power` adds to the slot matching a sensor's enumeration
/// position, and `gpu_power` reports the non-zero slots.
static GPU_POWER_UJ: [AtomicU64; 16] = [const { AtomicU64::new(0) }; 16];
static GPU_POWER_LAST_READ: Mutex<Option<Instant>> = Mutex::new(None);
fn get_gpu_power_elapsed() -> Option<Duration> {
@ -62,33 +109,40 @@ fn get_gpu_power_elapsed() -> Option<Duration> {
elapsed
}
fn find_gpu_sensor() -> Option<PathBuf> {
read_dir("/sys/class/drm/card0/device/hwmon")
.ok()?
.flatten()
.find_map(|hwmon| {
let path = hwmon.path().join("power1_average");
path.exists().then_some(path)
})
/// Yields, for each card from `cards()`, the first existing
/// `hwmon/*/power1_average` file under its device directory.
///
/// Cards without a readable `hwmon` directory or without such a file produce
/// nothing. Only the path is yielded; the card id is not carried along.
fn find_gpu_sensor() -> impl Iterator<Item = PathBuf> {
    cards().filter_map(|card| {
        let hwmon_dir = card.path.join("hwmon");
        for hwmon in read_dir(hwmon_dir).ok()?.flatten() {
            let candidate = hwmon.path().join("power1_average");
            if candidate.exists() {
                return Some(candidate);
            }
        }
        None
    })
}
pub fn update_gpu_power() {
if let Some(Ok(mut file)) = find_gpu_sensor().map(FileSource::open) {
let mut sensors = find_gpu_sensor()
.flat_map(FileSource::open)
.collect::<Vec<_>>();
if !sensors.is_empty() {
loop {
if let Some(elapsed) = get_gpu_power_elapsed() {
let current_power: u64 = match file.read() {
Ok(current_power) => current_power,
Err(_) => {
warn!("failed to read gpu power sensor");
return;
}
};
for (card, sensor) in sensors.iter_mut().enumerate().take(16) {
let current_power: u64 = match sensor.read() {
Ok(current_power) => current_power,
Err(_) => {
warn!("failed to read gpu power sensor");
return;
}
};
let elapsed_milli = elapsed.as_millis() as u64;
let elapsed_milli = elapsed.as_millis() as u64;
let power = current_power * elapsed_milli / 1000;
let power = current_power * elapsed_milli / 1000;
GPU_POWER_UJ.fetch_add(power, Ordering::SeqCst);
GPU_POWER_UJ[card].fetch_add(power, Ordering::SeqCst);
}
}
sleep(Duration::from_millis(500));
}
@ -96,6 +150,14 @@ pub fn update_gpu_power() {
info!("no gpu sensor");
}
pub fn gpu_power() -> u64 {
GPU_POWER_UJ.load(Ordering::SeqCst)
/// Snapshots the per-slot counters in `GPU_POWER_UJ`.
///
/// Slots whose value is still zero are omitted; the reported `card` is the
/// slot's index in the array.
pub fn gpu_power() -> impl Iterator<Item = GpuPowerUsage> {
    (0u32..)
        .zip(GPU_POWER_UJ.iter())
        .filter_map(|(card, counter)| {
            let gpu_uj = counter.load(Ordering::SeqCst);
            (gpu_uj > 0).then_some(GpuPowerUsage { card, gpu_uj })
        })
}

View file

@ -1,4 +1,4 @@
use crate::data::{GpuMemory, GpuUsage};
use crate::data::{GpuMemory, GpuPowerUsage, GpuUsage};
use nvml_wrapper::enum_wrappers::device::TemperatureSensor;
use nvml_wrapper::{Device, Nvml};
use once_cell::sync::Lazy;
@ -6,32 +6,50 @@ use std::borrow::Cow;
static NVIDIA: Lazy<Option<Nvml>> = Lazy::new(|| Nvml::init().ok());
fn device() -> Option<Device<'static>> {
NVIDIA.as_ref()?.device_by_index(0).ok()
/// Iterates over every NVML device that can be opened.
///
/// Returns `None` when NVML failed to initialise. A failing device-count
/// query is treated as zero devices, and indices that cannot be opened are
/// skipped.
fn devices() -> Option<impl Iterator<Item = Device<'static>>> {
    let nvml = NVIDIA.as_ref()?;
    let total = nvml.device_count().unwrap_or_default();
    Some((0..total).filter_map(device))
}
pub fn temperature() -> Option<f32> {
let temp = device()?.temperature(TemperatureSensor::Gpu).ok()?;
Some(temp as f32)
/// Opens the NVML device at `index`, or `None` if NVML is unavailable or the
/// lookup fails.
fn device(index: u32) -> Option<Device<'static>> {
    let nvml = NVIDIA.as_ref()?;
    nvml.device_by_index(index).ok()
}
pub fn power() -> Option<u64> {
device()?
.total_energy_consumption()
.ok()
.map(|mj| mj * 1_000)
/// GPU temperature sensor readings for every NVML device, as `f32`.
///
/// Returns `None` when NVML is unavailable; devices whose reading fails are
/// left out of the iterator.
pub fn temperature() -> Option<impl Iterator<Item = f32>> {
    let gpus = devices()?;
    Some(gpus.filter_map(|gpu| {
        let reading = gpu.temperature(TemperatureSensor::Gpu).ok()?;
        Some(reading as f32)
    }))
}
pub fn memory() -> Option<GpuMemory> {
let mem = device()?.memory_info().ok()?;
Some(GpuMemory {
total: mem.total,
free: mem.free,
})
/// Total energy consumption per NVML device, scaled by 1000 into `gpu_uj`.
///
/// Returns `None` when NVML is unavailable; devices whose energy counter
/// cannot be read are skipped. A device whose index lookup fails is reported
/// as card 0.
pub fn power() -> Option<impl Iterator<Item = GpuPowerUsage>> {
    let gpus = devices()?;
    Some(gpus.filter_map(|gpu| {
        let millijoules = gpu.total_energy_consumption().ok()?;
        let gpu_uj = millijoules * 1_000;
        Some(GpuPowerUsage {
            card: gpu.index().unwrap_or_default(),
            gpu_uj,
        })
    }))
}
pub fn utilization() -> impl Iterator<Item = GpuUsage> {
let sources = if let Some(device) = device() {
/// Total and free memory per NVML device.
///
/// Returns `None` when NVML is unavailable; devices whose memory info cannot
/// be read are skipped. A device whose index lookup fails is reported as
/// card 0.
pub fn memory() -> Option<impl Iterator<Item = GpuMemory>> {
    let gpus = devices()?;
    Some(gpus.filter_map(|gpu| match gpu.memory_info() {
        Ok(info) => Some(GpuMemory {
            card: gpu.index().unwrap_or_default(),
            total: info.total,
            free: info.free,
        }),
        Err(_) => None,
    }))
}
pub fn utilization() -> Option<impl Iterator<Item = GpuUsage>> {
let sources = devices()?.flat_map(|device| {
let utilization = device.utilization_rates().ok();
[
("compute", utilization.as_ref().map(|u| u.gpu)),
@ -45,13 +63,12 @@ pub fn utilization() -> impl Iterator<Item = GpuUsage> {
device.decoder_utilization().ok().map(|u| u.utilization),
),
]
} else {
[("", None); 4]
};
sources.into_iter().flat_map(|(system, usage)| {
});
Some(sources.into_iter().flat_map(|(system, usage)| {
Some(GpuUsage {
card: 0,
system: Cow::Borrowed(system),
usage: usage?,
})
})
}))
}

View file

@ -1,4 +1,5 @@
use std::fs::{File, read_dir, read_to_string};
use std::fmt::Debug;
use std::fs::{read_dir, read_to_string, File};
use std::io;
use std::io::{ErrorKind, Read, Seek};
use std::path::{Path, PathBuf};
@ -11,13 +12,20 @@ fn read_to_string_trimmed(path: &Path) -> io::Result<String> {
s.truncate(len);
Ok(s)
}
/// A file opened via `FileSource::open`, used elsewhere for sensor files.
pub struct FileSource {
// The only field included in the `Debug` output.
path: PathBuf,
// NOTE(review): presumably a reusable read buffer — confirm against `read`.
buff: String,
// NOTE(review): presumably the open handle that is re-read — confirm.
file: File,
}
/// Debug output shows only `path` and marks the struct as non-exhaustive, so
/// the buffer and file handle are omitted.
impl Debug for FileSource {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut repr = f.debug_struct("FileSource");
        repr.field("path", &self.path);
        repr.finish_non_exhaustive()
    }
}
impl FileSource {
#[instrument(skip_all, fields(path = ?path.as_ref()))]
pub fn open<P: AsRef<Path>>(path: P) -> io::Result<FileSource> {

View file

@ -12,7 +12,7 @@ use crate::linux::disk::zfs::arcstats;
use crate::linux::gpu::{update_gpu_power, utilization};
use crate::linux::power::{CpuPowerSource, GpuPowerSource};
use crate::linux::proc::ProcSource;
use crate::{hostname, Error, MultiSensorSource, Result, SensorData, SensorSource};
use crate::{Error, MultiSensorSource, Result, SensorData, SensorSource, hostname};
use std::fmt::Write;
use std::sync::Mutex;
use sysconf::SysconfError;
@ -65,7 +65,7 @@ pub fn get_metrics(sensors: &Sensors) -> Result<String> {
let memory = sensors.mem.lock().unwrap().read()?;
let temperatures = sensors.temp.lock().unwrap().read()?;
let cpu_power = sensors.cpu_power.lock().unwrap().read()?;
let gpu_power = sensors.gpu_power.lock().unwrap().read()?;
let mut gpu_power = sensors.gpu_power.lock().unwrap();
let mut net = sensors.net.lock().unwrap();
let mut proc = sensors.proc.lock().unwrap();
let networks = net.read()?;
@ -111,11 +111,14 @@ pub fn get_metrics(sensors: &Sensors) -> Result<String> {
}
cpu_power.write(&mut result, &sensors.hostname);
gpu_power.write(&mut result, &sensors.hostname);
for gpu_power in gpu_power.read()? {
gpu_power?.write(&mut result, &sensors.hostname);
}
if let Some(arc) = arcstats() {
arc.write(&mut result, &sensors.hostname);
}
if let Some(memory) = gpu::memory() {
for memory in gpu::memory() {
memory.write(&mut result, &sensors.hostname)
}

View file

@ -1,7 +1,9 @@
use either::Either;
use crate::data::{CpuPowerUsage, GpuPowerUsage};
use crate::linux::gpu::gpu_power;
use crate::linux::hwmon::FileSource;
use crate::{IoResultExt, Result, SensorSource};
use crate::{IoResultExt, MultiSensorSource, Result, SensorSource};
use std::fs::read_dir;
#[derive(Default)]
@ -49,11 +51,13 @@ impl SensorSource for CpuPowerSource {
#[derive(Default)]
pub struct GpuPowerSource;
impl SensorSource for GpuPowerSource {
impl MultiSensorSource for GpuPowerSource {
type Data = GpuPowerUsage;
fn read(&mut self) -> Result<Self::Data> {
let gpu_uj = crate::linux::gpu::nvidia::power().unwrap_or_else(gpu_power);
Ok(GpuPowerUsage { gpu_uj })
fn read(&mut self) -> Result<impl Iterator<Item = Result<Self::Data>>> {
Ok(crate::linux::gpu::nvidia::power()
.map(Either::Left)
.unwrap_or_else(|| Either::Right(gpu_power()))
.map(Ok))
}
}

View file

@ -3,7 +3,6 @@ use crate::linux::sensors::MemorySource;
use crate::{MultiSensorSource, Result, SensorSource};
use procfs::page_size;
use procfs::process::all_processes;
use std::vec::IntoIter;
#[derive(Default)]
pub struct ProcSource {
@ -26,9 +25,8 @@ impl ProcSource {
impl MultiSensorSource for ProcSource {
type Data = ProcData;
type Iter<'a> = IntoIter<Result<ProcData>>;
fn read(&mut self) -> Result<Self::Iter<'_>> {
fn read(&mut self) -> Result<impl Iterator<Item = Result<Self::Data>>> {
Ok(all_processes()?
.flatten()
.flat_map(|proc| proc.stat())

View file

@ -4,7 +4,7 @@ use crate::{Error, IoResultExt, MultiSensorSource, Result, SensorSource};
use std::fs::File;
use std::io;
use std::io::{BufRead, BufReader, ErrorKind, Read, Seek};
use sysconf::{sysconf, SysconfVariable};
use sysconf::{SysconfVariable, sysconf};
pub struct TemperatureSource {
cpu_sensors: Vec<FileSource>,
@ -70,11 +70,19 @@ impl SensorSource for TemperatureSource {
fn read(&mut self) -> Result<Self::Data> {
let mut result = Temperatures {
cpu: average_sensors(&mut self.cpu_sensors) / 1000.0,
gpu: average_sensors(&mut self.gpu_sensors) / 1000.0,
gpu: self
.gpu_sensors
.iter_mut()
.flat_map(|sensor| sensor.read::<f32>())
.max_by(f32::total_cmp)
.unwrap_or_default()
/ 1000.0,
};
if let Some(gpu) = super::gpu::nvidia::temperature() {
result.gpu = gpu;
if let Some(gpu) = super::gpu::nvidia::temperature()
&& let Some(temp) = gpu.max_by(f32::total_cmp)
{
result.gpu = temp
}
Ok(result)
@ -224,9 +232,8 @@ impl NetworkSource {
impl MultiSensorSource for NetworkSource {
type Data = NetStats;
type Iter<'a> = NetworkStatParser<'a>;
fn read(&mut self) -> Result<Self::Iter<'_>> {
fn read(&mut self) -> Result<impl Iterator<Item = Result<Self::Data>>> {
self.buff.clear();
let mut source = File::open("/proc/net/dev").context("error opening netdev")?;
source