mirror of
https://codeberg.org/icewind/palantir.git
synced 2026-06-03 18:24:08 +02:00
multi gpu
This commit is contained in:
parent
22c82c59af
commit
217933c1a1
15 changed files with 234 additions and 129 deletions
|
|
@ -29,9 +29,8 @@ impl DiskStatSource {
|
|||
|
||||
impl MultiSensorSource for DiskStatSource {
|
||||
type Data = DiskStats;
|
||||
type Iter<'a> = DiskStatParser<'a>;
|
||||
|
||||
fn read(&mut self) -> Result<Self::Iter<'_>> {
|
||||
fn read(&mut self) -> Result<impl Iterator<Item = Result<Self::Data>>> {
|
||||
self.buff.clear();
|
||||
self.source.rewind().context("error rewinding disk stats")?;
|
||||
self.source
|
||||
|
|
@ -93,9 +92,8 @@ impl DiskUsageSource {
|
|||
|
||||
impl MultiSensorSource for DiskUsageSource {
|
||||
type Data = DiskUsage;
|
||||
type Iter<'a> = DiskUsageParser<'a>;
|
||||
|
||||
fn read(&mut self) -> Result<Self::Iter<'_>> {
|
||||
fn read(&mut self) -> Result<impl Iterator<Item = Result<Self::Data>>> {
|
||||
self.buff.clear();
|
||||
self.source.rewind().context("error rewinding mounts")?;
|
||||
self.source
|
||||
|
|
|
|||
|
|
@ -1,7 +1,9 @@
|
|||
use crate::data::{GpuMemory, GpuUsage};
|
||||
use crate::data::{GpuMemory, GpuPowerUsage, GpuUsage};
|
||||
use crate::linux::hwmon::FileSource;
|
||||
use either::Either;
|
||||
use std::borrow::Cow;
|
||||
use std::fs::{read_dir, read_to_string};
|
||||
use std::iter::empty;
|
||||
use std::path::PathBuf;
|
||||
use std::str::FromStr;
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
|
|
@ -12,46 +14,91 @@ use tracing::{info, warn};
|
|||
|
||||
pub mod nvidia;
|
||||
|
||||
/// A DRM card discovered under sysfs.
#[derive(Debug)]
struct Card {
    /// Numeric suffix of the `cardN` directory name.
    id: u32,
    /// Directory that the per-card attribute files are read from.
    path: PathBuf,
}

impl Card {
    /// Reads the attribute file `name` inside this card's directory and parses
    /// its whitespace-trimmed contents as `T`.
    ///
    /// Returns `None` when the file cannot be read or its contents do not
    /// parse; callers treat a missing attribute as "not supported".
    fn read_num<T: FromStr>(&self, name: &str) -> Option<T> {
        let raw = read_to_string(self.path.join(name)).ok()?;
        raw.trim().parse().ok()
    }
}
|
||||
|
||||
fn cards() -> impl Iterator<Item = Card> {
|
||||
let Ok(dir) = read_dir("/sys/class/drm") else {
|
||||
return Either::Left(empty());
|
||||
};
|
||||
Either::Right(dir.flatten().flat_map(|entry| {
|
||||
let mut path = entry.path();
|
||||
let id: u32 = path
|
||||
.file_name()?
|
||||
.to_str()?
|
||||
.strip_prefix("card")?
|
||||
.parse()
|
||||
.ok()?;
|
||||
path.push("device");
|
||||
Some(Card { id, path })
|
||||
}))
|
||||
}
|
||||
|
||||
pub fn memory() -> impl Iterator<Item = GpuMemory> {
|
||||
if let Some(nv_mem) = nvidia::memory() {
|
||||
return Either::Left(nv_mem);
|
||||
}
|
||||
|
||||
Either::Right(cards().flat_map(|card| {
|
||||
let used = card.read_num::<u64>("mem_info_vram_used")?;
|
||||
let total = card.read_num("mem_info_vram_total")?;
|
||||
Some(GpuMemory {
|
||||
card: card.id,
|
||||
total,
|
||||
free: total - used,
|
||||
})
|
||||
}))
|
||||
}
|
||||
|
||||
pub fn utilization() -> impl Iterator<Item = GpuUsage> {
|
||||
let nv_usage = nvidia::utilization();
|
||||
cards().flat_map(|card| {
|
||||
let nv_usage = nvidia::utilization();
|
||||
|
||||
let sources = [
|
||||
(
|
||||
"memory",
|
||||
read_num("/sys/class/drm/card0/device/mem_busy_percent"),
|
||||
),
|
||||
(
|
||||
"compute",
|
||||
read_num("/sys/class/drm/card0/device/gpu_busy_percent"),
|
||||
),
|
||||
];
|
||||
let drm = sources.into_iter().flat_map(|(system, usage)| {
|
||||
Some(GpuUsage {
|
||||
system: Cow::Borrowed(system),
|
||||
usage: usage?,
|
||||
})
|
||||
});
|
||||
drm.chain(nv_usage)
|
||||
let sources = [
|
||||
("memory", card.read_num("mem_busy_percent")),
|
||||
("compute", card.read_num("gpu_busy_percent")),
|
||||
];
|
||||
let drm = sources.into_iter().flat_map(move |(system, usage)| {
|
||||
Some(GpuUsage {
|
||||
card: card.id,
|
||||
system: Cow::Borrowed(system),
|
||||
usage: usage?,
|
||||
})
|
||||
});
|
||||
drm.chain(nv_usage.into_iter().flatten())
|
||||
})
|
||||
}
|
||||
|
||||
/// Per-GPU accumulated energy counters (microjoules, going by the name), one
/// slot per card index, all starting at zero.
///
/// The table has a fixed capacity of 16 slots.
static GPU_POWER_UJ: [AtomicU64; 16] = {
    // Atomics are not `Copy`, so the repeat expression needs a `const` item to
    // stamp out an independent counter for every slot.
    #[allow(clippy::declare_interior_mutable_const)] // only used as the array initializer
    const ZERO: AtomicU64 = AtomicU64::new(0);
    [ZERO; 16]
};
|
||||
/// Mutex-guarded timestamp slot, empty until first written.
///
/// NOTE(review): presumably the instant of the previous power sample, used by
/// `get_gpu_power_elapsed` to measure the sampling interval — confirm against
/// that function's body.
static GPU_POWER_LAST_READ: Mutex<Option<Instant>> = Mutex::new(None);
|
||||
|
||||
fn get_gpu_power_elapsed() -> Option<Duration> {
|
||||
|
|
@ -62,33 +109,40 @@ fn get_gpu_power_elapsed() -> Option<Duration> {
|
|||
elapsed
|
||||
}
|
||||
|
||||
fn find_gpu_sensor() -> Option<PathBuf> {
|
||||
read_dir("/sys/class/drm/card0/device/hwmon")
|
||||
.ok()?
|
||||
.flatten()
|
||||
.find_map(|hwmon| {
|
||||
let path = hwmon.path().join("power1_average");
|
||||
path.exists().then_some(path)
|
||||
})
|
||||
fn find_gpu_sensor() -> impl Iterator<Item = PathBuf> {
|
||||
cards().flat_map(|card| {
|
||||
read_dir(card.path.join("hwmon"))
|
||||
.ok()?
|
||||
.flatten()
|
||||
.find_map(|hwmon| {
|
||||
let path = hwmon.path().join("power1_average");
|
||||
path.exists().then_some(path)
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
pub fn update_gpu_power() {
|
||||
if let Some(Ok(mut file)) = find_gpu_sensor().map(FileSource::open) {
|
||||
let mut sensors = find_gpu_sensor()
|
||||
.flat_map(FileSource::open)
|
||||
.collect::<Vec<_>>();
|
||||
if !sensors.is_empty() {
|
||||
loop {
|
||||
if let Some(elapsed) = get_gpu_power_elapsed() {
|
||||
let current_power: u64 = match file.read() {
|
||||
Ok(current_power) => current_power,
|
||||
Err(_) => {
|
||||
warn!("failed to read gpu power sensor");
|
||||
return;
|
||||
}
|
||||
};
|
||||
for (card, sensor) in sensors.iter_mut().enumerate().take(16) {
|
||||
let current_power: u64 = match sensor.read() {
|
||||
Ok(current_power) => current_power,
|
||||
Err(_) => {
|
||||
warn!("failed to read gpu power sensor");
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
let elapsed_milli = elapsed.as_millis() as u64;
|
||||
let elapsed_milli = elapsed.as_millis() as u64;
|
||||
|
||||
let power = current_power * elapsed_milli / 1000;
|
||||
let power = current_power * elapsed_milli / 1000;
|
||||
|
||||
GPU_POWER_UJ.fetch_add(power, Ordering::SeqCst);
|
||||
GPU_POWER_UJ[card].fetch_add(power, Ordering::SeqCst);
|
||||
}
|
||||
}
|
||||
sleep(Duration::from_millis(500));
|
||||
}
|
||||
|
|
@ -96,6 +150,14 @@ pub fn update_gpu_power() {
|
|||
info!("no gpu sensor");
|
||||
}
|
||||
|
||||
pub fn gpu_power() -> u64 {
|
||||
GPU_POWER_UJ.load(Ordering::SeqCst)
|
||||
pub fn gpu_power() -> impl Iterator<Item = GpuPowerUsage> {
|
||||
GPU_POWER_UJ
|
||||
.iter()
|
||||
.map(|gpu| gpu.load(Ordering::SeqCst))
|
||||
.enumerate()
|
||||
.filter(|(_, power)| *power > 0)
|
||||
.map(|(card, power)| GpuPowerUsage {
|
||||
card: card as u32,
|
||||
gpu_uj: power,
|
||||
})
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
use crate::data::{GpuMemory, GpuUsage};
|
||||
use crate::data::{GpuMemory, GpuPowerUsage, GpuUsage};
|
||||
use nvml_wrapper::enum_wrappers::device::TemperatureSensor;
|
||||
use nvml_wrapper::{Device, Nvml};
|
||||
use once_cell::sync::Lazy;
|
||||
|
|
@ -6,32 +6,50 @@ use std::borrow::Cow;
|
|||
|
||||
/// Lazily initialised NVML handle; holds `None` when `Nvml::init()` fails.
static NVIDIA: Lazy<Option<Nvml>> = Lazy::new(|| Nvml::init().ok());
|
||||
|
||||
fn device() -> Option<Device<'static>> {
|
||||
NVIDIA.as_ref()?.device_by_index(0).ok()
|
||||
fn devices() -> Option<impl Iterator<Item = Device<'static>>> {
|
||||
let count = NVIDIA.as_ref()?.device_count().unwrap_or_default();
|
||||
Some((0..count).flat_map(device))
|
||||
}
|
||||
|
||||
pub fn temperature() -> Option<f32> {
|
||||
let temp = device()?.temperature(TemperatureSensor::Gpu).ok()?;
|
||||
Some(temp as f32)
|
||||
fn device(index: u32) -> Option<Device<'static>> {
|
||||
NVIDIA.as_ref()?.device_by_index(index).ok()
|
||||
}
|
||||
|
||||
pub fn power() -> Option<u64> {
|
||||
device()?
|
||||
.total_energy_consumption()
|
||||
.ok()
|
||||
.map(|mj| mj * 1_000)
|
||||
pub fn temperature() -> Option<impl Iterator<Item = f32>> {
|
||||
Some(devices()?.flat_map(|device| {
|
||||
device
|
||||
.temperature(TemperatureSensor::Gpu)
|
||||
.ok()
|
||||
.map(|t| t as f32)
|
||||
}))
|
||||
}
|
||||
|
||||
pub fn memory() -> Option<GpuMemory> {
|
||||
let mem = device()?.memory_info().ok()?;
|
||||
Some(GpuMemory {
|
||||
total: mem.total,
|
||||
free: mem.free,
|
||||
})
|
||||
pub fn power() -> Option<impl Iterator<Item = GpuPowerUsage>> {
|
||||
Some(devices()?.flat_map(|device| {
|
||||
let power = device
|
||||
.total_energy_consumption()
|
||||
.ok()
|
||||
.map(|mj| mj * 1_000)?;
|
||||
Some(GpuPowerUsage {
|
||||
card: device.index().unwrap_or_default(),
|
||||
gpu_uj: power,
|
||||
})
|
||||
}))
|
||||
}
|
||||
|
||||
pub fn utilization() -> impl Iterator<Item = GpuUsage> {
|
||||
let sources = if let Some(device) = device() {
|
||||
pub fn memory() -> Option<impl Iterator<Item = GpuMemory>> {
|
||||
Some(devices()?.flat_map(|device| {
|
||||
let mem = device.memory_info().ok()?;
|
||||
Some(GpuMemory {
|
||||
card: device.index().unwrap_or_default(),
|
||||
total: mem.total,
|
||||
free: mem.free,
|
||||
})
|
||||
}))
|
||||
}
|
||||
|
||||
pub fn utilization() -> Option<impl Iterator<Item = GpuUsage>> {
|
||||
let sources = devices()?.flat_map(|device| {
|
||||
let utilization = device.utilization_rates().ok();
|
||||
[
|
||||
("compute", utilization.as_ref().map(|u| u.gpu)),
|
||||
|
|
@ -45,13 +63,12 @@ pub fn utilization() -> impl Iterator<Item = GpuUsage> {
|
|||
device.decoder_utilization().ok().map(|u| u.utilization),
|
||||
),
|
||||
]
|
||||
} else {
|
||||
[("", None); 4]
|
||||
};
|
||||
sources.into_iter().flat_map(|(system, usage)| {
|
||||
});
|
||||
Some(sources.into_iter().flat_map(|(system, usage)| {
|
||||
Some(GpuUsage {
|
||||
card: 0,
|
||||
system: Cow::Borrowed(system),
|
||||
usage: usage?,
|
||||
})
|
||||
})
|
||||
}))
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
use std::fs::{File, read_dir, read_to_string};
|
||||
use std::fmt::Debug;
|
||||
use std::fs::{read_dir, read_to_string, File};
|
||||
use std::io;
|
||||
use std::io::{ErrorKind, Read, Seek};
|
||||
use std::path::{Path, PathBuf};
|
||||
|
|
@ -11,13 +12,20 @@ fn read_to_string_trimmed(path: &Path) -> io::Result<String> {
|
|||
s.truncate(len);
|
||||
Ok(s)
|
||||
}
|
||||
|
||||
/// A file kept open together with its path and a reusable text buffer.
pub struct FileSource {
    /// Location the file was opened from; the only field shown by `Debug`.
    path: PathBuf,
    /// Scratch buffer.
    buff: String,
    /// The open file handle.
    file: File,
}

impl Debug for FileSource {
    /// Formats as `FileSource { path: .., .. }`, leaving out the buffer and
    /// the file handle.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut repr = f.debug_struct("FileSource");
        repr.field("path", &self.path);
        repr.finish_non_exhaustive()
    }
}
|
||||
|
||||
impl FileSource {
|
||||
#[instrument(skip_all, fields(path = ?path.as_ref()))]
|
||||
pub fn open<P: AsRef<Path>>(path: P) -> io::Result<FileSource> {
|
||||
|
|
|
|||
|
|
@ -12,7 +12,7 @@ use crate::linux::disk::zfs::arcstats;
|
|||
use crate::linux::gpu::{update_gpu_power, utilization};
|
||||
use crate::linux::power::{CpuPowerSource, GpuPowerSource};
|
||||
use crate::linux::proc::ProcSource;
|
||||
use crate::{hostname, Error, MultiSensorSource, Result, SensorData, SensorSource};
|
||||
use crate::{Error, MultiSensorSource, Result, SensorData, SensorSource, hostname};
|
||||
use std::fmt::Write;
|
||||
use std::sync::Mutex;
|
||||
use sysconf::SysconfError;
|
||||
|
|
@ -65,7 +65,7 @@ pub fn get_metrics(sensors: &Sensors) -> Result<String> {
|
|||
let memory = sensors.mem.lock().unwrap().read()?;
|
||||
let temperatures = sensors.temp.lock().unwrap().read()?;
|
||||
let cpu_power = sensors.cpu_power.lock().unwrap().read()?;
|
||||
let gpu_power = sensors.gpu_power.lock().unwrap().read()?;
|
||||
let mut gpu_power = sensors.gpu_power.lock().unwrap();
|
||||
let mut net = sensors.net.lock().unwrap();
|
||||
let mut proc = sensors.proc.lock().unwrap();
|
||||
let networks = net.read()?;
|
||||
|
|
@ -111,11 +111,14 @@ pub fn get_metrics(sensors: &Sensors) -> Result<String> {
|
|||
}
|
||||
|
||||
cpu_power.write(&mut result, &sensors.hostname);
|
||||
gpu_power.write(&mut result, &sensors.hostname);
|
||||
|
||||
for gpu_power in gpu_power.read()? {
|
||||
gpu_power?.write(&mut result, &sensors.hostname);
|
||||
}
|
||||
if let Some(arc) = arcstats() {
|
||||
arc.write(&mut result, &sensors.hostname);
|
||||
}
|
||||
if let Some(memory) = gpu::memory() {
|
||||
for memory in gpu::memory() {
|
||||
memory.write(&mut result, &sensors.hostname)
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1,7 +1,9 @@
|
|||
use either::Either;
|
||||
|
||||
use crate::data::{CpuPowerUsage, GpuPowerUsage};
|
||||
use crate::linux::gpu::gpu_power;
|
||||
use crate::linux::hwmon::FileSource;
|
||||
use crate::{IoResultExt, Result, SensorSource};
|
||||
use crate::{IoResultExt, MultiSensorSource, Result, SensorSource};
|
||||
use std::fs::read_dir;
|
||||
|
||||
#[derive(Default)]
|
||||
|
|
@ -49,11 +51,13 @@ impl SensorSource for CpuPowerSource {
|
|||
#[derive(Default)]
|
||||
pub struct GpuPowerSource;
|
||||
|
||||
impl SensorSource for GpuPowerSource {
|
||||
impl MultiSensorSource for GpuPowerSource {
|
||||
type Data = GpuPowerUsage;
|
||||
|
||||
fn read(&mut self) -> Result<Self::Data> {
|
||||
let gpu_uj = crate::linux::gpu::nvidia::power().unwrap_or_else(gpu_power);
|
||||
Ok(GpuPowerUsage { gpu_uj })
|
||||
fn read(&mut self) -> Result<impl Iterator<Item = Result<Self::Data>>> {
|
||||
Ok(crate::linux::gpu::nvidia::power()
|
||||
.map(Either::Left)
|
||||
.unwrap_or_else(|| Either::Right(gpu_power()))
|
||||
.map(Ok))
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -3,7 +3,6 @@ use crate::linux::sensors::MemorySource;
|
|||
use crate::{MultiSensorSource, Result, SensorSource};
|
||||
use procfs::page_size;
|
||||
use procfs::process::all_processes;
|
||||
use std::vec::IntoIter;
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct ProcSource {
|
||||
|
|
@ -26,9 +25,8 @@ impl ProcSource {
|
|||
|
||||
impl MultiSensorSource for ProcSource {
|
||||
type Data = ProcData;
|
||||
type Iter<'a> = IntoIter<Result<ProcData>>;
|
||||
|
||||
fn read(&mut self) -> Result<Self::Iter<'_>> {
|
||||
fn read(&mut self) -> Result<impl Iterator<Item = Result<Self::Data>>> {
|
||||
Ok(all_processes()?
|
||||
.flatten()
|
||||
.flat_map(|proc| proc.stat())
|
||||
|
|
|
|||
|
|
@ -4,7 +4,7 @@ use crate::{Error, IoResultExt, MultiSensorSource, Result, SensorSource};
|
|||
use std::fs::File;
|
||||
use std::io;
|
||||
use std::io::{BufRead, BufReader, ErrorKind, Read, Seek};
|
||||
use sysconf::{sysconf, SysconfVariable};
|
||||
use sysconf::{SysconfVariable, sysconf};
|
||||
|
||||
pub struct TemperatureSource {
|
||||
cpu_sensors: Vec<FileSource>,
|
||||
|
|
@ -70,11 +70,19 @@ impl SensorSource for TemperatureSource {
|
|||
fn read(&mut self) -> Result<Self::Data> {
|
||||
let mut result = Temperatures {
|
||||
cpu: average_sensors(&mut self.cpu_sensors) / 1000.0,
|
||||
gpu: average_sensors(&mut self.gpu_sensors) / 1000.0,
|
||||
gpu: self
|
||||
.gpu_sensors
|
||||
.iter_mut()
|
||||
.flat_map(|sensor| sensor.read::<f32>())
|
||||
.max_by(f32::total_cmp)
|
||||
.unwrap_or_default()
|
||||
/ 1000.0,
|
||||
};
|
||||
|
||||
if let Some(gpu) = super::gpu::nvidia::temperature() {
|
||||
result.gpu = gpu;
|
||||
if let Some(gpu) = super::gpu::nvidia::temperature()
|
||||
&& let Some(temp) = gpu.max_by(f32::total_cmp)
|
||||
{
|
||||
result.gpu = temp
|
||||
}
|
||||
|
||||
Ok(result)
|
||||
|
|
@ -224,9 +232,8 @@ impl NetworkSource {
|
|||
|
||||
impl MultiSensorSource for NetworkSource {
|
||||
type Data = NetStats;
|
||||
type Iter<'a> = NetworkStatParser<'a>;
|
||||
|
||||
fn read(&mut self) -> Result<Self::Iter<'_>> {
|
||||
fn read(&mut self) -> Result<impl Iterator<Item = Result<Self::Data>>> {
|
||||
self.buff.clear();
|
||||
let mut source = File::open("/proc/net/dev").context("error opening netdev")?;
|
||||
source
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue