improved power handling

This commit is contained in:
Robin Appelman 2023-07-17 21:01:12 +02:00
commit 224bf80588
6 changed files with 79 additions and 54 deletions

View file

@ -184,14 +184,13 @@ impl SensorData for DiskUsage {
} }
#[derive(Debug, Default)] #[derive(Debug, Default)]
pub struct PowerUsage { pub struct CpuPowerUsage {
pub cpu_uj: u64, pub cpu_uj: u64,
pub cpu_packages_uj: Vec<u64>, pub cpu_packages_uj: Vec<u64>,
pub gpu_uj: u64,
} }
impl PowerUsage { impl SensorData for CpuPowerUsage {
pub fn write<W: Write>(&self, mut w: W, hostname: &str) { fn write<W: Write>(&self, mut w: W, hostname: &str) {
writeln!( writeln!(
&mut w, &mut w,
r#"total_power{{host="{}", device="cpu"}} {:.3}"#, r#"total_power{{host="{}", device="cpu"}} {:.3}"#,
@ -209,6 +208,16 @@ impl PowerUsage {
) )
.ok(); .ok();
} }
}
}
#[derive(Debug, Default)]
pub struct GpuPowerUsage {
pub gpu_uj: u64,
}
impl SensorData for GpuPowerUsage {
fn write<W: Write>(&self, mut w: W, hostname: &str) {
if self.gpu_uj > 0 { if self.gpu_uj > 0 {
writeln!( writeln!(
&mut w, &mut w,

View file

@ -8,6 +8,7 @@ use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Mutex; use std::sync::Mutex;
use std::thread::sleep; use std::thread::sleep;
use std::time::{Duration, Instant}; use std::time::{Duration, Instant};
use tracing::{info, warn};
pub mod nvidia; pub mod nvidia;
@ -78,6 +79,7 @@ pub fn update_gpu_power() {
let current_power: u64 = match file.read() { let current_power: u64 = match file.read() {
Ok(current_power) => current_power, Ok(current_power) => current_power,
Err(_) => { Err(_) => {
warn!("failed to read gpu power sensor");
return; return;
} }
}; };
@ -91,6 +93,7 @@ pub fn update_gpu_power() {
sleep(Duration::from_millis(500)); sleep(Duration::from_millis(500));
} }
} }
info!("no gpu sensor");
} }
pub fn gpu_power() -> u64 { pub fn gpu_power() -> u64 {

View file

@ -3,6 +3,7 @@ use std::io;
use std::io::{ErrorKind, Read, Seek}; use std::io::{ErrorKind, Read, Seek};
use std::path::{Path, PathBuf}; use std::path::{Path, PathBuf};
use std::str::FromStr; use std::str::FromStr;
use tracing::{debug, instrument, warn};
fn read_to_string_trimmed(path: &Path) -> io::Result<String> { fn read_to_string_trimmed(path: &Path) -> io::Result<String> {
let mut s = read_to_string(path)?; let mut s = read_to_string(path)?;
@ -17,10 +18,15 @@ pub struct FileSource {
} }
impl FileSource { impl FileSource {
#[instrument(skip_all, fields(path = ?path.as_ref()))]
pub fn open<P: AsRef<Path>>(path: P) -> io::Result<FileSource> { pub fn open<P: AsRef<Path>>(path: P) -> io::Result<FileSource> {
debug!("opening sensor");
Ok(FileSource { Ok(FileSource {
buff: String::with_capacity(32), buff: String::with_capacity(32),
file: File::open(path)?, file: File::open(path).map_err(|e| {
warn!("failed to open sensor");
e
})?,
}) })
} }

View file

@ -9,7 +9,7 @@ use self::disk::*;
use self::sensors::*; use self::sensors::*;
use crate::linux::disk::zfs::arcstats; use crate::linux::disk::zfs::arcstats;
use crate::linux::gpu::{update_gpu_power, utilization}; use crate::linux::gpu::{update_gpu_power, utilization};
use crate::linux::power::power_usage; use crate::linux::power::{CpuPowerSource, GpuPowerSource};
use crate::{hostname, Error, MultiSensorSource, Result, SensorData, SensorSource}; use crate::{hostname, Error, MultiSensorSource, Result, SensorData, SensorSource};
use std::fmt::Write; use std::fmt::Write;
use std::sync::Mutex; use std::sync::Mutex;
@ -29,6 +29,8 @@ pub struct Sensors {
mem: Mutex<MemorySource>, mem: Mutex<MemorySource>,
disk_stats: Mutex<DiskStatSource>, disk_stats: Mutex<DiskStatSource>,
disk_usage: Mutex<DiskUsageSource>, disk_usage: Mutex<DiskUsageSource>,
cpu_power: Mutex<CpuPowerSource>,
gpu_power: Mutex<GpuPowerSource>,
} }
impl Sensors { impl Sensors {
@ -43,6 +45,8 @@ impl Sensors {
mem: Mutex::new(MemorySource::new()?), mem: Mutex::new(MemorySource::new()?),
disk_stats: Mutex::new(DiskStatSource::new()?), disk_stats: Mutex::new(DiskStatSource::new()?),
disk_usage: Mutex::new(DiskUsageSource::new()?), disk_usage: Mutex::new(DiskUsageSource::new()?),
cpu_power: Mutex::new(CpuPowerSource::new().unwrap_or_default()),
gpu_power: Mutex::new(GpuPowerSource::default()),
}) })
} }
} }
@ -56,6 +60,8 @@ pub fn get_metrics(sensors: &Sensors) -> Result<String> {
let cpu = sensors.cpu.lock().unwrap().read()?; let cpu = sensors.cpu.lock().unwrap().read()?;
let memory = sensors.mem.lock().unwrap().read()?; let memory = sensors.mem.lock().unwrap().read()?;
let temperatures = sensors.temp.lock().unwrap().read()?; let temperatures = sensors.temp.lock().unwrap().read()?;
let cpu_power = sensors.cpu_power.lock().unwrap().read()?;
let gpu_power = sensors.gpu_power.lock().unwrap().read()?;
let mut net = sensors.net.lock().unwrap(); let mut net = sensors.net.lock().unwrap();
let networks = net.read()?; let networks = net.read()?;
let pools = pools(); let pools = pools();
@ -99,9 +105,8 @@ pub fn get_metrics(sensors: &Sensors) -> Result<String> {
} }
} }
if let Some(power) = power_usage()? { cpu_power.write(&mut result, &sensors.hostname);
power.write(&mut result, &sensors.hostname); gpu_power.write(&mut result, &sensors.hostname);
}
if let Some(arc) = arcstats()? { if let Some(arc) = arcstats()? {
arc.write(&mut result, &sensors.hostname); arc.write(&mut result, &sensors.hostname);
} }

View file

@ -1,56 +1,58 @@
use crate::data::PowerUsage; use crate::data::{CpuPowerUsage, GpuPowerUsage};
use crate::linux::gpu::gpu_power; use crate::linux::gpu::gpu_power;
use crate::{Error, Result}; use crate::linux::hwmon::FileSource;
use std::fs::{read_dir, read_to_string}; use crate::{Result, SensorSource};
use std::sync::atomic::{AtomicBool, Ordering}; use std::fs::read_dir;
use tracing::warn;
static CAN_READ: AtomicBool = AtomicBool::new(true); #[derive(Default)]
pub struct CpuPowerSource {
sources: Vec<FileSource>,
}
pub fn power_usage() -> Result<Option<PowerUsage>> { impl CpuPowerSource {
if !CAN_READ.load(Ordering::Relaxed) { pub fn new() -> Result<CpuPowerSource> {
return Ok(None); let sources: Vec<_> = read_dir("/sys/devices/virtual/powercap/intel-rapl")?
.flatten()
.filter(|path| {
path.file_name()
.to_str()
.unwrap_or_default()
.starts_with("intel-rapl")
})
.map(|entry| {
let mut path = entry.path();
path.push("energy_uj");
path
})
.flat_map(FileSource::open)
.collect();
Ok(CpuPowerSource { sources })
} }
}
let dir = match read_dir("/sys/devices/virtual/powercap/intel-rapl") { impl SensorSource for CpuPowerSource {
Ok(dir) => dir, type Data = CpuPowerUsage;
Err(_) => {
CAN_READ.store(false, Ordering::Relaxed); fn read(&mut self) -> Result<Self::Data> {
return Ok(None); let mut usage = CpuPowerUsage::default();
} for source in self.sources.iter_mut() {
}; let package_usage = source.read()?;
let mut usage = PowerUsage::default();
for package in dir {
let package = package?;
if package
.file_name()
.to_str()
.ok_or_else(|| Error::Other("Invalid name".into()))?
.starts_with("intel-rapl")
{
let mut package_path = package.path();
package_path.push("energy_uj");
let package_usage = match read_to_string(&package_path) {
Err(e) if e.raw_os_error() == Some(13) => {
CAN_READ.store(false, Ordering::Relaxed);
warn!(
package_path = display(package_path.display()),
"can\'t read power usage"
);
return Ok(None);
}
result => result,
}?;
let package_usage = package_usage.trim().parse::<u64>()?;
usage.cpu_uj += package_usage; usage.cpu_uj += package_usage;
usage.cpu_packages_uj.push(package_usage); usage.cpu_packages_uj.push(package_usage);
} }
Ok(usage)
}
}
#[derive(Default)]
pub struct GpuPowerSource;
impl SensorSource for GpuPowerSource {
type Data = GpuPowerUsage;
fn read(&mut self) -> Result<Self::Data> {
let gpu_uj = crate::linux::gpu::nvidia::power().unwrap_or_else(gpu_power);
Ok(GpuPowerUsage { gpu_uj })
} }
usage.gpu_uj = gpu_power();
if let Some(nvidia_power) = crate::linux::gpu::nvidia::power() {
usage.gpu_uj = nvidia_power;
}
Ok(Some(usage))
} }

View file

@ -41,7 +41,7 @@ impl TemperatureSource {
} }
} }
fn average_sensors(sensors: &mut [FileSource]) -> f32 { pub fn average_sensors(sensors: &mut [FileSource]) -> f32 {
if sensors.is_empty() { if sensors.is_empty() {
return 0.0; return 0.0;
} }