diff --git a/cache_slice/Cargo.toml b/cache_slice/Cargo.toml
index eeafc02..5d8b255 100644
--- a/cache_slice/Cargo.toml
+++ b/cache_slice/Cargo.toml
@@ -4,3 +4,5 @@ version = "0.1.0"
 edition = "2021"
 
 [dependencies]
+raw-cpuid = "11.0.2"
+nix = { version = "0.29.0", features = ["sched"] } # "sched" is needed for sched_getaffinity/CpuSet in src/bin/scan.rs
diff --git a/cache_slice/src/arch.rs b/cache_slice/src/arch.rs
new file mode 100644
index 0000000..1793265
--- /dev/null
+++ b/cache_slice/src/arch.rs
@@ -0,0 +1,241 @@
+use raw_cpuid::CpuId;
+use crate::arch::CpuClass::{IntelCore, IntelXeon, IntelXeonSP};
+
+pub(crate) enum CpuClass {
+    IntelCore,
+    IntelXeon,
+    IntelXeonSP,
+    // Add further CPUs later on
+}
+
+pub(crate) fn determine_cpu_class() -> Option<CpuClass> {
+    let cpuid = CpuId::new();
+    let info = if let Some(info) = cpuid.get_feature_info() {
+        info
+    } else {
+        return None;
+    };
+
+    // Todo, sift through the documentation to add support for more CPUs
+    match (info.family_id(), info.model_id()) {
+        (0x06, 0x4f)
+        | (0x06, 0x2d)
+        | (0x06, 0x3e)
+        | (0x06, 0x3f)
+        | (0x06, 0x56) => {
+            Some(IntelXeon)
+        }
+        (0x06, 0x55) => {
+            Some(IntelXeonSP)
+        }
+        // 42, 58, 60, 69, 70, 61, 71, 78, 94, 142, 158
+        (0x06, 0x2a)
+        | (0x06, 0x3a)
+        | (0x06, 0x3c)
+        | (0x06, 0x45)
+        | (0x06, 0x46)
+        | (0x06, 0x3d)
+        | (0x06, 0x47)
+        | (0x06, 0x4e)
+        | (0x06, 0x5e)
+        | (0x06, 0x8e)
+        | (0x06, 0x9e) => {
+            Some(IntelCore)
+        }
+        _ => {
+            None
+        }
+    }
+}
+
+pub(crate) fn get_performance_counters_xeon() -> Option<&'static XeonPerfCounters> {
+    let cpuid = CpuId::new();
+    let info = if let Some(info) = cpuid.get_feature_info() {
+        info
+    } else {
+        return None;
+    };
+    if info.family_id() != 6 {
+        return None;
+    }
+    match info.model_id() {
+        0x2d /* 45 */ => Some(&SANDY_BRIDGE_XEON),
+        0x3e /* 62 */ => Some(&IVY_BRIDGE_XEON),
+        0x3f /* 63 */ => Some(&HASWELL_XEON),
+        0x56 /* 86 */ => Some(&BROADWELL_XEON),
+        _ => None,
+    }
+}
+
+pub(crate) fn get_performance_counters_core() -> Option<&'static CorePerfCounters> {
+    let cpuid = CpuId::new();
+    let info = if let Some(info) = cpuid.get_feature_info() {
+        info
+    } else {
+        return None;
+    };
+    if info.family_id() != 6 {
+        return None;
+    }
+    // TODO, review if the list can be extended to further CPUs
+    // TODO, add post Cannon Lake stuff
+    match info.model_id() {
+        0x2a
+        | 0x3a
+        | 0x3c
+        | 0x45
+        | 0x46
+        | 0x3d
+        | 0x47 => Some(&SANDYBRIDGE_TO_BROADWELL_CORE),
+        0x4e
+        | 0x5e
+        | 0x8e
+        | 0x9e => Some(&SKYLAKE_KABYLAKE_CORE),
+        _ => None,
+    }
+}
+
+pub struct XeonPerfCounters {
+    pub max_slice: u16,
+    pub msr_pmon_ctr0: &'static [u64],
+    pub msr_pmon_box_filter: &'static [u64],
+    pub msr_pmon_ctl0: &'static [u64],
+    pub msr_pmon_box_ctl: &'static [u64],
+    pub val_box_freeze: u64,
+    pub val_box_reset: u64,
+    pub val_enable_counting: u64,
+    pub val_select_event: u64,
+    pub val_filter: u64,
+    pub val_box_unfreeze: u64,
+}
+
+pub struct CorePerfCounters {
+    pub max_slice: u16,
+    pub msr_unc_perf_global_ctr: u64,
+    pub val_enable_ctrs: u64,
+    pub msr_unc_cbo_perfevtsel0: &'static [u64],
+    pub msr_unc_cbo_per_ctr0: &'static [u64],
+    pub val_disable_ctrs: u64,
+    pub val_select_evt_core: u64,
+    pub val_reset_ctrs: u64,
+}
+
+const SANDY_BRIDGE_XEON: XeonPerfCounters = XeonPerfCounters {
+    max_slice: 8,
+    msr_pmon_ctr0: &[0xd16, 0xd36, 0xd56, 0xd76,
+        0xd96, 0xdb6, 0xdd6, 0xdf6],
+    msr_pmon_box_filter: &[0xd14, 0xd34, 0xd54, 0xd74, 0xd94, 0xdb4, 0xdd4, 0xdf4],
+    msr_pmon_ctl0: &[0xd10, 0xd30, 0xd50, 0xd70,
+        0xd90, 0xdb0, 0xdd0, 0xdf0],
+    msr_pmon_box_ctl: &[0xd04, 0xd24, 0xd44, 0xd64, 0xd84, 0xda4, 0xdc4, 0xde4],
+    val_box_freeze: 0x10100,
+    val_box_reset: 0x10103,
+    val_enable_counting: 0x400000,
+    val_select_event: 0x401134,
+    val_filter: 0x7c0000,
+    val_box_unfreeze: 0x10000,
+};
+
+const IVY_BRIDGE_XEON: XeonPerfCounters = XeonPerfCounters {
+    max_slice: 15,
+    msr_pmon_ctr0: &[0xd16, 0xd36, 0xd56, 0xd76, 0xd96, 0xdb6, 0xdd6, 0xdf6,
+        0xe16, 0xe36, 0xe56, 0xe76, 0xe96, 0xeb6, 0xed6],
+    msr_pmon_box_filter: &[0xd14, 0xd34, 0xd54, 0xd74, 0xd94, 0xdb4, 0xdd4, 0xdf4,
+        0xe14, 0xe34, 0xe54, 0xe74, 0xe94, 0xeb4, 0xed4],
+    msr_pmon_ctl0: &[0xd10, 0xd30, 0xd50, 0xd70, 0xd90, 0xdb0, 0xdd0, 0xdf0,
+        0xe10, 0xe30, 0xe50, 0xe70, 0xe90, 0xeb0, 0xed0],
+    msr_pmon_box_ctl: &[0xd04, 0xd24, 0xd44, 0xd64, 0xd84, 0xda4, 0xdc4, 0xde4,
+        0xe04, 0xe24, 0xe44, 0xe64, 0xe84, 0xea4, 0xec4],
+    val_box_freeze: 0x30100,
+    val_box_reset: 0x30103,
+    val_enable_counting: 0x400000,
+    val_select_event: 0x401134,
+    val_filter: 0x7e0010,
+    val_box_unfreeze: 0x30000,
+};
+
+const HASWELL_XEON: XeonPerfCounters = XeonPerfCounters {
+    max_slice: 18,
+    msr_pmon_ctr0: &[0xe08, 0xe18, 0xe28, 0xe38, 0xe48, 0xe58, 0xe68, 0xe78, 0xe88,
+        0xe98, 0xea8, 0xeb8, 0xec8, 0xed8, 0xee8, 0xef8, 0xf08, 0xf18],
+    msr_pmon_box_filter: &[0xe05, 0xe15, 0xe25, 0xe35, 0xe45, 0xe55, 0xe65, 0xe75, 0xe85,
+        0xe95, 0xea5, 0xeb5, 0xec5, 0xed5, 0xee5, 0xef5, 0xf05, 0xf15],
+    msr_pmon_ctl0: &[0xe01, 0xe11, 0xe21, 0xe31, 0xe41, 0xe51, 0xe61, 0xe71, 0xe81,
+        0xe91, 0xea1, 0xeb1, 0xec1, 0xed1, 0xee1, 0xef1, 0xf01, 0xf11],
+    msr_pmon_box_ctl: &[0xe00, 0xe10, 0xe20, 0xe30, 0xe40, 0xe50, 0xe60, 0xe70, 0xe80,
+        0xe90, 0xea0, 0xeb0, 0xec0, 0xed0, 0xee0, 0xef0, 0xf00, 0xf10],
+    val_box_freeze: 0x30100,
+    val_box_reset: 0x30103,
+    val_enable_counting: 0x400000,
+    val_select_event: 0x401134,
+    val_filter: 0x7e0020,
+    val_box_unfreeze: 0x30000,
+};
+
+const BROADWELL_XEON: XeonPerfCounters = XeonPerfCounters {
+    max_slice: 24,
+    msr_pmon_ctr0: &[0xe08, 0xe18, 0xe28, 0xe38, 0xe48, 0xe58, 0xe68, 0xe78,
+        0xe88, 0xe98, 0xea8, 0xeb8, 0xec8, 0xed8, 0xee8, 0xef8,
+        0xf08, 0xf18, 0xf28, 0xf38, 0xf48, 0xf58, 0xf68, 0xf78],
+    msr_pmon_box_filter: &[0xe05, 0xe15, 0xe25, 0xe35, 0xe45, 0xe55, 0xe65, 0xe75,
+        0xe85, 0xe95, 0xea5, 0xeb5, 0xec5, 0xed5, 0xee5, 0xef5,
+        0xf05, 0xf15, 0xf25, 0xf35, 0xf45, 0xf55, 0xf65, 0xf75],
+    msr_pmon_ctl0: &[0xe01, 0xe11, 0xe21, 0xe31, 0xe41, 0xe51, 0xe61, 0xe71,
+        0xe81, 0xe91, 0xea1, 0xeb1, 0xec1, 0xed1, 0xee1, 0xef1,
+        0xf01, 0xf11, 0xf21, 0xf31, 0xf41, 0xf51, 0xf61, 0xf71],
+    msr_pmon_box_ctl: &[0xe00, 0xe10, 0xe20, 0xe30, 0xe40, 0xe50, 0xe60, 0xe70,
+        0xe80, 0xe90, 0xea0, 0xeb0, 0xec0, 0xed0, 0xee0, 0xef0,
+        0xf00, 0xf10, 0xf20, 0xf30, 0xf40, 0xf50, 0xf60, 0xf70],
+    val_box_freeze: 0x30100,
+    val_box_reset: 0x30103,
+    val_enable_counting: 0x400000,
+    val_select_event: 0x401134,
+    val_filter: 0xfe0020,
+    val_box_unfreeze: 0x30000,
+};
+
+// TODO find appropriate values
+const ALDER_LAKE_TO_RAPTOR_LAKE_CORE: CorePerfCounters = CorePerfCounters {
+    max_slice: 10,
+    msr_unc_perf_global_ctr: 0x2ff0,
+    val_enable_ctrs: 0, // TODO
+    msr_unc_cbo_perfevtsel0: &[0x2000, 0x2008, 0x2010, 0x2018, 0x2020, 0x2028, 0x2030, 0x2038, 0x2040, 0x2048],
+    msr_unc_cbo_per_ctr0: &[0x2002, 0x200a, 0x2012, 0x201a, 0x2022, 0x202a, 0x2032, 0x203a, 0x2042, 0x204a],
+    val_disable_ctrs: 0, // TODO
+    val_select_evt_core: 0, // TODO
+    val_reset_ctrs: 0, // TODO
+};
+
+// TODO verify this on ICELAKE and find appropriate values. Also deal with the backported Cypress Cove?
+const CANNON_LAKE_TO_TIGER_LAKE_CORE: CorePerfCounters = CorePerfCounters {
+    max_slice: 8, // TODO
+    msr_unc_perf_global_ctr: 0xe01,
+    val_enable_ctrs: 0, // TODO
+    msr_unc_cbo_perfevtsel0: &[0x700, 0x708, 0x710, 0x718, 0x720, 0x728, 0x730, 0x738],
+    msr_unc_cbo_per_ctr0: &[0x702, 0x70a, 0x712, 0x71a, 0x722, 0x72a, 0x732, 0x73a],
+    val_disable_ctrs: 0x0, // TODO
+    val_select_evt_core: 0, // TODO
+    val_reset_ctrs: 0x0, // TODO
+};
+
+const SKYLAKE_KABYLAKE_CORE: CorePerfCounters = CorePerfCounters {
+    max_slice: 7,
+    msr_unc_perf_global_ctr: 0xe01,
+    val_enable_ctrs: 0x20000000,
+    msr_unc_cbo_perfevtsel0: &[0x700, 0x710, 0x720, 0x730, 0x740, 0x750, 0x760],
+    msr_unc_cbo_per_ctr0: &[0x706, 0x716, 0x726, 0x736, 0x746, 0x756, 0x766],
+    val_disable_ctrs: 0x0,
+    val_select_evt_core: 0x408f34,
+    val_reset_ctrs: 0x0,
+};
+
+const SANDYBRIDGE_TO_BROADWELL_CORE: CorePerfCounters = CorePerfCounters {
+    max_slice: 0,
+    msr_unc_perf_global_ctr: 0x391,
+    val_enable_ctrs: 0x2000000f,
+    msr_unc_cbo_perfevtsel0: &[0x700, 0x710, 0x720, 0x730],
+    msr_unc_cbo_per_ctr0: &[0x706, 0x716, 0x726, 0x736],
+    val_disable_ctrs: 0x0,
+    val_select_evt_core: 0x408f34,
+    val_reset_ctrs: 0x0,
+};
diff --git a/cache_slice/src/bin/scan.rs b/cache_slice/src/bin/scan.rs
new file mode 100644
index 0000000..a862996
--- /dev/null
+++ b/cache_slice/src/bin/scan.rs
@@ -0,0 +1,37 @@
+use cache_slice::monitor_address;
+use cache_slice::utils::core_per_package;
+use nix::sched::{sched_getaffinity, CpuSet};
+use nix::unistd::Pid;
+
+pub fn main() {
+    let nb_cores = core_per_package();
+    println!("Found {} cores", nb_cores);
+
+    // 64 copies of the same value, giving 64 consecutive u64 addresses to probe.
+    let target = vec![0x0123456789abcdefu64; 64];
+
+    // Only probe the CPUs this process is actually allowed to run on.
+    let affinity = sched_getaffinity(Pid::from_raw(0)).expect("Failed to read CPU affinity");
+
+    for core in 0..CpuSet::count() {
+        if !affinity.is_set(core).unwrap_or(false) {
+            continue;
+        }
+        for addr in target.iter() {
+            let res = unsafe { monitor_address(addr as *const u64 as *const u8, core as u8, nb_cores) };
+            // The slice is the CBo whose counter saw the most hits.
+            let slice = res.ok().and_then(|counts| {
+                counts.into_iter().enumerate().max_by_key(|(_i, count)| *count).map(|(slice, _)| slice)
+            });
+            match slice {
+                Some(slice) => {
+                    println!("({:2}) Slice for addr {:x}: {}", core, addr as *const u64 as usize, slice)
+                }
+                None => {
+                    eprintln!("({:2}) Failed to find slice for addr {:x}", core, addr as *const u64 as usize)
+                }
+            }
+        }
+        for addr in target.iter() {
+            // Same measurement, but always driven from core 0 as a reference.
+            let res = unsafe { monitor_address(addr as *const u64 as *const u8, 0, nb_cores) };
+            let slice = res.ok().and_then(|counts| {
+                counts.into_iter().enumerate().max_by_key(|(_i, count)| *count).map(|(slice, _)| slice)
+            });
+            match slice {
+                Some(slice) => {
+                    println!("({:2}) Slice for addr {:x}: {}", 0, addr as *const u64 as usize, slice)
+                }
+                None => {
+                    eprintln!("({:2}) Failed to find slice for addr {:x}", 0, addr as *const u64 as usize)
+                }
+            }
+        }
+    }
+}
diff --git a/cache_slice/src/lib.rs b/cache_slice/src/lib.rs
index 89e7ac1..9883c32 100644
--- a/cache_slice/src/lib.rs
+++ b/cache_slice/src/lib.rs
@@ -1,7 +1,109 @@
-pub mod msr;
+#![deny(unsafe_op_in_unsafe_fn)]
 
-pub fn add(left: u64, right: u64) -> u64 {
-    left + right
+use std::arch::x86_64::_mm_clflush;
+use crate::arch::CpuClass::{IntelCore, IntelXeon, IntelXeonSP};
+use crate::arch::get_performance_counters_xeon;
+use crate::Error::UnsupportedCPU;
+use crate::msr::{read_msr_on_cpu, write_msr_on_cpu};
+
+pub mod msr;
+pub mod utils;
+mod arch;
+
+pub enum Error {
+    UnsupportedCPU,
+    InvalidParameter,
+    IO(std::io::Error),
+}
+
+impl From<std::io::Error> for Error {
+    fn from(value: std::io::Error) -> Self {
+        Error::IO(value)
+    }
+}
+
+const NUM_POKE: usize = 10000;
+
+unsafe fn poke(addr: *const u8) {
+    for _i in 0..NUM_POKE {
+        unsafe { _mm_clflush(addr) };
+    }
+}
+
+unsafe fn monitor_xeon(addr: *const u8, cpu: u8, max_cbox: usize) -> Result<Vec<u64>, Error> {
+    let performance_counters = if let Some(p) = get_performance_counters_xeon() {
+        p
+    } else {
+        return Err(UnsupportedCPU);
+    };
+
+    if (performance_counters.max_slice as usize) < max_cbox {
+        return Err(Error::InvalidParameter);
+    }
+
+    // Freeze the boxes (box control register)
+    for i in 0..max_cbox {
+        write_msr_on_cpu(performance_counters.msr_pmon_box_ctl[i], cpu, performance_counters.val_box_freeze)?;
+    }
+
+    // Reset the counters (box control register)
+    for i in 0..max_cbox {
+        write_msr_on_cpu(performance_counters.msr_pmon_box_ctl[i], cpu, performance_counters.val_box_reset)?;
+    }
+
+    // Enable counting
+    for i in 0..max_cbox {
+        write_msr_on_cpu(performance_counters.msr_pmon_ctl0[i], cpu, performance_counters.val_enable_counting)?;
+    }
+
+    // Select the event and its filter
+    for i in 0..max_cbox {
+        write_msr_on_cpu(performance_counters.msr_pmon_ctl0[i], cpu, performance_counters.val_select_event)?;
+        write_msr_on_cpu(performance_counters.msr_pmon_box_filter[i], cpu, performance_counters.val_filter)?;
+    }
+
+    // Unfreeze
+    for i in 0..max_cbox {
+        write_msr_on_cpu(performance_counters.msr_pmon_box_ctl[i], cpu, performance_counters.val_box_unfreeze)?;
+    }
+
+    unsafe { poke(addr) };
+
+    // Freeze the boxes again before reading the counters out
+    for i in 0..max_cbox {
+        write_msr_on_cpu(performance_counters.msr_pmon_box_ctl[i], cpu, performance_counters.val_box_freeze)?;
+    }
+
+    // Read counters
+    let mut result = Vec::new();
+    for i in 0..max_cbox {
+        let value = read_msr_on_cpu(performance_counters.msr_pmon_ctr0[i], cpu)?;
+        result.push(value)
+    }
+
+    Ok(result)
+}
+
+fn monitor_core(addr: *const u8, cpu: u8, max_core: u8) -> Result<Vec<u64>, Error> {
+    // Note, we need to add the workaround for one missing perf counter here.
+    unimplemented!()
+}
+
+pub unsafe fn monitor_address(addr: *const u8, cpu: u8, max_cbox: u16) -> Result<Vec<u64>, Error> {
+    match arch::determine_cpu_class() {
+        Some(IntelCore) => {
+            unimplemented!()
+        }
+        Some(IntelXeon) => {
+            unsafe { monitor_xeon(addr, cpu, max_cbox as usize) }
+        }
+        Some(IntelXeonSP) => { // TODO
+            Err(UnsupportedCPU)
+        }
+        None => {
+            Err(UnsupportedCPU)
+        }
+    }
 }
 
 #[cfg(test)]
@@ -10,7 +112,7 @@ mod tests {
 
     #[test]
     fn it_works() {
-        let result = add(2, 2);
-        assert_eq!(result, 4);
+        let result = 2;
+        assert_eq!(result, 2);
     }
 }
diff --git a/cache_slice/src/msr.rs b/cache_slice/src/msr.rs
index 9b6ee29..27d128a 100644
--- a/cache_slice/src/msr.rs
+++ b/cache_slice/src/msr.rs
@@ -2,35 +2,35 @@ use core::mem::size_of;
 use std::format;
 use std::fs::{File, OpenOptions};
 use std::os::unix::fs::FileExt;
-use std::io::{Result, Error, ErrorKind};
+use std::io::{Result, Error};
 
-pub fn write_msr_on_cpu(msr: u32, cpu: u8, value: u64) -> Result<()> {
+pub fn write_msr_on_cpu(msr: u64, cpu: u8, value: u64) -> Result<()> {
     let path = format!("/dev/cpu/{}/msr", cpu);
-    let file : File = OpenOptions::new().write(true).open(path).expect("Failed to open MSR, are you running as root ?");
-    match file.write_at(&value.to_ne_bytes(), msr as u64) {
+    let file: File = OpenOptions::new().write(true).open(path).expect("Failed to open MSR, are you running as root ?");
+    match file.write_at(&value.to_ne_bytes(), msr) {
         Ok(size) => {
             if size == size_of::<u64>() {
                 Ok(())
             } else {
                 Err(Error::other("Failed to write complete value"))
             }
-        },
+        }
         Err(e) => Err(e)
     }
 }
 
-pub fn read_msr_on_cpu(msr: u32, cpu: u8) -> Result<u64> {
+pub fn read_msr_on_cpu(msr: u64, cpu: u8) -> Result<u64> {
     let path = format!("/dev/cpu/{}/msr", cpu);
-    let file : File = OpenOptions::new().read(true).open(path).expect("Failed to open MSR, are you running as root ?");
+    let file: File = OpenOptions::new().read(true).open(path).expect("Failed to open MSR, are you running as root ?");
     let mut read_data = [0u8; size_of::<u64>()];
-    match file.read_at(&mut read_data, msr as u64) {
+    match file.read_at(&mut read_data, msr) {
         Ok(size) => {
             if size == size_of::<u64>() {
                 Ok(u64::from_ne_bytes(read_data))
             } else {
                 Err(Error::other("Failed to write complete value"))
             }
-        },
+        }
         Err(e) => Err(e)
     }
 }
diff --git a/cache_slice/src/utils.rs b/cache_slice/src/utils.rs
new file mode 100644
index 0000000..681d815
--- /dev/null
+++ b/cache_slice/src/utils.rs
@@ -0,0 +1,55 @@
+use raw_cpuid::{CpuId, CpuIdReaderNative, ExtendedTopologyIter, TopologyType};
+
+fn get_topology_iterator() -> ExtendedTopologyIter<CpuIdReaderNative> {
+    let cpuid = CpuId::new();
+    let topology_iter = if let Some(t) = cpuid.get_extended_topology_info_v2() {
+        t
+    } else if let Some(t) = cpuid.get_extended_topology_info() {
+        t
+    } else {
+        panic!("Unsupported CPU");
+    };
+    topology_iter
+}
+
+pub fn threads_per_package() -> Option<u16> {
+    let topology_iter = get_topology_iterator();
+    let mut t_per_package = None;
+    for level in topology_iter {
+        if let Some(t_per_package) = t_per_package {
+            assert!(t_per_package <= level.processors())
+        }
+        t_per_package = Some(level.processors())
+    }
+    t_per_package
+}
+
+pub fn core_per_package() -> u16 {
+    let topology_iter = get_topology_iterator();
+    let mut t_per_core = None;
+    let mut t_per_package = None;
+    for level in topology_iter {
+        //println!("{:?}", level);
+        match level.level_type() {
+            TopologyType::SMT => {
+                assert_eq!(t_per_core, None);
+                t_per_core = Some(level.processors());
+            }
+            _ => { // TODO identify the right level ?
+                if let Some(t_per_package) = t_per_package {
+                    assert!(t_per_package <= level.processors())
+                }
+                // Or change the API to enable the user to specify the topology level to use according to the CPU micro-arch.
+                t_per_package = Some(level.processors())
+            }
+        }
+    }
+    if let Some(t_per_core) = t_per_core {
+        if let Some(t_per_package) = t_per_package {
+            if t_per_package % t_per_core == 0 {
+                return t_per_package / t_per_core;
+            }
+        }
+    }
+    0
+}