init eACGM

This commit is contained in:
Tokisakix
2025-08-07 10:14:54 +08:00
commit 7a4a0b1b14
51 changed files with 11495 additions and 0 deletions

View File

@@ -0,0 +1,4 @@
from .base import BaseSampler
from .ebpfsampler import eBPFSampler, eBPFSamplerState
from .nvmlsampler import NVMLSampler, NVMLSamplerState
from .gpusampler import GPUSampler, GPUSamplerState

35
eacgm/sampler/base.py Normal file
View File

@@ -0,0 +1,35 @@
class BaseSamplerState:
    """Record of one sampled event shared by the sampler state classes.

    All five attributes start out as ``None``; a state whose ``task`` is
    still ``None`` is reported as empty by :meth:`is_none`.
    """

    task: str
    pid: int
    cpu: int
    timestamp: int
    message: str

    def __init__(self) -> None:
        """Create an empty state with every field set to ``None``."""
        self.task = self.pid = self.cpu = self.timestamp = self.message = None

    def is_none(self) -> bool:
        """Return ``True`` when ``task`` has not been filled in."""
        return self.task is None

    def __repr__(self) -> str:
        """Return the five fields separated by single spaces."""
        return "{} {} {} {} {}".format(
            self.task,
            self.pid,
            self.cpu,
            self.timestamp,
            self.message,
        )
class BaseSampler:
    """Common base for samplers.

    Stores the sampler's name; ``run``, ``sample`` and ``close`` raise
    ``NotImplementedError`` here and are meant to be overridden.
    """

    def __init__(self, name: str) -> None:
        """Remember ``name`` on the instance."""
        self.name = name

    def run(self) -> None:
        """Start the sampler. Not implemented in the base class."""
        raise NotImplementedError

    def sample(self):
        """Collect samples. Not implemented in the base class."""
        raise NotImplementedError

    def close(self) -> None:
        """Release sampler resources. Not implemented in the base class."""
        raise NotImplementedError

View File

@@ -0,0 +1,88 @@
import time
from typing import Dict, List
from .base import BaseSamplerState, BaseSampler
from eacgm.bpf import BPFState, BccBPF
class eBPFSamplerState(BaseSamplerState):
    """Sampler state built from a single eBPF trace record.

    Adds no fields of its own; it relies on the ``task``, ``pid``, ``cpu``,
    ``timestamp`` and ``message`` attributes of ``BaseSamplerState``.
    """

    # (substring, category) pairs checked in this order by ``collect``;
    # the first substring found in the event name decides the category.
    _CATEGORY_MARKERS = (
        ("cuda", "cuda"),
        ("Py", "python"),
        ("nccl", "nccl"),
        ("Torch", "torch"),
    )

    def __init__(self) -> None:
        """Create an empty state (all inherited fields are ``None``)."""
        super().__init__()

    @staticmethod
    def from_ebpfstate(other: BPFState) -> "eBPFSamplerState":
        """Build a sampler state by copying the fields of ``other``.

        Declared as a static method so it can be called both on the class
        (``eBPFSamplerState.from_ebpfstate(s)``) and on an instance; without
        the decorator an instance call would pass the instance as ``other``.

        Args:
            other: Object providing ``task``, ``pid``, ``cpu``,
                ``timestamp`` and ``message`` attributes.

        Returns:
            A new ``eBPFSamplerState`` carrying the same five values.
        """
        state = eBPFSamplerState()
        state.task = other.task
        state.pid = other.pid
        state.cpu = other.cpu
        state.timestamp = other.timestamp
        state.message = other.message
        return state

    def collect(self) -> Dict:
        """Convert this state into a dictionary describing one trace event.

        ``message`` is indexed as a sequence: element 0 is compared with
        ``"start"`` to pick the phase, element 1 is the event name, and the
        remaining elements are passed through under the ``"message"`` key.

        Returns:
            A dict with the keys ``name``, ``cat``, ``pid``, ``tid``,
            ``cpu``, ``ts``, ``ph`` and ``message``.
        """
        event = self.message[1]
        cat = next(
            (label for marker, label in self._CATEGORY_MARKERS if marker in event),
            "other",
        )
        # "B" for a "start" record, "E" for anything else.
        ph = "B" if self.message[0] == "start" else "E"
        return {
            "name": event,
            "cat": cat,
            "pid": self.pid,
            # No separate thread id is stored on the state; pid is reused.
            "tid": self.pid,
            "cpu": self.cpu,
            # NOTE(review): presumably converts nanoseconds to microseconds
            # -- confirm against the unit produced by the BPF program.
            "ts": self.timestamp / 1_000,
            "ph": ph,
            "message": self.message[2:],
        }

    def __repr__(self) -> str:
        """Return the base representation prefixed with the class name."""
        return f"eBPFSamplerState {super().__repr__()}"
class eBPFSampler(BaseSampler):
    """Sampler that attaches user-space probes and drains eBPF trace events."""

    def __init__(self, bpf: BccBPF) -> None:
        """Keep a reference to the ``bpf`` object used for all operations."""
        super().__init__(name="eBPFSampler")
        self.bpf = bpf

    def run(self, attach_config: List) -> None:
        """Attach an entry and a return probe for every configured symbol.

        Args:
            attach_config: Iterable of mappings, each providing the keys
                ``"name"``, ``"exe_path"`` (iterable of binary paths) and
                ``"exe_sym"`` (iterable of symbol names).

        Any exception raised while attaching a (path, symbol) pair is printed
        and the remaining pairs are still processed.
        """
        for entry in attach_config:
            # The value is not used below, but the lookup still raises
            # KeyError when an entry has no "name" key.
            entry["name"]
            binaries = entry["exe_path"]
            symbols = entry["exe_sym"]
            targets = ((binary, symbol) for binary in binaries for symbol in symbols)
            for binary, symbol in targets:
                try:
                    # Third argument is the symbol name with an "Entry" /
                    # "Exit" suffix -- presumably the BPF handler name.
                    self.bpf.attach_uprobe(binary, symbol, symbol + "Entry")
                    self.bpf.attach_uretprobe(binary, symbol, symbol + "Exit")
                except Exception as e:
                    print(e)

    def sample(self, time_stamp: float) -> List[eBPFSamplerState]:
        """Poll trace events for about ``time_stamp`` seconds.

        The trace source is read at least once, and one final read happens
        in the iteration during which the deadline is found to have passed.

        Args:
            time_stamp: Polling duration, measured with
                ``time.perf_counter``.

        Returns:
            The non-empty events seen, converted to ``eBPFSamplerState``.
        """
        collected = []
        deadline = time.perf_counter() + time_stamp
        while True:
            expired = time.perf_counter() > deadline
            raw = self.bpf.trace_ebpf(True)
            if not raw.is_none():
                collected.append(eBPFSamplerState.from_ebpfstate(raw))
            if expired:
                break
        return collected

    def close(self) -> None:
        """Ask the underlying ``bpf`` object to clean up."""
        self.bpf.cleanup()

View File

@@ -0,0 +1,64 @@
import time
import pynvml
from typing import List
from .base import BaseSampler
class GPUSamplerState:
    """Snapshot of the metrics read for a single GPU.

    Every attribute is ``None`` until a sampler fills it in.
    """

    # Attribute order used by ``__repr__`` (note: usedMem precedes totMem).
    _REPR_FIELDS = (
        "gpu",
        "name",
        "sm",
        "usedMem",
        "totMem",
        "enc",
        "dec",
        "tmp",
        "fan",
        "usedPower",
        "totPower",
    )

    def __init__(self) -> None:
        """Create an empty snapshot with all metrics unset."""
        super().__init__()
        self.gpu: int = None
        self.name: str = None
        self.sm: int = None
        self.totMem: int = None
        self.usedMem: int = None
        self.enc: int = None
        self.dec: int = None
        self.tmp: int = None
        self.fan: int = None
        self.usedPower: float = None
        self.totPower: float = None

    def __repr__(self) -> str:
        """Return ``GPUSamplerState`` followed by the space-separated metrics."""
        values = [format(getattr(self, field)) for field in self._REPR_FIELDS]
        return " ".join(["GPUSamplerState", *values])
class GPUSampler(BaseSampler):
    """Sampler that reads per-device metrics through ``pynvml``."""

    def __init__(self) -> None:
        """Initialise NVML and cache one handle per device index."""
        super().__init__(name="GPUSampler")
        pynvml.nvmlInit()
        self.deviceCount: int = pynvml.nvmlDeviceGetCount()
        handles = []
        for idx in range(self.deviceCount):
            handles.append(pynvml.nvmlDeviceGetHandleByIndex(idx))
        self.nvDevices: List = handles

    def run(self) -> None:
        """Do nothing; this sampler needs no start-up step."""
        return None

    def _read_device(self, handle) -> GPUSamplerState:
        """Query every metric of one device and return the filled snapshot.

        Any ``pynvml.NVMLError`` raised by a query propagates to the caller.
        """
        snapshot = GPUSamplerState()
        snapshot.gpu = pynvml.nvmlDeviceGetIndex(handle)
        snapshot.name = pynvml.nvmlDeviceGetName(handle)
        snapshot.sm = pynvml.nvmlDeviceGetUtilizationRates(handle).gpu
        memory = pynvml.nvmlDeviceGetMemoryInfo(handle)
        snapshot.totMem = memory.total
        snapshot.usedMem = memory.used
        # Only the first element of the returned pair is kept.
        snapshot.enc = pynvml.nvmlDeviceGetEncoderUtilization(handle)[0]
        snapshot.dec = pynvml.nvmlDeviceGetDecoderUtilization(handle)[0]
        snapshot.tmp = pynvml.nvmlDeviceGetTemperature(
            handle, pynvml.NVML_TEMPERATURE_GPU
        )
        snapshot.fan = pynvml.nvmlDeviceGetFanSpeed(handle)
        # Power readings are divided by 1000 (presumably mW -> W).
        snapshot.usedPower = pynvml.nvmlDeviceGetPowerUsage(handle) / 1000.0
        snapshot.totPower = pynvml.nvmlDeviceGetPowerManagementLimit(handle) / 1000.0
        return snapshot

    def sample(self) -> List[GPUSamplerState]:
        """Return one snapshot per device whose queries all succeeded.

        When any query for a device raises ``pynvml.NVMLError`` the error is
        printed and that device is left out of the result.
        """
        collected = []
        for index in range(self.deviceCount):
            handle = self.nvDevices[index]
            try:
                collected.append(self._read_device(handle))
            except pynvml.NVMLError as e:
                print(e)
        return collected

    def close(self) -> None:
        """Shut NVML down."""
        pynvml.nvmlShutdown()

View File

@@ -0,0 +1,57 @@
import time
import pynvml
from typing import List
from .base import BaseSamplerState, BaseSampler
class NVMLSamplerState(BaseSamplerState):
    """Base sampler state extended with per-process GPU utilisation fields."""

    def __init__(self) -> None:
        """Create an empty state; the GPU fields start as ``None``."""
        super().__init__()
        self.gpu: int = None
        self.sm: int = None
        self.mem: int = None
        self.enc: int = None
        self.dec: int = None

    def __repr__(self) -> str:
        """Return the GPU fields followed by the base representation."""
        return "NVMLSamplerState {} {} {} {} {} {}".format(
            self.gpu,
            self.sm,
            self.mem,
            self.enc,
            self.dec,
            super().__repr__(),
        )
class NVMLSampler(BaseSampler):
    """Sampler that reads per-process GPU utilisation through ``pynvml``."""

    def __init__(self) -> None:
        """Initialise NVML and cache one handle per device index."""
        super().__init__(name="NVMLSampler")
        pynvml.nvmlInit()
        self.deviceCount: int = pynvml.nvmlDeviceGetCount()
        self.nvDevices: List = [
            pynvml.nvmlDeviceGetHandleByIndex(idx)
            for idx in range(self.deviceCount)
        ]

    def run(self) -> None:
        """Do nothing; this sampler needs no start-up step."""
        return

    def sample(self, time_stamp: float) -> List[NVMLSamplerState]:
        """Collect process utilisation samples from the last ``time_stamp`` seconds.

        Args:
            time_stamp: Length of the look-back window in seconds. Both
                ``int`` and ``float`` values are accepted.

        Returns:
            One ``NVMLSamplerState`` per process sample reported by NVML,
            across all devices. Devices whose query raises
            ``pynvml.NVMLError`` contribute nothing.
        """
        # Cutoff in microseconds. The offset is converted to int so the value
        # handed to NVML is always an integer: a float ``time_stamp`` would
        # otherwise make the whole expression a float, which the ctypes
        # integer used for the timestamp does not accept.
        since_us = time.time_ns() // 1000 - int(1_000_000 * time_stamp)

        samples = []
        for gpu_idx in range(self.deviceCount):
            gpu_handle = self.nvDevices[gpu_idx]
            try:
                processes = pynvml.nvmlDeviceGetProcessUtilization(gpu_handle, since_us)
                for process in processes:
                    state = NVMLSamplerState()
                    # NVML provides no task name, CPU or message here.
                    state.task = None
                    state.pid = process.pid
                    state.cpu = None
                    state.timestamp = process.timeStamp
                    state.message = None
                    state.gpu = gpu_idx
                    state.sm = process.smUtil
                    state.mem = process.memUtil
                    state.enc = process.encUtil
                    state.dec = process.decUtil
                    samples.append(state)
            except pynvml.NVMLError:
                # Best-effort: NVML errors for a device are deliberately
                # ignored (presumably raised when no process samples are
                # available in the window -- TODO confirm).
                continue
        return samples

    def close(self) -> None:
        """Shut NVML down."""
        pynvml.nvmlShutdown()