init eACGM

This commit is contained in:
Tokisakix
2025-08-07 10:14:54 +08:00
commit 7a4a0b1b14
51 changed files with 11495 additions and 0 deletions

1
eacgm/__init__.py Normal file
View File

@@ -0,0 +1 @@
__version__ = "0.1.0"

2
eacgm/bpf/__init__.py Normal file
View File

@@ -0,0 +1,2 @@
from .base import BPFState, BaseBPF
from .bccBPF import BccBPF

38
eacgm/bpf/base.py Normal file
View File

@@ -0,0 +1,38 @@
class BPFState:
    """One record read from the eBPF trace output.

    Every field starts out as ``None``. A record whose ``task`` is still
    ``None`` is considered empty (see ``is_none``).
    """

    task:str
    pid:int
    cpu:int
    timestamp:int
    message:str

    def __init__(self) -> None:
        # All fields share the same "unset" marker.
        for field in ("task", "pid", "cpu", "timestamp", "message"):
            setattr(self, field, None)

    def is_none(self) -> bool:
        """Return True while ``task`` has not been filled in."""
        return self.task is None

    def __repr__(self) -> str:
        return f"BPFState {self.task} {self.pid} { self.cpu} {self.timestamp} {self.message}"
class BaseBPF:
def __init__(self, name:str) -> None:
self.name = name
return
def attach_uprobe(self, exe_path:str, exe_sym:str, bpf_func:str) -> bool:
raise NotADirectoryError
def attach_uretprobe(self, exe_path:str, exe_sym:str, bpf_func:str) -> bool:
raise NotADirectoryError
def cleanup(self) -> None:
raise NotADirectoryError
def trace_ebpf(self) -> BPFState:
raise NotADirectoryError

34
eacgm/bpf/bccBPF.py Normal file
View File

@@ -0,0 +1,34 @@
from bcc import BPF
from typing import List
from .base import BPFState, BaseBPF
class BccBPF(BaseBPF):
    """BPF backend that delegates to a ``bcc`` ``BPF`` object."""

    def __init__(self, name:str, text:str, cflags:List=None) -> None:
        """Compile ``text`` with ``cflags`` (no extra flags when omitted)."""
        super().__init__(name)
        # Build a fresh list per call: a ``[]`` default would be one object
        # shared by every instance created without explicit flags.
        self.bpf = BPF(text=text, cflags=[] if cflags is None else cflags)

    def attach_uprobe(self, exe_path:str, exe_sym:str, bpf_func:str) -> bool:
        """Attach ``bpf_func`` at the entry of ``exe_sym``.

        Returns True once the underlying call has returned; failures
        propagate as whatever exception ``bcc`` raises.
        """
        self.bpf.attach_uprobe(exe_path, exe_sym, fn_name=bpf_func)
        return True

    def attach_uretprobe(self, exe_path:str, exe_sym:str, bpf_func:str) -> bool:
        """Attach ``bpf_func`` at the return of ``exe_sym``.

        Returns True once the underlying call has returned; failures
        propagate as whatever exception ``bcc`` raises.
        """
        self.bpf.attach_uretprobe(exe_path, exe_sym, fn_name=bpf_func)
        return True

    def cleanup(self) -> None:
        """Forward to the underlying ``BPF.cleanup()``."""
        self.bpf.cleanup()

    def trace_ebpf(self, nonblocking:bool) -> BPFState:
        """Read one trace record and parse it into a ``BPFState``.

        The returned state is left empty (``is_none()`` is True) when the
        trace read yields no task. Otherwise the message is decoded as UTF-8
        and split on "@": the first piece is parsed as the integer timestamp
        and the remaining pieces are stored as a list in ``state.message``.

        Raises:
            ValueError: if the first "@"-separated piece is not an integer.
        """
        (task, pid, cpu, _, _, message) = self.bpf.trace_fields(nonblocking)
        state = BPFState()
        if task is not None:
            # Split once and reuse the pieces.
            timestamp, *payload = message.decode("utf-8").split("@")
            state.task = task.decode("utf-8")
            state.pid = int(pid)
            state.cpu = int(cpu)
            state.timestamp = int(timestamp)
            state.message = payload
        return state

View File

@@ -0,0 +1 @@
from .profetto import to_perfetto

View File

@@ -0,0 +1,21 @@
from typing import List
from eacgm.sampler import eBPFSamplerState
def to_perfetto(states:List[eBPFSamplerState]) -> List:
    """Keep only begin/end event pairs from a stream of sampler states.

    Items that are not ``eBPFSamplerState`` instances are ignored. Each
    remaining state is converted with ``collect()`` and keyed by
    "<name>-<pid>". Whenever the previous event for a key has phase "B" and
    the current one has phase "E", both are appended to the result in that
    order. The current event then becomes the previous one for its key.
    """
    paired = []
    previous_by_key = {}
    for raw in states:
        if isinstance(raw, eBPFSamplerState):
            event = raw.collect()
            key = f"{event['name']}-{event['pid']}"
            earlier = previous_by_key.get(key)
            if earlier is not None and earlier["ph"] == "B" and event["ph"] == "E":
                paired.extend((earlier, event))
            previous_by_key[key] = event
    return paired

View File

@@ -0,0 +1,4 @@
from .base import BaseSampler
from .ebpfsampler import eBPFSampler, eBPFSamplerState
from .nvmlsampler import NVMLSampler, NVMLSamplerState
from .gpusampler import GPUSampler, GPUSamplerState

35
eacgm/sampler/base.py Normal file
View File

@@ -0,0 +1,35 @@
class BaseSamplerState:
    """Common fields carried by every sampler record.

    All fields start as ``None``; ``is_none`` reports whether ``task`` is
    still unset.
    """

    task:str
    pid:int
    cpu:int
    timestamp:int
    message:str

    def __init__(self) -> None:
        for field in ("task", "pid", "cpu", "timestamp", "message"):
            setattr(self, field, None)

    def is_none(self) -> bool:
        """Return True while ``task`` has not been filled in."""
        return self.task is None

    def __repr__(self) -> str:
        return f"{self.task} {self.pid} {self.cpu} {self.timestamp} {self.message}"
class BaseSampler:
    """Interface shared by the samplers: ``run``, ``sample`` and ``close``.

    The base class only records ``name``; the three operations raise
    ``NotImplementedError`` until a subclass overrides them.
    """

    def __init__(self, name:str) -> None:
        self.name = name

    def run(self) -> None:
        """Start the sampler (subclass responsibility)."""
        raise NotImplementedError()

    def sample(self):
        """Collect samples (subclass responsibility)."""
        raise NotImplementedError()

    def close(self) -> None:
        """Release sampler resources (subclass responsibility)."""
        raise NotImplementedError()

View File

@@ -0,0 +1,88 @@
import time
from typing import Dict, List
from .base import BaseSamplerState, BaseSampler
from eacgm.bpf import BPFState, BccBPF
class eBPFSamplerState(BaseSamplerState):
    """Sampler record built from a ``BPFState``, convertible to an event dict."""

    # (substring, category) pairs checked in order; the first match wins.
    _CATEGORIES = (
        ("cuda", "cuda"),
        ("Py", "python"),
        ("nccl", "nccl"),
        ("Torch", "torch"),
    )

    def __init__(self) -> None:
        super().__init__()

    @staticmethod
    def from_ebpfstate(other:BPFState) -> "eBPFSamplerState":
        """Copy the five common fields of ``other`` into a new state.

        Declared static because it takes no ``self``; without the decorator
        it only worked when called on the class, never on an instance.
        """
        state = eBPFSamplerState()
        state.task = other.task
        state.pid = other.pid
        state.cpu = other.cpu
        state.timestamp = other.timestamp
        state.message = other.message
        return state

    def collect(self) -> Dict:
        """Convert this state into an event dict.

        ``message[0]`` selects the phase ("B" when it equals "start", "E"
        otherwise), ``message[1]`` is the event name and ``message[2:]`` is
        passed through under "message". The category is derived from
        substrings of the event name, falling back to "other".
        """
        event = self.message[1]
        cat = next(
            (label for needle, label in self._CATEGORIES if needle in event),
            "other",
        )
        ph = "B" if self.message[0] == "start" else "E"
        return {
            "name": event,
            "cat": cat,
            "pid": self.pid,
            "tid": self.pid,
            "cpu": self.cpu,
            # NOTE(review): presumably converts ns to us - confirm the unit
            # of the timestamp emitted by the BPF program.
            "ts": self.timestamp / 1_000,
            "ph": ph,
            "message": self.message[2:],
        }

    def __repr__(self) -> str:
        return f"eBPFSamplerState {super().__repr__()}"
class eBPFSampler(BaseSampler):
    """Sampler that attaches uprobes through a ``BccBPF`` and polls its trace."""

    def __init__(self, bpf:BccBPF) -> None:
        super().__init__(name="eBPFSampler")
        self.bpf = bpf

    def run(self, attach_config:List) -> None:
        """Attach entry/exit probes for every (path, symbol) pair in the config.

        Each config entry must provide "name", "exe_path" and "exe_sym".
        For a symbol ``s`` the probe functions are named ``s + "Entry"`` and
        ``s + "Exit"``. Attach failures are printed and skipped.
        """
        for entry in attach_config:
            name = entry["name"]  # looked up but not used further
            paths = entry["exe_path"]
            symbols = entry["exe_sym"]
            for path in paths:
                for sym in symbols:
                    try:
                        self.bpf.attach_uprobe(path, sym, sym + "Entry")
                        self.bpf.attach_uretprobe(path, sym, sym + "Exit")
                    except Exception as e:
                        print(e)

    def sample(self, time_stamp:float) -> List[eBPFSamplerState]:
        """Poll the trace for about ``time_stamp`` seconds and return the states.

        The deadline is checked at the top of each iteration, so one more
        non-blocking read happens after the deadline has passed.
        """
        collected = []
        begin = time.perf_counter()
        while True:
            expired = time.perf_counter() > begin + time_stamp
            raw = self.bpf.trace_ebpf(True)
            if not raw.is_none():
                collected.append(eBPFSamplerState.from_ebpfstate(raw))
            if expired:
                break
        return collected

    def close(self) -> None:
        """Clean up the underlying BPF object."""
        self.bpf.cleanup()

View File

@@ -0,0 +1,64 @@
import time
import pynvml
from typing import List
from .base import BaseSampler
class GPUSamplerState:
    """Per-GPU snapshot filled in by ``GPUSampler.sample``; fields start as None."""

    def __init__(self) -> None:
        super().__init__()
        self.gpu:int = None
        self.name:str = None
        self.sm:int = None
        self.totMem:int = None
        self.usedMem:int = None
        self.enc:int = None
        self.dec:int = None
        self.tmp:int = None
        self.fan:int = None
        self.usedPower:float = None
        self.totPower:float = None

    def __repr__(self) -> str:
        # Note the printed order: used memory comes before total memory.
        shown = (
            self.gpu, self.name, self.sm, self.usedMem, self.totMem,
            self.enc, self.dec, self.tmp, self.fan,
            self.usedPower, self.totPower,
        )
        return "GPUSamplerState " + " ".join(f"{value}" for value in shown)
class GPUSampler(BaseSampler):
    """Sampler that reads per-device metrics through ``pynvml``."""

    def __init__(self) -> None:
        super().__init__(name="GPUSampler")
        pynvml.nvmlInit()
        self.deviceCount:int = pynvml.nvmlDeviceGetCount()
        self.nvDevices:List = [
            pynvml.nvmlDeviceGetHandleByIndex(idx)
            for idx in range(self.deviceCount)
        ]

    def run(self) -> None:
        """Nothing to start for this sampler."""
        return None

    def sample(self) -> List[GPUSamplerState]:
        """Return one ``GPUSamplerState`` per device that could be read.

        If any NVML query for a device raises ``pynvml.NVMLError``, the
        error is printed and that device is left out of the result.
        """
        readings = []
        for position in range(self.deviceCount):
            handle = self.nvDevices[position]
            reading = GPUSamplerState()
            try:
                reading.gpu = pynvml.nvmlDeviceGetIndex(handle)
                reading.name = pynvml.nvmlDeviceGetName(handle)
                reading.sm = pynvml.nvmlDeviceGetUtilizationRates(handle).gpu
                memory = pynvml.nvmlDeviceGetMemoryInfo(handle)
                reading.totMem = memory.total
                reading.usedMem = memory.used
                reading.enc = pynvml.nvmlDeviceGetEncoderUtilization(handle)[0]
                reading.dec = pynvml.nvmlDeviceGetDecoderUtilization(handle)[0]
                reading.tmp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
                reading.fan = pynvml.nvmlDeviceGetFanSpeed(handle)
                # NOTE(review): presumably milliwatts scaled to watts - confirm.
                reading.usedPower = pynvml.nvmlDeviceGetPowerUsage(handle) / 1000.0
                reading.totPower = pynvml.nvmlDeviceGetPowerManagementLimit(handle) / 1000.0
            except pynvml.NVMLError as e:
                print(e)
            else:
                readings.append(reading)
        return readings

    def close(self) -> None:
        """Shut NVML down."""
        pynvml.nvmlShutdown()

View File

@@ -0,0 +1,57 @@
import time
import pynvml
from typing import List
from .base import BaseSamplerState, BaseSampler
class NVMLSamplerState(BaseSamplerState):
    """Per-process GPU utilisation record; extends the common sampler fields."""

    def __init__(self) -> None:
        super().__init__()
        self.gpu:int = None
        self.sm:int = None
        self.mem:int = None
        self.enc:int = None
        self.dec:int = None

    def __repr__(self) -> str:
        # GPU-specific fields first, then the base-class rendering.
        own = f"{self.gpu} {self.sm} {self.mem} {self.enc} {self.dec}"
        base = super().__repr__()
        return f"NVMLSamplerState {own} {base}"
class NVMLSampler(BaseSampler):
    """Sampler that reads per-process GPU utilisation through ``pynvml``."""

    def __init__(self) -> None:
        super().__init__(name="NVMLSampler")
        pynvml.nvmlInit()
        self.deviceCount:int = pynvml.nvmlDeviceGetCount()
        self.nvDevices:List = [pynvml.nvmlDeviceGetHandleByIndex(idx) for idx in range(self.deviceCount)]

    def run(self) -> None:
        """Nothing to start for this sampler."""
        return

    def sample(self, time_stamp:float) -> List[NVMLSamplerState]:
        """Return process utilisation samples from the last ``time_stamp`` seconds.

        For every device, NVML is asked for process samples newer than
        "now minus ``time_stamp`` seconds", expressed in microseconds.
        Devices whose query raises ``pynvml.NVMLError`` contribute nothing.
        """
        samples = []
        for gpu_idx in range(self.deviceCount):
            gpu_handle = self.nvDevices[gpu_idx]
            # The cutoff must be an integer: with a float ``time_stamp`` the
            # arithmetic yields a float, which cannot be passed as NVML's
            # unsigned 64-bit last-seen timestamp.
            last_seen_us = int(time.time_ns() // 1000 - 1000_000 * time_stamp)
            try:
                processes = pynvml.nvmlDeviceGetProcessUtilization(gpu_handle, last_seen_us)
                for process in processes:
                    state = NVMLSamplerState()
                    state.task = None
                    state.pid = process.pid
                    state.cpu = None
                    state.timestamp = process.timeStamp
                    state.message = None
                    state.gpu = gpu_idx
                    state.sm = process.smUtil
                    state.mem = process.memUtil
                    state.enc = process.encUtil
                    state.dec = process.decUtil
                    samples.append(state)
            except pynvml.NVMLError:
                # Best effort: a device that cannot be queried is skipped.
                pass
        return samples

    def close(self) -> None:
        """Shut NVML down."""
        pynvml.nvmlShutdown()

3
eacgm/webui/__init__.py Normal file
View File

@@ -0,0 +1,3 @@
from reader import log_reader
from connect import database
from insert import push_log

19
eacgm/webui/connect.py Normal file
View File

@@ -0,0 +1,19 @@
# connect to mysql database
import mysql.connector
class database:
    """Thin wrapper around one MySQL connection and one cursor."""

    def __init__(self, ip, port, user, pwd, database) -> None:
        """Open the connection and create the cursor used by ``exec``."""
        self.conn = mysql.connector.connect(
            host = ip,
            port = port,
            user = user,
            password = pwd,
            database = database
        )
        self.cursor = self.conn.cursor()

    def exec(self, cmd: str):
        """Execute ``cmd``, commit, and return the fetched rows.

        Statements that produce no result set (INSERT, ALTER, ...) return an
        empty list instead of attempting a fetch, which some connector
        versions reject with an error.
        """
        self.cursor.execute(cmd)
        # ``with_rows`` is True only when the executed statement yields rows.
        result = self.cursor.fetchall() if self.cursor.with_rows else []
        self.conn.commit()
        return result

113
eacgm/webui/insert.py Normal file
View File

@@ -0,0 +1,113 @@
# insert data into mysql database
import argparse
from reader import log_reader
from reader import log_reader
from connect import database
import time
def get_col_num(db) -> int:
    """Return the number of columns of grafana.CudaEvent.

    The count is read from ``information_schema.COLUMNS`` through
    ``db.exec`` and taken from the first cell of the first row.
    """
    query = "SELECT COUNT(*) FROM information_schema.COLUMNS where `TABLE_SCHEMA` = 'grafana' and `TABLE_NAME` = 'CudaEvent';"
    rows = db.exec(query)
    return rows[0][0]
def lts_cuda_event(db) -> list:
    """Return the event slots of the newest grafana.CudaEvent row.

    The newest row is the one with the greatest ``time``; its first column
    is dropped and the rest is returned as a list. When the table has no
    rows, a list of ``None`` with one entry per non-time column is returned.
    """
    rows = db.exec(f"SELECT * FROM grafana.`CudaEvent` ORDER BY time DESC LIMIT 1;")
    if rows:
        return list(rows[0][1:])
    return [None] * (get_col_num(db) - 1)
def lts_event_cnt(db) -> dict:
    """Return the rows of grafana.events as a ``{name: count}`` dict.

    Every row must unpack into exactly two values; when a name occurs more
    than once, the last row wins.
    """
    rows = db.exec(
        """SELECT * FROM grafana.events;"""
    )
    return {name: cnt for name, cnt in rows}
def add_col(db):
    """Add a CHAR(255) column named ``event<N>`` to grafana.CudaEvent.

    ``N`` is the table's current column count as reported by ``get_col_num``.
    """
    current_width = get_col_num(db)
    statement = f"""ALTER TABLE grafana.`CudaEvent` ADD COLUMN event{current_width} CHAR(255)"""
    db.exec(statement)
def del_col(db, col_num):
    """Drop the column named ``event<col_num>`` from grafana.CudaEvent."""
    statement = f"""ALTER TABLE grafana.`CudaEvent` DROP COLUMN event{col_num};"""
    db.exec(statement)
def add_empty(max_time, db):
    """Insert a row into grafana.CudaEvent with every event column NULL.

    The first value is ``max_time``; the remaining ``column count - 1``
    values are NULL.
    """
    width = get_col_num(db)
    nulls = ','.join(['NULL'] * (width - 1))
    db.exec(f"""INSERT INTO grafana.`CudaEvent` VALUES ({max_time}, {nulls})""")
def push_log(db, log):
    """Replay parsed log records into grafana.CudaEvent as batched INSERTs.

    Each record is a dict with 'op' ('start' or 'end'), 'name' and 'time'.
    A 'start' puts the event name into the first free (None) slot of the
    current row, adding a table column through ``add_col`` when no slot is
    free. An 'end' clears the last slot holding that name. After every
    accepted record the current row is appended to the pending INSERT
    statement. At the end, one all-NULL row stamped with the largest time
    seen is added through ``add_empty``.

    Raises:
        ValueError: if a record's 'op' is neither 'start' nor 'end'.
    """
    # NOTE(review): the nesting below was reconstructed from statement order;
    # confirm it against the original file.
    # NOTE(review): event names and times are interpolated into the SQL text
    # unescaped - confirm the log source is trusted.
    max_time = 0
    # Slots of the most recent row already stored in the table.
    cuda_event = lts_cuda_event(db)
    # Open-event counters per name; only updated in memory in this function.
    event_cnt = lts_event_cnt(db)
    cmd = f"INSERT INTO grafana.CudaEvent VALUES "
    for line_idx, l in enumerate(log):
        if l['op'] == 'start':
            if l['name'] in event_cnt:
                event_cnt[l['name']] += 1
            else:
                event_cnt[l["name"]] = 1
            # Reuse the first free slot, if there is one.
            empty_col = False
            i = 0
            for e in cuda_event:
                if e is None:
                    cuda_event[i] = l['name']
                    empty_col = True
                    break
                i += 1
            if not empty_col:
                # 37 is the length of the bare "INSERT ... VALUES " prefix,
                # so this is true only when rows are pending. They are
                # flushed before the table gains a column because they were
                # built for the old column count.
                if len(cmd) > 37:
                    # Replace the trailing "," of the last row with ";".
                    cmd = cmd[:-1] + ";"
                    # print(cmd)
                    # print('------')
                    db.exec(cmd)
                    cmd = f"INSERT INTO grafana.CudaEvent VALUES "
                add_col(db)
                cuda_event.append(l['name'])
        elif l['op'] == 'end':
            if l['name'] in event_cnt:
                if event_cnt[l["name"]] == 0:
                    # More ends than starts: report and skip this record.
                    print(f"[!]: in line {line_idx + 1}: event {l['name']} ended more than starting")
                    #raise ValueError(f"in line {line_idx + 1}: event {l['name']} ended more than starting")
                    continue
                event_cnt[l["name"]] -= 1
                # Scan from the end so the last slot with this name is freed.
                for i, e in enumerate(cuda_event[::-1]):
                    if e == l["name"]:
                        cuda_event[len(cuda_event)- 1 - i] = None
                        break
            if l["name"] not in event_cnt:
                # End without any recorded start: report and skip this record.
                print(f"[!]: in line {line_idx + 1}: event {l['name']} ended without starting")
                # raise ValueError(f"in line {line_idx + 1}: event {l['name']} ended without starting")
                continue
        else:
            raise ValueError(f"in line {line_idx + 1}: unknown operation {l['op']}")
        # Render the current slots as one "(time, v1, v2, ...)," row.
        tmp_cmd = f"({l['time']}, "
        max_time = max(max_time, float(l['time']))
        for e in cuda_event:
            if e is None:
                tmp_cmd += "NULL, "
            else:
                tmp_cmd += f"'{e}', "
        # Drop the trailing ", " and close the tuple.
        tmp_cmd = tmp_cmd[:-2] + "),"
        cmd += tmp_cmd
    # Flush whatever rows are still pending.
    if len(cmd) > 37:
        cmd = cmd[:-1] + ";"
        # print(cmd)
        # print("------")
        db.exec(cmd)
    # print(cuda_event)
    # print(event_cnt)
    add_empty(max_time,db)

13
eacgm/webui/reader.py Normal file
View File

@@ -0,0 +1,13 @@
def log_reader(path):
    """Parse a space-separated log file into a list of event records.

    Every non-blank line is stripped and split on single spaces. Fields 3,
    5 and 6 (0-based) become the 'time', 'op' and 'name' entries of one
    dict. Blank lines, such as a trailing empty line, are skipped.

    Raises:
        IndexError: if a non-blank line has fewer than seven fields.
    """
    records = []
    with open(path, 'r') as f:
        # Stream the file instead of loading every line up front.
        for line in f:
            stripped = line.strip()
            if not stripped:
                continue
            fields = stripped.split(' ')
            records.append({
                'time': fields[3],
                'op': fields[5],
                'name': fields[6],
            })
    return records