init eACGM
This commit is contained in:
1
eacgm/__init__.py
Normal file
1
eacgm/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
# Version string of the eacgm package.
__version__ = "0.1.0"
|
2
eacgm/bpf/__init__.py
Normal file
2
eacgm/bpf/__init__.py
Normal file
@@ -0,0 +1,2 @@
|
||||
from .base import BPFState, BaseBPF
|
||||
from .bccBPF import BccBPF
|
38
eacgm/bpf/base.py
Normal file
38
eacgm/bpf/base.py
Normal file
@@ -0,0 +1,38 @@
|
||||
class BPFState:
    """Plain record holding the fields of one trace line.

    All fields start as ``None``; a record counts as empty while ``task``
    is still ``None``.
    """

    task: str
    pid: int
    cpu: int
    timestamp: int
    message: str

    def __init__(self) -> None:
        # Start fully unset; whoever fills the record sets task first-class.
        self.task = self.pid = self.cpu = self.timestamp = self.message = None

    def is_none(self) -> bool:
        """Return True while no task has been stored in this record."""
        return self.task is None

    def __repr__(self) -> str:
        """Return the fields as a space-separated string prefixed by the class name."""
        fields = (self.task, self.pid, self.cpu, self.timestamp, self.message)
        return "BPFState " + " ".join(f"{value}" for value in fields)
|
||||
|
||||
class BaseBPF:
    """Interface that concrete eBPF backends implement.

    Every operation raises ``NotImplementedError`` here; subclasses
    override the ones they support.
    """

    def __init__(self, name: str) -> None:
        """Store the backend's display name."""
        self.name = name

    def attach_uprobe(self, exe_path: str, exe_sym: str, bpf_func: str) -> bool:
        """Attach ``bpf_func`` to the entry of ``exe_sym`` in ``exe_path``."""
        raise NotImplementedError

    def attach_uretprobe(self, exe_path: str, exe_sym: str, bpf_func: str) -> bool:
        """Attach ``bpf_func`` to the return of ``exe_sym`` in ``exe_path``."""
        raise NotImplementedError

    def cleanup(self) -> None:
        """Release whatever the backend holds."""
        raise NotImplementedError

    def trace_ebpf(self, nonblocking: bool = False) -> "BPFState":
        """Read one trace record.

        ``nonblocking`` mirrors the parameter the BCC subclass accepts; it
        defaults to ``False`` so existing zero-argument calls keep working.
        """
        raise NotImplementedError
|
34
eacgm/bpf/bccBPF.py
Normal file
34
eacgm/bpf/bccBPF.py
Normal file
@@ -0,0 +1,34 @@
|
||||
from bcc import BPF
|
||||
from typing import List
|
||||
|
||||
from .base import BPFState, BaseBPF
|
||||
|
||||
class BccBPF(BaseBPF):
    """``BaseBPF`` implementation backed by a ``bcc.BPF`` object."""

    def __init__(self, name:str, text:str, cflags:List=None) -> None:
        """Compile ``text`` with BCC.

        Args:
            name: Display name stored by the base class.
            text: BPF program source handed to ``bcc.BPF``.
            cflags: Extra compiler flags; ``None`` means no extra flags.
        """
        super().__init__(name)
        # Build a fresh list per call so no default list is shared between instances.
        self.bpf = BPF(text=text, cflags=[] if cflags is None else cflags)

    def attach_uprobe(self, exe_path:str, exe_sym:str, bpf_func:str) -> bool:
        """Attach ``bpf_func`` at the entry of ``exe_sym`` in ``exe_path``.

        Returns True once the underlying call returns; any exception raised
        by BCC propagates to the caller.
        """
        self.bpf.attach_uprobe(exe_path, exe_sym, fn_name=bpf_func)
        return True

    def attach_uretprobe(self, exe_path:str, exe_sym:str, bpf_func:str) -> bool:
        """Attach ``bpf_func`` at the return of ``exe_sym`` in ``exe_path``.

        Returns True once the underlying call returns; any exception raised
        by BCC propagates to the caller.
        """
        self.bpf.attach_uretprobe(exe_path, exe_sym, fn_name=bpf_func)
        return True

    def cleanup(self) -> None:
        """Delegate to ``bcc.BPF.cleanup``."""
        self.bpf.cleanup()

    def trace_ebpf(self, nonblocking:bool=False) -> BPFState:
        """Read one trace line and decode it into a ``BPFState``.

        The message is expected to look like ``<timestamp>@<field>@<field>...``:
        the first ``@``-separated field becomes the integer timestamp and the
        remaining fields become ``state.message`` (a list of strings).

        Returns an empty state (``is_none()`` is True) when ``trace_fields``
        yields no task.
        """
        (task, pid, cpu, _, _, message) = self.bpf.trace_fields(nonblocking)
        state = BPFState()
        if task is None:
            return state
        # Split once and reuse the parts for both the timestamp and the payload.
        parts = message.decode("utf-8").split("@")
        state.task = task.decode("utf-8")
        state.pid = int(pid)
        state.cpu = int(cpu)
        state.timestamp = int(parts[0])
        state.message = parts[1:]
        return state
|
1
eacgm/collector/__init__.py
Normal file
1
eacgm/collector/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
from .profetto import to_perfetto
|
21
eacgm/collector/profetto.py
Normal file
21
eacgm/collector/profetto.py
Normal file
@@ -0,0 +1,21 @@
|
||||
from typing import List
|
||||
|
||||
from eacgm.sampler import eBPFSamplerState
|
||||
|
||||
def to_perfetto(states:List[eBPFSamplerState]) -> List:
    """Pair begin/end events and return them as a flat list of event dicts.

    Each ``eBPFSamplerState`` is turned into a dict with ``collect()`` and
    keyed by ``"<name>-<pid>"``. Whenever the previous event for a key has
    phase ``"B"`` and the current one has phase ``"E"``, both are emitted
    (begin first). Items that are not ``eBPFSamplerState`` are ignored.
    """
    paired = []
    previous = {}
    for item in states:
        if not isinstance(item, eBPFSamplerState):
            continue
        event = item.collect()
        key = f"{event['name']}-{event['pid']}"
        prior = previous.get(key)
        # Only a begin immediately followed (per key) by an end forms a pair.
        if prior is not None and prior["ph"] == "B" and event["ph"] == "E":
            paired.extend((prior, event))
        previous[key] = event
    return paired
|
4
eacgm/sampler/__init__.py
Normal file
4
eacgm/sampler/__init__.py
Normal file
@@ -0,0 +1,4 @@
|
||||
from .base import BaseSampler
|
||||
from .ebpfsampler import eBPFSampler, eBPFSamplerState
|
||||
from .nvmlsampler import NVMLSampler, NVMLSamplerState
|
||||
from .gpusampler import GPUSampler, GPUSamplerState
|
35
eacgm/sampler/base.py
Normal file
35
eacgm/sampler/base.py
Normal file
@@ -0,0 +1,35 @@
|
||||
class BaseSamplerState:
    """Common fields shared by sampler records.

    All fields start as ``None``; the record is considered empty while
    ``task`` is ``None``.
    """

    task: str
    pid: int
    cpu: int
    timestamp: int
    message: str

    def __init__(self) -> None:
        self.task = self.pid = self.cpu = self.timestamp = self.message = None

    def is_none(self) -> bool:
        """Return True while no task has been stored."""
        return self.task is None

    def __repr__(self) -> str:
        """Return the five fields separated by single spaces (no prefix)."""
        fields = (self.task, self.pid, self.cpu, self.timestamp, self.message)
        return " ".join(f"{value}" for value in fields)
|
||||
|
||||
class BaseSampler:
    """Interface for samplers: ``run`` to start, ``sample`` to read, ``close`` to stop.

    All three operations raise ``NotImplementedError`` until overridden.
    """

    def __init__(self, name:str) -> None:
        """Remember the sampler's display name."""
        self.name = name

    def run(self) -> None:
        """Start the sampler (must be overridden)."""
        raise NotImplementedError()

    def sample(self):
        """Collect samples (must be overridden)."""
        raise NotImplementedError()

    def close(self) -> None:
        """Shut the sampler down (must be overridden)."""
        raise NotImplementedError()
|
88
eacgm/sampler/ebpfsampler.py
Normal file
88
eacgm/sampler/ebpfsampler.py
Normal file
@@ -0,0 +1,88 @@
|
||||
import time
|
||||
from typing import Dict, List
|
||||
|
||||
from .base import BaseSamplerState, BaseSampler
|
||||
from eacgm.bpf import BPFState, BccBPF
|
||||
|
||||
class eBPFSamplerState(BaseSamplerState):
    """Sampler record built from a ``BPFState`` trace line.

    ``message`` is indexed as a sequence: ``message[0]`` is the phase marker
    (``"start"`` or anything else), ``message[1]`` is the event name, and
    ``message[2:]`` is carried along as extra payload.
    """

    # Substring -> category, checked in this order; first match wins.
    _CATEGORY_MARKERS = (
        ("cuda", "cuda"),
        ("Py", "python"),
        ("nccl", "nccl"),
        ("Torch", "torch"),
    )

    def __init__(self) -> None:
        super().__init__()

    @staticmethod
    def from_ebpfstate(other:BPFState) -> "eBPFSamplerState":
        """Copy task, pid, cpu, timestamp and message from ``other``.

        Declared as a static method so it works whether it is called on the
        class or on an instance.
        """
        state = eBPFSamplerState()
        state.task = other.task
        state.pid = other.pid
        state.cpu = other.cpu
        state.timestamp = other.timestamp
        state.message = other.message
        return state

    def collect(self) -> Dict:
        """Return this record as an event dict.

        Keys: ``name``, ``cat``, ``pid``, ``tid`` (same value as ``pid``),
        ``cpu``, ``ts``, ``ph`` (``"B"`` when ``message[0] == "start"``,
        otherwise ``"E"``) and ``message`` (``self.message[2:]``).
        """
        event = self.message[1]
        cat = next(
            (label for marker, label in self._CATEGORY_MARKERS if marker in event),
            "other",
        )
        ph = "B" if self.message[0] == "start" else "E"
        return {
            "name": event,
            "cat": cat,
            "pid": self.pid,
            "tid": self.pid,
            "cpu": self.cpu,
            # NOTE(review): presumably nanoseconds -> microseconds; confirm the source unit.
            "ts": self.timestamp / 1_000,
            "ph": ph,
            "message": self.message[2:],
        }

    def __repr__(self) -> str:
        """Return the base-class representation prefixed with the class name."""
        return f"eBPFSamplerState {super().__repr__()}"
|
||||
|
||||
class eBPFSampler(BaseSampler):
    """Sampler that attaches uprobes through a BPF backend and drains its trace output."""

    def __init__(self, bpf:BccBPF) -> None:
        """Keep a reference to the BPF backend used for attaching and tracing."""
        super().__init__(name="eBPFSampler")
        self.bpf = bpf

    def run(self, attach_config:List) -> None:
        """Attach entry and return probes for every (path, symbol) pair.

        Each item of ``attach_config`` must provide ``"name"``, ``"exe_path"``
        (iterable of paths) and ``"exe_sym"`` (iterable of symbols). For a
        symbol ``S`` the BPF functions ``S + "Entry"`` and ``S + "Exit"`` are
        attached. Failures are printed and the remaining pairs are still tried.
        """
        for entry in attach_config:
            name = entry["name"]
            paths = entry["exe_path"]
            symbols = entry["exe_sym"]
            for target in paths:
                for symbol in symbols:
                    try:
                        self.bpf.attach_uprobe(target, symbol, symbol + "Entry")
                        self.bpf.attach_uretprobe(target, symbol, symbol + "Exit")
                    except Exception as err:
                        print(err)

    def sample(self, time_stamp:float) -> List[eBPFSamplerState]:
        """Poll the backend without blocking for ``time_stamp`` seconds.

        The deadline is measured with ``time.perf_counter``. At least one
        read is always performed, and one last read happens on the iteration
        in which the deadline is found to have passed. Empty reads are dropped.
        """
        collected = []
        began = time.perf_counter()

        while True:
            expired = time.perf_counter() > began + time_stamp
            raw = self.bpf.trace_ebpf(True)
            if not raw.is_none():
                collected.append(eBPFSamplerState.from_ebpfstate(raw))
            if expired:
                break

        return collected

    def close(self) -> None:
        """Clean up the BPF backend."""
        self.bpf.cleanup()
|
64
eacgm/sampler/gpusampler.py
Normal file
64
eacgm/sampler/gpusampler.py
Normal file
@@ -0,0 +1,64 @@
|
||||
import time
|
||||
import pynvml
|
||||
from typing import List
|
||||
|
||||
from .base import BaseSampler
|
||||
|
||||
class GPUSamplerState:
    """Per-device snapshot filled in by ``GPUSampler``; every field starts as ``None``."""

    def __init__(self) -> None:
        super().__init__()
        # Identity of the device.
        self.gpu: int = None
        self.name: str = None
        # Utilisation and memory readings.
        self.sm: int = None
        self.totMem: int = None
        self.usedMem: int = None
        self.enc: int = None
        self.dec: int = None
        # Thermal and power readings.
        self.tmp: int = None
        self.fan: int = None
        self.usedPower: float = None
        self.totPower: float = None

    def __repr__(self) -> str:
        """Return all fields space-separated; used memory is listed before total memory."""
        fields = (
            self.gpu, self.name, self.sm, self.usedMem, self.totMem,
            self.enc, self.dec, self.tmp, self.fan, self.usedPower, self.totPower,
        )
        return "GPUSamplerState " + " ".join(f"{value}" for value in fields)
|
||||
|
||||
class GPUSampler(BaseSampler):
    """Sampler that reads device-level metrics for every GPU through pynvml."""

    def __init__(self) -> None:
        """Initialise NVML and cache one handle per device index."""
        super().__init__(name="GPUSampler")
        pynvml.nvmlInit()
        self.deviceCount:int = pynvml.nvmlDeviceGetCount()
        self.nvDevices:List = [
            pynvml.nvmlDeviceGetHandleByIndex(idx) for idx in range(self.deviceCount)
        ]

    def run(self) -> None:
        """Nothing to start; sampling is done on demand."""
        return

    def _read_device(self, handle) -> "GPUSamplerState":
        """Query every metric for one device handle and return the filled state."""
        snapshot = GPUSamplerState()
        snapshot.gpu = pynvml.nvmlDeviceGetIndex(handle)
        snapshot.name = pynvml.nvmlDeviceGetName(handle)
        snapshot.sm = pynvml.nvmlDeviceGetUtilizationRates(handle).gpu
        memory = pynvml.nvmlDeviceGetMemoryInfo(handle)
        snapshot.totMem = memory.total
        snapshot.usedMem = memory.used
        # Only the first element of the encoder/decoder result is kept.
        snapshot.enc = pynvml.nvmlDeviceGetEncoderUtilization(handle)[0]
        snapshot.dec = pynvml.nvmlDeviceGetDecoderUtilization(handle)[0]
        snapshot.tmp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
        snapshot.fan = pynvml.nvmlDeviceGetFanSpeed(handle)
        # NOTE(review): presumably milliwatts -> watts; confirm against the NVML docs.
        snapshot.usedPower = pynvml.nvmlDeviceGetPowerUsage(handle) / 1000.0
        snapshot.totPower = pynvml.nvmlDeviceGetPowerManagementLimit(handle) / 1000.0
        return snapshot

    def sample(self) -> List[GPUSamplerState]:
        """Return one ``GPUSamplerState`` per device that could be read.

        A device whose query raises ``pynvml.NVMLError`` is skipped after the
        error is printed.
        """
        collected = []
        for gpu_idx in range(self.deviceCount):
            try:
                collected.append(self._read_device(self.nvDevices[gpu_idx]))
            except pynvml.NVMLError as err:
                print(err)
        return collected

    def close(self) -> None:
        """Shut NVML down."""
        pynvml.nvmlShutdown()
|
57
eacgm/sampler/nvmlsampler.py
Normal file
57
eacgm/sampler/nvmlsampler.py
Normal file
@@ -0,0 +1,57 @@
|
||||
import time
|
||||
import pynvml
|
||||
from typing import List
|
||||
|
||||
from .base import BaseSamplerState, BaseSampler
|
||||
|
||||
class NVMLSamplerState(BaseSamplerState):
    """Per-process GPU utilisation record; extends the base fields with GPU metrics."""

    def __init__(self) -> None:
        super().__init__()
        # GPU-specific fields, all unset until the sampler fills them.
        self.gpu: int = None
        self.sm: int = None
        self.mem: int = None
        self.enc: int = None
        self.dec: int = None

    def __repr__(self) -> str:
        """Return the GPU fields followed by the base-class representation."""
        inherited = super().__repr__()
        return f"NVMLSamplerState {self.gpu} {self.sm} {self.mem} {self.enc} {self.dec} {inherited}"
|
||||
|
||||
class NVMLSampler(BaseSampler):
    """Sampler that reads per-process GPU utilisation through pynvml."""

    def __init__(self) -> None:
        """Initialise NVML and cache one handle per device index."""
        super().__init__(name="NVMLSampler")
        pynvml.nvmlInit()
        self.deviceCount:int = pynvml.nvmlDeviceGetCount()
        self.nvDevices:List = [pynvml.nvmlDeviceGetHandleByIndex(idx) for idx in range(self.deviceCount)]

    def run(self) -> None:
        """Nothing to start; sampling is done on demand."""
        return

    def sample(self, time_stamp:float) -> List[NVMLSamplerState]:
        """Return per-process utilisation samples seen in the last ``time_stamp`` seconds.

        Args:
            time_stamp: Look-back window. It is multiplied by 1,000,000 and
                subtracted from the current time in microseconds.

        Returns:
            One ``NVMLSamplerState`` per process entry reported by NVML, with
            ``task``, ``cpu`` and ``message`` left as ``None``.
        """
        samples = []
        # Cast to int: a float time_stamp would otherwise make this a float,
        # which an unsigned 64-bit C parameter does not accept.
        last_seen = int(time.time_ns() // 1000 - 1000_000 * time_stamp)
        for gpu_idx in range(self.deviceCount):
            gpu_handle = self.nvDevices[gpu_idx]
            try:
                processes = pynvml.nvmlDeviceGetProcessUtilization(gpu_handle, last_seen)
                for process in processes:
                    state = NVMLSamplerState()
                    state.task = None
                    state.pid = process.pid
                    state.cpu = None
                    state.timestamp = process.timeStamp
                    state.message = None
                    state.gpu = gpu_idx
                    state.sm = process.smUtil
                    state.mem = process.memUtil
                    state.enc = process.encUtil
                    state.dec = process.decUtil
                    samples.append(state)
            except pynvml.NVMLError:
                # Best effort, kept from the original: a device that cannot be
                # queried simply contributes no samples.
                # NOTE(review): presumably NVML also raises here when there are
                # no process samples in the window; confirm.
                pass
        return samples

    def close(self) -> None:
        """Shut NVML down."""
        pynvml.nvmlShutdown()
|
3
eacgm/webui/__init__.py
Normal file
3
eacgm/webui/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
||||
from reader import log_reader
|
||||
from connect import database
|
||||
from insert import push_log
|
19
eacgm/webui/connect.py
Normal file
19
eacgm/webui/connect.py
Normal file
@@ -0,0 +1,19 @@
|
||||
# connect to mysql database
|
||||
import mysql.connector
|
||||
|
||||
class database:
    """Thin wrapper around one MySQL connection and one cursor."""

    def __init__(self, ip, port, user, pwd, database) -> None:
        """Open the connection and create the cursor.

        Args:
            ip: Server host.
            port: Server port.
            user: Login name.
            pwd: Login password.
            database: Default schema for the connection.
        """
        self.conn = mysql.connector.connect(
            host = ip,
            port = port,
            user = user,
            password = pwd,
            database = database
        )
        self.cursor = self.conn.cursor()

    def exec(self, cmd: str):
        """Execute ``cmd``, commit, and return its rows.

        Returns the fetched rows for statements that produce a result set
        and an empty list for statements that do not (INSERT, ALTER, ...).

        NOTE(review): ``cmd`` is executed verbatim; callers build it by
        string formatting, so values must be trusted.
        """
        self.cursor.execute(cmd)
        # Per DB-API, description is None when the statement returned no rows;
        # some connector releases raise if fetchall() is called in that case.
        if self.cursor.description is not None:
            result = self.cursor.fetchall()
        else:
            result = []
        self.conn.commit()
        return result
|
113
eacgm/webui/insert.py
Normal file
113
eacgm/webui/insert.py
Normal file
@@ -0,0 +1,113 @@
|
||||
# insert data into mysql database
|
||||
import argparse
|
||||
from reader import log_reader
|
||||
from reader import log_reader
|
||||
from connect import database
|
||||
import time
|
||||
|
||||
def get_col_num(db) -> int:
    """Return the number of columns of ``grafana.CudaEvent``.

    The count is read from ``information_schema.COLUMNS`` through ``db.exec``.
    """
    query = "SELECT COUNT(*) FROM information_schema.COLUMNS where `TABLE_SCHEMA` = 'grafana' and `TABLE_NAME` = 'CudaEvent';"
    rows = db.exec(query)
    # Single row, single column: the count itself.
    return rows[0][0]
|
||||
|
||||
def lts_cuda_event(db) -> list:
    """Return the event columns of the newest ``CudaEvent`` row.

    The first column of the row (the one ordered by, ``time``) is dropped.
    When the table is empty, a list of ``None`` with one entry per event
    column (column count minus one) is returned instead.
    """
    rows = db.exec("SELECT * FROM grafana.`CudaEvent` ORDER BY time DESC LIMIT 1;")
    if len(rows) != 0:
        return list(rows[0][1:])
    return [None] * (get_col_num(db) - 1)
|
||||
|
||||
def lts_event_cnt(db) -> dict:
    """Return the contents of ``grafana.events`` as a ``{name: count}`` dict.

    Every row is expected to have exactly two columns.
    """
    rows = db.exec("""SELECT * FROM grafana.events;""")
    return {name: cnt for name, cnt in rows}
|
||||
|
||||
def add_col(db):
    """Append one ``CHAR(255)`` column to ``grafana.CudaEvent``.

    The new column is named ``event<N>`` where ``N`` is the current column count.
    """
    next_index = get_col_num(db)
    statement = f"""ALTER TABLE grafana.`CudaEvent` ADD COLUMN event{next_index} CHAR(255)"""
    db.exec(statement)
|
||||
|
||||
def del_col(db, col_num):
    """Drop the column ``event<col_num>`` from ``grafana.CudaEvent``."""
    statement = f"""ALTER TABLE grafana.`CudaEvent` DROP COLUMN event{col_num};"""
    db.exec(statement)
|
||||
|
||||
def add_empty(max_time, db):
    """Insert a row at ``max_time`` whose event columns are all ``NULL``.

    The number of ``NULL`` values is the table's column count minus one.
    """
    event_columns = get_col_num(db) - 1
    nulls = ','.join(['NULL'] * event_columns)
    db.exec(f"""INSERT INTO grafana.`CudaEvent` VALUES ({max_time}, {nulls})""")
|
||||
|
||||
def push_log(db, log):
    """Replay parsed log entries into ``grafana.CudaEvent``.

    ``log`` is an iterable of dicts with keys ``'op'`` (``'start'`` or
    ``'end'``), ``'name'`` and ``'time'``. The function keeps a list of
    column slots (``cuda_event``), seeded from the newest table row, and a
    per-name counter (``event_cnt``), seeded from ``grafana.events``. For
    every accepted entry it appends one row (time plus the current slots)
    to a batched INSERT, and finally inserts an all-NULL row at the largest
    time seen.

    Raises:
        ValueError: if an entry's ``'op'`` is neither ``'start'`` nor ``'end'``.

    NOTE(review): the SQL is built by string formatting from log contents,
    so the log must be trusted. ``event_cnt`` is updated in memory only;
    this function does not write it back.
    """
    max_time = 0
    ## latest cuda event
    cuda_event = lts_cuda_event(db)
    ## latest event cnt
    event_cnt = lts_event_cnt(db)
    cmd = f"INSERT INTO grafana.CudaEvent VALUES "
    for line_idx, l in enumerate(log):
        if l['op'] == 'start':
            # Count the start, then place the name in the first free slot.
            if l['name'] in event_cnt:
                event_cnt[l['name']] += 1
            else:
                event_cnt[l["name"]] = 1
            empty_col = False
            i = 0
            for e in cuda_event:
                if e is None:
                    cuda_event[i] = l['name']
                    empty_col = True
                    break
                i += 1
            if not empty_col:
                # No free slot: flush rows batched so far (37 is the length of
                # the bare INSERT prefix, so > 37 means at least one row is
                # pending), then widen the table and use the new last slot.
                if len(cmd) > 37:
                    # Replace the trailing comma with the statement terminator.
                    cmd = cmd[:-1] + ";"
                    # print(cmd)
                    # print('------')
                    db.exec(cmd)
                    cmd = f"INSERT INTO grafana.CudaEvent VALUES "
                add_col(db)
                cuda_event.append(l['name'])
        elif l['op'] == 'end':
            if l['name'] in event_cnt:
                if event_cnt[l["name"]] == 0:
                    # More ends than starts: report and skip this entry entirely.
                    print(f"[!]: in line {line_idx + 1}: event {l['name']} ended more than starting")
                    #raise ValueError(f"in line {line_idx + 1}: event {l['name']} ended more than starting")
                    continue
                event_cnt[l["name"]] -= 1
                # Free the right-most slot currently holding this name.
                for i, e in enumerate(cuda_event[::-1]):
                    if e == l["name"]:
                        cuda_event[len(cuda_event)- 1 - i] = None
                        break
            if l["name"] not in event_cnt:
                # End for a name that was never counted: report and skip.
                print(f"[!]: in line {line_idx + 1}: event {l['name']} ended without starting")
                # raise ValueError(f"in line {line_idx + 1}: event {l['name']} ended without starting")
                continue

        else:
            raise ValueError(f"in line {line_idx + 1}: unknown operation {l['op']}")
        # Emit one row: the entry's time followed by the current slot contents.
        tmp_cmd = f"({l['time']}, "
        max_time = max(max_time, float(l['time']))
        for e in cuda_event:
            if e is None:
                tmp_cmd += "NULL, "
            else:
                tmp_cmd += f"'{e}', "
        # Drop the trailing ", " and close the tuple; the comma separates rows.
        tmp_cmd = tmp_cmd[:-2] + "),"
        cmd += tmp_cmd
    # Flush whatever rows are still pending.
    if len(cmd) > 37:
        cmd = cmd[:-1] + ";"
        # print(cmd)
        # print("------")
        db.exec(cmd)
    # print(cuda_event)
    # print(event_cnt)
    add_empty(max_time,db)
|
13
eacgm/webui/reader.py
Normal file
13
eacgm/webui/reader.py
Normal file
@@ -0,0 +1,13 @@
|
||||
def log_reader(path):
    """Parse a log file into a list of ``{'time', 'op', 'name'}`` dicts.

    Each non-blank line is stripped and split on single spaces; fields 3, 5
    and 6 (0-based) become ``'time'``, ``'op'`` and ``'name'`` respectively,
    all kept as strings. Blank lines (including a trailing empty line) are
    skipped.

    Raises:
        IndexError: if a non-blank line has fewer than seven fields.
    """
    ret = []
    with open(path, 'r') as f:
        # Stream the file instead of loading every line up front.
        for raw in f:
            line = raw.strip()
            if not line:
                continue
            fields = line.split(' ')
            ret.append({'time': fields[3], 'op': fields[5], 'name': fields[6]})
    return ret
|
Reference in New Issue
Block a user