init eACGM

This commit is contained in:
Tokisakix
2025-08-07 10:14:54 +08:00
commit 7a4a0b1b14
51 changed files with 11495 additions and 0 deletions

23
.gitignore vendored Normal file
View File

@@ -0,0 +1,23 @@
*.pyc
*.egg-info
temp.*
python*.json
torch.json
cuda.json
gpu.json
nvml.json
nccl.json
ebpf.json
eacg.json
*.log
.python-version
uv.lock
requirements.txt

21
LICENSE Normal file
View File

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2024 eACGM
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

67
README.md Normal file
View File

@@ -0,0 +1,67 @@
# eACGM
**eACGM:** An **e**BPF-based **A**utomated **C**omprehensive **G**overnance and **M**onitoring framework for AI/ML systems.
---
:star: **[News] Our work has been accepted by [IEEE/ACM IWQoS 2025 (CCF-B)! ](https://iwqos2025.ieee-iwqos.org/)**
**[Paper(Dropbox)](https://www.dropbox.com/scl/fi/q4vplv95usw4u5h3syx62/IWQoS_2025.pdf?rlkey=gv8h65oupkzrmv6zu1yu7s558&e=1&st=k8sttham&dl=0)**
---
eACGM provides zero-intrusive, low-overhead, full-stack observability for both hardware (GPU, NCCL) and software (CUDA, Python, PyTorch) layers in modern AI/ML workloads.
![Architecture](asset/arch.png)
## Features
- [x] **Event tracing for CUDA Runtime** based on eBPF
- [x] **Event tracing for NCCL GPU communication library** based on eBPF
- [x] **Function call tracing for Python virtual machine** based on eBPF
- [x] **Operator tracing for PyTorch** based on eBPF
- [x] **Process-level GPU information monitoring** based on `libnvml`
- [x] **Global GPU information monitoring** based on `libnvml`
- [x] **Automatic eBPF program generation**
- [x] **Comprehensive analysis** of all traced events and operators
- [x] **Flexible integration** for multi-level tracing (CUDA, NCCL, PyTorch, Python, GPU)
- [x] **Visualization-ready data output** for monitoring platforms
## Visualization
To visualize monitoring data, deploy Grafana and MySQL using Docker. Access the Grafana dashboard at [http://127.0.0.1:3000](http://127.0.0.1:3000).
```bash
cd grafana/
sh ./launch.sh
```
Start the monitoring service with:
```bash
./service.sh
```
Stop the monitoring service with:
```bash
./stop.sh
```
## Case Demonstration
The `demo` folder provides example programs to showcase the capabilities of eACGM:
- `pytorch_example.py`: Multi-node, multi-GPU PyTorch training demo
- `sampler_cuda.py`: Trace CUDA Runtime events using eBPF
- `sampler_nccl.py`: Trace NCCL GPU communication events using eBPF
- `sampler_torch.py`: Trace PyTorch operator events using eBPF
- `sampler_python.py`: Trace Python VM function calls using eBPF
- `sampler_gpu.py`: Monitor global GPU information using `libnvml`
- `sampler_nvml.py`: Monitor process-level GPU information using `libnvml`
- `sampler_eacg.py`: Combined monitoring of all supported sources
- `webui.py`: Automatically visualize captured data in Grafana
## Citation
If you find this project helpful, please consider citing our IWQoS 2025 paper (In press, to appear).

BIN
asset/arch.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.2 MiB

114
demo/sampler_cuda.py Normal file
View File

@@ -0,0 +1,114 @@
import time
import json

from eacgm.bpf import BccBPF
from eacgm.sampler import eBPFSampler
from eacgm.collector import to_perfetto

# eBPF program tracing entry/exit of the main CUDA Runtime calls.  Every probe
# emits one line "<ts>@<start|end>@<event>[@args...]"; eBPFSampler later splits
# the message on '@' (see BccBPF.trace_ebpf), so the '@' framing is load-bearing.
text = """
// #include <cuda_runtime.h>
#include <uapi/linux/ptrace.h>

struct dim3 {
    unsigned int x, y, z;
};

int cudaMallocEntry(struct pt_regs *ctx){
    u64 malloc_ptr = PT_REGS_PARM1(ctx);
    u64 byte_length = PT_REGS_PARM2(ctx);
    u64 ts = bpf_ktime_get_ns();
    bpf_trace_printk("%ld@start@cudaMalloc@%ld@%ld\\n", ts, malloc_ptr, byte_length);
    return 0;
};

int cudaMallocExit(struct pt_regs *ctx){
    u64 ts = bpf_ktime_get_ns();
    bpf_trace_printk("%ld@end@cudaMalloc\\n", ts);
    return 0;
};

int cudaMemcpyEntry(struct pt_regs *ctx){
    u64 byte_length = PT_REGS_PARM3(ctx);
    u64 memcpy_kind = PT_REGS_PARM4(ctx);
    u64 ts = bpf_ktime_get_ns();
    // BUGFIX: the format string has three specifiers, but byte_length was not
    // passed, so the emitted line was malformed.
    bpf_trace_printk("%ld@start@cudaMemcpy@%ld@%ld\\n", ts, byte_length, memcpy_kind);
    return 0;
};

int cudaMemcpyExit(struct pt_regs *ctx){
    u64 ts = bpf_ktime_get_ns();
    bpf_trace_printk("%ld@end@cudaMemcpy\\n", ts);
    return 0;
};

int cudaFreeEntry(struct pt_regs *ctx){
    u64 malloc_ptr = PT_REGS_PARM1(ctx);
    u64 ts = bpf_ktime_get_ns();
    // BUGFIX: the timestamp must be the first argument (the parser reads the
    // leading field as the timestamp); the arguments were swapped.
    bpf_trace_printk("%ld@start@cudaFree@%ld\\n", ts, malloc_ptr);
    return 0;
};

int cudaFreeExit(struct pt_regs *ctx){
    u64 ts = bpf_ktime_get_ns();
    bpf_trace_printk("%ld@end@cudaFree\\n", ts);
    return 0;
};

int cudaLaunchKernelEntry(struct pt_regs *ctx){
    u64 ts = bpf_ktime_get_ns();
    // gridDim/blockDim are packed into u64 registers: x in the low 32 bits,
    // y in the high 32 bits (the >> 32 below relies on exactly that layout).
    // BUGFIX: mask the full low 32 bits (0xFFFFFFFF), not just 16 (0xFFFF).
    u32 g_x = PT_REGS_PARM2(ctx) & 0xFFFFFFFF;
    u32 g_y = PT_REGS_PARM2(ctx) >> 32;
    u32 g_z = PT_REGS_PARM3(ctx) & 0xFFFFFFFF;
    u32 b_x = PT_REGS_PARM4(ctx) & 0xFFFFFFFF;
    u32 b_y = PT_REGS_PARM4(ctx) >> 32;
    u32 b_z = PT_REGS_PARM5(ctx) & 0xFFFFFFFF;
    u32 stream_num = g_x * g_y * g_z * b_x * b_y * b_z;
    bpf_trace_printk("%ld@start@cudaLaunchKernel@%u\\n", ts, stream_num);
    return 0;
};

int cudaLaunchKernelExit(struct pt_regs *ctx){
    u64 ts = bpf_ktime_get_ns();
    bpf_trace_printk("%ld@end@cudaLaunchKernel\\n", ts);
    return 0;
};
"""

bpf = BccBPF("CUDAeBPF", text, ["-w"])

attach_config = [
    {
        "name": "CUDASampler",
        "exe_path": [
            "/home/txx/data/miniconda3/envs/eACGM/lib/python3.12/site-packages/nvidia/cuda_runtime/lib/libcudart.so.12",
        ],
        "exe_sym": [
            "cudaMalloc",
            "cudaMemcpy",
            "cudaFree",
            "cudaLaunchKernel",
        ]
    },
]

sampler = eBPFSampler(bpf)
sampler.run(attach_config)

# Drain trace events once a second until the user interrupts.
states = []
while True:
    try:
        samples = sampler.sample(time_stamp=1)
        states += samples
    except KeyboardInterrupt:
        break
sampler.close()

collector = to_perfetto(states)
# BUGFIX: close the output file deterministically instead of leaking the handle.
with open("cuda.json", "w", encoding="utf-8") as fout:
    json.dump(collector, fout, indent=4)

302
demo/sampler_eacg.py Normal file
View File

@@ -0,0 +1,302 @@
import os
import time
import json

from eacgm.bpf import BccBPF
from eacgm.sampler import eBPFSampler, NVMLSampler, GPUSampler
from eacgm.collector import to_perfetto

# Clear stale result files from previous runs.
for filename in os.listdir("res"):
    os.remove(os.path.join("res", filename))

# Wall-clock epoch (in microseconds) corresponding to the monotonic clock
# origin, so monotonic eBPF timestamps can be aligned with wall-clock samples.
start_time = time.time_ns() - time.clock_gettime_ns(time.CLOCK_MONOTONIC)
start_time /= 1_000

# PyTorch operator name -> mangled libtorch_python symbol to probe.
torch_func_sym = {
    "TorchAdd": "_ZN5torch8autogradL15THPVariable_addEP7_objectS2_S2_",
    "TorchSub": "_ZN5torch8autogradL15THPVariable_subEP7_objectS2_S2_",
    "TorchMul": "_ZN5torch8autogradL15THPVariable_mulEP7_objectS2_S2_",
    "TorchMatmul": "_ZN5torch8autogradL18THPVariable_matmulEP7_objectS2_S2_",
    "TorchDiv": "_ZN5torch8autogradL15THPVariable_divEP7_objectS2_S2_",
    "TorchLinear": "_ZN5torch8autogradL18THPVariable_linearEP7_objectS2_S2_",
    "TorchConv2d": "_ZN5torch8autogradL18THPVariable_conv2dEP7_objectS2_S2_",
    "TorchReLU": "_ZN5torch8autogradL16THPVariable_reluEP7_objectS2_S2_",
    "TorchSigmoid": "_ZN5torch8autogradL19THPVariable_sigmoidEP7_objectS2_S2_",
    "TorchTanh": "_ZN5torch8autogradL16THPVariable_tanhEP7_objectS2_S2_",
    "TorchSoftmax": "_ZN5torch8autogradL19THPVariable_softmaxEP7_objectS2_S2_",
    "TorchMSELoss": "_ZN5torch8autogradL20THPVariable_mse_lossEP7_objectS2_S2_",
    "TorchBCELoss": "_ZN5torch8autogradL32THPVariable_binary_cross_entropyEP7_objectS2_S2_",
    "TorchCrossEntropyLoss": "_ZN5torch8autogradL30THPVariable_cross_entropy_lossEP7_objectS2_S2_",
    "TorchConvTranspose2d": "_ZN5torch8autogradL28THPVariable_conv_transpose2dEP7_objectS2_S2_",
    "TorchMaxUnpool2d": "_ZN5torch8autogradL24THPVariable_max_unpool2dEP7_objectS2_S2_",
    "TorchBatchNorm2d": "_ZN5torch8autogradL22THPVariable_batch_normEP7_objectS2_S2_",
    "TorchAvgPool2d": "_ZN5torch8autogradL22THPVariable_avg_pool2dEP7_objectS2_S2_",
    "TorchMaxPool2d": "_ZN5torch8autogradL22THPVariable_max_pool2dEP7_objectS2_S2_",
    "TorchDropout": "_ZN5torch8autogradL19THPVariable_dropoutEP7_objectS2_S2_",
    "TorchEmbedding": "_ZN5torch8autogradL21THPVariable_embeddingEP7_objectS2_S2_",
    "TorchLSTM": "_ZN5torch8autogradL16THPVariable_lstmEP7_objectS2_S2_",
    "TorchAdaptiveMaxPool2d": "_ZN5torch8autogradL31THPVariable_adaptive_max_pool2dEP7_objectS2_S2_",
    "TorchAdaptiveAvgPool2d": "_ZN5torch8autogradL31THPVariable_adaptive_avg_pool2dEP7_objectS2_S2_",
}

# Entry/exit probe template; <TorchSym>/<TorchFunc> are substituted per operator.
torch_template = """
int <TorchSym>Entry(struct pt_regs *ctx){
    u64 ts = bpf_ktime_get_ns();
    bpf_trace_printk("%ld@start@<TorchFunc>\\n", ts);
    return 0;
};

int <TorchSym>Exit(struct pt_regs *ctx){
    u64 ts = bpf_ktime_get_ns();
    bpf_trace_printk("%ld@end@<TorchFunc>\\n", ts);
    return 0;
};
"""

# Combined NCCL + Python VM + CUDA Runtime probes.  Every line uses
# "<ts>@<start|end>@<event>[@args...]" framing, which BccBPF.trace_ebpf
# parses by splitting on '@'.
text = """
#include <uapi/linux/ptrace.h>

int ncclAllReduceEntry(struct pt_regs *ctx){
    u64 ts = bpf_ktime_get_ns();
    u64 size_count = PT_REGS_PARM3(ctx);
    u64 data_type = PT_REGS_PARM4(ctx);
    u64 reduce_op = PT_REGS_PARM5(ctx);
    // NOTE(review): "* 8" assumes 8-byte elements; the real element size
    // depends on data_type -- confirm.
    bpf_trace_printk("%ld@start@ncclAllReduce@%ld\\n", ts, size_count * 8);
    return 0;
};

int ncclAllReduceExit(struct pt_regs *ctx){
    u64 ts = bpf_ktime_get_ns();
    bpf_trace_printk("%ld@end@ncclAllReduce\\n", ts);
    return 0;
};

int ncclReduceEntry(struct pt_regs *ctx){
    u64 ts = bpf_ktime_get_ns();
    u64 size_count = PT_REGS_PARM3(ctx);
    u64 data_type = PT_REGS_PARM4(ctx);
    u64 reduce_op = PT_REGS_PARM5(ctx);
    bpf_trace_printk("%ld@start@ncclReduce@%ld\\n", ts, size_count * 8);
    return 0;
};

int ncclReduceExit(struct pt_regs *ctx){
    u64 ts = bpf_ktime_get_ns();
    bpf_trace_printk("%ld@end@ncclReduce\\n", ts);
    return 0;
};

int ncclBroadcastEntry(struct pt_regs *ctx){
    u64 ts = bpf_ktime_get_ns();
    u64 size_count = PT_REGS_PARM3(ctx);
    u64 data_type = PT_REGS_PARM4(ctx);
    u64 root_id = PT_REGS_PARM5(ctx);
    bpf_trace_printk("%ld@start@ncclBroadcast@%ld\\n", ts, size_count * 8);
    return 0;
};

int ncclBroadcastExit(struct pt_regs *ctx){
    u64 ts = bpf_ktime_get_ns();
    bpf_trace_printk("%ld@end@ncclBroadcast\\n", ts);
    return 0;
};

int ncclAllGatherEntry(struct pt_regs *ctx){
    u64 ts = bpf_ktime_get_ns();
    u64 size_count = PT_REGS_PARM3(ctx);
    u64 data_type = PT_REGS_PARM4(ctx);
    bpf_trace_printk("%ld@start@ncclAllGather@%ld\\n", ts, size_count * 8);
    return 0;
};

int ncclAllGatherExit(struct pt_regs *ctx){
    u64 ts = bpf_ktime_get_ns();
    bpf_trace_printk("%ld@end@ncclAllGather\\n", ts);
    return 0;
};

int ncclSendEntry(struct pt_regs *ctx){
    u64 ts = bpf_ktime_get_ns();
    u64 size_count = PT_REGS_PARM2(ctx);
    u64 data_type = PT_REGS_PARM3(ctx);
    bpf_trace_printk("%ld@start@ncclSend@%ld\\n", ts, size_count * 8);
    return 0;
};

int ncclSendExit(struct pt_regs *ctx){
    u64 ts = bpf_ktime_get_ns();
    bpf_trace_printk("%ld@end@ncclSend\\n", ts);
    return 0;
};

int ncclRecvEntry(struct pt_regs *ctx){
    u64 ts = bpf_ktime_get_ns();
    u64 size_count = PT_REGS_PARM2(ctx);
    u64 data_type = PT_REGS_PARM3(ctx);
    bpf_trace_printk("%ld@start@ncclRecv@%ld\\n", ts, size_count * 8);
    return 0;
};

int ncclRecvExit(struct pt_regs *ctx){
    u64 ts = bpf_ktime_get_ns();
    bpf_trace_printk("%ld@end@ncclRecv\\n", ts);
    return 0;
};

int PyObject_CallFunctionEntry(struct pt_regs *ctx){
    u64 ts = bpf_ktime_get_ns();
    // BUGFIX: use '@' separators so the Python-side parser (which splits the
    // message on '@') can read these lines; they previously used spaces.
    bpf_trace_printk("%ld@start@PyObject_CallFunction\\n", ts);
    return 0;
};

int PyObject_CallFunctionExit(struct pt_regs *ctx){
    u64 ts = bpf_ktime_get_ns();
    bpf_trace_printk("%ld@end@PyObject_CallFunction\\n", ts);
    return 0;
};

struct dim3 {
    unsigned int x, y, z;
};

int cudaMallocEntry(struct pt_regs *ctx){
    u64 malloc_ptr = PT_REGS_PARM1(ctx);
    u64 byte_length = PT_REGS_PARM2(ctx);
    u64 ts = bpf_ktime_get_ns();
    bpf_trace_printk("%ld@start@cudaMalloc@%ld@%ld\\n", ts, malloc_ptr, byte_length);
    return 0;
};

int cudaMallocExit(struct pt_regs *ctx){
    u64 ts = bpf_ktime_get_ns();
    bpf_trace_printk("%ld@end@cudaMalloc\\n", ts);
    return 0;
};

int cudaMemcpyEntry(struct pt_regs *ctx){
    u64 byte_length = PT_REGS_PARM3(ctx);
    u64 memcpy_kind = PT_REGS_PARM4(ctx);
    u64 ts = bpf_ktime_get_ns();
    // BUGFIX: three format specifiers were given only two arguments; pass
    // byte_length as well.
    bpf_trace_printk("%ld@start@cudaMemcpy@%ld@%ld\\n", ts, byte_length, memcpy_kind);
    return 0;
};

int cudaMemcpyExit(struct pt_regs *ctx){
    u64 ts = bpf_ktime_get_ns();
    bpf_trace_printk("%ld@end@cudaMemcpy\\n", ts);
    return 0;
};

int cudaFreeEntry(struct pt_regs *ctx){
    u64 malloc_ptr = PT_REGS_PARM1(ctx);
    u64 ts = bpf_ktime_get_ns();
    // BUGFIX: timestamp must be the first argument; the arguments were swapped.
    bpf_trace_printk("%ld@start@cudaFree@%ld\\n", ts, malloc_ptr);
    return 0;
};

int cudaFreeExit(struct pt_regs *ctx){
    u64 ts = bpf_ktime_get_ns();
    bpf_trace_printk("%ld@end@cudaFree\\n", ts);
    return 0;
};

int cudaLaunchKernelEntry(struct pt_regs *ctx){
    u64 ts = bpf_ktime_get_ns();
    // gridDim/blockDim are packed into u64 registers: x low 32 bits, y high
    // 32 bits.  BUGFIX: mask the full 32 bits (0xFFFFFFFF), not 16 (0xFFFF).
    u32 g_x = PT_REGS_PARM2(ctx) & 0xFFFFFFFF;
    u32 g_y = PT_REGS_PARM2(ctx) >> 32;
    u32 g_z = PT_REGS_PARM3(ctx) & 0xFFFFFFFF;
    u32 b_x = PT_REGS_PARM4(ctx) & 0xFFFFFFFF;
    u32 b_y = PT_REGS_PARM4(ctx) >> 32;
    u32 b_z = PT_REGS_PARM5(ctx) & 0xFFFFFFFF;
    u32 stream_num = g_x * g_y * g_z * b_x * b_y * b_z;
    bpf_trace_printk("%ld@start@cudaLaunchKernel@%u\\n", ts, stream_num);
    return 0;
};

int cudaLaunchKernelExit(struct pt_regs *ctx){
    u64 ts = bpf_ktime_get_ns();
    bpf_trace_printk("%ld@end@cudaLaunchKernel\\n", ts);
    return 0;
};
"""

# Append one entry/exit probe pair per traced PyTorch operator.
for func, sym in torch_func_sym.items():
    text += torch_template.replace("<TorchSym>", sym).replace("<TorchFunc>", func)

bpf = BccBPF("eACGSampler", text, ["-w"])

attach_config = [
    {
        "name": "CUDASampler",
        "exe_path": [
            "/home/txx/data/miniconda3/envs/eACGM/lib/python3.12/site-packages/nvidia/cuda_runtime/lib/libcudart.so.12",
        ],
        "exe_sym": [
            "cudaLaunchKernel",
        ]
    },
    {
        "name": "NCCLSampler",
        "exe_path": [
            "/home/txx/data/miniconda3/envs/eACGM/lib/python3.12/site-packages/nvidia/nccl/lib/libnccl.so.2",
        ],
        "exe_sym": [
            "ncclAllReduce",
        ]
    },
    {
        "name": "PythonSampler",
        "exe_path": [
            "/home/txx/data/miniconda3/envs/eACGM/bin/python",
        ],
        "exe_sym": [
            # "PyObject_CallFunction",
        ]
    },
    {
        "name": "TorchSampler",
        "exe_path": [
            "/home/txx/data/miniconda3/envs/eACGM/lib/python3.12/site-packages/torch/lib/libtorch_python.so",
        ],
        # FIX: derive the symbol list directly from the mapping.
        "exe_sym": list(torch_func_sym.values()),
    },
]

eacg_sampler = eBPFSampler(bpf)
nvml_sampler = NVMLSampler()
gpu_sampler = GPUSampler()
eacg_sampler.run(attach_config)

# Merge samples from all three sources once a second until interrupted.
states = []
while True:
    try:
        samples = []
        samples += eacg_sampler.sample(time_stamp=1)
        samples += nvml_sampler.sample(time_stamp=1)
        samples += gpu_sampler.sample()
        states += samples
        for sample in samples:
            print(sample)
        print("---")
    except KeyboardInterrupt:
        break

eacg_sampler.close()
nvml_sampler.close()
gpu_sampler.close()

ebpf_collector = to_perfetto(states)
# BUGFIX: close output/input files deterministically instead of leaking handles.
with open("res/ebpf.json", "w", encoding="utf-8") as fout:
    json.dump(ebpf_collector, fout, indent=4)

# Merge per-process Python logs (written into res/ by traced workers)
# into the combined trace.
eacg_collector = ebpf_collector
for python_log in os.listdir("res"):
    if "python" not in python_log:
        continue
    with open(os.path.join("res", python_log), "r", encoding="utf-8") as fin:
        eacg_collector += json.load(fin)
with open("res/eacg.json", "w", encoding="utf-8") as fout:
    json.dump(eacg_collector, fout, indent=4)

35
demo/sampler_gpu.py Normal file
View File

@@ -0,0 +1,35 @@
import time
import json

from eacgm.sampler import GPUSampler

sampler = GPUSampler()
sampler.run()

# Poll device-level GPU stats once a second until the user interrupts,
# flattening each GPUSamplerState into a JSON-friendly dict.
states = []
while True:
    try:
        for sample in sampler.sample():
            states.append({
                "ts": time.time_ns(),
                "gpu": sample.gpu,
                "gpu_utl": sample.sm,
                "totMem": sample.totMem,
                "usedMem": sample.usedMem,
                "encode_utl": sample.enc,
                "decode_utl": sample.dec,
                "temperature": sample.tmp,
                "fan_utl": sample.fan,
                "usedPower": sample.usedPower,
                "totPower": sample.totPower,
            })
        time.sleep(1)
        print("---")
    except KeyboardInterrupt:
        break
sampler.close()

# BUGFIX: close the output file deterministically instead of leaking the handle.
with open("gpu.json", "w", encoding="utf-8") as fout:
    json.dump(states, fout, indent=4)

135
demo/sampler_nccl.py Normal file
View File

@@ -0,0 +1,135 @@
import time
import json

from eacgm.bpf import BccBPF
from eacgm.sampler import eBPFSampler
from eacgm.collector import to_perfetto

# eBPF program tracing the main NCCL collective/point-to-point calls.  Every
# probe emits "<ts>@<start|end>@<event>[@bytes]"; the message is parsed on '@'.
text = """
#include <uapi/linux/ptrace.h>

int ncclAllReduceEntry(struct pt_regs *ctx){
    u64 ts = bpf_ktime_get_ns();
    u64 size_count = PT_REGS_PARM3(ctx);
    u64 data_type = PT_REGS_PARM4(ctx);
    u64 reduce_op = PT_REGS_PARM5(ctx);
    // NOTE(review): "* 8" assumes 8-byte elements; the true byte size depends
    // on data_type -- confirm.
    bpf_trace_printk("%ld@start@ncclAllReduce@%ld\\n", ts, size_count * 8);
    return 0;
};

int ncclAllReduceExit(struct pt_regs *ctx){
    u64 ts = bpf_ktime_get_ns();
    bpf_trace_printk("%ld@end@ncclAllReduce\\n", ts);
    return 0;
};

int ncclReduceEntry(struct pt_regs *ctx){
    u64 ts = bpf_ktime_get_ns();
    u64 size_count = PT_REGS_PARM3(ctx);
    u64 data_type = PT_REGS_PARM4(ctx);
    u64 reduce_op = PT_REGS_PARM5(ctx);
    bpf_trace_printk("%ld@start@ncclReduce@%ld\\n", ts, size_count * 8);
    return 0;
};

int ncclReduceExit(struct pt_regs *ctx){
    u64 ts = bpf_ktime_get_ns();
    bpf_trace_printk("%ld@end@ncclReduce\\n", ts);
    return 0;
};

int ncclBroadcastEntry(struct pt_regs *ctx){
    u64 ts = bpf_ktime_get_ns();
    u64 size_count = PT_REGS_PARM3(ctx);
    u64 data_type = PT_REGS_PARM4(ctx);
    u64 root_id = PT_REGS_PARM5(ctx);
    bpf_trace_printk("%ld@start@ncclBroadcast@%ld\\n", ts, size_count * 8);
    return 0;
};

int ncclBroadcastExit(struct pt_regs *ctx){
    u64 ts = bpf_ktime_get_ns();
    bpf_trace_printk("%ld@end@ncclBroadcast\\n", ts);
    return 0;
};

int ncclAllGatherEntry(struct pt_regs *ctx){
    u64 ts = bpf_ktime_get_ns();
    u64 size_count = PT_REGS_PARM3(ctx);
    u64 data_type = PT_REGS_PARM4(ctx);
    bpf_trace_printk("%ld@start@ncclAllGather@%ld\\n", ts, size_count * 8);
    return 0;
};

int ncclAllGatherExit(struct pt_regs *ctx){
    u64 ts = bpf_ktime_get_ns();
    bpf_trace_printk("%ld@end@ncclAllGather\\n", ts);
    return 0;
};

int ncclSendEntry(struct pt_regs *ctx){
    u64 ts = bpf_ktime_get_ns();
    u64 size_count = PT_REGS_PARM2(ctx);
    u64 data_type = PT_REGS_PARM3(ctx);
    bpf_trace_printk("%ld@start@ncclSend@%ld\\n", ts, size_count * 8);
    return 0;
};

int ncclSendExit(struct pt_regs *ctx){
    u64 ts = bpf_ktime_get_ns();
    bpf_trace_printk("%ld@end@ncclSend\\n", ts);
    return 0;
};

int ncclRecvEntry(struct pt_regs *ctx){
    u64 ts = bpf_ktime_get_ns();
    u64 size_count = PT_REGS_PARM2(ctx);
    u64 data_type = PT_REGS_PARM3(ctx);
    bpf_trace_printk("%ld@start@ncclRecv@%ld\\n", ts, size_count * 8);
    return 0;
};

int ncclRecvExit(struct pt_regs *ctx){
    u64 ts = bpf_ktime_get_ns();
    bpf_trace_printk("%ld@end@ncclRecv\\n", ts);
    return 0;
};
"""

bpf = BccBPF("NCCLeBPF", text, ["-w"])

attach_config = [
    {
        "name": "NCCLSampler",
        "exe_path": [
            "/home/txx/data/miniconda3/envs/eACGM/lib/python3.12/site-packages/nvidia/nccl/lib/libnccl.so.2",
        ],
        "exe_sym": [
            "ncclAllReduce",
            "ncclReduce",
            "ncclBroadcast",
            "ncclAllGather",
            "ncclSend",
            "ncclRecv",
        ]
    },
]

sampler = eBPFSampler(bpf)
sampler.run(attach_config)

# Drain trace events once a second until the user interrupts.
states = []
while True:
    try:
        samples = sampler.sample(time_stamp=1)
        states += samples
    except KeyboardInterrupt:
        break
sampler.close()

collector = to_perfetto(states)
# BUGFIX: close the output file deterministically instead of leaking the handle.
with open("nccl.json", "w", encoding="utf-8") as fout:
    json.dump(collector, fout, indent=4)

30
demo/sampler_nvml.py Normal file
View File

@@ -0,0 +1,30 @@
import time
import json

from eacgm.sampler import NVMLSampler

sampler = NVMLSampler()
sampler.run()

# Poll per-process GPU stats every two seconds until the user interrupts,
# flattening each NVMLSamplerState into a JSON-friendly dict.
states = []
while True:
    try:
        for sample in sampler.sample(time_stamp=1):
            states.append({
                "ts": time.time_ns(),
                "pid": sample.pid,
                "gpu": sample.gpu,
                "gpu_utl": sample.sm,
                "mem": sample.mem,
                "encode_utl": sample.enc,
                "decode_utl": sample.dec,
            })
        time.sleep(2)
        print("---")
    except KeyboardInterrupt:
        break
sampler.close()

# BUGFIX: close the output file deterministically instead of leaking the handle.
with open("nvml.json", "w", encoding="utf-8") as fout:
    json.dump(states, fout, indent=4)

50
demo/sampler_python.py Normal file
View File

@@ -0,0 +1,50 @@
import time
import ctypes

from eacgm.bpf import BccBPF
from eacgm.sampler import eBPFSampler

# eBPF probes for the CPython C API entry point PyObject_CallFunction.
# BUGFIX: the trace lines must use '@' separators ("<ts>@<phase>@<event>"),
# because BccBPF.trace_ebpf splits the message on '@' and int()-parses the
# first field; the previous space-separated lines made parsing fail.
text = """
#include <uapi/linux/ptrace.h>

int PyObject_CallFunctionEntry(struct pt_regs *ctx){
    u64 ts = bpf_ktime_get_ns();
    bpf_trace_printk("%ld@start@PyObject_CallFunction\\n", ts);
    return 0;
};

int PyObject_CallFunctionExit(struct pt_regs *ctx){
    u64 ts = bpf_ktime_get_ns();
    bpf_trace_printk("%ld@end@PyObject_CallFunction\\n", ts);
    return 0;
};
"""

bpf = BccBPF("PythoneBPF", text, ["-w"])

attach_config = [
    {
        "name": "PythonSampler",
        "exe_path": [
            "/home/txx/data/miniconda3/envs/py312-torch24-cu124/bin/python",
        ],
        "exe_sym": [
            "PyObject_CallFunction",
        ]
    },
]

sampler = eBPFSampler(bpf)
sampler.run(attach_config)

# Print sampled call events once a second until the user interrupts.
while True:
    try:
        samples = sampler.sample(time_stamp=1)
        for sample in samples:
            print(sample)
        print("---")
    except KeyboardInterrupt:
        break
sampler.close()

109
demo/sampler_torch.py Normal file
View File

@@ -0,0 +1,109 @@
import time
import json

from eacgm.bpf import BccBPF
from eacgm.sampler import eBPFSampler
from eacgm.collector import to_perfetto

# PyTorch operator name -> mangled libtorch_python symbol to probe.
func_sym = {
    "TorchAdd": "_ZN5torch8autogradL15THPVariable_addEP7_objectS2_S2_",
    "TorchSub": "_ZN5torch8autogradL15THPVariable_subEP7_objectS2_S2_",
    "TorchMul": "_ZN5torch8autogradL15THPVariable_mulEP7_objectS2_S2_",
    "TorchMatmul": "_ZN5torch8autogradL18THPVariable_matmulEP7_objectS2_S2_",
    "TorchDiv": "_ZN5torch8autogradL15THPVariable_divEP7_objectS2_S2_",
    "TorchLinear": "_ZN5torch8autogradL18THPVariable_linearEP7_objectS2_S2_",
    "TorchConv2d": "_ZN5torch8autogradL18THPVariable_conv2dEP7_objectS2_S2_",
    "TorchReLU": "_ZN5torch8autogradL16THPVariable_reluEP7_objectS2_S2_",
    "TorchSigmoid": "_ZN5torch8autogradL19THPVariable_sigmoidEP7_objectS2_S2_",
    "TorchTanh": "_ZN5torch8autogradL16THPVariable_tanhEP7_objectS2_S2_",
    "TorchSoftmax": "_ZN5torch8autogradL19THPVariable_softmaxEP7_objectS2_S2_",
    "TorchMSELoss": "_ZN5torch8autogradL20THPVariable_mse_lossEP7_objectS2_S2_",
    "TorchBCELoss": "_ZN5torch8autogradL32THPVariable_binary_cross_entropyEP7_objectS2_S2_",
    "TorchCrossEntropyLoss": "_ZN5torch8autogradL30THPVariable_cross_entropy_lossEP7_objectS2_S2_",
    "TorchConvTranspose2d": "_ZN5torch8autogradL28THPVariable_conv_transpose2dEP7_objectS2_S2_",
    "TorchMaxUnpool2d": "_ZN5torch8autogradL24THPVariable_max_unpool2dEP7_objectS2_S2_",
    "TorchBatchNorm2d": "_ZN5torch8autogradL22THPVariable_batch_normEP7_objectS2_S2_",
    "TorchAvgPool2d": "_ZN5torch8autogradL22THPVariable_avg_pool2dEP7_objectS2_S2_",
    "TorchMaxPool2d": "_ZN5torch8autogradL22THPVariable_max_pool2dEP7_objectS2_S2_",
    "TorchDropout": "_ZN5torch8autogradL19THPVariable_dropoutEP7_objectS2_S2_",
    "TorchEmbedding": "_ZN5torch8autogradL21THPVariable_embeddingEP7_objectS2_S2_",
    "TorchLSTM": "_ZN5torch8autogradL16THPVariable_lstmEP7_objectS2_S2_",
    "TorchAdaptiveMaxPool2d": "_ZN5torch8autogradL31THPVariable_adaptive_max_pool2dEP7_objectS2_S2_",
    "TorchAdaptiveAvgPool2d": "_ZN5torch8autogradL31THPVariable_adaptive_avg_pool2dEP7_objectS2_S2_",
}

# Entry/exit probe template; <TorchSym>/<TorchFunc> are substituted per operator.
template = """
int <TorchSym>Entry(struct pt_regs *ctx){
    u64 ts = bpf_ktime_get_ns();
    bpf_trace_printk("%ld@start@<TorchFunc>\\n", ts);
    return 0;
};

int <TorchSym>Exit(struct pt_regs *ctx){
    u64 ts = bpf_ktime_get_ns();
    bpf_trace_printk("%ld@end@<TorchFunc>\\n", ts);
    return 0;
};
"""

# Generate one probe pair per traced operator.
text = ""
for func, sym in func_sym.items():
    text += template.replace("<TorchSym>", sym).replace("<TorchFunc>", func)

bpf = BccBPF("TorcheBPF", text, ["-w"])

attach_config = [
    {
        "name": "TorchSampler",
        "exe_path": [
            "/home/txx/data/miniconda3/envs/eACGM/lib/python3.12/site-packages/torch/./lib/libtorch_python.so",
        ],
        # FIX: derive the symbol list from func_sym instead of hand-duplicating
        # all 24 mangled names (they could silently drift apart).
        "exe_sym": list(func_sym.values()),
    },
]

sampler = eBPFSampler(bpf)
sampler.run(attach_config)

# Drain trace events once a second until the user interrupts.
states = []
while True:
    try:
        samples = sampler.sample(time_stamp=1)
        states += samples
    except KeyboardInterrupt:
        break
sampler.close()

collector = to_perfetto(states)
# BUGFIX: close the output file deterministically instead of leaking the handle.
with open("torch.json", "w", encoding="utf-8") as fout:
    json.dump(collector, fout, indent=4)

15
demo/webui.py Normal file
View File

@@ -0,0 +1,15 @@
from eacgm.webui import log_reader, database, push_log

# Connection settings for Grafana's MySQL backing store.
ip = "127.0.0.1"
port = 3306
user = "node1"
pwd = "mysql114514"
data_base = "grafana"
table = "CudaEvent"


def _push_transformer_log() -> None:
    """Parse the captured training log and push its rows into the database."""
    log = log_reader("log/transformer.log")
    db = database(ip, port, user, pwd, data_base)
    push_log(db, log)


if __name__ == "__main__":
    _push_transformer_log()

1
eacgm/__init__.py Normal file
View File

@@ -0,0 +1 @@
# Package version of eacgm; bump on release.
__version__ = "0.1.0"

2
eacgm/bpf/__init__.py Normal file
View File

@@ -0,0 +1,2 @@
from .base import BPFState, BaseBPF
from .bccBPF import BccBPF

38
eacgm/bpf/base.py Normal file
View File

@@ -0,0 +1,38 @@
class BPFState:
    """One parsed record read from the kernel trace pipe.

    Attributes:
        task: name of the traced task, or None when no record was read.
        pid: process id of the traced task.
        cpu: CPU the record was emitted on.
        timestamp: nanosecond timestamp parsed from the message.
        message: remaining '@'-separated message fields.
    """
    task:str
    pid:int
    cpu:int
    timestamp:int
    message:str

    def __init__(self) -> None:
        # All fields start as None; is_none() reports this empty state.
        self.task = None
        self.pid = None
        self.cpu = None
        self.timestamp = None
        self.message = None

    def is_none(self) -> bool:
        """Return True while no trace record has been filled in."""
        return self.task is None

    def __repr__(self) -> str:
        # BUGFIX: removed a stray space before the cpu field ("{ self.cpu}").
        return f"BPFState {self.task} {self.pid} {self.cpu} {self.timestamp} {self.message}"
class BaseBPF:
    """Abstract interface for an eBPF backend (see BccBPF for the BCC one)."""

    def __init__(self, name:str) -> None:
        # Human-readable name of the eBPF program.
        self.name = name

    def attach_uprobe(self, exe_path:str, exe_sym:str, bpf_func:str) -> bool:
        """Attach *bpf_func* to the entry of *exe_sym* in binary *exe_path*."""
        # BUGFIX: these stubs raised NotADirectoryError (an OSError about
        # filesystem paths); the conventional marker for an unimplemented
        # abstract method is NotImplementedError.
        raise NotImplementedError

    def attach_uretprobe(self, exe_path:str, exe_sym:str, bpf_func:str) -> bool:
        """Attach *bpf_func* to the return of *exe_sym* in binary *exe_path*."""
        raise NotImplementedError

    def cleanup(self) -> None:
        """Detach all probes and release backend resources."""
        raise NotImplementedError

    def trace_ebpf(self) -> "BPFState":
        """Read and parse one record from the trace pipe."""
        raise NotImplementedError

34
eacgm/bpf/bccBPF.py Normal file
View File

@@ -0,0 +1,34 @@
from bcc import BPF
from typing import List
from .base import BPFState, BaseBPF
class BccBPF(BaseBPF):
    """BaseBPF implementation backed by the BCC toolchain."""

    def __init__(self, name:str, text:str, cflags:List=None) -> None:
        """Compile *text* as an eBPF program with optional compiler flags.

        BUGFIX: the default for ``cflags`` was a mutable ``[]`` shared across
        calls; use a ``None`` sentinel instead (backward compatible).
        """
        super().__init__(name)
        self.bpf = BPF(text=text, cflags=[] if cflags is None else cflags)

    def attach_uprobe(self, exe_path:str, exe_sym:str, bpf_func:str) -> bool:
        """Attach *bpf_func* to the entry of *exe_sym* in *exe_path*."""
        self.bpf.attach_uprobe(exe_path, exe_sym, fn_name=bpf_func)
        return

    def attach_uretprobe(self, exe_path:str, exe_sym:str, bpf_func:str) -> bool:
        """Attach *bpf_func* to the return of *exe_sym* in *exe_path*."""
        self.bpf.attach_uretprobe(exe_path, exe_sym, fn_name=bpf_func)
        return

    def cleanup(self) -> None:
        """Detach probes and free BCC resources."""
        self.bpf.cleanup()
        return

    def trace_ebpf(self, nonblocking:bool) -> BPFState:
        """Read one trace-pipe record and parse it into a BPFState.

        Messages are framed as "<ts>@<field>@<field>..."; an empty read
        (nonblocking with nothing pending) yields a state where is_none()
        is True.
        """
        (task, pid, cpu, _, _, message) = self.bpf.trace_fields(nonblocking)
        state = BPFState()
        if task is not None:
            message = message.decode("utf-8")
            # Split once instead of twice.
            fields = message.split("@")
            state.task = task.decode("utf-8")
            state.pid = int(pid)
            state.cpu = int(cpu)
            state.timestamp = int(fields[0])
            state.message = fields[1:]
        return state

View File

@@ -0,0 +1 @@
from .profetto import to_perfetto

View File

@@ -0,0 +1,21 @@
from typing import List
from eacgm.sampler import eBPFSamplerState
def to_perfetto(states:List[eBPFSamplerState]) -> List:
    """Pair up B/E sampler events into a Perfetto-style event list.

    Only eBPF-sampled states are considered; for each "<event>-<pid>" track,
    a begin ("B") event immediately followed by an end ("E") event is emitted
    as a pair.  Unmatched events are dropped.
    """
    collected = []
    open_events = {}
    for raw in states:
        if not isinstance(raw, eBPFSamplerState):
            continue
        event = raw.collect()
        track = f"{event['name']}-{event['pid']}"
        previous = open_events.get(track)
        if previous is None:
            open_events[track] = event
            continue
        if previous["ph"] == "B" and event["ph"] == "E":
            collected.append(previous)
            collected.append(event)
        open_events[track] = event
    return collected

View File

@@ -0,0 +1,4 @@
from .base import BaseSampler
from .ebpfsampler import eBPFSampler, eBPFSamplerState
from .nvmlsampler import NVMLSampler, NVMLSamplerState
from .gpusampler import GPUSampler, GPUSamplerState

35
eacgm/sampler/base.py Normal file
View File

@@ -0,0 +1,35 @@
class BaseSamplerState:
    """Common fields shared by every sampler's state record."""
    task:str
    pid:int
    cpu:int
    timestamp:int
    message:str

    def __init__(self) -> None:
        # Every field starts empty; concrete samplers fill them in later.
        for field in ("task", "pid", "cpu", "timestamp", "message"):
            setattr(self, field, None)

    def is_none(self) -> bool:
        """Return True while no event has been recorded into this state."""
        return self.task is None

    def __repr__(self) -> str:
        return f"{self.task} {self.pid} {self.cpu} {self.timestamp} {self.message}"
class BaseSampler:
    """Abstract sampler interface: run() to start, sample() to poll, close() to stop."""

    def __init__(self, name:str) -> None:
        # Identifier used by concrete samplers (e.g. "GPUSampler").
        self.name = name

    def run(self) -> None:
        """Start the sampler; must be overridden."""
        raise NotImplementedError

    def sample(self):
        """Collect and return pending samples; must be overridden."""
        raise NotImplementedError

    def close(self) -> None:
        """Release sampler resources; must be overridden."""
        raise NotImplementedError

View File

@@ -0,0 +1,88 @@
import time
from typing import Dict, List
from .base import BaseSamplerState, BaseSampler
from eacgm.bpf import BPFState, BccBPF
class eBPFSamplerState(BaseSamplerState):
    """Sampler state built from a raw eBPF trace record."""

    def __init__(self) -> None:
        super().__init__()

    @staticmethod
    def from_ebpfstate(other:BPFState) -> "eBPFSamplerState":
        """Copy the fields of a raw BPFState into a new sampler state.

        FIX: declared as @staticmethod -- it was a plain function in the class
        body, which only worked because it is always called on the class.
        """
        state = eBPFSamplerState()
        state.task = other.task
        state.pid = other.pid
        state.cpu = other.cpu
        state.timestamp = other.timestamp
        state.message = other.message
        return state

    def collect(self) -> Dict:
        """Convert this state into a Perfetto-style trace-event dict."""
        event = self.message[1]
        # Classify the event source from its name.
        if "cuda" in event:
            cat = "cuda"
        elif "Py" in event:
            cat = "python"
        elif "nccl" in event:
            cat = "nccl"
        elif "Torch" in event:
            cat = "torch"
        else:
            cat = "other"
        # message[0] is the phase marker written by the probes.
        ph = "B" if self.message[0] == "start" else "E"
        res = {
            "name": event,
            "cat": cat,
            "pid": self.pid,
            "tid": self.pid,
            "cpu": self.cpu,
            "ts": self.timestamp / 1_000,  # ns -> us
            "ph": ph,
            "message": self.message[2:],
        }
        return res

    def __repr__(self) -> str:
        return f"eBPFSamplerState {super().__repr__()}"
class eBPFSampler(BaseSampler):
    """Sampler that drains events produced by an attached BccBPF program."""

    def __init__(self, bpf:BccBPF) -> None:
        super().__init__(name="eBPFSampler")
        self.bpf = bpf

    def run(self, attach_config:List) -> None:
        """Attach entry/exit probes for every (path, symbol) pair in the config.

        Probes are expected to be named "<sym>Entry" / "<sym>Exit" in the
        compiled eBPF program.  Attach failures are reported and skipped so
        one missing symbol does not abort the rest.
        """
        # FIX: dropped the unused read of attach_info["name"].
        for attach_info in attach_config:
            for path in attach_info["exe_path"]:
                for sym in attach_info["exe_sym"]:
                    try:
                        self.bpf.attach_uprobe(path, sym, sym + "Entry")
                        self.bpf.attach_uretprobe(path, sym, sym + "Exit")
                    except Exception as err:
                        print(err)

    def sample(self, time_stamp:float) -> List[eBPFSamplerState]:
        """Poll the trace pipe for *time_stamp* seconds; return parsed states."""
        samples = []
        deadline = time.perf_counter() + time_stamp
        polling = True
        while polling:
            # One final read is still performed after the deadline passes.
            if time.perf_counter() > deadline:
                polling = False
            raw = self.bpf.trace_ebpf(True)
            if raw.is_none():
                continue
            samples.append(eBPFSamplerState.from_ebpfstate(raw))
        return samples

    def close(self) -> None:
        """Detach probes and free eBPF resources."""
        self.bpf.cleanup()

View File

@@ -0,0 +1,64 @@
import time
import pynvml
from typing import List
from .base import BaseSampler
class GPUSamplerState:
    """Device-level GPU metrics captured in one NVML query."""

    def __init__(self) -> None:
        # FIX: dropped the superfluous super().__init__() call -- this class
        # has no base class.
        self.gpu:int = None          # device index
        self.name:str = None         # device name
        self.sm:int = None           # GPU (SM) utilization percent
        self.totMem:int = None       # total memory, bytes
        self.usedMem:int = None      # used memory, bytes
        self.enc:int = None          # encoder utilization percent
        self.dec:int = None          # decoder utilization percent
        self.tmp:int = None          # temperature
        self.fan:int = None          # fan speed percent
        self.usedPower:float = None  # current power draw, watts
        self.totPower:float = None   # power management limit, watts

    def __repr__(self) -> str:
        info = f"GPUSamplerState {self.gpu} {self.name} {self.sm} {self.usedMem} {self.totMem} {self.enc} {self.dec} {self.tmp} {self.fan} {self.usedPower} {self.totPower}"
        return info
class GPUSampler(BaseSampler):
    """Device-level GPU monitor built on NVML (pynvml)."""

    def __init__(self) -> None:
        super().__init__(name="GPUSampler")
        pynvml.nvmlInit()
        self.deviceCount:int = pynvml.nvmlDeviceGetCount()
        self.nvDevices:List = [pynvml.nvmlDeviceGetHandleByIndex(idx) for idx in range(self.deviceCount)]

    def run(self) -> None:
        # Nothing to start: NVML handles are acquired in __init__.
        return

    def sample(self) -> List[GPUSamplerState]:
        """Query every GPU once and return one state per responsive device."""
        samples = []
        # FIX: iterate the handles directly instead of indexing by range.
        for gpu_handle in self.nvDevices:
            try:
                sample = GPUSamplerState()
                sample.gpu = pynvml.nvmlDeviceGetIndex(gpu_handle)
                sample.name = pynvml.nvmlDeviceGetName(gpu_handle)
                sample.sm = pynvml.nvmlDeviceGetUtilizationRates(gpu_handle).gpu
                mem_info = pynvml.nvmlDeviceGetMemoryInfo(gpu_handle)
                sample.totMem = mem_info.total
                sample.usedMem = mem_info.used
                sample.enc = pynvml.nvmlDeviceGetEncoderUtilization(gpu_handle)[0]
                sample.dec = pynvml.nvmlDeviceGetDecoderUtilization(gpu_handle)[0]
                sample.tmp = pynvml.nvmlDeviceGetTemperature(gpu_handle, pynvml.NVML_TEMPERATURE_GPU)
                sample.fan = pynvml.nvmlDeviceGetFanSpeed(gpu_handle)
                # NVML reports milliwatts; convert to watts.
                sample.usedPower = pynvml.nvmlDeviceGetPowerUsage(gpu_handle) / 1000.0
                sample.totPower = pynvml.nvmlDeviceGetPowerManagementLimit(gpu_handle) / 1000.0
                samples.append(sample)
            except pynvml.NVMLError as e:
                # A device may deny a query; report and keep sampling the rest.
                # FIX: removed the redundant `pass` after print().
                print(e)
        return samples

    def close(self) -> None:
        """Shut down the NVML session."""
        pynvml.nvmlShutdown()

View File

@@ -0,0 +1,57 @@
import time
import pynvml
from typing import List
from .base import BaseSamplerState, BaseSampler
class NVMLSamplerState(BaseSamplerState):
    """Per-process GPU utilization sample reported by NVML accounting."""

    def __init__(self) -> None:
        super().__init__()
        self.gpu:int = None  # device index the process ran on
        self.sm:int = None   # SM utilization, percent
        self.mem:int = None  # memory utilization, percent
        self.enc:int = None  # encoder utilization, percent
        self.dec:int = None  # decoder utilization, percent

    def __repr__(self) -> str:
        own = f"{self.gpu} {self.sm} {self.mem} {self.enc} {self.dec}"
        return f"NVMLSamplerState {own} {super().__repr__()}"
class NVMLSampler(BaseSampler):
    """Samples per-process GPU utilization via NVML process accounting."""

    def __init__(self) -> None:
        super().__init__(name="NVMLSampler")
        pynvml.nvmlInit()
        self.deviceCount:int = pynvml.nvmlDeviceGetCount()
        self.nvDevices:List = [pynvml.nvmlDeviceGetHandleByIndex(idx) for idx in range(self.deviceCount)]

    def run(self) -> None:
        """Nothing to launch; NVML is polled in ``sample``."""
        return

    def sample(self, time_stamp:float) -> List[NVMLSamplerState]:
        """Return per-process utilization samples from the last *time_stamp* seconds.

        Args:
            time_stamp: look-back window in seconds.
        """
        samples = []
        # NVML expects the cutoff as an integer microsecond timestamp; the
        # original expression produced a float for float time_stamp, which
        # the ctypes binding rejects. Cast once here.
        last_seen = int(time.time_ns() // 1000 - 1_000_000 * time_stamp)
        for gpu_idx in range(self.deviceCount):
            gpu_handle = self.nvDevices[gpu_idx]
            try:
                processes = pynvml.nvmlDeviceGetProcessUtilization(gpu_handle, last_seen)
                for process in processes:
                    state = NVMLSamplerState()
                    state.task = None
                    state.pid = process.pid
                    state.cpu = None
                    state.timestamp = process.timeStamp
                    state.message = None
                    state.gpu = gpu_idx
                    state.sm = process.smUtil
                    state.mem = process.memUtil
                    state.enc = process.encUtil
                    state.dec = process.decUtil
                    samples.append(state)
            except pynvml.NVMLError:
                # NVML raises (e.g. NOT_FOUND) when a device has no recent
                # process activity; treat that as "no samples" for this GPU.
                pass
        return samples

    def close(self) -> None:
        """Shut down the NVML session."""
        pynvml.nvmlShutdown()

3
eacgm/webui/__init__.py Normal file
View File

@@ -0,0 +1,3 @@
# Package-relative imports: bare "from reader import ..." only resolves when
# the package directory itself is on sys.path, and fails under normal
# "import eacgm.webui" usage.
from .reader import log_reader
from .connect import database
from .insert import push_log

19
eacgm/webui/connect.py Normal file
View File

@@ -0,0 +1,19 @@
# connect to mysql database
import mysql.connector
class database:
    """Thin wrapper around one MySQL connection that runs one statement at a time."""

    def __init__(self, ip, port, user, pwd, database) -> None:
        self.conn = mysql.connector.connect(
            host = ip,
            port = port,
            user = user,
            password = pwd,
            database = database
        )
        self.cursor = self.conn.cursor()

    def exec(self, cmd: str):
        """Execute *cmd*, commit, and return its rows (empty list for non-SELECT).

        NOTE(review): callers interpolate values directly into *cmd*;
        switch to parameterized queries for externally supplied values.
        """
        self.cursor.execute(cmd)
        # fetchall() on a statement that produces no result set (INSERT/
        # UPDATE/DDL) raises InterfaceError in mysql-connector; only fetch
        # when the cursor actually holds rows.
        result = self.cursor.fetchall() if self.cursor.with_rows else []
        self.conn.commit()
        return result

113
eacgm/webui/insert.py Normal file
View File

@@ -0,0 +1,113 @@
# insert data into mysql database
import argparse
from reader import log_reader
from reader import log_reader
from connect import database
import time
def get_col_num(db) -> int:
    """Number of columns currently in grafana.CudaEvent (time included)."""
    query = (
        "SELECT COUNT(*) FROM information_schema.COLUMNS "
        "where `TABLE_SCHEMA` = 'grafana' and `TABLE_NAME` = 'CudaEvent';"
    )
    return db.exec(query)[0][0]
def lts_cuda_event(db) -> list:
    """Return the event columns of the most recent CudaEvent row.

    When the table is empty, return a row-width list of ``None`` so the
    caller treats "no history" and "all slots free" identically.
    """
    rows = db.exec("SELECT * FROM grafana.`CudaEvent` ORDER BY time DESC LIMIT 1;")
    if rows:
        return list(rows[0][1:])
    return [None] * (get_col_num(db) - 1)
def lts_event_cnt(db) -> dict:
    """Map event name -> running start/end balance from grafana.events."""
    rows = db.exec("""SELECT * FROM grafana.events;""")
    return {name: cnt for name, cnt in rows}
def add_col(db):
    """Append one more eventN column to CudaEvent (N = current column count)."""
    next_idx = get_col_num(db)
    db.exec(f"""ALTER TABLE grafana.`CudaEvent` ADD COLUMN event{next_idx} CHAR(255)""")
def del_col(db, col_num):
    """Drop column event<col_num> from CudaEvent."""
    db.exec(f"""ALTER TABLE grafana.`CudaEvent` DROP COLUMN event{col_num};""")
def add_empty(max_time, db):
    """Insert an all-NULL row at *max_time* so timelines close their last span."""
    width = get_col_num(db)
    db.exec(f"""INSERT INTO grafana.`CudaEvent` VALUES ({max_time}, {','.join(['NULL'] * (width - 1))})""")
def push_log(db, log):
    """Replay parsed log events into grafana.CudaEvent and grafana.events.

    *log* is a list of {'time', 'op', 'name'} dicts.  Every running event
    occupies one eventN column of CudaEvent from its 'start' row until its
    'end' row; columns are allocated on demand (add_col) and reused once
    freed.  Rows are batched into one multi-row INSERT that is flushed
    before any schema change and once at the end.

    NOTE(review): event names are interpolated directly into SQL; safe
    only while names come from trusted log files.
    """
    max_time = 0
    ## latest cuda event
    cuda_event = lts_cuda_event(db)
    ## latest event cnt
    event_cnt = lts_event_cnt(db)
    # The bare INSERT prefix is 37 characters; len(cmd) > 37 therefore
    # means at least one pending row tuple has been appended.
    cmd = f"INSERT INTO grafana.CudaEvent VALUES "
    for line_idx, l in enumerate(log):
        if l['op'] == 'start':
            # Bump the per-name start/end balance.
            if l['name'] in event_cnt:
                event_cnt[l['name']] += 1
            else:
                event_cnt[l["name"]] = 1
            # Claim the first free column slot for this event.
            empty_col = False
            i = 0
            for e in cuda_event:
                if e is None:
                    cuda_event[i] = l['name']
                    empty_col = True
                    break
                i += 1
            if not empty_col:
                # No free slot: flush pending rows (they match the old,
                # narrower schema), then widen the table by one column.
                if len(cmd) > 37:
                    cmd = cmd[:-1] + ";"
                    # print(cmd)
                    # print('------')
                    db.exec(cmd)
                    cmd = f"INSERT INTO grafana.CudaEvent VALUES "
                add_col(db)
                cuda_event.append(l['name'])
        elif l['op'] == 'end':
            if l['name'] in event_cnt:
                if event_cnt[l["name"]] == 0:
                    # More 'end's than 'start's seen for this name.
                    print(f"[!]: in line {line_idx + 1}: event {l['name']} ended more than starting")
                    #raise ValueError(f"in line {line_idx + 1}: event {l['name']} ended more than starting")
                    continue
                event_cnt[l["name"]] -= 1
                # Free the most recently claimed slot holding this name.
                for i, e in enumerate(cuda_event[::-1]):
                    if e == l["name"]:
                        cuda_event[len(cuda_event)- 1 - i] = None
                        break
            if l["name"] not in event_cnt:
                # 'end' for a name that never started.
                print(f"[!]: in line {line_idx + 1}: event {l['name']} ended without starting")
                # raise ValueError(f"in line {line_idx + 1}: event {l['name']} ended without starting")
                continue
        else:
            raise ValueError(f"in line {line_idx + 1}: unknown operation {l['op']}")
        # Append one row snapshot: (time, current slot occupancy ...).
        tmp_cmd = f"({l['time']}, "
        max_time = max(max_time, float(l['time']))
        for e in cuda_event:
            if e is None:
                tmp_cmd += "NULL, "
            else:
                tmp_cmd += f"'{e}', "
        tmp_cmd = tmp_cmd[:-2] + "),"
        cmd += tmp_cmd
    # Flush the remaining batched rows, if any.
    if len(cmd) > 37:
        cmd = cmd[:-1] + ";"
        # print(cmd)
        # print("------")
        db.exec(cmd)
    # print(cuda_event)
    # print(event_cnt)
    add_empty(max_time,db)

13
eacgm/webui/reader.py Normal file
View File

@@ -0,0 +1,13 @@
def log_reader(path):
    """Parse a sampler log file into a list of {'time', 'op', 'name'} dicts.

    Each log line is space-separated; field 3 is the timestamp, field 5
    the operation ('start'/'end') and field 6 the event name.  Blank or
    malformed lines (fewer than 7 fields) are skipped instead of raising
    IndexError as the original did.
    """
    with open(path, 'r') as f:
        lines = f.readlines()
    ret = []
    for line in lines:
        fields = line.strip().split(' ')
        if len(fields) < 7:
            continue
        ret.append({'time': fields[3], 'op': fields[5], 'name': fields[6]})
    return ret

View File

@@ -0,0 +1,41 @@
version: '2.1'
services:
mysql:
build:
context: ./mysql
dockerfile: dockerfile
ports:
- "3306:3306"
volumes:
- ../volumes/mysql/data:/var/lib/mysql
environment:
- "MYSQL_ROOT_PASSWORD=adminpwd"
container_name: gf-mysql
networks:
- gf-network
grafana:
build:
context: ./grafana
dockerfile: dockerfile
container_name: gf-grafana
ports:
- "3000:3000"
environment:
- "GF_SECURITY_ADMIN_PASSWORD=admin"
depends_on:
- mysql
networks:
- gf-network
links:
- mysql
networks:
gf-network:
driver: bridge
ipam:
driver: default
config:
- subnet: 192.168.114.0/24
gateway: 192.168.114.254

View File

@@ -0,0 +1,2 @@
FROM grafana/grafana
COPY --chown=grafana:grafana grafana.db /var/lib/grafana/grafana.db

Binary file not shown.

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,5 @@
FROM mysql:8.0
COPY ./init.sql /docker-entrypoint-initdb.d/
EXPOSE 3306

View File

@@ -0,0 +1,62 @@
-- Development credentials used by the monitoring scripts.
-- NOTE(review): this password is also hard-coded in the Python clients;
-- rotate both together.
CREATE USER 'node1' @'%' IDENTIFIED BY 'mysql114514';
GRANT ALL PRIVILEGES ON *.* TO 'node1' @'%' WITH GRANT OPTION;
FLUSH PRIVILEGES;
-- grafana database
CREATE DATABASE IF NOT EXISTS grafana;
-- state timeline
-- CudaEvent rows snapshot which event occupies each column at a moment;
-- insert.py widens the table with extra eventN columns on demand.
CREATE TABLE IF NOT EXISTS grafana.CudaEvent (
    time DOUBLE,
    event1 CHAR(255),
    event2 CHAR(255),
    event3 CHAR(255)
);
-- Running start/end balance per event name.
CREATE TABLE IF NOT EXISTS grafana.events (
    name CHAR(255) PRIMARY KEY,
    cnt INT
);
-- top
-- Instant utilization gauges sampled by top.py.
CREATE Table IF NOT EXISTS grafana.gauge (
    TIME DATETIME,
    cpu DOUBLE,
    mem DOUBLE,
    gpu_load DOUBLE,
    gpu_mem DOUBLE
);
-- System RAM in MiB.
CREATE TABLE IF NOT EXISTS grafana.memory (
    TIME DATETIME,
    total DOUBLE,
    used DOUBLE
);
-- GPU memory totals as reported by GPUtil.
CREATE TABLE IF NOT EXISTS grafana.gpumem (
    TIME DATETIME,
    total DOUBLE,
    used DOUBLE
);
-- Disk throughput in MiB/s.
CREATE TABLE IF NOT EXISTS grafana.diskio (
    TIME DATETIME,
    read_rate DOUBLE,
    write_rate DOUBLE
);
-- Network throughput in MiB/s.
CREATE TABLE IF NOT EXISTS grafana.netio (
    TIME DATETIME,
    send_rate DOUBLE,
    recv_rate DOUBLE
);
-- ollamanet
-- Per-interval in/out rates written by tailf.py.
CREATE TABLE IF NOT EXISTS grafana.ollamanet (
    time DATETIME,
    request DOUBLE,
    token DOUBLE
);
-- Cumulative packet counts per remote endpoint.
CREATE TABLE IF NOT EXISTS grafana.ipport (
    ipport CHAR(255) PRIMARY KEY,
    cnt INT
);

9
grafana/launch.sh Normal file
View File

@@ -0,0 +1,9 @@
# Install host dependencies, then bring up the MySQL + Grafana stack.
sudo apt install tcpdump make
pip install mysql-connector-python psutil GPUtil
echo -e "\x1b[32m[+] Successfully installed required packages\x1b[0m"
# Start the containers defined under compose/ (gf-mysql, gf-grafana).
cd compose && docker-compose up -d && cd ..
echo -e "\x1b[32m[+] Successfully launched docker containers gf-mysql and gf-grafana\x1b[0m"
echo -e "\x1b[32m[+] grafana is now available at http://127.0.0.1:3000 \x1b[0m"
echo -e "\x1b[32m[+] default username: admin, password: admin \x1b[0m"

8
grafana/service.sh Normal file
View File

@@ -0,0 +1,8 @@
# Start the background collectors: the ollama network listener (via its
# Makefile) and the top.py system poller.
cd src/ollamanet && make run && cd ../..
echo -e "\x1b[32m[+] Successfully launched ollamanet\x1b[0m"
cd src/top
# Output goes to src/top/log/top.log (path is relative to src/top).
nohup python top.py > log/top.log 2>&1 &
cd ../..
echo -e "\x1b[32m[+] Successfully launched top\x1b[0m"

View File

@@ -0,0 +1,29 @@
# connect to mysql database
import mysql.connector
class database:
    """Thin wrapper around one MySQL connection that runs one statement at a time."""

    def __init__(self, ip, port, user, pwd, database) -> None:
        self.conn = mysql.connector.connect(
            host = ip,
            port = port,
            user = user,
            password = pwd,
            database = database
        )
        self.cursor = self.conn.cursor()

    def exec(self, cmd: str):
        """Execute *cmd*, commit, and return its rows (empty list for non-SELECT).

        NOTE(review): callers interpolate values directly into *cmd*;
        switch to parameterized queries for externally supplied values.
        """
        self.cursor.execute(cmd)
        # fetchall() on a statement that produces no result set (INSERT/
        # UPDATE/DDL) raises InterfaceError in mysql-connector; only fetch
        # when the cursor actually holds rows.
        result = self.cursor.fetchall() if self.cursor.with_rows else []
        self.conn.commit()
        return result
if __name__ == '__main__':
    # Manual smoke test: connect with the development credentials
    # provisioned by the MySQL init.sql script.
    db = database(
        ip="127.0.0.1",
        port=3306,
        user="node1",
        pwd="mysql114514",
        database="grafana",
    )

View File

@@ -0,0 +1,13 @@
-- Reset the ollama-traffic tables; all collected rates and peer counts are lost.
DROP Table IF EXISTS grafana.ollamanet;
DROP TABLE if EXISTS grafana.ipport;
-- Per-interval in/out rates written by tailf.py.
CREATE TABLE IF NOT EXISTS grafana.ollamanet
(
    time DATETIME,
    request DOUBLE,
    token DOUBLE
);
-- Cumulative packet counts per remote endpoint.
CREATE TABLE IF NOT EXISTS grafana.ipport
(
    ipport CHAR(255) PRIMARY KEY,
    cnt INT
);

View File

@@ -0,0 +1,37 @@
#!/bin/bash
# Capture ollama traffic (port 11434) and append 5-second summaries to
# trace.txt: each block is "<epoch> <packets>", "<in> <out>", one
# "<addr> <count>" line per endpoint, then a "---" separator.
sudo tcpdump -i any port 11434 -n -l | awk '
BEGIN {
    start_time = systime()
    packets = 0
    inp = 0
    out = 0
}
{
    # Field 3 of this tcpdump output is the packet direction (In/Out).
    if ($3 == "In"){
        inp++
    }else{
        out++
    }
    packets++
    current_time = systime()
    # Count both endpoints: source ($5) and destination ($7 minus its
    # trailing punctuation character).
    table[$5]++
    dest = substr($7, 1, length($7) - 1)
    table[dest]++
    if (current_time - start_time >= 5) {
        start_time = current_time
        timestamp = strftime("%Y-%m-%d %H:%M:%S", current_time)
        print current_time, packets >> "trace.txt"
        print inp, out >> "trace.txt"
        for (i in table) {
            print i, table[i] >> "trace.txt"
        }
        print "---" >> "trace.txt"
        fflush("trace.txt")
        # Reset the window counters for the next interval.
        packets = 0
        inp=0
        out=0
        delete table
    }
}'

View File

@@ -0,0 +1,3 @@
# Launch the tcpdump listener and the MySQL tail follower in the background.
run:
	nohup ./listen.sh > log/listen.log 2>&1 &
	nohup python tailf.py > log/tailf.log 2>&1 &

View File

@@ -0,0 +1,85 @@
import time
import os
import argparse
from connect import database
# Flush interval in seconds, and the newest block timestamp seen so far;
# both are rebound via ``global`` in main() / tail_f().
interval = 5
max_time = 0
def tail_f(args, db, filename):
    """Yield new lines appended to *filename*, like ``tail -f``.

    While idle for more than ``interval`` seconds, insert a zero-rate row
    into <database>.ollamanet so the dashboard shows explicit gaps.
    """
    with open(filename, 'r') as file:
        # Seek to the end of the file; only newly appended content is read.
        file.seek(0, 2)
        global max_time
        while True:
            # Read the next line, if any.
            line = file.readline()
            if not line:
                time.sleep(1)  # no new line: pause one second, then retry
                ts = int(time.time())
                if ts - max_time > interval:
                    db.exec(f"""INSERT INTO {args.database}.ollamanet VALUES (NOW(), 0, 0)""")
                    max_time = ts
                continue
            yield line
def main(db:database, args):
    """Follow the tcpdump summary file and push rates into MySQL.

    Blocks of lines terminated by "---" are parsed: line 0 is
    "<timestamp> <packets>", line 1 is "<in> <out>", the remaining lines
    are "<ip.port> <count>" pairs accumulated into <database>.ipport.
    """
    global interval, max_time
    log_file = args.file
    interval = args.interval
    if not os.path.exists(log_file):
        os.system(f"touch {log_file}")
    buf = []
    for line in tail_f(args, db, log_file):
        line = line.strip()
        if line.strip() == "---":
            # End of one block: buf[0] = "<ts> <pkts>", buf[1] = "<in> <out>".
            l0 = buf[0].split(' ')
            ts = int(l0[0])
            max_time = max(max_time, ts)
            cnt = int(l0[1]) / interval
            l1 = buf[1].split(' ')
            recv = int(l1[0]) / interval
            send = int(l1[1]) / interval
            # print(f"{ts} {cnt} {recv} {send}")
            # print(buf)
            db.exec(f"""INSERT INTO {args.database}.ollamanet VALUES (NOW(), {recv}, {send});""")
            # NOTE(review): the last buffered line is skipped by this bound
            # (i < len(buf) - 1) — confirm that is intentional.
            i = 2
            while i < len(buf) - 1:
                l = buf[i].split(' ')
                ipport = l[0]
                # Drop the trailing ".port" component, keeping the IP part.
                ipport = ipport[:ipport.rfind('.')]
                i += 1
                if ipport == args.local:
                    continue
                cnt = int(l[1])
                all = db.exec(f"""SELECT cnt from {args.database}.ipport where ipport='{ipport}';""")
                if not all:
                    all = cnt
                    db.exec(f"""INSERT INTO {args.database}.ipport VALUES ('{ipport}', {cnt});""")
                else:
                    all = all[0][0]
                    all += cnt
                    db.exec(f"""UPDATE {args.database}.ipport SET cnt={all} where ipport='{ipport}';""")
            buf = []
            continue
        buf.append(line)
if __name__ == "__main__":
    # CLI entry point; defaults match the docker-compose MySQL service.
    parser = argparse.ArgumentParser()
    parser.add_argument('--file', type=str, default='trace.txt', help='log file')
    parser.add_argument('--interval', type=int, default=5, help='interval (s)')
    parser.add_argument('--ip', type=str, default='127.0.0.1', help='ip')
    parser.add_argument('--port', type=int, default=3306, help='port')
    parser.add_argument('--user', type=str, default='node1', help='user')
    parser.add_argument('--password', type=str, default='mysql114514', help='password')
    parser.add_argument("--database", type=str, default="grafana", help="database")
    # Local listening endpoint ("ip.port") excluded from the peer table.
    parser.add_argument("--local", type=str, default="127.0.0.1.11434")
    args = parser.parse_args()
    db = database(args.ip, args.port, args.user, args.password, args.database)
    main(db, args)

View File

@@ -0,0 +1,60 @@
1722135153 93
34 59
100.77.22.47.18099 6
100.77.22.47.18188 87
100.82.183.119.11434 93
---
1722135153 93
34 59
100.77.22.47.18099 1
100.77.22.47.18188 92
100.82.183.119.11434 93
---
1722135155 93
33 60
100.77.22.47.18188 93
100.82.183.119.11434 93
---
1722135155 1
1 0
100.82.183.119.11434 1
100.77.22.47.17946 1
---
1722135230 62
23 39
100.82.183.119.11434 62
100.77.22.47.17946 62
---
1722135230 93
36 57
100.77.22.47.18188 19
100.82.183.119.11434 93
100.77.22.47.17946 74
---
1722135230 93
35 58
100.77.22.47.18188 14
100.82.183.119.11434 93
100.77.22.47.17946 79
---
1722135231 94
35 59
100.77.22.47.18188 12
100.82.183.119.11434 94
100.77.22.47.17946 82
---
1722137005 1
1 0
100.82.183.119.11434 1
100.77.22.47.8112 1
---
1722137235 126
44 82
100.82.183.119.11434 126
100.77.22.47.8112 126
---
1722137236 1
1 0
100.82.183.119.11434 1
100.77.22.47.55880 1
---

View File

@@ -0,0 +1,29 @@
# connect to mysql database
import mysql.connector
class database:
    """Thin wrapper around one MySQL connection that runs one statement at a time."""

    def __init__(self, ip, port, user, pwd, database) -> None:
        self.conn = mysql.connector.connect(
            host = ip,
            port = port,
            user = user,
            password = pwd,
            database = database
        )
        self.cursor = self.conn.cursor()

    def exec(self, cmd: str):
        """Execute *cmd*, commit, and return its rows (empty list for non-SELECT).

        NOTE(review): callers interpolate values directly into *cmd*;
        switch to parameterized queries for externally supplied values.
        """
        self.cursor.execute(cmd)
        # fetchall() on a statement that produces no result set (INSERT/
        # UPDATE/DDL) raises InterfaceError in mysql-connector; only fetch
        # when the cursor actually holds rows.
        result = self.cursor.fetchall() if self.cursor.with_rows else []
        self.conn.commit()
        return result
if __name__ == '__main__':
    # Manual smoke test: connect with the development credentials
    # provisioned by the MySQL init.sql script.
    db = database(
        ip="127.0.0.1",
        port=3306,
        user="node1",
        pwd="mysql114514",
        database="grafana",
    )

View File

@@ -0,0 +1,14 @@
-- WARNING: database grafana will be cleared if exists
DROP DATABASE IF EXISTS grafana;
CREATE DATABASE IF NOT EXISTS grafana;
drop TABLE if EXISTS grafana.CudaEvent;
-- Event-timeline table; insert.py widens it with extra eventN columns on demand.
CREATE TABLE IF NOT EXISTS grafana.CudaEvent(
    time DOUBLE,
    event1 CHAR(255),
    event2 CHAR(255),
    event3 CHAR(255)
);
-- Running start/end balance per event name.
CREATE TABLE IF NOT EXISTS grafana.events(
    name CHAR(255) PRIMARY KEY,
    cnt INT
);

View File

@@ -0,0 +1,120 @@
# insert data into mysql database
import argparse
from log_reader import reader, ollama_reader
from log_reader import reader
from connect import database
import time
def get_col_num(db) -> int:
    """Number of columns currently in grafana.CudaEvent (time included)."""
    query = (
        "SELECT COUNT(*) FROM information_schema.COLUMNS "
        "where `TABLE_SCHEMA` = 'grafana' and `TABLE_NAME` = 'CudaEvent';"
    )
    return db.exec(query)[0][0]
def lts_cuda_event(db) -> list:
    """Return the event columns of the most recent CudaEvent row.

    When the table is empty, return a row-width list of ``None`` so the
    caller treats "no history" and "all slots free" identically.
    """
    rows = db.exec("SELECT * FROM grafana.`CudaEvent` ORDER BY time DESC LIMIT 1;")
    if rows:
        return list(rows[0][1:])
    return [None] * (get_col_num(db) - 1)
def lts_event_cnt(db) -> dict:
    """Map event name -> running start/end balance from grafana.events."""
    rows = db.exec("""SELECT * FROM grafana.events;""")
    return {name: cnt for name, cnt in rows}
def add_col(db):
    """Append one more eventN column to CudaEvent (N = current column count)."""
    next_idx = get_col_num(db)
    db.exec(f"""ALTER TABLE grafana.`CudaEvent` ADD COLUMN event{next_idx} CHAR(255)""")
def del_col(db, col_num):
    """Drop column event<col_num> from CudaEvent."""
    db.exec(f"""ALTER TABLE grafana.`CudaEvent` DROP COLUMN event{col_num};""")
def add_empty(max_time, db):
    """Insert an all-NULL row at *max_time* so timelines close their last span."""
    width = get_col_num(db)
    db.exec(f"""INSERT INTO grafana.`CudaEvent` VALUES ({max_time}, {','.join(['NULL'] * (width - 1))})""")
def push_log(db, log):
    """Replay parsed log events into grafana.CudaEvent and grafana.events.

    *log* is a list of {'time', 'op', 'name'} dicts.  Every running event
    occupies one eventN column of CudaEvent from its 'start' row until its
    'end' row; columns are allocated on demand (add_col) and reused once
    freed.  Rows are batched into one multi-row INSERT that is flushed
    before any schema change and once at the end.

    NOTE(review): event names are interpolated directly into SQL; safe
    only while names come from trusted log files.
    """
    max_time = 0
    ## latest cuda event
    cuda_event = lts_cuda_event(db)
    ## latest event cnt
    event_cnt = lts_event_cnt(db)
    # The bare INSERT prefix is 37 characters; len(cmd) > 37 therefore
    # means at least one pending row tuple has been appended.
    cmd = f"INSERT INTO grafana.CudaEvent VALUES "
    for line_idx, l in enumerate(log):
        if l['op'] == 'start':
            # Bump the per-name start/end balance.
            if l['name'] in event_cnt:
                event_cnt[l['name']] += 1
            else:
                event_cnt[l["name"]] = 1
            # Claim the first free column slot for this event.
            empty_col = False
            i = 0
            for e in cuda_event:
                if e is None:
                    cuda_event[i] = l['name']
                    empty_col = True
                    break
                i += 1
            if not empty_col:
                # No free slot: flush pending rows (they match the old,
                # narrower schema), then widen the table by one column.
                if len(cmd) > 37:
                    cmd = cmd[:-1] + ";"
                    # print(cmd)
                    # print('------')
                    db.exec(cmd)
                    cmd = f"INSERT INTO grafana.CudaEvent VALUES "
                add_col(db)
                cuda_event.append(l['name'])
        elif l['op'] == 'end':
            if l['name'] in event_cnt:
                if event_cnt[l["name"]] == 0:
                    # More 'end's than 'start's seen for this name.
                    print(f"[!]: in line {line_idx + 1}: event {l['name']} ended more than starting")
                    #raise ValueError(f"in line {line_idx + 1}: event {l['name']} ended more than starting")
                    continue
                event_cnt[l["name"]] -= 1
                # Free the most recently claimed slot holding this name.
                for i, e in enumerate(cuda_event[::-1]):
                    if e == l["name"]:
                        cuda_event[len(cuda_event)- 1 - i] = None
                        break
            if l["name"] not in event_cnt:
                # 'end' for a name that never started.
                print(f"[!]: in line {line_idx + 1}: event {l['name']} ended without starting")
                # raise ValueError(f"in line {line_idx + 1}: event {l['name']} ended without starting")
                continue
        else:
            raise ValueError(f"in line {line_idx + 1}: unknown operation {l['op']}")
        # Append one row snapshot: (time, current slot occupancy ...).
        tmp_cmd = f"({l['time']}, "
        max_time = max(max_time, float(l['time']))
        for e in cuda_event:
            if e is None:
                tmp_cmd += "NULL, "
            else:
                tmp_cmd += f"'{e}', "
        tmp_cmd = tmp_cmd[:-2] + "),"
        cmd += tmp_cmd
    # Flush the remaining batched rows, if any.
    if len(cmd) > 37:
        cmd = cmd[:-1] + ";"
        # print(cmd)
        # print("------")
        db.exec(cmd)
    # print(cuda_event)
    # print(event_cnt)
    add_empty(max_time,db)
def main(ip:str="127.0.0.1", port:int=3306, user:str="node1", pwd:str="mysql114514", data_base:str="grafana", table:str="CudaEvent", log_file:str="log/transformer.log"):
    """Parse *log_file* and push its events into MySQL.

    NOTE(review): ``table`` is accepted but unused; push_log always
    targets grafana.CudaEvent.
    """
    log = reader(log_file)
    db = database(ip, port, user, pwd, data_base)
    push_log(db, log)


if __name__ == '__main__':
    main()

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,31 @@
def reader(path):
    """Parse a sampler log file into a list of {'time', 'op', 'name'} dicts.

    Each log line is space-separated; field 3 is the timestamp, field 5
    the operation ('start'/'end') and field 6 the event name.  Blank or
    malformed lines (fewer than 7 fields) are skipped instead of raising
    IndexError as the original did.
    """
    with open(path, 'r') as f:
        lines = f.readlines()
    ret = []
    for line in lines:
        fields = line.strip().split(' ')
        if len(fields) < 7:
            continue
        ret.append({'time': fields[3], 'op': fields[5], 'name': fields[6]})
    return ret
def ollama_reader(path):
    """Parse an ollama trace log into a list of {'time', 'op', 'name'} dicts.

    Field 0 is the timestamp, field 2 the phase marker ('B' begins an
    event, anything else ends it) and field 3 the event name.  Blank or
    malformed lines (fewer than 4 fields) are skipped instead of raising
    IndexError as the original did.
    """
    with open(path, 'r') as f:
        lines = f.readlines()
    ret = []
    for line in lines:
        fields = line.strip().split(' ')
        if len(fields) < 4:
            continue
        ret.append({
            'time': fields[0],
            'op': "start" if fields[2] == "B" else "end",
            'name': fields[3],
        })
    return ret
if __name__ == '__main__':
    # Manual smoke test against a local sample log.
    data = reader('log/transformer.log')
    print(data)

View File

@@ -0,0 +1,29 @@
# connect to mysql database
import mysql.connector
class database:
    """Thin wrapper around one MySQL connection that runs one statement at a time."""

    def __init__(self, ip, port, user, pwd, database) -> None:
        self.conn = mysql.connector.connect(
            host = ip,
            port = port,
            user = user,
            password = pwd,
            database = database
        )
        self.cursor = self.conn.cursor()

    def exec(self, cmd: str):
        """Execute *cmd*, commit, and return its rows (empty list for non-SELECT).

        NOTE(review): callers interpolate values directly into *cmd*;
        switch to parameterized queries for externally supplied values.
        """
        self.cursor.execute(cmd)
        # fetchall() on a statement that produces no result set (INSERT/
        # UPDATE/DDL) raises InterfaceError in mysql-connector; only fetch
        # when the cursor actually holds rows.
        result = self.cursor.fetchall() if self.cursor.with_rows else []
        self.conn.commit()
        return result
if __name__ == '__main__':
    # Manual smoke test: connect with the development credentials
    # provisioned by the MySQL init.sql script.
    db = database(
        ip="127.0.0.1",
        port=3306,
        user="node1",
        pwd="mysql114514",
        database="grafana",
    )

View File

@@ -0,0 +1,32 @@
-- Reset the system-metric tables written by top.py; history is lost.
DROP TABLE IF EXISTS grafana.gauge;
DROP TABLE IF EXISTS grafana.memory;
DROP TABLE IF EXISTS grafana.gpumem;
DROP TABLE IF EXISTS grafana.diskio;
DROP TABLE IF EXISTS grafana.netio;
-- Instant utilization gauges (percentages / load factors).
CREATE Table IF NOT EXISTS grafana.gauge(
    TIME DATETIME,
    cpu DOUBLE,
    mem DOUBLE,
    gpu_load DOUBLE,
    gpu_mem DOUBLE
);
-- System RAM in MiB (see top.py get_mem_total/get_mem_used).
CREATE TABLE IF NOT EXISTS grafana.memory(
    TIME DATETIME,
    total DOUBLE,
    used DOUBLE
);
-- GPU memory totals as reported by GPUtil.
CREATE TABLE IF NOT EXISTS grafana.gpumem(
    TIME DATETIME,
    total DOUBLE,
    used DOUBLE
);
-- Disk throughput in MiB/s.
CREATE TABLE IF NOT EXISTS grafana.diskio(
    TIME DATETIME,
    read_rate DOUBLE,
    write_rate DOUBLE
);
-- Network throughput in MiB/s.
CREATE TABLE IF NOT EXISTS grafana.netio(
    TIME DATETIME,
    send_rate DOUBLE,
    recv_rate DOUBLE
);

96
grafana/src/top/top.py Normal file
View File

@@ -0,0 +1,96 @@
import psutil
from connect import database
import GPUtil
from time import sleep
from time import time
def avg(lst):
    """Arithmetic mean of a non-empty sequence."""
    total = sum(lst)
    return total / len(lst)
# print(f"{avg(psutil.cpu_percent(interval=0.5, percpu=True))}%")
def get_cpu_percent():
    """Average CPU utilization (percent) across all cores over 0.5 s."""
    per_core = psutil.cpu_percent(interval=0.5, percpu=True)
    return avg(per_core)
def get_mem_percent():
    """System RAM utilization, percent."""
    return psutil.virtual_memory().percent


def get_mem_total():
    """Total system RAM in MiB."""
    return psutil.virtual_memory().total / (1024 * 1024)


def get_mem_used():
    """Used system RAM in MiB."""
    return psutil.virtual_memory().used / (1024 * 1024)
# Baselines for the rate computation: counters and wall-clock at the last call.
disk_io_start = psutil.disk_io_counters()
last_time = time()


def get_disk_io_rate():
    """Bytes/sec (read, write) since the previous call (or module import)."""
    global disk_io_start, last_time
    counters = psutil.disk_io_counters()
    now = time()
    elapsed = now - last_time
    read_rate = (counters.read_bytes - disk_io_start.read_bytes) / elapsed
    write_rate = (counters.write_bytes - disk_io_start.write_bytes) / elapsed
    disk_io_start = counters
    last_time = now
    return read_rate, write_rate
# Baselines for the network-rate computation.
net_io_start = psutil.net_io_counters()
last_time_net = time()


def get_network_traffic():
    """Bytes/sec (sent, received) since the previous call (or module import)."""
    global net_io_start, last_time_net
    counters = psutil.net_io_counters()
    now = time()
    elapsed = now - last_time_net
    send_rate = (counters.bytes_sent - net_io_start.bytes_sent) / elapsed
    recv_rate = (counters.bytes_recv - net_io_start.bytes_recv) / elapsed
    net_io_start = counters
    last_time_net = now
    return send_rate, recv_rate
def get_gpu():
    """
    Returns: gpu load, gpu memory percentage, gpu memory used, gpu memory total, gpu temperature

    Always returns a 5-tuple.  The original returned only two values when
    no GPU was present, which crashed the caller's 5-way unpack in main().
    """
    GPUs = GPUtil.getGPUs()
    if len(GPUs) == 0:
        return 0, 0, 0, 0, 0
    return GPUs[0].load, GPUs[0].memoryUtil, GPUs[0].memoryUsed, GPUs[0].memoryTotal, GPUs[0].temperature
def main(ip:str="127.0.0.1", port:int=3306, user:str="node1", pwd:str="mysql114514", data_base:str="grafana", log_file:str="log/transformer.log", flush:int=10):
    """Poll CPU/RAM/GPU/disk/network metrics forever, inserting into MySQL.

    One gauge/memory/gpumem row is written per cycle; after sleeping
    *flush* seconds the disk and network rates (measured over that sleep)
    are written.  NOTE(review): ``log_file`` is accepted but unused here.
    """
    db = database(
        ip=ip,
        port=port,
        user=user,
        pwd=pwd,
        database=data_base,
    )
    while True:
        cpu_percent = get_cpu_percent()
        mem_percent = get_mem_percent()
        gpu_load, gpu_mem_percent, gpu_mem_used, gpu_mem_total, gpu_temp = get_gpu()
        db.exec(
            f"""INSERT INTO {data_base}.gauge (time, cpu, mem, gpu_load, gpu_mem) VALUES (NOW(), {cpu_percent}, {mem_percent}, {gpu_load}, {gpu_mem_percent});"""
        )
        db.exec(
            f"""INSERT INTO {data_base}.memory (time, total, used) VALUES (NOW(), {get_mem_total()}, {get_mem_used()});"""
        )
        db.exec(
            f"""INSERT INTO {data_base}.gpumem (time, total, used) VALUES (NOW(), {gpu_mem_total}, {gpu_mem_used});"""
        )
        # Sleep first so the IO rates below cover the full flush window.
        sleep(flush)
        read_rate, write_rate = get_disk_io_rate()
        db.exec(
            f"""INSERT INTO {data_base}.diskio (time, read_rate, write_rate) VALUES (NOW(), {read_rate / 1024/1024}, {write_rate / 1024/1024});"""
        )
        send_rate, recv_rate = get_network_traffic()
        db.exec(
            f"""INSERT INTO {data_base}.netio (time, send_rate, recv_rate) VALUES (NOW(), {send_rate / 1024/1024}, {recv_rate / 1024/1024});"""
        )


if __name__ == "__main__":
    main()

45
grafana/stop.sh Normal file
View File

@@ -0,0 +1,45 @@
# Stop the Grafana/MySQL containers and the background monitoring processes.
cd compose
docker-compose stop
echo -e "\x1b[32m[+]Successfully stopped docker containers gf-mysql and gf-grafana\x1b[0m"

# Kill every process whose command line matches $1.
# (The original repeated this find/kill sequence three times verbatim.)
kill_by_name() {
    PROCESS_NAME="$1"
    PIDS=$(ps aux | grep $PROCESS_NAME | grep -v grep | awk '{print $2}')
    # Report when no matching process was found.
    if [ -z "$PIDS" ]; then
        echo "Unable to find $PROCESS_NAME"
    fi
    # Kill the processes that were found.
    for PID in $PIDS; do
        sudo kill $PID
        echo "Killed process $PID#$PROCESS_NAME"
    done
}

kill_by_name "tailf.py"
kill_by_name "listen.sh"
kill_by_name "top.py"

12
pyproject.toml Normal file
View File

@@ -0,0 +1,12 @@
[project]
name = "eACGM"
version = "0.1.0"
description = "eACGM: An eBPF-based Automated Comprehensive Governance and Monitoring framework for AI/ML systems. (IWQoS 2025)"
readme = "README.md"
requires-python = ">=3.12"
dependencies = [
"bcc>=0.1.10",
"pymysql>=1.1.1",
"pynvml>=12.0.0",
"tqdm>=4.67.1",
]