init eACGM

This commit is contained in:
Tokisakix
2025-08-07 10:14:54 +08:00
commit 7a4a0b1b14
51 changed files with 11495 additions and 0 deletions

View File

@@ -0,0 +1,41 @@
# Compose stack for the monitoring dashboard: a MySQL store plus Grafana.
version: '2.1'

services:
  mysql:
    build:
      context: ./mysql
      dockerfile: dockerfile
    container_name: gf-mysql
    ports:
      - "3306:3306"
    volumes:
      # Keep the MySQL data directory on the host so it survives container
      # re-creation.
      - ../volumes/mysql/data:/var/lib/mysql
    environment:
      # Export MYSQL_ROOT_PASSWORD (or set it in a .env file) to override;
      # the default preserves the previously hard-coded value.
      - "MYSQL_ROOT_PASSWORD=${MYSQL_ROOT_PASSWORD:-adminpwd}"
    networks:
      - gf-network

  grafana:
    build:
      context: ./grafana
      dockerfile: dockerfile
    container_name: gf-grafana
    ports:
      - "3000:3000"
    environment:
      # Same override mechanism as above; the default stays "admin".
      - "GF_SECURITY_ADMIN_PASSWORD=${GF_SECURITY_ADMIN_PASSWORD:-admin}"
    depends_on:
      - mysql
    # No `links:` entry: both services sit on gf-network, where the service
    # name "mysql" already resolves through the network's built-in DNS.
    networks:
      - gf-network

networks:
  gf-network:
    driver: bridge
    ipam:
      driver: default
      config:
        - subnet: 192.168.114.0/24
          gateway: 192.168.114.254

View File

@@ -0,0 +1,2 @@
# Grafana image that ships a pre-built Grafana database file.
#
# GRAFANA_VERSION defaults to "latest", which is what the previously untagged
# FROM resolved to. Pass --build-arg GRAFANA_VERSION=<tag> for a reproducible
# build pinned to a known release.
ARG GRAFANA_VERSION=latest
FROM grafana/grafana:${GRAFANA_VERSION}

# Install the bundled database with the ownership of the grafana user so the
# server process can open and update it.
COPY --chown=grafana:grafana grafana.db /var/lib/grafana/grafana.db

Binary file not shown.

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,5 @@
# MySQL 8.0 image that carries the schema bootstrap script.
FROM mysql:8.0
# NOTE(review): the official mysql image runs scripts found in
# /docker-entrypoint-initdb.d/ only when the data directory is first
# initialised - confirm against the image documentation.
COPY ./init.sql /docker-entrypoint-initdb.d/
# Documents the listening port; publishing it is done by the compose file.
EXPOSE 3306

View File

@@ -0,0 +1,62 @@
-- Application account used by the collector scripts.
CREATE USER 'node1'@'%' IDENTIFIED BY 'mysql114514';
GRANT ALL PRIVILEGES ON *.* TO 'node1'@'%' WITH GRANT OPTION;
FLUSH PRIVILEGES;

-- grafana database
CREATE DATABASE IF NOT EXISTS grafana;

-- state timeline: one row per timestamp, one column per concurrent event slot
CREATE TABLE IF NOT EXISTS grafana.CudaEvent (
    time   DOUBLE,
    event1 CHAR(255),
    event2 CHAR(255),
    event3 CHAR(255)
);

CREATE TABLE IF NOT EXISTS grafana.events (
    name CHAR(255) PRIMARY KEY,
    cnt  INT
);

-- top: host resource samples
CREATE TABLE IF NOT EXISTS grafana.gauge (
    TIME     DATETIME,
    cpu      DOUBLE,
    mem      DOUBLE,
    gpu_load DOUBLE,
    gpu_mem  DOUBLE
);

CREATE TABLE IF NOT EXISTS grafana.memory (
    TIME  DATETIME,
    total DOUBLE,
    used  DOUBLE
);

CREATE TABLE IF NOT EXISTS grafana.gpumem (
    TIME  DATETIME,
    total DOUBLE,
    used  DOUBLE
);

CREATE TABLE IF NOT EXISTS grafana.diskio (
    TIME       DATETIME,
    read_rate  DOUBLE,
    write_rate DOUBLE
);

CREATE TABLE IF NOT EXISTS grafana.netio (
    TIME      DATETIME,
    send_rate DOUBLE,
    recv_rate DOUBLE
);

-- ollamanet: traffic samples and per-address packet totals
CREATE TABLE IF NOT EXISTS grafana.ollamanet (
    time    DATETIME,
    request DOUBLE,
    token   DOUBLE
);

CREATE TABLE IF NOT EXISTS grafana.ipport (
    ipport CHAR(255) PRIMARY KEY,
    cnt    INT
);

9
grafana/launch.sh Normal file
View File

@@ -0,0 +1,9 @@
#!/bin/bash
# Install the host-side prerequisites and start the MySQL + Grafana
# containers. Run from the directory that contains ./compose.
#
# Abort on the first failing command so the success messages below are only
# printed when the preceding step really succeeded.
set -eu

# apt-get with -y: the scripting front end, and no confirmation prompt that
# would block an unattended run.
sudo apt-get install -y tcpdump make
pip install mysql-connector-python psutil GPUtil
echo -e "\x1b[32m[+] Successfully installed required packages\x1b[0m"

# Subshell so the caller's working directory is never left inside compose/.
(cd compose && docker-compose up -d)
echo -e "\x1b[32m[+] Successfully launched docker containers gf-mysql and gf-grafana\x1b[0m"
echo -e "\x1b[32m[+] grafana is now available at http://127.0.0.1:3000 \x1b[0m"
echo -e "\x1b[32m[+] default username: admin, password: admin \x1b[0m"

8
grafana/service.sh Normal file
View File

@@ -0,0 +1,8 @@
#!/bin/bash
# Start the background collectors: the ollamanet traffic listener and the
# host resource sampler (top.py). Run from the directory that contains src/.
#
# Abort on the first failure so a collector that did not start is not
# reported as launched.
set -eu

# Each collector is started from a subshell: a failure can no longer leave
# the script in the wrong directory, and log/ is created before any output
# is redirected into it.
(
    cd src/ollamanet
    mkdir -p log
    make run
)
echo -e "\x1b[32m[+] Successfully launched ollamanet\x1b[0m"

(
    cd src/top
    mkdir -p log
    nohup python top.py > log/top.log 2>&1 &
)
echo -e "\x1b[32m[+] Successfully launched top\x1b[0m"

View File

@@ -0,0 +1,29 @@
# connect to mysql database
import mysql.connector
class database:
    """Thin wrapper around one mysql.connector connection and its cursor."""

    def __init__(self, ip, port, user, pwd, database) -> None:
        """Open the connection and create the cursor used by exec().

        Args:
            ip: host name or address of the MySQL server.
            port: TCP port of the MySQL server.
            user: account name.
            pwd: account password.
            database: default schema for the session.
        """
        self.conn = mysql.connector.connect(
            host=ip,
            port=port,
            user=user,
            password=pwd,
            database=database,
        )
        self.cursor = self.conn.cursor()

    def exec(self, cmd: str, params=None):
        """Execute one SQL statement, commit, and return the rows it produced.

        Args:
            cmd: SQL text, optionally containing ``%s`` placeholders.
            params: optional sequence or mapping bound to the placeholders by
                the driver; prefer this over formatting values into ``cmd``.

        Returns:
            The fetched rows for statements that yield a result set,
            otherwise an empty list.
        """
        self.cursor.execute(cmd, params)
        # Statements such as INSERT/UPDATE/DDL have no result set; fetching
        # after them raises on some connector versions, so only fetch when
        # the cursor reports that rows may be available.
        result = self.cursor.fetchall() if self.cursor.with_rows else []
        self.conn.commit()
        return result

    def close(self) -> None:
        """Release the cursor and the underlying connection."""
        self.cursor.close()
        self.conn.close()
if __name__ == '__main__':
    # Manual smoke test: connect with the default local credentials.
    _settings = {
        "ip": "127.0.0.1",
        "port": 3306,
        "user": "node1",
        "pwd": "mysql114514",
        "database": "grafana",
    }
    db = database(**_settings)

View File

@@ -0,0 +1,13 @@
-- Reset the ollamanet tables: drop whatever exists, then recreate them empty.
DROP TABLE IF EXISTS grafana.ollamanet;
DROP TABLE IF EXISTS grafana.ipport;

-- One row per sampling interval.
CREATE TABLE IF NOT EXISTS grafana.ollamanet (
    time    DATETIME,
    request DOUBLE,
    token   DOUBLE
);

-- Running packet total per remote address.
CREATE TABLE IF NOT EXISTS grafana.ipport (
    ipport CHAR(255) PRIMARY KEY,
    cnt    INT
);

View File

@@ -0,0 +1,37 @@
#!/bin/bash
# Capture traffic on port 11434 and append a summary block to trace.txt
# whenever a packet arrives at least 5 seconds after the previous block.
# Block layout:
#   <epoch seconds> <packets in window>
#   <"In" packets> <other packets>
#   <endpoint> <packets>            (one line per endpoint seen)
#   ---
sudo tcpdump -i any port 11434 -n -l | awk '
BEGIN {
    window_start = systime()
    total = 0
    n_in = 0
    n_out = 0
}
{
    # Field 3 is compared with "In"; every other value counts as outbound.
    if ($3 == "In") {
        n_in++
    } else {
        n_out++
    }
    total++
    now = systime()

    # Field 5 and field 7 (minus its last character) are tallied as endpoints.
    seen[$5]++
    target = substr($7, 1, length($7) - 1)
    seen[target]++

    if (now - window_start >= 5) {
        window_start = now
        # Formatted time is computed but not written anywhere.
        timestamp = strftime("%Y-%m-%d %H:%M:%S", now)
        print now, total >> "trace.txt"
        print n_in, n_out >> "trace.txt"
        for (endpoint in seen) {
            print endpoint, seen[endpoint] >> "trace.txt"
        }
        print "---" >> "trace.txt"
        fflush("trace.txt")

        # Start the next window from zero.
        total = 0
        n_in = 0
        n_out = 0
        delete seen
    }
}'

View File

@@ -0,0 +1,3 @@
# `run` is an action, not a file: declare it phony so a stray file named
# "run" can never make the target look up to date.
.PHONY: run

# Start the packet listener and the trace-to-MySQL forwarder in the
# background, logging into ./log (created first so the redirects succeed).
run:
	mkdir -p log
	nohup ./listen.sh > log/listen.log 2>&1 &
	nohup python tailf.py > log/tailf.log 2>&1 &

View File

@@ -0,0 +1,85 @@
import time
import os
import argparse
from connect import database
# Length of one sampling window in seconds; main() replaces it with --interval.
interval = 5
# Epoch seconds of the newest trace block seen or zero row written.
max_time = 0


def tail_f(args, db, filename):
    """Follow ``filename`` like ``tail -f`` and yield every appended line.

    While the file is idle the generator polls once per second; if more than
    ``interval`` seconds have passed since ``max_time`` it writes an all-zero
    sample to ``<args.database>.ollamanet`` and moves ``max_time`` forward.

    Args:
        args: object exposing ``database`` (schema name used in the INSERT).
        db: object whose ``exec(sql)`` runs a statement.
        filename: path of the file to follow.

    Yields:
        Each new line, including its trailing newline.
    """
    global max_time
    with open(filename, 'r') as stream:
        # Jump to the end of the file: only lines written from now on count.
        stream.seek(0, 2)
        while True:
            fresh = stream.readline()
            if fresh:
                yield fresh
                continue
            # Nothing new yet: wait a second before checking again.
            time.sleep(1)
            now = int(time.time())
            if now - max_time > interval:
                db.exec(f"""INSERT INTO {args.database}.ollamanet VALUES (NOW(), 0, 0)""")
                max_time = now
def _flush_block(db, args, buf):
    """Store one complete trace block (the lines preceding a ``---`` marker).

    ``buf[0]`` is ``<epoch> <packets>``, ``buf[1]`` is ``<in> <out>`` and every
    following line is ``<address>.<port> <packets>``. Malformed blocks and
    entries are reported and skipped instead of stopping the forwarder.
    """
    global max_time
    try:
        ts = int(buf[0].split(' ')[0])
        in_out = buf[1].split(' ')
        recv = int(in_out[0]) / interval
        send = int(in_out[1]) / interval
    except (IndexError, ValueError):
        print(f"[!]: skipping malformed trace block: {buf}")
        return
    max_time = max(max_time, ts)
    db.exec(f"""INSERT INTO {args.database}.ollamanet VALUES (NOW(), {recv}, {send});""")

    # Every line after the two header lines is an endpoint entry. The "---"
    # marker is never stored in buf, so the last element must be included.
    for entry in buf[2:]:
        fields = entry.split(' ')
        endpoint = fields[0]
        # Drop the trailing ".<port>" so totals are kept per address.
        host = endpoint[:endpoint.rfind('.')]
        # --local is documented by its default as "<address>.<port>", so
        # compare the full endpoint; the bare address is accepted as well.
        if args.local in (endpoint, host):
            continue
        try:
            cnt = int(fields[1])
        except (IndexError, ValueError):
            print(f"[!]: skipping malformed trace entry: {entry}")
            continue
        # The address is interpolated into SQL text: accept only characters
        # that can occur in a numeric address.
        if not host or not all(ch.isalnum() or ch in ".:-_" for ch in host):
            print(f"[!]: skipping unexpected address: {host}")
            continue
        # ipport is the primary key, so one upsert replaces the previous
        # SELECT followed by INSERT or UPDATE.
        db.exec(
            f"""INSERT INTO {args.database}.ipport VALUES ('{host}', {cnt}) """
            f"""ON DUPLICATE KEY UPDATE cnt = cnt + {cnt};"""
        )


def main(db:database, args):
    """Forward trace blocks appended to ``args.file`` into MySQL, forever.

    Args:
        db: connection wrapper exposing ``exec(sql)``.
        args: parsed command line; ``file``, ``interval``, ``database`` and
            ``local`` are read.
    """
    global interval
    log_file = args.file
    interval = args.interval
    if not os.path.exists(log_file):
        # Create an empty trace file without going through a shell.
        open(log_file, 'a').close()
    buf = []
    for line in tail_f(args, db, log_file):
        line = line.strip()
        if line == "---":
            _flush_block(db, args, buf)
            buf = []
            continue
        buf.append(line)
if __name__ == "__main__":
    # Command line: trace file, sampling interval and MySQL connection data.
    cli = argparse.ArgumentParser()
    for flag, kind, default, help_text in (
        ('--file', str, 'trace.txt', 'log file'),
        ('--interval', int, 5, 'interval (s)'),
        ('--ip', str, '127.0.0.1', 'ip'),
        ('--port', int, 3306, 'port'),
        ('--user', str, 'node1', 'user'),
        ('--password', str, 'mysql114514', 'password'),
        ("--database", str, "grafana", "database"),
    ):
        cli.add_argument(flag, type=kind, default=default, help=help_text)
    # Endpoint left out of the per-address totals.
    cli.add_argument("--local", type=str, default="127.0.0.1.11434")
    args = cli.parse_args()

    db = database(args.ip, args.port, args.user, args.password, args.database)
    main(db, args)

View File

@@ -0,0 +1,60 @@
1722135153 93
34 59
100.77.22.47.18099 6
100.77.22.47.18188 87
100.82.183.119.11434 93
---
1722135153 93
34 59
100.77.22.47.18099 1
100.77.22.47.18188 92
100.82.183.119.11434 93
---
1722135155 93
33 60
100.77.22.47.18188 93
100.82.183.119.11434 93
---
1722135155 1
1 0
100.82.183.119.11434 1
100.77.22.47.17946 1
---
1722135230 62
23 39
100.82.183.119.11434 62
100.77.22.47.17946 62
---
1722135230 93
36 57
100.77.22.47.18188 19
100.82.183.119.11434 93
100.77.22.47.17946 74
---
1722135230 93
35 58
100.77.22.47.18188 14
100.82.183.119.11434 93
100.77.22.47.17946 79
---
1722135231 94
35 59
100.77.22.47.18188 12
100.82.183.119.11434 94
100.77.22.47.17946 82
---
1722137005 1
1 0
100.82.183.119.11434 1
100.77.22.47.8112 1
---
1722137235 126
44 82
100.82.183.119.11434 126
100.77.22.47.8112 126
---
1722137236 1
1 0
100.82.183.119.11434 1
100.77.22.47.55880 1
---

View File

@@ -0,0 +1,29 @@
# connect to mysql database
import mysql.connector
class database:
    """Thin wrapper around one mysql.connector connection and its cursor."""

    def __init__(self, ip, port, user, pwd, database) -> None:
        """Open the connection and create the cursor used by exec().

        Args:
            ip: host name or address of the MySQL server.
            port: TCP port of the MySQL server.
            user: account name.
            pwd: account password.
            database: default schema for the session.
        """
        self.conn = mysql.connector.connect(
            host=ip,
            port=port,
            user=user,
            password=pwd,
            database=database,
        )
        self.cursor = self.conn.cursor()

    def exec(self, cmd: str, params=None):
        """Execute one SQL statement, commit, and return the rows it produced.

        Args:
            cmd: SQL text, optionally containing ``%s`` placeholders.
            params: optional sequence or mapping bound to the placeholders by
                the driver; prefer this over formatting values into ``cmd``.

        Returns:
            The fetched rows for statements that yield a result set,
            otherwise an empty list.
        """
        self.cursor.execute(cmd, params)
        # Statements such as INSERT/UPDATE/DDL have no result set; fetching
        # after them raises on some connector versions, so only fetch when
        # the cursor reports that rows may be available.
        result = self.cursor.fetchall() if self.cursor.with_rows else []
        self.conn.commit()
        return result

    def close(self) -> None:
        """Release the cursor and the underlying connection."""
        self.cursor.close()
        self.conn.close()
if __name__ == '__main__':
    # Manual smoke test: connect with the default local credentials.
    _settings = {
        "ip": "127.0.0.1",
        "port": 3306,
        "user": "node1",
        "pwd": "mysql114514",
        "database": "grafana",
    }
    db = database(**_settings)

View File

@@ -0,0 +1,14 @@
-- WARNING: database grafana will be cleared if exists
DROP DATABASE IF EXISTS grafana;
CREATE DATABASE IF NOT EXISTS grafana;

-- State timeline: one row per timestamp, one column per concurrent event slot.
DROP TABLE IF EXISTS grafana.CudaEvent;
CREATE TABLE IF NOT EXISTS grafana.CudaEvent (
    time   DOUBLE,
    event1 CHAR(255),
    event2 CHAR(255),
    event3 CHAR(255)
);

-- Counter per event name.
CREATE TABLE IF NOT EXISTS grafana.events (
    name CHAR(255) PRIMARY KEY,
    cnt  INT
);

View File

@@ -0,0 +1,120 @@
# insert data into mysql database
import argparse
from log_reader import reader, ollama_reader
from log_reader import reader
from connect import database
import time
def get_col_num(db) -> int:
    """Return the number of columns grafana.CudaEvent currently has.

    The count comes from information_schema and includes the ``time`` column.
    """
    rows = db.exec(
        f"SELECT COUNT(*) FROM information_schema.COLUMNS where `TABLE_SCHEMA` = 'grafana' and `TABLE_NAME` = 'CudaEvent';"
    )
    # COUNT(*) yields a single row with a single value.
    return rows[0][0]
def lts_cuda_event(db) -> list:
    """Return the event columns of the newest grafana.CudaEvent row.

    When the table is empty, a list of ``None`` with one entry per event
    column (every column except ``time``) is returned instead.
    """
    newest = db.exec(f"SELECT * FROM grafana.`CudaEvent` ORDER BY time DESC LIMIT 1;")
    if len(newest) != 0:
        # Skip the leading time value; the rest are the event slots.
        return list(newest[0][1:])
    slot_count = get_col_num(db) - 1
    return [None] * slot_count
def lts_event_cnt(db) -> dict:
    """Load grafana.events into a ``{name: cnt}`` dictionary."""
    rows = db.exec(
        """SELECT * FROM grafana.events;"""
    )
    return {name: cnt for name, cnt in rows}
def add_col(db):
    """Append one more event column to grafana.CudaEvent.

    The new column is named ``event<N>`` where N is the current column count
    (``time`` included), i.e. one past the highest existing event index when
    the columns are ``time, event1..event<N-1>``.
    """
    next_index = get_col_num(db)
    statement = f"""ALTER TABLE grafana.`CudaEvent` ADD COLUMN event{next_index} CHAR(255)"""
    db.exec(statement)
def del_col(db, col_num):
    """Drop the column ``event<col_num>`` from grafana.CudaEvent."""
    statement = f"""ALTER TABLE grafana.`CudaEvent` DROP COLUMN event{col_num};"""
    db.exec(statement)
def add_empty(max_time, db):
    """Insert a row at ``max_time`` whose event columns are all NULL."""
    slot_count = get_col_num(db) - 1
    nulls = ','.join('NULL' for _ in range(slot_count))
    db.exec(f"""INSERT INTO grafana.`CudaEvent` VALUES ({max_time}, {nulls})""")
def push_log(db, log):
    """Replay parsed start/end events into grafana.CudaEvent as timeline rows.

    Each processed entry produces one row: its time followed by the name held
    in every event slot (NULL for free slots). A closing all-NULL row is added
    at the largest time seen.

    Args:
        db: object whose ``exec(sql)`` runs a statement and returns its rows.
        log: iterable of dicts with the keys 'time', 'op' and 'name'.

    Raises:
        ValueError: if an entry's 'op' is neither 'start' nor 'end'.
    """
    max_time = 0
    ## latest cuda event
    # One entry per event column; None marks a free slot.
    cuda_event = lts_cuda_event(db)
    ## latest event cnt
    # Per-name counter: incremented on 'start', decremented on 'end'.
    event_cnt = lts_event_cnt(db)
    cmd = f"INSERT INTO grafana.CudaEvent VALUES "
    for line_idx, l in enumerate(log):
        if l['op'] == 'start':
            if l['name'] in event_cnt:
                event_cnt[l['name']] += 1
            else:
                event_cnt[l["name"]] = 1
            # Put the event into the first free slot, if there is one.
            empty_col = False
            i = 0
            for e in cuda_event:
                if e is None:
                    cuda_event[i] = l['name']
                    empty_col = True
                    break
                i += 1
            if not empty_col:
                # All slots are taken: the table needs another column. Rows
                # queued so far have the old width, so send them first.
                # 37 is the length of the INSERT prefix, i.e. the test is
                # true once at least one row has been appended to cmd.
                if len(cmd) > 37:
                    # Replace the trailing comma of the last row with ';'.
                    cmd = cmd[:-1] + ";"
                    # print(cmd)
                    # print('------')
                    db.exec(cmd)
                    cmd = f"INSERT INTO grafana.CudaEvent VALUES "
                add_col(db)
                cuda_event.append(l['name'])
        elif l['op'] == 'end':
            if l['name'] in event_cnt:
                if event_cnt[l["name"]] == 0:
                    print(f"[!]: in line {line_idx + 1}: event {l['name']} ended more than starting")
                    #raise ValueError(f"in line {line_idx + 1}: event {l['name']} ended more than starting")
                    # No row is emitted for an unmatched end.
                    continue
                event_cnt[l["name"]] -= 1
                # Free the right-most slot that holds this event name.
                for i, e in enumerate(cuda_event[::-1]):
                    if e == l["name"]:
                        cuda_event[len(cuda_event)- 1 - i] = None
                        break
            if l["name"] not in event_cnt:
                print(f"[!]: in line {line_idx + 1}: event {l['name']} ended without starting")
                # raise ValueError(f"in line {line_idx + 1}: event {l['name']} ended without starting")
                continue
        else:
            raise ValueError(f"in line {line_idx + 1}: unknown operation {l['op']}")
        # Queue one row describing the slot state after this entry.
        tmp_cmd = f"({l['time']}, "
        max_time = max(max_time, float(l['time']))
        for e in cuda_event:
            if e is None:
                tmp_cmd += "NULL, "
            else:
                tmp_cmd += f"'{e}', "
        # Drop the trailing ", " and close the tuple; the comma left at the
        # end separates it from the next row.
        tmp_cmd = tmp_cmd[:-2] + "),"
        cmd += tmp_cmd
    # Send whatever rows are still queued.
    if len(cmd) > 37:
        cmd = cmd[:-1] + ";"
        # print(cmd)
        # print("------")
        db.exec(cmd)
    # print(cuda_event)
    # print(event_cnt)
    add_empty(max_time,db)
def main(ip:str="127.0.0.1", port:int=3306, user:str="node1", pwd:str="mysql114514", data_base:str="grafana", table:str="CudaEvent", log_file:str="log/transformer.log"):
    """Parse ``log_file`` and push its events to MySQL.

    The log is read before the connection is opened. ``table`` is accepted
    but not used by this function.
    """
    parsed = reader(log_file)
    connection = database(ip, port, user, pwd, data_base)
    push_log(connection, parsed)


if __name__ == '__main__':
    main()

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,31 @@
def reader(path):
    """Parse an event log into a list of ``{'time', 'op', 'name'}`` dicts.

    Every non-blank line is stripped and split on single spaces; fields 3, 5
    and 6 (zero-based) become 'time', 'op' and 'name'. Blank lines, such as a
    trailing empty line, are skipped instead of raising IndexError.

    Args:
        path: path of the log file.

    Returns:
        One dict per non-blank line, in file order; all values are strings.

    Raises:
        IndexError: if a non-blank line has fewer than seven fields.
    """
    ret = []
    with open(path, 'r') as f:
        for raw in f:
            line = raw.strip()
            if not line:
                continue
            d = line.split(' ')
            ret.append({'time': d[3], 'op': d[5], 'name': d[6]})
    return ret
def ollama_reader(path):
    """Parse an ollama-style log into ``{'time', 'op', 'name'}`` dicts.

    Every non-blank line is stripped and split on single spaces; field 0 is
    the time, field 2 selects the operation ("B" means 'start', anything else
    'end') and field 3 is the event name. Blank lines, such as a trailing
    empty line, are skipped instead of raising IndexError.

    Args:
        path: path of the log file.

    Returns:
        One dict per non-blank line, in file order.

    Raises:
        IndexError: if a non-blank line has fewer than four fields.
    """
    ret = []
    with open(path, 'r') as f:
        for raw in f:
            line = raw.strip()
            if not line:
                continue
            d = line.split(' ')
            ret.append({
                'time': d[0],
                'op': "start" if d[2] == "B" else "end",
                'name': d[3],
            })
    return ret
if __name__ == '__main__':
    # Ad-hoc check: dump the parsed sample log.
    print(reader('log/transformer.log'))

View File

@@ -0,0 +1,29 @@
# connect to mysql database
import mysql.connector
class database:
    """Thin wrapper around one mysql.connector connection and its cursor."""

    def __init__(self, ip, port, user, pwd, database) -> None:
        """Open the connection and create the cursor used by exec().

        Args:
            ip: host name or address of the MySQL server.
            port: TCP port of the MySQL server.
            user: account name.
            pwd: account password.
            database: default schema for the session.
        """
        self.conn = mysql.connector.connect(
            host=ip,
            port=port,
            user=user,
            password=pwd,
            database=database,
        )
        self.cursor = self.conn.cursor()

    def exec(self, cmd: str, params=None):
        """Execute one SQL statement, commit, and return the rows it produced.

        Args:
            cmd: SQL text, optionally containing ``%s`` placeholders.
            params: optional sequence or mapping bound to the placeholders by
                the driver; prefer this over formatting values into ``cmd``.

        Returns:
            The fetched rows for statements that yield a result set,
            otherwise an empty list.
        """
        self.cursor.execute(cmd, params)
        # Statements such as INSERT/UPDATE/DDL have no result set; fetching
        # after them raises on some connector versions, so only fetch when
        # the cursor reports that rows may be available.
        result = self.cursor.fetchall() if self.cursor.with_rows else []
        self.conn.commit()
        return result

    def close(self) -> None:
        """Release the cursor and the underlying connection."""
        self.cursor.close()
        self.conn.close()
if __name__ == '__main__':
    # Manual smoke test: connect with the default local credentials.
    _settings = {
        "ip": "127.0.0.1",
        "port": 3306,
        "user": "node1",
        "pwd": "mysql114514",
        "database": "grafana",
    }
    db = database(**_settings)

View File

@@ -0,0 +1,32 @@
-- Reset the resource-sample tables: drop whatever exists, then recreate.
DROP TABLE IF EXISTS grafana.gauge;
DROP TABLE IF EXISTS grafana.memory;
DROP TABLE IF EXISTS grafana.gpumem;
DROP TABLE IF EXISTS grafana.diskio;
DROP TABLE IF EXISTS grafana.netio;

-- CPU / memory / GPU gauges.
CREATE TABLE IF NOT EXISTS grafana.gauge (
    TIME     DATETIME,
    cpu      DOUBLE,
    mem      DOUBLE,
    gpu_load DOUBLE,
    gpu_mem  DOUBLE
);

-- Host memory.
CREATE TABLE IF NOT EXISTS grafana.memory (
    TIME  DATETIME,
    total DOUBLE,
    used  DOUBLE
);

-- GPU memory.
CREATE TABLE IF NOT EXISTS grafana.gpumem (
    TIME  DATETIME,
    total DOUBLE,
    used  DOUBLE
);

-- Disk throughput.
CREATE TABLE IF NOT EXISTS grafana.diskio (
    TIME       DATETIME,
    read_rate  DOUBLE,
    write_rate DOUBLE
);

-- Network throughput.
CREATE TABLE IF NOT EXISTS grafana.netio (
    TIME      DATETIME,
    send_rate DOUBLE,
    recv_rate DOUBLE
);

96
grafana/src/top/top.py Normal file
View File

@@ -0,0 +1,96 @@
import psutil
from connect import database
import GPUtil
from time import sleep
from time import time
def avg(lst):
    """Return the arithmetic mean of ``lst``.

    An empty sequence raises ZeroDivisionError.
    """
    total = sum(lst)
    return total / len(lst)
# print(f"{avg(psutil.cpu_percent(interval=0.5, percpu=True))}%")
def get_cpu_percent():
    """Return the mean of the per-CPU utilisation measured over 0.5 seconds."""
    per_cpu = psutil.cpu_percent(interval=0.5, percpu=True)
    return avg(per_cpu)
def get_mem_percent():
    """Return the ``percent`` field of psutil's virtual-memory snapshot."""
    snapshot = psutil.virtual_memory()
    return snapshot.percent
def get_mem_total():
    """Return total virtual memory divided by 1024*1024.

    NOTE(review): psutil presumably reports bytes, which would make this MiB.
    """
    total = psutil.virtual_memory().total
    return total/(1024*1024)
def get_mem_used():
    """Return used virtual memory divided by 1024*1024.

    NOTE(review): psutil presumably reports bytes, which would make this MiB.
    """
    used = psutil.virtual_memory().used
    return used/(1024*1024)
# Baseline for the first call: disk counters and the moment they were taken.
disk_io_start = psutil.disk_io_counters()
last_time = time()


def get_disk_io_rate():
    """Return ``(read_rate, write_rate)`` since the previous call.

    Each rate is the growth of the matching byte counter divided by the
    elapsed seconds; afterwards the current counters become the new baseline.
    """
    global disk_io_start, last_time
    snapshot = psutil.disk_io_counters()
    now = time()
    elapsed = now - last_time
    read_rate = (snapshot.read_bytes - disk_io_start.read_bytes) / elapsed
    write_rate = (snapshot.write_bytes - disk_io_start.write_bytes) / elapsed
    disk_io_start, last_time = snapshot, now
    return read_rate, write_rate
# Baseline for the first call: network counters and the moment they were taken.
net_io_start = psutil.net_io_counters()
last_time_net = time()


def get_network_traffic():
    """Return ``(send_rate, recv_rate)`` since the previous call.

    Each rate is the growth of the matching byte counter divided by the
    elapsed seconds; afterwards the current counters become the new baseline.
    """
    global net_io_start, last_time_net
    snapshot = psutil.net_io_counters()
    now = time()
    elapsed = now - last_time_net
    send_rate = (snapshot.bytes_sent - net_io_start.bytes_sent) / elapsed
    recv_rate = (snapshot.bytes_recv - net_io_start.bytes_recv) / elapsed
    net_io_start, last_time_net = snapshot, now
    return send_rate, recv_rate
def get_gpu():
    """Return readings of the first GPU reported by GPUtil.

    Returns:
        A 5-tuple: gpu load, gpu memory percentage, gpu memory used,
        gpu memory total, gpu temperature. When no GPU is found all five
        values are 0, so callers can always unpack five items.
    """
    GPUs = GPUtil.getGPUs()
    if len(GPUs) == 0:
        # Keep the arity identical to the GPU case; the caller unpacks five
        # values and would otherwise fail on hosts without a GPU.
        return 0, 0, 0, 0, 0
    gpu = GPUs[0]
    return gpu.load, gpu.memoryUtil, gpu.memoryUsed, gpu.memoryTotal, gpu.temperature
def main(ip:str="127.0.0.1", port:int=3306, user:str="node1", pwd:str="mysql114514", data_base:str="grafana", log_file:str="log/transformer.log", flush:int=10):
    """Sample host metrics forever and insert them into ``data_base``.

    Each round writes one row to gauge, memory and gpumem, sleeps ``flush``
    seconds, then writes the disk and network rates (each divided by 1024
    twice). ``log_file`` is accepted but not used by this function.
    """
    db = database(ip=ip, port=port, user=user, pwd=pwd, database=data_base)
    while True:
        cpu_percent = get_cpu_percent()
        mem_percent = get_mem_percent()
        gpu_load, gpu_mem_percent, gpu_mem_used, gpu_mem_total, gpu_temp = get_gpu()

        gauge_sql = f"""INSERT INTO {data_base}.gauge (time, cpu, mem, gpu_load, gpu_mem) VALUES (NOW(), {cpu_percent}, {mem_percent}, {gpu_load}, {gpu_mem_percent});"""
        db.exec(gauge_sql)

        memory_sql = f"""INSERT INTO {data_base}.memory (time, total, used) VALUES (NOW(), {get_mem_total()}, {get_mem_used()});"""
        db.exec(memory_sql)

        gpumem_sql = f"""INSERT INTO {data_base}.gpumem (time, total, used) VALUES (NOW(), {gpu_mem_total}, {gpu_mem_used});"""
        db.exec(gpumem_sql)

        # The pause doubles as the measuring window for the two rates below.
        sleep(flush)

        read_rate, write_rate = get_disk_io_rate()
        diskio_sql = f"""INSERT INTO {data_base}.diskio (time, read_rate, write_rate) VALUES (NOW(), {read_rate / 1024/1024}, {write_rate / 1024/1024});"""
        db.exec(diskio_sql)

        send_rate, recv_rate = get_network_traffic()
        netio_sql = f"""INSERT INTO {data_base}.netio (time, send_rate, recv_rate) VALUES (NOW(), {send_rate / 1024/1024}, {recv_rate / 1024/1024});"""
        db.exec(netio_sql)


if __name__ == "__main__":
    main()

45
grafana/stop.sh Normal file
View File

@@ -0,0 +1,45 @@
#!/bin/bash
# Stop the Grafana/MySQL containers and the background collector processes.
# Run from the directory that contains ./compose.

# Subshell keeps the working directory unchanged; the success message is only
# printed when docker-compose really succeeded. A failure does not abort the
# script, so the collectors below are still stopped.
if (cd compose && docker-compose stop); then
    echo -e "\x1b[32m[+]Successfully stopped docker containers gf-mysql and gf-grafana\x1b[0m"
else
    echo "Failed to stop docker containers gf-mysql and gf-grafana" >&2
fi

# Kill every process whose full command line matches the given pattern.
stop_process() {
    local name="$1"
    local pids pid
    # pgrep never reports itself, so no "grep -v grep" filtering is needed.
    pids=$(pgrep -f -- "$name" || true)
    # Nothing matched: report it and carry on.
    if [ -z "$pids" ]; then
        echo "Unable to find $name"
        return 0
    fi
    # Terminate each matching process.
    for pid in $pids; do
        sudo kill "$pid"
        echo "Killed process $pid#$name"
    done
}

stop_process "tailf.py"
stop_process "listen.sh"
# Killing listen.sh does not end the capture pipeline it started, so stop
# the tcpdump it launched as well.
stop_process "tcpdump -i any port 11434"
stop_process "top.py"
done