feat: add network connections and SM utilization to process metrics

This commit is contained in:
Jonathan Atta
2026-03-11 18:46:36 +01:00
parent d58003feb7
commit 3ad8bf68a4
3 changed files with 146 additions and 70 deletions

View File

@@ -70,15 +70,17 @@ type ProcessInfo struct {
Mem uint64 `json:"mem"`
ReadBps uint64 `json:"read_bps"`
WriteBps uint64 `json:"write_bps"`
NetConns int `json:"net_conns"`
}
// GPUProcessInfo holds per-process GPU stats from nvidia-smi pmon.
//
// Each value describes a single process row reported by pmon: its PID,
// command name and process type, plus the framebuffer memory it holds on
// the GPU, its resident set size in host memory, and its SM utilization.
type GPUProcessInfo struct {
	PID    int32  `json:"pid"`
	Name   string `json:"name"`
	Type   string `json:"type"`    // "C", "G", or "C+G"
	VRAM   uint64 `json:"vram_mb"` // framebuffer memory in MB
	RAM    uint64 `json:"ram"`     // resident set size in bytes
	SMUtil uint32 `json:"sm_util"` // SM (CUDA core) utilization %
}
type Metrics struct {
@@ -116,6 +118,16 @@ func (s *SysInfo) CollectOnce() (*Metrics, error) {
cpuPercents, _ := cpu.PercentWithContext(s.ctx, 0, false)
vm, _ := mem.VirtualMemory()
// Aggregate network connections per PID in one call
netConnByPid := make(map[int32]int)
if allConns, err := psnet.Connections("all"); err == nil {
for _, c := range allConns {
if c.Pid > 0 {
netConnByPid[c.Pid]++
}
}
}
// Collect all processes with disk I/O rates
procs, _ := ps.Processes()
newProcIO := make(map[int32][2]uint64, len(procs))
@@ -149,6 +161,7 @@ func (s *SysInfo) CollectOnce() (*Metrics, error) {
Mem: memBytes,
ReadBps: readBps,
WriteBps: writeBps,
NetConns: netConnByPid[p.Pid],
})
}
// Sort: primary by CPU desc, secondary by Mem desc
@@ -221,7 +234,9 @@ func (s *SysInfo) CollectOnce() (*Metrics, error) {
// queryGPUProcesses lists all processes currently using the GPU via nvidia-smi pmon.
func queryGPUProcesses(ctx context.Context) []GPUProcessInfo {
cmd := exec.CommandContext(ctx, "nvidia-smi", "pmon", "-s", "m", "-c", "1")
// -s mu: SM utilization + memory (fb)
// columns: gpuIdx pid type sm% mem% enc% dec% fbMB ccpmMB command...
cmd := exec.CommandContext(ctx, "nvidia-smi", "pmon", "-s", "mu", "-c", "1")
out, err := cmd.Output()
if err != nil {
return nil
@@ -233,26 +248,26 @@ func queryGPUProcesses(ctx context.Context) []GPUProcessInfo {
continue
}
fields := strings.Fields(line)
// expected: [gpuIdx, pid, type, fbMB, ccpmMB, name...]
if len(fields) < 5 {
// expected: [gpuIdx, pid, type, sm%, mem%, enc%, dec%, fbMB, ccpmMB, name...]
if len(fields) < 9 {
continue
}
pid64, err := strconv.ParseInt(fields[1], 10, 32)
if err != nil {
continue
}
vram, _ := strconv.ParseUint(fields[3], 10, 64)
name := ""
if len(fields) >= 6 {
name = strings.Join(fields[5:], " ")
} else {
sm64, _ := strconv.ParseUint(fields[3], 10, 32)
vram, _ := strconv.ParseUint(fields[7], 10, 64)
name := strings.Join(fields[9:], " ")
if name == "" {
name = fields[len(fields)-1]
}
gp := GPUProcessInfo{
PID: int32(pid64),
Name: name,
Type: fields[2],
VRAM: vram,
PID: int32(pid64),
Name: name,
Type: fields[2],
VRAM: vram,
SMUtil: uint32(sm64),
}
// look up RAM (RSS) for this process
if p, err := ps.NewProcess(int32(pid64)); err == nil {