feat: add network connections and SM utilization to process metrics
This commit is contained in:
@@ -70,15 +70,17 @@ type ProcessInfo struct {
|
||||
Mem uint64 `json:"mem"`
|
||||
ReadBps uint64 `json:"read_bps"`
|
||||
WriteBps uint64 `json:"write_bps"`
|
||||
NetConns int `json:"net_conns"`
|
||||
}
|
||||
|
||||
// GPUProcessInfo holds per-process GPU stats from nvidia-smi pmon.
|
||||
type GPUProcessInfo struct {
|
||||
PID int32 `json:"pid"`
|
||||
Name string `json:"name"`
|
||||
Type string `json:"type"` // "C", "G", or "C+G"
|
||||
VRAM uint64 `json:"vram_mb"` // framebuffer memory in MB
|
||||
RAM uint64 `json:"ram"` // resident set size in bytes
|
||||
PID int32 `json:"pid"`
|
||||
Name string `json:"name"`
|
||||
Type string `json:"type"` // "C", "G", or "C+G"
|
||||
VRAM uint64 `json:"vram_mb"` // framebuffer memory in MB
|
||||
RAM uint64 `json:"ram"` // resident set size in bytes
|
||||
SMUtil uint32 `json:"sm_util"` // SM (CUDA core) utilization %
|
||||
}
|
||||
|
||||
type Metrics struct {
|
||||
@@ -116,6 +118,16 @@ func (s *SysInfo) CollectOnce() (*Metrics, error) {
|
||||
cpuPercents, _ := cpu.PercentWithContext(s.ctx, 0, false)
|
||||
vm, _ := mem.VirtualMemory()
|
||||
|
||||
// Aggregate network connections per PID in one call
|
||||
netConnByPid := make(map[int32]int)
|
||||
if allConns, err := psnet.Connections("all"); err == nil {
|
||||
for _, c := range allConns {
|
||||
if c.Pid > 0 {
|
||||
netConnByPid[c.Pid]++
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Collect all processes with disk I/O rates
|
||||
procs, _ := ps.Processes()
|
||||
newProcIO := make(map[int32][2]uint64, len(procs))
|
||||
@@ -149,6 +161,7 @@ func (s *SysInfo) CollectOnce() (*Metrics, error) {
|
||||
Mem: memBytes,
|
||||
ReadBps: readBps,
|
||||
WriteBps: writeBps,
|
||||
NetConns: netConnByPid[p.Pid],
|
||||
})
|
||||
}
|
||||
// Sort: primary by CPU desc, secondary by Mem desc
|
||||
@@ -221,7 +234,9 @@ func (s *SysInfo) CollectOnce() (*Metrics, error) {
|
||||
|
||||
// queryGPUProcesses lists all processes currently using the GPU via nvidia-smi pmon.
|
||||
func queryGPUProcesses(ctx context.Context) []GPUProcessInfo {
|
||||
cmd := exec.CommandContext(ctx, "nvidia-smi", "pmon", "-s", "m", "-c", "1")
|
||||
// -s mu: SM utilization + memory (fb)
|
||||
// columns: gpuIdx pid type sm% mem% enc% dec% fbMB ccpmMB command...
|
||||
cmd := exec.CommandContext(ctx, "nvidia-smi", "pmon", "-s", "mu", "-c", "1")
|
||||
out, err := cmd.Output()
|
||||
if err != nil {
|
||||
return nil
|
||||
@@ -233,26 +248,26 @@ func queryGPUProcesses(ctx context.Context) []GPUProcessInfo {
|
||||
continue
|
||||
}
|
||||
fields := strings.Fields(line)
|
||||
// expected: [gpuIdx, pid, type, fbMB, ccpmMB, name...]
|
||||
if len(fields) < 5 {
|
||||
// expected: [gpuIdx, pid, type, sm%, mem%, enc%, dec%, fbMB, ccpmMB, name...]
|
||||
if len(fields) < 9 {
|
||||
continue
|
||||
}
|
||||
pid64, err := strconv.ParseInt(fields[1], 10, 32)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
vram, _ := strconv.ParseUint(fields[3], 10, 64)
|
||||
name := ""
|
||||
if len(fields) >= 6 {
|
||||
name = strings.Join(fields[5:], " ")
|
||||
} else {
|
||||
sm64, _ := strconv.ParseUint(fields[3], 10, 32)
|
||||
vram, _ := strconv.ParseUint(fields[7], 10, 64)
|
||||
name := strings.Join(fields[9:], " ")
|
||||
if name == "" {
|
||||
name = fields[len(fields)-1]
|
||||
}
|
||||
gp := GPUProcessInfo{
|
||||
PID: int32(pid64),
|
||||
Name: name,
|
||||
Type: fields[2],
|
||||
VRAM: vram,
|
||||
PID: int32(pid64),
|
||||
Name: name,
|
||||
Type: fields[2],
|
||||
VRAM: vram,
|
||||
SMUtil: uint32(sm64),
|
||||
}
|
||||
// look up RAM (RSS) for this process
|
||||
if p, err := ps.NewProcess(int32(pid64)); err == nil {
|
||||
|
||||
Reference in New Issue
Block a user