categraf/inputs/nvidiasmi/fields.go

220 lines
11 KiB
Go

package nvidiasmi
import (
"bytes"
"errors"
"fmt"
"os/exec"
"regexp"
"strings"
"time"
"flashcat.cloud/categraf/pkg/cmdx"
)
const (
uuidQField qField = "uuid"
nameQField qField = "name"
driverModelCurrentQField qField = "driver_model.current"
driverModelPendingQField qField = "driver_model.pending"
vBiosVersionQField qField = "vbios_version"
driverVersionQField qField = "driver_version"
qFieldsAuto = "AUTO"
DefaultQField = qFieldsAuto
)
var (
ErrNoQueryFields = errors.New("could not extract any query fields")
fieldRegex = regexp.MustCompile(`(?m)\n\s*\n^"([^"]+)"`)
fallbackQFieldToRFieldMap = map[qField]rField{
"timestamp": "timestamp",
"driver_version": "driver_version",
"count": "count",
"name": "name",
"serial": "serial",
"uuid": "uuid",
"pci.bus_id": "pci.bus_id",
"pci.domain": "pci.domain",
"pci.bus": "pci.bus",
"pci.device": "pci.device",
"pci.device_id": "pci.device_id",
"pci.sub_device_id": "pci.sub_device_id",
"pcie.link.gen.current": "pcie.link.gen.current",
"pcie.link.gen.max": "pcie.link.gen.max",
"pcie.link.width.current": "pcie.link.width.current",
"pcie.link.width.max": "pcie.link.width.max",
"index": "index",
"display_mode": "display_mode",
"display_active": "display_active",
"persistence_mode": "persistence_mode",
"accounting.mode": "accounting.mode",
"accounting.buffer_size": "accounting.buffer_size",
"driver_model.current": "driver_model.current",
"driver_model.pending": "driver_model.pending",
"vbios_version": "vbios_version",
"inforom.img": "inforom.img",
"inforom.oem": "inforom.oem",
"inforom.ecc": "inforom.ecc",
"inforom.pwr": "inforom.pwr",
"gom.current": "gom.current",
"gom.pending": "gom.pending",
"fan.speed": "fan.speed [%]",
"pstate": "pstate",
"clocks_throttle_reasons.supported": "clocks_throttle_reasons.supported",
"clocks_throttle_reasons.active": "clocks_throttle_reasons.active",
"clocks_throttle_reasons.gpu_idle": "clocks_throttle_reasons.gpu_idle",
"clocks_throttle_reasons.applications_clocks_setting": "clocks_throttle_reasons.applications_clocks_setting",
"clocks_throttle_reasons.sw_power_cap": "clocks_throttle_reasons.sw_power_cap",
"clocks_throttle_reasons.hw_slowdown": "clocks_throttle_reasons.hw_slowdown",
"clocks_throttle_reasons.hw_thermal_slowdown": "clocks_throttle_reasons.hw_thermal_slowdown",
"clocks_throttle_reasons.hw_power_brake_slowdown": "clocks_throttle_reasons.hw_power_brake_slowdown",
"clocks_throttle_reasons.sw_thermal_slowdown": "clocks_throttle_reasons.sw_thermal_slowdown",
"clocks_throttle_reasons.sync_boost": "clocks_throttle_reasons.sync_boost",
"memory.total": "memory.total [MiB]",
"memory.used": "memory.used [MiB]",
"memory.free": "memory.free [MiB]",
"compute_mode": "compute_mode",
"utilization.gpu": "utilization.gpu [%]",
"utilization.memory": "utilization.memory [%]",
"encoder.stats.sessionCount": "encoder.stats.sessionCount",
"encoder.stats.averageFps": "encoder.stats.averageFps",
"encoder.stats.averageLatency": "encoder.stats.averageLatency",
"ecc.mode.current": "ecc.mode.current",
"ecc.mode.pending": "ecc.mode.pending",
"ecc.errors.corrected.volatile.device_memory": "ecc.errors.corrected.volatile.device_memory",
"ecc.errors.corrected.volatile.dram": "ecc.errors.corrected.volatile.dram",
"ecc.errors.corrected.volatile.register_file": "ecc.errors.corrected.volatile.register_file",
"ecc.errors.corrected.volatile.l1_cache": "ecc.errors.corrected.volatile.l1_cache",
"ecc.errors.corrected.volatile.l2_cache": "ecc.errors.corrected.volatile.l2_cache",
"ecc.errors.corrected.volatile.texture_memory": "ecc.errors.corrected.volatile.texture_memory",
"ecc.errors.corrected.volatile.cbu": "ecc.errors.corrected.volatile.cbu",
"ecc.errors.corrected.volatile.sram": "ecc.errors.corrected.volatile.sram",
"ecc.errors.corrected.volatile.total": "ecc.errors.corrected.volatile.total",
"ecc.errors.corrected.aggregate.device_memory": "ecc.errors.corrected.aggregate.device_memory",
"ecc.errors.corrected.aggregate.dram": "ecc.errors.corrected.aggregate.dram",
"ecc.errors.corrected.aggregate.register_file": "ecc.errors.corrected.aggregate.register_file",
"ecc.errors.corrected.aggregate.l1_cache": "ecc.errors.corrected.aggregate.l1_cache",
"ecc.errors.corrected.aggregate.l2_cache": "ecc.errors.corrected.aggregate.l2_cache",
"ecc.errors.corrected.aggregate.texture_memory": "ecc.errors.corrected.aggregate.texture_memory",
"ecc.errors.corrected.aggregate.cbu": "ecc.errors.corrected.aggregate.cbu",
"ecc.errors.corrected.aggregate.sram": "ecc.errors.corrected.aggregate.sram",
"ecc.errors.corrected.aggregate.total": "ecc.errors.corrected.aggregate.total",
"ecc.errors.uncorrected.volatile.device_memory": "ecc.errors.uncorrected.volatile.device_memory",
"ecc.errors.uncorrected.volatile.dram": "ecc.errors.uncorrected.volatile.dram",
"ecc.errors.uncorrected.volatile.register_file": "ecc.errors.uncorrected.volatile.register_file",
"ecc.errors.uncorrected.volatile.l1_cache": "ecc.errors.uncorrected.volatile.l1_cache",
"ecc.errors.uncorrected.volatile.l2_cache": "ecc.errors.uncorrected.volatile.l2_cache",
"ecc.errors.uncorrected.volatile.texture_memory": "ecc.errors.uncorrected.volatile.texture_memory",
"ecc.errors.uncorrected.volatile.cbu": "ecc.errors.uncorrected.volatile.cbu",
"ecc.errors.uncorrected.volatile.sram": "ecc.errors.uncorrected.volatile.sram",
"ecc.errors.uncorrected.volatile.total": "ecc.errors.uncorrected.volatile.total",
"ecc.errors.uncorrected.aggregate.device_memory": "ecc.errors.uncorrected.aggregate.device_memory",
"ecc.errors.uncorrected.aggregate.dram": "ecc.errors.uncorrected.aggregate.dram",
"ecc.errors.uncorrected.aggregate.register_file": "ecc.errors.uncorrected.aggregate.register_file",
"ecc.errors.uncorrected.aggregate.l1_cache": "ecc.errors.uncorrected.aggregate.l1_cache",
"ecc.errors.uncorrected.aggregate.l2_cache": "ecc.errors.uncorrected.aggregate.l2_cache",
"ecc.errors.uncorrected.aggregate.texture_memory": "ecc.errors.uncorrected.aggregate.texture_memory",
"ecc.errors.uncorrected.aggregate.cbu": "ecc.errors.uncorrected.aggregate.cbu",
"ecc.errors.uncorrected.aggregate.sram": "ecc.errors.uncorrected.aggregate.sram",
"ecc.errors.uncorrected.aggregate.total": "ecc.errors.uncorrected.aggregate.total",
"retired_pages.single_bit_ecc.count": "retired_pages.single_bit_ecc.count",
"retired_pages.double_bit.count": "retired_pages.double_bit.count",
"retired_pages.pending": "retired_pages.pending",
"temperature.gpu": "temperature.gpu",
"temperature.memory": "temperature.memory",
"power.management": "power.management",
"power.draw": "power.draw [W]",
"power.limit": "power.limit [W]",
"enforced.power.limit": "enforced.power.limit [W]",
"power.default_limit": "power.default_limit [W]",
"power.min_limit": "power.min_limit [W]",
"power.max_limit": "power.max_limit [W]",
"clocks.current.graphics": "clocks.current.graphics [MHz]",
"clocks.current.sm": "clocks.current.sm [MHz]",
"clocks.current.memory": "clocks.current.memory [MHz]",
"clocks.current.video": "clocks.current.video [MHz]",
"clocks.applications.graphics": "clocks.applications.graphics [MHz]",
"clocks.applications.memory": "clocks.applications.memory [MHz]",
"clocks.default_applications.graphics": "clocks.default_applications.graphics [MHz]",
"clocks.default_applications.memory": "clocks.default_applications.memory [MHz]",
"clocks.max.graphics": "clocks.max.graphics [MHz]",
"clocks.max.sm": "clocks.max.sm [MHz]",
"clocks.max.memory": "clocks.max.memory [MHz]",
"mig.mode.current": "mig.mode.current",
"mig.mode.pending": "mig.mode.pending",
}
)
func parseAutoQFields(nvidiaSmiCommand string) ([]qField, error) {
cmdAndArgs := strings.Fields(nvidiaSmiCommand)
cmdAndArgs = append(cmdAndArgs, "--help-query-gpu")
cmd := exec.Command(cmdAndArgs[0], cmdAndArgs[1:]...) //nolint:gosec
var stdout bytes.Buffer
var stderr bytes.Buffer
cmd.Stdout = &stdout
cmd.Stderr = &stderr
err, timeout := cmdx.RunTimeout(cmd, time.Second*3)
if timeout {
return nil, fmt.Errorf("timeout to run command: %s", strings.Join(cmdAndArgs, " "))
}
outStr := stdout.String()
errStr := stdout.String()
if err != nil {
return nil, fmt.Errorf("failed to run command: %s | error: %v | stdout: %s | stderr: %s",
strings.Join(cmdAndArgs, " "), err, outStr, errStr)
}
fields := extractQFields(outStr)
if fields == nil {
return nil, fmt.Errorf("%w | command: %s | stdout: %s | stderr: %s", ErrNoQueryFields,
strings.Join(cmdAndArgs, " "), outStr, errStr)
}
return fields, nil
}
func extractQFields(text string) []qField {
found := fieldRegex.FindAllStringSubmatch(text, -1)
fields := make([]qField, len(found))
for i, ss := range found {
fields[i] = qField(ss[1])
}
return fields
}
func toQFieldSlice(ss []string) []qField {
r := make([]qField, len(ss))
for i, s := range ss {
r[i] = qField(s)
}
return r
}
func toRFieldSlice(ss []string) []rField {
r := make([]rField, len(ss))
for i, s := range ss {
r[i] = rField(s)
}
return r
}
func QFieldSliceToStringSlice(qs []qField) []string {
r := make([]string, len(qs))
for i, q := range qs {
r[i] = string(q)
}
return r
}