categraf/inputs/nvidia_smi/nvidia_smi.go

105 lines
2.9 KiB
Go

package nvidia_smi
// This is a fork of https://github.com/utkuozdemir/nvidia_gpu_exporter
import (
"log"
"strings"
"time"
"flashcat.cloud/categraf/config"
"flashcat.cloud/categraf/inputs"
"flashcat.cloud/categraf/types"
)
// inputName is the name this plugin is registered under via inputs.Add, and
// the first argument passed to every types.NewSample call in this file
// (presumably used as the metric name prefix — confirm against types.NewSample).
const inputName = "nvidia_smi"
// GPUStats is the nvidia_smi input plugin. It holds the user-supplied
// configuration together with the lookup tables that Init derives from it
// and that Gather consumes on every collection.
type GPUStats struct {
	config.PluginConfig

	// NvidiaSmiCommand is read from the "nvidia_smi_command" TOML key.
	// When it is empty, Init returns types.ErrInstancesEmpty and Gather
	// does nothing. It is handed to buildQFieldToRFieldMap and scrape.
	NvidiaSmiCommand string `toml:"nvidia_smi_command"`

	// QueryFieldNames is read from the "query_field_names" TOML key and is
	// handed to buildQFieldToRFieldMap during Init.
	QueryFieldNames string `toml:"query_field_names"`

	// qFields is the ordered query-field list produced in Init and passed
	// to scrape on every Gather.
	qFields []qField

	// qFieldToMetricInfoMap is built in Init; Gather looks up each cell's
	// query field here to get its metric name and value multiplier.
	qFieldToMetricInfoMap map[qField]MetricInfo
}
// init registers a factory for this plugin under inputName; each call to the
// factory returns a fresh, zero-valued *GPUStats.
func init() {
	inputs.Add(inputName, func() inputs.Input { return new(GPUStats) })
}
// Drop is a no-op: this plugin holds nothing that has to be released.
func (s *GPUStats) Drop() {
}
// GetInstances always returns nil; this plugin defines no instances of its
// own and does its collection directly in Gather.
func (s *GPUStats) GetInstances() []inputs.Instance {
	return nil
}
// Init validates the configuration and prepares the lookup tables used by
// Gather.
//
// It returns types.ErrInstancesEmpty when no nvidia-smi command is
// configured, and otherwise forwards any error from buildQFieldToRFieldMap
// unchanged. On success s.qFields and s.qFieldToMetricInfoMap are populated.
func (s *GPUStats) Init() error {
	if len(s.NvidiaSmiCommand) == 0 {
		return types.ErrInstancesEmpty
	}

	ordered, toRField, err := buildQFieldToRFieldMap(s.QueryFieldNames, s.NvidiaSmiCommand)
	if err != nil {
		return err
	}

	s.qFields = ordered
	s.qFieldToMetricInfoMap = buildQFieldToMetricInfoMap(toRField)

	return nil
}
// Gather performs one scrape with the configured nvidia-smi command and
// pushes the resulting samples onto slist.
//
// Samples emitted:
//   - scrape_use_seconds: wall-clock duration of this call, always emitted
//     once a command is configured (success or failure).
//   - scraper_up: 1 when scrape succeeded, 0 when it returned an error.
//   - gpu_info: constant 1 per table row, labelled with uuid, name, driver
//     model, vbios and driver version taken from that row.
//   - one sample per cell whose raw value can be transformed to a number,
//     labelled with the row's uuid.
//
// If NvidiaSmiCommand is empty the method returns without emitting anything.
func (s *GPUStats) Gather(slist *types.SampleList) {
	if s.NvidiaSmiCommand == "" {
		return
	}

	begun := time.Now()

	// Deferred so the scrape duration is reported on the error path as well.
	defer func(begun time.Time) {
		use := time.Since(begun).Seconds()
		slist.PushFront(types.NewSample(inputName, "scrape_use_seconds", use))
	}(begun)

	currentTable, err := scrape(s.qFields, s.NvidiaSmiCommand)
	if err != nil {
		// Surface the cause; scraper_up=0 alone does not say why the scrape failed.
		log.Println("E! failed to scrape nvidia_smi:", err)
		slist.PushFront(types.NewSample(inputName, "scraper_up", 0))
		return
	}

	slist.PushFront(types.NewSample(inputName, "scraper_up", 1))

	for _, currentRow := range currentTable.rows {
		// The uuid label is the lower-cased raw value with any "gpu-" prefix removed.
		uuid := strings.TrimPrefix(strings.ToLower(currentRow.qFieldToCells[uuidQField].rawValue), "gpu-")
		name := currentRow.qFieldToCells[nameQField].rawValue
		driverModelCurrent := currentRow.qFieldToCells[driverModelCurrentQField].rawValue
		driverModelPending := currentRow.qFieldToCells[driverModelPendingQField].rawValue
		vBiosVersion := currentRow.qFieldToCells[vBiosVersionQField].rawValue
		driverVersion := currentRow.qFieldToCells[driverVersionQField].rawValue

		slist.PushFront(types.NewSample(inputName, "gpu_info", 1, map[string]string{
			"uuid":                 uuid,
			"name":                 name,
			"driver_model_current": driverModelCurrent,
			"driver_model_pending": driverModelPending,
			"vbios_version":        vBiosVersion,
			"driver_version":       driverVersion,
		}))

		for _, currentCell := range currentRow.cells {
			metricInfo := s.qFieldToMetricInfoMap[currentCell.qField]

			num, err := transformRawValue(currentCell.rawValue, metricInfo.valueMultiplier)
			if err != nil {
				// Cells that cannot be transformed are skipped; the failure
				// is only logged when debug mode is on.
				if config.Config.DebugMode {
					log.Println("D! failed to transform gpu field:", currentCell.qField, "raw value:", currentCell.rawValue, "error:", err)
				}
				continue
			}

			slist.PushFront(types.NewSample(inputName, metricInfo.metricName, num, map[string]string{"uuid": uuid}))
		}
	}
}