Skip to content

Commit

Permalink
feat: support nvidia-smi timeout (#779)
Browse files Browse the repository at this point in the history
* feat: support nvidia-smi timeout

Signed-off-by: xmh1011 <[email protected]>

* feat: support nvidia-smi timeout

Signed-off-by: xmh1011 <[email protected]>

---------

Signed-off-by: xmh1011 <[email protected]>
  • Loading branch information
xmh1011 authored Jan 31, 2024
1 parent cd3f17b commit d6d4edd
Show file tree
Hide file tree
Showing 5 changed files with 23 additions and 15 deletions.
5 changes: 5 additions & 0 deletions agent/metrics_agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,11 @@ func (ma *MetricsAgent) inputGo(name string, sum string, input inputs.Input) {
if err = inputs.MayInit(input); err != nil {
if !errors.Is(err, types.ErrInstancesEmpty) {
log.Println("E! failed to init input:", name, "error:", err)
} else {
if config.Config.DebugMode {
_, inputKey := inputs.ParseInputName(name)
log.Println("W! no instances for input: ", inputKey)
}
}
return
}
Expand Down
5 changes: 4 additions & 1 deletion conf/input.nvidia_smi/nvidia_smi.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,7 @@ nvidia_smi_command = ""
# Comma-separated list of the query fields.
# You can find out possible fields by running `nvidia-smi --help-query-gpus`.
# The value `AUTO` will automatically detect the fields to query.
query_field_names = "AUTO"
query_field_names = "AUTO"

# query_timeout sets the timeout for each nvidia-smi query, to avoid delaying data collection.
query_timeout = "5s"
8 changes: 4 additions & 4 deletions inputs/nvidia_smi/builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,15 +45,15 @@ func buildMetricInfo(rField rField) MetricInfo {
}
}

func buildQFieldToRFieldMap(qFieldsRaw string, nvidiaSmiCommand string) ([]qField, map[qField]rField, error) {
qFieldsSeparated := strings.Split(qFieldsRaw, ",")
func (s *GPUStats) buildQFieldToRFieldMap() ([]qField, map[qField]rField, error) {
qFieldsSeparated := strings.Split(s.QueryFieldNames, ",")

qFields := toQFieldSlice(qFieldsSeparated)
qFields = append(qFields, requiredFields...)
qFields = removeDuplicateQFields(qFields)

if len(qFieldsSeparated) == 1 && qFieldsSeparated[0] == qFieldsAuto {
parsed, err := parseAutoQFields(nvidiaSmiCommand)
parsed, err := parseAutoQFields(s.NvidiaSmiCommand)
if err != nil {
log.Println("W! failed to auto-determine query field names, falling back to the built-in list. error:", err)
return getKeys(fallbackQFieldToRFieldMap), fallbackQFieldToRFieldMap, nil
Expand All @@ -62,7 +62,7 @@ func buildQFieldToRFieldMap(qFieldsRaw string, nvidiaSmiCommand string) ([]qFiel
qFields = parsed
}

resultTable, err := scrape(qFields, nvidiaSmiCommand)
resultTable, err := s.scrape()

var rFields []rField

Expand Down
10 changes: 5 additions & 5 deletions inputs/nvidia_smi/nvidia_smi.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,9 @@ const inputName = "nvidia_smi"
type GPUStats struct {
config.PluginConfig

NvidiaSmiCommand string `toml:"nvidia_smi_command"`
QueryFieldNames string `toml:"query_field_names"`
NvidiaSmiCommand string `toml:"nvidia_smi_command"`
QueryFieldNames string `toml:"query_field_names"`
QueryTimeOut config.Duration `toml:"query_timeout"`

qFields []qField
qFieldToMetricInfoMap map[qField]MetricInfo
Expand All @@ -43,7 +44,7 @@ func (s *GPUStats) Init() error {
return types.ErrInstancesEmpty
}

qFieldsOrdered, qFieldToRFieldMap, err := buildQFieldToRFieldMap(s.QueryFieldNames, s.NvidiaSmiCommand)
qFieldsOrdered, qFieldToRFieldMap, err := s.buildQFieldToRFieldMap()
if err != nil {
return err
}
Expand All @@ -58,7 +59,6 @@ func (s *GPUStats) Gather(slist *types.SampleList) {
if s.NvidiaSmiCommand == "" {
return
}

begun := time.Now()

// report how long the scrape took, in seconds
Expand All @@ -67,7 +67,7 @@ func (s *GPUStats) Gather(slist *types.SampleList) {
slist.PushFront(types.NewSample(inputName, "scrape_use_seconds", use))
}(begun)

currentTable, err := scrape(s.qFields, s.NvidiaSmiCommand)
currentTable, err := s.scrape()
if err != nil {
slist.PushFront(types.NewSample(inputName, "scraper_up", 0))
return
Expand Down
10 changes: 5 additions & 5 deletions inputs/nvidia_smi/scrape.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,10 @@ import (
"flashcat.cloud/categraf/pkg/cmdx"
)

func scrape(qFields []qField, nvidiaSmiCommand string) (*table, error) {
qFieldsJoined := strings.Join(QFieldSliceToStringSlice(qFields), ",")
func (s *GPUStats) scrape() (*table, error) {
qFieldsJoined := strings.Join(QFieldSliceToStringSlice(s.qFields), ",")

cmdAndArgs := strings.Fields(nvidiaSmiCommand)
cmdAndArgs := strings.Fields(s.NvidiaSmiCommand)
cmdAndArgs = append(cmdAndArgs, fmt.Sprintf("--query-gpu=%s", qFieldsJoined))
cmdAndArgs = append(cmdAndArgs, "--format=csv")

Expand All @@ -24,7 +24,7 @@ func scrape(qFields []qField, nvidiaSmiCommand string) (*table, error) {
cmd.Stdout = &stdout
cmd.Stderr = &stderr

err, timeout := cmdx.RunTimeout(cmd, time.Second*5)
err, timeout := cmdx.RunTimeout(cmd, time.Duration(s.QueryTimeOut))
if timeout {
return nil, fmt.Errorf("run command: %s timeout", strings.Join(cmdAndArgs, " "))
}
Expand All @@ -34,7 +34,7 @@ func scrape(qFields []qField, nvidiaSmiCommand string) (*table, error) {
strings.Join(cmdAndArgs, " "), err, stdout.String(), stderr.String())
}

t, err := parseCSVIntoTable(strings.TrimSpace(stdout.String()), qFields)
t, err := parseCSVIntoTable(strings.TrimSpace(stdout.String()), s.qFields)
if err != nil {
return nil, err
}
Expand Down

0 comments on commit d6d4edd

Please sign in to comment.