From d6d4edd0430a7b700f6d13db7f3e0d1cd2e5785e Mon Sep 17 00:00:00 2001 From: xmh <1197843839@qq.com> Date: Wed, 31 Jan 2024 15:43:04 +0800 Subject: [PATCH] feat: support nvidia-smi timeout (#779) * feat: support nvidia-smi timeout Signed-off-by: xmh1011 <1197843839@qq.com> * feat: support nvidia-smi timeout Signed-off-by: xmh1011 <1197843839@qq.com> --------- Signed-off-by: xmh1011 <1197843839@qq.com> --- agent/metrics_agent.go | 5 +++++ conf/input.nvidia_smi/nvidia_smi.toml | 5 ++++- inputs/nvidia_smi/builder.go | 8 ++++---- inputs/nvidia_smi/nvidia_smi.go | 10 +++++----- inputs/nvidia_smi/scrape.go | 10 +++++----- 5 files changed, 23 insertions(+), 15 deletions(-) diff --git a/agent/metrics_agent.go b/agent/metrics_agent.go index d7fe5e2c..be06eaed 100644 --- a/agent/metrics_agent.go +++ b/agent/metrics_agent.go @@ -267,6 +267,11 @@ func (ma *MetricsAgent) inputGo(name string, sum string, input inputs.Input) { if err = inputs.MayInit(input); err != nil { if !errors.Is(err, types.ErrInstancesEmpty) { log.Println("E! failed to init input:", name, "error:", err) + } else { + if config.Config.DebugMode { + _, inputKey := inputs.ParseInputName(name) + log.Println("W! no instances for input: ", inputKey) + } } return } diff --git a/conf/input.nvidia_smi/nvidia_smi.toml b/conf/input.nvidia_smi/nvidia_smi.toml index fcff6d18..4c060375 100644 --- a/conf/input.nvidia_smi/nvidia_smi.toml +++ b/conf/input.nvidia_smi/nvidia_smi.toml @@ -11,4 +11,7 @@ nvidia_smi_command = "" # Comma-separated list of the query fields. # You can find out possible fields by running `nvidia-smi --help-query-gpus`. # The value `AUTO` will automatically detect the fields to query. -query_field_names = "AUTO" \ No newline at end of file +query_field_names = "AUTO" + +# query_timeout is used to set the query timeout to avoid the delay of data collection. 
+query_timeout = "5s" \ No newline at end of file diff --git a/inputs/nvidia_smi/builder.go b/inputs/nvidia_smi/builder.go index 850e2691..6432acc6 100644 --- a/inputs/nvidia_smi/builder.go +++ b/inputs/nvidia_smi/builder.go @@ -45,15 +45,15 @@ func buildMetricInfo(rField rField) MetricInfo { } } -func buildQFieldToRFieldMap(qFieldsRaw string, nvidiaSmiCommand string) ([]qField, map[qField]rField, error) { - qFieldsSeparated := strings.Split(qFieldsRaw, ",") +func (s *GPUStats) buildQFieldToRFieldMap() ([]qField, map[qField]rField, error) { + qFieldsSeparated := strings.Split(s.QueryFieldNames, ",") qFields := toQFieldSlice(qFieldsSeparated) qFields = append(qFields, requiredFields...) qFields = removeDuplicateQFields(qFields) if len(qFieldsSeparated) == 1 && qFieldsSeparated[0] == qFieldsAuto { - parsed, err := parseAutoQFields(nvidiaSmiCommand) + parsed, err := parseAutoQFields(s.NvidiaSmiCommand) if err != nil { log.Println("W! failed to auto-determine query field names, falling back to the built-in list. 
error:", err) return getKeys(fallbackQFieldToRFieldMap), fallbackQFieldToRFieldMap, nil @@ -62,7 +62,7 @@ func buildQFieldToRFieldMap(qFieldsRaw string, nvidiaSmiCommand string) ([]qFiel qFields = parsed } - resultTable, err := scrape(qFields, nvidiaSmiCommand) + resultTable, err := s.scrape() var rFields []rField diff --git a/inputs/nvidia_smi/nvidia_smi.go b/inputs/nvidia_smi/nvidia_smi.go index 152c0ceb..b967b78a 100644 --- a/inputs/nvidia_smi/nvidia_smi.go +++ b/inputs/nvidia_smi/nvidia_smi.go @@ -17,8 +17,9 @@ const inputName = "nvidia_smi" type GPUStats struct { config.PluginConfig - NvidiaSmiCommand string `toml:"nvidia_smi_command"` - QueryFieldNames string `toml:"query_field_names"` + NvidiaSmiCommand string `toml:"nvidia_smi_command"` + QueryFieldNames string `toml:"query_field_names"` + QueryTimeOut config.Duration `toml:"query_timeout"` qFields []qField qFieldToMetricInfoMap map[qField]MetricInfo @@ -43,7 +44,7 @@ func (s *GPUStats) Init() error { return types.ErrInstancesEmpty } - qFieldsOrdered, qFieldToRFieldMap, err := buildQFieldToRFieldMap(s.QueryFieldNames, s.NvidiaSmiCommand) + qFieldsOrdered, qFieldToRFieldMap, err := s.buildQFieldToRFieldMap() if err != nil { return err } @@ -58,7 +59,6 @@ func (s *GPUStats) Gather(slist *types.SampleList) { if s.NvidiaSmiCommand == "" { return } - begun := time.Now() // scrape use seconds @@ -67,7 +67,7 @@ func (s *GPUStats) Gather(slist *types.SampleList) { slist.PushFront(types.NewSample(inputName, "scrape_use_seconds", use)) }(begun) - currentTable, err := scrape(s.qFields, s.NvidiaSmiCommand) + currentTable, err := s.scrape() if err != nil { slist.PushFront(types.NewSample(inputName, "scraper_up", 0)) return diff --git a/inputs/nvidia_smi/scrape.go b/inputs/nvidia_smi/scrape.go index 6596ca77..a4ba9d16 100644 --- a/inputs/nvidia_smi/scrape.go +++ b/inputs/nvidia_smi/scrape.go @@ -10,10 +10,10 @@ import ( "flashcat.cloud/categraf/pkg/cmdx" ) -func scrape(qFields []qField, nvidiaSmiCommand string) (*table, 
error) { - qFieldsJoined := strings.Join(QFieldSliceToStringSlice(qFields), ",") +func (s *GPUStats) scrape() (*table, error) { + qFieldsJoined := strings.Join(QFieldSliceToStringSlice(s.qFields), ",") - cmdAndArgs := strings.Fields(nvidiaSmiCommand) + cmdAndArgs := strings.Fields(s.NvidiaSmiCommand) cmdAndArgs = append(cmdAndArgs, fmt.Sprintf("--query-gpu=%s", qFieldsJoined)) cmdAndArgs = append(cmdAndArgs, "--format=csv") @@ -24,7 +24,7 @@ func scrape(qFields []qField, nvidiaSmiCommand string) (*table, error) { cmd.Stdout = &stdout cmd.Stderr = &stderr - err, timeout := cmdx.RunTimeout(cmd, time.Second*5) + err, timeout := cmdx.RunTimeout(cmd, time.Duration(s.QueryTimeOut)) if timeout { return nil, fmt.Errorf("run command: %s timeout", strings.Join(cmdAndArgs, " ")) } @@ -34,7 +34,7 @@ func scrape(qFields []qField, nvidiaSmiCommand string) (*table, error) { strings.Join(cmdAndArgs, " "), err, stdout.String(), stderr.String()) } - t, err := parseCSVIntoTable(strings.TrimSpace(stdout.String()), qFields) + t, err := parseCSVIntoTable(strings.TrimSpace(stdout.String()), s.qFields) if err != nil { return nil, err }