diff --git a/conf/input.ping/ping.toml b/conf/input.ping/ping.toml index 187afae3..3156c514 100644 --- a/conf/input.ping/ping.toml +++ b/conf/input.ping/ping.toml @@ -10,6 +10,18 @@ targets = [ # "10.4.5.7" ] +## Method used for sending pings, can be either "exec" or "native". When set +## to "exec" the systems ping command will be executed. When set to "native" +## the plugin will send pings directly. +## +## While the default is "native" for backwards compatibility, new deployments +## are encouraged to use the "native" method for improved compatibility and +## performance. +# method = "exec" + +## Specify the ping executable binary. +# binary = "ping" + # # append some labels for series # labels = { region="cloud", product="n9e" } diff --git a/inputs/ping/README.md b/inputs/ping/README.md index ec34ff86..3fa79400 100644 --- a/inputs/ping/README.md +++ b/inputs/ping/README.md @@ -3,6 +3,9 @@ ping 监控插件,探测远端目标地址能否 ping 通,如果机器没有禁 ping,这就是一个很好用的探测机器存活的手段 ## Configuration +这个插件有两种主要的操作方法:`exec` 和 `native`.推荐使用 `native` 方法,因为它具有更好的系统兼容性和性能. +为了向后兼容和更精准的response_ms,`native` 方法是默认的. +使用 `method = "exec"`,将会调用系统ping程序来发送ping packets. 要探测的机器配置到 targets 中,targets 是个数组,可以配置多个,当然也可以拆成多个 `[[instances]]` 配置段,比如: @@ -82,3 +85,25 @@ When using `method = "native"`, you will need permissions similar to the executa 大盘地址 [dashboard-2.0.json](https://github.com/flashcatcloud/categraf/tree/main/inputs/ping/dashboard-2.0.json) +## Example Output + +```text +ping_maximum_response_ms agent_hostname=zy-fat product=n9e region=cloud target=10.0.24.136 0.036 +ping_packets_transmitted agent_hostname=zy-fat product=n9e region=cloud target=10.0.24.136 1 +ping_packets_received agent_hostname=zy-fat product=n9e region=cloud target=10.0.24.136 1 +ping_average_response_ms agent_hostname=zy-fat product=n9e region=cloud target=10.0.24.136 0.036 +ping_minimum_response_ms agent_hostname=zy-fat product=n9e region=cloud target=10.0.24.136 0.036 +ping_standard_deviation_ms agent_hostname=zy-fat product=n9e region=cloud target=10.0.24.136 0 +ping_result_code agent_hostname=zy-fat product=n9e region=cloud target=10.0.24.136 0 +ping_percent_packet_loss agent_hostname=zy-fat product=n9e region=cloud target=10.0.24.136 0 +ping_ttl agent_hostname=zy-fat product=n9e region=cloud target=10.0.24.136 64 +ping_minimum_response_ms agent_hostname=zy-fat product=n9e region=cloud target=www.baidu.com 20.935 +ping_average_response_ms agent_hostname=zy-fat product=n9e region=cloud target=www.baidu.com 20.935 +ping_standard_deviation_ms agent_hostname=zy-fat product=n9e region=cloud target=www.baidu.com 0 +ping_result_code agent_hostname=zy-fat product=n9e region=cloud target=www.baidu.com 0 +ping_packets_transmitted agent_hostname=zy-fat product=n9e region=cloud target=www.baidu.com 1 +ping_packets_received agent_hostname=zy-fat product=n9e region=cloud target=www.baidu.com 1 +ping_ttl agent_hostname=zy-fat product=n9e region=cloud target=www.baidu.com 50 +ping_percent_packet_loss agent_hostname=zy-fat product=n9e region=cloud target=www.baidu.com 0 +ping_maximum_response_ms agent_hostname=zy-fat product=n9e region=cloud target=www.baidu.com 20.935 +``` \ No newline at end of file diff --git a/inputs/ping/ping.go b/inputs/ping/ping.go index 0fc5daec..47ec826f 100644 --- a/inputs/ping/ping.go +++ b/inputs/ping/ping.go @@ -1,18 +1,23 @@ package ping import ( + "bytes" + "errors" "fmt" "log" "net" + "os/exec" "runtime" "strings" "sync" "time" + ping "github.com/prometheus-community/pro-bing" + "flashcat.cloud/categraf/config" "flashcat.cloud/categraf/inputs" + "flashcat.cloud/categraf/pkg/cmdx" "flashcat.cloud/categraf/types" - ping "github.com/prometheus-community/pro-bing" ) const ( @@ -20,6 +25,8 @@ const ( defaultPingDataBytesSize = 56 ) +type HostPinger func(binary string, timeout float64, args ...string) (string, error) + type Instance struct { config.InstanceConfig @@ -31,10 +38,17 @@ type Instance struct { IPv6 bool `toml:"ipv6"` // Whether to resolve addresses using ipv6 or not. Size *int `toml:"size"` // Packet size Conc int `toml:"concurrency"` // max concurrency coroutine + Method string `toml:"method"` // Method defines how to ping (native or exec) + Binary string `toml:"binary"` // Ping executable binary calcInterval time.Duration calcTimeout time.Duration sourceAddress string + + // host ping function + pingHost HostPinger + + Deadline int // Ping deadline, in seconds. 0 means no deadline. (ping -w ) } func (ins *Instance) Init() error { @@ -80,6 +94,15 @@ func (ins *Instance) Init() error { } } + if ins.Method == "" { + ins.Method = "native" + } + + if ins.Method == "exec" && ins.Binary == "" { + ins.Binary = "ping" + } + + ins.pingHost = hostPinger return nil } @@ -115,6 +138,9 @@ func (ins *Instance) Gather(slist *types.SampleList) { return } + if ins.DebugMod { + log.Println("D! ping method", ins.Method) + } wg := new(sync.WaitGroup) ch := make(chan struct{}, ins.Conc) for _, target := range ins.Targets { @@ -122,14 +148,19 @@ func (ins *Instance) Gather(slist *types.SampleList) { wg.Add(1) go func(target string) { defer wg.Done() - ins.gather(slist, target) + switch ins.Method { + case "exec": + ins.execGather(slist, target) + default: + ins.nativeGather(slist, target) + } <-ch }(target) } wg.Wait() } -func (ins *Instance) gather(slist *types.SampleList, target string) { +func (ins *Instance) nativeGather(slist *types.SampleList, target string) { if ins.DebugMod { log.Println("D! ping...", target) } @@ -244,3 +275,23 @@ func (ins *Instance) ping(destination string) (*pingStats, error) { return ps, nil } + +func hostPinger(binary string, timeout float64, args ...string) (string, error) { + bin, err := exec.LookPath(binary) + if err != nil { + return "", err + } + + var stdout bytes.Buffer + var stderr bytes.Buffer + cmd := exec.Command(bin, args...) + cmd.Stdout = &stdout + cmd.Stderr = &stderr + + err, to := cmdx.RunTimeout(cmd, time.Second*time.Duration(timeout+5)) + if to { + log.Printf("E! run command: %s timeout", strings.Join(cmd.Args, " ")) + return stderr.String(), errors.New("run command timeout") + } + return stdout.String(), err +} diff --git a/inputs/ping/ping_notwindows.go b/inputs/ping/ping_notwindows.go new file mode 100644 index 00000000..a141b4f7 --- /dev/null +++ b/inputs/ping/ping_notwindows.go @@ -0,0 +1,290 @@ +//go:build !windows + +package ping + +import ( + "errors" + "fmt" + "log" + "os/exec" + "regexp" + "runtime" + "strconv" + "strings" + "syscall" + + "flashcat.cloud/categraf/types" +) + +type roundTripTimeStats struct { + min float64 + avg float64 + max float64 + stddev float64 +} + +type statistics struct { + packetsTransmitted int + packetsReceived int + ttl int + roundTripTimeStats +} + +func (ins *Instance) execGather(slist *types.SampleList, target string) { + if ins.DebugMod { + log.Println("D! ping...", target) + } + + fields := map[string]interface{}{"result_code": 0} + labels := map[string]string{"target": target} + defer func() { + for field, value := range fields { + slist.PushFront(types.NewSample(inputName, field, value, labels)) + } + }() + + out, err := ins.pingHost(ins.Binary, 60.0, ins.args(target, runtime.GOOS)...) + if err != nil { + // Some implementations of ping return a non-zero exit code on + // timeout, if this occurs we will not exit and try to parse + // the output. + // Linux iputils-ping returns 1, BSD-derived ping returns 2. + status := -1 + var exitError *exec.ExitError + if errors.As(err, &exitError) { + if ws, ok := exitError.Sys().(syscall.WaitStatus); ok { + status = ws.ExitStatus() + fields["result_code"] = status + } + } + + var timeoutExitCode int + switch runtime.GOOS { + case "freebsd", "netbsd", "openbsd", "darwin": + timeoutExitCode = 2 + case "linux": + timeoutExitCode = 1 + default: + timeoutExitCode = 1 + } + + if status != timeoutExitCode { + // Combine go err + stderr output + out = strings.TrimSpace(out) + if len(out) > 0 { + log.Println(target, fmt.Errorf("%w - %s", err, out)) + } else { + log.Println(target, fmt.Errorf("%w", err)) + } + fields["result_code"] = 2 + return + } + } + + stats, err := processPingOutput(out) + if err != nil { + // fatal error + log.Println(target, fmt.Errorf("%w - %s", err, out)) + fields["result_code"] = 2 + return + } + + // Calculate packet loss percentage + percentPacketLoss := float64(stats.packetsTransmitted-stats.packetsReceived) / float64(stats.packetsTransmitted) * 100.0 + + fields["packets_transmitted"] = stats.packetsTransmitted + fields["packets_received"] = stats.packetsReceived + fields["percent_packet_loss"] = percentPacketLoss + if stats.ttl >= 0 { + fields["ttl"] = stats.ttl + } + if stats.min >= 0 { + fields["minimum_response_ms"] = stats.min + } + if stats.avg >= 0 { + fields["average_response_ms"] = stats.avg + } + if stats.max >= 0 { + fields["maximum_response_ms"] = stats.max + } + if stats.stddev >= 0 { + fields["standard_deviation_ms"] = stats.stddev + } +} + +// args returns the arguments for the 'ping' executable +func (ins *Instance) args(url string, system string) []string { + // build the ping command args based on toml config + args := []string{"-c", strconv.Itoa(ins.Count), "-n", "-s", "16"} + if ins.PingInterval > 0 { + args = append(args, "-i", strconv.FormatFloat(ins.PingInterval, 'f', -1, 64)) + } + if ins.Timeout > 0 { + switch system { + case "darwin": + args = append(args, "-W", strconv.FormatFloat(ins.Timeout*1000, 'f', -1, 64)) + case "freebsd": + if strings.Contains(ins.Binary, "ping6") && freeBSDMajorVersion() <= 12 { + args = append(args, "-x", strconv.FormatFloat(ins.Timeout*1000, 'f', -1, 64)) + } else { + args = append(args, "-W", strconv.FormatFloat(ins.Timeout*1000, 'f', -1, 64)) + } + case "netbsd", "openbsd": + args = append(args, "-W", strconv.FormatFloat(ins.Timeout*1000, 'f', -1, 64)) + case "linux": + args = append(args, "-W", strconv.FormatFloat(ins.Timeout, 'f', -1, 64)) + default: + // Not sure the best option here, just assume GNU ping? + args = append(args, "-W", strconv.FormatFloat(ins.Timeout, 'f', -1, 64)) + } + } + if ins.Deadline > 0 { + switch system { + case "freebsd": + if strings.Contains(ins.Binary, "ping6") && freeBSDMajorVersion() <= 12 { + args = append(args, "-X", strconv.Itoa(ins.Deadline)) + } else { + args = append(args, "-t", strconv.Itoa(ins.Deadline)) + } + case "darwin", "netbsd", "openbsd": + args = append(args, "-t", strconv.Itoa(ins.Deadline)) + case "linux": + args = append(args, "-w", strconv.Itoa(ins.Deadline)) + default: + // not sure the best option here, just assume gnu ping? + args = append(args, "-w", strconv.Itoa(ins.Deadline)) + } + } + if ins.Interface != "" { + switch system { + case "darwin": + args = append(args, "-I", ins.Interface) + case "freebsd", "netbsd", "openbsd": + args = append(args, "-S", ins.Interface) + case "linux": + args = append(args, "-I", ins.Interface) + default: + // not sure the best option here, just assume gnu ping? + args = append(args, "-i", ins.Interface) + } + } + args = append(args, url) + return args +} + +// processPingOutput takes in a string output from the ping command, like: +// +// ping www.google.com (173.194.115.84): 56 data bytes +// 64 bytes from 173.194.115.84: icmp_seq=0 ttl=54 time=52.172 ms +// 64 bytes from 173.194.115.84: icmp_seq=1 ttl=54 time=34.843 ms +// +// --- www.google.com ping statistics --- +// 2 packets transmitted, 2 packets received, 0.0% packet loss +// round-trip min/avg/max/stddev = 34.843/43.508/52.172/8.664 ms +// +// It returns (, , ) +func processPingOutput(out string) (statistics, error) { + stats := statistics{ + packetsTransmitted: 0, + packetsReceived: 0, + ttl: -1, + roundTripTimeStats: roundTripTimeStats{ + min: -1.0, + avg: -1.0, + max: -1.0, + stddev: -1.0, + }, + } + + // Set this error to nil if we find a 'transmitted' line + err := errors.New("fatal error processing ping output") + lines := strings.Split(out, "\n") + for _, line := range lines { + // Reading only first TTL, ignoring other TTL messages + if stats.ttl == -1 && (strings.Contains(line, "ttl=") || strings.Contains(line, "hlim=")) { + stats.ttl, err = getTTL(line) + } else if strings.Contains(line, "transmitted") && strings.Contains(line, "received") { + stats.packetsTransmitted, stats.packetsReceived, err = getPacketStats(line) + if err != nil { + return stats, err + } + } else if strings.Contains(line, "min/avg/max") { + stats.roundTripTimeStats, err = checkRoundTripTimeStats(line) + if err != nil { + return stats, err + } + } + } + return stats, err +} + +func getPacketStats(line string) (trans int, recv int, err error) { + recv = 0 + + stats := strings.Split(line, ", ") + // Transmitted packets + trans, err = strconv.Atoi(strings.Split(stats[0], " ")[0]) + if err != nil { + return trans, recv, err + } + // Received packets + recv, err = strconv.Atoi(strings.Split(stats[1], " ")[0]) + return trans, recv, err +} + +func getTTL(line string) (int, error) { + ttlLine := regexp.MustCompile(`(ttl|hlim)=(\d+)`) + ttlMatch := ttlLine.FindStringSubmatch(line) + return strconv.Atoi(ttlMatch[2]) +} + +func checkRoundTripTimeStats(line string) (roundTripTimeStats, error) { + roundTripTimeStats := roundTripTimeStats{ + min: -1.0, + avg: -1.0, + max: -1.0, + stddev: -1.0, + } + + stats := strings.Split(line, " ")[3] + data := strings.Split(stats, "/") + + var err error + roundTripTimeStats.min, err = strconv.ParseFloat(data[0], 64) + if err != nil { + return roundTripTimeStats, err + } + roundTripTimeStats.avg, err = strconv.ParseFloat(data[1], 64) + if err != nil { + return roundTripTimeStats, err + } + roundTripTimeStats.max, err = strconv.ParseFloat(data[2], 64) + if err != nil { + return roundTripTimeStats, err + } + if len(data) == 4 { + roundTripTimeStats.stddev, err = strconv.ParseFloat(data[3], 64) + if err != nil { + return roundTripTimeStats, err + } + } + return roundTripTimeStats, err +} + +// Due to different behavior in version of freebsd, get the major +// version number. In the event of an error we assume we return a low number +// to avoid changing behavior. +func freeBSDMajorVersion() int { + out, err := exec.Command("freebsd-version", "-u").Output() + if err != nil { + return -1 + } + + majorVersionStr := strings.Split(string(out), ".")[0] + majorVersion, err := strconv.Atoi(majorVersionStr) + if err != nil { + return -1 + } + + return majorVersion +} diff --git a/inputs/ping/ping_windows.go b/inputs/ping/ping_windows.go new file mode 100644 index 00000000..40364b0b --- /dev/null +++ b/inputs/ping/ping_windows.go @@ -0,0 +1,183 @@ +//go:build windows + +package ping + +import ( + "errors" + "fmt" + "log" + "regexp" + "strconv" + "strings" + + "flashcat.cloud/categraf/types" +) + +type roundTripTimeStats struct { + min int + avg int + max int +} + +type statistics struct { + packetsTransmitted int + replyReceived int + packetsReceived int + roundTripTimeStats +} + +func (ins *Instance) execGather(slist *types.SampleList, target string) { + if ins.DebugMod { + log.Println("D! ping...", target) + } + + fields := map[string]interface{}{"result_code": 0} + labels := map[string]string{"target": target} + defer func() { + for field, value := range fields { + slist.PushFront(types.NewSample(inputName, field, value, labels)) + } + }() + args := ins.args(target) + totalTimeout := 60.0 + totalTimeout = ins.timeout() * float64(ins.Count) + + out, err := ins.pingHost(ins.Binary, totalTimeout, args...) + // ping host return exitcode != 0 also when there was no response from host but command was executed successfully + var pendingError error + if err != nil { + // Combine go err + stderr output + pendingError = errors.New(strings.TrimSpace(out) + ", " + err.Error()) + } + stats, err := processPingOutput(out) + if err != nil { + // fatal error + if pendingError != nil { + log.Println(target, fmt.Errorf("%s: %w", target, pendingError)) + } else { + log.Println(target, fmt.Errorf("%s: %w", target, err)) + } + + fields["result_code"] = 2 + fields["errors"] = 100.0 + return + } + // Calculate packet loss percentage + lossReply := float64(stats.packetsTransmitted-stats.replyReceived) / float64(stats.packetsTransmitted) * 100.0 + lossPackets := float64(stats.packetsTransmitted-stats.packetsReceived) / float64(stats.packetsTransmitted) * 100.0 + + fields["packets_transmitted"] = stats.packetsTransmitted + fields["reply_received"] = stats.replyReceived + fields["packets_received"] = stats.packetsReceived + fields["percent_packet_loss"] = lossPackets + fields["percent_reply_loss"] = lossReply + if stats.avg >= 0 { + fields["average_response_ms"] = float64(stats.avg) + } + if stats.min >= 0 { + fields["minimum_response_ms"] = float64(stats.min) + } + if stats.max >= 0 { + fields["maximum_response_ms"] = float64(stats.max) + } +} + +// args returns the arguments for the 'ping' executable +func (ins *Instance) args(url string) []string { + args := []string{"-n", strconv.Itoa(ins.Count)} + + if ins.Timeout > 0 { + args = append(args, "-w", strconv.FormatFloat(ins.Timeout*1000, 'f', 0, 64)) + } + + args = append(args, url) + + return args +} + +// processPingOutput takes in a string output from the ping command +// based on linux implementation but using regex (multi-language support) +// It returns (, , , , , ) +func processPingOutput(out string) (statistics, error) { + // So find a line contain 3 numbers except reply lines + var statsLine, aproxs []string = nil, nil + err := errors.New("fatal error processing ping output") + stat := regexp.MustCompile(`=\W*(\d+)\D*=\W*(\d+)\D*=\W*(\d+)`) + aprox := regexp.MustCompile(`=\W*(\d+)\D*ms\D*=\W*(\d+)\D*ms\D*=\W*(\d+)\D*ms`) + tttLine := regexp.MustCompile(`TTL=\d+`) + lines := strings.Split(out, "\n") + var replyReceived = 0 + for _, line := range lines { + if tttLine.MatchString(line) { + replyReceived++ + } else { + if statsLine == nil { + statsLine = stat.FindStringSubmatch(line) + } + if statsLine != nil && aproxs == nil { + aproxs = aprox.FindStringSubmatch(line) + } + } + } + + stats := statistics{ + packetsTransmitted: 0, + replyReceived: 0, + packetsReceived: 0, + roundTripTimeStats: roundTripTimeStats{ + min: -1, + avg: -1, + max: -1, + }, + } + + // statsLine data should contain 4 members: entireExpression + ( Send, Receive, Lost ) + if len(statsLine) != 4 { + return stats, err + } + packetsTransmitted, err := strconv.Atoi(statsLine[1]) + if err != nil { + return stats, err + } + packetsReceived, err := strconv.Atoi(statsLine[2]) + if err != nil { + return stats, err + } + + stats.packetsTransmitted = packetsTransmitted + stats.replyReceived = replyReceived + stats.packetsReceived = packetsReceived + + // aproxs data should contain 4 members: entireExpression + ( min, max, avg ) + if len(aproxs) != 4 { + return stats, err + } + min, err := strconv.Atoi(aproxs[1]) + if err != nil { + return stats, err + } + max, err := strconv.Atoi(aproxs[2]) + if err != nil { + return stats, err + } + avg, err := strconv.Atoi(aproxs[3]) + if err != nil { + return statistics{}, err + } + + stats.avg = avg + stats.min = min + stats.max = max + + return stats, err +} + +func (ins *Instance) timeout() float64 { + // According to MSDN, default ping timeout for windows is 4 second + // Add also one second interval + + if ins.Timeout > 0 { + return ins.Timeout + 1 + } + return 4 + 1 +}