Skip to content

Commit

Permalink
Merge pull request #203 from cloudradar-monitoring/storcli-module-mon…
Browse files Browse the repository at this point in the history
…itoring

Add storcli RAID monitoring support
  • Loading branch information
nikita-vanyasin authored Sep 17, 2019
2 parents 76a7fd5 + b81806e commit f89792d
Show file tree
Hide file tree
Showing 15 changed files with 4,448 additions and 3 deletions.
17 changes: 14 additions & 3 deletions config.go
Original file line number Diff line number Diff line change
Expand Up @@ -102,16 +102,18 @@ type Config struct {

DiscoverAutostartingServicesOnly bool `toml:"discover_autostarting_services_only" comment:"default true"`

CPUUtilisationAnalysis CPUUtilisationAnalysis `toml:"cpu_utilisation_analysis"`
CPUUtilisationAnalysis CPUUtilisationAnalysisConfig `toml:"cpu_utilisation_analysis"`

TemperatureMonitoring bool `toml:"temperature_monitoring" comment:"default true"`

SMARTMonitoring bool `toml:"smart_monitoring" comment:"Enable S.M.A.R.T monitoring of hard disks\ndefault false"`
SMARTCtl string `toml:"smartctl" comment:"Path to a smartctl binary (smartctl.exe on windows, path must be escaped) version >= 7\nSee https://docs.cloudradar.io/configuring-hosts/installing-agents/troubleshoot-s.m.a.r.t-monitoring\nsmartctl = \"C:\\\\Program Files\\\\smartmontools\\\\bin\\\\smartctl.exe\"\nsmartctl = \"/usr/local/bin/smartctl\""`
Logs LogsFilesConfig `toml:"logs,omitempty"`

StorCLI StorCLIConfig `toml:"storcli,omitempty" comment:"Enable monitoring of hardware health for MegaRaids\nreported by the storcli command-line tool\nRefer to https://docs.cloudradar.io/cagent/modules#storcli\nOn Linux make sure a sudo rule exists. The storcli command is always executed via sudo. Example:\ncagent ALL= NOPASSWD: /opt/MegaRAID/storcli/storcli64 /c[0-9] show all J"`
}

type CPUUtilisationAnalysis struct {
type CPUUtilisationAnalysisConfig struct {
Threshold float64 `toml:"threshold" comment:"target value to start the analysis" json:"threshold"`
Function string `toml:"function" comment:"threshold compare function, possible values: 'lt', 'lte', 'gt', 'gte'" json:"function"`
Metric string `toml:"metric" commend:"possible values: 'user','system','idle','iowait'" json:"metric"`
Expand All @@ -120,6 +122,11 @@ type CPUUtilisationAnalysis struct {
TrailingProcessAnalysisMinutes int `toml:"trailing_process_analysis_minutes" comment:"how much time analysis will continue to perform after the CPU utilisation returns to the normal value" json:"trailing_process_analysis_minutes"`
}

// StorCLIConfig holds the settings of the storcli RAID monitoring module.
// It is read from the "storcli" table of the TOML configuration.
type StorCLIConfig struct {
// BinaryPath is the path to the storcli executable (TOML key "binary").
// NewConfig leaves it empty by default.
// NOTE(review): presumably an empty path keeps the module disabled — confirm in the storcli package.
BinaryPath string `toml:"binary" comment:"Enable on Windows:\n binary = 'C:\\\\Program Files\\\\storcli\\\\storcli64.exe'\nEnable on Linux:\n binary = '/opt/storcli/sbin/storcli64'"`
// ControllerList holds the controller numbers to monitor (TOML key "controllers").
// NewConfig defaults it to [0].
ControllerList []uint `toml:"controllers" comment:"controllers to monitor, comma separated. default: [0] (monitor only controller c0)"`
}

func init() {
ex, err := os.Executable()
if err != nil {
Expand Down Expand Up @@ -164,7 +171,7 @@ func NewConfig() *Config {
SystemFields: []string{"uname", "os_kernel", "os_family", "os_arch", "cpu_model", "fqdn", "memory_total_B"},
HardwareInventory: true,
DiscoverAutostartingServicesOnly: true,
CPUUtilisationAnalysis: CPUUtilisationAnalysis{
CPUUtilisationAnalysis: CPUUtilisationAnalysisConfig{
Threshold: 10,
Function: "lt",
Metric: "idle",
Expand All @@ -177,6 +184,10 @@ func NewConfig() *Config {
Logs: LogsFilesConfig{
HubFile: "",
},
StorCLI: StorCLIConfig{
BinaryPath: "",
ControllerList: []uint{0},
},
}

cfg.MinValuableConfig = *(defaultMinValuableConfig())
Expand Down
12 changes: 12 additions & 0 deletions example.config.toml
Original file line number Diff line number Diff line change
Expand Up @@ -69,3 +69,15 @@ temperature_monitoring = true # default true
report_processes = 5 # number of processes to return
trailing_process_analysis_minutes = 5 # how much time analysis will continue to perform after the CPU utilisation returns to the normal value

# Enable monitoring of hardware health for MegaRaids
# reported by the storcli command-line tool
# Refer to https://docs.cloudradar.io/cagent/modules#storcli
# On Linux make sure a sudo rule exists. The storcli command is always executed via sudo. Example sudo rule:
# cagent ALL= NOPASSWD: /opt/MegaRAID/storcli/storcli64 /c[0-9] show all J
[storcli]
# Enable on Windows (use double backslashes):
# binary = "C:\\Program Files\\storcli\\storcli64.exe"
# Enable on Linux:
binary = "/opt/storcli/sbin/storcli64"
# controllers list to monitor:
controllers = [0, 1]
4 changes: 4 additions & 0 deletions handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,10 @@ func (ca *Cagent) CollectMeasurements(full bool) (common.MeasurementsMap, error)
measurements = measurements.AddWithPrefix("temperatures.", common.MeasurementsMap{"list": temperatures})
}

modules, err := ca.collectModulesMeasurements()
errCollector.Add(err)
measurements = measurements.AddWithPrefix("", common.MeasurementsMap{"modules": modules})

smartMeas := ca.getSMARTMeasurements()
if len(smartMeas) > 0 {
measurements = measurements.AddInnerWithPrefix("smartmon", smartMeas)
Expand Down
55 changes: 55 additions & 0 deletions modules.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
package cagent

import (
"github.com/pkg/errors"
"github.com/sirupsen/logrus"

"github.com/cloudradar-monitoring/cagent/pkg/common"
"github.com/cloudradar-monitoring/cagent/pkg/monitoring/storcli"
)

// collectModulesMeasurements runs every enabled storcli module and gathers
// one result map per module that ran successfully.
//
// Each result map always carries the keys "name", "command executed" and
// "measurements"; "message", "alerts" and "warnings" are only added when the
// module reports a non-empty value for them.
//
// A module whose Run fails is logged at debug level, recorded in the error
// collector and left out of the result. The returned error combines all such
// failures.
func (ca *Cagent) collectModulesMeasurements() ([]map[string]interface{}, error) {
	var (
		collected []map[string]interface{}
		failures  common.ErrorCollector
	)

	storCfg := ca.Config.StorCLI
	for _, mod := range storcli.CreateModules(storCfg.BinaryPath, storCfg.ControllerList) {
		if !mod.IsEnabled() {
			continue
		}

		if runErr := mod.Run(); runErr != nil {
			wrapped := errors.Wrapf(runErr, "while executing module '%s'", mod.GetName())
			logrus.WithError(wrapped).Debug()
			failures.Add(wrapped)
			continue
		}

		entry := map[string]interface{}{
			"name":             mod.GetName(),
			"command executed": mod.GetExecutedCommand(),
			"measurements":     mod.GetMeasurements(),
		}

		// Optional fields are reported only when they carry data.
		if message := mod.GetMessage(); len(message) > 0 {
			entry["message"] = message
		}
		if moduleAlerts := mod.GetAlerts(); len(moduleAlerts) > 0 {
			entry["alerts"] = moduleAlerts
		}
		if moduleWarnings := mod.GetWarnings(); len(moduleWarnings) > 0 {
			entry["warnings"] = moduleWarnings
		}

		collected = append(collected, entry)
	}

	return collected, errors.Wrap(failures.Combine(), "while collecting modules measurements")
}
15 changes: 15 additions & 0 deletions pkg/monitoring/module.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
package monitoring

// Alert is a textual alert message reported by a monitoring module.
type Alert string

// Warning is a textual warning message reported by a monitoring module.
type Warning string

// Module is the contract a monitoring module exposes to the agent.
// The agent skips modules that are not enabled, calls Run on the others and,
// when Run succeeds, reads the remaining getters to build the module report.
type Module interface {
// IsEnabled reports whether the module should be run at all.
IsEnabled() bool
// Run executes the module; a non-nil error means no results are reported for it.
Run() error
// GetName returns the name used to identify the module in reports and errors.
GetName() string
// GetExecutedCommand returns the command reported under "command executed".
// NOTE(review): presumably the external command line that Run invoked — confirm in the implementations.
GetExecutedCommand() string
// GetAlerts returns the alerts raised by the module; may be empty.
GetAlerts() []Alert
// GetWarnings returns the warnings raised by the module; may be empty.
GetWarnings() []Warning
// GetMessage returns an optional message; may be empty.
GetMessage() string
// GetMeasurements returns the measurements collected by the module.
GetMeasurements() map[string]interface{}
}
181 changes: 181 additions & 0 deletions pkg/monitoring/storcli/parser.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
package storcli

import (
"encoding/json"
"fmt"
"strconv"

"github.com/pkg/errors"

"github.com/cloudradar-monitoring/cagent/pkg/common"
"github.com/cloudradar-monitoring/cagent/pkg/monitoring"
)

// controllersResult mirrors the top level of the JSON document printed by
// storcli: a "Controllers" array whose entries each carry a "Response Data" object.
type controllersResult struct {
Controllers []struct {
ResponseData controllerResponseData `json:"Response Data"`
} `json:"Controllers"`
}

// controllerResponseData holds the parts of a controller's "Response Data"
// object that the parser inspects: the controller status section and the
// virtual/physical drive lists.
type controllerResponseData struct {
// we can't parse some nested fields due to inconsistency (same field can have different types)
// so we use json.RawMessage

// Status is the "Status" section; its "Controller Status" entry is checked by the parser.
Status map[string]*json.RawMessage `json:"Status"`
// VirtualDrives is the "VD LIST" section, one map per virtual drive.
VirtualDrives []map[string]*json.RawMessage `json:"VD LIST"`
// PhysicalDrives is the "PD LIST" section, one map per physical drive.
PhysicalDrives []map[string]*json.RawMessage `json:"PD LIST"`
}

// Values that the parser treats as healthy; any other value raises an alert.
const (
	// statusOptimal is the expected "Controller Status" value.
	statusOptimal = "Optimal"
	// stateOperational is the expected "State" value of a virtual drive.
	stateOperational = "Optl"
)

// physDriveStatesMap translates the abbreviated physical drive state codes
// found in the storcli output into human-readable descriptions.
var physDriveStatesMap = map[string]string{
	// States that are reported as a warning.
	"UBad":   "Unconfigured Bad",
	"Offln":  "Offline",
	"UBUnsp": "UBad Unsupported",
	"UGUnsp": "Unsupported",

	// States that are considered healthy (see goodPhysicalDriveStates).
	"Onln":  "Online",
	"UGood": "Unconfigured Good",
	"GHS":   "Global Hot Spare",
	"DHS":   "Dedicated Hot Spare",
	"JBOD":  "Just a Bunch of Disks",
}

// goodPhysicalDriveStates lists the state codes for which no warning is
// raised; a physical drive in any other state produces a warning.
var goodPhysicalDriveStates = []string{"Onln", "UGood", "GHS", "DHS", "JBOD"}

// getHumanReadablePhysDriveState returns the description registered in
// physDriveStatesMap for the given state code. Unknown codes are returned
// unchanged.
func getHumanReadablePhysDriveState(state string) string {
	if description, known := physDriveStatesMap[state]; known {
		return description
	}
	return state
}

// tryParseCmdOutput decodes the JSON output of storcli and derives
// measurements, alerts and warnings from it.
//
// Only the first entry of the "Controllers" array is inspected:
//   - measurements receives the controller's "Status" section under the key "Status";
//   - an alert is raised when "Controller Status" is not statusOptimal;
//   - an alert is raised for every virtual drive whose "State" is not stateOperational;
//   - a warning is raised for every physical drive whose "State" is not listed
//     in goodPhysicalDriveStates.
//
// An error is returned when the JSON cannot be decoded, lists no controllers,
// or lacks one of the required fields. When a required field is missing, the
// measurements, alerts and warnings gathered up to that point are returned
// together with the error.
func tryParseCmdOutput(outBytes *[]byte) (
	measurements map[string]interface{},
	alerts []monitoring.Alert,
	warnings []monitoring.Warning,
	err error,
) {
	var parsed controllersResult
	if unmarshalErr := json.Unmarshal(*outBytes, &parsed); unmarshalErr != nil {
		return nil, nil, nil, errors.Wrap(unmarshalErr, "while Unmarshal storcli cmd output")
	}
	if len(parsed.Controllers) == 0 {
		return nil, nil, nil, errors.New("unexpected json: no controllers listed")
	}

	data := parsed.Controllers[0].ResponseData
	measurements = map[string]interface{}{"Status": data.Status}

	// Controller level: anything but "Optimal" is an alert.
	controllerStatus, statusErr := extractFieldFromRawMap(&data.Status, "Controller Status")
	if statusErr != nil {
		return measurements, nil, nil, statusErr
	}
	if controllerStatus != statusOptimal {
		alerts = append(alerts, monitoring.Alert(fmt.Sprintf("Controller status not optimal (%s)", controllerStatus)))
	}

	// Virtual drives: every non-operational drive is an alert.
	for _, drive := range data.VirtualDrives {
		state, stateErr := extractFieldFromRawMap(&drive, "State")
		if stateErr != nil {
			return measurements, alerts, nil, stateErr
		}
		if state == stateOperational {
			continue
		}

		group, groupErr := extractFieldFromRawMap(&drive, "DG/VD")
		if groupErr != nil {
			return measurements, alerts, nil, groupErr
		}
		raidType, typeErr := extractFieldFromRawMap(&drive, "TYPE")
		if typeErr != nil {
			return measurements, alerts, nil, typeErr
		}

		message := fmt.Sprintf("DG/VD %s %s State not operational (%s)", group, raidType, state)
		alerts = append(alerts, monitoring.Alert(message))
	}

	// Physical drives: every drive outside the known-good states is a warning.
	detailKeys := []string{"DID", "EID:Slt", "Intf", "Med"}
	for _, drive := range data.PhysicalDrives {
		state, stateErr := extractFieldFromRawMap(&drive, "State")
		if stateErr != nil {
			return measurements, alerts, warnings, stateErr
		}
		if common.StrInSlice(state, goodPhysicalDriveStates) {
			continue
		}

		// Collect the format arguments in the order the message expects them.
		details := make([]interface{}, 0, len(detailKeys)+2)
		for _, key := range detailKeys {
			value, fieldErr := extractFieldFromRawMap(&drive, key)
			if fieldErr != nil {
				return measurements, alerts, warnings, fieldErr
			}
			details = append(details, value)
		}
		details = append(details, getHumanReadablePhysDriveState(state), state)

		message := fmt.Sprintf("Physical device %s (%s) %s %s state is %s (%s)", details...)
		warnings = append(warnings, monitoring.Warning(message))
	}

	return measurements, alerts, warnings, nil
}

// extractFieldFromRawMap reads the field key from raw and returns it as a string.
//
// The value is first decoded as a JSON string; if that fails it is decoded as
// an integer and converted to its decimal representation, because storcli
// reports some fields with inconsistent types.
//
// An error is returned when the field is absent, when its value is JSON null,
// or when it is neither a string nor an integer. In the last case the error
// wraps the string decoding failure.
func extractFieldFromRawMap(raw *map[string]*json.RawMessage, key string) (string, error) {
	rawValue, exists := (*raw)[key]
	if !exists {
		return "", fmt.Errorf("unexpected json: %s field is not present", key)
	}
	// encoding/json stores a JSON null as a nil *json.RawMessage; dereferencing
	// it below would panic, so report it as malformed input instead.
	if rawValue == nil {
		return "", fmt.Errorf("unexpected json: %s field is null", key)
	}

	var strValue string
	strUnmarshalErr := json.Unmarshal(*rawValue, &strValue)
	if strUnmarshalErr != nil {
		// Not a string: fall back to an integer value.
		var intValue int
		err := json.Unmarshal(*rawValue, &intValue)
		if err != nil {
			return "", errors.Wrapf(strUnmarshalErr, "while retrieving %s from json", key)
		}
		strValue = strconv.Itoa(intValue)
	}

	return strValue, nil
}
65 changes: 65 additions & 0 deletions pkg/monitoring/storcli/parser_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
package storcli

import (
"io/ioutil"
"path/filepath"
"testing"

"github.com/stretchr/testify/assert"
)

// helperLoadTestData reads the named fixture from the "testdata" directory
// and returns its content. The test is aborted via t.Fatal when the file
// cannot be read.
func helperLoadTestData(t *testing.T, fileName string) []byte {
	// Mark as helper so failures are attributed to the calling test line.
	t.Helper()

	path := filepath.Join("testdata", fileName)
	result, err := ioutil.ReadFile(path)
	if err != nil {
		t.Fatal(err)
	}
	return result
}

// TestTryParseCmdOutput feeds recorded storcli outputs to tryParseCmdOutput
// and checks, per fixture, whether alerts and/or warnings are expected.
// Every fixture must parse without error and expose a "Status" measurement.
func TestTryParseCmdOutput(t *testing.T) {
	cases := []struct {
		name         string
		fixture      string
		wantAlerts   bool
		wantWarnings bool
	}{
		{name: "all-good-output", fixture: "output_allgood.json"},
		{name: "all-good-output-ubuntu", fixture: "output_allgood_ubuntu.json"},
		{name: "non-optimal-output", fixture: "output_nonoptimal.json", wantAlerts: true},
		{name: "virtual-drive-bad-output", fixture: "output_vdbad.json", wantAlerts: true},
		{name: "hard-drive-bad-output", fixture: "output_hdbad.json", wantWarnings: true},
	}

	for _, tc := range cases {
		tc := tc
		t.Run(tc.name, func(t *testing.T) {
			raw := helperLoadTestData(t, tc.fixture)
			measurements, alerts, warnings, err := tryParseCmdOutput(&raw)

			assert.NoError(t, err)
			assert.NotNil(t, measurements["Status"])

			if tc.wantAlerts {
				assert.NotEmpty(t, alerts)
			} else {
				assert.Empty(t, alerts)
			}

			if tc.wantWarnings {
				assert.NotEmpty(t, warnings)
			} else {
				assert.Empty(t, warnings)
			}
		})
	}
}
Loading

0 comments on commit f89792d

Please sign in to comment.