Skip to content

Commit

Permalink
[receiver/hostmetrics] Collect cpu utilization metric (#7130)
Browse files Browse the repository at this point in the history
* collect cpu utilization metric in host metrics receiver

* record utilization on calculation

* hostmetricsreceiver: system.cpu.utilization disabled by default

* Move calculator to dedicated pkg. Avoid redundant code

* cpu utilization disabled by default. updated changelog

* Fix changelog. Fix pkg rename and tests

* Update CHANGELOG.md

Co-authored-by: Dmitrii Anoshin <[email protected]>
  • Loading branch information
rubenruizdegauna and dmitryax authored Mar 7, 2022
1 parent 82b0394 commit 92bcf6a
Show file tree
Hide file tree
Showing 10 changed files with 690 additions and 9 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
- `signalfxexporter`: Add validation for `sending_queue` setting (#8026)
- `internal/stanza`: Add support for arbitrary attribute types (#8081)
- `resourcedetectionprocessor`: Add confighttp.HTTPClientSettings To Resource Detection Config Fixes (#7397)
- `hostmetricsreceiver`: Add cpu.utilization metrics to cpu scraper (#7130)
- `honeycombexporter`: Add validation for `sending_queue` setting (#8113)
- `routingprocessor`: Expand error handling on failure to build exporters (#8125)
- `skywalkingreceiver`: Add new skywalking receiver component folder and structure (#8107)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,23 +25,26 @@ import (
"go.opentelemetry.io/collector/receiver/scrapererror"

"github.com/open-telemetry/opentelemetry-collector-contrib/receiver/hostmetricsreceiver/internal/scraper/cpuscraper/internal/metadata"
"github.com/open-telemetry/opentelemetry-collector-contrib/receiver/hostmetricsreceiver/internal/scraper/cpuscraper/ucal"
)

// metricsLen is the number of metrics this scraper can produce
// (system.cpu.time and system.cpu.utilization). It is passed to
// scrapererror.NewPartialScrapeError as the failed-metric count when a scrape
// cannot complete.
const metricsLen = 2

// scraper collects CPU metrics for the host metrics receiver.
type scraper struct {
// config is the scraper configuration handed to newCPUScraper.
config *Config
// mb records metric data points and emits them at the end of a scrape.
mb *metadata.MetricsBuilder
// ucal derives CPU utilization from the scraped CPU times and passes each
// result to a recorder callback.
ucal *ucal.CPUUtilizationCalculator

// The function fields below exist so tests can swap in mocks.

// bootTime defaults to host.BootTime.
bootTime func() (uint64, error)
// times defaults to cpu.Times; scrape calls it with percpu=true.
times func(bool) ([]cpu.TimesStat, error)
// now defaults to time.Now and supplies the timestamp used for a scrape.
now func() time.Time
}

// newCPUScraper creates a scraper for CPU related metrics.
//
// The returned scraper is wired to the real host.BootTime, cpu.Times and
// time.Now functions and owns a fresh CPU utilization calculator. Both ucal
// and now have to be set here: scrape calls s.now() and
// s.ucal.CalculateAndRecord, so leaving either nil would panic at scrape time.
func newCPUScraper(_ context.Context, cfg *Config) *scraper {
	return &scraper{
		config:   cfg,
		bootTime: host.BootTime,
		times:    cpu.Times,
		ucal:     &ucal.CPUUtilizationCalculator{},
		now:      time.Now,
	}
}

func (s *scraper) start(context.Context, component.Host) error {
Expand All @@ -57,7 +60,7 @@ func (s *scraper) scrape(_ context.Context) (pdata.Metrics, error) {
md := pdata.NewMetrics()
metrics := md.ResourceMetrics().AppendEmpty().InstrumentationLibraryMetrics().AppendEmpty().Metrics()

now := pdata.NewTimestampFromTime(time.Now())
now := pdata.NewTimestampFromTime(s.now())
cpuTimes, err := s.times( /*percpu=*/ true)
if err != nil {
return md, scrapererror.NewPartialScrapeError(err, metricsLen)
Expand All @@ -66,6 +69,12 @@ func (s *scraper) scrape(_ context.Context) (pdata.Metrics, error) {
for _, cpuTime := range cpuTimes {
s.recordCPUTimeStateDataPoints(now, cpuTime)
}

err = s.ucal.CalculateAndRecord(now, cpuTimes, s.recordCPUUtilization)
if err != nil {
return md, scrapererror.NewPartialScrapeError(err, metricsLen)
}

s.mb.Emit(metrics)
return md, nil
}
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
"go.opentelemetry.io/collector/model/pdata"

"github.com/open-telemetry/opentelemetry-collector-contrib/receiver/hostmetricsreceiver/internal/scraper/cpuscraper/internal/metadata"
"github.com/open-telemetry/opentelemetry-collector-contrib/receiver/hostmetricsreceiver/internal/scraper/cpuscraper/ucal"
)

func (s *scraper) recordCPUTimeStateDataPoints(now pdata.Timestamp, cpuTime cpu.TimesStat) {
Expand All @@ -34,3 +35,14 @@ func (s *scraper) recordCPUTimeStateDataPoints(now pdata.Timestamp, cpuTime cpu.
s.mb.RecordSystemCPUTimeDataPoint(now, cpuTime.Steal, cpuTime.CPU, metadata.AttributeState.Steal)
s.mb.RecordSystemCPUTimeDataPoint(now, cpuTime.Iowait, cpuTime.CPU, metadata.AttributeState.Wait)
}

// recordCPUUtilization records one CPU utilization data point per state
// (user, system, idle, interrupt, nice, softirq, steal and wait) for the CPU
// named by cpuUtilization.CPU. Every data point carries the timestamp now.
func (s *scraper) recordCPUUtilization(now pdata.Timestamp, cpuUtilization ucal.CPUUtilization) {
	builder := s.mb
	cpuName := cpuUtilization.CPU
	state := metadata.AttributeState

	// The recording order is kept stable because tests address data points by index.
	builder.RecordSystemCPUUtilizationDataPoint(now, cpuUtilization.User, cpuName, state.User)
	builder.RecordSystemCPUUtilizationDataPoint(now, cpuUtilization.System, cpuName, state.System)
	builder.RecordSystemCPUUtilizationDataPoint(now, cpuUtilization.Idle, cpuName, state.Idle)
	builder.RecordSystemCPUUtilizationDataPoint(now, cpuUtilization.Irq, cpuName, state.Interrupt)
	builder.RecordSystemCPUUtilizationDataPoint(now, cpuUtilization.Nice, cpuName, state.Nice)
	builder.RecordSystemCPUUtilizationDataPoint(now, cpuUtilization.Softirq, cpuName, state.Softirq)
	builder.RecordSystemCPUUtilizationDataPoint(now, cpuUtilization.Steal, cpuName, state.Steal)
	builder.RecordSystemCPUUtilizationDataPoint(now, cpuUtilization.Iowait, cpuName, state.Wait)
}
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
"go.opentelemetry.io/collector/model/pdata"

"github.com/open-telemetry/opentelemetry-collector-contrib/receiver/hostmetricsreceiver/internal/scraper/cpuscraper/internal/metadata"
"github.com/open-telemetry/opentelemetry-collector-contrib/receiver/hostmetricsreceiver/internal/scraper/cpuscraper/ucal"
)

func (s *scraper) recordCPUTimeStateDataPoints(now pdata.Timestamp, cpuTime cpu.TimesStat) {
Expand All @@ -30,3 +31,10 @@ func (s *scraper) recordCPUTimeStateDataPoints(now pdata.Timestamp, cpuTime cpu.
s.mb.RecordSystemCPUTimeDataPoint(now, cpuTime.Idle, cpuTime.CPU, metadata.AttributeState.Idle)
s.mb.RecordSystemCPUTimeDataPoint(now, cpuTime.Irq, cpuTime.CPU, metadata.AttributeState.Interrupt)
}

// recordCPUUtilization records one CPU utilization data point per state
// (user, system, idle and interrupt) for the CPU named by
// cpuUtilization.CPU. Every data point carries the timestamp now.
func (s *scraper) recordCPUUtilization(now pdata.Timestamp, cpuUtilization ucal.CPUUtilization) {
	builder := s.mb
	cpuName := cpuUtilization.CPU
	state := metadata.AttributeState

	// The recording order is kept stable because tests address data points by index.
	builder.RecordSystemCPUUtilizationDataPoint(now, cpuUtilization.User, cpuName, state.User)
	builder.RecordSystemCPUUtilizationDataPoint(now, cpuUtilization.System, cpuName, state.System)
	builder.RecordSystemCPUUtilizationDataPoint(now, cpuUtilization.Idle, cpuName, state.Idle)
	builder.RecordSystemCPUUtilizationDataPoint(now, cpuUtilization.Irq, cpuName, state.Interrupt)
}
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"errors"
"runtime"
"testing"
"time"

"github.com/shirou/gopsutil/v3/cpu"
"github.com/stretchr/testify/assert"
Expand Down Expand Up @@ -105,7 +106,7 @@ func TestScrape(t *testing.T) {
isPartial := scrapererror.IsPartialScrapeError(err)
assert.True(t, isPartial)
if isPartial {
assert.Equal(t, 1, err.(scrapererror.PartialScrapeError).Failed)
assert.Equal(t, 2, err.(scrapererror.PartialScrapeError).Failed)
}

return
Expand All @@ -128,6 +129,216 @@ func TestScrape(t *testing.T) {
}
}

// TestScrape_CpuUtilization checks which metrics are emitted for each
// combination of enabled metrics. Utilization needs two scrapes so there is a
// difference to compute; every case therefore scrapes twice and asserts only
// on the result of the second scrape.
func TestScrape_CpuUtilization(t *testing.T) {
	type testCase struct {
		name                string
		metricsConfig       metadata.MetricsSettings
		expectedMetricCount int
		times               bool
		utilization         bool
		utilizationIndex    int
	}

	testCases := []testCase{
		{
			name:                "Standard",
			metricsConfig:       metadata.DefaultMetricsSettings(),
			expectedMetricCount: 1,
			times:               true,
			utilization:         false,
		},
		{
			name:                "SystemCPUTime metric is disabled",
			times:               false,
			utilization:         true,
			expectedMetricCount: 1,
		},
		{
			name:                "all metrics are enabled",
			times:               true,
			utilization:         true,
			expectedMetricCount: 2,
			utilizationIndex:    1,
		},
		{
			name:                "all metrics are disabled",
			times:               false,
			utilization:         false,
			expectedMetricCount: 0,
		},
	}

	for _, tc := range testCases {
		tc := tc
		t.Run(tc.name, func(t *testing.T) {
			t.Parallel()

			// Cases that leave metricsConfig at its zero value derive their
			// settings from the times/utilization flags instead.
			settings := tc.metricsConfig
			if settings == (metadata.MetricsSettings{}) {
				settings = metadata.MetricsSettings{
					SystemCPUTime:        metadata.MetricSettings{Enabled: tc.times},
					SystemCPUUtilization: metadata.MetricSettings{Enabled: tc.utilization},
				}
			}

			ctx := context.Background()
			cpuScraper := newCPUScraper(ctx, &Config{Metrics: settings})
			err := cpuScraper.start(ctx, componenttest.NewNopHost())
			require.NoError(t, err, "Failed to initialize cpu scraper: %v", err)

			// The first result is discarded; the second scrape is the one
			// that triggers the utilization calculation.
			_, err = cpuScraper.scrape(ctx)
			require.NoError(t, err, "Failed to scrape metrics: %v", err)
			md, err := cpuScraper.scrape(ctx)
			require.NoError(t, err, "Failed to scrape metrics: %v", err)

			assert.Equal(t, tc.expectedMetricCount, md.MetricCount())
			metrics := md.ResourceMetrics().At(0).InstrumentationLibraryMetrics().At(0).Metrics()
			internal.AssertSameTimeStampForAllMetrics(t, metrics)

			onLinux := runtime.GOOS == "linux"
			if tc.times {
				// When enabled, system.cpu.time is always the first metric.
				timesMetric := metrics.At(0)
				assertCPUMetricValid(t, timesMetric, 0)
				if onLinux {
					assertCPUMetricHasLinuxSpecificStateLabels(t, timesMetric)
				}
			}
			if tc.utilization {
				utilizationMetric := metrics.At(tc.utilizationIndex)
				assertCPUUtilizationMetricValid(t, utilizationMetric, 0)
				if onLinux {
					assertCPUUtilizationMetricHasLinuxSpecificStateLabels(t, utilizationMetric)
				}
			}
		})
	}
}

// TestScrape_CpuUtilizationError asserts that the second of two scrapes
// taken with a frozen clock returns a partial scrape error and no metrics.
func TestScrape_CpuUtilizationError(t *testing.T) {
	ctx := context.Background()
	cpuScraper := newCPUScraper(ctx, &Config{Metrics: metadata.DefaultMetricsSettings()})
	err := cpuScraper.start(ctx, componenttest.NewNopHost())
	require.NoError(t, err, "Failed to initialize cpu scraper: %v", err)

	// Freeze the clock: every call yields the same instant.
	// NOTE(review): the literal is not in RFC3339 layout (no 'T' separator or
	// zone), so time.Parse presumably fails and the zero time.Time is what
	// gets returned; the discarded error hides this. Presumably the unchanged
	// timestamp is what makes the utilization calculation fail — confirm
	// against the ucal package before changing the literal.
	cpuScraper.now = func() time.Time {
		frozen, _ := time.Parse(time.RFC3339, "2021-12-21 00:23:21")
		return frozen
	}

	_, err = cpuScraper.scrape(ctx)
	require.NoError(t, err, "Failed to scrape metrics: %v", err)

	// The second scrape is the one that triggers the utilization calculation.
	md, err := cpuScraper.scrape(ctx)
	var partialScrapeErr scrapererror.PartialScrapeError
	assert.ErrorAs(t, err, &partialScrapeErr)
	assert.Equal(t, 0, md.MetricCount())
}

// TestScrape_CpuUtilizationStandard feeds the scraper three mocked CPU time
// readings taken five seconds apart and checks the utilization data points of
// each scrape. The expected values equal the change in each state's CPU time
// since the previous reading divided by those five seconds; the first scrape
// has no previous reading and must emit nothing.
func TestScrape_CpuUtilizationStandard(t *testing.T) {
	metricSettings := metadata.MetricsSettings{
		SystemCPUUtilization: metadata.MetricSettings{
			Enabled: true,
		},
	}

	// dpData describes one expected data point: its value and string attributes.
	type dpData struct {
		val   float64
		attrs map[string]string
	}

	scrapesData := []struct {
		times       []cpu.TimesStat
		scrapeTime  string
		expectedDps []dpData
	}{
		{
			times:       []cpu.TimesStat{{CPU: "cpu0", User: 1.5, System: 2.7, Idle: 0.8}, {CPU: "cpu1", User: 2, System: 3, Idle: 1}},
			scrapeTime:  "2006-01-02T15:04:05Z",
			expectedDps: []dpData{},
		},
		{
			times:      []cpu.TimesStat{{CPU: "cpu0", User: 2.8, System: 3.9, Idle: 3.3}, {CPU: "cpu1", User: 3.2, System: 5.2, Idle: 2.6}},
			scrapeTime: "2006-01-02T15:04:10Z",
			expectedDps: []dpData{
				{val: 0.26, attrs: map[string]string{"cpu": "cpu0", "state": "user"}},
				{val: 0.24, attrs: map[string]string{"cpu": "cpu0", "state": "system"}},
				{val: 0.5, attrs: map[string]string{"cpu": "cpu0", "state": "idle"}},
				{val: 0.24, attrs: map[string]string{"cpu": "cpu1", "state": "user"}},
				{val: 0.44, attrs: map[string]string{"cpu": "cpu1", "state": "system"}},
				{val: 0.32, attrs: map[string]string{"cpu": "cpu1", "state": "idle"}},
			},
		},
		{
			times:      []cpu.TimesStat{{CPU: "cpu0", User: 3.4, System: 5.3, Idle: 6.3}, {CPU: "cpu1", User: 3.7, System: 7.1, Idle: 5.2}},
			scrapeTime: "2006-01-02T15:04:15Z",
			expectedDps: []dpData{
				{val: 0.12, attrs: map[string]string{"cpu": "cpu0", "state": "user"}},
				{val: 0.28, attrs: map[string]string{"cpu": "cpu0", "state": "system"}},
				{val: 0.6, attrs: map[string]string{"cpu": "cpu0", "state": "idle"}},
				{val: 0.1, attrs: map[string]string{"cpu": "cpu1", "state": "user"}},
				{val: 0.38, attrs: map[string]string{"cpu": "cpu1", "state": "system"}},
				{val: 0.52, attrs: map[string]string{"cpu": "cpu1", "state": "idle"}},
			},
		},
	}

	ctx := context.Background()
	cpuScraper := newCPUScraper(ctx, &Config{Metrics: metricSettings})
	for _, step := range scrapesData {
		// Replace the CPU time source and the clock with this step's fixtures.
		cpuScraper.times = func(_ bool) ([]cpu.TimesStat, error) {
			return step.times, nil
		}
		cpuScraper.now = func() time.Time {
			at, _ := time.Parse(time.RFC3339, step.scrapeTime)
			return at
		}

		err := cpuScraper.start(ctx, componenttest.NewNopHost())
		require.NoError(t, err, "Failed to initialize cpu scraper: %v", err)

		md, err := cpuScraper.scrape(ctx)
		require.NoError(t, err)

		metrics := md.ResourceMetrics().At(0).InstrumentationLibraryMetrics().At(0).Metrics()
		if len(step.expectedDps) == 0 {
			// Nothing is expected from the first scrape.
			assert.Equal(t, 0, metrics.Len())
			continue
		}

		assert.Equal(t, 1, metrics.Len())
		metric := metrics.At(0)
		assertCPUUtilizationMetricValid(t, metric, 0)
		dataPoints := metric.Gauge().DataPoints()

		// Two CPUs with four states each; Linux reports eight states per CPU.
		wantLen := 8
		if runtime.GOOS == "linux" {
			wantLen = 16
			assertCPUUtilizationMetricHasLinuxSpecificStateLabels(t, metric)
		}
		assert.Equal(t, wantLen, dataPoints.Len())

		// Drop zero-valued points so the remaining ones line up with expectedDps.
		dataPoints.RemoveIf(func(point pdata.NumberDataPoint) bool {
			return point.DoubleVal() == 0.0
		})

		for i, want := range step.expectedDps {
			assertDatapointValueAndStringAttributes(t, dataPoints.At(i), want.val, want.attrs)
		}
	}
}

// assertDatapointValueAndStringAttributes asserts that dp holds a double value
// within 0.0001 of value and that every key in attrs is present on dp with the
// given string value. Attributes on dp that are not listed in attrs are ignored.
func assertDatapointValueAndStringAttributes(t *testing.T, dp pdata.NumberDataPoint, value float64, attrs map[string]string) {
	assert.InDelta(t, value, dp.DoubleVal(), 0.0001)

	actual := dp.Attributes()
	for key, want := range attrs {
		got, found := actual.Get(key)
		assert.True(t, found)
		assert.Equal(t, want, got.StringVal())
	}
}

func assertCPUMetricValid(t *testing.T, metric pdata.Metric, startTime pdata.Timestamp) {
expected := pdata.NewMetric()
expected.SetName("system.cpu.time")
Expand All @@ -152,3 +363,27 @@ func assertCPUMetricHasLinuxSpecificStateLabels(t *testing.T, metric pdata.Metri
internal.AssertSumMetricHasAttributeValue(t, metric, 6, metadata.Attributes.State, pdata.NewAttributeValueString(metadata.AttributeState.Steal))
internal.AssertSumMetricHasAttributeValue(t, metric, 7, metadata.Attributes.State, pdata.NewAttributeValueString(metadata.AttributeState.Wait))
}

// assertCPUUtilizationMetricValid asserts that metric has the descriptor of
// the system.cpu.utilization gauge and that its first four data points carry
// the user, system, idle and interrupt states, in that order. The start time
// is only checked when startTime is non-zero.
func assertCPUUtilizationMetricValid(t *testing.T, metric pdata.Metric, startTime pdata.Timestamp) {
	want := pdata.NewMetric()
	want.SetName("system.cpu.utilization")
	want.SetDescription("Percentage of CPU time broken down by different states.")
	want.SetUnit("1")
	want.SetDataType(pdata.MetricDataTypeGauge)
	internal.AssertDescriptorEqual(t, want, metric)

	if startTime != 0 {
		internal.AssertGaugeMetricStartTimeEquals(t, metric, startTime)
	}

	internal.AssertGaugeMetricHasAttribute(t, metric, 0, metadata.Attributes.Cpu)

	// The slice position doubles as the data point index being checked.
	commonStates := []string{
		metadata.AttributeState.User,
		metadata.AttributeState.System,
		metadata.AttributeState.Idle,
		metadata.AttributeState.Interrupt,
	}
	for idx, state := range commonStates {
		internal.AssertGaugeMetricHasAttributeValue(t, metric, idx, metadata.Attributes.State, pdata.NewAttributeValueString(state))
	}
}

// assertCPUUtilizationMetricHasLinuxSpecificStateLabels asserts that data
// points 4 through 7 of metric carry the nice, softirq, steal and wait
// states, in that order.
func assertCPUUtilizationMetricHasLinuxSpecificStateLabels(t *testing.T, metric pdata.Metric) {
	// The Linux-only states start at data point index 4.
	const firstIdx = 4
	linuxStates := []string{
		metadata.AttributeState.Nice,
		metadata.AttributeState.Softirq,
		metadata.AttributeState.Steal,
		metadata.AttributeState.Wait,
	}
	for offset, state := range linuxStates {
		internal.AssertGaugeMetricHasAttributeValue(t, metric, firstIdx+offset, metadata.Attributes.State, pdata.NewAttributeValueString(state))
	}
}
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ These are the metrics available for this scraper.
| Name | Description | Unit | Type | Attributes |
| ---- | ----------- | ---- | ---- | ---------- |
| **system.cpu.time** | Total CPU seconds broken down by different states. | s | Sum(Double) | <ul> <li>cpu</li> <li>state</li> </ul> |
| system.cpu.utilization | Percentage of CPU time broken down by different states. | 1 | Gauge(Double) | <ul> <li>cpu</li> <li>state</li> </ul> |

**Highlighted metrics** are emitted by default. Other metrics are optional and not emitted by default.
Any metric can be enabled or disabled with the following scraper configuration:
Expand Down
Loading

0 comments on commit 92bcf6a

Please sign in to comment.