Skip to content

Commit

Permalink
Merge branch 'main' into gcp-histogram
Browse files Browse the repository at this point in the history
  • Loading branch information
ishleenk17 authored Dec 3, 2024
2 parents aacbfb6 + 01cc134 commit 2c8d78d
Show file tree
Hide file tree
Showing 14 changed files with 302 additions and 90 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.next.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -347,7 +347,9 @@ https://github.com/elastic/beats/compare/v8.8.1\...main[Check the HEAD diff]
- Add support for Journald in the System module. {pull}41555[41555]
- Add ability to remove request trace logs from http_endpoint input. {pull}40005[40005]
- Add ability to remove request trace logs from entityanalytics input. {pull}40004[40004]
- Refactor & cleanup with updates to default values and documentation. {pull}41834[41834]
- Update CEL mito extensions to v1.16.0. {pull}41727[41727]
- Add evaluation state dump debugging option to CEL input. {pull}41335[41335]

*Auditbeat*

Expand Down
12 changes: 5 additions & 7 deletions metricbeat/module/system/raid/blockinfo/getdev.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,15 @@ package blockinfo

import (
"fmt"
"io/ioutil"
"os"
"path/filepath"

"github.com/elastic/beats/v7/metricbeat/mb"
)

// ListAll lists all the multi-disk devices in a RAID array
func ListAll(path string) ([]MDDevice, error) {
dir, err := ioutil.ReadDir(path)
dir, err := os.ReadDir(path)
if err != nil {
return nil, fmt.Errorf("could not read directory: %w", err)
}
Expand All @@ -44,7 +45,7 @@ func ListAll(path string) ([]MDDevice, error) {
}

if len(mds) == 0 {
return nil, fmt.Errorf("no matches from path %s", path)
return nil, mb.PartialMetricsError{Err: fmt.Errorf("no RAID devices found. You have probably enabled the RAID metrics on a non-RAID system.")}
}

return mds, nil
Expand All @@ -69,8 +70,5 @@ func getMDDevice(path string) (MDDevice, error) {
// Right now, we're doing this by looking for an `md` directory in the device dir.
func isMD(path string) bool {
_, err := os.Stat(filepath.Join(path, "md"))
if err != nil {
return false
}
return true
return err == nil
}
7 changes: 5 additions & 2 deletions metricbeat/module/system/raid/raid.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,11 @@ type MetricSet struct {

// New creates a new instance of the raid metricset.
func New(base mb.BaseMetricSet) (mb.MetricSet, error) {
sys, ok := base.Module().(resolve.Resolver)
if !ok {
return nil, fmt.Errorf("unexpected module type: %T", base.Module())
}

sys := base.Module().(resolve.Resolver)
return &MetricSet{
BaseMetricSet: base,

Expand All @@ -62,7 +65,7 @@ func blockto1024(b int64) int64 {
func (m *MetricSet) Fetch(r mb.ReporterV2) error {
devices, err := blockinfo.ListAll(m.mod.ResolveHostFS("/sys/block"))
if err != nil {
return fmt.Errorf("failed to parse sysfs: %w", err)
return fmt.Errorf("failed to list RAID devices: %w", err)
}

for _, blockDev := range devices {
Expand Down
20 changes: 20 additions & 0 deletions metricbeat/module/system/raid/raid_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,14 @@
package raid

import (
"errors"
"os"
"path/filepath"
"testing"

"github.com/stretchr/testify/assert"

"github.com/elastic/beats/v7/metricbeat/mb"
mbtest "github.com/elastic/beats/v7/metricbeat/mb/testing"
_ "github.com/elastic/beats/v7/metricbeat/module/system"
)
Expand All @@ -46,6 +50,22 @@ func TestFetch(t *testing.T) {
events[0].BeatEvent("system", "raid").Fields.StringToPrint())
}

func TestFetchNoRAID(t *testing.T) {
// Ensure that we return partial metrics when no RAID devices are present.
tmpDir := t.TempDir()
assert.NoError(t, os.MkdirAll(filepath.Join(tmpDir, "sys/block"), 0755))
c := getConfig()
c["hostfs"] = tmpDir

f := mbtest.NewReportingMetricSetV2Error(t, c)
events, errs := mbtest.ReportingFetchV2Error(f)

assert.Len(t, errs, 1)
assert.ErrorAs(t, errors.Join(errs...), &mb.PartialMetricsError{})
assert.Contains(t, errors.Join(errs...).Error(), "failed to list RAID devices: no RAID devices found. You have probably enabled the RAID metrics on a non-RAID system.")
assert.Empty(t, events)
}

func getConfig() map[string]interface{} {
return map[string]interface{}{
"module": "system",
Expand Down
20 changes: 20 additions & 0 deletions x-pack/filebeat/docs/inputs/input-cel.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -795,6 +795,26 @@ This specifies fields in the `state` to be redacted prior to debug logging. Fiel

This specifies whether fields should be replaced with a `*` or deleted entirely from messages sent to debug logs. If delete is `true`, fields will be deleted rather than replaced.

[float]
==== `failure_dump.enabled`

It is possible to log CEL program evaluation failures to a local file-system for debugging configurations.
This option is enabled by setting `failure_dump.enabled` to true and setting the `failure_dump.filename` value.
To delete existing failure dumps, set `failure_dump.enabled` to false without unsetting the filename option.

Enabling this option compromises security and should only be used for debugging.

[float]
==== `failure_dump.filename`

This specifies a directory path to write failure dumps to. If it is not empty and a CEL program evaluation fails,
the complete set of states for the CEL program's evaluation will be written as a JSON file, along with the error
that was reported. This option should only be used when debugging a failure as it imposes a significant performance
impact on the input and may potentially use large quantities of memory to hold the full set of states. If a failure
dump is configured, it is recommended that data input sizes be reduced to avoid excessive memory consumption, and
making dumps that are intractable to analysis. To delete existing failure dumps, set `failure_dump.enabled` to
false without unsetting the filename option.

[float]
=== Metrics

Expand Down
18 changes: 7 additions & 11 deletions x-pack/filebeat/docs/inputs/input-gcs.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,7 @@
++++

Use the `google cloud storage input` to read content from files stored in buckets which reside on your Google Cloud.
The input can be configured to work with and without polling, though currently, if polling is disabled it will only
perform a one time passthrough, list the file contents and end the process. Polling is generally recommented for most cases
even though it can get expensive with dealing with a very large number of files.
The input can be configured to work with and without polling, though if polling is disabled it will only perform a single collection of data, list the file contents and end the process.

*To mitigate errors and ensure a stable processing environment, this input employs the following features :*

Expand Down Expand Up @@ -66,12 +64,11 @@ many buckets as we deem fit. We are also able to configure the attributes `max_w
then be applied to all buckets which do not specify any of these attributes explicitly.

NOTE: If the attributes `max_workers`, `poll`, `poll_interval` and `bucket_timeout` are specified at the root level, these can still be overridden at the bucket level with
different values, thus offering extensive flexibility and customization. Examples <<bucket-overrides,below>> show this behaviour.
different values, thus offering extensive flexibility and customization. Examples <<bucket-overrides,below>> show this behavior.

On receiving this config the google cloud storage input will connect to the service and retrieve a `Storage Client` using the given `bucket_name` and
`auth.credentials_file`, then it will spawn two main go-routines, one for each bucket. After this each of these routines (threads) will initialize a scheduler
which will in turn use the `max_workers` value to initialize an in-memory worker pool (thread pool) with `3` `workers` available. Basically that equates to two instances of a worker pool,
one per bucket, each having 3 workers. These `workers` will be responsible for performing `jobs` that process a file (in this case read and output the contents of a file).
which will in turn use the `max_workers` value to initialize an in-memory worker pool (thread pool) with `3` `workers` available. Basically that equates to two instances of a worker pool, one per bucket, each having 3 workers. These `workers` will be responsible for performing `jobs` that process a file (in this case read and output the contents of a file).

NOTE: The scheduler is responsible for scheduling jobs, and uses the `maximum available workers` in the pool, at each iteration, to decide the number of files to retrieve and
process. This keeps work distribution efficient. The scheduler uses `poll_interval` attribute value to decide how long to wait after each iteration. The `bucket_timeout` value is used to timeout calls to the bucket list api if it exceeds the given value. Each iteration consists of processing a certain number of files, decided by the `maximum available workers` value.
Expand Down Expand Up @@ -213,7 +210,7 @@ This is a specific subfield of a bucket. It specifies the bucket name.

This attribute defines the maximum amount of time after which a bucket operation will give and stop if no response is recieved (example: reading a file / listing a file).
It can be defined in the following formats : `{{x}}s`, `{{x}}m`, `{{x}}h`, here `s = seconds`, `m = minutes` and `h = hours`. The value `{{x}}` can be anything we wish.
If no value is specified for this, by default its initialized to `50 seconds`. This attribute can be specified both at the root level of the configuration as well at the bucket level. The bucket level values will always take priority and override the root level values if both are specified. The value of `bucket_timeout` that should be used depends on the size of the files and the network speed. If the timeout is too low, the input will not be able to read the file completely and `context_deadline_exceeded` errors will be seen in the logs. If the timeout is too high, the input will wait for a long time for the file to be read, which can cause the input to be slow. The ratio between the `bucket_timeout` and `poll_interval` should be considered while setting both the values. A low `poll_interval` and a very high `bucket_timeout` can cause resource utilization issues as schedule ops will be spawned every poll iteration. If previous poll ops are still running, this could result in concurrently running ops and so could cause a bottleneck over time.
If no value is specified for this, by default its initialized to `120 seconds`. This attribute can be specified both at the root level of the configuration as well at the bucket level. The bucket level values will always take priority and override the root level values if both are specified. The value of `bucket_timeout` that should be used depends on the size of the files and the network speed. If the timeout is too low, the input will not be able to read the file completely and `context_deadline_exceeded` errors will be seen in the logs. If the timeout is too high, the input will wait for a long time for the file to be read, which can cause the input to be slow. The ratio between the `bucket_timeout` and `poll_interval` should be considered while setting both the values. A low `poll_interval` and a very high `bucket_timeout` can cause resource utilization issues as schedule ops will be spawned every poll iteration. If previous poll ops are still running, this could result in concurrently running ops and so could cause a bottleneck over time.

[id="attrib-max_workers-gcs"]
[float]
Expand All @@ -228,17 +225,16 @@ NOTE: The value of `max_workers` is tied to the `batch_size` currently to ensure
[float]
==== `poll`

This attribute informs the scheduler whether to keep polling for new files or not. Default value of this is `false`, so it will not keep polling if not explicitly
specified. This attribute can be specified both at the root level of the configuration as well at the bucket level. The bucket level values will always
take priority and override the root level values if both are specified.
This attribute informs the scheduler whether to keep polling for new files or not. Default value of this is set to `true`. This attribute can be specified both at the
root level of the configuration as well at the bucket level. The bucket level values will always take priority and override the root level values if both are specified.

[id="attrib-poll_interval-gcs"]
[float]
==== `poll_interval`

This attribute defines the maximum amount of time after which the internal scheduler will make the polling call for the next set of objects/files. It can be
defined in the following formats : `{{x}}s`, `{{x}}m`, `{{x}}h`, here `s = seconds`, `m = minutes` and `h = hours`. The value `{{x}}` can be anything we wish.
Example : `10s` would mean we would like the polling to occur every 10 seconds. If no value is specified for this, by default its initialized to `300 seconds`.
Example : `10s` would mean we would like the polling to occur every 10 seconds. If no value is specified for this, by default its initialized to `5 minutes`.
This attribute can be specified both at the root level of the configuration as well at the bucket level. The bucket level values will always take priority
and override the root level values if both are specified. The `poll_interval` should be set to a value that is equal to the `bucket_timeout` value. This would ensure that another schedule operation is not started before the current buckets have all been processed. If the `poll_interval` is set to a value that is less than the `bucket_timeout`, then the input will start another schedule operation before the current one has finished, which can cause a bottleneck over time. Having a lower `poll_interval` can make the input faster at the cost of more resource utilization.

Expand Down
19 changes: 18 additions & 1 deletion x-pack/filebeat/input/cel/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,9 @@ type config struct {
// Resource is the configuration for establishing an
// HTTP request or for locating a local resource.
Resource *ResourceConfig `config:"resource" validate:"required"`

// FailureDump configures failure dump behaviour.
FailureDump *dumpConfig `config:"failure_dump"`
}

type redact struct {
Expand All @@ -69,6 +72,19 @@ type redact struct {
Delete bool `config:"delete"`
}

// dumpConfig configures the CEL program to retain
// the full evaluation state using the cel.OptTrackState
// option. The state is written to a file in the path if
// the evaluation fails.
type dumpConfig struct {
Enabled *bool `config:"enabled"`
Filename string `config:"filename"`
}

func (t *dumpConfig) enabled() bool {
return t != nil && (t.Enabled == nil || *t.Enabled)
}

func (c config) Validate() error {
if c.Redact == nil {
logp.L().Named("input.cel").Warn("missing recommended 'redact' configuration: " +
Expand All @@ -89,7 +105,8 @@ func (c config) Validate() error {
if len(c.Regexps) != 0 {
patterns = map[string]*regexp.Regexp{".": nil}
}
_, _, err = newProgram(context.Background(), c.Program, root, nil, &http.Client{}, nil, nil, patterns, c.XSDs, logp.L().Named("input.cel"), nil)
wantDump := c.FailureDump.enabled() && c.FailureDump.Filename != ""
_, _, err = newProgram(context.Background(), c.Program, root, nil, &http.Client{}, nil, nil, patterns, c.XSDs, logp.L().Named("input.cel"), nil, wantDump)
if err != nil {
return fmt.Errorf("failed to check program: %w", err)
}
Expand Down
Loading

0 comments on commit 2c8d78d

Please sign in to comment.