Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

cadvisor: Fix pod detection for containerd runtime on k8s #189

Merged
merged 8 commits into from
Mar 24, 2021
6 changes: 6 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,12 @@ config-downloader: copy-version-file
$(WIN_BUILD)/config-downloader.exe github.com/aws/amazon-cloudwatch-agent/cmd/config-downloader
$(DARWIN_BUILD)/config-downloader github.com/aws/amazon-cloudwatch-agent/cmd/config-downloader

# A fast build that only builds amd64, we don't need wizard and config downloader
build-for-docker:
$(LINUX_AMD64_BUILD)/amazon-cloudwatch-agent github.com/aws/amazon-cloudwatch-agent/cmd/amazon-cloudwatch-agent
$(LINUX_AMD64_BUILD)/start-amazon-cloudwatch-agent github.com/aws/amazon-cloudwatch-agent/cmd/start-amazon-cloudwatch-agent
$(LINUX_AMD64_BUILD)/config-translator github.com/aws/amazon-cloudwatch-agent/cmd/config-translator

fmt:
go fmt ./...

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,9 @@ spec:
- name: varlibdocker
mountPath: /var/lib/docker
readOnly: true
- name: containerdsock
mountPath: /run/containerd/containerd.sock
readOnly: true
- name: sys
mountPath: /sys
readOnly: true
Expand All @@ -177,6 +180,9 @@ spec:
- name: varlibdocker
hostPath:
path: /var/lib/docker
- name: containerdsock
hostPath:
path: /run/containerd/containerd.sock
- name: sys
hostPath:
path: /sys
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,9 @@ spec:
- name: varlibdocker
mountPath: /var/lib/docker
readOnly: true
- name: containerdsock
mountPath: /run/containerd/containerd.sock
readOnly: true
- name: sys
mountPath: /sys
readOnly: true
Expand All @@ -75,6 +78,9 @@ spec:
- name: varlibdocker
hostPath:
path: /var/lib/docker
- name: containerdsock
hostPath:
path: /run/containerd/containerd.sock
- name: sys
hostPath:
path: /sys
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,9 @@ spec:
- name: varlibdocker
mountPath: /var/lib/docker
readOnly: true
- name: containerdsock
mountPath: /run/containerd/containerd.sock
readOnly: true
- name: sys
mountPath: /sys
readOnly: true
Expand All @@ -159,6 +162,9 @@ spec:
- name: varlibdocker
hostPath:
path: /var/lib/docker
- name: containerdsock
hostPath:
path: /run/containerd/containerd.sock
- name: sys
hostPath:
path: /sys
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,9 @@ spec:
- name: varlibdocker
mountPath: /var/lib/docker
readOnly: true
- name: containerdsock
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just want to mention that the yaml files should also be updated in another repo: https://github.com/aws-samples/amazon-cloudwatch-container-insights

mountPath: /run/containerd/containerd.sock
readOnly: true
- name: sys
mountPath: /sys
readOnly: true
Expand All @@ -159,6 +162,9 @@ spec:
- name: varlibdocker
hostPath:
path: /var/lib/docker
- name: containerdsock
hostPath:
path: /run/containerd/containerd.sock
- name: sys
hostPath:
path: /sys
Expand Down
3 changes: 3 additions & 0 deletions internal/containerinsightscommon/const.go
Original file line number Diff line number Diff line change
Expand Up @@ -92,4 +92,7 @@ const (
TypeContainer = "Container"
TypeContainerFS = "ContainerFS"
TypeContainerDiskIO = "ContainerDiskIO"
// Special type for pause container, introduced in https://github.com/aws/amazon-cloudwatch-agent/issues/188
// because containerd does not set container name pause container name to POD like docker does.
TypeInfraContainer = "InfraContainer"
)
50 changes: 29 additions & 21 deletions plugins/inputs/cadvisor/container_info_processor.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,9 +67,11 @@ func processContainers(cInfos []*cinfo.ContainerInfo, detailMode bool, container
metrics = append(metrics, processPod(cInfo, podKeys)...)
}
}
// This happens when our cgroup path based pod detection logic is not working.
// This happens when our cgroup path and label based pod detection logic is not working.
// contained https://github.com/aws/amazon-cloudwatch-agent/issues/188
// docker systemd https://github.com/aws/amazon-cloudwatch-agent/pull/171
if len(metrics) == beforePod {
log.Printf("W! No pod metric collected, metrics count is still %d", beforePod)
log.Printf("W! No pod metric collected, metrics count is still %d is containerd socket mounted? https://github.com/aws/amazon-cloudwatch-agent/issues/188", beforePod)
}

metrics = mergeMetrics(metrics)
Expand Down Expand Up @@ -98,32 +100,44 @@ func processContainer(info *cinfo.ContainerInfo, detailMode bool, containerOrche
if !detailMode {
return result, pKey
}

// Only a container has all these three labels set.
containerName := info.Spec.Labels[containerNameLable]
namespace := info.Spec.Labels[namespaceLable]
podName := info.Spec.Labels[podNameLable]
podId := info.Spec.Labels[podIdLable]
if containerName == "" || namespace == "" || podName == "" {
// NOTE: containerName can be empty for pause container on containerd
// https://github.com/containerd/cri/issues/922#issuecomment-423729537
if namespace == "" || podName == "" {
return result, pKey
}

// Pod's cgroup path is parent for a container.
// contianer name: /kubepods.slice/kubepods-burstable.slice/kubepods-burstable-pod04d39715_075e_4c7c_b128_67f7897c05b7.slice/docker-57b3dabd69b94beb462244a0c15c244b509adad0940cdcc67ca079b8208ec1f2.scope
// container name: /kubepods.slice/kubepods-burstable.slice/kubepods-burstable-pod04d39715_075e_4c7c_b128_67f7897c05b7.slice/docker-57b3dabd69b94beb462244a0c15c244b509adad0940cdcc67ca079b8208ec1f2.scope
// pod name: /kubepods.slice/kubepods-burstable.slice/kubepods-burstable-pod04d39715_075e_4c7c_b128_67f7897c05b7.slice/
podPath := path.Dir(info.Name)
pKey = &podKey{cgroupPath: podPath, podName: podName, podId: podId, namespace: namespace}

tags[PodIdKey] = podId
tags[K8sPodNameKey] = podName
tags[K8sNamespace] = namespace
if containerName != infraContainerName {

switch containerName {
// For docker, pause container name is set to POD while containerd does not set it.
// See https://github.com/aws/amazon-cloudwatch-agent/issues/188
case "", infraContainerName:
pingleig marked this conversation as resolved.
Show resolved Hide resolved
// NOTE: the pod here is only used by NetMetricExtractor,
// other pod info like CPU, Mem are dealt within in processPod.
containerType = TypeInfraContainer
default:
tags[ContainerNamekey] = containerName
tags[ContainerIdkey] = path.Base(info.Name)
containerType = TypeContainer
} else {
// NOTE: the pod here is only used by NetMetricExtractor,
// other pod info like CPU, Mem are dealt within in processPod.
containerType = TypePod

// TODO(pingleig): wait for upstream fix https://github.com/aws/amazon-cloudwatch-agent/issues/192
if !info.Spec.HasFilesystem {
log.Printf("D! containerd does not have container filesystem metrics from cadvisor, See https://github.com/aws/amazon-cloudwatch-agent/issues/192")
}
}
} else {
containerType = TypeNode
Expand All @@ -146,15 +160,19 @@ func processContainer(info *cinfo.ContainerInfo, detailMode bool, containerOrche
return result, pKey
}

// processPod is almost identical as processContainer. We got this second loop because pod detection relies
// on inspecting labels from containers in processContainer. cgroup path for detected pods are saved in podKeys.
// We may not get container before pod when looping all returned cgroup paths so we use a two pass solution
// in processContainers.
func processPod(info *cinfo.ContainerInfo, podKeys map[string]podKey) []*extractors.CAdvisorMetric {
var result []*extractors.CAdvisorMetric
if isContainerInContainer(info.Name) {
log.Printf("D! drop metric because it's nested container, name %s", info.Name)
return result
}

podKey := getPodKey(info, podKeys)
if podKey == nil {
podKey, ok := podKeys[info.Name]
if !ok {
return result
}

Expand All @@ -177,16 +195,6 @@ func processPod(info *cinfo.ContainerInfo, podKeys map[string]podKey) []*extract
return result
}

func getPodKey(info *cinfo.ContainerInfo, podKeys map[string]podKey) *podKey {
key := info.Name

if v, ok := podKeys[key]; ok {
return &v
}

return nil
}

// Check if it's a container running inside container, caller will drop the metric when return value is true.
// The validation is based on ContainerReference.Name, which is essentially cgroup path.
// The first version is from https://github.com/aws/amazon-cloudwatch-agent/commit/e8daa5f5926c5a5f38e0ceb746c141be463e11e4#diff-599185154c116b295172b56311729990d20672f6659500870997c018ce072100
Expand Down
6 changes: 4 additions & 2 deletions plugins/inputs/cadvisor/extractors/cpu_extractor.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ func (c *CpuMetricExtractor) recordPreviousInfo(info *cInfo.ContainerInfo) {

func (c *CpuMetricExtractor) GetValue(info *cInfo.ContainerInfo, containerType string) []*CAdvisorMetric {
var metrics []*CAdvisorMetric
if info.Spec.Labels[containerNameLable] == infraContainerName {
// Skip infra container and handle node, pod, other containers in pod
if containerType == TypeInfraContainer {
return metrics
}

Expand All @@ -41,6 +42,7 @@ func (c *CpuMetricExtractor) GetValue(info *cInfo.ContainerInfo, containerType s

if deltaCTimeInNano > MinTimeDiff {
metric := newCadvisorMetric(containerType)
metric.cgroupPath = info.Name

metric.fields[MetricName(containerType, CpuTotal)] = float64(curStats.Cpu.Usage.Total-preStats.Cpu.Usage.Total) / float64(deltaCTimeInNano) * decimalToMillicores
metric.fields[MetricName(containerType, CpuUser)] = float64(curStats.Cpu.Usage.User-preStats.Cpu.Usage.User) / float64(deltaCTimeInNano) * decimalToMillicores
Expand All @@ -59,6 +61,6 @@ func (c *CpuMetricExtractor) CleanUp(now time.Time) {

func NewCpuMetricExtractor() *CpuMetricExtractor {
return &CpuMetricExtractor{
preInfos: mapWithExpiry.NewMapWithExpiry(CleanInteval),
preInfos: mapWithExpiry.NewMapWithExpiry(CleanInterval),
}
}
2 changes: 1 addition & 1 deletion plugins/inputs/cadvisor/extractors/diskio_extractor.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ func (d *DiskIOMetricExtractor) CleanUp(now time.Time) {

func NewDiskIOMetricExtractor() *DiskIOMetricExtractor {
return &DiskIOMetricExtractor{
preInfos: mapWithExpiry.NewMapWithExpiry(CleanInteval),
preInfos: mapWithExpiry.NewMapWithExpiry(CleanInterval),
}
}

Expand Down
20 changes: 14 additions & 6 deletions plugins/inputs/cadvisor/extractors/extractor.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,21 +12,28 @@ import (
)

const (
containerNameLable = "io.kubernetes.container.name"
// TODO: https://github.com/containerd/cri/issues/922#issuecomment-423729537 the container name can be empty on containerd
infraContainerName = "POD"
containerNameLabel = "io.kubernetes.container.name"
Metrics = "Metrics"
Dimensions = "Dimensions"
CleanInteval = 5 * time.Minute
CleanInterval = 5 * time.Minute
)

type MetricExtractor interface {
HasValue(*cinfo.ContainerInfo) bool
GetValue(*cinfo.ContainerInfo, string) []*CAdvisorMetric
// GetValue normally applies to the following types:
// containerinsightscommon.TypeContainer
// containerinsightscommon.TypePod
// containerinsightscommon.TypeNode
// and ignores:
// containerinsightscommon.TypeInfraContainer
// The only exception is NetMetricExtractor because pod network metrics comes from infra container (i.e. pause).
// See https://www.ianlewis.org/en/almighty-pause-container
GetValue(info *cinfo.ContainerInfo, containerType string) []*CAdvisorMetric
CleanUp(time.Time)
}

type CAdvisorMetric struct {
cgroupPath string // source of the metric for debugging merge conflict
fields map[string]interface{}
tags map[string]string
metricType string
Expand Down Expand Up @@ -68,7 +75,8 @@ func (c *CAdvisorMetric) Merge(src *CAdvisorMetric) {
// If there is any conflict, keep the fields with earlier timestamp
for k, v := range src.fields {
if _, ok := c.fields[k]; ok {
log.Printf("D! metric being merged has conflict in fields, src: %v, dest: %v \n", *src, *c)
log.Printf("D! metric being merged has conflict in fields, path src: %q, dest: %q", src.cgroupPath, c.cgroupPath)
log.Printf("D! metric being merged has conflict in fields, src: %v, dest: %v", *src, *c)
if c.tags[containerinsightscommon.Timestamp] < src.tags[containerinsightscommon.Timestamp] {
continue
}
Expand Down
4 changes: 3 additions & 1 deletion plugins/inputs/cadvisor/extractors/fs_extractor.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,14 @@ type FileSystemMetricExtractor struct {
allowListRegexP *regexp.Regexp
}

// TODO(pingleig): it is always false for container using containerd https://github.com/aws/amazon-cloudwatch-agent/issues/192
func (f *FileSystemMetricExtractor) HasValue(info *cinfo.ContainerInfo) bool {
return info.Spec.HasFilesystem
}

func (f *FileSystemMetricExtractor) GetValue(info *cinfo.ContainerInfo, containerType string) []*CAdvisorMetric {
var metrics []*CAdvisorMetric
if containerType == TypePod || info.Spec.Labels[containerNameLable] == infraContainerName {
if containerType == TypePod || containerType == TypeInfraContainer {
return metrics
}

Expand Down Expand Up @@ -58,6 +59,7 @@ func (f *FileSystemMetricExtractor) GetValue(info *cinfo.ContainerInfo, containe
metric.fields[MetricName(containerType, FSInodesfree)] = v.InodesFree
}

metric.cgroupPath = info.Name
metrics = append(metrics, metric)
}
return metrics
Expand Down
5 changes: 3 additions & 2 deletions plugins/inputs/cadvisor/extractors/mem_extractor.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,12 @@ func (m *MemMetricExtractor) HasValue(info *cinfo.ContainerInfo) bool {

func (m *MemMetricExtractor) GetValue(info *cinfo.ContainerInfo, containerType string) []*CAdvisorMetric {
var metrics []*CAdvisorMetric
if info.Spec.Labels[containerNameLable] == infraContainerName {
if containerType == TypeInfraContainer {
return metrics
}

metric := newCadvisorMetric(containerType)
metric.cgroupPath = info.Name
pingleig marked this conversation as resolved.
Show resolved Hide resolved
curStats := GetStats(info)

metric.fields[MetricName(containerType, MemUsage)] = curStats.Memory.Usage
Expand Down Expand Up @@ -64,6 +65,6 @@ func (m *MemMetricExtractor) CleanUp(now time.Time) {

func NewMemMetricExtractor() *MemMetricExtractor {
return &MemMetricExtractor{
preInfos: mapWithExpiry.NewMapWithExpiry(CleanInteval),
preInfos: mapWithExpiry.NewMapWithExpiry(CleanInterval),
}
}
12 changes: 9 additions & 3 deletions plugins/inputs/cadvisor/extractors/net_extractor.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,14 @@ func (n *NetMetricExtractor) HasValue(info *cinfo.ContainerInfo) bool {
func (n *NetMetricExtractor) GetValue(info *cinfo.ContainerInfo, containerType string) []*CAdvisorMetric {
var metrics []*CAdvisorMetric

// Just a protection here, there is no Container level Net metrics
if (containerType == TypePod && info.Spec.Labels[containerNameLable] != infraContainerName) || containerType == TypeContainer {
// Ignore both pod and container because the network metrics comes from InfraContainer.
if containerType == TypePod || containerType == TypeContainer {
return metrics
}
// Rename type to pod so the metric name prefix is pod_
if containerType == TypeInfraContainer {
containerType = TypePod
}

if preInfo, ok := n.preInfos.Get(info.Name); ok {
curStats := GetStats(info)
Expand Down Expand Up @@ -86,6 +90,7 @@ func (n *NetMetricExtractor) GetValue(info *cinfo.ContainerInfo, containerType s
metric.fields[MetricName(mType, k)] = v
}

metric.cgroupPath = info.Name
metrics = append(metrics, metric)
break
}
Expand All @@ -97,6 +102,7 @@ func (n *NetMetricExtractor) GetValue(info *cinfo.ContainerInfo, containerType s
for k, v := range aggregatedFields {
metric.fields[MetricName(containerType, k)] = v
}
metric.cgroupPath = info.Name
metrics = append(metrics, metric)
}
}
Expand All @@ -112,7 +118,7 @@ func (n *NetMetricExtractor) CleanUp(now time.Time) {

func NewNetMetricExtractor() *NetMetricExtractor {
return &NetMetricExtractor{
preInfos: mapWithExpiry.NewMapWithExpiry(CleanInteval),
preInfos: mapWithExpiry.NewMapWithExpiry(CleanInterval),
}
}

Expand Down
1 change: 1 addition & 0 deletions plugins/inputs/cadvisor/merger.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ package cadvisor

import (
"fmt"

. "github.com/aws/amazon-cloudwatch-agent/internal/containerinsightscommon"
"github.com/aws/amazon-cloudwatch-agent/plugins/inputs/cadvisor/extractors"
)
Expand Down