Revert "Revert "[kubelet] Fix oom-score-adj policy in kubelet""
vishh authored Sep 16, 2016
1 parent af3050d commit a1fe3ad
Showing 10 changed files with 296 additions and 107 deletions.
Makefile: 2 changes (1 addition & 1 deletion)
@@ -156,7 +156,7 @@ test-e2e: ginkgo generated_files
# DELETE_INSTANCES: For REMOTE=true only. Delete any instances created as
# part of this test run. Defaults to false.
# ARTIFACTS: For REMOTE=true only. Local directory to scp test artifacts into
# from the remote hosts. Defaults to ""/tmp/_artifacts".
# from the remote hosts. Defaults to "/tmp/_artifacts".
# REPORT: For REMOTE=false only. Local directory to write juntil xml results
# to. Defaults to "/tmp/".
# CLEANUP: For REMOTE=true only. If false, do not stop processes or delete
docs/design/resource-qos.md: 1 change (1 addition & 0 deletions)
@@ -226,6 +226,7 @@ Pod OOM score configuration

*Pod infra containers* or *Special Pod init process*
- OOM_SCORE_ADJ: -998

*Kubelet, Docker*
- OOM_SCORE_ADJ: -999 (won’t be OOM killed)
- Hack, because these critical tasks might die if they conflict with guaranteed containers. In the future, we should place all user-pods into a separate cgroup, and set a limit on the memory they can consume.
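As a quick way to sanity-check these documented values on a node, here is a small standalone Go sketch (not part of this commit; the readOOMScoreAdj helper and the tier labels are illustrative) that reads a process's oom_score_adj from procfs, the same file the new e2e_node test inspects:

```go
package main

import (
	"fmt"
	"io/ioutil"
	"os"
	"path"
	"strconv"
	"strings"
)

// readOOMScoreAdj returns the oom_score_adj currently applied to pid,
// read from the same procfs file the kubelet writes.
func readOOMScoreAdj(pid int) (int, error) {
	p := path.Join("/proc", strconv.Itoa(pid), "oom_score_adj")
	b, err := ioutil.ReadFile(p)
	if err != nil {
		return 0, fmt.Errorf("failed to get oom_score_adj for %d: %v", pid, err)
	}
	return strconv.Atoi(strings.TrimSpace(string(b)))
}

func main() {
	if len(os.Args) != 2 {
		fmt.Fprintln(os.Stderr, "usage: oomcheck <pid>")
		os.Exit(1)
	}
	pid, err := strconv.Atoi(os.Args[1])
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	adj, err := readOOMScoreAdj(pid)
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	// Tiers as documented above: -999 for kubelet/docker, -998 for pod infra
	// containers and guaranteed pods, low positive values for burstable,
	// 1000 for besteffort.
	switch {
	case adj <= -999:
		fmt.Printf("pid %d: %d (node daemon tier)\n", pid, adj)
	case adj == -998:
		fmt.Printf("pid %d: %d (guaranteed / pod infra tier)\n", pid, adj)
	case adj == 1000:
		fmt.Printf("pid %d: %d (besteffort tier)\n", pid, adj)
	default:
		fmt.Printf("pid %d: %d (burstable tier)\n", pid, adj)
	}
}
```

Running it against the kubelet or docker daemon PID should report the -999 tier, and against a pause container PID the -998 tier.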
docs/proposals/kubelet-systemd.md: 2 changes (1 addition & 1 deletion)
@@ -365,7 +365,7 @@ The `kubelet` will set the following:
The `kubelet` at bootstrapping will set the `oom_score_adj` value for Kubernetes
daemons, and any dependent container-runtime daemons.

If `container-runtime` is set to `docker`, then set its `oom_score_adj=-900`
If `container-runtime` is set to `docker`, then set its `oom_score_adj=-999`

## Implementation concerns

hack/verify-flags/exceptions.txt: 8 changes (8 additions & 0 deletions)
@@ -100,6 +100,8 @@ pkg/kubelet/api/v1alpha1/runtime/api.pb.go: ContainerPort *int32 `protobuf:"vari
pkg/kubelet/api/v1alpha1/runtime/api.pb.go: OomScoreAdj *int64 `protobuf:"varint,5,opt,name=oom_score_adj,json=oomScoreAdj" json:"oom_score_adj,omitempty"`
pkg/kubelet/api/v1alpha1/runtime/api.proto: optional int32 container_port = 3;
pkg/kubelet/api/v1alpha1/runtime/api.proto: optional int64 oom_score_adj = 5;
pkg/kubelet/cm/container_manager_linux.go: glog.V(3).Infof("Failed to apply oom_score_adj %d for pid %d: %v", oomScoreAdj, pid, err)
pkg/kubelet/cm/container_manager_linux.go: glog.V(5).Infof("attempting to apply oom_score_adj of %d to pid %d", oomScoreAdj, pid)
pkg/kubelet/network/hairpin/hairpin.go: hairpinModeRelativePath = "hairpin_mode"
pkg/kubelet/qos/policy_test.go: t.Errorf("oom_score_adj should be between %d and %d, but was %d", test.lowOOMScoreAdj, test.highOOMScoreAdj, oomScoreAdj)
pkg/kubelet/qos/policy_test.go: highOOMScoreAdj int // The min oom_score_adj score the container should be assigned.
@@ -117,6 +119,12 @@ test/e2e/common/host_path.go: fmt.Sprintf("--retry_time=%d", retryDuration),
test/e2e/es_cluster_logging.go: framework.Failf("No cluster_name field in Elasticsearch response: %v", esResponse)
test/e2e/es_cluster_logging.go: // Check to see if have a cluster_name field.
test/e2e/es_cluster_logging.go: clusterName, ok := esResponse["cluster_name"]
test/e2e_node/container_manager_test.go: return fmt.Errorf("expected pid %d's oom_score_adj to be %d; found %d", pid, expectedOOMScoreAdj, oomScore)
test/e2e_node/container_manager_test.go: return fmt.Errorf("expected pid %d's oom_score_adj to be < %d; found %d", pid, expectedMaxOOMScoreAdj, oomScore)
test/e2e_node/container_manager_test.go: return fmt.Errorf("expected pid %d's oom_score_adj to be >= %d; found %d", pid, expectedMinOOMScoreAdj, oomScore)
test/e2e_node/container_manager_test.go: return fmt.Errorf("failed to get oom_score_adj for %d", pid)
test/e2e_node/container_manager_test.go: return fmt.Errorf("failed to get oom_score_adj for %d: %v", pid, err)
test/e2e_node/container_manager_test.go: procfsPath := path.Join("/proc", strconv.Itoa(pid), "oom_score_adj")
test/images/mount-tester/mt.go: flag.BoolVar(&breakOnExpectedContent, "break_on_expected_content", true, "Break out of loop on expected content, (use with --file_content_in_loop flag only)")
test/images/mount-tester/mt.go: flag.IntVar(&retryDuration, "retry_time", 180, "Retry time during the loop")
test/images/mount-tester/mt.go: flag.StringVar(&readFileContentInLoopPath, "file_content_in_loop", "", "Path to read the file content in loop from")
pkg/kubelet/cm/container_manager_linux.go: 57 changes (36 additions & 21 deletions)
@@ -326,6 +326,7 @@ func (cm *containerManagerImpl) setupNode() error {

systemContainers := []*systemContainer{}
if cm.ContainerRuntime == "docker" {
dockerVersion := getDockerVersion(cm.cadvisorInterface)
if cm.RuntimeCgroupsName != "" {
cont := newSystemCgroups(cm.RuntimeCgroupsName)
var capacity = api.ResourceList{}
@@ -351,13 +352,17 @@ func (cm *containerManagerImpl) setupNode() error {
},
},
}
dockerVersion := getDockerVersion(cm.cadvisorInterface)
cont.ensureStateFunc = func(manager *fs.Manager) error {
return ensureDockerInContainer(dockerVersion, -900, dockerContainer)
return ensureDockerInContainer(dockerVersion, qos.DockerOOMScoreAdj, dockerContainer)
}
systemContainers = append(systemContainers, cont)
} else {
cm.periodicTasks = append(cm.periodicTasks, func() {
glog.V(10).Infof("Adding docker daemon periodic tasks")
if err := ensureDockerInContainer(dockerVersion, qos.DockerOOMScoreAdj, nil); err != nil {
glog.Error(err)
return
}
cont, err := getContainerNameForProcess(dockerProcessName, dockerPidFile)
if err != nil {
glog.Error(err)
@@ -401,11 +406,15 @@ func (cm *containerManagerImpl) setupNode() error {
},
}
cont.ensureStateFunc = func(_ *fs.Manager) error {
return manager.Apply(os.Getpid())
return ensureProcessInContainerWithOOMScore(os.Getpid(), qos.KubeletOOMScoreAdj, &manager)
}
systemContainers = append(systemContainers, cont)
} else {
cm.periodicTasks = append(cm.periodicTasks, func() {
if err := ensureProcessInContainerWithOOMScore(os.Getpid(), qos.KubeletOOMScoreAdj, nil); err != nil {
glog.Error(err)
return
}
cont, err := getContainer(os.Getpid())
if err != nil {
glog.Errorf("failed to find cgroups of kubelet - %v", err)
@@ -516,16 +525,18 @@ func (cm *containerManagerImpl) SystemCgroupsLimit() api.ResourceList {
}

func isProcessRunningInHost(pid int) (bool, error) {
// Get init mount namespace. Mount namespace is unique for all containers.
initMntNs, err := os.Readlink("/proc/1/ns/mnt")
// Get init pid namespace.
initPidNs, err := os.Readlink("/proc/1/ns/pid")
if err != nil {
return false, fmt.Errorf("failed to find mount namespace of init process")
return false, fmt.Errorf("failed to find pid namespace of init process")
}
processMntNs, err := os.Readlink(fmt.Sprintf("/proc/%d/ns/mnt", pid))
glog.V(10).Infof("init pid ns is %q", initPidNs)
processPidNs, err := os.Readlink(fmt.Sprintf("/proc/%d/ns/pid", pid))
if err != nil {
return false, fmt.Errorf("failed to find mount namespace of process %q", pid)
return false, fmt.Errorf("failed to find pid namespace of process %q", pid)
}
return initMntNs == processMntNs, nil
glog.V(10).Infof("Pid %d pid ns is %q", pid, processPidNs)
return initPidNs == processPidNs, nil
}

func getPidFromPidFile(pidFile string) (int, error) {
@@ -567,7 +578,6 @@ func ensureDockerInContainer(dockerVersion semver.Version, oomScoreAdj int, mana
if dockerVersion.GTE(containerdVersion) {
dockerProcs = append(dockerProcs, process{containerdProcessName, containerdPidFile})
}

var errs []error
for _, proc := range dockerProcs {
pids, err := getPidsForProcess(proc.name, proc.file)
@@ -578,40 +588,45 @@

// Move if the pid is not already in the desired container.
for _, pid := range pids {
if err := ensureProcessInContainer(pid, oomScoreAdj, manager); err != nil {
if err := ensureProcessInContainerWithOOMScore(pid, oomScoreAdj, manager); err != nil {
errs = append(errs, fmt.Errorf("errors moving %q pid: %v", proc.name, err))
}
}
}
return utilerrors.NewAggregate(errs)
}

func ensureProcessInContainer(pid int, oomScoreAdj int, manager *fs.Manager) error {
func ensureProcessInContainerWithOOMScore(pid int, oomScoreAdj int, manager *fs.Manager) error {
if runningInHost, err := isProcessRunningInHost(pid); err != nil {
// Err on the side of caution. Avoid moving the docker daemon unless we are able to identify its context.
return err
} else if !runningInHost {
// Process is running inside a container. Don't touch that.
glog.V(2).Infof("pid %d is not running in the host namespaces", pid)
return nil
}

var errs []error
cont, err := getContainer(pid)
if err != nil {
errs = append(errs, fmt.Errorf("failed to find container of PID %d: %v", pid, err))
}

if cont != manager.Cgroups.Name {
err = manager.Apply(pid)
if manager != nil {
cont, err := getContainer(pid)
if err != nil {
errs = append(errs, fmt.Errorf("failed to move PID %d (in %q) to %q", pid, cont, manager.Cgroups.Name))
errs = append(errs, fmt.Errorf("failed to find container of PID %d: %v", pid, err))
}

if cont != manager.Cgroups.Name {
err = manager.Apply(pid)
if err != nil {
errs = append(errs, fmt.Errorf("failed to move PID %d (in %q) to %q: %v", pid, cont, manager.Cgroups.Name, err))
}
}
}

// Also apply oom-score-adj to processes
oomAdjuster := oom.NewOOMAdjuster()
glog.V(5).Infof("attempting to apply oom_score_adj of %d to pid %d", oomScoreAdj, pid)
if err := oomAdjuster.ApplyOOMScoreAdj(pid, oomScoreAdj); err != nil {
errs = append(errs, fmt.Errorf("failed to apply oom score %d to PID %d", oomScoreAdj, pid))
glog.V(3).Infof("Failed to apply oom_score_adj %d for pid %d: %v", oomScoreAdj, pid, err)
errs = append(errs, fmt.Errorf("failed to apply oom score %d to PID %d: %v", oomScoreAdj, pid, err))
}
return utilerrors.NewAggregate(errs)
}
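The switch in isProcessRunningInHost from comparing mount namespaces to comparing PID namespaces can be illustrated standalone: /proc/<pid>/ns/pid is a symlink whose target (e.g. pid:[4026531836]) identifies the namespace, so two processes share a PID namespace exactly when the link targets match. The mount-namespace check was presumably too strict, since a host service can run with a private mount namespace while still sharing the host PID namespace. A minimal sketch of the same comparison (an illustration, not the kubelet code itself):

```go
package main

import (
	"fmt"
	"os"
)

// samePidNamespaceAsInit reports whether pid shares the PID namespace of
// init (pid 1). Containerized processes normally live in their own PID
// namespace, so the readlink targets differ from init's.
func samePidNamespaceAsInit(pid int) (bool, error) {
	initNs, err := os.Readlink("/proc/1/ns/pid")
	if err != nil {
		return false, fmt.Errorf("failed to read pid namespace of init: %v", err)
	}
	procNs, err := os.Readlink(fmt.Sprintf("/proc/%d/ns/pid", pid))
	if err != nil {
		return false, fmt.Errorf("failed to read pid namespace of %d: %v", pid, err)
	}
	return initNs == procNs, nil
}

func main() {
	inHost, err := samePidNamespaceAsInit(os.Getpid())
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	fmt.Printf("running in host pid namespace: %v\n", inHost)
}
```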
pkg/kubelet/qos/policy.go: 9 changes (5 additions & 4 deletions)
@@ -21,8 +21,9 @@ import (
)

const (
PodInfraOOMAdj int = -999
PodInfraOOMAdj int = -998
KubeletOOMScoreAdj int = -999
DockerOOMScoreAdj int = -999
KubeProxyOOMScoreAdj int = -999
guaranteedOOMScoreAdj int = -998
besteffortOOMScoreAdj int = 1000
@@ -53,10 +54,10 @@ func GetContainerOOMScoreAdjust(pod *api.Pod, container *api.Container, memoryCa
// Note that this is a heuristic, it won't work if a container has many small processes.
memoryRequest := container.Resources.Requests.Memory().Value()
oomScoreAdjust := 1000 - (1000*memoryRequest)/memoryCapacity
// A guaranteed pod using 100% of memory can have an OOM score of 1. Ensure
// A guaranteed pod using 100% of memory can have an OOM score of 10. Ensure
// that burstable pods have a higher OOM score adjustment.
if oomScoreAdjust < 2 {
return 2
if int(oomScoreAdjust) < (1000 + guaranteedOOMScoreAdj) {
return (1000 + guaranteedOOMScoreAdj)
}
// Give burstable pods a higher chance of survival over besteffort pods.
if int(oomScoreAdjust) == besteffortOOMScoreAdj {
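To make the clamp change above concrete, a worked example under assumed numbers: on a node with 10 GiB of memory, a burstable container requesting 1 GiB gets 1000 - (1000*1)/10 = 900, while one requesting all 10 GiB computes to 0 and is now clamped up to 1000 + guaranteedOOMScoreAdj = 2, keeping it strictly above guaranteed pods at -998. The sketch below reproduces that arithmetic standalone; the final besteffort clamp is truncated in the hunk above, so its body (subtracting one) is an assumption here:

```go
package main

import "fmt"

// Constants as defined in pkg/kubelet/qos/policy.go after this change.
const (
	guaranteedOOMScoreAdj = -998
	besteffortOOMScoreAdj = 1000
)

// burstableOOMScoreAdjust mirrors the burstable-container arithmetic: the
// larger the memory request relative to node capacity, the lower (safer)
// the score, clamped strictly between the guaranteed and besteffort tiers.
func burstableOOMScoreAdjust(memoryRequest, memoryCapacity int64) int {
	oomScoreAdjust := 1000 - (1000*memoryRequest)/memoryCapacity
	if int(oomScoreAdjust) < (1000 + guaranteedOOMScoreAdj) {
		return 1000 + guaranteedOOMScoreAdj // clamp to 2, just above guaranteed pods
	}
	if int(oomScoreAdjust) == besteffortOOMScoreAdj {
		return int(oomScoreAdjust) - 1 // assumed clamp: stay below besteffort pods
	}
	return int(oomScoreAdjust)
}

func main() {
	capacity := int64(10 << 30) // assume a 10 GiB node
	for _, req := range []int64{0, 1 << 30, 5 << 30, 10 << 30} {
		fmt.Printf("request %d GiB -> oom_score_adj %d\n",
			req>>30, burstableOOMScoreAdjust(req, capacity))
	}
}
```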
pkg/util/oom/oom_linux.go: 21 changes (9 additions & 12 deletions)
@@ -20,6 +20,7 @@ package oom

import (
"fmt"
"io/ioutil"
"os"
"path"
"strconv"
@@ -65,28 +66,24 @@ func applyOOMScoreAdj(pid int, oomScoreAdj int) error {
maxTries := 2
oomScoreAdjPath := path.Join("/proc", pidStr, "oom_score_adj")
value := strconv.Itoa(oomScoreAdj)
glog.V(4).Infof("attempting to set %q to %q", oomScoreAdjPath, value)
var err error
for i := 0; i < maxTries; i++ {
f, err := os.Open(oomScoreAdjPath)
err = ioutil.WriteFile(oomScoreAdjPath, []byte(value), 0700)
if err != nil {
if os.IsNotExist(err) {
glog.V(2).Infof("%q does not exist", oomScoreAdjPath)
return os.ErrNotExist
}
err = fmt.Errorf("failed to apply oom-score-adj to pid %d (%v)", pid, err)
continue
}
if _, err := f.Write([]byte(value)); err != nil {
// we can ignore the return value of f.Close() here.
f.Close()
err = fmt.Errorf("failed to apply oom-score-adj to pid %d (%v)", pid, err)
continue
}
if err = f.Close(); err != nil {
err = fmt.Errorf("failed to apply oom-score-adj to pid %d (%v)", pid, err)

glog.V(3).Info(err)
continue
}
return nil
}
if err != nil {
glog.V(2).Infof("failed to set %q to %q: %v", oomScoreAdjPath, value, err)
}
return err
}
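For reference, callers do not invoke applyOOMScoreAdj directly; they go through the OOMAdjuster constructed with oom.NewOOMAdjuster(), as container_manager_linux.go does above. A minimal usage sketch, assuming the k8s.io/kubernetes/pkg/util/oom import path of this era of the tree and sufficient privileges (root or CAP_SYS_RESOURCE) to write a negative score:

```go
package main

import (
	"fmt"
	"os"

	"k8s.io/kubernetes/pkg/util/oom" // import path assumed for this era of the tree
)

func main() {
	// Apply the kubelet's protective score (-999) to the current process,
	// the same call the container manager now makes for kubelet and docker.
	oomAdjuster := oom.NewOOMAdjuster()
	if err := oomAdjuster.ApplyOOMScoreAdj(os.Getpid(), -999); err != nil {
		fmt.Fprintf(os.Stderr, "failed to apply oom_score_adj: %v\n", err)
		os.Exit(1)
	}
	fmt.Println("oom_score_adj applied")
}
```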
