From 108ee85b8271e106b2214da18766833f2cd8883d Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Mon, 29 Jun 2020 16:51:05 -0700 Subject: [PATCH] libct/cgroups: add SkipDevices to Resources The kubelet uses libct/cgroups code to set up cgroups. It creates a parent cgroup (kubepods) to put the containers into. The problem (for cgroupv2 that uses eBPF for device configuration) is the hard requirement to have devices cgroup configured results in leaking an eBPF program upon every kubelet restart. program. If kubelet is restarted 64+ times, the cgroup can't be configured anymore. Work around this by adding a SkipDevices flag to Resources. A check was added so that if SkipDevices is set, such a "container" can't be started (to make sure it is only used for non-containers). Signed-off-by: Kir Kolyshkin --- libcontainer/cgroups/fs/devices.go | 5 ++++- libcontainer/cgroups/fs/fs.go | 2 +- libcontainer/cgroups/fs2/devices.go | 3 +++ libcontainer/cgroups/systemd/v1.go | 27 +++++++++++++++------------ libcontainer/cgroups/systemd/v2.go | 27 +++++++++++++++------------ libcontainer/configs/cgroup_linux.go | 7 +++++++ libcontainer/container_linux.go | 3 +++ 7 files changed, 48 insertions(+), 26 deletions(-) diff --git a/libcontainer/cgroups/fs/devices.go b/libcontainer/cgroups/fs/devices.go index a70619cb..6a8f1d35 100644 --- a/libcontainer/cgroups/fs/devices.go +++ b/libcontainer/cgroups/fs/devices.go @@ -23,6 +23,9 @@ func (s *DevicesGroup) Name() string { } func (s *DevicesGroup) Apply(d *cgroupData) error { + if d.config.SkipDevices { + return nil + } _, err := d.join("devices") if err != nil { // We will return error even it's `not found` error, devices @@ -52,7 +55,7 @@ func buildEmulator(rules []*configs.DeviceRule) (*devices.Emulator, error) { } func (s *DevicesGroup) Set(path string, cgroup *configs.Cgroup) error { - if system.RunningInUserNS() { + if system.RunningInUserNS() || cgroup.SkipDevices { return nil } diff --git a/libcontainer/cgroups/fs/fs.go b/libcontainer/cgroups/fs/fs.go index 245ce84f..f00fa11a 100644 --- a/libcontainer/cgroups/fs/fs.go +++ b/libcontainer/cgroups/fs/fs.go @@ -204,7 +204,7 @@ func (m *manager) Apply(pid int) (err error) { if err != nil { // The non-presence of the devices subsystem is // considered fatal for security reasons. - if cgroups.IsNotFound(err) && sys.Name() != "devices" { + if cgroups.IsNotFound(err) && (c.SkipDevices || sys.Name() != "devices") { continue } return err diff --git a/libcontainer/cgroups/fs2/devices.go b/libcontainer/cgroups/fs2/devices.go index ee180bda..053ec33e 100644 --- a/libcontainer/cgroups/fs2/devices.go +++ b/libcontainer/cgroups/fs2/devices.go @@ -37,6 +37,9 @@ func canSkipEBPFError(cgroup *configs.Cgroup) bool { } func setDevices(dirPath string, cgroup *configs.Cgroup) error { + if cgroup.SkipDevices { + return nil + } // XXX: This is currently a white-list (but all callers pass a blacklist of // devices). This is bad for a whole variety of reasons, but will need // to be fixed with co-ordinated effort with downstreams. diff --git a/libcontainer/cgroups/systemd/v1.go b/libcontainer/cgroups/systemd/v1.go index 9e8e2fdd..63503ad8 100644 --- a/libcontainer/cgroups/systemd/v1.go +++ b/libcontainer/cgroups/systemd/v1.go @@ -379,24 +379,27 @@ func (m *legacyManager) Set(container *configs.Config) error { return err } - // Figure out the current freezer state, so we can revert to it after we - // temporarily freeze the container. - targetFreezerState, err := m.GetFreezerState() - if err != nil { - return err - } - if targetFreezerState == configs.Undefined { - targetFreezerState = configs.Thawed - } - // We have to freeze the container while systemd sets the cgroup settings. // The reason for this is that systemd's application of DeviceAllow rules // is done disruptively, resulting in spurrious errors to common devices // (unlike our fs driver, they will happily write deny-all rules to running // containers). So we freeze the container to avoid them hitting the cgroup // error. But if the freezer cgroup isn't supported, we just warn about it. - if err := m.Freeze(configs.Frozen); err != nil { - logrus.Infof("freeze container before SetUnitProperties failed: %v", err) + targetFreezerState := configs.Undefined + if !m.cgroups.SkipDevices { + // Figure out the current freezer state, so we can revert to it after we + // temporarily freeze the container. + targetFreezerState, err = m.GetFreezerState() + if err != nil { + return err + } + if targetFreezerState == configs.Undefined { + targetFreezerState = configs.Thawed + } + + if err := m.Freeze(configs.Frozen); err != nil { + logrus.Infof("freeze container before SetUnitProperties failed: %v", err) + } } if err := dbusConnection.SetUnitProperties(getUnitName(container.Cgroups), true, properties...); err != nil { diff --git a/libcontainer/cgroups/systemd/v2.go b/libcontainer/cgroups/systemd/v2.go index 6e343aa9..e1a6622a 100644 --- a/libcontainer/cgroups/systemd/v2.go +++ b/libcontainer/cgroups/systemd/v2.go @@ -298,24 +298,27 @@ func (m *unifiedManager) Set(container *configs.Config) error { return err } - // Figure out the current freezer state, so we can revert to it after we - // temporarily freeze the container. - targetFreezerState, err := m.GetFreezerState() - if err != nil { - return err - } - if targetFreezerState == configs.Undefined { - targetFreezerState = configs.Thawed - } - // We have to freeze the container while systemd sets the cgroup settings. // The reason for this is that systemd's application of DeviceAllow rules // is done disruptively, resulting in spurrious errors to common devices // (unlike our fs driver, they will happily write deny-all rules to running // containers). So we freeze the container to avoid them hitting the cgroup // error. But if the freezer cgroup isn't supported, we just warn about it. - if err := m.Freeze(configs.Frozen); err != nil { - logrus.Infof("freeze container before SetUnitProperties failed: %v", err) + targetFreezerState := configs.Undefined + if !m.cgroups.SkipDevices { + // Figure out the current freezer state, so we can revert to it after we + // temporarily freeze the container. + targetFreezerState, err = m.GetFreezerState() + if err != nil { + return err + } + if targetFreezerState == configs.Undefined { + targetFreezerState = configs.Thawed + } + + if err := m.Freeze(configs.Frozen); err != nil { + logrus.Infof("freeze container before SetUnitProperties failed: %v", err) + } } if err := dbusConnection.SetUnitProperties(getUnitName(m.cgroups), true, properties...); err != nil { diff --git a/libcontainer/configs/cgroup_linux.go b/libcontainer/configs/cgroup_linux.go index f1a5bd11..6e90ae16 100644 --- a/libcontainer/configs/cgroup_linux.go +++ b/libcontainer/configs/cgroup_linux.go @@ -126,4 +126,11 @@ type Resources struct { // CpuWeight sets a proportional bandwidth limit. CpuWeight uint64 `json:"cpu_weight"` + + // SkipDevices allows to skip configuring device permissions. + // Used by e.g. kubelet while creating a parent cgroup (kubepods) + // common for many containers. + // + // NOTE it is impossible to start a container which has this flag set. + SkipDevices bool `json:"skip_devices"` } diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index b4d722b0..c5188b1d 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -251,6 +251,9 @@ func (c *linuxContainer) Set(config configs.Config) error { func (c *linuxContainer) Start(process *Process) error { c.m.Lock() defer c.m.Unlock() + if c.config.Cgroups.Resources.SkipDevices { + return newGenericError(errors.New("can't start container with SkipDevices set"), ConfigInvalid) + } if process.Init { if err := c.createExecFifo(); err != nil { return err