libct/cgroups: add SkipDevices to Resources

The kubelet uses libct/cgroups code to set up cgroups. It creates a parent cgroup (kubepods) to put the containers into. The problem (for cgroupv2 that uses eBPF for device configuration) is the hard requirement to have devices cgroup configured results in leaking an eBPF program upon every kubelet restart. program. If kubelet is restarted 64+ times, the cgroup can't be configured anymore. Work around this by adding a SkipDevices flag to Resources. A check was added so that if SkipDevices is set, such a "container" can't be started (to make sure it is only used for non-containers). Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2020-06-29 16:51:05 -07:00 · 2020-06-29 16:51:05 -07:00 · 108ee85b82
parent 1b94395c06
commit 108ee85b82
7 changed files with 48 additions and 26 deletions
--- a/libcontainer/cgroups/fs/devices.go
+++ b/libcontainer/cgroups/fs/devices.go
@ -23,6 +23,9 @@ func (s *DevicesGroup) Name() string {
 }

 func (s *DevicesGroup) Apply(d *cgroupData) error {
+	if d.config.SkipDevices {
+		return nil
+	}
 	_, err := d.join("devices")
 	if err != nil {
 		// We will return error even it's `not found` error, devices
@ -52,7 +55,7 @@ func buildEmulator(rules []*configs.DeviceRule) (*devices.Emulator, error) {
 }

 func (s *DevicesGroup) Set(path string, cgroup *configs.Cgroup) error {
-	if system.RunningInUserNS() {
+	if system.RunningInUserNS() || cgroup.SkipDevices {
 		return nil
 	}

--- a/libcontainer/cgroups/fs/fs.go
+++ b/libcontainer/cgroups/fs/fs.go
@ -204,7 +204,7 @@ func (m *manager) Apply(pid int) (err error) {
 		if err != nil {
 			// The non-presence of the devices subsystem is
 			// considered fatal for security reasons.
-			if cgroups.IsNotFound(err) && sys.Name() != "devices" {
+			if cgroups.IsNotFound(err) && (c.SkipDevices || sys.Name() != "devices") {
 				continue
 			}
 			return err
--- a/libcontainer/cgroups/fs2/devices.go
+++ b/libcontainer/cgroups/fs2/devices.go
@ -37,6 +37,9 @@ func canSkipEBPFError(cgroup *configs.Cgroup) bool {
 }

 func setDevices(dirPath string, cgroup *configs.Cgroup) error {
+	if cgroup.SkipDevices {
+		return nil
+	}
 	// XXX: This is currently a white-list (but all callers pass a blacklist of
 	//      devices). This is bad for a whole variety of reasons, but will need
 	//      to be fixed with co-ordinated effort with downstreams.
--- a/libcontainer/cgroups/systemd/v1.go
+++ b/libcontainer/cgroups/systemd/v1.go
@ -379,24 +379,27 @@ func (m *legacyManager) Set(container *configs.Config) error {
 		return err
 	}

-	// Figure out the current freezer state, so we can revert to it after we
-	// temporarily freeze the container.
-	targetFreezerState, err := m.GetFreezerState()
-	if err != nil {
-		return err
-	}
-	if targetFreezerState == configs.Undefined {
-		targetFreezerState = configs.Thawed
-	}
-
 	// We have to freeze the container while systemd sets the cgroup settings.
 	// The reason for this is that systemd's application of DeviceAllow rules
 	// is done disruptively, resulting in spurrious errors to common devices
 	// (unlike our fs driver, they will happily write deny-all rules to running
 	// containers). So we freeze the container to avoid them hitting the cgroup
 	// error. But if the freezer cgroup isn't supported, we just warn about it.
-	if err := m.Freeze(configs.Frozen); err != nil {
-		logrus.Infof("freeze container before SetUnitProperties failed: %v", err)
+	targetFreezerState := configs.Undefined
+	if !m.cgroups.SkipDevices {
+		// Figure out the current freezer state, so we can revert to it after we
+		// temporarily freeze the container.
+		targetFreezerState, err = m.GetFreezerState()
+		if err != nil {
+			return err
+		}
+		if targetFreezerState == configs.Undefined {
+			targetFreezerState = configs.Thawed
+		}
+
+		if err := m.Freeze(configs.Frozen); err != nil {
+			logrus.Infof("freeze container before SetUnitProperties failed: %v", err)
+		}
 	}

 	if err := dbusConnection.SetUnitProperties(getUnitName(container.Cgroups), true, properties...); err != nil {
--- a/libcontainer/cgroups/systemd/v2.go
+++ b/libcontainer/cgroups/systemd/v2.go
@ -298,24 +298,27 @@ func (m *unifiedManager) Set(container *configs.Config) error {
 		return err
 	}

-	// Figure out the current freezer state, so we can revert to it after we
-	// temporarily freeze the container.
-	targetFreezerState, err := m.GetFreezerState()
-	if err != nil {
-		return err
-	}
-	if targetFreezerState == configs.Undefined {
-		targetFreezerState = configs.Thawed
-	}
-
 	// We have to freeze the container while systemd sets the cgroup settings.
 	// The reason for this is that systemd's application of DeviceAllow rules
 	// is done disruptively, resulting in spurrious errors to common devices
 	// (unlike our fs driver, they will happily write deny-all rules to running
 	// containers). So we freeze the container to avoid them hitting the cgroup
 	// error. But if the freezer cgroup isn't supported, we just warn about it.
-	if err := m.Freeze(configs.Frozen); err != nil {
-		logrus.Infof("freeze container before SetUnitProperties failed: %v", err)
+	targetFreezerState := configs.Undefined
+	if !m.cgroups.SkipDevices {
+		// Figure out the current freezer state, so we can revert to it after we
+		// temporarily freeze the container.
+		targetFreezerState, err = m.GetFreezerState()
+		if err != nil {
+			return err
+		}
+		if targetFreezerState == configs.Undefined {
+			targetFreezerState = configs.Thawed
+		}
+
+		if err := m.Freeze(configs.Frozen); err != nil {
+			logrus.Infof("freeze container before SetUnitProperties failed: %v", err)
+		}
 	}

 	if err := dbusConnection.SetUnitProperties(getUnitName(m.cgroups), true, properties...); err != nil {
--- a/libcontainer/configs/cgroup_linux.go
+++ b/libcontainer/configs/cgroup_linux.go
@ -126,4 +126,11 @@ type Resources struct {

 	// CpuWeight sets a proportional bandwidth limit.
 	CpuWeight uint64 `json:"cpu_weight"`
+
+	// SkipDevices allows to skip configuring device permissions.
+	// Used by e.g. kubelet while creating a parent cgroup (kubepods)
+	// common for many containers.
+	//
+	// NOTE it is impossible to start a container which has this flag set.
+	SkipDevices bool `json:"skip_devices"`
 }
--- a/libcontainer/container_linux.go
+++ b/libcontainer/container_linux.go
@ -251,6 +251,9 @@ func (c *linuxContainer) Set(config configs.Config) error {
 func (c *linuxContainer) Start(process *Process) error {
 	c.m.Lock()
 	defer c.m.Unlock()
+	if c.config.Cgroups.Resources.SkipDevices {
+		return newGenericError(errors.New("can't start container with SkipDevices set"), ConfigInvalid)
+	}
 	if process.Init {
 		if err := c.createExecFifo(); err != nil {
 			return err