libct/cgroups: add SkipDevices to Resources

The kubelet uses libct/cgroups code to set up cgroups. It creates a
parent cgroup (kubepods) to put the containers into.

The problem (for cgroupv2 that uses eBPF for device configuration) is
the hard requirement to have devices cgroup configured results in
leaking an eBPF program upon every kubelet restart.  program. If kubelet
is restarted 64+ times, the cgroup can't be configured anymore.

Work around this by adding a SkipDevices flag to Resources.

A check was added so that if SkipDevices is set, such a "container"
can't be started (to make sure it is only used for non-containers).

Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
This commit is contained in:
Kir Kolyshkin 2020-06-29 16:51:05 -07:00
parent 1b94395c06
commit 108ee85b82
7 changed files with 48 additions and 26 deletions

View File

@ -23,6 +23,9 @@ func (s *DevicesGroup) Name() string {
}
func (s *DevicesGroup) Apply(d *cgroupData) error {
if d.config.SkipDevices {
return nil
}
_, err := d.join("devices")
if err != nil {
// We will return error even it's `not found` error, devices
@ -52,7 +55,7 @@ func buildEmulator(rules []*configs.DeviceRule) (*devices.Emulator, error) {
}
func (s *DevicesGroup) Set(path string, cgroup *configs.Cgroup) error {
if system.RunningInUserNS() {
if system.RunningInUserNS() || cgroup.SkipDevices {
return nil
}

View File

@ -204,7 +204,7 @@ func (m *manager) Apply(pid int) (err error) {
if err != nil {
// The non-presence of the devices subsystem is
// considered fatal for security reasons.
if cgroups.IsNotFound(err) && sys.Name() != "devices" {
if cgroups.IsNotFound(err) && (c.SkipDevices || sys.Name() != "devices") {
continue
}
return err

View File

@ -37,6 +37,9 @@ func canSkipEBPFError(cgroup *configs.Cgroup) bool {
}
func setDevices(dirPath string, cgroup *configs.Cgroup) error {
if cgroup.SkipDevices {
return nil
}
// XXX: This is currently a white-list (but all callers pass a blacklist of
// devices). This is bad for a whole variety of reasons, but will need
// to be fixed with co-ordinated effort with downstreams.

View File

@ -379,24 +379,27 @@ func (m *legacyManager) Set(container *configs.Config) error {
return err
}
// Figure out the current freezer state, so we can revert to it after we
// temporarily freeze the container.
targetFreezerState, err := m.GetFreezerState()
if err != nil {
return err
}
if targetFreezerState == configs.Undefined {
targetFreezerState = configs.Thawed
}
// We have to freeze the container while systemd sets the cgroup settings.
// The reason for this is that systemd's application of DeviceAllow rules
// is done disruptively, resulting in spurrious errors to common devices
// (unlike our fs driver, they will happily write deny-all rules to running
// containers). So we freeze the container to avoid them hitting the cgroup
// error. But if the freezer cgroup isn't supported, we just warn about it.
if err := m.Freeze(configs.Frozen); err != nil {
logrus.Infof("freeze container before SetUnitProperties failed: %v", err)
targetFreezerState := configs.Undefined
if !m.cgroups.SkipDevices {
// Figure out the current freezer state, so we can revert to it after we
// temporarily freeze the container.
targetFreezerState, err = m.GetFreezerState()
if err != nil {
return err
}
if targetFreezerState == configs.Undefined {
targetFreezerState = configs.Thawed
}
if err := m.Freeze(configs.Frozen); err != nil {
logrus.Infof("freeze container before SetUnitProperties failed: %v", err)
}
}
if err := dbusConnection.SetUnitProperties(getUnitName(container.Cgroups), true, properties...); err != nil {

View File

@ -298,24 +298,27 @@ func (m *unifiedManager) Set(container *configs.Config) error {
return err
}
// Figure out the current freezer state, so we can revert to it after we
// temporarily freeze the container.
targetFreezerState, err := m.GetFreezerState()
if err != nil {
return err
}
if targetFreezerState == configs.Undefined {
targetFreezerState = configs.Thawed
}
// We have to freeze the container while systemd sets the cgroup settings.
// The reason for this is that systemd's application of DeviceAllow rules
// is done disruptively, resulting in spurrious errors to common devices
// (unlike our fs driver, they will happily write deny-all rules to running
// containers). So we freeze the container to avoid them hitting the cgroup
// error. But if the freezer cgroup isn't supported, we just warn about it.
if err := m.Freeze(configs.Frozen); err != nil {
logrus.Infof("freeze container before SetUnitProperties failed: %v", err)
targetFreezerState := configs.Undefined
if !m.cgroups.SkipDevices {
// Figure out the current freezer state, so we can revert to it after we
// temporarily freeze the container.
targetFreezerState, err = m.GetFreezerState()
if err != nil {
return err
}
if targetFreezerState == configs.Undefined {
targetFreezerState = configs.Thawed
}
if err := m.Freeze(configs.Frozen); err != nil {
logrus.Infof("freeze container before SetUnitProperties failed: %v", err)
}
}
if err := dbusConnection.SetUnitProperties(getUnitName(m.cgroups), true, properties...); err != nil {

View File

@ -126,4 +126,11 @@ type Resources struct {
// CpuWeight sets a proportional bandwidth limit.
CpuWeight uint64 `json:"cpu_weight"`
// SkipDevices allows to skip configuring device permissions.
// Used by e.g. kubelet while creating a parent cgroup (kubepods)
// common for many containers.
//
// NOTE it is impossible to start a container which has this flag set.
SkipDevices bool `json:"skip_devices"`
}

View File

@ -251,6 +251,9 @@ func (c *linuxContainer) Set(config configs.Config) error {
func (c *linuxContainer) Start(process *Process) error {
c.m.Lock()
defer c.m.Unlock()
if c.config.Cgroups.Resources.SkipDevices {
return newGenericError(errors.New("can't start container with SkipDevices set"), ConfigInvalid)
}
if process.Init {
if err := c.createExecFifo(); err != nil {
return err