diff --git a/cgroups/fs/notify_linux.go b/cgroups/fs/notify_linux.go new file mode 100644 index 00000000..37893ff2 --- /dev/null +++ b/cgroups/fs/notify_linux.go @@ -0,0 +1,81 @@ +// +build linux + +package fs + +import ( + "fmt" + "os" + "path/filepath" + "syscall" + + "github.com/docker/libcontainer/cgroups" +) + +// NotifyOnOOM sends signals on the returned channel when the cgroup reaches +// its memory limit. The channel is closed when the cgroup is removed. +func NotifyOnOOM(c *cgroups.Cgroup) (chan struct{}, error) { + d, err := getCgroupData(c, 0) + if err != nil { + return nil, err + } + + return notifyOnOOM(d) +} + +func notifyOnOOM(d *data) (chan struct{}, error) { + dir, err := d.path("memory") + if err != nil { + return nil, err + } + + fd, _, syserr := syscall.RawSyscall(syscall.SYS_EVENTFD2, 0, syscall.FD_CLOEXEC, 0) + if syserr != 0 { + return nil, syserr + } + + eventfd := os.NewFile(fd, "eventfd") + + oomControl, err := os.Open(filepath.Join(dir, "memory.oom_control")) + if err != nil { + eventfd.Close() + return nil, err + } + + var ( + eventControlPath = filepath.Join(dir, "cgroup.event_control") + + data = fmt.Sprintf("%d %d", eventfd.Fd(), oomControl.Fd()) + ) + + if err := writeFile(dir, "cgroup.event_control", data); err != nil { + eventfd.Close() + oomControl.Close() + return nil, err + } + + ch := make(chan struct{}) + + go func() { + defer close(ch) + defer eventfd.Close() + defer oomControl.Close() + + var buf = make([]byte, 8) + + for { + if _, err := eventfd.Read(buf); err != nil { + return + } + + // When a cgroup is destroyed, an event is sent to eventfd. + // So if the control path is gone, return instead of notifying. + if _, err := os.Lstat(eventControlPath); os.IsNotExist(err) { + return + } + + ch <- struct{}{} + } + }() + + return ch, nil +} diff --git a/cgroups/fs/notify_linux_test.go b/cgroups/fs/notify_linux_test.go new file mode 100644 index 00000000..a11880cb --- /dev/null +++ b/cgroups/fs/notify_linux_test.go @@ -0,0 +1,86 @@ +// +build linux + +package fs + +import ( + "encoding/binary" + "fmt" + "syscall" + "testing" + "time" +) + +func TestNotifyOnOOM(t *testing.T) { + helper := NewCgroupTestUtil("memory", t) + defer helper.cleanup() + + helper.writeFileContents(map[string]string{ + "memory.oom_control": "", + "cgroup.event_control": "", + }) + + var eventFd, oomControlFd int + + ooms, err := notifyOnOOM(helper.CgroupData) + if err != nil { + t.Fatal("expected no error, got:", err) + } + + memoryPath, _ := helper.CgroupData.path("memory") + data, err := readFile(memoryPath, "cgroup.event_control") + if err != nil { + t.Fatal("couldn't read event control file:", err) + } + + if _, err := fmt.Sscanf(data, "%d %d", &eventFd, &oomControlFd); err != nil { + t.Fatalf("invalid control data %q: %s", data, err) + } + + // re-open the eventfd + efd, err := syscall.Dup(eventFd) + if err != nil { + t.Fatal("unable to reopen eventfd:", err) + } + defer syscall.Close(efd) + + if err != nil { + t.Fatal("unable to dup event fd:", err) + } + + buf := make([]byte, 8) + binary.LittleEndian.PutUint64(buf, 1) + + if _, err := syscall.Write(efd, buf); err != nil { + t.Fatal("unable to write to eventfd:", err) + } + + select { + case <-ooms: + case <-time.After(100 * time.Millisecond): + t.Fatal("no notification on oom channel after 100ms") + } + + // simulate what happens when a cgroup is destroyed by cleaning up and then + // writing to the eventfd. + helper.cleanup() + if _, err := syscall.Write(efd, buf); err != nil { + t.Fatal("unable to write to eventfd:", err) + } + + // give things a moment to shut down + select { + case _, ok := <-ooms: + if ok { + t.Fatal("expected no oom to be triggered") + } + case <-time.After(100 * time.Millisecond): + } + + if _, _, err := syscall.Syscall(syscall.SYS_FCNTL, uintptr(oomControlFd), syscall.F_GETFD, 0); err != syscall.EBADF { + t.Error("expected oom control to be closed") + } + + if _, _, err := syscall.Syscall(syscall.SYS_FCNTL, uintptr(eventFd), syscall.F_GETFD, 0); err != syscall.EBADF { + t.Error("expected event fd to be closed") + } +}