diff --git a/Makefile b/Makefile index 6c435b19..1a2e23e0 100644 --- a/Makefile +++ b/Makefile @@ -18,8 +18,6 @@ direct-test-short: go test $(TEST_TAGS) -cover -test.short -v $(GO_PACKAGES) direct-build: - chmod 755 hack/seccomp.sh - hack/seccomp.sh go build -v $(GO_PACKAGES) direct-install: diff --git a/configs/config.go b/configs/config.go index e75e5701..7275b642 100644 --- a/configs/config.go +++ b/configs/config.go @@ -13,8 +13,38 @@ type IDMap struct { Size int `json:"size"` } -type SeccompConf struct { - SysCalls []int `json:"syscalls"` +type Seccomp struct { + Syscalls []*Syscall `json:"syscalls"` +} + +type Action int + +const ( + Kill Action = iota - 3 + Trap + Allow +) + +type Operator int + +const ( + EqualTo Operator = iota + NotEqualTo + GreatherThan + LessThan + MaskEqualTo +) + +type Arg struct { + Index int `json:"index"` + Value uint32 `json:"value"` + Op Operator `json:"op"` +} + +type Syscall struct { + Value int `json:"value"` + Action Action `json:"action"` + Args []*Arg `json:"args"` } // TODO Windows. Many of these fields should be factored out into those parts @@ -109,6 +139,8 @@ type Config struct { // sysctl -w my.property.name value in Linux. SystemProperties map[string]string `json:"system_properties"` - // SysCalls specify the system calls to keep when executing the process inside the container - Seccomps SeccompConf `json:"seccomp"` + // Seccomp allows actions to be taken whenever a syscall is made within the container. + // By default, all syscalls are allowed with actions to allow, trap, kill, or return an errno + // can be specified on a per syscall basis. 
+ Seccomp *Seccomp `json:"seccomp"` } diff --git a/configs/namespaces_syscall.go b/configs/namespaces_syscall.go index d3bd3893..c962999e 100644 --- a/configs/namespaces_syscall.go +++ b/configs/namespaces_syscall.go @@ -4,22 +4,17 @@ package configs import "syscall" -var ( - CLONE_SECCOMP = 0x10000 //diffrent from other flag, hard code -) - func (n *Namespace) Syscall() int { return namespaceInfo[n.Type] } var namespaceInfo = map[NamespaceType]int{ - NEWNET: syscall.CLONE_NEWNET, - NEWNS: syscall.CLONE_NEWNS, - NEWUSER: syscall.CLONE_NEWUSER, - NEWIPC: syscall.CLONE_NEWIPC, - NEWUTS: syscall.CLONE_NEWUTS, - NEWPID: syscall.CLONE_NEWPID, - NEWSECCOMP: CLONE_SECCOMP, + NEWNET: syscall.CLONE_NEWNET, + NEWNS: syscall.CLONE_NEWNS, + NEWUSER: syscall.CLONE_NEWUSER, + NEWIPC: syscall.CLONE_NEWIPC, + NEWUTS: syscall.CLONE_NEWUTS, + NEWPID: syscall.CLONE_NEWPID, } // CloneFlags parses the container's Namespaces options to set the correct diff --git a/configs/namespaces_unix.go b/configs/namespaces_unix.go index 61dd74b8..7bc90854 100644 --- a/configs/namespaces_unix.go +++ b/configs/namespaces_unix.go @@ -5,13 +5,12 @@ package configs import "fmt" const ( - NEWNET NamespaceType = "NEWNET" - NEWPID NamespaceType = "NEWPID" - NEWNS NamespaceType = "NEWNS" - NEWUTS NamespaceType = "NEWUTS" - NEWIPC NamespaceType = "NEWIPC" - NEWUSER NamespaceType = "NEWUSER" - NEWSECCOMP NamespaceType = "NEWSECCOMP" + NEWNET NamespaceType = "NEWNET" + NEWPID NamespaceType = "NEWPID" + NEWNS NamespaceType = "NEWNS" + NEWUTS NamespaceType = "NEWUTS" + NEWIPC NamespaceType = "NEWIPC" + NEWUSER NamespaceType = "NEWUSER" ) func NamespaceTypes() []NamespaceType { diff --git a/container_linux.go b/container_linux.go index b833c9e5..215f35d3 100644 --- a/container_linux.go +++ b/container_linux.go @@ -169,13 +169,6 @@ func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, c cmd.SysProcAttr.Credential = &syscall.Credential{} } } - if cloneFlags&uintptr(configs.CLONE_SECCOMP) != 
0 { - //os don't surport for CLONE_SECCOMP, remote it - c.config.Namespaces.Remove(configs.NEWSECCOMP) - cloneFlags = c.config.Namespaces.CloneFlags() - } else { - c.config.Seccomps.SysCalls = []int{} - } cmd.Env = append(cmd.Env, t) cmd.SysProcAttr.Cloneflags = cloneFlags return &initProcess{ diff --git a/hack/seccomp.pl b/hack/seccomp.pl deleted file mode 100755 index dc0f6646..00000000 --- a/hack/seccomp.pl +++ /dev/null @@ -1,58 +0,0 @@ -#!/usr/bin/perl - -# ./seccomp.pl < syscall.sample > seccompsyscall.go - -use strict; -use warnings; - -my $pid = open(my $in, "-|") // die "Couldn't fork1 ($!)\n"; - -if($pid == 0) { - $pid = open(my $out, "|-") // die "Couldn't fork2 ($!)\n"; - if($pid == 0) { - exec "cpp" or die "Couldn't exec cpp ($!)\n"; - exit 1; - } - - print $out "#include \n"; - while(<>) { - if(/^\w/) { - my $name="$_"; - chomp($name); - - print $out $name; - print $out " = "; - print $out "__NR_$_"; - } - } - close $out; - exit 0; -} -print "//"; -system("uname -m"); -print "package seccomp\r\n\r\n"; -print "var syscallMap = map[string] int {\n"; -while(<$in>) { - my $line=$_; - - if($line =~ /^[\da-z_]/) - { - my @personal=split(/=/); - $personal[0] =~ s/[ ]//; - $personal[1] =~ s/[\r\n]//; - print " \""; - print $personal[0]; - print "\""; - print " : "; - if (($personal[1] !~ /[0-9]/) || length($personal[1]) > 4) - { - print "-1,\r\n"; - }else{ - print $personal[1]; - print ",\r\n"; - } - } -} - -print "}\r\n"; - diff --git a/hack/seccomp.sh b/hack/seccomp.sh deleted file mode 100755 index 40fa02c5..00000000 --- a/hack/seccomp.sh +++ /dev/null @@ -1,4 +0,0 @@ -#/bin/bash - -chmod 755 hack/seccomp.pl -hack/seccomp.pl < hack/syscall.sample > seccomp/seccompsyscall.go diff --git a/hack/syscall.sample b/hack/syscall.sample deleted file mode 100644 index b1f61d5d..00000000 --- a/hack/syscall.sample +++ /dev/null @@ -1,405 +0,0 @@ -access -chdir -chmod -chown -chown32 -close -creat -dup -dup2 -dup3 -epoll_create -epoll_create1 -epoll_ctl -epoll_ctl_old 
-epoll_pwait -epoll_wait -epoll_wait_old -eventfd -eventfd2 -faccessat -fadvise64 -fadvise64_64 -fallocate -fanotify_init -fanotify_mark -ioctl -fchdir -fchmod -fchmodat -fchown -fchown32 -fchownat -fcntl -fcntl64 -fdatasync -fgetxattr -flistxattr -flock -fremovexattr -fsetxattr -fstat -fstat64 -fstatat64 -fstatfs -fstatfs64 -fsync -ftruncate -ftruncate64 -getcwd -getdents -getdents64 -getxattr -inotify_add_watch -inotify_init -inotify_init1 -inotify_rm_watch -io_cancel -io_destroy -io_getevents -io_setup -io_submit -lchown -lchown32 -lgetxattr -link -linkat -listxattr -llistxattr -llseek -_llseek -lremovexattr -lseek -lsetxattr -lstat -lstat64 -mkdir -mkdirat -mknod -mknodat -newfstatat -_newselect -oldfstat -oldlstat -oldolduname -oldstat -olduname -oldwait4 -open -openat -pipe -pipe2 -poll -ppoll -pread64 -preadv -futimesat -pselect6 -pwrite64 -pwritev -read -readahead -readdir -readlink -readlinkat -readv -removexattr -rename -renameat -rmdir -select -sendfile -sendfile64 -setxattr -splice -stat -stat64 -statfs -statfs64 -symlink -symlinkat -sync -sync_file_range -sync_file_range2 -syncfs -tee -truncate -truncate64 -umask -unlink -unlinkat -ustat -utime -utimensat -utimes -write -writev - -// Network related -accept -accept4 -bind -connect -getpeername -getsockname -getsockopt -listen -recv -recvfrom -recvmmsg -recvmsg -send -sendmmsg -sendmsg -sendto -setsockopt -shutdown -socket -socketcall -socketpair -sethostname - -// Signal related -pause -rt_sigaction -rt_sigpending -rt_sigprocmask -rt_sigqueueinfo -rt_sigreturn -rt_sigsuspend -rt_sigtimedwait -rt_tgsigqueueinfo -sigaction -sigaltstack -signal -signalfd -signalfd4 -sigpending -sigprocmask -sigreturn -sigsuspend - -// Other needed POSIX -alarm -brk -clock_adjtime -clock_getres -clock_gettime -clock_nanosleep -clock_settime -gettimeofday -nanosleep -nice -sysinfo -syslog -time -timer_create -timer_delete -timerfd_create -timerfd_gettime -timerfd_settime -timer_getoverrun -timer_gettime -timer_settime 
-times -uname - -// Memory control -madvise -mbind -mincore -mlock -mlockall -mmap -mmap2 -mprotect -mremap -msync -munlock -munlockall -munmap -remap_file_pages -set_mempolicy -vmsplice - -// Process control -capget -capset -clone -execve -exit -exit_group -fork -getcpu -getpgid -getpgrp -getpid -getppid -getpriority -getresgid -getresgid32 -getresuid -getresuid32 -getrlimit -getrusage -getsid -getuid -getuid32 -getegid -getegid32 -geteuid -geteuid32 -getgid -getgid32 -getgroups -getgroups32 -getitimer -get_mempolicy -kill -prctl -prlimit64 -sched_getaffinity -sched_getparam -sched_get_priority_max -sched_get_priority_min -sched_getscheduler -sched_rr_get_interval -sched_setaffinity -sched_setparam -sched_setscheduler -sched_yield -setfsgid -setfsgid32 -setfsuid -setfsuid32 -setgid -setgid32 -setgroups -setgroups32 -setitimer -setpgid -setpriority -setregid -setregid32 -setresgid -setresgid32 -setresuid -setresuid32 -setreuid -setreuid32 -setrlimit -setsid -setuid -setuid32 -ugetrlimit -vfork -wait4 -waitid -waitpid - -// IPC -ipc -mq_getsetattr -mq_notify -mq_open -mq_timedreceive -mq_timedsend -mq_unlink -msgctl -msgget -msgrcv -msgsnd -semctl -semget -semop -semtimedop -shmat -shmctl -shmdt -shmget - -// Linux specific, mostly needed for thread-related stuff -arch_prctl -get_robust_list -get_thread_area -gettid -futex -restart_syscall -set_robust_list -set_thread_area -set_tid_address -tgkill -tkill - -// Admin syscalls, these are blocked -acct -adjtimex -bdflush -chroot -create_module -delete_module -get_kernel_syms -idle -init_module -ioperm -iopl -ioprio_get -ioprio_set -kexec_load -lookup_dcookie -migrate_pages -modify_ldt -mount -move_pages -name_to_handle_at -nfsservctl -open_by_handle_at -perf_event_open -pivot_root -process_vm_readv -process_vm_writev -ptrace -query_module -quotactl -reboot -setdomainname -setns -settimeofday -sgetmask -ssetmask -stime -swapoff -swapon -_sysctl -sysfs -sys_setaltroot -umount -umount2 -unshare -uselib -vhangup -vm86 
-vm86old - -// Kernel key management -add_key -keyctl -request_key - -// Unimplemented -afs_syscall -break -ftime -getpmsg -gtty -lock -madvise1 -mpx -prof -profil -putpmsg -security -stty -tuxcall -ulimit -vserver diff --git a/init_linux.go b/init_linux.go index bd97364e..3eabe3cd 100644 --- a/init_linux.go +++ b/init_linux.go @@ -262,13 +262,59 @@ func killCgroupProcesses(m cgroups.Manager) error { } func finalizeSeccomp(config *initConfig) error { - if len(config.Config.Seccomps.SysCalls) > 0 { - scmpCtx, _ := seccomp.ScmpInit(seccomp.ScmpActAllow) - for _, key := range config.Config.Seccomps.SysCalls { - seccomp.ScmpAdd(scmpCtx, key, seccomp.ScmpActAllow) - } - return seccomp.ScmpLoad(scmpCtx) + if config.Config.Seccomp == nil { + return nil } - - return nil + context := seccomp.New() + for _, s := range config.Config.Seccomp.Syscalls { + ss := &seccomp.Syscall{ + Value: uint32(s.Value), + Action: seccompAction(s.Action), + } + if len(s.Args) > 0 { + ss.Args = seccompArgs(s.Args) + } + context.Add(ss) + } + return context.Load() +} + +func seccompAction(a configs.Action) seccomp.Action { + switch a { + case configs.Kill: + return seccomp.Kill + case configs.Trap: + return seccomp.Trap + case configs.Allow: + return seccomp.Allow + } + return seccomp.Error(syscall.Errno(int(a))) +} + +func seccompArgs(args []*configs.Arg) seccomp.Args { + var sa []seccomp.Arg + for _, a := range args { + sa = append(sa, seccomp.Arg{ + Index: uint32(a.Index), + Op: seccompOperator(a.Op), + Value: uint(a.Value), + }) + } + return seccomp.Args{sa} +} + +func seccompOperator(o configs.Operator) seccomp.Operator { + switch o { + case configs.EqualTo: + return seccomp.EqualTo + case configs.NotEqualTo: + return seccomp.NotEqualTo + case configs.GreatherThan: + return seccomp.GreatherThan + case configs.LessThan: + return seccomp.LessThan + case configs.MaskEqualTo: + return seccomp.MaskEqualTo + } + return 0 } diff --git a/integration/exec_test.go b/integration/exec_test.go index 
f9dcc003..3b8a83b7 100644 --- a/integration/exec_test.go +++ b/integration/exec_test.go @@ -1,15 +1,10 @@ package integration import ( - "bufio" "bytes" - "errors" - "fmt" - "io" "io/ioutil" "os" "path/filepath" - "runtime" "strconv" "strings" "syscall" @@ -720,103 +715,7 @@ func TestSystemProperties(t *testing.T) { } } -func genSeccompConfigFile(file string, calls []int) error { - callBegin := 0 - callEnd := 0 - if runtime.GOARCH == "386" { - callEnd = 340 - } else if runtime.GOARCH == "amd64" { - callEnd = 302 - } else if runtime.GOARCH == "arm" { - callEnd = 377 - } else if runtime.GOARCH == "arm64" { - callEnd = 281 - } else if runtime.GOARCH == "ppc64" || runtime.GOARCH == "ppc64le" { - callEnd = 354 - } - - conf := fmt.Sprintf("%d\nwhitelist\n", 1) - i := 0 - nr := callBegin - for nr <= callEnd { - j := 0 - for _, key := range calls { - if nr == key { - break - } - j++ - } - if j == len(calls) { - callfilter := fmt.Sprintf("%d\n", nr) - conf += callfilter - i++ - } - nr++ - } - fout, err := os.Create(file) - defer fout.Close() - if err == nil { - fout.WriteString(conf) - } - return nil -} - -func genSeccompSyscall(configFile string, Seccomps *configs.SeccompConf) error { - f, err := os.Open(configFile) - defer f.Close() - if nil == err { - buff := bufio.NewReader(f) - firstl, err := buff.ReadString('\n') - if err != nil || io.EOF == err { - return errors.New("initSeccomp ReadString, firstl") - } - ver := 0 - fmt.Sscanf(firstl, "%d\n", &ver) - if err != nil || 1 != ver { - return errors.New("initSeccomp Sscanf") - } - - secondl, err := buff.ReadString('\n') - if err != nil || io.EOF == err || strings.EqualFold(secondl, "whitelist") { - return errors.New("initSeccomp ReadString, secondl") - } - nr := 0 - for { - line, err := buff.ReadString('\n') - if err != nil || io.EOF == err { - break - } - fmt.Sscanf(line, "%d\n", &nr) - Seccomps.SysCalls = append(Seccomps.SysCalls, nr) - } - return nil - } - return nil -} - -func TestSeccompNotStat(t *testing.T) { - if 
testing.Short() { - return - } - - rootfs, err := newRootfs() - if err != nil { - t.Fatal(err) - } - defer remove(rootfs) - config := newTemplateConfig(rootfs) - exceptCall := []int{syscall.SYS_STAT} - genSeccompConfigFile("seccomp.conf", exceptCall) - genSeccompSyscall("seccomp.conf", &config.Seccomps) - out, _, err := runContainer(config, "", "/bin/sh", "-c", "ls / -l") - if err == nil { - t.Fatal("runontainer[ls without SYS_STAT] should be failed") - } else { - fmt.Println(out) - } -} - -func TestSeccompStat(t *testing.T) { +func TestSeccompNoChown(t *testing.T) { if testing.Short() { return } @@ -825,14 +724,17 @@ func TestSeccompStat(t *testing.T) { t.Fatal(err) } defer remove(rootfs) - config := newTemplateConfig(rootfs) - exceptCall := []int{} - genSeccompConfigFile("seccomp.conf", exceptCall) - genSeccompSyscall("seccomp.conf", &config.Seccomps) - out, _, err := runContainer(config, "", "/bin/sh", "-c", "ls / -l") - if err != nil { - t.Fatal(err) + config.Seccomp = &configs.Seccomp{} + config.Seccomp.Syscalls = append(config.Seccomp.Syscalls, &configs.Syscall{ + Value: syscall.SYS_CHOWN, + Action: configs.Action(syscall.EPERM), + }) + buffers, _, err := runContainer(config, "", "/bin/sh", "-c", "chown 1:1 /tmp") + if err == nil { + t.Fatal("running chown in a container should fail") + } + if s := buffers.String(); !strings.Contains(s, "not permitted") { + t.Fatalf("running chown should result in an EPERM but got %q", s) } - fmt.Println(out) } diff --git a/integration/template_test.go b/integration/template_test.go index 02a738e9..cb991b41 100644 --- a/integration/template_test.go +++ b/integration/template_test.go @@ -44,7 +44,6 @@ func newTemplateConfig(rootfs string) *configs.Config { {Type: configs.NEWIPC}, {Type: configs.NEWPID}, {Type: configs.NEWNET}, - {Type: configs.NEWSECCOMP}, }), Cgroups: &configs.Cgroup{ Name: "test", @@ -115,8 +114,5 @@ func newTemplateConfig(rootfs string) *configs.Config { Soft: uint64(1025), }, }, - Seccomps: 
configs.SeccompConf{ - SysCalls: make([]int, 0, 512), - }, } } diff --git a/integration/utils_test.go b/integration/utils_test.go index 41b914ca..0f918133 100644 --- a/integration/utils_test.go +++ b/integration/utils_test.go @@ -122,11 +122,11 @@ func runContainer(config *configs.Config, console string, args ...string) (buffe err = container.Start(process) if err != nil { - return nil, -1, err + return buffers, -1, err } ps, err := process.Wait() if err != nil { - return nil, -1, err + return buffers, -1, err } status := ps.Sys().(syscall.WaitStatus) if status.Exited() { @@ -134,7 +134,7 @@ func runContainer(config *configs.Config, console string, args ...string) (buffe } else if status.Signaled() { exitCode = -int(status.Signal()) } else { - return nil, -1, err + return buffers, -1, err } return } diff --git a/nsinit/config.go b/nsinit/config.go index bf3506c2..7fb28a58 100644 --- a/nsinit/config.go +++ b/nsinit/config.go @@ -19,32 +19,33 @@ import ( const defaultMountFlags = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV var createFlags = []cli.Flag{ - cli.IntFlag{Name: "parent-death-signal", Usage: "set the signal that will be delivered to the process in case the parent dies"}, + cli.BoolFlag{Name: "cgroup", Usage: "mount the cgroup data for the container"}, cli.BoolFlag{Name: "read-only", Usage: "set the container's rootfs as read-only"}, - cli.StringSliceFlag{Name: "bind", Value: &cli.StringSlice{}, Usage: "add bind mounts to the container"}, - cli.StringSliceFlag{Name: "tmpfs", Value: &cli.StringSlice{}, Usage: "add tmpfs mounts to the container"}, cli.IntFlag{Name: "cpushares", Usage: "set the cpushares for the container"}, cli.IntFlag{Name: "memory-limit", Usage: "set the memory limit for the container"}, cli.IntFlag{Name: "memory-swap", Usage: "set the memory swap limit for the container"}, + cli.IntFlag{Name: "parent-death-signal", Usage: "set the signal that will be delivered to the process in case the parent dies"}, + cli.IntFlag{Name: 
"userns-root-uid", Usage: "set the user namespace root uid"}, + cli.IntFlag{Name: "veth-mtu", Usage: "veth mtu"}, + cli.StringFlag{Name: "apparmor-profile", Usage: "set the apparmor profile"}, cli.StringFlag{Name: "cpuset-cpus", Usage: "set the cpuset cpus"}, cli.StringFlag{Name: "cpuset-mems", Usage: "set the cpuset mems"}, - cli.StringFlag{Name: "apparmor-profile", Usage: "set the apparmor profile"}, - cli.StringFlag{Name: "process-label", Usage: "set the process label"}, - cli.StringFlag{Name: "mount-label", Usage: "set the mount label"}, - cli.StringFlag{Name: "rootfs", Usage: "set the rootfs"}, - cli.IntFlag{Name: "userns-root-uid", Usage: "set the user namespace root uid"}, cli.StringFlag{Name: "hostname", Value: "nsinit", Usage: "hostname value for the container"}, - cli.StringFlag{Name: "net", Value: "", Usage: "network namespace"}, cli.StringFlag{Name: "ipc", Value: "", Usage: "ipc namespace"}, - cli.StringFlag{Name: "pid", Value: "", Usage: "pid namespace"}, - cli.StringFlag{Name: "uts", Value: "", Usage: "uts namespace"}, cli.StringFlag{Name: "mnt", Value: "", Usage: "mount namespace"}, - cli.StringFlag{Name: "veth-bridge", Usage: "veth bridge"}, + cli.StringFlag{Name: "mount-label", Usage: "set the mount label"}, + cli.StringFlag{Name: "net", Value: "", Usage: "network namespace"}, + cli.StringFlag{Name: "pid", Value: "", Usage: "pid namespace"}, + cli.StringFlag{Name: "process-label", Usage: "set the process label"}, + cli.StringFlag{Name: "rootfs", Usage: "set the rootfs"}, + cli.StringFlag{Name: "security", Value: "", Usage: "set the security profile (high, medium, low)"}, + cli.StringFlag{Name: "uts", Value: "", Usage: "uts namespace"}, cli.StringFlag{Name: "veth-address", Usage: "veth ip address"}, + cli.StringFlag{Name: "veth-bridge", Usage: "veth bridge"}, cli.StringFlag{Name: "veth-gateway", Usage: "veth gateway address"}, - cli.IntFlag{Name: "veth-mtu", Usage: "veth mtu"}, - cli.BoolFlag{Name: "cgroup", Usage: "mount the cgroup data for the 
container"}, + cli.StringSliceFlag{Name: "bind", Value: &cli.StringSlice{}, Usage: "add bind mounts to the container"}, cli.StringSliceFlag{Name: "sysctl", Value: &cli.StringSlice{}, Usage: "set system properties in the container"}, + cli.StringSliceFlag{Name: "tmpfs", Value: &cli.StringSlice{}, Usage: "add tmpfs mounts to the container"}, } var configCommand = cli.Command{ @@ -203,6 +204,24 @@ func modify(config *configs.Config, context *cli.Context) { Device: "cgroup", }) } + modifySecurityProfile(context, config) +} + +func modifySecurityProfile(context *cli.Context, config *configs.Config) { + profileName := context.String("security") + if profileName == "" { + return + } + profile := profiles[profileName] + if profile == nil { + logrus.Fatalf("invalid profile name %q", profileName) + } + config.Rlimits = profile.Rlimits + config.Capabilities = profile.Capabilities + config.Seccomp = profile.Seccomp + config.AppArmorProfile = profile.ApparmorProfile + config.MountLabel = profile.MountLabel + config.ProcessLabel = profile.ProcessLabel } func getTemplate() *configs.Config { @@ -290,13 +309,5 @@ func getTemplate() *configs.Config { Flags: defaultMountFlags | syscall.MS_RDONLY, }, }, - Rlimits: []configs.Rlimit{ - { - Type: syscall.RLIMIT_NOFILE, - Hard: 1024, - Soft: 1024, - }, - }, } - } diff --git a/nsinit/security.go b/nsinit/security.go new file mode 100644 index 00000000..7835c4b9 --- /dev/null +++ b/nsinit/security.go @@ -0,0 +1,272 @@ +package main + +import ( + "syscall" + + "github.com/docker/libcontainer/configs" + "github.com/docker/libcontainer/system" +) + +var profiles = map[string]*securityProfile{ + "high": highProfile, + "medium": mediumProfile, + "low": lowProfile, +} + +type securityProfile struct { + Capabilities []string `json:"capabilities"` + ApparmorProfile string `json:"apparmor_profile"` + MountLabel string `json:"mount_label"` + ProcessLabel string `json:"process_label"` + Rlimits []configs.Rlimit `json:"rlimits"` + Seccomp 
*configs.Seccomp `json:"seccomp"` +} + +// this should be a runtime config that is not able to do things like apt-get or yum install. +var highProfile = &securityProfile{ + Capabilities: []string{ + "NET_BIND_SERVICE", + "KILL", + "AUDIT_WRITE", + }, + Rlimits: []configs.Rlimit{ + { + Type: syscall.RLIMIT_NOFILE, + Hard: 1024, + Soft: 1024, + }, + }, + // http://man7.org/linux/man-pages/man2/syscalls.2.html + Seccomp: &configs.Seccomp{ + Syscalls: []*configs.Syscall{ + { + Value: syscall.SYS_CAPSET, // http://man7.org/linux/man-pages/man2/capset.2.html + Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_UNSHARE, // http://man7.org/linux/man-pages/man2/unshare.2.html + Action: configs.Action(syscall.EPERM), + }, + { + Value: int(system.SysSetns()), + Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_MOUNT, // http://man7.org/linux/man-pages/man2/mount.2.html + Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_UMOUNT2, // http://man7.org/linux/man-pages/man2/umount.2.html + Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_CREATE_MODULE, // http://man7.org/linux/man-pages/man2/create_module.2.html + Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_DELETE_MODULE, // http://man7.org/linux/man-pages/man2/delete_module.2.html + Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_CHMOD, // http://man7.org/linux/man-pages/man2/chmod.2.html + Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_CHOWN, // http://man7.org/linux/man-pages/man2/chown.2.html + Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_LINK, // http://man7.org/linux/man-pages/man2/link.2.html + Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_LINKAT, // http://man7.org/linux/man-pages/man2/linkat.2.html + Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_UNLINK, // http://man7.org/linux/man-pages/man2/unlink.2.html + 
Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_UNLINKAT, // http://man7.org/linux/man-pages/man2/unlinkat.2.html + Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_CHROOT, // http://man7.org/linux/man-pages/man2/chroot.2.html + Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_KEXEC_LOAD, // http://man7.org/linux/man-pages/man2/kexec_load.2.html + Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_SETDOMAINNAME, // http://man7.org/linux/man-pages/man2/setdomainname.2.html + Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_SETHOSTNAME, // http://man7.org/linux/man-pages/man2/sethostname.2.html + Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_CLONE, // http://man7.org/linux/man-pages/man2/clone.2.html + Action: configs.Action(syscall.EPERM), + Args: []*configs.Arg{ + { + Index: 0, // the glibc wrapper has the flags at arg2 but the raw syscall has flags at arg0 + Value: syscall.CLONE_NEWUSER, + Op: configs.MaskEqualTo, + }, + }, + }, + }, + }, +} + +// This is a medium level profile that should be able to do things like installing from +// apt-get or yum. 
+var mediumProfile = &securityProfile{ + Capabilities: []string{ + "CHOWN", + "DAC_OVERRIDE", + "FSETID", + "FOWNER", + "SETGID", + "SETUID", + "SETFCAP", + "SETPCAP", + "NET_BIND_SERVICE", + "KILL", + "AUDIT_WRITE", + }, + Rlimits: []configs.Rlimit{ + { + Type: syscall.RLIMIT_NOFILE, + Hard: 1024, + Soft: 1024, + }, + }, + // http://man7.org/linux/man-pages/man2/syscalls.2.html + Seccomp: &configs.Seccomp{ + Syscalls: []*configs.Syscall{ + { + Value: syscall.SYS_UNSHARE, // http://man7.org/linux/man-pages/man2/unshare.2.html + Action: configs.Action(syscall.EPERM), + }, + { + Value: int(system.SysSetns()), + Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_MOUNT, // http://man7.org/linux/man-pages/man2/mount.2.html + Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_UMOUNT2, // http://man7.org/linux/man-pages/man2/umount.2.html + Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_CHROOT, // http://man7.org/linux/man-pages/man2/chroot.2.html + Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_CREATE_MODULE, // http://man7.org/linux/man-pages/man2/create_module.2.html + Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_DELETE_MODULE, // http://man7.org/linux/man-pages/man2/delete_module.2.html + Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_KEXEC_LOAD, // http://man7.org/linux/man-pages/man2/kexec_load.2.html + Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_SETDOMAINNAME, // http://man7.org/linux/man-pages/man2/setdomainname.2.html + Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_SETHOSTNAME, // http://man7.org/linux/man-pages/man2/sethostname.2.html + Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_CLONE, // http://man7.org/linux/man-pages/man2/clone.2.html + Action: configs.Action(syscall.EPERM), + Args: []*configs.Arg{ + { + Index: 0, // the glibc wrapper has the flags at arg2 but 
the raw syscall has flags at arg0 + Value: syscall.CLONE_NEWUSER, + Op: configs.MaskEqualTo, + }, + }, + }, + }, + }, +} + +var lowProfile = &securityProfile{ + Capabilities: []string{ + "CHOWN", + "DAC_OVERRIDE", + "FSETID", + "FOWNER", + "SETGID", + "SETUID", + "SYS_CHROOT", + "SETFCAP", + "SETPCAP", + "NET_BIND_SERVICE", + "KILL", + "AUDIT_WRITE", + }, + Rlimits: []configs.Rlimit{ + { + Type: syscall.RLIMIT_NOFILE, + Hard: 1024, + Soft: 1024, + }, + }, + // http://man7.org/linux/man-pages/man2/syscalls.2.html + Seccomp: &configs.Seccomp{ + Syscalls: []*configs.Syscall{ + { + Value: syscall.SYS_UNSHARE, // http://man7.org/linux/man-pages/man2/unshare.2.html + Action: configs.Action(syscall.EPERM), + }, + { + Value: int(system.SysSetns()), + Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_MOUNT, // http://man7.org/linux/man-pages/man2/mount.2.html + Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_UMOUNT2, // http://man7.org/linux/man-pages/man2/umount.2.html + Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_CREATE_MODULE, // http://man7.org/linux/man-pages/man2/create_module.2.html + Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_DELETE_MODULE, // http://man7.org/linux/man-pages/man2/delete_module.2.html + Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_KEXEC_LOAD, // http://man7.org/linux/man-pages/man2/kexec_load.2.html + Action: configs.Action(syscall.EPERM), + }, + { + Value: syscall.SYS_CLONE, // http://man7.org/linux/man-pages/man2/clone.2.html + Action: configs.Action(syscall.EPERM), + Args: []*configs.Arg{ + { + Index: 0, // the glibc wrapper has the flags at arg2 but the raw syscall has flags at arg0 + Value: syscall.CLONE_NEWUSER, + Op: configs.MaskEqualTo, + }, + }, + }, + }, + }, +} diff --git a/seccomp/bpf.go b/seccomp/bpf.go new file mode 100644 index 00000000..a4b3bdf7 --- /dev/null +++ b/seccomp/bpf.go @@ -0,0 +1,32 @@ +package seccomp + 
+import "strings" + +type bpfLabel struct { + label string + location uint32 +} + +type bpfLabels []bpfLabel + +// labelIndex returns the index for the label if it exists in the slice. +// if it does not exist in the slice it appends the label lb to the end +// of the slice and returns the index. +func labelIndex(labels *bpfLabels, lb string) uint32 { + var id uint32 + for id = 0; id < uint32(len(*labels)); id++ { + if strings.EqualFold(lb, (*labels)[id].label) { + return id + } + } + *labels = append(*labels, bpfLabel{lb, 0xffffffff}) + return id +} + +func scmpBpfStmt(code uint16, k uint32) sockFilter { + return sockFilter{code, 0, 0, k} +} + +func scmpBpfJump(code uint16, k uint32, jt, jf uint8) sockFilter { + return sockFilter{code, jt, jf, k} +} diff --git a/seccomp/context.go b/seccomp/context.go new file mode 100644 index 00000000..c8d4e731 --- /dev/null +++ b/seccomp/context.go @@ -0,0 +1,144 @@ +package seccomp + +import ( + "errors" + "syscall" +) + +const labelTemplate = "lb-%d-%d" + +// Action is the type of action that will be taken when a +// syscall is performed. +type Action int + +const ( + Kill Action = iota - 3 // Kill the calling process of the syscall. + Trap // Trap and coredump the calling process of the syscall. + Allow // Allow the syscall to be completed. +) + +// Syscall is the specified syscall, action, and any type of arguments +// to filter on. +type Syscall struct { + // Value is the syscall number. + Value uint32 + // Action is the action to perform when the specified syscall is made. + Action Action + // Args are filters that can be specified on the arguments to the syscall. 
+ Args Args +} + +func (s *Syscall) scmpAction() uint32 { + switch s.Action { + case Allow: + return retAllow + case Trap: + return retTrap + case Kill: + return retKill + } + return actionErrno(uint32(s.Action)) +} + +// Arg represents an argument to the syscall with the argument's index, +// the operator to apply when matching, and the argument's value at that time. +type Arg struct { + Index uint32 // index of args which start from zero + Op Operator // operation, such as EQ/NE/GE/LE + Value uint // the value of arg +} + +type Args [][]Arg + +var ( + ErrUnresolvedLabel = errors.New("seccomp: unresolved label") + ErrDuplicateLabel = errors.New("seccomp: duplicate label use") + ErrUnsupportedOperation = errors.New("seccomp: unsupported operation for argument") +) + +// Error returns an Action that will be used to send the calling +// process the specified errno when the syscall is made. +func Error(code syscall.Errno) Action { + return Action(code) +} + +// New returns a new syscall context for use. +func New() *Context { + return &Context{ + syscalls: make(map[uint32]*Syscall), + } +} + +// Context holds syscalls for the current process to limit the type of +// actions the calling process can make. +type Context struct { + syscalls map[uint32]*Syscall +} + +// Add will add the specified syscall, action, and arguments to the seccomp +// Context. +func (c *Context) Add(s *Syscall) { + c.syscalls[s.Value] = s +} + +// Remove removes the specified syscall configuration from the Context. +func (c *Context) Remove(call uint32) { + delete(c.syscalls, call) +} + +// Load will apply the Context to the calling process, making any seccomp process changes +// apply after the context is loaded. 
+func (c *Context) Load() error { + filter, err := c.newFilter() + if err != nil { + return err + } + if err := prctl(prSetNoNewPrivileges, 1, 0, 0, 0); err != nil { + return err + } + prog := newSockFprog(filter) + return prog.set() +} + +func (c *Context) newFilter() ([]sockFilter, error) { + var ( + labels bpfLabels + f = newFilter() + ) + for _, s := range c.syscalls { + f.addSyscall(s, &labels) + } + f.allow() + // process args for the syscalls + for _, s := range c.syscalls { + if err := f.addArguments(s, &labels); err != nil { + return nil, err + } + } + // apply labels for arguments + idx := int32(len(*f) - 1) + for ; idx >= 0; idx-- { + lf := &(*f)[idx] + if lf.code != (syscall.BPF_JMP + syscall.BPF_JA) { + continue + } + rel := int32(lf.jt)<<8 | int32(lf.jf) + if ((jumpJT << 8) | jumpJF) == rel { + if labels[lf.k].location == 0xffffffff { + return nil, ErrUnresolvedLabel + } + lf.k = labels[lf.k].location - uint32(idx+1) + lf.jt = 0 + lf.jf = 0 + } else if ((labelJT << 8) | labelJF) == rel { + if labels[lf.k].location != 0xffffffff { + return nil, ErrDuplicateLabel + } + labels[lf.k].location = uint32(idx) + lf.k = 0 + lf.jt = 0 + lf.jf = 0 + } + } + return *f, nil +} diff --git a/seccomp/filter.go b/seccomp/filter.go new file mode 100644 index 00000000..370cdf08 --- /dev/null +++ b/seccomp/filter.go @@ -0,0 +1,116 @@ +package seccomp + +import ( + "fmt" + "syscall" + "unsafe" +) + +type sockFilter struct { + code uint16 + jt uint8 + jf uint8 + k uint32 +} + +func newFilter() *filter { + var f filter + f = append(f, sockFilter{ + pfLD + syscall.BPF_W + syscall.BPF_ABS, + 0, + 0, + uint32(unsafe.Offsetof(secData.nr)), + }) + return &f +} + +type filter []sockFilter + +func (f *filter) addSyscall(s *Syscall, labels *bpfLabels) { + if len(s.Args) == 0 { + f.call(s.Value, scmpBpfStmt(syscall.BPF_RET+syscall.BPF_K, s.scmpAction())) + } else { + if len(s.Args[0]) > 0 { + lb := fmt.Sprintf(labelTemplate, s.Value, s.Args[0][0].Index) + f.call(s.Value, + 
scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JA, labelIndex(labels, lb), + jumpJT, jumpJF)) + } + } +} + +func (f *filter) addArguments(s *Syscall, labels *bpfLabels) error { + for i := 0; len(s.Args) > i; i++ { + if len(s.Args[i]) > 0 { + lb := fmt.Sprintf(labelTemplate, s.Value, s.Args[i][0].Index) + f.label(labels, lb) + f.arg(s.Args[i][0].Index) + } + for j := 0; j < len(s.Args[i]); j++ { + var jf sockFilter + if len(s.Args)-1 > i && len(s.Args[i+1]) > 0 { + lbj := fmt.Sprintf(labelTemplate, s.Value, s.Args[i+1][0].Index) + jf = scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JA, + labelIndex(labels, lbj), jumpJT, jumpJF) + } else { + jf = scmpBpfStmt(syscall.BPF_RET+syscall.BPF_K, s.scmpAction()) + } + if err := f.op(s.Args[i][j].Op, s.Args[i][j].Value, jf); err != nil { + return err + } + } + f.allow() + } + return nil +} + +func (f *filter) label(labels *bpfLabels, lb string) { + *f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JA, labelIndex(labels, lb), labelJT, labelJF)) +} + +func (f *filter) call(nr uint32, jt sockFilter) { + *f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JEQ+syscall.BPF_K, nr, 0, 1)) + *f = append(*f, jt) +} + +func (f *filter) allow() { + *f = append(*f, scmpBpfStmt(syscall.BPF_RET+syscall.BPF_K, retAllow)) +} + +func (f *filter) deny() { + *f = append(*f, scmpBpfStmt(syscall.BPF_RET+syscall.BPF_K, retTrap)) +} + +func (f *filter) arg(index uint32) { + arg(f, index) +} + +func (f *filter) op(operation Operator, v uint, jf sockFilter) error { + switch operation { + case EqualTo: + jumpEqualTo(f, v, jf) + case NotEqualTo: + jumpNotEqualTo(f, v, jf) + case GreatherThan: + jumpGreaterThan(f, v, jf) + case LessThan: + jumpLessThan(f, v, jf) + case MaskEqualTo: + jumpMaskEqualTo(f, v, jf) + default: + return ErrUnsupportedOperation + } + return nil +} + +func arg(f *filter, idx uint32) { + *f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_W+syscall.BPF_ABS, endian.low(idx))) + *f = append(*f, scmpBpfStmt(syscall.BPF_ST, 0)) + *f = 
append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_W+syscall.BPF_ABS, endian.hi(idx))) + *f = append(*f, scmpBpfStmt(syscall.BPF_ST, 1)) +} + +func jump(f *filter, labels *bpfLabels, lb string) { + *f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JA, labelIndex(labels, lb), + jumpJT, jumpJF)) +} diff --git a/seccomp/jump_amd64.go b/seccomp/jump_amd64.go new file mode 100644 index 00000000..f0d07716 --- /dev/null +++ b/seccomp/jump_amd64.go @@ -0,0 +1,68 @@ +// +build linux,amd64 + +package seccomp + +// Using BPF filters +// +// ref: http://www.gsp.com/cgi-bin/man.cgi?topic=bpf +import "syscall" + +func jumpGreaterThan(f *filter, v uint, jt sockFilter) { + lo := uint32(uint64(v) % 0x100000000) + hi := uint32(uint64(v) / 0x100000000) + *f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JGT+syscall.BPF_K, (hi), 4, 0)) + *f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JEQ+syscall.BPF_K, (hi), 0, 5)) + *f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 0)) + *f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JGE+syscall.BPF_K, (lo), 0, 2)) + *f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 1)) + *f = append(*f, jt) + *f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 1)) +} + +func jumpEqualTo(f *filter, v uint, jt sockFilter) { + lo := uint32(uint64(v) % 0x100000000) + hi := uint32(uint64(v) / 0x100000000) + *f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JEQ+syscall.BPF_K, (hi), 0, 5)) + *f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 0)) + *f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JEQ+syscall.BPF_K, (lo), 0, 2)) + *f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 1)) + *f = append(*f, jt) + *f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 1)) +} + +func jumpLessThan(f *filter, v uint, jt sockFilter) { + lo := uint32(uint64(v) % 0x100000000) + hi := uint32(uint64(v) / 0x100000000) + *f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JGT+syscall.BPF_K, 
(hi), 6, 0)) + *f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JEQ+syscall.BPF_K, (hi), 0, 3)) + *f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 0)) + *f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JGT+syscall.BPF_K, (lo), 2, 0)) + *f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 1)) + *f = append(*f, jt) + *f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 1)) +} + +func jumpNotEqualTo(f *filter, v uint, jt sockFilter) { + lo := uint32(uint64(v) % 0x100000000) + hi := uint32(uint64(v) / 0x100000000) + *f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JEQ+syscall.BPF_K, hi, 5, 0)) + *f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 0)) + *f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JEQ+syscall.BPF_K, lo, 2, 0)) + *f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 1)) + *f = append(*f, jt) + *f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 1)) +} + +// this checks for a value inside a mask. The evalusation is equal to doing +// CLONE_NEWUSER & syscallMask == CLONE_NEWUSER +func jumpMaskEqualTo(f *filter, v uint, jt sockFilter) { + lo := uint32(uint64(v) % 0x100000000) + hi := uint32(uint64(v) / 0x100000000) + *f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JEQ+syscall.BPF_K, hi, 0, 6)) + *f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 0)) + *f = append(*f, scmpBpfStmt(syscall.BPF_ALU+syscall.BPF_AND, uint32(v))) + *f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JEQ+syscall.BPF_K, lo, 0, 2)) + *f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 1)) + *f = append(*f, jt) + *f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 1)) +} diff --git a/seccomp/seccomp.go b/seccomp/seccomp.go index 91a6fb79..78d7d853 100644 --- a/seccomp/seccomp.go +++ b/seccomp/seccomp.go @@ -1,77 +1,110 @@ +// Package seccomp provides native seccomp ( https://www.kernel.org/doc/Documentation/prctl/seccomp_filter.txt ) support for go. 
package seccomp import ( - "errors" - "fmt" - "os" - "os/signal" - "runtime" - "strings" "syscall" "unsafe" ) +// Operator that is used for argument comparison. +type Operator int + const ( - EQ = 0 - NE = 1 - GE = 2 - LE = 3 + EqualTo Operator = iota + NotEqualTo + GreatherThan + LessThan + MaskEqualTo ) const ( - ALLOW = 0 - DENY = 1 - JUMP = 2 + jumpJT = 0xff + jumpJF = 0xff + labelJT = 0xfe + labelJF = 0xfe ) const ( - JUMP_JT = 0xff - JUMP_JF = 0xff - LABEL_JT = 0xfe - LABEL_JF = 0xfe + pfLD = 0x0 + retKill = 0x00000000 + retTrap = 0x00030000 + retAllow = 0x7fff0000 + modeFilter = 0x2 + prSetNoNewPrivileges = 0x26 ) -const ( - pseudoCall = 30 -) - -const ( - ScmpActAllow = 0x0 - - PF_LD = 0x0 - BPF_RET = syscall.BPF_RET - BPF_K = syscall.BPF_K - BPF_ABS = syscall.BPF_ABS - BPF_JMP = syscall.BPF_JMP - BPF_JEQ = syscall.BPF_JEQ - BPF_W = syscall.BPF_W - BPF_LD = syscall.BPF_LD - BPF_JA = syscall.BPF_JA - BPF_MEM = syscall.BPF_MEM - BPF_ST = syscall.BPF_ST - BPF_JGT = syscall.BPF_JGT - BPF_JGE = syscall.BPF_JGE - BPF_JSET = syscall.BPF_JSET - - SECCOMP_RET_KILL = 0x00000000 - SECCOMP_RET_TRAP = 0x00030000 - SECCOMP_RET_ALLOW = 0x7fff0000 - SECCOMP_MODE_FILTER = 0x2 - PR_SET_NO_NEW_PRIVS = 0x26 -) - -type seccompData struct { - nr int32 - arch uint32 - insPointer uint64 - args [6]uint64 +func actionErrno(errno uint32) uint32 { + return 0x00050000 | (errno & 0x0000ffff) } -type sockFilter struct { - code uint16 - jt uint8 - jf uint8 - k uint32 +var ( + secData = struct { + nr int32 + arch uint32 + insPointer uint64 + args [6]uint64 + }{0, 0, 0, [6]uint64{0, 0, 0, 0, 0, 0}} +) + +var isLittle = func() bool { + var ( + x = 0x1234 + p = unsafe.Pointer(&x) + p2 = (*[unsafe.Sizeof(0)]byte)(p) + ) + if p2[0] == 0 { + return false + } + return true +}() + +var endian endianSupport + +type endianSupport struct { +} + +func (e endianSupport) hi(i uint32) uint32 { + if isLittle { + return e.little(i) + } + return e.big(i) +} + +func (e endianSupport) low(i uint32) uint32 { + 
if isLittle { + return e.big(i) + } + return e.little(i) +} + +func (endianSupport) big(idx uint32) uint32 { + if idx >= 6 { + return 0 + } + return uint32(unsafe.Offsetof(secData.args)) + 8*idx +} + +func (endianSupport) little(idx uint32) uint32 { + if idx < 0 || idx >= 6 { + return 0 + } + return uint32(unsafe.Offsetof(secData.args)) + + uint32(unsafe.Alignof(secData.args[0]))*idx + uint32(unsafe.Sizeof(secData.arch)) +} + +func prctl(option int, arg2, arg3, arg4, arg5 uintptr) error { + _, _, err := syscall.Syscall6(syscall.SYS_PRCTL, uintptr(option), arg2, arg3, arg4, arg5, 0) + if err != 0 { + return err + } + return nil +} + +func newSockFprog(filter []sockFilter) *sockFprog { + return &sockFprog{ + len: uint16(len(filter)), + filt: filter, + } } type sockFprog struct { @@ -79,440 +112,11 @@ type sockFprog struct { filt []sockFilter } -type FilterArgs struct { - Args []Filter -} - -type Action struct { - action int - args []FilterArgs -} - -type Filter struct { - Arg uint32 //index of args which start from zero - Op int //operation, such ass EQ/NE/GE/LE - V uint //the value of arg -} - -type bpfLabel struct { - label string - location uint32 -} - -type bpfLabels struct { - count uint32 - labels []bpfLabel -} - -type ScmpCtx struct { - CallMap map[int]*Action - filter []sockFilter - label bpfLabels -} - -type argOFunc func(uint32) uint32 -type argFunc func(*ScmpCtx, uint32) -type jFunc func(*ScmpCtx, uint, sockFilter) -type addFunc func(ctx *ScmpCtx, call int, action int, args ...FilterArgs) error - -var secData seccompData = seccompData{0, 0, 0, [6]uint64{0, 0, 0, 0, 0, 0}} -var hiArg argOFunc -var loArg argOFunc -var arg argFunc -var jEq jFunc -var jNe jFunc -var jGe jFunc -var jLe jFunc -var secAdd addFunc = nil - -var op [4]jFunc - -var ( - sysCallMin = 0 - sysCallMax = 0 -) -var sigSec bool = false - -func arg32(ctx *ScmpCtx, idx uint32) { - ctx.filter = append(ctx.filter, - scmpBpfStmt(BPF_LD+BPF_W+BPF_ABS, loArg(idx))) -} - -func jEq32(ctx *ScmpCtx, v 
uint, jt sockFilter) { - ctx.filter = append(ctx.filter, scmpBpfJump(BPF_JMP+BPF_JEQ+BPF_K, uint32(v), 0, 1)) - ctx.filter = append(ctx.filter, jt) -} - -func jNe32(ctx *ScmpCtx, v uint, jt sockFilter) { - ctx.filter = append(ctx.filter, scmpBpfJump(BPF_JMP+BPF_JEQ+BPF_K, uint32(v), 1, 0)) - ctx.filter = append(ctx.filter, jt) -} - -func jGe32(ctx *ScmpCtx, v uint, jt sockFilter) { - ctx.filter = append(ctx.filter, scmpBpfJump(BPF_JMP+BPF_JGE+BPF_K, uint32(v), 0, 1)) - ctx.filter = append(ctx.filter, jt) -} - -func jLe32(ctx *ScmpCtx, v uint, jt sockFilter) { - ctx.filter = append(ctx.filter, scmpBpfJump(BPF_JMP+BPF_JGT+BPF_K, uint32(v), 1, 0)) - ctx.filter = append(ctx.filter, jt) -} - -func arg64(ctx *ScmpCtx, idx uint32) { - ctx.filter = append(ctx.filter, scmpBpfStmt(BPF_LD+BPF_W+BPF_ABS, loArg(idx))) - ctx.filter = append(ctx.filter, scmpBpfStmt(BPF_ST, 0)) - ctx.filter = append(ctx.filter, scmpBpfStmt(BPF_LD+BPF_W+BPF_ABS, hiArg(idx))) - ctx.filter = append(ctx.filter, scmpBpfStmt(BPF_ST, 1)) -} - -func jNe64(ctx *ScmpCtx, v uint, jt sockFilter) { - lo := uint32(uint64(v) % 0x100000000) - hi := uint32(uint64(v) / 0x100000000) - ctx.filter = append(ctx.filter, scmpBpfJump(BPF_JMP+BPF_JEQ+BPF_K, (hi), 5, 0)) - ctx.filter = append(ctx.filter, scmpBpfStmt(BPF_LD+BPF_MEM, 0)) - ctx.filter = append(ctx.filter, scmpBpfJump(BPF_JMP+BPF_JEQ+BPF_K, (lo), 2, 0)) - ctx.filter = append(ctx.filter, scmpBpfStmt(BPF_LD+BPF_MEM, 1)) - ctx.filter = append(ctx.filter, jt) - ctx.filter = append(ctx.filter, scmpBpfStmt(BPF_LD+BPF_MEM, 1)) -} - -func jGe64(ctx *ScmpCtx, v uint, jt sockFilter) { - lo := uint32(uint64(v) % 0x100000000) - hi := uint32(uint64(v) / 0x100000000) - ctx.filter = append(ctx.filter, scmpBpfJump(BPF_JMP+BPF_JGT+BPF_K, (hi), 4, 0)) - ctx.filter = append(ctx.filter, scmpBpfJump(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 5)) - ctx.filter = append(ctx.filter, scmpBpfStmt(BPF_LD+BPF_MEM, 0)) - ctx.filter = append(ctx.filter, scmpBpfJump(BPF_JMP+BPF_JGE+BPF_K, (lo), 0, 2)) - 
ctx.filter = append(ctx.filter, scmpBpfStmt(BPF_LD+BPF_MEM, 1)) - ctx.filter = append(ctx.filter, jt) - ctx.filter = append(ctx.filter, scmpBpfStmt(BPF_LD+BPF_MEM, 1)) -} - -func jEq64(ctx *ScmpCtx, v uint, jt sockFilter) { - lo := uint32(uint64(v) % 0x100000000) - hi := uint32(uint64(v) / 0x100000000) - ctx.filter = append(ctx.filter, scmpBpfJump(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 5)) - ctx.filter = append(ctx.filter, scmpBpfStmt(BPF_LD+BPF_MEM, 0)) - ctx.filter = append(ctx.filter, scmpBpfJump(BPF_JMP+BPF_JEQ+BPF_K, (lo), 0, 2)) - ctx.filter = append(ctx.filter, scmpBpfStmt(BPF_LD+BPF_MEM, 1)) - ctx.filter = append(ctx.filter, jt) - ctx.filter = append(ctx.filter, scmpBpfStmt(BPF_LD+BPF_MEM, 1)) -} - -func jLe64(ctx *ScmpCtx, v uint, jt sockFilter) { - lo := uint32(uint64(v) % 0x100000000) - hi := uint32(uint64(v) / 0x100000000) - ctx.filter = append(ctx.filter, scmpBpfJump(BPF_JMP+BPF_JGT+BPF_K, (hi), 6, 0)) - ctx.filter = append(ctx.filter, scmpBpfJump(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 3)) - ctx.filter = append(ctx.filter, scmpBpfStmt(BPF_LD+BPF_MEM, 0)) - ctx.filter = append(ctx.filter, scmpBpfJump(BPF_JMP+BPF_JGT+BPF_K, (lo), 2, 0)) - ctx.filter = append(ctx.filter, scmpBpfStmt(BPF_LD+BPF_MEM, 1)) - ctx.filter = append(ctx.filter, jt) - ctx.filter = append(ctx.filter, scmpBpfStmt(BPF_LD+BPF_MEM, 1)) -} - -func allow(ctx *ScmpCtx) { - ctx.filter = append(ctx.filter, scmpBpfStmt(BPF_RET+BPF_K, SECCOMP_RET_ALLOW)) -} - -func deny(ctx *ScmpCtx) { - ctx.filter = append(ctx.filter, scmpBpfStmt(BPF_RET+BPF_K, SECCOMP_RET_TRAP)) -} - -func jump(ctx *ScmpCtx, lb string) { - ctx.filter = append(ctx.filter, scmpBpfJump(BPF_JMP+BPF_JA, findLabel(&ctx.label, lb), - JUMP_JT, JUMP_JF)) -} - -func label(ctx *ScmpCtx, lb string) { - ctx.filter = append(ctx.filter, scmpBpfJump(BPF_JMP+BPF_JA, findLabel(&ctx.label, lb), - LABEL_JT, LABEL_JF)) -} - -func secCall(ctx *ScmpCtx, nr int, jt sockFilter) { - ctx.filter = append(ctx.filter, scmpBpfJump(BPF_JMP+BPF_JEQ+BPF_K, uint32(nr), 0, 
1)) - ctx.filter = append(ctx.filter, jt) -} - -func findLabel(labels *bpfLabels, lb string) uint32 { - var id uint32 - for id = 0; id < labels.count; id++ { - if true == strings.EqualFold(lb, labels.labels[id].label) { - return id - } - } - tlabel := bpfLabel{lb, 0xffffffff} - labels.labels = append(labels.labels, tlabel) - labels.count += 1 - return id -} - -func hiArgLittle(idx uint32) uint32 { - if idx < 0 || idx >= 6 { - return 0 - } - - hi := uint32(unsafe.Offsetof(secData.args)) + uint32(unsafe.Alignof(secData.args[0]))*idx + uint32(unsafe.Sizeof(secData.arch)) - return uint32(hi) -} - -func hiArgBig(idx uint32) uint32 { - if idx >= 6 { - return 0 - } - hi := uint32(unsafe.Offsetof(secData.args)) + 8*idx - return uint32(hi) -} - -func isLittle() bool { - litEndian := true - x := 0x1234 - p := unsafe.Pointer(&x) - p2 := (*[unsafe.Sizeof(0)]byte)(p) - if p2[0] == 0 { - litEndian = false - } - return litEndian -} - -func scmpBpfStmt(code uint16, k uint32) sockFilter { - return sockFilter{code, 0, 0, k} -} - -func scmpBpfJump(code uint16, k uint32, jt, jf uint8) sockFilter { - return sockFilter{code, jt, jf, k} -} - -func prctl(option int, arg2, arg3, arg4, arg5 uintptr) (err error) { - _, _, e1 := syscall.Syscall6(syscall.SYS_PRCTL, uintptr(option), arg2, arg3, arg4, arg5, 0) - if e1 != 0 { - err = e1 +func (s *sockFprog) set() error { + _, _, err := syscall.Syscall(syscall.SYS_PRCTL, uintptr(syscall.PR_SET_SECCOMP), + uintptr(modeFilter), uintptr(unsafe.Pointer(s))) + if err != 0 { + return err } return nil } - -func scmpfilter(prog *sockFprog) (err error) { - _, _, e1 := syscall.Syscall(syscall.SYS_PRCTL, uintptr(syscall.PR_SET_SECCOMP), - uintptr(SECCOMP_MODE_FILTER), uintptr(unsafe.Pointer(prog))) - if e1 != 0 { - err = e1 - } - return nil -} - -func CombineArgs(args1 []FilterArgs, args2 []FilterArgs) []FilterArgs { - ilen1 := len(args1) - if ilen1 > len(args2) { - ilen1 = len(args2) - } - for i1 := 0; i1 < ilen1; i1++ { - jlen1 := len(args1[i1].Args) - 
jlen2 := len(args2[i1].Args) - for j2 := 0; j2 < jlen2; j2++ { - num := 0 - for j1 := 0; j1 < jlen1; j1++ { - if args1[i1].Args[j1] == args2[i1].Args[j2] { - break - } - num = num + 1 - } - if num == jlen1 { - args1[i1].Args = append(args1[i1].Args, args2[i1].Args[j2]) - } - } - } - if ilen1 < len(args2) { - args1 = append(args1, args2[ilen1:]...) - } - return args1 -} - -func Sys(call string) int { - number, exists := syscallMap[call] - if exists { - return number - } - return -1 -} - -func ScmpInit(action int) (*ScmpCtx, error) { - ctx := ScmpCtx{ - CallMap: make(map[int]*Action), - filter: make([]sockFilter, 0, 128), - label: bpfLabels{ - count: 0, - labels: make([]bpfLabel, 0, 128), - }, - } - - ctx.filter = append(ctx.filter, - sockFilter{PF_LD + BPF_W + BPF_ABS, 0, 0, uint32(unsafe.Offsetof(secData.nr))}) - return &ctx, nil -} - -func ScmpDel(ctx *ScmpCtx, call int) error { - _, exists := ctx.CallMap[call] - if exists { - delete(ctx.CallMap, call) - return nil - } - - return errors.New("syscall not exist") -} - -func ScmpAdd(ctx *ScmpCtx, call int, action int, args ...FilterArgs) error { - if call < 0 { - return errors.New("syscall error, call < 0") - } - - if call <= sysCallMax { - _, exists := ctx.CallMap[call] - if exists { - return errors.New("syscall exist") - } - ctx.CallMap[call] = &Action{action, args} - return nil - } else { - if nil != secAdd { - return secAdd(ctx, call, action, args...) 
- } - } - - return errors.New("syscall not surport") -} - -func ScmpLoad(ctx *ScmpCtx) error { - for call, act := range ctx.CallMap { - if len(act.args) == 0 { - secCall(ctx, call, scmpBpfStmt(BPF_RET+BPF_K, SECCOMP_RET_ALLOW)) - } else { - if len(act.args[0].Args) > 0 { - lb := fmt.Sprintf("lb-%d-%d", call, act.args[0].Args[0].Arg) - secCall(ctx, call, - scmpBpfJump(BPF_JMP+BPF_JA, findLabel(&ctx.label, lb), - JUMP_JT, JUMP_JF)) - } - } - } - deny(ctx) - - for call, act := range ctx.CallMap { - for i := 0; i < len(act.args); i++ { - if len(act.args[i].Args) > 0 { - lb := fmt.Sprintf("lb-%d-%d", call, act.args[i].Args[0].Arg) - label(ctx, lb) - arg(ctx, act.args[i].Args[0].Arg) - } - - for j := 0; j < len(act.args[i].Args); j++ { - var jf sockFilter - if len(act.args)-1 > i && len(act.args[i+1].Args) > 0 { - lbj := fmt.Sprintf("lb-%d-%d", call, act.args[i+1].Args[0].Arg) - jf = scmpBpfJump(BPF_JMP+BPF_JA, - findLabel(&ctx.label, lbj), JUMP_JT, JUMP_JF) - } else { - jf = scmpBpfStmt(BPF_RET+BPF_K, SECCOMP_RET_ALLOW) - } - op[act.args[i].Args[j].Op](ctx, act.args[i].Args[j].V, jf) - } - - deny(ctx) - } - } - - idx := int32(len(ctx.filter) - 1) - for ; idx >= 0; idx-- { - filter := &ctx.filter[idx] - if filter.code != (BPF_JMP + BPF_JA) { - continue - } - - rel := int32(filter.jt)<<8 | int32(filter.jf) - if ((JUMP_JT << 8) | JUMP_JF) == rel { - if ctx.label.labels[filter.k].location == 0xffffffff { - return errors.New("Unresolved label") - } - filter.k = ctx.label.labels[filter.k].location - uint32(idx+1) - filter.jt = 0 - filter.jf = 0 - } else if ((LABEL_JT << 8) | LABEL_JF) == rel { - if ctx.label.labels[filter.k].location != 0xffffffff { - return errors.New("Duplicate label use") - } - ctx.label.labels[filter.k].location = uint32(idx) - filter.k = 0 - filter.jt = 0 - filter.jf = 0 - } - } - prog := sockFprog{ - len: uint16(len(ctx.filter)), - filt: ctx.filter, - } - - if nil != prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) { - fmt.Println("prctl PR_SET_NO_NEW_PRIVS 
error") - return errors.New("prctl PR_SET_NO_NEW_PRIVS error") - } - - if nil != scmpfilter(&prog) { - fmt.Println("scmpfilter error") - return errors.New("scmpfilter error") - } - return nil -} - -func sigSeccomp() { - sigSec = true -} - -func ScmpError() bool { - ret := sigSec - sigSec = false - return ret -} - -func init() { - if runtime.GOARCH == "386" { - sysCallMax = 340 - } else if runtime.GOARCH == "amd64" { - sysCallMax = 302 - } else if runtime.GOARCH == "arm" { - sysCallMax = 377 - } else if runtime.GOARCH == "arm64" { - sysCallMax = 281 - } else if runtime.GOARCH == "ppc64" { - sysCallMax = 354 - } else if runtime.GOARCH == "ppc64le" { - sysCallMax = 354 - } - if isLittle() { - hiArg = hiArgLittle - loArg = hiArgBig - } else { - hiArg = hiArgBig - loArg = hiArgLittle - } - - var length int - if 8 == int(unsafe.Sizeof(length)) { - arg = arg64 - jEq = jEq64 - jNe = jNe64 - jGe = jGe64 - jLe = jLe64 - } else { - arg = arg32 - jEq = jEq32 - jNe = jNe32 - jGe = jGe32 - jLe = jLe32 - } - op[EQ] = jEq - op[NE] = jNe - op[GE] = jGe - op[LE] = jLe - chSignal := make(chan os.Signal) - signal.Notify(chSignal, syscall.SIGSYS) - go sigSeccomp() -} diff --git a/seccomp/seccomp.test b/seccomp/seccomp.test deleted file mode 100644 index 25a5554f..00000000 --- a/seccomp/seccomp.test +++ /dev/null @@ -1,107 +0,0 @@ - -package main - -import ( - "fmt" - "flag" - "os" - "syscall" - - sec "seccomp" -) - -const ( - STDIN_FILENO = 0 - STDOUT_FILENO = 1 - BUFLEN = 8 -) - -func writeOk(args []string) { - scmpCtx, _ := sec.ScmpInit(sec.ScmpActAllow) - - sec.ScmpAdd(scmpCtx, sec.Sys("exit"), sec.ScmpActAllow) - sec.ScmpAdd(scmpCtx, sec.Sys("exit_group"), sec.ScmpActAllow) - - //the first arg is STDOUT_FILENO, the third arg must be <= BUFLEN - sec.ScmpAdd(scmpCtx, sec.Sys("write"), sec.ScmpActAllow, - sec.FilterArgs{[]sec.Filter{{0, sec.EQ, STDOUT_FILENO}}}, - sec.FilterArgs{[]sec.Filter{{2, sec.LE, BUFLEN}}}, - ) - - sec.ScmpLoad(scmpCtx) - fmt.Printf("8888888\n") //ok -} - -func 
writeErr(args []string) { - scmpCtx, _ := sec.ScmpInit(sec.ScmpActAllow) - - sec.ScmpAdd(scmpCtx, sec.Sys("exit"), sec.ScmpActAllow) - sec.ScmpAdd(scmpCtx, sec.Sys("exit_group"), sec.ScmpActAllow) - - sec.ScmpAdd(scmpCtx, sec.Sys("write"), sec.ScmpActAllow, - sec.FilterArgs{[]sec.Filter{{0, sec.EQ, STDOUT_FILENO}}}, - sec.FilterArgs{[]sec.Filter{{2, sec.LE, BUFLEN}}}, - ) - - sec.ScmpLoad(scmpCtx) - - // bad system call - fmt.Printf("99999999\n") -} - -func socketOk(args []string) { - scmpCtx, _ := sec.ScmpInit(sec.ScmpActAllow) - - //for 386, the next line is same as - //sec.ScmpAdd(scmpCtx, sec.Sys("socketcall"), sec.ScmpActAllow, - // sec.FilterArgs{[]sec.Filter{{0, sec.EQ, 1}}}, - //) - //SYS_SOCKET = 1 - sec.ScmpAdd(scmpCtx, sec.Sys("socket"), sec.ScmpActAllow) - - sec.ScmpAdd(scmpCtx, sec.Sys("exit"), sec.ScmpActAllow) - sec.ScmpAdd(scmpCtx, sec.Sys("exit_group"), sec.ScmpActAllow) - - sec.ScmpAdd(scmpCtx, sec.Sys("write"), sec.ScmpActAllow, - sec.FilterArgs{[]sec.Filter{{0, sec.EQ, STDOUT_FILENO}}}, - sec.FilterArgs{[]sec.Filter{{2, sec.LE, BUFLEN}}}, - ) - - sec.ScmpLoad(scmpCtx) - - syscall.Socket(syscall.AF_INET, syscall.SOCK_STREAM, syscall.IPPROTO_IP) - fmt.Printf("Sock ok\n") -} - -func socketErr(args []string) { - scmpCtx, _ := sec.ScmpInit(sec.ScmpActAllow) - - sec.ScmpAdd(scmpCtx, sec.Sys("exit"), sec.ScmpActAllow) - sec.ScmpAdd(scmpCtx, sec.Sys("exit_group"), sec.ScmpActAllow) - - sec.ScmpLoad(scmpCtx) - - // bad system call - syscall.Socket(syscall.AF_INET, syscall.SOCK_STREAM, syscall.IPPROTO_IP) -} - - - -func main() { - flag.Parse() - - if 1 == flag.NArg() { - idx := 0 - args := os.Args[(idx + 1):] - if flag.Arg(idx) == "writeOk" { - writeOk(args) - } else if flag.Arg(idx) == "writeErr" { - writeErr(args) - } else if flag.Arg(idx) == "socketOk" { - socketOk(args) - } else if flag.Arg(idx) == "socketErr" { - socketErr(args) - } - } -} - diff --git a/seccomp/seccomp386.go b/seccomp/seccomp386.go deleted file mode 100644 index db696e6f..00000000 
--- a/seccomp/seccomp386.go +++ /dev/null @@ -1,117 +0,0 @@ -// +build linux -// +build 386 - -package seccomp - -import ( - "errors" -) - -var ( - syscallInterval = 100 - ipcNr = syscallInterval + 0 - socketcallNr = syscallInterval + ipcNr - callipc = 0 - callsocket = 0 -) - -func scmpAdd386(ctx *ScmpCtx, call int, action int, args ...FilterArgs) error { - var syscallNo int - pseCall := call - sysCallMax - if (pseCall >= ipcNr) && (pseCall < ipcNr+syscallInterval) { - syscallNo, _ = syscallMap["ipc"] - pseCall = (pseCall - ipcNr) % ipcNr - - } else if (pseCall >= socketcallNr) && (pseCall < socketcallNr+syscallInterval) { - syscallNo, _ = syscallMap["socketcall"] - pseCall = (pseCall - socketcallNr) % socketcallNr - } else { - return errors.New("scmpAdd386, syscall error") - } - act, exists := ctx.CallMap[syscallNo] - if !exists { - newArg := make([]FilterArgs, len(args)+1) - newArg[0].Args = make([]Filter, 1) - newArg[0].Args[0].Op = EQ - newArg[0].Args[0].Arg = 0 - newArg[0].Args[0].V = uint(pseCall) - for i := 0; i < len(args); i++ { - alen := len(args[i].Args) - if alen > 0 { - newArg[i+1].Args = make([]Filter, alen) - for j := 0; j < alen; i++ { - newArg[i+1].Args[j].Op = args[i].Args[j].Op - newArg[i+1].Args[j].Arg = args[i].Args[j].Arg - newArg[i+1].Args[j].V = args[i].Args[j].V - } - } - } - ctx.CallMap[syscallNo] = &Action{action, newArg} - } else { - newArg := make([]FilterArgs, len(args)) - for i := 0; i < len(args); i++ { - alen := len(args[i].Args) - if alen > 0 { - newArg[i].Args = make([]Filter, alen) - for j := 0; j < alen; i++ { - newArg[i].Args[j].Op = args[i].Args[j].Op - newArg[i].Args[j].Arg = args[i].Args[j].Arg - newArg[i].Args[j].V = args[i].Args[j].V - } - } - } - act.args = CombineArgs(act.args, newArg) - } - - return nil -} - -func resetCallipc(call string, num int) { - syscallMap[call] = num + callipc -} - -func resetCallsocket(call string, num int) { - syscallMap[call] = num + callsocket -} - -func init() { - sysCallMax = 340 - callipc 
= ipcNr + sysCallMax - callsocket = socketcallNr + sysCallMax - secAdd = scmpAdd386 - - resetCallipc("semop", 1) - resetCallipc("semget", 2) - resetCallipc("semctl", 3) - resetCallipc("semtimedop", 4) - resetCallipc("msgsnd", 11) - resetCallipc("msgrcv", 12) - resetCallipc("msgget", 13) - resetCallipc("msgctl", 14) - resetCallipc("shmat", 21) - resetCallipc("shmdt", 22) - resetCallipc("shmget", 23) - resetCallipc("shmctl", 24) - - resetCallsocket("socket", 1) - resetCallsocket("bind", 2) - resetCallsocket("connect", 3) - resetCallsocket("listen", 4) - resetCallsocket("accept", 5) - resetCallsocket("getsockname", 6) - resetCallsocket("getpeername", 7) - resetCallsocket("socketpair", 8) - resetCallsocket("send", 9) - resetCallsocket("recv", 10) - resetCallsocket("sendto", 11) - resetCallsocket("recvfrom", 12) - resetCallsocket("shutdown", 13) - resetCallsocket("setsockopt", 14) - resetCallsocket("getsockopt", 15) - resetCallsocket("sendmsg", 16) - resetCallsocket("recvmsg", 17) - resetCallsocket("accept4", 18) - resetCallsocket("recvmmsg", 19) - resetCallsocket("sendmmsg", 20) - -} diff --git a/seccomp/seccomp_test.go b/seccomp/seccomp_test.go deleted file mode 100644 index f0db718f..00000000 --- a/seccomp/seccomp_test.go +++ /dev/null @@ -1,58 +0,0 @@ -package seccomp - -import ( - "fmt" - "os/exec" - "testing" -) - -var osec = "/go/src/seccomp_main.go" - -func secMain(t *testing.T, args []string) { - if len(args) < 1 { - return - } - - cmd := args[0] - path := "go" - argv := []string{"run", osec} - argv = append(argv, args[0:]...) - - c := exec.Command(path, argv...) 
- _, err := c.Output() - fmt.Printf("do %s, err is [%v]\n", cmd, err) - if err != nil { - if "writeOk" == cmd || "socketOk" == cmd { - t.Fatal(err) - } - } else { - if "writeErr" == cmd || "socketErr" == cmd { - t.Fatal(err) - } - } -} - -func commandGC(file string) { - c := exec.Command("rm", "-rf", file) - d, _ := c.Output() - fmt.Println(string(d)) -} - -func cp(src, dst string) { - c := exec.Command("cp", "-ra", src, dst) - d, _ := c.Output() - fmt.Println(string(d)) -} - -func TestSeccomp(t *testing.T) { - //hard code - cp("../seccomp", "/go/src/") - cp("./seccomp.test", osec) - defer commandGC("/go/src/seccomp") - defer commandGC(osec) - - secMain(t, []string{"writeOk"}) - secMain(t, []string{"writeErr"}) - secMain(t, []string{"socketOk"}) - secMain(t, []string{"socketErr"}) -} diff --git a/seccomp/seccompsyscall.go b/seccomp/seccompsyscall.go deleted file mode 100644 index d7674d1a..00000000 --- a/seccomp/seccompsyscall.go +++ /dev/null @@ -1,390 +0,0 @@ -//x86_64 -package seccomp - -var syscallMap = map[string] int { - "access" : 21, - "chdir" : 80, - "chmod" : 90, - "chown" : 92, - "chown32" : -1, - "close" : 3, - "creat" : 85, - "dup" : 32, - "dup2" : 33, - "dup3" : 292, - "epoll_create" : 213, - "epoll_create1" : 291, - "epoll_ctl" : 233, - "epoll_ctl_old" : 214, - "epoll_pwait" : 281, - "epoll_wait" : 232, - "epoll_wait_old" : 215, - "eventfd" : 284, - "eventfd2" : 290, - "faccessat" : 269, - "fadvise64" : 221, - "fadvise64_64" : -1, - "fallocate" : 285, - "fanotify_init" : 300, - "fanotify_mark" : 301, - "ioctl" : 16, - "fchdir" : 81, - "fchmod" : 91, - "fchmodat" : 268, - "fchown" : 93, - "fchown32" : -1, - "fchownat" : 260, - "fcntl" : 72, - "fcntl64" : -1, - "fdatasync" : 75, - "fgetxattr" : 193, - "flistxattr" : 196, - "flock" : 73, - "fremovexattr" : 199, - "fsetxattr" : 190, - "fstat" : 5, - "fstat64" : -1, - "fstatat64" : -1, - "fstatfs" : 138, - "fstatfs64" : -1, - "fsync" : 74, - "ftruncate" : 77, - "ftruncate64" : -1, - "getcwd" : 79, - 
"getdents" : 78, - "getdents64" : 217, - "getxattr" : 191, - "inotify_add_watch" : 254, - "inotify_init" : 253, - "inotify_init1" : 294, - "inotify_rm_watch" : 255, - "io_cancel" : 210, - "io_destroy" : 207, - "io_getevents" : 208, - "io_setup" : 206, - "io_submit" : 209, - "lchown" : 94, - "lchown32" : -1, - "lgetxattr" : 192, - "link" : 86, - "linkat" : 265, - "listxattr" : 194, - "llistxattr" : 195, - "llseek" : -1, - "_llseek" : -1, - "lremovexattr" : 198, - "lseek" : 8, - "lsetxattr" : 189, - "lstat" : 6, - "lstat64" : -1, - "mkdir" : 83, - "mkdirat" : 258, - "mknod" : 133, - "mknodat" : 259, - "newfstatat" : 262, - "_newselect" : -1, - "oldfstat" : -1, - "oldlstat" : -1, - "oldolduname" : -1, - "oldstat" : -1, - "olduname" : -1, - "oldwait4" : -1, - "open" : 2, - "openat" : 257, - "pipe" : 22, - "pipe2" : 293, - "poll" : 7, - "ppoll" : 271, - "pread64" : 17, - "preadv" : 295, - "futimesat" : 261, - "pselect6" : 270, - "pwrite64" : 18, - "pwritev" : 296, - "read" : 0, - "readahead" : 187, - "readdir" : -1, - "readlink" : 89, - "readlinkat" : 267, - "readv" : 19, - "removexattr" : 197, - "rename" : 82, - "renameat" : 264, - "rmdir" : 84, - "select" : 23, - "sendfile" : 40, - "sendfile64" : -1, - "setxattr" : 188, - "splice" : 275, - "stat" : 4, - "stat64" : -1, - "statfs" : 137, - "statfs64" : -1, - "symlink" : 88, - "symlinkat" : 266, - "sync" : 162, - "sync_file_range" : 277, - "sync_file_range2" : -1, - "syncfs" : 306, - "tee" : 276, - "truncate" : 76, - "truncate64" : -1, - "umask" : 95, - "unlink" : 87, - "unlinkat" : 263, - "ustat" : 136, - "utime" : 132, - "utimensat" : 280, - "utimes" : 235, - "write" : 1, - "writev" : 20, - "accept" : 43, - "accept4" : 288, - "bind" : 49, - "connect" : 42, - "getpeername" : 52, - "getsockname" : 51, - "getsockopt" : 55, - "listen" : 50, - "recv" : -1, - "recvfrom" : 45, - "recvmmsg" : 299, - "recvmsg" : 47, - "send" : -1, - "sendmmsg" : 307, - "sendmsg" : 46, - "sendto" : 44, - "setsockopt" : 54, - "shutdown" : 48, - 
"socket" : 41, - "socketcall" : -1, - "socketpair" : 53, - "sethostname" : 170, - "pause" : 34, - "rt_sigaction" : 13, - "rt_sigpending" : 127, - "rt_sigprocmask" : 14, - "rt_sigqueueinfo" : 129, - "rt_sigreturn" : 15, - "rt_sigsuspend" : 130, - "rt_sigtimedwait" : 128, - "rt_tgsigqueueinfo" : 297, - "sigaction" : -1, - "sigaltstack" : 131, - "signal" : -1, - "signalfd" : 282, - "signalfd4" : 289, - "sigpending" : -1, - "sigprocmask" : -1, - "sigreturn" : -1, - "sigsuspend" : -1, - "alarm" : 37, - "brk" : 12, - "clock_adjtime" : 305, - "clock_getres" : 229, - "clock_gettime" : 228, - "clock_nanosleep" : 230, - "clock_settime" : 227, - "gettimeofday" : 96, - "nanosleep" : 35, - "nice" : -1, - "sysinfo" : 99, - "syslog" : 103, - "time" : 201, - "timer_create" : 222, - "timer_delete" : 226, - "timerfd_create" : 283, - "timerfd_gettime" : 287, - "timerfd_settime" : 286, - "timer_getoverrun" : 225, - "timer_gettime" : 224, - "timer_settime" : 223, - "times" : 100, - "uname" : 63, - "madvise" : 28, - "mbind" : 237, - "mincore" : 27, - "mlock" : 149, - "mlockall" : 151, - "mmap" : 9, - "mmap2" : -1, - "mprotect" : 10, - "mremap" : 25, - "msync" : 26, - "munlock" : 150, - "munlockall" : 152, - "munmap" : 11, - "remap_file_pages" : 216, - "set_mempolicy" : 238, - "vmsplice" : 278, - "capget" : 125, - "capset" : 126, - "clone" : 56, - "execve" : 59, - "exit" : 60, - "exit_group" : 231, - "fork" : 57, - "getcpu" : 309, - "getpgid" : 121, - "getpgrp" : 111, - "getpid" : 39, - "getppid" : 110, - "getpriority" : 140, - "getresgid" : 120, - "getresgid32" : -1, - "getresuid" : 118, - "getresuid32" : -1, - "getrlimit" : 97, - "getrusage" : 98, - "getsid" : 124, - "getuid" : 102, - "getuid32" : -1, - "getegid" : 108, - "getegid32" : -1, - "geteuid" : 107, - "geteuid32" : -1, - "getgid" : 104, - "getgid32" : -1, - "getgroups" : 115, - "getgroups32" : -1, - "getitimer" : 36, - "get_mempolicy" : 239, - "kill" : 62, - "prctl" : 157, - "prlimit64" : 302, - "sched_getaffinity" : 204, - 
"sched_getparam" : 143, - "sched_get_priority_max" : 146, - "sched_get_priority_min" : 147, - "sched_getscheduler" : 145, - "sched_rr_get_interval" : 148, - "sched_setaffinity" : 203, - "sched_setparam" : 142, - "sched_setscheduler" : 144, - "sched_yield" : 24, - "setfsgid" : 123, - "setfsgid32" : -1, - "setfsuid" : 122, - "setfsuid32" : -1, - "setgid" : 106, - "setgid32" : -1, - "setgroups" : 116, - "setgroups32" : -1, - "setitimer" : 38, - "setpgid" : 109, - "setpriority" : 141, - "setregid" : 114, - "setregid32" : -1, - "setresgid" : 119, - "setresgid32" : -1, - "setresuid" : 117, - "setresuid32" : -1, - "setreuid" : 113, - "setreuid32" : -1, - "setrlimit" : 160, - "setsid" : 112, - "setuid" : 105, - "setuid32" : -1, - "ugetrlimit" : -1, - "vfork" : 58, - "wait4" : 61, - "waitid" : 247, - "waitpid" : -1, - "ipc" : -1, - "mq_getsetattr" : 245, - "mq_notify" : 244, - "mq_open" : 240, - "mq_timedreceive" : 243, - "mq_timedsend" : 242, - "mq_unlink" : 241, - "msgctl" : 71, - "msgget" : 68, - "msgrcv" : 70, - "msgsnd" : 69, - "semctl" : 66, - "semget" : 64, - "semop" : 65, - "semtimedop" : 220, - "shmat" : 30, - "shmctl" : 31, - "shmdt" : 67, - "shmget" : 29, - "arch_prctl" : 158, - "get_robust_list" : 274, - "get_thread_area" : 211, - "gettid" : 186, - "futex" : 202, - "restart_syscall" : 219, - "set_robust_list" : 273, - "set_thread_area" : 205, - "set_tid_address" : 218, - "tgkill" : 234, - "tkill" : 200, - "acct" : 163, - "adjtimex" : 159, - "bdflush" : -1, - "chroot" : 161, - "create_module" : 174, - "delete_module" : 176, - "get_kernel_syms" : 177, - "idle" : -1, - "init_module" : 175, - "ioperm" : 173, - "iopl" : 172, - "ioprio_get" : 252, - "ioprio_set" : 251, - "kexec_load" : 246, - "lookup_dcookie" : 212, - "migrate_pages" : 256, - "modify_ldt" : 154, - "mount" : 165, - "move_pages" : 279, - "name_to_handle_at" : 303, - "nfsservctl" : 180, - "open_by_handle_at" : 304, - "perf_event_open" : 298, - "pivot_root" : 155, - "process_vm_readv" : 310, - 
"process_vm_writev" : 311, - "ptrace" : 101, - "query_module" : 178, - "quotactl" : 179, - "reboot" : 169, - "setdomainname" : 171, - "setns" : 308, - "settimeofday" : 164, - "sgetmask" : -1, - "ssetmask" : -1, - "stime" : -1, - "swapoff" : 168, - "swapon" : 167, - "_sysctl" : 156, - "sysfs" : 139, - "sys_setaltroot" : -1, - "umount" : -1, - "umount2" : 166, - "unshare" : 272, - "uselib" : 134, - "vhangup" : 153, - "vm86" : -1, - "vm86old" : -1, - "add_key" : 248, - "keyctl" : 250, - "request_key" : 249, - "afs_syscall" : 183, - "break" : -1, - "ftime" : -1, - "getpmsg" : 181, - "gtty" : -1, - "lock" : -1, - "madvise1" : -1, - "mpx" : -1, - "prof" : -1, - "profil" : -1, - "putpmsg" : 182, - "security" : 185, - "stty" : -1, - "tuxcall" : 184, - "ulimit" : -1, - "vserver" : 236, -} diff --git a/system/setns_linux.go b/system/setns_linux.go index a3c4cbb2..615ff4c8 100644 --- a/system/setns_linux.go +++ b/system/setns_linux.go @@ -21,16 +21,20 @@ var setNsMap = map[string]uintptr{ "linux/s390x": 339, } +var sysSetns = setNsMap[fmt.Sprintf("%s/%s", runtime.GOOS, runtime.GOARCH)] + +func SysSetns() uint32 { + return uint32(sysSetns) +} + func Setns(fd uintptr, flags uintptr) error { ns, exists := setNsMap[fmt.Sprintf("%s/%s", runtime.GOOS, runtime.GOARCH)] if !exists { return fmt.Errorf("unsupported platform %s/%s", runtime.GOOS, runtime.GOARCH) } - _, _, err := syscall.RawSyscall(ns, fd, flags, 0) if err != 0 { return err } - return nil }