diff --git a/Makefile b/Makefile index c72dd2f414d..0d48fe8c521 100644 --- a/Makefile +++ b/Makefile @@ -105,7 +105,7 @@ unittest: runcimage -t --privileged --rm \ -v /lib/modules:/lib/modules:ro \ -v $(CURDIR):/go/src/$(PROJECT) \ - $(RUNC_IMAGE) make localunittest TESTFLAGS=$(TESTFLAGS) + $(RUNC_IMAGE) make localunittest TESTFLAGS="$(TESTFLAGS)" localunittest: all $(GO) test -timeout 3m -tags "$(BUILDTAGS)" $(TESTFLAGS) -v ./... @@ -115,7 +115,7 @@ integration: runcimage -t --privileged --rm \ -v /lib/modules:/lib/modules:ro \ -v $(CURDIR):/go/src/$(PROJECT) \ - $(RUNC_IMAGE) make localintegration TESTPATH=$(TESTPATH) + $(RUNC_IMAGE) make localintegration TESTPATH="$(TESTPATH)" localintegration: all bats -t tests/integration$(TESTPATH) diff --git a/contrib/cmd/recvtty/recvtty.go b/contrib/cmd/recvtty/recvtty.go index 35c293de642..532a4c33904 100644 --- a/contrib/cmd/recvtty/recvtty.go +++ b/contrib/cmd/recvtty/recvtty.go @@ -98,7 +98,7 @@ func handleSingle(path string, noStdin bool) error { defer socket.Close() // Get the master file descriptor from runC. - master, err := utils.RecvFd(socket) + master, err := utils.RecvFile(socket) if err != nil { return err } @@ -171,7 +171,7 @@ func handleNull(path string) error { defer socket.Close() // Get the master file descriptor from runC. - master, err := utils.RecvFd(socket) + master, err := utils.RecvFile(socket) if err != nil { return } diff --git a/libcontainer/configs/mount_linux.go b/libcontainer/configs/mount_linux.go index a4275df3a03..6d4106de0c6 100644 --- a/libcontainer/configs/mount_linux.go +++ b/libcontainer/configs/mount_linux.go @@ -34,13 +34,13 @@ type Mount struct { // Note that, the underlying filesystem should support this feature to be // used. // Every mount point could have its own mapping. - UIDMappings []IDMap `json:"uidMappings,omitempty"` + UIDMappings []IDMap `json:"uid_mappings,omitempty"` // GIDMappings is used to changing file group owners w/o calling chown. // Note that, the underlying filesystem should support this feature to be // used. // Every mount point could have its own mapping. - GIDMappings []IDMap `json:"gidMappings,omitempty"` + GIDMappings []IDMap `json:"gid_mappings,omitempty"` } func (m *Mount) IsBind() bool { diff --git a/libcontainer/criu_linux.go b/libcontainer/criu_linux.go index 31c8a900eb1..ec6228399d8 100644 --- a/libcontainer/criu_linux.go +++ b/libcontainer/criu_linux.go @@ -1185,7 +1185,7 @@ func (c *Container) criuNotifications(resp *criurpc.CriuResp, process *Process, defer master.Close() // While we can access console.master, using the API is a good idea. - if err := utils.SendFd(process.ConsoleSocket, master.Name(), master.Fd()); err != nil { + if err := utils.SendFile(process.ConsoleSocket, master); err != nil { return err } case "status-ready": diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go index 2e2d00ff83a..7b6c8113b6c 100644 --- a/libcontainer/init_linux.go +++ b/libcontainer/init_linux.go @@ -5,7 +5,6 @@ import ( "encoding/json" "errors" "fmt" - "io" "net" "os" "runtime" @@ -121,11 +120,8 @@ func startInitialization() (retErr error) { defer func() { // If this defer is ever called, this means initialization has failed. // Send the error back to the parent process in the form of an initError. - if err := writeSync(pipe, procError); err != nil { - fmt.Fprintln(os.Stderr, err) - return - } - if err := utils.WriteJSON(pipe, &initError{Message: retErr.Error()}); err != nil { + ierr := initError{Message: retErr.Error()} + if err := writeSyncArg(pipe, procError, ierr); err != nil { fmt.Fprintln(os.Stderr, err) return } @@ -352,7 +348,6 @@ func setupConsole(socket *os.File, config *initConfig, mount bool) error { if err != nil { return err } - // After we return from here, we don't need the console anymore. defer pty.Close() @@ -374,9 +369,11 @@ func setupConsole(socket *os.File, config *initConfig, mount bool) error { } } // While we can access console.master, using the API is a good idea. - if err := utils.SendFd(socket, pty.Name(), pty.Fd()); err != nil { + if err := utils.SendRawFd(socket, pty.Name(), pty.Fd()); err != nil { return err } + runtime.KeepAlive(pty) + // Now, dup over all the things. return dupStdio(slavePath) } @@ -384,12 +381,11 @@ func setupConsole(socket *os.File, config *initConfig, mount bool) error { // syncParentReady sends to the given pipe a JSON payload which indicates that // the init is ready to Exec the child process. It then waits for the parent to // indicate that it is cleared to Exec. -func syncParentReady(pipe io.ReadWriter) error { +func syncParentReady(pipe *os.File) error { // Tell parent. if err := writeSync(pipe, procReady); err != nil { return err } - // Wait for parent to give the all-clear. return readSync(pipe, procRun) } @@ -397,44 +393,37 @@ func syncParentReady(pipe io.ReadWriter) error { // syncParentHooks sends to the given pipe a JSON payload which indicates that // the parent should execute pre-start hooks. It then waits for the parent to // indicate that it is cleared to resume. -func syncParentHooks(pipe io.ReadWriter) error { +func syncParentHooks(pipe *os.File) error { // Tell parent. if err := writeSync(pipe, procHooks); err != nil { return err } - // Wait for parent to give the all-clear. return readSync(pipe, procResume) } -// syncParentSeccomp sends to the given pipe a JSON payload which -// indicates that the parent should pick up the seccomp fd with pidfd_getfd() -// and send it to the seccomp agent over a unix socket. It then waits for -// the parent to indicate that it is cleared to resume and closes the seccompFd. -// If the seccompFd is -1, there isn't anything to sync with the parent, so it -// returns no error. -func syncParentSeccomp(pipe io.ReadWriter, seccompFd int) error { - if seccompFd == -1 { +// syncParentSeccomp sends the fd associated with the seccomp file descriptor +// to the parent, and wait for the parent to do pidfd_getfd() to grab a copy. +func syncParentSeccomp(pipe *os.File, seccompFd *os.File) error { + if seccompFd == nil { return nil } - - // Tell parent. - if err := writeSyncWithFd(pipe, procSeccomp, seccompFd); err != nil { - unix.Close(seccompFd) + defer seccompFd.Close() + + // Tell parent to grab our fd. + // + // Notably, we do not use writeSyncFile here because a container might have + // an SCMP_ACT_NOTIFY action on sendmsg(2) so we need to use the smallest + // possible number of system calls here because all of those syscalls + // cannot be used with SCMP_ACT_NOTIFY as a result (any syscall we use here + // before the parent gets the file descriptor would deadlock "runc init" if + // we allowed it for SCMP_ACT_NOTIFY). See seccomp.InitSeccomp() for more + // details. + if err := writeSyncArg(pipe, procSeccomp, seccompFd.Fd()); err != nil { return err } - - // Wait for parent to give the all-clear. - if err := readSync(pipe, procSeccompDone); err != nil { - unix.Close(seccompFd) - return fmt.Errorf("sync parent seccomp: %w", err) - } - - if err := unix.Close(seccompFd); err != nil { - return fmt.Errorf("close seccomp fd: %w", err) - } - - return nil + // Wait for parent to tell us they've grabbed the seccompfd. + return readSync(pipe, procSeccompDone) } // setupUser changes the groups, gid, and uid for the user inside the container diff --git a/libcontainer/integration/execin_test.go b/libcontainer/integration/execin_test.go index 58519a419c0..c5c324130c6 100644 --- a/libcontainer/integration/execin_test.go +++ b/libcontainer/integration/execin_test.go @@ -277,9 +277,9 @@ func TestExecInTTY(t *testing.T) { done := make(chan (error)) go func() { - f, err := utils.RecvFd(parent) + f, err := utils.RecvFile(parent) if err != nil { - done <- fmt.Errorf("RecvFd: %w", err) + done <- fmt.Errorf("RecvFile: %w", err) return } c, err := console.ConsoleFromFile(f) diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c index 07d9e0df130..17e0468c6af 100644 --- a/libcontainer/nsenter/nsexec.c +++ b/libcontainer/nsenter/nsexec.c @@ -1171,7 +1171,7 @@ void nsexec(void) * some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID) * was broken, so we'll just do it the long way anyway. */ - try_unshare(config.cloneflags & ~CLONE_NEWCGROUP, "remaining namespaces (except cgroupns)"); + try_unshare(config.cloneflags, "remaining namespaces"); update_timens_offsets(config.timensoffset, config.timensoffset_len); /* Ask our parent to send the mount sources fds. */ @@ -1302,10 +1302,6 @@ void nsexec(void) bail("setgroups failed"); } - if (config.cloneflags & CLONE_NEWCGROUP) { - try_unshare(CLONE_NEWCGROUP, "cgroup namespace"); - } - write_log(DEBUG, "signal completion to stage-0"); s = SYNC_CHILD_FINISH; if (write(syncfd, &s, sizeof(s)) != sizeof(s)) diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go index 48861406dba..7f78f730257 100644 --- a/libcontainer/process_linux.go +++ b/libcontainer/process_linux.go @@ -9,6 +9,7 @@ import ( "os" "os/exec" "path/filepath" + "runtime" "strconv" "time" @@ -171,14 +172,27 @@ func (p *setnsProcess) start() (retErr error) { panic("unexpected procHooks in setns") case procSeccomp: if p.config.Config.Seccomp.ListenerPath == "" { - return errors.New("listenerPath is not set") + return errors.New("seccomp listenerPath is not set") } - - seccompFd, err := recvSeccompFd(uintptr(p.pid()), uintptr(sync.Fd)) + if sync.Arg == nil { + return fmt.Errorf("sync %q is missing an argument", sync.Type) + } + var srcFd int + if err := json.Unmarshal(*sync.Arg, &srcFd); err != nil { + return fmt.Errorf("sync %q passed invalid fd arg: %w", sync.Type, err) + } + seccompFd, err := pidGetFd(p.pid(), srcFd) if err != nil { + return fmt.Errorf("sync %q get fd %d from child failed: %w", sync.Type, srcFd, err) + } + defer seccompFd.Close() + // We have a copy, the child can keep working. We don't need to + // wait for the seccomp notify listener to get the fd before we + // permit the child to continue because the child will happily wait + // for the listener if it hits SCMP_ACT_NOTIFY. + if err := writeSync(p.messageSockPair.parent, procSeccompDone); err != nil { return err } - defer unix.Close(seccompFd) bundle, annotations := utils.Annotations(p.config.Config.Labels) containerProcessState := &specs.ContainerProcessState{ @@ -199,15 +213,10 @@ func (p *setnsProcess) start() (retErr error) { containerProcessState, seccompFd); err != nil { return err } - - // Sync with child. - if err := writeSync(p.messageSockPair.parent, procSeccompDone); err != nil { - return err - } - return nil default: return errors.New("invalid JSON payload from child") } + return nil }) if err := unix.Shutdown(int(p.messageSockPair.parent.Fd()), unix.SHUT_WR); err != nil { @@ -451,23 +460,33 @@ func (p *initProcess) start() (retErr error) { if err := p.sendConfig(); err != nil { return fmt.Errorf("error sending config to init process: %w", err) } - var ( - sentRun bool - sentResume bool - ) + var seenProcReady bool ierr := parseSync(p.messageSockPair.parent, func(sync *syncT) error { switch sync.Type { case procSeccomp: if p.config.Config.Seccomp.ListenerPath == "" { - return errors.New("listenerPath is not set") + return errors.New("seccomp listenerPath is not set") } - - seccompFd, err := recvSeccompFd(uintptr(childPid), uintptr(sync.Fd)) + var srcFd int + if sync.Arg == nil { + return fmt.Errorf("sync %q is missing an argument", sync.Type) + } + if err := json.Unmarshal(*sync.Arg, &srcFd); err != nil { + return fmt.Errorf("sync %q passed invalid fd arg: %w", sync.Type, err) + } + seccompFd, err := pidGetFd(p.pid(), srcFd) if err != nil { + return fmt.Errorf("sync %q get fd %d from child failed: %w", sync.Type, srcFd, err) + } + defer seccompFd.Close() + // We have a copy, the child can keep working. We don't need to + // wait for the seccomp notify listener to get the fd before we + // permit the child to continue because the child will happily wait + // for the listener if it hits SCMP_ACT_NOTIFY. + if err := writeSync(p.messageSockPair.parent, procSeccompDone); err != nil { return err } - defer unix.Close(seccompFd) s, err := p.container.currentOCIState() if err != nil { @@ -488,12 +507,8 @@ func (p *initProcess) start() (retErr error) { containerProcessState, seccompFd); err != nil { return err } - - // Sync with child. - if err := writeSync(p.messageSockPair.parent, procSeccompDone); err != nil { - return err - } case procReady: + seenProcReady = true // set rlimits, this has to be done here because we lose permissions // to raise the limits once we enter a user-namespace if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil { @@ -555,7 +570,6 @@ func (p *initProcess) start() (retErr error) { if err := writeSync(p.messageSockPair.parent, procRun); err != nil { return err } - sentRun = true case procHooks: // Setup cgroup before prestart hook, so that the prestart hook could apply cgroup permissions. if err := p.manager.Set(p.config.Config.Cgroups.Resources); err != nil { @@ -587,28 +601,20 @@ func (p *initProcess) start() (retErr error) { if err := writeSync(p.messageSockPair.parent, procResume); err != nil { return err } - sentResume = true default: return errors.New("invalid JSON payload from child") } - return nil }) - if !sentRun { - return fmt.Errorf("error during container init: %w", ierr) - } - if p.config.Config.Namespaces.Contains(configs.NEWNS) && !sentResume { - return errors.New("could not synchronise after executing prestart and CreateRuntime hooks with container process") - } - if err := unix.Shutdown(int(p.messageSockPair.parent.Fd()), unix.SHUT_WR); err != nil { + if err := unix.Shutdown(int(p.messageSockPair.parent.Fd()), unix.SHUT_WR); err != nil && ierr == nil { return &os.PathError{Op: "shutdown", Path: "(init pipe)", Err: err} } - - // Must be done after Shutdown so the child will exit and we can wait for it. + if !seenProcReady && ierr == nil { + ierr = errors.New("procReady not received") + } if ierr != nil { - _, _ = p.wait() - return ierr + return fmt.Errorf("error during container init: %w", ierr) } return nil } @@ -684,22 +690,20 @@ func (p *initProcess) forwardChildLogs() chan error { return logs.ForwardLogs(p.logFilePair.parent) } -func recvSeccompFd(childPid, childFd uintptr) (int, error) { - pidfd, _, errno := unix.Syscall(unix.SYS_PIDFD_OPEN, childPid, 0, 0) - if errno != 0 { - return -1, fmt.Errorf("performing SYS_PIDFD_OPEN syscall: %w", errno) +func pidGetFd(pid, srcFd int) (*os.File, error) { + pidFd, err := unix.PidfdOpen(pid, 0) + if err != nil { + return nil, os.NewSyscallError("pidfd_open", err) } - defer unix.Close(int(pidfd)) - - seccompFd, _, errno := unix.Syscall(unix.SYS_PIDFD_GETFD, pidfd, childFd, 0) - if errno != 0 { - return -1, fmt.Errorf("performing SYS_PIDFD_GETFD syscall: %w", errno) + defer unix.Close(pidFd) + fd, err := unix.PidfdGetfd(pidFd, srcFd, 0) + if err != nil { + return nil, os.NewSyscallError("pidfd_getfd", err) } - - return int(seccompFd), nil + return os.NewFile(uintptr(fd), "[pidfd_getfd]"), nil } -func sendContainerProcessState(listenerPath string, state *specs.ContainerProcessState, fd int) error { +func sendContainerProcessState(listenerPath string, state *specs.ContainerProcessState, file *os.File) error { conn, err := net.Dial("unix", listenerPath) if err != nil { return fmt.Errorf("failed to connect with seccomp agent specified in the seccomp profile: %w", err) @@ -716,11 +720,10 @@ func sendContainerProcessState(listenerPath string, state *specs.ContainerProces return fmt.Errorf("cannot marshall seccomp state: %w", err) } - err = utils.SendFds(socket, b, fd) - if err != nil { + if err := utils.SendRawFd(socket, string(b), file.Fd()); err != nil { return fmt.Errorf("cannot send seccomp fd to %s: %w", listenerPath, err) } - + runtime.KeepAlive(file) return nil } diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go index 54a2ff57a85..4212b33a314 100644 --- a/libcontainer/rootfs_linux.go +++ b/libcontainer/rootfs_linux.go @@ -3,7 +3,6 @@ package libcontainer import ( "errors" "fmt" - "io" "os" "path" "path/filepath" @@ -64,7 +63,7 @@ func needsSetupDev(config *configs.Config) bool { // prepareRootfs sets up the devices, mount points, and filesystems for use // inside a new mount namespace. It doesn't set anything as ro. You must call // finalizeRootfs after this function to finish setting up the rootfs. -func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig, mountFds mountFds) (err error) { +func prepareRootfs(pipe *os.File, iConfig *initConfig, mountFds mountFds) (err error) { config := iConfig.Config if err := prepareRoot(config); err != nil { return fmt.Errorf("error preparing rootfs: %w", err) @@ -1106,7 +1105,7 @@ func writeSystemProperty(key, value string) error { func remount(m mountEntry, rootfs string, noMountFallback bool) error { return utils.WithProcfd(rootfs, m.Destination, func(dstFD string) error { flags := uintptr(m.Flags | unix.MS_REMOUNT) - err := mountViaFDs(m.Source, m.srcFD, m.Destination, dstFD, m.Device, flags, "") + err := mountViaFDs("", nil, m.Destination, dstFD, m.Device, flags, "") if err == nil { return nil } @@ -1130,7 +1129,7 @@ func remount(m mountEntry, rootfs string, noMountFallback bool) error { } // ... and retry the mount with flags found above. flags |= uintptr(int(s.Flags) & checkflags) - return mountViaFDs(m.Source, m.srcFD, m.Destination, dstFD, m.Device, flags, "") + return mountViaFDs("", nil, m.Destination, dstFD, m.Device, flags, "") }) } diff --git a/libcontainer/seccomp/patchbpf/enosys_linux.go b/libcontainer/seccomp/patchbpf/enosys_linux.go index 7fc9fd662c3..40e51533410 100644 --- a/libcontainer/seccomp/patchbpf/enosys_linux.go +++ b/libcontainer/seccomp/patchbpf/enosys_linux.go @@ -690,17 +690,17 @@ func sysSeccompSetFilter(flags uint, filter []unix.SockFilter) (fd int, err erro // patches said filter to handle -ENOSYS in a much nicer manner than the // default libseccomp default action behaviour, and loads the patched filter // into the kernel for the current process. -func PatchAndLoad(config *configs.Seccomp, filter *libseccomp.ScmpFilter) (int, error) { +func PatchAndLoad(config *configs.Seccomp, filter *libseccomp.ScmpFilter) (*os.File, error) { // Generate a patched filter. fprog, err := enosysPatchFilter(config, filter) if err != nil { - return -1, fmt.Errorf("error patching filter: %w", err) + return nil, fmt.Errorf("error patching filter: %w", err) } // Get the set of libseccomp flags set. seccompFlags, noNewPrivs, err := filterFlags(config, filter) if err != nil { - return -1, fmt.Errorf("unable to fetch seccomp filter flags: %w", err) + return nil, fmt.Errorf("unable to fetch seccomp filter flags: %w", err) } // Set no_new_privs if it was requested, though in runc we handle @@ -708,15 +708,14 @@ func PatchAndLoad(config *configs.Seccomp, filter *libseccomp.ScmpFilter) (int, if noNewPrivs { logrus.Warnf("potentially misconfigured filter -- setting no_new_privs in seccomp path") if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil { - return -1, fmt.Errorf("error enabling no_new_privs bit: %w", err) + return nil, fmt.Errorf("error enabling no_new_privs bit: %w", err) } } // Finally, load the filter. fd, err := sysSeccompSetFilter(seccompFlags, fprog) if err != nil { - return -1, fmt.Errorf("error loading seccomp filter: %w", err) + return nil, fmt.Errorf("error loading seccomp filter: %w", err) } - - return fd, nil + return os.NewFile(uintptr(fd), "[seccomp filter]"), nil } diff --git a/libcontainer/seccomp/seccomp_linux.go b/libcontainer/seccomp/seccomp_linux.go index fed02bcedc4..2c64ebbbadd 100644 --- a/libcontainer/seccomp/seccomp_linux.go +++ b/libcontainer/seccomp/seccomp_linux.go @@ -6,6 +6,7 @@ package seccomp import ( "errors" "fmt" + "os" libseccomp "github.com/seccomp/libseccomp-golang" "github.com/sirupsen/logrus" @@ -27,17 +28,16 @@ const ( ) // InitSeccomp installs the seccomp filters to be used in the container as -// specified in config. -// Returns the seccomp file descriptor if any of the filters include a -// SCMP_ACT_NOTIFY action, otherwise returns -1. -func InitSeccomp(config *configs.Seccomp) (int, error) { +// specified in config. Returns the seccomp file descriptor if any of the +// filters include a SCMP_ACT_NOTIFY action. +func InitSeccomp(config *configs.Seccomp) (*os.File, error) { if config == nil { - return -1, errors.New("cannot initialize Seccomp - nil config passed") + return nil, errors.New("cannot initialize Seccomp - nil config passed") } defaultAction, err := getAction(config.DefaultAction, config.DefaultErrnoRet) if err != nil { - return -1, errors.New("error initializing seccomp - invalid default action") + return nil, errors.New("error initializing seccomp - invalid default action") } // Ignore the error since pre-2.4 libseccomp is treated as API level 0. @@ -45,7 +45,7 @@ func InitSeccomp(config *configs.Seccomp) (int, error) { for _, call := range config.Syscalls { if call.Action == configs.Notify { if apiLevel < 6 { - return -1, fmt.Errorf("seccomp notify unsupported: API level: got %d, want at least 6. Please try with libseccomp >= 2.5.0 and Linux >= 5.7", apiLevel) + return nil, fmt.Errorf("seccomp notify unsupported: API level: got %d, want at least 6. Please try with libseccomp >= 2.5.0 and Linux >= 5.7", apiLevel) } // We can't allow the write syscall to notify to the seccomp agent. @@ -61,36 +61,36 @@ func InitSeccomp(config *configs.Seccomp) (int, error) { // agent allows those syscalls to proceed, initialization works just fine and the agent can // handle future read()/close() syscalls as it wanted. if call.Name == "write" { - return -1, errors.New("SCMP_ACT_NOTIFY cannot be used for the write syscall") + return nil, errors.New("SCMP_ACT_NOTIFY cannot be used for the write syscall") } } } // See comment on why write is not allowed. The same reason applies, as this can mean handling write too. if defaultAction == libseccomp.ActNotify { - return -1, errors.New("SCMP_ACT_NOTIFY cannot be used as default action") + return nil, errors.New("SCMP_ACT_NOTIFY cannot be used as default action") } filter, err := libseccomp.NewFilter(defaultAction) if err != nil { - return -1, fmt.Errorf("error creating filter: %w", err) + return nil, fmt.Errorf("error creating filter: %w", err) } // Add extra architectures for _, arch := range config.Architectures { scmpArch, err := libseccomp.GetArchFromString(arch) if err != nil { - return -1, fmt.Errorf("error validating Seccomp architecture: %w", err) + return nil, fmt.Errorf("error validating Seccomp architecture: %w", err) } if err := filter.AddArch(scmpArch); err != nil { - return -1, fmt.Errorf("error adding architecture to seccomp filter: %w", err) + return nil, fmt.Errorf("error adding architecture to seccomp filter: %w", err) } } // Add extra flags. for _, flag := range config.Flags { if err := setFlag(filter, flag); err != nil { - return -1, err + return nil, err } } @@ -110,25 +110,24 @@ func InitSeccomp(config *configs.Seccomp) (int, error) { // Unset no new privs bit if err := filter.SetNoNewPrivsBit(false); err != nil { - return -1, fmt.Errorf("error setting no new privileges: %w", err) + return nil, fmt.Errorf("error setting no new privileges: %w", err) } // Add a rule for each syscall for _, call := range config.Syscalls { if call == nil { - return -1, errors.New("encountered nil syscall while initializing Seccomp") + return nil, errors.New("encountered nil syscall while initializing Seccomp") } if err := matchCall(filter, call, defaultAction); err != nil { - return -1, err + return nil, err } } seccompFd, err := patchbpf.PatchAndLoad(config, filter) if err != nil { - return -1, fmt.Errorf("error loading seccomp filter into kernel: %w", err) + return nil, fmt.Errorf("error loading seccomp filter into kernel: %w", err) } - return seccompFd, nil } diff --git a/libcontainer/seccomp/seccomp_unsupported.go b/libcontainer/seccomp/seccomp_unsupported.go index 885529dc7d0..b08a3498687 100644 --- a/libcontainer/seccomp/seccomp_unsupported.go +++ b/libcontainer/seccomp/seccomp_unsupported.go @@ -5,6 +5,7 @@ package seccomp import ( "errors" + "os" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runtime-spec/specs-go" @@ -13,11 +14,11 @@ import ( var ErrSeccompNotEnabled = errors.New("seccomp: config provided but seccomp not supported") // InitSeccomp does nothing because seccomp is not supported. -func InitSeccomp(config *configs.Seccomp) (int, error) { +func InitSeccomp(config *configs.Seccomp) (*os.File, error) { if config != nil { - return -1, ErrSeccompNotEnabled + return nil, ErrSeccompNotEnabled } - return -1, nil + return nil, nil } // FlagSupported tells if a provided seccomp flag is supported. diff --git a/libcontainer/setns_init_linux.go b/libcontainer/setns_init_linux.go index ac58190758a..40a47a2e95c 100644 --- a/libcontainer/setns_init_linux.go +++ b/libcontainer/setns_init_linux.go @@ -75,7 +75,6 @@ func (l *linuxSetnsInit) Init() error { if err != nil { return err } - if err := syncParentSeccomp(l.pipe, seccompFd); err != nil { return err } @@ -94,7 +93,6 @@ func (l *linuxSetnsInit) Init() error { if err != nil { return fmt.Errorf("unable to init seccomp: %w", err) } - if err := syncParentSeccomp(l.pipe, seccompFd); err != nil { return err } diff --git a/libcontainer/sync.go b/libcontainer/sync.go index 25dc2863071..25507e58ad9 100644 --- a/libcontainer/sync.go +++ b/libcontainer/sync.go @@ -5,6 +5,7 @@ import ( "errors" "fmt" "io" + "os" "github.com/opencontainers/runc/libcontainer/utils" ) @@ -15,15 +16,19 @@ type syncType string // during container setup. They come in pairs (with procError being a generic // response which is followed by an &initError). // -// [ child ] <-> [ parent ] +// [ child ] <-> [ parent ] // -// procHooks --> [run hooks] -// <-- procResume +// procSeccomp --> [forward fd to listenerPath] +// file: seccomp fd +// --- no return synchronisation // -// procReady --> [final setup] -// <-- procRun +// procHooks --> [run hooks] +// <-- procResume // -// procSeccomp --> [pick up seccomp fd with pidfd_getfd()] +// procReady --> [final setup] +// <-- procRun +// +// procSeccomp --> [grab seccomp fd with pidfd_getfd()] // <-- procSeccompDone const ( procError syncType = "procError" @@ -35,9 +40,17 @@ const ( procSeccompDone syncType = "procSeccompDone" ) +type syncFlags int + +const ( + syncFlagHasFd syncFlags = (1 << iota) +) + type syncT struct { - Type syncType `json:"type"` - Fd int `json:"fd"` + Type syncType `json:"type"` + Flags syncFlags `json:"flags"` + Arg *json.RawMessage `json:"arg,omitempty"` + File *os.File `json:"-"` // passed oob through SCM_RIGHTS } // initError is used to wrap errors for passing them via JSON, @@ -50,74 +63,100 @@ func (i initError) Error() string { return i.Message } -// writeSync is used to write to a synchronisation pipe. An error is returned -// if there was a problem writing the payload. -func writeSync(pipe io.Writer, sync syncType) error { - return writeSyncWithFd(pipe, sync, -1) +func doWriteSync(pipe *os.File, sync syncT) error { + sync.Flags &= ^syncFlagHasFd + if sync.File != nil { + sync.Flags |= syncFlagHasFd + } + if err := utils.WriteJSON(pipe, sync); err != nil { + return fmt.Errorf("writing sync %q: %w", sync.Type, err) + } + if sync.Flags&syncFlagHasFd != 0 { + if err := utils.SendFile(pipe, sync.File); err != nil { + return fmt.Errorf("sending file after sync %q: %w", sync.Type, err) + } + } + return nil +} + +func writeSync(pipe *os.File, sync syncType) error { + return doWriteSync(pipe, syncT{Type: sync}) } -// writeSyncWithFd is used to write to a synchronisation pipe. An error is -// returned if there was a problem writing the payload. -func writeSyncWithFd(pipe io.Writer, sync syncType, fd int) error { - if err := utils.WriteJSON(pipe, syncT{sync, fd}); err != nil { - return fmt.Errorf("writing syncT %q: %w", string(sync), err) +func writeSyncArg(pipe *os.File, sync syncType, arg interface{}) error { + argJSON, err := json.Marshal(arg) + if err != nil { + return fmt.Errorf("writing sync %q: marshal argument failed: %w", sync, err) } - return nil + argJSONMsg := json.RawMessage(argJSON) + return doWriteSync(pipe, syncT{Type: sync, Arg: &argJSONMsg}) } -// readSync is used to read from a synchronisation pipe. An error is returned -// if we got an initError, the pipe was closed, or we got an unexpected flag. -func readSync(pipe io.Reader, expected syncType) error { - var procSync syncT - if err := json.NewDecoder(pipe).Decode(&procSync); err != nil { +func doReadSync(pipe *os.File) (syncT, error) { + var sync syncT + if err := json.NewDecoder(pipe).Decode(&sync); err != nil { if errors.Is(err, io.EOF) { - return errors.New("parent closed synchronisation channel") + return sync, err } - return fmt.Errorf("failed reading error from parent: %w", err) + return sync, fmt.Errorf("reading from parent failed: %w", err) } - - if procSync.Type == procError { + if sync.Type == procError { var ierr initError - - if err := json.NewDecoder(pipe).Decode(&ierr); err != nil { - return fmt.Errorf("failed reading error from parent: %w", err) + if sync.Arg == nil { + return sync, errors.New("procError missing error payload") } + if err := json.Unmarshal(*sync.Arg, &ierr); err != nil { + return sync, fmt.Errorf("unmarshal procError failed: %w", err) + } + return sync, &ierr + } + if sync.Flags&syncFlagHasFd != 0 { + file, err := utils.RecvFile(pipe) + if err != nil { + return sync, fmt.Errorf("receiving fd from sync %q failed: %w", sync.Type, err) + } + sync.File = file + } + return sync, nil +} - return &ierr +func readSyncFull(pipe *os.File, expected syncType) (syncT, error) { + sync, err := doReadSync(pipe) + if err != nil { + return sync, err + } + if sync.Type != expected { + return sync, fmt.Errorf("unexpected synchronisation flag: got %q, expected %q", sync.Type, expected) } + return sync, nil +} - if procSync.Type != expected { - return errors.New("invalid synchronisation flag from parent") +func readSync(pipe *os.File, expected syncType) error { + sync, err := readSyncFull(pipe, expected) + if err != nil { + return err + } + if sync.Arg != nil { + return fmt.Errorf("sync %q had unexpected argument passed: %q", expected, string(*sync.Arg)) + } + if sync.File != nil { + _ = sync.File.Close() + return fmt.Errorf("sync %q had unexpected file passed", sync.Type) } return nil } // parseSync runs the given callback function on each syncT received from the // child. It will return once io.EOF is returned from the given pipe. -func parseSync(pipe io.Reader, fn func(*syncT) error) error { - dec := json.NewDecoder(pipe) +func parseSync(pipe *os.File, fn func(*syncT) error) error { for { - var sync syncT - if err := dec.Decode(&sync); err != nil { + sync, err := doReadSync(pipe) + if err != nil { if errors.Is(err, io.EOF) { break } return err } - - // We handle this case outside fn for cleanliness reasons. - var ierr *initError - if sync.Type == procError { - if err := dec.Decode(&ierr); err != nil && !errors.Is(err, io.EOF) { - return fmt.Errorf("error decoding proc error from init: %w", err) - } - if ierr != nil { - return ierr - } - // Programmer error. - panic("No error following JSON procError payload.") - } - if err := fn(&sync); err != nil { return err } diff --git a/libcontainer/utils/cmsg.go b/libcontainer/utils/cmsg.go index fd9326cb59a..2edd1417af3 100644 --- a/libcontainer/utils/cmsg.go +++ b/libcontainer/utils/cmsg.go @@ -19,13 +19,14 @@ package utils import ( "fmt" "os" + "runtime" "golang.org/x/sys/unix" ) -// MaxNameLen is the maximum length of the name of a file descriptor being -// sent using SendFd. The name of the file handle returned by RecvFd will never -// be larger than this value. +// MaxNameLen is the maximum length of the name of a file descriptor being sent +// using SendFile. The name of the file handle returned by RecvFile will never be +// larger than this value. const MaxNameLen = 4096 // oobSpace is the size of the oob slice required to store a single FD. Note @@ -33,26 +34,21 @@ const MaxNameLen = 4096 // so sizeof(fd) = 4. var oobSpace = unix.CmsgSpace(4) -// RecvFd waits for a file descriptor to be sent over the given AF_UNIX +// RecvFile waits for a file descriptor to be sent over the given AF_UNIX // socket. The file name of the remote file descriptor will be recreated // locally (it is sent as non-auxiliary data in the same payload). -func RecvFd(socket *os.File) (*os.File, error) { - // For some reason, unix.Recvmsg uses the length rather than the capacity - // when passing the msg_controllen and other attributes to recvmsg. So we - // have to actually set the length. +func RecvFile(socket *os.File) (_ *os.File, Err error) { name := make([]byte, MaxNameLen) oob := make([]byte, oobSpace) sockfd := socket.Fd() - n, oobn, _, _, err := unix.Recvmsg(int(sockfd), name, oob, 0) + n, oobn, _, _, err := unix.Recvmsg(int(sockfd), name, oob, unix.MSG_CMSG_CLOEXEC) if err != nil { return nil, err } - if n >= MaxNameLen || oobn != oobSpace { - return nil, fmt.Errorf("recvfd: incorrect number of bytes read (n=%d oobn=%d)", n, oobn) + return nil, fmt.Errorf("recvfile: incorrect number of bytes read (n=%d oobn=%d)", n, oobn) } - // Truncate. name = name[:n] oob = oob[:oobn] @@ -61,36 +57,63 @@ func RecvFd(socket *os.File) (*os.File, error) { if err != nil { return nil, err } - if len(scms) != 1 { - return nil, fmt.Errorf("recvfd: number of SCMs is not 1: %d", len(scms)) + + // We cannot control how many SCM_RIGHTS we receive, and upon receiving + // them all of the descriptors are installed in our fd table, so we need to + // parse all of the SCM_RIGHTS we received in order to close all of the + // descriptors on error. + var fds []int + defer func() { + for i, fd := range fds { + if i == 0 && Err == nil { + // Only close the first one on error. + continue + } + // Always close extra ones. + _ = unix.Close(fd) + } + }() + var lastErr error + for _, scm := range scms { + if scm.Header.Type == unix.SCM_RIGHTS { + scmFds, err := unix.ParseUnixRights(&scm) + if err != nil { + lastErr = err + } else { + fds = append(fds, scmFds...) + } + } + } + if lastErr != nil { + return nil, lastErr } - scm := scms[0] - fds, err := unix.ParseUnixRights(&scm) - if err != nil { - return nil, err + // We do this after collecting the fds to make sure we close them all when + // returning an error here. + if len(scms) != 1 { + return nil, fmt.Errorf("recvfd: number of SCMs is not 1: %d", len(scms)) } if len(fds) != 1 { return nil, fmt.Errorf("recvfd: number of fds is not 1: %d", len(fds)) } - fd := uintptr(fds[0]) - - return os.NewFile(fd, string(name)), nil + return os.NewFile(uintptr(fds[0]), string(name)), nil } -// SendFd sends a file descriptor over the given AF_UNIX socket. In -// addition, the file.Name() of the given file will also be sent as -// non-auxiliary data in the same payload (allowing to send contextual -// information for a file descriptor). -func SendFd(socket *os.File, name string, fd uintptr) error { +// SendFile sends a file over the given AF_UNIX socket. file.Name() is also +// included so that if the other end uses RecvFile, the file will have the same +// name information. +func SendFile(socket *os.File, file *os.File) error { + name := file.Name() if len(name) >= MaxNameLen { return fmt.Errorf("sendfd: filename too long: %s", name) } - return SendFds(socket, []byte(name), int(fd)) + err := SendRawFd(socket, name, file.Fd()) + runtime.KeepAlive(file) + return err } -// SendFds sends a list of files descriptor and msg over the given AF_UNIX socket. -func SendFds(socket *os.File, msg []byte, fds ...int) error { - oob := unix.UnixRights(fds...) - return unix.Sendmsg(int(socket.Fd()), msg, oob, nil, 0) +// SendRawFd sends a specific file descriptor over the given AF_UNIX socket. +func SendRawFd(socket *os.File, msg string, fd uintptr) error { + oob := unix.UnixRights(int(fd)) + return unix.Sendmsg(int(socket.Fd()), []byte(msg), oob, nil, 0) } diff --git a/libcontainer/utils/utils_unix.go b/libcontainer/utils/utils_unix.go index 6b9a7be038f..ca520b63b36 100644 --- a/libcontainer/utils/utils_unix.go +++ b/libcontainer/utils/utils_unix.go @@ -91,7 +91,7 @@ func CloseExecFrom(minFd int) error { } // NewSockPair returns a new unix socket pair -func NewSockPair(name string) (parent *os.File, child *os.File, err error) { +func NewSockPair(name string) (parent, child *os.File, err error) { fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0) if err != nil { return nil, nil, err diff --git a/tty.go b/tty.go index fba3e025bc0..c101aacb78b 100644 --- a/tty.go +++ b/tty.go @@ -100,7 +100,7 @@ func (t *tty) initHostConsole() error { } func (t *tty) recvtty(socket *os.File) (Err error) { - f, err := utils.RecvFd(socket) + f, err := utils.RecvFile(socket) if err != nil { return err }