kata agent runc 启动容器过程分析(附 CVE-2019-5736 实现过程)

root@ubuntu:/home/ubuntu# kata-runtime exec e12a7db6fb05df044a59a19bb03c39fe7752e4d684a8e2e58822b88606d3ac3e
rpc error: code = Internal desc = Could not run process: container_linux.go:349: starting container process caused "panic from initialization: runtime error: index out of range, goroutine 1 [running, locked to thread]:
runtime/debug.Stack(0x400018fbd8, 0xaaaab1b68260, 0xaaaab21de220)
	/usr/go/src/runtime/debug/stack.go:24 +0x88

github.com/kata-containers/agent/vendor/github.com/opencontainers/runc/libcontainer.(*LinuxFactory).StartInitialization.func2(0x400018fea0) /root/go/src/github.com/kata-containers/agent/vendor/github.com/opencontainers/runc/libcontainer/factory_linux.go:370
+0x40 panic(0xaaaab1b68260, 0xaaaab21de220) /usr/go/src/runtime/panic.go:513 +0x18c github.com/kata-containers/agent/vendor/github.com/opencontainers/runc/libcontainer.(*linuxSetnsInit).Init(0x400012d9c0, 0x0, 0x0) /root/go/src/github.com/kata-containers/agent/vendor/github.com/opencontainers/runc/libcontainer/setns_init_linux.go:91 +0x434 github.com/kata-containers/agent/vendor/github.com/opencontainers/runc/libcontainer.(*LinuxFactory).StartInitialization(0x4000164090, 0x0, 0x0) /root/go/src/github.com/kata-containers/agent/vendor/github.com/opencontainers/runc/libcontainer/factory_linux.go: 380 +0x2ec main.init.0() /root/go/src/github.com/kata-containers/agent/agent.go:1506 +0x88 " root@ubuntu:/home/ubuntu#

kata agent
runc 启动容器过程分析(附 CVE-2019-5736 实现过程)

【kubernetes/k8s源码分析】kata container agent create container 源码分析

https://blog.csdn.net/zhonglinzhang/article/details/101212033

linuxStandardInit.Init()(github.com/opencontainers/runc/libcontainer/standard_init_linux.go#47):

func (l *linuxStandardInit) Init() error {
    // 这里比较重要的是这个函数,此时各个 Namespace 虽然都挂载完毕了,但是当前的进程的视角里根目录和容器外是一样的
    // 因此这个方法会挂载设备,bind mount,然后将当前根目录切换到容器的根目录下。
	if err := prepareRootfs(l.pipe, l.config); err != nil {
		return err
	}

	// 设置 root (/) 为只读
	if l.config.Config.Namespaces.Contains(configs.NEWNS) {
		if err := finalizeRootfs(l.config.Config); err != nil {
			return err
		}
	}

	// 在完成一系列容器内的环境准备之后,通过 execve 执行容器内的 entrypoint
	if err := syscall.Exec(name, l.config.Args[0:], os.Environ()); err != nil {
		return newSystemErrorWithCause(err, "exec user process")
	}
	return nil
}

总结:

  • runc init 一共会有三个进程
    • 第一个进程读取 bootstrapData,并完成第二个进程的 user map 的设置
    • 第二个进程完成 namespace 的设置
    • 第三个进程完成 cgroup namespace 的设置,并读取了 0x80 的同步信息。最后进入 go 代码。go 代码读取 container config,进行容器内环境准备,最后执行容器的 entrypoint
      • 47 func (l *linuxStandardInit) Init() error {
         48         runtime.LockOSThread()
         49         defer runtime.UnlockOSThread()
         50         if !l.config.Config.NoNewKeyring {
         51                 if err := label.SetKeyLabel(l.config.ProcessLabel); err != nil {
         52                         return err
         53                 }
         54                 defer label.SetKeyLabel("")
         55                 ringname, keepperms, newperms := l.getSessionRingParams()
         56 
         57                 // Do not inherit the parent's session keyring.
         58                 if sessKeyId, err := keys.JoinSessionKeyring(ringname); err != nil {
         59                         // If keyrings aren't supported then it is likely we are on an
         60                         // older kernel (or inside an LXC container). While we could bail,
         61                         // the security feature we are using here is best-effort (it only
         62                         // really provides marginal protection since VFS credentials are
         63                         // the only significant protection of keyrings).
         64                         //
         65                         // TODO(cyphar): Log this so people know what's going on, once we
         66                         //               have proper logging in 'runc init'.
         67                         if errors.Cause(err) != unix.ENOSYS {
         68                                 return errors.Wrap(err, "join session keyring")
         69                         }
         70                 } else {
         71                         // Make session keyring searcheable. If we've gotten this far we
         72                         // bail on any error -- we don't want to have a keyring with bad
         73                         // permissions.
         74                         if err := keys.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil {
         75                                 return errors.Wrap(err, "mod keyring permissions")
         76                         }
         77                 }
         78         }
         79 
         80         if err := setupNetwork(l.config); err != nil {
         81                 return err
         82         }
         83         if err := setupRoute(l.config.Config); err != nil {
         84                 return err
         85         }
        
        
        
        
         86 
         87         label.Init()
         88         if err := prepareRootfs(l.pipe, l.config); err != nil {
         89                 return err
         90         }
         91         // Set up the console. This has to be done *before* we finalize the rootfs,
         92         // but *after* we've given the user the chance to set up all of the mounts
         93         // they wanted.
         94         if l.config.CreateConsole {
         95                 if err := setupConsole(l.consoleSocket, l.config, true); err != nil {
         96                         return err
         97                 }
         98                 if err := system.Setctty(); err != nil {
         99                         return errors.Wrap(err, "setctty")
        100                 }
        101         }
        102 
        103         // Finish the rootfs setup.
        104         if l.config.Config.Namespaces.Contains(configs.NEWNS) {
        105                 if err := finalizeRootfs(l.config.Config); err != nil {
        106                         return err
        107                 }
        108         }
        109 
        110         if hostname := l.config.Config.Hostname; hostname != "" {
        111                 if err := unix.Sethostname([]byte(hostname)); err != nil {
        112                         return errors.Wrap(err, "sethostname")
        113                 }
        114         }
        115         if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
        116                 return errors.Wrap(err, "apply apparmor profile")
        117         }
        118 
        119         for key, value := range l.config.Config.Sysctl {
        120                 if err := writeSystemProperty(key, value); err != nil {
        121                         return errors.Wrapf(err, "write sysctl key %s", key)
        122                 }
        123         }
        124         for _, path := range l.config.Config.ReadonlyPaths {
        125                 if err := readonlyPath(path); err != nil {
        126                         return errors.Wrapf(err, "readonly path %s", path)
        127                 }
        128         }
        129         for _, path := range l.config.Config.MaskPaths {
        130                 if err := maskPath(path, l.config.Config.MountLabel); err != nil {
        131                         return errors.Wrapf(err, "mask path %s", path)
        132                 }
        133         }
        134         pdeath, err := system.GetParentDeathSignal()
        135         if err != nil {
        136                 return errors.Wrap(err, "get pdeath signal")
        137         }
        138         if l.config.NoNewPrivileges {
        139                 if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
        140                         return errors.Wrap(err, "set nonewprivileges")
        141                 }
        142         }
        143         // Tell our parent that we're ready to Execv. This must be done before the
        144         // Seccomp rules have been applied, because we need to be able to read and
        145         // write to a socket.
        146         if err := syncParentReady(l.pipe); err != nil {
        147                 return errors.Wrap(err, "sync ready")
        148         }
        149         if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil {
        150                 return errors.Wrap(err, "set process label")
        151         }
        152         defer label.SetProcessLabel("")
        153         // Without NoNewPrivileges seccomp is a privileged operation, so we need to
        154         // do this before dropping capabilities; otherwise do it as late as possible
        155         // just before execve so as few syscalls take place after it as possible.
        156         if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges {
        157                 if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
        158                         return err
        159                 }
        160         }
        161         if err := finalizeNamespace(l.config); err != nil {
        162                 return err
        163         }
        164         // finalizeNamespace can change user/group which clears the parent death
        165         // signal, so we restore it here.
        166         if err := pdeath.Restore(); err != nil {
        167                 return errors.Wrap(err, "restore pdeath signal")
        168         }
        169         // Compare the parent from the initial start of the init process and make
        170         // sure that it did not change.  if the parent changes that means it died
        171         // and we were reparented to something else so we should just kill ourself
        172         // and not cause problems for someone else.
        173         if unix.Getppid() != l.parentPid {
        174                 return unix.Kill(unix.Getpid(), unix.SIGKILL)
        175         }
        176         // Check for the arg before waiting to make sure it exists and it is
        177         // returned as a create time error.
        178         name, err := exec.LookPath(l.config.Args[0])
        179         if err != nil {
        180                 return err
        181         }
        182         // Close the pipe to signal that we have completed our init.
        183         l.pipe.Close()
        184         // Wait for the FIFO to be opened on the other side before exec-ing the
        185         // user process. We open it through /proc/self/fd/$fd, because the fd that
        186         // was given to us was an O_PATH fd to the fifo itself. Linux allows us to
        187         // re-open an O_PATH fd through /proc.
        188         fd, err := unix.Open(fmt.Sprintf("/proc/self/fd/%d", l.fifoFd), unix.O_WRONLY|unix.O_CLOEXEC, 0)
        189         if err != nil {
        190                 return newSystemErrorWithCause(err, "open exec fifo")
        191         }
        192         if _, err := unix.Write(fd, []byte("0")); err != nil {
        193                 return newSystemErrorWithCause(err, "write 0 exec fifo")
        194         }
        195         // Close the O_PATH fifofd fd before exec because the kernel resets
        196         // dumpable in the wrong order. This has been fixed in newer kernels, but
        197         // we keep this to ensure CVE-2016-9962 doesn't re-emerge on older kernels.
        198         // N.B. the core issue itself (passing dirfds to the host filesystem) has
        199         // since been resolved.
        200         // https://github.com/torvalds/linux/blob/v4.9/fs/exec.c#L1290-L1318
        201         unix.Close(l.fifoFd)
        202         // Set seccomp as close to execve as possible, so as few syscalls take
        203         // place afterward (reducing the amount of syscalls that users need to
        204         // enable in their seccomp profiles).
        205         if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges {
        206                 if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
        207                         return newSystemErrorWithCause(err, "init seccomp")
        208                 }
        209         }
        210         if err := syscall.Exec(name, l.config.Args[0:], os.Environ()); err != nil {
        211                 return newSystemErrorWithCause(err, "exec user process")
        212         }
        213         return nil
        214 }

          调用Init

          •         i, err := newContainerInit(it, pipe, consoleSocket, fifofd)
                    if err != nil {
                            return err
                    }
            
                    // If Init succeeds, syscall.Exec will not return, hence none of the defers will be called.
                    return i.Init()
            }
      • // Shared function between CreateContainer and ExecProcess, because those expect
        // a process to be run.
        func (a *agentGRPC) execProcess(ctr *container, proc *process, createContainer bool) (err error) {
                if ctr == nil {
                        return grpcStatus.Error(codes.InvalidArgument, "Container cannot be nil")
                }
        
                if proc == nil {
                        return grpcStatus.Error(codes.InvalidArgument, "Process cannot be nil")
                }
        
                // This lock is very important to avoid any race with reaper.reap().
                // Indeed, if we don't lock this here, we could potentially get the
                // SIGCHLD signal before the channel has been created, meaning we will
                // miss the opportunity to get the exit code, leading WaitProcess() to
                // wait forever on the new channel.
                // This lock has to be taken before we run the new process.
                a.sandbox.subreaper.lock()
                defer a.sandbox.subreaper.unlock()
        
                if createContainer {
                        err = ctr.container.Start(&proc.process)
                } else {
                        err = ctr.container.Run(&(proc.process))
                }
                if err != nil {
                        return grpcStatus.Errorf(codes.Internal, "Could not run process: %v", err)
                }

        vendor/github.com/opencontainers/runc/libcontainer/container_linux.go +233

        • func (c *linuxContainer) Start(process *Process) error {
                  c.m.Lock()
                  defer c.m.Unlock()
                  if process.Init {
                          if err := c.createExecFifo(); err != nil {
                                  return err
                          }
                  }
                  if err := c.start(process); err != nil {
                          if process.Init {
                                  c.deleteExecFifo()
                          }
                          return err
                  }
                  return nil
          }
          
          func (c *linuxContainer) Run(process *Process) error {
                  if err := c.Start(process); err != nil {
                          return err
                  }
                  if process.Init {
                          return c.exec()
                  }
                  return nil
          }
        • newParentProcess 函数
          
          创建一对pipe,parentPipe和childPipe,作为 start 进程与容器内部 init 进程通信管道
          创建一个命令模版作为 Parent 进程启动的模板
          newInitProcess 封装 initProcess。主要工作为添加初始化类型环境变量,将namespace、uid/gid 映射等信息使用 bootstrapData 封装为一个 io.Reader
                 initProcess 实现了 parentProcess 接口
        • func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) {
              parentInitPipe, childInitPipe, err := utils.NewSockPair("init")
              if err != nil {
                  return nil, newSystemErrorWithCause(err, "creating new init pipe")
              }
              messageSockPair := filePair{parentInitPipe, childInitPipe}
           
              parentLogPipe, childLogPipe, err := os.Pipe()
              if err != nil {
                  return nil, fmt.Errorf("Unable to create the log pipe:  %s", err)
              }
              logFilePair := filePair{parentLogPipe, childLogPipe}
           
              cmd, err := c.commandTemplate(p, childInitPipe, childLogPipe)
              if err != nil {
                  return nil, newSystemErrorWithCause(err, "creating new command template")
              }
              if !p.Init {
                  return c.newSetnsProcess(p, cmd, messageSockPair, logFilePair)
              }
           
              // We only set up fifoFd if we're not doing a `runc exec`. The historic
              // reason for this is that previously we would pass a dirfd that allowed
              // for container rootfs escape (and not doing it in `runc exec` avoided
              // that problem), but we no longer do that. However, there's no need to do
              // this for `runc exec` so we just keep it this way to be safe.
              if err := c.includeExecFifo(cmd); err != nil {
                  return nil, newSystemErrorWithCause(err, "including execfifo in cmd.Exec setup")
              }
              return c.newInitProcess(p, cmd, messageSockPair, logFilePair)
        • agent.go

        • func main() {
                  defer handlePanic()
          
                  err := realMain()
                  if err != nil {
                          agentLog.WithError(err).Error("agent failed")
                          os.Exit(1)
                  }
          
                  agentLog.Debug("agent exiting")
          
                  os.Exit(0)
          }
        • initProcess start 函数
          
               创建新的进程。新进程以 /proc/self/exe 为执行入口,参数为 init。由于 Go 的 func init() 会在 main 函数之前执行,新进程会直接进入 func init() 中的初始化逻辑,而不会执行到 main 函数
          
          func (p *initProcess) start() error {
              defer p.messageSockPair.parent.Close()
              err := p.cmd.Start()
              p.process.ops = p
              // close the write-side of the pipes (controlled by child)
              p.messageSockPair.child.Close()
              p.logFilePair.child.Close()
              if err != nil {
                  p.process.ops = nil
                  return newSystemErrorWithCause(err, "starting init process command")
              }
                   cmd 如最后命令所示,Path填充为 /proc/self/exe(本身 agent)。参数字段 Args 为 init,
          表示对容器进行初始化,调用的为 agent init
          agent 最后直接复用 runc 代码
        • func init() {
                  if len(os.Args) > 1 && os.Args[1] == "init" {
                          runtime.GOMAXPROCS(1)
                          runtime.LockOSThread()
                          factory, _ := libcontainer.New("")
                          if err := factory.StartInitialization(); err != nil {
                                  agentLog.WithError(err).Error("init failed")
                          }
                          panic("--this line should have never been executed, congratulations--")
                  }
          }
        • 环境

          OCI runtime spec 地址:https://github.com/opencontainers/runtime-spec
          runc 地址:https://github.com/opencontainers/runc/
          Commit:f414f497b50a61750ea3af9fccf998a3db687cea
          系统版本:Fedora Release 28
          内核版本:4.17.9-200.fc28.x86_64

          runc 介绍

          runc 实现了 OCI 的容器标准,能够管理容器的生命周期。runc 的详细功能请参考 帮助文档

          runc 不是基于 server 形式的,所以所有的配置和状态都会存储在本地文件系统中(以下均为使用 docker 时的默认路径):

          • 容器配置:/run/docker/libcontainerd/{container-id}/config.json
          • 容器 init 进程的标准输入输出流:/run/docker/libcontainerd/{container-id}/{init-stdin,init-stdout,init-stderr}
          • 容器状态信息:/run/runc/*/state.json

          runc 创建容器时会将状态记录到 state.json 中,所有查询都是从 state.json 中取得容器基本信息,然后再从系统中获取容器实时状态。

          docker 的调用链如下:

          docker-client -> dockerd -> docker-containerd -> docker-containerd-shim -> runc(容器外) -> runc(容器内) -> container-entrypoint
          

          runc 启动容器过程

          runc 在被 docker-containerd-shim 调用时,参数中会指定容器的配置路径(即 config.json 的位置),同时容器的根路径也已经准备完毕,因此 runc 不会有跟镜像相关的概念。容器的启动过程分析直接从 runc run 开始,即 docker 调用链中的 runc(容器外)这个时间点。

          runc(容器外)环境准备

          读取 config.json(github.com/opencontainers/runc/run.go#65):

          // 读取 config.json
          spec, err := setupSpec(context)
          if err != nil {
          	return err
          }
          // 启动容器
          status, err := startContainer(context, spec, CT_ACT_RUN, nil)
          if err == nil {
          	os.Exit(status)
          }
          return err
          

          startContainer 创建容器信息,并启动(github.com/opencontainers/runc/utils_linux.go#396):

          func startContainer(context *cli.Context, spec *specs.Spec, action CtAct, criuOpts *libcontainer.CriuOpts) (int, error) {
              // 通过 spec 创建容器结构,在 createContainer 中将 spec 转换为了 runc 的 container config
          	container, err := createContainer(context, id, spec)
          	if err != nil {
          		return -1, err
          	}
              // 构建 runner 启动容器
          	r := &runner{
          		// 容器
          		container:       container,
          		// 即 CT_ACT_RUN
          		action:          action,
          		// 用于设置 process.Init 字段
          		init:            true,
          	}
          	return r.run(spec.Process)
          }
          

          r.run() 启动容器(github.com/opencontainers/runc/utils_linux.go#268):

          func (r *runner) run(config *specs.Process) (int, error) {
          	// 根据 config 构建容器进程,此处 r.init 为 true
          	process, err := newProcess(*config, r.init)
          	if err != nil {
          		r.destroy()
          		return -1, err
          	}
          
              // 根据 action 调用 container 的对应方法
          	switch r.action {
          	case CT_ACT_CREATE:
          		err = r.container.Start(process)
          	case CT_ACT_RESTORE:
          		err = r.container.Restore(process, r.criuOpts)
              case CT_ACT_RUN:
                  // 此处调用的是这个方法
          		err = r.container.Run(process)
          	default:
          		panic("Unknown action")
          	}
          }
          

          container 是由 createContainer() 方法创建,根据创建链路 createContainer() -> loadFactory() -> libcontainer.New() 确认容器由 LinuxFactory.Create() 创建:

          // github.com/opencontainers/runc/libcontainer/factory_linux.go#132
          func New(root string, options ...func(*LinuxFactory) error) (Factory, error) {
          	l := &LinuxFactory{
                  // 指向当前的 exe 程序,即 runc 本身
                  InitPath:  "/proc/self/exe",
                  // os.Args[0] 是当前 runc 的路径,本质上和 InitPath 是一样的,即 runc init
          		InitArgs:  []string{os.Args[0], "init"},
          	}
          	return l, nil
          }
          
          // github.com/opencontainers/runc/libcontainer/factory_linux.go#189
          func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, error) {
              // 创建 linux 容器结构
          	c := &linuxContainer{
                  // 容器 ID
                  id:            id,
                  // 容器状态文件存放目录,默认是 /run/runc/{容器 id}/
                  root:          containerRoot,
                  // 容器配置
                  config:        config,
                  // 即 /proc/self/exe,就是 runc
                  initPath:      l.InitPath,
                  // 即 runc init
          		initArgs:      l.InitArgs,
          	}
          	return c, nil
          }
          

          所以整个容器的启动逻辑在 linuxContainer.Run() 里,调用链是 linuxContainer.Run() -> linuxContainer.Start() -> linuxContainer.start():

          // github.com/opencontainers/runc/libcontainer/container_linux.go#334
          func (c *linuxContainer) start(process *Process) error {
              // process 是容器的 entrypoint,此处创建的是 entrypoint 的父进程
          	parent, err := c.newParentProcess(process)
          	if err != nil {
          		return newSystemErrorWithCause(err, "creating new parent process")
              }
              // 启动父进程
          	if err := parent.start(); err != nil {
          		// terminate the process to ensure that it properly is reaped.
          		if err := ignoreTerminateErrors(parent.terminate()); err != nil {
          			logrus.Warn(err)
          		}
          		return newSystemErrorWithCause(err, "starting container process")
          	}
          }
          
          func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) {
              // 创建用于父子进程通信的 pipe
          	parentPipe, childPipe, err := utils.NewSockPair("init")
          	if err != nil {
          		return nil, newSystemErrorWithCause(err, "creating new init pipe")
              }
              // 创建父进程的 cmd
          	cmd, err := c.commandTemplate(p, childPipe)
          	if err != nil {
          		return nil, newSystemErrorWithCause(err, "creating new command template")
          	}
          	if !p.Init {
                  // 由于 p.Init 为 true,所以不会执行到这里
          		return c.newSetnsProcess(p, cmd, parentPipe, childPipe)
          	}
          
              // 返回标准 init 进程
          	return c.newInitProcess(p, cmd, parentPipe, childPipe)
          }
          
          func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.Cmd, error) {
              // 这里可以看到 cmd 就是 runc init
          	cmd := exec.Command(c.initPath, c.initArgs[1:]...