容器run

启动一个示例容器

使用runc的run命令可以创建并运行一个容器。参数很简单：需要指定一个容器的id，以及一个包含config.json文件的目录作为容器的bundle目录。config.json文件的格式就是在spec中所介绍的格式。

使用runc spec可以帮助我们快速生成这个文件，通过runc spec --help可以看到一个完整的示例

# Create the bundle directory and work inside it.
mkdir hello
cd hello
# Fetch the image and export a container created from it as a tar archive.
docker pull hello-world
docker export $(docker create hello-world) > hello-world.tar
# Unpack the archive into rootfs/.
mkdir rootfs
tar -C rootfs -xf hello-world.tar
# Generate config.json, then replace the "sh" entry with "/hello".
runc spec
sed -i 's;"sh";"/hello";' config.json
# Run the bundle in the current directory as container "container1".
runc run container1

这个示例创建了hello目录用作容器的bundle目录，基于hello-world这个镜像制作了rootfs，然后生成config.json文件，替换启动命令，最终完成了一个标准的容器bundle目录的结构。最后即可启动容器。

可以按照上述命令操作，最终顺利执行程序并在控制台打印信息

Hello from Docker!                                                        
This message shows that your installation appears to be working correctly.

run流程

经过上面示例的操作，来看run命令的实现流程就比较明了了。

// startContainer loads the spec, creates the container named by the first CLI
// argument, and hands it to a runner configured from the command-line flags.
//
// It returns -1 and the error when revising the pid file, loading the spec,
// reading the container id, or creating the container fails; otherwise it
// returns the result of r.run for the spec's process.
func startContainer(context *cli.Context, action CtAct, criuOpts *libcontainer.CriuOpts) (int, error) {
	if err := revisePidFile(context); err != nil {
		return -1, err
	}
	spec, err := setupSpec(context)
	if err != nil {
		return -1, err
	}
 
	// The container id is the first positional argument and must not be empty.
	id := context.Args().First()
	if id == "" {
		return -1, errEmptyID
	}
 
	// The integration with systemd notification and on-demand (socket)
	// activation is omitted from this excerpt. NOTE(review): listenFDs and
	// notifySocket used below are not defined here — presumably they are set
	// up by the omitted code; confirm against the full source.
 
	container, err := createContainer(context, id, spec)
	if err != nil {
		return -1, err
	}
 
	r := &runner{
		enableSubreaper: !context.Bool("no-subreaper"),
		shouldDestroy:   !context.Bool("keep"),
		container:       container,
		listenFDs:       listenFDs,
		notifySocket:    notifySocket,
		consoleSocket:   context.String("console-socket"),
		pidfdSocket:     context.String("pidfd-socket"),
		detach:          context.Bool("detach"),
		pidFile:         context.String("pid-file"),
		preserveFDs:     context.Int("preserve-fds"),
		action:          action,
		criuOpts:        criuOpts,
		init:            true,
	}
	return r.run(spec.Process)
}

省略中间和systemd集成的内容，run的过程其实非常简单

加载spec描述
创建Container对象
启动Container

我们知道runc虽然本身提供了命令行程序，但是实际上更多是作为一个package为其他更上层的应用提供支持的，比如containerd，docker。所以最终都会走到libcontainer这个包的调用上。

创建Container对象

libcontainer/factory_linux.go的Create方法。将spec描述转为运行时的Container描述。这里描述了进程运行所需的环境，但不包含要运行的进程本身，这样获得Container对象之后，可以任意指定在这个环境中要运行的进程。

// Container is the libcontainer runtime object for a single container.
// NOTE(review): only the fields commented below are exercised by the excerpts
// in this document; the meaning of the others is not shown here.
type Container struct {
	id                   string
	stateDir             string
	config               *configs.Config // consulted during start (cgroup resources, rootless flag, hooks)
	cgroupManager        cgroups.Manager
	intelRdtManager      *intelrdt.Manager
	initProcess          parentProcess
	initProcessStartTime uint64     // non-zero makes start reject a second init process
	m                    sync.Mutex // held by Run for the duration of the call
	criuVersion          int
	state                containerState
	created              time.Time
	fifo                 *os.File // closed after an init process has been started
}
 
type Config struct {
	// Holds all of the environment information needed to run a process
	// (fields omitted in this excerpt).
}

在Container中运行进程

基于spec中的process信息，创建运行的process描述。libcontainer/process.go下的Process结构体
按照指定的方式来启动容器
1. CT_ACT_CREATE：创建容器，但不启动用户进程，需要后续调用exec来实际运行。通常分为两步：runc create mycontainer; runc start mycontainer
2. CT_ACT_RUN：一步完成容器的创建和用户进程的启动：runc run mycontainer
3. CT_ACT_RESTORE：支持从检查点恢复容器：runc checkpoint mycontainer; runc restore mycontainer

// Run starts process in the container while holding the container mutex.
// If start fails, its error is returned as-is. If process is marked as the
// init process, Run then returns the result of exec; for any other process a
// successful start is the whole job and Run returns nil.
func (c *Container) Run(process *Process) error {
	c.m.Lock()
	defer c.m.Unlock()

	switch startErr := c.start(process); {
	case startErr != nil:
		return startErr
	case process.Init:
		// start + exec: only the init process needs the follow-up exec.
		return c.exec()
	default:
		return nil
	}
}

从代码中就可以看出来，run就是start + exec。这里有个关键点就是进程的Init属性，这个属性决定了启动的进程是否为容器内的init进程，也就是1号进程，这个进程和容器的生命周期相绑定，启动过程中会创建容器环境和资源。而非init的进程启动过程中就不需要创建容器资源，直接加入已有的命名空间当中就行，就像我们使用runc exec命令来在一个已有的容器当中执行命令。

Container start

做一些检查校验。比如cgroup设备的权限，rootless容器不允许设置额外组。

	if c.config.Cgroups.Resources.SkipDevices {
		return errors.New("can't start container with SkipDevices set")
	}
 
	if c.config.RootlessEUID && len(process.AdditionalGroups) > 0 {
		// We cannot set any additional groups in a rootless container
		// and thus we bail if the user asked us to do so.
		return errors.New("cannot set any additional groups in a rootless container")
	}

对于Init进程，首先确保该容器内只有一个Init进程。然后创建exec FIFO用于同步

	// An init process is rejected if the container already recorded one;
	// otherwise the exec FIFO is created before the process is started.
	if process.Init {
		if c.initProcessStartTime != 0 {
			return errors.New("container already has init process")
		}
		if err := c.createExecFifo(); err != nil {
			return err
		}
		defer func() {
			// Clean up the exec FIFO if startup failed.
			if retErr != nil {
				c.deleteExecFifo()
			}
		}()
	}

创建父进程，根据是否为Init进程来决定是创建initProcess还是setnsProcess。

	parent, err := c.newParentProcess(process)
	if err != nil {
		return fmt.Errorf("unable to create new parent process: %w", err)
	}
	// We do not need the cloned binaries once the process is spawned.
	defer process.closeClonedExes()

创建父进程的逻辑比较复杂，但是总结起来就是准备好基础资源，为执行runc init子进程做好准备，OCI规范要求了运行时需要提供init命令。这样设计使得权限、环境分离，提供了精确的生命周期控制。比如runc run运行在宿主机命名空间下，在宿主机上创建容器资源；runc init则是切换到容器命名空间下，设置容器运行环境，然后exec启动用户进程。再比如父子进程间通过exec fifo同步，实现了init子进程的阻塞，例如runc create创建容器后，init子进程会阻塞到runc start时再开始运行用户进程。并且将OCI规范和运行时分离开了。

将子进程的日志转发到父进程

logsDone := parent.forwardChildLogs()

做安全防护，启动子进程前将非标准IO的文件描述符标记为O_CLOEXEC，防止泄漏到runc init子进程

	if err := utils.CloseExecFrom(3); err != nil {
		return fmt.Errorf("unable to mark non-stdio fds as cloexec: %w", err)
	}

启动容器进程。initProcess: 创建新的命名空间和环境；setnsProcess: 加入现有命名空间。容器资源创建的逻辑都在这里了，详细可以看两者start的实现

	if err := parent.start(); err != nil {
		return fmt.Errorf("unable to start container process: %w", err)
	}

执行回调

if process.Init {
    c.fifo.Close()
    if c.config.HasHook(configs.Poststart) {
        s, err := c.currentOCIState()
        if err != nil {
            return err
        }
 
        if err := c.config.Hooks.Run(configs.Poststart, s); err != nil {
            if err := ignoreTerminateErrors(parent.terminate()); err != nil {
                logrus.Warn(fmt.Errorf("error running poststart hook: %w", err))
            }
            return err
        }
    }
}

init命令

在前面的start过程中，父子进程之间创建了许多通信管道，父进程在宿主机创建好了容器资源，然后将这些运行环境全部传递给init子进程。init命令是OCI规范要求的运行时要提供的命令。init子进程会通过这些管道和父进程进行同步，也会获取到容器的环境信息，据此来初始化容器环境。

这里我们直接看runc的init命令实现。init.go下的startInitialization函数实现。首先要明确的是这个函数通常不会返回，返回说明初始化失败了。

获取同步管道，用来和父进程进行通信。父进程通过这个管道来控制子进程的执行步骤

	envSyncPipe := os.Getenv("_LIBCONTAINER_SYNCPIPE")
	syncPipeFd, err := strconv.Atoi(envSyncPipe)
	if err != nil {
		return fmt.Errorf("unable to convert _LIBCONTAINER_SYNCPIPE: %w", err)
	}
	syncPipe := newSyncSocket(os.NewFile(uintptr(syncPipeFd), "sync"))
	defer syncPipe.Close()

错误处理。用于最后做init检查，失败则将错误发送给父进程

	defer func() {
		// If this defer is ever called, this means initialization has failed.
		// Send the error back to the parent process in the form of an initError
		// if the sync socket has not been closed.
		if syncPipe.isClosed() {
			return
		}
		ierr := initError{Message: retErr.Error()}
		if err := writeSyncArg(syncPipe, procError, ierr); err != nil {
			fmt.Fprintln(os.Stderr, err)
			return
		}
		// The error is sent, no need to also return it (or it will be reported twice).
		retErr = nil
	}()

获取初始化管道。容器信息是通过该管道发送给运行时的。参数是json格式的，包含所有容器运行时需要的参数

	// Get the INITPIPE.
	envInitPipe := os.Getenv("_LIBCONTAINER_INITPIPE")
	initPipeFd, err := strconv.Atoi(envInitPipe)
	if err != nil {
		return fmt.Errorf("unable to convert _LIBCONTAINER_INITPIPE: %w", err)
	}
	initPipe := os.NewFile(uintptr(initPipeFd), "init")
	defer initPipe.Close()

设置日志系统。将日志输出重定向到专门的管道，使用json格式方便父进程解析

// Set the log level if the parent passed one (optional).
if levelStr := os.Getenv("_LIBCONTAINER_LOGLEVEL"); levelStr != "" {
    logLevel, err := strconv.Atoi(levelStr)
    if err != nil {
        return fmt.Errorf("unable to convert _LIBCONTAINER_LOGLEVEL: %w", err)
    }
    logrus.SetLevel(logrus.Level(logLevel))
}
 
// Set up the log output pipe. The conversion error must be checked:
// strconv.Atoi returns 0 on failure, so a missing or malformed value would
// otherwise silently turn fd 0 into the log pipe.
logFd, err := strconv.Atoi(os.Getenv("_LIBCONTAINER_LOGPIPE"))
if err != nil {
    return fmt.Errorf("unable to convert _LIBCONTAINER_LOGPIPE: %w", err)
}
logPipe := os.NewFile(uintptr(logFd), "logpipe")
 
// Emit logs as JSON on the log pipe.
logrus.SetOutput(logPipe)
logrus.SetFormatter(new(logrus.JSONFormatter))
logrus.Debug("child process in init()")

判断init命令的类型处理FIFO。init类型分为initStandard和initSetns。只有initStandard需要execFIFO控制执行时机

// fifoFile stays nil unless the init type is initStandard.
var fifoFile *os.File
envInitType := os.Getenv("_LIBCONTAINER_INITTYPE")
it := initType(envInitType)
if it == initStandard {
    // Only the standard init process has a FIFO fd. Check the conversion
    // error so a missing or malformed value is reported instead of silently
    // becoming fd 0.
    fifoFd, err := strconv.Atoi(os.Getenv("_LIBCONTAINER_FIFOFD"))
    if err != nil {
        return fmt.Errorf("unable to convert _LIBCONTAINER_FIFOFD: %w", err)
    }
    fifoFile = os.NewFile(uintptr(fifoFd), "initfifo")
}

console和PidFd处理。用于设置容器的终端，设置init进程的pidfd(linux一种pid的高级管理方式)

// Console socket, used for TTY setup. It is optional: consoleSocket stays
// nil when the environment variable is empty.
var consoleSocket *os.File
if envConsole := os.Getenv("_LIBCONTAINER_CONSOLE"); envConsole != "" {
    console, err := strconv.Atoi(envConsole)
    if err != nil {
        // A non-numeric value must fail initialization rather than be
        // treated as fd 0.
        return fmt.Errorf("unable to convert _LIBCONTAINER_CONSOLE: %w", err)
    }
    consoleSocket = os.NewFile(uintptr(console), "console-socket")
    defer consoleSocket.Close()
}
 
// PidFD socket, used for the process file descriptor. Also optional.
var pidfdSocket *os.File
if envSockFd := os.Getenv("_LIBCONTAINER_PIDFD_SOCK"); envSockFd != "" {
    sockFd, err := strconv.Atoi(envSockFd)
    if err != nil {
        return fmt.Errorf("unable to convert _LIBCONTAINER_PIDFD_SOCK: %w", err)
    }
    pidfdSocket = os.NewFile(uintptr(sockFd), "pidfd-socket")
    defer pidfdSocket.Close()
}

从init管道读取配置并执行真正的容器初始化

// Read the init configuration, JSON-encoded, from the init pipe.
var config initConfig
if err := json.NewDecoder(initPipe).Decode(&config); err != nil {
    return err
}
 
// Perform the actual container initialization.
return containerInit(it, &config, syncPipe, consoleSocket, pidfdSocket, fifoFile, logPipe)

在containerInit当中负责基于创建好的资源设置好容器环境，最后调用exec加载执行用户进程。具体的实现不在这里关注。

至此完成run命令，启动容器的流程

Smarticen Notes

Explorer