容器能不能将 volume 挂载直接挂到根目录？（上）—— 从 runc 说起

这件事起源于有小伙伴在某群里问，在 K8s 中，能不能把 volume 挂载直接挂到根目录？我的第一反应是不能。容器会使用 union filesystem 将容器的内容挂到根目录下，这点在正常情况下是无法更改的。但是就止于此吗？发现给不出合理解释的时候，突然感觉自己对于容器的认知只停留在了很表面的阶段。

一、从 runc 源码开始

于是我翻到了 runc 的代码，一起看看他是怎么做的，看看有没有什么切入点。我们首先关注容器的创建这一部分：libcontainer/init_linux.go:78

// newContainerInit reads the JSON-encoded init configuration from pipe,
// applies its environment to the current process, and returns the initer
// that matches the requested init type t.
//
// For initSetns, mountFds must be nil. Any init type other than initSetns
// or initStandard yields an error.
//
// NOTE(review): the returned struct literals are empty in this excerpt;
// presumably their fields were elided -- confirm against the full source.
func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, logFd int, mountFds []int) (initer, error) {
	var cfg *initConfig
	decoder := json.NewDecoder(pipe)
	if decodeErr := decoder.Decode(&cfg); decodeErr != nil {
		return nil, decodeErr
	}
	if envErr := populateProcessEnvironment(cfg.Env); envErr != nil {
		return nil, envErr
	}

	if t == initSetns {
		// mountFds must be nil in this case. We don't mount while doing runc exec.
		if mountFds != nil {
			return nil, errors.New("mountFds must be nil; can't mount from exec")
		}
		return &linuxSetnsInit{}, nil
	}
	if t == initStandard {
		return &linuxStandardInit{}, nil
	}
	return nil, fmt.Errorf("unknown init type %q", t)
}

这里做的事情比较简单：先从 Pipe 拿到初始化配置，再解析配置中注入的 env，将其设置到本进程中。容器初始化的方式有两种：其一是 initSetns，即进入一个已有容器的 namespace 执行进程（对应 runc exec，所以这里不允许再做 mount）；其二是 initStandard，即创建并启动一个标准容器。

initStandard 中与 rootfs 最密切相关的就是 err := prepareRootfs(l.pipe, l.config, l.mountFds)，在 prepareRootfs 之前，主要进行了网络的初始化，比如 lo 网卡和 route 的初始化。不过我们主要还是关注 rootfs 部分，从注释我们可以看到这里主要做了这几件事情：设备、挂载点、fs的初始化，最后提醒你调用 finalizeRootfs 来完成初始化，我们先以 prepareRootfs 为核心，逐行解析这里面发生了什么：

// prepareRootfs sets up the devices, mount points, and filesystems for use
// inside a new mount namespace. It doesn't set anything as ro. You must call
// finalizeRootfs after this function to finish setting up the rootfs.
//
// pipe is used to ask the parent process to run hooks (see syncParentHooks
// below). iConfig carries the container configuration, and mountFds, when
// non-nil, must hold exactly one entry per configured mount; an entry of -1
// means "no fd for this mount".
//
// The order of the steps matters: mounts and device setup happen while the
// old root is still visible, the parent hooks run next, and only then does
// the process change its root.
func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig, mountFds []int) (err error) {
	config := iConfig.Config
	if err := prepareRoot(config); err != nil {
		return fmt.Errorf("error preparing rootfs: %w", err)
	}

	// A non-nil mountFds must line up one-to-one with config.Mounts.
	if mountFds != nil && len(mountFds) != len(config.Mounts) {
		return fmt.Errorf("malformed mountFds slice. Expected size: %v, got: %v. Slice: %v", len(config.Mounts), len(mountFds), mountFds)
	}

	// Settings shared by every mountToRootfs call; only the fd field is
	// updated per mount inside the loop below.
	mountConfig := &mountConfig{
		root:            config.Rootfs,
		label:           config.MountLabel,
		cgroup2Path:     iConfig.Cgroup2Path,
		rootlessCgroups: iConfig.RootlessCgroups,
		cgroupns:        config.Namespaces.Contains(configs.NEWCGROUP),
	}
	// Decided once up front; it gates both the device setup below and the
	// /dev/null reopen after the root has been switched.
	setupDev := needsSetupDev(config)
	for i, m := range config.Mounts {
		// Just before the loop we checked that if not empty, len(mountFds) == len(config.Mounts).
		// Therefore, we can access mountFds[i] without any concerns.
		if mountFds != nil && mountFds[i] != -1 {
			mountConfig.fd = &mountFds[i]
		} else {
			mountConfig.fd = nil
		}

		if err := mountToRootfs(m, mountConfig); err != nil {
			return fmt.Errorf("error mounting %q to rootfs at %q: %w", m.Source, m.Destination, err)
		}
	}

	// Device nodes, ptmx and the /dev symlinks are created only when
	// needsSetupDev reported that they are required.
	if setupDev {
		if err := createDevices(config); err != nil {
			return fmt.Errorf("error creating device nodes: %w", err)
		}
		if err := setupPtmx(config); err != nil {
			return fmt.Errorf("error setting up ptmx: %w", err)
		}
		if err := setupDevSymlinks(config.Rootfs); err != nil {
			return fmt.Errorf("error setting up /dev symlinks: %w", err)
		}
	}

	// Signal the parent to run the pre-start hooks.
	// The hooks are run after the mounts are setup, but before we switch to the new
	// root, so that the old root is still available in the hooks for any mount
	// manipulations.
	// Note that iConfig.Cwd is not guaranteed to exist here.
	if err := syncParentHooks(pipe); err != nil {
		return err
	}

	// The reason these operations are done here rather than in finalizeRootfs
	// is because the console-handling code gets quite sticky if we have to set
	// up the console before doing the pivot_root(2). This is because the
	// Console API has to also work with the ExecIn case, which means that the
	// API must be able to deal with being inside as well as outside the
	// container. It's just cleaner to do this here (at the expense of the
	// operation not being perfectly split).

	if err := unix.Chdir(config.Rootfs); err != nil {
		return &os.PathError{Op: "chdir", Path: config.Rootfs, Err: err}
	}

	// Run the CreateContainer hooks from this process, reporting our own pid
	// and the "creating" status in the state handed to the hooks.
	s := iConfig.SpecState
	s.Pid = unix.Getpid()
	s.Status = specs.StateCreating
	if err := iConfig.Config.Hooks[configs.CreateContainer].RunHooks(s); err != nil {
		return err
	}

	// Choose how to confine the process to the rootfs: msMoveRoot when
	// NoPivotRoot is set, pivotRoot when a new mount namespace (NEWNS) is
	// configured, and plain chroot otherwise. The result is stored in the
	// named return value err and checked once below.
	if config.NoPivotRoot {
		err = msMoveRoot(config.Rootfs)
	} else if config.Namespaces.Contains(configs.NEWNS) {
		err = pivotRoot(config.Rootfs)
	} else {
		err = chroot()
	}
	if err != nil {
		return fmt.Errorf("error jailing process inside rootfs: %w", err)
	}

	// Done after the root switch so that /dev/null refers to the container's
	// device (per the error message below).
	if setupDev {
		if err := reOpenDevNull(); err != nil {
			return fmt.Errorf("error reopening /dev/null inside container: %w", err)
		}
	}

	if cwd := iConfig.Cwd; cwd != "" {
		// Note that spec.Process.Cwd can contain unclean value like  "../../../../foo/bar...".
		// However, we are safe to call MkDirAll directly because we are in the jail here.
		if err := os.MkdirAll(cwd, 0o755); err != nil {
			return err
		}
	}

	return nil
}

1、prepareRoot

1.1 RootPropagation

// prepareRoot applies the root propagation setting to "/", makes the parent
// mount of config.Rootfs private, and finally bind-mounts config.Rootfs onto
// itself (recursively).
func prepareRoot(config *configs.Config) error {
	// Recursive slave propagation is the default; a non-zero
	// config.RootPropagation replaces it entirely.
	flag := unix.MS_SLAVE | unix.MS_REC
	if override := config.RootPropagation; override != 0 {
		flag = override
	}
	if propErr := mount("", "/", "", "", uintptr(flag), ""); propErr != nil {
		return propErr
	}

	// Make parent mount private to make sure following bind mount does
	// not propagate in other namespaces. Also it will help with kernel
	// check pass in pivot_root. (IS_SHARED(new_mnt->mnt_parent))
	if privErr := rootfsParentMountPrivate(config.Rootfs); privErr != nil {
		return privErr
	}

	return mount(config.Rootfs, config.Rootfs, "", "bind", unix.MS_BIND|unix.MS_REC, "")
}

在 prepareRoot 的最一开始，先进行了一次 mount，这次 mount 实际上是一个 propagation 的递归修改（unix.MS_REC）。默认情况下 flag 是 unix.MS_SLAVE。从 linux 小手册上可以得知，这个 flag 表示 mount 点从属挂载下的 mount 事件单向传播，此从节点下的挂载将不会影响到主节点。由于它这里 mount 的是 "/" 目录，而且使用了递归参数，即表示在此 ns 中的任何 mount 操作，都不对外界产生影响，不过反过来（准确的说是 peer group 之间）是产生影响的。

我们这里模拟一下，进行一个 tmpfs 的 mount，并设置传播等级为 shared：

mount -t tmpfs myt /root/dir1 --make-shared

findmnt -o TARGET,PROPAGATION 查看一下传播等级：

|-/var/lib/kubelet/pods/6c4a58a7-557f-4cc8-b95f-4170c6ac2ab8/volume-subpaths/dashboard-manager-secret/customer-dashboard-manager/2 private
|-/root/dir1 shared

我模拟 runc clone 一个 ns，然后同样查看传播等级，发现结果与上面一样。执行 mount --make-rslave /，再次查看传播等级，发现已经变成了 slave，而原先的 private 则保持不变：

|-/var/lib/kubelet/pods/6c4a58a7-557f-4cc8-b95f-4170c6ac2ab8/volume-subpaths/dashboard-manager-secret/customer-dashboard-manager/2 private
|-/root/dir1 private,slave

行为也和 man page 的描述一致，不是 shared 的并不会因为此命令而改变：

MS_SLAVE
              If this is a shared mount that is a member of a peer group
              that contains other members, convert it to a slave mount.
              If this is a shared mount that is a member of a peer group
              that contains no other members, convert it to a private
              mount.  Otherwise, the propagation type of the mount is
              left unchanged.

当然我们可以看到这里留了个口子，可以依据 config.RootPropagation 来改变这个默认行为，docker 的默认是 rprivate，即双向的 mount 都互不产生影响。K8s 的默认也是 private，在 K8s 1.2.1以后，支持对 Volume 进行传播等级配置，比如 HostToContainer，其实就是 MS_SLAVE。还有一种 Bidirectional，则是 MS_SHARED，表示此 ns 下的 mount 与外界共享，这个口子灵活又危险，比如可以在容器里进行 device 的 mount/unmount。

1.2 rootfsParentMountPrivate

这块的注释非常全，其实就是检查一下准备作为 root 的这个目录是不是 shared，如果是 shared，则改为 private。也就是无论如何，容器都要求 rootfs 为 private，即使我们将 RootPropagation 设置为 shared 或者其他。

这块意图也合理，如果 rootfs 随意被 propagation 影响，很容易导致容器崩溃。（不过我也不太确定我这个猜想是否正确。）

另外，这里注释提到，把他改成 private 也是避免后续做 bind 操作的时候，将 mount 传播到其他 namespace。以及，pivot_root 也不允许此 mount 为 shared。

1.3 bind

Bind 和硬链看起来有点点像，不过底层实现完全不同。man page 提到 bind 是一种对 fs attach 的操作，而软硬链是借助 inode 来完成的。

mount(config.Rootfs, config.Rootfs, "", "bind", unix.MS_BIND|unix.MS_REC, "")

REC 参数的意图和上面提到的 propagation 时的一致，就是递归。man page 中提到，如果没有 REC 参数，则 bind 只 mount 当前这个目录，而目录底下的 submounts 不会被复制。我们发现它把 rootfs 目录 bind 到 rootfs 目录了，这是为了创建一个 mountpoint。这个 mountpoint 是容器根目录的 mount，比如：

➜  ~ mount
/dev/disk3s3s1 on / (apfs, sealed, local, read-only, journaled)

2、mountToRootfs

在 bind 完 rootfs 这个 mountpoint 后，会根据 config.Mounts 中的配置，去逐个创建对应的 mount，这里就是处理我们挂载的地方：

// mountToRootfs performs the mount described by m inside the container
// rootfs taken from c.root.
//
// The destination is resolved with securejoin.SecureJoin against the rootfs,
// then handling is dispatched on m.Device. After the device-specific step,
// setRecAttr is applied to the mount.
//
// NOTE(review): the case bodies are elided ("...") in this excerpt, so the
// per-device behaviour is not visible here.
func mountToRootfs(m *configs.Mount, c *mountConfig) error {
	rootfs := c.root
	mountLabel := c.label
	mountFd := c.fd
	// Join m.Destination onto rootfs; any error from SecureJoin aborts the mount.
	dest, err := securejoin.SecureJoin(rootfs, m.Destination)
	if err != nil {
		return err
	}

	// One branch per mount device type; anything unlisted falls to default.
	switch m.Device {
		case "proc", "sysfs": ...
		case "mqueue": ...
		case "tmpfs": ...
		case "bind": ...
		case "cgroup": ...
		default: ...
	}
	if err := setRecAttr(m, rootfs); err != nil {
		return err
	}
	return nil
}

整体的流程不难看懂，不同的类型有不同的 mount 流程，而最后的 setRecAttr 感兴趣的可以看下 mount_setattr(2)。

就以 proc/sysfs 为例，就是检查一下 dst，确保是一个目录，并且不能是 symlink。注释这里提到了有意思的 symlink-exchange attacks，感兴趣的可以看看 mounts outside，提到了 symlink 导致的 mount 逃逸，讲的十分详细（其实我也就大略看了一下）。

	case "proc", "sysfs":
		// If the destination already exists and is not a directory, we bail
		// out This is to avoid mounting through a symlink or similar -- which
		// has been a "fun" attack scenario in the past.
		// TODO: This won't be necessary once we switch to libpathrs and we can
		//       stop all of these symlink-exchange attacks.
		if fi, err := os.Lstat(dest); err != nil {
			if !os.IsNotExist(err) {
				return err
			}
		} else if fi.Mode()&os.ModeDir == 0 {
			return fmt.Errorf("filesystem %q must be mounted on ordinary directory", m.Device)
		}
		if err := os.MkdirAll(dest, 0o755); err != nil {
			return err
		}
		// Selinux kernels do not support labeling of /proc or /sys
		return mountPropagate(m, rootfs, "", nil)

底层调用的都是 mountPropagate ，这是 runc 对 mount 的一层安全封装，确保没有一些恶意挂载：

// Do the mount operation followed by additional mounts required to take care
// of propagation flags. This will always be scoped inside the container rootfs.
func mountPropagate(m *configs.Mount, rootfs string, mountLabel string, mountFd *int) error {}

其他类型的挂载实际上大同小异，它们基本都围绕 “安全” 为核心，对挂载做各种检查，并执行。

3、setupDev

	if setupDev {
		if err := createDevices(config); err != nil {
			return fmt.Errorf("error creating device nodes: %w", err)
		}
		if err := setupPtmx(config); err != nil {
			return fmt.Errorf("error setting up ptmx: %w", err)
		}
		if err := setupDevSymlinks(config.Rootfs); err != nil {
			return fmt.Errorf("error setting up /dev symlinks: %w", err)
		}
	}

这块内容略过，主要我也不是很了解比如 mknod 之类的指令。对 linux 有一定了解的小伙伴应该知道 dev 指的是设备，对应 /dev 目录。

我们知道 docker 可以用 --device 来绑定设备，createDevices 做的就是类似的事情：它会将 host 的设备通过 bind mount 或者 mknod 的方式创建到容器的 rootfs 目录中。

setupPtmx 是将 pts/ptmx 软链到了容器中，以便支持 pty。最后部分的 setupDevSymlinks 则是一些小优化，比如它会把标准输入输出的 fd 通过软链放到 /dev 底下。

4、容器初始化时简单的 hook

4.1 syncParentHooks

	// Signal the parent to run the pre-start hooks.
	// The hooks are run after the mounts are setup, but before we switch to the new
	// root, so that the old root is still available in the hooks for any mount
	// manipulations.
	// Note that iConfig.Cwd is not guaranteed to exist here.
	if err := syncParentHooks(pipe); err != nil {
		return err
	}

这块内容与主题无关，不过有点小意思。我们知道 runc 由父进程来创建 namespace，再由子进程来初始化容器，这里就用了 Pipe 来实现 PreStart，这个点正好是还没 chroot/pivot_root 的时候，理论上是可以做一些危险操作的，不过要注意，这个调用是发生在父进程：

// syncParentHooks asks the parent, over pipe, to execute the pre-start hooks
// and then blocks until the parent signals that this process may resume.
// Any error from either step of the handshake is returned unchanged.
func syncParentHooks(pipe io.ReadWriter) error {
	// Step 1: notify the parent that hooks should run now.
	notifyErr := writeSync(pipe, procHooks)
	if notifyErr != nil {
		return notifyErr
	}

	// Step 2: wait for the parent's go-ahead.
	return readSync(pipe, procResume)
}

4.2 createContainerHooks

这个 Hooks 则是发生在当前进程（容器主进程），代码很简单，不多说：

	// The reason these operations are done here rather than in finalizeRootfs
	// is because the console-handling code gets quite sticky if we have to set
	// up the console before doing the pivot_root(2). This is because the
	// Console API has to also work with the ExecIn case, which means that the
	// API must be able to deal with being inside as well as outside the
	// container. It's just cleaner to do this here (at the expense of the
	// operation not being perfectly split).

	if err := unix.Chdir(config.Rootfs); err != nil {
		return &os.PathError{Op: "chdir", Path: config.Rootfs, Err: err}
	}

	s := iConfig.SpecState
	s.Pid = unix.Getpid()
	s.Status = specs.StateCreating
	if err := iConfig.Config.Hooks[configs.CreateContainer].RunHooks(s); err != nil {
		return err
	}

5、msMoveRoot/chroot/pivotRoot

我们知道，进入容器后，只能看到容器内的目录，这实际上就是这上面三个命令的功劳。可能大家最熟悉的就是 chroot，这个 jail 技术已经存在很多年了。

不过在 runc 中，chroot 并不是最优选择，chroot 设计之初就不是为了创建一个安全且隔离的环境，它存在不少限制。其实从 man page 的定义中就可以看出 pivotRoot 和 chRoot 的底层原理是不同的：

chroot - run command or interactive shell with special root directory
pivot_root - change the root mount

chroot 是改变了 cmd/shell 的 root dir，而 pivot_root 是直接改了 root mount，chroot 有一个著名的越狱方案就是在 chroot 中调用 chroot，这里直接贴一下维基百科的说法：

chroot 机制的设计中，并不包括抵抗特权用户（root）的蓄意篡改。在大多数的系统中，chroot环境没有设计出适当的堆栈，所以一个在chroot下执行的程序，可能会透过第二次chroot来获得足够权限，逃出chroot的限制。为了减轻这种安全漏洞所带来的风险，在使用chroot后，在chroot下执行的程序，应该尽快放弃root权限，或是改用其他机制来替代，例如FreeBSD jail。在某些操作系统中，例如FreeBSD，已经采取预防措施，来防止第二次chroot的攻击[1]。

在支持设备节点的文件系统中，一个在chroot中的root用户仍然可以创建设备节点和挂载在chroot根目录的文件系统；尽管，chroot机制不是被打算用来阻止低特权用户级访问系统设备。
在启动时，程序都期望能在某些预设位置找到scratch space，配置文件，设备节点和共享库。对于一个成功启动的被chroot的程序，在chroot目录必须最低限度配备的这些文件设置。这使得chroot难以作为一般的沙箱来使用。
只有root用户可以执行chroot。这是为了防止用户把一个setuid的程序放入一个特制的chroot监牢（例如一个有着假的/etc/passwd和/etc/shadow文件的chroot监牢）由于引起提权攻击。
在chroot的机制本身也不是为限制资源的使用而设计，如I/O，带宽，磁盘空间或CPU时间。大多数Unix系统都没有以完全文件系统为导向，以即给可能通过网络和过程控制，通过系统调用接口来提供一个破坏chroot的程序。

msMoveRoot 本质上也是调用了 chroot，是一个 chroot 的安全加强版：

// Before we move the root and chroot we have to mask all "full" sysfs and
	// procfs mounts which exist on the host. This is because while the kernel
	// has protections against mounting procfs if it has masks, when using
	// chroot(2) the *host* procfs mount is still reachable in the mount
	// namespace and the kernel permits procfs mounts inside --no-pivot
	// containers.
	//
	// Users shouldn't be using --no-pivot except in exceptional circumstances,
	// but to avoid such a trivial security flaw we apply a best-effort
	// protection here. The kernel only allows a mount of a pseudo-filesystem
	// like procfs or sysfs if there is a *full* mount (the root of the
	// filesystem is mounted) without any other locked mount points covering a
	// subtree of the mount.
	//
	// So we try to unmount (or mount tmpfs on top of) any mountpoint which is
	// a full mount of either sysfs or procfs (since those are the most
	// concerning filesystems to us).
	mountinfos, err := mountinfo.GetMounts(func(info *mountinfo.Info) (skip, stop bool) {
		// Collect every sysfs and procfs filesystem, except for those which
		// are non-full mounts or are inside the rootfs of the container.
		if info.Root != "/" ||
			(info.FSType != "proc" && info.FSType != "sysfs") ||
			strings.HasPrefix(info.Mountpoint, rootfs) {
			skip = true
		}
		return
	})
	if err != nil {
		return err
	}
	for _, info := range mountinfos {
		p := info.Mountpoint
		// Be sure umount events are not propagated to the host.
		if err := mount("", p, "", "", unix.MS_SLAVE|unix.MS_REC, ""); err != nil {
			if errors.Is(err, unix.ENOENT) {
				// If the mountpoint doesn't exist that means that we've
				// already blasted away some parent directory of the mountpoint
				// and so we don't care about this error.
				continue
			}
			return err
		}
		if err := unmount(p, unix.MNT_DETACH); err != nil {
			if !errors.Is(err, unix.EINVAL) && !errors.Is(err, unix.EPERM) {
				return err
			} else {
				// If we have not privileges for umounting (e.g. rootless), then
				// cover the path.
				if err := mount("tmpfs", p, "", "tmpfs", 0, ""); err != nil {
					return err
				}
			}
		}
	}

  // Move the rootfs on top of "/" in our mount namespace.
	if err := mount(rootfs, "/", "", "", unix.MS_MOVE, ""); err != nil {
		return err
	}
	return chroot()
}

代码很长，实际上做的事情不复杂：这里把当前 ns 中的 proc/sysfs，且不属于 rootfs 底下的 mount 过滤出来 umount 掉了。

最后做了一手 MS_MOVE，把 rootfs 这个 mount 挪到了 /，这里猜测是防止 chdir(../) chroot(/) 这种组合拳，因为把 mount 挪过去，原来的 rootfs mount 就不存在了，最后再执行一下 chroot。不过即使如此，runc 依旧不推荐使用 chroot。

6、finalizeRootfs

prepareRootfs 简单的过了一下，其实也就是它注释提到的那几件事情，进行设备、挂载点、fs的初始化，而 finalizeRootfs 是 prepareRootfs 的收尾。

// finalizeRootfs completes the work started by prepareRootfs, which must have
// been called first. It performs three steps in order:
//
//  1. remounts read-only every tmpfs mount and the /dev mount that was
//     configured with MS_RDONLY,
//  2. makes the rootfs ( / ) read-only when config.Readonlyfs is set,
//  3. applies the configured umask, falling back to 0o022.
func finalizeRootfs(config *configs.Config) (err error) {
	// All tmpfs mounts and /dev were previously mounted as rw
	// by mountPropagate. Remount them read-only as requested.
	for _, mnt := range config.Mounts {
		wantsReadOnly := mnt.Flags&unix.MS_RDONLY == unix.MS_RDONLY
		if !wantsReadOnly {
			continue
		}
		if mnt.Device != "tmpfs" && utils.CleanPath(mnt.Destination) != "/dev" {
			continue
		}
		if err := remountReadonly(mnt); err != nil {
			return err
		}
	}

	// set rootfs ( / ) as readonly
	if config.Readonlyfs {
		if err := setReadonly(); err != nil {
			return fmt.Errorf("error setting rootfs as readonly: %w", err)
		}
	}

	// Use the configured umask when present, otherwise the 0o022 default.
	mask := 0o022
	if config.Umask != nil {
		mask = int(*config.Umask)
	}
	unix.Umask(mask)
	return nil
}

finalizeRootfs 第一段代码实际上是和之前的 mountPropagate（就那个 runc 对 mount 操作的安全封装）交相呼应：

// Do the mount operation followed by additional mounts required to take care
// of propagation flags. This will always be scoped inside the container rootfs.
func mountPropagate(m *configs.Mount, rootfs string, mountLabel string, mountFd *int) error {
	var (
		data  = label.FormatMountLabel(m.Data, mountLabel)
		flags = m.Flags
	)
	// Delay mounting the filesystem read-only if we need to do further
	// operations on it. We need to set up files in "/dev", and other tmpfs
	// mounts may need to be chmod-ed after mounting. These mounts will be
	// remounted ro later in finalizeRootfs(), if necessary.
	if m.Device == "tmpfs" || utils.CleanPath(m.Destination) == "/dev" {
		flags &= ^unix.MS_RDONLY
	}
  
  ...
}

由于 tmpfs 和 /dev 有可能会在 mount 之后做一些初始化，或者 chmod，所以当初挂的时候即使是 MS_RDONLY，也 ^ 掉了，在最后 finalize 的时候，如果配置了 MS_RDONLY，再 remount 一下，让它真正 mount 成 MS_RDONLY。

第二段代码也是一样的道理，如果配置中设置了 Readonlyfs，同样也是在最后关头再设置成只读。

第三段代码做了一个 umask，我们只看这个默认的 umask 022：正常的文件/目录在未经设置时，默认权限是 666/777，umask 022 实际上就是把同组用户（group）和其他用户（other）的写权限拿掉，变成 644/755。

二、 runc 的使用

分析到这里，我们发现，runc 并没有对 rootfs 这个 mountpoint 是什么挂载去做定义，只是做了下 bind，而且允许我们自由定义 rootfs 如何挂载。至少看到这里，我们认为从外部提供一个 mount 配置挂载到 root 上是可行的。不如体验一下在不同的配置下， runc 是如何为我们生成容器的。

1、使用 runc 创建并进入容器

我们先跑下 runc spec && cat config.json，通过这个命令，能提供一个缺省的配置，实际上这坨配置就是 OCI-runtime-spec，描述如下：

{
	"ociVersion": "1.0.2-dev",
	"process": {
		"terminal": true,
		"user": {
			"uid": 0,
			"gid": 0
		},
		"args": [
			"sh"
		],
		"env": [
			"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
			"TERM=xterm"
		],
		"cwd": "/",
		"capabilities": {
			"bounding": [
				"CAP_AUDIT_WRITE",
				"CAP_KILL",
				"CAP_NET_BIND_SERVICE"
			],
			"effective": [
				"CAP_AUDIT_WRITE",
				"CAP_KILL",
				"CAP_NET_BIND_SERVICE"
			],
			"permitted": [
				"CAP_AUDIT_WRITE",
				"CAP_KILL",
				"CAP_NET_BIND_SERVICE"
			],
			"ambient": [
				"CAP_AUDIT_WRITE",
				"CAP_KILL",
				"CAP_NET_BIND_SERVICE"
			]
		},
		"rlimits": [
			{
				"type": "RLIMIT_NOFILE",
				"hard": 1024,
				"soft": 1024
			}
		],
		"noNewPrivileges": true
	},
	"root": {
		"path": "rootfs",
		"readonly": true
	},
	"hostname": "runc",
	"mounts": [
		{
			"destination": "/proc",
			"type": "proc",
			"source": "proc"
		},
		{
			"destination": "/dev",
			"type": "tmpfs",
			"source": "tmpfs",
			"options": [
				"nosuid",
				"strictatime",
				"mode=755",
				"size=65536k"
			]
		},
		{
			"destination": "/dev/pts",
			"type": "devpts",
			"source": "devpts",
			"options": [
				"nosuid",
				"noexec",
				"newinstance",
				"ptmxmode=0666",
				"mode=0620",
				"gid=5"
			]
		},
		{
			"destination": "/dev/shm",
			"type": "tmpfs",
			"source": "shm",
			"options": [
				"nosuid",
				"noexec",
				"nodev",
				"mode=1777",
				"size=65536k"
			]
		},
		{
			"destination": "/dev/mqueue",
			"type": "mqueue",
			"source": "mqueue",
			"options": [
				"nosuid",
				"noexec",
				"nodev"
			]
		},
		{
			"destination": "/sys",
			"type": "sysfs",
			"source": "sysfs",
			"options": [
				"nosuid",
				"noexec",
				"nodev",
				"ro"
			]
		},
		{
			"destination": "/sys/fs/cgroup",
			"type": "cgroup",
			"source": "cgroup",
			"options": [
				"nosuid",
				"noexec",
				"nodev",
				"relatime",
				"ro"
			]
		}
	],
	"linux": {
		"resources": {
			"devices": [
				{
					"allow": false,
					"access": "rwm"
				}
			]
		},
		"namespaces": [
			{
				"type": "pid"
			},
			{
				"type": "network"
			},
			{
				"type": "ipc"
			},
			{
				"type": "uts"
			},
			{
				"type": "mount"
			}
		],
		"maskedPaths": [
			"/proc/acpi",
			"/proc/asound",
			"/proc/kcore",
			"/proc/keys",
			"/proc/latency_stats",
			"/proc/timer_list",
			"/proc/timer_stats",
			"/proc/sched_debug",
			"/sys/firmware",
			"/proc/scsi"
		],
		"readonlyPaths": [
			"/proc/bus",
			"/proc/fs",
			"/proc/irq",
			"/proc/sys",
			"/proc/sysrq-trigger"
		]
	}
}

不过 rootfs 这个缺省目录下并没有一套根文件系统（现在都不存在这个目录），直接运行肯定是会报错的，如下：

> runc run config.json
ERRO[0000] runc run failed: invalid rootfs: stat /root/rootfs: no such file or directory

这里借助 docker export 了一个 busybox 的根文件系统，并放在 /root/rootfs 下，并将刚才的配置 root.path 修改为 /root/rootfs：

> ls
VERSION  bin  custom  dev  etc  home  json  lib  lib64  proc  root  tmp  usr  var
> pwd
/root/rootfs

执行 runc 命令启动此容器：

> runc run config.json

/ # ls
VERSION  bin      custom   dev      etc      home     json     lib      lib64    proc     root     sys      tmp      usr      var

/ # echo $$
1

/ # ps -ef
PID   USER     TIME  COMMAND
    1 root      0:00 sh
    8 root      0:00 ps -ef
    
/ # mount
/dev/vda1 on / type ext4 (ro,noatime)
...

/ # echo something > test.log
sh: can't create test.log: Read-only file system

确实如配置那样，rootfs 被设置为只读，对应我们在第一小节里讲到的 finalizeRootfs 中的第二段操作。

	"root": {
		"path": "rootfs",
		"readonly": true
	},

2、尝试在 chroot 下进行越狱

我们试试在不安全的 chroot 底下进行越狱，先改一下刚才生成的配置，把 capabilities 的权限打开，不然有些命令比如 chroot 跑不了。另外就是 namespaces 需要去掉 mount（就是 NEWNS），如果打开了 NEWNS，根据我们前面的源码分析，它会自动去进行 pivot_root。最后再去掉 maskedPaths 和 readonlyPaths，否则无法通过安全检查：

{
	"ociVersion": "1.0.2-dev",
	"process": {
		"terminal": true,
		"user": {
			"uid": 0,
			"gid": 0
		},
		"args": [
			"bash"
		],
		"env": [
			"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
			"TERM=xterm"
		],
		"cwd": "/",
		"capabilities": {
			"bounding": [
				"CAP_AUDIT_WRITE",
				"CAP_KILL",
				"CAP_NET_BIND_SERVICE",
                                "CAP_SYS_CHROOT",
                                "CAP_MKNOD",
                                "CAP_SYS_ADMIN"
			],
			"effective": [
				"CAP_AUDIT_WRITE",
				"CAP_KILL",
				"CAP_NET_BIND_SERVICE",
                                "CAP_SYS_CHROOT",
                                "CAP_MKNOD",
                                "CAP_SYS_ADMIN"
			],
			"permitted": [
				"CAP_AUDIT_WRITE",
				"CAP_KILL",
				"CAP_NET_BIND_SERVICE",
                                "CAP_SYS_CHROOT",
                                "CAP_MKNOD",
                                "CAP_SYS_ADMIN"
			],
			"ambient": [
				"CAP_AUDIT_WRITE",
				"CAP_KILL",
				"CAP_NET_BIND_SERVICE",
                                "CAP_SYS_CHROOT",
                                "CAP_MKNOD",
                                "CAP_SYS_ADMIN"
			]
		},
		"rlimits": [
			{
				"type": "RLIMIT_NOFILE",
				"hard": 1024,
				"soft": 1024
			}
		],
		"noNewPrivileges": true
	},
	"root": {
		"path": "/root/ubuntu"
	},
	"hostname": "runc",
	"mounts": [
		{
			"destination": "/proc",
			"type": "proc",
			"source": "proc"
		},
		{
			"destination": "/dev",
			"type": "tmpfs",
			"source": "tmpfs",
			"options": [
				"nosuid",
				"strictatime",
				"mode=755",
				"size=65536k"
			]
		},
		{
			"destination": "/dev/pts",
			"type": "devpts",
			"source": "devpts",
			"options": [
				"nosuid",
				"noexec",
				"newinstance",
				"ptmxmode=0666",
				"mode=0620",
				"gid=5"
			]
		},
		{
			"destination": "/dev/shm",
			"type": "tmpfs",
			"source": "shm",
			"options": [
				"nosuid",
				"noexec",
				"nodev",
				"mode=1777",
				"size=65536k"
			]
		},
		{
			"destination": "/dev/mqueue",
			"type": "mqueue",
			"source": "mqueue",
			"options": [
				"nosuid",
				"noexec",
				"nodev"
			]
		},
		{
			"destination": "/sys",
			"type": "sysfs",
			"source": "sysfs",
			"options": [
				"nosuid",
				"noexec",
				"nodev",
				"ro"
			]
		},
		{
			"destination": "/sys/fs/cgroup",
			"type": "cgroup",
			"source": "cgroup",
			"options": [
				"nosuid",
				"noexec",
				"nodev",
				"relatime",
				"ro"
			]
		}
	],
	"linux": {
		"resources": {
			"devices": [
				{
					"allow": false,
					"access": "rwm"
				}
			]
		},
		"namespaces": [
			{
				"type": "pid"
			},
			{
				"type": "network"
			},
			{
				"type": "ipc"
			},
			{
				"type": "uts"
			}
		]
	}
}

进入容器后，我们执行越狱教程中提供的代码，成功 break out：

// 进入容器
[root@master ~]# runc run config.json

root@runc:/# ls -la
total 104
drwxr-xr-x  21 root root  4096 Feb 16 13:56 .
drwxr-xr-x  21 root root  4096 Feb 16 13:56 ..
-rwxr-xr-x   1 root root     0 Feb 14 03:20 .dockerenv
lrwxrwxrwx   1 root root     7 Jan 26 02:03 bin -> usr/bin
drwxr-xr-x   2 root root  4096 Apr 18  2022 boot
-rwxr-xr-x   1 root root 29160 Feb 16 10:22 break
drwxr-xr-x   2 root root  4096 Feb 16 14:00 d1r1
drwxr-xr-x   2 root root  4096 Feb 16 14:00 d1r2
drwxr-xr-x   2 root root  4096 Feb 16 14:01 d1r3
drwxr-xr-x   5 root root   360 Feb 16 14:40 dev
drwxr-xr-x  32 root root  4096 Feb 14 03:20 etc
drwxr-xr-x   2 root root  4096 Apr 18  2022 home
lrwxrwxrwx   1 root root     7 Jan 26 02:03 lib -> usr/lib
lrwxrwxrwx   1 root root     9 Jan 26 02:03 lib32 -> usr/lib32
lrwxrwxrwx   1 root root     9 Jan 26 02:03 lib64 -> usr/lib64
lrwxrwxrwx   1 root root    10 Jan 26 02:03 libx32 -> usr/libx32
drwxr-xr-x   2 root root  4096 Jan 26 02:03 media
drwxr-xr-x   2 root root  4096 Jan 26 02:03 mnt
drwxr-xr-x   2 root root  4096 Jan 26 02:03 opt
dr-xr-xr-x 375 root root     0 Feb 16 14:40 proc
drwx------   2 root root  4096 Feb 16 10:23 root
drwxr-xr-x   6 root root  4096 Feb 14 03:20 run
lrwxrwxrwx   1 root root     8 Jan 26 02:03 sbin -> usr/sbin
drwxr-xr-x   2 root root  4096 Jan 26 02:03 srv
dr-xr-xr-x  12 root root     0 Feb 16 14:40 sys
drwxrwxrwt   2 root root  4096 Jan 26 02:06 tmp
drwxr-xr-x  14 root root  4096 Jan 26 02:03 usr
drwxr-xr-x  11 root root  4096 Jan 26 02:06 var
drwxr-xr-x   2 root root  4096 Feb 16 10:22 waterbuffalo

// 其实就是 chdir(..) + chroot(.)
root@runc:/# ./break

// 越狱成功
[root@runc /]# ls -la
total 18920
dr-xr-xr-x  23 root root     4096 Feb 16 22:39 .
dr-xr-xr-x  23 root root     4096 Feb 16 22:39 ..
drwxr-xr-x   3 root root     4096 Jan  8  2021 agent
drwxr-xr-x   3 root root     4096 Nov 24 16:03 api-helm
-rw-r--r--   1 root root        0 Oct 30  2020 .autorelabel
lrwxrwxrwx   1 root root        7 Dec 14  2020 bin -> usr/bin
dr-xr-xr-x   5 root root     4096 Nov 30  2021 boot
-rwxr-xr-x   1 root root 19261816 Dec  8 14:30 cloud-agent
-rw-------   1 root root    12288 Nov 25 11:27 .conf.txt.swp
drwxr-xr-x  13 root root     4096 Feb 16 22:39 data
drwxr-xr-x  17 root root    14140 Sep  9 16:20 dev
drwxr-xr-x 108 root root    12288 Feb 16 10:59 etc
drwxr-xr-x   2 root root     4096 Dec 14  2020 home
lrwxrwxrwx   1 root root        7 Dec 14  2020 lib -> usr/lib
lrwxrwxrwx   1 root root        9 Dec 14  2020 lib64 -> usr/lib64
drwx------   2 root root    16384 Aug 18  2020 lost+found
drwxr-xr-x   2 root root     4096 Dec 14  2020 media
drwxr-xr-x   2 root root     4096 Dec 14  2020 mnt
drwxr-xr-x   6 root root     4096 Sep  9 16:31 opt
dr-xr-xr-x 375 root root        0 Sep  9 16:19 proc
dr-xr-x---  31 root root     4096 Feb 16 22:34 root
drwxr-xr-x   2 root root     4096 Dec  5 11:40 rot
drwxr-xr-x  33 root root     1120 Feb 16 14:59 run
lrwxrwxrwx   1 root root        8 Dec 14  2020 sbin -> usr/sbin
drwxr-xr-x   2 root root     4096 Dec 14  2020 srv
dr-xr-xr-x  12 root root        0 Sep  9 16:19 sys
drwxr-xr-x   2 root root     4096 Dec  5 11:33 test
drwxrwxrwt   4 root root     4096 Feb 16 22:35 tmp
drwxr-xr-x  12 root root     4096 Nov 30  2021 usr
drwxr-xr-x  21 root root     4096 Sep  9 15:53 var
[root@runc /]#

至此也算是告一段落了，起码我们粗浅地了解了整个容器是如何初始化的，以及我们知道了，容器的 rootfs 是可以随意指定目录的。不过，开头提到的问题还是没能回答。不管是 Docker，还是 K8s，实际上都无法进行下列操作：

> docker run -it -d --name xxx -p 8091:8090 -v /xxx:/ ubuntu
docker: Error response from daemon: invalid volume specification: '/xxx:/': invalid mount config for type "bind": invalid specification: destination can't be '/'.
See 'docker run --help'.

下一期，我们将尝试从 OCI、CRI 的角度再度探讨这个问题。

文章如有错误，感谢指正。

Reference

小手册：https://man7.org/linux/man-pages/
runc：https://github.com/opencontainers
大佬博客文章 Linux: Mount Shared Subtrees：https://pages.dogdog.run/tech/mount_subtree.html

容器能不能将 volume 挂载直接挂到根目录？（上）—— 从 runc 说起

一、从 runc 源码开始

1、prepareRoot

1.1 RootPropagation

1.2 rootfsParentMountPrivate

1.3 bind

2、mountToRootfs

3、setupDev

4、容器初始化时简单的 hook

4.1 syncParentHooks

4.2 createContainerHooks

5、msMoveRoot/chroot/pivotRoot

6、finalizeRootfs

二、 runc 的使用

1、使用 runc 创建并进入容器

2、尝试在 chroot 下进行越狱

Reference

关于游戏付费的一点想法

我通过CKA和CKS啦！

《最新出炉》系列入门篇-Python+Playwright自动化测试-42-强大的可视化追踪利器Trace Viewer

大数据怎么学？对大数据开发领域及岗位的详细解读，完整理解大数据开发领域技术体系

安裝chromadb注意事項

前端面試題 - null是原始類型，但爲什麼typeof null的結果是object？

前端面試題 - 如何實現promise？

Java中的List

有遇到過嗎？同樣的規則 Excel 中比Python 結果大

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結