criService 實現了接口 runtime.RuntimeServiceServer
1. RunPodSandbox 函數
路徑 pkg/server/sandbox_run.go,創建以及啓動 sandbox,確認成功是 sandbox 狀態爲 ready
// RunPodSandbox creates and starts a pod-level sandbox. Runtimes should ensure
// the sandbox is in ready state.
func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandboxRequest) (_ *runtime.RunPodSandboxResponse, retErr error) {
config := r.GetConfig()
log.G(ctx).Debugf("Sandbox config %+v", config)
1.1 生成 ID,生成 name,註冊 name <--> key 映射關係,防治並行創建
// Generate unique id and name for the sandbox and reserve the name.
id := util.GenerateID()
metadata := config.GetMetadata()
if metadata == nil {
return nil, errors.New("sandbox config must include metadata")
}
name := makeSandboxName(metadata)
log.G(ctx).Debugf("Generated id %q for sandbox %q", id, name)
// Reserve the sandbox name to avoid concurrent `RunPodSandbox` request starting the
// same sandbox.
if err := c.sandboxNameIndex.Reserve(name, id); err != nil {
return nil, errors.Wrapf(err, "failed to reserve sandbox name %q", name)
}
1.2 實例化 Sandbox,初始狀態爲 unknown
// Create initial internal sandbox object.
sandbox := sandboxstore.NewSandbox(
sandboxstore.Metadata{
ID: id,
Name: name,
Config: config,
RuntimeHandler: r.GetRuntimeHandler(),
},
sandboxstore.Status{
State: sandboxstore.StateUnknown,
},
)
1.3 確保有鏡像,如果沒有鏡像則 pull 鏡像
// Ensure sandbox container image snapshot.
image, err := c.ensureImageExists(ctx, c.config.SandboxImage, config)
if err != nil {
return nil, errors.Wrapf(err, "failed to get sandbox image %q", c.config.SandboxImage)
}
containerdImage, err := c.toContainerdImage(ctx, *image)
if err != nil {
return nil, errors.Wrapf(err, "failed to get image from containerd %q", image.ID)
}
1.4 獲取 sandbox runtime
註解 io.kubernetes.cri.untrusted-workload = true,設置這個 untrusted 返回 untrusted runtime,否則返回默認 runtime io.containerd.runc.v1
[plugins."io.containerd.grpc.v1.cri".containerd]
snapshotter = "overlayfs"
default_runtime_name = "runc"
no_pivot = false
[plugins."io.containerd.grpc.v1.cri".containerd.default_runtime]
runtime_type = ""
runtime_engine = ""
runtime_root = ""
privileged_without_host_devices = false
[plugins."io.containerd.grpc.v1.cri".containerd.untrusted_workload_runtime]
runtime_type = ""
runtime_engine = ""
runtime_root = ""
privileged_without_host_devices = false
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes]
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]
runtime_type = "io.containerd.runc.v1"
runtime_engine = ""
runtime_root = ""
privileged_without_host_devices = false
// getSandboxRuntime returns the runtime configuration for sandbox.
// If the sandbox contains untrusted workload, runtime for untrusted workload will be returned,
// or else default runtime will be returned.
func (c *criService) getSandboxRuntime(config *runtime.PodSandboxConfig, runtimeHandler string) (criconfig.Runtime, error) {
if untrustedWorkload(config) {
// If the untrusted annotation is provided, runtimeHandler MUST be empty.
if runtimeHandler != "" && runtimeHandler != criconfig.RuntimeUntrusted {
return criconfig.Runtime{}, errors.New("untrusted workload with explicit runtime handler is not allowed")
}
// If the untrusted workload is requesting access to the host/node, this request will fail.
//
// Note: If the workload is marked untrusted but requests privileged, this can be granted, as the
// runtime may support this. For example, in a virtual-machine isolated runtime, privileged
// is a supported option, granting the workload to access the entire guest VM instead of host.
// TODO(windows): Deprecate this so that we don't need to handle it for windows.
if hostAccessingSandbox(config) {
return criconfig.Runtime{}, errors.New("untrusted workload with host access is not allowed")
}
runtimeHandler = criconfig.RuntimeUntrusted
}
if runtimeHandler == "" {
runtimeHandler = c.config.ContainerdConfig.DefaultRuntimeName
}
handler, ok := c.config.ContainerdConfig.Runtimes[runtimeHandler]
if !ok {
return criconfig.Runtime{}, errors.Errorf("no runtime for %q is configured", runtimeHandler)
}
return handler, nil
}
1.5 需要爲 pod 設置網絡
如果不是 host 網絡模式,需要創建 namespace
NewNetNS 創建網絡 namespace,在目錄 /var/run/netns/cni-%x-%x-%x-%x-%x
if podNetwork {
// If it is not in host network namespace then create a namespace and set the sandbox
// handle. NetNSPath in sandbox metadata and NetNS is non empty only for non host network
// namespaces. If the pod is in host network namespace then both are empty and should not
// be used.
sandbox.NetNS, err = netns.NewNetNS()
if err != nil {
return nil, errors.Wrapf(err, "failed to create network namespace for sandbox %q", id)
}
sandbox.NetNSPath = sandbox.NetNS.GetPath()
2. setupPodNetwork 爲 sandbox 創建網絡
整理傳給 CNI 插件的配置,包括 sandbox ID,網絡 namespace,以及基本配置,如果包括 bandwidth,dns
// setupPodNetwork setups up the network for a pod
func (c *criService) setupPodNetwork(ctx context.Context, sandbox *sandboxstore.Sandbox) error {
var (
id = sandbox.ID
config = sandbox.Config
path = sandbox.NetNSPath
)
if c.netPlugin == nil {
return errors.New("cni config not initialized")
}
opts, err := cniNamespaceOpts(id, config)
if err != nil {
return errors.Wrap(err, "get cni namespace options")
}
2..1 netPlugin.Setup 最終調用 AddNetworkList CNI 插件接口爲 sandbox 配置網絡
最終調用 plugin 二進制爲 sandbox 配置網絡
這裏輕描淡寫,知道個過程過,假設配置配置網絡成功,接着看看做了哪些工作
result, err := c.netPlugin.Setup(ctx, id, path, opts...)
if err != nil {
return err
}
logDebugCNIResult(ctx, id, result)
// Check if the default interface has IP config
if configs, ok := result.Interfaces[defaultIfName]; ok && len(configs.IPConfigs) > 0 {
sandbox.IP, sandbox.AdditionalIPs = selectPodIPs(configs.IPConfigs)
sandbox.CNIResult = result
return nil
}
3. 生成 runtime spec 配置
可以使用 crictl pods,crictl inspectp $id 查看配置
func (c *criService) sandboxContainerSpec(id string, config *runtime.PodSandboxConfig,
imageConfig *imagespec.ImageConfig, nsPath string, runtimePodAnnotations []string) (*runtimespec.Spec, error) {
// Creates a spec Generator with the default spec.
// TODO(random-liu): [P1] Compare the default settings with docker and containerd default.
specOpts := []oci.SpecOpts{
customopts.WithoutRunMount,
customopts.WithoutDefaultSecuritySettings,
customopts.WithRelativeRoot(relativeRootfsPath),
oci.WithEnv(imageConfig.Env),
oci.WithRootFSReadonly(),
oci.WithHostname(config.GetHostname()),
}
if imageConfig.WorkingDir != "" {
specOpts = append(specOpts, oci.WithProcessCwd(imageConfig.WorkingDir))
}
3.1 label 的類型爲 sandbox
// Generate spec options that will be applied to the spec later.
specOpts, err := c.sandboxContainerSpecOpts(config, &image.ImageSpec.Config)
if err != nil {
return nil, errors.Wrap(err, "failed to generate sanbdox container spec options")
}
sandboxLabels := buildLabels(config.Labels, containerKindSandbox)
4. 存儲 sandbox 信息,創建 root 工作目錄
container, err := c.client.NewContainer(ctx, id, opts...)
if err != nil {
return nil, errors.Wrap(err, "failed to create containerd container")
}
// Create sandbox container root directories.
sandboxRootDir := c.getSandboxRootDir(id)
if err := c.os.MkdirAll(sandboxRootDir, 0755); err != nil {
return nil, errors.Wrapf(err, "failed to create sandbox root directory %q",
sandboxRootDir)
}
4.1 setupSandboxFiles 主要創建 hostname resolv.conf hosts 等文件
// Setup files required for the sandbox.
if err = c.setupSandboxFiles(id, config); err != nil {
return nil, errors.Wrapf(err, "failed to setup sandbox files")
}
5. 創建 sandbox 任務
這個其實最終是發送 task 請求,分別爲 CreateTaskRequest,StartRequest,創建以及啓動任務
taskOpts := c.taskOpts(ociRuntime.Type)
// We don't need stdio for sandbox container.
task, err := container.NewTask(ctx, containerdio.NullIO, taskOpts...)
if err != nil {
return nil, errors.Wrap(err, "failed to create containerd task")
}
// wait is a long running background request, no timeout needed.
exitCh, err := task.Wait(ctrdutil.NamespacedContext())
if err != nil {
return nil, errors.Wrap(err, "failed to wait for sandbox container task")
}
if err := task.Start(ctx); err != nil {
return nil, errors.Wrapf(err, "failed to start sandbox container task %q", id)
}
5.1 比如使用默認 tasks-service io.containerd.service.v1
func (l *local) Create(ctx context.Context, r *api.CreateTaskRequest, _ ...grpc.CallOption) (*api.CreateTaskResponse, error) {
container, err := l.getContainer(ctx, r.ContainerID)
if err != nil {
return nil, errdefs.ToGRPC(err)
}
checkpointPath, err := getRestorePath(container.Runtime.Name, r.Options)
if err != nil {
return nil, err
}
5.2 比如 io.containerd.runc.v1
實現路徑爲 contaienrd/runtime/v1/runtime.go
// Create a new task
func (r *Runtime) Create(ctx context.Context, id string, opts runtime.CreateOpts) (_ runtime.Task, err error) {
namespace, err := namespaces.NamespaceRequired(ctx)
if err != nil {
return nil, err
}
if err := identifiers.Validate(id); err != nil {
return nil, errors.Wrapf(err, "invalid task id")
}
ropts, err := r.getRuncOptions(ctx, id)
if err != nil {
return nil, err
}
啓動 shim 進程
/usr/bin/containerd-shim-runc-v1 -namespace k8s.io -id d84185af26fcc146b4787ed08543c49d327bb97171ed6b669618f9793a8545fc -address /run/containerd/containerd.sock
shimopt := ShimLocal(r.config, r.events)
if !r.config.NoShim {
var cgroup string
if opts.TaskOptions != nil {
v, err := typeurl.UnmarshalAny(opts.TaskOptions)
if err != nil {
return nil, err
}
cgroup = v.(*runctypes.CreateOptions).ShimCgroup
}
exitHandler := func() {
log.G(ctx).WithField("id", id).Info("shim reaped")
if _, err := r.tasks.Get(ctx, id); err != nil {
// Task was never started or was already successfully deleted
return
}
if err = r.cleanupAfterDeadShim(context.Background(), bundle, namespace, id); err != nil {
log.G(ctx).WithError(err).WithFields(logrus.Fields{
"id": id,
"namespace": namespace,
}).Warn("failed to clean up after killed shim")
}
}
shimopt = ShimRemote(r.config, r.address, cgroup, exitHandler)
}
與 shim 建立GRPC 連接,發送 CreateTaskRequest
sopts := &shim.CreateTaskRequest{
ID: id,
Bundle: bundle.path,
Runtime: rt,
Stdin: opts.IO.Stdin,
Stdout: opts.IO.Stdout,
Stderr: opts.IO.Stderr,
Terminal: opts.IO.Terminal,
Checkpoint: opts.Checkpoint,
Options: opts.TaskOptions,
}
for _, m := range opts.Rootfs {
sopts.Rootfs = append(sopts.Rootfs, &types.Mount{
Type: m.Type,
Source: m.Source,
Options: m.Options,
})
}
cr, err := s.Create(ctx, sopts)
if err != nil {
return nil, errdefs.FromGRPC(err)
}
startTaskRequest 一樣的流程
6. 更新 sandbox 狀態爲 ready
if err := sandbox.Status.Update(func(status sandboxstore.Status) (sandboxstore.Status, error) {
// Set the pod sandbox as ready after successfully start sandbox container.
status.Pid = task.Pid()
status.State = sandboxstore.StateReady
status.CreatedAt = info.CreatedAt
return status, nil
}); err != nil {
return nil, errors.Wrap(err, "failed to update sandbox status")
}
總結:
RunPodSandbox 獲取配置,生成 ID,name 註冊 name <--> 映射關係,防止重複併發創建
確保 image 本地節點存在,不存在冊 pull image
獲取 runtime,根據 pod 註解,以及配置文件,如果 untrusted 則返回該 runtime,否則返回默認 runtime
爲 sandbox 創建網絡,與 docker-shim 不一樣的是這個先創建網絡
生成 spec 配置
發送 GRPC 創建以及啓動請求,成功將 sandbox 狀態改爲 ready