Introduction to cgroups

struct cgroup_root cgrp_dfl_root is the system's default hierarchy; it contains a single root cgroup, which holds all tasks.

Each cgroup mount creates a new hierarchy rooted at that mount point, containing one root cgroup.
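
As a userspace illustration, mounting a v1 hierarchy bound to a single controller boils down to a mount(2) call. This is a minimal sketch, assuming cgroup v1, root privileges, and that the mount point /sys/fs/cgroup/cpu already exists (the equivalent shell command is "mount -t cgroup -o cpu none /sys/fs/cgroup/cpu"):

/* sketch: create a v1 hierarchy for the "cpu" controller */
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* the "cpu" data string ends up in subsys_mask via parse_cgroupfs_options() */
	if (mount("none", "/sys/fs/cgroup/cpu", "cgroup", 0, "cpu") < 0) {
		perror("mount");
		return 1;
	}
	return 0;
}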

/*
 * A cgroup_root represents the root of a cgroup hierarchy, and may be
 * associated with a kernfs_root to form an active hierarchy.  This is
 * internal to cgroup core.  Don't access directly from controllers.
 */
struct cgroup_root {
    struct kernfs_root *kf_root;

    /* The bitmask of subsystems attached to this hierarchy */

The subsystems bound to this hierarchy are specified at mount time via the -o option; if none are given, all supported subsystems are attached by default.
    unsigned int subsys_mask;

    /* Unique id for this hierarchy. */
    int hierarchy_id;

    /* The root cgroup.  Root is destroyed on its release. */
    struct cgroup cgrp;

    /* Number of cgroups in the hierarchy, used only for /proc/cgroups */
    atomic_t nr_cgrps;

    /* Wait while cgroups are being destroyed */

When is this waited on? What is the trigger (rmdir of a cgroup?)
    wait_queue_head_t wait;

    /* A list running through the active hierarchies */

All active hierarchies are linked together through this list.
    struct list_head root_list;

    /* Hierarchy-specific flags */
    unsigned int flags;

    /* IDs for cgroups in this hierarchy */
    struct idr cgroup_idr;

    /* The path to use for release notifications. */

When notify_on_release is 1 and a cgroup becomes empty and is released, the agent program is executed; this field holds the path of that agent.
    char release_agent_path[PATH_MAX];

    /* The name for this hierarchy - may be empty */
    char name[MAX_CGROUP_ROOT_NAMELEN];
};
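
The release_agent_path and the notify_on_release behaviour are exposed to user space as files of the same name at the root of each v1 hierarchy. A minimal sketch (the hierarchy path and the agent binary /usr/local/bin/cg_cleanup are assumptions for illustration only):

/* sketch: configure the release agent of a v1 hierarchy */
#include <stdio.h>

static int write_str(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fputs(val, f);
	return fclose(f);
}

int main(void)
{
	/* stored in cgroup_root.release_agent_path */
	write_str("/sys/fs/cgroup/cpu/release_agent", "/usr/local/bin/cg_cleanup");
	/* when the last task leaves a cgroup whose notify_on_release is 1,
	 * the kernel runs the agent with that cgroup's path as its argument */
	write_str("/sys/fs/cgroup/cpu/notify_on_release", "1");
	return 0;
}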

 

Called very early from start_kernel():

/**
 * cgroup_init_early - cgroup initialization at system boot
 *
 * Initialize cgroups at system boot, and initialize any
 * subsystems that request early init.
 */
int __init cgroup_init_early(void)
{
    static struct cgroup_sb_opts __initdata opts;
    struct cgroup_subsys *ss;
    int i;

    init_cgroup_root(&cgrp_dfl_root, &opts);

Note the flag set here: it goes into the flags field of the cgroup_subsys_state embedded in cgrp_dfl_root's cgrp member. struct cgroup_subsys_state self is every cgroup's own built-in css, and its struct cgroup_subsys *ss pointer is NULL.

/* bits in struct cgroup_subsys_state flags field */
enum {
    CSS_NO_REF    = (1 << 0), /* no reference counting for this css */
    CSS_ONLINE    = (1 << 1), /* between ->css_online() and ->css_offline() */
    CSS_RELEASED    = (1 << 2), /* refcnt reached zero, released */
};

    cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF;

init_task's css_set pointer is set to init_css_set:

    RCU_INIT_POINTER(init_task.cgroups, &init_css_set);

    for_each_subsys(ss, i) {
        WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,
             "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p name:id=%d:%s\n",
             i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,
             ss->id, ss->name);
        WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
             "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]);

        ss->id = i;
        ss->name = cgroup_subsys_name[i];
        if (!ss->legacy_name)
            ss->legacy_name = cgroup_subsys_name[i];

        if (ss->early_init)
            cgroup_init_subsys(ss, true);
    }
    return 0;
}



/*
 * The default css_set - used by init and its children prior to any
 * hierarchies being mounted. It contains a pointer to the root state
 * for each subsystem. Also used to anchor the list of css_sets. Not
 * reference-counted, to improve performance when child cgroups
 * haven't been created.
 */
struct css_set init_css_set = {
    .refcount        = ATOMIC_INIT(1),
    .cgrp_links        = LIST_HEAD_INIT(init_css_set.cgrp_links),
    .tasks            = LIST_HEAD_INIT(init_css_set.tasks),
    .mg_tasks        = LIST_HEAD_INIT(init_css_set.mg_tasks),
    .mg_preload_node    = LIST_HEAD_INIT(init_css_set.mg_preload_node),
    .mg_node        = LIST_HEAD_INIT(init_css_set.mg_node),
    .task_iters        = LIST_HEAD_INIT(init_css_set.task_iters),
};


static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
{
    struct cgroup_subsys_state *css;

    printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);

    mutex_lock(&cgroup_mutex);

    idr_init(&ss->css_idr);
    INIT_LIST_HEAD(&ss->cfts);

    /* Create the root cgroup state for this subsystem */

The subsystem's root points at the default hierarchy's root:
    ss->root = &cgrp_dfl_root;

Each subsystem allocates its own cgroup_subsys_state:
    css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
    /* We don't handle early failures gracefully */
    BUG_ON(IS_ERR(css));

Initialize the css, assigning its cgroup and ss pointers:
    init_and_link_css(css, ss, &cgrp_dfl_root.cgrp);

    /*
     * Root csses are never destroyed and we can't initialize
     * percpu_ref during early init.  Disable refcnting.
     */
    css->flags |= CSS_NO_REF;

    if (early) {
        /* allocation can't be done safely during early init */
        css->id = 1;
    } else {
        css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL);
        BUG_ON(css->id < 0);
    }

    /* Update the init_css_set to contain a subsys
     * pointer to this state - since the subsystem is
     * newly registered, all tasks and hence the
     * init_css_set is in the subsystem's root cgroup. */

Install each subsystem's css into init_css_set's subsys[] array:
    init_css_set.subsys[ss->id] = css;

    have_fork_callback |= (bool)ss->fork << ss->id;
    have_exit_callback |= (bool)ss->exit << ss->id;
    have_free_callback |= (bool)ss->free << ss->id;
    have_canfork_callback |= (bool)ss->can_fork << ss->id;

    /* At system boot, before all subsystems have been
     * registered, no tasks have been forked, so we don't
     * need to invoke fork callbacks here. */

At this stage of boot, init_task's tasks list is still empty (no tasks have been forked yet):
    BUG_ON(!list_empty(&init_task.tasks));

    BUG_ON(online_css(css));

    mutex_unlock(&cgroup_mutex);
}


/**
 * cgroup_init - cgroup initialization
 *
 * Register cgroup filesystem and /proc file, and initialize
 * any subsystems that didn't request early init.
 */
int __init cgroup_init(void)
{
	struct cgroup_subsys *ss;
	int ssid;

	BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
Initialize the percpu read/write semaphore:
	BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
Initialize the cftype structures for the base cgroup files:
	BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
	BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));

	cgroup_rstat_boot();

	/*
	 * The latency of the synchronize_rcu() is too high for cgroups,
	 * avoid it at the cost of forcing all readers into the slow path.
	 */
	rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss);

	get_user_ns(init_cgroup_ns.user_ns);

	mutex_lock(&cgroup_mutex);

	/*
	 * Add init_css_set to the hash table so that dfl_root can link to
	 * it during init.
	 */
Hash init_css_set by its subsys[] array and add it to css_set_table:
	hash_add(css_set_table, &init_css_set.hlist,
		 css_set_hash(init_css_set.subsys));

	BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0, 0));

	mutex_unlock(&cgroup_mutex);

	for_each_subsys(ss, ssid) {
		if (ss->early_init) {
			struct cgroup_subsys_state *css =
				init_css_set.subsys[ss->id];

			css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2,
						   GFP_KERNEL);
			BUG_ON(css->id < 0);
		} else {
			cgroup_init_subsys(ss, false);
		}

		list_add_tail(&init_css_set.e_cset_node[ssid],
			      &cgrp_dfl_root.cgrp.e_csets[ssid]);

		/*
		 * Setting dfl_root subsys_mask needs to consider the
		 * disabled flag and cftype registration needs kmalloc,
		 * both of which aren't available during early_init.
		 */
		if (cgroup_disable_mask & (1 << ssid)) {
			static_branch_disable(cgroup_subsys_enabled_key[ssid]);
			printk(KERN_INFO "Disabling %s control group subsystem\n",
			       ss->name);
			continue;
		}

		if (cgroup1_ssid_disabled(ssid))
			printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n",
			       ss->name);

		cgrp_dfl_root.subsys_mask |= 1 << ss->id;

		/* implicit controllers must be threaded too */
		WARN_ON(ss->implicit_on_dfl && !ss->threaded);

		if (ss->implicit_on_dfl)
			cgrp_dfl_implicit_ss_mask |= 1 << ss->id;
		else if (!ss->dfl_cftypes)
			cgrp_dfl_inhibit_ss_mask |= 1 << ss->id;

		if (ss->threaded)
			cgrp_dfl_threaded_ss_mask |= 1 << ss->id;

		if (ss->dfl_cftypes == ss->legacy_cftypes) {
			WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
		} else {
			WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes));
			WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes));
		}

		if (ss->bind)
			ss->bind(init_css_set.subsys[ssid]);

		mutex_lock(&cgroup_mutex);
		css_populate_dir(init_css_set.subsys[ssid]);
		mutex_unlock(&cgroup_mutex);
	}

	/* init_css_set.subsys[] has been updated, re-hash */
	hash_del(&init_css_set.hlist);
	hash_add(css_set_table, &init_css_set.hlist,
		 css_set_hash(init_css_set.subsys));

	WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup"));
	WARN_ON(register_filesystem(&cgroup_fs_type));
	WARN_ON(register_filesystem(&cgroup2_fs_type));
	WARN_ON(!proc_create_single("cgroups", 0, NULL, proc_cgroupstats_show));

	return 0;
}

cgroup_init() mainly does the following:

  1. initializes the cgroup_threadgroup_rwsem percpu rw-semaphore;
  2. initializes the cftype structures for the base cgroup files, i.e. what is shown in the cgroup filesystem;
  3. hashes init_css_set by its subsys[] array into css_set_table;
  4. calls cgroup_setup_root() to set up and initialize cgrp_dfl_root;
  5. calls cgroup_init_subsys() for every subsystem that did not request early init.
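
The proc_create_single("cgroups", ...) call above is also what makes /proc/cgroups available; its columns (subsys_name, hierarchy, num_cgroups, enabled) come from each controller's root, notably hierarchy_id and nr_cgrps in struct cgroup_root. A minimal reader as a sketch:

/* sketch: dump /proc/cgroups */
#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/cgroups", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}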

 

Analysis of cgroup_setup_root()

Inside it there is a very important function, rebind_subsystems(), implemented as follows:

static int rebind_subsystems(struct cgroup_root *dst_root,
			     unsigned long ss_mask)
{
	struct cgroup *dcgrp = &dst_root->cgrp;
	struct cgroup_subsys *ss;
	unsigned long tmp_ss_mask;
	int ssid, i, ret;

	lockdep_assert_held(&cgroup_mutex);

	for_each_subsys_which(ss, ssid, &ss_mask) {
		/* if @ss has non-root csses attached to it, can't move */
		if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)))
			return -EBUSY;

		/* can't move between two non-dummy roots either */
		if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
			return -EBUSY;
	}

	/* skip creating root files on dfl_root for inhibited subsystems */
	tmp_ss_mask = ss_mask;
	if (dst_root == &cgrp_dfl_root)
		tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask;

	for_each_subsys_which(ss, ssid, &tmp_ss_mask) {
		struct cgroup *scgrp = &ss->root->cgrp;
		int tssid;

		/* Populate dcgrp with the files of the ss being moved away from
		 * scgrp; dcgrp is the cgroup of the new root that ss will be
		 * rebound to */
		ret = css_populate_dir(cgroup_css(scgrp, ss), dcgrp);
		if (!ret)
			continue;

		/*
		 * Rebinding back to the default root is not allowed to
		 * fail.  Using both default and non-default roots should
		 * be rare.  Moving subsystems back and forth even more so.
		 * Just warn about it and continue.
		 */
		if (dst_root == &cgrp_dfl_root) {
			if (cgrp_dfl_root_visible) {
				pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n",
					ret, ss_mask);
				pr_warn("you may retry by moving them to a different hierarchy and unbinding\n");
			}
			continue;
		}

		for_each_subsys_which(ss, tssid, &tmp_ss_mask) {
			if (tssid == ssid)
				break;
			/* roll back: clear the files already created on dcgrp for
			 * every subsystem processed before ssid (the one that
			 * failed to populate) */
			css_clear_dir(cgroup_css(scgrp, ss), dcgrp);
		}
		return ret;
	}

	/*
	 * Nothing can fail from this point on.  Remove files for the
	 * removed subsystems and rebind each subsystem.
	 */
	for_each_subsys_which(ss, ssid, &ss_mask) {
		struct cgroup_root *src_root = ss->root;
		struct cgroup *scgrp = &src_root->cgrp;
		struct cgroup_subsys_state *css = cgroup_css(scgrp, ss);
		struct css_set *cset;

		WARN_ON(!css || cgroup_css(dcgrp, ss));

		css_clear_dir(css, NULL);

		RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
		rcu_assign_pointer(dcgrp->subsys[ssid], css);
		ss->root = dst_root;
		css->cgroup = dcgrp;

		spin_lock_irq(&css_set_lock);
		hash_for_each(css_set_table, i, cset, hlist)
			list_move_tail(&cset->e_cset_node[ss->id],
				       &dcgrp->e_csets[ss->id]);
		spin_unlock_irq(&css_set_lock);

		src_root->subsys_mask &= ~(1 << ssid);
		scgrp->subtree_control &= ~(1 << ssid);
		cgroup_refresh_child_subsys_mask(scgrp);

		/* default hierarchy doesn't enable controllers by default */
		dst_root->subsys_mask |= 1 << ssid;
		if (dst_root == &cgrp_dfl_root) {
			static_branch_enable(cgroup_subsys_on_dfl_key[ssid]);
		} else {
			dcgrp->subtree_control |= 1 << ssid;
			cgroup_refresh_child_subsys_mask(dcgrp);
			static_branch_disable(cgroup_subsys_on_dfl_key[ssid]);
		}

		if (ss->bind)
			ss->bind(css);
	}

	kernfs_activate(dcgrp->kn);
	return 0;
}
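
Besides cgroup_setup_root() at mount time, the legacy remount path can also reach rebind_subsystems() when the controller set is changed. That behaviour is deprecated and may be rejected, or fail with -EBUSY if a controller is busy on another hierarchy, so treat the following only as a hedged sketch:

/* sketch: try to rebind controllers on an existing v1 mount via remount */
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	if (mount("none", "/sys/fs/cgroup/cpu", "cgroup", MS_REMOUNT,
		  "cpu,cpuacct") < 0) {
		perror("remount");
		return 1;
	}
	return 0;
}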

Now let's look at the implementation of cgroup_setup_root() itself:

static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask,
			     int ref_flags)
{
	LIST_HEAD(tmp_links);
	struct cgroup *root_cgrp = &root->cgrp;
	struct css_set *cset;
	int i, ret;

	lockdep_assert_held(&cgroup_mutex);

	ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_KERNEL);
	if (ret < 0)
		goto out;
	root_cgrp->id = ret;

	ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, ref_flags,
			      GFP_KERNEL);
	if (ret)
		goto out;

	/*
	 * We're accessing css_set_count without locking css_set_lock here,
	 * but that's OK - it can only be increased by someone holding
	 * cgroup_lock, and that's us. The worst that can happen is that we
	 * have some link structures left over
	 */
	ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
	if (ret)
		goto cancel_ref;

	ret = cgroup_init_root_id(root);
	if (ret)
		goto cancel_ref;

	root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops,
					   KERNFS_ROOT_CREATE_DEACTIVATED,
					   root_cgrp);
	if (IS_ERR(root->kf_root)) {
		ret = PTR_ERR(root->kf_root);
		goto exit_root_id;
	}
	root_cgrp->kn = root->kf_root->kn;

	ret = css_populate_dir(&root_cgrp->self, NULL);
	if (ret)
		goto destroy_root;

	ret = rebind_subsystems(root, ss_mask);
	if (ret)
		goto destroy_root;

	/*
	 * There must be no failure case after here, since rebinding takes
	 * care of subsystems' refcounts, which are explicitly dropped in
	 * the failure exit path.
	 */
	list_add(&root->root_list, &cgroup_roots);
	cgroup_root_count++;

	/*
	 * Link the root cgroup in this hierarchy into all the css_set
	 * objects.
	 */
	spin_lock_irq(&css_set_lock);
	hash_for_each(css_set_table, i, cset, hlist) {
		link_css_set(&tmp_links, cset, root_cgrp);
		if (css_set_populated(cset))
			cgroup_update_populated(root_cgrp, true);
	}
	spin_unlock_irq(&css_set_lock);

	BUG_ON(!list_empty(&root_cgrp->self.children));
	BUG_ON(atomic_read(&root->nr_cgrps) != 1);

	kernfs_activate(root_cgrp->kn);
	ret = 0;
	goto out;

destroy_root:
	kernfs_destroy_root(root->kf_root);
	root->kf_root = NULL;
exit_root_id:
	cgroup_exit_root_id(root);
cancel_ref:
	percpu_ref_exit(&root_cgrp->self.refcnt);
out:
	free_cgrp_cset_links(&tmp_links);
	return ret;
}

Both of the functions above use css_populate_dir() below to create a cgroup's control files, so that they become visible to user space through the filesystem:

/**
 * css_populate_dir - create subsys files in a cgroup directory
 * @css: target css
 * @cgrp_override: specify if target cgroup is different from css->cgroup
 *
 * On failure, no file is added.
 */
static int css_populate_dir(struct cgroup_subsys_state *css,
			    struct cgroup *cgrp_override)
{
	struct cgroup *cgrp = cgrp_override ?: css->cgroup;
	struct cftype *cfts, *failed_cfts;
	int ret;

	if (!css->ss) {
		if (cgroup_on_dfl(cgrp))
			cfts = cgroup_dfl_base_files;
		else
			cfts = cgroup_legacy_base_files;

		return cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
	}

	list_for_each_entry(cfts, &css->ss->cfts, node) {
		ret = cgroup_addrm_files(css, cgrp, cfts, true);
		if (ret < 0) {
			failed_cfts = cfts;
			goto err;
		}
	}
	return 0;
err:
	list_for_each_entry(cfts, &css->ss->cfts, node) {
		if (cfts == failed_cfts)
			break;
		cgroup_addrm_files(css, cgrp, cfts, false);
	}
	return ret;
}
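
The cftype entries added here through cgroup_addrm_files() are exactly the control files you see when listing a mounted hierarchy. A small sketch (the directory path is an assumption):

/* sketch: list the files css_populate_dir() created for a root cgroup */
#include <stdio.h>
#include <dirent.h>

int main(void)
{
	struct dirent *de;
	DIR *d = opendir("/sys/fs/cgroup/cpu");

	if (!d) {
		perror("opendir");
		return 1;
	}
	while ((de = readdir(d)))
		puts(de->d_name);	/* e.g. tasks, cgroup.procs, cpu.shares */
	closedir(d);
	return 0;
}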

Next, let's look at the attach path:

/**
 * cgroup_taskset_migrate - migrate a taskset to a cgroup
 * @tset: target taskset
 * @dst_cgrp: destination cgroup
 *
 * Migrate tasks in @tset to @dst_cgrp.  This function fails iff one of the
 * ->can_attach callbacks fails and guarantees that either all or none of
 * the tasks in @tset are migrated.  @tset is consumed regardless of
 * success.
 */
static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
				  struct cgroup *dst_cgrp)
{
	struct cgroup_subsys_state *css, *failed_css = NULL;
	struct task_struct *task, *tmp_task;
	struct css_set *cset, *tmp_cset;
	int i, ret;

	/* methods shouldn't be called if no task is actually migrating */
	if (list_empty(&tset->src_csets))
		return -ESRCH;

	/* check that we can legitimately attach to the cgroup */
	for_each_e_css(css, i, dst_cgrp) {
		if (css->ss->can_attach) {
			tset->ssid = i;
			ret = css->ss->can_attach(tset);
			if (ret) {
				failed_css = css;
				goto out_cancel_attach;
			}
		}
	}

	/*
	 * Now that we're guaranteed success, proceed to move all tasks to
	 * the new cgroup.  There are no failure cases after here, so this
	 * is the commit point.
	 */
	spin_lock_irq(&css_set_lock);
	list_for_each_entry(cset, &tset->src_csets, mg_node) {
		list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) {
			struct css_set *from_cset = task_css_set(task);
			struct css_set *to_cset = cset->mg_dst_cset;

			get_css_set(to_cset);
			css_set_move_task(task, from_cset, to_cset, true);
			put_css_set_locked(from_cset);
		}
	}
	spin_unlock_irq(&css_set_lock);

	/*
	 * Migration is committed, all target tasks are now on dst_csets.
	 * Nothing is sensitive to fork() after this point.  Notify
	 * controllers that migration is complete.
	 */
	tset->csets = &tset->dst_csets;

	for_each_e_css(css, i, dst_cgrp) {
		if (css->ss->attach) {
			tset->ssid = i;
			css->ss->attach(tset);
		}
	}

	ret = 0;
	goto out_release_tset;

out_cancel_attach:
	for_each_e_css(css, i, dst_cgrp) {
		if (css == failed_css)
			break;
		if (css->ss->cancel_attach) {
			tset->ssid = i;
			css->ss->cancel_attach(tset);
		}
	}
out_release_tset:
	spin_lock_irq(&css_set_lock);
	list_splice_init(&tset->dst_csets, &tset->src_csets);
	list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) {
		list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
		list_del_init(&cset->mg_node);
	}
	spin_unlock_irq(&css_set_lock);
	return ret;
}

/**
 * cgroup_migrate_finish - cleanup after attach
 * @preloaded_csets: list of preloaded css_sets
 *
 * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst().  See
 * those functions for details.
 */
static void cgroup_migrate_finish(struct list_head *preloaded_csets)
{
	struct css_set *cset, *tmp_cset;

	lockdep_assert_held(&cgroup_mutex);

	spin_lock_irq(&css_set_lock);
	list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
		cset->mg_src_cgrp = NULL;
		cset->mg_dst_cset = NULL;
		list_del_init(&cset->mg_preload_node);
		put_css_set_locked(cset);
	}
	spin_unlock_irq(&css_set_lock);
}

/**
 * cgroup_migrate_add_src - add a migration source css_set
 * @src_cset: the source css_set to add
 * @dst_cgrp: the destination cgroup
 * @preloaded_csets: list of preloaded css_sets
 *
 * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp.  Pin
 * @src_cset and add it to @preloaded_csets, which should later be cleaned
 * up by cgroup_migrate_finish().
 *
 * This function may be called without holding cgroup_threadgroup_rwsem
 * even if the target is a process.  Threads may be created and destroyed
 * but as long as cgroup_mutex is not dropped, no new css_set can be put
 * into play and the preloaded css_sets are guaranteed to cover all
 * migrations.
 */
static void cgroup_migrate_add_src(struct css_set *src_cset,
				   struct cgroup *dst_cgrp,
				   struct list_head *preloaded_csets)
{
	struct cgroup *src_cgrp;

	lockdep_assert_held(&cgroup_mutex);
	lockdep_assert_held(&css_set_lock);

	/*
	 * If ->dead, @src_set is associated with one or more dead cgroups
	 * and doesn't contain any migratable tasks.  Ignore it early so
	 * that the rest of migration path doesn't get confused by it.
	 */
	if (src_cset->dead)
		return;

	/* Find the cgroup in the dst_cgrp->root hierarchy that src_cset is
	 * currently linked to.  In the caller cgroup_attach_task(), src_cset
	 * is task->cgroups, i.e. the css_set of the task being moved, so this
	 * locates the cgroup the task currently belongs to within the same
	 * hierarchy as dst_cgrp (dst_cgrp->root).
	 */
	src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);

	if (!list_empty(&src_cset->mg_preload_node))
		return;

	WARN_ON(src_cset->mg_src_cgrp);
	WARN_ON(!list_empty(&src_cset->mg_tasks));
	WARN_ON(!list_empty(&src_cset->mg_node));

	src_cset->mg_src_cgrp = src_cgrp;
	get_css_set(src_cset);
	list_add(&src_cset->mg_preload_node, preloaded_csets);
}
/**
 * cgroup_migrate_prepare_dst - prepare destination css_sets for migration
 * @dst_cgrp: the destination cgroup (may be %NULL)
 * @preloaded_csets: list of preloaded source css_sets
 *
 * Tasks are about to be moved to @dst_cgrp and all the source css_sets
 * have been preloaded to @preloaded_csets.  This function looks up and
 * pins all destination css_sets, links each to its source, and append them
 * to @preloaded_csets.  If @dst_cgrp is %NULL, the destination of each
 * source css_set is assumed to be its cgroup on the default hierarchy.
 *
 * This function must be called after cgroup_migrate_add_src() has been
 * called on each migration source css_set.  After migration is performed
 * using cgroup_migrate(), cgroup_migrate_finish() must be called on
 * @preloaded_csets.
 */
static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
				      struct list_head *preloaded_csets)
{
	LIST_HEAD(csets);
	struct css_set *src_cset, *tmp_cset;

	lockdep_assert_held(&cgroup_mutex);

	/*
	 * Except for the root, child_subsys_mask must be zero for a cgroup
	 * with tasks so that child cgroups don't compete against tasks.
	 */
	if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && cgroup_parent(dst_cgrp) &&
	    dst_cgrp->child_subsys_mask)
		return -EBUSY;

	/* look up the dst cset for each src cset and link it to src */
	list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) {
		struct css_set *dst_cset;

		dst_cset = find_css_set(src_cset,
					dst_cgrp ?: src_cset->dfl_cgrp);
		if (!dst_cset)
			goto err;

		WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);

		/*
		 * If src cset equals dst, it's noop.  Drop the src.
		 * cgroup_migrate() will skip the cset too.  Note that we
		 * can't handle src == dst as some nodes are used by both.
		 */
		if (src_cset == dst_cset) {
			src_cset->mg_src_cgrp = NULL;
			list_del_init(&src_cset->mg_preload_node);
			put_css_set(src_cset);
			put_css_set(dst_cset);
			continue;
		}

		src_cset->mg_dst_cset = dst_cset;

		if (list_empty(&dst_cset->mg_preload_node))
			list_add(&dst_cset->mg_preload_node, &csets);
		else
			put_css_set(dst_cset);
	}

	list_splice_tail(&csets, preloaded_csets);
	return 0;
err:
	cgroup_migrate_finish(&csets);
	return -ENOMEM;
}

/**
 * cgroup_migrate - migrate a process or task to a cgroup
 * @leader: the leader of the process or the task to migrate
 * @threadgroup: whether @leader points to the whole process or a single task
 * @cgrp: the destination cgroup
 *
 * Migrate a process or task denoted by @leader to @cgrp.  If migrating a
 * process, the caller must be holding cgroup_threadgroup_rwsem.  The
 * caller is also responsible for invoking cgroup_migrate_add_src() and
 * cgroup_migrate_prepare_dst() on the targets before invoking this
 * function and following up with cgroup_migrate_finish().
 *
 * As long as a controller's ->can_attach() doesn't fail, this function is
 * guaranteed to succeed.  This means that, excluding ->can_attach()
 * failure, when migrating multiple targets, the success or failure can be
 * decided for all targets by invoking group_migrate_prepare_dst() before
 * actually starting migrating.
 */
static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
			  struct cgroup *cgrp)
{
	struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
	struct task_struct *task;

	/*
	 * Prevent freeing of tasks while we take a snapshot. Tasks that are
	 * already PF_EXITING could be freed from underneath us unless we
	 * take an rcu_read_lock.
	 */
	spin_lock_irq(&css_set_lock);
	rcu_read_lock();
	task = leader;
	do {
		cgroup_taskset_add(task, &tset);
		if (!threadgroup)
			break;
	} while_each_thread(leader, task);
	rcu_read_unlock();
	spin_unlock_irq(&css_set_lock);

	return cgroup_taskset_migrate(&tset, cgrp);
}

/**
 * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
 * @dst_cgrp: the cgroup to attach to
 * @leader: the task or the leader of the threadgroup to be attached
 * @threadgroup: attach the whole threadgroup?
 *
 * Call holding cgroup_mutex and cgroup_threadgroup_rwsem.
 */
static int cgroup_attach_task(struct cgroup *dst_cgrp,
			      struct task_struct *leader, bool threadgroup)
{
	LIST_HEAD(preloaded_csets);
	struct task_struct *task;
	int ret;
	bool same_cgrp = true;

	/* look up all src csets */
	spin_lock_irq(&css_set_lock);
	rcu_read_lock();
	task = leader;
	do {
		/* Link the task's css_set onto preloaded_csets via its
		 * mg_preload_node, and record in that css_set's mg_src_cgrp the
		 * cgroup the task currently belongs to in the same hierarchy as
		 * dst_cgrp.
		 */
		cgroup_migrate_add_src(task_css_set(task), dst_cgrp,
				       &preloaded_csets);
		if (task_css_set(task)->mg_src_cgrp != dst_cgrp)
			same_cgrp = false;
		if (!threadgroup)
			break;
	} while_each_thread(leader, task);
	rcu_read_unlock();
	spin_unlock_irq(&css_set_lock);

	/* prepare dst csets and commit */
	ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets);
	if (!ret)
		ret = cgroup_migrate(leader, threadgroup, dst_cgrp);

	cgroup_migrate_finish(&preloaded_csets);

	if (same_cgrp)
		ret = 0;

	return ret;
}
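
From user space this whole path is driven by writing a PID into a cgroup's cgroup.procs file (whole thread group, i.e. threadgroup == true) or its tasks file (a single thread). A minimal sketch, assuming a child cgroup named "test" already exists in the cpu hierarchy:

/* sketch: move the current process into an existing cgroup */
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	FILE *f = fopen("/sys/fs/cgroup/cpu/test/cgroup.procs", "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	fprintf(f, "%d\n", getpid());	/* the kernel side ends up in cgroup_attach_task() */
	fclose(f);
	return 0;
}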

Finally, the cgroup_mount() operation:

static struct dentry *cgroup_mount(struct file_system_type *fs_type,
			 int flags, const char *unused_dev_name,
			 void *data)
{
	struct super_block *pinned_sb = NULL;
	struct cgroup_subsys *ss;
	struct cgroup_root *root;
	struct cgroup_sb_opts opts;
	struct dentry *dentry;
	int ret;
	int i;
	bool new_sb;
	bool new_root = false;

	/*
	 * The first time anyone tries to mount a cgroup, enable the list
	 * linking each css_set to its tasks and fix up all existing tasks.
	 */
	if (!use_task_css_set_links)
		cgroup_enable_task_cg_lists();

	mutex_lock(&cgroup_mutex);

	/* First find the desired set of subsystems */
	ret = parse_cgroupfs_options(data, &opts);
	if (ret)
		goto out_unlock;

	/* look for a matching existing root */
	if (opts.flags & CGRP_ROOT_SANE_BEHAVIOR) {
		cgrp_dfl_root_visible = true;
		root = &cgrp_dfl_root;
		cgroup_get(&root->cgrp);
		ret = 0;
		goto out_unlock;
	}

	/*
	 * Destruction of cgroup root is asynchronous, so subsystems may
	 * still be dying after the previous unmount.  Let's drain the
	 * dying subsystems.  We just need to ensure that the ones
	 * unmounted previously finish dying and don't care about new ones
	 * starting.  Testing ref liveliness is good enough.
	 */
	for_each_subsys(ss, i) {
		if (!(opts.subsys_mask & (1 << i)) ||
		    ss->root == &cgrp_dfl_root)
			continue;

		if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
			mutex_unlock(&cgroup_mutex);
			msleep(10);
			ret = restart_syscall();
			goto out_free;
		}
		cgroup_put(&ss->root->cgrp);
	}

	for_each_root(root) {
		bool name_match = false;

		if (root == &cgrp_dfl_root)
			continue;

		/*
		 * If we asked for a name then it must match.  Also, if
		 * name matches but sybsys_mask doesn't, we should fail.
		 * Remember whether name matched.
		 */
		if (opts.name) {
			if (strcmp(opts.name, root->name))
				continue;
			name_match = true;
		}

		/*
		 * If we asked for subsystems (or explicitly for no
		 * subsystems) then they must match.
		 */
		if ((opts.subsys_mask || opts.none) &&
		    (opts.subsys_mask != root->subsys_mask)) {
			if (!name_match)
				continue;
			ret = -EBUSY;
			goto out_unlock;
		}

		if (root->flags ^ opts.flags)
			pr_warn("new mount options do not match the existing superblock, will be ignored\n");

		/*
		 * We want to reuse @root whose lifetime is governed by its
		 * ->cgrp.  Let's check whether @root is alive and keep it
		 * that way.  As cgroup_kill_sb() can happen anytime, we
		 * want to block it by pinning the sb so that @root doesn't
		 * get killed before mount is complete.
		 *
		 * With the sb pinned, tryget_live can reliably indicate
		 * whether @root can be reused.  If it's being killed,
		 * drain it.  We can use wait_queue for the wait but this
		 * path is super cold.  Let's just sleep a bit and retry.
		 */
		pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
		if (IS_ERR(pinned_sb) ||
		    !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
			mutex_unlock(&cgroup_mutex);
			if (!IS_ERR_OR_NULL(pinned_sb))
				deactivate_super(pinned_sb);
			msleep(10);
			ret = restart_syscall();
			goto out_free;
		}

		ret = 0;
		goto out_unlock;
	}

	/*
	 * No such thing, create a new one.  name= matching without subsys
	 * specification is allowed for already existing hierarchies but we
	 * can't create new one without subsys specification.
	 */
	if (!opts.subsys_mask && !opts.none) {
		ret = -EINVAL;
		goto out_unlock;
	}

	root = kzalloc(sizeof(*root), GFP_KERNEL);
	if (!root) {
		ret = -ENOMEM;
		goto out_unlock;
	}
	new_root = true;

	init_cgroup_root(root, &opts);

	ret = cgroup_setup_root(root, opts.subsys_mask, PERCPU_REF_INIT_DEAD);
	if (ret)
		cgroup_free_root(root);

out_unlock:
	mutex_unlock(&cgroup_mutex);
out_free:
	kfree(opts.release_agent);
	kfree(opts.name);

	if (ret)
		return ERR_PTR(ret);

	dentry = kernfs_mount(fs_type, flags, root->kf_root,
				CGROUP_SUPER_MAGIC, &new_sb);
	if (IS_ERR(dentry) || !new_sb)
		cgroup_put(&root->cgrp);

	/*
	 * There's a race window after we release cgroup_mutex and before
	 * allocating a superblock. Make sure a concurrent process won't
	 * be able to re-use the root during this window by delaying the
	 * initialization of root refcnt.
	 */
	if (new_root) {
		mutex_lock(&cgroup_mutex);
		percpu_ref_reinit(&root->cgrp.self.refcnt);
		mutex_unlock(&cgroup_mutex);
	}

	/*
	 * If @pinned_sb, we're reusing an existing root and holding an
	 * extra ref on its sb.  Mount is complete.  Put the extra ref.
	 */
	if (pinned_sb) {
		WARN_ON(new_sb);
		deactivate_super(pinned_sb);
	}

	return dentry;
}
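
The name= and opts.none handling above can be exercised by mounting a named hierarchy with no controllers; mounting with the same name again later matches an existing root in the for_each_root() loop instead of creating a new one. A hedged sketch (the mount point is assumed to exist):

/* sketch: create or reuse a named v1 hierarchy without controllers */
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* "none" sets opts.none and "name=" fills opts.name in parse_cgroupfs_options() */
	if (mount("none", "/mnt/mycg", "cgroup", 0, "none,name=mygrp") < 0) {
		perror("mount");
		return 1;
	}
	return 0;
}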

 
