Blame - kernel/cgroup.c - codeaurora/cp-linux

blob: 4cb94b678e9fa81e125b7183cc9fa2c8f22843cd [file] [log] [blame]

Kyle Swenson	8d8f654	2021-03-15 11:02:55 -0600	[diff] [blame^]	1	/*
				2	* Generic process-grouping system.
				3	*
				4	* Based originally on the cpuset system, extracted by Paul Menage
				5	* Copyright (C) 2006 Google, Inc
				6	*
				7	* Notifications support
				8	* Copyright (C) 2009 Nokia Corporation
				9	* Author: Kirill A. Shutemov
				10	*
				11	* Copyright notices from the original cpuset code:
				12	* --------------------------------------------------
				13	* Copyright (C) 2003 BULL SA.
				14	* Copyright (C) 2004-2006 Silicon Graphics, Inc.
				15	*
				16	* Portions derived from Patrick Mochel's sysfs code.
				17	* sysfs is Copyright (c) 2001-3 Patrick Mochel
				18	*
				19	* 2003-10-10 Written by Simon Derr.
				20	* 2003-10-22 Updates by Stephen Hemminger.
				21	* 2004 May-July Rework by Paul Jackson.
				22	* ---------------------------------------------------
				23	*
				24	* This file is subject to the terms and conditions of the GNU General Public
				25	* License. See the file COPYING in the main directory of the Linux
				26	* distribution for more details.
				27	*/
				28
				29	#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
				30
				31	#include <linux/cgroup.h>
				32	#include <linux/cred.h>
				33	#include <linux/ctype.h>
				34	#include <linux/errno.h>
				35	#include <linux/init_task.h>
				36	#include <linux/kernel.h>
				37	#include <linux/list.h>
				38	#include <linux/magic.h>
				39	#include <linux/mm.h>
				40	#include <linux/mutex.h>
				41	#include <linux/mount.h>
				42	#include <linux/pagemap.h>
				43	#include <linux/proc_fs.h>
				44	#include <linux/rcupdate.h>
				45	#include <linux/sched.h>
				46	#include <linux/slab.h>
				47	#include <linux/spinlock.h>
				48	#include <linux/percpu-rwsem.h>
				49	#include <linux/string.h>
				50	#include <linux/sort.h>
				51	#include <linux/kmod.h>
				52	#include <linux/delayacct.h>
				53	#include <linux/cgroupstats.h>
				54	#include <linux/hashtable.h>
				55	#include <linux/pid_namespace.h>
				56	#include <linux/idr.h>
				57	#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
				58	#include <linux/kthread.h>
				59	#include <linux/delay.h>
				60	#include <linux/cpuset.h>
				61	#include <linux/atomic.h>
				62
				63	/*
				64	* pidlists linger the following amount before being destroyed. The goal
				65	* is avoiding frequent destruction in the middle of consecutive read calls.
				66	* Expiring in the middle is a performance problem, not a correctness one.
				67	* 1 sec should be enough.
				68	*/
				69	#define CGROUP_PIDLIST_DESTROY_DELAY HZ
				70
				71	#define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \
				72	MAX_CFTYPE_NAME + 2)
				73
				74	/*
				75	* cgroup_mutex is the master lock. Any modification to cgroup or its
				76	* hierarchy must be performed while holding it.
				77	*
				78	* css_set_lock protects task->cgroups pointer, the list of css_set
				79	* objects, and the chain of tasks off each css_set.
				80	*
				81	* These locks are exported if CONFIG_PROVE_RCU so that accessors in
				82	* cgroup.h can use them for lockdep annotations.
				83	*/
				84	#ifdef CONFIG_PROVE_RCU
				85	DEFINE_MUTEX(cgroup_mutex);
				86	DEFINE_SPINLOCK(css_set_lock);
				87	EXPORT_SYMBOL_GPL(cgroup_mutex);
				88	EXPORT_SYMBOL_GPL(css_set_lock);
				89	#else
				90	static DEFINE_MUTEX(cgroup_mutex);
				91	static DEFINE_SPINLOCK(css_set_lock);
				92	#endif
				93
				94	/*
				95	* Protects cgroup_idr and css_idr so that IDs can be released without
				96	* grabbing cgroup_mutex.
				97	*/
				98	static DEFINE_SPINLOCK(cgroup_idr_lock);
				99
				100	/*
				101	* Protects cgroup_file->kn for !self csses. It synchronizes notifications
				102	* against file removal/re-creation across css hiding.
				103	*/
				104	static DEFINE_SPINLOCK(cgroup_file_kn_lock);
				105
				106	/*
				107	* Protects cgroup_subsys->release_agent_path. Modifying it also requires
				108	* cgroup_mutex. Reading requires either cgroup_mutex or this spinlock.
				109	*/
				110	static DEFINE_SPINLOCK(release_agent_path_lock);
				111
				112	struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
				113
				114	#define cgroup_assert_mutex_or_rcu_locked() \
				115	RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
				116	!lockdep_is_held(&cgroup_mutex), \
				117	"cgroup_mutex or RCU read lock required");
				118
				119	/*
				120	* cgroup destruction makes heavy use of work items and there can be a lot
				121	* of concurrent destructions. Use a separate workqueue so that cgroup
				122	* destruction work items don't end up filling up max_active of system_wq
				123	* which may lead to deadlock.
				124	*/
				125	static struct workqueue_struct *cgroup_destroy_wq;
				126
				127	/*
				128	* pidlist destructions need to be flushed on cgroup destruction. Use a
				129	* separate workqueue as flush domain.
				130	*/
				131	static struct workqueue_struct *cgroup_pidlist_destroy_wq;
				132
				133	/* generate an array of cgroup subsystem pointers */
				134	#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
				135	static struct cgroup_subsys *cgroup_subsys[] = {
				136	#include <linux/cgroup_subsys.h>
				137	};
				138	#undef SUBSYS
				139
				140	/* array of cgroup subsystem names */
				141	#define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
				142	static const char *cgroup_subsys_name[] = {
				143	#include <linux/cgroup_subsys.h>
				144	};
				145	#undef SUBSYS
				146
				147	/* array of static_keys for cgroup_subsys_enabled() and cgroup_subsys_on_dfl() */
				148	#define SUBSYS(_x) \
				149	DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_enabled_key); \
				150	DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_on_dfl_key); \
				151	EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_enabled_key); \
				152	EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_on_dfl_key);
				153	#include <linux/cgroup_subsys.h>
				154	#undef SUBSYS
				155
				156	#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_enabled_key,
				157	static struct static_key_true *cgroup_subsys_enabled_key[] = {
				158	#include <linux/cgroup_subsys.h>
				159	};
				160	#undef SUBSYS
				161
				162	#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_on_dfl_key,
				163	static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
				164	#include <linux/cgroup_subsys.h>
				165	};
				166	#undef SUBSYS
				167
				168	/*
				169	* The default hierarchy, reserved for the subsystems that are otherwise
				170	* unattached - it never has more than a single cgroup, and all tasks are
				171	* part of that cgroup.
				172	*/
				173	struct cgroup_root cgrp_dfl_root;
				174	EXPORT_SYMBOL_GPL(cgrp_dfl_root);
				175
				176	/*
				177	* The default hierarchy always exists but is hidden until mounted for the
				178	* first time. This is for backward compatibility.
				179	*/
				180	static bool cgrp_dfl_root_visible;
				181
				182	/* some controllers are not supported in the default hierarchy */
				183	static unsigned long cgrp_dfl_root_inhibit_ss_mask;
				184
				185	/* The list of hierarchy roots */
				186
				187	static LIST_HEAD(cgroup_roots);
				188	static int cgroup_root_count;
				189
				190	/* hierarchy ID allocation and mapping, protected by cgroup_mutex */
				191	static DEFINE_IDR(cgroup_hierarchy_idr);
				192
				193	/*
				194	* Assign a monotonically increasing serial number to csses. It guarantees
				195	* cgroups with bigger numbers are newer than those with smaller numbers.
				196	* Also, as csses are always appended to the parent's ->children list, it
				197	* guarantees that sibling csses are always sorted in the ascending serial
				198	* number order on the list. Protected by cgroup_mutex.
				199	*/
				200	static u64 css_serial_nr_next = 1;
				201
				202	/*
				203	* These bitmask flags indicate whether tasks in the fork and exit paths have
				204	* fork/exit handlers to call. This avoids us having to do extra work in the
				205	* fork/exit path to check which subsystems have fork/exit callbacks.
				206	*/
				207	static unsigned long have_fork_callback __read_mostly;
				208	static unsigned long have_exit_callback __read_mostly;
				209	static unsigned long have_free_callback __read_mostly;
				210
				211	/* Ditto for the can_fork callback. */
				212	static unsigned long have_canfork_callback __read_mostly;
				213
				214	static struct cftype cgroup_dfl_base_files[];
				215	static struct cftype cgroup_legacy_base_files[];
				216
				217	static int rebind_subsystems(struct cgroup_root *dst_root,
				218	unsigned long ss_mask);
				219	static void css_task_iter_advance(struct css_task_iter *it);
				220	static int cgroup_destroy_locked(struct cgroup *cgrp);
				221	static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
				222	bool visible);
				223	static void css_release(struct percpu_ref *ref);
				224	static void kill_css(struct cgroup_subsys_state *css);
				225	static int cgroup_addrm_files(struct cgroup_subsys_state *css,
				226	struct cgroup *cgrp, struct cftype cfts[],
				227	bool is_add);
				228
				229	/**
				230	* cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID
				231	* @ssid: subsys ID of interest
				232	*
				233	* cgroup_subsys_enabled() can only be used with literal subsys names, which
				234	* is fine for individual subsystems but unsuitable for cgroup core. This
				235	* is a slower, static_key_enabled() based test indexed by @ssid.
					*
					* Returns %false unconditionally when CGROUP_SUBSYS_COUNT is 0; otherwise
					* returns the state of cgroup_subsys_enabled_key[@ssid].
				236	*/
				237	static bool cgroup_ssid_enabled(int ssid)
				238	{
				239	if (CGROUP_SUBSYS_COUNT == 0)
				240	return false;
				241
				242	return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
				243	}
				244
				245	/**
				246	* cgroup_on_dfl - test whether a cgroup is on the default hierarchy
				247	* @cgrp: the cgroup of interest
				248	*
				249	* The default hierarchy is the v2 interface of cgroup and this function
				250	* can be used to test whether a cgroup is on the default hierarchy for
				251	* cases where a subsystem should behave differently depending on the
				252	* interface version.
				253	*
				254	* The set of behaviors which change on the default hierarchy are still
				255	* being determined and the mount option is prefixed with __DEVEL__.
				256	*
				257	* List of changed behaviors:
				258	*
				259	* - Mount options "noprefix", "xattr", "clone_children", "release_agent"
				260	* and "name" are disallowed.
				261	*
				262	* - When mounting an existing superblock, mount options should match.
				263	*
				264	* - Remount is disallowed.
				265	*
				266	* - rename(2) is disallowed.
				267	*
				268	* - "tasks" is removed. Everything should be at process granularity. Use
				269	* "cgroup.procs" instead.
				270	*
				271	* - "cgroup.procs" is not sorted. pids will be unique unless they got
				272	* recycled in between reads.
				273	*
				274	* - "release_agent" and "notify_on_release" are removed. Replacement
				275	* notification mechanism will be implemented.
				276	*
				277	* - "cgroup.clone_children" is removed.
				278	*
				279	* - "cgroup.subtree_populated" is available. Its value is 0 if the cgroup
				280	* and its descendants contain no task; otherwise, 1. The file also
				281	* generates kernfs notification which can be monitored through poll and
				282	* [di]notify when the value of the file changes.
				283	*
				284	* - cpuset: tasks will be kept in empty cpusets when hotplug happens and
				285	* take masks of ancestors with non-empty cpus/mems, instead of being
				286	* moved to an ancestor.
				287	*
				288	* - cpuset: a task can be moved into an empty cpuset, and again it takes
				289	* masks of ancestors.
				290	*
				291	* - memcg: use_hierarchy is on by default and the cgroup file for the flag
				292	* is not created.
				293	*
				294	* - blkcg: blk-throttle becomes properly hierarchical.
				295	*
				296	* - debug: disallowed on the default hierarchy.
				297	*/
				298	static bool cgroup_on_dfl(const struct cgroup *cgrp)
				299	{
					/* on the default hierarchy iff @cgrp's root is cgrp_dfl_root itself */
				300	return cgrp->root == &cgrp_dfl_root;
				301	}
				302
				303	/* IDR wrappers which synchronize using cgroup_idr_lock */
				304	static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
				305	gfp_t gfp_mask)
				306	{
				307	int ret;
				308
					/*
					* Preload with the caller's full @gfp_mask before taking the lock;
					* the allocation made under the lock has __GFP_DIRECT_RECLAIM
					* masked off.
					*/
				309	idr_preload(gfp_mask);
				310	spin_lock_bh(&cgroup_idr_lock);
				311	ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_DIRECT_RECLAIM);
				312	spin_unlock_bh(&cgroup_idr_lock);
				313	idr_preload_end();
				314	return ret;
				315	}
				316
				317	static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id)
				318	{
				319	void *ret;
				320
					/* replace the pointer stored at @id under cgroup_idr_lock */
				321	spin_lock_bh(&cgroup_idr_lock);
				322	ret = idr_replace(idr, ptr, id);
				323	spin_unlock_bh(&cgroup_idr_lock);
					/* hand idr_replace()'s result back to the caller unchanged */
				324	return ret;
				325	}
				326
				327	static void cgroup_idr_remove(struct idr *idr, int id)
				328	{
					/* release @id from @idr under cgroup_idr_lock */
				329	spin_lock_bh(&cgroup_idr_lock);
				330	idr_remove(idr, id);
				331	spin_unlock_bh(&cgroup_idr_lock);
				332	}
				333
				334	static struct cgroup *cgroup_parent(struct cgroup *cgrp)
				335	{
					/*
					* Return the cgroup embedding @cgrp->self.parent, or NULL when
					* @cgrp's self css has no parent.
					*/
				336	struct cgroup_subsys_state *parent_css = cgrp->self.parent;
				337
				338	if (parent_css)
				339	return container_of(parent_css, struct cgroup, self);
				340	return NULL;
				341	}
				342
				343	/**
				344	* cgroup_css - obtain a cgroup's css for the specified subsystem
				345	* @cgrp: the cgroup of interest
				346	* @ss: the subsystem of interest (%NULL returns @cgrp->self)
				347	*
				348	* Return @cgrp's css (cgroup_subsys_state) associated with @ss. This
				349	* function must be called either under cgroup_mutex or rcu_read_lock() and
				350	* the caller is responsible for pinning the returned css if it wants to
				351	* keep accessing it outside the said locks. This function may return
				352	* %NULL if @cgrp doesn't have @ss enabled.
				353	*/
				354	static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
				355	struct cgroup_subsys *ss)
				356	{
				357	if (ss)
				358	return rcu_dereference_check(cgrp->subsys[ss->id],
				359	lockdep_is_held(&cgroup_mutex));
				360	else
				361	return &cgrp->self;
				362	}
				363
				364	/**
				365	* cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
				366	* @cgrp: the cgroup of interest
				367	* @ss: the subsystem of interest (%NULL returns @cgrp->self)
				368	*
				369	* Similar to cgroup_css() but returns the effective css, which is defined
				370	* as the matching css of the nearest ancestor including self which has @ss
				371	* enabled. If @ss is associated with the hierarchy @cgrp is on, this
				372	* function is guaranteed to return non-NULL css.
					*
					* Returns %NULL if @ss's bit is not set in @cgrp->root->subsys_mask.
					* Must be called with cgroup_mutex held.
				373	*/
				374	static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
				375	struct cgroup_subsys *ss)
				376	{
				377	lockdep_assert_held(&cgroup_mutex);
				378
				379	if (!ss)
				380	return &cgrp->self;
				381
				382	if (!(cgrp->root->subsys_mask & (1 << ss->id)))
				383	return NULL;
				384
				385	/*
				386	* This function is used while updating css associations and thus
				387	* can't test the csses directly. Use ->child_subsys_mask.
				388	*/
				389	while (cgroup_parent(cgrp) &&
				390	!(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id)))
				391	cgrp = cgroup_parent(cgrp);
				392
				393	return cgroup_css(cgrp, ss);
				394	}
				395
				396	/**
				397	* cgroup_get_e_css - get a cgroup's effective css for the specified subsystem
				398	* @cgrp: the cgroup of interest
				399	* @ss: the subsystem of interest
				400	*
				401	* Find and get the effective css of @cgrp for @ss. The effective css is
				402	* defined as the matching css of the nearest ancestor including self which
				403	* has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on,
				404	* the root css is returned, so this function always returns a valid css.
				405	* The returned css must be put using css_put().
				406	*/
				407	struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,
				408	struct cgroup_subsys *ss)
				409	{
				410	struct cgroup_subsys_state *css;
				411
				412	rcu_read_lock();
				413
					/* walk from @cgrp towards the root until an online css can be pinned */
				414	do {
				415	css = cgroup_css(cgrp, ss);
				416
				417	if (css && css_tryget_online(css))
				418	goto out_unlock;
				419	cgrp = cgroup_parent(cgrp);
				420	} while (cgrp);
				421
					/* nothing found on the way up, fall back to init_css_set's css */
				422	css = init_css_set.subsys[ss->id];
				423	css_get(css);
				424	out_unlock:
				425	rcu_read_unlock();
				426	return css;
				427	}
				428
				429	/* a cgroup counts as dead once CSS_ONLINE is not set on its self css */
				430	static inline bool cgroup_is_dead(const struct cgroup *cgrp)
				431	{
				432	return !(cgrp->self.flags & CSS_ONLINE);
				433	}
				434
				435	static void cgroup_get(struct cgroup *cgrp)
				436	{
					/* take a ref on @cgrp's self css; warn once if @cgrp is already dead */
				437	WARN_ON_ONCE(cgroup_is_dead(cgrp));
				438	css_get(&cgrp->self);
				439	}
				440
				441	static bool cgroup_tryget(struct cgroup *cgrp)
				442	{
					/* forwards to css_tryget() on @cgrp's self css and returns its result */
				443	return css_tryget(&cgrp->self);
				444	}
				445
				446	static void cgroup_put(struct cgroup *cgrp)
				447	{
					/* drop a reference on @cgrp's self css */
				448	css_put(&cgrp->self);
				449	}
				450
				451	struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
				452	{
				453	struct cgroup *cgrp = of->kn->parent->priv;
				454	struct cftype *cft = of_cft(of);
				455
				456	/*
				457	* This is open and unprotected implementation of cgroup_css().
				458	* of_css() is only called from a kernfs file operation which has
				459	* an active reference on the file. Because all the subsystem
				460	* files are drained before a css is disassociated with a cgroup,
				461	* the matching css from the cgroup's subsys table is guaranteed to
				462	* be and stay valid until the enclosing operation is complete.
				463	*/
				464	if (cft->ss)
				465	return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
				466	else
				467	return &cgrp->self;
				468	}
				469	EXPORT_SYMBOL_GPL(of_css);
				470
				471	/**
				472	* cgroup_is_descendant - test ancestry
				473	* @cgrp: the cgroup to be tested
				474	* @ancestor: possible ancestor of @cgrp
				475	*
				476	* Test whether @cgrp is a descendant of @ancestor. It also returns %true
				477	* if @cgrp == @ancestor. This function is safe to call as long as @cgrp
				478	* and @ancestor are accessible.
				479	*/
				480	bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
				481	{
					/* climb the parent chain; reaching NULL means @ancestor was not on it */
				482	while (cgrp) {
				483	if (cgrp == ancestor)
				484	return true;
				485	cgrp = cgroup_parent(cgrp);
				486	}
				487	return false;
				488	}
				489
				490	static int notify_on_release(const struct cgroup *cgrp)
				491	{
					/* nonzero iff CGRP_NOTIFY_ON_RELEASE is set in @cgrp->flags */
				492	return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
				493	}
				494
				495	/**
				496	* for_each_css - iterate all css's of a cgroup
				497	* @css: the iteration cursor
				498	* @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
				499	* @cgrp: the target cgroup to iterate css's of
				500	*
				501	* Should be called under cgroup_mutex; each slot is read with
					* rcu_dereference_check() against that mutex. Slots of @cgrp->subsys[]
					* that are NULL are skipped, so the body only sees non-NULL @css.
				502	*/
				503	#define for_each_css(css, ssid, cgrp) \
				504	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
				505	if (!((css) = rcu_dereference_check( \
				506	(cgrp)->subsys[(ssid)], \
				507	lockdep_is_held(&cgroup_mutex)))) { } \
				508	else
				509
				510	/**
				511	* for_each_e_css - iterate all effective css's of a cgroup
				512	* @css: the iteration cursor
				513	* @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
				514	* @cgrp: the target cgroup to iterate css's of
				515	*
				516	* Should be called under cgroup_mutex; cgroup_e_css(), which is used for
					* every lookup, asserts that it is held. Subsystems for which
					* cgroup_e_css() returns NULL are skipped.
				517	*/
				518	#define for_each_e_css(css, ssid, cgrp) \
				519	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
				520	if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \
				521	; \
				522	else
				523
				524	/**
				525	* for_each_subsys - iterate all enabled cgroup subsystems
				526	* @ss: the iteration cursor
				527	* @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
					*
					* @ss is assigned from cgroup_subsys[@ssid] inside the loop condition; the
					* "|| true" keeps the loop running whatever the assigned pointer is.
				528	*/
				529	#define for_each_subsys(ss, ssid) \
				530	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \
				531	(((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
				532
				533	/**
				534	* for_each_subsys_which - filter for_each_subsys with a bitmask
				535	* @ss: the iteration cursor
				536	* @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
				537	* @ss_maskp: a pointer to the bitmask
				538	*
				539	* The block will only run for cases where the ssid-th bit (1 << ssid) of
				540	* mask is set to 1.
					*
					* The "&& false" test exists only to assign @ss: the condition never
					* holds, so the break is never taken and the caller's block runs as the
					* else branch.
				541	*/
				542	#define for_each_subsys_which(ss, ssid, ss_maskp) \
				543	if (!CGROUP_SUBSYS_COUNT) /* to avoid spurious gcc warning */ \
				544	(ssid) = 0; \
				545	else \
				546	for_each_set_bit(ssid, ss_maskp, CGROUP_SUBSYS_COUNT) \
				547	if (((ss) = cgroup_subsys[ssid]) && false) \
				548	break; \
				549	else
				550
				551	/* iterate across the hierarchies */
				552	#define for_each_root(root) \
				553	list_for_each_entry((root), &cgroup_roots, root_list)
				554
				555	/*
					* Iterate over the children of @cgrp, skipping every child for which
					* cgroup_is_dead() is true. cgroup_mutex must be held throughout the
					* iteration; lockdep_assert_held() is evaluated on each step.
					*/
				556	#define cgroup_for_each_live_child(child, cgrp) \
				557	list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
				558	if (({ lockdep_assert_held(&cgroup_mutex); \
				559	cgroup_is_dead(child); })) \
				560	; \
				561	else
				562
				563	static void cgroup_release_agent(struct work_struct *work);
				564	static void check_for_release(struct cgroup *cgrp);
				565
				566	/*
				567	* A cgroup can be associated with multiple css_sets as different tasks may
				568	* belong to different cgroups on different hierarchies. In the other
				569	* direction, a css_set is naturally associated with multiple cgroups.
				570	* This M:N relationship is represented by the following link structure
				571	* which exists for each association and allows traversing the associations
				572	* from both sides.
				573	*/
				574	struct cgrp_cset_link {
				575	/* the cgroup and css_set this link associates */
				576	struct cgroup *cgrp;
				577	struct css_set *cset;
				578
				579	/* list of cgrp_cset_links anchored at cgrp->cset_links */
				580	struct list_head cset_link;
				581
				582	/* list of cgrp_cset_links anchored at css_set->cgrp_links */
				583	struct list_head cgrp_link;
				584	};
				585
				586	/*
				587	* The default css_set - used by init and its children prior to any
				588	* hierarchies being mounted. It contains a pointer to the root state
				589	* for each subsystem. Also used to anchor the list of css_sets. Not
				590	* reference-counted, to improve performance when child cgroups
				591	* haven't been created.
				592	*/
				593	struct css_set init_css_set = {
				594	.refcount = ATOMIC_INIT(1),
				595	.cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
				596	.tasks = LIST_HEAD_INIT(init_css_set.tasks),
				597	.mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
				598	.mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
				599	.mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
				600	.task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
				601	};
				602
				603	static int css_set_count = 1; /* 1 for init_css_set */
				604
				605	/**
				606	* css_set_populated - does a css_set contain any tasks?
				607	* @cset: target css_set
					*
					* Returns %true if either @cset->tasks or @cset->mg_tasks is non-empty.
					* Must be called with css_set_lock held.
				608	*/
				609	static bool css_set_populated(struct css_set *cset)
				610	{
				611	lockdep_assert_held(&css_set_lock);
				612
				613	return !list_empty(&cset->tasks) || !list_empty(&cset->mg_tasks);
				614	}
				615
				616	/**
				617	* cgroup_update_populated - updated populated count of a cgroup
				618	* @cgrp: the target cgroup
				619	* @populated: inc or dec populated count
				620	*
				621	* One of the css_sets associated with @cgrp is either getting its first
				622	* task or losing the last. Update @cgrp->populated_cnt accordingly. The
				623	* count is propagated towards root so that a given cgroup's populated_cnt
				624	* is zero iff the cgroup and all its descendants don't contain any tasks.
				625	*
				626	* @cgrp's interface file "cgroup.populated" is zero if
				627	* @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt
				628	* changes from or to zero, userland is notified that the content of the
				629	* interface file has changed. This can be used to detect when @cgrp and
				630	* its descendants become populated or empty.
					*
					* NOTE(review): the notification below is sent on @cgrp->events_file;
					* confirm that this is the file the paragraph above calls
					* "cgroup.populated".
					*
					* Must be called with css_set_lock held.
				631	*/
				632	static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
				633	{
				634	lockdep_assert_held(&css_set_lock);
				635
				636	do {
				637	bool trigger;
				638
					/* trigger is set only when the count leaves or reaches zero */
				639	if (populated)
				640	trigger = !cgrp->populated_cnt++;
				641	else
				642	trigger = !--cgrp->populated_cnt;
				643
					/* no zero crossing here, so stop walking towards the root */
				644	if (!trigger)
				645	break;
				646
				647	check_for_release(cgrp);
				648	cgroup_file_notify(&cgrp->events_file);
				649
				650	cgrp = cgroup_parent(cgrp);
				651	} while (cgrp);
				652	}
				653
				654	/**
				655	* css_set_update_populated - update populated state of a css_set
				656	* @cset: target css_set
				657	* @populated: whether @cset is populated or depopulated
				658	*
				659	* @cset is either getting the first task or losing the last. Update the
				660	* ->populated_cnt of all associated cgroups accordingly.
					*
					* Must be called with css_set_lock held.
				661	*/
				662	static void css_set_update_populated(struct css_set *cset, bool populated)
				663	{
				664	struct cgrp_cset_link *link;
				665
				666	lockdep_assert_held(&css_set_lock);
				667
					/* one cgroup_update_populated() call per cgroup linked to @cset */
				668	list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
				669	cgroup_update_populated(link->cgrp, populated);
				670	}
				671
				672	/**
				673	* css_set_move_task - move a task from one css_set to another
				674	* @task: task being moved
				675	* @from_cset: css_set @task currently belongs to (may be NULL)
				676	* @to_cset: new css_set @task is being moved to (may be NULL)
				677	* @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks
				678	*
				679	* Move @task from @from_cset to @to_cset. If @task didn't belong to any
				680	* css_set, @from_cset can be NULL. If @task is being disassociated
				681	* instead of moved, @to_cset can be NULL.
				682	*
				683	* This function automatically handles populated_cnt updates and
				684	* css_task_iter adjustments but the caller is responsible for managing
				685	* @from_cset and @to_cset's reference counts.
					*
					* Must be called with css_set_lock held.
				686	*/
				687	static void css_set_move_task(struct task_struct *task,
				688	struct css_set *from_cset, struct css_set *to_cset,
				689	bool use_mg_tasks)
				690	{
				691	lockdep_assert_held(&css_set_lock);
				692
				693	if (from_cset) {
				694	struct css_task_iter *it, *pos;
				695
				696	WARN_ON_ONCE(list_empty(&task->cg_list));
				697
				698	/*
				699	* @task is leaving, advance task iterators which are
				700	* pointing to it so that they can resume at the next
				701	* position. Advancing an iterator might remove it from
				702	* the list, use safe walk. See css_task_iter_advance*()
				703	* for details.
				704	*/
				705	list_for_each_entry_safe(it, pos, &from_cset->task_iters,
				706	iters_node)
				707	if (it->task_pos == &task->cg_list)
				708	css_task_iter_advance(it);
				709
				710	list_del_init(&task->cg_list);
				711	if (!css_set_populated(from_cset))
				712	css_set_update_populated(from_cset, false);
				713	} else {
				714	WARN_ON_ONCE(!list_empty(&task->cg_list));
				715	}
				716
				717	if (to_cset) {
				718	/*
				719	* We are synchronized through cgroup_threadgroup_rwsem
				720	* against PF_EXITING setting such that we can't race
				721	* against cgroup_exit() changing the css_set to
				722	* init_css_set and dropping the old one.
				723	*/
				724	WARN_ON_ONCE(task->flags & PF_EXITING);
				725
				726	if (!css_set_populated(to_cset))
				727	css_set_update_populated(to_cset, true);
				728	rcu_assign_pointer(task->cgroups, to_cset);
				729	list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
				730	&to_cset->tasks);
				731	}
				732	}
				733
				734	/*
				735	* hash table for css_sets. This improves the performance of finding
				736	* an existing css_set. This hash doesn't (currently) take into
				737	* account cgroups in empty hierarchies.
				738	*/
				739	#define CSS_SET_HASH_BITS 7
				740	static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
				741
				742	static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
				743	{
				744	unsigned long key = 0UL;
				745	struct cgroup_subsys *ss;
				746	int i;
				747
					/* sum the css pointer values over every subsystem slot of @css */
				748	for_each_subsys(ss, i)
				749	key += (unsigned long)css[i];
					/* fold the bits above bit 15 back into the low part of the key */
				750	key = (key >> 16) ^ key;
				751
				752	return key;
				753	}
				754
				755	static void put_css_set_locked(struct css_set *cset)
				756	{
					/*
					* Drop one reference on @cset with css_set_lock held. When the
					* last reference goes away, @cset is unlinked and freed after an
					* RCU grace period.
					*/
				757	struct cgrp_cset_link *link, *tmp_link;
				758	struct cgroup_subsys *ss;
				759	int ssid;
				760
				761	lockdep_assert_held(&css_set_lock);
				762
				763	if (!atomic_dec_and_test(&cset->refcount))
				764	return;
				765
				766	/* This css_set is dead. unlink it and release cgroup and css refs */
				767	for_each_subsys(ss, ssid) {
				768	list_del(&cset->e_cset_node[ssid]);
				769	css_put(cset->subsys[ssid]);
				770	}
				771	hash_del(&cset->hlist);
				772	css_set_count--;
				773
				774	list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
				775	list_del(&link->cset_link);
				776	list_del(&link->cgrp_link);
					/* only cgroups that have a parent get a cgroup_put() here */
				777	if (cgroup_parent(link->cgrp))
				778	cgroup_put(link->cgrp);
				779	kfree(link);
				780	}
				781
				782	kfree_rcu(cset, rcu_head);
				783	}
				784
				785	static void put_css_set(struct css_set *cset)
				786	{
				787	/*
				788	* Ensure that the refcount doesn't hit zero while any readers
				789	* can see it. Similar to atomic_dec_and_lock(), but css_set_lock
				790	* has to be taken with spin_lock_bh(): the decrement is done
					* locklessly unless the count is 1, in which case the final put
					* happens in put_css_set_locked() under the lock.
				791	*/
				792	if (atomic_add_unless(&cset->refcount, -1, 1))
				793	return;
				794
				795	spin_lock_bh(&css_set_lock);
				796	put_css_set_locked(cset);
				797	spin_unlock_bh(&css_set_lock);
				798	}
				799
				800	/*
				801	* Take a reference on @cset by incrementing its refcount; the matching
					* put is put_css_set() or put_css_set_locked().
				802	*/
				803	static inline void get_css_set(struct css_set *cset)
				804	{
				805	atomic_inc(&cset->refcount);
				806	}
				807
				808	/**
				809	* compare_css_sets - helper function for find_existing_css_set().
				810	* @cset: candidate css_set being tested
				811	* @old_cset: existing css_set for a task
				812	* @new_cgrp: cgroup that's being entered by the task
				813	* @template: desired set of css pointers in css_set (pre-calculated)
				814	*
				815	* Returns true if "cset" matches "old_cset" except for the hierarchy
				816	* which "new_cgrp" belongs to, for which it should match "new_cgrp".
				817	*/
				818	static bool compare_css_sets(struct css_set *cset,
				819	struct css_set *old_cset,
				820	struct cgroup *new_cgrp,
				821	struct cgroup_subsys_state *template[])
				822	{
				823	struct list_head *l1, *l2;
				824
				825	/*
				826	* On the default hierarchy, there can be csets which are
				827	* associated with the same set of cgroups but different csses.
				828	* Let's first ensure that csses match.
				829	*/
				830	if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
				831	return false;
				832
				833	/*
				834	* Compare cgroup pointers in order to distinguish between
				835	* different cgroups in hierarchies. As different cgroups may
				836	* share the same effective css, this comparison is always
				837	* necessary.
				838	*/
				839	l1 = &cset->cgrp_links;
				840	l2 = &old_cset->cgrp_links;
				841	while (1) {
				842	struct cgrp_cset_link *link1, *link2;
				843	struct cgroup *cgrp1, *cgrp2;
				844
				845	l1 = l1->next;
				846	l2 = l2->next;
				847	/* See if we reached the end - both lists are equal length. */
				848	if (l1 == &cset->cgrp_links) {
				849	BUG_ON(l2 != &old_cset->cgrp_links);
				850	break;
				851	} else {
				852	BUG_ON(l2 == &old_cset->cgrp_links);
				853	}
				854	/* Locate the cgroups associated with these links. */
				855	link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
				856	link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
				857	cgrp1 = link1->cgrp;
				858	cgrp2 = link2->cgrp;
				859	/* Hierarchies should be linked in the same order. */
				860	BUG_ON(cgrp1->root != cgrp2->root);
				861
				862	/*
				863	* If this hierarchy is the hierarchy of the cgroup
				864	* that's changing, then we need to check that this
				865	* css_set points to the new cgroup; if it's any other
				866	* hierarchy, then this css_set should point to the
				867	* same cgroup as the old css_set.
				868	*/
				869	if (cgrp1->root == new_cgrp->root) {
				870	if (cgrp1 != new_cgrp)
				871	return false;
				872	} else {
				873	if (cgrp1 != cgrp2)
				874	return false;
				875	}
				876	}
				877	return true;
				878	}
				879
				880	/**
				881	* find_existing_css_set - init css array and find the matching css_set
				882	* @old_cset: the css_set that we're using before the cgroup transition
				883	* @cgrp: the cgroup that we're moving into
				884	* @template: out param for the new set of csses, should be clear on entry
				885	*/
				886	static struct css_set find_existing_css_set(struct css_set old_cset,
				887	struct cgroup *cgrp,
				888	struct cgroup_subsys_state *template[])
				889	{
				890	struct cgroup_root *root = cgrp->root;
				891	struct cgroup_subsys *ss;
				892	struct css_set *cset;
				893	unsigned long key;
				894	int i;
				895
				896	/*
				897	* Build the set of subsystem state objects that we want to see in the
				898	* new css_set. while subsystems can change globally, the entries here
				899	* won't change, so no need for locking.
				900	*/
				901	for_each_subsys(ss, i) {
				902	if (root->subsys_mask & (1UL << i)) {
				903	/*
				904	* @ss is in this hierarchy, so we want the
				905	* effective css from @cgrp.
				906	*/
				907	template[i] = cgroup_e_css(cgrp, ss);
				908	} else {
				909	/*
				910	* @ss is not in this hierarchy, so we don't want
				911	* to change the css.
				912	*/
				913	template[i] = old_cset->subsys[i];
				914	}
				915	}
				916
				917	key = css_set_hash(template);
				918	hash_for_each_possible(css_set_table, cset, hlist, key) {
				919	if (!compare_css_sets(cset, old_cset, cgrp, template))
				920	continue;
				921
				922	/* This css_set matches what we need */
				923	return cset;
				924	}
				925
				926	/* No existing cgroup group matched */
				927	return NULL;
				928	}
				929
				930	static void free_cgrp_cset_links(struct list_head *links_to_free)
				931	{
				932	struct cgrp_cset_link link, tmp_link;
				933
				934	list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
				935	list_del(&link->cset_link);
				936	kfree(link);
				937	}
				938	}
				939
				940	/**
				941	* allocate_cgrp_cset_links - allocate cgrp_cset_links
				942	* @count: the number of links to allocate
				943	* @tmp_links: list_head the allocated links are put on
				944	*
				945	* Allocate @count cgrp_cset_link structures and chain them on @tmp_links
				946	* through ->cset_link. Returns 0 on success or -errno.
				947	*/
				948	static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
				949	{
				950	struct cgrp_cset_link *link;
				951	int i;
				952
				953	INIT_LIST_HEAD(tmp_links);
				954
				955	for (i = 0; i < count; i++) {
				956	link = kzalloc(sizeof(*link), GFP_KERNEL);
				957	if (!link) {
				958	free_cgrp_cset_links(tmp_links);
				959	return -ENOMEM;
				960	}
				961	list_add(&link->cset_link, tmp_links);
				962	}
				963	return 0;
				964	}
				965
				966	/**
				967	* link_css_set - a helper function to link a css_set to a cgroup
				968	* @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()
				969	* @cset: the css_set to be linked
				970	* @cgrp: the destination cgroup
				971	*/
				972	static void link_css_set(struct list_head tmp_links, struct css_set cset,
				973	struct cgroup *cgrp)
				974	{
				975	struct cgrp_cset_link *link;
				976
				977	BUG_ON(list_empty(tmp_links));
				978
				979	if (cgroup_on_dfl(cgrp))
				980	cset->dfl_cgrp = cgrp;
				981
				982	link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
				983	link->cset = cset;
				984	link->cgrp = cgrp;
				985
				986	/*
				987	* Always add links to the tail of the lists so that the lists are
				988	* in choronological order.
				989	*/
				990	list_move_tail(&link->cset_link, &cgrp->cset_links);
				991	list_add_tail(&link->cgrp_link, &cset->cgrp_links);
				992
				993	if (cgroup_parent(cgrp))
				994	cgroup_get(cgrp);
				995	}
				996
				997	/**
				998	* find_css_set - return a new css_set with one cgroup updated
				999	* @old_cset: the baseline css_set
				1000	* @cgrp: the cgroup to be updated
				1001	*
				1002	* Return a new css_set that's equivalent to @old_cset, but with @cgrp
				1003	* substituted into the appropriate hierarchy.
				1004	*/
				1005	static struct css_set find_css_set(struct css_set old_cset,
				1006	struct cgroup *cgrp)
				1007	{
				1008	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
				1009	struct css_set *cset;
				1010	struct list_head tmp_links;
				1011	struct cgrp_cset_link *link;
				1012	struct cgroup_subsys *ss;
				1013	unsigned long key;
				1014	int ssid;
				1015
				1016	lockdep_assert_held(&cgroup_mutex);
				1017
				1018	/* First see if we already have a cgroup group that matches
				1019	* the desired set */
				1020	spin_lock_bh(&css_set_lock);
				1021	cset = find_existing_css_set(old_cset, cgrp, template);
				1022	if (cset)
				1023	get_css_set(cset);
				1024	spin_unlock_bh(&css_set_lock);
				1025
				1026	if (cset)
				1027	return cset;
				1028
				1029	cset = kzalloc(sizeof(*cset), GFP_KERNEL);
				1030	if (!cset)
				1031	return NULL;
				1032
				1033	/* Allocate all the cgrp_cset_link objects that we'll need */
				1034	if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
				1035	kfree(cset);
				1036	return NULL;
				1037	}
				1038
				1039	atomic_set(&cset->refcount, 1);
				1040	INIT_LIST_HEAD(&cset->cgrp_links);
				1041	INIT_LIST_HEAD(&cset->tasks);
				1042	INIT_LIST_HEAD(&cset->mg_tasks);
				1043	INIT_LIST_HEAD(&cset->mg_preload_node);
				1044	INIT_LIST_HEAD(&cset->mg_node);
				1045	INIT_LIST_HEAD(&cset->task_iters);
				1046	INIT_HLIST_NODE(&cset->hlist);
				1047
				1048	/* Copy the set of subsystem state objects generated in
				1049	* find_existing_css_set() */
				1050	memcpy(cset->subsys, template, sizeof(cset->subsys));
				1051
				1052	spin_lock_bh(&css_set_lock);
				1053	/* Add reference counts and links from the new css_set. */
				1054	list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
				1055	struct cgroup *c = link->cgrp;
				1056
				1057	if (c->root == cgrp->root)
				1058	c = cgrp;
				1059	link_css_set(&tmp_links, cset, c);
				1060	}
				1061
				1062	BUG_ON(!list_empty(&tmp_links));
				1063
				1064	css_set_count++;
				1065
				1066	/* Add @cset to the hash table */
				1067	key = css_set_hash(cset->subsys);
				1068	hash_add(css_set_table, &cset->hlist, key);
				1069
				1070	for_each_subsys(ss, ssid) {
				1071	struct cgroup_subsys_state *css = cset->subsys[ssid];
				1072
				1073	list_add_tail(&cset->e_cset_node[ssid],
				1074	&css->cgroup->e_csets[ssid]);
				1075	css_get(css);
				1076	}
				1077
				1078	spin_unlock_bh(&css_set_lock);
				1079
				1080	return cset;
				1081	}
				1082
				1083	static struct cgroup_root cgroup_root_from_kf(struct kernfs_root kf_root)
				1084	{
				1085	struct cgroup *root_cgrp = kf_root->kn->priv;
				1086
				1087	return root_cgrp->root;
				1088	}
				1089
				1090	static int cgroup_init_root_id(struct cgroup_root *root)
				1091	{
				1092	int id;
				1093
				1094	lockdep_assert_held(&cgroup_mutex);
				1095
				1096	id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL);
				1097	if (id < 0)
				1098	return id;
				1099
				1100	root->hierarchy_id = id;
				1101	return 0;
				1102	}
				1103
				1104	static void cgroup_exit_root_id(struct cgroup_root *root)
				1105	{
				1106	lockdep_assert_held(&cgroup_mutex);
				1107
				1108	if (root->hierarchy_id) {
				1109	idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
				1110	root->hierarchy_id = 0;
				1111	}
				1112	}
				1113
				1114	static void cgroup_free_root(struct cgroup_root *root)
				1115	{
				1116	if (root) {
				1117	/* hierarchy ID should already have been released */
				1118	WARN_ON_ONCE(root->hierarchy_id);
				1119
				1120	idr_destroy(&root->cgroup_idr);
				1121	kfree(root);
				1122	}
				1123	}
				1124
				1125	static void cgroup_destroy_root(struct cgroup_root *root)
				1126	{
				1127	struct cgroup *cgrp = &root->cgrp;
				1128	struct cgrp_cset_link link, tmp_link;
				1129
				1130	mutex_lock(&cgroup_mutex);
				1131
				1132	BUG_ON(atomic_read(&root->nr_cgrps));
				1133	BUG_ON(!list_empty(&cgrp->self.children));
				1134
				1135	/* Rebind all subsystems back to the default hierarchy */
				1136	rebind_subsystems(&cgrp_dfl_root, root->subsys_mask);
				1137
				1138	/*
				1139	* Release all the links from cset_links to this hierarchy's
				1140	* root cgroup
				1141	*/
				1142	spin_lock_bh(&css_set_lock);
				1143
				1144	list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
				1145	list_del(&link->cset_link);
				1146	list_del(&link->cgrp_link);
				1147	kfree(link);
				1148	}
				1149
				1150	spin_unlock_bh(&css_set_lock);
				1151
				1152	if (!list_empty(&root->root_list)) {
				1153	list_del(&root->root_list);
				1154	cgroup_root_count--;
				1155	}
				1156
				1157	cgroup_exit_root_id(root);
				1158
				1159	mutex_unlock(&cgroup_mutex);
				1160
				1161	kernfs_destroy_root(root->kf_root);
				1162	cgroup_free_root(root);
				1163	}
				1164
				1165	/* look up cgroup associated with given css_set on the specified hierarchy */
				1166	static struct cgroup cset_cgroup_from_root(struct css_set cset,
				1167	struct cgroup_root *root)
				1168	{
				1169	struct cgroup *res = NULL;
				1170
				1171	lockdep_assert_held(&cgroup_mutex);
				1172	lockdep_assert_held(&css_set_lock);
				1173
				1174	if (cset == &init_css_set) {
				1175	res = &root->cgrp;
				1176	} else {
				1177	struct cgrp_cset_link *link;
				1178
				1179	list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
				1180	struct cgroup *c = link->cgrp;
				1181
				1182	if (c->root == root) {
				1183	res = c;
				1184	break;
				1185	}
				1186	}
				1187	}
				1188
				1189	BUG_ON(!res);
				1190	return res;
				1191	}
				1192
				1193	/*
				1194	* Return the cgroup for "task" from the given hierarchy. Must be
				1195	* called with cgroup_mutex and css_set_lock held.
				1196	*/
				1197	static struct cgroup task_cgroup_from_root(struct task_struct task,
				1198	struct cgroup_root *root)
				1199	{
				1200	/*
				1201	* No need to lock the task - since we hold cgroup_mutex the
				1202	* task can't change groups, so the only thing that can happen
				1203	* is that it exits and its css is set back to init_css_set.
				1204	*/
				1205	return cset_cgroup_from_root(task_css_set(task), root);
				1206	}
				1207
				1208	/*
				1209	* A task must hold cgroup_mutex to modify cgroups.
				1210	*
				1211	* Any task can increment and decrement the count field without lock.
				1212	* So in general, code holding cgroup_mutex can't rely on the count
				1213	* field not changing. However, if the count goes to zero, then only
				1214	* cgroup_attach_task() can increment it again. Because a count of zero
				1215	* means that no tasks are currently attached, therefore there is no
				1216	* way a task attached to that cgroup can fork (the other way to
				1217	* increment the count). So code holding cgroup_mutex can safely
				1218	* assume that if the count is zero, it will stay zero. Similarly, if
				1219	* a task holds cgroup_mutex on a cgroup with zero count, it
				1220	* knows that the cgroup won't be removed, as cgroup_rmdir()
				1221	* needs that mutex.
				1222	*
				1223	* A cgroup can only be deleted if both its 'count' of using tasks
				1224	* is zero, and its list of 'children' cgroups is empty. Since all
				1225	* tasks in the system use _some_ cgroup, and since there is always at
				1226	* least one task in the system (init, pid == 1), therefore, root cgroup
				1227	* always has either children cgroups and/or using tasks. So we don't
				1228	* need a special hack to ensure that root cgroup cannot be deleted.
				1229	*
				1230	* P.S. One more locking exception. RCU is used to guard the
				1231	* update of a task's cgroup pointer by cgroup_attach_task()
				1232	*/
				1233
				1234	static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
				1235	static const struct file_operations proc_cgroupstats_operations;
				1236
				1237	static char cgroup_file_name(struct cgroup cgrp, const struct cftype *cft,
				1238	char *buf)
				1239	{
				1240	struct cgroup_subsys *ss = cft->ss;
				1241
				1242	if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
				1243	!(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
				1244	snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
				1245	cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
				1246	cft->name);
				1247	else
				1248	strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
				1249	return buf;
				1250	}
				1251
				1252	/**
				1253	* cgroup_file_mode - deduce file mode of a control file
				1254	* @cft: the control file in question
				1255	*
				1256	* S_IRUGO for read, S_IWUSR for write.
				1257	*/
				1258	static umode_t cgroup_file_mode(const struct cftype *cft)
				1259	{
				1260	umode_t mode = 0;
				1261
				1262	if (cft->read_u64 \|\| cft->read_s64 \|\| cft->seq_show)
				1263	mode \|= S_IRUGO;
				1264
				1265	if (cft->write_u64 \|\| cft->write_s64 \|\| cft->write) {
				1266	if (cft->flags & CFTYPE_WORLD_WRITABLE)
				1267	mode \|= S_IWUGO;
				1268	else
				1269	mode \|= S_IWUSR;
				1270	}
				1271
				1272	return mode;
				1273	}
				1274
				1275	/**
				1276	* cgroup_calc_child_subsys_mask - calculate child_subsys_mask
				1277	* @cgrp: the target cgroup
				1278	* @subtree_control: the new subtree_control mask to consider
				1279	*
				1280	* On the default hierarchy, a subsystem may request other subsystems to be
				1281	* enabled together through its ->depends_on mask. In such cases, more
				1282	* subsystems than specified in "cgroup.subtree_control" may be enabled.
				1283	*
				1284	* This function calculates which subsystems need to be enabled if
				1285	* @subtree_control is to be applied to @cgrp. The returned mask is always
				1286	* a superset of @subtree_control and follows the usual hierarchy rules.
				1287	*/
				1288	static unsigned long cgroup_calc_child_subsys_mask(struct cgroup *cgrp,
				1289	unsigned long subtree_control)
				1290	{
				1291	struct cgroup *parent = cgroup_parent(cgrp);
				1292	unsigned long cur_ss_mask = subtree_control;
				1293	struct cgroup_subsys *ss;
				1294	int ssid;
				1295
				1296	lockdep_assert_held(&cgroup_mutex);
				1297
				1298	if (!cgroup_on_dfl(cgrp))
				1299	return cur_ss_mask;
				1300
				1301	while (true) {
				1302	unsigned long new_ss_mask = cur_ss_mask;
				1303
				1304	for_each_subsys_which(ss, ssid, &cur_ss_mask)
				1305	new_ss_mask \|= ss->depends_on;
				1306
				1307	/*
				1308	* Mask out subsystems which aren't available. This can
				1309	* happen only if some depended-upon subsystems were bound
				1310	* to non-default hierarchies.
				1311	*/
				1312	if (parent)
				1313	new_ss_mask &= parent->child_subsys_mask;
				1314	else
				1315	new_ss_mask &= cgrp->root->subsys_mask;
				1316
				1317	if (new_ss_mask == cur_ss_mask)
				1318	break;
				1319	cur_ss_mask = new_ss_mask;
				1320	}
				1321
				1322	return cur_ss_mask;
				1323	}
				1324
				1325	/**
				1326	* cgroup_refresh_child_subsys_mask - update child_subsys_mask
				1327	* @cgrp: the target cgroup
				1328	*
				1329	* Update @cgrp->child_subsys_mask according to the current
				1330	* @cgrp->subtree_control using cgroup_calc_child_subsys_mask().
				1331	*/
				1332	static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp)
				1333	{
				1334	cgrp->child_subsys_mask =
				1335	cgroup_calc_child_subsys_mask(cgrp, cgrp->subtree_control);
				1336	}
				1337
				1338	/**
				1339	* cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
				1340	* @kn: the kernfs_node being serviced
				1341	*
				1342	* This helper undoes cgroup_kn_lock_live() and should be invoked before
				1343	* the method finishes if locking succeeded. Note that once this function
				1344	* returns the cgroup returned by cgroup_kn_lock_live() may become
				1345	* inaccessible any time. If the caller intends to continue to access the
				1346	* cgroup, it should pin it before invoking this function.
				1347	*/
				1348	static void cgroup_kn_unlock(struct kernfs_node *kn)
				1349	{
				1350	struct cgroup *cgrp;
				1351
				1352	if (kernfs_type(kn) == KERNFS_DIR)
				1353	cgrp = kn->priv;
				1354	else
				1355	cgrp = kn->parent->priv;
				1356
				1357	mutex_unlock(&cgroup_mutex);
				1358
				1359	kernfs_unbreak_active_protection(kn);
				1360	cgroup_put(cgrp);
				1361	}
				1362
				1363	/**
				1364	* cgroup_kn_lock_live - locking helper for cgroup kernfs methods
				1365	* @kn: the kernfs_node being serviced
				1366	*
				1367	* This helper is to be used by a cgroup kernfs method currently servicing
				1368	* @kn. It breaks the active protection, performs cgroup locking and
				1369	* verifies that the associated cgroup is alive. Returns the cgroup if
				1370	* alive; otherwise, %NULL. A successful return should be undone by a
				1371	* matching cgroup_kn_unlock() invocation.
				1372	*
				1373	* Any cgroup kernfs method implementation which requires locking the
				1374	* associated cgroup should use this helper. It avoids nesting cgroup
				1375	* locking under kernfs active protection and allows all kernfs operations
				1376	* including self-removal.
				1377	*/
				1378	static struct cgroup cgroup_kn_lock_live(struct kernfs_node kn)
				1379	{
				1380	struct cgroup *cgrp;
				1381
				1382	if (kernfs_type(kn) == KERNFS_DIR)
				1383	cgrp = kn->priv;
				1384	else
				1385	cgrp = kn->parent->priv;
				1386
				1387	/*
				1388	* We're gonna grab cgroup_mutex which nests outside kernfs
				1389	* active_ref. cgroup liveliness check alone provides enough
				1390	* protection against removal. Ensure @cgrp stays accessible and
				1391	* break the active_ref protection.
				1392	*/
				1393	if (!cgroup_tryget(cgrp))
				1394	return NULL;
				1395	kernfs_break_active_protection(kn);
				1396
				1397	mutex_lock(&cgroup_mutex);
				1398
				1399	if (!cgroup_is_dead(cgrp))
				1400	return cgrp;
				1401
				1402	cgroup_kn_unlock(kn);
				1403	return NULL;
				1404	}
				1405
				1406	static void cgroup_rm_file(struct cgroup cgrp, const struct cftype cft)
				1407	{
				1408	char name[CGROUP_FILE_NAME_MAX];
				1409
				1410	lockdep_assert_held(&cgroup_mutex);
				1411
				1412	if (cft->file_offset) {
				1413	struct cgroup_subsys_state *css = cgroup_css(cgrp, cft->ss);
				1414	struct cgroup_file cfile = (void )css + cft->file_offset;
				1415
				1416	spin_lock_irq(&cgroup_file_kn_lock);
				1417	cfile->kn = NULL;
				1418	spin_unlock_irq(&cgroup_file_kn_lock);
				1419	}
				1420
				1421	kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
				1422	}
				1423
				1424	/**
				1425	* css_clear_dir - remove subsys files in a cgroup directory
				1426	* @css: taget css
				1427	* @cgrp_override: specify if target cgroup is different from css->cgroup
				1428	*/
				1429	static void css_clear_dir(struct cgroup_subsys_state *css,
				1430	struct cgroup *cgrp_override)
				1431	{
				1432	struct cgroup *cgrp = cgrp_override ?: css->cgroup;
				1433	struct cftype *cfts;
				1434
				1435	list_for_each_entry(cfts, &css->ss->cfts, node)
				1436	cgroup_addrm_files(css, cgrp, cfts, false);
				1437	}
				1438
				1439	/**
				1440	* css_populate_dir - create subsys files in a cgroup directory
				1441	* @css: target css
				1442	* @cgrp_overried: specify if target cgroup is different from css->cgroup
				1443	*
				1444	* On failure, no file is added.
				1445	*/
				1446	static int css_populate_dir(struct cgroup_subsys_state *css,
				1447	struct cgroup *cgrp_override)
				1448	{
				1449	struct cgroup *cgrp = cgrp_override ?: css->cgroup;
				1450	struct cftype cfts, failed_cfts;
				1451	int ret;
				1452
				1453	if (!css->ss) {
				1454	if (cgroup_on_dfl(cgrp))
				1455	cfts = cgroup_dfl_base_files;
				1456	else
				1457	cfts = cgroup_legacy_base_files;
				1458
				1459	return cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
				1460	}
				1461
				1462	list_for_each_entry(cfts, &css->ss->cfts, node) {
				1463	ret = cgroup_addrm_files(css, cgrp, cfts, true);
				1464	if (ret < 0) {
				1465	failed_cfts = cfts;
				1466	goto err;
				1467	}
				1468	}
				1469	return 0;
				1470	err:
				1471	list_for_each_entry(cfts, &css->ss->cfts, node) {
				1472	if (cfts == failed_cfts)
				1473	break;
				1474	cgroup_addrm_files(css, cgrp, cfts, false);
				1475	}
				1476	return ret;
				1477	}
				1478
				1479	static int rebind_subsystems(struct cgroup_root *dst_root,
				1480	unsigned long ss_mask)
				1481	{
				1482	struct cgroup *dcgrp = &dst_root->cgrp;
				1483	struct cgroup_subsys *ss;
				1484	unsigned long tmp_ss_mask;
				1485	int ssid, i, ret;
				1486
				1487	lockdep_assert_held(&cgroup_mutex);
				1488
				1489	for_each_subsys_which(ss, ssid, &ss_mask) {
				1490	/* if @ss has non-root csses attached to it, can't move */
				1491	if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)))
				1492	return -EBUSY;
				1493
				1494	/* can't move between two non-dummy roots either */
				1495	if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
				1496	return -EBUSY;
				1497	}
				1498
				1499	/* skip creating root files on dfl_root for inhibited subsystems */
				1500	tmp_ss_mask = ss_mask;
				1501	if (dst_root == &cgrp_dfl_root)
				1502	tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask;
				1503
				1504	for_each_subsys_which(ss, ssid, &tmp_ss_mask) {
				1505	struct cgroup *scgrp = &ss->root->cgrp;
				1506	int tssid;
				1507
				1508	ret = css_populate_dir(cgroup_css(scgrp, ss), dcgrp);
				1509	if (!ret)
				1510	continue;
				1511
				1512	/*
				1513	* Rebinding back to the default root is not allowed to
				1514	* fail. Using both default and non-default roots should
				1515	* be rare. Moving subsystems back and forth even more so.
				1516	* Just warn about it and continue.
				1517	*/
				1518	if (dst_root == &cgrp_dfl_root) {
				1519	if (cgrp_dfl_root_visible) {
				1520	pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n",
				1521	ret, ss_mask);
				1522	pr_warn("you may retry by moving them to a different hierarchy and unbinding\n");
				1523	}
				1524	continue;
				1525	}
				1526
				1527	for_each_subsys_which(ss, tssid, &tmp_ss_mask) {
				1528	if (tssid == ssid)
				1529	break;
				1530	css_clear_dir(cgroup_css(scgrp, ss), dcgrp);
				1531	}
				1532	return ret;
				1533	}
				1534
				1535	/*
				1536	* Nothing can fail from this point on. Remove files for the
				1537	* removed subsystems and rebind each subsystem.
				1538	*/
				1539	for_each_subsys_which(ss, ssid, &ss_mask) {
				1540	struct cgroup_root *src_root = ss->root;
				1541	struct cgroup *scgrp = &src_root->cgrp;
				1542	struct cgroup_subsys_state *css = cgroup_css(scgrp, ss);
				1543	struct css_set *cset;
				1544
				1545	WARN_ON(!css \|\| cgroup_css(dcgrp, ss));
				1546
				1547	css_clear_dir(css, NULL);
				1548
				1549	RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
				1550	rcu_assign_pointer(dcgrp->subsys[ssid], css);
				1551	ss->root = dst_root;
				1552	css->cgroup = dcgrp;
				1553
				1554	spin_lock_bh(&css_set_lock);
				1555	hash_for_each(css_set_table, i, cset, hlist)
				1556	list_move_tail(&cset->e_cset_node[ss->id],
				1557	&dcgrp->e_csets[ss->id]);
				1558	spin_unlock_bh(&css_set_lock);
				1559
				1560	src_root->subsys_mask &= ~(1 << ssid);
				1561	scgrp->subtree_control &= ~(1 << ssid);
				1562	cgroup_refresh_child_subsys_mask(scgrp);
				1563
				1564	/* default hierarchy doesn't enable controllers by default */
				1565	dst_root->subsys_mask \|= 1 << ssid;
				1566	if (dst_root == &cgrp_dfl_root) {
				1567	static_branch_enable(cgroup_subsys_on_dfl_key[ssid]);
				1568	} else {
				1569	dcgrp->subtree_control \|= 1 << ssid;
				1570	cgroup_refresh_child_subsys_mask(dcgrp);
				1571	static_branch_disable(cgroup_subsys_on_dfl_key[ssid]);
				1572	}
				1573
				1574	if (ss->bind)
				1575	ss->bind(css);
				1576	}
				1577
				1578	kernfs_activate(dcgrp->kn);
				1579	return 0;
				1580	}
				1581
				1582	static int cgroup_show_options(struct seq_file *seq,
				1583	struct kernfs_root *kf_root)
				1584	{
				1585	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
				1586	struct cgroup_subsys *ss;
				1587	int ssid;
				1588
				1589	if (root != &cgrp_dfl_root)
				1590	for_each_subsys(ss, ssid)
				1591	if (root->subsys_mask & (1 << ssid))
				1592	seq_show_option(seq, ss->legacy_name, NULL);
				1593	if (root->flags & CGRP_ROOT_NOPREFIX)
				1594	seq_puts(seq, ",noprefix");
				1595	if (root->flags & CGRP_ROOT_XATTR)
				1596	seq_puts(seq, ",xattr");
				1597
				1598	spin_lock(&release_agent_path_lock);
				1599	if (strlen(root->release_agent_path))
				1600	seq_show_option(seq, "release_agent",
				1601	root->release_agent_path);
				1602	spin_unlock(&release_agent_path_lock);
				1603
				1604	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
				1605	seq_puts(seq, ",clone_children");
				1606	if (strlen(root->name))
				1607	seq_show_option(seq, "name", root->name);
				1608	return 0;
				1609	}
				1610
				1611	struct cgroup_sb_opts {
				1612	unsigned long subsys_mask;
				1613	unsigned int flags;
				1614	char *release_agent;
				1615	bool cpuset_clone_children;
				1616	char *name;
				1617	/* User explicitly requested empty subsystem */
				1618	bool none;
				1619	};
				1620
				1621	static int parse_cgroupfs_options(char data, struct cgroup_sb_opts opts)
				1622	{
				1623	char token, o = data;
				1624	bool all_ss = false, one_ss = false;
				1625	unsigned long mask = -1UL;
				1626	struct cgroup_subsys *ss;
				1627	int nr_opts = 0;
				1628	int i;
				1629
				1630	#ifdef CONFIG_CPUSETS
				1631	mask = ~(1U << cpuset_cgrp_id);
				1632	#endif
				1633
				1634	memset(opts, 0, sizeof(*opts));
				1635
				1636	while ((token = strsep(&o, ",")) != NULL) {
				1637	nr_opts++;
				1638
				1639	if (!*token)
				1640	return -EINVAL;
				1641	if (!strcmp(token, "none")) {
				1642	/* Explicitly have no subsystems */
				1643	opts->none = true;
				1644	continue;
				1645	}
				1646	if (!strcmp(token, "all")) {
				1647	/* Mutually exclusive option 'all' + subsystem name */
				1648	if (one_ss)
				1649	return -EINVAL;
				1650	all_ss = true;
				1651	continue;
				1652	}
				1653	if (!strcmp(token, "__DEVEL__sane_behavior")) {
				1654	opts->flags \|= CGRP_ROOT_SANE_BEHAVIOR;
				1655	continue;
				1656	}
				1657	if (!strcmp(token, "noprefix")) {
				1658	opts->flags \|= CGRP_ROOT_NOPREFIX;
				1659	continue;
				1660	}
				1661	if (!strcmp(token, "clone_children")) {
				1662	opts->cpuset_clone_children = true;
				1663	continue;
				1664	}
				1665	if (!strcmp(token, "xattr")) {
				1666	opts->flags \|= CGRP_ROOT_XATTR;
				1667	continue;
				1668	}
				1669	if (!strncmp(token, "release_agent=", 14)) {
				1670	/* Specifying two release agents is forbidden */
				1671	if (opts->release_agent)
				1672	return -EINVAL;
				1673	opts->release_agent =
				1674	kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
				1675	if (!opts->release_agent)
				1676	return -ENOMEM;
				1677	continue;
				1678	}
				1679	if (!strncmp(token, "name=", 5)) {
				1680	const char *name = token + 5;
				1681	/* Can't specify an empty name */
				1682	if (!strlen(name))
				1683	return -EINVAL;
				1684	/* Must match [\w.-]+ */
				1685	for (i = 0; i < strlen(name); i++) {
				1686	char c = name[i];
				1687	if (isalnum(c))
				1688	continue;
				1689	if ((c == '.') \|\| (c == '-') \|\| (c == '_'))
				1690	continue;
				1691	return -EINVAL;
				1692	}
				1693	/* Specifying two names is forbidden */
				1694	if (opts->name)
				1695	return -EINVAL;
				1696	opts->name = kstrndup(name,
				1697	MAX_CGROUP_ROOT_NAMELEN - 1,
				1698	GFP_KERNEL);
				1699	if (!opts->name)
				1700	return -ENOMEM;
				1701
				1702	continue;
				1703	}
				1704
				1705	for_each_subsys(ss, i) {
				1706	if (strcmp(token, ss->legacy_name))
				1707	continue;
				1708	if (!cgroup_ssid_enabled(i))
				1709	continue;
				1710
				1711	/* Mutually exclusive option 'all' + subsystem name */
				1712	if (all_ss)
				1713	return -EINVAL;
				1714	opts->subsys_mask \|= (1 << i);
				1715	one_ss = true;
				1716
				1717	break;
				1718	}
				1719	if (i == CGROUP_SUBSYS_COUNT)
				1720	return -ENOENT;
				1721	}
				1722
				1723	if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
				1724	pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
				1725	if (nr_opts != 1) {
				1726	pr_err("sane_behavior: no other mount options allowed\n");
				1727	return -EINVAL;
				1728	}
				1729	return 0;
				1730	}
				1731
				1732	/*
				1733	* If the 'all' option was specified select all the subsystems,
				1734	* otherwise if 'none', 'name=' and a subsystem name options were
				1735	* not specified, let's default to 'all'
				1736	*/
				1737	if (all_ss \|\| (!one_ss && !opts->none && !opts->name))
				1738	for_each_subsys(ss, i)
				1739	if (cgroup_ssid_enabled(i))
				1740	opts->subsys_mask \|= (1 << i);
				1741
				1742	/*
				1743	* We either have to specify by name or by subsystems. (So all
				1744	* empty hierarchies must have a name).
				1745	*/
				1746	if (!opts->subsys_mask && !opts->name)
				1747	return -EINVAL;
				1748
				1749	/*
				1750	* Option noprefix was introduced just for backward compatibility
				1751	* with the old cpuset, so we allow noprefix only if mounting just
				1752	* the cpuset subsystem.
				1753	*/
				1754	if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
				1755	return -EINVAL;
				1756
				1757	/* Can't specify "none" and some subsystems */
				1758	if (opts->subsys_mask && opts->none)
				1759	return -EINVAL;
				1760
				1761	return 0;
				1762	}
				1763
				1764	static int cgroup_remount(struct kernfs_root kf_root, int flags, char *data)
				1765	{
				1766	int ret = 0;
				1767	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
				1768	struct cgroup_sb_opts opts;
				1769	unsigned long added_mask, removed_mask;
				1770
				1771	if (root == &cgrp_dfl_root) {
				1772	pr_err("remount is not allowed\n");
				1773	return -EINVAL;
				1774	}
				1775
				1776	mutex_lock(&cgroup_mutex);
				1777
				1778	/* See what subsystems are wanted */
				1779	ret = parse_cgroupfs_options(data, &opts);
				1780	if (ret)
				1781	goto out_unlock;
				1782
				1783	if (opts.subsys_mask != root->subsys_mask \|\| opts.release_agent)
				1784	pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
				1785	task_tgid_nr(current), current->comm);
				1786
				1787	added_mask = opts.subsys_mask & ~root->subsys_mask;
				1788	removed_mask = root->subsys_mask & ~opts.subsys_mask;
				1789
				1790	/* Don't allow flags or name to change at remount */
				1791	if ((opts.flags ^ root->flags) \|\|
				1792	(opts.name && strcmp(opts.name, root->name))) {
				1793	pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
				1794	opts.flags, opts.name ?: "", root->flags, root->name);
				1795	ret = -EINVAL;
				1796	goto out_unlock;
				1797	}
				1798
				1799	/* remounting is not allowed for populated hierarchies */
				1800	if (!list_empty(&root->cgrp.self.children)) {
				1801	ret = -EBUSY;
				1802	goto out_unlock;
				1803	}
				1804
				1805	ret = rebind_subsystems(root, added_mask);
				1806	if (ret)
				1807	goto out_unlock;
				1808
				1809	rebind_subsystems(&cgrp_dfl_root, removed_mask);
				1810
				1811	if (opts.release_agent) {
				1812	spin_lock(&release_agent_path_lock);
				1813	strcpy(root->release_agent_path, opts.release_agent);
				1814	spin_unlock(&release_agent_path_lock);
				1815	}
				1816	out_unlock:
				1817	kfree(opts.release_agent);
				1818	kfree(opts.name);
				1819	mutex_unlock(&cgroup_mutex);
				1820	return ret;
				1821	}
				1822
				1823	/*
				1824	* To reduce the fork() overhead for systems that are not actually using
				1825	* their cgroups capability, we don't maintain the lists running through
				1826	* each css_set to its tasks until we see the list actually used - in other
				1827	* words after the first mount.
				1828	*/
				1829	static bool use_task_css_set_links __read_mostly;
				1830
				1831	static void cgroup_enable_task_cg_lists(void)
				1832	{
				1833	struct task_struct p, g;
				1834
				1835	spin_lock_bh(&css_set_lock);
				1836
				1837	if (use_task_css_set_links)
				1838	goto out_unlock;
				1839
				1840	use_task_css_set_links = true;
				1841
				1842	/*
				1843	* We need tasklist_lock because RCU is not safe against
				1844	* while_each_thread(). Besides, a forking task that has passed
				1845	* cgroup_post_fork() without seeing use_task_css_set_links = 1
				1846	* is not guaranteed to have its child immediately visible in the
				1847	* tasklist if we walk through it with RCU.
				1848	*/
				1849	read_lock(&tasklist_lock);
				1850	do_each_thread(g, p) {
				1851	WARN_ON_ONCE(!list_empty(&p->cg_list) \|\|
				1852	task_css_set(p) != &init_css_set);
				1853
				1854	/*
				1855	* We should check if the process is exiting, otherwise
				1856	* it will race with cgroup_exit() in that the list
				1857	* entry won't be deleted though the process has exited.
				1858	* Do it while holding siglock so that we don't end up
				1859	* racing against cgroup_exit().
				1860	*/
				1861	spin_lock_irq(&p->sighand->siglock);
				1862	if (!(p->flags & PF_EXITING)) {
				1863	struct css_set *cset = task_css_set(p);
				1864
				1865	if (!css_set_populated(cset))
				1866	css_set_update_populated(cset, true);
				1867	list_add_tail(&p->cg_list, &cset->tasks);
				1868	get_css_set(cset);
				1869	}
				1870	spin_unlock_irq(&p->sighand->siglock);
				1871	} while_each_thread(g, p);
				1872	read_unlock(&tasklist_lock);
				1873	out_unlock:
				1874	spin_unlock_bh(&css_set_lock);
				1875	}
				1876
				1877	static void init_cgroup_housekeeping(struct cgroup *cgrp)
				1878	{
				1879	struct cgroup_subsys *ss;
				1880	int ssid;
				1881
				1882	INIT_LIST_HEAD(&cgrp->self.sibling);
				1883	INIT_LIST_HEAD(&cgrp->self.children);
				1884	INIT_LIST_HEAD(&cgrp->cset_links);
				1885	INIT_LIST_HEAD(&cgrp->pidlists);
				1886	mutex_init(&cgrp->pidlist_mutex);
				1887	cgrp->self.cgroup = cgrp;
				1888	cgrp->self.flags \|= CSS_ONLINE;
				1889
				1890	for_each_subsys(ss, ssid)
				1891	INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
				1892
				1893	init_waitqueue_head(&cgrp->offline_waitq);
				1894	INIT_WORK(&cgrp->release_agent_work, cgroup_release_agent);
				1895	}
				1896
				1897	static void init_cgroup_root(struct cgroup_root *root,
				1898	struct cgroup_sb_opts *opts)
				1899	{
				1900	struct cgroup *cgrp = &root->cgrp;
				1901
				1902	INIT_LIST_HEAD(&root->root_list);
				1903	atomic_set(&root->nr_cgrps, 1);
				1904	cgrp->root = root;
				1905	init_cgroup_housekeeping(cgrp);
				1906	idr_init(&root->cgroup_idr);
				1907
				1908	root->flags = opts->flags;
				1909	if (opts->release_agent)
				1910	strcpy(root->release_agent_path, opts->release_agent);
				1911	if (opts->name)
				1912	strcpy(root->name, opts->name);
				1913	if (opts->cpuset_clone_children)
				1914	set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
				1915	}
				1916
				1917	static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
				1918	{
				1919	LIST_HEAD(tmp_links);
				1920	struct cgroup *root_cgrp = &root->cgrp;
				1921	struct css_set *cset;
				1922	int i, ret;
				1923
				1924	lockdep_assert_held(&cgroup_mutex);
				1925
				1926	ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_KERNEL);
				1927	if (ret < 0)
				1928	goto out;
				1929	root_cgrp->id = ret;
				1930
				1931	ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0,
				1932	GFP_KERNEL);
				1933	if (ret)
				1934	goto out;
				1935
				1936	/*
				1937	* We're accessing css_set_count without locking css_set_lock here,
				1938	* but that's OK - it can only be increased by someone holding
				1939	* cgroup_lock, and that's us. The worst that can happen is that we
				1940	* have some link structures left over
				1941	*/
				1942	ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
				1943	if (ret)
				1944	goto cancel_ref;
				1945
				1946	ret = cgroup_init_root_id(root);
				1947	if (ret)
				1948	goto cancel_ref;
				1949
				1950	root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops,
				1951	KERNFS_ROOT_CREATE_DEACTIVATED,
				1952	root_cgrp);
				1953	if (IS_ERR(root->kf_root)) {
				1954	ret = PTR_ERR(root->kf_root);
				1955	goto exit_root_id;
				1956	}
				1957	root_cgrp->kn = root->kf_root->kn;
				1958
				1959	ret = css_populate_dir(&root_cgrp->self, NULL);
				1960	if (ret)
				1961	goto destroy_root;
				1962
				1963	ret = rebind_subsystems(root, ss_mask);
				1964	if (ret)
				1965	goto destroy_root;
				1966
				1967	/*
				1968	* There must be no failure case after here, since rebinding takes
				1969	* care of subsystems' refcounts, which are explicitly dropped in
				1970	* the failure exit path.
				1971	*/
				1972	list_add(&root->root_list, &cgroup_roots);
				1973	cgroup_root_count++;
				1974
				1975	/*
				1976	* Link the root cgroup in this hierarchy into all the css_set
				1977	* objects.
				1978	*/
				1979	spin_lock_bh(&css_set_lock);
				1980	hash_for_each(css_set_table, i, cset, hlist) {
				1981	link_css_set(&tmp_links, cset, root_cgrp);
				1982	if (css_set_populated(cset))
				1983	cgroup_update_populated(root_cgrp, true);
				1984	}
				1985	spin_unlock_bh(&css_set_lock);
				1986
				1987	BUG_ON(!list_empty(&root_cgrp->self.children));
				1988	BUG_ON(atomic_read(&root->nr_cgrps) != 1);
				1989
				1990	kernfs_activate(root_cgrp->kn);
				1991	ret = 0;
				1992	goto out;
				1993
				1994	destroy_root:
				1995	kernfs_destroy_root(root->kf_root);
				1996	root->kf_root = NULL;
				1997	exit_root_id:
				1998	cgroup_exit_root_id(root);
				1999	cancel_ref:
				2000	percpu_ref_exit(&root_cgrp->self.refcnt);
				2001	out:
				2002	free_cgrp_cset_links(&tmp_links);
				2003	return ret;
				2004	}
				2005
				2006	static struct dentry cgroup_mount(struct file_system_type fs_type,
				2007	int flags, const char *unused_dev_name,
				2008	void *data)
				2009	{
				2010	struct super_block *pinned_sb = NULL;
				2011	struct cgroup_subsys *ss;
				2012	struct cgroup_root *root;
				2013	struct cgroup_sb_opts opts;
				2014	struct dentry *dentry;
				2015	int ret;
				2016	int i;
				2017	bool new_sb;
				2018
				2019	/*
				2020	* The first time anyone tries to mount a cgroup, enable the list
				2021	* linking each css_set to its tasks and fix up all existing tasks.
				2022	*/
				2023	if (!use_task_css_set_links)
				2024	cgroup_enable_task_cg_lists();
				2025
				2026	mutex_lock(&cgroup_mutex);
				2027
				2028	/* First find the desired set of subsystems */
				2029	ret = parse_cgroupfs_options(data, &opts);
				2030	if (ret)
				2031	goto out_unlock;
				2032
				2033	/* look for a matching existing root */
				2034	if (opts.flags & CGRP_ROOT_SANE_BEHAVIOR) {
				2035	cgrp_dfl_root_visible = true;
				2036	root = &cgrp_dfl_root;
				2037	cgroup_get(&root->cgrp);
				2038	ret = 0;
				2039	goto out_unlock;
				2040	}
				2041
				2042	/*
				2043	* Destruction of cgroup root is asynchronous, so subsystems may
				2044	* still be dying after the previous unmount. Let's drain the
				2045	* dying subsystems. We just need to ensure that the ones
				2046	* unmounted previously finish dying and don't care about new ones
				2047	* starting. Testing ref liveliness is good enough.
				2048	*/
				2049	for_each_subsys(ss, i) {
				2050	if (!(opts.subsys_mask & (1 << i)) \|\|
				2051	ss->root == &cgrp_dfl_root)
				2052	continue;
				2053
				2054	if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
				2055	mutex_unlock(&cgroup_mutex);
				2056	msleep(10);
				2057	ret = restart_syscall();
				2058	goto out_free;
				2059	}
				2060	cgroup_put(&ss->root->cgrp);
				2061	}
				2062
				2063	for_each_root(root) {
				2064	bool name_match = false;
				2065
				2066	if (root == &cgrp_dfl_root)
				2067	continue;
				2068
				2069	/*
				2070	* If we asked for a name then it must match. Also, if
				2071	* name matches but sybsys_mask doesn't, we should fail.
				2072	* Remember whether name matched.
				2073	*/
				2074	if (opts.name) {
				2075	if (strcmp(opts.name, root->name))
				2076	continue;
				2077	name_match = true;
				2078	}
				2079
				2080	/*
				2081	* If we asked for subsystems (or explicitly for no
				2082	* subsystems) then they must match.
				2083	*/
				2084	if ((opts.subsys_mask \|\| opts.none) &&
				2085	(opts.subsys_mask != root->subsys_mask)) {
				2086	if (!name_match)
				2087	continue;
				2088	ret = -EBUSY;
				2089	goto out_unlock;
				2090	}
				2091
				2092	if (root->flags ^ opts.flags)
				2093	pr_warn("new mount options do not match the existing superblock, will be ignored\n");
				2094
				2095	/*
				2096	* We want to reuse @root whose lifetime is governed by its
				2097	* ->cgrp. Let's check whether @root is alive and keep it
				2098	* that way. As cgroup_kill_sb() can happen anytime, we
				2099	* want to block it by pinning the sb so that @root doesn't
				2100	* get killed before mount is complete.
				2101	*
				2102	* With the sb pinned, tryget_live can reliably indicate
				2103	* whether @root can be reused. If it's being killed,
				2104	* drain it. We can use wait_queue for the wait but this
				2105	* path is super cold. Let's just sleep a bit and retry.
				2106	*/
				2107	pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
				2108	if (IS_ERR(pinned_sb) \|\|
				2109	!percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
				2110	mutex_unlock(&cgroup_mutex);
				2111	if (!IS_ERR_OR_NULL(pinned_sb))
				2112	deactivate_super(pinned_sb);
				2113	msleep(10);
				2114	ret = restart_syscall();
				2115	goto out_free;
				2116	}
				2117
				2118	ret = 0;
				2119	goto out_unlock;
				2120	}
				2121
				2122	/*
				2123	* No such thing, create a new one. name= matching without subsys
				2124	* specification is allowed for already existing hierarchies but we
				2125	* can't create new one without subsys specification.
				2126	*/
				2127	if (!opts.subsys_mask && !opts.none) {
				2128	ret = -EINVAL;
				2129	goto out_unlock;
				2130	}
				2131
				2132	root = kzalloc(sizeof(*root), GFP_KERNEL);
				2133	if (!root) {
				2134	ret = -ENOMEM;
				2135	goto out_unlock;
				2136	}
				2137
				2138	init_cgroup_root(root, &opts);
				2139
				2140	ret = cgroup_setup_root(root, opts.subsys_mask);
				2141	if (ret)
				2142	cgroup_free_root(root);
				2143
				2144	out_unlock:
				2145	mutex_unlock(&cgroup_mutex);
				2146	out_free:
				2147	kfree(opts.release_agent);
				2148	kfree(opts.name);
				2149
				2150	if (ret)
				2151	return ERR_PTR(ret);
				2152
				2153	dentry = kernfs_mount(fs_type, flags, root->kf_root,
				2154	CGROUP_SUPER_MAGIC, &new_sb);
				2155	if (IS_ERR(dentry) \|\| !new_sb)
				2156	cgroup_put(&root->cgrp);
				2157
				2158	/*
				2159	* If @pinned_sb, we're reusing an existing root and holding an
				2160	* extra ref on its sb. Mount is complete. Put the extra ref.
				2161	*/
				2162	if (pinned_sb) {
				2163	WARN_ON(new_sb);
				2164	deactivate_super(pinned_sb);
				2165	}
				2166
				2167	return dentry;
				2168	}
				2169
				2170	static void cgroup_kill_sb(struct super_block *sb)
				2171	{
				2172	struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
				2173	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
				2174
				2175	/*
				2176	* If @root doesn't have any mounts or children, start killing it.
				2177	* This prevents new mounts by disabling percpu_ref_tryget_live().
				2178	* cgroup_mount() may wait for @root's release.
				2179	*
				2180	* And don't kill the default root.
				2181	*/
				2182	if (!list_empty(&root->cgrp.self.children) \|\|
				2183	root == &cgrp_dfl_root)
				2184	cgroup_put(&root->cgrp);
				2185	else
				2186	percpu_ref_kill(&root->cgrp.self.refcnt);
				2187
				2188	kernfs_kill_sb(sb);
				2189	}
				2190
				2191	static struct file_system_type cgroup_fs_type = {
				2192	.name = "cgroup",
				2193	.mount = cgroup_mount,
				2194	.kill_sb = cgroup_kill_sb,
				2195	};
				2196
				2197	/**
				2198	* task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
				2199	* @task: target task
				2200	* @buf: the buffer to write the path into
				2201	* @buflen: the length of the buffer
				2202	*
				2203	* Determine @task's cgroup on the first (the one with the lowest non-zero
				2204	* hierarchy_id) cgroup hierarchy and copy its path into @buf. This
				2205	* function grabs cgroup_mutex and shouldn't be used inside locks used by
				2206	* cgroup controller callbacks.
				2207	*
				2208	* Return value is the same as kernfs_path().
				2209	*/
				2210	char task_cgroup_path(struct task_struct task, char *buf, size_t buflen)
				2211	{
				2212	struct cgroup_root *root;
				2213	struct cgroup *cgrp;
				2214	int hierarchy_id = 1;
				2215	char *path = NULL;
				2216
				2217	mutex_lock(&cgroup_mutex);
				2218	spin_lock_bh(&css_set_lock);
				2219
				2220	root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
				2221
				2222	if (root) {
				2223	cgrp = task_cgroup_from_root(task, root);
				2224	path = cgroup_path(cgrp, buf, buflen);
				2225	} else {
				2226	/* if no hierarchy exists, everyone is in "/" */
				2227	if (strlcpy(buf, "/", buflen) < buflen)
				2228	path = buf;
				2229	}
				2230
				2231	spin_unlock_bh(&css_set_lock);
				2232	mutex_unlock(&cgroup_mutex);
				2233	return path;
				2234	}
				2235	EXPORT_SYMBOL_GPL(task_cgroup_path);
				2236
				2237	/* used to track tasks and other necessary states during migration */
				2238	struct cgroup_taskset {
				2239	/* the src and dst cset list running through cset->mg_node */
				2240	struct list_head src_csets;
				2241	struct list_head dst_csets;
				2242
				2243	/* the subsys currently being processed */
				2244	int ssid;
				2245
				2246	/*
				2247	* Fields for cgroup_taskset_*() iteration.
				2248	*
				2249	* Before migration is committed, the target migration tasks are on
				2250	* ->mg_tasks of the csets on ->src_csets. After, on ->mg_tasks of
				2251	* the csets on ->dst_csets. ->csets point to either ->src_csets
				2252	* or ->dst_csets depending on whether migration is committed.
				2253	*
				2254	* ->cur_csets and ->cur_task point to the current task position
				2255	* during iteration.
				2256	*/
				2257	struct list_head *csets;
				2258	struct css_set *cur_cset;
				2259	struct task_struct *cur_task;
				2260	};
				2261
				2262	#define CGROUP_TASKSET_INIT(tset) (struct cgroup_taskset){ \
				2263	.src_csets = LIST_HEAD_INIT(tset.src_csets), \
				2264	.dst_csets = LIST_HEAD_INIT(tset.dst_csets), \
				2265	.csets = &tset.src_csets, \
				2266	}
				2267
				2268	/**
				2269	* cgroup_taskset_add - try to add a migration target task to a taskset
				2270	* @task: target task
				2271	* @tset: target taskset
				2272	*
				2273	* Add @task, which is a migration target, to @tset. This function becomes
				2274	* noop if @task doesn't need to be migrated. @task's css_set should have
				2275	* been added as a migration source and @task->cg_list will be moved from
				2276	* the css_set's tasks list to mg_tasks one.
				2277	*/
				2278	static void cgroup_taskset_add(struct task_struct *task,
				2279	struct cgroup_taskset *tset)
				2280	{
				2281	struct css_set *cset;
				2282
				2283	lockdep_assert_held(&css_set_lock);
				2284
				2285	/* @task either already exited or can't exit until the end */
				2286	if (task->flags & PF_EXITING)
				2287	return;
				2288
				2289	/* leave @task alone if post_fork() hasn't linked it yet */
				2290	if (list_empty(&task->cg_list))
				2291	return;
				2292
				2293	cset = task_css_set(task);
				2294	if (!cset->mg_src_cgrp)
				2295	return;
				2296
				2297	list_move_tail(&task->cg_list, &cset->mg_tasks);
				2298	if (list_empty(&cset->mg_node))
				2299	list_add_tail(&cset->mg_node, &tset->src_csets);
				2300	if (list_empty(&cset->mg_dst_cset->mg_node))
				2301	list_move_tail(&cset->mg_dst_cset->mg_node,
				2302	&tset->dst_csets);
				2303	}
				2304
				2305	/**
				2306	* cgroup_taskset_first - reset taskset and return the first task
				2307	* @tset: taskset of interest
				2308	* @dst_cssp: output variable for the destination css
				2309	*
				2310	* @tset iteration is initialized and the first task is returned.
				2311	*/
				2312	struct task_struct cgroup_taskset_first(struct cgroup_taskset tset,
				2313	struct cgroup_subsys_state **dst_cssp)
				2314	{
				2315	tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);
				2316	tset->cur_task = NULL;
				2317
				2318	return cgroup_taskset_next(tset, dst_cssp);
				2319	}
				2320
				2321	/**
				2322	* cgroup_taskset_next - iterate to the next task in taskset
				2323	* @tset: taskset of interest
				2324	* @dst_cssp: output variable for the destination css
				2325	*
				2326	* Return the next task in @tset. Iteration must have been initialized
				2327	* with cgroup_taskset_first().
				2328	*/
				2329	struct task_struct cgroup_taskset_next(struct cgroup_taskset tset,
				2330	struct cgroup_subsys_state **dst_cssp)
				2331	{
				2332	struct css_set *cset = tset->cur_cset;
				2333	struct task_struct *task = tset->cur_task;
				2334
				2335	while (&cset->mg_node != tset->csets) {
				2336	if (!task)
				2337	task = list_first_entry(&cset->mg_tasks,
				2338	struct task_struct, cg_list);
				2339	else
				2340	task = list_next_entry(task, cg_list);
				2341
				2342	if (&task->cg_list != &cset->mg_tasks) {
				2343	tset->cur_cset = cset;
				2344	tset->cur_task = task;
				2345
				2346	/*
				2347	* This function may be called both before and
				2348	* after cgroup_taskset_migrate(). The two cases
				2349	* can be distinguished by looking at whether @cset
				2350	* has its ->mg_dst_cset set.
				2351	*/
				2352	if (cset->mg_dst_cset)
				2353	*dst_cssp = cset->mg_dst_cset->subsys[tset->ssid];
				2354	else
				2355	*dst_cssp = cset->subsys[tset->ssid];
				2356
				2357	return task;
				2358	}
				2359
				2360	cset = list_next_entry(cset, mg_node);
				2361	task = NULL;
				2362	}
				2363
				2364	return NULL;
				2365	}
				2366
				2367	/**
				2368	* cgroup_taskset_migrate - migrate a taskset to a cgroup
				2369	* @tset: taget taskset
				2370	* @dst_cgrp: destination cgroup
				2371	*
				2372	* Migrate tasks in @tset to @dst_cgrp. This function fails iff one of the
				2373	* ->can_attach callbacks fails and guarantees that either all or none of
				2374	* the tasks in @tset are migrated. @tset is consumed regardless of
				2375	* success.
				2376	*/
				2377	static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
				2378	struct cgroup *dst_cgrp)
				2379	{
				2380	struct cgroup_subsys_state css, failed_css = NULL;
				2381	struct task_struct task, tmp_task;
				2382	struct css_set cset, tmp_cset;
				2383	int i, ret;
				2384
				2385	/* methods shouldn't be called if no task is actually migrating */
				2386	if (list_empty(&tset->src_csets))
				2387	return 0;
				2388
				2389	/* check that we can legitimately attach to the cgroup */
				2390	for_each_e_css(css, i, dst_cgrp) {
				2391	if (css->ss->can_attach) {
				2392	tset->ssid = i;
				2393	ret = css->ss->can_attach(tset);
				2394	if (ret) {
				2395	failed_css = css;
				2396	goto out_cancel_attach;
				2397	}
				2398	}
				2399	}
				2400
				2401	/*
				2402	* Now that we're guaranteed success, proceed to move all tasks to
				2403	* the new cgroup. There are no failure cases after here, so this
				2404	* is the commit point.
				2405	*/
				2406	spin_lock_bh(&css_set_lock);
				2407	list_for_each_entry(cset, &tset->src_csets, mg_node) {
				2408	list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) {
				2409	struct css_set *from_cset = task_css_set(task);
				2410	struct css_set *to_cset = cset->mg_dst_cset;
				2411
				2412	get_css_set(to_cset);
				2413	css_set_move_task(task, from_cset, to_cset, true);
				2414	put_css_set_locked(from_cset);
				2415	}
				2416	}
				2417	spin_unlock_bh(&css_set_lock);
				2418
				2419	/*
				2420	* Migration is committed, all target tasks are now on dst_csets.
				2421	* Nothing is sensitive to fork() after this point. Notify
				2422	* controllers that migration is complete.
				2423	*/
				2424	tset->csets = &tset->dst_csets;
				2425
				2426	for_each_e_css(css, i, dst_cgrp) {
				2427	if (css->ss->attach) {
				2428	tset->ssid = i;
				2429	css->ss->attach(tset);
				2430	}
				2431	}
				2432
				2433	ret = 0;
				2434	goto out_release_tset;
				2435
				2436	out_cancel_attach:
				2437	for_each_e_css(css, i, dst_cgrp) {
				2438	if (css == failed_css)
				2439	break;
				2440	if (css->ss->cancel_attach) {
				2441	tset->ssid = i;
				2442	css->ss->cancel_attach(tset);
				2443	}
				2444	}
				2445	out_release_tset:
				2446	spin_lock_bh(&css_set_lock);
				2447	list_splice_init(&tset->dst_csets, &tset->src_csets);
				2448	list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) {
				2449	list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
				2450	list_del_init(&cset->mg_node);
				2451	}
				2452	spin_unlock_bh(&css_set_lock);
				2453	return ret;
				2454	}
				2455
				2456	/**
				2457	* cgroup_migrate_finish - cleanup after attach
				2458	* @preloaded_csets: list of preloaded css_sets
				2459	*
				2460	* Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst(). See
				2461	* those functions for details.
				2462	*/
				2463	static void cgroup_migrate_finish(struct list_head *preloaded_csets)
				2464	{
				2465	struct css_set cset, tmp_cset;
				2466
				2467	lockdep_assert_held(&cgroup_mutex);
				2468
				2469	spin_lock_bh(&css_set_lock);
				2470	list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
				2471	cset->mg_src_cgrp = NULL;
				2472	cset->mg_dst_cset = NULL;
				2473	list_del_init(&cset->mg_preload_node);
				2474	put_css_set_locked(cset);
				2475	}
				2476	spin_unlock_bh(&css_set_lock);
				2477	}
				2478
				2479	/**
				2480	* cgroup_migrate_add_src - add a migration source css_set
				2481	* @src_cset: the source css_set to add
				2482	* @dst_cgrp: the destination cgroup
				2483	* @preloaded_csets: list of preloaded css_sets
				2484	*
				2485	* Tasks belonging to @src_cset are about to be migrated to @dst_cgrp. Pin
				2486	* @src_cset and add it to @preloaded_csets, which should later be cleaned
				2487	* up by cgroup_migrate_finish().
				2488	*
				2489	* This function may be called without holding cgroup_threadgroup_rwsem
				2490	* even if the target is a process. Threads may be created and destroyed
				2491	* but as long as cgroup_mutex is not dropped, no new css_set can be put
				2492	* into play and the preloaded css_sets are guaranteed to cover all
				2493	* migrations.
				2494	*/
				2495	static void cgroup_migrate_add_src(struct css_set *src_cset,
				2496	struct cgroup *dst_cgrp,
				2497	struct list_head *preloaded_csets)
				2498	{
				2499	struct cgroup *src_cgrp;
				2500
				2501	lockdep_assert_held(&cgroup_mutex);
				2502	lockdep_assert_held(&css_set_lock);
				2503
				2504	/*
				2505	* If ->dead, @src_set is associated with one or more dead cgroups
				2506	* and doesn't contain any migratable tasks. Ignore it early so
				2507	* that the rest of migration path doesn't get confused by it.
				2508	*/
				2509	if (src_cset->dead)
				2510	return;
				2511
				2512	src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
				2513
				2514	if (!list_empty(&src_cset->mg_preload_node))
				2515	return;
				2516
				2517	WARN_ON(src_cset->mg_src_cgrp);
				2518	WARN_ON(!list_empty(&src_cset->mg_tasks));
				2519	WARN_ON(!list_empty(&src_cset->mg_node));
				2520
				2521	src_cset->mg_src_cgrp = src_cgrp;
				2522	get_css_set(src_cset);
				2523	list_add(&src_cset->mg_preload_node, preloaded_csets);
				2524	}
				2525
				2526	/**
				2527	* cgroup_migrate_prepare_dst - prepare destination css_sets for migration
				2528	* @dst_cgrp: the destination cgroup (may be %NULL)
				2529	* @preloaded_csets: list of preloaded source css_sets
				2530	*
				2531	* Tasks are about to be moved to @dst_cgrp and all the source css_sets
				2532	* have been preloaded to @preloaded_csets. This function looks up and
				2533	* pins all destination css_sets, links each to its source, and append them
				2534	* to @preloaded_csets. If @dst_cgrp is %NULL, the destination of each
				2535	* source css_set is assumed to be its cgroup on the default hierarchy.
				2536	*
				2537	* This function must be called after cgroup_migrate_add_src() has been
				2538	* called on each migration source css_set. After migration is performed
				2539	* using cgroup_migrate(), cgroup_migrate_finish() must be called on
				2540	* @preloaded_csets.
				2541	*/
				2542	static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
				2543	struct list_head *preloaded_csets)
				2544	{
				2545	LIST_HEAD(csets);
				2546	struct css_set src_cset, tmp_cset;
				2547
				2548	lockdep_assert_held(&cgroup_mutex);
				2549
				2550	/*
				2551	* Except for the root, child_subsys_mask must be zero for a cgroup
				2552	* with tasks so that child cgroups don't compete against tasks.
				2553	*/
				2554	if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && cgroup_parent(dst_cgrp) &&
				2555	dst_cgrp->child_subsys_mask)
				2556	return -EBUSY;
				2557
				2558	/* look up the dst cset for each src cset and link it to src */
				2559	list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) {
				2560	struct css_set *dst_cset;
				2561
				2562	dst_cset = find_css_set(src_cset,
				2563	dst_cgrp ?: src_cset->dfl_cgrp);
				2564	if (!dst_cset)
				2565	goto err;
				2566
				2567	WARN_ON_ONCE(src_cset->mg_dst_cset \|\| dst_cset->mg_dst_cset);
				2568
				2569	/*
				2570	* If src cset equals dst, it's noop. Drop the src.
				2571	* cgroup_migrate() will skip the cset too. Note that we
				2572	* can't handle src == dst as some nodes are used by both.
				2573	*/
				2574	if (src_cset == dst_cset) {
				2575	src_cset->mg_src_cgrp = NULL;
				2576	list_del_init(&src_cset->mg_preload_node);
				2577	put_css_set(src_cset);
				2578	put_css_set(dst_cset);
				2579	continue;
				2580	}
				2581
				2582	src_cset->mg_dst_cset = dst_cset;
				2583
				2584	if (list_empty(&dst_cset->mg_preload_node))
				2585	list_add(&dst_cset->mg_preload_node, &csets);
				2586	else
				2587	put_css_set(dst_cset);
				2588	}
				2589
				2590	list_splice_tail(&csets, preloaded_csets);
				2591	return 0;
				2592	err:
				2593	cgroup_migrate_finish(&csets);
				2594	return -ENOMEM;
				2595	}
				2596
				2597	/**
				2598	* cgroup_migrate - migrate a process or task to a cgroup
				2599	* @leader: the leader of the process or the task to migrate
				2600	* @threadgroup: whether @leader points to the whole process or a single task
				2601	* @cgrp: the destination cgroup
				2602	*
				2603	* Migrate a process or task denoted by @leader to @cgrp. If migrating a
				2604	* process, the caller must be holding cgroup_threadgroup_rwsem. The
				2605	* caller is also responsible for invoking cgroup_migrate_add_src() and
				2606	* cgroup_migrate_prepare_dst() on the targets before invoking this
				2607	* function and following up with cgroup_migrate_finish().
				2608	*
				2609	* As long as a controller's ->can_attach() doesn't fail, this function is
				2610	* guaranteed to succeed. This means that, excluding ->can_attach()
				2611	* failure, when migrating multiple targets, the success or failure can be
				2612	* decided for all targets by invoking group_migrate_prepare_dst() before
				2613	* actually starting migrating.
				2614	*/
				2615	static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
				2616	struct cgroup *cgrp)
				2617	{
				2618	struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
				2619	struct task_struct *task;
				2620
				2621	/*
				2622	* Prevent freeing of tasks while we take a snapshot. Tasks that are
				2623	* already PF_EXITING could be freed from underneath us unless we
				2624	* take an rcu_read_lock.
				2625	*/
				2626	spin_lock_bh(&css_set_lock);
				2627	rcu_read_lock();
				2628	task = leader;
				2629	do {
				2630	cgroup_taskset_add(task, &tset);
				2631	if (!threadgroup)
				2632	break;
				2633	} while_each_thread(leader, task);
				2634	rcu_read_unlock();
				2635	spin_unlock_bh(&css_set_lock);
				2636
				2637	return cgroup_taskset_migrate(&tset, cgrp);
				2638	}
				2639
				2640	/**
				2641	* cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
				2642	* @dst_cgrp: the cgroup to attach to
				2643	* @leader: the task or the leader of the threadgroup to be attached
				2644	* @threadgroup: attach the whole threadgroup?
				2645	*
				2646	* Call holding cgroup_mutex and cgroup_threadgroup_rwsem.
				2647	*/
				2648	static int cgroup_attach_task(struct cgroup *dst_cgrp,
				2649	struct task_struct *leader, bool threadgroup)
				2650	{
				2651	LIST_HEAD(preloaded_csets);
				2652	struct task_struct *task;
				2653	int ret;
				2654
				2655	/* look up all src csets */
				2656	spin_lock_bh(&css_set_lock);
				2657	rcu_read_lock();
				2658	task = leader;
				2659	do {
				2660	cgroup_migrate_add_src(task_css_set(task), dst_cgrp,
				2661	&preloaded_csets);
				2662	if (!threadgroup)
				2663	break;
				2664	} while_each_thread(leader, task);
				2665	rcu_read_unlock();
				2666	spin_unlock_bh(&css_set_lock);
				2667
				2668	/* prepare dst csets and commit */
				2669	ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets);
				2670	if (!ret)
				2671	ret = cgroup_migrate(leader, threadgroup, dst_cgrp);
				2672
				2673	cgroup_migrate_finish(&preloaded_csets);
				2674	return ret;
				2675	}
				2676
				2677	static int cgroup_procs_write_permission(struct task_struct *task,
				2678	struct cgroup *dst_cgrp,
				2679	struct kernfs_open_file *of)
				2680	{
				2681	const struct cred *cred = current_cred();
				2682	const struct cred *tcred = get_task_cred(task);
				2683	int ret = 0;
				2684
				2685	/*
				2686	* even if we're attaching all tasks in the thread group, we only
				2687	* need to check permissions on one of them.
				2688	*/
				2689	if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
				2690	!uid_eq(cred->euid, tcred->uid) &&
				2691	!uid_eq(cred->euid, tcred->suid))
				2692	ret = -EACCES;
				2693
				2694	if (!ret && cgroup_on_dfl(dst_cgrp)) {
				2695	struct super_block *sb = of->file->f_path.dentry->d_sb;
				2696	struct cgroup *cgrp;
				2697	struct inode *inode;
				2698
				2699	spin_lock_bh(&css_set_lock);
				2700	cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
				2701	spin_unlock_bh(&css_set_lock);
				2702
				2703	while (!cgroup_is_descendant(dst_cgrp, cgrp))
				2704	cgrp = cgroup_parent(cgrp);
				2705
				2706	ret = -ENOMEM;
				2707	inode = kernfs_get_inode(sb, cgrp->procs_file.kn);
				2708	if (inode) {
				2709	ret = inode_permission(inode, MAY_WRITE);
				2710	iput(inode);
				2711	}
				2712	}
				2713
				2714	put_cred(tcred);
				2715	return ret;
				2716	}
				2717
				2718	/*
				2719	* Find the task_struct of the task to attach by vpid and pass it along to the
				2720	* function to attach either it or all tasks in its threadgroup. Will lock
				2721	* cgroup_mutex and threadgroup.
				2722	*/
				2723	static ssize_t __cgroup_procs_write(struct kernfs_open_file of, char buf,
				2724	size_t nbytes, loff_t off, bool threadgroup)
				2725	{
				2726	struct task_struct *tsk;
				2727	struct cgroup_subsys *ss;
				2728	struct cgroup *cgrp;
				2729	pid_t pid;
				2730	int ssid, ret;
				2731
				2732	if (kstrtoint(strstrip(buf), 0, &pid) \|\| pid < 0)
				2733	return -EINVAL;
				2734
				2735	cgrp = cgroup_kn_lock_live(of->kn);
				2736	if (!cgrp)
				2737	return -ENODEV;
				2738
				2739	percpu_down_write(&cgroup_threadgroup_rwsem);
				2740	rcu_read_lock();
				2741	if (pid) {
				2742	tsk = find_task_by_vpid(pid);
				2743	if (!tsk) {
				2744	ret = -ESRCH;
				2745	goto out_unlock_rcu;
				2746	}
				2747	} else {
				2748	tsk = current;
				2749	}
				2750
				2751	if (threadgroup)
				2752	tsk = tsk->group_leader;
				2753
				2754	/*
				2755	* kthreads may acquire PF_NO_SETAFFINITY during initialization.
				2756	* If userland migrates such a kthread to a non-root cgroup, it can
				2757	* become trapped in a cpuset, or RT kthread may be born in a
				2758	* cgroup with no rt_runtime allocated. Just say no.
				2759	*/
				2760	if (tsk->no_cgroup_migration \|\| (tsk->flags & PF_NO_SETAFFINITY)) {
				2761	ret = -EINVAL;
				2762	goto out_unlock_rcu;
				2763	}
				2764
				2765	get_task_struct(tsk);
				2766	rcu_read_unlock();
				2767
				2768	ret = cgroup_procs_write_permission(tsk, cgrp, of);
				2769	if (!ret)
				2770	ret = cgroup_attach_task(cgrp, tsk, threadgroup);
				2771
				2772	put_task_struct(tsk);
				2773	goto out_unlock_threadgroup;
				2774
				2775	out_unlock_rcu:
				2776	rcu_read_unlock();
				2777	out_unlock_threadgroup:
				2778	percpu_up_write(&cgroup_threadgroup_rwsem);
				2779	for_each_subsys(ss, ssid)
				2780	if (ss->post_attach)
				2781	ss->post_attach();
				2782	cgroup_kn_unlock(of->kn);
				2783	return ret ?: nbytes;
				2784	}
				2785
				2786	/**
				2787	* cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
				2788	* @from: attach to all cgroups of a given task
				2789	* @tsk: the task to be attached
				2790	*/
				2791	int cgroup_attach_task_all(struct task_struct from, struct task_struct tsk)
				2792	{
				2793	struct cgroup_root *root;
				2794	int retval = 0;
				2795
				2796	mutex_lock(&cgroup_mutex);
				2797	for_each_root(root) {
				2798	struct cgroup *from_cgrp;
				2799
				2800	if (root == &cgrp_dfl_root)
				2801	continue;
				2802
				2803	spin_lock_bh(&css_set_lock);
				2804	from_cgrp = task_cgroup_from_root(from, root);
				2805	spin_unlock_bh(&css_set_lock);
				2806
				2807	retval = cgroup_attach_task(from_cgrp, tsk, false);
				2808	if (retval)
				2809	break;
				2810	}
				2811	mutex_unlock(&cgroup_mutex);
				2812
				2813	return retval;
				2814	}
				2815	EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
				2816
				2817	static ssize_t cgroup_tasks_write(struct kernfs_open_file *of,
				2818	char *buf, size_t nbytes, loff_t off)
				2819	{
				2820	return __cgroup_procs_write(of, buf, nbytes, off, false);
				2821	}
				2822
				2823	static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
				2824	char *buf, size_t nbytes, loff_t off)
				2825	{
				2826	return __cgroup_procs_write(of, buf, nbytes, off, true);
				2827	}
				2828
				2829	static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
				2830	char *buf, size_t nbytes, loff_t off)
				2831	{
				2832	struct cgroup *cgrp;
				2833
				2834	BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
				2835
				2836	cgrp = cgroup_kn_lock_live(of->kn);
				2837	if (!cgrp)
				2838	return -ENODEV;
				2839	spin_lock(&release_agent_path_lock);
				2840	strlcpy(cgrp->root->release_agent_path, strstrip(buf),
				2841	sizeof(cgrp->root->release_agent_path));
				2842	spin_unlock(&release_agent_path_lock);
				2843	cgroup_kn_unlock(of->kn);
				2844	return nbytes;
				2845	}
				2846
				2847	static int cgroup_release_agent_show(struct seq_file seq, void v)
				2848	{
				2849	struct cgroup *cgrp = seq_css(seq)->cgroup;
				2850
				2851	spin_lock(&release_agent_path_lock);
				2852	seq_puts(seq, cgrp->root->release_agent_path);
				2853	spin_unlock(&release_agent_path_lock);
				2854	seq_putc(seq, '\n');
				2855	return 0;
				2856	}
				2857
				/* seq_file show handler that always prints "0\n" and returns 0. */
				static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
				{
					seq_puts(seq, "0\n");
					return 0;
				}
				2863
				2864	static void cgroup_print_ss_mask(struct seq_file *seq, unsigned long ss_mask)
				2865	{
				2866	struct cgroup_subsys *ss;
				2867	bool printed = false;
				2868	int ssid;
				2869
				2870	for_each_subsys_which(ss, ssid, &ss_mask) {
				2871	if (printed)
				2872	seq_putc(seq, ' ');
				2873	seq_printf(seq, "%s", ss->name);
				2874	printed = true;
				2875	}
				2876	if (printed)
				2877	seq_putc(seq, '\n');
				2878	}
				2879
				2880	/* show controllers which are currently attached to the default hierarchy */
				2881	static int cgroup_root_controllers_show(struct seq_file seq, void v)
				2882	{
				2883	struct cgroup *cgrp = seq_css(seq)->cgroup;
				2884
				2885	cgroup_print_ss_mask(seq, cgrp->root->subsys_mask &
				2886	~cgrp_dfl_root_inhibit_ss_mask);
				2887	return 0;
				2888	}
				2889
				2890	/* show controllers which are enabled from the parent */
				2891	static int cgroup_controllers_show(struct seq_file seq, void v)
				2892	{
				2893	struct cgroup *cgrp = seq_css(seq)->cgroup;
				2894
				2895	cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->subtree_control);
				2896	return 0;
				2897	}
				2898
				2899	/* show controllers which are enabled for a given cgroup's children */
				2900	static int cgroup_subtree_control_show(struct seq_file seq, void v)
				2901	{
				2902	struct cgroup *cgrp = seq_css(seq)->cgroup;
				2903
				2904	cgroup_print_ss_mask(seq, cgrp->subtree_control);
				2905	return 0;
				2906	}
				2907
				2908	/**
				2909	* cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy
				2910	* @cgrp: root of the subtree to update csses for
				2911	*
				2912	* @cgrp's child_subsys_mask has changed and its subtree's (self excluded)
				2913	* css associations need to be updated accordingly. This function looks up
				2914	* all css_sets which are attached to the subtree, creates the matching
				2915	* updated css_sets and migrates the tasks to the new ones.
				2916	*/
				2917	static int cgroup_update_dfl_csses(struct cgroup *cgrp)
				2918	{
				2919	LIST_HEAD(preloaded_csets);
				2920	struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
				2921	struct cgroup_subsys_state *css;
				2922	struct css_set *src_cset;
				2923	int ret;
				2924
				2925	lockdep_assert_held(&cgroup_mutex);
				2926
				2927	percpu_down_write(&cgroup_threadgroup_rwsem);
				2928
				2929	/* look up all csses currently attached to @cgrp's subtree */
				2930	spin_lock_bh(&css_set_lock);
				2931	css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) {
				2932	struct cgrp_cset_link *link;
				2933
				2934	/* self is not affected by child_subsys_mask change */
				2935	if (css->cgroup == cgrp)
				2936	continue;
				2937
				2938	list_for_each_entry(link, &css->cgroup->cset_links, cset_link)
				2939	cgroup_migrate_add_src(link->cset, cgrp,
				2940	&preloaded_csets);
				2941	}
				2942	spin_unlock_bh(&css_set_lock);
				2943
				2944	/* NULL dst indicates self on default hierarchy */
				2945	ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets);
				2946	if (ret)
				2947	goto out_finish;
				2948
				2949	spin_lock_bh(&css_set_lock);
				2950	list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) {
				2951	struct task_struct task, ntask;
				2952
				2953	/* src_csets precede dst_csets, break on the first dst_cset */
				2954	if (!src_cset->mg_src_cgrp)
				2955	break;
				2956
				2957	/* all tasks in src_csets need to be migrated */
				2958	list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)
				2959	cgroup_taskset_add(task, &tset);
				2960	}
				2961	spin_unlock_bh(&css_set_lock);
				2962
				2963	ret = cgroup_taskset_migrate(&tset, cgrp);
				2964	out_finish:
				2965	cgroup_migrate_finish(&preloaded_csets);
				2966	percpu_up_write(&cgroup_threadgroup_rwsem);
				2967	return ret;
				2968	}
				2969
				2970	/* change the enabled child controllers for a cgroup in the default hierarchy */
				2971	static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
				2972	char *buf, size_t nbytes,
				2973	loff_t off)
				2974	{
				2975	unsigned long enable = 0, disable = 0;
				2976	unsigned long css_enable, css_disable, old_sc, new_sc, old_ss, new_ss;
				2977	struct cgroup cgrp, child;
				2978	struct cgroup_subsys *ss;
				2979	char *tok;
				2980	int ssid, ret;
				2981
				2982	/*
				2983	* Parse input - space separated list of subsystem names prefixed
				2984	* with either + or -.
				2985	*/
				2986	buf = strstrip(buf);
				2987	while ((tok = strsep(&buf, " "))) {
				2988	unsigned long tmp_ss_mask = ~cgrp_dfl_root_inhibit_ss_mask;
				2989
				2990	if (tok[0] == '\0')
				2991	continue;
				2992	for_each_subsys_which(ss, ssid, &tmp_ss_mask) {
				2993	if (!cgroup_ssid_enabled(ssid) \|\|
				2994	strcmp(tok + 1, ss->name))
				2995	continue;
				2996
				2997	if (*tok == '+') {
				2998	enable \|= 1 << ssid;
				2999	disable &= ~(1 << ssid);
				3000	} else if (*tok == '-') {
				3001	disable \|= 1 << ssid;
				3002	enable &= ~(1 << ssid);
				3003	} else {
				3004	return -EINVAL;
				3005	}
				3006	break;
				3007	}
				3008	if (ssid == CGROUP_SUBSYS_COUNT)
				3009	return -EINVAL;
				3010	}
				3011
				3012	cgrp = cgroup_kn_lock_live(of->kn);
				3013	if (!cgrp)
				3014	return -ENODEV;
				3015
				3016	for_each_subsys(ss, ssid) {
				3017	if (enable & (1 << ssid)) {
				3018	if (cgrp->subtree_control & (1 << ssid)) {
				3019	enable &= ~(1 << ssid);
				3020	continue;
				3021	}
				3022
				3023	/* unavailable or not enabled on the parent? */
				3024	if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) \|\|
				3025	(cgroup_parent(cgrp) &&
				3026	!(cgroup_parent(cgrp)->subtree_control & (1 << ssid)))) {
				3027	ret = -ENOENT;
				3028	goto out_unlock;
				3029	}
				3030	} else if (disable & (1 << ssid)) {
				3031	if (!(cgrp->subtree_control & (1 << ssid))) {
				3032	disable &= ~(1 << ssid);
				3033	continue;
				3034	}
				3035
				3036	/* a child has it enabled? */
				3037	cgroup_for_each_live_child(child, cgrp) {
				3038	if (child->subtree_control & (1 << ssid)) {
				3039	ret = -EBUSY;
				3040	goto out_unlock;
				3041	}
				3042	}
				3043	}
				3044	}
				3045
				3046	if (!enable && !disable) {
				3047	ret = 0;
				3048	goto out_unlock;
				3049	}
				3050
				3051	/*
				3052	* Except for the root, subtree_control must be zero for a cgroup
				3053	* with tasks so that child cgroups don't compete against tasks.
				3054	*/
				3055	if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) {
				3056	ret = -EBUSY;
				3057	goto out_unlock;
				3058	}
				3059
				3060	/*
				3061	* Update subsys masks and calculate what needs to be done. More
				3062	* subsystems than specified may need to be enabled or disabled
				3063	* depending on subsystem dependencies.
				3064	*/
				3065	old_sc = cgrp->subtree_control;
				3066	old_ss = cgrp->child_subsys_mask;
				3067	new_sc = (old_sc \| enable) & ~disable;
				3068	new_ss = cgroup_calc_child_subsys_mask(cgrp, new_sc);
				3069
				3070	css_enable = ~old_ss & new_ss;
				3071	css_disable = old_ss & ~new_ss;
				3072	enable \|= css_enable;
				3073	disable \|= css_disable;
				3074
				3075	/*
				3076	* Because css offlining is asynchronous, userland might try to
				3077	* re-enable the same controller while the previous instance is
				3078	* still around. In such cases, wait till it's gone using
				3079	* offline_waitq.
				3080	*/
				3081	for_each_subsys_which(ss, ssid, &css_enable) {
				3082	cgroup_for_each_live_child(child, cgrp) {
				3083	DEFINE_WAIT(wait);
				3084
				3085	if (!cgroup_css(child, ss))
				3086	continue;
				3087
				3088	cgroup_get(child);
				3089	prepare_to_wait(&child->offline_waitq, &wait,
				3090	TASK_UNINTERRUPTIBLE);
				3091	cgroup_kn_unlock(of->kn);
				3092	schedule();
				3093	finish_wait(&child->offline_waitq, &wait);
				3094	cgroup_put(child);
				3095
				3096	return restart_syscall();
				3097	}
				3098	}
				3099
				3100	cgrp->subtree_control = new_sc;
				3101	cgrp->child_subsys_mask = new_ss;
				3102
				3103	/*
				3104	* Create new csses or make the existing ones visible. A css is
				3105	* created invisible if it's being implicitly enabled through
				3106	* dependency. An invisible css is made visible when the userland
				3107	* explicitly enables it.
				3108	*/
				3109	for_each_subsys(ss, ssid) {
				3110	if (!(enable & (1 << ssid)))
				3111	continue;
				3112
				3113	cgroup_for_each_live_child(child, cgrp) {
				3114	if (css_enable & (1 << ssid))
				3115	ret = create_css(child, ss,
				3116	cgrp->subtree_control & (1 << ssid));
				3117	else
				3118	ret = css_populate_dir(cgroup_css(child, ss),
				3119	NULL);
				3120	if (ret)
				3121	goto err_undo_css;
				3122	}
				3123	}
				3124
				3125	/*
				3126	* At this point, cgroup_e_css() results reflect the new csses
				3127	* making the following cgroup_update_dfl_csses() properly update
				3128	* css associations of all tasks in the subtree.
				3129	*/
				3130	ret = cgroup_update_dfl_csses(cgrp);
				3131	if (ret)
				3132	goto err_undo_css;
				3133
				3134	/*
				3135	* All tasks are migrated out of disabled csses. Kill or hide
				3136	* them. A css is hidden when the userland requests it to be
				3137	* disabled while other subsystems are still depending on it. The
				3138	* css must not actively control resources and be in the vanilla
				3139	* state if it's made visible again later. Controllers which may
				3140	* be depended upon should provide ->css_reset() for this purpose.
				3141	*/
				3142	for_each_subsys(ss, ssid) {
				3143	if (!(disable & (1 << ssid)))
				3144	continue;
				3145
				3146	cgroup_for_each_live_child(child, cgrp) {
				3147	struct cgroup_subsys_state *css = cgroup_css(child, ss);
				3148
				3149	if (css_disable & (1 << ssid)) {
				3150	kill_css(css);
				3151	} else {
				3152	css_clear_dir(css, NULL);
				3153	if (ss->css_reset)
				3154	ss->css_reset(css);
				3155	}
				3156	}
				3157	}
				3158
				3159	/*
				3160	* The effective csses of all the descendants (excluding @cgrp) may
				3161	* have changed. Subsystems can optionally subscribe to this event
				3162	* by implementing ->css_e_css_changed() which is invoked if any of
				3163	* the effective csses seen from the css's cgroup may have changed.
				3164	*/
				3165	for_each_subsys(ss, ssid) {
				3166	struct cgroup_subsys_state *this_css = cgroup_css(cgrp, ss);
				3167	struct cgroup_subsys_state *css;
				3168
				3169	if (!ss->css_e_css_changed \|\| !this_css)
				3170	continue;
				3171
				3172	css_for_each_descendant_pre(css, this_css)
				3173	if (css != this_css)
				3174	ss->css_e_css_changed(css);
				3175	}
				3176
				3177	kernfs_activate(cgrp->kn);
				3178	ret = 0;
				3179	out_unlock:
				3180	cgroup_kn_unlock(of->kn);
				3181	return ret ?: nbytes;
				3182
				3183	err_undo_css:
				3184	cgrp->subtree_control = old_sc;
				3185	cgrp->child_subsys_mask = old_ss;
				3186
				3187	for_each_subsys(ss, ssid) {
				3188	if (!(enable & (1 << ssid)))
				3189	continue;
				3190
				3191	cgroup_for_each_live_child(child, cgrp) {
				3192	struct cgroup_subsys_state *css = cgroup_css(child, ss);
				3193
				3194	if (!css)
				3195	continue;
				3196
				3197	if (css_enable & (1 << ssid))
				3198	kill_css(css);
				3199	else
				3200	css_clear_dir(css, NULL);
				3201	}
				3202	}
				3203	goto out_unlock;
				3204	}
				3205
				3206	static int cgroup_events_show(struct seq_file seq, void v)
				3207	{
				3208	seq_printf(seq, "populated %d\n",
				3209	cgroup_is_populated(seq_css(seq)->cgroup));
				3210	return 0;
				3211	}
				3212
				3213	static ssize_t cgroup_file_write(struct kernfs_open_file of, char buf,
				3214	size_t nbytes, loff_t off)
				3215	{
				3216	struct cgroup *cgrp = of->kn->parent->priv;
				3217	struct cftype *cft = of->kn->priv;
				3218	struct cgroup_subsys_state *css;
				3219	int ret;
				3220
				3221	if (cft->write)
				3222	return cft->write(of, buf, nbytes, off);
				3223
				3224	/*
				3225	* kernfs guarantees that a file isn't deleted with operations in
				3226	* flight, which means that the matching css is and stays alive and
				3227	* doesn't need to be pinned. The RCU locking is not necessary
				3228	* either. It's just for the convenience of using cgroup_css().
				3229	*/
				3230	rcu_read_lock();
				3231	css = cgroup_css(cgrp, cft->ss);
				3232	rcu_read_unlock();
				3233
				3234	if (cft->write_u64) {
				3235	unsigned long long v;
				3236	ret = kstrtoull(buf, 0, &v);
				3237	if (!ret)
				3238	ret = cft->write_u64(css, cft, v);
				3239	} else if (cft->write_s64) {
				3240	long long v;
				3241	ret = kstrtoll(buf, 0, &v);
				3242	if (!ret)
				3243	ret = cft->write_s64(css, cft, v);
				3244	} else {
				3245	ret = -EINVAL;
				3246	}
				3247
				3248	return ret ?: nbytes;
				3249	}
				3250
				3251	static void cgroup_seqfile_start(struct seq_file seq, loff_t *ppos)
				3252	{
				3253	return seq_cft(seq)->seq_start(seq, ppos);
				3254	}
				3255
				3256	static void cgroup_seqfile_next(struct seq_file seq, void v, loff_t ppos)
				3257	{
				3258	return seq_cft(seq)->seq_next(seq, v, ppos);
				3259	}
				3260
				3261	static void cgroup_seqfile_stop(struct seq_file seq, void v)
				3262	{
				3263	seq_cft(seq)->seq_stop(seq, v);
				3264	}
				3265
				3266	static int cgroup_seqfile_show(struct seq_file m, void arg)
				3267	{
				3268	struct cftype *cft = seq_cft(m);
				3269	struct cgroup_subsys_state *css = seq_css(m);
				3270
				3271	if (cft->seq_show)
				3272	return cft->seq_show(m, arg);
				3273
				3274	if (cft->read_u64)
				3275	seq_printf(m, "%llu\n", cft->read_u64(css, cft));
				3276	else if (cft->read_s64)
				3277	seq_printf(m, "%lld\n", cft->read_s64(css, cft));
				3278	else
				3279	return -EINVAL;
				3280	return 0;
				3281	}
				3282
				3283	static struct kernfs_ops cgroup_kf_single_ops = {
				3284	.atomic_write_len = PAGE_SIZE,
				3285	.write = cgroup_file_write,
				3286	.seq_show = cgroup_seqfile_show,
				3287	};
				3288
				3289	static struct kernfs_ops cgroup_kf_ops = {
				3290	.atomic_write_len = PAGE_SIZE,
				3291	.write = cgroup_file_write,
				3292	.seq_start = cgroup_seqfile_start,
				3293	.seq_next = cgroup_seqfile_next,
				3294	.seq_stop = cgroup_seqfile_stop,
				3295	.seq_show = cgroup_seqfile_show,
				3296	};
				3297
				3298	/*
				3299	* cgroup_rename - Only allow simple rename of directories in place.
				3300	*/
				3301	static int cgroup_rename(struct kernfs_node kn, struct kernfs_node new_parent,
				3302	const char *new_name_str)
				3303	{
				3304	struct cgroup *cgrp = kn->priv;
				3305	int ret;
				3306
				3307	if (kernfs_type(kn) != KERNFS_DIR)
				3308	return -ENOTDIR;
				3309	if (kn->parent != new_parent)
				3310	return -EIO;
				3311
				3312	/*
				3313	* This isn't a proper migration and its usefulness is very
				3314	* limited. Disallow on the default hierarchy.
				3315	*/
				3316	if (cgroup_on_dfl(cgrp))
				3317	return -EPERM;
				3318
				3319	/*
				3320	* We're gonna grab cgroup_mutex which nests outside kernfs
				3321	* active_ref. kernfs_rename() doesn't require active_ref
				3322	* protection. Break them before grabbing cgroup_mutex.
				3323	*/
				3324	kernfs_break_active_protection(new_parent);
				3325	kernfs_break_active_protection(kn);
				3326
				3327	mutex_lock(&cgroup_mutex);
				3328
				3329	ret = kernfs_rename(kn, new_parent, new_name_str);
				3330
				3331	mutex_unlock(&cgroup_mutex);
				3332
				3333	kernfs_unbreak_active_protection(kn);
				3334	kernfs_unbreak_active_protection(new_parent);
				3335	return ret;
				3336	}
				3337
				3338	/* set uid and gid of cgroup dirs and files to that of the creator */
				3339	static int cgroup_kn_set_ugid(struct kernfs_node *kn)
				3340	{
				3341	struct iattr iattr = { .ia_valid = ATTR_UID \| ATTR_GID,
				3342	.ia_uid = current_fsuid(),
				3343	.ia_gid = current_fsgid(), };
				3344
				3345	if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
				3346	gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
				3347	return 0;
				3348
				3349	return kernfs_setattr(kn, &iattr);
				3350	}
				3351
				3352	static int cgroup_add_file(struct cgroup_subsys_state css, struct cgroup cgrp,
				3353	struct cftype *cft)
				3354	{
				3355	char name[CGROUP_FILE_NAME_MAX];
				3356	struct kernfs_node *kn;
				3357	struct lock_class_key *key = NULL;
				3358	int ret;
				3359
				3360	#ifdef CONFIG_DEBUG_LOCK_ALLOC
				3361	key = &cft->lockdep_key;
				3362	#endif
				3363	kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
				3364	cgroup_file_mode(cft), 0, cft->kf_ops, cft,
				3365	NULL, key);
				3366	if (IS_ERR(kn))
				3367	return PTR_ERR(kn);
				3368
				3369	ret = cgroup_kn_set_ugid(kn);
				3370	if (ret) {
				3371	kernfs_remove(kn);
				3372	return ret;
				3373	}
				3374
				3375	if (cft->file_offset) {
				3376	struct cgroup_file cfile = (void )css + cft->file_offset;
				3377
				3378	spin_lock_irq(&cgroup_file_kn_lock);
				3379	cfile->kn = kn;
				3380	spin_unlock_irq(&cgroup_file_kn_lock);
				3381	}
				3382
				3383	return 0;
				3384	}
				3385
				3386	/**
				3387	* cgroup_addrm_files - add or remove files to a cgroup directory
				3388	* @css: the target css
				3389	* @cgrp: the target cgroup (usually css->cgroup)
				3390	* @cfts: array of cftypes to be added
				3391	* @is_add: whether to add or remove
				3392	*
				3393	* Depending on @is_add, add or remove files defined by @cfts on @cgrp.
				3394	* For removals, this function never fails.
				3395	*/
				3396	static int cgroup_addrm_files(struct cgroup_subsys_state *css,
				3397	struct cgroup *cgrp, struct cftype cfts[],
				3398	bool is_add)
				3399	{
				3400	struct cftype cft, cft_end = NULL;
				3401	int ret;
				3402
				3403	lockdep_assert_held(&cgroup_mutex);
				3404
				3405	restart:
				3406	for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) {
				3407	/* does cft->flags tell us to skip this file on @cgrp? */
				3408	if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
				3409	continue;
				3410	if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))
				3411	continue;
				3412	if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp))
				3413	continue;
				3414	if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp))
				3415	continue;
				3416
				3417	if (is_add) {
				3418	ret = cgroup_add_file(css, cgrp, cft);
				3419	if (ret) {
				3420	pr_warn("%s: failed to add %s, err=%d\n",
				3421	__func__, cft->name, ret);
				3422	cft_end = cft;
				3423	is_add = false;
				3424	goto restart;
				3425	}
				3426	} else {
				3427	cgroup_rm_file(cgrp, cft);
				3428	}
				3429	}
				3430	return 0;
				3431	}
				3432
				3433	static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
				3434	{
				3435	LIST_HEAD(pending);
				3436	struct cgroup_subsys *ss = cfts[0].ss;
				3437	struct cgroup *root = &ss->root->cgrp;
				3438	struct cgroup_subsys_state *css;
				3439	int ret = 0;
				3440
				3441	lockdep_assert_held(&cgroup_mutex);
				3442
				3443	/* add/rm files for all cgroups created before */
				3444	css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
				3445	struct cgroup *cgrp = css->cgroup;
				3446
				3447	if (cgroup_is_dead(cgrp))
				3448	continue;
				3449
				3450	ret = cgroup_addrm_files(css, cgrp, cfts, is_add);
				3451	if (ret)
				3452	break;
				3453	}
				3454
				3455	if (is_add && !ret)
				3456	kernfs_activate(root->kn);
				3457	return ret;
				3458	}
				3459
				3460	static void cgroup_exit_cftypes(struct cftype *cfts)
				3461	{
				3462	struct cftype *cft;
				3463
				3464	for (cft = cfts; cft->name[0] != '\0'; cft++) {
				3465	/* free copy for custom atomic_write_len, see init_cftypes() */
				3466	if (cft->max_write_len && cft->max_write_len != PAGE_SIZE)
				3467	kfree(cft->kf_ops);
				3468	cft->kf_ops = NULL;
				3469	cft->ss = NULL;
				3470
				3471	/* revert flags set by cgroup core while adding @cfts */
				3472	cft->flags &= ~(__CFTYPE_ONLY_ON_DFL \| __CFTYPE_NOT_ON_DFL);
				3473	}
				3474	}
				3475
				3476	static int cgroup_init_cftypes(struct cgroup_subsys ss, struct cftype cfts)
				3477	{
				3478	struct cftype *cft;
				3479
				3480	for (cft = cfts; cft->name[0] != '\0'; cft++) {
				3481	struct kernfs_ops *kf_ops;
				3482
				3483	WARN_ON(cft->ss \|\| cft->kf_ops);
				3484
				3485	if (cft->seq_start)
				3486	kf_ops = &cgroup_kf_ops;
				3487	else
				3488	kf_ops = &cgroup_kf_single_ops;
				3489
				3490	/*
				3491	* Ugh... if @cft wants a custom max_write_len, we need to
				3492	* make a copy of kf_ops to set its atomic_write_len.
				3493	*/
				3494	if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
				3495	kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
				3496	if (!kf_ops) {
				3497	cgroup_exit_cftypes(cfts);
				3498	return -ENOMEM;
				3499	}
				3500	kf_ops->atomic_write_len = cft->max_write_len;
				3501	}
				3502
				3503	cft->kf_ops = kf_ops;
				3504	cft->ss = ss;
				3505	}
				3506
				3507	return 0;
				3508	}
				3509
				3510	static int cgroup_rm_cftypes_locked(struct cftype *cfts)
				3511	{
				3512	lockdep_assert_held(&cgroup_mutex);
				3513
				3514	if (!cfts \|\| !cfts[0].ss)
				3515	return -ENOENT;
				3516
				3517	list_del(&cfts->node);
				3518	cgroup_apply_cftypes(cfts, false);
				3519	cgroup_exit_cftypes(cfts);
				3520	return 0;
				3521	}
				3522
				3523	/**
				3524	* cgroup_rm_cftypes - remove an array of cftypes from a subsystem
				3525	* @cfts: zero-length name terminated array of cftypes
				3526	*
				3527	* Unregister @cfts. Files described by @cfts are removed from all
				3528	* existing cgroups and all future cgroups won't have them either. This
				3529	* function can be called anytime whether @cfts' subsys is attached or not.
				3530	*
				3531	* Returns 0 on successful unregistration, -ENOENT if @cfts is not
				3532	* registered.
				3533	*/
				3534	int cgroup_rm_cftypes(struct cftype *cfts)
				3535	{
				3536	int ret;
				3537
				3538	mutex_lock(&cgroup_mutex);
				3539	ret = cgroup_rm_cftypes_locked(cfts);
				3540	mutex_unlock(&cgroup_mutex);
				3541	return ret;
				3542	}
				3543
				3544	/**
				3545	* cgroup_add_cftypes - add an array of cftypes to a subsystem
				3546	* @ss: target cgroup subsystem
				3547	* @cfts: zero-length name terminated array of cftypes
				3548	*
				3549	* Register @cfts to @ss. Files described by @cfts are created for all
				3550	* existing cgroups to which @ss is attached and all future cgroups will
				3551	* have them too. This function can be called anytime whether @ss is
				3552	* attached or not.
				3553	*
				3554	* Returns 0 on successful registration, -errno on failure. Note that this
				3555	* function currently returns 0 as long as @cfts registration is successful
				3556	* even if some file creation attempts on existing cgroups fail.
				3557	*/
				3558	static int cgroup_add_cftypes(struct cgroup_subsys ss, struct cftype cfts)
				3559	{
				3560	int ret;
				3561
				3562	if (!cgroup_ssid_enabled(ss->id))
				3563	return 0;
				3564
				3565	if (!cfts \|\| cfts[0].name[0] == '\0')
				3566	return 0;
				3567
				3568	ret = cgroup_init_cftypes(ss, cfts);
				3569	if (ret)
				3570	return ret;
				3571
				3572	mutex_lock(&cgroup_mutex);
				3573
				3574	list_add_tail(&cfts->node, &ss->cfts);
				3575	ret = cgroup_apply_cftypes(cfts, true);
				3576	if (ret)
				3577	cgroup_rm_cftypes_locked(cfts);
				3578
				3579	mutex_unlock(&cgroup_mutex);
				3580	return ret;
				3581	}
				3582
				3583	/**
				3584	* cgroup_add_dfl_cftypes - add an array of cftypes for default hierarchy
				3585	* @ss: target cgroup subsystem
				3586	* @cfts: zero-length name terminated array of cftypes
				3587	*
				3588	* Similar to cgroup_add_cftypes() but the added files are only used for
				3589	* the default hierarchy.
				3590	*/
				3591	int cgroup_add_dfl_cftypes(struct cgroup_subsys ss, struct cftype cfts)
				3592	{
				3593	struct cftype *cft;
				3594
				3595	for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
				3596	cft->flags \|= __CFTYPE_ONLY_ON_DFL;
				3597	return cgroup_add_cftypes(ss, cfts);
				3598	}
				3599
				3600	/**
				3601	* cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies
				3602	* @ss: target cgroup subsystem
				3603	* @cfts: zero-length name terminated array of cftypes
				3604	*
				3605	* Similar to cgroup_add_cftypes() but the added files are only used for
				3606	* the legacy hierarchies.
				3607	*/
				3608	int cgroup_add_legacy_cftypes(struct cgroup_subsys ss, struct cftype cfts)
				3609	{
				3610	struct cftype *cft;
				3611
				3612	for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
				3613	cft->flags \|= __CFTYPE_NOT_ON_DFL;
				3614	return cgroup_add_cftypes(ss, cfts);
				3615	}
				3616
				3617	/**
				3618	* cgroup_file_notify - generate a file modified event for a cgroup_file
				3619	* @cfile: target cgroup_file
				3620	*
				3621	* @cfile must have been obtained by setting cftype->file_offset.
				3622	*/
				3623	void cgroup_file_notify(struct cgroup_file *cfile)
				3624	{
				3625	unsigned long flags;
				3626
				3627	spin_lock_irqsave(&cgroup_file_kn_lock, flags);
				3628	if (cfile->kn)
				3629	kernfs_notify(cfile->kn);
				3630	spin_unlock_irqrestore(&cgroup_file_kn_lock, flags);
				3631	}
				3632
				3633	/**
				3634	* cgroup_task_count - count the number of tasks in a cgroup.
				3635	* @cgrp: the cgroup in question
				3636	*
				3637	* Return the number of tasks in the cgroup.
				3638	*/
				3639	static int cgroup_task_count(const struct cgroup *cgrp)
				3640	{
				3641	int count = 0;
				3642	struct cgrp_cset_link *link;
				3643
				3644	spin_lock_bh(&css_set_lock);
				3645	list_for_each_entry(link, &cgrp->cset_links, cset_link)
				3646	count += atomic_read(&link->cset->refcount);
				3647	spin_unlock_bh(&css_set_lock);
				3648	return count;
				3649	}
				3650
				3651	/**
				3652	* css_next_child - find the next child of a given css
				3653	* @pos: the current position (%NULL to initiate traversal)
				3654	* @parent: css whose children to walk
				3655	*
				3656	* This function returns the next child of @parent and should be called
				3657	* under either cgroup_mutex or RCU read lock. The only requirement is
				3658	* that @parent and @pos are accessible. The next sibling is guaranteed to
				3659	* be returned regardless of their states.
				3660	*
				3661	* If a subsystem synchronizes ->css_online() and the start of iteration, a
				3662	* css which finished ->css_online() is guaranteed to be visible in the
				3663	* future iterations and will stay visible until the last reference is put.
				3664	* A css which hasn't finished ->css_online() or already finished
				3665	* ->css_offline() may show up during traversal. It's each subsystem's
				3666	* responsibility to synchronize against on/offlining.
				3667	*/
				3668	struct cgroup_subsys_state css_next_child(struct cgroup_subsys_state pos,
				3669	struct cgroup_subsys_state *parent)
				3670	{
				3671	struct cgroup_subsys_state *next;
				3672
				3673	cgroup_assert_mutex_or_rcu_locked();
				3674
				3675	/*
				3676	* @pos could already have been unlinked from the sibling list.
				3677	* Once a cgroup is removed, its ->sibling.next is no longer
				3678	* updated when its next sibling changes. CSS_RELEASED is set when
				3679	* @pos is taken off list, at which time its next pointer is valid,
				3680	* and, as releases are serialized, the one pointed to by the next
				3681	* pointer is guaranteed to not have started release yet. This
				3682	* implies that if we observe !CSS_RELEASED on @pos in this RCU
				3683	* critical section, the one pointed to by its next pointer is
				3684	* guaranteed to not have finished its RCU grace period even if we
				3685	* have dropped rcu_read_lock() inbetween iterations.
				3686	*
				3687	* If @pos has CSS_RELEASED set, its next pointer can't be
				3688	* dereferenced; however, as each css is given a monotonically
				3689	* increasing unique serial number and always appended to the
				3690	* sibling list, the next one can be found by walking the parent's
				3691	* children until the first css with higher serial number than
				3692	* @pos's. While this path can be slower, it happens iff iteration
				3693	* races against release and the race window is very small.
				3694	*/
				3695	if (!pos) {
				3696	next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling);
				3697	} else if (likely(!(pos->flags & CSS_RELEASED))) {
				3698	next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling);
				3699	} else {
				3700	list_for_each_entry_rcu(next, &parent->children, sibling)
				3701	if (next->serial_nr > pos->serial_nr)
				3702	break;
				3703	}
				3704
				3705	/*
				3706	* @next, if not pointing to the head, can be dereferenced and is
				3707	* the next sibling.
				3708	*/
				3709	if (&next->sibling != &parent->children)
				3710	return next;
				3711	return NULL;
				3712	}
				3713
				3714	/**
				3715	* css_next_descendant_pre - find the next descendant for pre-order walk
				3716	* @pos: the current position (%NULL to initiate traversal)
				3717	* @root: css whose descendants to walk
				3718	*
				3719	* To be used by css_for_each_descendant_pre(). Find the next descendant
				3720	* to visit for pre-order traversal of @root's descendants. @root is
				3721	* included in the iteration and the first node to be visited.
				3722	*
				3723	* While this function requires cgroup_mutex or RCU read locking, it
				3724	* doesn't require the whole traversal to be contained in a single critical
				3725	* section. This function will return the correct next descendant as long
				3726	* as both @pos and @root are accessible and @pos is a descendant of @root.
				3727	*
				3728	* If a subsystem synchronizes ->css_online() and the start of iteration, a
				3729	* css which finished ->css_online() is guaranteed to be visible in the
				3730	* future iterations and will stay visible until the last reference is put.
				3731	* A css which hasn't finished ->css_online() or already finished
				3732	* ->css_offline() may show up during traversal. It's each subsystem's
				3733	* responsibility to synchronize against on/offlining.
				3734	*/
				3735	struct cgroup_subsys_state *
				3736	css_next_descendant_pre(struct cgroup_subsys_state *pos,
				3737	struct cgroup_subsys_state *root)
				3738	{
				3739	struct cgroup_subsys_state *next;
				3740
				3741	cgroup_assert_mutex_or_rcu_locked();
				3742
				3743	/* if first iteration, visit @root */
				3744	if (!pos)
				3745	return root;
				3746
				3747	/* visit the first child if exists */
				3748	next = css_next_child(NULL, pos);
				3749	if (next)
				3750	return next;
				3751
				3752	/* no child, visit my or the closest ancestor's next sibling */
				3753	while (pos != root) {
				3754	next = css_next_child(pos, pos->parent);
				3755	if (next)
				3756	return next;
				3757	pos = pos->parent;
				3758	}
				3759
				3760	return NULL;
				3761	}
				3762
				3763	/**
				3764	* css_rightmost_descendant - return the rightmost descendant of a css
				3765	* @pos: css of interest
				3766	*
				3767	* Return the rightmost descendant of @pos. If there's no descendant, @pos
				3768	* is returned. This can be used during pre-order traversal to skip
				3769	* subtree of @pos.
				3770	*
				3771	* While this function requires cgroup_mutex or RCU read locking, it
				3772	* doesn't require the whole traversal to be contained in a single critical
				3773	* section. This function will return the correct rightmost descendant as
				3774	* long as @pos is accessible.
				3775	*/
				3776	struct cgroup_subsys_state *
				3777	css_rightmost_descendant(struct cgroup_subsys_state *pos)
				3778	{
				3779	struct cgroup_subsys_state last, tmp;
				3780
				3781	cgroup_assert_mutex_or_rcu_locked();
				3782
				3783	do {
				3784	last = pos;
				3785	/* ->prev isn't RCU safe, walk ->next till the end */
				3786	pos = NULL;
				3787	css_for_each_child(tmp, last)
				3788	pos = tmp;
				3789	} while (pos);
				3790
				3791	return last;
				3792	}
				3793
				3794	static struct cgroup_subsys_state *
				3795	css_leftmost_descendant(struct cgroup_subsys_state *pos)
				3796	{
				3797	struct cgroup_subsys_state *last;
				3798
				3799	do {
				3800	last = pos;
				3801	pos = css_next_child(NULL, pos);
				3802	} while (pos);
				3803
				3804	return last;
				3805	}
				3806
				3807	/**
				3808	* css_next_descendant_post - find the next descendant for post-order walk
				3809	* @pos: the current position (%NULL to initiate traversal)
				3810	* @root: css whose descendants to walk
				3811	*
				3812	* To be used by css_for_each_descendant_post(). Find the next descendant
				3813	* to visit for post-order traversal of @root's descendants. @root is
				3814	* included in the iteration and the last node to be visited.
				3815	*
				3816	* While this function requires cgroup_mutex or RCU read locking, it
				3817	* doesn't require the whole traversal to be contained in a single critical
				3818	* section. This function will return the correct next descendant as long
				3819	* as both @pos and @cgroup are accessible and @pos is a descendant of
				3820	* @cgroup.
				3821	*
				3822	* If a subsystem synchronizes ->css_online() and the start of iteration, a
				3823	* css which finished ->css_online() is guaranteed to be visible in the
				3824	* future iterations and will stay visible until the last reference is put.
				3825	* A css which hasn't finished ->css_online() or already finished
				3826	* ->css_offline() may show up during traversal. It's each subsystem's
				3827	* responsibility to synchronize against on/offlining.
				3828	*/
				3829	struct cgroup_subsys_state *
				3830	css_next_descendant_post(struct cgroup_subsys_state *pos,
				3831	struct cgroup_subsys_state *root)
				3832	{
				3833	struct cgroup_subsys_state *next;
				3834
				3835	cgroup_assert_mutex_or_rcu_locked();
				3836
				3837	/* if first iteration, visit leftmost descendant which may be @root */
				3838	if (!pos)
				3839	return css_leftmost_descendant(root);
				3840
				3841	/* if we visited @root, we're done */
				3842	if (pos == root)
				3843	return NULL;
				3844
				3845	/* if there's an unvisited sibling, visit its leftmost descendant */
				3846	next = css_next_child(pos, pos->parent);
				3847	if (next)
				3848	return css_leftmost_descendant(next);
				3849
				3850	/* no sibling left, visit parent */
				3851	return pos->parent;
				3852	}
				3853
				3854	/**
				3855	* css_has_online_children - does a css have online children
				3856	* @css: the target css
				3857	*
				3858	* Returns %true if @css has any online children; otherwise, %false. This
				3859	* function can be called from any context but the caller is responsible
				3860	* for synchronizing against on/offlining as necessary.
				3861	*/
				3862	bool css_has_online_children(struct cgroup_subsys_state *css)
				3863	{
				3864	struct cgroup_subsys_state *child;
				3865	bool ret = false;
				3866
				3867	rcu_read_lock();
				3868	css_for_each_child(child, css) {
				3869	if (child->flags & CSS_ONLINE) {
				3870	ret = true;
				3871	break;
				3872	}
				3873	}
				3874	rcu_read_unlock();
				3875	return ret;
				3876	}
				3877
				3878	/**
				3879	* css_task_iter_advance_css_set - advance a task itererator to the next css_set
				3880	* @it: the iterator to advance
				3881	*
				3882	* Advance @it to the next css_set to walk.
				3883	*/
				3884	static void css_task_iter_advance_css_set(struct css_task_iter *it)
				3885	{
				3886	struct list_head *l = it->cset_pos;
				3887	struct cgrp_cset_link *link;
				3888	struct css_set *cset;
				3889
				3890	lockdep_assert_held(&css_set_lock);
				3891
				3892	/* Advance to the next non-empty css_set */
				3893	do {
				3894	l = l->next;
				3895	if (l == it->cset_head) {
				3896	it->cset_pos = NULL;
				3897	it->task_pos = NULL;
				3898	return;
				3899	}
				3900
				3901	if (it->ss) {
				3902	cset = container_of(l, struct css_set,
				3903	e_cset_node[it->ss->id]);
				3904	} else {
				3905	link = list_entry(l, struct cgrp_cset_link, cset_link);
				3906	cset = link->cset;
				3907	}
				3908	} while (!css_set_populated(cset));
				3909
				3910	it->cset_pos = l;
				3911
				3912	if (!list_empty(&cset->tasks))
				3913	it->task_pos = cset->tasks.next;
				3914	else
				3915	it->task_pos = cset->mg_tasks.next;
				3916
				3917	it->tasks_head = &cset->tasks;
				3918	it->mg_tasks_head = &cset->mg_tasks;
				3919
				3920	/*
				3921	* We don't keep css_sets locked across iteration steps and thus
				3922	* need to take steps to ensure that iteration can be resumed after
				3923	* the lock is re-acquired. Iteration is performed at two levels -
				3924	* css_sets and tasks in them.
				3925	*
				3926	* Once created, a css_set never leaves its cgroup lists, so a
				3927	* pinned css_set is guaranteed to stay put and we can resume
				3928	* iteration afterwards.
				3929	*
				3930	* Tasks may leave @cset across iteration steps. This is resolved
				3931	* by registering each iterator with the css_set currently being
				3932	* walked and making css_set_move_task() advance iterators whose
				3933	* next task is leaving.
				3934	*/
				3935	if (it->cur_cset) {
				3936	list_del(&it->iters_node);
				3937	put_css_set_locked(it->cur_cset);
				3938	}
				3939	get_css_set(cset);
				3940	it->cur_cset = cset;
				3941	list_add(&it->iters_node, &cset->task_iters);
				3942	}
				3943
				3944	static void css_task_iter_advance(struct css_task_iter *it)
				3945	{
				3946	struct list_head *l = it->task_pos;
				3947
				3948	lockdep_assert_held(&css_set_lock);
				3949	WARN_ON_ONCE(!l);
				3950
				3951	/*
				3952	* Advance iterator to find next entry. cset->tasks is consumed
				3953	* first and then ->mg_tasks. After ->mg_tasks, we move onto the
				3954	* next cset.
				3955	*/
				3956	l = l->next;
				3957
				3958	if (l == it->tasks_head)
				3959	l = it->mg_tasks_head->next;
				3960
				3961	if (l == it->mg_tasks_head)
				3962	css_task_iter_advance_css_set(it);
				3963	else
				3964	it->task_pos = l;
				3965	}
				3966
				3967	/**
				3968	* css_task_iter_start - initiate task iteration
				3969	* @css: the css to walk tasks of
				3970	* @it: the task iterator to use
				3971	*
				3972	* Initiate iteration through the tasks of @css. The caller can call
				3973	* css_task_iter_next() to walk through the tasks until the function
				3974	* returns NULL. On completion of iteration, css_task_iter_end() must be
				3975	* called.
				3976	*/
				3977	void css_task_iter_start(struct cgroup_subsys_state *css,
				3978	struct css_task_iter *it)
				3979	{
				3980	/* no one should try to iterate before mounting cgroups */
				3981	WARN_ON_ONCE(!use_task_css_set_links);
				3982
				3983	memset(it, 0, sizeof(*it));
				3984
				3985	spin_lock_bh(&css_set_lock);
				3986
				3987	it->ss = css->ss;
				3988
				3989	if (it->ss)
				3990	it->cset_pos = &css->cgroup->e_csets[css->ss->id];
				3991	else
				3992	it->cset_pos = &css->cgroup->cset_links;
				3993
				3994	it->cset_head = it->cset_pos;
				3995
				3996	css_task_iter_advance_css_set(it);
				3997
				3998	spin_unlock_bh(&css_set_lock);
				3999	}
				4000
				4001	/**
				4002	* css_task_iter_next - return the next task for the iterator
				4003	* @it: the task iterator being iterated
				4004	*
				4005	* The "next" function for task iteration. @it should have been
				4006	* initialized via css_task_iter_start(). Returns NULL when the iteration
				4007	* reaches the end.
				4008	*/
				4009	struct task_struct css_task_iter_next(struct css_task_iter it)
				4010	{
				4011	if (it->cur_task) {
				4012	put_task_struct(it->cur_task);
				4013	it->cur_task = NULL;
				4014	}
				4015
				4016	spin_lock_bh(&css_set_lock);
				4017
				4018	if (it->task_pos) {
				4019	it->cur_task = list_entry(it->task_pos, struct task_struct,
				4020	cg_list);
				4021	get_task_struct(it->cur_task);
				4022	css_task_iter_advance(it);
				4023	}
				4024
				4025	spin_unlock_bh(&css_set_lock);
				4026
				4027	return it->cur_task;
				4028	}
				4029
				4030	/**
				4031	* css_task_iter_end - finish task iteration
				4032	* @it: the task iterator to finish
				4033	*
				4034	* Finish task iteration started by css_task_iter_start().
				4035	*/
				4036	void css_task_iter_end(struct css_task_iter *it)
				4037	{
				4038	if (it->cur_cset) {
				4039	spin_lock_bh(&css_set_lock);
				4040	list_del(&it->iters_node);
				4041	put_css_set_locked(it->cur_cset);
				4042	spin_unlock_bh(&css_set_lock);
				4043	}
				4044
				4045	if (it->cur_task)
				4046	put_task_struct(it->cur_task);
				4047	}
				4048
				4049	/**
				4050	* cgroup_trasnsfer_tasks - move tasks from one cgroup to another
				4051	* @to: cgroup to which the tasks will be moved
				4052	* @from: cgroup in which the tasks currently reside
				4053	*
				4054	* Locking rules between cgroup_post_fork() and the migration path
				4055	* guarantee that, if a task is forking while being migrated, the new child
				4056	* is guaranteed to be either visible in the source cgroup after the
				4057	* parent's migration is complete or put into the target cgroup. No task
				4058	* can slip out of migration through forking.
				4059	*/
				4060	int cgroup_transfer_tasks(struct cgroup to, struct cgroup from)
				4061	{
				4062	LIST_HEAD(preloaded_csets);
				4063	struct cgrp_cset_link *link;
				4064	struct css_task_iter it;
				4065	struct task_struct *task;
				4066	int ret;
				4067
				4068	mutex_lock(&cgroup_mutex);
				4069
				4070	/* all tasks in @from are being moved, all csets are source */
				4071	spin_lock_bh(&css_set_lock);
				4072	list_for_each_entry(link, &from->cset_links, cset_link)
				4073	cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
				4074	spin_unlock_bh(&css_set_lock);
				4075
				4076	ret = cgroup_migrate_prepare_dst(to, &preloaded_csets);
				4077	if (ret)
				4078	goto out_err;
				4079
				4080	/*
				4081	* Migrate tasks one-by-one until @form is empty. This fails iff
				4082	* ->can_attach() fails.
				4083	*/
				4084	do {
				4085	css_task_iter_start(&from->self, &it);
				4086	task = css_task_iter_next(&it);
				4087	if (task)
				4088	get_task_struct(task);
				4089	css_task_iter_end(&it);
				4090
				4091	if (task) {
				4092	ret = cgroup_migrate(task, false, to);
				4093	put_task_struct(task);
				4094	}
				4095	} while (task && !ret);
				4096	out_err:
				4097	cgroup_migrate_finish(&preloaded_csets);
				4098	mutex_unlock(&cgroup_mutex);
				4099	return ret;
				4100	}
				4101
				4102	/*
				4103	* Stuff for reading the 'tasks'/'procs' files.
				4104	*
				4105	* Reading this file can return large amounts of data if a cgroup has
				4106	* lots of attached tasks. So it may need several calls to read(),
				4107	* but we cannot guarantee that the information we produce is correct
				4108	* unless we produce it entirely atomically.
				4109	*
				4110	*/
				4111
				/* Selects which of the two pid listing files a pidlist belongs to. */
				enum cgroup_filetype {
					CGROUP_FILE_PROCS,	/* listing filled with tgids */
					CGROUP_FILE_TASKS,	/* listing filled with pids */
				};
				4117
				4118	/*
				4119	* A pidlist is a list of pids that virtually represents the contents of one
				4120	* of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
				4121	* a pair (one each for procs, tasks) for each pid namespace that's relevant
				4122	* to the cgroup.
				4123	*/
				4124	struct cgroup_pidlist {
				4125	/*
				4126	* used to find which pidlist is wanted. doesn't change as long as
				4127	* this particular list stays in the list.
				4128	*/
				4129	struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
				4130	/* array of xids */
				4131	pid_t *list;
				4132	/* how many elements the above list has */
				4133	int length;
				4134	/* each of these stored in a list by its cgroup */
				4135	struct list_head links;
				4136	/* pointer to the cgroup we belong to, for list removal purposes */
				4137	struct cgroup *owner;
				4138	/* for delayed destruction */
				4139	struct delayed_work destroy_dwork;
				4140	};
				4141
				4142	/*
				4143	* The following two functions "fix" the issue where there are more pids
				4144	* than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
				4145	* TODO: replace with a kernel-wide solution to this problem
				4146	*/
				4147	#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
				4148	static void *pidlist_allocate(int count)
				4149	{
				4150	if (PIDLIST_TOO_LARGE(count))
				4151	return vmalloc(count * sizeof(pid_t));
				4152	else
				4153	return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
				4154	}
				4155
				/* Release a pid array; kvfree() is used for both allocation flavours. */
				static void pidlist_free(void *p)
				{
					kvfree(p);
				}
				4160
				4161	/*
				4162	* Used to destroy all pidlists lingering waiting for destroy timer. None
				4163	* should be left afterwards.
				4164	*/
				4165	static void cgroup_pidlist_destroy_all(struct cgroup *cgrp)
				4166	{
				4167	struct cgroup_pidlist l, tmp_l;
				4168
				4169	mutex_lock(&cgrp->pidlist_mutex);
				4170	list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
				4171	mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
				4172	mutex_unlock(&cgrp->pidlist_mutex);
				4173
				4174	flush_workqueue(cgroup_pidlist_destroy_wq);
				4175	BUG_ON(!list_empty(&cgrp->pidlists));
				4176	}
				4177
				4178	static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
				4179	{
				4180	struct delayed_work *dwork = to_delayed_work(work);
				4181	struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
				4182	destroy_dwork);
				4183	struct cgroup_pidlist *tofree = NULL;
				4184
				4185	mutex_lock(&l->owner->pidlist_mutex);
				4186
				4187	/*
				4188	* Destroy iff we didn't get queued again. The state won't change
				4189	* as destroy_dwork can only be queued while locked.
				4190	*/
				4191	if (!delayed_work_pending(dwork)) {
				4192	list_del(&l->links);
				4193	pidlist_free(l->list);
				4194	put_pid_ns(l->key.ns);
				4195	tofree = l;
				4196	}
				4197
				4198	mutex_unlock(&l->owner->pidlist_mutex);
				4199	kfree(tofree);
				4200	}
				4201
				/*
				 * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
				 * @list: pid array in which equal entries are adjacent (i.e. sorted)
				 * @length: number of entries in @list
				 *
				 * Compacts @list in place so that each run of equal neighbours is reduced
				 * to a single entry.
				 *
				 * Returns the number of unique elements.
				 */
				static int pidlist_uniq(pid_t *list, int length)
				{
					int src, dest = 1;

					/*
					 * The 0th element is always kept, so both cursors start at 1.
					 * Trivial edge cases first; no work needs to be done for either.
					 */
					if (length == 0 || length == 1)
						return length;

					/* src scans the list; dest is where the next unique element goes */
					for (src = 1; src < length; src++) {
						/* same as the predecessor: a duplicate, skip it */
						if (list[src] == list[src - 1])
							continue;
						list[dest++] = list[src];
					}

					return dest;
				}
				4231
				/*
				 * The two pid files - task and cgroup.procs - guaranteed that the result
				 * is sorted, which forced this whole pidlist fiasco.  As pid order is
				 * different per namespace, each namespace needs differently sorted list,
				 * making it impossible to use, for example, single rbtree of member tasks
				 * sorted by task pointer.  As pidlists can be fairly large, allocating one
				 * per open file is dangerous, so cgroup had to implement shared pool of
				 * pidlists keyed by cgroup and namespace.
				 *
				 * All this extra complexity was caused by the original implementation
				 * committing to an entirely unnecessary property.  In the long term, we
				 * want to do away with it.  Explicitly scramble sort order if on the
				 * default hierarchy so that no such expectation exists in the new
				 * interface.
				 *
				 * Scrambling is done by swapping every two consecutive bits, which is
				 * non-identity one-to-one mapping which disturbs sort order sufficiently.
				 */
				static pid_t pid_fry(pid_t pid)
				{
					unsigned int a = pid & 0x55555555;	/* even-numbered bits */
					unsigned int b = pid & 0xAAAAAAAA;	/* odd-numbered bits */

					/* move the even bits up by one and the odd bits down by one */
					return (a << 1) | (b >> 1);
				}
				4257
				4258	static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
				4259	{
				4260	if (cgroup_on_dfl(cgrp))
				4261	return pid_fry(pid);
				4262	else
				4263	return pid;
				4264	}
				4265
				/*
				 * sort() comparator ordering pids numerically.  @a and @b point at pid_t
				 * elements.  Returns a negative, zero or positive value; written as a
				 * three-way comparison so that the result cannot overflow.
				 */
				static int cmppid(const void *a, const void *b)
				{
					pid_t lhs = *(const pid_t *)a;
					pid_t rhs = *(const pid_t *)b;

					return (lhs > rhs) - (lhs < rhs);
				}
				4270
				/*
				 * sort() comparator ordering pids by their pid_fry()-scrambled value.  @a
				 * and @b point at pid_t elements.  Returns a negative, zero or positive
				 * value; written as a three-way comparison so that the result cannot
				 * overflow.
				 */
				static int fried_cmppid(const void *a, const void *b)
				{
					pid_t lhs = pid_fry(*(const pid_t *)a);
					pid_t rhs = pid_fry(*(const pid_t *)b);

					return (lhs > rhs) - (lhs < rhs);
				}
				4275
				4276	static struct cgroup_pidlist cgroup_pidlist_find(struct cgroup cgrp,
				4277	enum cgroup_filetype type)
				4278	{
				4279	struct cgroup_pidlist *l;
				4280	/* don't need task_nsproxy() if we're looking at ourself */
				4281	struct pid_namespace *ns = task_active_pid_ns(current);
				4282
				4283	lockdep_assert_held(&cgrp->pidlist_mutex);
				4284
				4285	list_for_each_entry(l, &cgrp->pidlists, links)
				4286	if (l->key.type == type && l->key.ns == ns)
				4287	return l;
				4288	return NULL;
				4289	}
				4290
				4291	/*
				4292	* find the appropriate pidlist for our purpose (given procs vs tasks)
				4293	* returns with the lock on that pidlist already held, and takes care
				4294	* of the use count, or returns NULL with no locks held if we're out of
				4295	* memory.
				4296	*/
				4297	static struct cgroup_pidlist cgroup_pidlist_find_create(struct cgroup cgrp,
				4298	enum cgroup_filetype type)
				4299	{
				4300	struct cgroup_pidlist *l;
				4301
				4302	lockdep_assert_held(&cgrp->pidlist_mutex);
				4303
				4304	l = cgroup_pidlist_find(cgrp, type);
				4305	if (l)
				4306	return l;
				4307
				4308	/* entry not found; create a new one */
				4309	l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
				4310	if (!l)
				4311	return l;
				4312
				4313	INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
				4314	l->key.type = type;
				4315	/* don't need task_nsproxy() if we're looking at ourself */
				4316	l->key.ns = get_pid_ns(task_active_pid_ns(current));
				4317	l->owner = cgrp;
				4318	list_add(&l->links, &cgrp->pidlists);
				4319	return l;
				4320	}
				4321
				4322	/*
				4323	* Load a cgroup's pidarray with either procs' tgids or tasks' pids
				4324	*/
				4325	static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
				4326	struct cgroup_pidlist **lp)
				4327	{
				4328	pid_t *array;
				4329	int length;
				4330	int pid, n = 0; /* used for populating the array */
				4331	struct css_task_iter it;
				4332	struct task_struct *tsk;
				4333	struct cgroup_pidlist *l;
				4334
				4335	lockdep_assert_held(&cgrp->pidlist_mutex);
				4336
				4337	/*
				4338	* If cgroup gets more users after we read count, we won't have
				4339	* enough space - tough. This race is indistinguishable to the
				4340	* caller from the case that the additional cgroup users didn't
				4341	* show up until sometime later on.
				4342	*/
				4343	length = cgroup_task_count(cgrp);
				4344	array = pidlist_allocate(length);
				4345	if (!array)
				4346	return -ENOMEM;
				4347	/* now, populate the array */
				4348	css_task_iter_start(&cgrp->self, &it);
				4349	while ((tsk = css_task_iter_next(&it))) {
				4350	if (unlikely(n == length))
				4351	break;
				4352	/* get tgid or pid for procs or tasks file respectively */
				4353	if (type == CGROUP_FILE_PROCS)
				4354	pid = task_tgid_vnr(tsk);
				4355	else
				4356	pid = task_pid_vnr(tsk);
				4357	if (pid > 0) /* make sure to only use valid results */
				4358	array[n++] = pid;
				4359	}
				4360	css_task_iter_end(&it);
				4361	length = n;
				4362	/* now sort & (if procs) strip out duplicates */
				4363	if (cgroup_on_dfl(cgrp))
				4364	sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
				4365	else
				4366	sort(array, length, sizeof(pid_t), cmppid, NULL);
				4367	if (type == CGROUP_FILE_PROCS)
				4368	length = pidlist_uniq(array, length);
				4369
				4370	l = cgroup_pidlist_find_create(cgrp, type);
				4371	if (!l) {
				4372	pidlist_free(array);
				4373	return -ENOMEM;
				4374	}
				4375
				4376	/* store array, freeing old if necessary */
				4377	pidlist_free(l->list);
				4378	l->list = array;
				4379	l->length = length;
				4380	*lp = l;
				4381	return 0;
				4382	}
				4383
				4384	/**
				4385	* cgroupstats_build - build and fill cgroupstats
				4386	* @stats: cgroupstats to fill information into
				4387	* @dentry: A dentry entry belonging to the cgroup for which stats have
				4388	* been requested.
				4389	*
				4390	* Build and fill cgroupstats so that taskstats can export it to user
				4391	* space.
				4392	*/
				4393	int cgroupstats_build(struct cgroupstats stats, struct dentry dentry)
				4394	{
				4395	struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
				4396	struct cgroup *cgrp;
				4397	struct css_task_iter it;
				4398	struct task_struct *tsk;
				4399
				4400	/* it should be kernfs_node belonging to cgroupfs and is a directory */
				4401	if (dentry->d_sb->s_type != &cgroup_fs_type \|\| !kn \|\|
				4402	kernfs_type(kn) != KERNFS_DIR)
				4403	return -EINVAL;
				4404
				4405	mutex_lock(&cgroup_mutex);
				4406
				4407	/*
				4408	* We aren't being called from kernfs and there's no guarantee on
				4409	* @kn->priv's validity. For this and css_tryget_online_from_dir(),
				4410	* @kn->priv is RCU safe. Let's do the RCU dancing.
				4411	*/
				4412	rcu_read_lock();
				4413	cgrp = rcu_dereference(kn->priv);
				4414	if (!cgrp \|\| cgroup_is_dead(cgrp)) {
				4415	rcu_read_unlock();
				4416	mutex_unlock(&cgroup_mutex);
				4417	return -ENOENT;
				4418	}
				4419	rcu_read_unlock();
				4420
				4421	css_task_iter_start(&cgrp->self, &it);
				4422	while ((tsk = css_task_iter_next(&it))) {
				4423	switch (tsk->state) {
				4424	case TASK_RUNNING:
				4425	stats->nr_running++;
				4426	break;
				4427	case TASK_INTERRUPTIBLE:
				4428	stats->nr_sleeping++;
				4429	break;
				4430	case TASK_UNINTERRUPTIBLE:
				4431	stats->nr_uninterruptible++;
				4432	break;
				4433	case TASK_STOPPED:
				4434	stats->nr_stopped++;
				4435	break;
				4436	default:
				4437	if (delayacct_is_task_waiting_on_io(tsk))
				4438	stats->nr_io_wait++;
				4439	break;
				4440	}
				4441	}
				4442	css_task_iter_end(&it);
				4443
				4444	mutex_unlock(&cgroup_mutex);
				4445	return 0;
				4446	}
				4447
				4448
				4449	/*
				4450	* seq_file methods for the tasks/procs files. The seq_file position is the
				4451	* next pid to display; the seq_file iterator is a pointer to the pid
				4452	* in the cgroup->l->list array.
				4453	*/
				4454
				4455	static void cgroup_pidlist_start(struct seq_file s, loff_t *pos)
				4456	{
				4457	/*
				4458	* Initially we receive a position value that corresponds to
				4459	* one more than the last pid shown (or 0 on the first call or
				4460	* after a seek to the start). Use a binary-search to find the
				4461	* next pid to display, if any
				4462	*/
				4463	struct kernfs_open_file *of = s->private;
				4464	struct cgroup *cgrp = seq_css(s)->cgroup;
				4465	struct cgroup_pidlist *l;
				4466	enum cgroup_filetype type = seq_cft(s)->private;
				4467	int index = 0, pid = *pos;
				4468	int *iter, ret;
				4469
				4470	mutex_lock(&cgrp->pidlist_mutex);
				4471
				4472	/*
				4473	* !NULL @of->priv indicates that this isn't the first start()
				4474	* after open. If the matching pidlist is around, we can use that.
				4475	* Look for it. Note that @of->priv can't be used directly. It
				4476	* could already have been destroyed.
				4477	*/
				4478	if (of->priv)
				4479	of->priv = cgroup_pidlist_find(cgrp, type);
				4480
				4481	/*
				4482	* Either this is the first start() after open or the matching
				4483	* pidlist has been destroyed inbetween. Create a new one.
				4484	*/
				4485	if (!of->priv) {
				4486	ret = pidlist_array_load(cgrp, type,
				4487	(struct cgroup_pidlist **)&of->priv);
				4488	if (ret)
				4489	return ERR_PTR(ret);
				4490	}
				4491	l = of->priv;
				4492
				4493	if (pid) {
				4494	int end = l->length;
				4495
				4496	while (index < end) {
				4497	int mid = (index + end) / 2;
				4498	if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) {
				4499	index = mid;
				4500	break;
				4501	} else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid)
				4502	index = mid + 1;
				4503	else
				4504	end = mid;
				4505	}
				4506	}
				4507	/* If we're off the end of the array, we're done */
				4508	if (index >= l->length)
				4509	return NULL;
				4510	/* Update the abstract position to be the actual pid that we found */
				4511	iter = l->list + index;
				4512	pos = cgroup_pid_fry(cgrp, iter);
				4513	return iter;
				4514	}
				4515
				4516	static void cgroup_pidlist_stop(struct seq_file s, void v)
				4517	{
				4518	struct kernfs_open_file *of = s->private;
				4519	struct cgroup_pidlist *l = of->priv;
				4520
				4521	if (l)
				4522	mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
				4523	CGROUP_PIDLIST_DESTROY_DELAY);
				4524	mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
				4525	}
				4526
				4527	static void cgroup_pidlist_next(struct seq_file s, void v, loff_t pos)
				4528	{
				4529	struct kernfs_open_file *of = s->private;
				4530	struct cgroup_pidlist *l = of->priv;
				4531	pid_t *p = v;
				4532	pid_t *end = l->list + l->length;
				4533	/*
				4534	* Advance to the next pid in the array. If this goes off the
				4535	* end, we're done
				4536	*/
				4537	p++;
				4538	if (p >= end) {
				4539	return NULL;
				4540	} else {
				4541	pos = cgroup_pid_fry(seq_css(s)->cgroup, p);
				4542	return p;
				4543	}
				4544	}
				4545
				/*
				 * seq_file ->show() for the tasks/procs files: print the pid @v points at,
				 * followed by a newline.  Always returns 0.
				 */
				static int cgroup_pidlist_show(struct seq_file *s, void *v)
				{
					seq_printf(s, "%d\n", *(int *)v);

					return 0;
				}
				4552
				4553	static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
				4554	struct cftype *cft)
				4555	{
				4556	return notify_on_release(css->cgroup);
				4557	}
				4558
				4559	static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
				4560	struct cftype *cft, u64 val)
				4561	{
				4562	if (val)
				4563	set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
				4564	else
				4565	clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
				4566	return 0;
				4567	}
				4568
				4569	static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
				4570	struct cftype *cft)
				4571	{
				4572	return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
				4573	}
				4574
				4575	static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
				4576	struct cftype *cft, u64 val)
				4577	{
				4578	if (val)
				4579	set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
				4580	else
				4581	clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
				4582	return 0;
				4583	}
				4584
				4585	/* cgroup core interface files for the default hierarchy */
				4586	static struct cftype cgroup_dfl_base_files[] = {
				4587	{
				4588	.name = "cgroup.procs",
				4589	.file_offset = offsetof(struct cgroup, procs_file),
				4590	.seq_start = cgroup_pidlist_start,
				4591	.seq_next = cgroup_pidlist_next,
				4592	.seq_stop = cgroup_pidlist_stop,
				4593	.seq_show = cgroup_pidlist_show,
				4594	.private = CGROUP_FILE_PROCS,
				4595	.write = cgroup_procs_write,
				4596	},
				4597	{
				4598	.name = "cgroup.controllers",
				4599	.flags = CFTYPE_ONLY_ON_ROOT,
				4600	.seq_show = cgroup_root_controllers_show,
				4601	},
				4602	{
				4603	.name = "cgroup.controllers",
				4604	.flags = CFTYPE_NOT_ON_ROOT,
				4605	.seq_show = cgroup_controllers_show,
				4606	},
				4607	{
				4608	.name = "cgroup.subtree_control",
				4609	.seq_show = cgroup_subtree_control_show,
				4610	.write = cgroup_subtree_control_write,
				4611	},
				4612	{
				4613	.name = "cgroup.events",
				4614	.flags = CFTYPE_NOT_ON_ROOT,
				4615	.file_offset = offsetof(struct cgroup, events_file),
				4616	.seq_show = cgroup_events_show,
				4617	},
				4618	{ } /* terminate */
				4619	};
				4620
				4621	/* cgroup core interface files for the legacy hierarchies */
				4622	static struct cftype cgroup_legacy_base_files[] = {
				4623	{
				4624	.name = "cgroup.procs",
				4625	.seq_start = cgroup_pidlist_start,
				4626	.seq_next = cgroup_pidlist_next,
				4627	.seq_stop = cgroup_pidlist_stop,
				4628	.seq_show = cgroup_pidlist_show,
				4629	.private = CGROUP_FILE_PROCS,
				4630	.write = cgroup_procs_write,
				4631	},
				4632	{
				4633	.name = "cgroup.clone_children",
				4634	.read_u64 = cgroup_clone_children_read,
				4635	.write_u64 = cgroup_clone_children_write,
				4636	},
				4637	{
				4638	.name = "cgroup.sane_behavior",
				4639	.flags = CFTYPE_ONLY_ON_ROOT,
				4640	.seq_show = cgroup_sane_behavior_show,
				4641	},
				4642	{
				4643	.name = "tasks",
				4644	.seq_start = cgroup_pidlist_start,
				4645	.seq_next = cgroup_pidlist_next,
				4646	.seq_stop = cgroup_pidlist_stop,
				4647	.seq_show = cgroup_pidlist_show,
				4648	.private = CGROUP_FILE_TASKS,
				4649	.write = cgroup_tasks_write,
				4650	},
				4651	{
				4652	.name = "notify_on_release",
				4653	.read_u64 = cgroup_read_notify_on_release,
				4654	.write_u64 = cgroup_write_notify_on_release,
				4655	},
				4656	{
				4657	.name = "release_agent",
				4658	.flags = CFTYPE_ONLY_ON_ROOT,
				4659	.seq_show = cgroup_release_agent_show,
				4660	.write = cgroup_release_agent_write,
				4661	.max_write_len = PATH_MAX - 1,
				4662	},
				4663	{ } /* terminate */
				4664	};
				4665
				4666	/*
				4667	* css destruction is four-stage process.
				4668	*
				4669	* 1. Destruction starts. Killing of the percpu_ref is initiated.
				4670	* Implemented in kill_css().
				4671	*
				4672	* 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
				4673	* and thus css_tryget_online() is guaranteed to fail, the css can be
				4674	* offlined by invoking offline_css(). After offlining, the base ref is
				4675	* put. Implemented in css_killed_work_fn().
				4676	*
				4677	* 3. When the percpu_ref reaches zero, the only possible remaining
				4678	* accessors are inside RCU read sections. css_release() schedules the
				4679	* RCU callback.
				4680	*
				4681	* 4. After the grace period, the css can be freed. Implemented in
				4682	* css_free_work_fn().
				4683	*
				4684	* It is actually hairier because both step 2 and 4 require process context
				4685	* and thus involve punting to css->destroy_work adding two additional
				4686	* steps to the already complex sequence.
				4687	*/
				4688	static void css_free_work_fn(struct work_struct *work)
				4689	{
				4690	struct cgroup_subsys_state *css =
				4691	container_of(work, struct cgroup_subsys_state, destroy_work);
				4692	struct cgroup_subsys *ss = css->ss;
				4693	struct cgroup *cgrp = css->cgroup;
				4694
				4695	percpu_ref_exit(&css->refcnt);
				4696
				4697	if (ss) {
				4698	/* css free path */
				4699	struct cgroup_subsys_state *parent = css->parent;
				4700	int id = css->id;
				4701
				4702	ss->css_free(css);
				4703	cgroup_idr_remove(&ss->css_idr, id);
				4704	cgroup_put(cgrp);
				4705
				4706	if (parent)
				4707	css_put(parent);
				4708	} else {
				4709	/* cgroup free path */
				4710	atomic_dec(&cgrp->root->nr_cgrps);
				4711	cgroup_pidlist_destroy_all(cgrp);
				4712	cancel_work_sync(&cgrp->release_agent_work);
				4713
				4714	if (cgroup_parent(cgrp)) {
				4715	/*
				4716	* We get a ref to the parent, and put the ref when
				4717	* this cgroup is being freed, so it's guaranteed
				4718	* that the parent won't be destroyed before its
				4719	* children.
				4720	*/
				4721	cgroup_put(cgroup_parent(cgrp));
				4722	kernfs_put(cgrp->kn);
				4723	kfree(cgrp);
				4724	} else {
				4725	/*
				4726	* This is root cgroup's refcnt reaching zero,
				4727	* which indicates that the root should be
				4728	* released.
				4729	*/
				4730	cgroup_destroy_root(cgrp->root);
				4731	}
				4732	}
				4733	}
				4734
				4735	static void css_free_rcu_fn(struct rcu_head *rcu_head)
				4736	{
				4737	struct cgroup_subsys_state *css =
				4738	container_of(rcu_head, struct cgroup_subsys_state, rcu_head);
				4739
				4740	INIT_WORK(&css->destroy_work, css_free_work_fn);
				4741	queue_work(cgroup_destroy_wq, &css->destroy_work);
				4742	}
				4743
				4744	static void css_release_work_fn(struct work_struct *work)
				4745	{
				4746	struct cgroup_subsys_state *css =
				4747	container_of(work, struct cgroup_subsys_state, destroy_work);
				4748	struct cgroup_subsys *ss = css->ss;
				4749	struct cgroup *cgrp = css->cgroup;
				4750
				4751	mutex_lock(&cgroup_mutex);
				4752
				4753	css->flags \|= CSS_RELEASED;
				4754	list_del_rcu(&css->sibling);
				4755
				4756	if (ss) {
				4757	/* css release path */
				4758	cgroup_idr_replace(&ss->css_idr, NULL, css->id);
				4759	if (ss->css_released)
				4760	ss->css_released(css);
				4761	} else {
				4762	/* cgroup release path */
				4763	cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
				4764	cgrp->id = -1;
				4765
				4766	/*
				4767	* There are two control paths which try to determine
				4768	* cgroup from dentry without going through kernfs -
				4769	* cgroupstats_build() and css_tryget_online_from_dir().
				4770	* Those are supported by RCU protecting clearing of
				4771	* cgrp->kn->priv backpointer.
				4772	*/
				4773	RCU_INIT_POINTER((void __rcu __force *)&cgrp->kn->priv, NULL);
				4774	}
				4775
				4776	mutex_unlock(&cgroup_mutex);
				4777
				4778	call_rcu(&css->rcu_head, css_free_rcu_fn);
				4779	}
				4780
				4781	static void css_release(struct percpu_ref *ref)
				4782	{
				4783	struct cgroup_subsys_state *css =
				4784	container_of(ref, struct cgroup_subsys_state, refcnt);
				4785
				4786	INIT_WORK(&css->destroy_work, css_release_work_fn);
				4787	queue_work(cgroup_destroy_wq, &css->destroy_work);
				4788	}
				4789
				4790	static void init_and_link_css(struct cgroup_subsys_state *css,
				4791	struct cgroup_subsys ss, struct cgroup cgrp)
				4792	{
				4793	lockdep_assert_held(&cgroup_mutex);
				4794
				4795	cgroup_get(cgrp);
				4796
				4797	memset(css, 0, sizeof(*css));
				4798	css->cgroup = cgrp;
				4799	css->ss = ss;
				4800	css->id = -1;
				4801	INIT_LIST_HEAD(&css->sibling);
				4802	INIT_LIST_HEAD(&css->children);
				4803	css->serial_nr = css_serial_nr_next++;
				4804	atomic_set(&css->online_cnt, 0);
				4805
				4806	if (cgroup_parent(cgrp)) {
				4807	css->parent = cgroup_css(cgroup_parent(cgrp), ss);
				4808	css_get(css->parent);
				4809	}
				4810
				4811	BUG_ON(cgroup_css(cgrp, ss));
				4812	}
				4813
				4814	/* invoke ->css_online() on a new CSS and mark it online if successful */
				4815	static int online_css(struct cgroup_subsys_state *css)
				4816	{
				4817	struct cgroup_subsys *ss = css->ss;
				4818	int ret = 0;
				4819
				4820	lockdep_assert_held(&cgroup_mutex);
				4821
				4822	if (ss->css_online)
				4823	ret = ss->css_online(css);
				4824	if (!ret) {
				4825	css->flags \|= CSS_ONLINE;
				4826	rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
				4827
				4828	atomic_inc(&css->online_cnt);
				4829	if (css->parent)
				4830	atomic_inc(&css->parent->online_cnt);
				4831	}
				4832	return ret;
				4833	}
				4834
				4835	/* if the CSS is online, invoke ->css_offline() on it and mark it offline */
				4836	static void offline_css(struct cgroup_subsys_state *css)
				4837	{
				4838	struct cgroup_subsys *ss = css->ss;
				4839
				4840	lockdep_assert_held(&cgroup_mutex);
				4841
				4842	if (!(css->flags & CSS_ONLINE))
				4843	return;
				4844
				4845	if (ss->css_offline)
				4846	ss->css_offline(css);
				4847
				4848	css->flags &= ~CSS_ONLINE;
				4849	RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);
				4850
				4851	wake_up_all(&css->cgroup->offline_waitq);
				4852	}
				4853
				4854	/**
				4855	* create_css - create a cgroup_subsys_state
				4856	* @cgrp: the cgroup new css will be associated with
				4857	* @ss: the subsys of new css
				4858	* @visible: whether to create control knobs for the new css or not
				4859	*
				4860	* Create a new css associated with @cgrp - @ss pair. On success, the new
				4861	* css is online and installed in @cgrp with all interface files created if
				4862	* @visible. Returns 0 on success, -errno on failure.
				4863	*/
				4864	static int create_css(struct cgroup cgrp, struct cgroup_subsys ss,
				4865	bool visible)
				4866	{
				4867	struct cgroup *parent = cgroup_parent(cgrp);
				4868	struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss);
				4869	struct cgroup_subsys_state *css;
				4870	int err;
				4871
				4872	lockdep_assert_held(&cgroup_mutex);
				4873
				4874	css = ss->css_alloc(parent_css);
				4875	if (IS_ERR(css))
				4876	return PTR_ERR(css);
				4877
				4878	init_and_link_css(css, ss, cgrp);
				4879
				4880	err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL);
				4881	if (err)
				4882	goto err_free_css;
				4883
				4884	err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL);
				4885	if (err < 0)
				4886	goto err_free_percpu_ref;
				4887	css->id = err;
				4888
				4889	if (visible) {
				4890	err = css_populate_dir(css, NULL);
				4891	if (err)
				4892	goto err_free_id;
				4893	}
				4894
				4895	/* @css is ready to be brought online now, make it visible */
				4896	list_add_tail_rcu(&css->sibling, &parent_css->children);
				4897	cgroup_idr_replace(&ss->css_idr, css, css->id);
				4898
				4899	err = online_css(css);
				4900	if (err)
				4901	goto err_list_del;
				4902
				4903	if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
				4904	cgroup_parent(parent)) {
				4905	pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
				4906	current->comm, current->pid, ss->name);
				4907	if (!strcmp(ss->name, "memory"))
				4908	pr_warn("\"memory\" requires setting use_hierarchy to 1 on the root\n");
				4909	ss->warned_broken_hierarchy = true;
				4910	}
				4911
				4912	return 0;
				4913
				4914	err_list_del:
				4915	list_del_rcu(&css->sibling);
				4916	css_clear_dir(css, NULL);
				4917	err_free_id:
				4918	cgroup_idr_remove(&ss->css_idr, css->id);
				4919	err_free_percpu_ref:
				4920	percpu_ref_exit(&css->refcnt);
				4921	err_free_css:
				4922	call_rcu(&css->rcu_head, css_free_rcu_fn);
				4923	return err;
				4924	}
				4925
				4926	static int cgroup_mkdir(struct kernfs_node parent_kn, const char name,
				4927	umode_t mode)
				4928	{
				4929	struct cgroup parent, cgrp;
				4930	struct cgroup_root *root;
				4931	struct cgroup_subsys *ss;
				4932	struct kernfs_node *kn;
				4933	int ssid, ret;
				4934
				4935	/* Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable.
				4936	*/
				4937	if (strchr(name, '\n'))
				4938	return -EINVAL;
				4939
				4940	parent = cgroup_kn_lock_live(parent_kn);
				4941	if (!parent)
				4942	return -ENODEV;
				4943	root = parent->root;
				4944
				4945	/* allocate the cgroup and its ID, 0 is reserved for the root */
				4946	cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
				4947	if (!cgrp) {
				4948	ret = -ENOMEM;
				4949	goto out_unlock;
				4950	}
				4951
				4952	ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL);
				4953	if (ret)
				4954	goto out_free_cgrp;
				4955
				4956	/*
				4957	* Temporarily set the pointer to NULL, so idr_find() won't return
				4958	* a half-baked cgroup.
				4959	*/
				4960	cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL);
				4961	if (cgrp->id < 0) {
				4962	ret = -ENOMEM;
				4963	goto out_cancel_ref;
				4964	}
				4965
				4966	init_cgroup_housekeeping(cgrp);
				4967
				4968	cgrp->self.parent = &parent->self;
				4969	cgrp->root = root;
				4970
				4971	if (notify_on_release(parent))
				4972	set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
				4973
				4974	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
				4975	set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
				4976
				4977	/* create the directory */
				4978	kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
				4979	if (IS_ERR(kn)) {
				4980	ret = PTR_ERR(kn);
				4981	goto out_free_id;
				4982	}
				4983	cgrp->kn = kn;
				4984
				4985	/*
				4986	* This extra ref will be put in cgroup_free_fn() and guarantees
				4987	* that @cgrp->kn is always accessible.
				4988	*/
				4989	kernfs_get(kn);
				4990
				4991	cgrp->self.serial_nr = css_serial_nr_next++;
				4992
				4993	/* allocation complete, commit to creation */
				4994	list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children);
				4995	atomic_inc(&root->nr_cgrps);
				4996	cgroup_get(parent);
				4997
				4998	/*
				4999	* @cgrp is now fully operational. If something fails after this
				5000	* point, it'll be released via the normal destruction path.
				5001	*/
				5002	cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
				5003
				5004	ret = cgroup_kn_set_ugid(kn);
				5005	if (ret)
				5006	goto out_destroy;
				5007
				5008	ret = css_populate_dir(&cgrp->self, NULL);
				5009	if (ret)
				5010	goto out_destroy;
				5011
				5012	/* let's create and online css's */
				5013	for_each_subsys(ss, ssid) {
				5014	if (parent->child_subsys_mask & (1 << ssid)) {
				5015	ret = create_css(cgrp, ss,
				5016	parent->subtree_control & (1 << ssid));
				5017	if (ret)
				5018	goto out_destroy;
				5019	}
				5020	}
				5021
				5022	/*
				5023	* On the default hierarchy, a child doesn't automatically inherit
				5024	* subtree_control from the parent. Each is configured manually.
				5025	*/
				5026	if (!cgroup_on_dfl(cgrp)) {
				5027	cgrp->subtree_control = parent->subtree_control;
				5028	cgroup_refresh_child_subsys_mask(cgrp);
				5029	}
				5030
				5031	kernfs_activate(kn);
				5032
				5033	ret = 0;
				5034	goto out_unlock;
				5035
				5036	out_free_id:
				5037	cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
				5038	out_cancel_ref:
				5039	percpu_ref_exit(&cgrp->self.refcnt);
				5040	out_free_cgrp:
				5041	kfree(cgrp);
				5042	out_unlock:
				5043	cgroup_kn_unlock(parent_kn);
				5044	return ret;
				5045
				5046	out_destroy:
				5047	cgroup_destroy_locked(cgrp);
				5048	goto out_unlock;
				5049	}
				5050
				5051	/*
				5052	* This is called when the refcnt of a css is confirmed to be killed.
				5053	* css_tryget_online() is now guaranteed to fail. Tell the subsystem to
				5054	* initate destruction and put the css ref from kill_css().
				5055	*/
				5056	static void css_killed_work_fn(struct work_struct *work)
				5057	{
				5058	struct cgroup_subsys_state *css =
				5059	container_of(work, struct cgroup_subsys_state, destroy_work);
				5060
				5061	mutex_lock(&cgroup_mutex);
				5062
				5063	do {
				5064	offline_css(css);
				5065	css_put(css);
				5066	/* @css can't go away while we're holding cgroup_mutex */
				5067	css = css->parent;
				5068	} while (css && atomic_dec_and_test(&css->online_cnt));
				5069
				5070	mutex_unlock(&cgroup_mutex);
				5071	}
				5072
				5073	/* css kill confirmation processing requires process context, bounce */
				5074	static void css_killed_ref_fn(struct percpu_ref *ref)
				5075	{
				5076	struct cgroup_subsys_state *css =
				5077	container_of(ref, struct cgroup_subsys_state, refcnt);
				5078
				5079	if (atomic_dec_and_test(&css->online_cnt)) {
				5080	INIT_WORK(&css->destroy_work, css_killed_work_fn);
				5081	queue_work(cgroup_destroy_wq, &css->destroy_work);
				5082	}
				5083	}
				5084
				5085	/**
				5086	* kill_css - destroy a css
				5087	* @css: css to destroy
				5088	*
				5089	* This function initiates destruction of @css by removing cgroup interface
				5090	* files and putting its base reference. ->css_offline() will be invoked
				5091	* asynchronously once css_tryget_online() is guaranteed to fail and when
				5092	* the reference count reaches zero, @css will be released.
				5093	*/
				5094	static void kill_css(struct cgroup_subsys_state *css)
				5095	{
				5096	lockdep_assert_held(&cgroup_mutex);
				5097
				5098	/*
				5099	* This must happen before css is disassociated with its cgroup.
				5100	* See seq_css() for details.
				5101	*/
				5102	css_clear_dir(css, NULL);
				5103
				5104	/*
				5105	* Killing would put the base ref, but we need to keep it alive
				5106	* until after ->css_offline().
				5107	*/
				5108	css_get(css);
				5109
				5110	/*
				5111	* cgroup core guarantees that, by the time ->css_offline() is
				5112	* invoked, no new css reference will be given out via
				5113	* css_tryget_online(). We can't simply call percpu_ref_kill() and
				5114	* proceed to offlining css's because percpu_ref_kill() doesn't
				5115	* guarantee that the ref is seen as killed on all CPUs on return.
				5116	*
				5117	* Use percpu_ref_kill_and_confirm() to get notifications as each
				5118	* css is confirmed to be seen as killed on all CPUs.
				5119	*/
				5120	percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
				5121	}
				5122
				5123	/**
				5124	* cgroup_destroy_locked - the first stage of cgroup destruction
				5125	* @cgrp: cgroup to be destroyed
				5126	*
				5127	* css's make use of percpu refcnts whose killing latency shouldn't be
				5128	* exposed to userland and are RCU protected. Also, cgroup core needs to
				5129	* guarantee that css_tryget_online() won't succeed by the time
				5130	* ->css_offline() is invoked. To satisfy all the requirements,
				5131	* destruction is implemented in the following two steps.
				5132	*
				5133	* s1. Verify @cgrp can be destroyed and mark it dying. Remove all
				5134	* userland visible parts and start killing the percpu refcnts of
				5135	* css's. Set up so that the next stage will be kicked off once all
				5136	* the percpu refcnts are confirmed to be killed.
				5137	*
				5138	* s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
				5139	* rest of destruction. Once all cgroup references are gone, the
				5140	* cgroup is RCU-freed.
				5141	*
				5142	* This function implements s1. After this step, @cgrp is gone as far as
				5143	* the userland is concerned and a new cgroup with the same name may be
				5144	* created. As cgroup doesn't care about the names internally, this
				5145	* doesn't cause any problem.
				5146	*/
				5147	static int cgroup_destroy_locked(struct cgroup *cgrp)
				5148	__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
				5149	{
				5150	struct cgroup_subsys_state *css;
				5151	struct cgrp_cset_link *link;
				5152	int ssid;
				5153
				5154	lockdep_assert_held(&cgroup_mutex);
				5155
				5156	/*
				5157	* Only migration can raise populated from zero and we're already
				5158	* holding cgroup_mutex.
				5159	*/
				5160	if (cgroup_is_populated(cgrp))
				5161	return -EBUSY;
				5162
				5163	/*
				5164	* Make sure there's no live children. We can't test emptiness of
				5165	* ->self.children as dead children linger on it while being
				5166	* drained; otherwise, "rmdir parent/child parent" may fail.
				5167	*/
				5168	if (css_has_online_children(&cgrp->self))
				5169	return -EBUSY;
				5170
				5171	/*
				5172	* Mark @cgrp and the associated csets dead. The former prevents
				5173	* further task migration and child creation by disabling
				5174	* cgroup_lock_live_group(). The latter makes the csets ignored by
				5175	* the migration path.
				5176	*/
				5177	cgrp->self.flags &= ~CSS_ONLINE;
				5178
				5179	spin_lock_bh(&css_set_lock);
				5180	list_for_each_entry(link, &cgrp->cset_links, cset_link)
				5181	link->cset->dead = true;
				5182	spin_unlock_bh(&css_set_lock);
				5183
				5184	/* initiate massacre of all css's */
				5185	for_each_css(css, ssid, cgrp)
				5186	kill_css(css);
				5187
				5188	/*
				5189	* Remove @cgrp directory along with the base files. @cgrp has an
				5190	* extra ref on its kn.
				5191	*/
				5192	kernfs_remove(cgrp->kn);
				5193
				5194	check_for_release(cgroup_parent(cgrp));
				5195
				5196	/* put the base reference */
				5197	percpu_ref_kill(&cgrp->self.refcnt);
				5198
				5199	return 0;
				5200	};
				5201
				5202	static int cgroup_rmdir(struct kernfs_node *kn)
				5203	{
				5204	struct cgroup *cgrp;
				5205	int ret = 0;
				5206
				5207	cgrp = cgroup_kn_lock_live(kn);
				5208	if (!cgrp)
				5209	return 0;
				5210
				5211	ret = cgroup_destroy_locked(cgrp);
				5212
				5213	cgroup_kn_unlock(kn);
				5214	return ret;
				5215	}
				5216
				5217	static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
				5218	.remount_fs = cgroup_remount,
				5219	.show_options = cgroup_show_options,
				5220	.mkdir = cgroup_mkdir,
				5221	.rmdir = cgroup_rmdir,
				5222	.rename = cgroup_rename,
				5223	};
				5224
				5225	static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
				5226	{
				5227	struct cgroup_subsys_state *css;
				5228
				5229	printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
				5230
				5231	mutex_lock(&cgroup_mutex);
				5232
				5233	idr_init(&ss->css_idr);
				5234	INIT_LIST_HEAD(&ss->cfts);
				5235
				5236	/* Create the root cgroup state for this subsystem */
				5237	ss->root = &cgrp_dfl_root;
				5238	css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
				5239	/* We don't handle early failures gracefully */
				5240	BUG_ON(IS_ERR(css));
				5241	init_and_link_css(css, ss, &cgrp_dfl_root.cgrp);
				5242
				5243	/*
				5244	* Root csses are never destroyed and we can't initialize
				5245	* percpu_ref during early init. Disable refcnting.
				5246	*/
				5247	css->flags \|= CSS_NO_REF;
				5248
				5249	if (early) {
				5250	/* allocation can't be done safely during early init */
				5251	css->id = 1;
				5252	} else {
				5253	css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL);
				5254	BUG_ON(css->id < 0);
				5255	}
				5256
				5257	/* Update the init_css_set to contain a subsys
				5258	* pointer to this state - since the subsystem is
				5259	* newly registered, all tasks and hence the
				5260	* init_css_set is in the subsystem's root cgroup. */
				5261	init_css_set.subsys[ss->id] = css;
				5262
				5263	have_fork_callback \|= (bool)ss->fork << ss->id;
				5264	have_exit_callback \|= (bool)ss->exit << ss->id;
				5265	have_free_callback \|= (bool)ss->free << ss->id;
				5266	have_canfork_callback \|= (bool)ss->can_fork << ss->id;
				5267
				5268	/* At system boot, before all subsystems have been
				5269	* registered, no tasks have been forked, so we don't
				5270	* need to invoke fork callbacks here. */
				5271	BUG_ON(!list_empty(&init_task.tasks));
				5272
				5273	BUG_ON(online_css(css));
				5274
				5275	mutex_unlock(&cgroup_mutex);
				5276	}
				5277
				5278	/**
				5279	* cgroup_init_early - cgroup initialization at system boot
				5280	*
				5281	* Initialize cgroups at system boot, and initialize any
				5282	* subsystems that request early init.
				5283	*/
				5284	int __init cgroup_init_early(void)
				5285	{
				5286	static struct cgroup_sb_opts __initdata opts;
				5287	struct cgroup_subsys *ss;
				5288	int i;
				5289
				5290	init_cgroup_root(&cgrp_dfl_root, &opts);
				5291	cgrp_dfl_root.cgrp.self.flags \|= CSS_NO_REF;
				5292
				5293	RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
				5294
				5295	for_each_subsys(ss, i) {
				5296	WARN(!ss->css_alloc \|\| !ss->css_free \|\| ss->name \|\| ss->id,
				5297	"invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p name:id=%d:%s\n",
				5298	i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,
				5299	ss->id, ss->name);
				5300	WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
				5301	"cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]);
				5302
				5303	ss->id = i;
				5304	ss->name = cgroup_subsys_name[i];
				5305	if (!ss->legacy_name)
				5306	ss->legacy_name = cgroup_subsys_name[i];
				5307
				5308	if (ss->early_init)
				5309	cgroup_init_subsys(ss, true);
				5310	}
				5311	return 0;
				5312	}
				5313
				5314	static unsigned long cgroup_disable_mask __initdata;
				5315
				5316	/**
				5317	* cgroup_init - cgroup initialization
				5318	*
				5319	* Register cgroup filesystem and /proc file, and initialize
				5320	* any subsystems that didn't request early init.
				5321	*/
				5322	int __init cgroup_init(void)
				5323	{
				5324	struct cgroup_subsys *ss;
				5325	unsigned long key;
				5326	int ssid;
				5327
				5328	BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
				5329	BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
				5330	BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
				5331
				5332	mutex_lock(&cgroup_mutex);
				5333
				5334	/* Add init_css_set to the hash table */
				5335	key = css_set_hash(init_css_set.subsys);
				5336	hash_add(css_set_table, &init_css_set.hlist, key);
				5337
				5338	BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
				5339
				5340	mutex_unlock(&cgroup_mutex);
				5341
				5342	for_each_subsys(ss, ssid) {
				5343	if (ss->early_init) {
				5344	struct cgroup_subsys_state *css =
				5345	init_css_set.subsys[ss->id];
				5346
				5347	css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2,
				5348	GFP_KERNEL);
				5349	BUG_ON(css->id < 0);
				5350	} else {
				5351	cgroup_init_subsys(ss, false);
				5352	}
				5353
				5354	list_add_tail(&init_css_set.e_cset_node[ssid],
				5355	&cgrp_dfl_root.cgrp.e_csets[ssid]);
				5356
				5357	/*
				5358	* Setting dfl_root subsys_mask needs to consider the
				5359	* disabled flag and cftype registration needs kmalloc,
				5360	* both of which aren't available during early_init.
				5361	*/
				5362	if (cgroup_disable_mask & (1 << ssid)) {
				5363	static_branch_disable(cgroup_subsys_enabled_key[ssid]);
				5364	printk(KERN_INFO "Disabling %s control group subsystem\n",
				5365	ss->name);
				5366	continue;
				5367	}
				5368
				5369	cgrp_dfl_root.subsys_mask \|= 1 << ss->id;
				5370
				5371	if (!ss->dfl_cftypes)
				5372	cgrp_dfl_root_inhibit_ss_mask \|= 1 << ss->id;
				5373
				5374	if (ss->dfl_cftypes == ss->legacy_cftypes) {
				5375	WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
				5376	} else {
				5377	WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes));
				5378	WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes));
				5379	}
				5380
				5381	if (ss->bind)
				5382	ss->bind(init_css_set.subsys[ssid]);
				5383	}
				5384
				5385	WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup"));
				5386	WARN_ON(register_filesystem(&cgroup_fs_type));
				5387	WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations));
				5388
				5389	return 0;
				5390	}
				5391
				5392	static int __init cgroup_wq_init(void)
				5393	{
				5394	/*
				5395	* There isn't much point in executing destruction path in
				5396	* parallel. Good chunk is serialized with cgroup_mutex anyway.
				5397	* Use 1 for @max_active.
				5398	*
				5399	* We would prefer to do this in cgroup_init() above, but that
				5400	* is called before init_workqueues(): so leave this until after.
				5401	*/
				5402	cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
				5403	BUG_ON(!cgroup_destroy_wq);
				5404
				5405	/*
				5406	* Used to destroy pidlists and separate to serve as flush domain.
				5407	* Cap @max_active to 1 too.
				5408	*/
				5409	cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
				5410	0, 1);
				5411	BUG_ON(!cgroup_pidlist_destroy_wq);
				5412
				5413	return 0;
				5414	}
				5415	core_initcall(cgroup_wq_init);
				5416
				5417	/*
				5418	* proc_cgroup_show()
				5419	* - Print task's cgroup paths into seq_file, one line for each hierarchy
				5420	* - Used for /proc/<pid>/cgroup.
				5421	*/
				5422	int proc_cgroup_show(struct seq_file m, struct pid_namespace ns,
				5423	struct pid pid, struct task_struct tsk)
				5424	{
				5425	char buf, path;
				5426	int retval;
				5427	struct cgroup_root *root;
				5428
				5429	retval = -ENOMEM;
				5430	buf = kmalloc(PATH_MAX, GFP_KERNEL);
				5431	if (!buf)
				5432	goto out;
				5433
				5434	mutex_lock(&cgroup_mutex);
				5435	spin_lock_bh(&css_set_lock);
				5436
				5437	for_each_root(root) {
				5438	struct cgroup_subsys *ss;
				5439	struct cgroup *cgrp;
				5440	int ssid, count = 0;
				5441
				5442	if (root == &cgrp_dfl_root && !cgrp_dfl_root_visible)
				5443	continue;
				5444
				5445	seq_printf(m, "%d:", root->hierarchy_id);
				5446	if (root != &cgrp_dfl_root)
				5447	for_each_subsys(ss, ssid)
				5448	if (root->subsys_mask & (1 << ssid))
				5449	seq_printf(m, "%s%s", count++ ? "," : "",
				5450	ss->legacy_name);
				5451	if (strlen(root->name))
				5452	seq_printf(m, "%sname=%s", count ? "," : "",
				5453	root->name);
				5454	seq_putc(m, ':');
				5455
				5456	cgrp = task_cgroup_from_root(tsk, root);
				5457
				5458	/*
				5459	* On traditional hierarchies, all zombie tasks show up as
				5460	* belonging to the root cgroup. On the default hierarchy,
				5461	* while a zombie doesn't show up in "cgroup.procs" and
				5462	* thus can't be migrated, its /proc/PID/cgroup keeps
				5463	* reporting the cgroup it belonged to before exiting. If
				5464	* the cgroup is removed before the zombie is reaped,
				5465	* " (deleted)" is appended to the cgroup path.
				5466	*/
				5467	if (cgroup_on_dfl(cgrp) \|\| !(tsk->flags & PF_EXITING)) {
				5468	path = cgroup_path(cgrp, buf, PATH_MAX);
				5469	if (!path) {
				5470	retval = -ENAMETOOLONG;
				5471	goto out_unlock;
				5472	}
				5473	} else {
				5474	path = "/";
				5475	}
				5476
				5477	seq_puts(m, path);
				5478
				5479	if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp))
				5480	seq_puts(m, " (deleted)\n");
				5481	else
				5482	seq_putc(m, '\n');
				5483	}
				5484
				5485	retval = 0;
				5486	out_unlock:
				5487	spin_unlock_bh(&css_set_lock);
				5488	mutex_unlock(&cgroup_mutex);
				5489	kfree(buf);
				5490	out:
				5491	return retval;
				5492	}
				5493
				5494	/* Display information about each subsystem and each hierarchy */
				5495	static int proc_cgroupstats_show(struct seq_file m, void v)
				5496	{
				5497	struct cgroup_subsys *ss;
				5498	int i;
				5499
				5500	seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
				5501	/*
				5502	* ideally we don't want subsystems moving around while we do this.
				5503	* cgroup_mutex is also necessary to guarantee an atomic snapshot of
				5504	* subsys/hierarchy state.
				5505	*/
				5506	mutex_lock(&cgroup_mutex);
				5507
				5508	for_each_subsys(ss, i)
				5509	seq_printf(m, "%s\t%d\t%d\t%d\n",
				5510	ss->legacy_name, ss->root->hierarchy_id,
				5511	atomic_read(&ss->root->nr_cgrps),
				5512	cgroup_ssid_enabled(i));
				5513
				5514	mutex_unlock(&cgroup_mutex);
				5515	return 0;
				5516	}
				5517
				5518	static int cgroupstats_open(struct inode inode, struct file file)
				5519	{
				5520	return single_open(file, proc_cgroupstats_show, NULL);
				5521	}
				5522
				5523	static const struct file_operations proc_cgroupstats_operations = {
				5524	.open = cgroupstats_open,
				5525	.read = seq_read,
				5526	.llseek = seq_lseek,
				5527	.release = single_release,
				5528	};
				5529
				5530	static void *subsys_canfork_priv_p(void ss_priv[CGROUP_CANFORK_COUNT], int i)
				5531	{
				5532	if (CGROUP_CANFORK_START <= i && i < CGROUP_CANFORK_END)
				5533	return &ss_priv[i - CGROUP_CANFORK_START];
				5534	return NULL;
				5535	}
				5536
				5537	static void subsys_canfork_priv(void ss_priv[CGROUP_CANFORK_COUNT], int i)
				5538	{
				5539	void **private = subsys_canfork_priv_p(ss_priv, i);
				5540	return private ? *private : NULL;
				5541	}
				5542
				5543	/**
				5544	* cgroup_fork - initialize cgroup related fields during copy_process()
				5545	* @child: pointer to task_struct of forking parent process.
				5546	*
				5547	* A task is associated with the init_css_set until cgroup_post_fork()
				5548	* attaches it to the parent's css_set. Empty cg_list indicates that
				5549	* @child isn't holding reference to its css_set.
				5550	*/
				5551	void cgroup_fork(struct task_struct *child)
				5552	{
				5553	RCU_INIT_POINTER(child->cgroups, &init_css_set);
				5554	INIT_LIST_HEAD(&child->cg_list);
				5555	}
				5556
				5557	/**
				5558	* cgroup_can_fork - called on a new task before the process is exposed
				5559	* @child: the task in question.
				5560	*
				5561	* This calls the subsystem can_fork() callbacks. If the can_fork() callback
				5562	* returns an error, the fork aborts with that error code. This allows for
				5563	* a cgroup subsystem to conditionally allow or deny new forks.
				5564	*/
				5565	int cgroup_can_fork(struct task_struct *child,
				5566	void *ss_priv[CGROUP_CANFORK_COUNT])
				5567	{
				5568	struct cgroup_subsys *ss;
				5569	int i, j, ret;
				5570
				5571	for_each_subsys_which(ss, i, &have_canfork_callback) {
				5572	ret = ss->can_fork(child, subsys_canfork_priv_p(ss_priv, i));
				5573	if (ret)
				5574	goto out_revert;
				5575	}
				5576
				5577	return 0;
				5578
				5579	out_revert:
				5580	for_each_subsys(ss, j) {
				5581	if (j >= i)
				5582	break;
				5583	if (ss->cancel_fork)
				5584	ss->cancel_fork(child, subsys_canfork_priv(ss_priv, j));
				5585	}
				5586
				5587	return ret;
				5588	}
				5589
				5590	/**
				5591	* cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
				5592	* @child: the task in question
				5593	*
				5594	* This calls the cancel_fork() callbacks if a fork failed after
				5595	* cgroup_can_fork() succeded.
				5596	*/
				5597	void cgroup_cancel_fork(struct task_struct *child,
				5598	void *ss_priv[CGROUP_CANFORK_COUNT])
				5599	{
				5600	struct cgroup_subsys *ss;
				5601	int i;
				5602
				5603	for_each_subsys(ss, i)
				5604	if (ss->cancel_fork)
				5605	ss->cancel_fork(child, subsys_canfork_priv(ss_priv, i));
				5606	}
				5607
				5608	/**
				5609	* cgroup_post_fork - called on a new task after adding it to the task list
				5610	* @child: the task in question
				5611	*
				5612	* Adds the task to the list running through its css_set if necessary and
				5613	* call the subsystem fork() callbacks. Has to be after the task is
				5614	* visible on the task list in case we race with the first call to
				5615	* cgroup_task_iter_start() - to guarantee that the new task ends up on its
				5616	* list.
				5617	*/
				5618	void cgroup_post_fork(struct task_struct *child,
				5619	void *old_ss_priv[CGROUP_CANFORK_COUNT])
				5620	{
				5621	struct cgroup_subsys *ss;
				5622	int i;
				5623
				5624	/*
				5625	* This may race against cgroup_enable_task_cg_lists(). As that
				5626	* function sets use_task_css_set_links before grabbing
				5627	* tasklist_lock and we just went through tasklist_lock to add
				5628	* @child, it's guaranteed that either we see the set
				5629	* use_task_css_set_links or cgroup_enable_task_cg_lists() sees
				5630	* @child during its iteration.
				5631	*
				5632	* If we won the race, @child is associated with %current's
				5633	* css_set. Grabbing css_set_lock guarantees both that the
				5634	* association is stable, and, on completion of the parent's
				5635	* migration, @child is visible in the source of migration or
				5636	* already in the destination cgroup. This guarantee is necessary
				5637	* when implementing operations which need to migrate all tasks of
				5638	* a cgroup to another.
				5639	*
				5640	* Note that if we lose to cgroup_enable_task_cg_lists(), @child
				5641	* will remain in init_css_set. This is safe because all tasks are
				5642	* in the init_css_set before cg_links is enabled and there's no
				5643	* operation which transfers all tasks out of init_css_set.
				5644	*/
				5645	if (use_task_css_set_links) {
				5646	struct css_set *cset;
				5647
				5648	spin_lock_bh(&css_set_lock);
				5649	cset = task_css_set(current);
				5650	if (list_empty(&child->cg_list)) {
				5651	get_css_set(cset);
				5652	css_set_move_task(child, NULL, cset, false);
				5653	}
				5654	spin_unlock_bh(&css_set_lock);
				5655	}
				5656
				5657	/*
				5658	* Call ss->fork(). This must happen after @child is linked on
				5659	* css_set; otherwise, @child might change state between ->fork()
				5660	* and addition to css_set.
				5661	*/
				5662	for_each_subsys_which(ss, i, &have_fork_callback)
				5663	ss->fork(child, subsys_canfork_priv(old_ss_priv, i));
				5664	}
				5665
				5666	/**
				5667	* cgroup_exit - detach cgroup from exiting task
				5668	* @tsk: pointer to task_struct of exiting process
				5669	*
				5670	* Description: Detach cgroup from @tsk and release it.
				5671	*
				5672	* Note that cgroups marked notify_on_release force every task in
				5673	* them to take the global cgroup_mutex mutex when exiting.
				5674	* This could impact scaling on very large systems. Be reluctant to
				5675	* use notify_on_release cgroups where very high task exit scaling
				5676	* is required on large systems.
				5677	*
				5678	* We set the exiting tasks cgroup to the root cgroup (top_cgroup). We
				5679	* call cgroup_exit() while the task is still competent to handle
				5680	* notify_on_release(), then leave the task attached to the root cgroup in
				5681	* each hierarchy for the remainder of its exit. No need to bother with
				5682	* init_css_set refcnting. init_css_set never goes away and we can't race
				5683	* with migration path - PF_EXITING is visible to migration path.
				5684	*/
				5685	void cgroup_exit(struct task_struct *tsk)
				5686	{
				5687	struct cgroup_subsys *ss;
				5688	struct css_set *cset;
				5689	int i;
				5690
				5691	/*
				5692	* Unlink from @tsk from its css_set. As migration path can't race
				5693	* with us, we can check css_set and cg_list without synchronization.
				5694	*/
				5695	cset = task_css_set(tsk);
				5696
				5697	if (!list_empty(&tsk->cg_list)) {
				5698	spin_lock_bh(&css_set_lock);
				5699	css_set_move_task(tsk, cset, NULL, false);
				5700	spin_unlock_bh(&css_set_lock);
				5701	} else {
				5702	get_css_set(cset);
				5703	}
				5704
				5705	/* see cgroup_post_fork() for details */
				5706	for_each_subsys_which(ss, i, &have_exit_callback)
				5707	ss->exit(tsk);
				5708	}
				5709
				5710	void cgroup_free(struct task_struct *task)
				5711	{
				5712	struct css_set *cset = task_css_set(task);
				5713	struct cgroup_subsys *ss;
				5714	int ssid;
				5715
				5716	for_each_subsys_which(ss, ssid, &have_free_callback)
				5717	ss->free(task);
				5718
				5719	put_css_set(cset);
				5720	}
				5721
				5722	static void check_for_release(struct cgroup *cgrp)
				5723	{
				5724	if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) &&
				5725	!css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
				5726	schedule_work(&cgrp->release_agent_work);
				5727	}
				5728
				5729	/*
				5730	* Notify userspace when a cgroup is released, by running the
				5731	* configured release agent with the name of the cgroup (path
				5732	* relative to the root of cgroup file system) as the argument.
				5733	*
				5734	* Most likely, this user command will try to rmdir this cgroup.
				5735	*
				5736	* This races with the possibility that some other task will be
				5737	* attached to this cgroup before it is removed, or that some other
				5738	* user task will 'mkdir' a child cgroup of this cgroup. That's ok.
				5739	* The presumed 'rmdir' will fail quietly if this cgroup is no longer
				5740	* unused, and this cgroup will be reprieved from its death sentence,
				5741	* to continue to serve a useful existence. Next time it's released,
				5742	* we will get notified again, if it still has 'notify_on_release' set.
				5743	*
				5744	* The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
				5745	* means only wait until the task is successfully execve()'d. The
				5746	* separate release agent task is forked by call_usermodehelper(),
				5747	* then control in this thread returns here, without waiting for the
				5748	* release agent task. We don't bother to wait because the caller of
				5749	* this routine has no use for the exit status of the release agent
				5750	* task, so no sense holding our caller up for that.
				5751	*/
				5752	static void cgroup_release_agent(struct work_struct *work)
				5753	{
				5754	struct cgroup *cgrp =
				5755	container_of(work, struct cgroup, release_agent_work);
				5756	char pathbuf = NULL, agentbuf = NULL, *path;
				5757	char argv[3], envp[3];
				5758
				5759	mutex_lock(&cgroup_mutex);
				5760
				5761	pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
				5762	agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
				5763	if (!pathbuf \|\| !agentbuf)
				5764	goto out;
				5765
				5766	path = cgroup_path(cgrp, pathbuf, PATH_MAX);
				5767	if (!path)
				5768	goto out;
				5769
				5770	argv[0] = agentbuf;
				5771	argv[1] = path;
				5772	argv[2] = NULL;
				5773
				5774	/* minimal command environment */
				5775	envp[0] = "HOME=/";
				5776	envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
				5777	envp[2] = NULL;
				5778
				5779	mutex_unlock(&cgroup_mutex);
				5780	call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
				5781	goto out_free;
				5782	out:
				5783	mutex_unlock(&cgroup_mutex);
				5784	out_free:
				5785	kfree(agentbuf);
				5786	kfree(pathbuf);
				5787	}
				5788
				5789	static int __init cgroup_disable(char *str)
				5790	{
				5791	struct cgroup_subsys *ss;
				5792	char *token;
				5793	int i;
				5794
				5795	while ((token = strsep(&str, ",")) != NULL) {
				5796	if (!*token)
				5797	continue;
				5798
				5799	for_each_subsys(ss, i) {
				5800	if (strcmp(token, ss->name) &&
				5801	strcmp(token, ss->legacy_name))
				5802	continue;
				5803	cgroup_disable_mask \|= 1 << i;
				5804	}
				5805	}
				5806	return 1;
				5807	}
				5808	__setup("cgroup_disable=", cgroup_disable);
				5809
				5810	/**
				5811	* css_tryget_online_from_dir - get corresponding css from a cgroup dentry
				5812	* @dentry: directory dentry of interest
				5813	* @ss: subsystem of interest
				5814	*
				5815	* If @dentry is a directory for a cgroup which has @ss enabled on it, try
				5816	* to get the corresponding css and return it. If such css doesn't exist
				5817	* or can't be pinned, an ERR_PTR value is returned.
				5818	*/
				5819	struct cgroup_subsys_state css_tryget_online_from_dir(struct dentry dentry,
				5820	struct cgroup_subsys *ss)
				5821	{
				5822	struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
				5823	struct cgroup_subsys_state *css = NULL;
				5824	struct cgroup *cgrp;
				5825
				5826	/* is @dentry a cgroup dir? */
				5827	if (dentry->d_sb->s_type != &cgroup_fs_type \|\| !kn \|\|
				5828	kernfs_type(kn) != KERNFS_DIR)
				5829	return ERR_PTR(-EBADF);
				5830
				5831	rcu_read_lock();
				5832
				5833	/*
				5834	* This path doesn't originate from kernfs and @kn could already
				5835	* have been or be removed at any point. @kn->priv is RCU
				5836	* protected for this access. See css_release_work_fn() for details.
				5837	*/
				5838	cgrp = rcu_dereference(kn->priv);
				5839	if (cgrp)
				5840	css = cgroup_css(cgrp, ss);
				5841
				5842	if (!css \|\| !css_tryget_online(css))
				5843	css = ERR_PTR(-ENOENT);
				5844
				5845	rcu_read_unlock();
				5846	return css;
				5847	}
				5848
				5849	/**
				5850	* css_from_id - lookup css by id
				5851	* @id: the cgroup id
				5852	* @ss: cgroup subsys to be looked into
				5853	*
				5854	* Returns the css if there's valid one with @id, otherwise returns NULL.
				5855	* Should be called under rcu_read_lock().
				5856	*/
				5857	struct cgroup_subsys_state css_from_id(int id, struct cgroup_subsys ss)
				5858	{
				5859	WARN_ON_ONCE(!rcu_read_lock_held());
				5860	return id > 0 ? idr_find(&ss->css_idr, id) : NULL;
				5861	}
				5862
				5863	#ifdef CONFIG_CGROUP_DEBUG
				5864	static struct cgroup_subsys_state *
				5865	debug_css_alloc(struct cgroup_subsys_state *parent_css)
				5866	{
				5867	struct cgroup_subsys_state css = kzalloc(sizeof(css), GFP_KERNEL);
				5868
				5869	if (!css)
				5870	return ERR_PTR(-ENOMEM);
				5871
				5872	return css;
				5873	}
				5874
				5875	static void debug_css_free(struct cgroup_subsys_state *css)
				5876	{
				5877	kfree(css);
				5878	}
				5879
				5880	static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
				5881	struct cftype *cft)
				5882	{
				5883	return cgroup_task_count(css->cgroup);
				5884	}
				5885
				5886	static u64 current_css_set_read(struct cgroup_subsys_state *css,
				5887	struct cftype *cft)
				5888	{
				5889	return (u64)(unsigned long)current->cgroups;
				5890	}
				5891
				5892	static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
				5893	struct cftype *cft)
				5894	{
				5895	u64 count;
				5896
				5897	rcu_read_lock();
				5898	count = atomic_read(&task_css_set(current)->refcount);
				5899	rcu_read_unlock();
				5900	return count;
				5901	}
				5902
				5903	static int current_css_set_cg_links_read(struct seq_file seq, void v)
				5904	{
				5905	struct cgrp_cset_link *link;
				5906	struct css_set *cset;
				5907	char *name_buf;
				5908
				5909	name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
				5910	if (!name_buf)
				5911	return -ENOMEM;
				5912
				5913	spin_lock_bh(&css_set_lock);
				5914	rcu_read_lock();
				5915	cset = rcu_dereference(current->cgroups);
				5916	list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
				5917	struct cgroup *c = link->cgrp;
				5918
				5919	cgroup_name(c, name_buf, NAME_MAX + 1);
				5920	seq_printf(seq, "Root %d group %s\n",
				5921	c->root->hierarchy_id, name_buf);
				5922	}
				5923	rcu_read_unlock();
				5924	spin_unlock_bh(&css_set_lock);
				5925	kfree(name_buf);
				5926	return 0;
				5927	}
				5928
				5929	#define MAX_TASKS_SHOWN_PER_CSS 25
				5930	static int cgroup_css_links_read(struct seq_file seq, void v)
				5931	{
				5932	struct cgroup_subsys_state *css = seq_css(seq);
				5933	struct cgrp_cset_link *link;
				5934
				5935	spin_lock_bh(&css_set_lock);
				5936	list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
				5937	struct css_set *cset = link->cset;
				5938	struct task_struct *task;
				5939	int count = 0;
				5940
				5941	seq_printf(seq, "css_set %p\n", cset);
				5942
				5943	list_for_each_entry(task, &cset->tasks, cg_list) {
				5944	if (count++ > MAX_TASKS_SHOWN_PER_CSS)
				5945	goto overflow;
				5946	seq_printf(seq, " task %d\n", task_pid_vnr(task));
				5947	}
				5948
				5949	list_for_each_entry(task, &cset->mg_tasks, cg_list) {
				5950	if (count++ > MAX_TASKS_SHOWN_PER_CSS)
				5951	goto overflow;
				5952	seq_printf(seq, " task %d\n", task_pid_vnr(task));
				5953	}
				5954	continue;
				5955	overflow:
				5956	seq_puts(seq, " ...\n");
				5957	}
				5958	spin_unlock_bh(&css_set_lock);
				5959	return 0;
				5960	}
				5961
				5962	static u64 releasable_read(struct cgroup_subsys_state css, struct cftype cft)
				5963	{
				5964	return (!cgroup_is_populated(css->cgroup) &&
				5965	!css_has_online_children(&css->cgroup->self));
				5966	}
				5967
				5968	static struct cftype debug_files[] = {
				5969	{
				5970	.name = "taskcount",
				5971	.read_u64 = debug_taskcount_read,
				5972	},
				5973
				5974	{
				5975	.name = "current_css_set",
				5976	.read_u64 = current_css_set_read,
				5977	},
				5978
				5979	{
				5980	.name = "current_css_set_refcount",
				5981	.read_u64 = current_css_set_refcount_read,
				5982	},
				5983
				5984	{
				5985	.name = "current_css_set_cg_links",
				5986	.seq_show = current_css_set_cg_links_read,
				5987	},
				5988
				5989	{
				5990	.name = "cgroup_css_links",
				5991	.seq_show = cgroup_css_links_read,
				5992	},
				5993
				5994	{
				5995	.name = "releasable",
				5996	.read_u64 = releasable_read,
				5997	},
				5998
				5999	{ } /* terminate */
				6000	};
				6001
				6002	struct cgroup_subsys debug_cgrp_subsys = {
				6003	.css_alloc = debug_css_alloc,
				6004	.css_free = debug_css_free,
				6005	.legacy_cftypes = debug_files,
				6006	};
				6007	#endif /* CONFIG_CGROUP_DEBUG */