Blame - kernel/sched/fair.c - codeaurora/cp-linux

blob: 812069b66f47fb82e282a8429b79af0d3f83f26b [file] [log] [blame]

Kyle Swenson	8d8f654	2021-03-15 11:02:55 -0600	[diff] [blame]	1	/*
				2	* Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
				3	*
				4	* Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
				5	*
				6	* Interactivity improvements by Mike Galbraith
				7	* (C) 2007 Mike Galbraith <efault@gmx.de>
				8	*
				9	* Various enhancements by Dmitry Adamushko.
				10	* (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
				11	*
				12	* Group scheduling enhancements by Srivatsa Vaddagiri
				13	* Copyright IBM Corporation, 2007
				14	* Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
				15	*
				16	* Scaled math optimizations by Thomas Gleixner
				17	* Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
				18	*
				19	* Adaptive scheduling granularity, math enhancements by Peter Zijlstra
				20	* Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
				21	*/
				22
				23	#include <linux/latencytop.h>
				24	#include <linux/sched.h>
				25	#include <linux/cpumask.h>
				26	#include <linux/cpuidle.h>
				27	#include <linux/slab.h>
				28	#include <linux/profile.h>
				29	#include <linux/interrupt.h>
				30	#include <linux/mempolicy.h>
				31	#include <linux/migrate.h>
				32	#include <linux/task_work.h>
				33
				34	#include <trace/events/sched.h>
				35
				36	#include "sched.h"
				37
				38	/*
				39	* Targeted preemption latency for CPU-bound tasks:
				40	* (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
				41	*
				42	* NOTE: this latency value is not the same as the concept of
				43	* 'timeslice length' - timeslices in CFS are of variable length
				44	* and have no persistent notion like in traditional, time-slice
				45	* based scheduling concepts.
				46	*
				47	* (to see the precise effective timeslice length of your workload,
				48	* run vmstat and monitor the context-switches (cs) field)
				49	*/
				50	unsigned int sysctl_sched_latency = 6000000ULL;
				51	unsigned int normalized_sysctl_sched_latency = 6000000ULL;
				52
				53	/*
				54	* The initial- and re-scaling of tunables is configurable
				55	* (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
				56	*
				57	* Options are:
				58	* SCHED_TUNABLESCALING_NONE - unscaled, always *1
				59	* SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
				60	* SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
				61	*/
				62	enum sched_tunable_scaling sysctl_sched_tunable_scaling
				63	= SCHED_TUNABLESCALING_LOG;
				64
				65	/*
				66	* Minimal preemption granularity for CPU-bound tasks:
				67	* (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
				68	*/
				69	unsigned int sysctl_sched_min_granularity = 750000ULL;
				70	unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
				71
				72	/*
				73	* is kept at sysctl_sched_latency / sysctl_sched_min_granularity
				74	*/
				75	static unsigned int sched_nr_latency = 8;
				76
				77	/*
				78	* After fork, child runs first. If set to 0 (default) then
				79	* parent will (try to) run first.
				80	*/
				81	unsigned int sysctl_sched_child_runs_first __read_mostly;
				82
				83	/*
				84	* SCHED_OTHER wake-up granularity.
				85	* (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
				86	*
				87	* This option delays the preemption effects of decoupled workloads
				88	* and reduces their over-scheduling. Synchronous workloads will still
				89	* have immediate wakeup/sleep latencies.
				90	*/
				91	unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
				92	unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
				93
				94	const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
				95
				96	/*
				97	* The exponential sliding window over which load is averaged for shares
				98	* distribution.
				99	* (default: 10msec)
				100	*/
				101	unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
				102
				103	#ifdef CONFIG_CFS_BANDWIDTH
				104	/*
				105	* Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
				106	* each time a cfs_rq requests quota.
				107	*
				108	* Note: in the case that the slice exceeds the runtime remaining (either due
				109	* to consumption or the quota being specified to be smaller than the slice)
				110	* we will always only issue the remaining available time.
				111	*
				112	* default: 5 msec, units: microseconds
				113	*/
				114	unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
				115	#endif
				116
				117	static inline void update_load_add(struct load_weight *lw, unsigned long inc)
				118	{
				119	lw->weight += inc;
				120	lw->inv_weight = 0;
				121	}
				122
				123	static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
				124	{
				125	lw->weight -= dec;
				126	lw->inv_weight = 0;
				127	}
				128
				129	static inline void update_load_set(struct load_weight *lw, unsigned long w)
				130	{
				131	lw->weight = w;
				132	lw->inv_weight = 0;
				133	}
				134
				135	/*
				136	* Increase the granularity value when there are more CPUs,
				137	* because with more CPUs the 'effective latency' as visible
				138	* to users decreases. But the relationship is not linear,
				139	* so pick a second-best guess by going with the log2 of the
				140	* number of CPUs.
				141	*
				142	* This idea comes from the SD scheduler of Con Kolivas:
				143	*/
				144	static unsigned int get_update_sysctl_factor(void)
				145	{
				146	unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
				147	unsigned int factor;
				148
				149	switch (sysctl_sched_tunable_scaling) {
				150	case SCHED_TUNABLESCALING_NONE:
				151	factor = 1;
				152	break;
				153	case SCHED_TUNABLESCALING_LINEAR:
				154	factor = cpus;
				155	break;
				156	case SCHED_TUNABLESCALING_LOG:
				157	default:
				158	factor = 1 + ilog2(cpus);
				159	break;
				160	}
				161
				162	return factor;
				163	}
				164
				165	static void update_sysctl(void)
				166	{
				167	unsigned int factor = get_update_sysctl_factor();
				168
				169	#define SET_SYSCTL(name) \
				170	(sysctl_##name = (factor) * normalized_sysctl_##name)
				171	SET_SYSCTL(sched_min_granularity);
				172	SET_SYSCTL(sched_latency);
				173	SET_SYSCTL(sched_wakeup_granularity);
				174	#undef SET_SYSCTL
				175	}
				176
				/* Bring the scaled tunables in line with the current CPU count. */
				void sched_init_granularity(void)
				{
					update_sysctl();
				}
				181
				182	#define WMULT_CONST (~0U)
				183	#define WMULT_SHIFT 32
				184
				185	static void __update_inv_weight(struct load_weight *lw)
				186	{
				187	unsigned long w;
				188
				189	if (likely(lw->inv_weight))
				190	return;
				191
				192	w = scale_load_down(lw->weight);
				193
				194	if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
				195	lw->inv_weight = 1;
				196	else if (unlikely(!w))
				197	lw->inv_weight = WMULT_CONST;
				198	else
				199	lw->inv_weight = WMULT_CONST / w;
				200	}
				201
				202	/*
				203	* delta_exec * weight / lw.weight
				204	* OR
				205	* (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
				206	*
				207	* Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case
				208	* we're guaranteed shift stays positive because inv_weight is guaranteed to
				209	* fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
				210	*
				211	* Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus
				212	* weight/lw.weight <= 1, and therefore our shift will also be positive.
				213	*/
				214	static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
				215	{
				216	u64 fact = scale_load_down(weight);
				217	int shift = WMULT_SHIFT;
				218
				219	__update_inv_weight(lw);
				220
				221	if (unlikely(fact >> 32)) {
				222	while (fact >> 32) {
				223	fact >>= 1;
				224	shift--;
				225	}
				226	}
				227
				228	/* hint to use a 32x32->64 mul */
				229	fact = (u64)(u32)fact * lw->inv_weight;
				230
				231	while (fact >> 32) {
				232	fact >>= 1;
				233	shift--;
				234	}
				235
				236	return mul_u64_u32_shr(delta_exec, fact, shift);
				237	}
				238
				239
				240	const struct sched_class fair_sched_class;
				241
				242	/**************************************************************
				243	* CFS operations on generic schedulable entities:
				244	*/
				245
				246	#ifdef CONFIG_FAIR_GROUP_SCHED
				247
				248	/* cpu runqueue to which this cfs_rq is attached */
				249	static inline struct rq rq_of(struct cfs_rq cfs_rq)
				250	{
				251	return cfs_rq->rq;
				252	}
				253
				254	/* An entity is a task if it doesn't "own" a runqueue */
				255	#define entity_is_task(se) (!se->my_q)
				256
				257	static inline struct task_struct task_of(struct sched_entity se)
				258	{
				259	#ifdef CONFIG_SCHED_DEBUG
				260	WARN_ON_ONCE(!entity_is_task(se));
				261	#endif
				262	return container_of(se, struct task_struct, se);
				263	}
				264
				265	/* Walk up scheduling entities hierarchy */
				266	#define for_each_sched_entity(se) \
				267	for (; se; se = se->parent)
				268
				269	static inline struct cfs_rq task_cfs_rq(struct task_struct p)
				270	{
				271	return p->se.cfs_rq;
				272	}
				273
				274	/* runqueue on which this entity is (to be) queued */
				275	static inline struct cfs_rq cfs_rq_of(struct sched_entity se)
				276	{
				277	return se->cfs_rq;
				278	}
				279
				280	/* runqueue "owned" by this group */
				281	static inline struct cfs_rq group_cfs_rq(struct sched_entity grp)
				282	{
				283	return grp->my_q;
				284	}
				285
				286	static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
				287	{
				288	if (!cfs_rq->on_list) {
				289	/*
				290	* Ensure we either appear before our parent (if already
				291	* enqueued) or force our parent to appear after us when it is
				292	* enqueued. The fact that we always enqueue bottom-up
				293	* reduces this to two cases.
				294	*/
				295	if (cfs_rq->tg->parent &&
				296	cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
				297	list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
				298	&rq_of(cfs_rq)->leaf_cfs_rq_list);
				299	} else {
				300	list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
				301	&rq_of(cfs_rq)->leaf_cfs_rq_list);
				302	}
				303
				304	cfs_rq->on_list = 1;
				305	}
				306	}
				307
				308	static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
				309	{
				310	if (cfs_rq->on_list) {
				311	list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
				312	cfs_rq->on_list = 0;
				313	}
				314	}
				315
				316	/* Iterate thr' all leaf cfs_rq's on a runqueue */
				317	#define for_each_leaf_cfs_rq(rq, cfs_rq) \
				318	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
				319
				320	/* Do the two (enqueued) entities belong to the same group ? */
				321	static inline struct cfs_rq *
				322	is_same_group(struct sched_entity se, struct sched_entity pse)
				323	{
				324	if (se->cfs_rq == pse->cfs_rq)
				325	return se->cfs_rq;
				326
				327	return NULL;
				328	}
				329
				330	static inline struct sched_entity parent_entity(struct sched_entity se)
				331	{
				332	return se->parent;
				333	}
				334
				335	static void
				336	find_matching_se(struct sched_entity se, struct sched_entity pse)
				337	{
				338	int se_depth, pse_depth;
				339
				340	/*
				341	* preemption test can be made between sibling entities who are in the
				342	* same cfs_rq i.e who have a common parent. Walk up the hierarchy of
				343	* both tasks until we find their ancestors who are siblings of common
				344	* parent.
				345	*/
				346
				347	/* First walk up until both entities are at same depth */
				348	se_depth = (*se)->depth;
				349	pse_depth = (*pse)->depth;
				350
				351	while (se_depth > pse_depth) {
				352	se_depth--;
				353	se = parent_entity(se);
				354	}
				355
				356	while (pse_depth > se_depth) {
				357	pse_depth--;
				358	pse = parent_entity(pse);
				359	}
				360
				361	while (!is_same_group(se, pse)) {
				362	se = parent_entity(se);
				363	pse = parent_entity(pse);
				364	}
				365	}
				366
				367	#else /* !CONFIG_FAIR_GROUP_SCHED */
				368
				369	static inline struct task_struct task_of(struct sched_entity se)
				370	{
				371	return container_of(se, struct task_struct, se);
				372	}
				373
				374	static inline struct rq rq_of(struct cfs_rq cfs_rq)
				375	{
				376	return container_of(cfs_rq, struct rq, cfs);
				377	}
				378
				379	#define entity_is_task(se) 1
				380
				381	#define for_each_sched_entity(se) \
				382	for (; se; se = NULL)
				383
				384	static inline struct cfs_rq task_cfs_rq(struct task_struct p)
				385	{
				386	return &task_rq(p)->cfs;
				387	}
				388
				389	static inline struct cfs_rq cfs_rq_of(struct sched_entity se)
				390	{
				391	struct task_struct *p = task_of(se);
				392	struct rq *rq = task_rq(p);
				393
				394	return &rq->cfs;
				395	}
				396
				397	/* runqueue "owned" by this group */
				398	static inline struct cfs_rq group_cfs_rq(struct sched_entity grp)
				399	{
				400	return NULL;
				401	}
				402
				/* No-op in this configuration. */
				static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
				{
				}

				/* No-op in this configuration. */
				static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
				{
				}

				/* Visits exactly one cfs_rq: the runqueue's own rq->cfs. */
				#define for_each_leaf_cfs_rq(rq, cfs_rq) \
					for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
				413
				414	static inline struct sched_entity parent_entity(struct sched_entity se)
				415	{
				416	return NULL;
				417	}
				418
				/* No-op in this configuration; *@se and *@pse are left untouched. */
				static inline void
				find_matching_se(struct sched_entity **se, struct sched_entity **pse)
				{
				}
				423
				424	#endif /* CONFIG_FAIR_GROUP_SCHED */
				425
				426	static __always_inline
				427	void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
				428
				429	/**************************************************************
				430	* Scheduling class tree data structure manipulation methods:
				431	*/
				432
				433	static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
				434	{
				435	s64 delta = (s64)(vruntime - max_vruntime);
				436	if (delta > 0)
				437	max_vruntime = vruntime;
				438
				439	return max_vruntime;
				440	}
				441
				442	static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
				443	{
				444	s64 delta = (s64)(vruntime - min_vruntime);
				445	if (delta < 0)
				446	min_vruntime = vruntime;
				447
				448	return min_vruntime;
				449	}
				450
				451	static inline int entity_before(struct sched_entity *a,
				452	struct sched_entity *b)
				453	{
				454	return (s64)(a->vruntime - b->vruntime) < 0;
				455	}
				456
				457	static void update_min_vruntime(struct cfs_rq *cfs_rq)
				458	{
				459	u64 vruntime = cfs_rq->min_vruntime;
				460
				461	if (cfs_rq->curr)
				462	vruntime = cfs_rq->curr->vruntime;
				463
				464	if (cfs_rq->rb_leftmost) {
				465	struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
				466	struct sched_entity,
				467	run_node);
				468
				469	if (!cfs_rq->curr)
				470	vruntime = se->vruntime;
				471	else
				472	vruntime = min_vruntime(vruntime, se->vruntime);
				473	}
				474
				475	/* ensure we never gain time by being placed backwards. */
				476	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
				477	#ifndef CONFIG_64BIT
				478	smp_wmb();
				479	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
				480	#endif
				481	}
				482
				483	/*
				484	* Enqueue an entity into the rb-tree:
				485	*/
				486	static void __enqueue_entity(struct cfs_rq cfs_rq, struct sched_entity se)
				487	{
				488	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
				489	struct rb_node *parent = NULL;
				490	struct sched_entity *entry;
				491	int leftmost = 1;
				492
				493	/*
				494	* Find the right place in the rbtree:
				495	*/
				496	while (*link) {
				497	parent = *link;
				498	entry = rb_entry(parent, struct sched_entity, run_node);
				499	/*
				500	* We dont care about collisions. Nodes with
				501	* the same key stay together.
				502	*/
				503	if (entity_before(se, entry)) {
				504	link = &parent->rb_left;
				505	} else {
				506	link = &parent->rb_right;
				507	leftmost = 0;
				508	}
				509	}
				510
				511	/*
				512	* Maintain a cache of leftmost tree entries (it is frequently
				513	* used):
				514	*/
				515	if (leftmost)
				516	cfs_rq->rb_leftmost = &se->run_node;
				517
				518	rb_link_node(&se->run_node, parent, link);
				519	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
				520	}
				521
				522	static void __dequeue_entity(struct cfs_rq cfs_rq, struct sched_entity se)
				523	{
				524	if (cfs_rq->rb_leftmost == &se->run_node) {
				525	struct rb_node *next_node;
				526
				527	next_node = rb_next(&se->run_node);
				528	cfs_rq->rb_leftmost = next_node;
				529	}
				530
				531	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
				532	}
				533
				534	struct sched_entity __pick_first_entity(struct cfs_rq cfs_rq)
				535	{
				536	struct rb_node *left = cfs_rq->rb_leftmost;
				537
				538	if (!left)
				539	return NULL;
				540
				541	return rb_entry(left, struct sched_entity, run_node);
				542	}
				543
				544	static struct sched_entity __pick_next_entity(struct sched_entity se)
				545	{
				546	struct rb_node *next = rb_next(&se->run_node);
				547
				548	if (!next)
				549	return NULL;
				550
				551	return rb_entry(next, struct sched_entity, run_node);
				552	}
				553
				554	#ifdef CONFIG_SCHED_DEBUG
				555	struct sched_entity __pick_last_entity(struct cfs_rq cfs_rq)
				556	{
				557	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
				558
				559	if (!last)
				560	return NULL;
				561
				562	return rb_entry(last, struct sched_entity, run_node);
				563	}
				564
				565	/**************************************************************
				566	* Scheduling class statistics methods:
				567	*/
				568
				569	int sched_proc_update_handler(struct ctl_table *table, int write,
				570	void __user buffer, size_t lenp,
				571	loff_t *ppos)
				572	{
				573	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
				574	unsigned int factor = get_update_sysctl_factor();
				575
				576	if (ret \|\| !write)
				577	return ret;
				578
				579	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
				580	sysctl_sched_min_granularity);
				581
				582	#define WRT_SYSCTL(name) \
				583	(normalized_sysctl_##name = sysctl_##name / (factor))
				584	WRT_SYSCTL(sched_min_granularity);
				585	WRT_SYSCTL(sched_latency);
				586	WRT_SYSCTL(sched_wakeup_granularity);
				587	#undef WRT_SYSCTL
				588
				589	return 0;
				590	}
				591	#endif
				592
				593	/*
				594	* delta /= w
				595	*/
				596	static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
				597	{
				598	if (unlikely(se->load.weight != NICE_0_LOAD))
				599	delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
				600
				601	return delta;
				602	}
				603
				604	/*
				605	* The idea is to set a period in which each task runs once.
				606	*
				607	* When there are too many tasks (sched_nr_latency) we have to stretch
				608	* this period because otherwise the slices get too small.
				609	*
				610	* p = (nr <= nl) ? l : l*nr/nl
				611	*/
				612	static u64 __sched_period(unsigned long nr_running)
				613	{
				614	if (unlikely(nr_running > sched_nr_latency))
				615	return nr_running * sysctl_sched_min_granularity;
				616	else
				617	return sysctl_sched_latency;
				618	}
				619
				620	/*
				621	* We calculate the wall-time slice from the period by taking a part
				622	* proportional to the weight.
				623	*
				624	* s = p*P[w/rw]
				625	*/
				626	static u64 sched_slice(struct cfs_rq cfs_rq, struct sched_entity se)
				627	{
				628	u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
				629
				630	for_each_sched_entity(se) {
				631	struct load_weight *load;
				632	struct load_weight lw;
				633
				634	cfs_rq = cfs_rq_of(se);
				635	load = &cfs_rq->load;
				636
				637	if (unlikely(!se->on_rq)) {
				638	lw = cfs_rq->load;
				639
				640	update_load_add(&lw, se->load.weight);
				641	load = &lw;
				642	}
				643	slice = __calc_delta(slice, se->load.weight, load);
				644	}
				645	return slice;
				646	}
				647
				648	/*
				649	* We calculate the vruntime slice of a to-be-inserted task.
				650	*
				651	* vs = s/w
				652	*/
				653	static u64 sched_vslice(struct cfs_rq cfs_rq, struct sched_entity se)
				654	{
				655	return calc_delta_fair(sched_slice(cfs_rq, se), se);
				656	}
				657
				#ifdef CONFIG_SMP
				static int select_idle_sibling(struct task_struct *p, int cpu);
				static unsigned long task_h_load(struct task_struct *p);

				/*
				 * We choose a half-life close to 1 scheduling period.
				 * Note: The tables runnable_avg_yN_inv and runnable_avg_yN_sum are
				 * dependent on this value.
				 */
				#define LOAD_AVG_PERIOD 32
				#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
				#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */

				/*
				 * Give a new sched_entity initial runnable-average values so that it
				 * starts out counting as heavily loaded.
				 */
				void init_entity_runnable_average(struct sched_entity *se)
				{
					se->avg.last_update_time = 0;

					/*
					 * period_contrib must stay strictly below 1024. Starting at 1023
					 * makes it almost a full period (1024us), so it will definitely be
					 * updated after enqueue.
					 */
					se->avg.period_contrib = 1023;

					se->avg.load_avg = scale_load_down(se->load.weight);
					se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX;

					se->avg.util_avg = scale_load_down(SCHED_LOAD_SCALE);
					se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
					/* once this task is enqueued it contributes to its cfs_rq's load_avg */
				}

				#else
				/* No-op in builds without CONFIG_SMP. */
				void init_entity_runnable_average(struct sched_entity *se)
				{
				}
				#endif
				695
				696	/*
				697	* Update the current task's runtime statistics.
				698	*/
				699	static void update_curr(struct cfs_rq *cfs_rq)
				700	{
				701	struct sched_entity *curr = cfs_rq->curr;
				702	u64 now = rq_clock_task(rq_of(cfs_rq));
				703	u64 delta_exec;
				704
				705	if (unlikely(!curr))
				706	return;
				707
				708	delta_exec = now - curr->exec_start;
				709	if (unlikely((s64)delta_exec <= 0))
				710	return;
				711
				712	curr->exec_start = now;
				713
				714	schedstat_set(curr->statistics.exec_max,
				715	max(delta_exec, curr->statistics.exec_max));
				716
				717	curr->sum_exec_runtime += delta_exec;
				718	schedstat_add(cfs_rq, exec_clock, delta_exec);
				719
				720	curr->vruntime += calc_delta_fair(delta_exec, curr);
				721	update_min_vruntime(cfs_rq);
				722
				723	if (entity_is_task(curr)) {
				724	struct task_struct *curtask = task_of(curr);
				725
				726	trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
				727	cpuacct_charge(curtask, delta_exec);
				728	account_group_exec_runtime(curtask, delta_exec);
				729	}
				730
				731	account_cfs_rq_runtime(cfs_rq, delta_exec);
				732	}
				733
				734	static void update_curr_fair(struct rq *rq)
				735	{
				736	update_curr(cfs_rq_of(&rq->curr->se));
				737	}
				738
				739	static inline void
				740	update_stats_wait_start(struct cfs_rq cfs_rq, struct sched_entity se)
				741	{
				742	schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));
				743	}
				744
				745	/*
				746	* Task is being enqueued - update stats:
				747	*/
				748	static void update_stats_enqueue(struct cfs_rq cfs_rq, struct sched_entity se)
				749	{
				750	/*
				751	* Are we enqueueing a waiting task? (for current tasks
				752	* a dequeue/enqueue event is a NOP)
				753	*/
				754	if (se != cfs_rq->curr)
				755	update_stats_wait_start(cfs_rq, se);
				756	}
				757
				758	static void
				759	update_stats_wait_end(struct cfs_rq cfs_rq, struct sched_entity se)
				760	{
				761	schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
				762	rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
				763	schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
				764	schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
				765	rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
				766	#ifdef CONFIG_SCHEDSTATS
				767	if (entity_is_task(se)) {
				768	trace_sched_stat_wait(task_of(se),
				769	rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
				770	}
				771	#endif
				772	schedstat_set(se->statistics.wait_start, 0);
				773	}
				774
				775	static inline void
				776	update_stats_dequeue(struct cfs_rq cfs_rq, struct sched_entity se)
				777	{
				778	/*
				779	* Mark the end of the wait period if dequeueing a
				780	* waiting task:
				781	*/
				782	if (se != cfs_rq->curr)
				783	update_stats_wait_end(cfs_rq, se);
				784	}
				785
				786	/*
				787	* We are picking a new current task - update its stats:
				788	*/
				789	static inline void
				790	update_stats_curr_start(struct cfs_rq cfs_rq, struct sched_entity se)
				791	{
				792	/*
				793	* We are starting a new run period:
				794	*/
				795	se->exec_start = rq_clock_task(rq_of(cfs_rq));
				796	}
				797
				798	/**************************************************
				799	* Scheduling class queueing methods:
				800	*/
				801
				802	#ifdef CONFIG_NUMA_BALANCING
				/*
				 * Approximate time to scan a full NUMA task, in ms. The task scan period
				 * is calculated from the task's virtual memory size and
				 * numa_balancing_scan_size.
				 */
				unsigned int sysctl_numa_balancing_scan_period_min = 1000U;
				unsigned int sysctl_numa_balancing_scan_period_max = 60000U;

				/* Portion of address space to scan, in MB */
				unsigned int sysctl_numa_balancing_scan_size = 256U;

				/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
				unsigned int sysctl_numa_balancing_scan_delay = 1000U;
				816
				817	static unsigned int task_nr_scan_windows(struct task_struct *p)
				818	{
				819	unsigned long rss = 0;
				820	unsigned long nr_scan_pages;
				821
				822	/*
				823	* Calculations based on RSS as non-present and empty pages are skipped
				824	* by the PTE scanner and NUMA hinting faults should be trapped based
				825	* on resident pages
				826	*/
				827	nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
				828	rss = get_mm_rss(p->mm);
				829	if (!rss)
				830	rss = nr_scan_pages;
				831
				832	rss = round_up(rss, nr_scan_pages);
				833	return rss / nr_scan_pages;
				834	}
				835
				836	/* For sanitys sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
				837	#define MAX_SCAN_WINDOW 2560
				838
				839	static unsigned int task_scan_min(struct task_struct *p)
				840	{
				841	unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
				842	unsigned int scan, floor;
				843	unsigned int windows = 1;
				844
				845	if (scan_size < MAX_SCAN_WINDOW)
				846	windows = MAX_SCAN_WINDOW / scan_size;
				847	floor = 1000 / windows;
				848
				849	scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
				850	return max_t(unsigned int, floor, scan);
				851	}
				852
				853	static unsigned int task_scan_max(struct task_struct *p)
				854	{
				855	unsigned int smin = task_scan_min(p);
				856	unsigned int smax;
				857
				858	/* Watch for min being lower than max due to floor calculations */
				859	smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
				860	return max(smin, smax);
				861	}
				862
				863	static void account_numa_enqueue(struct rq rq, struct task_struct p)
				864	{
				865	rq->nr_numa_running += (p->numa_preferred_nid != -1);
				866	rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
				867	}
				868
				869	static void account_numa_dequeue(struct rq rq, struct task_struct p)
				870	{
				871	rq->nr_numa_running -= (p->numa_preferred_nid != -1);
				872	rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
				873	}
				874
				875	struct numa_group {
				876	atomic_t refcount;
				877
				878	spinlock_t lock; /* nr_tasks, tasks */
				879	int nr_tasks;
				880	pid_t gid;
				881
				882	struct rcu_head rcu;
				883	nodemask_t active_nodes;
				884	unsigned long total_faults;
				885	/*
				886	* Faults_cpu is used to decide whether memory should move
				887	* towards the CPU. As a consequence, these stats are weighted
				888	* more by CPU use than by memory faults.
				889	*/
				890	unsigned long *faults_cpu;
				891	unsigned long faults[0];
				892	};
				893
				/* Shared or private faults. */
				#define NR_NUMA_HINT_FAULT_TYPES 2

				/* Memory and CPU locality */
				#define NR_NUMA_HINT_FAULT_STATS (2 * NR_NUMA_HINT_FAULT_TYPES)

				/* Averaged statistics, and temporary buffers. */
				#define NR_NUMA_HINT_FAULT_BUCKETS (2 * NR_NUMA_HINT_FAULT_STATS)
				902
				903	pid_t task_numa_group_id(struct task_struct *p)
				904	{
				905	return p->numa_group ? p->numa_group->gid : 0;
				906	}
				907
				908	/*
				909	* The averaged statistics, shared & private, memory & cpu,
				910	* occupy the first half of the array. The second half of the
				911	* array is for current counters, which are averaged into the
				912	* first set by task_numa_placement.
				913	*/
				914	static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
				915	{
				916	return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
				917	}
				918
				919	static inline unsigned long task_faults(struct task_struct *p, int nid)
				920	{
				921	if (!p->numa_faults)
				922	return 0;
				923
				924	return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
				925	p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
				926	}
				927
				928	static inline unsigned long group_faults(struct task_struct *p, int nid)
				929	{
				930	if (!p->numa_group)
				931	return 0;
				932
				933	return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
				934	p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)];
				935	}
				936
				937	static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
				938	{
				939	return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
				940	group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
				941	}
				942
				943	/* Handle placement on systems where not all nodes are directly connected. */
				944	static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
				945	int maxdist, bool task)
				946	{
				947	unsigned long score = 0;
				948	int node;
				949
				950	/*
				951	* All nodes are directly connected, and the same distance
				952	* from each other. No need for fancy placement algorithms.
				953	*/
				954	if (sched_numa_topology_type == NUMA_DIRECT)
				955	return 0;
				956
				957	/*
				958	* This code is called for each node, introducing N^2 complexity,
				959	* which should be ok given the number of nodes rarely exceeds 8.
				960	*/
				961	for_each_online_node(node) {
				962	unsigned long faults;
				963	int dist = node_distance(nid, node);
				964
				965	/*
				966	* The furthest away nodes in the system are not interesting
				967	* for placement; nid was already counted.
				968	*/
				969	if (dist == sched_max_numa_distance \|\| node == nid)
				970	continue;
				971
				972	/*
				973	* On systems with a backplane NUMA topology, compare groups
				974	* of nodes, and move tasks towards the group with the most
				975	* memory accesses. When comparing two nodes at distance
				976	* "hoplimit", only nodes closer by than "hoplimit" are part
				977	* of each group. Skip other nodes.
				978	*/
				979	if (sched_numa_topology_type == NUMA_BACKPLANE &&
				980	dist > maxdist)
				981	continue;
				982
				983	/* Add up the faults from nearby nodes. */
				984	if (task)
				985	faults = task_faults(p, node);
				986	else
				987	faults = group_faults(p, node);
				988
				989	/*
				990	* On systems with a glueless mesh NUMA topology, there are
				991	* no fixed "groups of nodes". Instead, nodes that are not
				992	* directly connected bounce traffic through intermediate
				993	* nodes; a numa_group can occupy any set of nodes.
				994	* The further away a node is, the less the faults count.
				995	* This seems to result in good task placement.
				996	*/
				997	if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
				998	faults *= (sched_max_numa_distance - dist);
				999	faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
				1000	}
				1001
				1002	score += faults;
				1003	}
				1004
				1005	return score;
				1006	}
				1007
				1008	/*
				1009	* These return the fraction of accesses done by a particular task, or
				1010	* task group, on a particular numa node. The group weight is given a
				1011	* larger multiplier, in order to group tasks together that are almost
				1012	* evenly spread out between numa nodes.
				1013	*/
				1014	static inline unsigned long task_weight(struct task_struct *p, int nid,
				1015	int dist)
				1016	{
				1017	unsigned long faults, total_faults;
				1018
				1019	if (!p->numa_faults)
				1020	return 0;
				1021
				1022	total_faults = p->total_numa_faults;
				1023
				1024	if (!total_faults)
				1025	return 0;
				1026
				1027	faults = task_faults(p, nid);
				1028	faults += score_nearby_nodes(p, nid, dist, true);
				1029
				1030	return 1000 * faults / total_faults;
				1031	}
				1032
				1033	static inline unsigned long group_weight(struct task_struct *p, int nid,
				1034	int dist)
				1035	{
				1036	unsigned long faults, total_faults;
				1037
				1038	if (!p->numa_group)
				1039	return 0;
				1040
				1041	total_faults = p->numa_group->total_faults;
				1042
				1043	if (!total_faults)
				1044	return 0;
				1045
				1046	faults = group_faults(p, nid);
				1047	faults += score_nearby_nodes(p, nid, dist, false);
				1048
				1049	return 1000 * faults / total_faults;
				1050	}
				1051
				1052	bool should_numa_migrate_memory(struct task_struct p, struct page page,
				1053	int src_nid, int dst_cpu)
				1054	{
				1055	struct numa_group *ng = p->numa_group;
				1056	int dst_nid = cpu_to_node(dst_cpu);
				1057	int last_cpupid, this_cpupid;
				1058
				1059	this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
				1060
				1061	/*
				1062	* Multi-stage node selection is used in conjunction with a periodic
				1063	* migration fault to build a temporal task<->page relation. By using
				1064	* a two-stage filter we remove short/unlikely relations.
				1065	*
				1066	* Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
				1067	* a task's usage of a particular page (n_p) per total usage of this
				1068	* page (n_t) (in a given time-span) to a probability.
				1069	*
				1070	* Our periodic faults will sample this probability and getting the
				1071	* same result twice in a row, given these samples are fully
				1072	* independent, is then given by P(n)^2, provided our sample period
				1073	* is sufficiently short compared to the usage pattern.
				1074	*
				1075	* This quadric squishes small probabilities, making it less likely we
				1076	* act on an unlikely task<->page relation.
				1077	*/
				1078	last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
				1079	if (!cpupid_pid_unset(last_cpupid) &&
				1080	cpupid_to_nid(last_cpupid) != dst_nid)
				1081	return false;
				1082
				1083	/* Always allow migrate on private faults */
				1084	if (cpupid_match_pid(p, last_cpupid))
				1085	return true;
				1086
				1087	/* A shared fault, but p->numa_group has not been set up yet. */
				1088	if (!ng)
				1089	return true;
				1090
				1091	/*
				1092	* Do not migrate if the destination is not a node that
				1093	* is actively used by this numa group.
				1094	*/
				1095	if (!node_isset(dst_nid, ng->active_nodes))
				1096	return false;
				1097
				1098	/*
				1099	* Source is a node that is not actively used by this
				1100	* numa group, while the destination is. Migrate.
				1101	*/
				1102	if (!node_isset(src_nid, ng->active_nodes))
				1103	return true;
				1104
				1105	/*
				1106	* Both source and destination are nodes in active
				1107	* use by this numa group. Maximize memory bandwidth
				1108	* by migrating from more heavily used groups, to less
				1109	* heavily used ones, spreading the load around.
				1110	* Use a 1/4 hysteresis to avoid spurious page movement.
				1111	*/
				1112	return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4);
				1113	}
				1114
				1115	static unsigned long weighted_cpuload(const int cpu);
				1116	static unsigned long source_load(int cpu, int type);
				1117	static unsigned long target_load(int cpu, int type);
				1118	static unsigned long capacity_of(int cpu);
				1119	static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
				1120
				1121	/* Cached statistics for all CPUs within a node */
				1122	struct numa_stats {
				1123	unsigned long nr_running;
				1124	unsigned long load;
				1125
				1126	/* Total compute capacity of CPUs on a node */
				1127	unsigned long compute_capacity;
				1128
				1129	/* Approximate capacity in terms of runnable tasks on a node */
				1130	unsigned long task_capacity;
				1131	int has_free_capacity;
				1132	};
				1133
				1134	/*
				1135	* XXX borrowed from update_sg_lb_stats
				1136	*/
				1137	static void update_numa_stats(struct numa_stats *ns, int nid)
				1138	{
				1139	int smt, cpu, cpus = 0;
				1140	unsigned long capacity;
				1141
				1142	memset(ns, 0, sizeof(*ns));
				1143	for_each_cpu(cpu, cpumask_of_node(nid)) {
				1144	struct rq *rq = cpu_rq(cpu);
				1145
				1146	ns->nr_running += rq->nr_running;
				1147	ns->load += weighted_cpuload(cpu);
				1148	ns->compute_capacity += capacity_of(cpu);
				1149
				1150	cpus++;
				1151	}
				1152
				1153	/*
				1154	* If we raced with hotplug and there are no CPUs left in our mask
				1155	* the @ns structure is NULL'ed and task_numa_compare() will
				1156	* not find this node attractive.
				1157	*
				1158	* We'll either bail at !has_free_capacity, or we'll detect a huge
				1159	* imbalance and bail there.
				1160	*/
				1161	if (!cpus)
				1162	return;
				1163
				1164	/* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */
				1165	smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
				1166	capacity = cpus / smt; /* cores */
				1167
				1168	ns->task_capacity = min_t(unsigned, capacity,
				1169	DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
				1170	ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
				1171	}
				1172
				1173	struct task_numa_env {
				1174	struct task_struct *p;
				1175
				1176	int src_cpu, src_nid;
				1177	int dst_cpu, dst_nid;
				1178
				1179	struct numa_stats src_stats, dst_stats;
				1180
				1181	int imbalance_pct;
				1182	int dist;
				1183
				1184	struct task_struct *best_task;
				1185	long best_imp;
				1186	int best_cpu;
				1187	};
				1188
				1189	static void task_numa_assign(struct task_numa_env *env,
				1190	struct task_struct *p, long imp)
				1191	{
				1192	if (env->best_task)
				1193	put_task_struct(env->best_task);
				1194
				1195	env->best_task = p;
				1196	env->best_imp = imp;
				1197	env->best_cpu = env->dst_cpu;
				1198	}
				1199
				1200	static bool load_too_imbalanced(long src_load, long dst_load,
				1201	struct task_numa_env *env)
				1202	{
				1203	long imb, old_imb;
				1204	long orig_src_load, orig_dst_load;
				1205	long src_capacity, dst_capacity;
				1206
				1207	/*
				1208	* The load is corrected for the CPU capacity available on each node.
				1209	*
				1210	* src_load dst_load
				1211	* ------------ vs ---------
				1212	* src_capacity dst_capacity
				1213	*/
				1214	src_capacity = env->src_stats.compute_capacity;
				1215	dst_capacity = env->dst_stats.compute_capacity;
				1216
				1217	/* We care about the slope of the imbalance, not the direction. */
				1218	if (dst_load < src_load)
				1219	swap(dst_load, src_load);
				1220
				1221	/* Is the difference below the threshold? */
				1222	imb = dst_load * src_capacity * 100 -
				1223	src_load * dst_capacity * env->imbalance_pct;
				1224	if (imb <= 0)
				1225	return false;
				1226
				1227	/*
				1228	* The imbalance is above the allowed threshold.
				1229	* Compare it with the old imbalance.
				1230	*/
				1231	orig_src_load = env->src_stats.load;
				1232	orig_dst_load = env->dst_stats.load;
				1233
				1234	if (orig_dst_load < orig_src_load)
				1235	swap(orig_dst_load, orig_src_load);
				1236
				1237	old_imb = orig_dst_load * src_capacity * 100 -
				1238	orig_src_load * dst_capacity * env->imbalance_pct;
				1239
				1240	/* Would this change make things worse? */
				1241	return (imb > old_imb);
				1242	}
				1243
				1244	/*
				1245	* This checks if the overall compute and NUMA accesses of the system would
				1246	* be improved if the source tasks was migrated to the target dst_cpu taking
				1247	* into account that it might be best if task running on the dst_cpu should
				1248	* be exchanged with the source task
				1249	*/
				1250	static void task_numa_compare(struct task_numa_env *env,
				1251	long taskimp, long groupimp)
				1252	{
				1253	struct rq *src_rq = cpu_rq(env->src_cpu);
				1254	struct rq *dst_rq = cpu_rq(env->dst_cpu);
				1255	struct task_struct *cur;
				1256	long src_load, dst_load;
				1257	long load;
				1258	long imp = env->p->numa_group ? groupimp : taskimp;
				1259	long moveimp = imp;
				1260	int dist = env->dist;
				1261	bool assigned = false;
				1262
				1263	rcu_read_lock();
				1264
				1265	raw_spin_lock_irq(&dst_rq->lock);
				1266	cur = dst_rq->curr;
				1267	/*
				1268	* No need to move the exiting task or idle task.
				1269	*/
				1270	if ((cur->flags & PF_EXITING) \|\| is_idle_task(cur))
				1271	cur = NULL;
				1272	else {
				1273	/*
				1274	* The task_struct must be protected here to protect the
				1275	* p->numa_faults access in the task_weight since the
				1276	* numa_faults could already be freed in the following path:
				1277	* finish_task_switch()
				1278	* --> put_task_struct()
				1279	* --> __put_task_struct()
				1280	* --> task_numa_free()
				1281	*/
				1282	get_task_struct(cur);
				1283	}
				1284
				1285	raw_spin_unlock_irq(&dst_rq->lock);
				1286
				1287	/*
				1288	* Because we have preemption enabled we can get migrated around and
				1289	* end try selecting ourselves (current == env->p) as a swap candidate.
				1290	*/
				1291	if (cur == env->p)
				1292	goto unlock;
				1293
				1294	/*
				1295	* "imp" is the fault differential for the source task between the
				1296	* source and destination node. Calculate the total differential for
				1297	* the source task and potential destination task. The more negative
				1298	* the value is, the more rmeote accesses that would be expected to
				1299	* be incurred if the tasks were swapped.
				1300	*/
				1301	if (cur) {
				1302	/* Skip this swap candidate if cannot move to the source cpu */
				1303	if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
				1304	goto unlock;
				1305
				1306	/*
				1307	* If dst and source tasks are in the same NUMA group, or not
				1308	* in any group then look only at task weights.
				1309	*/
				1310	if (cur->numa_group == env->p->numa_group) {
				1311	imp = taskimp + task_weight(cur, env->src_nid, dist) -
				1312	task_weight(cur, env->dst_nid, dist);
				1313	/*
				1314	* Add some hysteresis to prevent swapping the
				1315	* tasks within a group over tiny differences.
				1316	*/
				1317	if (cur->numa_group)
				1318	imp -= imp/16;
				1319	} else {
				1320	/*
				1321	* Compare the group weights. If a task is all by
				1322	* itself (not part of a group), use the task weight
				1323	* instead.
				1324	*/
				1325	if (cur->numa_group)
				1326	imp += group_weight(cur, env->src_nid, dist) -
				1327	group_weight(cur, env->dst_nid, dist);
				1328	else
				1329	imp += task_weight(cur, env->src_nid, dist) -
				1330	task_weight(cur, env->dst_nid, dist);
				1331	}
				1332	}
				1333
				1334	if (imp <= env->best_imp && moveimp <= env->best_imp)
				1335	goto unlock;
				1336
				1337	if (!cur) {
				1338	/* Is there capacity at our destination? */
				1339	if (env->src_stats.nr_running <= env->src_stats.task_capacity &&
				1340	!env->dst_stats.has_free_capacity)
				1341	goto unlock;
				1342
				1343	goto balance;
				1344	}
				1345
				1346	/* Balance doesn't matter much if we're running a task per cpu */
				1347	if (imp > env->best_imp && src_rq->nr_running == 1 &&
				1348	dst_rq->nr_running == 1)
				1349	goto assign;
				1350
				1351	/*
				1352	* In the overloaded case, try and keep the load balanced.
				1353	*/
				1354	balance:
				1355	load = task_h_load(env->p);
				1356	dst_load = env->dst_stats.load + load;
				1357	src_load = env->src_stats.load - load;
				1358
				1359	if (moveimp > imp && moveimp > env->best_imp) {
				1360	/*
				1361	* If the improvement from just moving env->p direction is
				1362	* better than swapping tasks around, check if a move is
				1363	* possible. Store a slightly smaller score than moveimp,
				1364	* so an actually idle CPU will win.
				1365	*/
				1366	if (!load_too_imbalanced(src_load, dst_load, env)) {
				1367	imp = moveimp - 1;
				1368	put_task_struct(cur);
				1369	cur = NULL;
				1370	goto assign;
				1371	}
				1372	}
				1373
				1374	if (imp <= env->best_imp)
				1375	goto unlock;
				1376
				1377	if (cur) {
				1378	load = task_h_load(cur);
				1379	dst_load -= load;
				1380	src_load += load;
				1381	}
				1382
				1383	if (load_too_imbalanced(src_load, dst_load, env))
				1384	goto unlock;
				1385
				1386	/*
				1387	* One idle CPU per node is evaluated for a task numa move.
				1388	* Call select_idle_sibling to maybe find a better one.
				1389	*/
				1390	if (!cur)
				1391	env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
				1392
				1393	assign:
				1394	assigned = true;
				1395	task_numa_assign(env, cur, imp);
				1396	unlock:
				1397	rcu_read_unlock();
				1398	/*
				1399	* The dst_rq->curr isn't assigned. The protection for task_struct is
				1400	* finished.
				1401	*/
				1402	if (cur && !assigned)
				1403	put_task_struct(cur);
				1404	}
				1405
				1406	static void task_numa_find_cpu(struct task_numa_env *env,
				1407	long taskimp, long groupimp)
				1408	{
				1409	int cpu;
				1410
				1411	for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
				1412	/* Skip this CPU if the source task cannot migrate */
				1413	if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))
				1414	continue;
				1415
				1416	env->dst_cpu = cpu;
				1417	task_numa_compare(env, taskimp, groupimp);
				1418	}
				1419	}
				1420
				1421	/* Only move tasks to a NUMA node less busy than the current node. */
				1422	static bool numa_has_capacity(struct task_numa_env *env)
				1423	{
				1424	struct numa_stats *src = &env->src_stats;
				1425	struct numa_stats *dst = &env->dst_stats;
				1426
				1427	if (src->has_free_capacity && !dst->has_free_capacity)
				1428	return false;
				1429
				1430	/*
				1431	* Only consider a task move if the source has a higher load
				1432	* than the destination, corrected for CPU capacity on each node.
				1433	*
				1434	* src->load dst->load
				1435	* --------------------- vs ---------------------
				1436	* src->compute_capacity dst->compute_capacity
				1437	*/
				1438	if (src->load * dst->compute_capacity * env->imbalance_pct >
				1439
				1440	dst->load * src->compute_capacity * 100)
				1441	return true;
				1442
				1443	return false;
				1444	}
				1445
				1446	static int task_numa_migrate(struct task_struct *p)
				1447	{
				1448	struct task_numa_env env = {
				1449	.p = p,
				1450
				1451	.src_cpu = task_cpu(p),
				1452	.src_nid = task_node(p),
				1453
				1454	.imbalance_pct = 112,
				1455
				1456	.best_task = NULL,
				1457	.best_imp = 0,
				1458	.best_cpu = -1
				1459	};
				1460	struct sched_domain *sd;
				1461	unsigned long taskweight, groupweight;
				1462	int nid, ret, dist;
				1463	long taskimp, groupimp;
				1464
				1465	/*
				1466	* Pick the lowest SD_NUMA domain, as that would have the smallest
				1467	* imbalance and would be the first to start moving tasks about.
				1468	*
				1469	* And we want to avoid any moving of tasks about, as that would create
				1470	* random movement of tasks -- counter the numa conditions we're trying
				1471	* to satisfy here.
				1472	*/
				1473	rcu_read_lock();
				1474	sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
				1475	if (sd)
				1476	env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
				1477	rcu_read_unlock();
				1478
				1479	/*
				1480	* Cpusets can break the scheduler domain tree into smaller
				1481	* balance domains, some of which do not cross NUMA boundaries.
				1482	* Tasks that are "trapped" in such domains cannot be migrated
				1483	* elsewhere, so there is no point in (re)trying.
				1484	*/
				1485	if (unlikely(!sd)) {
				1486	p->numa_preferred_nid = task_node(p);
				1487	return -EINVAL;
				1488	}
				1489
				1490	env.dst_nid = p->numa_preferred_nid;
				1491	dist = env.dist = node_distance(env.src_nid, env.dst_nid);
				1492	taskweight = task_weight(p, env.src_nid, dist);
				1493	groupweight = group_weight(p, env.src_nid, dist);
				1494	update_numa_stats(&env.src_stats, env.src_nid);
				1495	taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
				1496	groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
				1497	update_numa_stats(&env.dst_stats, env.dst_nid);
				1498
				1499	/* Try to find a spot on the preferred nid. */
				1500	if (numa_has_capacity(&env))
				1501	task_numa_find_cpu(&env, taskimp, groupimp);
				1502
				1503	/*
				1504	* Look at other nodes in these cases:
				1505	* - there is no space available on the preferred_nid
				1506	* - the task is part of a numa_group that is interleaved across
				1507	* multiple NUMA nodes; in order to better consolidate the group,
				1508	* we need to check other locations.
				1509	*/
				1510	if (env.best_cpu == -1 \|\| (p->numa_group &&
				1511	nodes_weight(p->numa_group->active_nodes) > 1)) {
				1512	for_each_online_node(nid) {
				1513	if (nid == env.src_nid \|\| nid == p->numa_preferred_nid)
				1514	continue;
				1515
				1516	dist = node_distance(env.src_nid, env.dst_nid);
				1517	if (sched_numa_topology_type == NUMA_BACKPLANE &&
				1518	dist != env.dist) {
				1519	taskweight = task_weight(p, env.src_nid, dist);
				1520	groupweight = group_weight(p, env.src_nid, dist);
				1521	}
				1522
				1523	/* Only consider nodes where both task and groups benefit */
				1524	taskimp = task_weight(p, nid, dist) - taskweight;
				1525	groupimp = group_weight(p, nid, dist) - groupweight;
				1526	if (taskimp < 0 && groupimp < 0)
				1527	continue;
				1528
				1529	env.dist = dist;
				1530	env.dst_nid = nid;
				1531	update_numa_stats(&env.dst_stats, env.dst_nid);
				1532	if (numa_has_capacity(&env))
				1533	task_numa_find_cpu(&env, taskimp, groupimp);
				1534	}
				1535	}
				1536
				1537	/*
				1538	* If the task is part of a workload that spans multiple NUMA nodes,
				1539	* and is migrating into one of the workload's active nodes, remember
				1540	* this node as the task's preferred numa node, so the workload can
				1541	* settle down.
				1542	* A task that migrated to a second choice node will be better off
				1543	* trying for a better one later. Do not set the preferred node here.
				1544	*/
				1545	if (p->numa_group) {
				1546	if (env.best_cpu == -1)
				1547	nid = env.src_nid;
				1548	else
				1549	nid = env.dst_nid;
				1550
				1551	if (node_isset(nid, p->numa_group->active_nodes))
				1552	sched_setnuma(p, env.dst_nid);
				1553	}
				1554
				1555	/* No better CPU than the current one was found. */
				1556	if (env.best_cpu == -1)
				1557	return -EAGAIN;
				1558
				1559	/*
				1560	* Reset the scan period if the task is being rescheduled on an
				1561	* alternative node to recheck if the tasks is now properly placed.
				1562	*/
				1563	p->numa_scan_period = task_scan_min(p);
				1564
				1565	if (env.best_task == NULL) {
				1566	ret = migrate_task_to(p, env.best_cpu);
				1567	if (ret != 0)
				1568	trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
				1569	return ret;
				1570	}
				1571
				1572	ret = migrate_swap(p, env.best_task);
				1573	if (ret != 0)
				1574	trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
				1575	put_task_struct(env.best_task);
				1576	return ret;
				1577	}
				1578
				1579	/* Attempt to migrate a task to a CPU on the preferred node. */
				1580	static void numa_migrate_preferred(struct task_struct *p)
				1581	{
				1582	unsigned long interval = HZ;
				1583
				1584	/* This task has no NUMA fault statistics yet */
				1585	if (unlikely(p->numa_preferred_nid == -1 \|\| !p->numa_faults))
				1586	return;
				1587
				1588	/* Periodically retry migrating the task to the preferred node */
				1589	interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
				1590	p->numa_migrate_retry = jiffies + interval;
				1591
				1592	/* Success if task is already running on preferred CPU */
				1593	if (task_node(p) == p->numa_preferred_nid)
				1594	return;
				1595
				1596	/* Otherwise, try migrate to a CPU on the preferred node */
				1597	task_numa_migrate(p);
				1598	}
				1599
				1600	/*
				1601	* Find the nodes on which the workload is actively running. We do this by
				1602	* tracking the nodes from which NUMA hinting faults are triggered. This can
				1603	* be different from the set of nodes where the workload's memory is currently
				1604	* located.
				1605	*
				1606	* The bitmask is used to make smarter decisions on when to do NUMA page
				1607	* migrations, To prevent flip-flopping, and excessive page migrations, nodes
				1608	* are added when they cause over 6/16 of the maximum number of faults, but
				1609	* only removed when they drop below 3/16.
				1610	*/
				1611	static void update_numa_active_node_mask(struct numa_group *numa_group)
				1612	{
				1613	unsigned long faults, max_faults = 0;
				1614	int nid;
				1615
				1616	for_each_online_node(nid) {
				1617	faults = group_faults_cpu(numa_group, nid);
				1618	if (faults > max_faults)
				1619	max_faults = faults;
				1620	}
				1621
				1622	for_each_online_node(nid) {
				1623	faults = group_faults_cpu(numa_group, nid);
				1624	if (!node_isset(nid, numa_group->active_nodes)) {
				1625	if (faults > max_faults * 6 / 16)
				1626	node_set(nid, numa_group->active_nodes);
				1627	} else if (faults < max_faults * 3 / 16)
				1628	node_clear(nid, numa_group->active_nodes);
				1629	}
				1630	}
				1631
				1632	/*
				1633	* When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
				1634	* increments. The more local the fault statistics are, the higher the scan
				1635	* period will be for the next scan window. If local/(local+remote) ratio is
				1636	* below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS)
				1637	* the scan period will decrease. Aim for 70% local accesses.
				1638	*/
				1639	#define NUMA_PERIOD_SLOTS 10
				1640	#define NUMA_PERIOD_THRESHOLD 7
				1641
				1642	/*
				1643	* Increase the scan period (slow down scanning) if the majority of
				1644	* our memory is already on our local node, or if the majority of
				1645	* the page accesses are shared with other processes.
				1646	* Otherwise, decrease the scan period.
				1647	*/
				1648	static void update_task_scan_period(struct task_struct *p,
				1649	unsigned long shared, unsigned long private)
				1650	{
				1651	unsigned int period_slot;
				1652	int ratio;
				1653	int diff;
				1654
				1655	unsigned long remote = p->numa_faults_locality[0];
				1656	unsigned long local = p->numa_faults_locality[1];
				1657
				1658	/*
				1659	* If there were no record hinting faults then either the task is
				1660	* completely idle or all activity is areas that are not of interest
				1661	* to automatic numa balancing. Related to that, if there were failed
				1662	* migration then it implies we are migrating too quickly or the local
				1663	* node is overloaded. In either case, scan slower
				1664	*/
				1665	if (local + shared == 0 \|\| p->numa_faults_locality[2]) {
				1666	p->numa_scan_period = min(p->numa_scan_period_max,
				1667	p->numa_scan_period << 1);
				1668
				1669	p->mm->numa_next_scan = jiffies +
				1670	msecs_to_jiffies(p->numa_scan_period);
				1671
				1672	return;
				1673	}
				1674
				1675	/*
				1676	* Prepare to scale scan period relative to the current period.
				1677	* == NUMA_PERIOD_THRESHOLD scan period stays the same
				1678	* < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
				1679	* >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
				1680	*/
				1681	period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
				1682	ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
				1683	if (ratio >= NUMA_PERIOD_THRESHOLD) {
				1684	int slot = ratio - NUMA_PERIOD_THRESHOLD;
				1685	if (!slot)
				1686	slot = 1;
				1687	diff = slot * period_slot;
				1688	} else {
				1689	diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
				1690
				1691	/*
				1692	* Scale scan rate increases based on sharing. There is an
				1693	* inverse relationship between the degree of sharing and
				1694	* the adjustment made to the scanning period. Broadly
				1695	* speaking the intent is that there is little point
				1696	* scanning faster if shared accesses dominate as it may
				1697	* simply bounce migrations uselessly
				1698	*/
				1699	ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1));
				1700	diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
				1701	}
				1702
				1703	p->numa_scan_period = clamp(p->numa_scan_period + diff,
				1704	task_scan_min(p), task_scan_max(p));
				1705	memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
				1706	}
				1707
				1708	/*
				1709	* Get the fraction of time the task has been running since the last
				1710	* NUMA placement cycle. The scheduler keeps similar statistics, but
				1711	* decays those on a 32ms period, which is orders of magnitude off
				1712	* from the dozens-of-seconds NUMA balancing period. Use the scheduler
				1713	* stats only if the task is so new there are no NUMA statistics yet.
				1714	*/
				1715	static u64 numa_get_avg_runtime(struct task_struct p, u64 period)
				1716	{
				1717	u64 runtime, delta, now;
				1718	/* Use the start of this time slice to avoid calculations. */
				1719	now = p->se.exec_start;
				1720	runtime = p->se.sum_exec_runtime;
				1721
				1722	if (p->last_task_numa_placement) {
				1723	delta = runtime - p->last_sum_exec_runtime;
				1724	*period = now - p->last_task_numa_placement;
				1725	} else {
				1726	delta = p->se.avg.load_sum / p->se.load.weight;
				1727	*period = LOAD_AVG_MAX;
				1728	}
				1729
				1730	p->last_sum_exec_runtime = runtime;
				1731	p->last_task_numa_placement = now;
				1732
				1733	return delta;
				1734	}
				1735
				1736	/*
				1737	* Determine the preferred nid for a task in a numa_group. This needs to
				1738	* be done in a way that produces consistent results with group_weight,
				1739	* otherwise workloads might not converge.
				1740	*/
				1741	static int preferred_group_nid(struct task_struct *p, int nid)
				1742	{
				1743	nodemask_t nodes;
				1744	int dist;
				1745
				1746	/* Direct connections between all NUMA nodes. */
				1747	if (sched_numa_topology_type == NUMA_DIRECT)
				1748	return nid;
				1749
				1750	/*
				1751	* On a system with glueless mesh NUMA topology, group_weight
				1752	* scores nodes according to the number of NUMA hinting faults on
				1753	* both the node itself, and on nearby nodes.
				1754	*/
				1755	if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
				1756	unsigned long score, max_score = 0;
				1757	int node, max_node = nid;
				1758
				1759	dist = sched_max_numa_distance;
				1760
				1761	for_each_online_node(node) {
				1762	score = group_weight(p, node, dist);
				1763	if (score > max_score) {
				1764	max_score = score;
				1765	max_node = node;
				1766	}
				1767	}
				1768	return max_node;
				1769	}
				1770
				1771	/*
				1772	* Finding the preferred nid in a system with NUMA backplane
				1773	* interconnect topology is more involved. The goal is to locate
				1774	* tasks from numa_groups near each other in the system, and
				1775	* untangle workloads from different sides of the system. This requires
				1776	* searching down the hierarchy of node groups, recursively searching
				1777	* inside the highest scoring group of nodes. The nodemask tricks
				1778	* keep the complexity of the search down.
				1779	*/
				1780	nodes = node_online_map;
				1781	for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
				1782	unsigned long max_faults = 0;
				1783	nodemask_t max_group = NODE_MASK_NONE;
				1784	int a, b;
				1785
				1786	/* Are there nodes at this distance from each other? */
				1787	if (!find_numa_distance(dist))
				1788	continue;
				1789
				1790	for_each_node_mask(a, nodes) {
				1791	unsigned long faults = 0;
				1792	nodemask_t this_group;
				1793	nodes_clear(this_group);
				1794
				1795	/* Sum group's NUMA faults; includes a==b case. */
				1796	for_each_node_mask(b, nodes) {
				1797	if (node_distance(a, b) < dist) {
				1798	faults += group_faults(p, b);
				1799	node_set(b, this_group);
				1800	node_clear(b, nodes);
				1801	}
				1802	}
				1803
				1804	/* Remember the top group. */
				1805	if (faults > max_faults) {
				1806	max_faults = faults;
				1807	max_group = this_group;
				1808	/*
				1809	* subtle: at the smallest distance there is
				1810	* just one node left in each "group", the
				1811	* winner is the preferred nid.
				1812	*/
				1813	nid = a;
				1814	}
				1815	}
				1816	/* Next round, evaluate the nodes within max_group. */
				1817	if (!max_faults)
				1818	break;
				1819	nodes = max_group;
				1820	}
				1821	return nid;
				1822	}
				1823
				1824	static void task_numa_placement(struct task_struct *p)
				1825	{
				1826	int seq, nid, max_nid = -1, max_group_nid = -1;
				1827	unsigned long max_faults = 0, max_group_faults = 0;
				1828	unsigned long fault_types[2] = { 0, 0 };
				1829	unsigned long total_faults;
				1830	u64 runtime, period;
				1831	spinlock_t *group_lock = NULL;
				1832
				1833	/*
				1834	* The p->mm->numa_scan_seq field gets updated without
				1835	* exclusive access. Use READ_ONCE() here to ensure
				1836	* that the field is read in a single access:
				1837	*/
				1838	seq = READ_ONCE(p->mm->numa_scan_seq);
				1839	if (p->numa_scan_seq == seq)
				1840	return;
				1841	p->numa_scan_seq = seq;
				1842	p->numa_scan_period_max = task_scan_max(p);
				1843
				1844	total_faults = p->numa_faults_locality[0] +
				1845	p->numa_faults_locality[1];
				1846	runtime = numa_get_avg_runtime(p, &period);
				1847
				1848	/* If the task is part of a group prevent parallel updates to group stats */
				1849	if (p->numa_group) {
				1850	group_lock = &p->numa_group->lock;
				1851	spin_lock_irq(group_lock);
				1852	}
				1853
				1854	/* Find the node with the highest number of faults */
				1855	for_each_online_node(nid) {
				1856	/* Keep track of the offsets in numa_faults array */
				1857	int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
				1858	unsigned long faults = 0, group_faults = 0;
				1859	int priv;
				1860
				1861	for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
				1862	long diff, f_diff, f_weight;
				1863
				1864	mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
				1865	membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
				1866	cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
				1867	cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
				1868
				1869	/* Decay existing window, copy faults since last scan */
				1870	diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
				1871	fault_types[priv] += p->numa_faults[membuf_idx];
				1872	p->numa_faults[membuf_idx] = 0;
				1873
				1874	/*
				1875	* Normalize the faults_from, so all tasks in a group
				1876	* count according to CPU use, instead of by the raw
				1877	* number of faults. Tasks with little runtime have
				1878	* little over-all impact on throughput, and thus their
				1879	* faults are less important.
				1880	*/
				1881	f_weight = div64_u64(runtime << 16, period + 1);
				1882	f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
				1883	(total_faults + 1);
				1884	f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
				1885	p->numa_faults[cpubuf_idx] = 0;
				1886
				1887	p->numa_faults[mem_idx] += diff;
				1888	p->numa_faults[cpu_idx] += f_diff;
				1889	faults += p->numa_faults[mem_idx];
				1890	p->total_numa_faults += diff;
				1891	if (p->numa_group) {
				1892	/*
				1893	* safe because we can only change our own group
				1894	*
				1895	* mem_idx represents the offset for a given
				1896	* nid and priv in a specific region because it
				1897	* is at the beginning of the numa_faults array.
				1898	*/
				1899	p->numa_group->faults[mem_idx] += diff;
				1900	p->numa_group->faults_cpu[mem_idx] += f_diff;
				1901	p->numa_group->total_faults += diff;
				1902	group_faults += p->numa_group->faults[mem_idx];
				1903	}
				1904	}
				1905
				1906	if (faults > max_faults) {
				1907	max_faults = faults;
				1908	max_nid = nid;
				1909	}
				1910
				1911	if (group_faults > max_group_faults) {
				1912	max_group_faults = group_faults;
				1913	max_group_nid = nid;
				1914	}
				1915	}
				1916
				1917	update_task_scan_period(p, fault_types[0], fault_types[1]);
				1918
				1919	if (p->numa_group) {
				1920	update_numa_active_node_mask(p->numa_group);
				1921	spin_unlock_irq(group_lock);
				1922	max_nid = preferred_group_nid(p, max_group_nid);
				1923	}
				1924
				1925	if (max_faults) {
				1926	/* Set the new preferred node */
				1927	if (max_nid != p->numa_preferred_nid)
				1928	sched_setnuma(p, max_nid);
				1929
				1930	if (task_node(p) != p->numa_preferred_nid)
				1931	numa_migrate_preferred(p);
				1932	}
				1933	}
				1934
				1935	static inline int get_numa_group(struct numa_group *grp)
				1936	{
				1937	return atomic_inc_not_zero(&grp->refcount);
				1938	}
				1939
				1940	static inline void put_numa_group(struct numa_group *grp)
				1941	{
				1942	if (atomic_dec_and_test(&grp->refcount))
				1943	kfree_rcu(grp, rcu);
				1944	}
				1945
				1946	static void task_numa_group(struct task_struct *p, int cpupid, int flags,
				1947	int *priv)
				1948	{
				1949	struct numa_group grp, my_grp;
				1950	struct task_struct *tsk;
				1951	bool join = false;
				1952	int cpu = cpupid_to_cpu(cpupid);
				1953	int i;
				1954
				1955	if (unlikely(!p->numa_group)) {
				1956	unsigned int size = sizeof(struct numa_group) +
				1957	4nr_node_idssizeof(unsigned long);
				1958
				1959	grp = kzalloc(size, GFP_KERNEL \| __GFP_NOWARN);
				1960	if (!grp)
				1961	return;
				1962
				1963	atomic_set(&grp->refcount, 1);
				1964	spin_lock_init(&grp->lock);
				1965	grp->gid = p->pid;
				1966	/* Second half of the array tracks nids where faults happen */
				1967	grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
				1968	nr_node_ids;
				1969
				1970	node_set(task_node(current), grp->active_nodes);
				1971
				1972	for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
				1973	grp->faults[i] = p->numa_faults[i];
				1974
				1975	grp->total_faults = p->total_numa_faults;
				1976
				1977	grp->nr_tasks++;
				1978	rcu_assign_pointer(p->numa_group, grp);
				1979	}
				1980
				1981	rcu_read_lock();
				1982	tsk = READ_ONCE(cpu_rq(cpu)->curr);
				1983
				1984	if (!cpupid_match_pid(tsk, cpupid))
				1985	goto no_join;
				1986
				1987	grp = rcu_dereference(tsk->numa_group);
				1988	if (!grp)
				1989	goto no_join;
				1990
				1991	my_grp = p->numa_group;
				1992	if (grp == my_grp)
				1993	goto no_join;
				1994
				1995	/*
				1996	* Only join the other group if its bigger; if we're the bigger group,
				1997	* the other task will join us.
				1998	*/
				1999	if (my_grp->nr_tasks > grp->nr_tasks)
				2000	goto no_join;
				2001
				2002	/*
				2003	* Tie-break on the grp address.
				2004	*/
				2005	if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
				2006	goto no_join;
				2007
				2008	/* Always join threads in the same process. */
				2009	if (tsk->mm == current->mm)
				2010	join = true;
				2011
				2012	/* Simple filter to avoid false positives due to PID collisions */
				2013	if (flags & TNF_SHARED)
				2014	join = true;
				2015
				2016	/* Update priv based on whether false sharing was detected */
				2017	*priv = !join;
				2018
				2019	if (join && !get_numa_group(grp))
				2020	goto no_join;
				2021
				2022	rcu_read_unlock();
				2023
				2024	if (!join)
				2025	return;
				2026
				2027	BUG_ON(irqs_disabled());
				2028	double_lock_irq(&my_grp->lock, &grp->lock);
				2029
				2030	for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
				2031	my_grp->faults[i] -= p->numa_faults[i];
				2032	grp->faults[i] += p->numa_faults[i];
				2033	}
				2034	my_grp->total_faults -= p->total_numa_faults;
				2035	grp->total_faults += p->total_numa_faults;
				2036
				2037	my_grp->nr_tasks--;
				2038	grp->nr_tasks++;
				2039
				2040	spin_unlock(&my_grp->lock);
				2041	spin_unlock_irq(&grp->lock);
				2042
				2043	rcu_assign_pointer(p->numa_group, grp);
				2044
				2045	put_numa_group(my_grp);
				2046	return;
				2047
				2048	no_join:
				2049	rcu_read_unlock();
				2050	return;
				2051	}
				2052
				2053	void task_numa_free(struct task_struct *p)
				2054	{
				2055	struct numa_group *grp = p->numa_group;
				2056	void *numa_faults = p->numa_faults;
				2057	unsigned long flags;
				2058	int i;
				2059
				2060	if (grp) {
				2061	spin_lock_irqsave(&grp->lock, flags);
				2062	for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
				2063	grp->faults[i] -= p->numa_faults[i];
				2064	grp->total_faults -= p->total_numa_faults;
				2065
				2066	grp->nr_tasks--;
				2067	spin_unlock_irqrestore(&grp->lock, flags);
				2068	RCU_INIT_POINTER(p->numa_group, NULL);
				2069	put_numa_group(grp);
				2070	}
				2071
				2072	p->numa_faults = NULL;
				2073	kfree(numa_faults);
				2074	}
				2075
				2076	/*
				2077	* Got a PROT_NONE fault for a page on @node.
				2078	*/
				2079	void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
				2080	{
				2081	struct task_struct *p = current;
				2082	bool migrated = flags & TNF_MIGRATED;
				2083	int cpu_node = task_node(current);
				2084	int local = !!(flags & TNF_FAULT_LOCAL);
				2085	int priv;
				2086
				2087	if (!static_branch_likely(&sched_numa_balancing))
				2088	return;
				2089
				2090	/* for example, ksmd faulting in a user's mm */
				2091	if (!p->mm)
				2092	return;
				2093
				2094	/* Allocate buffer to track faults on a per-node basis */
				2095	if (unlikely(!p->numa_faults)) {
				2096	int size = sizeof(p->numa_faults)
				2097	NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
				2098
				2099	p->numa_faults = kzalloc(size, GFP_KERNEL\|__GFP_NOWARN);
				2100	if (!p->numa_faults)
				2101	return;
				2102
				2103	p->total_numa_faults = 0;
				2104	memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
				2105	}
				2106
				2107	/*
				2108	* First accesses are treated as private, otherwise consider accesses
				2109	* to be private if the accessing pid has not changed
				2110	*/
				2111	if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
				2112	priv = 1;
				2113	} else {
				2114	priv = cpupid_match_pid(p, last_cpupid);
				2115	if (!priv && !(flags & TNF_NO_GROUP))
				2116	task_numa_group(p, last_cpupid, flags, &priv);
				2117	}
				2118
				2119	/*
				2120	* If a workload spans multiple NUMA nodes, a shared fault that
				2121	* occurs wholly within the set of nodes that the workload is
				2122	* actively using should be counted as local. This allows the
				2123	* scan rate to slow down when a workload has settled down.
				2124	*/
				2125	if (!priv && !local && p->numa_group &&
				2126	node_isset(cpu_node, p->numa_group->active_nodes) &&
				2127	node_isset(mem_node, p->numa_group->active_nodes))
				2128	local = 1;
				2129
				2130	task_numa_placement(p);
				2131
				2132	/*
				2133	* Retry task to preferred node migration periodically, in case it
				2134	* case it previously failed, or the scheduler moved us.
				2135	*/
				2136	if (time_after(jiffies, p->numa_migrate_retry))
				2137	numa_migrate_preferred(p);
				2138
				2139	if (migrated)
				2140	p->numa_pages_migrated += pages;
				2141	if (flags & TNF_MIGRATE_FAIL)
				2142	p->numa_faults_locality[2] += pages;
				2143
				2144	p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
				2145	p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
				2146	p->numa_faults_locality[local] += pages;
				2147	}
				2148
				2149	static void reset_ptenuma_scan(struct task_struct *p)
				2150	{
				2151	/*
				2152	* We only did a read acquisition of the mmap sem, so
				2153	* p->mm->numa_scan_seq is written to without exclusive access
				2154	* and the update is not guaranteed to be atomic. That's not
				2155	* much of an issue though, since this is just used for
				2156	* statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
				2157	* expensive, to avoid any form of compiler optimizations:
				2158	*/
				2159	WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
				2160	p->mm->numa_scan_offset = 0;
				2161	}
				2162
				2163	/*
				2164	* The expensive part of numa migration is done from task_work context.
				2165	* Triggered from task_tick_numa().
				2166	*/
				2167	void task_numa_work(struct callback_head *work)
				2168	{
				2169	unsigned long migrate, next_scan, now = jiffies;
				2170	struct task_struct *p = current;
				2171	struct mm_struct *mm = p->mm;
				2172	struct vm_area_struct *vma;
				2173	unsigned long start, end;
				2174	unsigned long nr_pte_updates = 0;
				2175	long pages, virtpages;
				2176
				2177	WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
				2178
				2179	work->next = work; /* protect against double add */
				2180	/*
				2181	* Who cares about NUMA placement when they're dying.
				2182	*
				2183	* NOTE: make sure not to dereference p->mm before this check,
				2184	* exit_task_work() happens _after_ exit_mm() so we could be called
				2185	* without p->mm even though we still had it when we enqueued this
				2186	* work.
				2187	*/
				2188	if (p->flags & PF_EXITING)
				2189	return;
				2190
				2191	if (!mm->numa_next_scan) {
				2192	mm->numa_next_scan = now +
				2193	msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
				2194	}
				2195
				2196	/*
				2197	* Enforce maximal scan/migration frequency..
				2198	*/
				2199	migrate = mm->numa_next_scan;
				2200	if (time_before(now, migrate))
				2201	return;
				2202
				2203	if (p->numa_scan_period == 0) {
				2204	p->numa_scan_period_max = task_scan_max(p);
				2205	p->numa_scan_period = task_scan_min(p);
				2206	}
				2207
				2208	next_scan = now + msecs_to_jiffies(p->numa_scan_period);
				2209	if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
				2210	return;
				2211
				2212	/*
				2213	* Delay this task enough that another task of this mm will likely win
				2214	* the next time around.
				2215	*/
				2216	p->node_stamp += 2 * TICK_NSEC;
				2217
				2218	start = mm->numa_scan_offset;
				2219	pages = sysctl_numa_balancing_scan_size;
				2220	pages <<= 20 - PAGE_SHIFT; /* MB in pages */
				2221	virtpages = pages * 8; /* Scan up to this much virtual space */
				2222	if (!pages)
				2223	return;
				2224
				2225
				2226	down_read(&mm->mmap_sem);
				2227	vma = find_vma(mm, start);
				2228	if (!vma) {
				2229	reset_ptenuma_scan(p);
				2230	start = 0;
				2231	vma = mm->mmap;
				2232	}
				2233	for (; vma; vma = vma->vm_next) {
				2234	if (!vma_migratable(vma) \|\| !vma_policy_mof(vma) \|\|
				2235	is_vm_hugetlb_page(vma) \|\| (vma->vm_flags & VM_MIXEDMAP)) {
				2236	continue;
				2237	}
				2238
				2239	/*
				2240	* Shared library pages mapped by multiple processes are not
				2241	* migrated as it is expected they are cache replicated. Avoid
				2242	* hinting faults in read-only file-backed mappings or the vdso
				2243	* as migrating the pages will be of marginal benefit.
				2244	*/
				2245	if (!vma->vm_mm \|\|
				2246	(vma->vm_file && (vma->vm_flags & (VM_READ\|VM_WRITE)) == (VM_READ)))
				2247	continue;
				2248
				2249	/*
				2250	* Skip inaccessible VMAs to avoid any confusion between
				2251	* PROT_NONE and NUMA hinting ptes
				2252	*/
				2253	if (!(vma->vm_flags & (VM_READ \| VM_EXEC \| VM_WRITE)))
				2254	continue;
				2255
				2256	do {
				2257	start = max(start, vma->vm_start);
				2258	end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
				2259	end = min(end, vma->vm_end);
				2260	nr_pte_updates = change_prot_numa(vma, start, end);
				2261
				2262	/*
				2263	* Try to scan sysctl_numa_balancing_size worth of
				2264	* hpages that have at least one present PTE that
				2265	* is not already pte-numa. If the VMA contains
				2266	* areas that are unused or already full of prot_numa
				2267	* PTEs, scan up to virtpages, to skip through those
				2268	* areas faster.
				2269	*/
				2270	if (nr_pte_updates)
				2271	pages -= (end - start) >> PAGE_SHIFT;
				2272	virtpages -= (end - start) >> PAGE_SHIFT;
				2273
				2274	start = end;
				2275	if (pages <= 0 \|\| virtpages <= 0)
				2276	goto out;
				2277
				2278	cond_resched();
				2279	} while (end != vma->vm_end);
				2280	}
				2281
				2282	out:
				2283	/*
				2284	* It is possible to reach the end of the VMA list but the last few
				2285	* VMAs are not guaranteed to the vma_migratable. If they are not, we
				2286	* would find the !migratable VMA on the next scan but not reset the
				2287	* scanner to the start so check it now.
				2288	*/
				2289	if (vma)
				2290	mm->numa_scan_offset = start;
				2291	else
				2292	reset_ptenuma_scan(p);
				2293	up_read(&mm->mmap_sem);
				2294	}
				2295
				2296	/*
				2297	* Drive the periodic memory faults..
				2298	*/
				2299	void task_tick_numa(struct rq rq, struct task_struct curr)
				2300	{
				2301	struct callback_head *work = &curr->numa_work;
				2302	u64 period, now;
				2303
				2304	/*
				2305	* We don't care about NUMA placement if we don't have memory.
				2306	*/
				2307	if (!curr->mm \|\| (curr->flags & PF_EXITING) \|\| work->next != work)
				2308	return;
				2309
				2310	/*
				2311	* Using runtime rather than walltime has the dual advantage that
				2312	* we (mostly) drive the selection from busy threads and that the
				2313	* task needs to have done some actual work before we bother with
				2314	* NUMA placement.
				2315	*/
				2316	now = curr->se.sum_exec_runtime;
				2317	period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
				2318
				2319	if (now > curr->node_stamp + period) {
				2320	if (!curr->node_stamp)
				2321	curr->numa_scan_period = task_scan_min(curr);
				2322	curr->node_stamp += period;
				2323
				2324	if (!time_before(jiffies, curr->mm->numa_next_scan)) {
				2325	init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
				2326	task_work_add(curr, work, true);
				2327	}
				2328	}
				2329	}
				2330	#else
				2331	static void task_tick_numa(struct rq rq, struct task_struct curr)
				2332	{
				2333	}
				2334
				2335	static inline void account_numa_enqueue(struct rq rq, struct task_struct p)
				2336	{
				2337	}
				2338
				2339	static inline void account_numa_dequeue(struct rq rq, struct task_struct p)
				2340	{
				2341	}
				2342	#endif /* CONFIG_NUMA_BALANCING */
				2343
				2344	static void
				2345	account_entity_enqueue(struct cfs_rq cfs_rq, struct sched_entity se)
				2346	{
				2347	update_load_add(&cfs_rq->load, se->load.weight);
				2348	if (!parent_entity(se))
				2349	update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
				2350	#ifdef CONFIG_SMP
				2351	if (entity_is_task(se)) {
				2352	struct rq *rq = rq_of(cfs_rq);
				2353
				2354	account_numa_enqueue(rq, task_of(se));
				2355	list_add(&se->group_node, &rq->cfs_tasks);
				2356	}
				2357	#endif
				2358	cfs_rq->nr_running++;
				2359	}
				2360
				2361	static void
				2362	account_entity_dequeue(struct cfs_rq cfs_rq, struct sched_entity se)
				2363	{
				2364	update_load_sub(&cfs_rq->load, se->load.weight);
				2365	if (!parent_entity(se))
				2366	update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
				2367	if (entity_is_task(se)) {
				2368	account_numa_dequeue(rq_of(cfs_rq), task_of(se));
				2369	list_del_init(&se->group_node);
				2370	}
				2371	cfs_rq->nr_running--;
				2372	}
				2373
				2374	#ifdef CONFIG_FAIR_GROUP_SCHED
				2375	# ifdef CONFIG_SMP
				2376	static inline long calc_tg_weight(struct task_group tg, struct cfs_rq cfs_rq)
				2377	{
				2378	long tg_weight;
				2379
				2380	/*
				2381	* Use this CPU's real-time load instead of the last load contribution
				2382	* as the updating of the contribution is delayed, and we will use the
				2383	* the real-time load to calc the share. See update_tg_load_avg().
				2384	*/
				2385	tg_weight = atomic_long_read(&tg->load_avg);
				2386	tg_weight -= cfs_rq->tg_load_avg_contrib;
				2387	tg_weight += cfs_rq->load.weight;
				2388
				2389	return tg_weight;
				2390	}
				2391
				2392	static long calc_cfs_shares(struct cfs_rq cfs_rq, struct task_group tg)
				2393	{
				2394	long tg_weight, load, shares;
				2395
				2396	tg_weight = calc_tg_weight(tg, cfs_rq);
				2397	load = cfs_rq->load.weight;
				2398
				2399	shares = (tg->shares * load);
				2400	if (tg_weight)
				2401	shares /= tg_weight;
				2402
				2403	if (shares < MIN_SHARES)
				2404	shares = MIN_SHARES;
				2405	if (shares > tg->shares)
				2406	shares = tg->shares;
				2407
				2408	return shares;
				2409	}
				2410	# else /* CONFIG_SMP */
				2411	static inline long calc_cfs_shares(struct cfs_rq cfs_rq, struct task_group tg)
				2412	{
				2413	return tg->shares;
				2414	}
				2415	# endif /* CONFIG_SMP */
				2416	static void reweight_entity(struct cfs_rq cfs_rq, struct sched_entity se,
				2417	unsigned long weight)
				2418	{
				2419	if (se->on_rq) {
				2420	/* commit outstanding execution time */
				2421	if (cfs_rq->curr == se)
				2422	update_curr(cfs_rq);
				2423	account_entity_dequeue(cfs_rq, se);
				2424	}
				2425
				2426	update_load_set(&se->load, weight);
				2427
				2428	if (se->on_rq)
				2429	account_entity_enqueue(cfs_rq, se);
				2430	}
				2431
				2432	static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
				2433
				2434	static void update_cfs_shares(struct cfs_rq *cfs_rq)
				2435	{
				2436	struct task_group *tg;
				2437	struct sched_entity *se;
				2438	long shares;
				2439
				2440	tg = cfs_rq->tg;
				2441	se = tg->se[cpu_of(rq_of(cfs_rq))];
				2442	if (!se \|\| throttled_hierarchy(cfs_rq))
				2443	return;
				2444	#ifndef CONFIG_SMP
				2445	if (likely(se->load.weight == tg->shares))
				2446	return;
				2447	#endif
				2448	shares = calc_cfs_shares(cfs_rq, tg);
				2449
				2450	reweight_entity(cfs_rq_of(se), se, shares);
				2451	}
				2452	#else /* CONFIG_FAIR_GROUP_SCHED */
				2453	static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
				2454	{
				2455	}
				2456	#endif /* CONFIG_FAIR_GROUP_SCHED */
				2457
				2458	#ifdef CONFIG_SMP
				2459	/* Precomputed fixed inverse multiplies for multiplication by y^n */
				2460	static const u32 runnable_avg_yN_inv[] = {
				2461	0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
				2462	0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
				2463	0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
				2464	0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
				2465	0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
				2466	0x85aac367, 0x82cd8698,
				2467	};
				2468
				2469	/*
				2470	* Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent
				2471	* over-estimates when re-combining.
				2472	*/
				2473	static const u32 runnable_avg_yN_sum[] = {
				2474	0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
				2475	9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
				2476	17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
				2477	};
				2478
				2479	/*
				2480	* Approximate:
				2481	* val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
				2482	*/
				2483	static __always_inline u64 decay_load(u64 val, u64 n)
				2484	{
				2485	unsigned int local_n;
				2486
				2487	if (!n)
				2488	return val;
				2489	else if (unlikely(n > LOAD_AVG_PERIOD * 63))
				2490	return 0;
				2491
				2492	/* after bounds checking we can collapse to 32-bit */
				2493	local_n = n;
				2494
				2495	/*
				2496	* As y^PERIOD = 1/2, we can combine
				2497	* y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
				2498	* With a look-up table which covers y^n (n<PERIOD)
				2499	*
				2500	* To achieve constant time decay_load.
				2501	*/
				2502	if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
				2503	val >>= local_n / LOAD_AVG_PERIOD;
				2504	local_n %= LOAD_AVG_PERIOD;
				2505	}
				2506
				2507	val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
				2508	return val;
				2509	}
				2510
				2511	/*
				2512	* For updates fully spanning n periods, the contribution to runnable
				2513	* average will be: \Sum 1024*y^n
				2514	*
				2515	* We can compute this reasonably efficiently by combining:
				2516	* y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
				2517	*/
				2518	static u32 __compute_runnable_contrib(u64 n)
				2519	{
				2520	u32 contrib = 0;
				2521
				2522	if (likely(n <= LOAD_AVG_PERIOD))
				2523	return runnable_avg_yN_sum[n];
				2524	else if (unlikely(n >= LOAD_AVG_MAX_N))
				2525	return LOAD_AVG_MAX;
				2526
				2527	/* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
				2528	do {
				2529	contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
				2530	contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
				2531
				2532	n -= LOAD_AVG_PERIOD;
				2533	} while (n > LOAD_AVG_PERIOD);
				2534
				2535	contrib = decay_load(contrib, n);
				2536	return contrib + runnable_avg_yN_sum[n];
				2537	}
				2538
				2539	#if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 \|\| SCHED_CAPACITY_SHIFT != 10
				2540	#error "load tracking assumes 2^10 as unit"
				2541	#endif
				2542
				2543	#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
				2544
				2545	/*
				2546	* We can represent the historical contribution to runnable average as the
				2547	* coefficients of a geometric series. To do this we sub-divide our runnable
				2548	* history into segments of approximately 1ms (1024us); label the segment that
				2549	* occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
				2550	*
				2551	* [<- 1024us ->\|<- 1024us ->\|<- 1024us ->\| ...
				2552	* p0 p1 p2
				2553	* (now) (~1ms ago) (~2ms ago)
				2554	*
				2555	* Let u_i denote the fraction of p_i that the entity was runnable.
				2556	*
				2557	* We then designate the fractions u_i as our co-efficients, yielding the
				2558	* following representation of historical load:
				2559	* u_0 + u_1y + u_2y^2 + u_3*y^3 + ...
				2560	*
				2561	* We choose y based on the with of a reasonably scheduling period, fixing:
				2562	* y^32 = 0.5
				2563	*
				2564	* This means that the contribution to load ~32ms ago (u_32) will be weighted
				2565	* approximately half as much as the contribution to load within the last ms
				2566	* (u_0).
				2567	*
				2568	* When a period "rolls over" and we have new u_0`, multiplying the previous
				2569	* sum again by y is sufficient to update:
				2570	* load_avg = u_0` + y(u_0 + u_1y + u_2*y^2 + ... )
				2571	* = u_0 + u_1y + u_2y^2 + ... [re-labeling u_i --> u_{i+1}]
				2572	*/
				2573	static __always_inline int
				2574	__update_load_avg(u64 now, int cpu, struct sched_avg *sa,
				2575	unsigned long weight, int running, struct cfs_rq *cfs_rq)
				2576	{
				2577	u64 delta, scaled_delta, periods;
				2578	u32 contrib;
				2579	unsigned int delta_w, scaled_delta_w, decayed = 0;
				2580	unsigned long scale_freq, scale_cpu;
				2581
				2582	delta = now - sa->last_update_time;
				2583	/*
				2584	* This should only happen when time goes backwards, which it
				2585	* unfortunately does during sched clock init when we swap over to TSC.
				2586	*/
				2587	if ((s64)delta < 0) {
				2588	sa->last_update_time = now;
				2589	return 0;
				2590	}
				2591
				2592	/*
				2593	* Use 1024ns as the unit of measurement since it's a reasonable
				2594	* approximation of 1us and fast to compute.
				2595	*/
				2596	delta >>= 10;
				2597	if (!delta)
				2598	return 0;
				2599	sa->last_update_time = now;
				2600
				2601	scale_freq = arch_scale_freq_capacity(NULL, cpu);
				2602	scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
				2603
				2604	/* delta_w is the amount already accumulated against our next period */
				2605	delta_w = sa->period_contrib;
				2606	if (delta + delta_w >= 1024) {
				2607	decayed = 1;
				2608
				2609	/* how much left for next period will start over, we don't know yet */
				2610	sa->period_contrib = 0;
				2611
				2612	/*
				2613	* Now that we know we're crossing a period boundary, figure
				2614	* out how much from delta we need to complete the current
				2615	* period and accrue it.
				2616	*/
				2617	delta_w = 1024 - delta_w;
				2618	scaled_delta_w = cap_scale(delta_w, scale_freq);
				2619	if (weight) {
				2620	sa->load_sum += weight * scaled_delta_w;
				2621	if (cfs_rq) {
				2622	cfs_rq->runnable_load_sum +=
				2623	weight * scaled_delta_w;
				2624	}
				2625	}
				2626	if (running)
				2627	sa->util_sum += scaled_delta_w * scale_cpu;
				2628
				2629	delta -= delta_w;
				2630
				2631	/* Figure out how many additional periods this update spans */
				2632	periods = delta / 1024;
				2633	delta %= 1024;
				2634
				2635	sa->load_sum = decay_load(sa->load_sum, periods + 1);
				2636	if (cfs_rq) {
				2637	cfs_rq->runnable_load_sum =
				2638	decay_load(cfs_rq->runnable_load_sum, periods + 1);
				2639	}
				2640	sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1);
				2641
				2642	/* Efficiently calculate \sum (1..n_period) 1024y^i /
				2643	contrib = __compute_runnable_contrib(periods);
				2644	contrib = cap_scale(contrib, scale_freq);
				2645	if (weight) {
				2646	sa->load_sum += weight * contrib;
				2647	if (cfs_rq)
				2648	cfs_rq->runnable_load_sum += weight * contrib;
				2649	}
				2650	if (running)
				2651	sa->util_sum += contrib * scale_cpu;
				2652	}
				2653
				2654	/* Remainder of delta accrued against u_0` */
				2655	scaled_delta = cap_scale(delta, scale_freq);
				2656	if (weight) {
				2657	sa->load_sum += weight * scaled_delta;
				2658	if (cfs_rq)
				2659	cfs_rq->runnable_load_sum += weight * scaled_delta;
				2660	}
				2661	if (running)
				2662	sa->util_sum += scaled_delta * scale_cpu;
				2663
				2664	sa->period_contrib += delta;
				2665
				2666	if (decayed) {
				2667	sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
				2668	if (cfs_rq) {
				2669	cfs_rq->runnable_load_avg =
				2670	div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
				2671	}
				2672	sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
				2673	}
				2674
				2675	return decayed;
				2676	}
				2677
				2678	#ifdef CONFIG_FAIR_GROUP_SCHED
				2679	/*
				2680	* Updating tg's load_avg is necessary before update_cfs_share (which is done)
				2681	* and effective_load (which is not done because it is too costly).
				2682	*/
				2683	static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
				2684	{
				2685	long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
				2686
				2687	if (force \|\| abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
				2688	atomic_long_add(delta, &cfs_rq->tg->load_avg);
				2689	cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
				2690	}
				2691	}
				2692
				2693	#else /* CONFIG_FAIR_GROUP_SCHED */
				2694	static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
				2695	#endif /* CONFIG_FAIR_GROUP_SCHED */
				2696
				2697	static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
				2698
				2699	/*
				2700	* Unsigned subtract and clamp on underflow.
				2701	*
				2702	* Explicitly do a load-store to ensure the intermediate value never hits
				2703	* memory. This allows lockless observations without ever seeing the negative
				2704	* values.
				2705	*/
				2706	#define sub_positive(_ptr, _val) do { \
				2707	typeof(_ptr) ptr = (_ptr); \
				2708	typeof(*ptr) val = (_val); \
				2709	typeof(ptr) res, var = READ_ONCE(ptr); \
				2710	res = var - val; \
				2711	if (res > var) \
				2712	res = 0; \
				2713	WRITE_ONCE(*ptr, res); \
				2714	} while (0)
				2715
				2716	/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
				2717	static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
				2718	{
				2719	struct sched_avg *sa = &cfs_rq->avg;
				2720	int decayed, removed = 0;
				2721
				2722	if (atomic_long_read(&cfs_rq->removed_load_avg)) {
				2723	s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
				2724	sub_positive(&sa->load_avg, r);
				2725	sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
				2726	removed = 1;
				2727	}
				2728
				2729	if (atomic_long_read(&cfs_rq->removed_util_avg)) {
				2730	long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
				2731	sub_positive(&sa->util_avg, r);
				2732	sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
				2733	}
				2734
				2735	decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
				2736	scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq);
				2737
				2738	#ifndef CONFIG_64BIT
				2739	smp_wmb();
				2740	cfs_rq->load_last_update_time_copy = sa->last_update_time;
				2741	#endif
				2742
				2743	return decayed \|\| removed;
				2744	}
				2745
				2746	/* Update task and its cfs_rq load average */
				2747	static inline void update_load_avg(struct sched_entity *se, int update_tg)
				2748	{
				2749	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				2750	u64 now = cfs_rq_clock_task(cfs_rq);
				2751	int cpu = cpu_of(rq_of(cfs_rq));
				2752
				2753	/*
				2754	* Track task load average for carrying it to new CPU after migrated, and
				2755	* track group sched_entity load average for task_h_load calc in migration
				2756	*/
				2757	__update_load_avg(now, cpu, &se->avg,
				2758	se->on_rq * scale_load_down(se->load.weight),
				2759	cfs_rq->curr == se, NULL);
				2760
				2761	if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
				2762	update_tg_load_avg(cfs_rq, 0);
				2763	}
				2764
				2765	static void attach_entity_load_avg(struct cfs_rq cfs_rq, struct sched_entity se)
				2766	{
				2767	if (!sched_feat(ATTACH_AGE_LOAD))
				2768	goto skip_aging;
				2769
				2770	/*
				2771	* If we got migrated (either between CPUs or between cgroups) we'll
				2772	* have aged the average right before clearing @last_update_time.
				2773	*/
				2774	if (se->avg.last_update_time) {
				2775	__update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
				2776	&se->avg, 0, 0, NULL);
				2777
				2778	/*
				2779	* XXX: we could have just aged the entire load away if we've been
				2780	* absent from the fair class for too long.
				2781	*/
				2782	}
				2783
				2784	skip_aging:
				2785	se->avg.last_update_time = cfs_rq->avg.last_update_time;
				2786	cfs_rq->avg.load_avg += se->avg.load_avg;
				2787	cfs_rq->avg.load_sum += se->avg.load_sum;
				2788	cfs_rq->avg.util_avg += se->avg.util_avg;
				2789	cfs_rq->avg.util_sum += se->avg.util_sum;
				2790	}
				2791
				2792	static void detach_entity_load_avg(struct cfs_rq cfs_rq, struct sched_entity se)
				2793	{
				2794	__update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
				2795	&se->avg, se->on_rq * scale_load_down(se->load.weight),
				2796	cfs_rq->curr == se, NULL);
				2797
				2798	sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
				2799	sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
				2800	sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
				2801	sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
				2802	}
				2803
				2804	/* Add the load generated by se into cfs_rq's load average */
				2805	static inline void
				2806	enqueue_entity_load_avg(struct cfs_rq cfs_rq, struct sched_entity se)
				2807	{
				2808	struct sched_avg *sa = &se->avg;
				2809	u64 now = cfs_rq_clock_task(cfs_rq);
				2810	int migrated, decayed;
				2811
				2812	migrated = !sa->last_update_time;
				2813	if (!migrated) {
				2814	__update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
				2815	se->on_rq * scale_load_down(se->load.weight),
				2816	cfs_rq->curr == se, NULL);
				2817	}
				2818
				2819	decayed = update_cfs_rq_load_avg(now, cfs_rq);
				2820
				2821	cfs_rq->runnable_load_avg += sa->load_avg;
				2822	cfs_rq->runnable_load_sum += sa->load_sum;
				2823
				2824	if (migrated)
				2825	attach_entity_load_avg(cfs_rq, se);
				2826
				2827	if (decayed \|\| migrated)
				2828	update_tg_load_avg(cfs_rq, 0);
				2829	}
				2830
				2831	/* Remove the runnable load generated by se from cfs_rq's runnable load average */
				2832	static inline void
				2833	dequeue_entity_load_avg(struct cfs_rq cfs_rq, struct sched_entity se)
				2834	{
				2835	update_load_avg(se, 1);
				2836
				2837	cfs_rq->runnable_load_avg =
				2838	max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
				2839	cfs_rq->runnable_load_sum =
				2840	max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
				2841	}
				2842
				2843	/*
				2844	* Task first catches up with cfs_rq, and then subtract
				2845	* itself from the cfs_rq (task must be off the queue now).
				2846	*/
				2847	void remove_entity_load_avg(struct sched_entity *se)
				2848	{
				2849	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				2850	u64 last_update_time;
				2851
				2852	#ifndef CONFIG_64BIT
				2853	u64 last_update_time_copy;
				2854
				2855	do {
				2856	last_update_time_copy = cfs_rq->load_last_update_time_copy;
				2857	smp_rmb();
				2858	last_update_time = cfs_rq->avg.last_update_time;
				2859	} while (last_update_time != last_update_time_copy);
				2860	#else
				2861	last_update_time = cfs_rq->avg.last_update_time;
				2862	#endif
				2863
				2864	__update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
				2865	atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
				2866	atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
				2867	}
				2868
				/*
				 * Hook invoked before this_rq enters idle.
				 *
				 * Historical intent: update the rq's load with the elapsed running time
				 * before entering idle; if the last scheduled task is not a CFS task,
				 * idle_enter would be the only way to update the runnable statistic.
				 *
				 * NOTE: the body is currently empty, so this function does nothing.
				 */
				void idle_enter_fair(struct rq *this_rq)
				{
				}
				2877
				/*
				 * Hook invoked when this_rq leaves idle.
				 *
				 * Historical intent: update the rq's load with the elapsed idle time before
				 * a task is scheduled; if the newly scheduled task is not a CFS task,
				 * idle_exit would be the only way to update the runnable statistic.
				 *
				 * NOTE: the body is currently empty, so this function does nothing.
				 */
				void idle_exit_fair(struct rq *this_rq)
				{
				}
				2886
				2887	static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
				2888	{
				2889	return cfs_rq->runnable_load_avg;
				2890	}
				2891
				2892	static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
				2893	{
				2894	return cfs_rq->avg.load_avg;
				2895	}
				2896
				2897	static int idle_balance(struct rq *this_rq);
				2898
				2899	#else /* CONFIG_SMP */
				2900
				/*
				 * !CONFIG_SMP: the load-average helpers compile to empty stubs and
				 * idle_balance() always returns 0.
				 */
				static inline void update_load_avg(struct sched_entity *se, int update_tg) {}
				static inline void
				enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
				static inline void
				dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
				static inline void remove_entity_load_avg(struct sched_entity *se) {}

				static inline void
				attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
				static inline void
				detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}

				static inline int idle_balance(struct rq *rq)
				{
					return 0;
				}
				2917
				2918	#endif /* CONFIG_SMP */
				2919
				/*
				 * enqueue_sleeper - account sleep/block statistics for a waking entity
				 *
				 * Only does work under CONFIG_SCHEDSTATS. For a pending sleep_start or
				 * block_start timestamp, the elapsed rq clock time is clamped at zero,
				 * folded into the max/sum statistics and the timestamp is cleared. When the
				 * entity is a task, latency accounting and tracepoints are emitted as well.
				 */
				static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
				{
				#ifdef CONFIG_SCHEDSTATS
					struct task_struct *tsk = NULL;

					if (entity_is_task(se))
						tsk = task_of(se);

					if (se->statistics.sleep_start) {
						u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;

						/* guard against the clock appearing to go backwards */
						if ((s64)delta < 0)
							delta = 0;

						if (unlikely(delta > se->statistics.sleep_max))
							se->statistics.sleep_max = delta;

						se->statistics.sleep_start = 0;
						se->statistics.sum_sleep_runtime += delta;

						if (tsk) {
							account_scheduler_latency(tsk, delta >> 10, 1);
							trace_sched_stat_sleep(tsk, delta);
						}
					}
					if (se->statistics.block_start) {
						u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;

						/* guard against the clock appearing to go backwards */
						if ((s64)delta < 0)
							delta = 0;

						if (unlikely(delta > se->statistics.block_max))
							se->statistics.block_max = delta;

						se->statistics.block_start = 0;
						se->statistics.sum_sleep_runtime += delta;

						if (tsk) {
							if (tsk->in_iowait) {
								se->statistics.iowait_sum += delta;
								se->statistics.iowait_count++;
								trace_sched_stat_iowait(tsk, delta);
							}

							trace_sched_stat_blocked(tsk, delta);

							/*
							 * Blocking time is in units of nanosecs, so shift by
							 * 20 to get a milliseconds-range estimation of the
							 * amount of time that the task spent sleeping:
							 */
							if (unlikely(prof_on == SLEEP_PROFILING)) {
								profile_hits(SLEEP_PROFILING,
										(void *)get_wchan(tsk),
										delta >> 20);
							}
							account_scheduler_latency(tsk, delta >> 10, 0);
						}
					}
				#endif
				}
				2981
				/*
				 * check_spread - debug statistic for vruntime spread
				 *
				 * Under CONFIG_SCHED_DEBUG, bump nr_spread_over when the absolute distance
				 * between @se's vruntime and the cfs_rq's min_vruntime exceeds three times
				 * sysctl_sched_latency. No-op otherwise.
				 */
				static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
				{
				#ifdef CONFIG_SCHED_DEBUG
					s64 d = se->vruntime - cfs_rq->min_vruntime;

					if (d < 0)
						d = -d;

					if (d > 3*sysctl_sched_latency)
						schedstat_inc(cfs_rq, nr_spread_over);
				#endif
				}
				2994
				2995	static void
				2996	place_entity(struct cfs_rq cfs_rq, struct sched_entity se, int initial)
				2997	{
				2998	u64 vruntime = cfs_rq->min_vruntime;
				2999
				3000	/*
				3001	* The 'current' period is already promised to the current tasks,
				3002	* however the extra weight of the new task will slow them down a
				3003	* little, place the new task so that it fits in the slot that
				3004	* stays open at the end.
				3005	*/
				3006	if (initial && sched_feat(START_DEBIT))
				3007	vruntime += sched_vslice(cfs_rq, se);
				3008
				3009	/* sleeps up to a single latency don't count. */
				3010	if (!initial) {
				3011	unsigned long thresh = sysctl_sched_latency;
				3012
				3013	/*
				3014	* Halve their sleep time's effect, to allow
				3015	* for a gentler effect of sleepers:
				3016	*/
				3017	if (sched_feat(GENTLE_FAIR_SLEEPERS))
				3018	thresh >>= 1;
				3019
				3020	vruntime -= thresh;
				3021	}
				3022
				3023	/* ensure we never gain time by being placed backwards. */
				3024	se->vruntime = max_vruntime(se->vruntime, vruntime);
				3025	}
				3026
				3027	static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
				3028
				3029	static void
				3030	enqueue_entity(struct cfs_rq cfs_rq, struct sched_entity se, int flags)
				3031	{
				3032	/*
				3033	* Update the normalized vruntime before updating min_vruntime
				3034	* through calling update_curr().
				3035	*/
				3036	if (!(flags & ENQUEUE_WAKEUP) \|\| (flags & ENQUEUE_WAKING))
				3037	se->vruntime += cfs_rq->min_vruntime;
				3038
				3039	/*
				3040	* Update run-time statistics of the 'current'.
				3041	*/
				3042	update_curr(cfs_rq);
				3043	enqueue_entity_load_avg(cfs_rq, se);
				3044	account_entity_enqueue(cfs_rq, se);
				3045	update_cfs_shares(cfs_rq);
				3046
				3047	if (flags & ENQUEUE_WAKEUP) {
				3048	place_entity(cfs_rq, se, 0);
				3049	enqueue_sleeper(cfs_rq, se);
				3050	}
				3051
				3052	update_stats_enqueue(cfs_rq, se);
				3053	check_spread(cfs_rq, se);
				3054	if (se != cfs_rq->curr)
				3055	__enqueue_entity(cfs_rq, se);
				3056	se->on_rq = 1;
				3057
				3058	if (cfs_rq->nr_running == 1) {
				3059	list_add_leaf_cfs_rq(cfs_rq);
				3060	check_enqueue_throttle(cfs_rq);
				3061	}
				3062	}
				3063
				3064	static void __clear_buddies_last(struct sched_entity *se)
				3065	{
				3066	for_each_sched_entity(se) {
				3067	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				3068	if (cfs_rq->last != se)
				3069	break;
				3070
				3071	cfs_rq->last = NULL;
				3072	}
				3073	}
				3074
				3075	static void __clear_buddies_next(struct sched_entity *se)
				3076	{
				3077	for_each_sched_entity(se) {
				3078	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				3079	if (cfs_rq->next != se)
				3080	break;
				3081
				3082	cfs_rq->next = NULL;
				3083	}
				3084	}
				3085
				3086	static void __clear_buddies_skip(struct sched_entity *se)
				3087	{
				3088	for_each_sched_entity(se) {
				3089	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				3090	if (cfs_rq->skip != se)
				3091	break;
				3092
				3093	cfs_rq->skip = NULL;
				3094	}
				3095	}
				3096
				3097	static void clear_buddies(struct cfs_rq cfs_rq, struct sched_entity se)
				3098	{
				3099	if (cfs_rq->last == se)
				3100	__clear_buddies_last(se);
				3101
				3102	if (cfs_rq->next == se)
				3103	__clear_buddies_next(se);
				3104
				3105	if (cfs_rq->skip == se)
				3106	__clear_buddies_skip(se);
				3107	}
				3108
				3109	static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
				3110
				3111	static void
				3112	dequeue_entity(struct cfs_rq cfs_rq, struct sched_entity se, int flags)
				3113	{
				3114	/*
				3115	* Update run-time statistics of the 'current'.
				3116	*/
				3117	update_curr(cfs_rq);
				3118	dequeue_entity_load_avg(cfs_rq, se);
				3119
				3120	update_stats_dequeue(cfs_rq, se);
				3121	if (flags & DEQUEUE_SLEEP) {
				3122	#ifdef CONFIG_SCHEDSTATS
				3123	if (entity_is_task(se)) {
				3124	struct task_struct *tsk = task_of(se);
				3125
				3126	if (tsk->state & TASK_INTERRUPTIBLE)
				3127	se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
				3128	if (tsk->state & TASK_UNINTERRUPTIBLE)
				3129	se->statistics.block_start = rq_clock(rq_of(cfs_rq));
				3130	}
				3131	#endif
				3132	}
				3133
				3134	clear_buddies(cfs_rq, se);
				3135
				3136	if (se != cfs_rq->curr)
				3137	__dequeue_entity(cfs_rq, se);
				3138	se->on_rq = 0;
				3139	account_entity_dequeue(cfs_rq, se);
				3140
				3141	/*
				3142	* Normalize the entity after updating the min_vruntime because the
				3143	* update can refer to the ->curr item and we need to reflect this
				3144	* movement in our normalized position.
				3145	*/
				3146	if (!(flags & DEQUEUE_SLEEP))
				3147	se->vruntime -= cfs_rq->min_vruntime;
				3148
				3149	/* return excess runtime on last dequeue */
				3150	return_cfs_rq_runtime(cfs_rq);
				3151
				3152	update_min_vruntime(cfs_rq);
				3153	update_cfs_shares(cfs_rq);
				3154	}
				3155
				3156	/*
				3157	* Preempt the current task with a newly woken task if needed:
				3158	*/
				3159	static void
				3160	check_preempt_tick(struct cfs_rq cfs_rq, struct sched_entity curr)
				3161	{
				3162	unsigned long ideal_runtime, delta_exec;
				3163	struct sched_entity *se;
				3164	s64 delta;
				3165
				3166	ideal_runtime = sched_slice(cfs_rq, curr);
				3167	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
				3168	if (delta_exec > ideal_runtime) {
				3169	resched_curr(rq_of(cfs_rq));
				3170	/*
				3171	* The current task ran long enough, ensure it doesn't get
				3172	* re-elected due to buddy favours.
				3173	*/
				3174	clear_buddies(cfs_rq, curr);
				3175	return;
				3176	}
				3177
				3178	/*
				3179	* Ensure that a task that missed wakeup preemption by a
				3180	* narrow margin doesn't have to wait for a full slice.
				3181	* This also mitigates buddy induced latencies under load.
				3182	*/
				3183	if (delta_exec < sysctl_sched_min_granularity)
				3184	return;
				3185
				3186	se = __pick_first_entity(cfs_rq);
				3187	delta = curr->vruntime - se->vruntime;
				3188
				3189	if (delta < 0)
				3190	return;
				3191
				3192	if (delta > ideal_runtime)
				3193	resched_curr(rq_of(cfs_rq));
				3194	}
				3195
				3196	static void
				3197	set_next_entity(struct cfs_rq cfs_rq, struct sched_entity se)
				3198	{
				3199	/* 'current' is not kept within the tree. */
				3200	if (se->on_rq) {
				3201	/*
				3202	* Any task has to be enqueued before it get to execute on
				3203	* a CPU. So account for the time it spent waiting on the
				3204	* runqueue.
				3205	*/
				3206	update_stats_wait_end(cfs_rq, se);
				3207	__dequeue_entity(cfs_rq, se);
				3208	update_load_avg(se, 1);
				3209	}
				3210
				3211	update_stats_curr_start(cfs_rq, se);
				3212	cfs_rq->curr = se;
				3213	#ifdef CONFIG_SCHEDSTATS
				3214	/*
				3215	* Track our maximum slice length, if the CPU's load is at
				3216	* least twice that of our own weight (i.e. dont track it
				3217	* when there are only lesser-weight tasks around):
				3218	*/
				3219	if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
				3220	se->statistics.slice_max = max(se->statistics.slice_max,
				3221	se->sum_exec_runtime - se->prev_sum_exec_runtime);
				3222	}
				3223	#endif
				3224	se->prev_sum_exec_runtime = se->sum_exec_runtime;
				3225	}
				3226
				/* Forward declaration; used by pick_next_entity() before its definition. */
				static int
				wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
				3229
				3230	/*
				3231	* Pick the next process, keeping these things in mind, in this order:
				3232	* 1) keep things fair between processes/task groups
				3233	* 2) pick the "next" process, since someone really wants that to run
				3234	* 3) pick the "last" process, for cache locality
				3235	* 4) do not run the "skip" process, if something else is available
				3236	*/
				3237	static struct sched_entity *
				3238	pick_next_entity(struct cfs_rq cfs_rq, struct sched_entity curr)
				3239	{
				3240	struct sched_entity *left = __pick_first_entity(cfs_rq);
				3241	struct sched_entity *se;
				3242
				3243	/*
				3244	* If curr is set we have to see if its left of the leftmost entity
				3245	* still in the tree, provided there was anything in the tree at all.
				3246	*/
				3247	if (!left \|\| (curr && entity_before(curr, left)))
				3248	left = curr;
				3249
				3250	se = left; /* ideally we run the leftmost entity */
				3251
				3252	/*
				3253	* Avoid running the skip buddy, if running something else can
				3254	* be done without getting too unfair.
				3255	*/
				3256	if (cfs_rq->skip == se) {
				3257	struct sched_entity *second;
				3258
				3259	if (se == curr) {
				3260	second = __pick_first_entity(cfs_rq);
				3261	} else {
				3262	second = __pick_next_entity(se);
				3263	if (!second \|\| (curr && entity_before(curr, second)))
				3264	second = curr;
				3265	}
				3266
				3267	if (second && wakeup_preempt_entity(second, left) < 1)
				3268	se = second;
				3269	}
				3270
				3271	/*
				3272	* Prefer last buddy, try to return the CPU to a preempted task.
				3273	*/
				3274	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
				3275	se = cfs_rq->last;
				3276
				3277	/*
				3278	* Someone really wants this to run. If it's not unfair, run it.
				3279	*/
				3280	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
				3281	se = cfs_rq->next;
				3282
				3283	clear_buddies(cfs_rq, se);
				3284
				3285	return se;
				3286	}
				3287
				3288	static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
				3289
				3290	static void put_prev_entity(struct cfs_rq cfs_rq, struct sched_entity prev)
				3291	{
				3292	/*
				3293	* If still on the runqueue then deactivate_task()
				3294	* was not called and update_curr() has to be done:
				3295	*/
				3296	if (prev->on_rq)
				3297	update_curr(cfs_rq);
				3298
				3299	/* throttle cfs_rqs exceeding runtime */
				3300	check_cfs_rq_runtime(cfs_rq);
				3301
				3302	check_spread(cfs_rq, prev);
				3303	if (prev->on_rq) {
				3304	update_stats_wait_start(cfs_rq, prev);
				3305	/* Put 'current' back into the tree. */
				3306	__enqueue_entity(cfs_rq, prev);
				3307	/* in !on_rq case, update occurred at dequeue */
				3308	update_load_avg(prev, 0);
				3309	}
				3310	cfs_rq->curr = NULL;
				3311	}
				3312
				3313	static void
				3314	entity_tick(struct cfs_rq cfs_rq, struct sched_entity curr, int queued)
				3315	{
				3316	/*
				3317	* Update run-time statistics of the 'current'.
				3318	*/
				3319	update_curr(cfs_rq);
				3320
				3321	/*
				3322	* Ensure that runnable average is periodically updated.
				3323	*/
				3324	update_load_avg(curr, 1);
				3325	update_cfs_shares(cfs_rq);
				3326
				3327	#ifdef CONFIG_SCHED_HRTICK
				3328	/*
				3329	* queued ticks are scheduled to match the slice, so don't bother
				3330	* validating it and just reschedule.
				3331	*/
				3332	if (queued) {
				3333	resched_curr(rq_of(cfs_rq));
				3334	return;
				3335	}
				3336	/*
				3337	* don't let the period tick interfere with the hrtick preemption
				3338	*/
				3339	if (!sched_feat(DOUBLE_TICK) &&
				3340	hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
				3341	return;
				3342	#endif
				3343
				3344	if (cfs_rq->nr_running > 1)
				3345	check_preempt_tick(cfs_rq, curr);
				3346	}
				3347
				3348
				3349	/**************************************************
				3350	* CFS bandwidth control machinery
				3351	*/
				3352
				3353	#ifdef CONFIG_CFS_BANDWIDTH
				3354
				3355	#ifdef HAVE_JUMP_LABEL
				3356	static struct static_key __cfs_bandwidth_used;
				3357
				3358	static inline bool cfs_bandwidth_used(void)
				3359	{
				3360	return static_key_false(&__cfs_bandwidth_used);
				3361	}
				3362
				3363	void cfs_bandwidth_usage_inc(void)
				3364	{
				3365	static_key_slow_inc(&__cfs_bandwidth_used);
				3366	}
				3367
				3368	void cfs_bandwidth_usage_dec(void)
				3369	{
				3370	static_key_slow_dec(&__cfs_bandwidth_used);
				3371	}
				3372	#else /* HAVE_JUMP_LABEL */
				3373	static bool cfs_bandwidth_used(void)
				3374	{
				3375	return true;
				3376	}
				3377
				3378	void cfs_bandwidth_usage_inc(void) {}
				3379	void cfs_bandwidth_usage_dec(void) {}
				3380	#endif /* HAVE_JUMP_LABEL */
				3381
				3382	/*
				3383	* default period for cfs group bandwidth.
				3384	* default: 0.1s, units: nanoseconds
				3385	*/
				3386	static inline u64 default_cfs_period(void)
				3387	{
				3388	return 100000000ULL;
				3389	}
				3390
				3391	static inline u64 sched_cfs_bandwidth_slice(void)
				3392	{
				3393	return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
				3394	}
				3395
				3396	/*
				3397	* Replenish runtime according to assigned quota and update expiration time.
				3398	* We use sched_clock_cpu directly instead of rq->clock to avoid adding
				3399	* additional synchronization around rq->lock.
				3400	*
				3401	* requires cfs_b->lock
				3402	*/
				3403	void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
				3404	{
				3405	u64 now;
				3406
				3407	if (cfs_b->quota == RUNTIME_INF)
				3408	return;
				3409
				3410	now = sched_clock_cpu(smp_processor_id());
				3411	cfs_b->runtime = cfs_b->quota;
				3412	cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
				3413	}
				3414
				3415	static inline struct cfs_bandwidth tg_cfs_bandwidth(struct task_group tg)
				3416	{
				3417	return &tg->cfs_bandwidth;
				3418	}
				3419
				3420	/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
				3421	static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
				3422	{
				3423	if (unlikely(cfs_rq->throttle_count))
				3424	return cfs_rq->throttled_clock_task;
				3425
				3426	return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
				3427	}
				3428
				3429	/* returns 0 on failure to allocate runtime */
				3430	static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
				3431	{
				3432	struct task_group *tg = cfs_rq->tg;
				3433	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
				3434	u64 amount = 0, min_amount, expires;
				3435
				3436	/* note: this is a positive sum as runtime_remaining <= 0 */
				3437	min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
				3438
				3439	raw_spin_lock(&cfs_b->lock);
				3440	if (cfs_b->quota == RUNTIME_INF)
				3441	amount = min_amount;
				3442	else {
				3443	start_cfs_bandwidth(cfs_b);
				3444
				3445	if (cfs_b->runtime > 0) {
				3446	amount = min(cfs_b->runtime, min_amount);
				3447	cfs_b->runtime -= amount;
				3448	cfs_b->idle = 0;
				3449	}
				3450	}
				3451	expires = cfs_b->runtime_expires;
				3452	raw_spin_unlock(&cfs_b->lock);
				3453
				3454	cfs_rq->runtime_remaining += amount;
				3455	/*
				3456	* we may have advanced our local expiration to account for allowed
				3457	* spread between our sched_clock and the one on which runtime was
				3458	* issued.
				3459	*/
				3460	if ((s64)(expires - cfs_rq->runtime_expires) > 0)
				3461	cfs_rq->runtime_expires = expires;
				3462
				3463	return cfs_rq->runtime_remaining > 0;
				3464	}
				3465
				3466	/*
				3467	* Note: This depends on the synchronization provided by sched_clock and the
				3468	* fact that rq->clock snapshots this value.
				3469	*/
				3470	static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
				3471	{
				3472	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
				3473
				3474	/* if the deadline is ahead of our clock, nothing to do */
				3475	if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
				3476	return;
				3477
				3478	if (cfs_rq->runtime_remaining < 0)
				3479	return;
				3480
				3481	/*
				3482	* If the local deadline has passed we have to consider the
				3483	* possibility that our sched_clock is 'fast' and the global deadline
				3484	* has not truly expired.
				3485	*
				3486	* Fortunately we can check determine whether this the case by checking
				3487	* whether the global deadline has advanced. It is valid to compare
				3488	* cfs_b->runtime_expires without any locks since we only care about
				3489	* exact equality, so a partial write will still work.
				3490	*/
				3491
				3492	if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
				3493	/* extend local deadline, drift is bounded above by 2 ticks */
				3494	cfs_rq->runtime_expires += TICK_NSEC;
				3495	} else {
				3496	/* global deadline is ahead, expiration has passed */
				3497	cfs_rq->runtime_remaining = 0;
				3498	}
				3499	}
				3500
				3501	static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
				3502	{
				3503	/* dock delta_exec before expiring quota (as it could span periods) */
				3504	cfs_rq->runtime_remaining -= delta_exec;
				3505	expire_cfs_rq_runtime(cfs_rq);
				3506
				3507	if (likely(cfs_rq->runtime_remaining > 0))
				3508	return;
				3509
				3510	/*
				3511	* if we're unable to extend our runtime we resched so that the active
				3512	* hierarchy can be throttled
				3513	*/
				3514	if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
				3515	resched_curr(rq_of(cfs_rq));
				3516	}
				3517
				3518	static __always_inline
				3519	void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
				3520	{
				3521	if (!cfs_bandwidth_used() \|\| !cfs_rq->runtime_enabled)
				3522	return;
				3523
				3524	__account_cfs_rq_runtime(cfs_rq, delta_exec);
				3525	}
				3526
				3527	static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
				3528	{
				3529	return cfs_bandwidth_used() && cfs_rq->throttled;
				3530	}
				3531
				3532	/* check whether cfs_rq, or any parent, is throttled */
				3533	static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
				3534	{
				3535	return cfs_bandwidth_used() && cfs_rq->throttle_count;
				3536	}
				3537
				3538	/*
				3539	* Ensure that neither of the group entities corresponding to src_cpu or
				3540	* dest_cpu are members of a throttled hierarchy when performing group
				3541	* load-balance operations.
				3542	*/
				3543	static inline int throttled_lb_pair(struct task_group *tg,
				3544	int src_cpu, int dest_cpu)
				3545	{
				3546	struct cfs_rq src_cfs_rq, dest_cfs_rq;
				3547
				3548	src_cfs_rq = tg->cfs_rq[src_cpu];
				3549	dest_cfs_rq = tg->cfs_rq[dest_cpu];
				3550
				3551	return throttled_hierarchy(src_cfs_rq) \|\|
				3552	throttled_hierarchy(dest_cfs_rq);
				3553	}
				3554
				3555	/* updated child weight may affect parent so we have to do this bottom up */
				3556	static int tg_unthrottle_up(struct task_group tg, void data)
				3557	{
				3558	struct rq *rq = data;
				3559	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
				3560
				3561	cfs_rq->throttle_count--;
				3562	#ifdef CONFIG_SMP
				3563	if (!cfs_rq->throttle_count) {
				3564	/* adjust cfs_rq_clock_task() */
				3565	cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
				3566	cfs_rq->throttled_clock_task;
				3567	}
				3568	#endif
				3569
				3570	return 0;
				3571	}
				3572
				3573	static int tg_throttle_down(struct task_group tg, void data)
				3574	{
				3575	struct rq *rq = data;
				3576	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
				3577
				3578	/* group is entering throttled state, stop time */
				3579	if (!cfs_rq->throttle_count)
				3580	cfs_rq->throttled_clock_task = rq_clock_task(rq);
				3581	cfs_rq->throttle_count++;
				3582
				3583	return 0;
				3584	}
				3585
				3586	static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
				3587	{
				3588	struct rq *rq = rq_of(cfs_rq);
				3589	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
				3590	struct sched_entity *se;
				3591	long task_delta, dequeue = 1;
				3592	bool empty;
				3593
				3594	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
				3595
				3596	/* freeze hierarchy runnable averages while throttled */
				3597	rcu_read_lock();
				3598	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
				3599	rcu_read_unlock();
				3600
				3601	task_delta = cfs_rq->h_nr_running;
				3602	for_each_sched_entity(se) {
				3603	struct cfs_rq *qcfs_rq = cfs_rq_of(se);
				3604	/* throttled entity or throttle-on-deactivate */
				3605	if (!se->on_rq)
				3606	break;
				3607
				3608	if (dequeue)
				3609	dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
				3610	qcfs_rq->h_nr_running -= task_delta;
				3611
				3612	if (qcfs_rq->load.weight)
				3613	dequeue = 0;
				3614	}
				3615
				3616	if (!se)
				3617	sub_nr_running(rq, task_delta);
				3618
				3619	cfs_rq->throttled = 1;
				3620	cfs_rq->throttled_clock = rq_clock(rq);
				3621	raw_spin_lock(&cfs_b->lock);
				3622	empty = list_empty(&cfs_b->throttled_cfs_rq);
				3623
				3624	/*
				3625	* Add to the _head_ of the list, so that an already-started
				3626	* distribute_cfs_runtime will not see us
				3627	*/
				3628	list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
				3629
				3630	/*
				3631	* If we're the first throttled task, make sure the bandwidth
				3632	* timer is running.
				3633	*/
				3634	if (empty)
				3635	start_cfs_bandwidth(cfs_b);
				3636
				3637	raw_spin_unlock(&cfs_b->lock);
				3638	}
				3639
				3640	void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
				3641	{
				3642	struct rq *rq = rq_of(cfs_rq);
				3643	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
				3644	struct sched_entity *se;
				3645	int enqueue = 1;
				3646	long task_delta;
				3647
				3648	se = cfs_rq->tg->se[cpu_of(rq)];
				3649
				3650	cfs_rq->throttled = 0;
				3651
				3652	update_rq_clock(rq);
				3653
				3654	raw_spin_lock(&cfs_b->lock);
				3655	cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
				3656	list_del_rcu(&cfs_rq->throttled_list);
				3657	raw_spin_unlock(&cfs_b->lock);
				3658
				3659	/* update hierarchical throttle state */
				3660	walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
				3661
				3662	if (!cfs_rq->load.weight)
				3663	return;
				3664
				3665	task_delta = cfs_rq->h_nr_running;
				3666	for_each_sched_entity(se) {
				3667	if (se->on_rq)
				3668	enqueue = 0;
				3669
				3670	cfs_rq = cfs_rq_of(se);
				3671	if (enqueue)
				3672	enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
				3673	cfs_rq->h_nr_running += task_delta;
				3674
				3675	if (cfs_rq_throttled(cfs_rq))
				3676	break;
				3677	}
				3678
				3679	if (!se)
				3680	add_nr_running(rq, task_delta);
				3681
				3682	/* determine whether we need to wake up potentially idle cpu */
				3683	if (rq->curr == rq->idle && rq->cfs.nr_running)
				3684	resched_curr(rq);
				3685	}
				3686
				3687	static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
				3688	u64 remaining, u64 expires)
				3689	{
				3690	struct cfs_rq *cfs_rq;
				3691	u64 runtime;
				3692	u64 starting_runtime = remaining;
				3693
				3694	rcu_read_lock();
				3695	list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
				3696	throttled_list) {
				3697	struct rq *rq = rq_of(cfs_rq);
				3698
				3699	raw_spin_lock(&rq->lock);
				3700	if (!cfs_rq_throttled(cfs_rq))
				3701	goto next;
				3702
				3703	runtime = -cfs_rq->runtime_remaining + 1;
				3704	if (runtime > remaining)
				3705	runtime = remaining;
				3706	remaining -= runtime;
				3707
				3708	cfs_rq->runtime_remaining += runtime;
				3709	cfs_rq->runtime_expires = expires;
				3710
				3711	/* we check whether we're throttled above */
				3712	if (cfs_rq->runtime_remaining > 0)
				3713	unthrottle_cfs_rq(cfs_rq);
				3714
				3715	next:
				3716	raw_spin_unlock(&rq->lock);
				3717
				3718	if (!remaining)
				3719	break;
				3720	}
				3721	rcu_read_unlock();
				3722
				3723	return starting_runtime - remaining;
				3724	}
				3725
				3726	/*
				3727	* Responsible for refilling a task_group's bandwidth and unthrottling its
				3728	* cfs_rqs as appropriate. If there has been no activity within the last
				3729	* period the timer is deactivated until scheduling resumes; cfs_b->idle is
				3730	* used to track this state.
				3731	*/
				3732	static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
				3733	{
				3734	u64 runtime, runtime_expires;
				3735	int throttled;
				3736
				3737	/* no need to continue the timer with no bandwidth constraint */
				3738	if (cfs_b->quota == RUNTIME_INF)
				3739	goto out_deactivate;
				3740
				3741	throttled = !list_empty(&cfs_b->throttled_cfs_rq);
				3742	cfs_b->nr_periods += overrun;
				3743
				3744	/*
				3745	* idle depends on !throttled (for the case of a large deficit), and if
				3746	* we're going inactive then everything else can be deferred
				3747	*/
				3748	if (cfs_b->idle && !throttled)
				3749	goto out_deactivate;
				3750
				3751	__refill_cfs_bandwidth_runtime(cfs_b);
				3752
				3753	if (!throttled) {
				3754	/* mark as potentially idle for the upcoming period */
				3755	cfs_b->idle = 1;
				3756	return 0;
				3757	}
				3758
				3759	/* account preceding periods in which throttling occurred */
				3760	cfs_b->nr_throttled += overrun;
				3761
				3762	runtime_expires = cfs_b->runtime_expires;
				3763
				3764	/*
				3765	* This check is repeated as we are holding onto the new bandwidth while
				3766	* we unthrottle. This can potentially race with an unthrottled group
				3767	* trying to acquire new bandwidth from the global pool. This can result
				3768	* in us over-using our runtime if it is all used during this loop, but
				3769	* only by limited amounts in that extreme case.
				3770	*/
				3771	while (throttled && cfs_b->runtime > 0) {
				3772	runtime = cfs_b->runtime;
				3773	raw_spin_unlock(&cfs_b->lock);
				3774	/* we can't nest cfs_b->lock while distributing bandwidth */
				3775	runtime = distribute_cfs_runtime(cfs_b, runtime,
				3776	runtime_expires);
				3777	raw_spin_lock(&cfs_b->lock);
				3778
				3779	throttled = !list_empty(&cfs_b->throttled_cfs_rq);
				3780
				3781	cfs_b->runtime -= min(runtime, cfs_b->runtime);
				3782	}
				3783
				3784	/*
				3785	* While we are ensured activity in the period following an
				3786	* unthrottle, this also covers the case in which the new bandwidth is
				3787	* insufficient to cover the existing bandwidth deficit. (Forcing the
				3788	* timer to remain active while there are any throttled entities.)
				3789	*/
				3790	cfs_b->idle = 0;
				3791
				3792	return 0;
				3793
				3794	out_deactivate:
				3795	return 1;
				3796	}
				3797
				3798	/* a cfs_rq won't donate quota below this amount */
				3799	static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
				3800	/* minimum remaining period time to redistribute slack quota */
				3801	static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
				3802	/* how long we wait to gather additional slack before distributing */
				3803	static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
				3804
				3805	/*
				3806	* Are we near the end of the current quota period?
				3807	*
				3808	* Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
				3809	* hrtimer base being cleared by hrtimer_start. In the case of
				3810	* migrate_hrtimers, base is never cleared, so we are fine.
				3811	*/
				3812	static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
				3813	{
				3814	struct hrtimer *refresh_timer = &cfs_b->period_timer;
				3815	u64 remaining;
				3816
				3817	/* if the call-back is running a quota refresh is already occurring */
				3818	if (hrtimer_callback_running(refresh_timer))
				3819	return 1;
				3820
				3821	/* is a quota refresh about to occur? */
				3822	remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
				3823	if (remaining < min_expire)
				3824	return 1;
				3825
				3826	return 0;
				3827	}
				3828
				3829	static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
				3830	{
				3831	u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
				3832
				3833	/* if there's a quota refresh soon don't bother with slack */
				3834	if (runtime_refresh_within(cfs_b, min_left))
				3835	return;
				3836
				3837	hrtimer_start(&cfs_b->slack_timer,
				3838	ns_to_ktime(cfs_bandwidth_slack_period),
				3839	HRTIMER_MODE_REL);
				3840	}
				3841
				3842	/* we know any runtime found here is valid as update_curr() precedes return */
				3843	static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
				3844	{
				3845	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
				3846	s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
				3847
				3848	if (slack_runtime <= 0)
				3849	return;
				3850
				3851	raw_spin_lock(&cfs_b->lock);
				3852	if (cfs_b->quota != RUNTIME_INF &&
				3853	cfs_rq->runtime_expires == cfs_b->runtime_expires) {
				3854	cfs_b->runtime += slack_runtime;
				3855
				3856	/* we are under rq->lock, defer unthrottling using a timer */
				3857	if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
				3858	!list_empty(&cfs_b->throttled_cfs_rq))
				3859	start_cfs_slack_bandwidth(cfs_b);
				3860	}
				3861	raw_spin_unlock(&cfs_b->lock);
				3862
				3863	/* even if it's not valid for return we don't want to try again */
				3864	cfs_rq->runtime_remaining -= slack_runtime;
				3865	}
				3866
				3867	static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
				3868	{
				3869	if (!cfs_bandwidth_used())
				3870	return;
				3871
				3872	if (!cfs_rq->runtime_enabled \|\| cfs_rq->nr_running)
				3873	return;
				3874
				3875	__return_cfs_rq_runtime(cfs_rq);
				3876	}
				3877
				3878	/*
				3879	* This is done with a timer (instead of inline with bandwidth return) since
				3880	* it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
				3881	*/
				3882	static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
				3883	{
				3884	u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
				3885	u64 expires;
				3886
				3887	/* confirm we're still not at a refresh boundary */
				3888	raw_spin_lock(&cfs_b->lock);
				3889	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
				3890	raw_spin_unlock(&cfs_b->lock);
				3891	return;
				3892	}
				3893
				3894	if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
				3895	runtime = cfs_b->runtime;
				3896
				3897	expires = cfs_b->runtime_expires;
				3898	raw_spin_unlock(&cfs_b->lock);
				3899
				3900	if (!runtime)
				3901	return;
				3902
				3903	runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
				3904
				3905	raw_spin_lock(&cfs_b->lock);
				3906	if (expires == cfs_b->runtime_expires)
				3907	cfs_b->runtime -= min(runtime, cfs_b->runtime);
				3908	raw_spin_unlock(&cfs_b->lock);
				3909	}
				3910
				3911	/*
				3912	* When a group wakes up we want to make sure that its quota is not already
				3913	* expired/exceeded, otherwise it may be allowed to steal additional ticks of
				3914	* runtime as update_curr() throttling can not not trigger until it's on-rq.
				3915	*/
				3916	static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
				3917	{
				3918	if (!cfs_bandwidth_used())
				3919	return;
				3920
				3921	/* Synchronize hierarchical throttle counter: */
				3922	if (unlikely(!cfs_rq->throttle_uptodate)) {
				3923	struct rq *rq = rq_of(cfs_rq);
				3924	struct cfs_rq *pcfs_rq;
				3925	struct task_group *tg;
				3926
				3927	cfs_rq->throttle_uptodate = 1;
				3928
				3929	/* Get closest up-to-date node, because leaves go first: */
				3930	for (tg = cfs_rq->tg->parent; tg; tg = tg->parent) {
				3931	pcfs_rq = tg->cfs_rq[cpu_of(rq)];
				3932	if (pcfs_rq->throttle_uptodate)
				3933	break;
				3934	}
				3935	if (tg) {
				3936	cfs_rq->throttle_count = pcfs_rq->throttle_count;
				3937	cfs_rq->throttled_clock_task = rq_clock_task(rq);
				3938	}
				3939	}
				3940
				3941	/* an active group must be handled by the update_curr()->put() path */
				3942	if (!cfs_rq->runtime_enabled \|\| cfs_rq->curr)
				3943	return;
				3944
				3945	/* ensure the group is not already throttled */
				3946	if (cfs_rq_throttled(cfs_rq))
				3947	return;
				3948
				3949	/* update runtime allocation */
				3950	account_cfs_rq_runtime(cfs_rq, 0);
				3951	if (cfs_rq->runtime_remaining <= 0)
				3952	throttle_cfs_rq(cfs_rq);
				3953	}
				3954
				3955	/* conditionally throttle active cfs_rq's from put_prev_entity() */
				3956	static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
				3957	{
				3958	if (!cfs_bandwidth_used())
				3959	return false;
				3960
				3961	if (likely(!cfs_rq->runtime_enabled \|\| cfs_rq->runtime_remaining > 0))
				3962	return false;
				3963
				3964	/*
				3965	* it's possible for a throttled entity to be forced into a running
				3966	* state (e.g. set_curr_task), in this case we're finished.
				3967	*/
				3968	if (cfs_rq_throttled(cfs_rq))
				3969	return true;
				3970
				3971	throttle_cfs_rq(cfs_rq);
				3972	return true;
				3973	}
				3974
				3975	static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
				3976	{
				3977	struct cfs_bandwidth *cfs_b =
				3978	container_of(timer, struct cfs_bandwidth, slack_timer);
				3979
				3980	do_sched_cfs_slack_timer(cfs_b);
				3981
				3982	return HRTIMER_NORESTART;
				3983	}
				3984
				3985	static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
				3986	{
				3987	struct cfs_bandwidth *cfs_b =
				3988	container_of(timer, struct cfs_bandwidth, period_timer);
				3989	int overrun;
				3990	int idle = 0;
				3991
				3992	raw_spin_lock(&cfs_b->lock);
				3993	for (;;) {
				3994	overrun = hrtimer_forward_now(timer, cfs_b->period);
				3995	if (!overrun)
				3996	break;
				3997
				3998	idle = do_sched_cfs_period_timer(cfs_b, overrun);
				3999	}
				4000	if (idle)
				4001	cfs_b->period_active = 0;
				4002	raw_spin_unlock(&cfs_b->lock);
				4003
				4004	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
				4005	}
				4006
				4007	void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
				4008	{
				4009	raw_spin_lock_init(&cfs_b->lock);
				4010	cfs_b->runtime = 0;
				4011	cfs_b->quota = RUNTIME_INF;
				4012	cfs_b->period = ns_to_ktime(default_cfs_period());
				4013
				4014	INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
				4015	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
				4016	cfs_b->period_timer.function = sched_cfs_period_timer;
				4017	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
				4018	cfs_b->slack_timer.function = sched_cfs_slack_timer;
				4019	}
				4020
				4021	static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
				4022	{
				4023	cfs_rq->runtime_enabled = 0;
				4024	INIT_LIST_HEAD(&cfs_rq->throttled_list);
				4025	}
				4026
				4027	void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
				4028	{
				4029	lockdep_assert_held(&cfs_b->lock);
				4030
				4031	if (!cfs_b->period_active) {
				4032	cfs_b->period_active = 1;
				4033	hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
				4034	hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
				4035	}
				4036	}
				4037
				4038	static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
				4039	{
				4040	/* init_cfs_bandwidth() was not called */
				4041	if (!cfs_b->throttled_cfs_rq.next)
				4042	return;
				4043
				4044	hrtimer_cancel(&cfs_b->period_timer);
				4045	hrtimer_cancel(&cfs_b->slack_timer);
				4046	}
				4047
				4048	static void __maybe_unused update_runtime_enabled(struct rq *rq)
				4049	{
				4050	struct cfs_rq *cfs_rq;
				4051
				4052	for_each_leaf_cfs_rq(rq, cfs_rq) {
				4053	struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
				4054
				4055	raw_spin_lock(&cfs_b->lock);
				4056	cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
				4057	raw_spin_unlock(&cfs_b->lock);
				4058	}
				4059	}
				4060
				4061	static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
				4062	{
				4063	struct cfs_rq *cfs_rq;
				4064
				4065	for_each_leaf_cfs_rq(rq, cfs_rq) {
				4066	if (!cfs_rq->runtime_enabled)
				4067	continue;
				4068
				4069	/*
				4070	* clock_task is not advancing so we just need to make sure
				4071	* there's some valid quota amount
				4072	*/
				4073	cfs_rq->runtime_remaining = 1;
				4074	/*
				4075	* Offline rq is schedulable till cpu is completely disabled
				4076	* in take_cpu_down(), so we prevent new cfs throttling here.
				4077	*/
				4078	cfs_rq->runtime_enabled = 0;
				4079
				4080	if (cfs_rq_throttled(cfs_rq))
				4081	unthrottle_cfs_rq(cfs_rq);
				4082	}
				4083	}
				4084
				4085	#else /* CONFIG_CFS_BANDWIDTH */
				4086	static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
				4087	{
				4088	return rq_clock_task(rq_of(cfs_rq));
				4089	}
				4090
				4091	static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
				4092	static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
				4093	static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
				4094	static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
				4095
				/* !CONFIG_CFS_BANDWIDTH: nothing is ever throttled, so every query answers 0. */
				static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) { return 0; }

				static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) { return 0; }

				static inline int throttled_lb_pair(struct task_group *tg,
								    int src_cpu, int dest_cpu) { return 0; }
				4111
				/* !CONFIG_CFS_BANDWIDTH: there is no bandwidth state to initialise. */
				void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
				{
				}

				#ifdef CONFIG_FAIR_GROUP_SCHED
				static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
				{
				}
				#endif
				4117
				/* !CONFIG_CFS_BANDWIDTH: a task group has no cfs_bandwidth, so always NULL. */
				static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
				{
					return NULL;
				}
				/* !CONFIG_CFS_BANDWIDTH: teardown, runtime-enable and offline hooks do nothing. */
				static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
				{
				}

				static inline void update_runtime_enabled(struct rq *rq)
				{
				}

				static inline void unthrottle_offline_cfs_rqs(struct rq *rq)
				{
				}
				4125
				4126	#endif /* CONFIG_CFS_BANDWIDTH */
				4127
				4128	/**************************************************
				4129	* CFS operations on tasks:
				4130	*/
				4131
				4132	#ifdef CONFIG_SCHED_HRTICK
				4133	static void hrtick_start_fair(struct rq rq, struct task_struct p)
				4134	{
				4135	struct sched_entity *se = &p->se;
				4136	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				4137
				4138	WARN_ON(task_rq(p) != rq);
				4139
				4140	if (cfs_rq->nr_running > 1) {
				4141	u64 slice = sched_slice(cfs_rq, se);
				4142	u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
				4143	s64 delta = slice - ran;
				4144
				4145	if (delta < 0) {
				4146	if (rq->curr == p)
				4147	resched_curr(rq);
				4148	return;
				4149	}
				4150	hrtick_start(rq, delta);
				4151	}
				4152	}
				4153
				4154	/*
				4155	* called from enqueue/dequeue and updates the hrtick when the
				4156	* current task is from our class and nr_running is low enough
				4157	* to matter.
				4158	*/
				4159	static void hrtick_update(struct rq *rq)
				4160	{
				4161	struct task_struct *curr = rq->curr;
				4162
				4163	if (!hrtick_enabled(rq) \|\| curr->sched_class != &fair_sched_class)
				4164	return;
				4165
				4166	if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
				4167	hrtick_start_fair(rq, curr);
				4168	}
				4169	#else /* !CONFIG_SCHED_HRTICK */
				/* !CONFIG_SCHED_HRTICK: the high-resolution tick hooks are no-ops. */
				static inline void
				hrtick_start_fair(struct rq *rq, struct task_struct *p)
				{
				}

				static inline void hrtick_update(struct rq *rq)
				{
				}
				4178	#endif
				4179
				4180	/*
				4181	* The enqueue_task method is called before nr_running is
				4182	* increased. Here we update the fair scheduling stats and
				4183	* then put the task into the rbtree:
				4184	*/
				4185	static void
				4186	enqueue_task_fair(struct rq rq, struct task_struct p, int flags)
				4187	{
				4188	struct cfs_rq *cfs_rq;
				4189	struct sched_entity *se = &p->se;
				4190
				4191	for_each_sched_entity(se) {
				4192	if (se->on_rq)
				4193	break;
				4194	cfs_rq = cfs_rq_of(se);
				4195	enqueue_entity(cfs_rq, se, flags);
				4196
				4197	/*
				4198	* end evaluation on encountering a throttled cfs_rq
				4199	*
				4200	* note: in the case of encountering a throttled cfs_rq we will
				4201	* post the final h_nr_running increment below.
				4202	*/
				4203	if (cfs_rq_throttled(cfs_rq))
				4204	break;
				4205	cfs_rq->h_nr_running++;
				4206
				4207	flags = ENQUEUE_WAKEUP;
				4208	}
				4209
				4210	for_each_sched_entity(se) {
				4211	cfs_rq = cfs_rq_of(se);
				4212	cfs_rq->h_nr_running++;
				4213
				4214	if (cfs_rq_throttled(cfs_rq))
				4215	break;
				4216
				4217	update_load_avg(se, 1);
				4218	update_cfs_shares(cfs_rq);
				4219	}
				4220
				4221	if (!se)
				4222	add_nr_running(rq, 1);
				4223
				4224	hrtick_update(rq);
				4225	}
				4226
				4227	static void set_next_buddy(struct sched_entity *se);
				4228
				4229	/*
				4230	* The dequeue_task method is called before nr_running is
				4231	* decreased. We remove the task from the rbtree and
				4232	* update the fair scheduling stats:
				4233	*/
				4234	static void dequeue_task_fair(struct rq rq, struct task_struct p, int flags)
				4235	{
				4236	struct cfs_rq *cfs_rq;
				4237	struct sched_entity *se = &p->se;
				4238	int task_sleep = flags & DEQUEUE_SLEEP;
				4239
				4240	for_each_sched_entity(se) {
				4241	cfs_rq = cfs_rq_of(se);
				4242	dequeue_entity(cfs_rq, se, flags);
				4243
				4244	/*
				4245	* end evaluation on encountering a throttled cfs_rq
				4246	*
				4247	* note: in the case of encountering a throttled cfs_rq we will
				4248	* post the final h_nr_running decrement below.
				4249	*/
				4250	if (cfs_rq_throttled(cfs_rq))
				4251	break;
				4252	cfs_rq->h_nr_running--;
				4253
				4254	/* Don't dequeue parent if it has other entities besides us */
				4255	if (cfs_rq->load.weight) {
				4256	/* Avoid re-evaluating load for this entity: */
				4257	se = parent_entity(se);
				4258	/*
				4259	* Bias pick_next to pick a task from this cfs_rq, as
				4260	* p is sleeping when it is within its sched_slice.
				4261	*/
				4262	if (task_sleep && se && !throttled_hierarchy(cfs_rq))
				4263	set_next_buddy(se);
				4264	break;
				4265	}
				4266	flags \|= DEQUEUE_SLEEP;
				4267	}
				4268
				4269	for_each_sched_entity(se) {
				4270	cfs_rq = cfs_rq_of(se);
				4271	cfs_rq->h_nr_running--;
				4272
				4273	if (cfs_rq_throttled(cfs_rq))
				4274	break;
				4275
				4276	update_load_avg(se, 1);
				4277	update_cfs_shares(cfs_rq);
				4278	}
				4279
				4280	if (!se)
				4281	sub_nr_running(rq, 1);
				4282
				4283	hrtick_update(rq);
				4284	}
				4285
				4286	#ifdef CONFIG_SMP
				4287
				4288	/*
				4289	* per rq 'load' array crap; XXX kill this.
				4290	*/
				4291
				4292	/*
				4293	* The exact cpuload at various idx values, calculated at every tick would be
				4294	* load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
				4295	*
				4296	* If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
				4297	* on nth tick when cpu may be busy, then we have:
				4298	* load = ((2^idx - 1) / 2^idx)^(n-1) * load
				4299	* load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
				4300	*
				4301	* decay_load_missed() below does efficient calculation of
				4302	* load = ((2^idx - 1) / 2^idx)^(n-1) * load
				4303	* avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
				4304	*
				4305	* The calculation is approximated on a 128 point scale.
				4306	* degrade_zero_ticks is the number of ticks after which load at any
				4307	* particular idx is approximated to be zero.
				4308	* degrade_factor is a precomputed table, a row for each load idx.
				4309	* Each column corresponds to degradation factor for a power of two ticks,
				4310	* based on 128 point scale.
				4311	* Example:
				4312	* row 2, col 3 (=12) says that the degradation at load idx 2 after
				4313	* 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
				4314	*
				4315	* With this power of 2 load factors, we can degrade the load n times
				4316	* by looking at 1 bits in n and doing as many mult/shift instead of
				4317	* n mult/shifts needed by the exact degradation.
				4318	*/
				4319	#define DEGRADE_SHIFT 7
				4320	static const unsigned char
				4321	degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
				4322	static const unsigned char
				4323	degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
				4324	{0, 0, 0, 0, 0, 0, 0, 0},
				4325	{64, 32, 8, 0, 0, 0, 0, 0},
				4326	{96, 72, 40, 12, 1, 0, 0},
				4327	{112, 98, 75, 43, 15, 1, 0},
				4328	{120, 112, 98, 76, 45, 16, 2} };
				4329
				4330	/*
				4331	* Update cpu_load for any missed ticks, due to tickless idle. The backlog
				4332	* would be when CPU is idle and so we just decay the old load without
				4333	* adding any new load.
				4334	*/
				4335	static unsigned long
				4336	decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
				4337	{
				4338	int j = 0;
				4339
				4340	if (!missed_updates)
				4341	return load;
				4342
				4343	if (missed_updates >= degrade_zero_ticks[idx])
				4344	return 0;
				4345
				4346	if (idx == 1)
				4347	return load >> missed_updates;
				4348
				4349	while (missed_updates) {
				4350	if (missed_updates % 2)
				4351	load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
				4352
				4353	missed_updates >>= 1;
				4354	j++;
				4355	}
				4356	return load;
				4357	}
				4358
				4359	/*
				4360	* Update rq->cpu_load[] statistics. This function is usually called every
				4361	* scheduler tick (TICK_NSEC). With tickless idle this will not be called
				4362	* every tick. We fix it up based on jiffies.
				4363	*/
				4364	static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
				4365	unsigned long pending_updates)
				4366	{
				4367	int i, scale;
				4368
				4369	this_rq->nr_load_updates++;
				4370
				4371	/* Update our load: */
				4372	this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
				4373	for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
				4374	unsigned long old_load, new_load;
				4375
				4376	/* scale is effectively 1 << i now, and >> i divides by scale */
				4377
				4378	old_load = this_rq->cpu_load[i];
				4379	old_load = decay_load_missed(old_load, pending_updates - 1, i);
				4380	new_load = this_load;
				4381	/*
				4382	* Round up the averaging division if load is increasing. This
				4383	* prevents us from getting stuck on 9 if the load is 10, for
				4384	* example.
				4385	*/
				4386	if (new_load > old_load)
				4387	new_load += scale - 1;
				4388
				4389	this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
				4390	}
				4391
				4392	sched_avg_update(this_rq);
				4393	}
				4394
				4395	/* Used instead of source_load when we know the type == 0 */
				4396	static unsigned long weighted_cpuload(const int cpu)
				4397	{
				4398	return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs);
				4399	}
				4400
				4401	#ifdef CONFIG_NO_HZ_COMMON
				4402	/*
				4403	* There is no sane way to deal with nohz on smp when using jiffies because the
				4404	* cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
				4405	* causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
				4406	*
				4407	* Therefore we cannot use the delta approach from the regular tick since that
				4408	* would seriously skew the load calculation. However we'll make do for those
				4409	* updates happening while idle (nohz_idle_balance) or coming out of idle
				4410	* (tick_nohz_idle_exit).
				4411	*
				4412	* This means we might still be one tick off for nohz periods.
				4413	*/
				4414
				4415	/*
				4416	* Called from nohz_idle_balance() to update the load ratings before doing the
				4417	* idle balance.
				4418	*/
				4419	static void update_idle_cpu_load(struct rq *this_rq)
				4420	{
				4421	unsigned long curr_jiffies = READ_ONCE(jiffies);
				4422	unsigned long load = weighted_cpuload(cpu_of(this_rq));
				4423	unsigned long pending_updates;
				4424
				4425	/*
				4426	* bail if there's load or we're actually up-to-date.
				4427	*/
				4428	if (load \|\| curr_jiffies == this_rq->last_load_update_tick)
				4429	return;
				4430
				4431	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
				4432	this_rq->last_load_update_tick = curr_jiffies;
				4433
				4434	__update_cpu_load(this_rq, load, pending_updates);
				4435	}
				4436
				4437	/*
				4438	* Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
				4439	*/
				4440	void update_cpu_load_nohz(void)
				4441	{
				4442	struct rq *this_rq = this_rq();
				4443	unsigned long curr_jiffies = READ_ONCE(jiffies);
				4444	unsigned long pending_updates;
				4445
				4446	if (curr_jiffies == this_rq->last_load_update_tick)
				4447	return;
				4448
				4449	raw_spin_lock(&this_rq->lock);
				4450	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
				4451	if (pending_updates) {
				4452	this_rq->last_load_update_tick = curr_jiffies;
				4453	/*
				4454	* We were idle, this means load 0, the current load might be
				4455	* !0 due to remote wakeups and the sort.
				4456	*/
				4457	__update_cpu_load(this_rq, 0, pending_updates);
				4458	}
				4459	raw_spin_unlock(&this_rq->lock);
				4460	}
				4461	#endif /* CONFIG_NO_HZ */
				4462
				4463	/*
				4464	* Called from scheduler_tick()
				4465	*/
				4466	void update_cpu_load_active(struct rq *this_rq)
				4467	{
				4468	unsigned long load = weighted_cpuload(cpu_of(this_rq));
				4469	/*
				4470	* See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
				4471	*/
				4472	this_rq->last_load_update_tick = jiffies;
				4473	__update_cpu_load(this_rq, load, 1);
				4474	}
				4475
				4476	/*
				4477	* Return a low guess at the load of a migration-source cpu weighted
				4478	* according to the scheduling class and "nice" value.
				4479	*
				4480	* We want to under-estimate the load of migration sources, to
				4481	* balance conservatively.
				4482	*/
				4483	static unsigned long source_load(int cpu, int type)
				4484	{
				4485	struct rq *rq = cpu_rq(cpu);
				4486	unsigned long total = weighted_cpuload(cpu);
				4487
				4488	if (type == 0 \|\| !sched_feat(LB_BIAS))
				4489	return total;
				4490
				4491	return min(rq->cpu_load[type-1], total);
				4492	}
				4493
				4494	/*
				4495	* Return a high guess at the load of a migration-target cpu weighted
				4496	* according to the scheduling class and "nice" value.
				4497	*/
				4498	static unsigned long target_load(int cpu, int type)
				4499	{
				4500	struct rq *rq = cpu_rq(cpu);
				4501	unsigned long total = weighted_cpuload(cpu);
				4502
				4503	if (type == 0 \|\| !sched_feat(LB_BIAS))
				4504	return total;
				4505
				4506	return max(rq->cpu_load[type-1], total);
				4507	}
				4508
				4509	static unsigned long capacity_of(int cpu)
				4510	{
				4511	return cpu_rq(cpu)->cpu_capacity;
				4512	}
				4513
				4514	static unsigned long capacity_orig_of(int cpu)
				4515	{
				4516	return cpu_rq(cpu)->cpu_capacity_orig;
				4517	}
				4518
				4519	static unsigned long cpu_avg_load_per_task(int cpu)
				4520	{
				4521	struct rq *rq = cpu_rq(cpu);
				4522	unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
				4523	unsigned long load_avg = weighted_cpuload(cpu);
				4524
				4525	if (nr_running)
				4526	return load_avg / nr_running;
				4527
				4528	return 0;
				4529	}
				4530
				4531	static void record_wakee(struct task_struct *p)
				4532	{
				4533	/*
				4534	* Rough decay (wiping) for cost saving, don't worry
				4535	* about the boundary, really active task won't care
				4536	* about the loss.
				4537	*/
				4538	if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
				4539	current->wakee_flips >>= 1;
				4540	current->wakee_flip_decay_ts = jiffies;
				4541	}
				4542
				4543	if (current->last_wakee != p) {
				4544	current->last_wakee = p;
				4545	current->wakee_flips++;
				4546	}
				4547	}
				4548
				4549	static void task_waking_fair(struct task_struct *p)
				4550	{
				4551	struct sched_entity *se = &p->se;
				4552	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				4553	u64 min_vruntime;
				4554
				4555	#ifndef CONFIG_64BIT
				4556	u64 min_vruntime_copy;
				4557
				4558	do {
				4559	min_vruntime_copy = cfs_rq->min_vruntime_copy;
				4560	smp_rmb();
				4561	min_vruntime = cfs_rq->min_vruntime;
				4562	} while (min_vruntime != min_vruntime_copy);
				4563	#else
				4564	min_vruntime = cfs_rq->min_vruntime;
				4565	#endif
				4566
				4567	se->vruntime -= min_vruntime;
				4568	record_wakee(p);
				4569	}
				4570
				4571	#ifdef CONFIG_FAIR_GROUP_SCHED
				4572	/*
				4573	* effective_load() calculates the load change as seen from the root_task_group
				4574	*
				4575	* Adding load to a group doesn't make a group heavier, but can cause movement
				4576	* of group shares between cpus. Assuming the shares were perfectly aligned one
				4577	* can calculate the shift in shares.
				4578	*
				4579	* Calculate the effective load difference if @wl is added (subtracted) to @tg
				4580	* on this @cpu and results in a total addition (subtraction) of @wg to the
				4581	* total group weight.
				4582	*
				4583	* Given a runqueue weight distribution (rw_i) we can compute a shares
				4584	* distribution (s_i) using:
				4585	*
				4586	* s_i = rw_i / \Sum rw_j (1)
				4587	*
				4588	* Suppose we have 4 CPUs and our @tg is a direct child of the root group and
				4589	* has 7 equal weight tasks, distributed as below (rw_i), with the resulting
				4590	* shares distribution (s_i):
				4591	*
				4592	* rw_i = { 2, 4, 1, 0 }
				4593	* s_i = { 2/7, 4/7, 1/7, 0 }
				4594	*
				4595	* As per wake_affine() we're interested in the load of two CPUs (the CPU the
				4596	* task used to run on and the CPU the waker is running on), we need to
				4597	* compute the effect of waking a task on either CPU and, in case of a sync
				4598	* wakeup, compute the effect of the current task going to sleep.
				4599	*
				4600	* So for a change of @wl to the local @cpu with an overall group weight change
				4601	* of @wg we can compute the new shares distribution (s'_i) using:
				4602	*
				4603	* s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2)
				4604	*
				4605	* Suppose we're interested in CPUs 0 and 1, and want to compute the load
				4606	* differences in waking a task to CPU 0. The additional task changes the
				4607	* weight and shares distributions like:
				4608	*
				4609	* rw'_i = { 3, 4, 1, 0 }
				4610	* s'_i = { 3/8, 4/8, 1/8, 0 }
				4611	*
				4612	* We can then compute the difference in effective weight by using:
				4613	*
				4614	* dw_i = S * (s'_i - s_i) (3)
				4615	*
				4616	* Where 'S' is the group weight as seen by its parent.
				4617	*
				4618	* Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
				4619	* times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
				4620	* 4/7) times the weight of the group.
				4621	*/
				4622	static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
				4623	{
				4624	struct sched_entity *se = tg->se[cpu];
				4625
				4626	if (!tg->parent) /* the trivial, non-cgroup case */
				4627	return wl;
				4628
				4629	for_each_sched_entity(se) {
				4630	struct cfs_rq *cfs_rq = se->my_q;
				4631	long W, w = cfs_rq_load_avg(cfs_rq);
				4632
				4633	tg = cfs_rq->tg;
				4634
				4635	/*
				4636	* W = @wg + \Sum rw_j
				4637	*/
				4638	W = wg + atomic_long_read(&tg->load_avg);
				4639
				4640	/* Ensure \Sum rw_j >= rw_i */
				4641	W -= cfs_rq->tg_load_avg_contrib;
				4642	W += w;
				4643
				4644	/*
				4645	* w = rw_i + @wl
				4646	*/
				4647	w += wl;
				4648
				4649	/*
				4650	* wl = S * s'_i; see (2)
				4651	*/
				4652	if (W > 0 && w < W)
				4653	wl = (w * (long)tg->shares) / W;
				4654	else
				4655	wl = tg->shares;
				4656
				4657	/*
				4658	* Per the above, wl is the new se->load.weight value; since
				4659	* those are clipped to [MIN_SHARES, ...) do so now. See
				4660	* calc_cfs_shares().
				4661	*/
				4662	if (wl < MIN_SHARES)
				4663	wl = MIN_SHARES;
				4664
				4665	/*
				4666	* wl = dw_i = S * (s'_i - s_i); see (3)
				4667	*/
				4668	wl -= se->avg.load_avg;
				4669
				4670	/*
				4671	* Recursively apply this logic to all parent groups to compute
				4672	* the final effective load change on the root group. Since
				4673	* only the @tg group gets extra weight, all parent groups can
				4674	* only redistribute existing shares. @wl is the shift in shares
				4675	* resulting from this level per the above.
				4676	*/
				4677	wg = 0;
				4678	}
				4679
				4680	return wl;
				4681	}
				4682	#else
				4683
				/*
				 * Variant built from the #else branch: no group hierarchy is walked and
				 * the requested weight delta @wl is handed back untouched. @tg, @cpu and
				 * @wg are ignored.
				 */
				static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
				{
					return wl;
				}
				4688
				4689	#endif
				4690
				4691	/*
				4692	* Detect M:N waker/wakee relationships via a switching-frequency heuristic.
				4693	* A waker of many should wake a different task than the one last awakened
				4694	* at a frequency roughly N times higher than one of its wakees. In order
				4695	* to determine whether we should let the load spread vs consolodating to
				4696	* shared cache, we look for a minimum 'flip' frequency of llc_size in one
				4697	* partner, and a factor of lls_size higher frequency in the other. With
				4698	* both conditions met, we can be relatively sure that the relationship is
				4699	* non-monogamous, with partner count exceeding socket size. Waker/wakee
				4700	* being client/server, worker/dispatcher, interrupt source or whatever is
				4701	* irrelevant, spread criteria is apparent partner count exceeds socket size.
				4702	*/
				4703	static int wake_wide(struct task_struct *p)
				4704	{
				4705	unsigned int master = current->wakee_flips;
				4706	unsigned int slave = p->wakee_flips;
				4707	int factor = this_cpu_read(sd_llc_size);
				4708
				4709	if (master < slave)
				4710	swap(master, slave);
				4711	if (slave < factor \|\| master < slave * factor)
				4712	return 0;
				4713	return 1;
				4714	}
				4715
				4716	static int wake_affine(struct sched_domain sd, struct task_struct p, int sync)
				4717	{
				4718	s64 this_load, load;
				4719	s64 this_eff_load, prev_eff_load;
				4720	int idx, this_cpu, prev_cpu;
				4721	struct task_group *tg;
				4722	unsigned long weight;
				4723	int balanced;
				4724
				4725	idx = sd->wake_idx;
				4726	this_cpu = smp_processor_id();
				4727	prev_cpu = task_cpu(p);
				4728	load = source_load(prev_cpu, idx);
				4729	this_load = target_load(this_cpu, idx);
				4730
				4731	/*
				4732	* If sync wakeup then subtract the (maximum possible)
				4733	* effect of the currently running task from the load
				4734	* of the current CPU:
				4735	*/
				4736	if (sync) {
				4737	tg = task_group(current);
				4738	weight = current->se.avg.load_avg;
				4739
				4740	this_load += effective_load(tg, this_cpu, -weight, -weight);
				4741	load += effective_load(tg, prev_cpu, 0, -weight);
				4742	}
				4743
				4744	tg = task_group(p);
				4745	weight = p->se.avg.load_avg;
				4746
				4747	/*
				4748	* In low-load situations, where prev_cpu is idle and this_cpu is idle
				4749	* due to the sync cause above having dropped this_load to 0, we'll
				4750	* always have an imbalance, but there's really nothing you can do
				4751	* about that, so that's good too.
				4752	*
				4753	* Otherwise check if either cpus are near enough in load to allow this
				4754	* task to be woken on this_cpu.
				4755	*/
				4756	this_eff_load = 100;
				4757	this_eff_load *= capacity_of(prev_cpu);
				4758
				4759	prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
				4760	prev_eff_load *= capacity_of(this_cpu);
				4761
				4762	if (this_load > 0) {
				4763	this_eff_load *= this_load +
				4764	effective_load(tg, this_cpu, weight, weight);
				4765
				4766	prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
				4767	}
				4768
				4769	balanced = this_eff_load <= prev_eff_load;
				4770
				4771	schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
				4772
				4773	if (!balanced)
				4774	return 0;
				4775
				4776	schedstat_inc(sd, ttwu_move_affine);
				4777	schedstat_inc(p, se.statistics.nr_wakeups_affine);
				4778
				4779	return 1;
				4780	}
				4781
				4782	/*
				4783	* find_idlest_group finds and returns the least busy CPU group within the
				4784	* domain.
				4785	*/
				4786	static struct sched_group *
				4787	find_idlest_group(struct sched_domain sd, struct task_struct p,
				4788	int this_cpu, int sd_flag)
				4789	{
				4790	struct sched_group idlest = NULL, group = sd->groups;
				4791	unsigned long min_load = ULONG_MAX, this_load = 0;
				4792	int load_idx = sd->forkexec_idx;
				4793	int imbalance = 100 + (sd->imbalance_pct-100)/2;
				4794
				4795	if (sd_flag & SD_BALANCE_WAKE)
				4796	load_idx = sd->wake_idx;
				4797
				4798	do {
				4799	unsigned long load, avg_load;
				4800	int local_group;
				4801	int i;
				4802
				4803	/* Skip over this group if it has no CPUs allowed */
				4804	if (!cpumask_intersects(sched_group_cpus(group),
				4805	tsk_cpus_allowed(p)))
				4806	continue;
				4807
				4808	local_group = cpumask_test_cpu(this_cpu,
				4809	sched_group_cpus(group));
				4810
				4811	/* Tally up the load of all CPUs in the group */
				4812	avg_load = 0;
				4813
				4814	for_each_cpu(i, sched_group_cpus(group)) {
				4815	/* Bias balancing toward cpus of our domain */
				4816	if (local_group)
				4817	load = source_load(i, load_idx);
				4818	else
				4819	load = target_load(i, load_idx);
				4820
				4821	avg_load += load;
				4822	}
				4823
				4824	/* Adjust by relative CPU capacity of the group */
				4825	avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity;
				4826
				4827	if (local_group) {
				4828	this_load = avg_load;
				4829	} else if (avg_load < min_load) {
				4830	min_load = avg_load;
				4831	idlest = group;
				4832	}
				4833	} while (group = group->next, group != sd->groups);
				4834
				4835	if (!idlest \|\| 100this_load < imbalancemin_load)
				4836	return NULL;
				4837	return idlest;
				4838	}
				4839
				4840	/*
				4841	* find_idlest_cpu - find the idlest cpu among the cpus in group.
				4842	*/
				4843	static int
				4844	find_idlest_cpu(struct sched_group group, struct task_struct p, int this_cpu)
				4845	{
				4846	unsigned long load, min_load = ULONG_MAX;
				4847	unsigned int min_exit_latency = UINT_MAX;
				4848	u64 latest_idle_timestamp = 0;
				4849	int least_loaded_cpu = this_cpu;
				4850	int shallowest_idle_cpu = -1;
				4851	int i;
				4852
				4853	/* Traverse only the allowed CPUs */
				4854	for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
				4855	if (idle_cpu(i)) {
				4856	struct rq *rq = cpu_rq(i);
				4857	struct cpuidle_state *idle = idle_get_state(rq);
				4858	if (idle && idle->exit_latency < min_exit_latency) {
				4859	/*
				4860	* We give priority to a CPU whose idle state
				4861	* has the smallest exit latency irrespective
				4862	* of any idle timestamp.
				4863	*/
				4864	min_exit_latency = idle->exit_latency;
				4865	latest_idle_timestamp = rq->idle_stamp;
				4866	shallowest_idle_cpu = i;
				4867	} else if ((!idle \|\| idle->exit_latency == min_exit_latency) &&
				4868	rq->idle_stamp > latest_idle_timestamp) {
				4869	/*
				4870	* If equal or no active idle state, then
				4871	* the most recently idled CPU might have
				4872	* a warmer cache.
				4873	*/
				4874	latest_idle_timestamp = rq->idle_stamp;
				4875	shallowest_idle_cpu = i;
				4876	}
				4877	} else if (shallowest_idle_cpu == -1) {
				4878	load = weighted_cpuload(i);
				4879	if (load < min_load \|\| (load == min_load && i == this_cpu)) {
				4880	min_load = load;
				4881	least_loaded_cpu = i;
				4882	}
				4883	}
				4884	}
				4885
				4886	return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
				4887	}
				4888
				4889	/*
				4890	* Try and locate an idle CPU in the sched_domain.
				4891	*/
				4892	static int select_idle_sibling(struct task_struct *p, int target)
				4893	{
				4894	struct sched_domain *sd;
				4895	struct sched_group *sg;
				4896	int i = task_cpu(p);
				4897
				4898	if (idle_cpu(target))
				4899	return target;
				4900
				4901	/*
				4902	* If the prevous cpu is cache affine and idle, don't be stupid.
				4903	*/
				4904	if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
				4905	return i;
				4906
				4907	/*
				4908	* Otherwise, iterate the domains and find an elegible idle cpu.
				4909	*/
				4910	sd = rcu_dereference(per_cpu(sd_llc, target));
				4911	for_each_lower_domain(sd) {
				4912	sg = sd->groups;
				4913	do {
				4914	if (!cpumask_intersects(sched_group_cpus(sg),
				4915	tsk_cpus_allowed(p)))
				4916	goto next;
				4917
				4918	for_each_cpu(i, sched_group_cpus(sg)) {
				4919	if (i == target \|\| !idle_cpu(i))
				4920	goto next;
				4921	}
				4922
				4923	target = cpumask_first_and(sched_group_cpus(sg),
				4924	tsk_cpus_allowed(p));
				4925	goto done;
				4926	next:
				4927	sg = sg->next;
				4928	} while (sg != sd->groups);
				4929	}
				4930	done:
				4931	return target;
				4932	}
				4933
				4934	/*
				4935	* cpu_util returns the amount of capacity of a CPU that is used by CFS
				4936	* tasks. The unit of the return value must be the one of capacity so we can
				4937	* compare the utilization with the capacity of the CPU that is available for
				4938	* CFS task (ie cpu_capacity).
				4939	*
				4940	* cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
				4941	* recent utilization of currently non-runnable tasks on a CPU. It represents
				4942	* the amount of utilization of a CPU in the range [0..capacity_orig] where
				4943	* capacity_orig is the cpu_capacity available at the highest frequency
				4944	* (arch_scale_freq_capacity()).
				4945	* The utilization of a CPU converges towards a sum equal to or less than the
				4946	* current capacity (capacity_curr <= capacity_orig) of the CPU because it is
				4947	* the running time on this CPU scaled by capacity_curr.
				4948	*
				4949	* Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
				4950	* higher than capacity_orig because of unfortunate rounding in
				4951	* cfs.avg.util_avg or just after migrating tasks and new task wakeups until
				4952	* the average stabilizes with the new running time. We need to check that the
				4953	* utilization stays within the range of [0..capacity_orig] and cap it if
				4954	* necessary. Without utilization capping, a group could be seen as overloaded
				4955	* (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
				4956	* available capacity. We allow utilization to overshoot capacity_curr (but not
				4957	* capacity_orig) as it is useful for predicting the capacity required after task
				4958	* migrations (scheduler-driven DVFS).
				4959	*/
				4960	static int cpu_util(int cpu)
				4961	{
				4962	unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
				4963	unsigned long capacity = capacity_orig_of(cpu);
				4964
				4965	return (util >= capacity) ? capacity : util;
				4966	}
				4967
				4968	/*
				4969	* select_task_rq_fair: Select target runqueue for the waking task in domains
				4970	* that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
				4971	* SD_BALANCE_FORK, or SD_BALANCE_EXEC.
				4972	*
				4973	* Balances load by selecting the idlest cpu in the idlest group, or under
				4974	* certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set.
				4975	*
				4976	* Returns the target cpu number.
				4977	*
				4978	* preempt must be disabled.
				4979	*/
				4980	static int
				4981	select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
				4982	{
				4983	struct sched_domain tmp, affine_sd = NULL, *sd = NULL;
				4984	int cpu = smp_processor_id();
				4985	int new_cpu = prev_cpu;
				4986	int want_affine = 0;
				4987	int sync = wake_flags & WF_SYNC;
				4988
				4989	if (sd_flag & SD_BALANCE_WAKE)
				4990	want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
				4991
				4992	rcu_read_lock();
				4993	for_each_domain(cpu, tmp) {
				4994	if (!(tmp->flags & SD_LOAD_BALANCE))
				4995	break;
				4996
				4997	/*
				4998	* If both cpu and prev_cpu are part of this domain,
				4999	* cpu is a valid SD_WAKE_AFFINE target.
				5000	*/
				5001	if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
				5002	cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
				5003	affine_sd = tmp;
				5004	break;
				5005	}
				5006
				5007	if (tmp->flags & sd_flag)
				5008	sd = tmp;
				5009	else if (!want_affine)
				5010	break;
				5011	}
				5012
				5013	if (affine_sd) {
				5014	sd = NULL; /* Prefer wake_affine over balance flags */
				5015	if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
				5016	new_cpu = cpu;
				5017	}
				5018
				5019	if (!sd) {
				5020	if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
				5021	new_cpu = select_idle_sibling(p, new_cpu);
				5022
				5023	} else while (sd) {
				5024	struct sched_group *group;
				5025	int weight;
				5026
				5027	if (!(sd->flags & sd_flag)) {
				5028	sd = sd->child;
				5029	continue;
				5030	}
				5031
				5032	group = find_idlest_group(sd, p, cpu, sd_flag);
				5033	if (!group) {
				5034	sd = sd->child;
				5035	continue;
				5036	}
				5037
				5038	new_cpu = find_idlest_cpu(group, p, cpu);
				5039	if (new_cpu == -1 \|\| new_cpu == cpu) {
				5040	/* Now try balancing at a lower domain level of cpu */
				5041	sd = sd->child;
				5042	continue;
				5043	}
				5044
				5045	/* Now try balancing at a lower domain level of new_cpu */
				5046	cpu = new_cpu;
				5047	weight = sd->span_weight;
				5048	sd = NULL;
				5049	for_each_domain(cpu, tmp) {
				5050	if (weight <= tmp->span_weight)
				5051	break;
				5052	if (tmp->flags & sd_flag)
				5053	sd = tmp;
				5054	}
				5055	/* while loop will break here if sd == NULL */
				5056	}
				5057	rcu_read_unlock();
				5058
				5059	return new_cpu;
				5060	}
				5061
				5062	/*
				5063	* Called immediately before a task is migrated to a new cpu; task_cpu(p) and
				5064	* cfs_rq_of(p) references at time of call are still valid and identify the
				5065	* previous cpu. However, the caller only guarantees p->pi_lock is held; no
				5066	* other assumptions, including the state of rq->lock, should be made.
				5067	*/
				5068	static void migrate_task_rq_fair(struct task_struct *p)
				5069	{
				5070	/*
				5071	* We are supposed to update the task to "current" time, then its up to date
				5072	* and ready to go to new CPU/cfs_rq. But we have difficulty in getting
				5073	* what current time is, so simply throw away the out-of-date time. This
				5074	* will result in the wakee task is less decayed, but giving the wakee more
				5075	* load sounds not bad.
				5076	*/
				5077	remove_entity_load_avg(&p->se);
				5078
				5079	/* Tell new CPU we are migrated */
				5080	p->se.avg.last_update_time = 0;
				5081
				5082	/* We have migrated, no longer consider this task hot */
				5083	p->se.exec_start = 0;
				5084	}
				5085
				5086	static void task_dead_fair(struct task_struct *p)
				5087	{
				5088	remove_entity_load_avg(&p->se);
				5089	}
				5090	#endif /* CONFIG_SMP */
				5091
				5092	static unsigned long
				5093	wakeup_gran(struct sched_entity curr, struct sched_entity se)
				5094	{
				5095	unsigned long gran = sysctl_sched_wakeup_granularity;
				5096
				5097	/*
				5098	* Since its curr running now, convert the gran from real-time
				5099	* to virtual-time in his units.
				5100	*
				5101	* By using 'se' instead of 'curr' we penalize light tasks, so
				5102	* they get preempted easier. That is, if 'se' < 'curr' then
				5103	* the resulting gran will be larger, therefore penalizing the
				5104	* lighter, if otoh 'se' > 'curr' then the resulting gran will
				5105	* be smaller, again penalizing the lighter task.
				5106	*
				5107	* This is especially important for buddies when the leftmost
				5108	* task is higher priority than the buddy.
				5109	*/
				5110	return calc_delta_fair(gran, se);
				5111	}
				5112
				5113	/*
				5114	* Should 'se' preempt 'curr'.
				5115	*
				5116	* \|s1
				5117	* \|s2
				5118	* \|s3
				5119	* g
				5120	* \|<--->\|c
				5121	*
				5122	* w(c, s1) = -1
				5123	* w(c, s2) = 0
				5124	* w(c, s3) = 1
				5125	*
				5126	*/
				5127	static int
				5128	wakeup_preempt_entity(struct sched_entity curr, struct sched_entity se)
				5129	{
				5130	s64 gran, vdiff = curr->vruntime - se->vruntime;
				5131
				5132	if (vdiff <= 0)
				5133	return -1;
				5134
				5135	gran = wakeup_gran(curr, se);
				5136	if (vdiff > gran)
				5137	return 1;
				5138
				5139	return 0;
				5140	}
				5141
				5142	static void set_last_buddy(struct sched_entity *se)
				5143	{
				5144	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
				5145	return;
				5146
				5147	for_each_sched_entity(se)
				5148	cfs_rq_of(se)->last = se;
				5149	}
				5150
				5151	static void set_next_buddy(struct sched_entity *se)
				5152	{
				5153	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
				5154	return;
				5155
				5156	for_each_sched_entity(se)
				5157	cfs_rq_of(se)->next = se;
				5158	}
				5159
				5160	static void set_skip_buddy(struct sched_entity *se)
				5161	{
				5162	for_each_sched_entity(se)
				5163	cfs_rq_of(se)->skip = se;
				5164	}
				5165
				5166	/*
				5167	* Preempt the current task with a newly woken task if needed:
				5168	*/
				5169	static void check_preempt_wakeup(struct rq rq, struct task_struct p, int wake_flags)
				5170	{
				5171	struct task_struct *curr = rq->curr;
				5172	struct sched_entity se = &curr->se, pse = &p->se;
				5173	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
				5174	int scale = cfs_rq->nr_running >= sched_nr_latency;
				5175	int next_buddy_marked = 0;
				5176
				5177	if (unlikely(se == pse))
				5178	return;
				5179
				5180	/*
				5181	* This is possible from callers such as attach_tasks(), in which we
				5182	* unconditionally check_prempt_curr() after an enqueue (which may have
				5183	* lead to a throttle). This both saves work and prevents false
				5184	* next-buddy nomination below.
				5185	*/
				5186	if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
				5187	return;
				5188
				5189	if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
				5190	set_next_buddy(pse);
				5191	next_buddy_marked = 1;
				5192	}
				5193
				5194	/*
				5195	* We can come here with TIF_NEED_RESCHED already set from new task
				5196	* wake up path.
				5197	*
				5198	* Note: this also catches the edge-case of curr being in a throttled
				5199	* group (e.g. via set_curr_task), since update_curr() (in the
				5200	* enqueue of curr) will have resulted in resched being set. This
				5201	* prevents us from potentially nominating it as a false LAST_BUDDY
				5202	* below.
				5203	*/
				5204	if (test_tsk_need_resched(curr))
				5205	return;
				5206
				5207	/* Idle tasks are by definition preempted by non-idle tasks. */
				5208	if (unlikely(curr->policy == SCHED_IDLE) &&
				5209	likely(p->policy != SCHED_IDLE))
				5210	goto preempt;
				5211
				5212	/*
				5213	* Batch and idle tasks do not preempt non-idle tasks (their preemption
				5214	* is driven by the tick):
				5215	*/
				5216	if (unlikely(p->policy != SCHED_NORMAL) \|\| !sched_feat(WAKEUP_PREEMPTION))
				5217	return;
				5218
				5219	find_matching_se(&se, &pse);
				5220	update_curr(cfs_rq_of(se));
				5221	BUG_ON(!pse);
				5222	if (wakeup_preempt_entity(se, pse) == 1) {
				5223	/*
				5224	* Bias pick_next to pick the sched entity that is
				5225	* triggering this preemption.
				5226	*/
				5227	if (!next_buddy_marked)
				5228	set_next_buddy(pse);
				5229	goto preempt;
				5230	}
				5231
				5232	return;
				5233
				5234	preempt:
				5235	resched_curr(rq);
				5236	/*
				5237	* Only set the backward buddy when the current task is still
				5238	* on the rq. This can happen when a wakeup gets interleaved
				5239	* with schedule on the ->pre_schedule() or idle_balance()
				5240	* point, either of which can * drop the rq lock.
				5241	*
				5242	* Also, during early boot the idle thread is in the fair class,
				5243	* for obvious reasons its a bad idea to schedule back to it.
				5244	*/
				5245	if (unlikely(!se->on_rq \|\| curr == rq->idle))
				5246	return;
				5247
				5248	if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
				5249	set_last_buddy(se);
				5250	}
				5251
				5252	static struct task_struct *
				5253	pick_next_task_fair(struct rq rq, struct task_struct prev)
				5254	{
				5255	struct cfs_rq *cfs_rq = &rq->cfs;
				5256	struct sched_entity *se;
				5257	struct task_struct *p;
				5258	int new_tasks;
				5259
				5260	again:
				5261	#ifdef CONFIG_FAIR_GROUP_SCHED
				5262	if (!cfs_rq->nr_running)
				5263	goto idle;
				5264
				5265	if (prev->sched_class != &fair_sched_class)
				5266	goto simple;
				5267
				5268	/*
				5269	* Because of the set_next_buddy() in dequeue_task_fair() it is rather
				5270	* likely that a next task is from the same cgroup as the current.
				5271	*
				5272	* Therefore attempt to avoid putting and setting the entire cgroup
				5273	* hierarchy, only change the part that actually changes.
				5274	*/
				5275
				5276	do {
				5277	struct sched_entity *curr = cfs_rq->curr;
				5278
				5279	/*
				5280	* Since we got here without doing put_prev_entity() we also
				5281	* have to consider cfs_rq->curr. If it is still a runnable
				5282	* entity, update_curr() will update its vruntime, otherwise
				5283	* forget we've ever seen it.
				5284	*/
				5285	if (curr) {
				5286	if (curr->on_rq)
				5287	update_curr(cfs_rq);
				5288	else
				5289	curr = NULL;
				5290
				5291	/*
				5292	* This call to check_cfs_rq_runtime() will do the
				5293	* throttle and dequeue its entity in the parent(s).
				5294	* Therefore the 'simple' nr_running test will indeed
				5295	* be correct.
				5296	*/
				5297	if (unlikely(check_cfs_rq_runtime(cfs_rq)))
				5298	goto simple;
				5299	}
				5300
				5301	se = pick_next_entity(cfs_rq, curr);
				5302	cfs_rq = group_cfs_rq(se);
				5303	} while (cfs_rq);
				5304
				5305	p = task_of(se);
				5306
				5307	/*
				5308	* Since we haven't yet done put_prev_entity and if the selected task
				5309	* is a different task than we started out with, try and touch the
				5310	* least amount of cfs_rqs.
				5311	*/
				5312	if (prev != p) {
				5313	struct sched_entity *pse = &prev->se;
				5314
				5315	while (!(cfs_rq = is_same_group(se, pse))) {
				5316	int se_depth = se->depth;
				5317	int pse_depth = pse->depth;
				5318
				5319	if (se_depth <= pse_depth) {
				5320	put_prev_entity(cfs_rq_of(pse), pse);
				5321	pse = parent_entity(pse);
				5322	}
				5323	if (se_depth >= pse_depth) {
				5324	set_next_entity(cfs_rq_of(se), se);
				5325	se = parent_entity(se);
				5326	}
				5327	}
				5328
				5329	put_prev_entity(cfs_rq, pse);
				5330	set_next_entity(cfs_rq, se);
				5331	}
				5332
				5333	if (hrtick_enabled(rq))
				5334	hrtick_start_fair(rq, p);
				5335
				5336	return p;
				5337	simple:
				5338	cfs_rq = &rq->cfs;
				5339	#endif
				5340
				5341	if (!cfs_rq->nr_running)
				5342	goto idle;
				5343
				5344	put_prev_task(rq, prev);
				5345
				5346	do {
				5347	se = pick_next_entity(cfs_rq, NULL);
				5348	set_next_entity(cfs_rq, se);
				5349	cfs_rq = group_cfs_rq(se);
				5350	} while (cfs_rq);
				5351
				5352	p = task_of(se);
				5353
				5354	if (hrtick_enabled(rq))
				5355	hrtick_start_fair(rq, p);
				5356
				5357	return p;
				5358
				5359	idle:
				5360	/*
				5361	* This is OK, because current is on_cpu, which avoids it being picked
				5362	* for load-balance and preemption/IRQs are still disabled avoiding
				5363	* further scheduler activity on it and we're being very careful to
				5364	* re-start the picking loop.
				5365	*/
				5366	lockdep_unpin_lock(&rq->lock);
				5367	new_tasks = idle_balance(rq);
				5368	lockdep_pin_lock(&rq->lock);
				5369	/*
				5370	* Because idle_balance() releases (and re-acquires) rq->lock, it is
				5371	* possible for any higher priority task to appear. In that case we
				5372	* must re-start the pick_next_entity() loop.
				5373	*/
				5374	if (new_tasks < 0)
				5375	return RETRY_TASK;
				5376
				5377	if (new_tasks > 0)
				5378	goto again;
				5379
				5380	return NULL;
				5381	}
				5382
				5383	/*
				5384	* Account for a descheduled task:
				5385	*/
				5386	static void put_prev_task_fair(struct rq rq, struct task_struct prev)
				5387	{
				5388	struct sched_entity *se = &prev->se;
				5389	struct cfs_rq *cfs_rq;
				5390
				5391	for_each_sched_entity(se) {
				5392	cfs_rq = cfs_rq_of(se);
				5393	put_prev_entity(cfs_rq, se);
				5394	}
				5395	}
				5396
				5397	/*
				5398	* sched_yield() is very simple
				5399	*
				5400	* The magic of dealing with the ->skip buddy is in pick_next_entity.
				5401	*/
				5402	static void yield_task_fair(struct rq *rq)
				5403	{
				5404	struct task_struct *curr = rq->curr;
				5405	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
				5406	struct sched_entity *se = &curr->se;
				5407
				5408	/*
				5409	* Are we the only task in the tree?
				5410	*/
				5411	if (unlikely(rq->nr_running == 1))
				5412	return;
				5413
				5414	clear_buddies(cfs_rq, se);
				5415
				5416	if (curr->policy != SCHED_BATCH) {
				5417	update_rq_clock(rq);
				5418	/*
				5419	* Update run-time statistics of the 'current'.
				5420	*/
				5421	update_curr(cfs_rq);
				5422	/*
				5423	* Tell update_rq_clock() that we've just updated,
				5424	* so we don't do microscopic update in schedule()
				5425	* and double the fastpath cost.
				5426	*/
				5427	rq_clock_skip_update(rq, true);
				5428	}
				5429
				5430	set_skip_buddy(se);
				5431	}
				5432
				5433	static bool yield_to_task_fair(struct rq rq, struct task_struct p, bool preempt)
				5434	{
				5435	struct sched_entity *se = &p->se;
				5436
				5437	/* throttled hierarchies are not runnable */
				5438	if (!se->on_rq \|\| throttled_hierarchy(cfs_rq_of(se)))
				5439	return false;
				5440
				5441	/* Tell the scheduler that we'd really like pse to run next. */
				5442	set_next_buddy(se);
				5443
				5444	yield_task_fair(rq);
				5445
				5446	return true;
				5447	}
				5448
				5449	#ifdef CONFIG_SMP
				5450	/**************************************************
				5451	* Fair scheduling class load-balancing methods.
				5452	*
				5453	* BASICS
				5454	*
				5455	* The purpose of load-balancing is to achieve the same basic fairness the
				5456	* per-cpu scheduler provides, namely provide a proportional amount of compute
				5457	* time to each task. This is expressed in the following equation:
				5458	*
				5459	* W_i,n/C_i == W_j,n/C_j for all i,j (1)
				5460	*
				5461	* Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
				5462	* W_i,0 is defined as:
				5463	*
				5464	* W_i,0 = \Sum_j w_i,j (2)
				5465	*
				5466	* Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
				5467	* is derived from the nice value as per prio_to_weight[].
				5468	*
				5469	* The weight average is an exponential decay average of the instantaneous
				5470	* weight:
				5471	*
				5472	* W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
				5473	*
				5474	* C_i is the compute capacity of cpu i, typically it is the
				5475	* fraction of 'recent' time available for SCHED_OTHER task execution. But it
				5476	* can also include other factors [XXX].
				5477	*
				5478	* To achieve this balance we define a measure of imbalance which follows
				5479	* directly from (1):
				5480	*
				5481	* imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j } (4)
				5482	*
				5483	* We then move tasks around to minimize the imbalance. In the continuous
				5484	* function space it is obvious this converges, in the discrete case we get
				5485	* a few fun cases generally called infeasible weight scenarios.
				5486	*
				5487	* [XXX expand on:
				5488	* - infeasible weights;
				5489	* - local vs global optima in the discrete case. ]
				5490	*
				5491	*
				5492	* SCHED DOMAINS
				5493	*
				5494	* In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
				5495	* for all i,j solution, we create a tree of cpus that follows the hardware
				5496	* topology where each level pairs two lower groups (or better). This results
				5497	* in O(log n) layers. Furthermore we reduce the number of cpus going up the
				5498	* tree to only the first of the previous level and we decrease the frequency
				5499	* of load-balance at each level inv. proportional to the number of cpus in
				5500	* the groups.
				5501	*
				5502	* This yields:
				5503	*
				5504	*   log_2 n     1     n
				5505	*   \Sum     { --- * --- * 2^i } = O(n)                            (5)
				5506	*   i = 0      2^i   2^i
				5507	*                           `- size of each group
				5508	*     |         |     `- number of cpus doing load-balance
				5509	*     |         `- freq
				5510	*     `- sum over all levels
				5511	*
				5512	* Coupled with a limit on how many tasks we can migrate every balance pass,
				5513	* this makes (5) the runtime complexity of the balancer.
				5514	*
				5515	* An important property here is that each CPU is still (indirectly) connected
				5516	* to every other cpu in at most O(log n) steps:
				5517	*
				5518	* The adjacency matrix of the resulting graph is given by:
				5519	*
				5520	*             log_2 n
				5521	*   A_i,j = \Union     (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1)  (6)
				5522	*             k = 0
				5523	*
				5524	* And you'll find that:
				5525	*
				5526	* A^(log_2 n)_i,j != 0 for all i,j (7)
				5527	*
				5528	* Showing there's indeed a path between every cpu in at most O(log n) steps.
				5529	* The task movement gives a factor of O(m), giving a convergence complexity
				5530	* of:
				5531	*
				5532	* O(nm log n), n := nr_cpus, m := nr_tasks (8)
				5533	*
				5534	*
				5535	* WORK CONSERVING
				5536	*
				5537	* In order to avoid CPUs going idle while there's still work to do, new idle
				5538	* balancing is more aggressive and has the newly idle cpu iterate up the domain
				5539	* tree itself instead of relying on other CPUs to bring it work.
				5540	*
				5541	* This adds some complexity to both (5) and (8) but it reduces the total idle
				5542	* time.
				5543	*
				5544	* [XXX more?]
				5545	*
				5546	*
				5547	* CGROUPS
				5548	*
				5549	* Cgroups make a horror show out of (2), instead of a simple sum we get:
				5550	*
				5551	*                                s_k,i
				5552	*   W_i,0 = \Sum_j \Prod_k w_k * -----                               (9)
				5553	*                                 S_k
				5554	*
				5555	* Where
				5556	*
				5557	* s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
				5558	*
				5559	* w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
				5560	*
				5561	* The big problem is S_k, it's a global sum needed to compute a local (W_i)
				5562	* property.
				5563	*
				5564	* [XXX write more on how we solve this.. _after_ merging pjt's patches that
				5565	* rewrite all of this once again.]
				5566	*/
				5567
				5568	static unsigned long __read_mostly max_load_balance_interval = HZ/10;
				5569
				5570	enum fbq_type { regular, remote, all };
				5571
				5572	#define LBF_ALL_PINNED 0x01
				5573	#define LBF_NEED_BREAK 0x02
				5574	#define LBF_DST_PINNED 0x04
				5575	#define LBF_SOME_PINNED 0x08
				5576
				5577	struct lb_env {
				5578	struct sched_domain *sd;
				5579
				5580	struct rq *src_rq;
				5581	int src_cpu;
				5582
				5583	int dst_cpu;
				5584	struct rq *dst_rq;
				5585
				5586	struct cpumask *dst_grpmask;
				5587	int new_dst_cpu;
				5588	enum cpu_idle_type idle;
				5589	long imbalance;
				5590	/* The set of CPUs under consideration for load-balancing */
				5591	struct cpumask *cpus;
				5592
				5593	unsigned int flags;
				5594
				5595	unsigned int loop;
				5596	unsigned int loop_break;
				5597	unsigned int loop_max;
				5598
				5599	enum fbq_type fbq_type;
				5600	struct list_head tasks;
				5601	};
				5602
				5603	/*
				5604	* Is this task likely cache-hot:
				5605	*/
				5606	static int task_hot(struct task_struct p, struct lb_env env)
				5607	{
				5608	s64 delta;
				5609
				5610	lockdep_assert_held(&env->src_rq->lock);
				5611
				5612	if (p->sched_class != &fair_sched_class)
				5613	return 0;
				5614
				5615	if (unlikely(p->policy == SCHED_IDLE))
				5616	return 0;
				5617
				5618	/*
				5619	* Buddy candidates are cache hot:
				5620	*/
				5621	if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
				5622	(&p->se == cfs_rq_of(&p->se)->next \|\|
				5623	&p->se == cfs_rq_of(&p->se)->last))
				5624	return 1;
				5625
				5626	if (sysctl_sched_migration_cost == -1)
				5627	return 1;
				5628	if (sysctl_sched_migration_cost == 0)
				5629	return 0;
				5630
				5631	delta = rq_clock_task(env->src_rq) - p->se.exec_start;
				5632
				5633	return delta < (s64)sysctl_sched_migration_cost;
				5634	}
				5635
				#ifdef CONFIG_NUMA_BALANCING
				/*
				 * migrate_degrades_locality() - NUMA locality verdict for moving @p from
				 * env->src_cpu to env->dst_cpu.
				 *
				 * Returns 1, if task migration degrades locality
				 * Returns 0, if task migration improves locality i.e migration preferred.
				 * Returns -1, if task migration is not affected by locality.
				 */
				static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
				{
					struct numa_group *numa_group = rcu_dereference(p->numa_group);
					unsigned long src_faults, dst_faults;
					int src_nid, dst_nid;

					if (!static_branch_likely(&sched_numa_balancing))
						return -1;

					if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
						return -1;

					src_nid = cpu_to_node(env->src_cpu);
					dst_nid = cpu_to_node(env->dst_cpu);

					/* Moving within one node cannot change locality. */
					if (src_nid == dst_nid)
						return -1;

					/* Migrating away from the preferred node is always bad. */
					if (src_nid == p->numa_preferred_nid) {
						if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
							return 1;
						else
							return -1;
					}

					/* Encourage migration to the preferred node. */
					if (dst_nid == p->numa_preferred_nid)
						return 0;

					/* Otherwise compare fault counts, group-wide when in a numa_group. */
					if (numa_group) {
						src_faults = group_faults(p, src_nid);
						dst_faults = group_faults(p, dst_nid);
					} else {
						src_faults = task_faults(p, src_nid);
						dst_faults = task_faults(p, dst_nid);
					}

					return dst_faults < src_faults;
				}

				#else
				/* Without NUMA balancing, locality never influences the decision. */
				static inline int migrate_degrades_locality(struct task_struct *p,
									     struct lb_env *env)
				{
					return -1;
				}
				#endif
				5690
				5691	/*
				5692	* can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
				5693	*/
				5694	static
				5695	int can_migrate_task(struct task_struct p, struct lb_env env)
				5696	{
				5697	int tsk_cache_hot;
				5698
				5699	lockdep_assert_held(&env->src_rq->lock);
				5700
				5701	/*
				5702	* We do not migrate tasks that are:
				5703	* 1) throttled_lb_pair, or
				5704	* 2) cannot be migrated to this CPU due to cpus_allowed, or
				5705	* 3) running (obviously), or
				5706	* 4) are cache-hot on their current CPU.
				5707	*/
				5708	if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
				5709	return 0;
				5710
				5711	if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
				5712	int cpu;
				5713
				5714	schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
				5715
				5716	env->flags \|= LBF_SOME_PINNED;
				5717
				5718	/*
				5719	* Remember if this task can be migrated to any other cpu in
				5720	* our sched_group. We may want to revisit it if we couldn't
				5721	* meet load balance goals by pulling other tasks on src_cpu.
				5722	*
				5723	* Also avoid computing new_dst_cpu if we have already computed
				5724	* one in current iteration.
				5725	*/
				5726	if (!env->dst_grpmask \|\| (env->flags & LBF_DST_PINNED))
				5727	return 0;
				5728
				5729	/* Prevent to re-select dst_cpu via env's cpus */
				5730	for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
				5731	if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
				5732	env->flags \|= LBF_DST_PINNED;
				5733	env->new_dst_cpu = cpu;
				5734	break;
				5735	}
				5736	}
				5737
				5738	return 0;
				5739	}
				5740
				5741	/* Record that we found atleast one task that could run on dst_cpu */
				5742	env->flags &= ~LBF_ALL_PINNED;
				5743
				5744	if (task_running(env->src_rq, p)) {
				5745	schedstat_inc(p, se.statistics.nr_failed_migrations_running);
				5746	return 0;
				5747	}
				5748
				5749	/*
				5750	* Aggressive migration if:
				5751	* 1) destination numa is preferred
				5752	* 2) task is cache cold, or
				5753	* 3) too many balance attempts have failed.
				5754	*/
				5755	tsk_cache_hot = migrate_degrades_locality(p, env);
				5756	if (tsk_cache_hot == -1)
				5757	tsk_cache_hot = task_hot(p, env);
				5758
				5759	if (tsk_cache_hot <= 0 \|\|
				5760	env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
				5761	if (tsk_cache_hot == 1) {
				5762	schedstat_inc(env->sd, lb_hot_gained[env->idle]);
				5763	schedstat_inc(p, se.statistics.nr_forced_migrations);
				5764	}
				5765	return 1;
				5766	}
				5767
				5768	schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
				5769	return 0;
				5770	}
				5771
				5772	/*
				5773	* detach_task() -- detach the task for the migration specified in env
				5774	*/
				5775	static void detach_task(struct task_struct p, struct lb_env env)
				5776	{
				5777	lockdep_assert_held(&env->src_rq->lock);
				5778
				5779	deactivate_task(env->src_rq, p, 0);
				5780	p->on_rq = TASK_ON_RQ_MIGRATING;
				5781	set_task_cpu(p, env->dst_cpu);
				5782	}
				5783
				5784	/*
				5785	* detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
				5786	* part of active balancing operations within "domain".
				5787	*
				5788	* Returns a task if successful and NULL otherwise.
				5789	*/
				5790	static struct task_struct detach_one_task(struct lb_env env)
				5791	{
				5792	struct task_struct p, n;
				5793
				5794	lockdep_assert_held(&env->src_rq->lock);
				5795
				5796	list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
				5797	if (!can_migrate_task(p, env))
				5798	continue;
				5799
				5800	detach_task(p, env);
				5801
				5802	/*
				5803	* Right now, this is only the second place where
				5804	* lb_gained[env->idle] is updated (other is detach_tasks)
				5805	* so we can safely collect stats here rather than
				5806	* inside detach_tasks().
				5807	*/
				5808	schedstat_inc(env->sd, lb_gained[env->idle]);
				5809	return p;
				5810	}
				5811	return NULL;
				5812	}
				5813
				5814	static const unsigned int sched_nr_migrate_break = 32;
				5815
				5816	/*
				5817	* detach_tasks() -- tries to detach up to imbalance weighted load from
				5818	* busiest_rq, as part of a balancing operation within domain "sd".
				5819	*
				5820	* Returns number of detached tasks if successful and 0 otherwise.
				5821	*/
				5822	static int detach_tasks(struct lb_env *env)
				5823	{
				5824	struct list_head *tasks = &env->src_rq->cfs_tasks;
				5825	struct task_struct *p;
				5826	unsigned long load;
				5827	int detached = 0;
				5828
				5829	lockdep_assert_held(&env->src_rq->lock);
				5830
				5831	if (env->imbalance <= 0)
				5832	return 0;
				5833
				5834	while (!list_empty(tasks)) {
				5835	/*
				5836	* We don't want to steal all, otherwise we may be treated likewise,
				5837	* which could at worst lead to a livelock crash.
				5838	*/
				5839	if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
				5840	break;
				5841
				5842	p = list_first_entry(tasks, struct task_struct, se.group_node);
				5843
				5844	env->loop++;
				5845	/* We've more or less seen every task there is, call it quits */
				5846	if (env->loop > env->loop_max)
				5847	break;
				5848
				5849	/* take a breather every nr_migrate tasks */
				5850	if (env->loop > env->loop_break) {
				5851	env->loop_break += sched_nr_migrate_break;
				5852	env->flags \|= LBF_NEED_BREAK;
				5853	break;
				5854	}
				5855
				5856	if (!can_migrate_task(p, env))
				5857	goto next;
				5858
				5859	load = task_h_load(p);
				5860
				5861	if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
				5862	goto next;
				5863
				5864	if ((load / 2) > env->imbalance)
				5865	goto next;
				5866
				5867	detach_task(p, env);
				5868	list_add(&p->se.group_node, &env->tasks);
				5869
				5870	detached++;
				5871	env->imbalance -= load;
				5872
				5873	#ifdef CONFIG_PREEMPT
				5874	/*
				5875	* NEWIDLE balancing is a source of latency, so preemptible
				5876	* kernels will stop after the first task is detached to minimize
				5877	* the critical section.
				5878	*/
				5879	if (env->idle == CPU_NEWLY_IDLE)
				5880	break;
				5881	#endif
				5882
				5883	/*
				5884	* We only want to steal up to the prescribed amount of
				5885	* weighted load.
				5886	*/
				5887	if (env->imbalance <= 0)
				5888	break;
				5889
				5890	continue;
				5891	next:
				5892	list_move_tail(&p->se.group_node, tasks);
				5893	}
				5894
				5895	/*
				5896	* Right now, this is one of only two places we collect this stat
				5897	* so we can safely collect detach_one_task() stats here rather
				5898	* than inside detach_one_task().
				5899	*/
				5900	schedstat_add(env->sd, lb_gained[env->idle], detached);
				5901
				5902	return detached;
				5903	}
				5904
				5905	/*
				5906	* attach_task() -- attach the task detached by detach_task() to its new rq.
				5907	*/
				5908	static void attach_task(struct rq rq, struct task_struct p)
				5909	{
				5910	lockdep_assert_held(&rq->lock);
				5911
				5912	BUG_ON(task_rq(p) != rq);
				5913	p->on_rq = TASK_ON_RQ_QUEUED;
				5914	activate_task(rq, p, 0);
				5915	check_preempt_curr(rq, p, 0);
				5916	}
				5917
				5918	/*
				5919	* attach_one_task() -- attaches the task returned from detach_one_task() to
				5920	* its new rq.
				5921	*/
				5922	static void attach_one_task(struct rq rq, struct task_struct p)
				5923	{
				5924	raw_spin_lock(&rq->lock);
				5925	attach_task(rq, p);
				5926	raw_spin_unlock(&rq->lock);
				5927	}
				5928
				5929	/*
				5930	* attach_tasks() -- attaches all tasks detached by detach_tasks() to their
				5931	* new rq.
				5932	*/
				5933	static void attach_tasks(struct lb_env *env)
				5934	{
				5935	struct list_head *tasks = &env->tasks;
				5936	struct task_struct *p;
				5937
				5938	raw_spin_lock(&env->dst_rq->lock);
				5939
				5940	while (!list_empty(tasks)) {
				5941	p = list_first_entry(tasks, struct task_struct, se.group_node);
				5942	list_del_init(&p->se.group_node);
				5943
				5944	attach_task(env->dst_rq, p);
				5945	}
				5946
				5947	raw_spin_unlock(&env->dst_rq->lock);
				5948	}
				5949
				5950	#ifdef CONFIG_FAIR_GROUP_SCHED
				5951	static void update_blocked_averages(int cpu)
				5952	{
				5953	struct rq *rq = cpu_rq(cpu);
				5954	struct cfs_rq *cfs_rq;
				5955	unsigned long flags;
				5956
				5957	raw_spin_lock_irqsave(&rq->lock, flags);
				5958	update_rq_clock(rq);
				5959
				5960	/*
				5961	* Iterates the task_group tree in a bottom up fashion, see
				5962	* list_add_leaf_cfs_rq() for details.
				5963	*/
				5964	for_each_leaf_cfs_rq(rq, cfs_rq) {
				5965	/* throttled entities do not contribute to load */
				5966	if (throttled_hierarchy(cfs_rq))
				5967	continue;
				5968
				5969	if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
				5970	update_tg_load_avg(cfs_rq, 0);
				5971	}
				5972	raw_spin_unlock_irqrestore(&rq->lock, flags);
				5973	}
				5974
				5975	/*
				5976	* Compute the hierarchical load factor for cfs_rq and all its ascendants.
				5977	* This needs to be done in a top-down fashion because the load of a child
				5978	* group is a fraction of its parents load.
				5979	*/
				5980	static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
				5981	{
				5982	struct rq *rq = rq_of(cfs_rq);
				5983	struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
				5984	unsigned long now = jiffies;
				5985	unsigned long load;
				5986
				5987	if (cfs_rq->last_h_load_update == now)
				5988	return;
				5989
				5990	cfs_rq->h_load_next = NULL;
				5991	for_each_sched_entity(se) {
				5992	cfs_rq = cfs_rq_of(se);
				5993	cfs_rq->h_load_next = se;
				5994	if (cfs_rq->last_h_load_update == now)
				5995	break;
				5996	}
				5997
				5998	if (!se) {
				5999	cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
				6000	cfs_rq->last_h_load_update = now;
				6001	}
				6002
				6003	while ((se = cfs_rq->h_load_next) != NULL) {
				6004	load = cfs_rq->h_load;
				6005	load = div64_ul(load * se->avg.load_avg,
				6006	cfs_rq_load_avg(cfs_rq) + 1);
				6007	cfs_rq = group_cfs_rq(se);
				6008	cfs_rq->h_load = load;
				6009	cfs_rq->last_h_load_update = now;
				6010	}
				6011	}
				6012
				6013	static unsigned long task_h_load(struct task_struct *p)
				6014	{
				6015	struct cfs_rq *cfs_rq = task_cfs_rq(p);
				6016
				6017	update_cfs_rq_h_load(cfs_rq);
				6018	return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
				6019	cfs_rq_load_avg(cfs_rq) + 1);
				6020	}
				6021	#else
				6022	static inline void update_blocked_averages(int cpu)
				6023	{
				6024	struct rq *rq = cpu_rq(cpu);
				6025	struct cfs_rq *cfs_rq = &rq->cfs;
				6026	unsigned long flags;
				6027
				6028	raw_spin_lock_irqsave(&rq->lock, flags);
				6029	update_rq_clock(rq);
				6030	update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
				6031	raw_spin_unlock_irqrestore(&rq->lock, flags);
				6032	}
				6033
				6034	static unsigned long task_h_load(struct task_struct *p)
				6035	{
				6036	return p->se.avg.load_avg;
				6037	}
				6038	#endif
				6039
				6040	/******** Helpers for find_busiest_group **********************/
				6041
				/*
				 * Classification of a sched_group, assigned by group_classify().
				 * update_sd_pick_busiest() compares these numerically, so a larger
				 * value takes precedence when picking the busiest group.
				 */
				enum group_type {
					group_other = 0,	/* neither of the cases below */
					group_imbalanced,	/* sg_imbalanced() reported an imbalance */
					group_overloaded,	/* sgs->group_no_capacity is set */
				};
				6047
				6048	/*
				6049	* sg_lb_stats - stats of a sched_group required for load_balancing
				6050	*/
				6051	struct sg_lb_stats {
				6052	unsigned long avg_load; /Avg load across the CPUs of the group /
				6053	unsigned long group_load; /* Total load over the CPUs of the group */
				6054	unsigned long sum_weighted_load; /* Weighted load of group's tasks */
				6055	unsigned long load_per_task;
				6056	unsigned long group_capacity;
				6057	unsigned long group_util; /* Total utilization of the group */
				6058	unsigned int sum_nr_running; /* Nr tasks running in the group */
				6059	unsigned int idle_cpus;
				6060	unsigned int group_weight;
				6061	enum group_type group_type;
				6062	int group_no_capacity;
				6063	#ifdef CONFIG_NUMA_BALANCING
				6064	unsigned int nr_numa_running;
				6065	unsigned int nr_preferred_running;
				6066	#endif
				6067	};
				6068
				6069	/*
				6070	* sd_lb_stats - Structure to store the statistics of a sched_domain
				6071	* during load balancing.
				6072	*/
				6073	struct sd_lb_stats {
				6074	struct sched_group busiest; / Busiest group in this sd */
				6075	struct sched_group local; / Local group in this sd */
				6076	unsigned long total_load; /* Total load of all groups in sd */
				6077	unsigned long total_capacity; /* Total capacity of all groups in sd */
				6078	unsigned long avg_load; /* Average load across all groups in sd */
				6079
				6080	struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
				6081	struct sg_lb_stats local_stat; /* Statistics of the local group */
				6082	};
				6083
				6084	static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
				6085	{
				6086	/*
				6087	* Skimp on the clearing to avoid duplicate work. We can avoid clearing
				6088	* local_stat because update_sg_lb_stats() does a full clear/assignment.
				6089	* We must however clear busiest_stat::avg_load because
				6090	* update_sd_pick_busiest() reads this before assignment.
				6091	*/
				6092	*sds = (struct sd_lb_stats){
				6093	.busiest = NULL,
				6094	.local = NULL,
				6095	.total_load = 0UL,
				6096	.total_capacity = 0UL,
				6097	.busiest_stat = {
				6098	.avg_load = 0UL,
				6099	.sum_nr_running = 0,
				6100	.group_type = group_other,
				6101	},
				6102	};
				6103	}
				6104
				6105	/**
				6106	* get_sd_load_idx - Obtain the load index for a given sched domain.
				6107	* @sd: The sched_domain whose load_idx is to be obtained.
				6108	* @idle: The idle status of the CPU for whose sd load_idx is obtained.
				6109	*
				6110	* Return: The load index.
				6111	*/
				6112	static inline int get_sd_load_idx(struct sched_domain *sd,
				6113	enum cpu_idle_type idle)
				6114	{
				6115	int load_idx;
				6116
				6117	switch (idle) {
				6118	case CPU_NOT_IDLE:
				6119	load_idx = sd->busy_idx;
				6120	break;
				6121
				6122	case CPU_NEWLY_IDLE:
				6123	load_idx = sd->newidle_idx;
				6124	break;
				6125	default:
				6126	load_idx = sd->idle_idx;
				6127	break;
				6128	}
				6129
				6130	return load_idx;
				6131	}
				6132
				6133	static unsigned long scale_rt_capacity(int cpu)
				6134	{
				6135	struct rq *rq = cpu_rq(cpu);
				6136	u64 total, used, age_stamp, avg;
				6137	s64 delta;
				6138
				6139	/*
				6140	* Since we're reading these variables without serialization make sure
				6141	* we read them once before doing sanity checks on them.
				6142	*/
				6143	age_stamp = READ_ONCE(rq->age_stamp);
				6144	avg = READ_ONCE(rq->rt_avg);
				6145	delta = __rq_clock_broken(rq) - age_stamp;
				6146
				6147	if (unlikely(delta < 0))
				6148	delta = 0;
				6149
				6150	total = sched_avg_period() + delta;
				6151
				6152	used = div_u64(avg, total);
				6153
				6154	if (likely(used < SCHED_CAPACITY_SCALE))
				6155	return SCHED_CAPACITY_SCALE - used;
				6156
				6157	return 1;
				6158	}
				6159
				6160	static void update_cpu_capacity(struct sched_domain *sd, int cpu)
				6161	{
				6162	unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
				6163	struct sched_group *sdg = sd->groups;
				6164
				6165	cpu_rq(cpu)->cpu_capacity_orig = capacity;
				6166
				6167	capacity *= scale_rt_capacity(cpu);
				6168	capacity >>= SCHED_CAPACITY_SHIFT;
				6169
				6170	if (!capacity)
				6171	capacity = 1;
				6172
				6173	cpu_rq(cpu)->cpu_capacity = capacity;
				6174	sdg->sgc->capacity = capacity;
				6175	}
				6176
				6177	void update_group_capacity(struct sched_domain *sd, int cpu)
				6178	{
				6179	struct sched_domain *child = sd->child;
				6180	struct sched_group group, sdg = sd->groups;
				6181	unsigned long capacity;
				6182	unsigned long interval;
				6183
				6184	interval = msecs_to_jiffies(sd->balance_interval);
				6185	interval = clamp(interval, 1UL, max_load_balance_interval);
				6186	sdg->sgc->next_update = jiffies + interval;
				6187
				6188	if (!child) {
				6189	update_cpu_capacity(sd, cpu);
				6190	return;
				6191	}
				6192
				6193	capacity = 0;
				6194
				6195	if (child->flags & SD_OVERLAP) {
				6196	/*
				6197	* SD_OVERLAP domains cannot assume that child groups
				6198	* span the current group.
				6199	*/
				6200
				6201	for_each_cpu(cpu, sched_group_cpus(sdg)) {
				6202	struct sched_group_capacity *sgc;
				6203	struct rq *rq = cpu_rq(cpu);
				6204
				6205	/*
				6206	* build_sched_domains() -> init_sched_groups_capacity()
				6207	* gets here before we've attached the domains to the
				6208	* runqueues.
				6209	*
				6210	* Use capacity_of(), which is set irrespective of domains
				6211	* in update_cpu_capacity().
				6212	*
				6213	* This avoids capacity from being 0 and
				6214	* causing divide-by-zero issues on boot.
				6215	*/
				6216	if (unlikely(!rq->sd)) {
				6217	capacity += capacity_of(cpu);
				6218	continue;
				6219	}
				6220
				6221	sgc = rq->sd->groups->sgc;
				6222	capacity += sgc->capacity;
				6223	}
				6224	} else {
				6225	/*
				6226	* !SD_OVERLAP domains can assume that child groups
				6227	* span the current group.
				6228	*/
				6229
				6230	group = child->groups;
				6231	do {
				6232	capacity += group->sgc->capacity;
				6233	group = group->next;
				6234	} while (group != child->groups);
				6235	}
				6236
				6237	sdg->sgc->capacity = capacity;
				6238	}
				6239
				6240	/*
				6241	* Check whether the capacity of the rq has been noticeably reduced by side
				6242	* activity. The imbalance_pct is used for the threshold.
				6243	* Return true is the capacity is reduced
				6244	*/
				6245	static inline int
				6246	check_cpu_capacity(struct rq rq, struct sched_domain sd)
				6247	{
				6248	return ((rq->cpu_capacity * sd->imbalance_pct) <
				6249	(rq->cpu_capacity_orig * 100));
				6250	}
				6251
				6252	/*
				6253	* Group imbalance indicates (and tries to solve) the problem where balancing
				6254	* groups is inadequate due to tsk_cpus_allowed() constraints.
				6255	*
				6256	* Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
				6257	* cpumask covering 1 cpu of the first group and 3 cpus of the second group.
				6258	* Something like:
				6259	*
				6260	*	{ 0 1 2 3 } { 4 5 6 7 }
				6261	*	        *     * * *
				6262	*
				6263	* If we were to balance group-wise we'd place two tasks in the first group and
				6264	* two tasks in the second group. Clearly this is undesired as it will overload
				6265	* cpu 3 and leave one of the cpus in the second group unused.
				6266	*
				6267	* The current solution to this issue is detecting the skew in the first group
				6268	* by noticing the lower domain failed to reach balance and had difficulty
				6269	* moving tasks due to affinity constraints.
				6270	*
				6271	* When this is so detected; this group becomes a candidate for busiest; see
				6272	* update_sd_pick_busiest(). And calculate_imbalance() and
				6273	* find_busiest_group() avoid some of the usual balance conditions to allow it
				6274	* to create an effective group imbalance.
				6275	*
				6276	* This is a somewhat tricky proposition since the next run might not find the
				6277	* group imbalance and decide the groups need to be balanced again. A most
				6278	* subtle and fragile situation.
				6279	*/
				6280
				6281	static inline int sg_imbalanced(struct sched_group *group)
				6282	{
				6283	return group->sgc->imbalance;
				6284	}
				6285
				6286	/*
				6287	* group_has_capacity returns true if the group has spare capacity that could
				6288	* be used by some tasks.
				6289	* We consider that a group has spare capacity if the * number of task is
				6290	* smaller than the number of CPUs or if the utilization is lower than the
				6291	* available capacity for CFS tasks.
				6292	* For the latter, we use a threshold to stabilize the state, to take into
				6293	* account the variance of the tasks' load and to return true if the available
				6294	* capacity in meaningful for the load balancer.
				6295	* As an example, an available capacity of 1% can appear but it doesn't make
				6296	* any benefit for the load balance.
				6297	*/
				6298	static inline bool
				6299	group_has_capacity(struct lb_env env, struct sg_lb_stats sgs)
				6300	{
				6301	if (sgs->sum_nr_running < sgs->group_weight)
				6302	return true;
				6303
				6304	if ((sgs->group_capacity * 100) >
				6305	(sgs->group_util * env->sd->imbalance_pct))
				6306	return true;
				6307
				6308	return false;
				6309	}
				6310
				6311	/*
				6312	* group_is_overloaded returns true if the group has more tasks than it can
				6313	* handle.
				6314	* group_is_overloaded is not equals to !group_has_capacity because a group
				6315	* with the exact right number of tasks, has no more spare capacity but is not
				6316	* overloaded so both group_has_capacity and group_is_overloaded return
				6317	* false.
				6318	*/
				6319	static inline bool
				6320	group_is_overloaded(struct lb_env env, struct sg_lb_stats sgs)
				6321	{
				6322	if (sgs->sum_nr_running <= sgs->group_weight)
				6323	return false;
				6324
				6325	if ((sgs->group_capacity * 100) <
				6326	(sgs->group_util * env->sd->imbalance_pct))
				6327	return true;
				6328
				6329	return false;
				6330	}
				6331
				6332	static inline enum
				6333	group_type group_classify(struct sched_group *group,
				6334	struct sg_lb_stats *sgs)
				6335	{
				6336	if (sgs->group_no_capacity)
				6337	return group_overloaded;
				6338
				6339	if (sg_imbalanced(group))
				6340	return group_imbalanced;
				6341
				6342	return group_other;
				6343	}
				6344
				6345	/**
				6346	* update_sg_lb_stats - Update sched_group's statistics for load balancing.
				6347	* @env: The load balancing environment.
				6348	* @group: sched_group whose statistics are to be updated.
				6349	* @load_idx: Load index of sched_domain of this_cpu for load calc.
				6350	* @local_group: Does group contain this_cpu.
				6351	* @sgs: variable to hold the statistics for this group.
				6352	* @overload: Indicate more than one runnable task for any CPU.
				6353	*/
				6354	static inline void update_sg_lb_stats(struct lb_env *env,
				6355	struct sched_group *group, int load_idx,
				6356	int local_group, struct sg_lb_stats *sgs,
				6357	bool *overload)
				6358	{
				6359	unsigned long load;
				6360	int i;
				6361
				6362	memset(sgs, 0, sizeof(*sgs));
				6363
				6364	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
				6365	struct rq *rq = cpu_rq(i);
				6366
				6367	/* Bias balancing toward cpus of our domain */
				6368	if (local_group)
				6369	load = target_load(i, load_idx);
				6370	else
				6371	load = source_load(i, load_idx);
				6372
				6373	sgs->group_load += load;
				6374	sgs->group_util += cpu_util(i);
				6375	sgs->sum_nr_running += rq->cfs.h_nr_running;
				6376
				6377	if (rq->nr_running > 1)
				6378	*overload = true;
				6379
				6380	#ifdef CONFIG_NUMA_BALANCING
				6381	sgs->nr_numa_running += rq->nr_numa_running;
				6382	sgs->nr_preferred_running += rq->nr_preferred_running;
				6383	#endif
				6384	sgs->sum_weighted_load += weighted_cpuload(i);
				6385	if (idle_cpu(i))
				6386	sgs->idle_cpus++;
				6387	}
				6388
				6389	/* Adjust by relative CPU capacity of the group */
				6390	sgs->group_capacity = group->sgc->capacity;
				6391	sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
				6392
				6393	if (sgs->sum_nr_running)
				6394	sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
				6395
				6396	sgs->group_weight = group->group_weight;
				6397
				6398	sgs->group_no_capacity = group_is_overloaded(env, sgs);
				6399	sgs->group_type = group_classify(group, sgs);
				6400	}
				6401
				6402	/**
				6403	* update_sd_pick_busiest - return 1 on busiest group
				6404	* @env: The load balancing environment.
				6405	* @sds: sched_domain statistics
				6406	* @sg: sched_group candidate to be checked for being the busiest
				6407	* @sgs: sched_group statistics
				6408	*
				6409	* Determine if @sg is a busier group than the previously selected
				6410	* busiest group.
				6411	*
				6412	* Return: %true if @sg is a busier group than the previously selected
				6413	* busiest group. %false otherwise.
				6414	*/
				6415	static bool update_sd_pick_busiest(struct lb_env *env,
				6416	struct sd_lb_stats *sds,
				6417	struct sched_group *sg,
				6418	struct sg_lb_stats *sgs)
				6419	{
				6420	struct sg_lb_stats *busiest = &sds->busiest_stat;
				6421
				6422	if (sgs->group_type > busiest->group_type)
				6423	return true;
				6424
				6425	if (sgs->group_type < busiest->group_type)
				6426	return false;
				6427
				6428	if (sgs->avg_load <= busiest->avg_load)
				6429	return false;
				6430
				6431	/* This is the busiest node in its class. */
				6432	if (!(env->sd->flags & SD_ASYM_PACKING))
				6433	return true;
				6434
				6435	/*
				6436	* ASYM_PACKING needs to move all the work to the lowest
				6437	* numbered CPUs in the group, therefore mark all groups
				6438	* higher than ourself as busy.
				6439	*/
				6440	if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) {
				6441	if (!sds->busiest)
				6442	return true;
				6443
				6444	if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
				6445	return true;
				6446	}
				6447
				6448	return false;
				6449	}
				6450
				6451	#ifdef CONFIG_NUMA_BALANCING
				6452	static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
				6453	{
				6454	if (sgs->sum_nr_running > sgs->nr_numa_running)
				6455	return regular;
				6456	if (sgs->sum_nr_running > sgs->nr_preferred_running)
				6457	return remote;
				6458	return all;
				6459	}
				6460
				6461	static inline enum fbq_type fbq_classify_rq(struct rq *rq)
				6462	{
				6463	if (rq->nr_running > rq->nr_numa_running)
				6464	return regular;
				6465	if (rq->nr_running > rq->nr_preferred_running)
				6466	return remote;
				6467	return all;
				6468	}
				6469	#else
				6470	static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
				6471	{
				6472	return all;
				6473	}
				6474
				6475	static inline enum fbq_type fbq_classify_rq(struct rq *rq)
				6476	{
				6477	return regular;
				6478	}
				6479	#endif /* CONFIG_NUMA_BALANCING */
				6480
				6481	/**
				6482	* update_sd_lb_stats - Update sched_domain's statistics for load balancing.
				6483	* @env: The load balancing environment.
				6484	* @sds: variable to hold the statistics for this sched_domain.
				6485	*/
				6486	static inline void update_sd_lb_stats(struct lb_env env, struct sd_lb_stats sds)
				6487	{
				6488	struct sched_domain *child = env->sd->child;
				6489	struct sched_group *sg = env->sd->groups;
				6490	struct sg_lb_stats tmp_sgs;
				6491	int load_idx, prefer_sibling = 0;
				6492	bool overload = false;
				6493
				6494	if (child && child->flags & SD_PREFER_SIBLING)
				6495	prefer_sibling = 1;
				6496
				6497	load_idx = get_sd_load_idx(env->sd, env->idle);
				6498
				6499	do {
				6500	struct sg_lb_stats *sgs = &tmp_sgs;
				6501	int local_group;
				6502
				6503	local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
				6504	if (local_group) {
				6505	sds->local = sg;
				6506	sgs = &sds->local_stat;
				6507
				6508	if (env->idle != CPU_NEWLY_IDLE \|\|
				6509	time_after_eq(jiffies, sg->sgc->next_update))
				6510	update_group_capacity(env->sd, env->dst_cpu);
				6511	}
				6512
				6513	update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
				6514	&overload);
				6515
				6516	if (local_group)
				6517	goto next_group;
				6518
				6519	/*
				6520	* In case the child domain prefers tasks go to siblings
				6521	* first, lower the sg capacity so that we'll try
				6522	* and move all the excess tasks away. We lower the capacity
				6523	* of a group only if the local group has the capacity to fit
				6524	* these excess tasks. The extra check prevents the case where
				6525	* you always pull from the heaviest group when it is already
				6526	* under-utilized (possible with a large weight task outweighs
				6527	* the tasks on the system).
				6528	*/
				6529	if (prefer_sibling && sds->local &&
				6530	group_has_capacity(env, &sds->local_stat) &&
				6531	(sgs->sum_nr_running > 1)) {
				6532	sgs->group_no_capacity = 1;
				6533	sgs->group_type = group_classify(sg, sgs);
				6534	}
				6535
				6536	if (update_sd_pick_busiest(env, sds, sg, sgs)) {
				6537	sds->busiest = sg;
				6538	sds->busiest_stat = *sgs;
				6539	}
				6540
				6541	next_group:
				6542	/* Now, start updating sd_lb_stats */
				6543	sds->total_load += sgs->group_load;
				6544	sds->total_capacity += sgs->group_capacity;
				6545
				6546	sg = sg->next;
				6547	} while (sg != env->sd->groups);
				6548
				6549	if (env->sd->flags & SD_NUMA)
				6550	env->fbq_type = fbq_classify_group(&sds->busiest_stat);
				6551
				6552	if (!env->sd->parent) {
				6553	/* update overload indicator if we are at root domain */
				6554	if (env->dst_rq->rd->overload != overload)
				6555	env->dst_rq->rd->overload = overload;
				6556	}
				6557
				6558	}
				6559
				6560	/**
				6561	* check_asym_packing - Check to see if the group is packed into the
				6562	* sched doman.
				6563	*
				6564	* This is primarily intended to used at the sibling level. Some
				6565	* cores like POWER7 prefer to use lower numbered SMT threads. In the
				6566	* case of POWER7, it can move to lower SMT modes only when higher
				6567	* threads are idle. When in lower SMT modes, the threads will
				6568	* perform better since they share less core resources. Hence when we
				6569	* have idle threads, we want them to be the higher ones.
				6570	*
				6571	* This packing function is run on idle threads. It checks to see if
				6572	* the busiest CPU in this domain (core in the P7 case) has a higher
				6573	* CPU number than the packing function is being run on. Here we are
				6574	* assuming lower CPU number will be equivalent to lower a SMT thread
				6575	* number.
				6576	*
				6577	* Return: 1 when packing is required and a task should be moved to
				6578	* this CPU. The amount of the imbalance is returned in *imbalance.
				6579	*
				6580	* @env: The load balancing environment.
				6581	* @sds: Statistics of the sched_domain which is to be packed
				6582	*/
				6583	static int check_asym_packing(struct lb_env env, struct sd_lb_stats sds)
				6584	{
				6585	int busiest_cpu;
				6586
				6587	if (!(env->sd->flags & SD_ASYM_PACKING))
				6588	return 0;
				6589
				6590	if (!sds->busiest)
				6591	return 0;
				6592
				6593	busiest_cpu = group_first_cpu(sds->busiest);
				6594	if (env->dst_cpu > busiest_cpu)
				6595	return 0;
				6596
				6597	env->imbalance = DIV_ROUND_CLOSEST(
				6598	sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity,
				6599	SCHED_CAPACITY_SCALE);
				6600
				6601	return 1;
				6602	}
				6603
				6604	/**
				6605	* fix_small_imbalance - Calculate the minor imbalance that exists
				6606	* amongst the groups of a sched_domain, during
				6607	* load balancing.
				6608	* @env: The load balancing environment.
				6609	* @sds: Statistics of the sched_domain whose imbalance is to be calculated.
				6610	*/
				6611	static inline
				6612	void fix_small_imbalance(struct lb_env env, struct sd_lb_stats sds)
				6613	{
				6614	unsigned long tmp, capa_now = 0, capa_move = 0;
				6615	unsigned int imbn = 2;
				6616	unsigned long scaled_busy_load_per_task;
				6617	struct sg_lb_stats local, busiest;
				6618
				6619	local = &sds->local_stat;
				6620	busiest = &sds->busiest_stat;
				6621
				6622	if (!local->sum_nr_running)
				6623	local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
				6624	else if (busiest->load_per_task > local->load_per_task)
				6625	imbn = 1;
				6626
				6627	scaled_busy_load_per_task =
				6628	(busiest->load_per_task * SCHED_CAPACITY_SCALE) /
				6629	busiest->group_capacity;
				6630
				6631	if (busiest->avg_load + scaled_busy_load_per_task >=
				6632	local->avg_load + (scaled_busy_load_per_task * imbn)) {
				6633	env->imbalance = busiest->load_per_task;
				6634	return;
				6635	}
				6636
				6637	/*
				6638	* OK, we don't have enough imbalance to justify moving tasks,
				6639	* however we may be able to increase total CPU capacity used by
				6640	* moving them.
				6641	*/
				6642
				6643	capa_now += busiest->group_capacity *
				6644	min(busiest->load_per_task, busiest->avg_load);
				6645	capa_now += local->group_capacity *
				6646	min(local->load_per_task, local->avg_load);
				6647	capa_now /= SCHED_CAPACITY_SCALE;
				6648
				6649	/* Amount of load we'd subtract */
				6650	if (busiest->avg_load > scaled_busy_load_per_task) {
				6651	capa_move += busiest->group_capacity *
				6652	min(busiest->load_per_task,
				6653	busiest->avg_load - scaled_busy_load_per_task);
				6654	}
				6655
				6656	/* Amount of load we'd add */
				6657	if (busiest->avg_load * busiest->group_capacity <
				6658	busiest->load_per_task * SCHED_CAPACITY_SCALE) {
				6659	tmp = (busiest->avg_load * busiest->group_capacity) /
				6660	local->group_capacity;
				6661	} else {
				6662	tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
				6663	local->group_capacity;
				6664	}
				6665	capa_move += local->group_capacity *
				6666	min(local->load_per_task, local->avg_load + tmp);
				6667	capa_move /= SCHED_CAPACITY_SCALE;
				6668
				6669	/* Move if we gain throughput */
				6670	if (capa_move > capa_now)
				6671	env->imbalance = busiest->load_per_task;
				6672	}
				6673
				6674	/**
				6675	* calculate_imbalance - Calculate the amount of imbalance present within the
				6676	* groups of a given sched_domain during load balance.
				6677	* @env: load balance environment
				6678	* @sds: statistics of the sched_domain whose imbalance is to be calculated.
				6679	*/
				6680	static inline void calculate_imbalance(struct lb_env env, struct sd_lb_stats sds)
				6681	{
				6682	unsigned long max_pull, load_above_capacity = ~0UL;
				6683	struct sg_lb_stats local, busiest;
				6684
				6685	local = &sds->local_stat;
				6686	busiest = &sds->busiest_stat;
				6687
				6688	if (busiest->group_type == group_imbalanced) {
				6689	/*
				6690	* In the group_imb case we cannot rely on group-wide averages
				6691	* to ensure cpu-load equilibrium, look at wider averages. XXX
				6692	*/
				6693	busiest->load_per_task =
				6694	min(busiest->load_per_task, sds->avg_load);
				6695	}
				6696
				6697	/*
				6698	* In the presence of smp nice balancing, certain scenarios can have
				6699	* max load less than avg load(as we skip the groups at or below
				6700	* its cpu_capacity, while calculating max_load..)
				6701	*/
				6702	if (busiest->avg_load <= sds->avg_load \|\|
				6703	local->avg_load >= sds->avg_load) {
				6704	env->imbalance = 0;
				6705	return fix_small_imbalance(env, sds);
				6706	}
				6707
				6708	/*
				6709	* If there aren't any idle cpus, avoid creating some.
				6710	*/
				6711	if (busiest->group_type == group_overloaded &&
				6712	local->group_type == group_overloaded) {
				6713	load_above_capacity = busiest->sum_nr_running *
				6714	SCHED_LOAD_SCALE;
				6715	if (load_above_capacity > busiest->group_capacity)
				6716	load_above_capacity -= busiest->group_capacity;
				6717	else
				6718	load_above_capacity = ~0UL;
				6719	}
				6720
				6721	/*
				6722	* We're trying to get all the cpus to the average_load, so we don't
				6723	* want to push ourselves above the average load, nor do we wish to
				6724	* reduce the max loaded cpu below the average load. At the same time,
				6725	* we also don't want to reduce the group load below the group capacity
				6726	* (so that we can implement power-savings policies etc). Thus we look
				6727	* for the minimum possible imbalance.
				6728	*/
				6729	max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
				6730
				6731	/* How much load to actually move to equalise the imbalance */
				6732	env->imbalance = min(
				6733	max_pull * busiest->group_capacity,
				6734	(sds->avg_load - local->avg_load) * local->group_capacity
				6735	) / SCHED_CAPACITY_SCALE;
				6736
				6737	/*
				6738	* if *imbalance is less than the average load per runnable task
				6739	* there is no guarantee that any tasks will be moved so we'll have
				6740	* a think about bumping its value to force at least one task to be
				6741	* moved
				6742	*/
				6743	if (env->imbalance < busiest->load_per_task)
				6744	return fix_small_imbalance(env, sds);
				6745	}
				6746
				6747	/***** find_busiest_group() helpers end here *******************/
				6748
				6749	/**
				6750	* find_busiest_group - Returns the busiest group within the sched_domain
				6751	* if there is an imbalance. If there isn't an imbalance, and
				6752	* the user has opted for power-savings, it returns a group whose
				6753	* CPUs can be put to idle by rebalancing those tasks elsewhere, if
				6754	* such a group exists.
				6755	*
				6756	* Also calculates the amount of weighted load which should be moved
				6757	* to restore balance.
				6758	*
				6759	* @env: The load balancing environment.
				6760	*
				6761	* Return: - The busiest group if imbalance exists.
				6762	* - If no imbalance and user has opted for power-savings balance,
				6763	* return the least loaded group whose CPUs can be
				6764	* put to idle by rebalancing its tasks onto our group.
				6765	*/
				6766	static struct sched_group find_busiest_group(struct lb_env env)
				6767	{
				6768	struct sg_lb_stats local, busiest;
				6769	struct sd_lb_stats sds;
				6770
				6771	init_sd_lb_stats(&sds);
				6772
				6773	/*
				6774	* Compute the various statistics relavent for load balancing at
				6775	* this level.
				6776	*/
				6777	update_sd_lb_stats(env, &sds);
				6778	local = &sds.local_stat;
				6779	busiest = &sds.busiest_stat;
				6780
				6781	/* ASYM feature bypasses nice load balance check */
				6782	if ((env->idle == CPU_IDLE \|\| env->idle == CPU_NEWLY_IDLE) &&
				6783	check_asym_packing(env, &sds))
				6784	return sds.busiest;
				6785
				6786	/* There is no busy sibling group to pull tasks from */
				6787	if (!sds.busiest \|\| busiest->sum_nr_running == 0)
				6788	goto out_balanced;
				6789
				6790	sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
				6791	/ sds.total_capacity;
				6792
				6793	/*
				6794	* If the busiest group is imbalanced the below checks don't
				6795	* work because they assume all things are equal, which typically
				6796	* isn't true due to cpus_allowed constraints and the like.
				6797	*/
				6798	if (busiest->group_type == group_imbalanced)
				6799	goto force_balance;
				6800
				6801	/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
				6802	if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) &&
				6803	busiest->group_no_capacity)
				6804	goto force_balance;
				6805
				6806	/*
				6807	* If the local group is busier than the selected busiest group
				6808	* don't try and pull any tasks.
				6809	*/
				6810	if (local->avg_load >= busiest->avg_load)
				6811	goto out_balanced;
				6812
				6813	/*
				6814	* Don't pull any tasks if this group is already above the domain
				6815	* average load.
				6816	*/
				6817	if (local->avg_load >= sds.avg_load)
				6818	goto out_balanced;
				6819
				6820	if (env->idle == CPU_IDLE) {
				6821	/*
				6822	* This cpu is idle. If the busiest group is not overloaded
				6823	* and there is no imbalance between this and busiest group
				6824	* wrt idle cpus, it is balanced. The imbalance becomes
				6825	* significant if the diff is greater than 1 otherwise we
				6826	* might end up to just move the imbalance on another group
				6827	*/
				6828	if ((busiest->group_type != group_overloaded) &&
				6829	(local->idle_cpus <= (busiest->idle_cpus + 1)))
				6830	goto out_balanced;
				6831	} else {
				6832	/*
				6833	* In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
				6834	* imbalance_pct to be conservative.
				6835	*/
				6836	if (100 * busiest->avg_load <=
				6837	env->sd->imbalance_pct * local->avg_load)
				6838	goto out_balanced;
				6839	}
				6840
				6841	force_balance:
				6842	/* Looks like there is an imbalance. Compute it */
				6843	calculate_imbalance(env, &sds);
				6844	return sds.busiest;
				6845
				6846	out_balanced:
				6847	env->imbalance = 0;
				6848	return NULL;
				6849	}
				6850
				6851	/*
				6852	* find_busiest_queue - find the busiest runqueue among the cpus in group.
				6853	*/
				6854	static struct rq find_busiest_queue(struct lb_env env,
				6855	struct sched_group *group)
				6856	{
				6857	struct rq busiest = NULL, rq;
				6858	unsigned long busiest_load = 0, busiest_capacity = 1;
				6859	int i;
				6860
				6861	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
				6862	unsigned long capacity, wl;
				6863	enum fbq_type rt;
				6864
				6865	rq = cpu_rq(i);
				6866	rt = fbq_classify_rq(rq);
				6867
				6868	/*
				6869	* We classify groups/runqueues into three groups:
				6870	* - regular: there are !numa tasks
				6871	* - remote: there are numa tasks that run on the 'wrong' node
				6872	* - all: there is no distinction
				6873	*
				6874	* In order to avoid migrating ideally placed numa tasks,
				6875	* ignore those when there's better options.
				6876	*
				6877	* If we ignore the actual busiest queue to migrate another
				6878	* task, the next balance pass can still reduce the busiest
				6879	* queue by moving tasks around inside the node.
				6880	*
				6881	* If we cannot move enough load due to this classification
				6882	* the next pass will adjust the group classification and
				6883	* allow migration of more tasks.
				6884	*
				6885	* Both cases only affect the total convergence complexity.
				6886	*/
				6887	if (rt > env->fbq_type)
				6888	continue;
				6889
				6890	capacity = capacity_of(i);
				6891
				6892	wl = weighted_cpuload(i);
				6893
				6894	/*
				6895	* When comparing with imbalance, use weighted_cpuload()
				6896	* which is not scaled with the cpu capacity.
				6897	*/
				6898
				6899	if (rq->nr_running == 1 && wl > env->imbalance &&
				6900	!check_cpu_capacity(rq, env->sd))
				6901	continue;
				6902
				6903	/*
				6904	* For the load comparisons with the other cpu's, consider
				6905	* the weighted_cpuload() scaled with the cpu capacity, so
				6906	* that the load can be moved away from the cpu that is
				6907	* potentially running at a lower capacity.
				6908	*
				6909	* Thus we're looking for max(wl_i / capacity_i), crosswise
				6910	* multiplication to rid ourselves of the division works out
				6911	* to: wl_i * capacity_j > wl_j * capacity_i; where j is
				6912	* our previous maximum.
				6913	*/
				6914	if (wl * busiest_capacity > busiest_load * capacity) {
				6915	busiest_load = wl;
				6916	busiest_capacity = capacity;
				6917	busiest = rq;
				6918	}
				6919	}
				6920
				6921	return busiest;
				6922	}
				6923
				6924	/*
				6925	* Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
				6926	* so long as it is large enough.
				6927	*/
				6928	#define MAX_PINNED_INTERVAL 512
				6929
				6930	/* Working cpumask for load_balance and load_balance_newidle. */
				6931	DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
				6932
				6933	static int need_active_balance(struct lb_env *env)
				6934	{
				6935	struct sched_domain *sd = env->sd;
				6936
				6937	if (env->idle == CPU_NEWLY_IDLE) {
				6938
				6939	/*
				6940	* ASYM_PACKING needs to force migrate tasks from busy but
				6941	* higher numbered CPUs in order to pack all tasks in the
				6942	* lowest numbered CPUs.
				6943	*/
				6944	if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
				6945	return 1;
				6946	}
				6947
				6948	/*
				6949	* The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
				6950	* It's worth migrating the task if the src_cpu's capacity is reduced
				6951	* because of other sched_class or IRQs if more capacity stays
				6952	* available on dst_cpu.
				6953	*/
				6954	if ((env->idle != CPU_NOT_IDLE) &&
				6955	(env->src_rq->cfs.h_nr_running == 1)) {
				6956	if ((check_cpu_capacity(env->src_rq, sd)) &&
				6957	(capacity_of(env->src_cpu)sd->imbalance_pct < capacity_of(env->dst_cpu)100))
				6958	return 1;
				6959	}
				6960
				6961	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
				6962	}
				6963
				6964	static int active_load_balance_cpu_stop(void *data);
				6965
				6966	static int should_we_balance(struct lb_env *env)
				6967	{
				6968	struct sched_group *sg = env->sd->groups;
				6969	struct cpumask sg_cpus, sg_mask;
				6970	int cpu, balance_cpu = -1;
				6971
				6972	/*
				6973	* In the newly idle case, we will allow all the cpu's
				6974	* to do the newly idle load balance.
				6975	*/
				6976	if (env->idle == CPU_NEWLY_IDLE)
				6977	return 1;
				6978
				6979	sg_cpus = sched_group_cpus(sg);
				6980	sg_mask = sched_group_mask(sg);
				6981	/* Try to find first idle cpu */
				6982	for_each_cpu_and(cpu, sg_cpus, env->cpus) {
				6983	if (!cpumask_test_cpu(cpu, sg_mask) \|\| !idle_cpu(cpu))
				6984	continue;
				6985
				6986	balance_cpu = cpu;
				6987	break;
				6988	}
				6989
				6990	if (balance_cpu == -1)
				6991	balance_cpu = group_balance_cpu(sg);
				6992
				6993	/*
				6994	* First idle cpu or the first cpu(busiest) in this sched group
				6995	* is eligible for doing load balancing at this and above domains.
				6996	*/
				6997	return balance_cpu == env->dst_cpu;
				6998	}
				6999
				7000	/*
				7001	* Check this_cpu to ensure it is balanced within domain. Attempt to move
				7002	* tasks if there is an imbalance.
				7003	*/
				7004	static int load_balance(int this_cpu, struct rq *this_rq,
				7005	struct sched_domain *sd, enum cpu_idle_type idle,
				7006	int *continue_balancing)
				7007	{
				7008	int ld_moved, cur_ld_moved, active_balance = 0;
				7009	struct sched_domain *sd_parent = sd->parent;
				7010	struct sched_group *group;
				7011	struct rq *busiest;
				7012	unsigned long flags;
				7013	struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
				7014
				7015	struct lb_env env = {
				7016	.sd = sd,
				7017	.dst_cpu = this_cpu,
				7018	.dst_rq = this_rq,
				7019	.dst_grpmask = sched_group_cpus(sd->groups),
				7020	.idle = idle,
				7021	.loop_break = sched_nr_migrate_break,
				7022	.cpus = cpus,
				7023	.fbq_type = all,
				7024	.tasks = LIST_HEAD_INIT(env.tasks),
				7025	};
				7026
				7027	/*
				7028	* For NEWLY_IDLE load_balancing, we don't need to consider
				7029	* other cpus in our group
				7030	*/
				7031	if (idle == CPU_NEWLY_IDLE)
				7032	env.dst_grpmask = NULL;
				7033
				7034	cpumask_copy(cpus, cpu_active_mask);
				7035
				7036	schedstat_inc(sd, lb_count[idle]);
				7037
				7038	redo:
				7039	if (!should_we_balance(&env)) {
				7040	*continue_balancing = 0;
				7041	goto out_balanced;
				7042	}
				7043
				7044	group = find_busiest_group(&env);
				7045	if (!group) {
				7046	schedstat_inc(sd, lb_nobusyg[idle]);
				7047	goto out_balanced;
				7048	}
				7049
				7050	busiest = find_busiest_queue(&env, group);
				7051	if (!busiest) {
				7052	schedstat_inc(sd, lb_nobusyq[idle]);
				7053	goto out_balanced;
				7054	}
				7055
				7056	BUG_ON(busiest == env.dst_rq);
				7057
				7058	schedstat_add(sd, lb_imbalance[idle], env.imbalance);
				7059
				7060	env.src_cpu = busiest->cpu;
				7061	env.src_rq = busiest;
				7062
				7063	ld_moved = 0;
				7064	if (busiest->nr_running > 1) {
				7065	/*
				7066	* Attempt to move tasks. If find_busiest_group has found
				7067	* an imbalance but busiest->nr_running <= 1, the group is
				7068	* still unbalanced. ld_moved simply stays zero, so it is
				7069	* correctly treated as an imbalance.
				7070	*/
				7071	env.flags \|= LBF_ALL_PINNED;
				7072	env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
				7073
				7074	more_balance:
				7075	raw_spin_lock_irqsave(&busiest->lock, flags);
				7076
				7077	/*
				7078	* cur_ld_moved - load moved in current iteration
				7079	* ld_moved - cumulative load moved across iterations
				7080	*/
				7081	cur_ld_moved = detach_tasks(&env);
				7082
				7083	/*
				7084	* We've detached some tasks from busiest_rq. Every
				7085	* task is masked "TASK_ON_RQ_MIGRATING", so we can safely
				7086	* unlock busiest->lock, and we are able to be sure
				7087	* that nobody can manipulate the tasks in parallel.
				7088	* See task_rq_lock() family for the details.
				7089	*/
				7090
				7091	raw_spin_unlock(&busiest->lock);
				7092
				7093	if (cur_ld_moved) {
				7094	attach_tasks(&env);
				7095	ld_moved += cur_ld_moved;
				7096	}
				7097
				7098	local_irq_restore(flags);
				7099
				7100	if (env.flags & LBF_NEED_BREAK) {
				7101	env.flags &= ~LBF_NEED_BREAK;
				7102	goto more_balance;
				7103	}
				7104
				7105	/*
				7106	* Revisit (affine) tasks on src_cpu that couldn't be moved to
				7107	* us and move them to an alternate dst_cpu in our sched_group
				7108	* where they can run. The upper limit on how many times we
				7109	* iterate on same src_cpu is dependent on number of cpus in our
				7110	* sched_group.
				7111	*
				7112	* This changes load balance semantics a bit on who can move
				7113	* load to a given_cpu. In addition to the given_cpu itself
				7114	* (or a ilb_cpu acting on its behalf where given_cpu is
				7115	* nohz-idle), we now have balance_cpu in a position to move
				7116	* load to given_cpu. In rare situations, this may cause
				7117	* conflicts (balance_cpu and given_cpu/ilb_cpu deciding
				7118	* _independently_ and at _same_ time to move some load to
				7119	* given_cpu) causing exceess load to be moved to given_cpu.
				7120	* This however should not happen so much in practice and
				7121	* moreover subsequent load balance cycles should correct the
				7122	* excess load moved.
				7123	*/
				7124	if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
				7125
				7126	/* Prevent to re-select dst_cpu via env's cpus */
				7127	cpumask_clear_cpu(env.dst_cpu, env.cpus);
				7128
				7129	env.dst_rq = cpu_rq(env.new_dst_cpu);
				7130	env.dst_cpu = env.new_dst_cpu;
				7131	env.flags &= ~LBF_DST_PINNED;
				7132	env.loop = 0;
				7133	env.loop_break = sched_nr_migrate_break;
				7134
				7135	/*
				7136	* Go back to "more_balance" rather than "redo" since we
				7137	* need to continue with same src_cpu.
				7138	*/
				7139	goto more_balance;
				7140	}
				7141
				7142	/*
				7143	* We failed to reach balance because of affinity.
				7144	*/
				7145	if (sd_parent) {
				7146	int *group_imbalance = &sd_parent->groups->sgc->imbalance;
				7147
				7148	if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
				7149	*group_imbalance = 1;
				7150	}
				7151
				7152	/* All tasks on this runqueue were pinned by CPU affinity */
				7153	if (unlikely(env.flags & LBF_ALL_PINNED)) {
				7154	cpumask_clear_cpu(cpu_of(busiest), cpus);
				7155	if (!cpumask_empty(cpus)) {
				7156	env.loop = 0;
				7157	env.loop_break = sched_nr_migrate_break;
				7158	goto redo;
				7159	}
				7160	goto out_all_pinned;
				7161	}
				7162	}
				7163
				7164	if (!ld_moved) {
				7165	schedstat_inc(sd, lb_failed[idle]);
				7166	/*
				7167	* Increment the failure counter only on periodic balance.
				7168	* We do not want newidle balance, which can be very
				7169	* frequent, pollute the failure counter causing
				7170	* excessive cache_hot migrations and active balances.
				7171	*/
				7172	if (idle != CPU_NEWLY_IDLE)
				7173	sd->nr_balance_failed++;
				7174
				7175	if (need_active_balance(&env)) {
				7176	raw_spin_lock_irqsave(&busiest->lock, flags);
				7177
				7178	/* don't kick the active_load_balance_cpu_stop,
				7179	* if the curr task on busiest cpu can't be
				7180	* moved to this_cpu
				7181	*/
				7182	if (!cpumask_test_cpu(this_cpu,
				7183	tsk_cpus_allowed(busiest->curr))) {
				7184	raw_spin_unlock_irqrestore(&busiest->lock,
				7185	flags);
				7186	env.flags \|= LBF_ALL_PINNED;
				7187	goto out_one_pinned;
				7188	}
				7189
				7190	/*
				7191	* ->active_balance synchronizes accesses to
				7192	* ->active_balance_work. Once set, it's cleared
				7193	* only after active load balance is finished.
				7194	*/
				7195	if (!busiest->active_balance) {
				7196	busiest->active_balance = 1;
				7197	busiest->push_cpu = this_cpu;
				7198	active_balance = 1;
				7199	}
				7200	raw_spin_unlock_irqrestore(&busiest->lock, flags);
				7201
				7202	if (active_balance) {
				7203	stop_one_cpu_nowait(cpu_of(busiest),
				7204	active_load_balance_cpu_stop, busiest,
				7205	&busiest->active_balance_work);
				7206	}
				7207
				7208	/*
				7209	* We've kicked active balancing, reset the failure
				7210	* counter.
				7211	*/
				7212	sd->nr_balance_failed = sd->cache_nice_tries+1;
				7213	}
				7214	} else
				7215	sd->nr_balance_failed = 0;
				7216
				7217	if (likely(!active_balance)) {
				7218	/* We were unbalanced, so reset the balancing interval */
				7219	sd->balance_interval = sd->min_interval;
				7220	} else {
				7221	/*
				7222	* If we've begun active balancing, start to back off. This
				7223	* case may not be covered by the all_pinned logic if there
				7224	* is only 1 task on the busy runqueue (because we don't call
				7225	* detach_tasks).
				7226	*/
				7227	if (sd->balance_interval < sd->max_interval)
				7228	sd->balance_interval *= 2;
				7229	}
				7230
				7231	goto out;
				7232
				7233	out_balanced:
				7234	/*
				7235	* We reach balance although we may have faced some affinity
				7236	* constraints. Clear the imbalance flag if it was set.
				7237	*/
				7238	if (sd_parent) {
				7239	int *group_imbalance = &sd_parent->groups->sgc->imbalance;
				7240
				7241	if (*group_imbalance)
				7242	*group_imbalance = 0;
				7243	}
				7244
				7245	out_all_pinned:
				7246	/*
				7247	* We reach balance because all tasks are pinned at this level so
				7248	* we can't migrate them. Let the imbalance flag set so parent level
				7249	* can try to migrate them.
				7250	*/
				7251	schedstat_inc(sd, lb_balanced[idle]);
				7252
				7253	sd->nr_balance_failed = 0;
				7254
				7255	out_one_pinned:
				7256	/* tune up the balancing interval */
				7257	if (((env.flags & LBF_ALL_PINNED) &&
				7258	sd->balance_interval < MAX_PINNED_INTERVAL) \|\|
				7259	(sd->balance_interval < sd->max_interval))
				7260	sd->balance_interval *= 2;
				7261
				7262	ld_moved = 0;
				7263	out:
				7264	return ld_moved;
				7265	}
				7266
				7267	static inline unsigned long
				7268	get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
				7269	{
				7270	unsigned long interval = sd->balance_interval;
				7271
				7272	if (cpu_busy)
				7273	interval *= sd->busy_factor;
				7274
				7275	/* scale ms to jiffies */
				7276	interval = msecs_to_jiffies(interval);
				7277	interval = clamp(interval, 1UL, max_load_balance_interval);
				7278
				7279	return interval;
				7280	}
				7281
				7282	static inline void
				7283	update_next_balance(struct sched_domain sd, int cpu_busy, unsigned long next_balance)
				7284	{
				7285	unsigned long interval, next;
				7286
				7287	interval = get_sd_balance_interval(sd, cpu_busy);
				7288	next = sd->last_balance + interval;
				7289
				7290	if (time_after(*next_balance, next))
				7291	*next_balance = next;
				7292	}
				7293
				7294	/*
				7295	* idle_balance is called by schedule() if this_cpu is about to become
				7296	* idle. Attempts to pull tasks from other CPUs.
				7297	*/
				7298	static int idle_balance(struct rq *this_rq)
				7299	{
				7300	unsigned long next_balance = jiffies + HZ;
				7301	int this_cpu = this_rq->cpu;
				7302	struct sched_domain *sd;
				7303	int pulled_task = 0;
				7304	u64 curr_cost = 0;
				7305
				7306	idle_enter_fair(this_rq);
				7307
				7308	/*
				7309	* We must set idle_stamp _before_ calling idle_balance(), such that we
				7310	* measure the duration of idle_balance() as idle time.
				7311	*/
				7312	this_rq->idle_stamp = rq_clock(this_rq);
				7313
				7314	if (this_rq->avg_idle < sysctl_sched_migration_cost \|\|
				7315	!this_rq->rd->overload) {
				7316	rcu_read_lock();
				7317	sd = rcu_dereference_check_sched_domain(this_rq->sd);
				7318	if (sd)
				7319	update_next_balance(sd, 0, &next_balance);
				7320	rcu_read_unlock();
				7321
				7322	goto out;
				7323	}
				7324
				7325	raw_spin_unlock(&this_rq->lock);
				7326
				7327	update_blocked_averages(this_cpu);
				7328	rcu_read_lock();
				7329	for_each_domain(this_cpu, sd) {
				7330	int continue_balancing = 1;
				7331	u64 t0, domain_cost;
				7332
				7333	if (!(sd->flags & SD_LOAD_BALANCE))
				7334	continue;
				7335
				7336	if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
				7337	update_next_balance(sd, 0, &next_balance);
				7338	break;
				7339	}
				7340
				7341	if (sd->flags & SD_BALANCE_NEWIDLE) {
				7342	t0 = sched_clock_cpu(this_cpu);
				7343
				7344	pulled_task = load_balance(this_cpu, this_rq,
				7345	sd, CPU_NEWLY_IDLE,
				7346	&continue_balancing);
				7347
				7348	domain_cost = sched_clock_cpu(this_cpu) - t0;
				7349	if (domain_cost > sd->max_newidle_lb_cost)
				7350	sd->max_newidle_lb_cost = domain_cost;
				7351
				7352	curr_cost += domain_cost;
				7353	}
				7354
				7355	update_next_balance(sd, 0, &next_balance);
				7356
				7357	/*
				7358	* Stop searching for tasks to pull if there are
				7359	* now runnable tasks on this rq.
				7360	*/
				7361	if (pulled_task \|\| this_rq->nr_running > 0)
				7362	break;
				7363	}
				7364	rcu_read_unlock();
				7365
				7366	raw_spin_lock(&this_rq->lock);
				7367
				7368	if (curr_cost > this_rq->max_idle_balance_cost)
				7369	this_rq->max_idle_balance_cost = curr_cost;
				7370
				7371	/*
				7372	* While browsing the domains, we released the rq lock, a task could
				7373	* have been enqueued in the meantime. Since we're not going idle,
				7374	* pretend we pulled a task.
				7375	*/
				7376	if (this_rq->cfs.h_nr_running && !pulled_task)
				7377	pulled_task = 1;
				7378
				7379	out:
				7380	/* Move the next balance forward */
				7381	if (time_after(this_rq->next_balance, next_balance))
				7382	this_rq->next_balance = next_balance;
				7383
				7384	/* Is there a task of a high priority class? */
				7385	if (this_rq->nr_running != this_rq->cfs.h_nr_running)
				7386	pulled_task = -1;
				7387
				7388	if (pulled_task) {
				7389	idle_exit_fair(this_rq);
				7390	this_rq->idle_stamp = 0;
				7391	}
				7392
				7393	return pulled_task;
				7394	}
				7395
				7396	/*
				7397	* active_load_balance_cpu_stop is run by cpu stopper. It pushes
				7398	* running tasks off the busiest CPU onto idle CPUs. It requires at
				7399	* least 1 task to be running on each physical CPU where possible, and
				7400	* avoids physical / logical imbalances.
				7401	*/
				7402	static int active_load_balance_cpu_stop(void *data)
				7403	{
				7404	struct rq *busiest_rq = data;
				7405	int busiest_cpu = cpu_of(busiest_rq);
				7406	int target_cpu = busiest_rq->push_cpu;
				7407	struct rq *target_rq = cpu_rq(target_cpu);
				7408	struct sched_domain *sd;
				7409	struct task_struct *p = NULL;
				7410
				7411	raw_spin_lock_irq(&busiest_rq->lock);
				7412
				7413	/* make sure the requested cpu hasn't gone down in the meantime */
				7414	if (unlikely(busiest_cpu != smp_processor_id() \|\|
				7415	!busiest_rq->active_balance))
				7416	goto out_unlock;
				7417
				7418	/* Is there any task to move? */
				7419	if (busiest_rq->nr_running <= 1)
				7420	goto out_unlock;
				7421
				7422	/*
				7423	* This condition is "impossible", if it occurs
				7424	* we need to fix it. Originally reported by
				7425	* Bjorn Helgaas on a 128-cpu setup.
				7426	*/
				7427	BUG_ON(busiest_rq == target_rq);
				7428
				7429	/* Search for an sd spanning us and the target CPU. */
				7430	rcu_read_lock();
				7431	for_each_domain(target_cpu, sd) {
				7432	if ((sd->flags & SD_LOAD_BALANCE) &&
				7433	cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
				7434	break;
				7435	}
				7436
				7437	if (likely(sd)) {
				7438	struct lb_env env = {
				7439	.sd = sd,
				7440	.dst_cpu = target_cpu,
				7441	.dst_rq = target_rq,
				7442	.src_cpu = busiest_rq->cpu,
				7443	.src_rq = busiest_rq,
				7444	.idle = CPU_IDLE,
				7445	};
				7446
				7447	schedstat_inc(sd, alb_count);
				7448
				7449	p = detach_one_task(&env);
				7450	if (p)
				7451	schedstat_inc(sd, alb_pushed);
				7452	else
				7453	schedstat_inc(sd, alb_failed);
				7454	}
				7455	rcu_read_unlock();
				7456	out_unlock:
				7457	busiest_rq->active_balance = 0;
				7458	raw_spin_unlock(&busiest_rq->lock);
				7459
				7460	if (p)
				7461	attach_one_task(target_rq, p);
				7462
				7463	local_irq_enable();
				7464
				7465	return 0;
				7466	}
				7467
				7468	static inline int on_null_domain(struct rq *rq)
				7469	{
				7470	return unlikely(!rcu_dereference_sched(rq->sd));
				7471	}
				7472
				7473	#ifdef CONFIG_NO_HZ_COMMON
				7474	/*
				7475	* idle load balancing details
				7476	* - When one of the busy CPUs notice that there may be an idle rebalancing
				7477	* needed, they will kick the idle load balancer, which then does idle
				7478	* load balancing for all the idle CPUs.
				7479	*/
				7480	static struct {
				7481	cpumask_var_t idle_cpus_mask;
				7482	atomic_t nr_cpus;
				7483	unsigned long next_balance; /* in jiffy units */
				7484	} nohz ____cacheline_aligned;
				7485
				7486	static inline int find_new_ilb(void)
				7487	{
				7488	int ilb = cpumask_first(nohz.idle_cpus_mask);
				7489
				7490	if (ilb < nr_cpu_ids && idle_cpu(ilb))
				7491	return ilb;
				7492
				7493	return nr_cpu_ids;
				7494	}
				7495
				7496	/*
				7497	* Kick a CPU to do the nohz balancing, if it is time for it. We pick the
				7498	* nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
				7499	* CPU (if there is one).
				7500	*/
				7501	static void nohz_balancer_kick(void)
				7502	{
				7503	int ilb_cpu;
				7504
				7505	nohz.next_balance++;
				7506
				7507	ilb_cpu = find_new_ilb();
				7508
				7509	if (ilb_cpu >= nr_cpu_ids)
				7510	return;
				7511
				7512	if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
				7513	return;
				7514	/*
				7515	* Use smp_send_reschedule() instead of resched_cpu().
				7516	* This way we generate a sched IPI on the target cpu which
				7517	* is idle. And the softirq performing nohz idle load balance
				7518	* will be run before returning from the IPI.
				7519	*/
				7520	smp_send_reschedule(ilb_cpu);
				7521	return;
				7522	}
				7523
				7524	static inline void nohz_balance_exit_idle(int cpu)
				7525	{
				7526	if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
				7527	/*
				7528	* Completely isolated CPUs don't ever set, so we must test.
				7529	*/
				7530	if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
				7531	cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
				7532	atomic_dec(&nohz.nr_cpus);
				7533	}
				7534	clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
				7535	}
				7536	}
				7537
				7538	static inline void set_cpu_sd_state_busy(void)
				7539	{
				7540	struct sched_domain *sd;
				7541	int cpu = smp_processor_id();
				7542
				7543	rcu_read_lock();
				7544	sd = rcu_dereference(per_cpu(sd_busy, cpu));
				7545
				7546	if (!sd \|\| !sd->nohz_idle)
				7547	goto unlock;
				7548	sd->nohz_idle = 0;
				7549
				7550	atomic_inc(&sd->groups->sgc->nr_busy_cpus);
				7551	unlock:
				7552	rcu_read_unlock();
				7553	}
				7554
				7555	void set_cpu_sd_state_idle(void)
				7556	{
				7557	struct sched_domain *sd;
				7558	int cpu = smp_processor_id();
				7559
				7560	rcu_read_lock();
				7561	sd = rcu_dereference(per_cpu(sd_busy, cpu));
				7562
				7563	if (!sd \|\| sd->nohz_idle)
				7564	goto unlock;
				7565	sd->nohz_idle = 1;
				7566
				7567	atomic_dec(&sd->groups->sgc->nr_busy_cpus);
				7568	unlock:
				7569	rcu_read_unlock();
				7570	}
				7571
				7572	/*
				7573	* This routine will record that the cpu is going idle with tick stopped.
				7574	* This info will be used in performing idle load balancing in the future.
				7575	*/
				7576	void nohz_balance_enter_idle(int cpu)
				7577	{
				7578	/*
				7579	* If this cpu is going down, then nothing needs to be done.
				7580	*/
				7581	if (!cpu_active(cpu))
				7582	return;
				7583
				7584	if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
				7585	return;
				7586
				7587	/*
				7588	* If we're a completely isolated CPU, we don't play.
				7589	*/
				7590	if (on_null_domain(cpu_rq(cpu)))
				7591	return;
				7592
				7593	cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
				7594	atomic_inc(&nohz.nr_cpus);
				7595	set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
				7596	}
				7597
				7598	static int sched_ilb_notifier(struct notifier_block *nfb,
				7599	unsigned long action, void *hcpu)
				7600	{
				7601	switch (action & ~CPU_TASKS_FROZEN) {
				7602	case CPU_DYING:
				7603	nohz_balance_exit_idle(smp_processor_id());
				7604	return NOTIFY_OK;
				7605	default:
				7606	return NOTIFY_DONE;
				7607	}
				7608	}
				7609	#endif
				7610
				7611	static DEFINE_SPINLOCK(balancing);
				7612
				7613	/*
				7614	* Scale the max load_balance interval with the number of CPUs in the system.
				7615	* This trades load-balance latency on larger machines for less cross talk.
				7616	*/
				7617	void update_max_interval(void)
				7618	{
				7619	max_load_balance_interval = HZ*num_online_cpus()/10;
				7620	}
				7621
				7622	/*
				7623	* It checks each scheduling domain to see if it is due to be balanced,
				7624	* and initiates a balancing operation if so.
				7625	*
				7626	* Balancing parameters are set up in init_sched_domains.
				7627	*/
				7628	static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
				7629	{
				7630	int continue_balancing = 1;
				7631	int cpu = rq->cpu;
				7632	unsigned long interval;
				7633	struct sched_domain *sd;
				7634	/* Earliest time when we have to do rebalance again */
				7635	unsigned long next_balance = jiffies + 60*HZ;
				7636	int update_next_balance = 0;
				7637	int need_serialize, need_decay = 0;
				7638	u64 max_cost = 0;
				7639
				7640	update_blocked_averages(cpu);
				7641
				7642	rcu_read_lock();
				7643	for_each_domain(cpu, sd) {
				7644	/*
				7645	* Decay the newidle max times here because this is a regular
				7646	* visit to all the domains. Decay ~1% per second.
				7647	*/
				7648	if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
				7649	sd->max_newidle_lb_cost =
				7650	(sd->max_newidle_lb_cost * 253) / 256;
				7651	sd->next_decay_max_lb_cost = jiffies + HZ;
				7652	need_decay = 1;
				7653	}
				7654	max_cost += sd->max_newidle_lb_cost;
				7655
				7656	if (!(sd->flags & SD_LOAD_BALANCE))
				7657	continue;
				7658
				7659	/*
				7660	* Stop the load balance at this level. There is another
				7661	* CPU in our sched group which is doing load balancing more
				7662	* actively.
				7663	*/
				7664	if (!continue_balancing) {
				7665	if (need_decay)
				7666	continue;
				7667	break;
				7668	}
				7669
				7670	interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
				7671
				7672	need_serialize = sd->flags & SD_SERIALIZE;
				7673	if (need_serialize) {
				7674	if (!spin_trylock(&balancing))
				7675	goto out;
				7676	}
				7677
				7678	if (time_after_eq(jiffies, sd->last_balance + interval)) {
				7679	if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
				7680	/*
				7681	* The LBF_DST_PINNED logic could have changed
				7682	* env->dst_cpu, so we can't know our idle
				7683	* state even if we migrated tasks. Update it.
				7684	*/
				7685	idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
				7686	}
				7687	sd->last_balance = jiffies;
				7688	interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
				7689	}
				7690	if (need_serialize)
				7691	spin_unlock(&balancing);
				7692	out:
				7693	if (time_after(next_balance, sd->last_balance + interval)) {
				7694	next_balance = sd->last_balance + interval;
				7695	update_next_balance = 1;
				7696	}
				7697	}
				7698	if (need_decay) {
				7699	/*
				7700	* Ensure the rq-wide value also decays but keep it at a
				7701	* reasonable floor to avoid funnies with rq->avg_idle.
				7702	*/
				7703	rq->max_idle_balance_cost =
				7704	max((u64)sysctl_sched_migration_cost, max_cost);
				7705	}
				7706	rcu_read_unlock();
				7707
				7708	/*
				7709	* next_balance will be updated only when there is a need.
				7710	* When the cpu is attached to null domain for ex, it will not be
				7711	* updated.
				7712	*/
				7713	if (likely(update_next_balance)) {
				7714	rq->next_balance = next_balance;
				7715
				7716	#ifdef CONFIG_NO_HZ_COMMON
				7717	/*
				7718	* If this CPU has been elected to perform the nohz idle
				7719	* balance. Other idle CPUs have already rebalanced with
				7720	* nohz_idle_balance() and nohz.next_balance has been
				7721	* updated accordingly. This CPU is now running the idle load
				7722	* balance for itself and we need to update the
				7723	* nohz.next_balance accordingly.
				7724	*/
				7725	if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))
				7726	nohz.next_balance = rq->next_balance;
				7727	#endif
				7728	}
				7729	}
				7730
				7731	#ifdef CONFIG_NO_HZ_COMMON
				7732	/*
				7733	* In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
				7734	* rebalancing for all the cpus for whom scheduler ticks are stopped.
				7735	*/
				7736	static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
				7737	{
				7738	int this_cpu = this_rq->cpu;
				7739	struct rq *rq;
				7740	int balance_cpu;
				7741	/* Earliest time when we have to do rebalance again */
				7742	unsigned long next_balance = jiffies + 60*HZ;
				7743	int update_next_balance = 0;
				7744
				7745	if (idle != CPU_IDLE \|\|
				7746	!test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
				7747	goto end;
				7748
				7749	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
				7750	if (balance_cpu == this_cpu \|\| !idle_cpu(balance_cpu))
				7751	continue;
				7752
				7753	/*
				7754	* If this cpu gets work to do, stop the load balancing
				7755	* work being done for other cpus. Next load
				7756	* balancing owner will pick it up.
				7757	*/
				7758	if (need_resched())
				7759	break;
				7760
				7761	rq = cpu_rq(balance_cpu);
				7762
				7763	/*
				7764	* If time for next balance is due,
				7765	* do the balance.
				7766	*/
				7767	if (time_after_eq(jiffies, rq->next_balance)) {
				7768	raw_spin_lock_irq(&rq->lock);
				7769	update_rq_clock(rq);
				7770	update_idle_cpu_load(rq);
				7771	raw_spin_unlock_irq(&rq->lock);
				7772	rebalance_domains(rq, CPU_IDLE);
				7773	}
				7774
				7775	if (time_after(next_balance, rq->next_balance)) {
				7776	next_balance = rq->next_balance;
				7777	update_next_balance = 1;
				7778	}
				7779	}
				7780
				7781	/*
				7782	* next_balance will be updated only when there is a need.
				7783	* When the CPU is attached to null domain for ex, it will not be
				7784	* updated.
				7785	*/
				7786	if (likely(update_next_balance))
				7787	nohz.next_balance = next_balance;
				7788	end:
				7789	clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
				7790	}
				7791
				7792	/*
				7793	* Current heuristic for kicking the idle load balancer in the presence
				7794	* of an idle cpu in the system.
				7795	* - This rq has more than one task.
				7796	* - This rq has at least one CFS task and the capacity of the CPU is
				7797	* significantly reduced because of RT tasks or IRQs.
				7798	* - At parent of LLC scheduler domain level, this cpu's scheduler group has
				7799	* multiple busy cpu.
				7800	* - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
				7801	* domain span are idle.
				7802	*/
				7803	static inline bool nohz_kick_needed(struct rq *rq)
				7804	{
				7805	unsigned long now = jiffies;
				7806	struct sched_domain *sd;
				7807	struct sched_group_capacity *sgc;
				7808	int nr_busy, cpu = rq->cpu;
				7809	bool kick = false;
				7810
				7811	if (unlikely(rq->idle_balance))
				7812	return false;
				7813
				7814	/*
				7815	* We may be recently in ticked or tickless idle mode. At the first
				7816	* busy tick after returning from idle, we will update the busy stats.
				7817	*/
				7818	set_cpu_sd_state_busy();
				7819	nohz_balance_exit_idle(cpu);
				7820
				7821	/*
				7822	* None are in tickless mode and hence no need for NOHZ idle load
				7823	* balancing.
				7824	*/
				7825	if (likely(!atomic_read(&nohz.nr_cpus)))
				7826	return false;
				7827
				7828	if (time_before(now, nohz.next_balance))
				7829	return false;
				7830
				7831	if (rq->nr_running >= 2)
				7832	return true;
				7833
				7834	rcu_read_lock();
				7835	sd = rcu_dereference(per_cpu(sd_busy, cpu));
				7836	if (sd) {
				7837	sgc = sd->groups->sgc;
				7838	nr_busy = atomic_read(&sgc->nr_busy_cpus);
				7839
				7840	if (nr_busy > 1) {
				7841	kick = true;
				7842	goto unlock;
				7843	}
				7844
				7845	}
				7846
				7847	sd = rcu_dereference(rq->sd);
				7848	if (sd) {
				7849	if ((rq->cfs.h_nr_running >= 1) &&
				7850	check_cpu_capacity(rq, sd)) {
				7851	kick = true;
				7852	goto unlock;
				7853	}
				7854	}
				7855
				7856	sd = rcu_dereference(per_cpu(sd_asym, cpu));
				7857	if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
				7858	sched_domain_span(sd)) < cpu)) {
				7859	kick = true;
				7860	goto unlock;
				7861	}
				7862
				7863	unlock:
				7864	rcu_read_unlock();
				7865	return kick;
				7866	}
				7867	#else
				7868	static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
				7869	#endif
				7870
				7871	/*
				7872	* run_rebalance_domains is triggered when needed from the scheduler tick.
				7873	* Also triggered for nohz idle balancing (with nohz_balancing_kick set).
				7874	*/
				7875	static void run_rebalance_domains(struct softirq_action *h)
				7876	{
				7877	struct rq *this_rq = this_rq();
				7878	enum cpu_idle_type idle = this_rq->idle_balance ?
				7879	CPU_IDLE : CPU_NOT_IDLE;
				7880
				7881	/*
				7882	* If this cpu has a pending nohz_balance_kick, then do the
				7883	* balancing on behalf of the other idle cpus whose ticks are
				7884	* stopped. Do nohz_idle_balance before rebalance_domains to
				7885	* give the idle cpus a chance to load balance. Else we may
				7886	* load balance only within the local sched_domain hierarchy
				7887	* and abort nohz_idle_balance altogether if we pull some load.
				7888	*/
				7889	nohz_idle_balance(this_rq, idle);
				7890	rebalance_domains(this_rq, idle);
				7891	}
				7892
				7893	/*
				7894	* Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
				7895	*/
				7896	void trigger_load_balance(struct rq *rq)
				7897	{
				7898	/* Don't need to rebalance while attached to NULL domain */
				7899	if (unlikely(on_null_domain(rq)))
				7900	return;
				7901
				7902	if (time_after_eq(jiffies, rq->next_balance))
				7903	raise_softirq(SCHED_SOFTIRQ);
				7904	#ifdef CONFIG_NO_HZ_COMMON
				7905	if (nohz_kick_needed(rq))
				7906	nohz_balancer_kick();
				7907	#endif
				7908	}
				7909
				7910	static void rq_online_fair(struct rq *rq)
				7911	{
				7912	update_sysctl();
				7913
				7914	update_runtime_enabled(rq);
				7915	}
				7916
				7917	static void rq_offline_fair(struct rq *rq)
				7918	{
				7919	update_sysctl();
				7920
				7921	/* Ensure any throttled groups are reachable by pick_next_task */
				7922	unthrottle_offline_cfs_rqs(rq);
				7923	}
				7924
				7925	#endif /* CONFIG_SMP */
				7926
				7927	/*
				7928	* scheduler tick hitting a task of our scheduling class:
				7929	*/
				7930	static void task_tick_fair(struct rq rq, struct task_struct curr, int queued)
				7931	{
				7932	struct cfs_rq *cfs_rq;
				7933	struct sched_entity *se = &curr->se;
				7934
				7935	for_each_sched_entity(se) {
				7936	cfs_rq = cfs_rq_of(se);
				7937	entity_tick(cfs_rq, se, queued);
				7938	}
				7939
				7940	if (static_branch_unlikely(&sched_numa_balancing))
				7941	task_tick_numa(rq, curr);
				7942	}
				7943
				7944	/*
				7945	* called on fork with the child task as argument from the parent's context
				7946	* - child not yet on the tasklist
				7947	* - preemption disabled
				7948	*/
				7949	static void task_fork_fair(struct task_struct *p)
				7950	{
				7951	struct cfs_rq *cfs_rq;
				7952	struct sched_entity se = &p->se, curr;
				7953	int this_cpu = smp_processor_id();
				7954	struct rq *rq = this_rq();
				7955	unsigned long flags;
				7956
				7957	raw_spin_lock_irqsave(&rq->lock, flags);
				7958
				7959	update_rq_clock(rq);
				7960
				7961	cfs_rq = task_cfs_rq(current);
				7962	curr = cfs_rq->curr;
				7963
				7964	/*
				7965	* Not only the cpu but also the task_group of the parent might have
				7966	* been changed after parent->se.parent,cfs_rq were copied to
				7967	* child->se.parent,cfs_rq. So call __set_task_cpu() to make those
				7968	* of child point to valid ones.
				7969	*/
				7970	rcu_read_lock();
				7971	__set_task_cpu(p, this_cpu);
				7972	rcu_read_unlock();
				7973
				7974	update_curr(cfs_rq);
				7975
				7976	if (curr)
				7977	se->vruntime = curr->vruntime;
				7978	place_entity(cfs_rq, se, 1);
				7979
				7980	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
				7981	/*
				7982	* Upon rescheduling, sched_class::put_prev_task() will place
				7983	* 'current' within the tree based on its new key value.
				7984	*/
				7985	swap(curr->vruntime, se->vruntime);
				7986	resched_curr(rq);
				7987	}
				7988
				7989	se->vruntime -= cfs_rq->min_vruntime;
				7990
				7991	raw_spin_unlock_irqrestore(&rq->lock, flags);
				7992	}
				7993
				7994	/*
				7995	* Priority of the task has changed. Check to see if we preempt
				7996	* the current task.
				7997	*/
				7998	static void
				7999	prio_changed_fair(struct rq rq, struct task_struct p, int oldprio)
				8000	{
				8001	if (!task_on_rq_queued(p))
				8002	return;
				8003
				8004	/*
				8005	* Reschedule if we are currently running on this runqueue and
				8006	* our priority decreased, or if we are not currently running on
				8007	* this runqueue and our priority is higher than the current's
				8008	*/
				8009	if (rq->curr == p) {
				8010	if (p->prio > oldprio)
				8011	resched_curr(rq);
				8012	} else
				8013	check_preempt_curr(rq, p, 0);
				8014	}
				8015
				8016	static inline bool vruntime_normalized(struct task_struct *p)
				8017	{
				8018	struct sched_entity *se = &p->se;
				8019
				8020	/*
				8021	* In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases,
				8022	* the dequeue_entity(.flags=0) will already have normalized the
				8023	* vruntime.
				8024	*/
				8025	if (p->on_rq)
				8026	return true;
				8027
				8028	/*
				8029	* When !on_rq, vruntime of the task has usually NOT been normalized.
				8030	* But there are some cases where it has already been normalized:
				8031	*
				8032	* - A forked child which is waiting for being woken up by
				8033	* wake_up_new_task().
				8034	* - A task which has been woken up by try_to_wake_up() and
				8035	* waiting for actually being woken up by sched_ttwu_pending().
				8036	*/
				8037	if (!se->sum_exec_runtime \|\| p->state == TASK_WAKING)
				8038	return true;
				8039
				8040	return false;
				8041	}
				8042
				8043	static void detach_task_cfs_rq(struct task_struct *p)
				8044	{
				8045	struct sched_entity *se = &p->se;
				8046	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				8047
				8048	if (!vruntime_normalized(p)) {
				8049	/*
				8050	* Fix up our vruntime so that the current sleep doesn't
				8051	* cause 'unlimited' sleep bonus.
				8052	*/
				8053	place_entity(cfs_rq, se, 0);
				8054	se->vruntime -= cfs_rq->min_vruntime;
				8055	}
				8056
				8057	/* Catch up with the cfs_rq and remove our load when we leave */
				8058	detach_entity_load_avg(cfs_rq, se);
				8059	}
				8060
				8061	static void attach_task_cfs_rq(struct task_struct *p)
				8062	{
				8063	struct sched_entity *se = &p->se;
				8064	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				8065
				8066	#ifdef CONFIG_FAIR_GROUP_SCHED
				8067	/*
				8068	* Since the real-depth could have been changed (only FAIR
				8069	* class maintain depth value), reset depth properly.
				8070	*/
				8071	se->depth = se->parent ? se->parent->depth + 1 : 0;
				8072	#endif
				8073
				8074	/* Synchronize task with its cfs_rq */
				8075	attach_entity_load_avg(cfs_rq, se);
				8076
				8077	if (!vruntime_normalized(p))
				8078	se->vruntime += cfs_rq->min_vruntime;
				8079	}
				8080
				8081	static void switched_from_fair(struct rq rq, struct task_struct p)
				8082	{
				8083	detach_task_cfs_rq(p);
				8084	}
				8085
				8086	static void switched_to_fair(struct rq rq, struct task_struct p)
				8087	{
				8088	attach_task_cfs_rq(p);
				8089
				8090	if (task_on_rq_queued(p)) {
				8091	/*
				8092	* We were most likely switched from sched_rt, so
				8093	* kick off the schedule if running, otherwise just see
				8094	* if we can still preempt the current task.
				8095	*/
				8096	if (rq->curr == p)
				8097	resched_curr(rq);
				8098	else
				8099	check_preempt_curr(rq, p, 0);
				8100	}
				8101	}
				8102
				8103	/* Account for a task changing its policy or group.
				8104	*
				8105	* This routine is mostly called to set cfs_rq->curr field when a task
				8106	* migrates between groups/classes.
				8107	*/
				8108	static void set_curr_task_fair(struct rq *rq)
				8109	{
				8110	struct sched_entity *se = &rq->curr->se;
				8111
				8112	for_each_sched_entity(se) {
				8113	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				8114
				8115	set_next_entity(cfs_rq, se);
				8116	/* ensure bandwidth has been allocated on our new cfs_rq */
				8117	account_cfs_rq_runtime(cfs_rq, 0);
				8118	}
				8119	}
				8120
				8121	void init_cfs_rq(struct cfs_rq *cfs_rq)
				8122	{
				8123	cfs_rq->tasks_timeline = RB_ROOT;
				8124	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
				8125	#ifndef CONFIG_64BIT
				8126	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
				8127	#endif
				8128	#ifdef CONFIG_SMP
				8129	atomic_long_set(&cfs_rq->removed_load_avg, 0);
				8130	atomic_long_set(&cfs_rq->removed_util_avg, 0);
				8131	#endif
				8132	}
				8133
				8134	#ifdef CONFIG_FAIR_GROUP_SCHED
				8135	static void task_move_group_fair(struct task_struct *p)
				8136	{
				8137	detach_task_cfs_rq(p);
				8138	set_task_rq(p, task_cpu(p));
				8139
				8140	#ifdef CONFIG_SMP
				8141	/* Tell se's cfs_rq has been changed -- migrated */
				8142	p->se.avg.last_update_time = 0;
				8143	#endif
				8144	attach_task_cfs_rq(p);
				8145	}
				8146
				8147	void free_fair_sched_group(struct task_group *tg)
				8148	{
				8149	int i;
				8150
				8151	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
				8152
				8153	for_each_possible_cpu(i) {
				8154	if (tg->cfs_rq)
				8155	kfree(tg->cfs_rq[i]);
				8156	if (tg->se) {
				8157	if (tg->se[i])
				8158	remove_entity_load_avg(tg->se[i]);
				8159	kfree(tg->se[i]);
				8160	}
				8161	}
				8162
				8163	kfree(tg->cfs_rq);
				8164	kfree(tg->se);
				8165	}
				8166
				8167	int alloc_fair_sched_group(struct task_group tg, struct task_group parent)
				8168	{
				8169	struct cfs_rq *cfs_rq;
				8170	struct sched_entity *se;
				8171	int i;
				8172
				8173	tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
				8174	if (!tg->cfs_rq)
				8175	goto err;
				8176	tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
				8177	if (!tg->se)
				8178	goto err;
				8179
				8180	tg->shares = NICE_0_LOAD;
				8181
				8182	init_cfs_bandwidth(tg_cfs_bandwidth(tg));
				8183
				8184	for_each_possible_cpu(i) {
				8185	cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
				8186	GFP_KERNEL, cpu_to_node(i));
				8187	if (!cfs_rq)
				8188	goto err;
				8189
				8190	se = kzalloc_node(sizeof(struct sched_entity),
				8191	GFP_KERNEL, cpu_to_node(i));
				8192	if (!se)
				8193	goto err_free_rq;
				8194
				8195	init_cfs_rq(cfs_rq);
				8196	init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
				8197	init_entity_runnable_average(se);
				8198	}
				8199
				8200	return 1;
				8201
				8202	err_free_rq:
				8203	kfree(cfs_rq);
				8204	err:
				8205	return 0;
				8206	}
				8207
				8208	void unregister_fair_sched_group(struct task_group *tg, int cpu)
				8209	{
				8210	struct rq *rq = cpu_rq(cpu);
				8211	unsigned long flags;
				8212
				8213	/*
				8214	* Only empty task groups can be destroyed; so we can speculatively
				8215	* check on_list without danger of it being re-added.
				8216	*/
				8217	if (!tg->cfs_rq[cpu]->on_list)
				8218	return;
				8219
				8220	raw_spin_lock_irqsave(&rq->lock, flags);
				8221	list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
				8222	raw_spin_unlock_irqrestore(&rq->lock, flags);
				8223	}
				8224
				8225	void init_tg_cfs_entry(struct task_group tg, struct cfs_rq cfs_rq,
				8226	struct sched_entity *se, int cpu,
				8227	struct sched_entity *parent)
				8228	{
				8229	struct rq *rq = cpu_rq(cpu);
				8230
				8231	cfs_rq->tg = tg;
				8232	cfs_rq->rq = rq;
				8233	init_cfs_rq_runtime(cfs_rq);
				8234
				8235	tg->cfs_rq[cpu] = cfs_rq;
				8236	tg->se[cpu] = se;
				8237
				8238	/* se could be NULL for root_task_group */
				8239	if (!se)
				8240	return;
				8241
				8242	if (!parent) {
				8243	se->cfs_rq = &rq->cfs;
				8244	se->depth = 0;
				8245	} else {
				8246	se->cfs_rq = parent->my_q;
				8247	se->depth = parent->depth + 1;
				8248	}
				8249
				8250	se->my_q = cfs_rq;
				8251	/* guarantee group entities always have weight */
				8252	update_load_set(&se->load, NICE_0_LOAD);
				8253	se->parent = parent;
				8254	}
				8255
				8256	static DEFINE_MUTEX(shares_mutex);
				8257
				8258	int sched_group_set_shares(struct task_group *tg, unsigned long shares)
				8259	{
				8260	int i;
				8261	unsigned long flags;
				8262
				8263	/*
				8264	* We can't change the weight of the root cgroup.
				8265	*/
				8266	if (!tg->se[0])
				8267	return -EINVAL;
				8268
				8269	shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
				8270
				8271	mutex_lock(&shares_mutex);
				8272	if (tg->shares == shares)
				8273	goto done;
				8274
				8275	tg->shares = shares;
				8276	for_each_possible_cpu(i) {
				8277	struct rq *rq = cpu_rq(i);
				8278	struct sched_entity *se;
				8279
				8280	se = tg->se[i];
				8281	/* Propagate contribution to hierarchy */
				8282	raw_spin_lock_irqsave(&rq->lock, flags);
				8283
				8284	/* Possible calls to update_curr() need rq clock */
				8285	update_rq_clock(rq);
				8286	for_each_sched_entity(se)
				8287	update_cfs_shares(group_cfs_rq(se));
				8288	raw_spin_unlock_irqrestore(&rq->lock, flags);
				8289	}
				8290
				8291	done:
				8292	mutex_unlock(&shares_mutex);
				8293	return 0;
				8294	}
				8295	#else /* CONFIG_FAIR_GROUP_SCHED */
				8296
				8297	void free_fair_sched_group(struct task_group *tg) { }
				8298
				8299	int alloc_fair_sched_group(struct task_group tg, struct task_group parent)
				8300	{
				8301	return 1;
				8302	}
				8303
				8304	void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
				8305
				8306	#endif /* CONFIG_FAIR_GROUP_SCHED */
				8307
				8308
				8309	static unsigned int get_rr_interval_fair(struct rq rq, struct task_struct task)
				8310	{
				8311	struct sched_entity *se = &task->se;
				8312	unsigned int rr_interval = 0;
				8313
				8314	/*
				8315	* Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
				8316	* idle runqueue:
				8317	*/
				8318	if (rq->cfs.load.weight)
				8319	rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
				8320
				8321	return rr_interval;
				8322	}
				8323
				8324	/*
				8325	* All the scheduling class methods:
				8326	*/
				8327	const struct sched_class fair_sched_class = {
				8328	.next = &idle_sched_class,
				8329	.enqueue_task = enqueue_task_fair,
				8330	.dequeue_task = dequeue_task_fair,
				8331	.yield_task = yield_task_fair,
				8332	.yield_to_task = yield_to_task_fair,
				8333
				8334	.check_preempt_curr = check_preempt_wakeup,
				8335
				8336	.pick_next_task = pick_next_task_fair,
				8337	.put_prev_task = put_prev_task_fair,
				8338
				8339	#ifdef CONFIG_SMP
				8340	.select_task_rq = select_task_rq_fair,
				8341	.migrate_task_rq = migrate_task_rq_fair,
				8342
				8343	.rq_online = rq_online_fair,
				8344	.rq_offline = rq_offline_fair,
				8345
				8346	.task_waking = task_waking_fair,
				8347	.task_dead = task_dead_fair,
				8348	.set_cpus_allowed = set_cpus_allowed_common,
				8349	#endif
				8350
				8351	.set_curr_task = set_curr_task_fair,
				8352	.task_tick = task_tick_fair,
				8353	.task_fork = task_fork_fair,
				8354
				8355	.prio_changed = prio_changed_fair,
				8356	.switched_from = switched_from_fair,
				8357	.switched_to = switched_to_fair,
				8358
				8359	.get_rr_interval = get_rr_interval_fair,
				8360
				8361	.update_curr = update_curr_fair,
				8362
				8363	#ifdef CONFIG_FAIR_GROUP_SCHED
				8364	.task_move_group = task_move_group_fair,
				8365	#endif
				8366	};
				8367
				8368	#ifdef CONFIG_SCHED_DEBUG
				8369	void print_cfs_stats(struct seq_file *m, int cpu)
				8370	{
				8371	struct cfs_rq *cfs_rq;
				8372
				8373	rcu_read_lock();
				8374	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
				8375	print_cfs_rq(m, cpu, cfs_rq);
				8376	rcu_read_unlock();
				8377	}
				8378
				8379	#ifdef CONFIG_NUMA_BALANCING
				8380	void show_numa_stats(struct task_struct p, struct seq_file m)
				8381	{
				8382	int node;
				8383	unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
				8384
				8385	for_each_online_node(node) {
				8386	if (p->numa_faults) {
				8387	tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
				8388	tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
				8389	}
				8390	if (p->numa_group) {
				8391	gsf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 0)],
				8392	gpf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 1)];
				8393	}
				8394	print_numa_stats(m, node, tsf, tpf, gsf, gpf);
				8395	}
				8396	}
				8397	#endif /* CONFIG_NUMA_BALANCING */
				8398	#endif /* CONFIG_SCHED_DEBUG */
				8399
				8400	__init void init_sched_fair_class(void)
				8401	{
				8402	#ifdef CONFIG_SMP
				8403	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
				8404
				8405	#ifdef CONFIG_NO_HZ_COMMON
				8406	nohz.next_balance = jiffies;
				8407	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
				8408	cpu_notifier(sched_ilb_notifier, 0);
				8409	#endif
				8410	#endif /* SMP */
				8411
				8412	}