Blame - net/sched/sch_qfq.c - codeaurora/cp-linux

blob: 8d2d8d953432e83d8523cc42923793eb852f599c [file] [log] [blame]

Kyle Swenson	8d8f654	2021-03-15 11:02:55 -0600	[diff] [blame]	1	/*
				2	* net/sched/sch_qfq.c Quick Fair Queueing Plus Scheduler.
				3	*
				4	* Copyright (c) 2009 Fabio Checconi, Luigi Rizzo, and Paolo Valente.
				5	* Copyright (c) 2012 Paolo Valente.
				6	*
				7	* This program is free software; you can redistribute it and/or
				8	* modify it under the terms of the GNU General Public License
				9	* version 2 as published by the Free Software Foundation.
				10	*/
				11
				12	#include <linux/module.h>
				13	#include <linux/init.h>
				14	#include <linux/bitops.h>
				15	#include <linux/errno.h>
				16	#include <linux/netdevice.h>
				17	#include <linux/pkt_sched.h>
				18	#include <net/sch_generic.h>
				19	#include <net/pkt_sched.h>
				20	#include <net/pkt_cls.h>
				21
				22
				23	/* Quick Fair Queueing Plus
				24	========================
				25
				26	Sources:
				27
				28	[1] Paolo Valente,
				29	"Reducing the Execution Time of Fair-Queueing Schedulers."
				30	http://algo.ing.unimo.it/people/paolo/agg-sched/agg-sched.pdf
				31
				32	Sources for QFQ:
				33
				34	[2] Fabio Checconi, Luigi Rizzo, and Paolo Valente: "QFQ: Efficient
				35	Packet Scheduling with Tight Bandwidth Distribution Guarantees."
				36
				37	See also:
				38	http://retis.sssup.it/~fabio/linux/qfq/
				39	*/
				40
				41	/*
				42
				43	QFQ+ divides classes into aggregates of at most MAX_AGG_CLASSES
				44	classes. Each aggregate is timestamped with a virtual start time S
				45	and a virtual finish time F, and scheduled according to its
				46	timestamps. S and F are computed as a function of a system virtual
				47	time function V. The classes within each aggregate are instead
				48	scheduled with DRR.
				49
				50	To speed up operations, QFQ+ divides also aggregates into a limited
				51	number of groups. Which group a class belongs to depends on the
				52	ratio between the maximum packet length for the class and the weight
				53	of the class. Groups have their own S and F. In the end, QFQ+
				54	schedules groups, then aggregates within groups, then classes within
				55	aggregates. See [1] and [2] for a full description.
				56
				57	Virtual time computations.
				58
				59	S, F and V are all computed in fixed point arithmetic with
				60	FRAC_BITS decimal bits.
				61
				62	QFQ_MAX_INDEX is the maximum index allowed for a group. We need
				63	one bit per index.
				64	QFQ_MAX_WSHIFT is the maximum power of two supported as a weight.
				65
				66	The layout of the bits is as below:
				67
				68	[ MTU_SHIFT ][ FRAC_BITS ]
				69	[ MAX_INDEX ][ MIN_SLOT_SHIFT ]
				70	^.__grp->index = 0
				71	*.__grp->slot_shift
				72
				73	where MIN_SLOT_SHIFT is derived by difference from the others.
				74
				75	The max group index corresponds to Lmax/w_min, where
				76	Lmax=1<<MTU_SHIFT, w_min = 1 .
				77	From this, and knowing how many groups (MAX_INDEX) we want,
				78	we can derive the shift corresponding to each group.
				79
				80	Because we often need to compute
				81	F = S + len/w_i and V = V + len/wsum
				82	instead of storing w_i store the value
				83	inv_w = (1<<FRAC_BITS)/w_i
				84	so we can do F = S + len * inv_w * wsum.
				85	We use W_TOT in the formulas so we can easily move between
				86	static and adaptive weight sum.
				87
				88	The per-scheduler-instance data contain all the data structures
				89	for the scheduler: bitmaps and bucket lists.
				90
				91	*/
				92
				93	/*
				94	* Maximum number of consecutive slots occupied by backlogged classes
				95	* inside a group.
				96	*/
				97	#define QFQ_MAX_SLOTS 32
				98
				99	/*
				100	* Shifts used for aggregate<->group mapping. We allow class weights that are
				101	* in the range [1, 2^MAX_WSHIFT], and we try to map each aggregate i to the
				102	* group with the smallest index that can support the L_i / r_i configured
				103	* for the classes in the aggregate.
				104	*
				105	* grp->index is the index of the group; and grp->slot_shift
				106	* is the shift for the corresponding (scaled) sigma_i.
				107	*/
				108	#define QFQ_MAX_INDEX 24
				109	#define QFQ_MAX_WSHIFT 10
				110
				111	#define QFQ_MAX_WEIGHT (1<<QFQ_MAX_WSHIFT) /* see qfq_slot_insert */
				112	#define QFQ_MAX_WSUM (64*QFQ_MAX_WEIGHT)
				113
				114	#define FRAC_BITS 30 /* fixed point arithmetic */
				115	#define ONE_FP (1UL << FRAC_BITS)
				116
				117	#define QFQ_MTU_SHIFT 16 /* to support TSO/GSO */
				118	#define QFQ_MIN_LMAX 512 /* see qfq_slot_insert */
				119
				120	#define QFQ_MAX_AGG_CLASSES 8 /* max num classes per aggregate allowed */
				121
				122	/*
				123	* Possible group states. These values are used as indexes for the bitmaps
				124	* array of struct qfq_queue.
				125	*/
				126	enum qfq_state { ER, IR, EB, IB, QFQ_MAX_STATE };
				127
				128	struct qfq_group;
				129
				130	struct qfq_aggregate;
				131
				132	struct qfq_class {
				133	struct Qdisc_class_common common;
				134
				135	unsigned int refcnt;
				136	unsigned int filter_cnt;
				137
				138	struct gnet_stats_basic_packed bstats;
				139	struct gnet_stats_queue qstats;
				140	struct gnet_stats_rate_est64 rate_est;
				141	struct Qdisc *qdisc;
				142	struct list_head alist; /* Link for active-classes list. */
				143	struct qfq_aggregate agg; / Parent aggregate. */
				144	int deficit; /* DRR deficit counter. */
				145	};
				146
				147	struct qfq_aggregate {
				148	struct hlist_node next; /* Link for the slot list. */
				149	u64 S, F; /* flow timestamps (exact) */
				150
				151	/* group we belong to. In principle we would need the index,
				152	* which is log_2(lmax/weight), but we never reference it
				153	* directly, only the group.
				154	*/
				155	struct qfq_group *grp;
				156
				157	/* these are copied from the flowset. */
				158	u32 class_weight; /* Weight of each class in this aggregate. */
				159	/* Max pkt size for the classes in this aggregate, DRR quantum. */
				160	int lmax;
				161
				162	u32 inv_w; /* ONE_FP/(sum of weights of classes in aggr.). */
				163	u32 budgetmax; /* Max budget for this aggregate. */
				164	u32 initial_budget, budget; /* Initial and current budget. */
				165
				166	int num_classes; /* Number of classes in this aggr. */
				167	struct list_head active; /* DRR queue of active classes. */
				168
				169	struct hlist_node nonfull_next; /* See nonfull_aggs in qfq_sched. */
				170	};
				171
				172	struct qfq_group {
				173	u64 S, F; /* group timestamps (approx). */
				174	unsigned int slot_shift; /* Slot shift. */
				175	unsigned int index; /* Group index. */
				176	unsigned int front; /* Index of the front slot. */
				177	unsigned long full_slots; /* non-empty slots */
				178
				179	/* Array of RR lists of active aggregates. */
				180	struct hlist_head slots[QFQ_MAX_SLOTS];
				181	};
				182
				183	struct qfq_sched {
				184	struct tcf_proto __rcu *filter_list;
				185	struct Qdisc_class_hash clhash;
				186
				187	u64 oldV, V; /* Precise virtual times. */
				188	struct qfq_aggregate in_serv_agg; / Aggregate being served. */
				189	u32 wsum; /* weight sum */
				190	u32 iwsum; /* inverse weight sum */
				191
				192	unsigned long bitmaps[QFQ_MAX_STATE]; /* Group bitmaps. */
				193	struct qfq_group groups[QFQ_MAX_INDEX + 1]; /* The groups. */
				194	u32 min_slot_shift; /* Index of the group-0 bit in the bitmaps. */
				195
				196	u32 max_agg_classes; /* Max number of classes per aggr. */
				197	struct hlist_head nonfull_aggs; /* Aggs with room for more classes. */
				198	};
				199
				200	/*
				201	* Possible reasons why the timestamps of an aggregate are updated
				202	* enqueue: the aggregate switches from idle to active and must scheduled
				203	* for service
				204	* requeue: the aggregate finishes its budget, so it stops being served and
				205	* must be rescheduled for service
				206	*/
				207	enum update_reason {enqueue, requeue};
				208
				209	static struct qfq_class qfq_find_class(struct Qdisc sch, u32 classid)
				210	{
				211	struct qfq_sched *q = qdisc_priv(sch);
				212	struct Qdisc_class_common *clc;
				213
				214	clc = qdisc_class_find(&q->clhash, classid);
				215	if (clc == NULL)
				216	return NULL;
				217	return container_of(clc, struct qfq_class, common);
				218	}
				219
				220	static void qfq_purge_queue(struct qfq_class *cl)
				221	{
				222	unsigned int len = cl->qdisc->q.qlen;
				223	unsigned int backlog = cl->qdisc->qstats.backlog;
				224
				225	qdisc_reset(cl->qdisc);
				226	qdisc_tree_reduce_backlog(cl->qdisc, len, backlog);
				227	}
				228
				229	static const struct nla_policy qfq_policy[TCA_QFQ_MAX + 1] = {
				230	[TCA_QFQ_WEIGHT] = { .type = NLA_U32 },
				231	[TCA_QFQ_LMAX] = { .type = NLA_U32 },
				232	};
				233
				234	/*
				235	* Calculate a flow index, given its weight and maximum packet length.
				236	* index = log_2(maxlen/weight) but we need to apply the scaling.
				237	* This is used only once at flow creation.
				238	*/
				239	static int qfq_calc_index(u32 inv_w, unsigned int maxlen, u32 min_slot_shift)
				240	{
				241	u64 slot_size = (u64)maxlen * inv_w;
				242	unsigned long size_map;
				243	int index = 0;
				244
				245	size_map = slot_size >> min_slot_shift;
				246	if (!size_map)
				247	goto out;
				248
				249	index = __fls(size_map) + 1; /* basically a log_2 */
				250	index -= !(slot_size - (1ULL << (index + min_slot_shift - 1)));
				251
				252	if (index < 0)
				253	index = 0;
				254	out:
				255	pr_debug("qfq calc_index: W = %lu, L = %u, I = %d\n",
				256	(unsigned long) ONE_FP/inv_w, maxlen, index);
				257
				258	return index;
				259	}
				260
				261	static void qfq_deactivate_agg(struct qfq_sched , struct qfq_aggregate );
				262	static void qfq_activate_agg(struct qfq_sched , struct qfq_aggregate ,
				263	enum update_reason);
				264
				265	static void qfq_init_agg(struct qfq_sched q, struct qfq_aggregate agg,
				266	u32 lmax, u32 weight)
				267	{
				268	INIT_LIST_HEAD(&agg->active);
				269	hlist_add_head(&agg->nonfull_next, &q->nonfull_aggs);
				270
				271	agg->lmax = lmax;
				272	agg->class_weight = weight;
				273	}
				274
				275	static struct qfq_aggregate qfq_find_agg(struct qfq_sched q,
				276	u32 lmax, u32 weight)
				277	{
				278	struct qfq_aggregate *agg;
				279
				280	hlist_for_each_entry(agg, &q->nonfull_aggs, nonfull_next)
				281	if (agg->lmax == lmax && agg->class_weight == weight)
				282	return agg;
				283
				284	return NULL;
				285	}
				286
				287
				288	/* Update aggregate as a function of the new number of classes. */
				289	static void qfq_update_agg(struct qfq_sched q, struct qfq_aggregate agg,
				290	int new_num_classes)
				291	{
				292	u32 new_agg_weight;
				293
				294	if (new_num_classes == q->max_agg_classes)
				295	hlist_del_init(&agg->nonfull_next);
				296
				297	if (agg->num_classes > new_num_classes &&
				298	new_num_classes == q->max_agg_classes - 1) /* agg no more full */
				299	hlist_add_head(&agg->nonfull_next, &q->nonfull_aggs);
				300
				301	/* The next assignment may let
				302	* agg->initial_budget > agg->budgetmax
				303	* hold, we will take it into account in charge_actual_service().
				304	*/
				305	agg->budgetmax = new_num_classes * agg->lmax;
				306	new_agg_weight = agg->class_weight * new_num_classes;
				307	agg->inv_w = ONE_FP/new_agg_weight;
				308
				309	if (agg->grp == NULL) {
				310	int i = qfq_calc_index(agg->inv_w, agg->budgetmax,
				311	q->min_slot_shift);
				312	agg->grp = &q->groups[i];
				313	}
				314
				315	q->wsum +=
				316	(int) agg->class_weight * (new_num_classes - agg->num_classes);
				317	q->iwsum = ONE_FP / q->wsum;
				318
				319	agg->num_classes = new_num_classes;
				320	}
				321
				322	/* Add class to aggregate. */
				323	static void qfq_add_to_agg(struct qfq_sched *q,
				324	struct qfq_aggregate *agg,
				325	struct qfq_class *cl)
				326	{
				327	cl->agg = agg;
				328
				329	qfq_update_agg(q, agg, agg->num_classes+1);
				330	if (cl->qdisc->q.qlen > 0) { /* adding an active class */
				331	list_add_tail(&cl->alist, &agg->active);
				332	if (list_first_entry(&agg->active, struct qfq_class, alist) ==
				333	cl && q->in_serv_agg != agg) /* agg was inactive */
				334	qfq_activate_agg(q, agg, enqueue); /* schedule agg */
				335	}
				336	}
				337
				338	static struct qfq_aggregate qfq_choose_next_agg(struct qfq_sched );
				339
				340	static void qfq_destroy_agg(struct qfq_sched q, struct qfq_aggregate agg)
				341	{
				342	hlist_del_init(&agg->nonfull_next);
				343	q->wsum -= agg->class_weight;
				344	if (q->wsum != 0)
				345	q->iwsum = ONE_FP / q->wsum;
				346
				347	if (q->in_serv_agg == agg)
				348	q->in_serv_agg = qfq_choose_next_agg(q);
				349	kfree(agg);
				350	}
				351
				352	/* Deschedule class from within its parent aggregate. */
				353	static void qfq_deactivate_class(struct qfq_sched q, struct qfq_class cl)
				354	{
				355	struct qfq_aggregate *agg = cl->agg;
				356
				357
				358	list_del(&cl->alist); /* remove from RR queue of the aggregate */
				359	if (list_empty(&agg->active)) /* agg is now inactive */
				360	qfq_deactivate_agg(q, agg);
				361	}
				362
				363	/* Remove class from its parent aggregate. */
				364	static void qfq_rm_from_agg(struct qfq_sched q, struct qfq_class cl)
				365	{
				366	struct qfq_aggregate *agg = cl->agg;
				367
				368	cl->agg = NULL;
				369	if (agg->num_classes == 1) { /* agg being emptied, destroy it */
				370	qfq_destroy_agg(q, agg);
				371	return;
				372	}
				373	qfq_update_agg(q, agg, agg->num_classes-1);
				374	}
				375
				376	/* Deschedule class and remove it from its parent aggregate. */
				377	static void qfq_deact_rm_from_agg(struct qfq_sched q, struct qfq_class cl)
				378	{
				379	if (cl->qdisc->q.qlen > 0) /* class is active */
				380	qfq_deactivate_class(q, cl);
				381
				382	qfq_rm_from_agg(q, cl);
				383	}
				384
				385	/* Move class to a new aggregate, matching the new class weight and/or lmax */
				386	static int qfq_change_agg(struct Qdisc sch, struct qfq_class cl, u32 weight,
				387	u32 lmax)
				388	{
				389	struct qfq_sched *q = qdisc_priv(sch);
				390	struct qfq_aggregate *new_agg = qfq_find_agg(q, lmax, weight);
				391
				392	if (new_agg == NULL) { /* create new aggregate */
				393	new_agg = kzalloc(sizeof(*new_agg), GFP_ATOMIC);
				394	if (new_agg == NULL)
				395	return -ENOBUFS;
				396	qfq_init_agg(q, new_agg, lmax, weight);
				397	}
				398	qfq_deact_rm_from_agg(q, cl);
				399	qfq_add_to_agg(q, new_agg, cl);
				400
				401	return 0;
				402	}
				403
				404	static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
				405	struct nlattr *tca, unsigned long arg)
				406	{
				407	struct qfq_sched *q = qdisc_priv(sch);
				408	struct qfq_class cl = (struct qfq_class )*arg;
				409	bool existing = false;
				410	struct nlattr *tb[TCA_QFQ_MAX + 1];
				411	struct qfq_aggregate *new_agg = NULL;
				412	u32 weight, lmax, inv_w;
				413	int err;
				414	int delta_w;
				415
				416	if (tca[TCA_OPTIONS] == NULL) {
				417	pr_notice("qfq: no options\n");
				418	return -EINVAL;
				419	}
				420
				421	err = nla_parse_nested(tb, TCA_QFQ_MAX, tca[TCA_OPTIONS], qfq_policy);
				422	if (err < 0)
				423	return err;
				424
				425	if (tb[TCA_QFQ_WEIGHT]) {
				426	weight = nla_get_u32(tb[TCA_QFQ_WEIGHT]);
				427	if (!weight \|\| weight > (1UL << QFQ_MAX_WSHIFT)) {
				428	pr_notice("qfq: invalid weight %u\n", weight);
				429	return -EINVAL;
				430	}
				431	} else
				432	weight = 1;
				433
				434	if (tb[TCA_QFQ_LMAX]) {
				435	lmax = nla_get_u32(tb[TCA_QFQ_LMAX]);
				436	if (lmax < QFQ_MIN_LMAX \|\| lmax > (1UL << QFQ_MTU_SHIFT)) {
				437	pr_notice("qfq: invalid max length %u\n", lmax);
				438	return -EINVAL;
				439	}
				440	} else
				441	lmax = psched_mtu(qdisc_dev(sch));
				442
				443	inv_w = ONE_FP / weight;
				444	weight = ONE_FP / inv_w;
				445
				446	if (cl != NULL &&
				447	lmax == cl->agg->lmax &&
				448	weight == cl->agg->class_weight)
				449	return 0; /* nothing to change */
				450
				451	delta_w = weight - (cl ? cl->agg->class_weight : 0);
				452
				453	if (q->wsum + delta_w > QFQ_MAX_WSUM) {
				454	pr_notice("qfq: total weight out of range (%d + %u)\n",
				455	delta_w, q->wsum);
				456	return -EINVAL;
				457	}
				458
				459	if (cl != NULL) { /* modify existing class */
				460	if (tca[TCA_RATE]) {
				461	err = gen_replace_estimator(&cl->bstats, NULL,
				462	&cl->rate_est,
				463	qdisc_root_sleeping_lock(sch),
				464	tca[TCA_RATE]);
				465	if (err)
				466	return err;
				467	}
				468	existing = true;
				469	goto set_change_agg;
				470	}
				471
				472	/* create and init new class */
				473	cl = kzalloc(sizeof(struct qfq_class), GFP_KERNEL);
				474	if (cl == NULL)
				475	return -ENOBUFS;
				476
				477	cl->refcnt = 1;
				478	cl->common.classid = classid;
				479	cl->deficit = lmax;
				480
				481	cl->qdisc = qdisc_create_dflt(sch->dev_queue,
				482	&pfifo_qdisc_ops, classid);
				483	if (cl->qdisc == NULL)
				484	cl->qdisc = &noop_qdisc;
				485
				486	if (tca[TCA_RATE]) {
				487	err = gen_new_estimator(&cl->bstats, NULL,
				488	&cl->rate_est,
				489	qdisc_root_sleeping_lock(sch),
				490	tca[TCA_RATE]);
				491	if (err)
				492	goto destroy_class;
				493	}
				494
				495	sch_tree_lock(sch);
				496	qdisc_class_hash_insert(&q->clhash, &cl->common);
				497	sch_tree_unlock(sch);
				498
				499	qdisc_class_hash_grow(sch, &q->clhash);
				500
				501	set_change_agg:
				502	sch_tree_lock(sch);
				503	new_agg = qfq_find_agg(q, lmax, weight);
				504	if (new_agg == NULL) { /* create new aggregate */
				505	sch_tree_unlock(sch);
				506	new_agg = kzalloc(sizeof(*new_agg), GFP_KERNEL);
				507	if (new_agg == NULL) {
				508	err = -ENOBUFS;
				509	gen_kill_estimator(&cl->bstats, &cl->rate_est);
				510	goto destroy_class;
				511	}
				512	sch_tree_lock(sch);
				513	qfq_init_agg(q, new_agg, lmax, weight);
				514	}
				515	if (existing)
				516	qfq_deact_rm_from_agg(q, cl);
				517	qfq_add_to_agg(q, new_agg, cl);
				518	sch_tree_unlock(sch);
				519
				520	*arg = (unsigned long)cl;
				521	return 0;
				522
				523	destroy_class:
				524	qdisc_destroy(cl->qdisc);
				525	kfree(cl);
				526	return err;
				527	}
				528
				529	static void qfq_destroy_class(struct Qdisc sch, struct qfq_class cl)
				530	{
				531	struct qfq_sched *q = qdisc_priv(sch);
				532
				533	qfq_rm_from_agg(q, cl);
				534	gen_kill_estimator(&cl->bstats, &cl->rate_est);
				535	qdisc_destroy(cl->qdisc);
				536	kfree(cl);
				537	}
				538
				539	static int qfq_delete_class(struct Qdisc *sch, unsigned long arg)
				540	{
				541	struct qfq_sched *q = qdisc_priv(sch);
				542	struct qfq_class cl = (struct qfq_class )arg;
				543
				544	if (cl->filter_cnt > 0)
				545	return -EBUSY;
				546
				547	sch_tree_lock(sch);
				548
				549	qfq_purge_queue(cl);
				550	qdisc_class_hash_remove(&q->clhash, &cl->common);
				551
				552	BUG_ON(--cl->refcnt == 0);
				553	/*
				554	* This shouldn't happen: we "hold" one cops->get() when called
				555	* from tc_ctl_tclass; the destroy method is done from cops->put().
				556	*/
				557
				558	sch_tree_unlock(sch);
				559	return 0;
				560	}
				561
				562	static unsigned long qfq_get_class(struct Qdisc *sch, u32 classid)
				563	{
				564	struct qfq_class *cl = qfq_find_class(sch, classid);
				565
				566	if (cl != NULL)
				567	cl->refcnt++;
				568
				569	return (unsigned long)cl;
				570	}
				571
				572	static void qfq_put_class(struct Qdisc *sch, unsigned long arg)
				573	{
				574	struct qfq_class cl = (struct qfq_class )arg;
				575
				576	if (--cl->refcnt == 0)
				577	qfq_destroy_class(sch, cl);
				578	}
				579
				580	static struct tcf_proto __rcu *qfq_tcf_chain(struct Qdisc sch,
				581	unsigned long cl)
				582	{
				583	struct qfq_sched *q = qdisc_priv(sch);
				584
				585	if (cl)
				586	return NULL;
				587
				588	return &q->filter_list;
				589	}
				590
				591	static unsigned long qfq_bind_tcf(struct Qdisc *sch, unsigned long parent,
				592	u32 classid)
				593	{
				594	struct qfq_class *cl = qfq_find_class(sch, classid);
				595
				596	if (cl != NULL)
				597	cl->filter_cnt++;
				598
				599	return (unsigned long)cl;
				600	}
				601
				602	static void qfq_unbind_tcf(struct Qdisc *sch, unsigned long arg)
				603	{
				604	struct qfq_class cl = (struct qfq_class )arg;
				605
				606	cl->filter_cnt--;
				607	}
				608
				609	static int qfq_graft_class(struct Qdisc *sch, unsigned long arg,
				610	struct Qdisc new, struct Qdisc *old)
				611	{
				612	struct qfq_class cl = (struct qfq_class )arg;
				613
				614	if (new == NULL) {
				615	new = qdisc_create_dflt(sch->dev_queue,
				616	&pfifo_qdisc_ops, cl->common.classid);
				617	if (new == NULL)
				618	new = &noop_qdisc;
				619	}
				620
				621	*old = qdisc_replace(sch, new, &cl->qdisc);
				622	return 0;
				623	}
				624
				625	static struct Qdisc qfq_class_leaf(struct Qdisc sch, unsigned long arg)
				626	{
				627	struct qfq_class cl = (struct qfq_class )arg;
				628
				629	return cl->qdisc;
				630	}
				631
				632	static int qfq_dump_class(struct Qdisc *sch, unsigned long arg,
				633	struct sk_buff skb, struct tcmsg tcm)
				634	{
				635	struct qfq_class cl = (struct qfq_class )arg;
				636	struct nlattr *nest;
				637
				638	tcm->tcm_parent = TC_H_ROOT;
				639	tcm->tcm_handle = cl->common.classid;
				640	tcm->tcm_info = cl->qdisc->handle;
				641
				642	nest = nla_nest_start(skb, TCA_OPTIONS);
				643	if (nest == NULL)
				644	goto nla_put_failure;
				645	if (nla_put_u32(skb, TCA_QFQ_WEIGHT, cl->agg->class_weight) \|\|
				646	nla_put_u32(skb, TCA_QFQ_LMAX, cl->agg->lmax))
				647	goto nla_put_failure;
				648	return nla_nest_end(skb, nest);
				649
				650	nla_put_failure:
				651	nla_nest_cancel(skb, nest);
				652	return -EMSGSIZE;
				653	}
				654
				655	static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
				656	struct gnet_dump *d)
				657	{
				658	struct qfq_class cl = (struct qfq_class )arg;
				659	struct tc_qfq_stats xstats;
				660
				661	memset(&xstats, 0, sizeof(xstats));
				662
				663	xstats.weight = cl->agg->class_weight;
				664	xstats.lmax = cl->agg->lmax;
				665
				666	if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 \|\|
				667	gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 \|\|
				668	gnet_stats_copy_queue(d, NULL,
				669	&cl->qdisc->qstats, cl->qdisc->q.qlen) < 0)
				670	return -1;
				671
				672	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
				673	}
				674
				675	static void qfq_walk(struct Qdisc sch, struct qdisc_walker arg)
				676	{
				677	struct qfq_sched *q = qdisc_priv(sch);
				678	struct qfq_class *cl;
				679	unsigned int i;
				680
				681	if (arg->stop)
				682	return;
				683
				684	for (i = 0; i < q->clhash.hashsize; i++) {
				685	hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
				686	if (arg->count < arg->skip) {
				687	arg->count++;
				688	continue;
				689	}
				690	if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
				691	arg->stop = 1;
				692	return;
				693	}
				694	arg->count++;
				695	}
				696	}
				697	}
				698
				699	static struct qfq_class qfq_classify(struct sk_buff skb, struct Qdisc *sch,
				700	int *qerr)
				701	{
				702	struct qfq_sched *q = qdisc_priv(sch);
				703	struct qfq_class *cl;
				704	struct tcf_result res;
				705	struct tcf_proto *fl;
				706	int result;
				707
				708	if (TC_H_MAJ(skb->priority ^ sch->handle) == 0) {
				709	pr_debug("qfq_classify: found %d\n", skb->priority);
				710	cl = qfq_find_class(sch, skb->priority);
				711	if (cl != NULL)
				712	return cl;
				713	}
				714
				715	*qerr = NET_XMIT_SUCCESS \| __NET_XMIT_BYPASS;
				716	fl = rcu_dereference_bh(q->filter_list);
				717	result = tc_classify(skb, fl, &res, false);
				718	if (result >= 0) {
				719	#ifdef CONFIG_NET_CLS_ACT
				720	switch (result) {
				721	case TC_ACT_QUEUED:
				722	case TC_ACT_STOLEN:
				723	*qerr = NET_XMIT_SUCCESS \| __NET_XMIT_STOLEN;
				724	case TC_ACT_SHOT:
				725	return NULL;
				726	}
				727	#endif
				728	cl = (struct qfq_class *)res.class;
				729	if (cl == NULL)
				730	cl = qfq_find_class(sch, res.classid);
				731	return cl;
				732	}
				733
				734	return NULL;
				735	}
				736
				737	/* Generic comparison function, handling wraparound. */
				738	static inline int qfq_gt(u64 a, u64 b)
				739	{
				740	return (s64)(a - b) > 0;
				741	}
				742
				743	/* Round a precise timestamp to its slotted value. */
				744	static inline u64 qfq_round_down(u64 ts, unsigned int shift)
				745	{
				746	return ts & ~((1ULL << shift) - 1);
				747	}
				748
				749	/* return the pointer to the group with lowest index in the bitmap */
				750	static inline struct qfq_group qfq_ffs(struct qfq_sched q,
				751	unsigned long bitmap)
				752	{
				753	int index = __ffs(bitmap);
				754	return &q->groups[index];
				755	}
				756	/* Calculate a mask to mimic what would be ffs_from(). */
				757	static inline unsigned long mask_from(unsigned long bitmap, int from)
				758	{
				759	return bitmap & ~((1UL << from) - 1);
				760	}
				761
				762	/*
				763	* The state computation relies on ER=0, IR=1, EB=2, IB=3
				764	* First compute eligibility comparing grp->S, q->V,
				765	* then check if someone is blocking us and possibly add EB
				766	*/
				767	static int qfq_calc_state(struct qfq_sched q, const struct qfq_group grp)
				768	{
				769	/* if S > V we are not eligible */
				770	unsigned int state = qfq_gt(grp->S, q->V);
				771	unsigned long mask = mask_from(q->bitmaps[ER], grp->index);
				772	struct qfq_group *next;
				773
				774	if (mask) {
				775	next = qfq_ffs(q, mask);
				776	if (qfq_gt(grp->F, next->F))
				777	state \|= EB;
				778	}
				779
				780	return state;
				781	}
				782
				783
				784	/*
				785	* In principle
				786	* q->bitmaps[dst] \|= q->bitmaps[src] & mask;
				787	* q->bitmaps[src] &= ~mask;
				788	* but we should make sure that src != dst
				789	*/
				790	static inline void qfq_move_groups(struct qfq_sched *q, unsigned long mask,
				791	int src, int dst)
				792	{
				793	q->bitmaps[dst] \|= q->bitmaps[src] & mask;
				794	q->bitmaps[src] &= ~mask;
				795	}
				796
				797	static void qfq_unblock_groups(struct qfq_sched *q, int index, u64 old_F)
				798	{
				799	unsigned long mask = mask_from(q->bitmaps[ER], index + 1);
				800	struct qfq_group *next;
				801
				802	if (mask) {
				803	next = qfq_ffs(q, mask);
				804	if (!qfq_gt(next->F, old_F))
				805	return;
				806	}
				807
				808	mask = (1UL << index) - 1;
				809	qfq_move_groups(q, mask, EB, ER);
				810	qfq_move_groups(q, mask, IB, IR);
				811	}
				812
				813	/*
				814	* perhaps
				815	*
				816	old_V ^= q->V;
				817	old_V >>= q->min_slot_shift;
				818	if (old_V) {
				819	...
				820	}
				821	*
				822	*/
				823	static void qfq_make_eligible(struct qfq_sched *q)
				824	{
				825	unsigned long vslot = q->V >> q->min_slot_shift;
				826	unsigned long old_vslot = q->oldV >> q->min_slot_shift;
				827
				828	if (vslot != old_vslot) {
				829	unsigned long mask;
				830	int last_flip_pos = fls(vslot ^ old_vslot);
				831
				832	if (last_flip_pos > 31) /* higher than the number of groups */
				833	mask = ~0UL; /* make all groups eligible */
				834	else
				835	mask = (1UL << last_flip_pos) - 1;
				836
				837	qfq_move_groups(q, mask, IR, ER);
				838	qfq_move_groups(q, mask, IB, EB);
				839	}
				840	}
				841
				842	/*
				843	* The index of the slot in which the input aggregate agg is to be
				844	* inserted must not be higher than QFQ_MAX_SLOTS-2. There is a '-2'
				845	* and not a '-1' because the start time of the group may be moved
				846	* backward by one slot after the aggregate has been inserted, and
				847	* this would cause non-empty slots to be right-shifted by one
				848	* position.
				849	*
				850	* QFQ+ fully satisfies this bound to the slot index if the parameters
				851	* of the classes are not changed dynamically, and if QFQ+ never
				852	* happens to postpone the service of agg unjustly, i.e., it never
				853	* happens that the aggregate becomes backlogged and eligible, or just
				854	* eligible, while an aggregate with a higher approximated finish time
				855	* is being served. In particular, in this case QFQ+ guarantees that
				856	* the timestamps of agg are low enough that the slot index is never
				857	* higher than 2. Unfortunately, QFQ+ cannot provide the same
				858	* guarantee if it happens to unjustly postpone the service of agg, or
				859	* if the parameters of some class are changed.
				860	*
				861	* As for the first event, i.e., an out-of-order service, the
				862	* upper bound to the slot index guaranteed by QFQ+ grows to
				863	* 2 +
				864	* QFQ_MAX_AGG_CLASSES * ((1<<QFQ_MTU_SHIFT)/QFQ_MIN_LMAX) *
				865	* (current_max_weight/current_wsum) <= 2 + 8 * 128 * 1.
				866	*
				867	* The following function deals with this problem by backward-shifting
				868	* the timestamps of agg, if needed, so as to guarantee that the slot
				869	* index is never higher than QFQ_MAX_SLOTS-2. This backward-shift may
				870	* cause the service of other aggregates to be postponed, yet the
				871	* worst-case guarantees of these aggregates are not violated. In
				872	* fact, in case of no out-of-order service, the timestamps of agg
				873	* would have been even lower than they are after the backward shift,
				874	* because QFQ+ would have guaranteed a maximum value equal to 2 for
				875	* the slot index, and 2 < QFQ_MAX_SLOTS-2. Hence the aggregates whose
				876	* service is postponed because of the backward-shift would have
				877	* however waited for the service of agg before being served.
				878	*
				879	* The other event that may cause the slot index to be higher than 2
				880	* for agg is a recent change of the parameters of some class. If the
				881	* weight of a class is increased or the lmax (max_pkt_size) of the
				882	* class is decreased, then a new aggregate with smaller slot size
				883	* than the original parent aggregate of the class may happen to be
				884	* activated. The activation of this aggregate should be properly
				885	* delayed to when the service of the class has finished in the ideal
				886	* system tracked by QFQ+. If the activation of the aggregate is not
				887	* delayed to this reference time instant, then this aggregate may be
				888	* unjustly served before other aggregates waiting for service. This
				889	* may cause the above bound to the slot index to be violated for some
				890	* of these unlucky aggregates.
				891	*
				892	* Instead of delaying the activation of the new aggregate, which is
				893	* quite complex, the above-discussed capping of the slot index is
				894	* used to handle also the consequences of a change of the parameters
				895	* of a class.
				896	*/
				897	static void qfq_slot_insert(struct qfq_group grp, struct qfq_aggregate agg,
				898	u64 roundedS)
				899	{
				900	u64 slot = (roundedS - grp->S) >> grp->slot_shift;
				901	unsigned int i; /* slot index in the bucket list */
				902
				903	if (unlikely(slot > QFQ_MAX_SLOTS - 2)) {
				904	u64 deltaS = roundedS - grp->S -
				905	((u64)(QFQ_MAX_SLOTS - 2)<<grp->slot_shift);
				906	agg->S -= deltaS;
				907	agg->F -= deltaS;
				908	slot = QFQ_MAX_SLOTS - 2;
				909	}
				910
				911	i = (grp->front + slot) % QFQ_MAX_SLOTS;
				912
				913	hlist_add_head(&agg->next, &grp->slots[i]);
				914	__set_bit(slot, &grp->full_slots);
				915	}
				916
				917	/* Maybe introduce hlist_first_entry?? */
				918	static struct qfq_aggregate qfq_slot_head(struct qfq_group grp)
				919	{
				920	return hlist_entry(grp->slots[grp->front].first,
				921	struct qfq_aggregate, next);
				922	}
				923
				924	/*
				925	* remove the entry from the slot
				926	*/
				927	static void qfq_front_slot_remove(struct qfq_group *grp)
				928	{
				929	struct qfq_aggregate *agg = qfq_slot_head(grp);
				930
				931	BUG_ON(!agg);
				932	hlist_del(&agg->next);
				933	if (hlist_empty(&grp->slots[grp->front]))
				934	__clear_bit(0, &grp->full_slots);
				935	}
				936
				937	/*
				938	* Returns the first aggregate in the first non-empty bucket of the
				939	* group. As a side effect, adjusts the bucket list so the first
				940	* non-empty bucket is at position 0 in full_slots.
				941	*/
				942	static struct qfq_aggregate qfq_slot_scan(struct qfq_group grp)
				943	{
				944	unsigned int i;
				945
				946	pr_debug("qfq slot_scan: grp %u full %#lx\n",
				947	grp->index, grp->full_slots);
				948
				949	if (grp->full_slots == 0)
				950	return NULL;
				951
				952	i = __ffs(grp->full_slots); /* zero based */
				953	if (i > 0) {
				954	grp->front = (grp->front + i) % QFQ_MAX_SLOTS;
				955	grp->full_slots >>= i;
				956	}
				957
				958	return qfq_slot_head(grp);
				959	}
				960
				961	/*
				962	* adjust the bucket list. When the start time of a group decreases,
				963	* we move the index down (modulo QFQ_MAX_SLOTS) so we don't need to
				964	* move the objects. The mask of occupied slots must be shifted
				965	* because we use ffs() to find the first non-empty slot.
				966	* This covers decreases in the group's start time, but what about
				967	* increases of the start time ?
				968	* Here too we should make sure that i is less than 32
				969	*/
				970	static void qfq_slot_rotate(struct qfq_group *grp, u64 roundedS)
				971	{
				972	unsigned int i = (grp->S - roundedS) >> grp->slot_shift;
				973
				974	grp->full_slots <<= i;
				975	grp->front = (grp->front - i) % QFQ_MAX_SLOTS;
				976	}
				977
				978	static void qfq_update_eligible(struct qfq_sched *q)
				979	{
				980	struct qfq_group *grp;
				981	unsigned long ineligible;
				982
				983	ineligible = q->bitmaps[IR] \| q->bitmaps[IB];
				984	if (ineligible) {
				985	if (!q->bitmaps[ER]) {
				986	grp = qfq_ffs(q, ineligible);
				987	if (qfq_gt(grp->S, q->V))
				988	q->V = grp->S;
				989	}
				990	qfq_make_eligible(q);
				991	}
				992	}
				993
				994	/* Dequeue head packet of the head class in the DRR queue of the aggregate. */
				995	static void agg_dequeue(struct qfq_aggregate *agg,
				996	struct qfq_class *cl, unsigned int len)
				997	{
				998	qdisc_dequeue_peeked(cl->qdisc);
				999
				1000	cl->deficit -= (int) len;
				1001
				1002	if (cl->qdisc->q.qlen == 0) /* no more packets, remove from list */
				1003	list_del(&cl->alist);
				1004	else if (cl->deficit < qdisc_pkt_len(cl->qdisc->ops->peek(cl->qdisc))) {
				1005	cl->deficit += agg->lmax;
				1006	list_move_tail(&cl->alist, &agg->active);
				1007	}
				1008	}
				1009
				1010	static inline struct sk_buff qfq_peek_skb(struct qfq_aggregate agg,
				1011	struct qfq_class **cl,
				1012	unsigned int *len)
				1013	{
				1014	struct sk_buff *skb;
				1015
				1016	*cl = list_first_entry(&agg->active, struct qfq_class, alist);
				1017	skb = (cl)->qdisc->ops->peek((cl)->qdisc);
				1018	if (skb == NULL)
				1019	WARN_ONCE(1, "qfq_dequeue: non-workconserving leaf\n");
				1020	else
				1021	*len = qdisc_pkt_len(skb);
				1022
				1023	return skb;
				1024	}
				1025
				1026	/* Update F according to the actual service received by the aggregate. */
				1027	static inline void charge_actual_service(struct qfq_aggregate *agg)
				1028	{
				1029	/* Compute the service received by the aggregate, taking into
				1030	* account that, after decreasing the number of classes in
				1031	* agg, it may happen that
				1032	* agg->initial_budget - agg->budget > agg->bugdetmax
				1033	*/
				1034	u32 service_received = min(agg->budgetmax,
				1035	agg->initial_budget - agg->budget);
				1036
				1037	agg->F = agg->S + (u64)service_received * agg->inv_w;
				1038	}
				1039
				1040	/* Assign a reasonable start time for a new aggregate in group i.
				1041	* Admissible values for \hat(F) are multiples of \sigma_i
				1042	* no greater than V+\sigma_i . Larger values mean that
				1043	* we had a wraparound so we consider the timestamp to be stale.
				1044	*
				1045	* If F is not stale and F >= V then we set S = F.
				1046	* Otherwise we should assign S = V, but this may violate
				1047	* the ordering in EB (see [2]). So, if we have groups in ER,
				1048	* set S to the F_j of the first group j which would be blocking us.
				1049	* We are guaranteed not to move S backward because
				1050	* otherwise our group i would still be blocked.
				1051	*/
				1052	static void qfq_update_start(struct qfq_sched q, struct qfq_aggregate agg)
				1053	{
				1054	unsigned long mask;
				1055	u64 limit, roundedF;
				1056	int slot_shift = agg->grp->slot_shift;
				1057
				1058	roundedF = qfq_round_down(agg->F, slot_shift);
				1059	limit = qfq_round_down(q->V, slot_shift) + (1ULL << slot_shift);
				1060
				1061	if (!qfq_gt(agg->F, q->V) \|\| qfq_gt(roundedF, limit)) {
				1062	/* timestamp was stale */
				1063	mask = mask_from(q->bitmaps[ER], agg->grp->index);
				1064	if (mask) {
				1065	struct qfq_group *next = qfq_ffs(q, mask);
				1066	if (qfq_gt(roundedF, next->F)) {
				1067	if (qfq_gt(limit, next->F))
				1068	agg->S = next->F;
				1069	else /* preserve timestamp correctness */
				1070	agg->S = limit;
				1071	return;
				1072	}
				1073	}
				1074	agg->S = q->V;
				1075	} else /* timestamp is not stale */
				1076	agg->S = agg->F;
				1077	}
				1078
				1079	/* Update the timestamps of agg before scheduling/rescheduling it for
				1080	* service. In particular, assign to agg->F its maximum possible
				1081	* value, i.e., the virtual finish time with which the aggregate
				1082	* should be labeled if it used all its budget once in service.
				1083	*/
				1084	static inline void
				1085	qfq_update_agg_ts(struct qfq_sched *q,
				1086	struct qfq_aggregate *agg, enum update_reason reason)
				1087	{
				1088	if (reason != requeue)
				1089	qfq_update_start(q, agg);
				1090	else /* just charge agg for the service received */
				1091	agg->S = agg->F;
				1092
				1093	agg->F = agg->S + (u64)agg->budgetmax * agg->inv_w;
				1094	}
				1095
				1096	static void qfq_schedule_agg(struct qfq_sched q, struct qfq_aggregate agg);
				1097
				1098	static struct sk_buff qfq_dequeue(struct Qdisc sch)
				1099	{
				1100	struct qfq_sched *q = qdisc_priv(sch);
				1101	struct qfq_aggregate *in_serv_agg = q->in_serv_agg;
				1102	struct qfq_class *cl;
				1103	struct sk_buff *skb = NULL;
				1104	/* next-packet len, 0 means no more active classes in in-service agg */
				1105	unsigned int len = 0;
				1106
				1107	if (in_serv_agg == NULL)
				1108	return NULL;
				1109
				1110	if (!list_empty(&in_serv_agg->active))
				1111	skb = qfq_peek_skb(in_serv_agg, &cl, &len);
				1112
				1113	/*
				1114	* If there are no active classes in the in-service aggregate,
				1115	* or if the aggregate has not enough budget to serve its next
				1116	* class, then choose the next aggregate to serve.
				1117	*/
				1118	if (len == 0 \|\| in_serv_agg->budget < len) {
				1119	charge_actual_service(in_serv_agg);
				1120
				1121	/* recharge the budget of the aggregate */
				1122	in_serv_agg->initial_budget = in_serv_agg->budget =
				1123	in_serv_agg->budgetmax;
				1124
				1125	if (!list_empty(&in_serv_agg->active)) {
				1126	/*
				1127	* Still active: reschedule for
				1128	* service. Possible optimization: if no other
				1129	* aggregate is active, then there is no point
				1130	* in rescheduling this aggregate, and we can
				1131	* just keep it as the in-service one. This
				1132	* should be however a corner case, and to
				1133	* handle it, we would need to maintain an
				1134	* extra num_active_aggs field.
				1135	*/
				1136	qfq_update_agg_ts(q, in_serv_agg, requeue);
				1137	qfq_schedule_agg(q, in_serv_agg);
				1138	} else if (sch->q.qlen == 0) { /* no aggregate to serve */
				1139	q->in_serv_agg = NULL;
				1140	return NULL;
				1141	}
				1142
				1143	/*
				1144	* If we get here, there are other aggregates queued:
				1145	* choose the new aggregate to serve.
				1146	*/
				1147	in_serv_agg = q->in_serv_agg = qfq_choose_next_agg(q);
				1148	skb = qfq_peek_skb(in_serv_agg, &cl, &len);
				1149	}
				1150	if (!skb)
				1151	return NULL;
				1152
				1153	sch->q.qlen--;
				1154	qdisc_bstats_update(sch, skb);
				1155
				1156	agg_dequeue(in_serv_agg, cl, len);
				1157	/* If lmax is lowered, through qfq_change_class, for a class
				1158	* owning pending packets with larger size than the new value
				1159	* of lmax, then the following condition may hold.
				1160	*/
				1161	if (unlikely(in_serv_agg->budget < len))
				1162	in_serv_agg->budget = 0;
				1163	else
				1164	in_serv_agg->budget -= len;
				1165
				1166	q->V += (u64)len * q->iwsum;
				1167	pr_debug("qfq dequeue: len %u F %lld now %lld\n",
				1168	len, (unsigned long long) in_serv_agg->F,
				1169	(unsigned long long) q->V);
				1170
				1171	return skb;
				1172	}
				1173
				1174	static struct qfq_aggregate qfq_choose_next_agg(struct qfq_sched q)
				1175	{
				1176	struct qfq_group *grp;
				1177	struct qfq_aggregate agg, new_front_agg;
				1178	u64 old_F;
				1179
				1180	qfq_update_eligible(q);
				1181	q->oldV = q->V;
				1182
				1183	if (!q->bitmaps[ER])
				1184	return NULL;
				1185
				1186	grp = qfq_ffs(q, q->bitmaps[ER]);
				1187	old_F = grp->F;
				1188
				1189	agg = qfq_slot_head(grp);
				1190
				1191	/* agg starts to be served, remove it from schedule */
				1192	qfq_front_slot_remove(grp);
				1193
				1194	new_front_agg = qfq_slot_scan(grp);
				1195
				1196	if (new_front_agg == NULL) /* group is now inactive, remove from ER */
				1197	__clear_bit(grp->index, &q->bitmaps[ER]);
				1198	else {
				1199	u64 roundedS = qfq_round_down(new_front_agg->S,
				1200	grp->slot_shift);
				1201	unsigned int s;
				1202
				1203	if (grp->S == roundedS)
				1204	return agg;
				1205	grp->S = roundedS;
				1206	grp->F = roundedS + (2ULL << grp->slot_shift);
				1207	__clear_bit(grp->index, &q->bitmaps[ER]);
				1208	s = qfq_calc_state(q, grp);
				1209	__set_bit(grp->index, &q->bitmaps[s]);
				1210	}
				1211
				1212	qfq_unblock_groups(q, grp->index, old_F);
				1213
				1214	return agg;
				1215	}
				1216
				1217	static int qfq_enqueue(struct sk_buff skb, struct Qdisc sch)
				1218	{
				1219	struct qfq_sched *q = qdisc_priv(sch);
				1220	struct qfq_class *cl;
				1221	struct qfq_aggregate *agg;
				1222	int err = 0;
				1223
				1224	cl = qfq_classify(skb, sch, &err);
				1225	if (cl == NULL) {
				1226	if (err & __NET_XMIT_BYPASS)
				1227	qdisc_qstats_drop(sch);
				1228	kfree_skb(skb);
				1229	return err;
				1230	}
				1231	pr_debug("qfq_enqueue: cl = %x\n", cl->common.classid);
				1232
				1233	if (unlikely(cl->agg->lmax < qdisc_pkt_len(skb))) {
				1234	pr_debug("qfq: increasing maxpkt from %u to %u for class %u",
				1235	cl->agg->lmax, qdisc_pkt_len(skb), cl->common.classid);
				1236	err = qfq_change_agg(sch, cl, cl->agg->class_weight,
				1237	qdisc_pkt_len(skb));
				1238	if (err)
				1239	return err;
				1240	}
				1241
				1242	err = qdisc_enqueue(skb, cl->qdisc);
				1243	if (unlikely(err != NET_XMIT_SUCCESS)) {
				1244	pr_debug("qfq_enqueue: enqueue failed %d\n", err);
				1245	if (net_xmit_drop_count(err)) {
				1246	cl->qstats.drops++;
				1247	qdisc_qstats_drop(sch);
				1248	}
				1249	return err;
				1250	}
				1251
				1252	bstats_update(&cl->bstats, skb);
				1253	++sch->q.qlen;
				1254
				1255	agg = cl->agg;
				1256	/* if the queue was not empty, then done here */
				1257	if (cl->qdisc->q.qlen != 1) {
				1258	if (unlikely(skb == cl->qdisc->ops->peek(cl->qdisc)) &&
				1259	list_first_entry(&agg->active, struct qfq_class, alist)
				1260	== cl && cl->deficit < qdisc_pkt_len(skb))
				1261	list_move_tail(&cl->alist, &agg->active);
				1262
				1263	return err;
				1264	}
				1265
				1266	/* schedule class for service within the aggregate */
				1267	cl->deficit = agg->lmax;
				1268	list_add_tail(&cl->alist, &agg->active);
				1269
				1270	if (list_first_entry(&agg->active, struct qfq_class, alist) != cl \|\|
				1271	q->in_serv_agg == agg)
				1272	return err; /* non-empty or in service, nothing else to do */
				1273
				1274	qfq_activate_agg(q, agg, enqueue);
				1275
				1276	return err;
				1277	}
				1278
				1279	/*
				1280	* Schedule aggregate according to its timestamps.
				1281	*/
				1282	static void qfq_schedule_agg(struct qfq_sched q, struct qfq_aggregate agg)
				1283	{
				1284	struct qfq_group *grp = agg->grp;
				1285	u64 roundedS;
				1286	int s;
				1287
				1288	roundedS = qfq_round_down(agg->S, grp->slot_shift);
				1289
				1290	/*
				1291	* Insert agg in the correct bucket.
				1292	* If agg->S >= grp->S we don't need to adjust the
				1293	* bucket list and simply go to the insertion phase.
				1294	* Otherwise grp->S is decreasing, we must make room
				1295	* in the bucket list, and also recompute the group state.
				1296	* Finally, if there were no flows in this group and nobody
				1297	* was in ER make sure to adjust V.
				1298	*/
				1299	if (grp->full_slots) {
				1300	if (!qfq_gt(grp->S, agg->S))
				1301	goto skip_update;
				1302
				1303	/* create a slot for this agg->S */
				1304	qfq_slot_rotate(grp, roundedS);
				1305	/* group was surely ineligible, remove */
				1306	__clear_bit(grp->index, &q->bitmaps[IR]);
				1307	__clear_bit(grp->index, &q->bitmaps[IB]);
				1308	} else if (!q->bitmaps[ER] && qfq_gt(roundedS, q->V) &&
				1309	q->in_serv_agg == NULL)
				1310	q->V = roundedS;
				1311
				1312	grp->S = roundedS;
				1313	grp->F = roundedS + (2ULL << grp->slot_shift);
				1314	s = qfq_calc_state(q, grp);
				1315	__set_bit(grp->index, &q->bitmaps[s]);
				1316
				1317	pr_debug("qfq enqueue: new state %d %#lx S %lld F %lld V %lld\n",
				1318	s, q->bitmaps[s],
				1319	(unsigned long long) agg->S,
				1320	(unsigned long long) agg->F,
				1321	(unsigned long long) q->V);
				1322
				1323	skip_update:
				1324	qfq_slot_insert(grp, agg, roundedS);
				1325	}
				1326
				1327
				1328	/* Update agg ts and schedule agg for service */
				1329	static void qfq_activate_agg(struct qfq_sched q, struct qfq_aggregate agg,
				1330	enum update_reason reason)
				1331	{
				1332	agg->initial_budget = agg->budget = agg->budgetmax; /* recharge budg. */
				1333
				1334	qfq_update_agg_ts(q, agg, reason);
				1335	if (q->in_serv_agg == NULL) { /* no aggr. in service or scheduled */
				1336	q->in_serv_agg = agg; /* start serving this aggregate */
				1337	/* update V: to be in service, agg must be eligible */
				1338	q->oldV = q->V = agg->S;
				1339	} else if (agg != q->in_serv_agg)
				1340	qfq_schedule_agg(q, agg);
				1341	}
				1342
				1343	static void qfq_slot_remove(struct qfq_sched q, struct qfq_group grp,
				1344	struct qfq_aggregate *agg)
				1345	{
				1346	unsigned int i, offset;
				1347	u64 roundedS;
				1348
				1349	roundedS = qfq_round_down(agg->S, grp->slot_shift);
				1350	offset = (roundedS - grp->S) >> grp->slot_shift;
				1351
				1352	i = (grp->front + offset) % QFQ_MAX_SLOTS;
				1353
				1354	hlist_del(&agg->next);
				1355	if (hlist_empty(&grp->slots[i]))
				1356	__clear_bit(offset, &grp->full_slots);
				1357	}
				1358
				1359	/*
				1360	* Called to forcibly deschedule an aggregate. If the aggregate is
				1361	* not in the front bucket, or if the latter has other aggregates in
				1362	* the front bucket, we can simply remove the aggregate with no other
				1363	* side effects.
				1364	* Otherwise we must propagate the event up.
				1365	*/
				1366	static void qfq_deactivate_agg(struct qfq_sched q, struct qfq_aggregate agg)
				1367	{
				1368	struct qfq_group *grp = agg->grp;
				1369	unsigned long mask;
				1370	u64 roundedS;
				1371	int s;
				1372
				1373	if (agg == q->in_serv_agg) {
				1374	charge_actual_service(agg);
				1375	q->in_serv_agg = qfq_choose_next_agg(q);
				1376	return;
				1377	}
				1378
				1379	agg->F = agg->S;
				1380	qfq_slot_remove(q, grp, agg);
				1381
				1382	if (!grp->full_slots) {
				1383	__clear_bit(grp->index, &q->bitmaps[IR]);
				1384	__clear_bit(grp->index, &q->bitmaps[EB]);
				1385	__clear_bit(grp->index, &q->bitmaps[IB]);
				1386
				1387	if (test_bit(grp->index, &q->bitmaps[ER]) &&
				1388	!(q->bitmaps[ER] & ~((1UL << grp->index) - 1))) {
				1389	mask = q->bitmaps[ER] & ((1UL << grp->index) - 1);
				1390	if (mask)
				1391	mask = ~((1UL << __fls(mask)) - 1);
				1392	else
				1393	mask = ~0UL;
				1394	qfq_move_groups(q, mask, EB, ER);
				1395	qfq_move_groups(q, mask, IB, IR);
				1396	}
				1397	__clear_bit(grp->index, &q->bitmaps[ER]);
				1398	} else if (hlist_empty(&grp->slots[grp->front])) {
				1399	agg = qfq_slot_scan(grp);
				1400	roundedS = qfq_round_down(agg->S, grp->slot_shift);
				1401	if (grp->S != roundedS) {
				1402	__clear_bit(grp->index, &q->bitmaps[ER]);
				1403	__clear_bit(grp->index, &q->bitmaps[IR]);
				1404	__clear_bit(grp->index, &q->bitmaps[EB]);
				1405	__clear_bit(grp->index, &q->bitmaps[IB]);
				1406	grp->S = roundedS;
				1407	grp->F = roundedS + (2ULL << grp->slot_shift);
				1408	s = qfq_calc_state(q, grp);
				1409	__set_bit(grp->index, &q->bitmaps[s]);
				1410	}
				1411	}
				1412	}
				1413
				1414	static void qfq_qlen_notify(struct Qdisc *sch, unsigned long arg)
				1415	{
				1416	struct qfq_sched *q = qdisc_priv(sch);
				1417	struct qfq_class cl = (struct qfq_class )arg;
				1418
				1419	if (cl->qdisc->q.qlen == 0)
				1420	qfq_deactivate_class(q, cl);
				1421	}
				1422
				1423	static unsigned int qfq_drop_from_slot(struct qfq_sched *q,
				1424	struct hlist_head *slot)
				1425	{
				1426	struct qfq_aggregate *agg;
				1427	struct qfq_class *cl;
				1428	unsigned int len;
				1429
				1430	hlist_for_each_entry(agg, slot, next) {
				1431	list_for_each_entry(cl, &agg->active, alist) {
				1432
				1433	if (!cl->qdisc->ops->drop)
				1434	continue;
				1435
				1436	len = cl->qdisc->ops->drop(cl->qdisc);
				1437	if (len > 0) {
				1438	if (cl->qdisc->q.qlen == 0)
				1439	qfq_deactivate_class(q, cl);
				1440
				1441	return len;
				1442	}
				1443	}
				1444	}
				1445	return 0;
				1446	}
				1447
				1448	static unsigned int qfq_drop(struct Qdisc *sch)
				1449	{
				1450	struct qfq_sched *q = qdisc_priv(sch);
				1451	struct qfq_group *grp;
				1452	unsigned int i, j, len;
				1453
				1454	for (i = 0; i <= QFQ_MAX_INDEX; i++) {
				1455	grp = &q->groups[i];
				1456	for (j = 0; j < QFQ_MAX_SLOTS; j++) {
				1457	len = qfq_drop_from_slot(q, &grp->slots[j]);
				1458	if (len > 0) {
				1459	sch->q.qlen--;
				1460	return len;
				1461	}
				1462	}
				1463
				1464	}
				1465
				1466	return 0;
				1467	}
				1468
				1469	static int qfq_init_qdisc(struct Qdisc sch, struct nlattr opt)
				1470	{
				1471	struct qfq_sched *q = qdisc_priv(sch);
				1472	struct qfq_group *grp;
				1473	int i, j, err;
				1474	u32 max_cl_shift, maxbudg_shift, max_classes;
				1475
				1476	err = qdisc_class_hash_init(&q->clhash);
				1477	if (err < 0)
				1478	return err;
				1479
				1480	if (qdisc_dev(sch)->tx_queue_len + 1 > QFQ_MAX_AGG_CLASSES)
				1481	max_classes = QFQ_MAX_AGG_CLASSES;
				1482	else
				1483	max_classes = qdisc_dev(sch)->tx_queue_len + 1;
				1484	/* max_cl_shift = floor(log_2(max_classes)) */
				1485	max_cl_shift = __fls(max_classes);
				1486	q->max_agg_classes = 1<<max_cl_shift;
				1487
				1488	/* maxbudg_shift = log2(max_len * max_classes_per_agg) */
				1489	maxbudg_shift = QFQ_MTU_SHIFT + max_cl_shift;
				1490	q->min_slot_shift = FRAC_BITS + maxbudg_shift - QFQ_MAX_INDEX;
				1491
				1492	for (i = 0; i <= QFQ_MAX_INDEX; i++) {
				1493	grp = &q->groups[i];
				1494	grp->index = i;
				1495	grp->slot_shift = q->min_slot_shift + i;
				1496	for (j = 0; j < QFQ_MAX_SLOTS; j++)
				1497	INIT_HLIST_HEAD(&grp->slots[j]);
				1498	}
				1499
				1500	INIT_HLIST_HEAD(&q->nonfull_aggs);
				1501
				1502	return 0;
				1503	}
				1504
				1505	static void qfq_reset_qdisc(struct Qdisc *sch)
				1506	{
				1507	struct qfq_sched *q = qdisc_priv(sch);
				1508	struct qfq_class *cl;
				1509	unsigned int i;
				1510
				1511	for (i = 0; i < q->clhash.hashsize; i++) {
				1512	hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
				1513	if (cl->qdisc->q.qlen > 0)
				1514	qfq_deactivate_class(q, cl);
				1515
				1516	qdisc_reset(cl->qdisc);
				1517	}
				1518	}
				1519	sch->q.qlen = 0;
				1520	}
				1521
				1522	static void qfq_destroy_qdisc(struct Qdisc *sch)
				1523	{
				1524	struct qfq_sched *q = qdisc_priv(sch);
				1525	struct qfq_class *cl;
				1526	struct hlist_node *next;
				1527	unsigned int i;
				1528
				1529	tcf_destroy_chain(&q->filter_list);
				1530
				1531	for (i = 0; i < q->clhash.hashsize; i++) {
				1532	hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
				1533	common.hnode) {
				1534	qfq_destroy_class(sch, cl);
				1535	}
				1536	}
				1537	qdisc_class_hash_destroy(&q->clhash);
				1538	}
				1539
				1540	static const struct Qdisc_class_ops qfq_class_ops = {
				1541	.change = qfq_change_class,
				1542	.delete = qfq_delete_class,
				1543	.get = qfq_get_class,
				1544	.put = qfq_put_class,
				1545	.tcf_chain = qfq_tcf_chain,
				1546	.bind_tcf = qfq_bind_tcf,
				1547	.unbind_tcf = qfq_unbind_tcf,
				1548	.graft = qfq_graft_class,
				1549	.leaf = qfq_class_leaf,
				1550	.qlen_notify = qfq_qlen_notify,
				1551	.dump = qfq_dump_class,
				1552	.dump_stats = qfq_dump_class_stats,
				1553	.walk = qfq_walk,
				1554	};
				1555
				1556	static struct Qdisc_ops qfq_qdisc_ops __read_mostly = {
				1557	.cl_ops = &qfq_class_ops,
				1558	.id = "qfq",
				1559	.priv_size = sizeof(struct qfq_sched),
				1560	.enqueue = qfq_enqueue,
				1561	.dequeue = qfq_dequeue,
				1562	.peek = qdisc_peek_dequeued,
				1563	.drop = qfq_drop,
				1564	.init = qfq_init_qdisc,
				1565	.reset = qfq_reset_qdisc,
				1566	.destroy = qfq_destroy_qdisc,
				1567	.owner = THIS_MODULE,
				1568	};
				1569
				1570	static int __init qfq_init(void)
				1571	{
				1572	return register_qdisc(&qfq_qdisc_ops);
				1573	}
				1574
				1575	static void __exit qfq_exit(void)
				1576	{
				1577	unregister_qdisc(&qfq_qdisc_ops);
				1578	}
				1579
				1580	module_init(qfq_init);
				1581	module_exit(qfq_exit);
				1582	MODULE_LICENSE("GPL");