Blame - net/sched/sch_netem.c - codeaurora/cp-linux

blob: b7c29d5b6f04e56df0107fdd00fb7d7803aadfe9 [file] [log] [blame]

Kyle Swenson	8d8f654	2021-03-15 11:02:55 -0600	[diff] [blame]	1	/*
				2	* net/sched/sch_netem.c Network emulator
				3	*
				4	* This program is free software; you can redistribute it and/or
				5	* modify it under the terms of the GNU General Public License
				6	* as published by the Free Software Foundation; either version
				7	* 2 of the License.
				8	*
				9	* Many of the algorithms and ideas for this came from
				10	* NIST Net which is not copyrighted.
				11	*
				12	* Authors: Stephen Hemminger <shemminger@osdl.org>
				13	* Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
				14	*/
				15
				16	#include <linux/mm.h>
				17	#include <linux/module.h>
				18	#include <linux/slab.h>
				19	#include <linux/types.h>
				20	#include <linux/kernel.h>
				21	#include <linux/errno.h>
				22	#include <linux/skbuff.h>
				23	#include <linux/vmalloc.h>
				24	#include <linux/rtnetlink.h>
				25	#include <linux/reciprocal_div.h>
				26	#include <linux/rbtree.h>
				27
				28	#include <net/netlink.h>
				29	#include <net/pkt_sched.h>
				30	#include <net/inet_ecn.h>
				31
				32	#define VERSION "1.3"
				33
				34	/* Network Emulation Queuing algorithm.
				35	====================================
				36
				37	Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
				38	Network Emulation Tool"
				39	[2] Luigi Rizzo, DummyNet for FreeBSD
				40
				41	----------------------------------------------------------------
				42
				43	This started out as a simple way to delay outgoing packets to
				44	test TCP but has grown to include most of the functionality
				45	of a full blown network emulator like NISTnet. It can delay
				46	packets and add random jitter (and correlation). The random
				47	distribution can be loaded from a table as well to provide
				48	normal, Pareto, or experimental curves. Packet loss,
				49	duplication, and reordering can also be emulated.
				50
				51	This qdisc does not do classification that can be handled in
				52	layering other disciplines. It does not need to do bandwidth
				53	control either since that can be handled by using token
				54	bucket or other rate control.
				55
				56	Correlated Loss Generator models
				57
				58	Added generation of correlated loss according to the
				59	"Gilbert-Elliot" model, a 4-state markov model.
				60
				61	References:
				62	[1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG
				63	[2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general
				64	and intuitive loss model for packet networks and its implementation
				65	in the Netem module in the Linux kernel", available in [1]
				66
				67	Authors: Stefano Salsano <stefano.salsano at uniroma2.it>
				68	Fabio Ludovici <fabio.ludovici at yahoo.it>
				69	*/
				70
				71	struct netem_sched_data {
				72	/* internal t(ime)fifo qdisc uses t_root and sch->limit */
				73	struct rb_root t_root;
				74
				75	/* optional qdisc for classful handling (NULL at netem init) */
				76	struct Qdisc *qdisc;
				77
				78	struct qdisc_watchdog watchdog;
				79
				80	psched_tdiff_t latency;
				81	psched_tdiff_t jitter;
				82
				83	u32 loss;
				84	u32 ecn;
				85	u32 limit;
				86	u32 counter;
				87	u32 gap;
				88	u32 duplicate;
				89	u32 reorder;
				90	u32 corrupt;
				91	u64 rate;
				92	s32 packet_overhead;
				93	u32 cell_size;
				94	struct reciprocal_value cell_size_reciprocal;
				95	s32 cell_overhead;
				96
				97	struct crndstate {
				98	u32 last;
				99	u32 rho;
				100	} delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;
				101
				102	struct disttable {
				103	u32 size;
				104	s16 table[0];
				105	} *delay_dist;
				106
				107	enum {
				108	CLG_RANDOM,
				109	CLG_4_STATES,
				110	CLG_GILB_ELL,
				111	} loss_model;
				112
				113	enum {
				114	TX_IN_GAP_PERIOD = 1,
				115	TX_IN_BURST_PERIOD,
				116	LOST_IN_GAP_PERIOD,
				117	LOST_IN_BURST_PERIOD,
				118	} _4_state_model;
				119
				120	enum {
				121	GOOD_STATE = 1,
				122	BAD_STATE,
				123	} GE_state_model;
				124
				125	/* Correlated Loss Generation models */
				126	struct clgstate {
				127	/* state of the Markov chain */
				128	u8 state;
				129
				130	/* 4-states and Gilbert-Elliot models */
				131	u32 a1; /* p13 for 4-states or p for GE */
				132	u32 a2; /* p31 for 4-states or r for GE */
				133	u32 a3; /* p32 for 4-states or h for GE */
				134	u32 a4; /* p14 for 4-states or 1-k for GE */
				135	u32 a5; /* p23 used only in 4-states */
				136	} clg;
				137
				138	};
				139
				140	/* Time stamp put into socket buffer control block
				141	* Only valid when skbs are in our internal t(ime)fifo queue.
				142	*
				143	* As skb->rbnode uses the same storage as skb->next, skb->prev and skb->tstamp,
				144	* and skb->next & skb->prev are scratch space for a qdisc,
				145	* we save skb->tstamp value in skb->cb[] before destroying it.
				146	*/
				147	struct netem_skb_cb {
				148	psched_time_t time_to_send;
				149	ktime_t tstamp_save;
				150	};
				151
				152
				153	static struct sk_buff netem_rb_to_skb(struct rb_node rb)
				154	{
				155	return container_of(rb, struct sk_buff, rbnode);
				156	}
				157
				158	static inline struct netem_skb_cb netem_skb_cb(struct sk_buff skb)
				159	{
				160	/* we assume we can use skb next/prev/tstamp as storage for rb_node */
				161	qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb));
				162	return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
				163	}
				164
				165	/* init_crandom - initialize correlated random number generator
				166	* Use entropy source for initial seed.
				167	*/
				168	static void init_crandom(struct crndstate *state, unsigned long rho)
				169	{
				170	state->rho = rho;
				171	state->last = prandom_u32();
				172	}
				173
				174	/* get_crandom - correlated random number generator
				175	* Next number depends on last value.
				176	* rho is scaled to avoid floating point.
				177	*/
				178	static u32 get_crandom(struct crndstate *state)
				179	{
				180	u64 value, rho;
				181	unsigned long answer;
				182
				183	if (state->rho == 0) /* no correlation */
				184	return prandom_u32();
				185
				186	value = prandom_u32();
				187	rho = (u64)state->rho + 1;
				188	answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
				189	state->last = answer;
				190	return answer;
				191	}
				192
				193	/* loss_4state - 4-state model loss generator
				194	* Generates losses according to the 4-state Markov chain adopted in
				195	* the GI (General and Intuitive) loss model.
				196	*/
				197	static bool loss_4state(struct netem_sched_data *q)
				198	{
				199	struct clgstate *clg = &q->clg;
				200	u32 rnd = prandom_u32();
				201
				202	/*
				203	* Makes a comparison between rnd and the transition
				204	* probabilities outgoing from the current state, then decides the
				205	* next state and if the next packet has to be transmitted or lost.
				206	* The four states correspond to:
				207	* TX_IN_GAP_PERIOD => successfully transmitted packets within a gap period
				208	* LOST_IN_BURST_PERIOD => isolated losses within a gap period
				209	* LOST_IN_GAP_PERIOD => lost packets within a burst period
				210	* TX_IN_GAP_PERIOD => successfully transmitted packets within a burst period
				211	*/
				212	switch (clg->state) {
				213	case TX_IN_GAP_PERIOD:
				214	if (rnd < clg->a4) {
				215	clg->state = LOST_IN_BURST_PERIOD;
				216	return true;
				217	} else if (clg->a4 < rnd && rnd < clg->a1 + clg->a4) {
				218	clg->state = LOST_IN_GAP_PERIOD;
				219	return true;
				220	} else if (clg->a1 + clg->a4 < rnd) {
				221	clg->state = TX_IN_GAP_PERIOD;
				222	}
				223
				224	break;
				225	case TX_IN_BURST_PERIOD:
				226	if (rnd < clg->a5) {
				227	clg->state = LOST_IN_GAP_PERIOD;
				228	return true;
				229	} else {
				230	clg->state = TX_IN_BURST_PERIOD;
				231	}
				232
				233	break;
				234	case LOST_IN_GAP_PERIOD:
				235	if (rnd < clg->a3)
				236	clg->state = TX_IN_BURST_PERIOD;
				237	else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
				238	clg->state = TX_IN_GAP_PERIOD;
				239	} else if (clg->a2 + clg->a3 < rnd) {
				240	clg->state = LOST_IN_GAP_PERIOD;
				241	return true;
				242	}
				243	break;
				244	case LOST_IN_BURST_PERIOD:
				245	clg->state = TX_IN_GAP_PERIOD;
				246	break;
				247	}
				248
				249	return false;
				250	}
				251
				252	/* loss_gilb_ell - Gilbert-Elliot model loss generator
				253	* Generates losses according to the Gilbert-Elliot loss model or
				254	* its special cases (Gilbert or Simple Gilbert)
				255	*
				256	* Makes a comparison between random number and the transition
				257	* probabilities outgoing from the current state, then decides the
				258	* next state. A second random number is extracted and the comparison
				259	* with the loss probability of the current state decides if the next
				260	* packet will be transmitted or lost.
				261	*/
				262	static bool loss_gilb_ell(struct netem_sched_data *q)
				263	{
				264	struct clgstate *clg = &q->clg;
				265
				266	switch (clg->state) {
				267	case GOOD_STATE:
				268	if (prandom_u32() < clg->a1)
				269	clg->state = BAD_STATE;
				270	if (prandom_u32() < clg->a4)
				271	return true;
				272	break;
				273	case BAD_STATE:
				274	if (prandom_u32() < clg->a2)
				275	clg->state = GOOD_STATE;
				276	if (prandom_u32() > clg->a3)
				277	return true;
				278	}
				279
				280	return false;
				281	}
				282
				283	static bool loss_event(struct netem_sched_data *q)
				284	{
				285	switch (q->loss_model) {
				286	case CLG_RANDOM:
				287	/* Random packet drop 0 => none, ~0 => all */
				288	return q->loss && q->loss >= get_crandom(&q->loss_cor);
				289
				290	case CLG_4_STATES:
				291	/* 4state loss model algorithm (used also for GI model)
				292	* Extracts a value from the markov 4 state loss generator,
				293	* if it is 1 drops a packet and if needed writes the event in
				294	* the kernel logs
				295	*/
				296	return loss_4state(q);
				297
				298	case CLG_GILB_ELL:
				299	/* Gilbert-Elliot loss model algorithm
				300	* Extracts a value from the Gilbert-Elliot loss generator,
				301	* if it is 1 drops a packet and if needed writes the event in
				302	* the kernel logs
				303	*/
				304	return loss_gilb_ell(q);
				305	}
				306
				307	return false; /* not reached */
				308	}
				309
				310
				311	/* tabledist - return a pseudo-randomly distributed value with mean mu and
				312	* std deviation sigma. Uses table lookup to approximate the desired
				313	* distribution, and a uniformly-distributed pseudo-random source.
				314	*/
				315	static psched_tdiff_t tabledist(psched_tdiff_t mu, psched_tdiff_t sigma,
				316	struct crndstate *state,
				317	const struct disttable *dist)
				318	{
				319	psched_tdiff_t x;
				320	long t;
				321	u32 rnd;
				322
				323	if (sigma == 0)
				324	return mu;
				325
				326	rnd = get_crandom(state);
				327
				328	/* default uniform distribution */
				329	if (dist == NULL)
				330	return (rnd % (2*sigma)) - sigma + mu;
				331
				332	t = dist->table[rnd % dist->size];
				333	x = (sigma % NETEM_DIST_SCALE) * t;
				334	if (x >= 0)
				335	x += NETEM_DIST_SCALE/2;
				336	else
				337	x -= NETEM_DIST_SCALE/2;
				338
				339	return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
				340	}
				341
				342	static psched_time_t packet_len_2_sched_time(unsigned int len, struct netem_sched_data *q)
				343	{
				344	u64 ticks;
				345
				346	len += q->packet_overhead;
				347
				348	if (q->cell_size) {
				349	u32 cells = reciprocal_divide(len, q->cell_size_reciprocal);
				350
				351	if (len > cells * q->cell_size) /* extra cell needed for remainder */
				352	cells++;
				353	len = cells * (q->cell_size + q->cell_overhead);
				354	}
				355
				356	ticks = (u64)len * NSEC_PER_SEC;
				357
				358	do_div(ticks, q->rate);
				359	return PSCHED_NS2TICKS(ticks);
				360	}
				361
				362	static void tfifo_reset(struct Qdisc *sch)
				363	{
				364	struct netem_sched_data *q = qdisc_priv(sch);
				365	struct rb_node *p;
				366
				367	while ((p = rb_first(&q->t_root))) {
				368	struct sk_buff *skb = netem_rb_to_skb(p);
				369
				370	rb_erase(p, &q->t_root);
				371	skb->next = NULL;
				372	skb->prev = NULL;
				373	kfree_skb(skb);
				374	}
				375	}
				376
				377	static void tfifo_enqueue(struct sk_buff nskb, struct Qdisc sch)
				378	{
				379	struct netem_sched_data *q = qdisc_priv(sch);
				380	psched_time_t tnext = netem_skb_cb(nskb)->time_to_send;
				381	struct rb_node *p = &q->t_root.rb_node, parent = NULL;
				382
				383	while (*p) {
				384	struct sk_buff *skb;
				385
				386	parent = *p;
				387	skb = netem_rb_to_skb(parent);
				388	if (tnext >= netem_skb_cb(skb)->time_to_send)
				389	p = &parent->rb_right;
				390	else
				391	p = &parent->rb_left;
				392	}
				393	rb_link_node(&nskb->rbnode, parent, p);
				394	rb_insert_color(&nskb->rbnode, &q->t_root);
				395	sch->q.qlen++;
				396	}
				397
				398	/* netem can't properly corrupt a megapacket (like we get from GSO), so instead
				399	* when we statistically choose to corrupt one, we instead segment it, returning
				400	* the first packet to be corrupted, and re-enqueue the remaining frames
				401	*/
				402	static struct sk_buff netem_segment(struct sk_buff skb, struct Qdisc *sch)
				403	{
				404	struct sk_buff *segs;
				405	netdev_features_t features = netif_skb_features(skb);
				406
				407	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
				408
				409	if (IS_ERR_OR_NULL(segs)) {
				410	qdisc_reshape_fail(skb, sch);
				411	return NULL;
				412	}
				413	consume_skb(skb);
				414	return segs;
				415	}
				416
				417	/*
				418	* Insert one skb into qdisc.
				419	* Note: parent depends on return value to account for queue length.
				420	* NET_XMIT_DROP: queue length didn't change.
				421	* NET_XMIT_SUCCESS: one skb was queued.
				422	*/
				423	static int netem_enqueue(struct sk_buff skb, struct Qdisc sch)
				424	{
				425	struct netem_sched_data *q = qdisc_priv(sch);
				426	/* We don't fill cb now as skb_unshare() may invalidate it */
				427	struct netem_skb_cb *cb;
				428	struct sk_buff *skb2;
				429	struct sk_buff *segs = NULL;
				430	unsigned int len = 0, last_len, prev_len = qdisc_pkt_len(skb);
				431	int nb = 0;
				432	int count = 1;
				433	int rc = NET_XMIT_SUCCESS;
				434
				435	/* Random duplication */
				436	if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
				437	++count;
				438
				439	/* Drop packet? */
				440	if (loss_event(q)) {
				441	if (q->ecn && INET_ECN_set_ce(skb))
				442	qdisc_qstats_drop(sch); /* mark packet */
				443	else
				444	--count;
				445	}
				446	if (count == 0) {
				447	qdisc_qstats_drop(sch);
				448	kfree_skb(skb);
				449	return NET_XMIT_SUCCESS \| __NET_XMIT_BYPASS;
				450	}
				451
				452	/* If a delay is expected, orphan the skb. (orphaning usually takes
				453	* place at TX completion time, so _before_ the link transit delay)
				454	*/
				455	if (q->latency \|\| q->jitter)
				456	skb_orphan_partial(skb);
				457
				458	/*
				459	* If we need to duplicate packet, then re-insert at top of the
				460	* qdisc tree, since parent queuer expects that only one
				461	* skb will be queued.
				462	*/
				463	if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
				464	struct Qdisc *rootq = qdisc_root(sch);
				465	u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
				466
				467	q->duplicate = 0;
				468	rootq->enqueue(skb2, rootq);
				469	q->duplicate = dupsave;
				470	}
				471
				472	/*
				473	* Randomized packet corruption.
				474	* Make copy if needed since we are modifying
				475	* If packet is going to be hardware checksummed, then
				476	* do it now in software before we mangle it.
				477	*/
				478	if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
				479	if (skb_is_gso(skb)) {
				480	segs = netem_segment(skb, sch);
				481	if (!segs)
				482	return NET_XMIT_DROP;
				483	} else {
				484	segs = skb;
				485	}
				486
				487	skb = segs;
				488	segs = segs->next;
				489
				490	if (!(skb = skb_unshare(skb, GFP_ATOMIC)) \|\|
				491	(skb->ip_summed == CHECKSUM_PARTIAL &&
				492	skb_checksum_help(skb))) {
				493	rc = qdisc_drop(skb, sch);
				494	goto finish_segs;
				495	}
				496
				497	skb->data[prandom_u32() % skb_headlen(skb)] ^=
				498	1<<(prandom_u32() % 8);
				499	}
				500
				501	if (unlikely(skb_queue_len(&sch->q) >= sch->limit))
				502	return qdisc_reshape_fail(skb, sch);
				503
				504	qdisc_qstats_backlog_inc(sch, skb);
				505
				506	cb = netem_skb_cb(skb);
				507	if (q->gap == 0 \|\| /* not doing reordering */
				508	q->counter < q->gap - 1 \|\| /* inside last reordering gap */
				509	q->reorder < get_crandom(&q->reorder_cor)) {
				510	psched_time_t now;
				511	psched_tdiff_t delay;
				512
				513	delay = tabledist(q->latency, q->jitter,
				514	&q->delay_cor, q->delay_dist);
				515
				516	now = psched_get_time();
				517
				518	if (q->rate) {
				519	struct sk_buff *last;
				520
				521	if (!skb_queue_empty(&sch->q))
				522	last = skb_peek_tail(&sch->q);
				523	else
				524	last = netem_rb_to_skb(rb_last(&q->t_root));
				525	if (last) {
				526	/*
				527	* Last packet in queue is reference point (now),
				528	* calculate this time bonus and subtract
				529	* from delay.
				530	*/
				531	delay -= netem_skb_cb(last)->time_to_send - now;
				532	delay = max_t(psched_tdiff_t, 0, delay);
				533	now = netem_skb_cb(last)->time_to_send;
				534	}
				535
				536	delay += packet_len_2_sched_time(qdisc_pkt_len(skb), q);
				537	}
				538
				539	cb->time_to_send = now + delay;
				540	cb->tstamp_save = skb->tstamp;
				541	++q->counter;
				542	tfifo_enqueue(skb, sch);
				543	} else {
				544	/*
				545	* Do re-ordering by putting one out of N packets at the front
				546	* of the queue.
				547	*/
				548	cb->time_to_send = psched_get_time();
				549	q->counter = 0;
				550
				551	__skb_queue_head(&sch->q, skb);
				552	sch->qstats.requeues++;
				553	}
				554
				555	finish_segs:
				556	if (segs) {
				557	while (segs) {
				558	skb2 = segs->next;
				559	segs->next = NULL;
				560	qdisc_skb_cb(segs)->pkt_len = segs->len;
				561	last_len = segs->len;
				562	rc = qdisc_enqueue(segs, sch);
				563	if (rc != NET_XMIT_SUCCESS) {
				564	if (net_xmit_drop_count(rc))
				565	qdisc_qstats_drop(sch);
				566	} else {
				567	nb++;
				568	len += last_len;
				569	}
				570	segs = skb2;
				571	}
				572	sch->q.qlen += nb;
				573	if (nb > 1)
				574	qdisc_tree_reduce_backlog(sch, 1 - nb, prev_len - len);
				575	}
				576	return NET_XMIT_SUCCESS;
				577	}
				578
				579	static unsigned int netem_drop(struct Qdisc *sch)
				580	{
				581	struct netem_sched_data *q = qdisc_priv(sch);
				582	unsigned int len;
				583
				584	len = qdisc_queue_drop(sch);
				585
				586	if (!len) {
				587	struct rb_node *p = rb_first(&q->t_root);
				588
				589	if (p) {
				590	struct sk_buff *skb = netem_rb_to_skb(p);
				591
				592	rb_erase(p, &q->t_root);
				593	sch->q.qlen--;
				594	skb->next = NULL;
				595	skb->prev = NULL;
				596	qdisc_qstats_backlog_dec(sch, skb);
				597	kfree_skb(skb);
				598	}
				599	}
				600	if (!len && q->qdisc && q->qdisc->ops->drop)
				601	len = q->qdisc->ops->drop(q->qdisc);
				602	if (len)
				603	qdisc_qstats_drop(sch);
				604
				605	return len;
				606	}
				607
				608	static struct sk_buff netem_dequeue(struct Qdisc sch)
				609	{
				610	struct netem_sched_data *q = qdisc_priv(sch);
				611	struct sk_buff *skb;
				612	struct rb_node *p;
				613
				614	if (qdisc_is_throttled(sch))
				615	return NULL;
				616
				617	tfifo_dequeue:
				618	skb = __skb_dequeue(&sch->q);
				619	if (skb) {
				620	qdisc_qstats_backlog_dec(sch, skb);
				621	deliver:
				622	qdisc_unthrottled(sch);
				623	qdisc_bstats_update(sch, skb);
				624	return skb;
				625	}
				626	p = rb_first(&q->t_root);
				627	if (p) {
				628	psched_time_t time_to_send;
				629
				630	skb = netem_rb_to_skb(p);
				631
				632	/* if more time remaining? */
				633	time_to_send = netem_skb_cb(skb)->time_to_send;
				634	if (time_to_send <= psched_get_time()) {
				635	rb_erase(p, &q->t_root);
				636
				637	sch->q.qlen--;
				638	qdisc_qstats_backlog_dec(sch, skb);
				639	skb->next = NULL;
				640	skb->prev = NULL;
				641	skb->tstamp = netem_skb_cb(skb)->tstamp_save;
				642
				643	#ifdef CONFIG_NET_CLS_ACT
				644	/*
				645	* If it's at ingress let's pretend the delay is
				646	* from the network (tstamp will be updated).
				647	*/
				648	if (G_TC_FROM(skb->tc_verd) & AT_INGRESS)
				649	skb->tstamp.tv64 = 0;
				650	#endif
				651
				652	if (q->qdisc) {
				653	unsigned int pkt_len = qdisc_pkt_len(skb);
				654	int err = qdisc_enqueue(skb, q->qdisc);
				655
				656	if (err != NET_XMIT_SUCCESS &&
				657	net_xmit_drop_count(err)) {
				658	qdisc_qstats_drop(sch);
				659	qdisc_tree_reduce_backlog(sch, 1,
				660	pkt_len);
				661	}
				662	goto tfifo_dequeue;
				663	}
				664	goto deliver;
				665	}
				666
				667	if (q->qdisc) {
				668	skb = q->qdisc->ops->dequeue(q->qdisc);
				669	if (skb)
				670	goto deliver;
				671	}
				672	qdisc_watchdog_schedule(&q->watchdog, time_to_send);
				673	}
				674
				675	if (q->qdisc) {
				676	skb = q->qdisc->ops->dequeue(q->qdisc);
				677	if (skb)
				678	goto deliver;
				679	}
				680	return NULL;
				681	}
				682
				683	static void netem_reset(struct Qdisc *sch)
				684	{
				685	struct netem_sched_data *q = qdisc_priv(sch);
				686
				687	qdisc_reset_queue(sch);
				688	tfifo_reset(sch);
				689	if (q->qdisc)
				690	qdisc_reset(q->qdisc);
				691	qdisc_watchdog_cancel(&q->watchdog);
				692	}
				693
				/* Release a distribution table obtained from kmalloc() or vmalloc(). */
				static void dist_free(struct disttable *d)
				{
					kvfree(d);
				}
				698
				699	/*
				700	* Distribution data is a variable size payload containing
				701	* signed 16 bit values.
				702	*/
				703	static int get_dist_table(struct Qdisc sch, const struct nlattr attr)
				704	{
				705	struct netem_sched_data *q = qdisc_priv(sch);
				706	size_t n = nla_len(attr)/sizeof(__s16);
				707	const __s16 *data = nla_data(attr);
				708	spinlock_t *root_lock;
				709	struct disttable *d;
				710	int i;
				711	size_t s;
				712
				713	if (n > NETEM_DIST_MAX)
				714	return -EINVAL;
				715
				716	s = sizeof(struct disttable) + n * sizeof(s16);
				717	d = kmalloc(s, GFP_KERNEL \| __GFP_NOWARN);
				718	if (!d)
				719	d = vmalloc(s);
				720	if (!d)
				721	return -ENOMEM;
				722
				723	d->size = n;
				724	for (i = 0; i < n; i++)
				725	d->table[i] = data[i];
				726
				727	root_lock = qdisc_root_sleeping_lock(sch);
				728
				729	spin_lock_bh(root_lock);
				730	swap(q->delay_dist, d);
				731	spin_unlock_bh(root_lock);
				732
				733	dist_free(d);
				734	return 0;
				735	}
				736
				737	static void get_correlation(struct netem_sched_data q, const struct nlattr attr)
				738	{
				739	const struct tc_netem_corr *c = nla_data(attr);
				740
				741	init_crandom(&q->delay_cor, c->delay_corr);
				742	init_crandom(&q->loss_cor, c->loss_corr);
				743	init_crandom(&q->dup_cor, c->dup_corr);
				744	}
				745
				746	static void get_reorder(struct netem_sched_data q, const struct nlattr attr)
				747	{
				748	const struct tc_netem_reorder *r = nla_data(attr);
				749
				750	q->reorder = r->probability;
				751	init_crandom(&q->reorder_cor, r->correlation);
				752	}
				753
				754	static void get_corrupt(struct netem_sched_data q, const struct nlattr attr)
				755	{
				756	const struct tc_netem_corrupt *r = nla_data(attr);
				757
				758	q->corrupt = r->probability;
				759	init_crandom(&q->corrupt_cor, r->correlation);
				760	}
				761
				762	static void get_rate(struct netem_sched_data q, const struct nlattr attr)
				763	{
				764	const struct tc_netem_rate *r = nla_data(attr);
				765
				766	q->rate = r->rate;
				767	q->packet_overhead = r->packet_overhead;
				768	q->cell_size = r->cell_size;
				769	q->cell_overhead = r->cell_overhead;
				770	if (q->cell_size)
				771	q->cell_size_reciprocal = reciprocal_value(q->cell_size);
				772	else
				773	q->cell_size_reciprocal = (struct reciprocal_value) { 0 };
				774	}
				775
				776	static int get_loss_clg(struct netem_sched_data q, const struct nlattr attr)
				777	{
				778	const struct nlattr *la;
				779	int rem;
				780
				781	nla_for_each_nested(la, attr, rem) {
				782	u16 type = nla_type(la);
				783
				784	switch (type) {
				785	case NETEM_LOSS_GI: {
				786	const struct tc_netem_gimodel *gi = nla_data(la);
				787
				788	if (nla_len(la) < sizeof(struct tc_netem_gimodel)) {
				789	pr_info("netem: incorrect gi model size\n");
				790	return -EINVAL;
				791	}
				792
				793	q->loss_model = CLG_4_STATES;
				794
				795	q->clg.state = TX_IN_GAP_PERIOD;
				796	q->clg.a1 = gi->p13;
				797	q->clg.a2 = gi->p31;
				798	q->clg.a3 = gi->p32;
				799	q->clg.a4 = gi->p14;
				800	q->clg.a5 = gi->p23;
				801	break;
				802	}
				803
				804	case NETEM_LOSS_GE: {
				805	const struct tc_netem_gemodel *ge = nla_data(la);
				806
				807	if (nla_len(la) < sizeof(struct tc_netem_gemodel)) {
				808	pr_info("netem: incorrect ge model size\n");
				809	return -EINVAL;
				810	}
				811
				812	q->loss_model = CLG_GILB_ELL;
				813	q->clg.state = GOOD_STATE;
				814	q->clg.a1 = ge->p;
				815	q->clg.a2 = ge->r;
				816	q->clg.a3 = ge->h;
				817	q->clg.a4 = ge->k1;
				818	break;
				819	}
				820
				821	default:
				822	pr_info("netem: unknown loss type %u\n", type);
				823	return -EINVAL;
				824	}
				825	}
				826
				827	return 0;
				828	}
				829
				830	static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
				831	[TCA_NETEM_CORR] = { .len = sizeof(struct tc_netem_corr) },
				832	[TCA_NETEM_REORDER] = { .len = sizeof(struct tc_netem_reorder) },
				833	[TCA_NETEM_CORRUPT] = { .len = sizeof(struct tc_netem_corrupt) },
				834	[TCA_NETEM_RATE] = { .len = sizeof(struct tc_netem_rate) },
				835	[TCA_NETEM_LOSS] = { .type = NLA_NESTED },
				836	[TCA_NETEM_ECN] = { .type = NLA_U32 },
				837	[TCA_NETEM_RATE64] = { .type = NLA_U64 },
				838	};
				839
				840	static int parse_attr(struct nlattr tb[], int maxtype, struct nlattr nla,
				841	const struct nla_policy *policy, int len)
				842	{
				843	int nested_len = nla_len(nla) - NLA_ALIGN(len);
				844
				845	if (nested_len < 0) {
				846	pr_info("netem: invalid attributes len %d\n", nested_len);
				847	return -EINVAL;
				848	}
				849
				850	if (nested_len >= nla_attr_size(0))
				851	return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len),
				852	nested_len, policy);
				853
				854	memset(tb, 0, sizeof(struct nlattr ) (maxtype + 1));
				855	return 0;
				856	}
				857
				858	/* Parse netlink message to set options */
				859	static int netem_change(struct Qdisc sch, struct nlattr opt)
				860	{
				861	struct netem_sched_data *q = qdisc_priv(sch);
				862	struct nlattr *tb[TCA_NETEM_MAX + 1];
				863	struct tc_netem_qopt *qopt;
				864	struct clgstate old_clg;
				865	int old_loss_model = CLG_RANDOM;
				866	int ret;
				867
				868	if (opt == NULL)
				869	return -EINVAL;
				870
				871	qopt = nla_data(opt);
				872	ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt));
				873	if (ret < 0)
				874	return ret;
				875
				876	/* backup q->clg and q->loss_model */
				877	old_clg = q->clg;
				878	old_loss_model = q->loss_model;
				879
				880	if (tb[TCA_NETEM_LOSS]) {
				881	ret = get_loss_clg(q, tb[TCA_NETEM_LOSS]);
				882	if (ret) {
				883	q->loss_model = old_loss_model;
				884	return ret;
				885	}
				886	} else {
				887	q->loss_model = CLG_RANDOM;
				888	}
				889
				890	if (tb[TCA_NETEM_DELAY_DIST]) {
				891	ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST]);
				892	if (ret) {
				893	/* recover clg and loss_model, in case of
				894	* q->clg and q->loss_model were modified
				895	* in get_loss_clg()
				896	*/
				897	q->clg = old_clg;
				898	q->loss_model = old_loss_model;
				899	return ret;
				900	}
				901	}
				902
				903	sch->limit = qopt->limit;
				904
				905	q->latency = qopt->latency;
				906	q->jitter = qopt->jitter;
				907	q->limit = qopt->limit;
				908	q->gap = qopt->gap;
				909	q->counter = 0;
				910	q->loss = qopt->loss;
				911	q->duplicate = qopt->duplicate;
				912
				913	/* for compatibility with earlier versions.
				914	* if gap is set, need to assume 100% probability
				915	*/
				916	if (q->gap)
				917	q->reorder = ~0;
				918
				919	if (tb[TCA_NETEM_CORR])
				920	get_correlation(q, tb[TCA_NETEM_CORR]);
				921
				922	if (tb[TCA_NETEM_REORDER])
				923	get_reorder(q, tb[TCA_NETEM_REORDER]);
				924
				925	if (tb[TCA_NETEM_CORRUPT])
				926	get_corrupt(q, tb[TCA_NETEM_CORRUPT]);
				927
				928	if (tb[TCA_NETEM_RATE])
				929	get_rate(q, tb[TCA_NETEM_RATE]);
				930
				931	if (tb[TCA_NETEM_RATE64])
				932	q->rate = max_t(u64, q->rate,
				933	nla_get_u64(tb[TCA_NETEM_RATE64]));
				934
				935	if (tb[TCA_NETEM_ECN])
				936	q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]);
				937
				938	return ret;
				939	}
				940
				941	static int netem_init(struct Qdisc sch, struct nlattr opt)
				942	{
				943	struct netem_sched_data *q = qdisc_priv(sch);
				944	int ret;
				945
				946	if (!opt)
				947	return -EINVAL;
				948
				949	qdisc_watchdog_init(&q->watchdog, sch);
				950
				951	q->loss_model = CLG_RANDOM;
				952	ret = netem_change(sch, opt);
				953	if (ret)
				954	pr_info("netem: change failed\n");
				955	return ret;
				956	}
				957
				958	static void netem_destroy(struct Qdisc *sch)
				959	{
				960	struct netem_sched_data *q = qdisc_priv(sch);
				961
				962	qdisc_watchdog_cancel(&q->watchdog);
				963	if (q->qdisc)
				964	qdisc_destroy(q->qdisc);
				965	dist_free(q->delay_dist);
				966	}
				967
				968	static int dump_loss_model(const struct netem_sched_data *q,
				969	struct sk_buff *skb)
				970	{
				971	struct nlattr *nest;
				972
				973	nest = nla_nest_start(skb, TCA_NETEM_LOSS);
				974	if (nest == NULL)
				975	goto nla_put_failure;
				976
				977	switch (q->loss_model) {
				978	case CLG_RANDOM:
				979	/* legacy loss model */
				980	nla_nest_cancel(skb, nest);
				981	return 0; /* no data */
				982
				983	case CLG_4_STATES: {
				984	struct tc_netem_gimodel gi = {
				985	.p13 = q->clg.a1,
				986	.p31 = q->clg.a2,
				987	.p32 = q->clg.a3,
				988	.p14 = q->clg.a4,
				989	.p23 = q->clg.a5,
				990	};
				991
				992	if (nla_put(skb, NETEM_LOSS_GI, sizeof(gi), &gi))
				993	goto nla_put_failure;
				994	break;
				995	}
				996	case CLG_GILB_ELL: {
				997	struct tc_netem_gemodel ge = {
				998	.p = q->clg.a1,
				999	.r = q->clg.a2,
				1000	.h = q->clg.a3,
				1001	.k1 = q->clg.a4,
				1002	};
				1003
				1004	if (nla_put(skb, NETEM_LOSS_GE, sizeof(ge), &ge))
				1005	goto nla_put_failure;
				1006	break;
				1007	}
				1008	}
				1009
				1010	nla_nest_end(skb, nest);
				1011	return 0;
				1012
				1013	nla_put_failure:
				1014	nla_nest_cancel(skb, nest);
				1015	return -1;
				1016	}
				1017
				1018	static int netem_dump(struct Qdisc sch, struct sk_buff skb)
				1019	{
				1020	const struct netem_sched_data *q = qdisc_priv(sch);
				1021	struct nlattr nla = (struct nlattr ) skb_tail_pointer(skb);
				1022	struct tc_netem_qopt qopt;
				1023	struct tc_netem_corr cor;
				1024	struct tc_netem_reorder reorder;
				1025	struct tc_netem_corrupt corrupt;
				1026	struct tc_netem_rate rate;
				1027
				1028	qopt.latency = q->latency;
				1029	qopt.jitter = q->jitter;
				1030	qopt.limit = q->limit;
				1031	qopt.loss = q->loss;
				1032	qopt.gap = q->gap;
				1033	qopt.duplicate = q->duplicate;
				1034	if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt))
				1035	goto nla_put_failure;
				1036
				1037	cor.delay_corr = q->delay_cor.rho;
				1038	cor.loss_corr = q->loss_cor.rho;
				1039	cor.dup_corr = q->dup_cor.rho;
				1040	if (nla_put(skb, TCA_NETEM_CORR, sizeof(cor), &cor))
				1041	goto nla_put_failure;
				1042
				1043	reorder.probability = q->reorder;
				1044	reorder.correlation = q->reorder_cor.rho;
				1045	if (nla_put(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder))
				1046	goto nla_put_failure;
				1047
				1048	corrupt.probability = q->corrupt;
				1049	corrupt.correlation = q->corrupt_cor.rho;
				1050	if (nla_put(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt))
				1051	goto nla_put_failure;
				1052
				1053	if (q->rate >= (1ULL << 32)) {
				1054	if (nla_put_u64(skb, TCA_NETEM_RATE64, q->rate))
				1055	goto nla_put_failure;
				1056	rate.rate = ~0U;
				1057	} else {
				1058	rate.rate = q->rate;
				1059	}
				1060	rate.packet_overhead = q->packet_overhead;
				1061	rate.cell_size = q->cell_size;
				1062	rate.cell_overhead = q->cell_overhead;
				1063	if (nla_put(skb, TCA_NETEM_RATE, sizeof(rate), &rate))
				1064	goto nla_put_failure;
				1065
				1066	if (q->ecn && nla_put_u32(skb, TCA_NETEM_ECN, q->ecn))
				1067	goto nla_put_failure;
				1068
				1069	if (dump_loss_model(q, skb) != 0)
				1070	goto nla_put_failure;
				1071
				1072	return nla_nest_end(skb, nla);
				1073
				1074	nla_put_failure:
				1075	nlmsg_trim(skb, nla);
				1076	return -1;
				1077	}
				1078
				1079	static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
				1080	struct sk_buff skb, struct tcmsg tcm)
				1081	{
				1082	struct netem_sched_data *q = qdisc_priv(sch);
				1083
				1084	if (cl != 1 \|\| !q->qdisc) /* only one class */
				1085	return -ENOENT;
				1086
				1087	tcm->tcm_handle \|= TC_H_MIN(1);
				1088	tcm->tcm_info = q->qdisc->handle;
				1089
				1090	return 0;
				1091	}
				1092
				1093	static int netem_graft(struct Qdisc sch, unsigned long arg, struct Qdisc new,
				1094	struct Qdisc **old)
				1095	{
				1096	struct netem_sched_data *q = qdisc_priv(sch);
				1097
				1098	*old = qdisc_replace(sch, new, &q->qdisc);
				1099	return 0;
				1100	}
				1101
				1102	static struct Qdisc netem_leaf(struct Qdisc sch, unsigned long arg)
				1103	{
				1104	struct netem_sched_data *q = qdisc_priv(sch);
				1105	return q->qdisc;
				1106	}
				1107
				1108	static unsigned long netem_get(struct Qdisc *sch, u32 classid)
				1109	{
				1110	return 1;
				1111	}
				1112
				1113	static void netem_put(struct Qdisc *sch, unsigned long arg)
				1114	{
				1115	}
				1116
				1117	static void netem_walk(struct Qdisc sch, struct qdisc_walker walker)
				1118	{
				1119	if (!walker->stop) {
				1120	if (walker->count >= walker->skip)
				1121	if (walker->fn(sch, 1, walker) < 0) {
				1122	walker->stop = 1;
				1123	return;
				1124	}
				1125	walker->count++;
				1126	}
				1127	}
				1128
				1129	static const struct Qdisc_class_ops netem_class_ops = {
				1130	.graft = netem_graft,
				1131	.leaf = netem_leaf,
				1132	.get = netem_get,
				1133	.put = netem_put,
				1134	.walk = netem_walk,
				1135	.dump = netem_dump_class,
				1136	};
				1137
				1138	static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
				1139	.id = "netem",
				1140	.cl_ops = &netem_class_ops,
				1141	.priv_size = sizeof(struct netem_sched_data),
				1142	.enqueue = netem_enqueue,
				1143	.dequeue = netem_dequeue,
				1144	.peek = qdisc_peek_dequeued,
				1145	.drop = netem_drop,
				1146	.init = netem_init,
				1147	.reset = netem_reset,
				1148	.destroy = netem_destroy,
				1149	.change = netem_change,
				1150	.dump = netem_dump,
				1151	.owner = THIS_MODULE,
				1152	};
				1153
				1154
				1155	static int __init netem_module_init(void)
				1156	{
				1157	pr_info("netem: version " VERSION "\n");
				1158	return register_qdisc(&netem_qdisc_ops);
				1159	}
				1160	static void __exit netem_module_exit(void)
				1161	{
				1162	unregister_qdisc(&netem_qdisc_ops);
				1163	}
				1164	module_init(netem_module_init)
				1165	module_exit(netem_module_exit)
				1166	MODULE_LICENSE("GPL");