Blame - net/ipv4/tcp_input.c - codeaurora/cp-linux

blob: 5e4a30d5262c90a77cf86b0a591df7eedd40f526 [file] [log] [blame]

Kyle Swenson	8d8f654	2021-03-15 11:02:55 -0600	[diff] [blame]	1	/*
				2	* INET An implementation of the TCP/IP protocol suite for the LINUX
				3	* operating system. INET is implemented using the BSD Socket
				4	* interface as the means of communication with the user level.
				5	*
				6	* Implementation of the Transmission Control Protocol(TCP).
				7	*
				8	* Authors: Ross Biro
				9	* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
				10	* Mark Evans, <evansmp@uhura.aston.ac.uk>
				11	* Corey Minyard <wf-rch!minyard@relay.EU.net>
				12	* Florian La Roche, <flla@stud.uni-sb.de>
				13	* Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
				14	* Linus Torvalds, <torvalds@cs.helsinki.fi>
				15	* Alan Cox, <gw4pts@gw4pts.ampr.org>
				16	* Matthew Dillon, <dillon@apollo.west.oic.com>
				17	* Arnt Gulbrandsen, <agulbra@nvg.unit.no>
				18	* Jorge Cwik, <jorge@laser.satlink.net>
				19	*/
				20
				21	/*
				22	* Changes:
				23	* Pedro Roque : Fast Retransmit/Recovery.
				24	* Two receive queues.
				25	* Retransmit queue handled by TCP.
				26	* Better retransmit timer handling.
				27	* New congestion avoidance.
				28	* Header prediction.
				29	* Variable renaming.
				30	*
				31	* Eric : Fast Retransmit.
				32	* Randy Scott : MSS option defines.
				33	* Eric Schenk : Fixes to slow start algorithm.
				34	* Eric Schenk : Yet another double ACK bug.
				35	* Eric Schenk : Delayed ACK bug fixes.
				36	* Eric Schenk : Floyd style fast retrans war avoidance.
				37	* David S. Miller : Don't allow zero congestion window.
				38	* Eric Schenk : Fix retransmitter so that it sends
				39	* next packet on ack of previous packet.
				40	* Andi Kleen : Moved open_request checking here
				41	* and process RSTs for open_requests.
				42	* Andi Kleen : Better prune_queue, and other fixes.
				43	* Andrey Savochkin: Fix RTT measurements in the presence of
				44	* timestamps.
				45	* Andrey Savochkin: Check sequence numbers correctly when
				46	* removing SACKs due to in sequence incoming
				47	* data segments.
				48	* Andi Kleen: Make sure we never ack data there is not
				49	* enough room for. Also make this condition
				50	* a fatal error if it might still happen.
				51	* Andi Kleen: Add tcp_measure_rcv_mss to make
				52	* connections with MSS<min(MTU,ann. MSS)
				53	* work without delayed acks.
				54	* Andi Kleen: Process packets with PSH set in the
				55	* fast path.
				56	* J Hadi Salim: ECN support
				57	* Andrei Gurtov,
				58	* Pasi Sarolahti,
				59	* Panu Kuhlberg: Experimental audit of TCP (re)transmission
				60	* engine. Lots of bugs are found.
				61	* Pasi Sarolahti: F-RTO for dealing with spurious RTOs
				62	*/
				63
				64	#define pr_fmt(fmt) "TCP: " fmt
				65
				66	#include <linux/mm.h>
				67	#include <linux/slab.h>
				68	#include <linux/module.h>
				69	#include <linux/sysctl.h>
				70	#include <linux/kernel.h>
				71	#include <linux/prefetch.h>
				72	#include <net/dst.h>
				73	#include <net/tcp.h>
				74	#include <net/inet_common.h>
				75	#include <linux/ipsec.h>
				76	#include <asm/unaligned.h>
				77	#include <linux/errqueue.h>
				78
				79	int sysctl_tcp_timestamps __read_mostly = 1;
				80	int sysctl_tcp_window_scaling __read_mostly = 1;
				81	int sysctl_tcp_sack __read_mostly = 1;
				82	int sysctl_tcp_fack __read_mostly = 1;
				83	int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH;
				84	int sysctl_tcp_max_reordering __read_mostly = 300;
				85	EXPORT_SYMBOL(sysctl_tcp_reordering);
				86	int sysctl_tcp_dsack __read_mostly = 1;
				87	int sysctl_tcp_app_win __read_mostly = 31;
				88	int sysctl_tcp_adv_win_scale __read_mostly = 1;
				89	EXPORT_SYMBOL(sysctl_tcp_adv_win_scale);
				90
				91	/* rfc5961 challenge ack rate limiting */
				92	int sysctl_tcp_challenge_ack_limit = 1000;
				93
				94	int sysctl_tcp_stdurg __read_mostly;
				95	int sysctl_tcp_rfc1337 __read_mostly;
				96	int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
				97	int sysctl_tcp_frto __read_mostly = 2;
				98	int sysctl_tcp_min_rtt_wlen __read_mostly = 300;
				99
				100	int sysctl_tcp_thin_dupack __read_mostly;
				101
				102	int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
				103	int sysctl_tcp_early_retrans __read_mostly = 3;
				104	int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
				105
				106	#define FLAG_DATA 0x01 /* Incoming frame contained data. */
				107	#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
				108	#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */
				109	#define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */
				110	#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */
				111	#define FLAG_DATA_SACKED 0x20 /* New SACK. */
				112	#define FLAG_ECE 0x40 /* ECE in this ACK */
				113	#define FLAG_LOST_RETRANS 0x80 /* This ACK marks some retransmission lost */
				114	#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/
				115	#define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */
				116	#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
				117	#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */
				118	#define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */
				119	#define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */
				120
				121	#define FLAG_ACKED (FLAG_DATA_ACKED\|FLAG_SYN_ACKED)
				122	#define FLAG_NOT_DUP (FLAG_DATA\|FLAG_WIN_UPDATE\|FLAG_ACKED)
				123	#define FLAG_CA_ALERT (FLAG_DATA_SACKED\|FLAG_ECE)
				124	#define FLAG_FORWARD_PROGRESS (FLAG_ACKED\|FLAG_DATA_SACKED)
				125
				126	#define TCP_REMNANT (TCP_FLAG_FIN\|TCP_FLAG_URG\|TCP_FLAG_SYN\|TCP_FLAG_PSH)
				127	#define TCP_HP_BITS (~(TCP_RESERVED_BITS\|TCP_FLAG_PSH))
				128
				129	/* Adapt the MSS value used to make delayed ack decision to the
				130	* real world.
				131	*/
				132	static void tcp_measure_rcv_mss(struct sock sk, const struct sk_buff skb)
				133	{
				134	struct inet_connection_sock *icsk = inet_csk(sk);
				135	const unsigned int lss = icsk->icsk_ack.last_seg_size;
				136	unsigned int len;
				137
				138	icsk->icsk_ack.last_seg_size = 0;
				139
				140	/* skb->len may jitter because of SACKs, even if peer
				141	* sends good full-sized frames.
				142	*/
				143	len = skb_shinfo(skb)->gso_size ? : skb->len;
				144	if (len >= icsk->icsk_ack.rcv_mss) {
				145	icsk->icsk_ack.rcv_mss = len;
				146	} else {
				147	/* Otherwise, we make more careful check taking into account,
				148	* that SACKs block is variable.
				149	*
				150	* "len" is invariant segment length, including TCP header.
				151	*/
				152	len += skb->data - skb_transport_header(skb);
				153	if (len >= TCP_MSS_DEFAULT + sizeof(struct tcphdr) \|\|
				154	/* If PSH is not set, packet should be
				155	* full sized, provided peer TCP is not badly broken.
				156	* This observation (if it is correct 8)) allows
				157	* to handle super-low mtu links fairly.
				158	*/
				159	(len >= TCP_MIN_MSS + sizeof(struct tcphdr) &&
				160	!(tcp_flag_word(tcp_hdr(skb)) & TCP_REMNANT))) {
				161	/* Subtract also invariant (if peer is RFC compliant),
				162	* tcp header plus fixed timestamp option length.
				163	* Resulting "len" is MSS free of SACK jitter.
				164	*/
				165	len -= tcp_sk(sk)->tcp_header_len;
				166	icsk->icsk_ack.last_seg_size = len;
				167	if (len == lss) {
				168	icsk->icsk_ack.rcv_mss = len;
				169	return;
				170	}
				171	}
				172	if (icsk->icsk_ack.pending & ICSK_ACK_PUSHED)
				173	icsk->icsk_ack.pending \|= ICSK_ACK_PUSHED2;
				174	icsk->icsk_ack.pending \|= ICSK_ACK_PUSHED;
				175	}
				176	}
				177
				178	static void tcp_incr_quickack(struct sock *sk)
				179	{
				180	struct inet_connection_sock *icsk = inet_csk(sk);
				181	unsigned int quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);
				182
				183	if (quickacks == 0)
				184	quickacks = 2;
				185	if (quickacks > icsk->icsk_ack.quick)
				186	icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
				187	}
				188
				189	static void tcp_enter_quickack_mode(struct sock *sk)
				190	{
				191	struct inet_connection_sock *icsk = inet_csk(sk);
				192	tcp_incr_quickack(sk);
				193	icsk->icsk_ack.pingpong = 0;
				194	icsk->icsk_ack.ato = TCP_ATO_MIN;
				195	}
				196
				197	/* Send ACKs quickly, if "quick" count is not exhausted
				198	* and the session is not interactive.
				199	*/
				200
				201	static bool tcp_in_quickack_mode(struct sock *sk)
				202	{
				203	const struct inet_connection_sock *icsk = inet_csk(sk);
				204	const struct dst_entry *dst = __sk_dst_get(sk);
				205
				206	return (dst && dst_metric(dst, RTAX_QUICKACK)) \|\|
				207	(icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong);
				208	}
				209
				210	static void tcp_ecn_queue_cwr(struct tcp_sock *tp)
				211	{
				212	if (tp->ecn_flags & TCP_ECN_OK)
				213	tp->ecn_flags \|= TCP_ECN_QUEUE_CWR;
				214	}
				215
				216	static void tcp_ecn_accept_cwr(struct tcp_sock tp, const struct sk_buff skb)
				217	{
				218	if (tcp_hdr(skb)->cwr)
				219	tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
				220	}
				221
				222	static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp)
				223	{
				224	tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
				225	}
				226
				227	static void __tcp_ecn_check_ce(struct tcp_sock tp, const struct sk_buff skb)
				228	{
				229	switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) {
				230	case INET_ECN_NOT_ECT:
				231	/* Funny extension: if ECT is not set on a segment,
				232	* and we already seen ECT on a previous segment,
				233	* it is probably a retransmit.
				234	*/
				235	if (tp->ecn_flags & TCP_ECN_SEEN)
				236	tcp_enter_quickack_mode((struct sock *)tp);
				237	break;
				238	case INET_ECN_CE:
				239	if (tcp_ca_needs_ecn((struct sock *)tp))
				240	tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_IS_CE);
				241
				242	if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
				243	/* Better not delay acks, sender can have a very low cwnd */
				244	tcp_enter_quickack_mode((struct sock *)tp);
				245	tp->ecn_flags \|= TCP_ECN_DEMAND_CWR;
				246	}
				247	tp->ecn_flags \|= TCP_ECN_SEEN;
				248	break;
				249	default:
				250	if (tcp_ca_needs_ecn((struct sock *)tp))
				251	tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_NO_CE);
				252	tp->ecn_flags \|= TCP_ECN_SEEN;
				253	break;
				254	}
				255	}
				256
				257	static void tcp_ecn_check_ce(struct tcp_sock tp, const struct sk_buff skb)
				258	{
				259	if (tp->ecn_flags & TCP_ECN_OK)
				260	__tcp_ecn_check_ce(tp, skb);
				261	}
				262
				263	static void tcp_ecn_rcv_synack(struct tcp_sock tp, const struct tcphdr th)
				264	{
				265	if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece \|\| th->cwr))
				266	tp->ecn_flags &= ~TCP_ECN_OK;
				267	}
				268
				269	static void tcp_ecn_rcv_syn(struct tcp_sock tp, const struct tcphdr th)
				270	{
				271	if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece \|\| !th->cwr))
				272	tp->ecn_flags &= ~TCP_ECN_OK;
				273	}
				274
				275	static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock tp, const struct tcphdr th)
				276	{
				277	if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK))
				278	return true;
				279	return false;
				280	}
				281
				282	/* Buffer size and advertised window tuning.
				283	*
				284	* 1. Tuning sk->sk_sndbuf, when connection enters established state.
				285	*/
				286
				287	static void tcp_sndbuf_expand(struct sock *sk)
				288	{
				289	const struct tcp_sock *tp = tcp_sk(sk);
				290	int sndmem, per_mss;
				291	u32 nr_segs;
				292
				293	/* Worst case is non GSO/TSO : each frame consumes one skb
				294	* and skb->head is kmalloced using power of two area of memory
				295	*/
				296	per_mss = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
				297	MAX_TCP_HEADER +
				298	SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
				299
				300	per_mss = roundup_pow_of_two(per_mss) +
				301	SKB_DATA_ALIGN(sizeof(struct sk_buff));
				302
				303	nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd);
				304	nr_segs = max_t(u32, nr_segs, tp->reordering + 1);
				305
				306	/* Fast Recovery (RFC 5681 3.2) :
				307	* Cubic needs 1.7 factor, rounded to 2 to include
				308	* extra cushion (application might react slowly to POLLOUT)
				309	*/
				310	sndmem = 2 * nr_segs * per_mss;
				311
				312	if (sk->sk_sndbuf < sndmem)
				313	sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
				314	}
				315
				316	/* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
				317	*
				318	* All tcp_full_space() is split to two parts: "network" buffer, allocated
				319	* forward and advertised in receiver window (tp->rcv_wnd) and
				320	* "application buffer", required to isolate scheduling/application
				321	* latencies from network.
				322	* window_clamp is maximal advertised window. It can be less than
				323	* tcp_full_space(), in this case tcp_full_space() - window_clamp
				324	* is reserved for "application" buffer. The less window_clamp is
				325	* the smoother our behaviour from viewpoint of network, but the lower
				326	* throughput and the higher sensitivity of the connection to losses. 8)
				327	*
				328	* rcv_ssthresh is more strict window_clamp used at "slow start"
				329	* phase to predict further behaviour of this connection.
				330	* It is used for two goals:
				331	* - to enforce header prediction at sender, even when application
				332	* requires some significant "application buffer". It is check #1.
				333	* - to prevent pruning of receive queue because of misprediction
				334	* of receiver window. Check #2.
				335	*
				336	* The scheme does not work when sender sends good segments opening
				337	* window and then starts to feed us spaghetti. But it should work
				338	* in common situations. Otherwise, we have to rely on queue collapsing.
				339	*/
				340
				341	/* Slow part of check#2. */
				342	static int __tcp_grow_window(const struct sock sk, const struct sk_buff skb)
				343	{
				344	struct tcp_sock *tp = tcp_sk(sk);
				345	/* Optimize this! */
				346	int truesize = tcp_win_from_space(skb->truesize) >> 1;
				347	int window = tcp_win_from_space(sysctl_tcp_rmem[2]) >> 1;
				348
				349	while (tp->rcv_ssthresh <= window) {
				350	if (truesize <= skb->len)
				351	return 2 * inet_csk(sk)->icsk_ack.rcv_mss;
				352
				353	truesize >>= 1;
				354	window >>= 1;
				355	}
				356	return 0;
				357	}
				358
				359	static void tcp_grow_window(struct sock sk, const struct sk_buff skb)
				360	{
				361	struct tcp_sock *tp = tcp_sk(sk);
				362
				363	/* Check #1 */
				364	if (tp->rcv_ssthresh < tp->window_clamp &&
				365	(int)tp->rcv_ssthresh < tcp_space(sk) &&
				366	!tcp_under_memory_pressure(sk)) {
				367	int incr;
				368
				369	/* Check #2. Increase window, if skb with such overhead
				370	* will fit to rcvbuf in future.
				371	*/
				372	if (tcp_win_from_space(skb->truesize) <= skb->len)
				373	incr = 2 * tp->advmss;
				374	else
				375	incr = __tcp_grow_window(sk, skb);
				376
				377	if (incr) {
				378	incr = max_t(int, incr, 2 * skb->len);
				379	tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr,
				380	tp->window_clamp);
				381	inet_csk(sk)->icsk_ack.quick \|= 1;
				382	}
				383	}
				384	}
				385
				386	/* 3. Tuning rcvbuf, when connection enters established state. */
				387	static void tcp_fixup_rcvbuf(struct sock *sk)
				388	{
				389	u32 mss = tcp_sk(sk)->advmss;
				390	int rcvmem;
				391
				392	rcvmem = 2 * SKB_TRUESIZE(mss + MAX_TCP_HEADER) *
				393	tcp_default_init_rwnd(mss);
				394
				395	/* Dynamic Right Sizing (DRS) has 2 to 3 RTT latency
				396	* Allow enough cushion so that sender is not limited by our window
				397	*/
				398	if (sysctl_tcp_moderate_rcvbuf)
				399	rcvmem <<= 2;
				400
				401	if (sk->sk_rcvbuf < rcvmem)
				402	sk->sk_rcvbuf = min(rcvmem, sysctl_tcp_rmem[2]);
				403	}
				404
				405	/* 4. Try to fixup all. It is made immediately after connection enters
				406	* established state.
				407	*/
				408	void tcp_init_buffer_space(struct sock *sk)
				409	{
				410	struct tcp_sock *tp = tcp_sk(sk);
				411	int maxwin;
				412
				413	if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK))
				414	tcp_fixup_rcvbuf(sk);
				415	if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK))
				416	tcp_sndbuf_expand(sk);
				417
				418	tp->rcvq_space.space = tp->rcv_wnd;
				419	tp->rcvq_space.time = tcp_time_stamp;
				420	tp->rcvq_space.seq = tp->copied_seq;
				421
				422	maxwin = tcp_full_space(sk);
				423
				424	if (tp->window_clamp >= maxwin) {
				425	tp->window_clamp = maxwin;
				426
				427	if (sysctl_tcp_app_win && maxwin > 4 * tp->advmss)
				428	tp->window_clamp = max(maxwin -
				429	(maxwin >> sysctl_tcp_app_win),
				430	4 * tp->advmss);
				431	}
				432
				433	/* Force reservation of one segment. */
				434	if (sysctl_tcp_app_win &&
				435	tp->window_clamp > 2 * tp->advmss &&
				436	tp->window_clamp + tp->advmss > maxwin)
				437	tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss);
				438
				439	tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp);
				440	tp->snd_cwnd_stamp = tcp_time_stamp;
				441	}
				442
				443	/* 5. Recalculate window clamp after socket hit its memory bounds. */
				444	static void tcp_clamp_window(struct sock *sk)
				445	{
				446	struct tcp_sock *tp = tcp_sk(sk);
				447	struct inet_connection_sock *icsk = inet_csk(sk);
				448
				449	icsk->icsk_ack.quick = 0;
				450
				451	if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
				452	!(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
				453	!tcp_under_memory_pressure(sk) &&
				454	sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) {
				455	sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
				456	sysctl_tcp_rmem[2]);
				457	}
				458	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
				459	tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss);
				460	}
				461
				462	/* Initialize RCV_MSS value.
				463	* RCV_MSS is an our guess about MSS used by the peer.
				464	* We haven't any direct information about the MSS.
				465	* It's better to underestimate the RCV_MSS rather than overestimate.
				466	* Overestimations make us ACKing less frequently than needed.
				467	* Underestimations are more easy to detect and fix by tcp_measure_rcv_mss().
				468	*/
				469	void tcp_initialize_rcv_mss(struct sock *sk)
				470	{
				471	const struct tcp_sock *tp = tcp_sk(sk);
				472	unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache);
				473
				474	hint = min(hint, tp->rcv_wnd / 2);
				475	hint = min(hint, TCP_MSS_DEFAULT);
				476	hint = max(hint, TCP_MIN_MSS);
				477
				478	inet_csk(sk)->icsk_ack.rcv_mss = hint;
				479	}
				480	EXPORT_SYMBOL(tcp_initialize_rcv_mss);
				481
				482	/* Receiver "autotuning" code.
				483	*
				484	* The algorithm for RTT estimation w/o timestamps is based on
				485	* Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL.
				486	* <http://public.lanl.gov/radiant/pubs.html#DRS>
				487	*
				488	* More detail on this code can be found at
				489	* <http://staff.psc.edu/jheffner/>,
				490	* though this reference is out of date. A new paper
				491	* is pending.
				492	*/
				493	static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
				494	{
				495	u32 new_sample = tp->rcv_rtt_est.rtt;
				496	long m = sample;
				497
				498	if (m == 0)
				499	m = 1;
				500
				501	if (new_sample != 0) {
				502	/* If we sample in larger samples in the non-timestamp
				503	* case, we could grossly overestimate the RTT especially
				504	* with chatty applications or bulk transfer apps which
				505	* are stalled on filesystem I/O.
				506	*
				507	* Also, since we are only going for a minimum in the
				508	* non-timestamp case, we do not smooth things out
				509	* else with timestamps disabled convergence takes too
				510	* long.
				511	*/
				512	if (!win_dep) {
				513	m -= (new_sample >> 3);
				514	new_sample += m;
				515	} else {
				516	m <<= 3;
				517	if (m < new_sample)
				518	new_sample = m;
				519	}
				520	} else {
				521	/* No previous measure. */
				522	new_sample = m << 3;
				523	}
				524
				525	if (tp->rcv_rtt_est.rtt != new_sample)
				526	tp->rcv_rtt_est.rtt = new_sample;
				527	}
				528
				529	static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp)
				530	{
				531	if (tp->rcv_rtt_est.time == 0)
				532	goto new_measure;
				533	if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq))
				534	return;
				535	tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rcv_rtt_est.time, 1);
				536
				537	new_measure:
				538	tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd;
				539	tp->rcv_rtt_est.time = tcp_time_stamp;
				540	}
				541
				542	static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
				543	const struct sk_buff *skb)
				544	{
				545	struct tcp_sock *tp = tcp_sk(sk);
				546	if (tp->rx_opt.rcv_tsecr &&
				547	(TCP_SKB_CB(skb)->end_seq -
				548	TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss))
				549	tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rx_opt.rcv_tsecr, 0);
				550	}
				551
				552	/*
				553	* This function should be called every time data is copied to user space.
				554	* It calculates the appropriate TCP receive buffer space.
				555	*/
				556	void tcp_rcv_space_adjust(struct sock *sk)
				557	{
				558	struct tcp_sock *tp = tcp_sk(sk);
				559	int time;
				560	int copied;
				561
				562	time = tcp_time_stamp - tp->rcvq_space.time;
				563	if (time < (tp->rcv_rtt_est.rtt >> 3) \|\| tp->rcv_rtt_est.rtt == 0)
				564	return;
				565
				566	/* Number of bytes copied to user in last RTT */
				567	copied = tp->copied_seq - tp->rcvq_space.seq;
				568	if (copied <= tp->rcvq_space.space)
				569	goto new_measure;
				570
				571	/* A bit of theory :
				572	* copied = bytes received in previous RTT, our base window
				573	* To cope with packet losses, we need a 2x factor
				574	* To cope with slow start, and sender growing its cwin by 100 %
				575	* every RTT, we need a 4x factor, because the ACK we are sending
				576	* now is for the next RTT, not the current one :
				577	* <prev RTT . ><current RTT .. ><next RTT .... >
				578	*/
				579
				580	if (sysctl_tcp_moderate_rcvbuf &&
				581	!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
				582	int rcvwin, rcvmem, rcvbuf;
				583
				584	/* minimal window to cope with packet losses, assuming
				585	* steady state. Add some cushion because of small variations.
				586	*/
				587	rcvwin = (copied << 1) + 16 * tp->advmss;
				588
				589	/* If rate increased by 25%,
				590	* assume slow start, rcvwin = 3 * copied
				591	* If rate increased by 50%,
				592	* assume sender can use 2x growth, rcvwin = 4 * copied
				593	*/
				594	if (copied >=
				595	tp->rcvq_space.space + (tp->rcvq_space.space >> 2)) {
				596	if (copied >=
				597	tp->rcvq_space.space + (tp->rcvq_space.space >> 1))
				598	rcvwin <<= 1;
				599	else
				600	rcvwin += (rcvwin >> 1);
				601	}
				602
				603	rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
				604	while (tcp_win_from_space(rcvmem) < tp->advmss)
				605	rcvmem += 128;
				606
				607	rcvbuf = min(rcvwin / tp->advmss * rcvmem, sysctl_tcp_rmem[2]);
				608	if (rcvbuf > sk->sk_rcvbuf) {
				609	sk->sk_rcvbuf = rcvbuf;
				610
				611	/* Make the window clamp follow along. */
				612	tp->window_clamp = rcvwin;
				613	}
				614	}
				615	tp->rcvq_space.space = copied;
				616
				617	new_measure:
				618	tp->rcvq_space.seq = tp->copied_seq;
				619	tp->rcvq_space.time = tcp_time_stamp;
				620	}
				621
				622	/* There is something which you must keep in mind when you analyze the
				623	* behavior of the tp->ato delayed ack timeout interval. When a
				624	* connection starts up, we want to ack as quickly as possible. The
				625	* problem is that "good" TCP's do slow start at the beginning of data
				626	* transmission. The means that until we send the first few ACK's the
				627	* sender will sit on his end and only queue most of his data, because
				628	* he can only send snd_cwnd unacked packets at any given time. For
				629	* each ACK we send, he increments snd_cwnd and transmits more of his
				630	* queue. -DaveM
				631	*/
				632	static void tcp_event_data_recv(struct sock sk, struct sk_buff skb)
				633	{
				634	struct tcp_sock *tp = tcp_sk(sk);
				635	struct inet_connection_sock *icsk = inet_csk(sk);
				636	u32 now;
				637
				638	inet_csk_schedule_ack(sk);
				639
				640	tcp_measure_rcv_mss(sk, skb);
				641
				642	tcp_rcv_rtt_measure(tp);
				643
				644	now = tcp_time_stamp;
				645
				646	if (!icsk->icsk_ack.ato) {
				647	/* The _first_ data packet received, initialize
				648	* delayed ACK engine.
				649	*/
				650	tcp_incr_quickack(sk);
				651	icsk->icsk_ack.ato = TCP_ATO_MIN;
				652	} else {
				653	int m = now - icsk->icsk_ack.lrcvtime;
				654
				655	if (m <= TCP_ATO_MIN / 2) {
				656	/* The fastest case is the first. */
				657	icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + TCP_ATO_MIN / 2;
				658	} else if (m < icsk->icsk_ack.ato) {
				659	icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + m;
				660	if (icsk->icsk_ack.ato > icsk->icsk_rto)
				661	icsk->icsk_ack.ato = icsk->icsk_rto;
				662	} else if (m > icsk->icsk_rto) {
				663	/* Too long gap. Apparently sender failed to
				664	* restart window, so that we send ACKs quickly.
				665	*/
				666	tcp_incr_quickack(sk);
				667	sk_mem_reclaim(sk);
				668	}
				669	}
				670	icsk->icsk_ack.lrcvtime = now;
				671
				672	tcp_ecn_check_ce(tp, skb);
				673
				674	if (skb->len >= 128)
				675	tcp_grow_window(sk, skb);
				676	}
				677
				678	/* Called to compute a smoothed rtt estimate. The data fed to this
				679	* routine either comes from timestamps, or from segments that were
				680	* known _not_ to have been retransmitted [see Karn/Partridge
				681	* Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88
				682	* piece by Van Jacobson.
				683	* NOTE: the next three routines used to be one big routine.
				684	* To save cycles in the RFC 1323 implementation it was better to break
				685	* it up into three procedures. -- erics
				686	*/
				687	static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
				688	{
				689	struct tcp_sock *tp = tcp_sk(sk);
				690	long m = mrtt_us; /* RTT */
				691	u32 srtt = tp->srtt_us;
				692
				693	/* The following amusing code comes from Jacobson's
				694	* article in SIGCOMM '88. Note that rtt and mdev
				695	* are scaled versions of rtt and mean deviation.
				696	* This is designed to be as fast as possible
				697	* m stands for "measurement".
				698	*
				699	* On a 1990 paper the rto value is changed to:
				700	* RTO = rtt + 4 * mdev
				701	*
				702	* Funny. This algorithm seems to be very broken.
				703	* These formulae increase RTO, when it should be decreased, increase
				704	* too slowly, when it should be increased quickly, decrease too quickly
				705	* etc. I guess in BSD RTO takes ONE value, so that it is absolutely
				706	* does not matter how to _calculate_ it. Seems, it was trap
				707	* that VJ failed to avoid. 8)
				708	*/
				709	if (srtt != 0) {
				710	m -= (srtt >> 3); /* m is now error in rtt est */
				711	srtt += m; /* rtt = 7/8 rtt + 1/8 new */
				712	if (m < 0) {
				713	m = -m; /* m is now abs(error) */
				714	m -= (tp->mdev_us >> 2); /* similar update on mdev */
				715	/* This is similar to one of Eifel findings.
				716	* Eifel blocks mdev updates when rtt decreases.
				717	* This solution is a bit different: we use finer gain
				718	* for mdev in this case (alpha*beta).
				719	* Like Eifel it also prevents growth of rto,
				720	* but also it limits too fast rto decreases,
				721	* happening in pure Eifel.
				722	*/
				723	if (m > 0)
				724	m >>= 3;
				725	} else {
				726	m -= (tp->mdev_us >> 2); /* similar update on mdev */
				727	}
				728	tp->mdev_us += m; /* mdev = 3/4 mdev + 1/4 new */
				729	if (tp->mdev_us > tp->mdev_max_us) {
				730	tp->mdev_max_us = tp->mdev_us;
				731	if (tp->mdev_max_us > tp->rttvar_us)
				732	tp->rttvar_us = tp->mdev_max_us;
				733	}
				734	if (after(tp->snd_una, tp->rtt_seq)) {
				735	if (tp->mdev_max_us < tp->rttvar_us)
				736	tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2;
				737	tp->rtt_seq = tp->snd_nxt;
				738	tp->mdev_max_us = tcp_rto_min_us(sk);
				739	}
				740	} else {
				741	/* no previous measure. */
				742	srtt = m << 3; /* take the measured time to be rtt */
				743	tp->mdev_us = m << 1; /* make sure rto = 3rtt /
				744	tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk));
				745	tp->mdev_max_us = tp->rttvar_us;
				746	tp->rtt_seq = tp->snd_nxt;
				747	}
				748	tp->srtt_us = max(1U, srtt);
				749	}
				750
				751	/* Set the sk_pacing_rate to allow proper sizing of TSO packets.
				752	* Note: TCP stack does not yet implement pacing.
				753	* FQ packet scheduler can be used to implement cheap but effective
				754	* TCP pacing, to smooth the burst on large writes when packets
				755	* in flight is significantly lower than cwnd (or rwin)
				756	*/
				757	int sysctl_tcp_pacing_ss_ratio __read_mostly = 200;
				758	int sysctl_tcp_pacing_ca_ratio __read_mostly = 120;
				759
				760	static void tcp_update_pacing_rate(struct sock *sk)
				761	{
				762	const struct tcp_sock *tp = tcp_sk(sk);
				763	u64 rate;
				764
				765	/* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */
				766	rate = (u64)tp->mss_cache * ((USEC_PER_SEC / 100) << 3);
				767
				768	/* current rate is (cwnd * mss) / srtt
				769	* In Slow Start [1], set sk_pacing_rate to 200 % the current rate.
				770	* In Congestion Avoidance phase, set it to 120 % the current rate.
				771	*
				772	* [1] : Normal Slow Start condition is (tp->snd_cwnd < tp->snd_ssthresh)
				773	* If snd_cwnd >= (tp->snd_ssthresh / 2), we are approaching
				774	* end of slow start and should slow down.
				775	*/
				776	if (tp->snd_cwnd < tp->snd_ssthresh / 2)
				777	rate *= sysctl_tcp_pacing_ss_ratio;
				778	else
				779	rate *= sysctl_tcp_pacing_ca_ratio;
				780
				781	rate *= max(tp->snd_cwnd, tp->packets_out);
				782
				783	if (likely(tp->srtt_us))
				784	do_div(rate, tp->srtt_us);
				785
				786	/* ACCESS_ONCE() is needed because sch_fq fetches sk_pacing_rate
				787	* without any lock. We want to make sure compiler wont store
				788	* intermediate values in this location.
				789	*/
				790	ACCESS_ONCE(sk->sk_pacing_rate) = min_t(u64, rate,
				791	sk->sk_max_pacing_rate);
				792	}
				793
				794	/* Calculate rto without backoff. This is the second half of Van Jacobson's
				795	* routine referred to above.
				796	*/
				797	static void tcp_set_rto(struct sock *sk)
				798	{
				799	const struct tcp_sock *tp = tcp_sk(sk);
				800	/* Old crap is replaced with new one. 8)
				801	*
				802	* More seriously:
				803	* 1. If rtt variance happened to be less 50msec, it is hallucination.
				804	* It cannot be less due to utterly erratic ACK generation made
				805	* at least by solaris and freebsd. "Erratic ACKs" has _nothing_
				806	* to do with delayed acks, because at cwnd>2 true delack timeout
				807	* is invisible. Actually, Linux-2.4 also generates erratic
				808	* ACKs in some circumstances.
				809	*/
				810	inet_csk(sk)->icsk_rto = __tcp_set_rto(tp);
				811
				812	/* 2. Fixups made earlier cannot be right.
				813	* If we do not estimate RTO correctly without them,
				814	* all the algo is pure shit and should be replaced
				815	* with correct one. It is exactly, which we pretend to do.
				816	*/
				817
				818	/* NOTE: clamping at TCP_RTO_MIN is not required, current algo
				819	* guarantees that rto is higher.
				820	*/
				821	tcp_bound_rto(sk);
				822	}
				823
				824	__u32 tcp_init_cwnd(const struct tcp_sock tp, const struct dst_entry dst)
				825	{
				826	__u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
				827
				828	if (!cwnd)
				829	cwnd = TCP_INIT_CWND;
				830	return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
				831	}
				832
				833	/*
				834	* Packet counting of FACK is based on in-order assumptions, therefore TCP
				835	* disables it when reordering is detected
				836	*/
				837	void tcp_disable_fack(struct tcp_sock *tp)
				838	{
				839	/* RFC3517 uses different metric in lost marker => reset on change */
				840	if (tcp_is_fack(tp))
				841	tp->lost_skb_hint = NULL;
				842	tp->rx_opt.sack_ok &= ~TCP_FACK_ENABLED;
				843	}
				844
				845	/* Take a notice that peer is sending D-SACKs */
				846	static void tcp_dsack_seen(struct tcp_sock *tp)
				847	{
				848	tp->rx_opt.sack_ok \|= TCP_DSACK_SEEN;
				849	}
				850
				851	static void tcp_update_reordering(struct sock *sk, const int metric,
				852	const int ts)
				853	{
				854	struct tcp_sock *tp = tcp_sk(sk);
				855	if (metric > tp->reordering) {
				856	int mib_idx;
				857
				858	tp->reordering = min(sysctl_tcp_max_reordering, metric);
				859
				860	/* This exciting event is worth to be remembered. 8) */
				861	if (ts)
				862	mib_idx = LINUX_MIB_TCPTSREORDER;
				863	else if (tcp_is_reno(tp))
				864	mib_idx = LINUX_MIB_TCPRENOREORDER;
				865	else if (tcp_is_fack(tp))
				866	mib_idx = LINUX_MIB_TCPFACKREORDER;
				867	else
				868	mib_idx = LINUX_MIB_TCPSACKREORDER;
				869
				870	NET_INC_STATS_BH(sock_net(sk), mib_idx);
				871	#if FASTRETRANS_DEBUG > 1
				872	pr_debug("Disorder%d %d %u f%u s%u rr%d\n",
				873	tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
				874	tp->reordering,
				875	tp->fackets_out,
				876	tp->sacked_out,
				877	tp->undo_marker ? tp->undo_retrans : 0);
				878	#endif
				879	tcp_disable_fack(tp);
				880	}
				881
				882	if (metric > 0)
				883	tcp_disable_early_retrans(tp);
				884	tp->rack.reord = 1;
				885	}
				886
				887	/* This must be called before lost_out is incremented */
				888	static void tcp_verify_retransmit_hint(struct tcp_sock tp, struct sk_buff skb)
				889	{
				890	if (!tp->retransmit_skb_hint \|\|
				891	before(TCP_SKB_CB(skb)->seq,
				892	TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
				893	tp->retransmit_skb_hint = skb;
				894
				895	if (!tp->lost_out \|\|
				896	after(TCP_SKB_CB(skb)->end_seq, tp->retransmit_high))
				897	tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
				898	}
				899
				900	static void tcp_skb_mark_lost(struct tcp_sock tp, struct sk_buff skb)
				901	{
				902	if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST\|TCPCB_SACKED_ACKED))) {
				903	tcp_verify_retransmit_hint(tp, skb);
				904
				905	tp->lost_out += tcp_skb_pcount(skb);
				906	TCP_SKB_CB(skb)->sacked \|= TCPCB_LOST;
				907	}
				908	}
				909
				910	void tcp_skb_mark_lost_uncond_verify(struct tcp_sock tp, struct sk_buff skb)
				911	{
				912	tcp_verify_retransmit_hint(tp, skb);
				913
				914	if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST\|TCPCB_SACKED_ACKED))) {
				915	tp->lost_out += tcp_skb_pcount(skb);
				916	TCP_SKB_CB(skb)->sacked \|= TCPCB_LOST;
				917	}
				918	}
				919
				920	/* This procedure tags the retransmission queue when SACKs arrive.
				921	*
				922	* We have three tag bits: SACKED(S), RETRANS(R) and LOST(L).
				923	* Packets in queue with these bits set are counted in variables
				924	* sacked_out, retrans_out and lost_out, correspondingly.
				925	*
				926	* Valid combinations are:
				927	* Tag InFlight Description
				928	* 0 1 - orig segment is in flight.
				929	* S 0 - nothing flies, orig reached receiver.
				930	* L 0 - nothing flies, orig lost by net.
				931	* R 2 - both orig and retransmit are in flight.
				932	* L|R 1 - orig is lost, retransmit is in flight.
				933	* S|R 1 - orig reached receiver, retrans is still in flight.
				934	* (L|S|R is logically valid, it could occur when L|R is sacked,
				935	* but it is equivalent to plain S and code short-circuits it to S.
				936	* L|S is logically invalid, it would mean -1 packet in flight 8))
				937	*
				938	* These 6 states form finite state machine, controlled by the following events:
				939	* 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue())
				940	* 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue())
				941	* 3. Loss detection event of two flavors:
				942	* A. Scoreboard estimator decided the packet is lost.
				943	* A'. Reno "three dupacks" marks head of queue lost.
				944	* A''. Its FACK modification, head until snd.fack is lost.
				945	* B. SACK arrives sacking SND.NXT at the moment, when the
				946	* segment was retransmitted.
				947	* 4. D-SACK added new rule: D-SACK changes any tag to S.
				948	*
				949	* It is pleasant to note, that state diagram turns out to be commutative,
				950	* so that we are allowed not to be bothered by order of our actions,
				951	* when multiple events arrive simultaneously. (see the function below).
				952	*
				953	* Reordering detection.
				954	* --------------------
				955	* Reordering metric is maximal distance, which a packet can be displaced
				956	* in packet stream. With SACKs we can estimate it:
				957	*
				958	* 1. SACK fills old hole and the corresponding segment was not
				959	* ever retransmitted -> reordering. Alas, we cannot use it
				960	* when segment was retransmitted.
				961	* 2. The last flaw is solved with D-SACK. D-SACK arrives
				962	* for retransmitted and already SACKed segment -> reordering..
				963	* Both of these heuristics are not used in Loss state, when we cannot
				964	* account for retransmits accurately.
				965	*
				966	* SACK block validation.
				967	* ----------------------
				968	*
				969	* SACK block range validation checks that the received SACK block fits to
				970	* the expected sequence limits, i.e., it is between SND.UNA and SND.NXT.
				971	* Note that SND.UNA is not included to the range though being valid because
				972	* it means that the receiver is rather inconsistent with itself reporting
				973	* SACK reneging when it should advance SND.UNA. Such a SACK block is
				974	* perfectly valid, however, in light of RFC2018 which explicitly states
				975	* that "SACK block MUST reflect the newest segment. Even if the newest
				976	* segment is going to be discarded ...", not that it looks very clever
				977	* in case of head skb. Due to potential receiver driven attacks, we
				978	* choose to avoid immediate execution of a walk in write queue due to
				979	* reneging and defer head skb's loss recovery to standard loss recovery
				980	* procedure that will eventually trigger (nothing forbids us doing this).
				981	*
				982	* Implements also blockage to start_seq wrap-around. Problem lies in the
				983	* fact that though start_seq (s) is before end_seq (i.e., not reversed),
				984	* there's no guarantee that it will be before snd_nxt (n). The problem
				985	* happens when start_seq resides between end_seq wrap (e_w) and snd_nxt
				986	* wrap (s_w):
				987	*
				988	*         <- outs wnd ->                          <- wrapzone ->
				989	*         u     e      n                         u_w   e_w  s n_w
				990	*         |     |      |                          |     |   |  |
				991	* |<------------+------+----- TCP seqno space --------------+---------->|
				992	* ...-- <2^31 ->|                                           |<--------...
				993	* ...---- >2^31 ------>|                                    |<--------...
				994	*
				995	* Current code wouldn't be vulnerable but it's better still to discard such
				996	* crazy SACK blocks. Doing this check for start_seq alone closes somewhat
				997	* similar case (end_seq after snd_nxt wrap) as earlier reversed check in
				998	* snd_nxt wrap -> snd_una region will then become "well defined", i.e.,
				999	* equal to the ideal case (infinite seqno space without wrap caused issues).
				1000	*
				1001	* With D-SACK the lower bound is extended to cover sequence space below
				1002	* SND.UNA down to undo_marker, which is the last point of interest. Yet
				1003	* again, D-SACK block must not go across snd_una (for the same reason as
				1004	* for the normal SACK blocks, explained above). But there all simplicity
				1005	* ends, TCP might receive valid D-SACKs below that. As long as they reside
				1006	* fully below undo_marker they do not affect behavior in any way and can
				1007	* therefore be safely ignored. In rare cases (which are more or less
				1008	* theoretical ones), the D-SACK will nicely cross that boundary due to skb
				1009	* fragmentation and packet reordering past skb's retransmission. To consider
				1010	* them correctly, the acceptable range must be extended even more though
				1011	* the exact amount is rather hard to quantify. However, tp->max_window can
				1012	* be used as an exaggerated estimate.
				1013	*/
				1014	static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack,
				1015	u32 start_seq, u32 end_seq)
				1016	{
				1017	/* Too far in future, or reversed (interpretation is ambiguous) */
				1018	if (after(end_seq, tp->snd_nxt) \|\| !before(start_seq, end_seq))
				1019	return false;
				1020
				1021	/* Nasty start_seq wrap-around check (see comments above) */
				1022	if (!before(start_seq, tp->snd_nxt))
				1023	return false;
				1024
				1025	/* In outstanding window? ...This is valid exit for D-SACKs too.
				1026	* start_seq == snd_una is non-sensical (see comments above)
				1027	*/
				1028	if (after(start_seq, tp->snd_una))
				1029	return true;
				1030
				1031	if (!is_dsack \|\| !tp->undo_marker)
				1032	return false;
				1033
				1034	/* ...Then it's D-SACK, and must reside below snd_una completely */
				1035	if (after(end_seq, tp->snd_una))
				1036	return false;
				1037
				1038	if (!before(start_seq, tp->undo_marker))
				1039	return true;
				1040
				1041	/* Too old */
				1042	if (!after(end_seq, tp->undo_marker))
				1043	return false;
				1044
				1045	/* Undo_marker boundary crossing (overestimates a lot). Known already:
				1046	* start_seq < undo_marker and end_seq >= undo_marker.
				1047	*/
				1048	return !before(start_seq, end_seq - tp->max_window);
				1049	}
				1050
				1051	static bool tcp_check_dsack(struct sock sk, const struct sk_buff ack_skb,
				1052	struct tcp_sack_block_wire *sp, int num_sacks,
				1053	u32 prior_snd_una)
				1054	{
				1055	struct tcp_sock *tp = tcp_sk(sk);
				1056	u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq);
				1057	u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq);
				1058	bool dup_sack = false;
				1059
				1060	if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) {
				1061	dup_sack = true;
				1062	tcp_dsack_seen(tp);
				1063	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKRECV);
				1064	} else if (num_sacks > 1) {
				1065	u32 end_seq_1 = get_unaligned_be32(&sp[1].end_seq);
				1066	u32 start_seq_1 = get_unaligned_be32(&sp[1].start_seq);
				1067
				1068	if (!after(end_seq_0, end_seq_1) &&
				1069	!before(start_seq_0, start_seq_1)) {
				1070	dup_sack = true;
				1071	tcp_dsack_seen(tp);
				1072	NET_INC_STATS_BH(sock_net(sk),
				1073	LINUX_MIB_TCPDSACKOFORECV);
				1074	}
				1075	}
				1076
				1077	/* D-SACK for already forgotten data... Do dumb counting. */
				1078	if (dup_sack && tp->undo_marker && tp->undo_retrans > 0 &&
				1079	!after(end_seq_0, prior_snd_una) &&
				1080	after(end_seq_0, tp->undo_marker))
				1081	tp->undo_retrans--;
				1082
				1083	return dup_sack;
				1084	}
				1085
				1086	struct tcp_sacktag_state {
				1087	int reord;
				1088	int fack_count;
				1089	/* Timestamps for earliest and latest never-retransmitted segment
				1090	* that was SACKed. RTO needs the earliest RTT to stay conservative,
				1091	* but congestion control should still get an accurate delay signal.
				1092	*/
				1093	struct skb_mstamp first_sackt;
				1094	struct skb_mstamp last_sackt;
				1095	int flag;
				1096	};
				1097
				1098	/* Check if skb is fully within the SACK block. In presence of GSO skbs,
				1099	* the incoming SACK may not exactly match but we can find smaller MSS
				1100	* aligned portion of it that matches. Therefore we might need to fragment
				1101	* which may fail and creates some hassle (caller must handle error case
				1102	* returns).
				1103	*
				1104	* FIXME: this could be merged to shift decision code
				1105	*/
				1106	static int tcp_match_skb_to_sack(struct sock sk, struct sk_buff skb,
				1107	u32 start_seq, u32 end_seq)
				1108	{
				1109	int err;
				1110	bool in_sack;
				1111	unsigned int pkt_len;
				1112	unsigned int mss;
				1113
				1114	in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
				1115	!before(end_seq, TCP_SKB_CB(skb)->end_seq);
				1116
				1117	if (tcp_skb_pcount(skb) > 1 && !in_sack &&
				1118	after(TCP_SKB_CB(skb)->end_seq, start_seq)) {
				1119	mss = tcp_skb_mss(skb);
				1120	in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);
				1121
				1122	if (!in_sack) {
				1123	pkt_len = start_seq - TCP_SKB_CB(skb)->seq;
				1124	if (pkt_len < mss)
				1125	pkt_len = mss;
				1126	} else {
				1127	pkt_len = end_seq - TCP_SKB_CB(skb)->seq;
				1128	if (pkt_len < mss)
				1129	return -EINVAL;
				1130	}
				1131
				1132	/* Round if necessary so that SACKs cover only full MSSes
				1133	* and/or the remaining small portion (if present)
				1134	*/
				1135	if (pkt_len > mss) {
				1136	unsigned int new_len = (pkt_len / mss) * mss;
				1137	if (!in_sack && new_len < pkt_len)
				1138	new_len += mss;
				1139	pkt_len = new_len;
				1140	}
				1141
				1142	if (pkt_len >= skb->len && !in_sack)
				1143	return 0;
				1144
				1145	err = tcp_fragment(sk, skb, pkt_len, mss, GFP_ATOMIC);
				1146	if (err < 0)
				1147	return err;
				1148	}
				1149
				1150	return in_sack;
				1151	}
				1152
				1153	/* Mark the given newly-SACKed range as such, adjusting counters and hints. */
				1154	static u8 tcp_sacktag_one(struct sock *sk,
				1155	struct tcp_sacktag_state *state, u8 sacked,
				1156	u32 start_seq, u32 end_seq,
				1157	int dup_sack, int pcount,
				1158	const struct skb_mstamp *xmit_time)
				1159	{
				1160	struct tcp_sock *tp = tcp_sk(sk);
				1161	int fack_count = state->fack_count;
				1162
				1163	/* Account D-SACK for retransmitted packet. */
				1164	if (dup_sack && (sacked & TCPCB_RETRANS)) {
				1165	if (tp->undo_marker && tp->undo_retrans > 0 &&
				1166	after(end_seq, tp->undo_marker))
				1167	tp->undo_retrans--;
				1168	if (sacked & TCPCB_SACKED_ACKED)
				1169	state->reord = min(fack_count, state->reord);
				1170	}
				1171
				1172	/* Nothing to do; acked frame is about to be dropped (was ACKed). */
				1173	if (!after(end_seq, tp->snd_una))
				1174	return sacked;
				1175
				1176	if (!(sacked & TCPCB_SACKED_ACKED)) {
				1177	tcp_rack_advance(tp, xmit_time, sacked);
				1178
				1179	if (sacked & TCPCB_SACKED_RETRANS) {
				1180	/* If the segment is not tagged as lost,
				1181	* we do not clear RETRANS, believing
				1182	* that retransmission is still in flight.
				1183	*/
				1184	if (sacked & TCPCB_LOST) {
				1185	sacked &= ~(TCPCB_LOST\|TCPCB_SACKED_RETRANS);
				1186	tp->lost_out -= pcount;
				1187	tp->retrans_out -= pcount;
				1188	}
				1189	} else {
				1190	if (!(sacked & TCPCB_RETRANS)) {
				1191	/* New sack for not retransmitted frame,
				1192	* which was in hole. It is reordering.
				1193	*/
				1194	if (before(start_seq,
				1195	tcp_highest_sack_seq(tp)))
				1196	state->reord = min(fack_count,
				1197	state->reord);
				1198	if (!after(end_seq, tp->high_seq))
				1199	state->flag \|= FLAG_ORIG_SACK_ACKED;
				1200	if (state->first_sackt.v64 == 0)
				1201	state->first_sackt = *xmit_time;
				1202	state->last_sackt = *xmit_time;
				1203	}
				1204
				1205	if (sacked & TCPCB_LOST) {
				1206	sacked &= ~TCPCB_LOST;
				1207	tp->lost_out -= pcount;
				1208	}
				1209	}
				1210
				1211	sacked \|= TCPCB_SACKED_ACKED;
				1212	state->flag \|= FLAG_DATA_SACKED;
				1213	tp->sacked_out += pcount;
				1214
				1215	fack_count += pcount;
				1216
				1217	/* Lost marker hint past SACKed? Tweak RFC3517 cnt */
				1218	if (!tcp_is_fack(tp) && tp->lost_skb_hint &&
				1219	before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq))
				1220	tp->lost_cnt_hint += pcount;
				1221
				1222	if (fack_count > tp->fackets_out)
				1223	tp->fackets_out = fack_count;
				1224	}
				1225
				1226	/* D-SACK. We can detect redundant retransmission in S\|R and plain R
				1227	* frames and clear it. undo_retrans is decreased above, L\|R frames
				1228	* are accounted above as well.
				1229	*/
				1230	if (dup_sack && (sacked & TCPCB_SACKED_RETRANS)) {
				1231	sacked &= ~TCPCB_SACKED_RETRANS;
				1232	tp->retrans_out -= pcount;
				1233	}
				1234
				1235	return sacked;
				1236	}
				1237
				1238	/* Shift newly-SACKed bytes from this skb to the immediately previous
				1239	* already-SACKed sk_buff. Mark the newly-SACKed bytes as such.
				1240	*/
				1241	static bool tcp_shifted_skb(struct sock sk, struct sk_buff skb,
				1242	struct tcp_sacktag_state *state,
				1243	unsigned int pcount, int shifted, int mss,
				1244	bool dup_sack)
				1245	{
				1246	struct tcp_sock *tp = tcp_sk(sk);
				1247	struct sk_buff *prev = tcp_write_queue_prev(sk, skb);
				1248	u32 start_seq = TCP_SKB_CB(skb)->seq; /* start of newly-SACKed */
				1249	u32 end_seq = start_seq + shifted; /* end of newly-SACKed */
				1250
				1251	BUG_ON(!pcount);
				1252
				1253	/* Adjust counters and hints for the newly sacked sequence
				1254	* range but discard the return value since prev is already
				1255	* marked. We must tag the range first because the seq
				1256	* advancement below implicitly advances
				1257	* tcp_highest_sack_seq() when skb is highest_sack.
				1258	*/
				1259	tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
				1260	start_seq, end_seq, dup_sack, pcount,
				1261	&skb->skb_mstamp);
				1262
				1263	if (skb == tp->lost_skb_hint)
				1264	tp->lost_cnt_hint += pcount;
				1265
				1266	TCP_SKB_CB(prev)->end_seq += shifted;
				1267	TCP_SKB_CB(skb)->seq += shifted;
				1268
				1269	tcp_skb_pcount_add(prev, pcount);
Kyle Swenson	e01461f	2021-03-15 11:14:57 -0600	[diff] [blame]	1270	WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount);
Kyle Swenson	8d8f654	2021-03-15 11:02:55 -0600	[diff] [blame]	1271	tcp_skb_pcount_add(skb, -pcount);
				1272
				1273	/* When we're adding to gso_segs == 1, gso_size will be zero,
				1274	* in theory this shouldn't be necessary but as long as DSACK
				1275	* code can come after this skb later on it's better to keep
				1276	* setting gso_size to something.
				1277	*/
				1278	if (!TCP_SKB_CB(prev)->tcp_gso_size)
				1279	TCP_SKB_CB(prev)->tcp_gso_size = mss;
				1280
				1281	/* CHECKME: To clear or not to clear? Mimics normal skb currently */
				1282	if (tcp_skb_pcount(skb) <= 1)
				1283	TCP_SKB_CB(skb)->tcp_gso_size = 0;
				1284
				1285	/* Difference in this won't matter, both ACKed by the same cumul. ACK */
				1286	TCP_SKB_CB(prev)->sacked \|= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS);
				1287
				1288	if (skb->len > 0) {
				1289	BUG_ON(!tcp_skb_pcount(skb));
				1290	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTED);
				1291	return false;
				1292	}
				1293
				1294	/* Whole SKB was eaten :-) */
				1295
				1296	if (skb == tp->retransmit_skb_hint)
				1297	tp->retransmit_skb_hint = prev;
				1298	if (skb == tp->lost_skb_hint) {
				1299	tp->lost_skb_hint = prev;
				1300	tp->lost_cnt_hint -= tcp_skb_pcount(prev);
				1301	}
				1302
				1303	TCP_SKB_CB(prev)->tcp_flags \|= TCP_SKB_CB(skb)->tcp_flags;
				1304	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
				1305	TCP_SKB_CB(prev)->end_seq++;
				1306
				1307	if (skb == tcp_highest_sack(sk))
				1308	tcp_advance_highest_sack(sk, skb);
				1309
				1310	tcp_unlink_write_queue(skb, sk);
				1311	sk_wmem_free_skb(sk, skb);
				1312
				1313	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKMERGED);
				1314
				1315	return true;
				1316	}
				1317
				1318	/* I wish gso_size would have a bit more sane initialization than
				1319	* something-or-zero which complicates things
				1320	*/
				1321	static int tcp_skb_seglen(const struct sk_buff *skb)
				1322	{
				1323	return tcp_skb_pcount(skb) == 1 ? skb->len : tcp_skb_mss(skb);
				1324	}
				1325
				1326	/* Shifting pages past head area doesn't work */
				1327	static int skb_can_shift(const struct sk_buff *skb)
				1328	{
				1329	return !skb_headlen(skb) && skb_is_nonlinear(skb);
				1330	}
				1331
Kyle Swenson	e01461f	2021-03-15 11:14:57 -0600	[diff] [blame]	1332	int tcp_skb_shift(struct sk_buff to, struct sk_buff from,
				1333	int pcount, int shiftlen)
				1334	{
				1335	/* TCP min gso_size is 8 bytes (TCP_MIN_GSO_SIZE)
				1336	* Since TCP_SKB_CB(skb)->tcp_gso_segs is 16 bits, we need
				1337	* to make sure not storing more than 65535 * 8 bytes per skb,
				1338	* even if current MSS is bigger.
				1339	*/
				1340	if (unlikely(to->len + shiftlen >= 65535 * TCP_MIN_GSO_SIZE))
				1341	return 0;
				1342	if (unlikely(tcp_skb_pcount(to) + pcount > 65535))
				1343	return 0;
				1344	return skb_shift(to, from, shiftlen);
				1345	}
				1346
Kyle Swenson	8d8f654	2021-03-15 11:02:55 -0600	[diff] [blame]	1347	/* Try collapsing SACK blocks spanning across multiple skbs to a single
				1348	* skb.
				1349	*/
				1350	static struct sk_buff tcp_shift_skb_data(struct sock sk, struct sk_buff *skb,
				1351	struct tcp_sacktag_state *state,
				1352	u32 start_seq, u32 end_seq,
				1353	bool dup_sack)
				1354	{
				1355	struct tcp_sock *tp = tcp_sk(sk);
				1356	struct sk_buff *prev;
				1357	int mss;
Kyle Swenson	e01461f	2021-03-15 11:14:57 -0600	[diff] [blame]	1358	int next_pcount;
Kyle Swenson	8d8f654	2021-03-15 11:02:55 -0600	[diff] [blame]	1359	int pcount = 0;
				1360	int len;
				1361	int in_sack;
				1362
				1363	if (!sk_can_gso(sk))
				1364	goto fallback;
				1365
				1366	/* Normally R but no L won't result in plain S */
				1367	if (!dup_sack &&
				1368	(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST\|TCPCB_SACKED_RETRANS)) == TCPCB_SACKED_RETRANS)
				1369	goto fallback;
				1370	if (!skb_can_shift(skb))
				1371	goto fallback;
				1372	/* This frame is about to be dropped (was ACKed). */
				1373	if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
				1374	goto fallback;
				1375
				1376	/* Can only happen with delayed DSACK + discard craziness */
				1377	if (unlikely(skb == tcp_write_queue_head(sk)))
				1378	goto fallback;
				1379	prev = tcp_write_queue_prev(sk, skb);
				1380
				1381	if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
				1382	goto fallback;
				1383
				1384	in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
				1385	!before(end_seq, TCP_SKB_CB(skb)->end_seq);
				1386
				1387	if (in_sack) {
				1388	len = skb->len;
				1389	pcount = tcp_skb_pcount(skb);
				1390	mss = tcp_skb_seglen(skb);
				1391
				1392	/* TODO: Fix DSACKs to not fragment already SACKed and we can
				1393	* drop this restriction as unnecessary
				1394	*/
				1395	if (mss != tcp_skb_seglen(prev))
				1396	goto fallback;
				1397	} else {
				1398	if (!after(TCP_SKB_CB(skb)->end_seq, start_seq))
				1399	goto noop;
				1400	/* CHECKME: This is non-MSS split case only?, this will
				1401	* cause skipped skbs due to advancing loop btw, original
				1402	* has that feature too
				1403	*/
				1404	if (tcp_skb_pcount(skb) <= 1)
				1405	goto noop;
				1406
				1407	in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);
				1408	if (!in_sack) {
				1409	/* TODO: head merge to next could be attempted here
				1410	* if (!after(TCP_SKB_CB(skb)->end_seq, end_seq)),
				1411	* though it might not be worth of the additional hassle
				1412	*
				1413	* ...we can probably just fallback to what was done
				1414	* previously. We could try merging non-SACKed ones
				1415	* as well but it probably isn't going to buy off
				1416	* because later SACKs might again split them, and
				1417	* it would make skb timestamp tracking considerably
				1418	* harder problem.
				1419	*/
				1420	goto fallback;
				1421	}
				1422
				1423	len = end_seq - TCP_SKB_CB(skb)->seq;
				1424	BUG_ON(len < 0);
				1425	BUG_ON(len > skb->len);
				1426
				1427	/* MSS boundaries should be honoured or else pcount will
				1428	* severely break even though it makes things bit trickier.
				1429	* Optimize common case to avoid most of the divides
				1430	*/
				1431	mss = tcp_skb_mss(skb);
				1432
				1433	/* TODO: Fix DSACKs to not fragment already SACKed and we can
				1434	* drop this restriction as unnecessary
				1435	*/
				1436	if (mss != tcp_skb_seglen(prev))
				1437	goto fallback;
				1438
				1439	if (len == mss) {
				1440	pcount = 1;
				1441	} else if (len < mss) {
				1442	goto noop;
				1443	} else {
				1444	pcount = len / mss;
				1445	len = pcount * mss;
				1446	}
				1447	}
				1448
				1449	/* tcp_sacktag_one() won't SACK-tag ranges below snd_una */
				1450	if (!after(TCP_SKB_CB(skb)->seq + len, tp->snd_una))
				1451	goto fallback;
				1452
Kyle Swenson	e01461f	2021-03-15 11:14:57 -0600	[diff] [blame]	1453	if (!tcp_skb_shift(prev, skb, pcount, len))
Kyle Swenson	8d8f654	2021-03-15 11:02:55 -0600	[diff] [blame]	1454	goto fallback;
				1455	if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss, dup_sack))
				1456	goto out;
				1457
				1458	/* Hole filled allows collapsing with the next as well, this is very
				1459	* useful when hole on every nth skb pattern happens
				1460	*/
				1461	if (prev == tcp_write_queue_tail(sk))
				1462	goto out;
				1463	skb = tcp_write_queue_next(sk, prev);
				1464
				1465	if (!skb_can_shift(skb) \|\|
				1466	(skb == tcp_send_head(sk)) \|\|
				1467	((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) \|\|
				1468	(mss != tcp_skb_seglen(skb)))
				1469	goto out;
				1470
				1471	len = skb->len;
Kyle Swenson	e01461f	2021-03-15 11:14:57 -0600	[diff] [blame]	1472	next_pcount = tcp_skb_pcount(skb);
				1473	if (tcp_skb_shift(prev, skb, next_pcount, len)) {
				1474	pcount += next_pcount;
				1475	tcp_shifted_skb(sk, skb, state, next_pcount, len, mss, 0);
Kyle Swenson	8d8f654	2021-03-15 11:02:55 -0600	[diff] [blame]	1476	}
Kyle Swenson	8d8f654	2021-03-15 11:02:55 -0600	[diff] [blame]	1477	out:
				1478	state->fack_count += pcount;
				1479	return prev;
				1480
				1481	noop:
				1482	return skb;
				1483
				1484	fallback:
				1485	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTFALLBACK);
				1486	return NULL;
				1487	}
				1488
				1489	static struct sk_buff tcp_sacktag_walk(struct sk_buff skb, struct sock *sk,
				1490	struct tcp_sack_block *next_dup,
				1491	struct tcp_sacktag_state *state,
				1492	u32 start_seq, u32 end_seq,
				1493	bool dup_sack_in)
				1494	{
				1495	struct tcp_sock *tp = tcp_sk(sk);
				1496	struct sk_buff *tmp;
				1497
				1498	tcp_for_write_queue_from(skb, sk) {
				1499	int in_sack = 0;
				1500	bool dup_sack = dup_sack_in;
				1501
				1502	if (skb == tcp_send_head(sk))
				1503	break;
				1504
				1505	/* queue is in-order => we can short-circuit the walk early */
				1506	if (!before(TCP_SKB_CB(skb)->seq, end_seq))
				1507	break;
				1508
				1509	if (next_dup &&
				1510	before(TCP_SKB_CB(skb)->seq, next_dup->end_seq)) {
				1511	in_sack = tcp_match_skb_to_sack(sk, skb,
				1512	next_dup->start_seq,
				1513	next_dup->end_seq);
				1514	if (in_sack > 0)
				1515	dup_sack = true;
				1516	}
				1517
				1518	/* skb reference here is a bit tricky to get right, since
				1519	* shifting can eat and free both this skb and the next,
				1520	* so not even _safe variant of the loop is enough.
				1521	*/
				1522	if (in_sack <= 0) {
				1523	tmp = tcp_shift_skb_data(sk, skb, state,
				1524	start_seq, end_seq, dup_sack);
				1525	if (tmp) {
				1526	if (tmp != skb) {
				1527	skb = tmp;
				1528	continue;
				1529	}
				1530
				1531	in_sack = 0;
				1532	} else {
				1533	in_sack = tcp_match_skb_to_sack(sk, skb,
				1534	start_seq,
				1535	end_seq);
				1536	}
				1537	}
				1538
				1539	if (unlikely(in_sack < 0))
				1540	break;
				1541
				1542	if (in_sack) {
				1543	TCP_SKB_CB(skb)->sacked =
				1544	tcp_sacktag_one(sk,
				1545	state,
				1546	TCP_SKB_CB(skb)->sacked,
				1547	TCP_SKB_CB(skb)->seq,
				1548	TCP_SKB_CB(skb)->end_seq,
				1549	dup_sack,
				1550	tcp_skb_pcount(skb),
				1551	&skb->skb_mstamp);
				1552
				1553	if (!before(TCP_SKB_CB(skb)->seq,
				1554	tcp_highest_sack_seq(tp)))
				1555	tcp_advance_highest_sack(sk, skb);
				1556	}
				1557
				1558	state->fack_count += tcp_skb_pcount(skb);
				1559	}
				1560	return skb;
				1561	}
				1562
				1563	/* Avoid all extra work that is being done by sacktag while walking in
				1564	* a normal way
				1565	*/
				1566	static struct sk_buff tcp_sacktag_skip(struct sk_buff skb, struct sock *sk,
				1567	struct tcp_sacktag_state *state,
				1568	u32 skip_to_seq)
				1569	{
				1570	tcp_for_write_queue_from(skb, sk) {
				1571	if (skb == tcp_send_head(sk))
				1572	break;
				1573
				1574	if (after(TCP_SKB_CB(skb)->end_seq, skip_to_seq))
				1575	break;
				1576
				1577	state->fack_count += tcp_skb_pcount(skb);
				1578	}
				1579	return skb;
				1580	}
				1581
				1582	static struct sk_buff tcp_maybe_skipping_dsack(struct sk_buff skb,
				1583	struct sock *sk,
				1584	struct tcp_sack_block *next_dup,
				1585	struct tcp_sacktag_state *state,
				1586	u32 skip_to_seq)
				1587	{
				1588	if (!next_dup)
				1589	return skb;
				1590
				1591	if (before(next_dup->start_seq, skip_to_seq)) {
				1592	skb = tcp_sacktag_skip(skb, sk, state, next_dup->start_seq);
				1593	skb = tcp_sacktag_walk(skb, sk, NULL, state,
				1594	next_dup->start_seq, next_dup->end_seq,
				1595	1);
				1596	}
				1597
				1598	return skb;
				1599	}
				1600
				1601	static int tcp_sack_cache_ok(const struct tcp_sock tp, const struct tcp_sack_block cache)
				1602	{
				1603	return cache < tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache);
				1604	}
				1605
				1606	static int
				1607	tcp_sacktag_write_queue(struct sock sk, const struct sk_buff ack_skb,
				1608	u32 prior_snd_una, struct tcp_sacktag_state *state)
				1609	{
				1610	struct tcp_sock *tp = tcp_sk(sk);
				1611	const unsigned char *ptr = (skb_transport_header(ack_skb) +
				1612	TCP_SKB_CB(ack_skb)->sacked);
				1613	struct tcp_sack_block_wire sp_wire = (struct tcp_sack_block_wire )(ptr+2);
				1614	struct tcp_sack_block sp[TCP_NUM_SACKS];
				1615	struct tcp_sack_block *cache;
				1616	struct sk_buff *skb;
				1617	int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3);
				1618	int used_sacks;
				1619	bool found_dup_sack = false;
				1620	int i, j;
				1621	int first_sack_index;
				1622
				1623	state->flag = 0;
				1624	state->reord = tp->packets_out;
				1625
				1626	if (!tp->sacked_out) {
				1627	if (WARN_ON(tp->fackets_out))
				1628	tp->fackets_out = 0;
				1629	tcp_highest_sack_reset(sk);
				1630	}
				1631
				1632	found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire,
				1633	num_sacks, prior_snd_una);
				1634	if (found_dup_sack)
				1635	state->flag \|= FLAG_DSACKING_ACK;
				1636
				1637	/* Eliminate too old ACKs, but take into
				1638	* account more or less fresh ones, they can
				1639	* contain valid SACK info.
				1640	*/
				1641	if (before(TCP_SKB_CB(ack_skb)->ack_seq, prior_snd_una - tp->max_window))
				1642	return 0;
				1643
				1644	if (!tp->packets_out)
				1645	goto out;
				1646
				1647	used_sacks = 0;
				1648	first_sack_index = 0;
				1649	for (i = 0; i < num_sacks; i++) {
				1650	bool dup_sack = !i && found_dup_sack;
				1651
				1652	sp[used_sacks].start_seq = get_unaligned_be32(&sp_wire[i].start_seq);
				1653	sp[used_sacks].end_seq = get_unaligned_be32(&sp_wire[i].end_seq);
				1654
				1655	if (!tcp_is_sackblock_valid(tp, dup_sack,
				1656	sp[used_sacks].start_seq,
				1657	sp[used_sacks].end_seq)) {
				1658	int mib_idx;
				1659
				1660	if (dup_sack) {
				1661	if (!tp->undo_marker)
				1662	mib_idx = LINUX_MIB_TCPDSACKIGNOREDNOUNDO;
				1663	else
				1664	mib_idx = LINUX_MIB_TCPDSACKIGNOREDOLD;
				1665	} else {
				1666	/* Don't count olds caused by ACK reordering */
				1667	if ((TCP_SKB_CB(ack_skb)->ack_seq != tp->snd_una) &&
				1668	!after(sp[used_sacks].end_seq, tp->snd_una))
				1669	continue;
				1670	mib_idx = LINUX_MIB_TCPSACKDISCARD;
				1671	}
				1672
				1673	NET_INC_STATS_BH(sock_net(sk), mib_idx);
				1674	if (i == 0)
				1675	first_sack_index = -1;
				1676	continue;
				1677	}
				1678
				1679	/* Ignore very old stuff early */
				1680	if (!after(sp[used_sacks].end_seq, prior_snd_una))
				1681	continue;
				1682
				1683	used_sacks++;
				1684	}
				1685
				1686	/* order SACK blocks to allow in order walk of the retrans queue */
				1687	for (i = used_sacks - 1; i > 0; i--) {
				1688	for (j = 0; j < i; j++) {
				1689	if (after(sp[j].start_seq, sp[j + 1].start_seq)) {
				1690	swap(sp[j], sp[j + 1]);
				1691
				1692	/* Track where the first SACK block goes to */
				1693	if (j == first_sack_index)
				1694	first_sack_index = j + 1;
				1695	}
				1696	}
				1697	}
				1698
				1699	skb = tcp_write_queue_head(sk);
				1700	state->fack_count = 0;
				1701	i = 0;
				1702
				1703	if (!tp->sacked_out) {
				1704	/* It's already past, so skip checking against it */
				1705	cache = tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache);
				1706	} else {
				1707	cache = tp->recv_sack_cache;
				1708	/* Skip empty blocks in at head of the cache */
				1709	while (tcp_sack_cache_ok(tp, cache) && !cache->start_seq &&
				1710	!cache->end_seq)
				1711	cache++;
				1712	}
				1713
				1714	while (i < used_sacks) {
				1715	u32 start_seq = sp[i].start_seq;
				1716	u32 end_seq = sp[i].end_seq;
				1717	bool dup_sack = (found_dup_sack && (i == first_sack_index));
				1718	struct tcp_sack_block *next_dup = NULL;
				1719
				1720	if (found_dup_sack && ((i + 1) == first_sack_index))
				1721	next_dup = &sp[i + 1];
				1722
				1723	/* Skip too early cached blocks */
				1724	while (tcp_sack_cache_ok(tp, cache) &&
				1725	!before(start_seq, cache->end_seq))
				1726	cache++;
				1727
				1728	/* Can skip some work by looking recv_sack_cache? */
				1729	if (tcp_sack_cache_ok(tp, cache) && !dup_sack &&
				1730	after(end_seq, cache->start_seq)) {
				1731
				1732	/* Head todo? */
				1733	if (before(start_seq, cache->start_seq)) {
				1734	skb = tcp_sacktag_skip(skb, sk, state,
				1735	start_seq);
				1736	skb = tcp_sacktag_walk(skb, sk, next_dup,
				1737	state,
				1738	start_seq,
				1739	cache->start_seq,
				1740	dup_sack);
				1741	}
				1742
				1743	/* Rest of the block already fully processed? */
				1744	if (!after(end_seq, cache->end_seq))
				1745	goto advance_sp;
				1746
				1747	skb = tcp_maybe_skipping_dsack(skb, sk, next_dup,
				1748	state,
				1749	cache->end_seq);
				1750
				1751	/* ...tail remains todo... */
				1752	if (tcp_highest_sack_seq(tp) == cache->end_seq) {
				1753	/* ...but better entrypoint exists! */
				1754	skb = tcp_highest_sack(sk);
				1755	if (!skb)
				1756	break;
				1757	state->fack_count = tp->fackets_out;
				1758	cache++;
				1759	goto walk;
				1760	}
				1761
				1762	skb = tcp_sacktag_skip(skb, sk, state, cache->end_seq);
				1763	/* Check overlap against next cached too (past this one already) */
				1764	cache++;
				1765	continue;
				1766	}
				1767
				1768	if (!before(start_seq, tcp_highest_sack_seq(tp))) {
				1769	skb = tcp_highest_sack(sk);
				1770	if (!skb)
				1771	break;
				1772	state->fack_count = tp->fackets_out;
				1773	}
				1774	skb = tcp_sacktag_skip(skb, sk, state, start_seq);
				1775
				1776	walk:
				1777	skb = tcp_sacktag_walk(skb, sk, next_dup, state,
				1778	start_seq, end_seq, dup_sack);
				1779
				1780	advance_sp:
				1781	i++;
				1782	}
				1783
				1784	/* Clear the head of the cache sack blocks so we can skip it next time */
				1785	for (i = 0; i < ARRAY_SIZE(tp->recv_sack_cache) - used_sacks; i++) {
				1786	tp->recv_sack_cache[i].start_seq = 0;
				1787	tp->recv_sack_cache[i].end_seq = 0;
				1788	}
				1789	for (j = 0; j < used_sacks; j++)
				1790	tp->recv_sack_cache[i++] = sp[j];
				1791
				1792	if ((state->reord < tp->fackets_out) &&
				1793	((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) \|\| tp->undo_marker))
				1794	tcp_update_reordering(sk, tp->fackets_out - state->reord, 0);
				1795
				1796	tcp_verify_left_out(tp);
				1797	out:
				1798
				1799	#if FASTRETRANS_DEBUG > 0
				1800	WARN_ON((int)tp->sacked_out < 0);
				1801	WARN_ON((int)tp->lost_out < 0);
				1802	WARN_ON((int)tp->retrans_out < 0);
				1803	WARN_ON((int)tcp_packets_in_flight(tp) < 0);
				1804	#endif
				1805	return state->flag;
				1806	}
				1807
				1808	/* Limits sacked_out so that sum with lost_out isn't ever larger than
				1809	* packets_out. Returns false if sacked_out adjustement wasn't necessary.
				1810	*/
				1811	static bool tcp_limit_reno_sacked(struct tcp_sock *tp)
				1812	{
				1813	u32 holes;
				1814
				1815	holes = max(tp->lost_out, 1U);
				1816	holes = min(holes, tp->packets_out);
				1817
				1818	if ((tp->sacked_out + holes) > tp->packets_out) {
				1819	tp->sacked_out = tp->packets_out - holes;
				1820	return true;
				1821	}
				1822	return false;
				1823	}
				1824
				1825	/* If we receive more dupacks than we expected counting segments
				1826	* in assumption of absent reordering, interpret this as reordering.
				1827	* The only another reason could be bug in receiver TCP.
				1828	*/
				1829	static void tcp_check_reno_reordering(struct sock *sk, const int addend)
				1830	{
				1831	struct tcp_sock *tp = tcp_sk(sk);
				1832	if (tcp_limit_reno_sacked(tp))
				1833	tcp_update_reordering(sk, tp->packets_out + addend, 0);
				1834	}
				1835
				1836	/* Emulate SACKs for SACKless connection: account for a new dupack. */
				1837
				1838	static void tcp_add_reno_sack(struct sock *sk)
				1839	{
				1840	struct tcp_sock *tp = tcp_sk(sk);
				1841	tp->sacked_out++;
				1842	tcp_check_reno_reordering(sk, 0);
				1843	tcp_verify_left_out(tp);
				1844	}
				1845
				1846	/* Account for ACK, ACKing some data in Reno Recovery phase. */
				1847
				1848	static void tcp_remove_reno_sacks(struct sock *sk, int acked)
				1849	{
				1850	struct tcp_sock *tp = tcp_sk(sk);
				1851
				1852	if (acked > 0) {
				1853	/* One ACK acked hole. The rest eat duplicate ACKs. */
				1854	if (acked - 1 >= tp->sacked_out)
				1855	tp->sacked_out = 0;
				1856	else
				1857	tp->sacked_out -= acked - 1;
				1858	}
				1859	tcp_check_reno_reordering(sk, acked);
				1860	tcp_verify_left_out(tp);
				1861	}
				1862
				1863	static inline void tcp_reset_reno_sack(struct tcp_sock *tp)
				1864	{
				1865	tp->sacked_out = 0;
				1866	}
				1867
				1868	void tcp_clear_retrans(struct tcp_sock *tp)
				1869	{
				1870	tp->retrans_out = 0;
				1871	tp->lost_out = 0;
				1872	tp->undo_marker = 0;
				1873	tp->undo_retrans = -1;
				1874	tp->fackets_out = 0;
				1875	tp->sacked_out = 0;
				1876	}
				1877
				1878	static inline void tcp_init_undo(struct tcp_sock *tp)
				1879	{
				1880	tp->undo_marker = tp->snd_una;
				1881	/* Retransmission still in flight may cause DSACKs later. */
				1882	tp->undo_retrans = tp->retrans_out ? : -1;
				1883	}
				1884
				1885	/* Enter Loss state. If we detect SACK reneging, forget all SACK information
				1886	* and reset tags completely, otherwise preserve SACKs. If receiver
				1887	* dropped its ofo queue, we will know this due to reneging detection.
				1888	*/
				1889	void tcp_enter_loss(struct sock *sk)
				1890	{
				1891	const struct inet_connection_sock *icsk = inet_csk(sk);
				1892	struct tcp_sock *tp = tcp_sk(sk);
				1893	struct sk_buff *skb;
				1894	bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
				1895	bool is_reneg; /* is receiver reneging on SACKs? */
				1896
				1897	/* Reduce ssthresh if it has not yet been made inside this window. */
				1898	if (icsk->icsk_ca_state <= TCP_CA_Disorder \|\|
				1899	!after(tp->high_seq, tp->snd_una) \|\|
				1900	(icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
				1901	tp->prior_ssthresh = tcp_current_ssthresh(sk);
				1902	tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
				1903	tcp_ca_event(sk, CA_EVENT_LOSS);
				1904	tcp_init_undo(tp);
				1905	}
				1906	tp->snd_cwnd = 1;
				1907	tp->snd_cwnd_cnt = 0;
				1908	tp->snd_cwnd_stamp = tcp_time_stamp;
				1909
				1910	tp->retrans_out = 0;
				1911	tp->lost_out = 0;
				1912
				1913	if (tcp_is_reno(tp))
				1914	tcp_reset_reno_sack(tp);
				1915
				1916	skb = tcp_write_queue_head(sk);
				1917	is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED);
				1918	if (is_reneg) {
				1919	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
				1920	tp->sacked_out = 0;
				1921	tp->fackets_out = 0;
				1922	}
				1923	tcp_clear_all_retrans_hints(tp);
				1924
				1925	tcp_for_write_queue(skb, sk) {
				1926	if (skb == tcp_send_head(sk))
				1927	break;
				1928
				1929	TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)\|TCPCB_SACKED_ACKED;
				1930	if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) \|\| is_reneg) {
				1931	TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
				1932	TCP_SKB_CB(skb)->sacked \|= TCPCB_LOST;
				1933	tp->lost_out += tcp_skb_pcount(skb);
				1934	tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
				1935	}
				1936	}
				1937	tcp_verify_left_out(tp);
				1938
				1939	/* Timeout in disordered state after receiving substantial DUPACKs
				1940	* suggests that the degree of reordering is over-estimated.
				1941	*/
				1942	if (icsk->icsk_ca_state <= TCP_CA_Disorder &&
				1943	tp->sacked_out >= sysctl_tcp_reordering)
				1944	tp->reordering = min_t(unsigned int, tp->reordering,
				1945	sysctl_tcp_reordering);
				1946	tcp_set_ca_state(sk, TCP_CA_Loss);
				1947	tp->high_seq = tp->snd_nxt;
				1948	tcp_ecn_queue_cwr(tp);
				1949
				1950	/* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous
				1951	* loss recovery is underway except recurring timeout(s) on
				1952	* the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing
				1953	*/
				1954	tp->frto = sysctl_tcp_frto &&
				1955	(new_recovery \|\| icsk->icsk_retransmits) &&
				1956	!inet_csk(sk)->icsk_mtup.probe_size;
				1957	}
				1958
				1959	/* If ACK arrived pointing to a remembered SACK, it means that our
				1960	* remembered SACKs do not reflect real state of receiver i.e.
				1961	* receiver _host_ is heavily congested (or buggy).
				1962	*
				1963	* To avoid big spurious retransmission bursts due to transient SACK
				1964	* scoreboard oddities that look like reneging, we give the receiver a
				1965	* little time (max(RTT/2, 10ms)) to send us some more ACKs that will
				1966	* restore sanity to the SACK scoreboard. If the apparent reneging
				1967	* persists until this RTO then we'll clear the SACK scoreboard.
				1968	*/
				1969	static bool tcp_check_sack_reneging(struct sock *sk, int flag)
				1970	{
				1971	if (flag & FLAG_SACK_RENEGING) {
				1972	struct tcp_sock *tp = tcp_sk(sk);
				1973	unsigned long delay = max(usecs_to_jiffies(tp->srtt_us >> 4),
				1974	msecs_to_jiffies(10));
				1975
				1976	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
				1977	delay, TCP_RTO_MAX);
				1978	return true;
				1979	}
				1980	return false;
				1981	}
				1982
				1983	static inline int tcp_fackets_out(const struct tcp_sock *tp)
				1984	{
				1985	return tcp_is_reno(tp) ? tp->sacked_out + 1 : tp->fackets_out;
				1986	}
				1987
				1988	/* Heurestics to calculate number of duplicate ACKs. There's no dupACKs
				1989	* counter when SACK is enabled (without SACK, sacked_out is used for
				1990	* that purpose).
				1991	*
				1992	* Instead, with FACK TCP uses fackets_out that includes both SACKed
				1993	* segments up to the highest received SACK block so far and holes in
				1994	* between them.
				1995	*
				1996	* With reordering, holes may still be in flight, so RFC3517 recovery
				1997	* uses pure sacked_out (total number of SACKed segments) even though
				1998	* it violates the RFC that uses duplicate ACKs, often these are equal
				1999	* but when e.g. out-of-window ACKs or packet duplication occurs,
				2000	* they differ. Since neither occurs due to loss, TCP should really
				2001	* ignore them.
				2002	*/
				2003	static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
				2004	{
				2005	return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1;
				2006	}
				2007
				2008	static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
				2009	{
				2010	struct tcp_sock *tp = tcp_sk(sk);
				2011	unsigned long delay;
				2012
				2013	/* Delay early retransmit and entering fast recovery for
				2014	* max(RTT/4, 2msec) unless ack has ECE mark, no RTT samples
				2015	* available, or RTO is scheduled to fire first.
				2016	*/
				2017	if (sysctl_tcp_early_retrans < 2 \|\| sysctl_tcp_early_retrans > 3 \|\|
				2018	(flag & FLAG_ECE) \|\| !tp->srtt_us)
				2019	return false;
				2020
				2021	delay = max(usecs_to_jiffies(tp->srtt_us >> 5),
				2022	msecs_to_jiffies(2));
				2023
				2024	if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay)))
				2025	return false;
				2026
				2027	inet_csk_reset_xmit_timer(sk, ICSK_TIME_EARLY_RETRANS, delay,
				2028	TCP_RTO_MAX);
				2029	return true;
				2030	}
				2031
				2032	/* Linux NewReno/SACK/FACK/ECN state machine.
				2033	* --------------------------------------
				2034	*
				2035	* "Open" Normal state, no dubious events, fast path.
				2036	* "Disorder" In all the respects it is "Open",
				2037	* but requires a bit more attention. It is entered when
				2038	* we see some SACKs or dupacks. It is split of "Open"
				2039	* mainly to move some processing from fast path to slow one.
				2040	* "CWR" CWND was reduced due to some Congestion Notification event.
				2041	* It can be ECN, ICMP source quench, local device congestion.
				2042	* "Recovery" CWND was reduced, we are fast-retransmitting.
				2043	* "Loss" CWND was reduced due to RTO timeout or SACK reneging.
				2044	*
				2045	* tcp_fastretrans_alert() is entered:
				2046	* - each incoming ACK, if state is not "Open"
				2047	* - when arrived ACK is unusual, namely:
				2048	* * SACK
				2049	* * Duplicate ACK.
				2050	* * ECN ECE.
				2051	*
				2052	* Counting packets in flight is pretty simple.
				2053	*
				2054	* in_flight = packets_out - left_out + retrans_out
				2055	*
				2056	* packets_out is SND.NXT-SND.UNA counted in packets.
				2057	*
				2058	* retrans_out is number of retransmitted segments.
				2059	*
				2060	* left_out is number of segments left network, but not ACKed yet.
				2061	*
				2062	* left_out = sacked_out + lost_out
				2063	*
				2064	* sacked_out: Packets, which arrived to receiver out of order
				2065	* and hence not ACKed. With SACKs this number is simply
				2066	* amount of SACKed data. Even without SACKs
				2067	* it is easy to give pretty reliable estimate of this number,
				2068	* counting duplicate ACKs.
				2069	*
				2070	* lost_out: Packets lost by network. TCP has no explicit
				2071	* "loss notification" feedback from network (for now).
				2072	* It means that this number can be only _guessed_.
				2073	* Actually, it is the heuristics to predict lossage that
				2074	* distinguishes different algorithms.
				2075	*
				2076	* F.e. after RTO, when all the queue is considered as lost,
				2077	* lost_out = packets_out and in_flight = retrans_out.
				2078	*
				2079	* Essentially, we have now two algorithms counting
				2080	* lost packets.
				2081	*
				2082	* FACK: It is the simplest heuristics. As soon as we decided
				2083	* that something is lost, we decide that _all_ not SACKed
				2084	* packets until the most forward SACK are lost. I.e.
				2085	* lost_out = fackets_out - sacked_out and left_out = fackets_out.
				2086	* It is absolutely correct estimate, if network does not reorder
				2087	* packets. And it loses any connection to reality when reordering
				2088	* takes place. We use FACK by default until reordering
				2089	* is suspected on the path to this destination.
				2090	*
				2091	* NewReno: when Recovery is entered, we assume that one segment
				2092	* is lost (classic Reno). While we are in Recovery and
				2093	* a partial ACK arrives, we assume that one more packet
				2094	* is lost (NewReno). These heuristics are the same in NewReno
				2095	* and SACK.
				2096	*
				2097	* Imagine, that's all! Forget about all this shamanism about CWND inflation
				2098	* deflation etc. CWND is real congestion window, never inflated, changes
				2099	* only according to classic VJ rules.
				2100	*
				2101	* Really tricky (and requiring careful tuning) part of algorithm
				2102	* is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue().
				2103	* The first determines the moment _when_ we should reduce CWND and,
				2104	* hence, slow down forward transmission. In fact, it determines the moment
				2105	* when we decide that hole is caused by loss, rather than by a reorder.
				2106	*
				2107	* tcp_xmit_retransmit_queue() decides, _what_ we should retransmit to fill
				2108	* holes, caused by lost packets.
				2109	*
				2110	* And the most logically complicated part of algorithm is undo
				2111	* heuristics. We detect false retransmits due to both too early
				2112	* fast retransmit (reordering) and underestimated RTO, analyzing
				2113	* timestamps and D-SACKs. When we detect that some segments were
				2114	* retransmitted by mistake and CWND reduction was wrong, we undo
				2115	* window reduction and abort recovery phase. This logic is hidden
				2116	* inside several functions named tcp_try_undo_<something>.
				2117	*/
				2118
				2119	/* This function decides, when we should leave Disordered state
				2120	* and enter Recovery phase, reducing congestion window.
				2121	*
				2122	* Main question: may we further continue forward transmission
				2123	* with the same cwnd?
				2124	*/
				2125	static bool tcp_time_to_recover(struct sock *sk, int flag)
				2126	{
				2127	struct tcp_sock *tp = tcp_sk(sk);
				2128	__u32 packets_out;
				2129
				2130	/* Trick#1: The loss is proven. */
				2131	if (tp->lost_out)
				2132	return true;
				2133
				2134	/* Not-A-Trick#2 : Classic rule... */
				2135	if (tcp_dupack_heuristics(tp) > tp->reordering)
				2136	return true;
				2137
				2138	/* Trick#4: It is still not OK... But will it be useful to delay
				2139	* recovery more?
				2140	*/
				2141	packets_out = tp->packets_out;
				2142	if (packets_out <= tp->reordering &&
				2143	tp->sacked_out >= max_t(__u32, packets_out/2, sysctl_tcp_reordering) &&
				2144	!tcp_may_send_now(sk)) {
				2145	/* We have nothing to send. This connection is limited
				2146	* either by receiver window or by application.
				2147	*/
				2148	return true;
				2149	}
				2150
				2151	/* If a thin stream is detected, retransmit after first
				2152	* received dupack. Employ only if SACK is supported in order
				2153	* to avoid possible corner-case series of spurious retransmissions
				2154	* Use only if there are no unsent data.
				2155	*/
				2156	if ((tp->thin_dupack \|\| sysctl_tcp_thin_dupack) &&
				2157	tcp_stream_is_thin(tp) && tcp_dupack_heuristics(tp) > 1 &&
				2158	tcp_is_sack(tp) && !tcp_send_head(sk))
				2159	return true;
				2160
				2161	/* Trick#6: TCP early retransmit, per RFC5827. To avoid spurious
				2162	* retransmissions due to small network reorderings, we implement
				2163	* Mitigation A.3 in the RFC and delay the retransmission for a short
				2164	* interval if appropriate.
				2165	*/
				2166	if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out &&
				2167	(tp->packets_out >= (tp->sacked_out + 1) && tp->packets_out < 4) &&
				2168	!tcp_may_send_now(sk))
				2169	return !tcp_pause_early_retransmit(sk, flag);
				2170
				2171	return false;
				2172	}
				2173
				2174	/* Detect loss in event "A" above by marking head of queue up as lost.
				2175	* For FACK or non-SACK(Reno) senders, the first "packets" number of segments
				2176	* are considered lost. For RFC3517 SACK, a segment is considered lost if it
				2177	* has at least tp->reordering SACKed seqments above it; "packets" refers to
				2178	* the maximum SACKed segments to pass before reaching this limit.
				2179	*/
				2180	static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
				2181	{
				2182	struct tcp_sock *tp = tcp_sk(sk);
				2183	struct sk_buff *skb;
				2184	int cnt, oldcnt, lost;
				2185	unsigned int mss;
				2186	/* Use SACK to deduce losses of new sequences sent during recovery */
				2187	const u32 loss_high = tcp_is_sack(tp) ? tp->snd_nxt : tp->high_seq;
				2188
				2189	WARN_ON(packets > tp->packets_out);
				2190	if (tp->lost_skb_hint) {
				2191	skb = tp->lost_skb_hint;
				2192	cnt = tp->lost_cnt_hint;
				2193	/* Head already handled? */
				2194	if (mark_head && skb != tcp_write_queue_head(sk))
				2195	return;
				2196	} else {
				2197	skb = tcp_write_queue_head(sk);
				2198	cnt = 0;
				2199	}
				2200
				2201	tcp_for_write_queue_from(skb, sk) {
				2202	if (skb == tcp_send_head(sk))
				2203	break;
				2204	/* TODO: do this better */
				2205	/* this is not the most efficient way to do this... */
				2206	tp->lost_skb_hint = skb;
				2207	tp->lost_cnt_hint = cnt;
				2208
				2209	if (after(TCP_SKB_CB(skb)->end_seq, loss_high))
				2210	break;
				2211
				2212	oldcnt = cnt;
				2213	if (tcp_is_fack(tp) \|\| tcp_is_reno(tp) \|\|
				2214	(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
				2215	cnt += tcp_skb_pcount(skb);
				2216
				2217	if (cnt > packets) {
				2218	if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) \|\|
				2219	(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) \|\|
				2220	(oldcnt >= packets))
				2221	break;
				2222
				2223	mss = tcp_skb_mss(skb);
				2224	/* If needed, chop off the prefix to mark as lost. */
				2225	lost = (packets - oldcnt) * mss;
				2226	if (lost < skb->len &&
				2227	tcp_fragment(sk, skb, lost, mss, GFP_ATOMIC) < 0)
				2228	break;
				2229	cnt = packets;
				2230	}
				2231
				2232	tcp_skb_mark_lost(tp, skb);
				2233
				2234	if (mark_head)
				2235	break;
				2236	}
				2237	tcp_verify_left_out(tp);
				2238	}
				2239
				2240	/* Account newly detected lost packet(s) */
				2241
				2242	static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
				2243	{
				2244	struct tcp_sock *tp = tcp_sk(sk);
				2245
				2246	if (tcp_is_reno(tp)) {
				2247	tcp_mark_head_lost(sk, 1, 1);
				2248	} else if (tcp_is_fack(tp)) {
				2249	int lost = tp->fackets_out - tp->reordering;
				2250	if (lost <= 0)
				2251	lost = 1;
				2252	tcp_mark_head_lost(sk, lost, 0);
				2253	} else {
				2254	int sacked_upto = tp->sacked_out - tp->reordering;
				2255	if (sacked_upto >= 0)
				2256	tcp_mark_head_lost(sk, sacked_upto, 0);
				2257	else if (fast_rexmit)
				2258	tcp_mark_head_lost(sk, 1, 1);
				2259	}
				2260	}
				2261
				2262	/* CWND moderation, preventing bursts due to too big ACKs
				2263	* in dubious situations.
				2264	*/
				2265	static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
				2266	{
				2267	tp->snd_cwnd = min(tp->snd_cwnd,
				2268	tcp_packets_in_flight(tp) + tcp_max_burst(tp));
				2269	tp->snd_cwnd_stamp = tcp_time_stamp;
				2270	}
				2271
				2272	static bool tcp_tsopt_ecr_before(const struct tcp_sock *tp, u32 when)
				2273	{
				2274	return tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
				2275	before(tp->rx_opt.rcv_tsecr, when);
				2276	}
				2277
				2278	/* skb is spurious retransmitted if the returned timestamp echo
				2279	* reply is prior to the skb transmission time
				2280	*/
				2281	static bool tcp_skb_spurious_retrans(const struct tcp_sock *tp,
				2282	const struct sk_buff *skb)
				2283	{
				2284	return (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) &&
				2285	tcp_tsopt_ecr_before(tp, tcp_skb_timestamp(skb));
				2286	}
				2287
				2288	/* Nothing was retransmitted or returned timestamp is less
				2289	* than timestamp of the first retransmission.
				2290	*/
				2291	static inline bool tcp_packet_delayed(const struct tcp_sock *tp)
				2292	{
				2293	return !tp->retrans_stamp \|\|
				2294	tcp_tsopt_ecr_before(tp, tp->retrans_stamp);
				2295	}
				2296
				2297	/* Undo procedures. */
				2298
				2299	/* We can clear retrans_stamp when there are no retransmissions in the
				2300	* window. It would seem that it is trivially available for us in
				2301	* tp->retrans_out, however, that kind of assumptions doesn't consider
				2302	* what will happen if errors occur when sending retransmission for the
				2303	* second time. ...It could the that such segment has only
				2304	* TCPCB_EVER_RETRANS set at the present time. It seems that checking
				2305	* the head skb is enough except for some reneging corner cases that
				2306	* are not worth the effort.
				2307	*
				2308	* Main reason for all this complexity is the fact that connection dying
				2309	* time now depends on the validity of the retrans_stamp, in particular,
				2310	* that successive retransmissions of a segment must not advance
				2311	* retrans_stamp under any conditions.
				2312	*/
				2313	static bool tcp_any_retrans_done(const struct sock *sk)
				2314	{
				2315	const struct tcp_sock *tp = tcp_sk(sk);
				2316	struct sk_buff *skb;
				2317
				2318	if (tp->retrans_out)
				2319	return true;
				2320
				2321	skb = tcp_write_queue_head(sk);
				2322	if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS))
				2323	return true;
				2324
				2325	return false;
				2326	}
				2327
				#if FASTRETRANS_DEBUG > 1
				/* Debug helper: log an "Undo <msg>" line with the peer address/port and
				 * the current cwnd, left_out, ssthresh/prior_ssthresh and packets_out.
				 * Compiled in only when FASTRETRANS_DEBUG > 1; otherwise DBGUNDO()
				 * expands to an empty statement.
				 *
				 * @sk:  socket whose state is dumped
				 * @msg: short tag naming the kind of undo
				 */
				static void DBGUNDO(struct sock *sk, const char *msg)
				{
					struct tcp_sock *tp = tcp_sk(sk);
					struct inet_sock *inet = inet_sk(sk);

					if (sk->sk_family == AF_INET) {
						pr_debug("Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n",
							 msg,
							 &inet->inet_daddr, ntohs(inet->inet_dport),
							 tp->snd_cwnd, tcp_left_out(tp),
							 tp->snd_ssthresh, tp->prior_ssthresh,
							 tp->packets_out);
					}
				#if IS_ENABLED(CONFIG_IPV6)
					else if (sk->sk_family == AF_INET6) {
						pr_debug("Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n",
							 msg,
							 &sk->sk_v6_daddr, ntohs(inet->inet_dport),
							 tp->snd_cwnd, tcp_left_out(tp),
							 tp->snd_ssthresh, tp->prior_ssthresh,
							 tp->packets_out);
					}
				#endif
				}
				#else
				#define DBGUNDO(x...) do { } while (0)
				#endif
				2356
				2357	static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss)
				2358	{
				2359	struct tcp_sock *tp = tcp_sk(sk);
				2360
				2361	if (unmark_loss) {
				2362	struct sk_buff *skb;
				2363
				2364	tcp_for_write_queue(skb, sk) {
				2365	if (skb == tcp_send_head(sk))
				2366	break;
				2367	TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
				2368	}
				2369	tp->lost_out = 0;
				2370	tcp_clear_all_retrans_hints(tp);
				2371	}
				2372
				2373	if (tp->prior_ssthresh) {
				2374	const struct inet_connection_sock *icsk = inet_csk(sk);
				2375
				2376	if (icsk->icsk_ca_ops->undo_cwnd)
				2377	tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk);
				2378	else
				2379	tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh << 1);
				2380
				2381	if (tp->prior_ssthresh > tp->snd_ssthresh) {
				2382	tp->snd_ssthresh = tp->prior_ssthresh;
				2383	tcp_ecn_withdraw_cwr(tp);
				2384	}
				2385	} else {
				2386	tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh);
				2387	}
				2388	tp->snd_cwnd_stamp = tcp_time_stamp;
				2389	tp->undo_marker = 0;
				2390	}
				2391
				2392	static inline bool tcp_may_undo(const struct tcp_sock *tp)
				2393	{
				2394	return tp->undo_marker && (!tp->undo_retrans \|\| tcp_packet_delayed(tp));
				2395	}
				2396
				2397	/* People celebrate: "We love our President!" */
				2398	static bool tcp_try_undo_recovery(struct sock *sk)
				2399	{
				2400	struct tcp_sock *tp = tcp_sk(sk);
				2401
				2402	if (tcp_may_undo(tp)) {
				2403	int mib_idx;
				2404
				2405	/* Happy end! We did not retransmit anything
				2406	* or our original transmission succeeded.
				2407	*/
				2408	DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans");
				2409	tcp_undo_cwnd_reduction(sk, false);
				2410	if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss)
				2411	mib_idx = LINUX_MIB_TCPLOSSUNDO;
				2412	else
				2413	mib_idx = LINUX_MIB_TCPFULLUNDO;
				2414
				2415	NET_INC_STATS_BH(sock_net(sk), mib_idx);
				2416	}
				2417	if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) {
				2418	/* Hold old state until something above high_seq
				2419	* is ACKed. For Reno it is MUST to prevent false
				2420	* fast retransmits (RFC2582). SACK TCP is safe. */
				2421	tcp_moderate_cwnd(tp);
				2422	if (!tcp_any_retrans_done(sk))
				2423	tp->retrans_stamp = 0;
				2424	return true;
				2425	}
				2426	tcp_set_ca_state(sk, TCP_CA_Open);
				2427	return false;
				2428	}
				2429
				2430	/* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */
				2431	static bool tcp_try_undo_dsack(struct sock *sk)
				2432	{
				2433	struct tcp_sock *tp = tcp_sk(sk);
				2434
				2435	if (tp->undo_marker && !tp->undo_retrans) {
				2436	DBGUNDO(sk, "D-SACK");
				2437	tcp_undo_cwnd_reduction(sk, false);
				2438	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKUNDO);
				2439	return true;
				2440	}
				2441	return false;
				2442	}
				2443
				2444	/* Undo during loss recovery after partial ACK or using F-RTO. */
				2445	static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
				2446	{
				2447	struct tcp_sock *tp = tcp_sk(sk);
				2448
				2449	if (frto_undo \|\| tcp_may_undo(tp)) {
				2450	tcp_undo_cwnd_reduction(sk, true);
				2451
				2452	DBGUNDO(sk, "partial loss");
				2453	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO);
				2454	if (frto_undo)
				2455	NET_INC_STATS_BH(sock_net(sk),
				2456	LINUX_MIB_TCPSPURIOUSRTOS);
				2457	inet_csk(sk)->icsk_retransmits = 0;
				2458	if (frto_undo \|\| tcp_is_sack(tp))
				2459	tcp_set_ca_state(sk, TCP_CA_Open);
				2460	return true;
				2461	}
				2462	return false;
				2463	}
				2464
				2465	/* The cwnd reduction in CWR and Recovery uses the PRR algorithm in RFC 6937.
				2466	* It computes the number of packets to send (sndcnt) based on packets newly
				2467	* delivered:
				2468	* 1) If the packets in flight is larger than ssthresh, PRR spreads the
				2469	* cwnd reductions across a full RTT.
				2470	* 2) Otherwise PRR uses packet conservation to send as much as delivered.
				2471	* But when the retransmits are acked without further losses, PRR
				2472	* slow starts cwnd up to ssthresh to speed up the recovery.
				2473	*/
				2474	static void tcp_init_cwnd_reduction(struct sock *sk)
				2475	{
				2476	struct tcp_sock *tp = tcp_sk(sk);
				2477
				2478	tp->high_seq = tp->snd_nxt;
				2479	tp->tlp_high_seq = 0;
				2480	tp->snd_cwnd_cnt = 0;
				2481	tp->prior_cwnd = tp->snd_cwnd;
				2482	tp->prr_delivered = 0;
				2483	tp->prr_out = 0;
				2484	tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
				2485	tcp_ecn_queue_cwr(tp);
				2486	}
				2487
				2488	static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked,
				2489	int fast_rexmit, int flag)
				2490	{
				2491	struct tcp_sock *tp = tcp_sk(sk);
				2492	int sndcnt = 0;
				2493	int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp);
				2494	int newly_acked_sacked = prior_unsacked -
				2495	(tp->packets_out - tp->sacked_out);
				2496
				2497	if (newly_acked_sacked <= 0 \|\| WARN_ON_ONCE(!tp->prior_cwnd))
				2498	return;
				2499
				2500	tp->prr_delivered += newly_acked_sacked;
				2501	if (delta < 0) {
				2502	u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
				2503	tp->prior_cwnd - 1;
				2504	sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
				2505	} else if ((flag & FLAG_RETRANS_DATA_ACKED) &&
				2506	!(flag & FLAG_LOST_RETRANS)) {
				2507	sndcnt = min_t(int, delta,
				2508	max_t(int, tp->prr_delivered - tp->prr_out,
				2509	newly_acked_sacked) + 1);
				2510	} else {
				2511	sndcnt = min(delta, newly_acked_sacked);
				2512	}
				2513	sndcnt = max(sndcnt, (fast_rexmit ? 1 : 0));
				2514	tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt;
				2515	}
				2516
				2517	static inline void tcp_end_cwnd_reduction(struct sock *sk)
				2518	{
				2519	struct tcp_sock *tp = tcp_sk(sk);
				2520
				2521	/* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */
				2522	if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH &&
				2523	(inet_csk(sk)->icsk_ca_state == TCP_CA_CWR \|\| tp->undo_marker)) {
				2524	tp->snd_cwnd = tp->snd_ssthresh;
				2525	tp->snd_cwnd_stamp = tcp_time_stamp;
				2526	}
				2527	tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
				2528	}
				2529
				2530	/* Enter CWR state. Disable cwnd undo since congestion is proven with ECN */
				2531	void tcp_enter_cwr(struct sock *sk)
				2532	{
				2533	struct tcp_sock *tp = tcp_sk(sk);
				2534
				2535	tp->prior_ssthresh = 0;
				2536	if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
				2537	tp->undo_marker = 0;
				2538	tcp_init_cwnd_reduction(sk);
				2539	tcp_set_ca_state(sk, TCP_CA_CWR);
				2540	}
				2541	}
				2542	EXPORT_SYMBOL(tcp_enter_cwr);
				2543
				2544	static void tcp_try_keep_open(struct sock *sk)
				2545	{
				2546	struct tcp_sock *tp = tcp_sk(sk);
				2547	int state = TCP_CA_Open;
				2548
				2549	if (tcp_left_out(tp) \|\| tcp_any_retrans_done(sk))
				2550	state = TCP_CA_Disorder;
				2551
				2552	if (inet_csk(sk)->icsk_ca_state != state) {
				2553	tcp_set_ca_state(sk, state);
				2554	tp->high_seq = tp->snd_nxt;
				2555	}
				2556	}
				2557
				2558	static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked)
				2559	{
				2560	struct tcp_sock *tp = tcp_sk(sk);
				2561
				2562	tcp_verify_left_out(tp);
				2563
				2564	if (!tcp_any_retrans_done(sk))
				2565	tp->retrans_stamp = 0;
				2566
				2567	if (flag & FLAG_ECE)
				2568	tcp_enter_cwr(sk);
				2569
				2570	if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
				2571	tcp_try_keep_open(sk);
				2572	} else {
				2573	tcp_cwnd_reduction(sk, prior_unsacked, 0, flag);
				2574	}
				2575	}
				2576
				2577	static void tcp_mtup_probe_failed(struct sock *sk)
				2578	{
				2579	struct inet_connection_sock *icsk = inet_csk(sk);
				2580
				2581	icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1;
				2582	icsk->icsk_mtup.probe_size = 0;
				2583	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMTUPFAIL);
				2584	}
				2585
				2586	static void tcp_mtup_probe_success(struct sock *sk)
				2587	{
				2588	struct tcp_sock *tp = tcp_sk(sk);
				2589	struct inet_connection_sock *icsk = inet_csk(sk);
				2590
				2591	/* FIXME: breaks with very large cwnd */
				2592	tp->prior_ssthresh = tcp_current_ssthresh(sk);
				2593	tp->snd_cwnd = tp->snd_cwnd *
				2594	tcp_mss_to_mtu(sk, tp->mss_cache) /
				2595	icsk->icsk_mtup.probe_size;
				2596	tp->snd_cwnd_cnt = 0;
				2597	tp->snd_cwnd_stamp = tcp_time_stamp;
				2598	tp->snd_ssthresh = tcp_current_ssthresh(sk);
				2599
				2600	icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
				2601	icsk->icsk_mtup.probe_size = 0;
				2602	tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
				2603	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMTUPSUCCESS);
				2604	}
				2605
				2606	/* Do a simple retransmit without using the backoff mechanisms in
				2607	* tcp_timer. This is used for path mtu discovery.
				2608	* The socket is already locked here.
				2609	*/
				2610	void tcp_simple_retransmit(struct sock *sk)
				2611	{
				2612	const struct inet_connection_sock *icsk = inet_csk(sk);
				2613	struct tcp_sock *tp = tcp_sk(sk);
				2614	struct sk_buff *skb;
				2615	unsigned int mss = tcp_current_mss(sk);
				2616	u32 prior_lost = tp->lost_out;
				2617
				2618	tcp_for_write_queue(skb, sk) {
				2619	if (skb == tcp_send_head(sk))
				2620	break;
				2621	if (tcp_skb_seglen(skb) > mss &&
				2622	!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
				2623	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
				2624	TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
				2625	tp->retrans_out -= tcp_skb_pcount(skb);
				2626	}
				2627	tcp_skb_mark_lost_uncond_verify(tp, skb);
				2628	}
				2629	}
				2630
				2631	tcp_clear_retrans_hints_partial(tp);
				2632
				2633	if (prior_lost == tp->lost_out)
				2634	return;
				2635
				2636	if (tcp_is_reno(tp))
				2637	tcp_limit_reno_sacked(tp);
				2638
				2639	tcp_verify_left_out(tp);
				2640
				2641	/* Don't muck with the congestion window here.
				2642	* Reason is that we do not increase amount of _data_
				2643	* in network, but units changed and effective
				2644	* cwnd/ssthresh really reduced now.
				2645	*/
				2646	if (icsk->icsk_ca_state != TCP_CA_Loss) {
				2647	tp->high_seq = tp->snd_nxt;
				2648	tp->snd_ssthresh = tcp_current_ssthresh(sk);
				2649	tp->prior_ssthresh = 0;
				2650	tp->undo_marker = 0;
				2651	tcp_set_ca_state(sk, TCP_CA_Loss);
				2652	}
				2653	tcp_xmit_retransmit_queue(sk);
				2654	}
				2655	EXPORT_SYMBOL(tcp_simple_retransmit);
				2656
				2657	static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
				2658	{
				2659	struct tcp_sock *tp = tcp_sk(sk);
				2660	int mib_idx;
				2661
				2662	if (tcp_is_reno(tp))
				2663	mib_idx = LINUX_MIB_TCPRENORECOVERY;
				2664	else
				2665	mib_idx = LINUX_MIB_TCPSACKRECOVERY;
				2666
				2667	NET_INC_STATS_BH(sock_net(sk), mib_idx);
				2668
				2669	tp->prior_ssthresh = 0;
				2670	tcp_init_undo(tp);
				2671
				2672	if (!tcp_in_cwnd_reduction(sk)) {
				2673	if (!ece_ack)
				2674	tp->prior_ssthresh = tcp_current_ssthresh(sk);
				2675	tcp_init_cwnd_reduction(sk);
				2676	}
				2677	tcp_set_ca_state(sk, TCP_CA_Recovery);
				2678	}
				2679
				2680	/* Process an ACK in CA_Loss state. Move to CA_Open if lost data are
				2681	* recovered or spurious. Otherwise retransmits more on partial ACKs.
				2682	*/
				2683	static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
				2684	{
				2685	struct tcp_sock *tp = tcp_sk(sk);
				2686	bool recovered = !before(tp->snd_una, tp->high_seq);
				2687
				2688	if ((flag & FLAG_SND_UNA_ADVANCED) &&
				2689	tcp_try_undo_loss(sk, false))
				2690	return;
				2691
				2692	if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */
				2693	/* Step 3.b. A timeout is spurious if not all data are
				2694	* lost, i.e., never-retransmitted data are (s)acked.
				2695	*/
				2696	if ((flag & FLAG_ORIG_SACK_ACKED) &&
				2697	tcp_try_undo_loss(sk, true))
				2698	return;
				2699
				2700	if (after(tp->snd_nxt, tp->high_seq)) {
				2701	if (flag & FLAG_DATA_SACKED \|\| is_dupack)
				2702	tp->frto = 0; /* Step 3.a. loss was real */
				2703	} else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) {
				2704	tp->high_seq = tp->snd_nxt;
				2705	__tcp_push_pending_frames(sk, tcp_current_mss(sk),
				2706	TCP_NAGLE_OFF);
				2707	if (after(tp->snd_nxt, tp->high_seq))
				2708	return; /* Step 2.b */
				2709	tp->frto = 0;
				2710	}
				2711	}
				2712
				2713	if (recovered) {
				2714	/* F-RTO RFC5682 sec 3.1 step 2.a and 1st part of step 3.a */
				2715	tcp_try_undo_recovery(sk);
				2716	return;
				2717	}
				2718	if (tcp_is_reno(tp)) {
				2719	/* A Reno DUPACK means new data in F-RTO step 2.b above are
				2720	* delivered. Lower inflight to clock out (re)tranmissions.
				2721	*/
				2722	if (after(tp->snd_nxt, tp->high_seq) && is_dupack)
				2723	tcp_add_reno_sack(sk);
				2724	else if (flag & FLAG_SND_UNA_ADVANCED)
				2725	tcp_reset_reno_sack(tp);
				2726	}
				2727	tcp_xmit_retransmit_queue(sk);
				2728	}
				2729
				2730	/* Undo during fast recovery after partial ACK. */
				2731	static bool tcp_try_undo_partial(struct sock *sk, const int acked,
				2732	const int prior_unsacked, int flag)
				2733	{
				2734	struct tcp_sock *tp = tcp_sk(sk);
				2735
				2736	if (tp->undo_marker && tcp_packet_delayed(tp)) {
				2737	/* Plain luck! Hole if filled with delayed
				2738	* packet, rather than with a retransmit.
				2739	*/
				2740	tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1);
				2741
				2742	/* We are getting evidence that the reordering degree is higher
				2743	* than we realized. If there are no retransmits out then we
				2744	* can undo. Otherwise we clock out new packets but do not
				2745	* mark more packets lost or retransmit more.
				2746	*/
				2747	if (tp->retrans_out) {
				2748	tcp_cwnd_reduction(sk, prior_unsacked, 0, flag);
				2749	return true;
				2750	}
				2751
				2752	if (!tcp_any_retrans_done(sk))
				2753	tp->retrans_stamp = 0;
				2754
				2755	DBGUNDO(sk, "partial recovery");
				2756	tcp_undo_cwnd_reduction(sk, true);
				2757	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO);
				2758	tcp_try_keep_open(sk);
				2759	return true;
				2760	}
				2761	return false;
				2762	}
				2763
				2764	/* Process an event, which can update packets-in-flight not trivially.
				2765	* Main goal of this function is to calculate new estimate for left_out,
				2766	* taking into account both packets sitting in receiver's buffer and
				2767	* packets lost by network.
				2768	*
				2769	* Besides that it does CWND reduction, when packet loss is detected
				2770	* and changes state of machine.
				2771	*
				2772	* It does _not_ decide what to send, it is made in function
				2773	* tcp_xmit_retransmit_queue().
				2774	*/
				2775	static void tcp_fastretrans_alert(struct sock *sk, const int acked,
				2776	const int prior_unsacked,
				2777	bool is_dupack, int flag)
				2778	{
				2779	struct inet_connection_sock *icsk = inet_csk(sk);
				2780	struct tcp_sock *tp = tcp_sk(sk);
				2781	bool do_lost = is_dupack \|\| ((flag & FLAG_DATA_SACKED) &&
				2782	(tcp_fackets_out(tp) > tp->reordering));
				2783	int fast_rexmit = 0;
				2784
				2785	if (WARN_ON(!tp->packets_out && tp->sacked_out))
				2786	tp->sacked_out = 0;
				2787	if (WARN_ON(!tp->sacked_out && tp->fackets_out))
				2788	tp->fackets_out = 0;
				2789
				2790	/* Now state machine starts.
				2791	* A. ECE, hence prohibit cwnd undoing, the reduction is required. */
				2792	if (flag & FLAG_ECE)
				2793	tp->prior_ssthresh = 0;
				2794
				2795	/* B. In all the states check for reneging SACKs. */
				2796	if (tcp_check_sack_reneging(sk, flag))
				2797	return;
				2798
				2799	/* C. Check consistency of the current state. */
				2800	tcp_verify_left_out(tp);
				2801
				2802	/* D. Check state exit conditions. State can be terminated
				2803	* when high_seq is ACKed. */
				2804	if (icsk->icsk_ca_state == TCP_CA_Open) {
				2805	WARN_ON(tp->retrans_out != 0);
				2806	tp->retrans_stamp = 0;
				2807	} else if (!before(tp->snd_una, tp->high_seq)) {
				2808	switch (icsk->icsk_ca_state) {
				2809	case TCP_CA_CWR:
				2810	/* CWR is to be held something above high_seq
				2811	* is ACKed for CWR bit to reach receiver. */
				2812	if (tp->snd_una != tp->high_seq) {
				2813	tcp_end_cwnd_reduction(sk);
				2814	tcp_set_ca_state(sk, TCP_CA_Open);
				2815	}
				2816	break;
				2817
				2818	case TCP_CA_Recovery:
				2819	if (tcp_is_reno(tp))
				2820	tcp_reset_reno_sack(tp);
				2821	if (tcp_try_undo_recovery(sk))
				2822	return;
				2823	tcp_end_cwnd_reduction(sk);
				2824	break;
				2825	}
				2826	}
				2827
				2828	/* Use RACK to detect loss */
				2829	if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS &&
				2830	tcp_rack_mark_lost(sk))
				2831	flag \|= FLAG_LOST_RETRANS;
				2832
				2833	/* E. Process state. */
				2834	switch (icsk->icsk_ca_state) {
				2835	case TCP_CA_Recovery:
				2836	if (!(flag & FLAG_SND_UNA_ADVANCED)) {
				2837	if (tcp_is_reno(tp) && is_dupack)
				2838	tcp_add_reno_sack(sk);
				2839	} else {
				2840	if (tcp_try_undo_partial(sk, acked, prior_unsacked, flag))
				2841	return;
				2842	/* Partial ACK arrived. Force fast retransmit. */
				2843	do_lost = tcp_is_reno(tp) \|\|
				2844	tcp_fackets_out(tp) > tp->reordering;
				2845	}
				2846	if (tcp_try_undo_dsack(sk)) {
				2847	tcp_try_keep_open(sk);
				2848	return;
				2849	}
				2850	break;
				2851	case TCP_CA_Loss:
				2852	tcp_process_loss(sk, flag, is_dupack);
				2853	if (icsk->icsk_ca_state != TCP_CA_Open &&
				2854	!(flag & FLAG_LOST_RETRANS))
				2855	return;
				2856	/* Change state if cwnd is undone or retransmits are lost */
				2857	default:
				2858	if (tcp_is_reno(tp)) {
				2859	if (flag & FLAG_SND_UNA_ADVANCED)
				2860	tcp_reset_reno_sack(tp);
				2861	if (is_dupack)
				2862	tcp_add_reno_sack(sk);
				2863	}
				2864
				2865	if (icsk->icsk_ca_state <= TCP_CA_Disorder)
				2866	tcp_try_undo_dsack(sk);
				2867
				2868	if (!tcp_time_to_recover(sk, flag)) {
				2869	tcp_try_to_open(sk, flag, prior_unsacked);
				2870	return;
				2871	}
				2872
				2873	/* MTU probe failure: don't reduce cwnd */
				2874	if (icsk->icsk_ca_state < TCP_CA_CWR &&
				2875	icsk->icsk_mtup.probe_size &&
				2876	tp->snd_una == tp->mtu_probe.probe_seq_start) {
				2877	tcp_mtup_probe_failed(sk);
				2878	/* Restores the reduction we did in tcp_mtup_probe() */
				2879	tp->snd_cwnd++;
				2880	tcp_simple_retransmit(sk);
				2881	return;
				2882	}
				2883
				2884	/* Otherwise enter Recovery state */
				2885	tcp_enter_recovery(sk, (flag & FLAG_ECE));
				2886	fast_rexmit = 1;
				2887	}
				2888
				2889	if (do_lost)
				2890	tcp_update_scoreboard(sk, fast_rexmit);
				2891	tcp_cwnd_reduction(sk, prior_unsacked, fast_rexmit, flag);
				2892	tcp_xmit_retransmit_queue(sk);
				2893	}
				2894
				2895	/* Kathleen Nichols' algorithm for tracking the minimum value of
				2896	* a data stream over some fixed time interval. (E.g., the minimum
				2897	* RTT over the past five minutes.) It uses constant space and constant
				2898	* time per update yet almost always delivers the same minimum as an
				2899	* implementation that has to keep all the data in the window.
				2900	*
				2901	* The algorithm keeps track of the best, 2nd best & 3rd best min
				2902	* values, maintaining an invariant that the measurement time of the
				2903	* n'th best >= n-1'th best. It also makes sure that the three values
				2904	* are widely separated in the time window since that bounds the worse
				2905	* case error when that data is monotonically increasing over the window.
				2906	*
				2907	* Upon getting a new min, we can forget everything earlier because it
				2908	* has no value - the new min is <= everything else in the window by
				2909	* definition and it's the most recent. So we restart fresh on every new min
				2910	* and overwrites 2nd & 3rd choices. The same property holds for 2nd & 3rd
				2911	* best.
				2912	*/
				2913	static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us)
				2914	{
				2915	const u32 now = tcp_time_stamp, wlen = sysctl_tcp_min_rtt_wlen * HZ;
				2916	struct rtt_meas *m = tcp_sk(sk)->rtt_min;
				2917	struct rtt_meas rttm = { .rtt = (rtt_us ? : 1), .ts = now };
				2918	u32 elapsed;
				2919
				2920	/* Check if the new measurement updates the 1st, 2nd, or 3rd choices */
				2921	if (unlikely(rttm.rtt <= m[0].rtt))
				2922	m[0] = m[1] = m[2] = rttm;
				2923	else if (rttm.rtt <= m[1].rtt)
				2924	m[1] = m[2] = rttm;
				2925	else if (rttm.rtt <= m[2].rtt)
				2926	m[2] = rttm;
				2927
				2928	elapsed = now - m[0].ts;
				2929	if (unlikely(elapsed > wlen)) {
				2930	/* Passed entire window without a new min so make 2nd choice
				2931	* the new min & 3rd choice the new 2nd. So forth and so on.
				2932	*/
				2933	m[0] = m[1];
				2934	m[1] = m[2];
				2935	m[2] = rttm;
				2936	if (now - m[0].ts > wlen) {
				2937	m[0] = m[1];
				2938	m[1] = rttm;
				2939	if (now - m[0].ts > wlen)
				2940	m[0] = rttm;
				2941	}
				2942	} else if (m[1].ts == m[0].ts && elapsed > wlen / 4) {
				2943	/* Passed a quarter of the window without a new min so
				2944	* take 2nd choice from the 2nd quarter of the window.
				2945	*/
				2946	m[2] = m[1] = rttm;
				2947	} else if (m[2].ts == m[1].ts && elapsed > wlen / 2) {
				2948	/* Passed half the window without a new min so take the 3rd
				2949	* choice from the last half of the window.
				2950	*/
				2951	m[2] = rttm;
				2952	}
				2953	}
				2954
				2955	static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
				2956	long seq_rtt_us, long sack_rtt_us,
				2957	long ca_rtt_us)
				2958	{
				2959	const struct tcp_sock *tp = tcp_sk(sk);
				2960
				2961	/* Prefer RTT measured from ACK's timing to TS-ECR. This is because
				2962	* broken middle-boxes or peers may corrupt TS-ECR fields. But
				2963	* Karn's algorithm forbids taking RTT if some retransmitted data
				2964	* is acked (RFC6298).
				2965	*/
				2966	if (seq_rtt_us < 0)
				2967	seq_rtt_us = sack_rtt_us;
				2968
				2969	/* RTTM Rule: A TSecr value received in a segment is used to
				2970	* update the averaged RTT measurement only if the segment
				2971	* acknowledges some new data, i.e., only if it advances the
				2972	* left edge of the send window.
				2973	* See draft-ietf-tcplw-high-performance-00, section 3.3.
				2974	*/
				2975	if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
				2976	flag & FLAG_ACKED)
				2977	seq_rtt_us = ca_rtt_us = jiffies_to_usecs(tcp_time_stamp -
				2978	tp->rx_opt.rcv_tsecr);
				2979	if (seq_rtt_us < 0)
				2980	return false;
				2981
				2982	/* ca_rtt_us >= 0 is counting on the invariant that ca_rtt_us is
				2983	* always taken together with ACK, SACK, or TS-opts. Any negative
				2984	* values will be skipped with the seq_rtt_us < 0 check above.
				2985	*/
				2986	tcp_update_rtt_min(sk, ca_rtt_us);
				2987	tcp_rtt_estimator(sk, seq_rtt_us);
				2988	tcp_set_rto(sk);
				2989
				2990	/* RFC6298: only reset backoff on valid RTT measurement. */
				2991	inet_csk(sk)->icsk_backoff = 0;
				2992	return true;
				2993	}
				2994
				2995	/* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */
				2996	void tcp_synack_rtt_meas(struct sock sk, struct request_sock req)
				2997	{
				2998	long rtt_us = -1L;
				2999
				3000	if (req && !req->num_retrans && tcp_rsk(req)->snt_synack.v64) {
				3001	struct skb_mstamp now;
				3002
				3003	skb_mstamp_get(&now);
				3004	rtt_us = skb_mstamp_us_delta(&now, &tcp_rsk(req)->snt_synack);
				3005	}
				3006
				3007	tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, rtt_us, -1L, rtt_us);
				3008	}
				3009
				3010
				3011	static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
				3012	{
				3013	const struct inet_connection_sock *icsk = inet_csk(sk);
				3014
				3015	icsk->icsk_ca_ops->cong_avoid(sk, ack, acked);
				3016	tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp;
				3017	}
				3018
				3019	/* Restart timer after forward progress on connection.
				3020	* RFC2988 recommends to restart timer to now+rto.
				3021	*/
				3022	void tcp_rearm_rto(struct sock *sk)
				3023	{
				3024	const struct inet_connection_sock *icsk = inet_csk(sk);
				3025	struct tcp_sock *tp = tcp_sk(sk);
				3026
				3027	/* If the retrans timer is currently being used by Fast Open
				3028	* for SYN-ACK retrans purpose, stay put.
				3029	*/
				3030	if (tp->fastopen_rsk)
				3031	return;
				3032
				3033	if (!tp->packets_out) {
				3034	inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
				3035	} else {
				3036	u32 rto = inet_csk(sk)->icsk_rto;
				3037	/* Offset the time elapsed after installing regular RTO */
				3038	if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS \|\|
				3039	icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
				3040	struct sk_buff *skb = tcp_write_queue_head(sk);
				3041	const u32 rto_time_stamp =
				3042	tcp_skb_timestamp(skb) + rto;
				3043	s32 delta = (s32)(rto_time_stamp - tcp_time_stamp);
				3044	/* delta may not be positive if the socket is locked
				3045	* when the retrans timer fires and is rescheduled.
				3046	*/
				3047	rto = max(delta, 1);
				3048	}
				3049	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
				3050	TCP_RTO_MAX);
				3051	}
				3052	}
				3053
				3054	/* This function is called when the delayed ER timer fires. TCP enters
				3055	* fast recovery and performs fast-retransmit.
				3056	*/
				3057	void tcp_resume_early_retransmit(struct sock *sk)
				3058	{
				3059	struct tcp_sock *tp = tcp_sk(sk);
				3060
				3061	tcp_rearm_rto(sk);
				3062
				3063	/* Stop if ER is disabled after the delayed ER timer is scheduled */
				3064	if (!tp->do_early_retrans)
				3065	return;
				3066
				3067	tcp_enter_recovery(sk, false);
				3068	tcp_update_scoreboard(sk, 1);
				3069	tcp_xmit_retransmit_queue(sk);
				3070	}
				3071
				3072	/* If we get here, the whole TSO packet has not been acked. */
				3073	static u32 tcp_tso_acked(struct sock sk, struct sk_buff skb)
				3074	{
				3075	struct tcp_sock *tp = tcp_sk(sk);
				3076	u32 packets_acked;
				3077
				3078	BUG_ON(!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una));
				3079
				3080	packets_acked = tcp_skb_pcount(skb);
				3081	if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
				3082	return 0;
				3083	packets_acked -= tcp_skb_pcount(skb);
				3084
				3085	if (packets_acked) {
				3086	BUG_ON(tcp_skb_pcount(skb) == 0);
				3087	BUG_ON(!before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq));
				3088	}
				3089
				3090	return packets_acked;
				3091	}
				3092
				3093	static void tcp_ack_tstamp(struct sock sk, struct sk_buff skb,
				3094	u32 prior_snd_una)
				3095	{
				3096	const struct skb_shared_info *shinfo;
				3097
				3098	/* Avoid cache line misses to get skb_shinfo() and shinfo->tx_flags */
				3099	if (likely(!(sk->sk_tsflags & SOF_TIMESTAMPING_TX_ACK)))
				3100	return;
				3101
				3102	shinfo = skb_shinfo(skb);
				3103	if ((shinfo->tx_flags & SKBTX_ACK_TSTAMP) &&
				3104	between(shinfo->tskey, prior_snd_una, tcp_sk(sk)->snd_una - 1))
				3105	__skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK);
				3106	}
				3107
				3108	/* Remove acknowledged frames from the retransmission queue. If our packet
				3109	* is before the ack sequence we can discard it as it's confirmed to have
				3110	* arrived at the other end.
				3111	*/
				3112	static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
				3113	u32 prior_snd_una,
				3114	struct tcp_sacktag_state *sack)
				3115	{
				3116	const struct inet_connection_sock *icsk = inet_csk(sk);
				3117	struct skb_mstamp first_ackt, last_ackt, now;
				3118	struct tcp_sock *tp = tcp_sk(sk);
				3119	u32 prior_sacked = tp->sacked_out;
				3120	u32 reord = tp->packets_out;
				3121	bool fully_acked = true;
				3122	long sack_rtt_us = -1L;
				3123	long seq_rtt_us = -1L;
				3124	long ca_rtt_us = -1L;
				3125	struct sk_buff *skb;
				3126	u32 pkts_acked = 0;
				3127	bool rtt_update;
				3128	int flag = 0;
				3129
				3130	first_ackt.v64 = 0;
				3131
				3132	while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) {
				3133	struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
				3134	u8 sacked = scb->sacked;
				3135	u32 acked_pcount;
				3136
				3137	tcp_ack_tstamp(sk, skb, prior_snd_una);
				3138
				3139	/* Determine how many packets and what bytes were acked, tso and else */
				3140	if (after(scb->end_seq, tp->snd_una)) {
				3141	if (tcp_skb_pcount(skb) == 1 \|\|
				3142	!after(tp->snd_una, scb->seq))
				3143	break;
				3144
				3145	acked_pcount = tcp_tso_acked(sk, skb);
				3146	if (!acked_pcount)
				3147	break;
				3148
				3149	fully_acked = false;
				3150	} else {
				3151	/* Speedup tcp_unlink_write_queue() and next loop */
				3152	prefetchw(skb->next);
				3153	acked_pcount = tcp_skb_pcount(skb);
				3154	}
				3155
				3156	if (unlikely(sacked & TCPCB_RETRANS)) {
				3157	if (sacked & TCPCB_SACKED_RETRANS)
				3158	tp->retrans_out -= acked_pcount;
				3159	flag \|= FLAG_RETRANS_DATA_ACKED;
				3160	} else if (!(sacked & TCPCB_SACKED_ACKED)) {
				3161	last_ackt = skb->skb_mstamp;
				3162	WARN_ON_ONCE(last_ackt.v64 == 0);
				3163	if (!first_ackt.v64)
				3164	first_ackt = last_ackt;
				3165
				3166	reord = min(pkts_acked, reord);
				3167	if (!after(scb->end_seq, tp->high_seq))
				3168	flag \|= FLAG_ORIG_SACK_ACKED;
				3169	}
				3170
				3171	if (sacked & TCPCB_SACKED_ACKED)
				3172	tp->sacked_out -= acked_pcount;
				3173	else if (tcp_is_sack(tp) && !tcp_skb_spurious_retrans(tp, skb))
				3174	tcp_rack_advance(tp, &skb->skb_mstamp, sacked);
				3175	if (sacked & TCPCB_LOST)
				3176	tp->lost_out -= acked_pcount;
				3177
				3178	tp->packets_out -= acked_pcount;
				3179	pkts_acked += acked_pcount;
				3180
				3181	/* Initial outgoing SYN's get put onto the write_queue
				3182	* just like anything else we transmit. It is not
				3183	* true data, and if we misinform our callers that
				3184	* this ACK acks real data, we will erroneously exit
				3185	* connection startup slow start one packet too
				3186	* quickly. This is severely frowned upon behavior.
				3187	*/
				3188	if (likely(!(scb->tcp_flags & TCPHDR_SYN))) {
				3189	flag \|= FLAG_DATA_ACKED;
				3190	} else {
				3191	flag \|= FLAG_SYN_ACKED;
				3192	tp->retrans_stamp = 0;
				3193	}
				3194
				3195	if (!fully_acked)
				3196	break;
				3197
				3198	tcp_unlink_write_queue(skb, sk);
				3199	sk_wmem_free_skb(sk, skb);
				3200	if (unlikely(skb == tp->retransmit_skb_hint))
				3201	tp->retransmit_skb_hint = NULL;
				3202	if (unlikely(skb == tp->lost_skb_hint))
				3203	tp->lost_skb_hint = NULL;
				3204	}
				3205
				3206	if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una)))
				3207	tp->snd_up = tp->snd_una;
				3208
				3209	if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
				3210	flag \|= FLAG_SACK_RENEGING;
				3211
				3212	skb_mstamp_get(&now);
				3213	if (likely(first_ackt.v64) && !(flag & FLAG_RETRANS_DATA_ACKED)) {
				3214	seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt);
				3215	ca_rtt_us = skb_mstamp_us_delta(&now, &last_ackt);
				3216	}
				3217	if (sack->first_sackt.v64) {
				3218	sack_rtt_us = skb_mstamp_us_delta(&now, &sack->first_sackt);
				3219	ca_rtt_us = skb_mstamp_us_delta(&now, &sack->last_sackt);
				3220	}
				3221
				3222	rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us,
				3223	ca_rtt_us);
				3224
				3225	if (flag & FLAG_ACKED) {
				3226	tcp_rearm_rto(sk);
				3227	if (unlikely(icsk->icsk_mtup.probe_size &&
				3228	!after(tp->mtu_probe.probe_seq_end, tp->snd_una))) {
				3229	tcp_mtup_probe_success(sk);
				3230	}
				3231
				3232	if (tcp_is_reno(tp)) {
				3233	tcp_remove_reno_sacks(sk, pkts_acked);
				3234	} else {
				3235	int delta;
				3236
				3237	/* Non-retransmitted hole got filled? That's reordering */
				3238	if (reord < prior_fackets && reord <= tp->fackets_out)
				3239	tcp_update_reordering(sk, tp->fackets_out - reord, 0);
				3240
				3241	delta = tcp_is_fack(tp) ? pkts_acked :
				3242	prior_sacked - tp->sacked_out;
				3243	tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta);
				3244	}
				3245
				3246	tp->fackets_out -= min(pkts_acked, tp->fackets_out);
				3247
				3248	} else if (skb && rtt_update && sack_rtt_us >= 0 &&
				3249	sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) {
				3250	/* Do not re-arm RTO if the sack RTT is measured from data sent
				3251	* after when the head was last (re)transmitted. Otherwise the
				3252	* timeout may continue to extend in loss recovery.
				3253	*/
				3254	tcp_rearm_rto(sk);
				3255	}
				3256
				3257	if (icsk->icsk_ca_ops->pkts_acked)
				3258	icsk->icsk_ca_ops->pkts_acked(sk, pkts_acked, ca_rtt_us);
				3259
				3260	#if FASTRETRANS_DEBUG > 0
				3261	WARN_ON((int)tp->sacked_out < 0);
				3262	WARN_ON((int)tp->lost_out < 0);
				3263	WARN_ON((int)tp->retrans_out < 0);
				3264	if (!tp->packets_out && tcp_is_sack(tp)) {
				3265	icsk = inet_csk(sk);
				3266	if (tp->lost_out) {
				3267	pr_debug("Leak l=%u %d\n",
				3268	tp->lost_out, icsk->icsk_ca_state);
				3269	tp->lost_out = 0;
				3270	}
				3271	if (tp->sacked_out) {
				3272	pr_debug("Leak s=%u %d\n",
				3273	tp->sacked_out, icsk->icsk_ca_state);
				3274	tp->sacked_out = 0;
				3275	}
				3276	if (tp->retrans_out) {
				3277	pr_debug("Leak r=%u %d\n",
				3278	tp->retrans_out, icsk->icsk_ca_state);
				3279	tp->retrans_out = 0;
				3280	}
				3281	}
				3282	#endif
				3283	return flag;
				3284	}
				3285
				3286	static void tcp_ack_probe(struct sock *sk)
				3287	{
				3288	const struct tcp_sock *tp = tcp_sk(sk);
				3289	struct inet_connection_sock *icsk = inet_csk(sk);
				3290
				3291	/* Was it a usable window open? */
				3292
				3293	if (!after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, tcp_wnd_end(tp))) {
				3294	icsk->icsk_backoff = 0;
				3295	inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);
				3296	/* Socket must be waked up by subsequent tcp_data_snd_check().
				3297	* This function is not for random using!
				3298	*/
				3299	} else {
				3300	unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX);
				3301
				3302	inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
				3303	when, TCP_RTO_MAX);
				3304	}
				3305	}
				3306
				3307	static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag)
				3308	{
				3309	return !(flag & FLAG_NOT_DUP) \|\| (flag & FLAG_CA_ALERT) \|\|
				3310	inet_csk(sk)->icsk_ca_state != TCP_CA_Open;
				3311	}
				3312
				3313	/* Decide wheather to run the increase function of congestion control. */
				3314	static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
				3315	{
				3316	if (tcp_in_cwnd_reduction(sk))
				3317	return false;
				3318
				3319	/* If reordering is high then always grow cwnd whenever data is
				3320	* delivered regardless of its ordering. Otherwise stay conservative
				3321	* and only grow cwnd on in-order delivery (RFC5681). A stretched ACK w/
				3322	* new SACK or ECE mark may first advance cwnd here and later reduce
				3323	* cwnd in tcp_fastretrans_alert() based on more states.
				3324	*/
				3325	if (tcp_sk(sk)->reordering > sysctl_tcp_reordering)
				3326	return flag & FLAG_FORWARD_PROGRESS;
				3327
				3328	return flag & FLAG_DATA_ACKED;
				3329	}
				3330
				3331	/* Check that window update is acceptable.
				3332	* The function assumes that snd_una<=ack<=snd_next.
				3333	*/
				3334	static inline bool tcp_may_update_window(const struct tcp_sock *tp,
				3335	const u32 ack, const u32 ack_seq,
				3336	const u32 nwin)
				3337	{
				3338	return after(ack, tp->snd_una) \|\|
				3339	after(ack_seq, tp->snd_wl1) \|\|
				3340	(ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd);
				3341	}
				3342
				3343	/* If we update tp->snd_una, also update tp->bytes_acked */
				3344	static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack)
				3345	{
				3346	u32 delta = ack - tp->snd_una;
				3347
				3348	u64_stats_update_begin(&tp->syncp);
				3349	tp->bytes_acked += delta;
				3350	u64_stats_update_end(&tp->syncp);
				3351	tp->snd_una = ack;
				3352	}
				3353
				3354	/* If we update tp->rcv_nxt, also update tp->bytes_received */
				3355	static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq)
				3356	{
				3357	u32 delta = seq - tp->rcv_nxt;
				3358
				3359	u64_stats_update_begin(&tp->syncp);
				3360	tp->bytes_received += delta;
				3361	u64_stats_update_end(&tp->syncp);
				3362	tp->rcv_nxt = seq;
				3363	}
				3364
				3365	/* Update our send window.
				3366	*
				3367	* Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2
				3368	* and in FreeBSD. NetBSD's one is even worse.) is wrong.
					*
					* The advertised window is read from the TCP header and, unless the
					* segment carries SYN, shifted left by tp->rx_opt.snd_wscale. When
					* tcp_may_update_window() accepts it, tp->snd_wnd and tp->max_window
					* are refreshed as needed. tp->snd_una is set to ack in every case.
					*
					* Returns FLAG_WIN_UPDATE if the update was accepted, otherwise 0.
				3369	*/
				3370	static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 ack,
				3371	u32 ack_seq)
				3372	{
				3373	struct tcp_sock *tp = tcp_sk(sk);
				3374	int flag = 0;
				3375	u32 nwin = ntohs(tcp_hdr(skb)->window);
				3376
				3377	if (likely(!tcp_hdr(skb)->syn))
				3378	nwin <<= tp->rx_opt.snd_wscale;
				3379
				3380	if (tcp_may_update_window(tp, ack, ack_seq, nwin)) {
				3381	flag |= FLAG_WIN_UPDATE;
				3382	tcp_update_wl(tp, ack_seq);
				3383
				3384	if (tp->snd_wnd != nwin) {
				3385	tp->snd_wnd = nwin;
				3386
				3387	/* Note, it is the only place, where
				3388	* fast path is recovered for sending TCP.
				3389	*/
				3390	tp->pred_flags = 0;
				3391	tcp_fast_path_check(sk);
				3392
				3393	if (tcp_send_head(sk))
				3394	tcp_slow_start_after_idle_check(sk);
				3395
				3396	if (nwin > tp->max_window) {
				3397	tp->max_window = nwin;
				3398	tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie);
				3399	}
				3400	}
				3401	}
				3402
				3403	tcp_snd_una_update(tp, ack);
				3404
				3405	return flag;
				3406	}
				3407
					/* Rate limiter shared by the out-of-window reply paths.
					*
					* If *last_oow_ack_time is non-zero and the time elapsed since it is in
					* [0, sysctl_tcp_invalid_ratelimit), increment the mib_idx counter and
					* return true. Otherwise store tcp_time_stamp in *last_oow_ack_time
					* and return false.
					*/
				3408	static bool __tcp_oow_rate_limited(struct net *net, int mib_idx,
				3409	u32 *last_oow_ack_time)
				3410	{
				3411	if (*last_oow_ack_time) {
				3412	s32 elapsed = (s32)(tcp_time_stamp - *last_oow_ack_time);
				3413
				3414	if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) {
				3415	NET_INC_STATS_BH(net, mib_idx);
				3416	return true; /* rate-limited: don't send yet! */
				3417	}
				3418	}
				3419
				3420	*last_oow_ack_time = tcp_time_stamp;
				3421
				3422	return false; /* not rate-limited: go ahead, send dupack now! */
				3423	}
				3424
				3425	/* Return true if we're currently rate-limiting out-of-window ACKs and
				3426	* thus shouldn't send a dupack right now. We rate-limit dupacks in
				3427	* response to out-of-window SYNs or ACKs to mitigate ACK loops or DoS
				3428	* attacks that send repeated SYNs or ACKs for the same connection. To
				3429	* do this, we do not send a duplicate SYNACK or ACK if the remote
				3430	* endpoint is sending out-of-window SYNs or pure ACKs at a high rate.
					*
					* Segments that carry data (seq != end_seq) without SYN are never
					* rate-limited here; all others are passed to __tcp_oow_rate_limited().
				3431	*/
				3432	bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb,
				3433	int mib_idx, u32 *last_oow_ack_time)
				3434	{
				3435	/* Data packets without SYNs are not likely part of an ACK loop. */
				3436	if ((TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) &&
				3437	!tcp_hdr(skb)->syn)
				3438	return false;
				3439
				3440	return __tcp_oow_rate_limited(net, mib_idx, last_oow_ack_time);
				3441	}
				3442
				3443	/* RFC 5961 7 [ACK Throttling]
					*
					* Sends a challenge ACK subject to two limits: the per-socket limit
					* kept in tp->last_oow_ack_time, then a host-wide budget. The budget
					* is re-seeded whenever jiffies / HZ changes, to
					* (limit + 1) / 2 + prandom_u32_max(limit) with
					* limit = sysctl_tcp_challenge_ack_limit, and is decremented for each
					* ACK sent. skb is not used by the body.
					*/
				3444	static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb)
				3445	{
				3446	/* unprotected vars, we don't care about overwrites */
				3447	static u32 challenge_timestamp;
				3448	static unsigned int challenge_count;
				3449	struct tcp_sock *tp = tcp_sk(sk);
				3450	u32 count, now;
				3451
				3452	/* First check our per-socket dupack rate limit. */
				3453	if (__tcp_oow_rate_limited(sock_net(sk),
				3454	LINUX_MIB_TCPACKSKIPPEDCHALLENGE,
				3455	&tp->last_oow_ack_time))
				3456	return;
				3457
				3458	/* Then check host-wide RFC 5961 rate limit. */
				3459	now = jiffies / HZ;
				3460	if (now != challenge_timestamp) {
				3461	u32 half = (sysctl_tcp_challenge_ack_limit + 1) >> 1;
				3462
				3463	challenge_timestamp = now;
				3464	WRITE_ONCE(challenge_count, half +
				3465	prandom_u32_max(sysctl_tcp_challenge_ack_limit));
				3466	}
				3467	count = READ_ONCE(challenge_count);
				3468	if (count > 0) {
				3469	WRITE_ONCE(challenge_count, count - 1);
				3470	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK);
				3471	tcp_send_ack(sk);
				3472	}
				3473	}
				3474
					/* Copy the last received timestamp value (rx_opt.rcv_tsval) into
					* rx_opt.ts_recent and record get_seconds() in rx_opt.ts_recent_stamp.
					*/
				3475	static void tcp_store_ts_recent(struct tcp_sock *tp)
				3476	{
				3477	tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval;
				3478	tp->rx_opt.ts_recent_stamp = get_seconds();
				3479	}
				3480
					/* Refresh ts_recent from the current segment, but only when a timestamp
					* option was seen, seq is not after tp->rcv_wup, and
					* tcp_paws_check(&tp->rx_opt, 0) succeeds.
					*/
				3481	static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
				3482	{
				3483	if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) {
				3484	/* PAWS bug workaround wrt. ACK frames, the PAWS discard
				3485	* extra check below makes sure this can only happen
				3486	* for pure ACK frames. -DaveM
				3487	*
				3488	* Not only, also it occurs for expired timestamps.
				3489	*/
				3490
				3491	if (tcp_paws_check(&tp->rx_opt, 0))
				3492	tcp_store_ts_recent(tp);
				3493	}
				3494	}
				3495
				3496	/* This routine deals with acks during a TLP episode.
				3497	* We mark the end of a TLP episode on receiving TLP dupack or when
				3498	* ack is after tlp_high_seq.
				3499	* Ref: loss detection algorithm in draft-dukkipati-tcpm-tcp-loss-probe.
					*
					* ACKs before tp->tlp_high_seq are ignored. The callers visible in
					* tcp_ack() only invoke this when tp->tlp_high_seq is non-zero.
				3500	*/
				3501	static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
				3502	{
				3503	struct tcp_sock *tp = tcp_sk(sk);
				3504
				3505	if (before(ack, tp->tlp_high_seq))
				3506	return;
				3507
				3508	if (flag & FLAG_DSACKING_ACK) {
				3509	/* This DSACK means original and TLP probe arrived; no loss */
				3510	tp->tlp_high_seq = 0;
				3511	} else if (after(ack, tp->tlp_high_seq)) {
				3512	/* ACK advances: there was a loss, so reduce cwnd. Reset
				3513	* tlp_high_seq in tcp_init_cwnd_reduction()
				3514	*/
				3515	tcp_init_cwnd_reduction(sk);
				3516	tcp_set_ca_state(sk, TCP_CA_CWR);
				3517	tcp_end_cwnd_reduction(sk);
				3518	tcp_try_keep_open(sk);
				3519	NET_INC_STATS_BH(sock_net(sk),
				3520	LINUX_MIB_TCPLOSSPROBERECOVERY);
				3521	} else if (!(flag & (FLAG_SND_UNA_ADVANCED |
				3522	FLAG_NOT_DUP | FLAG_DATA_SACKED))) {
				3523	/* Pure dupack: original and TLP probe arrived; no loss */
				3524	tp->tlp_high_seq = 0;
				3525	}
				3526	}
				3527
					/* Forward an ACK event to the congestion-control ops, if they provide
					* an in_ack_event hook; otherwise do nothing.
					*/
				3528	static inline void tcp_in_ack_event(struct sock *sk, u32 flags)
				3529	{
				3530	const struct inet_connection_sock *icsk = inet_csk(sk);
				3531
				3532	if (icsk->icsk_ca_ops->in_ack_event)
				3533	icsk->icsk_ca_ops->in_ack_event(sk, flags);
				3534	}
				3535
				3536	/* This routine deals with incoming acks, but not outgoing ones.
					*
					* Return value:
					* -1 the ACK was rejected: either it is older than
					* snd_una - max_window (a challenge ACK is attempted) or it
					* acknowledges data beyond tp->snd_nxt;
					* 0 the ACK is older than tp->snd_una (only SACK info is processed);
					* 1 the ACK was processed.
					*/
				3537	static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
				3538	{
				3539	struct inet_connection_sock *icsk = inet_csk(sk);
				3540	struct tcp_sock *tp = tcp_sk(sk);
				3541	struct tcp_sacktag_state sack_state;
				3542	u32 prior_snd_una = tp->snd_una;
				3543	u32 ack_seq = TCP_SKB_CB(skb)->seq;
				3544	u32 ack = TCP_SKB_CB(skb)->ack_seq;
				3545	bool is_dupack = false;
				3546	u32 prior_fackets;
				3547	int prior_packets = tp->packets_out;
				3548	const int prior_unsacked = tp->packets_out - tp->sacked_out;
				3549	int acked = 0; /* Number of packets newly acked */
				3550
				3551	sack_state.first_sackt.v64 = 0;
				3552
				3553	/* We very likely will need to access write queue head. */
				3554	prefetchw(sk->sk_write_queue.next);
				3555
				3556	/* If the ack is older than previous acks
				3557	* then we can probably ignore it.
				3558	*/
				3559	if (before(ack, prior_snd_una)) {
				3560	/* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */
				3561	if (before(ack, prior_snd_una - tp->max_window)) {
				3562	tcp_send_challenge_ack(sk, skb);
				3563	return -1;
				3564	}
				3565	goto old_ack;
				3566	}
				3567
				3568	/* If the ack includes data we haven't sent yet, discard
				3569	* this segment (RFC793 Section 3.9).
				3570	*/
				3571	if (after(ack, tp->snd_nxt))
				3572	goto invalid_ack;
				3573
				3574	if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
				3575	icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
				3576	tcp_rearm_rto(sk);
				3577
				3578	if (after(ack, prior_snd_una)) {
				3579	flag |= FLAG_SND_UNA_ADVANCED;
				3580	icsk->icsk_retransmits = 0;
				3581	}
				3582
				3583	prior_fackets = tp->fackets_out;
				3584
				3585	/* ts_recent update must be made after we are sure that the packet
				3586	* is in window.
				3587	*/
				3588	if (flag & FLAG_UPDATE_TS_RECENT)
				3589	tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
				3590
				3591	if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
				3592	/* Window is constant, pure forward advance.
				3593	* No more checks are required.
				3594	* Note, we use the fact that SND.UNA>=SND.WL2.
				3595	*/
				3596	tcp_update_wl(tp, ack_seq);
				3597	tcp_snd_una_update(tp, ack);
				3598	flag |= FLAG_WIN_UPDATE;
				3599
				3600	tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE);
				3601
				3602	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPACKS);
				3603	} else {
				3604	u32 ack_ev_flags = CA_ACK_SLOWPATH;
				3605
				3606	if (ack_seq != TCP_SKB_CB(skb)->end_seq)
				3607	flag |= FLAG_DATA;
				3608	else
				3609	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPUREACKS);
				3610
				3611	flag |= tcp_ack_update_window(sk, skb, ack, ack_seq);
				3612
				3613	if (TCP_SKB_CB(skb)->sacked)
				3614	flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
				3615	&sack_state);
				3616
				3617	if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) {
				3618	flag |= FLAG_ECE;
				3619	ack_ev_flags |= CA_ACK_ECE;
				3620	}
				3621
				3622	if (flag & FLAG_WIN_UPDATE)
				3623	ack_ev_flags |= CA_ACK_WIN_UPDATE;
				3624
				3625	tcp_in_ack_event(sk, ack_ev_flags);
				3626	}
				3627
				3628	/* We passed data and got it acked, remove any soft error
				3629	* log. Something worked...
				3630	*/
				3631	sk->sk_err_soft = 0;
				3632	icsk->icsk_probes_out = 0;
				3633	tp->rcv_tstamp = tcp_time_stamp;
				3634	if (!prior_packets)
				3635	goto no_queue;
				3636
				3637	/* See if we can take anything off of the retransmit queue. */
				3638	acked = tp->packets_out;
				3639	flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una,
				3640	&sack_state);
				3641	acked -= tp->packets_out;
				3642
				3643	if (tcp_ack_is_dubious(sk, flag)) {
				3644	is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
				3645	tcp_fastretrans_alert(sk, acked, prior_unsacked,
				3646	is_dupack, flag);
				3647	}
				3648	if (tp->tlp_high_seq)
				3649	tcp_process_tlp_ack(sk, ack, flag);
				3650
				3651	/* Advance cwnd if state allows */
				3652	if (tcp_may_raise_cwnd(sk, flag))
				3653	tcp_cong_avoid(sk, ack, acked);
				3654
				3655	if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) {
				3656	struct dst_entry *dst = __sk_dst_get(sk);
				3657	if (dst)
				3658	dst_confirm(dst);
				3659	}
				3660
				3661	if (icsk->icsk_pending == ICSK_TIME_RETRANS)
				3662	tcp_schedule_loss_probe(sk);
				3663	tcp_update_pacing_rate(sk);
				3664	return 1;
				3665
				3666	no_queue:
				3667	/* If data was DSACKed, see if we can undo a cwnd reduction. */
				3668	if (flag & FLAG_DSACKING_ACK)
				3669	tcp_fastretrans_alert(sk, acked, prior_unsacked,
				3670	is_dupack, flag);
				3671	/* If this ack opens up a zero window, clear backoff. It was
				3672	* being used to time the probes, and is probably far higher than
				3673	* it needs to be for normal retransmission.
				3674	*/
				3675	if (tcp_send_head(sk))
				3676	tcp_ack_probe(sk);
				3677
				3678	if (tp->tlp_high_seq)
				3679	tcp_process_tlp_ack(sk, ack, flag);
				3680	return 1;
				3681
				3682	invalid_ack:
				3683	SOCK_DEBUG(sk, "Ack %u after %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
				3684	return -1;
				3685
				3686	old_ack:
				3687	/* If data was SACKed, tag it and see if we should send more data.
				3688	* If data was DSACKed, see if we can undo a cwnd reduction.
				3689	*/
				3690	if (TCP_SKB_CB(skb)->sacked) {
				3691	flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
				3692	&sack_state);
				3693	tcp_fastretrans_alert(sk, acked, prior_unsacked,
				3694	is_dupack, flag);
				3695	}
				3696
				3697	SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
				3698	return 0;
				3699	}
				3700
					/* Store a Fast Open cookie found in a SYN or SYN-ACK into foc.
					*
					* len is the cookie length in bytes (option size minus the option
					* base length). Nothing is written when foc is NULL, syn is false, or
					* len is negative or odd. A length within
					* [TCP_FASTOPEN_COOKIE_MIN, TCP_FASTOPEN_COOKIE_MAX] copies the cookie;
					* any other non-zero length is recorded as foc->len = -1; a zero
					* length is recorded as 0. exp_opt is stored in foc->exp.
					*/
				3701	static void tcp_parse_fastopen_option(int len, const unsigned char *cookie,
				3702	bool syn, struct tcp_fastopen_cookie *foc,
				3703	bool exp_opt)
				3704	{
				3705	/* Valid only in SYN or SYN-ACK with an even length. */
				3706	if (!foc || !syn || len < 0 || (len & 1))
				3707	return;
				3708
				3709	if (len >= TCP_FASTOPEN_COOKIE_MIN &&
				3710	len <= TCP_FASTOPEN_COOKIE_MAX)
				3711	memcpy(foc->val, cookie, len);
				3712	else if (len != 0)
				3713	len = -1;
				3714	foc->len = len;
				3715	foc->exp = exp_opt;
				3716	}
				3717
				3718	/* Look for tcp options. Normally only called on SYN and SYNACK packets.
				3719	* But, this can also be called on packets in the established flow when
				3720	* the fast version below fails.
					*
					* Walks the option area that follows the fixed TCP header (doff * 4
					* minus sizeof(struct tcphdr) bytes) and records recognised options in
					* opt_rx (and foc for Fast Open). Parsing stops at EOL, at an option
					* with length < 2, at an option that claims more bytes than remain,
					* or when fewer than two bytes remain for a kind/length pair.
				3721	*/
				3722	void tcp_parse_options(const struct sk_buff *skb,
				3723	struct tcp_options_received *opt_rx, int estab,
				3724	struct tcp_fastopen_cookie *foc)
				3725	{
				3726	const unsigned char *ptr;
				3727	const struct tcphdr *th = tcp_hdr(skb);
				3728	int length = (th->doff * 4) - sizeof(struct tcphdr);
				3729
				3730	ptr = (const unsigned char *)(th + 1);
				3731	opt_rx->saw_tstamp = 0;
				3732
				3733	while (length > 0) {
				3734	int opcode = *ptr++;
				3735	int opsize;
				3736
				3737	switch (opcode) {
				3738	case TCPOPT_EOL:
				3739	return;
				3740	case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
				3741	length--;
				3742	continue;
				3743	default:
					/* The length byte must lie inside the option area. */
					if (length < 2)
					return;
				3744	opsize = *ptr++;
				3745	if (opsize < 2) /* "silly options" */
				3746	return;
				3747	if (opsize > length)
				3748	return; /* don't parse partial options */
				3749	switch (opcode) {
				3750	case TCPOPT_MSS:
				3751	if (opsize == TCPOLEN_MSS && th->syn && !estab) {
				3752	u16 in_mss = get_unaligned_be16(ptr);
				3753	if (in_mss) {
				3754	if (opt_rx->user_mss &&
				3755	opt_rx->user_mss < in_mss)
				3756	in_mss = opt_rx->user_mss;
				3757	opt_rx->mss_clamp = in_mss;
				3758	}
				3759	}
				3760	break;
				3761	case TCPOPT_WINDOW:
				3762	if (opsize == TCPOLEN_WINDOW && th->syn &&
				3763	!estab && sysctl_tcp_window_scaling) {
				3764	__u8 snd_wscale = *(__u8 *)ptr;
				3765	opt_rx->wscale_ok = 1;
				3766	if (snd_wscale > 14) {
				3767	net_info_ratelimited("%s: Illegal window scaling value %d >14 received\n",
				3768	__func__,
				3769	snd_wscale);
				3770	snd_wscale = 14;
				3771	}
				3772	opt_rx->snd_wscale = snd_wscale;
				3773	}
				3774	break;
				3775	case TCPOPT_TIMESTAMP:
				3776	if ((opsize == TCPOLEN_TIMESTAMP) &&
				3777	((estab && opt_rx->tstamp_ok) ||
				3778	(!estab && sysctl_tcp_timestamps))) {
				3779	opt_rx->saw_tstamp = 1;
				3780	opt_rx->rcv_tsval = get_unaligned_be32(ptr);
				3781	opt_rx->rcv_tsecr = get_unaligned_be32(ptr + 4);
				3782	}
				3783	break;
				3784	case TCPOPT_SACK_PERM:
				3785	if (opsize == TCPOLEN_SACK_PERM && th->syn &&
				3786	!estab && sysctl_tcp_sack) {
				3787	opt_rx->sack_ok = TCP_SACK_SEEN;
				3788	tcp_sack_reset(opt_rx);
				3789	}
				3790	break;
				3791
				3792	case TCPOPT_SACK:
				3793	if ((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) &&
				3794	!((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK) &&
				3795	opt_rx->sack_ok) {
				3796	TCP_SKB_CB(skb)->sacked = (ptr - 2) - (unsigned char *)th;
				3797	}
				3798	break;
				3799	#ifdef CONFIG_TCP_MD5SIG
				3800	case TCPOPT_MD5SIG:
				3801	/*
				3802	* The MD5 Hash has already been
				3803	* checked (see tcp_v{4,6}_do_rcv()).
				3804	*/
				3805	break;
				3806	#endif
				3807	case TCPOPT_FASTOPEN:
				3808	tcp_parse_fastopen_option(
				3809	opsize - TCPOLEN_FASTOPEN_BASE,
				3810	ptr, th->syn, foc, false);
				3811	break;
				3812
				3813	case TCPOPT_EXP:
				3814	/* Fast Open option shares code 254 using a
				3815	* 16 bits magic number.
				3816	*/
				3817	if (opsize >= TCPOLEN_EXP_FASTOPEN_BASE &&
				3818	get_unaligned_be16(ptr) ==
				3819	TCPOPT_FASTOPEN_MAGIC)
				3820	tcp_parse_fastopen_option(opsize -
				3821	TCPOLEN_EXP_FASTOPEN_BASE,
				3822	ptr + 2, th->syn, foc, true);
				3823	break;
				3824
				3825	}
				3826	ptr += opsize-2;
				3827	length -= opsize;
				3828	}
				3829	}
				3830	}
				3831	EXPORT_SYMBOL(tcp_parse_options);
				3832
					/* Parse the option area when it is expected to be exactly
					* NOP, NOP, TIMESTAMP(kind, length) followed by TSval and TSecr.
					*
					* On a match, sets saw_tstamp and rcv_tsval, sets rcv_tsecr to the
					* echoed value minus tp->tsoffset (or to 0 when the echoed value is 0),
					* and returns true. Returns false without touching tp otherwise.
					*/
				3833	static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr *th)
				3834	{
				3835	const __be32 *ptr = (const __be32 *)(th + 1);
				3836
				3837	if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
				3838	| (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
				3839	tp->rx_opt.saw_tstamp = 1;
				3840	++ptr;
				3841	tp->rx_opt.rcv_tsval = ntohl(*ptr);
				3842	++ptr;
				3843	if (*ptr)
				3844	tp->rx_opt.rcv_tsecr = ntohl(*ptr) - tp->tsoffset;
				3845	else
				3846	tp->rx_opt.rcv_tsecr = 0;
				3847	return true;
				3848	}
				3849	return false;
				3850	}
				3851
				3852	/* Fast parse options. This hopes to only see timestamps.
				3853	* If it is wrong it falls back on tcp_parse_options().
					*
					* Returns false (and clears saw_tstamp) when the header carries no
					* options at all; returns true in every other case.
				3854	*/
				3855	static bool tcp_fast_parse_options(const struct sk_buff *skb,
				3856	const struct tcphdr *th, struct tcp_sock *tp)
				3857	{
				3858	/* In the spirit of fast parsing, compare doff directly to constant
				3859	* values. Because equality is used, short doff can be ignored here.
				3860	*/
				3861	if (th->doff == (sizeof(*th) / 4)) {
				3862	tp->rx_opt.saw_tstamp = 0;
				3863	return false;
				3864	} else if (tp->rx_opt.tstamp_ok &&
				3865	th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) {
				3866	if (tcp_parse_aligned_timestamp(tp, th))
				3867	return true;
				3868	}
				3869
				3870	tcp_parse_options(skb, &tp->rx_opt, 1, NULL);
				3871	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
				3872	tp->rx_opt.rcv_tsecr -= tp->tsoffset;
				3873
				3874	return true;
				3875	}
				3876
				3877	#ifdef CONFIG_TCP_MD5SIG
				3878	/*
				3879	* Parse MD5 Signature option
					*
					* Returns a pointer to the option payload (the byte after the length
					* byte) when an MD5SIG option of exactly TCPOLEN_MD5SIG bytes is found,
					* otherwise NULL. The walk stops as soon as fewer than TCPOLEN_MD5SIG
					* bytes remain: no MD5 option can fit any more, and this keeps the
					* kind/length reads inside the option area.
				3880	*/
				3881	const u8 *tcp_parse_md5sig_option(const struct tcphdr *th)
				3882	{
				3883	int length = (th->doff << 2) - sizeof(*th);
				3884	const u8 *ptr = (const u8 *)(th + 1);
				3885
				3886	/* If the TCP option is too short, we can short cut */
				3887	if (length < TCPOLEN_MD5SIG)
				3888	return NULL;
				3889
				3890	while (length >= TCPOLEN_MD5SIG) {
				3891	int opcode = *ptr++;
				3892	int opsize;
				3893
				3894	switch (opcode) {
				3895	case TCPOPT_EOL:
				3896	return NULL;
				3897	case TCPOPT_NOP:
				3898	length--;
				3899	continue;
				3900	default:
				3901	opsize = *ptr++;
				3902	if (opsize < 2 || opsize > length)
				3903	return NULL;
				3904	if (opcode == TCPOPT_MD5SIG)
				3905	return opsize == TCPOLEN_MD5SIG ? ptr : NULL;
				3906	}
				3907	ptr += opsize - 2;
				3908	length -= opsize;
				3909	}
				3910	return NULL;
				3911	}
				3912	EXPORT_SYMBOL(tcp_parse_md5sig_option);
				3913	#endif
				3914
				3915	/* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM
				3916	*
				3917	* It is not fatal. If this ACK does _not_ change critical state (seqs, window)
				3918	* it can pass through stack. So, the following predicate verifies that
				3919	* this segment is not used for anything but congestion avoidance or
				3920	* fast retransmit. Moreover, we even are able to eliminate most of such
				3921	* second order effects, if we apply some small "replay" window (~RTO)
				3922	* to timestamp space.
				3923	*
				3924	* All these measures still do not guarantee that we reject wrapped ACKs
				3925	* on networks with high bandwidth, when sequence space is recycled quickly,
				3926	* but it guarantees that such events will be very rare and do not affect
				3927	* connection seriously. This doesn't look nice, but alas, PAWS is really
				3928	* buggy extension.
				3929	*
				3930	* [ Later note. Even worse! It is buggy for segments _with_ data. RFC
				3931	* states that events when retransmit arrives after original data are rare.
				3932	* It is a blatant lie. VJ forgot about fast retransmit! 8)8) It is
				3933	* the biggest problem on large power networks even with minor reordering.
				3934	* OK, let's give it small replay window. If peer clock is even 1hz, it is safe
				3935	* up to bandwidth of 18Gigabit/sec. 8) ]
				3936	*/
				3937
				3938	static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb)
				3939	{
					/* Returns non-zero only when all four numbered conditions below hold. */
				3940	const struct tcp_sock *tp = tcp_sk(sk);
				3941	const struct tcphdr *th = tcp_hdr(skb);
				3942	u32 seq = TCP_SKB_CB(skb)->seq;
				3943	u32 ack = TCP_SKB_CB(skb)->ack_seq;
				3944
				3945	return (/* 1. Pure ACK with correct sequence number. */
				3946	(th->ack && seq == TCP_SKB_CB(skb)->end_seq && seq == tp->rcv_nxt) &&
				3947
				3948	/* 2. ... and duplicate ACK. */
				3949	ack == tp->snd_una &&
				3950
				3951	/* 3. ... and does not update window. */
				3952	!tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) &&
				3953
				3954	/* 4. ... and sits in replay window. */
				3955	(s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ);
				3956	}
				3957
					/* Returns true when the segment fails tcp_paws_check() with
					* TCP_PAWS_WINDOW and is not excused by tcp_disordered_ack().
					*/
				3958	static inline bool tcp_paws_discard(const struct sock *sk,
				3959	const struct sk_buff *skb)
				3960	{
				3961	const struct tcp_sock *tp = tcp_sk(sk);
				3962
				3963	return !tcp_paws_check(&tp->rx_opt, TCP_PAWS_WINDOW) &&
				3964	!tcp_disordered_ack(sk, skb);
				3965	}
				3966
				3967	/* Check segment sequence number for validity.
				3968	*
				3969	* Segment controls are considered valid, if the segment
				3970	* fits to the window after truncation to the window. Acceptability
				3971	* of data (and SYN, FIN, of course) is checked separately.
				3972	* See tcp_data_queue(), for example.
				3973	*
				3974	* Also, controls (RST is main one) are accepted using RCV.WUP instead
				3975	* of RCV.NXT. Peer still did not advance his SND.UNA when we
				3976	* delayed ACK, so that his SND.UNA <= our RCV.WUP.
				3977	* (borrowed from freebsd)
				3978	*/
				3979
					/* True when end_seq is not before tp->rcv_wup and seq is not after
					* tp->rcv_nxt + tcp_receive_window(tp).
					*/
				3980	static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq)
				3981	{
				3982	return !before(end_seq, tp->rcv_wup) &&
				3983	!after(seq, tp->rcv_nxt + tcp_receive_window(tp));
				3984	}
				3985
				3986	/* When we get a reset we do this.
					*
					* Sets sk->sk_err according to the socket state (ECONNREFUSED in
					* SYN_SENT, EPIPE in CLOSE_WAIT, ECONNRESET otherwise), reports the
					* error unless the socket is SOCK_DEAD, then calls tcp_done(). A socket
					* already in TCP_CLOSE is left untouched.
					*/
				3987	void tcp_reset(struct sock *sk)
				3988	{
				3989	/* We want the right error as BSD sees it (and indeed as we do). */
				3990	switch (sk->sk_state) {
				3991	case TCP_SYN_SENT:
				3992	sk->sk_err = ECONNREFUSED;
				3993	break;
				3994	case TCP_CLOSE_WAIT:
				3995	sk->sk_err = EPIPE;
				3996	break;
				3997	case TCP_CLOSE:
				3998	return;
				3999	default:
				4000	sk->sk_err = ECONNRESET;
				4001	}
				4002	/* This barrier is coupled with smp_rmb() in tcp_poll() */
				4003	smp_wmb();
				4004
				4005	if (!sock_flag(sk, SOCK_DEAD))
				4006	sk->sk_error_report(sk);
				4007
				4008	tcp_done(sk);
				4009	}
				4010
				4011	/*
				4012	* Process the FIN bit. This now behaves as it is supposed to work
				4013	* and the FIN takes effect when it is validly part of sequence
				4014	* space. Not before when we get holes.
				4015	*
				4016	* If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
				4017	* (and thence onto LAST-ACK and finally, CLOSE, we never enter
				4018	* TIME-WAIT)
				4019	*
				4020	* If we are in FINWAIT-1, a received FIN indicates simultaneous
				4021	* close and we go into CLOSING (and later onto TIME-WAIT)
				4022	*
				4023	* If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
				4024	*/
				4025	static void tcp_fin(struct sock *sk)
				4026	{
				4027	struct tcp_sock *tp = tcp_sk(sk);
				4028
				4029	inet_csk_schedule_ack(sk);
				4030
					/* The peer will send no more data: mark the receive side shut down. */
				4031	sk->sk_shutdown |= RCV_SHUTDOWN;
				4032	sock_set_flag(sk, SOCK_DONE);
				4033
				4034	switch (sk->sk_state) {
				4035	case TCP_SYN_RECV:
				4036	case TCP_ESTABLISHED:
				4037	/* Move to CLOSE_WAIT */
				4038	tcp_set_state(sk, TCP_CLOSE_WAIT);
				4039	inet_csk(sk)->icsk_ack.pingpong = 1;
				4040	break;
				4041
				4042	case TCP_CLOSE_WAIT:
				4043	case TCP_CLOSING:
				4044	/* Received a retransmission of the FIN, do
				4045	* nothing.
				4046	*/
				4047	break;
				4048	case TCP_LAST_ACK:
				4049	/* RFC793: Remain in the LAST-ACK state. */
				4050	break;
				4051
				4052	case TCP_FIN_WAIT1:
				4053	/* This case occurs when a simultaneous close
				4054	* happens, we must ack the received FIN and
				4055	* enter the CLOSING state.
				4056	*/
				4057	tcp_send_ack(sk);
				4058	tcp_set_state(sk, TCP_CLOSING);
				4059	break;
				4060	case TCP_FIN_WAIT2:
				4061	/* Received a FIN -- send ACK and enter TIME_WAIT. */
				4062	tcp_send_ack(sk);
				4063	tcp_time_wait(sk, TCP_TIME_WAIT, 0);
				4064	break;
				4065	default:
				4066	/* Only TCP_LISTEN and TCP_CLOSE are left, in these
				4067	* cases we should never reach this piece of code.
				4068	*/
				4069	pr_err("%s: Impossible, sk->sk_state=%d\n",
				4070	__func__, sk->sk_state);
				4071	break;
				4072	}
				4073
				4074	/* It _is_ possible, that we have something out-of-order _after_ FIN.
				4075	* Probably, we should reset in this case. For now drop them.
				4076	*/
				4077	__skb_queue_purge(&tp->out_of_order_queue);
				4078	if (tcp_is_sack(tp))
				4079	tcp_sack_reset(&tp->rx_opt);
				4080	sk_mem_reclaim(sk);
				4081
				4082	if (!sock_flag(sk, SOCK_DEAD)) {
				4083	sk->sk_state_change(sk);
				4084
				4085	/* Do not send POLL_HUP for half duplex close. */
				4086	if (sk->sk_shutdown == SHUTDOWN_MASK ||
				4087	sk->sk_state == TCP_CLOSE)
				4088	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
				4089	else
				4090	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
				4091	}
				4092	}
				4093
					/* If [seq, end_seq] overlaps or touches the SACK block *sp, grow *sp to
					* cover the union of both ranges and return true; otherwise leave *sp
					* unchanged and return false.
					*/
				4094	static inline bool tcp_sack_extend(struct tcp_sack_block *sp, u32 seq,
				4095	u32 end_seq)
				4096	{
				4097	if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) {
				4098	if (before(seq, sp->start_seq))
				4099	sp->start_seq = seq;
				4100	if (after(end_seq, sp->end_seq))
				4101	sp->end_seq = end_seq;
				4102	return true;
				4103	}
				4104	return false;
				4105	}
				4106
					/* Record [seq, end_seq] as the D-SACK block in tp->duplicate_sack[0] and
					* set rx_opt.dsack. Does nothing unless SACK is enabled on the socket
					* and sysctl_tcp_dsack is set. The MIB counter depends on whether seq
					* is before tp->rcv_nxt (OLDSENT) or not (OFOSENT).
					*/
				4107	static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq)
				4108	{
				4109	struct tcp_sock *tp = tcp_sk(sk);
				4110
				4111	if (tcp_is_sack(tp) && sysctl_tcp_dsack) {
				4112	int mib_idx;
				4113
				4114	if (before(seq, tp->rcv_nxt))
				4115	mib_idx = LINUX_MIB_TCPDSACKOLDSENT;
				4116	else
				4117	mib_idx = LINUX_MIB_TCPDSACKOFOSENT;
				4118
				4119	NET_INC_STATS_BH(sock_net(sk), mib_idx);
				4120
				4121	tp->rx_opt.dsack = 1;
				4122	tp->duplicate_sack[0].start_seq = seq;
				4123	tp->duplicate_sack[0].end_seq = end_seq;
				4124	}
				4125	}
				4126
					/* Start a new D-SACK block if none is pending (rx_opt.dsack == 0);
					* otherwise try to extend the pending one with [seq, end_seq].
					*/
				4127	static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq)
				4128	{
				4129	struct tcp_sock *tp = tcp_sk(sk);
				4130
				4131	if (!tp->rx_opt.dsack)
				4132	tcp_dsack_set(sk, seq, end_seq);
				4133	else
				4134	tcp_sack_extend(tp->duplicate_sack, seq, end_seq);
				4135	}
				4136
					/* Send an ACK in response to skb. If skb carries data that starts
					* before tp->rcv_nxt, also count a lost delayed ACK, enter quickack
					* mode and, when D-SACK is enabled, report the already-received part
					* [seq, min(end_seq, rcv_nxt)) as a D-SACK block.
					*/
				4137	static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
				4138	{
				4139	struct tcp_sock *tp = tcp_sk(sk);
				4140
				4141	if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
				4142	before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
				4143	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
				4144	tcp_enter_quickack_mode(sk);
				4145
				4146	if (tcp_is_sack(tp) && sysctl_tcp_dsack) {
				4147	u32 end_seq = TCP_SKB_CB(skb)->end_seq;
				4148
				4149	if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))
				4150	end_seq = tp->rcv_nxt;
				4151	tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, end_seq);
				4152	}
				4153	}
				4154
				4155	tcp_send_ack(sk);
				4156	}
				4157
				4158	/* These routines update the SACK block as out-of-order packets arrive or
				4159	* in-order packets close up the sequence space.
					*
					* tcp_sack_maybe_coalesce() merges into selective_acks[0] every later
					* block that overlaps or touches it, compacting the array and
					* decrementing rx_opt.num_sacks for each block absorbed.
				4160	*/
				4161	static void tcp_sack_maybe_coalesce(struct tcp_sock *tp)
				4162	{
				4163	int this_sack;
				4164	struct tcp_sack_block *sp = &tp->selective_acks[0];
				4165	struct tcp_sack_block *swalk = sp + 1;
				4166
				4167	/* See if the recent change to the first SACK eats into
				4168	* or hits the sequence space of other SACK blocks, if so coalesce.
				4169	*/
				4170	for (this_sack = 1; this_sack < tp->rx_opt.num_sacks;) {
				4171	if (tcp_sack_extend(sp, swalk->start_seq, swalk->end_seq)) {
				4172	int i;
				4173
				4174	/* Zap SWALK, by moving every further SACK up by one slot.
				4175	* Decrease num_sacks.
				4176	*/
				4177	tp->rx_opt.num_sacks--;
				4178	for (i = this_sack; i < tp->rx_opt.num_sacks; i++)
				4179	sp[i] = sp[i + 1];
					/* Re-test the same slot: it now holds the next block. */
				4180	continue;
				4181	}
				4182	this_sack++, swalk++;
				4183	}
				4184	}
				4185
					/* Account a newly queued out-of-order range [seq, end_seq] in the SACK
					* table. If it extends an existing block, that block is rotated to the
					* front and neighbours are coalesced; otherwise a new block is inserted
					* at the front, dropping the last block when the table is full.
					*/
				4186	static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
				4187	{
				4188	struct tcp_sock *tp = tcp_sk(sk);
				4189	struct tcp_sack_block *sp = &tp->selective_acks[0];
				4190	int cur_sacks = tp->rx_opt.num_sacks;
				4191	int this_sack;
				4192
				4193	if (!cur_sacks)
				4194	goto new_sack;
				4195
				4196	for (this_sack = 0; this_sack < cur_sacks; this_sack++, sp++) {
				4197	if (tcp_sack_extend(sp, seq, end_seq)) {
				4198	/* Rotate this_sack to the first one. */
				4199	for (; this_sack > 0; this_sack--, sp--)
				4200	swap(*sp, *(sp - 1));
				4201	if (cur_sacks > 1)
				4202	tcp_sack_maybe_coalesce(tp);
				4203	return;
				4204	}
				4205	}
				4206
				4207	/* Could not find an adjacent existing SACK, build a new one,
				4208	* put it at the front, and shift everyone else down. We
				4209	* always know there is at least one SACK present already here.
				4210	*
				4211	* If the sack array is full, forget about the last one.
				4212	*/
				4213	if (this_sack >= TCP_NUM_SACKS) {
				4214	this_sack--;
				4215	tp->rx_opt.num_sacks--;
				4216	sp--;
				4217	}
				4218	for (; this_sack > 0; this_sack--, sp--)
				4219	*sp = *(sp - 1);
				4220
				4221	new_sack:
				4222	/* Build the new head SACK, and we're done. */
				4223	sp->start_seq = seq;
				4224	sp->end_seq = end_seq;
				4225	tp->rx_opt.num_sacks++;
				4226	}
				4227
				4228	/* RCV.NXT advances, some SACKs should be eaten.
					*
					* Drops every SACK block whose start is at or before tp->rcv_nxt and
					* stores the remaining count in rx_opt.num_sacks. If the out-of-order
					* queue is empty, all blocks are dropped at once.
					*/
				4229
				4230	static void tcp_sack_remove(struct tcp_sock *tp)
				4231	{
				4232	struct tcp_sack_block *sp = &tp->selective_acks[0];
				4233	int num_sacks = tp->rx_opt.num_sacks;
				4234	int this_sack;
				4235
				4236	/* Empty ofo queue, hence, all the SACKs are eaten. Clear. */
				4237	if (skb_queue_empty(&tp->out_of_order_queue)) {
				4238	tp->rx_opt.num_sacks = 0;
				4239	return;
				4240	}
				4241
				4242	for (this_sack = 0; this_sack < num_sacks;) {
				4243	/* Check if the start of the sack is covered by RCV.NXT. */
				4244	if (!before(tp->rcv_nxt, sp->start_seq)) {
				4245	int i;
				4246
				4247	/* RCV.NXT must cover all the block! */
				4248	WARN_ON(before(tp->rcv_nxt, sp->end_seq));
				4249
				4250	/* Zap this SACK, by moving forward any other SACKS. */
				4251	for (i = this_sack+1; i < num_sacks; i++)
				4252	tp->selective_acks[i-1] = tp->selective_acks[i];
				4253	num_sacks--;
					/* sp stays put: the slot now holds the next block. */
				4254	continue;
				4255	}
				4256	this_sack++;
				4257	sp++;
				4258	}
				4259	tp->rx_opt.num_sacks = num_sacks;
				4260	}
				4261
				4262	/**
				4263	* tcp_try_coalesce - try to merge skb to prior one
				4264	* @sk: socket
				4265	* @to: prior buffer
				4266	* @from: buffer to add in queue
				4267	* @fragstolen: pointer to boolean
				4268	*
				4269	* Before queueing skb @from after @to, try to merge them
				4270	* to reduce overall memory use and queue lengths, if cost is small.
				4271	* Packets in ofo or receive queues can stay a long time.
				4272	* Better try to coalesce them right now to avoid future collapses.
				4273	* Returns true if caller should free @from instead of queueing it
					*
					* Merging is attempted only when @from starts exactly at @to's end_seq.
					* On success @to takes over @from's end_seq and ack_seq, and @from's
					* tcp_flags are OR-ed into @to's.
				4274	*/
				4275	static bool tcp_try_coalesce(struct sock *sk,
				4276	struct sk_buff *to,
				4277	struct sk_buff *from,
				4278	bool *fragstolen)
				4279	{
				4280	int delta;
				4281
				4282	*fragstolen = false;
				4283
				4284	/* Its possible this segment overlaps with prior segment in queue */
				4285	if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq)
				4286	return false;
				4287
				4288	if (!skb_try_coalesce(to, from, fragstolen, &delta))
				4289	return false;
				4290
				4291	atomic_add(delta, &sk->sk_rmem_alloc);
				4292	sk_mem_charge(sk, delta);
				4293	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);
				4294	TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq;
				4295	TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
				4296	TCP_SKB_CB(to)->tcp_flags |= TCP_SKB_CB(from)->tcp_flags;
				4297	return true;
				4298	}
				4299
				4300	/* This one checks to see if we can put data from the
				4301	* out_of_order queue into the receive_queue.
					*
					* Segments are taken from the head of the out-of-order queue while they
					* start at or before tp->rcv_nxt. Fully duplicate segments are freed;
					* the others advance rcv_nxt and are coalesced into the receive-queue
					* tail or appended to it. Overlap with already-received data is
					* reported through tcp_dsack_extend().
				4302	*/
				4303	static void tcp_ofo_queue(struct sock *sk)
				4304	{
				4305	struct tcp_sock *tp = tcp_sk(sk);
				4306	__u32 dsack_high = tp->rcv_nxt;
				4307	struct sk_buff *skb, *tail;
				4308	bool fragstolen, eaten;
				4309
				4310	while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) {
				4311	if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
				4312	break;
				4313
				4314	if (before(TCP_SKB_CB(skb)->seq, dsack_high)) {
				4315	__u32 dsack = dsack_high;
				4316	if (before(TCP_SKB_CB(skb)->end_seq, dsack_high))
				4317	dsack_high = TCP_SKB_CB(skb)->end_seq;
				4318	tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack);
				4319	}
				4320
				4321	__skb_unlink(skb, &tp->out_of_order_queue);
				4322	if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
				4323	SOCK_DEBUG(sk, "ofo packet was already received\n");
				4324	__kfree_skb(skb);
				4325	continue;
				4326	}
				4327	SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n",
				4328	tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
				4329	TCP_SKB_CB(skb)->end_seq);
				4330
				4331	tail = skb_peek_tail(&sk->sk_receive_queue);
				4332	eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen);
				4333	tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
				4334	if (!eaten)
				4335	__skb_queue_tail(&sk->sk_receive_queue, skb);
				4336	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
				4337	tcp_fin(sk);
				4338	if (eaten)
				4339	kfree_skb_partial(skb, fragstolen);
				4340	}
				4341	}
				4342
				4343	static bool tcp_prune_ofo_queue(struct sock *sk);
				4344	static int tcp_prune_queue(struct sock *sk);
				4345
				4346	static int tcp_try_rmem_schedule(struct sock sk, struct sk_buff skb,
				4347	unsigned int size)
				4348	{
				4349	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf \|\|
				4350	!sk_rmem_schedule(sk, skb, size)) {
				4351
				4352	if (tcp_prune_queue(sk) < 0)
				4353	return -1;
				4354
				4355	if (!sk_rmem_schedule(sk, skb, size)) {
				4356	if (!tcp_prune_ofo_queue(sk))
				4357	return -1;
				4358
				4359	if (!sk_rmem_schedule(sk, skb, size))
				4360	return -1;
				4361	}
				4362	}
				4363	return 0;
				4364	}
				4365
				4366	static void tcp_data_queue_ofo(struct sock sk, struct sk_buff skb)
				4367	{
				4368	struct tcp_sock *tp = tcp_sk(sk);
				4369	struct sk_buff *skb1;
				4370	u32 seq, end_seq;
				4371
				4372	tcp_ecn_check_ce(tp, skb);
				4373
				4374	if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
				4375	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP);
				4376	__kfree_skb(skb);
				4377	return;
				4378	}
				4379
				4380	/* Disable header prediction. */
				4381	tp->pred_flags = 0;
				4382	inet_csk_schedule_ack(sk);
				4383
				4384	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
				4385	SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
				4386	tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
				4387
				4388	skb1 = skb_peek_tail(&tp->out_of_order_queue);
				4389	if (!skb1) {
				4390	/* Initial out of order segment, build 1 SACK. */
				4391	if (tcp_is_sack(tp)) {
				4392	tp->rx_opt.num_sacks = 1;
				4393	tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
				4394	tp->selective_acks[0].end_seq =
				4395	TCP_SKB_CB(skb)->end_seq;
				4396	}
				4397	__skb_queue_head(&tp->out_of_order_queue, skb);
				4398	goto end;
				4399	}
				4400
				4401	seq = TCP_SKB_CB(skb)->seq;
				4402	end_seq = TCP_SKB_CB(skb)->end_seq;
				4403
				4404	if (seq == TCP_SKB_CB(skb1)->end_seq) {
				4405	bool fragstolen;
				4406
				4407	if (!tcp_try_coalesce(sk, skb1, skb, &fragstolen)) {
				4408	__skb_queue_after(&tp->out_of_order_queue, skb1, skb);
				4409	} else {
				4410	tcp_grow_window(sk, skb);
				4411	kfree_skb_partial(skb, fragstolen);
				4412	skb = NULL;
				4413	}
				4414
				4415	if (!tp->rx_opt.num_sacks \|\|
				4416	tp->selective_acks[0].end_seq != seq)
				4417	goto add_sack;
				4418
				4419	/* Common case: data arrive in order after hole. */
				4420	tp->selective_acks[0].end_seq = end_seq;
				4421	goto end;
				4422	}
				4423
				4424	/* Find place to insert this segment. */
				4425	while (1) {
				4426	if (!after(TCP_SKB_CB(skb1)->seq, seq))
				4427	break;
				4428	if (skb_queue_is_first(&tp->out_of_order_queue, skb1)) {
				4429	skb1 = NULL;
				4430	break;
				4431	}
				4432	skb1 = skb_queue_prev(&tp->out_of_order_queue, skb1);
				4433	}
				4434
				4435	/* Do skb overlap to previous one? */
				4436	if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
				4437	if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
				4438	/* All the bits are present. Drop. */
				4439	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
				4440	__kfree_skb(skb);
				4441	skb = NULL;
				4442	tcp_dsack_set(sk, seq, end_seq);
				4443	goto add_sack;
				4444	}
				4445	if (after(seq, TCP_SKB_CB(skb1)->seq)) {
				4446	/* Partial overlap. */
				4447	tcp_dsack_set(sk, seq,
				4448	TCP_SKB_CB(skb1)->end_seq);
				4449	} else {
				4450	if (skb_queue_is_first(&tp->out_of_order_queue,
				4451	skb1))
				4452	skb1 = NULL;
				4453	else
				4454	skb1 = skb_queue_prev(
				4455	&tp->out_of_order_queue,
				4456	skb1);
				4457	}
				4458	}
				4459	if (!skb1)
				4460	__skb_queue_head(&tp->out_of_order_queue, skb);
				4461	else
				4462	__skb_queue_after(&tp->out_of_order_queue, skb1, skb);
				4463
				4464	/* And clean segments covered by new one as whole. */
				4465	while (!skb_queue_is_last(&tp->out_of_order_queue, skb)) {
				4466	skb1 = skb_queue_next(&tp->out_of_order_queue, skb);
				4467
				4468	if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
				4469	break;
				4470	if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
				4471	tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
				4472	end_seq);
				4473	break;
				4474	}
				4475	__skb_unlink(skb1, &tp->out_of_order_queue);
				4476	tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
				4477	TCP_SKB_CB(skb1)->end_seq);
				4478	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
				4479	__kfree_skb(skb1);
				4480	}
				4481
				4482	add_sack:
				4483	if (tcp_is_sack(tp))
				4484	tcp_sack_new_ofo_skb(sk, seq, end_seq);
				4485	end:
				4486	if (skb) {
				4487	tcp_grow_window(sk, skb);
				4488	skb_set_owner_r(skb, sk);
				4489	}
				4490	}
				4491
				4492	static int __must_check tcp_queue_rcv(struct sock sk, struct sk_buff skb, int hdrlen,
				4493	bool *fragstolen)
				4494	{
				4495	int eaten;
				4496	struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue);
				4497
				4498	__skb_pull(skb, hdrlen);
				4499	eaten = (tail &&
				4500	tcp_try_coalesce(sk, tail, skb, fragstolen)) ? 1 : 0;
				4501	tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq);
				4502	if (!eaten) {
				4503	__skb_queue_tail(&sk->sk_receive_queue, skb);
				4504	skb_set_owner_r(skb, sk);
				4505	}
				4506	return eaten;
				4507	}
				4508
				4509	int tcp_send_rcvq(struct sock sk, struct msghdr msg, size_t size)
				4510	{
				4511	struct sk_buff *skb;
				4512	int err = -ENOMEM;
				4513	int data_len = 0;
				4514	bool fragstolen;
				4515
				4516	if (size == 0)
				4517	return 0;
				4518
				4519	if (size > PAGE_SIZE) {
				4520	int npages = min_t(size_t, size >> PAGE_SHIFT, MAX_SKB_FRAGS);
				4521
				4522	data_len = npages << PAGE_SHIFT;
				4523	size = data_len + (size & ~PAGE_MASK);
				4524	}
				4525	skb = alloc_skb_with_frags(size - data_len, data_len,
				4526	PAGE_ALLOC_COSTLY_ORDER,
				4527	&err, sk->sk_allocation);
				4528	if (!skb)
				4529	goto err;
				4530
				4531	skb_put(skb, size - data_len);
				4532	skb->data_len = data_len;
				4533	skb->len = size;
				4534
				4535	if (tcp_try_rmem_schedule(sk, skb, skb->truesize))
				4536	goto err_free;
				4537
				4538	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
				4539	if (err)
				4540	goto err_free;
				4541
				4542	TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt;
				4543	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size;
				4544	TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1;
				4545
				4546	if (tcp_queue_rcv(sk, skb, 0, &fragstolen)) {
				4547	WARN_ON_ONCE(fragstolen); /* should not happen */
				4548	__kfree_skb(skb);
				4549	}
				4550	return size;
				4551
				4552	err_free:
				4553	kfree_skb(skb);
				4554	err:
				4555	return err;
				4556
				4557	}
				4558
				4559	static void tcp_data_queue(struct sock sk, struct sk_buff skb)
				4560	{
				4561	struct tcp_sock *tp = tcp_sk(sk);
				4562	int eaten = -1;
				4563	bool fragstolen = false;
				4564
				4565	if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq)
				4566	goto drop;
				4567
				4568	skb_dst_drop(skb);
				4569	__skb_pull(skb, tcp_hdr(skb)->doff * 4);
				4570
				4571	tcp_ecn_accept_cwr(tp, skb);
				4572
				4573	tp->rx_opt.dsack = 0;
				4574
				4575	/* Queue data for delivery to the user.
				4576	* Packets in sequence go to the receive queue.
				4577	* Out of sequence packets to the out_of_order_queue.
				4578	*/
				4579	if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
				4580	if (tcp_receive_window(tp) == 0)
				4581	goto out_of_window;
				4582
				4583	/* Ok. In sequence. In window. */
				4584	if (tp->ucopy.task == current &&
				4585	tp->copied_seq == tp->rcv_nxt && tp->ucopy.len &&
				4586	sock_owned_by_user(sk) && !tp->urg_data) {
				4587	int chunk = min_t(unsigned int, skb->len,
				4588	tp->ucopy.len);
				4589
				4590	__set_current_state(TASK_RUNNING);
				4591
				4592	local_bh_enable();
				4593	if (!skb_copy_datagram_msg(skb, 0, tp->ucopy.msg, chunk)) {
				4594	tp->ucopy.len -= chunk;
				4595	tp->copied_seq += chunk;
				4596	eaten = (chunk == skb->len);
				4597	tcp_rcv_space_adjust(sk);
				4598	}
				4599	local_bh_disable();
				4600	}
				4601
				4602	if (eaten <= 0) {
				4603	queue_and_out:
				4604	if (eaten < 0) {
				4605	if (skb_queue_len(&sk->sk_receive_queue) == 0)
				4606	sk_forced_mem_schedule(sk, skb->truesize);
				4607	else if (tcp_try_rmem_schedule(sk, skb, skb->truesize))
				4608	goto drop;
				4609	}
				4610	eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
				4611	}
				4612	tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
				4613	if (skb->len)
				4614	tcp_event_data_recv(sk, skb);
				4615	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
				4616	tcp_fin(sk);
				4617
				4618	if (!skb_queue_empty(&tp->out_of_order_queue)) {
				4619	tcp_ofo_queue(sk);
				4620
				4621	/* RFC2581. 4.2. SHOULD send immediate ACK, when
				4622	* gap in queue is filled.
				4623	*/
				4624	if (skb_queue_empty(&tp->out_of_order_queue))
				4625	inet_csk(sk)->icsk_ack.pingpong = 0;
				4626	}
				4627
				4628	if (tp->rx_opt.num_sacks)
				4629	tcp_sack_remove(tp);
				4630
				4631	tcp_fast_path_check(sk);
				4632
				4633	if (eaten > 0)
				4634	kfree_skb_partial(skb, fragstolen);
				4635	if (!sock_flag(sk, SOCK_DEAD))
				4636	sk->sk_data_ready(sk);
				4637	return;
				4638	}
				4639
				4640	if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
				4641	/* A retransmit, 2nd most common case. Force an immediate ack. */
				4642	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
				4643	tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
				4644
				4645	out_of_window:
				4646	tcp_enter_quickack_mode(sk);
				4647	inet_csk_schedule_ack(sk);
				4648	drop:
				4649	__kfree_skb(skb);
				4650	return;
				4651	}
				4652
				4653	/* Out of window. F.e. zero window probe. */
				4654	if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp)))
				4655	goto out_of_window;
				4656
				4657	tcp_enter_quickack_mode(sk);
				4658
				4659	if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
				4660	/* Partial packet, seq < rcv_next < end_seq */
				4661	SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n",
				4662	tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
				4663	TCP_SKB_CB(skb)->end_seq);
				4664
				4665	tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, tp->rcv_nxt);
				4666
				4667	/* If window is closed, drop tail of packet. But after
				4668	* remembering D-SACK for its head made in previous line.
				4669	*/
				4670	if (!tcp_receive_window(tp))
				4671	goto out_of_window;
				4672	goto queue_and_out;
				4673	}
				4674
				4675	tcp_data_queue_ofo(sk, skb);
				4676	}
				4677
				4678	static struct sk_buff tcp_collapse_one(struct sock sk, struct sk_buff *skb,
				4679	struct sk_buff_head *list)
				4680	{
				4681	struct sk_buff *next = NULL;
				4682
				4683	if (!skb_queue_is_last(list, skb))
				4684	next = skb_queue_next(list, skb);
				4685
				4686	__skb_unlink(skb, list);
				4687	__kfree_skb(skb);
				4688	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);
				4689
				4690	return next;
				4691	}
				4692
				4693	/* Collapse contiguous sequence of skbs head..tail with
				4694	* sequence numbers start..end.
				4695	*
				4696	* If tail is NULL, this means until the end of the list.
				4697	*
				4698	* Segments with FIN/SYN are not collapsed (only because this
				4699	* simplifies code)
				4700	*/
				4701	static void
				4702	tcp_collapse(struct sock sk, struct sk_buff_head list,
				4703	struct sk_buff head, struct sk_buff tail,
				4704	u32 start, u32 end)
				4705	{
				4706	struct sk_buff skb, n;
				4707	bool end_of_skbs;
				4708
				4709	/* First, check that queue is collapsible and find
				4710	* the point where collapsing can be useful. */
				4711	skb = head;
				4712	restart:
				4713	end_of_skbs = true;
				4714	skb_queue_walk_from_safe(list, skb, n) {
				4715	if (skb == tail)
				4716	break;
				4717	/* No new bits? It is possible on ofo queue. */
				4718	if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
				4719	skb = tcp_collapse_one(sk, skb, list);
				4720	if (!skb)
				4721	break;
				4722	goto restart;
				4723	}
				4724
				4725	/* The first skb to collapse is:
				4726	* - not SYN/FIN and
				4727	* - bloated or contains data before "start" or
				4728	* overlaps to the next one.
				4729	*/
				4730	if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN \| TCPHDR_FIN)) &&
				4731	(tcp_win_from_space(skb->truesize) > skb->len \|\|
				4732	before(TCP_SKB_CB(skb)->seq, start))) {
				4733	end_of_skbs = false;
				4734	break;
				4735	}
				4736
				4737	if (!skb_queue_is_last(list, skb)) {
				4738	struct sk_buff *next = skb_queue_next(list, skb);
				4739	if (next != tail &&
				4740	TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(next)->seq) {
				4741	end_of_skbs = false;
				4742	break;
				4743	}
				4744	}
				4745
				4746	/* Decided to skip this, advance start seq. */
				4747	start = TCP_SKB_CB(skb)->end_seq;
				4748	}
				4749	if (end_of_skbs \|\|
				4750	(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN \| TCPHDR_FIN)))
				4751	return;
				4752
				4753	while (before(start, end)) {
				4754	int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start);
				4755	struct sk_buff *nskb;
				4756
				4757	nskb = alloc_skb(copy, GFP_ATOMIC);
				4758	if (!nskb)
				4759	return;
				4760
				4761	memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
				4762	TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
				4763	__skb_queue_before(list, skb, nskb);
				4764	skb_set_owner_r(nskb, sk);
				4765
				4766	/* Copy data, releasing collapsed skbs. */
				4767	while (copy > 0) {
				4768	int offset = start - TCP_SKB_CB(skb)->seq;
				4769	int size = TCP_SKB_CB(skb)->end_seq - start;
				4770
				4771	BUG_ON(offset < 0);
				4772	if (size > 0) {
				4773	size = min(copy, size);
				4774	if (skb_copy_bits(skb, offset, skb_put(nskb, size), size))
				4775	BUG();
				4776	TCP_SKB_CB(nskb)->end_seq += size;
				4777	copy -= size;
				4778	start += size;
				4779	}
				4780	if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
				4781	skb = tcp_collapse_one(sk, skb, list);
				4782	if (!skb \|\|
				4783	skb == tail \|\|
				4784	(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN \| TCPHDR_FIN)))
				4785	return;
				4786	}
				4787	}
				4788	}
				4789	}
				4790
				4791	/* Collapse ofo queue. Algorithm: select contiguous sequence of skbs
				4792	* and tcp_collapse() them until all the queue is collapsed.
				4793	*/
				4794	static void tcp_collapse_ofo_queue(struct sock *sk)
				4795	{
				4796	struct tcp_sock *tp = tcp_sk(sk);
				4797	struct sk_buff *skb = skb_peek(&tp->out_of_order_queue);
				4798	struct sk_buff *head;
				4799	u32 start, end;
				4800
				4801	if (!skb)
				4802	return;
				4803
				4804	start = TCP_SKB_CB(skb)->seq;
				4805	end = TCP_SKB_CB(skb)->end_seq;
				4806	head = skb;
				4807
				4808	for (;;) {
				4809	struct sk_buff *next = NULL;
				4810
				4811	if (!skb_queue_is_last(&tp->out_of_order_queue, skb))
				4812	next = skb_queue_next(&tp->out_of_order_queue, skb);
				4813	skb = next;
				4814
				4815	/* Segment is terminated when we see gap or when
				4816	* we are at the end of all the queue. */
				4817	if (!skb \|\|
				4818	after(TCP_SKB_CB(skb)->seq, end) \|\|
				4819	before(TCP_SKB_CB(skb)->end_seq, start)) {
				4820	tcp_collapse(sk, &tp->out_of_order_queue,
				4821	head, skb, start, end);
				4822	head = skb;
				4823	if (!skb)
				4824	break;
				4825	/* Start new segment */
				4826	start = TCP_SKB_CB(skb)->seq;
				4827	end = TCP_SKB_CB(skb)->end_seq;
				4828	} else {
				4829	if (before(TCP_SKB_CB(skb)->seq, start))
				4830	start = TCP_SKB_CB(skb)->seq;
				4831	if (after(TCP_SKB_CB(skb)->end_seq, end))
				4832	end = TCP_SKB_CB(skb)->end_seq;
				4833	}
				4834	}
				4835	}
				4836
				4837	/*
				4838	* Purge the out-of-order queue.
				4839	* Return true if queue was pruned.
				4840	*/
				4841	static bool tcp_prune_ofo_queue(struct sock *sk)
				4842	{
				4843	struct tcp_sock *tp = tcp_sk(sk);
				4844	bool res = false;
				4845
				4846	if (!skb_queue_empty(&tp->out_of_order_queue)) {
				4847	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED);
				4848	__skb_queue_purge(&tp->out_of_order_queue);
				4849
				4850	/* Reset SACK state. A conforming SACK implementation will
				4851	* do the same at a timeout based retransmit. When a connection
				4852	* is in a sad state like this, we care only about integrity
				4853	* of the connection not performance.
				4854	*/
				4855	if (tp->rx_opt.sack_ok)
				4856	tcp_sack_reset(&tp->rx_opt);
				4857	sk_mem_reclaim(sk);
				4858	res = true;
				4859	}
				4860	return res;
				4861	}
				4862
				4863	/* Reduce allocated memory if we can, trying to get
				4864	* the socket within its memory limits again.
				4865	*
				4866	* Return less than zero if we should start dropping frames
				4867	* until the socket owning process reads some of the data
				4868	* to stabilize the situation.
				4869	*/
				4870	static int tcp_prune_queue(struct sock *sk)
				4871	{
				4872	struct tcp_sock *tp = tcp_sk(sk);
				4873
				4874	SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq);
				4875
				4876	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PRUNECALLED);
				4877
				4878	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
				4879	tcp_clamp_window(sk);
				4880	else if (tcp_under_memory_pressure(sk))
				4881	tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
				4882
				4883	tcp_collapse_ofo_queue(sk);
				4884	if (!skb_queue_empty(&sk->sk_receive_queue))
				4885	tcp_collapse(sk, &sk->sk_receive_queue,
				4886	skb_peek(&sk->sk_receive_queue),
				4887	NULL,
				4888	tp->copied_seq, tp->rcv_nxt);
				4889	sk_mem_reclaim(sk);
				4890
				4891	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
				4892	return 0;
				4893
				4894	/* Collapsing did not help, destructive actions follow.
				4895	* This must not ever occur. */
				4896
				4897	tcp_prune_ofo_queue(sk);
				4898
				4899	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
				4900	return 0;
				4901
				4902	/* If we are really being abused, tell the caller to silently
				4903	* drop receive data on the floor. It will get retransmitted
				4904	* and hopefully then we'll have sufficient space.
				4905	*/
				4906	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_RCVPRUNED);
				4907
				4908	/* Massive buffer overcommit. */
				4909	tp->pred_flags = 0;
				4910	return -1;
				4911	}
				4912
				4913	static bool tcp_should_expand_sndbuf(const struct sock *sk)
				4914	{
				4915	const struct tcp_sock *tp = tcp_sk(sk);
				4916
				4917	/* If the user specified a specific send buffer setting, do
				4918	* not modify it.
				4919	*/
				4920	if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
				4921	return false;
				4922
				4923	/* If we are under global TCP memory pressure, do not expand. */
				4924	if (tcp_under_memory_pressure(sk))
				4925	return false;
				4926
				4927	/* If we are under soft global TCP memory pressure, do not expand. */
				4928	if (sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, 0))
				4929	return false;
				4930
				4931	/* If we filled the congestion window, do not expand. */
				4932	if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
				4933	return false;
				4934
				4935	return true;
				4936	}
				4937
				4938	/* When incoming ACK allowed to free some skb from write_queue,
				4939	* we remember this event in flag SOCK_QUEUE_SHRUNK and wake up socket
				4940	* on the exit from tcp input handler.
				4941	*
				4942	* PROBLEM: sndbuf expansion does not work well with largesend.
				4943	*/
				4944	static void tcp_new_space(struct sock *sk)
				4945	{
				4946	struct tcp_sock *tp = tcp_sk(sk);
				4947
				4948	if (tcp_should_expand_sndbuf(sk)) {
				4949	tcp_sndbuf_expand(sk);
				4950	tp->snd_cwnd_stamp = tcp_time_stamp;
				4951	}
				4952
				4953	sk->sk_write_space(sk);
				4954	}
				4955
				4956	static void tcp_check_space(struct sock *sk)
				4957	{
				4958	if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) {
				4959	sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
				4960	/* pairs with tcp_poll() */
				4961	smp_mb__after_atomic();
				4962	if (sk->sk_socket &&
				4963	test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
				4964	tcp_new_space(sk);
				4965	}
				4966	}
				4967
				/* Send-side housekeeping: push pending frames first, then run the
				 * write-space check.
				 */
				static inline void tcp_data_snd_check(struct sock *sk)
				{
					tcp_push_pending_frames(sk);

					tcp_check_space(sk);
				}
				4973
				4974	/*
				4975	* Check if sending an ack is needed.
				4976	*/
				4977	static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
				4978	{
				4979	struct tcp_sock *tp = tcp_sk(sk);
				4980
				4981	/* More than one full frame received... */
				4982	if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
				4983	/* ... and right edge of window advances far enough.
				4984	* (tcp_recvmsg() will send ACK otherwise). Or...
				4985	*/
				4986	__tcp_select_window(sk) >= tp->rcv_wnd) \|\|
				4987	/* We ACK each frame or... */
				4988	tcp_in_quickack_mode(sk) \|\|
				4989	/* We have out of order data. */
				4990	(ofo_possible && skb_peek(&tp->out_of_order_queue))) {
				4991	/* Then ack it now */
				4992	tcp_send_ack(sk);
				4993	} else {
				4994	/* Else, send delayed ack. */
				4995	tcp_send_delayed_ack(sk);
				4996	}
				4997	}
				4998
				/* Run the ACK decision only when an ACK is scheduled.  When none is
				 * scheduled (the original comment's reason: "We sent a data segment
				 * already") nothing is done.
				 */
				static inline void tcp_ack_snd_check(struct sock *sk)
				{
					if (inet_csk_ack_scheduled(sk))
						__tcp_ack_snd_check(sk, 1);
				}
				5007
				5008	/*
				5009	* This routine is only called when we have urgent data
				5010	* signaled. Its the 'slow' part of tcp_urg. It could be
				5011	* moved inline now as tcp_urg is only called from one
				5012	* place. We handle URGent data wrong. We have to - as
				5013	* BSD still doesn't use the correction from RFC961.
				5014	* For 1003.1g we should support a new option TCP_STDURG to permit
				5015	* either form (or just set the sysctl tcp_stdurg).
				5016	*/
				5017
				5018	static void tcp_check_urg(struct sock sk, const struct tcphdr th)
				5019	{
				5020	struct tcp_sock *tp = tcp_sk(sk);
				5021	u32 ptr = ntohs(th->urg_ptr);
				5022
				5023	if (ptr && !sysctl_tcp_stdurg)
				5024	ptr--;
				5025	ptr += ntohl(th->seq);
				5026
				5027	/* Ignore urgent data that we've already seen and read. */
				5028	if (after(tp->copied_seq, ptr))
				5029	return;
				5030
				5031	/* Do not replay urg ptr.
				5032	*
				5033	* NOTE: interesting situation not covered by specs.
				5034	* Misbehaving sender may send urg ptr, pointing to segment,
				5035	* which we already have in ofo queue. We are not able to fetch
				5036	* such data and will stay in TCP_URG_NOTYET until will be eaten
				5037	* by recvmsg(). Seems, we are not obliged to handle such wicked
				5038	* situations. But it is worth to think about possibility of some
				5039	* DoSes using some hypothetical application level deadlock.
				5040	*/
				5041	if (before(ptr, tp->rcv_nxt))
				5042	return;
				5043
				5044	/* Do we already have a newer (or duplicate) urgent pointer? */
				5045	if (tp->urg_data && !after(ptr, tp->urg_seq))
				5046	return;
				5047
				5048	/* Tell the world about our new urgent pointer. */
				5049	sk_send_sigurg(sk);
				5050
				5051	/* We may be adding urgent data when the last byte read was
				5052	* urgent. To do this requires some care. We cannot just ignore
				5053	* tp->copied_seq since we would read the last urgent byte again
				5054	* as data, nor can we alter copied_seq until this data arrives
				5055	* or we break the semantics of SIOCATMARK (and thus sockatmark())
				5056	*
				5057	* NOTE. Double Dutch. Rendering to plain English: author of comment
				5058	* above did something sort of send("A", MSG_OOB); send("B", MSG_OOB);
				5059	* and expect that both A and B disappear from stream. This is _wrong_.
				5060	* Though this happens in BSD with high probability, this is occasional.
				5061	* Any application relying on this is buggy. Note also, that fix "works"
				5062	* only in this artificial test. Insert some normal data between A and B and we will
				5063	* decline of BSD again. Verdict: it is better to remove to trap
				5064	* buggy users.
				5065	*/
				5066	if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
				5067	!sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
				5068	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
				5069	tp->copied_seq++;
				5070	if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) {
				5071	__skb_unlink(skb, &sk->sk_receive_queue);
				5072	__kfree_skb(skb);
				5073	}
				5074	}
				5075
				5076	tp->urg_data = TCP_URG_NOTYET;
				5077	tp->urg_seq = ptr;
				5078
				5079	/* Disable header prediction. */
				5080	tp->pred_flags = 0;
				5081	}
				5082
				5083	/* This is the 'fast' part of urgent handling. */
				5084	static void tcp_urg(struct sock sk, struct sk_buff skb, const struct tcphdr *th)
				5085	{
				5086	struct tcp_sock *tp = tcp_sk(sk);
				5087
				5088	/* Check if we get a new urgent pointer - normally not. */
				5089	if (th->urg)
				5090	tcp_check_urg(sk, th);
				5091
				5092	/* Do we wait for any urgent data? - normally not... */
				5093	if (tp->urg_data == TCP_URG_NOTYET) {
				5094	u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff * 4) -
				5095	th->syn;
				5096
				5097	/* Is the urgent pointer pointing into this packet? */
				5098	if (ptr < skb->len) {
				5099	u8 tmp;
				5100	if (skb_copy_bits(skb, ptr, &tmp, 1))
				5101	BUG();
				5102	tp->urg_data = TCP_URG_VALID \| tmp;
				5103	if (!sock_flag(sk, SOCK_DEAD))
				5104	sk->sk_data_ready(sk);
				5105	}
				5106	}
				5107	}
				5108
				5109	static int tcp_copy_to_iovec(struct sock sk, struct sk_buff skb, int hlen)
				5110	{
				5111	struct tcp_sock *tp = tcp_sk(sk);
				5112	int chunk = skb->len - hlen;
				5113	int err;
				5114
				5115	local_bh_enable();
				5116	if (skb_csum_unnecessary(skb))
				5117	err = skb_copy_datagram_msg(skb, hlen, tp->ucopy.msg, chunk);
				5118	else
				5119	err = skb_copy_and_csum_datagram_msg(skb, hlen, tp->ucopy.msg);
				5120
				5121	if (!err) {
				5122	tp->ucopy.len -= chunk;
				5123	tp->copied_seq += chunk;
				5124	tcp_rcv_space_adjust(sk);
				5125	}
				5126
				5127	local_bh_disable();
				5128	return err;
				5129	}
				5130
				5131	static __sum16 __tcp_checksum_complete_user(struct sock *sk,
				5132	struct sk_buff *skb)
				5133	{
				5134	__sum16 result;
				5135
				5136	if (sock_owned_by_user(sk)) {
				5137	local_bh_enable();
				5138	result = __tcp_checksum_complete(skb);
				5139	local_bh_disable();
				5140	} else {
				5141	result = __tcp_checksum_complete(skb);
				5142	}
				5143	return result;
				5144	}
				5145
				5146	static inline bool tcp_checksum_complete_user(struct sock *sk,
				5147	struct sk_buff *skb)
				5148	{
				5149	return !skb_csum_unnecessary(skb) &&
				5150	__tcp_checksum_complete_user(sk, skb);
				5151	}
				5152
				5153	/* Does PAWS and seqno based validation of an incoming segment, flags will
				5154	* play significant role here.
				5155	*/
				5156	static bool tcp_validate_incoming(struct sock sk, struct sk_buff skb,
				5157	const struct tcphdr *th, int syn_inerr)
				5158	{
				5159	struct tcp_sock *tp = tcp_sk(sk);
				5160
				5161	/* RFC1323: H1. Apply PAWS check first. */
				5162	if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
				5163	tcp_paws_discard(sk, skb)) {
				5164	if (!th->rst) {
				5165	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
				5166	if (!tcp_oow_rate_limited(sock_net(sk), skb,
				5167	LINUX_MIB_TCPACKSKIPPEDPAWS,
				5168	&tp->last_oow_ack_time))
				5169	tcp_send_dupack(sk, skb);
				5170	goto discard;
				5171	}
				5172	/* Reset is accepted even if it did not pass PAWS. */
				5173	}
				5174
				5175	/* Step 1: check sequence number */
				5176	if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
				5177	/* RFC793, page 37: "In all states except SYN-SENT, all reset
				5178	* (RST) segments are validated by checking their SEQ-fields."
				5179	* And page 69: "If an incoming segment is not acceptable,
				5180	* an acknowledgment should be sent in reply (unless the RST
				5181	* bit is set, if so drop the segment and return)".
				5182	*/
				5183	if (!th->rst) {
				5184	if (th->syn)
				5185	goto syn_challenge;
				5186	if (!tcp_oow_rate_limited(sock_net(sk), skb,
				5187	LINUX_MIB_TCPACKSKIPPEDSEQ,
				5188	&tp->last_oow_ack_time))
				5189	tcp_send_dupack(sk, skb);
				5190	}
				5191	goto discard;
				5192	}
				5193
				5194	/* Step 2: check RST bit */
				5195	if (th->rst) {
				5196	/* RFC 5961 3.2 :
				5197	* If sequence number exactly matches RCV.NXT, then
				5198	* RESET the connection
				5199	* else
				5200	* Send a challenge ACK
				5201	*/
				5202	if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt)
				5203	tcp_reset(sk);
				5204	else
				5205	tcp_send_challenge_ack(sk, skb);
				5206	goto discard;
				5207	}
				5208
				5209	/* step 3: check security and precedence [ignored] */
				5210
				5211	/* step 4: Check for a SYN
				5212	* RFC 5961 4.2 : Send a challenge ack
				5213	*/
				5214	if (th->syn) {
				5215	syn_challenge:
				5216	if (syn_inerr)
				5217	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
				5218	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
				5219	tcp_send_challenge_ack(sk, skb);
				5220	goto discard;
				5221	}
				5222
				5223	return true;
				5224
				5225	discard:
				5226	__kfree_skb(skb);
				5227	return false;
				5228	}
				5229
				5230	/*
				5231	* TCP receive function for the ESTABLISHED state.
				5232	*
				5233	* It is split into a fast path and a slow path. The fast path is
				5234	* disabled when:
				5235	* - A zero window was announced from us - zero window probing
				5236	* is only handled properly in the slow path.
				5237	* - Out of order segments arrived.
				5238	* - Urgent data is expected.
				5239	* - There is no buffer space left
				5240	* - Unexpected TCP flags/window values/header lengths are received
				5241	* (detected by checking the TCP header against pred_flags)
				5242	* - Data is sent in both directions. Fast path only supports pure senders
				5243	* or pure receivers (this means either the sequence number or the ack
				5244	* value must stay constant)
				5245	* - Unexpected TCP option.
				5246	*
				5247	* When these conditions are not satisfied it drops into a standard
				5248	* receive procedure patterned after RFC793 to handle all cases.
				5249	* The first three cases are guaranteed by proper pred_flags setting,
				5250	* the rest is checked inline. Fast processing is turned on in
				5251	* tcp_data_queue when everything is OK.
				5252	*/
				5253	void tcp_rcv_established(struct sock sk, struct sk_buff skb,
				5254	const struct tcphdr *th, unsigned int len)
				5255	{
				5256	struct tcp_sock *tp = tcp_sk(sk);
				5257
				5258	if (unlikely(!sk->sk_rx_dst))
				5259	inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb);
				5260	/*
				5261	* Header prediction.
				5262	* The code loosely follows the one in the famous
				5263	* "30 instruction TCP receive" Van Jacobson mail.
				5264	*
				5265	* Van's trick is to deposit buffers into socket queue
				5266	* on a device interrupt, to call tcp_recv function
				5267	* on the receive process context and checksum and copy
				5268	* the buffer to user space. smart...
				5269	*
				5270	* Our current scheme is not silly either but we take the
				5271	* extra cost of the net_bh soft interrupt processing...
				5272	* We do checksum and copy also but from device to kernel.
				5273	*/
				5274
				5275	tp->rx_opt.saw_tstamp = 0;
				5276
				5277	/* pred_flags is 0xS?10 << 16 + snd_wnd
				5278	* if header_prediction is to be made
				5279	* 'S' will always be tp->tcp_header_len >> 2
				5280	* '?' will be 0 for the fast path, otherwise pred_flags is 0 to
				5281	* turn it off (when there are holes in the receive
				5282	* space for instance)
				5283	* PSH flag is ignored.
				5284	*/
				5285
				5286	if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags &&
				5287	TCP_SKB_CB(skb)->seq == tp->rcv_nxt &&
				5288	!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
				5289	int tcp_header_len = tp->tcp_header_len;
				5290
				5291	/* Timestamp header prediction: tcp_header_len
				5292	* is automatically equal to th->doff*4 due to pred_flags
				5293	* match.
				5294	*/
				5295
				5296	/* Check timestamp */
				5297	if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
				5298	/* No? Slow path! */
				5299	if (!tcp_parse_aligned_timestamp(tp, th))
				5300	goto slow_path;
				5301
				5302	/* If PAWS failed, check it more carefully in slow path */
				5303	if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0)
				5304	goto slow_path;
				5305
				5306	/* DO NOT update ts_recent here, if checksum fails
				5307	* and timestamp was corrupted part, it will result
				5308	* in a hung connection since we will drop all
				5309	* future packets due to the PAWS test.
				5310	*/
				5311	}
				5312
				5313	if (len <= tcp_header_len) {
				5314	/* Bulk data transfer: sender */
				5315	if (len == tcp_header_len) {
				5316	/* Predicted packet is in window by definition.
				5317	* seq == rcv_nxt and rcv_wup <= rcv_nxt.
				5318	* Hence, check seq<=rcv_wup reduces to:
				5319	*/
				5320	if (tcp_header_len ==
				5321	(sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
				5322	tp->rcv_nxt == tp->rcv_wup)
				5323	tcp_store_ts_recent(tp);
				5324
				5325	/* We know that such packets are checksummed
				5326	* on entry.
				5327	*/
				5328	tcp_ack(sk, skb, 0);
				5329	__kfree_skb(skb);
				5330	tcp_data_snd_check(sk);
				5331	return;
				5332	} else { /* Header too small */
				5333	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
				5334	goto discard;
				5335	}
				5336	} else {
				5337	int eaten = 0;
				5338	bool fragstolen = false;
				5339
				5340	if (tp->ucopy.task == current &&
				5341	tp->copied_seq == tp->rcv_nxt &&
				5342	len - tcp_header_len <= tp->ucopy.len &&
				5343	sock_owned_by_user(sk)) {
				5344	__set_current_state(TASK_RUNNING);
				5345
				5346	if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) {
				5347	/* Predicted packet is in window by definition.
				5348	* seq == rcv_nxt and rcv_wup <= rcv_nxt.
				5349	* Hence, check seq<=rcv_wup reduces to:
				5350	*/
				5351	if (tcp_header_len ==
				5352	(sizeof(struct tcphdr) +
				5353	TCPOLEN_TSTAMP_ALIGNED) &&
				5354	tp->rcv_nxt == tp->rcv_wup)
				5355	tcp_store_ts_recent(tp);
				5356
				5357	tcp_rcv_rtt_measure_ts(sk, skb);
				5358
				5359	__skb_pull(skb, tcp_header_len);
				5360	tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
				5361	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER);
				5362	eaten = 1;
				5363	}
				5364	}
				5365	if (!eaten) {
				5366	if (tcp_checksum_complete_user(sk, skb))
				5367	goto csum_error;
				5368
				5369	if ((int)skb->truesize > sk->sk_forward_alloc)
				5370	goto step5;
				5371
				5372	/* Predicted packet is in window by definition.
				5373	* seq == rcv_nxt and rcv_wup <= rcv_nxt.
				5374	* Hence, check seq<=rcv_wup reduces to:
				5375	*/
				5376	if (tcp_header_len ==
				5377	(sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
				5378	tp->rcv_nxt == tp->rcv_wup)
				5379	tcp_store_ts_recent(tp);
				5380
				5381	tcp_rcv_rtt_measure_ts(sk, skb);
				5382
				5383	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS);
				5384
				5385	/* Bulk data transfer: receiver */
				5386	eaten = tcp_queue_rcv(sk, skb, tcp_header_len,
				5387	&fragstolen);
				5388	}
				5389
				5390	tcp_event_data_recv(sk, skb);
				5391
				5392	if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
				5393	/* Well, only one small jumplet in fast path... */
				5394	tcp_ack(sk, skb, FLAG_DATA);
				5395	tcp_data_snd_check(sk);
				5396	if (!inet_csk_ack_scheduled(sk))
				5397	goto no_ack;
				5398	}
				5399
				5400	__tcp_ack_snd_check(sk, 0);
				5401	no_ack:
				5402	if (eaten)
				5403	kfree_skb_partial(skb, fragstolen);
				5404	sk->sk_data_ready(sk);
				5405	return;
				5406	}
				5407	}
				5408
				5409	slow_path:
				5410	if (len < (th->doff << 2) \|\| tcp_checksum_complete_user(sk, skb))
				5411	goto csum_error;
				5412
				5413	if (!th->ack && !th->rst && !th->syn)
				5414	goto discard;
				5415
				5416	/*
				5417	* Standard slow path.
				5418	*/
				5419
				5420	if (!tcp_validate_incoming(sk, skb, th, 1))
				5421	return;
				5422
				5423	step5:
				5424	if (tcp_ack(sk, skb, FLAG_SLOWPATH \| FLAG_UPDATE_TS_RECENT) < 0)
				5425	goto discard;
				5426
				5427	tcp_rcv_rtt_measure_ts(sk, skb);
				5428
				5429	/* Process urgent data. */
				5430	tcp_urg(sk, skb, th);
				5431
				5432	/* step 7: process the segment text */
				5433	tcp_data_queue(sk, skb);
				5434
				5435	tcp_data_snd_check(sk);
				5436	tcp_ack_snd_check(sk);
				5437	return;
				5438
				5439	csum_error:
				5440	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
				5441	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
				5442
				5443	discard:
				5444	__kfree_skb(skb);
				5445	}
				5446	EXPORT_SYMBOL(tcp_rcv_established);
				5447
				5448	void tcp_finish_connect(struct sock sk, struct sk_buff skb)
				5449	{
				5450	struct tcp_sock *tp = tcp_sk(sk);
				5451	struct inet_connection_sock *icsk = inet_csk(sk);
				5452
				5453	tcp_set_state(sk, TCP_ESTABLISHED);
				5454	icsk->icsk_ack.lrcvtime = tcp_time_stamp;
				5455
				5456	if (skb) {
				5457	icsk->icsk_af_ops->sk_rx_dst_set(sk, skb);
				5458	security_inet_conn_established(sk, skb);
				5459	}
				5460
				5461	/* Make sure socket is routed, for correct metrics. */
				5462	icsk->icsk_af_ops->rebuild_header(sk);
				5463
				5464	tcp_init_metrics(sk);
				5465
				5466	tcp_init_congestion_control(sk);
				5467
				5468	/* Prevent spurious tcp_cwnd_restart() on first data
				5469	* packet.
				5470	*/
				5471	tp->lsndtime = tcp_time_stamp;
				5472
				5473	tcp_init_buffer_space(sk);
				5474
				5475	if (sock_flag(sk, SOCK_KEEPOPEN))
				5476	inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
				5477
				5478	if (!tp->rx_opt.snd_wscale)
				5479	__tcp_fast_path_on(tp, tp->snd_wnd);
				5480	else
				5481	tp->pred_flags = 0;
				5482
				5483	if (!sock_flag(sk, SOCK_DEAD)) {
				5484	sk->sk_state_change(sk);
				5485	sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
				5486	}
				5487	}
				5488
				5489	static bool tcp_rcv_fastopen_synack(struct sock sk, struct sk_buff synack,
				5490	struct tcp_fastopen_cookie *cookie)
				5491	{
				5492	struct tcp_sock *tp = tcp_sk(sk);
				5493	struct sk_buff *data = tp->syn_data ? tcp_write_queue_head(sk) : NULL;
				5494	u16 mss = tp->rx_opt.mss_clamp, try_exp = 0;
				5495	bool syn_drop = false;
				5496
				5497	if (mss == tp->rx_opt.user_mss) {
				5498	struct tcp_options_received opt;
				5499
				5500	/* Get original SYNACK MSS value if user MSS sets mss_clamp */
				5501	tcp_clear_options(&opt);
				5502	opt.user_mss = opt.mss_clamp = 0;
				5503	tcp_parse_options(synack, &opt, 0, NULL);
				5504	mss = opt.mss_clamp;
				5505	}
				5506
				5507	if (!tp->syn_fastopen) {
				5508	/* Ignore an unsolicited cookie */
				5509	cookie->len = -1;
				5510	} else if (tp->total_retrans) {
				5511	/* SYN timed out and the SYN-ACK neither has a cookie nor
				5512	* acknowledges data. Presumably the remote received only
				5513	* the retransmitted (regular) SYNs: either the original
				5514	* SYN-data or the corresponding SYN-ACK was dropped.
				5515	*/
				5516	syn_drop = (cookie->len < 0 && data);
				5517	} else if (cookie->len < 0 && !tp->syn_data) {
				5518	/* We requested a cookie but didn't get it. If we did not use
				5519	* the (old) exp opt format then try so next time (try_exp=1).
				5520	* Otherwise we go back to use the RFC7413 opt (try_exp=2).
				5521	*/
				5522	try_exp = tp->syn_fastopen_exp ? 2 : 1;
				5523	}
				5524
				5525	tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp);
				5526
				5527	if (data) { /* Retransmit unacked data in SYN */
				5528	tcp_for_write_queue_from(data, sk) {
				5529	if (data == tcp_send_head(sk) \|\|
				5530	__tcp_retransmit_skb(sk, data))
				5531	break;
				5532	}
				5533	tcp_rearm_rto(sk);
				5534	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL);
				5535	return true;
				5536	}
				5537	tp->syn_data_acked = tp->syn_data;
				5538	if (tp->syn_data_acked)
				5539	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);
				5540	return false;
				5541	}
				5542
				5543	static int tcp_rcv_synsent_state_process(struct sock sk, struct sk_buff skb,
				5544	const struct tcphdr *th)
				5545	{
				5546	struct inet_connection_sock *icsk = inet_csk(sk);
				5547	struct tcp_sock *tp = tcp_sk(sk);
				5548	struct tcp_fastopen_cookie foc = { .len = -1 };
				5549	int saved_clamp = tp->rx_opt.mss_clamp;
				5550
				5551	tcp_parse_options(skb, &tp->rx_opt, 0, &foc);
				5552	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
				5553	tp->rx_opt.rcv_tsecr -= tp->tsoffset;
				5554
				5555	if (th->ack) {
				5556	/* rfc793:
				5557	* "If the state is SYN-SENT then
				5558	* first check the ACK bit
				5559	* If the ACK bit is set
				5560	* If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send
				5561	* a reset (unless the RST bit is set, if so drop
				5562	* the segment and return)"
				5563	*/
				5564	if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) \|\|
				5565	after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt))
				5566	goto reset_and_undo;
				5567
				5568	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
				5569	!between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp,
				5570	tcp_time_stamp)) {
				5571	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSACTIVEREJECTED);
				5572	goto reset_and_undo;
				5573	}
				5574
				5575	/* Now ACK is acceptable.
				5576	*
				5577	* "If the RST bit is set
				5578	* If the ACK was acceptable then signal the user "error:
				5579	* connection reset", drop the segment, enter CLOSED state,
				5580	* delete TCB, and return."
				5581	*/
				5582
				5583	if (th->rst) {
				5584	tcp_reset(sk);
				5585	goto discard;
				5586	}
				5587
				5588	/* rfc793:
				5589	* "fifth, if neither of the SYN or RST bits is set then
				5590	* drop the segment and return."
				5591	*
				5592	* See note below!
				5593	* --ANK(990513)
				5594	*/
				5595	if (!th->syn)
				5596	goto discard_and_undo;
				5597
				5598	/* rfc793:
				5599	* "If the SYN bit is on ...
				5600	* are acceptable then ...
				5601	* (our SYN has been ACKed), change the connection
				5602	* state to ESTABLISHED..."
				5603	*/
				5604
				5605	tcp_ecn_rcv_synack(tp, th);
				5606
				5607	tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
				5608	tcp_ack(sk, skb, FLAG_SLOWPATH);
				5609
				5610	/* Ok.. it's good. Set up sequence numbers and
				5611	* move to established.
				5612	*/
				5613	tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
				5614	tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
				5615
				5616	/* RFC1323: The window in SYN & SYN/ACK segments is
				5617	* never scaled.
				5618	*/
				5619	tp->snd_wnd = ntohs(th->window);
				5620
				5621	if (!tp->rx_opt.wscale_ok) {
				5622	tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0;
				5623	tp->window_clamp = min(tp->window_clamp, 65535U);
				5624	}
				5625
				5626	if (tp->rx_opt.saw_tstamp) {
				5627	tp->rx_opt.tstamp_ok = 1;
				5628	tp->tcp_header_len =
				5629	sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
				5630	tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
				5631	tcp_store_ts_recent(tp);
				5632	} else {
				5633	tp->tcp_header_len = sizeof(struct tcphdr);
				5634	}
				5635
				5636	if (tcp_is_sack(tp) && sysctl_tcp_fack)
				5637	tcp_enable_fack(tp);
				5638
				5639	tcp_mtup_init(sk);
				5640	tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
				5641	tcp_initialize_rcv_mss(sk);
				5642
				5643	/* Remember, tcp_poll() does not lock socket!
				5644	* Change state from SYN-SENT only after copied_seq
				5645	* is initialized. */
				5646	tp->copied_seq = tp->rcv_nxt;
				5647
				5648	smp_mb();
				5649
				5650	tcp_finish_connect(sk, skb);
				5651
				5652	if ((tp->syn_fastopen \|\| tp->syn_data) &&
				5653	tcp_rcv_fastopen_synack(sk, skb, &foc))
				5654	return -1;
				5655
				5656	if (sk->sk_write_pending \|\|
				5657	icsk->icsk_accept_queue.rskq_defer_accept \|\|
				5658	icsk->icsk_ack.pingpong) {
				5659	/* Save one ACK. Data will be ready after
				5660	* several ticks, if write_pending is set.
				5661	*
				5662	* It may be deleted, but with this feature tcpdumps
				5663	* look so _wonderfully_ clever, that I was not able
				5664	* to stand against the temptation 8) --ANK
				5665	*/
				5666	inet_csk_schedule_ack(sk);
				5667	tcp_enter_quickack_mode(sk);
				5668	inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
				5669	TCP_DELACK_MAX, TCP_RTO_MAX);
				5670
				5671	discard:
				5672	__kfree_skb(skb);
				5673	return 0;
				5674	} else {
				5675	tcp_send_ack(sk);
				5676	}
				5677	return -1;
				5678	}
				5679
				5680	/* No ACK in the segment */
				5681
				5682	if (th->rst) {
				5683	/* rfc793:
				5684	* "If the RST bit is set
				5685	*
				5686	* Otherwise (no ACK) drop the segment and return."
				5687	*/
				5688
				5689	goto discard_and_undo;
				5690	}
				5691
				5692	/* PAWS check. */
				5693	if (tp->rx_opt.ts_recent_stamp && tp->rx_opt.saw_tstamp &&
				5694	tcp_paws_reject(&tp->rx_opt, 0))
				5695	goto discard_and_undo;
				5696
				5697	if (th->syn) {
				5698	/* We see SYN without ACK. It is attempt of
				5699	* simultaneous connect with crossed SYNs.
				5700	* Particularly, it can be connect to self.
				5701	*/
				5702	tcp_set_state(sk, TCP_SYN_RECV);
				5703
				5704	if (tp->rx_opt.saw_tstamp) {
				5705	tp->rx_opt.tstamp_ok = 1;
				5706	tcp_store_ts_recent(tp);
				5707	tp->tcp_header_len =
				5708	sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
				5709	} else {
				5710	tp->tcp_header_len = sizeof(struct tcphdr);
				5711	}
				5712
				5713	tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
				5714	tp->copied_seq = tp->rcv_nxt;
				5715	tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
				5716
				5717	/* RFC1323: The window in SYN & SYN/ACK segments is
				5718	* never scaled.
				5719	*/
				5720	tp->snd_wnd = ntohs(th->window);
				5721	tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
				5722	tp->max_window = tp->snd_wnd;
				5723
				5724	tcp_ecn_rcv_syn(tp, th);
				5725
				5726	tcp_mtup_init(sk);
				5727	tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
				5728	tcp_initialize_rcv_mss(sk);
				5729
				5730	tcp_send_synack(sk);
				5731	#if 0
				5732	/* Note, we could accept data and URG from this segment.
				5733	* There are no obstacles to make this (except that we must
				5734	* either change tcp_recvmsg() to prevent it from returning data
				5735	* before 3WHS completes per RFC793, or employ TCP Fast Open).
				5736	*
				5737	* However, if we ignore data in ACKless segments sometimes,
				5738	* we have no reasons to accept it sometimes.
				5739	* Also, seems the code doing it in step6 of tcp_rcv_state_process
				5740	* is not flawless. So, discard packet for sanity.
				5741	* Uncomment this return to process the data.
				5742	*/
				5743	return -1;
				5744	#else
				5745	goto discard;
				5746	#endif
				5747	}
				5748	/* "fifth, if neither of the SYN or RST bits is set then
				5749	* drop the segment and return."
				5750	*/
				5751
				5752	discard_and_undo:
				5753	tcp_clear_options(&tp->rx_opt);
				5754	tp->rx_opt.mss_clamp = saved_clamp;
				5755	goto discard;
				5756
				5757	reset_and_undo:
				5758	tcp_clear_options(&tp->rx_opt);
				5759	tp->rx_opt.mss_clamp = saved_clamp;
				5760	return 1;
				5761	}
				5762
				5763	/*
				5764	* This function implements the receiving procedure of RFC 793 for
				5765	* all states except ESTABLISHED and TIME_WAIT.
				5766	* It's called from both tcp_v4_rcv and tcp_v6_rcv and should be
				5767	* address independent.
				5768	*/
				5769
				5770	int tcp_rcv_state_process(struct sock sk, struct sk_buff skb)
				5771	{
				5772	struct tcp_sock *tp = tcp_sk(sk);
				5773	struct inet_connection_sock *icsk = inet_csk(sk);
				5774	const struct tcphdr *th = tcp_hdr(skb);
				5775	struct request_sock *req;
				5776	int queued = 0;
				5777	bool acceptable;
				5778
				5779	tp->rx_opt.saw_tstamp = 0;
				5780
				5781	switch (sk->sk_state) {
				5782	case TCP_CLOSE:
				5783	goto discard;
				5784
				5785	case TCP_LISTEN:
				5786	if (th->ack)
				5787	return 1;
				5788
				5789	if (th->rst)
				5790	goto discard;
				5791
				5792	if (th->syn) {
				5793	if (th->fin)
				5794	goto discard;
				5795	if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)
				5796	return 1;
				5797
				5798	/* Now we have several options: In theory there is
				5799	* nothing else in the frame. KA9Q has an option to
				5800	* send data with the syn, BSD accepts data with the
				5801	* syn up to the [to be] advertised window and
				5802	* Solaris 2.1 gives you a protocol error. For now
				5803	* we just ignore it, that fits the spec precisely
				5804	* and avoids incompatibilities. It would be nice in
				5805	* future to drop through and process the data.
				5806	*
				5807	* Now that TTCP is starting to be used we ought to
				5808	* queue this data.
				5809	* But, this leaves one open to an easy denial of
				5810	* service attack, and SYN cookies can't defend
				5811	* against this problem. So, we drop the data
				5812	* in the interest of security over speed unless
				5813	* it's still in use.
				5814	*/
				5815	kfree_skb(skb);
				5816	return 0;
				5817	}
				5818	goto discard;
				5819
				5820	case TCP_SYN_SENT:
				5821	queued = tcp_rcv_synsent_state_process(sk, skb, th);
				5822	if (queued >= 0)
				5823	return queued;
				5824
				5825	/* Do step6 onward by hand. */
				5826	tcp_urg(sk, skb, th);
				5827	__kfree_skb(skb);
				5828	tcp_data_snd_check(sk);
				5829	return 0;
				5830	}
				5831
				5832	req = tp->fastopen_rsk;
				5833	if (req) {
				5834	WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
				5835	sk->sk_state != TCP_FIN_WAIT1);
				5836
				5837	if (!tcp_check_req(sk, skb, req, true))
				5838	goto discard;
				5839	}
				5840
				5841	if (!th->ack && !th->rst && !th->syn)
				5842	goto discard;
				5843
				5844	if (!tcp_validate_incoming(sk, skb, th, 0))
				5845	return 0;
				5846
				5847	/* step 5: check the ACK field */
				5848	acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH \|
				5849	FLAG_UPDATE_TS_RECENT) > 0;
				5850
				5851	switch (sk->sk_state) {
				5852	case TCP_SYN_RECV:
				5853	if (!acceptable)
				5854	return 1;
				5855
				5856	if (!tp->srtt_us)
				5857	tcp_synack_rtt_meas(sk, req);
				5858
				5859	/* Once we leave TCP_SYN_RECV, we no longer need req
				5860	* so release it.
				5861	*/
				5862	if (req) {
				5863	tp->total_retrans = req->num_retrans;
				5864	reqsk_fastopen_remove(sk, req, false);
				5865	} else {
				5866	/* Make sure socket is routed, for correct metrics. */
				5867	icsk->icsk_af_ops->rebuild_header(sk);
				5868	tcp_init_congestion_control(sk);
				5869
				5870	tcp_mtup_init(sk);
				5871	tp->copied_seq = tp->rcv_nxt;
				5872	tcp_init_buffer_space(sk);
				5873	}
				5874	smp_mb();
				5875	tcp_set_state(sk, TCP_ESTABLISHED);
				5876	sk->sk_state_change(sk);
				5877
				5878	/* Note, that this wakeup is only for marginal crossed SYN case.
				5879	* Passively open sockets are not waked up, because
				5880	* sk->sk_sleep == NULL and sk->sk_socket == NULL.
				5881	*/
				5882	if (sk->sk_socket)
				5883	sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
				5884
				5885	tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
				5886	tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
				5887	tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
				5888
				5889	if (tp->rx_opt.tstamp_ok)
				5890	tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
				5891
				5892	if (req) {
				5893	/* Re-arm the timer because data may have been sent out.
				5894	* This is similar to the regular data transmission case
				5895	* when new data has just been ack'ed.
				5896	*
				5897	* (TFO) - we could try to be more aggressive and
				5898	* retransmitting any data sooner based on when they
				5899	* are sent out.
				5900	*/
				5901	tcp_rearm_rto(sk);
				5902	} else
				5903	tcp_init_metrics(sk);
				5904
				5905	tcp_update_pacing_rate(sk);
				5906
				5907	/* Prevent spurious tcp_cwnd_restart() on first data packet */
				5908	tp->lsndtime = tcp_time_stamp;
				5909
				5910	tcp_initialize_rcv_mss(sk);
				5911	tcp_fast_path_on(tp);
				5912	break;
				5913
				5914	case TCP_FIN_WAIT1: {
				5915	struct dst_entry *dst;
				5916	int tmo;
				5917
				5918	/* If we enter the TCP_FIN_WAIT1 state and we are a
				5919	* Fast Open socket and this is the first acceptable
				5920	* ACK we have received, this would have acknowledged
				5921	* our SYNACK so stop the SYNACK timer.
				5922	*/
				5923	if (req) {
				5924	/* Return RST if ack_seq is invalid.
				5925	* Note that RFC793 only says to generate a
				5926	* DUPACK for it but for TCP Fast Open it seems
				5927	* better to treat this case like TCP_SYN_RECV
				5928	* above.
				5929	*/
				5930	if (!acceptable)
				5931	return 1;
				5932	/* We no longer need the request sock. */
				5933	reqsk_fastopen_remove(sk, req, false);
				5934	tcp_rearm_rto(sk);
				5935	}
				5936	if (tp->snd_una != tp->write_seq)
				5937	break;
				5938
				5939	tcp_set_state(sk, TCP_FIN_WAIT2);
				5940	sk->sk_shutdown \|= SEND_SHUTDOWN;
				5941
				5942	dst = __sk_dst_get(sk);
				5943	if (dst)
				5944	dst_confirm(dst);
				5945
				5946	if (!sock_flag(sk, SOCK_DEAD)) {
				5947	/* Wake up lingering close() */
				5948	sk->sk_state_change(sk);
				5949	break;
				5950	}
				5951
				5952	if (tp->linger2 < 0 \|\|
				5953	(TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
				5954	after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) {
				5955	tcp_done(sk);
				5956	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
				5957	return 1;
				5958	}
				5959
				5960	tmo = tcp_fin_time(sk);
				5961	if (tmo > TCP_TIMEWAIT_LEN) {
				5962	inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
				5963	} else if (th->fin \|\| sock_owned_by_user(sk)) {
				5964	/* Bad case. We could lose such FIN otherwise.
				5965	* It is not a big problem, but it looks confusing
				5966	* and not so rare event. We still can lose it now,
				5967	* if it spins in bh_lock_sock(), but it is really
				5968	* marginal case.
				5969	*/
				5970	inet_csk_reset_keepalive_timer(sk, tmo);
				5971	} else {
				5972	tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
				5973	goto discard;
				5974	}
				5975	break;
				5976	}
				5977
				5978	case TCP_CLOSING:
				5979	if (tp->snd_una == tp->write_seq) {
				5980	tcp_time_wait(sk, TCP_TIME_WAIT, 0);
				5981	goto discard;
				5982	}
				5983	break;
				5984
				5985	case TCP_LAST_ACK:
				5986	if (tp->snd_una == tp->write_seq) {
				5987	tcp_update_metrics(sk);
				5988	tcp_done(sk);
				5989	goto discard;
				5990	}
				5991	break;
				5992	}
				5993
				5994	/* step 6: check the URG bit */
				5995	tcp_urg(sk, skb, th);
				5996
				5997	/* step 7: process the segment text */
				5998	switch (sk->sk_state) {
				5999	case TCP_CLOSE_WAIT:
				6000	case TCP_CLOSING:
				6001	case TCP_LAST_ACK:
				6002	if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
				6003	break;
				6004	case TCP_FIN_WAIT1:
				6005	case TCP_FIN_WAIT2:
				6006	/* RFC 793 says to queue data in these states,
				6007	* RFC 1122 says we MUST send a reset.
				6008	* BSD 4.4 also does reset.
				6009	*/
				6010	if (sk->sk_shutdown & RCV_SHUTDOWN) {
				6011	if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
				6012	after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
				6013	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
				6014	tcp_reset(sk);
				6015	return 1;
				6016	}
				6017	}
				6018	/* Fall through */
				6019	case TCP_ESTABLISHED:
				6020	tcp_data_queue(sk, skb);
				6021	queued = 1;
				6022	break;
				6023	}
				6024
				6025	/* tcp_data could move socket to TIME-WAIT */
				6026	if (sk->sk_state != TCP_CLOSE) {
				6027	tcp_data_snd_check(sk);
				6028	tcp_ack_snd_check(sk);
				6029	}
				6030
				6031	if (!queued) {
				6032	discard:
				6033	__kfree_skb(skb);
				6034	}
				6035	return 0;
				6036	}
				6037	EXPORT_SYMBOL(tcp_rcv_state_process);
				6038
				6039	static inline void pr_drop_req(struct request_sock *req, __u16 port, int family)
				6040	{
				6041	struct inet_request_sock *ireq = inet_rsk(req);
				6042
				6043	if (family == AF_INET)
				6044	net_dbg_ratelimited("drop open request from %pI4/%u\n",
				6045	&ireq->ir_rmt_addr, port);
				6046	#if IS_ENABLED(CONFIG_IPV6)
				6047	else if (family == AF_INET6)
				6048	net_dbg_ratelimited("drop open request from %pI6/%u\n",
				6049	&ireq->ir_v6_rmt_addr, port);
				6050	#endif
				6051	}
				6052
				6053	/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set
				6054	*
				6055	* If we receive a SYN packet with these bits set, it means a
				6056	* network is playing bad games with TOS bits. In order to
				6057	* avoid possible false congestion notifications, we disable
				6058	* TCP ECN negotiation.
				6059	*
				6060	* Exception: tcp_ca wants ECN. This is required for DCTCP
				6061	* congestion control: Linux DCTCP asserts ECT on all packets,
				6062	* including SYN, which is most optimal solution; however,
				6063	* others, such as FreeBSD do not.
				6064	*/
				6065	static void tcp_ecn_create_request(struct request_sock *req,
				6066	const struct sk_buff *skb,
				6067	const struct sock *listen_sk,
				6068	const struct dst_entry *dst)
				6069	{
				6070	const struct tcphdr *th = tcp_hdr(skb);
				6071	const struct net *net = sock_net(listen_sk);
				6072	bool th_ecn = th->ece && th->cwr;
				6073	bool ect, ecn_ok;
				6074	u32 ecn_ok_dst;
				6075
				6076	if (!th_ecn)
				6077	return;
				6078
				6079	ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield);
				6080	ecn_ok_dst = dst_feature(dst, DST_FEATURE_ECN_MASK);
				6081	ecn_ok = net->ipv4.sysctl_tcp_ecn \|\| ecn_ok_dst;
				6082
				6083	if ((!ect && ecn_ok) \|\| tcp_ca_needs_ecn(listen_sk) \|\|
				6084	(ecn_ok_dst & DST_FEATURE_ECN_CA))
				6085	inet_rsk(req)->ecn_ok = 1;
				6086	}
				6087
				6088	static void tcp_openreq_init(struct request_sock *req,
				6089	const struct tcp_options_received *rx_opt,
				6090	struct sk_buff skb, const struct sock sk)
				6091	{
				6092	struct inet_request_sock *ireq = inet_rsk(req);
				6093
				6094	req->rsk_rcv_wnd = 0; /* So that tcp_send_synack() knows! */
				6095	req->cookie_ts = 0;
				6096	tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
				6097	tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
				6098	skb_mstamp_get(&tcp_rsk(req)->snt_synack);
				6099	tcp_rsk(req)->last_oow_ack_time = 0;
				6100	req->mss = rx_opt->mss_clamp;
				6101	req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
				6102	ireq->tstamp_ok = rx_opt->tstamp_ok;
				6103	ireq->sack_ok = rx_opt->sack_ok;
				6104	ireq->snd_wscale = rx_opt->snd_wscale;
				6105	ireq->wscale_ok = rx_opt->wscale_ok;
				6106	ireq->acked = 0;
				6107	ireq->ecn_ok = 0;
				6108	ireq->ir_rmt_port = tcp_hdr(skb)->source;
				6109	ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
				6110	ireq->ir_mark = inet_request_mark(sk, skb);
				6111	}
				6112
				6113	struct request_sock inet_reqsk_alloc(const struct request_sock_ops ops,
				6114	struct sock *sk_listener,
				6115	bool attach_listener)
				6116	{
				6117	struct request_sock *req = reqsk_alloc(ops, sk_listener,
				6118	attach_listener);
				6119
				6120	if (req) {
				6121	struct inet_request_sock *ireq = inet_rsk(req);
				6122
				6123	kmemcheck_annotate_bitfield(ireq, flags);
				6124	ireq->ireq_opt = NULL;
				6125	atomic64_set(&ireq->ir_cookie, 0);
				6126	ireq->ireq_state = TCP_NEW_SYN_RECV;
				6127	write_pnet(&ireq->ireq_net, sock_net(sk_listener));
				6128	ireq->ireq_family = sk_listener->sk_family;
				6129	}
				6130
				6131	return req;
				6132	}
				6133	EXPORT_SYMBOL(inet_reqsk_alloc);
				6134
				6135	/*
				6136	* Return true if a syncookie should be sent
				6137	*/
				6138	static bool tcp_syn_flood_action(const struct sock *sk,
				6139	const struct sk_buff *skb,
				6140	const char *proto)
				6141	{
				6142	struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
				6143	const char *msg = "Dropping request";
				6144	bool want_cookie = false;
				6145
				6146	#ifdef CONFIG_SYN_COOKIES
				6147	if (sysctl_tcp_syncookies) {
				6148	msg = "Sending cookies";
				6149	want_cookie = true;
				6150	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
				6151	} else
				6152	#endif
				6153	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
				6154
				6155	if (!queue->synflood_warned &&
				6156	sysctl_tcp_syncookies != 2 &&
				6157	xchg(&queue->synflood_warned, 1) == 0)
				6158	pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n",
				6159	proto, ntohs(tcp_hdr(skb)->dest), msg);
				6160
				6161	return want_cookie;
				6162	}
				6163
				6164	static void tcp_reqsk_record_syn(const struct sock *sk,
				6165	struct request_sock *req,
				6166	const struct sk_buff *skb)
				6167	{
				6168	if (tcp_sk(sk)->save_syn) {
				6169	u32 len = skb_network_header_len(skb) + tcp_hdrlen(skb);
				6170	u32 *copy;
				6171
				6172	copy = kmalloc(len + sizeof(u32), GFP_ATOMIC);
				6173	if (copy) {
				6174	copy[0] = len;
				6175	memcpy(&copy[1], skb_network_header(skb), len);
				6176	req->saved_syn = copy;
				6177	}
				6178	}
				6179	}
				6180
				6181	int tcp_conn_request(struct request_sock_ops *rsk_ops,
				6182	const struct tcp_request_sock_ops *af_ops,
				6183	struct sock sk, struct sk_buff skb)
				6184	{
				6185	struct tcp_fastopen_cookie foc = { .len = -1 };
				6186	__u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn;
				6187	struct tcp_options_received tmp_opt;
				6188	struct tcp_sock *tp = tcp_sk(sk);
				6189	struct sock *fastopen_sk = NULL;
				6190	struct dst_entry *dst = NULL;
				6191	struct request_sock *req;
				6192	bool want_cookie = false;
				6193	struct flowi fl;
				6194
				6195	/* TW buckets are converted to open requests without
				6196	* limitations, they conserve resources and peer is
				6197	* evidently real one.
				6198	*/
				6199	if ((sysctl_tcp_syncookies == 2 \|\|
				6200	inet_csk_reqsk_queue_is_full(sk)) && !isn) {
				6201	want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name);
				6202	if (!want_cookie)
				6203	goto drop;
				6204	}
				6205
				6206
				6207	/* Accept backlog is full. If we have already queued enough
				6208	* of warm entries in syn queue, drop request. It is better than
				6209	* clogging syn queue with openreqs with exponentially increasing
				6210	* timeout.
				6211	*/
				6212	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
				6213	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
				6214	goto drop;
				6215	}
				6216
				6217	req = inet_reqsk_alloc(rsk_ops, sk, !want_cookie);
				6218	if (!req)
				6219	goto drop;
				6220
				6221	tcp_rsk(req)->af_specific = af_ops;
				6222
				6223	tcp_clear_options(&tmp_opt);
				6224	tmp_opt.mss_clamp = af_ops->mss_clamp;
				6225	tmp_opt.user_mss = tp->rx_opt.user_mss;
				6226	tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);
				6227
				6228	if (want_cookie && !tmp_opt.saw_tstamp)
				6229	tcp_clear_options(&tmp_opt);
				6230
				6231	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
				6232	tcp_openreq_init(req, &tmp_opt, skb, sk);
				6233
				6234	/* Note: tcp_v6_init_req() might override ir_iif for link locals */
				6235	inet_rsk(req)->ir_iif = sk->sk_bound_dev_if;
				6236
				6237	af_ops->init_req(req, sk, skb);
				6238
				6239	if (security_inet_conn_request(sk, skb, req))
				6240	goto drop_and_free;
				6241
				6242	if (!want_cookie && !isn) {
				6243	/* VJ's idea. We save last timestamp seen
				6244	* from the destination in peer table, when entering
				6245	* state TIME-WAIT, and check against it before
				6246	* accepting new connection request.
				6247	*
				6248	* If "isn" is not zero, this request hit alive
				6249	* timewait bucket, so that all the necessary checks
				6250	* are made in the function processing timewait state.
				6251	*/
				6252	if (tcp_death_row.sysctl_tw_recycle) {
				6253	bool strict;
				6254
				6255	dst = af_ops->route_req(sk, &fl, req, &strict);
				6256
				6257	if (dst && strict &&
				6258	!tcp_peer_is_proven(req, dst, true,
				6259	tmp_opt.saw_tstamp)) {
				6260	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
				6261	goto drop_and_release;
				6262	}
				6263	}
				6264	/* Kill the following clause, if you dislike this way. */
				6265	else if (!sysctl_tcp_syncookies &&
				6266	(sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
				6267	(sysctl_max_syn_backlog >> 2)) &&
				6268	!tcp_peer_is_proven(req, dst, false,
				6269	tmp_opt.saw_tstamp)) {
				6270	/* Without syncookies last quarter of
				6271	* backlog is filled with destinations,
				6272	* proven to be alive.
				6273	* It means that we continue to communicate
				6274	* to destinations, already remembered
				6275	* to the moment of synflood.
				6276	*/
				6277	pr_drop_req(req, ntohs(tcp_hdr(skb)->source),
				6278	rsk_ops->family);
				6279	goto drop_and_release;
				6280	}
				6281
				6282	isn = af_ops->init_seq(skb);
				6283	}
				6284	if (!dst) {
				6285	dst = af_ops->route_req(sk, &fl, req, NULL);
				6286	if (!dst)
				6287	goto drop_and_free;
				6288	}
				6289
				6290	tcp_ecn_create_request(req, skb, sk, dst);
				6291
				6292	if (want_cookie) {
				6293	isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
				6294	req->cookie_ts = tmp_opt.tstamp_ok;
				6295	if (!tmp_opt.tstamp_ok)
				6296	inet_rsk(req)->ecn_ok = 0;
				6297	}
				6298
				6299	tcp_rsk(req)->snt_isn = isn;
				6300	tcp_rsk(req)->txhash = net_tx_rndhash();
				6301	tcp_openreq_init_rwin(req, sk, dst);
				6302	if (!want_cookie) {
				6303	tcp_reqsk_record_syn(sk, req, skb);
				6304	fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst);
				6305	}
				6306	if (fastopen_sk) {
				6307	af_ops->send_synack(fastopen_sk, dst, &fl, req,
				6308	&foc, false);
				6309	/* Add the child socket directly into the accept queue */
				6310	inet_csk_reqsk_queue_add(sk, req, fastopen_sk);
				6311	sk->sk_data_ready(sk);
				6312	bh_unlock_sock(fastopen_sk);
				6313	sock_put(fastopen_sk);
				6314	} else {
				6315	tcp_rsk(req)->tfo_listener = false;
				6316	if (!want_cookie)
				6317	inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
				6318	af_ops->send_synack(sk, dst, &fl, req,
				6319	&foc, !want_cookie);
				6320	if (want_cookie)
				6321	goto drop_and_free;
				6322	}
				6323	reqsk_put(req);
				6324	return 0;
				6325
				6326	drop_and_release:
				6327	dst_release(dst);
				6328	drop_and_free:
				6329	reqsk_free(req);
				6330	drop:
				6331	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
				6332	return 0;
				6333	}
				6334	EXPORT_SYMBOL(tcp_conn_request);