blob: c137ea68108962aaab137b20f00dc8d90c2ea88c [file] [log] [blame]
Dave Barach68b0fb02017-02-28 15:15:56 -05001/*
Florin Corasc5df8c72019-04-08 07:42:30 -07002 * Copyright (c) 2016-2019 Cisco and/or its affiliates.
Dave Barach68b0fb02017-02-28 15:15:56 -05003 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at:
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16#ifndef included_tcp_packet_h
17#define included_tcp_packet_h
18
Neale Rannsf7040f02022-02-15 09:02:27 +000019#include <vnet/ip/ip4_packet.h>
20#include <vnet/ip/ip6_packet.h>
Dave Barach68b0fb02017-02-28 15:15:56 -050021
/*
 * TCP header flags, listed starting from bit 0 of the flags octet.
 *
 * This is an X-macro: define _(f) before expanding foreach_tcp_flag and
 * undefine it afterwards. The order of the entries fixes the bit positions
 * generated below, so new entries must not be inserted in the middle.
 */
#define foreach_tcp_flag                                                      \
  _ (FIN) /**< No more data from sender. */                                  \
  _ (SYN) /**< Synchronize sequence numbers. */                              \
  _ (RST) /**< Reset the connection. */                                      \
  _ (PSH) /**< Push function. */                                             \
  _ (ACK) /**< Ack field significant. */                                     \
  _ (URG) /**< Urgent pointer field significant. */                          \
  _ (ECE) /**< ECN-echo. Receiver got CE packet */                           \
  _ (CWR) /**< Sender reduced congestion window */

/** Bit index of each flag (TCP_FLAG_BIT_FIN == 0, ...) and the flag count. */
enum
{
#define _(f) TCP_FLAG_BIT_##f,
  foreach_tcp_flag
#undef _
    TCP_N_FLAG_BITS,
};

/** Single-bit mask of each flag, derived from its bit index. */
enum
{
#define _(f) TCP_FLAG_##f = (1 << TCP_FLAG_BIT_##f),
  foreach_tcp_flag
#undef _
};
47
48typedef struct _tcp_header
49{
50 union
51 {
52 struct
53 {
54 u16 src_port; /**< Source port. */
55 u16 dst_port; /**< Destination port. */
56 };
57 struct
58 {
59 u16 src, dst;
60 };
61 };
62
63 u32 seq_number; /**< Sequence number of the first data octet in this
64 * segment, except when SYN is present. If SYN
65 * is present the seq number is is the ISN and the
66 * first data octet is ISN+1 */
67 u32 ack_number; /**< Acknowledgement number if ACK is set. It contains
68 * the value of the next sequence number the sender
69 * of the segment is expecting to receive. */
70 u8 data_offset_and_reserved;
71 u8 flags; /**< Flags: see the macro above */
72 u16 window; /**< Number of bytes sender is willing to receive. */
73
74 u16 checksum; /**< Checksum of TCP pseudo header and data. */
75 u16 urgent_pointer; /**< Seq number of the byte after the urgent data. */
76} __attribute__ ((packed)) tcp_header_t;
77
/* Data offset: upper 4 bits of the data_offset_and_reserved octet */
#define tcp_doff(_th) ((_th)->data_offset_and_reserved >> 4)

/*
 * Flag tests that return 0 or !0. The non-zero result is the flag's bit
 * mask, not 1.
 */
#define tcp_fin(_th) ((_th)->flags & TCP_FLAG_FIN)
#define tcp_syn(_th) ((_th)->flags & TCP_FLAG_SYN)
#define tcp_rst(_th) ((_th)->flags & TCP_FLAG_RST)
#define tcp_psh(_th) ((_th)->flags & TCP_FLAG_PSH)
#define tcp_ack(_th) ((_th)->flags & TCP_FLAG_ACK)
#define tcp_urg(_th) ((_th)->flags & TCP_FLAG_URG)
#define tcp_ece(_th) ((_th)->flags & TCP_FLAG_ECE)
#define tcp_cwr(_th) ((_th)->flags & TCP_FLAG_CWR)
88
/*
 * Flag tests that return 0 or 1. The whole expansion is parenthesized so the
 * macros can be used safely inside larger expressions.
 */
#define tcp_is_syn(_th) (!!((_th)->flags & TCP_FLAG_SYN))
#define tcp_is_fin(_th) (!!((_th)->flags & TCP_FLAG_FIN))
92
93always_inline int
94tcp_header_bytes (tcp_header_t * t)
95{
96 return tcp_doff (t) * sizeof (u32);
97}
98
99/*
100 * TCP options.
101 */
102
/** Option kind values, i.e. the first octet of each TCP option. */
typedef enum tcp_option_type
{
  TCP_OPTION_EOL = 0,            /**< End of options. */
  TCP_OPTION_NOOP = 1,           /**< No operation. */
  TCP_OPTION_MSS = 2,            /**< Limit MSS. */
  TCP_OPTION_WINDOW_SCALE = 3,   /**< Window scale. */
  TCP_OPTION_SACK_PERMITTED = 4, /**< Selective Ack permitted. */
  TCP_OPTION_SACK_BLOCK = 5,     /**< Selective Ack block. */
  TCP_OPTION_TIMESTAMP = 8,      /**< Timestamps. */
  TCP_OPTION_UTO = 28,           /**< User timeout. */
  TCP_OPTION_AO = 29,            /**< Authentication Option. */
} tcp_option_type_t;
115
/*
 * Flags recording which options are present in a tcp_options_t.
 *
 * X-macro: define _(f) before expanding it and undefine it afterwards. The
 * order of the entries fixes the bit positions generated below.
 */
#define foreach_tcp_options_flag                                              \
  _ (MSS)            /**< MSS advertised in SYN */                           \
  _ (TSTAMP)         /**< Timestamp capability advertised in SYN */          \
  _ (WSCALE)         /**< Wnd scale capability advertised in SYN */          \
  _ (SACK_PERMITTED) /**< SACK capability advertised in SYN */               \
  _ (SACK)           /**< SACK present */

/** Bit index of each option flag and the number of option flags. */
enum
{
#define _(f) TCP_OPTS_FLAG_BIT_##f,
  foreach_tcp_options_flag
#undef _
    TCP_OPTIONS_N_FLAG_BITS,
};

/** Single-bit mask of each option flag, derived from its bit index. */
enum
{
#define _(f) TCP_OPTS_FLAG_##f = (1 << TCP_OPTS_FLAG_BIT_##f),
  foreach_tcp_options_flag
#undef _
};
137
138typedef struct _sack_block
139{
140 u32 start; /**< Start sequence number */
Florin Coras6792ec02017-03-13 03:49:51 -0700141 u32 end; /**< End sequence number (first outside) */
Dave Barach68b0fb02017-02-28 15:15:56 -0500142} sack_block_t;
143
144typedef struct
145{
Florin Coras64698402021-04-22 16:58:23 -0700146 sack_block_t *sacks; /**< SACK blocks */
Florin Corasf6359c82017-06-19 12:26:09 -0400147 u32 tsval; /**< Timestamp value */
Dave Barach68b0fb02017-02-28 15:15:56 -0500148 u32 tsecr; /**< Echoed/reflected time stamp */
Florin Coras64698402021-04-22 16:58:23 -0700149 u16 mss; /**< Maximum segment size advertised */
150 u8 flags; /**< Option flags, see above */
151 u8 wscale; /**< Window scale advertised */
Dave Barach68b0fb02017-02-28 15:15:56 -0500152 u8 n_sack_blocks; /**< Number of SACKs blocks */
153} tcp_options_t;
154
/*
 * Option flag tests that return 0 or !0. The non-zero result is the flag's
 * bit mask, not 1.
 */
#define tcp_opts_mss(_to)    ((_to)->flags & TCP_OPTS_FLAG_MSS)
#define tcp_opts_tstamp(_to) ((_to)->flags & TCP_OPTS_FLAG_TSTAMP)
#define tcp_opts_wscale(_to) ((_to)->flags & TCP_OPTS_FLAG_WSCALE)
#define tcp_opts_sack(_to)   ((_to)->flags & TCP_OPTS_FLAG_SACK)
#define tcp_opts_sack_permitted(_to)                                          \
  ((_to)->flags & TCP_OPTS_FLAG_SACK_PERMITTED)
161
/*
 * TCP option lengths in bytes. Except for EOL and NOOP, which are a single
 * octet, the length includes the kind and length octets.
 * TCP_OPTION_LEN_SACK_BLOCK is the size of one SACK block, i.e. two 32-bit
 * sequence numbers, without the option's kind and length octets.
 */
#define TCP_OPTION_LEN_EOL            1
#define TCP_OPTION_LEN_NOOP           1
#define TCP_OPTION_LEN_MSS            4
#define TCP_OPTION_LEN_WINDOW_SCALE   3
#define TCP_OPTION_LEN_SACK_PERMITTED 2
#define TCP_OPTION_LEN_TIMESTAMP      10
#define TCP_OPTION_LEN_SACK_BLOCK     8

/* Protocol and implementation limits */
#define TCP_HDR_LEN_MAX          60     /* Header plus options, in bytes */
#define TCP_WND_MAX              65535U /* Largest unscaled window */
#define TCP_MAX_WND_SCALE        14     /* See RFC 1323 */
#define TCP_OPTS_ALIGN           4
#define TCP_OPTS_MAX_SACK_BLOCKS 3
#define TCP_MAX_GSO_SZ           65536
Florin Coras999840c2020-03-18 20:31:34 +0000177
/*
 * Modulo arithmetic for TCP sequence numbers.
 *
 * Two values are compared through the sign of their 32-bit difference, so
 * the result stays correct when the sequence space wraps around.
 * Note that seq_max evaluates its arguments more than once.
 */
#define seq_lt(_s1, _s2)  ((i32) ((_s1) - (_s2)) < 0)
#define seq_leq(_s1, _s2) ((i32) ((_s1) - (_s2)) <= 0)
#define seq_gt(_s1, _s2)  ((i32) ((_s1) - (_s2)) > 0)
#define seq_geq(_s1, _s2) ((i32) ((_s1) - (_s2)) >= 0)
#define seq_max(_s1, _s2) (seq_gt ((_s1), (_s2)) ? (_s1) : (_s2))

/* Modulo arithmetic for timestamps, same scheme as for sequence numbers */
#define timestamp_lt(_t1, _t2)  ((i32) ((_t1) - (_t2)) < 0)
#define timestamp_leq(_t1, _t2) ((i32) ((_t1) - (_t2)) <= 0)
188
Neale Rannsf7040f02022-02-15 09:02:27 +0000189always_inline void
190ip4_tcp_reply_x1 (ip4_header_t *ip0, tcp_header_t *tcp0)
191{
192 u32 src0, dst0;
193
194 src0 = ip0->src_address.data_u32;
195 dst0 = ip0->dst_address.data_u32;
196 ip0->src_address.data_u32 = dst0;
197 ip0->dst_address.data_u32 = src0;
198
199 src0 = tcp0->src;
200 dst0 = tcp0->dst;
201 tcp0->src = dst0;
202 tcp0->dst = src0;
203}
204
205always_inline void
206ip4_tcp_reply_x2 (ip4_header_t *ip0, ip4_header_t *ip1, tcp_header_t *tcp0,
207 tcp_header_t *tcp1)
208{
209 u32 src0, dst0, src1, dst1;
210
211 src0 = ip0->src_address.data_u32;
212 src1 = ip1->src_address.data_u32;
213 dst0 = ip0->dst_address.data_u32;
214 dst1 = ip1->dst_address.data_u32;
215 ip0->src_address.data_u32 = dst0;
216 ip1->src_address.data_u32 = dst1;
217 ip0->dst_address.data_u32 = src0;
218 ip1->dst_address.data_u32 = src1;
219
220 src0 = tcp0->src;
221 src1 = tcp1->src;
222 dst0 = tcp0->dst;
223 dst1 = tcp1->dst;
224 tcp0->src = dst0;
225 tcp1->src = dst1;
226 tcp0->dst = src0;
227 tcp1->dst = src1;
228}
229
230always_inline void
231ip6_tcp_reply_x1 (ip6_header_t *ip0, tcp_header_t *tcp0)
232{
233 {
234 ip6_address_t src0, dst0;
235
236 src0 = ip0->src_address;
237 dst0 = ip0->dst_address;
238 ip0->src_address = dst0;
239 ip0->dst_address = src0;
240 }
241
242 {
243 u16 src0, dst0;
244
245 src0 = tcp0->src;
246 dst0 = tcp0->dst;
247 tcp0->src = dst0;
248 tcp0->dst = src0;
249 }
250}
251
252always_inline void
253ip6_tcp_reply_x2 (ip6_header_t *ip0, ip6_header_t *ip1, tcp_header_t *tcp0,
254 tcp_header_t *tcp1)
255{
256 {
257 ip6_address_t src0, dst0, src1, dst1;
258
259 src0 = ip0->src_address;
260 src1 = ip1->src_address;
261 dst0 = ip0->dst_address;
262 dst1 = ip1->dst_address;
263 ip0->src_address = dst0;
264 ip1->src_address = dst1;
265 ip0->dst_address = src0;
266 ip1->dst_address = src1;
267 }
268
269 {
270 u16 src0, dst0, src1, dst1;
271
272 src0 = tcp0->src;
273 src1 = tcp1->src;
274 dst0 = tcp0->dst;
275 dst1 = tcp1->dst;
276 tcp0->src = dst0;
277 tcp1->src = dst1;
278 tcp0->dst = src0;
279 tcp1->dst = src1;
280 }
281}
282
Florin Coras999840c2020-03-18 20:31:34 +0000283/**
284 * Parse TCP header options.
285 *
286 * @param th TCP header
287 * @param to TCP options data structure to be populated
288 * @param is_syn set if packet is syn
289 * @return -1 if parsing failed
290 */
291always_inline int
292tcp_options_parse (tcp_header_t * th, tcp_options_t * to, u8 is_syn)
293{
294 const u8 *data;
295 u8 opt_len, opts_len, kind;
296 int j;
297 sack_block_t b;
298
299 opts_len = (tcp_doff (th) << 2) - sizeof (tcp_header_t);
300 data = (const u8 *) (th + 1);
301
302 /* Zero out all flags but those set in SYN */
303 to->flags &= (TCP_OPTS_FLAG_SACK_PERMITTED | TCP_OPTS_FLAG_WSCALE
304 | TCP_OPTS_FLAG_TSTAMP | TCP_OPTS_FLAG_MSS);
305
306 for (; opts_len > 0; opts_len -= opt_len, data += opt_len)
307 {
308 kind = data[0];
309
310 /* Get options length */
311 if (kind == TCP_OPTION_EOL)
312 break;
313 else if (kind == TCP_OPTION_NOOP)
314 {
315 opt_len = 1;
316 continue;
317 }
318 else
319 {
320 /* broken options */
321 if (opts_len < 2)
322 return -1;
323 opt_len = data[1];
324
325 /* weird option length */
326 if (opt_len < 2 || opt_len > opts_len)
327 return -1;
328 }
329
330 /* Parse options */
331 switch (kind)
332 {
333 case TCP_OPTION_MSS:
334 if (!is_syn)
335 break;
336 if ((opt_len == TCP_OPTION_LEN_MSS) && tcp_syn (th))
337 {
338 to->flags |= TCP_OPTS_FLAG_MSS;
339 to->mss = clib_net_to_host_u16 (*(u16 *) (data + 2));
340 }
341 break;
342 case TCP_OPTION_WINDOW_SCALE:
343 if (!is_syn)
344 break;
345 if ((opt_len == TCP_OPTION_LEN_WINDOW_SCALE) && tcp_syn (th))
346 {
347 to->flags |= TCP_OPTS_FLAG_WSCALE;
348 to->wscale = data[2];
349 if (to->wscale > TCP_MAX_WND_SCALE)
350 to->wscale = TCP_MAX_WND_SCALE;
351 }
352 break;
353 case TCP_OPTION_TIMESTAMP:
354 if (is_syn)
355 to->flags |= TCP_OPTS_FLAG_TSTAMP;
356 if ((to->flags & TCP_OPTS_FLAG_TSTAMP)
357 && opt_len == TCP_OPTION_LEN_TIMESTAMP)
358 {
359 to->tsval = clib_net_to_host_u32 (*(u32 *) (data + 2));
360 to->tsecr = clib_net_to_host_u32 (*(u32 *) (data + 6));
361 }
362 break;
363 case TCP_OPTION_SACK_PERMITTED:
364 if (!is_syn)
365 break;
366 if (opt_len == TCP_OPTION_LEN_SACK_PERMITTED && tcp_syn (th))
367 to->flags |= TCP_OPTS_FLAG_SACK_PERMITTED;
368 break;
369 case TCP_OPTION_SACK_BLOCK:
370 /* If SACK permitted was not advertised or a SYN, break */
371 if ((to->flags & TCP_OPTS_FLAG_SACK_PERMITTED) == 0 || tcp_syn (th))
372 break;
373
374 /* If too short or not correctly formatted, break */
375 if (opt_len < 10 || ((opt_len - 2) % TCP_OPTION_LEN_SACK_BLOCK))
376 break;
377
378 to->flags |= TCP_OPTS_FLAG_SACK;
379 to->n_sack_blocks = (opt_len - 2) / TCP_OPTION_LEN_SACK_BLOCK;
380 vec_reset_length (to->sacks);
381 for (j = 0; j < to->n_sack_blocks; j++)
382 {
383 b.start = clib_net_to_host_u32 (*(u32 *) (data + 2 + 8 * j));
384 b.end = clib_net_to_host_u32 (*(u32 *) (data + 6 + 8 * j));
385 vec_add1 (to->sacks, b);
386 }
387 break;
388 default:
389 /* Nothing to see here */
390 continue;
391 }
392 }
393 return 0;
394}
395
396/**
397 * Write TCP options to segment.
398 *
399 * @param data buffer where to write the options
400 * @param opts options to write
401 * @return length of options written
402 */
403always_inline u32
404tcp_options_write (u8 * data, tcp_options_t * opts)
405{
406 u32 opts_len = 0;
407 u32 buf, seq_len = 4;
408
409 if (tcp_opts_mss (opts))
410 {
411 *data++ = TCP_OPTION_MSS;
412 *data++ = TCP_OPTION_LEN_MSS;
413 buf = clib_host_to_net_u16 (opts->mss);
414 clib_memcpy_fast (data, &buf, sizeof (opts->mss));
415 data += sizeof (opts->mss);
416 opts_len += TCP_OPTION_LEN_MSS;
417 }
418
419 if (tcp_opts_wscale (opts))
420 {
421 *data++ = TCP_OPTION_WINDOW_SCALE;
422 *data++ = TCP_OPTION_LEN_WINDOW_SCALE;
423 *data++ = opts->wscale;
424 opts_len += TCP_OPTION_LEN_WINDOW_SCALE;
425 }
426
427 if (tcp_opts_sack_permitted (opts))
428 {
429 *data++ = TCP_OPTION_SACK_PERMITTED;
430 *data++ = TCP_OPTION_LEN_SACK_PERMITTED;
431 opts_len += TCP_OPTION_LEN_SACK_PERMITTED;
432 }
433
434 if (tcp_opts_tstamp (opts))
435 {
436 *data++ = TCP_OPTION_TIMESTAMP;
437 *data++ = TCP_OPTION_LEN_TIMESTAMP;
438 buf = clib_host_to_net_u32 (opts->tsval);
439 clib_memcpy_fast (data, &buf, sizeof (opts->tsval));
440 data += sizeof (opts->tsval);
441 buf = clib_host_to_net_u32 (opts->tsecr);
442 clib_memcpy_fast (data, &buf, sizeof (opts->tsecr));
443 data += sizeof (opts->tsecr);
444 opts_len += TCP_OPTION_LEN_TIMESTAMP;
445 }
446
447 if (tcp_opts_sack (opts))
448 {
449 int i;
450
451 if (opts->n_sack_blocks != 0)
452 {
453 *data++ = TCP_OPTION_SACK_BLOCK;
454 *data++ = 2 + opts->n_sack_blocks * TCP_OPTION_LEN_SACK_BLOCK;
455 for (i = 0; i < opts->n_sack_blocks; i++)
456 {
457 buf = clib_host_to_net_u32 (opts->sacks[i].start);
458 clib_memcpy_fast (data, &buf, seq_len);
459 data += seq_len;
460 buf = clib_host_to_net_u32 (opts->sacks[i].end);
461 clib_memcpy_fast (data, &buf, seq_len);
462 data += seq_len;
463 }
464 opts_len += 2 + opts->n_sack_blocks * TCP_OPTION_LEN_SACK_BLOCK;
465 }
466 }
467
Florin Corasf9e500e2020-12-18 13:30:45 -0800468 /* Terminate TCP options by padding with NOPs to a u32 boundary. Avoid using
469 * EOL because, it seems, it can break peers with broken option parsers that
470 * rely on options ending on a u32 boundary.
471 */
Florin Coras999840c2020-03-18 20:31:34 +0000472 while (opts_len % 4)
473 {
474 *data++ = TCP_OPTION_NOOP;
475 opts_len += TCP_OPTION_LEN_NOOP;
476 }
477 return opts_len;
478}
479
Dave Barach68b0fb02017-02-28 15:15:56 -0500480#endif /* included_tcp_packet_h */
481
482/*
483 * fd.io coding-style-patch-verification: ON
484 *
485 * Local Variables:
486 * eval: (c-set-style "gnu")
487 * End:
488 */