blob: e538abf65303702455da3501c617fbe269bfad25 [file] [log] [blame]
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01001/*
2 * sfe_ipv4.c
3 * Shortcut forwarding engine - IPv4 edition.
4 *
5 * XXX - fill in the appropriate GPL notice.
6 */
7#include <linux/types.h>
8#include <linux/ip.h>
9#include <linux/tcp.h>
10#include <linux/module.h>
11#include <linux/skbuff.h>
12#include <linux/icmp.h>
13#include <linux/sysctl.h>
Dave Hudsonaaf97ca2013-06-13 17:52:29 +010014#include <linux/fs.h>
15#include <linux/pkt_sched.h>
16#include <linux/string.h>
17#include <net/route.h>
18#include <net/ip.h>
19#include <net/tcp.h>
20#include <asm/unaligned.h>
21#include <asm/uaccess.h>
22#include <linux/inetdevice.h>
23#include <linux/netfilter_ipv4.h>
24#include <linux/netfilter_bridge.h>
25#include <linux/if_bridge.h>
26#include <net/netfilter/nf_conntrack.h>
27#include <net/netfilter/nf_conntrack_acct.h>
28#include <net/netfilter/nf_conntrack_helper.h>
29#include <net/netfilter/nf_conntrack_l4proto.h>
30#include <net/netfilter/nf_conntrack_l3proto.h>
31#include <net/netfilter/nf_conntrack_zones.h>
32#include <net/netfilter/nf_conntrack_core.h>
33#include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
34#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
35#include <net/arp.h>
36
37/*
38 * Select whether we "hook" in below or above the Ethernet bridge layer.
39 *
40 * XXX - note that hooking below the bridge (set this value to 0) will
41 * not currently work completely cleanly within Linux. In order to make
42 * this work properly we need to resync stats to Linux. Arguably if we
43 * want to do this we also need to validate that the source MAC address
44 * of any packets is actually correct too. Right now we're relying on
45 * the bridge layer to do this sort of thing for us.
46 */
47#define SFE_HOOK_ABOVE_BRIDGE 1
48
49/*
50 * Debug output verbosity level.
51 */
52#define DEBUG_LEVEL 2
53
/*
 * The debug macros expand to more than one statement, so each enabled
 * variant is wrapped in do { ... } while (0).  This makes a macro call
 * behave as a single statement and keeps it safe inside an unbraced
 * if/else body (the previous two-statement expansion, and the trailing
 * semicolons on the WARN/INFO/TRACE variants, both broke that case).
 */
#if (DEBUG_LEVEL < 1)
#define DEBUG_ERROR(s, ...)
#else
#define DEBUG_ERROR(s, ...) \
	do { \
		printk("%s[%u]: ERROR:", __FILE__, __LINE__); \
		printk(s, ##__VA_ARGS__); \
	} while (0)
#endif

#if (DEBUG_LEVEL < 2)
#define DEBUG_WARN(s, ...)
#else
#define DEBUG_WARN(s, ...) \
	do { \
		printk("%s[%u]: WARN:", __FILE__, __LINE__); \
		printk(s, ##__VA_ARGS__); \
	} while (0)
#endif

#if (DEBUG_LEVEL < 3)
#define DEBUG_INFO(s, ...)
#else
#define DEBUG_INFO(s, ...) \
	do { \
		printk("%s[%u]: INFO:", __FILE__, __LINE__); \
		printk(s, ##__VA_ARGS__); \
	} while (0)
#endif

#if (DEBUG_LEVEL < 4)
#define DEBUG_TRACE(s, ...)
#else
#define DEBUG_TRACE(s, ...) \
	do { \
		printk("%s[%u]: TRACE:", __FILE__, __LINE__); \
		printk(s, ##__VA_ARGS__); \
	} while (0)
#endif
85
/*
 * The default Linux ethhdr structure is "packed". It also has byte aligned
 * MAC addresses and this leads to poor performance. This version is not
 * packed and has better alignment for the MAC addresses.
 */
struct sfe_ipv4_ethhdr {
	__be16 h_dest[ETH_ALEN / 2];	/* Destination MAC address, as 16-bit words */
	__be16 h_source[ETH_ALEN / 2];	/* Source MAC address, as 16-bit words */
	__be16 h_proto;			/* Ethernet protocol/type field */
};
96
/*
 * The default Linux iphdr structure is "packed". This really hurts performance
 * on many CPUs. Here's an aligned and "unpacked" version of the same thing.
 *
 * Field order and widths match the on-the-wire IPv4 header.
 */
struct sfe_ipv4_iphdr {
#if defined(__LITTLE_ENDIAN_BITFIELD)
	__u8 ihl:4,
	     version:4;
#elif defined (__BIG_ENDIAN_BITFIELD)
	__u8 version:4,
	     ihl:4;
#else
#error "Please fix <asm/byteorder.h>"
#endif
	__u8 tos;
	__be16 tot_len;
	__be16 id;
	__be16 frag_off;
	__u8 ttl;
	__u8 protocol;
	__sum16 check;
	__be32 saddr;
	__be32 daddr;
	/* The options start here. */
};
122
/*
 * The default Linux udphdr structure is "packed". This really hurts performance
 * on many CPUs. Here's an aligned and "unpacked" version of the same thing.
 */
struct sfe_ipv4_udphdr {
	__be16 source;			/* Source port */
	__be16 dest;			/* Destination port */
	__be16 len;			/* Length of UDP header plus payload */
	__sum16 check;			/* UDP checksum */
};
133
/*
 * The default Linux tcphdr structure is "packed". This really hurts performance
 * on many CPUs. Here's an aligned and "unpacked" version of the same thing.
 */
struct sfe_ipv4_tcphdr {
	__be16 source;			/* Source port */
	__be16 dest;			/* Destination port */
	__be32 seq;			/* Sequence number */
	__be32 ack_seq;			/* Acknowledgement number */
	/*
	 * Data offset and flag bits; the bitfield layout depends on the
	 * machine's byte order, hence the two variants below.
	 */
#if defined(__LITTLE_ENDIAN_BITFIELD)
	__u16 res1:4,
	      doff:4,
	      fin:1,
	      syn:1,
	      rst:1,
	      psh:1,
	      ack:1,
	      urg:1,
	      ece:1,
	      cwr:1;
#elif defined(__BIG_ENDIAN_BITFIELD)
	__u16 doff:4,
	      res1:4,
	      cwr:1,
	      ece:1,
	      urg:1,
	      ack:1,
	      psh:1,
	      rst:1,
	      syn:1,
	      fin:1;
#else
#error "Adjust your <asm/byteorder.h> defines"
#endif
	__be16 window;			/* Receive window size */
	__sum16 check;			/* TCP checksum */
	__be16 urg_ptr;			/* Urgent pointer */
};
172
173/*
174 * IPv4 connection flags.
175 */
176#define SFE_IPV4_CREATE_FLAG_NO_SEQ_CHECK 0x1
177 /* Indicates that we should not check sequence numbers */
178
/*
 * IPv4 connection creation structure.
 *
 * Describes a new flow to accelerate: devices, 5-tuple, NAT translations,
 * MAC addresses and initial TCP tracking state for both directions.
 */
struct sfe_ipv4_create {
	int protocol;				/* IP protocol number (IPPROTO_...) */
	struct net_device *src_dev;		/* Source network device */
	struct net_device *dest_dev;		/* Destination network device */
	uint32_t flags;				/* SFE_IPV4_CREATE_FLAG_... bit flags */
	uint32_t src_mtu;			/* MTU towards the source */
	uint32_t dest_mtu;			/* MTU towards the destination */
	__be32 src_ip;				/* Source IP address */
	__be32 src_ip_xlate;			/* Translated source IP address */
	__be32 dest_ip;				/* Destination IP address */
	__be32 dest_ip_xlate;			/* Translated destination IP address */
	__be16 src_port;			/* Source port */
	__be16 src_port_xlate;			/* Translated source port */
	__be16 dest_port;			/* Destination port */
	__be16 dest_port_xlate;			/* Translated destination port */
	uint8_t src_mac[ETH_ALEN];		/* Source MAC address */
	uint8_t src_mac_xlate[ETH_ALEN];	/* Translated source MAC address */
	uint8_t dest_mac[ETH_ALEN];		/* Destination MAC address */
	uint8_t dest_mac_xlate[ETH_ALEN];	/* Translated destination MAC address */
	uint8_t src_td_window_scale;		/* Source direction TCP window scale */
	uint32_t src_td_max_window;		/* Source direction TCP maximum window */
	uint32_t src_td_end;			/* Source direction TCP sequence end */
	uint32_t src_td_max_end;		/* Source direction TCP maximum sequence end */
	uint8_t dest_td_window_scale;		/* Destination direction TCP window scale */
	uint32_t dest_td_max_window;		/* Destination direction TCP maximum window */
	uint32_t dest_td_end;			/* Destination direction TCP sequence end */
	uint32_t dest_td_max_end;		/* Destination direction TCP maximum sequence end */
};
210
/*
 * IPv4 connection destruction structure.
 *
 * Identifies the flow to tear down by its (pre-NAT) 5-tuple.
 */
struct sfe_ipv4_destroy {
	int protocol;			/* IP protocol number (IPPROTO_...) */
	__be32 src_ip;			/* Source IP address */
	__be32 dest_ip;			/* Destination IP address */
	__be16 src_port;		/* Source port */
	__be16 dest_port;		/* Destination port */
};
221
222/*
223 * IPv4 sync reasons.
224 */
225#define SFE_IPV4_SYNC_REASON_STATS 0 /* Sync is to synchronize stats */
226#define SFE_IPV4_SYNC_REASON_FLUSH 1 /* Sync is to flush a cache entry */
227#define SFE_IPV4_SYNC_REASON_EVICT 2 /* Sync is to evict a cache entry */
228
/*
 * Structure used to sync IPv4 connection stats/state back within the system.
 *
 * NOTE: The addresses here are NON-NAT addresses, i.e. the true endpoint addressing.
 * 'src' is the creator of the connection.
 */
struct sfe_ipv4_sync {
	int protocol;			/* IP protocol number (IPPROTO_...) */
	__be32 src_ip;			/* Non-NAT source address, i.e. the creator of the connection */
	__be16 src_port;		/* Non-NAT source port */
	__be32 dest_ip;			/* Non-NAT destination address, i.e. to whom the connection was created */
	__be16 dest_port;		/* Non-NAT destination port */
	uint32_t src_td_max_window;	/* Source direction TCP maximum window */
	uint32_t src_td_end;		/* Source direction TCP sequence end */
	uint32_t src_td_max_end;	/* Source direction TCP maximum sequence end */
	uint64_t src_packet_count;	/* Packets seen in the source direction */
	uint64_t src_byte_count;	/* Bytes seen in the source direction */
	uint32_t dest_td_max_window;	/* Destination direction TCP maximum window */
	uint32_t dest_td_end;		/* Destination direction TCP sequence end */
	uint32_t dest_td_max_end;	/* Destination direction TCP maximum sequence end */
	uint64_t dest_packet_count;	/* Packets seen in the destination direction */
	uint64_t dest_byte_count;	/* Bytes seen in the destination direction */
	uint64_t delta_jiffies;		/* Time to be added to the current timeout to keep the connection alive */
	uint8_t reason;			/* Reason for the sync (SFE_IPV4_SYNC_REASON_...) */
};
254
255/*
256 * Specifies the lower bound on ACK numbers carried in the TCP header
257 */
258#define SFE_IPV4_TCP_MAX_ACK_WINDOW 65520
259
/*
 * IPv4 TCP connection match additional data.
 *
 * Per-direction TCP sequence-space tracking state.
 */
struct sfe_ipv4_tcp_connection_match {
	uint8_t win_scale;		/* Window scale */
	uint32_t max_win;		/* Maximum window size seen */
	uint32_t end;			/* Sequence number of the next byte to send (seq + segment length) */
	uint32_t max_end;		/* Sequence number of the last byte to ack */
};
269
270/*
271 * Bit flags for IPv4 connection matching entry.
272 */
273#define SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_SRC 0x1
274 /* Perform source translation */
275#define SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_DEST 0x2
276 /* Perform destination translation */
277#define SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK 0x4
278 /* Ignore TCP sequence numbers */
279#define SFE_IPV4_CONNECTION_MATCH_FLAG_FAST_ETH_HDR 0x8
280 /* Fast Ethernet header write */
281
/*
 * IPv4 connection matching structure.
 *
 * One of these exists per flow direction (see the original_match/reply_match
 * pointers in struct sfe_ipv4_connection).  It holds everything needed to
 * match, translate and forward a packet on the fast path.
 */
struct sfe_ipv4_connection_match {
	/*
	 * References to other objects.
	 */
	struct sfe_ipv4_connection_match *next;
					/* Next connection match entry in a list */
	struct sfe_ipv4_connection_match *prev;
					/* Previous connection match entry in a list */
	struct sfe_ipv4_connection *connection;
					/* Pointer to our connection */
	struct sfe_ipv4_connection_match *counter_match;
					/* Pointer to the connection match in the "counter" direction to this one */
	struct sfe_ipv4_connection_match *active_next;
					/* Pointer to the next connection in the active list */
	struct sfe_ipv4_connection_match *active_prev;
					/* Pointer to the previous connection in the active list */
	bool active;			/* Flag to indicate if we're on the active list */

	/*
	 * Characteristics that identify flows that match this rule.
	 */
	struct net_device *match_dev;	/* Network device */
	uint8_t match_protocol;		/* Protocol */
	__be32 match_src_ip;		/* Source IP address */
	__be32 match_dest_ip;		/* Destination IP address */
	__be16 match_src_port;		/* Source port/connection ident */
	__be16 match_dest_port;		/* Destination port/connection ident */

	/*
	 * Control the operations of the match.
	 */
	uint32_t flags;			/* Bit flags (SFE_IPV4_CONNECTION_MATCH_FLAG_...) */

	/*
	 * Connection state that we track once we match.
	 */
	union {				/* Protocol-specific state */
		struct sfe_ipv4_tcp_connection_match tcp;
	} protocol_state;
	uint32_t rx_packet_count;	/* Number of packets RX'd */
	uint32_t rx_byte_count;		/* Number of bytes RX'd */

	/*
	 * Packet translation information.
	 */
	__be32 xlate_src_ip;		/* Address after source translation */
	__be16 xlate_src_port;	/* Port/connection ident after source translation */
	uint16_t xlate_src_csum_adjustment;
					/* Transport layer checksum adjustment after source translation */
	__be32 xlate_dest_ip;		/* Address after destination translation */
	__be16 xlate_dest_port;	/* Port/connection ident after destination translation */
	uint16_t xlate_dest_csum_adjustment;
					/* Transport layer checksum adjustment after destination translation */

	/*
	 * Packet transmit information.
	 */
	struct net_device *xmit_dev;	/* Network device on which to transmit */
	unsigned short int xmit_dev_mtu;
					/* Interface MTU */
	uint16_t xmit_dest_mac[ETH_ALEN / 2];
					/* Destination MAC address to use when forwarding */
	uint16_t xmit_src_mac[ETH_ALEN / 2];
					/* Source MAC address to use when forwarding */

	/*
	 * Summary stats.
	 */
	uint64_t rx_packet_count64;	/* Number of packets RX'd */
	uint64_t rx_byte_count64;	/* Number of bytes RX'd */
};
356
/*
 * Per-connection data structure.
 *
 * A connection lives on a hash chain and on the global list of all
 * connections, and owns one match entry per direction.
 */
struct sfe_ipv4_connection {
	struct sfe_ipv4_connection *next;
					/* Pointer to the next entry in a hash chain */
	struct sfe_ipv4_connection *prev;
					/* Pointer to the previous entry in a hash chain */
	int protocol;			/* IP protocol number */
	__be32 src_ip;			/* Source IP address */
	__be32 src_ip_xlate;		/* NAT-translated source IP address */
	__be32 dest_ip;			/* Destination IP address */
	__be32 dest_ip_xlate;		/* NAT-translated destination IP address */
	__be16 src_port;		/* Source port */
	__be16 src_port_xlate;		/* NAT-translated source port */
	__be16 dest_port;		/* Destination port */
	__be16 dest_port_xlate;		/* NAT-translated destination port */
	struct sfe_ipv4_connection_match *original_match;
					/* Original direction matching structure */
	struct net_device *original_dev;
					/* Original direction source device */
	struct sfe_ipv4_connection_match *reply_match;
					/* Reply direction matching structure */
	struct net_device *reply_dev;	/* Reply direction source device */
	uint64_t last_sync_jiffies;	/* Jiffies count for the last sync */
	struct sfe_ipv4_connection *all_connections_next;
					/* Pointer to the next entry in the list of all connections */
	struct sfe_ipv4_connection *all_connections_prev;
					/* Pointer to the previous entry in the list of all connections */
	int iterators;			/* Number of iterators currently using this connection */
	bool pending_free;		/* Flag that indicates that this connection should be freed after iteration */
};
389
390/*
391 * IPv4 connections and hash table size information.
392 */
393#define SFE_IPV4_CONNECTION_HASH_SHIFT 12
394#define SFE_IPV4_CONNECTION_HASH_SIZE (1 << SFE_IPV4_CONNECTION_HASH_SHIFT)
395#define SFE_IPV4_CONNECTION_HASH_MASK (SFE_IPV4_CONNECTION_HASH_SIZE - 1)
396
/*
 * Exception events - the reasons why a packet was handed back to the slow path.
 *
 * NOTE: this enum must be kept in sync (same order, same count) with
 * sfe_ipv4_exception_events_string below.
 */
enum sfe_ipv4_exception_events {
	SFE_IPV4_EXCEPTION_EVENT_UDP_HEADER_INCOMPLETE,
	SFE_IPV4_EXCEPTION_EVENT_UDP_NO_CONNECTION,
	SFE_IPV4_EXCEPTION_EVENT_UDP_IP_OPTIONS_OR_INITIAL_FRAGMENT,
	SFE_IPV4_EXCEPTION_EVENT_UDP_SMALL_TTL,
	SFE_IPV4_EXCEPTION_EVENT_UDP_NEEDS_FRAGMENTATION,
	SFE_IPV4_EXCEPTION_EVENT_TCP_HEADER_INCOMPLETE,
	SFE_IPV4_EXCEPTION_EVENT_TCP_NO_CONNECTION_SLOW_FLAGS,
	SFE_IPV4_EXCEPTION_EVENT_TCP_NO_CONNECTION_FAST_FLAGS,
	SFE_IPV4_EXCEPTION_EVENT_TCP_IP_OPTIONS_OR_INITIAL_FRAGMENT,
	SFE_IPV4_EXCEPTION_EVENT_TCP_SMALL_TTL,
	SFE_IPV4_EXCEPTION_EVENT_TCP_NEEDS_FRAGMENTATION,
	SFE_IPV4_EXCEPTION_EVENT_TCP_FLAGS,
	SFE_IPV4_EXCEPTION_EVENT_TCP_SEQ_EXCEEDS_RIGHT_EDGE,
	SFE_IPV4_EXCEPTION_EVENT_TCP_SMALL_DATA_OFFS,
	SFE_IPV4_EXCEPTION_EVENT_TCP_BAD_SACK,
	SFE_IPV4_EXCEPTION_EVENT_TCP_BIG_DATA_OFFS,
	SFE_IPV4_EXCEPTION_EVENT_TCP_SEQ_BEFORE_LEFT_EDGE,
	SFE_IPV4_EXCEPTION_EVENT_TCP_ACK_EXCEEDS_RIGHT_EDGE,
	SFE_IPV4_EXCEPTION_EVENT_TCP_ACK_BEFORE_LEFT_EDGE,
	SFE_IPV4_EXCEPTION_EVENT_ICMP_HEADER_INCOMPLETE,
	SFE_IPV4_EXCEPTION_EVENT_ICMP_UNHANDLED_TYPE,
	SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_HEADER_INCOMPLETE,
	SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_NON_V4,
	SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_IP_OPTIONS_INCOMPLETE,
	SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_UDP_HEADER_INCOMPLETE,
	SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_TCP_HEADER_INCOMPLETE,
	SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_UNHANDLED_PROTOCOL,
	SFE_IPV4_EXCEPTION_EVENT_ICMP_NO_CONNECTION,
	SFE_IPV4_EXCEPTION_EVENT_ICMP_FLUSHED_CONNECTION,
	SFE_IPV4_EXCEPTION_EVENT_HEADER_INCOMPLETE,
	SFE_IPV4_EXCEPTION_EVENT_BAD_TOTAL_LENGTH,
	SFE_IPV4_EXCEPTION_EVENT_NON_V4,
	SFE_IPV4_EXCEPTION_EVENT_NON_INITIAL_FRAGMENT,
	SFE_IPV4_EXCEPTION_EVENT_DATAGRAM_INCOMPLETE,
	SFE_IPV4_EXCEPTION_EVENT_IP_OPTIONS_INCOMPLETE,
	SFE_IPV4_EXCEPTION_EVENT_UNHANDLED_PROTOCOL,
	SFE_IPV4_EXCEPTION_EVENT_LAST
};
436
/*
 * Human-readable names for the exception events.
 *
 * NOTE: entries must stay in the exact order of enum sfe_ipv4_exception_events.
 */
static char *sfe_ipv4_exception_events_string[SFE_IPV4_EXCEPTION_EVENT_LAST] = {
	"UDP_HEADER_INCOMPLETE",
	"UDP_NO_CONNECTION",
	"UDP_IP_OPTIONS_OR_INITIAL_FRAGMENT",
	"UDP_SMALL_TTL",
	"UDP_NEEDS_FRAGMENTATION",
	"TCP_HEADER_INCOMPLETE",
	"TCP_NO_CONNECTION_SLOW_FLAGS",
	"TCP_NO_CONNECTION_FAST_FLAGS",
	"TCP_IP_OPTIONS_OR_INITIAL_FRAGMENT",
	"TCP_SMALL_TTL",
	"TCP_NEEDS_FRAGMENTATION",
	"TCP_FLAGS",
	"TCP_SEQ_EXCEEDS_RIGHT_EDGE",
	"TCP_SMALL_DATA_OFFS",
	"TCP_BAD_SACK",
	"TCP_BIG_DATA_OFFS",
	"TCP_SEQ_BEFORE_LEFT_EDGE",
	"TCP_ACK_EXCEEDS_RIGHT_EDGE",
	"TCP_ACK_BEFORE_LEFT_EDGE",
	"ICMP_HEADER_INCOMPLETE",
	"ICMP_UNHANDLED_TYPE",
	"ICMP_IPV4_HEADER_INCOMPLETE",
	"ICMP_IPV4_NON_V4",
	"ICMP_IPV4_IP_OPTIONS_INCOMPLETE",
	"ICMP_IPV4_UDP_HEADER_INCOMPLETE",
	"ICMP_IPV4_TCP_HEADER_INCOMPLETE",
	"ICMP_IPV4_UNHANDLED_PROTOCOL",
	"ICMP_NO_CONNECTION",
	"ICMP_FLUSHED_CONNECTION",
	"HEADER_INCOMPLETE",
	"BAD_TOTAL_LENGTH",
	"NON_V4",
	"NON_INITIAL_FRAGMENT",
	"DATAGRAM_INCOMPLETE",
	"IP_OPTIONS_INCOMPLETE",
	"UNHANDLED_PROTOCOL"
};
475
/*
 * Per-module instance structure.
 *
 * Holds all state for the IPv4 fast path: the connection tables, the
 * periodic sync timer, counters and control hooks.
 */
struct sfe_ipv4 {
	spinlock_t lock;		/* Lock for SMP correctness */
	struct sfe_ipv4_connection_match *active_head;
					/* Head of the list of recently active connections */
	struct sfe_ipv4_connection_match *active_tail;
					/* Tail of the list of recently active connections */
	struct sfe_ipv4_connection *all_connections_head;
					/* Head of the list of all connections */
	struct sfe_ipv4_connection *all_connections_tail;
					/* Tail of the list of all connections */
	unsigned int num_connections;	/* Number of connections */
	struct timer_list timer;	/* Timer used for periodic sync ops */
	struct sfe_ipv4_connection *conn_hash[SFE_IPV4_CONNECTION_HASH_SIZE];
					/* Connection hash table */
	struct sfe_ipv4_connection_match *conn_match_hash[SFE_IPV4_CONNECTION_HASH_SIZE];
					/* Connection match hash table */

	/*
	 * Statistics.
	 */
	uint32_t connection_create_requests;
					/* Number of IPv4 connection create requests */
	uint32_t connection_create_collisions;
					/* Number of IPv4 connection create requests that collided with existing hash table entries */
	uint32_t connection_destroy_requests;
					/* Number of IPv4 connection destroy requests */
	uint32_t connection_destroy_misses;
					/* Number of IPv4 connection destroy requests that missed our hash table */
	uint32_t connection_match_hash_hits;
					/* Number of IPv4 connection match hash hits */
	uint32_t connection_match_hash_reorders;
					/* Number of IPv4 connection match hash reorders */
	uint32_t connection_flushes;	/* Number of IPv4 connection flushes */
	uint32_t packets_forwarded;	/* Number of IPv4 packets forwarded */
	uint32_t packets_not_forwarded;	/* Number of IPv4 packets not forwarded */
	uint32_t exception_events[SFE_IPV4_EXCEPTION_EVENT_LAST];

	/*
	 * Summary statistics.
	 */
	uint64_t connection_create_requests64;
					/* Number of IPv4 connection create requests */
	uint64_t connection_create_collisions64;
					/* Number of IPv4 connection create requests that collided with existing hash table entries */
	uint64_t connection_destroy_requests64;
					/* Number of IPv4 connection destroy requests */
	uint64_t connection_destroy_misses64;
					/* Number of IPv4 connection destroy requests that missed our hash table */
	uint64_t connection_match_hash_hits64;
					/* Number of IPv4 connection match hash hits */
	uint64_t connection_match_hash_reorders64;
					/* Number of IPv4 connection match hash reorders */
	uint64_t connection_flushes64;	/* Number of IPv4 connection flushes */
	uint64_t packets_forwarded64;	/* Number of IPv4 packets forwarded */
	uint64_t packets_not_forwarded64;
					/* Number of IPv4 packets not forwarded */
	uint64_t exception_events64[SFE_IPV4_EXCEPTION_EVENT_LAST];

	/*
	 * Control state.
	 */
	struct kobject *sys_sfe_ipv4;	/* sysfs linkage */
	int pause;			/* Flag that, when non-zero, pauses all SFE processing */
	int debug_dev;			/* Major number of the debug char device */

	/*
	 * Callback notifiers.
	 */
	struct notifier_block dev_notifier;
					/* Device notifier */
	struct notifier_block inet_notifier;
					/* IP notifier */
};
552
/*
 * Enumeration of the states of the debug XML output state machine.
 */
enum sfe_ipv4_debug_xml_states {
	SFE_IPV4_DEBUG_XML_STATE_START,
	SFE_IPV4_DEBUG_XML_STATE_CONNECTIONS_START,
	SFE_IPV4_DEBUG_XML_STATE_CONNECTIONS_CONNECTION,
	SFE_IPV4_DEBUG_XML_STATE_CONNECTIONS_END,
	SFE_IPV4_DEBUG_XML_STATE_EXCEPTIONS_START,
	SFE_IPV4_DEBUG_XML_STATE_EXCEPTIONS_EXCEPTION,
	SFE_IPV4_DEBUG_XML_STATE_EXCEPTIONS_END,
	SFE_IPV4_DEBUG_XML_STATE_STATS,
	SFE_IPV4_DEBUG_XML_STATE_END,
	SFE_IPV4_DEBUG_XML_STATE_DONE
};
568
/*
 * XML write state.
 *
 * Tracks progress of an in-flight debug XML dump across multiple reads.
 */
struct sfe_ipv4_debug_xml_write_state {
	enum sfe_ipv4_debug_xml_states state;
					/* XML output file state machine state */
	struct sfe_ipv4_connection *iter_conn;
					/* Next connection iterator */
	int iter_exception;		/* Next exception iterator */
};
579
/*
 * Signature of the writer methods that emit one section of the debug XML output.
 */
typedef bool (*sfe_ipv4_debug_xml_write_method_t)(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length,
						  int *total_read, struct sfe_ipv4_debug_xml_write_state *ws);

/*
 * Module singleton instance.
 * NOTE(review): identifiers beginning with a double underscore are reserved
 * for the implementation; consider renaming (and making this static if no
 * other translation unit references it - verify before changing).
 */
struct sfe_ipv4 __si;
584
585/*
586 * Expose what should be a static flag in the TCP connection tracker.
587 */
588extern int nf_ct_tcp_no_window_check;
589
590/*
591 * Expose the hook for the receive processing.
592 */
593extern int (*athrs_fast_nat_recv)(struct sk_buff *skb);
594
595/*
596 * sfe_ipv4_gen_ip_csum()
597 * Generate the IP checksum for an IPv4 header.
598 *
599 * Note that this function assumes that we have only 20 bytes of IP header.
600 */
601static inline uint16_t sfe_ipv4_gen_ip_csum(struct sfe_ipv4_iphdr *iph)
602{
603 uint32_t sum;
604 uint16_t *i = (uint16_t *)iph;
605
606 iph->check = 0;
607
608 /*
609 * Generate the sum.
610 */
611 sum = i[0] + i[1] + i[2] + i[3] + i[4] + i[5] + i[6] + i[7] + i[8] + i[9];
612
613 /*
614 * Fold it to ones-complement form.
615 */
616 sum = (sum & 0xffff) + (sum >> 16);
617 sum = (sum & 0xffff) + (sum >> 16);
618
619 return (uint16_t)sum ^ 0xffff;
620}
621
622/*
623 * sfe_ipv4_get_connection_match_hash()
624 * Generate the hash used in connection match lookups.
625 */
626static inline unsigned int sfe_ipv4_get_connection_match_hash(struct net_device *dev, uint8_t protocol,
Dave Hudson87973cd2013-10-22 16:00:04 +0100627 __be32 src_ip, __be16 src_port,
628 __be32 dest_ip, __be16 dest_port)
Dave Hudsonaaf97ca2013-06-13 17:52:29 +0100629{
630 size_t dev_addr = (size_t)dev;
Dave Hudson87973cd2013-10-22 16:00:04 +0100631 uint32_t hash = ((uint32_t)dev_addr) ^ ntohl(src_ip ^ dest_ip) ^ protocol ^ ntohs(src_port ^ dest_port);
Dave Hudsonaaf97ca2013-06-13 17:52:29 +0100632 return ((hash >> SFE_IPV4_CONNECTION_HASH_SHIFT) ^ hash) & SFE_IPV4_CONNECTION_HASH_MASK;
633}
634
/*
 * sfe_ipv4_find_sfe_ipv4_connection_match()
 *	Get the IPv4 flow match info that corresponds to a particular 5-tuple.
 *
 * On entry we must be holding the lock that protects the hash table.
 *
 * Returns NULL if no entry matches.  As a side effect, a match found deeper
 * in a hash chain is moved to the front of its chain (and the reorder counter
 * is bumped) on the assumption that it will be looked up again very soon.
 */
static struct sfe_ipv4_connection_match *
sfe_ipv4_find_sfe_ipv4_connection_match(struct sfe_ipv4 *si, struct net_device *dev, uint8_t protocol,
					__be32 src_ip, __be16 src_port,
					__be32 dest_ip, __be16 dest_port) __attribute__((always_inline));
static struct sfe_ipv4_connection_match *
sfe_ipv4_find_sfe_ipv4_connection_match(struct sfe_ipv4 *si, struct net_device *dev, uint8_t protocol,
					__be32 src_ip, __be16 src_port,
					__be32 dest_ip, __be16 dest_port)
{
	struct sfe_ipv4_connection_match *cm;
	struct sfe_ipv4_connection_match *head;
	unsigned int conn_match_idx;

	conn_match_idx = sfe_ipv4_get_connection_match_hash(dev, protocol, src_ip, src_port, dest_ip, dest_port);
	cm = si->conn_match_hash[conn_match_idx];

	/*
	 * If we don't have anything in this chain then bale.
	 */
	if (unlikely(!cm)) {
		return cm;
	}

	/*
	 * Hopefully the first entry is the one we want.
	 * Ports are compared first as they are the most discriminating fields.
	 */
	if (likely(cm->match_src_port == src_port)
	    && likely(cm->match_dest_port == dest_port)
	    && likely(cm->match_src_ip == src_ip)
	    && likely(cm->match_dest_ip == dest_ip)
	    && likely(cm->match_protocol == protocol)
	    && likely(cm->match_dev == dev)) {
		si->connection_match_hash_hits++;
		return cm;
	}

	/*
	 * We may or may not have a matching entry but if we do then we want to
	 * move that entry to the top of the hash chain when we get to it. We
	 * presume that this will be reused again very quickly.
	 */
	head = cm;
	do {
		cm = cm->next;
	} while (cm && (cm->match_src_port != src_port
	       || cm->match_dest_port != dest_port
	       || cm->match_src_ip != src_ip
	       || cm->match_dest_ip != dest_ip
	       || cm->match_protocol != protocol
	       || cm->match_dev != dev));

	/*
	 * Not found then we're done.
	 */
	if (unlikely(!cm)) {
		return cm;
	}

	/*
	 * We found a match so move it to the head of the chain.
	 * Note: cm cannot be the head here (the head was tested above and the
	 * loop advanced at least once), so cm->prev is always non-NULL.
	 */
	if (cm->next) {
		cm->next->prev = cm->prev;
	}
	cm->prev->next = cm->next;
	cm->prev = NULL;
	cm->next = head;
	head->prev = cm;
	si->conn_match_hash[conn_match_idx] = cm;
	si->connection_match_hash_reorders++;

	return cm;
}
714
715/*
716 * sfe_ipv4_connection_match_update_summary_stats()
717 * Update the summary stats for a connection match entry.
718 */
719static inline void sfe_ipv4_connection_match_update_summary_stats(struct sfe_ipv4_connection_match *cm)
720{
721 cm->rx_packet_count64 += cm->rx_packet_count;
722 cm->rx_packet_count = 0;
723 cm->rx_byte_count64 += cm->rx_byte_count;
724 cm->rx_byte_count = 0;
725}
726
727/*
728 * sfe_ipv4_connection_match_compute_translations()
729 * Compute port and address translations for a connection match entry.
730 */
731static void sfe_ipv4_connection_match_compute_translations(struct sfe_ipv4_connection_match *cm)
732{
733 /*
734 * Before we insert the entry look to see if this is tagged as doing address
735 * translations. If it is then work out the adjustment that we need to apply
736 * to the transport checksum.
737 */
738 if (cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_SRC) {
739 /*
740 * Precompute an incremental checksum adjustment so we can
741 * edit packets in this stream very quickly. The algorithm is from RFC1624.
742 */
743 uint16_t src_ip_hi = cm->match_src_ip >> 16;
744 uint16_t src_ip_lo = cm->match_src_ip & 0xffff;
745 uint32_t xlate_src_ip = ~cm->xlate_src_ip;
746 uint16_t xlate_src_ip_hi = xlate_src_ip >> 16;
747 uint16_t xlate_src_ip_lo = xlate_src_ip & 0xffff;
Dave Hudson87973cd2013-10-22 16:00:04 +0100748 uint16_t xlate_src_port = ~cm->xlate_src_port;
Dave Hudsonaaf97ca2013-06-13 17:52:29 +0100749 uint32_t adj;
750
751 /*
752 * When we compute this fold it down to a 16-bit offset
753 * as that way we can avoid having to do a double
754 * folding of the twos-complement result because the
755 * addition of 2 16-bit values cannot cause a double
756 * wrap-around!
757 */
758 adj = src_ip_hi + src_ip_lo + cm->match_src_port
759 + xlate_src_ip_hi + xlate_src_ip_lo + xlate_src_port;
760 adj = (adj & 0xffff) + (adj >> 16);
761 adj = (adj & 0xffff) + (adj >> 16);
762 cm->xlate_src_csum_adjustment = (uint16_t)adj;
763
764 }
765
766 if (cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_DEST) {
767 /*
768 * Precompute an incremental checksum adjustment so we can
769 * edit packets in this stream very quickly. The algorithm is from RFC1624.
770 */
771 uint16_t dest_ip_hi = cm->match_dest_ip >> 16;
772 uint16_t dest_ip_lo = cm->match_dest_ip & 0xffff;
773 uint32_t xlate_dest_ip = ~cm->xlate_dest_ip;
774 uint16_t xlate_dest_ip_hi = xlate_dest_ip >> 16;
775 uint16_t xlate_dest_ip_lo = xlate_dest_ip & 0xffff;
Dave Hudson87973cd2013-10-22 16:00:04 +0100776 uint16_t xlate_dest_port = ~cm->xlate_dest_port;
Dave Hudsonaaf97ca2013-06-13 17:52:29 +0100777 uint32_t adj;
778
779 /*
780 * When we compute this fold it down to a 16-bit offset
781 * as that way we can avoid having to do a double
782 * folding of the twos-complement result because the
783 * addition of 2 16-bit values cannot cause a double
784 * wrap-around!
785 */
786 adj = dest_ip_hi + dest_ip_lo + cm->match_dest_port
787 + xlate_dest_ip_hi + xlate_dest_ip_lo + xlate_dest_port;
788 adj = (adj & 0xffff) + (adj >> 16);
789 adj = (adj & 0xffff) + (adj >> 16);
790 cm->xlate_dest_csum_adjustment = (uint16_t)adj;
791 }
792}
793
794/*
795 * sfe_ipv4_update_summary_stats()
796 * Update the summary stats.
797 */
798static void sfe_ipv4_update_summary_stats(struct sfe_ipv4 *si)
799{
800 int i;
801
802 si->connection_create_requests64 += si->connection_create_requests;
803 si->connection_create_requests = 0;
804 si->connection_create_collisions64 += si->connection_create_collisions;
805 si->connection_create_collisions = 0;
806 si->connection_destroy_requests64 += si->connection_destroy_requests;
807 si->connection_destroy_requests = 0;
808 si->connection_destroy_misses64 += si->connection_destroy_misses;
809 si->connection_destroy_misses = 0;
810 si->connection_match_hash_hits64 += si->connection_match_hash_hits;
811 si->connection_match_hash_hits = 0;
812 si->connection_match_hash_reorders64 += si->connection_match_hash_reorders;
813 si->connection_match_hash_reorders = 0;
814 si->connection_flushes64 += si->connection_flushes;
815 si->connection_flushes = 0;
816 si->packets_forwarded64 += si->packets_forwarded;
817 si->packets_forwarded = 0;
818 si->packets_not_forwarded64 += si->packets_not_forwarded;
819 si->packets_not_forwarded = 0;
820
821 for (i = 0; i < SFE_IPV4_EXCEPTION_EVENT_LAST; i++) {
822 si->exception_events64[i] += si->exception_events[i];
823 si->exception_events[i] = 0;
824 }
825}
826
827/*
828 * sfe_ipv4_insert_sfe_ipv4_connection_match()
829 * Insert a connection match into the hash.
830 *
831 * On entry we must be holding the lock that protects the hash table.
832 */
833static inline void sfe_ipv4_insert_sfe_ipv4_connection_match(struct sfe_ipv4 *si, struct sfe_ipv4_connection_match *cm)
834{
835 struct sfe_ipv4_connection_match **hash_head;
836 struct sfe_ipv4_connection_match *prev_head;
837 unsigned int conn_match_idx
838 = sfe_ipv4_get_connection_match_hash(cm->match_dev, cm->match_protocol,
839 cm->match_src_ip, cm->match_src_port,
840 cm->match_dest_ip, cm->match_dest_port);
841 hash_head = &si->conn_match_hash[conn_match_idx];
842 prev_head = *hash_head;
843 cm->prev = NULL;
844 if (prev_head) {
845 prev_head->prev = cm;
846 }
847
848 cm->next = prev_head;
849 *hash_head = cm;
850}
851
852/*
853 * sfe_ipv4_remove_sfe_ipv4_connection_match()
854 * Remove a connection match object from the hash.
855 *
856 * On entry we must be holding the lock that protects the hash table.
857 */
858static inline void sfe_ipv4_remove_sfe_ipv4_connection_match(struct sfe_ipv4 *si, struct sfe_ipv4_connection_match *cm)
859{
860 /*
861 * Unlink the connection match entry from the hash.
862 */
863 if (cm->prev) {
864 cm->prev->next = cm->next;
865 } else {
866 unsigned int conn_match_idx
867 = sfe_ipv4_get_connection_match_hash(cm->match_dev, cm->match_protocol,
868 cm->match_src_ip, cm->match_src_port,
869 cm->match_dest_ip, cm->match_dest_port);
870 si->conn_match_hash[conn_match_idx] = cm->next;
871 }
872
873 if (cm->next) {
874 cm->next->prev = cm->prev;
875 }
876
877 /*
878 * Unlink the connection match entry from the active list.
879 */
880 if (likely(cm->active_prev)) {
881 cm->active_prev->active_next = cm->active_next;
882 } else {
883 si->active_head = cm->active_next;
884 }
885
886 if (likely(cm->active_next)) {
887 cm->active_next->active_prev = cm->active_prev;
888 } else {
889 si->active_tail = cm->active_prev;
890 }
891
892}
893
894/*
895 * sfe_ipv4_get_connection_hash()
896 * Generate the hash used in connection lookups.
897 */
Dave Hudson87973cd2013-10-22 16:00:04 +0100898static inline unsigned int sfe_ipv4_get_connection_hash(uint8_t protocol, __be32 src_ip, __be16 src_port,
899 __be32 dest_ip, __be16 dest_port)
Dave Hudsonaaf97ca2013-06-13 17:52:29 +0100900{
Dave Hudson87973cd2013-10-22 16:00:04 +0100901 uint32_t hash = ntohl(src_ip ^ dest_ip) ^ protocol ^ ntohs(src_port ^ dest_port);
Dave Hudsonaaf97ca2013-06-13 17:52:29 +0100902 return ((hash >> SFE_IPV4_CONNECTION_HASH_SHIFT) ^ hash) & SFE_IPV4_CONNECTION_HASH_MASK;
903}
904
905/*
906 * sfe_ipv4_find_sfe_ipv4_connection()
907 * Get the IPv4 connection info that corresponds to a particular 5-tuple.
908 *
909 * On entry we must be holding the lock that protects the hash table.
910 */
911static inline struct sfe_ipv4_connection *sfe_ipv4_find_sfe_ipv4_connection(struct sfe_ipv4 *si, uint32_t protocol,
Dave Hudson87973cd2013-10-22 16:00:04 +0100912 __be32 src_ip, __be16 src_port,
913 __be32 dest_ip, __be16 dest_port)
Dave Hudsonaaf97ca2013-06-13 17:52:29 +0100914{
915 struct sfe_ipv4_connection *c;
916 unsigned int conn_idx = sfe_ipv4_get_connection_hash(protocol, src_ip, src_port, dest_ip, dest_port);
917 c = si->conn_hash[conn_idx];
918
919 /*
920 * If we don't have anything in this chain then bale.
921 */
922 if (unlikely(!c)) {
923 return c;
924 }
925
926 /*
927 * Hopefully the first entry is the one we want.
928 */
929 if (likely(c->src_port == src_port)
930 && likely(c->dest_port == dest_port)
931 && likely(c->src_ip == src_ip)
932 && likely(c->dest_ip == dest_ip)
933 && likely(c->protocol == protocol)) {
934 return c;
935 }
936
937 /*
938 * We may or may not have a matching entry but if we do then we want to
939 * move that entry to the top of the hash chain when we get to it. We
940 * presume that this will be reused again very quickly.
941 */
942 do {
943 c = c->next;
944 } while (c && (c->src_port != src_port
945 || c->dest_port != dest_port
946 || c->src_ip != src_ip
947 || c->dest_ip != dest_ip
948 || c->protocol != protocol));
949
950 /*
951 * Will need connection entry for next create/destroy metadata,
952 * So no need to re-order entry for these requests
953 */
954 return c;
955}
956
957/*
958 * sfe_ipv4_insert_sfe_ipv4_connection()
959 * Insert a connection into the hash.
960 *
961 * On entry we must be holding the lock that protects the hash table.
962 */
963static void sfe_ipv4_insert_sfe_ipv4_connection(struct sfe_ipv4 *si, struct sfe_ipv4_connection *c)
964{
965 struct sfe_ipv4_connection **hash_head;
966 struct sfe_ipv4_connection *prev_head;
967 unsigned int conn_idx;
968
969 /*
970 * Insert entry into the connection hash.
971 */
972 conn_idx = sfe_ipv4_get_connection_hash(c->protocol, c->src_ip, c->src_port,
973 c->dest_ip, c->dest_port);
974 hash_head = &si->conn_hash[conn_idx];
975 prev_head = *hash_head;
976 c->prev = NULL;
977 if (prev_head) {
978 prev_head->prev = c;
979 }
980
981 c->next = prev_head;
982 *hash_head = c;
983
984 /*
985 * Insert entry into the "all connections" list.
986 */
987 if (si->all_connections_tail) {
988 c->all_connections_prev = si->all_connections_tail;
989 si->all_connections_tail->all_connections_next = c;
990 } else {
991 c->all_connections_prev = NULL;
992 si->all_connections_head = c;
993 }
994
995 si->all_connections_tail = c;
996 c->all_connections_next = NULL;
997 si->num_connections++;
998
999 /*
1000 * Insert the connection match objects too.
1001 */
1002 sfe_ipv4_insert_sfe_ipv4_connection_match(si, c->original_match);
1003 sfe_ipv4_insert_sfe_ipv4_connection_match(si, c->reply_match);
1004}
1005
1006/*
1007 * sfe_ipv4_remove_sfe_ipv4_connection()
1008 * Remove a sfe_ipv4_connection object from the hash.
1009 *
1010 * On entry we must be holding the lock that protects the hash table.
1011 */
1012static void sfe_ipv4_remove_sfe_ipv4_connection(struct sfe_ipv4 *si, struct sfe_ipv4_connection *c)
1013{
1014 /*
1015 * Remove the connection match objects.
1016 */
1017 sfe_ipv4_remove_sfe_ipv4_connection_match(si, c->reply_match);
1018 sfe_ipv4_remove_sfe_ipv4_connection_match(si, c->original_match);
1019
1020 /*
1021 * Unlink the connection.
1022 */
1023 if (c->prev) {
1024 c->prev->next = c->next;
1025 } else {
1026 unsigned int conn_idx = sfe_ipv4_get_connection_hash(c->protocol, c->src_ip, c->src_port,
1027 c->dest_ip, c->dest_port);
1028 si->conn_hash[conn_idx] = c->next;
1029 }
1030
1031 if (c->next) {
1032 c->next->prev = c->prev;
1033 }
1034}
1035
/*
 * sfe_ipv4_sync_rule()
 *	Synchronize a connection's state back into Linux conntrack.
 *
 * Looks up the conntrack entry matching the sync message's 5-tuple
 * (original direction) and pushes our accumulated state into it:
 * timeout extension, byte/packet accounting and, for TCP, the window
 * tracking state for both directions. Silently returns if no matching
 * conntrack entry exists.
 */
static void sfe_ipv4_sync_rule(struct sfe_ipv4_sync *sis)
{
	struct nf_conntrack_tuple_hash *h;
	struct nf_conntrack_tuple tuple;
	struct nf_conn *ct;
	struct nf_conn_counter *acct;

	/*
	 * Create a tuple so as to be able to look up a connection
	 */
	memset(&tuple, 0, sizeof(tuple));
	tuple.src.u3.ip = sis->src_ip;
	tuple.src.u.all = (__be16)sis->src_port;
	tuple.src.l3num = AF_INET;

	tuple.dst.u3.ip = sis->dest_ip;
	tuple.dst.dir = IP_CT_DIR_ORIGINAL;
	tuple.dst.protonum = (uint8_t)sis->protocol;
	tuple.dst.u.all = (__be16)sis->dest_port;

	DEBUG_TRACE("update connection - p: %d, s: %pI4:%u, d: %pI4:%u\n",
		    (int)tuple.dst.protonum,
		    &tuple.src.u3.ip, (unsigned int)ntohs(tuple.src.u.all),
		    &tuple.dst.u3.ip, (unsigned int)ntohs(tuple.dst.u.all));

	/*
	 * Look up conntrack connection; on success this takes a reference
	 * that must be released via nf_ct_put() below.
	 */
	h = nf_conntrack_find_get(&init_net, NF_CT_DEFAULT_ZONE, &tuple);
	if (unlikely(!h)) {
		DEBUG_TRACE("no connection found\n");
		return;
	}

	ct = nf_ct_tuplehash_to_ctrack(h);
	NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct);

	/*
	 * Extend the conntrack timeout by the interval we have been
	 * forwarding for, but only if this is not a fixed timeout.
	 * NOTE(review): expires is advanced without holding ct->lock —
	 * confirm this cannot race with conntrack's own timer handling.
	 */
	if (!test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) {
		ct->timeout.expires += sis->delta_jiffies;
	}

	/*
	 * Credit the bytes/packets we forwarded to conntrack's accounting
	 * counters (if accounting is enabled), under ct->lock.
	 */
	acct = nf_conn_acct_find(ct);
	if (acct) {
		spin_lock_bh(&ct->lock);
		atomic64_add(sis->src_packet_count, &acct[IP_CT_DIR_ORIGINAL].packets);
		atomic64_add(sis->src_byte_count, &acct[IP_CT_DIR_ORIGINAL].bytes);
		atomic64_add(sis->dest_packet_count, &acct[IP_CT_DIR_REPLY].packets);
		atomic64_add(sis->dest_byte_count, &acct[IP_CT_DIR_REPLY].bytes);
		spin_unlock_bh(&ct->lock);
	}

	/*
	 * For TCP, ratchet conntrack's window tracking state forward so the
	 * connection is not judged invalid when packets return to the slow
	 * path. seen[0] holds the original direction, seen[1] the reply
	 * direction; the (int32_t) subtractions are serial-number style
	 * comparisons so they tolerate 32-bit sequence wrap. Each field is
	 * only ever moved forward, never back.
	 */
	switch (sis->protocol) {
	case IPPROTO_TCP:
		spin_lock_bh(&ct->lock);
		if (ct->proto.tcp.seen[0].td_maxwin < sis->src_td_max_window) {
			ct->proto.tcp.seen[0].td_maxwin = sis->src_td_max_window;
		}
		if ((int32_t)(ct->proto.tcp.seen[0].td_end - sis->src_td_end) < 0) {
			ct->proto.tcp.seen[0].td_end = sis->src_td_end;
		}
		if ((int32_t)(ct->proto.tcp.seen[0].td_maxend - sis->src_td_max_end) < 0) {
			ct->proto.tcp.seen[0].td_maxend = sis->src_td_max_end;
		}
		if (ct->proto.tcp.seen[1].td_maxwin < sis->dest_td_max_window) {
			ct->proto.tcp.seen[1].td_maxwin = sis->dest_td_max_window;
		}
		if ((int32_t)(ct->proto.tcp.seen[1].td_end - sis->dest_td_end) < 0) {
			ct->proto.tcp.seen[1].td_end = sis->dest_td_end;
		}
		if ((int32_t)(ct->proto.tcp.seen[1].td_maxend - sis->dest_td_max_end) < 0) {
			ct->proto.tcp.seen[1].td_maxend = sis->dest_td_max_end;
		}
		spin_unlock_bh(&ct->lock);
		break;
	}

	/*
	 * Release connection
	 */
	nf_ct_put(ct);
}
1124
1125/*
1126 * sfe_ipv4_sync_sfe_ipv4_connection()
1127 * Sync a connection.
1128 *
1129 * On entry to this function we expect that the lock for the connection is either
1130 * already held or isn't required.
1131 */
1132static void sfe_ipv4_gen_sync_sfe_ipv4_connection(struct sfe_ipv4 *si, struct sfe_ipv4_connection *c,
1133 struct sfe_ipv4_sync *sis, uint64_t now_jiffies)
1134{
1135 struct sfe_ipv4_connection_match *original_cm;
1136 struct sfe_ipv4_connection_match *reply_cm;
1137
1138 /*
1139 * Fill in the update message.
1140 */
1141 sis->protocol = c->protocol;
1142 sis->src_ip = c->src_ip;
1143 sis->dest_ip = c->dest_ip;
1144 sis->src_port = c->src_port;
1145 sis->dest_port = c->dest_port;
1146
1147 original_cm = c->original_match;
1148 reply_cm = c->reply_match;
1149 sis->src_td_max_window = original_cm->protocol_state.tcp.max_win;
1150 sis->src_td_end = original_cm->protocol_state.tcp.end;
1151 sis->src_td_max_end = original_cm->protocol_state.tcp.max_end;
1152 sis->dest_td_max_window = reply_cm->protocol_state.tcp.max_win;
1153 sis->dest_td_end = reply_cm->protocol_state.tcp.end;
1154 sis->dest_td_max_end = reply_cm->protocol_state.tcp.max_end;
1155
1156 sfe_ipv4_connection_match_update_summary_stats(original_cm);
1157 sfe_ipv4_connection_match_update_summary_stats(reply_cm);
1158
1159 sis->src_packet_count = original_cm->rx_packet_count64;
1160 sis->src_byte_count = original_cm->rx_byte_count64;
1161 sis->dest_packet_count = reply_cm->rx_packet_count64;
1162 sis->dest_byte_count = reply_cm->rx_byte_count64;
1163
1164 /*
1165 * Get the time increment since our last sync.
1166 */
1167 sis->delta_jiffies = now_jiffies - c->last_sync_jiffies;
1168 c->last_sync_jiffies = now_jiffies;
1169}
1170
1171/*
1172 * sfe_ipv4_decrement_sfe_ipv4_connection_iterator()
1173 * Remove an iterator from a connection - free all resources if necessary.
1174 *
1175 * Returns true if the connection should now be free, false if not.
1176 *
1177 * We must be locked on entry to this function.
1178 */
1179static bool sfe_ipv4_decrement_sfe_ipv4_connection_iterator(struct sfe_ipv4 *si, struct sfe_ipv4_connection *c)
1180{
1181 /*
1182 * Are we the last iterator for this connection?
1183 */
1184 c->iterators--;
1185 if (c->iterators) {
1186 return false;
1187 }
1188
1189 /*
1190 * Is this connection marked for deletion?
1191 */
1192 if (!c->pending_free) {
1193 return false;
1194 }
1195
1196 /*
1197 * We're ready to delete this connection so unlink it from the "all
1198 * connections" list.
1199 */
1200 si->num_connections--;
1201 if (c->all_connections_prev) {
1202 c->all_connections_prev->all_connections_next = c->all_connections_next;
1203 } else {
1204 si->all_connections_head = c->all_connections_next;
1205 }
1206
1207 if (c->all_connections_next) {
1208 c->all_connections_next->all_connections_prev = c->all_connections_prev;
1209 } else {
1210 si->all_connections_tail = c->all_connections_prev;
1211 }
1212
1213 return true;
1214}
1215
/*
 * sfe_ipv4_flush_sfe_ipv4_connection()
 *	Flush a connection and free all associated resources.
 *
 * On entry the connection is expected to have already been removed from the
 * hash tables (callers pair this with sfe_ipv4_remove_sfe_ipv4_connection()).
 * This function generates a final stats sync to conntrack and then releases
 * the connection's memory — unless an iterator still holds the connection,
 * in which case the entry is marked pending_free and the unlink/free is
 * deferred to sfe_ipv4_decrement_sfe_ipv4_connection_iterator().
 * NOTE(review): in the deferred case the device references and match
 * objects are not released here — verify the iterator path frees them.
 *
 * We need to be called with bottom halves disabled locally as we need to acquire
 * the connection hash lock and release it again. In general we're actually called
 * from within a BH and so we're fine, but we're also called when connections are
 * torn down.
 */
static void sfe_ipv4_flush_sfe_ipv4_connection(struct sfe_ipv4 *si, struct sfe_ipv4_connection *c)
{
	struct sfe_ipv4_sync sis;
	uint64_t now_jiffies;
	bool pending_free = false;

	spin_lock(&si->lock);
	si->connection_flushes++;

	/*
	 * Check that we're not currently being iterated. If we are then
	 * we can't free this entry yet but must mark it pending a free. If it's
	 * not being iterated then we can unlink it from the list of all
	 * connections.
	 */
	if (c->iterators) {
		pending_free = true;
		c->pending_free = true;
	} else {
		si->num_connections--;
		if (c->all_connections_prev) {
			c->all_connections_prev->all_connections_next = c->all_connections_next;
		} else {
			si->all_connections_head = c->all_connections_next;
		}

		if (c->all_connections_next) {
			c->all_connections_next->all_connections_prev = c->all_connections_prev;
		} else {
			si->all_connections_tail = c->all_connections_prev;
		}
	}

	spin_unlock(&si->lock);

	/*
	 * Generate a sync message and then sync. This runs outside si->lock:
	 * sfe_ipv4_sync_rule() takes conntrack locks of its own and must not
	 * be called with our spinlock held.
	 */
	now_jiffies = get_jiffies_64();
	sfe_ipv4_gen_sync_sfe_ipv4_connection(si, c, &sis, now_jiffies);
	sfe_ipv4_sync_rule(&sis);

	/*
	 * If we can't yet free the underlying memory then we're done.
	 */
	if (pending_free) {
		return;
	}

	/*
	 * Release our hold of the source and dest devices and free the memory
	 * for our connection objects.
	 */
	dev_put(c->original_dev);
	dev_put(c->reply_dev);
	kfree(c->original_match);
	kfree(c->reply_match);
	kfree(c);
}
1284
/*
 * sfe_ipv4_recv_udp()
 *	Handle UDP packet receives and forwarding.
 *
 * Returns 1 if the packet was forwarded in the fast path (consumed via
 * dev_queue_xmit()), or 0 if it was not handled here and should continue
 * through the normal Linux stack.
 *
 * "len" is the IP datagram length and "ihl" the IP header length in bytes.
 * "flush_on_find" indicates the caller saw a condition (IP options or an
 * initial fragment) that forces the slow path, so any matching connection
 * must be flushed rather than used.
 */
static int sfe_ipv4_recv_udp(struct sfe_ipv4 *si, struct sk_buff *skb, struct net_device *dev,
			     unsigned int len, struct sfe_ipv4_iphdr *iph, unsigned int ihl, bool flush_on_find)
{
	struct sfe_ipv4_udphdr *udph;
	__be32 src_ip;
	__be32 dest_ip;
	__be16 src_port;
	__be16 dest_port;
	struct sfe_ipv4_connection_match *cm;
	uint8_t ttl;
	struct net_device *xmit_dev;

	/*
	 * Is our packet too short to contain a valid UDP header?
	 */
	if (unlikely(len < (sizeof(struct sfe_ipv4_udphdr) + ihl))) {
		spin_lock(&si->lock);
		si->exception_events[SFE_IPV4_EXCEPTION_EVENT_UDP_HEADER_INCOMPLETE]++;
		si->packets_not_forwarded++;
		spin_unlock(&si->lock);

		DEBUG_TRACE("packet too short for UDP header\n");
		return 0;
	}

	/*
	 * Read the IP address and port information. Read the IP header data first
	 * because we've almost certainly got that in the cache. We may not yet have
	 * the UDP header cached though so allow more time for any prefetching.
	 * Addresses and ports stay in network byte order throughout.
	 */
	src_ip = iph->saddr;
	dest_ip = iph->daddr;

	udph = (struct sfe_ipv4_udphdr *)(skb->data + ihl);
	src_port = udph->source;
	dest_port = udph->dest;

	spin_lock(&si->lock);

	/*
	 * Look for a connection match.
	 */
	cm = sfe_ipv4_find_sfe_ipv4_connection_match(si, dev, IPPROTO_UDP, src_ip, src_port, dest_ip, dest_port);
	if (unlikely(!cm)) {
		si->exception_events[SFE_IPV4_EXCEPTION_EVENT_UDP_NO_CONNECTION]++;
		si->packets_not_forwarded++;
		spin_unlock(&si->lock);

		DEBUG_TRACE("no connection found\n");
		return 0;
	}

	/*
	 * If our packet has been marked as "flush on find" we can't actually
	 * forward it in the fast path, but now that we've found an associated
	 * connection we can flush that out before we process the packet.
	 * (Flush happens after dropping si->lock; flush takes the lock itself.)
	 */
	if (unlikely(flush_on_find)) {
		struct sfe_ipv4_connection *c = cm->connection;
		sfe_ipv4_remove_sfe_ipv4_connection(si, c);
		si->exception_events[SFE_IPV4_EXCEPTION_EVENT_UDP_IP_OPTIONS_OR_INITIAL_FRAGMENT]++;
		si->packets_not_forwarded++;
		spin_unlock(&si->lock);

		DEBUG_TRACE("flush on find\n");
		sfe_ipv4_flush_sfe_ipv4_connection(si, c);
		return 0;
	}

	/*
	 * Does our TTL allow forwarding? We need at least 2 so the decremented
	 * value stays >= 1; anything lower goes back to Linux, which can
	 * generate the ICMP time-exceeded response.
	 */
	ttl = iph->ttl;
	if (unlikely(ttl < 2)) {
		struct sfe_ipv4_connection *c = cm->connection;
		sfe_ipv4_remove_sfe_ipv4_connection(si, c);
		si->exception_events[SFE_IPV4_EXCEPTION_EVENT_UDP_SMALL_TTL]++;
		si->packets_not_forwarded++;
		spin_unlock(&si->lock);

		DEBUG_TRACE("ttl too low\n");
		sfe_ipv4_flush_sfe_ipv4_connection(si, c);
		return 0;
	}

	/*
	 * If our packet is larger than the MTU of the transmit interface then
	 * we can't forward it easily.
	 */
	if (unlikely(len > cm->xmit_dev_mtu)) {
		struct sfe_ipv4_connection *c = cm->connection;
		sfe_ipv4_remove_sfe_ipv4_connection(si, c);
		si->exception_events[SFE_IPV4_EXCEPTION_EVENT_UDP_NEEDS_FRAGMENTATION]++;
		si->packets_not_forwarded++;
		spin_unlock(&si->lock);

		DEBUG_TRACE("larger than mtu\n");
		sfe_ipv4_flush_sfe_ipv4_connection(si, c);
		return 0;
	}

	/*
	 * From this point on we're good to modify the packet.
	 */

	/*
	 * Decrement our TTL.
	 */
	iph->ttl = ttl - 1;

	/*
	 * Do we have to perform translations of the source address/port?
	 */
	if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_SRC)) {
		uint16_t udp_csum;

		iph->saddr = cm->xlate_src_ip;
		udph->source = cm->xlate_src_port;

		/*
		 * Do we have a non-zero UDP checksum? If we do then we need
		 * to update it. (A zero UDP checksum over IPv4 means "not
		 * computed" and must be left alone.) The adjustment is a
		 * precomputed checksum delta; add it and fold the carry.
		 */
		udp_csum = udph->check;
		if (likely(udp_csum)) {
			uint32_t sum = udp_csum + cm->xlate_src_csum_adjustment;
			sum = (sum & 0xffff) + (sum >> 16);
			udph->check = (uint16_t)sum;
		}
	}

	/*
	 * Do we have to perform translations of the destination address/port?
	 */
	if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_DEST)) {
		uint16_t udp_csum;

		iph->daddr = cm->xlate_dest_ip;
		udph->dest = cm->xlate_dest_port;

		/*
		 * Do we have a non-zero UDP checksum? If we do then we need
		 * to update it.
		 */
		udp_csum = udph->check;
		if (likely(udp_csum)) {
			uint32_t sum = udp_csum + cm->xlate_dest_csum_adjustment;
			sum = (sum & 0xffff) + (sum >> 16);
			udph->check = (uint16_t)sum;
		}
	}

	/*
	 * Replace the IP checksum.
	 */
	iph->check = sfe_ipv4_gen_ip_csum(iph);

//	if ((nat_entry_data->tos & FASTNAT_DSCP_MASK) != (iph->tos & FASTNAT_DSCP_MASK)) {
//		ipv4_change_dsfield(iph, (u_int8_t)(~FASTNAT_DSCP_MASK), nat_entry_data->tos);
//	}

//	skb->priority = nat_entry_data->priority;
//	skb->mark = nat_entry_data->mark;

	/*
	 * Update traffic stats.
	 */
	cm->rx_packet_count++;
	cm->rx_byte_count += len;

	/*
	 * If we're not already on the active list then insert ourselves at the tail
	 * of the current list.
	 */
	if (unlikely(!cm->active)) {
		cm->active = true;
		cm->active_prev = si->active_tail;
		if (likely(si->active_tail)) {
			si->active_tail->active_next = cm;
		} else {
			si->active_head = cm;
		}
		si->active_tail = cm;
	}

	xmit_dev = cm->xmit_dev;
	skb->dev = xmit_dev;

	/*
	 * Do we have a simple Ethernet header to write?
	 */
	if (unlikely(!(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_FAST_ETH_HDR))) {
		/*
		 * If this is anything other than a point-to-point interface then we need to
		 * create a header based on MAC addresses.
		 */
		if (likely(!(xmit_dev->flags & IFF_POINTOPOINT))) {
			xmit_dev->header_ops->create(skb, xmit_dev, ETH_P_IP,
						     cm->xmit_dest_mac, cm->xmit_src_mac, len);
		}
	} else {
		/*
		 * Fast path: write the Ethernet header directly.
		 * NOTE(review): MAC addresses are written as three 16-bit
		 * words with htons() on each half — this assumes the
		 * xmit_*_mac fields hold host-order uint16_t triplets;
		 * confirm against the code that populates them.
		 */
		struct sfe_ipv4_ethhdr *eth = (struct sfe_ipv4_ethhdr *)__skb_push(skb, ETH_HLEN);
		eth->h_proto = htons(ETH_P_IP);
		eth->h_dest[0] = htons(cm->xmit_dest_mac[0]);
		eth->h_dest[1] = htons(cm->xmit_dest_mac[1]);
		eth->h_dest[2] = htons(cm->xmit_dest_mac[2]);
		eth->h_source[0] = htons(cm->xmit_src_mac[0]);
		eth->h_source[1] = htons(cm->xmit_src_mac[1]);
		eth->h_source[2] = htons(cm->xmit_src_mac[2]);
	}

	si->packets_forwarded++;
	spin_unlock(&si->lock);

	/*
	 * We're going to check for GSO flags when we transmit the packet so
	 * start fetching the necessary cache line now.
	 */
	prefetch(skb_shinfo(skb));

	/*
	 * Send the packet on its way.
	 */
	dev_queue_xmit(skb);

	return 1;
}
1516
1517/*
1518 * sfe_ipv4_process_tcp_option_sack()
1519 * Parse TCP SACK option and update ack according
1520 */
1521static bool sfe_ipv4_process_tcp_option_sack(const struct sfe_ipv4_tcphdr *th, const uint32_t data_offs,
1522 uint32_t *ack) __attribute__((always_inline));
1523static bool sfe_ipv4_process_tcp_option_sack(const struct sfe_ipv4_tcphdr *th, const uint32_t data_offs,
1524 uint32_t *ack)
1525{
1526 uint32_t length = sizeof(struct sfe_ipv4_tcphdr);
1527 uint8_t *ptr = (uint8_t *)th + length;
1528
1529 /*
1530 * If option is TIMESTAMP discard it.
1531 */
1532 if (likely(data_offs == length + TCPOLEN_TIMESTAMP + 1 + 1)
1533 && likely(ptr[0] == TCPOPT_NOP)
1534 && likely(ptr[1] == TCPOPT_NOP)
1535 && likely(ptr[2] == TCPOPT_TIMESTAMP)
1536 && likely(ptr[3] == TCPOLEN_TIMESTAMP)) {
1537 return true;
1538 }
1539
1540 /*
1541 * TCP options. Parse SACK option.
1542 */
1543 while (length < data_offs) {
1544 uint8_t size;
1545 uint8_t kind;
1546
1547 ptr = (uint8_t *)th + length;
1548 kind = *ptr;
1549
1550 /*
1551 * NOP, for padding
1552 * Not in the switch because to fast escape and to not calculate size
1553 */
1554 if (kind == TCPOPT_NOP) {
1555 length++;
1556 continue;
1557 }
1558
1559 if (kind == TCPOPT_SACK) {
1560 uint32_t sack = 0;
1561 uint8_t re = 1 + 1;
1562
1563 size = *(ptr + 1);
1564 if ((size < (1 + 1 + TCPOLEN_SACK_PERBLOCK))
1565 || ((size - (1 + 1)) % (TCPOLEN_SACK_PERBLOCK))
1566 || (size > (data_offs - length))) {
1567 return false;
1568 }
1569
1570 re += 4;
1571 while (re < size) {
1572 uint32_t sack_re;
1573 uint8_t *sptr = ptr + re;
1574 sack_re = (sptr[0] << 24) | (sptr[1] << 16) | (sptr[2] << 8) | sptr[3];
1575 if (sack_re > sack) {
1576 sack = sack_re;
1577 }
1578 re += TCPOLEN_SACK_PERBLOCK;
1579 }
1580 if (sack > *ack) {
1581 *ack = sack;
1582 }
1583 length += size;
1584 continue;
1585 }
1586 if (kind == TCPOPT_EOL) {
1587 return true;
1588 }
1589 size = *(ptr + 1);
1590 if (size < 2) {
1591 return false;
1592 }
1593 length += size;
1594 }
1595
1596 return true;
1597}
1598
1599/*
1600 * sfe_ipv4_recv_tcp()
1601 * Handle TCP packet receives and forwarding.
1602 */
1603static int sfe_ipv4_recv_tcp(struct sfe_ipv4 *si, struct sk_buff *skb, struct net_device *dev,
1604 unsigned int len, struct sfe_ipv4_iphdr *iph, unsigned int ihl, bool flush_on_find)
1605{
1606 struct sfe_ipv4_tcphdr *tcph;
Dave Hudson87973cd2013-10-22 16:00:04 +01001607 __be32 src_ip;
1608 __be32 dest_ip;
1609 __be16 src_port;
1610 __be16 dest_port;
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01001611 struct sfe_ipv4_connection_match *cm;
1612 struct sfe_ipv4_connection_match *counter_cm;
1613 uint8_t ttl;
1614 uint32_t flags;
1615 struct net_device *xmit_dev;
1616
1617 /*
1618 * Is our packet too short to contain a valid UDP header?
1619 */
1620 if (unlikely(len < (sizeof(struct sfe_ipv4_tcphdr) + ihl))) {
1621 spin_lock(&si->lock);
1622 si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_HEADER_INCOMPLETE]++;
1623 si->packets_not_forwarded++;
1624 spin_unlock(&si->lock);
1625
1626 DEBUG_TRACE("packet too short for TCP header\n");
1627 return 0;
1628 }
1629
1630 /*
1631 * Read the IP address and port information. Read the IP header data first
1632 * because we've almost certainly got that in the cache. We may not yet have
1633 * the TCP header cached though so allow more time for any prefetching.
1634 */
Dave Hudson87973cd2013-10-22 16:00:04 +01001635 src_ip = iph->saddr;
1636 dest_ip = iph->daddr;
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01001637
1638 tcph = (struct sfe_ipv4_tcphdr *)(skb->data + ihl);
Dave Hudson87973cd2013-10-22 16:00:04 +01001639 src_port = tcph->source;
1640 dest_port = tcph->dest;
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01001641 flags = tcp_flag_word(tcph);
1642
1643 spin_lock(&si->lock);
1644
1645 /*
1646 * Look for a connection match.
1647 */
1648 cm = sfe_ipv4_find_sfe_ipv4_connection_match(si, dev, IPPROTO_TCP, src_ip, src_port, dest_ip, dest_port);
1649 if (unlikely(!cm)) {
1650 /*
1651 * We didn't get a connection but as TCP is connection-oriented that
1652 * may be because this is a non-fast connection (not running established).
1653 * For diagnostic purposes we differentiate this here.
1654 */
1655 if (likely((flags & (TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_FIN | TCP_FLAG_ACK)) == TCP_FLAG_ACK)) {
1656 si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_NO_CONNECTION_FAST_FLAGS]++;
1657 si->packets_not_forwarded++;
1658 spin_unlock(&si->lock);
1659
1660 DEBUG_TRACE("no connection found - fast flags\n");
1661 return 0;
1662 }
1663 si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_NO_CONNECTION_SLOW_FLAGS]++;
1664 si->packets_not_forwarded++;
1665 spin_unlock(&si->lock);
1666
1667 DEBUG_TRACE("no connection found - slow flags: 0x%x\n",
1668 flags & (TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_FIN | TCP_FLAG_ACK));
1669 return 0;
1670 }
1671
1672 /*
1673 * If our packet has beern marked as "flush on find" we can't actually
1674 * forward it in the fast path, but now that we've found an associated
1675 * connection we can flush that out before we process the packet.
1676 */
1677 if (unlikely(flush_on_find)) {
1678 struct sfe_ipv4_connection *c = cm->connection;
1679 sfe_ipv4_remove_sfe_ipv4_connection(si, c);
1680 si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_IP_OPTIONS_OR_INITIAL_FRAGMENT]++;
1681 si->packets_not_forwarded++;
1682 spin_unlock(&si->lock);
1683
1684 DEBUG_TRACE("flush on find\n");
1685 sfe_ipv4_flush_sfe_ipv4_connection(si, c);
1686 return 0;
1687 }
1688
1689 /*
1690 * Does our TTL allow forwarding?
1691 */
1692 ttl = iph->ttl;
1693 if (unlikely(ttl < 2)) {
1694 struct sfe_ipv4_connection *c = cm->connection;
1695 sfe_ipv4_remove_sfe_ipv4_connection(si, c);
1696 si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_SMALL_TTL]++;
1697 si->packets_not_forwarded++;
1698 spin_unlock(&si->lock);
1699
1700 DEBUG_TRACE("ttl too low\n");
1701 sfe_ipv4_flush_sfe_ipv4_connection(si, c);
1702 return 0;
1703 }
1704
1705 /*
1706 * If our packet is larger than the MTU of the transmit interface then
1707 * we can't forward it easily.
1708 */
1709 if (unlikely(len > cm->xmit_dev_mtu)) {
1710 struct sfe_ipv4_connection *c = cm->connection;
1711 sfe_ipv4_remove_sfe_ipv4_connection(si, c);
1712 si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_NEEDS_FRAGMENTATION]++;
1713 si->packets_not_forwarded++;
1714 spin_unlock(&si->lock);
1715
1716 DEBUG_TRACE("larger than mtu\n");
1717 sfe_ipv4_flush_sfe_ipv4_connection(si, c);
1718 return 0;
1719 }
1720
1721 /*
1722 * Look at our TCP flags. Anything missing an ACK or that has RST, SYN or FIN
1723 * set is not a fast path packet.
1724 */
1725 if (unlikely((flags & (TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_FIN | TCP_FLAG_ACK)) != TCP_FLAG_ACK)) {
1726 struct sfe_ipv4_connection *c = cm->connection;
1727 sfe_ipv4_remove_sfe_ipv4_connection(si, c);
1728 si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_FLAGS]++;
1729 si->packets_not_forwarded++;
1730 spin_unlock(&si->lock);
1731
1732 DEBUG_TRACE("TCP flags: 0x%x are not fast\n",
1733 flags & (TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_FIN | TCP_FLAG_ACK));
1734 sfe_ipv4_flush_sfe_ipv4_connection(si, c);
1735 return 0;
1736 }
1737
1738 counter_cm = cm->counter_match;
1739
1740 /*
1741 * Are we doing sequence number checking?
1742 */
1743 if (likely(!(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK))) {
1744 uint32_t seq;
1745 uint32_t ack;
1746 uint32_t sack;
1747 uint32_t data_offs;
1748 uint32_t end;
1749 uint32_t left_edge;
1750 uint32_t scaled_win;
1751 uint32_t max_end;
1752
1753 /*
1754 * Is our sequence fully past the right hand edge of the window?
1755 */
Dave Hudson87973cd2013-10-22 16:00:04 +01001756 seq = ntohl(tcph->seq);
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01001757 if (unlikely((int32_t)(seq - (cm->protocol_state.tcp.max_end + 1)) > 0)) {
1758 struct sfe_ipv4_connection *c = cm->connection;
1759 sfe_ipv4_remove_sfe_ipv4_connection(si, c);
1760 si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_SEQ_EXCEEDS_RIGHT_EDGE]++;
1761 si->packets_not_forwarded++;
1762 spin_unlock(&si->lock);
1763
1764 DEBUG_TRACE("seq: %u exceeds right edge: %u\n",
1765 seq, cm->protocol_state.tcp.max_end + 1);
1766 sfe_ipv4_flush_sfe_ipv4_connection(si, c);
1767 return 0;
1768 }
1769
1770 /*
1771 * Check that our TCP data offset isn't too short.
1772 */
1773 data_offs = tcph->doff << 2;
1774 if (unlikely(data_offs < sizeof(struct sfe_ipv4_tcphdr))) {
1775 struct sfe_ipv4_connection *c = cm->connection;
1776 sfe_ipv4_remove_sfe_ipv4_connection(si, c);
1777 si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_SMALL_DATA_OFFS]++;
1778 si->packets_not_forwarded++;
1779 spin_unlock(&si->lock);
1780
1781 DEBUG_TRACE("TCP data offset: %u, too small\n", data_offs);
1782 sfe_ipv4_flush_sfe_ipv4_connection(si, c);
1783 return 0;
1784 }
1785
1786 /*
1787 * Update ACK according to any SACK option.
1788 */
Dave Hudson87973cd2013-10-22 16:00:04 +01001789 ack = ntohl(tcph->ack_seq);
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01001790 sack = ack;
1791 if (unlikely(!sfe_ipv4_process_tcp_option_sack(tcph, data_offs, &sack))) {
1792 struct sfe_ipv4_connection *c = cm->connection;
1793 sfe_ipv4_remove_sfe_ipv4_connection(si, c);
1794 si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_BAD_SACK]++;
1795 si->packets_not_forwarded++;
1796 spin_unlock(&si->lock);
1797
1798 DEBUG_TRACE("TCP option SACK size is wrong\n");
1799 sfe_ipv4_flush_sfe_ipv4_connection(si, c);
1800 return 0;
1801 }
1802
1803 /*
1804 * Check that our TCP data offset isn't past the end of the packet.
1805 */
1806 data_offs += sizeof(struct sfe_ipv4_iphdr);
1807 if (unlikely(len < data_offs)) {
1808 struct sfe_ipv4_connection *c = cm->connection;
1809 sfe_ipv4_remove_sfe_ipv4_connection(si, c);
1810 si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_BIG_DATA_OFFS]++;
1811 si->packets_not_forwarded++;
1812 spin_unlock(&si->lock);
1813
1814 DEBUG_TRACE("TCP data offset: %u, past end of packet: %u\n",
1815 data_offs, len);
1816 sfe_ipv4_flush_sfe_ipv4_connection(si, c);
1817 return 0;
1818 }
1819
1820 end = seq + len - data_offs;
1821
1822 /*
1823 * Is our sequence fully before the left hand edge of the window?
1824 */
1825 if (unlikely((int32_t)(end - (cm->protocol_state.tcp.end
1826 - counter_cm->protocol_state.tcp.max_win - 1)) < 0)) {
1827 struct sfe_ipv4_connection *c = cm->connection;
1828 sfe_ipv4_remove_sfe_ipv4_connection(si, c);
1829 si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_SEQ_BEFORE_LEFT_EDGE]++;
1830 si->packets_not_forwarded++;
1831 spin_unlock(&si->lock);
1832
1833 DEBUG_TRACE("seq: %u before left edge: %u\n",
1834 end, cm->protocol_state.tcp.end - counter_cm->protocol_state.tcp.max_win - 1);
1835 sfe_ipv4_flush_sfe_ipv4_connection(si, c);
1836 return 0;
1837 }
1838
1839 /*
1840 * Are we acking data that is to the right of what has been sent?
1841 */
1842 if (unlikely((int32_t)(sack - (counter_cm->protocol_state.tcp.end + 1)) > 0)) {
1843 struct sfe_ipv4_connection *c = cm->connection;
1844 sfe_ipv4_remove_sfe_ipv4_connection(si, c);
1845 si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_ACK_EXCEEDS_RIGHT_EDGE]++;
1846 si->packets_not_forwarded++;
1847 spin_unlock(&si->lock);
1848
1849 DEBUG_TRACE("ack: %u exceeds right edge: %u\n",
1850 sack, counter_cm->protocol_state.tcp.end + 1);
1851 sfe_ipv4_flush_sfe_ipv4_connection(si, c);
1852 return 0;
1853 }
1854
1855 /*
1856 * Is our ack too far before the left hand edge of the window?
1857 */
1858 left_edge = counter_cm->protocol_state.tcp.end
1859 - cm->protocol_state.tcp.max_win
1860 - SFE_IPV4_TCP_MAX_ACK_WINDOW
1861 - 1;
1862 if (unlikely((int32_t)(sack - left_edge) < 0)) {
1863 struct sfe_ipv4_connection *c = cm->connection;
1864 sfe_ipv4_remove_sfe_ipv4_connection(si, c);
1865 si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_ACK_BEFORE_LEFT_EDGE]++;
1866 si->packets_not_forwarded++;
1867 spin_unlock(&si->lock);
1868
1869 DEBUG_TRACE("ack: %u before left edge: %u\n", sack, left_edge);
1870 sfe_ipv4_flush_sfe_ipv4_connection(si, c);
1871 return 0;
1872 }
1873
1874 /*
1875 * Have we just seen the largest window size yet for this connection? If yes
1876 * then we need to record the new value.
1877 */
Dave Hudson87973cd2013-10-22 16:00:04 +01001878 scaled_win = ntohs(tcph->window) << cm->protocol_state.tcp.win_scale;
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01001879 scaled_win += (sack - ack);
1880 if (unlikely(cm->protocol_state.tcp.max_win < scaled_win)) {
1881 cm->protocol_state.tcp.max_win = scaled_win;
1882 }
1883
1884 /*
1885 * If our sequence and/or ack numbers have advanced then record the new state.
1886 */
1887 if (likely((int32_t)(end - cm->protocol_state.tcp.end) >= 0)) {
1888 cm->protocol_state.tcp.end = end;
1889 }
1890
1891 max_end = sack + scaled_win;
1892 if (likely((int32_t)(max_end - counter_cm->protocol_state.tcp.max_end) >= 0)) {
1893 counter_cm->protocol_state.tcp.max_end = max_end;
1894 }
1895 }
1896
1897 /*
1898 * From this point on we're good to modify the packet.
1899 */
1900
1901 /*
1902 * Decrement our TTL.
1903 */
1904 iph->ttl = ttl - 1;
1905
1906 /*
1907 * Do we have to perform translations of the source address/port?
1908 */
1909 if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_SRC)) {
1910 uint16_t tcp_csum;
1911 uint32_t sum;
1912
Dave Hudson87973cd2013-10-22 16:00:04 +01001913 iph->saddr = cm->xlate_src_ip;
1914 tcph->source = cm->xlate_src_port;
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01001915
		/*
		 * Adjust the TCP checksum to account for the translated
		 * source address/port.
		 */
Dave Hudson87973cd2013-10-22 16:00:04 +01001920 tcp_csum = tcph->check;
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01001921 sum = tcp_csum + cm->xlate_src_csum_adjustment;
1922 sum = (sum & 0xffff) + (sum >> 16);
Dave Hudson87973cd2013-10-22 16:00:04 +01001923 tcph->check = (uint16_t)sum;
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01001924 }
1925
1926 /*
1927 * Do we have to perform translations of the destination address/port?
1928 */
1929 if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_DEST)) {
1930 uint16_t tcp_csum;
1931 uint32_t sum;
1932
Dave Hudson87973cd2013-10-22 16:00:04 +01001933 iph->daddr = cm->xlate_dest_ip;
1934 tcph->dest = cm->xlate_dest_port;
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01001935
		/*
		 * Adjust the TCP checksum to account for the translated
		 * destination address/port.
		 */
Dave Hudson87973cd2013-10-22 16:00:04 +01001940 tcp_csum = tcph->check;
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01001941 sum = tcp_csum + cm->xlate_dest_csum_adjustment;
1942 sum = (sum & 0xffff) + (sum >> 16);
Dave Hudson87973cd2013-10-22 16:00:04 +01001943 tcph->check = (uint16_t)sum;
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01001944 }
1945
1946 /*
1947 * Replace the IP checksum.
1948 */
1949 iph->check = sfe_ipv4_gen_ip_csum(iph);
1950
1951// if ((nat_entry_data->tos & FASTNAT_DSCP_MASK) != (iph->tos & FASTNAT_DSCP_MASK)) {
1952// ipv4_change_dsfield(iph, (u_int8_t)(~FASTNAT_DSCP_MASK), nat_entry_data->tos);
1953// }
1954
1955// skb->priority = nat_entry_data->priority;
1956// skb->mark = nat_entry_data->mark;
1957
1958 /*
1959 * Update traffic stats.
1960 */
1961 cm->rx_packet_count++;
1962 cm->rx_byte_count += len;
1963
1964 /*
1965 * If we're not already on the active list then insert ourselves at the tail
1966 * of the current list.
1967 */
1968 if (unlikely(!cm->active)) {
1969 cm->active = true;
1970 cm->active_prev = si->active_tail;
1971 if (likely(si->active_tail)) {
1972 si->active_tail->active_next = cm;
1973 } else {
1974 si->active_head = cm;
1975 }
1976 si->active_tail = cm;
1977 }
1978
1979 xmit_dev = cm->xmit_dev;
1980 skb->dev = xmit_dev;
1981
1982 /*
1983 * Do we have a simple Ethernet header to write?
1984 */
1985 if (unlikely(!(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_FAST_ETH_HDR))) {
1986 /*
1987 * If this is anything other than a point-to-point interface then we need to
1988 * create a header based on MAC addresses.
1989 */
1990 if (likely(!(xmit_dev->flags & IFF_POINTOPOINT))) {
1991 xmit_dev->header_ops->create(skb, xmit_dev, ETH_P_IP,
1992 cm->xmit_dest_mac, cm->xmit_src_mac, len);
1993 }
1994 } else {
1995 struct sfe_ipv4_ethhdr *eth = (struct sfe_ipv4_ethhdr *)__skb_push(skb, ETH_HLEN);
1996 eth->h_proto = htons(ETH_P_IP);
1997 eth->h_dest[0] = htons(cm->xmit_dest_mac[0]);
1998 eth->h_dest[1] = htons(cm->xmit_dest_mac[1]);
1999 eth->h_dest[2] = htons(cm->xmit_dest_mac[2]);
2000 eth->h_source[0] = htons(cm->xmit_src_mac[0]);
2001 eth->h_source[1] = htons(cm->xmit_src_mac[1]);
2002 eth->h_source[2] = htons(cm->xmit_src_mac[2]);
2003 }
2004
2005 si->packets_forwarded++;
2006 spin_unlock(&si->lock);
2007
2008 /*
2009 * We're going to check for GSO flags when we transmit the packet so
2010 * start fetching the necessary cache line now.
2011 */
2012 prefetch(skb_shinfo(skb));
2013
2014 /*
2015 * Send the packet on its way.
2016 */
2017 dev_queue_xmit(skb);
2018
2019 return 1;
2020}
2021
/*
 * sfe_ipv4_recv_icmp()
 *	Handle ICMP packet receives.
 *
 * ICMP packets aren't handled as a "fast path" and always have us process them
 * through the default Linux stack. What we do need to do is look for any errors
 * about connections we are handling in the fast path. If we find any such
 * connections then we want to flush their state so that the ICMP error path
 * within Linux has all of the correct state should it need it.
 *
 * len is the length of the IP datagram (skb->len at the caller), iph points at
 * its header and ihl is the IP header length in bytes.
 *
 * Returns 0 in all cases: ICMP packets are never forwarded by the fast path
 * and are always handed back to the Linux stack.
 */
static int sfe_ipv4_recv_icmp(struct sfe_ipv4 *si, struct sk_buff *skb, struct net_device *dev,
			      unsigned int len, struct sfe_ipv4_iphdr *iph, unsigned int ihl)
{
	struct icmphdr *icmph;
	struct sfe_ipv4_iphdr *icmp_iph;
	unsigned int icmp_ihl_words;
	unsigned int icmp_ihl;
	uint32_t *icmp_trans_h;
	struct sfe_ipv4_udphdr *icmp_udph;
	struct sfe_ipv4_tcphdr *icmp_tcph;
	__be32 src_ip;
	__be32 dest_ip;
	__be16 src_port;
	__be16 dest_port;
	struct sfe_ipv4_connection_match *cm;
	struct sfe_ipv4_connection *c;

	/*
	 * Is our packet too short to contain a valid ICMP header?
	 */
	len -= ihl;
	if (unlikely(len < sizeof(struct icmphdr))) {
		spin_lock(&si->lock);
		si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_HEADER_INCOMPLETE]++;
		si->packets_not_forwarded++;
		spin_unlock(&si->lock);

		DEBUG_TRACE("packet too short for ICMP header\n");
		return 0;
	}

	/*
	 * We only handle "destination unreachable" and "time exceeded" messages.
	 */
	icmph = (struct icmphdr *)(skb->data + ihl);
	if ((icmph->type != ICMP_DEST_UNREACH)
	    && (icmph->type != ICMP_TIME_EXCEEDED)) {
		spin_lock(&si->lock);
		si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_UNHANDLED_TYPE]++;
		si->packets_not_forwarded++;
		spin_unlock(&si->lock);

		DEBUG_TRACE("unhandled ICMP type: 0x%x\n", icmph->type);
		return 0;
	}

	/*
	 * Do we have the full embedded IP header?
	 * (Both handled ICMP types carry the offending packet's IP header
	 * immediately after the ICMP header.)
	 */
	len -= sizeof(struct icmphdr);
	if (unlikely(len < sizeof(struct sfe_ipv4_iphdr))) {
		spin_lock(&si->lock);
		si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_HEADER_INCOMPLETE]++;
		si->packets_not_forwarded++;
		spin_unlock(&si->lock);

		DEBUG_TRACE("Embedded IP header not complete\n");
		return 0;
	}

	/*
	 * Is our embedded IP version wrong?
	 */
	icmp_iph = (struct sfe_ipv4_iphdr *)(icmph + 1);
	if (unlikely(icmp_iph->version != 4)) {
		spin_lock(&si->lock);
		si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_NON_V4]++;
		si->packets_not_forwarded++;
		spin_unlock(&si->lock);

		DEBUG_TRACE("IP version: %u\n", icmp_iph->version);
		return 0;
	}

	/*
	 * Do we have the full embedded IP header, including any options?
	 */
	icmp_ihl_words = icmp_iph->ihl;
	icmp_ihl = icmp_ihl_words << 2;
	if (unlikely(len < icmp_ihl)) {
		spin_lock(&si->lock);
		si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_IP_OPTIONS_INCOMPLETE]++;
		si->packets_not_forwarded++;
		spin_unlock(&si->lock);

		DEBUG_TRACE("Embedded header not large enough for IP options\n");
		return 0;
	}

	/*
	 * Skip over the embedded IP header (including any options) to reach
	 * the embedded transport layer header.
	 */
	len -= icmp_ihl;
	icmp_trans_h = ((uint32_t *)icmp_iph) + icmp_ihl_words;

	/*
	 * Handle the embedded transport layer header.
	 */
	switch (icmp_iph->protocol) {
	case IPPROTO_UDP:
		/*
		 * We should have 8 bytes of UDP header - that's enough to identify
		 * the connection.
		 */
		if (unlikely(len < 8)) {
			spin_lock(&si->lock);
			si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_UDP_HEADER_INCOMPLETE]++;
			si->packets_not_forwarded++;
			spin_unlock(&si->lock);

			DEBUG_TRACE("Incomplete embedded UDP header\n");
			return 0;
		}

		icmp_udph = (struct sfe_ipv4_udphdr *)icmp_trans_h;
		src_port = icmp_udph->source;
		dest_port = icmp_udph->dest;
		break;

	case IPPROTO_TCP:
		/*
		 * We should have 8 bytes of TCP header - that's enough to identify
		 * the connection.
		 */
		if (unlikely(len < 8)) {
			spin_lock(&si->lock);
			si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_TCP_HEADER_INCOMPLETE]++;
			si->packets_not_forwarded++;
			spin_unlock(&si->lock);

			DEBUG_TRACE("Incomplete embedded TCP header\n");
			return 0;
		}

		icmp_tcph = (struct sfe_ipv4_tcphdr *)icmp_trans_h;
		src_port = icmp_tcph->source;
		dest_port = icmp_tcph->dest;
		break;

	default:
		spin_lock(&si->lock);
		si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_UNHANDLED_PROTOCOL]++;
		si->packets_not_forwarded++;
		spin_unlock(&si->lock);

		DEBUG_TRACE("Unhandled embedded IP protocol: %u\n", icmp_iph->protocol);
		return 0;
	}

	/* Addresses/ports stay in network byte order throughout. */
	src_ip = icmp_iph->saddr;
	dest_ip = icmp_iph->daddr;

	spin_lock(&si->lock);

	/*
	 * Look for a connection match. Note that we reverse the source and destination
	 * here because our embedded message contains a packet that was sent in the
	 * opposite direction to the one in which we just received it. It will have
	 * been sent on the interface from which we received it though so that's still
	 * ok to use.
	 */
	cm = sfe_ipv4_find_sfe_ipv4_connection_match(si, dev, icmp_iph->protocol, dest_ip, dest_port, src_ip, src_port);
	if (unlikely(!cm)) {
		si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_NO_CONNECTION]++;
		si->packets_not_forwarded++;
		spin_unlock(&si->lock);

		DEBUG_TRACE("no connection found\n");
		return 0;
	}

	/*
	 * We found a connection so now remove it from the connection list and flush
	 * its state. The flush is done outside the lock.
	 */
	c = cm->connection;
	sfe_ipv4_remove_sfe_ipv4_connection(si, c);
	si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_FLUSHED_CONNECTION]++;
	si->packets_not_forwarded++;
	spin_unlock(&si->lock);

	sfe_ipv4_flush_sfe_ipv4_connection(si, c);
	return 0;
}
2213
/*
 * sfe_ipv4_recv()
 *	Handle packet receives and forwarding.
 *
 * Validates the IP header, then dispatches to the per-protocol fast-path
 * handlers (UDP, TCP, ICMP). Anything else is counted as an exception and
 * left to the normal Linux stack.
 *
 * Returns 1 if the packet is forwarded or 0 if it isn't.
 */
static int sfe_ipv4_recv(struct sk_buff *skb)
{
	struct sfe_ipv4 *si = &__si;
	struct net_device *dev;
#if (SFE_HOOK_ABOVE_BRIDGE)
	struct in_device *in_dev;
#endif
	unsigned int len;
	unsigned int tot_len;
	unsigned int frag_off;
	unsigned int ihl;
	bool flush_on_find;
	bool ip_options;
	struct sfe_ipv4_iphdr *iph;
	uint32_t protocol;

	/*
	 * We know that for the vast majority of packets we need the transport
	 * layer header so we may as well start to fetch it now!
	 */
	prefetch(skb->data + 32);
	barrier();

	dev = skb->dev;

#if (SFE_HOOK_ABOVE_BRIDGE)
	/*
	 * Does our input device support IP processing?
	 */
	in_dev = (struct in_device *)dev->ip_ptr;
	if (unlikely(!in_dev)) {
		DEBUG_TRACE("no IP processing for device: %s\n", dev->name);
		return 0;
	}

	/*
	 * Does it have an IP address? If it doesn't then we can't do anything
	 * interesting here!
	 */
	if (unlikely(!in_dev->ifa_list)) {
		DEBUG_TRACE("no IP address for device: %s\n", dev->name);
		return 0;
	}
#endif

	/*
	 * We're only interested in IP packets. The constant is on the htons()
	 * side so the byte swap is evaluated at compile time, not per packet.
	 */
	if (unlikely(htons(ETH_P_IP) != skb->protocol)) {
		DEBUG_TRACE("not IP packet\n");
		return 0;
	}

	/*
	 * Check that we have space for an IP header here.
	 */
	len = skb->len;
	if (unlikely(len < sizeof(struct sfe_ipv4_iphdr))) {
		spin_lock(&si->lock);
		si->exception_events[SFE_IPV4_EXCEPTION_EVENT_HEADER_INCOMPLETE]++;
		si->packets_not_forwarded++;
		spin_unlock(&si->lock);

		DEBUG_TRACE("len: %u is too short\n", len);
		return 0;
	}

	/*
	 * Check that our "total length" is large enough for an IP header.
	 */
	iph = (struct sfe_ipv4_iphdr *)skb->data;
	tot_len = ntohs(iph->tot_len);
	if (unlikely(tot_len < sizeof(struct sfe_ipv4_iphdr))) {
		spin_lock(&si->lock);
		si->exception_events[SFE_IPV4_EXCEPTION_EVENT_BAD_TOTAL_LENGTH]++;
		si->packets_not_forwarded++;
		spin_unlock(&si->lock);

		DEBUG_TRACE("tot_len: %u is too short\n", tot_len);
		return 0;
	}

	/*
	 * Is our IP version wrong?
	 */
	if (unlikely(iph->version != 4)) {
		spin_lock(&si->lock);
		si->exception_events[SFE_IPV4_EXCEPTION_EVENT_NON_V4]++;
		si->packets_not_forwarded++;
		spin_unlock(&si->lock);

		DEBUG_TRACE("IP version: %u\n", iph->version);
		return 0;
	}

	/*
	 * Does our datagram fit inside the skb?
	 */
	if (unlikely(tot_len > len)) {
		spin_lock(&si->lock);
		si->exception_events[SFE_IPV4_EXCEPTION_EVENT_DATAGRAM_INCOMPLETE]++;
		si->packets_not_forwarded++;
		spin_unlock(&si->lock);

		DEBUG_TRACE("tot_len: %u, exceeds len: %u\n", tot_len, len);
		return 0;
	}

	/*
	 * Do we have a non-initial fragment? Those can never be matched
	 * in the fast path (no transport header), so punt to Linux.
	 */
	frag_off = ntohs(iph->frag_off);
	if (unlikely(frag_off & IP_OFFSET)) {
		spin_lock(&si->lock);
		si->exception_events[SFE_IPV4_EXCEPTION_EVENT_NON_INITIAL_FRAGMENT]++;
		si->packets_not_forwarded++;
		spin_unlock(&si->lock);

		DEBUG_TRACE("non-initial fragment\n");
		return 0;
	}

	/*
	 * If we have a (first) fragment then mark it to cause any connection to flush.
	 */
	flush_on_find = unlikely(frag_off & IP_MF) ? true : false;

	/*
	 * Do we have any IP options? That's definitely a slow path! If we do have IP
	 * options we need to recheck our header size.
	 */
	ihl = iph->ihl << 2;
	ip_options = unlikely(ihl != sizeof(struct sfe_ipv4_iphdr)) ? true : false;
	if (unlikely(ip_options)) {
		if (unlikely(len < ihl)) {
			spin_lock(&si->lock);
			si->exception_events[SFE_IPV4_EXCEPTION_EVENT_IP_OPTIONS_INCOMPLETE]++;
			si->packets_not_forwarded++;
			spin_unlock(&si->lock);

			DEBUG_TRACE("len: %u is too short for header of size: %u\n", len, ihl);
			return 0;
		}

		flush_on_find = true;
	}

	/*
	 * Dispatch by transport protocol.
	 */
	protocol = iph->protocol;
	if (IPPROTO_UDP == protocol) {
		return sfe_ipv4_recv_udp(si, skb, dev, len, iph, ihl, flush_on_find);
	}

	if (IPPROTO_TCP == protocol) {
		return sfe_ipv4_recv_tcp(si, skb, dev, len, iph, ihl, flush_on_find);
	}

	if (IPPROTO_ICMP == protocol) {
		return sfe_ipv4_recv_icmp(si, skb, dev, len, iph, ihl);
	}

	spin_lock(&si->lock);
	si->exception_events[SFE_IPV4_EXCEPTION_EVENT_UNHANDLED_PROTOCOL]++;
	si->packets_not_forwarded++;
	spin_unlock(&si->lock);

	DEBUG_TRACE("not UDP, TCP or ICMP: %u\n", protocol);
	return 0;
}
2388
2389/*
2390 * sfe_ipv4_find_mac_addr()
2391 * Find the MAC address for a given IPv4 address.
2392 *
2393 * Returns true if we find the MAC address, otherwise false.
2394 *
2395 * We look up the rtable entry for the address and, from its neighbour
2396 * structure, obtain the hardware address. This means this function also
2397 * works if the neighbours are routers too.
2398 */
2399static bool sfe_ipv4_find_mac_addr(uint32_t addr, uint8_t *mac_addr)
2400{
2401 struct neighbour *neigh;
2402 struct rtable *rt;
2403 struct dst_entry *dst;
2404 struct net_device *dev;
2405
2406 /*
2407 * Look up the rtable entry for the IP address then get the hardware
2408 * address from its neighbour structure. This means this work when the
2409 * neighbours are routers too.
2410 */
2411 rt = ip_route_output(&init_net, addr, 0, 0, 0);
2412 if (unlikely(IS_ERR(rt))) {
2413 return false;
2414 }
2415
2416 dst = (struct dst_entry *)rt;
2417
2418 rcu_read_lock();
2419 neigh = dst_get_neighbour_noref(dst);
2420 if (unlikely(!neigh)) {
2421 rcu_read_unlock();
2422 dst_release(dst);
2423 return false;
2424 }
2425
2426 if (unlikely(!(neigh->nud_state & NUD_VALID))) {
2427 rcu_read_unlock();
2428 dst_release(dst);
2429 return false;
2430 }
2431
2432 dev = neigh->dev;
2433 if (!dev) {
2434 rcu_read_unlock();
2435 dst_release(dst);
2436 return false;
2437 }
2438
2439 memcpy(mac_addr, neigh->ha, (size_t)dev->addr_len);
2440 rcu_read_unlock();
2441
2442 dst_release(dst);
2443
2444 /*
2445 * We're only interested in unicast MAC addresses - if it's not a unicast
2446 * address then our IP address mustn't be unicast either.
2447 */
2448 if (is_multicast_ether_addr(mac_addr)) {
2449 DEBUG_TRACE("MAC is non-unicast - ignoring\n");
2450 return false;
2451 }
2452
2453 return true;
2454}
2455
2456/*
2457 * sfe_ipv4_create_rule()
2458 * Create a forwarding rule.
2459 */
2460static void sfe_ipv4_create_rule(struct sfe_ipv4 *si, struct sfe_ipv4_create *sic)
2461{
2462 struct sfe_ipv4_connection *c;
2463 struct sfe_ipv4_connection_match *original_cm;
2464 struct sfe_ipv4_connection_match *reply_cm;
2465
2466 spin_lock_bh(&si->lock);
2467 si->connection_create_requests++;
2468
2469 /*
2470 * Check to see if there is already a flow that matches the rule we're trying
2471 * to create. If there is then we can't create a new one.
2472 */
2473 c = sfe_ipv4_find_sfe_ipv4_connection(si, sic->protocol, sic->src_ip, sic->src_port,
2474 sic->dest_ip, sic->dest_port);
2475 if (c) {
2476 si->connection_create_collisions++;
2477
2478 /*
2479 * If we already have the flow then it's likely that this request to
2480 * create the connection rule contains more up-to-date information.
2481 * Check and update accordingly.
2482 */
2483 original_cm = c->original_match;
2484 reply_cm = c->reply_match;
2485
2486 switch (sic->protocol) {
2487 case IPPROTO_TCP:
2488 if (original_cm->protocol_state.tcp.max_win < sic->src_td_max_window) {
2489 original_cm->protocol_state.tcp.max_win = sic->src_td_max_window;
2490 }
2491 if ((int32_t)(original_cm->protocol_state.tcp.end - sic->src_td_end) < 0) {
2492 original_cm->protocol_state.tcp.end = sic->src_td_end;
2493 }
2494 if ((int32_t)(original_cm->protocol_state.tcp.max_end - sic->src_td_max_end) < 0) {
2495 original_cm->protocol_state.tcp.max_end = sic->src_td_max_end;
2496 }
2497 if (reply_cm->protocol_state.tcp.max_win < sic->dest_td_max_window) {
2498 reply_cm->protocol_state.tcp.max_win = sic->dest_td_max_window;
2499 }
2500 if ((int32_t)(reply_cm->protocol_state.tcp.end - sic->dest_td_end) < 0) {
2501 reply_cm->protocol_state.tcp.end = sic->dest_td_end;
2502 }
2503 if ((int32_t)(reply_cm->protocol_state.tcp.max_end - sic->dest_td_max_end) < 0) {
2504 reply_cm->protocol_state.tcp.max_end = sic->dest_td_max_end;
2505 }
2506 original_cm->flags &= ~SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK;
2507 reply_cm->flags &= ~SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK;
2508 if (sic->flags & SFE_IPV4_CREATE_FLAG_NO_SEQ_CHECK) {
2509 original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK;
2510 reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK;
2511 }
2512 break;
2513 }
2514
2515 spin_unlock_bh(&si->lock);
2516
2517 DEBUG_TRACE("connection already exists - p: %d\n"
2518 " s: %s:%pM:%pI4:%u, d: %s:%pM:%pI4:%u\n",
Dave Hudson87973cd2013-10-22 16:00:04 +01002519 sic->protocol, sic->src_dev->name, sic->src_mac, &sic->src_ip, ntohs(sic->src_port),
2520 sic->dest_dev->name, sic->dest_mac, &sic->dest_ip, ntohs(sic->dest_port));
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01002521 return;
2522 }
2523
2524 /*
2525 * Allocate the various connection tracking objects.
2526 */
2527 c = (struct sfe_ipv4_connection *)kmalloc(sizeof(struct sfe_ipv4_connection), GFP_ATOMIC);
2528 if (unlikely(!c)) {
2529 spin_unlock_bh(&si->lock);
2530 return;
2531 }
2532
2533 original_cm = (struct sfe_ipv4_connection_match *)kmalloc(sizeof(struct sfe_ipv4_connection_match), GFP_ATOMIC);
2534 if (unlikely(!original_cm)) {
2535 spin_unlock_bh(&si->lock);
2536 kfree(c);
2537 return;
2538 }
2539
2540 reply_cm = (struct sfe_ipv4_connection_match *)kmalloc(sizeof(struct sfe_ipv4_connection_match), GFP_ATOMIC);
2541 if (unlikely(!reply_cm)) {
2542 spin_unlock_bh(&si->lock);
2543 kfree(original_cm);
2544 kfree(c);
2545 return;
2546 }
2547
2548 /*
2549 * Fill in the "original" direction connection matching object.
2550 * Note that the transmit MAC address is "dest_mac_xlate" because
2551 * we always know both ends of a connection by their translated
2552 * addresses and not their public addresses.
2553 */
2554 original_cm->match_dev = sic->src_dev;
2555 original_cm->match_protocol = sic->protocol;
2556 original_cm->match_src_ip = sic->src_ip;
2557 original_cm->match_src_port = sic->src_port;
2558 original_cm->match_dest_ip = sic->dest_ip;
2559 original_cm->match_dest_port = sic->dest_port;
2560 original_cm->xlate_src_ip = sic->src_ip_xlate;
2561 original_cm->xlate_src_port = sic->src_port_xlate;
2562 original_cm->xlate_dest_ip = sic->dest_ip_xlate;
2563 original_cm->xlate_dest_port = sic->dest_port_xlate;
2564 original_cm->rx_packet_count = 0;
2565 original_cm->rx_packet_count64 = 0;
2566 original_cm->rx_byte_count = 0;
2567 original_cm->rx_byte_count64 = 0;
2568 original_cm->xmit_dev = sic->dest_dev;
2569 original_cm->xmit_dev_mtu = sic->dest_mtu;
2570 memcpy(original_cm->xmit_src_mac, sic->dest_dev->dev_addr, ETH_ALEN);
2571 memcpy(original_cm->xmit_dest_mac, sic->dest_mac_xlate, ETH_ALEN);
2572 original_cm->connection = c;
2573 original_cm->counter_match = reply_cm;
2574 original_cm->flags = 0;
2575 original_cm->active_next = NULL;
2576 original_cm->active_prev = NULL;
2577 original_cm->active = false;
2578 if (sic->dest_dev->header_ops->create == eth_header) {
2579 original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_FAST_ETH_HDR;
2580 }
2581
2582 /*
2583 * Fill in the "reply" direction connection matching object.
2584 */
2585 reply_cm->match_dev = sic->dest_dev;
2586 reply_cm->match_protocol = sic->protocol;
2587 reply_cm->match_src_ip = sic->dest_ip_xlate;
2588 reply_cm->match_src_port = sic->dest_port_xlate;
2589 reply_cm->match_dest_ip = sic->src_ip_xlate;
2590 reply_cm->match_dest_port = sic->src_port_xlate;
2591 reply_cm->xlate_src_ip = sic->dest_ip;
2592 reply_cm->xlate_src_port = sic->dest_port;
2593 reply_cm->xlate_dest_ip = sic->src_ip;
2594 reply_cm->xlate_dest_port = sic->src_port;
2595 reply_cm->rx_packet_count = 0;
2596 reply_cm->rx_packet_count64 = 0;
2597 reply_cm->rx_byte_count = 0;
2598 reply_cm->rx_byte_count64 = 0;
2599 reply_cm->xmit_dev = sic->src_dev;
2600 reply_cm->xmit_dev_mtu = sic->src_mtu;
2601 memcpy(reply_cm->xmit_src_mac, sic->src_dev->dev_addr, ETH_ALEN);
2602 memcpy(reply_cm->xmit_dest_mac, sic->src_mac, ETH_ALEN);
2603 reply_cm->connection = c;
2604 reply_cm->counter_match = original_cm;
2605 reply_cm->flags = 0;
2606 reply_cm->active_next = NULL;
2607 reply_cm->active_prev = NULL;
2608 reply_cm->active = false;
2609 if (sic->src_dev->header_ops->create == eth_header) {
2610 reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_FAST_ETH_HDR;
2611 }
2612
2613 if (sic->dest_ip != sic->dest_ip_xlate || sic->dest_port != sic->dest_port_xlate) {
2614 original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_DEST;
2615 reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_SRC;
2616 }
2617
2618 if (sic->src_ip != sic->src_ip_xlate || sic->src_port != sic->src_port_xlate) {
2619 original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_SRC;
2620 reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_DEST;
2621 }
2622
2623 c->protocol = sic->protocol;
2624 c->src_ip = sic->src_ip;
2625 c->src_ip_xlate = sic->src_ip_xlate;
2626 c->src_port = sic->src_port;
2627 c->src_port_xlate = sic->src_port_xlate;
2628 c->original_dev = sic->src_dev;
2629 c->original_match = original_cm;
2630 c->dest_ip = sic->dest_ip;
2631 c->dest_ip_xlate = sic->dest_ip_xlate;
2632 c->dest_port = sic->dest_port;
2633 c->dest_port_xlate = sic->dest_port_xlate;
2634 c->reply_dev = sic->dest_dev;
2635 c->reply_match = reply_cm;
2636
2637 c->last_sync_jiffies = get_jiffies_64();
2638 c->iterators = 0;
2639 c->pending_free = false;
2640
2641 /*
2642 * Take hold of our source and dest devices for the duration of the connection.
2643 */
2644 dev_hold(c->original_dev);
2645 dev_hold(c->reply_dev);
2646
2647 /*
2648 * Initialize the protocol-specific information that we track.
2649 */
2650 switch (sic->protocol) {
2651 case IPPROTO_TCP:
2652 original_cm->protocol_state.tcp.win_scale = sic->src_td_window_scale;
2653 original_cm->protocol_state.tcp.max_win = sic->src_td_max_window ? sic->src_td_max_window : 1;
2654 original_cm->protocol_state.tcp.end = sic->src_td_end;
2655 original_cm->protocol_state.tcp.max_end = sic->src_td_max_end;
2656 reply_cm->protocol_state.tcp.win_scale = sic->dest_td_window_scale;
2657 reply_cm->protocol_state.tcp.max_win = sic->dest_td_max_window ? sic->dest_td_max_window : 1;
2658 reply_cm->protocol_state.tcp.end = sic->dest_td_end;
2659 reply_cm->protocol_state.tcp.max_end = sic->dest_td_max_end;
2660 if (sic->flags & SFE_IPV4_CREATE_FLAG_NO_SEQ_CHECK) {
2661 original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK;
2662 reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK;
2663 }
2664 break;
2665 }
2666
2667 sfe_ipv4_connection_match_compute_translations(original_cm);
2668 sfe_ipv4_connection_match_compute_translations(reply_cm);
2669 sfe_ipv4_insert_sfe_ipv4_connection(si, c);
2670
2671 spin_unlock_bh(&si->lock);
2672
2673 /*
2674 * We have everything we need!
2675 */
2676 DEBUG_INFO("new connection - p: %d\n"
2677 " s: %s:%pM(%pM):%pI4(%pI4):%u(%u)\n"
2678 " d: %s:%pM(%pM):%pI4(%pI4):%u(%u)\n",
2679 sic->protocol,
2680 sic->src_dev->name, sic->src_mac, sic->src_mac_xlate,
Dave Hudson87973cd2013-10-22 16:00:04 +01002681 &sic->src_ip, &sic->src_ip_xlate, ntohs(sic->src_port), ntohs(sic->src_port_xlate),
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01002682 sic->dest_dev->name, sic->dest_mac, sic->dest_mac_xlate,
Dave Hudson87973cd2013-10-22 16:00:04 +01002683 &sic->dest_ip, &sic->dest_ip_xlate, ntohs(sic->dest_port), ntohs(sic->dest_port_xlate));
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01002684}
2685
2686/*
2687 * sfe_ipv4_post_routing_hook()
2688 * Called for packets about to leave the box - either locally generated or forwarded from another interface
2689 */
2690static unsigned int sfe_ipv4_post_routing_hook(unsigned int hooknum,
2691 struct sk_buff *skb,
2692 const struct net_device *in_unused,
2693 const struct net_device *out,
2694 int (*okfn)(struct sk_buff *))
2695{
2696 struct sfe_ipv4 *si = &__si;
2697 struct sfe_ipv4_create sic;
2698 struct net_device *in;
2699 struct nf_conn *ct;
2700 enum ip_conntrack_info ctinfo;
2701 struct net_device *src_dev;
2702 struct net_device *dest_dev;
2703 struct net_device *src_br_dev = NULL;
2704 struct net_device *dest_br_dev = NULL;
2705 struct nf_conntrack_tuple orig_tuple;
2706 struct nf_conntrack_tuple reply_tuple;
2707
2708 /*
2709 * If operations have paused then do not process packets.
2710 */
2711 spin_lock_bh(&si->lock);
2712 if (unlikely(si->pause)) {
2713 DEBUG_TRACE("paused, ignoring\n");
2714 spin_unlock_bh(&si->lock);
2715 return NF_ACCEPT;
2716 }
2717
2718 spin_unlock_bh(&si->lock);
2719
2720 /*
2721 * Don't process broadcast or multicast packets.
2722 */
2723 if (unlikely(skb->pkt_type == PACKET_BROADCAST)) {
2724 DEBUG_TRACE("broadcast, ignoring\n");
2725 return NF_ACCEPT;
2726 }
2727 if (unlikely(skb->pkt_type == PACKET_MULTICAST)) {
2728 DEBUG_TRACE("multicast, ignoring\n");
2729 return NF_ACCEPT;
2730 }
2731
2732 /*
2733 * Don't process packets that are not being forwarded.
2734 */
2735 in = dev_get_by_index(&init_net, skb->skb_iif);
2736 if (!in) {
2737 DEBUG_TRACE("packet not forwarding\n");
2738 return NF_ACCEPT;
2739 }
2740
2741 /*
2742 * Don't process packets with non-standard 802.3 MAC address sizes.
2743 */
2744 if (unlikely(in->addr_len != ETH_ALEN)) {
2745 DEBUG_TRACE("in device: %s not 802.3 hw addr len: %u, ignoring\n",
2746 in->name, (unsigned)in->addr_len);
2747 goto done1;
2748 }
2749 if (unlikely(out->addr_len != ETH_ALEN)) {
2750 DEBUG_TRACE("out device: %s not 802.3 hw addr len: %u, ignoring\n",
2751 out->name, (unsigned)out->addr_len);
2752 goto done1;
2753 }
2754
2755 /*
2756 * Don't process packets that aren't being tracked by conntrack.
2757 */
2758 ct = nf_ct_get(skb, &ctinfo);
2759 if (unlikely(!ct)) {
2760 DEBUG_TRACE("no conntrack connection, ignoring\n");
2761 goto done1;
2762 }
2763
2764 /*
2765 * Don't process untracked connections.
2766 */
2767 if (unlikely(ct == &nf_conntrack_untracked)) {
2768 DEBUG_TRACE("untracked connection\n");
2769 goto done1;
2770 }
2771
2772 /*
2773 * Don't process connections that require support from a 'helper' (typically a NAT ALG).
2774 */
2775 if (unlikely(nfct_help(ct))) {
2776 DEBUG_TRACE("connection has helper\n");
2777 goto done1;
2778 }
2779
2780 /*
2781 * Look up the details of our connection in conntrack.
2782 *
2783 * Note that the data we get from conntrack is for the "ORIGINAL" direction
2784 * but our packet may actually be in the "REPLY" direction.
2785 */
2786 orig_tuple = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
2787 reply_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
2788 sic.protocol = (int32_t)orig_tuple.dst.protonum;
2789
2790 /*
2791 * Get addressing information, non-NAT first
2792 */
Dave Hudson87973cd2013-10-22 16:00:04 +01002793 sic.src_ip = (__be32)orig_tuple.src.u3.ip;
2794 sic.dest_ip = (__be32)orig_tuple.dst.u3.ip;
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01002795
2796 /*
2797 * NAT'ed addresses - note these are as seen from the 'reply' direction
2798 * When NAT does not apply to this connection these will be identical to the above.
2799 */
Dave Hudson87973cd2013-10-22 16:00:04 +01002800 sic.src_ip_xlate = (__be32)reply_tuple.dst.u3.ip;
2801 sic.dest_ip_xlate = (__be32)reply_tuple.src.u3.ip;
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01002802
2803 sic.flags = 0;
2804
2805 switch (sic.protocol) {
2806 case IPPROTO_TCP:
Dave Hudson87973cd2013-10-22 16:00:04 +01002807 sic.src_port = orig_tuple.src.u.tcp.port;
2808 sic.dest_port = orig_tuple.dst.u.tcp.port;
2809 sic.src_port_xlate = reply_tuple.dst.u.tcp.port;
2810 sic.dest_port_xlate = reply_tuple.src.u.tcp.port;
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01002811 sic.src_td_window_scale = ct->proto.tcp.seen[0].td_scale;
2812 sic.src_td_max_window = ct->proto.tcp.seen[0].td_maxwin;
2813 sic.src_td_end = ct->proto.tcp.seen[0].td_end;
2814 sic.src_td_max_end = ct->proto.tcp.seen[0].td_maxend;
2815 sic.dest_td_window_scale = ct->proto.tcp.seen[1].td_scale;
2816 sic.dest_td_max_window = ct->proto.tcp.seen[1].td_maxwin;
2817 sic.dest_td_end = ct->proto.tcp.seen[1].td_end;
2818 sic.dest_td_max_end = ct->proto.tcp.seen[1].td_maxend;
2819 if (nf_ct_tcp_no_window_check
2820 || (ct->proto.tcp.seen[0].flags & IP_CT_TCP_FLAG_BE_LIBERAL)
2821 || (ct->proto.tcp.seen[1].flags & IP_CT_TCP_FLAG_BE_LIBERAL)) {
2822 sic.flags |= SFE_IPV4_CREATE_FLAG_NO_SEQ_CHECK;
2823 }
2824
2825 /*
2826 * Don't try to manage a non-established connection.
2827 */
2828 if (!test_bit(IPS_ASSURED_BIT, &ct->status)) {
2829 DEBUG_TRACE("non-established connection\n");
2830 goto done1;
2831 }
2832
2833 /*
2834 * If the connection is shutting down do not manage it.
2835 * state can not be SYN_SENT, SYN_RECV because connection is assured
2836 * Not managed states: FIN_WAIT, CLOSE_WAIT, LAST_ACK, TIME_WAIT, CLOSE.
2837 */
2838 spin_lock_bh(&ct->lock);
2839 if (ct->proto.tcp.state != TCP_CONNTRACK_ESTABLISHED) {
2840 spin_unlock_bh(&ct->lock);
2841 DEBUG_TRACE("connection in termination state: %#x, s: %pI4:%u, d: %pI4:%u\n",
Dave Hudson87973cd2013-10-22 16:00:04 +01002842 ct->proto.tcp.state, &sic.src_ip, ntohs(sic.src_port),
2843 &sic.dest_ip, ntohs(sic.dest_port));
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01002844 goto done1;
2845 }
2846 spin_unlock_bh(&ct->lock);
2847 break;
2848
2849 case IPPROTO_UDP:
Dave Hudson87973cd2013-10-22 16:00:04 +01002850 sic.src_port = orig_tuple.src.u.udp.port;
2851 sic.dest_port = orig_tuple.dst.u.udp.port;
2852 sic.src_port_xlate = reply_tuple.dst.u.udp.port;
2853 sic.dest_port_xlate = reply_tuple.src.u.udp.port;
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01002854 break;
2855
2856 default:
2857 DEBUG_TRACE("unhandled protocol %d\n", sic.protocol);
2858 goto done1;
2859 }
2860
2861 /*
2862 * Get the MAC addresses that correspond to source and destination host addresses.
2863 */
2864 if (!sfe_ipv4_find_mac_addr(sic.src_ip, sic.src_mac)) {
2865 DEBUG_TRACE("failed to find MAC address for src IP: %pI4\n", &sic.src_ip);
2866 goto done1;
2867 }
2868
2869 if (!sfe_ipv4_find_mac_addr(sic.src_ip_xlate, sic.src_mac_xlate)) {
2870 DEBUG_TRACE("failed to find MAC address for xlate src IP: %pI4\n", &sic.src_ip_xlate);
2871 goto done1;
2872 }
2873
2874 /*
2875 * Do dest now
2876 */
2877 if (!sfe_ipv4_find_mac_addr(sic.dest_ip, sic.dest_mac)) {
2878 DEBUG_TRACE("failed to find MAC address for dest IP: %pI4\n", &sic.dest_ip);
2879 goto done1;
2880 }
2881
2882 if (!sfe_ipv4_find_mac_addr(sic.dest_ip_xlate, sic.dest_mac_xlate)) {
2883 DEBUG_TRACE("failed to find MAC address for xlate dest IP: %pI4\n", &sic.dest_ip_xlate);
2884 goto done1;
2885 }
2886
2887 /*
2888 * Get our device info. If we're dealing with the "reply" direction here then
2889 * we'll need things swapped around.
2890 */
2891 if (ctinfo < IP_CT_IS_REPLY) {
2892 src_dev = in;
2893 dest_dev = (struct net_device *)out;
2894 } else {
2895 src_dev = (struct net_device *)out;
2896 dest_dev = in;
2897 }
2898
2899#if (!SFE_HOOK_ABOVE_BRIDGE)
2900 /*
2901 * Now our devices may actually be a bridge interface. If that's
2902 * the case then we need to hunt down the underlying interface.
2903 */
2904 if (src_dev->priv_flags & IFF_EBRIDGE) {
2905 src_br_dev = br_port_dev_get(src_dev, sic.src_mac);
2906 if (!src_br_dev) {
2907 DEBUG_TRACE("no port found on bridge\n");
2908 goto done1;
2909 }
2910
2911 src_dev = src_br_dev;
2912 }
2913
2914 if (dest_dev->priv_flags & IFF_EBRIDGE) {
2915 dest_br_dev = br_port_dev_get(dest_dev, sic.dest_mac_xlate);
2916 if (!dest_br_dev) {
2917 DEBUG_TRACE("no port found on bridge\n");
2918 goto done2;
2919 }
2920
2921 dest_dev = dest_br_dev;
2922 }
2923#else
2924 /*
2925 * Our devices may actually be part of a bridge interface. If that's
2926 * the case then find the bridge interface instead.
2927 */
2928 if (src_dev->priv_flags & IFF_BRIDGE_PORT) {
2929 src_br_dev = src_dev->master;
2930 if (!src_br_dev) {
2931 DEBUG_TRACE("no bridge found for: %s\n", src_dev->name);
2932 goto done1;
2933 }
2934
2935 dev_hold(src_br_dev);
2936 src_dev = src_br_dev;
2937 }
2938
2939 if (dest_dev->priv_flags & IFF_BRIDGE_PORT) {
2940 dest_br_dev = dest_dev->master;
2941 if (!dest_br_dev) {
2942 DEBUG_TRACE("no bridge found for: %s\n", dest_dev->name);
2943 goto done2;
2944 }
2945
2946 dev_hold(dest_br_dev);
2947 dest_dev = dest_br_dev;
2948 }
2949#endif
2950
2951 sic.src_dev = src_dev;
2952 sic.dest_dev = dest_dev;
2953
2954// XXX - these MTUs need handling correctly!
2955 sic.src_mtu = 1500;
2956 sic.dest_mtu = 1500;
2957
2958 sfe_ipv4_create_rule(si, &sic);
2959
2960 /*
2961 * If we had bridge ports then release them too.
2962 */
2963 if (dest_br_dev) {
2964 dev_put(dest_br_dev);
2965 }
2966
2967done2:
2968 if (src_br_dev) {
2969 dev_put(src_br_dev);
2970 }
2971
2972done1:
2973 /*
2974 * Release the interface on which this skb arrived
2975 */
2976 dev_put(in);
2977
2978 return NF_ACCEPT;
2979}
2980
2981#ifdef CONFIG_NF_CONNTRACK_EVENTS
2982/*
2983 * sfe_ipv4_destroy_rule()
2984 * Destroy a forwarding rule.
2985 */
2986static void sfe_ipv4_destroy_rule(struct sfe_ipv4 *si, struct sfe_ipv4_destroy *sid)
2987{
2988 struct sfe_ipv4_connection *c;
2989
2990 spin_lock_bh(&si->lock);
2991 si->connection_destroy_requests++;
2992
2993 /*
2994 * Check to see if we have a flow that matches the rule we're trying
2995 * to destroy. If there isn't then we can't destroy it.
2996 */
2997 c = sfe_ipv4_find_sfe_ipv4_connection(si, sid->protocol, sid->src_ip, sid->src_port,
2998 sid->dest_ip, sid->dest_port);
2999 if (!c) {
3000 si->connection_destroy_misses++;
3001 spin_unlock_bh(&si->lock);
3002
3003 DEBUG_TRACE("connection does not exist - p: %d, s: %pI4:%u, d: %pI4:%u\n",
Dave Hudson87973cd2013-10-22 16:00:04 +01003004 sid->protocol, &sid->src_ip, ntohs(sid->src_port),
3005 &sid->dest_ip, ntohs(sid->dest_port));
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01003006 return;
3007 }
3008
3009 /*
3010 * Remove our connection details from the hash tables.
3011 */
3012 sfe_ipv4_remove_sfe_ipv4_connection(si, c);
3013 spin_unlock_bh(&si->lock);
3014
3015 /*
3016 * Finally synchronize state and free resources. We need to protect against
3017 * pre-emption by our bottom half while we do this though.
3018 */
3019 local_bh_disable();
3020 sfe_ipv4_flush_sfe_ipv4_connection(si, c);
3021 local_bh_enable();
3022
3023 DEBUG_INFO("connection destroyed - p: %d, s: %pI4:%u, d: %pI4:%u\n",
Dave Hudson87973cd2013-10-22 16:00:04 +01003024 sid->protocol, &sid->src_ip, ntohs(sid->src_port),
3025 &sid->dest_ip, ntohs(sid->dest_port));
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01003026}
3027
3028/*
3029 * sfe_ipv4_conntrack_event()
3030 * Callback event invoked when a conntrack connection's state changes.
3031 */
3032static int sfe_ipv4_conntrack_event(unsigned int events, struct nf_ct_event *item)
3033{
3034 struct sfe_ipv4 *si = &__si;
3035 struct sfe_ipv4_destroy sid;
3036 struct nf_conn *ct = item->ct;
3037 struct nf_conntrack_tuple orig_tuple;
3038
3039 /*
3040 * If we don't have a conntrack entry then we're done.
3041 */
3042 if (unlikely(!ct)) {
3043 DEBUG_WARN("no ct in conntrack event callback\n");
3044 return NOTIFY_DONE;
3045 }
3046
3047 /*
3048 * If this is an untracked connection then we can't have any state either.
3049 */
3050 if (unlikely(ct == &nf_conntrack_untracked)) {
3051 DEBUG_TRACE("ignoring untracked conn\n");
3052 return NOTIFY_DONE;
3053 }
3054
3055 /*
3056 * Ignore anything other than IPv4 connections.
3057 */
3058 if (unlikely(nf_ct_l3num(ct) != AF_INET)) {
3059 DEBUG_TRACE("ignoring non-IPv4 conn\n");
3060 return NOTIFY_DONE;
3061 }
3062
3063 /*
3064 * We're only interested in destroy events.
3065 */
3066 if (unlikely(!(events & (1 << IPCT_DESTROY)))) {
3067 DEBUG_TRACE("ignoring non-destroy event\n");
3068 return NOTIFY_DONE;
3069 }
3070
3071 orig_tuple = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
3072 sid.protocol = (int32_t)orig_tuple.dst.protonum;
3073
3074 /*
3075 * Extract information from the conntrack connection. We're only interested
3076 * in nominal connection information (i.e. we're ignoring any NAT information).
3077 */
Dave Hudson87973cd2013-10-22 16:00:04 +01003078 sid.src_ip = (__be32)orig_tuple.src.u3.ip;
3079 sid.dest_ip = (__be32)orig_tuple.dst.u3.ip;
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01003080
3081 switch (sid.protocol) {
3082 case IPPROTO_TCP:
Dave Hudson87973cd2013-10-22 16:00:04 +01003083 sid.src_port = orig_tuple.src.u.tcp.port;
3084 sid.dest_port = orig_tuple.dst.u.tcp.port;
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01003085 break;
3086
3087 case IPPROTO_UDP:
Dave Hudson87973cd2013-10-22 16:00:04 +01003088 sid.src_port = orig_tuple.src.u.udp.port;
3089 sid.dest_port = orig_tuple.dst.u.udp.port;
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01003090 break;
3091
3092 default:
3093 DEBUG_TRACE("unhandled protocol: %d\n", sid.protocol);
3094 return NOTIFY_DONE;
3095 }
3096
3097
3098 sfe_ipv4_destroy_rule(si, &sid);
3099 return NOTIFY_DONE;
3100}
3101
/*
 * Netfilter conntrack event system to monitor connection tracking changes.
 * Only the destroy event is acted upon - see sfe_ipv4_conntrack_event().
 */
static struct nf_ct_event_notifier sfe_ipv4_conntrack_notifier = {
	.fcn = sfe_ipv4_conntrack_event,
};
3108#endif
3109
/*
 * Structure to establish a hook into the post routing netfilter point - this
 * will pick up local outbound and packets going from one interface to another.
 *
 * Note: see include/linux/netfilter_ipv4.h for info related to priority levels.
 * We want to examine packets after NAT translation and any ALG processing.
 */
static struct nf_hook_ops sfe_ipv4_ops_post_routing[] __read_mostly = {
	{
		.hook = sfe_ipv4_post_routing_hook,
		.owner = THIS_MODULE,
		.pf = PF_INET,
		.hooknum = NF_INET_POST_ROUTING,
		/* Just after source NAT so we observe the translated addresses. */
		.priority = NF_IP_PRI_NAT_SRC + 1,
	},
};
3126
3127/*
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01003128 * sfe_ipv4_get_pause()
3129 */
3130static ssize_t sfe_ipv4_get_pause(struct device *dev,
3131 struct device_attribute *attr,
3132 char *buf)
3133{
3134 struct sfe_ipv4 *si = &__si;
3135 ssize_t count;
3136 int num;
3137
3138 spin_lock_bh(&si->lock);
3139 num = si->pause;
3140 spin_unlock_bh(&si->lock);
3141
3142 count = snprintf(buf, (ssize_t)PAGE_SIZE, "%d\n", num);
3143 return count;
3144}
3145
3146/*
3147 * sfe_ipv4_set_pause()
3148 */
3149static ssize_t sfe_ipv4_set_pause(struct device *dev,
3150 struct device_attribute *attr,
3151 const char *buf, size_t count)
3152{
3153 struct sfe_ipv4 *si = &__si;
3154 char num_buf[12];
3155 int num;
3156
3157 /*
3158 * Check that our command data will fit. If it will then copy it to our local
3159 * buffer and NUL terminate it.
3160 */
3161 if (count > 11) {
3162 return 0;
3163 }
3164
3165 memcpy(num_buf, buf, count);
3166 num_buf[count] = '\0';
3167
3168 sscanf(num_buf, "%d", &num);
3169 DEBUG_TRACE("set pause: %d\n", num);
3170
3171 spin_lock_bh(&si->lock);
3172 si->pause = num;
3173 spin_unlock_bh(&si->lock);
3174
3175 return count;
3176}
3177
3178/*
3179 * sfe_ipv4_get_debug_dev()
3180 */
3181static ssize_t sfe_ipv4_get_debug_dev(struct device *dev,
3182 struct device_attribute *attr,
3183 char *buf)
3184{
3185 struct sfe_ipv4 *si = &__si;
3186 ssize_t count;
3187 int num;
3188
3189 spin_lock_bh(&si->lock);
3190 num = si->debug_dev;
3191 spin_unlock_bh(&si->lock);
3192
3193 count = snprintf(buf, (ssize_t)PAGE_SIZE, "%d\n", num);
3194 return count;
3195}
3196
3197/*
3198 * sysfs attributes for the default classifier itself.
3199 */
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01003200static const struct device_attribute sfe_ipv4_pause_attr =
3201 __ATTR(pause, S_IWUGO | S_IRUGO, sfe_ipv4_get_pause, sfe_ipv4_set_pause);
3202static const struct device_attribute sfe_ipv4_debug_dev_attr =
3203 __ATTR(debug_dev, S_IWUGO | S_IRUGO, sfe_ipv4_get_debug_dev, NULL);
3204
/*
 * sfe_ipv4_destroy_all()
 *	Destroy all connections that match a particular device.
 *
 * If we pass dev as NULL then this destroys all connections.
 *
 * The walk uses each connection's "iterators" reference count so that an
 * entry cannot be freed out from under us (or any concurrent walker) while
 * it is still being pointed at.
 */
static void sfe_ipv4_destroy_all(struct sfe_ipv4 *si, struct net_device *dev)
{
	struct sfe_ipv4_connection *c;
	struct sfe_ipv4_connection *c_next;

	spin_lock_bh(&si->lock);
	c = si->all_connections_head;
	if (!c) {
		/* Empty connection list - nothing to do. */
		spin_unlock_bh(&si->lock);
		return;
	}

	/* Pin the first connection before we start walking. */
	c->iterators++;

	/*
	 * Iterate over all connections
	 */
	while (c) {
		c_next = c->all_connections_next;

		/*
		 * Before we do anything else, take an iterator reference for the
		 * connection we'll iterate next.
		 */
		if (c_next) {
			c_next->iterators++;
		}

		/*
		 * Does this connection relate to the device we are destroying? If
		 * it does then ensure it is marked for being freed as soon as it
		 * is no longer being iterated.
		 */
		if (!dev
		    || (dev == c->original_dev)
		    || (dev == c->reply_dev)) {
			c->pending_free = true;
			sfe_ipv4_remove_sfe_ipv4_connection(si, c);
		}

		/*
		 * Remove the iterator reference that we acquired and see if we
		 * should free any resources.
		 */
		if (sfe_ipv4_decrement_sfe_ipv4_connection_iterator(si, c)) {
			/* Drop the lock while releasing resources to keep lock
			 * hold times short, then re-take it for the next step. */
			spin_unlock_bh(&si->lock);

			/*
			 * This entry is dead so release our hold of the source and
			 * dest devices and free the memory for our connection objects.
			 */
			dev_put(c->original_dev);
			dev_put(c->reply_dev);
			kfree(c->original_match);
			kfree(c->reply_match);
			kfree(c);

			spin_lock_bh(&si->lock);
		}

		c = c_next;
	}

	spin_unlock_bh(&si->lock);
}
3276
3277/*
3278 * sfe_ipv4_device_event()
3279 */
3280static int sfe_ipv4_device_event(struct notifier_block *this, unsigned long event, void *ptr)
3281{
3282 struct sfe_ipv4 *si = &__si;
3283 struct net_device *dev = (struct net_device *)ptr;
3284
3285 switch (event) {
3286 case NETDEV_DOWN:
3287 if (dev) {
3288 sfe_ipv4_destroy_all(si, dev);
3289 }
3290 break;
3291 }
3292
3293 return NOTIFY_DONE;
3294}
3295
3296/*
3297 * sfe_ipv4_inet_event()
3298 */
3299static int sfe_ipv4_inet_event(struct notifier_block *this, unsigned long event, void *ptr)
3300{
3301 struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev;
3302 return sfe_ipv4_device_event(this, event, dev);
3303}
3304
/*
 * sfe_ipv4_periodic_sync()
 *	Timer callback: periodically sync connection stats back to Linux.
 *
 * Re-arms itself every HZ/100 jiffies (10ms) and processes roughly 1/64th
 * of the connections per pass so the work is spread over many ticks.
 */
static void sfe_ipv4_periodic_sync(unsigned long arg)
{
	struct sfe_ipv4 *si = (struct sfe_ipv4 *)arg;
	uint64_t now_jiffies;
	int quota;

	now_jiffies = get_jiffies_64();

	spin_lock_bh(&si->lock);
	sfe_ipv4_update_summary_stats(si);

	/*
	 * Get an estimate of the number of connections to parse in this sync.
	 */
	quota = (si->num_connections + 63) / 64;

	/*
	 * Walk the "active" list and sync the connection state.
	 */
	while (quota--) {
		struct sfe_ipv4_connection_match *cm;
		struct sfe_ipv4_connection_match *counter_cm;
		struct sfe_ipv4_connection *c;
		struct sfe_ipv4_sync sis;

		cm = si->active_head;
		if (!cm) {
			/* Active list exhausted - done for this tick. */
			break;
		}

		cm->active = false;

		/*
		 * Having found an entry we now remove it from the active scan list.
		 */
		si->active_head = cm->active_next;
		if (likely(cm->active_next)) {
			cm->active_next->active_prev = NULL;
		} else {
			si->active_tail = NULL;
		}
		cm->active_next = NULL;

		/*
		 * We scan the connection match lists so there's a possibility that our
		 * counter match is in the list too. If it is then remove it.
		 */
		counter_cm = cm->counter_match;
		if (counter_cm->active) {
			counter_cm->active = false;

			/* Unlink counter_cm from the doubly-linked active list,
			 * fixing up head/tail when it sits at either end. */
			if (likely(counter_cm->active_prev)) {
				counter_cm->active_prev->active_next = counter_cm->active_next;
			} else {
				si->active_head = counter_cm->active_next;
			}

			if (likely(counter_cm->active_next)) {
				counter_cm->active_next->active_prev = counter_cm->active_prev;
			} else {
				si->active_tail = counter_cm->active_prev;
			}

			counter_cm->active_next = NULL;
			counter_cm->active_prev = NULL;
		}

		/*
		 * Sync the connection state.
		 */
		c = cm->connection;
		sfe_ipv4_gen_sync_sfe_ipv4_connection(si, c, &sis, now_jiffies);

		/*
		 * We don't want to be holding the lock when we sync!
		 */
		spin_unlock_bh(&si->lock);
		sfe_ipv4_sync_rule(&sis);
		spin_lock_bh(&si->lock);
	}

	spin_unlock_bh(&si->lock);

	/* Re-arm ourselves for the next pass. */
	mod_timer(&si->timer, jiffies + (HZ / 100));
}
3393
3394#define CHAR_DEV_MSG_SIZE 768
3395
3396/*
3397 * sfe_ipv4_debug_dev_read_start()
3398 * Generate part of the XML output.
3399 */
3400static bool sfe_ipv4_debug_dev_read_start(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length,
3401 int *total_read, struct sfe_ipv4_debug_xml_write_state *ws)
3402{
3403 int bytes_read;
3404
3405 bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "<sfe_ipv4>\n");
3406 if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) {
3407 return false;
3408 }
3409
3410 *length -= bytes_read;
3411 *total_read += bytes_read;
3412
3413 ws->state++;
3414 return true;
3415}
3416
3417/*
3418 * sfe_ipv4_debug_dev_read_connections_start()
3419 * Generate part of the XML output.
3420 */
3421static bool sfe_ipv4_debug_dev_read_connections_start(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length,
3422 int *total_read, struct sfe_ipv4_debug_xml_write_state *ws)
3423{
3424 int bytes_read;
3425
3426 bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t<connections>\n");
3427 if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) {
3428 return false;
3429 }
3430
3431 *length -= bytes_read;
3432 *total_read += bytes_read;
3433
3434 ws->state++;
3435 return true;
3436}
3437
3438/*
3439 * sfe_ipv4_debug_dev_read_connections_connection()
3440 * Generate part of the XML output.
3441 */
3442static bool sfe_ipv4_debug_dev_read_connections_connection(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length,
3443 int *total_read, struct sfe_ipv4_debug_xml_write_state *ws)
3444{
3445 struct sfe_ipv4_connection *c;
3446 struct sfe_ipv4_connection *c_next;
3447 struct sfe_ipv4_connection_match *original_cm;
3448 struct sfe_ipv4_connection_match *reply_cm;
3449 int bytes_read;
3450 int protocol;
3451 struct net_device *src_dev;
Dave Hudson87973cd2013-10-22 16:00:04 +01003452 __be32 src_ip;
3453 __be32 src_ip_xlate;
3454 __be16 src_port;
3455 __be16 src_port_xlate;
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01003456 uint64_t src_rx_packets;
3457 uint64_t src_rx_bytes;
3458 struct net_device *dest_dev;
Dave Hudson87973cd2013-10-22 16:00:04 +01003459 __be32 dest_ip;
3460 __be32 dest_ip_xlate;
3461 __be16 dest_port;
3462 __be16 dest_port_xlate;
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01003463 uint64_t dest_rx_packets;
3464 uint64_t dest_rx_bytes;
3465 uint64_t last_sync_jiffies;
3466
3467 spin_lock_bh(&si->lock);
3468 c = ws->iter_conn;
3469
3470 /*
3471 * Is this the first connection we need to scan?
3472 */
3473 if (!c) {
3474 c = si->all_connections_head;
3475
3476 /*
3477 * If there were no connections then move to the next state.
3478 */
3479 if (!c) {
3480 spin_unlock_bh(&si->lock);
3481
3482 ws->state++;
3483 return true;
3484 }
3485
3486 c->iterators++;
3487 }
3488
3489 c_next = c->all_connections_next;
3490 ws->iter_conn = c_next;
3491
3492 /*
3493 * Before we do anything else, take an iterator reference for the
3494 * connection we'll iterate next.
3495 */
3496 if (c_next) {
3497 c_next->iterators++;
3498 }
3499
3500 /*
3501 * Remove the iterator reference that we acquired and see if we
3502 * should free any resources.
3503 */
3504 if (sfe_ipv4_decrement_sfe_ipv4_connection_iterator(si, c)) {
3505 spin_unlock_bh(&si->lock);
3506
3507 /*
3508 * This entry is dead so release our hold of the source and
3509 * dest devices and free the memory for our connection objects.
3510 */
3511 dev_put(c->original_dev);
3512 dev_put(c->reply_dev);
3513 kfree(c->original_match);
3514 kfree(c->reply_match);
3515 kfree(c);
3516
3517 /*
3518 * If we have no more connections then move to the next state.
3519 */
3520 if (!c_next) {
3521 ws->state++;
3522 }
3523
3524 return true;
3525 }
3526
3527 original_cm = c->original_match;
3528 reply_cm = c->reply_match;
3529
3530 protocol = c->protocol;
3531 src_dev = c->original_dev;
3532 src_ip = c->src_ip;
3533 src_ip_xlate = c->src_ip_xlate;
3534 src_port = c->src_port;
3535 src_port_xlate = c->src_port_xlate;
3536
3537 sfe_ipv4_connection_match_update_summary_stats(original_cm);
3538 sfe_ipv4_connection_match_update_summary_stats(reply_cm);
3539
3540 src_rx_packets = original_cm->rx_packet_count64;
3541 src_rx_bytes = original_cm->rx_byte_count64;
3542 dest_dev = c->reply_dev;
3543 dest_ip = c->dest_ip;
3544 dest_ip_xlate = c->dest_ip_xlate;
3545 dest_port = c->dest_port;
3546 dest_port_xlate = c->dest_port_xlate;
3547 dest_rx_packets = reply_cm->rx_packet_count64;
3548 dest_rx_bytes = reply_cm->rx_byte_count64;
3549 last_sync_jiffies = get_jiffies_64() - c->last_sync_jiffies;
3550 spin_unlock_bh(&si->lock);
3551
3552 bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t\t<connection "
3553 "protocol=\"%u\" "
3554 "src_dev=\"%s\" "
3555 "src_ip=\"%pI4\" src_ip_xlate=\"%pI4\" "
3556 "src_port=\"%u\" src_port_xlate=\"%u\" "
3557 "src_rx_pkts=\"%llu\" src_rx_bytes=\"%llu\" "
3558 "dest_dev=\"%s\" "
3559 "dest_ip=\"%pI4\" dest_ip_xlate=\"%pI4\" "
3560 "dest_port=\"%u\" dest_port_xlate=\"%u\" "
3561 "dest_rx_pkts=\"%llu\" dest_rx_bytes=\"%llu\" "
3562 "last_sync=\"%llu\" />\n",
3563 protocol,
3564 src_dev->name,
3565 &src_ip, &src_ip_xlate,
Dave Hudson87973cd2013-10-22 16:00:04 +01003566 ntohs(src_port), ntohs(src_port_xlate),
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01003567 src_rx_packets, src_rx_bytes,
3568 dest_dev->name,
3569 &dest_ip, &dest_ip_xlate,
Dave Hudson87973cd2013-10-22 16:00:04 +01003570 ntohs(dest_port), ntohs(dest_port_xlate),
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01003571 dest_rx_packets, dest_rx_bytes,
3572 last_sync_jiffies);
3573
3574 if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) {
3575 return false;
3576 }
3577
3578 *length -= bytes_read;
3579 *total_read += bytes_read;
3580
3581 /*
3582 * If we have no more connections then move to the next state.
3583 */
3584 if (!c_next) {
3585 ws->state++;
3586 }
3587
3588 return true;
3589}
3590
3591/*
3592 * sfe_ipv4_debug_dev_read_connections_end()
3593 * Generate part of the XML output.
3594 */
3595static bool sfe_ipv4_debug_dev_read_connections_end(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length,
3596 int *total_read, struct sfe_ipv4_debug_xml_write_state *ws)
3597{
3598 int bytes_read;
3599
3600 bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t</connections>\n");
3601 if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) {
3602 return false;
3603 }
3604
3605 *length -= bytes_read;
3606 *total_read += bytes_read;
3607
3608 ws->state++;
3609 return true;
3610}
3611
3612/*
3613 * sfe_ipv4_debug_dev_read_exceptions_start()
3614 * Generate part of the XML output.
3615 */
3616static bool sfe_ipv4_debug_dev_read_exceptions_start(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length,
3617 int *total_read, struct sfe_ipv4_debug_xml_write_state *ws)
3618{
3619 int bytes_read;
3620
3621 bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t<exceptions>\n");
3622 if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) {
3623 return false;
3624 }
3625
3626 *length -= bytes_read;
3627 *total_read += bytes_read;
3628
3629 ws->state++;
3630 return true;
3631}
3632
3633/*
3634 * sfe_ipv4_debug_dev_read_exceptions_exception()
3635 * Generate part of the XML output.
3636 */
3637static bool sfe_ipv4_debug_dev_read_exceptions_exception(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length,
3638 int *total_read, struct sfe_ipv4_debug_xml_write_state *ws)
3639{
3640 uint64_t ct;
3641
3642 spin_lock_bh(&si->lock);
3643 ct = si->exception_events64[ws->iter_exception];
3644 spin_unlock_bh(&si->lock);
3645
3646 if (ct) {
3647 int bytes_read;
3648
3649 bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE,
3650 "\t\t<exception name=\"%s\" count=\"%llu\" />\n",
3651 sfe_ipv4_exception_events_string[ws->iter_exception],
3652 ct);
3653 if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) {
3654 return false;
3655 }
3656
3657 *length -= bytes_read;
3658 *total_read += bytes_read;
3659 }
3660
3661 ws->iter_exception++;
3662 if (ws->iter_exception >= SFE_IPV4_EXCEPTION_EVENT_LAST) {
3663 ws->iter_exception = 0;
3664 ws->state++;
3665 }
3666
3667 return true;
3668}
3669
3670/*
3671 * sfe_ipv4_debug_dev_read_exceptions_end()
3672 * Generate part of the XML output.
3673 */
3674static bool sfe_ipv4_debug_dev_read_exceptions_end(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length,
3675 int *total_read, struct sfe_ipv4_debug_xml_write_state *ws)
3676{
3677 int bytes_read;
3678
3679 bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t</exceptions>\n");
3680 if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) {
3681 return false;
3682 }
3683
3684 *length -= bytes_read;
3685 *total_read += bytes_read;
3686
3687 ws->state++;
3688 return true;
3689}
3690
3691/*
3692 * sfe_ipv4_debug_dev_read_stats()
3693 * Generate part of the XML output.
3694 */
3695static bool sfe_ipv4_debug_dev_read_stats(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length,
3696 int *total_read, struct sfe_ipv4_debug_xml_write_state *ws)
3697{
3698 int bytes_read;
3699 unsigned int num_connections;
3700 uint64_t packets_forwarded;
3701 uint64_t packets_not_forwarded;
3702 uint64_t connection_create_requests;
3703 uint64_t connection_create_collisions;
3704 uint64_t connection_destroy_requests;
3705 uint64_t connection_destroy_misses;
3706 uint64_t connection_flushes;
3707 uint64_t connection_match_hash_hits;
3708 uint64_t connection_match_hash_reorders;
3709
3710 spin_lock_bh(&si->lock);
3711 sfe_ipv4_update_summary_stats(si);
3712
3713 num_connections = si->num_connections;
3714 packets_forwarded = si->packets_forwarded64;
3715 packets_not_forwarded = si->packets_not_forwarded64;
3716 connection_create_requests = si->connection_create_requests64;
3717 connection_create_collisions = si->connection_create_collisions64;
3718 connection_destroy_requests = si->connection_destroy_requests64;
3719 connection_destroy_misses = si->connection_destroy_misses64;
3720 connection_flushes = si->connection_flushes64;
3721 connection_match_hash_hits = si->connection_match_hash_hits64;
3722 connection_match_hash_reorders = si->connection_match_hash_reorders64;
3723 spin_unlock_bh(&si->lock);
3724
3725 bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t<stats "
3726 "num_connections=\"%u\" "
3727 "pkts_forwarded=\"%llu\" pkts_not_forwarded=\"%llu\" "
3728 "create_requests=\"%llu\" create_collisions=\"%llu\" "
3729 "destroy_requests=\"%llu\" destroy_misses=\"%llu\" "
3730 "flushes=\"%llu\" "
3731 "hash_hits=\"%llu\" hash_reorders=\"%llu\" />\n",
3732 num_connections,
3733 packets_forwarded,
3734 packets_not_forwarded,
3735 connection_create_requests,
3736 connection_create_collisions,
3737 connection_destroy_requests,
3738 connection_destroy_misses,
3739 connection_flushes,
3740 connection_match_hash_hits,
3741 connection_match_hash_reorders);
3742 if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) {
3743 return false;
3744 }
3745
3746 *length -= bytes_read;
3747 *total_read += bytes_read;
3748
3749 ws->state++;
3750 return true;
3751}
3752
3753/*
3754 * sfe_ipv4_debug_dev_read_end()
3755 * Generate part of the XML output.
3756 */
3757static bool sfe_ipv4_debug_dev_read_end(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length,
3758 int *total_read, struct sfe_ipv4_debug_xml_write_state *ws)
3759{
3760 int bytes_read;
3761
3762 bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "</sfe_ipv4>\n");
3763 if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) {
3764 return false;
3765 }
3766
3767 *length -= bytes_read;
3768 *total_read += bytes_read;
3769
3770 ws->state++;
3771 return true;
3772}
3773
/*
 * Array of write functions that write various XML elements that correspond to
 * our XML output state machine. The array is indexed by ws->state and each
 * handler advances ws->state once its element is complete.
 */
sfe_ipv4_debug_xml_write_method_t sfe_ipv4_debug_xml_write_methods[SFE_IPV4_DEBUG_XML_STATE_DONE] = {
	sfe_ipv4_debug_dev_read_start,
	sfe_ipv4_debug_dev_read_connections_start,
	sfe_ipv4_debug_dev_read_connections_connection,
	sfe_ipv4_debug_dev_read_connections_end,
	sfe_ipv4_debug_dev_read_exceptions_start,
	sfe_ipv4_debug_dev_read_exceptions_exception,
	sfe_ipv4_debug_dev_read_exceptions_end,
	sfe_ipv4_debug_dev_read_stats,
	sfe_ipv4_debug_dev_read_end,
};
3789
3790/*
3791 * sfe_ipv4_debug_dev_read()
3792 * Send info to userspace upon read request from user
3793 */
3794static ssize_t sfe_ipv4_debug_dev_read(struct file *filp, char *buffer, size_t length, loff_t *offset)
3795{
3796 char msg[CHAR_DEV_MSG_SIZE];
3797 int total_read = 0;
3798 struct sfe_ipv4_debug_xml_write_state *ws;
3799 struct sfe_ipv4 *si = &__si;
3800
3801 ws = (struct sfe_ipv4_debug_xml_write_state *)filp->private_data;
3802 while ((ws->state != SFE_IPV4_DEBUG_XML_STATE_DONE) && (length > CHAR_DEV_MSG_SIZE)) {
3803 if ((sfe_ipv4_debug_xml_write_methods[ws->state])(si, buffer, msg, &length, &total_read, ws)) {
3804 continue;
3805 }
3806 }
3807
3808 return total_read;
3809}
3810
3811/*
3812 * sfe_ipv4_debug_dev_write()
3813 * Write to char device not required/supported
3814 */
3815static ssize_t sfe_ipv4_debug_dev_write(struct file *filp, const char *buffer, size_t length, loff_t *offset)
3816{
3817 return -EINVAL;
3818}
3819
3820/*
3821 * sfe_ipv4_debug_dev_open()
3822 */
3823static int sfe_ipv4_debug_dev_open(struct inode *inode, struct file *file)
3824{
3825 struct sfe_ipv4_debug_xml_write_state *ws;
3826
3827 ws = (struct sfe_ipv4_debug_xml_write_state *)file->private_data;
3828 if (!ws) {
3829 ws = kzalloc(sizeof(struct sfe_ipv4_debug_xml_write_state), GFP_KERNEL);
3830 if (!ws) {
3831 return -ENOMEM;
3832 }
3833
3834 ws->state = SFE_IPV4_DEBUG_XML_STATE_START;
3835 file->private_data = ws;
3836 }
3837
3838 return 0;
3839}
3840
3841/*
3842 * sfe_ipv4_debug_dev_release()
3843 */
3844static int sfe_ipv4_debug_dev_release(struct inode *inode, struct file *file)
3845{
3846 struct sfe_ipv4_debug_xml_write_state *ws;
3847
3848 ws = (struct sfe_ipv4_debug_xml_write_state *)file->private_data;
3849 if (ws) {
3850 struct sfe_ipv4_connection *c;
3851
3852 /*
3853 * Are we currently iterating a connection? If we are then
3854 * make sure that we reduce its iterator count and if necessary
3855 * free it.
3856 */
3857 c = ws->iter_conn;
3858 if (c) {
3859 struct sfe_ipv4 *si = &__si;
3860
3861 spin_lock_bh(&si->lock);
3862 if (sfe_ipv4_decrement_sfe_ipv4_connection_iterator(si, c)) {
3863 spin_unlock_bh(&si->lock);
3864
3865 /*
3866 * This entry is dead so release our hold of the source and
3867 * dest devices and free the memory for our connection objects.
3868 */
3869 dev_put(c->original_dev);
3870 dev_put(c->reply_dev);
3871 kfree(c->original_match);
3872 kfree(c->reply_match);
3873 kfree(c);
3874 }
3875 }
3876
3877 /*
3878 * We've finished with our output so free the write state.
3879 */
3880 kfree(ws);
3881 }
3882
3883 return 0;
3884}
3885
3886/*
3887 * File operations used in the debug char device
3888 */
3889static struct file_operations sfe_ipv4_debug_dev_fops = {
3890 .read = sfe_ipv4_debug_dev_read,
3891 .write = sfe_ipv4_debug_dev_write,
3892 .open = sfe_ipv4_debug_dev_open,
3893 .release = sfe_ipv4_debug_dev_release
3894};
3895
3896/*
Dave Hudson87973cd2013-10-22 16:00:04 +01003897 * sfe_ipv4_init()
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01003898 */
Dave Hudson87973cd2013-10-22 16:00:04 +01003899static int __init sfe_ipv4_init(void)
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01003900{
3901 struct sfe_ipv4 *si = &__si;
3902 int result = -1;
3903
Dave Hudson87973cd2013-10-22 16:00:04 +01003904 DEBUG_INFO("SFE init\n");
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01003905
3906 /*
3907 * Create sys/sfe_ipv4
3908 */
3909 si->sys_sfe_ipv4 = kobject_create_and_add("sfe_ipv4", NULL);
3910 if (!si->sys_sfe_ipv4) {
3911 DEBUG_ERROR("failed to register sfe_ipv4\n");
3912 goto exit1;
3913 }
3914
3915 /*
3916 * Create files, one for each parameter supported by this module.
3917 */
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01003918 result = sysfs_create_file(si->sys_sfe_ipv4, &sfe_ipv4_pause_attr.attr);
3919 if (result) {
3920 DEBUG_ERROR("failed to register pause file: %d\n", result);
3921 goto exit3;
3922 }
3923
3924 result = sysfs_create_file(si->sys_sfe_ipv4, &sfe_ipv4_debug_dev_attr.attr);
3925 if (result) {
3926 DEBUG_ERROR("failed to register debug dev file: %d\n", result);
3927 goto exit4;
3928 }
3929
3930 /*
3931 * Register our debug char device.
3932 */
3933 result = register_chrdev(0, "sfe_ipv4", &sfe_ipv4_debug_dev_fops);
3934 if (result < 0) {
3935 DEBUG_ERROR("Failed to register chrdev: %d\n", result);
3936 goto exit5;
3937 }
3938
3939 si->debug_dev = result;
3940 si->dev_notifier.notifier_call = sfe_ipv4_device_event;
3941 si->dev_notifier.priority = 1;
3942 register_netdevice_notifier(&si->dev_notifier);
3943
3944 si->inet_notifier.notifier_call = sfe_ipv4_inet_event;
3945 si->inet_notifier.priority = 1;
3946 register_inetaddr_notifier(&si->inet_notifier);
3947
3948 /*
3949 * Create a timer to handle periodic statistics.
3950 */
3951 setup_timer(&si->timer, sfe_ipv4_periodic_sync, (unsigned long)si);
3952 mod_timer(&si->timer, jiffies + (HZ / 100));
3953
3954 /*
3955 * Register our netfilter hooks.
3956 */
3957 result = nf_register_hooks(sfe_ipv4_ops_post_routing, ARRAY_SIZE(sfe_ipv4_ops_post_routing));
3958 if (result < 0) {
3959 DEBUG_ERROR("can't register nf post routing hook: %d\n", result);
3960 goto exit6;
3961 }
3962
3963#ifdef CONFIG_NF_CONNTRACK_EVENTS
3964 /*
3965 * Register a notifier hook to get fast notifications of expired connections.
3966 */
3967 result = nf_conntrack_register_notifier(&init_net, &sfe_ipv4_conntrack_notifier);
3968 if (result < 0) {
3969 DEBUG_ERROR("can't register nf notifier hook: %d\n", result);
3970 goto exit7;
3971 }
3972#endif
3973
Dave Hudson87973cd2013-10-22 16:00:04 +01003974 spin_lock_init(&si->lock);
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01003975
Dave Hudson87973cd2013-10-22 16:00:04 +01003976 BUG_ON(athrs_fast_nat_recv != NULL);
3977 RCU_INIT_POINTER(athrs_fast_nat_recv, sfe_ipv4_recv);
3978 return 0;
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01003979
3980#ifdef CONFIG_NF_CONNTRACK_EVENTS
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01003981exit7:
3982#endif
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01003983 nf_unregister_hooks(sfe_ipv4_ops_post_routing, ARRAY_SIZE(sfe_ipv4_ops_post_routing));
Dave Hudson87973cd2013-10-22 16:00:04 +01003984 del_timer_sync(&si->timer);
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01003985
3986exit6:
3987 unregister_inetaddr_notifier(&si->inet_notifier);
3988 unregister_netdevice_notifier(&si->dev_notifier);
3989 unregister_chrdev(si->debug_dev, "sfe_ipv4");
3990
3991exit5:
3992 sysfs_remove_file(si->sys_sfe_ipv4, &sfe_ipv4_debug_dev_attr.attr);
3993
3994exit4:
3995 sysfs_remove_file(si->sys_sfe_ipv4, &sfe_ipv4_pause_attr.attr);
3996
3997exit3:
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01003998 kobject_put(si->sys_sfe_ipv4);
3999
4000exit1:
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01004001 return result;
4002}
4003
/*
 * sfe_ipv4_exit()
 *	Module exit: tear down everything sfe_ipv4_init() set up, in
 *	roughly reverse order of registration.
 */
static void __exit sfe_ipv4_exit(void)
{
	struct sfe_ipv4 *si = &__si;

	DEBUG_INFO("SFE exit\n");

	/*
	 * Unpublish the fast receive hook first so no new packets enter us.
	 */
	RCU_INIT_POINTER(athrs_fast_nat_recv, NULL);

	/*
	 * Wait for all callbacks to complete.
	 */
	rcu_barrier();

	/*
	 * Destroy all connections.
	 *
	 * NOTE(review): the netfilter hooks and conntrack notifier are still
	 * registered at this point, so presumably they can no longer create
	 * connections once athrs_fast_nat_recv is cleared — confirm, otherwise
	 * a connection created here would leak.
	 */
	sfe_ipv4_destroy_all(si, NULL);

// XXX - this is where we need to unregister with any lower level offload services.

#ifdef CONFIG_NF_CONNTRACK_EVENTS
	nf_conntrack_unregister_notifier(&init_net, &sfe_ipv4_conntrack_notifier);

#endif
	nf_unregister_hooks(sfe_ipv4_ops_post_routing, ARRAY_SIZE(sfe_ipv4_ops_post_routing));
	/* Stop the periodic statistics timer and wait for a running handler. */
	del_timer_sync(&si->timer);

	unregister_inetaddr_notifier(&si->inet_notifier);
	unregister_netdevice_notifier(&si->dev_notifier);
	unregister_chrdev(si->debug_dev, "sfe_ipv4");

	sysfs_remove_file(si->sys_sfe_ipv4, &sfe_ipv4_debug_dev_attr.attr);

	sysfs_remove_file(si->sys_sfe_ipv4, &sfe_ipv4_pause_attr.attr);

	kobject_put(si->sys_sfe_ipv4);

}
4045
/*
 * Module entry/exit points and metadata.
 */
module_init(sfe_ipv4_init)
module_exit(sfe_ipv4_exit)

MODULE_AUTHOR("Qualcomm Atheros Inc.");
MODULE_DESCRIPTION("Shortcut Forwarding Engine - IPv4 edition");
MODULE_LICENSE("GPL");
4052