/*
 * sfe_ipv4.c
 *	Shortcut forwarding engine - IPv4 edition.
 *
 * XXX - fill in the appropriate GPL notice.
 */
#include <linux/module.h>
#include <linux/sysfs.h>
#include <linux/skbuff.h>
#include <linux/icmp.h>
#include <net/tcp.h>
#include <linux/etherdevice.h>

#include "sfe.h"
#include "sfe_ipv4.h"

/*
 * The default Linux ethhdr structure is "packed". It also has byte-aligned
 * MAC addresses and this leads to poor performance. This version is not
 * packed and has better alignment for the MAC addresses.
 */
struct sfe_ipv4_ethhdr {
	__be16 h_dest[ETH_ALEN / 2];
	__be16 h_source[ETH_ALEN / 2];
	__be16 h_proto;
};
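
/*
 * Illustrative sketch (an addition for clarity, not used by the engine): the
 * 16-bit alignment above is what lets the fast path write an Ethernet header
 * as six 16-bit stores when SFE_IPV4_CONNECTION_MATCH_FLAG_FAST_ETH_HDR is
 * set. "dest" and "src" are assumed to be the 16-bit MAC halves that a
 * connection match entry stores in host order.
 */
static inline void sfe_ipv4_write_eth_hdr_sketch(struct sfe_ipv4_ethhdr *eth,
						 const uint16_t *dest, const uint16_t *src)
{
	eth->h_proto = htons(ETH_P_IP);
	eth->h_dest[0] = htons(dest[0]);
	eth->h_dest[1] = htons(dest[1]);
	eth->h_dest[2] = htons(dest[2]);
	eth->h_source[0] = htons(src[0]);
	eth->h_source[1] = htons(src[1]);
	eth->h_source[2] = htons(src[2]);
}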

/*
 * The default Linux iphdr structure is "packed". This really hurts performance
 * on many CPUs. Here's an aligned and "unpacked" version of the same thing.
 */
struct sfe_ipv4_iphdr {
#if defined(__LITTLE_ENDIAN_BITFIELD)
	__u8 ihl:4,
	     version:4;
#elif defined (__BIG_ENDIAN_BITFIELD)
	__u8 version:4,
	     ihl:4;
#else
#error "Please fix <asm/byteorder.h>"
#endif
	__u8 tos;
	__be16 tot_len;
	__be16 id;
	__be16 frag_off;
	__u8 ttl;
	__u8 protocol;
	__sum16 check;
	__be32 saddr;
	__be32 daddr;

	/*
	 * The options start here.
	 */
};

/*
 * The default Linux udphdr structure is "packed". This really hurts performance
 * on many CPUs. Here's an aligned and "unpacked" version of the same thing.
 */
struct sfe_ipv4_udphdr {
	__be16 source;
	__be16 dest;
	__be16 len;
	__sum16 check;
};

/*
 * The default Linux tcphdr structure is "packed". This really hurts performance
 * on many CPUs. Here's an aligned and "unpacked" version of the same thing.
 */
struct sfe_ipv4_tcphdr {
	__be16 source;
	__be16 dest;
	__be32 seq;
	__be32 ack_seq;
#if defined(__LITTLE_ENDIAN_BITFIELD)
	__u16 res1:4,
	      doff:4,
	      fin:1,
	      syn:1,
	      rst:1,
	      psh:1,
	      ack:1,
	      urg:1,
	      ece:1,
	      cwr:1;
#elif defined(__BIG_ENDIAN_BITFIELD)
	__u16 doff:4,
	      res1:4,
	      cwr:1,
	      ece:1,
	      urg:1,
	      ack:1,
	      psh:1,
	      rst:1,
	      syn:1,
	      fin:1;
#else
#error "Adjust your <asm/byteorder.h> defines"
#endif
	__be16 window;
	__sum16 check;
	__be16 urg_ptr;
};

/*
 * Specifies the slack allowed when determining the lower bound on acceptable
 * ACK numbers carried in the TCP header (see the ack left-edge check in
 * sfe_ipv4_recv_tcp()).
 */
#define SFE_IPV4_TCP_MAX_ACK_WINDOW 65520
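
/*
 * A worked example of how this bound is applied (a sketch; the actual check
 * lives in sfe_ipv4_recv_tcp()): the lowest acceptable ACK is
 *
 *	left_edge = counter_end - max_win - SFE_IPV4_TCP_MAX_ACK_WINDOW - 1;
 *
 * so with a peer send edge of 1000000 and a maximum seen window of 65535,
 * any ACK below 1000000 - 65535 - 65520 - 1 = 868944 causes the flow to be
 * punted back to the slow path.
 */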

/*
 * IPv4 TCP connection match additional data.
 */
struct sfe_ipv4_tcp_connection_match {
	uint8_t win_scale;		/* Window scale */
	uint32_t max_win;		/* Maximum window size seen */
	uint32_t end;			/* Sequence number of the next byte to send (seq + segment length) */
	uint32_t max_end;		/* Sequence number of the last byte to ack */
};

/*
 * Bit flags for IPv4 connection matching entry.
 */
#define SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_SRC 0x1
					/* Perform source translation */
#define SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_DEST 0x2
					/* Perform destination translation */
#define SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK 0x4
					/* Ignore TCP sequence numbers */
#define SFE_IPV4_CONNECTION_MATCH_FLAG_FAST_ETH_HDR 0x8
					/* Fast Ethernet header write */

/*
 * IPv4 connection matching structure.
 */
struct sfe_ipv4_connection_match {
	/*
	 * References to other objects.
	 */
	struct sfe_ipv4_connection_match *next;
					/* Next connection match entry in a list */
	struct sfe_ipv4_connection_match *prev;
					/* Previous connection match entry in a list */
	struct sfe_ipv4_connection *connection;
					/* Pointer to our connection */
	struct sfe_ipv4_connection_match *counter_match;
					/* Pointer to the connection match in the "counter" direction to this one */
	struct sfe_ipv4_connection_match *active_next;
					/* Pointer to the next connection in the active list */
	struct sfe_ipv4_connection_match *active_prev;
					/* Pointer to the previous connection in the active list */
	bool active;			/* Flag to indicate if we're on the active list */

	/*
	 * Characteristics that identify flows that match this rule.
	 */
	struct net_device *match_dev;	/* Network device */
	uint8_t match_protocol;		/* Protocol */
	__be32 match_src_ip;		/* Source IP address */
	__be32 match_dest_ip;		/* Destination IP address */
	__be16 match_src_port;		/* Source port/connection ident */
	__be16 match_dest_port;		/* Destination port/connection ident */

	/*
	 * Control the operations of the match.
	 */
	uint32_t flags;			/* Bit flags */

	/*
	 * Connection state that we track once we match.
	 */
	union {				/* Protocol-specific state */
		struct sfe_ipv4_tcp_connection_match tcp;
	} protocol_state;
	uint32_t rx_packet_count;	/* Number of packets RX'd */
	uint32_t rx_byte_count;		/* Number of bytes RX'd */

	/*
	 * Packet translation information.
	 */
	__be32 xlate_src_ip;		/* Address after source translation */
	__be16 xlate_src_port;		/* Port/connection ident after source translation */
	uint16_t xlate_src_csum_adjustment;
					/* Transport layer checksum adjustment after source translation */
	__be32 xlate_dest_ip;		/* Address after destination translation */
	__be16 xlate_dest_port;		/* Port/connection ident after destination translation */
	uint16_t xlate_dest_csum_adjustment;
					/* Transport layer checksum adjustment after destination translation */

	/*
	 * Packet transmit information.
	 */
	struct net_device *xmit_dev;	/* Network device on which to transmit */
	unsigned short int xmit_dev_mtu;
					/* Interface MTU */
	uint16_t xmit_dest_mac[ETH_ALEN / 2];
					/* Destination MAC address to use when forwarding */
	uint16_t xmit_src_mac[ETH_ALEN / 2];
					/* Source MAC address to use when forwarding */

	/*
	 * Summary stats.
	 */
	uint64_t rx_packet_count64;	/* Number of packets RX'd */
	uint64_t rx_byte_count64;	/* Number of bytes RX'd */
};

/*
 * Per-connection data structure.
 */
struct sfe_ipv4_connection {
	struct sfe_ipv4_connection *next;
					/* Pointer to the next entry in a hash chain */
	struct sfe_ipv4_connection *prev;
					/* Pointer to the previous entry in a hash chain */
	int protocol;			/* IP protocol number */
	__be32 src_ip;			/* Source IP address */
	__be32 src_ip_xlate;		/* NAT-translated source IP address */
	__be32 dest_ip;			/* Destination IP address */
	__be32 dest_ip_xlate;		/* NAT-translated destination IP address */
	__be16 src_port;		/* Source port */
	__be16 src_port_xlate;		/* NAT-translated source port */
	__be16 dest_port;		/* Destination port */
	__be16 dest_port_xlate;		/* NAT-translated destination port */
	struct sfe_ipv4_connection_match *original_match;
					/* Original direction matching structure */
	struct net_device *original_dev;
					/* Original direction source device */
	struct sfe_ipv4_connection_match *reply_match;
					/* Reply direction matching structure */
	struct net_device *reply_dev;	/* Reply direction source device */
	uint64_t last_sync_jiffies;	/* Jiffies count for the last sync */
	struct sfe_ipv4_connection *all_connections_next;
					/* Pointer to the next entry in the list of all connections */
	struct sfe_ipv4_connection *all_connections_prev;
					/* Pointer to the previous entry in the list of all connections */
	int iterators;			/* Number of iterators currently using this connection */
	bool pending_free;		/* Flag that indicates that this connection should be freed after iteration */
	uint32_t mark;			/* Mark for outgoing packets */
};

/*
 * IPv4 connections and hash table size information.
 */
#define SFE_IPV4_CONNECTION_HASH_SHIFT 12
#define SFE_IPV4_CONNECTION_HASH_SIZE (1 << SFE_IPV4_CONNECTION_HASH_SHIFT)
#define SFE_IPV4_CONNECTION_HASH_MASK (SFE_IPV4_CONNECTION_HASH_SIZE - 1)

enum sfe_ipv4_exception_events {
	SFE_IPV4_EXCEPTION_EVENT_UDP_HEADER_INCOMPLETE,
	SFE_IPV4_EXCEPTION_EVENT_UDP_NO_CONNECTION,
	SFE_IPV4_EXCEPTION_EVENT_UDP_IP_OPTIONS_OR_INITIAL_FRAGMENT,
	SFE_IPV4_EXCEPTION_EVENT_UDP_SMALL_TTL,
	SFE_IPV4_EXCEPTION_EVENT_UDP_NEEDS_FRAGMENTATION,
	SFE_IPV4_EXCEPTION_EVENT_TCP_HEADER_INCOMPLETE,
	SFE_IPV4_EXCEPTION_EVENT_TCP_NO_CONNECTION_SLOW_FLAGS,
	SFE_IPV4_EXCEPTION_EVENT_TCP_NO_CONNECTION_FAST_FLAGS,
	SFE_IPV4_EXCEPTION_EVENT_TCP_IP_OPTIONS_OR_INITIAL_FRAGMENT,
	SFE_IPV4_EXCEPTION_EVENT_TCP_SMALL_TTL,
	SFE_IPV4_EXCEPTION_EVENT_TCP_NEEDS_FRAGMENTATION,
	SFE_IPV4_EXCEPTION_EVENT_TCP_FLAGS,
	SFE_IPV4_EXCEPTION_EVENT_TCP_SEQ_EXCEEDS_RIGHT_EDGE,
	SFE_IPV4_EXCEPTION_EVENT_TCP_SMALL_DATA_OFFS,
	SFE_IPV4_EXCEPTION_EVENT_TCP_BAD_SACK,
	SFE_IPV4_EXCEPTION_EVENT_TCP_BIG_DATA_OFFS,
	SFE_IPV4_EXCEPTION_EVENT_TCP_SEQ_BEFORE_LEFT_EDGE,
	SFE_IPV4_EXCEPTION_EVENT_TCP_ACK_EXCEEDS_RIGHT_EDGE,
	SFE_IPV4_EXCEPTION_EVENT_TCP_ACK_BEFORE_LEFT_EDGE,
	SFE_IPV4_EXCEPTION_EVENT_ICMP_HEADER_INCOMPLETE,
	SFE_IPV4_EXCEPTION_EVENT_ICMP_UNHANDLED_TYPE,
	SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_HEADER_INCOMPLETE,
	SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_NON_V4,
	SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_IP_OPTIONS_INCOMPLETE,
	SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_UDP_HEADER_INCOMPLETE,
	SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_TCP_HEADER_INCOMPLETE,
	SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_UNHANDLED_PROTOCOL,
	SFE_IPV4_EXCEPTION_EVENT_ICMP_NO_CONNECTION,
	SFE_IPV4_EXCEPTION_EVENT_ICMP_FLUSHED_CONNECTION,
	SFE_IPV4_EXCEPTION_EVENT_HEADER_INCOMPLETE,
	SFE_IPV4_EXCEPTION_EVENT_BAD_TOTAL_LENGTH,
	SFE_IPV4_EXCEPTION_EVENT_NON_V4,
	SFE_IPV4_EXCEPTION_EVENT_NON_INITIAL_FRAGMENT,
	SFE_IPV4_EXCEPTION_EVENT_DATAGRAM_INCOMPLETE,
	SFE_IPV4_EXCEPTION_EVENT_IP_OPTIONS_INCOMPLETE,
	SFE_IPV4_EXCEPTION_EVENT_UNHANDLED_PROTOCOL,
	SFE_IPV4_EXCEPTION_EVENT_LAST
};

static char *sfe_ipv4_exception_events_string[SFE_IPV4_EXCEPTION_EVENT_LAST] = {
	"UDP_HEADER_INCOMPLETE",
	"UDP_NO_CONNECTION",
	"UDP_IP_OPTIONS_OR_INITIAL_FRAGMENT",
	"UDP_SMALL_TTL",
	"UDP_NEEDS_FRAGMENTATION",
	"TCP_HEADER_INCOMPLETE",
	"TCP_NO_CONNECTION_SLOW_FLAGS",
	"TCP_NO_CONNECTION_FAST_FLAGS",
	"TCP_IP_OPTIONS_OR_INITIAL_FRAGMENT",
	"TCP_SMALL_TTL",
	"TCP_NEEDS_FRAGMENTATION",
	"TCP_FLAGS",
	"TCP_SEQ_EXCEEDS_RIGHT_EDGE",
	"TCP_SMALL_DATA_OFFS",
	"TCP_BAD_SACK",
	"TCP_BIG_DATA_OFFS",
	"TCP_SEQ_BEFORE_LEFT_EDGE",
	"TCP_ACK_EXCEEDS_RIGHT_EDGE",
	"TCP_ACK_BEFORE_LEFT_EDGE",
	"ICMP_HEADER_INCOMPLETE",
	"ICMP_UNHANDLED_TYPE",
	"ICMP_IPV4_HEADER_INCOMPLETE",
	"ICMP_IPV4_NON_V4",
	"ICMP_IPV4_IP_OPTIONS_INCOMPLETE",
	"ICMP_IPV4_UDP_HEADER_INCOMPLETE",
	"ICMP_IPV4_TCP_HEADER_INCOMPLETE",
	"ICMP_IPV4_UNHANDLED_PROTOCOL",
	"ICMP_NO_CONNECTION",
	"ICMP_FLUSHED_CONNECTION",
	"HEADER_INCOMPLETE",
	"BAD_TOTAL_LENGTH",
	"NON_V4",
	"NON_INITIAL_FRAGMENT",
	"DATAGRAM_INCOMPLETE",
	"IP_OPTIONS_INCOMPLETE",
	"UNHANDLED_PROTOCOL"
};

/*
 * Per-module structure.
 */
struct sfe_ipv4 {
	spinlock_t lock;		/* Lock for SMP correctness */
	struct sfe_ipv4_connection_match *active_head;
					/* Head of the list of recently active connections */
	struct sfe_ipv4_connection_match *active_tail;
					/* Tail of the list of recently active connections */
	struct sfe_ipv4_connection *all_connections_head;
					/* Head of the list of all connections */
	struct sfe_ipv4_connection *all_connections_tail;
					/* Tail of the list of all connections */
	unsigned int num_connections;	/* Number of connections */
	struct timer_list timer;	/* Timer used for periodic sync ops */
	sfe_ipv4_sync_rule_callback_t __rcu sync_rule_callback;
					/* Callback function registered by a connection manager for stats syncing */
	struct sfe_ipv4_connection *conn_hash[SFE_IPV4_CONNECTION_HASH_SIZE];
					/* Connection hash table */
	struct sfe_ipv4_connection_match *conn_match_hash[SFE_IPV4_CONNECTION_HASH_SIZE];
					/* Connection match hash table */

	/*
	 * Statistics.
	 */
	uint32_t connection_create_requests;
					/* Number of IPv4 connection create requests */
	uint32_t connection_create_collisions;
					/* Number of IPv4 connection create requests that collided with existing hash table entries */
	uint32_t connection_destroy_requests;
					/* Number of IPv4 connection destroy requests */
	uint32_t connection_destroy_misses;
					/* Number of IPv4 connection destroy requests that missed our hash table */
	uint32_t connection_match_hash_hits;
					/* Number of IPv4 connection match hash hits */
	uint32_t connection_match_hash_reorders;
					/* Number of IPv4 connection match hash reorders */
	uint32_t connection_flushes;	/* Number of IPv4 connection flushes */
	uint32_t packets_forwarded;	/* Number of IPv4 packets forwarded */
	uint32_t packets_not_forwarded;	/* Number of IPv4 packets not forwarded */
	uint32_t exception_events[SFE_IPV4_EXCEPTION_EVENT_LAST];

	/*
	 * Summary statistics.
	 */
	uint64_t connection_create_requests64;
					/* Number of IPv4 connection create requests */
	uint64_t connection_create_collisions64;
					/* Number of IPv4 connection create requests that collided with existing hash table entries */
	uint64_t connection_destroy_requests64;
					/* Number of IPv4 connection destroy requests */
	uint64_t connection_destroy_misses64;
					/* Number of IPv4 connection destroy requests that missed our hash table */
	uint64_t connection_match_hash_hits64;
					/* Number of IPv4 connection match hash hits */
	uint64_t connection_match_hash_reorders64;
					/* Number of IPv4 connection match hash reorders */
	uint64_t connection_flushes64;	/* Number of IPv4 connection flushes */
	uint64_t packets_forwarded64;	/* Number of IPv4 packets forwarded */
	uint64_t packets_not_forwarded64;
					/* Number of IPv4 packets not forwarded */
	uint64_t exception_events64[SFE_IPV4_EXCEPTION_EVENT_LAST];

	/*
	 * Control state.
	 */
	struct kobject *sys_sfe_ipv4;	/* sysfs linkage */
	int debug_dev;			/* Major number of the debug char device */
};

/*
 * Enumeration of the XML output.
 */
enum sfe_ipv4_debug_xml_states {
	SFE_IPV4_DEBUG_XML_STATE_START,
	SFE_IPV4_DEBUG_XML_STATE_CONNECTIONS_START,
	SFE_IPV4_DEBUG_XML_STATE_CONNECTIONS_CONNECTION,
	SFE_IPV4_DEBUG_XML_STATE_CONNECTIONS_END,
	SFE_IPV4_DEBUG_XML_STATE_EXCEPTIONS_START,
	SFE_IPV4_DEBUG_XML_STATE_EXCEPTIONS_EXCEPTION,
	SFE_IPV4_DEBUG_XML_STATE_EXCEPTIONS_END,
	SFE_IPV4_DEBUG_XML_STATE_STATS,
	SFE_IPV4_DEBUG_XML_STATE_END,
	SFE_IPV4_DEBUG_XML_STATE_DONE
};

/*
 * XML write state.
 */
struct sfe_ipv4_debug_xml_write_state {
	enum sfe_ipv4_debug_xml_states state;
					/* XML output file state machine state */
	struct sfe_ipv4_connection *iter_conn;
					/* Next connection iterator */
	int iter_exception;		/* Next exception iterator */
};

typedef bool (*sfe_ipv4_debug_xml_write_method_t)(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length,
						  int *total_read, struct sfe_ipv4_debug_xml_write_state *ws);

struct sfe_ipv4 __si;

/*
 * sfe_ipv4_gen_ip_csum()
 *	Generate the IP checksum for an IPv4 header.
 *
 * Note that this function assumes that we have only 20 bytes of IP header.
 */
static inline uint16_t sfe_ipv4_gen_ip_csum(struct sfe_ipv4_iphdr *iph)
{
	uint32_t sum;
	uint16_t *i = (uint16_t *)iph;

	iph->check = 0;

	/*
	 * Generate the sum.
	 */
	sum = i[0] + i[1] + i[2] + i[3] + i[4] + i[5] + i[6] + i[7] + i[8] + i[9];

	/*
	 * Fold it to ones-complement form.
	 */
	sum = (sum & 0xffff) + (sum >> 16);
	sum = (sum & 0xffff) + (sum >> 16);

	return (uint16_t)sum ^ 0xffff;
}
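
/*
 * Worked example of the fold above (for clarity, not engine code): if the
 * ten 16-bit words sum to 0x1a2b3 then the first fold yields
 * 0xa2b3 + 0x1 = 0xa2b4, the second fold leaves that unchanged, and the
 * function returns 0xa2b4 ^ 0xffff = 0x5d4b as the header checksum.
 */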

/*
 * sfe_ipv4_get_connection_match_hash()
 *	Generate the hash used in connection match lookups.
 */
static inline unsigned int sfe_ipv4_get_connection_match_hash(struct net_device *dev, uint8_t protocol,
							      __be32 src_ip, __be16 src_port,
							      __be32 dest_ip, __be16 dest_port)
{
	size_t dev_addr = (size_t)dev;
	uint32_t hash = ((uint32_t)dev_addr) ^ ntohl(src_ip ^ dest_ip) ^ protocol ^ ntohs(src_port ^ dest_port);
	return ((hash >> SFE_IPV4_CONNECTION_HASH_SHIFT) ^ hash) & SFE_IPV4_CONNECTION_HASH_MASK;
}
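
/*
 * A quick numeric sketch of the final fold (for clarity, not engine code):
 * with SFE_IPV4_CONNECTION_HASH_SHIFT of 12, a raw hash of 0xabcd1234 gives
 *
 *	((0xabcd1234 >> 12) ^ 0xabcd1234) & 0xfff = 0xabc7aee5 & 0xfff = 0xee5
 *
 * so entropy from the high-order bits (including the device pointer) still
 * lands in the 12-bit bucket index.
 */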

/*
 * sfe_ipv4_find_sfe_ipv4_connection_match()
 *	Get the IPv4 flow match info that corresponds to a particular 5-tuple.
 *
 * On entry we must be holding the lock that protects the hash table.
 */
static struct sfe_ipv4_connection_match *
sfe_ipv4_find_sfe_ipv4_connection_match(struct sfe_ipv4 *si, struct net_device *dev, uint8_t protocol,
					__be32 src_ip, __be16 src_port,
					__be32 dest_ip, __be16 dest_port) __attribute__((always_inline));
static struct sfe_ipv4_connection_match *
sfe_ipv4_find_sfe_ipv4_connection_match(struct sfe_ipv4 *si, struct net_device *dev, uint8_t protocol,
					__be32 src_ip, __be16 src_port,
					__be32 dest_ip, __be16 dest_port)
{
	struct sfe_ipv4_connection_match *cm;
	struct sfe_ipv4_connection_match *head;
	unsigned int conn_match_idx;

	conn_match_idx = sfe_ipv4_get_connection_match_hash(dev, protocol, src_ip, src_port, dest_ip, dest_port);
	cm = si->conn_match_hash[conn_match_idx];

	/*
	 * If we don't have anything in this chain then bail.
	 */
	if (unlikely(!cm)) {
		return cm;
	}

	/*
	 * Hopefully the first entry is the one we want.
	 */
	if (likely(cm->match_src_port == src_port)
	    && likely(cm->match_dest_port == dest_port)
	    && likely(cm->match_src_ip == src_ip)
	    && likely(cm->match_dest_ip == dest_ip)
	    && likely(cm->match_protocol == protocol)
	    && likely(cm->match_dev == dev)) {
		si->connection_match_hash_hits++;
		return cm;
	}

	/*
	 * We may or may not have a matching entry but if we do then we want to
	 * move that entry to the top of the hash chain when we get to it. We
	 * presume that this will be reused again very quickly.
	 */
	head = cm;
	do {
		cm = cm->next;
	} while (cm && (cm->match_src_port != src_port
		 || cm->match_dest_port != dest_port
		 || cm->match_src_ip != src_ip
		 || cm->match_dest_ip != dest_ip
		 || cm->match_protocol != protocol
		 || cm->match_dev != dev));

	/*
	 * If we didn't find a match then we're done.
	 */
	if (unlikely(!cm)) {
		return cm;
	}

	/*
	 * We found a match so move it to the head of the chain.
	 */
	if (cm->next) {
		cm->next->prev = cm->prev;
	}
	cm->prev->next = cm->next;
	cm->prev = NULL;
	cm->next = head;
	head->prev = cm;
	si->conn_match_hash[conn_match_idx] = cm;
	si->connection_match_hash_reorders++;

	return cm;
}

/*
 * sfe_ipv4_connection_match_update_summary_stats()
 *	Update the summary stats for a connection match entry.
 */
static inline void sfe_ipv4_connection_match_update_summary_stats(struct sfe_ipv4_connection_match *cm)
{
	cm->rx_packet_count64 += cm->rx_packet_count;
	cm->rx_packet_count = 0;
	cm->rx_byte_count64 += cm->rx_byte_count;
	cm->rx_byte_count = 0;
}

/*
 * sfe_ipv4_connection_match_compute_translations()
 *	Compute port and address translations for a connection match entry.
 */
static void sfe_ipv4_connection_match_compute_translations(struct sfe_ipv4_connection_match *cm)
{
	/*
	 * Before we insert the entry look to see if this is tagged as doing address
	 * translations. If it is then work out the adjustment that we need to apply
	 * to the transport checksum.
	 */
	if (cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_SRC) {
		/*
		 * Precompute an incremental checksum adjustment so we can
		 * edit packets in this stream very quickly. The algorithm is from RFC 1624.
		 */
		uint16_t src_ip_hi = cm->match_src_ip >> 16;
		uint16_t src_ip_lo = cm->match_src_ip & 0xffff;
		uint32_t xlate_src_ip = ~cm->xlate_src_ip;
		uint16_t xlate_src_ip_hi = xlate_src_ip >> 16;
		uint16_t xlate_src_ip_lo = xlate_src_ip & 0xffff;
		uint16_t xlate_src_port = ~cm->xlate_src_port;
		uint32_t adj;

		/*
		 * When we compute this fold it down to a 16-bit offset
		 * as that way we can avoid having to do a double
		 * folding of the ones-complement result because the
		 * addition of 2 16-bit values cannot cause a double
		 * wrap-around!
		 */
		adj = src_ip_hi + src_ip_lo + cm->match_src_port
		      + xlate_src_ip_hi + xlate_src_ip_lo + xlate_src_port;
		adj = (adj & 0xffff) + (adj >> 16);
		adj = (adj & 0xffff) + (adj >> 16);
		cm->xlate_src_csum_adjustment = (uint16_t)adj;
	}

	if (cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_DEST) {
		/*
		 * Precompute an incremental checksum adjustment so we can
		 * edit packets in this stream very quickly. The algorithm is from RFC 1624.
		 */
		uint16_t dest_ip_hi = cm->match_dest_ip >> 16;
		uint16_t dest_ip_lo = cm->match_dest_ip & 0xffff;
		uint32_t xlate_dest_ip = ~cm->xlate_dest_ip;
		uint16_t xlate_dest_ip_hi = xlate_dest_ip >> 16;
		uint16_t xlate_dest_ip_lo = xlate_dest_ip & 0xffff;
		uint16_t xlate_dest_port = ~cm->xlate_dest_port;
		uint32_t adj;

		/*
		 * When we compute this fold it down to a 16-bit offset
		 * as that way we can avoid having to do a double
		 * folding of the ones-complement result because the
		 * addition of 2 16-bit values cannot cause a double
		 * wrap-around!
		 */
		adj = dest_ip_hi + dest_ip_lo + cm->match_dest_port
		      + xlate_dest_ip_hi + xlate_dest_ip_lo + xlate_dest_port;
		adj = (adj & 0xffff) + (adj >> 16);
		adj = (adj & 0xffff) + (adj >> 16);
		cm->xlate_dest_csum_adjustment = (uint16_t)adj;
	}
}
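
/*
 * Minimal sketch of how the precomputed adjustment is consumed on the fast
 * path (this mirrors the UDP source translation in sfe_ipv4_recv_udp();
 * "udph" and "cm" are assumed to be in scope): one add plus one fold,
 * because adding two 16-bit values can only wrap around once.
 *
 *	uint32_t sum = udph->check + cm->xlate_src_csum_adjustment;
 *	sum = (sum & 0xffff) + (sum >> 16);
 *	udph->check = (uint16_t)sum;
 */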

/*
 * sfe_ipv4_update_summary_stats()
 *	Update the summary stats.
 */
static void sfe_ipv4_update_summary_stats(struct sfe_ipv4 *si)
{
	int i;

	si->connection_create_requests64 += si->connection_create_requests;
	si->connection_create_requests = 0;
	si->connection_create_collisions64 += si->connection_create_collisions;
	si->connection_create_collisions = 0;
	si->connection_destroy_requests64 += si->connection_destroy_requests;
	si->connection_destroy_requests = 0;
	si->connection_destroy_misses64 += si->connection_destroy_misses;
	si->connection_destroy_misses = 0;
	si->connection_match_hash_hits64 += si->connection_match_hash_hits;
	si->connection_match_hash_hits = 0;
	si->connection_match_hash_reorders64 += si->connection_match_hash_reorders;
	si->connection_match_hash_reorders = 0;
	si->connection_flushes64 += si->connection_flushes;
	si->connection_flushes = 0;
	si->packets_forwarded64 += si->packets_forwarded;
	si->packets_forwarded = 0;
	si->packets_not_forwarded64 += si->packets_not_forwarded;
	si->packets_not_forwarded = 0;

	for (i = 0; i < SFE_IPV4_EXCEPTION_EVENT_LAST; i++) {
		si->exception_events64[i] += si->exception_events[i];
		si->exception_events[i] = 0;
	}
}

/*
 * sfe_ipv4_insert_sfe_ipv4_connection_match()
 *	Insert a connection match into the hash.
 *
 * On entry we must be holding the lock that protects the hash table.
 */
static inline void sfe_ipv4_insert_sfe_ipv4_connection_match(struct sfe_ipv4 *si, struct sfe_ipv4_connection_match *cm)
{
	struct sfe_ipv4_connection_match **hash_head;
	struct sfe_ipv4_connection_match *prev_head;
	unsigned int conn_match_idx
		= sfe_ipv4_get_connection_match_hash(cm->match_dev, cm->match_protocol,
						     cm->match_src_ip, cm->match_src_port,
						     cm->match_dest_ip, cm->match_dest_port);
	hash_head = &si->conn_match_hash[conn_match_idx];
	prev_head = *hash_head;
	cm->prev = NULL;
	if (prev_head) {
		prev_head->prev = cm;
	}

	cm->next = prev_head;
	*hash_head = cm;
}

/*
 * sfe_ipv4_remove_sfe_ipv4_connection_match()
 *	Remove a connection match object from the hash.
 *
 * On entry we must be holding the lock that protects the hash table.
 */
static inline void sfe_ipv4_remove_sfe_ipv4_connection_match(struct sfe_ipv4 *si, struct sfe_ipv4_connection_match *cm)
{
	/*
	 * Unlink the connection match entry from the hash.
	 */
	if (cm->prev) {
		cm->prev->next = cm->next;
	} else {
		unsigned int conn_match_idx
			= sfe_ipv4_get_connection_match_hash(cm->match_dev, cm->match_protocol,
							     cm->match_src_ip, cm->match_src_port,
							     cm->match_dest_ip, cm->match_dest_port);
		si->conn_match_hash[conn_match_idx] = cm->next;
	}

	if (cm->next) {
		cm->next->prev = cm->prev;
	}

	/*
	 * Unlink the connection match entry from the active list.
	 */
	if (likely(cm->active_prev)) {
		cm->active_prev->active_next = cm->active_next;
	} else {
		si->active_head = cm->active_next;
	}

	if (likely(cm->active_next)) {
		cm->active_next->active_prev = cm->active_prev;
	} else {
		si->active_tail = cm->active_prev;
	}
}

/*
 * sfe_ipv4_get_connection_hash()
 *	Generate the hash used in connection lookups.
 */
static inline unsigned int sfe_ipv4_get_connection_hash(uint8_t protocol, __be32 src_ip, __be16 src_port,
							__be32 dest_ip, __be16 dest_port)
{
	uint32_t hash = ntohl(src_ip ^ dest_ip) ^ protocol ^ ntohs(src_port ^ dest_port);
	return ((hash >> SFE_IPV4_CONNECTION_HASH_SHIFT) ^ hash) & SFE_IPV4_CONNECTION_HASH_MASK;
}

/*
 * sfe_ipv4_find_sfe_ipv4_connection()
 *	Get the IPv4 connection info that corresponds to a particular 5-tuple.
 *
 * On entry we must be holding the lock that protects the hash table.
 */
static inline struct sfe_ipv4_connection *sfe_ipv4_find_sfe_ipv4_connection(struct sfe_ipv4 *si, uint32_t protocol,
									    __be32 src_ip, __be16 src_port,
									    __be32 dest_ip, __be16 dest_port)
{
	struct sfe_ipv4_connection *c;
	unsigned int conn_idx = sfe_ipv4_get_connection_hash(protocol, src_ip, src_port, dest_ip, dest_port);
	c = si->conn_hash[conn_idx];

	/*
	 * If we don't have anything in this chain then bail.
	 */
	if (unlikely(!c)) {
		return c;
	}

	/*
	 * Hopefully the first entry is the one we want.
	 */
	if (likely(c->src_port == src_port)
	    && likely(c->dest_port == dest_port)
	    && likely(c->src_ip == src_ip)
	    && likely(c->dest_ip == dest_ip)
	    && likely(c->protocol == protocol)) {
		return c;
	}

	/*
	 * We may or may not have a matching entry but if we do then we want to
	 * move that entry to the top of the hash chain when we get to it. We
	 * presume that this will be reused again very quickly.
	 */
	do {
		c = c->next;
	} while (c && (c->src_port != src_port
		 || c->dest_port != dest_port
		 || c->src_ip != src_ip
		 || c->dest_ip != dest_ip
		 || c->protocol != protocol));

	/*
	 * The caller will need this connection entry for a subsequent create or
	 * destroy request, so there's no need to re-order the entry for these
	 * requests.
	 */
	return c;
}

/*
 * sfe_ipv4_mark_rule()
 *	Update the mark for a currently offloaded connection.
 *
 * Takes the hash lock upon entry.
 */
static void sfe_ipv4_mark_rule(struct sfe_ipv4_mark *mark)
{
	struct sfe_ipv4 *si = &__si;
	struct sfe_ipv4_connection *c;

	spin_lock(&si->lock);
	c = sfe_ipv4_find_sfe_ipv4_connection(si, mark->protocol,
					      mark->src_ip, mark->src_port,
					      mark->dest_ip, mark->dest_port);
	if (c) {
		DEBUG_TRACE("INFO: Matching connection found for mark, setting to: %x\n", mark->mark);
		c->mark = mark->mark;
	}
	spin_unlock(&si->lock);
}

/*
 * sfe_ipv4_insert_sfe_ipv4_connection()
 *	Insert a connection into the hash.
 *
 * On entry we must be holding the lock that protects the hash table.
 */
static void sfe_ipv4_insert_sfe_ipv4_connection(struct sfe_ipv4 *si, struct sfe_ipv4_connection *c)
{
	struct sfe_ipv4_connection **hash_head;
	struct sfe_ipv4_connection *prev_head;
	unsigned int conn_idx;

	/*
	 * Insert entry into the connection hash.
	 */
	conn_idx = sfe_ipv4_get_connection_hash(c->protocol, c->src_ip, c->src_port,
						c->dest_ip, c->dest_port);
	hash_head = &si->conn_hash[conn_idx];
	prev_head = *hash_head;
	c->prev = NULL;
	if (prev_head) {
		prev_head->prev = c;
	}

	c->next = prev_head;
	*hash_head = c;

	/*
	 * Insert entry into the "all connections" list.
	 */
	if (si->all_connections_tail) {
		c->all_connections_prev = si->all_connections_tail;
		si->all_connections_tail->all_connections_next = c;
	} else {
		c->all_connections_prev = NULL;
		si->all_connections_head = c;
	}

	si->all_connections_tail = c;
	c->all_connections_next = NULL;
	si->num_connections++;

	/*
	 * Insert the connection match objects too.
	 */
	sfe_ipv4_insert_sfe_ipv4_connection_match(si, c->original_match);
	sfe_ipv4_insert_sfe_ipv4_connection_match(si, c->reply_match);
}

/*
 * sfe_ipv4_remove_sfe_ipv4_connection()
 *	Remove a sfe_ipv4_connection object from the hash.
 *
 * On entry we must be holding the lock that protects the hash table.
 */
static void sfe_ipv4_remove_sfe_ipv4_connection(struct sfe_ipv4 *si, struct sfe_ipv4_connection *c)
{
	/*
	 * Remove the connection match objects.
	 */
	sfe_ipv4_remove_sfe_ipv4_connection_match(si, c->reply_match);
	sfe_ipv4_remove_sfe_ipv4_connection_match(si, c->original_match);

	/*
	 * Unlink the connection.
	 */
	if (c->prev) {
		c->prev->next = c->next;
	} else {
		unsigned int conn_idx = sfe_ipv4_get_connection_hash(c->protocol, c->src_ip, c->src_port,
								     c->dest_ip, c->dest_port);
		si->conn_hash[conn_idx] = c->next;
	}

	if (c->next) {
		c->next->prev = c->prev;
	}
}

/*
 * sfe_ipv4_gen_sync_sfe_ipv4_connection()
 *	Sync a connection.
 *
 * On entry to this function we expect that the lock for the connection is either
 * already held or isn't required.
 */
static void sfe_ipv4_gen_sync_sfe_ipv4_connection(struct sfe_ipv4 *si, struct sfe_ipv4_connection *c,
						  struct sfe_ipv4_sync *sis, uint64_t now_jiffies)
{
	struct sfe_ipv4_connection_match *original_cm;
	struct sfe_ipv4_connection_match *reply_cm;

	/*
	 * Fill in the update message.
	 */
	sis->protocol = c->protocol;
	sis->src_ip = c->src_ip;
	sis->dest_ip = c->dest_ip;
	sis->src_port = c->src_port;
	sis->dest_port = c->dest_port;

	original_cm = c->original_match;
	reply_cm = c->reply_match;
	sis->src_td_max_window = original_cm->protocol_state.tcp.max_win;
	sis->src_td_end = original_cm->protocol_state.tcp.end;
	sis->src_td_max_end = original_cm->protocol_state.tcp.max_end;
	sis->dest_td_max_window = reply_cm->protocol_state.tcp.max_win;
	sis->dest_td_end = reply_cm->protocol_state.tcp.end;
	sis->dest_td_max_end = reply_cm->protocol_state.tcp.max_end;

	sfe_ipv4_connection_match_update_summary_stats(original_cm);
	sfe_ipv4_connection_match_update_summary_stats(reply_cm);

	sis->src_packet_count = original_cm->rx_packet_count64;
	sis->src_byte_count = original_cm->rx_byte_count64;
	sis->dest_packet_count = reply_cm->rx_packet_count64;
	sis->dest_byte_count = reply_cm->rx_byte_count64;

	/*
	 * Get the time increment since our last sync.
	 */
	sis->delta_jiffies = now_jiffies - c->last_sync_jiffies;
	c->last_sync_jiffies = now_jiffies;
}

/*
 * sfe_ipv4_decrement_sfe_ipv4_connection_iterator()
 *	Remove an iterator from a connection - free all resources if necessary.
 *
 * Returns true if the connection should now be freed, false if not.
 *
 * We must be locked on entry to this function.
 */
static bool sfe_ipv4_decrement_sfe_ipv4_connection_iterator(struct sfe_ipv4 *si, struct sfe_ipv4_connection *c)
{
	/*
	 * Are we the last iterator for this connection?
	 */
	c->iterators--;
	if (c->iterators) {
		return false;
	}

	/*
	 * Is this connection marked for deletion?
	 */
	if (!c->pending_free) {
		return false;
	}

	/*
	 * We're ready to delete this connection so unlink it from the "all
	 * connections" list.
	 */
	si->num_connections--;
	if (c->all_connections_prev) {
		c->all_connections_prev->all_connections_next = c->all_connections_next;
	} else {
		si->all_connections_head = c->all_connections_next;
	}

	if (c->all_connections_next) {
		c->all_connections_next->all_connections_prev = c->all_connections_prev;
	} else {
		si->all_connections_tail = c->all_connections_prev;
	}

	return true;
}

/*
 * sfe_ipv4_flush_sfe_ipv4_connection()
 *	Flush a connection and free all associated resources.
 *
 * We need to be called with bottom halves disabled locally as we need to acquire
 * the connection hash lock and release it again. In general we're actually called
 * from within a BH and so we're fine, but we're also called when connections are
 * torn down.
 */
static void sfe_ipv4_flush_sfe_ipv4_connection(struct sfe_ipv4 *si, struct sfe_ipv4_connection *c)
{
	struct sfe_ipv4_sync sis;
	uint64_t now_jiffies;
	bool pending_free = false;
	sfe_ipv4_sync_rule_callback_t sync_rule_callback;

	rcu_read_lock();
	spin_lock(&si->lock);
	si->connection_flushes++;

	/*
	 * Check that we're not currently being iterated. If we are then
	 * we can't free this entry yet but must mark it pending a free. If it's
	 * not being iterated then we can unlink it from the list of all
	 * connections.
	 */
	if (c->iterators) {
		pending_free = true;
		c->pending_free = true;
	} else {
		si->num_connections--;
		if (c->all_connections_prev) {
			c->all_connections_prev->all_connections_next = c->all_connections_next;
		} else {
			si->all_connections_head = c->all_connections_next;
		}

		if (c->all_connections_next) {
			c->all_connections_next->all_connections_prev = c->all_connections_prev;
		} else {
			si->all_connections_tail = c->all_connections_prev;
		}
	}

	sync_rule_callback = rcu_dereference(si->sync_rule_callback);

	spin_unlock(&si->lock);

	if (sync_rule_callback) {
		/*
		 * Generate a sync message and then sync.
		 */
		now_jiffies = get_jiffies_64();
		sfe_ipv4_gen_sync_sfe_ipv4_connection(si, c, &sis, now_jiffies);
		sync_rule_callback(&sis);
	}

	rcu_read_unlock();

	/*
	 * If we can't yet free the underlying memory then we're done.
	 */
	if (pending_free) {
		return;
	}

	/*
	 * Release our hold of the source and dest devices and free the memory
	 * for our connection objects.
	 */
	dev_put(c->original_dev);
	dev_put(c->reply_dev);
	kfree(c->original_match);
	kfree(c->reply_match);
	kfree(c);
}
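
/*
 * A minimal sketch of the registration side implied by the __rcu annotation
 * and the rcu_dereference() above (an assumption - the real registration
 * helper lies outside this excerpt; "callback" is a caller-supplied
 * sfe_ipv4_sync_rule_callback_t):
 *
 *	spin_lock_bh(&si->lock);
 *	rcu_assign_pointer(si->sync_rule_callback, callback);
 *	spin_unlock_bh(&si->lock);
 */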

/*
 * sfe_ipv4_recv_udp()
 *	Handle UDP packet receives and forwarding.
 */
static int sfe_ipv4_recv_udp(struct sfe_ipv4 *si, struct sk_buff *skb, struct net_device *dev,
			     unsigned int len, struct sfe_ipv4_iphdr *iph, unsigned int ihl, bool flush_on_find)
{
	struct sfe_ipv4_udphdr *udph;
	__be32 src_ip;
	__be32 dest_ip;
	__be16 src_port;
	__be16 dest_port;
	struct sfe_ipv4_connection_match *cm;
	uint8_t ttl;
	struct net_device *xmit_dev;

	/*
	 * Is our packet too short to contain a valid UDP header?
	 */
	if (unlikely(len < (sizeof(struct sfe_ipv4_udphdr) + ihl))) {
		spin_lock(&si->lock);
		si->exception_events[SFE_IPV4_EXCEPTION_EVENT_UDP_HEADER_INCOMPLETE]++;
		si->packets_not_forwarded++;
		spin_unlock(&si->lock);

		DEBUG_TRACE("packet too short for UDP header\n");
		return 0;
	}

	/*
	 * Read the IP address and port information. Read the IP header data first
	 * because we've almost certainly got that in the cache. We may not yet have
	 * the UDP header cached though so allow more time for any prefetching.
	 */
	src_ip = iph->saddr;
	dest_ip = iph->daddr;

	udph = (struct sfe_ipv4_udphdr *)(skb->data + ihl);
	src_port = udph->source;
	dest_port = udph->dest;

	spin_lock(&si->lock);

	/*
	 * Look for a connection match.
	 */
	cm = sfe_ipv4_find_sfe_ipv4_connection_match(si, dev, IPPROTO_UDP, src_ip, src_port, dest_ip, dest_port);
	if (unlikely(!cm)) {
		si->exception_events[SFE_IPV4_EXCEPTION_EVENT_UDP_NO_CONNECTION]++;
		si->packets_not_forwarded++;
		spin_unlock(&si->lock);

		DEBUG_TRACE("no connection found\n");
		return 0;
	}

	/*
	 * If our packet has been marked as "flush on find" we can't actually
	 * forward it in the fast path, but now that we've found an associated
	 * connection we can flush that out before we process the packet.
	 */
	if (unlikely(flush_on_find)) {
		struct sfe_ipv4_connection *c = cm->connection;
		sfe_ipv4_remove_sfe_ipv4_connection(si, c);
		si->exception_events[SFE_IPV4_EXCEPTION_EVENT_UDP_IP_OPTIONS_OR_INITIAL_FRAGMENT]++;
		si->packets_not_forwarded++;
		spin_unlock(&si->lock);

		DEBUG_TRACE("flush on find\n");
		sfe_ipv4_flush_sfe_ipv4_connection(si, c);
		return 0;
	}

	/*
	 * Does our TTL allow forwarding?
	 */
	ttl = iph->ttl;
	if (unlikely(ttl < 2)) {
		struct sfe_ipv4_connection *c = cm->connection;
		sfe_ipv4_remove_sfe_ipv4_connection(si, c);
		si->exception_events[SFE_IPV4_EXCEPTION_EVENT_UDP_SMALL_TTL]++;
		si->packets_not_forwarded++;
		spin_unlock(&si->lock);

		DEBUG_TRACE("ttl too low\n");
		sfe_ipv4_flush_sfe_ipv4_connection(si, c);
		return 0;
	}

	/*
	 * If our packet is larger than the MTU of the transmit interface then
	 * we can't forward it easily.
	 */
	if (unlikely(len > cm->xmit_dev_mtu)) {
		struct sfe_ipv4_connection *c = cm->connection;
		sfe_ipv4_remove_sfe_ipv4_connection(si, c);
		si->exception_events[SFE_IPV4_EXCEPTION_EVENT_UDP_NEEDS_FRAGMENTATION]++;
		si->packets_not_forwarded++;
		spin_unlock(&si->lock);

		DEBUG_TRACE("larger than mtu\n");
		sfe_ipv4_flush_sfe_ipv4_connection(si, c);
		return 0;
	}

	/*
	 * From this point on we're good to modify the packet.
	 */

	/*
	 * Decrement our TTL.
	 */
	iph->ttl = ttl - 1;

	/*
	 * Do we have to perform translations of the source address/port?
	 */
	if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_SRC)) {
		uint16_t udp_csum;

		iph->saddr = cm->xlate_src_ip;
		udph->source = cm->xlate_src_port;

		/*
		 * Do we have a non-zero UDP checksum? If we do then we need
		 * to update it.
		 */
		udp_csum = udph->check;
		if (likely(udp_csum)) {
			uint32_t sum = udp_csum + cm->xlate_src_csum_adjustment;
			sum = (sum & 0xffff) + (sum >> 16);
			udph->check = (uint16_t)sum;
		}
	}

	/*
	 * Do we have to perform translations of the destination address/port?
	 */
	if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_DEST)) {
		uint16_t udp_csum;

		iph->daddr = cm->xlate_dest_ip;
		udph->dest = cm->xlate_dest_port;

		/*
		 * Do we have a non-zero UDP checksum? If we do then we need
		 * to update it.
		 */
		udp_csum = udph->check;
		if (likely(udp_csum)) {
			uint32_t sum = udp_csum + cm->xlate_dest_csum_adjustment;
			sum = (sum & 0xffff) + (sum >> 16);
			udph->check = (uint16_t)sum;
		}
	}

	/*
	 * Replace the IP checksum.
	 */
	iph->check = sfe_ipv4_gen_ip_csum(iph);

//	if ((nat_entry_data->tos & FASTNAT_DSCP_MASK) != (iph->tos & FASTNAT_DSCP_MASK)) {
//		ipv4_change_dsfield(iph, (u_int8_t)(~FASTNAT_DSCP_MASK), nat_entry_data->tos);
//	}

//	skb->priority = nat_entry_data->priority;
//	skb->mark = nat_entry_data->mark;

	/*
	 * Update traffic stats.
	 */
	cm->rx_packet_count++;
	cm->rx_byte_count += len;

	/*
	 * If we're not already on the active list then insert ourselves at the tail
	 * of the current list.
	 */
	if (unlikely(!cm->active)) {
		cm->active = true;
		cm->active_prev = si->active_tail;
		if (likely(si->active_tail)) {
			si->active_tail->active_next = cm;
		} else {
			si->active_head = cm;
		}
		si->active_tail = cm;
	}

	xmit_dev = cm->xmit_dev;
	skb->dev = xmit_dev;

	/*
	 * Do we have a simple Ethernet header to write?
	 */
	if (unlikely(!(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_FAST_ETH_HDR))) {
		/*
		 * If this is anything other than a point-to-point interface then we need to
		 * create a header based on MAC addresses.
		 */
		if (likely(!(xmit_dev->flags & IFF_POINTOPOINT))) {
			xmit_dev->header_ops->create(skb, xmit_dev, ETH_P_IP,
						     cm->xmit_dest_mac, cm->xmit_src_mac, len);
		}
	} else {
		struct sfe_ipv4_ethhdr *eth = (struct sfe_ipv4_ethhdr *)__skb_push(skb, ETH_HLEN);
		eth->h_proto = htons(ETH_P_IP);
		eth->h_dest[0] = htons(cm->xmit_dest_mac[0]);
		eth->h_dest[1] = htons(cm->xmit_dest_mac[1]);
		eth->h_dest[2] = htons(cm->xmit_dest_mac[2]);
		eth->h_source[0] = htons(cm->xmit_src_mac[0]);
		eth->h_source[1] = htons(cm->xmit_src_mac[1]);
		eth->h_source[2] = htons(cm->xmit_src_mac[2]);
	}

	si->packets_forwarded++;
	spin_unlock(&si->lock);

	/*
	 * We're going to check for GSO flags when we transmit the packet so
	 * start fetching the necessary cache line now.
	 */
	prefetch(skb_shinfo(skb));

	/*
	 * Send the packet on its way.
	 */
	dev_queue_xmit(skb);

	return 1;
}

/*
 * sfe_ipv4_process_tcp_option_sack()
 *	Parse the TCP SACK option and update the ack accordingly.
 */
static bool sfe_ipv4_process_tcp_option_sack(const struct sfe_ipv4_tcphdr *th, const uint32_t data_offs,
					     uint32_t *ack) __attribute__((always_inline));
static bool sfe_ipv4_process_tcp_option_sack(const struct sfe_ipv4_tcphdr *th, const uint32_t data_offs,
					     uint32_t *ack)
{
	uint32_t length = sizeof(struct sfe_ipv4_tcphdr);
	uint8_t *ptr = (uint8_t *)th + length;

	/*
	 * If the options are just a TIMESTAMP then skip them - there's no SACK
	 * data to process.
	 */
	if (likely(data_offs == length + TCPOLEN_TIMESTAMP + 1 + 1)
	    && likely(ptr[0] == TCPOPT_NOP)
	    && likely(ptr[1] == TCPOPT_NOP)
	    && likely(ptr[2] == TCPOPT_TIMESTAMP)
	    && likely(ptr[3] == TCPOLEN_TIMESTAMP)) {
		return true;
	}

	/*
	 * TCP options. Parse the SACK option.
	 */
	while (length < data_offs) {
		uint8_t size;
		uint8_t kind;

		ptr = (uint8_t *)th + length;
		kind = *ptr;

		/*
		 * NOP, for padding.
		 * Handled before the other kinds so we can escape quickly
		 * without reading a size byte.
		 */
		if (kind == TCPOPT_NOP) {
			length++;
			continue;
		}

		if (kind == TCPOPT_SACK) {
			uint32_t sack = 0;
			uint8_t re = 1 + 1;

			size = *(ptr + 1);
			if ((size < (1 + 1 + TCPOLEN_SACK_PERBLOCK))
			    || ((size - (1 + 1)) % (TCPOLEN_SACK_PERBLOCK))
			    || (size > (data_offs - length))) {
				return false;
			}

			re += 4;
			while (re < size) {
				uint32_t sack_re;
				uint8_t *sptr = ptr + re;
				sack_re = (sptr[0] << 24) | (sptr[1] << 16) | (sptr[2] << 8) | sptr[3];
				if (sack_re > sack) {
					sack = sack_re;
				}
				re += TCPOLEN_SACK_PERBLOCK;
			}
			if (sack > *ack) {
				*ack = sack;
			}
			length += size;
			continue;
		}
		if (kind == TCPOPT_EOL) {
			return true;
		}
		size = *(ptr + 1);
		if (size < 2) {
			return false;
		}
		length += size;
	}

	return true;
}
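
/*
 * For reference, the on-the-wire layout the parser above walks (per RFC 2018;
 * a sketch, not engine code):
 *
 *	kind = TCPOPT_SACK (1 byte) | length = 2 + 8n (1 byte) |
 *	n blocks of { left edge (4 bytes) | right edge (4 bytes) }
 *
 * The loop reads each block's right edge, keeps the largest one seen and,
 * if it lies beyond the cumulative ACK, uses it as the effective ack for
 * window tracking.
 */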
1374
1375/*
1376 * sfe_ipv4_recv_tcp()
1377 * Handle TCP packet receives and forwarding.
1378 */
1379static int sfe_ipv4_recv_tcp(struct sfe_ipv4 *si, struct sk_buff *skb, struct net_device *dev,
1380 unsigned int len, struct sfe_ipv4_iphdr *iph, unsigned int ihl, bool flush_on_find)
1381{
1382 struct sfe_ipv4_tcphdr *tcph;
Dave Hudson87973cd2013-10-22 16:00:04 +01001383 __be32 src_ip;
1384 __be32 dest_ip;
1385 __be16 src_port;
1386 __be16 dest_port;
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01001387 struct sfe_ipv4_connection_match *cm;
1388 struct sfe_ipv4_connection_match *counter_cm;
1389 uint8_t ttl;
1390 uint32_t flags;
1391 struct net_device *xmit_dev;
1392
1393 /*
1394 * Is our packet too short to contain a valid UDP header?
1395 */
1396 if (unlikely(len < (sizeof(struct sfe_ipv4_tcphdr) + ihl))) {
1397 spin_lock(&si->lock);
1398 si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_HEADER_INCOMPLETE]++;
1399 si->packets_not_forwarded++;
1400 spin_unlock(&si->lock);
1401
1402 DEBUG_TRACE("packet too short for TCP header\n");
1403 return 0;
1404 }
1405
1406 /*
1407 * Read the IP address and port information. Read the IP header data first
1408 * because we've almost certainly got that in the cache. We may not yet have
1409 * the TCP header cached though so allow more time for any prefetching.
1410 */
Dave Hudson87973cd2013-10-22 16:00:04 +01001411 src_ip = iph->saddr;
1412 dest_ip = iph->daddr;
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01001413
1414 tcph = (struct sfe_ipv4_tcphdr *)(skb->data + ihl);
Dave Hudson87973cd2013-10-22 16:00:04 +01001415 src_port = tcph->source;
1416 dest_port = tcph->dest;
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01001417 flags = tcp_flag_word(tcph);
1418
1419 spin_lock(&si->lock);
1420
1421 /*
1422 * Look for a connection match.
1423 */
1424 cm = sfe_ipv4_find_sfe_ipv4_connection_match(si, dev, IPPROTO_TCP, src_ip, src_port, dest_ip, dest_port);
1425 if (unlikely(!cm)) {
1426 /*
1427 * We didn't get a connection but as TCP is connection-oriented that
1428 * may be because this is a non-fast connection (not running established).
1429 * For diagnostic purposes we differentiate this here.
1430 */
1431 if (likely((flags & (TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_FIN | TCP_FLAG_ACK)) == TCP_FLAG_ACK)) {
1432 si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_NO_CONNECTION_FAST_FLAGS]++;
1433 si->packets_not_forwarded++;
1434 spin_unlock(&si->lock);
1435
1436 DEBUG_TRACE("no connection found - fast flags\n");
1437 return 0;
1438 }
1439 si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_NO_CONNECTION_SLOW_FLAGS]++;
1440 si->packets_not_forwarded++;
1441 spin_unlock(&si->lock);
1442
1443 DEBUG_TRACE("no connection found - slow flags: 0x%x\n",
1444 flags & (TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_FIN | TCP_FLAG_ACK));
1445 return 0;
1446 }
1447
1448 /*
1449 * If our packet has beern marked as "flush on find" we can't actually
1450 * forward it in the fast path, but now that we've found an associated
1451 * connection we can flush that out before we process the packet.
1452 */
1453 if (unlikely(flush_on_find)) {
1454 struct sfe_ipv4_connection *c = cm->connection;
1455 sfe_ipv4_remove_sfe_ipv4_connection(si, c);
1456 si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_IP_OPTIONS_OR_INITIAL_FRAGMENT]++;
1457 si->packets_not_forwarded++;
1458 spin_unlock(&si->lock);
1459
1460 DEBUG_TRACE("flush on find\n");
1461 sfe_ipv4_flush_sfe_ipv4_connection(si, c);
1462 return 0;
1463 }
1464
1465 /*
1466 * Does our TTL allow forwarding?
1467 */
1468 ttl = iph->ttl;
1469 if (unlikely(ttl < 2)) {
1470 struct sfe_ipv4_connection *c = cm->connection;
1471 sfe_ipv4_remove_sfe_ipv4_connection(si, c);
1472 si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_SMALL_TTL]++;
1473 si->packets_not_forwarded++;
1474 spin_unlock(&si->lock);
1475
1476 DEBUG_TRACE("ttl too low\n");
1477 sfe_ipv4_flush_sfe_ipv4_connection(si, c);
1478 return 0;
1479 }
1480
1481 /*
1482 * If our packet is larger than the MTU of the transmit interface then
1483 * we can't forward it easily.
1484 */
1485 if (unlikely(len > cm->xmit_dev_mtu)) {
1486 struct sfe_ipv4_connection *c = cm->connection;
1487 sfe_ipv4_remove_sfe_ipv4_connection(si, c);
1488 si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_NEEDS_FRAGMENTATION]++;
1489 si->packets_not_forwarded++;
1490 spin_unlock(&si->lock);
1491
1492 DEBUG_TRACE("larger than mtu\n");
1493 sfe_ipv4_flush_sfe_ipv4_connection(si, c);
1494 return 0;
1495 }
1496
1497 /*
1498 * Look at our TCP flags. Anything missing an ACK or that has RST, SYN or FIN
1499 * set is not a fast path packet.
1500 */
1501 if (unlikely((flags & (TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_FIN | TCP_FLAG_ACK)) != TCP_FLAG_ACK)) {
1502 struct sfe_ipv4_connection *c = cm->connection;
1503 sfe_ipv4_remove_sfe_ipv4_connection(si, c);
1504 si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_FLAGS]++;
1505 si->packets_not_forwarded++;
1506 spin_unlock(&si->lock);
1507
1508 DEBUG_TRACE("TCP flags: 0x%x are not fast\n",
1509 flags & (TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_FIN | TCP_FLAG_ACK));
1510 sfe_ipv4_flush_sfe_ipv4_connection(si, c);
1511 return 0;
1512 }
1513
1514 counter_cm = cm->counter_match;
1515
1516 /*
1517 * Are we doing sequence number checking?
1518 */
1519 if (likely(!(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK))) {
1520 uint32_t seq;
1521 uint32_t ack;
1522 uint32_t sack;
1523 uint32_t data_offs;
1524 uint32_t end;
1525 uint32_t left_edge;
1526 uint32_t scaled_win;
1527 uint32_t max_end;
1528
1529 /*
1530 * Is our sequence fully past the right hand edge of the window?
1531 */
Dave Hudson87973cd2013-10-22 16:00:04 +01001532 seq = ntohl(tcph->seq);
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01001533 if (unlikely((int32_t)(seq - (cm->protocol_state.tcp.max_end + 1)) > 0)) {
1534 struct sfe_ipv4_connection *c = cm->connection;
1535 sfe_ipv4_remove_sfe_ipv4_connection(si, c);
1536 si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_SEQ_EXCEEDS_RIGHT_EDGE]++;
1537 si->packets_not_forwarded++;
1538 spin_unlock(&si->lock);
1539
1540 DEBUG_TRACE("seq: %u exceeds right edge: %u\n",
1541 seq, cm->protocol_state.tcp.max_end + 1);
1542 sfe_ipv4_flush_sfe_ipv4_connection(si, c);
1543 return 0;
1544 }
1545
1546 /*
1547 * Check that our TCP data offset isn't too short.
1548 */
1549 data_offs = tcph->doff << 2;
1550 if (unlikely(data_offs < sizeof(struct sfe_ipv4_tcphdr))) {
1551 struct sfe_ipv4_connection *c = cm->connection;
1552 sfe_ipv4_remove_sfe_ipv4_connection(si, c);
1553 si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_SMALL_DATA_OFFS]++;
1554 si->packets_not_forwarded++;
1555 spin_unlock(&si->lock);
1556
1557 DEBUG_TRACE("TCP data offset: %u, too small\n", data_offs);
1558 sfe_ipv4_flush_sfe_ipv4_connection(si, c);
1559 return 0;
1560 }
1561
1562 /*
1563 * Update ACK according to any SACK option.
1564 */
Dave Hudson87973cd2013-10-22 16:00:04 +01001565 ack = ntohl(tcph->ack_seq);
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01001566 sack = ack;
1567 if (unlikely(!sfe_ipv4_process_tcp_option_sack(tcph, data_offs, &sack))) {
1568 struct sfe_ipv4_connection *c = cm->connection;
1569 sfe_ipv4_remove_sfe_ipv4_connection(si, c);
1570 si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_BAD_SACK]++;
1571 si->packets_not_forwarded++;
1572 spin_unlock(&si->lock);
1573
1574 DEBUG_TRACE("TCP option SACK size is wrong\n");
1575 sfe_ipv4_flush_sfe_ipv4_connection(si, c);
1576 return 0;
1577 }
1578
1579 /*
1580 * Check that our TCP data offset isn't past the end of the packet.
1581 */
1582 data_offs += sizeof(struct sfe_ipv4_iphdr);
1583 if (unlikely(len < data_offs)) {
1584 struct sfe_ipv4_connection *c = cm->connection;
1585 sfe_ipv4_remove_sfe_ipv4_connection(si, c);
1586 si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_BIG_DATA_OFFS]++;
1587 si->packets_not_forwarded++;
1588 spin_unlock(&si->lock);
1589
1590 DEBUG_TRACE("TCP data offset: %u, past end of packet: %u\n",
1591 data_offs, len);
1592 sfe_ipv4_flush_sfe_ipv4_connection(si, c);
1593 return 0;
1594 }
1595
1596 end = seq + len - data_offs;
1597
1598 /*
1599 * Is our sequence fully before the left hand edge of the window?
1600 */
1601 if (unlikely((int32_t)(end - (cm->protocol_state.tcp.end
1602 - counter_cm->protocol_state.tcp.max_win - 1)) < 0)) {
1603 struct sfe_ipv4_connection *c = cm->connection;
1604 sfe_ipv4_remove_sfe_ipv4_connection(si, c);
1605 si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_SEQ_BEFORE_LEFT_EDGE]++;
1606 si->packets_not_forwarded++;
1607 spin_unlock(&si->lock);
1608
1609 DEBUG_TRACE("seq: %u before left edge: %u\n",
1610 end, cm->protocol_state.tcp.end - counter_cm->protocol_state.tcp.max_win - 1);
1611 sfe_ipv4_flush_sfe_ipv4_connection(si, c);
1612 return 0;
1613 }
1614
1615 /*
1616 * Are we acking data that is to the right of what has been sent?
1617 */
1618 if (unlikely((int32_t)(sack - (counter_cm->protocol_state.tcp.end + 1)) > 0)) {
1619 struct sfe_ipv4_connection *c = cm->connection;
1620 sfe_ipv4_remove_sfe_ipv4_connection(si, c);
1621 si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_ACK_EXCEEDS_RIGHT_EDGE]++;
1622 si->packets_not_forwarded++;
1623 spin_unlock(&si->lock);
1624
1625 DEBUG_TRACE("ack: %u exceeds right edge: %u\n",
1626 sack, counter_cm->protocol_state.tcp.end + 1);
1627 sfe_ipv4_flush_sfe_ipv4_connection(si, c);
1628 return 0;
1629 }
1630
1631 /*
1632 * Is our ack too far before the left hand edge of the window?
1633 */
1634 left_edge = counter_cm->protocol_state.tcp.end
1635 - cm->protocol_state.tcp.max_win
1636 - SFE_IPV4_TCP_MAX_ACK_WINDOW
1637 - 1;
1638 if (unlikely((int32_t)(sack - left_edge) < 0)) {
1639 struct sfe_ipv4_connection *c = cm->connection;
1640 sfe_ipv4_remove_sfe_ipv4_connection(si, c);
1641 si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_ACK_BEFORE_LEFT_EDGE]++;
1642 si->packets_not_forwarded++;
1643 spin_unlock(&si->lock);
1644
1645 DEBUG_TRACE("ack: %u before left edge: %u\n", sack, left_edge);
1646 sfe_ipv4_flush_sfe_ipv4_connection(si, c);
1647 return 0;
1648 }
1649
1650 /*
1651 * Is this the largest window size we've seen for this connection? If it is
1652 * then we need to record the new value.
1653 */
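	/*
	 * scaled_win is the advertised window shifted left by the window
	 * scale negotiated at connection setup, plus any bytes acknowledged
	 * ahead of the cumulative ACK via SACK (sack - ack). Example: a raw
	 * window of 1000 with win_scale 7 advertises 128000 bytes.
	 */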
Dave Hudson87973cd2013-10-22 16:00:04 +01001654 scaled_win = ntohs(tcph->window) << cm->protocol_state.tcp.win_scale;
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01001655 scaled_win += (sack - ack);
1656 if (unlikely(cm->protocol_state.tcp.max_win < scaled_win)) {
1657 cm->protocol_state.tcp.max_win = scaled_win;
1658 }
1659
1660 /*
1661 * If our sequence and/or ack numbers have advanced then record the new state.
1662 */
1663 if (likely((int32_t)(end - cm->protocol_state.tcp.end) >= 0)) {
1664 cm->protocol_state.tcp.end = end;
1665 }
1666
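	/*
	 * max_end is the right-most sequence number the peer may send next:
	 * the (SACK-adjusted) ACK plus the scaled window just advertised.
	 * The right-edge check earlier in this validation uses the counter
	 * direction's value to judge incoming segments.
	 */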
1667 max_end = sack + scaled_win;
1668 if (likely((int32_t)(max_end - counter_cm->protocol_state.tcp.max_end) >= 0)) {
1669 counter_cm->protocol_state.tcp.max_end = max_end;
1670 }
1671 }
1672
1673 /*
1674 * From this point on we're good to modify the packet.
1675 */
1676
1677 /*
1678 * Decrement our TTL.
1679 */
1680 iph->ttl = ttl - 1;
1681
1682 /*
1683 * Do we have to perform translations of the source address/port?
1684 */
1685 if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_SRC)) {
1686 uint16_t tcp_csum;
1687 uint32_t sum;
1688
Dave Hudson87973cd2013-10-22 16:00:04 +01001689 iph->saddr = cm->xlate_src_ip;
1690 tcph->source = cm->xlate_src_port;
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01001691
1692 /*
1693 * The TCP checksum is mandatory so we must update it to reflect
1694 * the translated source address and port.
1695 */
Dave Hudson87973cd2013-10-22 16:00:04 +01001696 tcp_csum = tcph->check;
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01001697 sum = tcp_csum + cm->xlate_src_csum_adjustment;
1698 sum = (sum & 0xffff) + (sum >> 16);
Dave Hudson87973cd2013-10-22 16:00:04 +01001699 tcph->check = (uint16_t)sum;
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01001700 }
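	/*
	 * The checksum update in the translation block above (and in the
	 * matching destination block below) is the incremental update of
	 * RFC 1624: add the pre-computed per-connection adjustment to the
	 * 16-bit one's complement sum, then fold the carry back in. A single
	 * fold suffices because the sum of two 16-bit values fits in 17
	 * bits, producing at most one carry.
	 */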
1701
1702 /*
1703 * Do we have to perform translations of the destination address/port?
1704 */
1705 if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_DEST)) {
1706 uint16_t tcp_csum;
1707 uint32_t sum;
1708
Dave Hudson87973cd2013-10-22 16:00:04 +01001709 iph->daddr = cm->xlate_dest_ip;
1710 tcph->dest = cm->xlate_dest_port;
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01001711
1712 /*
1713 * The TCP checksum is mandatory so we must update it to reflect
1714 * the translated destination address and port.
1715 */
Dave Hudson87973cd2013-10-22 16:00:04 +01001716 tcp_csum = tcph->check;
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01001717 sum = tcp_csum + cm->xlate_dest_csum_adjustment;
1718 sum = (sum & 0xffff) + (sum >> 16);
Dave Hudson87973cd2013-10-22 16:00:04 +01001719 tcph->check = (uint16_t)sum;
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01001720 }
1721
1722 /*
1723 * Replace the IP checksum.
1724 */
1725 iph->check = sfe_ipv4_gen_ip_csum(iph);
1726
1727// if ((nat_entry_data->tos & FASTNAT_DSCP_MASK) != (iph->tos & FASTNAT_DSCP_MASK)) {
1728// ipv4_change_dsfield(iph, (u_int8_t)(~FASTNAT_DSCP_MASK), nat_entry_data->tos);
1729// }
1730
1731// skb->priority = nat_entry_data->priority;
1732// skb->mark = nat_entry_data->mark;
1733
1734 /*
1735 * Update traffic stats.
1736 */
1737 cm->rx_packet_count++;
1738 cm->rx_byte_count += len;
1739
1740 /*
1741 * If we're not already on the active list then insert ourselves at the tail
1742 * of the current list.
1743 */
1744 if (unlikely(!cm->active)) {
1745 cm->active = true;
1746 cm->active_prev = si->active_tail;
1747 if (likely(si->active_tail)) {
1748 si->active_tail->active_next = cm;
1749 } else {
1750 si->active_head = cm;
1751 }
1752 si->active_tail = cm;
1753 }
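	/*
	 * The active list is what the periodic sync timer walks, so a
	 * connection is appended only on its first packet after a sync and
	 * is unlinked again once its stats have been reported - an O(1)
	 * scheme that lets the timer skip idle connections entirely.
	 */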
1754
1755 xmit_dev = cm->xmit_dev;
1756 skb->dev = xmit_dev;
1757
1758 /*
1759 * Do we have a simple Ethernet header to write?
1760 */
1761 if (unlikely(!(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_FAST_ETH_HDR))) {
1762 /*
1763 * If this is anything other than a point-to-point interface then we need to
1764 * create a header based on MAC addresses.
1765 */
1766 if (likely(!(xmit_dev->flags & IFF_POINTOPOINT))) {
1767 xmit_dev->header_ops->create(skb, xmit_dev, ETH_P_IP,
1768 cm->xmit_dest_mac, cm->xmit_src_mac, len);
1769 }
1770 } else {
1771 struct sfe_ipv4_ethhdr *eth = (struct sfe_ipv4_ethhdr *)__skb_push(skb, ETH_HLEN);
1772 eth->h_proto = htons(ETH_P_IP);
1773 eth->h_dest[0] = htons(cm->xmit_dest_mac[0]);
1774 eth->h_dest[1] = htons(cm->xmit_dest_mac[1]);
1775 eth->h_dest[2] = htons(cm->xmit_dest_mac[2]);
1776 eth->h_source[0] = htons(cm->xmit_src_mac[0]);
1777 eth->h_source[1] = htons(cm->xmit_src_mac[1]);
1778 eth->h_source[2] = htons(cm->xmit_src_mac[2]);
1779 }
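	/*
	 * The fast path above writes the Ethernet header as three 16-bit
	 * stores per MAC address. This is safe because sfe_ipv4_ethhdr keeps
	 * its addresses 16-bit aligned, avoiding the byte-wise copies that
	 * the packed kernel ethhdr would force on alignment-sensitive CPUs.
	 */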
1780
Matthew McClintockbe7b47d2013-11-27 13:26:23 -06001781 /*
1782 * Mark outgoing packet
1783 */
1784 skb->mark = cm->connection->mark;
1785 if (skb->mark) {
1786 DEBUG_TRACE("SKB MARK is NON ZERO %x\n", skb->mark);
1787 }
1788
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01001789 si->packets_forwarded++;
1790 spin_unlock(&si->lock);
1791
1792 /*
1793 * We're going to check for GSO flags when we transmit the packet so
1794 * start fetching the necessary cache line now.
1795 */
1796 prefetch(skb_shinfo(skb));
1797
1798 /*
1799 * Send the packet on its way.
1800 */
1801 dev_queue_xmit(skb);
1802
1803 return 1;
1804}
1805
1806/*
1807 * sfe_ipv4_recv_icmp()
1808 * Handle ICMP packet receives.
1809 *
1810 * ICMP packets aren't handled as a "fast path" and always have us process them
1811 * through the default Linux stack. What we do need to do is look for any errors
1812 * about connections we are handling in the fast path. If we find any such
1813 * connections then we want to flush their state so that the ICMP error path
1814 * within Linux has all of the correct state should it need it.
1815 */
1816static int sfe_ipv4_recv_icmp(struct sfe_ipv4 *si, struct sk_buff *skb, struct net_device *dev,
1817 unsigned int len, struct sfe_ipv4_iphdr *iph, unsigned int ihl)
1818{
1819 struct icmphdr *icmph;
1820 struct sfe_ipv4_iphdr *icmp_iph;
1821 unsigned int icmp_ihl_words;
1822 unsigned int icmp_ihl;
1823 uint32_t *icmp_trans_h;
1824 struct sfe_ipv4_udphdr *icmp_udph;
1825 struct sfe_ipv4_tcphdr *icmp_tcph;
Dave Hudson87973cd2013-10-22 16:00:04 +01001826 __be32 src_ip;
1827 __be32 dest_ip;
1828 __be16 src_port;
1829 __be16 dest_port;
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01001830 struct sfe_ipv4_connection_match *cm;
1831 struct sfe_ipv4_connection *c;
1832
1833 /*
1834 * Is our packet too short to contain a valid ICMP header?
1835 */
1836 len -= ihl;
1837 if (unlikely(len < sizeof(struct icmphdr))) {
1838 spin_lock(&si->lock);
1839 si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_HEADER_INCOMPLETE]++;
1840 si->packets_not_forwarded++;
1841 spin_unlock(&si->lock);
1842
1843 DEBUG_TRACE("packet too short for ICMP header\n");
1844 return 0;
1845 }
1846
1847 /*
1848 * We only handle "destination unreachable" and "time exceeded" messages.
1849 */
1850 icmph = (struct icmphdr *)(skb->data + ihl);
1851 if ((icmph->type != ICMP_DEST_UNREACH)
1852 && (icmph->type != ICMP_TIME_EXCEEDED)) {
1853 spin_lock(&si->lock);
1854 si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_UNHANDLED_TYPE]++;
1855 si->packets_not_forwarded++;
1856 spin_unlock(&si->lock);
1857
1858 DEBUG_TRACE("unhandled ICMP type: 0x%x\n", icmph->type);
1859 return 0;
1860 }
1861
1862 /*
1863 * Do we have the full embedded IP header?
1864 */
1865 len -= sizeof(struct icmphdr);
1866 if (unlikely(len < sizeof(struct sfe_ipv4_iphdr))) {
1867 spin_lock(&si->lock);
1868 si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_HEADER_INCOMPLETE]++;
1869 si->packets_not_forwarded++;
1870 spin_unlock(&si->lock);
1871
1872 DEBUG_TRACE("Embedded IP header not complete\n");
1873 return 0;
1874 }
1875
1876 /*
1877 * Is our embedded IP version wrong?
1878 */
1879 icmp_iph = (struct sfe_ipv4_iphdr *)(icmph + 1);
1880 if (unlikely(icmp_iph->version != 4)) {
1881 spin_lock(&si->lock);
1882 si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_NON_V4]++;
1883 si->packets_not_forwarded++;
1884 spin_unlock(&si->lock);
1885
1886 DEBUG_TRACE("IP version: %u\n", icmp_iph->version);
1887 return 0;
1888 }
1889
1890 /*
1891 * Do we have the full embedded IP header, including any options?
1892 */
1893 icmp_ihl_words = icmp_iph->ihl;
1894 icmp_ihl = icmp_ihl_words << 2;
1895 if (unlikely(len < icmp_ihl)) {
1896 spin_lock(&si->lock);
1897 si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_IP_OPTIONS_INCOMPLETE]++;
1898 si->packets_not_forwarded++;
1899 spin_unlock(&si->lock);
1900
1901 DEBUG_TRACE("Embedded header not large enough for IP options\n");
1902 return 0;
1903 }
1904
1905 len -= icmp_ihl;
1906 icmp_trans_h = ((uint32_t *)icmp_iph) + icmp_ihl_words;
1907
1908 /*
1909 * Handle the embedded transport layer header.
1910 */
1911 switch (icmp_iph->protocol) {
1912 case IPPROTO_UDP:
1913 /*
1914 * We should have 8 bytes of UDP header - that's enough to identify
1915 * the connection.
1916 */
1917 if (unlikely(len < 8)) {
1918 spin_lock(&si->lock);
1919 si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_UDP_HEADER_INCOMPLETE]++;
1920 si->packets_not_forwarded++;
1921 spin_unlock(&si->lock);
1922
1923 DEBUG_TRACE("Incomplete embedded UDP header\n");
1924 return 0;
1925 }
1926
1927 icmp_udph = (struct sfe_ipv4_udphdr *)icmp_trans_h;
Dave Hudson87973cd2013-10-22 16:00:04 +01001928 src_port = icmp_udph->source;
1929 dest_port = icmp_udph->dest;
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01001930 break;
1931
1932 case IPPROTO_TCP:
1933 /*
1934 * We should have 8 bytes of TCP header - that's enough to identify
1935 * the connection.
1936 */
1937 if (unlikely(len < 8)) {
1938 spin_lock(&si->lock);
1939 si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_TCP_HEADER_INCOMPLETE]++;
1940 si->packets_not_forwarded++;
1941 spin_unlock(&si->lock);
1942
1943 DEBUG_TRACE("Incomplete embedded TCP header\n");
1944 return 0;
1945 }
1946
1947 icmp_tcph = (struct sfe_ipv4_tcphdr *)icmp_trans_h;
Dave Hudson87973cd2013-10-22 16:00:04 +01001948 src_port = icmp_tcph->source;
1949 dest_port = icmp_tcph->dest;
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01001950 break;
1951
1952 default:
1953 spin_lock(&si->lock);
1954 si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_UNHANDLED_PROTOCOL]++;
1955 si->packets_not_forwarded++;
1956 spin_unlock(&si->lock);
1957
1958 DEBUG_TRACE("Unhandled embedded IP protocol: %u\n", icmp_iph->protocol);
1959 return 0;
1960 }
1961
Dave Hudson87973cd2013-10-22 16:00:04 +01001962 src_ip = icmp_iph->saddr;
1963 dest_ip = icmp_iph->daddr;
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01001964
1965 spin_lock(&si->lock);
1966
1967 /*
1968 * Look for a connection match. Note that we reverse the source and destination
1969 * here because our embedded message contains a packet that was sent in the
1970 * opposite direction to the one in which we just received it. It will have
1971 * been sent on the interface from which we received it though so that's still
1972 * ok to use.
1973 */
1974 cm = sfe_ipv4_find_sfe_ipv4_connection_match(si, dev, icmp_iph->protocol, dest_ip, dest_port, src_ip, src_port);
1975 if (unlikely(!cm)) {
1976 si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_NO_CONNECTION]++;
1977 si->packets_not_forwarded++;
1978 spin_unlock(&si->lock);
1979
1980 DEBUG_TRACE("no connection found\n");
1981 return 0;
1982 }
1983
1984 /*
1985 * We found a connection so now remove it from the connection list and flush
1986 * its state.
1987 */
1988 c = cm->connection;
1989 sfe_ipv4_remove_sfe_ipv4_connection(si, c);
1990 si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_FLUSHED_CONNECTION]++;
1991 si->packets_not_forwarded++;
1992 spin_unlock(&si->lock);
1993
1994 sfe_ipv4_flush_sfe_ipv4_connection(si, c);
1995 return 0;
1996}
1997
1998/*
1999 * sfe_ipv4_recv()
2000 * Handle packet receives and forwarding.
2001 *
2002 * Returns 1 if the packet is forwarded or 0 if it isn't.
2003 */
Dave Hudsondcd08fb2013-11-22 09:25:16 -06002004int sfe_ipv4_recv(struct net_device *dev, struct sk_buff *skb)
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01002005{
2006 struct sfe_ipv4 *si = &__si;
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01002007 unsigned int len;
2008 unsigned int tot_len;
2009 unsigned int frag_off;
2010 unsigned int ihl;
2011 bool flush_on_find;
2012 bool ip_options;
2013 struct sfe_ipv4_iphdr *iph;
2014 uint32_t protocol;
2015
2016 /*
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01002017 * Check that we have space for an IP header here.
2018 */
2019 len = skb->len;
2020 if (unlikely(len < sizeof(struct sfe_ipv4_iphdr))) {
2021 spin_lock(&si->lock);
2022 si->exception_events[SFE_IPV4_EXCEPTION_EVENT_HEADER_INCOMPLETE]++;
2023 si->packets_not_forwarded++;
2024 spin_unlock(&si->lock);
2025
2026 DEBUG_TRACE("len: %u is too short\n", len);
2027 return 0;
2028 }
2029
2030 /*
2031 * Check that our "total length" is large enough for an IP header.
2032 */
2033 iph = (struct sfe_ipv4_iphdr *)skb->data;
2034 tot_len = ntohs(iph->tot_len);
2035 if (unlikely(tot_len < sizeof(struct sfe_ipv4_iphdr))) {
2036 spin_lock(&si->lock);
2037 si->exception_events[SFE_IPV4_EXCEPTION_EVENT_BAD_TOTAL_LENGTH]++;
2038 si->packets_not_forwarded++;
2039 spin_unlock(&si->lock);
2040
2041 DEBUG_TRACE("tot_len: %u is too short\n", tot_len);
2042 return 0;
2043 }
2044
2045 /*
2046 * Is our IP version wrong?
2047 */
2048 if (unlikely(iph->version != 4)) {
2049 spin_lock(&si->lock);
2050 si->exception_events[SFE_IPV4_EXCEPTION_EVENT_NON_V4]++;
2051 si->packets_not_forwarded++;
2052 spin_unlock(&si->lock);
2053
2054 DEBUG_TRACE("IP version: %u\n", iph->version);
2055 return 0;
2056 }
2057
2058 /*
2059 * Does our datagram fit inside the skb?
2060 */
2061 if (unlikely(tot_len > len)) {
2062 spin_lock(&si->lock);
2063 si->exception_events[SFE_IPV4_EXCEPTION_EVENT_DATAGRAM_INCOMPLETE]++;
2064 si->packets_not_forwarded++;
2065 spin_unlock(&si->lock);
2066
2067 DEBUG_TRACE("tot_len: %u, exceeds len: %u\n", tot_len, len);
2068 return 0;
2069 }
2070
2071 /*
2072 * Do we have a non-initial fragment?
2073 */
2074 frag_off = ntohs(iph->frag_off);
2075 if (unlikely(frag_off & IP_OFFSET)) {
2076 spin_lock(&si->lock);
2077 si->exception_events[SFE_IPV4_EXCEPTION_EVENT_NON_INITIAL_FRAGMENT]++;
2078 si->packets_not_forwarded++;
2079 spin_unlock(&si->lock);
2080
2081 DEBUG_TRACE("non-initial fragment\n");
2082 return 0;
2083 }
2084
2085 /*
2086 * If we have a (first) fragment then mark it to cause any connection to flush.
2087 */
2088 flush_on_find = unlikely(frag_off & IP_MF) ? true : false;
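	/*
	 * Only a first fragment carries the L4 header we can match on; later
	 * fragments cannot be classified here, so the safest course is to
	 * flush any matching flow and let the regular stack reassemble.
	 */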
2089
2090 /*
2091 * Do we have any IP options? That's definitely a slow path! If we do have IP
2092 * options we need to recheck our header size.
2093 */
2094 ihl = iph->ihl << 2;
2095 ip_options = unlikely(ihl != sizeof(struct sfe_ipv4_iphdr)) ? true : false;
2096 if (unlikely(ip_options)) {
2097 if (unlikely(len < ihl)) {
2098 spin_lock(&si->lock);
2099 si->exception_events[SFE_IPV4_EXCEPTION_EVENT_IP_OPTIONS_INCOMPLETE]++;
2100 si->packets_not_forwarded++;
2101 spin_unlock(&si->lock);
2102
2103 DEBUG_TRACE("len: %u is too short for header of size: %u\n", len, ihl);
2104 return 0;
2105 }
2106
2107 flush_on_find = true;
2108 }
2109
2110 protocol = iph->protocol;
2111 if (IPPROTO_UDP == protocol) {
2112 return sfe_ipv4_recv_udp(si, skb, dev, len, iph, ihl, flush_on_find);
2113 }
2114
2115 if (IPPROTO_TCP == protocol) {
2116 return sfe_ipv4_recv_tcp(si, skb, dev, len, iph, ihl, flush_on_find);
2117 }
2118
2119 if (IPPROTO_ICMP == protocol) {
2120 return sfe_ipv4_recv_icmp(si, skb, dev, len, iph, ihl);
2121 }
2122
2123 spin_lock(&si->lock);
2124 si->exception_events[SFE_IPV4_EXCEPTION_EVENT_UNHANDLED_PROTOCOL]++;
2125 si->packets_not_forwarded++;
2126 spin_unlock(&si->lock);
2127
2128 DEBUG_TRACE("not UDP, TCP or ICMP: %u\n", protocol);
2129 return 0;
2130}
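/*
 * Illustrative sketch only (not part of this module, assuming a kernel
 * that provides netdev rx handlers): a receive hook could hand IPv4
 * packets to sfe_ipv4_recv() and fall back to the regular stack whenever
 * the engine declines them. The name fast_recv_hook is hypothetical.
 */
#if 0
static rx_handler_result_t fast_recv_hook(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;

	if ((skb->protocol == htons(ETH_P_IP)) && sfe_ipv4_recv(skb->dev, skb)) {
		/*
		 * Consumed and transmitted by the fast path.
		 */
		return RX_HANDLER_CONSUMED;
	}

	/*
	 * Not handled - let the normal Linux stack process it.
	 */
	return RX_HANDLER_PASS;
}
#endif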
2131
2132/*
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01002133 * sfe_ipv4_create_rule()
2134 * Create a forwarding rule.
2135 */
Dave Hudsondcd08fb2013-11-22 09:25:16 -06002136void sfe_ipv4_create_rule(struct sfe_ipv4_create *sic)
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01002137{
Dave Hudsondcd08fb2013-11-22 09:25:16 -06002138 struct sfe_ipv4 *si = &__si;
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01002139 struct sfe_ipv4_connection *c;
2140 struct sfe_ipv4_connection_match *original_cm;
2141 struct sfe_ipv4_connection_match *reply_cm;
2142
2143 spin_lock_bh(&si->lock);
2144 si->connection_create_requests++;
2145
2146 /*
2147 * Check to see if there is already a flow that matches the rule we're trying
2148 * to create. If there is then we can't create a new one.
2149 */
2150 c = sfe_ipv4_find_sfe_ipv4_connection(si, sic->protocol, sic->src_ip, sic->src_port,
2151 sic->dest_ip, sic->dest_port);
2152 if (c) {
2153 si->connection_create_collisions++;
2154
2155 /*
2156 * If we already have the flow then it's likely that this request to
2157 * create the connection rule contains more up-to-date information.
2158 * Check and update accordingly.
2159 */
2160 original_cm = c->original_match;
2161 reply_cm = c->reply_match;
2162
2163 switch (sic->protocol) {
2164 case IPPROTO_TCP:
2165 if (original_cm->protocol_state.tcp.max_win < sic->src_td_max_window) {
2166 original_cm->protocol_state.tcp.max_win = sic->src_td_max_window;
2167 }
2168 if ((int32_t)(original_cm->protocol_state.tcp.end - sic->src_td_end) < 0) {
2169 original_cm->protocol_state.tcp.end = sic->src_td_end;
2170 }
2171 if ((int32_t)(original_cm->protocol_state.tcp.max_end - sic->src_td_max_end) < 0) {
2172 original_cm->protocol_state.tcp.max_end = sic->src_td_max_end;
2173 }
2174 if (reply_cm->protocol_state.tcp.max_win < sic->dest_td_max_window) {
2175 reply_cm->protocol_state.tcp.max_win = sic->dest_td_max_window;
2176 }
2177 if ((int32_t)(reply_cm->protocol_state.tcp.end - sic->dest_td_end) < 0) {
2178 reply_cm->protocol_state.tcp.end = sic->dest_td_end;
2179 }
2180 if ((int32_t)(reply_cm->protocol_state.tcp.max_end - sic->dest_td_max_end) < 0) {
2181 reply_cm->protocol_state.tcp.max_end = sic->dest_td_max_end;
2182 }
2183 original_cm->flags &= ~SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK;
2184 reply_cm->flags &= ~SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK;
2185 if (sic->flags & SFE_IPV4_CREATE_FLAG_NO_SEQ_CHECK) {
2186 original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK;
2187 reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK;
2188 }
2189 break;
2190 }
2191
2192 spin_unlock_bh(&si->lock);
2193
2194 DEBUG_TRACE("connection already exists - p: %d\n"
2195 " s: %s:%pM:%pI4:%u, d: %s:%pM:%pI4:%u\n",
Dave Hudson87973cd2013-10-22 16:00:04 +01002196 sic->protocol, sic->src_dev->name, sic->src_mac, &sic->src_ip, ntohs(sic->src_port),
2197 sic->dest_dev->name, sic->dest_mac, &sic->dest_ip, ntohs(sic->dest_port));
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01002198 return;
2199 }
2200
2201 /*
2202 * Allocate the various connection tracking objects.
2203 */
2204 c = (struct sfe_ipv4_connection *)kmalloc(sizeof(struct sfe_ipv4_connection), GFP_ATOMIC);
2205 if (unlikely(!c)) {
2206 spin_unlock_bh(&si->lock);
2207 return;
2208 }
2209
2210 original_cm = (struct sfe_ipv4_connection_match *)kmalloc(sizeof(struct sfe_ipv4_connection_match), GFP_ATOMIC);
2211 if (unlikely(!original_cm)) {
2212 spin_unlock_bh(&si->lock);
2213 kfree(c);
2214 return;
2215 }
2216
2217 reply_cm = (struct sfe_ipv4_connection_match *)kmalloc(sizeof(struct sfe_ipv4_connection_match), GFP_ATOMIC);
2218 if (unlikely(!reply_cm)) {
2219 spin_unlock_bh(&si->lock);
2220 kfree(original_cm);
2221 kfree(c);
2222 return;
2223 }
2224
2225 /*
2226 * Fill in the "original" direction connection matching object.
2227 * Note that the transmit MAC address is "dest_mac_xlate" because
2228 * we always know both ends of a connection by their translated
2229 * addresses and not their public addresses.
2230 */
2231 original_cm->match_dev = sic->src_dev;
2232 original_cm->match_protocol = sic->protocol;
2233 original_cm->match_src_ip = sic->src_ip;
2234 original_cm->match_src_port = sic->src_port;
2235 original_cm->match_dest_ip = sic->dest_ip;
2236 original_cm->match_dest_port = sic->dest_port;
2237 original_cm->xlate_src_ip = sic->src_ip_xlate;
2238 original_cm->xlate_src_port = sic->src_port_xlate;
2239 original_cm->xlate_dest_ip = sic->dest_ip_xlate;
2240 original_cm->xlate_dest_port = sic->dest_port_xlate;
2241 original_cm->rx_packet_count = 0;
2242 original_cm->rx_packet_count64 = 0;
2243 original_cm->rx_byte_count = 0;
2244 original_cm->rx_byte_count64 = 0;
2245 original_cm->xmit_dev = sic->dest_dev;
2246 original_cm->xmit_dev_mtu = sic->dest_mtu;
2247 memcpy(original_cm->xmit_src_mac, sic->dest_dev->dev_addr, ETH_ALEN);
2248 memcpy(original_cm->xmit_dest_mac, sic->dest_mac_xlate, ETH_ALEN);
2249 original_cm->connection = c;
2250 original_cm->counter_match = reply_cm;
2251 original_cm->flags = 0;
2252 original_cm->active_next = NULL;
2253 original_cm->active_prev = NULL;
2254 original_cm->active = false;
2255 if (sic->dest_dev->header_ops->create == eth_header) {
2256 original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_FAST_ETH_HDR;
2257 }
2258
2259 /*
2260 * Fill in the "reply" direction connection matching object.
2261 */
2262 reply_cm->match_dev = sic->dest_dev;
2263 reply_cm->match_protocol = sic->protocol;
2264 reply_cm->match_src_ip = sic->dest_ip_xlate;
2265 reply_cm->match_src_port = sic->dest_port_xlate;
2266 reply_cm->match_dest_ip = sic->src_ip_xlate;
2267 reply_cm->match_dest_port = sic->src_port_xlate;
2268 reply_cm->xlate_src_ip = sic->dest_ip;
2269 reply_cm->xlate_src_port = sic->dest_port;
2270 reply_cm->xlate_dest_ip = sic->src_ip;
2271 reply_cm->xlate_dest_port = sic->src_port;
2272 reply_cm->rx_packet_count = 0;
2273 reply_cm->rx_packet_count64 = 0;
2274 reply_cm->rx_byte_count = 0;
2275 reply_cm->rx_byte_count64 = 0;
2276 reply_cm->xmit_dev = sic->src_dev;
2277 reply_cm->xmit_dev_mtu = sic->src_mtu;
2278 memcpy(reply_cm->xmit_src_mac, sic->src_dev->dev_addr, ETH_ALEN);
2279 memcpy(reply_cm->xmit_dest_mac, sic->src_mac, ETH_ALEN);
2280 reply_cm->connection = c;
2281 reply_cm->counter_match = original_cm;
2282 reply_cm->flags = 0;
2283 reply_cm->active_next = NULL;
2284 reply_cm->active_prev = NULL;
2285 reply_cm->active = false;
2286 if (sic->src_dev->header_ops->create == eth_header) {
2287 reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_FAST_ETH_HDR;
2288 }
2289
2290 if (sic->dest_ip != sic->dest_ip_xlate || sic->dest_port != sic->dest_port_xlate) {
2291 original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_DEST;
2292 reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_SRC;
2293 }
2294
2295 if (sic->src_ip != sic->src_ip_xlate || sic->src_port != sic->src_port_xlate) {
2296 original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_SRC;
2297 reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_DEST;
2298 }
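	/*
	 * Note the deliberate cross-over: a destination translation in the
	 * original direction must be undone as a source translation in the
	 * reply direction, and vice versa.
	 */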
2299
2300 c->protocol = sic->protocol;
2301 c->src_ip = sic->src_ip;
2302 c->src_ip_xlate = sic->src_ip_xlate;
2303 c->src_port = sic->src_port;
2304 c->src_port_xlate = sic->src_port_xlate;
2305 c->original_dev = sic->src_dev;
2306 c->original_match = original_cm;
2307 c->dest_ip = sic->dest_ip;
2308 c->dest_ip_xlate = sic->dest_ip_xlate;
2309 c->dest_port = sic->dest_port;
2310 c->dest_port_xlate = sic->dest_port_xlate;
2311 c->reply_dev = sic->dest_dev;
2312 c->reply_match = reply_cm;
Matthew McClintockbe7b47d2013-11-27 13:26:23 -06002313 c->mark = sic->mark;
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01002314
2315 c->last_sync_jiffies = get_jiffies_64();
2316 c->iterators = 0;
2317 c->pending_free = false;
2318
2319 /*
2320 * Take hold of our source and dest devices for the duration of the connection.
2321 */
2322 dev_hold(c->original_dev);
2323 dev_hold(c->reply_dev);
2324
2325 /*
2326 * Initialize the protocol-specific information that we track.
2327 */
2328 switch (sic->protocol) {
2329 case IPPROTO_TCP:
2330 original_cm->protocol_state.tcp.win_scale = sic->src_td_window_scale;
2331 original_cm->protocol_state.tcp.max_win = sic->src_td_max_window ? sic->src_td_max_window : 1;
2332 original_cm->protocol_state.tcp.end = sic->src_td_end;
2333 original_cm->protocol_state.tcp.max_end = sic->src_td_max_end;
2334 reply_cm->protocol_state.tcp.win_scale = sic->dest_td_window_scale;
2335 reply_cm->protocol_state.tcp.max_win = sic->dest_td_max_window ? sic->dest_td_max_window : 1;
2336 reply_cm->protocol_state.tcp.end = sic->dest_td_end;
2337 reply_cm->protocol_state.tcp.max_end = sic->dest_td_max_end;
2338 if (sic->flags & SFE_IPV4_CREATE_FLAG_NO_SEQ_CHECK) {
2339 original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK;
2340 reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK;
2341 }
2342 break;
2343 }
2344
2345 sfe_ipv4_connection_match_compute_translations(original_cm);
2346 sfe_ipv4_connection_match_compute_translations(reply_cm);
2347 sfe_ipv4_insert_sfe_ipv4_connection(si, c);
2348
2349 spin_unlock_bh(&si->lock);
2350
2351 /*
2352 * We have everything we need!
2353 */
2354 DEBUG_INFO("new connection - p: %d\n"
2355 " s: %s:%pM(%pM):%pI4(%pI4):%u(%u)\n"
2356 " d: %s:%pM(%pM):%pI4(%pI4):%u(%u)\n",
2357 sic->protocol,
2358 sic->src_dev->name, sic->src_mac, sic->src_mac_xlate,
Dave Hudson87973cd2013-10-22 16:00:04 +01002359 &sic->src_ip, &sic->src_ip_xlate, ntohs(sic->src_port), ntohs(sic->src_port_xlate),
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01002360 sic->dest_dev->name, sic->dest_mac, sic->dest_mac_xlate,
Dave Hudson87973cd2013-10-22 16:00:04 +01002361 &sic->dest_ip, &sic->dest_ip_xlate, ntohs(sic->dest_port), ntohs(sic->dest_port_xlate));
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01002362}
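/*
 * Illustrative sketch only (not part of this module): creating a simple
 * rule for a UDP flow that needs no NAT, so every xlate field mirrors its
 * original. The devices, addresses and ports are placeholders; a real
 * caller (e.g. a conntrack hook) would also fill in the MAC fields from
 * its own neighbour state.
 */
#if 0
static void example_create_udp_rule(struct net_device *lan, struct net_device *wan)
{
	struct sfe_ipv4_create sic;

	memset(&sic, 0, sizeof(sic));
	sic.protocol = IPPROTO_UDP;
	sic.src_dev = lan;
	sic.dest_dev = wan;
	sic.src_ip = sic.src_ip_xlate = htonl(0xc0a80164);	/* 192.168.1.100 */
	sic.src_port = sic.src_port_xlate = htons(5000);
	sic.dest_ip = sic.dest_ip_xlate = htonl(0x08080808);	/* 8.8.8.8 */
	sic.dest_port = sic.dest_port_xlate = htons(53);
	sic.src_mtu = lan->mtu;
	sic.dest_mtu = wan->mtu;
	/* src_mac, src_mac_xlate, dest_mac and dest_mac_xlate must be set too. */

	sfe_ipv4_create_rule(&sic);
}
#endif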
2363
2364/*
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01002365 * sfe_ipv4_destroy_rule()
2366 * Destroy a forwarding rule.
2367 */
Dave Hudsondcd08fb2013-11-22 09:25:16 -06002368void sfe_ipv4_destroy_rule(struct sfe_ipv4_destroy *sid)
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01002369{
Dave Hudsondcd08fb2013-11-22 09:25:16 -06002370 struct sfe_ipv4 *si = &__si;
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01002371 struct sfe_ipv4_connection *c;
2372
2373 spin_lock_bh(&si->lock);
2374 si->connection_destroy_requests++;
2375
2376 /*
2377 * Check to see if we have a flow that matches the rule we're trying
2378 * to destroy. If there isn't then we can't destroy it.
2379 */
2380 c = sfe_ipv4_find_sfe_ipv4_connection(si, sid->protocol, sid->src_ip, sid->src_port,
2381 sid->dest_ip, sid->dest_port);
2382 if (!c) {
2383 si->connection_destroy_misses++;
2384 spin_unlock_bh(&si->lock);
2385
2386 DEBUG_TRACE("connection does not exist - p: %d, s: %pI4:%u, d: %pI4:%u\n",
Dave Hudson87973cd2013-10-22 16:00:04 +01002387 sid->protocol, &sid->src_ip, ntohs(sid->src_port),
2388 &sid->dest_ip, ntohs(sid->dest_port));
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01002389 return;
2390 }
2391
2392 /*
2393 * Remove our connection details from the hash tables.
2394 */
2395 sfe_ipv4_remove_sfe_ipv4_connection(si, c);
2396 spin_unlock_bh(&si->lock);
2397
2398 /*
2399 * Finally synchronize state and free resources. We need to protect against
2400 * pre-emption by our bottom half while we do this though.
2401 */
2402 local_bh_disable();
2403 sfe_ipv4_flush_sfe_ipv4_connection(si, c);
2404 local_bh_enable();
2405
2406 DEBUG_INFO("connection destroyed - p: %d, s: %pI4:%u, d: %pI4:%u\n",
Dave Hudson87973cd2013-10-22 16:00:04 +01002407 sid->protocol, &sid->src_ip, ntohs(sid->src_port),
2408 &sid->dest_ip, ntohs(sid->dest_port));
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01002409}
2410
2411/*
Dave Hudsondcd08fb2013-11-22 09:25:16 -06002412 * sfe_ipv4_register_sync_rule_callback()
2413 * Register a callback for rule synchronization.
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01002414 */
Dave Hudsondcd08fb2013-11-22 09:25:16 -06002415void sfe_ipv4_register_sync_rule_callback(sfe_ipv4_sync_rule_callback_t sync_rule_callback)
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01002416{
2417 struct sfe_ipv4 *si = &__si;
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01002418
2419 spin_lock_bh(&si->lock);
Dave Hudsondcd08fb2013-11-22 09:25:16 -06002420 rcu_assign_pointer(si->sync_rule_callback, sync_rule_callback);
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01002421 spin_unlock_bh(&si->lock);
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01002422}
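/*
 * Illustrative sketch only (not part of this module): registering a stats
 * sync handler. The callback is invoked from the periodic timer in
 * softirq context under RCU, so it must not sleep; passing NULL
 * unregisters it again.
 */
#if 0
static void example_sync_rule(struct sfe_ipv4_sync *sis)
{
	/*
	 * Feed the per-connection deltas into conntrack, accounting, etc.
	 */
}

static void example_attach(void)
{
	sfe_ipv4_register_sync_rule_callback(example_sync_rule);
}
#endif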
2423
2424/*
2425 * sfe_ipv4_get_debug_dev()
2426 */
2427static ssize_t sfe_ipv4_get_debug_dev(struct device *dev,
2428 struct device_attribute *attr,
2429 char *buf)
2430{
2431 struct sfe_ipv4 *si = &__si;
2432 ssize_t count;
2433 int num;
2434
2435 spin_lock_bh(&si->lock);
2436 num = si->debug_dev;
2437 spin_unlock_bh(&si->lock);
2438
2439 count = snprintf(buf, (ssize_t)PAGE_SIZE, "%d\n", num);
2440 return count;
2441}
2442
2443/*
Dave Hudsondcd08fb2013-11-22 09:25:16 -06002444 * sysfs attributes.
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01002445 */
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01002446static const struct device_attribute sfe_ipv4_debug_dev_attr =
2447 __ATTR(debug_dev, S_IRUGO, sfe_ipv4_get_debug_dev, NULL); /* read-only: there is no store handler */
2448
2449/*
Dave Hudsondcd08fb2013-11-22 09:25:16 -06002450 * sfe_ipv4_destroy_all_rules_for_dev()
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01002451 * Destroy all connections that match a particular device.
2452 *
2453 * If we pass dev as NULL then this destroys all connections.
2454 */
Dave Hudsondcd08fb2013-11-22 09:25:16 -06002455void sfe_ipv4_destroy_all_rules_for_dev(struct net_device *dev)
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01002456{
Dave Hudsondcd08fb2013-11-22 09:25:16 -06002457 struct sfe_ipv4 *si = &__si;
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01002458 struct sfe_ipv4_connection *c;
2459 struct sfe_ipv4_connection *c_next;
2460
2461 spin_lock_bh(&si->lock);
2462 c = si->all_connections_head;
2463 if (!c) {
2464 spin_unlock_bh(&si->lock);
2465 return;
2466 }
2467
2468 c->iterators++;
2469
2470 /*
2471 * Iterate over all connections
2472 */
2473 while (c) {
2474 c_next = c->all_connections_next;
2475
2476 /*
2477 * Before we do anything else, take an iterator reference for the
2478 * connection we'll iterate next.
2479 */
2480 if (c_next) {
2481 c_next->iterators++;
2482 }
2483
2484 /*
2485 * Does this connection relate to the device we are destroying? If
2486 * it does then ensure it is marked for being freed as soon as it
2487 * is no longer being iterated.
2488 */
2489 if (!dev
2490 || (dev == c->original_dev)
2491 || (dev == c->reply_dev)) {
2492 c->pending_free = true;
2493 sfe_ipv4_remove_sfe_ipv4_connection(si, c);
2494 }
2495
2496 /*
2497 * Remove the iterator reference that we acquired and see if we
2498 * should free any resources.
2499 */
2500 if (sfe_ipv4_decrement_sfe_ipv4_connection_iterator(si, c)) {
2501 spin_unlock_bh(&si->lock);
2502
2503 /*
2504 * This entry is dead so release our hold of the source and
2505 * dest devices and free the memory for our connection objects.
2506 */
2507 dev_put(c->original_dev);
2508 dev_put(c->reply_dev);
2509 kfree(c->original_match);
2510 kfree(c->reply_match);
2511 kfree(c);
2512
2513 spin_lock_bh(&si->lock);
2514 }
2515
2516 c = c_next;
2517 }
2518
2519 spin_unlock_bh(&si->lock);
2520}
2521
2522/*
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01002523 * sfe_ipv4_periodic_sync()
2524 */
2525static void sfe_ipv4_periodic_sync(unsigned long arg)
2526{
2527 struct sfe_ipv4 *si = (struct sfe_ipv4 *)arg;
2528 uint64_t now_jiffies;
2529 int quota;
Dave Hudsondcd08fb2013-11-22 09:25:16 -06002530 sfe_ipv4_sync_rule_callback_t sync_rule_callback;
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01002531
2532 now_jiffies = get_jiffies_64();
2533
Dave Hudsondcd08fb2013-11-22 09:25:16 -06002534 rcu_read_lock();
2535 sync_rule_callback = rcu_dereference(si->sync_rule_callback);
2536 if (!sync_rule_callback) {
2537 rcu_read_unlock();
2538 goto done;
2539 }
2540
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01002541 spin_lock_bh(&si->lock);
2542 sfe_ipv4_update_summary_stats(si);
2543
2544 /*
2545 * Get an estimate of the number of connections to parse in this sync.
2546 */
2547 quota = (si->num_connections + 63) / 64;
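	/*
	 * The ceiling division above paces the scan: with the timer firing
	 * every HZ / 100 jiffies (roughly every 10ms), syncing about 1/64th
	 * of the connections per tick covers the whole table in roughly
	 * 640ms without holding the lock for one long burst.
	 */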
2548
2549 /*
2550 * Walk the "active" list and sync the connection state.
2551 */
2552 while (quota--) {
2553 struct sfe_ipv4_connection_match *cm;
2554 struct sfe_ipv4_connection_match *counter_cm;
2555 struct sfe_ipv4_connection *c;
2556 struct sfe_ipv4_sync sis;
2557
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01002558 cm = si->active_head;
2559 if (!cm) {
2560 break;
2561 }
2562
2563 cm->active = false;
2564
2565 /*
2566 * Having found an entry we now remove it from the active scan list.
2567 */
2568 si->active_head = cm->active_next;
2569 if (likely(cm->active_next)) {
2570 cm->active_next->active_prev = NULL;
2571 } else {
2572 si->active_tail = NULL;
2573 }
2574 cm->active_next = NULL;
2575
2576 /*
2577 * We scan the connection match lists so there's a possibility that our
2578 * counter match is in the list too. If it is then remove it.
2579 */
2580 counter_cm = cm->counter_match;
2581 if (counter_cm->active) {
2582 counter_cm->active = false;
2583
2584 if (likely(counter_cm->active_prev)) {
2585 counter_cm->active_prev->active_next = counter_cm->active_next;
2586 } else {
2587 si->active_head = counter_cm->active_next;
2588 }
2589
2590 if (likely(counter_cm->active_next)) {
2591 counter_cm->active_next->active_prev = counter_cm->active_prev;
2592 } else {
2593 si->active_tail = counter_cm->active_prev;
2594 }
2595
2596 counter_cm->active_next = NULL;
2597 counter_cm->active_prev = NULL;
2598 }
2599
2600 /*
2601 * Sync the connection state.
2602 */
2603 c = cm->connection;
2604 sfe_ipv4_gen_sync_sfe_ipv4_connection(si, c, &sis, now_jiffies);
2605
2606 /*
2607 * We don't want to be holding the lock when we sync!
2608 */
2609 spin_unlock_bh(&si->lock);
Dave Hudsondcd08fb2013-11-22 09:25:16 -06002610 sync_rule_callback(&sis);
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01002611 spin_lock_bh(&si->lock);
2612 }
2613
2614 spin_unlock_bh(&si->lock);
Dave Hudsondcd08fb2013-11-22 09:25:16 -06002615 rcu_read_unlock();
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01002616
Dave Hudsondcd08fb2013-11-22 09:25:16 -06002617done:
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01002618 mod_timer(&si->timer, jiffies + (HZ / 100));
2619}
2620
2621#define CHAR_DEV_MSG_SIZE 768
2622
2623/*
2624 * sfe_ipv4_debug_dev_read_start()
2625 * Generate part of the XML output.
2626 */
2627static bool sfe_ipv4_debug_dev_read_start(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length,
2628 int *total_read, struct sfe_ipv4_debug_xml_write_state *ws)
2629{
2630 int bytes_read;
2631
2632 bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "<sfe_ipv4>\n");
2633 if (copy_to_user(buffer + *total_read, msg, bytes_read)) {
2634 return false;
2635 }
2636
2637 *length -= bytes_read;
2638 *total_read += bytes_read;
2639
2640 ws->state++;
2641 return true;
2642}
2643
2644/*
2645 * sfe_ipv4_debug_dev_read_connections_start()
2646 * Generate part of the XML output.
2647 */
2648static bool sfe_ipv4_debug_dev_read_connections_start(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length,
2649 int *total_read, struct sfe_ipv4_debug_xml_write_state *ws)
2650{
2651 int bytes_read;
2652
2653 bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t<connections>\n");
2654 if (copy_to_user(buffer + *total_read, msg, bytes_read)) {
2655 return false;
2656 }
2657
2658 *length -= bytes_read;
2659 *total_read += bytes_read;
2660
2661 ws->state++;
2662 return true;
2663}
2664
2665/*
2666 * sfe_ipv4_debug_dev_read_connections_connection()
2667 * Generate part of the XML output.
2668 */
2669static bool sfe_ipv4_debug_dev_read_connections_connection(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length,
2670 int *total_read, struct sfe_ipv4_debug_xml_write_state *ws)
2671{
2672 struct sfe_ipv4_connection *c;
2673 struct sfe_ipv4_connection *c_next;
2674 struct sfe_ipv4_connection_match *original_cm;
2675 struct sfe_ipv4_connection_match *reply_cm;
2676 int bytes_read;
2677 int protocol;
2678 struct net_device *src_dev;
Dave Hudson87973cd2013-10-22 16:00:04 +01002679 __be32 src_ip;
2680 __be32 src_ip_xlate;
2681 __be16 src_port;
2682 __be16 src_port_xlate;
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01002683 uint64_t src_rx_packets;
2684 uint64_t src_rx_bytes;
2685 struct net_device *dest_dev;
Dave Hudson87973cd2013-10-22 16:00:04 +01002686 __be32 dest_ip;
2687 __be32 dest_ip_xlate;
2688 __be16 dest_port;
2689 __be16 dest_port_xlate;
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01002690 uint64_t dest_rx_packets;
2691 uint64_t dest_rx_bytes;
2692 uint64_t last_sync_jiffies;
2693
2694 spin_lock_bh(&si->lock);
2695 c = ws->iter_conn;
2696
2697 /*
2698 * Is this the first connection we need to scan?
2699 */
2700 if (!c) {
2701 c = si->all_connections_head;
2702
2703 /*
2704 * If there were no connections then move to the next state.
2705 */
2706 if (!c) {
2707 spin_unlock_bh(&si->lock);
2708
2709 ws->state++;
2710 return true;
2711 }
2712
2713 c->iterators++;
2714 }
2715
2716 c_next = c->all_connections_next;
2717 ws->iter_conn = c_next;
2718
2719 /*
2720 * Before we do anything else, take an iterator reference for the
2721 * connection we'll iterate next.
2722 */
2723 if (c_next) {
2724 c_next->iterators++;
2725 }
2726
2727 /*
2728 * Remove the iterator reference that we acquired and see if we
2729 * should free any resources.
2730 */
2731 if (sfe_ipv4_decrement_sfe_ipv4_connection_iterator(si, c)) {
2732 spin_unlock_bh(&si->lock);
2733
2734 /*
2735 * This entry is dead so release our hold of the source and
2736 * dest devices and free the memory for our connection objects.
2737 */
2738 dev_put(c->original_dev);
2739 dev_put(c->reply_dev);
2740 kfree(c->original_match);
2741 kfree(c->reply_match);
2742 kfree(c);
2743
2744 /*
2745 * If we have no more connections then move to the next state.
2746 */
2747 if (!c_next) {
2748 ws->state++;
2749 }
2750
2751 return true;
2752 }
2753
2754 original_cm = c->original_match;
2755 reply_cm = c->reply_match;
2756
2757 protocol = c->protocol;
2758 src_dev = c->original_dev;
2759 src_ip = c->src_ip;
2760 src_ip_xlate = c->src_ip_xlate;
2761 src_port = c->src_port;
2762 src_port_xlate = c->src_port_xlate;
2763
2764 sfe_ipv4_connection_match_update_summary_stats(original_cm);
2765 sfe_ipv4_connection_match_update_summary_stats(reply_cm);
2766
2767 src_rx_packets = original_cm->rx_packet_count64;
2768 src_rx_bytes = original_cm->rx_byte_count64;
2769 dest_dev = c->reply_dev;
2770 dest_ip = c->dest_ip;
2771 dest_ip_xlate = c->dest_ip_xlate;
2772 dest_port = c->dest_port;
2773 dest_port_xlate = c->dest_port_xlate;
2774 dest_rx_packets = reply_cm->rx_packet_count64;
2775 dest_rx_bytes = reply_cm->rx_byte_count64;
2776 last_sync_jiffies = get_jiffies_64() - c->last_sync_jiffies;
2777 spin_unlock_bh(&si->lock);
2778
2779 bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t\t<connection "
2780 "protocol=\"%u\" "
2781 "src_dev=\"%s\" "
2782 "src_ip=\"%pI4\" src_ip_xlate=\"%pI4\" "
2783 "src_port=\"%u\" src_port_xlate=\"%u\" "
2784 "src_rx_pkts=\"%llu\" src_rx_bytes=\"%llu\" "
2785 "dest_dev=\"%s\" "
2786 "dest_ip=\"%pI4\" dest_ip_xlate=\"%pI4\" "
2787 "dest_port=\"%u\" dest_port_xlate=\"%u\" "
2788 "dest_rx_pkts=\"%llu\" dest_rx_bytes=\"%llu\" "
2789 "last_sync=\"%llu\" />\n",
2790 protocol,
2791 src_dev->name,
2792 &src_ip, &src_ip_xlate,
Dave Hudson87973cd2013-10-22 16:00:04 +01002793 ntohs(src_port), ntohs(src_port_xlate),
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01002794 src_rx_packets, src_rx_bytes,
2795 dest_dev->name,
2796 &dest_ip, &dest_ip_xlate,
Dave Hudson87973cd2013-10-22 16:00:04 +01002797 ntohs(dest_port), ntohs(dest_port_xlate),
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01002798 dest_rx_packets, dest_rx_bytes,
2799 last_sync_jiffies);
2800
2801 if (copy_to_user(buffer + *total_read, msg, bytes_read)) {
2802 return false;
2803 }
2804
2805 *length -= bytes_read;
2806 *total_read += bytes_read;
2807
2808 /*
2809 * If we have no more connections then move to the next state.
2810 */
2811 if (!c_next) {
2812 ws->state++;
2813 }
2814
2815 return true;
2816}
2817
2818/*
2819 * sfe_ipv4_debug_dev_read_connections_end()
2820 * Generate part of the XML output.
2821 */
2822static bool sfe_ipv4_debug_dev_read_connections_end(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length,
2823 int *total_read, struct sfe_ipv4_debug_xml_write_state *ws)
2824{
2825 int bytes_read;
2826
2827 bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t</connections>\n");
2828 if (copy_to_user(buffer + *total_read, msg, bytes_read)) {
2829 return false;
2830 }
2831
2832 *length -= bytes_read;
2833 *total_read += bytes_read;
2834
2835 ws->state++;
2836 return true;
2837}
2838
2839/*
2840 * sfe_ipv4_debug_dev_read_exceptions_start()
2841 * Generate part of the XML output.
2842 */
2843static bool sfe_ipv4_debug_dev_read_exceptions_start(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length,
2844 int *total_read, struct sfe_ipv4_debug_xml_write_state *ws)
2845{
2846 int bytes_read;
2847
2848 bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t<exceptions>\n");
2849 if (copy_to_user(buffer + *total_read, msg, bytes_read)) {
2850 return false;
2851 }
2852
2853 *length -= bytes_read;
2854 *total_read += bytes_read;
2855
2856 ws->state++;
2857 return true;
2858}
2859
2860/*
2861 * sfe_ipv4_debug_dev_read_exceptions_exception()
2862 * Generate part of the XML output.
2863 */
2864static bool sfe_ipv4_debug_dev_read_exceptions_exception(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length,
2865 int *total_read, struct sfe_ipv4_debug_xml_write_state *ws)
2866{
2867 uint64_t ct;
2868
2869 spin_lock_bh(&si->lock);
2870 ct = si->exception_events64[ws->iter_exception];
2871 spin_unlock_bh(&si->lock);
2872
2873 if (ct) {
2874 int bytes_read;
2875
2876 bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE,
2877 "\t\t<exception name=\"%s\" count=\"%llu\" />\n",
2878 sfe_ipv4_exception_events_string[ws->iter_exception],
2879 ct);
2880 if (copy_to_user(buffer + *total_read, msg, bytes_read)) {
2881 return false;
2882 }
2883
2884 *length -= bytes_read;
2885 *total_read += bytes_read;
2886 }
2887
2888 ws->iter_exception++;
2889 if (ws->iter_exception >= SFE_IPV4_EXCEPTION_EVENT_LAST) {
2890 ws->iter_exception = 0;
2891 ws->state++;
2892 }
2893
2894 return true;
2895}
2896
2897/*
2898 * sfe_ipv4_debug_dev_read_exceptions_end()
2899 * Generate part of the XML output.
2900 */
2901static bool sfe_ipv4_debug_dev_read_exceptions_end(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length,
2902 int *total_read, struct sfe_ipv4_debug_xml_write_state *ws)
2903{
2904 int bytes_read;
2905
2906 bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t</exceptions>\n");
2907 if (copy_to_user(buffer + *total_read, msg, bytes_read)) {
2908 return false;
2909 }
2910
2911 *length -= bytes_read;
2912 *total_read += bytes_read;
2913
2914 ws->state++;
2915 return true;
2916}
2917
2918/*
2919 * sfe_ipv4_debug_dev_read_stats()
2920 * Generate part of the XML output.
2921 */
2922static bool sfe_ipv4_debug_dev_read_stats(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length,
2923 int *total_read, struct sfe_ipv4_debug_xml_write_state *ws)
2924{
2925 int bytes_read;
2926 unsigned int num_connections;
2927 uint64_t packets_forwarded;
2928 uint64_t packets_not_forwarded;
2929 uint64_t connection_create_requests;
2930 uint64_t connection_create_collisions;
2931 uint64_t connection_destroy_requests;
2932 uint64_t connection_destroy_misses;
2933 uint64_t connection_flushes;
2934 uint64_t connection_match_hash_hits;
2935 uint64_t connection_match_hash_reorders;
2936
2937 spin_lock_bh(&si->lock);
2938 sfe_ipv4_update_summary_stats(si);
2939
2940 num_connections = si->num_connections;
2941 packets_forwarded = si->packets_forwarded64;
2942 packets_not_forwarded = si->packets_not_forwarded64;
2943 connection_create_requests = si->connection_create_requests64;
2944 connection_create_collisions = si->connection_create_collisions64;
2945 connection_destroy_requests = si->connection_destroy_requests64;
2946 connection_destroy_misses = si->connection_destroy_misses64;
2947 connection_flushes = si->connection_flushes64;
2948 connection_match_hash_hits = si->connection_match_hash_hits64;
2949 connection_match_hash_reorders = si->connection_match_hash_reorders64;
2950 spin_unlock_bh(&si->lock);
2951
2952 bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t<stats "
2953 "num_connections=\"%u\" "
2954 "pkts_forwarded=\"%llu\" pkts_not_forwarded=\"%llu\" "
2955 "create_requests=\"%llu\" create_collisions=\"%llu\" "
2956 "destroy_requests=\"%llu\" destroy_misses=\"%llu\" "
2957 "flushes=\"%llu\" "
2958 "hash_hits=\"%llu\" hash_reorders=\"%llu\" />\n",
2959 num_connections,
2960 packets_forwarded,
2961 packets_not_forwarded,
2962 connection_create_requests,
2963 connection_create_collisions,
2964 connection_destroy_requests,
2965 connection_destroy_misses,
2966 connection_flushes,
2967 connection_match_hash_hits,
2968 connection_match_hash_reorders);
2969 if (copy_to_user(buffer + *total_read, msg, bytes_read)) {
2970 return false;
2971 }
2972
2973 *length -= bytes_read;
2974 *total_read += bytes_read;
2975
2976 ws->state++;
2977 return true;
2978}
2979
2980/*
2981 * sfe_ipv4_debug_dev_read_end()
2982 * Generate part of the XML output.
2983 */
2984static bool sfe_ipv4_debug_dev_read_end(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length,
2985 int *total_read, struct sfe_ipv4_debug_xml_write_state *ws)
2986{
2987 int bytes_read;
2988
2989 bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "</sfe_ipv4>\n");
2990 if (copy_to_user(buffer + *total_read, msg, bytes_read)) {
2991 return false;
2992 }
2993
2994 *length -= bytes_read;
2995 *total_read += bytes_read;
2996
2997 ws->state++;
2998 return true;
2999}
3000
3001/*
3002 * Array of write functions that write various XML elements that correspond to
3003 * our XML output state machine.
3004 */
3005sfe_ipv4_debug_xml_write_method_t sfe_ipv4_debug_xml_write_methods[SFE_IPV4_DEBUG_XML_STATE_DONE] = {
3006 sfe_ipv4_debug_dev_read_start,
3007 sfe_ipv4_debug_dev_read_connections_start,
3008 sfe_ipv4_debug_dev_read_connections_connection,
3009 sfe_ipv4_debug_dev_read_connections_end,
3010 sfe_ipv4_debug_dev_read_exceptions_start,
3011 sfe_ipv4_debug_dev_read_exceptions_exception,
3012 sfe_ipv4_debug_dev_read_exceptions_end,
3013 sfe_ipv4_debug_dev_read_stats,
3014 sfe_ipv4_debug_dev_read_end,
3015};
3016
3017/*
3018 * sfe_ipv4_debug_dev_read()
3019 * Send info to userspace upon read request from user
3020 */
3021static ssize_t sfe_ipv4_debug_dev_read(struct file *filp, char *buffer, size_t length, loff_t *offset)
3022{
3023 char msg[CHAR_DEV_MSG_SIZE];
3024 int total_read = 0;
3025 struct sfe_ipv4_debug_xml_write_state *ws;
3026 struct sfe_ipv4 *si = &__si;
3027
3028 ws = (struct sfe_ipv4_debug_xml_write_state *)filp->private_data;
	while ((ws->state != SFE_IPV4_DEBUG_XML_STATE_DONE) && (length > CHAR_DEV_MSG_SIZE)) {
		if (!(sfe_ipv4_debug_xml_write_methods[ws->state])(si, buffer, msg, &length, &total_read, ws)) {
			/*
			 * A failed copy_to_user() leaves the state unchanged,
			 * so stop rather than spinning here forever.
			 */
			break;
		}
	}
3034
3035 return total_read;
3036}
3037
3038/*
3039 * sfe_ipv4_debug_dev_write()
3040 * Write to char device not required/supported
3041 */
3042static ssize_t sfe_ipv4_debug_dev_write(struct file *filp, const char *buffer, size_t length, loff_t *offset)
3043{
3044 return -EINVAL;
3045}
3046
3047/*
3048 * sfe_ipv4_debug_dev_open()
3049 */
3050static int sfe_ipv4_debug_dev_open(struct inode *inode, struct file *file)
3051{
3052 struct sfe_ipv4_debug_xml_write_state *ws;
3053
3054 ws = (struct sfe_ipv4_debug_xml_write_state *)file->private_data;
3055 if (!ws) {
3056 ws = kzalloc(sizeof(struct sfe_ipv4_debug_xml_write_state), GFP_KERNEL);
3057 if (!ws) {
3058 return -ENOMEM;
3059 }
3060
3061 ws->state = SFE_IPV4_DEBUG_XML_STATE_START;
3062 file->private_data = ws;
3063 }
3064
3065 return 0;
3066}
3067
3068/*
3069 * sfe_ipv4_debug_dev_release()
3070 */
3071static int sfe_ipv4_debug_dev_release(struct inode *inode, struct file *file)
3072{
3073 struct sfe_ipv4_debug_xml_write_state *ws;
3074
3075 ws = (struct sfe_ipv4_debug_xml_write_state *)file->private_data;
3076 if (ws) {
3077 struct sfe_ipv4_connection *c;
3078
3079 /*
3080 * Are we currently iterating a connection? If we are then
3081 * make sure that we reduce its iterator count and if necessary
3082 * free it.
3083 */
3084 c = ws->iter_conn;
3085 if (c) {
3086 struct sfe_ipv4 *si = &__si;
3087
3088 spin_lock_bh(&si->lock);
3089 if (sfe_ipv4_decrement_sfe_ipv4_connection_iterator(si, c)) {
3090 spin_unlock_bh(&si->lock);
3091
3092 /*
3093 * This entry is dead so release our hold of the source and
3094 * dest devices and free the memory for our connection objects.
3095 */
3096 dev_put(c->original_dev);
3097 dev_put(c->reply_dev);
3098 kfree(c->original_match);
3099 kfree(c->reply_match);
3100 kfree(c);
3101 }
3102 }
3103
3104 /*
3105 * We've finished with our output so free the write state.
3106 */
3107 kfree(ws);
3108 }
3109
3110 return 0;
3111}
3112
3113/*
3114 * File operations used in the debug char device
3115 */
3116static struct file_operations sfe_ipv4_debug_dev_fops = {
3117 .read = sfe_ipv4_debug_dev_read,
3118 .write = sfe_ipv4_debug_dev_write,
3119 .open = sfe_ipv4_debug_dev_open,
3120 .release = sfe_ipv4_debug_dev_release
3121};
3122
3123/*
Dave Hudson87973cd2013-10-22 16:00:04 +01003124 * sfe_ipv4_init()
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01003125 */
Dave Hudson87973cd2013-10-22 16:00:04 +01003126static int __init sfe_ipv4_init(void)
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01003127{
3128 struct sfe_ipv4 *si = &__si;
3129 int result = -1;
3130
Dave Hudsondcd08fb2013-11-22 09:25:16 -06003131 DEBUG_INFO("SFE IPv4 init\n");
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01003132
3133 /*
3134 * Create sys/sfe_ipv4
3135 */
3136 si->sys_sfe_ipv4 = kobject_create_and_add("sfe_ipv4", NULL);
3137 if (!si->sys_sfe_ipv4) {
3138 DEBUG_ERROR("failed to register sfe_ipv4\n");
3139 goto exit1;
3140 }
3141
3142 /*
3143 * Create files, one for each parameter supported by this module.
3144 */
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01003145 result = sysfs_create_file(si->sys_sfe_ipv4, &sfe_ipv4_debug_dev_attr.attr);
3146 if (result) {
3147 DEBUG_ERROR("failed to register debug dev file: %d\n", result);
3148 goto exit4;
3149 }
3150
3151 /*
3152 * Register our debug char device.
3153 */
3154 result = register_chrdev(0, "sfe_ipv4", &sfe_ipv4_debug_dev_fops);
3155 if (result < 0) {
3156 DEBUG_ERROR("Failed to register chrdev: %d\n", result);
3157 goto exit5;
3158 }
3159
3160 si->debug_dev = result;
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01003161
	spin_lock_init(&si->lock);

	/*
	 * Create a timer to handle periodic statistics. The lock is
	 * initialized first because the timer handler takes it and could
	 * in principle fire before init completes.
	 */
	setup_timer(&si->timer, sfe_ipv4_periodic_sync, (unsigned long)si);
	mod_timer(&si->timer, jiffies + (HZ / 100));

	return 0;
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01003171
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01003172exit5:
3173 sysfs_remove_file(si->sys_sfe_ipv4, &sfe_ipv4_debug_dev_attr.attr);
3174
3175exit4:
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01003176 kobject_put(si->sys_sfe_ipv4);
3177
3178exit1:
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01003179 return result;
3180}
3181
3182/*
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01003183 * sfe_ipv4_exit()
3184 */
3185static void __exit sfe_ipv4_exit(void)
3186{
Dave Hudson87973cd2013-10-22 16:00:04 +01003187 struct sfe_ipv4 *si = &__si;
3188
Dave Hudsondcd08fb2013-11-22 09:25:16 -06003189 DEBUG_INFO("SFE IPv4 exit\n");
Dave Hudson87973cd2013-10-22 16:00:04 +01003190
3191 /*
3192 * Destroy all connections.
3193 */
Dave Hudsondcd08fb2013-11-22 09:25:16 -06003194 sfe_ipv4_destroy_all_rules_for_dev(NULL);
Dave Hudson87973cd2013-10-22 16:00:04 +01003195
3196// XXX - this is where we need to unregister with any lower level offload services.
3197
Dave Hudson87973cd2013-10-22 16:00:04 +01003198 del_timer_sync(&si->timer);
3199
Dave Hudson87973cd2013-10-22 16:00:04 +01003200 unregister_chrdev(si->debug_dev, "sfe_ipv4");
3201
3202 sysfs_remove_file(si->sys_sfe_ipv4, &sfe_ipv4_debug_dev_attr.attr);
3203
Dave Hudson87973cd2013-10-22 16:00:04 +01003204 kobject_put(si->sys_sfe_ipv4);
3205
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01003206}
3207
3208module_init(sfe_ipv4_init)
3209module_exit(sfe_ipv4_exit)
3210
Dave Hudsondcd08fb2013-11-22 09:25:16 -06003211EXPORT_SYMBOL(sfe_ipv4_recv);
3212EXPORT_SYMBOL(sfe_ipv4_create_rule);
3213EXPORT_SYMBOL(sfe_ipv4_destroy_rule);
3214EXPORT_SYMBOL(sfe_ipv4_destroy_all_rules_for_dev);
3215EXPORT_SYMBOL(sfe_ipv4_register_sync_rule_callback);
Matthew McClintockbe7b47d2013-11-27 13:26:23 -06003216EXPORT_SYMBOL(sfe_ipv4_mark_rule);
Dave Hudsondcd08fb2013-11-22 09:25:16 -06003217
Dave Hudsonaaf97ca2013-06-13 17:52:29 +01003218MODULE_AUTHOR("Qualcomm Atheros Inc.");
3219MODULE_DESCRIPTION("Shortcut Forwarding Engine - IPv4 edition");
3220MODULE_LICENSE("GPL");
3221