blob: e538abf65303702455da3501c617fbe269bfad25 [file] [log] [blame]
/*
* sfe_ipv4.c
* Shortcut forwarding engine - IPv4 edition.
*
* XXX - fill in the appropriate GPL notice.
*/
#include <linux/types.h>
#include <linux/ip.h>
#include <linux/tcp.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/icmp.h>
#include <linux/sysctl.h>
#include <linux/fs.h>
#include <linux/pkt_sched.h>
#include <linux/string.h>
#include <net/route.h>
#include <net/ip.h>
#include <net/tcp.h>
#include <asm/unaligned.h>
#include <asm/uaccess.h>
#include <linux/inetdevice.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
#include <linux/if_bridge.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_acct.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_l3proto.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
#include <net/arp.h>
/*
* Select whether we "hook" in below or above the Ethernet bridge layer.
*
* XXX - note that hooking below the bridge (set this value to 0) will
* not currently work completely cleanly within Linux. In order to make
* this work properly we need to resync stats to Linux. Arguably if we
* want to do this we also need to validate that the source MAC address
* of any packets is actually correct too. Right now we're relying on
* the bridge layer to do this sort of thing for us.
*/
#define SFE_HOOK_ABOVE_BRIDGE 1 /* 1: hook above the Ethernet bridge layer; 0: hook below it */
/*
* Debug output verbosity level.
*/
#define DEBUG_LEVEL 2

/*
 * Debug output macros.
 *
 * Each enabled macro prints a "file[line]: LEVEL:" prefix and then the
 * caller's printk-style format string and arguments. DEBUG_ERROR is
 * enabled when DEBUG_LEVEL >= 1, DEBUG_WARN when >= 2, DEBUG_INFO when >= 3
 * and DEBUG_TRACE when >= 4. A disabled macro expands to an empty
 * statement and does not evaluate its arguments.
 *
 * Every variant is wrapped in do { ... } while (0) so that it expands to
 * exactly one statement: both printk() calls stay together when the macro
 * is the body of an unbraced if/else, and the call site supplies the
 * terminating semicolon.
 */
#if (DEBUG_LEVEL < 1)
#define DEBUG_ERROR(s, ...) do { } while (0)
#else
#define DEBUG_ERROR(s, ...) \
	do { \
		printk("%s[%u]: ERROR:", __FILE__, __LINE__); \
		printk(s, ##__VA_ARGS__); \
	} while (0)
#endif

#if (DEBUG_LEVEL < 2)
#define DEBUG_WARN(s, ...) do { } while (0)
#else
#define DEBUG_WARN(s, ...) \
	do { \
		printk("%s[%u]: WARN:", __FILE__, __LINE__); \
		printk(s, ##__VA_ARGS__); \
	} while (0)
#endif

#if (DEBUG_LEVEL < 3)
#define DEBUG_INFO(s, ...) do { } while (0)
#else
#define DEBUG_INFO(s, ...) \
	do { \
		printk("%s[%u]: INFO:", __FILE__, __LINE__); \
		printk(s, ##__VA_ARGS__); \
	} while (0)
#endif

#if (DEBUG_LEVEL < 4)
#define DEBUG_TRACE(s, ...) do { } while (0)
#else
#define DEBUG_TRACE(s, ...) \
	do { \
		printk("%s[%u]: TRACE:", __FILE__, __LINE__); \
		printk(s, ##__VA_ARGS__); \
	} while (0)
#endif
/*
* The default Linux ethhdr structure is "packed". It also has byte aligned
* MAC addresses and this leads to poor performance. This version is not
* packed and has better alignment for the MAC addresses.
*/
/*
 * Ethernet header laid out as 16-bit words rather than bytes, so the MAC
 * addresses can be accessed with 16-bit loads and stores.
 */
struct sfe_ipv4_ethhdr {
__be16 h_dest[ETH_ALEN / 2]; /* Destination MAC address as 16-bit words */
__be16 h_source[ETH_ALEN / 2]; /* Source MAC address as 16-bit words */
__be16 h_proto; /* Protocol field that follows the two MAC addresses */
};
/*
* The default Linux iphdr structure is "packed". This really hurts performance
* on many CPUs. Here's an aligned and "unpacked" version of the same thing.
*/
/*
 * Unpacked IPv4 header. The two 4-bit fields are declared in an order that
 * depends on the bitfield endianness so that they land in the same byte
 * positions on either kind of CPU.
 */
struct sfe_ipv4_iphdr {
#if defined(__LITTLE_ENDIAN_BITFIELD)
__u8 ihl:4,
version:4;
#elif defined (__BIG_ENDIAN_BITFIELD)
__u8 version:4,
ihl:4;
#else
#error "Please fix <asm/byteorder.h>"
#endif
__u8 tos; /* Type of service */
__be16 tot_len; /* Total length */
__be16 id; /* Identification */
__be16 frag_off; /* Fragment offset (and flags) */
__u8 ttl; /* Time to live */
__u8 protocol; /* Transport protocol number */
__sum16 check; /* Header checksum; see sfe_ipv4_gen_ip_csum() */
__be32 saddr; /* Source address */
__be32 daddr; /* Destination address */
/* The options start here. */
};
/*
* The default Linux udphdr structure is "packed". This really hurts performance
* on many CPUs. Here's an aligned and "unpacked" version of the same thing.
*/
/*
 * Unpacked UDP header.
 */
struct sfe_ipv4_udphdr {
__be16 source; /* Source port */
__be16 dest; /* Destination port */
__be16 len; /* Length field */
__sum16 check; /* Checksum */
};
/*
* The default Linux tcphdr structure is "packed". This really hurts performance
* on many CPUs. Here's an aligned and "unpacked" version of the same thing.
*/
/*
 * Unpacked TCP header. The data offset, reserved bits and flag bits are
 * declared as bitfields whose order depends on the bitfield endianness.
 */
struct sfe_ipv4_tcphdr {
__be16 source; /* Source port */
__be16 dest; /* Destination port */
__be32 seq; /* Sequence number */
__be32 ack_seq; /* Acknowledgement number */
#if defined(__LITTLE_ENDIAN_BITFIELD)
/* Little-endian bitfield order: reserved bits, data offset, then flags from FIN up to CWR */
__u16 res1:4,
doff:4,
fin:1,
syn:1,
rst:1,
psh:1,
ack:1,
urg:1,
ece:1,
cwr:1;
#elif defined(__BIG_ENDIAN_BITFIELD)
/* Big-endian bitfield order: data offset, reserved bits, then flags from CWR down to FIN */
__u16 doff:4,
res1:4,
cwr:1,
ece:1,
urg:1,
ack:1,
psh:1,
rst:1,
syn:1,
fin:1;
#else
#error "Adjust your <asm/byteorder.h> defines"
#endif
__be16 window; /* Window field */
__sum16 check; /* Checksum */
__be16 urg_ptr; /* Urgent pointer */
};
/*
* IPv4 connection flags.
*/
#define SFE_IPV4_CREATE_FLAG_NO_SEQ_CHECK 0x1
/*
 * Indicates that we should not check sequence numbers.
 * NOTE(review): presumably carried in sfe_ipv4_create.flags - the consumer
 * is not visible in this section; confirm.
 */
/*
* IPv4 connection creation structure.
*/
/*
 * Parameters describing a connection to be created.
 *
 * NOTE(review): the code that consumes this structure is not visible in
 * this section. The field descriptions below are inferred from the field
 * names and from the matching fields of struct sfe_ipv4_connection and
 * struct sfe_ipv4_tcp_connection_match - confirm against the create path.
 */
struct sfe_ipv4_create {
int protocol; /* IP protocol number */
struct net_device *src_dev; /* Source-side network device */
struct net_device *dest_dev; /* Destination-side network device */
uint32_t flags; /* SFE_IPV4_CREATE_FLAG_... bits */
uint32_t src_mtu; /* MTU on the source side */
uint32_t dest_mtu; /* MTU on the destination side */
__be32 src_ip; /* Source IP address */
__be32 src_ip_xlate; /* NAT-translated source IP address */
__be32 dest_ip; /* Destination IP address */
__be32 dest_ip_xlate; /* NAT-translated destination IP address */
__be16 src_port; /* Source port */
__be16 src_port_xlate; /* NAT-translated source port */
__be16 dest_port; /* Destination port */
__be16 dest_port_xlate; /* NAT-translated destination port */
uint8_t src_mac[ETH_ALEN]; /* Source MAC address */
uint8_t src_mac_xlate[ETH_ALEN]; /* Translated source MAC address */
uint8_t dest_mac[ETH_ALEN]; /* Destination MAC address */
uint8_t dest_mac_xlate[ETH_ALEN]; /* Translated destination MAC address */
/* TCP tracking state for the source side */
uint8_t src_td_window_scale;
uint32_t src_td_max_window;
uint32_t src_td_end;
uint32_t src_td_max_end;
/* TCP tracking state for the destination side */
uint8_t dest_td_window_scale;
uint32_t dest_td_max_window;
uint32_t dest_td_end;
uint32_t dest_td_max_end;
};
/*
* IPv4 connection destruction structure.
*/
/*
 * Identifies a connection to be destroyed by its protocol, addresses and
 * ports.
 * NOTE(review): the destroy path is not visible in this section; these
 * presumably match the untranslated tuple held in struct
 * sfe_ipv4_connection - confirm.
 */
struct sfe_ipv4_destroy {
int protocol; /* IP protocol number */
__be32 src_ip; /* Source IP address */
__be32 dest_ip; /* Destination IP address */
__be16 src_port; /* Source port */
__be16 dest_port; /* Destination port */
};
/*
* IPv4 sync reasons.
*/
/* NOTE(review): presumably the values stored in sfe_ipv4_sync.reason - confirm against the sync callers. */
#define SFE_IPV4_SYNC_REASON_STATS 0 /* Sync is to synchronize stats */
#define SFE_IPV4_SYNC_REASON_FLUSH 1 /* Sync is to flush a cache entry */
#define SFE_IPV4_SYNC_REASON_EVICT 2 /* Sync is to evict a cache entry */
/*
* Structure used to sync IPv4 connection stats/state back within the system.
*
* NOTE: The addresses here are NON-NAT addresses, i.e. the true endpoint addressing.
* 'src' is the creator of the connection.
*/
/*
 * Snapshot of one connection's state and statistics that is handed to
 * sfe_ipv4_sync_rule() to update the matching conntrack entry.
 */
struct sfe_ipv4_sync {
int protocol; /* IP protocol number (IPPROTO_...) */
__be32 src_ip; /* Non-NAT source address, i.e. the creator of the connection */
__be16 src_port; /* Non-NAT source port */
__be32 dest_ip; /* Non-NAT destination address, i.e. to whom the connection was created */
__be16 dest_port; /* Non-NAT destination port */
/* TCP window tracking for the source; applied to conntrack's seen[0] state */
uint32_t src_td_max_window;
uint32_t src_td_end;
uint32_t src_td_max_end;
/* Counts added to the conntrack accounting for the original direction */
uint64_t src_packet_count;
uint64_t src_byte_count;
/* TCP window tracking for the destination; applied to conntrack's seen[1] state */
uint32_t dest_td_max_window;
uint32_t dest_td_end;
uint32_t dest_td_max_end;
/* Counts added to the conntrack accounting for the reply direction */
uint64_t dest_packet_count;
uint64_t dest_byte_count;
uint64_t delta_jiffies; /* Time to be added to the current timeout to keep the connection alive */
uint8_t reason; /* Reason of synchronization */
};
/*
* Specifies the lower bound on ACK numbers carried in the TCP header
*/
#define SFE_IPV4_TCP_MAX_ACK_WINDOW 65520 /* NOTE(review): use not visible in this section; presumably a byte count - confirm */
/*
* IPv4 TCP connection match additional data.
*/
/*
 * TCP-specific tracking state held in the protocol_state union of a
 * connection match entry.
 */
struct sfe_ipv4_tcp_connection_match {
uint8_t win_scale; /* Window scale */
uint32_t max_win; /* Maximum window size seen */
uint32_t end; /* Sequence number of the next byte to send (seq + segment length) */
uint32_t max_end; /* Sequence number of the last byte to ack */
};
/*
* Bit flags for IPv4 connection matching entry.
*/
/* These bits are tested against sfe_ipv4_connection_match.flags. */
#define SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_SRC 0x1
/* Perform source translation */
#define SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_DEST 0x2
/* Perform destination translation */
#define SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK 0x4
/* Ignore TCP sequence numbers */
#define SFE_IPV4_CONNECTION_MATCH_FLAG_FAST_ETH_HDR 0x8
/* Fast Ethernet header write */
/*
* IPv4 connection matching structure.
*/
/*
 * One direction of a tracked connection: the tuple used to match packets,
 * the translation to apply, where to transmit, and the receive counters.
 */
struct sfe_ipv4_connection_match {
/*
 * References to other objects.
 */
struct sfe_ipv4_connection_match *next;
/* Next connection match entry in a list (chain in sfe_ipv4.conn_match_hash) */
struct sfe_ipv4_connection_match *prev;
/* Previous connection match entry in a list; NULL at the head of the chain */
struct sfe_ipv4_connection *connection;
/* Pointer to our connection */
struct sfe_ipv4_connection_match *counter_match;
/* Pointer to the connection match in the "counter" direction to this one */
struct sfe_ipv4_connection_match *active_next;
/* Pointer to the next connection in the active list */
struct sfe_ipv4_connection_match *active_prev;
/* Pointer to the previous connection in the active list */
bool active; /* Flag to indicate if we're on the active list */
/*
 * Characteristics that identify flows that match this rule.
 * All six values are hashed and compared during lookup.
 */
struct net_device *match_dev; /* Network device */
uint8_t match_protocol; /* Protocol */
__be32 match_src_ip; /* Source IP address */
__be32 match_dest_ip; /* Destination IP address */
__be16 match_src_port; /* Source port/connection ident */
__be16 match_dest_port; /* Destination port/connection ident */
/*
 * Control the operations of the match.
 */
uint32_t flags; /* Bit flags (SFE_IPV4_CONNECTION_MATCH_FLAG_...) */
/*
 * Connection state that we track once we match.
 */
union { /* Protocol-specific state */
struct sfe_ipv4_tcp_connection_match tcp;
} protocol_state;
/*
 * 32-bit running counters; they are folded into the 64-bit summary
 * counters below and reset to zero by
 * sfe_ipv4_connection_match_update_summary_stats().
 */
uint32_t rx_packet_count; /* Number of packets RX'd */
uint32_t rx_byte_count; /* Number of bytes RX'd */
/*
 * Packet translation information.
 * The checksum adjustments are precomputed by
 * sfe_ipv4_connection_match_compute_translations().
 */
__be32 xlate_src_ip; /* Address after source translation */
__be16 xlate_src_port; /* Port/connection ident after source translation */
uint16_t xlate_src_csum_adjustment;
/* Transport layer checksum adjustment after source translation */
__be32 xlate_dest_ip; /* Address after destination translation */
__be16 xlate_dest_port; /* Port/connection ident after destination translation */
uint16_t xlate_dest_csum_adjustment;
/* Transport layer checksum adjustment after destination translation */
/*
 * Packet transmit information.
 */
struct net_device *xmit_dev; /* Network device on which to transmit */
unsigned short int xmit_dev_mtu;
/* Interface MTU */
uint16_t xmit_dest_mac[ETH_ALEN / 2];
/* Destination MAC address to use when forwarding */
uint16_t xmit_src_mac[ETH_ALEN / 2];
/* Source MAC address to use when forwarding */
/*
 * Summary stats.
 */
uint64_t rx_packet_count64; /* Number of packets RX'd */
uint64_t rx_byte_count64; /* Number of bytes RX'd */
};
/*
* Per-connection data structure.
*/
/*
 * A tracked connection. It owns two match entries (original and reply
 * direction) and is linked both into a hash chain and into the list of all
 * connections.
 */
struct sfe_ipv4_connection {
struct sfe_ipv4_connection *next;
/* Pointer to the next entry in a hash chain */
struct sfe_ipv4_connection *prev;
/* Pointer to the previous entry in a hash chain */
/* The protocol, src/dest IP and src/dest port below form the lookup key */
int protocol; /* IP protocol number */
__be32 src_ip; /* Source IP address */
__be32 src_ip_xlate; /* NAT-translated source IP address */
__be32 dest_ip; /* Destination IP address */
__be32 dest_ip_xlate; /* NAT-translated destination IP address */
__be16 src_port; /* Source port */
__be16 src_port_xlate; /* NAT-translated source port */
__be16 dest_port; /* Destination port */
__be16 dest_port_xlate; /* NAT-translated destination port */
struct sfe_ipv4_connection_match *original_match;
/* Original direction matching structure */
struct net_device *original_dev;
/* Original direction source device */
struct sfe_ipv4_connection_match *reply_match;
/* Reply direction matching structure */
struct net_device *reply_dev; /* Reply direction source device */
uint64_t last_sync_jiffies; /* Jiffies count for the last sync */
struct sfe_ipv4_connection *all_connections_next;
/* Pointer to the next entry in the list of all connections */
struct sfe_ipv4_connection *all_connections_prev;
/* Pointer to the previous entry in the list of all connections */
int iterators; /* Number of iterators currently using this connection */
bool pending_free; /* Flag that indicates that this connection should be freed after iteration */
};
/*
* IPv4 connections and hash table size information.
*/
/*
 * Both the connection hash and the connection match hash are arrays of
 * SFE_IPV4_CONNECTION_HASH_SIZE buckets. The size is a power of two so a
 * hash value can be reduced to a bucket index with the mask.
 */
#define SFE_IPV4_CONNECTION_HASH_SHIFT 12
#define SFE_IPV4_CONNECTION_HASH_SIZE (1 << SFE_IPV4_CONNECTION_HASH_SHIFT)
#define SFE_IPV4_CONNECTION_HASH_MASK (SFE_IPV4_CONNECTION_HASH_SIZE - 1)
/*
 * Exception event identifiers.
 *
 * Each value indexes the exception_events[] and exception_events64[]
 * counter arrays in struct sfe_ipv4 and the name table below.
 * SFE_IPV4_EXCEPTION_EVENT_LAST is the number of events, not an event.
 */
enum sfe_ipv4_exception_events {
	/* UDP */
	SFE_IPV4_EXCEPTION_EVENT_UDP_HEADER_INCOMPLETE,
	SFE_IPV4_EXCEPTION_EVENT_UDP_NO_CONNECTION,
	SFE_IPV4_EXCEPTION_EVENT_UDP_IP_OPTIONS_OR_INITIAL_FRAGMENT,
	SFE_IPV4_EXCEPTION_EVENT_UDP_SMALL_TTL,
	SFE_IPV4_EXCEPTION_EVENT_UDP_NEEDS_FRAGMENTATION,

	/* TCP */
	SFE_IPV4_EXCEPTION_EVENT_TCP_HEADER_INCOMPLETE,
	SFE_IPV4_EXCEPTION_EVENT_TCP_NO_CONNECTION_SLOW_FLAGS,
	SFE_IPV4_EXCEPTION_EVENT_TCP_NO_CONNECTION_FAST_FLAGS,
	SFE_IPV4_EXCEPTION_EVENT_TCP_IP_OPTIONS_OR_INITIAL_FRAGMENT,
	SFE_IPV4_EXCEPTION_EVENT_TCP_SMALL_TTL,
	SFE_IPV4_EXCEPTION_EVENT_TCP_NEEDS_FRAGMENTATION,
	SFE_IPV4_EXCEPTION_EVENT_TCP_FLAGS,
	SFE_IPV4_EXCEPTION_EVENT_TCP_SEQ_EXCEEDS_RIGHT_EDGE,
	SFE_IPV4_EXCEPTION_EVENT_TCP_SMALL_DATA_OFFS,
	SFE_IPV4_EXCEPTION_EVENT_TCP_BAD_SACK,
	SFE_IPV4_EXCEPTION_EVENT_TCP_BIG_DATA_OFFS,
	SFE_IPV4_EXCEPTION_EVENT_TCP_SEQ_BEFORE_LEFT_EDGE,
	SFE_IPV4_EXCEPTION_EVENT_TCP_ACK_EXCEEDS_RIGHT_EDGE,
	SFE_IPV4_EXCEPTION_EVENT_TCP_ACK_BEFORE_LEFT_EDGE,

	/* ICMP */
	SFE_IPV4_EXCEPTION_EVENT_ICMP_HEADER_INCOMPLETE,
	SFE_IPV4_EXCEPTION_EVENT_ICMP_UNHANDLED_TYPE,
	SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_HEADER_INCOMPLETE,
	SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_NON_V4,
	SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_IP_OPTIONS_INCOMPLETE,
	SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_UDP_HEADER_INCOMPLETE,
	SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_TCP_HEADER_INCOMPLETE,
	SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_UNHANDLED_PROTOCOL,
	SFE_IPV4_EXCEPTION_EVENT_ICMP_NO_CONNECTION,
	SFE_IPV4_EXCEPTION_EVENT_ICMP_FLUSHED_CONNECTION,

	/* Protocol-independent */
	SFE_IPV4_EXCEPTION_EVENT_HEADER_INCOMPLETE,
	SFE_IPV4_EXCEPTION_EVENT_BAD_TOTAL_LENGTH,
	SFE_IPV4_EXCEPTION_EVENT_NON_V4,
	SFE_IPV4_EXCEPTION_EVENT_NON_INITIAL_FRAGMENT,
	SFE_IPV4_EXCEPTION_EVENT_DATAGRAM_INCOMPLETE,
	SFE_IPV4_EXCEPTION_EVENT_IP_OPTIONS_INCOMPLETE,
	SFE_IPV4_EXCEPTION_EVENT_UNHANDLED_PROTOCOL,

	SFE_IPV4_EXCEPTION_EVENT_LAST
};

/*
 * Printable name of each exception event, indexed by the enum value.
 *
 * Every entry is a designated initializer generated from the event's
 * suffix, so each name is tied to its enumerator rather than to its
 * position in the list.
 */
#define SFE_IPV4_EXCEPTION_EVENT_NAME(event) [SFE_IPV4_EXCEPTION_EVENT_##event] = #event
static char *sfe_ipv4_exception_events_string[SFE_IPV4_EXCEPTION_EVENT_LAST] = {
	SFE_IPV4_EXCEPTION_EVENT_NAME(UDP_HEADER_INCOMPLETE),
	SFE_IPV4_EXCEPTION_EVENT_NAME(UDP_NO_CONNECTION),
	SFE_IPV4_EXCEPTION_EVENT_NAME(UDP_IP_OPTIONS_OR_INITIAL_FRAGMENT),
	SFE_IPV4_EXCEPTION_EVENT_NAME(UDP_SMALL_TTL),
	SFE_IPV4_EXCEPTION_EVENT_NAME(UDP_NEEDS_FRAGMENTATION),
	SFE_IPV4_EXCEPTION_EVENT_NAME(TCP_HEADER_INCOMPLETE),
	SFE_IPV4_EXCEPTION_EVENT_NAME(TCP_NO_CONNECTION_SLOW_FLAGS),
	SFE_IPV4_EXCEPTION_EVENT_NAME(TCP_NO_CONNECTION_FAST_FLAGS),
	SFE_IPV4_EXCEPTION_EVENT_NAME(TCP_IP_OPTIONS_OR_INITIAL_FRAGMENT),
	SFE_IPV4_EXCEPTION_EVENT_NAME(TCP_SMALL_TTL),
	SFE_IPV4_EXCEPTION_EVENT_NAME(TCP_NEEDS_FRAGMENTATION),
	SFE_IPV4_EXCEPTION_EVENT_NAME(TCP_FLAGS),
	SFE_IPV4_EXCEPTION_EVENT_NAME(TCP_SEQ_EXCEEDS_RIGHT_EDGE),
	SFE_IPV4_EXCEPTION_EVENT_NAME(TCP_SMALL_DATA_OFFS),
	SFE_IPV4_EXCEPTION_EVENT_NAME(TCP_BAD_SACK),
	SFE_IPV4_EXCEPTION_EVENT_NAME(TCP_BIG_DATA_OFFS),
	SFE_IPV4_EXCEPTION_EVENT_NAME(TCP_SEQ_BEFORE_LEFT_EDGE),
	SFE_IPV4_EXCEPTION_EVENT_NAME(TCP_ACK_EXCEEDS_RIGHT_EDGE),
	SFE_IPV4_EXCEPTION_EVENT_NAME(TCP_ACK_BEFORE_LEFT_EDGE),
	SFE_IPV4_EXCEPTION_EVENT_NAME(ICMP_HEADER_INCOMPLETE),
	SFE_IPV4_EXCEPTION_EVENT_NAME(ICMP_UNHANDLED_TYPE),
	SFE_IPV4_EXCEPTION_EVENT_NAME(ICMP_IPV4_HEADER_INCOMPLETE),
	SFE_IPV4_EXCEPTION_EVENT_NAME(ICMP_IPV4_NON_V4),
	SFE_IPV4_EXCEPTION_EVENT_NAME(ICMP_IPV4_IP_OPTIONS_INCOMPLETE),
	SFE_IPV4_EXCEPTION_EVENT_NAME(ICMP_IPV4_UDP_HEADER_INCOMPLETE),
	SFE_IPV4_EXCEPTION_EVENT_NAME(ICMP_IPV4_TCP_HEADER_INCOMPLETE),
	SFE_IPV4_EXCEPTION_EVENT_NAME(ICMP_IPV4_UNHANDLED_PROTOCOL),
	SFE_IPV4_EXCEPTION_EVENT_NAME(ICMP_NO_CONNECTION),
	SFE_IPV4_EXCEPTION_EVENT_NAME(ICMP_FLUSHED_CONNECTION),
	SFE_IPV4_EXCEPTION_EVENT_NAME(HEADER_INCOMPLETE),
	SFE_IPV4_EXCEPTION_EVENT_NAME(BAD_TOTAL_LENGTH),
	SFE_IPV4_EXCEPTION_EVENT_NAME(NON_V4),
	SFE_IPV4_EXCEPTION_EVENT_NAME(NON_INITIAL_FRAGMENT),
	SFE_IPV4_EXCEPTION_EVENT_NAME(DATAGRAM_INCOMPLETE),
	SFE_IPV4_EXCEPTION_EVENT_NAME(IP_OPTIONS_INCOMPLETE),
	SFE_IPV4_EXCEPTION_EVENT_NAME(UNHANDLED_PROTOCOL)
};
#undef SFE_IPV4_EXCEPTION_EVENT_NAME
/*
* Per-module structure.
*/
/*
 * Module-wide state: the lock, the connection lists and hash tables, the
 * statistics counters, and the control and notifier linkage.
 */
struct sfe_ipv4 {
spinlock_t lock; /* Lock for SMP correctness */
struct sfe_ipv4_connection_match *active_head;
/* Head of the list of recently active connections */
struct sfe_ipv4_connection_match *active_tail;
/* Tail of the list of recently active connections */
struct sfe_ipv4_connection *all_connections_head;
/* Head of the list of all connections */
struct sfe_ipv4_connection *all_connections_tail;
/* Tail of the list of all connections */
unsigned int num_connections; /* Number of connections */
struct timer_list timer; /* Timer used for periodic sync ops */
struct sfe_ipv4_connection *conn_hash[SFE_IPV4_CONNECTION_HASH_SIZE];
/* Connection hash table */
struct sfe_ipv4_connection_match *conn_match_hash[SFE_IPV4_CONNECTION_HASH_SIZE];
/* Connection match hash table */
/*
 * Statistics.
 * These 32-bit counters are added into the 64-bit summary counters
 * below and reset to zero by sfe_ipv4_update_summary_stats().
 */
uint32_t connection_create_requests;
/* Number of IPv4 connection create requests */
uint32_t connection_create_collisions;
/* Number of IPv4 connection create requests that collided with existing hash table entries */
uint32_t connection_destroy_requests;
/* Number of IPv4 connection destroy requests */
uint32_t connection_destroy_misses;
/* Number of IPv4 connection destroy requests that missed our hash table */
uint32_t connection_match_hash_hits;
/* Number of IPv4 connection match hash hits */
uint32_t connection_match_hash_reorders;
/* Number of IPv4 connection match hash reorders */
uint32_t connection_flushes; /* Number of IPv4 connection flushes */
uint32_t packets_forwarded; /* Number of IPv4 packets forwarded */
uint32_t packets_not_forwarded; /* Number of IPv4 packets not forwarded */
uint32_t exception_events[SFE_IPV4_EXCEPTION_EVENT_LAST];
/* Per-event exception counts, indexed by enum sfe_ipv4_exception_events */
/*
 * Summary statistics.
 */
uint64_t connection_create_requests64;
/* Number of IPv4 connection create requests */
uint64_t connection_create_collisions64;
/* Number of IPv4 connection create requests that collided with existing hash table entries */
uint64_t connection_destroy_requests64;
/* Number of IPv4 connection destroy requests */
uint64_t connection_destroy_misses64;
/* Number of IPv4 connection destroy requests that missed our hash table */
uint64_t connection_match_hash_hits64;
/* Number of IPv4 connection match hash hits */
uint64_t connection_match_hash_reorders64;
/* Number of IPv4 connection match hash reorders */
uint64_t connection_flushes64; /* Number of IPv4 connection flushes */
uint64_t packets_forwarded64; /* Number of IPv4 packets forwarded */
uint64_t packets_not_forwarded64;
/* Number of IPv4 packets not forwarded */
uint64_t exception_events64[SFE_IPV4_EXCEPTION_EVENT_LAST];
/* Per-event exception counts, indexed by enum sfe_ipv4_exception_events */
/*
 * Control state.
 */
struct kobject *sys_sfe_ipv4; /* sysfs linkage */
int pause; /* Flag that, when non-zero, pauses all SFE processing */
int debug_dev; /* Major number of the debug char device */
/*
 * Callback notifiers.
 */
struct notifier_block dev_notifier;
/* Device notifier */
struct notifier_block inet_notifier;
/* IP notifier */
};
/*
* Enumeration of the XML output.
*/
/*
 * States of the XML output state machine (see the state field of
 * struct sfe_ipv4_debug_xml_write_state).
 * NOTE(review): the writer functions are not visible in this section;
 * presumably the states are visited in the order listed - confirm.
 */
enum sfe_ipv4_debug_xml_states {
SFE_IPV4_DEBUG_XML_STATE_START,
SFE_IPV4_DEBUG_XML_STATE_CONNECTIONS_START,
SFE_IPV4_DEBUG_XML_STATE_CONNECTIONS_CONNECTION,
SFE_IPV4_DEBUG_XML_STATE_CONNECTIONS_END,
SFE_IPV4_DEBUG_XML_STATE_EXCEPTIONS_START,
SFE_IPV4_DEBUG_XML_STATE_EXCEPTIONS_EXCEPTION,
SFE_IPV4_DEBUG_XML_STATE_EXCEPTIONS_END,
SFE_IPV4_DEBUG_XML_STATE_STATS,
SFE_IPV4_DEBUG_XML_STATE_END,
SFE_IPV4_DEBUG_XML_STATE_DONE
};
/*
* XML write state.
*/
/*
 * Progress of an XML dump: the current state machine state plus the
 * iterators that record where in the connection list and the exception
 * table the dump has reached.
 */
struct sfe_ipv4_debug_xml_write_state {
enum sfe_ipv4_debug_xml_states state;
/* XML output file state machine state */
struct sfe_ipv4_connection *iter_conn;
/* Next connection iterator */
int iter_exception; /* Next exception iterator */
};
/*
 * Signature of a function that writes one piece of the XML output.
 * NOTE(review): the implementations are not visible in this section, so
 * the meaning of buffer, msg, length, total_read and of the bool result is
 * not documented here - confirm against the writers.
 */
typedef bool (*sfe_ipv4_debug_xml_write_method_t)(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length,
int *total_read, struct sfe_ipv4_debug_xml_write_state *ws);
/*
 * The module's single struct sfe_ipv4 instance (not declared static).
 */
struct sfe_ipv4 __si;
/*
 * Expose what should be a static flag in the TCP connection tracker.
 */
extern int nf_ct_tcp_no_window_check;
/*
 * Expose the hook for the receive processing.
 */
extern int (*athrs_fast_nat_recv)(struct sk_buff *skb);
/*
* sfe_ipv4_gen_ip_csum()
* Generate the IP checksum for an IPv4 header.
*
* Note that this function assumes that we have only 20 bytes of IP header.
*/
static inline uint16_t sfe_ipv4_gen_ip_csum(struct sfe_ipv4_iphdr *iph)
{
uint32_t sum;
uint16_t *i = (uint16_t *)iph;
iph->check = 0;
/*
* Generate the sum.
*/
sum = i[0] + i[1] + i[2] + i[3] + i[4] + i[5] + i[6] + i[7] + i[8] + i[9];
/*
* Fold it to ones-complement form.
*/
sum = (sum & 0xffff) + (sum >> 16);
sum = (sum & 0xffff) + (sum >> 16);
return (uint16_t)sum ^ 0xffff;
}
/*
* sfe_ipv4_get_connection_match_hash()
* Generate the hash used in connection match lookups.
*/
static inline unsigned int sfe_ipv4_get_connection_match_hash(struct net_device *dev, uint8_t protocol,
							      __be32 src_ip, __be16 src_port,
							      __be32 dest_ip, __be16 dest_port)
{
	/* Start from the low 32 bits of the device pointer value. */
	uint32_t mix = (uint32_t)(size_t)dev;

	/* XOR in the addresses, the protocol and the ports (host byte order). */
	mix ^= ntohl(src_ip ^ dest_ip);
	mix ^= protocol;
	mix ^= ntohs(src_port ^ dest_port);

	/* Fold the upper bits down, then reduce to a bucket index. */
	mix ^= mix >> SFE_IPV4_CONNECTION_HASH_SHIFT;
	return mix & SFE_IPV4_CONNECTION_HASH_MASK;
}
/*
* sfe_ipv4_find_sfe_ipv4_connection_match()
* Get the IPv4 flow match info that corresponds to a particular 5-tuple.
*
* On entry we must be holding the lock that protects the hash table.
*/
static struct sfe_ipv4_connection_match *
sfe_ipv4_find_sfe_ipv4_connection_match(struct sfe_ipv4 *si, struct net_device *dev, uint8_t protocol,
					__be32 src_ip, __be16 src_port,
					__be32 dest_ip, __be16 dest_port) __attribute__((always_inline));
static struct sfe_ipv4_connection_match *
sfe_ipv4_find_sfe_ipv4_connection_match(struct sfe_ipv4 *si, struct net_device *dev, uint8_t protocol,
					__be32 src_ip, __be16 src_port,
					__be32 dest_ip, __be16 dest_port)
{
	unsigned int bucket = sfe_ipv4_get_connection_match_hash(dev, protocol, src_ip, src_port,
								 dest_ip, dest_port);
	struct sfe_ipv4_connection_match *first = si->conn_match_hash[bucket];
	struct sfe_ipv4_connection_match *hit;
	struct sfe_ipv4_connection_match *before;
	struct sfe_ipv4_connection_match *after;

	/*
	 * Empty chain: nothing can match.
	 */
	if (unlikely(!first)) {
		return NULL;
	}

	/*
	 * Fast path: the entry at the head of the chain is the one wanted.
	 * Only this path counts as a hash "hit".
	 */
	if (likely(first->match_src_port == src_port)
	    && likely(first->match_dest_port == dest_port)
	    && likely(first->match_src_ip == src_ip)
	    && likely(first->match_dest_ip == dest_ip)
	    && likely(first->match_protocol == protocol)
	    && likely(first->match_dev == dev)) {
		si->connection_match_hash_hits++;
		return first;
	}

	/*
	 * Slow path: walk the rest of the chain looking for an exact match.
	 */
	for (hit = first->next; hit; hit = hit->next) {
		if (hit->match_src_port == src_port
		    && hit->match_dest_port == dest_port
		    && hit->match_src_ip == src_ip
		    && hit->match_dest_ip == dest_ip
		    && hit->match_protocol == protocol
		    && hit->match_dev == dev) {
			break;
		}
	}

	if (unlikely(!hit)) {
		return NULL;
	}

	/*
	 * Move the entry to the head of the chain, on the presumption that
	 * it will be looked up again very soon. It is not the head, so it
	 * always has a predecessor.
	 */
	before = hit->prev;
	after = hit->next;
	if (after) {
		after->prev = before;
	}
	before->next = after;

	hit->prev = NULL;
	hit->next = first;
	first->prev = hit;
	si->conn_match_hash[bucket] = hit;
	si->connection_match_hash_reorders++;

	return hit;
}
/*
* sfe_ipv4_connection_match_update_summary_stats()
* Update the summary stats for a connection match entry.
*/
static inline void sfe_ipv4_connection_match_update_summary_stats(struct sfe_ipv4_connection_match *cm)
{
	/*
	 * Add each 32-bit running counter into its 64-bit summary counter,
	 * then restart the running counter from zero.
	 */
	cm->rx_byte_count64 += cm->rx_byte_count;
	cm->rx_packet_count64 += cm->rx_packet_count;

	cm->rx_byte_count = 0;
	cm->rx_packet_count = 0;
}
/*
* sfe_ipv4_connection_match_compute_translations()
* Compute port and address translations for a connection match entry.
*/
static void sfe_ipv4_connection_match_compute_translations(struct sfe_ipv4_connection_match *cm)
{
/*
* Before we insert the entry look to see if this is tagged as doing address
* translations. If it is then work out the adjustment that we need to apply
* to the transport checksum.
*/
if (cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_SRC) {
/*
* Precompute an incremental checksum adjustment so we can
* edit packets in this stream very quickly. The algorithm is from RFC1624.
*/
uint16_t src_ip_hi = cm->match_src_ip >> 16;
uint16_t src_ip_lo = cm->match_src_ip & 0xffff;
uint32_t xlate_src_ip = ~cm->xlate_src_ip;
uint16_t xlate_src_ip_hi = xlate_src_ip >> 16;
uint16_t xlate_src_ip_lo = xlate_src_ip & 0xffff;
uint16_t xlate_src_port = ~cm->xlate_src_port;
uint32_t adj;
/*
* When we compute this fold it down to a 16-bit offset
* as that way we can avoid having to do a double
* folding of the twos-complement result because the
* addition of 2 16-bit values cannot cause a double
* wrap-around!
*/
adj = src_ip_hi + src_ip_lo + cm->match_src_port
+ xlate_src_ip_hi + xlate_src_ip_lo + xlate_src_port;
adj = (adj & 0xffff) + (adj >> 16);
adj = (adj & 0xffff) + (adj >> 16);
cm->xlate_src_csum_adjustment = (uint16_t)adj;
}
if (cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_DEST) {
/*
* Precompute an incremental checksum adjustment so we can
* edit packets in this stream very quickly. The algorithm is from RFC1624.
*/
uint16_t dest_ip_hi = cm->match_dest_ip >> 16;
uint16_t dest_ip_lo = cm->match_dest_ip & 0xffff;
uint32_t xlate_dest_ip = ~cm->xlate_dest_ip;
uint16_t xlate_dest_ip_hi = xlate_dest_ip >> 16;
uint16_t xlate_dest_ip_lo = xlate_dest_ip & 0xffff;
uint16_t xlate_dest_port = ~cm->xlate_dest_port;
uint32_t adj;
/*
* When we compute this fold it down to a 16-bit offset
* as that way we can avoid having to do a double
* folding of the twos-complement result because the
* addition of 2 16-bit values cannot cause a double
* wrap-around!
*/
adj = dest_ip_hi + dest_ip_lo + cm->match_dest_port
+ xlate_dest_ip_hi + xlate_dest_ip_lo + xlate_dest_port;
adj = (adj & 0xffff) + (adj >> 16);
adj = (adj & 0xffff) + (adj >> 16);
cm->xlate_dest_csum_adjustment = (uint16_t)adj;
}
}
/*
* sfe_ipv4_update_summary_stats()
* Update the summary stats.
*/
/*
 * Add the 32-bit counter si-><field> into si-><field>64 and reset the
 * 32-bit counter. Local to sfe_ipv4_update_summary_stats().
 */
#define SFE_IPV4_FOLD_COUNTER(field) \
	do { \
		si->field##64 += si->field; \
		si->field = 0; \
	} while (0)

static void sfe_ipv4_update_summary_stats(struct sfe_ipv4 *si)
{
	int event;

	SFE_IPV4_FOLD_COUNTER(connection_create_requests);
	SFE_IPV4_FOLD_COUNTER(connection_create_collisions);
	SFE_IPV4_FOLD_COUNTER(connection_destroy_requests);
	SFE_IPV4_FOLD_COUNTER(connection_destroy_misses);
	SFE_IPV4_FOLD_COUNTER(connection_match_hash_hits);
	SFE_IPV4_FOLD_COUNTER(connection_match_hash_reorders);
	SFE_IPV4_FOLD_COUNTER(connection_flushes);
	SFE_IPV4_FOLD_COUNTER(packets_forwarded);
	SFE_IPV4_FOLD_COUNTER(packets_not_forwarded);

	/*
	 * The per-event exception counters are folded the same way.
	 */
	for (event = 0; event < SFE_IPV4_EXCEPTION_EVENT_LAST; event++) {
		si->exception_events64[event] += si->exception_events[event];
		si->exception_events[event] = 0;
	}
}

#undef SFE_IPV4_FOLD_COUNTER
/*
* sfe_ipv4_insert_sfe_ipv4_connection_match()
* Insert a connection match into the hash.
*
* On entry we must be holding the lock that protects the hash table.
*/
static inline void sfe_ipv4_insert_sfe_ipv4_connection_match(struct sfe_ipv4 *si, struct sfe_ipv4_connection_match *cm)
{
	unsigned int bucket = sfe_ipv4_get_connection_match_hash(cm->match_dev, cm->match_protocol,
								 cm->match_src_ip, cm->match_src_port,
								 cm->match_dest_ip, cm->match_dest_port);
	struct sfe_ipv4_connection_match *old_head = si->conn_match_hash[bucket];

	/*
	 * The new entry becomes the head of its chain; the previous head,
	 * if there was one, now follows it.
	 */
	cm->next = old_head;
	cm->prev = NULL;
	if (old_head) {
		old_head->prev = cm;
	}
	si->conn_match_hash[bucket] = cm;
}
/*
* sfe_ipv4_remove_sfe_ipv4_connection_match()
* Remove a connection match object from the hash.
*
* On entry we must be holding the lock that protects the hash table.
*/
static inline void sfe_ipv4_remove_sfe_ipv4_connection_match(struct sfe_ipv4 *si, struct sfe_ipv4_connection_match *cm)
{
	/*
	 * Unlink the connection match entry from the hash. An entry with no
	 * predecessor is the head of its chain, so the bucket itself has to
	 * be repointed.
	 */
	if (cm->prev) {
		cm->prev->next = cm->next;
	} else {
		unsigned int conn_match_idx
			= sfe_ipv4_get_connection_match_hash(cm->match_dev, cm->match_protocol,
							     cm->match_src_ip, cm->match_src_port,
							     cm->match_dest_ip, cm->match_dest_port);
		si->conn_match_hash[conn_match_idx] = cm->next;
	}

	if (cm->next) {
		cm->next->prev = cm->prev;
	}

	/*
	 * Only an entry that is on the active list may be unlinked from it.
	 * An entry that is not on the list has neither an active_prev nor an
	 * active_next, so unlinking it unconditionally would overwrite both
	 * si->active_head and si->active_tail with NULL and drop every other
	 * entry from the active list.
	 */
	if (!cm->active) {
		return;
	}

	/*
	 * Unlink the connection match entry from the active list.
	 */
	if (likely(cm->active_prev)) {
		cm->active_prev->active_next = cm->active_next;
	} else {
		si->active_head = cm->active_next;
	}

	if (likely(cm->active_next)) {
		cm->active_next->active_prev = cm->active_prev;
	} else {
		si->active_tail = cm->active_prev;
	}
}
/*
* sfe_ipv4_get_connection_hash()
* Generate the hash used in connection lookups.
*/
static inline unsigned int sfe_ipv4_get_connection_hash(uint8_t protocol, __be32 src_ip, __be16 src_port,
							__be32 dest_ip, __be16 dest_port)
{
	/* XOR together the addresses, the protocol and the ports (host byte order). */
	uint32_t mix = ntohl(src_ip ^ dest_ip);

	mix ^= protocol;
	mix ^= ntohs(src_port ^ dest_port);

	/* Fold the upper bits down, then reduce to a bucket index. */
	mix ^= mix >> SFE_IPV4_CONNECTION_HASH_SHIFT;
	return mix & SFE_IPV4_CONNECTION_HASH_MASK;
}
/*
* sfe_ipv4_find_sfe_ipv4_connection()
* Get the IPv4 connection info that corresponds to a particular 5-tuple.
*
* On entry we must be holding the lock that protects the hash table.
*/
static inline struct sfe_ipv4_connection *sfe_ipv4_find_sfe_ipv4_connection(struct sfe_ipv4 *si, uint32_t protocol,
									    __be32 src_ip, __be16 src_port,
									    __be32 dest_ip, __be16 dest_port)
{
	unsigned int bucket = sfe_ipv4_get_connection_hash(protocol, src_ip, src_port, dest_ip, dest_port);
	struct sfe_ipv4_connection *c;

	/*
	 * Walk the chain from its head and return the first entry whose
	 * 5-tuple matches exactly, or NULL when the chain is empty or holds
	 * no match.
	 *
	 * Unlike the connection match lookup, a hit is NOT moved to the head
	 * of the chain: this lookup serves create/destroy requests, for which
	 * reordering is not needed.
	 */
	for (c = si->conn_hash[bucket]; c; c = c->next) {
		if (c->src_port == src_port
		    && c->dest_port == dest_port
		    && c->src_ip == src_ip
		    && c->dest_ip == dest_ip
		    && c->protocol == protocol) {
			return c;
		}
	}

	return NULL;
}
/*
* sfe_ipv4_insert_sfe_ipv4_connection()
* Insert a connection into the hash.
*
* On entry we must be holding the lock that protects the hash table.
*/
static void sfe_ipv4_insert_sfe_ipv4_connection(struct sfe_ipv4 *si, struct sfe_ipv4_connection *c)
{
	unsigned int bucket = sfe_ipv4_get_connection_hash(c->protocol, c->src_ip, c->src_port,
							   c->dest_ip, c->dest_port);
	struct sfe_ipv4_connection *old_head = si->conn_hash[bucket];
	struct sfe_ipv4_connection *old_tail = si->all_connections_tail;

	/*
	 * Push the connection onto the front of its hash chain.
	 */
	c->next = old_head;
	c->prev = NULL;
	if (old_head) {
		old_head->prev = c;
	}
	si->conn_hash[bucket] = c;

	/*
	 * Append the connection to the tail of the "all connections" list.
	 * With an empty list it becomes the head as well.
	 */
	c->all_connections_prev = old_tail;
	c->all_connections_next = NULL;
	if (old_tail) {
		old_tail->all_connections_next = c;
	} else {
		si->all_connections_head = c;
	}
	si->all_connections_tail = c;
	si->num_connections++;

	/*
	 * Both directions' match entries go into the match hash too.
	 */
	sfe_ipv4_insert_sfe_ipv4_connection_match(si, c->original_match);
	sfe_ipv4_insert_sfe_ipv4_connection_match(si, c->reply_match);
}
/*
* sfe_ipv4_remove_sfe_ipv4_connection()
* Remove a sfe_ipv4_connection object from the hash.
*
* On entry we must be holding the lock that protects the hash table.
*/
static void sfe_ipv4_remove_sfe_ipv4_connection(struct sfe_ipv4 *si, struct sfe_ipv4_connection *c)
{
	struct sfe_ipv4_connection *before;
	struct sfe_ipv4_connection *after;

	/*
	 * Take both directions' match entries out of the match hash first.
	 */
	sfe_ipv4_remove_sfe_ipv4_connection_match(si, c->reply_match);
	sfe_ipv4_remove_sfe_ipv4_connection_match(si, c->original_match);

	/*
	 * Unlink the connection from its hash chain. With no predecessor it
	 * is the head of the chain, so the bucket is repointed instead.
	 *
	 * The "all connections" list and num_connections are not touched
	 * here.
	 */
	before = c->prev;
	after = c->next;

	if (before) {
		before->next = after;
	} else {
		unsigned int bucket = sfe_ipv4_get_connection_hash(c->protocol, c->src_ip, c->src_port,
								   c->dest_ip, c->dest_port);
		si->conn_hash[bucket] = after;
	}

	if (after) {
		after->prev = before;
	}
}
/*
* sfe_ipv4_sync_rule()
* Synchronize a connection's state.
*/
static void sfe_ipv4_sync_rule(struct sfe_ipv4_sync *sis)
{
struct nf_conntrack_tuple_hash *h;
struct nf_conntrack_tuple tuple;
struct nf_conn *ct;
struct nf_conn_counter *acct;
/*
* Create a tuple so as to be able to look up a connection
*/
memset(&tuple, 0, sizeof(tuple));
tuple.src.u3.ip = sis->src_ip;
tuple.src.u.all = (__be16)sis->src_port;
tuple.src.l3num = AF_INET;
tuple.dst.u3.ip = sis->dest_ip;
tuple.dst.dir = IP_CT_DIR_ORIGINAL;
tuple.dst.protonum = (uint8_t)sis->protocol;
tuple.dst.u.all = (__be16)sis->dest_port;
DEBUG_TRACE("update connection - p: %d, s: %pI4:%u, d: %pI4:%u\n",
(int)tuple.dst.protonum,
&tuple.src.u3.ip, (unsigned int)ntohs(tuple.src.u.all),
&tuple.dst.u3.ip, (unsigned int)ntohs(tuple.dst.u.all));
/*
* Look up conntrack connection
*/
h = nf_conntrack_find_get(&init_net, NF_CT_DEFAULT_ZONE, &tuple);
if (unlikely(!h)) {
DEBUG_TRACE("no connection found\n");
return;
}
ct = nf_ct_tuplehash_to_ctrack(h);
NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct);
/*
* Only update if this is not a fixed timeout
*/
if (!test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) {
ct->timeout.expires += sis->delta_jiffies;
}
acct = nf_conn_acct_find(ct);
if (acct) {
spin_lock_bh(&ct->lock);
atomic64_add(sis->src_packet_count, &acct[IP_CT_DIR_ORIGINAL].packets);
atomic64_add(sis->src_byte_count, &acct[IP_CT_DIR_ORIGINAL].bytes);
atomic64_add(sis->dest_packet_count, &acct[IP_CT_DIR_REPLY].packets);
atomic64_add(sis->dest_byte_count, &acct[IP_CT_DIR_REPLY].bytes);
spin_unlock_bh(&ct->lock);
}
switch (sis->protocol) {
case IPPROTO_TCP:
spin_lock_bh(&ct->lock);
if (ct->proto.tcp.seen[0].td_maxwin < sis->src_td_max_window) {
ct->proto.tcp.seen[0].td_maxwin = sis->src_td_max_window;
}
if ((int32_t)(ct->proto.tcp.seen[0].td_end - sis->src_td_end) < 0) {
ct->proto.tcp.seen[0].td_end = sis->src_td_end;
}
if ((int32_t)(ct->proto.tcp.seen[0].td_maxend - sis->src_td_max_end) < 0) {
ct->proto.tcp.seen[0].td_maxend = sis->src_td_max_end;
}
if (ct->proto.tcp.seen[1].td_maxwin < sis->dest_td_max_window) {
ct->proto.tcp.seen[1].td_maxwin = sis->dest_td_max_window;
}
if ((int32_t)(ct->proto.tcp.seen[1].td_end - sis->dest_td_end) < 0) {
ct->proto.tcp.seen[1].td_end = sis->dest_td_end;
}
if ((int32_t)(ct->proto.tcp.seen[1].td_maxend - sis->dest_td_max_end) < 0) {
ct->proto.tcp.seen[1].td_maxend = sis->dest_td_max_end;
}
spin_unlock_bh(&ct->lock);
break;
}
/*
* Release connection
*/
nf_ct_put(ct);
}
/*
 * sfe_ipv4_gen_sync_sfe_ipv4_connection()
 *	Fill in a sync message describing the current state of a connection.
 *
 * The message receives the connection's 5-tuple, the TCP window tracking
 * fields and 64-bit rx counters of both match objects, and the number of
 * jiffies since the connection was last synced.  As a side effect the
 * connection's "last sync" time is set to now_jiffies.
 *
 * On entry to this function we expect that the lock for the connection is either
 * already held or isn't required.
 */
static void sfe_ipv4_gen_sync_sfe_ipv4_connection(struct sfe_ipv4 *si, struct sfe_ipv4_connection *c,
						  struct sfe_ipv4_sync *sis, uint64_t now_jiffies)
{
	struct sfe_ipv4_connection_match *orig = c->original_match;
	struct sfe_ipv4_connection_match *reply = c->reply_match;

	/*
	 * Identify the flow.
	 */
	sis->protocol = c->protocol;
	sis->src_ip = c->src_ip;
	sis->src_port = c->src_port;
	sis->dest_ip = c->dest_ip;
	sis->dest_port = c->dest_port;

	/*
	 * TCP window tracking state: "src" comes from the original direction
	 * and "dest" from the reply direction.
	 */
	sis->src_td_max_window = orig->protocol_state.tcp.max_win;
	sis->src_td_end = orig->protocol_state.tcp.end;
	sis->src_td_max_end = orig->protocol_state.tcp.max_end;
	sis->dest_td_max_window = reply->protocol_state.tcp.max_win;
	sis->dest_td_end = reply->protocol_state.tcp.end;
	sis->dest_td_max_end = reply->protocol_state.tcp.max_end;

	/*
	 * Update the summary stats of both directions before reading the
	 * 64-bit counters.
	 */
	sfe_ipv4_connection_match_update_summary_stats(orig);
	sfe_ipv4_connection_match_update_summary_stats(reply);

	sis->src_packet_count = orig->rx_packet_count64;
	sis->src_byte_count = orig->rx_byte_count64;
	sis->dest_packet_count = reply->rx_packet_count64;
	sis->dest_byte_count = reply->rx_byte_count64;

	/*
	 * Work out how long it is since we last synced and restart the clock.
	 */
	sis->delta_jiffies = now_jiffies - c->last_sync_jiffies;
	c->last_sync_jiffies = now_jiffies;
}
/*
 * sfe_ipv4_decrement_sfe_ipv4_connection_iterator()
 *	Drop one iterator reference from a connection.
 *
 * If this was the last iterator and the connection has been marked as
 * pending a free then the connection is unlinked from the "all connections"
 * list here.  The memory itself is not released by this function.
 *
 * Returns true if the connection should now be freed, false if not.
 *
 * We must be locked on entry to this function.
 */
static bool sfe_ipv4_decrement_sfe_ipv4_connection_iterator(struct sfe_ipv4 *si, struct sfe_ipv4_connection *c)
{
	struct sfe_ipv4_connection *before;
	struct sfe_ipv4_connection *after;

	/*
	 * Nothing more to do while other iterators remain, or if nobody has
	 * asked for this connection to be deleted.
	 */
	if (--c->iterators || !c->pending_free) {
		return false;
	}

	/*
	 * Last iterator of a connection that's waiting to die: take it off
	 * the "all connections" list, patching up the list head and tail
	 * where the connection sat at either end.
	 */
	before = c->all_connections_prev;
	after = c->all_connections_next;

	if (before) {
		before->all_connections_next = after;
	} else {
		si->all_connections_head = after;
	}

	if (after) {
		after->all_connections_prev = before;
	} else {
		si->all_connections_tail = before;
	}

	si->num_connections--;
	return true;
}
/*
* sfe_ipv4_flush_sfe_ipv4_connection()
* Flush a connection and free all associated resources.
*
* We need to be called with bottom halves disabled locally as we need to acquire
* the connection hash lock and release it again. In general we're actually called
* from within a BH and so we're fine, but we're also called when connections are
* torn down.
*/
static void sfe_ipv4_flush_sfe_ipv4_connection(struct sfe_ipv4 *si, struct sfe_ipv4_connection *c)
{
struct sfe_ipv4_sync sis;
uint64_t now_jiffies;
bool pending_free = false;
spin_lock(&si->lock);
si->connection_flushes++;
/*
* Check that we're not currently being iterated. If we are then
* we can't free this entry yet but must mark it pending a free. If it's
* not being iterated then we can unlink it from the list of all
* connections.
*/
if (c->iterators) {
pending_free = true;
c->pending_free = true;
} else {
si->num_connections--;
if (c->all_connections_prev) {
c->all_connections_prev->all_connections_next = c->all_connections_next;
} else {
si->all_connections_head = c->all_connections_next;
}
if (c->all_connections_next) {
c->all_connections_next->all_connections_prev = c->all_connections_prev;
} else {
si->all_connections_tail = c->all_connections_prev;
}
}
spin_unlock(&si->lock);
/*
* Generate a sync message and then sync.
*/
now_jiffies = get_jiffies_64();
sfe_ipv4_gen_sync_sfe_ipv4_connection(si, c, &sis, now_jiffies);
sfe_ipv4_sync_rule(&sis);
/*
* If we can't yet free the underlying memory then we're done.
*/
if (pending_free) {
return;
}
/*
* Release our hold of the source and dest devices and free the memory
* for our connection objects.
*/
dev_put(c->original_dev);
dev_put(c->reply_dev);
kfree(c->original_match);
kfree(c->reply_match);
kfree(c);
}
/*
 * sfe_ipv4_recv_udp()
 *	Handle UDP packet receives and forwarding.
 *
 * Looks up the connection match for the packet.  If there is none the packet
 * is left alone.  If there is one but the packet can't take the fast path
 * (flush on find, TTL too small, larger than the output MTU) then the
 * connection is removed and flushed.  Otherwise the TTL is decremented, any
 * source/destination translation is applied, checksums are fixed up, a link
 * layer header is written and the packet is transmitted on the match's
 * output device.
 *
 * si - engine instance (stats, lock, hash tables).
 * skb - the packet; skb->data is expected to point at the IP header.
 * dev - device the packet was received on.
 * len - length of the packet as measured from the start of the IP header.
 * iph - the IP header.
 * ihl - length of the IP header in bytes.
 * flush_on_find - if true, any connection found is flushed instead of the
 *	packet being forwarded.
 *
 * Returns 1 if the packet was forwarded or 0 if it wasn't.
 */
static int sfe_ipv4_recv_udp(struct sfe_ipv4 *si, struct sk_buff *skb, struct net_device *dev,
			     unsigned int len, struct sfe_ipv4_iphdr *iph, unsigned int ihl, bool flush_on_find)
{
	struct sfe_ipv4_udphdr *udph;
	__be32 src_ip;
	__be32 dest_ip;
	__be16 src_port;
	__be16 dest_port;
	struct sfe_ipv4_connection_match *cm;
	uint8_t ttl;
	struct net_device *xmit_dev;

	/*
	 * Is our packet too short to contain a valid UDP header?
	 */
	if (unlikely(len < (sizeof(struct sfe_ipv4_udphdr) + ihl))) {
		spin_lock(&si->lock);
		si->exception_events[SFE_IPV4_EXCEPTION_EVENT_UDP_HEADER_INCOMPLETE]++;
		si->packets_not_forwarded++;
		spin_unlock(&si->lock);

		DEBUG_TRACE("packet too short for UDP header\n");
		return 0;
	}

	/*
	 * Read the IP address and port information.  Read the IP header data first
	 * because we've almost certainly got that in the cache.  We may not yet have
	 * the UDP header cached though so allow more time for any prefetching.
	 */
	src_ip = iph->saddr;
	dest_ip = iph->daddr;

	udph = (struct sfe_ipv4_udphdr *)(skb->data + ihl);
	src_port = udph->source;
	dest_port = udph->dest;

	spin_lock(&si->lock);

	/*
	 * Look for a connection match.
	 */
	cm = sfe_ipv4_find_sfe_ipv4_connection_match(si, dev, IPPROTO_UDP, src_ip, src_port, dest_ip, dest_port);
	if (unlikely(!cm)) {
		si->exception_events[SFE_IPV4_EXCEPTION_EVENT_UDP_NO_CONNECTION]++;
		si->packets_not_forwarded++;
		spin_unlock(&si->lock);

		DEBUG_TRACE("no connection found\n");
		return 0;
	}

	/*
	 * If our packet has been marked as "flush on find" we can't actually
	 * forward it in the fast path, but now that we've found an associated
	 * connection we can flush that out before we process the packet.
	 */
	if (unlikely(flush_on_find)) {
		struct sfe_ipv4_connection *c = cm->connection;
		sfe_ipv4_remove_sfe_ipv4_connection(si, c);
		si->exception_events[SFE_IPV4_EXCEPTION_EVENT_UDP_IP_OPTIONS_OR_INITIAL_FRAGMENT]++;
		si->packets_not_forwarded++;
		spin_unlock(&si->lock);

		DEBUG_TRACE("flush on find\n");
		sfe_ipv4_flush_sfe_ipv4_connection(si, c);
		return 0;
	}

	/*
	 * Does our TTL allow forwarding?
	 */
	ttl = iph->ttl;
	if (unlikely(ttl < 2)) {
		struct sfe_ipv4_connection *c = cm->connection;
		sfe_ipv4_remove_sfe_ipv4_connection(si, c);
		si->exception_events[SFE_IPV4_EXCEPTION_EVENT_UDP_SMALL_TTL]++;
		si->packets_not_forwarded++;
		spin_unlock(&si->lock);

		DEBUG_TRACE("ttl too low\n");
		sfe_ipv4_flush_sfe_ipv4_connection(si, c);
		return 0;
	}

	/*
	 * If our packet is larger than the MTU of the transmit interface then
	 * we can't forward it easily.
	 */
	if (unlikely(len > cm->xmit_dev_mtu)) {
		struct sfe_ipv4_connection *c = cm->connection;
		sfe_ipv4_remove_sfe_ipv4_connection(si, c);
		si->exception_events[SFE_IPV4_EXCEPTION_EVENT_UDP_NEEDS_FRAGMENTATION]++;
		si->packets_not_forwarded++;
		spin_unlock(&si->lock);

		DEBUG_TRACE("larger than mtu\n");
		sfe_ipv4_flush_sfe_ipv4_connection(si, c);
		return 0;
	}

	/*
	 * From this point on we're good to modify the packet.
	 */

	/*
	 * Decrement our TTL.
	 */
	iph->ttl = ttl - 1;

	/*
	 * Do we have to perform translations of the source address/port?
	 */
	if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_SRC)) {
		uint16_t udp_csum;

		iph->saddr = cm->xlate_src_ip;
		udph->source = cm->xlate_src_port;

		/*
		 * Do we have a non-zero UDP checksum?  If we do then we need
		 * to update it.  The adjustment is added and the carry is
		 * folded back into the low 16 bits.
		 */
		udp_csum = udph->check;
		if (likely(udp_csum)) {
			uint32_t sum = udp_csum + cm->xlate_src_csum_adjustment;
			sum = (sum & 0xffff) + (sum >> 16);
			udph->check = (uint16_t)sum;
		}
	}

	/*
	 * Do we have to perform translations of the destination address/port?
	 */
	if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_DEST)) {
		uint16_t udp_csum;

		iph->daddr = cm->xlate_dest_ip;
		udph->dest = cm->xlate_dest_port;

		/*
		 * Do we have a non-zero UDP checksum?  If we do then we need
		 * to update it.
		 */
		udp_csum = udph->check;
		if (likely(udp_csum)) {
			uint32_t sum = udp_csum + cm->xlate_dest_csum_adjustment;
			sum = (sum & 0xffff) + (sum >> 16);
			udph->check = (uint16_t)sum;
		}
	}

	/*
	 * Replace the IP checksum.
	 *
	 * Note that the DSCP field, skb->priority and skb->mark are not
	 * rewritten by the fast path.
	 */
	iph->check = sfe_ipv4_gen_ip_csum(iph);

	/*
	 * Update traffic stats.
	 */
	cm->rx_packet_count++;
	cm->rx_byte_count += len;

	/*
	 * If we're not already on the active list then insert ourselves at the tail
	 * of the current list.
	 */
	if (unlikely(!cm->active)) {
		cm->active = true;
		cm->active_prev = si->active_tail;
		if (likely(si->active_tail)) {
			si->active_tail->active_next = cm;
		} else {
			si->active_head = cm;
		}
		si->active_tail = cm;
	}

	xmit_dev = cm->xmit_dev;
	skb->dev = xmit_dev;

	/*
	 * Do we have a simple Ethernet header to write?
	 */
	if (unlikely(!(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_FAST_ETH_HDR))) {
		/*
		 * If this is anything other than a point-to-point interface then we need to
		 * create a header based on MAC addresses.  Not every device provides
		 * header_ops (or a create operation) though, so only call it when it's
		 * actually there; a device without one gets no header written.
		 */
		if (likely(!(xmit_dev->flags & IFF_POINTOPOINT))
		    && likely(xmit_dev->header_ops)
		    && likely(xmit_dev->header_ops->create)) {
			xmit_dev->header_ops->create(skb, xmit_dev, ETH_P_IP,
						     cm->xmit_dest_mac, cm->xmit_src_mac, len);
		}
	} else {
		struct sfe_ipv4_ethhdr *eth = (struct sfe_ipv4_ethhdr *)__skb_push(skb, ETH_HLEN);
		eth->h_proto = htons(ETH_P_IP);
		eth->h_dest[0] = htons(cm->xmit_dest_mac[0]);
		eth->h_dest[1] = htons(cm->xmit_dest_mac[1]);
		eth->h_dest[2] = htons(cm->xmit_dest_mac[2]);
		eth->h_source[0] = htons(cm->xmit_src_mac[0]);
		eth->h_source[1] = htons(cm->xmit_src_mac[1]);
		eth->h_source[2] = htons(cm->xmit_src_mac[2]);
	}

	si->packets_forwarded++;
	spin_unlock(&si->lock);

	/*
	 * We're going to check for GSO flags when we transmit the packet so
	 * start fetching the necessary cache line now.
	 */
	prefetch(skb_shinfo(skb));

	/*
	 * Send the packet on its way.
	 */
	dev_queue_xmit(skb);

	return 1;
}
/*
 * sfe_ipv4_process_tcp_option_sack()
 *	Parse the TCP options looking for SACK blocks and advance *ack to the
 *	highest SACK right edge that lies beyond it.
 *
 * th - start of the TCP header.
 * data_offs - length of the TCP header including options, in bytes.  The
 *	caller must ensure that this many bytes are readable from th.
 * ack - in: the acknowledgement number from the header; out: that value, or
 *	the furthest SACK right edge if one is ahead of it.  Comparisons are
 *	done in sequence space so they remain correct across a 32-bit wrap.
 *
 * Returns true if the options were walked successfully, false if an option
 * is malformed (bad SACK length, an option length of less than 2, or an
 * option whose length byte would lie outside the header).
 */
static bool sfe_ipv4_process_tcp_option_sack(const struct sfe_ipv4_tcphdr *th, const uint32_t data_offs,
					     uint32_t *ack) __attribute__((always_inline));
static bool sfe_ipv4_process_tcp_option_sack(const struct sfe_ipv4_tcphdr *th, const uint32_t data_offs,
					     uint32_t *ack)
{
	uint32_t length = sizeof(struct sfe_ipv4_tcphdr);
	const uint8_t *ptr = (const uint8_t *)th + length;

	/*
	 * Fast escape for the very common case in which the only option is a
	 * NOP-NOP-padded TIMESTAMP: there's nothing for us to look at.
	 */
	if (likely(data_offs == length + TCPOLEN_TIMESTAMP + 1 + 1)
	    && likely(ptr[0] == TCPOPT_NOP)
	    && likely(ptr[1] == TCPOPT_NOP)
	    && likely(ptr[2] == TCPOPT_TIMESTAMP)
	    && likely(ptr[3] == TCPOLEN_TIMESTAMP)) {
		return true;
	}

	/*
	 * TCP options.  Parse SACK option.
	 */
	while (length < data_offs) {
		uint8_t size;
		uint8_t kind;

		ptr = (const uint8_t *)th + length;
		kind = *ptr;

		/*
		 * NOP, for padding.  Single byte option with no length field.
		 */
		if (kind == TCPOPT_NOP) {
			length++;
			continue;
		}

		/*
		 * End of option list - nothing after this is to be parsed.
		 */
		if (kind == TCPOPT_EOL) {
			return true;
		}

		/*
		 * Every other option carries a length byte.  Make sure that
		 * byte is inside the header before we read it, and that the
		 * value is at least large enough to cover kind + length.
		 */
		if (unlikely((data_offs - length) < 2)) {
			return false;
		}

		size = ptr[1];
		if (unlikely(size < 2)) {
			return false;
		}

		if (kind == TCPOPT_SACK) {
			uint32_t re;

			/*
			 * A SACK option must hold a whole number of blocks (at
			 * least one) and fit inside what's left of the header.
			 */
			if ((size < (1 + 1 + TCPOLEN_SACK_PERBLOCK))
			    || ((size - (1 + 1)) % (TCPOLEN_SACK_PERBLOCK))
			    || (size > (data_offs - length))) {
				return false;
			}

			/*
			 * Visit the right edge of each block: skip kind and
			 * length (1 + 1) and the 4 byte left edge.
			 */
			for (re = 1 + 1 + 4; re < size; re += TCPOLEN_SACK_PERBLOCK) {
				const uint8_t *sptr = ptr + re;
				uint32_t right_edge = ((uint32_t)sptr[0] << 24)
						      | ((uint32_t)sptr[1] << 16)
						      | ((uint32_t)sptr[2] << 8)
						      | (uint32_t)sptr[3];

				/*
				 * Keep the edge that's furthest ahead of the
				 * ack, comparing in sequence space.
				 */
				if ((int32_t)(right_edge - *ack) > 0) {
					*ack = right_edge;
				}
			}
		}

		length += size;
	}

	return true;
}
/*
 * sfe_ipv4_recv_tcp()
 *	Handle TCP packet receives and forwarding.
 *
 * Looks up the connection match for the packet.  If there is none the packet
 * is left alone.  If there is one but the packet can't take the fast path
 * (flush on find, TTL too small, larger than the output MTU, flags other
 * than a plain ACK, or a failed sequence space check) then the connection
 * is removed and flushed.  Otherwise the window tracking state is updated,
 * the TTL is decremented, any source/destination translation is applied,
 * checksums are fixed up, a link layer header is written and the packet is
 * transmitted on the match's output device.
 *
 * si - engine instance (stats, lock, hash tables).
 * skb - the packet; skb->data is expected to point at the IP header.
 * dev - device the packet was received on.
 * len - length of the packet as measured from the start of the IP header.
 * iph - the IP header.
 * ihl - length of the IP header in bytes.
 * flush_on_find - if true, any connection found is flushed instead of the
 *	packet being forwarded.
 *
 * Returns 1 if the packet was forwarded or 0 if it wasn't.
 */
static int sfe_ipv4_recv_tcp(struct sfe_ipv4 *si, struct sk_buff *skb, struct net_device *dev,
			     unsigned int len, struct sfe_ipv4_iphdr *iph, unsigned int ihl, bool flush_on_find)
{
	struct sfe_ipv4_tcphdr *tcph;
	__be32 src_ip;
	__be32 dest_ip;
	__be16 src_port;
	__be16 dest_port;
	struct sfe_ipv4_connection_match *cm;
	struct sfe_ipv4_connection_match *counter_cm;
	uint8_t ttl;
	uint32_t flags;
	struct net_device *xmit_dev;

	/*
	 * Is our packet too short to contain a valid TCP header?
	 */
	if (unlikely(len < (sizeof(struct sfe_ipv4_tcphdr) + ihl))) {
		spin_lock(&si->lock);
		si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_HEADER_INCOMPLETE]++;
		si->packets_not_forwarded++;
		spin_unlock(&si->lock);

		DEBUG_TRACE("packet too short for TCP header\n");
		return 0;
	}

	/*
	 * Read the IP address and port information.  Read the IP header data first
	 * because we've almost certainly got that in the cache.  We may not yet have
	 * the TCP header cached though so allow more time for any prefetching.
	 */
	src_ip = iph->saddr;
	dest_ip = iph->daddr;

	tcph = (struct sfe_ipv4_tcphdr *)(skb->data + ihl);
	src_port = tcph->source;
	dest_port = tcph->dest;
	flags = tcp_flag_word(tcph);

	spin_lock(&si->lock);

	/*
	 * Look for a connection match.
	 */
	cm = sfe_ipv4_find_sfe_ipv4_connection_match(si, dev, IPPROTO_TCP, src_ip, src_port, dest_ip, dest_port);
	if (unlikely(!cm)) {
		/*
		 * We didn't get a connection but as TCP is connection-oriented that
		 * may be because this is a non-fast connection (not running established).
		 * For diagnostic purposes we differentiate this here.
		 */
		if (likely((flags & (TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_FIN | TCP_FLAG_ACK)) == TCP_FLAG_ACK)) {
			si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_NO_CONNECTION_FAST_FLAGS]++;
			si->packets_not_forwarded++;
			spin_unlock(&si->lock);

			DEBUG_TRACE("no connection found - fast flags\n");
			return 0;
		}
		si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_NO_CONNECTION_SLOW_FLAGS]++;
		si->packets_not_forwarded++;
		spin_unlock(&si->lock);

		DEBUG_TRACE("no connection found - slow flags: 0x%x\n",
			    flags & (TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_FIN | TCP_FLAG_ACK));
		return 0;
	}

	/*
	 * If our packet has been marked as "flush on find" we can't actually
	 * forward it in the fast path, but now that we've found an associated
	 * connection we can flush that out before we process the packet.
	 */
	if (unlikely(flush_on_find)) {
		struct sfe_ipv4_connection *c = cm->connection;
		sfe_ipv4_remove_sfe_ipv4_connection(si, c);
		si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_IP_OPTIONS_OR_INITIAL_FRAGMENT]++;
		si->packets_not_forwarded++;
		spin_unlock(&si->lock);

		DEBUG_TRACE("flush on find\n");
		sfe_ipv4_flush_sfe_ipv4_connection(si, c);
		return 0;
	}

	/*
	 * Does our TTL allow forwarding?
	 */
	ttl = iph->ttl;
	if (unlikely(ttl < 2)) {
		struct sfe_ipv4_connection *c = cm->connection;
		sfe_ipv4_remove_sfe_ipv4_connection(si, c);
		si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_SMALL_TTL]++;
		si->packets_not_forwarded++;
		spin_unlock(&si->lock);

		DEBUG_TRACE("ttl too low\n");
		sfe_ipv4_flush_sfe_ipv4_connection(si, c);
		return 0;
	}

	/*
	 * If our packet is larger than the MTU of the transmit interface then
	 * we can't forward it easily.
	 */
	if (unlikely(len > cm->xmit_dev_mtu)) {
		struct sfe_ipv4_connection *c = cm->connection;
		sfe_ipv4_remove_sfe_ipv4_connection(si, c);
		si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_NEEDS_FRAGMENTATION]++;
		si->packets_not_forwarded++;
		spin_unlock(&si->lock);

		DEBUG_TRACE("larger than mtu\n");
		sfe_ipv4_flush_sfe_ipv4_connection(si, c);
		return 0;
	}

	/*
	 * Look at our TCP flags.  Anything missing an ACK or that has RST, SYN or FIN
	 * set is not a fast path packet.
	 */
	if (unlikely((flags & (TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_FIN | TCP_FLAG_ACK)) != TCP_FLAG_ACK)) {
		struct sfe_ipv4_connection *c = cm->connection;
		sfe_ipv4_remove_sfe_ipv4_connection(si, c);
		si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_FLAGS]++;
		si->packets_not_forwarded++;
		spin_unlock(&si->lock);

		DEBUG_TRACE("TCP flags: 0x%x are not fast\n",
			    flags & (TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_FIN | TCP_FLAG_ACK));
		sfe_ipv4_flush_sfe_ipv4_connection(si, c);
		return 0;
	}

	counter_cm = cm->counter_match;

	/*
	 * Are we doing sequence number checking?
	 */
	if (likely(!(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK))) {
		uint32_t seq;
		uint32_t ack;
		uint32_t sack;
		uint32_t data_offs;
		uint32_t payload_offs;
		uint32_t end;
		uint32_t left_edge;
		uint32_t scaled_win;
		uint32_t max_end;

		/*
		 * Is our sequence fully past the right hand edge of the window?
		 */
		seq = ntohl(tcph->seq);
		if (unlikely((int32_t)(seq - (cm->protocol_state.tcp.max_end + 1)) > 0)) {
			struct sfe_ipv4_connection *c = cm->connection;
			sfe_ipv4_remove_sfe_ipv4_connection(si, c);
			si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_SEQ_EXCEEDS_RIGHT_EDGE]++;
			si->packets_not_forwarded++;
			spin_unlock(&si->lock);

			DEBUG_TRACE("seq: %u exceeds right edge: %u\n",
				    seq, cm->protocol_state.tcp.max_end + 1);
			sfe_ipv4_flush_sfe_ipv4_connection(si, c);
			return 0;
		}

		/*
		 * Check that our TCP data offset isn't too short.
		 */
		data_offs = tcph->doff << 2;
		if (unlikely(data_offs < sizeof(struct sfe_ipv4_tcphdr))) {
			struct sfe_ipv4_connection *c = cm->connection;
			sfe_ipv4_remove_sfe_ipv4_connection(si, c);
			si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_SMALL_DATA_OFFS]++;
			si->packets_not_forwarded++;
			spin_unlock(&si->lock);

			DEBUG_TRACE("TCP data offset: %u, too small\n", data_offs);
			sfe_ipv4_flush_sfe_ipv4_connection(si, c);
			return 0;
		}

		/*
		 * Check that our TCP data offset isn't past the end of the packet.
		 * This has to be done before we look at any TCP options, otherwise
		 * the option parser could be walked beyond the end of the packet
		 * by a bogus data offset.
		 */
		payload_offs = data_offs + sizeof(struct sfe_ipv4_iphdr);
		if (unlikely(len < payload_offs)) {
			struct sfe_ipv4_connection *c = cm->connection;
			sfe_ipv4_remove_sfe_ipv4_connection(si, c);
			si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_BIG_DATA_OFFS]++;
			si->packets_not_forwarded++;
			spin_unlock(&si->lock);

			DEBUG_TRACE("TCP data offset: %u, past end of packet: %u\n",
				    payload_offs, len);
			sfe_ipv4_flush_sfe_ipv4_connection(si, c);
			return 0;
		}

		/*
		 * Update ACK according to any SACK option.
		 */
		ack = ntohl(tcph->ack_seq);
		sack = ack;
		if (unlikely(!sfe_ipv4_process_tcp_option_sack(tcph, data_offs, &sack))) {
			struct sfe_ipv4_connection *c = cm->connection;
			sfe_ipv4_remove_sfe_ipv4_connection(si, c);
			si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_BAD_SACK]++;
			si->packets_not_forwarded++;
			spin_unlock(&si->lock);

			DEBUG_TRACE("TCP option SACK size is wrong\n");
			sfe_ipv4_flush_sfe_ipv4_connection(si, c);
			return 0;
		}

		/*
		 * Sequence number just past the last byte of payload.
		 */
		end = seq + len - payload_offs;

		/*
		 * Is our sequence fully before the left hand edge of the window?
		 */
		if (unlikely((int32_t)(end - (cm->protocol_state.tcp.end
					       - counter_cm->protocol_state.tcp.max_win - 1)) < 0)) {
			struct sfe_ipv4_connection *c = cm->connection;
			sfe_ipv4_remove_sfe_ipv4_connection(si, c);
			si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_SEQ_BEFORE_LEFT_EDGE]++;
			si->packets_not_forwarded++;
			spin_unlock(&si->lock);

			DEBUG_TRACE("seq: %u before left edge: %u\n",
				    end, cm->protocol_state.tcp.end - counter_cm->protocol_state.tcp.max_win - 1);
			sfe_ipv4_flush_sfe_ipv4_connection(si, c);
			return 0;
		}

		/*
		 * Are we acking data that is to the right of what has been sent?
		 */
		if (unlikely((int32_t)(sack - (counter_cm->protocol_state.tcp.end + 1)) > 0)) {
			struct sfe_ipv4_connection *c = cm->connection;
			sfe_ipv4_remove_sfe_ipv4_connection(si, c);
			si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_ACK_EXCEEDS_RIGHT_EDGE]++;
			si->packets_not_forwarded++;
			spin_unlock(&si->lock);

			DEBUG_TRACE("ack: %u exceeds right edge: %u\n",
				    sack, counter_cm->protocol_state.tcp.end + 1);
			sfe_ipv4_flush_sfe_ipv4_connection(si, c);
			return 0;
		}

		/*
		 * Is our ack too far before the left hand edge of the window?
		 */
		left_edge = counter_cm->protocol_state.tcp.end
			    - cm->protocol_state.tcp.max_win
			    - SFE_IPV4_TCP_MAX_ACK_WINDOW
			    - 1;
		if (unlikely((int32_t)(sack - left_edge) < 0)) {
			struct sfe_ipv4_connection *c = cm->connection;
			sfe_ipv4_remove_sfe_ipv4_connection(si, c);
			si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_ACK_BEFORE_LEFT_EDGE]++;
			si->packets_not_forwarded++;
			spin_unlock(&si->lock);

			DEBUG_TRACE("ack: %u before left edge: %u\n", sack, left_edge);
			sfe_ipv4_flush_sfe_ipv4_connection(si, c);
			return 0;
		}

		/*
		 * Have we just seen the largest window size yet for this connection?  If yes
		 * then we need to record the new value.
		 */
		scaled_win = ntohs(tcph->window) << cm->protocol_state.tcp.win_scale;
		scaled_win += (sack - ack);
		if (unlikely(cm->protocol_state.tcp.max_win < scaled_win)) {
			cm->protocol_state.tcp.max_win = scaled_win;
		}

		/*
		 * If our sequence and/or ack numbers have advanced then record the new state.
		 */
		if (likely((int32_t)(end - cm->protocol_state.tcp.end) >= 0)) {
			cm->protocol_state.tcp.end = end;
		}

		max_end = sack + scaled_win;
		if (likely((int32_t)(max_end - counter_cm->protocol_state.tcp.max_end) >= 0)) {
			counter_cm->protocol_state.tcp.max_end = max_end;
		}
	}

	/*
	 * From this point on we're good to modify the packet.
	 */

	/*
	 * Decrement our TTL.
	 */
	iph->ttl = ttl - 1;

	/*
	 * Do we have to perform translations of the source address/port?
	 */
	if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_SRC)) {
		uint16_t tcp_csum;
		uint32_t sum;

		iph->saddr = cm->xlate_src_ip;
		tcph->source = cm->xlate_src_port;

		/*
		 * The TCP checksum is always updated: the adjustment is added
		 * and the carry is folded back into the low 16 bits.
		 */
		tcp_csum = tcph->check;
		sum = tcp_csum + cm->xlate_src_csum_adjustment;
		sum = (sum & 0xffff) + (sum >> 16);
		tcph->check = (uint16_t)sum;
	}

	/*
	 * Do we have to perform translations of the destination address/port?
	 */
	if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_DEST)) {
		uint16_t tcp_csum;
		uint32_t sum;

		iph->daddr = cm->xlate_dest_ip;
		tcph->dest = cm->xlate_dest_port;

		/*
		 * The TCP checksum is always updated here too.
		 */
		tcp_csum = tcph->check;
		sum = tcp_csum + cm->xlate_dest_csum_adjustment;
		sum = (sum & 0xffff) + (sum >> 16);
		tcph->check = (uint16_t)sum;
	}

	/*
	 * Replace the IP checksum.
	 *
	 * Note that the DSCP field, skb->priority and skb->mark are not
	 * rewritten by the fast path.
	 */
	iph->check = sfe_ipv4_gen_ip_csum(iph);

	/*
	 * Update traffic stats.
	 */
	cm->rx_packet_count++;
	cm->rx_byte_count += len;

	/*
	 * If we're not already on the active list then insert ourselves at the tail
	 * of the current list.
	 */
	if (unlikely(!cm->active)) {
		cm->active = true;
		cm->active_prev = si->active_tail;
		if (likely(si->active_tail)) {
			si->active_tail->active_next = cm;
		} else {
			si->active_head = cm;
		}
		si->active_tail = cm;
	}

	xmit_dev = cm->xmit_dev;
	skb->dev = xmit_dev;

	/*
	 * Do we have a simple Ethernet header to write?
	 */
	if (unlikely(!(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_FAST_ETH_HDR))) {
		/*
		 * If this is anything other than a point-to-point interface then we need to
		 * create a header based on MAC addresses.  Not every device provides
		 * header_ops (or a create operation) though, so only call it when it's
		 * actually there; a device without one gets no header written.
		 */
		if (likely(!(xmit_dev->flags & IFF_POINTOPOINT))
		    && likely(xmit_dev->header_ops)
		    && likely(xmit_dev->header_ops->create)) {
			xmit_dev->header_ops->create(skb, xmit_dev, ETH_P_IP,
						     cm->xmit_dest_mac, cm->xmit_src_mac, len);
		}
	} else {
		struct sfe_ipv4_ethhdr *eth = (struct sfe_ipv4_ethhdr *)__skb_push(skb, ETH_HLEN);
		eth->h_proto = htons(ETH_P_IP);
		eth->h_dest[0] = htons(cm->xmit_dest_mac[0]);
		eth->h_dest[1] = htons(cm->xmit_dest_mac[1]);
		eth->h_dest[2] = htons(cm->xmit_dest_mac[2]);
		eth->h_source[0] = htons(cm->xmit_src_mac[0]);
		eth->h_source[1] = htons(cm->xmit_src_mac[1]);
		eth->h_source[2] = htons(cm->xmit_src_mac[2]);
	}

	si->packets_forwarded++;
	spin_unlock(&si->lock);

	/*
	 * We're going to check for GSO flags when we transmit the packet so
	 * start fetching the necessary cache line now.
	 */
	prefetch(skb_shinfo(skb));

	/*
	 * Send the packet on its way.
	 */
	dev_queue_xmit(skb);

	return 1;
}
/*
 * sfe_ipv4_recv_icmp()
 *	Handle ICMP packet receives.
 *
 * ICMP packets aren't handled as a "fast path" and always have us process them
 * through the default Linux stack.  What we do need to do is look for any errors
 * about connections we are handling in the fast path.  If we find any such
 * connections then we want to flush their state so that the ICMP error path
 * within Linux has all of the correct state should it need it.
 *
 * Always returns 0, i.e. the packet is never forwarded from here.
 */
static int sfe_ipv4_recv_icmp(struct sfe_ipv4 *si, struct sk_buff *skb, struct net_device *dev,
			      unsigned int len, struct sfe_ipv4_iphdr *iph, unsigned int ihl)
{
	struct icmphdr *icmph;
	struct sfe_ipv4_iphdr *inner_iph;
	unsigned int remaining;
	unsigned int inner_ihl_words;
	unsigned int inner_ihl;
	uint32_t *inner_l4;
	struct sfe_ipv4_udphdr *inner_udph;
	struct sfe_ipv4_tcphdr *inner_tcph;
	__be32 inner_saddr;
	__be32 inner_daddr;
	__be16 inner_sport;
	__be16 inner_dport;
	struct sfe_ipv4_connection_match *cm;
	struct sfe_ipv4_connection *c;

	/*
	 * Is our packet too short to contain a valid ICMP header?
	 */
	remaining = len - ihl;
	if (unlikely(remaining < sizeof(struct icmphdr))) {
		spin_lock(&si->lock);
		si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_HEADER_INCOMPLETE]++;
		si->packets_not_forwarded++;
		spin_unlock(&si->lock);

		DEBUG_TRACE("packet too short for ICMP header\n");
		return 0;
	}

	/*
	 * "Destination unreachable" and "time exceeded" are the only messages
	 * that are of any interest to us.
	 */
	icmph = (struct icmphdr *)(skb->data + ihl);
	if (!((icmph->type == ICMP_DEST_UNREACH) || (icmph->type == ICMP_TIME_EXCEEDED))) {
		spin_lock(&si->lock);
		si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_UNHANDLED_TYPE]++;
		si->packets_not_forwarded++;
		spin_unlock(&si->lock);

		DEBUG_TRACE("unhandled ICMP type: 0x%x\n", icmph->type);
		return 0;
	}

	/*
	 * Is there room for at least a minimal embedded IP header?
	 */
	remaining -= sizeof(struct icmphdr);
	if (unlikely(remaining < sizeof(struct sfe_ipv4_iphdr))) {
		spin_lock(&si->lock);
		si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_HEADER_INCOMPLETE]++;
		si->packets_not_forwarded++;
		spin_unlock(&si->lock);

		DEBUG_TRACE("Embedded IP header not complete\n");
		return 0;
	}

	/*
	 * The embedded packet has to be IPv4.
	 */
	inner_iph = (struct sfe_ipv4_iphdr *)(icmph + 1);
	if (unlikely(inner_iph->version != 4)) {
		spin_lock(&si->lock);
		si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_NON_V4]++;
		si->packets_not_forwarded++;
		spin_unlock(&si->lock);

		DEBUG_TRACE("IP version: %u\n", inner_iph->version);
		return 0;
	}

	/*
	 * Now that we can read the embedded header length, check that the
	 * whole of the embedded header, options included, is present.
	 */
	inner_ihl_words = inner_iph->ihl;
	inner_ihl = inner_ihl_words << 2;
	if (unlikely(remaining < inner_ihl)) {
		spin_lock(&si->lock);
		si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_IP_OPTIONS_INCOMPLETE]++;
		si->packets_not_forwarded++;
		spin_unlock(&si->lock);

		DEBUG_TRACE("Embedded header not large enough for IP options\n");
		return 0;
	}

	remaining -= inner_ihl;
	inner_l4 = ((uint32_t *)inner_iph) + inner_ihl_words;

	inner_saddr = inner_iph->saddr;
	inner_daddr = inner_iph->daddr;

	/*
	 * Pull the ports out of the embedded transport layer header.  For both
	 * of the protocols we handle, 8 bytes of header is enough to identify
	 * the connection.
	 */
	switch (inner_iph->protocol) {
	case IPPROTO_TCP:
		if (unlikely(remaining < 8)) {
			spin_lock(&si->lock);
			si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_TCP_HEADER_INCOMPLETE]++;
			si->packets_not_forwarded++;
			spin_unlock(&si->lock);

			DEBUG_TRACE("Incomplete embedded TCP header\n");
			return 0;
		}

		inner_tcph = (struct sfe_ipv4_tcphdr *)inner_l4;
		inner_sport = inner_tcph->source;
		inner_dport = inner_tcph->dest;
		break;

	case IPPROTO_UDP:
		if (unlikely(remaining < 8)) {
			spin_lock(&si->lock);
			si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_UDP_HEADER_INCOMPLETE]++;
			si->packets_not_forwarded++;
			spin_unlock(&si->lock);

			DEBUG_TRACE("Incomplete embedded UDP header\n");
			return 0;
		}

		inner_udph = (struct sfe_ipv4_udphdr *)inner_l4;
		inner_sport = inner_udph->source;
		inner_dport = inner_udph->dest;
		break;

	default:
		spin_lock(&si->lock);
		si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_UNHANDLED_PROTOCOL]++;
		si->packets_not_forwarded++;
		spin_unlock(&si->lock);

		DEBUG_TRACE("Unhandled embedded IP protocol: %u\n", inner_iph->protocol);
		return 0;
	}

	spin_lock(&si->lock);

	/*
	 * Look for a connection match.  Note that we reverse the source and destination
	 * here because our embedded message contains a packet that was sent in the
	 * opposite direction to the one in which we just received it.  It will have
	 * been sent on the interface from which we received it though so that's still
	 * ok to use.
	 */
	cm = sfe_ipv4_find_sfe_ipv4_connection_match(si, dev, inner_iph->protocol,
						     inner_daddr, inner_dport, inner_saddr, inner_sport);
	if (unlikely(!cm)) {
		si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_NO_CONNECTION]++;
		si->packets_not_forwarded++;
		spin_unlock(&si->lock);

		DEBUG_TRACE("no connection found\n");
		return 0;
	}

	/*
	 * The error refers to one of our connections: take the connection out
	 * of the hash while we hold the lock and then flush its state.
	 */
	c = cm->connection;
	sfe_ipv4_remove_sfe_ipv4_connection(si, c);
	si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_FLUSHED_CONNECTION]++;
	si->packets_not_forwarded++;
	spin_unlock(&si->lock);

	sfe_ipv4_flush_sfe_ipv4_connection(si, c);
	return 0;
}
/*
* sfe_ipv4_recv()
* Handle packet receives and forwarding.
*
* Returns 1 if the packet is forwarded or 0 if it isn't.
*/
static int sfe_ipv4_recv(struct sk_buff *skb)
{
struct sfe_ipv4 *si = &__si;
struct net_device *dev;
#if (SFE_HOOK_ABOVE_BRIDGE)
struct in_device *in_dev;
#endif
unsigned int len;
unsigned int tot_len;
unsigned int frag_off;
unsigned int ihl;
bool flush_on_find;
bool ip_options;
struct sfe_ipv4_iphdr *iph;
uint32_t protocol;
/*
* We know that for the vast majority of packets we need the transport
* layer header so we may as well start to fetch it now!
*/
prefetch(skb->data + 32);
barrier();
dev = skb->dev;
#if (SFE_HOOK_ABOVE_BRIDGE)
/*
* Does our input device support IP processing?
*/
in_dev = (struct in_device *)dev->ip_ptr;
if (unlikely(!in_dev)) {
DEBUG_TRACE("no IP processing for device: %s\n", dev->name);
return 0;
}
/*
* Does it have an IP address? If it doesn't then we can't do anything
* interesting here!
*/
if (unlikely(!in_dev->ifa_list)) {
DEBUG_TRACE("no IP address for device: %s\n", dev->name);
return 0;
}
#endif
/*
* We're only interested in IP packets.
*/
if (unlikely(htons(ETH_P_IP) != skb->protocol)) {
DEBUG_TRACE("not IP packet\n");
return 0;
}
/*
* Check that we have space for an IP header here.
*/
len = skb->len;
if (unlikely(len < sizeof(struct sfe_ipv4_iphdr))) {
spin_lock(&si->lock);
si->exception_events[SFE_IPV4_EXCEPTION_EVENT_HEADER_INCOMPLETE]++;
si->packets_not_forwarded++;
spin_unlock(&si->lock);
DEBUG_TRACE("len: %u is too short\n", len);
return 0;
}
/*
* Check that our "total length" is large enough for an IP header.
*/
iph = (struct sfe_ipv4_iphdr *)skb->data;
tot_len = ntohs(iph->tot_len);
if (unlikely(tot_len < sizeof(struct sfe_ipv4_iphdr))) {
spin_lock(&si->lock);
si->exception_events[SFE_IPV4_EXCEPTION_EVENT_BAD_TOTAL_LENGTH]++;
si->packets_not_forwarded++;
spin_unlock(&si->lock);
DEBUG_TRACE("tot_len: %u is too short\n", tot_len);
return 0;
}
/*
* Is our IP version wrong?
*/
if (unlikely(iph->version != 4)) {
spin_lock(&si->lock);
si->exception_events[SFE_IPV4_EXCEPTION_EVENT_NON_V4]++;
si->packets_not_forwarded++;
spin_unlock(&si->lock);
DEBUG_TRACE("IP version: %u\n", iph->version);
return 0;
}
/*
* Does our datagram fit inside the skb?
*/
if (unlikely(tot_len > len)) {
spin_lock(&si->lock);
si->exception_events[SFE_IPV4_EXCEPTION_EVENT_DATAGRAM_INCOMPLETE]++;
si->packets_not_forwarded++;
spin_unlock(&si->lock);
DEBUG_TRACE("tot_len: %u, exceeds len: %u\n", tot_len, len);
return 0;
}
/*
* Do we have a non-initial fragment?
*/
frag_off = ntohs(iph->frag_off);
if (unlikely(frag_off & IP_OFFSET)) {
spin_lock(&si->lock);
si->exception_events[SFE_IPV4_EXCEPTION_EVENT_NON_INITIAL_FRAGMENT]++;
si->packets_not_forwarded++;
spin_unlock(&si->lock);
DEBUG_TRACE("non-initial fragment\n");
return 0;
}
/*
* If we have a (first) fragment then mark it to cause any connection to flush.
*/
flush_on_find = unlikely(frag_off & IP_MF) ? true : false;
/*
* Do we have any IP options? That's definite a slow path! If we do have IP
* options we need to recheck our header size.
*/
ihl = iph->ihl << 2;
ip_options = unlikely(ihl != sizeof(struct sfe_ipv4_iphdr)) ? true : false;
if (unlikely(ip_options)) {
if (unlikely(len < ihl)) {
spin_lock(&si->lock);
si->exception_events[SFE_IPV4_EXCEPTION_EVENT_IP_OPTIONS_INCOMPLETE]++;
si->packets_not_forwarded++;
spin_unlock(&si->lock);
DEBUG_TRACE("len: %u is too short for header of size: %u\n", len, ihl);
return 0;
}
flush_on_find = true;
}
protocol = iph->protocol;
if (IPPROTO_UDP == protocol) {
return sfe_ipv4_recv_udp(si, skb, dev, len, iph, ihl, flush_on_find);
}
if (IPPROTO_TCP == protocol) {
return sfe_ipv4_recv_tcp(si, skb, dev, len, iph, ihl, flush_on_find);
}
if (IPPROTO_ICMP == protocol) {
return sfe_ipv4_recv_icmp(si, skb, dev, len, iph, ihl);
}
spin_lock(&si->lock);
si->exception_events[SFE_IPV4_EXCEPTION_EVENT_UNHANDLED_PROTOCOL]++;
si->packets_not_forwarded++;
spin_unlock(&si->lock);
DEBUG_TRACE("not UDP, TCP or ICMP: %u\n", protocol);
return 0;
}
/*
 * sfe_ipv4_find_mac_addr()
 *	Find the MAC address for a given IPv4 address.
 *
 * addr is the IPv4 address to resolve; mac_addr receives ETH_ALEN bytes of
 * hardware address on success (callers in this file treat it as an Ethernet
 * address buffer).
 *
 * Returns true if we find a unicast Ethernet MAC address, otherwise false.
 *
 * We look up the rtable entry for the address and, from its neighbour
 * structure, obtain the hardware address. This means this function also
 * works if the neighbours are routers too.
 */
static bool sfe_ipv4_find_mac_addr(uint32_t addr, uint8_t *mac_addr)
{
	struct neighbour *neigh;
	struct rtable *rt;
	struct dst_entry *dst;
	struct net_device *dev;
	bool copied = false;

	/*
	 * Look up the rtable entry for the IP address then get the hardware
	 * address from its neighbour structure.
	 */
	rt = ip_route_output(&init_net, addr, 0, 0, 0);
	if (unlikely(IS_ERR(rt))) {
		return false;
	}

	dst = (struct dst_entry *)rt;

	/*
	 * The neighbour pointer is not reference counted so it may only be
	 * used inside the RCU read-side critical section.
	 */
	rcu_read_lock();
	neigh = dst_get_neighbour_noref(dst);
	if (unlikely(!neigh)) {
		goto out;
	}

	if (unlikely(!(neigh->nud_state & NUD_VALID))) {
		goto out;
	}

	dev = neigh->dev;
	if (!dev) {
		goto out;
	}

	/*
	 * The destination buffer only has room for an Ethernet address, so
	 * never copy a hardware address of any other length into it.  The
	 * neighbour's device is found by a route lookup and is not necessarily
	 * one of the devices whose address length the caller has checked.
	 */
	if (unlikely(dev->addr_len != ETH_ALEN)) {
		DEBUG_TRACE("neighbour device: %s not 802.3 hw addr len: %u, ignoring\n",
			    dev->name, (unsigned)dev->addr_len);
		goto out;
	}

	memcpy(mac_addr, neigh->ha, ETH_ALEN);
	copied = true;

out:
	rcu_read_unlock();
	dst_release(dst);

	if (!copied) {
		return false;
	}

	/*
	 * We're only interested in unicast MAC addresses - if it's not a unicast
	 * address then our IP address mustn't be unicast either.
	 */
	if (is_multicast_ether_addr(mac_addr)) {
		DEBUG_TRACE("MAC is non-unicast - ignoring\n");
		return false;
	}

	return true;
}
/*
 * sfe_ipv4_create_rule()
 *	Create a forwarding rule.
 *
 * si is the engine instance and sic describes the connection (addresses,
 * ports, devices, MACs and, for TCP, window tracking state).
 *
 * If a connection matching sic already exists its TCP tracking state is
 * refreshed from sic and no new connection is created.  Otherwise a
 * connection object and two match objects (one per direction) are allocated,
 * filled in and inserted.  Allocation failures are silent: the function just
 * returns without creating anything.
 *
 * si->lock is taken (with bottom halves disabled) for the whole of the
 * lookup/allocate/insert sequence, so allocations use GFP_ATOMIC.
 */
static void sfe_ipv4_create_rule(struct sfe_ipv4 *si, struct sfe_ipv4_create *sic)
{
	struct sfe_ipv4_connection *c;
	struct sfe_ipv4_connection_match *original_cm;
	struct sfe_ipv4_connection_match *reply_cm;

	spin_lock_bh(&si->lock);
	si->connection_create_requests++;

	/*
	 * Check to see if there is already a flow that matches the rule we're trying
	 * to create.  If there is then we can't create a new one.
	 */
	c = sfe_ipv4_find_sfe_ipv4_connection(si, sic->protocol, sic->src_ip, sic->src_port,
					      sic->dest_ip, sic->dest_port);
	if (c) {
		si->connection_create_collisions++;

		/*
		 * If we already have the flow then it's likely that this request to
		 * create the connection rule contains more up-to-date information.
		 * Check and update accordingly.
		 */
		original_cm = c->original_match;
		reply_cm = c->reply_match;

		switch (sic->protocol) {
		case IPPROTO_TCP:
			/*
			 * Only ever move the tracked windows forward; the signed
			 * 32-bit differences handle sequence number wrap.
			 */
			if (original_cm->protocol_state.tcp.max_win < sic->src_td_max_window) {
				original_cm->protocol_state.tcp.max_win = sic->src_td_max_window;
			}
			if ((int32_t)(original_cm->protocol_state.tcp.end - sic->src_td_end) < 0) {
				original_cm->protocol_state.tcp.end = sic->src_td_end;
			}
			if ((int32_t)(original_cm->protocol_state.tcp.max_end - sic->src_td_max_end) < 0) {
				original_cm->protocol_state.tcp.max_end = sic->src_td_max_end;
			}
			if (reply_cm->protocol_state.tcp.max_win < sic->dest_td_max_window) {
				reply_cm->protocol_state.tcp.max_win = sic->dest_td_max_window;
			}
			if ((int32_t)(reply_cm->protocol_state.tcp.end - sic->dest_td_end) < 0) {
				reply_cm->protocol_state.tcp.end = sic->dest_td_end;
			}
			if ((int32_t)(reply_cm->protocol_state.tcp.max_end - sic->dest_td_max_end) < 0) {
				reply_cm->protocol_state.tcp.max_end = sic->dest_td_max_end;
			}

			/*
			 * The "no sequence check" setting always follows the
			 * most recent request.
			 */
			original_cm->flags &= ~SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK;
			reply_cm->flags &= ~SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK;
			if (sic->flags & SFE_IPV4_CREATE_FLAG_NO_SEQ_CHECK) {
				original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK;
				reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK;
			}
			break;
		}

		spin_unlock_bh(&si->lock);

		DEBUG_TRACE("connection already exists - p: %d\n"
			    " s: %s:%pM:%pI4:%u, d: %s:%pM:%pI4:%u\n",
			    sic->protocol, sic->src_dev->name, sic->src_mac, &sic->src_ip, ntohs(sic->src_port),
			    sic->dest_dev->name, sic->dest_mac, &sic->dest_ip, ntohs(sic->dest_port));
		return;
	}

	/*
	 * Allocate the various connection tracking objects.
	 */
	c = (struct sfe_ipv4_connection *)kmalloc(sizeof(struct sfe_ipv4_connection), GFP_ATOMIC);
	if (unlikely(!c)) {
		spin_unlock_bh(&si->lock);
		return;
	}

	original_cm = (struct sfe_ipv4_connection_match *)kmalloc(sizeof(struct sfe_ipv4_connection_match), GFP_ATOMIC);
	if (unlikely(!original_cm)) {
		spin_unlock_bh(&si->lock);
		kfree(c);
		return;
	}

	reply_cm = (struct sfe_ipv4_connection_match *)kmalloc(sizeof(struct sfe_ipv4_connection_match), GFP_ATOMIC);
	if (unlikely(!reply_cm)) {
		spin_unlock_bh(&si->lock);
		kfree(original_cm);
		kfree(c);
		return;
	}

	/*
	 * Fill in the "original" direction connection matching object.
	 * Note that the transmit MAC address is "dest_mac_xlate" because
	 * we always know both ends of a connection by their translated
	 * addresses and not their public addresses.
	 */
	original_cm->match_dev = sic->src_dev;
	original_cm->match_protocol = sic->protocol;
	original_cm->match_src_ip = sic->src_ip;
	original_cm->match_src_port = sic->src_port;
	original_cm->match_dest_ip = sic->dest_ip;
	original_cm->match_dest_port = sic->dest_port;
	original_cm->xlate_src_ip = sic->src_ip_xlate;
	original_cm->xlate_src_port = sic->src_port_xlate;
	original_cm->xlate_dest_ip = sic->dest_ip_xlate;
	original_cm->xlate_dest_port = sic->dest_port_xlate;
	original_cm->rx_packet_count = 0;
	original_cm->rx_packet_count64 = 0;
	original_cm->rx_byte_count = 0;
	original_cm->rx_byte_count64 = 0;
	original_cm->xmit_dev = sic->dest_dev;
	original_cm->xmit_dev_mtu = sic->dest_mtu;
	memcpy(original_cm->xmit_src_mac, sic->dest_dev->dev_addr, ETH_ALEN);
	memcpy(original_cm->xmit_dest_mac, sic->dest_mac_xlate, ETH_ALEN);
	original_cm->connection = c;
	original_cm->counter_match = reply_cm;
	original_cm->flags = 0;
	original_cm->active_next = NULL;
	original_cm->active_prev = NULL;
	original_cm->active = false;

	/*
	 * A device is not required to provide header_ops at all, so check
	 * the pointer before looking at its create method.
	 */
	if (sic->dest_dev->header_ops && sic->dest_dev->header_ops->create == eth_header) {
		original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_FAST_ETH_HDR;
	}

	/*
	 * Fill in the "reply" direction connection matching object.
	 */
	reply_cm->match_dev = sic->dest_dev;
	reply_cm->match_protocol = sic->protocol;
	reply_cm->match_src_ip = sic->dest_ip_xlate;
	reply_cm->match_src_port = sic->dest_port_xlate;
	reply_cm->match_dest_ip = sic->src_ip_xlate;
	reply_cm->match_dest_port = sic->src_port_xlate;
	reply_cm->xlate_src_ip = sic->dest_ip;
	reply_cm->xlate_src_port = sic->dest_port;
	reply_cm->xlate_dest_ip = sic->src_ip;
	reply_cm->xlate_dest_port = sic->src_port;
	reply_cm->rx_packet_count = 0;
	reply_cm->rx_packet_count64 = 0;
	reply_cm->rx_byte_count = 0;
	reply_cm->rx_byte_count64 = 0;
	reply_cm->xmit_dev = sic->src_dev;
	reply_cm->xmit_dev_mtu = sic->src_mtu;
	memcpy(reply_cm->xmit_src_mac, sic->src_dev->dev_addr, ETH_ALEN);
	memcpy(reply_cm->xmit_dest_mac, sic->src_mac, ETH_ALEN);
	reply_cm->connection = c;
	reply_cm->counter_match = original_cm;
	reply_cm->flags = 0;
	reply_cm->active_next = NULL;
	reply_cm->active_prev = NULL;
	reply_cm->active = false;

	if (sic->src_dev->header_ops && sic->src_dev->header_ops->create == eth_header) {
		reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_FAST_ETH_HDR;
	}

	/*
	 * Work out which directions need address/port translation.  A
	 * destination rewrite in the original direction is a source rewrite
	 * in the reply direction and vice versa.
	 */
	if (sic->dest_ip != sic->dest_ip_xlate || sic->dest_port != sic->dest_port_xlate) {
		original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_DEST;
		reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_SRC;
	}

	if (sic->src_ip != sic->src_ip_xlate || sic->src_port != sic->src_port_xlate) {
		original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_SRC;
		reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_DEST;
	}

	c->protocol = sic->protocol;
	c->src_ip = sic->src_ip;
	c->src_ip_xlate = sic->src_ip_xlate;
	c->src_port = sic->src_port;
	c->src_port_xlate = sic->src_port_xlate;
	c->original_dev = sic->src_dev;
	c->original_match = original_cm;
	c->dest_ip = sic->dest_ip;
	c->dest_ip_xlate = sic->dest_ip_xlate;
	c->dest_port = sic->dest_port;
	c->dest_port_xlate = sic->dest_port_xlate;
	c->reply_dev = sic->dest_dev;
	c->reply_match = reply_cm;
	c->last_sync_jiffies = get_jiffies_64();
	c->iterators = 0;
	c->pending_free = false;

	/*
	 * Take hold of our source and dest devices for the duration of the connection.
	 */
	dev_hold(c->original_dev);
	dev_hold(c->reply_dev);

	/*
	 * Initialize the protocol-specific information that we track.
	 */
	switch (sic->protocol) {
	case IPPROTO_TCP:
		/*
		 * A maximum window of zero is stored as 1.
		 */
		original_cm->protocol_state.tcp.win_scale = sic->src_td_window_scale;
		original_cm->protocol_state.tcp.max_win = sic->src_td_max_window ? sic->src_td_max_window : 1;
		original_cm->protocol_state.tcp.end = sic->src_td_end;
		original_cm->protocol_state.tcp.max_end = sic->src_td_max_end;
		reply_cm->protocol_state.tcp.win_scale = sic->dest_td_window_scale;
		reply_cm->protocol_state.tcp.max_win = sic->dest_td_max_window ? sic->dest_td_max_window : 1;
		reply_cm->protocol_state.tcp.end = sic->dest_td_end;
		reply_cm->protocol_state.tcp.max_end = sic->dest_td_max_end;
		if (sic->flags & SFE_IPV4_CREATE_FLAG_NO_SEQ_CHECK) {
			original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK;
			reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK;
		}
		break;
	}

	sfe_ipv4_connection_match_compute_translations(original_cm);
	sfe_ipv4_connection_match_compute_translations(reply_cm);
	sfe_ipv4_insert_sfe_ipv4_connection(si, c);

	spin_unlock_bh(&si->lock);

	/*
	 * We have everything we need!
	 */
	DEBUG_INFO("new connection - p: %d\n"
		   " s: %s:%pM(%pM):%pI4(%pI4):%u(%u)\n"
		   " d: %s:%pM(%pM):%pI4(%pI4):%u(%u)\n",
		   sic->protocol,
		   sic->src_dev->name, sic->src_mac, sic->src_mac_xlate,
		   &sic->src_ip, &sic->src_ip_xlate, ntohs(sic->src_port), ntohs(sic->src_port_xlate),
		   sic->dest_dev->name, sic->dest_mac, sic->dest_mac_xlate,
		   &sic->dest_ip, &sic->dest_ip_xlate, ntohs(sic->dest_port), ntohs(sic->dest_port_xlate));
}
/*
* sfe_ipv4_post_routing_hook()
* Called for packets about to leave the box - either locally generated or forwarded from another interface
*
* Builds a struct sfe_ipv4_create from the conntrack entry attached to the
* skb and passes it to sfe_ipv4_create_rule(). Only forwarded TCP and UDP
* packets belonging to helper-free, tracked connections are considered.
*
* The packet itself is never modified or consumed here: every path
* returns NF_ACCEPT.
*/
static unsigned int sfe_ipv4_post_routing_hook(unsigned int hooknum,
struct sk_buff *skb,
const struct net_device *in_unused,
const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
struct sfe_ipv4 *si = &__si;
struct sfe_ipv4_create sic;
struct net_device *in;
struct nf_conn *ct;
enum ip_conntrack_info ctinfo;
struct net_device *src_dev;
struct net_device *dest_dev;
struct net_device *src_br_dev = NULL;
struct net_device *dest_br_dev = NULL;
struct nf_conntrack_tuple orig_tuple;
struct nf_conntrack_tuple reply_tuple;
/*
* If operations have paused then do not process packets.
*/
spin_lock_bh(&si->lock);
if (unlikely(si->pause)) {
DEBUG_TRACE("paused, ignoring\n");
spin_unlock_bh(&si->lock);
return NF_ACCEPT;
}
spin_unlock_bh(&si->lock);
/*
* Don't process broadcast or multicast packets.
*/
if (unlikely(skb->pkt_type == PACKET_BROADCAST)) {
DEBUG_TRACE("broadcast, ignoring\n");
return NF_ACCEPT;
}
if (unlikely(skb->pkt_type == PACKET_MULTICAST)) {
DEBUG_TRACE("multicast, ignoring\n");
return NF_ACCEPT;
}
/*
* Don't process packets that are not being forwarded.
*
* The input device is looked up from the skb's input interface index
* rather than taken from the hook's "in" argument. A successful lookup
* returns a held device, which is released at the "done1" label, so every
* exit after this point must go through "done1".
*/
in = dev_get_by_index(&init_net, skb->skb_iif);
if (!in) {
DEBUG_TRACE("packet not forwarding\n");
return NF_ACCEPT;
}
/*
* Don't process packets with non-standard 802.3 MAC address sizes.
*/
if (unlikely(in->addr_len != ETH_ALEN)) {
DEBUG_TRACE("in device: %s not 802.3 hw addr len: %u, ignoring\n",
in->name, (unsigned)in->addr_len);
goto done1;
}
if (unlikely(out->addr_len != ETH_ALEN)) {
DEBUG_TRACE("out device: %s not 802.3 hw addr len: %u, ignoring\n",
out->name, (unsigned)out->addr_len);
goto done1;
}
/*
* Don't process packets that aren't being tracked by conntrack.
*/
ct = nf_ct_get(skb, &ctinfo);
if (unlikely(!ct)) {
DEBUG_TRACE("no conntrack connection, ignoring\n");
goto done1;
}
/*
* Don't process untracked connections.
*/
if (unlikely(ct == &nf_conntrack_untracked)) {
DEBUG_TRACE("untracked connection\n");
goto done1;
}
/*
* Don't process connections that require support from a 'helper' (typically a NAT ALG).
*/
if (unlikely(nfct_help(ct))) {
DEBUG_TRACE("connection has helper\n");
goto done1;
}
/*
* Look up the details of our connection in conntrack.
*
* Note that the data we get from conntrack is for the "ORIGINAL" direction
* but our packet may actually be in the "REPLY" direction.
*/
orig_tuple = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
reply_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
sic.protocol = (int32_t)orig_tuple.dst.protonum;
/*
* Get addressing information, non-NAT first
*/
sic.src_ip = (__be32)orig_tuple.src.u3.ip;
sic.dest_ip = (__be32)orig_tuple.dst.u3.ip;
/*
* NAT'ed addresses - note these are as seen from the 'reply' direction
* When NAT does not apply to this connection these will be identical to the above.
*/
sic.src_ip_xlate = (__be32)reply_tuple.dst.u3.ip;
sic.dest_ip_xlate = (__be32)reply_tuple.src.u3.ip;
sic.flags = 0;
switch (sic.protocol) {
case IPPROTO_TCP:
sic.src_port = orig_tuple.src.u.tcp.port;
sic.dest_port = orig_tuple.dst.u.tcp.port;
sic.src_port_xlate = reply_tuple.dst.u.tcp.port;
sic.dest_port_xlate = reply_tuple.src.u.tcp.port;
/*
* Snapshot conntrack's TCP window tracking: seen[0] is used for the
* source side and seen[1] for the destination side.
*
* NOTE(review): these fields are read before ct->lock is taken below;
* only the state check is done under the lock - confirm that is intended.
*/
sic.src_td_window_scale = ct->proto.tcp.seen[0].td_scale;
sic.src_td_max_window = ct->proto.tcp.seen[0].td_maxwin;
sic.src_td_end = ct->proto.tcp.seen[0].td_end;
sic.src_td_max_end = ct->proto.tcp.seen[0].td_maxend;
sic.dest_td_window_scale = ct->proto.tcp.seen[1].td_scale;
sic.dest_td_max_window = ct->proto.tcp.seen[1].td_maxwin;
sic.dest_td_end = ct->proto.tcp.seen[1].td_end;
sic.dest_td_max_end = ct->proto.tcp.seen[1].td_maxend;
/*
* If conntrack is not checking windows (globally, or because either
* side is flagged "be liberal") then ask for no sequence checking too.
*/
if (nf_ct_tcp_no_window_check
|| (ct->proto.tcp.seen[0].flags & IP_CT_TCP_FLAG_BE_LIBERAL)
|| (ct->proto.tcp.seen[1].flags & IP_CT_TCP_FLAG_BE_LIBERAL)) {
sic.flags |= SFE_IPV4_CREATE_FLAG_NO_SEQ_CHECK;
}
/*
* Don't try to manage a non-established connection.
*/
if (!test_bit(IPS_ASSURED_BIT, &ct->status)) {
DEBUG_TRACE("non-established connection\n");
goto done1;
}
/*
* If the connection is shutting down do not manage it.
* state can not be SYN_SENT, SYN_RECV because connection is assured
* Not managed states: FIN_WAIT, CLOSE_WAIT, LAST_ACK, TIME_WAIT, CLOSE.
*/
spin_lock_bh(&ct->lock);
if (ct->proto.tcp.state != TCP_CONNTRACK_ESTABLISHED) {
spin_unlock_bh(&ct->lock);
DEBUG_TRACE("connection in termination state: %#x, s: %pI4:%u, d: %pI4:%u\n",
ct->proto.tcp.state, &sic.src_ip, ntohs(sic.src_port),
&sic.dest_ip, ntohs(sic.dest_port));
goto done1;
}
spin_unlock_bh(&ct->lock);
break;
case IPPROTO_UDP:
sic.src_port = orig_tuple.src.u.udp.port;
sic.dest_port = orig_tuple.dst.u.udp.port;
sic.src_port_xlate = reply_tuple.dst.u.udp.port;
sic.dest_port_xlate = reply_tuple.src.u.udp.port;
break;
default:
DEBUG_TRACE("unhandled protocol %d\n", sic.protocol);
goto done1;
}
/*
* Get the MAC addresses that correspond to source and destination host addresses.
*/
if (!sfe_ipv4_find_mac_addr(sic.src_ip, sic.src_mac)) {
DEBUG_TRACE("failed to find MAC address for src IP: %pI4\n", &sic.src_ip);
goto done1;
}
if (!sfe_ipv4_find_mac_addr(sic.src_ip_xlate, sic.src_mac_xlate)) {
DEBUG_TRACE("failed to find MAC address for xlate src IP: %pI4\n", &sic.src_ip_xlate);
goto done1;
}
/*
* Do dest now
*/
if (!sfe_ipv4_find_mac_addr(sic.dest_ip, sic.dest_mac)) {
DEBUG_TRACE("failed to find MAC address for dest IP: %pI4\n", &sic.dest_ip);
goto done1;
}
if (!sfe_ipv4_find_mac_addr(sic.dest_ip_xlate, sic.dest_mac_xlate)) {
DEBUG_TRACE("failed to find MAC address for xlate dest IP: %pI4\n", &sic.dest_ip_xlate);
goto done1;
}
/*
* Get our device info. If we're dealing with the "reply" direction here then
* we'll need things swapped around.
*/
if (ctinfo < IP_CT_IS_REPLY) {
src_dev = in;
dest_dev = (struct net_device *)out;
} else {
src_dev = (struct net_device *)out;
dest_dev = in;
}
#if (!SFE_HOOK_ABOVE_BRIDGE)
/*
* Now our devices may actually be a bridge interface. If that's
* the case then we need to hunt down the underlying interface.
*
* NOTE(review): the cleanup below calls dev_put() on whatever
* br_port_dev_get() returned, so it is presumed to return a held
* device - confirm against its definition.
*/
if (src_dev->priv_flags & IFF_EBRIDGE) {
src_br_dev = br_port_dev_get(src_dev, sic.src_mac);
if (!src_br_dev) {
DEBUG_TRACE("no port found on bridge\n");
goto done1;
}
src_dev = src_br_dev;
}
if (dest_dev->priv_flags & IFF_EBRIDGE) {
dest_br_dev = br_port_dev_get(dest_dev, sic.dest_mac_xlate);
if (!dest_br_dev) {
DEBUG_TRACE("no port found on bridge\n");
goto done2;
}
dest_dev = dest_br_dev;
}
#else
/*
* Our devices may actually be part of a bridge interface. If that's
* the case then find the bridge interface instead.
*
* The bridge device is held explicitly here so that the common cleanup
* code below can release it.
*/
if (src_dev->priv_flags & IFF_BRIDGE_PORT) {
src_br_dev = src_dev->master;
if (!src_br_dev) {
DEBUG_TRACE("no bridge found for: %s\n", src_dev->name);
goto done1;
}
dev_hold(src_br_dev);
src_dev = src_br_dev;
}
if (dest_dev->priv_flags & IFF_BRIDGE_PORT) {
dest_br_dev = dest_dev->master;
if (!dest_br_dev) {
DEBUG_TRACE("no bridge found for: %s\n", dest_dev->name);
goto done2;
}
dev_hold(dest_br_dev);
dest_dev = dest_br_dev;
}
#endif
sic.src_dev = src_dev;
sic.dest_dev = dest_dev;
// XXX - these MTUs need handling correctly!
sic.src_mtu = 1500;
sic.dest_mtu = 1500;
sfe_ipv4_create_rule(si, &sic);
/*
* If we had bridge ports then release them too.
*/
if (dest_br_dev) {
dev_put(dest_br_dev);
}
/*
* done2: reached directly when the destination bridge lookup failed, so
* only the source bridge device (if any) remains to be released.
*/
done2:
if (src_br_dev) {
dev_put(src_br_dev);
}
done1:
/*
* Release the interface on which this skb arrived
*/
dev_put(in);
return NF_ACCEPT;
}
#ifdef CONFIG_NF_CONNTRACK_EVENTS
/*
 * sfe_ipv4_destroy_rule()
 *	Tear down the forwarding rule described by sid, if we have one.
 *
 * The connection is looked up and unlinked under si->lock; its state is then
 * flushed with the lock dropped but bottom halves still disabled.  A request
 * for a connection we do not know about is counted as a miss.
 */
static void sfe_ipv4_destroy_rule(struct sfe_ipv4 *si, struct sfe_ipv4_destroy *sid)
{
	struct sfe_ipv4_connection *conn;

	spin_lock_bh(&si->lock);
	si->connection_destroy_requests++;

	/*
	 * Do we actually have a flow matching this request?
	 */
	conn = sfe_ipv4_find_sfe_ipv4_connection(si, sid->protocol, sid->src_ip, sid->src_port,
						 sid->dest_ip, sid->dest_port);
	if (conn) {
		/*
		 * Unlink the connection from the hash tables while we still
		 * hold the lock.
		 */
		sfe_ipv4_remove_sfe_ipv4_connection(si, conn);
		spin_unlock_bh(&si->lock);

		/*
		 * Synchronize state and free resources.  Our bottom half must
		 * not pre-empt us while this happens.
		 */
		local_bh_disable();
		sfe_ipv4_flush_sfe_ipv4_connection(si, conn);
		local_bh_enable();

		DEBUG_INFO("connection destroyed - p: %d, s: %pI4:%u, d: %pI4:%u\n",
			   sid->protocol, &sid->src_ip, ntohs(sid->src_port),
			   &sid->dest_ip, ntohs(sid->dest_port));
		return;
	}

	/*
	 * Nothing to destroy - just record the miss.
	 */
	si->connection_destroy_misses++;
	spin_unlock_bh(&si->lock);

	DEBUG_TRACE("connection does not exist - p: %d, s: %pI4:%u, d: %pI4:%u\n",
		    sid->protocol, &sid->src_ip, ntohs(sid->src_port),
		    &sid->dest_ip, ntohs(sid->dest_port));
}
/*
* sfe_ipv4_conntrack_event()
* Callback event invoked when a conntrack connection's state changes.
*/
static int sfe_ipv4_conntrack_event(unsigned int events, struct nf_ct_event *item)
{
struct sfe_ipv4 *si = &__si;
struct sfe_ipv4_destroy sid;
struct nf_conn *ct = item->ct;
struct nf_conntrack_tuple orig_tuple;
/*
* If we don't have a conntrack entry then we're done.
*/
if (unlikely(!ct)) {
DEBUG_WARN("no ct in conntrack event callback\n");
return NOTIFY_DONE;
}
/*
* If this is an untracked connection then we can't have any state either.
*/
if (unlikely(ct == &nf_conntrack_untracked)) {
DEBUG_TRACE("ignoring untracked conn\n");
return NOTIFY_DONE;
}
/*
* Ignore anything other than IPv4 connections.
*/
if (unlikely(nf_ct_l3num(ct) != AF_INET)) {
DEBUG_TRACE("ignoring non-IPv4 conn\n");
return NOTIFY_DONE;
}
/*
* We're only interested in destroy events.
*/
if (unlikely(!(events & (1 << IPCT_DESTROY)))) {
DEBUG_TRACE("ignoring non-destroy event\n");
return NOTIFY_DONE;
}
orig_tuple = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
sid.protocol = (int32_t)orig_tuple.dst.protonum;
/*
* Extract information from the conntrack connection. We're only interested
* in nominal connection information (i.e. we're ignoring any NAT information).
*/
sid.src_ip = (__be32)orig_tuple.src.u3.ip;
sid.dest_ip = (__be32)orig_tuple.dst.u3.ip;
switch (sid.protocol) {
case IPPROTO_TCP:
sid.src_port = orig_tuple.src.u.tcp.port;
sid.dest_port = orig_tuple.dst.u.tcp.port;
break;
case IPPROTO_UDP:
sid.src_port = orig_tuple.src.u.udp.port;
sid.dest_port = orig_tuple.dst.u.udp.port;
break;
default:
DEBUG_TRACE("unhandled protocol: %d\n", sid.protocol);
return NOTIFY_DONE;
}
sfe_ipv4_destroy_rule(si, &sid);
return NOTIFY_DONE;
}
/*
* Netfilter conntrack event system to monitor connection tracking changes
*
* The callback, sfe_ipv4_conntrack_event(), only acts on destroy events and
* ignores everything else.
*/
static struct nf_ct_event_notifier sfe_ipv4_conntrack_notifier = {
.fcn = sfe_ipv4_conntrack_event,
};
#endif
/*
* Structure to establish a hook into the post routing netfilter point - this
* will pick up local outbound and packets going from one interface to another.
*
* Note: see include/linux/netfilter_ipv4.h for info related to priority levels.
* We want to examine packets after NAT translation and any ALG processing.
*/
static struct nf_hook_ops sfe_ipv4_ops_post_routing[] __read_mostly = {
{
.hook = sfe_ipv4_post_routing_hook,
.owner = THIS_MODULE,
.pf = PF_INET,
.hooknum = NF_INET_POST_ROUTING,
/*
* One step after the source NAT priority so that the hook runs once
* source NAT has been applied.
*/
.priority = NF_IP_PRI_NAT_SRC + 1,
},
};
/*
 * sfe_ipv4_get_pause()
 *	sysfs "show" handler: print the current pause setting as a decimal
 *	number followed by a newline.
 *
 * Returns the value produced by snprintf().
 */
static ssize_t sfe_ipv4_get_pause(struct device *dev,
				  struct device_attribute *attr,
				  char *buf)
{
	struct sfe_ipv4 *si = &__si;
	int paused;

	/*
	 * Sample the setting under the lock, format it outside.
	 */
	spin_lock_bh(&si->lock);
	paused = si->pause;
	spin_unlock_bh(&si->lock);

	return snprintf(buf, (ssize_t)PAGE_SIZE, "%d\n", paused);
}
/*
 * sfe_ipv4_set_pause()
 *	sysfs "store" handler: parse a decimal integer from buf and store it
 *	as the pause setting.
 *
 * Returns count when the value was accepted, or -EINVAL when the input is
 * too long for our local buffer or does not start with a decimal integer.
 */
static ssize_t sfe_ipv4_set_pause(struct device *dev,
				  struct device_attribute *attr,
				  const char *buf, size_t count)
{
	struct sfe_ipv4 *si = &__si;
	char num_buf[12];
	int num;

	/*
	 * Check that our command data will fit.  If it will then copy it to our local
	 * buffer and NUL terminate it.  Oversized input is reported as an error
	 * rather than as "zero bytes consumed", which would invite the writer to
	 * retry the same data.
	 */
	if (count > sizeof(num_buf) - 1) {
		return -EINVAL;
	}

	memcpy(num_buf, buf, count);
	num_buf[count] = '\0';

	/*
	 * Only update the setting when a number was actually parsed; otherwise
	 * "num" would be used uninitialized.
	 */
	if (sscanf(num_buf, "%d", &num) != 1) {
		return -EINVAL;
	}

	DEBUG_TRACE("set pause: %d\n", num);

	spin_lock_bh(&si->lock);
	si->pause = num;
	spin_unlock_bh(&si->lock);

	return count;
}
/*
 * sfe_ipv4_get_debug_dev()
 *	sysfs "show" handler: print the engine's debug_dev value as a decimal
 *	number followed by a newline.
 *
 * Returns the value produced by snprintf().
 */
static ssize_t sfe_ipv4_get_debug_dev(struct device *dev,
				      struct device_attribute *attr,
				      char *buf)
{
	struct sfe_ipv4 *si = &__si;
	int debug_dev_num;

	/*
	 * Sample the value under the lock, format it outside.
	 */
	spin_lock_bh(&si->lock);
	debug_dev_num = si->debug_dev;
	spin_unlock_bh(&si->lock);

	return snprintf(buf, (ssize_t)PAGE_SIZE, "%d\n", debug_dev_num);
}
/*
* sysfs attributes for the default classifier itself.
*/
static const struct device_attribute sfe_ipv4_pause_attr =
__ATTR(pause, S_IWUGO | S_IRUGO, sfe_ipv4_get_pause, sfe_ipv4_set_pause);
static const struct device_attribute sfe_ipv4_debug_dev_attr =
__ATTR(debug_dev, S_IWUGO | S_IRUGO, sfe_ipv4_get_debug_dev, NULL);
/*
* sfe_ipv4_destroy_all()
* Destroy all connections that match a particular device.
*
* If we pass dev as NULL then this destroys all connections.
*/
static void sfe_ipv4_destroy_all(struct sfe_ipv4 *si, struct net_device *dev)
{
struct sfe_ipv4_connection *c;
struct sfe_ipv4_connection *c_next;
spin_lock_bh(&si->lock);
c = si->all_connections_head;
if (!c) {
spin_unlock_bh(&si->lock);
return;
}
c->iterators++;
/*
* Iterate over all connections
*/
while (c) {
c_next = c->all_connections_next;
/*
* Before we do anything else, take an iterator reference for the
* connection we'll iterate next.
*/
if (c_next) {
c_next->iterators++;
}
/*
* Does this connection relate to the device we are destroying? If
* it does then ensure it is marked for being freed as soon as it
* is no longer being iterated.
*/
if (!dev
|| (dev == c->original_dev)
|| (dev == c->reply_dev)) {
c->pending_free = true;
sfe_ipv4_remove_sfe_ipv4_connection(si, c);
}
/*
* Remove the iterator reference that we acquired and see if we
* should free any resources.
*/
if (sfe_ipv4_decrement_sfe_ipv4_connection_iterator(si, c)) {
spin_unlock_bh(&si->lock);
/*
* This entry is dead so release our hold of the source and
* dest devices and free the memory for our connection objects.
*/
dev_put(c->original_dev);
dev_put(c->reply_dev);
kfree(c->original_match);
kfree(c->reply_match);
kfree(c);
spin_lock_bh(&si->lock);
}
c = c_next;
}
spin_unlock_bh(&si->lock);
}
/*
 * sfe_ipv4_device_event()
 *	Netdevice notifier callback.
 *
 * When a device goes down (NETDEV_DOWN) every connection using it is
 * destroyed; all other events are ignored.  Always returns NOTIFY_DONE.
 */
static int sfe_ipv4_device_event(struct notifier_block *this, unsigned long event, void *ptr)
{
	struct net_device *dev = (struct net_device *)ptr;
	struct sfe_ipv4 *si = &__si;

	if (event == NETDEV_DOWN && dev) {
		sfe_ipv4_destroy_all(si, dev);
	}

	return NOTIFY_DONE;
}
/*
 * sfe_ipv4_inet_event()
 *	Inet address notifier callback.
 *
 * ptr is the in_ifaddr the event refers to; the event is forwarded to
 * sfe_ipv4_device_event() for the net device that address belongs to.
 */
static int sfe_ipv4_inet_event(struct notifier_block *this, unsigned long event, void *ptr)
{
	struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
	struct net_device *dev = ifa->ifa_dev->dev;

	return sfe_ipv4_device_event(this, event, dev);
}
/*
* sfe_ipv4_periodic_sync()
* Timer callback that pushes connection state out via sfe_ipv4_sync_rule().
*
* arg is the struct sfe_ipv4 instance, cast to unsigned long.
*
* Each run updates the summary statistics and then takes up to a quota of
* entries from the head of the "active" list, generating and delivering a
* sync message for the connection each one belongs to. The timer is then
* re-armed for HZ / 100 jiffies in the future.
*/
static void sfe_ipv4_periodic_sync(unsigned long arg)
{
struct sfe_ipv4 *si = (struct sfe_ipv4 *)arg;
uint64_t now_jiffies;
int quota;
now_jiffies = get_jiffies_64();
spin_lock_bh(&si->lock);
sfe_ipv4_update_summary_stats(si);
/*
* Get an estimate of the number of connections to parse in this sync.
* This is 1/64th of the current connection count, rounded up.
*/
quota = (si->num_connections + 63) / 64;
/*
* Walk the "active" list and sync the connection state.
*/
while (quota--) {
struct sfe_ipv4_connection_match *cm;
struct sfe_ipv4_connection_match *counter_cm;
struct sfe_ipv4_connection *c;
struct sfe_ipv4_sync sis;
cm = si->active_head;
if (!cm) {
break;
}
cm->active = false;
/*
* Having found an entry we now remove it from the active scan list.
* It is the head, so only the head pointer and the new head's back
* link (or the tail, if the list is now empty) need fixing up.
*/
si->active_head = cm->active_next;
if (likely(cm->active_next)) {
cm->active_next->active_prev = NULL;
} else {
si->active_tail = NULL;
}
cm->active_next = NULL;
/*
* We scan the connection match lists so there's a possibility that our
* counter match is in the list too. If it is then remove it.
* Unlike cm it may be anywhere in the list, so this is a general
* doubly-linked unlink.
*/
counter_cm = cm->counter_match;
if (counter_cm->active) {
counter_cm->active = false;
if (likely(counter_cm->active_prev)) {
counter_cm->active_prev->active_next = counter_cm->active_next;
} else {
si->active_head = counter_cm->active_next;
}
if (likely(counter_cm->active_next)) {
counter_cm->active_next->active_prev = counter_cm->active_prev;
} else {
si->active_tail = counter_cm->active_prev;
}
counter_cm->active_next = NULL;
counter_cm->active_prev = NULL;
}
/*
* Sync the connection state. The sync message is built in the local
* "sis" while we still hold the lock.
*/
c = cm->connection;
sfe_ipv4_gen_sync_sfe_ipv4_connection(si, c, &sis, now_jiffies);
/*
* We don't want to be holding the lock when we sync!
*/
spin_unlock_bh(&si->lock);
sfe_ipv4_sync_rule(&sis);
spin_lock_bh(&si->lock);
}
spin_unlock_bh(&si->lock);
/*
* Schedule the next run.
*/
mod_timer(&si->timer, jiffies + (HZ / 100));
}
/*
* Size in bytes of the "msg" scratch buffer that each debug device read
* handler formats one piece of XML output into (it is the bound passed
* to snprintf() by those handlers).
*/
#define CHAR_DEV_MSG_SIZE 768
/*
 * sfe_ipv4_debug_dev_read_start()
 *	Generate part of the XML output: the opening <sfe_ipv4> element.
 *
 * The text is formatted into msg and copied to buffer at offset *total_read.
 * On success *length is reduced and *total_read advanced by the number of
 * bytes produced and the write state moves on to the next step.
 *
 * Returns false if the copy to user space fails, otherwise true.
 */
static bool sfe_ipv4_debug_dev_read_start(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length,
					  int *total_read, struct sfe_ipv4_debug_xml_write_state *ws)
{
	int bytes_read;

	bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "<sfe_ipv4>\n");

	/*
	 * Copy only what we formatted: the remainder of msg holds stale data
	 * that must not be handed to user space.
	 */
	if (copy_to_user(buffer + *total_read, msg, bytes_read)) {
		return false;
	}

	*length -= bytes_read;
	*total_read += bytes_read;

	ws->state++;
	return true;
}
/*
 * sfe_ipv4_debug_dev_read_connections_start()
 *	Generate part of the XML output: the opening <connections> element.
 *
 * The text is formatted into msg and copied to buffer at offset *total_read.
 * On success *length is reduced and *total_read advanced by the number of
 * bytes produced and the write state moves on to the next step.
 *
 * Returns false if the copy to user space fails, otherwise true.
 */
static bool sfe_ipv4_debug_dev_read_connections_start(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length,
						      int *total_read, struct sfe_ipv4_debug_xml_write_state *ws)
{
	int bytes_read;

	bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t<connections>\n");

	/*
	 * Copy only what we formatted: the remainder of msg holds stale data
	 * that must not be handed to user space.
	 */
	if (copy_to_user(buffer + *total_read, msg, bytes_read)) {
		return false;
	}

	*length -= bytes_read;
	*total_read += bytes_read;

	ws->state++;
	return true;
}
/*
 * sfe_ipv4_debug_dev_read_connections_connection()
 *	Generate part of the XML output: one <connection .../> element.
 *
 * Each call handles one connection.  ws->iter_conn remembers the connection
 * to handle on the next call; an iterator reference is held on it between
 * calls.  If dropping our reference on the current connection shows it to be
 * dead, it is freed and nothing is output for it.
 *
 * On success *length is reduced and *total_read advanced by the number of
 * bytes produced.  The write state moves on once there are no more
 * connections.
 *
 * Returns false if the copy to user space fails, otherwise true.
 */
static bool sfe_ipv4_debug_dev_read_connections_connection(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length,
							   int *total_read, struct sfe_ipv4_debug_xml_write_state *ws)
{
	struct sfe_ipv4_connection *c;
	struct sfe_ipv4_connection *c_next;
	struct sfe_ipv4_connection_match *original_cm;
	struct sfe_ipv4_connection_match *reply_cm;
	int bytes_read;
	int protocol;
	struct net_device *src_dev;
	__be32 src_ip;
	__be32 src_ip_xlate;
	__be16 src_port;
	__be16 src_port_xlate;
	uint64_t src_rx_packets;
	uint64_t src_rx_bytes;
	struct net_device *dest_dev;
	__be32 dest_ip;
	__be32 dest_ip_xlate;
	__be16 dest_port;
	__be16 dest_port_xlate;
	uint64_t dest_rx_packets;
	uint64_t dest_rx_bytes;
	uint64_t last_sync_jiffies;

	spin_lock_bh(&si->lock);
	c = ws->iter_conn;

	/*
	 * Is this the first connection we need to scan?
	 */
	if (!c) {
		c = si->all_connections_head;

		/*
		 * If there were no connections then move to the next state.
		 */
		if (!c) {
			spin_unlock_bh(&si->lock);
			ws->state++;
			return true;
		}

		c->iterators++;
	}

	c_next = c->all_connections_next;
	ws->iter_conn = c_next;

	/*
	 * Before we do anything else, take an iterator reference for the
	 * connection we'll iterate next.
	 */
	if (c_next) {
		c_next->iterators++;
	}

	/*
	 * Remove the iterator reference that we acquired and see if we
	 * should free any resources.
	 */
	if (sfe_ipv4_decrement_sfe_ipv4_connection_iterator(si, c)) {
		spin_unlock_bh(&si->lock);

		/*
		 * This entry is dead so release our hold of the source and
		 * dest devices and free the memory for our connection objects.
		 */
		dev_put(c->original_dev);
		dev_put(c->reply_dev);
		kfree(c->original_match);
		kfree(c->reply_match);
		kfree(c);

		/*
		 * If we have no more connections then move to the next state.
		 */
		if (!c_next) {
			ws->state++;
		}

		return true;
	}

	/*
	 * Snapshot everything we want to report while we hold the lock.
	 */
	original_cm = c->original_match;
	reply_cm = c->reply_match;

	protocol = c->protocol;
	src_dev = c->original_dev;
	src_ip = c->src_ip;
	src_ip_xlate = c->src_ip_xlate;
	src_port = c->src_port;
	src_port_xlate = c->src_port_xlate;

	sfe_ipv4_connection_match_update_summary_stats(original_cm);
	sfe_ipv4_connection_match_update_summary_stats(reply_cm);

	src_rx_packets = original_cm->rx_packet_count64;
	src_rx_bytes = original_cm->rx_byte_count64;
	dest_dev = c->reply_dev;
	dest_ip = c->dest_ip;
	dest_ip_xlate = c->dest_ip_xlate;
	dest_port = c->dest_port;
	dest_port_xlate = c->dest_port_xlate;
	dest_rx_packets = reply_cm->rx_packet_count64;
	dest_rx_bytes = reply_cm->rx_byte_count64;
	last_sync_jiffies = get_jiffies_64() - c->last_sync_jiffies;
	spin_unlock_bh(&si->lock);

	/*
	 * NOTE(review): src_dev and dest_dev are dereferenced below after the
	 * lock has been dropped - confirm the devices cannot go away here.
	 */
	bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t\t<connection "
				"protocol=\"%u\" "
				"src_dev=\"%s\" "
				"src_ip=\"%pI4\" src_ip_xlate=\"%pI4\" "
				"src_port=\"%u\" src_port_xlate=\"%u\" "
				"src_rx_pkts=\"%llu\" src_rx_bytes=\"%llu\" "
				"dest_dev=\"%s\" "
				"dest_ip=\"%pI4\" dest_ip_xlate=\"%pI4\" "
				"dest_port=\"%u\" dest_port_xlate=\"%u\" "
				"dest_rx_pkts=\"%llu\" dest_rx_bytes=\"%llu\" "
				"last_sync=\"%llu\" />\n",
				protocol,
				src_dev->name,
				&src_ip, &src_ip_xlate,
				ntohs(src_port), ntohs(src_port_xlate),
				src_rx_packets, src_rx_bytes,
				dest_dev->name,
				&dest_ip, &dest_ip_xlate,
				ntohs(dest_port), ntohs(dest_port_xlate),
				dest_rx_packets, dest_rx_bytes,
				last_sync_jiffies);

	/*
	 * snprintf() reports the length the output would have had, so clamp
	 * it to what actually fits in msg (excluding the terminating NUL).
	 */
	if (bytes_read >= CHAR_DEV_MSG_SIZE) {
		bytes_read = CHAR_DEV_MSG_SIZE - 1;
	}

	/*
	 * Copy only what we formatted: the remainder of msg holds stale data
	 * that must not be handed to user space.
	 */
	if (copy_to_user(buffer + *total_read, msg, bytes_read)) {
		return false;
	}

	*length -= bytes_read;
	*total_read += bytes_read;

	/*
	 * If we have no more connections then move to the next state.
	 */
	if (!c_next) {
		ws->state++;
	}

	return true;
}
/*
 * sfe_ipv4_debug_dev_read_connections_end()
 *	Generate part of the XML output: the closing </connections> element.
 *
 * The text is formatted into msg and copied to buffer at offset *total_read.
 * On success *length is reduced and *total_read advanced by the number of
 * bytes produced and the write state moves on to the next step.
 *
 * Returns false if the copy to user space fails, otherwise true.
 */
static bool sfe_ipv4_debug_dev_read_connections_end(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length,
						    int *total_read, struct sfe_ipv4_debug_xml_write_state *ws)
{
	int bytes_read;

	bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t</connections>\n");

	/*
	 * Copy only what we formatted: the remainder of msg holds stale data
	 * that must not be handed to user space.
	 */
	if (copy_to_user(buffer + *total_read, msg, bytes_read)) {
		return false;
	}

	*length -= bytes_read;
	*total_read += bytes_read;

	ws->state++;
	return true;
}
/*
 * sfe_ipv4_debug_dev_read_exceptions_start()
 *	Emit the opening "<exceptions>" tag of the XML output.
 *
 * buffer is the user-space destination and msg is a kernel scratch buffer of
 * CHAR_DEV_MSG_SIZE bytes.  On success *length is reduced and *total_read is
 * increased by the number of bytes emitted, and ws->state is advanced to the
 * next state.  si is not used by this stage.
 *
 * Returns true on success, false if the copy to user space failed (in which
 * case none of the counters or the state are changed).
 */
static bool sfe_ipv4_debug_dev_read_exceptions_start(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length,
						     int *total_read, struct sfe_ipv4_debug_xml_write_state *ws)
{
	int bytes_read;

	bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t<exceptions>\n");

	/*
	 * snprintf() reports the length it wanted to write; never account for
	 * (or copy) more than actually fits in msg.
	 */
	if (bytes_read >= CHAR_DEV_MSG_SIZE) {
		bytes_read = CHAR_DEV_MSG_SIZE - 1;
	}

	/*
	 * Copy only the formatted text.  Copying the whole of msg would hand
	 * uninitialised kernel stack contents to user space.
	 */
	if (copy_to_user(buffer + *total_read, msg, bytes_read)) {
		return false;
	}

	*length -= bytes_read;
	*total_read += bytes_read;

	ws->state++;
	return true;
}
/*
 * sfe_ipv4_debug_dev_read_exceptions_exception()
 *	Emit one "<exception>" element of the XML output.
 *
 * Each call handles the exception counter selected by ws->iter_exception.
 * The counter is sampled under si->lock; an element is only emitted when the
 * count is non-zero.  The index is then advanced, and once every counter up
 * to SFE_IPV4_EXCEPTION_EVENT_LAST has been visited the index is reset and
 * ws->state moves on to the next state.
 *
 * buffer is the user-space destination and msg is a kernel scratch buffer of
 * CHAR_DEV_MSG_SIZE bytes.  *length is reduced and *total_read is increased
 * by the number of bytes emitted.
 *
 * Returns true on success, false if the copy to user space failed (the
 * iteration index is left unchanged in that case).
 */
static bool sfe_ipv4_debug_dev_read_exceptions_exception(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length,
							 int *total_read, struct sfe_ipv4_debug_xml_write_state *ws)
{
	uint64_t ct;

	spin_lock_bh(&si->lock);
	ct = si->exception_events64[ws->iter_exception];
	spin_unlock_bh(&si->lock);

	if (ct) {
		int bytes_read;

		bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE,
				      "\t\t<exception name=\"%s\" count=\"%llu\" />\n",
				      sfe_ipv4_exception_events_string[ws->iter_exception],
				      ct);

		/*
		 * snprintf() reports the length it wanted to write; never
		 * account for (or copy) more than actually fits in msg.
		 */
		if (bytes_read >= CHAR_DEV_MSG_SIZE) {
			bytes_read = CHAR_DEV_MSG_SIZE - 1;
		}

		/*
		 * Copy only the formatted text.  Copying the whole of msg
		 * would hand uninitialised kernel stack contents to user space.
		 */
		if (copy_to_user(buffer + *total_read, msg, bytes_read)) {
			return false;
		}

		*length -= bytes_read;
		*total_read += bytes_read;
	}

	ws->iter_exception++;
	if (ws->iter_exception >= SFE_IPV4_EXCEPTION_EVENT_LAST) {
		ws->iter_exception = 0;
		ws->state++;
	}

	return true;
}
/*
 * sfe_ipv4_debug_dev_read_exceptions_end()
 *	Emit the closing "</exceptions>" tag of the XML output.
 *
 * buffer is the user-space destination and msg is a kernel scratch buffer of
 * CHAR_DEV_MSG_SIZE bytes.  On success *length is reduced and *total_read is
 * increased by the number of bytes emitted, and ws->state is advanced to the
 * next state.  si is not used by this stage.
 *
 * Returns true on success, false if the copy to user space failed (in which
 * case none of the counters or the state are changed).
 */
static bool sfe_ipv4_debug_dev_read_exceptions_end(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length,
						   int *total_read, struct sfe_ipv4_debug_xml_write_state *ws)
{
	int bytes_read;

	bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t</exceptions>\n");

	/*
	 * snprintf() reports the length it wanted to write; never account for
	 * (or copy) more than actually fits in msg.
	 */
	if (bytes_read >= CHAR_DEV_MSG_SIZE) {
		bytes_read = CHAR_DEV_MSG_SIZE - 1;
	}

	/*
	 * Copy only the formatted text.  Copying the whole of msg would hand
	 * uninitialised kernel stack contents to user space.
	 */
	if (copy_to_user(buffer + *total_read, msg, bytes_read)) {
		return false;
	}

	*length -= bytes_read;
	*total_read += bytes_read;

	ws->state++;
	return true;
}
/*
 * sfe_ipv4_debug_dev_read_stats()
 *	Emit the "<stats>" element of the XML output.
 *
 * The summary statistics are refreshed and snapshotted into locals while
 * holding si->lock, and the lock is dropped before formatting and copying so
 * that the (possibly faulting) user-space copy never runs under the spinlock.
 *
 * buffer is the user-space destination and msg is a kernel scratch buffer of
 * CHAR_DEV_MSG_SIZE bytes.  On success *length is reduced and *total_read is
 * increased by the number of bytes emitted, and ws->state is advanced to the
 * next state.
 *
 * Returns true on success, false if the copy to user space failed.
 */
static bool sfe_ipv4_debug_dev_read_stats(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length,
					  int *total_read, struct sfe_ipv4_debug_xml_write_state *ws)
{
	int bytes_read;
	unsigned int num_connections;
	uint64_t packets_forwarded;
	uint64_t packets_not_forwarded;
	uint64_t connection_create_requests;
	uint64_t connection_create_collisions;
	uint64_t connection_destroy_requests;
	uint64_t connection_destroy_misses;
	uint64_t connection_flushes;
	uint64_t connection_match_hash_hits;
	uint64_t connection_match_hash_reorders;

	spin_lock_bh(&si->lock);
	sfe_ipv4_update_summary_stats(si);

	num_connections = si->num_connections;
	packets_forwarded = si->packets_forwarded64;
	packets_not_forwarded = si->packets_not_forwarded64;
	connection_create_requests = si->connection_create_requests64;
	connection_create_collisions = si->connection_create_collisions64;
	connection_destroy_requests = si->connection_destroy_requests64;
	connection_destroy_misses = si->connection_destroy_misses64;
	connection_flushes = si->connection_flushes64;
	connection_match_hash_hits = si->connection_match_hash_hits64;
	connection_match_hash_reorders = si->connection_match_hash_reorders64;
	spin_unlock_bh(&si->lock);

	bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t<stats "
			      "num_connections=\"%u\" "
			      "pkts_forwarded=\"%llu\" pkts_not_forwarded=\"%llu\" "
			      "create_requests=\"%llu\" create_collisions=\"%llu\" "
			      "destroy_requests=\"%llu\" destroy_misses=\"%llu\" "
			      "flushes=\"%llu\" "
			      "hash_hits=\"%llu\" hash_reorders=\"%llu\" />\n",
			      num_connections,
			      packets_forwarded,
			      packets_not_forwarded,
			      connection_create_requests,
			      connection_create_collisions,
			      connection_destroy_requests,
			      connection_destroy_misses,
			      connection_flushes,
			      connection_match_hash_hits,
			      connection_match_hash_reorders);

	/*
	 * snprintf() reports the length it wanted to write; if the element was
	 * truncated never account for (or copy) more than actually fits in msg.
	 */
	if (bytes_read >= CHAR_DEV_MSG_SIZE) {
		bytes_read = CHAR_DEV_MSG_SIZE - 1;
	}

	/*
	 * Copy only the formatted text.  Copying the whole of msg would hand
	 * uninitialised kernel stack contents to user space.
	 */
	if (copy_to_user(buffer + *total_read, msg, bytes_read)) {
		return false;
	}

	*length -= bytes_read;
	*total_read += bytes_read;

	ws->state++;
	return true;
}
/*
 * sfe_ipv4_debug_dev_read_end()
 *	Emit the closing "</sfe_ipv4>" tag that terminates the XML output.
 *
 * buffer is the user-space destination and msg is a kernel scratch buffer of
 * CHAR_DEV_MSG_SIZE bytes.  On success *length is reduced and *total_read is
 * increased by the number of bytes emitted, and ws->state is advanced to the
 * next state.  si is not used by this stage.
 *
 * Returns true on success, false if the copy to user space failed (in which
 * case none of the counters or the state are changed).
 */
static bool sfe_ipv4_debug_dev_read_end(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length,
					int *total_read, struct sfe_ipv4_debug_xml_write_state *ws)
{
	int bytes_read;

	bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "</sfe_ipv4>\n");

	/*
	 * snprintf() reports the length it wanted to write; never account for
	 * (or copy) more than actually fits in msg.
	 */
	if (bytes_read >= CHAR_DEV_MSG_SIZE) {
		bytes_read = CHAR_DEV_MSG_SIZE - 1;
	}

	/*
	 * Copy only the formatted text.  Copying the whole of msg would hand
	 * uninitialised kernel stack contents to user space.
	 */
	if (copy_to_user(buffer + *total_read, msg, bytes_read)) {
		return false;
	}

	*length -= bytes_read;
	*total_read += bytes_read;

	ws->state++;
	return true;
}
/*
* Array of write functions that write various XML elements that correspond to
* our XML output state machine.
*
* sfe_ipv4_debug_dev_read() indexes this table with ws->state, so the order of
* the entries must match the numeric order of the XML state values, and the
* table is sized by SFE_IPV4_DEBUG_XML_STATE_DONE (the terminal state, which
* has no writer). Each writer advances ws->state itself once its part of the
* output is complete.
*/
sfe_ipv4_debug_xml_write_method_t sfe_ipv4_debug_xml_write_methods[SFE_IPV4_DEBUG_XML_STATE_DONE] = {
sfe_ipv4_debug_dev_read_start,
sfe_ipv4_debug_dev_read_connections_start,
sfe_ipv4_debug_dev_read_connections_connection,
sfe_ipv4_debug_dev_read_connections_end,
sfe_ipv4_debug_dev_read_exceptions_start,
sfe_ipv4_debug_dev_read_exceptions_exception,
sfe_ipv4_debug_dev_read_exceptions_end,
sfe_ipv4_debug_dev_read_stats,
sfe_ipv4_debug_dev_read_end,
};
/*
 * sfe_ipv4_debug_dev_read()
 *	Send info to userspace upon read request from user
 *
 * Runs the XML output state machine stored in filp->private_data, calling the
 * writer for the current state until either the DONE state is reached or the
 * remaining space in the user buffer is no longer strictly greater than
 * CHAR_DEV_MSG_SIZE (each writer may emit up to one scratch buffer of text).
 * The state persists across calls, so successive reads continue where the
 * previous one stopped.  offset is not used.
 *
 * Returns the number of bytes placed in buffer (0 once the output is complete
 * or if the buffer is too small), or -EFAULT if the very first copy to user
 * space failed.
 */
static ssize_t sfe_ipv4_debug_dev_read(struct file *filp, char *buffer, size_t length, loff_t *offset)
{
	char msg[CHAR_DEV_MSG_SIZE];
	int total_read = 0;
	struct sfe_ipv4_debug_xml_write_state *ws;
	struct sfe_ipv4 *si = &__si;

	ws = (struct sfe_ipv4_debug_xml_write_state *)filp->private_data;
	while ((ws->state != SFE_IPV4_DEBUG_XML_STATE_DONE) && (length > CHAR_DEV_MSG_SIZE)) {
		if ((sfe_ipv4_debug_xml_write_methods[ws->state])(si, buffer, msg, &length, &total_read, ws)) {
			continue;
		}

		/*
		 * The writer could not copy to user space.  A failing writer
		 * changes neither the state nor the remaining length, so
		 * retrying here would spin forever; stop instead.  Report what
		 * was already delivered, or the fault if nothing was.
		 */
		if (!total_read) {
			return -EFAULT;
		}

		break;
	}

	return total_read;
}
/*
 * sfe_ipv4_debug_dev_write()
 *	The debug char device is read-only: every write attempt is rejected.
 *
 * None of the arguments are examined.  Always returns -EINVAL.
 */
static ssize_t sfe_ipv4_debug_dev_write(struct file *filp, const char *buffer,
					size_t length, loff_t *offset)
{
	return -EINVAL;
}
/*
 * sfe_ipv4_debug_dev_open()
 *	Attach a fresh XML write state to the file on open.
 *
 * If the file already carries private data it is left untouched.  Otherwise a
 * zeroed write state is allocated, positioned at the START state and stored
 * in file->private_data.  inode is not used.
 *
 * Returns 0 on success or -ENOMEM if the write state cannot be allocated.
 */
static int sfe_ipv4_debug_dev_open(struct inode *inode, struct file *file)
{
	struct sfe_ipv4_debug_xml_write_state *xml_state = file->private_data;

	/*
	 * Nothing to do if a write state is already attached.
	 */
	if (xml_state) {
		return 0;
	}

	xml_state = kzalloc(sizeof(*xml_state), GFP_KERNEL);
	if (!xml_state) {
		return -ENOMEM;
	}

	xml_state->state = SFE_IPV4_DEBUG_XML_STATE_START;
	file->private_data = xml_state;

	return 0;
}
/*
 * sfe_ipv4_debug_dev_release()
 *	Tear down the XML write state when the debug device is closed.
 *
 * If the reader stopped part way through the connection list, the connection
 * it was parked on still has an iterator reference; that reference is dropped
 * here and, if the decrement reports the connection as dead, the connection
 * and its resources are freed.  The write state itself is then freed.
 * inode is not used.
 *
 * Always returns 0.
 */
static int sfe_ipv4_debug_dev_release(struct inode *inode, struct file *file)
{
	struct sfe_ipv4_debug_xml_write_state *ws;
	struct sfe_ipv4_connection *c;

	ws = (struct sfe_ipv4_debug_xml_write_state *)file->private_data;
	if (!ws) {
		return 0;
	}

	/*
	 * Are we currently iterating a connection?  If we are then
	 * make sure that we reduce its iterator count and if necessary
	 * free it.
	 */
	c = ws->iter_conn;
	if (c) {
		struct sfe_ipv4 *si = &__si;
		bool dead;

		/*
		 * The lock must be dropped on both outcomes of the decrement;
		 * returning with it held would leave bottom halves disabled
		 * and deadlock the next user of si->lock.
		 */
		spin_lock_bh(&si->lock);
		dead = sfe_ipv4_decrement_sfe_ipv4_connection_iterator(si, c);
		spin_unlock_bh(&si->lock);

		if (dead) {
			/*
			 * This entry is dead so release our hold of the source and
			 * dest devices and free the memory for our connection objects.
			 */
			dev_put(c->original_dev);
			dev_put(c->reply_dev);
			kfree(c->original_match);
			kfree(c->reply_match);
			kfree(c);
		}
	}

	/*
	 * We've finished with our output so free the write state.
	 */
	kfree(ws);

	return 0;
}
/*
* File operations used in the debug char device
*/
static struct file_operations sfe_ipv4_debug_dev_fops = {
.read = sfe_ipv4_debug_dev_read,
.write = sfe_ipv4_debug_dev_write,
.open = sfe_ipv4_debug_dev_open,
.release = sfe_ipv4_debug_dev_release
};
/*
 * sfe_ipv4_init()
 *	Module load entry point.
 *
 * Creates the "sfe_ipv4" kobject and its sysfs attribute files, registers the
 * debug char device, the netdevice and inetaddr notifiers, the periodic sync
 * timer, the post-routing netfilter hooks and (when conntrack events are
 * configured) the conntrack notifier, then publishes sfe_ipv4_recv through
 * the athrs_fast_nat_recv pointer.
 *
 * Returns 0 on success or a negative errno; on failure everything that was
 * set up before the failing step is undone in reverse order.
 */
static int __init sfe_ipv4_init(void)
{
	struct sfe_ipv4 *si = &__si;
	int result = -ENOMEM;

	DEBUG_INFO("SFE init\n");

	/*
	 * Initialise the lock before anything is registered.  The timer,
	 * notifier and hook callbacks set up below can run as soon as they are
	 * registered, and (presumably, like the debug readers) they take
	 * si->lock, so it must be valid before then.
	 */
	spin_lock_init(&si->lock);

	/*
	 * Create sys/sfe_ipv4
	 */
	si->sys_sfe_ipv4 = kobject_create_and_add("sfe_ipv4", NULL);
	if (!si->sys_sfe_ipv4) {
		DEBUG_ERROR("failed to register sfe_ipv4\n");
		goto exit1;
	}

	/*
	 * Create files, one for each parameter supported by this module.
	 */
	result = sysfs_create_file(si->sys_sfe_ipv4, &sfe_ipv4_pause_attr.attr);
	if (result) {
		DEBUG_ERROR("failed to register pause file: %d\n", result);
		goto exit3;
	}

	result = sysfs_create_file(si->sys_sfe_ipv4, &sfe_ipv4_debug_dev_attr.attr);
	if (result) {
		DEBUG_ERROR("failed to register debug dev file: %d\n", result);
		goto exit4;
	}

	/*
	 * Register our debug char device.
	 */
	result = register_chrdev(0, "sfe_ipv4", &sfe_ipv4_debug_dev_fops);
	if (result < 0) {
		DEBUG_ERROR("Failed to register chrdev: %d\n", result);
		goto exit5;
	}

	/*
	 * With a requested major of 0 the return value is the major number
	 * that was allocated for us; remember it for unregistration.
	 */
	si->debug_dev = result;

	si->dev_notifier.notifier_call = sfe_ipv4_device_event;
	si->dev_notifier.priority = 1;
	register_netdevice_notifier(&si->dev_notifier);

	si->inet_notifier.notifier_call = sfe_ipv4_inet_event;
	si->inet_notifier.priority = 1;
	register_inetaddr_notifier(&si->inet_notifier);

	/*
	 * Create a timer to handle periodic statistics.
	 */
	setup_timer(&si->timer, sfe_ipv4_periodic_sync, (unsigned long)si);
	mod_timer(&si->timer, jiffies + (HZ / 100));

	/*
	 * Register our netfilter hooks.
	 */
	result = nf_register_hooks(sfe_ipv4_ops_post_routing, ARRAY_SIZE(sfe_ipv4_ops_post_routing));
	if (result < 0) {
		DEBUG_ERROR("can't register nf post routing hook: %d\n", result);
		goto exit6;
	}

#ifdef CONFIG_NF_CONNTRACK_EVENTS
	/*
	 * Register a notifier hook to get fast notifications of expired connections.
	 */
	result = nf_conntrack_register_notifier(&init_net, &sfe_ipv4_conntrack_notifier);
	if (result < 0) {
		DEBUG_ERROR("can't register nf notifier hook: %d\n", result);
		goto exit7;
	}
#endif

	/*
	 * Everything is in place: hook ourselves into the receive path.
	 */
	BUG_ON(athrs_fast_nat_recv != NULL);
	RCU_INIT_POINTER(athrs_fast_nat_recv, sfe_ipv4_recv);

	return 0;

#ifdef CONFIG_NF_CONNTRACK_EVENTS
exit7:
	nf_unregister_hooks(sfe_ipv4_ops_post_routing, ARRAY_SIZE(sfe_ipv4_ops_post_routing));
#endif

exit6:
	/*
	 * The timer is armed before the netfilter hooks are registered, so it
	 * must be stopped on this path too; otherwise it would fire after the
	 * failed module load has been torn down.
	 */
	del_timer_sync(&si->timer);
	unregister_inetaddr_notifier(&si->inet_notifier);
	unregister_netdevice_notifier(&si->dev_notifier);
	unregister_chrdev(si->debug_dev, "sfe_ipv4");

exit5:
	sysfs_remove_file(si->sys_sfe_ipv4, &sfe_ipv4_debug_dev_attr.attr);

exit4:
	sysfs_remove_file(si->sys_sfe_ipv4, &sfe_ipv4_pause_attr.attr);

exit3:
	kobject_put(si->sys_sfe_ipv4);

exit1:
	return result;
}
/*
* sfe_ipv4_exit()
* Module unload entry point. Detaches the receive hook, destroys all
* connections and then undoes the registrations made by sfe_ipv4_init()
* in reverse order.
*/
static void __exit sfe_ipv4_exit(void)
{
struct sfe_ipv4 *si = &__si;
DEBUG_INFO("SFE exit\n");
/*
* Clear the athrs_fast_nat_recv pointer that sfe_ipv4_init() pointed at
* sfe_ipv4_recv.
*/
RCU_INIT_POINTER(athrs_fast_nat_recv, NULL);
/*
* Wait for all callbacks to complete.
*
* NOTE(review): rcu_barrier() waits for queued call_rcu() callbacks; if the
* intent is to wait for readers still executing via the old pointer value,
* synchronize_rcu() may be what is needed here - confirm against the code
* that dereferences athrs_fast_nat_recv.
*/
rcu_barrier();
/*
* Destroy all connections.
*
* NOTE(review): the netfilter hooks and conntrack notifier are still
* registered at this point - confirm that no new connections can be created
* between here and their unregistration below.
*/
sfe_ipv4_destroy_all(si, NULL);
// XXX - this is where we need to unregister with any lower level offload services.
#ifdef CONFIG_NF_CONNTRACK_EVENTS
nf_conntrack_unregister_notifier(&init_net, &sfe_ipv4_conntrack_notifier);
#endif
nf_unregister_hooks(sfe_ipv4_ops_post_routing, ARRAY_SIZE(sfe_ipv4_ops_post_routing));
/*
* Stop the periodic sync timer and wait for a running handler to finish.
*/
del_timer_sync(&si->timer);
unregister_inetaddr_notifier(&si->inet_notifier);
unregister_netdevice_notifier(&si->dev_notifier);
/*
* Remove the debug char device, the sysfs attribute files and finally the
* kobject that holds them.
*/
unregister_chrdev(si->debug_dev, "sfe_ipv4");
sysfs_remove_file(si->sys_sfe_ipv4, &sfe_ipv4_debug_dev_attr.attr);
sysfs_remove_file(si->sys_sfe_ipv4, &sfe_ipv4_pause_attr.attr);
kobject_put(si->sys_sfe_ipv4);
}
/*
* Module entry/exit registration and module metadata.
*/
module_init(sfe_ipv4_init)
module_exit(sfe_ipv4_exit)
MODULE_AUTHOR("Qualcomm Atheros Inc.");
MODULE_DESCRIPTION("Shortcut Forwarding Engine - IPv4 edition");
MODULE_LICENSE("GPL");