A Protocol Independent Hierarchical FIB (VPP-352)
Main Enhancements:
- Protocol Independent FIB API
- Hierarchical FIB entries. Dynamic recursive route resolution.
- Extranet Support.
- Integration of IP and MPLS forwarding.
- Separation of FIB and Adjacency databases.
- Data-Plane Object forwarding model.
Change-Id: I52dc815c0d0aa8b493e3cf6b978568f3cc82296c
Signed-off-by: Neale Ranns <nranns@cisco.com>
diff --git a/vnet/vnet/ip/lookup.h b/vnet/vnet/ip/lookup.h
index dcc9d25..c8dcc14 100644
--- a/vnet/vnet/ip/lookup.h
+++ b/vnet/vnet/ip/lookup.h
@@ -45,7 +45,6 @@
* - Callbacks on route add.
* - Callbacks on interface address change.
*/
-
#ifndef included_ip_lookup_h
#define included_ip_lookup_h
@@ -53,12 +52,11 @@
#include <vlib/buffer.h>
#include <vnet/ip/ip4_packet.h>
#include <vnet/ip/ip6_packet.h>
+#include <vnet/fib/fib_node.h>
+#include <vnet/dpo/dpo.h>
/** @brief Common (IP4/IP6) next index stored in adjacency. */
typedef enum {
- /** Packet does not match any route in table. */
- IP_LOOKUP_NEXT_MISS,
-
/** Adjacency to drop this packet. */
IP_LOOKUP_NEXT_DROP,
/** Adjacency to punt this packet. */
@@ -67,27 +65,26 @@
/** This packet is for one of our own IP addresses. */
IP_LOOKUP_NEXT_LOCAL,
- /** This packet matches an "interface route" and packets
+ /** This packet matches an "incomplete adjacency" and packets
need to be passed to ARP to find rewrite string for
this destination. */
IP_LOOKUP_NEXT_ARP,
+ /** This packet matches an "interface route" and packets
+ need to be passed to ARP to find rewrite string for
+ this destination. */
+ IP_LOOKUP_NEXT_GLEAN,
+
/** This packet is to be rewritten and forwarded to the next
processing node. This is typically the output interface but
might be another node for further output processing. */
IP_LOOKUP_NEXT_REWRITE,
- /** This packet needs to be classified */
- IP_LOOKUP_NEXT_CLASSIFY,
+ /** This packet follows a load-balance */
+ IP_LOOKUP_NEXT_LOAD_BALANCE,
- /** This packet needs to go to MAP - RFC7596, RFC7597 */
- IP_LOOKUP_NEXT_MAP,
-
- /** This packet needs to go to MAP with Translation - RFC7599 */
- IP_LOOKUP_NEXT_MAP_T,
-
- /** This packets needs to go to indirect next hop */
- IP_LOOKUP_NEXT_INDIRECT,
+ /** This packet follows a mid-chain adjacency */
+ IP_LOOKUP_NEXT_MIDCHAIN,
/** This packets needs to go to ICMP error */
IP_LOOKUP_NEXT_ICMP_ERROR,
@@ -100,7 +97,7 @@
} ip4_lookup_next_t;
typedef enum {
- /** Hop-by-hop header handling */
+ /* Hop-by-hop header handling */
IP6_LOOKUP_NEXT_HOP_BY_HOP = IP_LOOKUP_N_NEXT,
IP6_LOOKUP_NEXT_ADD_HOP_BY_HOP,
IP6_LOOKUP_NEXT_POP_HOP_BY_HOP,
@@ -108,30 +105,26 @@
} ip6_lookup_next_t;
#define IP4_LOOKUP_NEXT_NODES { \
- [IP_LOOKUP_NEXT_MISS] = "ip4-miss", \
[IP_LOOKUP_NEXT_DROP] = "ip4-drop", \
[IP_LOOKUP_NEXT_PUNT] = "ip4-punt", \
[IP_LOOKUP_NEXT_LOCAL] = "ip4-local", \
[IP_LOOKUP_NEXT_ARP] = "ip4-arp", \
+ [IP_LOOKUP_NEXT_GLEAN] = "ip4-glean", \
[IP_LOOKUP_NEXT_REWRITE] = "ip4-rewrite-transit", \
- [IP_LOOKUP_NEXT_CLASSIFY] = "ip4-classify", \
- [IP_LOOKUP_NEXT_MAP] = "ip4-map", \
- [IP_LOOKUP_NEXT_MAP_T] = "ip4-map-t", \
- [IP_LOOKUP_NEXT_INDIRECT] = "ip4-indirect", \
+ [IP_LOOKUP_NEXT_MIDCHAIN] = "ip4-midchain", \
+ [IP_LOOKUP_NEXT_LOAD_BALANCE] = "ip4-load-balance", \
[IP_LOOKUP_NEXT_ICMP_ERROR] = "ip4-icmp-error", \
}
#define IP6_LOOKUP_NEXT_NODES { \
- [IP_LOOKUP_NEXT_MISS] = "ip6-miss", \
[IP_LOOKUP_NEXT_DROP] = "ip6-drop", \
[IP_LOOKUP_NEXT_PUNT] = "ip6-punt", \
[IP_LOOKUP_NEXT_LOCAL] = "ip6-local", \
[IP_LOOKUP_NEXT_ARP] = "ip6-discover-neighbor", \
+ [IP_LOOKUP_NEXT_GLEAN] = "ip6-glean", \
[IP_LOOKUP_NEXT_REWRITE] = "ip6-rewrite", \
- [IP_LOOKUP_NEXT_CLASSIFY] = "ip6-classify", \
- [IP_LOOKUP_NEXT_MAP] = "ip6-map", \
- [IP_LOOKUP_NEXT_MAP_T] = "ip6-map-t", \
- [IP_LOOKUP_NEXT_INDIRECT] = "ip6-indirect", \
+ [IP_LOOKUP_NEXT_MIDCHAIN] = "ip6-midchain", \
+ [IP_LOOKUP_NEXT_LOAD_BALANCE] = "ip6-load-balance", \
[IP_LOOKUP_NEXT_ICMP_ERROR] = "ip6-icmp-error", \
[IP6_LOOKUP_NEXT_HOP_BY_HOP] = "ip6-hop-by-hop", \
[IP6_LOOKUP_NEXT_ADD_HOP_BY_HOP] = "ip6-add-hop-by-hop", \
@@ -157,20 +150,20 @@
_(proto, IP_FLOW_HASH_PROTO) \
_(reverse, IP_FLOW_HASH_REVERSE_SRC_DST)
+/**
+ * A flow hash configuration is a mask of the flow hash options
+ */
+typedef u32 flow_hash_config_t;
+
#define IP_ADJACENCY_OPAQUE_SZ 16
/** @brief IP unicast adjacency.
@note cache aligned.
*/
typedef struct {
CLIB_CACHE_LINE_ALIGN_MARK(cacheline0);
- /** Handle for this adjacency in adjacency heap. */
+ /* Handle for this adjacency in adjacency heap. */
u32 heap_handle;
- STRUCT_MARK(signature_start);
-
- /** Interface address index for this local/arp adjacency. */
- u32 if_address_index;
-
/** Number of adjecencies in block. Greater than 1 means multipath;
otherwise equal to 1. */
u16 n_adj;
@@ -181,27 +174,63 @@
u16 lookup_next_index_as_int;
};
+ /** Interface address index for this local/arp adjacency. */
+ u32 if_address_index;
+
/** Force re-lookup in a different FIB. ~0 => normal behavior */
- i16 explicit_fib_index;
u16 mcast_group_index;
/** Highest possible perf subgraph arc interposition, e.g. for ip6 ioam */
u16 saved_lookup_next_index;
+ /*
+ * link/ether-type
+ */
+ u8 ia_link;
+ u8 ia_nh_proto;
+
union {
- /** IP_LOOKUP_NEXT_ARP only */
- struct {
- ip46_address_t next_hop;
- } arp;
- /** IP_LOOKUP_NEXT_CLASSIFY only */
- struct {
- u16 table_index;
- } classify;
- /** IP_LOOKUP_NEXT_INDIRECT only */
- struct {
- ip46_address_t next_hop;
- } indirect;
- u8 opaque[IP_ADJACENCY_OPAQUE_SZ];
+ union {
+ /**
+ * IP_LOOKUP_NEXT_ARP/IP_LOOKUP_NEXT_REWRITE
+ *
+ * neighbour adjacency sub-type;
+ */
+ struct {
+ ip46_address_t next_hop;
+ } nbr;
+ /**
+ * IP_LOOKUP_NEXT_MIDCHAIN
+ *
+ * A nbr adj that is also recursive. Think tunnels.
+ * A nbr adj can transition to be of type MIDCHAIN
+ * so be sure to leave the two structs with the next_hop
+ * fields aligned.
+ */
+ struct {
+ /**
+ * The recursive next-hop
+ */
+ ip46_address_t next_hop;
+ /**
+ * The node index of the tunnel's post rewrite/TX function.
+ */
+ u32 tx_function_node;
+ /**
+ * The next DPO to use
+ */
+ dpo_id_t next_dpo;
+ } midchain;
+ /**
+ * IP_LOOKUP_NEXT_GLEAN
+ *
+ * Glean the address to ARP for from the packet's destination
+ */
+ struct {
+ ip46_address_t receive_addr;
+ } glean;
+ } sub_type;
+ u16 opaque[IP_ADJACENCY_OPAQUE_SZ];
};
/** @brief Special format function for this adjacency.
@@ -210,63 +239,32 @@
* the first cache line reads "full" on the free space gas gauge.
*/
u32 special_adjacency_format_function_index; /* 0 is invalid */
- STRUCT_MARK(signature_end);
-
- /** Number of FIB entries sharing this adjacency */
- u32 share_count;
- /** Use this adjacency instead */
- u32 next_adj_with_signature;
CLIB_CACHE_LINE_ALIGN_MARK(cacheline1);
- /** Rewrite in second/third cache lines */
+ /* Rewrite in second/third cache lines */
vnet_declare_rewrite (VLIB_BUFFER_PRE_DATA_SIZE);
+
+ /*
+ * members not accessed in the data plane are relegated to the
+ * remaining cachelines
+ */
+ fib_node_t ia_node;
} ip_adjacency_t;
-static inline uword
-vnet_ip_adjacency_signature (ip_adjacency_t * adj)
-{
- uword signature = 0xfeedfaceULL;
+_Static_assert((STRUCT_OFFSET_OF(ip_adjacency_t, cacheline0) == 0),
+ "IP adjacency cachline 0 is not offset");
+_Static_assert((STRUCT_OFFSET_OF(ip_adjacency_t, cacheline1) ==
+ CLIB_CACHE_LINE_BYTES),
+ "IP adjacency cachline 1 is more than one cachline size offset");
- /* Skip heap handle, sum everything up to but not including share_count */
- signature = hash_memory
- (STRUCT_MARK_PTR(adj, signature_start),
- STRUCT_OFFSET_OF(ip_adjacency_t, signature_end)
- - STRUCT_OFFSET_OF(ip_adjacency_t, signature_start),
- signature);
-
- /* and the rewrite */
- signature = hash_memory (&adj->rewrite_header, VLIB_BUFFER_PRE_DATA_SIZE,
- signature);
- return signature;
-}
-
-static inline int
-vnet_ip_adjacency_share_compare (ip_adjacency_t * a1, ip_adjacency_t *a2)
-{
- if (memcmp (STRUCT_MARK_PTR(a1, signature_start),
- STRUCT_MARK_PTR(a2, signature_start),
- STRUCT_OFFSET_OF(ip_adjacency_t, signature_end)
- - STRUCT_OFFSET_OF(ip_adjacency_t, signature_start)))
- return 0;
- if (memcmp (&a1->rewrite_header, &a2->rewrite_header,
- VLIB_BUFFER_PRE_DATA_SIZE))
- return 0;
- return 1;
-}
+/* An all zeros address */
+extern const ip46_address_t zero_addr;
/* Index into adjacency table. */
typedef u32 ip_adjacency_index_t;
typedef struct {
- /* Directly connected next-hop adjacency index. */
- u32 next_hop_adj_index;
-
- /* Path weight for this adjacency. */
- u32 weight;
-} ip_multipath_next_hop_t;
-
-typedef struct {
/* Adjacency index of first index in block. */
u32 adj_index;
@@ -276,11 +274,7 @@
/* Number of prefixes that point to this adjacency. */
u32 reference_count;
- /* Normalized next hops are used as hash keys: they are sorted by weight
- and weights are chosen so they add up to 1 << log2_n_adj_in_block (with
- zero-weighted next hops being deleted).
- Unnormalized next hops are saved so that control plane has a record of exactly
- what the RIB told it. */
+ /* Normalized next hops are saved for stats/display purposes */
struct {
/* Number of hops in the multipath. */
u32 count;
@@ -290,7 +284,7 @@
/* Heap handle used to for example free block when we're done with it. */
u32 heap_handle;
- } normalized_next_hops, unnormalized_next_hops;
+ } normalized_next_hops;
} ip_multipath_adjacency_t;
/* IP multicast adjacency. */
@@ -397,20 +391,11 @@
} ip_adj_register_t;
typedef struct ip_lookup_main_t {
- /** Adjacency heap. */
+ /* Adjacency heap. */
ip_adjacency_t * adjacency_heap;
- /** Adjacency packet/byte counters indexed by adjacency index. */
- vlib_combined_counter_main_t adjacency_counters;
-
- /** Heap of (next hop, weight) blocks. Sorted by next hop. */
- ip_multipath_next_hop_t * next_hop_heap;
-
- /** Indexed by heap_handle from ip_adjacency_t. */
- ip_multipath_adjacency_t * multipath_adjacencies;
-
- /** Adjacency by signature hash */
- uword * adj_index_by_signature;
+ /** load-balance packet/byte counters indexed by LB index. */
+ vlib_combined_counter_main_t load_balance_counters;
/** any-tx-feature-enabled interface bitmap */
uword * tx_sw_if_has_ip_output_features;
@@ -418,29 +403,6 @@
/** count of enabled features, per sw_if_index, to maintain bitmap */
i16 * tx_feature_count_by_sw_if_index;
- /** Temporary vectors for looking up next hops in hash. */
- ip_multipath_next_hop_t * next_hop_hash_lookup_key;
- ip_multipath_next_hop_t * next_hop_hash_lookup_key_normalized;
-
- /** Hash table mapping normalized next hops and weights
- to multipath adjacency index. */
- uword * multipath_adjacency_by_next_hops;
-
- u32 * adjacency_remap_table;
- u32 n_adjacency_remaps;
-
- /** If average error per adjacency is less than this threshold adjacency block
- size is accepted. */
- f64 multipath_next_hop_error_tolerance;
-
- /** Adjacency index for routing table misses, local punts, and drops. */
- u32 miss_adj_index, drop_adj_index, local_adj_index;
-
- /** Miss adjacency is always first in adjacency table. */
-#define IP_LOOKUP_MISS_ADJ_INDEX 0
-
- ip_add_del_adjacency_callback_t * add_del_adjacency_callbacks;
-
/** Pool of addresses that are assigned to interfaces. */
ip_interface_address_t * if_address_pool;
@@ -501,54 +463,6 @@
CLIB_PREFETCH (_adj, sizeof (_adj[0]), type); \
} while (0)
-/* Adds a next node to ip4 or ip6 lookup node which can be then used in adjacencies.
- * @param vlib_main pointer
- * @param lm ip4_main.lookup_main or ip6_main.lookup_main
- * @param reg registration structure
- * @param next_node_index Returned index to be used in adjacencies.
- * @return 0 on success. -1 on failure.
- */
-int ip_register_adjacency(vlib_main_t *vm, u8 is_ip4,
- ip_adj_register_t *reg);
-
-/*
- * Construction helpers to add IP adjacency at init.
- */
-#define VNET_IP_REGISTER_ADJACENCY(ip,x,...) \
- __VA_ARGS__ ip_adj_register_t ip##adj_##x; \
-static void __vnet_##ip##_register_adjacency_##x (void) \
- __attribute__((__constructor__)) ; \
-static void __vnet_##ip##_register_adjacency_##x (void) \
-{ \
- ip_lookup_main_t *lm = &ip##_main.lookup_main; \
- ip##adj_##x.next = lm->registered_adjacencies; \
- lm->registered_adjacencies = &ip##adj_##x; \
-} \
-__VA_ARGS__ ip_adj_register_t ip##adj_##x
-
-#define VNET_IP4_REGISTER_ADJACENCY(x,...) \
- VNET_IP_REGISTER_ADJACENCY(ip4, x, __VA_ARGS__)
-
-#define VNET_IP6_REGISTER_ADJACENCY(x,...) \
- VNET_IP_REGISTER_ADJACENCY(ip6, x, __VA_ARGS__)
-
-static inline void
-ip_register_add_del_adjacency_callback(ip_lookup_main_t * lm,
- ip_add_del_adjacency_callback_t cb)
-{
- vec_add1(lm->add_del_adjacency_callbacks, cb);
-}
-
-always_inline void
-ip_call_add_del_adjacency_callbacks (ip_lookup_main_t * lm, u32 adj_index, u32 is_del)
-{
- ip_adjacency_t * adj;
- uword i;
- adj = ip_get_adjacency (lm, adj_index);
- for (i = 0; i < vec_len (lm->add_del_adjacency_callbacks); i++)
- lm->add_del_adjacency_callbacks[i] (lm, adj_index, adj, is_del);
-}
-
/* Create new block of given number of contiguous adjacencies. */
ip_adjacency_t *
ip_add_adjacency (ip_lookup_main_t * lm,
@@ -556,38 +470,6 @@
u32 n_adj,
u32 * adj_index_result);
-void ip_del_adjacency (ip_lookup_main_t * lm, u32 adj_index);
-void
-ip_update_adjacency (ip_lookup_main_t * lm,
- u32 adj_index,
- ip_adjacency_t * copy_adj);
-
-static inline int
-ip_adjacency_is_multipath(ip_lookup_main_t * lm, u32 adj_index)
-{
- if (!vec_len(lm->multipath_adjacencies))
- return 0;
-
- if (vec_len(lm->multipath_adjacencies) < adj_index - 1)
- return 0;
-
-
- return (lm->multipath_adjacencies[adj_index].adj_index == adj_index &&
- lm->multipath_adjacencies[adj_index].n_adj_in_block > 0);
-}
-
-void
-ip_multipath_adjacency_free (ip_lookup_main_t * lm,
- ip_multipath_adjacency_t * a);
-
-u32
-ip_multipath_adjacency_add_del_next_hop (ip_lookup_main_t * lm,
- u32 is_del,
- u32 old_mp_adj_index,
- u32 next_hop_adj_index,
- u32 next_hop_weight,
- u32 * new_mp_adj_index);
-
clib_error_t *
ip_interface_address_add_del (ip_lookup_main_t * lm,
u32 sw_if_index,
@@ -596,6 +478,9 @@
u32 is_del,
u32 * result_index);
+u8 *
+format_ip_flow_hash_config (u8 * s, va_list * args);
+
always_inline ip_interface_address_t *
ip_get_interface_address (ip_lookup_main_t * lm, void * addr_fib)
{
@@ -603,28 +488,14 @@
return p ? pool_elt_at_index (lm->if_address_pool, p[0]) : 0;
}
+u32
+fib_table_id_find_fib_index (fib_protocol_t proto,
+ u32 table_id);
+
always_inline void *
ip_interface_address_get_address (ip_lookup_main_t * lm, ip_interface_address_t * a)
{ return mhash_key_to_mem (&lm->address_to_if_address_index, a->address_key); }
-always_inline ip_interface_address_t *
-ip_interface_address_for_packet (ip_lookup_main_t * lm, vlib_buffer_t * b, u32 sw_if_index)
-{
- ip_adjacency_t * adj;
- u32 if_address_index;
-
- adj = ip_get_adjacency (lm, vnet_buffer (b)->ip.adj_index[VLIB_TX]);
-
- ASSERT (adj->lookup_next_index == IP_LOOKUP_NEXT_ARP
- || adj->lookup_next_index == IP_LOOKUP_NEXT_LOCAL);
- if_address_index = adj->if_address_index;
- if_address_index = (if_address_index == ~0 ?
- vec_elt (lm->if_address_pool_index_by_sw_if_index, sw_if_index)
- : if_address_index);
-
- return (if_address_index != ~0)?pool_elt_at_index (lm->if_address_pool, if_address_index):NULL;
-}
-
#define foreach_ip_interface_address(lm,a,sw_if_index,loop,body) \
do { \
vnet_main_t *_vnm = vnet_get_main(); \
@@ -653,7 +524,5 @@
} while (0)
void ip_lookup_init (ip_lookup_main_t * lm, u32 ip_lookup_node_index);
-u32 vnet_register_special_adjacency_format_function
-(ip_lookup_main_t * lm, format_function_t * fp);
#endif /* included_ip_lookup_h */