blob: 922725638781f212ced8fdcc95b13da019f6fb32 [file] [log] [blame]
Pierre Pfister953f5512018-01-12 09:41:16 +01001/*
2 * Copyright (c) 2012 Cisco and/or its affiliates.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at:
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16/*
17 * Author: Pierre Pfister <ppfister@cisco.com>
18 *
19 * DISCLAIMER !
20 *
21 * This most likely is not the hash table you are looking for !!
22 *
23 * This structure targets a very specific and quite narrow set of use-cases
24 * that are not covered by other hash tables.
25 *
26 * Read the following text carefully, or ask the author or one of VPP's
27 * committers to make sure this is what you are looking for.
28 *
29 *
30 * -- Abstract:
31 * This hash table intends to provide a very fast lookup and insertion of
32 * key-value pairs for flow tables (although it might be used for other
33 * purposes), with additional support for lazy-timeouts.
34 * In particular, it was designed to minimize blocking reads, register usage and
35 * cache-lines accesses during a typical lookup.
36 * This hash table therefore provides stateful packet processing
37 * without performance degradation even when every single lookup has to fetch
38 * memory from RAM.
39 * This hash table is not thread-safe and requires executing a garbage
40 * collection function to clean-up chained buckets.
41 *
42 * -- Overview:
43 *
44 * One first aspect of this hash table is that it is self-contained in a single
45 * bulk of memory. Each entry contains a key, a value, and a 32 bits timeout
46 * value; occupies a full and single cache line; and is identified by a unique
47 * 32 bits index. The entry index zero is reserved and used when an entry
48 * could not be found nor inserted. Which means it is not necessary to
49 * immediately check whether an insertion or lookup was successful before
50 * behaving accordingly. One can just keep doing business as usual and
51 * check for the error later.
52 *
53 * Each entry is associated with a timeout value (which unit or clock is up to
54 * the user of the hash table). An entry which timeout is strictly smaller
55 * than the current time is considered empty, whereas an entry which timeout is
56 * greater or equal to the current time contains a valid key-value pair.
57 *
58 * Hash table lookup and insertion are equivalent:
59 * - An entry index is always returned (possibly index 0 if no entry could be
60 * found nor created).
61 * - The returned entry always has its key set to the provided key.
62 * - Timeout value will be greater than the provided current time whenever a
63 * valid entry was found, strictly smaller otherwise. In particular, one can
64 * not differentiate between an entry which was just created, and an entry
65 * which already existed in the past but got timeouted in between.
66 *
67 * As mentioned earlier, entry index zero is used as an invalid entry which may
68 * be manipulated as a normal one. Entries which index go from 1 to
69 * N (where N is a power of 2) are used as direct buckets, each containing a
70 * single entry. In the absence of hash collision, a single entry which location
71 * can deterministically be determined from the key-hash and the hash table
72 * header is accessed (One single cache line, without indirection). This
73 * allows for efficient pre-fetching of the key-value for more than 95% of
74 * accesses.
75 *
76 * In order to handle hash collisions (i.e. when multiple keys
77 * end-up in the same bucket), entries which index are greater than N are
78 * grouped into M groups of 16 collision entries. Such groups are linked
79 * with regular entries whenever a collision needs to be handled.
80 * When looking up a key with a bucket where a collision occurred, unused bits
81 * from the key hash are used to select two entries (from the collision bucket)
82 * where the new entry might be inserted.
83 *
84 * Once an entry is inserted, it will never be moved as long as the entry
85 * timeout value remains greater or equal to the provided current time value.
86 * The entry index can therefore be stored in other data structure as a way
87 * to bypass the hash lookup. But when doing so, one should check if the
88 * present key is the actual looked-up key.
89 *
90 * -- Garbage Collection:
91 *
92 * Since there is no explicit element removal, a garbage collector mechanism
93 * is required in order to remove buckets used for hash collisions. This
94 * is done by calling the flowhash_gc function on a regular basis. Each call
95 * to this function examines a single fixed entry. It shall therefore be called
96 * as many times as there are fixed entries in the hash table in order to
97 * ensure a full inspection.
98 *
99 * -- Time and timeout mechanism:
100 *
101 * The hash table makes use of a time value between in [1, 2^32 - 1].
102 * The provided time value shall keep increasing, and looping is not handled.
103 * When seconds are used, the system should run for 136 years without any issue.
104 * If milliseconds are used, a shift should be operated on all timeout values
105 * on a regular basis (more than every 49 days).
106 */
107
108#ifndef __included_flowhash_template_h__
109#define __included_flowhash_template_h__
110
111#include <vppinfra/clib.h>
112#include <vppinfra/mem.h>
113#include <vppinfra/cache.h>
114
115#ifndef FLOWHASH_TYPE
116#error FLOWHASH_TYPE not defined
117#endif
118
119#define _fv(a,b) a##b
120#define __fv(a,b) _fv(a,b)
121#define FV(a) __fv(a,FLOWHASH_TYPE)
122
123#define _fvt(a,b) a##b##_t
124#define __fvt(a,b) _fvt(a,b)
125#define FVT(a) __fvt(a,FLOWHASH_TYPE)
126
127/* Same for all flowhash variants */
128#ifndef __included_flowhash_common__
129
130#define FLOWHASH_INVALID_ENTRY_INDEX 0
131
132#define FLOWHASH_ENTRIES_PER_BUCKETS_LOG 4
133#define FLOWHASH_ENTRIES_PER_BUCKETS (1 << FLOWHASH_ENTRIES_PER_BUCKETS_LOG)
134
135#endif /* ifndef __included_flowhash_common__ */
136
137 /**
138 * @brief Compare a stored key with a lookup key.
139 *
140 * This function must be defined to use this template. It must return 0
141 * when the two keys are identical, and a different value otherwise.
142 */
143static_always_inline
144u8 FV(flowhash_cmp_key)(FVT(flowhash_skey) *a, FVT(flowhash_lkey) *b);
145
146 /**
147 * @brief Hash a lookup key into a 32 bit integer.
148 *
149 * This function must be defined to use this template.
150 * It must provides close to 32 bits of entropy distributed amongst
151 * all 32 bits of the provided value.
152 * Keys that are equal must have the same hash.
153 */
154 static_always_inline
155 u32 FV(flowhash_hash)(FVT(flowhash_lkey) *k);
156
157/**
158 * @brief Copy a lookup key into a destination stored key.
159 *
160 * This function must be defined to use this template. It must modify the dst
161 * key such that a later call to flowhash_cmp_key with the same arguments
162 * would return 0.
163 */
164static_always_inline
165void FV(flowhash_cpy_key)(FVT(flowhash_skey) *dst, FVT(flowhash_lkey) *src);
166
167/**
168 * @brief One flow hash entry used for both direct buckets and collision
169 * buckets.
170 */
171typedef struct {
172 /* Each entry is cache-line aligned. */
173 CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
174
175 /* Key is first to take advantage of alignment. */
176 FVT(flowhash_skey) key;
177
178 /* Entry value. */
179 FVT(flowhash_value) value;
180
181 /* Timeout value */
182 u32 timeout;
183
184 /* Entry index to the chained bucket. */
185 u32 chained_entry_index;
186} FVT(flowhash_entry);
187
188typedef struct FVT(__flowhash_struct) {
189 /* Cache aligned to simplify allocation. */
190 CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
191
192 /* Array going downward containing free bucket indices */
193 u32 free_buckets_indices[0];
194
195 /* Negative index of the first free bucket */
196 i32 free_buckets_position;
197
198 /* Number of fixed buckets minus one */
199 u32 fixed_entries_mask;
200
201 /* Allocated pointer for this hash table */
202 void *mem;
203
204 u32 collision_buckets_mask;
205 u32 total_entries;
206
207 u64 not_enough_buckets_counter;
208 u64 collision_lookup_counter;
209 u64 garbage_collection_counter;
210
211 u32 gc_counter;
212
213 /* Entry array containing:
214 * - 1 Dummy entry for error return
215 * - (buckets_mask + 1) Fixed buckets
216 * - chained_buckets Chained Buckets
217 */
218 FVT(flowhash_entry) entries[0];
219} FVT(flowhash);
220
221/* Same for all flowhash variants */
222#ifndef __included_flowhash_common__
223#define __included_flowhash_common__
224
225/**
226 * @brief Test whether a returned entry index corresponds to an overflow event.
227 */
228#define flowhash_is_overflow(ei) \
229 ((ei) == FLOWHASH_INVALID_ENTRY_INDEX)
230
231/**
232 * @brief Iterate over all entries in the hash table.
233 *
234 * Iterate over all entries in the hash table, not including the first invalid
235 * entry (at index 0), but including all chained hash collision buckets.
236 *
237 */
238#define flowhash_foreach_entry(h, ei) \
239 for (ei = 1; \
240 ei < (h)->total_entries; \
241 ei++)
242
243/**
244 * @brief Iterate over all currently valid entries.
245 *
246 * Iterate over all entries in the hash table which timeout value is greater
247 * or equal to the current time.
248 */
249#define flowhash_foreach_valid_entry(h, ei, now) \
250 flowhash_foreach_entry(h, ei) \
251 if (((now) <= (h)->entries[ei].timeout))
252
253/**
254 * @brief Timeout variable from a given entry.
255 */
256#define flowhash_timeout(h, ei) (h)->entries[ei].timeout
257
258/**
259 * @brief Indicates whether the entry is being used.
260 */
261#define flowhash_is_timeouted(h, ei, time_now) \
262 ((time_now) > flowhash_timeout(h, ei))
263
264/**
265 * @brief Get the key from the entry index, casted to the provided type.
266 */
267#define flowhash_key(h, ei) (&(h)->entries[ei].key)
268
269/**
270 * @brief Get the value from the entry index, casted to the provided type.
271 */
272#define flowhash_value(h, ei) (&(h)->entries[ei].value)
273
274/**
275 * @brief Get the number of octets allocated to this structure.
276 */
277#define flowhash_memory_size(h) clib_mem_size((h)->mem)
278
279/**
280 * @brief Test whether the entry index is in hash table boundaries.
281 */
282#define flowhash_is_valid_entry_index(h, ei) (ei < (h)->total_entries)
283
284/**
285 * @brief Adjust, if necessary, provided parameters such as being valid flowhash
286 * sizes.
287 */
288static
289void flowhash_validate_sizes(u32 *fixed_entries, u32 *collision_buckets)
290{
291 /* Find power of two greater or equal to the provided value */
292 if (*fixed_entries < FLOWHASH_ENTRIES_PER_BUCKETS)
293 *fixed_entries = FLOWHASH_ENTRIES_PER_BUCKETS;
294 if (*fixed_entries > (1 << (32 - FLOWHASH_ENTRIES_PER_BUCKETS_LOG)))
295 *fixed_entries = (1 << (32 - FLOWHASH_ENTRIES_PER_BUCKETS_LOG));
296
297 *fixed_entries -= 1;
298 *fixed_entries |= *fixed_entries >> 16;
299 *fixed_entries |= *fixed_entries >> 8;
300 *fixed_entries |= *fixed_entries >> 4;
301 *fixed_entries |= *fixed_entries >> 2;
302 *fixed_entries |= *fixed_entries >> 1;
303 *fixed_entries += 1;
304
305 if (*collision_buckets != 0)
306 {
307 if (*collision_buckets < CLIB_CACHE_LINE_BYTES/sizeof(u32))
308 *collision_buckets = CLIB_CACHE_LINE_BYTES/sizeof(u32);
309
310 *collision_buckets -= 1;
311 *collision_buckets |= *collision_buckets >> 16;
312 *collision_buckets |= *collision_buckets >> 8;
313 *collision_buckets |= *collision_buckets >> 4;
314 *collision_buckets |= *collision_buckets >> 2;
315 *collision_buckets |= *collision_buckets >> 1;
316 *collision_buckets += 1;
317 }
318}
319
320/**
321 * @brief Prefetch the the hash entry bucket.
322 *
323 * This should be performed approximately 200-300 cycles before lookup
324 * if the table is located in RAM. Or 30-40 cycles before lookup
325 * in case the table is located in L3.
326 */
327#define flowhash_prefetch(h, hash) \
328 CLIB_PREFETCH (&(h)->entries[((hash) & (h)->fixed_entries_mask) + 1], \
329 sizeof((h)->entries[0]), LOAD)
330
331#endif /* ifndef __included_flowhash_common__ */
332
333/**
334 * @brief Allocate a flowhash structure.
335 *
336 * @param[in] fixed_entries The number of fixed entries in the hash table.
337 * @param[in] chained_buckets The number of chained buckets.
338 *
339 * fixed_entries and chained_buckets parameters may not be used as is but
340 * modified in order to fit requirements.
341 *
342 * Since the flowhash does not support dynamic resizing, it is fairly
343 * important to choose the parameters carefully. In particular the performance
344 * gain from using this structure comes from an efficient lookup in the
345 * absence of hash collision.
346 * As a rule of thumbs, if the number of active entries (flows) is M,
347 * there should be about 16*M fixed entries, and M/16 collision buckets.
348 * Which represents 17*M allocated entries.
349 *
350 * For example:
351 * M = 2^20 total_size ~= 1GiB collision ~= 3%
352 * M = 2^18 total_size ~= 250MiB collision ~= 3%
353 * M = 2^10 total_size ~= 1MiB collision ~= 6%
354 *
355 */
356static_always_inline
357FVT(flowhash) *FV(flowhash_alloc)(u32 fixed_entries, u32 collision_buckets)
358{
359 FVT(flowhash) *h;
Pierre Pfister0318a112018-05-28 13:56:04 +0200360 uword size;
Pierre Pfister953f5512018-01-12 09:41:16 +0100361 void *mem;
362 u32 entries;
363
364 flowhash_validate_sizes(&fixed_entries, &collision_buckets);
365
366 entries = 1 + fixed_entries +
367 collision_buckets * FLOWHASH_ENTRIES_PER_BUCKETS;
368 size = sizeof(*h) + sizeof(h->entries[0]) * entries +
369 sizeof(h->free_buckets_indices[0]) * collision_buckets;
370
371 mem = clib_mem_alloc_aligned(size, CLIB_CACHE_LINE_BYTES);
372 h = mem + collision_buckets * sizeof(h->free_buckets_indices[0]);
373 h->mem = mem;
374
375 /* Fill free elements list */
376 int i;
Pierre Pfister052c2dc2018-08-20 13:48:15 +0200377 memset(h->entries, 0, sizeof(h->entries[0]) * entries);
Pierre Pfister953f5512018-01-12 09:41:16 +0100378 for (i = 1; i <= collision_buckets; i++)
379 {
380 h->free_buckets_indices[-i] =
381 entries - i * FLOWHASH_ENTRIES_PER_BUCKETS;
382 }
383
384 /* Init buckets */
385 for (i=0; i < entries; i++)
386 {
387 h->entries[i].chained_entry_index = FLOWHASH_INVALID_ENTRY_INDEX;
388 h->entries[i].timeout = 0;
389 }
390
391 h->free_buckets_position = -collision_buckets;
392 h->fixed_entries_mask = fixed_entries - 1;
393 h->collision_buckets_mask = collision_buckets - 1;
394 h->total_entries = entries;
395 h->not_enough_buckets_counter = 0;
396 h->collision_lookup_counter = 0;
397 h->garbage_collection_counter = 0;
398 h->gc_counter = 0;
399
400 return h;
401}
402
403/**
404 * @brief Free the flow hash memory.
405 */
406static_always_inline
407void FV(flowhash_free)(FVT(flowhash) *h)
408{
409 clib_mem_free(h->mem);
410}
411
412static void
413FV(__flowhash_get_chained) (FVT(flowhash) *h, FVT(flowhash_lkey) *k,
414 u32 hash, u32 time_now, u32 *ei);
415
416/**
417 * @brief Retrieves an entry index corresponding to a provided key and its hash.
418 *
419 * @param h The hash table pointer.
420 * @param k[in] A pointer to the key value.
421 * @param hash[in] The hash of the key.
422 * @param time_now[in] The current time.
423 * @param ei[out] A pointer set to the found entry index.
424 *
425 * This function always sets ei value to a valid entry index which can then be
426 * used to access the stored value as well as get or set its associated timeout.
427 * The key stored in the returned entry is always set to the provided key.
428 *
429 * In case the provided key is not found, and no entry could be created
430 * (either because there is no hash collision bucket available or
431 * the candidate entries in the collision bucket were already used), ei is
432 * set to the special value FLOWHASH_INVALID_ENTRY_INDEX (which can be tested
433 * with the flowhash_is_overflow macro).
434 *
435 * The timeout value is never modified during a lookup.
436 * - Use the flowhash_is_timeouted macro to test whether the returned entry
437 * was already valid, or is proposed for insertion.
438 * - Use the flowhash_timeout macro to get and set the entry timeout value.
439 *
440 */
441static_always_inline
442void FV(flowhash_get) (FVT(flowhash) *h, FVT(flowhash_lkey) *k,
443 u32 hash, u32 time_now, u32 *ei)
444{
445 *ei = (hash & h->fixed_entries_mask) + 1;
446
447 if (PREDICT_FALSE(FV(flowhash_cmp_key)(&h->entries[*ei].key, k) != 0))
448 {
449 if (PREDICT_TRUE(time_now > h->entries[*ei].timeout &&
450 (h->entries[*ei].chained_entry_index ==
451 FLOWHASH_INVALID_ENTRY_INDEX)))
452 {
453 FV(flowhash_cpy_key)(&h->entries[*ei].key, k);
454 }
455 else
456 {
457 FV(__flowhash_get_chained)(h, k, hash, time_now, ei);
458 }
459 }
460}
461
462static_always_inline void
463FV(__flowhash_get_chained) (FVT(flowhash) *h, FVT(flowhash_lkey) *k,
464 u32 hash, u32 time_now, u32 *ei)
465{
466 h->collision_lookup_counter++;
467
468 if (h->entries[*ei].chained_entry_index == FLOWHASH_INVALID_ENTRY_INDEX)
469 {
470 /* No chained entry yet. Let's chain one. */
471 if (h->free_buckets_position == 0)
472 {
473 /* Oops. No more buckets available. */
474 h->not_enough_buckets_counter++;
475 *ei = FLOWHASH_INVALID_ENTRY_INDEX;
476 h->entries[FLOWHASH_INVALID_ENTRY_INDEX].timeout =
477 time_now - 1;
478 FV(flowhash_cpy_key)(
479 &h->entries[FLOWHASH_INVALID_ENTRY_INDEX].key, k);
480 return;
481 }
482
483 /* Forward link */
484 h->entries[*ei].chained_entry_index =
485 h->free_buckets_indices[h->free_buckets_position];
486
487 /* Backward link (for garbage collection) */
488 h->entries[h->free_buckets_indices[h->free_buckets_position]].
489 chained_entry_index = *ei;
490
491 /* Move pointer */
492 h->free_buckets_position++;
493 }
494
495 /* Get the two indexes where to look at. */
496 u32 bi0 = h->entries[*ei].chained_entry_index +
497 (hash >> (32 - FLOWHASH_ENTRIES_PER_BUCKETS_LOG));
498 u32 bi1 = bi0 + 1;
499 bi1 = (bi0 & (FLOWHASH_ENTRIES_PER_BUCKETS - 1)) ? bi1 :
500 bi1 - FLOWHASH_ENTRIES_PER_BUCKETS;
501
502 /* It is possible that we wait while comparing bi0 key.
503 * It's better to prefetch bi1 so we don't wait twice. */
504 CLIB_PREFETCH(&h->entries[bi1], sizeof (h->entries[0]), READ);
505
506 if (FV(flowhash_cmp_key)(&h->entries[bi0].key, k) == 0)
507 {
508 *ei = bi0;
509 return;
510 }
511
512 if (FV(flowhash_cmp_key)(&h->entries[bi1].key, k) == 0)
513 {
514 *ei = bi1;
515 return;
516 }
517
518 if (h->entries[*ei].timeout >= time_now)
519 {
520 *ei = FLOWHASH_INVALID_ENTRY_INDEX;
521 *ei = (time_now > h->entries[bi0].timeout) ? bi0 : *ei;
522 *ei = (time_now > h->entries[bi1].timeout) ? bi1 : *ei;
523 }
524
525 FV(flowhash_cpy_key)(&h->entries[*ei].key, k);
526}
527
528static_always_inline void
Pierre Pfister052c2dc2018-08-20 13:48:15 +0200529FV(flowhash_gc)(FVT(flowhash) *h, u32 time_now,
530 u32 *freed_index, u32 *freed_len)
Pierre Pfister953f5512018-01-12 09:41:16 +0100531{
532 u32 ei;
Pierre Pfister052c2dc2018-08-20 13:48:15 +0200533 if (freed_index)
534 *freed_len = 0;
535
Pierre Pfister953f5512018-01-12 09:41:16 +0100536 if (PREDICT_FALSE(h->collision_buckets_mask == (((u32)0) - 1)))
537 return;
538
539 /* prefetch two rounds in advance */
540 ei = 2 + h->fixed_entries_mask +
541 ((h->gc_counter + 2) & h->collision_buckets_mask) *
542 FLOWHASH_ENTRIES_PER_BUCKETS;
543 CLIB_PREFETCH(&h->entries[ei], sizeof (h->entries[0]), READ);
544
545 /* prefetch one round in advance */
546 ei = 2 + h->fixed_entries_mask +
547 ((h->gc_counter + 1) & h->collision_buckets_mask) *
548 FLOWHASH_ENTRIES_PER_BUCKETS;
549 if (h->entries[ei].chained_entry_index != FLOWHASH_INVALID_ENTRY_INDEX)
550 {
551 CLIB_PREFETCH(&h->entries[ei], 4 * CLIB_CACHE_LINE_BYTES, READ);
552 }
553
554 /* do GC */
555 ei = 2 + h->fixed_entries_mask +
556 ((h->gc_counter) & h->collision_buckets_mask) *
557 FLOWHASH_ENTRIES_PER_BUCKETS;
558 if (h->entries[ei].chained_entry_index != FLOWHASH_INVALID_ENTRY_INDEX)
559 {
560 u8 found = 0;
561 int i;
562 for (i=0; i<FLOWHASH_ENTRIES_PER_BUCKETS; i++)
563 {
564 if (time_now <= h->entries[ei + i].timeout)
565 {
566 found = 1;
567 break;
568 }
569 }
570
571 if (!found)
572 {
Pierre Pfister052c2dc2018-08-20 13:48:15 +0200573 /* Tell caller we freed this */
574 if (freed_index)
575 {
576 *freed_index = ei;
577 *freed_len = FLOWHASH_ENTRIES_PER_BUCKETS;
578 }
Pierre Pfister953f5512018-01-12 09:41:16 +0100579 /* The bucket is not used. Let's free it. */
580 h->free_buckets_position--;
581 /* Reset forward link */
582 h->entries[h->entries[ei].chained_entry_index].chained_entry_index =
583 FLOWHASH_INVALID_ENTRY_INDEX;
584 /* Reset back link */
585 h->entries[ei].chained_entry_index = FLOWHASH_INVALID_ENTRY_INDEX;
586 /* Free element */
587 h->free_buckets_indices[h->free_buckets_position] = ei;
588 /* Count the garbage collection event */
589 h->garbage_collection_counter++;
590 }
591 }
592
593 h->gc_counter++;
594}
595
596static_always_inline
597u32 FV(flowhash_elts)(FVT(flowhash) *h, u32 time_now)
598{
599 u32 tot = 0;
600 u32 ei;
601
602 flowhash_foreach_valid_entry(h, ei, time_now)
603 tot++;
604
605 return tot;
606}
607
608#endif /* __included_flowhash_template_h__ */