/*
 * Copyright (c) 2016 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <vnet/ip/lookup.h>
#include <vnet/dpo/load_balance.h>
#include <vnet/dpo/load_balance_map.h>
#include <vnet/dpo/drop_dpo.h>
#include <vppinfra/math.h>              /* for fabs */
#include <vnet/adj/adj.h>
#include <vnet/adj/adj_internal.h>
#include <vnet/fib/fib_urpf_list.h>
#include <vnet/bier/bier_hdr_inlines.h>

/*
 * distribution error tolerance for load-balancing
 */
const f64 multipath_next_hop_error_tolerance = 0.1;
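
/*
 * i.e. a normalized bucket assignment is accepted once the accumulated
 * rounding error is no more than 10% of the bucket count; see
 * ip_multipath_normalize_next_hops() below.
 */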

#undef LB_DEBUG

#ifdef LB_DEBUG
#define LB_DBG(_lb, _fmt, _args...)                                 \
{                                                                   \
    u8 *_tmp = NULL;                                                \
    clib_warning("lb:[%s]:" _fmt,                                   \
                 load_balance_format(load_balance_get_index((_lb)), \
                                     0, _tmp),                      \
                 ##_args);                                          \
    vec_free(_tmp);                                                 \
}
#else
#define LB_DBG(_p, _fmt, _args...)
#endif


/**
 * Pool of all load-balance DPOs. It is not static so that the data-plane
 * can access it directly.
 */
load_balance_t *load_balance_pool;

/**
 * The one instance of load-balance main
 */
load_balance_main_t load_balance_main;

f64
load_balance_get_multipath_tolerance (void)
{
    return (multipath_next_hop_error_tolerance);
}

static inline index_t
load_balance_get_index (const load_balance_t *lb)
{
    return (lb - load_balance_pool);
}

static inline dpo_id_t*
load_balance_get_buckets (load_balance_t *lb)
{
    if (LB_HAS_INLINE_BUCKETS(lb))
    {
        return (lb->lb_buckets_inline);
    }
    else
    {
        return (lb->lb_buckets);
    }
}
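
/*
 * Small load-balances keep their buckets inline in the load_balance_t
 * itself (LB_NUM_INLINE_BUCKETS of them, 4 at the time of writing - see
 * load_balance.h); larger ones spill to the heap-allocated vector
 * lb->lb_buckets. Keeping the common few-bucket case inline avoids a
 * pointer chase in the data-plane.
 */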

static load_balance_t *
load_balance_alloc_i (void)
{
    load_balance_t *lb;

    pool_get_aligned(load_balance_pool, lb, CLIB_CACHE_LINE_BYTES);
    memset(lb, 0, sizeof(*lb));

    lb->lb_map = INDEX_INVALID;
    lb->lb_urpf = INDEX_INVALID;
    vlib_validate_combined_counter(&(load_balance_main.lbm_to_counters),
                                   load_balance_get_index(lb));
    vlib_validate_combined_counter(&(load_balance_main.lbm_via_counters),
                                   load_balance_get_index(lb));
    vlib_zero_combined_counter(&(load_balance_main.lbm_to_counters),
                               load_balance_get_index(lb));
    vlib_zero_combined_counter(&(load_balance_main.lbm_via_counters),
                               load_balance_get_index(lb));

    return (lb);
}

static u8*
load_balance_format (index_t lbi,
                     load_balance_format_flags_t flags,
                     u32 indent,
                     u8 *s)
{
    vlib_counter_t to, via;
    load_balance_t *lb;
    dpo_id_t *buckets;
    u32 i;

    lb = load_balance_get(lbi);
    vlib_get_combined_counter(&(load_balance_main.lbm_to_counters), lbi, &to);
    vlib_get_combined_counter(&(load_balance_main.lbm_via_counters), lbi, &via);
    buckets = load_balance_get_buckets(lb);

    s = format(s, "%U: ", format_dpo_type, DPO_LOAD_BALANCE);
    s = format(s, "[proto:%U ", format_dpo_proto, lb->lb_proto);
    s = format(s, "index:%d buckets:%d ", lbi, lb->lb_n_buckets);
    s = format(s, "uRPF:%d ", lb->lb_urpf);
    s = format(s, "to:[%Ld:%Ld]", to.packets, to.bytes);
    if (0 != via.packets)
    {
        s = format(s, " via:[%Ld:%Ld]",
                   via.packets, via.bytes);
    }
    s = format(s, "]");

    if (INDEX_INVALID != lb->lb_map)
    {
        s = format(s, "\n%U%U",
                   format_white_space, indent+4,
                   format_load_balance_map, lb->lb_map, indent+4);
    }
    for (i = 0; i < lb->lb_n_buckets; i++)
    {
        s = format(s, "\n%U[%d] %U",
                   format_white_space, indent+2,
                   i,
                   format_dpo_id,
                   &buckets[i], indent+6);
    }
    return (s);
}

u8*
format_load_balance (u8 * s, va_list * args)
{
    index_t lbi = va_arg(*args, index_t);
    load_balance_format_flags_t flags = va_arg(*args, load_balance_format_flags_t);

    return (load_balance_format(lbi, flags, 0, s));
}

static u8*
format_load_balance_dpo (u8 * s, va_list * args)
{
    index_t lbi = va_arg(*args, index_t);
    u32 indent = va_arg(*args, u32);

    return (load_balance_format(lbi, LOAD_BALANCE_FORMAT_DETAIL, indent, s));
}


static load_balance_t *
load_balance_create_i (u32 num_buckets,
                       dpo_proto_t lb_proto,
                       flow_hash_config_t fhc)
{
    load_balance_t *lb;

    lb = load_balance_alloc_i();
    lb->lb_hash_config = fhc;
    lb->lb_n_buckets = num_buckets;
    lb->lb_n_buckets_minus_1 = num_buckets-1;
    lb->lb_proto = lb_proto;

    if (!LB_HAS_INLINE_BUCKETS(lb))
    {
        vec_validate_aligned(lb->lb_buckets,
                             lb->lb_n_buckets - 1,
                             CLIB_CACHE_LINE_BYTES);
    }

    LB_DBG(lb, "create");

    return (lb);
}

index_t
load_balance_create (u32 n_buckets,
                     dpo_proto_t lb_proto,
                     flow_hash_config_t fhc)
{
    return (load_balance_get_index(load_balance_create_i(n_buckets, lb_proto, fhc)));
}

static inline void
load_balance_set_bucket_i (load_balance_t *lb,
                           u32 bucket,
                           dpo_id_t *buckets,
                           const dpo_id_t *next)
{
    dpo_stack(DPO_LOAD_BALANCE, lb->lb_proto, &buckets[bucket], next);
}

void
load_balance_set_bucket (index_t lbi,
                         u32 bucket,
                         const dpo_id_t *next)
{
    load_balance_t *lb;
    dpo_id_t *buckets;

    lb = load_balance_get(lbi);
    buckets = load_balance_get_buckets(lb);

    ASSERT(bucket < lb->lb_n_buckets);

    load_balance_set_bucket_i(lb, bucket, buckets, next);
}
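
/*
 * A minimal usage sketch (illustrative only; error handling elided and
 * 'via_dpo' is assumed to be an already-resolved child DPO):
 *
 *   index_t lbi;
 *
 *   lbi = load_balance_create(2, DPO_PROTO_IP4, IP_FLOW_HASH_DEFAULT);
 *   load_balance_set_bucket(lbi, 0, &via_dpo);
 *   load_balance_set_bucket(lbi, 1, &via_dpo);
 *
 * Most callers build multi-path distributions with
 * load_balance_multipath_update() below rather than setting buckets
 * one at a time.
 */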

int
load_balance_is_drop (const dpo_id_t *dpo)
{
    load_balance_t *lb;

    if (DPO_LOAD_BALANCE != dpo->dpoi_type)
        return (0);

    lb = load_balance_get(dpo->dpoi_index);

    if (1 == lb->lb_n_buckets)
    {
        return (dpo_is_drop(load_balance_get_bucket_i(lb, 0)));
    }
    return (0);
}

u16
load_balance_n_buckets (index_t lbi)
{
    load_balance_t *lb;

    lb = load_balance_get(lbi);

    return (lb->lb_n_buckets);
}

void
load_balance_set_fib_entry_flags (index_t lbi,
                                  fib_entry_flag_t flags)
{
    load_balance_t *lb;

    lb = load_balance_get(lbi);
    lb->lb_fib_entry_flags = flags;
}


void
load_balance_set_urpf (index_t lbi,
                       index_t urpf)
{
    load_balance_t *lb;
    index_t old;

    lb = load_balance_get(lbi);

    /*
     * packets in flight may see this change, but the update is a single
     * atomic store, so each packet uses either the old or the new list.
     */
    old = lb->lb_urpf;
    lb->lb_urpf = urpf;

    fib_urpf_list_unlock(old);
    fib_urpf_list_lock(urpf);
}

index_t
load_balance_get_urpf (index_t lbi)
{
    load_balance_t *lb;

    lb = load_balance_get(lbi);

    return (lb->lb_urpf);
}

const dpo_id_t *
load_balance_get_bucket (index_t lbi,
                         u32 bucket)
{
    load_balance_t *lb;

    lb = load_balance_get(lbi);

    return (load_balance_get_bucket_i(lb, bucket));
}

static int
next_hop_sort_by_weight (const load_balance_path_t * n1,
                         const load_balance_path_t * n2)
{
    return ((int) n1->path_weight - (int) n2->path_weight);
}

/* The given next-hop vector is overwritten with a normalized version:
   entries are sorted by weight and each weight is scaled to the number
   of adjacencies (buckets) assigned to that next hop.
   Returns the number of adjacencies in the block. */
u32
ip_multipath_normalize_next_hops (const load_balance_path_t * raw_next_hops,
                                  load_balance_path_t ** normalized_next_hops,
                                  u32 *sum_weight_in,
                                  f64 multipath_next_hop_error_tolerance)
{
    load_balance_path_t * nhs;
    uword n_nhs, n_adj, n_adj_left, i, sum_weight;
    f64 norm, error;

    n_nhs = vec_len (raw_next_hops);
    ASSERT (n_nhs > 0);
    if (n_nhs == 0)
        return 0;

    /* Allocate enough space for 2 copies; we'll use the second copy to save
       the original weights. */
    nhs = *normalized_next_hops;
    vec_validate (nhs, 2*n_nhs - 1);

    /* Fast path: 1 next hop in block. */
    n_adj = n_nhs;
    if (n_nhs == 1)
    {
        nhs[0] = raw_next_hops[0];
        nhs[0].path_weight = 1;
        _vec_len (nhs) = 1;
        sum_weight = 1;
        goto done;
    }

    else if (n_nhs == 2)
    {
        int cmp = next_hop_sort_by_weight (&raw_next_hops[0], &raw_next_hops[1]) < 0;

        /* Fast sort. */
        nhs[0] = raw_next_hops[cmp];
        nhs[1] = raw_next_hops[cmp ^ 1];

        /* Fast path: equal cost multipath with 2 next hops. */
        if (nhs[0].path_weight == nhs[1].path_weight)
        {
            nhs[0].path_weight = nhs[1].path_weight = 1;
            _vec_len (nhs) = 2;
            sum_weight = 2;
            goto done;
        }
    }
    else
    {
        clib_memcpy (nhs, raw_next_hops, n_nhs * sizeof (raw_next_hops[0]));
        qsort (nhs, n_nhs, sizeof (nhs[0]), (void *) next_hop_sort_by_weight);
    }

    /* Find total weight to normalize weights. */
    sum_weight = 0;
    for (i = 0; i < n_nhs; i++)
        sum_weight += nhs[i].path_weight;

    /* In the unlikely case that all weights are given as 0, set them all to 1. */
    if (sum_weight == 0)
    {
        for (i = 0; i < n_nhs; i++)
            nhs[i].path_weight = 1;
        sum_weight = n_nhs;
    }

    /* Save copies of all next hop weights to avoid being overwritten in the
       loop below. */
    for (i = 0; i < n_nhs; i++)
        nhs[n_nhs + i].path_weight = nhs[i].path_weight;

    /* Try larger and larger power of 2 sized adjacency blocks until we
       find one where traffic flows to within the given error tolerance
       of the specified weights. */
    for (n_adj = max_pow2 (n_nhs); ; n_adj *= 2)
    {
        error = 0;

        norm = n_adj / ((f64) sum_weight);
        n_adj_left = n_adj;
        for (i = 0; i < n_nhs; i++)
        {
            f64 nf = nhs[n_nhs + i].path_weight * norm; /* use saved weights */
            word n = flt_round_nearest (nf);

            n = n > n_adj_left ? n_adj_left : n;
            n_adj_left -= n;
            error += fabs (nf - n);
            nhs[i].path_weight = n;

            if (0 == nhs[i].path_weight)
            {
                /*
                 * This happens when the weight skew is high (norm is small)
                 * and nf rounds down to zero. Without this correction the
                 * path with the low weight would have no representation in
                 * the load-balance - we don't want that. Forcing the error
                 * to the maximum makes the loop retry with more buckets, so
                 * a highly skewed weight set is paid for with a larger
                 * load-balance. You pays your money, you takes your choice.
                 */
                error = n_adj;
                break;
            }
        }

        nhs[0].path_weight += n_adj_left;

        /* Is the accumulated error within tolerance for this size of
           adjacency block? */
        if (error <= multipath_next_hop_error_tolerance*n_adj)
        {
            /* Truncate any next hops with zero weight. */
            _vec_len (nhs) = i;
            break;
        }
    }

done:
    /* Save vector for next call. */
    *normalized_next_hops = nhs;
    *sum_weight_in = sum_weight;
    return n_adj;
}
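
/*
 * A worked example of the normalization above (derived from the code, not
 * part of the build): two paths with weights {2, 1}, so sum_weight = 3,
 * with the default tolerance of 0.1:
 *
 *   n_adj = 2: norm = 2/3;  2 -> 1.33 -> 1,  1 -> 0.67 -> 1;  error = 0.67 > 0.2
 *   n_adj = 4: norm = 4/3;  2 -> 2.67 -> 3,  1 -> 1.33 -> 1;  error = 0.67 > 0.4
 *   n_adj = 8: norm = 8/3;  2 -> 5.33 -> 5,  1 -> 2.67 -> 3;  error = 0.67 <= 0.8
 *
 * so the weights are normalized to {5, 3} and the caller builds an
 * 8 bucket load-balance.
 */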

static load_balance_path_t *
load_balance_multipath_next_hop_fixup (const load_balance_path_t *nhs,
                                       dpo_proto_t drop_proto)
{
    if (0 == vec_len(nhs))
    {
        load_balance_path_t *new_nhs = NULL, *nh;

        /*
         * we need something for the load-balance, so use the drop
         */
        vec_add2(new_nhs, nh, 1);

        nh->path_weight = 1;
        dpo_copy(&nh->path_dpo, drop_dpo_get(drop_proto));

        return (new_nhs);
    }

    return (NULL);
}

/*
 * Fill in the adjacencies in the block based on the corresponding
 * next hop adjacencies.
 */
static void
load_balance_fill_buckets (load_balance_t *lb,
                           load_balance_path_t *nhs,
                           dpo_id_t *buckets,
                           u32 n_buckets)
{
    load_balance_path_t * nh;
    u16 ii, bucket;

    bucket = 0;

    /*
     * the next-hops have normalised weights, which means their sum is the
     * number of buckets we need to fill.
     */
    vec_foreach (nh, nhs)
    {
        for (ii = 0; ii < nh->path_weight; ii++)
        {
            ASSERT(bucket < n_buckets);
            load_balance_set_bucket_i(lb, bucket++, buckets, &nh->path_dpo);
        }
    }
}
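
/*
 * Continuing the example above: normalized weights {5, 3} over 8 buckets
 * produce the layout
 *
 *   bucket:  0    1    2    3    4    5    6    7
 *   path:   nh0  nh0  nh0  nh0  nh0  nh1  nh1  nh1
 */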

static inline void
load_balance_set_n_buckets (load_balance_t *lb,
                            u32 n_buckets)
{
    lb->lb_n_buckets = n_buckets;
    lb->lb_n_buckets_minus_1 = n_buckets-1;
}
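
/*
 * The bucket count produced by ip_multipath_normalize_next_hops() is
 * always a power of 2, so the data-plane can select a bucket with a mask
 * rather than a modulo, e.g.:
 *
 *   dpo0 = load_balance_get_bucket_i(lb0,
 *                                    flow_hash & lb0->lb_n_buckets_minus_1);
 */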

void
load_balance_multipath_update (const dpo_id_t *dpo,
                               const load_balance_path_t * raw_nhs,
                               load_balance_flags_t flags)
{
    load_balance_path_t *nh, *nhs, *fixed_nhs;
    u32 sum_of_weights, n_buckets, ii;
    index_t lbmi, old_lbmi;
    load_balance_t *lb;
    dpo_id_t *tmp_dpo;

    nhs = NULL;

    ASSERT(DPO_LOAD_BALANCE == dpo->dpoi_type);
    lb = load_balance_get(dpo->dpoi_index);
    fixed_nhs = load_balance_multipath_next_hop_fixup(raw_nhs, lb->lb_proto);
    n_buckets =
        ip_multipath_normalize_next_hops((NULL == fixed_nhs ?
                                          raw_nhs :
                                          fixed_nhs),
                                         &nhs,
                                         &sum_of_weights,
                                         multipath_next_hop_error_tolerance);

    ASSERT (n_buckets >= vec_len (raw_nhs));

    /*
     * Save the old load-balance map used, and get a new one if required.
     */
    old_lbmi = lb->lb_map;
    if (flags & LOAD_BALANCE_FLAG_USES_MAP)
    {
        lbmi = load_balance_map_add_or_lock(n_buckets, sum_of_weights, nhs);
    }
    else
    {
        lbmi = INDEX_INVALID;
    }

    if (0 == lb->lb_n_buckets)
    {
        /*
         * first time initialisation. No packets are in flight, so we can
         * write at leisure.
         */
        load_balance_set_n_buckets(lb, n_buckets);

        if (!LB_HAS_INLINE_BUCKETS(lb))
            vec_validate_aligned(lb->lb_buckets,
                                 lb->lb_n_buckets - 1,
                                 CLIB_CACHE_LINE_BYTES);

        load_balance_fill_buckets(lb, nhs,
                                  load_balance_get_buckets(lb),
                                  n_buckets);
        lb->lb_map = lbmi;
    }
    else
    {
        /*
         * This is a modification of an existing load-balance.
         * We need to ensure that packets in flight see a consistent state;
         * that is, the number of buckets the LB reports (read from
         * lb_n_buckets_minus_1) is never more than it actually has. So if
         * the number of buckets is increasing, we must update the bucket
         * array first, then the reported number; vice-versa if the number
         * of buckets goes down.
         */
        if (n_buckets == lb->lb_n_buckets)
        {
            /*
             * no change in the number of buckets. we can simply fill what
             * is new over what is old.
             */
            load_balance_fill_buckets(lb, nhs,
                                      load_balance_get_buckets(lb),
                                      n_buckets);
            lb->lb_map = lbmi;
        }
        else if (n_buckets > lb->lb_n_buckets)
        {
            /*
             * we have more buckets. the old load-balance map (if there is one)
             * will remain valid, i.e. mapping to indices within range, so we
             * update it last.
             */
            if (n_buckets > LB_NUM_INLINE_BUCKETS &&
                lb->lb_n_buckets <= LB_NUM_INLINE_BUCKETS)
            {
                /*
                 * the new increased number of buckets is crossing the
                 * threshold from inline storage to out-of-line. Alloc the
                 * out-of-line buckets first, then fix up the number, then
                 * reset the inlines.
                 */
                ASSERT(NULL == lb->lb_buckets);
                vec_validate_aligned(lb->lb_buckets,
                                     n_buckets - 1,
                                     CLIB_CACHE_LINE_BYTES);

                load_balance_fill_buckets(lb, nhs,
                                          lb->lb_buckets,
                                          n_buckets);
                CLIB_MEMORY_BARRIER();
                load_balance_set_n_buckets(lb, n_buckets);

                CLIB_MEMORY_BARRIER();

                for (ii = 0; ii < LB_NUM_INLINE_BUCKETS; ii++)
                {
                    dpo_reset(&lb->lb_buckets_inline[ii]);
                }
            }
            else
            {
                if (n_buckets <= LB_NUM_INLINE_BUCKETS)
                {
                    /*
                     * we are not crossing the threshold and it's still
                     * inline buckets. we can write the new over the old.
                     */
                    load_balance_fill_buckets(lb, nhs,
                                              load_balance_get_buckets(lb),
                                              n_buckets);
                    CLIB_MEMORY_BARRIER();
                    load_balance_set_n_buckets(lb, n_buckets);
                }
                else
                {
                    /*
                     * we are not crossing the threshold. We need a new
                     * bucket array to hold the increased number of choices.
                     */
                    dpo_id_t *new_buckets, *old_buckets, *tmp_dpo;

                    new_buckets = NULL;
                    old_buckets = load_balance_get_buckets(lb);

                    vec_validate_aligned(new_buckets,
                                         n_buckets - 1,
                                         CLIB_CACHE_LINE_BYTES);

                    load_balance_fill_buckets(lb, nhs, new_buckets, n_buckets);
                    CLIB_MEMORY_BARRIER();
                    lb->lb_buckets = new_buckets;
                    CLIB_MEMORY_BARRIER();
                    load_balance_set_n_buckets(lb, n_buckets);

                    vec_foreach(tmp_dpo, old_buckets)
                    {
                        dpo_reset(tmp_dpo);
                    }
                    vec_free(old_buckets);
                }
            }

            /*
             * buckets fixed. ready for the MAP update.
             */
            lb->lb_map = lbmi;
        }
        else
        {
            /*
             * bucket size shrinkage.
             * Any map we have will be based on the old, larger number of
             * buckets, so it would be translating to indices out of range.
             * So the new MAP must be installed first.
             */
            lb->lb_map = lbmi;
            CLIB_MEMORY_BARRIER();

            if (n_buckets <= LB_NUM_INLINE_BUCKETS &&
                lb->lb_n_buckets > LB_NUM_INLINE_BUCKETS)
            {
                /*
                 * the new decreased number of buckets is crossing the
                 * threshold from out-of-line storage to inline:
                 *   1 - Fill the inline buckets,
                 *   2 - fix up the number (at this point the inline buckets
                 *       are in use).
                 *   3 - free the out-of-line buckets
                 */
                load_balance_fill_buckets(lb, nhs,
                                          lb->lb_buckets_inline,
                                          n_buckets);
                CLIB_MEMORY_BARRIER();
                load_balance_set_n_buckets(lb, n_buckets);
                CLIB_MEMORY_BARRIER();

                vec_foreach(tmp_dpo, lb->lb_buckets)
                {
                    dpo_reset(tmp_dpo);
                }
                vec_free(lb->lb_buckets);
            }
            else
            {
                /*
                 * not crossing the threshold.
                 *   1 - update the number to the smaller size
                 *   2 - write the new buckets
                 *   3 - reset those no longer used.
                 */
                dpo_id_t *buckets;
                u32 old_n_buckets;

                old_n_buckets = lb->lb_n_buckets;
                buckets = load_balance_get_buckets(lb);

                load_balance_set_n_buckets(lb, n_buckets);
                CLIB_MEMORY_BARRIER();

                load_balance_fill_buckets(lb, nhs,
                                          buckets,
                                          n_buckets);

                for (ii = n_buckets; ii < old_n_buckets; ii++)
                {
                    dpo_reset(&buckets[ii]);
                }
            }
        }
    }

    vec_foreach (nh, nhs)
    {
        dpo_reset(&nh->path_dpo);
    }
    vec_free(nhs);
    vec_free(fixed_nhs);

    load_balance_map_unlock(old_lbmi);
}
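
/*
 * A minimal sketch of a multipath update (illustrative only): install two
 * paths with a 2:1 weight split, assuming 'dpo' identifies an existing
 * load-balance and dpo1/dpo2 are already-resolved child DPOs.
 *
 *   load_balance_path_t *paths = NULL, *path;
 *
 *   vec_add2(paths, path, 1);
 *   path->path_weight = 2;
 *   dpo_copy(&path->path_dpo, &dpo1);
 *
 *   vec_add2(paths, path, 1);
 *   path->path_weight = 1;
 *   dpo_copy(&path->path_dpo, &dpo2);
 *
 *   load_balance_multipath_update(&dpo, paths, LOAD_BALANCE_FLAG_NONE);
 *
 * With the default tolerance this yields the 8 bucket, 5/3 split shown in
 * the normalization example above.
 */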

static void
load_balance_lock (dpo_id_t *dpo)
{
    load_balance_t *lb;

    lb = load_balance_get(dpo->dpoi_index);

    lb->lb_locks++;
}

static void
load_balance_destroy (load_balance_t *lb)
{
    dpo_id_t *buckets;
    int i;

    buckets = load_balance_get_buckets(lb);

    for (i = 0; i < lb->lb_n_buckets; i++)
    {
        dpo_reset(&buckets[i]);
    }

    LB_DBG(lb, "destroy");
    if (!LB_HAS_INLINE_BUCKETS(lb))
    {
        vec_free(lb->lb_buckets);
    }

    fib_urpf_list_unlock(lb->lb_urpf);
    load_balance_map_unlock(lb->lb_map);

    pool_put(load_balance_pool, lb);
}

static void
load_balance_unlock (dpo_id_t *dpo)
{
    load_balance_t *lb;

    lb = load_balance_get(dpo->dpoi_index);

    lb->lb_locks--;

    if (0 == lb->lb_locks)
    {
        load_balance_destroy(lb);
    }
}

static void
load_balance_mem_show (void)
{
    fib_show_memory_usage("load-balance",
                          pool_elts(load_balance_pool),
                          pool_len(load_balance_pool),
                          sizeof(load_balance_t));
    load_balance_map_show_mem();
}

const static dpo_vft_t lb_vft = {
    .dv_lock = load_balance_lock,
    .dv_unlock = load_balance_unlock,
    .dv_format = format_load_balance_dpo,
    .dv_mem_show = load_balance_mem_show,
};

/**
 * @brief The per-protocol VLIB graph nodes that are assigned to a
 *        load-balance object.
 *
 * This means that these graph nodes are ones from which a load-balance is
 * the parent object in the DPO-graph.
 *
 * We do not list all the load-balance nodes, such as the *-lookup nodes.
 * Instead we rely on the correct use of the .sibling_of field when setting
 * up these sibling nodes.
 */
const static char* const load_balance_ip4_nodes[] =
{
    "ip4-load-balance",
    NULL,
};
const static char* const load_balance_ip6_nodes[] =
{
    "ip6-load-balance",
    NULL,
};
const static char* const load_balance_mpls_nodes[] =
{
    "mpls-load-balance",
    NULL,
};
const static char* const load_balance_l2_nodes[] =
{
    "l2-load-balance",
    NULL,
};
const static char* const load_balance_nsh_nodes[] =
{
    "nsh-load-balance",
    NULL,
};
const static char* const load_balance_bier_nodes[] =
{
    "bier-load-balance",
    NULL,
};
const static char* const * const load_balance_nodes[DPO_PROTO_NUM] =
{
    [DPO_PROTO_IP4] = load_balance_ip4_nodes,
    [DPO_PROTO_IP6] = load_balance_ip6_nodes,
    [DPO_PROTO_MPLS] = load_balance_mpls_nodes,
    [DPO_PROTO_ETHERNET] = load_balance_l2_nodes,
    [DPO_PROTO_NSH] = load_balance_nsh_nodes,
    [DPO_PROTO_BIER] = load_balance_bier_nodes,
};

void
load_balance_module_init (void)
{
    index_t lbi;

    dpo_register(DPO_LOAD_BALANCE, &lb_vft, load_balance_nodes);

    /*
     * Special LB with index zero. We need to define this since the v4 mtrie
     * assumes an index of 0 implies the ply is empty. Therefore all 'real'
     * adjs need a non-zero index.
     * This should never be used, but just in case, stack it on a drop.
     */
    lbi = load_balance_create(1, DPO_PROTO_IP4, 0);
    load_balance_set_bucket(lbi, 0, drop_dpo_get(DPO_PROTO_IP4));

    load_balance_map_module_init();
}

static clib_error_t *
load_balance_show (vlib_main_t * vm,
                   unformat_input_t * input,
                   vlib_cli_command_t * cmd)
{
    index_t lbi = INDEX_INVALID;

    while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
    {
        if (unformat (input, "%d", &lbi))
            ;
        else
            break;
    }

    if (INDEX_INVALID != lbi)
    {
        vlib_cli_output (vm, "%U", format_load_balance, lbi,
                         LOAD_BALANCE_FORMAT_DETAIL);
    }
    else
    {
        load_balance_t *lb;

        pool_foreach(lb, load_balance_pool,
        ({
            vlib_cli_output (vm, "%U", format_load_balance,
                             load_balance_get_index(lb),
                             LOAD_BALANCE_FORMAT_NONE);
        }));
    }

    return 0;
}

VLIB_CLI_COMMAND (load_balance_show_command, static) = {
    .path = "show load-balance",
    .short_help = "show load-balance [<index>]",
    .function = load_balance_show,
};


always_inline u32
ip_flow_hash (void *data)
{
    ip4_header_t *iph = (ip4_header_t *) data;

    if ((iph->ip_version_and_header_length & 0xF0) == 0x40)
        return ip4_compute_flow_hash (iph, IP_FLOW_HASH_DEFAULT);
    else
        return ip6_compute_flow_hash ((ip6_header_t *) iph, IP_FLOW_HASH_DEFAULT);
}

always_inline u64
mac_to_u64 (u8 * m)
{
    return (*((u64 *) m) & 0xffffffffffff);
}
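
/*
 * Note: the load above reads 8 bytes and masks to the low 48 bits; on a
 * little-endian machine those are the 6 bytes of the MAC address (the
 * 2 bytes of over-read land in the adjacent header and are masked off).
 */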

always_inline u32
l2_flow_hash (vlib_buffer_t * b0)
{
    ethernet_header_t *eh;
    u64 a, b, c;
    uword is_ip, eh_size;
    u16 eh_type;

    eh = vlib_buffer_get_current (b0);
    eh_type = clib_net_to_host_u16 (eh->type);
    eh_size = ethernet_buffer_header_size (b0);

    is_ip = (eh_type == ETHERNET_TYPE_IP4 || eh_type == ETHERNET_TYPE_IP6);

    /* since we have 2 cache lines, use them */
    if (is_ip)
        a = ip_flow_hash ((u8 *) vlib_buffer_get_current (b0) + eh_size);
    else
        a = eh->type;

    b = mac_to_u64 ((u8 *) eh->dst_address);
    c = mac_to_u64 ((u8 *) eh->src_address);
    hash_mix64 (a, b, c);

    return (u32) c;
}

typedef struct load_balance_trace_t_
{
    index_t lb_index;
} load_balance_trace_t;

always_inline uword
load_balance_inline (vlib_main_t * vm,
                     vlib_node_runtime_t * node,
                     vlib_frame_t * frame,
                     int is_l2)
{
    u32 n_left_from, next_index, *from, *to_next;

    from = vlib_frame_vector_args (frame);
    n_left_from = frame->n_vectors;

    next_index = node->cached_next_index;

    while (n_left_from > 0)
    {
        u32 n_left_to_next;

        vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

        while (n_left_from > 0 && n_left_to_next > 0)
        {
            vlib_buffer_t *b0;
            u32 bi0, lbi0, next0;
            const dpo_id_t *dpo0;
            const load_balance_t *lb0;

            bi0 = from[0];
            to_next[0] = bi0;
            from += 1;
            to_next += 1;
            n_left_from -= 1;
            n_left_to_next -= 1;

            b0 = vlib_get_buffer (vm, bi0);

            /* lookup dst + src mac */
            lbi0 = vnet_buffer (b0)->ip.adj_index[VLIB_TX];
            lb0 = load_balance_get(lbi0);

            if (is_l2)
            {
                vnet_buffer(b0)->ip.flow_hash = l2_flow_hash(b0);
            }
            else
            {
                /* it's BIER */
                const bier_hdr_t *bh0 = vlib_buffer_get_current(b0);
                vnet_buffer(b0)->ip.flow_hash = bier_hdr_get_entropy(bh0);
            }

            dpo0 = load_balance_get_bucket_i(lb0,
                                             vnet_buffer(b0)->ip.flow_hash &
                                             (lb0->lb_n_buckets_minus_1));

            next0 = dpo0->dpoi_next_node;
            vnet_buffer (b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;

            if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
            {
                load_balance_trace_t *tr = vlib_add_trace (vm, node, b0,
                                                           sizeof (*tr));
                tr->lb_index = lbi0;
            }
            vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
                                             n_left_to_next, bi0, next0);
        }

        vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

    return frame->n_vectors;
}

static uword
l2_load_balance (vlib_main_t * vm,
                 vlib_node_runtime_t * node,
                 vlib_frame_t * frame)
{
    return (load_balance_inline(vm, node, frame, 1));
}

static u8 *
format_l2_load_balance_trace (u8 * s, va_list * args)
{
    CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
    CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
    load_balance_trace_t *t = va_arg (*args, load_balance_trace_t *);

    s = format (s, "L2-load-balance: index %d", t->lb_index);
    return s;
}

/**
 * @brief
 */
VLIB_REGISTER_NODE (l2_load_balance_node) = {
    .function = l2_load_balance,
    .name = "l2-load-balance",
    .vector_size = sizeof (u32),

    .format_trace = format_l2_load_balance_trace,
    .n_next_nodes = 1,
    .next_nodes = {
        [0] = "error-drop",
    },
};

static uword
nsh_load_balance (vlib_main_t * vm,
                  vlib_node_runtime_t * node,
                  vlib_frame_t * frame)
{
    u32 n_left_from, next_index, *from, *to_next;

    from = vlib_frame_vector_args (frame);
    n_left_from = frame->n_vectors;

    next_index = node->cached_next_index;

    while (n_left_from > 0)
    {
        u32 n_left_to_next;

        vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

        while (n_left_from > 0 && n_left_to_next > 0)
        {
            vlib_buffer_t *b0;
            u32 bi0, lbi0, next0, *nsh0;
            const dpo_id_t *dpo0;
            const load_balance_t *lb0;

            bi0 = from[0];
            to_next[0] = bi0;
            from += 1;
            to_next += 1;
            n_left_from -= 1;
            n_left_to_next -= 1;

            b0 = vlib_get_buffer (vm, bi0);

            lbi0 = vnet_buffer (b0)->ip.adj_index[VLIB_TX];
            lb0 = load_balance_get(lbi0);

            /* SPI + SI are the second word of the NSH header */
            nsh0 = vlib_buffer_get_current (b0);
            vnet_buffer(b0)->ip.flow_hash = nsh0[1] % lb0->lb_n_buckets;

            dpo0 = load_balance_get_bucket_i(lb0,
                                             vnet_buffer(b0)->ip.flow_hash &
                                             (lb0->lb_n_buckets_minus_1));

            next0 = dpo0->dpoi_next_node;
            vnet_buffer (b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;

            if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
            {
                load_balance_trace_t *tr = vlib_add_trace (vm, node, b0,
                                                           sizeof (*tr));
                tr->lb_index = lbi0;
            }
            vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
                                             n_left_to_next, bi0, next0);
        }

        vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

    return frame->n_vectors;
}

static u8 *
format_nsh_load_balance_trace (u8 * s, va_list * args)
{
    CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
    CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
    load_balance_trace_t *t = va_arg (*args, load_balance_trace_t *);

    s = format (s, "NSH-load-balance: index %d", t->lb_index);
    return s;
}

/**
 * @brief
 */
VLIB_REGISTER_NODE (nsh_load_balance_node) = {
    .function = nsh_load_balance,
    .name = "nsh-load-balance",
    .vector_size = sizeof (u32),

    .format_trace = format_nsh_load_balance_trace,
    .n_next_nodes = 1,
    .next_nodes = {
        [0] = "error-drop",
    },
};

static u8 *
format_bier_load_balance_trace (u8 * s, va_list * args)
{
    CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
    CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
    load_balance_trace_t *t = va_arg (*args, load_balance_trace_t *);

    s = format (s, "BIER-load-balance: index %d", t->lb_index);
    return s;
}

static uword
bier_load_balance (vlib_main_t * vm,
                   vlib_node_runtime_t * node,
                   vlib_frame_t * frame)
{
    return (load_balance_inline(vm, node, frame, 0));
}

/**
 * @brief
 */
VLIB_REGISTER_NODE (bier_load_balance_node) = {
    .function = bier_load_balance,
    .name = "bier-load-balance",
    .vector_size = sizeof (u32),

    .format_trace = format_bier_load_balance_trace,
    .sibling_of = "mpls-load-balance",
};