/*
* Copyright (c) 2015 Cisco and/or its affiliates.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
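/* Hierarchical timing wheel.  Events live in per-level arrays of bins:
   level 0 bins are 2^log2_clocks_per_bin CPU clocks wide, and each higher
   level's bins are 2^log2_bins_per_wheel times wider than the level below.
   Expiration times are stored as 32 bit offsets from cpu_time_base; events
   too far in the future are parked in an overflow pool until they fit.

   Minimal usage sketch (illustrative comment only; assumes a
   zero-initialized wheel, clib_cpu_time_now () from vppinfra/time.h as the
   time source, and an example clock rate of 2.5e9 Hz; delay_in_clocks and
   user_data are placeholders):

     timing_wheel_t w = { 0 };
     u32 *expired = 0;
     u64 next_expire;

     timing_wheel_init (&w, clib_cpu_time_now (), 2.5e9);
     timing_wheel_insert (&w, clib_cpu_time_now () + delay_in_clocks, user_data);
     ...
     expired = timing_wheel_advance (&w, clib_cpu_time_now (), expired,
                                     &next_expire);
     // expired is a vector of user_data for events whose time has passed;
     // callers typically reset it with _vec_len (expired) = 0 between calls.
*/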
#include <vppinfra/bitmap.h>
#include <vppinfra/hash.h>
#include <vppinfra/pool.h>
#include <vppinfra/timing_wheel.h>
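/* Initialize wheel geometry: bin size is derived from min_sched_time and
   bins-per-wheel from max_sched_time (defaulting to 1e-6 and 1e-3 seconds),
   and the 32 bit relative time base is anchored at current_cpu_time. */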
void
timing_wheel_init (timing_wheel_t * w, u64 current_cpu_time,
f64 cpu_clocks_per_second)
{
if (w->max_sched_time <= w->min_sched_time)
{
w->min_sched_time = 1e-6;
w->max_sched_time = 1e-3;
}
w->cpu_clocks_per_second = cpu_clocks_per_second;
w->log2_clocks_per_bin =
max_log2 (w->cpu_clocks_per_second * w->min_sched_time);
w->log2_bins_per_wheel =
max_log2 (w->cpu_clocks_per_second * w->max_sched_time);
w->log2_bins_per_wheel -= w->log2_clocks_per_bin;
w->log2_clocks_per_wheel = w->log2_bins_per_wheel + w->log2_clocks_per_bin;
w->bins_per_wheel = 1 << w->log2_bins_per_wheel;
w->bins_per_wheel_mask = w->bins_per_wheel - 1;
w->current_time_index = current_cpu_time >> w->log2_clocks_per_bin;
if (w->n_wheel_elt_time_bits <= 0 ||
w->n_wheel_elt_time_bits >= STRUCT_BITS_OF (timing_wheel_elt_t,
cpu_time_relative_to_base))
w->n_wheel_elt_time_bits =
STRUCT_BITS_OF (timing_wheel_elt_t, cpu_time_relative_to_base) - 1;
w->cpu_time_base = current_cpu_time;
w->time_index_next_cpu_time_base_update
=
w->current_time_index +
((u64) 1 << (w->n_wheel_elt_time_bits - w->log2_clocks_per_bin));
}
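/* Map an absolute cpu time to a wheel level and an offset (in bins) within
   that level, measured from the wheel's current time index. */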
always_inline uword
get_level_and_relative_time (timing_wheel_t * w, u64 cpu_time,
uword * rtime_result)
{
u64 dt, rtime;
uword level_index;
dt = (cpu_time >> w->log2_clocks_per_bin);
/* Time should always move forward. */
ASSERT (dt >= w->current_time_index);
dt -= w->current_time_index;
/* Find level and offset within level. Level i has bins of size 2^((i+1)*M) */
rtime = dt;
for (level_index = 0; (rtime >> w->log2_bins_per_wheel) != 0; level_index++)
rtime = (rtime >> w->log2_bins_per_wheel) - 1;
/* Return offset within level and level index. */
ASSERT (rtime < w->bins_per_wheel);
*rtime_result = rtime;
return level_index;
}
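/* Slot on a given level that a time index falls into. */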
always_inline uword
time_index_to_wheel_index (timing_wheel_t * w, uword level_index, u64 ti)
{
return (ti >> (level_index * w->log2_bins_per_wheel)) &
w->bins_per_wheel_mask;
}
/* Find current time on this level. */
always_inline uword
current_time_wheel_index (timing_wheel_t * w, uword level_index)
{
return time_index_to_wheel_index (w, level_index, w->current_time_index);
}
/* Circular wheel indexing. */
always_inline uword
wheel_add (timing_wheel_t * w, word x)
{
return x & w->bins_per_wheel_mask;
}
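/* Slot on a given level that is rtime bins ahead of the current time. */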
always_inline uword
rtime_to_wheel_index (timing_wheel_t * w, uword level_index, uword rtime)
{
uword t = current_time_wheel_index (w, level_index);
return wheel_add (w, t + rtime);
}
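/* Debug helper: check that the occupancy bitmap matches the per-slot element
   vectors and that every element sits on a level/slot consistent with its
   expiration time. */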
static clib_error_t *
validate_level (timing_wheel_t * w, uword level_index, uword * n_elts)
{
timing_wheel_level_t *level;
timing_wheel_elt_t *e;
uword wi;
clib_error_t *error = 0;
#define _(x) \
do { \
error = CLIB_ERROR_ASSERT (x); \
ASSERT (! error); \
if (error) return error; \
} while (0)
level = vec_elt_at_index (w->levels, level_index);
for (wi = 0; wi < vec_len (level->elts); wi++)
{
/* Validate occupancy bitmap. */
_(clib_bitmap_get_no_check (level->occupancy_bitmap, wi) ==
(vec_len (level->elts[wi]) > 0));
*n_elts += vec_len (level->elts[wi]);
vec_foreach (e, level->elts[wi])
{
/* Validate time bin and level. */
u64 e_time;
uword e_ti, e_li, e_wi;
e_time = e->cpu_time_relative_to_base + w->cpu_time_base;
e_li = get_level_and_relative_time (w, e_time, &e_ti);
e_wi = rtime_to_wheel_index (w, level_index, e_ti);
if (e_li == level_index - 1)
/* If this element was scheduled on the previous level
it must be wrapped. */
_(e_ti + current_time_wheel_index (w, level_index - 1)
>= w->bins_per_wheel);
else
{
_(e_li == level_index);
if (e_li == 0)
_(e_wi == wi);
else
_(e_wi == wi || e_wi + 1 == wi || e_wi - 1 == wi);
}
}
}
#undef _
return error;
}
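/* Validate all levels; no-op unless w->validate is set. */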
void
timing_wheel_validate (timing_wheel_t * w)
{
uword l;
clib_error_t *error = 0;
uword n_elts;
if (!w->validate)
return;
n_elts = pool_elts (w->overflow_pool);
for (l = 0; l < vec_len (w->levels); l++)
{
error = validate_level (w, l, &n_elts);
if (error)
clib_error_report (error);
}
}
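/* Return an emptied element vector to the free list for reuse. */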
always_inline void
free_elt_vector (timing_wheel_t * w, timing_wheel_elt_t * ev)
{
/* Poison free elements so we never use them by mistake. */
if (CLIB_DEBUG > 0)
memset (ev, ~0, vec_len (ev) * sizeof (ev[0]));
_vec_len (ev) = 0;
vec_add1 (w->free_elt_vectors, ev);
}
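/* Make sure the level and its slot exist, mark the slot occupied and return
   a fresh element appended to the slot's vector. */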
static timing_wheel_elt_t *
insert_helper (timing_wheel_t * w, uword level_index, uword rtime)
{
timing_wheel_level_t *level;
timing_wheel_elt_t *e;
uword wheel_index;
/* Circular buffer. */
vec_validate (w->levels, level_index);
level = vec_elt_at_index (w->levels, level_index);
if (PREDICT_FALSE (!level->elts))
{
uword max = w->bins_per_wheel - 1;
clib_bitmap_validate (level->occupancy_bitmap, max);
vec_validate (level->elts, max);
}
wheel_index = rtime_to_wheel_index (w, level_index, rtime);
level->occupancy_bitmap =
clib_bitmap_ori (level->occupancy_bitmap, wheel_index);
/* Allocate an elt vector from free list if there is one. */
if (!level->elts[wheel_index] && vec_len (w->free_elt_vectors))
level->elts[wheel_index] = vec_pop (w->free_elt_vectors);
/* Add element to vector for this time bin. */
vec_add2 (level->elts[wheel_index], e, 1);
return e;
}
/* Insert user data on wheel at given CPU time stamp. */
static void
timing_wheel_insert_helper (timing_wheel_t * w, u64 insert_cpu_time,
u32 user_data)
{
timing_wheel_elt_t *e;
u64 dt;
uword rtime, level_index;
level_index = get_level_and_relative_time (w, insert_cpu_time, &rtime);
dt = insert_cpu_time - w->cpu_time_base;
if (PREDICT_TRUE (0 == (dt >> BITS (e->cpu_time_relative_to_base))))
{
e = insert_helper (w, level_index, rtime);
e->user_data = user_data;
e->cpu_time_relative_to_base = dt;
}
else
{
/* Time too far in the future to fit in 32 bits: park in overflow pool. */
timing_wheel_overflow_elt_t *oe;
pool_get (w->overflow_pool, oe);
oe->user_data = user_data;
oe->cpu_time = insert_cpu_time;
}
}
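/* Has this user_data been marked deleted via timing_wheel_delete? */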
always_inline uword
elt_is_deleted (timing_wheel_t * w, u32 user_data)
{
return (hash_elts (w->deleted_user_data_hash) > 0
&& hash_get (w->deleted_user_data_hash, user_data));
}
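/* Rebuild an element vector with all entries matching user_data removed. */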
static timing_wheel_elt_t *
delete_user_data (timing_wheel_elt_t * elts, u32 user_data)
{
uword found_match;
timing_wheel_elt_t *e, *new_elts;
/* Quickly scan to see if there are any elements to delete
in this bucket. */
found_match = 0;
vec_foreach (e, elts)
{
found_match = e->user_data == user_data;
if (found_match)
break;
}
if (!found_match)
return elts;
/* Re-scan to build vector of new elts with matching user_data deleted. */
new_elts = 0;
vec_foreach (e, elts)
{
if (e->user_data != user_data)
vec_add1 (new_elts, e[0]);
}
vec_free (elts);
return new_elts;
}
/* Insert user data on wheel at given CPU time stamp. */
void
timing_wheel_insert (timing_wheel_t * w, u64 insert_cpu_time, u32 user_data)
{
/* Remove previously deleted elements. */
if (elt_is_deleted (w, user_data))
{
timing_wheel_level_t *l;
uword wi;
/* Delete elts with given user data so that stale events don't expire. */
vec_foreach (l, w->levels)
{
/* *INDENT-OFF* */
clib_bitmap_foreach (wi, l->occupancy_bitmap, ({
l->elts[wi] = delete_user_data (l->elts[wi], user_data);
if (vec_len (l->elts[wi]) == 0)
l->occupancy_bitmap = clib_bitmap_andnoti (l->occupancy_bitmap, wi);
}));
/* *INDENT-ON* */
}
{
timing_wheel_overflow_elt_t *oe;
/* *INDENT-OFF* */
pool_foreach (oe, w->overflow_pool, ({
if (oe->user_data == user_data)
pool_put (w->overflow_pool, oe);
}));
/* *INDENT-ON* */
}
hash_unset (w->deleted_user_data_hash, user_data);
}
timing_wheel_insert_helper (w, insert_cpu_time, user_data);
}
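/* Mark user data as deleted; elements are actually removed lazily, either
   when the same user data is re-inserted or when they would expire. */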
void
timing_wheel_delete (timing_wheel_t * w, u32 user_data)
{
if (!w->deleted_user_data_hash)
w->deleted_user_data_hash =
hash_create ( /* capacity */ 0, /* value bytes */ 0);
hash_set1 (w->deleted_user_data_hash, user_data);
}
/* Returns time of next expiring element. */
u64
timing_wheel_next_expiring_elt_time (timing_wheel_t * w)
{
timing_wheel_level_t *l;
timing_wheel_elt_t *e;
uword li, wi, wi0;
u32 min_dt;
u64 min_t;
uword wrapped = 0;
min_dt = ~0;
min_t = ~0ULL;
vec_foreach (l, w->levels)
{
if (!l->occupancy_bitmap)
continue;
li = l - w->levels;
wi0 = wi = current_time_wheel_index (w, li);
wrapped = 0;
while (1)
{
if (clib_bitmap_get_no_check (l->occupancy_bitmap, wi))
{
vec_foreach (e, l->elts[wi])
min_dt = clib_min (min_dt, e->cpu_time_relative_to_base);
if (wrapped && li + 1 < vec_len (w->levels))
{
uword wi1 = current_time_wheel_index (w, li + 1);
if (l[1].occupancy_bitmap
&& clib_bitmap_get_no_check (l[1].occupancy_bitmap, wi1))
{
vec_foreach (e, l[1].elts[wi1])
{
min_dt =
clib_min (min_dt, e->cpu_time_relative_to_base);
}
}
}
min_t = w->cpu_time_base + min_dt;
goto done;
}
wi = wheel_add (w, wi + 1);
if (wi == wi0)
break;
wrapped = wi != wi + 1;
}
}
{
timing_wheel_overflow_elt_t *oe;
if (min_dt != ~0)
min_t = w->cpu_time_base + min_dt;
/* *INDENT-OFF* */
pool_foreach (oe, w->overflow_pool,
({ min_t = clib_min (min_t, oe->cpu_time); }));
/* *INDENT-ON* */
done:
return min_t;
}
}
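/* Re-insert an element (e.g. one left pending by a level refill) using its
   absolute expiration time. */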
static inline void
insert_elt (timing_wheel_t * w, timing_wheel_elt_t * e)
{
u64 t = w->cpu_time_base + e->cpu_time_relative_to_base;
timing_wheel_insert_helper (w, t, e->user_data);
}
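/* Absolute expiration time of an element. */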
always_inline u64
elt_cpu_time (timing_wheel_t * w, timing_wheel_elt_t * e)
{
return w->cpu_time_base + e->cpu_time_relative_to_base;
}
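/* Debug check that an element being expired is not in the future. */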
always_inline void
validate_expired_elt (timing_wheel_t * w, timing_wheel_elt_t * e,
u64 current_cpu_time)
{
if (CLIB_DEBUG > 0)
{
u64 e_time = elt_cpu_time (w, e);
/* Verify that element is actually expired. */
ASSERT ((e_time >> w->log2_clocks_per_bin)
<= (current_cpu_time >> w->log2_clocks_per_bin));
}
}
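/* Expire one bin: append user data of its non-deleted elements to the result
   vector, then release the bin. */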
static u32 *
expire_bin (timing_wheel_t * w,
uword level_index,
uword wheel_index, u64 advance_cpu_time, u32 * expired_user_data)
{
timing_wheel_level_t *level = vec_elt_at_index (w->levels, level_index);
timing_wheel_elt_t *e;
u32 *x;
uword i, j, e_len;
e = vec_elt (level->elts, wheel_index);
e_len = vec_len (e);
vec_add2 (expired_user_data, x, e_len);
for (i = j = 0; i < e_len; i++)
{
validate_expired_elt (w, &e[i], advance_cpu_time);
x[j] = e[i].user_data;
/* Only advance if elt is not to be deleted. */
j += !elt_is_deleted (w, e[i].user_data);
}
/* Adjust for deleted elts. */
if (j < e_len)
_vec_len (expired_user_data) -= e_len - j;
free_elt_vector (w, e);
level->elts[wheel_index] = 0;
clib_bitmap_set_no_check (level->occupancy_bitmap, wheel_index, 0);
return expired_user_data;
}
/* Called rarely. 32 bit times should only overflow every 4 seconds or so on a fast machine. */
static u32 *
advance_cpu_time_base (timing_wheel_t * w, u32 * expired_user_data)
{
timing_wheel_level_t *l;
timing_wheel_elt_t *e;
u64 delta;
w->stats.cpu_time_base_advances++;
delta = ((u64) 1 << w->n_wheel_elt_time_bits);
w->cpu_time_base += delta;
w->time_index_next_cpu_time_base_update += delta >> w->log2_clocks_per_bin;
vec_foreach (l, w->levels)
{
uword wi;
/* *INDENT-OFF* */
clib_bitmap_foreach (wi, l->occupancy_bitmap, ({
vec_foreach (e, l->elts[wi])
{
/* This should always be true since otherwise we would have already expired
this element. */
ASSERT (e->cpu_time_relative_to_base >= delta);
e->cpu_time_relative_to_base -= delta;
}
}));
/* *INDENT-ON* */
}
/* See which overflow elements fit now. */
{
timing_wheel_overflow_elt_t *oe;
/* *INDENT-OFF* */
pool_foreach (oe, w->overflow_pool, ({
/* It fits now into 32 bits. */
if (0 == ((oe->cpu_time - w->cpu_time_base) >> BITS (e->cpu_time_relative_to_base)))
{
u64 ti = oe->cpu_time >> w->log2_clocks_per_bin;
if (ti < w->current_time_index)
{
/* This can happen when timing wheel is not advanced for a long time
(for example when at a gdb breakpoint for a while). */
if (! elt_is_deleted (w, oe->user_data))
vec_add1 (expired_user_data, oe->user_data);
}
else
timing_wheel_insert_helper (w, oe->cpu_time, oe->user_data);
pool_put (w->overflow_pool, oe);
}
}));
/* *INDENT-ON* */
}
return expired_user_data;
}
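/* When a level wraps, pull elements down from the next higher level: anything
   already expired is reported, the rest is queued for re-insertion at finer
   granularity. */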
static u32 *
refill_level (timing_wheel_t * w,
uword level_index,
u64 advance_cpu_time,
uword from_wheel_index,
uword to_wheel_index, u32 * expired_user_data)
{
timing_wheel_level_t *level;
timing_wheel_elt_t *to_insert = w->unexpired_elts_pending_insert;
u64 advance_time_index = advance_cpu_time >> w->log2_clocks_per_bin;
vec_validate (w->stats.refills, level_index);
w->stats.refills[level_index] += 1;
if (level_index + 1 >= vec_len (w->levels))
goto done;
level = vec_elt_at_index (w->levels, level_index + 1);
if (!level->occupancy_bitmap)
goto done;
while (1)
{
timing_wheel_elt_t *e, *es;
if (clib_bitmap_get_no_check
(level->occupancy_bitmap, from_wheel_index))
{
es = level->elts[from_wheel_index];
level->elts[from_wheel_index] = 0;
clib_bitmap_set_no_check (level->occupancy_bitmap, from_wheel_index,
0);
vec_foreach (e, es)
{
u64 e_time = elt_cpu_time (w, e);
u64 ti = e_time >> w->log2_clocks_per_bin;
if (ti <= advance_time_index)
{
validate_expired_elt (w, e, advance_cpu_time);
if (!elt_is_deleted (w, e->user_data))
vec_add1 (expired_user_data, e->user_data);
}
else
vec_add1 (to_insert, e[0]);
}
free_elt_vector (w, es);
}
if (from_wheel_index == to_wheel_index)
break;
from_wheel_index = wheel_add (w, from_wheel_index + 1);
}
timing_wheel_validate (w);
done:
w->unexpired_elts_pending_insert = to_insert;
return expired_user_data;
}
/* Advance wheel and return any expired user data in vector. */
u32 *
timing_wheel_advance (timing_wheel_t * w, u64 advance_cpu_time,
u32 * expired_user_data,
u64 * next_expiring_element_cpu_time)
{
timing_wheel_level_t *level;
uword level_index, advance_rtime, advance_level_index, advance_wheel_index;
uword n_expired_user_data_before;
u64 current_time_index, advance_time_index;
n_expired_user_data_before = vec_len (expired_user_data);
/* Re-fill lower levels when time wraps. */
current_time_index = w->current_time_index;
advance_time_index = advance_cpu_time >> w->log2_clocks_per_bin;
{
u64 current_ti, advance_ti;
current_ti = current_time_index >> w->log2_bins_per_wheel;
advance_ti = advance_time_index >> w->log2_bins_per_wheel;
if (PREDICT_FALSE (current_ti != advance_ti))
{
if (w->unexpired_elts_pending_insert)
_vec_len (w->unexpired_elts_pending_insert) = 0;
level_index = 0;
while (current_ti != advance_ti)
{
uword c, a;
c = current_ti & (w->bins_per_wheel - 1);
a = advance_ti & (w->bins_per_wheel - 1);
if (c != a)
expired_user_data = refill_level (w,
level_index,
advance_cpu_time,
c, a, expired_user_data);
current_ti >>= w->log2_bins_per_wheel;
advance_ti >>= w->log2_bins_per_wheel;
level_index++;
}
}
}
advance_level_index =
get_level_and_relative_time (w, advance_cpu_time, &advance_rtime);
advance_wheel_index =
rtime_to_wheel_index (w, advance_level_index, advance_rtime);
/* Empty all occupied bins for entire levels that we advance past. */
for (level_index = 0; level_index < advance_level_index; level_index++)
{
uword wi;
if (level_index >= vec_len (w->levels))
break;
level = vec_elt_at_index (w->levels, level_index);
/* *INDENT-OFF* */
clib_bitmap_foreach (wi, level->occupancy_bitmap, ({
expired_user_data = expire_bin (w, level_index, wi, advance_cpu_time,
expired_user_data);
}));
/* *INDENT-ON* */
}
if (PREDICT_TRUE (level_index < vec_len (w->levels)))
{
uword wi;
level = vec_elt_at_index (w->levels, level_index);
wi = current_time_wheel_index (w, level_index);
if (level->occupancy_bitmap)
while (1)
{
if (clib_bitmap_get_no_check (level->occupancy_bitmap, wi))
expired_user_data =
expire_bin (w, advance_level_index, wi, advance_cpu_time,
expired_user_data);
if (wi == advance_wheel_index)
break;
wi = wheel_add (w, wi + 1);
}
}
/* Advance current time index. */
w->current_time_index = advance_time_index;
if (vec_len (w->unexpired_elts_pending_insert) > 0)
{
timing_wheel_elt_t *e;
vec_foreach (e, w->unexpired_elts_pending_insert) insert_elt (w, e);
_vec_len (w->unexpired_elts_pending_insert) = 0;
}
/* Advance the cpu time base only when the next update point has been reached. */
while (PREDICT_FALSE
(advance_time_index >= w->time_index_next_cpu_time_base_update))
expired_user_data = advance_cpu_time_base (w, expired_user_data);
if (next_expiring_element_cpu_time)
{
u64 min_t;
/* Anything expired? If so we need to recompute next expiring elt time. */
if (vec_len (expired_user_data) == n_expired_user_data_before
&& w->cached_min_cpu_time_on_wheel != 0ULL)
min_t = w->cached_min_cpu_time_on_wheel;
else
{
min_t = timing_wheel_next_expiring_elt_time (w);
w->cached_min_cpu_time_on_wheel = min_t;
}
*next_expiring_element_cpu_time = min_t;
}
return expired_user_data;
}
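/* Format wheel parameters and, if verbose, time base and per-level refill
   statistics. */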
u8 *
format_timing_wheel (u8 * s, va_list * va)
{
timing_wheel_t *w = va_arg (*va, timing_wheel_t *);
int verbose = va_arg (*va, int);
uword indent = format_get_indent (s);
s = format (s, "level 0: %.4e - %.4e secs, 2^%d - 2^%d clocks",
(f64) (1 << w->log2_clocks_per_bin) / w->cpu_clocks_per_second,
(f64) (1 << w->log2_clocks_per_wheel) /
w->cpu_clocks_per_second, w->log2_clocks_per_bin,
w->log2_clocks_per_wheel);
if (verbose)
{
int l;
s = format (s, "\n%Utime base advances %Ld, every %.4e secs",
format_white_space, indent + 2,
w->stats.cpu_time_base_advances,
(f64) ((u64) 1 << w->n_wheel_elt_time_bits) /
w->cpu_clocks_per_second);
for (l = 0; l < vec_len (w->levels); l++)
s = format (s, "\n%Ulevel %d: refills %Ld",
format_white_space, indent + 2, l,
l < vec_len (w->stats.refills) ? w->stats.refills[l] : (u64) 0);
}
return s;
}
/*
* fd.io coding-style-patch-verification: ON
*
* Local Variables:
* eval: (c-set-style "gnu")
* End:
*/