blob: 1c1cb1aa79c58a845ec13839ff6ff4c514617239 [file] [log] [blame]
/*
* Copyright (c) 2015 Cisco and/or its affiliates.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* input.c: Unix file input
*
* Copyright (c) 2008 Eliot Dresselhaus
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
* LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
* OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include <vlib/vlib.h>
#include <vlib/unix/unix.h>
#include <signal.h>
#include <unistd.h>
#include <vppinfra/tw_timer_1t_3w_1024sl_ov.h>
/* FIXME autoconf */
#define HAVE_LINUX_EPOLL
#ifdef HAVE_LINUX_EPOLL
#include <sys/epoll.h>
/*
 * Per-thread epoll state.  The code in this file looks elements up by
 * thread index (f->polling_thread_index / thread_index).
 */
typedef struct
{
CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
/* epoll instance waited on by this thread; -1 when none is open. */
int epoll_fd;
/* Vector of events passed to epoll_pwait() / epoll_wait(). */
struct epoll_event *epoll_events;
/* Count of descriptors added to epoll_fd minus those deleted from it. */
int n_epoll_fds;
/* Statistics. */
/* Running total of ready events returned by successful waits. */
u64 epoll_files_ready;
/* Number of waits that did not return an error. */
u64 epoll_waits;
} linux_epoll_main_t;
/* Vector of per-thread epoll state, sized in linux_epoll_input_init(). */
static linux_epoll_main_t *linux_epoll_mains = 0;
/*
 * File-update callback: mirror an add / modify / delete of file F into the
 * epoll instance of the thread that polls it (f->polling_thread_index).
 *
 * The epoll fd is created on the first add when none is open and closed
 * again once the registered-descriptor count returns to zero.  Failures are
 * reported with a warning and otherwise ignored.
 */
static void
linux_epoll_file_update (clib_file_t * f, clib_file_update_type_t update_type)
{
  clib_file_main_t *fm = &file_main;
  linux_epoll_main_t *em = vec_elt_at_index (linux_epoll_mains,
                                             f->polling_thread_index);
  struct epoll_event ev = { 0 };
  int ctl_op;
  int fd_count_delta;

  /* Translate the update type into an epoll op and a count adjustment. */
  switch (update_type)
    {
    case UNIX_FILE_UPDATE_ADD:
      ctl_op = EPOLL_CTL_ADD;
      fd_count_delta = 1;
      break;

    case UNIX_FILE_UPDATE_MODIFY:
      ctl_op = EPOLL_CTL_MOD;
      fd_count_delta = 0;
      break;

    case UNIX_FILE_UPDATE_DELETE:
      ctl_op = EPOLL_CTL_DEL;
      fd_count_delta = -1;
      break;

    default:
      clib_warning ("unknown update_type %d", update_type);
      return;
    }

  /* Always interested in input; output and edge triggering are opt-in. */
  ev.events = EPOLLIN;
  if (f->flags & UNIX_FILE_EVENT_EDGE_TRIGGERED)
    ev.events |= EPOLLET;
  if (f->flags & UNIX_FILE_DATA_AVAILABLE_TO_WRITE)
    ev.events |= EPOLLOUT;

  /* The pool index travels with the event so the input node can find F. */
  ev.data.u32 = f - fm->file_pool;

  /* An epoll fd is opened lazily, on the first add that needs one. */
  if (ctl_op == EPOLL_CTL_ADD && em->epoll_fd == -1)
    {
      em->epoll_fd = epoll_create (1);
      if (em->epoll_fd < 0)
        {
          clib_unix_warning ("epoll_create");
          return;
        }
      em->n_epoll_fds = 0;
    }

  if (epoll_ctl (em->epoll_fd, ctl_op, f->file_descriptor, &ev) < 0)
    {
      clib_unix_warning ("epoll_ctl");
      return;
    }

  em->n_epoll_fds += fd_count_delta;
  if (em->n_epoll_fds != 0)
    return;

  /* Nothing left to watch: release the epoll instance. */
  close (em->epoll_fd);
  em->epoll_fd = -1;
}
/*
 * Per-thread body of the "unix-epoll-input" node.
 *
 * 1. Chooses how long to sleep (0..10 ms) from how busy the thread looks and
 *    sets node->input_main_loops_per_call accordingly.
 * 2. Waits for file events on this thread's epoll fd; a worker thread with
 *    no epoll fd instead naps in 100us slices, leaving early when a barrier
 *    is requested.
 * 3. Runs the read / write / error callback of every ready file and hands
 *    any errors they return to unix_save_error().
 *
 * vm           - vlib main of the calling thread
 * node         - runtime of this node
 * frame        - not used
 * thread_index - index into linux_epoll_mains; 0 selects main-thread rules
 *
 * Always returns 0.
 */
static_always_inline uword
linux_epoll_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
                          vlib_frame_t * frame, u32 thread_index)
{
  unix_main_t *um = &unix_main;
  clib_file_main_t *fm = &file_main;
  linux_epoll_main_t *em = vec_elt_at_index (linux_epoll_mains, thread_index);
  struct epoll_event *e;
  int n_fds_ready;
  int is_main = (thread_index == 0);

  {
    vlib_node_main_t *nm = &vm->node_main;
    u32 ticks_until_expiration;
    /* Only consulted on worker threads; zeroed so it is never indeterminate. */
    f64 now = 0;
    int timeout_ms = 0, max_timeout_ms = 10;
    f64 vector_rate = vlib_last_vectors_per_main_loop (vm);

    if (is_main == 0)
      now = vlib_time_now (vm);

    /*
     * If we've been asked for a fixed-sleep between main loop polls,
     * do so right away.
     */
    if (PREDICT_FALSE (is_main && um->poll_sleep_usec))
      {
        struct timespec ts, tsrem;

        timeout_ms = 0;
        node->input_main_loops_per_call = 0;
        /*
         * Split into whole seconds and nanoseconds: nanosleep() rejects
         * tv_nsec values of one second or more.
         */
        ts.tv_sec = um->poll_sleep_usec / 1000000;
        ts.tv_nsec = 1000 * (um->poll_sleep_usec % 1000000);
        /*
         * Resume only after a signal interruption; the remaining time is
         * not reported for any other failure.
         */
        while (nanosleep (&ts, &tsrem) < 0 && errno == EINTR)
          ts = tsrem;
      }
    /* If we're not working very hard, decide how long to sleep */
    else if (is_main && vector_rate < 2 && vm->api_queue_nonempty == 0
             && nm->input_node_counts_by_state[VLIB_NODE_STATE_POLLING] == 0)
      {
        ticks_until_expiration = TW (tw_timer_first_expires_in_ticks)
          ((TWT (tw_timer_wheel) *) nm->timing_wheel);

        /* Nothing on the fast wheel, sleep 10ms */
        if (ticks_until_expiration == TW_SLOTS_PER_RING)
          {
            timeout_ms = max_timeout_ms;
          }
        else
          {
            /* NOTE(review): 1e-5 presumes 10us timer-wheel ticks - confirm. */
            f64 timeout = (f64) ticks_until_expiration *1e-5;

            if (timeout < 1e-3)
              timeout_ms = 0;
            else
              {
                timeout_ms = timeout * 1e3;
                /* Must be between 1 and 10 ms. */
                timeout_ms = clib_max (1, timeout_ms);
                timeout_ms = clib_min (max_timeout_ms, timeout_ms);
              }
          }
        node->input_main_loops_per_call = 0;
      }
    /* Idle worker: only sleep once the last barrier release is 0.5s old. */
    else if (is_main == 0 && vector_rate < 2
             && (vlib_global_main.time_last_barrier_release + 0.5 < now)
             && nm->input_node_counts_by_state[VLIB_NODE_STATE_POLLING] == 0)
      {
        timeout_ms = max_timeout_ms;
        node->input_main_loops_per_call = 0;
      }
    else                        /* busy */
      {
        /* Don't come back for a respectable number of dispatch cycles */
        node->input_main_loops_per_call = 1024;
      }

    /*
     * Allow any signal to wakeup our sleep.
     *
     * NOTE(review): the main thread waits on em->epoll_fd even when it is
     * -1, and linux_epoll_file_update() closes the fd once its count drops
     * to zero - confirm that cannot happen for thread 0.
     */
    if (is_main || em->epoll_fd != -1)
      {
        /* Zero-initialised static, used here as an empty signal mask. */
        static sigset_t unblock_all_signals;

        n_fds_ready = epoll_pwait (em->epoll_fd,
                                   em->epoll_events,
                                   vec_len (em->epoll_events),
                                   timeout_ms, &unblock_all_signals);

        /* This kludge is necessary to run over absurdly old kernels */
        if (n_fds_ready < 0 && errno == ENOSYS)
          {
            n_fds_ready = epoll_wait (em->epoll_fd,
                                      em->epoll_events,
                                      vec_len (em->epoll_events), timeout_ms);
          }
      }
    else
      {
        /*
         * Worker thread, no epoll fd's, sleep for 100us at a time
         * and check for a barrier sync request
         */
        if (timeout_ms)
          {
            struct timespec ts, tsrem;
            f64 limit = now + (f64) timeout_ms * 1e-3;

            while (vlib_time_now (vm) < limit)
              {
                /* Sleep for 100us at a time */
                ts.tv_sec = 0;
                ts.tv_nsec = 1000 * 100;
                /* Retry only when interrupted, as tsrem is valid only then. */
                while (nanosleep (&ts, &tsrem) < 0 && errno == EINTR)
                  ts = tsrem;
                if (*vlib_worker_threads->wait_at_barrier)
                  goto done;
              }
          }
        goto done;
      }
  }

  if (n_fds_ready < 0)
    {
      if (unix_error_is_fatal (errno))
        vlib_panic_with_error (vm, clib_error_return_unix (0, "epoll_wait"));

      /* non fatal error (e.g. EINTR). */
      goto done;
    }

  em->epoll_waits += 1;
  em->epoll_files_ready += n_fds_ready;

  for (e = em->epoll_events; e < em->epoll_events + n_fds_ready; e++)
    {
      u32 i = e->data.u32;      /* file pool index stored at registration */
      clib_file_t *f;
      clib_error_t *errors[4];
      int n_errors = 0;
      int k;

      /*
       * Test the index before taking the element's address, so a stale
       * event never goes through pool_elt_at_index().
       * NOTE(review): pool_elt_at_index presumably asserts on free slots in
       * debug builds - confirm.
       */
      if (PREDICT_FALSE (pool_is_free_index (fm->file_pool, i)))
        {
          /*
           * In rare scenarios, epoll may still post us events for a
           * deleted file descriptor. We just deal with it and throw away
           * the events for the corresponding file descriptor.
           */
          if (e->events & EPOLLIN)
            {
              errors[n_errors] =
                clib_error_return (0, "epoll event EPOLLIN dropped due "
                                   "to free index %u", i);
              n_errors++;
            }
          if (e->events & EPOLLOUT)
            {
              errors[n_errors] =
                clib_error_return (0, "epoll event EPOLLOUT dropped due "
                                   "to free index %u", i);
              n_errors++;
            }
          if (e->events & EPOLLERR)
            {
              errors[n_errors] =
                clib_error_return (0, "epoll event EPOLLERR dropped due "
                                   "to free index %u", i);
              n_errors++;
            }
        }
      else
        {
          f = pool_elt_at_index (fm->file_pool, i);

          if (PREDICT_TRUE (!(e->events & EPOLLERR)))
            {
              if (e->events & EPOLLIN)
                {
                  f->read_events++;
                  errors[n_errors] = f->read_function (f);
                  /* Count the error first so it is saved even if F is gone. */
                  n_errors += errors[n_errors] != 0;
                  /*
                   * The callback may have freed the file or moved the
                   * pool: look F up again, or drop it if the slot is free.
                   */
                  f = pool_is_free_index (fm->file_pool, i)
                    ? 0 : pool_elt_at_index (fm->file_pool, i);
                }
              if (f && (e->events & EPOLLOUT))
                {
                  f->write_events++;
                  errors[n_errors] = f->write_function (f);
                  n_errors += errors[n_errors] != 0;
                }
            }
          else
            {
              if (f->error_function)
                {
                  f->error_events++;
                  errors[n_errors] = f->error_function (f);
                  n_errors += errors[n_errors] != 0;
                }
              else
                close (f->file_descriptor);
            }
        }

      ASSERT (n_errors < ARRAY_LEN (errors));
      for (k = 0; k < n_errors; k++)
        {
          unix_save_error (um, errors[k]);
        }
    }

done:
  /* Refresh the cached cpu / numa ids if the thread changed cpu. */
  if (PREDICT_FALSE (vm->cpu_id != clib_get_current_cpu_id ()))
    {
      vm->cpu_id = clib_get_current_cpu_id ();
      vm->numa_node = clib_get_current_numa_node ();
    }

  return 0;
}
/*
 * Node function for "unix-epoll-input": run the inline body for the
 * calling thread.
 *
 * The main-thread arm passes a literal 0 rather than the variable.
 * NOTE(review): presumably this lets the always-inline body be specialised
 * for thread 0 - confirm before collapsing the two calls into one.
 */
static uword
linux_epoll_input (vlib_main_t * vm,
                   vlib_node_runtime_t * node, vlib_frame_t * frame)
{
  u32 thread_index = vlib_get_thread_index ();

  return (thread_index == 0)
    ? linux_epoll_input_inline (vm, node, frame, 0)
    : linux_epoll_input_inline (vm, node, frame, thread_index);
}
/* *INDENT-OFF* */
/* Register linux_epoll_input as the pre-input node "unix-epoll-input". */
VLIB_REGISTER_NODE (linux_epoll_input_node,static) = {
  .name = "unix-epoll-input",
  .type = VLIB_NODE_TYPE_PRE_INPUT,
  .function = linux_epoll_input,
};
/* *INDENT-ON* */
/*
 * Set up the per-thread epoll state and install the file-update callback.
 *
 * Every element gets an event buffer of VLIB_FRAME_SIZE entries.  Only the
 * first element (thread 0) gets an epoll fd here; the others start at -1
 * and have one opened by linux_epoll_file_update() when a file is added.
 *
 * Returns 0 on success, or a unix error if epoll_create() fails.
 */
clib_error_t *
linux_epoll_input_init (vlib_main_t * vm)
{
  clib_file_main_t *fm = &file_main;
  vlib_thread_main_t *tm = vlib_get_thread_main ();
  uword thread;

  /* Size the per-thread vector from the configured number of vlib mains. */
  vec_validate_aligned (linux_epoll_mains, tm->n_vlib_mains,
                        CLIB_CACHE_LINE_BYTES);

  for (thread = 0; thread < vec_len (linux_epoll_mains); thread++)
    {
      linux_epoll_main_t *em = &linux_epoll_mains[thread];

      /* Allocate some events. */
      vec_resize (em->epoll_events, VLIB_FRAME_SIZE);

      /* No epoll fd by default; only the first element opens one now. */
      em->epoll_fd = -1;
      if (thread != 0)
        continue;

      em->epoll_fd = epoll_create (1);
      if (em->epoll_fd < 0)
        return clib_error_return_unix (0, "epoll_create");
    }

  fm->file_update = linux_epoll_file_update;

  return 0;
}
VLIB_INIT_FUNCTION (linux_epoll_input_init);
#endif /* HAVE_LINUX_EPOLL */
/*
 * Init hook for unix input: delegates to linux_epoll_input_init() through
 * vlib_call_init_function() and passes its result back unchanged.
 */
static clib_error_t *
unix_input_init (vlib_main_t * vm)
{
  clib_error_t *error = vlib_call_init_function (vm, linux_epoll_input_init);

  return error;
}
VLIB_INIT_FUNCTION (unix_input_init);
/*
* fd.io coding-style-patch-verification: ON
*
* Local Variables:
* eval: (c-set-style "gnu")
* End:
*/