blob: e96cd902466ec3c637e2a38bfa9ac68b688db158 [file] [log] [blame]
/*
 * Copyright (c) 2015 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
 * input.c: Unix file input
 *
 * Copyright (c) 2008 Eliot Dresselhaus
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
39
40#include <vlib/vlib.h>
41#include <vlib/unix/unix.h>
42#include <signal.h>
Damjan Marionceab7882018-01-19 20:56:12 +010043#include <unistd.h>
Dave Barach5c20a012017-06-13 08:48:31 -040044#include <vppinfra/tw_timer_1t_3w_1024sl_ov.h>
Ed Warnickecb9cada2015-12-08 15:45:58 -070045
46/* FIXME autoconf */
47#define HAVE_LINUX_EPOLL
48
49#ifdef HAVE_LINUX_EPOLL
50
51#include <sys/epoll.h>
52
Dave Barach9b8ffd92016-07-08 08:13:45 -040053typedef struct
54{
Dave Baracheb987d32018-05-03 08:26:39 -040055 CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
Ed Warnickecb9cada2015-12-08 15:45:58 -070056 int epoll_fd;
Dave Barach9b8ffd92016-07-08 08:13:45 -040057 struct epoll_event *epoll_events;
Damjan Marionceab7882018-01-19 20:56:12 +010058 int n_epoll_fds;
Ed Warnickecb9cada2015-12-08 15:45:58 -070059
60 /* Statistics. */
61 u64 epoll_files_ready;
62 u64 epoll_waits;
63} linux_epoll_main_t;
64
Damjan Marionceab7882018-01-19 20:56:12 +010065static linux_epoll_main_t *linux_epoll_mains = 0;
Ed Warnickecb9cada2015-12-08 15:45:58 -070066
67static void
Dave Barach59b25652017-09-10 15:04:27 -040068linux_epoll_file_update (clib_file_t * f, clib_file_update_type_t update_type)
Ed Warnickecb9cada2015-12-08 15:45:58 -070069{
Damjan Marion56dd5432017-09-08 19:52:02 +020070 clib_file_main_t *fm = &file_main;
Damjan Marionceab7882018-01-19 20:56:12 +010071 linux_epoll_main_t *em = vec_elt_at_index (linux_epoll_mains,
72 f->polling_thread_index);
73 struct epoll_event e = { 0 };
74 int op, add_del = 0;
Ed Warnickecb9cada2015-12-08 15:45:58 -070075
76 e.events = EPOLLIN;
77 if (f->flags & UNIX_FILE_DATA_AVAILABLE_TO_WRITE)
78 e.events |= EPOLLOUT;
Dave Barach9b8ffd92016-07-08 08:13:45 -040079 if (f->flags & UNIX_FILE_EVENT_EDGE_TRIGGERED)
80 e.events |= EPOLLET;
Damjan Marion56dd5432017-09-08 19:52:02 +020081 e.data.u32 = f - fm->file_pool;
Ed Warnickecb9cada2015-12-08 15:45:58 -070082
Dave Baracha1a093d2017-03-02 13:13:23 -050083 op = -1;
84
85 switch (update_type)
86 {
87 case UNIX_FILE_UPDATE_ADD:
88 op = EPOLL_CTL_ADD;
Damjan Marionceab7882018-01-19 20:56:12 +010089 add_del = 1;
Dave Baracha1a093d2017-03-02 13:13:23 -050090 break;
91
92 case UNIX_FILE_UPDATE_MODIFY:
93 op = EPOLL_CTL_MOD;
94 break;
95
96 case UNIX_FILE_UPDATE_DELETE:
97 op = EPOLL_CTL_DEL;
Damjan Marionceab7882018-01-19 20:56:12 +010098 add_del = -1;
Dave Baracha1a093d2017-03-02 13:13:23 -050099 break;
100
101 default:
102 clib_warning ("unknown update_type %d", update_type);
103 return;
104 }
105
Damjan Marionceab7882018-01-19 20:56:12 +0100106 /* worker threads open epoll fd only if needed */
107 if (update_type == UNIX_FILE_UPDATE_ADD && em->epoll_fd == -1)
108 {
109 em->epoll_fd = epoll_create (1);
110 if (em->epoll_fd < 0)
111 {
112 clib_unix_warning ("epoll_create");
113 return;
114 }
115 em->n_epoll_fds = 0;
116 }
117
Dave Baracha1a093d2017-03-02 13:13:23 -0500118 if (epoll_ctl (em->epoll_fd, op, f->file_descriptor, &e) < 0)
Damjan Marionceab7882018-01-19 20:56:12 +0100119 {
120 clib_unix_warning ("epoll_ctl");
121 return;
122 }
123
124 em->n_epoll_fds += add_del;
125
126 if (em->n_epoll_fds == 0)
127 {
128 close (em->epoll_fd);
129 em->epoll_fd = -1;
130 }
Ed Warnickecb9cada2015-12-08 15:45:58 -0700131}
132
Damjan Marionceab7882018-01-19 20:56:12 +0100133static_always_inline uword
134linux_epoll_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
135 vlib_frame_t * frame, u32 thread_index)
Ed Warnickecb9cada2015-12-08 15:45:58 -0700136{
Dave Barach9b8ffd92016-07-08 08:13:45 -0400137 unix_main_t *um = &unix_main;
Damjan Marion56dd5432017-09-08 19:52:02 +0200138 clib_file_main_t *fm = &file_main;
Damjan Marionceab7882018-01-19 20:56:12 +0100139 linux_epoll_main_t *em = vec_elt_at_index (linux_epoll_mains, thread_index);
Dave Barach9b8ffd92016-07-08 08:13:45 -0400140 struct epoll_event *e;
Ed Warnickecb9cada2015-12-08 15:45:58 -0700141 int n_fds_ready;
Damjan Marionceab7882018-01-19 20:56:12 +0100142 int is_main = (thread_index == 0);
Ed Warnickecb9cada2015-12-08 15:45:58 -0700143
144 {
Dave Barach9b8ffd92016-07-08 08:13:45 -0400145 vlib_node_main_t *nm = &vm->node_main;
Dave Barach5c20a012017-06-13 08:48:31 -0400146 u32 ticks_until_expiration;
Ed Warnickecb9cada2015-12-08 15:45:58 -0700147 f64 timeout;
Dave Barach9ae190e2019-04-23 10:07:24 -0400148 f64 now;
Dave Barach5c20a012017-06-13 08:48:31 -0400149 int timeout_ms = 0, max_timeout_ms = 10;
Ed Warnickecb9cada2015-12-08 15:45:58 -0700150 f64 vector_rate = vlib_last_vectors_per_main_loop (vm);
151
Dave Barach9ae190e2019-04-23 10:07:24 -0400152 if (is_main == 0)
153 now = vlib_time_now (vm);
154
Dave Barach85aa4902018-06-14 18:52:46 -0400155 /*
156 * If we've been asked for a fixed-sleep between main loop polls,
157 * do so right away.
158 */
159 if (PREDICT_FALSE (is_main && um->poll_sleep_usec))
160 {
161 struct timespec ts, tsrem;
162 timeout = 0;
163 timeout_ms = 0;
164 node->input_main_loops_per_call = 0;
165 ts.tv_sec = 0;
166 ts.tv_nsec = 1000 * um->poll_sleep_usec;
167
168 while (nanosleep (&ts, &tsrem) < 0)
169 {
170 ts = tsrem;
171 }
172 }
Dave Barach5c20a012017-06-13 08:48:31 -0400173 /* If we're not working very hard, decide how long to sleep */
Dave Barach85aa4902018-06-14 18:52:46 -0400174 else if (is_main && vector_rate < 2 && vm->api_queue_nonempty == 0
175 && nm->input_node_counts_by_state[VLIB_NODE_STATE_POLLING] == 0)
Ed Warnickecb9cada2015-12-08 15:45:58 -0700176 {
Dave Barach5c20a012017-06-13 08:48:31 -0400177 ticks_until_expiration = TW (tw_timer_first_expires_in_ticks)
178 ((TWT (tw_timer_wheel) *) nm->timing_wheel);
Dave Barach9b8ffd92016-07-08 08:13:45 -0400179
Dave Barach5c20a012017-06-13 08:48:31 -0400180 /* Nothing on the fast wheel, sleep 10ms */
181 if (ticks_until_expiration == TW_SLOTS_PER_RING)
Damjan Marion18bc9072016-12-07 14:07:54 +0100182 {
Dave Barach5c20a012017-06-13 08:48:31 -0400183 timeout = 10e-3;
184 timeout_ms = max_timeout_ms;
Damjan Marion18bc9072016-12-07 14:07:54 +0100185 }
186 else
187 {
Dave Barach5c20a012017-06-13 08:48:31 -0400188 timeout = (f64) ticks_until_expiration *1e-5;
189 if (timeout < 1e-3)
190 timeout_ms = 0;
191 else
192 {
193 timeout_ms = timeout * 1e3;
194 /* Must be between 1 and 10 ms. */
195 timeout_ms = clib_max (1, timeout_ms);
196 timeout_ms = clib_min (max_timeout_ms, timeout_ms);
197 }
Damjan Marion18bc9072016-12-07 14:07:54 +0100198 }
Dave Barach5c20a012017-06-13 08:48:31 -0400199 node->input_main_loops_per_call = 0;
Ed Warnickecb9cada2015-12-08 15:45:58 -0700200 }
Damjan Marion6ffb7c62021-03-26 13:06:13 +0100201 else if (is_main == 0 && vector_rate < 2 &&
202 (vlib_get_first_main ()->time_last_barrier_release + 0.5 < now) &&
203 nm->input_node_counts_by_state[VLIB_NODE_STATE_POLLING] == 0)
Damjan Marionceab7882018-01-19 20:56:12 +0100204 {
205 timeout = 10e-3;
206 timeout_ms = max_timeout_ms;
207 node->input_main_loops_per_call = 0;
208 }
Dave Barach5c20a012017-06-13 08:48:31 -0400209 else /* busy */
Ed Warnickecb9cada2015-12-08 15:45:58 -0700210 {
Dave Barach5c20a012017-06-13 08:48:31 -0400211 /* Don't come back for a respectable number of dispatch cycles */
Ed Warnickecb9cada2015-12-08 15:45:58 -0700212 node->input_main_loops_per_call = 1024;
213 }
Ed Warnickecb9cada2015-12-08 15:45:58 -0700214
215 /* Allow any signal to wakeup our sleep. */
Damjan Marionceab7882018-01-19 20:56:12 +0100216 if (is_main || em->epoll_fd != -1)
217 {
218 static sigset_t unblock_all_signals;
219 n_fds_ready = epoll_pwait (em->epoll_fd,
220 em->epoll_events,
221 vec_len (em->epoll_events),
222 timeout_ms, &unblock_all_signals);
Dave Barach9b8ffd92016-07-08 08:13:45 -0400223
Damjan Marionceab7882018-01-19 20:56:12 +0100224 /* This kludge is necessary to run over absurdly old kernels */
225 if (n_fds_ready < 0 && errno == ENOSYS)
226 {
227 n_fds_ready = epoll_wait (em->epoll_fd,
228 em->epoll_events,
229 vec_len (em->epoll_events), timeout_ms);
230 }
Damjan Marion29c0b332019-01-28 13:41:27 +0100231
Damjan Marionceab7882018-01-19 20:56:12 +0100232 }
233 else
234 {
Dave Barach9ae190e2019-04-23 10:07:24 -0400235 /*
236 * Worker thread, no epoll fd's, sleep for 100us at a time
237 * and check for a barrier sync request
238 */
Damjan Marionceab7882018-01-19 20:56:12 +0100239 if (timeout_ms)
Dave Barach9ae190e2019-04-23 10:07:24 -0400240 {
241 struct timespec ts, tsrem;
242 f64 limit = now + (f64) timeout_ms * 1e-3;
243
244 while (vlib_time_now (vm) < limit)
245 {
246 /* Sleep for 100us at a time */
247 ts.tv_sec = 0;
248 ts.tv_nsec = 1000 * 100;
249
250 while (nanosleep (&ts, &tsrem) < 0)
251 ts = tsrem;
Damjan Marion94100532020-11-06 23:25:57 +0100252 if (*vlib_worker_threads->wait_at_barrier ||
Damjan Marion7f75e802023-11-03 21:57:42 +0000253 clib_interrupt_is_any_pending (
254 nm->input_node_interrupts) ||
255 clib_interrupt_is_any_pending (
256 nm->pre_input_node_interrupts))
Dave Barach9ae190e2019-04-23 10:07:24 -0400257 goto done;
258 }
259 }
Damjan Marion29c0b332019-01-28 13:41:27 +0100260 goto done;
Damjan Marionceab7882018-01-19 20:56:12 +0100261 }
Ed Warnickecb9cada2015-12-08 15:45:58 -0700262 }
263
264 if (n_fds_ready < 0)
265 {
266 if (unix_error_is_fatal (errno))
Dave Barach9b8ffd92016-07-08 08:13:45 -0400267 vlib_panic_with_error (vm, clib_error_return_unix (0, "epoll_wait"));
Ed Warnickecb9cada2015-12-08 15:45:58 -0700268
269 /* non fatal error (e.g. EINTR). */
Damjan Marion29c0b332019-01-28 13:41:27 +0100270 goto done;
Ed Warnickecb9cada2015-12-08 15:45:58 -0700271 }
272
273 em->epoll_waits += 1;
274 em->epoll_files_ready += n_fds_ready;
275
276 for (e = em->epoll_events; e < em->epoll_events + n_fds_ready; e++)
277 {
278 u32 i = e->data.u32;
Florin Corascab47332019-06-11 14:33:03 -0700279 clib_file_t *f;
Dave Barach9b8ffd92016-07-08 08:13:45 -0400280 clib_error_t *errors[4];
Ed Warnickecb9cada2015-12-08 15:45:58 -0700281 int n_errors = 0;
282
Florin Corascab47332019-06-11 14:33:03 -0700283 /*
284 * Under rare scenarios, epoll may still post us events for the
285 * deleted file descriptor. We just deal with it and throw away the
286 * events for the corresponding file descriptor.
287 */
288 f = fm->file_pool + i;
Steven97f592f2018-09-08 14:06:16 -0700289 if (PREDICT_FALSE (pool_is_free (fm->file_pool, f)))
290 {
Steven97f592f2018-09-08 14:06:16 -0700291 if (e->events & EPOLLIN)
292 {
293 errors[n_errors] =
294 clib_error_return (0, "epoll event EPOLLIN dropped due "
295 "to free index %u", i);
296 n_errors++;
297 }
298 if (e->events & EPOLLOUT)
299 {
300 errors[n_errors] =
301 clib_error_return (0, "epoll event EPOLLOUT dropped due "
302 "to free index %u", i);
303 n_errors++;
304 }
305 if (e->events & EPOLLERR)
306 {
307 errors[n_errors] =
308 clib_error_return (0, "epoll event EPOLLERR dropped due "
309 "to free index %u", i);
310 n_errors++;
311 }
312 }
313 else if (PREDICT_TRUE (!(e->events & EPOLLERR)))
Ed Warnickecb9cada2015-12-08 15:45:58 -0700314 {
315 if (e->events & EPOLLIN)
316 {
Damjan Marionceab7882018-01-19 20:56:12 +0100317 f->read_events++;
Artem Belov32b07c32019-04-18 07:30:43 +0000318 errors[n_errors] = f->read_function (f);
Florin Corasc67078e2019-04-18 10:04:03 -0700319 /* Make sure f is valid if the file pool moves */
Florin Coras81f54a52019-04-22 09:49:10 -0700320 if (pool_is_free_index (fm->file_pool, i))
321 continue;
Florin Corasc67078e2019-04-18 10:04:03 -0700322 f = pool_elt_at_index (fm->file_pool, i);
Ed Warnickecb9cada2015-12-08 15:45:58 -0700323 n_errors += errors[n_errors] != 0;
324 }
325 if (e->events & EPOLLOUT)
326 {
Damjan Marionceab7882018-01-19 20:56:12 +0100327 f->write_events++;
Artem Belov32b07c32019-04-18 07:30:43 +0000328 errors[n_errors] = f->write_function (f);
Ed Warnickecb9cada2015-12-08 15:45:58 -0700329 n_errors += errors[n_errors] != 0;
330 }
331 }
332 else
333 {
Dave Barach9b8ffd92016-07-08 08:13:45 -0400334 if (f->error_function)
335 {
Damjan Marionceab7882018-01-19 20:56:12 +0100336 f->error_events++;
Artem Belov32b07c32019-04-18 07:30:43 +0000337 errors[n_errors] = f->error_function (f);
Dave Barach9b8ffd92016-07-08 08:13:45 -0400338 n_errors += errors[n_errors] != 0;
339 }
Ole Troan4b12b3c2016-01-27 23:37:58 +0200340 else
Dave Barach9b8ffd92016-07-08 08:13:45 -0400341 close (f->file_descriptor);
Ed Warnickecb9cada2015-12-08 15:45:58 -0700342 }
343
344 ASSERT (n_errors < ARRAY_LEN (errors));
345 for (i = 0; i < n_errors; i++)
346 {
347 unix_save_error (um, errors[i]);
348 }
349 }
350
Damjan Marion29c0b332019-01-28 13:41:27 +0100351done:
352 if (PREDICT_FALSE (vm->cpu_id != clib_get_current_cpu_id ()))
353 {
354 vm->cpu_id = clib_get_current_cpu_id ();
355 vm->numa_node = clib_get_current_numa_node ();
356 }
357
Ed Warnickecb9cada2015-12-08 15:45:58 -0700358 return 0;
359}
360
Damjan Marionceab7882018-01-19 20:56:12 +0100361static uword
362linux_epoll_input (vlib_main_t * vm,
363 vlib_node_runtime_t * node, vlib_frame_t * frame)
364{
365 u32 thread_index = vlib_get_thread_index ();
366
367 if (thread_index == 0)
368 return linux_epoll_input_inline (vm, node, frame, 0);
369 else
370 return linux_epoll_input_inline (vm, node, frame, thread_index);
371}
372
Ed Warnickecb9cada2015-12-08 15:45:58 -0700373VLIB_REGISTER_NODE (linux_epoll_input_node,static) = {
374 .function = linux_epoll_input,
375 .type = VLIB_NODE_TYPE_PRE_INPUT,
376 .name = "unix-epoll-input",
377};
378
379clib_error_t *
380linux_epoll_input_init (vlib_main_t * vm)
381{
Damjan Marionceab7882018-01-19 20:56:12 +0100382 linux_epoll_main_t *em;
Damjan Marion56dd5432017-09-08 19:52:02 +0200383 clib_file_main_t *fm = &file_main;
Damjan Marionceab7882018-01-19 20:56:12 +0100384 vlib_thread_main_t *tm = vlib_get_thread_main ();
Dave Barach9b8ffd92016-07-08 08:13:45 -0400385
Ed Warnickecb9cada2015-12-08 15:45:58 -0700386
Damjan Marionceab7882018-01-19 20:56:12 +0100387 vec_validate_aligned (linux_epoll_mains, tm->n_vlib_mains,
388 CLIB_CACHE_LINE_BYTES);
389
390 vec_foreach (em, linux_epoll_mains)
391 {
392 /* Allocate some events. */
393 vec_resize (em->epoll_events, VLIB_FRAME_SIZE);
394
395 if (linux_epoll_mains == em)
396 {
397 em->epoll_fd = epoll_create (1);
398 if (em->epoll_fd < 0)
399 return clib_error_return_unix (0, "epoll_create");
400 }
401 else
402 em->epoll_fd = -1;
403 }
Ed Warnickecb9cada2015-12-08 15:45:58 -0700404
Damjan Marion56dd5432017-09-08 19:52:02 +0200405 fm->file_update = linux_epoll_file_update;
Ed Warnickecb9cada2015-12-08 15:45:58 -0700406
407 return 0;
408}
409
410VLIB_INIT_FUNCTION (linux_epoll_input_init);
411
412#endif /* HAVE_LINUX_EPOLL */
413
414static clib_error_t *
415unix_input_init (vlib_main_t * vm)
416{
Dave Barachf8d50682019-05-14 18:01:44 -0400417 return 0;
Ed Warnickecb9cada2015-12-08 15:45:58 -0700418}
419
Dave Barachf8d50682019-05-14 18:01:44 -0400420VLIB_INIT_FUNCTION (unix_input_init) =
421{
422 .runs_before = VLIB_INITS ("linux_epoll_input_init"),
423};
Dave Barach9b8ffd92016-07-08 08:13:45 -0400424
/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */