/*
 *------------------------------------------------------------------
 * svm.c - shared VM allocation, mmap(...MAP_FIXED...)
 * library
 *
 * Copyright (c) 2009 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *------------------------------------------------------------------
 */

#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <netinet/in.h>
#include <signal.h>
#include <pthread.h>
#include <unistd.h>
#include <time.h>
#include <fcntl.h>
#include <string.h>
#include <errno.h>
#include <vppinfra/clib.h>
#include <vppinfra/vec.h>
#include <vppinfra/hash.h>
#include <vppinfra/bitmap.h>
#include <vppinfra/fifo.h>
#include <vppinfra/time.h>
#include <vppinfra/mheap.h>
#include <vppinfra/heap.h>
#include <vppinfra/pool.h>
#include <vppinfra/format.h>

#include "svm.h"

static svm_region_t *root_rp;
static int root_rp_refcount;

#define MAXLOCK 2
static pthread_mutex_t *mutexes_held [MAXLOCK];
static int nheld;

svm_region_t *svm_get_root_rp (void)
{
    return root_rp;
}

#define MUTEX_DEBUG

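/*
 * region_lock / region_unlock
 *
 * Take / drop a region's process-shared mutex. Every held mutex is
 * also recorded in mutexes_held[] so svm_mutex_cleanup(), installed
 * as an atexit() handler, can release them if the process dies while
 * holding a lock. The tag is a small integer recorded in
 * mutex_owner_tag (under MUTEX_DEBUG) to identify the locking call
 * site. Illustrative calling pattern (the tag value is arbitrary):
 *
 *     region_lock (rp, 42);
 *     ... touch shared state under rp->mutex ...
 *     region_unlock (rp);
 */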
static void region_lock(svm_region_t *rp, int tag)
{
    pthread_mutex_lock(&rp->mutex);
#ifdef MUTEX_DEBUG
    rp->mutex_owner_pid = getpid();
    rp->mutex_owner_tag = tag;
#endif
    ASSERT(nheld < MAXLOCK);
    /*
     * Keep score of held mutexes so we can try to exit
     * cleanly if the world comes to an end at the worst possible
     * moment
     */
    mutexes_held [nheld++] = &rp->mutex;
}

static void region_unlock(svm_region_t *rp)
{
    int i, j;
#ifdef MUTEX_DEBUG
    rp->mutex_owner_pid = 0;
    rp->mutex_owner_tag = 0;
#endif

    for (i = nheld-1; i >= 0; i--) {
        if (mutexes_held[i] == &rp->mutex) {
            for (j = i; j < MAXLOCK-1; j++)
                mutexes_held[j] = mutexes_held[j+1];
            nheld--;
            goto found;
        }
    }
    ASSERT(0);

found:
    CLIB_MEMORY_BARRIER();
    pthread_mutex_unlock(&rp->mutex);
}


static u8 * format_svm_flags (u8 * s, va_list * args)
{
    uword f = va_arg (*args, uword);

    if (f & SVM_FLAGS_MHEAP)
        s = format (s, "MHEAP ");
    if (f & SVM_FLAGS_FILE)
        s = format (s, "FILE ");
    if (f & SVM_FLAGS_NODATA)
        s = format (s, "NODATA ");
    if (f & SVM_FLAGS_NEED_DATA_INIT)
        s = format (s, "INIT ");

    return (s);
}

static u8 * format_svm_size (u8 * s, va_list * args)
{
    uword size = va_arg (*args, uword);

    if (size >= (1<<20)) {
        s = format (s, "(%d mb)", size >> 20);
    } else if (size >= (1<<10)) {
        s = format (s, "(%d kb)", size >> 10);
    } else {
        s = format (s, "(%d bytes)", size);
    }
    return (s);
}

u8 * format_svm_region (u8 * s, va_list * args)
{
    svm_region_t *rp = va_arg (*args, svm_region_t *);
    int verbose = va_arg (*args, int);
    int i;
    uword lo, hi;

    s = format (s, "%s: base va 0x%x size 0x%x %U\n",
                rp->region_name, rp->virtual_base,
                rp->virtual_size, format_svm_size, rp->virtual_size);
    s = format (s, "  user_ctx 0x%x, bitmap_size %d\n",
                rp->user_ctx, rp->bitmap_size);

    if (verbose) {
        s = format (s, "  flags: 0x%x %U\n", rp->flags,
                    format_svm_flags, rp->flags);
        s = format (s,
                    "  region_heap 0x%x data_base 0x%x data_heap 0x%x\n",
                    rp->region_heap, rp->data_base, rp->data_heap);
    }

    s = format (s, "  %d clients, pids: ",
                vec_len(rp->client_pids));

    for (i = 0; i < vec_len(rp->client_pids); i++)
        s = format (s, "%d ", rp->client_pids[i]);

    s = format (s, "\n");

    if (verbose) {
        lo = hi = ~0;

        s = format (s, "  VM in use: ");

        for (i = 0; i < rp->bitmap_size; i++) {
            if (clib_bitmap_get_no_check (rp->bitmap, i) != 0) {
                if (lo == ~0) {
                    hi = lo = rp->virtual_base + i*MMAP_PAGESIZE;
                } else {
                    hi = rp->virtual_base + i*MMAP_PAGESIZE;
                }
            } else {
                if (lo != ~0) {
                    hi = rp->virtual_base + i*MMAP_PAGESIZE - 1;
                    s = format (s, "   0x%x - 0x%x (%dk)\n", lo, hi,
                                (hi - lo)>>10);
                    lo = hi = ~0;
                }
            }
        }
        s = format (s, "  rgn heap stats: %U", format_mheap,
                    rp->region_heap, 0);
        if ((rp->flags & SVM_FLAGS_MHEAP) && rp->data_heap) {
            s = format (s, "\n  data heap stats: %U", format_mheap,
                        rp->data_heap, 1);
        }
        s = format (s, "\n");
    }

    return (s);
}

/*
 * rnd_pagesize
 * Round to a pagesize multiple, presumably 4k works
 */
static unsigned int rnd_pagesize(unsigned int size)
{
    unsigned int rv;

    rv = (size + (MMAP_PAGESIZE-1)) & ~(MMAP_PAGESIZE-1);
    return (rv);
}
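
/*
 * For example, with MMAP_PAGESIZE == 4<<10:
 *   rnd_pagesize (1)    == 0x1000
 *   rnd_pagesize (4096) == 0x1000
 *   rnd_pagesize (5000) == 0x2000
 */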

/*
 * svm_data_region_create
 *
 * Set up the data portion of a region: everything past the
 * MMAP_PAGESIZE header page and the SVM_PVT_MHEAP_SIZE private heap.
 */
static int svm_data_region_create (svm_map_region_args_t *a,
                                   svm_region_t *rp)
{
    int fd;
    u8 junk = 0;
    uword map_size;

    map_size = rp->virtual_size - (MMAP_PAGESIZE + SVM_PVT_MHEAP_SIZE);

    if (a->flags & SVM_FLAGS_FILE) {
        struct stat statb;

        fd = open (a->backing_file, O_RDWR | O_CREAT, 0777);

        if (fd < 0) {
            clib_unix_warning ("open");
            return -1;
        }

        if (fstat(fd, &statb) < 0) {
            clib_unix_warning("fstat");
            close (fd);
            return -2;
        }

        if (statb.st_mode & S_IFREG) {
            if (statb.st_size == 0) {
                if (lseek(fd, map_size, SEEK_SET) == (off_t) -1) {
                    clib_unix_warning ("seek region size");
                    close (fd);
                    return -3;
                }
                if (write(fd, &junk, 1) != 1) {
                    clib_unix_warning ("set region size");
                    close (fd);
                    return -3;
                }
            } else {
                map_size = rnd_pagesize (statb.st_size);
            }
        } else {
            map_size = a->backing_mmap_size;
        }

        ASSERT(map_size <= rp->virtual_size -
               (MMAP_PAGESIZE + SVM_PVT_MHEAP_SIZE));

        if (mmap (rp->data_base, map_size, PROT_READ | PROT_WRITE,
                  MAP_SHARED | MAP_FIXED, fd, 0) == MAP_FAILED) {
            clib_unix_warning("mmap");
            close (fd);
            return -3;
        }
        close(fd);
        rp->backing_file = (char *) format(0, "%s%c", a->backing_file, 0);
        rp->flags |= SVM_FLAGS_FILE;
    }

    if (a->flags & SVM_FLAGS_MHEAP) {
        rp->data_heap =
            mheap_alloc_with_flags ((void *)(rp->data_base), map_size,
                                    MHEAP_FLAG_DISABLE_VM);
        rp->flags |= SVM_FLAGS_MHEAP;
    }
    return 0;
}

static int svm_data_region_map (svm_map_region_args_t *a,
                                svm_region_t *rp)
{
    int fd;
    u8 junk = 0;
    uword map_size;
    struct stat statb;

    map_size = rp->virtual_size - (MMAP_PAGESIZE + SVM_PVT_MHEAP_SIZE);

    if (a->flags & SVM_FLAGS_FILE) {

        fd = open (a->backing_file, O_RDWR, 0777);

        if (fd < 0) {
            clib_unix_warning ("open");
            return -1;
        }

        if (fstat(fd, &statb) < 0) {
            clib_unix_warning("fstat");
            close (fd);
            return -2;
        }

        if (statb.st_mode & S_IFREG) {
            if (statb.st_size == 0) {
                if (lseek(fd, map_size, SEEK_SET) == (off_t) -1) {
                    clib_unix_warning ("seek region size");
                    close (fd);
                    return -3;
                }
                if (write(fd, &junk, 1) != 1) {
                    clib_unix_warning ("set region size");
                    close (fd);
                    return -3;
                }
            } else {
                map_size = rnd_pagesize (statb.st_size);
            }
        } else {
            map_size = a->backing_mmap_size;
        }

        ASSERT(map_size <= rp->virtual_size
               - (MMAP_PAGESIZE + SVM_PVT_MHEAP_SIZE));

        if (mmap (rp->data_base, map_size, PROT_READ | PROT_WRITE,
                  MAP_SHARED | MAP_FIXED, fd, 0) == MAP_FAILED) {
            clib_unix_warning("mmap");
            close (fd);
            return -3;
        }
        close(fd);
    }
    return 0;
}

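/*
 * shm_name_from_svm_map_region_args
 *
 * Compute the shm_open() name for a region. With no root_path the
 * region name is used as-is; with a root_path the two are joined
 * with a '-' so chroot'ed instances get distinct segments.
 * For example (hypothetical values): root_path "vpp1" and name
 * "/vpe-api" yield the shm name "/vpp1-vpe-api", which shm_open()
 * backs with /dev/shm/vpp1-vpe-api on Linux.
 */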
u8 *shm_name_from_svm_map_region_args (svm_map_region_args_t *a)
{
    u8 *path;
    u8 *shm_name;
    u8 *split_point;
    u8 *mkdir_arg = 0;
    int root_path_offset = 0;
    int name_offset = 0;

    if (a->root_path) {
        /* Tolerate present or absent slashes */
        if (a->root_path[0] == '/')
            root_path_offset++;

        /* create the root_path under /dev/shm
           iterate through path creating directories */

        path = format (0, "/dev/shm/%s%c", &a->root_path[root_path_offset], 0);
        split_point = path+1;
        vec_add1(mkdir_arg, '-');

        while (*split_point) {
            while (*split_point && *split_point != '/') {
                vec_add1 (mkdir_arg, *split_point);
                split_point++;
            }
            vec_add1 (mkdir_arg, 0);

            /* ready to descend another level */
            mkdir_arg[vec_len(mkdir_arg)-1] = '-';
            split_point++;
        }
        vec_free(mkdir_arg);
        vec_free(path);

        if (a->name[0] == '/')
            name_offset = 1;

        shm_name = format (0, "/%s-%s%c", a->root_path,
                           &a->name[name_offset], 0);
    }
    else
        shm_name = format (0, "%s%c", a->name, 0);
    return (shm_name);
}

/*
 * svm_map_region
 *
 * Create a new region or attach to an existing one. The first
 * process to create the shm segment sizes and formats it; later
 * callers map the header page, wait for rp->version to become
 * nonzero, then remap at the region's fixed base address.
 */
void *svm_map_region (svm_map_region_args_t *a)
{
    int svm_fd;
    svm_region_t *rp;
    pthread_mutexattr_t attr;
    pthread_condattr_t cattr;
    int deadman = 0;
    u8 junk = 0;
    void *oldheap;
    int overhead_space;
    int rv;
    uword data_base;
    int nbits, words, bit;
    int pid_holding_region_lock;
    u8 *shm_name;
    int dead_region_recovery = 0;
    int time_left;
    struct stat stat;
    struct timespec ts, tsrem;

    if (CLIB_DEBUG > 1)
        clib_warning ("[%d] map region %s", getpid(), a->name);

    ASSERT((a->size & ~(MMAP_PAGESIZE-1)) == a->size);
    ASSERT(a->name);

    shm_name = shm_name_from_svm_map_region_args (a);

    svm_fd = shm_open((char *) shm_name, O_RDWR | O_CREAT | O_EXCL, 0777);

    if (svm_fd >= 0) {
        if (fchmod (svm_fd, 0770) < 0)
            clib_unix_warning ("segment chmod");
        /* This turns out to fail harmlessly if the client starts first */
        if (fchown (svm_fd, a->uid, a->gid) < 0)
            clib_unix_warning ("segment chown [ok if client starts first]");

        vec_free(shm_name);

        if (lseek(svm_fd, a->size, SEEK_SET) == (off_t) -1) {
            clib_warning ("seek region size");
            close (svm_fd);
            return (0);
        }
        if (write(svm_fd, &junk, 1) != 1) {
            clib_warning ("set region size");
            close (svm_fd);
            return (0);
        }

        rp = mmap((void *)a->baseva, a->size,
                  PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, svm_fd, 0);

        if (rp == (svm_region_t *) MAP_FAILED) {
            clib_unix_warning ("mmap create");
            close (svm_fd);
            return (0);
        }
        close(svm_fd);
        memset(rp, 0, sizeof(*rp));

        if (pthread_mutexattr_init(&attr))
            clib_unix_warning("mutexattr_init");

        if (pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED))
            clib_unix_warning("mutexattr_setpshared");

        if (pthread_mutex_init(&rp->mutex, &attr))
            clib_unix_warning("mutex_init");

        if (pthread_mutexattr_destroy(&attr))
            clib_unix_warning("mutexattr_destroy");

        if (pthread_condattr_init(&cattr))
            clib_unix_warning("condattr_init");

        if (pthread_condattr_setpshared(&cattr, PTHREAD_PROCESS_SHARED))
            clib_unix_warning("condattr_setpshared");

        if (pthread_cond_init(&rp->condvar, &cattr))
            clib_unix_warning("cond_init");

        if (pthread_condattr_destroy(&cattr))
            clib_unix_warning("condattr_destroy");

        region_lock (rp, 1);

        rp->virtual_base = a->baseva;
        rp->virtual_size = a->size;

        rp->region_heap =
            mheap_alloc_with_flags ((void *)(a->baseva+MMAP_PAGESIZE),
                                    SVM_PVT_MHEAP_SIZE,
                                    MHEAP_FLAG_DISABLE_VM);
        oldheap = svm_push_pvt_heap(rp);

        rp->region_name = (char *)format (0, "%s%c", a->name, 0);
        vec_add1(rp->client_pids, getpid());

        nbits = rp->virtual_size / MMAP_PAGESIZE;

        ASSERT (nbits > 0);
        rp->bitmap_size = nbits;
        words = (nbits + BITS(uword)-1) / BITS(uword);
        vec_validate (rp->bitmap, words-1);

        overhead_space = MMAP_PAGESIZE /* header */ +
            SVM_PVT_MHEAP_SIZE;

        bit = 0;
        data_base = (uword)rp->virtual_base;

        if (a->flags & SVM_FLAGS_NODATA)
            rp->flags |= SVM_FLAGS_NEED_DATA_INIT;

        do {
            clib_bitmap_set_no_check (rp->bitmap, bit, 1);
            bit++;
            overhead_space -= MMAP_PAGESIZE;
            data_base += MMAP_PAGESIZE;
        } while (overhead_space > 0);

        rp->data_base = (void *)data_base;

        /*
         * Note: although the POSIX spec guarantees that only one
         * process enters this block, we have to play games
         * to hold off clients until e.g. the mutex is ready
         */
        rp->version = SVM_VERSION;

        /* setup the data portion of the region */

        rv = svm_data_region_create (a, rp);
        if (rv) {
            clib_warning ("data_region_create: %d", rv);
        }

        region_unlock(rp);

        svm_pop_heap(oldheap);

        return ((void *) rp);
    } else {
        svm_fd = shm_open((char *)shm_name, O_RDWR, 0777);

        vec_free(shm_name);

        if (svm_fd < 0) {
            perror("svm_region_map(mmap open)");
            return (0);
        }

        time_left = 20;
        while (1) {
            if (0 != fstat(svm_fd, &stat)) {
                clib_warning("fstat failed: %d", errno);
                close (svm_fd);
                return (0);
            }
            if (stat.st_size > 0) {
                break;
            }
            if (0 == time_left) {
                clib_warning("waiting for resize of shm file timed out");
                close (svm_fd);
                return (0);
            }
            ts.tv_sec = 0;
            ts.tv_nsec = 100000000;
            while (nanosleep(&ts, &tsrem) < 0)
                ts = tsrem;
            time_left--;
        }

        rp = mmap(0, MMAP_PAGESIZE,
                  PROT_READ | PROT_WRITE, MAP_SHARED, svm_fd, 0);

        if (rp == (svm_region_t *) MAP_FAILED) {
            close(svm_fd);
            clib_warning("mmap");
            return (0);
        }
        /*
         * We lost the footrace to create this region; make sure
         * the winner has crossed the finish line.
         */
        while (rp->version == 0 && deadman++ < 5) {
            sleep(1);
        }

        /*
         * <bleep>-ed?
         */
        if (rp->version == 0) {
            clib_warning("rp->version %d not %d", rp->version,
                         SVM_VERSION);
            close(svm_fd);
            /* Only the header page is mapped at this point */
            munmap(rp, MMAP_PAGESIZE);
            return (0);
        }
        /* Remap now that the region has been placed */
        a->baseva = rp->virtual_base;
        a->size = rp->virtual_size;
        munmap(rp, MMAP_PAGESIZE);

        rp = (void *) mmap ((void *)a->baseva, a->size,
                            PROT_READ | PROT_WRITE,
                            MAP_SHARED | MAP_FIXED, svm_fd, 0);
        if ((uword)rp == (uword)MAP_FAILED) {
            clib_unix_warning ("mmap");
            close (svm_fd);
            return (0);
        }
        /* The fixed mapping persists after close(); don't leak the fd */
        close (svm_fd);

        if ((uword) rp != rp->virtual_base) {
            clib_warning("mmap botch");
        }

        /*
         * Try to fix the region mutex if it is held by
         * a dead process
         */
        pid_holding_region_lock = rp->mutex_owner_pid;
        if (pid_holding_region_lock &&
            kill (pid_holding_region_lock, 0) < 0) {
            clib_warning (
                "region %s mutex held by dead pid %d, tag %d, force unlock",
                rp->region_name, pid_holding_region_lock, rp->mutex_owner_tag);
            /* owner pid is nonexistent */
            rp->mutex.__data.__owner = 0;
            rp->mutex.__data.__lock = 0;
            dead_region_recovery = 1;
        }

        if (dead_region_recovery)
            clib_warning ("recovery: attempt to re-lock region");

        region_lock(rp, 2);
        oldheap = svm_push_pvt_heap (rp);
        vec_add1(rp->client_pids, getpid());

        if (dead_region_recovery)
            clib_warning ("recovery: attempt svm_data_region_map");

        rv = svm_data_region_map (a, rp);
        if (rv) {
            clib_warning ("data_region_map: %d", rv);
        }

        if (dead_region_recovery)
            clib_warning ("unlock and continue");

        region_unlock(rp);

        svm_pop_heap(oldheap);

        return ((void *) rp);
    }
    return 0; /* NOTREACHED */
}

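/*
 * svm_mutex_cleanup
 * atexit() handler: release any region mutexes this process still
 * holds, so other processes are not left deadlocked behind a lock
 * owned by a dead pid.
 */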
static void svm_mutex_cleanup (void)
{
    int i;
    for (i = 0; i < nheld; i++) {
        pthread_mutex_unlock (mutexes_held[i]);
    }
}

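/*
 * svm_region_init_internal
 * Map (or create) the global root region. The base address is
 * jittered by a few pages using the CPU tick counter; on first
 * creation (SVM_FLAGS_NEED_DATA_INIT) the name hash and root path
 * are set up in the region's private heap.
 */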
static void svm_region_init_internal (char *root_path, int uid, int gid)
{
    svm_region_t *rp;
    svm_map_region_args_t _a, *a = &_a;
    u64 ticks = clib_cpu_time_now();
    uword randomize_baseva;

    /* guard against klutz calls */
    if (root_rp)
        return;

    root_rp_refcount++;

    atexit(svm_mutex_cleanup);

    /* Randomize the shared-VM base at init time */
    if (MMAP_PAGESIZE <= (4<<10))
        randomize_baseva = (ticks & 15) * MMAP_PAGESIZE;
    else
        randomize_baseva = (ticks & 3) * MMAP_PAGESIZE;

    memset (a, 0, sizeof (*a));
    a->root_path = root_path;
    a->name = SVM_GLOBAL_REGION_NAME;
    a->baseva = SVM_GLOBAL_REGION_BASEVA + randomize_baseva;
    a->size = SVM_GLOBAL_REGION_SIZE;
    a->flags = SVM_FLAGS_NODATA;
    a->uid = uid;
    a->gid = gid;

    rp = svm_map_region (a);
    ASSERT(rp);

    region_lock(rp, 3);

    /* Set up the main region data structures */
    if (rp->flags & SVM_FLAGS_NEED_DATA_INIT) {
        svm_main_region_t *mp = 0;
        void *oldheap;

        rp->flags &= ~(SVM_FLAGS_NEED_DATA_INIT);

        oldheap = svm_push_pvt_heap (rp);
        vec_validate (mp, 0);
        mp->name_hash = hash_create_string (0, sizeof(uword));
        mp->root_path = root_path
            ? format (0, "%s%c", root_path, 0) : 0;
        rp->data_base = mp;
        svm_pop_heap (oldheap);
    }
    region_unlock(rp);
    root_rp = rp;
}

void svm_region_init (void)
{
    svm_region_init_internal (0, 0 /* uid */, 0 /* gid */);
}

void svm_region_init_chroot (char *root_path)
{
    svm_region_init_internal (root_path, 0 /* uid */, 0 /* gid */);
}

void svm_region_init_chroot_uid_gid (char *root_path, int uid, int gid)
{
    svm_region_init_internal (root_path, uid, gid);
}

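/*
 * Typical client lifecycle, as an illustrative sketch (the region
 * name and size below are invented for the example):
 *
 *     svm_map_region_args_t _a, *a = &_a;
 *     svm_region_t *rp;
 *
 *     svm_region_init ();                  // map the root region
 *     memset (a, 0, sizeof (*a));
 *     a->name = "/demo-region";            // hypothetical name
 *     a->size = 4<<20;                     // data space wanted
 *     a->flags = SVM_FLAGS_MHEAP;
 *     rp = svm_region_find_or_create (a);
 *     ... allocate from rp->data_heap ...
 *     svm_region_unmap (rp);               // detach; destroys if last
 *     svm_region_exit ();                  // drop our root-region pid
 */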
void *svm_region_find_or_create (svm_map_region_args_t *a)
{
    svm_main_region_t *mp;
    svm_region_t *rp;
    uword need_nbits;
    int index, i;
    void *oldheap;
    uword *p;
    u8 *name;
    svm_subregion_t *subp;

    ASSERT(root_rp);

    a->size += MMAP_PAGESIZE + SVM_PVT_MHEAP_SIZE;
    a->size = rnd_pagesize(a->size);

    region_lock (root_rp, 4);
    oldheap = svm_push_pvt_heap(root_rp);
    mp = root_rp->data_base;

    ASSERT(mp);

    /* Map the named region from the correct chroot environment */
    a->root_path = (char *) mp->root_path;

    /*
     * See if this region is already known. If it is, we're
     * almost done...
     */
    p = hash_get_mem (mp->name_hash, a->name);

    if (p) {
        rp = svm_map_region (a);
        region_unlock(root_rp);
        svm_pop_heap (oldheap);
        return rp;
    }

    /* Create the region. */
    ASSERT((a->size & ~(MMAP_PAGESIZE-1)) == a->size);

    need_nbits = a->size / MMAP_PAGESIZE;

    index = 1; /* $$$ fixme, figure out how many bits to really skip */

    /*
     * Scan the virtual space allocation bitmap, looking for a large
     * enough chunk
     */
    do {
        if (clib_bitmap_get_no_check(root_rp->bitmap, index) == 0) {
            for (i = 0; i < (need_nbits-1); i++) {
                if (clib_bitmap_get_no_check(root_rp->bitmap,
                                             index+i) == 1) {
                    index = index + i;
                    goto next;
                }
            }
            break;
        }
        index++;
    next:;
    } while (index < root_rp->bitmap_size);

    /* Completely out of VM? */
    if (index >= root_rp->bitmap_size) {
        clib_warning("region %s: not enough VM to allocate 0x%x",
                     root_rp->region_name, a->size);
        svm_pop_heap (oldheap);
        region_unlock (root_rp);
        return 0;
    }

    /*
     * Mark virtual space allocated
     */
#if CLIB_DEBUG > 1
    clib_warning ("set %d bits at index %d", need_nbits, index);
#endif

    for (i = 0; i < need_nbits; i++) {
        clib_bitmap_set_no_check (root_rp->bitmap, index+i, 1);
    }

    /* Place this region where it goes... */
    a->baseva = root_rp->virtual_base + index*MMAP_PAGESIZE;

    rp = svm_map_region (a);

    pool_get (mp->subregions, subp);
    name = format (0, "%s%c", a->name, 0);
    subp->subregion_name = name;

    hash_set_mem (mp->name_hash, name, subp - mp->subregions);

    svm_pop_heap (oldheap);

    region_unlock (root_rp);

    return (rp);
}

/*
 * svm_region_unmap
 *
 * Let go of the indicated region. If the calling process
 * is the last customer, throw it away completely.
 * The root region mutex guarantees atomicity with respect to
 * a new region client showing up at the wrong moment.
 */
void svm_region_unmap (void *rp_arg)
{
    int i, mypid = getpid();
    int nclients_left;
    void *oldheap;
    uword virtual_base, virtual_size;
    svm_region_t *rp = rp_arg;
    char *name;

    /*
     * If we take a signal while holding one or more shared-memory
     * mutexes, we may end up back here from an otherwise
     * benign exit handler. Bail out to avoid a recursive
     * mutex screw-up.
     */
    if (nheld)
        return;

    ASSERT(rp);
    ASSERT(root_rp);

    if (CLIB_DEBUG > 1)
        clib_warning ("[%d] unmap region %s", getpid(), rp->region_name);

    region_lock (root_rp, 5);
    region_lock (rp, 6);

    oldheap = svm_push_pvt_heap (rp); /* nb vec_delete() in the loop */

    /* Remove the caller from the list of mappers */
    for (i = 0; i < vec_len(rp->client_pids); i++) {
        if (rp->client_pids[i] == mypid) {
            vec_delete (rp->client_pids, 1, i);
            goto found;
        }
    }
    clib_warning("pid %d AWOL", mypid);

found:

    svm_pop_heap (oldheap);

    nclients_left = vec_len(rp->client_pids);
    virtual_base = rp->virtual_base;
    virtual_size = rp->virtual_size;

    if (nclients_left == 0) {
        int index, nbits, i;
        svm_main_region_t *mp;
        uword *p;
        svm_subregion_t *subp;

        /* Kill the region, last guy on his way out */

        oldheap = svm_push_pvt_heap (root_rp);
        name = vec_dup (rp->region_name);

        virtual_base = rp->virtual_base;
        virtual_size = rp->virtual_size;

        /* Figure out which bits to clear in the root region bitmap */
        index = (virtual_base - root_rp->virtual_base)
            / MMAP_PAGESIZE;

        nbits = (virtual_size + MMAP_PAGESIZE - 1)
            / MMAP_PAGESIZE;

#if CLIB_DEBUG > 1
        clib_warning ("clear %d bits at index %d", nbits, index);
#endif
        /* Give back the allocated VM */
        for (i = 0; i < nbits; i++) {
            clib_bitmap_set_no_check (root_rp->bitmap, index+i, 0);
        }

        mp = root_rp->data_base;

        p = hash_get_mem (mp->name_hash, name);

        /* Better never happen ... */
        if (p == NULL) {
            region_unlock (rp);
            region_unlock (root_rp);
            svm_pop_heap (oldheap);
            clib_warning ("Region name '%s' not found?", name);
            return;
        }

        /* Remove from the root region subregion pool */
        subp = mp->subregions + p[0];
        pool_put (mp->subregions, subp);

        hash_unset_mem (mp->name_hash, name);

        vec_free(name);

        region_unlock (rp);
        shm_unlink(rp->region_name);
        munmap ((void *)virtual_base, virtual_size);
        region_unlock (root_rp);
        svm_pop_heap (oldheap);
        return;
    }

    region_unlock(rp);
    region_unlock(root_rp);

    munmap ((void *)virtual_base, virtual_size);
}

/*
 * svm_region_exit
 * There is no clean way to unlink the
 * root region when all clients go away,
 * so remove the pid entry and call it a day.
 */
void svm_region_exit ()
{
    void *oldheap;
    int i, mypid = getpid();
    uword virtual_base, virtual_size;

    /* It felt so nice we did it twice... */
    if (root_rp == 0)
        return;

    if (--root_rp_refcount > 0)
        return;

    /*
     * If we take a signal while holding one or more shared-memory
     * mutexes, we may end up back here from an otherwise
     * benign exit handler. Bail out to avoid a recursive
     * mutex screw-up.
     */
    if (nheld)
        return;

    region_lock(root_rp, 7);
    oldheap = svm_push_pvt_heap (root_rp);

    virtual_base = root_rp->virtual_base;
    virtual_size = root_rp->virtual_size;

    for (i = 0; i < vec_len(root_rp->client_pids); i++) {
        if (root_rp->client_pids[i] == mypid) {
            vec_delete (root_rp->client_pids, 1, i);
            goto found;
        }
    }
    clib_warning("pid %d AWOL", mypid);

found:

    region_unlock(root_rp);
    svm_pop_heap (oldheap);

    root_rp = 0;
    munmap ((void *)virtual_base, virtual_size);
}

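/*
 * svm_client_scan_this_region_nolock
 * Delete dead client pids from one region's client_pids vector.
 * The caller must already hold the region mutex, hence "_nolock".
 */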
void svm_client_scan_this_region_nolock (svm_region_t *rp)
{
    int j;
    int mypid = getpid();
    void *oldheap;

    for (j = 0; j < vec_len(rp->client_pids); j++) {
        if (mypid == rp->client_pids[j])
            continue;
        if (rp->client_pids[j] && (kill (rp->client_pids[j], 0) < 0)) {
            clib_warning ("%s: cleanup ghost pid %d",
                          rp->region_name, rp->client_pids[j]);
            /* nb: client vec in rp->region_heap */
            oldheap = svm_push_pvt_heap (rp);
            vec_delete (rp->client_pids, 1, j);
            j--;
            svm_pop_heap (oldheap);
        }
    }
}


/*
 * Scan svm regions for dead clients
 */
void svm_client_scan(char *root_path)
{
    int i, j;
    svm_main_region_t *mp;
    svm_map_region_args_t *a = 0;
    svm_region_t *root_rp;
    svm_region_t *rp;
    svm_subregion_t *subp;
    u8 *name = 0;
    u8 **svm_names = 0;
    void *oldheap;
    int mypid = getpid();

    vec_validate (a, 0);

    svm_region_init_chroot(root_path);

    root_rp = svm_get_root_rp();

    pthread_mutex_lock (&root_rp->mutex);

    mp = root_rp->data_base;

    for (j = 0; j < vec_len (root_rp->client_pids); j++) {
        if (mypid == root_rp->client_pids[j])
            continue;
        if (root_rp->client_pids[j]
            && (kill (root_rp->client_pids[j], 0) < 0)) {
            clib_warning ("%s: cleanup ghost pid %d",
                          root_rp->region_name, root_rp->client_pids[j]);
            /* nb: client vec in root_rp->region_heap */
            oldheap = svm_push_pvt_heap (root_rp);
            vec_delete (root_rp->client_pids, 1, j);
            j--;
            svm_pop_heap (oldheap);
        }
    }

    /*
     * Snapshot names; we can't hold the root rp mutex across
     * find_or_create.
     */
    pool_foreach (subp, mp->subregions, ({
        name = vec_dup (subp->subregion_name);
        vec_add1(svm_names, name);
    }));

    pthread_mutex_unlock (&root_rp->mutex);

    for (i = 0; i < vec_len(svm_names); i++) {
        vec_validate(a, 0);
        a->root_path = root_path;
        a->name = (char *) svm_names[i];
        rp = svm_region_find_or_create (a);
        if (rp) {
            pthread_mutex_lock (&rp->mutex);

            svm_client_scan_this_region_nolock (rp);

            pthread_mutex_unlock (&rp->mutex);
            svm_region_unmap (rp);
            vec_free(svm_names[i]);
        }
        vec_free (a);
    }
    vec_free(svm_names);

    svm_region_exit ();

    vec_free (a);
}