/*
 * Copyright (c) 2017 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

16#define _GNU_SOURCE
17#include <stdlib.h>
18#include <sys/types.h>
19#include <sys/stat.h>
20#include <unistd.h>
21#include <sys/mount.h>
22#include <sys/mman.h>
23#include <fcntl.h>
24#include <linux/mempolicy.h>
25#include <linux/memfd.h>
26
27#include <vppinfra/clib.h>
28#include <vppinfra/mem.h>
Damjan Marion70ae0a92020-10-26 10:39:30 +010029#include <vppinfra/lock.h>
Florin Corasd3e83a92018-01-16 02:40:18 -080030#include <vppinfra/time.h>
Damjan Marion01914ce2017-09-14 19:04:50 +020031#include <vppinfra/format.h>
32#include <vppinfra/clib_error.h>
33#include <vppinfra/linux/syscall.h>
34#include <vppinfra/linux/sysfs.h>
35
/*
 * Fallback definitions: each constant below is only defined here when the
 * system headers included above did not already provide it.
 */

/* fcntl () file sealing commands and seal bits */
#ifndef F_LINUX_SPECIFIC_BASE
#define F_LINUX_SPECIFIC_BASE 1024
#endif

#ifndef F_ADD_SEALS
#define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9)
#define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10)

#define F_SEAL_SEAL 0x0001	/* prevent further seals from being set */
#define F_SEAL_SHRINK 0x0002	/* prevent file from shrinking */
#define F_SEAL_GROW 0x0004	/* prevent file from growing */
#define F_SEAL_WRITE 0x0008	/* prevent writes */
#endif

/* memfd_create () flag requesting a hugetlb backed file */
#ifndef MFD_HUGETLB
#define MFD_HUGETLB 0x0004U
#endif

/* bit position of the log2 page size encoded into mmap () flags */
#ifndef MAP_HUGE_SHIFT
#define MAP_HUGE_SHIFT 26
#endif

/* bit position of the log2 page size encoded into memfd_create () flags */
#ifndef MFD_HUGE_SHIFT
#define MFD_HUGE_SHIFT 26
#endif

/* mmap () flag: place the mapping at the given address without replacing
 * an existing one */
#ifndef MAP_FIXED_NOREPLACE
#define MAP_FIXED_NOREPLACE 0x100000
#endif
Damjan Marion9787f5f2018-10-24 12:56:32 +020065
Damjan Marion70ae0a92020-10-26 10:39:30 +010066static void
67map_lock ()
68{
69 while (clib_atomic_test_and_set (&clib_mem_main.map_lock))
70 CLIB_PAUSE ();
71}
72
73static void
74map_unlock ()
75{
76 clib_atomic_release (&clib_mem_main.map_lock);
77}
78
Damjan Mariondae1c7e2020-10-17 13:32:25 +020079__clib_export uword
Damjan Marion9787f5f2018-10-24 12:56:32 +020080clib_mem_get_default_hugepage_size (void)
81{
82 unformat_input_t input;
83 static u32 size = 0;
84 int fd;
85
86 if (size)
87 goto done;
88
Dave Barach036343b2019-01-01 09:45:08 -050089 /*
90 * If the kernel doesn't support hugepages, /proc/meminfo won't
91 * say anything about it. Use the regular page size as a default.
92 */
93 size = clib_mem_get_page_size () / 1024;
94
Damjan Marion9787f5f2018-10-24 12:56:32 +020095 if ((fd = open ("/proc/meminfo", 0)) == -1)
96 return 0;
97
98 unformat_init_clib_file (&input, fd);
99
100 while (unformat_check_input (&input) != UNFORMAT_END_OF_INPUT)
101 {
102 if (unformat (&input, "Hugepagesize:%_%u kB", &size))
103 ;
104 else
105 unformat_skip_line (&input);
106 }
107 unformat_free (&input);
108 close (fd);
109done:
110 return 1024ULL * size;
111}
112
Damjan Marionc63e2a42020-09-16 21:36:00 +0200113static clib_mem_page_sz_t
114legacy_get_log2_default_hugepage_size (void)
115{
116 clib_mem_page_sz_t log2_page_size = CLIB_MEM_PAGE_SZ_UNKNOWN;
117 FILE *fp;
118 char tmp[33] = { };
119
120 if ((fp = fopen ("/proc/meminfo", "r")) == NULL)
121 return CLIB_MEM_PAGE_SZ_UNKNOWN;
122
123 while (fscanf (fp, "%32s", tmp) > 0)
124 if (strncmp ("Hugepagesize:", tmp, 13) == 0)
125 {
126 u32 size;
127 if (fscanf (fp, "%u", &size) > 0)
128 log2_page_size = 10 + min_log2 (size);
129 break;
130 }
131
132 fclose (fp);
133 return log2_page_size;
134}
135
136void
137clib_mem_main_init ()
138{
139 clib_mem_main_t *mm = &clib_mem_main;
140 uword page_size;
141 void *va;
142 int fd;
143
144 if (mm->log2_page_sz != CLIB_MEM_PAGE_SZ_UNKNOWN)
145 return;
146
147 /* system page size */
148 page_size = sysconf (_SC_PAGESIZE);
149 mm->log2_page_sz = min_log2 (page_size);
150
151 /* default system hugeppage size */
152 if ((fd = memfd_create ("test", MFD_HUGETLB)) != -1)
153 {
154 mm->log2_default_hugepage_sz = clib_mem_get_fd_log2_page_size (fd);
155 close (fd);
156 }
157 else /* likely kernel older than 4.14 */
158 mm->log2_default_hugepage_sz = legacy_get_log2_default_hugepage_size ();
159
160 /* numa nodes */
161 va = mmap (0, page_size, PROT_READ | PROT_WRITE, MAP_PRIVATE |
162 MAP_ANONYMOUS, -1, 0);
163 if (va == MAP_FAILED)
164 return;
165
166 if (mlock (va, page_size))
167 goto done;
168
169 for (int i = 0; i < CLIB_MAX_NUMAS; i++)
170 {
171 int status;
172 if (move_pages (0, 1, &va, &i, &status, 0) == 0)
173 mm->numa_node_bitmap |= 1ULL << i;
174 }
175
176done:
177 munmap (va, page_size);
178}
179
Damjan Mariondae1c7e2020-10-17 13:32:25 +0200180__clib_export u64
Damjan Marion567e61d2018-10-24 17:08:26 +0200181clib_mem_get_fd_page_size (int fd)
Damjan Marion01914ce2017-09-14 19:04:50 +0200182{
183 struct stat st = { 0 };
Chris Lukeb2bcad62017-09-18 08:51:22 -0400184 if (fstat (fd, &st) == -1)
Damjan Marion01914ce2017-09-14 19:04:50 +0200185 return 0;
Florin Corasd3e83a92018-01-16 02:40:18 -0800186 return st.st_blksize;
187}
188
Damjan Mariondae1c7e2020-10-17 13:32:25 +0200189__clib_export clib_mem_page_sz_t
Damjan Marion567e61d2018-10-24 17:08:26 +0200190clib_mem_get_fd_log2_page_size (int fd)
Florin Corasd3e83a92018-01-16 02:40:18 -0800191{
Damjan Marion6bfd0762020-09-11 22:16:53 +0200192 uword page_size = clib_mem_get_fd_page_size (fd);
193 return page_size ? min_log2 (page_size) : CLIB_MEM_PAGE_SZ_UNKNOWN;
Florin Corasd3e83a92018-01-16 02:40:18 -0800194}
195
Damjan Mariondae1c7e2020-10-17 13:32:25 +0200196__clib_export void
Damjan Marionb5095042020-09-11 22:13:46 +0200197clib_mem_vm_randomize_va (uword * requested_va,
198 clib_mem_page_sz_t log2_page_size)
Florin Corasd3e83a92018-01-16 02:40:18 -0800199{
200 u8 bit_mask = 15;
201
202 if (log2_page_size <= 12)
203 bit_mask = 15;
204 else if (log2_page_size > 12 && log2_page_size <= 16)
205 bit_mask = 3;
206 else
207 bit_mask = 0;
208
Haiyang Tana5ab5032018-10-15 06:17:55 -0700209 *requested_va +=
210 (clib_cpu_time_now () & bit_mask) * (1ull << log2_page_size);
Damjan Marion01914ce2017-09-14 19:04:50 +0200211}
212
Damjan Marionbdbb0c52020-09-17 10:40:44 +0200213static int
214legacy_memfd_create (u8 * name)
215{
216 clib_mem_main_t *mm = &clib_mem_main;
217 int fd = -1;
218 char *mount_dir;
Benoît Ganne2b92c702020-09-28 17:34:17 +0200219 u8 *temp;
Damjan Marionbdbb0c52020-09-17 10:40:44 +0200220 u8 *filename;
221
Benoît Ganne2b92c702020-09-28 17:34:17 +0200222 /*
223 * Since mkdtemp will modify template string "/tmp/hugepage_mount.XXXXXX",
224 * it must not be a string constant, but should be declared as
225 * a character array.
226 */
227 temp = format (0, "/tmp/hugepage_mount.XXXXXX%c", 0);
228
Damjan Marionbdbb0c52020-09-17 10:40:44 +0200229 /* create mount directory */
Benoît Ganne2b92c702020-09-28 17:34:17 +0200230 if ((mount_dir = mkdtemp ((char *) temp)) == 0)
Damjan Marionbdbb0c52020-09-17 10:40:44 +0200231 {
Benoît Ganne2b92c702020-09-28 17:34:17 +0200232 vec_free (temp);
Damjan Marionbdbb0c52020-09-17 10:40:44 +0200233 vec_reset_length (mm->error);
234 mm->error = clib_error_return_unix (mm->error, "mkdtemp");
Damjan Marion561ae5d2020-09-24 13:53:46 +0200235 return CLIB_MEM_ERROR;
Damjan Marionbdbb0c52020-09-17 10:40:44 +0200236 }
237
238 if (mount ("none", mount_dir, "hugetlbfs", 0, NULL))
239 {
Benoît Ganne2b92c702020-09-28 17:34:17 +0200240 vec_free (temp);
Damjan Marionbdbb0c52020-09-17 10:40:44 +0200241 rmdir ((char *) mount_dir);
242 vec_reset_length (mm->error);
243 mm->error = clib_error_return_unix (mm->error, "mount");
Damjan Marion561ae5d2020-09-24 13:53:46 +0200244 return CLIB_MEM_ERROR;
Damjan Marionbdbb0c52020-09-17 10:40:44 +0200245 }
246
247 filename = format (0, "%s/%s%c", mount_dir, name, 0);
248
249 if ((fd = open ((char *) filename, O_CREAT | O_RDWR, 0755)) == -1)
250 {
251 vec_reset_length (mm->error);
252 mm->error = clib_error_return_unix (mm->error, "mkdtemp");
253 }
254
255 umount2 ((char *) mount_dir, MNT_DETACH);
256 rmdir ((char *) mount_dir);
257 vec_free (filename);
Benoît Ganne2b92c702020-09-28 17:34:17 +0200258 vec_free (temp);
Damjan Marionbdbb0c52020-09-17 10:40:44 +0200259
260 return fd;
261}
262
Damjan Mariondae1c7e2020-10-17 13:32:25 +0200263__clib_export int
Damjan Marionbdbb0c52020-09-17 10:40:44 +0200264clib_mem_vm_create_fd (clib_mem_page_sz_t log2_page_size, char *fmt, ...)
265{
266 clib_mem_main_t *mm = &clib_mem_main;
267 int fd;
268 unsigned int memfd_flags;
269 va_list va;
270 u8 *s = 0;
271
272 if (log2_page_size == mm->log2_page_sz)
273 log2_page_size = CLIB_MEM_PAGE_SZ_DEFAULT;
Benoît Ganne2b92c702020-09-28 17:34:17 +0200274 else if (log2_page_size == mm->log2_default_hugepage_sz)
275 log2_page_size = CLIB_MEM_PAGE_SZ_DEFAULT_HUGE;
Damjan Marionbdbb0c52020-09-17 10:40:44 +0200276
277 switch (log2_page_size)
278 {
279 case CLIB_MEM_PAGE_SZ_UNKNOWN:
Damjan Marion561ae5d2020-09-24 13:53:46 +0200280 return CLIB_MEM_ERROR;
Damjan Marionbdbb0c52020-09-17 10:40:44 +0200281 case CLIB_MEM_PAGE_SZ_DEFAULT:
282 memfd_flags = MFD_ALLOW_SEALING;
283 break;
284 case CLIB_MEM_PAGE_SZ_DEFAULT_HUGE:
285 memfd_flags = MFD_HUGETLB;
286 break;
287 default:
288 memfd_flags = MFD_HUGETLB | log2_page_size << MFD_HUGE_SHIFT;
289 }
290
291 va_start (va, fmt);
292 s = va_format (0, fmt, &va);
293 va_end (va);
294
295 /* memfd_create maximum string size is 249 chars without trailing zero */
296 if (vec_len (s) > 249)
297 _vec_len (s) = 249;
298 vec_add1 (s, 0);
299
300 /* memfd_create introduced in kernel 3.17, we don't support older kernels */
301 fd = memfd_create ((char *) s, memfd_flags);
302
303 /* kernel versions < 4.14 does not support memfd_create for huge pages */
304 if (fd == -1 && errno == EINVAL &&
305 log2_page_size == CLIB_MEM_PAGE_SZ_DEFAULT_HUGE)
306 {
307 fd = legacy_memfd_create (s);
308 }
309 else if (fd == -1)
310 {
311 vec_reset_length (mm->error);
312 mm->error = clib_error_return_unix (mm->error, "memfd_create");
313 vec_free (s);
Damjan Marion561ae5d2020-09-24 13:53:46 +0200314 return CLIB_MEM_ERROR;
Damjan Marionbdbb0c52020-09-17 10:40:44 +0200315 }
316
317 vec_free (s);
318
319 if ((memfd_flags & MFD_ALLOW_SEALING) &&
320 ((fcntl (fd, F_ADD_SEALS, F_SEAL_SHRINK)) == -1))
321 {
322 vec_reset_length (mm->error);
323 mm->error = clib_error_return_unix (mm->error, "fcntl (F_ADD_SEALS)");
324 close (fd);
Damjan Marion561ae5d2020-09-24 13:53:46 +0200325 return CLIB_MEM_ERROR;
Damjan Marionbdbb0c52020-09-17 10:40:44 +0200326 }
327
328 return fd;
329}
330
Dave Barach16e4a4a2020-04-16 12:00:14 -0400331uword
Damjan Marionb5095042020-09-11 22:13:46 +0200332clib_mem_vm_reserve (uword start, uword size, clib_mem_page_sz_t log2_page_sz)
Dave Barach16e4a4a2020-04-16 12:00:14 -0400333{
Damjan Marion6bfd0762020-09-11 22:16:53 +0200334 clib_mem_main_t *mm = &clib_mem_main;
335 uword pagesize = 1ULL << log2_page_sz;
336 uword sys_page_sz = 1ULL << mm->log2_page_sz;
337 uword n_bytes;
338 void *base = 0, *p;
Dave Barach16e4a4a2020-04-16 12:00:14 -0400339
340 size = round_pow2 (size, pagesize);
341
Damjan Marion6bfd0762020-09-11 22:16:53 +0200342 /* in adition of requested reservation, we also rserve one system page
343 * (typically 4K) adjacent to the start off reservation */
Dave Barach16e4a4a2020-04-16 12:00:14 -0400344
Damjan Marion6bfd0762020-09-11 22:16:53 +0200345 if (start)
Dave Barach16e4a4a2020-04-16 12:00:14 -0400346 {
Damjan Marion6bfd0762020-09-11 22:16:53 +0200347 /* start address is provided, so we just need to make sure we are not
348 * replacing existing map */
349 if (start & pow2_mask (log2_page_sz))
350 return ~0;
351
352 base = (void *) start - sys_page_sz;
353 base = mmap (base, size + sys_page_sz, PROT_NONE,
354 MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE, -1, 0);
355 return (base == MAP_FAILED) ? ~0 : start;
Dave Barach16e4a4a2020-04-16 12:00:14 -0400356 }
357
Damjan Marion6bfd0762020-09-11 22:16:53 +0200358 /* to make sure that we get reservation aligned to page_size we need to
359 * request one additional page as mmap will return us address which is
360 * aligned only to system page size */
361 base = mmap (0, size + pagesize, PROT_NONE,
362 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
Dave Barach16e4a4a2020-04-16 12:00:14 -0400363
Damjan Marion6bfd0762020-09-11 22:16:53 +0200364 if (base == MAP_FAILED)
365 return ~0;
366
367 /* return additional space at the end of allocation */
368 p = base + size + pagesize;
369 n_bytes = (uword) p & pow2_mask (log2_page_sz);
370 if (n_bytes)
371 {
372 p -= n_bytes;
373 munmap (p, n_bytes);
374 }
375
376 /* return additional space at the start of allocation */
377 n_bytes = pagesize - sys_page_sz - n_bytes;
378 if (n_bytes)
379 {
380 munmap (base, n_bytes);
381 base += n_bytes;
382 }
383
384 return (uword) base + sys_page_sz;
Dave Barach16e4a4a2020-04-16 12:00:14 -0400385}
386
Damjan Mariondae1c7e2020-10-17 13:32:25 +0200387__clib_export clib_mem_vm_map_hdr_t *
Damjan Marion6bfd0762020-09-11 22:16:53 +0200388clib_mem_vm_get_next_map_hdr (clib_mem_vm_map_hdr_t * hdr)
389{
390 clib_mem_main_t *mm = &clib_mem_main;
Dave Barach27c35e32020-10-07 09:37:36 -0400391 uword sys_page_sz = 1ULL << mm->log2_page_sz;
Damjan Marion6bfd0762020-09-11 22:16:53 +0200392 clib_mem_vm_map_hdr_t *next;
393 if (hdr == 0)
394 {
395 hdr = mm->first_map;
396 if (hdr)
397 mprotect (hdr, sys_page_sz, PROT_READ);
398 return hdr;
399 }
400 next = hdr->next;
401 mprotect (hdr, sys_page_sz, PROT_NONE);
402 if (next)
403 mprotect (next, sys_page_sz, PROT_READ);
404 return next;
405}
406
407void *
408clib_mem_vm_map_internal (void *base, clib_mem_page_sz_t log2_page_sz,
409 uword size, int fd, uword offset, char *name)
410{
411 clib_mem_main_t *mm = &clib_mem_main;
412 clib_mem_vm_map_hdr_t *hdr;
Dave Barach27c35e32020-10-07 09:37:36 -0400413 uword sys_page_sz = 1ULL << mm->log2_page_sz;
Damjan Marion6bfd0762020-09-11 22:16:53 +0200414 int mmap_flags = MAP_FIXED, is_huge = 0;
415
416 if (fd != -1)
417 {
418 mmap_flags |= MAP_SHARED;
419 log2_page_sz = clib_mem_get_fd_log2_page_size (fd);
420 if (log2_page_sz > mm->log2_page_sz)
421 is_huge = 1;
422 }
423 else
424 {
425 mmap_flags |= MAP_PRIVATE | MAP_ANONYMOUS;
426
427 if (log2_page_sz == mm->log2_page_sz)
428 log2_page_sz = CLIB_MEM_PAGE_SZ_DEFAULT;
429
430 switch (log2_page_sz)
431 {
432 case CLIB_MEM_PAGE_SZ_UNKNOWN:
433 /* will fail later */
434 break;
435 case CLIB_MEM_PAGE_SZ_DEFAULT:
436 log2_page_sz = mm->log2_page_sz;
437 break;
438 case CLIB_MEM_PAGE_SZ_DEFAULT_HUGE:
439 mmap_flags |= MAP_HUGETLB;
440 log2_page_sz = mm->log2_default_hugepage_sz;
441 is_huge = 1;
442 break;
443 default:
444 mmap_flags |= MAP_HUGETLB;
445 mmap_flags |= log2_page_sz << MAP_HUGE_SHIFT;
446 is_huge = 1;
447 }
448 }
449
450 if (log2_page_sz == CLIB_MEM_PAGE_SZ_UNKNOWN)
451 return CLIB_MEM_VM_MAP_FAILED;
452
Dave Barach27c35e32020-10-07 09:37:36 -0400453 size = round_pow2 (size, 1ULL << log2_page_sz);
Damjan Marion6bfd0762020-09-11 22:16:53 +0200454
455 base = (void *) clib_mem_vm_reserve ((uword) base, size, log2_page_sz);
456
457 if (base == (void *) ~0)
458 return CLIB_MEM_VM_MAP_FAILED;
459
460 base = mmap (base, size, PROT_READ | PROT_WRITE, mmap_flags, fd, offset);
461
462 if (base == MAP_FAILED)
463 return CLIB_MEM_VM_MAP_FAILED;
464
465 if (is_huge && (mlock (base, size) != 0))
466 {
467 munmap (base, size);
468 return CLIB_MEM_VM_MAP_FAILED;
469 }
470
471 hdr = mmap (base - sys_page_sz, sys_page_sz, PROT_READ | PROT_WRITE,
472 MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0);
473
474 if (hdr != base - sys_page_sz)
475 {
476 munmap (base, size);
477 return CLIB_MEM_VM_MAP_FAILED;
478 }
479
Damjan Marion70ae0a92020-10-26 10:39:30 +0100480 map_lock ();
481
Damjan Marion6bfd0762020-09-11 22:16:53 +0200482 if (mm->last_map)
483 {
484 mprotect (mm->last_map, sys_page_sz, PROT_READ | PROT_WRITE);
485 mm->last_map->next = hdr;
486 mprotect (mm->last_map, sys_page_sz, PROT_NONE);
487 }
488 else
489 mm->first_map = hdr;
490
Benoît Ganne568617b2020-10-21 10:02:18 +0200491 CLIB_MEM_UNPOISON (hdr, sys_page_sz);
Damjan Marion6bfd0762020-09-11 22:16:53 +0200492 hdr->next = 0;
493 hdr->prev = mm->last_map;
Florin Corasb4d9c5d2021-02-02 15:40:35 -0800494 snprintf (hdr->name, CLIB_VM_MAP_HDR_NAME_MAX_LEN - 1, "%s", (char *) name);
Damjan Marion6bfd0762020-09-11 22:16:53 +0200495 mm->last_map = hdr;
496
Damjan Marion70ae0a92020-10-26 10:39:30 +0100497 map_unlock ();
498
Damjan Marion6bfd0762020-09-11 22:16:53 +0200499 hdr->base_addr = (uword) base;
500 hdr->log2_page_sz = log2_page_sz;
501 hdr->num_pages = size >> log2_page_sz;
Damjan Marion5ef25162020-09-17 13:29:33 +0200502 hdr->fd = fd;
Damjan Marion6bfd0762020-09-11 22:16:53 +0200503 hdr->name[CLIB_VM_MAP_HDR_NAME_MAX_LEN - 1] = 0;
504 mprotect (hdr, sys_page_sz, PROT_NONE);
505
506 CLIB_MEM_UNPOISON (base, size);
507 return base;
508}
509
Damjan Mariondae1c7e2020-10-17 13:32:25 +0200510__clib_export int
Damjan Marion6bfd0762020-09-11 22:16:53 +0200511clib_mem_vm_unmap (void *base)
512{
513 clib_mem_main_t *mm = &clib_mem_main;
Dave Barach27c35e32020-10-07 09:37:36 -0400514 uword size, sys_page_sz = 1ULL << mm->log2_page_sz;
Damjan Marion6bfd0762020-09-11 22:16:53 +0200515 clib_mem_vm_map_hdr_t *hdr = base - sys_page_sz;;
516
517 if (mprotect (hdr, sys_page_sz, PROT_READ | PROT_WRITE) != 0)
Damjan Marion561ae5d2020-09-24 13:53:46 +0200518 return CLIB_MEM_ERROR;
Damjan Marion6bfd0762020-09-11 22:16:53 +0200519
520 size = hdr->num_pages << hdr->log2_page_sz;
521 if (munmap ((void *) hdr->base_addr, size) != 0)
Damjan Marion561ae5d2020-09-24 13:53:46 +0200522 return CLIB_MEM_ERROR;
Damjan Marion6bfd0762020-09-11 22:16:53 +0200523
Damjan Marion70ae0a92020-10-26 10:39:30 +0100524 map_lock ();
525
Damjan Marion6bfd0762020-09-11 22:16:53 +0200526 if (hdr->next)
527 {
528 mprotect (hdr->next, sys_page_sz, PROT_READ | PROT_WRITE);
529 hdr->next->prev = hdr->prev;
530 mprotect (hdr->next, sys_page_sz, PROT_NONE);
531 }
532 else
533 mm->last_map = hdr->prev;
534
535 if (hdr->prev)
536 {
537 mprotect (hdr->prev, sys_page_sz, PROT_READ | PROT_WRITE);
538 hdr->prev->next = hdr->next;
539 mprotect (hdr->prev, sys_page_sz, PROT_NONE);
540 }
541 else
542 mm->first_map = hdr->next;
543
Damjan Marion70ae0a92020-10-26 10:39:30 +0100544 map_unlock ();
545
Damjan Marion6bfd0762020-09-11 22:16:53 +0200546 if (munmap (hdr, sys_page_sz) != 0)
Damjan Marion561ae5d2020-09-24 13:53:46 +0200547 return CLIB_MEM_ERROR;
Damjan Marion6bfd0762020-09-11 22:16:53 +0200548
549 return 0;
550}
551
Damjan Mariondae1c7e2020-10-17 13:32:25 +0200552__clib_export void
Damjan Marion6bfd0762020-09-11 22:16:53 +0200553clib_mem_get_page_stats (void *start, clib_mem_page_sz_t log2_page_size,
554 uword n_pages, clib_mem_page_stats_t * stats)
555{
556 int i, *status = 0;
557 void **ptr = 0;
558
559 log2_page_size = clib_mem_log2_page_size_validate (log2_page_size);
560
561 vec_validate (status, n_pages - 1);
562 vec_validate (ptr, n_pages - 1);
563
564 for (i = 0; i < n_pages; i++)
565 ptr[i] = start + (i << log2_page_size);
566
567 clib_memset (stats, 0, sizeof (clib_mem_page_stats_t));
Damjan Marionbfa75d62020-10-06 17:46:06 +0200568 stats->total = n_pages;
569 stats->log2_page_sz = log2_page_size;
Damjan Marion6bfd0762020-09-11 22:16:53 +0200570
571 if (move_pages (0, n_pages, ptr, 0, status, 0) != 0)
572 {
573 stats->unknown = n_pages;
574 return;
575 }
576
577 for (i = 0; i < n_pages; i++)
578 {
579 if (status[i] >= 0 && status[i] < CLIB_MAX_NUMAS)
580 {
581 stats->mapped++;
582 stats->per_numa[status[i]]++;
583 }
584 else if (status[i] == -EFAULT)
585 stats->not_mapped++;
586 else
587 stats->unknown++;
588 }
589}
590
591
Damjan Mariondae1c7e2020-10-17 13:32:25 +0200592__clib_export u64 *
Damjan Marion6bfd0762020-09-11 22:16:53 +0200593clib_mem_vm_get_paddr (void *mem, clib_mem_page_sz_t log2_page_size,
594 int n_pages)
Damjan Marion01914ce2017-09-14 19:04:50 +0200595{
596 int pagesize = sysconf (_SC_PAGESIZE);
597 int fd;
598 int i;
599 u64 *r = 0;
600
Damjan Marion6bfd0762020-09-11 22:16:53 +0200601 log2_page_size = clib_mem_log2_page_size_validate (log2_page_size);
602
Damjan Marion01914ce2017-09-14 19:04:50 +0200603 if ((fd = open ((char *) "/proc/self/pagemap", O_RDONLY)) == -1)
604 return 0;
605
606 for (i = 0; i < n_pages; i++)
607 {
608 u64 seek, pagemap = 0;
609 uword vaddr = pointer_to_uword (mem) + (((u64) i) << log2_page_size);
610 seek = ((u64) vaddr / pagesize) * sizeof (u64);
611 if (lseek (fd, seek, SEEK_SET) != seek)
612 goto done;
613
614 if (read (fd, &pagemap, sizeof (pagemap)) != (sizeof (pagemap)))
615 goto done;
616
617 if ((pagemap & (1ULL << 63)) == 0)
618 goto done;
619
620 pagemap &= pow2_mask (55);
621 vec_add1 (r, pagemap * pagesize);
622 }
623
624done:
625 close (fd);
626 if (vec_len (r) != n_pages)
627 {
628 vec_free (r);
629 return 0;
630 }
631 return r;
632}
633
Damjan Mariondae1c7e2020-10-17 13:32:25 +0200634__clib_export int
Damjan Marion561ae5d2020-09-24 13:53:46 +0200635clib_mem_set_numa_affinity (u8 numa_node, int force)
636{
637 clib_mem_main_t *mm = &clib_mem_main;
638 long unsigned int mask[16] = { 0 };
639 int mask_len = sizeof (mask) * 8 + 1;
640
641 /* no numa support */
642 if (mm->numa_node_bitmap == 0)
643 {
644 if (numa_node)
645 {
646 vec_reset_length (mm->error);
647 mm->error = clib_error_return (mm->error, "%s: numa not supported",
648 (char *) __func__);
649 return CLIB_MEM_ERROR;
650 }
651 else
652 return 0;
653 }
654
655 mask[0] = 1 << numa_node;
656
657 if (set_mempolicy (force ? MPOL_BIND : MPOL_PREFERRED, mask, mask_len))
658 goto error;
659
660 vec_reset_length (mm->error);
661 return 0;
662
663error:
664 vec_reset_length (mm->error);
665 mm->error = clib_error_return_unix (mm->error, (char *) __func__);
666 return CLIB_MEM_ERROR;
667}
668
Damjan Mariondae1c7e2020-10-17 13:32:25 +0200669__clib_export int
Damjan Marion561ae5d2020-09-24 13:53:46 +0200670clib_mem_set_default_numa_affinity ()
671{
672 clib_mem_main_t *mm = &clib_mem_main;
673
674 if (set_mempolicy (MPOL_DEFAULT, 0, 0))
675 {
676 vec_reset_length (mm->error);
677 mm->error = clib_error_return_unix (mm->error, (char *) __func__);
678 return CLIB_MEM_ERROR;
679 }
680 return 0;
681}
682
/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */