/* SPDX-License-Identifier: Apache-2.0
 * Copyright(c) 2021 Damjan Marion
 */

#ifndef included_clib_memcpy_x86_64_h
#define included_clib_memcpy_x86_64_h
#ifdef __x86_64__

#include <vppinfra/clib.h>
#include <vppinfra/warnings.h>
#include <stdio.h>

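/* Hand-tuned memcpy for x86_64: compile-time-constant sizes are expanded
 * into a minimal sequence of scalar and vector moves, runtime sizes into
 * overlapping moves or unrolled SSE/AVX2/AVX-512 copy loops.  The
 * overlapping fixed-size copies below can trigger -Wstringop-overflow
 * false positives on some GCC versions, hence the suppression. */
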
/* clang-format off */
WARN_OFF (stringop-overflow)
/* clang-format on */

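/* Fixed-size copy helpers.  The 2/4/8-byte variants go through the
 * unaligned scalar types (u16u, u32u, u64u); the 16/32/64-byte variants
 * use unaligned vector types when the ISA is available at compile time. */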
static_always_inline void
clib_memcpy1 (void *d, void *s)
{
  *(u8 *) d = *(u8 *) s;
}

static_always_inline void
clib_memcpy2 (void *d, void *s)
{
  *(u16u *) d = *(u16u *) s;
}

static_always_inline void
clib_memcpy4 (void *d, void *s)
{
  *(u32u *) d = *(u32u *) s;
}

static_always_inline void
clib_memcpy8 (void *d, void *s)
{
  *(u64u *) d = *(u64u *) s;
}

static_always_inline void
clib_memcpy16 (void *d, void *s)
{
#ifdef CLIB_HAVE_VEC128
  *(u8x16u *) d = *(u8x16u *) s;
#else
  clib_memcpy8 (d, s);
  clib_memcpy8 (d + 8, s + 8);
#endif
}

#ifdef CLIB_HAVE_VEC256
static_always_inline void
clib_memcpy32 (void *d, void *s)
{
  *(u8x32u *) d = *(u8x32u *) s;
}
#endif

#ifdef CLIB_HAVE_VEC512
static_always_inline void
clib_memcpy64 (void *d, void *s)
{
  *(u8x64u *) d = *(u8x64u *) s;
}
#endif

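/* Copy a compile-time-constant n <= 32.  Sizes without a single native
 * move are built from two moves that may overlap: e.g. n == 7 is two
 * 4-byte copies overlapping at byte 3, and the default case handles the
 * remaining sizes up to 32 as two overlapping 16-byte copies. */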
static_always_inline void
clib_memcpy_const_le32 (u8 *dst, u8 *src, size_t n)
{
  switch (n)
    {
    case 1:
      clib_memcpy1 (dst, src);
      break;
    case 2:
      clib_memcpy2 (dst, src);
      break;
    case 3:
      clib_memcpy2 (dst, src);
      clib_memcpy1 (dst + 2, src + 2);
      break;
    case 4:
      clib_memcpy4 (dst, src);
      break;
    case 5:
      clib_memcpy4 (dst, src);
      clib_memcpy1 (dst + 4, src + 4);
      break;
    case 6:
      clib_memcpy4 (dst, src);
      clib_memcpy2 (dst + 4, src + 4);
      break;
    case 7:
      clib_memcpy4 (dst, src);
      clib_memcpy4 (dst + 3, src + 3);
      break;
    case 8:
      clib_memcpy8 (dst, src);
      break;
    case 9:
      clib_memcpy8 (dst, src);
      clib_memcpy1 (dst + 8, src + 8);
      break;
    case 10:
      clib_memcpy8 (dst, src);
      clib_memcpy2 (dst + 8, src + 8);
      break;
    case 11:
    case 12:
      clib_memcpy8 (dst, src);
      clib_memcpy4 (dst + n - 4, src + n - 4);
      break;
    case 13:
    case 14:
    case 15:
      clib_memcpy8 (dst, src);
      clib_memcpy8 (dst + n - 8, src + n - 8);
      break;
    case 16:
      clib_memcpy16 (dst, src);
      break;
    case 17:
      clib_memcpy16 (dst, src);
      clib_memcpy1 (dst + 16, src + 16);
      break;
    case 18:
      clib_memcpy16 (dst, src);
      clib_memcpy2 (dst + 16, src + 16);
      break;
    case 20:
      clib_memcpy16 (dst, src);
      clib_memcpy4 (dst + 16, src + 16);
      break;
    case 24:
      clib_memcpy16 (dst, src);
      clib_memcpy8 (dst + 16, src + 16);
      break;
    default:
      clib_memcpy16 (dst, src);
      clib_memcpy16 (dst + n - 16, src + n - 16);
      break;
    }
}

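/* Copy a compile-time-constant n <= 64.  With AVX2 this is one 32-byte
 * move plus a smaller or overlapping second move; without it, pairs of
 * 16-byte moves with the tail handled by clib_memcpy_const_le32(). */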
static_always_inline void
clib_memcpy_const_le64 (u8 *dst, u8 *src, size_t n)
{
  if (n < 32)
    {
      clib_memcpy_const_le32 (dst, src, n);
      return;
    }

#if defined(CLIB_HAVE_VEC256)
  switch (n)
    {
    case 32:
      clib_memcpy32 (dst, src);
      break;
    case 33:
      clib_memcpy32 (dst, src);
      clib_memcpy1 (dst + 32, src + 32);
      break;
    case 34:
      clib_memcpy32 (dst, src);
      clib_memcpy2 (dst + 32, src + 32);
      break;
    case 36:
      clib_memcpy32 (dst, src);
      clib_memcpy4 (dst + 32, src + 32);
      break;
    case 40:
      clib_memcpy32 (dst, src);
      clib_memcpy8 (dst + 32, src + 32);
      break;
    case 48:
      clib_memcpy32 (dst, src);
      clib_memcpy16 (dst + 32, src + 32);
      break;
    default:
      clib_memcpy32 (dst, src);
      clib_memcpy32 (dst + n - 32, src + n - 32);
      break;
    }
#else
  while (n > 31)
    {
      clib_memcpy16 (dst, src);
      clib_memcpy16 (dst + 16, src + 16);
      dst += 32;
      src += 32;
      n -= 32;
    }
  clib_memcpy_const_le32 (dst, src, n);
#endif
}

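/* Copy any compile-time-constant n: bulk loop with the widest available
 * vector, then at most two tail moves via the helpers above. */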
static_always_inline void
clib_memcpy_x86_64_const (u8 *dst, u8 *src, size_t n)
{
#if defined(CLIB_HAVE_VEC512)
  while (n > 128)
    {
      clib_memcpy64 (dst, src);
      dst += 64;
      src += 64;
      n -= 64;
    }

  if (n < 64)
    {
      clib_memcpy_const_le64 (dst, src, n);
      return;
    }

  switch (n)
    {
    case 64:
      clib_memcpy64 (dst, src);
      break;
    case 65:
      clib_memcpy64 (dst, src);
      clib_memcpy1 (dst + 64, src + 64);
      break;
    case 66:
      clib_memcpy64 (dst, src);
      clib_memcpy2 (dst + 64, src + 64);
      break;
    case 68:
      clib_memcpy64 (dst, src);
      clib_memcpy4 (dst + 64, src + 64);
      break;
    case 72:
      clib_memcpy64 (dst, src);
      clib_memcpy8 (dst + 64, src + 64);
      break;
    case 80:
      clib_memcpy64 (dst, src);
      clib_memcpy16 (dst + 64, src + 64);
      break;
    case 96:
      clib_memcpy64 (dst, src);
      clib_memcpy32 (dst + 64, src + 64);
      break;
    default:
      clib_memcpy64 (dst, src);
      clib_memcpy64 (dst + n - 64, src + n - 64);
      break;
    }
#elif defined(CLIB_HAVE_VEC256)
  while (n > 64)
    {
      clib_memcpy32 (dst, src);
      dst += 32;
      src += 32;
      n -= 32;
    }
  clib_memcpy_const_le64 (dst, src, n);
#else
  while (n > 32)
    {
      clib_memcpy16 (dst, src);
      dst += 16;
      src += 16;
      n -= 16;
    }
  clib_memcpy_const_le32 (dst, src, n);
#endif
}

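/* memcpy() work-alike.  Constant n is expanded fully at compile time;
 * runtime n <= 32 uses a masked move or an overlapping pair of moves;
 * larger n goes through the unrolled vector loops below.  As with
 * memcpy(), regions must not overlap and dst is returned. */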
static_always_inline void *
clib_memcpy_x86_64 (void *restrict dst, const void *restrict src, size_t n)
{
  u8 *d = (u8 *) dst, *s = (u8 *) src;

  if (n == 0)
    return dst;

  if (COMPILE_TIME_CONST (n))
    {
      if (n)
        clib_memcpy_x86_64_const (d, s, n);
      return dst;
    }

  if (n <= 32)
    {
#if defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE)
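      /* A single masked 32-byte load/store covers any n in [1, 32]. */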
      u32 mask = pow2_mask (n);
      u8x32_mask_store (u8x32_mask_load_zero (s, mask), d, mask);
#else
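      /* No masked ops: copy head and tail with two possibly overlapping
       * moves of the largest power-of-2 chunk not exceeding n. */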
      if (PREDICT_TRUE (n >= 16))
        {
          clib_memcpy16 (d, s);
          clib_memcpy16 (d + n - 16, s + n - 16);
        }
      else if (PREDICT_TRUE (n >= 8))
        {
          clib_memcpy8 (d, s);
          clib_memcpy8 (d + n - 8, s + n - 8);
        }
      else if (PREDICT_TRUE (n >= 4))
        {
          clib_memcpy4 (d, s);
          clib_memcpy4 (d + n - 4, s + n - 4);
        }
      else if (PREDICT_TRUE (n > 1))
        {
          clib_memcpy2 (d, s);
          clib_memcpy2 (d + n - 2, s + n - 2);
        }
      else
        clib_memcpy1 (d, s);
#endif
    }
#ifdef CLIB_HAVE_VEC512
  else
    {
      u8x64 v0, v1, v2, v3;
      u64 final_off, nr, off = 64;

      if (n <= 64)
        {
          n -= 32;
          u8x32_store_unaligned (u8x32_load_unaligned (s), d);
          u8x32_store_unaligned (u8x32_load_unaligned (s + n), d + n);
          return dst;
        }

      u8x64_store_unaligned (u8x64_load_unaligned (s), d);

      if (n <= 128)
        goto done2;

      if (n <= 192)
        goto one;

      if (n <= 512 + 64)
        {
          nr = round_pow2 (n - 128, 64);
          goto last;
        }

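      /* Align subsequent destination stores to a 64-byte boundary.  The
       * first 64 bytes are already stored, so pulling 'off' back cannot
       * leave a gap; source loads stay unaligned. */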
      off -= ((u64) d) & 0x3f;
      nr = round_pow2 (n - off - 64, 64);
      final_off = (nr & ~(u64) 0x1ff) + off;

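      /* Main loop: 512 bytes per iteration.  'last' mops up the remaining
       * 64-448 bytes in 256/128/64-byte steps, and 'done2' stores the
       * final, possibly overlapping, 64-byte tail. */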
    more:
      v0 = u8x64_load_unaligned (s + off + 0x000);
      v1 = u8x64_load_unaligned (s + off + 0x040);
      v2 = u8x64_load_unaligned (s + off + 0x080);
      v3 = u8x64_load_unaligned (s + off + 0x0c0);
      u8x64_store_unaligned (v0, d + off + 0x000);
      u8x64_store_unaligned (v1, d + off + 0x040);
      u8x64_store_unaligned (v2, d + off + 0x080);
      u8x64_store_unaligned (v3, d + off + 0x0c0);
      v0 = u8x64_load_unaligned (s + off + 0x100);
      v1 = u8x64_load_unaligned (s + off + 0x140);
      v2 = u8x64_load_unaligned (s + off + 0x180);
      v3 = u8x64_load_unaligned (s + off + 0x1c0);
      u8x64_store_unaligned (v0, d + off + 0x100);
      u8x64_store_unaligned (v1, d + off + 0x140);
      u8x64_store_unaligned (v2, d + off + 0x180);
      u8x64_store_unaligned (v3, d + off + 0x1c0);
      off += 512;
      if (off != final_off)
        goto more;

      if ((nr & 0x1ff) == 0)
        goto done2;

    last:
      if (PREDICT_TRUE (nr & 256))
        {
          v0 = u8x64_load_unaligned (s + off + 0x000);
          v1 = u8x64_load_unaligned (s + off + 0x040);
          v2 = u8x64_load_unaligned (s + off + 0x080);
          v3 = u8x64_load_unaligned (s + off + 0x0c0);
          u8x64_store_unaligned (v0, d + off + 0x000);
          u8x64_store_unaligned (v1, d + off + 0x040);
          u8x64_store_unaligned (v2, d + off + 0x080);
          u8x64_store_unaligned (v3, d + off + 0x0c0);
          off += 256;
        }
      if (PREDICT_TRUE (nr & 128))
        {
          v0 = u8x64_load_unaligned (s + off + 0x000);
          v1 = u8x64_load_unaligned (s + off + 0x040);
          u8x64_store_unaligned (v0, d + off + 0x000);
          u8x64_store_unaligned (v1, d + off + 0x040);
          off += 128;
        }
      if (PREDICT_TRUE (nr & 64))
        {
        one:
          u8x64_store_unaligned (u8x64_load_unaligned (s + off), d + off);
        }
    done2:
      u8x64_store_unaligned (u8x64_load_unaligned (s + n - 64), d + n - 64);
    }
  return dst;
#elif defined(CLIB_HAVE_VEC256)
  else
    {
      u8x32 v0, v1, v2, v3;
      u64 final_off, nr, off = 32;

      u8x32_store_unaligned (u8x32_load_unaligned (s), d);

      if (n <= 64)
        goto done2;

      if (n <= 96)
        goto one;

      if (n <= 256 + 32)
        {
          nr = round_pow2 (n - 64, 32);
          goto last;
        }

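      /* Same destination-alignment trick as the 512-bit path, at 32-byte
       * granularity. */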
      off -= ((u64) d) & 0x1f;
      nr = round_pow2 (n - off - 32, 32);
      final_off = (nr & ~(u64) 0xff) + off;

    more:
      v0 = u8x32_load_unaligned (s + off + 0x00);
      v1 = u8x32_load_unaligned (s + off + 0x20);
      v2 = u8x32_load_unaligned (s + off + 0x40);
      v3 = u8x32_load_unaligned (s + off + 0x60);
      u8x32_store_unaligned (v0, d + off + 0x00);
      u8x32_store_unaligned (v1, d + off + 0x20);
      u8x32_store_unaligned (v2, d + off + 0x40);
      u8x32_store_unaligned (v3, d + off + 0x60);
      v0 = u8x32_load_unaligned (s + off + 0x80);
      v1 = u8x32_load_unaligned (s + off + 0xa0);
      v2 = u8x32_load_unaligned (s + off + 0xc0);
      v3 = u8x32_load_unaligned (s + off + 0xe0);
      u8x32_store_unaligned (v0, d + off + 0x80);
      u8x32_store_unaligned (v1, d + off + 0xa0);
      u8x32_store_unaligned (v2, d + off + 0xc0);
      u8x32_store_unaligned (v3, d + off + 0xe0);
      off += 256;
      if (off != final_off)
        goto more;

      if ((nr & 0xff) == 0)
        goto done2;

    last:
      if (PREDICT_TRUE (nr & 128))
        {
          v0 = u8x32_load_unaligned (s + off + 0x00);
          v1 = u8x32_load_unaligned (s + off + 0x20);
          v2 = u8x32_load_unaligned (s + off + 0x40);
          v3 = u8x32_load_unaligned (s + off + 0x60);
          u8x32_store_unaligned (v0, d + off + 0x00);
          u8x32_store_unaligned (v1, d + off + 0x20);
          u8x32_store_unaligned (v2, d + off + 0x40);
          u8x32_store_unaligned (v3, d + off + 0x60);
          off += 128;
        }
      if (PREDICT_TRUE (nr & 64))
        {
          v0 = u8x32_load_unaligned (s + off + 0x00);
          v1 = u8x32_load_unaligned (s + off + 0x20);
          u8x32_store_unaligned (v0, d + off + 0x00);
          u8x32_store_unaligned (v1, d + off + 0x20);
          off += 64;
        }
      if (PREDICT_TRUE (nr & 32))
        {
        one:
          u8x32_store_unaligned (u8x32_load_unaligned (s + off), d + off);
        }
    done2:
      u8x32_store_unaligned (u8x32_load_unaligned (s + n - 32), d + n - 32);
    }
  return dst;
#elif defined(CLIB_HAVE_VEC128)
  else
    {
      u8x16 v0, v1, v2, v3;
      u64 final_off, nr, off = 32;

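      /* Intentionally disabled ('0 &&'): an apparently experimental
       * fallback that handed very large copies to the compiler's
       * builtin memcpy. */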
      if (0 && n > 389)
        {
          __builtin_memcpy (d, s, n);
          return dst;
        }

      u8x16_store_unaligned (u8x16_load_unaligned (s), d);
      u8x16_store_unaligned (u8x16_load_unaligned (s + 16), d + 16);

      if (n <= 48)
        goto done2;

      if (n <= 64)
        goto one;

      if (n <= 256 + 32)
        {
          nr = round_pow2 (n - 48, 16);
          goto last;
        }

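      /* Align destination stores to 16 bytes; the first 32 bytes are
       * already copied. */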
      off -= ((u64) d) & 0x0f;
      nr = round_pow2 (n - off - 16, 16);
      final_off = (nr & ~(u64) 0xff) + off;

    more:
      v0 = u8x16_load_unaligned (s + off + 0x00);
      v1 = u8x16_load_unaligned (s + off + 0x10);
      v2 = u8x16_load_unaligned (s + off + 0x20);
      v3 = u8x16_load_unaligned (s + off + 0x30);
      u8x16_store_unaligned (v0, d + off + 0x00);
      u8x16_store_unaligned (v1, d + off + 0x10);
      u8x16_store_unaligned (v2, d + off + 0x20);
      u8x16_store_unaligned (v3, d + off + 0x30);
      v0 = u8x16_load_unaligned (s + off + 0x40);
      v1 = u8x16_load_unaligned (s + off + 0x50);
      v2 = u8x16_load_unaligned (s + off + 0x60);
      v3 = u8x16_load_unaligned (s + off + 0x70);
      u8x16_store_unaligned (v0, d + off + 0x40);
      u8x16_store_unaligned (v1, d + off + 0x50);
      u8x16_store_unaligned (v2, d + off + 0x60);
      u8x16_store_unaligned (v3, d + off + 0x70);
      v0 = u8x16_load_unaligned (s + off + 0x80);
      v1 = u8x16_load_unaligned (s + off + 0x90);
      v2 = u8x16_load_unaligned (s + off + 0xa0);
      v3 = u8x16_load_unaligned (s + off + 0xb0);
      u8x16_store_unaligned (v0, d + off + 0x80);
      u8x16_store_unaligned (v1, d + off + 0x90);
      u8x16_store_unaligned (v2, d + off + 0xa0);
      u8x16_store_unaligned (v3, d + off + 0xb0);
      v0 = u8x16_load_unaligned (s + off + 0xc0);
      v1 = u8x16_load_unaligned (s + off + 0xd0);
      v2 = u8x16_load_unaligned (s + off + 0xe0);
      v3 = u8x16_load_unaligned (s + off + 0xf0);
      u8x16_store_unaligned (v0, d + off + 0xc0);
      u8x16_store_unaligned (v1, d + off + 0xd0);
      u8x16_store_unaligned (v2, d + off + 0xe0);
      u8x16_store_unaligned (v3, d + off + 0xf0);
      off += 256;
      if (off != final_off)
        goto more;

      if ((nr & 0xff) == 0)
        goto done2;

    last:
      if (PREDICT_TRUE (nr & 128))
        {
          v0 = u8x16_load_unaligned (s + off + 0x00);
          v1 = u8x16_load_unaligned (s + off + 0x10);
          v2 = u8x16_load_unaligned (s + off + 0x20);
          v3 = u8x16_load_unaligned (s + off + 0x30);
          u8x16_store_unaligned (v0, d + off + 0x00);
          u8x16_store_unaligned (v1, d + off + 0x10);
          u8x16_store_unaligned (v2, d + off + 0x20);
          u8x16_store_unaligned (v3, d + off + 0x30);
          v0 = u8x16_load_unaligned (s + off + 0x40);
          v1 = u8x16_load_unaligned (s + off + 0x50);
          v2 = u8x16_load_unaligned (s + off + 0x60);
          v3 = u8x16_load_unaligned (s + off + 0x70);
          u8x16_store_unaligned (v0, d + off + 0x40);
          u8x16_store_unaligned (v1, d + off + 0x50);
          u8x16_store_unaligned (v2, d + off + 0x60);
          u8x16_store_unaligned (v3, d + off + 0x70);
          off += 128;
        }
      if (PREDICT_TRUE (nr & 64))
        {
          v0 = u8x16_load_unaligned (s + off + 0x00);
          v1 = u8x16_load_unaligned (s + off + 0x10);
          v2 = u8x16_load_unaligned (s + off + 0x20);
          v3 = u8x16_load_unaligned (s + off + 0x30);
          u8x16_store_unaligned (v0, d + off + 0x00);
          u8x16_store_unaligned (v1, d + off + 0x10);
          u8x16_store_unaligned (v2, d + off + 0x20);
          u8x16_store_unaligned (v3, d + off + 0x30);
          off += 64;
        }
      if (PREDICT_TRUE (nr & 32))
        {
          v0 = u8x16_load_unaligned (s + off + 0x00);
          v1 = u8x16_load_unaligned (s + off + 0x10);
          u8x16_store_unaligned (v0, d + off + 0x00);
          u8x16_store_unaligned (v1, d + off + 0x10);
          off += 32;
        }
      if (PREDICT_TRUE (nr & 16))
        {
        one:
          u8x16_store_unaligned (u8x16_load_unaligned (s + off), d + off);
        }
    done2:
      u8x16_store_unaligned (u8x16_load_unaligned (s + n - 16), d + n - 16);
    }
  return dst;
#else
  __builtin_memcpy (dst, src, n);
  return dst;
#endif
}
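
/* Usage sketch (hypothetical caller; within VPP this function is normally
 * reached indirectly, e.g. via clib_memcpy_fast()):
 *
 *   u8 hdr[64], scratch[64];
 *   clib_memcpy_x86_64 (scratch, hdr, sizeof (hdr));    // constant n: inlined
 *   clib_memcpy_x86_64 (dst_buf, src_buf, runtime_len); // runtime n
 */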

/* clang-format off */
WARN_ON (stringop-overflow)
/* clang-format on */

#endif /* __x86_64__ */
#endif /* included_clib_memcpy_x86_64_h */