/* SPDX-License-Identifier: Apache-2.0
 * Copyright(c) 2021 Damjan Marion
 */

#ifndef included_clib_memcpy_x86_64_h
#define included_clib_memcpy_x86_64_h
#ifdef __x86_64__

#include <vppinfra/clib.h>
#include <vppinfra/warnings.h>
#include <stdio.h>

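/* Several of the copy sequences below intentionally overlap their stores,
 * which GCC's -Wstringop-overflow analysis may flag; suppress the warning
 * for the rest of this header (it is re-enabled at the bottom). */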
/* clang-format off */
WARN_OFF (stringop-overflow)
/* clang-format on */

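/* Fixed-size copy helpers: each copies exactly the number of bytes in its
 * name with a single load/store pair, using unaligned scalar (u16u, u32u,
 * u64u) or vector (u8x16u, u8x32u, u8x64u) types. */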
static_always_inline void
clib_memcpy1 (void *d, void *s)
{
  *(u8 *) d = *(u8 *) s;
}

static_always_inline void
clib_memcpy2 (void *d, void *s)
{
  *(u16u *) d = *(u16u *) s;
}

static_always_inline void
clib_memcpy4 (void *d, void *s)
{
  *(u32u *) d = *(u32u *) s;
}

static_always_inline void
clib_memcpy8 (void *d, void *s)
{
  *(u64u *) d = *(u64u *) s;
}

#ifdef CLIB_HAVE_VEC128
static_always_inline void
clib_memcpy16 (void *d, void *s)
{
  *(u8x16u *) d = *(u8x16u *) s;
}
#endif

#ifdef CLIB_HAVE_VEC256
static_always_inline void
clib_memcpy32 (void *d, void *s)
{
  *(u8x32u *) d = *(u8x32u *) s;
}
#endif

#ifdef CLIB_HAVE_VEC512
static_always_inline void
clib_memcpy64 (void *d, void *s)
{
  *(u8x64u *) d = *(u8x64u *) s;
}
#endif

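/* Copy a compile-time-constant size of up to 32 bytes. Sizes without a
 * dedicated case are handled with two possibly overlapping copies, e.g.
 * n == 7 as 4-byte copies of bytes 0-3 and 3-6, and the default case as
 * 16-byte copies of the head and the tail. */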
static_always_inline void
clib_memcpy_const_le32 (u8 *dst, u8 *src, size_t n)
{
  switch (n)
    {
    case 1:
      clib_memcpy1 (dst, src);
      break;
    case 2:
      clib_memcpy2 (dst, src);
      break;
    case 3:
      clib_memcpy2 (dst, src);
      clib_memcpy1 (dst + 2, src + 2);
      break;
    case 4:
      clib_memcpy4 (dst, src);
      break;
    case 5:
      clib_memcpy4 (dst, src);
      clib_memcpy1 (dst + 4, src + 4);
      break;
    case 6:
      clib_memcpy4 (dst, src);
      clib_memcpy2 (dst + 4, src + 4);
      break;
    case 7:
      clib_memcpy4 (dst, src);
      clib_memcpy4 (dst + 3, src + 3);
      break;
    case 8:
      clib_memcpy8 (dst, src);
      break;
    case 9:
      clib_memcpy8 (dst, src);
      clib_memcpy1 (dst + 8, src + 8);
      break;
    case 10:
      clib_memcpy8 (dst, src);
      clib_memcpy2 (dst + 8, src + 8);
      break;
    case 11:
    case 12:
      clib_memcpy8 (dst, src);
      clib_memcpy4 (dst + n - 4, src + n - 4);
      break;
    case 13:
    case 14:
    case 15:
      clib_memcpy8 (dst, src);
      clib_memcpy8 (dst + n - 8, src + n - 8);
      break;
    case 16:
      clib_memcpy16 (dst, src);
      break;
    case 17:
      clib_memcpy16 (dst, src);
      clib_memcpy1 (dst + 16, src + 16);
      break;
    case 18:
      clib_memcpy16 (dst, src);
      clib_memcpy2 (dst + 16, src + 16);
      break;
    case 20:
      clib_memcpy16 (dst, src);
      clib_memcpy4 (dst + 16, src + 16);
      break;
    case 24:
      clib_memcpy16 (dst, src);
      clib_memcpy8 (dst + 16, src + 16);
      break;
    default:
      clib_memcpy16 (dst, src);
      clib_memcpy16 (dst + n - 16, src + n - 16);
      break;
    }
}

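/* Copy a compile-time-constant size of up to 64 bytes. With 256-bit vectors
 * the bytes past the first 32 are handled like in the helper above; without
 * them the copy is done in 16-byte chunks plus a <= 32-byte remainder. */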
static_always_inline void
clib_memcpy_const_le64 (u8 *dst, u8 *src, size_t n)
{
  if (n < 32)
    {
      clib_memcpy_const_le32 (dst, src, n);
      return;
    }

#if defined(CLIB_HAVE_VEC256)
  switch (n)
    {
    case 32:
      clib_memcpy32 (dst, src);
      break;
    case 33:
      clib_memcpy32 (dst, src);
      clib_memcpy1 (dst + 32, src + 32);
      break;
    case 34:
      clib_memcpy32 (dst, src);
      clib_memcpy2 (dst + 32, src + 32);
      break;
    case 36:
      clib_memcpy32 (dst, src);
      clib_memcpy4 (dst + 32, src + 32);
      break;
    case 40:
      clib_memcpy32 (dst, src);
      clib_memcpy8 (dst + 32, src + 32);
      break;
    case 48:
      clib_memcpy32 (dst, src);
      clib_memcpy16 (dst + 32, src + 32);
      break;
    default:
      clib_memcpy32 (dst, src);
      clib_memcpy32 (dst + n - 32, src + n - 32);
      break;
    }
#else
  while (n > 31)
    {
      clib_memcpy16 (dst, src);
      clib_memcpy16 (dst + 16, src + 16);
      dst += 32;
      src += 32;
      n -= 32;
    }
  clib_memcpy_const_le32 (dst, src, n);
#endif
}

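/* Copy an arbitrary compile-time-constant size: loop over full vector-width
 * blocks, then dispatch the remainder to the helpers above (with AVX-512,
 * common tail sizes up to 128 bytes get dedicated cases). */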
static_always_inline void
clib_memcpy_x86_64_const (u8 *dst, u8 *src, size_t n)
{
#if defined(CLIB_HAVE_VEC512)
  while (n > 128)
    {
      clib_memcpy64 (dst, src);
      dst += 64;
      src += 64;
      n -= 64;
    }

  if (n < 64)
    {
      clib_memcpy_const_le64 (dst, src, n);
      return;
    }

  switch (n)
    {
    case 64:
      clib_memcpy64 (dst, src);
      break;
    case 65:
      clib_memcpy64 (dst, src);
      clib_memcpy1 (dst + 64, src + 64);
      break;
    case 66:
      clib_memcpy64 (dst, src);
      clib_memcpy2 (dst + 64, src + 64);
      break;
    case 68:
      clib_memcpy64 (dst, src);
      clib_memcpy4 (dst + 64, src + 64);
      break;
    case 72:
      clib_memcpy64 (dst, src);
      clib_memcpy8 (dst + 64, src + 64);
      break;
    case 80:
      clib_memcpy64 (dst, src);
      clib_memcpy16 (dst + 64, src + 64);
      break;
    case 96:
      clib_memcpy64 (dst, src);
      clib_memcpy32 (dst + 64, src + 64);
      break;
    default:
      clib_memcpy64 (dst, src);
      clib_memcpy64 (dst + n - 64, src + n - 64);
      break;
    }
#elif defined(CLIB_HAVE_VEC256)
  while (n > 64)
    {
      clib_memcpy32 (dst, src);
      dst += 32;
      src += 32;
      n -= 32;
    }
  clib_memcpy_const_le64 (dst, src, n);
#else
  while (n > 32)
    {
      clib_memcpy16 (dst, src);
      dst += 16;
      src += 16;
      n -= 16;
    }
  clib_memcpy_const_le32 (dst, src, n);
#endif
}

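/* Drop-in memcpy replacement for x86_64. Constant sizes expand via the
 * helpers above into straight-line loads and stores; runtime sizes use a
 * small-copy path for n <= 32 and unrolled vector loops beyond that. As
 * with memcpy, source and destination must not overlap. */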
static_always_inline void *
clib_memcpy_x86_64 (void *restrict dst, const void *restrict src, size_t n)
{
  u8 *d = (u8 *) dst, *s = (u8 *) src;

  if (n == 0)
    return dst;

  if (COMPILE_TIME_CONST (n))
    {
      if (n)
        clib_memcpy_x86_64_const (d, s, n);
      return dst;
    }

  if (n <= 32)
    {
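      /* 1 <= n <= 32: with 256-bit masked load/store support a single
       * masked transfer covers any size; otherwise copy the head and the
       * (possibly overlapping) tail with the largest fitting scalar or
       * vector width. */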
#if defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE)
      u32 mask = pow2_mask (n);
      u8x32_mask_store (u8x32_mask_load_zero (s, mask), d, mask);
#else
      if (PREDICT_TRUE (n >= 16))
        {
          clib_memcpy16 (d, s);
          clib_memcpy16 (d + n - 16, s + n - 16);
        }
      else if (PREDICT_TRUE (n >= 8))
        {
          clib_memcpy8 (d, s);
          clib_memcpy8 (d + n - 8, s + n - 8);
        }
      else if (PREDICT_TRUE (n >= 4))
        {
          clib_memcpy4 (d, s);
          clib_memcpy4 (d + n - 4, s + n - 4);
        }
      else if (PREDICT_TRUE (n > 1))
        {
          clib_memcpy2 (d, s);
          clib_memcpy2 (d + n - 2, s + n - 2);
        }
      else
        clib_memcpy1 (d, s);
#endif
    }
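  /* AVX-512 path for n > 32: up to 64 bytes is two overlapping 32-byte
   * copies; larger sizes store a 64-byte head, optionally align the copy
   * offset to the destination's 64-byte boundary, stream 512-byte unrolled
   * chunks, handle 256/128/64-byte remainders, and finish with an
   * overlapping store of the last 64 bytes. */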
#ifdef CLIB_HAVE_VEC512
  else
    {
      u8x64 v0, v1, v2, v3;
      u64 final_off, nr, off = 64;

      if (n <= 64)
        {
          n -= 32;
          u8x32_store_unaligned (u8x32_load_unaligned (s), d);
          u8x32_store_unaligned (u8x32_load_unaligned (s + n), d + n);
          return dst;
        }

      u8x64_store_unaligned (u8x64_load_unaligned (s), d);

      if (n <= 128)
        goto done2;

      if (n <= 192)
        goto one;

      if (n <= 512 + 64)
        {
          nr = round_pow2 (n - 128, 64);
          goto last;
        }

      off -= ((u64) d) & 0x3f;
      nr = round_pow2 (n - off - 64, 64);
      final_off = (nr & ~(u64) 0x1ff) + off;

    more:
      v0 = u8x64_load_unaligned (s + off + 0x000);
      v1 = u8x64_load_unaligned (s + off + 0x040);
      v2 = u8x64_load_unaligned (s + off + 0x080);
      v3 = u8x64_load_unaligned (s + off + 0x0c0);
      u8x64_store_unaligned (v0, d + off + 0x000);
      u8x64_store_unaligned (v1, d + off + 0x040);
      u8x64_store_unaligned (v2, d + off + 0x080);
      u8x64_store_unaligned (v3, d + off + 0x0c0);
      v0 = u8x64_load_unaligned (s + off + 0x100);
      v1 = u8x64_load_unaligned (s + off + 0x140);
      v2 = u8x64_load_unaligned (s + off + 0x180);
      v3 = u8x64_load_unaligned (s + off + 0x1c0);
      u8x64_store_unaligned (v0, d + off + 0x100);
      u8x64_store_unaligned (v1, d + off + 0x140);
      u8x64_store_unaligned (v2, d + off + 0x180);
      u8x64_store_unaligned (v3, d + off + 0x1c0);
      off += 512;
      if (off != final_off)
        goto more;

      if ((nr & 0x1ff) == 0)
        goto done2;

    last:
      if (PREDICT_TRUE (nr & 256))
        {
          v0 = u8x64_load_unaligned (s + off + 0x000);
          v1 = u8x64_load_unaligned (s + off + 0x040);
          v2 = u8x64_load_unaligned (s + off + 0x080);
          v3 = u8x64_load_unaligned (s + off + 0x0c0);
          u8x64_store_unaligned (v0, d + off + 0x000);
          u8x64_store_unaligned (v1, d + off + 0x040);
          u8x64_store_unaligned (v2, d + off + 0x080);
          u8x64_store_unaligned (v3, d + off + 0x0c0);
          off += 256;
        }
      if (PREDICT_TRUE (nr & 128))
        {
          v0 = u8x64_load_unaligned (s + off + 0x000);
          v1 = u8x64_load_unaligned (s + off + 0x040);
          u8x64_store_unaligned (v0, d + off + 0x000);
          u8x64_store_unaligned (v1, d + off + 0x040);
          off += 128;
        }
      if (PREDICT_TRUE (nr & 64))
        {
        one:
          u8x64_store_unaligned (u8x64_load_unaligned (s + off), d + off);
        }
    done2:
      u8x64_store_unaligned (u8x64_load_unaligned (s + n - 64), d + n - 64);
    }
  return dst;
#elif defined(CLIB_HAVE_VEC256)
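  /* AVX2 path for n > 32: same structure as the AVX-512 path above, with
   * 32-byte vectors and 256-byte unrolled chunks. */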
  else
    {
      u8x32 v0, v1, v2, v3;
      u64 final_off, nr, off = 32;

      u8x32_store_unaligned (u8x32_load_unaligned (s), d);

      if (n <= 64)
        goto done2;

      if (n <= 96)
        goto one;

      if (n <= 256 + 32)
        {
          nr = round_pow2 (n - 64, 32);
          goto last;
        }

      off -= ((u64) d) & 0x1f;
      nr = round_pow2 (n - off - 32, 32);
      final_off = (nr & ~(u64) 0xff) + off;

    more:
      v0 = u8x32_load_unaligned (s + off + 0x00);
      v1 = u8x32_load_unaligned (s + off + 0x20);
      v2 = u8x32_load_unaligned (s + off + 0x40);
      v3 = u8x32_load_unaligned (s + off + 0x60);
      u8x32_store_unaligned (v0, d + off + 0x00);
      u8x32_store_unaligned (v1, d + off + 0x20);
      u8x32_store_unaligned (v2, d + off + 0x40);
      u8x32_store_unaligned (v3, d + off + 0x60);
      v0 = u8x32_load_unaligned (s + off + 0x80);
      v1 = u8x32_load_unaligned (s + off + 0xa0);
      v2 = u8x32_load_unaligned (s + off + 0xc0);
      v3 = u8x32_load_unaligned (s + off + 0xe0);
      u8x32_store_unaligned (v0, d + off + 0x80);
      u8x32_store_unaligned (v1, d + off + 0xa0);
      u8x32_store_unaligned (v2, d + off + 0xc0);
      u8x32_store_unaligned (v3, d + off + 0xe0);
      off += 256;
      if (off != final_off)
        goto more;

      if ((nr & 0xff) == 0)
        goto done2;

    last:
      if (PREDICT_TRUE (nr & 128))
        {
          v0 = u8x32_load_unaligned (s + off + 0x00);
          v1 = u8x32_load_unaligned (s + off + 0x20);
          v2 = u8x32_load_unaligned (s + off + 0x40);
          v3 = u8x32_load_unaligned (s + off + 0x60);
          u8x32_store_unaligned (v0, d + off + 0x00);
          u8x32_store_unaligned (v1, d + off + 0x20);
          u8x32_store_unaligned (v2, d + off + 0x40);
          u8x32_store_unaligned (v3, d + off + 0x60);
          off += 128;
        }
      if (PREDICT_TRUE (nr & 64))
        {
          v0 = u8x32_load_unaligned (s + off + 0x00);
          v1 = u8x32_load_unaligned (s + off + 0x20);
          u8x32_store_unaligned (v0, d + off + 0x00);
          u8x32_store_unaligned (v1, d + off + 0x20);
          off += 64;
        }
      if (PREDICT_TRUE (nr & 32))
        {
        one:
          u8x32_store_unaligned (u8x32_load_unaligned (s + off), d + off);
        }
    done2:
      u8x32_store_unaligned (u8x32_load_unaligned (s + n - 32), d + n - 32);
    }
  return dst;
#elif defined(CLIB_HAVE_VEC128)
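  /* SSE path for n > 32: same structure again with 16-byte vectors, a
   * 32-byte head store and 256-byte unrolled chunks. */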
  else
    {
      u8x16 v0, v1, v2, v3;
      u64 final_off, nr, off = 32;

      if (0 && n > 389)
        {
          __builtin_memcpy (d, s, n);
          return dst;
        }

      u8x16_store_unaligned (u8x16_load_unaligned (s), d);
      u8x16_store_unaligned (u8x16_load_unaligned (s + 16), d + 16);

      if (n <= 48)
        goto done2;

      if (n <= 64)
        goto one;

      if (n <= 256 + 32)
        {
          nr = round_pow2 (n - 48, 16);
          goto last;
        }

      off -= ((u64) d) & 0x0f;
      nr = round_pow2 (n - off - 16, 16);
      final_off = (nr & ~(u64) 0xff) + off;

    more:
      v0 = u8x16_load_unaligned (s + off + 0x00);
      v1 = u8x16_load_unaligned (s + off + 0x10);
      v2 = u8x16_load_unaligned (s + off + 0x20);
      v3 = u8x16_load_unaligned (s + off + 0x30);
      u8x16_store_unaligned (v0, d + off + 0x00);
      u8x16_store_unaligned (v1, d + off + 0x10);
      u8x16_store_unaligned (v2, d + off + 0x20);
      u8x16_store_unaligned (v3, d + off + 0x30);
      v0 = u8x16_load_unaligned (s + off + 0x40);
      v1 = u8x16_load_unaligned (s + off + 0x50);
      v2 = u8x16_load_unaligned (s + off + 0x60);
      v3 = u8x16_load_unaligned (s + off + 0x70);
      u8x16_store_unaligned (v0, d + off + 0x40);
      u8x16_store_unaligned (v1, d + off + 0x50);
      u8x16_store_unaligned (v2, d + off + 0x60);
      u8x16_store_unaligned (v3, d + off + 0x70);
      v0 = u8x16_load_unaligned (s + off + 0x80);
      v1 = u8x16_load_unaligned (s + off + 0x90);
      v2 = u8x16_load_unaligned (s + off + 0xa0);
      v3 = u8x16_load_unaligned (s + off + 0xb0);
      u8x16_store_unaligned (v0, d + off + 0x80);
      u8x16_store_unaligned (v1, d + off + 0x90);
      u8x16_store_unaligned (v2, d + off + 0xa0);
      u8x16_store_unaligned (v3, d + off + 0xb0);
      v0 = u8x16_load_unaligned (s + off + 0xc0);
      v1 = u8x16_load_unaligned (s + off + 0xd0);
      v2 = u8x16_load_unaligned (s + off + 0xe0);
      v3 = u8x16_load_unaligned (s + off + 0xf0);
      u8x16_store_unaligned (v0, d + off + 0xc0);
      u8x16_store_unaligned (v1, d + off + 0xd0);
      u8x16_store_unaligned (v2, d + off + 0xe0);
      u8x16_store_unaligned (v3, d + off + 0xf0);
      off += 256;
      if (off != final_off)
        goto more;

      if ((nr & 0xff) == 0)
        goto done2;

    last:
      if (PREDICT_TRUE (nr & 128))
        {
          v0 = u8x16_load_unaligned (s + off + 0x00);
          v1 = u8x16_load_unaligned (s + off + 0x10);
          v2 = u8x16_load_unaligned (s + off + 0x20);
          v3 = u8x16_load_unaligned (s + off + 0x30);
          u8x16_store_unaligned (v0, d + off + 0x00);
          u8x16_store_unaligned (v1, d + off + 0x10);
          u8x16_store_unaligned (v2, d + off + 0x20);
          u8x16_store_unaligned (v3, d + off + 0x30);
          v0 = u8x16_load_unaligned (s + off + 0x40);
          v1 = u8x16_load_unaligned (s + off + 0x50);
          v2 = u8x16_load_unaligned (s + off + 0x60);
          v3 = u8x16_load_unaligned (s + off + 0x70);
          u8x16_store_unaligned (v0, d + off + 0x40);
          u8x16_store_unaligned (v1, d + off + 0x50);
          u8x16_store_unaligned (v2, d + off + 0x60);
          u8x16_store_unaligned (v3, d + off + 0x70);
          off += 128;
        }
      if (PREDICT_TRUE (nr & 64))
        {
          v0 = u8x16_load_unaligned (s + off + 0x00);
          v1 = u8x16_load_unaligned (s + off + 0x10);
          v2 = u8x16_load_unaligned (s + off + 0x20);
          v3 = u8x16_load_unaligned (s + off + 0x30);
          u8x16_store_unaligned (v0, d + off + 0x00);
          u8x16_store_unaligned (v1, d + off + 0x10);
          u8x16_store_unaligned (v2, d + off + 0x20);
          u8x16_store_unaligned (v3, d + off + 0x30);
          off += 64;
        }
      if (PREDICT_TRUE (nr & 32))
        {
          v0 = u8x16_load_unaligned (s + off + 0x00);
          v1 = u8x16_load_unaligned (s + off + 0x10);
          u8x16_store_unaligned (v0, d + off + 0x00);
          u8x16_store_unaligned (v1, d + off + 0x10);
          off += 32;
        }
      if (PREDICT_TRUE (nr & 16))
        {
        one:
          u8x16_store_unaligned (u8x16_load_unaligned (s + off), d + off);
        }
    done2:
      u8x16_store_unaligned (u8x16_load_unaligned (s + n - 16), d + n - 16);
    }
  return dst;
#else
  __builtin_memcpy (dst, src, n);
  return dst;
#endif
}

/* clang-format off */
WARN_ON (stringop-overflow)
/* clang-format on */

#endif
#endif