/*
 *------------------------------------------------------------------
 * Copyright (c) 2019 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *------------------------------------------------------------------
 */

/*
 *------------------------------------------------------------------
 * Copyright (c) 2018, Intel Corporation All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the
 *     distribution.
 *   * Neither the name of Intel Corporation nor the names of its
 *     contributors may be used to endorse or promote products derived
 *     from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *------------------------------------------------------------------
 */

/*
 * Based on work by: Shay Gueron, Michael E. Kounavis, Erdinc Ozturk,
 * Vinodh Gopal, James Guilford, Tomasz Kantecki
 *
 * References:
 * [1] Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation on
 *     Intel Architecture Processors. August, 2010
 * [2] Erdinc Ozturk et al. Enabling High-Performance Galois-Counter-Mode on
 *     Intel Architecture Processors. October, 2012.
 * [3] intel-ipsec-mb library, https://github.com/01org/intel-ipsec-mb.git
 *
 * Definitions:
 *  GF     Galois Extension Field GF(2^128) - finite field where elements
 *         are represented as polynomials with coefficients in GF(2) of
 *         degree at most 127. Polynomials are represented as 128-bit binary
 *         numbers where each bit represents one coefficient.
 *         e.g. polynomial x^5 + x^3 + x + 1 is represented in binary as
 *         101011.
 *  H      hash key (128 bit)
 *  POLY   irreducible polynomial x^128 + x^7 + x^2 + x + 1
 *  RPOLY  irreducible polynomial x^128 + x^127 + x^126 + x^121 + 1
 *  +      addition in GF, which is the XOR operation,
 *         e.g. (x^2 + 1) + (x^2 + x) = x + 1 (101 XOR 110 = 011)
 *  *      multiplication in GF
 *
 * GF multiplication consists of 2 steps:
 *  - carry-less multiplication of two 128-bit operands into a 256-bit
 *    result
 *  - reduction of the 256-bit result to 128 bits, modulo POLY
 *
 * GHash is calculated on 128-bit blocks of data according to the following
 * formula:
 *    GH = (GH + data) * hash_key
 *
 * To avoid bit-reflection of data, this code uses GF multiplication
 * with the reversed polynomial:
 *    a * b * x^-127 mod RPOLY
 *
 * To improve computation speed, the table Hi is precomputed with powers of
 * H', where H' is calculated as H<<1 mod RPOLY. This allows reduction to be
 * deferred: over 4 blocks the iterated formula expands to
 *    GH = (GH + b0) * H'^4 + b1 * H'^3 + b2 * H'^2 + b3 * H'
 * so a single reduction at the end suffices. For example, to calculate the
 * ghash of 4 128-bit blocks of data (b0, b1, b2, b3), we can do:
 *
 *   u8x16 Hi[4];
 *   ghash_precompute (H, Hi, 4);
 *
 *   ghash_ctx_t _gd, *gd = &_gd;
 *   ghash_mul_first (gd, GH ^ b0, Hi[0]);
 *   ghash_mul_next (gd, b1, Hi[1]);
 *   ghash_mul_next (gd, b2, Hi[2]);
 *   ghash_mul_next (gd, b3, Hi[3]);
 *   ghash_reduce (gd);
 *   ghash_reduce2 (gd);
 *   GH = ghash_final (gd);
 *
 * (ghash_precompute stores the highest power in Hi[0], so the first block
 * is paired with Hi[0] = H'^4 and the last block with Hi[3] = H'.)
 *
 * The reduction step is split into 3 functions so it can be better
 * interleaved with other code (e.g. with AES computation).
 */

#ifndef __ghash_h__
#define __ghash_h__

static_always_inline u8x16
gmul_lo_lo (u8x16 a, u8x16 b)
{
#if defined (__PCLMUL__)
  return (u8x16) _mm_clmulepi64_si128 ((__m128i) a, (__m128i) b, 0x00);
#elif defined (__ARM_FEATURE_CRYPTO)
  return (u8x16) vmull_p64 ((poly64_t) vget_low_p64 ((poly64x2_t) a),
			    (poly64_t) vget_low_p64 ((poly64x2_t) b));
#endif
}

static_always_inline u8x16
gmul_hi_lo (u8x16 a, u8x16 b)
{
#if defined (__PCLMUL__)
  return (u8x16) _mm_clmulepi64_si128 ((__m128i) a, (__m128i) b, 0x01);
#elif defined (__ARM_FEATURE_CRYPTO)
  return (u8x16) vmull_p64 ((poly64_t) vget_high_p64 ((poly64x2_t) a),
			    (poly64_t) vget_low_p64 ((poly64x2_t) b));
#endif
}

static_always_inline u8x16
gmul_lo_hi (u8x16 a, u8x16 b)
{
#if defined (__PCLMUL__)
  return (u8x16) _mm_clmulepi64_si128 ((__m128i) a, (__m128i) b, 0x10);
#elif defined (__ARM_FEATURE_CRYPTO)
  return (u8x16) vmull_p64 ((poly64_t) vget_low_p64 ((poly64x2_t) a),
			    (poly64_t) vget_high_p64 ((poly64x2_t) b));
#endif
}

static_always_inline u8x16
gmul_hi_hi (u8x16 a, u8x16 b)
{
#if defined (__PCLMUL__)
  return (u8x16) _mm_clmulepi64_si128 ((__m128i) a, (__m128i) b, 0x11);
#elif defined (__ARM_FEATURE_CRYPTO)
  return (u8x16) vmull_high_p64 ((poly64x2_t) a, (poly64x2_t) b);
#endif
}
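
/* Illustrative scalar reference for a single 64x64->128 carry-less
   multiply, i.e. what each gmul_* selector above computes for its chosen
   64-bit halves. This sketch is documentation only - it is not used by the
   code below and the function name is hypothetical. */
static_always_inline void
gmul64_scalar_ref (u64 a, u64 b, u64 * lo, u64 * hi)
{
  u64 l = 0, h = 0;
  for (int i = 0; i < 64; i++)
    if ((b >> i) & 1)
      {
	/* addition in GF(2) is XOR, so partial products are XORed in */
	l ^= a << i;
	h ^= i ? a >> (64 - i) : 0;
      }
  *lo = l;			/* bits 0..63 of the product */
  *hi = h;			/* bits 64..127 of the product */
}

/* A full 128x128->256 carry-less product is assembled from the four
   partial products using the schoolbook decomposition:
     (a1*x^64 + a0) * (b1*x^64 + b0) =
       a1*b1*x^128 + (a1*b0 + a0*b1)*x^64 + a0*b0
   which is exactly the hi/mid/lo split used by ghash_mul_first below. */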

typedef struct
{
  u8x16 mid, hi, lo, tmp_lo, tmp_hi;
  u8x32 hi2, lo2, mid2, tmp_lo2, tmp_hi2;
  u8x64 hi4, lo4, mid4, tmp_lo4, tmp_hi4;
  int pending;
} ghash_ctx_t;

static const u8x16 ghash_poly = {
  0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2
};

static const u8x16 ghash_poly2 = {
  0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2
};
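
/* ghash_poly is the RPOLY-derived constant XORed into H by ghash_precompute
   below when H<<1 carries out of bit 127; ghash_poly2 packs the constants
   consumed by the split reduction steps (ghash_reduce, ghash_reduce2). */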

static_always_inline void
ghash_mul_first (ghash_ctx_t *gd, u8x16 a, u8x16 b)
{
  /* a1 * b1 */
  gd->hi = gmul_hi_hi (a, b);
  /* a0 * b0 */
  gd->lo = gmul_lo_lo (a, b);
  /* a0 * b1 ^ a1 * b0 */
  gd->mid = gmul_hi_lo (a, b) ^ gmul_lo_hi (a, b);

  /* set gd->pending to 0 so the next invocation of ghash_mul_next (...)
     knows that there is no pending data in tmp_lo and tmp_hi */
  gd->pending = 0;
}

static_always_inline void
ghash_mul_next (ghash_ctx_t *gd, u8x16 a, u8x16 b)
{
  /* a1 * b1 */
  u8x16 hi = gmul_hi_hi (a, b);
  /* a0 * b0 */
  u8x16 lo = gmul_lo_lo (a, b);

  /* this branch will be optimized out by the compiler, and it allows us to
     reduce the number of XOR operations by using ternary logic */
  if (gd->pending)
    {
      /* there is pending data from the previous invocation so we can XOR */
      gd->hi = u8x16_xor3 (gd->hi, gd->tmp_hi, hi);
      gd->lo = u8x16_xor3 (gd->lo, gd->tmp_lo, lo);
      gd->pending = 0;
    }
  else
    {
      /* there is no pending data from the previous invocation so we
	 postpone the XOR */
      gd->tmp_hi = hi;
      gd->tmp_lo = lo;
      gd->pending = 1;
    }

  /* gd->mid ^= a0 * b1 ^ a1 * b0 */
  gd->mid = u8x16_xor3 (gd->mid, gmul_hi_lo (a, b), gmul_lo_hi (a, b));
}

static_always_inline void
ghash_reduce (ghash_ctx_t *gd)
{
  u8x16 r;

  /* Final combination:
     gd->lo ^= gd->mid << 64
     gd->hi ^= gd->mid >> 64 */
  u8x16 midl = u8x16_word_shift_left (gd->mid, 8);
  u8x16 midr = u8x16_word_shift_right (gd->mid, 8);

  if (gd->pending)
    {
      gd->lo = u8x16_xor3 (gd->lo, gd->tmp_lo, midl);
      gd->hi = u8x16_xor3 (gd->hi, gd->tmp_hi, midr);
    }
  else
    {
      gd->lo ^= midl;
      gd->hi ^= midr;
    }
  r = gmul_hi_lo (ghash_poly2, gd->lo);
  gd->lo ^= u8x16_word_shift_left (r, 8);
}

static_always_inline void
ghash_reduce2 (ghash_ctx_t *gd)
{
  gd->tmp_lo = gmul_lo_lo (ghash_poly2, gd->lo);
  gd->tmp_hi = gmul_lo_hi (ghash_poly2, gd->lo);
}

static_always_inline u8x16
ghash_final (ghash_ctx_t *gd)
{
  return u8x16_xor3 (gd->hi, u8x16_word_shift_right (gd->tmp_lo, 4),
		     u8x16_word_shift_left (gd->tmp_hi, 4));
}

static_always_inline u8x16
ghash_mul (u8x16 a, u8x16 b)
{
  ghash_ctx_t _gd, *gd = &_gd;
  ghash_mul_first (gd, a, b);
  ghash_reduce (gd);
  ghash_reduce2 (gd);
  return ghash_final (gd);
}
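
/* Single-block GHASH update, GH = (GH + data) * H', realized with
   ghash_mul; a minimal sketch of the formula from the header comment. The
   function name is illustrative and not part of the existing API. */
static_always_inline u8x16
ghash_update_one_block_sketch (u8x16 GH, u8x16 data, u8x16 H_shifted)
{
  /* addition in GF(2^128) is XOR; H_shifted is H<<1 mod RPOLY, as produced
     by ghash_precompute */
  return ghash_mul (GH ^ data, H_shifted);
}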

#if defined(__VPCLMULQDQ__) && defined(__AVX512F__)

/* 4-wide variant: the same hi/mid/lo scheme applied to four 128-bit lanes
   of a u8x64 in parallel; ghash4_poly2 replicates ghash_poly2 in each
   lane. */
static const u8x64 ghash4_poly2 = {
  0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2,
  0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2,
  0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2,
  0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2,
};

static_always_inline u8x64
gmul4_lo_lo (u8x64 a, u8x64 b)
{
  return (u8x64) _mm512_clmulepi64_epi128 ((__m512i) a, (__m512i) b, 0x00);
}

static_always_inline u8x64
gmul4_hi_lo (u8x64 a, u8x64 b)
{
  return (u8x64) _mm512_clmulepi64_epi128 ((__m512i) a, (__m512i) b, 0x01);
}

static_always_inline u8x64
gmul4_lo_hi (u8x64 a, u8x64 b)
{
  return (u8x64) _mm512_clmulepi64_epi128 ((__m512i) a, (__m512i) b, 0x10);
}

static_always_inline u8x64
gmul4_hi_hi (u8x64 a, u8x64 b)
{
  return (u8x64) _mm512_clmulepi64_epi128 ((__m512i) a, (__m512i) b, 0x11);
}

static_always_inline void
ghash4_mul_first (ghash_ctx_t *gd, u8x64 a, u8x64 b)
{
  gd->hi4 = gmul4_hi_hi (a, b);
  gd->lo4 = gmul4_lo_lo (a, b);
  gd->mid4 = gmul4_hi_lo (a, b) ^ gmul4_lo_hi (a, b);
  gd->pending = 0;
}

static_always_inline void
ghash4_mul_next (ghash_ctx_t *gd, u8x64 a, u8x64 b)
{
  u8x64 hi = gmul4_hi_hi (a, b);
  u8x64 lo = gmul4_lo_lo (a, b);

  if (gd->pending)
    {
      /* there is pending data from the previous invocation so we can XOR */
      gd->hi4 = u8x64_xor3 (gd->hi4, gd->tmp_hi4, hi);
      gd->lo4 = u8x64_xor3 (gd->lo4, gd->tmp_lo4, lo);
      gd->pending = 0;
    }
  else
    {
      /* there is no pending data from the previous invocation so we
	 postpone the XOR */
      gd->tmp_hi4 = hi;
      gd->tmp_lo4 = lo;
      gd->pending = 1;
    }
  gd->mid4 = u8x64_xor3 (gd->mid4, gmul4_hi_lo (a, b), gmul4_lo_hi (a, b));
}

static_always_inline void
ghash4_reduce (ghash_ctx_t *gd)
{
  u8x64 r;

  /* Final combination:
     gd->lo4 ^= gd->mid4 << 64
     gd->hi4 ^= gd->mid4 >> 64 */

  u8x64 midl = u8x64_word_shift_left (gd->mid4, 8);
  u8x64 midr = u8x64_word_shift_right (gd->mid4, 8);

  if (gd->pending)
    {
      gd->lo4 = u8x64_xor3 (gd->lo4, gd->tmp_lo4, midl);
      gd->hi4 = u8x64_xor3 (gd->hi4, gd->tmp_hi4, midr);
    }
  else
    {
      gd->lo4 ^= midl;
      gd->hi4 ^= midr;
    }

  r = gmul4_hi_lo (ghash4_poly2, gd->lo4);
  gd->lo4 ^= u8x64_word_shift_left (r, 8);
}

static_always_inline void
ghash4_reduce2 (ghash_ctx_t *gd)
{
  gd->tmp_lo4 = gmul4_lo_lo (ghash4_poly2, gd->lo4);
  gd->tmp_hi4 = gmul4_lo_hi (ghash4_poly2, gd->lo4);
}

static_always_inline u8x16
ghash4_final (ghash_ctx_t *gd)
{
  u8x64 r;
  u8x32 t;

  r = u8x64_xor3 (gd->hi4, u8x64_word_shift_right (gd->tmp_lo4, 4),
		  u8x64_word_shift_left (gd->tmp_hi4, 4));

  /* horizontal XOR of 4 128-bit lanes */
  t = u8x64_extract_lo (r) ^ u8x64_extract_hi (r);
  return u8x32_extract_hi (t) ^ u8x32_extract_lo (t);
}
#endif

#if defined(__VPCLMULQDQ__)

/* 2-wide variant: the same scheme applied to two 128-bit lanes of a u8x32;
   ghash2_poly2 replicates ghash_poly2 in each lane. */
static const u8x32 ghash2_poly2 = {
  0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2,
  0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2,
};

static_always_inline u8x32
gmul2_lo_lo (u8x32 a, u8x32 b)
{
  return (u8x32) _mm256_clmulepi64_epi128 ((__m256i) a, (__m256i) b, 0x00);
}

static_always_inline u8x32
gmul2_hi_lo (u8x32 a, u8x32 b)
{
  return (u8x32) _mm256_clmulepi64_epi128 ((__m256i) a, (__m256i) b, 0x01);
}

static_always_inline u8x32
gmul2_lo_hi (u8x32 a, u8x32 b)
{
  return (u8x32) _mm256_clmulepi64_epi128 ((__m256i) a, (__m256i) b, 0x10);
}

static_always_inline u8x32
gmul2_hi_hi (u8x32 a, u8x32 b)
{
  return (u8x32) _mm256_clmulepi64_epi128 ((__m256i) a, (__m256i) b, 0x11);
}

static_always_inline void
ghash2_mul_first (ghash_ctx_t *gd, u8x32 a, u8x32 b)
{
  gd->hi2 = gmul2_hi_hi (a, b);
  gd->lo2 = gmul2_lo_lo (a, b);
  gd->mid2 = gmul2_hi_lo (a, b) ^ gmul2_lo_hi (a, b);
  gd->pending = 0;
}

static_always_inline void
ghash2_mul_next (ghash_ctx_t *gd, u8x32 a, u8x32 b)
{
  u8x32 hi = gmul2_hi_hi (a, b);
  u8x32 lo = gmul2_lo_lo (a, b);

  if (gd->pending)
    {
      /* there is pending data from the previous invocation so we can XOR */
      gd->hi2 = u8x32_xor3 (gd->hi2, gd->tmp_hi2, hi);
      gd->lo2 = u8x32_xor3 (gd->lo2, gd->tmp_lo2, lo);
      gd->pending = 0;
    }
  else
    {
      /* there is no pending data from the previous invocation so we
	 postpone the XOR */
      gd->tmp_hi2 = hi;
      gd->tmp_lo2 = lo;
      gd->pending = 1;
    }
  gd->mid2 = u8x32_xor3 (gd->mid2, gmul2_hi_lo (a, b), gmul2_lo_hi (a, b));
}

static_always_inline void
ghash2_reduce (ghash_ctx_t *gd)
{
  u8x32 r;

  /* Final combination:
     gd->lo2 ^= gd->mid2 << 64
     gd->hi2 ^= gd->mid2 >> 64 */

  u8x32 midl = u8x32_word_shift_left (gd->mid2, 8);
  u8x32 midr = u8x32_word_shift_right (gd->mid2, 8);

  if (gd->pending)
    {
      gd->lo2 = u8x32_xor3 (gd->lo2, gd->tmp_lo2, midl);
      gd->hi2 = u8x32_xor3 (gd->hi2, gd->tmp_hi2, midr);
    }
  else
    {
      gd->lo2 ^= midl;
      gd->hi2 ^= midr;
    }

  r = gmul2_hi_lo (ghash2_poly2, gd->lo2);
  gd->lo2 ^= u8x32_word_shift_left (r, 8);
}

static_always_inline void
ghash2_reduce2 (ghash_ctx_t *gd)
{
  gd->tmp_lo2 = gmul2_lo_lo (ghash2_poly2, gd->lo2);
  gd->tmp_hi2 = gmul2_lo_hi (ghash2_poly2, gd->lo2);
}

static_always_inline u8x16
ghash2_final (ghash_ctx_t *gd)
{
  u8x32 r;

  r = u8x32_xor3 (gd->hi2, u8x32_word_shift_right (gd->tmp_lo2, 4),
		  u8x32_word_shift_left (gd->tmp_hi2, 4));

  /* horizontal XOR of 2 128-bit lanes */
  return u8x32_extract_hi (r) ^ u8x32_extract_lo (r);
}
#endif

static_always_inline void
ghash_precompute (u8x16 H, u8x16 * Hi, int n)
{
  u8x16 r8;
  u32x4 r32;
  /* calculate H' = H<<1 mod RPOLY from the hash key */
  r8 = (u8x16) ((u64x2) H >> 63);	/* MSB of each 64-bit half */
  H = (u8x16) ((u64x2) H << 1);
  H |= u8x16_word_shift_left (r8, 8);	/* carry low half -> high half */
  r32 = (u32x4) u8x16_word_shift_right (r8, 8);	/* carry out of bit 127 */
#ifdef __SSE2__
  r32 = u32x4_shuffle (r32, 0, 1, 2, 0);
#else
  r32[3] = r32[0];
#endif
  /* if the bit shifted out of bit 127 was set, r32 is {1, 0, 0, 1} and the
     compare yields an all-ones mask; otherwise the only matching lanes are
     those where ghash_poly is zero, so the AND below yields zero */
  r32 = r32 == (u32x4) {1, 0, 0, 1};
  Hi[n - 1] = H = H ^ ((u8x16) r32 & ghash_poly);

  /* calculate the remaining powers, Hi[i] = H'^(n - i) */
  for (int i = n - 2; i >= 0; i--)
    Hi[i] = ghash_mul (H, Hi[i + 1]);
}
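
/* Usage sketch, mirroring the example in the file header comment: fold four
   128-bit blocks b[0..3] into the running hash GH under hash key H. Hi[0]
   holds the highest power H'^4 and is paired with the first block. The
   function name is illustrative only - real callers (e.g. AES-GCM code)
   drive these primitives directly so the reduction can be interleaved with
   AES rounds. */
static_always_inline u8x16
ghash_4_blocks_sketch (u8x16 GH, u8x16 H, u8x16 * b)
{
  u8x16 Hi[4];
  ghash_ctx_t _gd, *gd = &_gd;

  ghash_precompute (H, Hi, 4);	/* Hi[i] = H'^(4 - i) */
  ghash_mul_first (gd, GH ^ b[0], Hi[0]);
  ghash_mul_next (gd, b[1], Hi[1]);
  ghash_mul_next (gd, b[2], Hi[2]);
  ghash_mul_next (gd, b[3], Hi[3]);
  ghash_reduce (gd);
  ghash_reduce2 (gd);
  return ghash_final (gd);
}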

#endif /* __ghash_h__ */