1 /*
2 * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
3 * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
4 * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
5 *
6 * gf_w4.c
7 *
8 * Routines for 4-bit Galois fields
9 */
10
11 #include "gf_int.h"
12 #include <stdio.h>
13 #include <stdlib.h>
14 #include "gf_w4.h"
15 #include "gf_cpu.h"
16
17 #define AB2(ip, am1 ,am2, b, t1, t2) {\
18 t1 = (b << 1) & am1;\
19 t2 = b & am2; \
20 t2 = ((t2 << 1) - (t2 >> (GF_FIELD_WIDTH-1))); \
21 b = (t1 ^ (t2 & ip));}
22
23 // ToDo(KMG/JSP): Why is 0x88 hard-coded?
24 #define SSE_AB2(pp, m1, va, t1, t2) {\
25 t1 = _mm_and_si128(_mm_slli_epi64(va, 1), m1); \
26 t2 = _mm_and_si128(va, _mm_set1_epi8(0x88)); \
27 t2 = _mm_sub_epi64 (_mm_slli_epi64(t2, 1), _mm_srli_epi64(t2, (GF_FIELD_WIDTH-1))); \
28 va = _mm_xor_si128(t1, _mm_and_si128(t2, pp)); }
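/* A worked single-nibble example of the AB2 multiply-by-x step (using the
   per-nibble constants ip = 0x3, am1 = 0xe, am2 = 0x8 that gf_w4_bytwo_init
   replicates across the word):

     b = 0x9:  t1 = (0x9 << 1) & 0xe = 0x2
               t2 = 0x9 & 0x8 = 0x8;  (0x8 << 1) - (0x8 >> 3) = 0xf
               b  = 0x2 ^ (0xf & 0x3) = 0x1      (x times (x^3 + 1) = 1 mod x^4+x+1)

   The shift/subtract turns each set nibble-high bit (0x8) into an all-ones
   nibble (0xf), i.e. a mask selecting the nibbles that overflowed and need
   the polynomial xored in.  The hard-coded 0x88 in SSE_AB2 appears to be the
   byte-wide replication of that same high-bit mask (am2, btd->mask2). */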
29
30 /* ------------------------------------------------------------
   JSP: These are basic routines that are shared by multiple implementations.
32 */
33
34 static
35 inline
36 gf_val_32_t gf_w4_inverse_from_divide (gf_t *gf, gf_val_32_t a)
37 {
38 return gf->divide.w32(gf, 1, a);
39 }
40
41 static
42 inline
43 gf_val_32_t gf_w4_divide_from_inverse (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
44 {
45 b = gf->inverse.w32(gf, b);
46 return gf->multiply.w32(gf, a, b);
47 }
48
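/* gf_w4_euclid computes the multiplicative inverse of b with the extended
   Euclidean algorithm over GF(2)[x]: e_* hold the remainders, d_* their
   degrees, and y_* the accumulated coefficients with y_i * b == e_i
   (mod prim_poly), so when e_i reaches 1, y_i is the inverse.  As a quick
   check with the default polynomial 0x13: the inverse of 2 is 9, since
   2 * 9 = x^4 + x == 1 (mod x^4 + x + 1). */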
49 static
50 inline
51 gf_val_32_t gf_w4_euclid (gf_t *gf, gf_val_32_t b)
52 {
53 gf_val_32_t e_i, e_im1, e_ip1;
54 gf_val_32_t d_i, d_im1, d_ip1;
55 gf_val_32_t y_i, y_im1, y_ip1;
56 gf_val_32_t c_i;
57
58 if (b == 0) return -1;
59 e_im1 = ((gf_internal_t *) (gf->scratch))->prim_poly;
60 e_i = b;
61 d_im1 = 4;
62 for (d_i = d_im1; ((1 << d_i) & e_i) == 0; d_i--) ;
63 y_i = 1;
64 y_im1 = 0;
65
66 while (e_i != 1) {
67 e_ip1 = e_im1;
68 d_ip1 = d_im1;
69 c_i = 0;
70
71 while (d_ip1 >= d_i) {
72 c_i ^= (1 << (d_ip1 - d_i));
73 e_ip1 ^= (e_i << (d_ip1 - d_i));
74 if (e_ip1 == 0) return 0;
75 while ((e_ip1 & (1 << d_ip1)) == 0) d_ip1--;
76 }
77
78 y_ip1 = y_im1 ^ gf->multiply.w32(gf, c_i, y_i);
79 y_im1 = y_i;
80 y_i = y_ip1;
81
82 e_im1 = e_i;
83 d_im1 = d_i;
84 e_i = e_ip1;
85 d_i = d_ip1;
86 }
87
88 return y_i;
89 }
90
91 static
92 gf_val_32_t gf_w4_extract_word(gf_t *gf, void *start, int bytes, int index)
93 {
94 uint8_t *r8, v;
95
96 r8 = (uint8_t *) start;
97 v = r8[index/2];
98 if (index%2) {
99 return v >> 4;
100 } else {
101 return v&0xf;
102 }
103 }
104
105
106 static
107 inline
108 gf_val_32_t gf_w4_matrix (gf_t *gf, gf_val_32_t b)
109 {
110 return gf_bitmatrix_inverse(b, 4, ((gf_internal_t *) (gf->scratch))->prim_poly);
111 }
112
113
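/* Worked example of the schoolbook multiply below, with prim_poly 0x13:
   7 * 9: the first loop forms the carryless product 0x09 ^ 0x12 ^ 0x24 = 0x3f;
   the second loop reduces it, xoring in (0x13 << 1) for bit 5 and 0x13 for
   bit 4, leaving 0x0a. */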
114 static
115 inline
116 gf_val_32_t
117 gf_w4_shift_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
118 {
119 uint8_t product, i, pp;
120 gf_internal_t *h;
121
122 h = (gf_internal_t *) gf->scratch;
123 pp = h->prim_poly;
124
125 product = 0;
126
127 for (i = 0; i < GF_FIELD_WIDTH; i++) {
128 if (a & (1 << i)) product ^= (b << i);
129 }
130 for (i = (GF_FIELD_WIDTH*2-2); i >= GF_FIELD_WIDTH; i--) {
131 if (product & (1 << i)) product ^= (pp << (i-GF_FIELD_WIDTH));
132 }
133 return product;
134 }
135
136 /* Ben: This function works, but it is 33% slower than the normal shift mult */
137
138 #if defined(INTEL_SSE4_PCLMUL)
139 static
140 inline
141 gf_val_32_t
142 gf_w4_clm_multiply (gf_t *gf, gf_val_32_t a4, gf_val_32_t b4)
143 {
144 gf_val_32_t rv = 0;
145
146 __m128i a, b;
147 __m128i result;
148 __m128i prim_poly;
149 __m128i w;
150 gf_internal_t * h = gf->scratch;
151
152 a = _mm_insert_epi32 (_mm_setzero_si128(), a4, 0);
153 b = _mm_insert_epi32 (a, b4, 0);
154
155 prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1fULL));
156
157 /* Do the initial multiply */
158
159 result = _mm_clmulepi64_si128 (a, b, 0);
160
  /* Ben/JSP: Do the prim_poly reduction once.  We are guaranteed to need the
     reduction only once, because (w-2)/z == 1, where z is the number of
     zeros after the leading 1 of the primitive polynomial.

     _mm_clmulepi64_si128 is the carryless multiply operation.  Here
     _mm_srli_epi64 shifts the result to the right by 4 bits.  This allows
     us to multiply the prim_poly by the leading bits of the result.  We
     then xor the result of that operation back into the result. */
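  /* For example, with a4 = b4 = 0xf and prim_poly 0x13: the carryless
     multiply gives 0x55; its high bits are 0x5, and
     _mm_clmulepi64_si128(0x13, 0x5) = 0x5f, so the final result is
     0x55 ^ 0x5f = 0x0a, which is 0xf times 0xf in GF(16). */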
169
170 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_epi64 (result, 4), 0);
171 result = _mm_xor_si128 (result, w);
172
173 /* Extracts 32 bit value from result. */
174
175 rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
176 return rv;
177 }
178 #endif
179
180 static
181 void
gf_w4_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
184 {
185 gf_region_data rd;
186 uint8_t *s8;
187 uint8_t *d8;
188
189 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
190 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
191
192 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1);
193 gf_do_initial_region_alignment(&rd);
194
195 s8 = (uint8_t *) rd.s_start;
196 d8 = (uint8_t *) rd.d_start;
197
198 if (xor) {
199 while (d8 < ((uint8_t *) rd.d_top)) {
200 *d8 ^= (gf->multiply.w32(gf, val, (*s8 & 0xf)) |
201 ((gf->multiply.w32(gf, val, (*s8 >> 4))) << 4));
202 d8++;
203 s8++;
204 }
205 } else {
206 while (d8 < ((uint8_t *) rd.d_top)) {
207 *d8 = (gf->multiply.w32(gf, val, (*s8 & 0xf)) |
208 ((gf->multiply.w32(gf, val, (*s8 >> 4))) << 4));
209 d8++;
210 s8++;
211 }
212 }
213 gf_do_final_region_alignment(&rd);
214 }
215
216 /* ------------------------------------------------------------
217 IMPLEMENTATION: LOG_TABLE:
218
219 JSP: This is a basic log-antilog implementation.
220 I'm not going to spend any time optimizing it because the
221 other techniques are faster for both single and region
222 operations.
223 */
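/* For example, with prim_poly 0x13 and generator x (= 2): log(7) = 10 and
   log(9) = 14, so 7 * 9 = antilog[10 + 14] = antilog[9] = 0xa (the antilog
   table is stored twice in a row, so the sum of two logs indexes it directly
   without a mod).  For division, antilog_tbl_div points to the middle of the
   double-length table so that log(a) - log(b), which may be negative, also
   indexes it directly. */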
224
225 static
226 inline
227 gf_val_32_t
228 gf_w4_log_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
229 {
230 struct gf_logtable_data *ltd;
231
232 ltd = (struct gf_logtable_data *) ((gf_internal_t *) (gf->scratch))->private;
233 return (a == 0 || b == 0) ? 0 : ltd->antilog_tbl[(unsigned)(ltd->log_tbl[a] + ltd->log_tbl[b])];
234 }
235
236 static
237 inline
238 gf_val_32_t
239 gf_w4_log_divide (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
240 {
241 int log_sum = 0;
242 struct gf_logtable_data *ltd;
243
244 if (a == 0 || b == 0) return 0;
245 ltd = (struct gf_logtable_data *) ((gf_internal_t *) (gf->scratch))->private;
246
247 log_sum = ltd->log_tbl[a] - ltd->log_tbl[b];
248 return (ltd->antilog_tbl_div[log_sum]);
249 }
250
251 static
252 void
253 gf_w4_log_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
254 {
255 int i;
256 uint8_t lv, b, c;
257 uint8_t *s8, *d8;
258
259 struct gf_logtable_data *ltd;
260
261 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
262 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
263
264 ltd = (struct gf_logtable_data *) ((gf_internal_t *) (gf->scratch))->private;
265 s8 = (uint8_t *) src;
266 d8 = (uint8_t *) dest;
267
268 lv = ltd->log_tbl[val];
269
270 for (i = 0; i < bytes; i++) {
271 c = (xor) ? d8[i] : 0;
272 b = (s8[i] >> GF_FIELD_WIDTH);
273 c ^= (b == 0) ? 0 : (ltd->antilog_tbl[lv + ltd->log_tbl[b]] << GF_FIELD_WIDTH);
274 b = (s8[i] & 0xf);
275 c ^= (b == 0) ? 0 : ltd->antilog_tbl[lv + ltd->log_tbl[b]];
276 d8[i] = c;
277 }
278 }
279
280 static
281 int gf_w4_log_init(gf_t *gf)
282 {
283 gf_internal_t *h;
284 struct gf_logtable_data *ltd;
285 int i, b;
286
287 h = (gf_internal_t *) gf->scratch;
288 ltd = h->private;
289
290 for (i = 0; i < GF_FIELD_SIZE; i++)
291 ltd->log_tbl[i]=0;
292
293 ltd->antilog_tbl_div = ltd->antilog_tbl + (GF_FIELD_SIZE-1);
294 b = 1;
295 i = 0;
296 do {
297 if (ltd->log_tbl[b] != 0 && i != 0) {
298 fprintf(stderr, "Cannot construct log table: Polynomial is not primitive.\n\n");
299 return 0;
300 }
301 ltd->log_tbl[b] = i;
302 ltd->antilog_tbl[i] = b;
303 ltd->antilog_tbl[i+GF_FIELD_SIZE-1] = b;
304 b <<= 1;
305 i++;
306 if (b & GF_FIELD_SIZE) b = b ^ h->prim_poly;
307 } while (b != 1);
308
309 if (i != GF_FIELD_SIZE - 1) {
310 _gf_errno = GF_E_LOGPOLY;
311 return 0;
312 }
313
314 SET_FUNCTION(gf,inverse,w32,gf_w4_inverse_from_divide)
315 SET_FUNCTION(gf,divide,w32,gf_w4_log_divide)
316 SET_FUNCTION(gf,multiply,w32,gf_w4_log_multiply)
317 SET_FUNCTION(gf,multiply_region,w32,gf_w4_log_multiply_region)
318 return 1;
319 }
320
321 /* ------------------------------------------------------------
322 IMPLEMENTATION: SINGLE TABLE: JSP.
323 */
324
325 static
326 inline
327 gf_val_32_t
328 gf_w4_single_table_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
329 {
330 struct gf_single_table_data *std;
331
332 std = (struct gf_single_table_data *) ((gf_internal_t *) (gf->scratch))->private;
333 return std->mult[a][b];
334 }
335
336 static
337 inline
338 gf_val_32_t
339 gf_w4_single_table_divide (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
340 {
341 struct gf_single_table_data *std;
342
343 std = (struct gf_single_table_data *) ((gf_internal_t *) (gf->scratch))->private;
344 return std->div[a][b];
345 }
346
347 static
348 void
349 gf_w4_single_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
350 {
351 int i;
352 uint8_t b, c;
353 uint8_t *s8, *d8;
354
355 struct gf_single_table_data *std;
356
357 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
358 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
359
360 std = (struct gf_single_table_data *) ((gf_internal_t *) (gf->scratch))->private;
361 s8 = (uint8_t *) src;
362 d8 = (uint8_t *) dest;
363
364 for (i = 0; i < bytes; i++) {
365 c = (xor) ? d8[i] : 0;
366 b = (s8[i] >> GF_FIELD_WIDTH);
367 c ^= (std->mult[val][b] << GF_FIELD_WIDTH);
368 b = (s8[i] & 0xf);
369 c ^= (std->mult[val][b]);
370 d8[i] = c;
371 }
372 }
373
374 #define MM_PRINT(s, r) { uint8_t blah[16]; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (i = 0; i < 16; i++) printf(" %02x", blah[i]); printf("\n"); }
375
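/* The SSSE3 region kernel below uses _mm_shuffle_epi8 as sixteen parallel
   4-bit table lookups: base points at the 16-entry row mult[val][0..15],
   tl holds those products in the low nibble of each byte and th holds the
   same products pre-shifted into the high nibble, so each source byte's two
   nibbles are looked up independently and the results xored together. */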
376 #ifdef INTEL_SSSE3
377 static
378 void
379 gf_w4_single_table_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
380 {
381 gf_region_data rd;
382 uint8_t *base, *sptr, *dptr, *top;
383 __m128i tl, loset, r, va, th;
384
385 struct gf_single_table_data *std;
386
387 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
388 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
389
390 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
391
392 std = (struct gf_single_table_data *) ((gf_internal_t *) (gf->scratch))->private;
393 base = (uint8_t *) std->mult;
394 base += (val << GF_FIELD_WIDTH);
395
396 gf_do_initial_region_alignment(&rd);
397
398 tl = _mm_loadu_si128((__m128i *)base);
399 th = _mm_slli_epi64(tl, 4);
400 loset = _mm_set1_epi8 (0x0f);
401
402 sptr = rd.s_start;
403 dptr = rd.d_start;
404 top = rd.s_top;
405
406 while (sptr < (uint8_t *) top) {
407 va = _mm_load_si128 ((__m128i *)(sptr));
408 r = _mm_and_si128 (loset, va);
409 r = _mm_shuffle_epi8 (tl, r);
410 va = _mm_srli_epi64 (va, 4);
411 va = _mm_and_si128 (loset, va);
412 va = _mm_shuffle_epi8 (th, va);
413 r = _mm_xor_si128 (r, va);
414 va = (xor) ? _mm_load_si128 ((__m128i *)(dptr)) : _mm_setzero_si128();
415 r = _mm_xor_si128 (r, va);
416 _mm_store_si128 ((__m128i *)(dptr), r);
417 dptr += 16;
418 sptr += 16;
419 }
420 gf_do_final_region_alignment(&rd);
421
422 }
423 #endif
424
425 static
426 int gf_w4_single_table_init(gf_t *gf)
427 {
428 gf_internal_t *h;
429 struct gf_single_table_data *std;
430 int a, b, prod;
431
432
433 h = (gf_internal_t *) gf->scratch;
434 std = (struct gf_single_table_data *)h->private;
435
436 bzero(std->mult, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
437 bzero(std->div, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
438
439 for (a = 1; a < GF_FIELD_SIZE; a++) {
440 for (b = 1; b < GF_FIELD_SIZE; b++) {
441 prod = gf_w4_shift_multiply(gf, a, b);
442 std->mult[a][b] = prod;
443 std->div[prod][b] = a;
444 }
445 }
446
447 SET_FUNCTION(gf,inverse,w32,NULL)
448 SET_FUNCTION(gf,divide,w32,gf_w4_single_table_divide)
449 SET_FUNCTION(gf,multiply,w32,gf_w4_single_table_multiply)
450 #if defined(INTEL_SSSE3)
451 if (gf_cpu_supports_intel_ssse3 && !(h->region_type & (GF_REGION_NOSIMD | GF_REGION_CAUCHY))) {
452 SET_FUNCTION(gf,multiply_region,w32,gf_w4_single_table_sse_multiply_region)
453 } else {
454 #elif defined(ARM_NEON)
455 if (gf_cpu_supports_arm_neon && !(h->region_type & (GF_REGION_NOSIMD | GF_REGION_CAUCHY))) {
456 gf_w4_neon_single_table_init(gf);
457 } else {
458 #endif
459 SET_FUNCTION(gf,multiply_region,w32,gf_w4_single_table_multiply_region)
460 if (h->region_type & GF_REGION_SIMD) return 0;
461 #if defined(INTEL_SSSE3) || defined(ARM_NEON)
462 }
463 #endif
464
465 return 1;
466 }
467
468 /* ------------------------------------------------------------
469 IMPLEMENTATION: DOUBLE TABLE: JSP.
470 */
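/* Here std->mult[a] is a 256-entry row mapping a whole source byte (two
   packed nibbles) to the corresponding product byte, so the region loop is
   a single table lookup per byte; division still works nibble-by-nibble
   through std->div. */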
471
472 static
473 inline
474 gf_val_32_t
475 gf_w4_double_table_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
476 {
477 struct gf_double_table_data *std;
478
479 std = (struct gf_double_table_data *) ((gf_internal_t *) (gf->scratch))->private;
480 return std->mult[a][b];
481 }
482
483 static
484 inline
485 gf_val_32_t
486 gf_w4_double_table_divide (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
487 {
488 struct gf_double_table_data *std;
489
490 std = (struct gf_double_table_data *) ((gf_internal_t *) (gf->scratch))->private;
491 return std->div[a][b];
492 }
493
494 static
495 void
496 gf_w4_double_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
497 {
498 int i;
499 uint8_t *s8, *d8, *base;
500 gf_region_data rd;
501 struct gf_double_table_data *std;
502
503 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
504 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
505
506 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);
507
508 std = (struct gf_double_table_data *) ((gf_internal_t *) (gf->scratch))->private;
509 s8 = (uint8_t *) src;
510 d8 = (uint8_t *) dest;
511 base = (uint8_t *) std->mult;
512 base += (val << GF_DOUBLE_WIDTH);
513
514 if (xor) {
515 for (i = 0; i < bytes; i++) d8[i] ^= base[s8[i]];
516 } else {
517 for (i = 0; i < bytes; i++) d8[i] = base[s8[i]];
518 }
519 }
520
521 static
522 int gf_w4_double_table_init(gf_t *gf)
523 {
524 gf_internal_t *h;
525 struct gf_double_table_data *std;
526 int a, b, c, prod, ab;
527 uint8_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE];
528
529 h = (gf_internal_t *) gf->scratch;
530 std = (struct gf_double_table_data *)h->private;
531
532 bzero(mult, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
533 bzero(std->div, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
534
535 for (a = 1; a < GF_FIELD_SIZE; a++) {
536 for (b = 1; b < GF_FIELD_SIZE; b++) {
537 prod = gf_w4_shift_multiply(gf, a, b);
538 mult[a][b] = prod;
539 std->div[prod][b] = a;
540 }
541 }
542 bzero(std->mult, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE * GF_FIELD_SIZE);
543 for (a = 0; a < GF_FIELD_SIZE; a++) {
544 for (b = 0; b < GF_FIELD_SIZE; b++) {
545 ab = mult[a][b];
546 for (c = 0; c < GF_FIELD_SIZE; c++) {
547 std->mult[a][(b << 4) | c] = ((ab << 4) | mult[a][c]);
548 }
549 }
550 }
551
552 SET_FUNCTION(gf,inverse,w32,NULL)
553 SET_FUNCTION(gf,divide,w32,gf_w4_double_table_divide)
554 SET_FUNCTION(gf,multiply,w32,gf_w4_double_table_multiply)
555 SET_FUNCTION(gf,multiply_region,w32,gf_w4_double_table_multiply_region)
556 return 1;
557 }
558
559
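/* ------------------------------------------------------------
   IMPLEMENTATION: QUAD TABLE:

   std->mult[val] is a 65536-entry row of 16-bit values, mapping two source
   bytes (four nibbles) to their four products at once.  The lazy variant
   keeps only the 16x16 single table (smult) and fills the 64K row for the
   requested val at the start of each region call.
 */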
560 static
561 inline
562 gf_val_32_t
563 gf_w4_quad_table_lazy_divide (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
564 {
565 struct gf_quad_table_lazy_data *std;
566
567 std = (struct gf_quad_table_lazy_data *) ((gf_internal_t *) (gf->scratch))->private;
568 return std->div[a][b];
569 }
570
571 static
572 inline
573 gf_val_32_t
574 gf_w4_quad_table_lazy_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
575 {
576 struct gf_quad_table_lazy_data *std;
577
578 std = (struct gf_quad_table_lazy_data *) ((gf_internal_t *) (gf->scratch))->private;
579 return std->smult[a][b];
580 }
581
582 static
583 inline
584 gf_val_32_t
585 gf_w4_quad_table_divide (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
586 {
587 struct gf_quad_table_data *std;
588
589 std = (struct gf_quad_table_data *) ((gf_internal_t *) (gf->scratch))->private;
590 return std->div[a][b];
591 }
592
593 static
594 inline
595 gf_val_32_t
596 gf_w4_quad_table_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
597 {
598 struct gf_quad_table_data *std;
599 uint16_t v;
600
601 std = (struct gf_quad_table_data *) ((gf_internal_t *) (gf->scratch))->private;
602 v = std->mult[a][b];
603 return v;
604 }
605
606 static
607 void
608 gf_w4_quad_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
609 {
610 uint16_t *base;
611 gf_region_data rd;
612 struct gf_quad_table_data *std;
613 struct gf_quad_table_lazy_data *ltd;
614 gf_internal_t *h;
615 int a, b, c, d, va, vb, vc, vd;
616
617 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
618 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
619
620 h = (gf_internal_t *) (gf->scratch);
621 if (h->region_type & GF_REGION_LAZY) {
622 ltd = (struct gf_quad_table_lazy_data *) ((gf_internal_t *) (gf->scratch))->private;
623 base = ltd->mult;
624 for (a = 0; a < 16; a++) {
625 va = (ltd->smult[val][a] << 12);
626 for (b = 0; b < 16; b++) {
627 vb = (ltd->smult[val][b] << 8);
628 for (c = 0; c < 16; c++) {
629 vc = (ltd->smult[val][c] << 4);
630 for (d = 0; d < 16; d++) {
631 vd = ltd->smult[val][d];
632 base[(a << 12) | (b << 8) | (c << 4) | d ] = (va | vb | vc | vd);
633 }
634 }
635 }
636 }
637 } else {
638 std = (struct gf_quad_table_data *) ((gf_internal_t *) (gf->scratch))->private;
639 base = &(std->mult[val][0]);
640 }
641
642 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);
643 gf_do_initial_region_alignment(&rd);
644 gf_two_byte_region_table_multiply(&rd, base);
645 gf_do_final_region_alignment(&rd);
646 }
647
648 static
649 int gf_w4_quad_table_init(gf_t *gf)
650 {
651 gf_internal_t *h;
652 struct gf_quad_table_data *std;
653 int prod, val, a, b, c, d, va, vb, vc, vd;
654 uint8_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE];
655
656 h = (gf_internal_t *) gf->scratch;
657 std = (struct gf_quad_table_data *)h->private;
658
659 bzero(mult, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
660 bzero(std->div, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
661
662 for (a = 1; a < GF_FIELD_SIZE; a++) {
663 for (b = 1; b < GF_FIELD_SIZE; b++) {
664 prod = gf_w4_shift_multiply(gf, a, b);
665 mult[a][b] = prod;
666 std->div[prod][b] = a;
667 }
668 }
669
670 for (val = 0; val < 16; val++) {
671 for (a = 0; a < 16; a++) {
672 va = (mult[val][a] << 12);
673 for (b = 0; b < 16; b++) {
674 vb = (mult[val][b] << 8);
675 for (c = 0; c < 16; c++) {
676 vc = (mult[val][c] << 4);
677 for (d = 0; d < 16; d++) {
678 vd = mult[val][d];
679 std->mult[val][(a << 12) | (b << 8) | (c << 4) | d ] = (va | vb | vc | vd);
680 }
681 }
682 }
683 }
684 }
685
686 SET_FUNCTION(gf,inverse,w32,NULL)
687 SET_FUNCTION(gf,divide,w32,gf_w4_quad_table_divide)
688 SET_FUNCTION(gf,multiply,w32,gf_w4_quad_table_multiply)
689 SET_FUNCTION(gf,multiply_region,w32,gf_w4_quad_table_multiply_region)
690 return 1;
691 }

static
693 int gf_w4_quad_table_lazy_init(gf_t *gf)
694 {
695 gf_internal_t *h;
696 struct gf_quad_table_lazy_data *std;
697 int a, b, prod, loga, logb;
698 uint8_t log_tbl[GF_FIELD_SIZE];
699 uint8_t antilog_tbl[GF_FIELD_SIZE*2];
700
701 h = (gf_internal_t *) gf->scratch;
702 std = (struct gf_quad_table_lazy_data *)h->private;
703
704 b = 1;
705 for (a = 0; a < GF_MULT_GROUP_SIZE; a++) {
706 log_tbl[b] = a;
707 antilog_tbl[a] = b;
708 antilog_tbl[a+GF_MULT_GROUP_SIZE] = b;
709 b <<= 1;
710 if (b & GF_FIELD_SIZE) {
711 b = b ^ h->prim_poly;
712 }
713 }
714
715 bzero(std->smult, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
716 bzero(std->div, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
717
718 for (a = 1; a < GF_FIELD_SIZE; a++) {
719 loga = log_tbl[a];
720 for (b = 1; b < GF_FIELD_SIZE; b++) {
721 logb = log_tbl[b];
722 prod = antilog_tbl[loga+logb];
723 std->smult[a][b] = prod;
724 std->div[prod][b] = a;
725 }
726 }
727
728 SET_FUNCTION(gf,inverse,w32,NULL)
729 SET_FUNCTION(gf,divide,w32,gf_w4_quad_table_lazy_divide)
730 SET_FUNCTION(gf,multiply,w32,gf_w4_quad_table_lazy_multiply)
731 SET_FUNCTION(gf,multiply_region,w32,gf_w4_quad_table_multiply_region)
732 return 1;
733 }
734
735 static
736 int gf_w4_table_init(gf_t *gf)
737 {
738 int rt;
739 gf_internal_t *h;
740
741 h = (gf_internal_t *) gf->scratch;
742 rt = (h->region_type);
743
744 if (h->mult_type == GF_MULT_DEFAULT &&
745 !(gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon))
746 rt |= GF_REGION_DOUBLE_TABLE;
747
748 if (rt & GF_REGION_DOUBLE_TABLE) {
749 return gf_w4_double_table_init(gf);
750 } else if (rt & GF_REGION_QUAD_TABLE) {
751 if (rt & GF_REGION_LAZY) {
752 return gf_w4_quad_table_lazy_init(gf);
753 } else {
754 return gf_w4_quad_table_init(gf);
755 }
756 } else {
757 return gf_w4_single_table_init(gf);
758 }
759 return 0;
760 }
761
762 /* ------------------------------------------------------------
763 JSP: GF_MULT_BYTWO_p and _b: See the paper.
764 */
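/* In outline: BYTWO_p scans the bits of a from the most significant end,
   doubling the accumulated product each step and xoring in b when the bit
   is set; BYTWO_b scans a from the least significant end, xoring b into the
   product when the bit is set and doubling b each step.  The region versions
   do the same on 16 nibbles packed into a 64-bit word (AB2) or 32 nibbles in
   an XMM register (SSE_AB2). */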
765
766 static
767 inline
768 gf_val_32_t
769 gf_w4_bytwo_p_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
770 {
771 uint32_t prod, pp, pmask, amask;
772 gf_internal_t *h;
773
774 h = (gf_internal_t *) gf->scratch;
775 pp = h->prim_poly;
776
777
778 prod = 0;
779 pmask = 0x8;
780 amask = 0x8;
781
782 while (amask != 0) {
783 if (prod & pmask) {
784 prod = ((prod << 1) ^ pp);
785 } else {
786 prod <<= 1;
787 }
788 if (a & amask) prod ^= b;
789 amask >>= 1;
790 }
791 return prod;
792 }
793
794 static
795 inline
796 gf_val_32_t
797 gf_w4_bytwo_b_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
798 {
799 uint32_t prod, pp, bmask;
800 gf_internal_t *h;
801
802 h = (gf_internal_t *) gf->scratch;
803 pp = h->prim_poly;
804
805 prod = 0;
806 bmask = 0x8;
807
808 while (1) {
809 if (a & 1) prod ^= b;
810 a >>= 1;
811 if (a == 0) return prod;
812 if (b & bmask) {
813 b = ((b << 1) ^ pp);
814 } else {
815 b <<= 1;
816 }
817 }
818 }
819
820 static
821 void
822 gf_w4_bytwo_p_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
823 {
824 uint64_t *s64, *d64, t1, t2, ta, prod, amask;
825 gf_region_data rd;
826 struct gf_bytwo_data *btd;
827
828 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
829 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
830
831 btd = (struct gf_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;
832
833 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);
834 gf_do_initial_region_alignment(&rd);
835
836 s64 = (uint64_t *) rd.s_start;
837 d64 = (uint64_t *) rd.d_start;
838
839 if (xor) {
840 while (s64 < (uint64_t *) rd.s_top) {
841 prod = 0;
842 amask = 0x8;
843 ta = *s64;
844 while (amask != 0) {
845 AB2(btd->prim_poly, btd->mask1, btd->mask2, prod, t1, t2);
846 if (val & amask) prod ^= ta;
847 amask >>= 1;
848 }
849 *d64 ^= prod;
850 d64++;
851 s64++;
852 }
853 } else {
854 while (s64 < (uint64_t *) rd.s_top) {
855 prod = 0;
856 amask = 0x8;
857 ta = *s64;
858 while (amask != 0) {
859 AB2(btd->prim_poly, btd->mask1, btd->mask2, prod, t1, t2);
860 if (val & amask) prod ^= ta;
861 amask >>= 1;
862 }
863 *d64 = prod;
864 d64++;
865 s64++;
866 }
867 }
868 gf_do_final_region_alignment(&rd);
869 }
870
871 #define BYTWO_P_ONESTEP {\
872 SSE_AB2(pp, m1, prod, t1, t2); \
873 t1 = _mm_and_si128(v, one); \
874 t1 = _mm_sub_epi8(t1, one); \
875 t1 = _mm_and_si128(t1, ta); \
876 prod = _mm_xor_si128(prod, t1); \
877 v = _mm_srli_epi64(v, 1); }
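/* Each BYTWO_P_ONESTEP doubles every nibble of prod with SSE_AB2 and then
   conditionally xors in the source: (v & 1) - 1 is 0x00 in bytes where the
   low bit of v is set and 0xff where it is clear, so, because vrev stores
   the complement of val's bits (most significant bit first), a set bit of
   val selects ta.  v is shifted right by one to move on to the next bit. */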
878
879 #ifdef INTEL_SSE2
880 static
881 void
882 gf_w4_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
883 {
884 int i;
885 uint8_t *s8, *d8;
886 uint8_t vrev;
887 __m128i pp, m1, ta, prod, t1, t2, tp, one, v;
888 struct gf_bytwo_data *btd;
889 gf_region_data rd;
890
891 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
892 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
893
894 btd = (struct gf_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;
895
896 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
897 gf_do_initial_region_alignment(&rd);
898
899 vrev = 0;
900 for (i = 0; i < 4; i++) {
901 vrev <<= 1;
902 if (!(val & (1 << i))) vrev |= 1;
903 }
904
905 s8 = (uint8_t *) rd.s_start;
906 d8 = (uint8_t *) rd.d_start;
907
908 pp = _mm_set1_epi8(btd->prim_poly&0xff);
909 m1 = _mm_set1_epi8((btd->mask1)&0xff);
910 one = _mm_set1_epi8(1);
911
912 while (d8 < (uint8_t *) rd.d_top) {
913 prod = _mm_setzero_si128();
914 v = _mm_set1_epi8(vrev);
915 ta = _mm_load_si128((__m128i *) s8);
916 tp = (!xor) ? _mm_setzero_si128() : _mm_load_si128((__m128i *) d8);
917 BYTWO_P_ONESTEP;
918 BYTWO_P_ONESTEP;
919 BYTWO_P_ONESTEP;
920 BYTWO_P_ONESTEP;
921 _mm_store_si128((__m128i *) d8, _mm_xor_si128(prod, tp));
922 d8 += 16;
923 s8 += 16;
924 }
925 gf_do_final_region_alignment(&rd);
926 }
927 #endif
928
929 /*
930 #ifdef INTEL_SSE2
931 static
932 void
933 gf_w4_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
934 {
935 uint8_t *d8, *s8, tb;
936 __m128i pp, m1, m2, t1, t2, va, vb;
937 struct gf_bytwo_data *btd;
938 gf_region_data rd;
939
940 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
941 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
942
943 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
944 gf_do_initial_region_alignment(&rd);
945
946 s8 = (uint8_t *) rd.s_start;
947 d8 = (uint8_t *) rd.d_start;
948
949 btd = (struct gf_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;
950
951 pp = _mm_set1_epi8(btd->prim_poly&0xff);
952 m1 = _mm_set1_epi8((btd->mask1)&0xff);
953 m2 = _mm_set1_epi8((btd->mask2)&0xff);
954
955 if (xor) {
956 while (d8 < (uint8_t *) rd.d_top) {
957 va = _mm_load_si128 ((__m128i *)(s8));
958 vb = _mm_load_si128 ((__m128i *)(d8));
959 tb = val;
960 while (1) {
961 if (tb & 1) vb = _mm_xor_si128(vb, va);
962 tb >>= 1;
963 if (tb == 0) break;
964 SSE_AB2(pp, m1, m2, va, t1, t2);
965 }
966 _mm_store_si128((__m128i *)d8, vb);
967 d8 += 16;
968 s8 += 16;
969 }
970 } else {
971 while (d8 < (uint8_t *) rd.d_top) {
972 va = _mm_load_si128 ((__m128i *)(s8));
973 vb = _mm_setzero_si128 ();
974 tb = val;
975 while (1) {
976 if (tb & 1) vb = _mm_xor_si128(vb, va);
977 tb >>= 1;
978 if (tb == 0) break;
979 t1 = _mm_and_si128(_mm_slli_epi64(va, 1), m1);
980 t2 = _mm_and_si128(va, m2);
981 t2 = _mm_sub_epi64 (
982 _mm_slli_epi64(t2, 1), _mm_srli_epi64(t2, (GF_FIELD_WIDTH-1)));
983 va = _mm_xor_si128(t1, _mm_and_si128(t2, pp));
984 }
985 _mm_store_si128((__m128i *)d8, vb);
986 d8 += 16;
987 s8 += 16;
988 }
989 }
990 gf_do_final_region_alignment(&rd);
991 }
992 #endif
993 */
994
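/* The helpers below special-case small constants as fixed sequences of
   doublings (SSE_AB2) and xors of intermediate values: e.g. region_2 stores
   2*x, region_3 stores 2*x ^ x, region_7 stores 4*x ^ 2*x ^ x.  The generic
   loop at the bottom of gf_w4_bytwo_b_sse_multiply_region handles any other
   value of val. */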
995 #ifdef INTEL_SSE2
996 static
997 void
998 gf_w4_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_bytwo_data *btd)
999 {
1000 uint8_t *d8, *s8;
1001 __m128i pp, m1, t1, t2, va;
1002
1003 s8 = (uint8_t *) rd->s_start;
1004 d8 = (uint8_t *) rd->d_start;
1005
1006 pp = _mm_set1_epi8(btd->prim_poly&0xff);
1007 m1 = _mm_set1_epi8((btd->mask1)&0xff);
1008
1009 while (d8 < (uint8_t *) rd->d_top) {
1010 va = _mm_load_si128 ((__m128i *)(s8));
1011 SSE_AB2(pp, m1, va, t1, t2);
1012 _mm_store_si128((__m128i *)d8, va);
1013 d8 += 16;
1014 s8 += 16;
1015 }
1016 }
1017 #endif
1018
1019 #ifdef INTEL_SSE2
1020 static
1021 void
1022 gf_w4_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_bytwo_data *btd)
1023 {
1024 uint8_t *d8, *s8;
1025 __m128i pp, m1, t1, t2, va, vb;
1026
1027 s8 = (uint8_t *) rd->s_start;
1028 d8 = (uint8_t *) rd->d_start;
1029
1030 pp = _mm_set1_epi8(btd->prim_poly&0xff);
1031 m1 = _mm_set1_epi8((btd->mask1)&0xff);
1032
1033 while (d8 < (uint8_t *) rd->d_top) {
1034 va = _mm_load_si128 ((__m128i *)(s8));
1035 SSE_AB2(pp, m1, va, t1, t2);
1036 vb = _mm_load_si128 ((__m128i *)(d8));
1037 vb = _mm_xor_si128(vb, va);
1038 _mm_store_si128((__m128i *)d8, vb);
1039 d8 += 16;
1040 s8 += 16;
1041 }
1042 }
1043 #endif
1044
1045 #ifdef INTEL_SSE2
1046 static
1047 void
1048 gf_w4_bytwo_b_sse_region_4_noxor(gf_region_data *rd, struct gf_bytwo_data *btd)
1049 {
1050 uint8_t *d8, *s8;
1051 __m128i pp, m1, t1, t2, va;
1052
1053 s8 = (uint8_t *) rd->s_start;
1054 d8 = (uint8_t *) rd->d_start;
1055
1056 pp = _mm_set1_epi8(btd->prim_poly&0xff);
1057 m1 = _mm_set1_epi8((btd->mask1)&0xff);
1058
1059 while (d8 < (uint8_t *) rd->d_top) {
1060 va = _mm_load_si128 ((__m128i *)(s8));
1061 SSE_AB2(pp, m1, va, t1, t2);
1062 SSE_AB2(pp, m1, va, t1, t2);
1063 _mm_store_si128((__m128i *)d8, va);
1064 d8 += 16;
1065 s8 += 16;
1066 }
1067 }
1068 #endif
1069
1070 #ifdef INTEL_SSE2
1071 static
1072 void
1073 gf_w4_bytwo_b_sse_region_4_xor(gf_region_data *rd, struct gf_bytwo_data *btd)
1074 {
1075 uint8_t *d8, *s8;
1076 __m128i pp, m1, t1, t2, va, vb;
1077
1078 s8 = (uint8_t *) rd->s_start;
1079 d8 = (uint8_t *) rd->d_start;
1080
1081 pp = _mm_set1_epi8(btd->prim_poly&0xff);
1082 m1 = _mm_set1_epi8((btd->mask1)&0xff);
1083
1084 while (d8 < (uint8_t *) rd->d_top) {
1085 va = _mm_load_si128 ((__m128i *)(s8));
1086 SSE_AB2(pp, m1, va, t1, t2);
1087 SSE_AB2(pp, m1, va, t1, t2);
1088 vb = _mm_load_si128 ((__m128i *)(d8));
1089 vb = _mm_xor_si128(vb, va);
1090 _mm_store_si128((__m128i *)d8, vb);
1091 d8 += 16;
1092 s8 += 16;
1093 }
1094 }
1095 #endif
1096
1097
1098 #ifdef INTEL_SSE2
1099 static
1100 void
1101 gf_w4_bytwo_b_sse_region_3_noxor(gf_region_data *rd, struct gf_bytwo_data *btd)
1102 {
1103 uint8_t *d8, *s8;
1104 __m128i pp, m1, t1, t2, va, vb;
1105
1106 s8 = (uint8_t *) rd->s_start;
1107 d8 = (uint8_t *) rd->d_start;
1108
1109 pp = _mm_set1_epi8(btd->prim_poly&0xff);
1110 m1 = _mm_set1_epi8((btd->mask1)&0xff);
1111
1112 while (d8 < (uint8_t *) rd->d_top) {
1113 va = _mm_load_si128 ((__m128i *)(s8));
1114 vb = va;
1115 SSE_AB2(pp, m1, va, t1, t2);
1116 va = _mm_xor_si128(va, vb);
1117 _mm_store_si128((__m128i *)d8, va);
1118 d8 += 16;
1119 s8 += 16;
1120 }
1121 }
1122 #endif
1123
1124 #ifdef INTEL_SSE2
1125 static
1126 void
1127 gf_w4_bytwo_b_sse_region_3_xor(gf_region_data *rd, struct gf_bytwo_data *btd)
1128 {
1129 uint8_t *d8, *s8;
1130 __m128i pp, m1, t1, t2, va, vb;
1131
1132 s8 = (uint8_t *) rd->s_start;
1133 d8 = (uint8_t *) rd->d_start;
1134
1135 pp = _mm_set1_epi8(btd->prim_poly&0xff);
1136 m1 = _mm_set1_epi8((btd->mask1)&0xff);
1137
1138 while (d8 < (uint8_t *) rd->d_top) {
1139 va = _mm_load_si128 ((__m128i *)(s8));
1140 vb = _mm_xor_si128(_mm_load_si128 ((__m128i *)(d8)), va);
1141 SSE_AB2(pp, m1, va, t1, t2);
1142 vb = _mm_xor_si128(vb, va);
1143 _mm_store_si128((__m128i *)d8, vb);
1144 d8 += 16;
1145 s8 += 16;
1146 }
1147 }
1148 #endif
1149
1150 #ifdef INTEL_SSE2
1151 static
1152 void
1153 gf_w4_bytwo_b_sse_region_5_noxor(gf_region_data *rd, struct gf_bytwo_data *btd)
1154 {
1155 uint8_t *d8, *s8;
1156 __m128i pp, m1, t1, t2, va, vb;
1157
1158 s8 = (uint8_t *) rd->s_start;
1159 d8 = (uint8_t *) rd->d_start;
1160
1161 pp = _mm_set1_epi8(btd->prim_poly&0xff);
1162 m1 = _mm_set1_epi8((btd->mask1)&0xff);
1163
1164 while (d8 < (uint8_t *) rd->d_top) {
1165 va = _mm_load_si128 ((__m128i *)(s8));
1166 vb = va;
1167 SSE_AB2(pp, m1, va, t1, t2);
1168 SSE_AB2(pp, m1, va, t1, t2);
1169 va = _mm_xor_si128(va, vb);
1170 _mm_store_si128((__m128i *)d8, va);
1171 d8 += 16;
1172 s8 += 16;
1173 }
1174 }
1175 #endif
1176
1177 #ifdef INTEL_SSE2
1178 static
1179 void
1180 gf_w4_bytwo_b_sse_region_5_xor(gf_region_data *rd, struct gf_bytwo_data *btd)
1181 {
1182 uint8_t *d8, *s8;
1183 __m128i pp, m1, t1, t2, va, vb;
1184
1185 s8 = (uint8_t *) rd->s_start;
1186 d8 = (uint8_t *) rd->d_start;
1187
1188 pp = _mm_set1_epi8(btd->prim_poly&0xff);
1189 m1 = _mm_set1_epi8((btd->mask1)&0xff);
1190
1191 while (d8 < (uint8_t *) rd->d_top) {
1192 va = _mm_load_si128 ((__m128i *)(s8));
1193 vb = _mm_xor_si128(_mm_load_si128 ((__m128i *)(d8)), va);
1194 SSE_AB2(pp, m1, va, t1, t2);
1195 SSE_AB2(pp, m1, va, t1, t2);
1196 vb = _mm_xor_si128(vb, va);
1197 _mm_store_si128((__m128i *)d8, vb);
1198 d8 += 16;
1199 s8 += 16;
1200 }
1201 }
1202 #endif
1203
1204 #ifdef INTEL_SSE2
1205 static
1206 void
1207 gf_w4_bytwo_b_sse_region_7_noxor(gf_region_data *rd, struct gf_bytwo_data *btd)
1208 {
1209 uint8_t *d8, *s8;
1210 __m128i pp, m1, t1, t2, va, vb;
1211
1212 s8 = (uint8_t *) rd->s_start;
1213 d8 = (uint8_t *) rd->d_start;
1214
1215 pp = _mm_set1_epi8(btd->prim_poly&0xff);
1216 m1 = _mm_set1_epi8((btd->mask1)&0xff);
1217
1218 while (d8 < (uint8_t *) rd->d_top) {
1219 va = _mm_load_si128 ((__m128i *)(s8));
1220 vb = va;
1221 SSE_AB2(pp, m1, va, t1, t2);
1222 vb = _mm_xor_si128(va, vb);
1223 SSE_AB2(pp, m1, va, t1, t2);
1224 va = _mm_xor_si128(va, vb);
1225 _mm_store_si128((__m128i *)d8, va);
1226 d8 += 16;
1227 s8 += 16;
1228 }
1229 }
1230 #endif
1231
1232 #ifdef INTEL_SSE2
1233 static
1234 void
1235 gf_w4_bytwo_b_sse_region_7_xor(gf_region_data *rd, struct gf_bytwo_data *btd)
1236 {
1237 uint8_t *d8, *s8;
1238 __m128i pp, m1, t1, t2, va, vb;
1239
1240 s8 = (uint8_t *) rd->s_start;
1241 d8 = (uint8_t *) rd->d_start;
1242
1243 pp = _mm_set1_epi8(btd->prim_poly&0xff);
1244 m1 = _mm_set1_epi8((btd->mask1)&0xff);
1245
1246 while (d8 < (uint8_t *) rd->d_top) {
1247 va = _mm_load_si128 ((__m128i *)(s8));
1248 vb = _mm_xor_si128(_mm_load_si128 ((__m128i *)(d8)), va);
1249 SSE_AB2(pp, m1, va, t1, t2);
1250 vb = _mm_xor_si128(vb, va);
1251 SSE_AB2(pp, m1, va, t1, t2);
1252 vb = _mm_xor_si128(vb, va);
1253 _mm_store_si128((__m128i *)d8, vb);
1254 d8 += 16;
1255 s8 += 16;
1256 }
1257 }
1258 #endif
1259
1260 #ifdef INTEL_SSE2
1261 static
1262 void
1263 gf_w4_bytwo_b_sse_region_6_noxor(gf_region_data *rd, struct gf_bytwo_data *btd)
1264 {
1265 uint8_t *d8, *s8;
1266 __m128i pp, m1, t1, t2, va, vb;
1267
1268 s8 = (uint8_t *) rd->s_start;
1269 d8 = (uint8_t *) rd->d_start;
1270
1271 pp = _mm_set1_epi8(btd->prim_poly&0xff);
1272 m1 = _mm_set1_epi8((btd->mask1)&0xff);
1273
1274 while (d8 < (uint8_t *) rd->d_top) {
1275 va = _mm_load_si128 ((__m128i *)(s8));
1276 SSE_AB2(pp, m1, va, t1, t2);
1277 vb = va;
1278 SSE_AB2(pp, m1, va, t1, t2);
1279 va = _mm_xor_si128(va, vb);
1280 _mm_store_si128((__m128i *)d8, va);
1281 d8 += 16;
1282 s8 += 16;
1283 }
1284 }
1285 #endif
1286
1287 #ifdef INTEL_SSE2
1288 static
1289 void
1290 gf_w4_bytwo_b_sse_region_6_xor(gf_region_data *rd, struct gf_bytwo_data *btd)
1291 {
1292 uint8_t *d8, *s8;
1293 __m128i pp, m1, t1, t2, va, vb;
1294
1295 s8 = (uint8_t *) rd->s_start;
1296 d8 = (uint8_t *) rd->d_start;
1297
1298 pp = _mm_set1_epi8(btd->prim_poly&0xff);
1299 m1 = _mm_set1_epi8((btd->mask1)&0xff);
1300
1301 while (d8 < (uint8_t *) rd->d_top) {
1302 va = _mm_load_si128 ((__m128i *)(s8));
1303 SSE_AB2(pp, m1, va, t1, t2);
1304 vb = _mm_xor_si128(_mm_load_si128 ((__m128i *)(d8)), va);
1305 SSE_AB2(pp, m1, va, t1, t2);
1306 vb = _mm_xor_si128(vb, va);
1307 _mm_store_si128((__m128i *)d8, vb);
1308 d8 += 16;
1309 s8 += 16;
1310 }
1311 }
1312 #endif
1313
1314 #ifdef INTEL_SSE2
1315 static
1316 void
1317 gf_w4_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
1318 {
1319 uint8_t *d8, *s8, tb;
1320 __m128i pp, m1, m2, t1, t2, va, vb;
1321 struct gf_bytwo_data *btd;
1322 gf_region_data rd;
1323
1324 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
1325 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
1326
1327 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
1328 gf_do_initial_region_alignment(&rd);
1329
1330 s8 = (uint8_t *) rd.s_start;
1331 d8 = (uint8_t *) rd.d_start;
1332
1333 btd = (struct gf_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;
1334
1335 switch (val) {
1336 case 2:
1337 if (!xor) {
1338 gf_w4_bytwo_b_sse_region_2_noxor(&rd, btd);
1339 } else {
1340 gf_w4_bytwo_b_sse_region_2_xor(&rd, btd);
1341 }
1342 gf_do_final_region_alignment(&rd);
1343 return;
1344 case 3:
1345 if (!xor) {
1346 gf_w4_bytwo_b_sse_region_3_noxor(&rd, btd);
1347 } else {
1348 gf_w4_bytwo_b_sse_region_3_xor(&rd, btd);
1349 }
1350 gf_do_final_region_alignment(&rd);
1351 return;
1352 case 4:
1353 if (!xor) {
1354 gf_w4_bytwo_b_sse_region_4_noxor(&rd, btd);
1355 } else {
1356 gf_w4_bytwo_b_sse_region_4_xor(&rd, btd);
1357 }
1358 gf_do_final_region_alignment(&rd);
1359 return;
1360 case 5:
1361 if (!xor) {
1362 gf_w4_bytwo_b_sse_region_5_noxor(&rd, btd);
1363 } else {
1364 gf_w4_bytwo_b_sse_region_5_xor(&rd, btd);
1365 }
1366 gf_do_final_region_alignment(&rd);
1367 return;
1368 case 6:
1369 if (!xor) {
1370 gf_w4_bytwo_b_sse_region_6_noxor(&rd, btd);
1371 } else {
1372 gf_w4_bytwo_b_sse_region_6_xor(&rd, btd);
1373 }
1374 gf_do_final_region_alignment(&rd);
1375 return;
1376 case 7:
1377 if (!xor) {
1378 gf_w4_bytwo_b_sse_region_7_noxor(&rd, btd);
1379 } else {
1380 gf_w4_bytwo_b_sse_region_7_xor(&rd, btd);
1381 }
1382 gf_do_final_region_alignment(&rd);
1383 return;
1384 }
1385
1386 pp = _mm_set1_epi8(btd->prim_poly&0xff);
1387 m1 = _mm_set1_epi8((btd->mask1)&0xff);
1388 m2 = _mm_set1_epi8((btd->mask2)&0xff);
1389
1390 if (xor) {
1391 while (d8 < (uint8_t *) rd.d_top) {
1392 va = _mm_load_si128 ((__m128i *)(s8));
1393 vb = _mm_load_si128 ((__m128i *)(d8));
1394 tb = val;
1395 while (1) {
1396 if (tb & 1) vb = _mm_xor_si128(vb, va);
1397 tb >>= 1;
1398 if (tb == 0) break;
1399 SSE_AB2(pp, m1, va, t1, t2);
1400 }
1401 _mm_store_si128((__m128i *)d8, vb);
1402 d8 += 16;
1403 s8 += 16;
1404 }
1405 } else {
1406 while (d8 < (uint8_t *) rd.d_top) {
1407 va = _mm_load_si128 ((__m128i *)(s8));
1408 vb = _mm_setzero_si128 ();
1409 tb = val;
1410 while (1) {
1411 if (tb & 1) vb = _mm_xor_si128(vb, va);
1412 tb >>= 1;
1413 if (tb == 0) break;
1414 t1 = _mm_and_si128(_mm_slli_epi64(va, 1), m1);
1415 t2 = _mm_and_si128(va, m2);
1416 t2 = _mm_sub_epi64 (
1417 _mm_slli_epi64(t2, 1), _mm_srli_epi64(t2, (GF_FIELD_WIDTH-1)));
1418 va = _mm_xor_si128(t1, _mm_and_si128(t2, pp));
1419 }
1420 _mm_store_si128((__m128i *)d8, vb);
1421 d8 += 16;
1422 s8 += 16;
1423 }
1424 }
1425 gf_do_final_region_alignment(&rd);
1426 }
1427 #endif
1428
1429 static
1430 void
1431 gf_w4_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
1432 {
1433 uint64_t *s64, *d64, t1, t2, ta, tb, prod;
1434 struct gf_bytwo_data *btd;
1435 gf_region_data rd;
1436
1437 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
1438 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
1439
1440 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
1441 gf_do_initial_region_alignment(&rd);
1442
1443 btd = (struct gf_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;
1444 s64 = (uint64_t *) rd.s_start;
1445 d64 = (uint64_t *) rd.d_start;
1446
1447 switch (val) {
1448 case 1:
1449 if (xor) {
1450 while (d64 < (uint64_t *) rd.d_top) {
1451 *d64 ^= *s64;
1452 d64++;
1453 s64++;
1454 }
1455 } else {
1456 while (d64 < (uint64_t *) rd.d_top) {
1457 *d64 = *s64;
1458 d64++;
1459 s64++;
1460 }
1461 }
1462 break;
1463 case 2:
1464 if (xor) {
1465 while (d64 < (uint64_t *) rd.d_top) {
1466 ta = *s64;
1467 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1468 *d64 ^= ta;
1469 d64++;
1470 s64++;
1471 }
1472 } else {
1473 while (d64 < (uint64_t *) rd.d_top) {
1474 ta = *s64;
1475 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1476 *d64 = ta;
1477 d64++;
1478 s64++;
1479 }
1480 }
1481 break;
1482 case 3:
1483 if (xor) {
1484 while (d64 < (uint64_t *) rd.d_top) {
1485 ta = *s64;
1486 prod = ta;
1487 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1488 *d64 ^= (ta ^ prod);
1489 d64++;
1490 s64++;
1491 }
1492 } else {
1493 while (d64 < (uint64_t *) rd.d_top) {
1494 ta = *s64;
1495 prod = ta;
1496 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1497 *d64 = (ta ^ prod);
1498 d64++;
1499 s64++;
1500 }
1501 }
1502 break;
1503 case 4:
1504 if (xor) {
1505 while (d64 < (uint64_t *) rd.d_top) {
1506 ta = *s64;
1507 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1508 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1509 *d64 ^= ta;
1510 d64++;
1511 s64++;
1512 }
1513 } else {
1514 while (d64 < (uint64_t *) rd.d_top) {
1515 ta = *s64;
1516 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1517 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1518 *d64 = ta;
1519 d64++;
1520 s64++;
1521 }
1522 }
1523 break;
1524 case 5:
1525 if (xor) {
1526 while (d64 < (uint64_t *) rd.d_top) {
1527 ta = *s64;
1528 prod = ta;
1529 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1530 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1531 *d64 ^= (ta ^ prod);
1532 d64++;
1533 s64++;
1534 }
1535 } else {
1536 while (d64 < (uint64_t *) rd.d_top) {
1537 ta = *s64;
1538 prod = ta;
1539 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1540 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1541 *d64 = ta ^ prod;
1542 d64++;
1543 s64++;
1544 }
1545 }
1546 break;
1547 case 6:
1548 if (xor) {
1549 while (d64 < (uint64_t *) rd.d_top) {
1550 ta = *s64;
1551 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1552 prod = ta;
1553 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1554 *d64 ^= (ta ^ prod);
1555 d64++;
1556 s64++;
1557 }
1558 } else {
1559 while (d64 < (uint64_t *) rd.d_top) {
1560 ta = *s64;
1561 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1562 prod = ta;
1563 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1564 *d64 = ta ^ prod;
1565 d64++;
1566 s64++;
1567 }
1568 }
1569 break;
1570 case 7:
1571 if (xor) {
1572 while (d64 < (uint64_t *) rd.d_top) {
1573 ta = *s64;
1574 prod = ta;
1575 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1576 prod ^= ta;
1577 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1578 *d64 ^= (ta ^ prod);
1579 d64++;
1580 s64++;
1581 }
1582 } else {
1583 while (d64 < (uint64_t *) rd.d_top) {
1584 ta = *s64;
1585 prod = ta;
1586 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1587 prod ^= ta;
1588 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1589 *d64 = ta ^ prod;
1590 d64++;
1591 s64++;
1592 }
1593 }
1594 break;
1595 case 8:
1596 if (xor) {
1597 while (d64 < (uint64_t *) rd.d_top) {
1598 ta = *s64;
1599 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1600 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1601 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1602 *d64 ^= ta;
1603 d64++;
1604 s64++;
1605 }
1606 } else {
1607 while (d64 < (uint64_t *) rd.d_top) {
1608 ta = *s64;
1609 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1610 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1611 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1612 *d64 = ta;
1613 d64++;
1614 s64++;
1615 }
1616 }
1617 break;
1618 case 9:
1619 if (xor) {
1620 while (d64 < (uint64_t *) rd.d_top) {
1621 ta = *s64;
1622 prod = ta;
1623 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1624 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1625 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1626 *d64 ^= (ta ^ prod);
1627 d64++;
1628 s64++;
1629 }
1630 } else {
1631 while (d64 < (uint64_t *) rd.d_top) {
1632 ta = *s64;
1633 prod = ta;
1634 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1635 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1636 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1637 *d64 = (ta ^ prod);
1638 d64++;
1639 s64++;
1640 }
1641 }
1642 break;
1643 case 10:
1644 if (xor) {
1645 while (d64 < (uint64_t *) rd.d_top) {
1646 ta = *s64;
1647 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1648 prod = ta;
1649 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1650 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1651 *d64 ^= (ta ^ prod);
1652 d64++;
1653 s64++;
1654 }
1655 } else {
1656 while (d64 < (uint64_t *) rd.d_top) {
1657 ta = *s64;
1658 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1659 prod = ta;
1660 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1661 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1662 *d64 = (ta ^ prod);
1663 d64++;
1664 s64++;
1665 }
1666 }
1667 break;
1668 case 11:
1669 if (xor) {
1670 while (d64 < (uint64_t *) rd.d_top) {
1671 ta = *s64;
1672 prod = ta;
1673 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1674 prod ^= ta;
1675 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1676 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1677 *d64 ^= (ta ^ prod);
1678 d64++;
1679 s64++;
1680 }
1681 } else {
1682 while (d64 < (uint64_t *) rd.d_top) {
1683 ta = *s64;
1684 prod = ta;
1685 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1686 prod ^= ta;
1687 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1688 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1689 *d64 = (ta ^ prod);
1690 d64++;
1691 s64++;
1692 }
1693 }
1694 break;
1695 case 12:
1696 if (xor) {
1697 while (d64 < (uint64_t *) rd.d_top) {
1698 ta = *s64;
1699 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1700 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1701 prod = ta;
1702 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1703 *d64 ^= (ta ^ prod);
1704 d64++;
1705 s64++;
1706 }
1707 } else {
1708 while (d64 < (uint64_t *) rd.d_top) {
1709 ta = *s64;
1710 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1711 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1712 prod = ta;
1713 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1714 *d64 = (ta ^ prod);
1715 d64++;
1716 s64++;
1717 }
1718 }
1719 break;
1720 case 13:
1721 if (xor) {
1722 while (d64 < (uint64_t *) rd.d_top) {
1723 ta = *s64;
1724 prod = ta;
1725 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1726 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1727 prod ^= ta;
1728 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1729 *d64 ^= (ta ^ prod);
1730 d64++;
1731 s64++;
1732 }
1733 } else {
1734 while (d64 < (uint64_t *) rd.d_top) {
1735 ta = *s64;
1736 prod = ta;
1737 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1738 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1739 prod ^= ta;
1740 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1741 *d64 = (ta ^ prod);
1742 d64++;
1743 s64++;
1744 }
1745 }
1746 break;
1747 case 14:
1748 if (xor) {
1749 while (d64 < (uint64_t *) rd.d_top) {
1750 ta = *s64;
1751 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1752 prod = ta;
1753 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1754 prod ^= ta;
1755 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1756 *d64 ^= (ta ^ prod);
1757 d64++;
1758 s64++;
1759 }
1760 } else {
1761 while (d64 < (uint64_t *) rd.d_top) {
1762 ta = *s64;
1763 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1764 prod = ta;
1765 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1766 prod ^= ta;
1767 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1768 *d64 = (ta ^ prod);
1769 d64++;
1770 s64++;
1771 }
1772 }
1773 break;
1774 case 15:
1775 if (xor) {
1776 while (d64 < (uint64_t *) rd.d_top) {
1777 ta = *s64;
1778 prod = ta;
1779 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1780 prod ^= ta;
1781 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1782 prod ^= ta;
1783 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1784 *d64 ^= (ta ^ prod);
1785 d64++;
1786 s64++;
1787 }
1788 } else {
1789 while (d64 < (uint64_t *) rd.d_top) {
1790 ta = *s64;
1791 prod = ta;
1792 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1793 prod ^= ta;
1794 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1795 prod ^= ta;
1796 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1797 *d64 = (ta ^ prod);
1798 d64++;
1799 s64++;
1800 }
1801 }
1802 break;
1803 default:
1804 if (xor) {
1805 while (d64 < (uint64_t *) rd.d_top) {
1806 prod = *d64 ;
1807 ta = *s64;
1808 tb = val;
1809 while (1) {
1810 if (tb & 1) prod ^= ta;
1811 tb >>= 1;
1812 if (tb == 0) break;
1813 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1814 }
1815 *d64 = prod;
1816 d64++;
1817 s64++;
1818 }
1819 } else {
1820 while (d64 < (uint64_t *) rd.d_top) {
1821 prod = 0 ;
1822 ta = *s64;
1823 tb = val;
1824 while (1) {
1825 if (tb & 1) prod ^= ta;
1826 tb >>= 1;
1827 if (tb == 0) break;
1828 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1829 }
1830 *d64 = prod;
1831 d64++;
1832 s64++;
1833 }
1834 }
1835 break;
1836 }
1837 gf_do_final_region_alignment(&rd);
1838 }
1839
1840 static
1841 int gf_w4_bytwo_init(gf_t *gf)
1842 {
1843 gf_internal_t *h;
1844 uint64_t ip, m1, m2;
1845 struct gf_bytwo_data *btd;
1846
1847 h = (gf_internal_t *) gf->scratch;
1848 btd = (struct gf_bytwo_data *) (h->private);
1849 ip = h->prim_poly & 0xf;
1850 m1 = 0xe;
1851 m2 = 0x8;
1852 btd->prim_poly = 0;
1853 btd->mask1 = 0;
1854 btd->mask2 = 0;
1855
1856 while (ip != 0) {
1857 btd->prim_poly |= ip;
1858 btd->mask1 |= m1;
1859 btd->mask2 |= m2;
1860 ip <<= GF_FIELD_WIDTH;
1861 m1 <<= GF_FIELD_WIDTH;
1862 m2 <<= GF_FIELD_WIDTH;
1863 }
1864
1865 if (h->mult_type == GF_MULT_BYTWO_p) {
1866 SET_FUNCTION(gf,multiply,w32,gf_w4_bytwo_p_multiply)
1867 #ifdef INTEL_SSE2
1868 if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
1869 SET_FUNCTION(gf,multiply_region,w32,gf_w4_bytwo_p_sse_multiply_region)
1870 } else {
1871 #endif
1872 SET_FUNCTION(gf,multiply_region,w32,gf_w4_bytwo_p_nosse_multiply_region)
1873 if (h->region_type & GF_REGION_SIMD)
1874 return 0;
1875 #ifdef INTEL_SSE2
1876 }
1877 #endif
1878 } else {
1879 SET_FUNCTION(gf,multiply,w32,gf_w4_bytwo_b_multiply)
1880 #ifdef INTEL_SSE2
1881 if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
1882 SET_FUNCTION(gf,multiply_region,w32,gf_w4_bytwo_b_sse_multiply_region)
1883 } else {
1884 #endif
1885 SET_FUNCTION(gf,multiply_region,w32,gf_w4_bytwo_b_nosse_multiply_region)
1886 if (h->region_type & GF_REGION_SIMD)
1887 return 0;
1888 #ifdef INTEL_SSE2
1889 }
1890 #endif
1891 }
1892 return 1;
1893 }
1894
1895
1896 static
1897 int gf_w4_cfm_init(gf_t *gf)
1898 {
1899 #if defined(INTEL_SSE4_PCLMUL)
1900 if (gf_cpu_supports_intel_pclmul) {
1901 SET_FUNCTION(gf,multiply,w32,gf_w4_clm_multiply)
1902 return 1;
1903 }
1904 #elif defined(ARM_NEON)
1905 if (gf_cpu_supports_arm_neon) {
1906 return gf_w4_neon_cfm_init(gf);
1907 }
1908 #endif
1909 return 0;
1910 }
1911
1912 static
1913 int gf_w4_shift_init(gf_t *gf)
1914 {
1915 SET_FUNCTION(gf,multiply,w32,gf_w4_shift_multiply)
1916 return 1;
1917 }
1918
1919 /* JSP: I'm putting all error-checking into gf_error_check(), so you don't
   have to do error checking in scratch_size or in init. */
1921
1922 int gf_w4_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2)
1923 {
1924 switch(mult_type)
1925 {
1926 case GF_MULT_BYTWO_p:
1927 case GF_MULT_BYTWO_b:
1928 return sizeof(gf_internal_t) + sizeof(struct gf_bytwo_data);
1929 break;
1930 case GF_MULT_DEFAULT:
1931 case GF_MULT_TABLE:
1932 if (region_type == GF_REGION_CAUCHY) {
1933 return sizeof(gf_internal_t) + sizeof(struct gf_single_table_data) + 64;
1934 }
1935
1936 if (mult_type == GF_MULT_DEFAULT &&
1937 !(gf_cpu_supports_arm_neon || gf_cpu_supports_intel_ssse3))
1938 region_type = GF_REGION_DOUBLE_TABLE;
1939
1940 if (region_type & GF_REGION_DOUBLE_TABLE) {
1941 return sizeof(gf_internal_t) + sizeof(struct gf_double_table_data) + 64;
1942 } else if (region_type & GF_REGION_QUAD_TABLE) {
1943 if ((region_type & GF_REGION_LAZY) == 0) {
1944 return sizeof(gf_internal_t) + sizeof(struct gf_quad_table_data) + 64;
1945 } else {
1946 return sizeof(gf_internal_t) + sizeof(struct gf_quad_table_lazy_data) + 64;
1947 }
1948 } else {
1949 return sizeof(gf_internal_t) + sizeof(struct gf_single_table_data) + 64;
1950 }
1951 break;
1952
1953 case GF_MULT_LOG_TABLE:
1954 return sizeof(gf_internal_t) + sizeof(struct gf_logtable_data) + 64;
1955 break;
1956 case GF_MULT_CARRY_FREE:
1957 return sizeof(gf_internal_t);
1958 break;
1959 case GF_MULT_SHIFT:
1960 return sizeof(gf_internal_t);
1961 break;
1962 default:
1963 return 0;
1964 }
1965 return 0;
1966 }
1967
1968 int
1969 gf_w4_init (gf_t *gf)
1970 {
1971 gf_internal_t *h;
1972
1973 h = (gf_internal_t *) gf->scratch;
1974 if (h->prim_poly == 0) h->prim_poly = 0x13;
1975 h->prim_poly |= 0x10;
1976 SET_FUNCTION(gf,multiply,w32,NULL)
1977 SET_FUNCTION(gf,divide,w32,NULL)
1978 SET_FUNCTION(gf,inverse,w32,NULL)
1979 SET_FUNCTION(gf,multiply_region,w32,NULL)
1980 SET_FUNCTION(gf,extract_word,w32,gf_w4_extract_word)
1981
1982 switch(h->mult_type) {
1983 case GF_MULT_CARRY_FREE: if (gf_w4_cfm_init(gf) == 0) return 0; break;
1984 case GF_MULT_SHIFT: if (gf_w4_shift_init(gf) == 0) return 0; break;
1985 case GF_MULT_BYTWO_p:
1986 case GF_MULT_BYTWO_b: if (gf_w4_bytwo_init(gf) == 0) return 0; break;
1987 case GF_MULT_LOG_TABLE: if (gf_w4_log_init(gf) == 0) return 0; break;
1988 case GF_MULT_DEFAULT:
1989 case GF_MULT_TABLE: if (gf_w4_table_init(gf) == 0) return 0; break;
1990 default: return 0;
1991 }
1992
1993 if (h->divide_type == GF_DIVIDE_EUCLID) {
1994 SET_FUNCTION(gf,divide,w32,gf_w4_divide_from_inverse)
1995 SET_FUNCTION(gf,inverse,w32,gf_w4_euclid)
1996 } else if (h->divide_type == GF_DIVIDE_MATRIX) {
1997 SET_FUNCTION(gf,divide,w32,gf_w4_divide_from_inverse)
1998 SET_FUNCTION(gf,inverse,w32,gf_w4_matrix)
1999 }
2000
2001 if (gf->divide.w32 == NULL) {
2002 SET_FUNCTION(gf,divide,w32,gf_w4_divide_from_inverse)
2003 if (gf->inverse.w32 == NULL) SET_FUNCTION(gf,inverse,w32,gf_w4_euclid)
2004 }
2005
2006 if (gf->inverse.w32 == NULL) SET_FUNCTION(gf,inverse,w32,gf_w4_inverse_from_divide)
2007
2008 if (h->region_type == GF_REGION_CAUCHY) {
2009 SET_FUNCTION(gf,multiply_region,w32,gf_wgen_cauchy_region)
2010 SET_FUNCTION(gf,extract_word,w32,gf_wgen_extract_word)
2011 }
2012
2013 if (gf->multiply_region.w32 == NULL) {
2014 SET_FUNCTION(gf,multiply_region,w32,gf_w4_multiply_region_from_single)
2015 }
2016
2017 return 1;
2018 }
2019
2020 /* Inline setup functions */
2021
2022 uint8_t *gf_w4_get_mult_table(gf_t *gf)
2023 {
2024 gf_internal_t *h;
2025 struct gf_single_table_data *std;
2026
2027 h = (gf_internal_t *) gf->scratch;
2028 if (gf->multiply.w32 == gf_w4_single_table_multiply) {
2029 std = (struct gf_single_table_data *) h->private;
2030 return (uint8_t *) std->mult;
2031 }
2032 return NULL;
2033 }
2034
2035 uint8_t *gf_w4_get_div_table(gf_t *gf)
2036 {
2037 gf_internal_t *h;
2038 struct gf_single_table_data *std;
2039
2040 h = (gf_internal_t *) gf->scratch;
2041 if (gf->multiply.w32 == gf_w4_single_table_multiply) {
2042 std = (struct gf_single_table_data *) h->private;
2043 return (uint8_t *) std->div;
2044 }
2045 return NULL;
2046 }
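/* A minimal usage sketch (assuming the usual gf_init_easy() entry point
   declared in gf_complete.h).  The table pointer is non-NULL only when the
   single-table implementation was selected:

     gf_t gf;
     if (gf_init_easy(&gf, 4)) {
       uint8_t p   = gf.multiply.w32(&gf, 5, 7);      // single multiply
       uint8_t *tb = gf_w4_get_mult_table(&gf);       // 16x16 table, row-major
       if (tb != NULL) p = tb[(5 << 4) | 7];          // same product via the table
     }
*/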
2047
2048