/*
 * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
 * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
 * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
 *
 * gf_w4.c
 *
 * Routines for 4-bit Galois fields
 */

#include "gf_int.h"
#include <stdio.h>
#include <stdlib.h>
#include "gf_w4.h"
#include "gf_cpu.h"

#define AB2(ip, am1, am2, b, t1, t2) {\
  t1 = (b << 1) & am1;\
  t2 = b & am2; \
  t2 = ((t2 << 1) - (t2 >> (GF_FIELD_WIDTH-1))); \
  b = (t1 ^ (t2 & ip));}
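
/*
 * A minimal sketch of what AB2 computes, assuming GF_FIELD_WIDTH == 4 and
 * the replicated constants built in gf_w4_bytwo_init below (for prim_poly
 * 0x13: ip = 0x33...3, am1 = 0xee...e, am2 = 0x88...8): every 4-bit nibble
 * packed in b is multiplied by 2 (i.e., by x) with no carries crossing
 * nibble boundaries.  For a single nibble b = 0x9:
 *
 *   t1 = (0x9 << 1) & 0xe = 0x2    (shift, dropping the bit that leaves
 *                                   the nibble)
 *   t2 = 0x9 & 0x8 = 0x8; (0x8 << 1) - (0x8 >> 3) = 0xf   (spread the
 *        nibble's high bit across the whole nibble)
 *   b  = 0x2 ^ (0xf & 0x3) = 0x1   -- i.e., 2 * 9 == 1 in GF(16)
 */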

// ToDo(KMG/JSP): Why is 0x88 hard-coded?
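// A plausible answer: 0x88 is the per-byte value of btd->mask2 (the high
// bit of each 4-bit nibble, as built in gf_w4_bytwo_init), hard-coded here
// because SSE_AB2 takes no m2 parameter.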
#define SSE_AB2(pp, m1, va, t1, t2) {\
          t1 = _mm_and_si128(_mm_slli_epi64(va, 1), m1); \
          t2 = _mm_and_si128(va, _mm_set1_epi8(0x88)); \
          t2 = _mm_sub_epi64 (_mm_slli_epi64(t2, 1), _mm_srli_epi64(t2, (GF_FIELD_WIDTH-1))); \
          va = _mm_xor_si128(t1, _mm_and_si128(t2, pp)); }

/* ------------------------------------------------------------
   JSP: These are basic routines that are shared by multiple implementations.
 */

static
inline
gf_val_32_t gf_w4_inverse_from_divide (gf_t *gf, gf_val_32_t a)
{
  return gf->divide.w32(gf, 1, a);
}

static
inline
gf_val_32_t gf_w4_divide_from_inverse (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
  b = gf->inverse.w32(gf, b);
  return gf->multiply.w32(gf, a, b);
}

static
inline
gf_val_32_t gf_w4_euclid (gf_t *gf, gf_val_32_t b)
{
  gf_val_32_t e_i, e_im1, e_ip1;
  gf_val_32_t d_i, d_im1, d_ip1;
  gf_val_32_t y_i, y_im1, y_ip1;
  gf_val_32_t c_i;

  if (b == 0) return -1;
  e_im1 = ((gf_internal_t *) (gf->scratch))->prim_poly;
  e_i = b;
  d_im1 = 4;
  for (d_i = d_im1; ((1 << d_i) & e_i) == 0; d_i--) ;
  y_i = 1;
  y_im1 = 0;

  while (e_i != 1) {
    e_ip1 = e_im1;
    d_ip1 = d_im1;
    c_i = 0;

    while (d_ip1 >= d_i) {
      c_i ^= (1 << (d_ip1 - d_i));
      e_ip1 ^= (e_i << (d_ip1 - d_i));
      if (e_ip1 == 0) return 0;
      while ((e_ip1 & (1 << d_ip1)) == 0) d_ip1--;
    }

    y_ip1 = y_im1 ^ gf->multiply.w32(gf, c_i, y_i);
    y_im1 = y_i;
    y_i = y_ip1;

    e_im1 = e_i;
    d_im1 = d_i;
    e_i = e_ip1;
    d_i = d_ip1;
  }

  return y_i;
}
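
/*
 * A worked example, assuming prim_poly 0x13 (x^4 + x + 1):
 * gf_w4_euclid(gf, 2) runs the extended Euclidean algorithm on x and
 * x^4 + x + 1 and returns 9, since 2 * 9 = x^4 + x and
 * (x^4 + x) ^ (x^4 + x + 1) = 1 in GF(16).
 */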

static
gf_val_32_t gf_w4_extract_word(gf_t *gf, void *start, int bytes, int index)
{
  uint8_t *r8, v;

  r8 = (uint8_t *) start;
  v = r8[index/2];
  if (index%2) {
    return v >> 4;
  } else {
    return v&0xf;
  }
}


static
inline
gf_val_32_t gf_w4_matrix (gf_t *gf, gf_val_32_t b)
{
  return gf_bitmatrix_inverse(b, 4, ((gf_internal_t *) (gf->scratch))->prim_poly);
}


static
inline
gf_val_32_t
gf_w4_shift_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
  uint8_t product, i, pp;
  gf_internal_t *h;

  h = (gf_internal_t *) gf->scratch;
  pp = h->prim_poly;

  product = 0;

  for (i = 0; i < GF_FIELD_WIDTH; i++) {
    if (a & (1 << i)) product ^= (b << i);
  }
  for (i = (GF_FIELD_WIDTH*2-2); i >= GF_FIELD_WIDTH; i--) {
    if (product & (1 << i)) product ^= (pp << (i-GF_FIELD_WIDTH));
  }
  return product;
}
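
/*
 * A worked example under poly 0x13: gf_w4_shift_multiply(gf, 7, 5) forms
 * the carryless product 0b111 * 0b101 = 0b11011, then clears bit 4 with
 * one XOR of the polynomial: 0b11011 ^ 0b10011 = 0b01000.  So 7 * 5 == 8
 * in GF(16).
 */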

/* Ben: This function works, but it is 33% slower than the normal shift mult */

#if defined(INTEL_SSE4_PCLMUL)
static
inline
gf_val_32_t
gf_w4_clm_multiply (gf_t *gf, gf_val_32_t a4, gf_val_32_t b4)
{
  gf_val_32_t rv = 0;

  __m128i         a, b;
  __m128i         result;
  __m128i         prim_poly;
  __m128i         w;
  gf_internal_t * h = gf->scratch;

  a = _mm_insert_epi32 (_mm_setzero_si128(), a4, 0);
  b = _mm_insert_epi32 (a, b4, 0);

  prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1fULL));

  /* Do the initial multiply */

  result = _mm_clmulepi64_si128 (a, b, 0);
  /* Ben/JSP: Do the prim_poly reduction once.  We are guaranteed that one
     reduction suffices, because (w-2)/z == 1, where z is the number of
     zeros after the leading 1.

     _mm_clmulepi64_si128 is the carryless multiply operation.  Here
     _mm_srli_epi64 shifts the result to the right by 4 bits.  This allows
     us to multiply the prim_poly by the leading bits of the result.  We
     then xor the result of that operation back with the result. */

  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_epi64 (result, 4), 0);
  result = _mm_xor_si128 (result, w);

  /* Extracts 32 bit value from result. */

  rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
  return rv;
}
#endif

static
void
gf_w4_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
  gf_region_data rd;
  uint8_t *s8;
  uint8_t *d8;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1);
  gf_do_initial_region_alignment(&rd);

  s8 = (uint8_t *) rd.s_start;
  d8 = (uint8_t *) rd.d_start;

  if (xor) {
    while (d8 < ((uint8_t *) rd.d_top)) {
      *d8 ^= (gf->multiply.w32(gf, val, (*s8 & 0xf)) |
             ((gf->multiply.w32(gf, val, (*s8 >> 4))) << 4));
      d8++;
      s8++;
    }
  } else {
    while (d8 < ((uint8_t *) rd.d_top)) {
      *d8 = (gf->multiply.w32(gf, val, (*s8 & 0xf)) |
             ((gf->multiply.w32(gf, val, (*s8 >> 4))) << 4));
      d8++;
      s8++;
    }
  }
  gf_do_final_region_alignment(&rd);
}

/* ------------------------------------------------------------
  IMPLEMENTATION: LOG_TABLE:

  JSP: This is a basic log-antilog implementation.
       I'm not going to spend any time optimizing it because the
       other techniques are faster for both single and region
       operations.
 */
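
/*
 * The tables encode a*b == antilog[log a + log b].  The antilog table is
 * stored twice back to back (see gf_w4_log_init), so a sum of two logs,
 * each at most 14, indexes it directly with no mod 15.  E.g. under poly
 * 0x13: log 7 == 10, log 5 == 8, and antilog[18] == antilog[3] == 8.
 */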

static
inline
gf_val_32_t
gf_w4_log_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
  struct gf_logtable_data *ltd;

  ltd = (struct gf_logtable_data *) ((gf_internal_t *) (gf->scratch))->private;
  return (a == 0 || b == 0) ? 0 : ltd->antilog_tbl[(unsigned)(ltd->log_tbl[a] + ltd->log_tbl[b])];
}

static
inline
gf_val_32_t
gf_w4_log_divide (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
  int log_sum = 0;
  struct gf_logtable_data *ltd;

  if (a == 0 || b == 0) return 0;
  ltd = (struct gf_logtable_data *) ((gf_internal_t *) (gf->scratch))->private;

  log_sum = ltd->log_tbl[a] - ltd->log_tbl[b];
  return (ltd->antilog_tbl_div[log_sum]);
}
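
/*
 * Division is a log difference: antilog_tbl_div points GF_FIELD_SIZE-1
 * entries into the doubled antilog table (see gf_w4_log_init), so a
 * log_sum anywhere in [-14, 14] indexes it without wrapping.
 */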

static
void
gf_w4_log_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
  int i;
  uint8_t lv, b, c;
  uint8_t *s8, *d8;

  struct gf_logtable_data *ltd;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  ltd = (struct gf_logtable_data *) ((gf_internal_t *) (gf->scratch))->private;
  s8 = (uint8_t *) src;
  d8 = (uint8_t *) dest;

  lv = ltd->log_tbl[val];

  for (i = 0; i < bytes; i++) {
    c = (xor) ? d8[i] : 0;
    b = (s8[i] >> GF_FIELD_WIDTH);
    c ^= (b == 0) ? 0 : (ltd->antilog_tbl[lv + ltd->log_tbl[b]] << GF_FIELD_WIDTH);
    b = (s8[i] & 0xf);
    c ^= (b == 0) ? 0 : ltd->antilog_tbl[lv + ltd->log_tbl[b]];
    d8[i] = c;
  }
}

static
int gf_w4_log_init(gf_t *gf)
{
  gf_internal_t *h;
  struct gf_logtable_data *ltd;
  int i, b;

  h = (gf_internal_t *) gf->scratch;
  ltd = h->private;

  for (i = 0; i < GF_FIELD_SIZE; i++)
    ltd->log_tbl[i]=0;

  ltd->antilog_tbl_div = ltd->antilog_tbl + (GF_FIELD_SIZE-1);
  b = 1;
  i = 0;
  do {
    if (ltd->log_tbl[b] != 0 && i != 0) {
      fprintf(stderr, "Cannot construct log table: Polynomial is not primitive.\n\n");
      return 0;
    }
    ltd->log_tbl[b] = i;
    ltd->antilog_tbl[i] = b;
    ltd->antilog_tbl[i+GF_FIELD_SIZE-1] = b;
    b <<= 1;
    i++;
    if (b & GF_FIELD_SIZE) b = b ^ h->prim_poly;
  } while (b != 1);

  if (i != GF_FIELD_SIZE - 1) {
    _gf_errno = GF_E_LOGPOLY;
    return 0;
  }

  SET_FUNCTION(gf,inverse,w32,gf_w4_inverse_from_divide)
  SET_FUNCTION(gf,divide,w32,gf_w4_log_divide)
  SET_FUNCTION(gf,multiply,w32,gf_w4_log_multiply)
  SET_FUNCTION(gf,multiply_region,w32,gf_w4_log_multiply_region)
  return 1;
}

/* ------------------------------------------------------------
  IMPLEMENTATION: SINGLE TABLE: JSP.
 */
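
/*
 * Both 16x16 tables are filled once from gf_w4_shift_multiply in
 * gf_w4_single_table_init below; a multiply or divide is then a single
 * lookup, e.g. std->mult[7][5] == 8 under poly 0x13.
 */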

static
inline
gf_val_32_t
gf_w4_single_table_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
  struct gf_single_table_data *std;

  std = (struct gf_single_table_data *) ((gf_internal_t *) (gf->scratch))->private;
  return std->mult[a][b];
}

static
inline
gf_val_32_t
gf_w4_single_table_divide (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
  struct gf_single_table_data *std;

  std = (struct gf_single_table_data *) ((gf_internal_t *) (gf->scratch))->private;
  return std->div[a][b];
}

static
void
gf_w4_single_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
  int i;
  uint8_t b, c;
  uint8_t *s8, *d8;

  struct gf_single_table_data *std;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  std = (struct gf_single_table_data *) ((gf_internal_t *) (gf->scratch))->private;
  s8 = (uint8_t *) src;
  d8 = (uint8_t *) dest;

  for (i = 0; i < bytes; i++) {
    c = (xor) ? d8[i] : 0;
    b = (s8[i] >> GF_FIELD_WIDTH);
    c ^= (std->mult[val][b] << GF_FIELD_WIDTH);
    b = (s8[i] & 0xf);
    c ^= (std->mult[val][b]);
    d8[i] = c;
  }
}

#define MM_PRINT(s, r) { uint8_t blah[16]; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (i = 0; i < 16; i++) printf(" %02x", blah[i]); printf("\n"); }

#ifdef INTEL_SSSE3
static
void
gf_w4_single_table_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
  gf_region_data rd;
  uint8_t *base, *sptr, *dptr, *top;
  __m128i  tl, loset, r, va, th;

  struct gf_single_table_data *std;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);

  std = (struct gf_single_table_data *) ((gf_internal_t *) (gf->scratch))->private;
  base = (uint8_t *) std->mult;
  base += (val << GF_FIELD_WIDTH);

  gf_do_initial_region_alignment(&rd);

  tl = _mm_loadu_si128((__m128i *)base);
  th = _mm_slli_epi64(tl, 4);
  loset = _mm_set1_epi8 (0x0f);

  sptr = rd.s_start;
  dptr = rd.d_start;
  top = rd.s_top;

  while (sptr < (uint8_t *) top) {
    va = _mm_load_si128 ((__m128i *)(sptr));
    r = _mm_and_si128 (loset, va);
    r = _mm_shuffle_epi8 (tl, r);
    va = _mm_srli_epi64 (va, 4);
    va = _mm_and_si128 (loset, va);
    va = _mm_shuffle_epi8 (th, va);
    r = _mm_xor_si128 (r, va);
    va = (xor) ? _mm_load_si128 ((__m128i *)(dptr)) : _mm_setzero_si128();
    r = _mm_xor_si128 (r, va);
    _mm_store_si128 ((__m128i *)(dptr), r);
    dptr += 16;
    sptr += 16;
  }
  gf_do_final_region_alignment(&rd);

}
#endif
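
/*
 * The SSSE3 kernel above is a 16-way parallel table lookup: tl holds the
 * 16-entry row mult[val][0..15] and _mm_shuffle_epi8 indexes it with each
 * low nibble of va.  Because every product fits in 4 bits, th = tl << 4 is
 * the same row pre-shifted into the high nibble, so one shuffle per nibble
 * plus an XOR assembles both halves of each output byte.
 */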

static
int gf_w4_single_table_init(gf_t *gf)
{
  gf_internal_t *h;
  struct gf_single_table_data *std;
  int a, b, prod;


  h = (gf_internal_t *) gf->scratch;
  std = (struct gf_single_table_data *)h->private;

  bzero(std->mult, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
  bzero(std->div, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);

  for (a = 1; a < GF_FIELD_SIZE; a++) {
    for (b = 1; b < GF_FIELD_SIZE; b++) {
      prod = gf_w4_shift_multiply(gf, a, b);
      std->mult[a][b] = prod;
      std->div[prod][b] = a;
    }
  }

  SET_FUNCTION(gf,inverse,w32,NULL)
  SET_FUNCTION(gf,divide,w32,gf_w4_single_table_divide)
  SET_FUNCTION(gf,multiply,w32,gf_w4_single_table_multiply)
  #if defined(INTEL_SSSE3)
    if (gf_cpu_supports_intel_ssse3 && !(h->region_type & (GF_REGION_NOSIMD | GF_REGION_CAUCHY))) {
      SET_FUNCTION(gf,multiply_region,w32,gf_w4_single_table_sse_multiply_region)
    } else {
  #elif defined(ARM_NEON)
    if (gf_cpu_supports_arm_neon && !(h->region_type & (GF_REGION_NOSIMD | GF_REGION_CAUCHY))) {
      gf_w4_neon_single_table_init(gf);
    } else {
  #endif
      SET_FUNCTION(gf,multiply_region,w32,gf_w4_single_table_multiply_region)
      if (h->region_type & GF_REGION_SIMD) return 0;
  #if defined(INTEL_SSSE3) || defined(ARM_NEON)
    }
  #endif

  return 1;
}

/* ------------------------------------------------------------
  IMPLEMENTATION: DOUBLE TABLE: JSP.
 */

static
inline
gf_val_32_t
gf_w4_double_table_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
  struct gf_double_table_data *std;

  std = (struct gf_double_table_data *) ((gf_internal_t *) (gf->scratch))->private;
  return std->mult[a][b];
}

static
inline
gf_val_32_t
gf_w4_double_table_divide (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
  struct gf_double_table_data *std;

  std = (struct gf_double_table_data *) ((gf_internal_t *) (gf->scratch))->private;
  return std->div[a][b];
}

static
void
gf_w4_double_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
  int i;
  uint8_t *s8, *d8, *base;
  gf_region_data rd;
  struct gf_double_table_data *std;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);

  std = (struct gf_double_table_data *) ((gf_internal_t *) (gf->scratch))->private;
  s8 = (uint8_t *) src;
  d8 = (uint8_t *) dest;
  base = (uint8_t *) std->mult;
  base += (val << GF_DOUBLE_WIDTH);

  if (xor) {
    for (i = 0; i < bytes; i++) d8[i] ^= base[s8[i]];
  } else {
    for (i = 0; i < bytes; i++) d8[i] = base[s8[i]];
  }
}

static
int gf_w4_double_table_init(gf_t *gf)
{
  gf_internal_t *h;
  struct gf_double_table_data *std;
  int a, b, c, prod, ab;
  uint8_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE];

  h = (gf_internal_t *) gf->scratch;
  std = (struct gf_double_table_data *)h->private;

  bzero(mult, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
  bzero(std->div, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);

  for (a = 1; a < GF_FIELD_SIZE; a++) {
    for (b = 1; b < GF_FIELD_SIZE; b++) {
      prod = gf_w4_shift_multiply(gf, a, b);
      mult[a][b] = prod;
      std->div[prod][b] = a;
    }
  }
  bzero(std->mult, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE * GF_FIELD_SIZE);
  for (a = 0; a < GF_FIELD_SIZE; a++) {
    for (b = 0; b < GF_FIELD_SIZE; b++) {
      ab = mult[a][b];
      for (c = 0; c < GF_FIELD_SIZE; c++) {
        std->mult[a][(b << 4) | c] = ((ab << 4) | mult[a][c]);
      }
    }
  }

  SET_FUNCTION(gf,inverse,w32,NULL)
  SET_FUNCTION(gf,divide,w32,gf_w4_double_table_divide)
  SET_FUNCTION(gf,multiply,w32,gf_w4_double_table_multiply)
  SET_FUNCTION(gf,multiply_region,w32,gf_w4_double_table_multiply_region)
  return 1;
}
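
/*
 * Sketch of the double-table layout: row val of std->mult maps a whole
 * source byte (two nibbles) to its product byte in one lookup, e.g.
 * std->mult[7][0x53] == 0x89 under poly 0x13, since 7*5 == 8 and 7*3 == 9.
 */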

static
inline
gf_val_32_t
gf_w4_quad_table_lazy_divide (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
  struct gf_quad_table_lazy_data *std;

  std = (struct gf_quad_table_lazy_data *) ((gf_internal_t *) (gf->scratch))->private;
  return std->div[a][b];
}

static
inline
gf_val_32_t
gf_w4_quad_table_lazy_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
  struct gf_quad_table_lazy_data *std;

  std = (struct gf_quad_table_lazy_data *) ((gf_internal_t *) (gf->scratch))->private;
  return std->smult[a][b];
}

static
inline
gf_val_32_t
gf_w4_quad_table_divide (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
  struct gf_quad_table_data *std;

  std = (struct gf_quad_table_data *) ((gf_internal_t *) (gf->scratch))->private;
  return std->div[a][b];
}

static
inline
gf_val_32_t
gf_w4_quad_table_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
  struct gf_quad_table_data *std;
  uint16_t v;

  std = (struct gf_quad_table_data *) ((gf_internal_t *) (gf->scratch))->private;
  v = std->mult[a][b];
  return v;
}

static
void
gf_w4_quad_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
  uint16_t *base;
  gf_region_data rd;
  struct gf_quad_table_data *std;
  struct gf_quad_table_lazy_data *ltd;
  gf_internal_t *h;
  int a, b, c, d, va, vb, vc, vd;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  h = (gf_internal_t *) (gf->scratch);
  if (h->region_type & GF_REGION_LAZY) {
    ltd = (struct gf_quad_table_lazy_data *) ((gf_internal_t *) (gf->scratch))->private;
    base = ltd->mult;
    for (a = 0; a < 16; a++) {
      va = (ltd->smult[val][a] << 12);
      for (b = 0; b < 16; b++) {
        vb = (ltd->smult[val][b] << 8);
        for (c = 0; c < 16; c++) {
          vc = (ltd->smult[val][c] << 4);
          for (d = 0; d < 16; d++) {
            vd = ltd->smult[val][d];
            base[(a << 12) | (b << 8) | (c << 4) | d ] = (va | vb | vc | vd);
          }
        }
      }
    }
  } else {
    std = (struct gf_quad_table_data *) ((gf_internal_t *) (gf->scratch))->private;
    base = &(std->mult[val][0]);
  }

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);
  gf_do_initial_region_alignment(&rd);
  gf_two_byte_region_table_multiply(&rd, base);
  gf_do_final_region_alignment(&rd);
}

static
int gf_w4_quad_table_init(gf_t *gf)
{
  gf_internal_t *h;
  struct gf_quad_table_data *std;
  int prod, val, a, b, c, d, va, vb, vc, vd;
  uint8_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE];

  h = (gf_internal_t *) gf->scratch;
  std = (struct gf_quad_table_data *)h->private;

  bzero(mult, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
  bzero(std->div, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);

  for (a = 1; a < GF_FIELD_SIZE; a++) {
    for (b = 1; b < GF_FIELD_SIZE; b++) {
      prod = gf_w4_shift_multiply(gf, a, b);
      mult[a][b] = prod;
      std->div[prod][b] = a;
    }
  }

  for (val = 0; val < 16; val++) {
    for (a = 0; a < 16; a++) {
      va = (mult[val][a] << 12);
      for (b = 0; b < 16; b++) {
        vb = (mult[val][b] << 8);
        for (c = 0; c < 16; c++) {
          vc = (mult[val][c] << 4);
          for (d = 0; d < 16; d++) {
            vd = mult[val][d];
            std->mult[val][(a << 12) | (b << 8) | (c << 4) | d ] = (va | vb | vc | vd);
          }
        }
      }
    }
  }

  SET_FUNCTION(gf,inverse,w32,NULL)
  SET_FUNCTION(gf,divide,w32,gf_w4_quad_table_divide)
  SET_FUNCTION(gf,multiply,w32,gf_w4_quad_table_multiply)
  SET_FUNCTION(gf,multiply_region,w32,gf_w4_quad_table_multiply_region)
  return 1;
}
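
/*
 * Sketch of the quad-table layout: row val of std->mult maps a 16-bit
 * halfword (four nibbles) to its product in one lookup, e.g.
 * std->mult[7][0x0055] == 0x0088 under poly 0x13.  The lazy variant keeps
 * only the 16x16 smult table and builds a 64K-entry row on demand inside
 * gf_w4_quad_table_multiply_region.
 */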
static
int gf_w4_quad_table_lazy_init(gf_t *gf)
{
  gf_internal_t *h;
  struct gf_quad_table_lazy_data *std;
  int a, b, prod, loga, logb;
  uint8_t log_tbl[GF_FIELD_SIZE];
  uint8_t antilog_tbl[GF_FIELD_SIZE*2];

  h = (gf_internal_t *) gf->scratch;
  std = (struct gf_quad_table_lazy_data *)h->private;

  b = 1;
  for (a = 0; a < GF_MULT_GROUP_SIZE; a++) {
      log_tbl[b] = a;
      antilog_tbl[a] = b;
      antilog_tbl[a+GF_MULT_GROUP_SIZE] = b;
      b <<= 1;
      if (b & GF_FIELD_SIZE) {
          b = b ^ h->prim_poly;
      }
  }

  bzero(std->smult, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
  bzero(std->div, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);

  for (a = 1; a < GF_FIELD_SIZE; a++) {
    loga = log_tbl[a];
    for (b = 1; b < GF_FIELD_SIZE; b++) {
      logb = log_tbl[b];
      prod = antilog_tbl[loga+logb];
      std->smult[a][b] = prod;
      std->div[prod][b] = a;
    }
  }

  SET_FUNCTION(gf,inverse,w32,NULL)
  SET_FUNCTION(gf,divide,w32,gf_w4_quad_table_lazy_divide)
  SET_FUNCTION(gf,multiply,w32,gf_w4_quad_table_lazy_multiply)
  SET_FUNCTION(gf,multiply_region,w32,gf_w4_quad_table_multiply_region)
  return 1;
}

static
int gf_w4_table_init(gf_t *gf)
{
  int rt;
  gf_internal_t *h;

  h = (gf_internal_t *) gf->scratch;
  rt = (h->region_type);

  if (h->mult_type == GF_MULT_DEFAULT &&
    !(gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon))
      rt |= GF_REGION_DOUBLE_TABLE;

  if (rt & GF_REGION_DOUBLE_TABLE) {
    return gf_w4_double_table_init(gf);
  } else if (rt & GF_REGION_QUAD_TABLE) {
    if (rt & GF_REGION_LAZY) {
      return gf_w4_quad_table_lazy_init(gf);
    } else {
      return gf_w4_quad_table_init(gf);
    }
  } else {
    return gf_w4_single_table_init(gf);
  }
  return 0;
}

/* ------------------------------------------------------------
   JSP: GF_MULT_BYTWO_p and _b: See the paper.
*/

static
inline
gf_val_32_t
gf_w4_bytwo_p_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
  uint32_t prod, pp, pmask, amask;
  gf_internal_t *h;

  h = (gf_internal_t *) gf->scratch;
  pp = h->prim_poly;

  prod = 0;
  pmask = 0x8;
  amask = 0x8;

  while (amask != 0) {
    if (prod & pmask) {
      prod = ((prod << 1) ^ pp);
    } else {
      prod <<= 1;
    }
    if (a & amask) prod ^= b;
    amask >>= 1;
  }
  return prod;
}
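
/*
 * bytwo_p consumes the multiplier's bits from high to low (Horner's rule):
 * double the product, then conditionally add b.  E.g. 7 * 5 under poly
 * 0x13: prod = 0, then 5, then (5*2)^5 = 15, then ((15*2)^0x13)^5 =
 * 13^5 = 8.
 */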

static
inline
gf_val_32_t
gf_w4_bytwo_b_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
  uint32_t prod, pp, bmask;
  gf_internal_t *h;

  h = (gf_internal_t *) gf->scratch;
  pp = h->prim_poly;

  prod = 0;
  bmask = 0x8;

  while (1) {
    if (a & 1) prod ^= b;
    a >>= 1;
    if (a == 0) return prod;
    if (b & bmask) {
      b = ((b << 1) ^ pp);
    } else {
      b <<= 1;
    }
  }
}
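
/*
 * bytwo_b walks the multiplier's bits from low to high, doubling b rather
 * than the product.  Same 7 * 5 example: prod = 5, b -> 10; prod = 15,
 * b -> (10*2)^0x13 = 7; prod = 15 ^ 7 = 8.
 */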

static
void
gf_w4_bytwo_p_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
  uint64_t *s64, *d64, t1, t2, ta, prod, amask;
  gf_region_data rd;
  struct gf_bytwo_data *btd;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  btd = (struct gf_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);
  gf_do_initial_region_alignment(&rd);

  s64 = (uint64_t *) rd.s_start;
  d64 = (uint64_t *) rd.d_start;

  if (xor) {
    while (s64 < (uint64_t *) rd.s_top) {
      prod = 0;
      amask = 0x8;
      ta = *s64;
      while (amask != 0) {
        AB2(btd->prim_poly, btd->mask1, btd->mask2, prod, t1, t2);
        if (val & amask) prod ^= ta;
        amask >>= 1;
      }
      *d64 ^= prod;
      d64++;
      s64++;
    }
  } else {
    while (s64 < (uint64_t *) rd.s_top) {
      prod = 0;
      amask = 0x8;
      ta = *s64;
      while (amask != 0) {
        AB2(btd->prim_poly, btd->mask1, btd->mask2, prod, t1, t2);
        if (val & amask) prod ^= ta;
        amask >>= 1;
      }
      *d64 = prod;
      d64++;
      s64++;
    }
  }
  gf_do_final_region_alignment(&rd);
}

#define BYTWO_P_ONESTEP {\
      SSE_AB2(pp, m1, prod, t1, t2); \
      t1 = _mm_and_si128(v, one); \
      t1 = _mm_sub_epi8(t1, one); \
      t1 = _mm_and_si128(t1, ta); \
      prod = _mm_xor_si128(prod, t1); \
      v = _mm_srli_epi64(v, 1); }
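
/*
 * BYTWO_P_ONESTEP consumes one multiplier bit per invocation: SSE_AB2
 * doubles every nibble of prod, then ta is conditionally XORed in.  The
 * condition is branch-free: v holds the multiplier's bits reversed and
 * complemented (see vrev below), so (v & 1) - 1 is 0xff per byte exactly
 * when the original bit was 1.
 */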

#ifdef INTEL_SSE2
static
void
gf_w4_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
  int i;
  uint8_t *s8, *d8;
  uint8_t vrev;
  __m128i pp, m1, ta, prod, t1, t2, tp, one, v;
  struct gf_bytwo_data *btd;
  gf_region_data rd;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  btd = (struct gf_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
  gf_do_initial_region_alignment(&rd);

  vrev = 0;
  for (i = 0; i < 4; i++) {
    vrev <<= 1;
    if (!(val & (1 << i))) vrev |= 1;
  }

  s8 = (uint8_t *) rd.s_start;
  d8 = (uint8_t *) rd.d_start;

  pp = _mm_set1_epi8(btd->prim_poly&0xff);
  m1 = _mm_set1_epi8((btd->mask1)&0xff);
  one = _mm_set1_epi8(1);

  while (d8 < (uint8_t *) rd.d_top) {
    prod = _mm_setzero_si128();
    v = _mm_set1_epi8(vrev);
    ta = _mm_load_si128((__m128i *) s8);
    tp = (!xor) ? _mm_setzero_si128() : _mm_load_si128((__m128i *) d8);
    BYTWO_P_ONESTEP;
    BYTWO_P_ONESTEP;
    BYTWO_P_ONESTEP;
    BYTWO_P_ONESTEP;
    _mm_store_si128((__m128i *) d8, _mm_xor_si128(prod, tp));
    d8 += 16;
    s8 += 16;
  }
  gf_do_final_region_alignment(&rd);
}
#endif

/*
#ifdef INTEL_SSE2
static
void
gf_w4_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
  uint8_t *d8, *s8, tb;
  __m128i pp, m1, m2, t1, t2, va, vb;
  struct gf_bytwo_data *btd;
  gf_region_data rd;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
  gf_do_initial_region_alignment(&rd);

  s8 = (uint8_t *) rd.s_start;
  d8 = (uint8_t *) rd.d_start;

  btd = (struct gf_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;

  pp = _mm_set1_epi8(btd->prim_poly&0xff);
  m1 = _mm_set1_epi8((btd->mask1)&0xff);
  m2 = _mm_set1_epi8((btd->mask2)&0xff);

  if (xor) {
    while (d8 < (uint8_t *) rd.d_top) {
      va = _mm_load_si128 ((__m128i *)(s8));
      vb = _mm_load_si128 ((__m128i *)(d8));
      tb = val;
      while (1) {
        if (tb & 1) vb = _mm_xor_si128(vb, va);
        tb >>= 1;
        if (tb == 0) break;
        SSE_AB2(pp, m1, m2, va, t1, t2);
      }
      _mm_store_si128((__m128i *)d8, vb);
      d8 += 16;
      s8 += 16;
    }
  } else {
    while (d8 < (uint8_t *) rd.d_top) {
      va = _mm_load_si128 ((__m128i *)(s8));
      vb = _mm_setzero_si128 ();
      tb = val;
      while (1) {
        if (tb & 1) vb = _mm_xor_si128(vb, va);
        tb >>= 1;
        if (tb == 0) break;
        t1 = _mm_and_si128(_mm_slli_epi64(va, 1), m1);
        t2 = _mm_and_si128(va, m2);
        t2 = _mm_sub_epi64 (
          _mm_slli_epi64(t2, 1), _mm_srli_epi64(t2, (GF_FIELD_WIDTH-1)));
        va = _mm_xor_si128(t1, _mm_and_si128(t2, pp));
      }
      _mm_store_si128((__m128i *)d8, vb);
      d8 += 16;
      s8 += 16;
    }
  }
  gf_do_final_region_alignment(&rd);
}
#endif
*/

#ifdef INTEL_SSE2
static
void
gf_w4_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_bytwo_data *btd)
{
  uint8_t *d8, *s8;
  __m128i pp, m1, t1, t2, va;

  s8 = (uint8_t *) rd->s_start;
  d8 = (uint8_t *) rd->d_start;

  pp = _mm_set1_epi8(btd->prim_poly&0xff);
  m1 = _mm_set1_epi8((btd->mask1)&0xff);

  while (d8 < (uint8_t *) rd->d_top) {
    va = _mm_load_si128 ((__m128i *)(s8));
    SSE_AB2(pp, m1, va, t1, t2);
    _mm_store_si128((__m128i *)d8, va);
    d8 += 16;
    s8 += 16;
  }
}
#endif
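
/*
 * The _region_N helpers above and below unroll
 * gf_w4_bytwo_b_sse_multiply_region for one fixed multiplier each: every
 * SSE_AB2 doubles the 32 nibbles in va, and snapshots of the intermediate
 * doublings are XORed together to build the multiplier's binary expansion
 * (e.g. region_3 stores (2*x) ^ x and region_6 stores (4*x) ^ (2*x) per
 * 16-byte vector).
 */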

#ifdef INTEL_SSE2
static
void
gf_w4_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_bytwo_data *btd)
{
  uint8_t *d8, *s8;
  __m128i pp, m1, t1, t2, va, vb;

  s8 = (uint8_t *) rd->s_start;
  d8 = (uint8_t *) rd->d_start;

  pp = _mm_set1_epi8(btd->prim_poly&0xff);
  m1 = _mm_set1_epi8((btd->mask1)&0xff);

  while (d8 < (uint8_t *) rd->d_top) {
    va = _mm_load_si128 ((__m128i *)(s8));
    SSE_AB2(pp, m1, va, t1, t2);
    vb = _mm_load_si128 ((__m128i *)(d8));
    vb = _mm_xor_si128(vb, va);
    _mm_store_si128((__m128i *)d8, vb);
    d8 += 16;
    s8 += 16;
  }
}
#endif

#ifdef INTEL_SSE2
static
void
gf_w4_bytwo_b_sse_region_4_noxor(gf_region_data *rd, struct gf_bytwo_data *btd)
{
  uint8_t *d8, *s8;
  __m128i pp, m1, t1, t2, va;

  s8 = (uint8_t *) rd->s_start;
  d8 = (uint8_t *) rd->d_start;

  pp = _mm_set1_epi8(btd->prim_poly&0xff);
  m1 = _mm_set1_epi8((btd->mask1)&0xff);

  while (d8 < (uint8_t *) rd->d_top) {
    va = _mm_load_si128 ((__m128i *)(s8));
    SSE_AB2(pp, m1, va, t1, t2);
    SSE_AB2(pp, m1, va, t1, t2);
    _mm_store_si128((__m128i *)d8, va);
    d8 += 16;
    s8 += 16;
  }
}
#endif

#ifdef INTEL_SSE2
static
void
gf_w4_bytwo_b_sse_region_4_xor(gf_region_data *rd, struct gf_bytwo_data *btd)
{
  uint8_t *d8, *s8;
  __m128i pp, m1, t1, t2, va, vb;

  s8 = (uint8_t *) rd->s_start;
  d8 = (uint8_t *) rd->d_start;

  pp = _mm_set1_epi8(btd->prim_poly&0xff);
  m1 = _mm_set1_epi8((btd->mask1)&0xff);

  while (d8 < (uint8_t *) rd->d_top) {
    va = _mm_load_si128 ((__m128i *)(s8));
    SSE_AB2(pp, m1, va, t1, t2);
    SSE_AB2(pp, m1, va, t1, t2);
    vb = _mm_load_si128 ((__m128i *)(d8));
    vb = _mm_xor_si128(vb, va);
    _mm_store_si128((__m128i *)d8, vb);
    d8 += 16;
    s8 += 16;
  }
}
#endif


#ifdef INTEL_SSE2
static
void
gf_w4_bytwo_b_sse_region_3_noxor(gf_region_data *rd, struct gf_bytwo_data *btd)
{
  uint8_t *d8, *s8;
  __m128i pp, m1, t1, t2, va, vb;

  s8 = (uint8_t *) rd->s_start;
  d8 = (uint8_t *) rd->d_start;

  pp = _mm_set1_epi8(btd->prim_poly&0xff);
  m1 = _mm_set1_epi8((btd->mask1)&0xff);

  while (d8 < (uint8_t *) rd->d_top) {
    va = _mm_load_si128 ((__m128i *)(s8));
    vb = va;
    SSE_AB2(pp, m1, va, t1, t2);
    va = _mm_xor_si128(va, vb);
    _mm_store_si128((__m128i *)d8, va);
    d8 += 16;
    s8 += 16;
  }
}
#endif

#ifdef INTEL_SSE2
static
void
gf_w4_bytwo_b_sse_region_3_xor(gf_region_data *rd, struct gf_bytwo_data *btd)
{
  uint8_t *d8, *s8;
  __m128i pp, m1, t1, t2, va, vb;

  s8 = (uint8_t *) rd->s_start;
  d8 = (uint8_t *) rd->d_start;

  pp = _mm_set1_epi8(btd->prim_poly&0xff);
  m1 = _mm_set1_epi8((btd->mask1)&0xff);

  while (d8 < (uint8_t *) rd->d_top) {
    va = _mm_load_si128 ((__m128i *)(s8));
    vb = _mm_xor_si128(_mm_load_si128 ((__m128i *)(d8)), va);
    SSE_AB2(pp, m1, va, t1, t2);
    vb = _mm_xor_si128(vb, va);
    _mm_store_si128((__m128i *)d8, vb);
    d8 += 16;
    s8 += 16;
  }
}
#endif

#ifdef INTEL_SSE2
static
void
gf_w4_bytwo_b_sse_region_5_noxor(gf_region_data *rd, struct gf_bytwo_data *btd)
{
  uint8_t *d8, *s8;
  __m128i pp, m1, t1, t2, va, vb;

  s8 = (uint8_t *) rd->s_start;
  d8 = (uint8_t *) rd->d_start;

  pp = _mm_set1_epi8(btd->prim_poly&0xff);
  m1 = _mm_set1_epi8((btd->mask1)&0xff);

  while (d8 < (uint8_t *) rd->d_top) {
    va = _mm_load_si128 ((__m128i *)(s8));
    vb = va;
    SSE_AB2(pp, m1, va, t1, t2);
    SSE_AB2(pp, m1, va, t1, t2);
    va = _mm_xor_si128(va, vb);
    _mm_store_si128((__m128i *)d8, va);
    d8 += 16;
    s8 += 16;
  }
}
#endif

#ifdef INTEL_SSE2
static
void
gf_w4_bytwo_b_sse_region_5_xor(gf_region_data *rd, struct gf_bytwo_data *btd)
{
  uint8_t *d8, *s8;
  __m128i pp, m1, t1, t2, va, vb;

  s8 = (uint8_t *) rd->s_start;
  d8 = (uint8_t *) rd->d_start;

  pp = _mm_set1_epi8(btd->prim_poly&0xff);
  m1 = _mm_set1_epi8((btd->mask1)&0xff);

  while (d8 < (uint8_t *) rd->d_top) {
    va = _mm_load_si128 ((__m128i *)(s8));
    vb = _mm_xor_si128(_mm_load_si128 ((__m128i *)(d8)), va);
    SSE_AB2(pp, m1, va, t1, t2);
    SSE_AB2(pp, m1, va, t1, t2);
    vb = _mm_xor_si128(vb, va);
    _mm_store_si128((__m128i *)d8, vb);
    d8 += 16;
    s8 += 16;
  }
}
#endif

#ifdef INTEL_SSE2
static
void
gf_w4_bytwo_b_sse_region_7_noxor(gf_region_data *rd, struct gf_bytwo_data *btd)
{
  uint8_t *d8, *s8;
  __m128i pp, m1, t1, t2, va, vb;

  s8 = (uint8_t *) rd->s_start;
  d8 = (uint8_t *) rd->d_start;

  pp = _mm_set1_epi8(btd->prim_poly&0xff);
  m1 = _mm_set1_epi8((btd->mask1)&0xff);

  while (d8 < (uint8_t *) rd->d_top) {
    va = _mm_load_si128 ((__m128i *)(s8));
    vb = va;
    SSE_AB2(pp, m1, va, t1, t2);
    vb = _mm_xor_si128(va, vb);
    SSE_AB2(pp, m1, va, t1, t2);
    va = _mm_xor_si128(va, vb);
    _mm_store_si128((__m128i *)d8, va);
    d8 += 16;
    s8 += 16;
  }
}
#endif

#ifdef INTEL_SSE2
static
void
gf_w4_bytwo_b_sse_region_7_xor(gf_region_data *rd, struct gf_bytwo_data *btd)
{
  uint8_t *d8, *s8;
  __m128i pp, m1, t1, t2, va, vb;

  s8 = (uint8_t *) rd->s_start;
  d8 = (uint8_t *) rd->d_start;

  pp = _mm_set1_epi8(btd->prim_poly&0xff);
  m1 = _mm_set1_epi8((btd->mask1)&0xff);

  while (d8 < (uint8_t *) rd->d_top) {
    va = _mm_load_si128 ((__m128i *)(s8));
    vb = _mm_xor_si128(_mm_load_si128 ((__m128i *)(d8)), va);
    SSE_AB2(pp, m1, va, t1, t2);
    vb = _mm_xor_si128(vb, va);
    SSE_AB2(pp, m1, va, t1, t2);
    vb = _mm_xor_si128(vb, va);
    _mm_store_si128((__m128i *)d8, vb);
    d8 += 16;
    s8 += 16;
  }
}
#endif

#ifdef INTEL_SSE2
static
void
gf_w4_bytwo_b_sse_region_6_noxor(gf_region_data *rd, struct gf_bytwo_data *btd)
{
  uint8_t *d8, *s8;
  __m128i pp, m1, t1, t2, va, vb;

  s8 = (uint8_t *) rd->s_start;
  d8 = (uint8_t *) rd->d_start;

  pp = _mm_set1_epi8(btd->prim_poly&0xff);
  m1 = _mm_set1_epi8((btd->mask1)&0xff);

  while (d8 < (uint8_t *) rd->d_top) {
    va = _mm_load_si128 ((__m128i *)(s8));
    SSE_AB2(pp, m1, va, t1, t2);
    vb = va;
    SSE_AB2(pp, m1, va, t1, t2);
    va = _mm_xor_si128(va, vb);
    _mm_store_si128((__m128i *)d8, va);
    d8 += 16;
    s8 += 16;
  }
}
#endif

#ifdef INTEL_SSE2
static
void
gf_w4_bytwo_b_sse_region_6_xor(gf_region_data *rd, struct gf_bytwo_data *btd)
{
  uint8_t *d8, *s8;
  __m128i pp, m1, t1, t2, va, vb;

  s8 = (uint8_t *) rd->s_start;
  d8 = (uint8_t *) rd->d_start;

  pp = _mm_set1_epi8(btd->prim_poly&0xff);
  m1 = _mm_set1_epi8((btd->mask1)&0xff);

  while (d8 < (uint8_t *) rd->d_top) {
    va = _mm_load_si128 ((__m128i *)(s8));
    SSE_AB2(pp, m1, va, t1, t2);
    vb = _mm_xor_si128(_mm_load_si128 ((__m128i *)(d8)), va);
    SSE_AB2(pp, m1, va, t1, t2);
    vb = _mm_xor_si128(vb, va);
    _mm_store_si128((__m128i *)d8, vb);
    d8 += 16;
    s8 += 16;
  }
}
#endif

#ifdef INTEL_SSE2
static
void
gf_w4_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
  uint8_t *d8, *s8, tb;
  __m128i pp, m1, m2, t1, t2, va, vb;
  struct gf_bytwo_data *btd;
  gf_region_data rd;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
  gf_do_initial_region_alignment(&rd);

  s8 = (uint8_t *) rd.s_start;
  d8 = (uint8_t *) rd.d_start;

  btd = (struct gf_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;

  switch (val) {
    case 2:
      if (!xor) {
        gf_w4_bytwo_b_sse_region_2_noxor(&rd, btd);
      } else {
        gf_w4_bytwo_b_sse_region_2_xor(&rd, btd);
      }
      gf_do_final_region_alignment(&rd);
      return;
    case 3:
      if (!xor) {
        gf_w4_bytwo_b_sse_region_3_noxor(&rd, btd);
      } else {
        gf_w4_bytwo_b_sse_region_3_xor(&rd, btd);
      }
      gf_do_final_region_alignment(&rd);
      return;
    case 4:
      if (!xor) {
        gf_w4_bytwo_b_sse_region_4_noxor(&rd, btd);
      } else {
        gf_w4_bytwo_b_sse_region_4_xor(&rd, btd);
      }
      gf_do_final_region_alignment(&rd);
      return;
    case 5:
      if (!xor) {
        gf_w4_bytwo_b_sse_region_5_noxor(&rd, btd);
      } else {
        gf_w4_bytwo_b_sse_region_5_xor(&rd, btd);
      }
      gf_do_final_region_alignment(&rd);
      return;
    case 6:
      if (!xor) {
        gf_w4_bytwo_b_sse_region_6_noxor(&rd, btd);
      } else {
        gf_w4_bytwo_b_sse_region_6_xor(&rd, btd);
      }
      gf_do_final_region_alignment(&rd);
      return;
    case 7:
      if (!xor) {
        gf_w4_bytwo_b_sse_region_7_noxor(&rd, btd);
      } else {
        gf_w4_bytwo_b_sse_region_7_xor(&rd, btd);
      }
      gf_do_final_region_alignment(&rd);
      return;
  }

  pp = _mm_set1_epi8(btd->prim_poly&0xff);
  m1 = _mm_set1_epi8((btd->mask1)&0xff);
  m2 = _mm_set1_epi8((btd->mask2)&0xff);

  if (xor) {
    while (d8 < (uint8_t *) rd.d_top) {
      va = _mm_load_si128 ((__m128i *)(s8));
      vb = _mm_load_si128 ((__m128i *)(d8));
      tb = val;
      while (1) {
        if (tb & 1) vb = _mm_xor_si128(vb, va);
        tb >>= 1;
        if (tb == 0) break;
        SSE_AB2(pp, m1, va, t1, t2);
      }
      _mm_store_si128((__m128i *)d8, vb);
      d8 += 16;
      s8 += 16;
    }
  } else {
    while (d8 < (uint8_t *) rd.d_top) {
      va = _mm_load_si128 ((__m128i *)(s8));
      vb = _mm_setzero_si128 ();
      tb = val;
      while (1) {
        if (tb & 1) vb = _mm_xor_si128(vb, va);
        tb >>= 1;
        if (tb == 0) break;
        t1 = _mm_and_si128(_mm_slli_epi64(va, 1), m1);
        t2 = _mm_and_si128(va, m2);
        t2 = _mm_sub_epi64 (
          _mm_slli_epi64(t2, 1), _mm_srli_epi64(t2, (GF_FIELD_WIDTH-1)));
        va = _mm_xor_si128(t1, _mm_and_si128(t2, pp));
      }
      _mm_store_si128((__m128i *)d8, vb);
      d8 += 16;
      s8 += 16;
    }
  }
  gf_do_final_region_alignment(&rd);
}
#endif

static
void
gf_w4_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
  uint64_t *s64, *d64, t1, t2, ta, tb, prod;
  struct gf_bytwo_data *btd;
  gf_region_data rd;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
  gf_do_initial_region_alignment(&rd);

  btd = (struct gf_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;
  s64 = (uint64_t *) rd.s_start;
  d64 = (uint64_t *) rd.d_start;

  switch (val) {
1448 	  case 1:
1449 	    if (xor) {
1450 	      while (d64 < (uint64_t *) rd.d_top) {
1451 	        *d64 ^= *s64;
1452 	        d64++;
1453 	        s64++;
1454 	      }
1455 	    } else {
1456 	      while (d64 < (uint64_t *) rd.d_top) {
1457 	        *d64 = *s64;
1458 	        d64++;
1459 	        s64++;
1460 	      }
1461 	    }
1462 	    break;
1463 	  case 2:
1464 	    if (xor) {
1465 	      while (d64 < (uint64_t *) rd.d_top) {
1466 	        ta = *s64;
1467 	        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1468 	        *d64 ^= ta;
1469 	        d64++;
1470 	        s64++;
1471 	      }
1472 	    } else {
1473 	      while (d64 < (uint64_t *) rd.d_top) {
1474 	        ta = *s64;
1475 	        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1476 	        *d64 = ta;
1477 	        d64++;
1478 	        s64++;
1479 	      }
1480 	    }
1481 	    break; 
1482 	  case 3:
1483 	    if (xor) {
1484 	      while (d64 < (uint64_t *) rd.d_top) {
1485 	        ta = *s64;
1486 	        prod = ta;
1487 	        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1488 	        *d64 ^= (ta ^ prod);
1489 	        d64++;
1490 	        s64++;
1491 	      }
1492 	    } else {
1493 	      while (d64 < (uint64_t *) rd.d_top) {
1494 	        ta = *s64;
1495 	        prod = ta;
1496 	        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1497 	        *d64 = (ta ^ prod);
1498 	        d64++;
1499 	        s64++;
1500 	      }
1501 	    }
1502 	    break; 
1503 	  case 4:
1504 	    if (xor) {
1505 	      while (d64 < (uint64_t *) rd.d_top) {
1506 	        ta = *s64;
1507 	        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1508 	        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1509 	        *d64 ^= ta;
1510 	        d64++;
1511 	        s64++;
1512 	      }
1513 	    } else {
1514 	      while (d64 < (uint64_t *) rd.d_top) {
1515 	        ta = *s64;
1516 	        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1517 	        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1518 	        *d64 = ta;
1519 	        d64++;
1520 	        s64++;
1521 	      }
1522 	    }
1523 	    break; 
1524 	  case 5:
1525 	    if (xor) {
1526 	      while (d64 < (uint64_t *) rd.d_top) {
1527 	        ta = *s64;
1528 	        prod = ta;
1529 	        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1530 	        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1531 	        *d64 ^= (ta ^ prod);
1532 	        d64++;
1533 	        s64++;
1534 	      }
1535 	    } else {
1536 	      while (d64 < (uint64_t *) rd.d_top) {
1537 	        ta = *s64;
1538 	        prod = ta;
1539 	        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1540 	        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1541 	        *d64 = ta ^ prod;
1542 	        d64++;
1543 	        s64++;
1544 	      }
1545 	    }
1546 	    break;
1547 	  case 6:
1548 	    if (xor) {
1549 	      while (d64 < (uint64_t *) rd.d_top) {
1550 	        ta = *s64;
1551 	        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1552 	        prod = ta;
1553 	        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1554 	        *d64 ^= (ta ^ prod);
1555 	        d64++;
1556 	        s64++;
1557 	      }
1558 	    } else {
1559 	      while (d64 < (uint64_t *) rd.d_top) {
1560 	        ta = *s64;
1561 	        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1562 	        prod = ta;
1563 	        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1564 	        *d64 = ta ^ prod;
1565 	        d64++;
1566 	        s64++;
1567 	      }
1568 	    }
1569 	    break;
1570 	  case 7:
1571 	    if (xor) {
1572 	      while (d64 < (uint64_t *) rd.d_top) {
1573 	        ta = *s64;
1574 	        prod = ta;
1575 	        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1576 	        prod ^= ta;
1577 	        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1578 	        *d64 ^= (ta ^ prod);
1579 	        d64++;
1580 	        s64++;
1581 	      }
1582 	    } else {
1583 	      while (d64 < (uint64_t *) rd.d_top) {
1584 	        ta = *s64;
1585 	        prod = ta;
1586 	        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1587 	        prod ^= ta;
1588 	        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1589 	        *d64 = ta ^ prod;
1590 	        d64++;
1591 	        s64++;
1592 	      }
1593 	    }
1594 	    break; 
1595 	  case 8:
1596 	    if (xor) {
1597 	      while (d64 < (uint64_t *) rd.d_top) {
1598 	        ta = *s64;
1599 	        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1600 	        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1601 	        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1602 	        *d64 ^= ta;
1603 	        d64++;
1604 	        s64++;
1605 	      }
1606 	    } else {
1607 	      while (d64 < (uint64_t *) rd.d_top) {
1608 	        ta = *s64;
1609 	        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1610 	        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1611 	        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1612 	        *d64 = ta;
1613 	        d64++;
1614 	        s64++;
1615 	      }
1616 	    }
1617 	    break; 
1618 	  case 9:
1619 	    if (xor) {
1620 	      while (d64 < (uint64_t *) rd.d_top) {
1621 	        ta = *s64;
1622 	        prod = ta;
1623 	        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1624 	        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1625 	        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1626 	        *d64 ^= (ta ^ prod);
1627 	        d64++;
1628 	        s64++;
1629 	      }
1630 	    } else {
1631 	      while (d64 < (uint64_t *) rd.d_top) {
1632 	        ta = *s64;
1633 	        prod = ta;
1634 	        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1635 	        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1636 	        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1637 	        *d64 = (ta ^ prod);
1638 	        d64++;
1639 	        s64++;
1640 	      }
1641 	    }
1642 	    break; 
1643 	  case 10:
1644 	    if (xor) {
1645 	      while (d64 < (uint64_t *) rd.d_top) {
1646 	        ta = *s64;
1647 	        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1648 	        prod = ta;
1649 	        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1650 	        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1651 	        *d64 ^= (ta ^ prod);
1652 	        d64++;
1653 	        s64++;
1654 	      }
1655 	    } else {
1656 	      while (d64 < (uint64_t *) rd.d_top) {
1657 	        ta = *s64;
1658 	        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1659 	        prod = ta;
1660 	        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1661 	        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1662 	        *d64 = (ta ^ prod);
1663 	        d64++;
1664 	        s64++;
1665 	      }
1666 	    }
1667 	    break; 
1668 	  case 11:
1669 	    if (xor) {
1670 	      while (d64 < (uint64_t *) rd.d_top) {
1671 	        ta = *s64;
1672 	        prod = ta;
1673 	        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1674 	        prod ^= ta;
1675 	        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1676 	        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1677 	        *d64 ^= (ta ^ prod);
1678 	        d64++;
1679 	        s64++;
1680 	      }
1681 	    } else {
1682 	      while (d64 < (uint64_t *) rd.d_top) {
1683 	        ta = *s64;
1684 	        prod = ta;
1685 	        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1686 	        prod ^= ta;
1687 	        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1688 	        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1689 	        *d64 = (ta ^ prod);
1690 	        d64++;
1691 	        s64++;
1692 	      }
1693 	    }
1694 	    break; 
1695 	  case 12:
1696 	    if (xor) {
1697 	      while (d64 < (uint64_t *) rd.d_top) {
1698 	        ta = *s64;
1699 	        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1700 	        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1701 	        prod = ta;
1702 	        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1703 	        *d64 ^= (ta ^ prod);
1704 	        d64++;
1705 	        s64++;
1706 	      }
1707 	    } else {
1708 	      while (d64 < (uint64_t *) rd.d_top) {
1709 	        ta = *s64;
1710 	        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1711 	        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1712 	        prod = ta;
1713 	        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1714 	        *d64 = (ta ^ prod);
1715 	        d64++;
1716 	        s64++;
1717 	      }
1718 	    }
1719 	    break; 
1720 	  case 13:
1721 	    if (xor) {
1722 	      while (d64 < (uint64_t *) rd.d_top) {
1723 	        ta = *s64;
1724 	        prod = ta;
1725 	        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1726 	        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1727 	        prod ^= ta;
1728 	        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1729 	        *d64 ^= (ta ^ prod);
1730 	        d64++;
1731 	        s64++;
1732 	      }
1733 	    } else {
1734 	      while (d64 < (uint64_t *) rd.d_top) {
1735 	        ta = *s64;
1736 	        prod = ta;
1737 	        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1738 	        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1739 	        prod ^= ta;
1740 	        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1741 	        *d64 = (ta ^ prod);
1742 	        d64++;
1743 	        s64++;
1744 	      }
1745 	    }
1746 	    break; 
  case 14:
    if (xor) {
      while (d64 < (uint64_t *) rd.d_top) {
        ta = *s64;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        prod = ta;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        prod ^= ta;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        *d64 ^= (ta ^ prod);
        d64++;
        s64++;
      }
    } else {
      while (d64 < (uint64_t *) rd.d_top) {
        ta = *s64;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        prod = ta;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        prod ^= ta;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        *d64 = (ta ^ prod);
        d64++;
        s64++;
      }
    }
    break;
  case 15:
    if (xor) {
      while (d64 < (uint64_t *) rd.d_top) {
        ta = *s64;
        prod = ta;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        prod ^= ta;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        prod ^= ta;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        *d64 ^= (ta ^ prod);
        d64++;
        s64++;
      }
    } else {
      while (d64 < (uint64_t *) rd.d_top) {
        ta = *s64;
        prod = ta;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        prod ^= ta;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        prod ^= ta;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        *d64 = (ta ^ prod);
        d64++;
        s64++;
      }
    }
    break;
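  /* Any other val: generic shift-and-add.  Walk the bits of val from low
     to high, doubling the source word with AB2 at each step and XOR-ing
     it into prod for each set bit. */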
  default:
    if (xor) {
      while (d64 < (uint64_t *) rd.d_top) {
        prod = *d64;
        ta = *s64;
        tb = val;
        while (1) {
          if (tb & 1) prod ^= ta;
          tb >>= 1;
          if (tb == 0) break;
          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        }
        *d64 = prod;
        d64++;
        s64++;
      }
    } else {
      while (d64 < (uint64_t *) rd.d_top) {
        prod = 0;
        ta = *s64;
        tb = val;
        while (1) {
          if (tb & 1) prod ^= ta;
          tb >>= 1;
          if (tb == 0) break;
          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        }
        *d64 = prod;
        d64++;
        s64++;
      }
    }
    break;
  }
  gf_do_final_region_alignment(&rd);
}
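
/* Illustrative sketch, not part of the library: the scalar equivalent of
   one AB2 doubling step on a single 4-bit value, assuming the default
   primitive polynomial 0x13:

     static uint8_t double_nibble(uint8_t b) {
       uint8_t p = (uint8_t) (b << 1);
       return (p & 0x10) ? ((p ^ 0x13) & 0xf) : p;
     }

   For example, double_nibble(0x8) == 0x3, because x^3 * x = x^4, and
   x^4 = x + 1 (mod x^4 + x + 1).  AB2 performs this on all sixteen
   nibbles of a 64-bit word in parallel. */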

static
int gf_w4_bytwo_init(gf_t *gf)
{
  gf_internal_t *h;
  uint64_t ip, m1, m2;
  struct gf_bytwo_data *btd;

  h = (gf_internal_t *) gf->scratch;
  btd = (struct gf_bytwo_data *) (h->private);
  ip = h->prim_poly & 0xf;
  m1 = 0xe;
  m2 = 0x8;
  btd->prim_poly = 0;
  btd->mask1 = 0;
  btd->mask2 = 0;

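  /* Replicate the 4-bit polynomial and masks into every nibble of a
     64-bit word.  With the default polynomial 0x13 this yields
     prim_poly = 0x3333333333333333, mask1 = 0xeeeeeeeeeeeeeeee and
     mask2 = 0x8888888888888888, letting AB2 double sixteen nibbles
     per 64-bit operation. */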
  while (ip != 0) {
    btd->prim_poly |= ip;
    btd->mask1 |= m1;
    btd->mask2 |= m2;
    ip <<= GF_FIELD_WIDTH;
    m1 <<= GF_FIELD_WIDTH;
    m2 <<= GF_FIELD_WIDTH;
  }

  if (h->mult_type == GF_MULT_BYTWO_p) {
    SET_FUNCTION(gf,multiply,w32,gf_w4_bytwo_p_multiply)
    #ifdef INTEL_SSE2
      if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
        SET_FUNCTION(gf,multiply_region,w32,gf_w4_bytwo_p_sse_multiply_region)
      } else {
    #endif
        SET_FUNCTION(gf,multiply_region,w32,gf_w4_bytwo_p_nosse_multiply_region)
        if (h->region_type & GF_REGION_SIMD)
          return 0;
    #ifdef INTEL_SSE2
      }
    #endif
  } else {
    SET_FUNCTION(gf,multiply,w32,gf_w4_bytwo_b_multiply)
    #ifdef INTEL_SSE2
      if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
        SET_FUNCTION(gf,multiply_region,w32,gf_w4_bytwo_b_sse_multiply_region)
      } else {
    #endif
        SET_FUNCTION(gf,multiply_region,w32,gf_w4_bytwo_b_nosse_multiply_region)
        if (h->region_type & GF_REGION_SIMD)
          return 0;
    #ifdef INTEL_SSE2
      }
    #endif
  }
  return 1;
}


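/* Carry-free multiply: use a carryless-multiply instruction when the CPU
   provides one (PCLMUL on x86, NEON on ARM); return 0 otherwise so the
   caller can reject the request. */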
static
int gf_w4_cfm_init(gf_t *gf)
{
#if defined(INTEL_SSE4_PCLMUL)
  if (gf_cpu_supports_intel_pclmul) {
    SET_FUNCTION(gf,multiply,w32,gf_w4_clm_multiply)
    return 1;
  }
#elif defined(ARM_NEON)
  if (gf_cpu_supports_arm_neon) {
    return gf_w4_neon_cfm_init(gf);
  }
#endif
  return 0;
}

static
int gf_w4_shift_init(gf_t *gf)
{
  SET_FUNCTION(gf,multiply,w32,gf_w4_shift_multiply)
  return 1;
}

/* JSP: I'm putting all error checking into gf_error_check(), so you don't
   have to do error checking in scratch_size or in init. */

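/* Report how much scratch memory each mult_type needs.  The extra 64
   bytes on the table-based variants is presumably alignment slack for
   the lookup tables within the scratch region. */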
int gf_w4_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2)
{
  switch(mult_type)
  {
    case GF_MULT_BYTWO_p:
    case GF_MULT_BYTWO_b:
      return sizeof(gf_internal_t) + sizeof(struct gf_bytwo_data);
    case GF_MULT_DEFAULT:
    case GF_MULT_TABLE:
      if (region_type == GF_REGION_CAUCHY) {
        return sizeof(gf_internal_t) + sizeof(struct gf_single_table_data) + 64;
      }

      if (mult_type == GF_MULT_DEFAULT &&
          !(gf_cpu_supports_arm_neon || gf_cpu_supports_intel_ssse3))
        region_type = GF_REGION_DOUBLE_TABLE;

      if (region_type & GF_REGION_DOUBLE_TABLE) {
        return sizeof(gf_internal_t) + sizeof(struct gf_double_table_data) + 64;
      } else if (region_type & GF_REGION_QUAD_TABLE) {
        if ((region_type & GF_REGION_LAZY) == 0) {
          return sizeof(gf_internal_t) + sizeof(struct gf_quad_table_data) + 64;
        } else {
          return sizeof(gf_internal_t) + sizeof(struct gf_quad_table_lazy_data) + 64;
        }
      } else {
        return sizeof(gf_internal_t) + sizeof(struct gf_single_table_data) + 64;
      }

    case GF_MULT_LOG_TABLE:
      return sizeof(gf_internal_t) + sizeof(struct gf_logtable_data) + 64;
    case GF_MULT_CARRY_FREE:
      return sizeof(gf_internal_t);
    case GF_MULT_SHIFT:
      return sizeof(gf_internal_t);
    default:
      return 0;
  }
}

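/* Top-level initializer for GF(2^4).  The default primitive polynomial is
   0x13 = x^4 + x + 1; OR-ing in 0x10 ensures the degree-4 bit is set even
   if the caller supplied only the low nibble of the polynomial. */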
int
gf_w4_init (gf_t *gf)
{
  gf_internal_t *h;

  h = (gf_internal_t *) gf->scratch;
  if (h->prim_poly == 0) h->prim_poly = 0x13;
  h->prim_poly |= 0x10;
  SET_FUNCTION(gf,multiply,w32,NULL)
  SET_FUNCTION(gf,divide,w32,NULL)
  SET_FUNCTION(gf,inverse,w32,NULL)
  SET_FUNCTION(gf,multiply_region,w32,NULL)
  SET_FUNCTION(gf,extract_word,w32,gf_w4_extract_word)

  switch(h->mult_type) {
    case GF_MULT_CARRY_FREE: if (gf_w4_cfm_init(gf) == 0) return 0; break;
    case GF_MULT_SHIFT:      if (gf_w4_shift_init(gf) == 0) return 0; break;
    case GF_MULT_BYTWO_p:
    case GF_MULT_BYTWO_b:    if (gf_w4_bytwo_init(gf) == 0) return 0; break;
    case GF_MULT_LOG_TABLE:  if (gf_w4_log_init(gf) == 0) return 0; break;
    case GF_MULT_DEFAULT:
    case GF_MULT_TABLE:      if (gf_w4_table_init(gf) == 0) return 0; break;
    default: return 0;
  }

  if (h->divide_type == GF_DIVIDE_EUCLID) {
    SET_FUNCTION(gf,divide,w32,gf_w4_divide_from_inverse)
    SET_FUNCTION(gf,inverse,w32,gf_w4_euclid)
  } else if (h->divide_type == GF_DIVIDE_MATRIX) {
    SET_FUNCTION(gf,divide,w32,gf_w4_divide_from_inverse)
    SET_FUNCTION(gf,inverse,w32,gf_w4_matrix)
  }

  if (gf->divide.w32 == NULL) {
    SET_FUNCTION(gf,divide,w32,gf_w4_divide_from_inverse)
    if (gf->inverse.w32 == NULL) SET_FUNCTION(gf,inverse,w32,gf_w4_euclid)
  }

  if (gf->inverse.w32 == NULL) SET_FUNCTION(gf,inverse,w32,gf_w4_inverse_from_divide)

  if (h->region_type == GF_REGION_CAUCHY) {
    SET_FUNCTION(gf,multiply_region,w32,gf_wgen_cauchy_region)
    SET_FUNCTION(gf,extract_word,w32,gf_wgen_extract_word)
  }

  if (gf->multiply_region.w32 == NULL) {
    SET_FUNCTION(gf,multiply_region,w32,gf_w4_multiply_region_from_single)
  }

  return 1;
}
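
/* Illustrative usage sketch, not part of the library.  Assuming the
   gf_init_easy() entry point declared in gf_complete.h, which allocates
   the scratch region itself and ends up selecting the w = 4 routines
   above:

     gf_t gf;
     if (gf_init_easy(&gf, 4)) {
       uint8_t p = gf.multiply.w32(&gf, 0x7, 0xa);   // product in GF(2^4)
     }
*/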

/* Inline setup functions */

uint8_t *gf_w4_get_mult_table(gf_t *gf)
{
  gf_internal_t *h;
  struct gf_single_table_data *std;

  h = (gf_internal_t *) gf->scratch;
  if (gf->multiply.w32 == gf_w4_single_table_multiply) {
    std = (struct gf_single_table_data *) h->private;
    return (uint8_t *) std->mult;
  }
  return NULL;
}

uint8_t *gf_w4_get_div_table(gf_t *gf)
{
  gf_internal_t *h;
  struct gf_single_table_data *std;

  h = (gf_internal_t *) gf->scratch;
  if (gf->multiply.w32 == gf_w4_single_table_multiply) {
    std = (struct gf_single_table_data *) h->private;
    return (uint8_t *) std->div;
  }
  return NULL;
}
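
/* Illustrative sketch, not part of the library.  Assuming the mult[] and
   div[] arrays in gf_w4.h are 16 x 16 and row-major, an entry can be read
   back as table[(a << 4) | b]:

     uint8_t a = 0x7, b = 0xa;
     uint8_t *m = gf_w4_get_mult_table(&gf);
     if (m != NULL) {
       uint8_t prod = m[(a << 4) | b];   // same as gf.multiply.w32(&gf, a, b)
     }
*/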
2048