/*
 * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
 * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
 * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
 *
 * gf_w8.c
 *
 * Routines for 8-bit Galois fields
 */

#include "gf_int.h"
#include "gf_w8.h"
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include "gf_cpu.h"

#define AB2(ip, am1, am2, b, t1, t2) {\
  t1 = (b << 1) & am1;\
  t2 = b & am2; \
  t2 = ((t2 << 1) - (t2 >> (GF_FIELD_WIDTH-1))); \
  b = (t1 ^ (t2 & ip));}

#define SSE_AB2(pp, m1, m2, va, t1, t2) {\
          t1 = _mm_and_si128(_mm_slli_epi64(va, 1), m1); \
          t2 = _mm_and_si128(va, m2); \
          t2 = _mm_sub_epi64 (_mm_slli_epi64(t2, 1), _mm_srli_epi64(t2, (GF_FIELD_WIDTH-1))); \
          va = _mm_xor_si128(t1, _mm_and_si128(t2, pp)); }
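
/* Both macros multiply each packed byte by 2 (i.e., by x) in GF(2^8)
   without branching, assuming am1/m1 holds 0xFE in every byte and
   am2/m2 holds 0x80 in every byte.  t1 is the shifted value with the
   bits that would carry into the neighboring byte masked off, and
   (t2 << 1) - (t2 >> 7) turns each byte's high bit into an all-ones
   byte, so the reducing polynomial ip/pp is XORed only into the bytes
   that overflowed.  A worked example, assuming prim_poly 0x11D (so
   each byte of ip is 0x1D): for b = 0x85, t1 = 0x0A and t2 = 0x80
   becomes the mask 0xFF, giving b = 0x0A ^ 0x1D = 0x17 = 0x85 * 2. */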

#define MM_PRINT(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 2) printf("  %02x %02x", blah[15-ii], blah[14-ii]); printf("\n"); }

static
inline
uint32_t gf_w8_inverse_from_divide (gf_t *gf, uint32_t a)
{
  return gf->divide.w32(gf, 1, a);
}

static
inline
uint32_t gf_w8_divide_from_inverse (gf_t *gf, uint32_t a, uint32_t b)
{
  b = gf->inverse.w32(gf, b);
  return gf->multiply.w32(gf, a, b);
}

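/* b^{-1} in GF(2^8) via the extended Euclidean algorithm over GF(2)[x].
   The e_* variables hold the remainders (seeded with the primitive
   polynomial and b), the d_* variables their degrees, and the y_*
   variables the Bezout coefficients.  Each pass computes the quotient
   c_i bit by bit while reducing e_ip1; when the remainder reaches 1,
   y_i * b == 1 (mod prim_poly), so y_i is the inverse. */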
static
inline
uint32_t gf_w8_euclid (gf_t *gf, uint32_t b)
{
  uint32_t e_i, e_im1, e_ip1;
  uint32_t d_i, d_im1, d_ip1;
  uint32_t y_i, y_im1, y_ip1;
  uint32_t c_i;

  if (b == 0) return -1;
  e_im1 = ((gf_internal_t *) (gf->scratch))->prim_poly;
  e_i = b;
  d_im1 = 8;
  for (d_i = d_im1; ((1 << d_i) & e_i) == 0; d_i--) ;
  y_i = 1;
  y_im1 = 0;

  while (e_i != 1) {

    e_ip1 = e_im1;
    d_ip1 = d_im1;
    c_i = 0;

    while (d_ip1 >= d_i) {
      c_i ^= (1 << (d_ip1 - d_i));
      e_ip1 ^= (e_i << (d_ip1 - d_i));
      if (e_ip1 == 0) return 0;
      while ((e_ip1 & (1 << d_ip1)) == 0) d_ip1--;
    }

    y_ip1 = y_im1 ^ gf->multiply.w32(gf, c_i, y_i);
    y_im1 = y_i;
    y_i = y_ip1;

    e_im1 = e_i;
    d_im1 = d_i;
    e_i = e_ip1;
    d_i = d_ip1;
  }

  return y_i;
}

static
gf_val_32_t gf_w8_extract_word(gf_t *gf, void *start, int bytes, int index)
{
  uint8_t *r8;

  r8 = (uint8_t *) start;
  return r8[index];
}

static
gf_val_32_t gf_w8_composite_extract_word(gf_t *gf, void *start, int bytes, int index)
{
  int sub_size;
  gf_internal_t *h;
  uint8_t *r8, *top;
  uint8_t a, b;
  gf_region_data rd;

  h = (gf_internal_t *) gf->scratch;
  gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 32);
  r8 = (uint8_t *) start;
  if (r8 + index < (uint8_t *) rd.d_start) return r8[index];
  if (r8 + index >= (uint8_t *) rd.d_top) return r8[index];
  index -= (((uint8_t *) rd.d_start) - r8);
  r8 = (uint8_t *) rd.d_start;
  top = (uint8_t *) rd.d_top;
  sub_size = (top-r8)/2;

  a = h->base_gf->extract_word.w32(h->base_gf, r8, sub_size, index);
  b = h->base_gf->extract_word.w32(h->base_gf, r8+sub_size, sub_size, index);
  return (a | (b << 4));
}

static
inline
uint32_t gf_w8_matrix (gf_t *gf, uint32_t b)
{
  return gf_bitmatrix_inverse(b, 8, ((gf_internal_t *) (gf->scratch))->prim_poly);
}


#if defined(INTEL_SSE4_PCLMUL)
static
inline
gf_val_32_t
gf_w8_clm_multiply_2 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8)
{
  gf_val_32_t rv = 0;

  __m128i         a, b;
  __m128i         result;
  __m128i         prim_poly;
  __m128i         w;
  gf_internal_t * h = gf->scratch;

  a = _mm_insert_epi32 (_mm_setzero_si128(), a8, 0);
  b = _mm_insert_epi32 (a, b8, 0);

  prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL));

  /* Do the initial multiply */

  result = _mm_clmulepi64_si128 (a, b, 0);

  /* Ben: Do the prim_poly reduction twice.  We are guaranteed to need at
     most two reductions here, because (w-2)/z == 2, where z is the number
     of zeros directly after the leading 1 in the primitive polynomial.

     _mm_clmulepi64_si128 is the carry-less multiply operation.
     _mm_srli_si128 shifts the result right by one byte, which lets us
     multiply prim_poly by the leading bits of the result; that product
     is then XORed back into the result. */

  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
  result = _mm_xor_si128 (result, w);
  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
  result = _mm_xor_si128 (result, w);

  /* Extracts 32 bit value from result. */

  rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));

  return rv;
}
#endif

#if defined(INTEL_SSE4_PCLMUL)
static
inline
gf_val_32_t
gf_w8_clm_multiply_3 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8)
{
  gf_val_32_t rv = 0;

  __m128i         a, b;
  __m128i         result;
  __m128i         prim_poly;
  __m128i         w;
  gf_internal_t * h = gf->scratch;

  a = _mm_insert_epi32 (_mm_setzero_si128(), a8, 0);
  b = _mm_insert_epi32 (a, b8, 0);

  prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL));

  /* Do the initial multiply */

  result = _mm_clmulepi64_si128 (a, b, 0);

  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
  result = _mm_xor_si128 (result, w);
  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
  result = _mm_xor_si128 (result, w);
  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
  result = _mm_xor_si128 (result, w);

  /* Extracts 32 bit value from result. */

  rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));

  return rv;
}
#endif

#if defined(INTEL_SSE4_PCLMUL)
static
inline
gf_val_32_t
gf_w8_clm_multiply_4 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8)
{
  gf_val_32_t rv = 0;

  __m128i         a, b;
  __m128i         result;
  __m128i         prim_poly;
  __m128i         w;
  gf_internal_t * h = gf->scratch;

  a = _mm_insert_epi32 (_mm_setzero_si128(), a8, 0);
  b = _mm_insert_epi32 (a, b8, 0);

  prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL));

  /* Do the initial multiply */

  result = _mm_clmulepi64_si128 (a, b, 0);

  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
  result = _mm_xor_si128 (result, w);
  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
  result = _mm_xor_si128 (result, w);
  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
  result = _mm_xor_si128 (result, w);
  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
  result = _mm_xor_si128 (result, w);

  /* Extracts 32 bit value from result. */
  rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));

  return rv;
}
#endif


static
void
gf_w8_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
  gf_region_data rd;
  uint8_t *s8;
  uint8_t *d8;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1);
  gf_do_initial_region_alignment(&rd);

  s8 = (uint8_t *) rd.s_start;
  d8 = (uint8_t *) rd.d_start;

  if (xor) {
    while (d8 < ((uint8_t *) rd.d_top)) {
      *d8 ^= gf->multiply.w32(gf, val, *s8);
      d8++;
      s8++;
    }
  } else {
    while (d8 < ((uint8_t *) rd.d_top)) {
      *d8 = gf->multiply.w32(gf, val, *s8);
      d8++;
      s8++;
    }
  }
  gf_do_final_region_alignment(&rd);
}

#if defined(INTEL_SSE4_PCLMUL)
static
void
gf_w8_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
  gf_region_data rd;
  uint8_t *s8;
  uint8_t *d8;

  __m128i         a, b;
  __m128i         result;
  __m128i         prim_poly;
  __m128i         w;
  gf_internal_t * h = gf->scratch;

  prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL));

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0);

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1);
  gf_do_initial_region_alignment(&rd);

  s8 = (uint8_t *) rd.s_start;
  d8 = (uint8_t *) rd.d_start;

  if (xor) {
    while (d8 < ((uint8_t *) rd.d_top)) {
      b = _mm_insert_epi32 (a, (gf_val_32_t)(*s8), 0);
      result = _mm_clmulepi64_si128 (a, b, 0);
      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
      result = _mm_xor_si128 (result, w);
      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
      result = _mm_xor_si128 (result, w);
      *d8 ^= ((gf_val_32_t)_mm_extract_epi32(result, 0));
      d8++;
      s8++;
    }
  } else {
    while (d8 < ((uint8_t *) rd.d_top)) {
      b = _mm_insert_epi32 (a, (gf_val_32_t)(*s8), 0);
      result = _mm_clmulepi64_si128 (a, b, 0);
      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
      result = _mm_xor_si128 (result, w);
      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
      result = _mm_xor_si128 (result, w);
      *d8 = ((gf_val_32_t)_mm_extract_epi32(result, 0));
      d8++;
      s8++;
    }
  }
  gf_do_final_region_alignment(&rd);
}
#endif

#if defined(INTEL_SSE4_PCLMUL)
static
void
gf_w8_clm_multiply_region_from_single_3(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
  gf_region_data rd;
  uint8_t *s8;
  uint8_t *d8;

  __m128i         a, b;
  __m128i         result;
  __m128i         prim_poly;
  __m128i         w;
  gf_internal_t * h = gf->scratch;

  prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL));

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0);

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1);
  gf_do_initial_region_alignment(&rd);

  s8 = (uint8_t *) rd.s_start;
  d8 = (uint8_t *) rd.d_start;

  if (xor) {
    while (d8 < ((uint8_t *) rd.d_top)) {
      b = _mm_insert_epi32 (a, (gf_val_32_t)(*s8), 0);
      result = _mm_clmulepi64_si128 (a, b, 0);
      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
      result = _mm_xor_si128 (result, w);
      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
      result = _mm_xor_si128 (result, w);
      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
      result = _mm_xor_si128 (result, w);
      *d8 ^= ((gf_val_32_t)_mm_extract_epi32(result, 0));
      d8++;
      s8++;
    }
  } else {
    while (d8 < ((uint8_t *) rd.d_top)) {
      b = _mm_insert_epi32 (a, (gf_val_32_t)(*s8), 0);
      result = _mm_clmulepi64_si128 (a, b, 0);
      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
      result = _mm_xor_si128 (result, w);
      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
      result = _mm_xor_si128 (result, w);
      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
      result = _mm_xor_si128 (result, w);
      *d8 = ((gf_val_32_t)_mm_extract_epi32(result, 0));
      d8++;
      s8++;
    }
  }
  gf_do_final_region_alignment(&rd);
}
#endif

#if defined(INTEL_SSE4_PCLMUL)
static
void
gf_w8_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
  gf_region_data rd;
  uint8_t *s8;
  uint8_t *d8;

  __m128i         a, b;
  __m128i         result;
  __m128i         prim_poly;
  __m128i         w;
  gf_internal_t * h = gf->scratch;

  prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL));

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0);

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1);
  gf_do_initial_region_alignment(&rd);

  s8 = (uint8_t *) rd.s_start;
  d8 = (uint8_t *) rd.d_start;

  if (xor) {
    while (d8 < ((uint8_t *) rd.d_top)) {
      b = _mm_insert_epi32 (a, (gf_val_32_t)(*s8), 0);
      result = _mm_clmulepi64_si128 (a, b, 0);
      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
      result = _mm_xor_si128 (result, w);
      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
      result = _mm_xor_si128 (result, w);
      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
      result = _mm_xor_si128 (result, w);
      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
      result = _mm_xor_si128 (result, w);
      *d8 ^= ((gf_val_32_t)_mm_extract_epi32(result, 0));
      d8++;
      s8++;
    }
  } else {
    while (d8 < ((uint8_t *) rd.d_top)) {
      b = _mm_insert_epi32 (a, (gf_val_32_t)(*s8), 0);
      result = _mm_clmulepi64_si128 (a, b, 0);
      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
      result = _mm_xor_si128 (result, w);
      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
      result = _mm_xor_si128 (result, w);
      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
      result = _mm_xor_si128 (result, w);
      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
      result = _mm_xor_si128 (result, w);
      *d8 = ((gf_val_32_t)_mm_extract_epi32(result, 0));
      d8++;
      s8++;
    }
  }
  gf_do_final_region_alignment(&rd);
}
#endif

/* ------------------------------------------------------------
IMPLEMENTATION: SHIFT:

JSP: The world's dumbest multiplication algorithm.  I only
include it for completeness.  It does have the feature that it requires no
extra memory.
 */

static
inline
  uint32_t
gf_w8_shift_multiply (gf_t *gf, uint32_t a8, uint32_t b8)
{
  uint16_t product, i, pp, a, b;
  gf_internal_t *h;

  a = a8;
  b = b8;
  h = (gf_internal_t *) gf->scratch;
  pp = h->prim_poly;

  product = 0;

  for (i = 0; i < GF_FIELD_WIDTH; i++) {
    if (a & (1 << i)) product ^= (b << i);
  }
  for (i = (GF_FIELD_WIDTH*2-2); i >= GF_FIELD_WIDTH; i--) {
    if (product & (1 << i)) product ^= (pp << (i-GF_FIELD_WIDTH));
  }
  return product;
}
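
/* A worked example of the shift multiply, assuming prim_poly 0x11B (the
   AES polynomial): a = 0x02, b = 0x87.  The carry-free multiply loop
   leaves product = 0x10E; bit 8 is set, so the reduction loop XORs in
   (0x11B << 0), giving 0x015.  Hence 0x02 * 0x87 == 0x15 in that field. */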

static
int gf_w8_cfm_init(gf_t *gf)
{
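  /* The masks below pick how many CLMUL reduction rounds the chosen
     polynomial needs: if bits 5-7 of prim_poly are clear (0xe0), two
     rounds suffice (gf_w8_clm_multiply_2); if bits 6-7 are clear (0xc0),
     three rounds; if only bit 7 is clear (0x80), four rounds.  Otherwise
     the carry-less path is not used at all. */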
#if defined(INTEL_SSE4_PCLMUL)
  if (gf_cpu_supports_intel_pclmul) {
    gf_internal_t *h;

    h = (gf_internal_t *) gf->scratch;

    if ((0xe0 & h->prim_poly) == 0) {
      SET_FUNCTION(gf,multiply,w32,gf_w8_clm_multiply_2)
      SET_FUNCTION(gf,multiply_region,w32,gf_w8_clm_multiply_region_from_single_2)
    } else if ((0xc0 & h->prim_poly) == 0) {
      SET_FUNCTION(gf,multiply,w32,gf_w8_clm_multiply_3)
      SET_FUNCTION(gf,multiply_region,w32,gf_w8_clm_multiply_region_from_single_3)
    } else if ((0x80 & h->prim_poly) == 0) {
      SET_FUNCTION(gf,multiply,w32,gf_w8_clm_multiply_4)
      SET_FUNCTION(gf,multiply_region,w32,gf_w8_clm_multiply_region_from_single_4)
    } else {
      return 0;
    }
    return 1;
  }
#elif defined(ARM_NEON)
  if (gf_cpu_supports_arm_neon) {
    return gf_w8_neon_cfm_init(gf);
  }
#endif

  return 0;
}

static
int gf_w8_shift_init(gf_t *gf)
{
  SET_FUNCTION(gf,multiply,w32,gf_w8_shift_multiply)  /* The others will be set automatically */
  return 1;
}

/* ------------------------------------------------------------
IMPLEMENTATION: LOG_TABLE:

JSP: Kevin wrote this, and I'm converting it to my structure.
*/

static
inline
  uint32_t
gf_w8_logzero_multiply (gf_t *gf, uint32_t a, uint32_t b)
{
  struct gf_w8_logzero_table_data *ltd;

  ltd = (struct gf_w8_logzero_table_data *) ((gf_internal_t *) gf->scratch)->private;
  return ltd->antilog_tbl[ltd->log_tbl[a] + ltd->log_tbl[b]];
}

static
inline
  uint32_t
gf_w8_logzero_divide (gf_t *gf, uint32_t a, uint32_t b)
{
  struct gf_w8_logzero_table_data *ltd;

  ltd = (struct gf_w8_logzero_table_data *) ((gf_internal_t *) gf->scratch)->private;
  return ltd->div_tbl[ltd->log_tbl[a] - ltd->log_tbl[b]];
}

static
inline
  uint32_t
gf_w8_logzero_small_multiply (gf_t *gf, uint32_t a, uint32_t b)
{
  struct gf_w8_logzero_small_table_data *std;

  std = (struct gf_w8_logzero_small_table_data *) ((gf_internal_t *) gf->scratch)->private;
  if (b == 0) return 0;
  return std->antilog_tbl[std->log_tbl[a] + std->log_tbl[b]];
}

static
inline
  uint32_t
gf_w8_logzero_small_divide (gf_t *gf, uint32_t a, uint32_t b)
{
  struct gf_w8_logzero_small_table_data *std;

  std = (struct gf_w8_logzero_small_table_data *) ((gf_internal_t *) gf->scratch)->private;
  return std->div_tbl[std->log_tbl[a] - std->log_tbl[b]];
}

static
inline
  uint32_t
gf_w8_log_multiply (gf_t *gf, uint32_t a, uint32_t b)
{
  struct gf_w8_logtable_data *ltd;

  ltd = (struct gf_w8_logtable_data *) ((gf_internal_t *) gf->scratch)->private;
  return (a == 0 || b == 0) ? 0 : ltd->antilog_tbl[(unsigned)(ltd->log_tbl[a] + ltd->log_tbl[b])];
}
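
/* The antilog table is laid out twice, end to end (see gf_w8_log_init),
   so the sum log_tbl[a] + log_tbl[b], which can reach 2*254, indexes it
   directly with no "% 255".  E.g. if log_tbl[a] == 200 and
   log_tbl[b] == 100, antilog_tbl[300] is the same entry as
   antilog_tbl[45], because 300 - 255 == 45. */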

static
inline
  uint32_t
gf_w8_log_divide (gf_t *gf, uint32_t a, uint32_t b)
{
  int log_sum = 0;
  struct gf_w8_logtable_data *ltd;

  if (a == 0 || b == 0) return 0;
  ltd = (struct gf_w8_logtable_data *) ((gf_internal_t *) gf->scratch)->private;

  log_sum = ltd->log_tbl[a] - ltd->log_tbl[b] + (GF_MULT_GROUP_SIZE);
  return (ltd->antilog_tbl[log_sum]);
}

static
  uint32_t
gf_w8_log_inverse (gf_t *gf, uint32_t a)
{
  struct gf_w8_logtable_data *ltd;

  ltd = (struct gf_w8_logtable_data *) ((gf_internal_t *) gf->scratch)->private;
  return (ltd->inv_tbl[a]);
}

static
  uint32_t
gf_w8_logzero_inverse (gf_t *gf, uint32_t a)
{
  struct gf_w8_logzero_table_data *ltd;

  ltd = (struct gf_w8_logzero_table_data *) ((gf_internal_t *) gf->scratch)->private;
  return (ltd->inv_tbl[a]);
}

static
  uint32_t
gf_w8_logzero_small_inverse (gf_t *gf, uint32_t a)
{
  struct gf_w8_logzero_small_table_data *std;

  std = (struct gf_w8_logzero_small_table_data *) ((gf_internal_t *) gf->scratch)->private;
  return (std->inv_tbl[a]);
}

static
  void
gf_w8_log_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
{
  int i;
  uint8_t lv;
  uint8_t *s8, *d8;
  struct gf_w8_logtable_data *ltd;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  ltd = (struct gf_w8_logtable_data *) ((gf_internal_t *) gf->scratch)->private;
  s8 = (uint8_t *) src;
  d8 = (uint8_t *) dest;

  lv = ltd->log_tbl[val];

  if (xor) {
    for (i = 0; i < bytes; i++) {
      d8[i] ^= (s8[i] == 0 ? 0 : ltd->antilog_tbl[lv + ltd->log_tbl[s8[i]]]);
    }
  } else {
    for (i = 0; i < bytes; i++) {
      d8[i] = (s8[i] == 0 ? 0 : ltd->antilog_tbl[lv + ltd->log_tbl[s8[i]]]);
    }
  }
}

static
  void
gf_w8_logzero_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
{
  int i;
  uint8_t lv;
  uint8_t *s8, *d8;
  struct gf_w8_logzero_table_data *ltd;
  struct gf_w8_logzero_small_table_data *std;
  short *log;
  uint8_t *alt;
  gf_internal_t *h;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  h = (gf_internal_t *) gf->scratch;

  if (h->arg1 == 1) {
    std = (struct gf_w8_logzero_small_table_data *) h->private;
    log = std->log_tbl;
    alt = std->antilog_tbl;
  } else {
    ltd = (struct gf_w8_logzero_table_data *) h->private;
    log = ltd->log_tbl;
    alt = ltd->antilog_tbl;
  }
  s8 = (uint8_t *) src;
  d8 = (uint8_t *) dest;

  lv = log[val];

  if (xor) {
    for (i = 0; i < bytes; i++) {
      d8[i] ^= (alt[lv + log[s8[i]]]);
    }
  } else {
    for (i = 0; i < bytes; i++) {
      d8[i] = (alt[lv + log[s8[i]]]);
    }
  }
}

static
int gf_w8_log_init(gf_t *gf)
{
  gf_internal_t *h;
  struct gf_w8_logtable_data *ltd = NULL;
  struct gf_w8_logzero_table_data *ztd = NULL;
  struct gf_w8_logzero_small_table_data *std = NULL;
  uint8_t *alt;
  uint8_t *inv;
  int i, b;
  int check = 0;

  h = (gf_internal_t *) gf->scratch;
  if (h->mult_type == GF_MULT_LOG_TABLE) {
    ltd = h->private;
    alt = ltd->antilog_tbl;
    inv = ltd->inv_tbl;
  } else if (h->mult_type == GF_MULT_LOG_ZERO) {
    std = h->private;
    alt = std->antilog_tbl;
    std->div_tbl = (alt + 255);
    inv = std->inv_tbl;
  } else {
    ztd = h->private;
    alt = ztd->antilog_tbl;
    ztd->inv_tbl = (alt + 512 + 256);
    ztd->div_tbl = (alt + 255);
    inv = ztd->inv_tbl;
  }

  for (i = 0; i < GF_MULT_GROUP_SIZE+1; i++) {
    if (h->mult_type == GF_MULT_LOG_TABLE)
      ltd->log_tbl[i] = 0;
    else if (h->mult_type == GF_MULT_LOG_ZERO)
      std->log_tbl[i] = 0;
    else
      ztd->log_tbl[i] = 0;
  }

  if (h->mult_type == GF_MULT_LOG_TABLE) {
    ltd->log_tbl[0] = 0;
  } else if (h->mult_type == GF_MULT_LOG_ZERO) {
    std->log_tbl[0] = 510;
  } else {
    ztd->log_tbl[0] = 512;
  }

  b = 1;
  for (i = 0; i < GF_MULT_GROUP_SIZE; i++) {
    if (h->mult_type == GF_MULT_LOG_TABLE) {
      if (ltd->log_tbl[b] != 0) check = 1;
      ltd->log_tbl[b] = i;
    } else if (h->mult_type == GF_MULT_LOG_ZERO) {
      if (std->log_tbl[b] != 0) check = 1;
      std->log_tbl[b] = i;
    } else {
      if (ztd->log_tbl[b] != 0) check = 1;
      ztd->log_tbl[b] = i;
    }
    alt[i] = b;
    alt[i+GF_MULT_GROUP_SIZE] = b;
    b <<= 1;
    if (b & GF_FIELD_SIZE) {
      b = b ^ h->prim_poly;
    }
  }
  if (check) {
    _gf_errno = GF_E_LOGPOLY;
    return 0;
  }

  if (h->mult_type == GF_MULT_LOG_ZERO) bzero(alt+510, 255);

  if (h->mult_type == GF_MULT_LOG_ZERO_EXT) {
    bzero(alt+512, 255);
    alt[512+512] = 0;
  }

  inv[0] = 0;  /* Not really, but we need to fill it with something  */
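  /* Walk the powers of two: at each assignment below, i is the element
     whose log is 255 - b, so inv[i] = alt[b] picks the element whose
     log is -log(i) mod 255, i.e. the multiplicative inverse of i. */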
  i = 1;
  b = GF_MULT_GROUP_SIZE;
  do {
    inv[i] = alt[b];
    i <<= 1;
    if (i & (1 << 8)) i ^= h->prim_poly;
    b--;
  } while (i != 1);

  if (h->mult_type == GF_MULT_LOG_TABLE) {
    SET_FUNCTION(gf,inverse,w32,gf_w8_log_inverse)
    SET_FUNCTION(gf,divide,w32,gf_w8_log_divide)
    SET_FUNCTION(gf,multiply,w32,gf_w8_log_multiply)
    SET_FUNCTION(gf,multiply_region,w32,gf_w8_log_multiply_region)
  } else if (h->mult_type == GF_MULT_LOG_ZERO) {
    SET_FUNCTION(gf,inverse,w32,gf_w8_logzero_small_inverse)
    SET_FUNCTION(gf,divide,w32,gf_w8_logzero_small_divide)
    SET_FUNCTION(gf,multiply,w32,gf_w8_logzero_small_multiply)
    SET_FUNCTION(gf,multiply_region,w32,gf_w8_logzero_multiply_region)
  } else {
    SET_FUNCTION(gf,inverse,w32,gf_w8_logzero_inverse)
    SET_FUNCTION(gf,divide,w32,gf_w8_logzero_divide)
    SET_FUNCTION(gf,multiply,w32,gf_w8_logzero_multiply)
    SET_FUNCTION(gf,multiply_region,w32,gf_w8_logzero_multiply_region)
  }
  return 1;
}

/* ------------------------------------------------------------
IMPLEMENTATION: FULL_TABLE:

JSP: Kevin wrote this, and I'm converting it to my structure.
 */

static
  gf_val_32_t
gf_w8_table_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
  struct gf_w8_single_table_data *ftd;

  ftd = (struct gf_w8_single_table_data *) ((gf_internal_t *) gf->scratch)->private;
  return (ftd->multtable[a][b]);
}

static
  gf_val_32_t
gf_w8_table_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
  struct gf_w8_single_table_data *ftd;

  ftd = (struct gf_w8_single_table_data *) ((gf_internal_t *) gf->scratch)->private;
  return (ftd->divtable[a][b]);
}

static
  gf_val_32_t
gf_w8_default_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
  struct gf_w8_default_data *ftd;

  ftd = (struct gf_w8_default_data *) ((gf_internal_t *) gf->scratch)->private;
  return (ftd->multtable[a][b]);
}

#if defined(INTEL_SSSE3) || defined(ARM_NEON)
static
  gf_val_32_t
gf_w8_default_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
  struct gf_w8_default_data *ftd;

  ftd = (struct gf_w8_default_data *) ((gf_internal_t *) gf->scratch)->private;
  return (ftd->divtable[a][b]);
}
#endif

static
  gf_val_32_t
gf_w8_double_table_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
  struct gf_w8_double_table_data *ftd;

  ftd = (struct gf_w8_double_table_data *) ((gf_internal_t *) gf->scratch)->private;
  return (ftd->mult[a][b]);
}

static
  gf_val_32_t
gf_w8_double_table_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
  struct gf_w8_double_table_data *ftd;

  ftd = (struct gf_w8_double_table_data *) ((gf_internal_t *) gf->scratch)->private;
  return (ftd->div[a][b]);
}

static
  void
gf_w8_double_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
  uint16_t *base;
  uint32_t b, c, vc, vb;
  gf_internal_t *h;
  struct gf_w8_double_table_data  *dtd;
  struct gf_w8_double_table_lazy_data  *ltd;
  gf_region_data rd;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  h = (gf_internal_t *) (gf->scratch);
  if (h->region_type & GF_REGION_LAZY) {
    ltd = (struct gf_w8_double_table_lazy_data *) h->private;
    base = ltd->mult;
    for (b = 0; b < GF_FIELD_SIZE; b++) {
      vb = (ltd->smult[val][b] << 8);
      for (c = 0; c < GF_FIELD_SIZE; c++) {
        vc = ltd->smult[val][c];
        base[(b << 8) | c] = (vb | vc);
      }
    }

  } else {
    dtd = (struct gf_w8_double_table_data *) h->private;
    base = &(dtd->mult[val][0]);
  }

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);
  gf_do_initial_region_alignment(&rd);
  gf_two_byte_region_table_multiply(&rd, base);
  gf_do_final_region_alignment(&rd);
}
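
/* Each 16-bit entry of the double table packs two products: entry
   ((b << 8) | c) holds ((val*b) << 8) | (val*c), so
   gf_two_byte_region_table_multiply can translate two source bytes
   per table lookup. */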

static
  gf_val_32_t
gf_w8_double_table_lazy_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
  struct gf_w8_double_table_lazy_data *ftd;

  ftd = (struct gf_w8_double_table_lazy_data *) ((gf_internal_t *) gf->scratch)->private;
  return (ftd->smult[a][b]);
}

static
  gf_val_32_t
gf_w8_double_table_lazy_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
  struct gf_w8_double_table_lazy_data *ftd;

  ftd = (struct gf_w8_double_table_lazy_data *) ((gf_internal_t *) gf->scratch)->private;
  return (ftd->div[a][b]);
}

static
  void
gf_w8_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
  int i;
  uint8_t *s8, *d8;
  struct gf_w8_single_table_data *ftd;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  ftd = (struct gf_w8_single_table_data *) ((gf_internal_t *) gf->scratch)->private;
  s8 = (uint8_t *) src;
  d8 = (uint8_t *) dest;

  if (xor) {
    for (i = 0; i < bytes; i++) {
      d8[i] ^= ftd->multtable[s8[i]][val];
    }
  } else {
    for (i = 0; i < bytes; i++) {
      d8[i] = ftd->multtable[s8[i]][val];
    }
  }
}

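/* SPLIT multiplication keeps two 16-entry tables per value: "low" for
   products with the low nibble of the source byte and "high" for
   products with the high nibble, so a*b == high[b][a >> 4] ^
   low[b][a & 0xf].  The SSE version below feeds both tables to
   _mm_shuffle_epi8, which performs 16 parallel 4-bit lookups per
   instruction. */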
#ifdef INTEL_SSSE3
static
  void
gf_w8_split_multiply_region_sse(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
  uint8_t *bh, *bl, *sptr, *dptr;
  __m128i  loset, t1, r, va, mth, mtl;
  struct gf_w8_half_table_data *htd;
  gf_region_data rd;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  htd = (struct gf_w8_half_table_data *) ((gf_internal_t *) (gf->scratch))->private;

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
  gf_do_initial_region_alignment(&rd);

  bh = (uint8_t *) htd->high;
  bh += (val << 4);
  bl = (uint8_t *) htd->low;
  bl += (val << 4);

  sptr = rd.s_start;
  dptr = rd.d_start;

  mth = _mm_loadu_si128 ((__m128i *)(bh));
  mtl = _mm_loadu_si128 ((__m128i *)(bl));
  loset = _mm_set1_epi8 (0x0f);

  if (xor) {
    while (sptr < (uint8_t *) rd.s_top) {
      va = _mm_load_si128 ((__m128i *)(sptr));
      t1 = _mm_and_si128 (loset, va);
      r = _mm_shuffle_epi8 (mtl, t1);
      va = _mm_srli_epi64 (va, 4);
      t1 = _mm_and_si128 (loset, va);
      r = _mm_xor_si128 (r, _mm_shuffle_epi8 (mth, t1));
      va = _mm_load_si128 ((__m128i *)(dptr));
      r = _mm_xor_si128 (r, va);
      _mm_store_si128 ((__m128i *)(dptr), r);
      dptr += 16;
      sptr += 16;
    }
  } else {
    while (sptr < (uint8_t *) rd.s_top) {
      va = _mm_load_si128 ((__m128i *)(sptr));
      t1 = _mm_and_si128 (loset, va);
      r = _mm_shuffle_epi8 (mtl, t1);
      va = _mm_srli_epi64 (va, 4);
      t1 = _mm_and_si128 (loset, va);
      r = _mm_xor_si128 (r, _mm_shuffle_epi8 (mth, t1));
      _mm_store_si128 ((__m128i *)(dptr), r);
      dptr += 16;
      sptr += 16;
    }
  }

  gf_do_final_region_alignment(&rd);
}
#endif

/* ------------------------------------------------------------
IMPLEMENTATION: SPLIT TABLE:
 */

static
  gf_val_32_t
gf_w8_split_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
  struct gf_w8_half_table_data *htd;
  htd = (struct gf_w8_half_table_data *) ((gf_internal_t *) gf->scratch)->private;

  return htd->high[b][a>>4] ^ htd->low[b][a&0xf];
}

static
  void
gf_w8_split_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
  int i;
  uint8_t *s8, *d8;
  struct gf_w8_half_table_data *htd;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  htd = (struct gf_w8_half_table_data *) ((gf_internal_t *) gf->scratch)->private;
  s8 = (uint8_t *) src;
  d8 = (uint8_t *) dest;

  if (xor) {
    for (i = 0; i < bytes; i++) {
      d8[i] ^= (htd->high[val][s8[i]>>4] ^ htd->low[val][s8[i]&0xf]);
    }
  } else {
    for (i = 0; i < bytes; i++) {
      d8[i] = (htd->high[val][s8[i]>>4] ^ htd->low[val][s8[i]&0xf]);
    }
  }
}


static
int gf_w8_split_init(gf_t *gf)
{
  gf_internal_t *h;
  struct gf_w8_half_table_data *htd;
  int a, b;

  h = (gf_internal_t *) gf->scratch;
  htd = (struct gf_w8_half_table_data *)h->private;

  bzero(htd->high, sizeof(uint8_t)*GF_FIELD_SIZE*GF_HALF_SIZE);
  bzero(htd->low, sizeof(uint8_t)*GF_FIELD_SIZE*GF_HALF_SIZE);

  for (a = 1; a < GF_FIELD_SIZE; a++) {
    for (b = 1; b < GF_HALF_SIZE; b++) {
      htd->low[a][b] = gf_w8_shift_multiply(gf,a,b);
      htd->high[a][b] = gf_w8_shift_multiply(gf,a,b<<4);
    }
  }

  SET_FUNCTION(gf,multiply,w32,gf_w8_split_multiply)

  #if defined(INTEL_SSSE3)
    if (gf_cpu_supports_intel_ssse3 && !(h->region_type & GF_REGION_NOSIMD)) {
      SET_FUNCTION(gf,multiply_region,w32,gf_w8_split_multiply_region_sse)
    } else {
  #elif defined(ARM_NEON)
    if (gf_cpu_supports_arm_neon && !(h->region_type & GF_REGION_NOSIMD)) {
      gf_w8_neon_split_init(gf);
    } else {
  #endif
    SET_FUNCTION(gf,multiply_region,w32,gf_w8_split_multiply_region)
    if (h->region_type & GF_REGION_SIMD)
      return 0;
  #if defined(INTEL_SSSE3) || defined(ARM_NEON)
    }
  #endif

  return 1;
}

/* JSP: This is disgusting, but it is what it is.  If there is no SSE,
   then the default is equivalent to single table.  If there is SSE, then
   we use the "gf_w8_default_data" which is a hybrid of SPLIT & TABLE. */

static
int gf_w8_table_init(gf_t *gf)
{
  gf_internal_t *h;
  struct gf_w8_single_table_data *ftd = NULL;
  struct gf_w8_double_table_data *dtd = NULL;
  struct gf_w8_double_table_lazy_data *ltd = NULL;
  struct gf_w8_default_data *dd = NULL;
  int a, b, c, prod, scase;

  h = (gf_internal_t *) gf->scratch;

  if (h->mult_type == GF_MULT_DEFAULT &&
      (gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon)) {
    dd = (struct gf_w8_default_data *)h->private;
    scase = 3;
    bzero(dd->high, sizeof(uint8_t) * GF_FIELD_SIZE * GF_HALF_SIZE);
    bzero(dd->low, sizeof(uint8_t) * GF_FIELD_SIZE * GF_HALF_SIZE);
    bzero(dd->divtable, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
    bzero(dd->multtable, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
  } else if (h->mult_type == GF_MULT_DEFAULT ||
             h->region_type == 0 || (h->region_type & GF_REGION_CAUCHY)) {
    ftd = (struct gf_w8_single_table_data *)h->private;
    bzero(ftd->divtable, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
    bzero(ftd->multtable, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
    scase = 0;
  } else if (h->region_type == GF_REGION_DOUBLE_TABLE) {
    dtd = (struct gf_w8_double_table_data *)h->private;
    bzero(dtd->div, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
    bzero(dtd->mult, sizeof(uint16_t) * GF_FIELD_SIZE * GF_FIELD_SIZE * GF_FIELD_SIZE);
    scase = 1;
  } else if (h->region_type == (GF_REGION_DOUBLE_TABLE | GF_REGION_LAZY)) {
    ltd = (struct gf_w8_double_table_lazy_data *)h->private;
    bzero(ltd->div, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
    bzero(ltd->smult, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
    scase = 2;
  } else {
    fprintf(stderr, "Internal error in gf_w8_table_init\n");
    assert(0);
  }

  for (a = 1; a < GF_FIELD_SIZE; a++) {
    for (b = 1; b < GF_FIELD_SIZE; b++) {
      prod = gf_w8_shift_multiply(gf,a,b);
      switch (scase) {
        case 0:
          ftd->multtable[a][b] = prod;
          ftd->divtable[prod][b] = a;
          break;
        case 1:
          dtd->div[prod][b] = a;
          for (c = 0; c < GF_FIELD_SIZE; c++) {
            dtd->mult[a][(c<<8)|b] |= prod;
            dtd->mult[a][(b<<8)|c] |= (prod<<8);
          }
          break;
        case 2:
          ltd->div[prod][b] = a;
          ltd->smult[a][b] = prod;
          break;
        case 3:
          dd->multtable[a][b] = prod;
          dd->divtable[prod][b] = a;
          if ((b & 0xf) == b) { dd->low[a][b] = prod; }
          if ((b & 0xf0) == b) { dd->high[a][b>>4] = prod; }
          break;
      }
    }
  }

  SET_FUNCTION(gf,inverse,w32,NULL) /* Will set from divide */
  switch (scase) {
    case 0:
      SET_FUNCTION(gf,divide,w32,gf_w8_table_divide)
      SET_FUNCTION(gf,multiply,w32,gf_w8_table_multiply)
      SET_FUNCTION(gf,multiply_region,w32,gf_w8_table_multiply_region)
      break;
    case 1:
      SET_FUNCTION(gf,divide,w32,gf_w8_double_table_divide)
      SET_FUNCTION(gf,multiply,w32,gf_w8_double_table_multiply)
      SET_FUNCTION(gf,multiply_region,w32,gf_w8_double_table_multiply_region)
      break;
    case 2:
      SET_FUNCTION(gf,divide,w32,gf_w8_double_table_lazy_divide)
      SET_FUNCTION(gf,multiply,w32,gf_w8_double_table_lazy_multiply)
      SET_FUNCTION(gf,multiply_region,w32,gf_w8_double_table_multiply_region)
      break;
    case 3:
#if defined(INTEL_SSSE3) || defined(ARM_NEON)
      if (gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon) {
        SET_FUNCTION(gf,divide,w32,gf_w8_default_divide)
        SET_FUNCTION(gf,multiply,w32,gf_w8_default_multiply)
#if defined(INTEL_SSSE3)
        if (gf_cpu_supports_intel_ssse3) {
          SET_FUNCTION(gf,multiply_region,w32,gf_w8_split_multiply_region_sse)
        }
#elif defined(ARM_NEON)
        if (gf_cpu_supports_arm_neon) {
          gf_w8_neon_split_init(gf);
        }
#endif
      }
#endif
      break;
  }
  return 1;
}

static
  void
gf_w8_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
  gf_internal_t *h = (gf_internal_t *) gf->scratch;
  gf_t *base_gf = h->base_gf;
  uint8_t val0 = val & 0x0f;
  uint8_t val1 = (val & 0xf0) >> 4;
  gf_region_data rd;
  int sub_reg_size;

  if (val == 0) {
    if (xor) return;
    bzero(dest, bytes);
    return;
  }

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32);
  gf_do_initial_region_alignment(&rd);

  sub_reg_size = ((uint8_t *)rd.d_top - (uint8_t *)rd.d_start) / 2;

  base_gf->multiply_region.w32(base_gf, rd.s_start, rd.d_start, val0, sub_reg_size, xor);
  base_gf->multiply_region.w32(base_gf, (uint8_t *)rd.s_start+sub_reg_size, rd.d_start, val1, sub_reg_size, 1);
  base_gf->multiply_region.w32(base_gf, rd.s_start, (uint8_t *)rd.d_start+sub_reg_size, val1, sub_reg_size, xor);
  base_gf->multiply_region.w32(base_gf, (uint8_t *)rd.s_start+sub_reg_size, (uint8_t *)rd.d_start+sub_reg_size, val0, sub_reg_size, 1);
  base_gf->multiply_region.w32(base_gf, (uint8_t *)rd.s_start+sub_reg_size, (uint8_t *)rd.d_start+sub_reg_size, base_gf->multiply.w32(base_gf, h->prim_poly, val1), sub_reg_size, 1);

  gf_do_final_region_alignment(&rd);
}
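
/* Here GF(2^8) is treated as GF((2^4)^2): an element is a1*x + a0 with
   nibbles in the base field, reduced by p(x) = x^2 + s*x + 1, where
   s = h->prim_poly.  Multiplying out and substituting x^2 = s*x + 1
   gives
     (a1*x + a0)(b1*x + b0) =
         (a1*b0 ^ a0*b1 ^ s*a1*b1)*x + (a0*b0 ^ a1*b1),
   which is exactly what the recursive and inline versions below
   compute. */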

static
gf_val_32_t
gf_w8_composite_multiply_recursive(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
  gf_internal_t *h = (gf_internal_t *) gf->scratch;
  gf_t *base_gf = h->base_gf;
  uint8_t b0 = b & 0x0f;
  uint8_t b1 = (b & 0xf0) >> 4;
  uint8_t a0 = a & 0x0f;
  uint8_t a1 = (a & 0xf0) >> 4;
  uint8_t a1b1;

  a1b1 = base_gf->multiply.w32(base_gf, a1, b1);

  return ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) |
          ((base_gf->multiply.w32(base_gf, a1, b0) ^
           base_gf->multiply.w32(base_gf, a0, b1) ^
           base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 4));
}

static
gf_val_32_t
gf_w8_composite_multiply_inline(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
  gf_internal_t *h = (gf_internal_t *) gf->scratch;
  uint8_t b0 = b & 0x0f;
  uint8_t b1 = (b & 0xf0) >> 4;
  uint8_t a0 = a & 0x0f;
  uint8_t a1 = (a & 0xf0) >> 4;
  uint8_t a1b1, *mt;
  struct gf_w8_composite_data *cd;

  cd = (struct gf_w8_composite_data *) h->private;
  mt = cd->mult_table;

  a1b1 = GF_W4_INLINE_MULTDIV(mt, a1, b1);

  return ((GF_W4_INLINE_MULTDIV(mt, a0, b0) ^ a1b1) |
          ((GF_W4_INLINE_MULTDIV(mt, a1, b0) ^
           GF_W4_INLINE_MULTDIV(mt, a0, b1) ^
           GF_W4_INLINE_MULTDIV(mt, a1b1, h->prim_poly)) << 4));
}

/*
 * Composite field division trick (explained in 2007 tech report)
 *
 * Compute a / b = a*b^-1, where p(x) = x^2 + sx + 1
 *
 * let c = b^-1
 *
 * c*b = (s*b1c1+b1c0+b0c1)x+(b1c1+b0c0)
 *
 * want (s*b1c1+b1c0+b0c1) = 0 and (b1c1+b0c0) = 1
 *
 * let d = b1c1 and d+1 = b0c0
 *
 * solve s*b1c1+b1c0+b0c1 = 0
 *
 * solution: d = (b1b0^-1)(b1b0^-1+b0b1^-1+s)^-1
 *
 * c0 = (d+1)b0^-1
 * c1 = d*b1^-1
 *
 * a / b = a * c
 */

static
gf_val_32_t
gf_w8_composite_inverse(gf_t *gf, gf_val_32_t a)
{
  gf_internal_t *h = (gf_internal_t *) gf->scratch;
  gf_t *base_gf = h->base_gf;
  uint8_t a0 = a & 0x0f;
  uint8_t a1 = (a & 0xf0) >> 4;
  uint8_t c0, c1, c, d, tmp;
  uint8_t a0inv, a1inv;

  if (a0 == 0) {
    a1inv = base_gf->inverse.w32(base_gf, a1) & 0xf;
    c0 = base_gf->multiply.w32(base_gf, a1inv, h->prim_poly);
    c1 = a1inv;
  } else if (a1 == 0) {
    c0 = base_gf->inverse.w32(base_gf, a0);
    c1 = 0;
  } else {
    a1inv = base_gf->inverse.w32(base_gf, a1) & 0xf;
    a0inv = base_gf->inverse.w32(base_gf, a0) & 0xf;

    d = base_gf->multiply.w32(base_gf, a1, a0inv) & 0xf;

    tmp = (base_gf->multiply.w32(base_gf, a1, a0inv) ^ base_gf->multiply.w32(base_gf, a0, a1inv) ^ h->prim_poly) & 0xf;
    tmp = base_gf->inverse.w32(base_gf, tmp) & 0xf;

    d = base_gf->multiply.w32(base_gf, d, tmp) & 0xf;

    c0 = base_gf->multiply.w32(base_gf, (d^1), a0inv) & 0xf;
    c1 = base_gf->multiply.w32(base_gf, d, a1inv) & 0xf;
  }

  c = c0 | (c1 << 4);

  return c;
}

static
void
gf_w8_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
  gf_region_data rd;
  gf_internal_t *h = (gf_internal_t *) gf->scratch;
  gf_t *base_gf = h->base_gf;
  uint8_t b0 = val & 0x0f;
  uint8_t b1 = (val & 0xf0) >> 4;
  uint8_t *s8;
  uint8_t *d8;
  uint8_t *mt;
  uint8_t a0, a1, a1b1;
  struct gf_w8_composite_data *cd;

  cd = (struct gf_w8_composite_data *) h->private;

  if (val == 0) {
    if (xor) return;
    bzero(dest, bytes);
    return;
  }

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1);
  gf_do_initial_region_alignment(&rd);

  s8 = (uint8_t *) rd.s_start;
  d8 = (uint8_t *) rd.d_start;

  mt = cd->mult_table;
  if (mt == NULL) {
    if (xor) {
      while (d8 < (uint8_t *) rd.d_top) {
        a0 = *s8 & 0x0f;
        a1 = (*s8 & 0xf0) >> 4;
        a1b1 = base_gf->multiply.w32(base_gf, a1, b1);

        *d8 ^= ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) |
               ((base_gf->multiply.w32(base_gf, a1, b0) ^
                 base_gf->multiply.w32(base_gf, a0, b1) ^
                 base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 4));
        s8++;
        d8++;
      }
    } else {
      while (d8 < (uint8_t *) rd.d_top) {
        a0 = *s8 & 0x0f;
        a1 = (*s8 & 0xf0) >> 4;
        a1b1 = base_gf->multiply.w32(base_gf, a1, b1);

        *d8 = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) |
              ((base_gf->multiply.w32(base_gf, a1, b0) ^
                base_gf->multiply.w32(base_gf, a0, b1) ^
                base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 4));
        s8++;
        d8++;
      }
    }
  } else {
    if (xor) {
      while (d8 < (uint8_t *) rd.d_top) {
        a0 = *s8 & 0x0f;
        a1 = (*s8 & 0xf0) >> 4;
        a1b1 = GF_W4_INLINE_MULTDIV(mt, a1, b1);

        *d8 ^= ((GF_W4_INLINE_MULTDIV(mt, a0, b0) ^ a1b1) |
               ((GF_W4_INLINE_MULTDIV(mt, a1, b0) ^
                 GF_W4_INLINE_MULTDIV(mt, a0, b1) ^
                 GF_W4_INLINE_MULTDIV(mt, a1b1, h->prim_poly)) << 4));
        s8++;
        d8++;
      }
    } else {
      while (d8 < (uint8_t *) rd.d_top) {
        a0 = *s8 & 0x0f;
        a1 = (*s8 & 0xf0) >> 4;
        a1b1 = GF_W4_INLINE_MULTDIV(mt, a1, b1);

        *d8 = ((GF_W4_INLINE_MULTDIV(mt, a0, b0) ^ a1b1) |
              ((GF_W4_INLINE_MULTDIV(mt, a1, b0) ^
                GF_W4_INLINE_MULTDIV(mt, a0, b1) ^
                GF_W4_INLINE_MULTDIV(mt, a1b1, h->prim_poly)) << 4));
        s8++;
        d8++;
      }
    }
  }
  gf_do_final_region_alignment(&rd);
  return;
}

static
int gf_w8_composite_init(gf_t *gf)
{
  gf_internal_t *h = (gf_internal_t *) gf->scratch;
  struct gf_w8_composite_data *cd;

  if (h->base_gf == NULL) return 0;

  cd = (struct gf_w8_composite_data *) h->private;
  cd->mult_table = gf_w4_get_mult_table(h->base_gf);

  if (h->region_type & GF_REGION_ALTMAP) {
    SET_FUNCTION(gf,multiply_region,w32,gf_w8_composite_multiply_region_alt)
  } else {
    SET_FUNCTION(gf,multiply_region,w32,gf_w8_composite_multiply_region)
  }

  if (cd->mult_table == NULL) {
    SET_FUNCTION(gf,multiply,w32,gf_w8_composite_multiply_recursive)
  } else {
    SET_FUNCTION(gf,multiply,w32,gf_w8_composite_multiply_inline)
  }
  SET_FUNCTION(gf,divide,w32,NULL)
  SET_FUNCTION(gf,inverse,w32,gf_w8_composite_inverse)

  return 1;
}

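/* ------------------------------------------------------------
IMPLEMENTATION: BYTWO_p and BYTWO_b:

BYTWO_p scans the bits of the multiplier a from the most significant
end, doubling the accumulated product at every step (Horner's rule);
BYTWO_b instead doubles b itself while scanning a from the least
significant bit.  The region versions apply the same doubling to 8 or
16 bytes at a time using the AB2/SSE_AB2 macros above.
 */
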
static
inline
  gf_val_32_t
gf_w8_bytwo_p_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
  uint32_t prod, pp, pmask, amask;
  gf_internal_t *h;

  h = (gf_internal_t *) gf->scratch;
  pp = h->prim_poly;

  prod = 0;
  pmask = 0x80;
  amask = 0x80;

  while (amask != 0) {
    if (prod & pmask) {
      prod = ((prod << 1) ^ pp);
    } else {
      prod <<= 1;
    }
    if (a & amask) prod ^= b;
    amask >>= 1;
  }
  return prod;
}

static
inline
  gf_val_32_t
gf_w8_bytwo_b_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
  uint32_t prod, pp, bmask;
  gf_internal_t *h;

  h = (gf_internal_t *) gf->scratch;
  pp = h->prim_poly;

  prod = 0;
  bmask = 0x80;

  while (1) {
    if (a & 1) prod ^= b;
    a >>= 1;
    if (a == 0) return prod;
    if (b & bmask) {
      b = ((b << 1) ^ pp);
    } else {
      b <<= 1;
    }
  }
}

static
  void
gf_w8_bytwo_p_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
  uint64_t *s64, *d64, t1, t2, ta, prod, amask;
  gf_region_data rd;
  struct gf_w8_bytwo_data *btd;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  btd = (struct gf_w8_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);
  gf_do_initial_region_alignment(&rd);

  s64 = (uint64_t *) rd.s_start;
  d64 = (uint64_t *) rd.d_start;

  if (xor) {
    while (s64 < (uint64_t *) rd.s_top) {
      prod = 0;
      amask = 0x80;
      ta = *s64;
      while (amask != 0) {
        AB2(btd->prim_poly, btd->mask1, btd->mask2, prod, t1, t2);
        if (val & amask) prod ^= ta;
        amask >>= 1;
      }
      *d64 ^= prod;
      d64++;
      s64++;
    }
  } else {
    while (s64 < (uint64_t *) rd.s_top) {
      prod = 0;
      amask = 0x80;
      ta = *s64;
      while (amask != 0) {
        AB2(btd->prim_poly, btd->mask1, btd->mask2, prod, t1, t2);
        if (val & amask) prod ^= ta;
        amask >>= 1;
      }
      *d64 = prod;
      d64++;
      s64++;
    }
  }
  gf_do_final_region_alignment(&rd);
}

#define BYTWO_P_ONESTEP {\
  SSE_AB2(pp, m1, m2, prod, t1, t2); \
  t1 = _mm_and_si128(v, one); \
  t1 = _mm_sub_epi8(t1, one); \
  t1 = _mm_and_si128(t1, ta); \
  prod = _mm_xor_si128(prod, t1); \
  v = _mm_srli_epi64(v, 1); }
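
/* One BYTWO_p step across 16 bytes: double prod, then conditionally XOR
   in ta.  Per byte, (v & 1) - 1 is 0x00 when the low bit of v is set
   and 0xFF when it is clear, so prod absorbs ta only for clear bits.
   That is why the caller loads v with vrev, the bit-reversed complement
   of val (built below): the eight steps then select ta exactly for the
   set bits of val, most significant bit first. */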
1607 	
1608 	#ifdef INTEL_SSE2
1609 	static
1610 	  void 
1611 	gf_w8_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
1612 	{
1613 	  int i;
1614 	  uint8_t *s8, *d8;
1615 	  uint8_t vrev;
1616 	  __m128i pp, m1, m2, ta, prod, t1, t2, tp, one, v;
1617 	  struct gf_w8_bytwo_data *btd;
1618 	  gf_region_data rd;
1619 	
1620 	  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
1621 	  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
1622 	
1623 	  btd = (struct gf_w8_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;
1624 	
1625 	  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
1626 	  gf_do_initial_region_alignment(&rd);
1627 	
  vrev = 0;
  for (i = 0; i < 8; i++) {
    vrev <<= 1;
    if (!(val & (1 << i))) vrev |= 1;
  }

  s8 = (uint8_t *) rd.s_start;
  d8 = (uint8_t *) rd.d_start;

  pp = _mm_set1_epi8(btd->prim_poly & 0xff);
  m1 = _mm_set1_epi8((btd->mask1) & 0xff);
  m2 = _mm_set1_epi8((btd->mask2) & 0xff);
  one = _mm_set1_epi8(1);

  while (d8 < (uint8_t *) rd.d_top) {
    prod = _mm_setzero_si128();
    v = _mm_set1_epi8(vrev);
    ta = _mm_load_si128((__m128i *) s8);
    tp = (!xor) ? _mm_setzero_si128() : _mm_load_si128((__m128i *) d8);
    BYTWO_P_ONESTEP;
    BYTWO_P_ONESTEP;
    BYTWO_P_ONESTEP;
    BYTWO_P_ONESTEP;
    BYTWO_P_ONESTEP;
    BYTWO_P_ONESTEP;
    BYTWO_P_ONESTEP;
    BYTWO_P_ONESTEP;
    _mm_store_si128((__m128i *) d8, _mm_xor_si128(prod, tp));
    d8 += 16;
    s8 += 16;
  }
  gf_do_final_region_alignment(&rd);
}
#endif

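/* Multiplication by 2 is a single doubling, so the val == 2 case gets its
   own pair of loops: one SSE_AB2 per 16-byte vector, with and without the
   XOR accumulate. */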
#ifdef INTEL_SSE2
static
void
gf_w8_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_w8_bytwo_data *btd)
{
  uint8_t *d8, *s8;
  __m128i pp, m1, m2, t1, t2, va;

  s8 = (uint8_t *) rd->s_start;
  d8 = (uint8_t *) rd->d_start;

  pp = _mm_set1_epi8(btd->prim_poly & 0xff);
  m1 = _mm_set1_epi8((btd->mask1) & 0xff);
  m2 = _mm_set1_epi8((btd->mask2) & 0xff);

  while (d8 < (uint8_t *) rd->d_top) {
    va = _mm_load_si128 ((__m128i *)(s8));
    SSE_AB2(pp, m1, m2, va, t1, t2);
    _mm_store_si128((__m128i *)d8, va);
    d8 += 16;
    s8 += 16;
  }
}
#endif

#ifdef INTEL_SSE2
static
void
gf_w8_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_w8_bytwo_data *btd)
{
  uint8_t *d8, *s8;
  __m128i pp, m1, m2, t1, t2, va, vb;

  s8 = (uint8_t *) rd->s_start;
  d8 = (uint8_t *) rd->d_start;

  pp = _mm_set1_epi8(btd->prim_poly & 0xff);
  m1 = _mm_set1_epi8((btd->mask1) & 0xff);
  m2 = _mm_set1_epi8((btd->mask2) & 0xff);

  while (d8 < (uint8_t *) rd->d_top) {
    va = _mm_load_si128 ((__m128i *)(s8));
    SSE_AB2(pp, m1, m2, va, t1, t2);
    vb = _mm_load_si128 ((__m128i *)(d8));
    vb = _mm_xor_si128(vb, va);
    _mm_store_si128((__m128i *)d8, vb);
    d8 += 16;
    s8 += 16;
  }
}
#endif

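/* General SSE by-two-b region multiply.  val == 2 is dispatched to the
   specialized loops above; every other val falls through to a loop that
   re-walks the bits of val for each 16-byte vector. */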
#ifdef INTEL_SSE2
static
void
gf_w8_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
  int itb;
  uint8_t *d8, *s8;
  __m128i pp, m1, m2, t1, t2, va, vb;
  struct gf_w8_bytwo_data *btd;
  gf_region_data rd;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
  gf_do_initial_region_alignment(&rd);

  btd = (struct gf_w8_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;

  if (val == 2) {
    if (xor) {
      gf_w8_bytwo_b_sse_region_2_xor(&rd, btd);
    } else {
      gf_w8_bytwo_b_sse_region_2_noxor(&rd, btd);
    }
    gf_do_final_region_alignment(&rd);
    return;
  }

  s8 = (uint8_t *) rd.s_start;
  d8 = (uint8_t *) rd.d_start;

  pp = _mm_set1_epi8(btd->prim_poly & 0xff);
  m1 = _mm_set1_epi8((btd->mask1) & 0xff);
  m2 = _mm_set1_epi8((btd->mask2) & 0xff);

  while (d8 < (uint8_t *) rd.d_top) {
    va = _mm_load_si128 ((__m128i *)(s8));
    vb = (!xor) ? _mm_setzero_si128() : _mm_load_si128 ((__m128i *)(d8));
    itb = val;
    while (1) {
      if (itb & 1) vb = _mm_xor_si128(vb, va);
      itb >>= 1;
      if (itb == 0) break;
      SSE_AB2(pp, m1, m2, va, t1, t2);
    }
    _mm_store_si128((__m128i *)d8, vb);
    d8 += 16;
    s8 += 16;
  }

  gf_do_final_region_alignment(&rd);
}
#endif

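/* Non-SSE by-two-b region multiply.  Small constants get unrolled cases
   that express val as a fixed sequence of doublings (AB2) and XORs of
   intermediate products; e.g. case 5 computes x*4 ^ x and case 6 computes
   x*4 ^ x*2.  The commented-out cases (7, 9-15) are handled by the generic
   bit-walking loop in the default case. */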
static
void
gf_w8_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
  uint64_t *s64, *d64, t1, t2, ta, tb, prod;
  struct gf_w8_bytwo_data *btd;
  gf_region_data rd;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
  gf_do_initial_region_alignment(&rd);

  btd = (struct gf_w8_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;
  s64 = (uint64_t *) rd.s_start;
  d64 = (uint64_t *) rd.d_start;

  switch (val) {
    case 2:
      if (xor) {
        while (d64 < (uint64_t *) rd.d_top) {
          ta = *s64;
          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
          *d64 ^= ta;
          d64++;
          s64++;
        }
      } else {
        while (d64 < (uint64_t *) rd.d_top) {
          ta = *s64;
          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
          *d64 = ta;
          d64++;
          s64++;
        }
      }
      break;
    case 3:
      if (xor) {
        while (d64 < (uint64_t *) rd.d_top) {
          ta = *s64;
          prod = ta;
          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
          *d64 ^= (ta ^ prod);
          d64++;
          s64++;
        }
      } else {
        while (d64 < (uint64_t *) rd.d_top) {
          ta = *s64;
          prod = ta;
          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
          *d64 = (ta ^ prod);
          d64++;
          s64++;
        }
      }
      break;
    case 4:
      if (xor) {
        while (d64 < (uint64_t *) rd.d_top) {
          ta = *s64;
          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
          *d64 ^= ta;
          d64++;
          s64++;
        }
      } else {
        while (d64 < (uint64_t *) rd.d_top) {
          ta = *s64;
          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
          *d64 = ta;
          d64++;
          s64++;
        }
      }
      break;
    case 5:
      if (xor) {
        while (d64 < (uint64_t *) rd.d_top) {
          ta = *s64;
          prod = ta;
          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
          *d64 ^= (ta ^ prod);
          d64++;
          s64++;
        }
      } else {
        while (d64 < (uint64_t *) rd.d_top) {
          ta = *s64;
          prod = ta;
          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
          *d64 = (ta ^ prod);
          d64++;
          s64++;
        }
      }
      break;
    case 6:
      if (xor) {
        while (d64 < (uint64_t *) rd.d_top) {
          ta = *s64;
          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
          prod = ta;
          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
          *d64 ^= (ta ^ prod);
          d64++;
          s64++;
        }
      } else {
        while (d64 < (uint64_t *) rd.d_top) {
          ta = *s64;
          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
          prod = ta;
          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
          *d64 = (ta ^ prod);
          d64++;
          s64++;
        }
      }
      break;
      /*
      case 7:
        if (xor) {
          while (d64 < (uint64_t *) rd.d_top) {
            ta = *s64;
            prod = ta;
            AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
            prod ^= ta;
            AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
            *d64 ^= (ta ^ prod);
            d64++;
            s64++;
          }
        } else {
          while (d64 < (uint64_t *) rd.d_top) {
            ta = *s64;
            prod = ta;
            AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
            prod ^= ta;
            AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
            *d64 = (ta ^ prod);
            d64++;
            s64++;
          }
        }
        break;
      */
    case 8:
      if (xor) {
        while (d64 < (uint64_t *) rd.d_top) {
          ta = *s64;
          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
          *d64 ^= ta;
          d64++;
          s64++;
        }
      } else {
        while (d64 < (uint64_t *) rd.d_top) {
          ta = *s64;
          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
          *d64 = ta;
          d64++;
          s64++;
        }
      }
      break;
      /*
      case 9:
        if (xor) {
          while (d64 < (uint64_t *) rd.d_top) {
            ta = *s64;
            prod = ta;
            AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
            AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
            AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
            *d64 ^= (ta ^ prod);
            d64++;
            s64++;
          }
        } else {
          while (d64 < (uint64_t *) rd.d_top) {
            ta = *s64;
            prod = ta;
            AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
            AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
            AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
            *d64 = (ta ^ prod);
            d64++;
            s64++;
          }
        }
        break;
      case 10:
        if (xor) {
          while (d64 < (uint64_t *) rd.d_top) {
            ta = *s64;
            AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
            prod = ta;
            AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
            AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
            *d64 ^= (ta ^ prod);
            d64++;
            s64++;
          }
        } else {
          while (d64 < (uint64_t *) rd.d_top) {
            ta = *s64;
            AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
            prod = ta;
            AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
            AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
            *d64 = (ta ^ prod);
            d64++;
            s64++;
          }
        }
        break;
      case 11:
        if (xor) {
          while (d64 < (uint64_t *) rd.d_top) {
            ta = *s64;
            prod = ta;
            AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
            prod ^= ta;
            AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
            AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
            *d64 ^= (ta ^ prod);
            d64++;
            s64++;
          }
        } else {
          while (d64 < (uint64_t *) rd.d_top) {
            ta = *s64;
            prod = ta;
            AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
            prod ^= ta;
            AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
            AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
            *d64 = (ta ^ prod);
            d64++;
            s64++;
          }
        }
        break;
      case 12:
        if (xor) {
          while (d64 < (uint64_t *) rd.d_top) {
            ta = *s64;
            AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
            AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
            prod = ta;
            AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
            *d64 ^= (ta ^ prod);
            d64++;
            s64++;
          }
        } else {
          while (d64 < (uint64_t *) rd.d_top) {
            ta = *s64;
            AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
            AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
            prod = ta;
            AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
            *d64 = (ta ^ prod);
            d64++;
            s64++;
          }
        }
        break;
      case 13:
        if (xor) {
          while (d64 < (uint64_t *) rd.d_top) {
            ta = *s64;
            prod = ta;
            AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
            AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
            prod ^= ta;
            AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
            *d64 ^= (ta ^ prod);
            d64++;
            s64++;
          }
        } else {
          while (d64 < (uint64_t *) rd.d_top) {
            ta = *s64;
            prod = ta;
            AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
            AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
            prod ^= ta;
            AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
            *d64 = (ta ^ prod);
            d64++;
            s64++;
          }
        }
        break;
      case 14:
        if (xor) {
          while (d64 < (uint64_t *) rd.d_top) {
            ta = *s64;
            AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
            prod = ta;
            AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
            prod ^= ta;
            AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
            *d64 ^= (ta ^ prod);
            d64++;
            s64++;
          }
        } else {
          while (d64 < (uint64_t *) rd.d_top) {
            ta = *s64;
            AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
            prod = ta;
            AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
            prod ^= ta;
            AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
            *d64 = (ta ^ prod);
            d64++;
            s64++;
          }
        }
        break;
      case 15:
        if (xor) {
          while (d64 < (uint64_t *) rd.d_top) {
            ta = *s64;
            prod = ta;
            AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
            prod ^= ta;
            AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
            prod ^= ta;
            AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
            *d64 ^= (ta ^ prod);
            d64++;
            s64++;
          }
        } else {
          while (d64 < (uint64_t *) rd.d_top) {
            ta = *s64;
            prod = ta;
            AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
            prod ^= ta;
            AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
            prod ^= ta;
            AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
            *d64 = (ta ^ prod);
            d64++;
            s64++;
          }
        }
        break;
      */
    default:
      if (xor) {
        while (d64 < (uint64_t *) rd.d_top) {
          prod = *d64;
          ta = *s64;
          tb = val;
          while (1) {
            if (tb & 1) prod ^= ta;
            tb >>= 1;
            if (tb == 0) break;
            AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
          }
          *d64 = prod;
          d64++;
          s64++;
        }
      } else {
        while (d64 < (uint64_t *) rd.d_top) {
          prod = 0;
          ta = *s64;
          tb = val;
          while (1) {
            if (tb & 1) prod ^= ta;
            tb >>= 1;
            if (tb == 0) break;
            AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
          }
          *d64 = prod;
          d64++;
          s64++;
        }
      }
      break;
  }
  gf_do_final_region_alignment(&rd);
}

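/* Replicate the low byte of the primitive polynomial, along with the
   doubling masks 0xfe and 0x80, into every byte of a 64-bit word.  With
   prim_poly 0x11d this yields 0x1d1d...1d, which is what lets AB2 and
   SSE_AB2 double eight (or sixteen) field elements per operation. */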
static
int gf_w8_bytwo_init(gf_t *gf)
{
  gf_internal_t *h;
  uint64_t ip, m1, m2;
  struct gf_w8_bytwo_data *btd;

  h = (gf_internal_t *) gf->scratch;
  btd = (struct gf_w8_bytwo_data *) (h->private);
  ip = h->prim_poly & 0xff;
  m1 = 0xfe;
  m2 = 0x80;
  btd->prim_poly = 0;
  btd->mask1 = 0;
  btd->mask2 = 0;

  while (ip != 0) {
    btd->prim_poly |= ip;
    btd->mask1 |= m1;
    btd->mask2 |= m2;
    ip <<= GF_FIELD_WIDTH;
    m1 <<= GF_FIELD_WIDTH;
    m2 <<= GF_FIELD_WIDTH;
  }

  if (h->mult_type == GF_MULT_BYTWO_p) {
    SET_FUNCTION(gf,multiply,w32,gf_w8_bytwo_p_multiply)
#ifdef INTEL_SSE2
    if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
      SET_FUNCTION(gf,multiply_region,w32,gf_w8_bytwo_p_sse_multiply_region)
    } else {
#endif
      SET_FUNCTION(gf,multiply_region,w32,gf_w8_bytwo_p_nosse_multiply_region)
      if (h->region_type & GF_REGION_SIMD)
        return 0;
#ifdef INTEL_SSE2
    }
#endif
  } else {
    SET_FUNCTION(gf,multiply,w32,gf_w8_bytwo_b_multiply)
#ifdef INTEL_SSE2
    if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
      SET_FUNCTION(gf,multiply_region,w32,gf_w8_bytwo_b_sse_multiply_region)
    } else {
#endif
      SET_FUNCTION(gf,multiply_region,w32,gf_w8_bytwo_b_nosse_multiply_region)
      if (h->region_type & GF_REGION_SIMD)
        return 0;
#ifdef INTEL_SSE2
    }
#endif
  }
  return 1;
}


/* ------------------------------------------------------------
   General procedures.
   You don't need to error check here or in init, because it's done
   for you in gf_error_check().
 */

int gf_w8_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2)
{
  switch(mult_type)
  {
    case GF_MULT_DEFAULT:
      if (gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon) {
        return sizeof(gf_internal_t) + sizeof(struct gf_w8_default_data) + 64;
      }
      return sizeof(gf_internal_t) + sizeof(struct gf_w8_single_table_data) + 64;
    case GF_MULT_TABLE:
      if (region_type == GF_REGION_CAUCHY) {
        return sizeof(gf_internal_t) + sizeof(struct gf_w8_single_table_data) + 64;
      }
      if (region_type == GF_REGION_DEFAULT) {
        return sizeof(gf_internal_t) + sizeof(struct gf_w8_single_table_data) + 64;
      }
      if (region_type & GF_REGION_DOUBLE_TABLE) {
        if (region_type == GF_REGION_DOUBLE_TABLE) {
          return sizeof(gf_internal_t) + sizeof(struct gf_w8_double_table_data) + 64;
        } else if (region_type == (GF_REGION_DOUBLE_TABLE | GF_REGION_LAZY)) {
          return sizeof(gf_internal_t) + sizeof(struct gf_w8_double_table_lazy_data) + 64;
        } else {
          return 0;
        }
      }
      return 0;
    case GF_MULT_BYTWO_p:
    case GF_MULT_BYTWO_b:
      return sizeof(gf_internal_t) + sizeof(struct gf_w8_bytwo_data);
    case GF_MULT_SPLIT_TABLE:
      if ((arg1 == 4 && arg2 == 8) || (arg1 == 8 && arg2 == 4)) {
        return sizeof(gf_internal_t) + sizeof(struct gf_w8_half_table_data) + 64;
      }
      break;
    case GF_MULT_LOG_TABLE:
      return sizeof(gf_internal_t) + sizeof(struct gf_w8_logtable_data) + 64;
    case GF_MULT_LOG_ZERO:
      return sizeof(gf_internal_t) + sizeof(struct gf_w8_logzero_small_table_data) + 64;
    case GF_MULT_LOG_ZERO_EXT:
      return sizeof(gf_internal_t) + sizeof(struct gf_w8_logzero_table_data) + 64;
    case GF_MULT_CARRY_FREE:
      return sizeof(gf_internal_t);
    case GF_MULT_SHIFT:
      return sizeof(gf_internal_t);
    case GF_MULT_COMPOSITE:
      return sizeof(gf_internal_t) + sizeof(struct gf_w8_composite_data) + 64;
    default:
      return 0;
  }
  return 0;
}

int gf_w8_init(gf_t *gf)
{
  gf_internal_t *h;

  h = (gf_internal_t *) gf->scratch;

  /* Allen: set default primitive polynomial / irreducible polynomial if needed */

  if (h->prim_poly == 0) {
    if (h->mult_type == GF_MULT_COMPOSITE) {
      h->prim_poly = gf_composite_get_default_poly(h->base_gf);
      if (h->prim_poly == 0) return 0;   /* JSP: This shouldn't happen, but just in case. */
    } else {
      h->prim_poly = 0x11d;
    }
  }
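  /* Make sure the x^8 term is present: routines such as gf_w8_euclid work
     with the full degree-8 polynomial, not just its low byte. */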
  if (h->mult_type != GF_MULT_COMPOSITE) {
    h->prim_poly |= 0x100;
  }

  SET_FUNCTION(gf,multiply,w32,NULL)
  SET_FUNCTION(gf,divide,w32,NULL)
  SET_FUNCTION(gf,inverse,w32,NULL)
  SET_FUNCTION(gf,multiply_region,w32,NULL)
  SET_FUNCTION(gf,extract_word,w32,gf_w8_extract_word)

  switch(h->mult_type) {
    case GF_MULT_DEFAULT:
    case GF_MULT_TABLE:        if (gf_w8_table_init(gf) == 0) return 0; break;
    case GF_MULT_BYTWO_p:
    case GF_MULT_BYTWO_b:      if (gf_w8_bytwo_init(gf) == 0) return 0; break;
    case GF_MULT_LOG_ZERO:
    case GF_MULT_LOG_ZERO_EXT:
    case GF_MULT_LOG_TABLE:    if (gf_w8_log_init(gf) == 0) return 0; break;
    case GF_MULT_CARRY_FREE:   if (gf_w8_cfm_init(gf) == 0) return 0; break;
    case GF_MULT_SHIFT:        if (gf_w8_shift_init(gf) == 0) return 0; break;
    case GF_MULT_SPLIT_TABLE:  if (gf_w8_split_init(gf) == 0) return 0; break;
    case GF_MULT_COMPOSITE:    if (gf_w8_composite_init(gf) == 0) return 0; break;
    default: return 0;
  }

  if (h->divide_type == GF_DIVIDE_EUCLID) {
    SET_FUNCTION(gf,divide,w32,gf_w8_divide_from_inverse)
    SET_FUNCTION(gf,inverse,w32,gf_w8_euclid)
  } else if (h->divide_type == GF_DIVIDE_MATRIX) {
    SET_FUNCTION(gf,divide,w32,gf_w8_divide_from_inverse)
    SET_FUNCTION(gf,inverse,w32,gf_w8_matrix)
  }

  if (gf->divide.w32 == NULL) {
    SET_FUNCTION(gf,divide,w32,gf_w8_divide_from_inverse)
    if (gf->inverse.w32 == NULL) SET_FUNCTION(gf,inverse,w32,gf_w8_euclid)
  }

  if (gf->inverse.w32 == NULL) SET_FUNCTION(gf,inverse,w32,gf_w8_inverse_from_divide)

  if (h->mult_type == GF_MULT_COMPOSITE && (h->region_type & GF_REGION_ALTMAP)) {
    SET_FUNCTION(gf,extract_word,w32,gf_w8_composite_extract_word)
  }

  if (h->region_type == GF_REGION_CAUCHY) {
    SET_FUNCTION(gf,multiply_region,w32,gf_wgen_cauchy_region)
    SET_FUNCTION(gf,extract_word,w32,gf_wgen_extract_word)
  }

  if (gf->multiply_region.w32 == NULL) {
    SET_FUNCTION(gf,multiply_region,w32,gf_w8_multiply_region_from_single)
  }

  return 1;
}


/* Inline setup functions */

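/* The two getters below expose the raw 256 x 256 multiplication and
   division tables when the active multiply routine is table-driven, and
   NULL otherwise.  A caller might use them roughly like this (a sketch,
   assuming the row-major [256][256] layout of multtable/divtable):

     uint8_t *mt = gf_w8_get_mult_table(&gf);
     if (mt != NULL) {
       uint8_t product = mt[(a << 8) | b];   // same as mt[a*256 + b]
     }
*/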
uint8_t *gf_w8_get_mult_table(gf_t *gf)
{
  gf_internal_t *h;
  struct gf_w8_default_data *ftd;
  struct gf_w8_single_table_data *std;

  h = (gf_internal_t *) gf->scratch;
  if (gf->multiply.w32 == gf_w8_default_multiply) {
    ftd = (struct gf_w8_default_data *) h->private;
    return (uint8_t *) ftd->multtable;
  } else if (gf->multiply.w32 == gf_w8_table_multiply) {
    std = (struct gf_w8_single_table_data *) h->private;
    return (uint8_t *) std->multtable;
  }
  return NULL;
}

uint8_t *gf_w8_get_div_table(gf_t *gf)
{
  struct gf_w8_default_data *ftd;
  struct gf_w8_single_table_data *std;

  if (gf->multiply.w32 == gf_w8_default_multiply) {
    ftd = (struct gf_w8_default_data *) ((gf_internal_t *) gf->scratch)->private;
    return (uint8_t *) ftd->divtable;
  } else if (gf->multiply.w32 == gf_w8_table_multiply) {
    std = (struct gf_w8_single_table_data *) ((gf_internal_t *) gf->scratch)->private;
    return (uint8_t *) std->divtable;
  }
  return NULL;
}