File: | home/bhubbard/working/src/ceph/src/erasure-code/jerasure/gf-complete/src/gf.c |
Warning: | line 611, column 3 Undefined or garbage value returned to caller |
[?] Use j/k keys for keyboard navigation
1 | /* | |||
2 | * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic | |||
3 | * James S. Plank, Ethan L. Miller, Kevin M. Greenan, | |||
4 | * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. | |||
5 | * | |||
6 | * gf.c | |||
7 | * | |||
8 | * Generic routines for Galois fields | |||
9 | */ | |||
10 | ||||
11 | #include "gf_int.h" | |||
12 | #include <stdio.h> | |||
13 | #include <stdlib.h> | |||
14 | #include <assert.h> | |||
15 | #include "gf_cpu.h" | |||
16 | ||||
17 | int _gf_errno = GF_E_DEFAULT; | |||
18 | ||||
19 | void gf_error() | |||
20 | { | |||
21 | char *s; | |||
22 | ||||
23 | switch(_gf_errno) { | |||
24 | case GF_E_DEFAULT: s = "No Error."; break; | |||
25 | case GF_E_TWOMULT: s = "Cannot specify two -m's."; break; | |||
26 | case GF_E_TWO_DIV: s = "Cannot specify two -d's."; break; | |||
27 | case GF_E_POLYSPC: s = "-p needs to be followed by a number in hex (0x optional)."; break; | |||
28 | case GF_E_GROUPAR: s = "Ran out of arguments in -m GROUP."; break; | |||
29 | case GF_E_GROUPNU: s = "In -m GROUP g_s g_r -- g_s and g_r need to be numbers."; break; | |||
30 | case GF_E_SPLITAR: s = "Ran out of arguments in -m SPLIT."; break; | |||
31 | case GF_E_SPLITNU: s = "In -m SPLIT w_a w_b -- w_a and w_b need to be numbers."; break; | |||
32 | case GF_E_FEWARGS: s = "Not enough arguments (Perhaps end with '-'?)"; break; | |||
33 | case GF_E_CFM___W: s = "-m CARRY_FREE, w must be 4, 8, 16, 32, 64 or 128."; break; | |||
34 | case GF_E_COMPXPP: s = "-m COMPOSITE, No poly specified, and we don't have a default for the given sub-field."; break; | |||
35 | case GF_E_BASE__W: s = "-m COMPOSITE and the base field is not for w/2."; break; | |||
36 | case GF_E_CFM4POL: s = "-m CARRY_FREE, w=4. (Prim-poly & 0xc) must equal 0."; break; | |||
37 | case GF_E_CFM8POL: s = "-m CARRY_FREE, w=8. (Prim-poly & 0x80) must equal 0."; break; | |||
38 | case GF_E_CF16POL: s = "-m CARRY_FREE, w=16. (Prim-poly & 0xe000) must equal 0."; break; | |||
39 | case GF_E_CF32POL: s = "-m CARRY_FREE, w=32. (Prim-poly & 0xfe000000) must equal 0."; break; | |||
40 | case GF_E_CF64POL: s = "-m CARRY_FREE, w=64. (Prim-poly & 0xfffe000000000000ULL) must equal 0."; break; | |||
41 | case GF_E_MDEFDIV: s = "If multiplication method == default, can't change division."; break; | |||
42 | case GF_E_MDEFREG: s = "If multiplication method == default, can't change region."; break; | |||
43 | case GF_E_MDEFARG: s = "If multiplication method == default, can't use arg1/arg2."; break; | |||
44 | case GF_E_DIVCOMP: s = "Cannot change the division technique with -m COMPOSITE."; break; | |||
45 | case GF_E_DOUQUAD: s = "Cannot specify -r DOUBLE and -r QUAD."; break; | |||
46 | case GF_E_SIMD_NO: s = "Cannot specify -r SIMD and -r NOSIMD."; break; | |||
47 | case GF_E_CAUCHYB: s = "Cannot specify -r CAUCHY and any other -r."; break; | |||
48 | case GF_E_CAUCOMP: s = "Cannot specify -m COMPOSITE and -r CAUCHY."; break; | |||
49 | case GF_E_CAUGT32: s = "Cannot specify -r CAUCHY with w > 32."; break; | |||
50 | case GF_E_ARG1SET: s = "Only use arg1 with SPLIT, GROUP or COMPOSITE."; break; | |||
51 | case GF_E_ARG2SET: s = "Only use arg2 with SPLIT or GROUP."; break; | |||
52 | case GF_E_MATRIXW: s = "Cannot specify -d MATRIX with w > 32."; break; | |||
53 | case GF_E_BAD___W: s = "W must be 1-32, 64 or 128."; break; | |||
54 | case GF_E_DOUBLET: s = "Can only specify -r DOUBLE with -m TABLE."; break; | |||
55 | case GF_E_DOUBLEW: s = "Can only specify -r DOUBLE w = 4 or w = 8."; break; | |||
56 | case GF_E_DOUBLEJ: s = "Cannot specify -r DOUBLE with -r ALTMAP|SIMD|NOSIMD."; break; | |||
57 | case GF_E_DOUBLEL: s = "Can only specify -r DOUBLE -r LAZY with w = 8"; break; | |||
58 | case GF_E_QUAD__T: s = "Can only specify -r QUAD with -m TABLE."; break; | |||
59 | case GF_E_QUAD__W: s = "Can only specify -r QUAD w = 4."; break; | |||
60 | case GF_E_QUAD__J: s = "Cannot specify -r QUAD with -r ALTMAP|SIMD|NOSIMD."; break; | |||
61 | case GF_E_BADPOLY: s = "Bad primitive polynomial (high bits set)."; break; | |||
62 | case GF_E_COMP_PP: s = "Bad primitive polynomial -- bigger than sub-field."; break; | |||
63 | case GF_E_LAZY__X: s = "If -r LAZY, then -r must be DOUBLE or QUAD."; break; | |||
64 | case GF_E_ALTSHIF: s = "Cannot specify -m SHIFT and -r ALTMAP."; break; | |||
65 | case GF_E_SSESHIF: s = "Cannot specify -m SHIFT and -r SIMD|NOSIMD."; break; | |||
66 | case GF_E_ALT_CFM: s = "Cannot specify -m CARRY_FREE and -r ALTMAP."; break; | |||
67 | case GF_E_SSE_CFM: s = "Cannot specify -m CARRY_FREE and -r SIMD|NOSIMD."; break; | |||
68 | case GF_E_PCLMULX: s = "Specified -m CARRY_FREE, but PCLMUL is not supported."; break; | |||
69 | case GF_E_ALT_BY2: s = "Cannot specify -m BYTWO_x and -r ALTMAP."; break; | |||
70 | case GF_E_BY2_SSE: s = "Specified -m BYTWO_x -r SIMD, but SSE2 is not supported."; break; | |||
71 | case GF_E_LOGBADW: s = "With Log Tables, w must be <= 27."; break; | |||
72 | case GF_E_LOG___J: s = "Cannot use Log tables with -r ALTMAP|SIMD|NOSIMD."; break; | |||
73 | case GF_E_LOGPOLY: s = "Cannot use Log tables because the polynomial is not primitive."; break; | |||
74 | case GF_E_ZERBADW: s = "With -m LOG_ZERO, w must be 8 or 16."; break; | |||
75 | case GF_E_ZEXBADW: s = "With -m LOG_ZERO_EXT, w must be 8."; break; | |||
76 | case GF_E_GR_ARGX: s = "With -m GROUP, arg1 and arg2 must be >= 0."; break; | |||
77 | case GF_E_GR_W_48: s = "With -m GROUP, w cannot be 4 or 8."; break; | |||
78 | case GF_E_GR_W_16: s = "With -m GROUP, w == 16, arg1 and arg2 must be 4."; break; | |||
79 | case GF_E_GR_128A: s = "With -m GROUP, w == 128, arg1 must be 4, and arg2 in { 4,8,16 }."; break; | |||
80 | case GF_E_GR_A_27: s = "With -m GROUP, arg1 and arg2 must be <= 27."; break; | |||
81 | case GF_E_GR_AR_W: s = "With -m GROUP, arg1 and arg2 must be <= w."; break; | |||
82 | case GF_E_GR____J: s = "Cannot use GROUP with -r ALTMAP|SIMD|NOSIMD."; break; | |||
83 | case GF_E_TABLE_W: s = "With -m TABLE, w must be < 15, or == 16."; break; | |||
84 | case GF_E_TAB_SSE: s = "With -m TABLE, SIMD|NOSIMD only applies to w=4."; break; | |||
85 | case GF_E_TABSSE3: s = "With -m TABLE, -r SIMD, you need SSSE3 supported."; break; | |||
86 | case GF_E_TAB_ALT: s = "With -m TABLE, you cannot use ALTMAP."; break; | |||
87 | case GF_E_SP128AR: s = "With -m SPLIT, w=128, bad arg1/arg2."; break; | |||
88 | case GF_E_SP128AL: s = "With -m SPLIT, w=128, -r SIMD requires -r ALTMAP."; break; | |||
89 | case GF_E_SP128AS: s = "With -m SPLIT, w=128, ALTMAP needs SSSE3 supported."; break; | |||
90 | case GF_E_SP128_A: s = "With -m SPLIT, w=128, -r ALTMAP only with arg1/arg2 = 4/128."; break; | |||
91 | case GF_E_SP128_S: s = "With -m SPLIT, w=128, -r SIMD|NOSIMD only with arg1/arg2 = 4/128."; break; | |||
92 | case GF_E_SPLIT_W: s = "With -m SPLIT, w must be in {8, 16, 32, 64, 128}."; break; | |||
93 | case GF_E_SP_16AR: s = "With -m SPLIT, w=16, Bad arg1/arg2."; break; | |||
94 | case GF_E_SP_16_A: s = "With -m SPLIT, w=16, -r ALTMAP only with arg1/arg2 = 4/16."; break; | |||
95 | case GF_E_SP_16_S: s = "With -m SPLIT, w=16, -r SIMD|NOSIMD only with arg1/arg2 = 4/16."; break; | |||
96 | case GF_E_SP_32AR: s = "With -m SPLIT, w=32, Bad arg1/arg2."; break; | |||
97 | case GF_E_SP_32AS: s = "With -m SPLIT, w=32, -r ALTMAP needs SSSE3 supported."; break; | |||
98 | case GF_E_SP_32_A: s = "With -m SPLIT, w=32, -r ALTMAP only with arg1/arg2 = 4/32."; break; | |||
99 | case GF_E_SP_32_S: s = "With -m SPLIT, w=32, -r SIMD|NOSIMD only with arg1/arg2 = 4/32."; break; | |||
100 | case GF_E_SP_64AR: s = "With -m SPLIT, w=64, Bad arg1/arg2."; break; | |||
101 | case GF_E_SP_64AS: s = "With -m SPLIT, w=64, -r ALTMAP needs SSSE3 supported."; break; | |||
102 | case GF_E_SP_64_A: s = "With -m SPLIT, w=64, -r ALTMAP only with arg1/arg2 = 4/64."; break; | |||
103 | case GF_E_SP_64_S: s = "With -m SPLIT, w=64, -r SIMD|NOSIMD only with arg1/arg2 = 4/64."; break; | |||
104 | case GF_E_SP_8_AR: s = "With -m SPLIT, w=8, Bad arg1/arg2."; break; | |||
105 | case GF_E_SP_8__A: s = "With -m SPLIT, w=8, Can't have -r ALTMAP."; break; | |||
106 | case GF_E_SP_SSE3: s = "With -m SPLIT, Need SSSE3 support for SIMD."; break; | |||
107 | case GF_E_COMP_A2: s = "With -m COMPOSITE, arg1 must equal 2."; break; | |||
108 | case GF_E_COMP_SS: s = "With -m COMPOSITE, -r SIMD and -r NOSIMD do not apply."; break; | |||
109 | case GF_E_COMP__W: s = "With -m COMPOSITE, w must be 8, 16, 32, 64 or 128."; break; | |||
110 | case GF_E_UNKFLAG: s = "Unknown method flag - should be -m, -d, -r or -p."; break; | |||
111 | case GF_E_UNKNOWN: s = "Unknown multiplication type."; break; | |||
112 | case GF_E_UNK_REG: s = "Unknown region type."; break; | |||
113 | case GF_E_UNK_DIV: s = "Unknown division type."; break; | |||
114 | default: s = "Undefined error."; | |||
115 | } | |||
116 | ||||
117 | fprintf(stderr, "%s\n", s)__fprintf_chk (stderr, 2 - 1, "%s\n", s); | |||
118 | } | |||
119 | ||||
120 | uint64_t gf_composite_get_default_poly(gf_t *base) | |||
121 | { | |||
122 | gf_internal_t *h; | |||
123 | uint64_t rv; | |||
124 | ||||
125 | h = (gf_internal_t *) base->scratch; | |||
126 | if (h->w == 4) { | |||
127 | if (h->mult_type == GF_MULT_COMPOSITE) return 0; | |||
128 | if (h->prim_poly == 0x13) return 2; | |||
129 | return 0; | |||
130 | } | |||
131 | if (h->w == 8) { | |||
132 | if (h->mult_type == GF_MULT_COMPOSITE) return 0; | |||
133 | if (h->prim_poly == 0x11d) return 3; | |||
134 | return 0; | |||
135 | } | |||
136 | if (h->w == 16) { | |||
137 | if (h->mult_type == GF_MULT_COMPOSITE) { | |||
138 | rv = gf_composite_get_default_poly(h->base_gf); | |||
139 | if (rv != h->prim_poly) return 0; | |||
140 | if (rv == 3) return 0x105; | |||
141 | return 0; | |||
142 | } else { | |||
143 | if (h->prim_poly == 0x1100b) return 2; | |||
144 | if (h->prim_poly == 0x1002d) return 7; | |||
145 | return 0; | |||
146 | } | |||
147 | } | |||
148 | if (h->w == 32) { | |||
149 | if (h->mult_type == GF_MULT_COMPOSITE) { | |||
150 | rv = gf_composite_get_default_poly(h->base_gf); | |||
151 | if (rv != h->prim_poly) return 0; | |||
152 | if (rv == 2) return 0x10005; | |||
153 | if (rv == 7) return 0x10008; | |||
154 | if (rv == 0x105) return 0x10002; | |||
155 | return 0; | |||
156 | } else { | |||
157 | if (h->prim_poly == 0x400007) return 2; | |||
158 | if (h->prim_poly == 0xc5) return 3; | |||
159 | return 0; | |||
160 | } | |||
161 | } | |||
162 | if (h->w == 64) { | |||
163 | if (h->mult_type == GF_MULT_COMPOSITE) { | |||
164 | rv = gf_composite_get_default_poly(h->base_gf); | |||
165 | if (rv != h->prim_poly) return 0; | |||
166 | if (rv == 3) return 0x100000009ULL; | |||
167 | if (rv == 2) return 0x100000004ULL; | |||
168 | if (rv == 0x10005) return 0x100000003ULL; | |||
169 | if (rv == 0x10002) return 0x100000005ULL; | |||
170 | if (rv == 0x10008) return 0x100000006ULL; /* JSP: (0x0x100000003 works too, | |||
171 | but I want to differentiate cases). */ | |||
172 | return 0; | |||
173 | } else { | |||
174 | if (h->prim_poly == 0x1bULL) return 2; | |||
175 | return 0; | |||
176 | } | |||
177 | } | |||
178 | return 0; | |||
179 | } | |||
180 | ||||
/* Validate a full parameter combination (word size, multiplication method,
   region flags, division method, method arguments, polynomial, base field)
   before any initialization work is done.  Returns 1 if the combination is
   legal.  On failure, sets the global _gf_errno to the specific reason and
   returns 0 -- the ORDER of the checks below therefore determines which
   error is reported when several apply; do not reorder them.
   NOTE(review): assumes gf_cpu_identify() has already populated the
   gf_cpu_supports_* flags -- confirm against callers. */
int gf_error_check(int w, int mult_type, int region_type, int divide_type,
                   int arg1, int arg2, uint64_t poly, gf_t *base)
{
  int sse3 = 0;
  int sse2 = 0;
  int pclmul = 0;
  int rdouble, rquad, rlazy, rsimd, rnosimd, raltmap, rcauchy, tmp;
  gf_internal_t *sub;

  /* Split region_type into one boolean per flag. */
  rdouble = (region_type & GF_REGION_DOUBLE_TABLE);
  rquad   = (region_type & GF_REGION_QUAD_TABLE);
  rlazy   = (region_type & GF_REGION_LAZY);
  rsimd   = (region_type & GF_REGION_SIMD);
  rnosimd = (region_type & GF_REGION_NOSIMD);
  raltmap = (region_type & GF_REGION_ALTMAP);
  rcauchy = (region_type & GF_REGION_CAUCHY);

  /* Division type must be one of the three supported strategies. */
  if (divide_type != GF_DIVIDE_DEFAULT &&
      divide_type != GF_DIVIDE_MATRIX &&
      divide_type != GF_DIVIDE_EUCLID) {
    _gf_errno = GF_E_UNK_DIV;
    return 0;
  }

  /* Reject any region bits outside the known flag set. */
  tmp = ( GF_REGION_DOUBLE_TABLE | GF_REGION_QUAD_TABLE | GF_REGION_LAZY |
          GF_REGION_SIMD | GF_REGION_NOSIMD | GF_REGION_ALTMAP |
          GF_REGION_CAUCHY );
  if (region_type & (~tmp)) { _gf_errno = GF_E_UNK_REG; return 0; }

  /* Probe CPU capabilities (compiled in AND available at runtime). */
#ifdef INTEL_SSE2
  if (gf_cpu_supports_intel_sse2) {
    sse2 = 1;
  }
#endif

#ifdef INTEL_SSSE3
  if (gf_cpu_supports_intel_ssse3) {
    sse3 = 1;
  }
#endif

#ifdef INTEL_SSE4_PCLMUL
  if (gf_cpu_supports_intel_pclmul) {
    pclmul = 1;
  }
#endif

#ifdef ARM_NEON
  if (gf_cpu_supports_arm_neon) {
    /* NEON stands in for PCLMUL only at w=4/8, and for SSSE3 generally. */
    pclmul = (w == 4 || w == 8);
    sse3 = 1;
  }
#endif


  /* Legal word sizes: 1..32, 64 and 128. */
  if (w < 1 || (w > 32 && w != 64 && w != 128)) { _gf_errno = GF_E_BAD___W; return 0; }

  /* A w-bit polynomial (plus implicit leading term) may not have bits above w. */
  if (mult_type != GF_MULT_COMPOSITE && w < 64) {
    if ((poly >> (w+1)) != 0) { _gf_errno = GF_E_BADPOLY; return 0; }
  }

  /* GF_MULT_DEFAULT admits no further customization at all. */
  if (mult_type == GF_MULT_DEFAULT) {
    if (divide_type != GF_DIVIDE_DEFAULT) { _gf_errno = GF_E_MDEFDIV; return 0; }
    if (region_type != GF_REGION_DEFAULT) { _gf_errno = GF_E_MDEFREG; return 0; }
    if (arg1 != 0 || arg2 != 0) { _gf_errno = GF_E_MDEFARG; return 0; }
    return 1;
  }

  /* Flag combinations that are contradictory regardless of mult_type. */
  if (rsimd && rnosimd) { _gf_errno = GF_E_SIMD_NO; return 0; }
  if (rcauchy && w > 32) { _gf_errno = GF_E_CAUGT32; return 0; }
  if (rcauchy && region_type != GF_REGION_CAUCHY) { _gf_errno = GF_E_CAUCHYB; return 0; }
  if (rcauchy && mult_type == GF_MULT_COMPOSITE) { _gf_errno = GF_E_CAUCOMP; return 0; }

  /* arg1/arg2 are only meaningful for a few multiplication methods. */
  if (arg1 != 0 && mult_type != GF_MULT_COMPOSITE &&
      mult_type != GF_MULT_SPLIT_TABLE && mult_type != GF_MULT_GROUP) {
    _gf_errno = GF_E_ARG1SET;
    return 0;
  }

  if (arg2 != 0 && mult_type != GF_MULT_SPLIT_TABLE && mult_type != GF_MULT_GROUP) {
    _gf_errno = GF_E_ARG2SET;
    return 0;
  }

  /* Matrix-based division uses 32-bit matrices, so w must fit. */
  if (divide_type == GF_DIVIDE_MATRIX && w > 32) { _gf_errno = GF_E_MATRIXW; return 0; }

  /* -r DOUBLE: only TABLE, only w=4/8, no SIMD/ALTMAP, LAZY only at w=8. */
  if (rdouble) {
    if (rquad) { _gf_errno = GF_E_DOUQUAD; return 0; }
    if (mult_type != GF_MULT_TABLE) { _gf_errno = GF_E_DOUBLET; return 0; }
    if (w != 4 && w != 8) { _gf_errno = GF_E_DOUBLEW; return 0; }
    if (rsimd || rnosimd || raltmap) { _gf_errno = GF_E_DOUBLEJ; return 0; }
    if (rlazy && w == 4) { _gf_errno = GF_E_DOUBLEL; return 0; }
    return 1;
  }

  /* -r QUAD: only TABLE, only w=4, no SIMD/ALTMAP. */
  if (rquad) {
    if (mult_type != GF_MULT_TABLE) { _gf_errno = GF_E_QUAD__T; return 0; }
    if (w != 4) { _gf_errno = GF_E_QUAD__W; return 0; }
    if (rsimd || rnosimd || raltmap) { _gf_errno = GF_E_QUAD__J; return 0; }
    return 1;
  }

  /* LAZY without DOUBLE/QUAD (both handled above) is meaningless. */
  if (rlazy) { _gf_errno = GF_E_LAZY__X; return 0; }

  if (mult_type == GF_MULT_SHIFT) {
    if (raltmap) { _gf_errno = GF_E_ALTSHIF; return 0; }
    if (rsimd || rnosimd) { _gf_errno = GF_E_SSESHIF; return 0; }
    return 1;
  }

  /* CARRY_FREE requires hardware carry-less multiply and constrains the
     high polynomial bits per word size. */
  if (mult_type == GF_MULT_CARRY_FREE) {
    if (w != 4 && w != 8 && w != 16 &&
        w != 32 && w != 64 && w != 128) { _gf_errno = GF_E_CFM___W; return 0; }
    if (w == 4 && (poly & 0xc)) { _gf_errno = GF_E_CFM4POL; return 0; }
    if (w == 8 && (poly & 0x80)) { _gf_errno = GF_E_CFM8POL; return 0; }
    if (w == 16 && (poly & 0xe000)) { _gf_errno = GF_E_CF16POL; return 0; }
    if (w == 32 && (poly & 0xfe000000)) { _gf_errno = GF_E_CF32POL; return 0; }
    if (w == 64 && (poly & 0xfffe000000000000ULL)) { _gf_errno = GF_E_CF64POL; return 0; }
    if (raltmap) { _gf_errno = GF_E_ALT_CFM; return 0; }
    if (rsimd || rnosimd) { _gf_errno = GF_E_SSE_CFM; return 0; }
    if (!pclmul) { _gf_errno = GF_E_PCLMULX; return 0; }
    return 1;
  }

  /* CARRY_FREE_GK: like CARRY_FREE but without the polynomial restrictions. */
  if (mult_type == GF_MULT_CARRY_FREE_GK) {
    if (w != 4 && w != 8 && w != 16 &&
        w != 32 && w != 64 && w != 128) { _gf_errno = GF_E_CFM___W; return 0; }
    if (raltmap) { _gf_errno = GF_E_ALT_CFM; return 0; }
    if (rsimd || rnosimd) { _gf_errno = GF_E_SSE_CFM; return 0; }
    if (!pclmul) { _gf_errno = GF_E_PCLMULX; return 0; }
    return 1;
  }

  if (mult_type == GF_MULT_BYTWO_p || mult_type == GF_MULT_BYTWO_b) {
    if (raltmap) { _gf_errno = GF_E_ALT_BY2; return 0; }
    if (rsimd && !sse2) { _gf_errno = GF_E_BY2_SSE; return 0; }
    return 1;
  }

  /* Log-table variants: table size limits w; LOG_ZERO adds w limits of its own. */
  if (mult_type == GF_MULT_LOG_TABLE || mult_type == GF_MULT_LOG_ZERO
                                     || mult_type == GF_MULT_LOG_ZERO_EXT ) {
    if (w > 27) { _gf_errno = GF_E_LOGBADW; return 0; }
    if (raltmap || rsimd || rnosimd) { _gf_errno = GF_E_LOG___J; return 0; }

    if (mult_type == GF_MULT_LOG_TABLE) return 1;

    if (w != 8 && w != 16) { _gf_errno = GF_E_ZERBADW; return 0; }

    if (mult_type == GF_MULT_LOG_ZERO) return 1;

    if (w != 8) { _gf_errno = GF_E_ZEXBADW; return 0; }
    return 1;
  }

  if (mult_type == GF_MULT_GROUP) {
    if (arg1 <= 0 || arg2 <= 0) { _gf_errno = GF_E_GR_ARGX; return 0; }
    if (w == 4 || w == 8) { _gf_errno = GF_E_GR_W_48; return 0; }
    if (w == 16 && (arg1 != 4 || arg2 != 4)) { _gf_errno = GF_E_GR_W_16; return 0; }
    if (w == 128 && (arg1 != 4 ||
       (arg2 != 4 && arg2 != 8 && arg2 != 16))) { _gf_errno = GF_E_GR_128A; return 0; }
    if (arg1 > 27 || arg2 > 27) { _gf_errno = GF_E_GR_A_27; return 0; }
    if (arg1 > w || arg2 > w) { _gf_errno = GF_E_GR_AR_W; return 0; }
    if (raltmap || rsimd || rnosimd) { _gf_errno = GF_E_GR____J; return 0; }
    return 1;
  }

  if (mult_type == GF_MULT_TABLE) {
    if (w != 16 && w >= 15) { _gf_errno = GF_E_TABLE_W; return 0; }
    if (w != 4 && (rsimd || rnosimd)) { _gf_errno = GF_E_TAB_SSE; return 0; }
    if (rsimd && !sse3) { _gf_errno = GF_E_TABSSE3; return 0; }
    if (raltmap) { _gf_errno = GF_E_TAB_ALT; return 0; }
    return 1;
  }

  /* SPLIT_TABLE: the legal (arg1, arg2) pairs and flag combinations depend
     on w; arg order is normalized first so 8/4 means the same as 4/8. */
  if (mult_type == GF_MULT_SPLIT_TABLE) {
    if (arg1 > arg2) {
      tmp = arg1;
      arg1 = arg2;
      arg2 = tmp;
    }
    if (w == 8) {
      if (arg1 != 4 || arg2 != 8) { _gf_errno = GF_E_SP_8_AR; return 0; }
      if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
      if (raltmap) { _gf_errno = GF_E_SP_8__A; return 0; }
    } else if (w == 16) {
      if ((arg1 == 8 && arg2 == 8) ||
          (arg1 == 8 && arg2 == 16)) {
        if (rsimd || rnosimd) { _gf_errno = GF_E_SP_16_S; return 0; }
        if (raltmap) { _gf_errno = GF_E_SP_16_A; return 0; }
      } else if (arg1 == 4 && arg2 == 16) {
        if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
      } else { _gf_errno = GF_E_SP_16AR; return 0; }
    } else if (w == 32) {
      if ((arg1 == 8 && arg2 == 8) ||
          (arg1 == 8 && arg2 == 32) ||
          (arg1 == 16 && arg2 == 32)) {
        if (rsimd || rnosimd) { _gf_errno = GF_E_SP_32_S; return 0; }
        if (raltmap) { _gf_errno = GF_E_SP_32_A; return 0; }
      } else if (arg1 == 4 && arg2 == 32) {
        if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
        if (raltmap && !sse3) { _gf_errno = GF_E_SP_32AS; return 0; }
        if (raltmap && rnosimd) { _gf_errno = GF_E_SP_32AS; return 0; }
      } else { _gf_errno = GF_E_SP_32AR; return 0; }
    } else if (w == 64) {
      if ((arg1 == 8 && arg2 == 8) ||
          (arg1 == 8 && arg2 == 64) ||
          (arg1 == 16 && arg2 == 64)) {
        if (rsimd || rnosimd) { _gf_errno = GF_E_SP_64_S; return 0; }
        if (raltmap) { _gf_errno = GF_E_SP_64_A; return 0; }
      } else if (arg1 == 4 && arg2 == 64) {
        if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
        if (raltmap && !sse3) { _gf_errno = GF_E_SP_64AS; return 0; }
        if (raltmap && rnosimd) { _gf_errno = GF_E_SP_64AS; return 0; }
      } else { _gf_errno = GF_E_SP_64AR; return 0; }
    } else if (w == 128) {
      if (arg1 == 8 && arg2 == 128) {
        if (rsimd || rnosimd) { _gf_errno = GF_E_SP128_S; return 0; }
        if (raltmap) { _gf_errno = GF_E_SP128_A; return 0; }
      } else if (arg1 == 4 && arg2 == 128) {
        if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
        if (raltmap && !sse3) { _gf_errno = GF_E_SP128AS; return 0; }
        if (raltmap && rnosimd) { _gf_errno = GF_E_SP128AS; return 0; }
      } else { _gf_errno = GF_E_SP128AR; return 0; }
    } else { _gf_errno = GF_E_SPLIT_W; return 0; }
    return 1;
  }

  /* COMPOSITE: built from a base field for w/2; base polynomial and width
     are cross-checked when the base field is supplied. */
  if (mult_type == GF_MULT_COMPOSITE) {
    if (w != 8 && w != 16 && w != 32
               && w != 64 && w != 128) { _gf_errno = GF_E_COMP__W; return 0; }
    if (w < 128 && (poly >> (w/2)) != 0) { _gf_errno = GF_E_COMP_PP; return 0; }
    if (divide_type != GF_DIVIDE_DEFAULT) { _gf_errno = GF_E_DIVCOMP; return 0; }
    if (arg1 != 2) { _gf_errno = GF_E_COMP_A2; return 0; }
    if (rsimd || rnosimd) { _gf_errno = GF_E_COMP_SS; return 0; }
    if (base != NULL) {
      sub = (gf_internal_t *) base->scratch;
      if (sub->w != w/2) { _gf_errno = GF_E_BASE__W; return 0; }
      if (poly == 0) {
        /* poly == 0 means "use the default"; there must actually be one. */
        if (gf_composite_get_default_poly(base) == 0) { _gf_errno = GF_E_COMPXPP; return 0; }
      }
    }
    return 1;
  }

  /* No recognized multiplication method matched. */
  _gf_errno = GF_E_UNKNOWN;
  return 0;
}
428 | ||||
429 | int gf_scratch_size(int w, | |||
430 | int mult_type, | |||
431 | int region_type, | |||
432 | int divide_type, | |||
433 | int arg1, | |||
434 | int arg2) | |||
435 | { | |||
436 | if (gf_error_check(w, mult_type, region_type, divide_type, arg1, arg2, 0, NULL((void*)0)) == 0) return 0; | |||
437 | ||||
438 | switch(w) { | |||
439 | case 4: return gf_w4_scratch_size(mult_type, region_type, divide_type, arg1, arg2); | |||
440 | case 8: return gf_w8_scratch_size(mult_type, region_type, divide_type, arg1, arg2); | |||
441 | case 16: return gf_w16_scratch_size(mult_type, region_type, divide_type, arg1, arg2); | |||
442 | case 32: return gf_w32_scratch_size(mult_type, region_type, divide_type, arg1, arg2); | |||
443 | case 64: return gf_w64_scratch_size(mult_type, region_type, divide_type, arg1, arg2); | |||
444 | case 128: return gf_w128_scratch_size(mult_type, region_type, divide_type, arg1, arg2); | |||
445 | default: return gf_wgen_scratch_size(w, mult_type, region_type, divide_type, arg1, arg2); | |||
446 | } | |||
447 | } | |||
448 | ||||
449 | extern int gf_size(gf_t *gf) | |||
450 | { | |||
451 | gf_internal_t *h; | |||
452 | int s; | |||
453 | ||||
454 | s = sizeof(gf_t); | |||
455 | h = (gf_internal_t *) gf->scratch; | |||
456 | s += gf_scratch_size(h->w, h->mult_type, h->region_type, h->divide_type, h->arg1, h->arg2); | |||
457 | if (h->mult_type == GF_MULT_COMPOSITE) s += gf_size(h->base_gf); | |||
458 | return s; | |||
459 | } | |||
460 | ||||
461 | ||||
462 | int gf_init_easy(gf_t *gf, int w) | |||
463 | { | |||
464 | return gf_init_hard(gf, w, GF_MULT_DEFAULT, GF_REGION_DEFAULT(0x0), GF_DIVIDE_DEFAULT, | |||
465 | 0, 0, 0, NULL((void*)0), NULL((void*)0)); | |||
466 | } | |||
467 | ||||
468 | /* Allen: What's going on here is this function is putting info into the | |||
469 | scratch mem of gf, and then calling the relevant REAL init | |||
470 | func for the word size. Probably done this way to consolidate | |||
471 | those aspects of initialization that don't rely on word size, | |||
472 | and then take care of word-size-specific stuff. */ | |||
473 | ||||
474 | int gf_init_hard(gf_t *gf, int w, int mult_type, | |||
475 | int region_type, | |||
476 | int divide_type, | |||
477 | uint64_t prim_poly, | |||
478 | int arg1, int arg2, | |||
479 | gf_t *base_gf, | |||
480 | void *scratch_memory) | |||
481 | { | |||
482 | int sz; | |||
483 | gf_internal_t *h; | |||
484 | ||||
485 | gf_cpu_identify(); | |||
486 | ||||
487 | if (gf_error_check(w, mult_type, region_type, divide_type, | |||
488 | arg1, arg2, prim_poly, base_gf) == 0) return 0; | |||
489 | ||||
490 | sz = gf_scratch_size(w, mult_type, region_type, divide_type, arg1, arg2); | |||
491 | if (sz <= 0) return 0; /* This shouldn't happen, as all errors should get caught | |||
492 | in gf_error_check() */ | |||
493 | ||||
494 | if (scratch_memory == NULL((void*)0)) { | |||
495 | h = (gf_internal_t *) malloc(sz); | |||
496 | h->free_me = 1; | |||
497 | } else { | |||
498 | h = scratch_memory; | |||
499 | h->free_me = 0; | |||
500 | } | |||
501 | gf->scratch = (void *) h; | |||
502 | h->mult_type = mult_type; | |||
503 | h->region_type = region_type; | |||
504 | h->divide_type = divide_type; | |||
505 | h->w = w; | |||
506 | h->prim_poly = prim_poly; | |||
507 | h->arg1 = arg1; | |||
508 | h->arg2 = arg2; | |||
509 | h->base_gf = base_gf; | |||
510 | h->private = (void *) gf->scratch; | |||
511 | h->private = (uint8_t *)h->private + (sizeof(gf_internal_t)); | |||
512 | gf->extract_word.w32 = NULL((void*)0); | |||
513 | ||||
514 | switch(w) { | |||
515 | case 4: return gf_w4_init(gf); | |||
516 | case 8: return gf_w8_init(gf); | |||
517 | case 16: return gf_w16_init(gf); | |||
518 | case 32: return gf_w32_init(gf); | |||
519 | case 64: return gf_w64_init(gf); | |||
520 | case 128: return gf_w128_init(gf); | |||
521 | default: return gf_wgen_init(gf); | |||
522 | } | |||
523 | } | |||
524 | ||||
525 | int gf_free(gf_t *gf, int recursive) | |||
526 | { | |||
527 | gf_internal_t *h; | |||
528 | ||||
529 | h = (gf_internal_t *) gf->scratch; | |||
530 | if (recursive && h->base_gf != NULL((void*)0)) { | |||
531 | gf_free(h->base_gf, 1); | |||
532 | free(h->base_gf); | |||
533 | } | |||
534 | if (h->free_me) free(h); | |||
535 | return 0; /* Making compiler happy */ | |||
536 | } | |||
537 | ||||
/* Report a region-buffer alignment violation for routine s (alignment a)
   on stderr, then trip an assertion.  NOTE(review): assert(0) compiles
   away under NDEBUG, so release builds would return normally here --
   callers should not rely on this never returning. */
void gf_alignment_error(char *s, int a)
{
  fprintf(stderr,
          "Alignment error in %s:\n"
          "   The source and destination buffers must be aligned to each other,\n"
          "   and they must be aligned to a %d-byte address.\n",
          s, a);
  assert(0);
}
545 | ||||
/* Invert a rows x rows matrix over GF(2) by Gaussian elimination.
   Row i of the matrix is the bitmask mat[i]; bit j is the entry in
   column j.  On return inv[] holds the inverse.  mat[] is clobbered
   (reduced toward triangular form).  rows must be in [1, 32].
   Trips an assertion if the matrix is singular. */
static
void gf_invert_binary_matrix(uint32_t *mat, uint32_t *inv, int rows) {
  int cols, i, j;
  uint32_t tmp;

  cols = rows;

  /* Start inv[] as the identity.  The (uint32_t) casts on the shifts
     throughout this function fix undefined behavior: (1 << 31) is a
     signed-overflow UB, and rows == 32 occurs for w=32 fields. */
  for (i = 0; i < rows; i++) inv[i] = ((uint32_t)1 << i);

  /* First -- convert into upper triangular */

  for (i = 0; i < cols; i++) {

    /* Swap rows if we have a zero i,i element.  If we can't swap, then the
       matrix was not invertible */

    if ((mat[i] & ((uint32_t)1 << i)) == 0) {
      for (j = i+1; j < rows && (mat[j] & ((uint32_t)1 << i)) == 0; j++) ;
      if (j == rows) {
        fprintf(stderr, "galois_invert_matrix: Matrix not invertible!!\n");
        assert(0);
      }
      tmp = mat[i]; mat[i] = mat[j]; mat[j] = tmp;
      tmp = inv[i]; inv[i] = inv[j]; inv[j] = tmp;
    }

    /* Now for each j>i, add A_ji*Ai to Aj (XOR is addition in GF(2)) */
    for (j = i+1; j != rows; j++) {
      if ((mat[j] & ((uint32_t)1 << i)) != 0) {
        mat[j] ^= mat[i];
        inv[j] ^= inv[i];
      }
    }
  }

  /* Now the matrix is upper triangular.  Start at the top and multiply down */

  for (i = rows-1; i >= 0; i--) {
    for (j = 0; j < i; j++) {
      if (mat[j] & ((uint32_t)1 << i)) {
        /* mat[j] ^= mat[i]; */
        inv[j] ^= inv[i];
      }
    }
  }
}
592 | ||||
/* Compute the multiplicative inverse of y in GF(2^w) with primitive
   polynomial pp, via the bit-matrix method: build the w x w GF(2) matrix
   whose row i is y * x^i mod pp, invert it, and return its first row.
   Returns 0 when w is outside [1, 32].  Trips an assertion (via
   gf_invert_binary_matrix) if the matrix is singular, e.g. for y == 0. */
uint32_t gf_bitmatrix_inverse(uint32_t y, int w, uint32_t pp)
{
  uint32_t mat[32], inv[32], mask;
  int i;

  /* Guard the word size: for w <= 0 the loop below never runs and the
     final `return inv[0]` would hand an uninitialized value back to the
     caller (the static-analyzer finding); for w > 32 mat[]/inv[] would
     overflow.  Fail cleanly instead. */
  if (w < 1 || w > 32) return 0;

  mask = (w == 32) ? 0xffffffff : (((uint32_t)1 << w) - 1);

  /* Row i of the matrix is y * x^i reduced mod pp. */
  for (i = 0; i < w; i++) {
    mat[i] = y;

    /* The (uint32_t) cast avoids signed-shift UB when w == 32. */
    if (y & ((uint32_t)1 << (w-1))) {
      y = y << 1;
      y = ((y ^ pp) & mask);
    } else {
      y = y << 1;
    }
  }

  gf_invert_binary_matrix(mat, inv, w);
  return inv[0];
}
613 | ||||
614 | void gf_two_byte_region_table_multiply(gf_region_data *rd, uint16_t *base) | |||
615 | { | |||
616 | uint64_t a, prod; | |||
617 | int xor; | |||
618 | uint64_t *s64, *d64, *top; | |||
619 | ||||
620 | s64 = rd->s_start; | |||
621 | d64 = rd->d_start; | |||
622 | top = rd->d_top; | |||
623 | xor = rd->xor; | |||
624 | ||||
625 | if (xor) { | |||
626 | while (d64 != top) { | |||
627 | a = *s64; | |||
628 | prod = base[a >> 48]; | |||
629 | a <<= 16; | |||
630 | prod <<= 16; | |||
631 | prod ^= base[a >> 48]; | |||
632 | a <<= 16; | |||
633 | prod <<= 16; | |||
634 | prod ^= base[a >> 48]; | |||
635 | a <<= 16; | |||
636 | prod <<= 16; | |||
637 | prod ^= base[a >> 48]; | |||
638 | prod ^= *d64; | |||
639 | *d64 = prod; | |||
640 | s64++; | |||
641 | d64++; | |||
642 | } | |||
643 | } else { | |||
644 | while (d64 != top) { | |||
645 | a = *s64; | |||
646 | prod = base[a >> 48]; | |||
647 | a <<= 16; | |||
648 | prod <<= 16; | |||
649 | prod ^= base[a >> 48]; | |||
650 | a <<= 16; | |||
651 | prod <<= 16; | |||
652 | prod ^= base[a >> 48]; | |||
653 | a <<= 16; | |||
654 | prod <<= 16; | |||
655 | prod ^= base[a >> 48]; | |||
656 | *d64 = prod; | |||
657 | s64++; | |||
658 | d64++; | |||
659 | } | |||
660 | } | |||
661 | } | |||
662 | ||||
/* Fallback region multiply: walk the region [src, s_top) one w-bit word
   at a time and apply the gf's single-word multiply (multiply.w32 or
   multiply.w64) with the constant rd->val.  Handles w in {4, 8, 16, 32,
   64}; for w == 4 each byte holds two packed 4-bit values.  Honors
   rd->xor (XOR-accumulate into dest instead of overwrite).  Exits the
   process for any other w. */
static void gf_slow_multiply_region(gf_region_data *rd, void *src, void *dest, void *s_top)
{
  uint8_t *s8, *d8;
  uint16_t *s16, *d16;
  uint32_t *s32, *d32;
  uint64_t *s64, *d64;
  gf_internal_t *h;
  int wb;                 /* stride in bytes per word (minimum 1, for w == 4) */
  uint32_t p, a;

  h = rd->gf->scratch;
  wb = (h->w)/8;
  if (wb == 0) wb = 1;    /* w == 4: two values share one byte */

  while (src < s_top) {
    switch (h->w) {
    case 8:
      s8 = (uint8_t *) src;
      d8 = (uint8_t *) dest;
      *d8 = (rd->xor) ? (*d8 ^ rd->gf->multiply.w32(rd->gf, rd->val, *s8)) :
                      rd->gf->multiply.w32(rd->gf, rd->val, *s8);
      break;
    case 4:
      /* Multiply the low and high nibbles separately, then repack. */
      s8 = (uint8_t *) src;
      d8 = (uint8_t *) dest;
      a = *s8;
      p = rd->gf->multiply.w32(rd->gf, rd->val, a&0xf);
      p |= (rd->gf->multiply.w32(rd->gf, rd->val, a >> 4) << 4);
      if (rd->xor) p ^= *d8;
      *d8 = p;
      break;
    case 16:
      s16 = (uint16_t *) src;
      d16 = (uint16_t *) dest;
      *d16 = (rd->xor) ? (*d16 ^ rd->gf->multiply.w32(rd->gf, rd->val, *s16)) :
                      rd->gf->multiply.w32(rd->gf, rd->val, *s16);
      break;
    case 32:
      s32 = (uint32_t *) src;
      d32 = (uint32_t *) dest;
      *d32 = (rd->xor) ? (*d32 ^ rd->gf->multiply.w32(rd->gf, rd->val, *s32)) :
                      rd->gf->multiply.w32(rd->gf, rd->val, *s32);
      break;
    case 64:
      s64 = (uint64_t *) src;
      d64 = (uint64_t *) dest;
      *d64 = (rd->xor) ? (*d64 ^ rd->gf->multiply.w64(rd->gf, rd->val, *s64)) :
                      rd->gf->multiply.w64(rd->gf, rd->val, *s64);
      break;
    default:
      fprintf(stderr, "Error: gf_slow_multiply_region: w=%d not implemented.\n", h->w);
      exit(1);
    }
    /* Advance via uint8_t* casts: arithmetic on void* is not standard C. */
    src = (uint8_t *)src + wb;
    dest = (uint8_t *)dest + wb;
  }
}
720 | ||||
721 | /* JSP - The purpose of this procedure is to error check alignment, | |||
722 | and to set up the region operation so that it can best leverage | |||
723 | large words. | |||
724 | ||||
725 | It stores its information in rd. | |||
726 | ||||
727 | Assuming you're not doing Cauchy coding, (see below for that), | |||
728 | then w will be 4, 8, 16, 32 or 64. It can't be 128 (probably | |||
729 | should change that). | |||
730 | ||||
731 | src and dest must then be aligned on ceil(w/8)-byte boundaries. | |||
732 | Moreover, bytes must be a multiple of ceil(w/8). If the variable | |||
733 | align is equal to ceil(w/8), then we will set s_start = src, | |||
734 | d_start = dest, s_top to (src+bytes) and d_top to (dest+bytes). | |||
735 | And we return -- the implementation will go ahead and do the | |||
736 | multiplication on individual words (e.g. using discrete logs). | |||
737 | ||||
738 | If align is greater than ceil(w/8), then the implementation needs | |||
739 | to work on groups of "align" bytes. For example, suppose you are | |||
740 | implementing BYTWO, without SSE. Then you will be doing the region | |||
741 | multiplication in units of 8 bytes, so align = 8. Or, suppose you | |||
742 | are doing a Quad table in GF(2^4). You will be doing the region | |||
743 | multiplication in units of 2 bytes, so align = 2. Or, suppose you | |||
744 | are doing split multiplication with SSE operations in GF(2^8). | |||
745 | Then align = 16. Worse yet, suppose you are doing split | |||
746 | multiplication with SSE operations in GF(2^16), with or without | |||
747 | ALTMAP. Then, you will be doing the multiplication on 256 bits at | |||
748 | a time. So align = 32. | |||
749 | ||||
750 | When align does not equal ceil(w/8), we split the region | |||
751 | multiplication into three parts. We are going to make s_start be | |||
752 | the first address greater than or equal to src that is a multiple | |||
753 | of align. s_top is going to be the largest address >= src+bytes | |||
754 | such that (s_top - s_start) is a multiple of align. We do the | |||
755 | same with d_start and d_top. When we say that "src and dest must | |||
   be aligned with respect to each other," we mean that s_start-src
757 | must equal d_start-dest. | |||
758 | ||||
759 | Now, the region multiplication is done in three parts -- the part | |||
760 | between src and s_start must be done using single words. | |||
761 | Similarly, the part between s_top and src+bytes must also be done | |||
762 | using single words. The part between s_start and s_top will be | |||
763 | done in chunks of "align" bytes. | |||
764 | ||||
765 | One final thing -- if align > 16, then s_start and d_start will be | |||
766 | aligned on a 16 byte boundary. Perhaps we should have two | |||
767 | variables: align and chunksize. Then we'd have s_start & d_start | |||
768 | aligned to "align", and have s_top-s_start be a multiple of | |||
769 | chunksize. That may be less confusing, but it would be a big | |||
770 | change. | |||
771 | ||||
772 | Finally, if align = -1, then we are doing Cauchy multiplication, | |||
773 | using only XOR's. In this case, we're not going to care about | |||
774 | alignment because we are just doing XOR's. Instead, the only | |||
775 | thing we care about is that bytes must be a multiple of w. | |||
776 | ||||
777 | This is not to say that alignment doesn't matter in performance | |||
778 | with XOR's. See that discussion in gf_multby_one(). | |||
779 | ||||
780 | After you call gf_set_region_data(), the procedure | |||
781 | gf_do_initial_region_alignment() calls gf->multiply.w32() on | |||
782 | everything between src and s_start. The procedure | |||
783 | gf_do_final_region_alignment() calls gf->multiply.w32() on | |||
784 | everything between s_top and src+bytes. | |||
785 | */ | |||
786 | ||||
787 | void gf_set_region_data(gf_region_data *rd, | |||
788 | gf_t *gf, | |||
789 | void *src, | |||
790 | void *dest, | |||
791 | int bytes, | |||
792 | uint64_t val, | |||
793 | int xor, | |||
794 | int align) | |||
795 | { | |||
796 | gf_internal_t *h = NULL((void*)0); | |||
797 | int wb; | |||
798 | uint32_t a; | |||
799 | unsigned long uls, uld; | |||
800 | ||||
801 | if (gf == NULL((void*)0)) { /* JSP - Can be NULL if you're just doing XOR's */ | |||
802 | wb = 1; | |||
803 | } else { | |||
804 | h = gf->scratch; | |||
805 | wb = (h->w)/8; | |||
806 | if (wb == 0) wb = 1; | |||
807 | } | |||
808 | ||||
809 | rd->gf = gf; | |||
810 | rd->src = src; | |||
811 | rd->dest = dest; | |||
812 | rd->bytes = bytes; | |||
813 | rd->val = val; | |||
814 | rd->xor = xor; | |||
815 | rd->align = align; | |||
816 | ||||
817 | uls = (unsigned long) src; | |||
818 | uld = (unsigned long) dest; | |||
819 | ||||
820 | a = (align <= 16) ? align : 16; | |||
821 | ||||
822 | if (align == -1) { /* JSP: This is cauchy. Error check bytes, then set up the pointers | |||
823 | so that there are no alignment regions. */ | |||
824 | if (h != NULL((void*)0) && bytes % h->w != 0) { | |||
825 | fprintf(stderr, "Error in region multiply operation.\n")__fprintf_chk (stderr, 2 - 1, "Error in region multiply operation.\n" ); | |||
826 | fprintf(stderr, "The size must be a multiple of %d bytes.\n", h->w)__fprintf_chk (stderr, 2 - 1, "The size must be a multiple of %d bytes.\n" , h->w); | |||
827 | assert(0)((void) (0)); | |||
828 | } | |||
829 | ||||
830 | rd->s_start = src; | |||
831 | rd->d_start = dest; | |||
832 | rd->s_top = (uint8_t *)src + bytes; | |||
833 | rd->d_top = (uint8_t *)src + bytes; | |||
834 | return; | |||
835 | } | |||
836 | ||||
837 | if (uls % a != uld % a) { | |||
838 | fprintf(stderr, "Error in region multiply operation.\n")__fprintf_chk (stderr, 2 - 1, "Error in region multiply operation.\n" ); | |||
839 | fprintf(stderr, "The source & destination pointers must be aligned with respect\n")__fprintf_chk (stderr, 2 - 1, "The source & destination pointers must be aligned with respect\n" ); | |||
840 | fprintf(stderr, "to each other along a %d byte boundary.\n", a)__fprintf_chk (stderr, 2 - 1, "to each other along a %d byte boundary.\n" , a); | |||
841 | fprintf(stderr, "Src = 0x%lx. Dest = 0x%lx\n", (unsigned long) src,__fprintf_chk (stderr, 2 - 1, "Src = 0x%lx. Dest = 0x%lx\n", (unsigned long) src, (unsigned long) dest) | |||
842 | (unsigned long) dest)__fprintf_chk (stderr, 2 - 1, "Src = 0x%lx. Dest = 0x%lx\n", (unsigned long) src, (unsigned long) dest); | |||
843 | assert(0)((void) (0)); | |||
844 | } | |||
845 | ||||
846 | if (uls % wb != 0) { | |||
847 | fprintf(stderr, "Error in region multiply operation.\n")__fprintf_chk (stderr, 2 - 1, "Error in region multiply operation.\n" ); | |||
848 | fprintf(stderr, "The pointers must be aligned along a %d byte boundary.\n", wb)__fprintf_chk (stderr, 2 - 1, "The pointers must be aligned along a %d byte boundary.\n" , wb); | |||
849 | fprintf(stderr, "Src = 0x%lx. Dest = 0x%lx\n", (unsigned long) src,__fprintf_chk (stderr, 2 - 1, "Src = 0x%lx. Dest = 0x%lx\n", (unsigned long) src, (unsigned long) dest) | |||
850 | (unsigned long) dest)__fprintf_chk (stderr, 2 - 1, "Src = 0x%lx. Dest = 0x%lx\n", (unsigned long) src, (unsigned long) dest); | |||
851 | assert(0)((void) (0)); | |||
852 | } | |||
853 | ||||
854 | if (bytes % wb != 0) { | |||
855 | fprintf(stderr, "Error in region multiply operation.\n")__fprintf_chk (stderr, 2 - 1, "Error in region multiply operation.\n" ); | |||
856 | fprintf(stderr, "The size must be a multiple of %d bytes.\n", wb)__fprintf_chk (stderr, 2 - 1, "The size must be a multiple of %d bytes.\n" , wb); | |||
857 | assert(0)((void) (0)); | |||
858 | } | |||
859 | ||||
860 | uls %= a; | |||
861 | if (uls != 0) uls = (a-uls); | |||
862 | rd->s_start = (uint8_t *)rd->src + uls; | |||
863 | rd->d_start = (uint8_t *)rd->dest + uls; | |||
864 | bytes -= uls; | |||
865 | bytes -= (bytes % align); | |||
866 | rd->s_top = (uint8_t *)rd->s_start + bytes; | |||
867 | rd->d_top = (uint8_t *)rd->d_start + bytes; | |||
868 | ||||
869 | } | |||
870 | ||||
871 | void gf_do_initial_region_alignment(gf_region_data *rd) | |||
872 | { | |||
873 | gf_slow_multiply_region(rd, rd->src, rd->dest, rd->s_start); | |||
874 | } | |||
875 | ||||
876 | void gf_do_final_region_alignment(gf_region_data *rd) | |||
877 | { | |||
878 | gf_slow_multiply_region(rd, rd->s_top, rd->d_top, (uint8_t *)rd->src+rd->bytes); | |||
879 | } | |||
880 | ||||
/* Multiply a region by zero.  With xor set the destination is left
   untouched (x ^= 0*c is a no-op); otherwise it is simply cleared. */
void gf_multby_zero(void *dest, int bytes, int xor)
{
  if (xor) return;
  /* memset replaces the legacy bzero(), which was removed from POSIX. */
  memset(dest, 0, bytes);
  return;
}
887 | ||||
888 | /* JSP - gf_multby_one tries to do this in the most efficient way | |||
889 | possible. If xor = 0, then simply call memcpy() since that | |||
890 | should be optimized by the system. Otherwise, try to do the xor | |||
891 | in the following order: | |||
892 | ||||
893 | If src and dest are aligned with respect to each other on 16-byte | |||
894 | boundaries and you have SSE instructions, then use aligned SSE | |||
895 | instructions. | |||
896 | ||||
897 | If they aren't but you still have SSE instructions, use unaligned | |||
898 | SSE instructions. | |||
899 | ||||
900 | If there are no SSE instructions, but they are aligned with | |||
901 | respect to each other on 8-byte boundaries, then do them with | |||
902 | uint64_t's. | |||
903 | ||||
904 | Otherwise, call gf_unaligned_xor(), which does the following: | |||
905 | align a destination pointer along an 8-byte boundary, and then | |||
906 | memcpy 32 bytes at a time from the src pointer to an array of | |||
907 | doubles. I'm not sure if that's the best -- probably needs | |||
908 | testing, but this seems like it could be a black hole. | |||
909 | */ | |||
910 | ||||
911 | static void gf_unaligned_xor(void *src, void *dest, int bytes); | |||
912 | ||||
913 | void gf_multby_one(void *src, void *dest, int bytes, int xor) | |||
914 | { | |||
915 | unsigned long uls, uld; | |||
916 | uint8_t *s8, *d8; | |||
917 | uint64_t *s64, *d64, *dtop64; | |||
918 | gf_region_data rd; | |||
919 | ||||
920 | if (!xor) { | |||
921 | if (dest != src) | |||
922 | memcpy(dest, src, bytes); | |||
923 | return; | |||
924 | } | |||
925 | uls = (unsigned long) src; | |||
926 | uld = (unsigned long) dest; | |||
927 | ||||
928 | #ifdef INTEL_SSE21 | |||
929 | if (gf_cpu_supports_intel_sse2) { | |||
930 | __m128i ms, md; | |||
931 | int abytes; | |||
932 | s8 = (uint8_t *) src; | |||
933 | d8 = (uint8_t *) dest; | |||
934 | if (uls % 16 == uld % 16) { | |||
935 | gf_set_region_data(&rd, NULL((void*)0), src, dest, bytes, 1, xor, 16); | |||
936 | while (s8 != rd.s_start) { | |||
937 | *d8 ^= *s8; | |||
938 | d8++; | |||
939 | s8++; | |||
940 | } | |||
941 | while (s8 < (uint8_t *) rd.s_top) { | |||
942 | ms = _mm_load_si128 ((__m128i *)(s8)); | |||
943 | md = _mm_load_si128 ((__m128i *)(d8)); | |||
944 | md = _mm_xor_si128(md, ms); | |||
945 | _mm_store_si128((__m128i *)(d8), md); | |||
946 | s8 += 16; | |||
947 | d8 += 16; | |||
948 | } | |||
949 | while (s8 != (uint8_t *) src + bytes) { | |||
950 | *d8 ^= *s8; | |||
951 | d8++; | |||
952 | s8++; | |||
953 | } | |||
954 | return; | |||
955 | } | |||
956 | ||||
957 | abytes = (bytes & 0xfffffff0); | |||
958 | ||||
959 | while (d8 < (uint8_t *) dest + abytes) { | |||
960 | ms = _mm_loadu_si128 ((__m128i *)(s8)); | |||
961 | md = _mm_loadu_si128 ((__m128i *)(d8)); | |||
962 | md = _mm_xor_si128(md, ms); | |||
963 | _mm_storeu_si128((__m128i *)(d8), md); | |||
964 | s8 += 16; | |||
965 | d8 += 16; | |||
966 | } | |||
967 | while (d8 != (uint8_t *) dest+bytes) { | |||
968 | *d8 ^= *s8; | |||
969 | d8++; | |||
970 | s8++; | |||
971 | } | |||
972 | return; | |||
973 | } | |||
974 | #endif | |||
975 | #if defined(ARM_NEON) | |||
976 | if (gf_cpu_supports_arm_neon) { | |||
977 | s8 = (uint8_t *) src; | |||
978 | d8 = (uint8_t *) dest; | |||
979 | ||||
980 | if (uls % 16 == uld % 16) { | |||
981 | gf_set_region_data(&rd, NULL((void*)0), src, dest, bytes, 1, xor, 16); | |||
982 | while (s8 != rd.s_start) { | |||
983 | *d8 ^= *s8; | |||
984 | s8++; | |||
985 | d8++; | |||
986 | } | |||
987 | while (s8 < (uint8_t *) rd.s_top) { | |||
988 | uint8x16_t vs = vld1q_u8 (s8); | |||
989 | uint8x16_t vd = vld1q_u8 (d8); | |||
990 | uint8x16_t vr = veorq_u8 (vs, vd); | |||
991 | vst1q_u8 (d8, vr); | |||
992 | s8 += 16; | |||
993 | d8 += 16; | |||
994 | } | |||
995 | } else { | |||
996 | while (s8 + 15 < (uint8_t *) src + bytes) { | |||
997 | uint8x16_t vs = vld1q_u8 (s8); | |||
998 | uint8x16_t vd = vld1q_u8 (d8); | |||
999 | uint8x16_t vr = veorq_u8 (vs, vd); | |||
1000 | vst1q_u8 (d8, vr); | |||
1001 | s8 += 16; | |||
1002 | d8 += 16; | |||
1003 | } | |||
1004 | } | |||
1005 | while (s8 < (uint8_t *) src + bytes) { | |||
1006 | *d8 ^= *s8; | |||
1007 | s8++; | |||
1008 | d8++; | |||
1009 | } | |||
1010 | return; | |||
1011 | } | |||
1012 | #endif | |||
1013 | if (uls % 8 != uld % 8) { | |||
1014 | gf_unaligned_xor(src, dest, bytes); | |||
1015 | return; | |||
1016 | } | |||
1017 | ||||
1018 | gf_set_region_data(&rd, NULL((void*)0), src, dest, bytes, 1, xor, 8); | |||
1019 | s8 = (uint8_t *) src; | |||
1020 | d8 = (uint8_t *) dest; | |||
1021 | while (d8 != rd.d_start) { | |||
1022 | *d8 ^= *s8; | |||
1023 | d8++; | |||
1024 | s8++; | |||
1025 | } | |||
1026 | dtop64 = (uint64_t *) rd.d_top; | |||
1027 | ||||
1028 | d64 = (uint64_t *) rd.d_start; | |||
1029 | s64 = (uint64_t *) rd.s_start; | |||
1030 | ||||
1031 | while (d64 < dtop64) { | |||
1032 | *d64 ^= *s64; | |||
1033 | d64++; | |||
1034 | s64++; | |||
1035 | } | |||
1036 | ||||
1037 | s8 = (uint8_t *) rd.s_top; | |||
1038 | d8 = (uint8_t *) rd.d_top; | |||
1039 | ||||
1040 | while (d8 != (uint8_t *) dest+bytes) { | |||
1041 | *d8 ^= *s8; | |||
1042 | d8++; | |||
1043 | s8++; | |||
1044 | } | |||
1045 | return; | |||
1046 | } | |||
1047 | ||||
1048 | #define UNALIGNED_BUFSIZE(8) (8) | |||
1049 | ||||
1050 | static void gf_unaligned_xor(void *src, void *dest, int bytes) | |||
1051 | { | |||
1052 | uint64_t scopy[UNALIGNED_BUFSIZE(8)], *d64; | |||
1053 | int i; | |||
1054 | gf_region_data rd; | |||
1055 | uint8_t *s8, *d8; | |||
1056 | ||||
1057 | /* JSP - call gf_set_region_data(), but use dest in both places. This is | |||
1058 | because I only want to set up dest. If I used src, gf_set_region_data() | |||
1059 | would fail because src and dest are not aligned to each other wrt | |||
1060 | 8-byte pointers. I know this will actually align d_start to 16 bytes. | |||
1061 | If I change gf_set_region_data() to split alignment & chunksize, then | |||
1062 | I could do this correctly. */ | |||
1063 | ||||
1064 | gf_set_region_data(&rd, NULL((void*)0), dest, dest, bytes, 1, 1, 8*UNALIGNED_BUFSIZE(8)); | |||
1065 | s8 = (uint8_t *) src; | |||
1066 | d8 = (uint8_t *) dest; | |||
1067 | ||||
1068 | while (d8 < (uint8_t *) rd.d_start) { | |||
1069 | *d8 ^= *s8; | |||
1070 | d8++; | |||
1071 | s8++; | |||
1072 | } | |||
1073 | ||||
1074 | d64 = (uint64_t *) d8; | |||
1075 | while (d64 < (uint64_t *) rd.d_top) { | |||
1076 | memcpy(scopy, s8, 8*UNALIGNED_BUFSIZE(8)); | |||
1077 | s8 += 8*UNALIGNED_BUFSIZE(8); | |||
1078 | for (i = 0; i < UNALIGNED_BUFSIZE(8); i++) { | |||
1079 | *d64 ^= scopy[i]; | |||
1080 | d64++; | |||
1081 | } | |||
1082 | } | |||
1083 | ||||
1084 | d8 = (uint8_t *) d64; | |||
1085 | while (d8 < (uint8_t *) ((uint8_t *)dest+bytes)) { | |||
1086 | *d8 ^= *s8; | |||
1087 | d8++; | |||
1088 | s8++; | |||
1089 | } | |||
1090 | } |