| File: | home/bhubbard/working/src/ceph/src/erasure-code/jerasure/gf-complete/src/gf.c |
| Warning: | line 611, column 3 Undefined or garbage value returned to caller |
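| Note: | The flagged statement is "return inv[0];" in gf_bitmatrix_inverse() (file line 611). The report most likely assumes a path on which w <= 0: the initialization loop in gf_invert_binary_matrix() then never writes inv[0], so the value returned to the caller would be uninitialized. |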
| 1 | /* | |||
| 2 | * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic | |||
| 3 | * James S. Plank, Ethan L. Miller, Kevin M. Greenan, | |||
| 4 | * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. | |||
| 5 | * | |||
| 6 | * gf.c | |||
| 7 | * | |||
| 8 | * Generic routines for Galois fields | |||
| 9 | */ | |||
| 10 | ||||
| 11 | #include "gf_int.h" | |||
| 12 | #include <stdio.h> | |||
| 13 | #include <stdlib.h> | |||
| 14 | #include <assert.h> | |||
| 15 | #include "gf_cpu.h" | |||
| 16 | ||||
| 17 | int _gf_errno = GF_E_DEFAULT; | |||
| 18 | ||||
| 19 | void gf_error() | |||
| 20 | { | |||
| 21 | char *s; | |||
| 22 | ||||
| 23 | switch(_gf_errno) { | |||
| 24 | case GF_E_DEFAULT: s = "No Error."; break; | |||
| 25 | case GF_E_TWOMULT: s = "Cannot specify two -m's."; break; | |||
| 26 | case GF_E_TWO_DIV: s = "Cannot specify two -d's."; break; | |||
| 27 | case GF_E_POLYSPC: s = "-p needs to be followed by a number in hex (0x optional)."; break; | |||
| 28 | case GF_E_GROUPAR: s = "Ran out of arguments in -m GROUP."; break; | |||
| 29 | case GF_E_GROUPNU: s = "In -m GROUP g_s g_r -- g_s and g_r need to be numbers."; break; | |||
| 30 | case GF_E_SPLITAR: s = "Ran out of arguments in -m SPLIT."; break; | |||
| 31 | case GF_E_SPLITNU: s = "In -m SPLIT w_a w_b -- w_a and w_b need to be numbers."; break; | |||
| 32 | case GF_E_FEWARGS: s = "Not enough arguments (Perhaps end with '-'?)"; break; | |||
| 33 | case GF_E_CFM___W: s = "-m CARRY_FREE, w must be 4, 8, 16, 32, 64 or 128."; break; | |||
| 34 | case GF_E_COMPXPP: s = "-m COMPOSITE, No poly specified, and we don't have a default for the given sub-field."; break; | |||
| 35 | case GF_E_BASE__W: s = "-m COMPOSITE and the base field is not for w/2."; break; | |||
| 36 | case GF_E_CFM4POL: s = "-m CARRY_FREE, w=4. (Prim-poly & 0xc) must equal 0."; break; | |||
| 37 | case GF_E_CFM8POL: s = "-m CARRY_FREE, w=8. (Prim-poly & 0x80) must equal 0."; break; | |||
| 38 | case GF_E_CF16POL: s = "-m CARRY_FREE, w=16. (Prim-poly & 0xe000) must equal 0."; break; | |||
| 39 | case GF_E_CF32POL: s = "-m CARRY_FREE, w=32. (Prim-poly & 0xfe000000) must equal 0."; break; | |||
| 40 | case GF_E_CF64POL: s = "-m CARRY_FREE, w=64. (Prim-poly & 0xfffe000000000000ULL) must equal 0."; break; | |||
| 41 | case GF_E_MDEFDIV: s = "If multiplication method == default, can't change division."; break; | |||
| 42 | case GF_E_MDEFREG: s = "If multiplication method == default, can't change region."; break; | |||
| 43 | case GF_E_MDEFARG: s = "If multiplication method == default, can't use arg1/arg2."; break; | |||
| 44 | case GF_E_DIVCOMP: s = "Cannot change the division technique with -m COMPOSITE."; break; | |||
| 45 | case GF_E_DOUQUAD: s = "Cannot specify -r DOUBLE and -r QUAD."; break; | |||
| 46 | case GF_E_SIMD_NO: s = "Cannot specify -r SIMD and -r NOSIMD."; break; | |||
| 47 | case GF_E_CAUCHYB: s = "Cannot specify -r CAUCHY and any other -r."; break; | |||
| 48 | case GF_E_CAUCOMP: s = "Cannot specify -m COMPOSITE and -r CAUCHY."; break; | |||
| 49 | case GF_E_CAUGT32: s = "Cannot specify -r CAUCHY with w > 32."; break; | |||
| 50 | case GF_E_ARG1SET: s = "Only use arg1 with SPLIT, GROUP or COMPOSITE."; break; | |||
| 51 | case GF_E_ARG2SET: s = "Only use arg2 with SPLIT or GROUP."; break; | |||
| 52 | case GF_E_MATRIXW: s = "Cannot specify -d MATRIX with w > 32."; break; | |||
| 53 | case GF_E_BAD___W: s = "W must be 1-32, 64 or 128."; break; | |||
| 54 | case GF_E_DOUBLET: s = "Can only specify -r DOUBLE with -m TABLE."; break; | |||
| 55 | case GF_E_DOUBLEW: s = "Can only specify -r DOUBLE w = 4 or w = 8."; break; | |||
| 56 | case GF_E_DOUBLEJ: s = "Cannot specify -r DOUBLE with -r ALTMAP|SIMD|NOSIMD."; break; | |||
| 57 | case GF_E_DOUBLEL: s = "Can only specify -r DOUBLE -r LAZY with w = 8"; break; | |||
| 58 | case GF_E_QUAD__T: s = "Can only specify -r QUAD with -m TABLE."; break; | |||
| 59 | case GF_E_QUAD__W: s = "Can only specify -r QUAD w = 4."; break; | |||
| 60 | case GF_E_QUAD__J: s = "Cannot specify -r QUAD with -r ALTMAP|SIMD|NOSIMD."; break; | |||
| 61 | case GF_E_BADPOLY: s = "Bad primitive polynomial (high bits set)."; break; | |||
| 62 | case GF_E_COMP_PP: s = "Bad primitive polynomial -- bigger than sub-field."; break; | |||
| 63 | case GF_E_LAZY__X: s = "If -r LAZY, then -r must be DOUBLE or QUAD."; break; | |||
| 64 | case GF_E_ALTSHIF: s = "Cannot specify -m SHIFT and -r ALTMAP."; break; | |||
| 65 | case GF_E_SSESHIF: s = "Cannot specify -m SHIFT and -r SIMD|NOSIMD."; break; | |||
| 66 | case GF_E_ALT_CFM: s = "Cannot specify -m CARRY_FREE and -r ALTMAP."; break; | |||
| 67 | case GF_E_SSE_CFM: s = "Cannot specify -m CARRY_FREE and -r SIMD|NOSIMD."; break; | |||
| 68 | case GF_E_PCLMULX: s = "Specified -m CARRY_FREE, but PCLMUL is not supported."; break; | |||
| 69 | case GF_E_ALT_BY2: s = "Cannot specify -m BYTWO_x and -r ALTMAP."; break; | |||
| 70 | case GF_E_BY2_SSE: s = "Specified -m BYTWO_x -r SIMD, but SSE2 is not supported."; break; | |||
| 71 | case GF_E_LOGBADW: s = "With Log Tables, w must be <= 27."; break; | |||
| 72 | case GF_E_LOG___J: s = "Cannot use Log tables with -r ALTMAP|SIMD|NOSIMD."; break; | |||
| 73 | case GF_E_LOGPOLY: s = "Cannot use Log tables because the polynomial is not primitive."; break; | |||
| 74 | case GF_E_ZERBADW: s = "With -m LOG_ZERO, w must be 8 or 16."; break; | |||
| 75 | case GF_E_ZEXBADW: s = "With -m LOG_ZERO_EXT, w must be 8."; break; | |||
| 76 | case GF_E_GR_ARGX: s = "With -m GROUP, arg1 and arg2 must be >= 0."; break; | |||
| 77 | case GF_E_GR_W_48: s = "With -m GROUP, w cannot be 4 or 8."; break; | |||
| 78 | case GF_E_GR_W_16: s = "With -m GROUP, w == 16, arg1 and arg2 must be 4."; break; | |||
| 79 | case GF_E_GR_128A: s = "With -m GROUP, w == 128, arg1 must be 4, and arg2 in { 4,8,16 }."; break; | |||
| 80 | case GF_E_GR_A_27: s = "With -m GROUP, arg1 and arg2 must be <= 27."; break; | |||
| 81 | case GF_E_GR_AR_W: s = "With -m GROUP, arg1 and arg2 must be <= w."; break; | |||
| 82 | case GF_E_GR____J: s = "Cannot use GROUP with -r ALTMAP|SIMD|NOSIMD."; break; | |||
| 83 | case GF_E_TABLE_W: s = "With -m TABLE, w must be < 15, or == 16."; break; | |||
| 84 | case GF_E_TAB_SSE: s = "With -m TABLE, SIMD|NOSIMD only applies to w=4."; break; | |||
| 85 | case GF_E_TABSSE3: s = "With -m TABLE, -r SIMD, you need SSSE3 supported."; break; | |||
| 86 | case GF_E_TAB_ALT: s = "With -m TABLE, you cannot use ALTMAP."; break; | |||
| 87 | case GF_E_SP128AR: s = "With -m SPLIT, w=128, bad arg1/arg2."; break; | |||
| 88 | case GF_E_SP128AL: s = "With -m SPLIT, w=128, -r SIMD requires -r ALTMAP."; break; | |||
| 89 | case GF_E_SP128AS: s = "With -m SPLIT, w=128, ALTMAP needs SSSE3 supported."; break; | |||
| 90 | case GF_E_SP128_A: s = "With -m SPLIT, w=128, -r ALTMAP only with arg1/arg2 = 4/128."; break; | |||
| 91 | case GF_E_SP128_S: s = "With -m SPLIT, w=128, -r SIMD|NOSIMD only with arg1/arg2 = 4/128."; break; | |||
| 92 | case GF_E_SPLIT_W: s = "With -m SPLIT, w must be in {8, 16, 32, 64, 128}."; break; | |||
| 93 | case GF_E_SP_16AR: s = "With -m SPLIT, w=16, Bad arg1/arg2."; break; | |||
| 94 | case GF_E_SP_16_A: s = "With -m SPLIT, w=16, -r ALTMAP only with arg1/arg2 = 4/16."; break; | |||
| 95 | case GF_E_SP_16_S: s = "With -m SPLIT, w=16, -r SIMD|NOSIMD only with arg1/arg2 = 4/16."; break; | |||
| 96 | case GF_E_SP_32AR: s = "With -m SPLIT, w=32, Bad arg1/arg2."; break; | |||
| 97 | case GF_E_SP_32AS: s = "With -m SPLIT, w=32, -r ALTMAP needs SSSE3 supported."; break; | |||
| 98 | case GF_E_SP_32_A: s = "With -m SPLIT, w=32, -r ALTMAP only with arg1/arg2 = 4/32."; break; | |||
| 99 | case GF_E_SP_32_S: s = "With -m SPLIT, w=32, -r SIMD|NOSIMD only with arg1/arg2 = 4/32."; break; | |||
| 100 | case GF_E_SP_64AR: s = "With -m SPLIT, w=64, Bad arg1/arg2."; break; | |||
| 101 | case GF_E_SP_64AS: s = "With -m SPLIT, w=64, -r ALTMAP needs SSSE3 supported."; break; | |||
| 102 | case GF_E_SP_64_A: s = "With -m SPLIT, w=64, -r ALTMAP only with arg1/arg2 = 4/64."; break; | |||
| 103 | case GF_E_SP_64_S: s = "With -m SPLIT, w=64, -r SIMD|NOSIMD only with arg1/arg2 = 4/64."; break; | |||
| 104 | case GF_E_SP_8_AR: s = "With -m SPLIT, w=8, Bad arg1/arg2."; break; | |||
| 105 | case GF_E_SP_8__A: s = "With -m SPLIT, w=8, Can't have -r ALTMAP."; break; | |||
| 106 | case GF_E_SP_SSE3: s = "With -m SPLIT, Need SSSE3 support for SIMD."; break; | |||
| 107 | case GF_E_COMP_A2: s = "With -m COMPOSITE, arg1 must equal 2."; break; | |||
| 108 | case GF_E_COMP_SS: s = "With -m COMPOSITE, -r SIMD and -r NOSIMD do not apply."; break; | |||
| 109 | case GF_E_COMP__W: s = "With -m COMPOSITE, w must be 8, 16, 32, 64 or 128."; break; | |||
| 110 | case GF_E_UNKFLAG: s = "Unknown method flag - should be -m, -d, -r or -p."; break; | |||
| 111 | case GF_E_UNKNOWN: s = "Unknown multiplication type."; break; | |||
| 112 | case GF_E_UNK_REG: s = "Unknown region type."; break; | |||
| 113 | case GF_E_UNK_DIV: s = "Unknown division type."; break; | |||
| 114 | default: s = "Undefined error."; | |||
| 115 | } | |||
| 116 | ||||
| 117 | fprintf(stderr, "%s\n", s); | |||
| 118 | } | |||
| 119 | ||||
| 120 | uint64_t gf_composite_get_default_poly(gf_t *base) | |||
| 121 | { | |||
| 122 | gf_internal_t *h; | |||
| 123 | uint64_t rv; | |||
| 124 | ||||
| 125 | h = (gf_internal_t *) base->scratch; | |||
| 126 | if (h->w == 4) { | |||
| 127 | if (h->mult_type == GF_MULT_COMPOSITE) return 0; | |||
| 128 | if (h->prim_poly == 0x13) return 2; | |||
| 129 | return 0; | |||
| 130 | } | |||
| 131 | if (h->w == 8) { | |||
| 132 | if (h->mult_type == GF_MULT_COMPOSITE) return 0; | |||
| 133 | if (h->prim_poly == 0x11d) return 3; | |||
| 134 | return 0; | |||
| 135 | } | |||
| 136 | if (h->w == 16) { | |||
| 137 | if (h->mult_type == GF_MULT_COMPOSITE) { | |||
| 138 | rv = gf_composite_get_default_poly(h->base_gf); | |||
| 139 | if (rv != h->prim_poly) return 0; | |||
| 140 | if (rv == 3) return 0x105; | |||
| 141 | return 0; | |||
| 142 | } else { | |||
| 143 | if (h->prim_poly == 0x1100b) return 2; | |||
| 144 | if (h->prim_poly == 0x1002d) return 7; | |||
| 145 | return 0; | |||
| 146 | } | |||
| 147 | } | |||
| 148 | if (h->w == 32) { | |||
| 149 | if (h->mult_type == GF_MULT_COMPOSITE) { | |||
| 150 | rv = gf_composite_get_default_poly(h->base_gf); | |||
| 151 | if (rv != h->prim_poly) return 0; | |||
| 152 | if (rv == 2) return 0x10005; | |||
| 153 | if (rv == 7) return 0x10008; | |||
| 154 | if (rv == 0x105) return 0x10002; | |||
| 155 | return 0; | |||
| 156 | } else { | |||
| 157 | if (h->prim_poly == 0x400007) return 2; | |||
| 158 | if (h->prim_poly == 0xc5) return 3; | |||
| 159 | return 0; | |||
| 160 | } | |||
| 161 | } | |||
| 162 | if (h->w == 64) { | |||
| 163 | if (h->mult_type == GF_MULT_COMPOSITE) { | |||
| 164 | rv = gf_composite_get_default_poly(h->base_gf); | |||
| 165 | if (rv != h->prim_poly) return 0; | |||
| 166 | if (rv == 3) return 0x100000009ULL; | |||
| 167 | if (rv == 2) return 0x100000004ULL; | |||
| 168 | if (rv == 0x10005) return 0x100000003ULL; | |||
| 169 | if (rv == 0x10002) return 0x100000005ULL; | |||
| 170 | if (rv == 0x10008) return 0x100000006ULL; /* JSP: (0x100000003 works too, | |||
| 171 | but I want to differentiate cases). */ | |||
| 172 | return 0; | |||
| 173 | } else { | |||
| 174 | if (h->prim_poly == 0x1bULL) return 2; | |||
| 175 | return 0; | |||
| 176 | } | |||
| 177 | } | |||
| 178 | return 0; | |||
| 179 | } | |||
| 180 | ||||
| 181 | int gf_error_check(int w, int mult_type, int region_type, int divide_type, | |||
| 182 | int arg1, int arg2, uint64_t poly, gf_t *base) | |||
| 183 | { | |||
| 184 | int sse3 = 0; | |||
| 185 | int sse2 = 0; | |||
| 186 | int pclmul = 0; | |||
| 187 | int rdouble, rquad, rlazy, rsimd, rnosimd, raltmap, rcauchy, tmp; | |||
| 188 | gf_internal_t *sub; | |||
| 189 | ||||
| 190 | rdouble = (region_type & GF_REGION_DOUBLE_TABLE); | |||
| 191 | rquad = (region_type & GF_REGION_QUAD_TABLE); | |||
| 192 | rlazy = (region_type & GF_REGION_LAZY); | |||
| 193 | rsimd = (region_type & GF_REGION_SIMD); | |||
| 194 | rnosimd = (region_type & GF_REGION_NOSIMD); | |||
| 195 | raltmap = (region_type & GF_REGION_ALTMAP); | |||
| 196 | rcauchy = (region_type & GF_REGION_CAUCHY); | |||
| 197 | ||||
| 198 | if (divide_type != GF_DIVIDE_DEFAULT && | |||
| 199 | divide_type != GF_DIVIDE_MATRIX && | |||
| 200 | divide_type != GF_DIVIDE_EUCLID) { | |||
| 201 | _gf_errno = GF_E_UNK_DIV; | |||
| 202 | return 0; | |||
| 203 | } | |||
| 204 | ||||
| 205 | tmp = ( GF_REGION_DOUBLE_TABLE | GF_REGION_QUAD_TABLE | GF_REGION_LAZY | | |||
| 206 | GF_REGION_SIMD | GF_REGION_NOSIMD | GF_REGION_ALTMAP | | |||
| 207 | GF_REGION_CAUCHY ); | |||
| 208 | if (region_type & (~tmp)) { _gf_errno = GF_E_UNK_REG; return 0; } | |||
| 209 | ||||
| 210 | #ifdef INTEL_SSE2 | |||
| 211 | if (gf_cpu_supports_intel_sse2) { | |||
| 212 | sse2 = 1; | |||
| 213 | } | |||
| 214 | #endif | |||
| 215 | ||||
| 216 | #ifdef INTEL_SSSE3 | |||
| 217 | if (gf_cpu_supports_intel_ssse3) { | |||
| 218 | sse3 = 1; | |||
| 219 | } | |||
| 220 | #endif | |||
| 221 | ||||
| 222 | #ifdef INTEL_SSE4_PCLMUL | |||
| 223 | if (gf_cpu_supports_intel_pclmul) { | |||
| 224 | pclmul = 1; | |||
| 225 | } | |||
| 226 | #endif | |||
| 227 | ||||
| 228 | #ifdef ARM_NEON | |||
| 229 | if (gf_cpu_supports_arm_neon) { | |||
| 230 | pclmul = (w == 4 || w == 8); | |||
| 231 | sse3 = 1; | |||
| 232 | } | |||
| 233 | #endif | |||
| 234 | ||||
| 235 | ||||
| 236 | if (w < 1 || (w > 32 && w != 64 && w != 128)) { _gf_errno = GF_E_BAD___W; return 0; } | |||
| 237 | ||||
| 238 | if (mult_type != GF_MULT_COMPOSITE && w < 64) { | |||
| 239 | if ((poly >> (w+1)) != 0) { _gf_errno = GF_E_BADPOLY; return 0; } | |||
| 240 | } | |||
| 241 | ||||
| 242 | if (mult_type == GF_MULT_DEFAULT) { | |||
| 243 | if (divide_type != GF_DIVIDE_DEFAULT) { _gf_errno = GF_E_MDEFDIV; return 0; } | |||
| 244 | if (region_type != GF_REGION_DEFAULT) { _gf_errno = GF_E_MDEFREG; return 0; } | |||
| 245 | if (arg1 != 0 || arg2 != 0) { _gf_errno = GF_E_MDEFARG; return 0; } | |||
| 246 | return 1; | |||
| 247 | } | |||
| 248 | ||||
| 249 | if (rsimd && rnosimd) { _gf_errno = GF_E_SIMD_NO; return 0; } | |||
| 250 | if (rcauchy && w > 32) { _gf_errno = GF_E_CAUGT32; return 0; } | |||
| 251 | if (rcauchy && region_type != GF_REGION_CAUCHY) { _gf_errno = GF_E_CAUCHYB; return 0; } | |||
| 252 | if (rcauchy && mult_type == GF_MULT_COMPOSITE) { _gf_errno = GF_E_CAUCOMP; return 0; } | |||
| 253 | ||||
| 254 | if (arg1 != 0 && mult_type != GF_MULT_COMPOSITE && | |||
| 255 | mult_type != GF_MULT_SPLIT_TABLE && mult_type != GF_MULT_GROUP) { | |||
| 256 | _gf_errno = GF_E_ARG1SET; | |||
| 257 | return 0; | |||
| 258 | } | |||
| 259 | ||||
| 260 | if (arg2 != 0 && mult_type != GF_MULT_SPLIT_TABLE && mult_type != GF_MULT_GROUP) { | |||
| 261 | _gf_errno = GF_E_ARG2SET; | |||
| 262 | return 0; | |||
| 263 | } | |||
| 264 | ||||
| 265 | if (divide_type == GF_DIVIDE_MATRIX && w > 32) { _gf_errno = GF_E_MATRIXW; return 0; } | |||
| 266 | ||||
| 267 | if (rdouble) { | |||
| 268 | if (rquad) { _gf_errno = GF_E_DOUQUAD; return 0; } | |||
| 269 | if (mult_type != GF_MULT_TABLE) { _gf_errno = GF_E_DOUBLET; return 0; } | |||
| 270 | if (w != 4 && w != 8) { _gf_errno = GF_E_DOUBLEW; return 0; } | |||
| 271 | if (rsimd || rnosimd || raltmap) { _gf_errno = GF_E_DOUBLEJ; return 0; } | |||
| 272 | if (rlazy && w == 4) { _gf_errno = GF_E_DOUBLEL; return 0; } | |||
| 273 | return 1; | |||
| 274 | } | |||
| 275 | ||||
| 276 | if (rquad) { | |||
| 277 | if (mult_type != GF_MULT_TABLE) { _gf_errno = GF_E_QUAD__T; return 0; } | |||
| 278 | if (w != 4) { _gf_errno = GF_E_QUAD__W; return 0; } | |||
| 279 | if (rsimd || rnosimd || raltmap) { _gf_errno = GF_E_QUAD__J; return 0; } | |||
| 280 | return 1; | |||
| 281 | } | |||
| 282 | ||||
| 283 | if (rlazy) { _gf_errno = GF_E_LAZY__X; return 0; } | |||
| 284 | ||||
| 285 | if (mult_type == GF_MULT_SHIFT) { | |||
| 286 | if (raltmap) { _gf_errno = GF_E_ALTSHIF; return 0; } | |||
| 287 | if (rsimd || rnosimd) { _gf_errno = GF_E_SSESHIF; return 0; } | |||
| 288 | return 1; | |||
| 289 | } | |||
| 290 | ||||
| 291 | if (mult_type == GF_MULT_CARRY_FREE) { | |||
| 292 | if (w != 4 && w != 8 && w != 16 && | |||
| 293 | w != 32 && w != 64 && w != 128) { _gf_errno = GF_E_CFM___W; return 0; } | |||
| 294 | if (w == 4 && (poly & 0xc)) { _gf_errno = GF_E_CFM4POL; return 0; } | |||
| 295 | if (w == 8 && (poly & 0x80)) { _gf_errno = GF_E_CFM8POL; return 0; } | |||
| 296 | if (w == 16 && (poly & 0xe000)) { _gf_errno = GF_E_CF16POL; return 0; } | |||
| 297 | if (w == 32 && (poly & 0xfe000000)) { _gf_errno = GF_E_CF32POL; return 0; } | |||
| 298 | if (w == 64 && (poly & 0xfffe000000000000ULL)) { _gf_errno = GF_E_CF64POL; return 0; } | |||
| 299 | if (raltmap) { _gf_errno = GF_E_ALT_CFM; return 0; } | |||
| 300 | if (rsimd || rnosimd) { _gf_errno = GF_E_SSE_CFM; return 0; } | |||
| 301 | if (!pclmul) { _gf_errno = GF_E_PCLMULX; return 0; } | |||
| 302 | return 1; | |||
| 303 | } | |||
| 304 | ||||
| 305 | if (mult_type == GF_MULT_CARRY_FREE_GK) { | |||
| 306 | if (w != 4 && w != 8 && w != 16 && | |||
| 307 | w != 32 && w != 64 && w != 128) { _gf_errno = GF_E_CFM___W; return 0; } | |||
| 308 | if (raltmap) { _gf_errno = GF_E_ALT_CFM; return 0; } | |||
| 309 | if (rsimd || rnosimd) { _gf_errno = GF_E_SSE_CFM; return 0; } | |||
| 310 | if (!pclmul) { _gf_errno = GF_E_PCLMULX; return 0; } | |||
| 311 | return 1; | |||
| 312 | } | |||
| 313 | ||||
| 314 | if (mult_type == GF_MULT_BYTWO_p || mult_type == GF_MULT_BYTWO_b) { | |||
| 315 | if (raltmap) { _gf_errno = GF_E_ALT_BY2; return 0; } | |||
| 316 | if (rsimd && !sse2) { _gf_errno = GF_E_BY2_SSE; return 0; } | |||
| 317 | return 1; | |||
| 318 | } | |||
| 319 | ||||
| 320 | if (mult_type == GF_MULT_LOG_TABLE || mult_type == GF_MULT_LOG_ZERO | |||
| 321 | || mult_type == GF_MULT_LOG_ZERO_EXT ) { | |||
| 322 | if (w > 27) { _gf_errno = GF_E_LOGBADW; return 0; } | |||
| 323 | if (raltmap || rsimd || rnosimd) { _gf_errno = GF_E_LOG___J; return 0; } | |||
| 324 | ||||
| 325 | if (mult_type == GF_MULT_LOG_TABLE) return 1; | |||
| 326 | ||||
| 327 | if (w != 8 && w != 16) { _gf_errno = GF_E_ZERBADW; return 0; } | |||
| 328 | ||||
| 329 | if (mult_type == GF_MULT_LOG_ZERO) return 1; | |||
| 330 | ||||
| 331 | if (w != 8) { _gf_errno = GF_E_ZEXBADW; return 0; } | |||
| 332 | return 1; | |||
| 333 | } | |||
| 334 | ||||
| 335 | if (mult_type == GF_MULT_GROUP) { | |||
| 336 | if (arg1 <= 0 || arg2 <= 0) { _gf_errno = GF_E_GR_ARGX; return 0; } | |||
| 337 | if (w == 4 || w == 8) { _gf_errno = GF_E_GR_W_48; return 0; } | |||
| 338 | if (w == 16 && (arg1 != 4 || arg2 != 4)) { _gf_errno = GF_E_GR_W_16; return 0; } | |||
| 339 | if (w == 128 && (arg1 != 4 || | |||
| 340 | (arg2 != 4 && arg2 != 8 && arg2 != 16))) { _gf_errno = GF_E_GR_128A; return 0; } | |||
| 341 | if (arg1 > 27 || arg2 > 27) { _gf_errno = GF_E_GR_A_27; return 0; } | |||
| 342 | if (arg1 > w || arg2 > w) { _gf_errno = GF_E_GR_AR_W; return 0; } | |||
| 343 | if (raltmap || rsimd || rnosimd) { _gf_errno = GF_E_GR____J; return 0; } | |||
| 344 | return 1; | |||
| 345 | } | |||
| 346 | ||||
| 347 | if (mult_type == GF_MULT_TABLE) { | |||
| 348 | if (w != 16 && w >= 15) { _gf_errno = GF_E_TABLE_W; return 0; } | |||
| 349 | if (w != 4 && (rsimd || rnosimd)) { _gf_errno = GF_E_TAB_SSE; return 0; } | |||
| 350 | if (rsimd && !sse3) { _gf_errno = GF_E_TABSSE3; return 0; } | |||
| 351 | if (raltmap) { _gf_errno = GF_E_TAB_ALT; return 0; } | |||
| 352 | return 1; | |||
| 353 | } | |||
| 354 | ||||
| 355 | if (mult_type == GF_MULT_SPLIT_TABLE) { | |||
| 356 | if (arg1 > arg2) { | |||
| 357 | tmp = arg1; | |||
| 358 | arg1 = arg2; | |||
| 359 | arg2 = tmp; | |||
| 360 | } | |||
| 361 | if (w == 8) { | |||
| 362 | if (arg1 != 4 || arg2 != 8) { _gf_errno = GF_E_SP_8_AR; return 0; } | |||
| 363 | if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; } | |||
| 364 | if (raltmap) { _gf_errno = GF_E_SP_8__A; return 0; } | |||
| 365 | } else if (w == 16) { | |||
| 366 | if ((arg1 == 8 && arg2 == 8) || | |||
| 367 | (arg1 == 8 && arg2 == 16)) { | |||
| 368 | if (rsimd || rnosimd) { _gf_errno = GF_E_SP_16_S; return 0; } | |||
| 369 | if (raltmap) { _gf_errno = GF_E_SP_16_A; return 0; } | |||
| 370 | } else if (arg1 == 4 && arg2 == 16) { | |||
| 371 | if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; } | |||
| 372 | } else { _gf_errno = GF_E_SP_16AR; return 0; } | |||
| 373 | } else if (w == 32) { | |||
| 374 | if ((arg1 == 8 && arg2 == 8) || | |||
| 375 | (arg1 == 8 && arg2 == 32) || | |||
| 376 | (arg1 == 16 && arg2 == 32)) { | |||
| 377 | if (rsimd || rnosimd) { _gf_errno = GF_E_SP_32_S; return 0; } | |||
| 378 | if (raltmap) { _gf_errno = GF_E_SP_32_A; return 0; } | |||
| 379 | } else if (arg1 == 4 && arg2 == 32) { | |||
| 380 | if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; } | |||
| 381 | if (raltmap && !sse3) { _gf_errno = GF_E_SP_32AS; return 0; } | |||
| 382 | if (raltmap && rnosimd) { _gf_errno = GF_E_SP_32AS; return 0; } | |||
| 383 | } else { _gf_errno = GF_E_SP_32AR; return 0; } | |||
| 384 | } else if (w == 64) { | |||
| 385 | if ((arg1 == 8 && arg2 == 8) || | |||
| 386 | (arg1 == 8 && arg2 == 64) || | |||
| 387 | (arg1 == 16 && arg2 == 64)) { | |||
| 388 | if (rsimd || rnosimd) { _gf_errno = GF_E_SP_64_S; return 0; } | |||
| 389 | if (raltmap) { _gf_errno = GF_E_SP_64_A; return 0; } | |||
| 390 | } else if (arg1 == 4 && arg2 == 64) { | |||
| 391 | if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; } | |||
| 392 | if (raltmap && !sse3) { _gf_errno = GF_E_SP_64AS; return 0; } | |||
| 393 | if (raltmap && rnosimd) { _gf_errno = GF_E_SP_64AS; return 0; } | |||
| 394 | } else { _gf_errno = GF_E_SP_64AR; return 0; } | |||
| 395 | } else if (w == 128) { | |||
| 396 | if (arg1 == 8 && arg2 == 128) { | |||
| 397 | if (rsimd || rnosimd) { _gf_errno = GF_E_SP128_S; return 0; } | |||
| 398 | if (raltmap) { _gf_errno = GF_E_SP128_A; return 0; } | |||
| 399 | } else if (arg1 == 4 && arg2 == 128) { | |||
| 400 | if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; } | |||
| 401 | if (raltmap && !sse3) { _gf_errno = GF_E_SP128AS; return 0; } | |||
| 402 | if (raltmap && rnosimd) { _gf_errno = GF_E_SP128AS; return 0; } | |||
| 403 | } else { _gf_errno = GF_E_SP128AR; return 0; } | |||
| 404 | } else { _gf_errno = GF_E_SPLIT_W; return 0; } | |||
| 405 | return 1; | |||
| 406 | } | |||
| 407 | ||||
| 408 | if (mult_type == GF_MULT_COMPOSITE) { | |||
| 409 | if (w != 8 && w != 16 && w != 32 | |||
| 410 | && w != 64 && w != 128) { _gf_errno = GF_E_COMP__W; return 0; } | |||
| 411 | if (w < 128 && (poly >> (w/2)) != 0) { _gf_errno = GF_E_COMP_PP; return 0; } | |||
| 412 | if (divide_type != GF_DIVIDE_DEFAULT) { _gf_errno = GF_E_DIVCOMP; return 0; } | |||
| 413 | if (arg1 != 2) { _gf_errno = GF_E_COMP_A2; return 0; } | |||
| 414 | if (rsimd || rnosimd) { _gf_errno = GF_E_COMP_SS; return 0; } | |||
| 415 | if (base != NULL) { | |||
| 416 | sub = (gf_internal_t *) base->scratch; | |||
| 417 | if (sub->w != w/2) { _gf_errno = GF_E_BASE__W; return 0; } | |||
| 418 | if (poly == 0) { | |||
| 419 | if (gf_composite_get_default_poly(base) == 0) { _gf_errno = GF_E_COMPXPP; return 0; } | |||
| 420 | } | |||
| 421 | } | |||
| 422 | return 1; | |||
| 423 | } | |||
| 424 | ||||
| 425 | _gf_errno = GF_E_UNKNOWN; | |||
| 426 | return 0; | |||
| 427 | } | |||
| 428 | ||||
| 429 | int gf_scratch_size(int w, | |||
| 430 | int mult_type, | |||
| 431 | int region_type, | |||
| 432 | int divide_type, | |||
| 433 | int arg1, | |||
| 434 | int arg2) | |||
| 435 | { | |||
| 436 | if (gf_error_check(w, mult_type, region_type, divide_type, arg1, arg2, 0, NULL) == 0) return 0; | |||
| 437 | ||||
| 438 | switch(w) { | |||
| 439 | case 4: return gf_w4_scratch_size(mult_type, region_type, divide_type, arg1, arg2); | |||
| 440 | case 8: return gf_w8_scratch_size(mult_type, region_type, divide_type, arg1, arg2); | |||
| 441 | case 16: return gf_w16_scratch_size(mult_type, region_type, divide_type, arg1, arg2); | |||
| 442 | case 32: return gf_w32_scratch_size(mult_type, region_type, divide_type, arg1, arg2); | |||
| 443 | case 64: return gf_w64_scratch_size(mult_type, region_type, divide_type, arg1, arg2); | |||
| 444 | case 128: return gf_w128_scratch_size(mult_type, region_type, divide_type, arg1, arg2); | |||
| 445 | default: return gf_wgen_scratch_size(w, mult_type, region_type, divide_type, arg1, arg2); | |||
| 446 | } | |||
| 447 | } | |||
| 448 | ||||
| 449 | extern int gf_size(gf_t *gf) | |||
| 450 | { | |||
| 451 | gf_internal_t *h; | |||
| 452 | int s; | |||
| 453 | ||||
| 454 | s = sizeof(gf_t); | |||
| 455 | h = (gf_internal_t *) gf->scratch; | |||
| 456 | s += gf_scratch_size(h->w, h->mult_type, h->region_type, h->divide_type, h->arg1, h->arg2); | |||
| 457 | if (h->mult_type == GF_MULT_COMPOSITE) s += gf_size(h->base_gf); | |||
| 458 | return s; | |||
| 459 | } | |||
| 460 | ||||
| 461 | ||||
| 462 | int gf_init_easy(gf_t *gf, int w) | |||
| 463 | { | |||
| 464 | return gf_init_hard(gf, w, GF_MULT_DEFAULT, GF_REGION_DEFAULT, GF_DIVIDE_DEFAULT, | |||
| 465 | 0, 0, 0, NULL, NULL); | |||
| 466 | } | |||
| 467 | ||||
| 468 | /* Allen: What's going on here is this function is putting info into the | |||
| 469 | scratch mem of gf, and then calling the relevant REAL init | |||
| 470 | func for the word size. Probably done this way to consolidate | |||
| 471 | those aspects of initialization that don't rely on word size, | |||
| 472 | and then take care of word-size-specific stuff. */ | |||
| 473 | ||||
| 474 | int gf_init_hard(gf_t *gf, int w, int mult_type, | |||
| 475 | int region_type, | |||
| 476 | int divide_type, | |||
| 477 | uint64_t prim_poly, | |||
| 478 | int arg1, int arg2, | |||
| 479 | gf_t *base_gf, | |||
| 480 | void *scratch_memory) | |||
| 481 | { | |||
| 482 | int sz; | |||
| 483 | gf_internal_t *h; | |||
| 484 | ||||
| 485 | gf_cpu_identify(); | |||
| 486 | ||||
| 487 | if (gf_error_check(w, mult_type, region_type, divide_type, | |||
| 488 | arg1, arg2, prim_poly, base_gf) == 0) return 0; | |||
| 489 | ||||
| 490 | sz = gf_scratch_size(w, mult_type, region_type, divide_type, arg1, arg2); | |||
| 491 | if (sz <= 0) return 0; /* This shouldn't happen, as all errors should get caught | |||
| 492 | in gf_error_check() */ | |||
| 493 | ||||
| 494 | if (scratch_memory == NULL) { | |||
| 495 | h = (gf_internal_t *) malloc(sz); | |||
| 496 | h->free_me = 1; | |||
| 497 | } else { | |||
| 498 | h = scratch_memory; | |||
| 499 | h->free_me = 0; | |||
| 500 | } | |||
| 501 | gf->scratch = (void *) h; | |||
| 502 | h->mult_type = mult_type; | |||
| 503 | h->region_type = region_type; | |||
| 504 | h->divide_type = divide_type; | |||
| 505 | h->w = w; | |||
| 506 | h->prim_poly = prim_poly; | |||
| 507 | h->arg1 = arg1; | |||
| 508 | h->arg2 = arg2; | |||
| 509 | h->base_gf = base_gf; | |||
| 510 | h->private = (void *) gf->scratch; | |||
| 511 | h->private = (uint8_t *)h->private + (sizeof(gf_internal_t)); | |||
| 512 | gf->extract_word.w32 = NULL; | |||
| 513 | ||||
| 514 | switch(w) { | |||
| 515 | case 4: return gf_w4_init(gf); | |||
| 516 | case 8: return gf_w8_init(gf); | |||
| 517 | case 16: return gf_w16_init(gf); | |||
| 518 | case 32: return gf_w32_init(gf); | |||
| 519 | case 64: return gf_w64_init(gf); | |||
| 520 | case 128: return gf_w128_init(gf); | |||
| 521 | default: return gf_wgen_init(gf); | |||
| 522 | } | |||
| 523 | } | |||
| 524 | ||||
| 525 | int gf_free(gf_t *gf, int recursive) | |||
| 526 | { | |||
| 527 | gf_internal_t *h; | |||
| 528 | ||||
| 529 | h = (gf_internal_t *) gf->scratch; | |||
| 530 | if (recursive && h->base_gf != NULL) { | |||
| 531 | gf_free(h->base_gf, 1); | |||
| 532 | free(h->base_gf); | |||
| 533 | } | |||
| 534 | if (h->free_me) free(h); | |||
| 535 | return 0; /* Making compiler happy */ | |||
| 536 | } | |||
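For reference, a minimal caller sketch of the init/free pair above (not part of gf.c; it assumes the public gf_complete.h header declaring gf_t, gf_init_easy(), gf_error() and gf_free(), plus the multiply.w32 function pointer used later in this file):

    #include <stdio.h>
    #include "gf_complete.h"   /* assumed public header for gf_t, gf_init_easy(), gf_free() */

    int main(void)
    {
      gf_t gf;

      /* gf_init_easy() forwards to gf_init_hard() with all-default arguments (see above). */
      if (!gf_init_easy(&gf, 8)) {
        gf_error();              /* prints the message selected by _gf_errno */
        return 1;
      }

      /* multiply.w32 is the per-width multiply hook installed by gf_w8_init(). */
      printf("3 * 7 in GF(2^8) = %u\n", gf.multiply.w32(&gf, 3, 7));

      gf_free(&gf, 0);           /* scratch was malloc'd internally, so free_me is set */
      return 0;
    }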
| 537 | ||||
| 538 | void gf_alignment_error(char *s, int a) | |||
| 539 | { | |||
| 540 | fprintf(stderr, "Alignment error in %s:\n", s)__fprintf_chk (stderr, 2 - 1, "Alignment error in %s:\n", s); | |||
| 541 | fprintf(stderr, " The source and destination buffers must be aligned to each other,\n")__fprintf_chk (stderr, 2 - 1, " The source and destination buffers must be aligned to each other,\n" ); | |||
| 542 | fprintf(stderr, " and they must be aligned to a %d-byte address.\n", a)__fprintf_chk (stderr, 2 - 1, " and they must be aligned to a %d-byte address.\n" , a); | |||
| 543 | assert(0)((void) (0)); | |||
| 544 | } | |||
| 545 | ||||
| 546 | static | |||
| 547 | void gf_invert_binary_matrix(uint32_t *mat, uint32_t *inv, int rows) { | |||
| 548 | int cols, i, j; | |||
| 549 | uint32_t tmp; | |||
| 550 | ||||
| 551 | cols = rows; | |||
| 552 | ||||
| 553 | for (i = 0; i < rows; i++) inv[i] = (1 << i); | |||
| 554 | ||||
| 555 | /* First -- convert into upper triangular */ | |||
| 556 | ||||
| 557 | for (i = 0; i < cols; i++) { | |||
| 558 | ||||
| 559 | /* Swap rows if we have a zero i,i element. If we can't swap, then the | |||
| 560 | matrix was not invertible */ | |||
| 561 | ||||
| 562 | if ((mat[i] & (1 << i)) == 0) { | |||
| 563 | for (j = i+1; j < rows && (mat[j] & (1 << i)) == 0; j++) ; | |||
| 564 | if (j == rows) { | |||
| 565 | fprintf(stderr, "galois_invert_matrix: Matrix not invertible!!\n")__fprintf_chk (stderr, 2 - 1, "galois_invert_matrix: Matrix not invertible!!\n" ); | |||
| 566 | assert(0)((void) (0)); | |||
| 567 | } | |||
| 568 | tmp = mat[i]; mat[i] = mat[j]; mat[j] = tmp; | |||
| 569 | tmp = inv[i]; inv[i] = inv[j]; inv[j] = tmp; | |||
| 570 | } | |||
| 571 | ||||
| 572 | /* Now for each j>i, add A_ji*Ai to Aj */ | |||
| 573 | for (j = i+1; j != rows; j++) { | |||
| 574 | if ((mat[j] & (1 << i)) != 0) { | |||
| 575 | mat[j] ^= mat[i]; | |||
| 576 | inv[j] ^= inv[i]; | |||
| 577 | } | |||
| 578 | } | |||
| 579 | } | |||
| 580 | ||||
| 581 | /* Now the matrix is upper triangular. Start at the top and multiply down */ | |||
| 582 | ||||
| 583 | for (i = rows-1; i >= 0; i--) { | |||
| 584 | for (j = 0; j < i; j++) { | |||
| 585 | if (mat[j] & (1 << i)) { | |||
| 586 | /* mat[j] ^= mat[i]; */ | |||
| 587 | inv[j] ^= inv[i]; | |||
| 588 | } | |||
| 589 | } | |||
| 590 | } | |||
| 591 | } | |||
| 592 | ||||
| 593 | uint32_t gf_bitmatrix_inverse(uint32_t y, int w, uint32_t pp) | |||
| 594 | { | |||
| 595 | uint32_t mat[32], inv[32], mask; | |||
| 596 | int i; | |||
| 597 | ||||
| 598 | mask = (w == 32) ? 0xffffffff : ((uint32_t)1 << w) - 1; | |||
| 599 | for (i = 0; i < w; i++) { | |||
| 600 | mat[i] = y; | |||
| 601 | ||||
| 602 | if (y & (1 << (w-1))) { | |||
| 603 | y = y << 1; | |||
| 604 | y = ((y ^ pp) & mask); | |||
| 605 | } else { | |||
| 606 | y = y << 1; | |||
| 607 | } | |||
| 608 | } | |||
| 609 | ||||
| 610 | gf_invert_binary_matrix(mat, inv, w); | |||
| 611 | return inv[0]; | |||
| 612 | } | |||
| 613 | ||||
| 614 | void gf_two_byte_region_table_multiply(gf_region_data *rd, uint16_t *base) | |||
| 615 | { | |||
| 616 | uint64_t a, prod; | |||
| 617 | int xor; | |||
| 618 | uint64_t *s64, *d64, *top; | |||
| 619 | ||||
| 620 | s64 = rd->s_start; | |||
| 621 | d64 = rd->d_start; | |||
| 622 | top = rd->d_top; | |||
| 623 | xor = rd->xor; | |||
| 624 | ||||
| 625 | if (xor) { | |||
| 626 | while (d64 != top) { | |||
| 627 | a = *s64; | |||
| 628 | prod = base[a >> 48]; | |||
| 629 | a <<= 16; | |||
| 630 | prod <<= 16; | |||
| 631 | prod ^= base[a >> 48]; | |||
| 632 | a <<= 16; | |||
| 633 | prod <<= 16; | |||
| 634 | prod ^= base[a >> 48]; | |||
| 635 | a <<= 16; | |||
| 636 | prod <<= 16; | |||
| 637 | prod ^= base[a >> 48]; | |||
| 638 | prod ^= *d64; | |||
| 639 | *d64 = prod; | |||
| 640 | s64++; | |||
| 641 | d64++; | |||
| 642 | } | |||
| 643 | } else { | |||
| 644 | while (d64 != top) { | |||
| 645 | a = *s64; | |||
| 646 | prod = base[a >> 48]; | |||
| 647 | a <<= 16; | |||
| 648 | prod <<= 16; | |||
| 649 | prod ^= base[a >> 48]; | |||
| 650 | a <<= 16; | |||
| 651 | prod <<= 16; | |||
| 652 | prod ^= base[a >> 48]; | |||
| 653 | a <<= 16; | |||
| 654 | prod <<= 16; | |||
| 655 | prod ^= base[a >> 48]; | |||
| 656 | *d64 = prod; | |||
| 657 | s64++; | |||
| 658 | d64++; | |||
| 659 | } | |||
| 660 | } | |||
| 661 | } | |||
| 662 | ||||
| 663 | static void gf_slow_multiply_region(gf_region_data *rd, void *src, void *dest, void *s_top) | |||
| 664 | { | |||
| 665 | uint8_t *s8, *d8; | |||
| 666 | uint16_t *s16, *d16; | |||
| 667 | uint32_t *s32, *d32; | |||
| 668 | uint64_t *s64, *d64; | |||
| 669 | gf_internal_t *h; | |||
| 670 | int wb; | |||
| 671 | uint32_t p, a; | |||
| 672 | ||||
| 673 | h = rd->gf->scratch; | |||
| 674 | wb = (h->w)/8; | |||
| 675 | if (wb == 0) wb = 1; | |||
| 676 | ||||
| 677 | while (src < s_top) { | |||
| 678 | switch (h->w) { | |||
| 679 | case 8: | |||
| 680 | s8 = (uint8_t *) src; | |||
| 681 | d8 = (uint8_t *) dest; | |||
| 682 | *d8 = (rd->xor) ? (*d8 ^ rd->gf->multiply.w32(rd->gf, rd->val, *s8)) : | |||
| 683 | rd->gf->multiply.w32(rd->gf, rd->val, *s8); | |||
| 684 | break; | |||
| 685 | case 4: | |||
| 686 | s8 = (uint8_t *) src; | |||
| 687 | d8 = (uint8_t *) dest; | |||
| 688 | a = *s8; | |||
| 689 | p = rd->gf->multiply.w32(rd->gf, rd->val, a&0xf); | |||
| 690 | p |= (rd->gf->multiply.w32(rd->gf, rd->val, a >> 4) << 4); | |||
| 691 | if (rd->xor) p ^= *d8; | |||
| 692 | *d8 = p; | |||
| 693 | break; | |||
| 694 | case 16: | |||
| 695 | s16 = (uint16_t *) src; | |||
| 696 | d16 = (uint16_t *) dest; | |||
| 697 | *d16 = (rd->xor) ? (*d16 ^ rd->gf->multiply.w32(rd->gf, rd->val, *s16)) : | |||
| 698 | rd->gf->multiply.w32(rd->gf, rd->val, *s16); | |||
| 699 | break; | |||
| 700 | case 32: | |||
| 701 | s32 = (uint32_t *) src; | |||
| 702 | d32 = (uint32_t *) dest; | |||
| 703 | *d32 = (rd->xor) ? (*d32 ^ rd->gf->multiply.w32(rd->gf, rd->val, *s32)) : | |||
| 704 | rd->gf->multiply.w32(rd->gf, rd->val, *s32); | |||
| 705 | break; | |||
| 706 | case 64: | |||
| 707 | s64 = (uint64_t *) src; | |||
| 708 | d64 = (uint64_t *) dest; | |||
| 709 | *d64 = (rd->xor) ? (*d64 ^ rd->gf->multiply.w64(rd->gf, rd->val, *s64)) : | |||
| 710 | rd->gf->multiply.w64(rd->gf, rd->val, *s64); | |||
| 711 | break; | |||
| 712 | default: | |||
| 713 | fprintf(stderr, "Error: gf_slow_multiply_region: w=%d not implemented.\n", h->w)__fprintf_chk (stderr, 2 - 1, "Error: gf_slow_multiply_region: w=%d not implemented.\n" , h->w); | |||
| 714 | exit(1); | |||
| 715 | } | |||
| 716 | src = (uint8_t *)src + wb; | |||
| 717 | dest = (uint8_t *)dest + wb; | |||
| 718 | } | |||
| 719 | } | |||
| 720 | ||||
| 721 | /* JSP - The purpose of this procedure is to error check alignment, | |||
| 722 | and to set up the region operation so that it can best leverage | |||
| 723 | large words. | |||
| 724 | ||||
| 725 | It stores its information in rd. | |||
| 726 | ||||
| 727 | Assuming you're not doing Cauchy coding, (see below for that), | |||
| 728 | then w will be 4, 8, 16, 32 or 64. It can't be 128 (probably | |||
| 729 | should change that). | |||
| 730 | ||||
| 731 | src and dest must then be aligned on ceil(w/8)-byte boundaries. | |||
| 732 | Moreover, bytes must be a multiple of ceil(w/8). If the variable | |||
| 733 | align is equal to ceil(w/8), then we will set s_start = src, | |||
| 734 | d_start = dest, s_top to (src+bytes) and d_top to (dest+bytes). | |||
| 735 | And we return -- the implementation will go ahead and do the | |||
| 736 | multiplication on individual words (e.g. using discrete logs). | |||
| 737 | ||||
| 738 | If align is greater than ceil(w/8), then the implementation needs | |||
| 739 | to work on groups of "align" bytes. For example, suppose you are | |||
| 740 | implementing BYTWO, without SSE. Then you will be doing the region | |||
| 741 | multiplication in units of 8 bytes, so align = 8. Or, suppose you | |||
| 742 | are doing a Quad table in GF(2^4). You will be doing the region | |||
| 743 | multiplication in units of 2 bytes, so align = 2. Or, suppose you | |||
| 744 | are doing split multiplication with SSE operations in GF(2^8). | |||
| 745 | Then align = 16. Worse yet, suppose you are doing split | |||
| 746 | multiplication with SSE operations in GF(2^16), with or without | |||
| 747 | ALTMAP. Then, you will be doing the multiplication on 256 bits at | |||
| 748 | a time. So align = 32. | |||
| 749 | ||||
| 750 | When align does not equal ceil(w/8), we split the region | |||
| 751 | multiplication into three parts. We are going to make s_start be | |||
| 752 | the first address greater than or equal to src that is a multiple | |||
| 753 | of align. s_top is going to be the largest address >= src+bytes | |||
| 754 | such that (s_top - s_start) is a multiple of align. We do the | |||
| 755 | same with d_start and d_top. When we say that "src and dest must | |||
| 756 | be aligned with respect to each other," we mean that s_start-src | |||
| 757 | must equal d_start-dest. | |||
| 758 | ||||
| 759 | Now, the region multiplication is done in three parts -- the part | |||
| 760 | between src and s_start must be done using single words. | |||
| 761 | Similarly, the part between s_top and src+bytes must also be done | |||
| 762 | using single words. The part between s_start and s_top will be | |||
| 763 | done in chunks of "align" bytes. | |||
| 764 | ||||
| 765 | One final thing -- if align > 16, then s_start and d_start will be | |||
| 766 | aligned on a 16 byte boundary. Perhaps we should have two | |||
| 767 | variables: align and chunksize. Then we'd have s_start & d_start | |||
| 768 | aligned to "align", and have s_top-s_start be a multiple of | |||
| 769 | chunksize. That may be less confusing, but it would be a big | |||
| 770 | change. | |||
| 771 | ||||
| 772 | Finally, if align = -1, then we are doing Cauchy multiplication, | |||
| 773 | using only XOR's. In this case, we're not going to care about | |||
| 774 | alignment because we are just doing XOR's. Instead, the only | |||
| 775 | thing we care about is that bytes must be a multiple of w. | |||
| 776 | ||||
| 777 | This is not to say that alignment doesn't matter in performance | |||
| 778 | with XOR's. See that discussion in gf_multby_one(). | |||
| 779 | ||||
| 780 | After you call gf_set_region_data(), the procedure | |||
| 781 | gf_do_initial_region_alignment() calls gf->multiply.w32() on | |||
| 782 | everything between src and s_start. The procedure | |||
| 783 | gf_do_final_region_alignment() calls gf->multiply.w32() on | |||
| 784 | everything between s_top and src+bytes. | |||
| 785 | */ | |||
| 786 | ||||
| 787 | void gf_set_region_data(gf_region_data *rd, | |||
| 788 | gf_t *gf, | |||
| 789 | void *src, | |||
| 790 | void *dest, | |||
| 791 | int bytes, | |||
| 792 | uint64_t val, | |||
| 793 | int xor, | |||
| 794 | int align) | |||
| 795 | { | |||
| 796 | gf_internal_t *h = NULL; | |||
| 797 | int wb; | |||
| 798 | uint32_t a; | |||
| 799 | unsigned long uls, uld; | |||
| 800 | ||||
| 801 | if (gf == NULL) { /* JSP - Can be NULL if you're just doing XOR's */ | |||
| 802 | wb = 1; | |||
| 803 | } else { | |||
| 804 | h = gf->scratch; | |||
| 805 | wb = (h->w)/8; | |||
| 806 | if (wb == 0) wb = 1; | |||
| 807 | } | |||
| 808 | ||||
| 809 | rd->gf = gf; | |||
| 810 | rd->src = src; | |||
| 811 | rd->dest = dest; | |||
| 812 | rd->bytes = bytes; | |||
| 813 | rd->val = val; | |||
| 814 | rd->xor = xor; | |||
| 815 | rd->align = align; | |||
| 816 | ||||
| 817 | uls = (unsigned long) src; | |||
| 818 | uld = (unsigned long) dest; | |||
| 819 | ||||
| 820 | a = (align <= 16) ? align : 16; | |||
| 821 | ||||
| 822 | if (align == -1) { /* JSP: This is cauchy. Error check bytes, then set up the pointers | |||
| 823 | so that there are no alignment regions. */ | |||
| 824 | if (h != NULL && bytes % h->w != 0) { | |||
| 825 | fprintf(stderr, "Error in region multiply operation.\n"); | |||
| 826 | fprintf(stderr, "The size must be a multiple of %d bytes.\n", h->w); | |||
| 827 | assert(0); | |||
| 828 | } | |||
| 829 | ||||
| 830 | rd->s_start = src; | |||
| 831 | rd->d_start = dest; | |||
| 832 | rd->s_top = (uint8_t *)src + bytes; | |||
| 833 | rd->d_top = (uint8_t *)src + bytes; | |||
| 834 | return; | |||
| 835 | } | |||
| 836 | ||||
| 837 | if (uls % a != uld % a) { | |||
| 838 | fprintf(stderr, "Error in region multiply operation.\n")__fprintf_chk (stderr, 2 - 1, "Error in region multiply operation.\n" ); | |||
| 839 | fprintf(stderr, "The source & destination pointers must be aligned with respect\n")__fprintf_chk (stderr, 2 - 1, "The source & destination pointers must be aligned with respect\n" ); | |||
| 840 | fprintf(stderr, "to each other along a %d byte boundary.\n", a)__fprintf_chk (stderr, 2 - 1, "to each other along a %d byte boundary.\n" , a); | |||
| 841 | fprintf(stderr, "Src = 0x%lx. Dest = 0x%lx\n", (unsigned long) src,__fprintf_chk (stderr, 2 - 1, "Src = 0x%lx. Dest = 0x%lx\n", (unsigned long) src, (unsigned long) dest) | |||
| 842 | (unsigned long) dest)__fprintf_chk (stderr, 2 - 1, "Src = 0x%lx. Dest = 0x%lx\n", (unsigned long) src, (unsigned long) dest); | |||
| 843 | assert(0)((void) (0)); | |||
| 844 | } | |||
| 845 | ||||
| 846 | if (uls % wb != 0) { | |||
| 847 | fprintf(stderr, "Error in region multiply operation.\n")__fprintf_chk (stderr, 2 - 1, "Error in region multiply operation.\n" ); | |||
| 848 | fprintf(stderr, "The pointers must be aligned along a %d byte boundary.\n", wb)__fprintf_chk (stderr, 2 - 1, "The pointers must be aligned along a %d byte boundary.\n" , wb); | |||
| 849 | fprintf(stderr, "Src = 0x%lx. Dest = 0x%lx\n", (unsigned long) src,__fprintf_chk (stderr, 2 - 1, "Src = 0x%lx. Dest = 0x%lx\n", (unsigned long) src, (unsigned long) dest) | |||
| 850 | (unsigned long) dest)__fprintf_chk (stderr, 2 - 1, "Src = 0x%lx. Dest = 0x%lx\n", (unsigned long) src, (unsigned long) dest); | |||
| 851 | assert(0)((void) (0)); | |||
| 852 | } | |||
| 853 | ||||
| 854 | if (bytes % wb != 0) { | |||
| 855 | fprintf(stderr, "Error in region multiply operation.\n")__fprintf_chk (stderr, 2 - 1, "Error in region multiply operation.\n" ); | |||
| 856 | fprintf(stderr, "The size must be a multiple of %d bytes.\n", wb)__fprintf_chk (stderr, 2 - 1, "The size must be a multiple of %d bytes.\n" , wb); | |||
| 857 | assert(0)((void) (0)); | |||
| 858 | } | |||
| 859 | ||||
| 860 | uls %= a; | |||
| 861 | if (uls != 0) uls = (a-uls); | |||
| 862 | rd->s_start = (uint8_t *)rd->src + uls; | |||
| 863 | rd->d_start = (uint8_t *)rd->dest + uls; | |||
| 864 | bytes -= uls; | |||
| 865 | bytes -= (bytes % align); | |||
| 866 | rd->s_top = (uint8_t *)rd->s_start + bytes; | |||
| 867 | rd->d_top = (uint8_t *)rd->d_start + bytes; | |||
| 868 | ||||
| 869 | } | |||
| 870 | ||||
| 871 | void gf_do_initial_region_alignment(gf_region_data *rd) | |||
| 872 | { | |||
| 873 | gf_slow_multiply_region(rd, rd->src, rd->dest, rd->s_start); | |||
| 874 | } | |||
| 875 | ||||
| 876 | void gf_do_final_region_alignment(gf_region_data *rd) | |||
| 877 | { | |||
| 878 | gf_slow_multiply_region(rd, rd->s_top, rd->d_top, (uint8_t *)rd->src+rd->bytes); | |||
| 879 | } | |||
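To illustrate the three-part scheme described in the long comment above, here is a hypothetical w=32 region routine (not part of this file) built on gf_set_region_data() and the two alignment helpers just defined; align is set to 16 as a SIMD-style implementation would request, and the function name is purely illustrative:

    /* Hypothetical example: the head (src..s_start) and tail (s_top..src+bytes)
       are handled one word at a time by the helpers; the aligned middle is
       processed here in place. */
    static void example_multiply_region_w32(gf_t *gf, void *src, void *dest,
                                            uint32_t val, int bytes, int xor)
    {
      gf_region_data rd;
      uint32_t *s32, *d32;

      gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
      gf_do_initial_region_alignment(&rd);

      s32 = (uint32_t *) rd.s_start;
      d32 = (uint32_t *) rd.d_start;
      while (d32 < (uint32_t *) rd.d_top) {
        *d32 = (xor) ? (*d32 ^ gf->multiply.w32(gf, val, *s32))
                     : gf->multiply.w32(gf, val, *s32);
        s32++;
        d32++;
      }

      gf_do_final_region_alignment(&rd);
    }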
| 880 | ||||
| 881 | void gf_multby_zero(void *dest, int bytes, int xor) | |||
| 882 | { | |||
| 883 | if (xor) return; | |||
| 884 | bzero(dest, bytes); | |||
| 885 | return; | |||
| 886 | } | |||
| 887 | ||||
| 888 | /* JSP - gf_multby_one tries to do this in the most efficient way | |||
| 889 | possible. If xor = 0, then simply call memcpy() since that | |||
| 890 | should be optimized by the system. Otherwise, try to do the xor | |||
| 891 | in the following order: | |||
| 892 | ||||
| 893 | If src and dest are aligned with respect to each other on 16-byte | |||
| 894 | boundaries and you have SSE instructions, then use aligned SSE | |||
| 895 | instructions. | |||
| 896 | ||||
| 897 | If they aren't but you still have SSE instructions, use unaligned | |||
| 898 | SSE instructions. | |||
| 899 | ||||
| 900 | If there are no SSE instructions, but they are aligned with | |||
| 901 | respect to each other on 8-byte boundaries, then do them with | |||
| 902 | uint64_t's. | |||
| 903 | ||||
| 904 | Otherwise, call gf_unaligned_xor(), which does the following: | |||
| 905 | align a destination pointer along an 8-byte boundary, and then | |||
| 906 | memcpy 32 bytes at a time from the src pointer to an array of | |||
| 907 | doubles. I'm not sure if that's the best -- probably needs | |||
| 908 | testing, but this seems like it could be a black hole. | |||
| 909 | */ | |||
| 910 | ||||
| 911 | static void gf_unaligned_xor(void *src, void *dest, int bytes); | |||
| 912 | ||||
| 913 | void gf_multby_one(void *src, void *dest, int bytes, int xor) | |||
| 914 | { | |||
| 915 | unsigned long uls, uld; | |||
| 916 | uint8_t *s8, *d8; | |||
| 917 | uint64_t *s64, *d64, *dtop64; | |||
| 918 | gf_region_data rd; | |||
| 919 | ||||
| 920 | if (!xor) { | |||
| 921 | if (dest != src) | |||
| 922 | memcpy(dest, src, bytes); | |||
| 923 | return; | |||
| 924 | } | |||
| 925 | uls = (unsigned long) src; | |||
| 926 | uld = (unsigned long) dest; | |||
| 927 | ||||
| 928 | #ifdef INTEL_SSE2 | |||
| 929 | if (gf_cpu_supports_intel_sse2) { | |||
| 930 | __m128i ms, md; | |||
| 931 | int abytes; | |||
| 932 | s8 = (uint8_t *) src; | |||
| 933 | d8 = (uint8_t *) dest; | |||
| 934 | if (uls % 16 == uld % 16) { | |||
| 935 | gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 16); | |||
| 936 | while (s8 != rd.s_start) { | |||
| 937 | *d8 ^= *s8; | |||
| 938 | d8++; | |||
| 939 | s8++; | |||
| 940 | } | |||
| 941 | while (s8 < (uint8_t *) rd.s_top) { | |||
| 942 | ms = _mm_load_si128 ((__m128i *)(s8)); | |||
| 943 | md = _mm_load_si128 ((__m128i *)(d8)); | |||
| 944 | md = _mm_xor_si128(md, ms); | |||
| 945 | _mm_store_si128((__m128i *)(d8), md); | |||
| 946 | s8 += 16; | |||
| 947 | d8 += 16; | |||
| 948 | } | |||
| 949 | while (s8 != (uint8_t *) src + bytes) { | |||
| 950 | *d8 ^= *s8; | |||
| 951 | d8++; | |||
| 952 | s8++; | |||
| 953 | } | |||
| 954 | return; | |||
| 955 | } | |||
| 956 | ||||
| 957 | abytes = (bytes & 0xfffffff0); | |||
| 958 | ||||
| 959 | while (d8 < (uint8_t *) dest + abytes) { | |||
| 960 | ms = _mm_loadu_si128 ((__m128i *)(s8)); | |||
| 961 | md = _mm_loadu_si128 ((__m128i *)(d8)); | |||
| 962 | md = _mm_xor_si128(md, ms); | |||
| 963 | _mm_storeu_si128((__m128i *)(d8), md); | |||
| 964 | s8 += 16; | |||
| 965 | d8 += 16; | |||
| 966 | } | |||
| 967 | while (d8 != (uint8_t *) dest+bytes) { | |||
| 968 | *d8 ^= *s8; | |||
| 969 | d8++; | |||
| 970 | s8++; | |||
| 971 | } | |||
| 972 | return; | |||
| 973 | } | |||
| 974 | #endif | |||
| 975 | #if defined(ARM_NEON) | |||
| 976 | if (gf_cpu_supports_arm_neon) { | |||
| 977 | s8 = (uint8_t *) src; | |||
| 978 | d8 = (uint8_t *) dest; | |||
| 979 | ||||
| 980 | if (uls % 16 == uld % 16) { | |||
| 981 | gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 16); | |||
| 982 | while (s8 != rd.s_start) { | |||
| 983 | *d8 ^= *s8; | |||
| 984 | s8++; | |||
| 985 | d8++; | |||
| 986 | } | |||
| 987 | while (s8 < (uint8_t *) rd.s_top) { | |||
| 988 | uint8x16_t vs = vld1q_u8 (s8); | |||
| 989 | uint8x16_t vd = vld1q_u8 (d8); | |||
| 990 | uint8x16_t vr = veorq_u8 (vs, vd); | |||
| 991 | vst1q_u8 (d8, vr); | |||
| 992 | s8 += 16; | |||
| 993 | d8 += 16; | |||
| 994 | } | |||
| 995 | } else { | |||
| 996 | while (s8 + 15 < (uint8_t *) src + bytes) { | |||
| 997 | uint8x16_t vs = vld1q_u8 (s8); | |||
| 998 | uint8x16_t vd = vld1q_u8 (d8); | |||
| 999 | uint8x16_t vr = veorq_u8 (vs, vd); | |||
| 1000 | vst1q_u8 (d8, vr); | |||
| 1001 | s8 += 16; | |||
| 1002 | d8 += 16; | |||
| 1003 | } | |||
| 1004 | } | |||
| 1005 | while (s8 < (uint8_t *) src + bytes) { | |||
| 1006 | *d8 ^= *s8; | |||
| 1007 | s8++; | |||
| 1008 | d8++; | |||
| 1009 | } | |||
| 1010 | return; | |||
| 1011 | } | |||
| 1012 | #endif | |||
| 1013 | if (uls % 8 != uld % 8) { | |||
| 1014 | gf_unaligned_xor(src, dest, bytes); | |||
| 1015 | return; | |||
| 1016 | } | |||
| 1017 | ||||
| 1018 | gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 8); | |||
| 1019 | s8 = (uint8_t *) src; | |||
| 1020 | d8 = (uint8_t *) dest; | |||
| 1021 | while (d8 != rd.d_start) { | |||
| 1022 | *d8 ^= *s8; | |||
| 1023 | d8++; | |||
| 1024 | s8++; | |||
| 1025 | } | |||
| 1026 | dtop64 = (uint64_t *) rd.d_top; | |||
| 1027 | ||||
| 1028 | d64 = (uint64_t *) rd.d_start; | |||
| 1029 | s64 = (uint64_t *) rd.s_start; | |||
| 1030 | ||||
| 1031 | while (d64 < dtop64) { | |||
| 1032 | *d64 ^= *s64; | |||
| 1033 | d64++; | |||
| 1034 | s64++; | |||
| 1035 | } | |||
| 1036 | ||||
| 1037 | s8 = (uint8_t *) rd.s_top; | |||
| 1038 | d8 = (uint8_t *) rd.d_top; | |||
| 1039 | ||||
| 1040 | while (d8 != (uint8_t *) dest+bytes) { | |||
| 1041 | *d8 ^= *s8; | |||
| 1042 | d8++; | |||
| 1043 | s8++; | |||
| 1044 | } | |||
| 1045 | return; | |||
| 1046 | } | |||
| 1047 | ||||
| 1048 | #define UNALIGNED_BUFSIZE (8) | |||
| 1049 | ||||
| 1050 | static void gf_unaligned_xor(void *src, void *dest, int bytes) | |||
| 1051 | { | |||
| 1052 | uint64_t scopy[UNALIGNED_BUFSIZE], *d64; | |||
| 1053 | int i; | |||
| 1054 | gf_region_data rd; | |||
| 1055 | uint8_t *s8, *d8; | |||
| 1056 | ||||
| 1057 | /* JSP - call gf_set_region_data(), but use dest in both places. This is | |||
| 1058 | because I only want to set up dest. If I used src, gf_set_region_data() | |||
| 1059 | would fail because src and dest are not aligned to each other wrt | |||
| 1060 | 8-byte pointers. I know this will actually align d_start to 16 bytes. | |||
| 1061 | If I change gf_set_region_data() to split alignment & chunksize, then | |||
| 1062 | I could do this correctly. */ | |||
| 1063 | ||||
| 1064 | gf_set_region_data(&rd, NULL, dest, dest, bytes, 1, 1, 8*UNALIGNED_BUFSIZE); | |||
| 1065 | s8 = (uint8_t *) src; | |||
| 1066 | d8 = (uint8_t *) dest; | |||
| 1067 | ||||
| 1068 | while (d8 < (uint8_t *) rd.d_start) { | |||
| 1069 | *d8 ^= *s8; | |||
| 1070 | d8++; | |||
| 1071 | s8++; | |||
| 1072 | } | |||
| 1073 | ||||
| 1074 | d64 = (uint64_t *) d8; | |||
| 1075 | while (d64 < (uint64_t *) rd.d_top) { | |||
| 1076 | memcpy(scopy, s8, 8*UNALIGNED_BUFSIZE); | |||
| 1077 | s8 += 8*UNALIGNED_BUFSIZE; | |||
| 1078 | for (i = 0; i < UNALIGNED_BUFSIZE; i++) { | |||
| 1079 | *d64 ^= scopy[i]; | |||
| 1080 | d64++; | |||
| 1081 | } | |||
| 1082 | } | |||
| 1083 | ||||
| 1084 | d8 = (uint8_t *) d64; | |||
| 1085 | while (d8 < (uint8_t *) ((uint8_t *)dest+bytes)) { | |||
| 1086 | *d8 ^= *s8; | |||
| 1087 | d8++; | |||
| 1088 | s8++; | |||
| 1089 | } | |||
| 1090 | } |