File: | home/bhubbard/working/src/ceph/src/erasure-code/jerasure/gf-complete/src/gf.c |
Warning: | line 611, column 3 Undefined or garbage value returned to caller |
[?] Use j/k keys for keyboard navigation
1 | /* | |||
2 | * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic | |||
3 | * James S. Plank, Ethan L. Miller, Kevin M. Greenan, | |||
4 | * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. | |||
5 | * | |||
6 | * gf.c | |||
7 | * | |||
8 | * Generic routines for Galois fields | |||
9 | */ | |||
10 | ||||
11 | #include "gf_int.h" | |||
12 | #include <stdio.h> | |||
13 | #include <stdlib.h> | |||
14 | #include <assert.h> | |||
15 | #include "gf_cpu.h" | |||
16 | ||||
17 | int _gf_errno = GF_E_DEFAULT; | |||
18 | ||||
19 | void gf_error() | |||
20 | { | |||
21 | char *s; | |||
22 | ||||
23 | switch(_gf_errno) { | |||
24 | case GF_E_DEFAULT: s = "No Error."; break; | |||
25 | case GF_E_TWOMULT: s = "Cannot specify two -m's."; break; | |||
26 | case GF_E_TWO_DIV: s = "Cannot specify two -d's."; break; | |||
27 | case GF_E_POLYSPC: s = "-p needs to be followed by a number in hex (0x optional)."; break; | |||
28 | case GF_E_GROUPAR: s = "Ran out of arguments in -m GROUP."; break; | |||
29 | case GF_E_GROUPNU: s = "In -m GROUP g_s g_r -- g_s and g_r need to be numbers."; break; | |||
30 | case GF_E_SPLITAR: s = "Ran out of arguments in -m SPLIT."; break; | |||
31 | case GF_E_SPLITNU: s = "In -m SPLIT w_a w_b -- w_a and w_b need to be numbers."; break; | |||
32 | case GF_E_FEWARGS: s = "Not enough arguments (Perhaps end with '-'?)"; break; | |||
33 | case GF_E_CFM___W: s = "-m CARRY_FREE, w must be 4, 8, 16, 32, 64 or 128."; break; | |||
34 | case GF_E_COMPXPP: s = "-m COMPOSITE, No poly specified, and we don't have a default for the given sub-field."; break; | |||
35 | case GF_E_BASE__W: s = "-m COMPOSITE and the base field is not for w/2."; break; | |||
36 | case GF_E_CFM4POL: s = "-m CARRY_FREE, w=4. (Prim-poly & 0xc) must equal 0."; break; | |||
37 | case GF_E_CFM8POL: s = "-m CARRY_FREE, w=8. (Prim-poly & 0x80) must equal 0."; break; | |||
38 | case GF_E_CF16POL: s = "-m CARRY_FREE, w=16. (Prim-poly & 0xe000) must equal 0."; break; | |||
39 | case GF_E_CF32POL: s = "-m CARRY_FREE, w=32. (Prim-poly & 0xfe000000) must equal 0."; break; | |||
40 | case GF_E_CF64POL: s = "-m CARRY_FREE, w=64. (Prim-poly & 0xfffe000000000000ULL) must equal 0."; break; | |||
41 | case GF_E_MDEFDIV: s = "If multiplication method == default, can't change division."; break; | |||
42 | case GF_E_MDEFREG: s = "If multiplication method == default, can't change region."; break; | |||
43 | case GF_E_MDEFARG: s = "If multiplication method == default, can't use arg1/arg2."; break; | |||
44 | case GF_E_DIVCOMP: s = "Cannot change the division technique with -m COMPOSITE."; break; | |||
45 | case GF_E_DOUQUAD: s = "Cannot specify -r DOUBLE and -r QUAD."; break; | |||
46 | case GF_E_SIMD_NO: s = "Cannot specify -r SIMD and -r NOSIMD."; break; | |||
47 | case GF_E_CAUCHYB: s = "Cannot specify -r CAUCHY and any other -r."; break; | |||
48 | case GF_E_CAUCOMP: s = "Cannot specify -m COMPOSITE and -r CAUCHY."; break; | |||
49 | case GF_E_CAUGT32: s = "Cannot specify -r CAUCHY with w > 32."; break; | |||
50 | case GF_E_ARG1SET: s = "Only use arg1 with SPLIT, GROUP or COMPOSITE."; break; | |||
51 | case GF_E_ARG2SET: s = "Only use arg2 with SPLIT or GROUP."; break; | |||
52 | case GF_E_MATRIXW: s = "Cannot specify -d MATRIX with w > 32."; break; | |||
53 | case GF_E_BAD___W: s = "W must be 1-32, 64 or 128."; break; | |||
54 | case GF_E_DOUBLET: s = "Can only specify -r DOUBLE with -m TABLE."; break; | |||
55 | case GF_E_DOUBLEW: s = "Can only specify -r DOUBLE w = 4 or w = 8."; break; | |||
56 | case GF_E_DOUBLEJ: s = "Cannot specify -r DOUBLE with -r ALTMAP|SIMD|NOSIMD."; break; | |||
57 | case GF_E_DOUBLEL: s = "Can only specify -r DOUBLE -r LAZY with w = 8"; break; | |||
58 | case GF_E_QUAD__T: s = "Can only specify -r QUAD with -m TABLE."; break; | |||
59 | case GF_E_QUAD__W: s = "Can only specify -r QUAD w = 4."; break; | |||
60 | case GF_E_QUAD__J: s = "Cannot specify -r QUAD with -r ALTMAP|SIMD|NOSIMD."; break; | |||
61 | case GF_E_BADPOLY: s = "Bad primitive polynomial (high bits set)."; break; | |||
62 | case GF_E_COMP_PP: s = "Bad primitive polynomial -- bigger than sub-field."; break; | |||
63 | case GF_E_LAZY__X: s = "If -r LAZY, then -r must be DOUBLE or QUAD."; break; | |||
64 | case GF_E_ALTSHIF: s = "Cannot specify -m SHIFT and -r ALTMAP."; break; | |||
65 | case GF_E_SSESHIF: s = "Cannot specify -m SHIFT and -r SIMD|NOSIMD."; break; | |||
66 | case GF_E_ALT_CFM: s = "Cannot specify -m CARRY_FREE and -r ALTMAP."; break; | |||
67 | case GF_E_SSE_CFM: s = "Cannot specify -m CARRY_FREE and -r SIMD|NOSIMD."; break; | |||
68 | case GF_E_PCLMULX: s = "Specified -m CARRY_FREE, but PCLMUL is not supported."; break; | |||
69 | case GF_E_ALT_BY2: s = "Cannot specify -m BYTWO_x and -r ALTMAP."; break; | |||
70 | case GF_E_BY2_SSE: s = "Specified -m BYTWO_x -r SIMD, but SSE2 is not supported."; break; | |||
71 | case GF_E_LOGBADW: s = "With Log Tables, w must be <= 27."; break; | |||
72 | case GF_E_LOG___J: s = "Cannot use Log tables with -r ALTMAP|SIMD|NOSIMD."; break; | |||
73 | case GF_E_LOGPOLY: s = "Cannot use Log tables because the polynomial is not primitive."; break; | |||
74 | case GF_E_ZERBADW: s = "With -m LOG_ZERO, w must be 8 or 16."; break; | |||
75 | case GF_E_ZEXBADW: s = "With -m LOG_ZERO_EXT, w must be 8."; break; | |||
76 | case GF_E_GR_ARGX: s = "With -m GROUP, arg1 and arg2 must be >= 0."; break; | |||
77 | case GF_E_GR_W_48: s = "With -m GROUP, w cannot be 4 or 8."; break; | |||
78 | case GF_E_GR_W_16: s = "With -m GROUP, w == 16, arg1 and arg2 must be 4."; break; | |||
79 | case GF_E_GR_128A: s = "With -m GROUP, w == 128, arg1 must be 4, and arg2 in { 4,8,16 }."; break; | |||
80 | case GF_E_GR_A_27: s = "With -m GROUP, arg1 and arg2 must be <= 27."; break; | |||
81 | case GF_E_GR_AR_W: s = "With -m GROUP, arg1 and arg2 must be <= w."; break; | |||
82 | case GF_E_GR____J: s = "Cannot use GROUP with -r ALTMAP|SIMD|NOSIMD."; break; | |||
83 | case GF_E_TABLE_W: s = "With -m TABLE, w must be < 15, or == 16."; break; | |||
84 | case GF_E_TAB_SSE: s = "With -m TABLE, SIMD|NOSIMD only applies to w=4."; break; | |||
85 | case GF_E_TABSSE3: s = "With -m TABLE, -r SIMD, you need SSSE3 supported."; break; | |||
86 | case GF_E_TAB_ALT: s = "With -m TABLE, you cannot use ALTMAP."; break; | |||
87 | case GF_E_SP128AR: s = "With -m SPLIT, w=128, bad arg1/arg2."; break; | |||
88 | case GF_E_SP128AL: s = "With -m SPLIT, w=128, -r SIMD requires -r ALTMAP."; break; | |||
89 | case GF_E_SP128AS: s = "With -m SPLIT, w=128, ALTMAP needs SSSE3 supported."; break; | |||
90 | case GF_E_SP128_A: s = "With -m SPLIT, w=128, -r ALTMAP only with arg1/arg2 = 4/128."; break; | |||
91 | case GF_E_SP128_S: s = "With -m SPLIT, w=128, -r SIMD|NOSIMD only with arg1/arg2 = 4/128."; break; | |||
92 | case GF_E_SPLIT_W: s = "With -m SPLIT, w must be in {8, 16, 32, 64, 128}."; break; | |||
93 | case GF_E_SP_16AR: s = "With -m SPLIT, w=16, Bad arg1/arg2."; break; | |||
94 | case GF_E_SP_16_A: s = "With -m SPLIT, w=16, -r ALTMAP only with arg1/arg2 = 4/16."; break; | |||
95 | case GF_E_SP_16_S: s = "With -m SPLIT, w=16, -r SIMD|NOSIMD only with arg1/arg2 = 4/16."; break; | |||
96 | case GF_E_SP_32AR: s = "With -m SPLIT, w=32, Bad arg1/arg2."; break; | |||
97 | case GF_E_SP_32AS: s = "With -m SPLIT, w=32, -r ALTMAP needs SSSE3 supported."; break; | |||
98 | case GF_E_SP_32_A: s = "With -m SPLIT, w=32, -r ALTMAP only with arg1/arg2 = 4/32."; break; | |||
99 | case GF_E_SP_32_S: s = "With -m SPLIT, w=32, -r SIMD|NOSIMD only with arg1/arg2 = 4/32."; break; | |||
100 | case GF_E_SP_64AR: s = "With -m SPLIT, w=64, Bad arg1/arg2."; break; | |||
101 | case GF_E_SP_64AS: s = "With -m SPLIT, w=64, -r ALTMAP needs SSSE3 supported."; break; | |||
102 | case GF_E_SP_64_A: s = "With -m SPLIT, w=64, -r ALTMAP only with arg1/arg2 = 4/64."; break; | |||
103 | case GF_E_SP_64_S: s = "With -m SPLIT, w=64, -r SIMD|NOSIMD only with arg1/arg2 = 4/64."; break; | |||
104 | case GF_E_SP_8_AR: s = "With -m SPLIT, w=8, Bad arg1/arg2."; break; | |||
105 | case GF_E_SP_8__A: s = "With -m SPLIT, w=8, Can't have -r ALTMAP."; break; | |||
106 | case GF_E_SP_SSE3: s = "With -m SPLIT, Need SSSE3 support for SIMD."; break; | |||
107 | case GF_E_COMP_A2: s = "With -m COMPOSITE, arg1 must equal 2."; break; | |||
108 | case GF_E_COMP_SS: s = "With -m COMPOSITE, -r SIMD and -r NOSIMD do not apply."; break; | |||
109 | case GF_E_COMP__W: s = "With -m COMPOSITE, w must be 8, 16, 32, 64 or 128."; break; | |||
110 | case GF_E_UNKFLAG: s = "Unknown method flag - should be -m, -d, -r or -p."; break; | |||
111 | case GF_E_UNKNOWN: s = "Unknown multiplication type."; break; | |||
112 | case GF_E_UNK_REG: s = "Unknown region type."; break; | |||
113 | case GF_E_UNK_DIV: s = "Unknown division type."; break; | |||
114 | default: s = "Undefined error."; | |||
115 | } | |||
116 | ||||
117 | fprintf(stderr, "%s\n", s)__fprintf_chk (stderr, 2 - 1, "%s\n", s); | |||
118 | } | |||
119 | ||||
120 | uint64_t gf_composite_get_default_poly(gf_t *base) | |||
121 | { | |||
122 | gf_internal_t *h; | |||
123 | uint64_t rv; | |||
124 | ||||
125 | h = (gf_internal_t *) base->scratch; | |||
126 | if (h->w == 4) { | |||
127 | if (h->mult_type == GF_MULT_COMPOSITE) return 0; | |||
128 | if (h->prim_poly == 0x13) return 2; | |||
129 | return 0; | |||
130 | } | |||
131 | if (h->w == 8) { | |||
132 | if (h->mult_type == GF_MULT_COMPOSITE) return 0; | |||
133 | if (h->prim_poly == 0x11d) return 3; | |||
134 | return 0; | |||
135 | } | |||
136 | if (h->w == 16) { | |||
137 | if (h->mult_type == GF_MULT_COMPOSITE) { | |||
138 | rv = gf_composite_get_default_poly(h->base_gf); | |||
139 | if (rv != h->prim_poly) return 0; | |||
140 | if (rv == 3) return 0x105; | |||
141 | return 0; | |||
142 | } else { | |||
143 | if (h->prim_poly == 0x1100b) return 2; | |||
144 | if (h->prim_poly == 0x1002d) return 7; | |||
145 | return 0; | |||
146 | } | |||
147 | } | |||
148 | if (h->w == 32) { | |||
149 | if (h->mult_type == GF_MULT_COMPOSITE) { | |||
150 | rv = gf_composite_get_default_poly(h->base_gf); | |||
151 | if (rv != h->prim_poly) return 0; | |||
152 | if (rv == 2) return 0x10005; | |||
153 | if (rv == 7) return 0x10008; | |||
154 | if (rv == 0x105) return 0x10002; | |||
155 | return 0; | |||
156 | } else { | |||
157 | if (h->prim_poly == 0x400007) return 2; | |||
158 | if (h->prim_poly == 0xc5) return 3; | |||
159 | return 0; | |||
160 | } | |||
161 | } | |||
162 | if (h->w == 64) { | |||
163 | if (h->mult_type == GF_MULT_COMPOSITE) { | |||
164 | rv = gf_composite_get_default_poly(h->base_gf); | |||
165 | if (rv != h->prim_poly) return 0; | |||
166 | if (rv == 3) return 0x100000009ULL; | |||
167 | if (rv == 2) return 0x100000004ULL; | |||
168 | if (rv == 0x10005) return 0x100000003ULL; | |||
169 | if (rv == 0x10002) return 0x100000005ULL; | |||
170 | if (rv == 0x10008) return 0x100000006ULL; /* JSP: (0x0x100000003 works too, | |||
171 | but I want to differentiate cases). */ | |||
172 | return 0; | |||
173 | } else { | |||
174 | if (h->prim_poly == 0x1bULL) return 2; | |||
175 | return 0; | |||
176 | } | |||
177 | } | |||
178 | return 0; | |||
179 | } | |||
180 | ||||
/* Validate a full parameter combination (word size, multiplication method,
   region flags, division method, method arguments, polynomial, base field)
   before any initialization work is done.  Returns 1 if the combination is
   legal.  On failure, sets the global _gf_errno to the specific reason and
   returns 0 -- the ORDER of the checks below therefore determines which
   error is reported when several apply; do not reorder them.
   NOTE(review): assumes gf_cpu_identify() has already populated the
   gf_cpu_supports_* flags -- confirm against callers. */
int gf_error_check(int w, int mult_type, int region_type, int divide_type,
                   int arg1, int arg2, uint64_t poly, gf_t *base)
{
  int sse3 = 0;
  int sse2 = 0;
  int pclmul = 0;
  int rdouble, rquad, rlazy, rsimd, rnosimd, raltmap, rcauchy, tmp;
  gf_internal_t *sub;

  /* Split region_type into one boolean per flag. */
  rdouble = (region_type & GF_REGION_DOUBLE_TABLE);
  rquad   = (region_type & GF_REGION_QUAD_TABLE);
  rlazy   = (region_type & GF_REGION_LAZY);
  rsimd   = (region_type & GF_REGION_SIMD);
  rnosimd = (region_type & GF_REGION_NOSIMD);
  raltmap = (region_type & GF_REGION_ALTMAP);
  rcauchy = (region_type & GF_REGION_CAUCHY);

  /* Division type must be one of the three supported strategies. */
  if (divide_type != GF_DIVIDE_DEFAULT &&
      divide_type != GF_DIVIDE_MATRIX &&
      divide_type != GF_DIVIDE_EUCLID) {
    _gf_errno = GF_E_UNK_DIV;
    return 0;
  }

  /* Reject any region bits outside the known flag set. */
  tmp = ( GF_REGION_DOUBLE_TABLE | GF_REGION_QUAD_TABLE | GF_REGION_LAZY |
          GF_REGION_SIMD | GF_REGION_NOSIMD | GF_REGION_ALTMAP |
          GF_REGION_CAUCHY );
  if (region_type & (~tmp)) { _gf_errno = GF_E_UNK_REG; return 0; }

  /* Probe CPU capabilities (compiled in AND available at runtime). */
#ifdef INTEL_SSE2
  if (gf_cpu_supports_intel_sse2) {
    sse2 = 1;
  }
#endif

#ifdef INTEL_SSSE3
  if (gf_cpu_supports_intel_ssse3) {
    sse3 = 1;
  }
#endif

#ifdef INTEL_SSE4_PCLMUL
  if (gf_cpu_supports_intel_pclmul) {
    pclmul = 1;
  }
#endif

#ifdef ARM_NEON
  if (gf_cpu_supports_arm_neon) {
    /* NEON stands in for PCLMUL only at w=4/8, and for SSSE3 generally. */
    pclmul = (w == 4 || w == 8);
    sse3 = 1;
  }
#endif


  /* Legal word sizes: 1..32, 64 and 128. */
  if (w < 1 || (w > 32 && w != 64 && w != 128)) { _gf_errno = GF_E_BAD___W; return 0; }

  /* A w-bit polynomial (plus implicit leading term) may not have bits above w. */
  if (mult_type != GF_MULT_COMPOSITE && w < 64) {
    if ((poly >> (w+1)) != 0) { _gf_errno = GF_E_BADPOLY; return 0; }
  }

  /* GF_MULT_DEFAULT admits no further customization at all. */
  if (mult_type == GF_MULT_DEFAULT) {
    if (divide_type != GF_DIVIDE_DEFAULT) { _gf_errno = GF_E_MDEFDIV; return 0; }
    if (region_type != GF_REGION_DEFAULT) { _gf_errno = GF_E_MDEFREG; return 0; }
    if (arg1 != 0 || arg2 != 0) { _gf_errno = GF_E_MDEFARG; return 0; }
    return 1;
  }

  /* Flag combinations that are contradictory regardless of mult_type. */
  if (rsimd && rnosimd) { _gf_errno = GF_E_SIMD_NO; return 0; }
  if (rcauchy && w > 32) { _gf_errno = GF_E_CAUGT32; return 0; }
  if (rcauchy && region_type != GF_REGION_CAUCHY) { _gf_errno = GF_E_CAUCHYB; return 0; }
  if (rcauchy && mult_type == GF_MULT_COMPOSITE) { _gf_errno = GF_E_CAUCOMP; return 0; }

  /* arg1/arg2 are only meaningful for a few multiplication methods. */
  if (arg1 != 0 && mult_type != GF_MULT_COMPOSITE &&
      mult_type != GF_MULT_SPLIT_TABLE && mult_type != GF_MULT_GROUP) {
    _gf_errno = GF_E_ARG1SET;
    return 0;
  }

  if (arg2 != 0 && mult_type != GF_MULT_SPLIT_TABLE && mult_type != GF_MULT_GROUP) {
    _gf_errno = GF_E_ARG2SET;
    return 0;
  }

  /* Matrix-based division uses 32-bit matrices, so w must fit. */
  if (divide_type == GF_DIVIDE_MATRIX && w > 32) { _gf_errno = GF_E_MATRIXW; return 0; }

  /* -r DOUBLE: only TABLE, only w=4/8, no SIMD/ALTMAP, LAZY only at w=8. */
  if (rdouble) {
    if (rquad) { _gf_errno = GF_E_DOUQUAD; return 0; }
    if (mult_type != GF_MULT_TABLE) { _gf_errno = GF_E_DOUBLET; return 0; }
    if (w != 4 && w != 8) { _gf_errno = GF_E_DOUBLEW; return 0; }
    if (rsimd || rnosimd || raltmap) { _gf_errno = GF_E_DOUBLEJ; return 0; }
    if (rlazy && w == 4) { _gf_errno = GF_E_DOUBLEL; return 0; }
    return 1;
  }

  /* -r QUAD: only TABLE, only w=4, no SIMD/ALTMAP. */
  if (rquad) {
    if (mult_type != GF_MULT_TABLE) { _gf_errno = GF_E_QUAD__T; return 0; }
    if (w != 4) { _gf_errno = GF_E_QUAD__W; return 0; }
    if (rsimd || rnosimd || raltmap) { _gf_errno = GF_E_QUAD__J; return 0; }
    return 1;
  }

  /* LAZY without DOUBLE/QUAD (both handled above) is meaningless. */
  if (rlazy) { _gf_errno = GF_E_LAZY__X; return 0; }

  if (mult_type == GF_MULT_SHIFT) {
    if (raltmap) { _gf_errno = GF_E_ALTSHIF; return 0; }
    if (rsimd || rnosimd) { _gf_errno = GF_E_SSESHIF; return 0; }
    return 1;
  }

  /* CARRY_FREE requires hardware carry-less multiply and constrains the
     high polynomial bits per word size. */
  if (mult_type == GF_MULT_CARRY_FREE) {
    if (w != 4 && w != 8 && w != 16 &&
        w != 32 && w != 64 && w != 128) { _gf_errno = GF_E_CFM___W; return 0; }
    if (w == 4 && (poly & 0xc)) { _gf_errno = GF_E_CFM4POL; return 0; }
    if (w == 8 && (poly & 0x80)) { _gf_errno = GF_E_CFM8POL; return 0; }
    if (w == 16 && (poly & 0xe000)) { _gf_errno = GF_E_CF16POL; return 0; }
    if (w == 32 && (poly & 0xfe000000)) { _gf_errno = GF_E_CF32POL; return 0; }
    if (w == 64 && (poly & 0xfffe000000000000ULL)) { _gf_errno = GF_E_CF64POL; return 0; }
    if (raltmap) { _gf_errno = GF_E_ALT_CFM; return 0; }
    if (rsimd || rnosimd) { _gf_errno = GF_E_SSE_CFM; return 0; }
    if (!pclmul) { _gf_errno = GF_E_PCLMULX; return 0; }
    return 1;
  }

  /* CARRY_FREE_GK: like CARRY_FREE but without the polynomial restrictions. */
  if (mult_type == GF_MULT_CARRY_FREE_GK) {
    if (w != 4 && w != 8 && w != 16 &&
        w != 32 && w != 64 && w != 128) { _gf_errno = GF_E_CFM___W; return 0; }
    if (raltmap) { _gf_errno = GF_E_ALT_CFM; return 0; }
    if (rsimd || rnosimd) { _gf_errno = GF_E_SSE_CFM; return 0; }
    if (!pclmul) { _gf_errno = GF_E_PCLMULX; return 0; }
    return 1;
  }

  if (mult_type == GF_MULT_BYTWO_p || mult_type == GF_MULT_BYTWO_b) {
    if (raltmap) { _gf_errno = GF_E_ALT_BY2; return 0; }
    if (rsimd && !sse2) { _gf_errno = GF_E_BY2_SSE; return 0; }
    return 1;
  }

  /* Log-table variants: table size limits w; LOG_ZERO adds w limits of its own. */
  if (mult_type == GF_MULT_LOG_TABLE || mult_type == GF_MULT_LOG_ZERO
                                     || mult_type == GF_MULT_LOG_ZERO_EXT ) {
    if (w > 27) { _gf_errno = GF_E_LOGBADW; return 0; }
    if (raltmap || rsimd || rnosimd) { _gf_errno = GF_E_LOG___J; return 0; }

    if (mult_type == GF_MULT_LOG_TABLE) return 1;

    if (w != 8 && w != 16) { _gf_errno = GF_E_ZERBADW; return 0; }

    if (mult_type == GF_MULT_LOG_ZERO) return 1;

    if (w != 8) { _gf_errno = GF_E_ZEXBADW; return 0; }
    return 1;
  }

  if (mult_type == GF_MULT_GROUP) {
    if (arg1 <= 0 || arg2 <= 0) { _gf_errno = GF_E_GR_ARGX; return 0; }
    if (w == 4 || w == 8) { _gf_errno = GF_E_GR_W_48; return 0; }
    if (w == 16 && (arg1 != 4 || arg2 != 4)) { _gf_errno = GF_E_GR_W_16; return 0; }
    if (w == 128 && (arg1 != 4 ||
       (arg2 != 4 && arg2 != 8 && arg2 != 16))) { _gf_errno = GF_E_GR_128A; return 0; }
    if (arg1 > 27 || arg2 > 27) { _gf_errno = GF_E_GR_A_27; return 0; }
    if (arg1 > w || arg2 > w) { _gf_errno = GF_E_GR_AR_W; return 0; }
    if (raltmap || rsimd || rnosimd) { _gf_errno = GF_E_GR____J; return 0; }
    return 1;
  }

  if (mult_type == GF_MULT_TABLE) {
    if (w != 16 && w >= 15) { _gf_errno = GF_E_TABLE_W; return 0; }
    if (w != 4 && (rsimd || rnosimd)) { _gf_errno = GF_E_TAB_SSE; return 0; }
    if (rsimd && !sse3) { _gf_errno = GF_E_TABSSE3; return 0; }
    if (raltmap) { _gf_errno = GF_E_TAB_ALT; return 0; }
    return 1;
  }

  /* SPLIT_TABLE: the legal (arg1, arg2) pairs and flag combinations depend
     on w; arg order is normalized first so 8/4 means the same as 4/8. */
  if (mult_type == GF_MULT_SPLIT_TABLE) {
    if (arg1 > arg2) {
      tmp = arg1;
      arg1 = arg2;
      arg2 = tmp;
    }
    if (w == 8) {
      if (arg1 != 4 || arg2 != 8) { _gf_errno = GF_E_SP_8_AR; return 0; }
      if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
      if (raltmap) { _gf_errno = GF_E_SP_8__A; return 0; }
    } else if (w == 16) {
      if ((arg1 == 8 && arg2 == 8) ||
          (arg1 == 8 && arg2 == 16)) {
        if (rsimd || rnosimd) { _gf_errno = GF_E_SP_16_S; return 0; }
        if (raltmap) { _gf_errno = GF_E_SP_16_A; return 0; }
      } else if (arg1 == 4 && arg2 == 16) {
        if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
      } else { _gf_errno = GF_E_SP_16AR; return 0; }
    } else if (w == 32) {
      if ((arg1 == 8 && arg2 == 8) ||
          (arg1 == 8 && arg2 == 32) ||
          (arg1 == 16 && arg2 == 32)) {
        if (rsimd || rnosimd) { _gf_errno = GF_E_SP_32_S; return 0; }
        if (raltmap) { _gf_errno = GF_E_SP_32_A; return 0; }
      } else if (arg1 == 4 && arg2 == 32) {
        if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
        if (raltmap && !sse3) { _gf_errno = GF_E_SP_32AS; return 0; }
        if (raltmap && rnosimd) { _gf_errno = GF_E_SP_32AS; return 0; }
      } else { _gf_errno = GF_E_SP_32AR; return 0; }
    } else if (w == 64) {
      if ((arg1 == 8 && arg2 == 8) ||
          (arg1 == 8 && arg2 == 64) ||
          (arg1 == 16 && arg2 == 64)) {
        if (rsimd || rnosimd) { _gf_errno = GF_E_SP_64_S; return 0; }
        if (raltmap) { _gf_errno = GF_E_SP_64_A; return 0; }
      } else if (arg1 == 4 && arg2 == 64) {
        if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
        if (raltmap && !sse3) { _gf_errno = GF_E_SP_64AS; return 0; }
        if (raltmap && rnosimd) { _gf_errno = GF_E_SP_64AS; return 0; }
      } else { _gf_errno = GF_E_SP_64AR; return 0; }
    } else if (w == 128) {
      if (arg1 == 8 && arg2 == 128) {
        if (rsimd || rnosimd) { _gf_errno = GF_E_SP128_S; return 0; }
        if (raltmap) { _gf_errno = GF_E_SP128_A; return 0; }
      } else if (arg1 == 4 && arg2 == 128) {
        if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
        if (raltmap && !sse3) { _gf_errno = GF_E_SP128AS; return 0; }
        if (raltmap && rnosimd) { _gf_errno = GF_E_SP128AS; return 0; }
      } else { _gf_errno = GF_E_SP128AR; return 0; }
    } else { _gf_errno = GF_E_SPLIT_W; return 0; }
    return 1;
  }

  /* COMPOSITE: built from a base field for w/2; base polynomial and width
     are cross-checked when the base field is supplied. */
  if (mult_type == GF_MULT_COMPOSITE) {
    if (w != 8 && w != 16 && w != 32
               && w != 64 && w != 128) { _gf_errno = GF_E_COMP__W; return 0; }
    if (w < 128 && (poly >> (w/2)) != 0) { _gf_errno = GF_E_COMP_PP; return 0; }
    if (divide_type != GF_DIVIDE_DEFAULT) { _gf_errno = GF_E_DIVCOMP; return 0; }
    if (arg1 != 2) { _gf_errno = GF_E_COMP_A2; return 0; }
    if (rsimd || rnosimd) { _gf_errno = GF_E_COMP_SS; return 0; }
    if (base != NULL) {
      sub = (gf_internal_t *) base->scratch;
      if (sub->w != w/2) { _gf_errno = GF_E_BASE__W; return 0; }
      if (poly == 0) {
        /* poly == 0 means "use the default"; there must actually be one. */
        if (gf_composite_get_default_poly(base) == 0) { _gf_errno = GF_E_COMPXPP; return 0; }
      }
    }
    return 1;
  }

  /* No recognized multiplication method matched. */
  _gf_errno = GF_E_UNKNOWN;
  return 0;
}
428 | ||||
429 | int gf_scratch_size(int w, | |||
430 | int mult_type, | |||
431 | int region_type, | |||
432 | int divide_type, | |||
433 | int arg1, | |||
434 | int arg2) | |||
435 | { | |||
436 | if (gf_error_check(w, mult_type, region_type, divide_type, arg1, arg2, 0, NULL((void*)0)) == 0) return 0; | |||
437 | ||||
438 | switch(w) { | |||
439 | case 4: return gf_w4_scratch_size(mult_type, region_type, divide_type, arg1, arg2); | |||
440 | case 8: return gf_w8_scratch_size(mult_type, region_type, divide_type, arg1, arg2); | |||
441 | case 16: return gf_w16_scratch_size(mult_type, region_type, divide_type, arg1, arg2); | |||
442 | case 32: return gf_w32_scratch_size(mult_type, region_type, divide_type, arg1, arg2); | |||
443 | case 64: return gf_w64_scratch_size(mult_type, region_type, divide_type, arg1, arg2); | |||
444 | case 128: return gf_w128_scratch_size(mult_type, region_type, divide_type, arg1, arg2); | |||
445 | default: return gf_wgen_scratch_size(w, mult_type, region_type, divide_type, arg1, arg2); | |||
446 | } | |||
447 | } | |||
448 | ||||
449 | extern int gf_size(gf_t *gf) | |||
450 | { | |||
451 | gf_internal_t *h; | |||
452 | int s; | |||
453 | ||||
454 | s = sizeof(gf_t); | |||
455 | h = (gf_internal_t *) gf->scratch; | |||
456 | s += gf_scratch_size(h->w, h->mult_type, h->region_type, h->divide_type, h->arg1, h->arg2); | |||
457 | if (h->mult_type == GF_MULT_COMPOSITE) s += gf_size(h->base_gf); | |||
458 | return s; | |||
459 | } | |||
460 | ||||
461 | ||||
462 | int gf_init_easy(gf_t *gf, int w) | |||
463 | { | |||
464 | return gf_init_hard(gf, w, GF_MULT_DEFAULT, GF_REGION_DEFAULT(0x0), GF_DIVIDE_DEFAULT, | |||
465 | 0, 0, 0, NULL((void*)0), NULL((void*)0)); | |||
466 | } | |||
467 | ||||
468 | /* Allen: What's going on here is this function is putting info into the | |||
469 | scratch mem of gf, and then calling the relevant REAL init | |||
470 | func for the word size. Probably done this way to consolidate | |||
471 | those aspects of initialization that don't rely on word size, | |||
472 | and then take care of word-size-specific stuff. */ | |||
473 | ||||
474 | int gf_init_hard(gf_t *gf, int w, int mult_type, | |||
475 | int region_type, | |||
476 | int divide_type, | |||
477 | uint64_t prim_poly, | |||
478 | int arg1, int arg2, | |||
479 | gf_t *base_gf, | |||
480 | void *scratch_memory) | |||
481 | { | |||
482 | int sz; | |||
483 | gf_internal_t *h; | |||
484 | ||||
485 | gf_cpu_identify(); | |||
486 | ||||
487 | if (gf_error_check(w, mult_type, region_type, divide_type, | |||
488 | arg1, arg2, prim_poly, base_gf) == 0) return 0; | |||
489 | ||||
490 | sz = gf_scratch_size(w, mult_type, region_type, divide_type, arg1, arg2); | |||
491 | if (sz <= 0) return 0; /* This shouldn't happen, as all errors should get caught | |||
492 | in gf_error_check() */ | |||
493 | ||||
494 | if (scratch_memory == NULL((void*)0)) { | |||
495 | h = (gf_internal_t *) malloc(sz); | |||
496 | h->free_me = 1; | |||
497 | } else { | |||
498 | h = scratch_memory; | |||
499 | h->free_me = 0; | |||
500 | } | |||
501 | gf->scratch = (void *) h; | |||
502 | h->mult_type = mult_type; | |||
503 | h->region_type = region_type; | |||
504 | h->divide_type = divide_type; | |||
505 | h->w = w; | |||
506 | h->prim_poly = prim_poly; | |||
507 | h->arg1 = arg1; | |||
508 | h->arg2 = arg2; | |||
509 | h->base_gf = base_gf; | |||
510 | h->private = (void *) gf->scratch; | |||
511 | h->private = (uint8_t *)h->private + (sizeof(gf_internal_t)); | |||
512 | gf->extract_word.w32 = NULL((void*)0); | |||
513 | ||||
514 | switch(w) { | |||
515 | case 4: return gf_w4_init(gf); | |||
516 | case 8: return gf_w8_init(gf); | |||
517 | case 16: return gf_w16_init(gf); | |||
518 | case 32: return gf_w32_init(gf); | |||
519 | case 64: return gf_w64_init(gf); | |||
520 | case 128: return gf_w128_init(gf); | |||
521 | default: return gf_wgen_init(gf); | |||
522 | } | |||
523 | } | |||
524 | ||||
525 | int gf_free(gf_t *gf, int recursive) | |||
526 | { | |||
527 | gf_internal_t *h; | |||
528 | ||||
529 | h = (gf_internal_t *) gf->scratch; | |||
530 | if (recursive && h->base_gf != NULL((void*)0)) { | |||
531 | gf_free(h->base_gf, 1); | |||
532 | free(h->base_gf); | |||
533 | } | |||
534 | if (h->free_me) free(h); | |||
535 | return 0; /* Making compiler happy */ | |||
536 | } | |||
537 | ||||
/* Report a region-buffer alignment violation for routine s (alignment a)
   on stderr, then trip an assertion.  NOTE(review): assert(0) compiles
   away under NDEBUG, so release builds would return normally here --
   callers should not rely on this never returning. */
void gf_alignment_error(char *s, int a)
{
  fprintf(stderr,
          "Alignment error in %s:\n"
          "   The source and destination buffers must be aligned to each other,\n"
          "   and they must be aligned to a %d-byte address.\n",
          s, a);
  assert(0);
}
545 | ||||
/* Invert a rows x rows matrix over GF(2) by Gaussian elimination.
   Row i of the matrix is the bitmask mat[i]; bit j is the entry in
   column j.  On return inv[] holds the inverse.  mat[] is clobbered
   (reduced toward triangular form).  rows must be in [1, 32].
   Trips an assertion if the matrix is singular. */
static
void gf_invert_binary_matrix(uint32_t *mat, uint32_t *inv, int rows) {
  int cols, i, j;
  uint32_t tmp;

  cols = rows;

  /* Start inv[] as the identity.  The (uint32_t) casts on the shifts
     throughout this function fix undefined behavior: (1 << 31) is a
     signed-overflow UB, and rows == 32 occurs for w=32 fields. */
  for (i = 0; i < rows; i++) inv[i] = ((uint32_t)1 << i);

  /* First -- convert into upper triangular */

  for (i = 0; i < cols; i++) {

    /* Swap rows if we have a zero i,i element.  If we can't swap, then the
       matrix was not invertible */

    if ((mat[i] & ((uint32_t)1 << i)) == 0) {
      for (j = i+1; j < rows && (mat[j] & ((uint32_t)1 << i)) == 0; j++) ;
      if (j == rows) {
        fprintf(stderr, "galois_invert_matrix: Matrix not invertible!!\n");
        assert(0);
      }
      tmp = mat[i]; mat[i] = mat[j]; mat[j] = tmp;
      tmp = inv[i]; inv[i] = inv[j]; inv[j] = tmp;
    }

    /* Now for each j>i, add A_ji*Ai to Aj (XOR is addition in GF(2)) */
    for (j = i+1; j != rows; j++) {
      if ((mat[j] & ((uint32_t)1 << i)) != 0) {
        mat[j] ^= mat[i];
        inv[j] ^= inv[i];
      }
    }
  }

  /* Now the matrix is upper triangular.  Start at the top and multiply down */

  for (i = rows-1; i >= 0; i--) {
    for (j = 0; j < i; j++) {
      if (mat[j] & ((uint32_t)1 << i)) {
        /* mat[j] ^= mat[i]; */
        inv[j] ^= inv[i];
      }
    }
  }
}
592 | ||||
/* Compute the multiplicative inverse of y in GF(2^w) with primitive
   polynomial pp, via the bit-matrix method: build the w x w GF(2) matrix
   whose row i is y * x^i mod pp, invert it, and return its first row.
   Returns 0 when w is outside [1, 32].  Trips an assertion (via
   gf_invert_binary_matrix) if the matrix is singular, e.g. for y == 0. */
uint32_t gf_bitmatrix_inverse(uint32_t y, int w, uint32_t pp)
{
  uint32_t mat[32], inv[32], mask;
  int i;

  /* Guard the word size: for w <= 0 the loop below never runs and the
     final `return inv[0]` would hand an uninitialized value back to the
     caller (the static-analyzer finding); for w > 32 mat[]/inv[] would
     overflow.  Fail cleanly instead. */
  if (w < 1 || w > 32) return 0;

  mask = (w == 32) ? 0xffffffff : (((uint32_t)1 << w) - 1);

  /* Row i of the matrix is y * x^i reduced mod pp. */
  for (i = 0; i < w; i++) {
    mat[i] = y;

    /* The (uint32_t) cast avoids signed-shift UB when w == 32. */
    if (y & ((uint32_t)1 << (w-1))) {
      y = y << 1;
      y = ((y ^ pp) & mask);
    } else {
      y = y << 1;
    }
  }

  gf_invert_binary_matrix(mat, inv, w);
  return inv[0];
}
613 | ||||
614 | void gf_two_byte_region_table_multiply(gf_region_data *rd, uint16_t *base) | |||
615 | { | |||
616 | uint64_t a, prod; | |||
617 | int xor; | |||
618 | uint64_t *s64, *d64, *top; | |||
619 | ||||
620 | s64 = rd->s_start; | |||
621 | d64 = rd->d_start; | |||
622 | top = rd->d_top; | |||
623 | xor = rd->xor; | |||
624 | ||||
625 | if (xor) { | |||
626 | while (d64 != top) { | |||
627 | a = *s64; | |||
628 | prod = base[a >> 48]; | |||
629 | a <<= 16; | |||
630 | prod <<= 16; | |||
631 | prod ^= base[a >> 48]; | |||
632 | a <<= 16; | |||
633 | prod <<= 16; | |||
634 | prod ^= base[a >> 48]; | |||
635 | a <<= 16; | |||
636 | prod <<= 16; | |||
637 | prod ^= base[a >> 48]; | |||
638 | prod ^= *d64; | |||
639 | *d64 = prod; | |||
640 | s64++; | |||
641 | d64++; | |||
642 | } | |||
643 | } else { | |||
644 | while (d64 != top) { | |||
645 | a = *s64; | |||
646 | prod = base[a >> 48]; | |||
647 | a <<= 16; | |||
648 | prod <<= 16; | |||
649 | prod ^= base[a >> 48]; | |||
650 | a <<= 16; | |||
651 | prod <<= 16; | |||
652 | prod ^= base[a >> 48]; | |||
653 | a <<= 16; | |||
654 | prod <<= 16; | |||
655 | prod ^= base[a >> 48]; | |||
656 | *d64 = prod; | |||
657 | s64++; | |||
658 | d64++; | |||
659 | } | |||
660 | } | |||
661 | } | |||
662 | ||||
/* Fallback region multiply: walk the region [src, s_top) one w-bit word
   at a time and apply the gf's single-word multiply (multiply.w32 or
   multiply.w64) with the constant rd->val.  Handles w in {4, 8, 16, 32,
   64}; for w == 4 each byte holds two packed 4-bit values.  Honors
   rd->xor (XOR-accumulate into dest instead of overwrite).  Exits the
   process for any other w. */
static void gf_slow_multiply_region(gf_region_data *rd, void *src, void *dest, void *s_top)
{
  uint8_t *s8, *d8;
  uint16_t *s16, *d16;
  uint32_t *s32, *d32;
  uint64_t *s64, *d64;
  gf_internal_t *h;
  int wb;                 /* stride in bytes per word (minimum 1, for w == 4) */
  uint32_t p, a;

  h = rd->gf->scratch;
  wb = (h->w)/8;
  if (wb == 0) wb = 1;    /* w == 4: two values share one byte */

  while (src < s_top) {
    switch (h->w) {
    case 8:
      s8 = (uint8_t *) src;
      d8 = (uint8_t *) dest;
      *d8 = (rd->xor) ? (*d8 ^ rd->gf->multiply.w32(rd->gf, rd->val, *s8)) :
                      rd->gf->multiply.w32(rd->gf, rd->val, *s8);
      break;
    case 4:
      /* Multiply the low and high nibbles separately, then repack. */
      s8 = (uint8_t *) src;
      d8 = (uint8_t *) dest;
      a = *s8;
      p = rd->gf->multiply.w32(rd->gf, rd->val, a&0xf);
      p |= (rd->gf->multiply.w32(rd->gf, rd->val, a >> 4) << 4);
      if (rd->xor) p ^= *d8;
      *d8 = p;
      break;
    case 16:
      s16 = (uint16_t *) src;
      d16 = (uint16_t *) dest;
      *d16 = (rd->xor) ? (*d16 ^ rd->gf->multiply.w32(rd->gf, rd->val, *s16)) :
                      rd->gf->multiply.w32(rd->gf, rd->val, *s16);
      break;
    case 32:
      s32 = (uint32_t *) src;
      d32 = (uint32_t *) dest;
      *d32 = (rd->xor) ? (*d32 ^ rd->gf->multiply.w32(rd->gf, rd->val, *s32)) :
                      rd->gf->multiply.w32(rd->gf, rd->val, *s32);
      break;
    case 64:
      s64 = (uint64_t *) src;
      d64 = (uint64_t *) dest;
      *d64 = (rd->xor) ? (*d64 ^ rd->gf->multiply.w64(rd->gf, rd->val, *s64)) :
                      rd->gf->multiply.w64(rd->gf, rd->val, *s64);
      break;
    default:
      fprintf(stderr, "Error: gf_slow_multiply_region: w=%d not implemented.\n", h->w);
      exit(1);
    }
    /* Advance via uint8_t* casts: arithmetic on void* is not standard C. */
    src = (uint8_t *)src + wb;
    dest = (uint8_t *)dest + wb;
  }
}
720 | ||||
721 | /* JSP - The purpose of this procedure is to error check alignment, | |||
722 | and to set up the region operation so that it can best leverage | |||
723 | large words. | |||
724 | ||||
725 | It stores its information in rd. | |||
726 | ||||
727 | Assuming you're not doing Cauchy coding, (see below for that), | |||
728 | then w will be 4, 8, 16, 32 or 64. It can't be 128 (probably | |||
729 | should change that). | |||
730 | ||||
731 | src and dest must then be aligned on ceil(w/8)-byte boundaries. | |||
732 | Moreover, bytes must be a multiple of ceil(w/8). If the variable | |||
733 | align is equal to ceil(w/8), then we will set s_start = src, | |||
734 | d_start = dest, s_top to (src+bytes) and d_top to (dest+bytes). | |||
735 | And we return -- the implementation will go ahead and do the | |||
736 | multiplication on individual words (e.g. using discrete logs). | |||
737 | ||||
738 | If align is greater than ceil(w/8), then the implementation needs | |||
739 | to work on groups of "align" bytes. For example, suppose you are | |||
740 | implementing BYTWO, without SSE. Then you will be doing the region | |||
741 | multiplication in units of 8 bytes, so align = 8. Or, suppose you | |||
742 | are doing a Quad table in GF(2^4). You will be doing the region | |||
743 | multiplication in units of 2 bytes, so align = 2. Or, suppose you | |||
744 | are doing split multiplication with SSE operations in GF(2^8). | |||
745 | Then align = 16. Worse yet, suppose you are doing split | |||
746 | multiplication with SSE operations in GF(2^16), with or without | |||
747 | ALTMAP. Then, you will be doing the multiplication on 256 bits at | |||
748 | a time. So align = 32. | |||
749 | ||||
750 | When align does not equal ceil(w/8), we split the region | |||
751 | multiplication into three parts. We are going to make s_start be | |||
752 | the first address greater than or equal to src that is a multiple | |||
753 | of align. s_top is going to be the largest address >= src+bytes | |||
754 | such that (s_top - s_start) is a multiple of align. We do the | |||
755 | same with d_start and d_top. When we say that "src and dest must | |||
   be aligned with respect to each other," we mean that s_start-src
757 | must equal d_start-dest. | |||
758 | ||||
759 | Now, the region multiplication is done in three parts -- the part | |||
760 | between src and s_start must be done using single words. | |||
761 | Similarly, the part between s_top and src+bytes must also be done | |||
762 | using single words. The part between s_start and s_top will be | |||
763 | done in chunks of "align" bytes. | |||
764 | ||||
765 | One final thing -- if align > 16, then s_start and d_start will be | |||
766 | aligned on a 16 byte boundary. Perhaps we should have two | |||
767 | variables: align and chunksize. Then we'd have s_start & d_start | |||
768 | aligned to "align", and have s_top-s_start be a multiple of | |||
769 | chunksize. That may be less confusing, but it would be a big | |||
770 | change. | |||
771 | ||||
772 | Finally, if align = -1, then we are doing Cauchy multiplication, | |||
773 | using only XOR's. In this case, we're not going to care about | |||
774 | alignment because we are just doing XOR's. Instead, the only | |||
775 | thing we care about is that bytes must be a multiple of w. | |||
776 | ||||
777 | This is not to say that alignment doesn't matter in performance | |||
778 | with XOR's. See that discussion in gf_multby_one(). | |||
779 | ||||
780 | After you call gf_set_region_data(), the procedure | |||
781 | gf_do_initial_region_alignment() calls gf->multiply.w32() on | |||
782 | everything between src and s_start. The procedure | |||
783 | gf_do_final_region_alignment() calls gf->multiply.w32() on | |||
784 | everything between s_top and src+bytes. | |||
785 | */ | |||
786 | ||||
787 | void gf_set_region_data(gf_region_data *rd, | |||
788 | gf_t *gf, | |||
789 | void *src, | |||
790 | void *dest, | |||
791 | int bytes, | |||
792 | uint64_t val, | |||
793 | int xor, | |||
794 | int align) | |||
795 | { | |||
796 | gf_internal_t *h = NULL((void*)0); | |||
797 | int wb; | |||
798 | uint32_t a; | |||
799 | unsigned long uls, uld; | |||
800 | ||||
801 | if (gf == NULL((void*)0)) { /* JSP - Can be NULL if you're just doing XOR's */ | |||
802 | wb = 1; | |||
803 | } else { | |||
804 | h = gf->scratch; | |||
805 | wb = (h->w)/8; | |||
806 | if (wb == 0) wb = 1; | |||
807 | } | |||
808 | ||||
809 | rd->gf = gf; | |||
810 | rd->src = src; | |||
811 | rd->dest = dest; | |||
812 | rd->bytes = bytes; | |||
813 | rd->val = val; | |||
814 | rd->xor = xor; | |||
815 | rd->align = align; | |||
816 | ||||
817 | uls = (unsigned long) src; | |||
818 | uld = (unsigned long) dest; | |||
819 | ||||
820 | a = (align <= 16) ? align : 16; | |||
821 | ||||
822 | if (align == -1) { /* JSP: This is cauchy. Error check bytes, then set up the pointers | |||
823 | so that there are no alignment regions. */ | |||
824 | if (h != NULL((void*)0) && bytes % h->w != 0) { | |||
825 | fprintf(stderr, "Error in region multiply operation.\n")__fprintf_chk (stderr, 2 - 1, "Error in region multiply operation.\n" ); | |||
826 | fprintf(stderr, "The size must be a multiple of %d bytes.\n", h->w)__fprintf_chk (stderr, 2 - 1, "The size must be a multiple of %d bytes.\n" , h->w); | |||
827 | assert(0)((void) (0)); | |||
828 | } | |||
829 | ||||
830 | rd->s_start = src; | |||
831 | rd->d_start = dest; | |||
832 | rd->s_top = (uint8_t *)src + bytes; | |||
833 | rd->d_top = (uint8_t *)src + bytes; | |||
834 | return; | |||
835 | } | |||
836 | ||||
837 | if (uls % a != uld % a) { | |||
838 | fprintf(stderr, "Error in region multiply operation.\n")__fprintf_chk (stderr, 2 - 1, "Error in region multiply operation.\n" ); | |||
839 | fprintf(stderr, "The source & destination pointers must be aligned with respect\n")__fprintf_chk (stderr, 2 - 1, "The source & destination pointers must be aligned with respect\n" ); | |||
840 | fprintf(stderr, "to each other along a %d byte boundary.\n", a)__fprintf_chk (stderr, 2 - 1, "to each other along a %d byte boundary.\n" , a); | |||
841 | fprintf(stderr, "Src = 0x%lx. Dest = 0x%lx\n", (unsigned long) src,__fprintf_chk (stderr, 2 - 1, "Src = 0x%lx. Dest = 0x%lx\n", (unsigned long) src, (unsigned long) dest) | |||
842 | (unsigned long) dest)__fprintf_chk (stderr, 2 - 1, "Src = 0x%lx. Dest = 0x%lx\n", (unsigned long) src, (unsigned long) dest); | |||
843 | assert(0)((void) (0)); | |||
844 | } | |||
845 | ||||
846 | if (uls % wb != 0) { | |||
847 | fprintf(stderr, "Error in region multiply operation.\n")__fprintf_chk (stderr, 2 - 1, "Error in region multiply operation.\n" ); | |||
848 | fprintf(stderr, "The pointers must be aligned along a %d byte boundary.\n", wb)__fprintf_chk (stderr, 2 - 1, "The pointers must be aligned along a %d byte boundary.\n" , wb); | |||
849 | fprintf(stderr, "Src = 0x%lx. Dest = 0x%lx\n", (unsigned long) src,__fprintf_chk (stderr, 2 - 1, "Src = 0x%lx. Dest = 0x%lx\n", (unsigned long) src, (unsigned long) dest) | |||
850 | (unsigned long) dest)__fprintf_chk (stderr, 2 - 1, "Src = 0x%lx. Dest = 0x%lx\n", (unsigned long) src, (unsigned long) dest); | |||
851 | assert(0)((void) (0)); | |||
852 | } | |||
853 | ||||
854 | if (bytes % wb != 0) { | |||
855 | fprintf(stderr, "Error in region multiply operation.\n")__fprintf_chk (stderr, 2 - 1, "Error in region multiply operation.\n" ); | |||
856 | fprintf(stderr, "The size must be a multiple of %d bytes.\n", wb)__fprintf_chk (stderr, 2 - 1, "The size must be a multiple of %d bytes.\n" , wb); | |||
857 | assert(0)((void) (0)); | |||
858 | } | |||
859 | ||||
860 | uls %= a; | |||
861 | if (uls != 0) uls = (a-uls); | |||
862 | rd->s_start = (uint8_t *)rd->src + uls; | |||
863 | rd->d_start = (uint8_t *)rd->dest + uls; | |||
864 | bytes -= uls; | |||
865 | bytes -= (bytes % align); | |||
866 | rd->s_top = (uint8_t *)rd->s_start + bytes; | |||
867 | rd->d_top = (uint8_t *)rd->d_start + bytes; | |||
868 | ||||
869 | } | |||
870 | ||||
871 | void gf_do_initial_region_alignment(gf_region_data *rd) | |||
872 | { | |||
873 | gf_slow_multiply_region(rd, rd->src, rd->dest, rd->s_start); | |||
874 | } | |||
875 | ||||
876 | void gf_do_final_region_alignment(gf_region_data *rd) | |||
877 | { | |||
878 | gf_slow_multiply_region(rd, rd->s_top, rd->d_top, (uint8_t *)rd->src+rd->bytes); | |||
879 | } | |||
880 | ||||
/* Multiply a region by zero.  With xor set the destination is left
   untouched (x ^= 0*c is a no-op); otherwise it is simply cleared. */
void gf_multby_zero(void *dest, int bytes, int xor)
{
  if (xor) return;
  /* memset replaces the legacy bzero(), which was removed from POSIX. */
  memset(dest, 0, bytes);
  return;
}
887 | ||||
888 | /* JSP - gf_multby_one tries to do this in the most efficient way | |||
889 | possible. If xor = 0, then simply call memcpy() since that | |||
890 | should be optimized by the system. Otherwise, try to do the xor | |||
891 | in the following order: | |||
892 | ||||
893 | If src and dest are aligned with respect to each other on 16-byte | |||
894 | boundaries and you have SSE instructions, then use aligned SSE | |||
895 | instructions. | |||
896 | ||||
897 | If they aren't but you still have SSE instructions, use unaligned | |||
898 | SSE instructions. | |||
899 | ||||
900 | If there are no SSE instructions, but they are aligned with | |||
901 | respect to each other on 8-byte boundaries, then do them with | |||
902 | uint64_t's. | |||
903 | ||||
904 | Otherwise, call gf_unaligned_xor(), which does the following: | |||
905 | align a destination pointer along an 8-byte boundary, and then | |||
906 | memcpy 32 bytes at a time from the src pointer to an array of | |||
907 | doubles. I'm not sure if that's the best -- probably needs | |||
908 | testing, but this seems like it could be a black hole. | |||
909 | */ | |||
910 | ||||
911 | static void gf_unaligned_xor(void *src, void *dest, int bytes); | |||
912 | ||||
913 | void gf_multby_one(void *src, void *dest, int bytes, int xor) | |||
914 | { | |||
915 | unsigned long uls, uld; | |||
916 | uint8_t *s8, *d8; | |||
917 | uint64_t *s64, *d64, *dtop64; | |||
918 | gf_region_data rd; | |||
919 | ||||
920 | if (!xor) { | |||
921 | if (dest != src) | |||
922 | memcpy(dest, src, bytes); | |||
923 | return; | |||
924 | } | |||
925 | uls = (unsigned long) src; | |||
926 | uld = (unsigned long) dest; | |||
927 | ||||
928 | #ifdef INTEL_SSE21 | |||
929 | if (gf_cpu_supports_intel_sse2) { | |||
930 | __m128i ms, md; | |||
931 | int abytes; | |||
932 | s8 = (uint8_t *) src; | |||
933 | d8 = (uint8_t *) dest; | |||
934 | if (uls % 16 == uld % 16) { | |||
935 | gf_set_region_data(&rd, NULL((void*)0), src, dest, bytes, 1, xor, 16); | |||
936 | while (s8 != rd.s_start) { | |||
937 | *d8 ^= *s8; | |||
938 | d8++; | |||
939 | s8++; | |||
940 | } | |||
941 | while (s8 < (uint8_t *) rd.s_top) { | |||
942 | ms = _mm_load_si128 ((__m128i *)(s8)); | |||
943 | md = _mm_load_si128 ((__m128i *)(d8)); | |||
944 | md = _mm_xor_si128(md, ms); | |||
945 | _mm_store_si128((__m128i *)(d8), md); | |||
946 | s8 += 16; | |||
947 | d8 += 16; | |||
948 | } | |||
949 | while (s8 != (uint8_t *) src + bytes) { | |||
950 | *d8 ^= *s8; | |||
951 | d8++; | |||
952 | s8++; | |||
953 | } | |||
954 | return; | |||
955 | } | |||
956 | ||||
957 | abytes = (bytes & 0xfffffff0); | |||
958 | ||||
959 | while (d8 < (uint8_t *) dest + abytes) { | |||
960 | ms = _mm_loadu_si128 ((__m128i *)(s8)); | |||
961 | md = _mm_loadu_si128 ((__m128i *)(d8)); | |||
962 | md = _mm_xor_si128(md, ms); | |||
963 | _mm_storeu_si128((__m128i *)(d8), md); | |||
964 | s8 += 16; | |||
965 | d8 += 16; | |||
966 | } | |||
967 | while (d8 != (uint8_t *) dest+bytes) { | |||
968 | *d8 ^= *s8; | |||
969 | d8++; | |||
970 | s8++; | |||
971 | } | |||
972 | return; | |||
973 | } | |||
974 | #endif | |||
975 | #if defined(ARM_NEON) | |||
976 | if (gf_cpu_supports_arm_neon) { | |||
977 | s8 = (uint8_t *) src; | |||
978 | d8 = (uint8_t *) dest; | |||
979 | ||||
980 | if (uls % 16 == uld % 16) { | |||
981 | gf_set_region_data(&rd, NULL((void*)0), src, dest, bytes, 1, xor, 16); | |||
982 | while (s8 != rd.s_start) { | |||
983 | *d8 ^= *s8; | |||
984 | s8++; | |||
985 | d8++; | |||
986 | } | |||
987 | while (s8 < (uint8_t *) rd.s_top) { | |||
988 | uint8x16_t vs = vld1q_u8 (s8); | |||
989 | uint8x16_t vd = vld1q_u8 (d8); | |||
990 | uint8x16_t vr = veorq_u8 (vs, vd); | |||
991 | vst1q_u8 (d8, vr); | |||
992 | s8 += 16; | |||
993 | d8 += 16; | |||
994 | } | |||
995 | } else { | |||
996 | while (s8 + 15 < (uint8_t *) src + bytes) { | |||
997 | uint8x16_t vs = vld1q_u8 (s8); | |||
998 | uint8x16_t vd = vld1q_u8 (d8); | |||
999 | uint8x16_t vr = veorq_u8 (vs, vd); | |||
1000 | vst1q_u8 (d8, vr); | |||
1001 | s8 += 16; | |||
1002 | d8 += 16; | |||
1003 | } | |||
1004 | } | |||
1005 | while (s8 < (uint8_t *) src + bytes) { | |||
1006 | *d8 ^= *s8; | |||
1007 | s8++; | |||
1008 | d8++; | |||
1009 | } | |||
1010 | return; | |||
1011 | } | |||
1012 | #endif | |||
1013 | if (uls % 8 != uld % 8) { | |||
1014 | gf_unaligned_xor(src, dest, bytes); | |||
1015 | return; | |||
1016 | } | |||
1017 | ||||
1018 | gf_set_region_data(&rd, NULL((void*)0), src, dest, bytes, 1, xor, 8); | |||
1019 | s8 = (uint8_t *) src; | |||
1020 | d8 = (uint8_t *) dest; | |||
1021 | while (d8 != rd.d_start) { | |||
1022 | *d8 ^= *s8; | |||
1023 | d8++; | |||
1024 | s8++; | |||
1025 | } | |||
1026 | dtop64 = (uint64_t *) rd.d_top; | |||
1027 | ||||
1028 | d64 = (uint64_t *) rd.d_start; | |||
1029 | s64 = (uint64_t *) rd.s_start; | |||
1030 | ||||
1031 | while (d64 < dtop64) { | |||
1032 | *d64 ^= *s64; | |||
1033 | d64++; | |||
1034 | s64++; | |||
1035 | } | |||
1036 | ||||
1037 | s8 = (uint8_t *) rd.s_top; | |||
1038 | d8 = (uint8_t *) rd.d_top; | |||
1039 | ||||
1040 | while (d8 != (uint8_t *) dest+bytes) { | |||
1041 | *d8 ^= *s8; | |||
1042 | d8++; | |||
1043 | s8++; | |||
1044 | } | |||
1045 | return; | |||
1046 | } | |||
1047 | ||||
1048 | #define UNALIGNED_BUFSIZE(8) (8) | |||
1049 | ||||
1050 | static void gf_unaligned_xor(void *src, void *dest, int bytes) | |||
1051 | { | |||
1052 | uint64_t scopy[UNALIGNED_BUFSIZE(8)], *d64; | |||
1053 | int i; | |||
1054 | gf_region_data rd; | |||
1055 | uint8_t *s8, *d8; | |||
1056 | ||||
1057 | /* JSP - call gf_set_region_data(), but use dest in both places. This is | |||
1058 | because I only want to set up dest. If I used src, gf_set_region_data() | |||
1059 | would fail because src and dest are not aligned to each other wrt | |||
1060 | 8-byte pointers. I know this will actually align d_start to 16 bytes. | |||
1061 | If I change gf_set_region_data() to split alignment & chunksize, then | |||
1062 | I could do this correctly. */ | |||
1063 | ||||
1064 | gf_set_region_data(&rd, NULL((void*)0), dest, dest, bytes, 1, 1, 8*UNALIGNED_BUFSIZE(8)); | |||
1065 | s8 = (uint8_t *) src; | |||
1066 | d8 = (uint8_t *) dest; | |||
1067 | ||||
1068 | while (d8 < (uint8_t *) rd.d_start) { | |||
1069 | *d8 ^= *s8; | |||
1070 | d8++; | |||
1071 | s8++; | |||
1072 | } | |||
1073 | ||||
1074 | d64 = (uint64_t *) d8; | |||
1075 | while (d64 < (uint64_t *) rd.d_top) { | |||
1076 | memcpy(scopy, s8, 8*UNALIGNED_BUFSIZE(8)); | |||
1077 | s8 += 8*UNALIGNED_BUFSIZE(8); | |||
1078 | for (i = 0; i < UNALIGNED_BUFSIZE(8); i++) { | |||
1079 | *d64 ^= scopy[i]; | |||
1080 | d64++; | |||
1081 | } | |||
1082 | } | |||
1083 | ||||
1084 | d8 = (uint8_t *) d64; | |||
1085 | while (d8 < (uint8_t *) ((uint8_t *)dest+bytes)) { | |||
1086 | *d8 ^= *s8; | |||
1087 | d8++; | |||
1088 | s8++; | |||
1089 | } | |||
1090 | } |