1    	/* *
2    	 * Copyright (c) 2014, James S. Plank and Kevin Greenan
3    	 * All rights reserved.
4    	 *
5    	 * Jerasure - A C/C++ Library for a Variety of Reed-Solomon and RAID-6 Erasure
6    	 * Coding Techniques
7    	 *
8    	 * Revision 2.0: Galois Field backend now links to GF-Complete
9    	 *
10   	 * Redistribution and use in source and binary forms, with or without
11   	 * modification, are permitted provided that the following conditions
12   	 * are met:
13   	 *
14   	 *  - Redistributions of source code must retain the above copyright
15   	 *    notice, this list of conditions and the following disclaimer.
16   	 *
17   	 *  - Redistributions in binary form must reproduce the above copyright
18   	 *    notice, this list of conditions and the following disclaimer in
19   	 *    the documentation and/or other materials provided with the
20   	 *    distribution.
21   	 *
22   	 *  - Neither the name of the University of Tennessee nor the names of its
23   	 *    contributors may be used to endorse or promote products derived
24   	 *    from this software without specific prior written permission.
25   	 *
26   	 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
27   	 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
28   	 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
29   	 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
30   	 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
31   	 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
32   	 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
33   	 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
34   	 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35   	 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
36   	 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37   	 * POSSIBILITY OF SUCH DAMAGE.
38   	 */
39   	
40   	/* Jerasure's authors:
41   	
42   	   Revision 2.x - 2014: James S. Plank and Kevin M. Greenan
43   	   Revision 1.2 - 2008: James S. Plank, Scott Simmerman and Catherine D. Schuman.
44   	   Revision 1.0 - 2007: James S. Plank
45   	 */
46   	
47   	#include <stdio.h>
48   	#include <stdlib.h>
49   	#include <string.h>
50   	#include <assert.h>
51   	
52   	#include "galois.h"
53   	#include "jerasure.h"
54   	
55   	#define talloc(type, num) (type *) malloc(sizeof(type)*(num))
56   	
57   	static double jerasure_total_xor_bytes = 0;
58   	static double jerasure_total_gf_bytes = 0;
59   	static double jerasure_total_memcpy_bytes = 0;
60   	
61   	void jerasure_print_matrix(int *m, int rows, int cols, int w)
62   	{
63   	  int i, j;
64   	  int fw;
65   	  char s[30];
66   	  unsigned int w2;
67   	
68   	  if (w == 32) {
69   	    fw = 10;
70   	  } else {
71   	    w2 = (1 << w);
72   	    sprintf(s, "%u", w2-1);
73   	    fw = strlen(s);
74   	  }
75   	
76   	  for (i = 0; i < rows; i++) {
77   	    for (j = 0; j < cols; j++) {
78   	      if (j != 0) printf(" ");
79   	      printf("%*u", fw, m[i*cols+j]); 
80   	    }
81   	    printf("\n");
82   	  }
83   	}
84   	
85   	void jerasure_print_bitmatrix(int *m, int rows, int cols, int w)
86   	{
87   	  int i, j;
88   	
89   	  for (i = 0; i < rows; i++) {
90   	    if (i != 0 && i%w == 0) printf("\n");
91   	    for (j = 0; j < cols; j++) {
92   	      if (j != 0 && j%w == 0) printf(" ");
93   	      printf("%d", m[i*cols+j]); 
94   	    }
95   	    printf("\n");
96   	  }
97   	}
98   	
99   	int jerasure_make_decoding_matrix(int k, int m, int w, int *matrix, int *erased, int *decoding_matrix, int *dm_ids)
100  	{
101  	  int i, j, *tmpmat;
102  	
103  	  j = 0;
104  	  for (i = 0; j < k; i++) {
105  	    if (erased[i] == 0) {
106  	      dm_ids[j] = i;
107  	      j++;
108  	    }
109  	  }
110  	
111  	  tmpmat = talloc(int, k*k);
112  	  if (tmpmat == NULL) { return -1; }
113  	  for (i = 0; i < k; i++) {
114  	    if (dm_ids[i] < k) {
115  	      for (j = 0; j < k; j++) tmpmat[i*k+j] = 0;
116  	      tmpmat[i*k+dm_ids[i]] = 1;
117  	    } else {
118  	      for (j = 0; j < k; j++) {
119  	        tmpmat[i*k+j] = matrix[(dm_ids[i]-k)*k+j];
120  	      }
121  	    }
122  	  }
123  	
124  	  i = jerasure_invert_matrix(tmpmat, decoding_matrix, k, w);
125  	  free(tmpmat);
126  	  return i;
127  	}
128  	
129  	/* Internal Routine */
130  	int jerasure_make_decoding_bitmatrix(int k, int m, int w, int *matrix, int *erased, int *decoding_matrix, int *dm_ids)
131  	{
132  	  int i, j, *tmpmat;
133  	  int index, mindex;
134  	
135  	  j = 0;
136  	  for (i = 0; j < k; i++) {
137  	    if (erased[i] == 0) {
138  	      dm_ids[j] = i;
139  	      j++;
140  	    }
141  	  }
142  	
143  	  tmpmat = talloc(int, k*k*w*w);
144  	  if (tmpmat == NULL) { return -1; }
145  	  for (i = 0; i < k; i++) {
146  	    if (dm_ids[i] < k) {
147  	      index = i*k*w*w;
148  	      for (j = 0; j < k*w*w; j++) tmpmat[index+j] = 0;
149  	      index = i*k*w*w+dm_ids[i]*w;
150  	      for (j = 0; j < w; j++) {
151  	        tmpmat[index] = 1;
152  	        index += (k*w+1);
153  	      }
154  	    } else {
155  	      index = i*k*w*w;
156  	      mindex = (dm_ids[i]-k)*k*w*w;
157  	      for (j = 0; j < k*w*w; j++) {
158  	        tmpmat[index+j] = matrix[mindex+j];
159  	      }
160  	    }
161  	  }
162  	
163  	  i = jerasure_invert_bitmatrix(tmpmat, decoding_matrix, k*w);
164  	  free(tmpmat);
165  	  return i;
166  	}
167  	
168  	int jerasure_matrix_decode(int k, int m, int w, int *matrix, int row_k_ones, int *erasures,
169  	                          char **data_ptrs, char **coding_ptrs, int size)
170  	{
171  	  int i, edd, lastdrive;
172  	  int *tmpids;
173  	  int *erased, *decoding_matrix, *dm_ids;
174  	
175  	  if (w != 8 && w != 16 && w != 32) return -1;
176  	
177  	  erased = jerasure_erasures_to_erased(k, m, erasures);
178  	  if (erased == NULL) return -1;
179  	
180  	  /* Find the number of data drives failed */
181  	
182  	  lastdrive = k;
183  	
184  	  edd = 0;
185  	  for (i = 0; i < k; i++) {
186  	    if (erased[i]) {
187  	      edd++;
188  	      lastdrive = i;
189  	    }
190  	  }
191  	    
192  	  /* You only need to create the decoding matrix in the following cases:
193  	
194  	      1. edd > 0 and row_k_ones is false.
195  	      2. edd > 0 and row_k_ones is true and coding device 0 has been erased.
196  	      3. edd > 1
197  	
198  	      We're going to use lastdrive to denote when to stop decoding data.
199  	      At this point in the code, it is equal to the last erased data device.
200  	      However, if we can't use the parity row to decode it (i.e. row_k_ones=0
201  	         or erased[k] = 1, we're going to set it to k so that the decoding 
202  	         pass will decode all data.
203  	   */
204  	
205  	  if (!row_k_ones || erased[k]) lastdrive = k;
206  	
207  	  dm_ids = NULL;
208  	  decoding_matrix = NULL;
209  	
210  	  if (edd > 1 || (edd > 0 && (!row_k_ones || erased[k]))) {
211  	    dm_ids = talloc(int, k);
212  	    if (dm_ids == NULL) {
213  	      free(erased);
214  	      return -1;
215  	    }
216  	
217  	    decoding_matrix = talloc(int, k*k);
218  	    if (decoding_matrix == NULL) {
219  	      free(erased);
220  	      free(dm_ids);
221  	      return -1;
222  	    }
223  	
224  	    if (jerasure_make_decoding_matrix(k, m, w, matrix, erased, decoding_matrix, dm_ids) < 0) {
225  	      free(erased);
226  	      free(dm_ids);
227  	      free(decoding_matrix);
228  	      return -1;
229  	    }
230  	  }
231  	
232  	  /* Decode the data drives.  
233  	     If row_k_ones is true and coding device 0 is intact, then only decode edd-1 drives.
234  	     This is done by stopping at lastdrive.
235  	     We test whether edd > 0 so that we can exit the loop early if we're done.
236  	   */
237  	
238  	  for (i = 0; edd > 0 && i < lastdrive; i++) {
239  	    if (erased[i]) {
240  	      jerasure_matrix_dotprod(k, w, decoding_matrix+(i*k), dm_ids, i, data_ptrs, coding_ptrs, size);
241  	      edd--;
242  	    }
243  	  }
244  	
245  	  /* Then if necessary, decode drive lastdrive */
246  	
247  	  if (edd > 0) {
248  	    tmpids = talloc(int, k);
249  	    for (i = 0; i < k; i++) {
250  	      tmpids[i] = (i < lastdrive) ? i : i+1;
251  	    }
252  	    jerasure_matrix_dotprod(k, w, matrix, tmpids, lastdrive, data_ptrs, coding_ptrs, size);
253  	    free(tmpids);
254  	  }
255  	  
256  	  /* Finally, re-encode any erased coding devices */
257  	
258  	  for (i = 0; i < m; i++) {
259  	    if (erased[k+i]) {
260  	      jerasure_matrix_dotprod(k, w, matrix+(i*k), NULL, i+k, data_ptrs, coding_ptrs, size);
261  	    }
262  	  }
263  	
264  	  free(erased);
265  	  if (dm_ids != NULL) free(dm_ids);
266  	  if (decoding_matrix != NULL) free(decoding_matrix);
267  	
268  	  return 0;
269  	}
270  	
271  	
272  	int *jerasure_matrix_to_bitmatrix(int k, int m, int w, int *matrix) 
273  	{
274  	  int *bitmatrix;
275  	  int rowelts, rowindex, colindex, elt, i, j, l, x;
276  	
277  	  if (matrix == NULL) { return NULL; }
278  	
279  	  bitmatrix = talloc(int, k*m*w*w);
280  	
281  	  rowelts = k * w;
282  	  rowindex = 0;
283  	
284  	  for (i = 0; i < m; i++) {
285  	    colindex = rowindex;
286  	    for (j = 0; j < k; j++) {
287  	      elt = matrix[i*k+j];
288  	      for (x = 0; x < w; x++) {
289  	        for (l = 0; l < w; l++) {
290  	          bitmatrix[colindex+x+l*rowelts] = ((elt & (1 << l)) ? 1 : 0);
291  	        }
292  	        elt = galois_single_multiply(elt, 2, w);
293  	      }
294  	      colindex += w;
295  	    }
296  	    rowindex += rowelts * w;
297  	  }
298  	  return bitmatrix;
299  	}
300  	
301  	void jerasure_matrix_encode(int k, int m, int w, int *matrix,
302  	                          char **data_ptrs, char **coding_ptrs, int size)
303  	{
304  	  int i;
305  	  
306  	  if (w != 8 && w != 16 && w != 32) {
307  	    fprintf(stderr, "ERROR: jerasure_matrix_encode() and w is not 8, 16 or 32\n");
308  	    assert(0);
309  	  }
310  	
311  	  for (i = 0; i < m; i++) {
312  	    jerasure_matrix_dotprod(k, w, matrix+(i*k), NULL, k+i, data_ptrs, coding_ptrs, size);
313  	  }
314  	}
315  	
316  	void jerasure_bitmatrix_dotprod(int k, int w, int *bitmatrix_row,
317  	                             int *src_ids, int dest_id,
318  	                             char **data_ptrs, char **coding_ptrs, int size, int packetsize)
319  	{
320  	  int j, sindex, pstarted, index, x, y;
321  	  char *dptr, *pptr, *bdptr, *bpptr;
322  	
323  	  if (size%(w*packetsize) != 0) {
324  	    fprintf(stderr, "jerasure_bitmatrix_dotprod - size%c(w*packetsize)) must = 0\n", '%');
325  	    assert(0);
326  	  }
327  	
328  	  bpptr = (dest_id < k) ? data_ptrs[dest_id] : coding_ptrs[dest_id-k];
329  	
330  	  for (sindex = 0; sindex < size; sindex += (packetsize*w)) {
331  	    index = 0;
332  	    for (j = 0; j < w; j++) {
333  	      pstarted = 0;
334  	      pptr = bpptr + sindex + j*packetsize;
335  	      for (x = 0; x < k; x++) {
336  	        if (src_ids == NULL) {
337  	          bdptr = data_ptrs[x];
338  	        } else if (src_ids[x] < k) {
339  	          bdptr = data_ptrs[src_ids[x]];
340  	        } else {
341  	          bdptr = coding_ptrs[src_ids[x]-k];
342  	        }
343  	        for (y = 0; y < w; y++) {
344  	          if (bitmatrix_row[index]) {
345  	            dptr = bdptr + sindex + y*packetsize;
346  	            if (!pstarted) {
347  	              memcpy(pptr, dptr, packetsize);
348  	              jerasure_total_memcpy_bytes += packetsize;
349  	              pstarted = 1;
350  	            } else {
351  	              galois_region_xor(dptr, pptr, packetsize);
352  	              jerasure_total_xor_bytes += packetsize;
353  	            }
354  	          }
355  	          index++;
356  	        }
357  	      }
358  	    }
359  	  }
360  	}
361  	
362  	void jerasure_do_parity(int k, char **data_ptrs, char *parity_ptr, int size) 
363  	{
364  	  int i;
365  	
366  	  memcpy(parity_ptr, data_ptrs[0], size);
367  	  jerasure_total_memcpy_bytes += size;
368  	  
369  	  for (i = 1; i < k; i++) {
370  	    galois_region_xor(data_ptrs[i], parity_ptr, size);
371  	    jerasure_total_xor_bytes += size;
372  	  }
373  	}
374  	
375  	int jerasure_invert_matrix(int *mat, int *inv, int rows, int w)
376  	{
377  	  int cols, i, j, k, x, rs2;
378  	  int row_start, tmp, inverse;
379  	 
380  	  cols = rows;
381  	
382  	  k = 0;
383  	  for (i = 0; i < rows; i++) {
384  	    for (j = 0; j < cols; j++) {
385  	      inv[k] = (i == j) ? 1 : 0;
386  	      k++;
387  	    }
388  	  }
389  	
390  	  /* First -- convert into upper triangular  */
391  	  for (i = 0; i < cols; i++) {
392  	    row_start = cols*i;
393  	
394  	    /* Swap rows if we ave a zero i,i element.  If we can't swap, then the 
395  	       matrix was not invertible  */
396  	
397  	    if (mat[row_start+i] == 0) { 
398  	      for (j = i+1; j < rows && mat[cols*j+i] == 0; j++) ;
399  	      if (j == rows) return -1;
400  	      rs2 = j*cols;
401  	      for (k = 0; k < cols; k++) {
402  	        tmp = mat[row_start+k];
403  	        mat[row_start+k] = mat[rs2+k];
404  	        mat[rs2+k] = tmp;
405  	        tmp = inv[row_start+k];
406  	        inv[row_start+k] = inv[rs2+k];
407  	        inv[rs2+k] = tmp;
408  	      }
409  	    }
410  	 
411  	    /* Multiply the row by 1/element i,i  */
412  	    tmp = mat[row_start+i];
413  	    if (tmp != 1) {
414  	      inverse = galois_single_divide(1, tmp, w);
415  	      for (j = 0; j < cols; j++) { 
416  	        mat[row_start+j] = galois_single_multiply(mat[row_start+j], inverse, w);
417  	        inv[row_start+j] = galois_single_multiply(inv[row_start+j], inverse, w);
418  	      }
419  	    }
420  	
421  	    /* Now for each j>i, add A_ji*Ai to Aj  */
422  	    k = row_start+i;
423  	    for (j = i+1; j != cols; j++) {
424  	      k += cols;
425  	      if (mat[k] != 0) {
426  	        if (mat[k] == 1) {
427  	          rs2 = cols*j;
428  	          for (x = 0; x < cols; x++) {
429  	            mat[rs2+x] ^= mat[row_start+x];
430  	            inv[rs2+x] ^= inv[row_start+x];
431  	          }
432  	        } else {
433  	          tmp = mat[k];
434  	          rs2 = cols*j;
435  	          for (x = 0; x < cols; x++) {
436  	            mat[rs2+x] ^= galois_single_multiply(tmp, mat[row_start+x], w);
437  	            inv[rs2+x] ^= galois_single_multiply(tmp, inv[row_start+x], w);
438  	          }
439  	        }
440  	      }
441  	    }
442  	  }
443  	
444  	  /* Now the matrix is upper triangular.  Start at the top and multiply down  */
445  	
446  	  for (i = rows-1; i >= 0; i--) {
447  	    row_start = i*cols;
448  	    for (j = 0; j < i; j++) {
449  	      rs2 = j*cols;
450  	      if (mat[rs2+i] != 0) {
451  	        tmp = mat[rs2+i];
452  	        mat[rs2+i] = 0; 
453  	        for (k = 0; k < cols; k++) {
454  	          inv[rs2+k] ^= galois_single_multiply(tmp, inv[row_start+k], w);
455  	        }
456  	      }
457  	    }
458  	  }
459  	  return 0;
460  	}
461  	
462  	int jerasure_invertible_matrix(int *mat, int rows, int w)
463  	{
464  	  int cols, i, j, k, x, rs2;
465  	  int row_start, tmp, inverse;
466  	 
467  	  cols = rows;
468  	
469  	  /* First -- convert into upper triangular  */
470  	  for (i = 0; i < cols; i++) {
471  	    row_start = cols*i;
472  	
473  	    /* Swap rows if we ave a zero i,i element.  If we can't swap, then the 
474  	       matrix was not invertible  */
475  	
476  	    if (mat[row_start+i] == 0) { 
477  	      for (j = i+1; j < rows && mat[cols*j+i] == 0; j++) ;
478  	      if (j == rows) return 0;
479  	      rs2 = j*cols;
480  	      for (k = 0; k < cols; k++) {
481  	        tmp = mat[row_start+k];
482  	        mat[row_start+k] = mat[rs2+k];
483  	        mat[rs2+k] = tmp;
484  	      }
485  	    }
486  	 
487  	    /* Multiply the row by 1/element i,i  */
488  	    tmp = mat[row_start+i];
489  	    if (tmp != 1) {
490  	      inverse = galois_single_divide(1, tmp, w);
491  	      for (j = 0; j < cols; j++) { 
492  	        mat[row_start+j] = galois_single_multiply(mat[row_start+j], inverse, w);
493  	      }
494  	    }
495  	
496  	    /* Now for each j>i, add A_ji*Ai to Aj  */
497  	    k = row_start+i;
498  	    for (j = i+1; j != cols; j++) {
499  	      k += cols;
500  	      if (mat[k] != 0) {
501  	        if (mat[k] == 1) {
502  	          rs2 = cols*j;
503  	          for (x = 0; x < cols; x++) {
504  	            mat[rs2+x] ^= mat[row_start+x];
505  	          }
506  	        } else {
507  	          tmp = mat[k];
508  	          rs2 = cols*j;
509  	          for (x = 0; x < cols; x++) {
510  	            mat[rs2+x] ^= galois_single_multiply(tmp, mat[row_start+x], w);
511  	          }
512  	        }
513  	      }
514  	    }
515  	  }
516  	  return 1;
517  	}
518  	
519  	/* Converts a list-style version of the erasures into an array of k+m elements
520  	   where the element = 1 if the index has been erased, and zero otherwise */
521  	
522  	int *jerasure_erasures_to_erased(int k, int m, int *erasures)
523  	{
524  	  int td;
525  	  int t_non_erased;
526  	  int *erased;
527  	  int i;
528  	
529  	  td = k+m;
530  	  erased = talloc(int, td);
531  	  if (erased == NULL) return NULL;
532  	  t_non_erased = td;
533  	
534  	  for (i = 0; i < td; i++) erased[i] = 0;
535  	
536  	  for (i = 0; erasures[i] != -1; i++) {
537  	    if (erased[erasures[i]] == 0) {
538  	      erased[erasures[i]] = 1;
539  	      t_non_erased--;
540  	      if (t_non_erased < k) {
541  	        free(erased);
542  	        return NULL;
543  	      }
544  	    }
545  	  }
546  	  return erased;
547  	}
548  	  
549  	void jerasure_free_schedule(int **schedule)
550  	{
551  	  int i;
552  	
553  	  for (i = 0; schedule[i][0] >= 0; i++) free(schedule[i]);
554  	  free(schedule[i]);
555  	  free(schedule);
556  	}
557  	
558  	void jerasure_free_schedule_cache(int k, int m, int ***cache)
559  	{
560  	  int e1, e2;
561  	
562  	  if (m != 2) {
563  	    fprintf(stderr, "jerasure_free_schedule_cache(): m must equal 2\n");
564  	    assert(0);
565  	  }
566  	
567  	  for (e1 = 0; e1 < k+m; e1++) {
568  	    for (e2 = 0; e2 < e1; e2++) {
569  	      jerasure_free_schedule(cache[e1*(k+m)+e2]);
570  	    }
571  	    jerasure_free_schedule(cache[e1*(k+m)+e1]);
572  	  }
573  	  free(cache);
574  	}
575  	
576  	void jerasure_matrix_dotprod(int k, int w, int *matrix_row,
577  	                          int *src_ids, int dest_id,
578  	                          char **data_ptrs, char **coding_ptrs, int size)
579  	{
580  	  int init;
581  	  char *dptr, *sptr;
582  	  int i;
583  	
584  	  if (w != 1 && w != 8 && w != 16 && w != 32) {
585  	    fprintf(stderr, "ERROR: jerasure_matrix_dotprod() called and w is not 1, 8, 16 or 32\n");
586  	    assert(0);
587  	  }
588  	
589  	  init = 0;
590  	
591  	  dptr = (dest_id < k) ? data_ptrs[dest_id] : coding_ptrs[dest_id-k];
592  	
593  	  /* First copy or xor any data that does not need to be multiplied by a factor */
594  	
595  	  for (i = 0; i < k; i++) {
596  	    if (matrix_row[i] == 1) {
597  	      if (src_ids == NULL) {
598  	        sptr = data_ptrs[i];
599  	      } else if (src_ids[i] < k) {
600  	        sptr = data_ptrs[src_ids[i]];
601  	      } else {
602  	        sptr = coding_ptrs[src_ids[i]-k];
603  	      }
604  	      if (init == 0) {
605  	        memcpy(dptr, sptr, size);
606  	        jerasure_total_memcpy_bytes += size;
607  	        init = 1;
608  	      } else {
609  	        galois_region_xor(sptr, dptr, size);
610  	        jerasure_total_xor_bytes += size;
611  	      }
612  	    }
613  	  }
614  	
615  	  /* Now do the data that needs to be multiplied by a factor */
616  	
617  	  for (i = 0; i < k; i++) {
618  	    if (matrix_row[i] != 0 && matrix_row[i] != 1) {
619  	      if (src_ids == NULL) {
620  	        sptr = data_ptrs[i];
621  	      } else if (src_ids[i] < k) {
622  	        sptr = data_ptrs[src_ids[i]];
623  	      } else {
624  	        sptr = coding_ptrs[src_ids[i]-k];
625  	      }
626  	      switch (w) {
627  	        case 8:  galois_w08_region_multiply(sptr, matrix_row[i], size, dptr, init); break;
628  	        case 16: galois_w16_region_multiply(sptr, matrix_row[i], size, dptr, init); break;
629  	        case 32: galois_w32_region_multiply(sptr, matrix_row[i], size, dptr, init); break;
630  	      }
631  	      jerasure_total_gf_bytes += size;
632  	      init = 1;
633  	    }
634  	  }
635  	}
636  	
637  	
638  	int jerasure_bitmatrix_decode(int k, int m, int w, int *bitmatrix, int row_k_ones, int *erasures,
639  	                            char **data_ptrs, char **coding_ptrs, int size, int packetsize)
640  	{
641  	  int i;
642  	  int *erased;
643  	  int *decoding_matrix;
644  	  int *dm_ids;
645  	  int edd, *tmpids, lastdrive;
646  	  
647  	  erased = jerasure_erasures_to_erased(k, m, erasures);
648  	  if (erased == NULL) return -1;
649  	
650  	  /* See jerasure_matrix_decode for the logic of this routine.  This one works just like
651  	     it, but calls the bitmatrix ops instead */
652  	
653  	  lastdrive = k;
654  	    
655  	  edd = 0;
656  	  for (i = 0; i < k; i++) {
657  	    if (erased[i]) {
658  	      edd++;
659  	      lastdrive = i;
660  	    } 
661  	  }
662  	
663  	  if (row_k_ones != 1 || erased[k]) lastdrive = k;
664  	  
665  	  dm_ids = NULL;
666  	  decoding_matrix = NULL;
667  	  
668  	  if (edd > 1 || (edd > 0 && (row_k_ones != 1 || erased[k]))) {
669  	
670  	    dm_ids = talloc(int, k);
671  	    if (dm_ids == NULL) {
672  	      free(erased);
673  	      return -1;
674  	    }
675  	  
676  	    decoding_matrix = talloc(int, k*k*w*w);
677  	    if (decoding_matrix == NULL) {
678  	      free(erased);
679  	      free(dm_ids);
680  	      return -1;
681  	    }
682  	  
683  	    if (jerasure_make_decoding_bitmatrix(k, m, w, bitmatrix, erased, decoding_matrix, dm_ids) < 0) {
684  	      free(erased);
685  	      free(dm_ids);
686  	      free(decoding_matrix);
687  	      return -1;
688  	    }
689  	  }
690  	
691  	  for (i = 0; edd > 0 && i < lastdrive; i++) {
692  	    if (erased[i]) {
693  	      jerasure_bitmatrix_dotprod(k, w, decoding_matrix+i*k*w*w, dm_ids, i, data_ptrs, coding_ptrs, size, packetsize);
694  	      edd--;
695  	    }
696  	  }
697  	
698  	  if (edd > 0) {
699  	    tmpids = talloc(int, k);
700  	    for (i = 0; i < k; i++) {
701  	      tmpids[i] = (i < lastdrive) ? i : i+1;
702  	    }
703  	    jerasure_bitmatrix_dotprod(k, w, bitmatrix, tmpids, lastdrive, data_ptrs, coding_ptrs, size, packetsize);
704  	    free(tmpids);
705  	  }
706  	
707  	  for (i = 0; i < m; i++) {
708  	    if (erased[k+i]) {
709  	      jerasure_bitmatrix_dotprod(k, w, bitmatrix+i*k*w*w, NULL, k+i, data_ptrs, coding_ptrs, size, packetsize);
710  	    }
711  	  }
712  	
713  	  free(erased);
714  	  if (dm_ids != NULL) free(dm_ids);
715  	  if (decoding_matrix != NULL) free(decoding_matrix);
716  	
717  	  return 0;
718  	}
719  	
720  	static char **set_up_ptrs_for_scheduled_decoding(int k, int m, int *erasures, char **data_ptrs, char **coding_ptrs)
721  	{
722  	  int ddf, cdf;
723  	  int *erased;
724  	  char **ptrs;
725  	  int i, j, x;
726  	
727  	  ddf = 0;
728  	  cdf = 0;
729  	  for (i = 0; erasures[i] != -1; i++) {
730  	    if (erasures[i] < k) ddf++; else cdf++;
731  	  }
732  	  
733  	  erased = jerasure_erasures_to_erased(k, m, erasures);
734  	  if (erased == NULL) return NULL;
735  	
736  	  /* Set up ptrs.  It will be as follows:
737  	
738  	       - If data drive i has not failed, then ptrs[i] = data_ptrs[i].
739  	       - If data drive i has failed, then ptrs[i] = coding_ptrs[j], where j is the 
740  	            lowest unused non-failed coding drive.
741  	       - Elements k to k+ddf-1 are data_ptrs[] of the failed data drives.
742  	       - Elements k+ddf to k+ddf+cdf-1 are coding_ptrs[] of the failed data drives.
743  	
744  	       The array row_ids contains the ids of ptrs.
745  	       The array ind_to_row_ids contains the row_id of drive i.
746  	  
747  	       However, we're going to set row_ids and ind_to_row in a different procedure.
748  	   */
749  	         
750  	  ptrs = talloc(char *, k+m);
751  	
752  	  j = k;
753  	  x = k;
754  	  for (i = 0; i < k; i++) {
755  	    if (erased[i] == 0) {
756  	      ptrs[i] = data_ptrs[i];
757  	    } else {
758  	      while (erased[j]) j++;
759  	      ptrs[i] = coding_ptrs[j-k];
760  	      j++;
761  	      ptrs[x] = data_ptrs[i];
762  	      x++;
763  	    }
764  	  }
765  	  for (i = k; i < k+m; i++) {
766  	    if (erased[i]) {
767  	      ptrs[x] = coding_ptrs[i-k];
768  	      x++;
769  	    }
770  	  }
771  	  free(erased);
772  	  return ptrs;
773  	}
774  	
(1) Event noescape: "set_up_ids_for_scheduled_decoding(int, int, int *, int *, int *)" does not free or save its parameter "row_ids".
775  	static int set_up_ids_for_scheduled_decoding(int k, int m, int *erasures, int *row_ids, int *ind_to_row)
776  	{
777  	  int ddf, cdf;
778  	  int *erased;
779  	  int i, j, x;
780  	
781  	  ddf = 0;
782  	  cdf = 0;
783  	  for (i = 0; erasures[i] != -1; i++) {
784  	    if (erasures[i] < k) ddf++; else cdf++;
785  	  }
786  	  
787  	  erased = jerasure_erasures_to_erased(k, m, erasures);
788  	  if (erased == NULL) return -1;
789  	
790  	  /* See set_up_ptrs_for_scheduled_decoding for how these are set */
791  	
792  	  j = k;
793  	  x = k;
794  	  for (i = 0; i < k; i++) {
795  	    if (erased[i] == 0) {
796  	      row_ids[i] = i;
797  	      ind_to_row[i] = i;
798  	    } else {
799  	      while (erased[j]) j++;
800  	      row_ids[i] = j;
801  	      ind_to_row[j] = i;
802  	      j++;
803  	      row_ids[x] = i;
804  	      ind_to_row[i] = x;
805  	      x++;
806  	    }
807  	  }
808  	  for (i = k; i < k+m; i++) {
809  	    if (erased[i]) {
810  	      row_ids[x] = i;
811  	      ind_to_row[i] = x;
812  	      x++;
813  	    }
814  	  }
815  	  free(erased);
816  	  return 0;
817  	}
818  	
819  	static int **jerasure_generate_decoding_schedule(int k, int m, int w, int *bitmatrix, int *erasures, int smart)
820  	{
821  	  int i, j, x, drive, y, index, z;
822  	  int *decoding_matrix, *inverse, *real_decoding_matrix;
823  	  int *ptr;
824  	  int *row_ids;
825  	  int *ind_to_row;
826  	  int ddf, cdf;
827  	  int **schedule;
828  	  int *b1, *b2;
829  	 
830  	 /* First, figure out the number of data drives that have failed, and the
831  	    number of coding drives that have failed: ddf and cdf */
832  	
833  	  ddf = 0;
834  	  cdf = 0;
(1) Event cond_true: Condition "erasures[i] != -1", taking true branch.
(6) Event loop_begin: Jumped back to beginning of loop.
(7) Event cond_true: Condition "erasures[i] != -1", taking true branch.
(12) Event loop_begin: Jumped back to beginning of loop.
(13) Event cond_true: Condition "erasures[i] != -1", taking true branch.
(17) Event loop_begin: Jumped back to beginning of loop.
(18) Event cond_true: Condition "erasures[i] != -1", taking true branch.
(22) Event loop_begin: Jumped back to beginning of loop.
(23) Event cond_false: Condition "erasures[i] != -1", taking false branch.
835  	  for (i = 0; erasures[i] != -1; i++) {
(2) Event cond_true: Condition "erasures[i] < k", taking true branch.
(3) Event if_fallthrough: Falling through to end of if statement.
(4) Event if_end: End of if statement.
(8) Event cond_true: Condition "erasures[i] < k", taking true branch.
(9) Event if_fallthrough: Falling through to end of if statement.
(10) Event if_end: End of if statement.
(14) Event cond_false: Condition "erasures[i] < k", taking false branch.
(15) Event else_branch: Reached else branch.
(19) Event cond_false: Condition "erasures[i] < k", taking false branch.
(20) Event else_branch: Reached else branch.
836  	    if (erasures[i] < k) ddf++; else cdf++;
(5) Event loop: Jumping back to the beginning of the loop.
(11) Event loop: Jumping back to the beginning of the loop.
(16) Event loop: Jumping back to the beginning of the loop.
(21) Event loop: Jumping back to the beginning of the loop.
(24) Event loop_end: Reached end of loop.
837  	  }
838  	  
(25) Event alloc_fn: Storage is returned from allocation function "malloc".
(26) Event var_assign: Assigning: "row_ids" = storage returned from "malloc(4UL * (k + m))".
Also see events: [noescape][leaked_storage]
839  	  row_ids = talloc(int, k+m);
840  	  ind_to_row = talloc(int, k+m);
841  	
(27) Event noescape: Resource "row_ids" is not freed or pointed-to in "set_up_ids_for_scheduled_decoding". [details]
(28) Event cond_true: Condition "set_up_ids_for_scheduled_decoding(k, m, erasures, row_ids, ind_to_row) < 0", taking true branch.
(29) Event leaked_storage: Variable "row_ids" going out of scope leaks the storage it points to.
Also see events: [alloc_fn][var_assign]
842  	  if (set_up_ids_for_scheduled_decoding(k, m, erasures, row_ids, ind_to_row) < 0) return NULL;
843  	
844  	  /* Now, we're going to create one decoding matrix which is going to 
845  	     decode everything with one call.  The hope is that the scheduler
846  	     will do a good job.    This matrix has w*e rows, where e is the
847  	     number of erasures (ddf+cdf) */
848  	
849  	  real_decoding_matrix = talloc(int, k*w*(cdf+ddf)*w);
850  	
851  	  /* First, if any data drives have failed, then initialize the first
852  	     ddf*w rows of the decoding matrix from the standard decoding
853  	     matrix inversion */
854  	
855  	  if (ddf > 0) {
856  	    
857  	    decoding_matrix = talloc(int, k*k*w*w);
858  	    ptr = decoding_matrix;
859  	    for (i = 0; i < k; i++) {
860  	      if (row_ids[i] == i) {
861  	        bzero(ptr, k*w*w*sizeof(int));
862  	        for (x = 0; x < w; x++) {
863  	          ptr[x+i*w+x*k*w] = 1;
864  	        } 
865  	      } else {
866  	        memcpy(ptr, bitmatrix+k*w*w*(row_ids[i]-k), k*w*w*sizeof(int));
867  	      }
868  	      ptr += (k*w*w);
869  	    }
870  	    inverse = talloc(int, k*k*w*w);
871  	    jerasure_invert_bitmatrix(decoding_matrix, inverse, k*w);
872  	
873  	/*    printf("\nMatrix to invert\n");
874  	    jerasure_print_bitmatrix(decoding_matrix, k*w, k*w, w);
875  	    printf("\n");
876  	    printf("\nInverse\n");
877  	    jerasure_print_bitmatrix(inverse, k*w, k*w, w);
878  	    printf("\n"); */
879  	
880  	    free(decoding_matrix);
881  	    ptr = real_decoding_matrix;
882  	    for (i = 0; i < ddf; i++) {
883  	      memcpy(ptr, inverse+k*w*w*row_ids[k+i], sizeof(int)*k*w*w);
884  	      ptr += (k*w*w);
885  	    }
886  	    free(inverse);
887  	  } 
888  	
889  	  /* Next, here comes the hard part.  For each coding node that needs
890  	     to be decoded, you start by putting its rows of the distribution
891  	     matrix into the decoding matrix.  If there were no failed data
892  	     nodes, then you're done.  However, if there have been failed
893  	     data nodes, then you need to modify the columns that correspond
894  	     to the data nodes.  You do that by first zeroing them.  Then
895  	     whereever there is a one in the distribution matrix, you XOR
896  	     in the corresponding row from the failed data node's entry in
897  	     the decoding matrix.  The whole process kind of makes my head
898  	     spin, but it works.
899  	   */
900  	
901  	  for (x = 0; x < cdf; x++) {
902  	    drive = row_ids[x+ddf+k]-k;
903  	    ptr = real_decoding_matrix + k*w*w*(ddf+x);
904  	    memcpy(ptr, bitmatrix+drive*k*w*w, sizeof(int)*k*w*w);
905  	
906  	    for (i = 0; i < k; i++) {
907  	      if (row_ids[i] != i) {
908  	        for (j = 0; j < w; j++) {
909  	          bzero(ptr+j*k*w+i*w, sizeof(int)*w);
910  	        }
911  	      }  
912  	    }
913  	
914  	    /* There's the yucky part */
915  	
916  	    index = drive*k*w*w;
917  	    for (i = 0; i < k; i++) {
918  	      if (row_ids[i] != i) {
919  	        b1 = real_decoding_matrix+(ind_to_row[i]-k)*k*w*w;
920  	        for (j = 0; j < w; j++) {
921  	          b2 = ptr + j*k*w;
922  	          for (y = 0; y < w; y++) {
923  	            if (bitmatrix[index+j*k*w+i*w+y]) {
924  	              for (z = 0; z < k*w; z++) {
925  	                b2[z] = b2[z] ^ b1[z+y*k*w];
926  	              }
927  	            }
928  	          }
929  	        }
930  	      }  
931  	    }
932  	  }
933  	
934  	/*
935  	  printf("\n\nReal Decoding Matrix\n\n");
936  	  jerasure_print_bitmatrix(real_decoding_matrix, (ddf+cdf)*w, k*w, w);
937  	  printf("\n"); */
938  	  if (smart) {
939  	    schedule = jerasure_smart_bitmatrix_to_schedule(k, ddf+cdf, w, real_decoding_matrix);
940  	  } else {
941  	    schedule = jerasure_dumb_bitmatrix_to_schedule(k, ddf+cdf, w, real_decoding_matrix);
942  	  }
943  	  free(row_ids);
944  	  free(ind_to_row);
945  	  free(real_decoding_matrix);
946  	  return schedule;
947  	}
948  	
949  	int jerasure_schedule_decode_lazy(int k, int m, int w, int *bitmatrix, int *erasures,
950  	                            char **data_ptrs, char **coding_ptrs, int size, int packetsize, 
951  	                            int smart)
952  	{
953  	  int i, tdone;
954  	  char **ptrs;
955  	  int **schedule;
956  	 
957  	  ptrs = set_up_ptrs_for_scheduled_decoding(k, m, erasures, data_ptrs, coding_ptrs);
958  	  if (ptrs == NULL) return -1;
959  	
960  	  schedule = jerasure_generate_decoding_schedule(k, m, w, bitmatrix, erasures, smart);
961  	  if (schedule == NULL) {
962  	    free(ptrs);
963  	    return -1;
964  	  }
965  	
966  	  for (tdone = 0; tdone < size; tdone += packetsize*w) {
967  	  jerasure_do_scheduled_operations(ptrs, schedule, packetsize);
968  	    for (i = 0; i < k+m; i++) ptrs[i] += (packetsize*w);
969  	  }
970  	
971  	  jerasure_free_schedule(schedule);
972  	  free(ptrs);
973  	
974  	  return 0;
975  	}
976  	
977  	int jerasure_schedule_decode_cache(int k, int m, int w, int ***scache, int *erasures,
978  	                            char **data_ptrs, char **coding_ptrs, int size, int packetsize)
979  	{
980  	  int i, tdone;
981  	  char **ptrs;
982  	  int **schedule;
983  	  int index;
984  	 
985  	  if (erasures[1] == -1) {
986  	    index = erasures[0]*(k+m) + erasures[0];
987  	  } else if (erasures[2] == -1) {
988  	    index = erasures[0]*(k+m) + erasures[1];
989  	  } else {
990  	    return -1;
991  	  }
992  	
993  	  schedule = scache[index];
994  	
995  	  ptrs = set_up_ptrs_for_scheduled_decoding(k, m, erasures, data_ptrs, coding_ptrs);
996  	  if (ptrs == NULL) return -1;
997  	
998  	
999  	  for (tdone = 0; tdone < size; tdone += packetsize*w) {
1000 	  jerasure_do_scheduled_operations(ptrs, schedule, packetsize);
1001 	    for (i = 0; i < k+m; i++) ptrs[i] += (packetsize*w);
1002 	  }
1003 	
1004 	  free(ptrs);
1005 	
1006 	  return 0;
1007 	}
1008 	
1009 	/* This only works when m = 2 */
1010 	
1011 	int ***jerasure_generate_schedule_cache(int k, int m, int w, int *bitmatrix, int smart)
1012 	{
1013 	  int ***scache;
1014 	  int erasures[3];
1015 	  int e1, e2;
1016 	 
1017 	  /* Ok -- this is yucky, but it's how I'm doing it.  You will make an index out
1018 	     of erasures, which will be  e1*(k+m)+(e2).  If there is no e2, then e2 = e1.
1019 	     Isn't that clever and confusing.  Sorry.
1020 	
1021 	     We're not going to worry about ordering -- in other words, the schedule for
1022 	     e1,e2 will be the same as e2,e1.  They will have the same pointer -- the 
1023 	     schedule will not be duplicated. */
1024 	
1025 	  if (m != 2) return NULL;
1026 	
1027 	  scache = talloc(int **, (k+m)*(k+m+1));
1028 	  if (scache == NULL) return NULL;
1029 	  
1030 	  for (e1 = 0; e1 < k+m; e1++) {
1031 	    erasures[0] = e1;
1032 	    for (e2 = 0; e2 < e1; e2++) {
1033 	      erasures[1] = e2;
1034 	      erasures[2] = -1;
1035 	      scache[e1*(k+m)+e2] = jerasure_generate_decoding_schedule(k, m, w, bitmatrix, erasures, smart);
1036 	      scache[e2*(k+m)+e1] = scache[e1*(k+m)+e2];
1037 	    }
1038 	    erasures[1] = -1;
1039 	    scache[e1*(k+m)+e1] = jerasure_generate_decoding_schedule(k, m, w, bitmatrix, erasures, smart);
1040 	  }
1041 	  return scache;
1042 	
1043 	}
1044 	
1045 	int jerasure_invert_bitmatrix(int *mat, int *inv, int rows)
1046 	{
1047 	  int cols, i, j, k;
1048 	  int tmp;
1049 	 
1050 	  cols = rows;
1051 	
1052 	  k = 0;
1053 	  for (i = 0; i < rows; i++) {
1054 	    for (j = 0; j < cols; j++) {
1055 	      inv[k] = (i == j) ? 1 : 0;
1056 	      k++;
1057 	    }
1058 	  }
1059 	
1060 	  /* First -- convert into upper triangular */
1061 	
1062 	  for (i = 0; i < cols; i++) {
1063 	
1064 	    /* Swap rows if we have a zero i,i element.  If we can't swap, then the 
1065 	       matrix was not invertible */
1066 	
1067 	    if ((mat[i*cols+i]) == 0) { 
1068 	      for (j = i+1; j < rows && (mat[j*cols+i]) == 0; j++) ;
1069 	      if (j == rows) return -1;
1070 	      for (k = 0; k < cols; k++) {
1071 	        tmp = mat[i*cols+k]; mat[i*cols+k] = mat[j*cols+k]; mat[j*cols+k] = tmp;
1072 	        tmp = inv[i*cols+k]; inv[i*cols+k] = inv[j*cols+k]; inv[j*cols+k] = tmp;
1073 	      }
1074 	    }
1075 	 
1076 	    /* Now for each j>i, add A_ji*Ai to Aj */
1077 	    for (j = i+1; j != rows; j++) {
1078 	      if (mat[j*cols+i] != 0) {
1079 	        for (k = 0; k < cols; k++) {
1080 	          mat[j*cols+k] ^= mat[i*cols+k]; 
1081 	          inv[j*cols+k] ^= inv[i*cols+k];
1082 	        }
1083 	      }
1084 	    }
1085 	  }
1086 	
1087 	  /* Now the matrix is upper triangular.  Start at the top and multiply down */
1088 	
1089 	  for (i = rows-1; i >= 0; i--) {
1090 	    for (j = 0; j < i; j++) {
1091 	      if (mat[j*cols+i]) {
1092 	        for (k = 0; k < cols; k++) {
1093 	          mat[j*cols+k] ^= mat[i*cols+k]; 
1094 	          inv[j*cols+k] ^= inv[i*cols+k];
1095 	        }
1096 	      }
1097 	    }
1098 	  } 
1099 	  return 0;
1100 	}
1101 	
1102 	int jerasure_invertible_bitmatrix(int *mat, int rows)
1103 	{
1104 	  int cols, i, j, k;
1105 	  int tmp;
1106 	 
1107 	  cols = rows;
1108 	
1109 	  /* First -- convert into upper triangular */
1110 	
1111 	  for (i = 0; i < cols; i++) {
1112 	
1113 	    /* Swap rows if we have a zero i,i element.  If we can't swap, then the 
1114 	       matrix was not invertible */
1115 	
1116 	    if ((mat[i*cols+i]) == 0) { 
1117 	      for (j = i+1; j < rows && (mat[j*cols+i]) == 0; j++) ;
1118 	      if (j == rows) return 0;
1119 	      for (k = 0; k < cols; k++) {
1120 	        tmp = mat[i*cols+k]; mat[i*cols+k] = mat[j*cols+k]; mat[j*cols+k] = tmp;
1121 	      }
1122 	    }
1123 	 
1124 	    /* Now for each j>i, add A_ji*Ai to Aj */
1125 	    for (j = i+1; j != rows; j++) {
1126 	      if (mat[j*cols+i] != 0) {
1127 	        for (k = 0; k < cols; k++) {
1128 	          mat[j*cols+k] ^= mat[i*cols+k]; 
1129 	        }
1130 	      }
1131 	    }
1132 	  }
1133 	  return 1;
1134 	}
1135 	
1136 	  
1137 	int *jerasure_matrix_multiply(int *m1, int *m2, int r1, int c1, int r2, int c2, int w)
1138 	{
1139 	  int *product, i, j, k;
1140 	
1141 	  product = (int *) malloc(sizeof(int)*r1*c2);
1142 	  for (i = 0; i < r1*c2; i++) product[i] = 0;
1143 	
1144 	  for (i = 0; i < r1; i++) {
1145 	    for (j = 0; j < c2; j++) {
1146 	      for (k = 0; k < r2; k++) {
1147 	        product[i*c2+j] ^= galois_single_multiply(m1[i*c1+k], m2[k*c2+j], w);
1148 	      }
1149 	    }
1150 	  }
1151 	  return product;
1152 	}
1153 	
1154 	void jerasure_get_stats(double *fill_in)
1155 	{
1156 	  fill_in[0] = jerasure_total_xor_bytes;
1157 	  fill_in[1] = jerasure_total_gf_bytes;
1158 	  fill_in[2] = jerasure_total_memcpy_bytes;
1159 	  jerasure_total_xor_bytes = 0;
1160 	  jerasure_total_gf_bytes = 0;
1161 	  jerasure_total_memcpy_bytes = 0;
1162 	}
1163 	
1164 	void jerasure_do_scheduled_operations(char **ptrs, int **operations, int packetsize)
1165 	{
1166 	  char *sptr;
1167 	  char *dptr;
1168 	  int op;
1169 	
1170 	  for (op = 0; operations[op][0] >= 0; op++) {
1171 	    sptr = ptrs[operations[op][0]] + operations[op][1]*packetsize;
1172 	    dptr = ptrs[operations[op][2]] + operations[op][3]*packetsize;
1173 	    if (operations[op][4]) {
1174 	/*      printf("%d,%d %d,%d\n", operations[op][0], 
1175 	      operations[op][1], 
1176 	      operations[op][2], 
1177 	      operations[op][3]); 
1178 	      printf("xor(0x%x, 0x%x -> 0x%x, %d)\n", sptr, dptr, dptr, packetsize); */
1179 	      galois_region_xor(sptr, dptr, packetsize);
1180 	      jerasure_total_xor_bytes += packetsize;
1181 	    } else {
1182 	/*      printf("memcpy(0x%x <- 0x%x)\n", dptr, sptr); */
1183 	      memcpy(dptr, sptr, packetsize);
1184 	      jerasure_total_memcpy_bytes += packetsize;
1185 	    }
1186 	  }  
1187 	}
1188 	
1189 	void jerasure_schedule_encode(int k, int m, int w, int **schedule,
1190 	                                   char **data_ptrs, char **coding_ptrs, int size, int packetsize)
1191 	{
1192 	  char **ptr_copy;
1193 	  int i, tdone;
1194 	
1195 	  ptr_copy = talloc(char *, (k+m));
1196 	  for (i = 0; i < k; i++) ptr_copy[i] = data_ptrs[i];
1197 	  for (i = 0; i < m; i++) ptr_copy[i+k] = coding_ptrs[i];
1198 	  for (tdone = 0; tdone < size; tdone += packetsize*w) {
1199 	    jerasure_do_scheduled_operations(ptr_copy, schedule, packetsize);
1200 	    for (i = 0; i < k+m; i++) ptr_copy[i] += (packetsize*w);
1201 	  }
1202 	  free(ptr_copy);
1203 	}
1204 	    
1205 	int **jerasure_dumb_bitmatrix_to_schedule(int k, int m, int w, int *bitmatrix)
1206 	{
1207 	  int **operations;
1208 	  int op;
1209 	  int index, optodo, i, j;
1210 	
1211 	  operations = talloc(int *, k*m*w*w+1);
1212 	  op = 0;
1213 	  
1214 	  index = 0;
1215 	  for (i = 0; i < m*w; i++) {
1216 	    optodo = 0;
1217 	    for (j = 0; j < k*w; j++) {
1218 	      if (bitmatrix[index]) {
1219 	        operations[op] = talloc(int, 5);
1220 	        operations[op][4] = optodo;
1221 	        operations[op][0] = j/w;
1222 	        operations[op][1] = j%w;
1223 	        operations[op][2] = k+i/w;
1224 	        operations[op][3] = i%w;
1225 	        optodo = 1;
1226 	        op++;
1227 	        
1228 	      }
1229 	      index++;
1230 	    }
1231 	  }
1232 	  operations[op] = talloc(int, 5);
1233 	  operations[op][0] = -1;
1234 	  return operations;
1235 	}
1236 	
1237 	int **jerasure_smart_bitmatrix_to_schedule(int k, int m, int w, int *bitmatrix)
1238 	{
1239 	  int **operations;
1240 	  int op;
1241 	  int i, j;
1242 	  int *diff, *from, *b1, *flink, *blink;
1243 	  int *ptr, no, row;
1244 	  int optodo;
1245 	  int bestrow = 0, bestdiff, top;
1246 	
1247 	/*   printf("Scheduling:\n\n");
1248 	  jerasure_print_bitmatrix(bitmatrix, m*w, k*w, w); */
1249 	
1250 	  operations = talloc(int *, k*m*w*w+1);
1251 	  op = 0;
1252 	  
1253 	  diff = talloc(int, m*w);
1254 	  from = talloc(int, m*w);
1255 	  flink = talloc(int, m*w);
1256 	  blink = talloc(int, m*w);
1257 	
1258 	  ptr = bitmatrix;
1259 	
1260 	  bestdiff = k*w+1;
1261 	  top = 0;
1262 	  for (i = 0; i < m*w; i++) {
1263 	    no = 0;
1264 	    for (j = 0; j < k*w; j++) {
1265 	      no += *ptr;
1266 	      ptr++;
1267 	    }
1268 	    diff[i] = no;
1269 	    from[i] = -1;
1270 	    flink[i] = i+1;
1271 	    blink[i] = i-1;
1272 	    if (no < bestdiff) {
1273 	      bestdiff = no;
1274 	      bestrow = i;
1275 	    }
1276 	  }
1277 	
1278 	  flink[m*w-1] = -1;
1279 	  
1280 	  while (top != -1) {
1281 	    row = bestrow;
1282 	    /* printf("Doing row %d - %d from %d\n", row, diff[row], from[row]);  */
1283 	
1284 	    if (blink[row] == -1) {
1285 	      top = flink[row];
1286 	      if (top != -1) blink[top] = -1;
1287 	    } else {
1288 	      flink[blink[row]] = flink[row];
1289 	      if (flink[row] != -1) {
1290 	        blink[flink[row]] = blink[row];
1291 	      }
1292 	    }
1293 	
1294 	    ptr = bitmatrix + row*k*w;
1295 	    if (from[row] == -1) {
1296 	      optodo = 0;
1297 	      for (j = 0; j < k*w; j++) {
1298 	        if (ptr[j]) {
1299 	          operations[op] = talloc(int, 5);
1300 	          operations[op][4] = optodo;
1301 	          operations[op][0] = j/w;
1302 	          operations[op][1] = j%w;
1303 	          operations[op][2] = k+row/w;
1304 	          operations[op][3] = row%w;
1305 	          optodo = 1;
1306 	          op++;
1307 	        }
1308 	      }
1309 	    } else {
1310 	      operations[op] = talloc(int, 5);
1311 	      operations[op][4] = 0;
1312 	      operations[op][0] = k+from[row]/w;
1313 	      operations[op][1] = from[row]%w;
1314 	      operations[op][2] = k+row/w;
1315 	      operations[op][3] = row%w;
1316 	      op++;
1317 	      b1 = bitmatrix + from[row]*k*w;
1318 	      for (j = 0; j < k*w; j++) {
1319 	        if (ptr[j] ^ b1[j]) {
1320 	          operations[op] = talloc(int, 5);
1321 	          operations[op][4] = 1;
1322 	          operations[op][0] = j/w;
1323 	          operations[op][1] = j%w;
1324 	          operations[op][2] = k+row/w;
1325 	          operations[op][3] = row%w;
1326 	          optodo = 1;
1327 	          op++;
1328 	        }
1329 	      }
1330 	    }
1331 	    bestdiff = k*w+1;
1332 	    for (i = top; i != -1; i = flink[i]) {
1333 	      no = 1;
1334 	      b1 = bitmatrix + i*k*w;
1335 	      for (j = 0; j < k*w; j++) no += (ptr[j] ^ b1[j]);
1336 	      if (no < diff[i]) {
1337 	        from[i] = row;
1338 	        diff[i] = no;
1339 	      }
1340 	      if (diff[i] < bestdiff) {
1341 	        bestdiff = diff[i];
1342 	        bestrow = i;
1343 	      }
1344 	    }
1345 	  }
1346 	  
1347 	  operations[op] = talloc(int, 5);
1348 	  operations[op][0] = -1;
1349 	  free(from);
1350 	  free(diff);
1351 	  free(blink);
1352 	  free(flink);
1353 	
1354 	  return operations;
1355 	}
1356 	
1357 	void jerasure_bitmatrix_encode(int k, int m, int w, int *bitmatrix,
1358 	                            char **data_ptrs, char **coding_ptrs, int size, int packetsize)
1359 	{
1360 	  int i;
1361 	
1362 	  if (packetsize%sizeof(long) != 0) {
1363 	    fprintf(stderr, "jerasure_bitmatrix_encode - packetsize(%d) %c sizeof(long) != 0\n", packetsize, '%');
1364 	    assert(0);
1365 	  }
1366 	  if (size%(packetsize*w) != 0) {
1367 	    fprintf(stderr, "jerasure_bitmatrix_encode - size(%d) %c (packetsize(%d)*w(%d))) != 0\n", 
1368 	         size, '%', packetsize, w);
1369 	    assert(0);
1370 	  }
1371 	
1372 	  for (i = 0; i < m; i++) {
1373 	    jerasure_bitmatrix_dotprod(k, w, bitmatrix+i*k*w*w, NULL, k+i, data_ptrs, coding_ptrs, size, packetsize);
1374 	  }
1375 	}
1376 	
1377 	/*
1378 	 * Exported function for use by autoconf to perform quick 
1379 	 * spot-check.
1380 	 */
1381 	int jerasure_autoconf_test()
1382 	{
1383 	  int x = galois_single_multiply(1, 2, 8);
1384 	  if (x != 2) {
1385 	    return -1;
1386 	  }
1387 	  return 0;
1388 	}
1389 	
1390