// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2018 Indian Institute of Science <office.ece@iisc.ac.in>
 *
 * Author: Myna Vajha <mynaramana@gmail.com>
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 */

#include <errno.h>
#include <algorithm>

#include "ErasureCodeClay.h"

#include "common/debug.h"
#include "erasure-code/ErasureCodePlugin.h"
#include "include/ceph_assert.h"
#include "include/str_map.h"
#include "include/stringify.h"
#include "osd/osd_types.h"

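/*
 * Overview (informal): the Clay (coupled-layer) construction arranges the
 * k + m chunks, padded with nu virtual chunks so that k + m + nu is a
 * multiple of q = d - k + 1, on a grid of t columns of q nodes each; node
 * (x, y) has index y*q + x.  Every chunk is split into sub_chunk_no = q^t
 * sub-chunks, one per "plane" z, identified by its base-q digit vector
 * z_vec.  Encoding and decoding operate on "uncoupled" values (cached in
 * U_buf) that are related to the stored "coupled" values through a pairwise
 * transform, implemented here as a small (2,2) code (pft); the scalar MDS
 * code (mds) is applied plane by plane on the uncoupled values.
 */
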
#define dout_context g_ceph_context
#define dout_subsys ceph_subsys_osd
#undef dout_prefix
#define dout_prefix _prefix(_dout)

#define LARGEST_VECTOR_WORDSIZE 16
#define talloc(type, num) (type *) malloc(sizeof(type)*(num))

using namespace std;
using namespace ceph;

static ostream& _prefix(std::ostream* _dout)
{
  return *_dout << "ErasureCodeClay: ";
}

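// integer exponentiation by repeated squaring: returns a^x for x >= 0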
static int pow_int(int a, int x) {
  int power = 1;
  while (x) {
    if (x & 1) power *= a;
    x /= 2;
    a *= a;
  }
  return power;
}

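// Note: static analysis (Coverity) warns that the bufferlist calls below are
// not declared noexcept, so an exception thrown there would escape this
// (implicitly noexcept) destructor and terminate the process.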
ErasureCodeClay::~ErasureCodeClay()
{
  for (int i = 0; i < q*t; i++) {
    if (U_buf[i].length() != 0) U_buf[i].clear();
  }
}

int ErasureCodeClay::init(ErasureCodeProfile &profile,
			  ostream *ss)
{
  int r;
  r = parse(profile, ss);
  if (r)
    return r;

  r = ErasureCode::init(profile, ss);
  if (r)
    return r;
  ErasureCodePluginRegistry &registry = ErasureCodePluginRegistry::instance();
  r = registry.factory(mds.profile["plugin"],
		       directory,
		       mds.profile,
		       &mds.erasure_code,
		       ss);
  if (r)
    return r;
  r = registry.factory(pft.profile["plugin"],
		       directory,
		       pft.profile,
		       &pft.erasure_code,
		       ss);
  return r;
}

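// A chunk has to hold sub_chunk_no aligned sub-chunks, so the object size is
// rounded up to a multiple of sub_chunk_no * k * (the smallest chunk the
// underlying plugin can produce) before dividing by k.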
unsigned int ErasureCodeClay::get_chunk_size(unsigned int object_size) const
{
  unsigned int alignment_scalar_code = pft.erasure_code->get_chunk_size(1);
  unsigned int alignment = sub_chunk_no * k * alignment_scalar_code;

  return round_up_to(object_size, alignment) / k;
}

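// When a single chunk is lost and enough helpers are available, fall back to
// the repair-optimal path, which reads only a fraction of each helper chunk;
// otherwise use the generic minimum_to_decode.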
int ErasureCodeClay::minimum_to_decode(const set<int> &want_to_read,
				       const set<int> &available,
				       map<int, vector<pair<int, int>>> *minimum)
{
  if (is_repair(want_to_read, available)) {
    return minimum_to_repair(want_to_read, available, minimum);
  } else {
    return ErasureCode::minimum_to_decode(want_to_read, available, minimum);
  }
}

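// The repair path is taken only for single-chunk recovery where the helper
// buffers are partial reads (shorter than a full chunk); everything else goes
// through the generic decode path.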
int ErasureCodeClay::decode(const set<int> &want_to_read,
			    const map<int, bufferlist> &chunks,
			    map<int, bufferlist> *decoded, int chunk_size)
{
  set<int> avail;
  for ([[maybe_unused]] auto& [node, bl] : chunks) {
    avail.insert(node);
    (void)bl;  // silence -Wunused-variable
  }

  if (is_repair(want_to_read, avail) &&
      ((unsigned int)chunk_size > chunks.begin()->second.length())) {
    return repair(want_to_read, chunks, decoded, chunk_size);
  } else {
    return ErasureCode::_decode(want_to_read, chunks, decoded);
  }
}

void p(const set<int> &s) { cerr << s; } // for gdb

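// Encoding is expressed as a decode: treat the m parity chunks (shifted by nu
// to account for the virtual shortening chunks) as erasures and recover them
// from the k data chunks plus nu zeroed chunks.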
int ErasureCodeClay::encode_chunks(const set<int> &want_to_encode,
				   map<int, bufferlist> *encoded)
{
  map<int, bufferlist> chunks;
  set<int> parity_chunks;
  int chunk_size = (*encoded)[0].length();

  for (int i = 0; i < k + m; i++) {
    if (i < k) {
      chunks[i] = (*encoded)[i];
    } else {
      chunks[i+nu] = (*encoded)[i];
      parity_chunks.insert(i+nu);
    }
  }

  for (int i = k; i < k + nu; i++) {
    bufferptr buf(buffer::create_aligned(chunk_size, SIMD_ALIGN));
    buf.zero();
    chunks[i].push_back(std::move(buf));
  }

  int res = decode_layered(parity_chunks, &chunks);
  for (int i = k; i < k + nu; i++) {
    // release the temporary zero chunks added for shortening
    chunks[i].clear();
  }
  return res;
}

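// Full decode: map the missing chunk ids onto the grid (data ids stay put,
// parity ids are shifted by nu), add zeroed buffers for the nu virtual
// chunks, and run the layered decoder.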
int ErasureCodeClay::decode_chunks(const set<int> &want_to_read,
				   const map<int, bufferlist> &chunks,
				   map<int, bufferlist> *decoded)
{
  set<int> erasures;
  map<int, bufferlist> coded_chunks;

  for (int i = 0; i < k + m; i++) {
    if (chunks.count(i) == 0) {
      erasures.insert(i < k ? i : i+nu);
    }
    ceph_assert(decoded->count(i) > 0);
    coded_chunks[i < k ? i : i+nu] = (*decoded)[i];
  }
  int chunk_size = coded_chunks[0].length();

  for (int i = k; i < k+nu; i++) {
    bufferptr buf(buffer::create_aligned(chunk_size, SIMD_ALIGN));
    buf.zero();
    coded_chunks[i].push_back(std::move(buf));
  }

  int res = decode_layered(erasures, &coded_chunks);
  for (int i = k; i < k+nu; i++) {
    coded_chunks[i].clear();
  }
  return res;
}

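// Parses k, m, d, scalar_mds and technique from the profile and derives the
// Clay parameters: q = d - k + 1, nu (padding so that k + m + nu is a
// multiple of q), t = (k + m + nu) / q and sub_chunk_no = q^t.  It also
// prepares the profiles of the two helper codes: mds (k+nu, m) and pft (2, 2).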
int ErasureCodeClay::parse(ErasureCodeProfile &profile,
			   ostream *ss)
{
  int err = 0;
  err = ErasureCode::parse(profile, ss);
  err |= to_int("k", profile, &k, DEFAULT_K, ss);
  err |= to_int("m", profile, &m, DEFAULT_M, ss);

  err |= sanity_check_k_m(k, m, ss);

  err |= to_int("d", profile, &d, std::to_string(k+m-1), ss);

  // check for scalar_mds in profile input
  if (profile.find("scalar_mds") == profile.end() ||
      profile.find("scalar_mds")->second.empty()) {
    mds.profile["plugin"] = "jerasure";
    pft.profile["plugin"] = "jerasure";
  } else {
    std::string p = profile.find("scalar_mds")->second;
    if ((p == "jerasure") || (p == "isa") || (p == "shec")) {
      mds.profile["plugin"] = p;
      pft.profile["plugin"] = p;
    } else {
      *ss << "scalar_mds " << p <<
             " is not currently supported, use one of 'jerasure',"<<
             " 'isa', 'shec'" << std::endl;
      err = -EINVAL;
      return err;
    }
  }

  if (profile.find("technique") == profile.end() ||
      profile.find("technique")->second.empty()) {
    if ((mds.profile["plugin"] == "jerasure") || (mds.profile["plugin"] == "isa")) {
      mds.profile["technique"] = "reed_sol_van";
      pft.profile["technique"] = "reed_sol_van";
    } else {
      mds.profile["technique"] = "single";
      pft.profile["technique"] = "single";
    }
  } else {
    std::string p = profile.find("technique")->second;
    if (mds.profile["plugin"] == "jerasure") {
      if ((p == "reed_sol_van") || (p == "reed_sol_r6_op") || (p == "cauchy_orig")
          || (p == "cauchy_good") || (p == "liber8tion")) {
        mds.profile["technique"] = p;
        pft.profile["technique"] = p;
      } else {
        *ss << "technique " << p << " is not currently supported, use one of "
	    << "'reed_sol_van', 'reed_sol_r6_op', 'cauchy_orig', "
	    << "'cauchy_good', 'liber8tion'" << std::endl;
        err = -EINVAL;
        return err;
      }
    } else if (mds.profile["plugin"] == "isa") {
      if ((p == "reed_sol_van") || (p == "cauchy")) {
        mds.profile["technique"] = p;
        pft.profile["technique"] = p;
      } else {
        *ss << "technique " << p << " is not currently supported, use one of "
	    << "'reed_sol_van', 'cauchy'" << std::endl;
        err = -EINVAL;
        return err;
      }
    } else {
      if ((p == "single") || (p == "multiple")) {
        mds.profile["technique"] = p;
        pft.profile["technique"] = p;
      } else {
        *ss << "technique " << p << " is not currently supported, use one of "
            << "'single', 'multiple'" << std::endl;
        err = -EINVAL;
        return err;
      }
    }
  }
  if ((d < k) || (d > k + m - 1)) {
    *ss << "value of d " << d
        << " must be within [" << k << ", " << k+m-1 << "]" << std::endl;
    err = -EINVAL;
    return err;
  }

  q = d - k + 1;
  if ((k + m) % q) {
    nu = q - (k + m) % q;
  } else {
    nu = 0;
  }

  // cap the total number of nodes (k + m + nu); the scalar MDS layer runs
  // with w=8, so the node count must stay within its symbol limit
  if (k+m+nu > 254) {
    err = -EINVAL;
    return err;
  }

  if (mds.profile["plugin"] == "shec") {
    mds.profile["c"] = '2';
    pft.profile["c"] = '2';
  }
  mds.profile["k"] = std::to_string(k+nu);
  mds.profile["m"] = std::to_string(m);
  mds.profile["w"] = '8';

  pft.profile["k"] = '2';
  pft.profile["m"] = '2';
  pft.profile["w"] = '8';

  t = (k + m + nu) / q;
  sub_chunk_no = pow_int(q, t);

  dout(10) << __func__
	   << " (q,t,nu)=(" << q << "," << t << "," << nu << ")" << dendl;

  return err;
}

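// Returns 1 only when the repair-optimal path applies: exactly one chunk is
// wanted and missing, all surviving chunks of its column group are available,
// and at least d helpers are available overall.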
int ErasureCodeClay::is_repair(const set<int> &want_to_read,
			       const set<int> &available_chunks) {

  if (includes(available_chunks.begin(), available_chunks.end(),
               want_to_read.begin(), want_to_read.end())) return 0;
  if (want_to_read.size() > 1) return 0;

  int i = *want_to_read.begin();
  int lost_node_id = (i < k) ? i : i+nu;
  for (int x = 0; x < q; x++) {
    int node = (lost_node_id/q)*q+x;
    node = (node < k) ? node : node-nu;
    if (node != i) { // node in the same group other than erased node
      if (available_chunks.count(node) == 0) return 0;
    }
  }

  if (available_chunks.size() < (unsigned)d) return 0;
  return 1;
}

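// For a single lost chunk, the helper set is: every other chunk in the lost
// chunk's column, plus enough of the remaining available chunks to reach d
// helpers in total; each helper contributes the same list of sub-chunk
// ranges (the repair planes of the lost chunk).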
int ErasureCodeClay::minimum_to_repair(const set<int> &want_to_read,
				       const set<int> &available_chunks,
				       map<int, vector<pair<int, int>>> *minimum)
{
  int i = *want_to_read.begin();
  int lost_node_index = (i < k) ? i : i+nu;
  int rep_node_index = 0;

  // add all the nodes in lost node's y column.
  vector<pair<int, int>> sub_chunk_ind;
  get_repair_subchunks(lost_node_index, sub_chunk_ind);
  if ((available_chunks.size() >= (unsigned)d)) {
    for (int j = 0; j < q; j++) {
      if (j != lost_node_index%q) {
        rep_node_index = (lost_node_index/q)*q+j;
        if (rep_node_index < k) {
          minimum->insert(make_pair(rep_node_index, sub_chunk_ind));
        } else if (rep_node_index >= k+nu) {
          minimum->insert(make_pair(rep_node_index-nu, sub_chunk_ind));
        }
      }
    }
    for (auto chunk : available_chunks) {
      if (minimum->size() >= (unsigned)d) {
        break;
      }
      if (!minimum->count(chunk)) {
        minimum->emplace(chunk, sub_chunk_ind);
      }
    }
  } else {
    dout(0) << "minimum_to_repair: shouldn't have come here" << dendl;
    ceph_assert(0);
  }
  ceph_assert(minimum->size() == (unsigned)d);
  return 0;
}

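// The repair planes of a lost node (x_lost, y_lost) are exactly the planes z
// whose y_lost-th base-q digit equals x_lost.  Since that digit carries
// weight q^(t-1-y_lost), they form q^(y_lost) runs of q^(t-1-y_lost)
// consecutive indices.  For example (q=2, t=2), losing node 1 (x=1, y=0)
// selects the single run {2, 3}.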
void ErasureCodeClay::get_repair_subchunks(const int &lost_node,
					   vector<pair<int, int>> &repair_sub_chunks_ind)
{
  const int y_lost = lost_node / q;
  const int x_lost = lost_node % q;

  const int seq_sc_count = pow_int(q, t-1-y_lost);
  const int num_seq = pow_int(q, y_lost);

  int index = x_lost * seq_sc_count;
  for (int ind_seq = 0; ind_seq < num_seq; ind_seq++) {
    repair_sub_chunks_ind.push_back(make_pair(index, seq_sc_count));
    index += q * seq_sc_count;
  }
}

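// Counts the planes needed to repair want_to_read: out of the q^t planes, a
// plane is needed unless, in every column, its digit avoids all the wanted
// nodes of that column; hence sub_chunk_no minus the product of
// (q - weight[y]) over the t columns.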
int ErasureCodeClay::get_repair_sub_chunk_count(const set<int> &want_to_read)
{
  int weight_vector[t];
  std::fill(weight_vector, weight_vector + t, 0);
  for (auto to_read : want_to_read) {
    weight_vector[to_read / q]++;
  }

  int repair_subchunks_count = 1;
  for (int y = 0; y < t; y++) {
    repair_subchunks_count = repair_subchunks_count*(q-weight_vector[y]);
  }

  return sub_chunk_no - repair_subchunks_count;
}

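// Repair of a single lost chunk from d partial helper reads.  Helpers are
// re-indexed onto the grid, chunks that were neither read nor lost become
// "aloof" nodes, zero buffers are added for the nu virtual chunks, and the
// actual work happens in repair_one_lost_chunk().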
int ErasureCodeClay::repair(const set<int> &want_to_read,
			    const map<int, bufferlist> &chunks,
			    map<int, bufferlist> *repaired, int chunk_size)
{
  ceph_assert((want_to_read.size() == 1) && (chunks.size() == (unsigned)d));

  int repair_sub_chunk_no = get_repair_sub_chunk_count(want_to_read);
  vector<pair<int, int>> repair_sub_chunks_ind;

  unsigned repair_blocksize = chunks.begin()->second.length();
  ceph_assert(repair_blocksize%repair_sub_chunk_no == 0);

  unsigned sub_chunksize = repair_blocksize/repair_sub_chunk_no;
  unsigned chunksize = sub_chunk_no*sub_chunksize;

  ceph_assert(chunksize == (unsigned)chunk_size);

  map<int, bufferlist> recovered_data;
  map<int, bufferlist> helper_data;
  set<int> aloof_nodes;

  for (int i = 0; i < k + m; i++) {
    // include helper data only for the d + nu helper nodes.
    if (auto found = chunks.find(i); found != chunks.end()) { // i is a helper
      if (i < k) {
        helper_data[i] = found->second;
      } else {
        helper_data[i+nu] = found->second;
      }
    } else {
      if (i != *want_to_read.begin()) { // aloof node case.
        int aloof_node_id = (i < k) ? i : i+nu;
        aloof_nodes.insert(aloof_node_id);
      } else {
        bufferptr ptr(buffer::create_aligned(chunksize, SIMD_ALIGN));
        ptr.zero();
        int lost_node_id = (i < k) ? i : i+nu;
        (*repaired)[i].push_back(ptr);
        recovered_data[lost_node_id] = (*repaired)[i];
        get_repair_subchunks(lost_node_id, repair_sub_chunks_ind);
      }
    }
  }

  // this is for shortened codes i.e., when nu > 0
  for (int i = k; i < k+nu; i++) {
    bufferptr ptr(buffer::create_aligned(repair_blocksize, SIMD_ALIGN));
    ptr.zero();
    helper_data[i].push_back(ptr);
  }

  ceph_assert(helper_data.size()+aloof_nodes.size()+recovered_data.size() ==
	      (unsigned) q*t);

  int r = repair_one_lost_chunk(recovered_data, aloof_nodes,
				helper_data, repair_blocksize,
				repair_sub_chunks_ind);

  // clear buffers created for the purpose of shortening
  for (int i = k; i < k+nu; i++) {
    helper_data[i].clear();
  }

  return r;
}

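// Core of the repair algorithm (a sketch of the coupled-layer recovery):
//  1. group the repair planes by their "order", i.e. how many erased/aloof
//     nodes are dots (x == z_vec[y]) in that plane;
//  2. for increasing order, compute the uncoupled value U of every intact
//     node in the plane (via the pairwise transform pft, or a plain copy for
//     dot nodes), run the scalar MDS decode on the plane, and finally turn
//     the recovered uncoupled values back into the lost coupled sub-chunks.
// The i0..i3 permutation just orders the pair (node, companion) consistently
// for the 4-chunk pft code.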
int ErasureCodeClay::repair_one_lost_chunk(map<int, bufferlist> &recovered_data,
					   set<int> &aloof_nodes,
					   map<int, bufferlist> &helper_data,
					   int repair_blocksize,
					   vector<pair<int,int>> &repair_sub_chunks_ind)
{
  unsigned repair_subchunks = (unsigned)sub_chunk_no / q;
  unsigned sub_chunksize = repair_blocksize / repair_subchunks;

  int z_vec[t];
  map<int, set<int> > ordered_planes;
  map<int, int> repair_plane_to_ind;
  int count_retrieved_sub_chunks = 0;
  int plane_ind = 0;

  bufferptr buf(buffer::create_aligned(sub_chunksize, SIMD_ALIGN));
  bufferlist temp_buf;
  temp_buf.push_back(buf);

  for (auto [index, count] : repair_sub_chunks_ind) {
    for (int j = index; j < index + count; j++) {
      get_plane_vector(j, z_vec);
      int order = 0;
      // check across all erasures and aloof nodes
      for ([[maybe_unused]] auto& [node, bl] : recovered_data) {
        if (node % q == z_vec[node / q]) order++;
        (void)bl;  // silence -Wunused-variable
      }
      for (auto node : aloof_nodes) {
        if (node % q == z_vec[node / q]) order++;
      }
      ceph_assert(order > 0);
      ordered_planes[order].insert(j);
      // keep track of the sub-chunk's position within the received helper buffer
      repair_plane_to_ind[j] = plane_ind;
      plane_ind++;
    }
  }
  ceph_assert((unsigned)plane_ind == repair_subchunks);

  int plane_count = 0;
  for (int i = 0; i < q*t; i++) {
    if (U_buf[i].length() == 0) {
      bufferptr buf(buffer::create_aligned(sub_chunk_no*sub_chunksize, SIMD_ALIGN));
      buf.zero();
      U_buf[i].push_back(std::move(buf));
    }
  }

  int lost_chunk;
  int count = 0;
  for ([[maybe_unused]] auto& [node, bl] : recovered_data) {
    lost_chunk = node;
    count++;
    (void)bl;  // silence -Wunused-variable
  }
  ceph_assert(count == 1);

  set<int> erasures;
  for (int i = 0; i < q; i++) {
    erasures.insert(lost_chunk - lost_chunk % q + i);
  }
  for (auto node : aloof_nodes) {
    erasures.insert(node);
  }

  for (int order = 1; ; order++) {
    if (ordered_planes.count(order) == 0) {
      break;
    }
    plane_count += ordered_planes[order].size();
    for (auto z : ordered_planes[order]) {
      get_plane_vector(z, z_vec);

      for (int y = 0; y < t; y++) {
        for (int x = 0; x < q; x++) {
          int node_xy = y*q + x;
          map<int, bufferlist> known_subchunks;
          map<int, bufferlist> pftsubchunks;
          set<int> pft_erasures;
          if (erasures.count(node_xy) == 0) {
            ceph_assert(helper_data.count(node_xy) > 0);
            int z_sw = z + (x - z_vec[y])*pow_int(q,t-1-y);
            int node_sw = y*q + z_vec[y];
            int i0 = 0, i1 = 1, i2 = 2, i3 = 3;
            if (z_vec[y] > x) {
              i0 = 1;
              i1 = 0;
              i2 = 3;
              i3 = 2;
            }
            if (aloof_nodes.count(node_sw) > 0) {
              ceph_assert(repair_plane_to_ind.count(z) > 0);
              ceph_assert(repair_plane_to_ind.count(z_sw) > 0);
              pft_erasures.insert(i2);
              known_subchunks[i0].substr_of(helper_data[node_xy], repair_plane_to_ind[z]*sub_chunksize, sub_chunksize);
              known_subchunks[i3].substr_of(U_buf[node_sw], z_sw*sub_chunksize, sub_chunksize);
              pftsubchunks[i0] = known_subchunks[i0];
              pftsubchunks[i1] = temp_buf;
              pftsubchunks[i2].substr_of(U_buf[node_xy], z*sub_chunksize, sub_chunksize);
              pftsubchunks[i3] = known_subchunks[i3];
              for (int i=0; i<3; i++) {
                pftsubchunks[i].rebuild_aligned(SIMD_ALIGN);
              }
              pft.erasure_code->decode_chunks(pft_erasures, known_subchunks, &pftsubchunks);
            } else {
              ceph_assert(helper_data.count(node_sw) > 0);
              ceph_assert(repair_plane_to_ind.count(z) > 0);
              if (z_vec[y] != x) {
                pft_erasures.insert(i2);
                ceph_assert(repair_plane_to_ind.count(z_sw) > 0);
                known_subchunks[i0].substr_of(helper_data[node_xy], repair_plane_to_ind[z]*sub_chunksize, sub_chunksize);
                known_subchunks[i1].substr_of(helper_data[node_sw], repair_plane_to_ind[z_sw]*sub_chunksize, sub_chunksize);
                pftsubchunks[i0] = known_subchunks[i0];
                pftsubchunks[i1] = known_subchunks[i1];
                pftsubchunks[i2].substr_of(U_buf[node_xy], z*sub_chunksize, sub_chunksize);
                pftsubchunks[i3].substr_of(temp_buf, 0, sub_chunksize);
                for (int i=0; i<3; i++) {
                  pftsubchunks[i].rebuild_aligned(SIMD_ALIGN);
                }
                pft.erasure_code->decode_chunks(pft_erasures, known_subchunks, &pftsubchunks);
              } else {
                char* uncoupled_chunk = U_buf[node_xy].c_str();
                char* coupled_chunk = helper_data[node_xy].c_str();
                memcpy(&uncoupled_chunk[z*sub_chunksize],
                       &coupled_chunk[repair_plane_to_ind[z]*sub_chunksize],
                       sub_chunksize);
              }
            }
          }
        } // x
      } // y
      ceph_assert(erasures.size() <= (unsigned)m);
      decode_uncoupled(erasures, z, sub_chunksize);

      for (auto i : erasures) {
        int x = i % q;
        int y = i / q;
        int node_sw = y*q+z_vec[y];
        int z_sw = z + (x - z_vec[y]) * pow_int(q,t-1-y);
        set<int> pft_erasures;
        map<int, bufferlist> known_subchunks;
        map<int, bufferlist> pftsubchunks;
        int i0 = 0, i1 = 1, i2 = 2, i3 = 3;
        if (z_vec[y] > x) {
          i0 = 1;
          i1 = 0;
          i2 = 3;
          i3 = 2;
        }
        // make sure it is not an aloof node before you retrieve recovered_data
        if (aloof_nodes.count(i) == 0) {
          if (x == z_vec[y]) { // hole-dot pair (type 0)
            char* coupled_chunk = recovered_data[i].c_str();
            char* uncoupled_chunk = U_buf[i].c_str();
            memcpy(&coupled_chunk[z*sub_chunksize],
                   &uncoupled_chunk[z*sub_chunksize],
                   sub_chunksize);
            count_retrieved_sub_chunks++;
          } else {
            ceph_assert(y == lost_chunk / q);
            ceph_assert(node_sw == lost_chunk);
            ceph_assert(helper_data.count(i) > 0);
            pft_erasures.insert(i1);
            known_subchunks[i0].substr_of(helper_data[i], repair_plane_to_ind[z]*sub_chunksize, sub_chunksize);
            known_subchunks[i2].substr_of(U_buf[i], z*sub_chunksize, sub_chunksize);

            pftsubchunks[i0] = known_subchunks[i0];
            pftsubchunks[i1].substr_of(recovered_data[node_sw], z_sw*sub_chunksize, sub_chunksize);
            pftsubchunks[i2] = known_subchunks[i2];
            pftsubchunks[i3] = temp_buf;
            for (int i=0; i<3; i++) {
              pftsubchunks[i].rebuild_aligned(SIMD_ALIGN);
            }
            pft.erasure_code->decode_chunks(pft_erasures, known_subchunks, &pftsubchunks);
          }
        }
      } // recover all erasures
    } // planes of particular order
  } // order

  return 0;
}


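// Layered decoder used both for encoding and for full decode: pad the
// erasure set up to exactly m chunks (preferring the virtual/parity indices),
// then process the planes in increasing intersection-score order, first
// recovering the uncoupled values of each plane and then converting them
// back into coupled chunk data.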
int ErasureCodeClay::decode_layered(set<int> &erased_chunks,
                                    map<int, bufferlist> *chunks)
{
  int num_erasures = erased_chunks.size();

  int size = (*chunks)[0].length();
  ceph_assert(size%sub_chunk_no == 0);
  int sc_size = size / sub_chunk_no;

  ceph_assert(num_erasures > 0);

  for (int i = k+nu; (num_erasures < m) && (i < q*t); i++) {
    if ([[maybe_unused]] auto [it, added] = erased_chunks.emplace(i); added) {
      num_erasures++;
      (void)it;  // silence -Wunused-variable
    }
  }
  ceph_assert(num_erasures == m);

  int max_iscore = get_max_iscore(erased_chunks);
  int order[sub_chunk_no];
  int z_vec[t];
  for (int i = 0; i < q*t; i++) {
    if (U_buf[i].length() == 0) {
      bufferptr buf(buffer::create_aligned(size, SIMD_ALIGN));
      buf.zero();
      U_buf[i].push_back(std::move(buf));
    }
  }

  set_planes_sequential_decoding_order(order, erased_chunks);

  for (int iscore = 0; iscore <= max_iscore; iscore++) {
    for (int z = 0; z < sub_chunk_no; z++) {
      if (order[z] == iscore) {
        decode_erasures(erased_chunks, z, chunks, sc_size);
      }
    }

    for (int z = 0; z < sub_chunk_no; z++) {
      if (order[z] == iscore) {
        get_plane_vector(z, z_vec);
        for (auto node_xy : erased_chunks) {
          int x = node_xy % q;
          int y = node_xy / q;
          int node_sw = y*q+z_vec[y];
          if (z_vec[y] != x) {
            if (erased_chunks.count(node_sw) == 0) {
              recover_type1_erasure(chunks, x, y, z, z_vec, sc_size);
            } else if (z_vec[y] < x) {
              ceph_assert(erased_chunks.count(node_sw) > 0);
              ceph_assert(z_vec[y] != x);
              get_coupled_from_uncoupled(chunks, x, y, z, z_vec, sc_size);
            }
          } else {
            char* C = (*chunks)[node_xy].c_str();
            char* U = U_buf[node_xy].c_str();
            memcpy(&C[z*sc_size], &U[z*sc_size], sc_size);
          }
        }
      }
    } // plane
  } // iscore, order

  return 0;
}

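// Recovers the uncoupled values of plane z: compute U for every intact node
// (each coupled pair is transformed once; dot nodes are copied directly),
// then decode the plane with the scalar MDS code.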
int ErasureCodeClay::decode_erasures(const set<int>& erased_chunks, int z,
				     map<int, bufferlist>* chunks, int sc_size)
{
  int z_vec[t];

  get_plane_vector(z, z_vec);

  for (int x = 0; x < q; x++) {
    for (int y = 0; y < t; y++) {
      int node_xy = q*y+x;
      int node_sw = q*y+z_vec[y];
      if (erased_chunks.count(node_xy) == 0) {
        if (z_vec[y] < x) {
          get_uncoupled_from_coupled(chunks, x, y, z, z_vec, sc_size);
        } else if (z_vec[y] == x) {
          char* uncoupled_chunk = U_buf[node_xy].c_str();
          char* coupled_chunk = (*chunks)[node_xy].c_str();
          memcpy(&uncoupled_chunk[z*sc_size], &coupled_chunk[z*sc_size], sc_size);
        } else {
          if (erased_chunks.count(node_sw) > 0) {
            get_uncoupled_from_coupled(chunks, x, y, z, z_vec, sc_size);
          }
        }
      }
    }
  }
  return decode_uncoupled(erased_chunks, z, sc_size);
}

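// Plane-level MDS decode: take the z-th sub-chunk of every U buffer and let
// the scalar code recover the erased ones in place.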
int ErasureCodeClay::decode_uncoupled(const set<int>& erased_chunks, int z, int sc_size)
{
  map<int, bufferlist> known_subchunks;
  map<int, bufferlist> all_subchunks;

  for (int i = 0; i < q*t; i++) {
    if (erased_chunks.count(i) == 0) {
      known_subchunks[i].substr_of(U_buf[i], z*sc_size, sc_size);
      all_subchunks[i] = known_subchunks[i];
    } else {
      all_subchunks[i].substr_of(U_buf[i], z*sc_size, sc_size);
    }
    all_subchunks[i].rebuild_aligned_size_and_memory(sc_size, SIMD_ALIGN);
    ceph_assert(all_subchunks[i].is_contiguous());
  }

  mds.erasure_code->decode_chunks(erased_chunks, known_subchunks, &all_subchunks);
  return 0;
}

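// order[z] is the intersection score of plane z: the number of erased chunks
// whose grid position is a dot in that plane (x == z_vec[y]).  Planes are
// later decoded in increasing score order.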
void ErasureCodeClay::set_planes_sequential_decoding_order(int* order, set<int>& erasures) {
  int z_vec[t];
  for (int z = 0; z < sub_chunk_no; z++) {
    get_plane_vector(z, z_vec);
    order[z] = 0;
    for (auto i : erasures) {
      if (i % q == z_vec[i / q]) {
        order[z] = order[z] + 1;
      }
    }
  }
}

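// Rebuilds the coupled sub-chunk of an erased node (x, y) in plane z from its
// intact companion's coupled value and its own already-known uncoupled value,
// using the pairwise (2,2) code with only the coupled value marked erased.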
void ErasureCodeClay::recover_type1_erasure(map<int, bufferlist>* chunks,
					    int x, int y, int z,
					    int* z_vec, int sc_size)
{
  set<int> erased_chunks;

  int node_xy = y*q+x;
  int node_sw = y*q+z_vec[y];
  int z_sw = z + (x - z_vec[y]) * pow_int(q,t-1-y);

  map<int, bufferlist> known_subchunks;
  map<int, bufferlist> pftsubchunks;
  bufferptr ptr(buffer::create_aligned(sc_size, SIMD_ALIGN));
  ptr.zero();

  int i0 = 0, i1 = 1, i2 = 2, i3 = 3;
  if (z_vec[y] > x) {
    i0 = 1;
    i1 = 0;
    i2 = 3;
    i3 = 2;
  }

  erased_chunks.insert(i0);
  pftsubchunks[i0].substr_of((*chunks)[node_xy], z * sc_size, sc_size);
  known_subchunks[i1].substr_of((*chunks)[node_sw], z_sw * sc_size, sc_size);
  known_subchunks[i2].substr_of(U_buf[node_xy], z * sc_size, sc_size);
  pftsubchunks[i1] = known_subchunks[i1];
  pftsubchunks[i2] = known_subchunks[i2];
  pftsubchunks[i3].push_back(ptr);

  for (int i=0; i<3; i++) {
    pftsubchunks[i].rebuild_aligned_size_and_memory(sc_size, SIMD_ALIGN);
  }

  pft.erasure_code->decode_chunks(erased_chunks, known_subchunks, &pftsubchunks);
}

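// Both coupled values of a pair are erased: recover them from the two known
// uncoupled values (positions 2 and 3 of the pft code).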
void ErasureCodeClay::get_coupled_from_uncoupled(map<int, bufferlist>* chunks,
						 int x, int y, int z,
						 int* z_vec, int sc_size)
{
  set<int> erased_chunks = {0, 1};

  int node_xy = y*q+x;
  int node_sw = y*q+z_vec[y];
  int z_sw = z + (x - z_vec[y]) * pow_int(q,t-1-y);

  ceph_assert(z_vec[y] < x);
  map<int, bufferlist> uncoupled_subchunks;
  uncoupled_subchunks[2].substr_of(U_buf[node_xy], z * sc_size, sc_size);
  uncoupled_subchunks[3].substr_of(U_buf[node_sw], z_sw * sc_size, sc_size);

  map<int, bufferlist> pftsubchunks;
  pftsubchunks[0].substr_of((*chunks)[node_xy], z * sc_size, sc_size);
  pftsubchunks[1].substr_of((*chunks)[node_sw], z_sw * sc_size, sc_size);
  pftsubchunks[2] = uncoupled_subchunks[2];
  pftsubchunks[3] = uncoupled_subchunks[3];

  for (int i=0; i<3; i++) {
    pftsubchunks[i].rebuild_aligned_size_and_memory(sc_size, SIMD_ALIGN);
  }
  pft.erasure_code->decode_chunks(erased_chunks, uncoupled_subchunks, &pftsubchunks);
}

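// Inverse direction of the pairwise transform: both coupled values are known,
// recover the two uncoupled values (positions 2 and 3 marked erased).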
void ErasureCodeClay::get_uncoupled_from_coupled(map<int, bufferlist>* chunks,
						 int x, int y, int z,
						 int* z_vec, int sc_size)
{
  set<int> erased_chunks = {2, 3};

  int node_xy = y*q+x;
  int node_sw = y*q+z_vec[y];
  int z_sw = z + (x - z_vec[y]) * pow_int(q,t-1-y);

  int i0 = 0, i1 = 1, i2 = 2, i3 = 3;
  if (z_vec[y] > x) {
    i0 = 1;
    i1 = 0;
    i2 = 3;
    i3 = 2;
  }
  map<int, bufferlist> coupled_subchunks;
  coupled_subchunks[i0].substr_of((*chunks)[node_xy], z * sc_size, sc_size);
  coupled_subchunks[i1].substr_of((*chunks)[node_sw], z_sw * sc_size, sc_size);

  map<int, bufferlist> pftsubchunks;
  pftsubchunks[0] = coupled_subchunks[0];
  pftsubchunks[1] = coupled_subchunks[1];
  pftsubchunks[i2].substr_of(U_buf[node_xy], z * sc_size, sc_size);
  pftsubchunks[i3].substr_of(U_buf[node_sw], z_sw * sc_size, sc_size);
  for (int i=0; i<3; i++) {
    pftsubchunks[i].rebuild_aligned_size_and_memory(sc_size, SIMD_ALIGN);
  }
  pft.erasure_code->decode_chunks(erased_chunks, coupled_subchunks, &pftsubchunks);
}

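// Maximum possible intersection score: the number of distinct columns that
// contain at least one erased chunk.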
int ErasureCodeClay::get_max_iscore(set<int>& erased_chunks)
{
  int weight_vec[t];
  int iscore = 0;
  memset(weight_vec, 0, sizeof(int)*t);

  for (auto i : erased_chunks) {
    if (weight_vec[i / q] == 0) {
      weight_vec[i / q] = 1;
      iscore++;
    }
  }
  return iscore;
}

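// Writes the base-q digits of z into z_vec, most significant digit first, so
// that z_vec[y] carries weight q^(t-1-y).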
void ErasureCodeClay::get_plane_vector(int z, int* z_vec)
{
  for (int i = 0; i < t; i++) {
    z_vec[t-1-i] = z % q;
    z = (z - z_vec[t-1-i]) / q;
  }
}