1    	/*
2    	 * Ceph - scalable distributed file system
3    	 *
4    	 * Copyright (C) 2015 Intel Corporation All Rights Reserved
5    	 *
6    	 * This is free software; you can redistribute it and/or
7    	 * modify it under the terms of the GNU Lesser General Public
8    	 * License version 2.1, as published by the Free Software
9    	 * Foundation.  See file COPYING.
10   	 *
11   	 */
12   	
13   	#ifdef __KERNEL__
14   	# include <linux/string.h>
15   	# include <linux/slab.h>
16   	# include <linux/bug.h>
17   	# include <linux/kernel.h>
18   	# include <linux/crush/crush.h>
19   	# include <linux/crush/hash.h>
20   	#else
21   	# include "crush_compat.h"
22   	# include "crush.h"
23   	# include "hash.h"
24   	#endif
25   	#include "crush_ln_table.h"
26   	#include "mapper.h"
27   	
28   	#define dprintk(args...) /* printf(args) */
29   	
30   	/*
31   	 * Implement the core CRUSH mapping algorithm.
32   	 */
33   	
34   	/**
35   	 * crush_find_rule - find a crush_rule id for a given ruleset, type, and size.
36   	 * @map: the crush_map
37   	 * @ruleset: the storage ruleset id (user defined)
38   	 * @type: storage ruleset type (user defined)
39   	 * @size: output set size
40   	 */
41   	int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size)
42   	{
43   		__u32 i;
44   	
45   		for (i = 0; i < map->max_rules; i++) {
46   			if (map->rules[i] &&
47   			    map->rules[i]->mask.ruleset == ruleset &&
48   			    map->rules[i]->mask.type == type &&
49   			    map->rules[i]->mask.min_size <= size &&
50   			    map->rules[i]->mask.max_size >= size)
51   				return i;
52   		}
53   		return -1;
54   	}
55   	
56   	/*
57   	 * bucket choose methods
58   	 *
59   	 * For each bucket algorithm, we have a "choose" method that, given a
60   	 * crush input @x and replica position (usually, position in output set) @r,
61   	 * will produce an item in the bucket.
62   	 */
63   	
64   	/*
65   	 * Choose based on a random permutation of the bucket.
66   	 *
67   	 * We used to use some prime number arithmetic to do this, but it
68   	 * wasn't very random, and had some other bad behaviors.  Instead, we
69   	 * calculate an actual random permutation of the bucket members.
70   	 * Since this is expensive, we optimize for the r=0 case, which
71   	 * captures the vast majority of calls.
72   	 */
73   	static int bucket_perm_choose(const struct crush_bucket *bucket,
74   				      struct crush_work_bucket *work,
75   				      int x, int r)
76   	{
77   		unsigned int pr = r % bucket->size;
78   		unsigned int i, s;
79   	
80   		/* start a new permutation if @x has changed */
81   		if (work->perm_x != (__u32)x || work->perm_n == 0) {
82   			dprintk("bucket %d new x=%d\n", bucket->id, x);
83   			work->perm_x = x;
84   	
85   			/* optimize common r=0 case */
86   			if (pr == 0) {
87   				s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
88   					bucket->size;
89   				work->perm[0] = s;
90   				work->perm_n = 0xffff;   /* magic value, see below */
91   				goto out;
92   			}
93   	
94   			for (i = 0; i < bucket->size; i++)
95   				work->perm[i] = i;
96   			work->perm_n = 0;
97   		} else if (work->perm_n == 0xffff) {
98   			/* clean up after the r=0 case above */
99   			for (i = 1; i < bucket->size; i++)
100  				work->perm[i] = i;
101  			work->perm[work->perm[0]] = 0;
102  			work->perm_n = 1;
103  		}
104  	
105  		/* calculate permutation up to pr */
106  		for (i = 0; i < work->perm_n; i++)
107  			dprintk(" perm_choose have %d: %d\n", i, work->perm[i]);
108  		while (work->perm_n <= pr) {
109  			unsigned int p = work->perm_n;
110  			/* no point in swapping the final entry */
111  			if (p < bucket->size - 1) {
112  				i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
113  					(bucket->size - p);
114  				if (i) {
115  					unsigned int t = work->perm[p + i];
116  					work->perm[p + i] = work->perm[p];
117  					work->perm[p] = t;
118  				}
119  				dprintk(" perm_choose swap %d with %d\n", p, p+i);
120  			}
121  			work->perm_n++;
122  		}
123  		for (i = 0; i < bucket->size; i++)
124  			dprintk(" perm_choose  %d: %d\n", i, work->perm[i]);
125  	
126  		s = work->perm[pr];
127  	out:
128  		dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
129  			bucket->size, x, r, pr, s);
130  		return bucket->items[s];
131  	}
132  	
133  	/* uniform */
134  	static int bucket_uniform_choose(const struct crush_bucket_uniform *bucket,
135  					 struct crush_work_bucket *work, int x, int r)
136  	{
137  		return bucket_perm_choose(&bucket->h, work, x, r);
138  	}
139  	
140  	/* list */
141  	static int bucket_list_choose(const struct crush_bucket_list *bucket,
142  				      int x, int r)
143  	{
144  		int i;
145  	
146  		for (i = bucket->h.size-1; i >= 0; i--) {
147  			__u64 w = crush_hash32_4(bucket->h.hash, x, bucket->h.items[i],
148  						 r, bucket->h.id);
149  			w &= 0xffff;
150  			dprintk("list_choose i=%d x=%d r=%d item %d weight %x "
151  				"sw %x rand %llx",
152  				i, x, r, bucket->h.items[i], bucket->item_weights[i],
153  				bucket->sum_weights[i], w);
154  			w *= bucket->sum_weights[i];
155  			w = w >> 16;
156  			/*dprintk(" scaled %llx\n", w);*/
157  			if (w < bucket->item_weights[i]) {
158  				return bucket->h.items[i];
159  			}
160  		}
161  	
162  		dprintk("bad list sums for bucket %d\n", bucket->h.id);
163  		return bucket->h.items[0];
164  	}
165  	
166  	
167  	/* (binary) tree */
168  	static int height(int n)
169  	{
(1) Event assign_zero_constant: Assigning: "h" = "0", which is zero.
Also see events: [return_zero_variable]
170  		int h = 0;
(2) Event cond_false: Condition "(n & 1) == 0", taking false branch.
171  		while ((n & 1) == 0) {
172  			h++;
173  			n = n >> 1;
(3) Event loop_end: Reached end of loop.
174  		}
(4) Event return_zero_variable: Explicitly returning zero variable "h".
Also see events: [assign_zero_constant]
175  		return h;
176  	}
177  	
178  	static int left(int x)
179  	{
(1) Event zero_return: Function call "height(x)" returns 0. [details]
(2) Event assignment: Assigning: "h" = "height(x)". The value of "h" is now 0.
Also see events: [negative_shift]
180  		int h = height(x);
(3) Event negative_shift: In expression "1 << h - 1", shifting by a negative amount has undefined behavior. The shift amount, "h - 1", is -1.
Also see events: [zero_return][assignment]
181  		return x - (1 << (h-1));
182  	}
183  	
184  	static int right(int x)
185  	{
186  		int h = height(x);
187  		return x + (1 << (h-1));
188  	}
189  	
190  	static int terminal(int x)
191  	{
192  		return x & 1;
193  	}
194  	
195  	static int bucket_tree_choose(const struct crush_bucket_tree *bucket,
196  				      int x, int r)
197  	{
198  		int n;
199  		__u32 w;
200  		__u64 t;
201  	
202  		/* start at root */
203  		n = bucket->num_nodes >> 1;
204  	
205  		while (!terminal(n)) {
206  			int l;
207  			/* pick point in [0, w) */
208  			w = bucket->node_weights[n];
209  			t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r,
210  						  bucket->h.id) * (__u64)w;
211  			t = t >> 32;
212  	
213  			/* descend to the left or right? */
214  			l = left(n);
215  			if (t < bucket->node_weights[l])
216  				n = l;
217  			else
218  				n = right(n);
219  		}
220  	
221  		return bucket->h.items[n >> 1];
222  	}
223  	
224  	
225  	/* straw */
226  	
227  	static int bucket_straw_choose(const struct crush_bucket_straw *bucket,
228  				       int x, int r)
229  	{
230  		__u32 i;
231  		int high = 0;
232  		__u64 high_draw = 0;
233  		__u64 draw;
234  	
235  		for (i = 0; i < bucket->h.size; i++) {
236  			draw = crush_hash32_3(bucket->h.hash, x, bucket->h.items[i], r);
237  			draw &= 0xffff;
238  			draw *= bucket->straws[i];
239  			if (i == 0 || draw > high_draw) {
240  				high = i;
241  				high_draw = draw;
242  			}
243  		}
244  		return bucket->h.items[high];
245  	}
246  	
247  	/* compute 2^44*log2(input+1) */
248  	static __u64 crush_ln(unsigned int xin)
249  	{
250  		unsigned int x = xin;
251  		int iexpon, index1, index2;
252  		__u64 RH, LH, LL, xl64, result;
253  	
254  		x++;
255  	
256  		/* normalize input */
257  		iexpon = 15;
258  	
259  		// figure out number of bits we need to shift and
260  		// do it in one step instead of iteratively
261  		if (!(x & 0x18000)) {
262  		  int bits = __builtin_clz(x & 0x1FFFF) - 16;
263  		  x <<= bits;
264  		  iexpon = 15 - bits;
265  		}
266  	
267  		index1 = (x >> 8) << 1;
268  		/* RH ~ 2^56/index1 */
269  		RH = __RH_LH_tbl[index1 - 256];
270  		/* LH ~ 2^48 * log2(index1/256) */
271  		LH = __RH_LH_tbl[index1 + 1 - 256];
272  	
273  		/* RH*x ~ 2^48 * (2^15 + xf), xf<2^8 */
274  		xl64 = (__s64)x * RH;
275  		xl64 >>= 48;
276  	
277  		result = iexpon;
278  		result <<= (12 + 32);
279  	
280  		index2 = xl64 & 0xff;
281  		/* LL ~ 2^48*log2(1.0+index2/2^15) */
282  		LL = __LL_tbl[index2];
283  	
284  		LH = LH + LL;
285  	
286  		LH >>= (48 - 12 - 32);
287  		result += LH;
288  	
289  		return result;
290  	}
291  	
292  	
293  	/*
294  	 * straw2
295  	 *
296  	 * Suppose we have two osds: osd.0 and osd.1, with weight 8 and 4 respectively, It means:
297  	 *   a). For osd.0, the time interval between each io request apply to exponential distribution 
298  	 *       with lamba equals 8
299  	 *   b). For osd.1, the time interval between each io request apply to exponential distribution 
300  	 *       with lamba equals 4
301  	 *   c). If we apply to each osd's exponential random variable, then the total pgs on each osd
302  	 *       is proportional to its weight.
303  	 *
304  	 * for reference, see:
305  	 *
306  	 * http://en.wikipedia.org/wiki/Exponential_distribution#Distribution_of_the_minimum_of_exponential_random_variables
307  	 */
308  	
309  	static inline __u32 *get_choose_arg_weights(const struct crush_bucket_straw2 *bucket,
310  	                                            const struct crush_choose_arg *arg,
311  	                                            int position)
312  	{
313  		if ((arg == NULL) || (arg->weight_set == NULL))
314  			return bucket->item_weights;
315  		if (position >= arg->weight_set_positions)
316  			position = arg->weight_set_positions - 1;
317  		return arg->weight_set[position].weights;
318  	}
319  	
320  	static inline __s32 *get_choose_arg_ids(const struct crush_bucket_straw2 *bucket,
321  						const struct crush_choose_arg *arg)
322  	{
323  		if ((arg == NULL) || (arg->ids == NULL))
324  			return bucket->h.items;
325  		return arg->ids;
326  	}
327  	
328  	/*
329  	 * Compute exponential random variable using inversion method.
330  	 *
331  	 * for reference, see the exponential distribution example at:  
332  	 * https://en.wikipedia.org/wiki/Inverse_transform_sampling#Examples
333  	 */
334  	static inline __s64 generate_exponential_distribution(int type, int x, int y, int z, 
335  	                                                      int weight)
336  	{
337  		unsigned int u = crush_hash32_3(type, x, y, z);
338  		u &= 0xffff;
339  	
340  		/*
341  		 * for some reason slightly less than 0x10000 produces
342  		 * a slightly more accurate distribution... probably a
343  		 * rounding effect.
344  		 *
345  		 * the natural log lookup table maps [0,0xffff]
346  		 * (corresponding to real numbers [1/0x10000, 1] to
347  		 * [0, 0xffffffffffff] (corresponding to real numbers
348  		 * [-11.090355,0]).
349  		 */
350  		__s64 ln = crush_ln(u) - 0x1000000000000ll;
351  	
352  		/*
353  		 * divide by 16.16 fixed-point weight.  note
354  		 * that the ln value is negative, so a larger
355  		 * weight means a larger (less negative) value
356  		 * for draw.
357  		 */
358  		return div64_s64(ln, weight);
359  	}
360  	
361  	static int bucket_straw2_choose(const struct crush_bucket_straw2 *bucket,
362  					int x, int r, const struct crush_choose_arg *arg,
363  	                                int position)
364  	{
365  		unsigned int i, high = 0;
366  		__s64 draw, high_draw = 0;
367  	        __u32 *weights = get_choose_arg_weights(bucket, arg, position);
368  	        __s32 *ids = get_choose_arg_ids(bucket, arg);
369  		for (i = 0; i < bucket->h.size; i++) {
370  	                dprintk("weight 0x%x item %d\n", weights[i], ids[i]);
371  			if (weights[i]) {
372  				draw = generate_exponential_distribution(bucket->h.hash, x, ids[i], r, weights[i]);
373  			} else {
374  				draw = S64_MIN;
375  			}
376  	
377  			if (i == 0 || draw > high_draw) {
378  				high = i;
379  				high_draw = draw;
380  			}
381  		}
382  	
383  		return bucket->h.items[high];
384  	}
385  	
386  	
387  	static int crush_bucket_choose(const struct crush_bucket *in,
388  				       struct crush_work_bucket *work,
389  				       int x, int r,
390  	                               const struct crush_choose_arg *arg,
391  	                               int position)
392  	{
393  		dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
394  		BUG_ON(in->size == 0);
395  		switch (in->alg) {
396  		case CRUSH_BUCKET_UNIFORM:
397  			return bucket_uniform_choose(
398  				(const struct crush_bucket_uniform *)in,
399  				work, x, r);
400  		case CRUSH_BUCKET_LIST:
401  			return bucket_list_choose((const struct crush_bucket_list *)in,
402  						  x, r);
403  		case CRUSH_BUCKET_TREE:
404  			return bucket_tree_choose((const struct crush_bucket_tree *)in,
405  						  x, r);
406  		case CRUSH_BUCKET_STRAW:
407  			return bucket_straw_choose(
408  				(const struct crush_bucket_straw *)in,
409  				x, r);
410  		case CRUSH_BUCKET_STRAW2:
411  			return bucket_straw2_choose(
412  				(const struct crush_bucket_straw2 *)in,
413  				x, r, arg, position);
414  		default:
415  			dprintk("unknown bucket %d alg %d\n", in->id, in->alg);
416  			return in->items[0];
417  		}
418  	}
419  	
420  	/*
421  	 * true if device is marked "out" (failed, fully offloaded)
422  	 * of the cluster
423  	 */
424  	static int is_out(const struct crush_map *map,
425  			  const __u32 *weight, int weight_max,
426  			  int item, int x)
427  	{
428  		if (item >= weight_max)
429  			return 1;
430  		if (weight[item] >= 0x10000)
431  			return 0;
432  		if (weight[item] == 0)
433  			return 1;
434  		if ((crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff)
435  		    < weight[item])
436  			return 0;
437  		return 1;
438  	}
439  	
440  	/**
441  	 * crush_choose_firstn - choose numrep distinct items of given type
442  	 * @map: the crush_map
443  	 * @bucket: the bucket we are choose an item from
444  	 * @x: crush input value
445  	 * @numrep: the number of items to choose
446  	 * @type: the type of item to choose
447  	 * @out: pointer to output vector
448  	 * @outpos: our position in that vector
449  	 * @out_size: size of the out vector
450  	 * @tries: number of attempts to make
451  	 * @recurse_tries: number of attempts to have recursive chooseleaf make
452  	 * @local_retries: localized retries
453  	 * @local_fallback_retries: localized fallback retries
454  	 * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose)
455  	 * @stable: stable mode starts rep=0 in the recursive call for all replicas
456  	 * @vary_r: pass r to recursive calls
457  	 * @out2: second output vector for leaf items (if @recurse_to_leaf)
458  	 * @parent_r: r value passed from the parent
459  	 */
460  	static int crush_choose_firstn(const struct crush_map *map,
461  				       struct crush_work *work,
462  				       const struct crush_bucket *bucket,
463  				       const __u32 *weight, int weight_max,
464  				       int x, int numrep, int type,
465  				       int *out, int outpos,
466  				       int out_size,
467  				       unsigned int tries,
468  				       unsigned int recurse_tries,
469  				       unsigned int local_retries,
470  				       unsigned int local_fallback_retries,
471  				       int recurse_to_leaf,
472  				       unsigned int vary_r,
473  				       unsigned int stable,
474  				       int *out2,
475  				       int parent_r,
476  	                               const struct crush_choose_arg *choose_args)
477  	{
478  		int rep;
479  		unsigned int ftotal, flocal;
480  		int retry_descent, retry_bucket, skip_rep;
481  		const struct crush_bucket *in = bucket;
482  		int r;
483  		int i;
484  		int item = 0;
485  		int itemtype;
486  		int collide, reject;
487  		int count = out_size;
488  	
489  		dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d \
490  	recurse_tries %d local_retries %d local_fallback_retries %d \
491  	parent_r %d stable %d\n",
492  			recurse_to_leaf ? "_LEAF" : "",
493  			bucket->id, x, outpos, numrep,
494  			tries, recurse_tries, local_retries, local_fallback_retries,
495  			parent_r, stable);
496  	
497  		for (rep = stable ? 0 : outpos; rep < numrep && count > 0 ; rep++) {
498  			/* keep trying until we get a non-out, non-colliding item */
499  			ftotal = 0;
500  			skip_rep = 0;
501  			do {
502  				retry_descent = 0;
503  				in = bucket;              /* initial bucket */
504  	
505  				/* choose through intervening buckets */
506  				flocal = 0;
507  				do {
508  					collide = 0;
509  					retry_bucket = 0;
510  					r = rep + parent_r;
511  					/* r' = r + f_total */
512  					r += ftotal;
513  	
514  					/* bucket choose */
515  					if (in->size == 0) {
516  						reject = 1;
517  						goto reject;
518  					}
519  					if (local_fallback_retries > 0 &&
520  					    flocal >= (in->size>>1) &&
521  					    flocal > local_fallback_retries)
522  						item = bucket_perm_choose(
523  							in, work->work[-1-in->id],
524  							x, r);
525  					else
526  						item = crush_bucket_choose(
527  							in, work->work[-1-in->id],
528  							x, r,
529  	                                                (choose_args ? &choose_args[-1-in->id] : 0),
530  	                                                outpos);
531  					if (item >= map->max_devices) {
532  						dprintk("   bad item %d\n", item);
533  						skip_rep = 1;
534  						break;
535  					}
536  	
537  					/* desired type? */
538  					if (item < 0)
539  						itemtype = map->buckets[-1-item]->type;
540  					else
541  						itemtype = 0;
542  					dprintk("  item %d type %d\n", item, itemtype);
543  	
544  					/* keep going? */
545  					if (itemtype != type) {
546  						if (item >= 0 ||
547  						    (-1-item) >= map->max_buckets) {
548  							dprintk("   bad item type %d\n", type);
549  							skip_rep = 1;
550  							break;
551  						}
552  						in = map->buckets[-1-item];
553  						retry_bucket = 1;
554  						continue;
555  					}
556  	
557  					/* collision? */
558  					for (i = 0; i < outpos; i++) {
559  						if (out[i] == item) {
560  							collide = 1;
561  							break;
562  						}
563  					}
564  	
565  					reject = 0;
566  					if (!collide && recurse_to_leaf) {
567  						if (item < 0) {
568  							int sub_r;
569  							if (vary_r)
570  								sub_r = r >> (vary_r-1);
571  							else
572  								sub_r = 0;
573  							if (crush_choose_firstn(
574  								    map,
575  								    work,
576  								    map->buckets[-1-item],
577  								    weight, weight_max,
578  								    x, stable ? 1 : outpos+1, 0,
579  								    out2, outpos, count,
580  								    recurse_tries, 0,
581  								    local_retries,
582  								    local_fallback_retries,
583  								    0,
584  								    vary_r,
585  								    stable,
586  								    NULL,
587  								    sub_r,
588  	                                                            choose_args) <= outpos)
589  								/* didn't get leaf */
590  								reject = 1;
591  						} else {
592  							/* we already have a leaf! */
593  							out2[outpos] = item;
594  			                        }
595  					}
596  	
597  					if (!reject && !collide) {
598  						/* out? */
599  						if (itemtype == 0)
600  							reject = is_out(map, weight,
601  									weight_max,
602  									item, x);
603  					}
604  	
605  	reject:
606  					if (reject || collide) {
607  						ftotal++;
608  						flocal++;
609  	
610  						if (collide && flocal <= local_retries)
611  							/* retry locally a few times */
612  							retry_bucket = 1;
613  						else if (local_fallback_retries > 0 &&
614  							 flocal <= in->size + local_fallback_retries)
615  							/* exhaustive bucket search */
616  							retry_bucket = 1;
617  						else if (ftotal < tries)
618  							/* then retry descent */
619  							retry_descent = 1;
620  						else
621  							/* else give up */
622  							skip_rep = 1;
623  						dprintk("  reject %d  collide %d  "
624  							"ftotal %u  flocal %u\n",
625  							reject, collide, ftotal,
626  							flocal);
627  					}
628  				} while (retry_bucket);
629  			} while (retry_descent);
630  	
631  			if (skip_rep) {
632  				dprintk("skip rep\n");
633  				continue;
634  			}
635  	
636  			dprintk("CHOOSE got %d\n", item);
637  			out[outpos] = item;
638  			outpos++;
639  			count--;
640  	#ifndef __KERNEL__
641  			if (map->choose_tries && ftotal <= map->choose_total_tries)
642  				map->choose_tries[ftotal]++;
643  	#endif
644  		}
645  	
646  		dprintk("CHOOSE returns %d\n", outpos);
647  		return outpos;
648  	}
649  	
650  	
651  	/**
652  	 * crush_choose_indep: alternative breadth-first positionally stable mapping
653  	 *
654  	 */
655  	static void crush_choose_indep(const struct crush_map *map,
656  				       struct crush_work *work,
657  				       const struct crush_bucket *bucket,
658  				       const __u32 *weight, int weight_max,
659  				       int x, int left, int numrep, int type,
660  				       int *out, int outpos,
661  				       unsigned int tries,
662  				       unsigned int recurse_tries,
663  				       int recurse_to_leaf,
664  				       int *out2,
665  				       int parent_r,
666  	                               const struct crush_choose_arg *choose_args)
667  	{
668  		const struct crush_bucket *in = bucket;
669  		int endpos = outpos + left;
670  		int rep;
671  		unsigned int ftotal;
672  		int r;
673  		int i;
674  		int item = 0;
675  		int itemtype;
676  		int collide;
677  	
678  		dprintk("CHOOSE%s INDEP bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
679  			bucket->id, x, outpos, numrep);
680  	
681  		/* initially my result is undefined */
682  		for (rep = outpos; rep < endpos; rep++) {
683  			out[rep] = CRUSH_ITEM_UNDEF;
684  			if (out2)
685  				out2[rep] = CRUSH_ITEM_UNDEF;
686  		}
687  	
688  		for (ftotal = 0; left > 0 && ftotal < tries; ftotal++) {
689  	#ifdef DEBUG_INDEP
690  			if (out2 && ftotal) {
691  				dprintk("%u %d a: ", ftotal, left);
692  				for (rep = outpos; rep < endpos; rep++) {
693  					dprintk(" %d", out[rep]);
694  				}
695  				dprintk("\n");
696  				dprintk("%u %d b: ", ftotal, left);
697  				for (rep = outpos; rep < endpos; rep++) {
698  					dprintk(" %d", out2[rep]);
699  				}
700  				dprintk("\n");
701  			}
702  	#endif
703  			for (rep = outpos; rep < endpos; rep++) {
704  				if (out[rep] != CRUSH_ITEM_UNDEF)
705  					continue;
706  	
707  				in = bucket;  /* initial bucket */
708  	
709  				/* choose through intervening buckets */
710  				for (;;) {
711  					/* note: we base the choice on the position
712  					 * even in the nested call.  that means that
713  					 * if the first layer chooses the same bucket
714  					 * in a different position, we will tend to
715  					 * choose a different item in that bucket.
716  					 * this will involve more devices in data
717  					 * movement and tend to distribute the load.
718  					 */
719  					r = rep + parent_r;
720  	
721  					/* be careful */
722  					if (in->alg == CRUSH_BUCKET_UNIFORM &&
723  					    in->size % numrep == 0)
724  						/* r'=r+(n+1)*f_total */
725  						r += (numrep+1) * ftotal;
726  					else
727  						/* r' = r + n*f_total */
728  						r += numrep * ftotal;
729  	
730  					/* bucket choose */
731  					if (in->size == 0) {
732  						dprintk("   empty bucket\n");
733  						break;
734  					}
735  	
736  					item = crush_bucket_choose(
737  						in, work->work[-1-in->id],
738  						x, r,
739  	                                        (choose_args ? &choose_args[-1-in->id] : 0),
740  	                                        outpos);
741  					if (item >= map->max_devices) {
742  						dprintk("   bad item %d\n", item);
743  						out[rep] = CRUSH_ITEM_NONE;
744  						if (out2)
745  							out2[rep] = CRUSH_ITEM_NONE;
746  						left--;
747  						break;
748  					}
749  	
750  					/* desired type? */
751  					if (item < 0)
752  						itemtype = map->buckets[-1-item]->type;
753  					else
754  						itemtype = 0;
755  					dprintk("  item %d type %d\n", item, itemtype);
756  	
757  					/* keep going? */
758  					if (itemtype != type) {
759  						if (item >= 0 ||
760  						    (-1-item) >= map->max_buckets) {
761  							dprintk("   bad item type %d\n", type);
762  							out[rep] = CRUSH_ITEM_NONE;
763  							if (out2)
764  								out2[rep] =
765  									CRUSH_ITEM_NONE;
766  							left--;
767  							break;
768  						}
769  						in = map->buckets[-1-item];
770  						continue;
771  					}
772  	
773  					/* collision? */
774  					collide = 0;
775  					for (i = outpos; i < endpos; i++) {
776  						if (out[i] == item) {
777  							collide = 1;
778  							break;
779  						}
780  					}
781  					if (collide)
782  						break;
783  	
784  					if (recurse_to_leaf) {
785  						if (item < 0) {
786  							crush_choose_indep(
787  								map,
788  								work,
789  								map->buckets[-1-item],
790  								weight, weight_max,
791  								x, 1, numrep, 0,
792  								out2, rep,
793  								recurse_tries, 0,
794  								0, NULL, r, choose_args);
795  							if (out2 && out2[rep] == CRUSH_ITEM_NONE) {
796  								/* placed nothing; no leaf */
797  								break;
798  							}
799  						} else if (out2) {
800  							/* we already have a leaf! */
801  							out2[rep] = item;
802  						}
803  					}
804  	
805  					/* out? */
806  					if (itemtype == 0 &&
807  					    is_out(map, weight, weight_max, item, x))
808  						break;
809  	
810  					/* yay! */
811  					out[rep] = item;
812  					left--;
813  					break;
814  				}
815  			}
816  		}
817  		for (rep = outpos; rep < endpos; rep++) {
818  			if (out[rep] == CRUSH_ITEM_UNDEF) {
819  				out[rep] = CRUSH_ITEM_NONE;
820  			}
821  			if (out2 && out2[rep] == CRUSH_ITEM_UNDEF) {
822  				out2[rep] = CRUSH_ITEM_NONE;
823  			}
824  		}
825  	#ifndef __KERNEL__
826  		if (map->choose_tries && ftotal <= map->choose_total_tries)
827  			map->choose_tries[ftotal]++;
828  	#endif
829  	#ifdef DEBUG_INDEP
830  		if (out2) {
831  			dprintk("%u %d a: ", ftotal, left);
832  			for (rep = outpos; rep < endpos; rep++) {
833  				dprintk(" %d", out[rep]);
834  			}
835  			dprintk("\n");
836  			dprintk("%u %d b: ", ftotal, left);
837  			for (rep = outpos; rep < endpos; rep++) {
838  				dprintk(" %d", out2[rep]);
839  			}
840  			dprintk("\n");
841  		}
842  	#endif
843  	}
844  	
845  	
846  	/* This takes a chunk of memory and sets it up to be a shiny new
847  	   working area for a CRUSH placement computation. It must be called
848  	   on any newly allocated memory before passing it in to
849  	   crush_do_rule. It may be used repeatedly after that, so long as the
850  	   map has not changed. If the map /has/ changed, you must make sure
851  	   the working size is no smaller than what was allocated and re-run
852  	   crush_init_workspace.
853  	
854  	   If you do retain the working space between calls to crush, make it
855  	   thread-local. If you reinstitute the locking I've spent so much
856  	   time getting rid of, I will be very unhappy with you. */
857  	
858  	void crush_init_workspace(const struct crush_map *m, void *v) {
859  		/* We work by moving through the available space and setting
860  		   values and pointers as we go.
861  	
862  		   It's a bit like Forth's use of the 'allot' word since we
863  		   set the pointer first and then reserve the space for it to
864  		   point to by incrementing the point. */
865  		struct crush_work *w = (struct crush_work *)v;
866  		char *point = (char *)v;
867  		__s32 b;
868  		point += sizeof(struct crush_work);
869  		w->work = (struct crush_work_bucket **)point;
870  		point += m->max_buckets * sizeof(struct crush_work_bucket *);
871  		for (b = 0; b < m->max_buckets; ++b) {
872  			if (m->buckets[b] == 0)
873  				continue;
874  	
875  			w->work[b] = (struct crush_work_bucket *) point;
876  			switch (m->buckets[b]->alg) {
877  			default:
878  				point += sizeof(struct crush_work_bucket);
879  				break;
880  			}
881  			w->work[b]->perm_x = 0;
882  			w->work[b]->perm_n = 0;
883  			w->work[b]->perm = (__u32 *)point;
884  			point += m->buckets[b]->size * sizeof(__u32);
885  		}
886  		BUG_ON((char *)point - (char *)w != m->working_size);
887  	}
888  	
889  	/**
890  	 * crush_do_rule - calculate a mapping with the given input and rule
891  	 * @map: the crush_map
892  	 * @ruleno: the rule id
893  	 * @x: hash input
894  	 * @result: pointer to result vector
895  	 * @result_max: maximum result size
896  	 * @weight: weight vector (for map leaves)
897  	 * @weight_max: size of weight vector
898  	 * @cwin: Pointer to at least map->working_size bytes of memory or NULL.
899  	 */
900  	int crush_do_rule(const struct crush_map *map,
901  			  int ruleno, int x, int *result, int result_max,
902  			  const __u32 *weight, int weight_max,
903  			  void *cwin, const struct crush_choose_arg *choose_args)
904  	{
905  		int result_len;
906  		struct crush_work *cw = cwin;
907  		int *a = (int *)((char *)cw + map->working_size);
908  		int *b = a + result_max;
909  		int *c = b + result_max;
910  		int *w = a;
911  		int *o = b;
912  		int recurse_to_leaf;
913  		int wsize = 0;
914  		int osize;
915  		int *tmp;
916  		const struct crush_rule *rule;
917  		__u32 step;
918  		int i, j;
919  		int numrep;
920  		int out_size;
921  		/*
922  		 * the original choose_total_tries value was off by one (it
923  		 * counted "retries" and not "tries").  add one.
924  		 */
925  		int choose_tries = map->choose_total_tries + 1;
926  		int choose_leaf_tries = 0;
927  		/*
928  		 * the local tries values were counted as "retries", though,
929  		 * and need no adjustment
930  		 */
931  		int choose_local_retries = map->choose_local_tries;
932  		int choose_local_fallback_retries = map->choose_local_fallback_tries;
933  	
934  		int vary_r = map->chooseleaf_vary_r;
935  		int stable = map->chooseleaf_stable;
936  	
937  		if ((__u32)ruleno >= map->max_rules) {
938  			dprintk(" bad ruleno %d\n", ruleno);
939  			return 0;
940  		}
941  	
942  		rule = map->rules[ruleno];
943  		result_len = 0;
944  	
945  		for (step = 0; step < rule->len; step++) {
946  			int firstn = 0;
947  			const struct crush_rule_step *curstep = &rule->steps[step];
948  	
949  			switch (curstep->op) {
950  			case CRUSH_RULE_TAKE:
951  				if ((curstep->arg1 >= 0 &&
952  				     curstep->arg1 < map->max_devices) ||
953  				    (-1-curstep->arg1 >= 0 &&
954  				     -1-curstep->arg1 < map->max_buckets &&
955  				     map->buckets[-1-curstep->arg1])) {
956  					w[0] = curstep->arg1;
957  					wsize = 1;
958  				} else {
959  					dprintk(" bad take value %d\n", curstep->arg1);
960  				}
961  				break;
962  	
963  			case CRUSH_RULE_SET_CHOOSE_TRIES:
964  				if (curstep->arg1 > 0)
965  					choose_tries = curstep->arg1;
966  				break;
967  	
968  			case CRUSH_RULE_SET_CHOOSELEAF_TRIES:
969  				if (curstep->arg1 > 0)
970  					choose_leaf_tries = curstep->arg1;
971  				break;
972  	
973  			case CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES:
974  				if (curstep->arg1 >= 0)
975  					choose_local_retries = curstep->arg1;
976  				break;
977  	
978  			case CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES:
979  				if (curstep->arg1 >= 0)
980  					choose_local_fallback_retries = curstep->arg1;
981  				break;
982  	
983  			case CRUSH_RULE_SET_CHOOSELEAF_VARY_R:
984  				if (curstep->arg1 >= 0)
985  					vary_r = curstep->arg1;
986  				break;
987  	
988  			case CRUSH_RULE_SET_CHOOSELEAF_STABLE:
989  				if (curstep->arg1 >= 0)
990  					stable = curstep->arg1;
991  				break;
992  	
993  			case CRUSH_RULE_CHOOSELEAF_FIRSTN:
994  			case CRUSH_RULE_CHOOSE_FIRSTN:
995  				firstn = 1;
996  				/* fall through */
997  			case CRUSH_RULE_CHOOSELEAF_INDEP:
998  			case CRUSH_RULE_CHOOSE_INDEP:
999  				if (wsize == 0)
1000 					break;
1001 	
1002 				recurse_to_leaf =
1003 					curstep->op ==
1004 					 CRUSH_RULE_CHOOSELEAF_FIRSTN ||
1005 					curstep->op ==
1006 					CRUSH_RULE_CHOOSELEAF_INDEP;
1007 	
1008 				/* reset output */
1009 				osize = 0;
1010 	
1011 				for (i = 0; i < wsize; i++) {
1012 					int bno;
1013 					numrep = curstep->arg1;
1014 					if (numrep <= 0) {
1015 						numrep += result_max;
1016 						if (numrep <= 0)
1017 							continue;
1018 					}
1019 					j = 0;
1020 					/* make sure bucket id is valid */
1021 					bno = -1 - w[i];
1022 					if (bno < 0 || bno >= map->max_buckets) {
1023 						// w[i] is probably CRUSH_ITEM_NONE
1024 						dprintk("  bad w[i] %d\n", w[i]);
1025 						continue;
1026 					}
1027 					if (firstn) {
1028 						int recurse_tries;
1029 						if (choose_leaf_tries)
1030 							recurse_tries =
1031 								choose_leaf_tries;
1032 						else if (map->chooseleaf_descend_once)
1033 							recurse_tries = 1;
1034 						else
1035 							recurse_tries = choose_tries;
1036 						osize += crush_choose_firstn(
1037 							map,
1038 							cw,
1039 							map->buckets[bno],
1040 							weight, weight_max,
1041 							x, numrep,
1042 							curstep->arg2,
1043 							o+osize, j,
1044 							result_max-osize,
1045 							choose_tries,
1046 							recurse_tries,
1047 							choose_local_retries,
1048 							choose_local_fallback_retries,
1049 							recurse_to_leaf,
1050 							vary_r,
1051 							stable,
1052 							c+osize,
1053 							0,
1054 							choose_args);
1055 					} else {
1056 						out_size = ((numrep < (result_max-osize)) ?
1057 							    numrep : (result_max-osize));
1058 						crush_choose_indep(
1059 							map,
1060 							cw,
1061 							map->buckets[bno],
1062 							weight, weight_max,
1063 							x, out_size, numrep,
1064 							curstep->arg2,
1065 							o+osize, j,
1066 							choose_tries,
1067 							choose_leaf_tries ?
1068 							   choose_leaf_tries : 1,
1069 							recurse_to_leaf,
1070 							c+osize,
1071 							0,
1072 							choose_args);
1073 						osize += out_size;
1074 					}
1075 				}
1076 	
1077 				if (recurse_to_leaf)
1078 					/* copy final _leaf_ values to output set */
1079 					memcpy(o, c, osize*sizeof(*o));
1080 	
1081 				/* swap o and w arrays */
1082 				tmp = o;
1083 				o = w;
1084 				w = tmp;
1085 				wsize = osize;
1086 				break;
1087 	
1088 	
1089 			case CRUSH_RULE_EMIT:
1090 				for (i = 0; i < wsize && result_len < result_max; i++) {
1091 					result[result_len] = w[i];
1092 					result_len++;
1093 				}
1094 				wsize = 0;
1095 				break;
1096 	
1097 			default:
1098 				dprintk(" unknown op %d at step %d\n",
1099 					curstep->op, step);
1100 				break;
1101 			}
1102 		}
1103 	
1104 		return result_len;
1105 	}
1106