1 /*
2 * Ceph - scalable distributed file system
3 *
4 * Copyright (C) 2015 Intel Corporation All Rights Reserved
5 *
6 * This is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License version 2.1, as published by the Free Software
9 * Foundation. See file COPYING.
10 *
11 */
12
13 #ifdef __KERNEL__
14 # include <linux/string.h>
15 # include <linux/slab.h>
16 # include <linux/bug.h>
17 # include <linux/kernel.h>
18 # include <linux/crush/crush.h>
19 # include <linux/crush/hash.h>
20 #else
21 # include "crush_compat.h"
22 # include "crush.h"
23 # include "hash.h"
24 #endif
25 #include "crush_ln_table.h"
26 #include "mapper.h"
27
28 #define dprintk(args...) /* printf(args) */
29
30 /*
31 * Implement the core CRUSH mapping algorithm.
32 */
33
34 /**
35 * crush_find_rule - find a crush_rule id for a given ruleset, type, and size.
36 * @map: the crush_map
37 * @ruleset: the storage ruleset id (user defined)
38 * @type: storage ruleset type (user defined)
39 * @size: output set size
40 */
41 int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size)
42 {
43 __u32 i;
44
45 for (i = 0; i < map->max_rules; i++) {
46 if (map->rules[i] &&
47 map->rules[i]->mask.ruleset == ruleset &&
48 map->rules[i]->mask.type == type &&
49 map->rules[i]->mask.min_size <= size &&
50 map->rules[i]->mask.max_size >= size)
51 return i;
52 }
53 return -1;
54 }
55
56 /*
57 * bucket choose methods
58 *
59 * For each bucket algorithm, we have a "choose" method that, given a
60 * crush input @x and replica position (usually, position in output set) @r,
61 * will produce an item in the bucket.
62 */
63
64 /*
65 * Choose based on a random permutation of the bucket.
66 *
67 * We used to use some prime number arithmetic to do this, but it
68 * wasn't very random, and had some other bad behaviors. Instead, we
69 * calculate an actual random permutation of the bucket members.
70 * Since this is expensive, we optimize for the r=0 case, which
71 * captures the vast majority of calls.
72 */
/*
 * Pick the item at position @r of a pseudo-random permutation of the
 * bucket's members, seeded by the input @x.  The partially-computed
 * permutation is cached in @work, so repeated calls with the same @x
 * extend the existing shuffle incrementally instead of recomputing it.
 */
static int bucket_perm_choose(const struct crush_bucket *bucket,
			      struct crush_work_bucket *work,
			      int x, int r)
{
	unsigned int pr = r % bucket->size;
	unsigned int i, s;

	/* start a new permutation if @x has changed */
	if (work->perm_x != (__u32)x || work->perm_n == 0) {
		dprintk("bucket %d new x=%d\n", bucket->id, x);
		work->perm_x = x;

		/* optimize common r=0 case */
		if (pr == 0) {
			/* only the first slot is needed; defer filling
			 * the rest of perm[] until some r>0 asks for it */
			s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
				bucket->size;
			work->perm[0] = s;
			work->perm_n = 0xffff; /* magic value, see below */
			goto out;
		}

		for (i = 0; i < bucket->size; i++)
			work->perm[i] = i;
		work->perm_n = 0;
	} else if (work->perm_n == 0xffff) {
		/* clean up after the r=0 case above: rebuild the identity
		 * permutation, then swap slot 0's choice into place so the
		 * array is consistent with perm_n == 1 */
		for (i = 1; i < bucket->size; i++)
			work->perm[i] = i;
		work->perm[work->perm[0]] = 0;
		work->perm_n = 1;
	}

	/* calculate permutation up to pr */
	for (i = 0; i < work->perm_n; i++)
		dprintk(" perm_choose have %d: %d\n", i, work->perm[i]);
	/* extend the shuffle one slot at a time (Fisher-Yates style swap
	 * with a hashed index into the remaining entries) */
	while (work->perm_n <= pr) {
		unsigned int p = work->perm_n;
		/* no point in swapping the final entry */
		if (p < bucket->size - 1) {
			i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
				(bucket->size - p);
			if (i) {
				unsigned int t = work->perm[p + i];
				work->perm[p + i] = work->perm[p];
				work->perm[p] = t;
			}
			dprintk(" perm_choose swap %d with %d\n", p, p+i);
		}
		work->perm_n++;
	}
	for (i = 0; i < bucket->size; i++)
		dprintk(" perm_choose %d: %d\n", i, work->perm[i]);

	s = work->perm[pr];
out:
	dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
		bucket->size, x, r, pr, s);
	return bucket->items[s];
}
132
133 /* uniform */
134 static int bucket_uniform_choose(const struct crush_bucket_uniform *bucket,
135 struct crush_work_bucket *work, int x, int r)
136 {
137 return bucket_perm_choose(&bucket->h, work, x, r);
138 }
139
140 /* list */
141 static int bucket_list_choose(const struct crush_bucket_list *bucket,
142 int x, int r)
143 {
144 int i;
145
146 for (i = bucket->h.size-1; i >= 0; i--) {
147 __u64 w = crush_hash32_4(bucket->h.hash, x, bucket->h.items[i],
148 r, bucket->h.id);
149 w &= 0xffff;
150 dprintk("list_choose i=%d x=%d r=%d item %d weight %x "
151 "sw %x rand %llx",
152 i, x, r, bucket->h.items[i], bucket->item_weights[i],
153 bucket->sum_weights[i], w);
154 w *= bucket->sum_weights[i];
155 w = w >> 16;
156 /*dprintk(" scaled %llx\n", w);*/
157 if (w < bucket->item_weights[i]) {
158 return bucket->h.items[i];
159 }
160 }
161
162 dprintk("bad list sums for bucket %d\n", bucket->h.id);
163 return bucket->h.items[0];
164 }
165
166
167 /* (binary) tree */
/*
 * Height of node @n in the tree bucket's implicit binary-tree layout:
 * the number of trailing zero bits in @n.  Leaves are the odd indices
 * (height 0); interior nodes are even.
 *
 * NOTE: @n must be non-zero or this loop will not terminate; callers
 * only pass valid (non-zero) node indices.
 *
 * This block previously had static-analyzer event text pasted inline,
 * which broke compilation; that noise is removed here.
 */
static int height(int n)
{
	int h = 0;

	while ((n & 1) == 0) {
		h++;
		n = n >> 1;
	}
	return h;
}
177
/*
 * Left child of interior node @x in the implicit tree layout.
 *
 * @x is always an interior (even, non-zero) node here, so height(x)
 * >= 1 and the shift amount (h - 1) is non-negative.  Static analyzers
 * that cannot see this invariant flag "1 << (h - 1)" as a potential
 * negative shift; the terminal() check in bucket_tree_choose()
 * guarantees it never happens at runtime.
 *
 * This block previously had static-analyzer event text pasted inline,
 * which broke compilation; that noise is removed here.
 */
static int left(int x)
{
	int h = height(x);

	return x - (1 << (h - 1));
}
183
/*
 * Right child of interior node @x in the implicit tree layout.
 * @x is always an interior (even) node here, so height(x) >= 1.
 */
static int right(int x)
{
	return x + (1 << (height(x) - 1));
}
189
/* A node is a leaf (terminal) iff its index is odd. */
static int terminal(int x)
{
	return (x & 1) != 0;
}
194
/*
 * Descend the bucket's implicit binary tree of node weights from the
 * root, at each interior node hashing (x, node, r) into a point in
 * [0, node_weight) and going left if it falls within the left child's
 * weight, otherwise right.  Leaf index n maps to item n >> 1.
 */
static int bucket_tree_choose(const struct crush_bucket_tree *bucket,
			      int x, int r)
{
	int n;
	__u32 w;
	__u64 t;

	/* start at root */
	n = bucket->num_nodes >> 1;

	while (!terminal(n)) {
		int l;
		/* pick point in [0, w) */
		w = bucket->node_weights[n];
		/* 32-bit hash scaled by w, taking the high 32 bits:
		 * uniform in [0, w) */
		t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r,
					  bucket->h.id) * (__u64)w;
		t = t >> 32;

		/* descend to the left or right? */
		l = left(n);
		if (t < bucket->node_weights[l])
			n = l;
		else
			n = right(n);
	}

	return bucket->h.items[n >> 1];
}
223
224
225 /* straw */
226
227 static int bucket_straw_choose(const struct crush_bucket_straw *bucket,
228 int x, int r)
229 {
230 __u32 i;
231 int high = 0;
232 __u64 high_draw = 0;
233 __u64 draw;
234
235 for (i = 0; i < bucket->h.size; i++) {
236 draw = crush_hash32_3(bucket->h.hash, x, bucket->h.items[i], r);
237 draw &= 0xffff;
238 draw *= bucket->straws[i];
239 if (i == 0 || draw > high_draw) {
240 high = i;
241 high_draw = draw;
242 }
243 }
244 return bucket->h.items[high];
245 }
246
247 /* compute 2^44*log2(input+1) */
/*
 * Fixed-point base-2 log approximation: returns 2^44 * log2(xin + 1).
 *
 * Works by normalizing x into mantissa/exponent form, then combining
 * table lookups: __RH_LH_tbl supplies a reciprocal (RH) and a coarse
 * log (LH) for the top mantissa bits, and __LL_tbl refines with the
 * low-order correction (LL).
 */
static __u64 crush_ln(unsigned int xin)
{
	unsigned int x = xin;
	int iexpon, index1, index2;
	__u64 RH, LH, LL, xl64, result;

	x++;

	/* normalize input */
	iexpon = 15;

	// figure out number of bits we need to shift and
	// do it in one step instead of iteratively
	if (!(x & 0x18000)) {
		int bits = __builtin_clz(x & 0x1FFFF) - 16;
		x <<= bits;
		iexpon = 15 - bits;
	}

	/* table is indexed by the top 8 mantissa bits; entries come in
	 * (RH, LH) pairs, hence the << 1 and the +1 below */
	index1 = (x >> 8) << 1;
	/* RH ~ 2^56/index1 */
	RH = __RH_LH_tbl[index1 - 256];
	/* LH ~ 2^48 * log2(index1/256) */
	LH = __RH_LH_tbl[index1 + 1 - 256];

	/* RH*x ~ 2^48 * (2^15 + xf), xf<2^8 */
	xl64 = (__s64)x * RH;
	xl64 >>= 48;

	/* integer part of the log: the exponent, scaled to 2^44 */
	result = iexpon;
	result <<= (12 + 32);

	index2 = xl64 & 0xff;
	/* LL ~ 2^48*log2(1.0+index2/2^15) */
	LL = __LL_tbl[index2];

	LH = LH + LL;

	/* rescale the 2^48 fractional part down to 2^44 and combine */
	LH >>= (48 - 12 - 32);
	result += LH;

	return result;
}
291
292
293 /*
294 * straw2
295 *
 * Suppose we have two osds: osd.0 and osd.1, with weight 8 and 4 respectively. This means:
 * a). For osd.0, the time interval between io requests follows an exponential distribution
 *     with lambda equal to 8
 * b). For osd.1, the time interval between io requests follows an exponential distribution
 *     with lambda equal to 4
 * c). If we draw a sample from each osd's exponential distribution and select the minimum,
 *     the total pgs on each osd is proportional to its weight.
303 *
304 * for reference, see:
305 *
306 * http://en.wikipedia.org/wiki/Exponential_distribution#Distribution_of_the_minimum_of_exponential_random_variables
307 */
308
309 static inline __u32 *get_choose_arg_weights(const struct crush_bucket_straw2 *bucket,
310 const struct crush_choose_arg *arg,
311 int position)
312 {
313 if ((arg == NULL) || (arg->weight_set == NULL))
314 return bucket->item_weights;
315 if (position >= arg->weight_set_positions)
316 position = arg->weight_set_positions - 1;
317 return arg->weight_set[position].weights;
318 }
319
320 static inline __s32 *get_choose_arg_ids(const struct crush_bucket_straw2 *bucket,
321 const struct crush_choose_arg *arg)
322 {
323 if ((arg == NULL) || (arg->ids == NULL))
324 return bucket->h.items;
325 return arg->ids;
326 }
327
328 /*
329 * Compute exponential random variable using inversion method.
330 *
331 * for reference, see the exponential distribution example at:
332 * https://en.wikipedia.org/wiki/Inverse_transform_sampling#Examples
333 */
334 static inline __s64 generate_exponential_distribution(int type, int x, int y, int z,
335 int weight)
336 {
337 unsigned int u = crush_hash32_3(type, x, y, z);
338 u &= 0xffff;
339
340 /*
341 * for some reason slightly less than 0x10000 produces
342 * a slightly more accurate distribution... probably a
343 * rounding effect.
344 *
345 * the natural log lookup table maps [0,0xffff]
346 * (corresponding to real numbers [1/0x10000, 1] to
347 * [0, 0xffffffffffff] (corresponding to real numbers
348 * [-11.090355,0]).
349 */
350 __s64 ln = crush_ln(u) - 0x1000000000000ll;
351
352 /*
353 * divide by 16.16 fixed-point weight. note
354 * that the ln value is negative, so a larger
355 * weight means a larger (less negative) value
356 * for draw.
357 */
358 return div64_s64(ln, weight);
359 }
360
361 static int bucket_straw2_choose(const struct crush_bucket_straw2 *bucket,
362 int x, int r, const struct crush_choose_arg *arg,
363 int position)
364 {
365 unsigned int i, high = 0;
366 __s64 draw, high_draw = 0;
367 __u32 *weights = get_choose_arg_weights(bucket, arg, position);
368 __s32 *ids = get_choose_arg_ids(bucket, arg);
369 for (i = 0; i < bucket->h.size; i++) {
370 dprintk("weight 0x%x item %d\n", weights[i], ids[i]);
371 if (weights[i]) {
372 draw = generate_exponential_distribution(bucket->h.hash, x, ids[i], r, weights[i]);
373 } else {
374 draw = S64_MIN;
375 }
376
377 if (i == 0 || draw > high_draw) {
378 high = i;
379 high_draw = draw;
380 }
381 }
382
383 return bucket->h.items[high];
384 }
385
386
/*
 * Dispatch to the bucket-algorithm-specific choose method.  @arg and
 * @position are only consumed by straw2 buckets (per-position weight
 * overrides); the other algorithms ignore them.
 */
static int crush_bucket_choose(const struct crush_bucket *in,
			       struct crush_work_bucket *work,
			       int x, int r,
			       const struct crush_choose_arg *arg,
			       int position)
{
	dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
	BUG_ON(in->size == 0);
	switch (in->alg) {
	case CRUSH_BUCKET_UNIFORM:
		return bucket_uniform_choose(
			(const struct crush_bucket_uniform *)in,
			work, x, r);
	case CRUSH_BUCKET_LIST:
		return bucket_list_choose((const struct crush_bucket_list *)in,
					  x, r);
	case CRUSH_BUCKET_TREE:
		return bucket_tree_choose((const struct crush_bucket_tree *)in,
					  x, r);
	case CRUSH_BUCKET_STRAW:
		return bucket_straw_choose(
			(const struct crush_bucket_straw *)in,
			x, r);
	case CRUSH_BUCKET_STRAW2:
		return bucket_straw2_choose(
			(const struct crush_bucket_straw2 *)in,
			x, r, arg, position);
	default:
		/* unknown algorithm: fall back to the first item rather
		 * than failing the whole mapping */
		dprintk("unknown bucket %d alg %d\n", in->id, in->alg);
		return in->items[0];
	}
}
419
420 /*
421 * true if device is marked "out" (failed, fully offloaded)
422 * of the cluster
423 */
424 static int is_out(const struct crush_map *map,
425 const __u32 *weight, int weight_max,
426 int item, int x)
427 {
428 if (item >= weight_max)
429 return 1;
430 if (weight[item] >= 0x10000)
431 return 0;
432 if (weight[item] == 0)
433 return 1;
434 if ((crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff)
435 < weight[item])
436 return 0;
437 return 1;
438 }
439
440 /**
441 * crush_choose_firstn - choose numrep distinct items of given type
442 * @map: the crush_map
443 * @bucket: the bucket we are choose an item from
444 * @x: crush input value
445 * @numrep: the number of items to choose
446 * @type: the type of item to choose
447 * @out: pointer to output vector
448 * @outpos: our position in that vector
449 * @out_size: size of the out vector
450 * @tries: number of attempts to make
451 * @recurse_tries: number of attempts to have recursive chooseleaf make
452 * @local_retries: localized retries
453 * @local_fallback_retries: localized fallback retries
454 * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose)
455 * @stable: stable mode starts rep=0 in the recursive call for all replicas
456 * @vary_r: pass r to recursive calls
457 * @out2: second output vector for leaf items (if @recurse_to_leaf)
458 * @parent_r: r value passed from the parent
459 */
static int crush_choose_firstn(const struct crush_map *map,
			       struct crush_work *work,
			       const struct crush_bucket *bucket,
			       const __u32 *weight, int weight_max,
			       int x, int numrep, int type,
			       int *out, int outpos,
			       int out_size,
			       unsigned int tries,
			       unsigned int recurse_tries,
			       unsigned int local_retries,
			       unsigned int local_fallback_retries,
			       int recurse_to_leaf,
			       unsigned int vary_r,
			       unsigned int stable,
			       int *out2,
			       int parent_r,
			       const struct crush_choose_arg *choose_args)
{
	int rep;
	unsigned int ftotal, flocal;	/* total / bucket-local failure counts */
	int retry_descent, retry_bucket, skip_rep;
	const struct crush_bucket *in = bucket;
	int r;
	int i;
	int item = 0;
	int itemtype;
	int collide, reject;
	int count = out_size;	/* remaining slots in the output vector */

	dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d \
recurse_tries %d local_retries %d local_fallback_retries %d \
parent_r %d stable %d\n",
		recurse_to_leaf ? "_LEAF" : "",
		bucket->id, x, outpos, numrep,
		tries, recurse_tries, local_retries, local_fallback_retries,
		parent_r, stable);

	for (rep = stable ? 0 : outpos; rep < numrep && count > 0 ; rep++) {
		/* keep trying until we get a non-out, non-colliding item */
		ftotal = 0;
		skip_rep = 0;
		do {
			retry_descent = 0;
			in = bucket; /* initial bucket */

			/* choose through intervening buckets */
			flocal = 0;
			do {
				collide = 0;
				retry_bucket = 0;
				r = rep + parent_r;
				/* r' = r + f_total */
				r += ftotal;

				/* bucket choose */
				if (in->size == 0) {
					reject = 1;
					goto reject;
				}
				/* after enough local failures, fall back to an
				 * exhaustive permutation walk of the bucket */
				if (local_fallback_retries > 0 &&
				    flocal >= (in->size>>1) &&
				    flocal > local_fallback_retries)
					item = bucket_perm_choose(
						in, work->work[-1-in->id],
						x, r);
				else
					item = crush_bucket_choose(
						in, work->work[-1-in->id],
						x, r,
						(choose_args ? &choose_args[-1-in->id] : 0),
						outpos);
				if (item >= map->max_devices) {
					dprintk(" bad item %d\n", item);
					skip_rep = 1;
					break;
				}

				/* desired type?  negative ids are buckets,
				 * non-negative ids are devices (type 0) */
				if (item < 0)
					itemtype = map->buckets[-1-item]->type;
				else
					itemtype = 0;
				dprintk(" item %d type %d\n", item, itemtype);

				/* keep going? */
				if (itemtype != type) {
					if (item >= 0 ||
					    (-1-item) >= map->max_buckets) {
						dprintk(" bad item type %d\n", type);
						skip_rep = 1;
						break;
					}
					/* descend into the chosen sub-bucket */
					in = map->buckets[-1-item];
					retry_bucket = 1;
					continue;
				}

				/* collision? */
				for (i = 0; i < outpos; i++) {
					if (out[i] == item) {
						collide = 1;
						break;
					}
				}

				reject = 0;
				if (!collide && recurse_to_leaf) {
					if (item < 0) {
						int sub_r;
						if (vary_r)
							sub_r = r >> (vary_r-1);
						else
							sub_r = 0;
						/* recurse to place a leaf under
						 * this item; reject the item if
						 * no leaf could be placed */
						if (crush_choose_firstn(
							    map,
							    work,
							    map->buckets[-1-item],
							    weight, weight_max,
							    x, stable ? 1 : outpos+1, 0,
							    out2, outpos, count,
							    recurse_tries, 0,
							    local_retries,
							    local_fallback_retries,
							    0,
							    vary_r,
							    stable,
							    NULL,
							    sub_r,
							    choose_args) <= outpos)
							/* didn't get leaf */
							reject = 1;
					} else {
						/* we already have a leaf! */
						out2[outpos] = item;
					}
				}

				if (!reject && !collide) {
					/* out? */
					if (itemtype == 0)
						reject = is_out(map, weight,
								weight_max,
								item, x);
				}

reject:
				if (reject || collide) {
					ftotal++;
					flocal++;

					if (collide && flocal <= local_retries)
						/* retry locally a few times */
						retry_bucket = 1;
					else if (local_fallback_retries > 0 &&
						 flocal <= in->size + local_fallback_retries)
						/* exhaustive bucket search */
						retry_bucket = 1;
					else if (ftotal < tries)
						/* then retry descent */
						retry_descent = 1;
					else
						/* else give up */
						skip_rep = 1;
					dprintk(" reject %d collide %d "
						"ftotal %u flocal %u\n",
						reject, collide, ftotal,
						flocal);
				}
			} while (retry_bucket);
		} while (retry_descent);

		if (skip_rep) {
			dprintk("skip rep\n");
			continue;
		}

		dprintk("CHOOSE got %d\n", item);
		out[outpos] = item;
		outpos++;
		count--;
#ifndef __KERNEL__
		/* userspace-only statistics on how many tries were needed */
		if (map->choose_tries && ftotal <= map->choose_total_tries)
			map->choose_tries[ftotal]++;
#endif
	}

	dprintk("CHOOSE returns %d\n", outpos);
	return outpos;
}
649
650
651 /**
652 * crush_choose_indep: alternative breadth-first positionally stable mapping
653 *
654 */
/*
 * Fills @left slots of @out (and @out2 when recursing to leaves)
 * starting at @outpos.  Unlike crush_choose_firstn, failed slots keep
 * their position and are marked CRUSH_ITEM_NONE, so existing mappings
 * stay stable across retries (important for erasure coding).
 */
static void crush_choose_indep(const struct crush_map *map,
			       struct crush_work *work,
			       const struct crush_bucket *bucket,
			       const __u32 *weight, int weight_max,
			       int x, int left, int numrep, int type,
			       int *out, int outpos,
			       unsigned int tries,
			       unsigned int recurse_tries,
			       int recurse_to_leaf,
			       int *out2,
			       int parent_r,
			       const struct crush_choose_arg *choose_args)
{
	const struct crush_bucket *in = bucket;
	int endpos = outpos + left;
	int rep;
	unsigned int ftotal;
	int r;
	int i;
	int item = 0;
	int itemtype;
	int collide;

	dprintk("CHOOSE%s INDEP bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
		bucket->id, x, outpos, numrep);

	/* initially my result is undefined */
	for (rep = outpos; rep < endpos; rep++) {
		out[rep] = CRUSH_ITEM_UNDEF;
		if (out2)
			out2[rep] = CRUSH_ITEM_UNDEF;
	}

	/* breadth-first: each pass tries to fill every still-undefined
	 * slot, bumping ftotal (and hence r) between passes */
	for (ftotal = 0; left > 0 && ftotal < tries; ftotal++) {
#ifdef DEBUG_INDEP
		if (out2 && ftotal) {
			dprintk("%u %d a: ", ftotal, left);
			for (rep = outpos; rep < endpos; rep++) {
				dprintk(" %d", out[rep]);
			}
			dprintk("\n");
			dprintk("%u %d b: ", ftotal, left);
			for (rep = outpos; rep < endpos; rep++) {
				dprintk(" %d", out2[rep]);
			}
			dprintk("\n");
		}
#endif
		for (rep = outpos; rep < endpos; rep++) {
			if (out[rep] != CRUSH_ITEM_UNDEF)
				continue;

			in = bucket; /* initial bucket */

			/* choose through intervening buckets */
			for (;;) {
				/* note: we base the choice on the position
				 * even in the nested call. that means that
				 * if the first layer chooses the same bucket
				 * in a different position, we will tend to
				 * choose a different item in that bucket.
				 * this will involve more devices in data
				 * movement and tend to distribute the load.
				 */
				r = rep + parent_r;

				/* be careful */
				if (in->alg == CRUSH_BUCKET_UNIFORM &&
				    in->size % numrep == 0)
					/* r'=r+(n+1)*f_total */
					r += (numrep+1) * ftotal;
				else
					/* r' = r + n*f_total */
					r += numrep * ftotal;

				/* bucket choose */
				if (in->size == 0) {
					dprintk(" empty bucket\n");
					break;
				}

				item = crush_bucket_choose(
					in, work->work[-1-in->id],
					x, r,
					(choose_args ? &choose_args[-1-in->id] : 0),
					outpos);
				if (item >= map->max_devices) {
					dprintk(" bad item %d\n", item);
					out[rep] = CRUSH_ITEM_NONE;
					if (out2)
						out2[rep] = CRUSH_ITEM_NONE;
					left--;
					break;
				}

				/* desired type?  negative ids are buckets,
				 * non-negative ids are devices (type 0) */
				if (item < 0)
					itemtype = map->buckets[-1-item]->type;
				else
					itemtype = 0;
				dprintk(" item %d type %d\n", item, itemtype);

				/* keep going? */
				if (itemtype != type) {
					if (item >= 0 ||
					    (-1-item) >= map->max_buckets) {
						dprintk(" bad item type %d\n", type);
						out[rep] = CRUSH_ITEM_NONE;
						if (out2)
							out2[rep] =
								CRUSH_ITEM_NONE;
						left--;
						break;
					}
					/* descend into the chosen sub-bucket */
					in = map->buckets[-1-item];
					continue;
				}

				/* collision? */
				collide = 0;
				for (i = outpos; i < endpos; i++) {
					if (out[i] == item) {
						collide = 1;
						break;
					}
				}
				if (collide)
					break;

				if (recurse_to_leaf) {
					if (item < 0) {
						crush_choose_indep(
							map,
							work,
							map->buckets[-1-item],
							weight, weight_max,
							x, 1, numrep, 0,
							out2, rep,
							recurse_tries, 0,
							0, NULL, r, choose_args);
						if (out2 && out2[rep] == CRUSH_ITEM_NONE) {
							/* placed nothing; no leaf */
							break;
						}
					} else if (out2) {
						/* we already have a leaf! */
						out2[rep] = item;
					}
				}

				/* out? */
				if (itemtype == 0 &&
				    is_out(map, weight, weight_max, item, x))
					break;

				/* yay! */
				out[rep] = item;
				left--;
				break;
			}
		}
	}
	/* convert any still-unfilled slots to the explicit NONE marker */
	for (rep = outpos; rep < endpos; rep++) {
		if (out[rep] == CRUSH_ITEM_UNDEF) {
			out[rep] = CRUSH_ITEM_NONE;
		}
		if (out2 && out2[rep] == CRUSH_ITEM_UNDEF) {
			out2[rep] = CRUSH_ITEM_NONE;
		}
	}
#ifndef __KERNEL__
	/* userspace-only statistics on how many passes were needed */
	if (map->choose_tries && ftotal <= map->choose_total_tries)
		map->choose_tries[ftotal]++;
#endif
#ifdef DEBUG_INDEP
	if (out2) {
		dprintk("%u %d a: ", ftotal, left);
		for (rep = outpos; rep < endpos; rep++) {
			dprintk(" %d", out[rep]);
		}
		dprintk("\n");
		dprintk("%u %d b: ", ftotal, left);
		for (rep = outpos; rep < endpos; rep++) {
			dprintk(" %d", out2[rep]);
		}
		dprintk("\n");
	}
#endif
}
844
845
846 /* This takes a chunk of memory and sets it up to be a shiny new
847 working area for a CRUSH placement computation. It must be called
848 on any newly allocated memory before passing it in to
849 crush_do_rule. It may be used repeatedly after that, so long as the
850 map has not changed. If the map /has/ changed, you must make sure
851 the working size is no smaller than what was allocated and re-run
852 crush_init_workspace.
853
854 If you do retain the working space between calls to crush, make it
855 thread-local. If you reinstitute the locking I've spent so much
856 time getting rid of, I will be very unhappy with you. */
857
void crush_init_workspace(const struct crush_map *m, void *v) {
	/* We work by moving through the available space and setting
	   values and pointers as we go.

	   It's a bit like Forth's use of the 'allot' word since we
	   set the pointer first and then reserve the space for it to
	   point to by incrementing the point. */
	struct crush_work *w = (struct crush_work *)v;
	char *point = (char *)v;
	__s32 b;
	/* layout: crush_work header, then the per-bucket pointer array,
	 * then one crush_work_bucket + perm array per non-empty bucket */
	point += sizeof(struct crush_work);
	w->work = (struct crush_work_bucket **)point;
	point += m->max_buckets * sizeof(struct crush_work_bucket *);
	for (b = 0; b < m->max_buckets; ++b) {
		if (m->buckets[b] == 0)
			continue;

		w->work[b] = (struct crush_work_bucket *) point;
		/* all current bucket algorithms share the same per-bucket
		 * work layout; the switch is kept for future variants */
		switch (m->buckets[b]->alg) {
		default:
			point += sizeof(struct crush_work_bucket);
			break;
		}
		w->work[b]->perm_x = 0;
		w->work[b]->perm_n = 0;
		/* per-bucket permutation scratch, one slot per item */
		w->work[b]->perm = (__u32 *)point;
		point += m->buckets[b]->size * sizeof(__u32);
	}
	/* the caller must have allocated exactly m->working_size bytes */
	BUG_ON((char *)point - (char *)w != m->working_size);
}
888
889 /**
890 * crush_do_rule - calculate a mapping with the given input and rule
891 * @map: the crush_map
892 * @ruleno: the rule id
893 * @x: hash input
894 * @result: pointer to result vector
895 * @result_max: maximum result size
896 * @weight: weight vector (for map leaves)
897 * @weight_max: size of weight vector
898 * @cwin: Pointer to at least map->working_size bytes of memory or NULL.
899 */
int crush_do_rule(const struct crush_map *map,
		  int ruleno, int x, int *result, int result_max,
		  const __u32 *weight, int weight_max,
		  void *cwin, const struct crush_choose_arg *choose_args)
{
	int result_len;
	struct crush_work *cw = cwin;
	/* three scratch vectors of result_max ints live directly after
	 * the crush_work area supplied by the caller */
	int *a = (int *)((char *)cw + map->working_size);
	int *b = a + result_max;
	int *c = b + result_max;
	int *w = a;		/* current working vector */
	int *o = b;		/* output vector of the current step */
	int recurse_to_leaf;
	int wsize = 0;
	int osize;
	int *tmp;
	const struct crush_rule *rule;
	__u32 step;
	int i, j;
	int numrep;
	int out_size;
	/*
	 * the original choose_total_tries value was off by one (it
	 * counted "retries" and not "tries"). add one.
	 */
	int choose_tries = map->choose_total_tries + 1;
	int choose_leaf_tries = 0;
	/*
	 * the local tries values were counted as "retries", though,
	 * and need no adjustment
	 */
	int choose_local_retries = map->choose_local_tries;
	int choose_local_fallback_retries = map->choose_local_fallback_tries;

	int vary_r = map->chooseleaf_vary_r;
	int stable = map->chooseleaf_stable;

	if ((__u32)ruleno >= map->max_rules) {
		dprintk(" bad ruleno %d\n", ruleno);
		return 0;
	}

	rule = map->rules[ruleno];
	result_len = 0;

	/* interpret the rule's steps in order */
	for (step = 0; step < rule->len; step++) {
		int firstn = 0;
		const struct crush_rule_step *curstep = &rule->steps[step];

		switch (curstep->op) {
		case CRUSH_RULE_TAKE:
			/* seed the working vector with a device or bucket id,
			 * after validating it exists in the map */
			if ((curstep->arg1 >= 0 &&
			     curstep->arg1 < map->max_devices) ||
			    (-1-curstep->arg1 >= 0 &&
			     -1-curstep->arg1 < map->max_buckets &&
			     map->buckets[-1-curstep->arg1])) {
				w[0] = curstep->arg1;
				wsize = 1;
			} else {
				dprintk(" bad take value %d\n", curstep->arg1);
			}
			break;

		case CRUSH_RULE_SET_CHOOSE_TRIES:
			if (curstep->arg1 > 0)
				choose_tries = curstep->arg1;
			break;

		case CRUSH_RULE_SET_CHOOSELEAF_TRIES:
			if (curstep->arg1 > 0)
				choose_leaf_tries = curstep->arg1;
			break;

		case CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES:
			if (curstep->arg1 >= 0)
				choose_local_retries = curstep->arg1;
			break;

		case CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES:
			if (curstep->arg1 >= 0)
				choose_local_fallback_retries = curstep->arg1;
			break;

		case CRUSH_RULE_SET_CHOOSELEAF_VARY_R:
			if (curstep->arg1 >= 0)
				vary_r = curstep->arg1;
			break;

		case CRUSH_RULE_SET_CHOOSELEAF_STABLE:
			if (curstep->arg1 >= 0)
				stable = curstep->arg1;
			break;

		case CRUSH_RULE_CHOOSELEAF_FIRSTN:
		case CRUSH_RULE_CHOOSE_FIRSTN:
			firstn = 1;
			/* fall through */
		case CRUSH_RULE_CHOOSELEAF_INDEP:
		case CRUSH_RULE_CHOOSE_INDEP:
			if (wsize == 0)
				break;

			recurse_to_leaf =
				curstep->op ==
				CRUSH_RULE_CHOOSELEAF_FIRSTN ||
				curstep->op ==
				CRUSH_RULE_CHOOSELEAF_INDEP;

			/* reset output */
			osize = 0;

			/* run the choose for each item in the working vector */
			for (i = 0; i < wsize; i++) {
				int bno;
				numrep = curstep->arg1;
				/* arg1 <= 0 means "result_max minus |arg1|" */
				if (numrep <= 0) {
					numrep += result_max;
					if (numrep <= 0)
						continue;
				}
				j = 0;
				/* make sure bucket id is valid */
				bno = -1 - w[i];
				if (bno < 0 || bno >= map->max_buckets) {
					// w[i] is probably CRUSH_ITEM_NONE
					dprintk(" bad w[i] %d\n", w[i]);
					continue;
				}
				if (firstn) {
					int recurse_tries;
					if (choose_leaf_tries)
						recurse_tries =
							choose_leaf_tries;
					else if (map->chooseleaf_descend_once)
						recurse_tries = 1;
					else
						recurse_tries = choose_tries;
					osize += crush_choose_firstn(
						map,
						cw,
						map->buckets[bno],
						weight, weight_max,
						x, numrep,
						curstep->arg2,
						o+osize, j,
						result_max-osize,
						choose_tries,
						recurse_tries,
						choose_local_retries,
						choose_local_fallback_retries,
						recurse_to_leaf,
						vary_r,
						stable,
						c+osize,
						0,
						choose_args);
				} else {
					out_size = ((numrep < (result_max-osize)) ?
						    numrep : (result_max-osize));
					crush_choose_indep(
						map,
						cw,
						map->buckets[bno],
						weight, weight_max,
						x, out_size, numrep,
						curstep->arg2,
						o+osize, j,
						choose_tries,
						choose_leaf_tries ?
						choose_leaf_tries : 1,
						recurse_to_leaf,
						c+osize,
						0,
						choose_args);
					osize += out_size;
				}
			}

			if (recurse_to_leaf)
				/* copy final _leaf_ values to output set */
				memcpy(o, c, osize*sizeof(*o));

			/* swap o and w arrays */
			tmp = o;
			o = w;
			w = tmp;
			wsize = osize;
			break;


		case CRUSH_RULE_EMIT:
			/* append the working vector to the caller's result
			 * and clear it for any subsequent take/choose */
			for (i = 0; i < wsize && result_len < result_max; i++) {
				result[result_len] = w[i];
				result_len++;
			}
			wsize = 0;
			break;

		default:
			dprintk(" unknown op %d at step %d\n",
				curstep->op, step);
			break;
		}
	}

	return result_len;
}
1106