1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #ifndef CEPH_FRAG_H
16 #define CEPH_FRAG_H
17
18 #include <boost/container/small_vector.hpp>
19
20 #include <iostream>
21
22 #include <stdint.h>
23 #include <stdio.h>
24
25 #include "buffer.h"
26 #include "compact_map.h"
27
28 #include "ceph_frag.h"
29 #include "include/encoding.h"
30 #include "include/ceph_assert.h"
31
32 #include "common/dout.h"
33
34 /*
35 *
36 * the goal here is to use a binary split strategy to partition a namespace.
37 * frag_t represents a particular fragment. bits() tells you the size of the
38 * fragment, and value() it's name. this is roughly analogous to an ip address
39 * and netmask.
40 *
41 * fragtree_t represents an entire namespace and it's partition. it essentially
42 * tells you where fragments are split into other fragments, and by how much
43 * (i.e. by how many bits, resulting in a power of 2 number of child fragments).
44 *
45 * this vaguely resembles a btree, in that when a fragment becomes large or small
46 * we can split or merge, except that there is no guarantee of being balanced.
47 *
48 * presumably we are partitioning the output of a (perhaps specialized) hash
49 * function.
50 */
51
52 /**
53 * frag_t
54 *
55 * description of an individual fragment. that is, a particular piece
56 * of the overall namespace.
57 *
58 * this is conceptually analogous to an ip address and netmask.
59 *
60 * a value v falls "within" fragment f iff (v & f.mask()) == f.value().
61 *
62 * we write it as v/b, where v is a value and b is the number of bits.
63 * 0/0 (bits==0) corresponds to the entire namespace. if we bisect that,
64 * we get 0/1 and 1/1. quartering gives us 0/2, 1/2, 2/2, 3/2. and so on.
65 *
66 * this makes the right most bit of v the "most significant", which is the
67 * opposite of what we usually see.
68 */
69
70 /*
71 * TODO:
72 * - get_first_child(), next_sibling(int parent_bits) to make (possibly partial)
73 * iteration efficient (see, e.g., try_assimilate_children()
74 * - rework frag_t so that we mask the left-most (most significant) bits instead of
75 * the right-most (least significant) bits. just because it's more intuitive, and
76 * matches the network/netmask concept.
77 */
78
79 class frag_t {
80 /*
81 * encoding is dictated by frag_* functions in ceph_fs.h. use those
82 * helpers _exclusively_.
83 */
84 public:
85 using _frag_t = uint32_t;
86
87 frag_t() = default;
88 frag_t(unsigned v, unsigned b) : _enc(ceph_frag_make(b, v)) { }
89 frag_t(_frag_t e) : _enc(e) { }
90
91 // constructors
92 void from_unsigned(unsigned e) { _enc = e; }
93
94 // accessors
95 unsigned value() const { return ceph_frag_value(_enc); }
96 unsigned bits() const { return ceph_frag_bits(_enc); }
97 unsigned mask() const { return ceph_frag_mask(_enc); }
98 unsigned mask_shift() const { return ceph_frag_mask_shift(_enc); }
99
100 operator _frag_t() const { return _enc; }
101
102 // tests
103 bool contains(unsigned v) const { return ceph_frag_contains_value(_enc, v); }
104 bool contains(frag_t sub) const { return ceph_frag_contains_frag(_enc, sub._enc); }
105 bool is_root() const { return bits() == 0; }
106 frag_t parent() const {
107 ceph_assert(bits() > 0);
108 return frag_t(ceph_frag_parent(_enc));
109 }
110
111 // splitting
112 frag_t make_child(int i, int nb) const {
113 ceph_assert(i < (1<<nb));
114 return frag_t(ceph_frag_make_child(_enc, nb, i));
115 }
116 template<typename T>
117 void split(int nb, T& fragments) const {
118 ceph_assert(nb > 0);
119 unsigned nway = 1 << nb;
120 for (unsigned i=0; i<nway; i++)
121 fragments.push_back(make_child(i, nb));
122 }
123
124 // binary splitting
125 frag_t left_child() const { return frag_t(ceph_frag_left_child(_enc)); }
126 frag_t right_child() const { return frag_t(ceph_frag_right_child(_enc)); }
127
128 bool is_left() const { return ceph_frag_is_left_child(_enc); }
129 bool is_right() const { return ceph_frag_is_right_child(_enc); }
130 frag_t get_sibling() const {
131 ceph_assert(!is_root());
132 return frag_t(ceph_frag_sibling(_enc));
133 }
134
135 // sequencing
136 bool is_leftmost() const { return ceph_frag_is_leftmost(_enc); }
137 bool is_rightmost() const { return ceph_frag_is_rightmost(_enc); }
138 frag_t next() const {
139 ceph_assert(!is_rightmost());
140 return frag_t(ceph_frag_next(_enc));
141 }
142
143 // parse
144 bool parse(const char *s) {
145 int pvalue, pbits;
146 int r = sscanf(s, "%x/%d", &pvalue, &pbits);
147 if (r == 2) {
148 *this = frag_t(pvalue, pbits);
149 return true;
150 }
151 return false;
152 }
153
154 void encode(bufferlist& bl) const {
(1) Event overrun-buffer-val: |
Overrunning buffer pointed to by "this->_enc" of 4 bytes by passing it to a function which accesses it at byte offset 7. [details] |
155 encode_raw(_enc, bl);
156 }
157 void decode(bufferlist::const_iterator& p) {
158 __u32 v;
159 decode_raw(v, p);
160 _enc = v;
161 }
162
163 private:
164 _frag_t _enc = 0;
165 };
166
167 inline std::ostream& operator<<(std::ostream& out, const frag_t& hb)
168 {
169 //out << std::hex << hb.value() << std::dec << "/" << hb.bits() << '=';
170 unsigned num = hb.bits();
171 if (num) {
172 unsigned val = hb.value();
173 for (unsigned bit = 23; num; num--, bit--)
174 out << ((val & (1<<bit)) ? '1':'0');
175 }
176 return out << '*';
177 }
178
179 inline void encode(const frag_t &f, bufferlist& bl) { f.encode(bl); }
180 inline void decode(frag_t &f, bufferlist::const_iterator& p) { f.decode(p); }
181
182 using frag_vec_t = boost::container::small_vector<frag_t, 4>;
183
184 /**
185 * fragtree_t -- partition an entire namespace into one or more frag_t's.
186 */
187 class fragtree_t {
188 // pairs <f, b>:
189 // frag_t f is split by b bits.
190 // if child frag_t does not appear, it is not split.
191 public:
192 compact_map<frag_t,int32_t> _splits;
193
194 public:
195 // -------------
196 // basics
197 void swap(fragtree_t& other) {
198 _splits.swap(other._splits);
199 }
200 void clear() {
201 _splits.clear();
202 }
203
204 // -------------
205 // accessors
206 bool empty() const {
207 return _splits.empty();
208 }
209 int get_split(const frag_t hb) const {
210 compact_map<frag_t,int32_t>::const_iterator p = _splits.find(hb);
211 if (p == _splits.end())
212 return 0;
213 else
214 return p->second;
215 }
216
217
218 bool is_leaf(frag_t x) const {
219 frag_vec_t s;
220 get_leaves_under(x, s);
221 //generic_dout(10) << "is_leaf(" << x << ") -> " << ls << dendl;
222 return s.size() == 1 && s.front() == x;
223 }
224
225 /**
226 * get_leaves -- list all leaves
227 */
228 template<typename T>
229 void get_leaves(T& c) const {
230 return get_leaves_under_split(frag_t(), c);
231 }
232
233 /**
234 * get_leaves_under_split -- list all leaves under a known split point (or root)
235 */
236 template<typename T>
237 void get_leaves_under_split(frag_t under, T& c) const {
238 frag_vec_t s;
239 s.push_back(under);
240 while (!s.empty()) {
241 frag_t t = s.back();
242 s.pop_back();
243 int nb = get_split(t);
244 if (nb)
245 t.split(nb, s); // queue up children
246 else
247 c.push_back(t); // not spit, it's a leaf.
248 }
249 }
250
251 /**
252 * get_branch -- get branch point at OR above frag @a x
253 * - may be @a x itself, if @a x is a split
254 * - may be root (frag_t())
255 */
256 frag_t get_branch(frag_t x) const {
257 while (1) {
258 if (x == frag_t()) return x; // root
259 if (get_split(x)) return x; // found it!
260 x = x.parent();
261 }
262 }
263
264 /**
265 * get_branch_above -- get a branch point above frag @a x
266 * - may be root (frag_t())
267 * - may NOT be @a x, even if @a x is a split.
268 */
269 frag_t get_branch_above(frag_t x) const {
270 while (1) {
271 if (x == frag_t()) return x; // root
272 x = x.parent();
273 if (get_split(x)) return x; // found it!
274 }
275 }
276
277
278 /**
279 * get_branch_or_leaf -- get branch or leaf point parent for frag @a x
280 * - may be @a x itself, if @a x is a split or leaf
281 * - may be root (frag_t())
282 */
283 frag_t get_branch_or_leaf(frag_t x) const {
284 frag_t branch = get_branch(x);
285 int nb = get_split(branch);
286 if (nb > 0 && // if branch is a split, and
287 branch.bits() + nb <= x.bits()) // one of the children is or contains x
288 return frag_t(x.value(), branch.bits()+nb); // then return that child (it's a leaf)
289 else
290 return branch;
291 }
292
293 /**
294 * get_leaves_under(x, ls) -- search for any leaves fully contained by x
295 */
296 template<typename T>
297 void get_leaves_under(frag_t x, T& c) const {
298 frag_vec_t s;
299 s.push_back(get_branch_or_leaf(x));
300 while (!s.empty()) {
301 frag_t t = s.back();
302 s.pop_back();
303 if (t.bits() >= x.bits() && // if t is more specific than x, and
304 !x.contains(t)) // x does not contain t,
305 continue; // then skip
306 int nb = get_split(t);
307 if (nb)
308 t.split(nb, s); // queue up children
309 else if (x.contains(t))
310 c.push_back(t); // not spit, it's a leaf.
311 }
312 }
313
314 /**
315 * contains(fg) -- does fragtree contain the specific frag @a x
316 */
317 bool contains(frag_t x) const {
318 frag_vec_t s;
319 s.push_back(get_branch(x));
320 while (!s.empty()) {
321 frag_t t = s.back();
322 s.pop_back();
323 if (t.bits() >= x.bits() && // if t is more specific than x, and
324 !x.contains(t)) // x does not contain t,
325 continue; // then skip
326 int nb = get_split(t);
327 if (nb) {
328 if (t == x) return false; // it's split.
329 t.split(nb, s); // queue up children
330 } else {
331 if (t == x) return true; // it's there.
332 }
333 }
334 return false;
335 }
336
337 /**
338 * operator[] -- map a (hash?) value to a frag
339 */
340 frag_t operator[](unsigned v) const {
341 frag_t t;
342 while (1) {
343 ceph_assert(t.contains(v));
344 int nb = get_split(t);
345
346 // is this a leaf?
347 if (nb == 0) return t; // done.
348
349 // pick appropriate child fragment.
350 unsigned nway = 1 << nb;
351 unsigned i;
352 for (i=0; i<nway; i++) {
353 frag_t n = t.make_child(i, nb);
354 if (n.contains(v)) {
355 t = n;
356 break;
357 }
358 }
359 ceph_assert(i < nway);
360 }
361 }
362
363
364 // ---------------
365 // modifiers
366 void split(frag_t x, int b, bool simplify=true) {
367 ceph_assert(is_leaf(x));
368 _splits[x] = b;
369
370 if (simplify)
371 try_assimilate_children(get_branch_above(x));
372 }
373 void merge(frag_t x, int b, bool simplify=true) {
374 ceph_assert(!is_leaf(x));
375 ceph_assert(_splits[x] == b);
376 _splits.erase(x);
377
378 if (simplify)
379 try_assimilate_children(get_branch_above(x));
380 }
381
382 /*
383 * if all of a given split's children are identically split,
384 * then the children can be assimilated.
385 */
386 void try_assimilate_children(frag_t x) {
387 int nb = get_split(x);
388 if (!nb) return;
389 frag_vec_t children;
390 x.split(nb, children);
391 int childbits = 0;
392 for (auto& frag : children) {
393 int cb = get_split(frag);
394 if (!cb) return; // nope.
395 if (childbits && cb != childbits) return; // not the same
396 childbits = cb;
397 }
398 // all children are split with childbits!
399 for (auto& frag : children)
400 _splits.erase(frag);
401 _splits[x] += childbits;
402 }
403
404 bool force_to_leaf(CephContext *cct, frag_t x) {
405 if (is_leaf(x))
406 return false;
407
408 lgeneric_dout(cct, 10) << "force_to_leaf " << x << " on " << _splits << dendl;
409
410 frag_t parent = get_branch_or_leaf(x);
411 ceph_assert(parent.bits() <= x.bits());
412 lgeneric_dout(cct, 10) << "parent is " << parent << dendl;
413
414 // do we need to split from parent to x?
415 if (parent.bits() < x.bits()) {
416 int spread = x.bits() - parent.bits();
417 int nb = get_split(parent);
418 lgeneric_dout(cct, 10) << "spread " << spread << ", parent splits by " << nb << dendl;
419 if (nb == 0) {
420 // easy: split parent (a leaf) by the difference
421 lgeneric_dout(cct, 10) << "splitting parent " << parent << " by spread " << spread << dendl;
422 split(parent, spread);
423 ceph_assert(is_leaf(x));
424 return true;
425 }
426 ceph_assert(nb > spread);
427
428 // add an intermediary split
429 merge(parent, nb, false);
430 split(parent, spread, false);
431
432 frag_vec_t subs;
433 parent.split(spread, subs);
434 for (auto& frag : subs) {
435 lgeneric_dout(cct, 10) << "splitting intermediate " << frag << " by " << (nb-spread) << dendl;
436 split(frag, nb - spread, false);
437 }
438 }
439
440 // x is now a leaf or split.
441 // hoover up any children.
442 frag_vec_t s;
443 s.push_back(x);
444 while (!s.empty()) {
445 frag_t t = s.back();
446 s.pop_back();
447 int nb = get_split(t);
448 if (nb) {
449 lgeneric_dout(cct, 10) << "merging child " << t << " by " << nb << dendl;
450 merge(t, nb, false); // merge this point, and
451 t.split(nb, s); // queue up children
452 }
453 }
454
455 lgeneric_dout(cct, 10) << "force_to_leaf done" << dendl;
456 ceph_assert(is_leaf(x));
457 return true;
458 }
459
460 // encoding
461 void encode(bufferlist& bl) const {
462 using ceph::encode;
463 encode(_splits, bl);
464 }
465 void decode(bufferlist::const_iterator& p) {
466 using ceph::decode;
467 decode(_splits, p);
468 }
469 void encode_nohead(bufferlist& bl) const {
470 using ceph::encode;
471 for (compact_map<frag_t,int32_t>::const_iterator p = _splits.begin();
472 p != _splits.end();
473 ++p) {
474 encode(p->first, bl);
475 encode(p->second, bl);
476 }
477 }
478 void decode_nohead(int n, bufferlist::const_iterator& p) {
479 using ceph::decode;
480 _splits.clear();
481 while (n-- > 0) {
482 frag_t f;
483 decode(f, p);
484 decode(_splits[f], p);
485 }
486 }
487
488 void print(std::ostream& out) {
489 out << "fragtree_t(";
490 frag_vec_t s;
491 s.push_back(frag_t());
492 while (!s.empty()) {
493 frag_t t = s.back();
494 s.pop_back();
495 // newline + indent?
496 if (t.bits()) {
497 out << std::endl;
498 for (unsigned i=0; i<t.bits(); i++) out << ' ';
499 }
500 int nb = get_split(t);
501 if (nb) {
502 out << t << " %" << nb;
503 t.split(nb, s); // queue up children
504 } else {
505 out << t;
506 }
507 }
508 out << ")";
509 }
510
511 void dump(Formatter *f) const {
512 f->open_array_section("splits");
513 for (compact_map<frag_t,int32_t>::const_iterator p = _splits.begin();
514 p != _splits.end();
515 ++p) {
516 f->open_object_section("split");
517 std::ostringstream frag_str;
518 frag_str << p->first;
519 f->dump_string("frag", frag_str.str());
520 f->dump_int("children", p->second);
521 f->close_section(); // split
522 }
523 f->close_section(); // splits
524 }
525 };
526 WRITE_CLASS_ENCODER(fragtree_t)
527
528 inline bool operator==(const fragtree_t& l, const fragtree_t& r) {
529 return l._splits == r._splits;
530 }
531 inline bool operator!=(const fragtree_t& l, const fragtree_t& r) {
532 return l._splits != r._splits;
533 }
534
535 inline std::ostream& operator<<(std::ostream& out, const fragtree_t& ft)
536 {
537 out << "fragtree_t(";
538
539 for (compact_map<frag_t,int32_t>::const_iterator p = ft._splits.begin();
540 p != ft._splits.end();
541 ++p) {
542 if (p != ft._splits.begin())
543 out << " ";
544 out << p->first << "^" << p->second;
545 }
546 return out << ")";
547 }
548
549
550 /**
551 * fragset_t -- a set of fragments
552 */
553 class fragset_t {
554 std::set<frag_t> _set;
555
556 public:
557 const std::set<frag_t> &get() const { return _set; }
558 std::set<frag_t>::iterator begin() { return _set.begin(); }
559 std::set<frag_t>::iterator end() { return _set.end(); }
560
561 bool empty() const { return _set.empty(); }
562
563 bool contains(frag_t f) const {
564 while (1) {
565 if (_set.count(f)) return true;
566 if (f.bits() == 0) return false;
567 f = f.parent();
568 }
569 }
570
571 void insert(frag_t f) {
572 _set.insert(f);
573 simplify();
574 }
575
576 void simplify() {
577 while (1) {
578 bool clean = true;
579 std::set<frag_t>::iterator p = _set.begin();
580 while (p != _set.end()) {
581 if (!p->is_root() &&
582 _set.count(p->get_sibling())) {
583 _set.erase(p->get_sibling());
584 _set.insert(p->parent());
585 _set.erase(p++);
586 clean = false;
587 } else {
588 p++;
589 }
590 }
591 if (clean)
592 break;
593 }
594 }
595 };
596
597 inline std::ostream& operator<<(std::ostream& out, const fragset_t& fs)
598 {
599 return out << "fragset_t(" << fs.get() << ")";
600 }
601
602 #endif
603