1    	// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
2    	// vim: ts=8 sw=2 smarttab
3    	/*
4    	 * Ceph - scalable distributed file system
5    	 *
6    	 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7    	 *
8    	 * This is free software; you can redistribute it and/or
9    	 * modify it under the terms of the GNU Lesser General Public
10   	 * License version 2.1, as published by the Free Software 
11   	 * Foundation.  See file COPYING.
12   	 * 
13   	 */
14   	
15   	#ifndef CEPH_FRAG_H
16   	#define CEPH_FRAG_H
17   	
18   	#include <boost/container/small_vector.hpp>
19   	
20   	#include <iostream>
21   	
22   	#include <stdint.h>
23   	#include <stdio.h>
24   	
25   	#include "buffer.h"
26   	#include "compact_map.h"
27   	
28   	#include "ceph_frag.h"
29   	#include "include/encoding.h"
30   	#include "include/ceph_assert.h"
31   	
32   	#include "common/dout.h"
33   	
34   	/*
35   	 * 
36   	 * the goal here is to use a binary split strategy to partition a namespace.  
37   	 * frag_t represents a particular fragment.  bits() tells you the size of the
38   	 * fragment, and value() it's name.  this is roughly analogous to an ip address
39   	 * and netmask.
40   	 * 
41   	 * fragtree_t represents an entire namespace and it's partition.  it essentially 
42   	 * tells you where fragments are split into other fragments, and by how much 
43   	 * (i.e. by how many bits, resulting in a power of 2 number of child fragments).
44   	 * 
45   	 * this vaguely resembles a btree, in that when a fragment becomes large or small
46   	 * we can split or merge, except that there is no guarantee of being balanced.
47   	 *
48   	 * presumably we are partitioning the output of a (perhaps specialized) hash 
49   	 * function.
50   	 */
51   	
52   	/**
53   	 * frag_t
54   	 *
55   	 * description of an individual fragment.  that is, a particular piece
56   	 * of the overall namespace.
57   	 *
58   	 * this is conceptually analogous to an ip address and netmask.
59   	 *
60   	 * a value v falls "within" fragment f iff (v & f.mask()) == f.value().
61   	 *
62   	 * we write it as v/b, where v is a value and b is the number of bits.
63   	 * 0/0 (bits==0) corresponds to the entire namespace.  if we bisect that,
64   	 * we get 0/1 and 1/1.  quartering gives us 0/2, 1/2, 2/2, 3/2.  and so on.
65   	 *
66   	 * this makes the right most bit of v the "most significant", which is the 
67   	 * opposite of what we usually see.
68   	 */
69   	
70   	/*
71   	 * TODO:
72   	 *  - get_first_child(), next_sibling(int parent_bits) to make (possibly partial) 
73   	 *    iteration efficient (see, e.g., try_assimilate_children()
74   	 *  - rework frag_t so that we mask the left-most (most significant) bits instead of
75   	 *    the right-most (least significant) bits.  just because it's more intuitive, and
76   	 *    matches the network/netmask concept.
77   	 */
78   	
79   	class frag_t {
80   	  /*
81   	   * encoding is dictated by frag_* functions in ceph_fs.h.  use those
82   	   * helpers _exclusively_.
83   	   */
84   	public:
85   	  using _frag_t = uint32_t;
86   	  
87   	  frag_t() = default;
88   	  frag_t(unsigned v, unsigned b) : _enc(ceph_frag_make(b, v)) { }
89   	  frag_t(_frag_t e) : _enc(e) { }
90   	
91   	  // constructors
92   	  void from_unsigned(unsigned e) { _enc = e; }
93   	  
94   	  // accessors
95   	  unsigned value() const { return ceph_frag_value(_enc); }
96   	  unsigned bits() const { return ceph_frag_bits(_enc); }
97   	  unsigned mask() const { return ceph_frag_mask(_enc); }
98   	  unsigned mask_shift() const { return ceph_frag_mask_shift(_enc); }
99   	
100  	  operator _frag_t() const { return _enc; }
101  	
102  	  // tests
103  	  bool contains(unsigned v) const { return ceph_frag_contains_value(_enc, v); }
104  	  bool contains(frag_t sub) const { return ceph_frag_contains_frag(_enc, sub._enc); }
105  	  bool is_root() const { return bits() == 0; }
106  	  frag_t parent() const {
107  	    ceph_assert(bits() > 0);
108  	    return frag_t(ceph_frag_parent(_enc));
109  	  }
110  	
111  	  // splitting
112  	  frag_t make_child(int i, int nb) const {
113  	    ceph_assert(i < (1<<nb));
114  	    return frag_t(ceph_frag_make_child(_enc, nb, i));
115  	  }
116  	  template<typename T>
117  	  void split(int nb, T& fragments) const {
118  	    ceph_assert(nb > 0);
119  	    unsigned nway = 1 << nb;
120  	    for (unsigned i=0; i<nway; i++) 
121  	      fragments.push_back(make_child(i, nb));
122  	  }
123  	
124  	  // binary splitting
125  	  frag_t left_child() const { return frag_t(ceph_frag_left_child(_enc)); }
126  	  frag_t right_child() const { return frag_t(ceph_frag_right_child(_enc)); }
127  	
128  	  bool is_left() const { return ceph_frag_is_left_child(_enc); }
129  	  bool is_right() const { return ceph_frag_is_right_child(_enc); }
130  	  frag_t get_sibling() const {
131  	    ceph_assert(!is_root());
132  	    return frag_t(ceph_frag_sibling(_enc));
133  	  }
134  	
135  	  // sequencing
136  	  bool is_leftmost() const { return ceph_frag_is_leftmost(_enc); }
137  	  bool is_rightmost() const { return ceph_frag_is_rightmost(_enc); }
138  	  frag_t next() const {
139  	    ceph_assert(!is_rightmost());
140  	    return frag_t(ceph_frag_next(_enc));
141  	  }
142  	
143  	  // parse
144  	  bool parse(const char *s) {
145  	    int pvalue, pbits;
146  	    int r = sscanf(s, "%x/%d", &pvalue, &pbits);
147  	    if (r == 2) {
148  	      *this = frag_t(pvalue, pbits);
149  	      return true;
150  	    }
151  	    return false;
152  	  }
153  	
154  	  void encode(bufferlist& bl) const {
(1) Event overrun-buffer-val: Overrunning buffer pointed to by "this->_enc" of 4 bytes by passing it to a function which accesses it at byte offset 7. [details]
155  	    encode_raw(_enc, bl);
156  	  }
157  	  void decode(bufferlist::const_iterator& p) {
158  	    __u32 v;
159  	    decode_raw(v, p);
160  	    _enc = v;
161  	  }
162  	
163  	private:
164  	  _frag_t _enc = 0;
165  	};
166  	
167  	inline std::ostream& operator<<(std::ostream& out, const frag_t& hb)
168  	{
169  	  //out << std::hex << hb.value() << std::dec << "/" << hb.bits() << '=';
170  	  unsigned num = hb.bits();
171  	  if (num) {
172  	    unsigned val = hb.value();
173  	    for (unsigned bit = 23; num; num--, bit--) 
174  	      out << ((val & (1<<bit)) ? '1':'0');
175  	  }
176  	  return out << '*';
177  	}
178  	
179  	inline void encode(const frag_t &f, bufferlist& bl) { f.encode(bl); }
180  	inline void decode(frag_t &f, bufferlist::const_iterator& p) { f.decode(p); }
181  	
182  	using frag_vec_t = boost::container::small_vector<frag_t, 4>;
183  	
184  	/**
185  	 * fragtree_t -- partition an entire namespace into one or more frag_t's. 
186  	 */
187  	class fragtree_t {
188  	  // pairs <f, b>:
189  	  //  frag_t f is split by b bits.
190  	  //  if child frag_t does not appear, it is not split.
191  	public:
192  	  compact_map<frag_t,int32_t> _splits;
193  	
194  	public:
195  	  // -------------
196  	  // basics
197  	  void swap(fragtree_t& other) {
198  	    _splits.swap(other._splits);
199  	  }
200  	  void clear() {
201  	    _splits.clear();
202  	  }
203  	
204  	  // -------------
205  	  // accessors
206  	  bool empty() const { 
207  	    return _splits.empty();
208  	  }
209  	  int get_split(const frag_t hb) const {
210  	    compact_map<frag_t,int32_t>::const_iterator p = _splits.find(hb);
211  	    if (p == _splits.end())
212  	      return 0;
213  	    else
214  	      return p->second;
215  	  }
216  	
217  	  
218  	  bool is_leaf(frag_t x) const {
219  	    frag_vec_t s;
220  	    get_leaves_under(x, s);
221  	    //generic_dout(10) << "is_leaf(" << x << ") -> " << ls << dendl;
222  	    return s.size() == 1 && s.front() == x;
223  	  }
224  	
225  	  /**
226  	   * get_leaves -- list all leaves
227  	   */
228  	  template<typename T>
229  	  void get_leaves(T& c) const {
230  	    return get_leaves_under_split(frag_t(), c);
231  	  }
232  	
233  	  /**
234  	   * get_leaves_under_split -- list all leaves under a known split point (or root)
235  	   */
236  	  template<typename T>
237  	  void get_leaves_under_split(frag_t under, T& c) const {
238  	    frag_vec_t s;
239  	    s.push_back(under);
240  	    while (!s.empty()) {
241  	      frag_t t = s.back();
242  	      s.pop_back();
243  	      int nb = get_split(t);
244  	      if (nb) 
245  		t.split(nb, s);   // queue up children
246  	      else
247  		c.push_back(t);  // not spit, it's a leaf.
248  	    }
249  	  }
250  	
251  	  /**
252  	   * get_branch -- get branch point at OR above frag @a x
253  	   *  - may be @a x itself, if @a x is a split
254  	   *  - may be root (frag_t())
255  	   */
256  	  frag_t get_branch(frag_t x) const {
257  	    while (1) {
258  	      if (x == frag_t()) return x;  // root
259  	      if (get_split(x)) return x;   // found it!
260  	      x = x.parent();
261  	    }
262  	  }
263  	
264  	  /**
265  	   * get_branch_above -- get a branch point above frag @a x
266  	   *  - may be root (frag_t())
267  	   *  - may NOT be @a x, even if @a x is a split.
268  	   */
269  	  frag_t get_branch_above(frag_t x) const {
270  	    while (1) {
271  	      if (x == frag_t()) return x;  // root
272  	      x = x.parent();
273  	      if (get_split(x)) return x;   // found it!
274  	    }
275  	  }
276  	
277  	
278  	  /**
279  	   * get_branch_or_leaf -- get branch or leaf point parent for frag @a x
280  	   *  - may be @a x itself, if @a x is a split or leaf
281  	   *  - may be root (frag_t())
282  	   */
283  	  frag_t get_branch_or_leaf(frag_t x) const {
284  	    frag_t branch = get_branch(x);
285  	    int nb = get_split(branch);
286  	    if (nb > 0 &&                                  // if branch is a split, and
287  		branch.bits() + nb <= x.bits())            // one of the children is or contains x 
288  	      return frag_t(x.value(), branch.bits()+nb);  // then return that child (it's a leaf)
289  	    else
290  	      return branch;
291  	  }
292  	
293  	  /**
294  	   * get_leaves_under(x, ls) -- search for any leaves fully contained by x
295  	   */
296  	  template<typename T>
297  	  void get_leaves_under(frag_t x, T& c) const {
298  	    frag_vec_t s;
299  	    s.push_back(get_branch_or_leaf(x));
300  	    while (!s.empty()) {
301  	      frag_t t = s.back();
302  	      s.pop_back();
303  	      if (t.bits() >= x.bits() &&    // if t is more specific than x, and
304  		  !x.contains(t))            // x does not contain t,
305  		continue;         // then skip
306  	      int nb = get_split(t);
307  	      if (nb) 
308  		t.split(nb, s);   // queue up children
309  	      else if (x.contains(t))
310  		c.push_back(t);  // not spit, it's a leaf.
311  	    }
312  	  }
313  	
314  	  /**
315  	   * contains(fg) -- does fragtree contain the specific frag @a x
316  	   */
317  	  bool contains(frag_t x) const {
318  	    frag_vec_t s;
319  	    s.push_back(get_branch(x));
320  	    while (!s.empty()) {
321  	      frag_t t = s.back();
322  	      s.pop_back();
323  	      if (t.bits() >= x.bits() &&  // if t is more specific than x, and
324  		  !x.contains(t))          // x does not contain t,
325  		continue;         // then skip 
326  	      int nb = get_split(t);
327  	      if (nb) {
328  		if (t == x) return false;  // it's split.
329  		t.split(nb, s);   // queue up children
330  	      } else {
331  		if (t == x) return true;   // it's there.
332  	      }
333  	    }
334  	    return false;
335  	  }
336  	
337  	  /** 
338  	   * operator[] -- map a (hash?) value to a frag
339  	   */
340  	  frag_t operator[](unsigned v) const {
341  	    frag_t t;
342  	    while (1) {
343  	      ceph_assert(t.contains(v));
344  	      int nb = get_split(t);
345  	
346  	      // is this a leaf?
347  	      if (nb == 0) return t;  // done.
348  	      
349  	      // pick appropriate child fragment.
350  	      unsigned nway = 1 << nb;
351  	      unsigned i;
352  	      for (i=0; i<nway; i++) {
353  		frag_t n = t.make_child(i, nb);
354  		if (n.contains(v)) {
355  		  t = n;
356  		  break;
357  		}
358  	      }
359  	      ceph_assert(i < nway);
360  	    }
361  	  }
362  	
363  	
364  	  // ---------------
365  	  // modifiers
366  	  void split(frag_t x, int b, bool simplify=true) {
367  	    ceph_assert(is_leaf(x));
368  	    _splits[x] = b;
369  	    
370  	    if (simplify)
371  	      try_assimilate_children(get_branch_above(x));
372  	  }
373  	  void merge(frag_t x, int b, bool simplify=true) {
374  	    ceph_assert(!is_leaf(x));
375  	    ceph_assert(_splits[x] == b);
376  	    _splits.erase(x);
377  	
378  	    if (simplify)
379  	      try_assimilate_children(get_branch_above(x));
380  	  }
381  	
382  	  /*
383  	   * if all of a given split's children are identically split,
384  	   * then the children can be assimilated.
385  	   */
386  	  void try_assimilate_children(frag_t x) {
387  	    int nb = get_split(x);
388  	    if (!nb) return;
389  	    frag_vec_t children;
390  	    x.split(nb, children);
391  	    int childbits = 0;
392  	    for (auto& frag : children) {
393  	      int cb = get_split(frag);
394  	      if (!cb) return;  // nope.
395  	      if (childbits && cb != childbits) return;  // not the same
396  	      childbits = cb;
397  	    }
398  	    // all children are split with childbits!
399  	    for (auto& frag : children)
400  	      _splits.erase(frag);
401  	    _splits[x] += childbits;
402  	  }
403  	
404  	  bool force_to_leaf(CephContext *cct, frag_t x) {
405  	    if (is_leaf(x))
406  	      return false;
407  	
408  	    lgeneric_dout(cct, 10) << "force_to_leaf " << x << " on " << _splits << dendl;
409  	
410  	    frag_t parent = get_branch_or_leaf(x);
411  	    ceph_assert(parent.bits() <= x.bits());
412  	    lgeneric_dout(cct, 10) << "parent is " << parent << dendl;
413  	
414  	    // do we need to split from parent to x?
415  	    if (parent.bits() < x.bits()) {
416  	      int spread = x.bits() - parent.bits();
417  	      int nb = get_split(parent);
418  	      lgeneric_dout(cct, 10) << "spread " << spread << ", parent splits by " << nb << dendl;
419  	      if (nb == 0) {
420  		// easy: split parent (a leaf) by the difference
421  		lgeneric_dout(cct, 10) << "splitting parent " << parent << " by spread " << spread << dendl;
422  		split(parent, spread);
423  		ceph_assert(is_leaf(x));
424  		return true;
425  	      }
426  	      ceph_assert(nb > spread);
427  	      
428  	      // add an intermediary split
429  	      merge(parent, nb, false);
430  	      split(parent, spread, false);
431  	
432  	      frag_vec_t subs;
433  	      parent.split(spread, subs);
434  	      for (auto& frag : subs) {
435  		lgeneric_dout(cct, 10) << "splitting intermediate " << frag << " by " << (nb-spread) << dendl;
436  		split(frag, nb - spread, false);
437  	      }
438  	    }
439  	
440  	    // x is now a leaf or split.  
441  	    // hoover up any children.
442  	    frag_vec_t s;
443  	    s.push_back(x);
444  	    while (!s.empty()) {
445  	      frag_t t = s.back();
446  	      s.pop_back();
447  	      int nb = get_split(t);
448  	      if (nb) {
449  		lgeneric_dout(cct, 10) << "merging child " << t << " by " << nb << dendl;
450  		merge(t, nb, false);    // merge this point, and
451  		t.split(nb, s);         // queue up children
452  	      }
453  	    }
454  	
455  	    lgeneric_dout(cct, 10) << "force_to_leaf done" << dendl;
456  	    ceph_assert(is_leaf(x));
457  	    return true;
458  	  }
459  	
460  	  // encoding
461  	  void encode(bufferlist& bl) const {
462  	    using ceph::encode;
463  	    encode(_splits, bl);
464  	  }
465  	  void decode(bufferlist::const_iterator& p) {
466  	    using ceph::decode;
467  	    decode(_splits, p);
468  	  }
469  	  void encode_nohead(bufferlist& bl) const {
470  	    using ceph::encode;
471  	    for (compact_map<frag_t,int32_t>::const_iterator p = _splits.begin();
472  		 p != _splits.end();
473  		 ++p) {
474  	      encode(p->first, bl);
475  	      encode(p->second, bl);
476  	    }
477  	  }
478  	  void decode_nohead(int n, bufferlist::const_iterator& p) {
479  	    using ceph::decode;
480  	    _splits.clear();
481  	    while (n-- > 0) {
482  	      frag_t f;
483  	      decode(f, p);
484  	      decode(_splits[f], p);
485  	    }
486  	  }
487  	
488  	  void print(std::ostream& out) {
489  	    out << "fragtree_t(";
490  	    frag_vec_t s;
491  	    s.push_back(frag_t());
492  	    while (!s.empty()) {
493  	      frag_t t = s.back();
494  	      s.pop_back();
495  	      // newline + indent?
496  	      if (t.bits()) {
497  		out << std::endl;
498  		for (unsigned i=0; i<t.bits(); i++) out << ' ';
499  	      }
500  	      int nb = get_split(t);
501  	      if (nb) {
502  		out << t << " %" << nb;
503  		t.split(nb, s);   // queue up children
504  	      } else {
505  		out << t;
506  	      }
507  	    }
508  	    out << ")";
509  	  }
510  	
511  	  void dump(Formatter *f) const {
512  	    f->open_array_section("splits");
513  	    for (compact_map<frag_t,int32_t>::const_iterator p = _splits.begin();
514  	         p != _splits.end();
515  	         ++p) {
516  	      f->open_object_section("split");
517  	      std::ostringstream frag_str;
518  	      frag_str << p->first;
519  	      f->dump_string("frag", frag_str.str());
520  	      f->dump_int("children", p->second);
521  	      f->close_section(); // split
522  	    }
523  	    f->close_section(); // splits
524  	  }
525  	};
526  	WRITE_CLASS_ENCODER(fragtree_t)
527  	
528  	inline bool operator==(const fragtree_t& l, const fragtree_t& r) {
529  	  return l._splits == r._splits;
530  	}
531  	inline bool operator!=(const fragtree_t& l, const fragtree_t& r) {
532  	  return l._splits != r._splits;
533  	}
534  	
535  	inline std::ostream& operator<<(std::ostream& out, const fragtree_t& ft)
536  	{
537  	  out << "fragtree_t(";
538  	  
539  	  for (compact_map<frag_t,int32_t>::const_iterator p = ft._splits.begin();
540  	       p != ft._splits.end();
541  	       ++p) {
542  	    if (p != ft._splits.begin())
543  	      out << " ";
544  	    out << p->first << "^" << p->second;
545  	  }
546  	  return out << ")";
547  	}
548  	
549  	
550  	/**
551  	 * fragset_t -- a set of fragments
552  	 */
553  	class fragset_t {
554  	  std::set<frag_t> _set;
555  	
556  	public:
557  	  const std::set<frag_t> &get() const { return _set; }
558  	  std::set<frag_t>::iterator begin() { return _set.begin(); }
559  	  std::set<frag_t>::iterator end() { return _set.end(); }
560  	
561  	  bool empty() const { return _set.empty(); }
562  	
563  	  bool contains(frag_t f) const {
564  	    while (1) {
565  	      if (_set.count(f)) return true;
566  	      if (f.bits() == 0) return false;
567  	      f = f.parent();
568  	    }
569  	  }
570  	  
571  	  void insert(frag_t f) {
572  	    _set.insert(f);
573  	    simplify();
574  	  }
575  	
576  	  void simplify() {
577  	    while (1) {
578  	      bool clean = true;
579  	      std::set<frag_t>::iterator p = _set.begin();
580  	      while (p != _set.end()) {
581  		if (!p->is_root() &&
582  		    _set.count(p->get_sibling())) {
583  		  _set.erase(p->get_sibling());
584  		  _set.insert(p->parent());
585  		  _set.erase(p++);
586  		  clean = false;
587  		} else {
588  		  p++;
589  		}
590  	      }
591  	      if (clean)
592  		break;
593  	    }
594  	  }
595  	};
596  	
597  	inline std::ostream& operator<<(std::ostream& out, const fragset_t& fs) 
598  	{
599  	  return out << "fragset_t(" << fs.get() << ")";
600  	}
601  	
602  	#endif
603