1    	// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
2    	// vim: ts=8 sw=2 smarttab
3    	/*
4    	 * Ceph - scalable distributed file system
5    	 *
6    	 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7    	 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8    	 *
9    	 * Author: Loic Dachary <loic@dachary.org>
10   	 *
11   	 * This is free software; you can redistribute it and/or
12   	 * modify it under the terms of the GNU Lesser General Public
13   	 * License version 2.1, as published by the Free Software 
14   	 * Foundation.  See file COPYING.
15   	 * 
16   	 */
17   	
18   	#ifndef CEPH_OSD_TYPES_H
19   	#define CEPH_OSD_TYPES_H
20   	
#include <atomic>
#include <cstdio>
#include <memory>
#include <sstream>
#include <string_view>
#include <utility>
26   	
27   	#include <boost/scoped_ptr.hpp>
28   	#include <boost/optional/optional_io.hpp>
29   	#include <boost/variant.hpp>
30   	#include <boost/smart_ptr/local_shared_ptr.hpp>
31   	
32   	#include "include/rados/rados_types.hpp"
33   	#include "include/mempool.h"
34   	
35   	#include "msg/msg_types.h"
36   	#include "include/types.h"
37   	#include "include/utime.h"
38   	#include "include/CompatSet.h"
39   	#include "common/ceph_context.h"
40   	#include "common/histogram.h"
41   	#include "include/interval_set.h"
42   	#include "include/inline_memory.h"
43   	#include "common/Formatter.h"
44   	#include "common/bloom_filter.hpp"
45   	#include "common/hobject.h"
46   	#include "common/snap_types.h"
47   	#include "HitSet.h"
48   	#include "Watch.h"
49   	#include "include/cmp.h"
50   	#include "librados/ListObjectImpl.h"
51   	#include "compressor/Compressor.h"
52   	#include "osd_perf_counters.h"
53   	
54   	#define CEPH_OSD_ONDISK_MAGIC "ceph osd volume v026"
55   	
56   	#define CEPH_OSD_FEATURE_INCOMPAT_BASE CompatSet::Feature(1, "initial feature set(~v.18)")
57   	#define CEPH_OSD_FEATURE_INCOMPAT_PGINFO CompatSet::Feature(2, "pginfo object")
58   	#define CEPH_OSD_FEATURE_INCOMPAT_OLOC CompatSet::Feature(3, "object locator")
59   	#define CEPH_OSD_FEATURE_INCOMPAT_LEC  CompatSet::Feature(4, "last_epoch_clean")
60   	#define CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES  CompatSet::Feature(5, "categories")
61   	#define CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL  CompatSet::Feature(6, "hobjectpool")
62   	#define CEPH_OSD_FEATURE_INCOMPAT_BIGINFO CompatSet::Feature(7, "biginfo")
63   	#define CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO CompatSet::Feature(8, "leveldbinfo")
64   	#define CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG CompatSet::Feature(9, "leveldblog")
65   	#define CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER CompatSet::Feature(10, "snapmapper")
66   	#define CEPH_OSD_FEATURE_INCOMPAT_SHARDS CompatSet::Feature(11, "sharded objects")
67   	#define CEPH_OSD_FEATURE_INCOMPAT_HINTS CompatSet::Feature(12, "transaction hints")
68   	#define CEPH_OSD_FEATURE_INCOMPAT_PGMETA CompatSet::Feature(13, "pg meta object")
69   	#define CEPH_OSD_FEATURE_INCOMPAT_MISSING CompatSet::Feature(14, "explicit missing set")
70   	#define CEPH_OSD_FEATURE_INCOMPAT_FASTINFO CompatSet::Feature(15, "fastinfo pg attr")
71   	#define CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES CompatSet::Feature(16, "deletes in missing set")
72   	#define CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2 CompatSet::Feature(17, "new snapmapper key structure")
73   	
74   	
75   	/// pool priority range set by user
76   	#define OSD_POOL_PRIORITY_MAX 10
77   	#define OSD_POOL_PRIORITY_MIN -OSD_POOL_PRIORITY_MAX
78   	
79   	/// min recovery priority for MBackfillReserve
80   	#define OSD_RECOVERY_PRIORITY_MIN 0
81   	
82   	/// base backfill priority for MBackfillReserve
83   	#define OSD_BACKFILL_PRIORITY_BASE 100
84   	
85   	/// base backfill priority for MBackfillReserve (degraded PG)
86   	#define OSD_BACKFILL_DEGRADED_PRIORITY_BASE 140
87   	
88   	/// base recovery priority for MBackfillReserve
89   	#define OSD_RECOVERY_PRIORITY_BASE 180
90   	
91   	/// base backfill priority for MBackfillReserve (inactive PG)
92   	#define OSD_BACKFILL_INACTIVE_PRIORITY_BASE 220
93   	
94   	/// base recovery priority for MRecoveryReserve (inactive PG)
95   	#define OSD_RECOVERY_INACTIVE_PRIORITY_BASE 220
96   	
97   	/// max manually/automatically set recovery priority for MBackfillReserve
98   	#define OSD_RECOVERY_PRIORITY_MAX 253
99   	
100  	/// backfill priority for MBackfillReserve, when forced manually
101  	#define OSD_BACKFILL_PRIORITY_FORCED 254
102  	
103  	/// recovery priority for MRecoveryReserve, when forced manually
104  	#define OSD_RECOVERY_PRIORITY_FORCED 255
105  	
106  	/// priority for pg deletion when osd is not fullish
107  	#define OSD_DELETE_PRIORITY_NORMAL 179
108  	
109  	/// priority for pg deletion when osd is approaching full
110  	#define OSD_DELETE_PRIORITY_FULLISH 219
111  	
112  	/// priority when more full
113  	#define OSD_DELETE_PRIORITY_FULL 255
114  	
// Map each recovery/backfill priority *base* to the highest priority
// value permitted in that band, so adjusted priorities stay within
// their class and cannot cross into the next band.
// NOTE(review): 'static' at namespace scope in a header gives every
// translation unit its own copy of this map; C++17 'inline const'
// would share one instance — confirm no caller mutates it.
static std::map<int, int> max_prio_map = {
	{OSD_BACKFILL_PRIORITY_BASE, OSD_BACKFILL_DEGRADED_PRIORITY_BASE - 1},
	{OSD_BACKFILL_DEGRADED_PRIORITY_BASE, OSD_RECOVERY_PRIORITY_BASE - 1},
	{OSD_RECOVERY_PRIORITY_BASE, OSD_BACKFILL_INACTIVE_PRIORITY_BASE - 1},
	{OSD_RECOVERY_INACTIVE_PRIORITY_BASE, OSD_RECOVERY_PRIORITY_MAX},
	{OSD_BACKFILL_INACTIVE_PRIORITY_BASE, OSD_RECOVERY_PRIORITY_MAX}
};
122  	
123  	typedef hobject_t collection_list_handle_t;
124  	
/// convert a single CEPH_OSD_FLAG_* to a std::string
126  	const char *ceph_osd_flag_name(unsigned flag);
/// convert a single CEPH_OSD_OP_FLAG_* to a std::string
128  	const char *ceph_osd_op_flag_name(unsigned flag);
129  	
130  	/// convert CEPH_OSD_FLAG_* op flags to a std::string
131  	std::string ceph_osd_flag_string(unsigned flags);
/// convert CEPH_OSD_OP_FLAG_* op flags to a std::string
133  	std::string ceph_osd_op_flag_string(unsigned flags);
/// convert CEPH_OSD_ALLOC_HINT_FLAG_* op flags to a std::string
135  	std::string ceph_osd_alloc_hint_flag_string(unsigned flags);
136  	
137  	typedef std::map<std::string,std::string> osd_alert_list_t;
138  	/// map osd id -> alert_list_t
139  	typedef std::map<int, osd_alert_list_t> osd_alerts_t;
140  	void dump(ceph::Formatter* f, const osd_alerts_t& alerts);
141  	
142  	
143  	typedef interval_set<
144  	  snapid_t,
145  	  mempool::osdmap::flat_map<snapid_t,snapid_t>> snap_interval_set_t;
146  	
147  	
148  	/**
149  	 * osd request identifier
150  	 *
 * caller name + incarnation# + tid to uniquely identify this request.
152  	 */
struct osd_reqid_t {
  entity_name_t name; // who
  ceph_tid_t    tid;
  int32_t       inc;  // incarnation

  osd_reqid_t()
    : tid(0), inc(0)
  {}
  osd_reqid_t(const entity_name_t& a, int i, ceph_tid_t t)
    : name(a), tid(t), inc(i)
  {}

  // DENC-generated encode/decode (v2, compat 2).  The field order
  // (name, tid, inc) is part of the wire format -- do not reorder.
  DENC(osd_reqid_t, v, p) {
    DENC_START(2, 2, p);
    denc(v.name, p);
    denc(v.tid, p);
    denc(v.inc, p);
    DENC_FINISH(p);
  }
  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<osd_reqid_t*>& o);
};
WRITE_CLASS_DENC(osd_reqid_t)
176  	
177  	
178  	
// An OSD id plus the shard it serves within a PG (NO_SHARD when the
// pool has no shards).  osd == -1 means "not yet assigned".
struct pg_shard_t {
  static const int32_t NO_OSD = 0x7fffffff;
  int32_t osd;
  shard_id_t shard;
  pg_shard_t() : osd(-1), shard(shard_id_t::NO_SHARD) {}
  explicit pg_shard_t(int osd) : osd(osd), shard(shard_id_t::NO_SHARD) {}
  pg_shard_t(int osd, shard_id_t shard) : osd(osd), shard(shard) {}
  bool is_undefined() const {
    return osd == -1;
  }
  /// printable osd id; NO_OSD renders as "NONE"
  std::string get_osd() const { return (osd == NO_OSD ? "NONE" : std::to_string(osd)); }
  void encode(ceph::buffer::list &bl) const;
  void decode(ceph::buffer::list::const_iterator &bl);
  void dump(ceph::Formatter *f) const {
    // NOTE(review): osd is a signed int32 that can hold -1/NO_OSD, yet
    // it is dumped via dump_unsigned -- confirm consumers expect the
    // wrapped value for negative ids.
    f->dump_unsigned("osd", osd);
    if (shard != shard_id_t::NO_SHARD) {
      f->dump_unsigned("shard", shard);
    }
  }
};
WRITE_CLASS_ENCODER(pg_shard_t)
WRITE_EQ_OPERATORS_2(pg_shard_t, osd, shard)
WRITE_CMP_OPERATORS_2(pg_shard_t, osd, shard)
std::ostream& operator<<(std::ostream &lhs, const pg_shard_t &rhs);
203  	
/// Abstract predicate: given the set of shards currently available,
/// decide whether the PG is recoverable.  Implemented per pool type.
class IsPGRecoverablePredicate {
public:
  /**
   * have encodes the shards available
   */
  virtual bool operator()(const std::set<pg_shard_t> &have) const = 0;
  virtual ~IsPGRecoverablePredicate() {}
};
212  	
/// Abstract predicate: given the set of shards currently available,
/// decide whether the PG is readable.  Implemented per pool type.
class IsPGReadablePredicate {
public:
  /**
   * have encodes the shards available
   */
  virtual bool operator()(const std::set<pg_shard_t> &have) const = 0;
  virtual ~IsPGReadablePredicate() {}
};
221  	
222  	inline std::ostream& operator<<(std::ostream& out, const osd_reqid_t& r) {
223  	  return out << r.name << "." << r.inc << ":" << r.tid;
224  	}
225  	
226  	inline bool operator==(const osd_reqid_t& l, const osd_reqid_t& r) {
227  	  return (l.name == r.name) && (l.inc == r.inc) && (l.tid == r.tid);
228  	}
229  	inline bool operator!=(const osd_reqid_t& l, const osd_reqid_t& r) {
230  	  return (l.name != r.name) || (l.inc != r.inc) || (l.tid != r.tid);
231  	}
232  	inline bool operator<(const osd_reqid_t& l, const osd_reqid_t& r) {
233  	  return (l.name < r.name) || (l.inc < r.inc) || 
234  	    (l.name == r.name && l.inc == r.inc && l.tid < r.tid);
235  	}
236  	inline bool operator<=(const osd_reqid_t& l, const osd_reqid_t& r) {
237  	  return (l.name < r.name) || (l.inc < r.inc) ||
238  	    (l.name == r.name && l.inc == r.inc && l.tid <= r.tid);
239  	}
// Derived comparisons, expressed via the <= and < operators above.
inline bool operator>(const osd_reqid_t& l, const osd_reqid_t& r) { return !(l <= r); }
inline bool operator>=(const osd_reqid_t& l, const osd_reqid_t& r) { return !(l < r); }
242  	
namespace std {
  template<> struct hash<osd_reqid_t> {
    size_t operator()(const osd_reqid_t &r) const { 
      // cheap xor-combine of the three identifying fields
      static hash<uint64_t> H;
      return H(r.name.num() ^ r.tid ^ r.inc);
    }
  };
} // namespace std
251  	
252  	
253  	// -----
254  	
255  	// a locator constrains the placement of an object.  mainly, which pool
256  	// does it go in.
257  	struct object_locator_t {
258  	  // You specify either the hash or the key -- not both
259  	  int64_t pool;     ///< pool id
260  	  std::string key;       ///< key std::string (if non-empty)
261  	  std::string nspace;    ///< namespace
262  	  int64_t hash;     ///< hash position (if >= 0)
263  	
264  	  explicit object_locator_t()
265  	    : pool(-1), hash(-1) {}
266  	  explicit object_locator_t(int64_t po)
267  	    : pool(po), hash(-1)  {}
268  	  explicit object_locator_t(int64_t po, int64_t ps)
269  	    : pool(po), hash(ps)  {}
270  	  explicit object_locator_t(int64_t po, std::string ns)
271  	    : pool(po), nspace(ns), hash(-1) {}
272  	  explicit object_locator_t(int64_t po, std::string ns, int64_t ps)
273  	    : pool(po), nspace(ns), hash(ps) {}
274  	  explicit object_locator_t(int64_t po, std::string ns, std::string s)
275  	    : pool(po), key(s), nspace(ns), hash(-1) {}
276  	  explicit object_locator_t(const hobject_t& soid)
277  	    : pool(soid.pool), key(soid.get_key()), nspace(soid.nspace), hash(-1) {}
278  	
279  	  int64_t get_pool() const {
280  	    return pool;
281  	  }
282  	
283  	  void clear() {
284  	    pool = -1;
285  	    key = "";
286  	    nspace = "";
287  	    hash = -1;
288  	  }
289  	
290  	  bool empty() const {
291  	    return pool == -1;
292  	  }
293  	
294  	  void encode(ceph::buffer::list& bl) const;
295  	  void decode(ceph::buffer::list::const_iterator& p);
296  	  void dump(ceph::Formatter *f) const;
297  	  static void generate_test_instances(std::list<object_locator_t*>& o);
298  	};
299  	WRITE_CLASS_ENCODER(object_locator_t)
300  	
301  	inline bool operator==(const object_locator_t& l, const object_locator_t& r) {
302  	  return l.pool == r.pool && l.key == r.key && l.nspace == r.nspace && l.hash == r.hash;
303  	}
304  	inline bool operator!=(const object_locator_t& l, const object_locator_t& r) {
305  	  return !(l == r);
306  	}
307  	
308  	inline std::ostream& operator<<(std::ostream& out, const object_locator_t& loc)
309  	{
310  	  out << "@" << loc.pool;
311  	  if (loc.nspace.length())
312  	    out << ";" << loc.nspace;
313  	  if (loc.key.length())
314  	    out << ":" << loc.key;
315  	  return out;
316  	}
317  	
// A redirect returned by the OSD: tells the client to retarget the
// request at a different locator and (optionally) object name.
struct request_redirect_t {
private:
  object_locator_t redirect_locator; ///< this is authoritative
  std::string redirect_object; ///< If non-empty, the request goes to this object name

  friend std::ostream& operator<<(std::ostream& out, const request_redirect_t& redir);
public:

  request_redirect_t() {}
  // redirect to the same locator but a different pool
  explicit request_redirect_t(const object_locator_t& orig, int64_t rpool) :
      redirect_locator(orig) { redirect_locator.pool = rpool; }
  explicit request_redirect_t(const object_locator_t& rloc) :
      redirect_locator(rloc) {}
  explicit request_redirect_t(const object_locator_t& orig,
                              const std::string& robj) :
      redirect_locator(orig), redirect_object(robj) {}

  bool empty() const { return redirect_locator.empty() &&
			      redirect_object.empty(); }

  // overwrite orig with the redirect locator; only replace obj when a
  // redirect object name was provided
  void combine_with_locator(object_locator_t& orig, std::string& obj) const {
    orig = redirect_locator;
    if (!redirect_object.empty())
      obj = redirect_object;
  }

  void encode(ceph::buffer::list& bl) const;
  void decode(ceph::buffer::list::const_iterator& bl);
  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<request_redirect_t*>& o);
};
WRITE_CLASS_ENCODER(request_redirect_t)
350  	
351  	inline std::ostream& operator<<(std::ostream& out, const request_redirect_t& redir) {
352  	  out << "object " << redir.redirect_object << ", locator{" << redir.redirect_locator << "}";
353  	  return out;
354  	}
355  	
356  	// Internal OSD op flags - set by the OSD based on the op types
357  	enum {
358  	  CEPH_OSD_RMW_FLAG_READ        = (1 << 1),
359  	  CEPH_OSD_RMW_FLAG_WRITE       = (1 << 2),
360  	  CEPH_OSD_RMW_FLAG_CLASS_READ  = (1 << 3),
361  	  CEPH_OSD_RMW_FLAG_CLASS_WRITE = (1 << 4),
362  	  CEPH_OSD_RMW_FLAG_PGOP        = (1 << 5),
363  	  CEPH_OSD_RMW_FLAG_CACHE       = (1 << 6),
364  	  CEPH_OSD_RMW_FLAG_FORCE_PROMOTE   = (1 << 7),
365  	  CEPH_OSD_RMW_FLAG_SKIP_HANDLE_CACHE = (1 << 8),
366  	  CEPH_OSD_RMW_FLAG_SKIP_PROMOTE      = (1 << 9),
367  	  CEPH_OSD_RMW_FLAG_RWORDERED         = (1 << 10),
368  	  CEPH_OSD_RMW_FLAG_RETURNVEC = (1 << 11),
369  	};
370  	
371  	
372  	// pg stuff
373  	
374  	#define OSD_SUPERBLOCK_GOBJECT ghobject_t(hobject_t(sobject_t(object_t("osd_superblock"), 0)))
375  	
376  	// placement seed (a hash value)
377  	typedef uint32_t ps_t;
378  	
379  	// old (v1) pg_t encoding (wrap old struct ceph_pg)
380  	struct old_pg_t {
381  	  ceph_pg v;
382  	  void encode(ceph::buffer::list& bl) const {
383  	    ceph::encode_raw(v, bl);
384  	  }
385  	  void decode(ceph::buffer::list::const_iterator& bl) {
386  	    ceph::decode_raw(v, bl);
387  	  }
388  	};
389  	WRITE_CLASS_ENCODER(old_pg_t)
390  	
391  	// placement group id
// placement group id: (pool, placement seed)
struct pg_t {
  uint64_t m_pool;  ///< pool id
  uint32_t m_seed;  ///< placement seed (ps_t)

  pg_t() : m_pool(0), m_seed(0) {}
  pg_t(ps_t seed, uint64_t pool) :
    m_pool(pool), m_seed(seed) {}
  // cppcheck-suppress noExplicitConstructor
  pg_t(const ceph_pg& cpg) :
    m_pool(cpg.pool), m_seed(cpg.ps) {}

  // cppcheck-suppress noExplicitConstructor
  pg_t(const old_pg_t& opg) {
    *this = opg.v;
  }

  /// convert to the legacy (v1) representation; pool must fit in 32 bits
  old_pg_t get_old_pg() const {
    old_pg_t o;
    ceph_assert(m_pool < 0xffffffffull);
    o.v.pool = m_pool;
    o.v.ps = m_seed;
    o.v.preferred = (__s16)-1;
    return o;
  }

  ps_t ps() const {
    return m_seed;
  }
  int64_t pool() const {
    return m_pool;
  }

  static const uint8_t calc_name_buf_size = 36;  // max length for max values len("18446744073709551615.ffffffff") + future suffix len("_head") + '\0'
  char *calc_name(char *buf, const char *suffix_backwords) const;

  void set_ps(ps_t p) {
    m_seed = p;
  }
  void set_pool(uint64_t p) {
    m_pool = p;
  }

  pg_t get_parent() const;
  pg_t get_ancestor(unsigned old_pg_num) const;

  int print(char *o, int maxlen) const;
  bool parse(const char *s);

  bool is_split(unsigned old_pg_num, unsigned new_pg_num, std::set<pg_t> *pchildren) const;

  bool is_merge_source(unsigned old_pg_num, unsigned new_pg_num, pg_t *parent) const;
  /// a merge target is the inverse of a split: note the deliberately
  /// swapped pg_num arguments in the is_split call
  bool is_merge_target(unsigned old_pg_num, unsigned new_pg_num) const {
    return ps() < new_pg_num && is_split(new_pg_num, old_pg_num, nullptr);
  }

  /**
   * Returns b such that for all object o:
   *   (~((~0)<<b) & o.hash) == 0 iff o is in the pg for *this
   */
  unsigned get_split_bits(unsigned pg_num) const;

  bool contains(int bits, const ghobject_t& oid) const {
    return
      (int64_t)m_pool == oid.hobj.get_logical_pool() &&
      oid.match(bits, ps());
  }
  bool contains(int bits, const hobject_t& oid) const {
    return
      (int64_t)m_pool == oid.get_logical_pool() &&
      oid.match(bits, ps());
  }

  hobject_t get_hobj_start() const;
  hobject_t get_hobj_end(unsigned pg_num) const;

  // Wire format: u8 version (1), pool, seed, then a constant -1 where
  // the removed "preferred" osd field used to live.
  void encode(ceph::buffer::list& bl) const {
    using ceph::encode;
    __u8 v = 1;
    encode(v, bl);
    encode(m_pool, bl);
    encode(m_seed, bl);
    encode((int32_t)-1, bl); // was preferred
  }
  void decode(ceph::buffer::list::const_iterator& bl) {
    using ceph::decode;
    __u8 v;
    decode(v, bl);
    decode(m_pool, bl);
    decode(m_seed, bl);
    bl.advance(sizeof(int32_t)); // was preferred
  }
  // decode the legacy raw-ceph_pg form via old_pg_t
  void decode_old(ceph::buffer::list::const_iterator& bl) {
    using ceph::decode;
    old_pg_t opg;
    decode(opg, bl);
    *this = opg;
  }
  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<pg_t*>& o);
};
WRITE_CLASS_ENCODER(pg_t)
493  	
494  	inline bool operator<(const pg_t& l, const pg_t& r) {
495  	  return l.pool() < r.pool() ||
496  	    (l.pool() == r.pool() && (l.ps() < r.ps()));
497  	}
498  	inline bool operator<=(const pg_t& l, const pg_t& r) {
499  	  return l.pool() < r.pool() ||
500  	    (l.pool() == r.pool() && (l.ps() <= r.ps()));
501  	}
502  	inline bool operator==(const pg_t& l, const pg_t& r) {
503  	  return l.pool() == r.pool() &&
504  	    l.ps() == r.ps();
505  	}
506  	inline bool operator!=(const pg_t& l, const pg_t& r) {
507  	  return l.pool() != r.pool() ||
508  	    l.ps() != r.ps();
509  	}
510  	inline bool operator>(const pg_t& l, const pg_t& r) {
511  	  return l.pool() > r.pool() ||
512  	    (l.pool() == r.pool() && (l.ps() > r.ps()));
513  	}
514  	inline bool operator>=(const pg_t& l, const pg_t& r) {
515  	  return l.pool() > r.pool() ||
516  	    (l.pool() == r.pool() && (l.ps() >= r.ps()));
517  	}
518  	
519  	std::ostream& operator<<(std::ostream& out, const pg_t &pg);
520  	
namespace std {
  // Hash combines pool (both halves), seed, and the constant -1 that
  // the removed "preferred" field used to contribute.
  template<> struct hash< pg_t >
  {
    size_t operator()( const pg_t& x ) const
    {
      static hash<uint32_t> H;
      // xor (s32)-1 in there to preserve original m_preferred result (paranoia!)
      return H((x.pool() & 0xffffffff) ^ (x.pool() >> 32) ^ x.ps() ^ (int32_t)(-1));
    }
  };
} // namespace std
532  	
// sharded placement group id: a pg_t plus the shard id within the PG
struct spg_t {
  pg_t pgid;
  shard_id_t shard;
  spg_t() : shard(shard_id_t::NO_SHARD) {}
  spg_t(pg_t pgid, shard_id_t shard) : pgid(pgid), shard(shard) {}
  explicit spg_t(pg_t pgid) : pgid(pgid), shard(shard_id_t::NO_SHARD) {}
  unsigned get_split_bits(unsigned pg_num) const {
    return pgid.get_split_bits(pg_num);
  }
  spg_t get_parent() const {
    return spg_t(pgid.get_parent(), shard);
  }
  ps_t ps() const {
    return pgid.ps();
  }
  uint64_t pool() const {
    return pgid.pool();
  }
  void reset_shard(shard_id_t s) {
    shard = s;
  }

  static const uint8_t calc_name_buf_size = pg_t::calc_name_buf_size + 4; // 36 + len('s') + len("255");
  char *calc_name(char *buf, const char *suffix_backwords) const;

  bool parse(const char *s);
  bool parse(const std::string& s) {
    return parse(s.c_str());
  }

  spg_t get_ancestor(unsigned old_pg_num) const {
    return spg_t(pgid.get_ancestor(old_pg_num), shard);
  }

  // Delegates to pg_t::is_split, then tags each resulting child pg
  // with our shard id before handing it back to the caller.
  bool is_split(unsigned old_pg_num, unsigned new_pg_num,
		std::set<spg_t> *pchildren) const {
    std::set<pg_t> _children;
    std::set<pg_t> *children = pchildren ? &_children : NULL;
    bool is_split = pgid.is_split(old_pg_num, new_pg_num, children);
    if (pchildren && is_split) {
      for (std::set<pg_t>::iterator i = _children.begin();
	   i != _children.end();
	   ++i) {
	pchildren->insert(spg_t(*i, shard));
      }
    }
    return is_split;
  }
  bool is_merge_target(unsigned old_pg_num, unsigned new_pg_num) const {
    return pgid.is_merge_target(old_pg_num, new_pg_num);
  }
  // *parent is only written when this pg really is a merge source.
  bool is_merge_source(unsigned old_pg_num, unsigned new_pg_num,
		       spg_t *parent) const {
    spg_t out = *this;
    bool r = pgid.is_merge_source(old_pg_num, new_pg_num, &out.pgid);
    if (r && parent) {
      *parent = out;
    }
    return r;
  }

  bool is_no_shard() const {
    return shard == shard_id_t::NO_SHARD;
  }

  ghobject_t make_pgmeta_oid() const {
    return ghobject_t::make_pgmeta(pgid.pool(), pgid.ps(), shard);
  }

  void encode(ceph::buffer::list &bl) const {
    ENCODE_START(1, 1, bl);
    encode(pgid, bl);
    encode(shard, bl);
    ENCODE_FINISH(bl);
  }
  void decode(ceph::buffer::list::const_iterator& bl) {
    DECODE_START(1, bl);
    decode(pgid, bl);
    decode(shard, bl);
    DECODE_FINISH(bl);
  }

  // Temp objects are placed in the temp pool derived from our pool.
  ghobject_t make_temp_ghobject(const std::string& name) const {
    return ghobject_t(
      hobject_t(object_t(name), "", CEPH_NOSNAP,
		pgid.ps(),
		hobject_t::get_temp_pool(pgid.pool()),
		""),
      ghobject_t::NO_GEN,
      shard);
  }

  unsigned hash_to_shard(unsigned num_shards) const {
    return ps() % num_shards;
  }
};
WRITE_CLASS_ENCODER(spg_t)
WRITE_EQ_OPERATORS_2(spg_t, pgid, shard)
WRITE_CMP_OPERATORS_2(spg_t, pgid, shard)
632  	
namespace std {
  // Combine the pg_t hash with the shard id.
  template<> struct hash< spg_t >
  {
    size_t operator()( const spg_t& x ) const
      {
      static hash<uint32_t> H;
      return H(hash<pg_t>()(x.pgid) ^ x.shard);
    }
  };
} // namespace std
643  	
644  	std::ostream& operator<<(std::ostream& out, const spg_t &pg);
645  	
646  	// ----------------------
647  	
648  	class coll_t {
649  	  enum type_t {
650  	    TYPE_META = 0,
651  	    TYPE_LEGACY_TEMP = 1,  /* no longer used */
652  	    TYPE_PG = 2,
653  	    TYPE_PG_TEMP = 3,
654  	  };
655  	  type_t type;
656  	  spg_t pgid;
657  	  uint64_t removal_seq;  // note: deprecated, not encoded
658  	
659  	  char _str_buff[spg_t::calc_name_buf_size];
660  	  char *_str;
661  	
662  	  void calc_str();
663  	
664  	  coll_t(type_t t, spg_t p, uint64_t r)
665  	    : type(t), pgid(p), removal_seq(r) {
666  	    calc_str();
667  	  }
668  	
669  	public:
670  	  coll_t() : type(TYPE_META), removal_seq(0)
671  	  {
672  	    calc_str();
673  	  }
674  	
675  	  coll_t(const coll_t& other)
676  	    : type(other.type), pgid(other.pgid), removal_seq(other.removal_seq) {
677  	    calc_str();
678  	  }
679  	
680  	  explicit coll_t(spg_t pgid)
681  	    : type(TYPE_PG), pgid(pgid), removal_seq(0)
682  	  {
683  	    calc_str();
684  	  }
685  	
686  	  coll_t& operator=(const coll_t& rhs)
687  	  {
688  	    this->type = rhs.type;
689  	    this->pgid = rhs.pgid;
690  	    this->removal_seq = rhs.removal_seq;
691  	    this->calc_str();
692  	    return *this;
693  	  }
694  	
695  	  // named constructors
696  	  static coll_t meta() {
697  	    return coll_t();
698  	  }
699  	  static coll_t pg(spg_t p) {
700  	    return coll_t(p);
701  	  }
702  	
703  	  const std::string to_str() const {
704  	    return std::string(_str);
705  	  }
706  	  const char *c_str() const {
707  	    return _str;
708  	  }
709  	
710  	  bool parse(const std::string& s);
711  	
712  	  int operator<(const coll_t &rhs) const {
713  	    return type < rhs.type ||
714  			  (type == rhs.type && pgid < rhs.pgid);
715  	  }
716  	
717  	  bool is_meta() const {
718  	    return type == TYPE_META;
719  	  }
720  	  bool is_pg_prefix(spg_t *pgid_) const {
721  	    if (type == TYPE_PG || type == TYPE_PG_TEMP) {
722  	      *pgid_ = pgid;
723  	      return true;
724  	    }
725  	    return false;
726  	  }
727  	  bool is_pg() const {
728  	    return type == TYPE_PG;
729  	  }
730  	  bool is_pg(spg_t *pgid_) const {
731  	    if (type == TYPE_PG) {
732  	      *pgid_ = pgid;
733  	      return true;
734  	    }
735  	    return false;
736  	  }
737  	  bool is_temp() const {
738  	    return type == TYPE_PG_TEMP;
739  	  }
740  	  bool is_temp(spg_t *pgid_) const {
741  	    if (type == TYPE_PG_TEMP) {
742  	      *pgid_ = pgid;
743  	      return true;
744  	    }
745  	    return false;
746  	  }
747  	  int64_t pool() const {
748  	    return pgid.pool();
749  	  }
750  	
751  	  void encode(ceph::buffer::list& bl) const;
752  	  void decode(ceph::buffer::list::const_iterator& bl);
753  	  size_t encoded_size() const;
754  	
755  	  inline bool operator==(const coll_t& rhs) const {
756  	    // only compare type if meta
757  	    if (type != rhs.type)
758  	      return false;
759  	    if (type == TYPE_META)
760  	      return true;
761  	    return type == rhs.type && pgid == rhs.pgid;
762  	  }
763  	  inline bool operator!=(const coll_t& rhs) const {
764  	    return !(*this == rhs);
765  	  }
766  	
767  	  // get a TEMP collection that corresponds to the current collection,
768  	  // which we presume is a pg collection.
769  	  coll_t get_temp() const {
770  	    ceph_assert(type == TYPE_PG);
771  	    return coll_t(TYPE_PG_TEMP, pgid, 0);
772  	  }
773  	
774  	  ghobject_t get_min_hobj() const {
775  	    ghobject_t o;
776  	    switch (type) {
777  	    case TYPE_PG:
778  	      o.hobj.pool = pgid.pool();
779  	      o.set_shard(pgid.shard);
780  	      break;
781  	    case TYPE_META:
782  	      o.hobj.pool = -1;
783  	      break;
784  	    default:
785  	      break;
786  	    }
787  	    return o;
788  	  }
789  	
790  	  unsigned hash_to_shard(unsigned num_shards) const {
791  	    if (type == TYPE_PG)
792  	      return pgid.hash_to_shard(num_shards);
793  	    return 0;  // whatever.
794  	  }
795  	
796  	  void dump(ceph::Formatter *f) const;
797  	  static void generate_test_instances(std::list<coll_t*>& o);
798  	};
799  	
800  	WRITE_CLASS_ENCODER(coll_t)
801  	
802  	inline std::ostream& operator<<(std::ostream& out, const coll_t& c) {
803  	  out << c.to_str();
804  	  return out;
805  	}
806  	
807  	namespace std {
808  	  template<> struct hash<coll_t> {
809  	    size_t operator()(const coll_t &c) const { 
810  	      size_t h = 0;
811  	      std::string str(c.to_str());
812  	      std::string::const_iterator end(str.end());
813  	      for (std::string::const_iterator s = str.begin(); s != end; ++s) {
814  		h += *s;
815  		h += (h << 10);
816  		h ^= (h >> 6);
817  	      }
818  	      h += (h << 3);
819  	      h ^= (h >> 11);
820  	      h += (h << 15);
821  	      return h;
822  	    }
823  	  };
824  	} // namespace std
825  	
826  	inline std::ostream& operator<<(std::ostream& out, const ceph_object_layout &ol)
827  	{
828  	  out << pg_t(ol.ol_pgid);
829  	  int su = ol.ol_stripe_unit;
830  	  if (su)
831  	    out << ".su=" << su;
832  	  return out;
833  	}
834  	
835  	
836  	
837  	// compound rados version type
838  	/* WARNING: If add member in eversion_t, please make sure the encode/decode function
839  	 * work well. For little-endian machine, we should make sure there is no padding
840  	 * in 32-bit machine and 64-bit machine.
841  	 */
class eversion_t {
public:
  version_t version;
  epoch_t epoch;
  __u32 __pad;  // explicit trailing padding so the layout is predictable
  eversion_t() : version(0), epoch(0), __pad(0) {}
  eversion_t(epoch_t e, version_t v) : version(v), epoch(e), __pad(0) {}

  // cppcheck-suppress noExplicitConstructor
  eversion_t(const ceph_eversion& ce) :
    version(ce.version),
    epoch(ce.epoch),
    __pad(0) { }

  explicit eversion_t(ceph::buffer::list& bl) : __pad(0) { decode(bl); }

  /// the largest representable eversion (epoch = version = all-ones)
  static const eversion_t& max() {
    static const eversion_t max(-1,-1);
    return max;
  }

  operator ceph_eversion() {
    ceph_eversion c;
    c.epoch = epoch;
    c.version = version;
    return c;
  }

  std::string get_key_name() const;

  // key must point to the beginning of a block of 32 chars
  inline void get_key_name(char* key) const {
    // Below is equivalent of sprintf("%010u.%020llu");
    key[31] = 0;
    ritoa<uint64_t, 10, 20>(version, key + 31);
    key[10] = '.';
    ritoa<uint32_t, 10, 10>(epoch, key + 10);
  }

  // On little-endian builds the (version, epoch) pair is appended or
  // copied as raw bytes; this relies on the member order above and on
  // the absence of padding between them (see WARNING at the top).
  void encode(ceph::buffer::list &bl) const {
#if defined(CEPH_LITTLE_ENDIAN)
    bl.append((char *)this, sizeof(version_t) + sizeof(epoch_t));
#else
    using ceph::encode;
    encode(version, bl);
    encode(epoch, bl);
#endif
  }
  void decode(ceph::buffer::list::const_iterator &bl) {
#if defined(CEPH_LITTLE_ENDIAN)
    bl.copy(sizeof(version_t) + sizeof(epoch_t), (char *)this);
#else
    using ceph::decode;
    decode(version, bl);
    decode(epoch, bl);
#endif
  }
  void decode(ceph::buffer::list& bl) {
    auto p = std::cbegin(bl);
    decode(p);
  }
};
WRITE_CLASS_ENCODER(eversion_t)
905  	
906  	inline bool operator==(const eversion_t& l, const eversion_t& r) {
907  	  return (l.epoch == r.epoch) && (l.version == r.version);
908  	}
909  	inline bool operator!=(const eversion_t& l, const eversion_t& r) {
910  	  return (l.epoch != r.epoch) || (l.version != r.version);
911  	}
912  	inline bool operator<(const eversion_t& l, const eversion_t& r) {
913  	  return (l.epoch == r.epoch) ? (l.version < r.version):(l.epoch < r.epoch);
914  	}
915  	inline bool operator<=(const eversion_t& l, const eversion_t& r) {
916  	  return (l.epoch == r.epoch) ? (l.version <= r.version):(l.epoch <= r.epoch);
917  	}
918  	inline bool operator>(const eversion_t& l, const eversion_t& r) {
919  	  return (l.epoch == r.epoch) ? (l.version > r.version):(l.epoch > r.epoch);
920  	}
921  	inline bool operator>=(const eversion_t& l, const eversion_t& r) {
922  	  return (l.epoch == r.epoch) ? (l.version >= r.version):(l.epoch >= r.epoch);
923  	}
924  	inline std::ostream& operator<<(std::ostream& out, const eversion_t& e) {
925  	  return out << e.epoch << "'" << e.version;
926  	}
927  	
928  	/**
929  	 * objectstore_perf_stat_t
930  	 *
931  	 * current perf information about the osd
932  	 */
933  	struct objectstore_perf_stat_t {
934  	  // cur_op_latency is in ns since double add/sub are not associative
935  	  uint64_t os_commit_latency_ns;
936  	  uint64_t os_apply_latency_ns;
937  	
938  	  objectstore_perf_stat_t() :
939  	    os_commit_latency_ns(0), os_apply_latency_ns(0) {}
940  	
941  	  bool operator==(const objectstore_perf_stat_t &r) const {
942  	    return os_commit_latency_ns == r.os_commit_latency_ns &&
943  	      os_apply_latency_ns == r.os_apply_latency_ns;
944  	  }
945  	
946  	  void add(const objectstore_perf_stat_t &o) {
947  	    os_commit_latency_ns += o.os_commit_latency_ns;
948  	    os_apply_latency_ns += o.os_apply_latency_ns;
949  	  }
950  	  void sub(const objectstore_perf_stat_t &o) {
951  	    os_commit_latency_ns -= o.os_commit_latency_ns;
952  	    os_apply_latency_ns -= o.os_apply_latency_ns;
953  	  }
954  	  void dump(ceph::Formatter *f) const;
955  	  void encode(ceph::buffer::list &bl, uint64_t features) const;
956  	  void decode(ceph::buffer::list::const_iterator &bl);
957  	  static void generate_test_instances(std::list<objectstore_perf_stat_t*>& o);
958  	};
959  	WRITE_CLASS_ENCODER_FEATURES(objectstore_perf_stat_t)
960  	
961  	/*
962  	 * pg states
963  	 */
964  	#define PG_STATE_CREATING           (1ULL << 0)  // creating
965  	#define PG_STATE_ACTIVE             (1ULL << 1)  // i am active.  (primary: replicas too)
966  	#define PG_STATE_CLEAN              (1ULL << 2)  // peers are complete, clean of stray replicas.
967  	#define PG_STATE_DOWN               (1ULL << 4)  // a needed replica is down, PG offline
968  	#define PG_STATE_RECOVERY_UNFOUND   (1ULL << 5)  // recovery stopped due to unfound
969  	#define PG_STATE_BACKFILL_UNFOUND   (1ULL << 6)  // backfill stopped due to unfound
#define PG_STATE_PREMERGE           (1ULL << 7)  // i am preparing to merge
971  	#define PG_STATE_SCRUBBING          (1ULL << 8)  // scrubbing
972  	//#define PG_STATE_SCRUBQ           (1ULL << 9)  // queued for scrub
973  	#define PG_STATE_DEGRADED           (1ULL << 10) // pg contains objects with reduced redundancy
974  	#define PG_STATE_INCONSISTENT       (1ULL << 11) // pg replicas are inconsistent (but shouldn't be)
975  	#define PG_STATE_PEERING            (1ULL << 12) // pg is (re)peering
976  	#define PG_STATE_REPAIR             (1ULL << 13) // pg should repair on next scrub
977  	#define PG_STATE_RECOVERING         (1ULL << 14) // pg is recovering/migrating objects
978  	#define PG_STATE_BACKFILL_WAIT      (1ULL << 15) // [active] reserving backfill
979  	#define PG_STATE_INCOMPLETE         (1ULL << 16) // incomplete content, peering failed.
980  	#define PG_STATE_STALE              (1ULL << 17) // our state for this pg is stale, unknown.
981  	#define PG_STATE_REMAPPED           (1ULL << 18) // pg is explicitly remapped to different OSDs than CRUSH
982  	#define PG_STATE_DEEP_SCRUB         (1ULL << 19) // deep scrub: check CRC32 on files
983  	#define PG_STATE_BACKFILLING        (1ULL << 20) // [active] backfilling pg content
984  	#define PG_STATE_BACKFILL_TOOFULL   (1ULL << 21) // backfill can't proceed: too full
985  	#define PG_STATE_RECOVERY_WAIT      (1ULL << 22) // waiting for recovery reservations
986  	#define PG_STATE_UNDERSIZED         (1ULL << 23) // pg acting < pool size
987  	#define PG_STATE_ACTIVATING         (1ULL << 24) // pg is peered but not yet active
988  	#define PG_STATE_PEERED             (1ULL << 25) // peered, cannot go active, can recover
989  	#define PG_STATE_SNAPTRIM           (1ULL << 26) // trimming snaps
990  	#define PG_STATE_SNAPTRIM_WAIT      (1ULL << 27) // queued to trim snaps
991  	#define PG_STATE_RECOVERY_TOOFULL   (1ULL << 28) // recovery can't proceed: too full
992  	#define PG_STATE_SNAPTRIM_ERROR     (1ULL << 29) // error stopped trimming snaps
993  	#define PG_STATE_FORCED_RECOVERY    (1ULL << 30) // force recovery of this pg before any other
994  	#define PG_STATE_FORCED_BACKFILL    (1ULL << 31) // force backfill of this pg before any other
995  	#define PG_STATE_FAILED_REPAIR      (1ULL << 32) // A repair failed to fix all errors
#define PG_STATE_LAGGY              (1ULL << 33) // PG is laggy/unreadable due to slow/delayed pings
997  	#define PG_STATE_WAIT               (1ULL << 34) // PG is waiting for prior intervals' readable period to expire
998  	
999  	std::string pg_state_string(uint64_t state);
1000 	std::string pg_vector_string(const std::vector<int32_t> &a);
1001 	std::optional<uint64_t> pg_string_state(const std::string& state);
1002 	
1003 	
1004 	/*
1005 	 * pool_snap_info_t
1006 	 *
1007 	 * attributes for a single pool snapshot.  
1008 	 */
struct pool_snap_info_t {
  snapid_t snapid;    ///< id of this pool snapshot
  utime_t stamp;      ///< creation time (see pg_pool_t::add_snap, which supplies it)
  std::string name;   ///< user-visible snapshot name

  void dump(ceph::Formatter *f) const;
  void encode(ceph::buffer::list& bl, uint64_t features) const;
  void decode(ceph::buffer::list::const_iterator& bl);
  static void generate_test_instances(std::list<pool_snap_info_t*>& o);
};
1019 	WRITE_CLASS_ENCODER_FEATURES(pool_snap_info_t)
1020 	
1021 	inline std::ostream& operator<<(std::ostream& out, const pool_snap_info_t& si) {
1022 	  return out << si.snapid << '(' << si.name << ' ' << si.stamp << ')';
1023 	}
1024 	
1025 	
1026 	/*
1027 	 * pool_opts_t
1028 	 *
1029 	 * pool options.
1030 	 */
1031 	
// Typed key/value option set attached to a pool.  Each key has a fixed
// value type (see get_opt_desc); values are stored in a boost::variant.
class pool_opts_t {
public:
  enum key_t {
    SCRUB_MIN_INTERVAL,
    SCRUB_MAX_INTERVAL,
    DEEP_SCRUB_INTERVAL,
    RECOVERY_PRIORITY,
    RECOVERY_OP_PRIORITY,
    SCRUB_PRIORITY,
    COMPRESSION_MODE,
    COMPRESSION_ALGORITHM,
    COMPRESSION_REQUIRED_RATIO,
    COMPRESSION_MAX_BLOB_SIZE,
    COMPRESSION_MIN_BLOB_SIZE,
    CSUM_TYPE,
    CSUM_MAX_BLOCK,
    CSUM_MIN_BLOCK,
    FINGERPRINT_ALGORITHM,
    PG_NUM_MIN,         // min pg_num
    TARGET_SIZE_BYTES,  // total bytes in pool
    TARGET_SIZE_RATIO,  // fraction of total cluster
    PG_AUTOSCALE_BIAS,
    READ_LEASE_INTERVAL,
  };

  // value type tag for an option key
  enum type_t {
    STR,
    INT,
    DOUBLE,
  };

  // descriptor pairing a key with its expected value type
  struct opt_desc_t {
    key_t key;
    type_t type;

    opt_desc_t(key_t k, type_t t) : key(k), type(t) {}

    bool operator==(const opt_desc_t& rhs) const {
      return key == rhs.key && type == rhs.type;
    }
  };

  typedef boost::variant<std::string,int64_t,double> value_t;

  static bool is_opt_name(const std::string& name);
  static opt_desc_t get_opt_desc(const std::string& name);

  pool_opts_t() : opts() {}

  bool is_set(key_t key) const;

  // Store (or overwrite) the value for key.  NOTE(review): the value's
  // type is not validated against the key's declared type here.
  template<typename T>
  void set(key_t key, const T &val) {
    value_t value = val;
    opts[key] = value;
  }

  // Copy the value for key into *val; returns false if the key is unset.
  // boost::get throws boost::bad_get if T does not match the stored type.
  template<typename T>
  bool get(key_t key, T *val) const {
    opts_t::const_iterator i = opts.find(key);
    if (i == opts.end()) {
      return false;
    }
    *val = boost::get<T>(i->second);
    return true;
  }

  const value_t& get(key_t key) const;

  bool unset(key_t key);

  void dump(const std::string& name, ceph::Formatter *f) const;

  void dump(ceph::Formatter *f) const;
  void encode(ceph::buffer::list &bl, uint64_t features) const;
  void decode(ceph::buffer::list::const_iterator &bl);

private:
  typedef std::map<key_t, value_t> opts_t;
  opts_t opts;

  friend std::ostream& operator<<(std::ostream& out, const pool_opts_t& opts);
};
1115 	WRITE_CLASS_ENCODER_FEATURES(pool_opts_t)
1116 	
// Metadata recorded about the most recent PG merge; filled in by
// pg_pool_t::dec_pg_num() and kept as pg_pool_t::last_pg_merge_meta.
struct pg_merge_meta_t {
  pg_t source_pgid;                 ///< pg on the source side of the merge
  epoch_t ready_epoch = 0;          ///< epoch associated with merge readiness
  epoch_t last_epoch_started = 0;   ///< last_epoch_started recorded at merge time
  epoch_t last_epoch_clean = 0;     ///< last_epoch_clean recorded at merge time
  eversion_t source_version;        ///< version of the source pg
  eversion_t target_version;        ///< version of the target pg

  void encode(ceph::buffer::list& bl) const {
    ENCODE_START(1, 1, bl);
    encode(source_pgid, bl);
    encode(ready_epoch, bl);
    encode(last_epoch_started, bl);
    encode(last_epoch_clean, bl);
    encode(source_version, bl);
    encode(target_version, bl);
    ENCODE_FINISH(bl);
  }
  // Decode must consume fields in exactly the order encode() wrote them.
  void decode(ceph::buffer::list::const_iterator& p) {
    DECODE_START(1, p);
    decode(source_pgid, p);
    decode(ready_epoch, p);
    decode(last_epoch_started, p);
    decode(last_epoch_clean, p);
    decode(source_version, p);
    decode(target_version, p);
    DECODE_FINISH(p);
  }
  void dump(ceph::Formatter *f) const {
    f->dump_stream("source_pgid") << source_pgid;
    f->dump_unsigned("ready_epoch", ready_epoch);
    f->dump_unsigned("last_epoch_started", last_epoch_started);
    f->dump_unsigned("last_epoch_clean", last_epoch_clean);
    f->dump_stream("source_version") << source_version;
    f->dump_stream("target_version") << target_version;
  }
};
1155 	
1156 	/*
1157 	 * pg_pool
1158 	 */
1159 	struct pg_pool_t {
1160 	  static const char *APPLICATION_NAME_CEPHFS;
1161 	  static const char *APPLICATION_NAME_RBD;
1162 	  static const char *APPLICATION_NAME_RGW;
1163 	
1164 	  enum {
1165 	    TYPE_REPLICATED = 1,     // replication
1166 	    //TYPE_RAID4 = 2,   // raid4 (never implemented)
1167 	    TYPE_ERASURE = 3,      // erasure-coded
1168 	  };
1169 	  static std::string_view get_type_name(int t) {
1170 	    switch (t) {
1171 	    case TYPE_REPLICATED: return "replicated";
1172 	      //case TYPE_RAID4: return "raid4";
1173 	    case TYPE_ERASURE: return "erasure";
1174 	    default: return "???";
1175 	    }
1176 	  }
1177 	  std::string_view get_type_name() const {
1178 	    return get_type_name(type);
1179 	  }
1180 	
1181 	  enum {
1182 	    FLAG_HASHPSPOOL = 1<<0, // hash pg seed and pool together (instead of adding)
1183 	    FLAG_FULL       = 1<<1, // pool is full
1184 	    FLAG_EC_OVERWRITES = 1<<2, // enables overwrites, once enabled, cannot be disabled
1185 	    FLAG_INCOMPLETE_CLONES = 1<<3, // may have incomplete clones (bc we are/were an overlay)
1186 	    FLAG_NODELETE = 1<<4, // pool can't be deleted
1187 	    FLAG_NOPGCHANGE = 1<<5, // pool's pg and pgp num can't be changed
1188 	    FLAG_NOSIZECHANGE = 1<<6, // pool's size and min size can't be changed
1189 	    FLAG_WRITE_FADVISE_DONTNEED = 1<<7, // write mode with LIBRADOS_OP_FLAG_FADVISE_DONTNEED
1190 	    FLAG_NOSCRUB = 1<<8, // block periodic scrub
1191 	    FLAG_NODEEP_SCRUB = 1<<9, // block periodic deep-scrub
1192 	    FLAG_FULL_QUOTA = 1<<10, // pool is currently running out of quota, will set FLAG_FULL too
1193 	    FLAG_NEARFULL = 1<<11, // pool is nearfull
1194 	    FLAG_BACKFILLFULL = 1<<12, // pool is backfillfull
1195 	    FLAG_SELFMANAGED_SNAPS = 1<<13, // pool uses selfmanaged snaps
1196 	    FLAG_POOL_SNAPS = 1<<14,        // pool has pool snaps
1197 	    FLAG_CREATING = 1<<15,          // initial pool PGs are being created
1198 	  };
1199 	
1200 	  static const char *get_flag_name(int f) {
1201 	    switch (f) {
1202 	    case FLAG_HASHPSPOOL: return "hashpspool";
1203 	    case FLAG_FULL: return "full";
1204 	    case FLAG_EC_OVERWRITES: return "ec_overwrites";
1205 	    case FLAG_INCOMPLETE_CLONES: return "incomplete_clones";
1206 	    case FLAG_NODELETE: return "nodelete";
1207 	    case FLAG_NOPGCHANGE: return "nopgchange";
1208 	    case FLAG_NOSIZECHANGE: return "nosizechange";
1209 	    case FLAG_WRITE_FADVISE_DONTNEED: return "write_fadvise_dontneed";
1210 	    case FLAG_NOSCRUB: return "noscrub";
1211 	    case FLAG_NODEEP_SCRUB: return "nodeep-scrub";
1212 	    case FLAG_FULL_QUOTA: return "full_quota";
1213 	    case FLAG_NEARFULL: return "nearfull";
1214 	    case FLAG_BACKFILLFULL: return "backfillfull";
1215 	    case FLAG_SELFMANAGED_SNAPS: return "selfmanaged_snaps";
1216 	    case FLAG_POOL_SNAPS: return "pool_snaps";
1217 	    case FLAG_CREATING: return "creating";
1218 	    default: return "???";
1219 	    }
1220 	  }
1221 	  static std::string get_flags_string(uint64_t f) {
1222 	    std::string s;
1223 	    for (unsigned n=0; f && n<64; ++n) {
1224 	      if (f & (1ull << n)) {
1225 		if (s.length())
1226 		  s += ",";
1227 		s += get_flag_name(1ull << n);
1228 	      }
1229 	    }
1230 	    return s;
1231 	  }
1232 	  std::string get_flags_string() const {
1233 	    return get_flags_string(flags);
1234 	  }
1235 	  static uint64_t get_flag_by_name(const std::string& name) {
1236 	    if (name == "hashpspool")
1237 	      return FLAG_HASHPSPOOL;
1238 	    if (name == "full")
1239 	      return FLAG_FULL;
1240 	    if (name == "ec_overwrites")
1241 	      return FLAG_EC_OVERWRITES;
1242 	    if (name == "incomplete_clones")
1243 	      return FLAG_INCOMPLETE_CLONES;
1244 	    if (name == "nodelete")
1245 	      return FLAG_NODELETE;
1246 	    if (name == "nopgchange")
1247 	      return FLAG_NOPGCHANGE;
1248 	    if (name == "nosizechange")
1249 	      return FLAG_NOSIZECHANGE;
1250 	    if (name == "write_fadvise_dontneed")
1251 	      return FLAG_WRITE_FADVISE_DONTNEED;
1252 	    if (name == "noscrub")
1253 	      return FLAG_NOSCRUB;
1254 	    if (name == "nodeep-scrub")
1255 	      return FLAG_NODEEP_SCRUB;
1256 	    if (name == "full_quota")
1257 	      return FLAG_FULL_QUOTA;
1258 	    if (name == "nearfull")
1259 	      return FLAG_NEARFULL;
1260 	    if (name == "backfillfull")
1261 	      return FLAG_BACKFILLFULL;
1262 	    if (name == "selfmanaged_snaps")
1263 	      return FLAG_SELFMANAGED_SNAPS;
1264 	    if (name == "pool_snaps")
1265 	      return FLAG_POOL_SNAPS;
1266 	    if (name == "creating")
1267 	      return FLAG_CREATING;
1268 	    return 0;
1269 	  }
1270 	
1271 	  /// converts the acting/up vector to a set of pg shards
1272 	  void convert_to_pg_shards(const std::vector<int> &from, std::set<pg_shard_t>* to) const;
1273 	
1274 	  typedef enum {
1275 	    CACHEMODE_NONE = 0,                  ///< no caching
1276 	    CACHEMODE_WRITEBACK = 1,             ///< write to cache, flush later
1277 	    CACHEMODE_FORWARD = 2,               ///< forward if not in cache
1278 	    CACHEMODE_READONLY = 3,              ///< handle reads, forward writes [not strongly consistent]
1279 	    CACHEMODE_READFORWARD = 4,           ///< forward reads, write to cache flush later
1280 	    CACHEMODE_READPROXY = 5,             ///< proxy reads, write to cache flush later
1281 	    CACHEMODE_PROXY = 6,                 ///< proxy if not in cache
1282 	  } cache_mode_t;
1283 	  static const char *get_cache_mode_name(cache_mode_t m) {
1284 	    switch (m) {
1285 	    case CACHEMODE_NONE: return "none";
1286 	    case CACHEMODE_WRITEBACK: return "writeback";
1287 	    case CACHEMODE_FORWARD: return "forward";
1288 	    case CACHEMODE_READONLY: return "readonly";
1289 	    case CACHEMODE_READFORWARD: return "readforward";
1290 	    case CACHEMODE_READPROXY: return "readproxy";
1291 	    case CACHEMODE_PROXY: return "proxy";
1292 	    default: return "unknown";
1293 	    }
1294 	  }
1295 	  static cache_mode_t get_cache_mode_from_str(const std::string& s) {
1296 	    if (s == "none")
1297 	      return CACHEMODE_NONE;
1298 	    if (s == "writeback")
1299 	      return CACHEMODE_WRITEBACK;
1300 	    if (s == "forward")
1301 	      return CACHEMODE_FORWARD;
1302 	    if (s == "readonly")
1303 	      return CACHEMODE_READONLY;
1304 	    if (s == "readforward")
1305 	      return CACHEMODE_READFORWARD;
1306 	    if (s == "readproxy")
1307 	      return CACHEMODE_READPROXY;
1308 	    if (s == "proxy")
1309 	      return CACHEMODE_PROXY;
1310 	    return (cache_mode_t)-1;
1311 	  }
1312 	  const char *get_cache_mode_name() const {
1313 	    return get_cache_mode_name(cache_mode);
1314 	  }
1315 	  bool cache_mode_requires_hit_set() const {
1316 	    switch (cache_mode) {
1317 	    case CACHEMODE_NONE:
1318 	    case CACHEMODE_FORWARD:
1319 	    case CACHEMODE_READONLY:
1320 	    case CACHEMODE_PROXY:
1321 	      return false;
1322 	    case CACHEMODE_WRITEBACK:
1323 	    case CACHEMODE_READFORWARD:
1324 	    case CACHEMODE_READPROXY:
1325 	      return true;
1326 	    default:
1327 	      ceph_abort_msg("implement me");
1328 	    }
1329 	  }
1330 	
1331 	  enum class pg_autoscale_mode_t : uint8_t {
1332 	    OFF = 0,
1333 	    WARN = 1,
1334 	    ON = 2,
1335 	    UNKNOWN = UINT8_MAX,
1336 	  };
1337 	  static const char *get_pg_autoscale_mode_name(pg_autoscale_mode_t m) {
1338 	    switch (m) {
1339 	    case pg_autoscale_mode_t::OFF: return "off";
1340 	    case pg_autoscale_mode_t::ON: return "on";
1341 	    case pg_autoscale_mode_t::WARN: return "warn";
1342 	    default: return "???";
1343 	    }
1344 	  }
1345 	  static pg_autoscale_mode_t get_pg_autoscale_mode_by_name(const std::string& m) {
1346 	    if (m == "off") {
1347 	      return pg_autoscale_mode_t::OFF;
1348 	    }
1349 	    if (m == "warn") {
1350 	      return pg_autoscale_mode_t::WARN;
1351 	    }
1352 	    if (m == "on") {
1353 	      return pg_autoscale_mode_t::ON;
1354 	    }
1355 	    return pg_autoscale_mode_t::UNKNOWN;
1356 	  }
1357 	
1358 	  utime_t create_time;
1359 	  uint64_t flags = 0;           ///< FLAG_*
1360 	  __u8 type = 0;                ///< TYPE_*
1361 	  __u8 size = 0, min_size = 0;  ///< number of osds in each pg
1362 	  __u8 crush_rule = 0;          ///< crush placement rule
1363 	  __u8 object_hash = 0;         ///< hash mapping object name to ps
1364 	  pg_autoscale_mode_t pg_autoscale_mode = pg_autoscale_mode_t::UNKNOWN;
1365 	
1366 	private:
1367 	  __u32 pg_num = 0, pgp_num = 0;  ///< number of pgs
1368 	  __u32 pg_num_pending = 0;       ///< pg_num we are about to merge down to
1369 	  __u32 pg_num_target = 0;        ///< pg_num we should converge toward
1370 	  __u32 pgp_num_target = 0;       ///< pgp_num we should converge toward
1371 	
1372 	public:
1373 	  std::map<std::string, std::string> properties;  ///< OBSOLETE
1374 	  std::string erasure_code_profile; ///< name of the erasure code profile in OSDMap
1375 	  epoch_t last_change = 0;      ///< most recent epoch changed, exclusing snapshot changes
1376 	
1377 	  /// last epoch that forced clients to resend
1378 	  epoch_t last_force_op_resend = 0;
1379 	  /// last epoch that forced clients to resend (pre-nautilus clients only)
1380 	  epoch_t last_force_op_resend_prenautilus = 0;
1381 	  /// last epoch that forced clients to resend (pre-luminous clients only)
1382 	  epoch_t last_force_op_resend_preluminous = 0;
1383 	
1384 	  /// metadata for the most recent PG merge
1385 	  pg_merge_meta_t last_pg_merge_meta;
1386 	  
1387 	  snapid_t snap_seq = 0;        ///< seq for per-pool snapshot
1388 	  epoch_t snap_epoch = 0;       ///< osdmap epoch of last snap
1389 	  uint64_t auid = 0;            ///< who owns the pg
1390 	
1391 	  uint64_t quota_max_bytes = 0; ///< maximum number of bytes for this pool
1392 	  uint64_t quota_max_objects = 0; ///< maximum number of objects for this pool
1393 	
1394 	  /*
1395 	   * Pool snaps (global to this pool).  These define a SnapContext for
1396 	   * the pool, unless the client manually specifies an alternate
1397 	   * context.
1398 	   */
1399 	  std::map<snapid_t, pool_snap_info_t> snaps;
1400 	  /*
1401 	   * Alternatively, if we are defining non-pool snaps (e.g. via the
1402 	   * Ceph MDS), we must track @removed_snaps (since @snaps is not
1403 	   * used).  Snaps and removed_snaps are to be used exclusive of each
1404 	   * other!
1405 	   */
1406 	  interval_set<snapid_t> removed_snaps;
1407 	
1408 	  unsigned pg_num_mask = 0, pgp_num_mask = 0;
1409 	
1410 	  std::set<uint64_t> tiers;      ///< pools that are tiers of us
1411 	  int64_t tier_of = -1;         ///< pool for which we are a tier
1412 	  // Note that write wins for read+write ops
1413 	  int64_t read_tier = -1;       ///< pool/tier for objecter to direct reads to
1414 	  int64_t write_tier = -1;      ///< pool/tier for objecter to direct writes to
1415 	  cache_mode_t cache_mode = CACHEMODE_NONE;  ///< cache pool mode
1416 	
1417 	  bool is_tier() const { return tier_of >= 0; }
1418 	  bool has_tiers() const { return !tiers.empty(); }
1419 	  void clear_tier() {
1420 	    tier_of = -1;
1421 	    clear_read_tier();
1422 	    clear_write_tier();
1423 	    clear_tier_tunables();
1424 	  }
1425 	  bool has_read_tier() const { return read_tier >= 0; }
1426 	  void clear_read_tier() { read_tier = -1; }
1427 	  bool has_write_tier() const { return write_tier >= 0; }
1428 	  void clear_write_tier() { write_tier = -1; }
1429 	  void clear_tier_tunables() {
1430 	    if (cache_mode != CACHEMODE_NONE)
1431 	      flags |= FLAG_INCOMPLETE_CLONES;
1432 	    cache_mode = CACHEMODE_NONE;
1433 	
1434 	    target_max_bytes = 0;
1435 	    target_max_objects = 0;
1436 	    cache_target_dirty_ratio_micro = 0;
1437 	    cache_target_dirty_high_ratio_micro = 0;
1438 	    cache_target_full_ratio_micro = 0;
1439 	    hit_set_params = HitSet::Params();
1440 	    hit_set_period = 0;
1441 	    hit_set_count = 0;
1442 	    hit_set_grade_decay_rate = 0;
1443 	    hit_set_search_last_n = 0;
1444 	    grade_table.resize(0);
1445 	  }
1446 	
1447 	  uint64_t target_max_bytes = 0;   ///< tiering: target max pool size
1448 	  uint64_t target_max_objects = 0; ///< tiering: target max pool size
1449 	
1450 	  uint32_t cache_target_dirty_ratio_micro = 0; ///< cache: fraction of target to leave dirty
1451 	  uint32_t cache_target_dirty_high_ratio_micro = 0; ///< cache: fraction of  target to flush with high speed
1452 	  uint32_t cache_target_full_ratio_micro = 0;  ///< cache: fraction of target to fill before we evict in earnest
1453 	
1454 	  uint32_t cache_min_flush_age = 0;  ///< minimum age (seconds) before we can flush
1455 	  uint32_t cache_min_evict_age = 0;  ///< minimum age (seconds) before we can evict
1456 	
1457 	  HitSet::Params hit_set_params; ///< The HitSet params to use on this pool
1458 	  uint32_t hit_set_period = 0;   ///< periodicity of HitSet segments (seconds)
1459 	  uint32_t hit_set_count = 0;    ///< number of periods to retain
1460 	  bool use_gmt_hitset = true;	 ///< use gmt to name the hitset archive object
1461 	  uint32_t min_read_recency_for_promote = 0;   ///< minimum number of HitSet to check before promote on read
1462 	  uint32_t min_write_recency_for_promote = 0;  ///< minimum number of HitSet to check before promote on write
1463 	  uint32_t hit_set_grade_decay_rate = 0; ///< current hit_set has highest priority on objects
1464 	                                         ///< temperature count,the follow hit_set's priority decay
1465 	                                         ///< by this params than pre hit_set
1466 	  uint32_t hit_set_search_last_n = 0;    ///< accumulate atmost N hit_sets for temperature
1467 	
1468 	  uint32_t stripe_width = 0;        ///< erasure coded stripe size in bytes
1469 	
1470 	  uint64_t expected_num_objects = 0; ///< expected number of objects on this pool, a value of 0 indicates
1471 	                                     ///< user does not specify any expected value
1472 	  bool fast_read = false;            ///< whether turn on fast read on the pool or not
1473 	
1474 	  pool_opts_t opts; ///< options
1475 	
1476 	  typedef enum {
1477 	    TYPE_FINGERPRINT_NONE = 0,
1478 	    TYPE_FINGERPRINT_SHA1 = 1,     
1479 	    TYPE_FINGERPRINT_SHA256 = 2,     
1480 	    TYPE_FINGERPRINT_SHA512 = 3,     
1481 	  } fingerprint_t;
1482 	  static fingerprint_t get_fingerprint_from_str(const std::string& s) {
1483 	    if (s == "none")
1484 	      return TYPE_FINGERPRINT_NONE;
1485 	    if (s == "sha1")
1486 	      return TYPE_FINGERPRINT_SHA1;
1487 	    if (s == "sha256")
1488 	      return TYPE_FINGERPRINT_SHA256;
1489 	    if (s == "sha512")
1490 	      return TYPE_FINGERPRINT_SHA512;
1491 	    return (fingerprint_t)-1;
1492 	  }
1493 	  const fingerprint_t get_fingerprint_type() const {
1494 	    std::string fp_str;
1495 	    opts.get(pool_opts_t::FINGERPRINT_ALGORITHM, &fp_str);
1496 	    return get_fingerprint_from_str(fp_str);
1497 	  }
1498 	  const char *get_fingerprint_name() const {
1499 	    std::string fp_str;
1500 	    fingerprint_t fp_t;
1501 	    opts.get(pool_opts_t::FINGERPRINT_ALGORITHM, &fp_str);
1502 	    fp_t = get_fingerprint_from_str(fp_str);
1503 	    return get_fingerprint_name(fp_t);
1504 	  }
1505 	  static const char *get_fingerprint_name(fingerprint_t m) {
1506 	    switch (m) {
1507 	    case TYPE_FINGERPRINT_NONE: return "none";
1508 	    case TYPE_FINGERPRINT_SHA1: return "sha1";
1509 	    case TYPE_FINGERPRINT_SHA256: return "sha256";
1510 	    case TYPE_FINGERPRINT_SHA512: return "sha512";
1511 	    default: return "unknown";
1512 	    }
1513 	  }
1514 	
1515 	  /// application -> key/value metadata
1516 	  std::map<std::string, std::map<std::string, std::string>> application_metadata;
1517 	
1518 	private:
1519 	  std::vector<uint32_t> grade_table;
1520 	
1521 	public:
1522 	  uint32_t get_grade(unsigned i) const {
1523 	    if (grade_table.size() <= i)
1524 	      return 0;
1525 	    return grade_table[i];
1526 	  }
1527 	  void calc_grade_table() {
1528 	    unsigned v = 1000000;
1529 	    grade_table.resize(hit_set_count);
1530 	    for (unsigned i = 0; i < hit_set_count; i++) {
1531 	      v = v * (1 - (hit_set_grade_decay_rate / 100.0));
1532 	      grade_table[i] = v;
1533 	    }
1534 	  }
1535 	
1536 	  pg_pool_t() = default;
1537 	
1538 	  void dump(ceph::Formatter *f) const;
1539 	
1540 	  const utime_t &get_create_time() const { return create_time; }
1541 	  uint64_t get_flags() const { return flags; }
1542 	  bool has_flag(uint64_t f) const { return flags & f; }
1543 	  void set_flag(uint64_t f) { flags |= f; }
1544 	  void unset_flag(uint64_t f) { flags &= ~f; }
1545 	
1546 	  bool require_rollback() const {
1547 	    return is_erasure();
1548 	  }
1549 	
1550 	  /// true if incomplete clones may be present
1551 	  bool allow_incomplete_clones() const {
1552 	    return cache_mode != CACHEMODE_NONE || has_flag(FLAG_INCOMPLETE_CLONES);
1553 	  }
1554 	
1555 	  unsigned get_type() const { return type; }
1556 	  unsigned get_size() const { return size; }
1557 	  unsigned get_min_size() const { return min_size; }
1558 	  int get_crush_rule() const { return crush_rule; }
1559 	  int get_object_hash() const { return object_hash; }
1560 	  const char *get_object_hash_name() const {
1561 	    return ceph_str_hash_name(get_object_hash());
1562 	  }
1563 	  epoch_t get_last_change() const { return last_change; }
1564 	  epoch_t get_last_force_op_resend() const { return last_force_op_resend; }
1565 	  epoch_t get_last_force_op_resend_prenautilus() const {
1566 	    return last_force_op_resend_prenautilus;
1567 	  }
1568 	  epoch_t get_last_force_op_resend_preluminous() const {
1569 	    return last_force_op_resend_preluminous;
1570 	  }
1571 	  epoch_t get_snap_epoch() const { return snap_epoch; }
1572 	  snapid_t get_snap_seq() const { return snap_seq; }
1573 	  uint64_t get_auid() const { return auid; }
1574 	
1575 	  void set_snap_seq(snapid_t s) { snap_seq = s; }
1576 	  void set_snap_epoch(epoch_t e) { snap_epoch = e; }
1577 	
1578 	  void set_stripe_width(uint32_t s) { stripe_width = s; }
1579 	  uint32_t get_stripe_width() const { return stripe_width; }
1580 	
1581 	  bool is_replicated()   const { return get_type() == TYPE_REPLICATED; }
1582 	  bool is_erasure() const { return get_type() == TYPE_ERASURE; }
1583 	
1584 	  bool supports_omap() const {
1585 	    return !(get_type() == TYPE_ERASURE);
1586 	  }
1587 	
1588 	  bool requires_aligned_append() const {
1589 	    return is_erasure() && !has_flag(FLAG_EC_OVERWRITES);
1590 	  }
1591 	  uint64_t required_alignment() const { return stripe_width; }
1592 	
1593 	  bool allows_ecoverwrites() const {
1594 	    return has_flag(FLAG_EC_OVERWRITES);
1595 	  }
1596 	
1597 	  bool can_shift_osds() const {
1598 	    switch (get_type()) {
1599 	    case TYPE_REPLICATED:
1600 	      return true;
1601 	    case TYPE_ERASURE:
1602 	      return false;
1603 	    default:
1604 	      ceph_abort_msg("unhandled pool type");
1605 	    }
1606 	  }
1607 	
1608 	  unsigned get_pg_num() const { return pg_num; }
1609 	  unsigned get_pgp_num() const { return pgp_num; }
1610 	  unsigned get_pg_num_target() const { return pg_num_target; }
1611 	  unsigned get_pgp_num_target() const { return pgp_num_target; }
1612 	  unsigned get_pg_num_pending() const { return pg_num_pending; }
1613 	
1614 	  unsigned get_pg_num_mask() const { return pg_num_mask; }
1615 	  unsigned get_pgp_num_mask() const { return pgp_num_mask; }
1616 	
1617 	  // if pg_num is not a multiple of two, pgs are not equally sized.
1618 	  // return, for a given pg, the fraction (denominator) of the total
1619 	  // pool size that it represents.
1620 	  unsigned get_pg_num_divisor(pg_t pgid) const;
1621 	
1622 	  bool is_pending_merge(pg_t pgid, bool *target) const;
1623 	
1624 	  void set_pg_num(int p) {
1625 	    pg_num = p;
1626 	    pg_num_pending = p;
1627 	    calc_pg_masks();
1628 	  }
1629 	  void set_pgp_num(int p) {
1630 	    pgp_num = p;
1631 	    calc_pg_masks();
1632 	  }
1633 	  void set_pg_num_pending(int p) {
1634 	    pg_num_pending = p;
1635 	    calc_pg_masks();
1636 	  }
1637 	  void set_pg_num_target(int p) {
1638 	    pg_num_target = p;
1639 	  }
1640 	  void set_pgp_num_target(int p) {
1641 	    pgp_num_target = p;
1642 	  }
1643 	  void dec_pg_num(pg_t source_pgid,
1644 			  epoch_t ready_epoch,
1645 			  eversion_t source_version,
1646 			  eversion_t target_version,
1647 			  epoch_t last_epoch_started,
1648 			  epoch_t last_epoch_clean) {
1649 	    --pg_num;
1650 	    last_pg_merge_meta.source_pgid = source_pgid;
1651 	    last_pg_merge_meta.ready_epoch = ready_epoch;
1652 	    last_pg_merge_meta.source_version = source_version;
1653 	    last_pg_merge_meta.target_version = target_version;
1654 	    last_pg_merge_meta.last_epoch_started = last_epoch_started;
1655 	    last_pg_merge_meta.last_epoch_clean = last_epoch_clean;
1656 	    calc_pg_masks();
1657 	  }
1658 	
1659 	  void set_quota_max_bytes(uint64_t m) {
1660 	    quota_max_bytes = m;
1661 	  }
1662 	  uint64_t get_quota_max_bytes() {
1663 	    return quota_max_bytes;
1664 	  }
1665 	
1666 	  void set_quota_max_objects(uint64_t m) {
1667 	    quota_max_objects = m;
1668 	  }
1669 	  uint64_t get_quota_max_objects() {
1670 	    return quota_max_objects;
1671 	  }
1672 	
1673 	  void set_last_force_op_resend(uint64_t t) {
1674 	    last_force_op_resend = t;
1675 	    last_force_op_resend_prenautilus = t;
1676 	    last_force_op_resend_preluminous = t;
1677 	  }
1678 	
  /// recompute the derived pg/pgp bitmasks from pg_num/pgp_num
  void calc_pg_masks();

  /*
   * we have two snap modes:
   *  - pool global snaps
   *    - snap existence/non-existence defined by snaps[] and snap_seq
   *  - user managed snaps
   *    - removal governed by removed_snaps
   *
   * we know which mode we're using based on whether removed_snaps is empty.
   * If nothing has been created, both functions report false.
   */
  bool is_pool_snaps_mode() const;
  bool is_unmanaged_snaps_mode() const;
  bool is_removed_snap(snapid_t s) const;

  /// look up a pool snapshot by name; see the .cc for the not-found value
  snapid_t snap_exists(const char *s) const;
  /// create a named pool snapshot with the given creation timestamp
  void add_snap(const char *n, utime_t stamp);
  /// allocate a new self-managed snap id (compat flag selects pre-octopus
  /// removed_snaps bookkeeping)
  uint64_t add_unmanaged_snap(bool preoctopus_compat);
  /// delete a pool snapshot
  void remove_snap(snapid_t s);
  /// delete a self-managed snap (compat flag as for add_unmanaged_snap)
  void remove_unmanaged_snap(snapid_t s, bool preoctopus_compat);

  /// build a SnapContext from the pool's current snap state
  SnapContext get_snap_context() const;

  /// hash an object name+namespace key to a hash position
  uint32_t hash_key(const std::string& key, const std::string& ns) const;

  /// round a hash position down to a pg num
  uint32_t raw_hash_to_pg(uint32_t v) const;

  /*
   * map a raw pg (with full precision ps) into an actual pg, for storage
   */
  pg_t raw_pg_to_pg(pg_t pg) const;
  
  /*
   * map raw pg (full precision ps) into a placement seed.  include
   * pool id in that value so that different pools don't use the same
   * seeds.
   */
  ps_t raw_pg_to_pps(pg_t pg) const;

  /// choose a random hash position within a pg
  uint32_t get_random_pg_position(pg_t pgid, uint32_t seed) const;

  /// encode; features select the on-wire compat format
  void encode(ceph::buffer::list& bl, uint64_t features) const;
  void decode(ceph::buffer::list::const_iterator& bl);

  static void generate_test_instances(std::list<pg_pool_t*>& o);
1728 	};
1729 	WRITE_CLASS_ENCODER_FEATURES(pg_pool_t)
1730 	
1731 	std::ostream& operator<<(std::ostream& out, const pg_pool_t& p);
1732 	
1733 	
1734 	/**
1735 	 * a summation of object stats
1736 	 *
1737 	 * This is just a container for object stats; we don't know what for.
1738 	 *
1739 	 * If you add members in object_stat_sum_t, you should make sure there are
1740 	 * not padding among these members.
1741 	 * You should also modify the padding_check function.
1742 	
1743 	 */
struct object_stat_sum_t {
  /**************************************************************************
   * WARNING: be sure to update operator==, floor, and split when
   * adding/removing fields!
   **************************************************************************/
  int64_t num_bytes;    // in bytes
  int64_t num_objects;
  int64_t num_object_clones;
  int64_t num_object_copies;  // num_objects * num_replicas
  int64_t num_objects_missing_on_primary;
  int64_t num_objects_degraded;
  int64_t num_objects_unfound;
  int64_t num_rd;
  int64_t num_rd_kb;
  int64_t num_wr;
  int64_t num_wr_kb;
  int64_t num_scrub_errors;	// total deep and shallow scrub errors
  int64_t num_objects_recovered;
  int64_t num_bytes_recovered;
  int64_t num_keys_recovered;
  int64_t num_shallow_scrub_errors;
  int64_t num_deep_scrub_errors;
  int64_t num_objects_dirty;
  int64_t num_whiteouts;
  int64_t num_objects_omap;
  int64_t num_objects_hit_set_archive;
  int64_t num_objects_misplaced;
  int64_t num_bytes_hit_set_archive;
  int64_t num_flush;
  int64_t num_flush_kb;
  int64_t num_evict;
  int64_t num_evict_kb;
  int64_t num_promote;
  int32_t num_flush_mode_high;  // 1 when in high flush mode, otherwise 0
  int32_t num_flush_mode_low;   // 1 when in low flush mode, otherwise 0
  int32_t num_evict_mode_some;  // 1 when in evict some mode, otherwise 0
  int32_t num_evict_mode_full;  // 1 when in evict full mode, otherwise 0
  int64_t num_objects_pinned;
  int64_t num_objects_missing;
  int64_t num_legacy_snapsets; ///< upper bound on pre-luminous-style SnapSets
  int64_t num_large_omap_objects = 0;
  int64_t num_objects_manifest = 0;
  int64_t num_omap_bytes = 0;
  int64_t num_omap_keys = 0;
  int64_t num_objects_repaired = 0;

  // Zero-initialize the older fields; the newer fields above carry
  // in-class "= 0" initializers and need no mention here.
  object_stat_sum_t()
    : num_bytes(0),
      num_objects(0), num_object_clones(0), num_object_copies(0),
      num_objects_missing_on_primary(0), num_objects_degraded(0),
      num_objects_unfound(0),
      num_rd(0), num_rd_kb(0), num_wr(0), num_wr_kb(0),
      num_scrub_errors(0),
      num_objects_recovered(0),
      num_bytes_recovered(0),
      num_keys_recovered(0),
      num_shallow_scrub_errors(0),
      num_deep_scrub_errors(0),
      num_objects_dirty(0),
      num_whiteouts(0),
      num_objects_omap(0),
      num_objects_hit_set_archive(0),
      num_objects_misplaced(0),
      num_bytes_hit_set_archive(0),
      num_flush(0),
      num_flush_kb(0),
      num_evict(0),
      num_evict_kb(0),
      num_promote(0),
      num_flush_mode_high(0), num_flush_mode_low(0),
      num_evict_mode_some(0), num_evict_mode_full(0),
      num_objects_pinned(0),
      num_objects_missing(0),
      num_legacy_snapsets(0)
  {}

  /// Raise any field currently below f up to f (each FLOOR line clamps one
  /// field); num_scrub_errors is then recomputed from its two components
  /// rather than clamped directly.
  void floor(int64_t f) {
#define FLOOR(x) if (x < f) x = f
    FLOOR(num_bytes);
    FLOOR(num_objects);
    FLOOR(num_object_clones);
    FLOOR(num_object_copies);
    FLOOR(num_objects_missing_on_primary);
    FLOOR(num_objects_missing);
    FLOOR(num_objects_degraded);
    FLOOR(num_objects_misplaced);
    FLOOR(num_objects_unfound);
    FLOOR(num_rd);
    FLOOR(num_rd_kb);
    FLOOR(num_wr);
    FLOOR(num_wr_kb);
    FLOOR(num_large_omap_objects);
    FLOOR(num_objects_manifest);
    FLOOR(num_omap_bytes);
    FLOOR(num_omap_keys);
    FLOOR(num_shallow_scrub_errors);
    FLOOR(num_deep_scrub_errors);
    num_scrub_errors = num_shallow_scrub_errors + num_deep_scrub_errors;
    FLOOR(num_objects_recovered);
    FLOOR(num_bytes_recovered);
    FLOOR(num_keys_recovered);
    FLOOR(num_objects_dirty);
    FLOOR(num_whiteouts);
    FLOOR(num_objects_omap);
    FLOOR(num_objects_hit_set_archive);
    FLOOR(num_bytes_hit_set_archive);
    FLOOR(num_flush);
    FLOOR(num_flush_kb);
    FLOOR(num_evict);
    FLOOR(num_evict_kb);
    FLOOR(num_promote);
    FLOOR(num_flush_mode_high);
    FLOOR(num_flush_mode_low);
    FLOOR(num_evict_mode_some);
    FLOOR(num_evict_mode_full);
    FLOOR(num_objects_pinned);
    FLOOR(num_legacy_snapsets);
    FLOOR(num_objects_repaired);
#undef FLOOR
  }

  /// Divide this sum evenly across out.size() buckets.
  /// SPLIT distributes value/n per bucket, handing the remainder one unit
  /// at a time to the first (value % n) buckets so the parts re-add exactly.
  /// SPLIT_PRESERVE_NONZERO rounds up instead, so a nonzero input never
  /// splits into all-zero outputs (used for error counters below).
  void split(std::vector<object_stat_sum_t> &out) const {
#define SPLIT(PARAM)                            \
    for (unsigned i = 0; i < out.size(); ++i) { \
      out[i].PARAM = PARAM / out.size();        \
      if (i < (PARAM % out.size())) {           \
	out[i].PARAM++;                         \
      }                                         \
    }
#define SPLIT_PRESERVE_NONZERO(PARAM)		\
    for (unsigned i = 0; i < out.size(); ++i) { \
      if (PARAM)				\
	out[i].PARAM = 1 + PARAM / out.size();	\
      else					\
	out[i].PARAM = 0;			\
    }

    SPLIT(num_bytes);
    SPLIT(num_objects);
    SPLIT(num_object_clones);
    SPLIT(num_object_copies);
    SPLIT(num_objects_missing_on_primary);
    SPLIT(num_objects_missing);
    SPLIT(num_objects_degraded);
    SPLIT(num_objects_misplaced);
    SPLIT(num_objects_unfound);
    SPLIT(num_rd);
    SPLIT(num_rd_kb);
    SPLIT(num_wr);
    SPLIT(num_wr_kb);
    SPLIT(num_large_omap_objects);
    SPLIT(num_objects_manifest);
    SPLIT(num_omap_bytes);
    SPLIT(num_omap_keys);
    SPLIT(num_objects_repaired);
    SPLIT_PRESERVE_NONZERO(num_shallow_scrub_errors);
    SPLIT_PRESERVE_NONZERO(num_deep_scrub_errors);
    // keep the total consistent with its per-bucket components
    for (unsigned i = 0; i < out.size(); ++i) {
      out[i].num_scrub_errors = out[i].num_shallow_scrub_errors +
				out[i].num_deep_scrub_errors;
    }
    SPLIT(num_objects_recovered);
    SPLIT(num_bytes_recovered);
    SPLIT(num_keys_recovered);
    SPLIT(num_objects_dirty);
    SPLIT(num_whiteouts);
    SPLIT(num_objects_omap);
    SPLIT(num_objects_hit_set_archive);
    SPLIT(num_bytes_hit_set_archive);
    SPLIT(num_flush);
    SPLIT(num_flush_kb);
    SPLIT(num_evict);
    SPLIT(num_evict_kb);
    SPLIT(num_promote);
    SPLIT(num_flush_mode_high);
    SPLIT(num_flush_mode_low);
    SPLIT(num_evict_mode_some);
    SPLIT(num_evict_mode_full);
    SPLIT(num_objects_pinned);
    SPLIT_PRESERVE_NONZERO(num_legacy_snapsets);
#undef SPLIT
#undef SPLIT_PRESERVE_NONZERO
  }

  // memset is valid here only because every member is a plain integer;
  // padding_check() below guards that assumption.
  void clear() {
    memset(this, 0, sizeof(*this));
  }

  void calc_copies(int nrep) {
    num_object_copies = nrep * num_objects;
  }

  // raw-memory compare; same trivially-copyable assumption as clear()
  bool is_zero() const {
    return mem_is_zero((char*)this, sizeof(*this));
  }

  void add(const object_stat_sum_t& o);
  void sub(const object_stat_sum_t& o);

  void dump(ceph::Formatter *f) const;
  // Compile-time proof that the struct has no padding: the struct's size
  // must equal the sum of its members' sizes.  Add a sizeof() term here
  // whenever a field is added.
  void padding_check() {
    static_assert(
      sizeof(object_stat_sum_t) ==
        sizeof(num_bytes) +
        sizeof(num_objects) +
        sizeof(num_object_clones) +
        sizeof(num_object_copies) +
        sizeof(num_objects_missing_on_primary) +
        sizeof(num_objects_degraded) +
        sizeof(num_objects_unfound) +
        sizeof(num_rd) +
        sizeof(num_rd_kb) +
        sizeof(num_wr) +
        sizeof(num_wr_kb) +
        sizeof(num_scrub_errors) +
        sizeof(num_large_omap_objects) +
        sizeof(num_objects_manifest) +
        sizeof(num_omap_bytes) +
        sizeof(num_omap_keys) +
        sizeof(num_objects_repaired) +
        sizeof(num_objects_recovered) +
        sizeof(num_bytes_recovered) +
        sizeof(num_keys_recovered) +
        sizeof(num_shallow_scrub_errors) +
        sizeof(num_deep_scrub_errors) +
        sizeof(num_objects_dirty) +
        sizeof(num_whiteouts) +
        sizeof(num_objects_omap) +
        sizeof(num_objects_hit_set_archive) +
        sizeof(num_objects_misplaced) +
        sizeof(num_bytes_hit_set_archive) +
        sizeof(num_flush) +
        sizeof(num_flush_kb) +
        sizeof(num_evict) +
        sizeof(num_evict_kb) +
        sizeof(num_promote) +
        sizeof(num_flush_mode_high) +
        sizeof(num_flush_mode_low) +
        sizeof(num_evict_mode_some) +
        sizeof(num_evict_mode_full) +
        sizeof(num_objects_pinned) +
        sizeof(num_objects_missing) +
        sizeof(num_legacy_snapsets)
      ,
      "object_stat_sum_t have padding");
  }
  void encode(ceph::buffer::list& bl) const;
  void decode(ceph::buffer::list::const_iterator& bl);
  static void generate_test_instances(std::list<object_stat_sum_t*>& o);
};
1994 	WRITE_CLASS_ENCODER(object_stat_sum_t)
1995 	
1996 	bool operator==(const object_stat_sum_t& l, const object_stat_sum_t& r);
1997 	
1998 	/**
1999 	 * a collection of object stat sums
2000 	 *
2001 	 * This is a collection of stat sums over different categories.
2002 	 */
2003 	struct object_stat_collection_t {
2004 	  /**************************************************************************
2005 	   * WARNING: be sure to update the operator== when adding/removing fields! *
2006 	   **************************************************************************/
2007 	  object_stat_sum_t sum;
2008 	
2009 	  void calc_copies(int nrep) {
2010 	    sum.calc_copies(nrep);
2011 	  }
2012 	
2013 	  void dump(ceph::Formatter *f) const;
2014 	  void encode(ceph::buffer::list& bl) const;
2015 	  void decode(ceph::buffer::list::const_iterator& bl);
2016 	  static void generate_test_instances(std::list<object_stat_collection_t*>& o);
2017 	
2018 	  bool is_zero() const {
2019 	    return sum.is_zero();
2020 	  }
2021 	
2022 	  void clear() {
2023 	    sum.clear();
2024 	  }
2025 	
2026 	  void floor(int64_t f) {
2027 	    sum.floor(f);
2028 	  }
2029 	
2030 	  void add(const object_stat_sum_t& o) {
2031 	    sum.add(o);
2032 	  }
2033 	
2034 	  void add(const object_stat_collection_t& o) {
2035 	    sum.add(o.sum);
2036 	  }
2037 	  void sub(const object_stat_collection_t& o) {
2038 	    sum.sub(o.sum);
2039 	  }
2040 	};
2041 	WRITE_CLASS_ENCODER(object_stat_collection_t)
2042 	
// Two collections are equal iff their aggregated sums are equal.
inline bool operator==(const object_stat_collection_t& l,
		       const object_stat_collection_t& r) {
  return l.sum == r.sum;
}
2047 	
2048 	
2049 	/** pg_stat
2050 	 * aggregate stats for a single PG.
2051 	 */
struct pg_stat_t {
  /**************************************************************************
   * WARNING: be sure to update the operator== when adding/removing fields! *
   **************************************************************************/
  eversion_t version;
  version_t reported_seq;  // sequence number
  epoch_t reported_epoch;  // epoch of this report
  uint64_t state;
  utime_t last_fresh;   // last reported
  utime_t last_change;  // new state != previous state
  utime_t last_active;  // state & PG_STATE_ACTIVE
  utime_t last_peered;  // state & PG_STATE_ACTIVE || state & PG_STATE_PEERED
  utime_t last_clean;   // state & PG_STATE_CLEAN
  utime_t last_unstale; // (state & PG_STATE_STALE) == 0
  utime_t last_undegraded; // (state & PG_STATE_DEGRADED) == 0
  utime_t last_fullsized; // (state & PG_STATE_UNDERSIZED) == 0

  eversion_t log_start;         // (log_start,version]
  eversion_t ondisk_log_start;  // there may be more on disk

  epoch_t created;
  epoch_t last_epoch_clean;
  pg_t parent;
  __u32 parent_split_bits;

  eversion_t last_scrub;
  eversion_t last_deep_scrub;
  utime_t last_scrub_stamp;
  utime_t last_deep_scrub_stamp;
  utime_t last_clean_scrub_stamp;

  object_stat_collection_t stats;

  int64_t log_size;
  int64_t ondisk_log_size;    // >= active_log_size

  std::vector<int32_t> up, acting;
  std::vector<pg_shard_t> avail_no_missing;
  std::map< std::set<pg_shard_t>, int32_t > object_location_counts;
  epoch_t mapping_epoch;

  std::vector<int32_t> blocked_by;  ///< osds on which the pg is blocked

  interval_set<snapid_t> purged_snaps;  ///< recently removed snaps that we've purged

  utime_t last_became_active;
  utime_t last_became_peered;

  /// up, acting primaries
  int32_t up_primary;
  int32_t acting_primary;

  // snaptrimq.size() is 64bit, but let's be serious - anything over 50k is
  // absurd already, so cap it to 2^32 and save 4 bytes at the same time
  uint32_t snaptrimq_len;

  bool stats_invalid:1;
  /// true if num_objects_dirty is not accurate (because it was not
  /// maintained starting from pool creation)
  bool dirty_stats_invalid:1;
  bool omap_stats_invalid:1;
  bool hitset_stats_invalid:1;
  bool hitset_bytes_stats_invalid:1;
  bool pin_stats_invalid:1;
  bool manifest_stats_invalid:1;

  // POD-ish members are zeroed explicitly; class-type members (utime_t,
  // eversion_t, containers) rely on their own default constructors.
  pg_stat_t()
    : reported_seq(0),
      reported_epoch(0),
      state(0),
      created(0), last_epoch_clean(0),
      parent_split_bits(0),
      log_size(0), ondisk_log_size(0),
      mapping_epoch(0),
      up_primary(-1),
      acting_primary(-1),
      snaptrimq_len(0),
      stats_invalid(false),
      dirty_stats_invalid(false),
      omap_stats_invalid(false),
      hitset_stats_invalid(false),
      hitset_bytes_stats_invalid(false),
      pin_stats_invalid(false),
      manifest_stats_invalid(false)
  { }

  epoch_t get_effective_last_epoch_clean() const {
    if (state & PG_STATE_CLEAN) {
      // we are clean as of this report, and should thus take the
      // reported epoch
      return reported_epoch;
    } else {
      return last_epoch_clean;
    }
  }

  std::pair<epoch_t, version_t> get_version_pair() const {
    return { reported_epoch, reported_seq };
  }

  /// clamp the aggregate counters up to at least f
  void floor(int64_t f) {
    stats.floor(f);
    if (log_size < f)
      log_size = f;
    if (ondisk_log_size < f)
      ondisk_log_size = f;
    if (snaptrimq_len < f)
      snaptrimq_len = f;
  }

  void add_sub_invalid_flags(const pg_stat_t& o) {
    // adding (or subtracting!) invalid stats render our stats invalid too
    stats_invalid |= o.stats_invalid;
    dirty_stats_invalid |= o.dirty_stats_invalid;
    omap_stats_invalid |= o.omap_stats_invalid;
    hitset_stats_invalid |= o.hitset_stats_invalid;
    hitset_bytes_stats_invalid |= o.hitset_bytes_stats_invalid;
    pin_stats_invalid |= o.pin_stats_invalid;
    manifest_stats_invalid |= o.manifest_stats_invalid;
  }
  /// accumulate another PG's stats into this one
  void add(const pg_stat_t& o) {
    stats.add(o.stats);
    log_size += o.log_size;
    ondisk_log_size += o.ondisk_log_size;
    // saturate at 2^31 so the uint32_t cannot wrap when summing
    snaptrimq_len = std::min((uint64_t)snaptrimq_len + o.snaptrimq_len,
                             (uint64_t)(1ull << 31));
    add_sub_invalid_flags(o);
  }
  /// remove another PG's stats from this one
  void sub(const pg_stat_t& o) {
    stats.sub(o.stats);
    log_size -= o.log_size;
    ondisk_log_size -= o.ondisk_log_size;
    // clamp at zero instead of underflowing the unsigned counter
    if (o.snaptrimq_len < snaptrimq_len) {
      snaptrimq_len -= o.snaptrimq_len;
    } else {
      snaptrimq_len = 0;
    }
    add_sub_invalid_flags(o);
  }

  bool is_acting_osd(int32_t osd, bool primary) const;
  void dump(ceph::Formatter *f) const;
  void dump_brief(ceph::Formatter *f) const;
  void encode(ceph::buffer::list &bl) const;
  void decode(ceph::buffer::list::const_iterator &bl);
  static void generate_test_instances(std::list<pg_stat_t*>& o);
};
2199 	WRITE_CLASS_ENCODER(pg_stat_t)
2200 	
2201 	bool operator==(const pg_stat_t& l, const pg_stat_t& r);
2202 	
2203 	/** store_statfs_t
2204 	 * ObjectStore full statfs information
2205 	 */
struct store_statfs_t
{
  uint64_t total = 0;                  ///< Total bytes
  uint64_t available = 0;              ///< Free bytes available
  uint64_t internally_reserved = 0;    ///< Bytes reserved for internal purposes

  int64_t allocated = 0;               ///< Bytes allocated by the store

  int64_t data_stored = 0;                ///< Bytes actually stored by the user
  int64_t data_compressed = 0;            ///< Bytes stored after compression
  int64_t data_compressed_allocated = 0;  ///< Bytes allocated for compressed data
  int64_t data_compressed_original = 0;   ///< Bytes that were compressed

  int64_t omap_allocated = 0;         ///< approx usage of omap data
  int64_t internal_metadata = 0;      ///< approx usage of internal metadata

  /// reset every field back to its default (zero)
  void reset() {
    *this = store_statfs_t();
  }
  /// clamp every field up to at least f; the cast makes the unsigned
  /// fields compare against f as signed values
  void floor(int64_t f) {
#define FLOOR(x) if (int64_t(x) < f) x = f
    FLOOR(total);
    FLOOR(available);
    FLOOR(internally_reserved);
    FLOOR(allocated);
    FLOOR(data_stored);
    FLOOR(data_compressed);
    FLOOR(data_compressed_allocated);
    FLOOR(data_compressed_original);

    FLOOR(omap_allocated);
    FLOOR(internal_metadata);
#undef FLOOR
  }

  bool operator ==(const store_statfs_t& other) const;
  /// true iff every field still holds its default value
  bool is_zero() const {
    return *this == store_statfs_t();
  }

  uint64_t get_used() const {
    return total - available - internally_reserved;
  }

  // this accumulates both actually used and statfs's internally_reserved
  uint64_t get_used_raw() const {
    return total - available;
  }

  /// fraction of total space used (0.0 when total is unknown/zero)
  float get_used_raw_ratio() const {
    if (total) {
      return (float)get_used_raw() / (float)total;
    } else {
      return 0.0;
    }
  }

  // helpers to ease legacy code porting
  uint64_t kb_avail() const {
    return available >> 10;
  }
  uint64_t kb() const {
    return total >> 10;
  }
  uint64_t kb_used() const {
    return (total - available - internally_reserved) >> 10;
  }
  uint64_t kb_used_raw() const {
    return get_used_raw() >> 10;
  }

  uint64_t kb_used_data() const {
    return allocated >> 10;
  }
  uint64_t kb_used_omap() const {
    return omap_allocated >> 10;
  }

  uint64_t kb_used_internal_metadata() const {
    return internal_metadata >> 10;
  }

  /// field-wise accumulate another statfs into this one
  void add(const store_statfs_t& o) {
    total += o.total;
    available += o.available;
    internally_reserved += o.internally_reserved;
    allocated += o.allocated;
    data_stored += o.data_stored;
    data_compressed += o.data_compressed;
    data_compressed_allocated += o.data_compressed_allocated;
    data_compressed_original += o.data_compressed_original;
    omap_allocated += o.omap_allocated;
    internal_metadata += o.internal_metadata;
  }
  /// field-wise subtract another statfs from this one
  void sub(const store_statfs_t& o) {
    total -= o.total;
    available -= o.available;
    internally_reserved -= o.internally_reserved;
    allocated -= o.allocated;
    data_stored -= o.data_stored;
    data_compressed -= o.data_compressed;
    data_compressed_allocated -= o.data_compressed_allocated;
    data_compressed_original -= o.data_compressed_original;
    omap_allocated -= o.omap_allocated;
    internal_metadata -= o.internal_metadata;
  }
  void dump(ceph::Formatter *f) const;
  // wire format v1; field order here defines the encoding
  DENC(store_statfs_t, v, p) {
    DENC_START(1, 1, p);
    denc(v.total, p);
    denc(v.available, p);
    denc(v.internally_reserved, p);
    denc(v.allocated, p);
    denc(v.data_stored, p);
    denc(v.data_compressed, p);
    denc(v.data_compressed_allocated, p);
    denc(v.data_compressed_original, p);
    denc(v.omap_allocated, p);
    denc(v.internal_metadata, p);
    DENC_FINISH(p);
  }
  static void generate_test_instances(std::list<store_statfs_t*>& o);
};
2329 	WRITE_CLASS_DENC(store_statfs_t)
2330 	
2331 	std::ostream &operator<<(std::ostream &lhs, const store_statfs_t &rhs);
2332 	
2333 	/** osd_stat
2334 	 * aggregate stats for an osd
2335 	 */
2336 	struct osd_stat_t {
2337 	  store_statfs_t statfs;
2338 	  std::vector<int> hb_peers;
2339 	  int32_t snap_trim_queue_len, num_snap_trimming;
2340 	  uint64_t num_shards_repaired;
2341 	
2342 	  pow2_hist_t op_queue_age_hist;
2343 	
2344 	  objectstore_perf_stat_t os_perf_stat;
2345 	  osd_alerts_t os_alerts;
2346 	
2347 	  epoch_t up_from = 0;
2348 	  uint64_t seq = 0;
2349 	
2350 	  uint32_t num_pgs = 0;
2351 	
2352 	  uint32_t num_osds = 0;
2353 	  uint32_t num_per_pool_osds = 0;
2354 	  uint32_t num_per_pool_omap_osds = 0;
2355 	
2356 	  struct Interfaces {
2357 	    uint32_t last_update;  // in seconds
2358 	    uint32_t back_pingtime[3];
2359 	    uint32_t back_min[3];
2360 	    uint32_t back_max[3];
2361 	    uint32_t back_last;
2362 	    uint32_t front_pingtime[3];
2363 	    uint32_t front_min[3];
2364 	    uint32_t front_max[3];
2365 	    uint32_t front_last;
2366 	  };
2367 	  map<int, Interfaces> hb_pingtime;  ///< map of osd id to Interfaces
2368 	
2369 	  osd_stat_t() : snap_trim_queue_len(0), num_snap_trimming(0),
2370 	       num_shards_repaired(0)	{}
2371 	
2372 	 void add(const osd_stat_t& o) {
2373 	    statfs.add(o.statfs);
2374 	    snap_trim_queue_len += o.snap_trim_queue_len;
2375 	    num_snap_trimming += o.num_snap_trimming;
2376 	    num_shards_repaired += o.num_shards_repaired;
2377 	    op_queue_age_hist.add(o.op_queue_age_hist);
2378 	    os_perf_stat.add(o.os_perf_stat);
2379 	    num_pgs += o.num_pgs;
2380 	    num_osds += o.num_osds;
2381 	    num_per_pool_osds += o.num_per_pool_osds;
2382 	    num_per_pool_omap_osds += o.num_per_pool_omap_osds;
2383 	    for (const auto& a : o.os_alerts) {
2384 	      auto& target = os_alerts[a.first];
2385 	      for (auto& i : a.second) {
2386 		target.emplace(i.first, i.second);
2387 	      }
2388 	    }
2389 	  }
2390 	  void sub(const osd_stat_t& o) {
2391 	    statfs.sub(o.statfs);
2392 	    snap_trim_queue_len -= o.snap_trim_queue_len;
2393 	    num_snap_trimming -= o.num_snap_trimming;
2394 	    num_shards_repaired -= o.num_shards_repaired;
2395 	    op_queue_age_hist.sub(o.op_queue_age_hist);
2396 	    os_perf_stat.sub(o.os_perf_stat);
2397 	    num_pgs -= o.num_pgs;
2398 	    num_osds -= o.num_osds;
2399 	    num_per_pool_osds -= o.num_per_pool_osds;
2400 	    num_per_pool_omap_osds -= o.num_per_pool_omap_osds;
2401 	    for (const auto& a : o.os_alerts) {
2402 	      auto& target = os_alerts[a.first];
2403 	      for (auto& i : a.second) {
2404 	        target.erase(i.first);
2405 	      }
2406 	      if (target.empty()) {
2407 		os_alerts.erase(a.first);
2408 	      }
2409 	    }
2410 	  }
2411 	  void dump(ceph::Formatter *f) const;
2412 	  void encode(ceph::buffer::list &bl, uint64_t features) const;
2413 	  void decode(ceph::buffer::list::const_iterator &bl);
2414 	  static void generate_test_instances(std::list<osd_stat_t*>& o);
2415 	};
2416 	WRITE_CLASS_ENCODER_FEATURES(osd_stat_t)
2417 	
2418 	inline bool operator==(const osd_stat_t& l, const osd_stat_t& r) {
2419 	  return l.statfs == r.statfs &&
2420 	    l.snap_trim_queue_len == r.snap_trim_queue_len &&
2421 	    l.num_snap_trimming == r.num_snap_trimming &&
2422 	    l.num_shards_repaired == r.num_shards_repaired &&
2423 	    l.hb_peers == r.hb_peers &&
2424 	    l.op_queue_age_hist == r.op_queue_age_hist &&
2425 	    l.os_perf_stat == r.os_perf_stat &&
2426 	    l.num_pgs == r.num_pgs &&
2427 	    l.num_osds == r.num_osds &&
2428 	    l.num_per_pool_osds == r.num_per_pool_osds &&
2429 	    l.num_per_pool_omap_osds == r.num_per_pool_omap_osds;
2430 	}
// Defined in terms of operator== above.
inline bool operator!=(const osd_stat_t& l, const osd_stat_t& r) {
  return !(l == r);
}
2434 	
2435 	inline std::ostream& operator<<(std::ostream& out, const osd_stat_t& s) {
2436 	  return out << "osd_stat(" << s.statfs << ", "
2437 		     << "peers " << s.hb_peers
2438 		     << " op hist " << s.op_queue_age_hist.h
2439 		     << ")";
2440 	}
2441 	
2442 	/*
2443 	 * summation over an entire pool
2444 	 */
struct pool_stat_t {
  object_stat_collection_t stats;
  store_statfs_t store_stats;
  int64_t log_size;
  int64_t ondisk_log_size;    // >= active_log_size
  int32_t up;       ///< number of up replicas or shards
  int32_t acting;   ///< number of acting replicas or shards
  int32_t num_store_stats; ///< amount of store_stats accumulated

  pool_stat_t() : log_size(0), ondisk_log_size(0), up(0), acting(0),
    num_store_stats(0)
  { }

  /// clamp every counter up to at least f
  void floor(int64_t f) {
    stats.floor(f);
    store_stats.floor(f);
    if (log_size < f)
      log_size = f;
    if (ondisk_log_size < f)
      ondisk_log_size = f;
    if (up < f)
      up = f;
    if (acting < f)
      acting = f;
    if (num_store_stats < f)
      num_store_stats = f;
  }

  /// accumulate a per-OSD store statfs; num_store_stats counts how many
  /// were folded in
  void add(const store_statfs_t& o) {
    store_stats.add(o);
    ++num_store_stats;
  }
  void sub(const store_statfs_t& o) {
    store_stats.sub(o);
    --num_store_stats;
  }

  /// accumulate a PG's stats into the pool totals
  void add(const pg_stat_t& o) {
    stats.add(o.stats);
    log_size += o.log_size;
    ondisk_log_size += o.ondisk_log_size;
    up += o.up.size();
    acting += o.acting.size();
  }
  void sub(const pg_stat_t& o) {
    stats.sub(o.stats);
    log_size -= o.log_size;
    ondisk_log_size -= o.ondisk_log_size;
    up -= o.up.size();
    acting -= o.acting.size();
  }

  bool is_zero() const {
    return (stats.is_zero() &&
            store_stats.is_zero() &&
	    log_size == 0 &&
	    ondisk_log_size == 0 &&
	    up == 0 &&
	    acting == 0 &&
	    num_store_stats == 0);
  }

  // helper accessors to retrieve used/netto bytes depending on the
  // collection method: new per-pool objectstore report or legacy PG
  // summation at OSD.
  // In legacy mode used and netto values are the same. But for new per-pool
  // collection 'used' provides amount of space ALLOCATED at all related OSDs 
  // and 'netto' is amount of stored user data.
  uint64_t get_allocated_data_bytes(bool per_pool) const {
    if (per_pool) {
      return store_stats.allocated;
    } else {
      // legacy mode, use numbers from 'stats'
      return stats.sum.num_bytes + stats.sum.num_bytes_hit_set_archive;
    }
  }
  uint64_t get_allocated_omap_bytes(bool per_pool_omap) const {
    if (per_pool_omap) {
      return store_stats.omap_allocated;
    } else {
      // omap is not broken out by pool by nautilus bluestore; report the
      // scrub value.  this will be imprecise in that it won't account for
      // any storage overhead/efficiency.
      return stats.sum.num_omap_bytes;
    }
  }
  uint64_t get_user_data_bytes(float raw_used_rate, ///< space amp factor
			       bool per_pool) const {
    // NOTE: we need the space amp factor so that we can work backwards from
    // the raw utilization to the amount of data that the user actually stored.
    if (per_pool) {
      return raw_used_rate ? store_stats.data_stored / raw_used_rate : 0;
    } else {
      // legacy mode, use numbers from 'stats'.  note that we do NOT use the
      // raw_used_rate factor here because we are working from the PG stats
      // directly.
      return stats.sum.num_bytes + stats.sum.num_bytes_hit_set_archive;
    }
  }
  uint64_t get_user_omap_bytes(float raw_used_rate, ///< space amp factor
			       bool per_pool_omap) const {
    if (per_pool_omap) {
      return raw_used_rate ? store_stats.omap_allocated / raw_used_rate : 0;
    } else {
      // omap usage is lazily reported during scrub; this value may lag.
      return stats.sum.num_omap_bytes;
    }
  }

  void dump(ceph::Formatter *f) const;
  void encode(ceph::buffer::list &bl, uint64_t features) const;
  void decode(ceph::buffer::list::const_iterator &bl);
  static void generate_test_instances(std::list<pool_stat_t*>& o);
};
2559 	WRITE_CLASS_ENCODER_FEATURES(pool_stat_t)
2560 	
2561 	
2562 	// -----------------------------------------
2563 	
2564 	/**
2565 	 * pg_hit_set_info_t - information about a single recorded HitSet
2566 	 *
2567 	 * Track basic metadata about a HitSet, like the number of insertions
2568 	 * and the time range it covers.
2569 	 */
2570 	struct pg_hit_set_info_t {
2571 	  utime_t begin, end;   ///< time interval
2572 	  eversion_t version;   ///< version this HitSet object was written
2573 	  bool using_gmt;	///< use gmt for creating the hit_set archive object name
2574 	
2575 	  friend bool operator==(const pg_hit_set_info_t& l,
2576 				 const pg_hit_set_info_t& r) {
2577 	    return
2578 	      l.begin == r.begin &&
2579 	      l.end == r.end &&
2580 	      l.version == r.version &&
2581 	      l.using_gmt == r.using_gmt;
2582 	  }
2583 	
2584 	  explicit pg_hit_set_info_t(bool using_gmt = true)
2585 	    : using_gmt(using_gmt) {}
2586 	
2587 	  void encode(ceph::buffer::list &bl) const;
2588 	  void decode(ceph::buffer::list::const_iterator &bl);
2589 	  void dump(ceph::Formatter *f) const;
2590 	  static void generate_test_instances(std::list<pg_hit_set_info_t*>& o);
2591 	};
2592 	WRITE_CLASS_ENCODER(pg_hit_set_info_t)
2593 	
2594 	/**
2595 	 * pg_hit_set_history_t - information about a history of hitsets
2596 	 *
2597 	 * Include information about the currently accumulating hit set as well
2598 	 * as archived/historical ones.
2599 	 */
2600 	struct pg_hit_set_history_t {
2601 	  eversion_t current_last_update;  ///< last version inserted into current set
2602 	  std::list<pg_hit_set_info_t> history; ///< archived sets, sorted oldest -> newest
2603 	
2604 	  friend bool operator==(const pg_hit_set_history_t& l,
2605 				 const pg_hit_set_history_t& r) {
2606 	    return
2607 	      l.current_last_update == r.current_last_update &&
2608 	      l.history == r.history;
2609 	  }
2610 	
2611 	  void encode(ceph::buffer::list &bl) const;
2612 	  void decode(ceph::buffer::list::const_iterator &bl);
2613 	  void dump(ceph::Formatter *f) const;
2614 	  static void generate_test_instances(std::list<pg_hit_set_history_t*>& o);
2615 	};
2616 	WRITE_CLASS_ENCODER(pg_hit_set_history_t)
2617 	
2618 	
2619 	// -----------------------------------------
2620 	
2621 	/**
2622 	 * pg_history_t - information about recent pg peering/mapping history
2623 	 *
2624 	 * This is aggressively shared between OSDs to bound the amount of past
2625 	 * history they need to worry about.
2626 	 */
2627 	struct pg_history_t {
2628 	  epoch_t epoch_created = 0;       // epoch in which *pg* was created (pool or pg)
2629 	  epoch_t epoch_pool_created = 0;  // epoch in which *pool* was created
2630 				       // (note: may be pg creation epoch for
2631 				       // pre-luminous clusters)
2632 	  epoch_t last_epoch_started = 0;;  // lower bound on last epoch started (anywhere, not necessarily locally)
2633 	  epoch_t last_interval_started = 0;; // first epoch of last_epoch_started interval
2634 	  epoch_t last_epoch_clean = 0;;    // lower bound on last epoch the PG was completely clean.
2635 	  epoch_t last_interval_clean = 0;; // first epoch of last_epoch_clean interval
2636 	  epoch_t last_epoch_split = 0;;    // as parent or child
2637 	  epoch_t last_epoch_marked_full = 0;;  // pool or cluster
2638 	
2639 	  /**
2640 	   * In the event of a map discontinuity, same_*_since may reflect the first
2641 	   * map the osd has seen in the new map sequence rather than the actual start
2642 	   * of the interval.  This is ok since a discontinuity at epoch e means there
2643 	   * must have been a clean interval between e and now and that we cannot be
2644 	   * in the active set during the interval containing e.
2645 	   */
2646 	  epoch_t same_up_since = 0;;       // same acting set since
2647 	  epoch_t same_interval_since = 0;;   // same acting AND up set since
2648 	  epoch_t same_primary_since = 0;;  // same primary at least back through this epoch.
2649 	
2650 	  eversion_t last_scrub;
2651 	  eversion_t last_deep_scrub;
2652 	  utime_t last_scrub_stamp;
2653 	  utime_t last_deep_scrub_stamp;
2654 	  utime_t last_clean_scrub_stamp;
2655 	
2656 	  /// upper bound on how long prior interval readable (relative to encode time)
2657 	  ceph::timespan prior_readable_until_ub = ceph::timespan::zero();
2658 	
2659 	  friend bool operator==(const pg_history_t& l, const pg_history_t& r) {
2660 	    return
2661 	      l.epoch_created == r.epoch_created &&
2662 	      l.epoch_pool_created == r.epoch_pool_created &&
2663 	      l.last_epoch_started == r.last_epoch_started &&
2664 	      l.last_interval_started == r.last_interval_started &&
2665 	      l.last_epoch_clean == r.last_epoch_clean &&
2666 	      l.last_interval_clean == r.last_interval_clean &&
2667 	      l.last_epoch_split == r.last_epoch_split &&
2668 	      l.last_epoch_marked_full == r.last_epoch_marked_full &&
2669 	      l.same_up_since == r.same_up_since &&
2670 	      l.same_interval_since == r.same_interval_since &&
2671 	      l.same_primary_since == r.same_primary_since &&
2672 	      l.last_scrub == r.last_scrub &&
2673 	      l.last_deep_scrub == r.last_deep_scrub &&
2674 	      l.last_scrub_stamp == r.last_scrub_stamp &&
2675 	      l.last_deep_scrub_stamp == r.last_deep_scrub_stamp &&
2676 	      l.last_clean_scrub_stamp == r.last_clean_scrub_stamp &&
2677 	      l.prior_readable_until_ub == r.prior_readable_until_ub;
2678 	  }
2679 	
2680 	  pg_history_t() {}
2681 	  pg_history_t(epoch_t created, utime_t stamp)
2682 	    : epoch_created(created),
2683 	      epoch_pool_created(created),
2684 	      same_up_since(created),
2685 	      same_interval_since(created),
2686 	      same_primary_since(created),
2687 	      last_scrub_stamp(stamp),
2688 	      last_deep_scrub_stamp(stamp),
2689 	      last_clean_scrub_stamp(stamp) {}
2690 	  
2691 	  bool merge(const pg_history_t &other) {
2692 	    // Here, we only update the fields which cannot be calculated from the OSDmap.
2693 	    bool modified = false;
2694 	    if (epoch_created < other.epoch_created) {
2695 	      epoch_created = other.epoch_created;
2696 	      modified = true;
2697 	    }
2698 	    if (epoch_pool_created < other.epoch_pool_created) {
2699 	      // FIXME: for jewel compat only; this should either be 0 or always the
2700 	      // same value across all pg instances.
2701 	      epoch_pool_created = other.epoch_pool_created;
2702 	      modified = true;
2703 	    }
2704 	    if (last_epoch_started < other.last_epoch_started) {
2705 	      last_epoch_started = other.last_epoch_started;
2706 	      modified = true;
2707 	    }
2708 	    if (last_interval_started < other.last_interval_started) {
2709 	      last_interval_started = other.last_interval_started;
2710 	      // if we are learning about a newer *started* interval, our
2711 	      // readable_until_ub is obsolete
2712 	      prior_readable_until_ub = other.prior_readable_until_ub;
2713 	      modified = true;
2714 	    } else if (other.last_interval_started == last_interval_started &&
2715 		       other.prior_readable_until_ub < prior_readable_until_ub) {
2716 	      // if other is the *same* interval, than pull our upper bound in
2717 	      // if they have a tighter bound.
2718 	      prior_readable_until_ub = other.prior_readable_until_ub;
2719 	      modified = true;
2720 	    }
2721 	    if (last_epoch_clean < other.last_epoch_clean) {
2722 	      last_epoch_clean = other.last_epoch_clean;
2723 	      modified = true;
2724 	    }
2725 	    if (last_interval_clean < other.last_interval_clean) {
2726 	      last_interval_clean = other.last_interval_clean;
2727 	      modified = true;
2728 	    }
2729 	    if (last_epoch_split < other.last_epoch_split) {
2730 	      last_epoch_split = other.last_epoch_split; 
2731 	      modified = true;
2732 	    }
2733 	    if (last_epoch_marked_full < other.last_epoch_marked_full) {
2734 	      last_epoch_marked_full = other.last_epoch_marked_full;
2735 	      modified = true;
2736 	    }
2737 	    if (other.last_scrub > last_scrub) {
2738 	      last_scrub = other.last_scrub;
2739 	      modified = true;
2740 	    }
2741 	    if (other.last_scrub_stamp > last_scrub_stamp) {
2742 	      last_scrub_stamp = other.last_scrub_stamp;
2743 	      modified = true;
2744 	    }
2745 	    if (other.last_deep_scrub > last_deep_scrub) {
2746 	      last_deep_scrub = other.last_deep_scrub;
2747 	      modified = true;
2748 	    }
2749 	    if (other.last_deep_scrub_stamp > last_deep_scrub_stamp) {
2750 	      last_deep_scrub_stamp = other.last_deep_scrub_stamp;
2751 	      modified = true;
2752 	    }
2753 	    if (other.last_clean_scrub_stamp > last_clean_scrub_stamp) {
2754 	      last_clean_scrub_stamp = other.last_clean_scrub_stamp;
2755 	      modified = true;
2756 	    }
2757 	    return modified;
2758 	  }
2759 	
2760 	  void encode(ceph::buffer::list& bl) const;
2761 	  void decode(ceph::buffer::list::const_iterator& p);
2762 	  void dump(ceph::Formatter *f) const;
2763 	  static void generate_test_instances(std::list<pg_history_t*>& o);
2764 	
2765 	  ceph::signedspan refresh_prior_readable_until_ub(
2766 	    ceph::signedspan now,  ///< now, relative to osd startup_time
2767 	    ceph::signedspan ub) { ///< ub, relative to osd startup_time
2768 	    if (now >= ub) {
2769 	      // prior interval(s) are unreadable; we can zero the upper bound
2770 	      prior_readable_until_ub = ceph::signedspan::zero();
2771 	      return ceph::signedspan::zero();
2772 	    } else {
2773 	      prior_readable_until_ub = ub - now;
2774 	      return ub;
2775 	    }
2776 	  }
2777 	  ceph::signedspan get_prior_readable_until_ub(ceph::signedspan now) {
2778 	    if (prior_readable_until_ub == ceph::signedspan::zero()) {
2779 	      return ceph::signedspan::zero();
2780 	    }
2781 	    return now + prior_readable_until_ub;
2782 	  }
2783 	};
2784 	WRITE_CLASS_ENCODER(pg_history_t)
2785 	
2786 	inline std::ostream& operator<<(std::ostream& out, const pg_history_t& h) {
2787 	  out << "ec=" << h.epoch_created << "/" << h.epoch_pool_created
2788 	      << " lis/c=" << h.last_interval_started
2789 	      << "/" << h.last_interval_clean
2790 	      << " les/c/f=" << h.last_epoch_started << "/" << h.last_epoch_clean
2791 	      << "/" << h.last_epoch_marked_full
2792 	      << " sis=" << h.same_interval_since;
2793 	  if (h.prior_readable_until_ub != ceph::timespan::zero()) {
2794 	    out << " pruub=" << h.prior_readable_until_ub;
2795 	  }
2796 	  return out;
2797 	}
2798 	
2799 	
2800 	/**
2801 	 * pg_info_t - summary of PG statistics.
2802 	 *
2803 	 * some notes: 
2804 	 *  - last_complete implies we have all objects that existed as of that
2805 	 *    stamp, OR a newer object, OR have already applied a later delete.
2806 	 *  - if last_complete >= log.bottom, then we know pg contents thru log.head.
2807 	 *    otherwise, we have no idea what the pg is supposed to contain.
2808 	 */
2809 	struct pg_info_t {
2810 	  spg_t pgid;
2811 	  eversion_t last_update;      ///< last object version applied to store.
2812 	  eversion_t last_complete;    ///< last version pg was complete through.
2813 	  epoch_t last_epoch_started;  ///< last epoch at which this pg started on this osd
2814 	  epoch_t last_interval_started; ///< first epoch of last_epoch_started interval
2815 	  
2816 	  version_t last_user_version; ///< last user object version applied to store
2817 	
2818 	  eversion_t log_tail;         ///< oldest log entry.
2819 	
2820 	  hobject_t last_backfill;     ///< objects >= this and < last_complete may be missing
2821 	
2822 	  interval_set<snapid_t> purged_snaps;
2823 	
2824 	  pg_stat_t stats;
2825 	
2826 	  pg_history_t history;
2827 	  pg_hit_set_history_t hit_set;
2828 	
2829 	  friend bool operator==(const pg_info_t& l, const pg_info_t& r) {
2830 	    return
2831 	      l.pgid == r.pgid &&
2832 	      l.last_update == r.last_update &&
2833 	      l.last_complete == r.last_complete &&
2834 	      l.last_epoch_started == r.last_epoch_started &&
2835 	      l.last_interval_started == r.last_interval_started &&
2836 	      l.last_user_version == r.last_user_version &&
2837 	      l.log_tail == r.log_tail &&
2838 	      l.last_backfill == r.last_backfill &&
2839 	      l.purged_snaps == r.purged_snaps &&
2840 	      l.stats == r.stats &&
2841 	      l.history == r.history &&
2842 	      l.hit_set == r.hit_set;
2843 	  }
2844 	
2845 	  pg_info_t()
2846 	    : last_epoch_started(0),
2847 	      last_interval_started(0),
2848 	      last_user_version(0),
2849 	      last_backfill(hobject_t::get_max())
2850 	  { }
2851 	  // cppcheck-suppress noExplicitConstructor
2852 	  pg_info_t(spg_t p)
2853 	    : pgid(p),
2854 	      last_epoch_started(0),
2855 	      last_interval_started(0),
2856 	      last_user_version(0),
2857 	      last_backfill(hobject_t::get_max())
2858 	  { }
2859 	  
2860 	  void set_last_backfill(hobject_t pos) {
2861 	    last_backfill = pos;
2862 	  }
2863 	
2864 	  bool is_empty() const { return last_update.version == 0; }
2865 	  bool dne() const { return history.epoch_created == 0; }
2866 	
2867 	  bool has_missing() const { return last_complete != last_update; }
2868 	  bool is_incomplete() const { return !last_backfill.is_max(); }
2869 	
2870 	  void encode(ceph::buffer::list& bl) const;
2871 	  void decode(ceph::buffer::list::const_iterator& p);
2872 	  void dump(ceph::Formatter *f) const;
2873 	  static void generate_test_instances(std::list<pg_info_t*>& o);
2874 	};
2875 	WRITE_CLASS_ENCODER(pg_info_t)
2876 	
2877 	inline std::ostream& operator<<(std::ostream& out, const pg_info_t& pgi) 
2878 	{
2879 	  out << pgi.pgid << "(";
2880 	  if (pgi.dne())
2881 	    out << " DNE";
2882 	  if (pgi.is_empty())
2883 	    out << " empty";
2884 	  else {
2885 	    out << " v " << pgi.last_update;
2886 	    if (pgi.last_complete != pgi.last_update)
2887 	      out << " lc " << pgi.last_complete;
2888 	    out << " (" << pgi.log_tail << "," << pgi.last_update << "]";
2889 	  }
2890 	  if (pgi.is_incomplete())
2891 	    out << " lb " << pgi.last_backfill;
2892 	  //out << " c " << pgi.epoch_created;
2893 	  out << " local-lis/les=" << pgi.last_interval_started
2894 	      << "/" << pgi.last_epoch_started;
2895 	  out << " n=" << pgi.stats.stats.sum.num_objects;
2896 	  out << " " << pgi.history
2897 	      << ")";
2898 	  return out;
2899 	}
2900 	
2901 	/**
2902 	 * pg_fast_info_t - common pg_info_t fields
2903 	 *
2904 	 * These are the fields of pg_info_t (and children) that are updated for
2905 	 * most IO operations.
2906 	 *
2907 	 * ** WARNING **
2908 	 * Because we rely on these fields to be applied to the normal
2909 	 * info struct, adding a new field here that is not also new in info
2910 	 * means that we must set an incompat OSD feature bit!
2911 	 */
struct pg_fast_info_t {
  eversion_t last_update;
  eversion_t last_complete;
  version_t last_user_version;
  struct { // pg_stat_t stats
    eversion_t version;
    version_t reported_seq;
    utime_t last_fresh;
    utime_t last_active;
    utime_t last_peered;
    utime_t last_clean;
    utime_t last_unstale;
    utime_t last_undegraded;
    utime_t last_fullsized;
    int64_t log_size;  // (also ondisk_log_size, which has the same value)
    struct { // object_stat_collection_t stats;
      struct { // object_stat_sum_t sum
	int64_t num_bytes;    // in bytes
	int64_t num_objects;
	int64_t num_object_copies;
	int64_t num_rd;
	int64_t num_rd_kb;
	int64_t num_wr;
	int64_t num_wr_kb;
	int64_t num_objects_dirty;
      } sum;
    } stats;
  } stats;

  /// Copy the fast-changing subset of fields out of a full pg_info_t.
  void populate_from(const pg_info_t& info) {
    last_update = info.last_update;
    last_complete = info.last_complete;
    last_user_version = info.last_user_version;
    stats.version = info.stats.version;
    stats.reported_seq = info.stats.reported_seq;
    stats.last_fresh = info.stats.last_fresh;
    stats.last_active = info.stats.last_active;
    stats.last_peered = info.stats.last_peered;
    stats.last_clean = info.stats.last_clean;
    stats.last_unstale = info.stats.last_unstale;
    stats.last_undegraded = info.stats.last_undegraded;
    stats.last_fullsized = info.stats.last_fullsized;
    stats.log_size = info.stats.log_size;
    stats.stats.sum.num_bytes = info.stats.stats.sum.num_bytes;
    stats.stats.sum.num_objects = info.stats.stats.sum.num_objects;
    stats.stats.sum.num_object_copies = info.stats.stats.sum.num_object_copies;
    stats.stats.sum.num_rd = info.stats.stats.sum.num_rd;
    stats.stats.sum.num_rd_kb = info.stats.stats.sum.num_rd_kb;
    stats.stats.sum.num_wr = info.stats.stats.sum.num_wr;
    stats.stats.sum.num_wr_kb = info.stats.stats.sum.num_wr_kb;
    stats.stats.sum.num_objects_dirty = info.stats.stats.sum.num_objects_dirty;
  }

  /// Apply these fields onto @p info, but only if our last_update is
  /// strictly newer than the one already there.
  /// @return false (and leave *info untouched) if info is already as new.
  bool try_apply_to(pg_info_t* info) {
    if (last_update <= info->last_update)
      return false;
    info->last_update = last_update;
    info->last_complete = last_complete;
    info->last_user_version = last_user_version;
    info->stats.version = stats.version;
    info->stats.reported_seq = stats.reported_seq;
    info->stats.last_fresh = stats.last_fresh;
    info->stats.last_active = stats.last_active;
    info->stats.last_peered = stats.last_peered;
    info->stats.last_clean = stats.last_clean;
    info->stats.last_unstale = stats.last_unstale;
    info->stats.last_undegraded = stats.last_undegraded;
    info->stats.last_fullsized = stats.last_fullsized;
    info->stats.log_size = stats.log_size;
    // log_size also feeds ondisk_log_size (documented above as equal)
    info->stats.ondisk_log_size = stats.log_size;
    info->stats.stats.sum.num_bytes = stats.stats.sum.num_bytes;
    info->stats.stats.sum.num_objects = stats.stats.sum.num_objects;
    info->stats.stats.sum.num_object_copies = stats.stats.sum.num_object_copies;
    info->stats.stats.sum.num_rd = stats.stats.sum.num_rd;
    info->stats.stats.sum.num_rd_kb = stats.stats.sum.num_rd_kb;
    info->stats.stats.sum.num_wr = stats.stats.sum.num_wr;
    info->stats.stats.sum.num_wr_kb = stats.stats.sum.num_wr_kb;
    info->stats.stats.sum.num_objects_dirty = stats.stats.sum.num_objects_dirty;
    return true;
  }

  // NOTE: encode/decode field order must stay in lock-step; adding a field
  // here requires an incompat feature bit (see struct comment above).
  void encode(ceph::buffer::list& bl) const {
    ENCODE_START(1, 1, bl);
    encode(last_update, bl);
    encode(last_complete, bl);
    encode(last_user_version, bl);
    encode(stats.version, bl);
    encode(stats.reported_seq, bl);
    encode(stats.last_fresh, bl);
    encode(stats.last_active, bl);
    encode(stats.last_peered, bl);
    encode(stats.last_clean, bl);
    encode(stats.last_unstale, bl);
    encode(stats.last_undegraded, bl);
    encode(stats.last_fullsized, bl);
    encode(stats.log_size, bl);
    encode(stats.stats.sum.num_bytes, bl);
    encode(stats.stats.sum.num_objects, bl);
    encode(stats.stats.sum.num_object_copies, bl);
    encode(stats.stats.sum.num_rd, bl);
    encode(stats.stats.sum.num_rd_kb, bl);
    encode(stats.stats.sum.num_wr, bl);
    encode(stats.stats.sum.num_wr_kb, bl);
    encode(stats.stats.sum.num_objects_dirty, bl);
    ENCODE_FINISH(bl);
  }
  void decode(ceph::buffer::list::const_iterator& p) {
    DECODE_START(1, p);
    decode(last_update, p);
    decode(last_complete, p);
    decode(last_user_version, p);
    decode(stats.version, p);
    decode(stats.reported_seq, p);
    decode(stats.last_fresh, p);
    decode(stats.last_active, p);
    decode(stats.last_peered, p);
    decode(stats.last_clean, p);
    decode(stats.last_unstale, p);
    decode(stats.last_undegraded, p);
    decode(stats.last_fullsized, p);
    decode(stats.log_size, p);
    decode(stats.stats.sum.num_bytes, p);
    decode(stats.stats.sum.num_objects, p);
    decode(stats.stats.sum.num_object_copies, p);
    decode(stats.stats.sum.num_rd, p);
    decode(stats.stats.sum.num_rd_kb, p);
    decode(stats.stats.sum.num_wr, p);
    decode(stats.stats.sum.num_wr_kb, p);
    decode(stats.stats.sum.num_objects_dirty, p);
    DECODE_FINISH(p);
  }
};
3044 	WRITE_CLASS_ENCODER(pg_fast_info_t)
3045 	
3046 	
3047 	class OSDMap;
3048 	/**
3049 	 * PastIntervals -- information needed to determine the PriorSet and
3050 	 * the might_have_unfound set
3051 	 */
3052 	class PastIntervals {
3053 	#ifdef WITH_SEASTAR
3054 	  using OSDMapRef = boost::local_shared_ptr<const OSDMap>;
3055 	#else
3056 	  using OSDMapRef = std::shared_ptr<const OSDMap>;
3057 	#endif
3058 	public:
3059 	  struct pg_interval_t {
3060 	    std::vector<int32_t> up, acting;
3061 	    epoch_t first, last;
3062 	    bool maybe_went_rw;
3063 	    int32_t primary;
3064 	    int32_t up_primary;
3065 	
3066 	    pg_interval_t()
3067 	      : first(0), last(0),
3068 		maybe_went_rw(false),
3069 		primary(-1),
3070 		up_primary(-1)
3071 	      {}
3072 	
3073 	    pg_interval_t(
3074 	      std::vector<int32_t> &&up,
3075 	      std::vector<int32_t> &&acting,
3076 	      epoch_t first,
3077 	      epoch_t last,
3078 	      bool maybe_went_rw,
3079 	      int32_t primary,
3080 	      int32_t up_primary)
3081 	      : up(up), acting(acting), first(first), last(last),
3082 		maybe_went_rw(maybe_went_rw), primary(primary), up_primary(up_primary)
3083 	      {}
3084 	
3085 	    void encode(ceph::buffer::list& bl) const;
3086 	    void decode(ceph::buffer::list::const_iterator& bl);
3087 	    void dump(ceph::Formatter *f) const;
3088 	    static void generate_test_instances(std::list<pg_interval_t*>& o);
3089 	  };
3090 	
3091 	  PastIntervals();
3092 	  PastIntervals(PastIntervals &&rhs) = default;
3093 	  PastIntervals &operator=(PastIntervals &&rhs) = default;
3094 	
3095 	  PastIntervals(const PastIntervals &rhs);
3096 	  PastIntervals &operator=(const PastIntervals &rhs);
3097 	
  /// Abstract representation of the set of past intervals; concrete
  /// implementations (e.g. pi_compact_rep) choose how much detail to keep.
  class interval_rep {
  public:
    /// indication of the amount of state contained
    virtual size_t size() const = 0;
    virtual bool empty() const = 0;
    virtual void clear() = 0;
    /// [start, end) epoch range covered by the stored intervals
    virtual std::pair<epoch_t, epoch_t> get_bounds() const = 0;
    /// all shards that participated in any stored interval
    virtual std::set<pg_shard_t> get_all_participants(
      bool ec_pool) const = 0;
    virtual void add_interval(bool ec_pool, const pg_interval_t &interval) = 0;
    /// deep copy of this representation
    virtual std::unique_ptr<interval_rep> clone() const = 0;
    virtual std::ostream &print(std::ostream &out) const = 0;
    virtual void encode(ceph::buffer::list &bl) const = 0;
    virtual void decode(ceph::buffer::list::const_iterator &bl) = 0;
    virtual void dump(ceph::Formatter *f) const = 0;
    /// invoke f on each maybe-rw interval back to epoch les
    virtual void iterate_mayberw_back_to(
      epoch_t les,
      std::function<void(epoch_t, const std::set<pg_shard_t> &)> &&f) const = 0;

    /// true if the implementation retains full pg_interval_t records
    virtual bool has_full_intervals() const { return false; }
    // only valid when has_full_intervals() returns true; the default
    // implementation aborts
    virtual void iterate_all_intervals(
      std::function<void(const pg_interval_t &)> &&f) const {
      ceph_assert(!has_full_intervals());
      ceph_abort_msg("not valid for this implementation");
    }
    virtual void adjust_start_backwards(epoch_t last_epoch_clean) = 0;

    virtual ~interval_rep() {}
  };
  friend class pi_compact_rep;
private:

  /// pimpl: the concrete interval representation (may be null before decode)
  std::unique_ptr<interval_rep> past_intervals;

  /// takes ownership of rep
  explicit PastIntervals(interval_rep *rep) : past_intervals(rep) {}

public:
  void add_interval(bool ec_pool, const pg_interval_t &interval) {
    ceph_assert(past_intervals);
    return past_intervals->add_interval(ec_pool, interval);
  }

  void encode(ceph::buffer::list &bl) const {
    ENCODE_START(1, 1, bl);
    if (past_intervals) {
      // type tag 2 — NOTE(review): presumably identifies the compact
      // representation (pi_compact_rep); confirm against decode()
      __u8 type = 2;
      encode(type, bl);
      past_intervals->encode(bl);
    } else {
      // type tag 0: no representation present
      encode((__u8)0, bl);
    }
    ENCODE_FINISH(bl);
  }

  void decode(ceph::buffer::list::const_iterator &bl);

  void dump(ceph::Formatter *f) const {
    ceph_assert(past_intervals);
    past_intervals->dump(f);
  }
3157 	  static void generate_test_instances(std::list<PastIntervals *> & o);
3158 	
3159 	  /**
3160 	   * Determines whether there is an interval change
3161 	   */
3162 	  static bool is_new_interval(
3163 	    int old_acting_primary,
3164 	    int new_acting_primary,
3165 	    const std::vector<int> &old_acting,
3166 	    const std::vector<int> &new_acting,
3167 	    int old_up_primary,
3168 	    int new_up_primary,
3169 	    const std::vector<int> &old_up,
3170 	    const std::vector<int> &new_up,
3171 	    int old_size,
3172 	    int new_size,
3173 	    int old_min_size,
3174 	    int new_min_size,
3175 	    unsigned old_pg_num,
3176 	    unsigned new_pg_num,
3177 	    unsigned old_pg_num_pending,
3178 	    unsigned new_pg_num_pending,
3179 	    bool old_sort_bitwise,
3180 	    bool new_sort_bitwise,
3181 	    bool old_recovery_deletes,
3182 	    bool new_recovery_deletes,
3183 	    pg_t pgid
3184 	    );
3185 	
3186 	  /**
3187 	   * Determines whether there is an interval change
3188 	   */
3189 	  static bool is_new_interval(
3190 	    int old_acting_primary,                     ///< [in] primary as of lastmap
3191 	    int new_acting_primary,                     ///< [in] primary as of lastmap
3192 	    const std::vector<int> &old_acting,              ///< [in] acting as of lastmap
3193 	    const std::vector<int> &new_acting,              ///< [in] acting as of osdmap
3194 	    int old_up_primary,                         ///< [in] up primary of lastmap
3195 	    int new_up_primary,                         ///< [in] up primary of osdmap
3196 	    const std::vector<int> &old_up,                  ///< [in] up as of lastmap
3197 	    const std::vector<int> &new_up,                  ///< [in] up as of osdmap
3198 	    const OSDMap *osdmap,  ///< [in] current map
3199 	    const OSDMap *lastmap, ///< [in] last map
3200 	    pg_t pgid                                   ///< [in] pgid for pg
3201 	    );
3202 	
3203 	  /**
3204 	   * Integrates a new map into *past_intervals, returns true
3205 	   * if an interval was closed out.
3206 	   */
3207 	  static bool check_new_interval(
3208 	    int old_acting_primary,                     ///< [in] primary as of lastmap
3209 	    int new_acting_primary,                     ///< [in] primary as of osdmap
3210 	    const std::vector<int> &old_acting,              ///< [in] acting as of lastmap
3211 	    const std::vector<int> &new_acting,              ///< [in] acting as of osdmap
3212 	    int old_up_primary,                         ///< [in] up primary of lastmap
3213 	    int new_up_primary,                         ///< [in] up primary of osdmap
3214 	    const std::vector<int> &old_up,                  ///< [in] up as of lastmap
3215 	    const std::vector<int> &new_up,                  ///< [in] up as of osdmap
3216 	    epoch_t same_interval_since,                ///< [in] as of osdmap
3217 	    epoch_t last_epoch_clean,                   ///< [in] current
3218 	    const OSDMap *osdmap,      ///< [in] current map
3219 	    const OSDMap *lastmap,     ///< [in] last map
3220 	    pg_t pgid,                                  ///< [in] pgid for pg
3221 	    const IsPGRecoverablePredicate &could_have_gone_active, ///< [in] predicate whether the pg can be active
3222 	    PastIntervals *past_intervals,              ///< [out] intervals
3223 	    std::ostream *out = 0                            ///< [out] debug ostream
3224 	    );
  /// Convenience overload of check_new_interval() taking OSDMapRef smart
  /// pointers; unwraps them and forwards to the raw-pointer variant above.
  static bool check_new_interval(
    int old_acting_primary,                     ///< [in] primary as of lastmap
    int new_acting_primary,                     ///< [in] primary as of osdmap
    const std::vector<int> &old_acting,              ///< [in] acting as of lastmap
    const std::vector<int> &new_acting,              ///< [in] acting as of osdmap
    int old_up_primary,                         ///< [in] up primary of lastmap
    int new_up_primary,                         ///< [in] up primary of osdmap
    const std::vector<int> &old_up,                  ///< [in] up as of lastmap
    const std::vector<int> &new_up,                  ///< [in] up as of osdmap
    epoch_t same_interval_since,                ///< [in] as of osdmap
    epoch_t last_epoch_clean,                   ///< [in] current
    OSDMapRef osdmap,      ///< [in] current map
    OSDMapRef lastmap,     ///< [in] last map
    pg_t pgid,                                  ///< [in] pgid for pg
    const IsPGRecoverablePredicate &could_have_gone_active, ///< [in] predicate whether the pg can be active
    PastIntervals *past_intervals,              ///< [out] intervals
    std::ostream *out = 0                            ///< [out] debug ostream
    ) {
    return check_new_interval(
      old_acting_primary, new_acting_primary,
      old_acting, new_acting,
      old_up_primary, new_up_primary,
      old_up, new_up,
      same_interval_since, last_epoch_clean,
      osdmap.get(), lastmap.get(),
      pgid,
      could_have_gone_active,
      past_intervals,
      out);
  }
3255 	
3256 	  friend std::ostream& operator<<(std::ostream& out, const PastIntervals &i);
3257 	
  /// Invoke f on each maybe-rw interval back to epoch les; forwards to the
  /// underlying representation.
  template <typename F>
  void iterate_mayberw_back_to(
    epoch_t les,
    F &&f) const {
    ceph_assert(past_intervals);
    past_intervals->iterate_mayberw_back_to(les, std::forward<F>(f));
  }
  void clear() {
    ceph_assert(past_intervals);
    past_intervals->clear();
  }

  /**
   * Should return a value which gives an indication of the amount
   * of state contained
   */
  size_t size() const {
    ceph_assert(past_intervals);
    return past_intervals->size();
  }

  bool empty() const {
    ceph_assert(past_intervals);
    return past_intervals->empty();
  }

  /// exchange representations with another PastIntervals
  void swap(PastIntervals &other) {
    using std::swap;
    swap(other.past_intervals, past_intervals);
  }

  /**
   * Return all shards which have been in the acting set back to the
   * latest epoch to which we have trimmed except for pg_whoami
   */
  std::set<pg_shard_t> get_might_have_unfound(
    pg_shard_t pg_whoami,
    bool ec_pool) const {
    ceph_assert(past_intervals);
    auto ret = past_intervals->get_all_participants(ec_pool);
    ret.erase(pg_whoami);  // exclude ourselves
    return ret;
  }

  /**
   * Return all shards which we might want to talk to for peering
   */
  std::set<pg_shard_t> get_all_probe(
    bool ec_pool) const {
    ceph_assert(past_intervals);
    return past_intervals->get_all_participants(ec_pool);
  }

  /* Return the set of epochs [start, end) represented by the
   * past_interval set.
   */
  std::pair<epoch_t, epoch_t> get_bounds() const {
    ceph_assert(past_intervals);
    return past_intervals->get_bounds();
  }

  /// move the start of the represented range back to last_epoch_clean
  void adjust_start_backwards(epoch_t last_epoch_clean) {
    ceph_assert(past_intervals);
    past_intervals->adjust_start_backwards(last_epoch_clean);
  }
3323 	
  /// liveness classification of an osd while building a PriorSet
  enum osd_state_t {
    UP,
    DOWN,
    DNE,   // NOTE(review): presumably "does not exist" — confirm at use sites
    LOST
  };
  /// Set of OSDs that must be probed (or are blocking) before peering can
  /// proceed; computed from the past intervals.
  struct PriorSet {
    bool ec_pool = false;
    std::set<pg_shard_t> probe; ///< current+prior OSDs we need to probe.
    std::set<int> down;  ///< down osds that would normally be in @a probe and might be interesting.
    std::map<int, epoch_t> blocked_by;  ///< current lost_at values for any OSDs in cur set for which (re)marking them lost would affect cur set

    bool pg_down = false;   ///< some down osds are included in @a cur; the DOWN pg state bit should be set.
    const IsPGRecoverablePredicate* pcontdec = nullptr;

    PriorSet() = default;
    PriorSet(PriorSet &&) = default;
    PriorSet &operator=(PriorSet &&) = default;

    // copying is disallowed; PriorSet is move-only
    PriorSet &operator=(const PriorSet &) = delete;
    PriorSet(const PriorSet &) = delete;

    // NOTE: pcontdec is deliberately not compared
    bool operator==(const PriorSet &rhs) const {
      return (ec_pool == rhs.ec_pool) &&
	(probe == rhs.probe) &&
	(down == rhs.down) &&
	(blocked_by == rhs.blocked_by) &&
	(pg_down == rhs.pg_down);
    }

    /// true if the given map could change this prior set
    bool affected_by_map(
      const OSDMap &osdmap,
      const DoutPrefixProvider *dpp) const;

    // For verifying tests
    PriorSet(
      bool ec_pool,
      std::set<pg_shard_t> probe,
      std::set<int> down,
      std::map<int, epoch_t> blocked_by,
      bool pg_down,
      const IsPGRecoverablePredicate *pcontdec)
      : ec_pool(ec_pool), probe(probe), down(down), blocked_by(blocked_by),
	pg_down(pg_down), pcontdec(pcontdec) {}

  private:
    /// real constructor, used by PastIntervals::get_prior_set()
    template <typename F>
    PriorSet(
      const PastIntervals &past_intervals,
      bool ec_pool,
      epoch_t last_epoch_started,
      const IsPGRecoverablePredicate *c,
      F f,
      const std::vector<int> &up,
      const std::vector<int> &acting,
      const DoutPrefixProvider *dpp);

    friend class PastIntervals;
  };
3383 	
  /// Construct a PriorSet from this PastIntervals; all arguments are
  /// perfect-forwarded to the private PriorSet constructor above.
  template <typename... Args>
  PriorSet get_prior_set(Args&&... args) const {
    return PriorSet(*this, std::forward<Args>(args)...);
  }
3388 	};
3389 	WRITE_CLASS_ENCODER(PastIntervals)
3390 	
3391 	std::ostream& operator<<(std::ostream& out, const PastIntervals::pg_interval_t& i);
3392 	std::ostream& operator<<(std::ostream& out, const PastIntervals &i);
3393 	std::ostream& operator<<(std::ostream& out, const PastIntervals::PriorSet &i);
3394 	
/**
 * Build the prior set used for peering.
 *
 * @param past_intervals  past interval history to mine for participants
 * @param ec_pool         true for erasure-coded pools (shard ids matter)
 * @param last_epoch_started  oldest epoch we must reason back to
 * @param c    predicate deciding whether a set of up shards is recoverable
 * @param f    callback mapping (epoch, osd, &lost_at) to an osd_state_t
 * @param up   current up set
 * @param acting  current acting set
 * @param dpp  debug-log prefix provider
 */
template <typename F>
PastIntervals::PriorSet::PriorSet(
  const PastIntervals &past_intervals,
  bool ec_pool,
  epoch_t last_epoch_started,
  const IsPGRecoverablePredicate *c,
  F f,
  const std::vector<int> &up,
  const std::vector<int> &acting,
  const DoutPrefixProvider *dpp)
  : ec_pool(ec_pool), pg_down(false), pcontdec(c)
{
  /*
   * We have to be careful to gracefully deal with situations like
   * so. Say we have a power outage or something that takes out both
   * OSDs, but the monitor doesn't mark them down in the same epoch.
   * The history may look like
   *
   *  1: A B
   *  2:   B
   *  3:       let's say B dies for good, too (say, from the power spike)
   *  4: A
   *
   * which makes it look like B may have applied updates to the PG
   * that we need in order to proceed.  This sucks...
   *
   * To minimize the risk of this happening, we CANNOT go active if
   * _any_ OSDs in the prior set are down until we send an MOSDAlive
   * to the monitor such that the OSDMap sets osd_up_thru to an epoch.
   * Then, we have something like
   *
   *  1: A B
   *  2:   B   up_thru[B]=0
   *  3:
   *  4: A
   *
   * -> we can ignore B, bc it couldn't have gone active (alive_thru
   *    still 0).
   *
   * or,
   *
   *  1: A B
   *  2:   B   up_thru[B]=0
   *  3:   B   up_thru[B]=2
   *  4:
   *  5: A
   *
   * -> we must wait for B, bc it was alive through 2, and could have
   *    written to the pg.
   *
   * If B is really dead, then an administrator will need to manually
   * intervene by marking the OSD as "lost."
   */

  // Include current acting and up nodes... not because they may
  // contain old data (this interval hasn't gone active, obviously),
  // but because we want their pg_info to inform choose_acting(), and
  // so that we know what they do/do not have explicitly before
  // sending them any new info/logs/whatever.
  for (unsigned i = 0; i < acting.size(); i++) {
    if (acting[i] != 0x7fffffff /* CRUSH_ITEM_NONE, can't import crush.h here */)
      probe.insert(pg_shard_t(acting[i], ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD));
  }
  // It may be possible to exclude the up nodes, but let's keep them in
  // there for now.
  for (unsigned i = 0; i < up.size(); i++) {
    if (up[i] != 0x7fffffff /* CRUSH_ITEM_NONE, can't import crush.h here */)
      probe.insert(pg_shard_t(up[i], ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD));
  }

  // classify every past participant: up ones get probed, the rest are
  // remembered as down (and possibly interesting)
  std::set<pg_shard_t> all_probe = past_intervals.get_all_probe(ec_pool);
  ldpp_dout(dpp, 10) << "build_prior all_probe " << all_probe << dendl;
  for (auto &&i: all_probe) {
    switch (f(0, i.osd, nullptr)) {
    case UP: {
      probe.insert(i);
      break;
    }
    case DNE:
    case LOST:
    case DOWN: {
      down.insert(i.osd);
      break;
    }
    }
  }

  // walk the maybe-went-rw intervals back to last_epoch_started and
  // check each one for survivability without its down members
  past_intervals.iterate_mayberw_back_to(
    last_epoch_started,
    [&](epoch_t start, const std::set<pg_shard_t> &acting) {
      ldpp_dout(dpp, 10) << "build_prior maybe_rw interval:" << start
			 << ", acting: " << acting << dendl;

      // look at candidate osds during this interval.  each falls into
      // one of three categories: up, down (but potentially
      // interesting), or lost (down, but we won't wait for it).
      std::set<pg_shard_t> up_now;
      std::map<int, epoch_t> candidate_blocked_by;
      // any candidates down now (that might have useful data)
      bool any_down_now = false;

      // consider ACTING osds
      for (auto &&so: acting) {
	epoch_t lost_at = 0;
	switch (f(start, so.osd, &lost_at)) {
	case UP: {
	  // include past acting osds if they are up.
	  up_now.insert(so);
	  break;
	}
	case DNE: {
	  ldpp_dout(dpp, 10) << "build_prior  prior osd." << so.osd
			     << " no longer exists" << dendl;
	  break;
	}
	case LOST: {
	  // marked lost: treat as if it were up so it cannot block us
	  ldpp_dout(dpp, 10) << "build_prior  prior osd." << so.osd
			     << " is down, but lost_at " << lost_at << dendl;
	  up_now.insert(so);
	  break;
	}
	case DOWN: {
	  ldpp_dout(dpp, 10) << "build_prior  prior osd." << so.osd
			     << " is down" << dendl;
	  candidate_blocked_by[so.osd] = lost_at;
	  any_down_now = true;
	  break;
	}
	}
      }

      // if not enough osds survived this interval, and we may have gone rw,
      // then we need to wait for one of those osds to recover to
      // ensure that we haven't lost any information.
      if (!(*pcontdec)(up_now) && any_down_now) {
	// fixme: how do we identify a "clean" shutdown anyway?
	ldpp_dout(dpp, 10) << "build_prior  possibly went active+rw,"
			   << " insufficient up; including down osds" << dendl;
	ceph_assert(!candidate_blocked_by.empty());
	pg_down = true;
	blocked_by.insert(
	  candidate_blocked_by.begin(),
	  candidate_blocked_by.end());
      }
    });

  ldpp_dout(dpp, 10) << "build_prior final: probe " << probe
	   << " down " << down
	   << " blocked_by " << blocked_by
	   << (pg_down ? " pg_down":"")
	   << dendl;
}
3547 	
3548 	struct pg_notify_t {
3549 	  epoch_t query_epoch;
3550 	  epoch_t epoch_sent;
3551 	  pg_info_t info;
3552 	  shard_id_t to;
3553 	  shard_id_t from;
3554 	  PastIntervals past_intervals;
3555 	  pg_notify_t() :
3556 	    query_epoch(0), epoch_sent(0), to(shard_id_t::NO_SHARD),
3557 	    from(shard_id_t::NO_SHARD) {}
3558 	  pg_notify_t(
3559 	    shard_id_t to,
3560 	    shard_id_t from,
3561 	    epoch_t query_epoch,
3562 	    epoch_t epoch_sent,
3563 	    const pg_info_t &info,
3564 	    const PastIntervals& pi)
3565 	    : query_epoch(query_epoch),
3566 	      epoch_sent(epoch_sent),
3567 	      info(info), to(to), from(from),
3568 	      past_intervals(pi) {
3569 	    ceph_assert(from == info.pgid.shard);
3570 	  }
3571 	  void encode(ceph::buffer::list &bl) const;
3572 	  void decode(ceph::buffer::list::const_iterator &p);
3573 	  void dump(ceph::Formatter *f) const;
3574 	  static void generate_test_instances(std::list<pg_notify_t*> &o);
3575 	};
3576 	WRITE_CLASS_ENCODER(pg_notify_t)
3577 	std::ostream &operator<<(std::ostream &lhs, const pg_notify_t &notify);
3578 	
3579 	
3580 	/** 
3581 	 * pg_query_t - used to ask a peer for information about a pg.
3582 	 *
3583 	 * note: if version=0, type=LOG, then we just provide our full log.
3584 	 */
3585 	struct pg_query_t {
3586 	  enum {
3587 	    INFO = 0,
3588 	    LOG = 1,
3589 	    MISSING = 4,
3590 	    FULLLOG = 5,
3591 	  };
3592 	  std::string_view get_type_name() const {
3593 	    switch (type) {
3594 	    case INFO: return "info";
3595 	    case LOG: return "log";
3596 	    case MISSING: return "missing";
3597 	    case FULLLOG: return "fulllog";
3598 	    default: return "???";
3599 	    }
3600 	  }
3601 	
3602 	  __s32 type;
3603 	  eversion_t since;
3604 	  pg_history_t history;
3605 	  epoch_t epoch_sent;
3606 	  shard_id_t to;
3607 	  shard_id_t from;
3608 	
3609 	  pg_query_t() : type(-1), epoch_sent(0), to(shard_id_t::NO_SHARD),
3610 			 from(shard_id_t::NO_SHARD) {}
3611 	  pg_query_t(
3612 	    int t,
3613 	    shard_id_t to,
3614 	    shard_id_t from,
3615 	    const pg_history_t& h,
3616 	    epoch_t epoch_sent)
3617 	    : type(t),
3618 	      history(h),
3619 	      epoch_sent(epoch_sent),
3620 	      to(to), from(from) {
3621 	    ceph_assert(t != LOG);
3622 	  }
3623 	  pg_query_t(
3624 	    int t,
3625 	    shard_id_t to,
3626 	    shard_id_t from,
3627 	    eversion_t s,
3628 	    const pg_history_t& h,
3629 	    epoch_t epoch_sent)
3630 	    : type(t), since(s), history(h),
3631 	      epoch_sent(epoch_sent), to(to), from(from) {
3632 	    ceph_assert(t == LOG);
3633 	  }
3634 	  
3635 	  void encode(ceph::buffer::list &bl, uint64_t features) const;
3636 	  void decode(ceph::buffer::list::const_iterator &bl);
3637 	
3638 	  void dump(ceph::Formatter *f) const;
3639 	  static void generate_test_instances(std::list<pg_query_t*>& o);
3640 	};
3641 	WRITE_CLASS_ENCODER_FEATURES(pg_query_t)
3642 	
3643 	inline std::ostream& operator<<(std::ostream& out, const pg_query_t& q) {
3644 	  out << "query(" << q.get_type_name() << " " << q.since;
3645 	  if (q.type == pg_query_t::LOG)
3646 	    out << " " << q.history;
3647 	  out << " epoch_sent " << q.epoch_sent;
3648 	  out << ")";
3649 	  return out;
3650 	}
3651 	
3652 	/**
3653 	 * pg_lease_t - readable lease metadata, from primary -> non-primary
3654 	 *
3655 	 * This metadata serves to increase either or both of the lease expiration
3656 	 * and upper bound on the non-primary.
3657 	 */
3658 	struct pg_lease_t {
3659 	  /// pg readable_until value; replicas must not be readable beyond this
3660 	  ceph::signedspan readable_until = ceph::signedspan::zero();
3661 	
3662 	  /// upper bound on any acting osd's readable_until
3663 	  ceph::signedspan readable_until_ub = ceph::signedspan::zero();
3664 	
3665 	  /// duration of the lease (in case clock deltas aren't available)
3666 	  ceph::signedspan interval = ceph::signedspan::zero();
3667 	
3668 	  pg_lease_t() {}
3669 	  pg_lease_t(ceph::signedspan ru, ceph::signedspan ruub,
3670 		     ceph::signedspan i)
3671 	    : readable_until(ru),
3672 	      readable_until_ub(ruub),
3673 	      interval(i) {}
3674 	
3675 	  void encode(ceph::buffer::list &bl) const;
3676 	  void decode(ceph::buffer::list::const_iterator &bl);
3677 	  void dump(ceph::Formatter *f) const;
3678 	  static void generate_test_instances(std::list<pg_lease_t*>& o);
3679 	
3680 	  friend ostream& operator<<(ostream& out, const pg_lease_t& l) {
3681 	    return out << "pg_lease(ru " << l.readable_until
3682 		       << " ub " << l.readable_until_ub
3683 		       << " int " << l.interval << ")";
3684 	  }
3685 	};
3686 	WRITE_CLASS_ENCODER(pg_lease_t)
3687 	
3688 	/**
3689 	 * pg_lease_ack_t - lease ack, from non-primary -> primary
3690 	 *
3691 	 * This metadata acknowledges to the primary what a non-primary's noted
3692 	 * upper bound is.
3693 	 */
3694 	struct pg_lease_ack_t {
3695 	  /// highest upper bound non-primary has recorded (primary's clock)
3696 	  ceph::signedspan readable_until_ub = ceph::signedspan::zero();
3697 	
3698 	  pg_lease_ack_t() {}
3699 	  pg_lease_ack_t(ceph::signedspan ub)
3700 	    : readable_until_ub(ub) {}
3701 	
3702 	  void encode(ceph::buffer::list &bl) const;
3703 	  void decode(ceph::buffer::list::const_iterator &bl);
3704 	  void dump(ceph::Formatter *f) const;
3705 	  static void generate_test_instances(std::list<pg_lease_ack_t*>& o);
3706 	
3707 	  friend ostream& operator<<(ostream& out, const pg_lease_ack_t& l) {
3708 	    return out << "pg_lease_ack(ruub " << l.readable_until_ub << ")";
3709 	  }
3710 	};
3711 	WRITE_CLASS_ENCODER(pg_lease_ack_t)
3712 	
3713 	
3714 	
3715 	class PGBackend;
3716 	class ObjectModDesc {
3717 	  bool can_local_rollback;
3718 	  bool rollback_info_completed;
3719 	
3720 	  // version required to decode, reflected in encode/decode version
3721 	  __u8 max_required_version = 1;
3722 	public:
3723 	  class Visitor {
3724 	  public:
3725 	    virtual void append(uint64_t old_offset) {}
3726 	    virtual void setattrs(std::map<std::string, std::optional<ceph::buffer::list>> &attrs) {}
3727 	    virtual void rmobject(version_t old_version) {}
3728 	    /**
3729 	     * Used to support the unfound_lost_delete log event: if the stashed
3730 	     * version exists, we unstash it, otherwise, we do nothing.  This way
3731 	     * each replica rolls back to whatever state it had prior to the attempt
3732 	     * at mark unfound lost delete
3733 	     */
3734 	    virtual void try_rmobject(version_t old_version) {
3735 	      rmobject(old_version);
3736 	    }
3737 	    virtual void create() {}
3738 	    virtual void update_snaps(const std::set<snapid_t> &old_snaps) {}
3739 	    virtual void rollback_extents(
3740 	      version_t gen,
3741 	      const std::vector<std::pair<uint64_t, uint64_t> > &extents) {}
3742 	    virtual ~Visitor() {}
3743 	  };
3744 	  void visit(Visitor *visitor) const;
3745 	  mutable ceph::buffer::list bl;
3746 	  enum ModID {
3747 	    APPEND = 1,
3748 	    SETATTRS = 2,
3749 	    DELETE = 3,
3750 	    CREATE = 4,
3751 	    UPDATE_SNAPS = 5,
3752 	    TRY_DELETE = 6,
3753 	    ROLLBACK_EXTENTS = 7
3754 	  };
3755 	  ObjectModDesc() : can_local_rollback(true), rollback_info_completed(false) {
3756 	    bl.reassign_to_mempool(mempool::mempool_osd_pglog);
3757 	  }
3758 	  void claim(ObjectModDesc &other) {
3759 	    bl.clear();
3760 	    bl.claim(other.bl);
3761 	    can_local_rollback = other.can_local_rollback;
3762 	    rollback_info_completed = other.rollback_info_completed;
3763 	  }
3764 	  void claim_append(ObjectModDesc &other) {
3765 	    if (!can_local_rollback || rollback_info_completed)
3766 	      return;
3767 	    if (!other.can_local_rollback) {
3768 	      mark_unrollbackable();
3769 	      return;
3770 	    }
3771 	    bl.claim_append(other.bl);
3772 	    rollback_info_completed = other.rollback_info_completed;
3773 	  }
3774 	  void swap(ObjectModDesc &other) {
3775 	    bl.swap(other.bl);
3776 	
3777 	    using std::swap;
3778 	    swap(other.can_local_rollback, can_local_rollback);
3779 	    swap(other.rollback_info_completed, rollback_info_completed);
3780 	    swap(other.max_required_version, max_required_version);
3781 	  }
3782 	  void append_id(ModID id) {
3783 	    using ceph::encode;
3784 	    uint8_t _id(id);
3785 	    encode(_id, bl);
3786 	  }
3787 	  void append(uint64_t old_size) {
3788 	    if (!can_local_rollback || rollback_info_completed)
3789 	      return;
3790 	    ENCODE_START(1, 1, bl);
3791 	    append_id(APPEND);
3792 	    encode(old_size, bl);
3793 	    ENCODE_FINISH(bl);
3794 	  }
3795 	  void setattrs(std::map<std::string, std::optional<ceph::buffer::list>> &old_attrs) {
3796 	    if (!can_local_rollback || rollback_info_completed)
3797 	      return;
3798 	    ENCODE_START(1, 1, bl);
3799 	    append_id(SETATTRS);
3800 	    encode(old_attrs, bl);
3801 	    ENCODE_FINISH(bl);
3802 	  }
3803 	  bool rmobject(version_t deletion_version) {
3804 	    if (!can_local_rollback || rollback_info_completed)
3805 	      return false;
3806 	    ENCODE_START(1, 1, bl);
3807 	    append_id(DELETE);
3808 	    encode(deletion_version, bl);
3809 	    ENCODE_FINISH(bl);
3810 	    rollback_info_completed = true;
3811 	    return true;
3812 	  }
3813 	  bool try_rmobject(version_t deletion_version) {
3814 	    if (!can_local_rollback || rollback_info_completed)
3815 	      return false;
3816 	    ENCODE_START(1, 1, bl);
3817 	    append_id(TRY_DELETE);
3818 	    encode(deletion_version, bl);
3819 	    ENCODE_FINISH(bl);
3820 	    rollback_info_completed = true;
3821 	    return true;
3822 	  }
3823 	  void create() {
3824 	    if (!can_local_rollback || rollback_info_completed)
3825 	      return;
3826 	    rollback_info_completed = true;
3827 	    ENCODE_START(1, 1, bl);
3828 	    append_id(CREATE);
3829 	    ENCODE_FINISH(bl);
3830 	  }
3831 	  void update_snaps(const std::set<snapid_t> &old_snaps) {
3832 	    if (!can_local_rollback || rollback_info_completed)
3833 	      return;
3834 	    ENCODE_START(1, 1, bl);
3835 	    append_id(UPDATE_SNAPS);
3836 	    encode(old_snaps, bl);
3837 	    ENCODE_FINISH(bl);
3838 	  }
3839 	  void rollback_extents(
3840 	    version_t gen, const std::vector<std::pair<uint64_t, uint64_t> > &extents) {
3841 	    ceph_assert(can_local_rollback);
3842 	    ceph_assert(!rollback_info_completed);
3843 	    if (max_required_version < 2)
3844 	      max_required_version = 2;
3845 	    ENCODE_START(2, 2, bl);
3846 	    append_id(ROLLBACK_EXTENTS);
3847 	    encode(gen, bl);
3848 	    encode(extents, bl);
3849 	    ENCODE_FINISH(bl);
3850 	  }
3851 	
3852 	  // cannot be rolled back
3853 	  void mark_unrollbackable() {
3854 	    can_local_rollback = false;
3855 	    bl.clear();
3856 	  }
3857 	  bool can_rollback() const {
3858 	    return can_local_rollback;
3859 	  }
3860 	  bool empty() const {
3861 	    return can_local_rollback && (bl.length() == 0);
3862 	  }
3863 	
3864 	  bool requires_kraken() const {
3865 	    return max_required_version >= 2;
3866 	  }
3867 	
3868 	  /**
3869 	   * Create fresh copy of bl bytes to avoid keeping large buffers around
3870 	   * in the case that bl contains ptrs which point into a much larger
3871 	   * message buffer
3872 	   */
3873 	  void trim_bl() const {
3874 	    if (bl.length() > 0)
3875 	      bl.rebuild();
3876 	  }
3877 	  void encode(ceph::buffer::list &bl) const;
3878 	  void decode(ceph::buffer::list::const_iterator &bl);
3879 	  void dump(ceph::Formatter *f) const;
3880 	  static void generate_test_instances(std::list<ObjectModDesc*>& o);
3881 	};
3882 	WRITE_CLASS_ENCODER(ObjectModDesc)
3883 	
/**
 * ObjectCleanRegions tracks which parts of an object are known clean:
 * a set of clean byte ranges, whether the omap is clean, and whether
 * the object is newly created.
 */
class ObjectCleanRegions {
private:
  bool new_object;                        // true if marked as a new object
  bool clean_omap;                        // true while the omap is clean
  interval_set<uint64_t> clean_offsets;   // byte ranges known to be clean
  // upper bound on interval count enforced by trim(); shared across instances
  static std::atomic<int32_t> max_num_intervals;

  /**
   * trim the number of intervals if clean_offsets.num_intervals()
   * exceeds the given upbound max_num_intervals
   * etc. max_num_intervals=2, clean_offsets:{[5~10], [20~5]}
   * then new interval [30~10] will evict out the shortest one [20~5]
   * finally, clean_offsets becomes {[5~10], [30~10]}
   */
  void trim();
  friend ostream& operator<<(ostream& out, const ObjectCleanRegions& ocr);
public:
  // default: everything clean — the entire offset range [0, 2^64) plus omap
  ObjectCleanRegions() : new_object(false), clean_omap(true) {
    clean_offsets.insert(0, (uint64_t)-1);
  }
  // only [offset, offset+len) is clean; omap cleanliness given by co
  ObjectCleanRegions(uint64_t offset, uint64_t len, bool co)
    : new_object(false), clean_omap(co) {
    clean_offsets.insert(offset, len);
  }
  bool operator==(const ObjectCleanRegions &orc) const {
    return new_object == orc.new_object && clean_omap == orc.clean_omap && clean_offsets == orc.clean_offsets;
  }
  static void set_max_num_intervals(int32_t num);
  void merge(const ObjectCleanRegions &other);
  void mark_data_region_dirty(uint64_t offset, uint64_t len);
  void mark_omap_dirty();
  void mark_object_new();
  void mark_fully_dirty();
  interval_set<uint64_t> get_dirty_regions() const;
  bool omap_is_dirty() const;
  bool object_is_exist() const;

  void encode(bufferlist &bl) const;
  void decode(bufferlist::const_iterator &bl);
  void dump(Formatter *f) const;
  static void generate_test_instances(list<ObjectCleanRegions*>& o);
};
3926 	WRITE_CLASS_ENCODER(ObjectCleanRegions)
3927 	ostream& operator<<(ostream& out, const ObjectCleanRegions& ocr);
3928 	
3929 	
/**
 * OSDOp - a single op within an OSD request: the raw wire-format op
 * descriptor plus its in/out data buffers and result code.
 */
struct OSDOp {
  ceph_osd_op op;   // raw C op descriptor; zeroed on construction
  sobject_t soid;

  ceph::buffer::list indata, outdata;
  errorcode32_t rval = 0;

  OSDOp() {
    // memset (rather than value-init) deliberately zeroes the whole raw
    // struct, padding included
    memset(&op, 0, sizeof(ceph_osd_op));
  }

  // construct with the given opcode; all other op fields zeroed
  OSDOp(const int op_code) {
    memset(&op, 0, sizeof(ceph_osd_op));
    op.op = op_code;
  }

  /**
   * split a ceph::buffer::list into constituent indata members of a vector of OSDOps
   *
   * @param ops [out] vector of OSDOps
   * @param in  [in] combined data buffer
   */
  static void split_osd_op_vector_in_data(std::vector<OSDOp>& ops, ceph::buffer::list& in);

  /**
   * merge indata members of a vector of OSDOp into a single ceph::buffer::list
   *
   * Notably this also encodes certain other OSDOp data into the data
   * buffer, including the sobject_t soid.
   *
   * @param ops [in] vector of OSDOps
   * @param out [out] combined data buffer
   */
  static void merge_osd_op_vector_in_data(std::vector<OSDOp>& ops, ceph::buffer::list& out);

  /**
   * split a ceph::buffer::list into constituent outdata members of a vector of OSDOps
   *
   * @param ops [out] vector of OSDOps
   * @param in  [in] combined data buffer
   */
  static void split_osd_op_vector_out_data(std::vector<OSDOp>& ops, ceph::buffer::list& in);

  /**
   * merge outdata members of a vector of OSDOps into a single ceph::buffer::list
   *
   * @param ops [in] vector of OSDOps
   * @param out [out] combined data buffer
   */
  static void merge_osd_op_vector_out_data(std::vector<OSDOp>& ops, ceph::buffer::list& out);

  /**
   * Clear data as much as possible, leave minimal data for historical op dump
   *
   * @param ops [in] vector of OSDOps
   */
  static void clear_data(std::vector<OSDOp>& ops);
};
3988 	std::ostream& operator<<(std::ostream& out, const OSDOp& op);
3989 	
3990 	
3991 	struct pg_log_op_return_item_t {
3992 	  int32_t rval;
3993 	  bufferlist bl;
3994 	  void encode(bufferlist& p) const {
3995 	    using ceph::encode;
3996 	    encode(rval, p);
3997 	    encode(bl, p);
3998 	  }
3999 	  void decode(bufferlist::const_iterator& p) {
4000 	    using ceph::decode;
4001 	    decode(rval, p);
4002 	    decode(bl, p);
4003 	  }
4004 	  void dump(Formatter *f) const {
4005 	    f->dump_int("rval", rval);
4006 	    f->dump_unsigned("bl_length", bl.length());
4007 	  }
4008 	  friend bool operator==(const pg_log_op_return_item_t& lhs,
4009 				 const pg_log_op_return_item_t& rhs) {
4010 	    return lhs.rval == rhs.rval &&
4011 	      lhs.bl.contents_equal(rhs.bl);
4012 	  }
4013 	  friend bool operator!=(const pg_log_op_return_item_t& lhs,
4014 				 const pg_log_op_return_item_t& rhs) {
4015 	    return !(lhs == rhs);
4016 	  }
4017 	  friend ostream& operator<<(ostream& out, const pg_log_op_return_item_t& i) {
4018 	    return out << "r=" << i.rval << "+" << i.bl.length() << "b";
4019 	  }
4020 	};
4021 	WRITE_CLASS_ENCODER(pg_log_op_return_item_t)
4022 	
4023 	/**
4024 	 * pg_log_entry_t - single entry/event in pg log
4025 	 *
4026 	 */
4027 	struct pg_log_entry_t {
4028 	  enum {
4029 	    MODIFY = 1,   // some unspecified modification (but not *all* modifications)
4030 	    CLONE = 2,    // cloned object from head
4031 	    DELETE = 3,   // deleted object
4032 	    //BACKLOG = 4,  // event invented by generate_backlog [obsolete]
4033 	    LOST_REVERT = 5, // lost new version, revert to an older version.
4034 	    LOST_DELETE = 6, // lost new version, revert to no object (deleted).
4035 	    LOST_MARK = 7,   // lost new version, now EIO
4036 	    PROMOTE = 8,     // promoted object from another tier
4037 	    CLEAN = 9,       // mark an object clean
4038 	    ERROR = 10,      // write that returned an error
4039 	  };
4040 	  static const char *get_op_name(int op) {
4041 	    switch (op) {
4042 	    case MODIFY:
4043 	      return "modify";
4044 	    case PROMOTE:
4045 	      return "promote";
4046 	    case CLONE:
4047 	      return "clone";
4048 	    case DELETE:
4049 	      return "delete";
4050 	    case LOST_REVERT:
4051 	      return "l_revert";
4052 	    case LOST_DELETE:
4053 	      return "l_delete";
4054 	    case LOST_MARK:
4055 	      return "l_mark";
4056 	    case CLEAN:
4057 	      return "clean";
4058 	    case ERROR:
4059 	      return "error";
4060 	    default:
4061 	      return "unknown";
4062 	    }
4063 	  }
4064 	  const char *get_op_name() const {
4065 	    return get_op_name(op);
4066 	  }
4067 	
4068 	  // describes state for a locally-rollbackable entry
4069 	  ObjectModDesc mod_desc;
4070 	  ceph::buffer::list snaps;   // only for clone entries
4071 	  hobject_t  soid;
4072 	  osd_reqid_t reqid;  // caller+tid to uniquely identify request
4073 	  mempool::osd_pglog::vector<std::pair<osd_reqid_t, version_t> > extra_reqids;
4074 	
4075 	  /// map extra_reqids by index to error return code (if any)
4076 	  mempool::osd_pglog::map<uint32_t, int> extra_reqid_return_codes;
4077 	
4078 	  eversion_t version, prior_version, reverting_to;
4079 	  version_t user_version; // the user version for this entry
4080 	  utime_t     mtime;  // this is the _user_ mtime, mind you
4081 	  int32_t return_code; // only stored for ERRORs for dup detection
4082 	
4083 	  vector<pg_log_op_return_item_t> op_returns;
4084 	
4085 	  __s32      op;
4086 	  bool invalid_hash; // only when decoding sobject_t based entries
4087 	  bool invalid_pool; // only when decoding pool-less hobject based entries
4088 	  ObjectCleanRegions clean_regions;
4089 	
4090 	  pg_log_entry_t()
4091 	   : user_version(0), return_code(0), op(0),
4092 	     invalid_hash(false), invalid_pool(false) {
4093 	    snaps.reassign_to_mempool(mempool::mempool_osd_pglog);
4094 	  }
4095 	  pg_log_entry_t(int _op, const hobject_t& _soid,
4096 	                const eversion_t& v, const eversion_t& pv,
4097 	                version_t uv,
4098 	                const osd_reqid_t& rid, const utime_t& mt,
4099 	                int return_code)
4100 	   : soid(_soid), reqid(rid), version(v), prior_version(pv), user_version(uv),
4101 	     mtime(mt), return_code(return_code), op(_op),
4102 	     invalid_hash(false), invalid_pool(false) {
4103 	    snaps.reassign_to_mempool(mempool::mempool_osd_pglog);
4104 	  }
4105 	      
4106 	  bool is_clone() const { return op == CLONE; }
4107 	  bool is_modify() const { return op == MODIFY; }
4108 	  bool is_promote() const { return op == PROMOTE; }
4109 	  bool is_clean() const { return op == CLEAN; }
4110 	  bool is_lost_revert() const { return op == LOST_REVERT; }
4111 	  bool is_lost_delete() const { return op == LOST_DELETE; }
4112 	  bool is_lost_mark() const { return op == LOST_MARK; }
4113 	  bool is_error() const { return op == ERROR; }
4114 	
4115 	  bool is_update() const {
4116 	    return
4117 	      is_clone() || is_modify() || is_promote() || is_clean() ||
4118 	      is_lost_revert() || is_lost_mark();
4119 	  }
4120 	  bool is_delete() const {
4121 	    return op == DELETE || op == LOST_DELETE;
4122 	  }
4123 	
4124 	  bool can_rollback() const {
4125 	    return mod_desc.can_rollback();
4126 	  }
4127 	
4128 	  void mark_unrollbackable() {
4129 	    mod_desc.mark_unrollbackable();
4130 	  }
4131 	
4132 	  bool requires_kraken() const {
4133 	    return mod_desc.requires_kraken();
4134 	  }
4135 	
4136 	  // Errors are only used for dup detection, whereas
4137 	  // the index by objects is used by recovery, copy_get,
4138 	  // and other facilities that don't expect or need to
4139 	  // be aware of error entries.
4140 	  bool object_is_indexed() const {
4141 	    return !is_error();
4142 	  }
4143 	
4144 	  bool reqid_is_indexed() const {
4145 	    return reqid != osd_reqid_t() &&
4146 	      (op == MODIFY || op == DELETE || op == ERROR);
4147 	  }
4148 	
4149 	  void set_op_returns(std::vector<OSDOp>& ops) {
4150 	    op_returns.resize(ops.size());
4151 	    for (unsigned i = 0; i < ops.size(); ++i) {
4152 	      op_returns[i].rval = ops[i].rval;
4153 	      op_returns[i].bl = ops[i].outdata;
4154 	    }
4155 	  }
4156 	
4157 	  std::string get_key_name() const;
4158 	  void encode_with_checksum(ceph::buffer::list& bl) const;
4159 	  void decode_with_checksum(ceph::buffer::list::const_iterator& p);
4160 	
4161 	  void encode(ceph::buffer::list &bl) const;
4162 	  void decode(ceph::buffer::list::const_iterator &bl);
4163 	  void dump(ceph::Formatter *f) const;
4164 	  static void generate_test_instances(std::list<pg_log_entry_t*>& o);
4165 	
4166 	};
4167 	WRITE_CLASS_ENCODER(pg_log_entry_t)
4168 	
4169 	std::ostream& operator<<(std::ostream& out, const pg_log_entry_t& e);
4170 	
4171 	struct pg_log_dup_t {
4172 	  osd_reqid_t reqid;  // caller+tid to uniquely identify request
4173 	  eversion_t version;
4174 	  version_t user_version; // the user version for this entry
4175 	  int32_t return_code; // only stored for ERRORs for dup detection
4176 	
4177 	  vector<pg_log_op_return_item_t> op_returns;
4178 	
4179 	  pg_log_dup_t()
4180 	    : user_version(0), return_code(0)
4181 	  {}
4182 	  explicit pg_log_dup_t(const pg_log_entry_t& entry)
4183 	    : reqid(entry.reqid), version(entry.version),
4184 	      user_version(entry.user_version),
4185 	      return_code(entry.return_code),
4186 	      op_returns(entry.op_returns)
4187 	  {}
4188 	  pg_log_dup_t(const eversion_t& v, version_t uv,
4189 		       const osd_reqid_t& rid, int return_code)
4190 	    : reqid(rid), version(v), user_version(uv),
4191 	      return_code(return_code)
4192 	  {}
4193 	
4194 	  std::string get_key_name() const;
4195 	  void encode(ceph::buffer::list &bl) const;
4196 	  void decode(ceph::buffer::list::const_iterator &bl);
4197 	  void dump(ceph::Formatter *f) const;
4198 	  static void generate_test_instances(std::list<pg_log_dup_t*>& o);
4199 	
4200 	  bool operator==(const pg_log_dup_t &rhs) const {
4201 	    return reqid == rhs.reqid &&
4202 	      version == rhs.version &&
4203 	      user_version == rhs.user_version &&
4204 	      return_code == rhs.return_code &&
4205 	      op_returns == rhs.op_returns;
4206 	  }
4207 	  bool operator!=(const pg_log_dup_t &rhs) const {
4208 	    return !(*this == rhs);
4209 	  }
4210 	
4211 	  friend std::ostream& operator<<(std::ostream& out, const pg_log_dup_t& e);
4212 	};
4213 	WRITE_CLASS_ENCODER(pg_log_dup_t)
4214 	
4215 	std::ostream& operator<<(std::ostream& out, const pg_log_dup_t& e);
4216 	
4217 	/**
4218 	 * pg_log_t - incremental log of recent pg changes.
4219 	 *
4220 	 *  serves as a recovery queue for recent changes.
4221 	 */
4222 	struct pg_log_t {
4223 	  /*
4224 	   *   head - newest entry (update|delete)
4225 	   *   tail - entry previous to oldest (update|delete) for which we have
4226 	   *          complete negative information.  
4227 	   * i.e. we can infer pg contents for any store whose last_update >= tail.
4228 	   */
4229 	  eversion_t head;    // newest entry
4230 	  eversion_t tail;    // version prior to oldest
4231 	
4232 	protected:
4233 	  // We can rollback rollback-able entries > can_rollback_to
4234 	  eversion_t can_rollback_to;
4235 	
4236 	  // always <= can_rollback_to, indicates how far stashed rollback
4237 	  // data can be found
4238 	  eversion_t rollback_info_trimmed_to;
4239 	
4240 	public:
4241 	  // the actual log
4242 	  mempool::osd_pglog::list<pg_log_entry_t> log;
4243 	
4244 	  // entries just for dup op detection ordered oldest to newest
4245 	  mempool::osd_pglog::list<pg_log_dup_t> dups;
4246 	
4247 	  pg_log_t() = default;
4248 	  pg_log_t(const eversion_t &last_update,
4249 		   const eversion_t &log_tail,
4250 		   const eversion_t &can_rollback_to,
4251 		   const eversion_t &rollback_info_trimmed_to,
4252 		   mempool::osd_pglog::list<pg_log_entry_t> &&entries,
4253 		   mempool::osd_pglog::list<pg_log_dup_t> &&dup_entries)
4254 	    : head(last_update), tail(log_tail), can_rollback_to(can_rollback_to),
4255 	      rollback_info_trimmed_to(rollback_info_trimmed_to),
4256 	      log(std::move(entries)), dups(std::move(dup_entries)) {}
4257 	  pg_log_t(const eversion_t &last_update,
4258 		   const eversion_t &log_tail,
4259 		   const eversion_t &can_rollback_to,
4260 		   const eversion_t &rollback_info_trimmed_to,
4261 		   const std::list<pg_log_entry_t> &entries,
4262 		   const std::list<pg_log_dup_t> &dup_entries)
4263 	    : head(last_update), tail(log_tail), can_rollback_to(can_rollback_to),
4264 	      rollback_info_trimmed_to(rollback_info_trimmed_to) {
4265 	    for (auto &&entry: entries) {
4266 	      log.push_back(entry);
4267 	    }
4268 	    for (auto &&entry: dup_entries) {
4269 	      dups.push_back(entry);
4270 	    }
4271 	  }
4272 	
4273 	  void clear() {
4274 	    eversion_t z;
4275 	    rollback_info_trimmed_to = can_rollback_to = head = tail = z;
4276 	    log.clear();
4277 	    dups.clear();
4278 	  }
4279 	
4280 	  eversion_t get_rollback_info_trimmed_to() const {
4281 	    return rollback_info_trimmed_to;
4282 	  }
4283 	  eversion_t get_can_rollback_to() const {
4284 	    return can_rollback_to;
4285 	  }
4286 	
4287 	
4288 	  pg_log_t split_out_child(pg_t child_pgid, unsigned split_bits) {
4289 	    mempool::osd_pglog::list<pg_log_entry_t> oldlog, childlog;
4290 	    oldlog.swap(log);
4291 	
4292 	    eversion_t old_tail;
4293 	    unsigned mask = ~((~0)<<split_bits);
4294 	    for (auto i = oldlog.begin();
4295 		 i != oldlog.end();
4296 	      ) {
4297 	      if ((i->soid.get_hash() & mask) == child_pgid.m_seed) {
4298 		childlog.push_back(*i);
4299 	      } else {
4300 		log.push_back(*i);
4301 	      }
4302 	      oldlog.erase(i++);
4303 	    }
4304 	
4305 	    // osd_reqid is unique, so it doesn't matter if there are extra
4306 	    // dup entries in each pg. To avoid storing oid with the dup
4307 	    // entries, just copy the whole list.
4308 	    auto childdups(dups);
4309 	
4310 	    return pg_log_t(
4311 	      head,
4312 	      tail,
4313 	      can_rollback_to,
4314 	      rollback_info_trimmed_to,
4315 	      std::move(childlog),
4316 	      std::move(childdups));
4317 	    }
4318 	
4319 	  mempool::osd_pglog::list<pg_log_entry_t> rewind_from_head(eversion_t newhead) {
4320 	    ceph_assert(newhead >= tail);
4321 	
4322 	    mempool::osd_pglog::list<pg_log_entry_t>::iterator p = log.end();
4323 	    mempool::osd_pglog::list<pg_log_entry_t> divergent;
4324 	    while (true) {
4325 	      if (p == log.begin()) {
4326 		// yikes, the whole thing is divergent!
4327 		using std::swap;
4328 		swap(divergent, log);
4329 		break;
4330 	      }
4331 	      --p;
4332 	      if (p->version.version <= newhead.version) {
4333 		/*
4334 		 * look at eversion.version here.  we want to avoid a situation like:
4335 		 *  our log: 100'10 (0'0) m 10000004d3a.00000000/head by client4225.1:18529
4336 		 *  new log: 122'10 (0'0) m 10000004d3a.00000000/head by client4225.1:18529
4337 		 *  lower_bound = 100'9
4338 		 * i.e, same request, different version.  If the eversion.version is > the
4339 		 * lower_bound, we it is divergent.
4340 		 */
4341 		++p;
4342 		divergent.splice(divergent.begin(), log, p, log.end());
4343 		break;
4344 	      }
4345 	      ceph_assert(p->version > newhead);
4346 	    }
4347 	    head = newhead;
4348 	
4349 	    if (can_rollback_to > newhead)
4350 	      can_rollback_to = newhead;
4351 	
4352 	    if (rollback_info_trimmed_to > newhead)
4353 	      rollback_info_trimmed_to = newhead;
4354 	
4355 	    return divergent;
4356 	  }
4357 	
4358 	  void merge_from(const std::vector<pg_log_t*>& slogs, eversion_t last_update) {
4359 	    log.clear();
4360 	
4361 	    // sort and merge dups
4362 	    std::multimap<eversion_t,pg_log_dup_t> sorted;
4363 	    for (auto& d : dups) {
4364 	      sorted.emplace(d.version, d);
4365 	    }
4366 	    for (auto l : slogs) {
4367 	      for (auto& d : l->dups) {
4368 		sorted.emplace(d.version, d);
4369 	      }
4370 	    }
4371 	    dups.clear();
4372 	    for (auto& i : sorted) {
4373 	      dups.push_back(i.second);
4374 	    }
4375 	
4376 	    head = last_update;
4377 	    tail = last_update;
4378 	    can_rollback_to = last_update;
4379 	    rollback_info_trimmed_to = last_update;
4380 	  }
4381 	
4382 	  bool empty() const {
4383 	    return log.empty();
4384 	  }
4385 	
4386 	  bool null() const {
4387 	    return head.version == 0 && head.epoch == 0;
4388 	  }
4389 	
4390 	  uint64_t approx_size() const {
4391 	    return head.version - tail.version;
4392 	  }
4393 	
4394 	  static void filter_log(spg_t import_pgid, const OSDMap &curmap,
4395 	    const std::string &hit_set_namespace, const pg_log_t &in,
4396 	    pg_log_t &out, pg_log_t &reject);
4397 	
4398 	  /**
4399 	   * copy entries from the tail of another pg_log_t
4400 	   *
4401 	   * @param other pg_log_t to copy from
4402 	   * @param from copy entries after this version
4403 	   */
4404 	  void copy_after(CephContext* cct, const pg_log_t &other, eversion_t from);
4405 	
4406 	  /**
4407 	   * copy up to N entries
4408 	   *
4409 	   * @param other source log
4410 	   * @param max max number of entries to copy
4411 	   */
4412 	  void copy_up_to(CephContext* cct, const pg_log_t &other, int max);
4413 	
4414 	  std::ostream& print(std::ostream& out) const;
4415 	
4416 	  void encode(ceph::buffer::list &bl) const;
4417 	  void decode(ceph::buffer::list::const_iterator &bl, int64_t pool = -1);
4418 	  void dump(ceph::Formatter *f) const;
4419 	  static void generate_test_instances(std::list<pg_log_t*>& o);
4420 	};
4421 	WRITE_CLASS_ENCODER(pg_log_t)
4422 	
4423 	inline std::ostream& operator<<(std::ostream& out, const pg_log_t& log)
4424 	{
4425 	  out << "log((" << log.tail << "," << log.head << "], crt="
4426 	      << log.get_can_rollback_to() << ")";
4427 	  return out;
4428 	}
4429 	
4430 	
4431 	/**
4432 	 * pg_missing_t - summary of missing objects.
4433 	 *
4434 	 *  kept in memory, as a supplement to pg_log_t
4435 	 *  also used to pass missing info in messages.
4436 	 */
struct pg_missing_item {
  eversion_t need, have;  // need: version to recover to; have: version we hold locally (0'0 = none)
  ObjectCleanRegions clean_regions;  // regions that do NOT need to be re-copied
  enum missing_flags_t {
    FLAG_NONE = 0,
    FLAG_DELETE = 1,  // the missing op is a delete, not an update
  } flags;
  pg_missing_item() : flags(FLAG_NONE) {}
  explicit pg_missing_item(eversion_t n) : need(n), flags(FLAG_NONE) {}  // have no old version
  pg_missing_item(eversion_t n, eversion_t h, bool is_delete=false, bool old_style = false) :
    need(n), have(h) {
    set_delete(is_delete);
    // old_style: no clean-region information available, so treat the
    // whole object as dirty
    if (old_style)
      clean_regions.mark_fully_dirty();
  }

  // Wire format is version-less; the leading eversion values let decode()
  // distinguish the historical layouts (see decode below).  Field order is
  // part of the on-wire contract - do not reorder.
  void encode(ceph::buffer::list& bl, uint64_t features) const {
    using ceph::encode;
    if (HAVE_FEATURE(features, SERVER_OCTOPUS)) {
      // encoding a zeroed eversion_t to differentiate between OSD_RECOVERY_DELETES,
      // SERVER_OCTOPUS and legacy unversioned encoding - a need value of 0'0 is not
      // possible. This can be replaced with the legacy encoding
      encode(eversion_t(), bl);
      encode(eversion_t(-1, -1), bl);  // sentinel marking the octopus layout
      encode(need, bl);
      encode(have, bl);
      encode(static_cast<uint8_t>(flags), bl);
      encode(clean_regions, bl);
    } else {
      // pre-octopus layout: no sentinel, no clean_regions
      encode(eversion_t(), bl);
      encode(need, bl);
      encode(have, bl);
      encode(static_cast<uint8_t>(flags), bl);
    }
  }
  void decode(ceph::buffer::list::const_iterator& bl) {
    using ceph::decode;
    eversion_t e, l;
    decode(e, bl);
    decode(l, bl);
    if(l == eversion_t(-1, -1)) {
      // sentinel present: full octopus layout including clean_regions
      decode(need, bl);
      decode(have, bl);
      uint8_t f;
      decode(f, bl);
      flags = static_cast<missing_flags_t>(f);
      decode(clean_regions, bl);
     } else {
      // pre-octopus layout: l was actually the need value; no
      // clean_regions on the wire, so assume everything is dirty
      need = l;
      decode(have, bl);
      uint8_t f;
      decode(f, bl);
      flags = static_cast<missing_flags_t>(f);
      clean_regions.mark_fully_dirty();
    }
  }

  void set_delete(bool is_delete) {
    flags = is_delete ? FLAG_DELETE : FLAG_NONE;
  }

  bool is_delete() const {
    return (flags & FLAG_DELETE) == FLAG_DELETE;
  }

  std::string flag_str() const {
    if (flags == FLAG_NONE) {
      return "none";
    } else {
      return "delete";
    }
  }

  void dump(ceph::Formatter *f) const {
    f->dump_stream("need") << need;
    f->dump_stream("have") << have;
    f->dump_stream("flags") << flag_str();
    f->dump_stream("clean_regions") << clean_regions;
  }
  static void generate_test_instances(std::list<pg_missing_item*>& o) {
    o.push_back(new pg_missing_item);
    o.push_back(new pg_missing_item);
    o.back()->need = eversion_t(1, 2);
    o.back()->have = eversion_t(1, 1);
    o.push_back(new pg_missing_item);
    o.back()->need = eversion_t(3, 5);
    o.back()->have = eversion_t(3, 4);
    o.back()->clean_regions.mark_data_region_dirty(4096, 8192);
    o.back()->clean_regions.mark_omap_dirty();
    o.back()->flags = FLAG_DELETE;
  }
  // NOTE(review): clean_regions is not part of the comparison — looks
  // intentional (it is derived recovery state), but confirm before relying
  // on equality semantics.
  bool operator==(const pg_missing_item &rhs) const {
    return need == rhs.need && have == rhs.have && flags == rhs.flags;
  }
  bool operator!=(const pg_missing_item &rhs) const {
    return !(*this == rhs);
  }
};
4537 	WRITE_CLASS_ENCODER_FEATURES(pg_missing_item)
4538 	std::ostream& operator<<(std::ostream& out, const pg_missing_item &item);
4539 	
/**
 * pg_missing_const_i - read-only interface over a set of missing objects.
 *
 * Implemented by pg_missing_set<> below; lets callers query the missing
 * map without caring whether change tracking is enabled.
 */
class pg_missing_const_i {
public:
  /// oid -> pg_missing_item for every missing object
  virtual const std::map<hobject_t, pg_missing_item> &
    get_items() const = 0;
  /// reverse index: need.version -> oid
  virtual const std::map<version_t, hobject_t> &get_rmissing() const = 0;
  /// true if FLAG_DELETE entries may appear among the items
  virtual bool get_may_include_deletes() const = 0;
  /// number of missing objects
  virtual unsigned int num_missing() const = 0;
  /// true if at least one object is missing
  virtual bool have_missing() const = 0;
  /// is oid missing?  if so, optionally copy its item to *out
  virtual bool is_missing(const hobject_t& oid, pg_missing_item *out = nullptr) const = 0;
  /// is oid missing with need <= v?
  virtual bool is_missing(const hobject_t& oid, eversion_t v) const = 0;
  virtual ~pg_missing_const_i() {}
};
4552 	
4553 	
4554 	template <bool Track>
4555 	class ChangeTracker {
4556 	public:
4557 	  void changed(const hobject_t &obj) {}
4558 	  template <typename F>
4559 	  void get_changed(F &&f) const {}
4560 	  void flush() {}
4561 	  bool is_clean() const {
4562 	    return true;
4563 	  }
4564 	};
4565 	template <>
4566 	class ChangeTracker<true> {
4567 	  std::set<hobject_t> _changed;
4568 	public:
4569 	  void changed(const hobject_t &obj) {
4570 	    _changed.insert(obj);
4571 	  }
4572 	  template <typename F>
4573 	  void get_changed(F &&f) const {
4574 	    for (auto const &i: _changed) {
4575 	      f(i);
4576 	    }
4577 	  }
4578 	  void flush() {
4579 	    _changed.clear();
4580 	  }
4581 	  bool is_clean() const {
4582 	    return _changed.empty();
4583 	  }
4584 	};
4585 	
4586 	template <bool TrackChanges>
4587 	class pg_missing_set : public pg_missing_const_i {
4588 	  using item = pg_missing_item;
4589 	  std::map<hobject_t, item> missing;  // oid -> (need v, have v)
4590 	  std::map<version_t, hobject_t> rmissing;  // v -> oid
4591 	  ChangeTracker<TrackChanges> tracker;
4592 	
4593 	public:
4594 	  pg_missing_set() = default;
4595 	
4596 	  template <typename missing_type>
4597 	  pg_missing_set(const missing_type &m) {
4598 	    missing = m.get_items();
4599 	    rmissing = m.get_rmissing();
4600 	    may_include_deletes = m.get_may_include_deletes();
4601 	    for (auto &&i: missing)
4602 	      tracker.changed(i.first);
4603 	  }
4604 	
4605 	  bool may_include_deletes = false;
4606 	
4607 	  const std::map<hobject_t, item> &get_items() const override {
4608 	    return missing;
4609 	  }
4610 	  const std::map<version_t, hobject_t> &get_rmissing() const override {
4611 	    return rmissing;
4612 	  }
4613 	  bool get_may_include_deletes() const override {
4614 	    return may_include_deletes;
4615 	  }
4616 	  unsigned int num_missing() const override {
4617 	    return missing.size();
4618 	  }
4619 	  bool have_missing() const override {
4620 	    return !missing.empty();
4621 	  }
4622 	  void merge(const pg_log_entry_t& e) {
4623 	    auto miter = missing.find(e.soid);
4624 	    if (miter != missing.end() && miter->second.have != eversion_t() && e.version > miter->second.have)
4625 	      miter->second.clean_regions.merge(e.clean_regions);
4626 	  }
4627 	  bool is_missing(const hobject_t& oid, pg_missing_item *out = nullptr) const override {
4628 	    auto iter = missing.find(oid);
4629 	    if (iter == missing.end())
4630 	      return false;
4631 	    if (out)
4632 	      *out = iter->second;
4633 	    return true;
4634 	  }
4635 	  bool is_missing(const hobject_t& oid, eversion_t v) const override {
4636 	    std::map<hobject_t, item>::const_iterator m =
4637 	      missing.find(oid);
4638 	    if (m == missing.end())
4639 	      return false;
4640 	    const item &item(m->second);
4641 	    if (item.need > v)
4642 	      return false;
4643 	    return true;
4644 	  }
4645 	  eversion_t get_oldest_need() const {
4646 	    if (missing.empty()) {
4647 	      return eversion_t();
4648 	    }
4649 	    auto it = missing.find(rmissing.begin()->second);
4650 	    ceph_assert(it != missing.end());
4651 	    return it->second.need;
4652 	  }
4653 	
4654 	  void claim(pg_missing_set& o) {
4655 	    static_assert(!TrackChanges, "Can't use claim with TrackChanges");
4656 	    missing.swap(o.missing);
4657 	    rmissing.swap(o.rmissing);
4658 	  }
4659 	
4660 	  /*
4661 	   * this needs to be called in log order as we extend the log.  it
4662 	   * assumes missing is accurate up through the previous log entry.
4663 	   */
4664 	  void add_next_event(const pg_log_entry_t& e) {
4665 	    std::map<hobject_t, item>::iterator missing_it;
4666 	    missing_it = missing.find(e.soid);
4667 	    bool is_missing_divergent_item = missing_it != missing.end();
4668 	    if (e.prior_version == eversion_t() || e.is_clone()) {
4669 	      // new object.
4670 	      if (is_missing_divergent_item) {  // use iterator
4671 	        rmissing.erase(missing_it->second.need.version);
4672 	        // .have = nil
4673 	        missing_it->second = item(e.version, eversion_t(), e.is_delete());
4674 	        missing_it->second.clean_regions.mark_fully_dirty();
4675 	      } else {
4676 	         // create new element in missing map
4677 	         // .have = nil
4678 	        missing[e.soid] = item(e.version, eversion_t(), e.is_delete());
4679 	        missing[e.soid].clean_regions.mark_fully_dirty();
4680 	      }
4681 	    } else if (is_missing_divergent_item) {
4682 	      // already missing (prior).
4683 	      rmissing.erase((missing_it->second).need.version);
4684 	      missing_it->second.need = e.version;  // leave .have unchanged.
4685 	      missing_it->second.set_delete(e.is_delete());
4686 	      if (e.is_lost_revert())
4687 	        missing_it->second.clean_regions.mark_fully_dirty();
4688 	      else
4689 	        missing_it->second.clean_regions.merge(e.clean_regions);
4690 	    } else {
4691 	      // not missing, we must have prior_version (if any)
4692 	      ceph_assert(!is_missing_divergent_item);
4693 	      missing[e.soid] = item(e.version, e.prior_version, e.is_delete());
4694 	      if (e.is_lost_revert())
4695 	        missing[e.soid].clean_regions.mark_fully_dirty();
4696 	      else
4697 	        missing[e.soid].clean_regions = e.clean_regions;
4698 	    }
4699 	    rmissing[e.version.version] = e.soid;
4700 	    tracker.changed(e.soid);
4701 	  }
4702 	
4703 	  void revise_need(hobject_t oid, eversion_t need, bool is_delete) {
4704 	    auto p = missing.find(oid);
4705 	    if (p != missing.end()) {
4706 	      rmissing.erase((p->second).need.version);
4707 	      p->second.need = need;          // do not adjust .have
4708 	      p->second.set_delete(is_delete);
4709 	      p->second.clean_regions.mark_fully_dirty();
4710 	    } else {
4711 	      missing[oid] = item(need, eversion_t(), is_delete);
4712 	      missing[oid].clean_regions.mark_fully_dirty();
4713 	    }
4714 	    rmissing[need.version] = oid;
4715 	
4716 	    tracker.changed(oid);
4717 	  }
4718 	
4719 	  void revise_have(hobject_t oid, eversion_t have) {
4720 	    auto p = missing.find(oid);
4721 	    if (p != missing.end()) {
4722 	      tracker.changed(oid);
4723 	      (p->second).have = have;
4724 	    }
4725 	  }
4726 	
4727 	  void mark_fully_dirty(const hobject_t& oid) {
4728 	    auto p = missing.find(oid);
4729 	    if (p != missing.end()) {
4730 	      tracker.changed(oid);
4731 	      (p->second).clean_regions.mark_fully_dirty();
4732 	    }
4733 	  }
4734 	
4735 	  void add(const hobject_t& oid, eversion_t need, eversion_t have,
4736 		   bool is_delete) {
4737 	    missing[oid] = item(need, have, is_delete, true);
4738 	    rmissing[need.version] = oid;
4739 	    tracker.changed(oid);
4740 	  }
4741 	
4742 	  void add(const hobject_t& oid, pg_missing_item&& item) {
4743 	    rmissing[item.need.version] = oid;
4744 	    missing.insert({oid, std::move(item)});
4745 	    tracker.changed(oid);
4746 	  }
4747 	
4748 	  void rm(const hobject_t& oid, eversion_t v) {
4749 	    std::map<hobject_t, item>::iterator p = missing.find(oid);
4750 	    if (p != missing.end() && p->second.need <= v)
4751 	      rm(p);
4752 	  }
4753 	
4754 	  void rm(std::map<hobject_t, item>::const_iterator m) {
4755 	    tracker.changed(m->first);
4756 	    rmissing.erase(m->second.need.version);
4757 	    missing.erase(m);
4758 	  }
4759 	
4760 	  void got(const hobject_t& oid, eversion_t v) {
4761 	    std::map<hobject_t, item>::iterator p = missing.find(oid);
4762 	    ceph_assert(p != missing.end());
4763 	    ceph_assert(p->second.need <= v || p->second.is_delete());
4764 	    got(p);
4765 	  }
4766 	
4767 	  void got(std::map<hobject_t, item>::const_iterator m) {
4768 	    tracker.changed(m->first);
4769 	    rmissing.erase(m->second.need.version);
4770 	    missing.erase(m);
4771 	  }
4772 	
4773 	  void split_into(
4774 	    pg_t child_pgid,
4775 	    unsigned split_bits,
4776 	    pg_missing_set *omissing) {
4777 	    omissing->may_include_deletes = may_include_deletes;
4778 	    unsigned mask = ~((~0)<<split_bits);
4779 	    for (std::map<hobject_t, item>::iterator i = missing.begin();
4780 		 i != missing.end();
4781 	      ) {
4782 	      if ((i->first.get_hash() & mask) == child_pgid.m_seed) {
4783 		omissing->add(i->first, i->second.need, i->second.have,
4784 			      i->second.is_delete());
4785 		rm(i++);
4786 	      } else {
4787 		++i;
4788 	      }
4789 	    }
4790 	  }
4791 	
4792 	  void clear() {
4793 	    for (auto const &i: missing)
4794 	      tracker.changed(i.first);
4795 	    missing.clear();
4796 	    rmissing.clear();
4797 	  }
4798 	
4799 	  void encode(ceph::buffer::list &bl, uint64_t features) const {
4800 	    ENCODE_START(5, 2, bl)
4801 	    encode(missing, bl, features);
4802 	    encode(may_include_deletes, bl);
4803 	    ENCODE_FINISH(bl);
4804 	  }
4805 	  void decode(ceph::buffer::list::const_iterator &bl, int64_t pool = -1) {
4806 	    for (auto const &i: missing)
4807 	      tracker.changed(i.first);
4808 	    DECODE_START_LEGACY_COMPAT_LEN(5, 2, 2, bl);
4809 	    decode(missing, bl);
4810 	    if (struct_v >= 4) {
4811 	      decode(may_include_deletes, bl);
4812 	    }
4813 	    DECODE_FINISH(bl);
4814 	
4815 	    if (struct_v < 3) {
4816 	      // Handle hobject_t upgrade
4817 	      std::map<hobject_t, item> tmp;
4818 	      for (std::map<hobject_t, item>::iterator i =
4819 		     missing.begin();
4820 		   i != missing.end();
4821 		) {
4822 		if (!i->first.is_max() && i->first.pool == -1) {
4823 		  hobject_t to_insert(i->first);
4824 		  to_insert.pool = pool;
4825 		  tmp[to_insert] = i->second;
4826 		  missing.erase(i++);
4827 		} else {
4828 		  ++i;
4829 		}
4830 	      }
4831 	      missing.insert(tmp.begin(), tmp.end());
4832 	    }
4833 	
4834 	    for (std::map<hobject_t,item>::iterator it =
4835 		   missing.begin();
4836 		 it != missing.end();
4837 		 ++it)
4838 	      rmissing[it->second.need.version] = it->first;
4839 	    for (auto const &i: missing)
4840 	      tracker.changed(i.first);
4841 	  }
4842 	  void dump(ceph::Formatter *f) const {
4843 	    f->open_array_section("missing");
4844 	    for (std::map<hobject_t,item>::const_iterator p =
4845 		   missing.begin(); p != missing.end(); ++p) {
4846 	      f->open_object_section("item");
4847 	      f->dump_stream("object") << p->first;
4848 	      p->second.dump(f);
4849 	      f->close_section();
4850 	    }
4851 	    f->close_section();
4852 	    f->dump_bool("may_include_deletes", may_include_deletes);
4853 	  }
4854 	  template <typename F>
4855 	  void filter_objects(F &&f) {
4856 	    for (auto i = missing.begin(); i != missing.end();) {
4857 	      if (f(i->first)) {
4858 		rm(i++);
4859 	      } else {
4860 	        ++i;
4861 	      }
4862 	    }
4863 	  }
4864 	  static void generate_test_instances(std::list<pg_missing_set*>& o) {
4865 	    o.push_back(new pg_missing_set);
4866 	    o.back()->may_include_deletes = true;
4867 	    o.push_back(new pg_missing_set);
4868 	    o.back()->add(
4869 	      hobject_t(object_t("foo"), "foo", 123, 456, 0, ""),
4870 	      eversion_t(5, 6), eversion_t(5, 1), false);
4871 	    o.back()->may_include_deletes = true;
4872 	    o.push_back(new pg_missing_set);
4873 	    o.back()->add(
4874 	      hobject_t(object_t("foo"), "foo", 123, 456, 0, ""),
4875 	      eversion_t(5, 6), eversion_t(5, 1), true);
4876 	    o.back()->may_include_deletes = true;
4877 	  }
4878 	  template <typename F>
4879 	  void get_changed(F &&f) const {
4880 	    tracker.get_changed(f);
4881 	  }
4882 	  void flush() {
4883 	    tracker.flush();
4884 	  }
4885 	  bool is_clean() const {
4886 	    return tracker.is_clean();
4887 	  }
4888 	  template <typename missing_t>
4889 	  bool debug_verify_from_init(
4890 	    const missing_t &init_missing,
4891 	    std::ostream *oss) const {
4892 	    if (!TrackChanges)
4893 	      return true;
4894 	    auto check_missing(init_missing.get_items());
4895 	    tracker.get_changed([&](const hobject_t &hoid) {
4896 		check_missing.erase(hoid);
4897 		if (missing.count(hoid)) {
4898 		  check_missing.insert(*(missing.find(hoid)));
4899 		}
4900 	      });
4901 	    bool ok = true;
4902 	    if (check_missing.size() != missing.size()) {
4903 	      if (oss) {
4904 		*oss << "Size mismatch, check: " << check_missing.size()
4905 		     << ", actual: " << missing.size() << "\n";
4906 	      }
4907 	      ok = false;
4908 	    }
4909 	    for (auto &i: missing) {
4910 	      if (!check_missing.count(i.first)) {
4911 		if (oss)
4912 		  *oss << "check_missing missing " << i.first << "\n";
4913 		ok = false;
4914 	      } else if (check_missing[i.first] != i.second) {
4915 		if (oss)
4916 		  *oss << "check_missing missing item mismatch on " << i.first
4917 		       << ", check: " << check_missing[i.first]
4918 		       << ", actual: " << i.second << "\n";
4919 		ok = false;
4920 	      }
4921 	    }
4922 	    if (oss && !ok) {
4923 	      *oss << "check_missing: " << check_missing << "\n";
4924 	      std::set<hobject_t> changed;
4925 	      tracker.get_changed([&](const hobject_t &hoid) { changed.insert(hoid); });
4926 	      *oss << "changed: " << changed << "\n";
4927 	    }
4928 	    return ok;
4929 	  }
4930 	};
// Free-function encode shim so pg_missing_set (whose member encode takes a
// features argument) participates in the generic encode(obj, bl) protocol.
// The ENCODE_DUMP_* macros are diagnostic hooks (no-ops in normal builds).
template <bool TrackChanges>
void encode(
  const pg_missing_set<TrackChanges> &c, ceph::buffer::list &bl, uint64_t features=0) {
  ENCODE_DUMP_PRE();
  c.encode(bl, features);
  ENCODE_DUMP_POST(cl);
}
// Free-function decode shim so pg_missing_set participates in the generic
// decode(obj, iterator) protocol used by containers and encoding macros.
template <bool TrackChanges>
void decode(pg_missing_set<TrackChanges> &c, ceph::buffer::list::const_iterator &p) {
  c.decode(p);
}
4942 	template <bool TrackChanges>
4943 	std::ostream& operator<<(std::ostream& out, const pg_missing_set<TrackChanges> &missing)
4944 	{
4945 	  out << "missing(" << missing.num_missing()
4946 	      << " may_include_deletes = " << missing.may_include_deletes;
4947 	  //if (missing.num_lost()) out << ", " << missing.num_lost() << " lost";
4948 	  out << ")";
4949 	  return out;
4950 	}
4951 	
// Convenience aliases: untracked vs change-tracking missing sets.
using pg_missing_t = pg_missing_set<false>;         // no change tracking
using pg_missing_tracker_t = pg_missing_set<true>;  // records changed oids
4954 	
4955 	
4956 	/**
4957 	 * pg list objects response format
4958 	 *
4959 	 */
4960 	struct pg_nls_response_t {
4961 	  collection_list_handle_t handle;
4962 	  std::list<librados::ListObjectImpl> entries;
4963 	
4964 	  void encode(ceph::buffer::list& bl) const {
4965 	    ENCODE_START(1, 1, bl);
4966 	    encode(handle, bl);
4967 	    __u32 n = (__u32)entries.size();
4968 	    encode(n, bl);
4969 	    for (std::list<librados::ListObjectImpl>::const_iterator i = entries.begin(); i != entries.end(); ++i) {
4970 	      encode(i->nspace, bl);
4971 	      encode(i->oid, bl);
4972 	      encode(i->locator, bl);
4973 	    }
4974 	    ENCODE_FINISH(bl);
4975 	  }
4976 	  void decode(ceph::buffer::list::const_iterator& bl) {
4977 	    DECODE_START(1, bl);
4978 	    decode(handle, bl);
4979 	    __u32 n;
4980 	    decode(n, bl);
4981 	    entries.clear();
4982 	    while (n--) {
4983 	      librados::ListObjectImpl i;
4984 	      decode(i.nspace, bl);
4985 	      decode(i.oid, bl);
4986 	      decode(i.locator, bl);
4987 	      entries.push_back(i);
4988 	    }
4989 	    DECODE_FINISH(bl);
4990 	  }
4991 	  void dump(ceph::Formatter *f) const {
4992 	    f->dump_stream("handle") << handle;
4993 	    f->open_array_section("entries");
4994 	    for (std::list<librados::ListObjectImpl>::const_iterator p = entries.begin(); p != entries.end(); ++p) {
4995 	      f->open_object_section("object");
4996 	      f->dump_string("namespace", p->nspace);
4997 	      f->dump_string("object", p->oid);
4998 	      f->dump_string("key", p->locator);
4999 	      f->close_section();
5000 	    }
5001 	    f->close_section();
5002 	  }
5003 	  static void generate_test_instances(std::list<pg_nls_response_t*>& o) {
5004 	    o.push_back(new pg_nls_response_t);
5005 	    o.push_back(new pg_nls_response_t);
5006 	    o.back()->handle = hobject_t(object_t("hi"), "key", 1, 2, -1, "");
5007 	    o.back()->entries.push_back(librados::ListObjectImpl("", "one", ""));
5008 	    o.back()->entries.push_back(librados::ListObjectImpl("", "two", "twokey"));
5009 	    o.back()->entries.push_back(librados::ListObjectImpl("", "three", ""));
5010 	    o.push_back(new pg_nls_response_t);
5011 	    o.back()->handle = hobject_t(object_t("hi"), "key", 3, 4, -1, "");
5012 	    o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1one", ""));
5013 	    o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1two", "n1twokey"));
5014 	    o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1three", ""));
5015 	    o.push_back(new pg_nls_response_t);
5016 	    o.back()->handle = hobject_t(object_t("hi"), "key", 5, 6, -1, "");
5017 	    o.back()->entries.push_back(librados::ListObjectImpl("", "one", ""));
5018 	    o.back()->entries.push_back(librados::ListObjectImpl("", "two", "twokey"));
5019 	    o.back()->entries.push_back(librados::ListObjectImpl("", "three", ""));
5020 	    o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1one", ""));
5021 	    o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1two", "n1twokey"));
5022 	    o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1three", ""));
5023 	  }
5024 	};
5025 	
5026 	WRITE_CLASS_ENCODER(pg_nls_response_t)
5027 	
5028 	// For backwards compatibility with older OSD requests
5029 	struct pg_ls_response_t {
5030 	  collection_list_handle_t handle; 
5031 	  std::list<std::pair<object_t, std::string> > entries;
5032 	
5033 	  void encode(ceph::buffer::list& bl) const {
5034 	    using ceph::encode;
5035 	    __u8 v = 1;
5036 	    encode(v, bl);
5037 	    encode(handle, bl);
5038 	    encode(entries, bl);
5039 	  }
5040 	  void decode(ceph::buffer::list::const_iterator& bl) {
5041 	    using ceph::decode;
5042 	    __u8 v;
5043 	    decode(v, bl);
5044 	    ceph_assert(v == 1);
5045 	    decode(handle, bl);
5046 	    decode(entries, bl);
5047 	  }
5048 	  void dump(ceph::Formatter *f) const {
5049 	    f->dump_stream("handle") << handle;
5050 	    f->open_array_section("entries");
5051 	    for (std::list<std::pair<object_t, std::string> >::const_iterator p = entries.begin(); p != entries.end(); ++p) {
5052 	      f->open_object_section("object");
5053 	      f->dump_stream("object") << p->first;
5054 	      f->dump_string("key", p->second);
5055 	      f->close_section();
5056 	    }
5057 	    f->close_section();
5058 	  }
5059 	  static void generate_test_instances(std::list<pg_ls_response_t*>& o) {
5060 	    o.push_back(new pg_ls_response_t);
5061 	    o.push_back(new pg_ls_response_t);
5062 	    o.back()->handle = hobject_t(object_t("hi"), "key", 1, 2, -1, "");
5063 	    o.back()->entries.push_back(std::make_pair(object_t("one"), std::string()));
5064 	    o.back()->entries.push_back(std::make_pair(object_t("two"), std::string("twokey")));
5065 	  }
5066 	};
5067 	
5068 	WRITE_CLASS_ENCODER(pg_ls_response_t)
5069 	
5070 	/**
5071 	 * object_copy_cursor_t
5072 	 */
5073 	struct object_copy_cursor_t {
5074 	  uint64_t data_offset;
5075 	  std::string omap_offset;
5076 	  bool attr_complete;
5077 	  bool data_complete;
5078 	  bool omap_complete;
5079 	
5080 	  object_copy_cursor_t()
5081 	    : data_offset(0),
5082 	      attr_complete(false),
5083 	      data_complete(false),
5084 	      omap_complete(false)
5085 	  {}
5086 	
5087 	  bool is_initial() const {
5088 	    return !attr_complete && data_offset == 0 && omap_offset.empty();
5089 	  }
5090 	  bool is_complete() const {
5091 	    return attr_complete && data_complete && omap_complete;
5092 	  }
5093 	
5094 	  static void generate_test_instances(std::list<object_copy_cursor_t*>& o);
5095 	  void encode(ceph::buffer::list& bl) const;
5096 	  void decode(ceph::buffer::list::const_iterator &bl);
5097 	  void dump(ceph::Formatter *f) const;
5098 	};
5099 	WRITE_CLASS_ENCODER(object_copy_cursor_t)
5100 	
5101 	/**
5102 	 * object_copy_data_t
5103 	 *
5104 	 * Return data from a copy request. The semantics are a little strange
5105 	 * as a result of the encoding's heritage.
5106 	 *
5107 	 * In particular, the sender unconditionally fills in the cursor (from what
5108 	 * it receives and sends), the size, and the mtime, but is responsible for
5109 	 * figuring out whether it should put any data in the attrs, data, or
5110 	 * omap members (corresponding to xattrs, object data, and the omap entries)
5111 	 * based on external data (the client includes a max amount to return with
5112 	 * the copy request). The client then looks into the attrs, data, and/or omap
5113 	 * based on the contents of the cursor.
5114 	 */
struct object_copy_data_t {
  enum {
    FLAG_DATA_DIGEST = 1<<0,
    FLAG_OMAP_DIGEST = 1<<1,
  };
  object_copy_cursor_t cursor;  ///< position reached by this reply
  uint64_t size;                ///< total object size
  utime_t mtime;                ///< object mtime
  uint32_t data_digest, omap_digest;  ///< crc32c digests; valid per FLAG_* in flags
  uint32_t flags;               ///< FLAG_* bits above
  std::map<std::string, ceph::buffer::list> attrs;  ///< xattrs (only filled if cursor covers them)
  ceph::buffer::list data;        ///< object data for this chunk
  ceph::buffer::list omap_header; ///< omap header blob
  ceph::buffer::list omap_data;   ///< encoded omap entries

  /// which snaps we are defined for (if a snap and not the head)
  std::vector<snapid_t> snaps;
  /// latest snap seq for the object (if head)
  snapid_t snap_seq;

  /// recent reqids on this object
  mempool::osd_pglog::vector<std::pair<osd_reqid_t, version_t> > reqids;

  /// map reqids by index to error return code (if any)
  mempool::osd_pglog::map<uint32_t, int> reqid_return_codes;

  uint64_t truncate_seq;   ///< truncate metadata carried with the copy
  uint64_t truncate_size;

public:
  // size/digests start at (unsigned)-1 as "unknown" sentinels
  object_copy_data_t() :
    size((uint64_t)-1), data_digest(-1),
    omap_digest(-1), flags(0),
    truncate_seq(0),
    truncate_size(0) {}

  static void generate_test_instances(std::list<object_copy_data_t*>& o);
  void encode(ceph::buffer::list& bl, uint64_t features) const;
  void decode(ceph::buffer::list::const_iterator& bl);
  void dump(ceph::Formatter *f) const;
};
5156 	WRITE_CLASS_ENCODER_FEATURES(object_copy_data_t)
5157 	
5158 	/**
5159 	 * pg creation info
5160 	 */
5161 	struct pg_create_t {
5162 	  epoch_t created;   // epoch pg created
5163 	  pg_t parent;       // split from parent (if != pg_t())
5164 	  __s32 split_bits;
5165 	
5166 	  pg_create_t()
5167 	    : created(0), split_bits(0) {}
5168 	  pg_create_t(unsigned c, pg_t p, int s)
5169 	    : created(c), parent(p), split_bits(s) {}
5170 	
5171 	  void encode(ceph::buffer::list &bl) const;
5172 	  void decode(ceph::buffer::list::const_iterator &bl);
5173 	  void dump(ceph::Formatter *f) const;
5174 	  static void generate_test_instances(std::list<pg_create_t*>& o);
5175 	};
5176 	WRITE_CLASS_ENCODER(pg_create_t)
5177 	
5178 	// -----------------------------------------
5179 	
5180 	class ObjectExtent {
5181 	  /**
5182 	   * ObjectExtents are used for specifying IO behavior against RADOS
5183 	   * objects when one is using the ObjectCacher.
5184 	   *
5185 	   * To use this in a real system, *every member* must be filled
5186 	   * out correctly. In particular, make sure to initialize the
5187 	   * oloc correctly, as its default values are deliberate poison
5188 	   * and will cause internal ObjectCacher asserts.
5189 	   *
5190 	   * Similarly, your buffer_extents vector *must* specify a total
5191 	   * size equal to your length. If the buffer_extents inadvertently
5192 	   * contain less space than the length member specifies, you
5193 	   * will get unintelligible asserts deep in the ObjectCacher.
5194 	   *
5195 	   * If you are trying to do testing and don't care about actual
5196 	   * RADOS function, the simplest thing to do is to initialize
5197 	   * the ObjectExtent (truncate_size can be 0), create a single entry
5198 	   * in buffer_extents matching the length, and set oloc.pool to 0.
5199 	   */
5200 	 public:
5201 	  object_t    oid;       // object id
5202 	  uint64_t    objectno;
5203 	  uint64_t    offset;    // in object
5204 	  uint64_t    length;    // in object
5205 	  uint64_t    truncate_size;	// in object
5206 	
5207 	  object_locator_t oloc;   // object locator (pool etc)
5208 	
5209 	  std::vector<std::pair<uint64_t,uint64_t> >  buffer_extents;  // off -> len.  extents in buffer being mapped (may be fragmented bc of striping!)
5210 	  
5211 	  ObjectExtent() : objectno(0), offset(0), length(0), truncate_size(0) {}
5212 	  ObjectExtent(object_t o, uint64_t ono, uint64_t off, uint64_t l, uint64_t ts) :
5213 	    oid(o), objectno(ono), offset(off), length(l), truncate_size(ts) { }
5214 	};
5215 	
5216 	inline std::ostream& operator<<(std::ostream& out, const ObjectExtent &ex)
5217 	{
5218 	  return out << "extent(" 
5219 	             << ex.oid << " (" << ex.objectno << ") in " << ex.oloc
5220 	             << " " << ex.offset << "~" << ex.length
5221 		     << " -> " << ex.buffer_extents
5222 	             << ")";
5223 	}
5224 	
5225 	
5226 	// ---------------------------------------
5227 	
// Persistent per-OSD metadata stored in the OSD's local store.
class OSDSuperblock {
public:
  uuid_d cluster_fsid, osd_fsid;  // cluster identity / this OSD's identity
  int32_t whoami = -1;    // my role in this fs.
  epoch_t current_epoch = 0;             // most recent epoch
  epoch_t oldest_map = 0, newest_map = 0;    // oldest/newest maps we have.
  double weight = 0.0;

  CompatSet compat_features;  // on-disk feature compatibility set

  // last interval over which i mounted and was then active
  epoch_t mounted = 0;     // last epoch i mounted
  epoch_t clean_thru = 0;  // epoch i was active and clean thru

  epoch_t purged_snaps_last = 0;        // last epoch purged_snaps were scrubbed to
  utime_t last_purged_snaps_scrub;      // wall-clock time of that scrub

  void encode(ceph::buffer::list &bl) const;
  void decode(ceph::buffer::list::const_iterator &bl);
  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<OSDSuperblock*>& o);
};
5250 	WRITE_CLASS_ENCODER(OSDSuperblock)
5251 	
5252 	inline std::ostream& operator<<(std::ostream& out, const OSDSuperblock& sb)
5253 	{
5254 	  return out << "sb(" << sb.cluster_fsid
5255 	             << " osd." << sb.whoami
5256 		     << " " << sb.osd_fsid
5257 	             << " e" << sb.current_epoch
5258 	             << " [" << sb.oldest_map << "," << sb.newest_map << "]"
5259 		     << " lci=[" << sb.mounted << "," << sb.clean_thru << "]"
5260 	             << ")";
5261 	}
5262 	
5263 	
5264 	// -------
5265 	
5266 	
5267 	
5268 	
5269 	
5270 	
5271 	/*
5272 	 * attached to object head.  describes most recent snap context, and
5273 	 * set of existing clones.
5274 	 */
5275 	struct SnapSet {
5276 	  snapid_t seq;
5277 	  // NOTE: this is for pre-octopus compatibility only! remove in Q release
5278 	  std::vector<snapid_t> snaps;    // descending
5279 	  std::vector<snapid_t> clones;   // ascending
5280 	  std::map<snapid_t, interval_set<uint64_t> > clone_overlap;  // overlap w/ next newest
5281 	  std::map<snapid_t, uint64_t> clone_size;
5282 	  std::map<snapid_t, std::vector<snapid_t>> clone_snaps; // descending
5283 	
5284 	  SnapSet() : seq(0) {}
5285 	  explicit SnapSet(ceph::buffer::list& bl) {
5286 	    auto p = std::cbegin(bl);
5287 	    decode(p);
5288 	  }
5289 	
5290 	  /// populate SnapSet from a librados::snap_set_t
5291 	  void from_snap_set(const librados::snap_set_t& ss, bool legacy);
5292 	
5293 	  /// get space accounted to clone
5294 	  uint64_t get_clone_bytes(snapid_t clone) const;
5295 	    
5296 	  void encode(ceph::buffer::list& bl) const;
5297 	  void decode(ceph::buffer::list::const_iterator& bl);
5298 	  void dump(ceph::Formatter *f) const;
5299 	  static void generate_test_instances(std::list<SnapSet*>& o);  
5300 	
5301 	  SnapContext get_ssc_as_of(snapid_t as_of) const {
5302 	    SnapContext out;
5303 	    out.seq = as_of;
5304 	    for (auto p = clone_snaps.rbegin();
5305 		 p != clone_snaps.rend();
5306 		 ++p) {
5307 	      for (auto snap : p->second) {
5308 		if (snap <= as_of) {
5309 		  out.snaps.push_back(snap);
5310 		}
5311 	      }
5312 	    }
5313 	    return out;
5314 	  }
5315 	
5316 	
5317 	  SnapSet get_filtered(const pg_pool_t &pinfo) const;
5318 	  void filter(const pg_pool_t &pinfo);
5319 	};
5320 	WRITE_CLASS_ENCODER(SnapSet)
5321 	
5322 	std::ostream& operator<<(std::ostream& out, const SnapSet& cs);
5323 	
5324 	
5325 	
5326 	#define OI_ATTR "_"
5327 	#define SS_ATTR "snapset"
5328 	
5329 	struct watch_info_t {
5330 	  uint64_t cookie;
5331 	  uint32_t timeout_seconds;
5332 	  entity_addr_t addr;
5333 	
5334 	  watch_info_t() : cookie(0), timeout_seconds(0) { }
5335 	  watch_info_t(uint64_t c, uint32_t t, const entity_addr_t& a) : cookie(c), timeout_seconds(t), addr(a) {}
5336 	
5337 	  void encode(ceph::buffer::list& bl, uint64_t features) const;
5338 	  void decode(ceph::buffer::list::const_iterator& bl);
5339 	  void dump(ceph::Formatter *f) const;
5340 	  static void generate_test_instances(std::list<watch_info_t*>& o);
5341 	};
5342 	WRITE_CLASS_ENCODER_FEATURES(watch_info_t)
5343 	
5344 	static inline bool operator==(const watch_info_t& l, const watch_info_t& r) {
5345 	  return l.cookie == r.cookie && l.timeout_seconds == r.timeout_seconds
5346 		    && l.addr == r.addr;
5347 	}
5348 	
5349 	static inline std::ostream& operator<<(std::ostream& out, const watch_info_t& w) {
5350 	  return out << "watch(cookie " << w.cookie << " " << w.timeout_seconds << "s"
5351 	    << " " << w.addr << ")";
5352 	}
5353 	
// In-memory description of a pending notify.
struct notify_info_t {
  uint64_t cookie;     // presumably the watch cookie the notify targets — confirm against callers
  uint64_t notify_id;  // id identifying this notify
  uint32_t timeout;    // timeout in seconds (see operator<< below)
  ceph::buffer::list bl;  // opaque notify payload
};
5360 	
5361 	static inline std::ostream& operator<<(std::ostream& out, const notify_info_t& n) {
5362 	  return out << "notify(cookie " << n.cookie
5363 		     << " notify" << n.notify_id
5364 		     << " " << n.timeout << "s)";
5365 	}
5366 	
5367 	struct chunk_info_t {
5368 	  typedef enum {
5369 	    FLAG_DIRTY = 1, 
5370 	    FLAG_MISSING = 2,
5371 	    FLAG_HAS_REFERENCE = 4,
5372 	    FLAG_HAS_FINGERPRINT = 8,
5373 	  } cflag_t;
5374 	  uint32_t offset;
5375 	  uint32_t length;
5376 	  hobject_t oid;
5377 	  cflag_t flags;   // FLAG_*
5378 	
5379 	  chunk_info_t() : offset(0), length(0), flags((cflag_t)0) { }
5380 	
5381 	  static std::string get_flag_string(uint64_t flags) {
5382 	    std::string r;
5383 	    if (flags & FLAG_DIRTY) {
5384 	      r += "|dirty";
5385 	    }
5386 	    if (flags & FLAG_MISSING) {
5387 	      r += "|missing";
5388 	    }
5389 	    if (flags & FLAG_HAS_REFERENCE) {
5390 	      r += "|has_reference";
5391 	    }
5392 	    if (flags & FLAG_HAS_FINGERPRINT) {
5393 	      r += "|has_fingerprint";
5394 	    }
5395 	    if (r.length())
5396 	      return r.substr(1);
5397 	    return r;
5398 	  }
5399 	  bool test_flag(cflag_t f) const {
5400 	    return (flags & f) == f;
5401 	  }
5402 	  void set_flag(cflag_t f) {
5403 	    flags = (cflag_t)(flags | f);
5404 	  }
5405 	  void set_flags(cflag_t f) {
5406 	    flags = f;
5407 	  }
5408 	  void clear_flag(cflag_t f) {
5409 	    flags = (cflag_t)(flags & ~f);
5410 	  }
5411 	  void clear_flags() {
5412 	    flags = (cflag_t)0;
5413 	  }
5414 	  bool is_dirty() const {
5415 	    return test_flag(FLAG_DIRTY);
5416 	  }
5417 	  bool is_missing() const {
5418 	    return test_flag(FLAG_MISSING);
5419 	  }
5420 	  bool has_reference() const {
5421 	    return test_flag(FLAG_HAS_REFERENCE);
5422 	  }
5423 	  bool has_fingerprint() const {
5424 	    return test_flag(FLAG_HAS_FINGERPRINT);
5425 	  }
5426 	  void encode(ceph::buffer::list &bl) const;
5427 	  void decode(ceph::buffer::list::const_iterator &bl);
5428 	  void dump(ceph::Formatter *f) const;
5429 	  friend std::ostream& operator<<(std::ostream& out, const chunk_info_t& ci);
5430 	};
5431 	WRITE_CLASS_ENCODER(chunk_info_t)
5432 	std::ostream& operator<<(std::ostream& out, const chunk_info_t& ci);
5433 	
5434 	struct object_info_t;
5435 	struct object_manifest_t {
5436 	  enum {
5437 	    TYPE_NONE = 0,
5438 	    TYPE_REDIRECT = 1, 
5439 	    TYPE_CHUNKED = 2, 
5440 	  };
5441 	  uint8_t type;  // redirect, chunked, ...
5442 	  hobject_t redirect_target;
5443 	  std::map<uint64_t, chunk_info_t> chunk_map;
5444 	
5445 	  object_manifest_t() : type(0) { }
5446 	  object_manifest_t(uint8_t type, const hobject_t& redirect_target) 
5447 	    : type(type), redirect_target(redirect_target) { }
5448 	
5449 	  bool is_empty() const {
5450 	    return type == TYPE_NONE;
5451 	  }
5452 	  bool is_redirect() const {
5453 	    return type == TYPE_REDIRECT;
5454 	  }
5455 	  bool is_chunked() const {
5456 	    return type == TYPE_CHUNKED;
5457 	  }
5458 	  static std::string_view get_type_name(uint8_t m) {
5459 	    switch (m) {
5460 	    case TYPE_NONE: return "none";
5461 	    case TYPE_REDIRECT: return "redirect";
5462 	    case TYPE_CHUNKED: return "chunked";
5463 	    default: return "unknown";
5464 	    }
5465 	  }
5466 	  std::string_view get_type_name() const {
5467 	    return get_type_name(type);
5468 	  }
5469 	  void clear() {
5470 	    type = 0;
5471 	    redirect_target = hobject_t();
5472 	    chunk_map.clear();
5473 	  }
5474 	  static void generate_test_instances(std::list<object_manifest_t*>& o);
5475 	  void encode(ceph::buffer::list &bl) const;
5476 	  void decode(ceph::buffer::list::const_iterator &bl);
5477 	  void dump(ceph::Formatter *f) const;
5478 	  friend std::ostream& operator<<(std::ostream& out, const object_info_t& oi);
5479 	};
5480 	WRITE_CLASS_ENCODER(object_manifest_t)
5481 	std::ostream& operator<<(std::ostream& out, const object_manifest_t& oi);
5482 	
// Per-object metadata stored in the object's OI_ATTR xattr.
struct object_info_t {
  hobject_t soid;                        // which object this describes
  eversion_t version, prior_version;     // current and previous pg versions
  version_t user_version;                // user-visible version
  osd_reqid_t last_reqid;                // reqid of the last modifying op

  uint64_t size;
  utime_t mtime;
  utime_t local_mtime; // local mtime

  // note: these are currently encoded into a total 16 bits; see
  // encode()/decode() for the weirdness.
  typedef enum {
    FLAG_LOST        = 1<<0,
    FLAG_WHITEOUT    = 1<<1, // object logically does not exist
    FLAG_DIRTY       = 1<<2, // object has been modified since last flushed or undirtied
    FLAG_OMAP        = 1<<3, // has (or may have) some/any omap data
    FLAG_DATA_DIGEST = 1<<4, // has data crc
    FLAG_OMAP_DIGEST = 1<<5, // has omap crc
    FLAG_CACHE_PIN   = 1<<6, // pin the object in cache tier
    FLAG_MANIFEST    = 1<<7, // has manifest
    FLAG_USES_TMAP   = 1<<8, // deprecated; no longer used
    FLAG_REDIRECT_HAS_REFERENCE = 1<<9, // has reference
  } flag_t;

  flag_t flags;

  // '|'-joined flag names, e.g. "dirty|omap"
  static std::string get_flag_string(flag_t flags) {
    std::string s;
    std::vector<std::string> sv = get_flag_vector(flags);
    // NOTE(review): 'auto ss' copies each string; 'const auto&' would
    // avoid the copies — left unchanged here.
    for (auto ss : sv) {
      s += std::string("|") + ss;
    }
    if (s.length())
      return s.substr(1);
    return s;
  }
  // one name per set flag; the emission order below is part of the
  // human-readable output format (uses_tmap is listed before omap)
  static std::vector<std::string> get_flag_vector(flag_t flags) {
    std::vector<std::string> sv;
    if (flags & FLAG_LOST)
      sv.insert(sv.end(), "lost");
    if (flags & FLAG_WHITEOUT)
      sv.insert(sv.end(), "whiteout");
    if (flags & FLAG_DIRTY)
      sv.insert(sv.end(), "dirty");
    if (flags & FLAG_USES_TMAP)
      sv.insert(sv.end(), "uses_tmap");
    if (flags & FLAG_OMAP)
      sv.insert(sv.end(), "omap");
    if (flags & FLAG_DATA_DIGEST)
      sv.insert(sv.end(), "data_digest");
    if (flags & FLAG_OMAP_DIGEST)
      sv.insert(sv.end(), "omap_digest");
    if (flags & FLAG_CACHE_PIN)
      sv.insert(sv.end(), "cache_pin");
    if (flags & FLAG_MANIFEST)
      sv.insert(sv.end(), "manifest");
    if (flags & FLAG_REDIRECT_HAS_REFERENCE)
      sv.insert(sv.end(), "redirect_has_reference");
    return sv;
  }
  std::string get_flag_string() const {
    return get_flag_string(flags);
  }

  uint64_t truncate_seq, truncate_size;

  // watches keyed by (cookie, watcher entity)
  std::map<std::pair<uint64_t, entity_name_t>, watch_info_t> watchers;

  // opportunistic checksums; may or may not be present
  __u32 data_digest;  ///< data crc32c
  __u32 omap_digest;  ///< omap crc32c
  
  // alloc hint attribute
  uint64_t expected_object_size, expected_write_size;
  uint32_t alloc_hint_flags;

  struct object_manifest_t manifest;  // valid when FLAG_MANIFEST is set

  void copy_user_bits(const object_info_t& other);

  bool test_flag(flag_t f) const {
    return (flags & f) == f;
  }
  void set_flag(flag_t f) {
    flags = (flag_t)(flags | f);
  }
  void clear_flag(flag_t f) {
    flags = (flag_t)(flags & ~f);
  }
  bool is_lost() const {
    return test_flag(FLAG_LOST);
  }
  bool is_whiteout() const {
    return test_flag(FLAG_WHITEOUT);
  }
  bool is_dirty() const {
    return test_flag(FLAG_DIRTY);
  }
  bool is_omap() const {
    return test_flag(FLAG_OMAP);
  }
  bool is_data_digest() const {
    return test_flag(FLAG_DATA_DIGEST);
  }
  bool is_omap_digest() const {
    return test_flag(FLAG_OMAP_DIGEST);
  }
  bool is_cache_pinned() const {
    return test_flag(FLAG_CACHE_PIN);
  }
  bool has_manifest() const {
    return test_flag(FLAG_MANIFEST);
  }
  // setting a digest also raises the corresponding validity flag
  void set_data_digest(__u32 d) {
    set_flag(FLAG_DATA_DIGEST);
    data_digest = d;
  }
  void set_omap_digest(__u32 d) {
    set_flag(FLAG_OMAP_DIGEST);
    omap_digest = d;
  }
  // clearing resets the digest to the -1 "unknown" sentinel
  void clear_data_digest() {
    clear_flag(FLAG_DATA_DIGEST);
    data_digest = -1;
  }
  void clear_omap_digest() {
    clear_flag(FLAG_OMAP_DIGEST);
    omap_digest = -1;
  }
  // a newly created object has no known digests
  void new_object() {
    clear_data_digest();
    clear_omap_digest();
  }

  void encode(ceph::buffer::list& bl, uint64_t features) const;
  void decode(ceph::buffer::list::const_iterator& bl);
  // convenience overload: decode from the start of a whole buffer::list
  void decode(ceph::buffer::list& bl) {
    auto p = std::cbegin(bl);
    decode(p);
  }
  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<object_info_t*>& o);

  explicit object_info_t()
    : user_version(0), size(0), flags((flag_t)0),
      truncate_seq(0), truncate_size(0),
      data_digest(-1), omap_digest(-1),
      expected_object_size(0), expected_write_size(0),
      alloc_hint_flags(0)
  {}

  explicit object_info_t(const hobject_t& s)
    : soid(s),
      user_version(0), size(0), flags((flag_t)0),
      truncate_seq(0), truncate_size(0),
      data_digest(-1), omap_digest(-1),
      expected_object_size(0), expected_write_size(0),
      alloc_hint_flags(0)
  {}

  explicit object_info_t(ceph::buffer::list& bl) {
    decode(bl);
  }
};
5648 	WRITE_CLASS_ENCODER_FEATURES(object_info_t)
5649 	
5650 	std::ostream& operator<<(std::ostream& out, const object_info_t& oi);
5651 	
5652 	
5653 	
5654 	// Object recovery
5655 	struct ObjectRecoveryInfo {
5656 	  hobject_t soid;
5657 	  eversion_t version;
5658 	  uint64_t size;
5659 	  object_info_t oi;
5660 	  SnapSet ss;   // only populated if soid is_snap()
5661 	  interval_set<uint64_t> copy_subset;
5662 	  std::map<hobject_t, interval_set<uint64_t>> clone_subset;
5663 	  bool object_exist;
5664 	
5665 	  ObjectRecoveryInfo() : size(0), object_exist(true) { }
5666 	
5667 	  static void generate_test_instances(std::list<ObjectRecoveryInfo*>& o);
5668 	  void encode(ceph::buffer::list &bl, uint64_t features) const;
5669 	  void decode(ceph::buffer::list::const_iterator &bl, int64_t pool = -1);
5670 	  std::ostream &print(std::ostream &out) const;
5671 	  void dump(ceph::Formatter *f) const;
5672 	};
5673 	WRITE_CLASS_ENCODER_FEATURES(ObjectRecoveryInfo)
5674 	std::ostream& operator<<(std::ostream& out, const ObjectRecoveryInfo &inf);
5675 	
5676 	struct ObjectRecoveryProgress {
5677 	  uint64_t data_recovered_to;
5678 	  std::string omap_recovered_to;
5679 	  bool first;
5680 	  bool data_complete;
5681 	  bool omap_complete;
5682 	  bool error = false;
5683 	
5684 	  ObjectRecoveryProgress()
5685 	    : data_recovered_to(0),
5686 	      first(true),
5687 	      data_complete(false), omap_complete(false) { }
5688 	
5689 	  bool is_complete(const ObjectRecoveryInfo& info) const {
5690 	    return (data_recovered_to >= (
5691 	      info.copy_subset.empty() ?
5692 	      0 : info.copy_subset.range_end())) &&
5693 	      omap_complete;
5694 	  }
5695 	
5696 	  static void generate_test_instances(std::list<ObjectRecoveryProgress*>& o);
5697 	  void encode(ceph::buffer::list &bl) const;
5698 	  void decode(ceph::buffer::list::const_iterator &bl);
5699 	  std::ostream &print(std::ostream &out) const;
5700 	  void dump(ceph::Formatter *f) const;
5701 	};
5702 	WRITE_CLASS_ENCODER(ObjectRecoveryProgress)
5703 	std::ostream& operator<<(std::ostream& out, const ObjectRecoveryProgress &prog);
5704 	
// Acknowledgement for a PushOp; names the object whose push completed.
struct PushReplyOp {
  hobject_t soid;  // object that was pushed

  static void generate_test_instances(std::list<PushReplyOp*>& o);
  void encode(ceph::buffer::list &bl) const;
  void decode(ceph::buffer::list::const_iterator &bl);
  std::ostream &print(std::ostream &out) const;
  void dump(ceph::Formatter *f) const;

  // message cost for op queueing/throttling
  uint64_t cost(CephContext *cct) const;
};
5716 	WRITE_CLASS_ENCODER(PushReplyOp)
5717 	std::ostream& operator<<(std::ostream& out, const PushReplyOp &op);
5718 	
// Request to pull (recover) an object from a peer.
struct PullOp {
  hobject_t soid;  // object to pull

  ObjectRecoveryInfo recovery_info;          // what to recover
  ObjectRecoveryProgress recovery_progress;  // where to resume from

  static void generate_test_instances(std::list<PullOp*>& o);
  void encode(ceph::buffer::list &bl, uint64_t features) const;
  void decode(ceph::buffer::list::const_iterator &bl);
  std::ostream &print(std::ostream &out) const;
  void dump(ceph::Formatter *f) const;

  // message cost for op queueing/throttling
  uint64_t cost(CephContext *cct) const;
};
5733 	WRITE_CLASS_ENCODER_FEATURES(PullOp)
5734 	std::ostream& operator<<(std::ostream& out, const PullOp &op);
5735 	
// One chunk of object recovery pushed to a peer: data ranges, omap
// entries, and attrs, plus progress markers bracketing this chunk.
struct PushOp {
  hobject_t soid;   // object being pushed
  eversion_t version;
  ceph::buffer::list data;                    // payload for data_included ranges
  interval_set<uint64_t> data_included;       // which byte ranges 'data' covers
  ceph::buffer::list omap_header;
  std::map<std::string, ceph::buffer::list> omap_entries;
  std::map<std::string, ceph::buffer::list> attrset;   // xattrs

  ObjectRecoveryInfo recovery_info;
  ObjectRecoveryProgress before_progress;  // progress before this chunk
  ObjectRecoveryProgress after_progress;   // progress after applying it

  static void generate_test_instances(std::list<PushOp*>& o);
  void encode(ceph::buffer::list &bl, uint64_t features) const;
  void decode(ceph::buffer::list::const_iterator &bl);
  std::ostream &print(std::ostream &out) const;
  void dump(ceph::Formatter *f) const;

  // message cost for op queueing/throttling
  uint64_t cost(CephContext *cct) const;
};
5757 	WRITE_CLASS_ENCODER_FEATURES(PushOp)
5758 	std::ostream& operator<<(std::ostream& out, const PushOp &op);
5759 	
5760 	
5761 	/*
5762 	 * summarize pg contents for purposes of a scrub
5763 	 */
5764 	struct ScrubMap {
5765 	  struct object {
5766 	    std::map<std::string, ceph::buffer::ptr> attrs;
5767 	    uint64_t size;
5768 	    __u32 omap_digest;         ///< omap crc32c
5769 	    __u32 digest;              ///< data crc32c
5770 	    bool negative:1;
5771 	    bool digest_present:1;
5772 	    bool omap_digest_present:1;
5773 	    bool read_error:1;
5774 	    bool stat_error:1;
5775 	    bool ec_hash_mismatch:1;
5776 	    bool ec_size_mismatch:1;
5777 	    bool large_omap_object_found:1;
5778 	    uint64_t large_omap_object_key_count = 0;
5779 	    uint64_t large_omap_object_value_size = 0;
5780 	    uint64_t object_omap_bytes = 0;
5781 	    uint64_t object_omap_keys = 0;
5782 	
5783 	    object() :
5784 	      // Init invalid size so it won't match if we get a stat EIO error
5785 	      size(-1), omap_digest(0), digest(0),
5786 	      negative(false), digest_present(false), omap_digest_present(false),
5787 	      read_error(false), stat_error(false), ec_hash_mismatch(false),
5788 	      ec_size_mismatch(false), large_omap_object_found(false) {}
5789 	
5790 	    void encode(ceph::buffer::list& bl) const;
5791 	    void decode(ceph::buffer::list::const_iterator& bl);
5792 	    void dump(ceph::Formatter *f) const;
5793 	    static void generate_test_instances(std::list<object*>& o);
5794 	  };
5795 	  WRITE_CLASS_ENCODER(object)
5796 	
5797 	  std::map<hobject_t,object> objects;
5798 	  eversion_t valid_through;
5799 	  eversion_t incr_since;
5800 	  bool has_large_omap_object_errors:1;
5801 	  bool has_omap_keys:1;
5802 	
5803 	  void merge_incr(const ScrubMap &l);
5804 	  void clear_from(const hobject_t& start) {
5805 	    objects.erase(objects.lower_bound(start), objects.end());
5806 	  }
5807 	  void insert(const ScrubMap &r) {
5808 	    objects.insert(r.objects.begin(), r.objects.end());
5809 	  }
5810 	  void swap(ScrubMap &r) {
5811 	    using std::swap;
5812 	    swap(objects, r.objects);
5813 	    swap(valid_through, r.valid_through);
5814 	    swap(incr_since, r.incr_since);
5815 	  }
5816 	
5817 	  void encode(ceph::buffer::list& bl) const;
5818 	  void decode(ceph::buffer::list::const_iterator& bl, int64_t pool=-1);
5819 	  void dump(ceph::Formatter *f) const;
5820 	  static void generate_test_instances(std::list<ScrubMap*>& o);
5821 	};
5822 	WRITE_CLASS_ENCODER(ScrubMap::object)
5823 	WRITE_CLASS_ENCODER(ScrubMap)
5824 	
5825 	struct ScrubMapBuilder {
5826 	  bool deep = false;
5827 	  std::vector<hobject_t> ls;
5828 	  size_t pos = 0;
5829 	  int64_t data_pos = 0;
5830 	  std::string omap_pos;
5831 	  int ret = 0;
5832 	  ceph::buffer::hash data_hash, omap_hash;  ///< accumulatinng hash value
5833 	  uint64_t omap_keys = 0;
5834 	  uint64_t omap_bytes = 0;
5835 	
5836 	  bool empty() {
5837 	    return ls.empty();
5838 	  }
5839 	  bool done() {
5840 	    return pos >= ls.size();
5841 	  }
5842 	  void reset() {
5843 	    *this = ScrubMapBuilder();
5844 	  }
5845 	
5846 	  bool data_done() {
5847 	    return data_pos < 0;
5848 	  }
5849 	
5850 	  void next_object() {
5851 	    ++pos;
5852 	    data_pos = 0;
5853 	    omap_pos.clear();
5854 	    omap_keys = 0;
5855 	    omap_bytes = 0;
5856 	  }
5857 	
5858 	  friend std::ostream& operator<<(std::ostream& out, const ScrubMapBuilder& pos) {
5859 	    out << "(" << pos.pos << "/" << pos.ls.size();
5860 	    if (pos.pos < pos.ls.size()) {
5861 	      out << " " << pos.ls[pos.pos];
5862 	    }
5863 	    if (pos.data_pos < 0) {
5864 	      out << " byte " << pos.data_pos;
5865 	    }
5866 	    if (!pos.omap_pos.empty()) {
5867 	      out << " key " << pos.omap_pos;
5868 	    }
5869 	    if (pos.deep) {
5870 	      out << " deep";
5871 	    }
5872 	    if (pos.ret) {
5873 	      out << " ret " << pos.ret;
5874 	    }
5875 	    return out << ")";
5876 	  }
5877 	};
5878 	
// One watcher entry as reported by a list-watchers operation.
struct watch_item_t {
  entity_name_t name;        // watching client
  uint64_t cookie;           // client-chosen watch cookie
  uint32_t timeout_seconds;  // watch timeout
  entity_addr_t addr;        // watcher's address (encoding v2+)

  watch_item_t() : cookie(0), timeout_seconds(0) { }
  watch_item_t(entity_name_t name, uint64_t cookie, uint32_t timeout,
     const entity_addr_t& addr)
    : name(name), cookie(cookie), timeout_seconds(timeout),
    addr(addr) { }

  void encode(ceph::buffer::list &bl, uint64_t features) const {
    ENCODE_START(2, 1, bl);
    encode(name, bl);
    encode(cookie, bl);
    encode(timeout_seconds, bl);
    encode(addr, bl, features);
    ENCODE_FINISH(bl);
  }
  void decode(ceph::buffer::list::const_iterator &bl) {
    DECODE_START(2, bl);
    decode(name, bl);
    decode(cookie, bl);
    decode(timeout_seconds, bl);
    // addr was added in encoding version 2; v1 streams leave it default
    if (struct_v >= 2) {
      decode(addr, bl);
    }
    DECODE_FINISH(bl);
  }
  void dump(ceph::Formatter *f) const {
    f->dump_stream("watcher") << name;
    f->dump_int("cookie", cookie);
    f->dump_int("timeout", timeout_seconds);
    f->open_object_section("addr");
    addr.dump(f);
    f->close_section();
  }
  static void generate_test_instances(std::list<watch_item_t*>& o) {
    entity_addr_t ea;
    ea.set_type(entity_addr_t::TYPE_LEGACY);
    ea.set_nonce(1000);
    ea.set_family(AF_INET);
    ea.set_in4_quad(0, 127);
    ea.set_in4_quad(1, 0);
    ea.set_in4_quad(2, 0);
    ea.set_in4_quad(3, 1);
    ea.set_port(1024);
    o.push_back(new watch_item_t(entity_name_t(entity_name_t::TYPE_CLIENT, 1), 10, 30, ea));
    ea.set_nonce(1001);
    ea.set_in4_quad(3, 2);
    ea.set_port(1025);
    o.push_back(new watch_item_t(entity_name_t(entity_name_t::TYPE_CLIENT, 2), 20, 60, ea));
  }
};
5934 	WRITE_CLASS_ENCODER_FEATURES(watch_item_t)
5935 	
// A watcher paired with the object it watches.
struct obj_watch_item_t {
  hobject_t obj;    // watched object
  watch_item_t wi;  // watcher details
};
5940 	
5941 	/**
5942 	 * obj list watch response format
5943 	 *
5944 	 */
5945 	struct obj_list_watch_response_t {
5946 	  std::list<watch_item_t> entries;
5947 	
5948 	  void encode(ceph::buffer::list& bl, uint64_t features) const {
5949 	    ENCODE_START(1, 1, bl);
5950 	    encode(entries, bl, features);
5951 	    ENCODE_FINISH(bl);
5952 	  }
5953 	  void decode(ceph::buffer::list::const_iterator& bl) {
5954 	    DECODE_START(1, bl);
5955 	    decode(entries, bl);
5956 	    DECODE_FINISH(bl);
5957 	  }
5958 	  void dump(ceph::Formatter *f) const {
5959 	    f->open_array_section("entries");
5960 	    for (std::list<watch_item_t>::const_iterator p = entries.begin(); p != entries.end(); ++p) {
5961 	      f->open_object_section("watch");
5962 	      p->dump(f);
5963 	      f->close_section();
5964 	    }
5965 	    f->close_section();
5966 	  }
5967 	  static void generate_test_instances(std::list<obj_list_watch_response_t*>& o) {
5968 	    entity_addr_t ea;
5969 	    o.push_back(new obj_list_watch_response_t);
5970 	    o.push_back(new obj_list_watch_response_t);
5971 	    std::list<watch_item_t*> test_watchers;
5972 	    watch_item_t::generate_test_instances(test_watchers);
5973 	    for (auto &e : test_watchers) {
5974 	      o.back()->entries.push_back(*e);
5975 	      delete e;
5976 	    }
5977 	  }
5978 	};
5979 	WRITE_CLASS_ENCODER_FEATURES(obj_list_watch_response_t)
5980 	
5981 	struct clone_info {
5982 	  snapid_t cloneid;
5983 	  std::vector<snapid_t> snaps;  // ascending
5984 	  std::vector< std::pair<uint64_t,uint64_t> > overlap;
5985 	  uint64_t size;
5986 	
5987 	  clone_info() : cloneid(CEPH_NOSNAP), size(0) {}
5988 	
5989 	  void encode(ceph::buffer::list& bl) const {
5990 	    ENCODE_START(1, 1, bl);
5991 	    encode(cloneid, bl);
5992 	    encode(snaps, bl);
5993 	    encode(overlap, bl);
5994 	    encode(size, bl);
5995 	    ENCODE_FINISH(bl);
5996 	  }
5997 	  void decode(ceph::buffer::list::const_iterator& bl) {
5998 	    DECODE_START(1, bl);
5999 	    decode(cloneid, bl);
6000 	    decode(snaps, bl);
6001 	    decode(overlap, bl);
6002 	    decode(size, bl);
6003 	    DECODE_FINISH(bl);
6004 	  }
6005 	  void dump(ceph::Formatter *f) const {
6006 	    if (cloneid == CEPH_NOSNAP)
6007 	      f->dump_string("cloneid", "HEAD");
6008 	    else
6009 	      f->dump_unsigned("cloneid", cloneid.val);
6010 	    f->open_array_section("snapshots");
6011 	    for (std::vector<snapid_t>::const_iterator p = snaps.begin(); p != snaps.end(); ++p) {
6012 	      f->open_object_section("snap");
6013 	      f->dump_unsigned("id", p->val);
6014 	      f->close_section();
6015 	    }
6016 	    f->close_section();
6017 	    f->open_array_section("overlaps");
6018 	    for (std::vector< std::pair<uint64_t,uint64_t> >::const_iterator q = overlap.begin();
6019 	          q != overlap.end(); ++q) {
6020 	      f->open_object_section("overlap");
6021 	      f->dump_unsigned("offset", q->first);
6022 	      f->dump_unsigned("length", q->second);
6023 	      f->close_section();
6024 	    }
6025 	    f->close_section();
6026 	    f->dump_unsigned("size", size);
6027 	  }
6028 	  static void generate_test_instances(std::list<clone_info*>& o) {
6029 	    o.push_back(new clone_info);
6030 	    o.push_back(new clone_info);
6031 	    o.back()->cloneid = 1;
6032 	    o.back()->snaps.push_back(1);
6033 	    o.back()->overlap.push_back(std::pair<uint64_t,uint64_t>(0,4096));
6034 	    o.back()->overlap.push_back(std::pair<uint64_t,uint64_t>(8192,4096));
6035 	    o.back()->size = 16384;
6036 	    o.push_back(new clone_info);
6037 	    o.back()->cloneid = CEPH_NOSNAP;
6038 	    o.back()->size = 32768;
6039 	  }
6040 	};
6041 	WRITE_CLASS_ENCODER(clone_info)
6042 	
6043 	/**
6044 	 * obj list snaps response format
6045 	 *
6046 	 */
6047 	struct obj_list_snap_response_t {
6048 	  std::vector<clone_info> clones;   // ascending
6049 	  snapid_t seq;
6050 	
6051 	  void encode(ceph::buffer::list& bl) const {
6052 	    ENCODE_START(2, 1, bl);
6053 	    encode(clones, bl);
6054 	    encode(seq, bl);
6055 	    ENCODE_FINISH(bl);
6056 	  }
6057 	  void decode(ceph::buffer::list::const_iterator& bl) {
6058 	    DECODE_START(2, bl);
6059 	    decode(clones, bl);
6060 	    if (struct_v >= 2)
6061 	      decode(seq, bl);
6062 	    else
6063 	      seq = CEPH_NOSNAP;
6064 	    DECODE_FINISH(bl);
6065 	  }
6066 	  void dump(ceph::Formatter *f) const {
6067 	    f->open_array_section("clones");
6068 	    for (std::vector<clone_info>::const_iterator p = clones.begin(); p != clones.end(); ++p) {
6069 	      f->open_object_section("clone");
6070 	      p->dump(f);
6071 	      f->close_section();
6072 	    }
6073 	    f->dump_unsigned("seq", seq);
6074 	    f->close_section();
6075 	  }
6076 	  static void generate_test_instances(std::list<obj_list_snap_response_t*>& o) {
6077 	    o.push_back(new obj_list_snap_response_t);
6078 	    o.push_back(new obj_list_snap_response_t);
6079 	    clone_info cl;
6080 	    cl.cloneid = 1;
6081 	    cl.snaps.push_back(1);
6082 	    cl.overlap.push_back(std::pair<uint64_t,uint64_t>(0,4096));
6083 	    cl.overlap.push_back(std::pair<uint64_t,uint64_t>(8192,4096));
6084 	    cl.size = 16384;
6085 	    o.back()->clones.push_back(cl);
6086 	    cl.cloneid = CEPH_NOSNAP;
6087 	    cl.snaps.clear();
6088 	    cl.overlap.clear();
6089 	    cl.size = 32768;
6090 	    o.back()->clones.push_back(cl);
6091 	    o.back()->seq = 123;
6092 	  }
6093 	};
6094 	
6095 	WRITE_CLASS_ENCODER(obj_list_snap_response_t)
6096 	
// PromoteCounter

/// Running totals for cache-tier promotions: attempts started, objects and
/// bytes promoted.  Counters are atomics so they can be bumped from
/// multiple threads without a lock.
struct PromoteCounter {
  std::atomic<unsigned long long>  attempts{0};
  std::atomic<unsigned long long>  objects{0};
  std::atomic<unsigned long long>  bytes{0};

  /// record that a promotion was started
  void attempt() {
    attempts += 1;
  }

  /// record a completed promotion of `size` bytes
  void finish(uint64_t size) {
    objects += 1;
    bytes += size;
  }

  /// Report the current totals via *a/*o/*b and halve each counter
  /// (a simple decay).  Note: the load and the halved store are separate
  /// atomic operations, so increments landing in between may be lost;
  /// this best-effort sampling matches the original design.
  void sample_and_attenuate(uint64_t *a, uint64_t *o, uint64_t *b) {
    *a = attempts.load();
    *o = objects.load();
    *b = bytes.load();
    attempts.store(*a / 2);
    objects.store(*o / 2);
    bytes.store(*b / 2);
  }
};
6122 	
6123 	struct pool_pg_num_history_t {
6124 	  /// last epoch updated
6125 	  epoch_t epoch = 0;
6126 	  /// poolid -> epoch -> pg_num
6127 	  std::map<int64_t, std::map<epoch_t,uint32_t>> pg_nums;
6128 	  /// pair(epoch, poolid)
6129 	  std::set<std::pair<epoch_t,int64_t>> deleted_pools;
6130 	
6131 	  void log_pg_num_change(epoch_t epoch, int64_t pool, uint32_t pg_num) {
6132 	    pg_nums[pool][epoch] = pg_num;
6133 	  }
6134 	  void log_pool_delete(epoch_t epoch, int64_t pool) {
6135 	    deleted_pools.insert(std::make_pair(epoch, pool));
6136 	  }
6137 	
6138 	  /// prune history based on oldest osdmap epoch in the cluster
6139 	  void prune(epoch_t oldest_epoch) {
6140 	    auto i = deleted_pools.begin();
6141 	    while (i != deleted_pools.end()) {
6142 	      if (i->first >= oldest_epoch) {
6143 		break;
6144 	      }
6145 	      pg_nums.erase(i->second);
6146 	      i = deleted_pools.erase(i);
6147 	    }
6148 	    for (auto& j : pg_nums) {
6149 	      auto k = j.second.lower_bound(oldest_epoch);
6150 	      // keep this and the entry before it (just to be paranoid)
6151 	      if (k != j.second.begin()) {
6152 		--k;
6153 		j.second.erase(j.second.begin(), k);
6154 	      }
6155 	    }
6156 	  }
6157 	
6158 	  void encode(ceph::buffer::list& bl) const {
6159 	    ENCODE_START(1, 1, bl);
6160 	    encode(epoch, bl);
6161 	    encode(pg_nums, bl);
6162 	    encode(deleted_pools, bl);
6163 	    ENCODE_FINISH(bl);
6164 	  }
6165 	  void decode(ceph::buffer::list::const_iterator& p) {
6166 	    DECODE_START(1, p);
6167 	    decode(epoch, p);
6168 	    decode(pg_nums, p);
6169 	    decode(deleted_pools, p);
6170 	    DECODE_FINISH(p);
6171 	  }
6172 	  void dump(ceph::Formatter *f) const {
6173 	    f->dump_unsigned("epoch", epoch);
6174 	    f->open_object_section("pools");
6175 	    for (auto& i : pg_nums) {
6176 	      f->open_object_section("pool");
6177 	      f->dump_unsigned("pool_id", i.first);
6178 	      f->open_array_section("changes");
6179 	      for (auto& j : i.second) {
6180 		f->open_object_section("change");
6181 		f->dump_unsigned("epoch", j.first);
6182 		f->dump_unsigned("pg_num", j.second);
6183 		f->close_section();
6184 	      }
6185 	      f->close_section();
6186 	      f->close_section();
6187 	    }
6188 	    f->close_section();
6189 	    f->open_array_section("deleted_pools");
6190 	    for (auto& i : deleted_pools) {
6191 	      f->open_object_section("deletion");
6192 	      f->dump_unsigned("pool_id", i.second);
6193 	      f->dump_unsigned("epoch", i.first);
6194 	      f->close_section();
6195 	    }
6196 	    f->close_section();
6197 	  }
6198 	  static void generate_test_instances(std::list<pool_pg_num_history_t*>& ls) {
6199 	    ls.push_back(new pool_pg_num_history_t);
6200 	  }
6201 	  friend std::ostream& operator<<(std::ostream& out, const pool_pg_num_history_t& h) {
6202 	    return out << "pg_num_history(e" << h.epoch
6203 		       << " pg_nums " << h.pg_nums
6204 		       << " deleted_pools " << h.deleted_pools
6205 		       << ")";
6206 	  }
6207 	};
6208 	WRITE_CLASS_ENCODER(pool_pg_num_history_t)
6209 	
6210 	// prefix pgmeta_oid keys with _ so that PGLog::read_log_and_missing() can
6211 	// easily skip them
6212 	static const string_view infover_key = "_infover"sv;
6213 	static const string_view info_key = "_info"sv;
6214 	static const string_view biginfo_key = "_biginfo"sv;
6215 	static const string_view epoch_key = "_epoch"sv;
6216 	static const string_view fastinfo_key = "_fastinfo"sv;
6217 	
6218 	static const __u8 pg_latest_struct_v = 10;
6219 	// v10 is the new past_intervals encoding
6220 	// v9 was fastinfo_key addition
6221 	// v8 was the move to a per-pg pgmeta object
6222 	// v7 was SnapMapper addition in 86658392516d5175b2756659ef7ffaaf95b0f8ad
6223 	// (first appeared in cuttlefish).
6224 	static const __u8 pg_compat_struct_v = 10;
6225 	
// Populate *km with the omap key/value pairs that persist a PG's metadata
// for the given epoch, using the *_key names declared above (info, biginfo,
// epoch, fastinfo).  dirty_big_info/dirty_epoch select which keys must be
// regenerated; try_fast_info enables the compact fastinfo encoding.
// last_written_info tracks what was last serialized.  Returns an error code.
// NOTE(review): details inferred from the signature and the key constants
// above -- confirm against the out-of-line definition.
int prepare_info_keymap(
  CephContext* cct,
  map<string,bufferlist> *km,
  epoch_t epoch,
  pg_info_t &info,
  pg_info_t &last_written_info,
  PastIntervals &past_intervals,
  bool dirty_big_info,
  bool dirty_epoch,
  bool try_fast_info,
  PerfCounters *logger = nullptr,   // optional: perf counters to update
  DoutPrefixProvider *dpp = nullptr); // optional: debug-output prefix
6238 	
6239 	namespace ceph::os {
6240 	  class Transaction;
6241 	};
6242 	
6243 	void create_pg_collection(
6244 	  ceph::os::Transaction& t, spg_t pgid, int bits);
6245 	
6246 	void init_pg_ondisk(
6247 	  ceph::os::Transaction& t, spg_t pgid, const pg_pool_t *pool);
6248 	
// omap specific stats
// Zero-initialize all members: the original left them indeterminate for a
// default-constructed instance, which is a latent bug for any caller that
// accumulates into a fresh omap_stat_t.  Still an aggregate, so existing
// brace-initialization keeps working.
struct omap_stat_t {
 int large_omap_objects = 0;  // count of "large omap" objects
 int64_t omap_bytes = 0;      // total omap value bytes
 int64_t omap_keys = 0;       // total omap key count
};
6255 	
// filter for pg listings
// Abstract base: concrete filters decide, per object, whether it appears
// in a pg listing.  Ctor/dtor are defined out of line.
class PGLSFilter {
  CephContext* cct;  // NOTE(review): unused in this header; presumably set by the out-of-line ctor -- confirm
protected:
  std::string xattr;  // xattr name returned by get_xattr(); empty = no xattr needed
public:
  PGLSFilter();
  virtual ~PGLSFilter();
  // Decide whether obj passes the filter.  xattr_data carries the value of
  // the xattr named by get_xattr() (see below); empty if none was requested.
  virtual bool filter(const hobject_t &obj,
                      const ceph::buffer::list& xattr_data) const = 0;

  /**
   * Arguments passed from the RADOS client.  Implementations must
   * handle any encoding errors, and return an appropriate error code,
   * or 0 on valid input.
   */
  virtual int init(ceph::buffer::list::const_iterator &params) = 0;

  /**
   * xattr key, or empty string.  If non-empty, this xattr will be fetched
   * and the value passed into ::filter
   */
  virtual const std::string& get_xattr() const { return xattr; }

  /**
   * If true, objects without the named xattr (if xattr name is not empty)
   * will be rejected without calling ::filter
   */
  virtual bool reject_empty_xattr() const { return true; }
};
6286 	
// PGLSFilter implementation holding a single expected value.
// NOTE(review): semantics inferred from the name and the `val` member --
// presumably matches objects whose xattr equals `val`; confirm against the
// out-of-line definitions of init()/filter().
class PGLSPlainFilter : public PGLSFilter {
  std::string val;  // expected value (presumably compared against xattr_data)
public:
  int init(ceph::bufferlist::const_iterator &params) override;
  ~PGLSPlainFilter() override {}
  bool filter(const hobject_t& obj,
              const ceph::bufferlist& xattr_data) const override;
};
6295 	
6296 	
6297 	#endif
6298