1    	// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2    	// vim: ts=8 sw=2 smarttab
3    	/*
4    	 * Ceph - scalable distributed file system
5    	 *
6    	 * Copyright (C) 2011 New Dream Network
7    	 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8    	 *
9    	 * Author: Loic Dachary <loic@dachary.org>
10   	 *
11   	 * This is free software; you can redistribute it and/or
12   	 * modify it under the terms of the GNU Lesser General Public
13   	 * License version 2.1, as published by the Free Software
14   	 * Foundation.  See file COPYING.
15   	 *
16   	 */
17   	
18   	#include <list>
19   	#include <map>
20   	#include <ostream>
21   	#include <sstream>
22   	#include <set>
23   	#include <string>
24   	#include <utility>
25   	#include <vector>
26   	
27   	
28   	#include <boost/assign/list_of.hpp>
29   	
30   	#include "include/ceph_features.h"
31   	#include "include/encoding.h"
32   	#include "include/stringify.h"
33   	extern "C" {
34   	#include "crush/hash.h"
35   	}
36   	
37   	#include "common/Formatter.h"
38   	#include "OSDMap.h"
39   	#include "osd_types.h"
40   	#include "os/Transaction.h"
41   	
42   	using std::list;
43   	using std::make_pair;
44   	using std::map;
45   	using std::ostream;
46   	using std::ostringstream;
47   	using std::pair;
48   	using std::set;
49   	using std::string;
50   	using std::stringstream;
51   	using std::unique_ptr;
52   	using std::vector;
53   	
54   	using ceph::decode;
55   	using ceph::decode_nohead;
56   	using ceph::encode;
57   	using ceph::encode_nohead;
58   	using ceph::Formatter;
59   	
60   	using namespace std::literals;
61   	
62   	const char *ceph_osd_flag_name(unsigned flag)
63   	{
64   	  switch (flag) {
65   	  case CEPH_OSD_FLAG_ACK: return "ack";
66   	  case CEPH_OSD_FLAG_ONNVRAM: return "onnvram";
67   	  case CEPH_OSD_FLAG_ONDISK: return "ondisk";
68   	  case CEPH_OSD_FLAG_RETRY: return "retry";
69   	  case CEPH_OSD_FLAG_READ: return "read";
70   	  case CEPH_OSD_FLAG_WRITE: return "write";
71   	  case CEPH_OSD_FLAG_ORDERSNAP: return "ordersnap";
72   	  case CEPH_OSD_FLAG_PEERSTAT_OLD: return "peerstat_old";
73   	  case CEPH_OSD_FLAG_BALANCE_READS: return "balance_reads";
74   	  case CEPH_OSD_FLAG_PARALLELEXEC: return "parallelexec";
75   	  case CEPH_OSD_FLAG_PGOP: return "pgop";
76   	  case CEPH_OSD_FLAG_EXEC: return "exec";
77   	  case CEPH_OSD_FLAG_EXEC_PUBLIC: return "exec_public";
78   	  case CEPH_OSD_FLAG_LOCALIZE_READS: return "localize_reads";
79   	  case CEPH_OSD_FLAG_RWORDERED: return "rwordered";
80   	  case CEPH_OSD_FLAG_IGNORE_CACHE: return "ignore_cache";
81   	  case CEPH_OSD_FLAG_SKIPRWLOCKS: return "skiprwlocks";
82   	  case CEPH_OSD_FLAG_IGNORE_OVERLAY: return "ignore_overlay";
83   	  case CEPH_OSD_FLAG_FLUSH: return "flush";
84   	  case CEPH_OSD_FLAG_MAP_SNAP_CLONE: return "map_snap_clone";
85   	  case CEPH_OSD_FLAG_ENFORCE_SNAPC: return "enforce_snapc";
86   	  case CEPH_OSD_FLAG_REDIRECTED: return "redirected";
87   	  case CEPH_OSD_FLAG_KNOWN_REDIR: return "known_if_redirected";
88   	  case CEPH_OSD_FLAG_FULL_TRY: return "full_try";
89   	  case CEPH_OSD_FLAG_FULL_FORCE: return "full_force";
90   	  case CEPH_OSD_FLAG_IGNORE_REDIRECT: return "ignore_redirect";
91   	  case CEPH_OSD_FLAG_RETURNVEC: return "returnvec";
92   	  default: return "???";
93   	  }
94   	}
95   	
96   	string ceph_osd_flag_string(unsigned flags)
97   	{
98   	  string s;
99   	  for (unsigned i=0; i<32; ++i) {
100  	    if (flags & (1u<<i)) {
101  	      if (s.length())
102  		s += "+";
103  	      s += ceph_osd_flag_name(1u << i);
104  	    }
105  	  }
106  	  if (s.length())
107  	    return s;
108  	  return string("-");
109  	}
110  	
111  	const char * ceph_osd_op_flag_name(unsigned flag)
112  	{
113  	  const char *name;
114  	
115  	  switch(flag) {
116  	    case CEPH_OSD_OP_FLAG_EXCL:
117  	      name = "excl";
118  	      break;
119  	    case CEPH_OSD_OP_FLAG_FAILOK:
120  	      name = "failok";
121  	      break;
122  	    case CEPH_OSD_OP_FLAG_FADVISE_RANDOM:
123  	      name = "fadvise_random";
124  	      break;
125  	    case CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL:
126  	      name = "fadvise_sequential";
127  	      break;
128  	    case CEPH_OSD_OP_FLAG_FADVISE_WILLNEED:
129  	      name = "favise_willneed";
130  	      break;
131  	    case CEPH_OSD_OP_FLAG_FADVISE_DONTNEED:
132  	      name = "fadvise_dontneed";
133  	      break;
134  	    case CEPH_OSD_OP_FLAG_FADVISE_NOCACHE:
135  	      name = "fadvise_nocache";
136  	      break;
137  	    case CEPH_OSD_OP_FLAG_WITH_REFERENCE:
138  	      name = "with_reference";
139  	      break;
140  	    case CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE:
141  	      name = "bypass_clean_cache";
142  	      break;
143  	    default:
144  	      name = "???";
145  	  };
146  	
147  	  return name;
148  	}
149  	
150  	string ceph_osd_op_flag_string(unsigned flags)
151  	{
152  	  string s;
153  	  for (unsigned i=0; i<32; ++i) {
154  	    if (flags & (1u<<i)) {
155  	      if (s.length())
156  		s += "+";
157  	      s += ceph_osd_op_flag_name(1u << i);
158  	    }
159  	  }
160  	  if (s.length())
161  	    return s;
162  	  return string("-");
163  	}
164  	
165  	string ceph_osd_alloc_hint_flag_string(unsigned flags)
166  	{
167  	  string s;
168  	  for (unsigned i=0; i<32; ++i) {
169  	    if (flags & (1u<<i)) {
170  	      if (s.length())
171  		s += "+";
172  	      s += ceph_osd_alloc_hint_flag_name(1u << i);
173  	    }
174  	  }
175  	  if (s.length())
176  	    return s;
177  	  return string("-");
178  	}
179  	
// Serialize this pg_shard_t into bl: the osd id followed by the shard id,
// wrapped in the ENCODE_START/ENCODE_FINISH envelope (version 1, compat 1).
// The field order must match pg_shard_t::decode().
void pg_shard_t::encode(ceph::buffer::list &bl) const
{
  ENCODE_START(1, 1, bl);
  encode(osd, bl);
  encode(shard, bl);
  ENCODE_FINISH(bl);
}
// Deserialize a pg_shard_t from bl, reading the osd id and then the shard id
// in the same order pg_shard_t::encode() wrote them.
void pg_shard_t::decode(ceph::buffer::list::const_iterator &bl)
{
  DECODE_START(1, bl);
  decode(osd, bl);
  decode(shard, bl);
  DECODE_FINISH(bl);
}
194  	
195  	ostream &operator<<(ostream &lhs, const pg_shard_t &rhs)
196  	{
197  	  if (rhs.is_undefined())
198  	    return lhs << "?";
199  	  if (rhs.shard == shard_id_t::NO_SHARD)
200  	    return lhs << rhs.get_osd();
201  	  return lhs << rhs.get_osd() << '(' << (unsigned)(rhs.shard) << ')';
202  	}
203  	
204  	void dump(Formatter* f, const osd_alerts_t& alerts)
205  	{
206  	  for (auto& a : alerts) {
207  	    string s0 = " osd: ";
208  	    s0 += stringify(a.first);
209  	    string s;
210  	    for (auto& aa : a.second) {
211  	      s = s0;
212  	      s += " ";
213  	      s += aa.first;
214  	      s += ":";
215  	      s += aa.second;
216  	      f->dump_string("alert", s);
217  	    }
218  	  }
219  	}
220  	
221  	// -- osd_reqid_t --
// Dump the request id through the formatter as three fields, in this order:
// "name" (streamed), "inc" (signed integer) and "tid" (unsigned integer).
void osd_reqid_t::dump(Formatter *f) const
{
  f->dump_stream("name") << name;
  f->dump_int("inc", inc);
  f->dump_unsigned("tid", tid);
}
228  	
229  	void osd_reqid_t::generate_test_instances(list<osd_reqid_t*>& o)
230  	{
231  	  o.push_back(new osd_reqid_t);
232  	  o.push_back(new osd_reqid_t(entity_name_t::CLIENT(123), 1, 45678));
233  	}
234  	
235  	// -- object_locator_t --
236  	
// Serialize the locator into bl as struct version 6.
//
// Wire order: pool, a placeholder "preferred" value of -1, key, nspace, hash.
// The compat version starts at 3 and is raised to 6 when an explicit hash is
// set, then written by ENCODE_FINISH_NEW_COMPAT.
// Asserts that hash and key are not both set.
void object_locator_t::encode(ceph::buffer::list& bl) const
{
  // verify that nobody's corrupted the locator
  ceph_assert(hash == -1 || key.empty());
  __u8 encode_compat = 3;
  ENCODE_START(6, encode_compat, bl);
  encode(pool, bl);
  int32_t preferred = -1;  // tell old code there is no preferred osd (-1).
  encode(preferred, bl);
  encode(key, bl);
  encode(nspace, bl);
  encode(hash, bl);
  if (hash != -1)
    encode_compat = std::max<std::uint8_t>(encode_compat, 6); // need to interpret the hash
  ENCODE_FINISH_NEW_COMPAT(bl, encode_compat);
}
253  	
// Deserialize a locator from p, accepting encodings up to struct version 6.
//
// - struct_v < 2: pool is a 32-bit value and is followed by a 16-bit
//   "preferred" field, which is read and discarded.
// - struct_v >= 2: pool is decoded directly and followed by a 32-bit
//   "preferred" field, which is read and discarded.
// - nspace is only present from struct_v 5, hash only from struct_v 6;
//   without a hash field the hash is set to -1.
// Asserts after decoding that hash and key are not both set.
void object_locator_t::decode(ceph::buffer::list::const_iterator& p)
{
  DECODE_START_LEGACY_COMPAT_LEN(6, 3, 3, p);
  if (struct_v < 2) {
    int32_t op;
    decode(op, p);
    pool = op;
    int16_t pref;
    decode(pref, p);
  } else {
    decode(pool, p);
    int32_t preferred;
    decode(preferred, p);
  }
  decode(key, p);
  if (struct_v >= 5)
    decode(nspace, p);
  if (struct_v >= 6)
    decode(hash, p);
  else
    hash = -1;
  DECODE_FINISH(p);
  // verify that nobody's corrupted the locator
  ceph_assert(hash == -1 || key.empty());
}
279  	
// Dump the locator through the formatter as four fields, in this order:
// "pool", "key", "namespace" and "hash".
void object_locator_t::dump(Formatter *f) const
{
  f->dump_int("pool", pool);
  f->dump_string("key", key);
  f->dump_string("namespace", nspace);
  f->dump_int("hash", hash);
}
287  	
288  	void object_locator_t::generate_test_instances(list<object_locator_t*>& o)
289  	{
290  	  o.push_back(new object_locator_t);
291  	  o.push_back(new object_locator_t(123));
292  	  o.push_back(new object_locator_t(123, 876));
293  	  o.push_back(new object_locator_t(1, "n2"));
294  	  o.push_back(new object_locator_t(1234, "", "key"));
295  	  o.push_back(new object_locator_t(12, "n1", "key2"));
296  	}
297  	
298  	// -- request_redirect_t --
// Serialize the redirect into bl (version 1, compat 1): the redirect locator,
// the redirect object name, then a zero 32-bit length standing in for the
// removed osd_instructions member.
void request_redirect_t::encode(ceph::buffer::list& bl) const
{
  ENCODE_START(1, 1, bl);
  encode(redirect_locator, bl);
  encode(redirect_object, bl);
  // legacy of the removed osd_instructions member
  encode((uint32_t)0, bl);
  ENCODE_FINISH(bl);
}
308  	
// Deserialize a redirect from bl: the locator, the object name, then the
// length of the legacy osd_instructions payload. A non-zero length means the
// payload is present; it is skipped rather than decoded.
void request_redirect_t::decode(ceph::buffer::list::const_iterator& bl)
{
  DECODE_START(1, bl);
  uint32_t legacy_osd_instructions_len;
  decode(redirect_locator, bl);
  decode(redirect_object, bl);
  decode(legacy_osd_instructions_len, bl);
  if (legacy_osd_instructions_len) {
    // skip over the legacy payload without interpreting it
    bl.advance(legacy_osd_instructions_len);
  }
  DECODE_FINISH(bl);
}
321  	
// Dump the redirect through the formatter: the target object name as
// "object", followed by a nested "locator" object section holding the
// redirect locator's own dump.
void request_redirect_t::dump(Formatter *f) const
{
  f->dump_string("object", redirect_object);
  f->open_object_section("locator");
  redirect_locator.dump(f);
  f->close_section(); // locator
}
329  	
330  	void request_redirect_t::generate_test_instances(list<request_redirect_t*>& o)
331  	{
332  	  object_locator_t loc(1, "redir_obj");
333  	  o.push_back(new request_redirect_t());
334  	  o.push_back(new request_redirect_t(loc, 0));
335  	  o.push_back(new request_redirect_t(loc, "redir_obj"));
336  	  o.push_back(new request_redirect_t(loc));
337  	}
338  	
339  	void objectstore_perf_stat_t::dump(Formatter *f) const
340  	{
341  	  // *_ms values just for compatibility.
342  	  f->dump_float("commit_latency_ms", os_commit_latency_ns / 1000000.0);
343  	  f->dump_float("apply_latency_ms", os_apply_latency_ns / 1000000.0);
344  	  f->dump_unsigned("commit_latency_ns", os_commit_latency_ns);
345  	  f->dump_unsigned("apply_latency_ns", os_apply_latency_ns);
346  	}
347  	
348  	void objectstore_perf_stat_t::encode(ceph::buffer::list &bl, uint64_t features) const
349  	{
350  	  uint8_t target_v = 2;
351  	  if (!HAVE_FEATURE(features, OS_PERF_STAT_NS)) {
352  	    target_v = 1;
353  	  }
354  	  ENCODE_START(target_v, target_v, bl);
355  	  if (target_v >= 2) {
356  	    encode(os_commit_latency_ns, bl);
357  	    encode(os_apply_latency_ns, bl);
358  	  } else {
359  	    constexpr auto NS_PER_MS = std::chrono::nanoseconds(1ms).count();
360  	    uint32_t commit_latency_ms = os_commit_latency_ns / NS_PER_MS;
361  	    uint32_t apply_latency_ms = os_apply_latency_ns / NS_PER_MS;
362  	    encode(commit_latency_ms, bl); // for compatibility with older monitor.
363  	    encode(apply_latency_ms, bl); // for compatibility with older monitor.
364  	  }
365  	  ENCODE_FINISH(bl);
366  	}
367  	
368  	void objectstore_perf_stat_t::decode(ceph::buffer::list::const_iterator &bl)
369  	{
370  	  DECODE_START(2, bl);
371  	  if (struct_v >= 2) {
372  	    decode(os_commit_latency_ns, bl);
373  	    decode(os_apply_latency_ns, bl);
374  	  } else {
375  	    uint32_t commit_latency_ms;
376  	    uint32_t apply_latency_ms;
377  	    decode(commit_latency_ms, bl);
378  	    decode(apply_latency_ms, bl);
379  	    constexpr auto NS_PER_MS = std::chrono::nanoseconds(1ms).count();
380  	    os_commit_latency_ns = commit_latency_ms * NS_PER_MS;
381  	    os_apply_latency_ns = apply_latency_ms * NS_PER_MS;
382  	  }
383  	  DECODE_FINISH(bl);
384  	}
385  	
386  	void objectstore_perf_stat_t::generate_test_instances(std::list<objectstore_perf_stat_t*>& o)
387  	{
388  	  o.push_back(new objectstore_perf_stat_t());
389  	  o.push_back(new objectstore_perf_stat_t());
390  	  o.back()->os_commit_latency_ns = 20000000;
391  	  o.back()->os_apply_latency_ns = 30000000;
392  	}
393  	
394  	// -- osd_stat_t --
395  	void osd_stat_t::dump(Formatter *f) const
396  	{
397  	  f->dump_unsigned("up_from", up_from);
398  	  f->dump_unsigned("seq", seq);
399  	  f->dump_unsigned("num_pgs", num_pgs);
400  	  f->dump_unsigned("num_osds", num_osds);
401  	  f->dump_unsigned("num_per_pool_osds", num_per_pool_osds);
402  	  f->dump_unsigned("num_per_pool_omap_osds", num_per_pool_omap_osds);
403  	
404  	  /// dump legacy stats fields to ensure backward compatibility.
405  	  f->dump_unsigned("kb", statfs.kb());
406  	  f->dump_unsigned("kb_used", statfs.kb_used_raw());
407  	  f->dump_unsigned("kb_used_data", statfs.kb_used_data());
408  	  f->dump_unsigned("kb_used_omap", statfs.kb_used_omap());
409  	  f->dump_unsigned("kb_used_meta", statfs.kb_used_internal_metadata());
410  	  f->dump_unsigned("kb_avail", statfs.kb_avail());
411  	  ////////////////////
412  	
413  	  f->open_object_section("statfs");
414  	  statfs.dump(f);
415  	  f->close_section();
416  	  f->open_array_section("hb_peers");
417  	  for (auto p : hb_peers)
418  	    f->dump_int("osd", p);
419  	  f->close_section();
420  	  f->dump_int("snap_trim_queue_len", snap_trim_queue_len);
421  	  f->dump_int("num_snap_trimming", num_snap_trimming);
422  	  f->dump_int("num_shards_repaired", num_shards_repaired);
423  	  f->open_object_section("op_queue_age_hist");
424  	  op_queue_age_hist.dump(f);
425  	  f->close_section();
426  	  f->open_object_section("perf_stat");
427  	  os_perf_stat.dump(f);
428  	  f->close_section();
429  	  f->open_array_section("alerts");
430  	  ::dump(f, os_alerts);
431  	  f->close_section();
432  	  f->open_array_section("network_ping_times");
433  	  for (auto &i : hb_pingtime) {
434  	    f->open_object_section("entry");
435  	    f->dump_int("osd", i.first);
436  	    const time_t lu(i.second.last_update);
437  	    char buffer[26];
438  	    string lustr(ctime_r(&lu, buffer));
439  	    lustr.pop_back();   // Remove trailing \n
440  	    f->dump_string("last update", lustr);
441  	    f->open_array_section("interfaces");
442  	    f->open_object_section("interface");
443  	    f->dump_string("interface", "back");
444  	    f->open_object_section("average");
445  	    f->dump_format_unquoted("1min", "%s", fixed_u_to_string(i.second.back_pingtime[0],3).c_str());
446  	    f->dump_format_unquoted("5min", "%s", fixed_u_to_string(i.second.back_pingtime[1],3).c_str());
447  	    f->dump_format_unquoted("15min", "%s", fixed_u_to_string(i.second.back_pingtime[2],3).c_str());
448  	    f->close_section(); // average
449  	    f->open_object_section("min");
450  	    f->dump_format_unquoted("1min", "%s", fixed_u_to_string(i.second.back_min[0],3).c_str());
451  	    f->dump_format_unquoted("5min", "%s", fixed_u_to_string(i.second.back_min[1],3).c_str());
452  	    f->dump_format_unquoted("15min", "%s", fixed_u_to_string(i.second.back_min[2],3).c_str());
453  	    f->close_section(); // min
454  	    f->open_object_section("max");
455  	    f->dump_format_unquoted("1min", "%s", fixed_u_to_string(i.second.back_max[0],3).c_str());
456  	    f->dump_format_unquoted("5min", "%s", fixed_u_to_string(i.second.back_max[1],3).c_str());
457  	    f->dump_format_unquoted("15min", "%s", fixed_u_to_string(i.second.back_max[2],3).c_str());
458  	    f->close_section(); // max
459  	    f->dump_format_unquoted("last", "%s", fixed_u_to_string(i.second.back_last,3).c_str());
460  	    f->close_section(); // interface
461  	
462  	    if (i.second.front_pingtime[0] != 0) {
463  	      f->open_object_section("interface");
464  	      f->dump_string("interface", "front");
465  	      f->open_object_section("average");
466  	      f->dump_format_unquoted("1min", "%s", fixed_u_to_string(i.second.front_pingtime[0],3).c_str());
467  	      f->dump_format_unquoted("5min", "%s", fixed_u_to_string(i.second.front_pingtime[1],3).c_str());
468  	      f->dump_format_unquoted("15min", "%s", fixed_u_to_string(i.second.front_pingtime[2],3).c_str());
469  	      f->close_section(); // average
470  	      f->open_object_section("min");
471  	      f->dump_format_unquoted("1min", "%s", fixed_u_to_string(i.second.front_min[0],3).c_str());
472  	      f->dump_format_unquoted("5min", "%s", fixed_u_to_string(i.second.front_min[1],3).c_str());
473  	      f->dump_format_unquoted("15min", "%s", fixed_u_to_string(i.second.front_min[2],3).c_str());
474  	      f->close_section(); // min
475  	      f->open_object_section("max");
476  	      f->dump_format_unquoted("1min", "%s", fixed_u_to_string(i.second.front_max[0],3).c_str());
477  	      f->dump_format_unquoted("5min", "%s", fixed_u_to_string(i.second.front_max[1],3).c_str());
478  	      f->dump_format_unquoted("15min", "%s", fixed_u_to_string(i.second.front_max[2],3).c_str());
479  	      f->close_section(); // max
480  	      f->dump_format_unquoted("last", "%s", fixed_u_to_string(i.second.front_last,3).c_str());
481  	      f->close_section(); // interface
482  	    }
483  	    f->close_section(); // interfaces
484  	    f->close_section(); // entry
485  	  }
486  	  f->close_section(); // network_ping_time
487  	}
488  	
489  	void osd_stat_t::encode(ceph::buffer::list &bl, uint64_t features) const
490  	{
491  	  ENCODE_START(14, 2, bl);
492  	
493  	  //////// for compatibility ////////
494  	  int64_t kb = statfs.kb();
495  	  int64_t kb_used = statfs.kb_used_raw();
496  	  int64_t kb_avail = statfs.kb_avail();
497  	  encode(kb, bl);
498  	  encode(kb_used, bl);
499  	  encode(kb_avail, bl);
500  	  ///////////////////////////////////
501  	
502  	  encode(snap_trim_queue_len, bl);
503  	  encode(num_snap_trimming, bl);
504  	  encode(hb_peers, bl);
505  	  encode((uint32_t)0, bl);
506  	  encode(op_queue_age_hist, bl);
507  	  encode(os_perf_stat, bl, features);
508  	  encode(up_from, bl);
509  	  encode(seq, bl);
510  	  encode(num_pgs, bl);
511  	
512  	  //////// for compatibility ////////
513  	  int64_t kb_used_data = statfs.kb_used_data();
514  	  int64_t kb_used_omap = statfs.kb_used_omap();
515  	  int64_t kb_used_meta = statfs.kb_used_internal_metadata();
516  	  encode(kb_used_data, bl);
517  	  encode(kb_used_omap, bl);
518  	  encode(kb_used_meta, bl);
519  	  encode(statfs, bl);
520  	  ///////////////////////////////////
521  	  encode(os_alerts, bl);
522  	  encode(num_shards_repaired, bl);
523  	  encode(num_osds, bl);
524  	  encode(num_per_pool_osds, bl);
525  	  encode(num_per_pool_omap_osds, bl);
526  	
527  	  // hb_pingtime map
528  	  encode((int)hb_pingtime.size(), bl);
529  	  for (auto i : hb_pingtime) {
530  	    encode(i.first, bl); // osd
531  	    encode(i.second.last_update, bl);
532  	    encode(i.second.back_pingtime[0], bl);
533  	    encode(i.second.back_pingtime[1], bl);
534  	    encode(i.second.back_pingtime[2], bl);
535  	    encode(i.second.back_min[0], bl);
536  	    encode(i.second.back_min[1], bl);
537  	    encode(i.second.back_min[2], bl);
538  	    encode(i.second.back_max[0], bl);
539  	    encode(i.second.back_max[1], bl);
540  	    encode(i.second.back_max[2], bl);
541  	    encode(i.second.back_last, bl);
542  	    encode(i.second.front_pingtime[0], bl);
543  	    encode(i.second.front_pingtime[1], bl);
544  	    encode(i.second.front_pingtime[2], bl);
545  	    encode(i.second.front_min[0], bl);
546  	    encode(i.second.front_min[1], bl);
547  	    encode(i.second.front_min[2], bl);
548  	    encode(i.second.front_max[0], bl);
549  	    encode(i.second.front_max[1], bl);
550  	    encode(i.second.front_max[2], bl);
551  	    encode(i.second.front_last, bl);
552  	  }
553  	  ENCODE_FINISH(bl);
554  	}
555  	
// Deserialize OSD statistics from bl, accepting struct versions up to 14.
//
// Fields are read in the order osd_stat_t::encode() writes them. Each group
// added in a later version is gated on struct_v; when a group is absent the
// corresponding members are reset (or, for statfs, reconstructed from the
// legacy kb_* values).
void osd_stat_t::decode(ceph::buffer::list::const_iterator &bl)
{
  // Legacy values; only used to rebuild statfs for struct_v < 9.
  int64_t kb, kb_used,kb_avail;
  int64_t kb_used_data, kb_used_omap, kb_used_meta;
  DECODE_START_LEGACY_COMPAT_LEN(14, 2, 2, bl);
  decode(kb, bl);
  decode(kb_used, bl);
  decode(kb_avail, bl);
  decode(snap_trim_queue_len, bl);
  decode(num_snap_trimming, bl);
  decode(hb_peers, bl);
  // Read and discarded; encode() writes a zero length in this position.
  vector<int> num_hb_out;
  decode(num_hb_out, bl);
  if (struct_v >= 3)
    decode(op_queue_age_hist, bl);
  if (struct_v >= 4)
    decode(os_perf_stat, bl);
  if (struct_v >= 6) {
    decode(up_from, bl);
    decode(seq, bl);
  }
  if (struct_v >= 7) {
    decode(num_pgs, bl);
  }
  if (struct_v >= 8) {
    decode(kb_used_data, bl);
    decode(kb_used_omap, bl);
    decode(kb_used_meta, bl);
  } else {
    // No breakdown available: attribute all usage to data.
    kb_used_data = kb_used;
    kb_used_omap = 0;
    kb_used_meta = 0;
  }
  if (struct_v >= 9) {
    decode(statfs, bl);
  } else {
    // Rebuild statfs from the legacy values, converting KiB to bytes.
    statfs.reset();
    statfs.total = kb << 10;
    statfs.available = kb_avail << 10;
    // actually it's totally unexpected to have statfs.total < statfs.available
    // here but unfortunately legacy generate_test_instances produced such a
    // case hence inserting some handling rather than assert
    statfs.internally_reserved =
      statfs.total > statfs.available ? statfs.total - statfs.available : 0;
    kb_used <<= 10;
    // internally_reserved = (total - available) - used, clamped at zero
    if ((int64_t)statfs.internally_reserved > kb_used) {
      statfs.internally_reserved -= kb_used;
    } else {
      statfs.internally_reserved = 0;
    }
    statfs.allocated = kb_used_data << 10;
    statfs.omap_allocated = kb_used_omap << 10;
    statfs.internal_metadata = kb_used_meta << 10;
  }
  if (struct_v >= 10) {
    decode(os_alerts, bl);
  } else {
    os_alerts.clear();
  }
  if (struct_v >= 11) {
    decode(num_shards_repaired, bl);
  } else {
    num_shards_repaired = 0;
  }
  if (struct_v >= 12) {
    decode(num_osds, bl);
    decode(num_per_pool_osds, bl);
  } else {
    num_osds = 0;
    num_per_pool_osds = 0;
  }
  if (struct_v >= 13) {
    decode(num_per_pool_omap_osds, bl);
  } else {
    num_per_pool_omap_osds = 0;
  }
  // hb_pingtime map: entry count, then each entry field by field.
  hb_pingtime.clear();
  if (struct_v >= 14) {
    int count;
    decode(count, bl);
    for (int i = 0 ; i < count ; i++) {
      int osd;
      decode(osd, bl);
      struct Interfaces ifs;
      decode(ifs.last_update, bl);
      decode(ifs.back_pingtime[0],bl);
      decode(ifs.back_pingtime[1], bl);
      decode(ifs.back_pingtime[2], bl);
      decode(ifs.back_min[0],bl);
      decode(ifs.back_min[1], bl);
      decode(ifs.back_min[2], bl);
      decode(ifs.back_max[0],bl);
      decode(ifs.back_max[1], bl);
      decode(ifs.back_max[2], bl);
      decode(ifs.back_last, bl);
      decode(ifs.front_pingtime[0], bl);
      decode(ifs.front_pingtime[1], bl);
      decode(ifs.front_pingtime[2], bl);
      decode(ifs.front_min[0], bl);
      decode(ifs.front_min[1], bl);
      decode(ifs.front_min[2], bl);
      decode(ifs.front_max[0], bl);
      decode(ifs.front_max[1], bl);
      decode(ifs.front_max[2], bl);
      decode(ifs.front_last, bl);
      hb_pingtime[osd] = ifs;
    }
  }
  DECODE_FINISH(bl);
}
666  	
667  	void osd_stat_t::generate_test_instances(std::list<osd_stat_t*>& o)
668  	{
669  	  o.push_back(new osd_stat_t);
670  	
671  	  o.push_back(new osd_stat_t);
672  	  list<store_statfs_t*> ll;
673  	  store_statfs_t::generate_test_instances(ll);
674  	  o.back()->statfs = *ll.back();
675  	  o.back()->hb_peers.push_back(7);
676  	  o.back()->snap_trim_queue_len = 8;
677  	  o.back()->num_snap_trimming = 99;
678  	  o.back()->num_shards_repaired = 101;
679  	  o.back()->os_alerts[0].emplace(
680  	    "some alert", "some alert details");
681  	  o.back()->os_alerts[1].emplace(
682  	    "some alert2", "some alert2 details");
683  	  struct Interfaces gen_interfaces = {
684  		123456789, { 1000, 900, 800 }, { 990, 890, 790 }, { 1010, 910, 810 }, 1001,
685  		 { 1100, 1000, 900 }, { 1090, 990, 890 }, { 1110, 1010, 910 }, 1101 };
686  	  o.back()->hb_pingtime[20] = gen_interfaces;
687  	  gen_interfaces = {
688  		987654321, { 100, 200, 300 }, { 90, 190, 290 }, { 110, 210, 310 }, 101 };
689  	  o.back()->hb_pingtime[30] = gen_interfaces;
690  	}
691  	
692  	// -- pg_t --
693  	
694  	int pg_t::print(char *o, int maxlen) const
695  	{
696  	  return snprintf(o, maxlen, "%llu.%x", (unsigned long long)pool(), ps());
697  	}
698  	
699  	bool pg_t::parse(const char *s)
700  	{
701  	  uint64_t ppool;
702  	  uint32_t pseed;
703  	  int r = sscanf(s, "%llu.%x", (long long unsigned *)&ppool, &pseed);
704  	  if (r < 2)
705  	    return false;
706  	  m_pool = ppool;
707  	  m_seed = pseed;
708  	  return true;
709  	}
710  	
711  	bool spg_t::parse(const char *s)
712  	{
713  	  shard = shard_id_t::NO_SHARD;
714  	  uint64_t ppool;
715  	  uint32_t pseed;
716  	  uint32_t pshard;
717  	  int r = sscanf(s, "%llu.%x", (long long unsigned *)&ppool, &pseed);
718  	  if (r < 2)
719  	    return false;
720  	  pgid.set_pool(ppool);
721  	  pgid.set_ps(pseed);
722  	
723  	  const char *p = strchr(s, 's');
724  	  if (p) {
725  	    r = sscanf(p, "s%u", &pshard);
726  	    if (r == 1) {
727  	      shard = shard_id_t(pshard);
728  	    } else {
729  	      return false;
730  	    }
731  	  }
732  	  return true;
733  	}
734  	
735  	char *spg_t::calc_name(char *buf, const char *suffix_backwords) const
736  	{
737  	  while (*suffix_backwords)
738  	    *--buf = *suffix_backwords++;
739  	
740  	  if (!is_no_shard()) {
741  	    buf = ritoa<uint8_t, 10>((uint8_t)shard.id, buf);
742  	    *--buf = 's';
743  	  }
744  	
745  	  return pgid.calc_name(buf, "");
746  	}
747  	
748  	ostream& operator<<(ostream& out, const spg_t &pg)
749  	{
750  	  char buf[spg_t::calc_name_buf_size];
751  	  buf[spg_t::calc_name_buf_size - 1] = '\0';
752  	  out << pg.calc_name(buf + spg_t::calc_name_buf_size - 1, "");
753  	  return out;
754  	}
755  	
756  	pg_t pg_t::get_ancestor(unsigned old_pg_num) const
757  	{
758  	  int old_bits = cbits(old_pg_num);
759  	  int old_mask = (1 << old_bits) - 1;
760  	  pg_t ret = *this;
761  	  ret.m_seed = ceph_stable_mod(m_seed, old_pg_num, old_mask);
762  	  return ret;
763  	}
764  	
765  	bool pg_t::is_split(unsigned old_pg_num, unsigned new_pg_num, set<pg_t> *children) const
766  	{
767  	  //ceph_assert(m_seed < old_pg_num);
768  	  if (m_seed >= old_pg_num) {
769  	    // degenerate case
770  	    return false;
771  	  }
772  	  if (new_pg_num <= old_pg_num)
773  	    return false;
774  	
775  	  bool split = false;
776  	  if (true) {
777  	    unsigned old_bits = cbits(old_pg_num);
778  	    unsigned old_mask = (1 << old_bits) - 1;
779  	    for (unsigned n = 1; ; n++) {
780  	      unsigned next_bit = (n << (old_bits-1));
781  	      unsigned s = next_bit | m_seed;
782  	
783  	      if (s < old_pg_num || s == m_seed)
784  		continue;
785  	      if (s >= new_pg_num)
786  		break;
787  	      if ((unsigned)ceph_stable_mod(s, old_pg_num, old_mask) == m_seed) {
788  		split = true;
789  		if (children)
790  		  children->insert(pg_t(s, m_pool));
791  	      }
792  	    }
793  	  }
794  	  if (false) {
795  	    // brute force
796  	    int old_bits = cbits(old_pg_num);
797  	    int old_mask = (1 << old_bits) - 1;
798  	    for (unsigned x = old_pg_num; x < new_pg_num; ++x) {
799  	      unsigned o = ceph_stable_mod(x, old_pg_num, old_mask);
800  	      if (o == m_seed) {
801  		split = true;
802  		children->insert(pg_t(x, m_pool));
803  	      }
804  	    }
805  	  }
806  	  return split;
807  	}
808  	
809  	unsigned pg_t::get_split_bits(unsigned pg_num) const {
810  	  if (pg_num == 1)
811  	    return 0;
812  	  ceph_assert(pg_num > 1);
813  	
814  	  // Find unique p such that pg_num \in [2^(p-1), 2^p)
815  	  unsigned p = cbits(pg_num);
816  	  ceph_assert(p); // silence coverity #751330 
817  	
818  	  if ((m_seed % (1<<(p-1))) < (pg_num % (1<<(p-1))))
819  	    return p;
820  	  else
821  	    return p - 1;
822  	}
823  	
824  	bool pg_t::is_merge_source(
825  	  unsigned old_pg_num,
826  	  unsigned new_pg_num,
827  	  pg_t *parent) const
828  	{
829  	  if (m_seed < old_pg_num &&
830  	      m_seed >= new_pg_num) {
831  	    if (parent) {
832  	      pg_t t = *this;
833  	      while (t.m_seed >= new_pg_num) {
834  		t = t.get_parent();
835  	      }
836  	      *parent = t;
837  	    }
838  	    return true;
839  	  }
840  	  return false;
841  	}
842  	
843  	pg_t pg_t::get_parent() const
844  	{
845  	  unsigned bits = cbits(m_seed);
846  	  ceph_assert(bits);
847  	  pg_t retval = *this;
848  	  retval.m_seed &= ~((~0)<<(bits - 1));
849  	  return retval;
850  	}
851  	
// Build the hobject_t that marks the start of this pg: a default object_t,
// empty strings for both string arguments, 0 as the third argument, and this
// pg's seed and pool.
hobject_t pg_t::get_hobj_start() const
{
  return hobject_t(object_t(), string(), 0, m_seed, m_pool,
		   string());
}
857  	
858  	hobject_t pg_t::get_hobj_end(unsigned pg_num) const
859  	{
860  	  // note: this assumes a bitwise sort; with the legacy nibblewise
861  	  // sort a PG did not always cover a single contiguous range of the
862  	  // (bit-reversed) hash range.
863  	  unsigned bits = get_split_bits(pg_num);
864  	  uint64_t rev_start = hobject_t::_reverse_bits(m_seed);
865  	  uint64_t rev_end = (rev_start | (0xffffffff >> bits)) + 1;
866  	  if (rev_end >= 0x100000000) {
867  	    ceph_assert(rev_end == 0x100000000);
868  	    return hobject_t::get_max();
869  	  } else {
870  	    return hobject_t(object_t(), string(), CEPH_NOSNAP,
871  			   hobject_t::_reverse_bits(rev_end), m_pool,
872  			   string());
873  	  }
874  	}
875  	
// Dump the pg id through the formatter as two unsigned fields:
// "pool" and "seed".
void pg_t::dump(Formatter *f) const
{
  f->dump_unsigned("pool", m_pool);
  f->dump_unsigned("seed", m_seed);
}
881  	
882  	void pg_t::generate_test_instances(list<pg_t*>& o)
883  	{
884  	  o.push_back(new pg_t);
885  	  o.push_back(new pg_t(1, 2));
886  	  o.push_back(new pg_t(13123, 3));
887  	  o.push_back(new pg_t(131223, 4));
888  	}
889  	
890  	char *pg_t::calc_name(char *buf, const char *suffix_backwords) const
891  	{
892  	  while (*suffix_backwords)
893  	    *--buf = *suffix_backwords++;
894  	
895  	  buf = ritoa<uint32_t, 16>(m_seed, buf);
896  	
897  	  *--buf = '.';
898  	
899  	  return  ritoa<uint64_t, 10>(m_pool, buf);
900  	}
901  	
902  	ostream& operator<<(ostream& out, const pg_t &pg)
903  	{
904  	  char buf[pg_t::calc_name_buf_size];
905  	  buf[pg_t::calc_name_buf_size - 1] = '\0';
906  	  out << pg.calc_name(buf + pg_t::calc_name_buf_size - 1, "");
907  	  return out;
908  	}
909  	
910  	
911  	// -- coll_t --
912  	
913  	void coll_t::calc_str()
914  	{
915  	  switch (type) {
916  	  case TYPE_META:
917  	    strcpy(_str_buff, "meta");
918  	    _str = _str_buff;
919  	    break;
920  	  case TYPE_PG:
921  	    _str_buff[spg_t::calc_name_buf_size - 1] = '\0';
922  	    _str = pgid.calc_name(_str_buff + spg_t::calc_name_buf_size - 1, "daeh_");
923  	    break;
924  	  case TYPE_PG_TEMP:
925  	    _str_buff[spg_t::calc_name_buf_size - 1] = '\0';
926  	    _str = pgid.calc_name(_str_buff + spg_t::calc_name_buf_size - 1, "PMET_");
927  	    break;
928  	  default:
929  	    ceph_abort_msg("unknown collection type");
930  	  }
931  	}
932  	
933  	bool coll_t::parse(const std::string& s)
934  	{
935  	  if (s == "meta") {
936  	    type = TYPE_META;
937  	    pgid = spg_t();
938  	    removal_seq = 0;
939  	    calc_str();
940  	    ceph_assert(s == _str);
941  	    return true;
942  	  }
943  	  if (s.find("_head") == s.length() - 5 &&
944  	      pgid.parse(s.substr(0, s.length() - 5))) {
945  	    type = TYPE_PG;
946  	    removal_seq = 0;
947  	    calc_str();
948  	    ceph_assert(s == _str);
949  	    return true;
950  	  }
951  	  if (s.find("_TEMP") == s.length() - 5 &&
952  	      pgid.parse(s.substr(0, s.length() - 5))) {
953  	    type = TYPE_PG_TEMP;
954  	    removal_seq = 0;
955  	    calc_str();
956  	    ceph_assert(s == _str);
957  	    return true;
958  	  }
959  	  return false;
960  	}
961  	
962  	void coll_t::encode(ceph::buffer::list& bl) const
963  	{
964  	  using ceph::encode;
965  	  // when changing this, remember to update encoded_size() too.
966  	  if (is_temp()) {
967  	    // can't express this as v2...
968  	    __u8 struct_v = 3;
969  	    encode(struct_v, bl);
970  	    encode(to_str(), bl);
971  	  } else {
972  	    __u8 struct_v = 2;
973  	    encode(struct_v, bl);
974  	    encode((__u8)type, bl);
975  	    encode(pgid, bl);
976  	    snapid_t snap = CEPH_NOSNAP;
977  	    encode(snap, bl);
978  	  }
979  	}
980  	
981  	size_t coll_t::encoded_size() const
982  	{
983  	  size_t r = sizeof(__u8);
984  	  if (is_temp()) {
985  	    // v3
986  	    r += sizeof(__u32);
987  	    if (_str) {
988  	      r += strlen(_str);
989  	    }
990  	  } else {
991  	      // v2
992  	      // 1. type
993  	      r += sizeof(__u8);
994  	      // 2. pgid
995  	      //  - encoding header
996  	      r += sizeof(ceph_le32) + 2 * sizeof(__u8);
997  	      // - pg_t
998  	      r += sizeof(__u8) + sizeof(uint64_t) + 2 * sizeof(uint32_t);
999  	      // - shard_id_t
1000 	      r += sizeof(int8_t);
1001 	      // 3. snapid_t
1002 	      r += sizeof(uint64_t);
1003 	  }
1004 	
1005 	  return r;
1006 	}
1007 	
1008 	void coll_t::decode(ceph::buffer::list::const_iterator& bl)
1009 	{
1010 	  using ceph::decode;
1011 	  __u8 struct_v;
1012 	  decode(struct_v, bl);
1013 	  switch (struct_v) {
1014 	  case 1:
1015 	    {
1016 	      snapid_t snap;
1017 	      decode(pgid, bl);
1018 	      decode(snap, bl);
1019 	
1020 	      // infer the type
1021 	      if (pgid == spg_t() && snap == 0) {
1022 		type = TYPE_META;
1023 	      } else {
1024 		type = TYPE_PG;
1025 	      }
1026 	      removal_seq = 0;
1027 	    }
1028 	    break;
1029 	
1030 	  case 2:
1031 	    {
1032 	      __u8 _type;
1033 	      snapid_t snap;
1034 	      decode(_type, bl);
1035 	      decode(pgid, bl);
1036 	      decode(snap, bl);
1037 	      type = (type_t)_type;
1038 	      removal_seq = 0;
1039 	    }
1040 	    break;
1041 	
1042 	  case 3:
1043 	    {
1044 	      string str;
1045 	      decode(str, bl);
1046 	      bool ok = parse(str);
1047 	      if (!ok)
1048 		throw std::domain_error(std::string("unable to parse pg ") + str);
1049 	    }
1050 	    break;
1051 	
1052 	  default:
1053 	    {
1054 	      ostringstream oss;
1055 	      oss << "coll_t::decode(): don't know how to decode version "
1056 		  << struct_v;
1057 	      throw std::domain_error(oss.str());
1058 	    }
1059 	  }
1060 	}
1061 	
1062 	void coll_t::dump(Formatter *f) const
1063 	{
1064 	  f->dump_unsigned("type_id", (unsigned)type);
1065 	  if (type != TYPE_META)
1066 	    f->dump_stream("pgid") << pgid;
1067 	  f->dump_string("name", to_str());
1068 	}
1069 	
1070 	void coll_t::generate_test_instances(list<coll_t*>& o)
1071 	{
1072 	  o.push_back(new coll_t());
1073 	  o.push_back(new coll_t(spg_t(pg_t(1, 0), shard_id_t::NO_SHARD)));
1074 	  o.push_back(new coll_t(o.back()->get_temp()));
1075 	  o.push_back(new coll_t(spg_t(pg_t(3, 2), shard_id_t(12))));
1076 	  o.push_back(new coll_t(o.back()->get_temp()));
1077 	  o.push_back(new coll_t());
1078 	}
1079 	
1080 	// ---
1081 	
1082 	std::string pg_vector_string(const vector<int32_t> &a)
1083 	{
1084 	  ostringstream oss;
1085 	  oss << "[";
1086 	  for (auto i = a.cbegin(); i != a.cend(); ++i) {
1087 	    if (i != a.begin())
1088 	      oss << ",";
1089 	    if (*i != CRUSH_ITEM_NONE)
1090 	      oss << *i;
1091 	    else
1092 	      oss << "NONE";
1093 	  }
1094 	  oss << "]";
1095 	  return oss.str();
1096 	}
1097 	
1098 	std::string pg_state_string(uint64_t state)
1099 	{
1100 	  ostringstream oss;
1101 	  if (state & PG_STATE_STALE)
1102 	    oss << "stale+";
1103 	  if (state & PG_STATE_CREATING)
1104 	    oss << "creating+";
1105 	  if (state & PG_STATE_ACTIVE)
1106 	    oss << "active+";
1107 	  if (state & PG_STATE_ACTIVATING)
1108 	    oss << "activating+";
1109 	  if (state & PG_STATE_CLEAN)
1110 	    oss << "clean+";
1111 	  if (state & PG_STATE_RECOVERY_WAIT)
1112 	    oss << "recovery_wait+";
1113 	  if (state & PG_STATE_RECOVERY_TOOFULL)
1114 	    oss << "recovery_toofull+";
1115 	  if (state & PG_STATE_RECOVERING)
1116 	    oss << "recovering+";
1117 	  if (state & PG_STATE_FORCED_RECOVERY)
1118 	    oss << "forced_recovery+";
1119 	  if (state & PG_STATE_DOWN)
1120 	    oss << "down+";
1121 	  if (state & PG_STATE_RECOVERY_UNFOUND)
1122 	    oss << "recovery_unfound+";
1123 	  if (state & PG_STATE_BACKFILL_UNFOUND)
1124 	    oss << "backfill_unfound+";
1125 	  if (state & PG_STATE_UNDERSIZED)
1126 	    oss << "undersized+";
1127 	  if (state & PG_STATE_DEGRADED)
1128 	    oss << "degraded+";
1129 	  if (state & PG_STATE_REMAPPED)
1130 	    oss << "remapped+";
1131 	  if (state & PG_STATE_PREMERGE)
1132 	    oss << "premerge+";
1133 	  if (state & PG_STATE_SCRUBBING)
1134 	    oss << "scrubbing+";
1135 	  if (state & PG_STATE_DEEP_SCRUB)
1136 	    oss << "deep+";
1137 	  if (state & PG_STATE_INCONSISTENT)
1138 	    oss << "inconsistent+";
1139 	  if (state & PG_STATE_PEERING)
1140 	    oss << "peering+";
1141 	  if (state & PG_STATE_REPAIR)
1142 	    oss << "repair+";
1143 	  if (state & PG_STATE_BACKFILL_WAIT)
1144 	    oss << "backfill_wait+";
1145 	  if (state & PG_STATE_BACKFILLING)
1146 	    oss << "backfilling+";
1147 	  if (state & PG_STATE_FORCED_BACKFILL)
1148 	    oss << "forced_backfill+";
1149 	  if (state & PG_STATE_BACKFILL_TOOFULL)
1150 	    oss << "backfill_toofull+";
1151 	  if (state & PG_STATE_INCOMPLETE)
1152 	    oss << "incomplete+";
1153 	  if (state & PG_STATE_PEERED)
1154 	    oss << "peered+";
1155 	  if (state & PG_STATE_SNAPTRIM)
1156 	    oss << "snaptrim+";
1157 	  if (state & PG_STATE_SNAPTRIM_WAIT)
1158 	    oss << "snaptrim_wait+";
1159 	  if (state & PG_STATE_SNAPTRIM_ERROR)
1160 	    oss << "snaptrim_error+";
1161 	  if (state & PG_STATE_FAILED_REPAIR)
1162 	    oss << "failed_repair+";
1163 	  if (state & PG_STATE_LAGGY)
1164 	    oss << "laggy+";
1165 	  if (state & PG_STATE_WAIT)
1166 	    oss << "wait+";
1167 	  string ret(oss.str());
1168 	  if (ret.length() > 0)
1169 	    ret.resize(ret.length() - 1);
1170 	  else
1171 	    ret = "unknown";
1172 	  return ret;
1173 	}
1174 	
1175 	std::optional<uint64_t> pg_string_state(const std::string& state)
1176 	{
1177 	  std::optional<uint64_t> type;
1178 	  if (state == "active")
1179 	    type = PG_STATE_ACTIVE;
1180 	  else if (state == "clean")
1181 	    type = PG_STATE_CLEAN;
1182 	  else if (state == "down")
1183 	    type = PG_STATE_DOWN;
1184 	  else if (state == "recovery_unfound")
1185 	    type = PG_STATE_RECOVERY_UNFOUND;
1186 	  else if (state == "backfill_unfound")
1187 	    type = PG_STATE_BACKFILL_UNFOUND;
1188 	  else if (state == "premerge")
1189 	    type = PG_STATE_PREMERGE;
1190 	  else if (state == "scrubbing")
1191 	    type = PG_STATE_SCRUBBING;
1192 	  else if (state == "degraded")
1193 	    type = PG_STATE_DEGRADED;
1194 	  else if (state == "inconsistent")
1195 	    type = PG_STATE_INCONSISTENT;
1196 	  else if (state == "peering")
1197 	    type = PG_STATE_PEERING;
1198 	  else if (state == "repair")
1199 	    type = PG_STATE_REPAIR;
1200 	  else if (state == "recovering")
1201 	    type = PG_STATE_RECOVERING;
1202 	  else if (state == "forced_recovery")
1203 	    type = PG_STATE_FORCED_RECOVERY;
1204 	  else if (state == "backfill_wait")
1205 	    type = PG_STATE_BACKFILL_WAIT;
1206 	  else if (state == "incomplete")
1207 	    type = PG_STATE_INCOMPLETE;
1208 	  else if (state == "stale")
1209 	    type = PG_STATE_STALE;
1210 	  else if (state == "remapped")
1211 	    type = PG_STATE_REMAPPED;
1212 	  else if (state == "deep")
1213 	    type = PG_STATE_DEEP_SCRUB;
1214 	  else if (state == "backfilling")
1215 	    type = PG_STATE_BACKFILLING;
1216 	  else if (state == "forced_backfill")
1217 	    type = PG_STATE_FORCED_BACKFILL;
1218 	  else if (state == "backfill_toofull")
1219 	    type = PG_STATE_BACKFILL_TOOFULL;
1220 	  else if (state == "recovery_wait")
1221 	    type = PG_STATE_RECOVERY_WAIT;
1222 	  else if (state == "recovery_toofull")
1223 	    type = PG_STATE_RECOVERY_TOOFULL;
1224 	  else if (state == "undersized")
1225 	    type = PG_STATE_UNDERSIZED;
1226 	  else if (state == "activating")
1227 	    type = PG_STATE_ACTIVATING;
1228 	  else if (state == "peered")
1229 	    type = PG_STATE_PEERED;
1230 	  else if (state == "snaptrim")
1231 	    type = PG_STATE_SNAPTRIM;
1232 	  else if (state == "snaptrim_wait")
1233 	    type = PG_STATE_SNAPTRIM_WAIT;
1234 	  else if (state == "snaptrim_error")
1235 	    type = PG_STATE_SNAPTRIM_ERROR;
1236 	  else if (state == "creating")
1237 	    type = PG_STATE_CREATING;
1238 	  else if (state == "failed_repair")
1239 	    type = PG_STATE_FAILED_REPAIR;
1240 	  else if (state == "laggy")
1241 	    type = PG_STATE_LAGGY;
1242 	  else if (state == "wait")
1243 	    type = PG_STATE_WAIT;
1244 	  else if (state == "unknown")
1245 	    type = 0;
1246 	  else
1247 	    type = std::nullopt;
1248 	  return type;
1249 	}
1250 	
1251 	// -- eversion_t --
1252 	string eversion_t::get_key_name() const
1253 	{
1254 	  std::string key(32, ' ');
1255 	  get_key_name(&key[0]);
1256 	  key.resize(31); // remove the null terminator
1257 	  return key;
1258 	}
1259 	
1260 	// -- pool_snap_info_t --
1261 	void pool_snap_info_t::dump(Formatter *f) const
1262 	{
1263 	  f->dump_unsigned("snapid", snapid);
1264 	  f->dump_stream("stamp") << stamp;
1265 	  f->dump_string("name", name);
1266 	}
1267 	
1268 	void pool_snap_info_t::encode(ceph::buffer::list& bl, uint64_t features) const
1269 	{
1270 	  using ceph::encode;
1271 	  if ((features & CEPH_FEATURE_PGPOOL3) == 0) {
1272 	    __u8 struct_v = 1;
1273 	    encode(struct_v, bl);
1274 	    encode(snapid, bl);
1275 	    encode(stamp, bl);
1276 	    encode(name, bl);
1277 	    return;
1278 	  }
1279 	  ENCODE_START(2, 2, bl);
1280 	  encode(snapid, bl);
1281 	  encode(stamp, bl);
1282 	  encode(name, bl);
1283 	  ENCODE_FINISH(bl);
1284 	}
1285 	
// Deserialize a pool_snap_info_t from `bl`.  The fields are read in the
// same order encode() writes them: snapid, stamp, name.
void pool_snap_info_t::decode(ceph::buffer::list::const_iterator& bl)
{
  // NOTE(review): presumably the LEGACY_COMPAT_LEN variant is what lets
  // this accept the bare struct_v == 1 layout that encode() emits without
  // CEPH_FEATURE_PGPOOL3 -- confirm against the macro definition.
  DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
  decode(snapid, bl);
  decode(stamp, bl);
  decode(name, bl);
  DECODE_FINISH(bl);
}
1294 	
1295 	void pool_snap_info_t::generate_test_instances(list<pool_snap_info_t*>& o)
1296 	{
1297 	  o.push_back(new pool_snap_info_t);
1298 	  o.push_back(new pool_snap_info_t);
1299 	  o.back()->snapid = 1;
1300 	  o.back()->stamp = utime_t(1, 2);
1301 	  o.back()->name = "foo";
1302 	}
1303 	
1304 	// -- pool_opts_t --
1305 	
1306 	typedef std::map<std::string, pool_opts_t::opt_desc_t> opt_mapping_t;
1307 	static opt_mapping_t opt_mapping = boost::assign::map_list_of
1308 		   ("scrub_min_interval", pool_opts_t::opt_desc_t(
1309 		     pool_opts_t::SCRUB_MIN_INTERVAL, pool_opts_t::DOUBLE))
1310 		   ("scrub_max_interval", pool_opts_t::opt_desc_t(
1311 		     pool_opts_t::SCRUB_MAX_INTERVAL, pool_opts_t::DOUBLE))
1312 		   ("deep_scrub_interval", pool_opts_t::opt_desc_t(
1313 		     pool_opts_t::DEEP_SCRUB_INTERVAL, pool_opts_t::DOUBLE))
1314 	           ("recovery_priority", pool_opts_t::opt_desc_t(
1315 	             pool_opts_t::RECOVERY_PRIORITY, pool_opts_t::INT))
1316 	           ("recovery_op_priority", pool_opts_t::opt_desc_t(
1317 	             pool_opts_t::RECOVERY_OP_PRIORITY, pool_opts_t::INT))
1318 	           ("scrub_priority", pool_opts_t::opt_desc_t(
1319 	             pool_opts_t::SCRUB_PRIORITY, pool_opts_t::INT))
1320 	           ("compression_mode", pool_opts_t::opt_desc_t(
1321 		     pool_opts_t::COMPRESSION_MODE, pool_opts_t::STR))
1322 	           ("compression_algorithm", pool_opts_t::opt_desc_t(
1323 		     pool_opts_t::COMPRESSION_ALGORITHM, pool_opts_t::STR))
1324 	           ("compression_required_ratio", pool_opts_t::opt_desc_t(
1325 		     pool_opts_t::COMPRESSION_REQUIRED_RATIO, pool_opts_t::DOUBLE))
1326 	           ("compression_max_blob_size", pool_opts_t::opt_desc_t(
1327 		     pool_opts_t::COMPRESSION_MAX_BLOB_SIZE, pool_opts_t::INT))
1328 	           ("compression_min_blob_size", pool_opts_t::opt_desc_t(
1329 		     pool_opts_t::COMPRESSION_MIN_BLOB_SIZE, pool_opts_t::INT))
1330 	           ("csum_type", pool_opts_t::opt_desc_t(
1331 		     pool_opts_t::CSUM_TYPE, pool_opts_t::INT))
1332 	           ("csum_max_block", pool_opts_t::opt_desc_t(
1333 		     pool_opts_t::CSUM_MAX_BLOCK, pool_opts_t::INT))
1334 	           ("csum_min_block", pool_opts_t::opt_desc_t(
1335 		     pool_opts_t::CSUM_MIN_BLOCK, pool_opts_t::INT))
1336 	           ("fingerprint_algorithm", pool_opts_t::opt_desc_t(
1337 		     pool_opts_t::FINGERPRINT_ALGORITHM, pool_opts_t::STR))
1338 	           ("pg_num_min", pool_opts_t::opt_desc_t(
1339 		     pool_opts_t::PG_NUM_MIN, pool_opts_t::INT))
1340 	           ("target_size_bytes", pool_opts_t::opt_desc_t(
1341 		     pool_opts_t::TARGET_SIZE_BYTES, pool_opts_t::INT))
1342 	           ("target_size_ratio", pool_opts_t::opt_desc_t(
1343 		     pool_opts_t::TARGET_SIZE_RATIO, pool_opts_t::DOUBLE))
1344 	           ("pg_autoscale_bias", pool_opts_t::opt_desc_t(
1345 		     pool_opts_t::PG_AUTOSCALE_BIAS, pool_opts_t::DOUBLE))
1346 	           ("read_lease_interval", pool_opts_t::opt_desc_t(
1347 		     pool_opts_t::READ_LEASE_INTERVAL, pool_opts_t::DOUBLE));
1348 	
1349 	bool pool_opts_t::is_opt_name(const std::string& name)
1350 	{
1351 	  return opt_mapping.count(name);
1352 	}
1353 	
1354 	pool_opts_t::opt_desc_t pool_opts_t::get_opt_desc(const std::string& name)
1355 	{
1356 	  auto i = opt_mapping.find(name);
1357 	  ceph_assert(i != opt_mapping.end());
1358 	  return i->second;
1359 	}
1360 	
1361 	bool pool_opts_t::is_set(pool_opts_t::key_t key) const
1362 	{
1363 	  return opts.count(key);
1364 	}
1365 	
1366 	const pool_opts_t::value_t& pool_opts_t::get(pool_opts_t::key_t key) const
1367 	{
1368 	  auto i = opts.find(key);
1369 	  ceph_assert(i != opts.end());
1370 	  return i->second;
1371 	}
1372 	
1373 	bool pool_opts_t::unset(pool_opts_t::key_t key) {
1374 	  return opts.erase(key) > 0;
1375 	}
1376 	
1377 	class pool_opts_dumper_t : public boost::static_visitor<> {
1378 	public:
1379 	  pool_opts_dumper_t(const std::string& name_, Formatter* f_) :
1380 	    name(name_.c_str()), f(f_) {}
1381 	
1382 	  void operator()(std::string s) const {
1383 	    f->dump_string(name, s);
1384 	  }
1385 	  void operator()(int64_t i) const {
1386 	    f->dump_int(name, i);
1387 	  }
1388 	  void operator()(double d) const {
1389 	    f->dump_float(name, d);
1390 	  }
1391 	
1392 	private:
1393 	  const char* name;
1394 	  Formatter* f;
1395 	};
1396 	
1397 	void pool_opts_t::dump(const std::string& name, Formatter* f) const
1398 	{
1399 	  const opt_desc_t& desc = get_opt_desc(name);
1400 	  auto i = opts.find(desc.key);
1401 	  if (i == opts.end()) {
1402 	      return;
1403 	  }
1404 	  boost::apply_visitor(pool_opts_dumper_t(name, f), i->second);
1405 	}
1406 	
1407 	void pool_opts_t::dump(Formatter* f) const
1408 	{
1409 	  for (auto i = opt_mapping.cbegin(); i != opt_mapping.cend(); ++i) {
1410 	    const std::string& name = i->first;
1411 	    const opt_desc_t& desc = i->second;
1412 	    auto j = opts.find(desc.key);
1413 	    if (j == opts.end()) {
1414 	      continue;
1415 	    }
1416 	    boost::apply_visitor(pool_opts_dumper_t(name, f), j->second);
1417 	  }
1418 	}
1419 	
1420 	class pool_opts_encoder_t : public boost::static_visitor<> {
1421 	public:
1422 	  explicit pool_opts_encoder_t(ceph::buffer::list& bl_, uint64_t features)
1423 	    : bl(bl_),
1424 	      features(features) {}
1425 	
1426 	  void operator()(const std::string &s) const {
1427 	    encode(static_cast<int32_t>(pool_opts_t::STR), bl);
1428 	    encode(s, bl);
1429 	  }
1430 	  void operator()(int64_t i) const {
1431 	    encode(static_cast<int32_t>(pool_opts_t::INT), bl);
1432 	    if (HAVE_FEATURE(features, SERVER_NAUTILUS)) {
1433 	      encode(i, bl);
1434 	    } else {
1435 	      encode(static_cast<int32_t>(i), bl);
1436 	    }
1437 	  }
1438 	  void operator()(double d) const {
1439 	    encode(static_cast<int32_t>(pool_opts_t::DOUBLE), bl);
1440 	    encode(d, bl);
1441 	  }
1442 	
1443 	private:
1444 	  ceph::buffer::list& bl;
1445 	  uint64_t features;
1446 	};
1447 	
1448 	void pool_opts_t::encode(ceph::buffer::list& bl, uint64_t features) const
1449 	{
1450 	  unsigned v = 2;
1451 	  if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
1452 	    v = 1;
1453 	  }
1454 	  ENCODE_START(v, 1, bl);
1455 	  uint32_t n = static_cast<uint32_t>(opts.size());
1456 	  encode(n, bl);
1457 	  for (auto i = opts.cbegin(); i != opts.cend(); ++i) {
1458 	    encode(static_cast<int32_t>(i->first), bl);
1459 	    boost::apply_visitor(pool_opts_encoder_t(bl, features), i->second);
1460 	  }
1461 	  ENCODE_FINISH(bl);
1462 	}
1463 	
1464 	void pool_opts_t::decode(ceph::buffer::list::const_iterator& bl)
1465 	{
1466 	  DECODE_START(1, bl);
1467 	  __u32 n;
1468 	  decode(n, bl);
1469 	  opts.clear();
1470 	  while (n--) {
1471 	    int32_t k, t;
1472 	    decode(k, bl);
1473 	    decode(t, bl);
1474 	    if (t == STR) {
1475 	      std::string s;
1476 	      decode(s, bl);
1477 	      opts[static_cast<key_t>(k)] = s;
1478 	    } else if (t == INT) {
1479 	      int64_t i;
1480 	      if (struct_v >= 2) {
1481 		decode(i, bl);
1482 	      } else {
1483 		int ii;
1484 		decode(ii, bl);
1485 		i = ii;
1486 	      }
1487 	      opts[static_cast<key_t>(k)] = i;
1488 	    } else if (t == DOUBLE) {
1489 	      double d;
1490 	      decode(d, bl);
1491 	      opts[static_cast<key_t>(k)] = d;
1492 	    } else {
1493 	      ceph_assert(!"invalid type");
1494 	    }
1495 	  }
1496 	  DECODE_FINISH(bl);
1497 	}
1498 	
1499 	ostream& operator<<(ostream& out, const pool_opts_t& opts)
1500 	{
1501 	  for (auto i = opt_mapping.begin(); i != opt_mapping.end(); ++i) {
1502 	    const std::string& name = i->first;
1503 	    const pool_opts_t::opt_desc_t& desc = i->second;
1504 	    auto j = opts.opts.find(desc.key);
1505 	    if (j == opts.opts.end()) {
1506 	      continue;
1507 	    }
1508 	    out << " " << name << " " << j->second;
1509 	  }
1510 	  return out;
1511 	}
1512 	
1513 	// -- pg_pool_t --
1514 	
1515 	const char *pg_pool_t::APPLICATION_NAME_CEPHFS("cephfs");
1516 	const char *pg_pool_t::APPLICATION_NAME_RBD("rbd");
1517 	const char *pg_pool_t::APPLICATION_NAME_RGW("rgw");
1518 	
1519 	void pg_pool_t::dump(Formatter *f) const
1520 	{
1521 	  f->dump_stream("create_time") << get_create_time();
1522 	  f->dump_unsigned("flags", get_flags());
1523 	  f->dump_string("flags_names", get_flags_string());
1524 	  f->dump_int("type", get_type());
1525 	  f->dump_int("size", get_size());
1526 	  f->dump_int("min_size", get_min_size());
1527 	  f->dump_int("crush_rule", get_crush_rule());
1528 	  f->dump_int("object_hash", get_object_hash());
1529 	  f->dump_string("pg_autoscale_mode",
1530 			 get_pg_autoscale_mode_name(pg_autoscale_mode));
1531 	  f->dump_unsigned("pg_num", get_pg_num());
1532 	  f->dump_unsigned("pg_placement_num", get_pgp_num());
1533 	  f->dump_unsigned("pg_placement_num_target", get_pgp_num_target());
1534 	  f->dump_unsigned("pg_num_target", get_pg_num_target());
1535 	  f->dump_unsigned("pg_num_pending", get_pg_num_pending());
1536 	  f->dump_object("last_pg_merge_meta", last_pg_merge_meta);
1537 	  f->dump_stream("last_change") << get_last_change();
1538 	  f->dump_stream("last_force_op_resend") << get_last_force_op_resend();
1539 	  f->dump_stream("last_force_op_resend_prenautilus")
1540 	    << get_last_force_op_resend_prenautilus();
1541 	  f->dump_stream("last_force_op_resend_preluminous")
1542 	    << get_last_force_op_resend_preluminous();
1543 	  f->dump_unsigned("auid", get_auid());
1544 	  f->dump_string("snap_mode", is_pool_snaps_mode() ? "pool" : "selfmanaged");
1545 	  f->dump_unsigned("snap_seq", get_snap_seq());
1546 	  f->dump_unsigned("snap_epoch", get_snap_epoch());
1547 	  f->open_array_section("pool_snaps");
1548 	  for (auto p = snaps.cbegin(); p != snaps.cend(); ++p) {
1549 	    f->open_object_section("pool_snap_info");
1550 	    p->second.dump(f);
1551 	    f->close_section();
1552 	  }
1553 	  f->close_section();
1554 	  f->dump_stream("removed_snaps") << removed_snaps;
1555 	  f->dump_unsigned("quota_max_bytes", quota_max_bytes);
1556 	  f->dump_unsigned("quota_max_objects", quota_max_objects);
1557 	  f->open_array_section("tiers");
1558 	  for (auto p = tiers.cbegin(); p != tiers.cend(); ++p)
1559 	    f->dump_unsigned("pool_id", *p);
1560 	  f->close_section();
1561 	  f->dump_int("tier_of", tier_of);
1562 	  f->dump_int("read_tier", read_tier);
1563 	  f->dump_int("write_tier", write_tier);
1564 	  f->dump_string("cache_mode", get_cache_mode_name());
1565 	  f->dump_unsigned("target_max_bytes", target_max_bytes);
1566 	  f->dump_unsigned("target_max_objects", target_max_objects);
1567 	  f->dump_unsigned("cache_target_dirty_ratio_micro",
1568 			   cache_target_dirty_ratio_micro);
1569 	  f->dump_unsigned("cache_target_dirty_high_ratio_micro",
1570 			   cache_target_dirty_high_ratio_micro);
1571 	  f->dump_unsigned("cache_target_full_ratio_micro",
1572 			   cache_target_full_ratio_micro);
1573 	  f->dump_unsigned("cache_min_flush_age", cache_min_flush_age);
1574 	  f->dump_unsigned("cache_min_evict_age", cache_min_evict_age);
1575 	  f->dump_string("erasure_code_profile", erasure_code_profile);
1576 	  f->open_object_section("hit_set_params");
1577 	  hit_set_params.dump(f);
1578 	  f->close_section(); // hit_set_params
1579 	  f->dump_unsigned("hit_set_period", hit_set_period);
1580 	  f->dump_unsigned("hit_set_count", hit_set_count);
1581 	  f->dump_bool("use_gmt_hitset", use_gmt_hitset);
1582 	  f->dump_unsigned("min_read_recency_for_promote", min_read_recency_for_promote);
1583 	  f->dump_unsigned("min_write_recency_for_promote", min_write_recency_for_promote);
1584 	  f->dump_unsigned("hit_set_grade_decay_rate", hit_set_grade_decay_rate);
1585 	  f->dump_unsigned("hit_set_search_last_n", hit_set_search_last_n);
1586 	  f->open_array_section("grade_table");
1587 	  for (unsigned i = 0; i < hit_set_count; ++i)
1588 	    f->dump_unsigned("value", get_grade(i));
1589 	  f->close_section();
1590 	  f->dump_unsigned("stripe_width", get_stripe_width());
1591 	  f->dump_unsigned("expected_num_objects", expected_num_objects);
1592 	  f->dump_bool("fast_read", fast_read);
1593 	  f->open_object_section("options");
1594 	  opts.dump(f);
1595 	  f->close_section(); // options
1596 	  f->open_object_section("application_metadata");
1597 	  for (auto &app_pair : application_metadata) {
1598 	    f->open_object_section(app_pair.first.c_str());
1599 	    for (auto &kv_pair : app_pair.second) {
1600 	      f->dump_string(kv_pair.first.c_str(), kv_pair.second);
1601 	    }
1602 	    f->close_section(); // application
1603 	  }
1604 	  f->close_section(); // application_metadata
1605 	}
1606 	
1607 	void pg_pool_t::convert_to_pg_shards(const vector<int> &from, set<pg_shard_t>* to) const {
1608 	  for (size_t i = 0; i < from.size(); ++i) {
1609 	    if (from[i] != CRUSH_ITEM_NONE) {
1610 	      to->insert(
1611 	        pg_shard_t(
1612 	          from[i],
1613 	          is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
1614 	    }
1615 	  }
1616 	}
1617 	
1618 	void pg_pool_t::calc_pg_masks()
1619 	{
1620 	  pg_num_mask = (1 << cbits(pg_num-1)) - 1;
1621 	  pgp_num_mask = (1 << cbits(pgp_num-1)) - 1;
1622 	}
1623 	
1624 	unsigned pg_pool_t::get_pg_num_divisor(pg_t pgid) const
1625 	{
1626 	  if (pg_num == pg_num_mask + 1)
1627 	    return pg_num;                    // power-of-2 split
1628 	  unsigned mask = pg_num_mask >> 1;
1629 	  if ((pgid.ps() & mask) < (pg_num & mask))
1630 	    return pg_num_mask + 1;           // smaller bin size (already split)
1631 	  else
1632 	    return (pg_num_mask + 1) >> 1;    // bigger bin (not yet split)
1633 	}
1634 	
1635 	bool pg_pool_t::is_pending_merge(pg_t pgid, bool *target) const
1636 	{
1637 	  if (pg_num_pending >= pg_num) {
1638 	    return false;
1639 	  }
1640 	  if (pgid.ps() >= pg_num_pending && pgid.ps() < pg_num) {
1641 	    if (target) {
1642 	      *target = false;
1643 	    }
1644 	    return true;
1645 	  }
1646 	  for (unsigned ps = pg_num_pending; ps < pg_num; ++ps) {
1647 	    if (pg_t(ps, pgid.pool()).get_parent() == pgid) {
1648 	      if (target) {
1649 		*target = true;
1650 	      }
1651 	      return true;
1652 	    }
1653 	  }
1654 	  return false;
1655 	}
1656 	
1657 	/*
1658 	 * we have two snap modes:
1659 	 *  - pool snaps
1660 	 *    - snap existence/non-existence defined by snaps[] and snap_seq
1661 	 *  - user managed snaps
1662 	 *    - existence tracked by librados user
1663 	 */
1664 	bool pg_pool_t::is_pool_snaps_mode() const
1665 	{
1666 	  return has_flag(FLAG_POOL_SNAPS);
1667 	}
1668 	
1669 	bool pg_pool_t::is_unmanaged_snaps_mode() const
1670 	{
1671 	  return has_flag(FLAG_SELFMANAGED_SNAPS);
1672 	}
1673 	
1674 	bool pg_pool_t::is_removed_snap(snapid_t s) const
1675 	{
1676 	  if (is_pool_snaps_mode())
1677 	    return s <= get_snap_seq() && snaps.count(s) == 0;
1678 	  else
1679 	    return removed_snaps.contains(s);
1680 	}
1681 	
1682 	snapid_t pg_pool_t::snap_exists(const char *s) const
1683 	{
1684 	  for (auto p = snaps.cbegin(); p != snaps.cend(); ++p)
1685 	    if (p->second.name == s)
1686 	      return p->second.snapid;
1687 	  return 0;
1688 	}
1689 	
1690 	void pg_pool_t::add_snap(const char *n, utime_t stamp)
1691 	{
1692 	  ceph_assert(!is_unmanaged_snaps_mode());
1693 	  flags |= FLAG_POOL_SNAPS;
1694 	  snapid_t s = get_snap_seq() + 1;
1695 	  snap_seq = s;
1696 	  snaps[s].snapid = s;
1697 	  snaps[s].name = n;
1698 	  snaps[s].stamp = stamp;
1699 	}
1700 	
1701 	uint64_t pg_pool_t::add_unmanaged_snap(bool preoctopus_compat)
1702 	{
1703 	  ceph_assert(!is_pool_snaps_mode());
1704 	  if (snap_seq == 0) {
1705 	    if (preoctopus_compat) {
1706 	      // kludge for pre-mimic tracking of pool vs selfmanaged snaps.  after
1707 	      // mimic this field is not decoded but our flag is set; pre-mimic, we
1708 	      // have a non-empty removed_snaps to signifiy a non-pool-snaps pool.
1709 	      removed_snaps.insert(snapid_t(1));
1710 	    }
1711 	    snap_seq = 1;
1712 	  }
1713 	  flags |= FLAG_SELFMANAGED_SNAPS;
1714 	  snap_seq = snap_seq + 1;
1715 	  return snap_seq;
1716 	}
1717 	
1718 	void pg_pool_t::remove_snap(snapid_t s)
1719 	{
1720 	  ceph_assert(snaps.count(s));
1721 	  snaps.erase(s);
1722 	  snap_seq = snap_seq + 1;
1723 	}
1724 	
1725 	void pg_pool_t::remove_unmanaged_snap(snapid_t s, bool preoctopus_compat)
1726 	{
1727 	  ceph_assert(is_unmanaged_snaps_mode());
1728 	  ++snap_seq;
1729 	  if (preoctopus_compat) {
1730 	    removed_snaps.insert(s);
1731 	    // try to add in the new seq, just to try to keep the interval_set contiguous
1732 	    if (!removed_snaps.contains(get_snap_seq())) {
1733 	      removed_snaps.insert(get_snap_seq());
1734 	    }
1735 	  }
1736 	}
1737 	
1738 	SnapContext pg_pool_t::get_snap_context() const
1739 	{
1740 	  vector<snapid_t> s(snaps.size());
1741 	  unsigned i = 0;
1742 	  for (auto p = snaps.crbegin(); p != snaps.crend(); ++p)
1743 	    s[i++] = p->first;
1744 	  return SnapContext(get_snap_seq(), s);
1745 	}
1746 	
1747 	uint32_t pg_pool_t::hash_key(const string& key, const string& ns) const
1748 	{
1749 	 if (ns.empty()) 
1750 	    return ceph_str_hash(object_hash, key.data(), key.length());
1751 	  int nsl = ns.length();
1752 	  int len = key.length() + nsl + 1;
1753 	  char buf[len];
1754 	  memcpy(&buf[0], ns.data(), nsl);
1755 	  buf[nsl] = '\037';
1756 	  memcpy(&buf[nsl+1], key.data(), key.length());
1757 	  return ceph_str_hash(object_hash, &buf[0], len);
1758 	}
1759 	
1760 	uint32_t pg_pool_t::raw_hash_to_pg(uint32_t v) const
1761 	{
1762 	  return ceph_stable_mod(v, pg_num, pg_num_mask);
1763 	}
1764 	
1765 	/*
1766 	 * map a raw pg (with full precision ps) into an actual pg, for storage
1767 	 */
1768 	pg_t pg_pool_t::raw_pg_to_pg(pg_t pg) const
1769 	{
1770 	  pg.set_ps(ceph_stable_mod(pg.ps(), pg_num, pg_num_mask));
1771 	  return pg;
1772 	}
1773 	  
1774 	/*
1775 	 * map raw pg (full precision ps) into a placement seed.  include
1776 	 * pool id in that value so that different pools don't use the same
1777 	 * seeds.
1778 	 */
1779 	ps_t pg_pool_t::raw_pg_to_pps(pg_t pg) const
1780 	{
1781 	  if (flags & FLAG_HASHPSPOOL) {
1782 	    // Hash the pool id so that pool PGs do not overlap.
1783 	    return
1784 	      crush_hash32_2(CRUSH_HASH_RJENKINS1,
1785 			     ceph_stable_mod(pg.ps(), pgp_num, pgp_num_mask),
1786 			     pg.pool());
1787 	  } else {
1788 	    // Legacy behavior; add ps and pool together.  This is not a great
1789 	    // idea because the PGs from each pool will essentially overlap on
1790 	    // top of each other: 0.5 == 1.4 == 2.3 == ...
1791 	    return
1792 	      ceph_stable_mod(pg.ps(), pgp_num, pgp_num_mask) +
1793 	      pg.pool();
1794 	  }
1795 	}
1796 	
1797 	uint32_t pg_pool_t::get_random_pg_position(pg_t pg, uint32_t seed) const
1798 	{
1799 	  uint32_t r = crush_hash32_2(CRUSH_HASH_RJENKINS1, seed, 123);
1800 	  if (pg_num == pg_num_mask + 1) {
1801 	    r &= ~pg_num_mask;
1802 	  } else {
1803 	    unsigned smaller_mask = pg_num_mask >> 1;
1804 	    if ((pg.ps() & smaller_mask) < (pg_num & smaller_mask)) {
1805 	      r &= ~pg_num_mask;
1806 	    } else {
1807 	      r &= ~smaller_mask;
1808 	    }
1809 	  }
1810 	  r |= pg.ps();
1811 	  return r;
1812 	}
1813 	
/**
 * Serialize this pool into @p bl, choosing the layout from @p features.
 *
 * Layout selection, checked in this order:
 *  - CEPH_FEATURE_PGPOOL3 not set: struct_v 2, written without
 *    ENCODE_START/ENCODE_FINISH framing; snaps and removed_snaps are written
 *    as explicit counts followed by headerless payloads.
 *  - CEPH_FEATURE_OSDENC not set: struct_v 4, also unframed.
 *  - CEPH_FEATURE_OSD_POOLRESEND not set: framed version 14, ending at
 *    erasure_code_profile.
 *  - otherwise: framed version 21, 24, 26, 27 or 29 depending on
 *    CEPH_FEATURE_NEW_OSDOP_ENCODING and the SERVER_LUMINOUS / SERVER_MIMIC /
 *    SERVER_NAUTILUS features; trailing fields are gated on that version.
 *
 * The order of the encode() calls in each branch is the on-wire field order,
 * so statements here must not be reordered.
 *
 * @param bl        buffer the encoded bytes are appended to
 * @param features  feature bits used to pick the layout
 */
void pg_pool_t::encode(ceph::buffer::list& bl, uint64_t features) const
{
  using ceph::encode;
  if ((features & CEPH_FEATURE_PGPOOL3) == 0) {
    // this encoding matches the old struct ceph_pg_pool
    __u8 struct_v = 2;
    encode(struct_v, bl);
    encode(type, bl);
    encode(size, bl);
    encode(crush_rule, bl);
    encode(object_hash, bl);
    encode(pg_num, bl);
    encode(pgp_num, bl);
    __u32 lpg_num = 0, lpgp_num = 0;  // tell old code that there are no localized pgs.
    encode(lpg_num, bl);
    encode(lpgp_num, bl);
    encode(last_change, bl);
    encode(snap_seq, bl);
    encode(snap_epoch, bl);

    // element counts go first; the matching payloads follow auid below
    __u32 n = snaps.size();
    encode(n, bl);
    n = removed_snaps.num_intervals();
    encode(n, bl);

    encode(auid, bl);

    encode_nohead(snaps, bl, features);
    encode_nohead(removed_snaps, bl);
    return;
  }

  if ((features & CEPH_FEATURE_OSDENC) == 0) {
    // unframed struct_v 4: same leading fields, plus flags and a zero
    // placeholder where crash_replay_interval used to be
    __u8 struct_v = 4;
    encode(struct_v, bl);
    encode(type, bl);
    encode(size, bl);
    encode(crush_rule, bl);
    encode(object_hash, bl);
    encode(pg_num, bl);
    encode(pgp_num, bl);
    __u32 lpg_num = 0, lpgp_num = 0;  // tell old code that there are no localized pgs.
    encode(lpg_num, bl);
    encode(lpgp_num, bl);
    encode(last_change, bl);
    encode(snap_seq, bl);
    encode(snap_epoch, bl);
    encode(snaps, bl, features);
    encode(removed_snaps, bl);
    encode(auid, bl);
    encode(flags, bl);
    encode((uint32_t)0, bl); // crash_replay_interval
    return;
  }

  if ((features & CEPH_FEATURE_OSD_POOLRESEND) == 0) {
    // we simply added last_force_op_resend here, which is a fully
    // backward compatible change.  however, encoding the same map
    // differently between monitors triggers scrub noise (even though
    // they are decodable without the feature), so let's be pedantic
    // about it.
    ENCODE_START(14, 5, bl);
    encode(type, bl);
    encode(size, bl);
    encode(crush_rule, bl);
    encode(object_hash, bl);
    encode(pg_num, bl);
    encode(pgp_num, bl);
    __u32 lpg_num = 0, lpgp_num = 0;  // tell old code that there are no localized pgs.
    encode(lpg_num, bl);
    encode(lpgp_num, bl);
    encode(last_change, bl);
    encode(snap_seq, bl);
    encode(snap_epoch, bl);
    encode(snaps, bl, features);
    encode(removed_snaps, bl);
    encode(auid, bl);
    encode(flags, bl);
    encode((uint32_t)0, bl); // crash_replay_interval
    encode(min_size, bl);
    encode(quota_max_bytes, bl);
    encode(quota_max_objects, bl);
    encode(tiers, bl);
    encode(tier_of, bl);
    __u8 c = cache_mode;
    encode(c, bl);
    encode(read_tier, bl);
    encode(write_tier, bl);
    encode(properties, bl);
    encode(hit_set_params, bl);
    encode(hit_set_period, bl);
    encode(hit_set_count, bl);
    encode(stripe_width, bl);
    encode(target_max_bytes, bl);
    encode(target_max_objects, bl);
    encode(cache_target_dirty_ratio_micro, bl);
    encode(cache_target_full_ratio_micro, bl);
    encode(cache_min_flush_age, bl);
    encode(cache_min_evict_age, bl);
    encode(erasure_code_profile, bl);
    ENCODE_FINISH(bl);
    return;
  }

  // newest layout by default; stepped down below for peers lacking features
  uint8_t v = 29;
  // NOTE: any new encoding dependencies must be reflected by
  // SIGNIFICANT_FEATURES
  if (!(features & CEPH_FEATURE_NEW_OSDOP_ENCODING)) {
    // this was the first post-hammer thing we added; if it's missing, encode
    // like hammer.
    v = 21;
  } else if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
    v = 24;
  } else if (!HAVE_FEATURE(features, SERVER_MIMIC)) {
    v = 26;
  } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
    v = 27;
  }

  ENCODE_START(v, 5, bl);
  encode(type, bl);
  encode(size, bl);
  encode(crush_rule, bl);
  encode(object_hash, bl);
  encode(pg_num, bl);
  encode(pgp_num, bl);
  __u32 lpg_num = 0, lpgp_num = 0;  // tell old code that there are no localized pgs.
  encode(lpg_num, bl);
  encode(lpgp_num, bl);
  encode(last_change, bl);
  encode(snap_seq, bl);
  encode(snap_epoch, bl);
  encode(snaps, bl, features);
  encode(removed_snaps, bl);
  encode(auid, bl);
  if (v >= 27) {
    encode(flags, bl);
  } else {
    // versions below 27 get the flags with these three bits cleared
    auto tmp = flags;
    tmp &= ~(FLAG_SELFMANAGED_SNAPS | FLAG_POOL_SNAPS | FLAG_CREATING);
    encode(tmp, bl);
  }
  encode((uint32_t)0, bl); // crash_replay_interval
  encode(min_size, bl);
  encode(quota_max_bytes, bl);
  encode(quota_max_objects, bl);
  encode(tiers, bl);
  encode(tier_of, bl);
  __u8 c = cache_mode;
  encode(c, bl);
  encode(read_tier, bl);
  encode(write_tier, bl);
  encode(properties, bl);
  encode(hit_set_params, bl);
  encode(hit_set_period, bl);
  encode(hit_set_count, bl);
  encode(stripe_width, bl);
  encode(target_max_bytes, bl);
  encode(target_max_objects, bl);
  encode(cache_target_dirty_ratio_micro, bl);
  encode(cache_target_full_ratio_micro, bl);
  encode(cache_min_flush_age, bl);
  encode(cache_min_evict_age, bl);
  encode(erasure_code_profile, bl);
  encode(last_force_op_resend_preluminous, bl);
  encode(min_read_recency_for_promote, bl);
  encode(expected_num_objects, bl);
  // everything from here on is appended only when the chosen version has it
  if (v >= 19) {
    encode(cache_target_dirty_high_ratio_micro, bl);
  }
  if (v >= 20) {
    encode(min_write_recency_for_promote, bl);
  }
  if (v >= 21) {
    encode(use_gmt_hitset, bl);
  }
  if (v >= 22) {
    encode(fast_read, bl);
  }
  if (v >= 23) {
    encode(hit_set_grade_decay_rate, bl);
    encode(hit_set_search_last_n, bl);
  }
  if (v >= 24) {
    encode(opts, bl, features);
  }
  if (v >= 25) {
    encode(last_force_op_resend_prenautilus, bl);
  }
  if (v >= 26) {
    encode(application_metadata, bl);
  }
  if (v >= 27) {
    encode(create_time, bl);
  }
  if (v >= 28) {
    encode(pg_num_target, bl);
    encode(pgp_num_target, bl);
    encode(pg_num_pending, bl);
    encode((epoch_t)0, bl);  // pg_num_dec_last_epoch_started from 14.1.[01]
    encode((epoch_t)0, bl);  // pg_num_dec_last_epoch_clean from 14.1.[01]
    encode(last_force_op_resend, bl);
    encode(pg_autoscale_mode, bl);
  }
  if (v >= 29) {
    encode(last_pg_merge_meta, bl);
  }
  ENCODE_FINISH(bl);
}
2023 	
/**
 * Deserialize a pool from @p bl.
 *
 * Fields are read in the order encode() writes them.  Each group of fields
 * is gated on struct_v; where the encoded version predates a field, the
 * field is either left untouched or set to the fallback given in the
 * corresponding else branch.  After decoding, calc_pg_masks() and
 * calc_grade_table() are run to refresh the values derived from the
 * decoded fields.
 *
 * @param bl  iterator positioned at the start of an encoded pg_pool_t;
 *            advanced past it on return
 */
void pg_pool_t::decode(ceph::buffer::list::const_iterator& bl)
{
  DECODE_START_LEGACY_COMPAT_LEN(29, 5, 5, bl);
  decode(type, bl);
  decode(size, bl);
  decode(crush_rule, bl);
  decode(object_hash, bl);
  decode(pg_num, bl);
  decode(pgp_num, bl);
  {
    // read and discard: encode() always writes zeros for these two
    __u32 lpg_num, lpgp_num;
    decode(lpg_num, bl);
    decode(lpgp_num, bl);
  }
  decode(last_change, bl);
  decode(snap_seq, bl);
  decode(snap_epoch, bl);

  if (struct_v >= 3) {
    decode(snaps, bl);
    decode(removed_snaps, bl);
    decode(auid, bl);
  } else {
    // older layout: both counts, then auid, then the headerless payloads
    __u32 n, m;
    decode(n, bl);
    decode(m, bl);
    decode(auid, bl);
    decode_nohead(n, snaps, bl);
    decode_nohead(m, removed_snaps, bl);
  }

  if (struct_v >= 4) {
    decode(flags, bl);
    // consumed but not stored
    uint32_t crash_replay_interval;
    decode(crash_replay_interval, bl);
  } else {
    flags = 0;
  }
  // upgrade path for selfmanaged vs pool snaps
  if (snap_seq > 0 && (flags & (FLAG_SELFMANAGED_SNAPS|FLAG_POOL_SNAPS)) == 0) {
    if (!removed_snaps.empty()) {
      flags |= FLAG_SELFMANAGED_SNAPS;
    } else {
      flags |= FLAG_POOL_SNAPS;
    }
  }
  if (struct_v >= 7) {
    decode(min_size, bl);
  } else {
    // fallback derived from size (equals size/2 rounded up for non-negative size)
    min_size = size - size/2;
  }
  if (struct_v >= 8) {
    decode(quota_max_bytes, bl);
    decode(quota_max_objects, bl);
  }
  if (struct_v >= 9) {
    decode(tiers, bl);
    decode(tier_of, bl);
    // cache_mode travels as a single byte
    __u8 v;
    decode(v, bl);
    cache_mode = (cache_mode_t)v;
    decode(read_tier, bl);
    decode(write_tier, bl);
  }
  if (struct_v >= 10) {
    decode(properties, bl);
  }
  if (struct_v >= 11) {
    decode(hit_set_params, bl);
    decode(hit_set_period, bl);
    decode(hit_set_count, bl);
  } else {
    // take the values a default-constructed pool would have
    pg_pool_t def;
    hit_set_period = def.hit_set_period;
    hit_set_count = def.hit_set_count;
  }
  if (struct_v >= 12) {
    decode(stripe_width, bl);
  } else {
    set_stripe_width(0);
  }
  if (struct_v >= 13) {
    decode(target_max_bytes, bl);
    decode(target_max_objects, bl);
    decode(cache_target_dirty_ratio_micro, bl);
    decode(cache_target_full_ratio_micro, bl);
    decode(cache_min_flush_age, bl);
    decode(cache_min_evict_age, bl);
  } else {
    target_max_bytes = 0;
    target_max_objects = 0;
    cache_target_dirty_ratio_micro = 0;
    cache_target_full_ratio_micro = 0;
    cache_min_flush_age = 0;
    cache_min_evict_age = 0;
  }
  if (struct_v >= 14) {
    decode(erasure_code_profile, bl);
  }
  if (struct_v >= 15) {
    decode(last_force_op_resend_preluminous, bl);
  } else {
    last_force_op_resend_preluminous = 0;
  }
  if (struct_v >= 16) {
    decode(min_read_recency_for_promote, bl);
  } else {
    min_read_recency_for_promote = 1;
  }
  if (struct_v >= 17) {
    decode(expected_num_objects, bl);
  } else {
    expected_num_objects = 0;
  }
  if (struct_v >= 19) {
    decode(cache_target_dirty_high_ratio_micro, bl);
  } else {
    // no separate high ratio in the encoding: reuse the plain dirty ratio
    cache_target_dirty_high_ratio_micro = cache_target_dirty_ratio_micro;
  }
  if (struct_v >= 20) {
    decode(min_write_recency_for_promote, bl);
  } else {
    min_write_recency_for_promote = 1;
  }
  if (struct_v >= 21) {
    decode(use_gmt_hitset, bl);
  } else {
    use_gmt_hitset = false;
  }
  if (struct_v >= 22) {
    decode(fast_read, bl);
  } else {
    fast_read = false;
  }
  if (struct_v >= 23) {
    decode(hit_set_grade_decay_rate, bl);
    decode(hit_set_search_last_n, bl);
  } else {
    hit_set_grade_decay_rate = 0;
    hit_set_search_last_n = 1;
  }
  if (struct_v >= 24) {
    decode(opts, bl);
  }
  if (struct_v >= 25) {
    decode(last_force_op_resend_prenautilus, bl);
  } else {
    last_force_op_resend_prenautilus = last_force_op_resend_preluminous;
  }
  if (struct_v >= 26) {
    decode(application_metadata, bl);
  }
  if (struct_v >= 27) {
    decode(create_time, bl);
  }
  if (struct_v >= 28) {
    decode(pg_num_target, bl);
    decode(pgp_num_target, bl);
    decode(pg_num_pending, bl);
    // these two epochs sit in the stream between pg_num_pending and
    // last_force_op_resend; they are only used when struct_v is exactly 28
    epoch_t old_merge_last_epoch_clean, old_merge_last_epoch_started;
    decode(old_merge_last_epoch_started, bl);
    decode(old_merge_last_epoch_clean, bl);
    decode(last_force_op_resend, bl);
    decode(pg_autoscale_mode, bl);
    if (struct_v >= 29) {
      decode(last_pg_merge_meta, bl);
    } else {
      last_pg_merge_meta.last_epoch_clean = old_merge_last_epoch_clean;
      last_pg_merge_meta.last_epoch_started = old_merge_last_epoch_started;
    }
  } else {
    pg_num_target = pg_num;
    pgp_num_target = pgp_num;
    pg_num_pending = pg_num;
    last_force_op_resend = last_force_op_resend_prenautilus;
    pg_autoscale_mode = pg_autoscale_mode_t::WARN;    // default to warn on upgrade
  }
  DECODE_FINISH(bl);
  // refresh state derived from the fields decoded above
  calc_pg_masks();
  calc_grade_table();
}
2205 	
2206 	void pg_pool_t::generate_test_instances(list<pg_pool_t*>& o)
2207 	{
2208 	  pg_pool_t a;
2209 	  o.push_back(new pg_pool_t(a));
2210 	
2211 	  a.create_time = utime_t(4,5);
2212 	  a.type = TYPE_REPLICATED;
2213 	  a.size = 2;
2214 	  a.crush_rule = 3;
2215 	  a.object_hash = 4;
2216 	  a.pg_num = 6;
2217 	  a.pgp_num = 4;
2218 	  a.pgp_num_target = 4;
2219 	  a.pg_num_target = 5;
2220 	  a.pg_num_pending = 5;
2221 	  a.last_pg_merge_meta.last_epoch_started = 2;
2222 	  a.last_pg_merge_meta.last_epoch_clean = 2;
2223 	  a.last_change = 9;
2224 	  a.last_force_op_resend = 123823;
2225 	  a.last_force_op_resend_preluminous = 123824;
2226 	  a.snap_seq = 10;
2227 	  a.snap_epoch = 11;
2228 	  a.flags = FLAG_POOL_SNAPS;
2229 	  a.auid = 12;
2230 	  a.quota_max_bytes = 473;
2231 	  a.quota_max_objects = 474;
2232 	  o.push_back(new pg_pool_t(a));
2233 	
2234 	  a.snaps[3].name = "asdf";
2235 	  a.snaps[3].snapid = 3;
2236 	  a.snaps[3].stamp = utime_t(123, 4);
2237 	  a.snaps[6].name = "qwer";
2238 	  a.snaps[6].snapid = 6;
2239 	  a.snaps[6].stamp = utime_t(23423, 4);
2240 	  o.push_back(new pg_pool_t(a));
2241 	
2242 	  a.flags = FLAG_SELFMANAGED_SNAPS;
2243 	  a.snaps.clear();
2244 	  a.removed_snaps.insert(2);
2245 	  a.quota_max_bytes = 2473;
2246 	  a.quota_max_objects = 4374;
2247 	  a.tiers.insert(0);
2248 	  a.tiers.insert(1);
2249 	  a.tier_of = 2;
2250 	  a.cache_mode = CACHEMODE_WRITEBACK;
2251 	  a.read_tier = 1;
2252 	  a.write_tier = 1;
2253 	  a.hit_set_params = HitSet::Params(new BloomHitSet::Params);
2254 	  a.hit_set_period = 3600;
2255 	  a.hit_set_count = 8;
2256 	  a.min_read_recency_for_promote = 1;
2257 	  a.min_write_recency_for_promote = 1;
2258 	  a.hit_set_grade_decay_rate = 50;
2259 	  a.hit_set_search_last_n = 1;
2260 	  a.calc_grade_table();
2261 	  a.set_stripe_width(12345);
2262 	  a.target_max_bytes = 1238132132;
2263 	  a.target_max_objects = 1232132;
2264 	  a.cache_target_dirty_ratio_micro = 187232;
2265 	  a.cache_target_dirty_high_ratio_micro = 309856;
2266 	  a.cache_target_full_ratio_micro = 987222;
2267 	  a.cache_min_flush_age = 231;
2268 	  a.cache_min_evict_age = 2321;
2269 	  a.erasure_code_profile = "profile in osdmap";
2270 	  a.expected_num_objects = 123456;
2271 	  a.fast_read = false;
2272 	  a.application_metadata = {{"rbd", {{"key", "value"}}}};
2273 	  o.push_back(new pg_pool_t(a));
2274 	}
2275 	
2276 	ostream& operator<<(ostream& out, const pg_pool_t& p)
2277 	{
2278 	  out << p.get_type_name();
2279 	  if (p.get_type_name() == "erasure") {
2280 	    out << " profile " << p.erasure_code_profile;
2281 	  }
2282 	  out << " size " << p.get_size()
2283 	      << " min_size " << p.get_min_size()
2284 	      << " crush_rule " << p.get_crush_rule()
2285 	      << " object_hash " << p.get_object_hash_name()
2286 	      << " pg_num " << p.get_pg_num()
2287 	      << " pgp_num " << p.get_pgp_num();
2288 	  if (p.get_pg_num_target() != p.get_pg_num()) {
2289 	    out << " pg_num_target " << p.get_pg_num_target();
2290 	  }
2291 	  if (p.get_pgp_num_target() != p.get_pgp_num()) {
2292 	    out << " pgp_num_target " << p.get_pgp_num_target();
2293 	  }
2294 	  if (p.get_pg_num_pending() != p.get_pg_num()) {
2295 	    out << " pg_num_pending " << p.get_pg_num_pending();
2296 	  }
2297 	  if (p.pg_autoscale_mode != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
2298 	    out << " autoscale_mode " << p.get_pg_autoscale_mode_name(p.pg_autoscale_mode);
2299 	  }
2300 	  out << " last_change " << p.get_last_change();
2301 	  if (p.get_last_force_op_resend() ||
2302 	      p.get_last_force_op_resend_prenautilus() ||
2303 	      p.get_last_force_op_resend_preluminous())
2304 	    out << " lfor " << p.get_last_force_op_resend() << "/"
2305 		<< p.get_last_force_op_resend_prenautilus() << "/"
2306 		<< p.get_last_force_op_resend_preluminous();
2307 	  if (p.get_auid())
2308 	    out << " owner " << p.get_auid();
2309 	  if (p.flags)
2310 	    out << " flags " << p.get_flags_string();
2311 	  if (p.quota_max_bytes)
2312 	    out << " max_bytes " << p.quota_max_bytes;
2313 	  if (p.quota_max_objects)
2314 	    out << " max_objects " << p.quota_max_objects;
2315 	  if (!p.tiers.empty())
2316 	    out << " tiers " << p.tiers;
2317 	  if (p.is_tier())
2318 	    out << " tier_of " << p.tier_of;
2319 	  if (p.has_read_tier())
2320 	    out << " read_tier " << p.read_tier;
2321 	  if (p.has_write_tier())
2322 	    out << " write_tier " << p.write_tier;
2323 	  if (p.cache_mode)
2324 	    out << " cache_mode " << p.get_cache_mode_name();
2325 	  if (p.target_max_bytes)
2326 	    out << " target_bytes " << p.target_max_bytes;
2327 	  if (p.target_max_objects)
2328 	    out << " target_objects " << p.target_max_objects;
2329 	  if (p.hit_set_params.get_type() != HitSet::TYPE_NONE) {
2330 	    out << " hit_set " << p.hit_set_params
2331 		<< " " << p.hit_set_period << "s"
2332 		<< " x" << p.hit_set_count << " decay_rate "
2333 		<< p.hit_set_grade_decay_rate
2334 		<< " search_last_n " << p.hit_set_search_last_n;
2335 	  }
2336 	  if (p.min_read_recency_for_promote)
2337 	    out << " min_read_recency_for_promote " << p.min_read_recency_for_promote;
2338 	  if (p.min_write_recency_for_promote)
2339 	    out << " min_write_recency_for_promote " << p.min_write_recency_for_promote;
2340 	  out << " stripe_width " << p.get_stripe_width();
2341 	  if (p.expected_num_objects)
2342 	    out << " expected_num_objects " << p.expected_num_objects;
2343 	  if (p.fast_read)
2344 	    out << " fast_read " << p.fast_read;
2345 	  out << p.opts;
2346 	  if (!p.application_metadata.empty()) {
2347 	    out << " application ";
2348 	    for (auto it = p.application_metadata.begin();
2349 	         it != p.application_metadata.end(); ++it) {
2350 	      if (it != p.application_metadata.begin())
2351 	        out << ",";
2352 	      out << it->first;
2353 	    }
2354 	  }
2355 	  return out;
2356 	}
2357 	
2358 	
2359 	// -- object_stat_sum_t --
2360 	
2361 	void object_stat_sum_t::dump(Formatter *f) const
2362 	{
2363 	  f->dump_int("num_bytes", num_bytes);
2364 	  f->dump_int("num_objects", num_objects);
2365 	  f->dump_int("num_object_clones", num_object_clones);
2366 	  f->dump_int("num_object_copies", num_object_copies);
2367 	  f->dump_int("num_objects_missing_on_primary", num_objects_missing_on_primary);
2368 	  f->dump_int("num_objects_missing", num_objects_missing);
2369 	  f->dump_int("num_objects_degraded", num_objects_degraded);
2370 	  f->dump_int("num_objects_misplaced", num_objects_misplaced);
2371 	  f->dump_int("num_objects_unfound", num_objects_unfound);
2372 	  f->dump_int("num_objects_dirty", num_objects_dirty);
2373 	  f->dump_int("num_whiteouts", num_whiteouts);
2374 	  f->dump_int("num_read", num_rd);
2375 	  f->dump_int("num_read_kb", num_rd_kb);
2376 	  f->dump_int("num_write", num_wr);
2377 	  f->dump_int("num_write_kb", num_wr_kb);
2378 	  f->dump_int("num_scrub_errors", num_scrub_errors);
2379 	  f->dump_int("num_shallow_scrub_errors", num_shallow_scrub_errors);
2380 	  f->dump_int("num_deep_scrub_errors", num_deep_scrub_errors);
2381 	  f->dump_int("num_objects_recovered", num_objects_recovered);
2382 	  f->dump_int("num_bytes_recovered", num_bytes_recovered);
2383 	  f->dump_int("num_keys_recovered", num_keys_recovered);
2384 	  f->dump_int("num_objects_omap", num_objects_omap);
2385 	  f->dump_int("num_objects_hit_set_archive", num_objects_hit_set_archive);
2386 	  f->dump_int("num_bytes_hit_set_archive", num_bytes_hit_set_archive);
2387 	  f->dump_int("num_flush", num_flush);
2388 	  f->dump_int("num_flush_kb", num_flush_kb);
2389 	  f->dump_int("num_evict", num_evict);
2390 	  f->dump_int("num_evict_kb", num_evict_kb);
2391 	  f->dump_int("num_promote", num_promote);
2392 	  f->dump_int("num_flush_mode_high", num_flush_mode_high);
2393 	  f->dump_int("num_flush_mode_low", num_flush_mode_low);
2394 	  f->dump_int("num_evict_mode_some", num_evict_mode_some);
2395 	  f->dump_int("num_evict_mode_full", num_evict_mode_full);
2396 	  f->dump_int("num_objects_pinned", num_objects_pinned);
2397 	  f->dump_int("num_legacy_snapsets", num_legacy_snapsets);
2398 	  f->dump_int("num_large_omap_objects", num_large_omap_objects);
2399 	  f->dump_int("num_objects_manifest", num_objects_manifest);
2400 	  f->dump_int("num_omap_bytes", num_omap_bytes);
2401 	  f->dump_int("num_omap_keys", num_omap_keys);
2402 	  f->dump_int("num_objects_repaired", num_objects_repaired);
2403 	}
2404 	
/**
 * Serialize all counters into @p bl as version 20 (compat 14).
 *
 * Two build-dependent paths:
 *  - CEPH_LITTLE_ENDIAN defined: sizeof(object_stat_sum_t) raw bytes starting
 *    at the address of num_bytes are appended in a single call.
 *  - otherwise: each counter is encoded individually in the order below.
 *
 * NOTE(review): for both paths to produce the same bytes, the struct's
 * in-memory member order and padding would have to match the field-by-field
 * order here — confirm against the struct definition before adding,
 * removing or reordering members.
 *
 * @param bl  buffer the encoded bytes are appended to
 */
void object_stat_sum_t::encode(ceph::buffer::list& bl) const
{
  ENCODE_START(20, 14, bl);
#if defined(CEPH_LITTLE_ENDIAN)
  // whole-struct copy, starting at the num_bytes member
  bl.append((char *)(&num_bytes), sizeof(object_stat_sum_t));
#else
  encode(num_bytes, bl);
  encode(num_objects, bl);
  encode(num_object_clones, bl);
  encode(num_object_copies, bl);
  encode(num_objects_missing_on_primary, bl);
  encode(num_objects_degraded, bl);
  encode(num_objects_unfound, bl);
  encode(num_rd, bl);
  encode(num_rd_kb, bl);
  encode(num_wr, bl);
  encode(num_wr_kb, bl);
  encode(num_scrub_errors, bl);
  encode(num_objects_recovered, bl);
  encode(num_bytes_recovered, bl);
  encode(num_keys_recovered, bl);
  encode(num_shallow_scrub_errors, bl);
  encode(num_deep_scrub_errors, bl);
  encode(num_objects_dirty, bl);
  encode(num_whiteouts, bl);
  encode(num_objects_omap, bl);
  encode(num_objects_hit_set_archive, bl);
  encode(num_objects_misplaced, bl);
  encode(num_bytes_hit_set_archive, bl);
  encode(num_flush, bl);
  encode(num_flush_kb, bl);
  encode(num_evict, bl);
  encode(num_evict_kb, bl);
  encode(num_promote, bl);
  encode(num_flush_mode_high, bl);
  encode(num_flush_mode_low, bl);
  encode(num_evict_mode_some, bl);
  encode(num_evict_mode_full, bl);
  encode(num_objects_pinned, bl);
  encode(num_objects_missing, bl);
  encode(num_legacy_snapsets, bl);
  encode(num_large_omap_objects, bl);
  encode(num_objects_manifest, bl);
  encode(num_omap_bytes, bl);
  encode(num_omap_keys, bl);
  encode(num_objects_repaired, bl);
#endif
  ENCODE_FINISH(bl);
}
2454 	
/**
 * Deserialize the counters from @p bl.
 *
 * On builds with CEPH_LITTLE_ENDIAN defined, an encoding whose struct_v
 * equals STAT_SUM_DECODE_VERSION is read with one raw copy of
 * sizeof(object_stat_sum_t) bytes into the object, starting at num_bytes.
 * In every other case the counters are decoded one by one; the trailing
 * counters are read only if struct_v is new enough.  Of those, only
 * num_legacy_snapsets gets a fallback value when absent.
 *
 * @param bl  iterator positioned at the start of an encoded
 *            object_stat_sum_t; advanced past it on return
 */
void object_stat_sum_t::decode(ceph::buffer::list::const_iterator& bl)
{
  // set once the raw-copy path has already filled in every counter
  bool decode_finish = false;
  static const int STAT_SUM_DECODE_VERSION = 20;
  DECODE_START(STAT_SUM_DECODE_VERSION, bl);
#if defined(CEPH_LITTLE_ENDIAN)
  // the raw copy is taken only for an exact version match
  if (struct_v == STAT_SUM_DECODE_VERSION) {
    bl.copy(sizeof(object_stat_sum_t), (char*)(&num_bytes));
    decode_finish = true;
  }
#endif
  if (!decode_finish) {
    decode(num_bytes, bl);
    decode(num_objects, bl);
    decode(num_object_clones, bl);
    decode(num_object_copies, bl);
    decode(num_objects_missing_on_primary, bl);
    decode(num_objects_degraded, bl);
    decode(num_objects_unfound, bl);
    decode(num_rd, bl);
    decode(num_rd_kb, bl);
    decode(num_wr, bl);
    decode(num_wr_kb, bl);
    decode(num_scrub_errors, bl);
    decode(num_objects_recovered, bl);
    decode(num_bytes_recovered, bl);
    decode(num_keys_recovered, bl);
    decode(num_shallow_scrub_errors, bl);
    decode(num_deep_scrub_errors, bl);
    decode(num_objects_dirty, bl);
    decode(num_whiteouts, bl);
    decode(num_objects_omap, bl);
    decode(num_objects_hit_set_archive, bl);
    decode(num_objects_misplaced, bl);
    decode(num_bytes_hit_set_archive, bl);
    decode(num_flush, bl);
    decode(num_flush_kb, bl);
    decode(num_evict, bl);
    decode(num_evict_kb, bl);
    decode(num_promote, bl);
    decode(num_flush_mode_high, bl);
    decode(num_flush_mode_low, bl);
    decode(num_evict_mode_some, bl);
    decode(num_evict_mode_full, bl);
    decode(num_objects_pinned, bl);
    decode(num_objects_missing, bl);
    // counters below were appended in later versions
    if (struct_v >= 16) {
      decode(num_legacy_snapsets, bl);
    } else {
      num_legacy_snapsets = num_object_clones;  // upper bound
    }
    if (struct_v >= 17) {
      decode(num_large_omap_objects, bl);
    }
    if (struct_v >= 18) {
      decode(num_objects_manifest, bl);
    }
    if (struct_v >= 19) {
      decode(num_omap_bytes, bl);
      decode(num_omap_keys, bl);
    }
    if (struct_v >= 20) {
      decode(num_objects_repaired, bl);
    }
  }
  DECODE_FINISH(bl);
}
2522 	
2523 	void object_stat_sum_t::generate_test_instances(list<object_stat_sum_t*>& o)
2524 	{
2525 	  object_stat_sum_t a;
2526 	
2527 	  a.num_bytes = 1;
2528 	  a.num_objects = 3;
2529 	  a.num_object_clones = 4;
2530 	  a.num_object_copies = 5;
2531 	  a.num_objects_missing_on_primary = 6;
2532 	  a.num_objects_missing = 123;
2533 	  a.num_objects_degraded = 7;
2534 	  a.num_objects_unfound = 8;
2535 	  a.num_rd = 9; a.num_rd_kb = 10;
2536 	  a.num_wr = 11; a.num_wr_kb = 12;
2537 	  a.num_objects_recovered = 14;
2538 	  a.num_bytes_recovered = 15;
2539 	  a.num_keys_recovered = 16;
2540 	  a.num_deep_scrub_errors = 17;
2541 	  a.num_shallow_scrub_errors = 18;
2542 	  a.num_scrub_errors = a.num_deep_scrub_errors + a.num_shallow_scrub_errors;
2543 	  a.num_objects_dirty = 21;
2544 	  a.num_whiteouts = 22;
2545 	  a.num_objects_misplaced = 1232;
2546 	  a.num_objects_hit_set_archive = 2;
2547 	  a.num_bytes_hit_set_archive = 27;
2548 	  a.num_flush = 5;
2549 	  a.num_flush_kb = 6;
2550 	  a.num_evict = 7;
2551 	  a.num_evict_kb = 8;
2552 	  a.num_promote = 9;
2553 	  a.num_flush_mode_high = 0;
2554 	  a.num_flush_mode_low = 1;
2555 	  a.num_evict_mode_some = 1;
2556 	  a.num_evict_mode_full = 0;
2557 	  a.num_objects_pinned = 20;
2558 	  a.num_large_omap_objects = 5;
2559 	  a.num_objects_manifest = 2;
2560 	  a.num_omap_bytes = 20000;
2561 	  a.num_omap_keys = 200;
2562 	  a.num_objects_repaired = 300;
2563 	  o.push_back(new object_stat_sum_t(a));
2564 	}
2565 	
2566 	void object_stat_sum_t::add(const object_stat_sum_t& o)
2567 	{
2568 	  num_bytes += o.num_bytes;
2569 	  num_objects += o.num_objects;
2570 	  num_object_clones += o.num_object_clones;
2571 	  num_object_copies += o.num_object_copies;
2572 	  num_objects_missing_on_primary += o.num_objects_missing_on_primary;
2573 	  num_objects_missing += o.num_objects_missing;
2574 	  num_objects_degraded += o.num_objects_degraded;
2575 	  num_objects_misplaced += o.num_objects_misplaced;
2576 	  num_rd += o.num_rd;
2577 	  num_rd_kb += o.num_rd_kb;
2578 	  num_wr += o.num_wr;
2579 	  num_wr_kb += o.num_wr_kb;
2580 	  num_objects_unfound += o.num_objects_unfound;
2581 	  num_scrub_errors += o.num_scrub_errors;
2582 	  num_shallow_scrub_errors += o.num_shallow_scrub_errors;
2583 	  num_deep_scrub_errors += o.num_deep_scrub_errors;
2584 	  num_objects_recovered += o.num_objects_recovered;
2585 	  num_bytes_recovered += o.num_bytes_recovered;
2586 	  num_keys_recovered += o.num_keys_recovered;
2587 	  num_objects_dirty += o.num_objects_dirty;
2588 	  num_whiteouts += o.num_whiteouts;
2589 	  num_objects_omap += o.num_objects_omap;
2590 	  num_objects_hit_set_archive += o.num_objects_hit_set_archive;
2591 	  num_bytes_hit_set_archive += o.num_bytes_hit_set_archive;
2592 	  num_flush += o.num_flush;
2593 	  num_flush_kb += o.num_flush_kb;
2594 	  num_evict += o.num_evict;
2595 	  num_evict_kb += o.num_evict_kb;
2596 	  num_promote += o.num_promote;
2597 	  num_flush_mode_high += o.num_flush_mode_high;
2598 	  num_flush_mode_low += o.num_flush_mode_low;
2599 	  num_evict_mode_some += o.num_evict_mode_some;
2600 	  num_evict_mode_full += o.num_evict_mode_full;
2601 	  num_objects_pinned += o.num_objects_pinned;
2602 	  num_legacy_snapsets += o.num_legacy_snapsets;
2603 	  num_large_omap_objects += o.num_large_omap_objects;
2604 	  num_objects_manifest += o.num_objects_manifest;
2605 	  num_omap_bytes += o.num_omap_bytes;
2606 	  num_omap_keys += o.num_omap_keys;
2607 	  num_objects_repaired += o.num_objects_repaired;
2608 	}
2609 	
2610 	void object_stat_sum_t::sub(const object_stat_sum_t& o)
2611 	{
2612 	  num_bytes -= o.num_bytes;
2613 	  num_objects -= o.num_objects;
2614 	  num_object_clones -= o.num_object_clones;
2615 	  num_object_copies -= o.num_object_copies;
2616 	  num_objects_missing_on_primary -= o.num_objects_missing_on_primary;
2617 	  num_objects_missing -= o.num_objects_missing;
2618 	  num_objects_degraded -= o.num_objects_degraded;
2619 	  num_objects_misplaced -= o.num_objects_misplaced;
2620 	  num_rd -= o.num_rd;
2621 	  num_rd_kb -= o.num_rd_kb;
2622 	  num_wr -= o.num_wr;
2623 	  num_wr_kb -= o.num_wr_kb;
2624 	  num_objects_unfound -= o.num_objects_unfound;
2625 	  num_scrub_errors -= o.num_scrub_errors;
2626 	  num_shallow_scrub_errors -= o.num_shallow_scrub_errors;
2627 	  num_deep_scrub_errors -= o.num_deep_scrub_errors;
2628 	  num_objects_recovered -= o.num_objects_recovered;
2629 	  num_bytes_recovered -= o.num_bytes_recovered;
2630 	  num_keys_recovered -= o.num_keys_recovered;
2631 	  num_objects_dirty -= o.num_objects_dirty;
2632 	  num_whiteouts -= o.num_whiteouts;
2633 	  num_objects_omap -= o.num_objects_omap;
2634 	  num_objects_hit_set_archive -= o.num_objects_hit_set_archive;
2635 	  num_bytes_hit_set_archive -= o.num_bytes_hit_set_archive;
2636 	  num_flush -= o.num_flush;
2637 	  num_flush_kb -= o.num_flush_kb;
2638 	  num_evict -= o.num_evict;
2639 	  num_evict_kb -= o.num_evict_kb;
2640 	  num_promote -= o.num_promote;
2641 	  num_flush_mode_high -= o.num_flush_mode_high;
2642 	  num_flush_mode_low -= o.num_flush_mode_low;
2643 	  num_evict_mode_some -= o.num_evict_mode_some;
2644 	  num_evict_mode_full -= o.num_evict_mode_full;
2645 	  num_objects_pinned -= o.num_objects_pinned;
2646 	  num_legacy_snapsets -= o.num_legacy_snapsets;
2647 	  num_large_omap_objects -= o.num_large_omap_objects;
2648 	  num_objects_manifest -= o.num_objects_manifest;
2649 	  num_omap_bytes -= o.num_omap_bytes;
2650 	  num_omap_keys -= o.num_omap_keys;
2651 	  num_objects_repaired -= o.num_objects_repaired;
2652 	}
2653 	
2654 	bool operator==(const object_stat_sum_t& l, const object_stat_sum_t& r)
2655 	{
2656 	  return
2657 	    l.num_bytes == r.num_bytes &&
2658 	    l.num_objects == r.num_objects &&
2659 	    l.num_object_clones == r.num_object_clones &&
2660 	    l.num_object_copies == r.num_object_copies &&
2661 	    l.num_objects_missing_on_primary == r.num_objects_missing_on_primary &&
2662 	    l.num_objects_missing == r.num_objects_missing &&
2663 	    l.num_objects_degraded == r.num_objects_degraded &&
2664 	    l.num_objects_misplaced == r.num_objects_misplaced &&
2665 	    l.num_objects_unfound == r.num_objects_unfound &&
2666 	    l.num_rd == r.num_rd &&
2667 	    l.num_rd_kb == r.num_rd_kb &&
2668 	    l.num_wr == r.num_wr &&
2669 	    l.num_wr_kb == r.num_wr_kb &&
2670 	    l.num_scrub_errors == r.num_scrub_errors &&
2671 	    l.num_shallow_scrub_errors == r.num_shallow_scrub_errors &&
2672 	    l.num_deep_scrub_errors == r.num_deep_scrub_errors &&
2673 	    l.num_objects_recovered == r.num_objects_recovered &&
2674 	    l.num_bytes_recovered == r.num_bytes_recovered &&
2675 	    l.num_keys_recovered == r.num_keys_recovered &&
2676 	    l.num_objects_dirty == r.num_objects_dirty &&
2677 	    l.num_whiteouts == r.num_whiteouts &&
2678 	    l.num_objects_omap == r.num_objects_omap &&
2679 	    l.num_objects_hit_set_archive == r.num_objects_hit_set_archive &&
2680 	    l.num_bytes_hit_set_archive == r.num_bytes_hit_set_archive &&
2681 	    l.num_flush == r.num_flush &&
2682 	    l.num_flush_kb == r.num_flush_kb &&
2683 	    l.num_evict == r.num_evict &&
2684 	    l.num_evict_kb == r.num_evict_kb &&
2685 	    l.num_promote == r.num_promote &&
2686 	    l.num_flush_mode_high == r.num_flush_mode_high &&
2687 	    l.num_flush_mode_low == r.num_flush_mode_low &&
2688 	    l.num_evict_mode_some == r.num_evict_mode_some &&
2689 	    l.num_evict_mode_full == r.num_evict_mode_full &&
2690 	    l.num_objects_pinned == r.num_objects_pinned &&
2691 	    l.num_legacy_snapsets == r.num_legacy_snapsets &&
2692 	    l.num_large_omap_objects == r.num_large_omap_objects &&
2693 	    l.num_objects_manifest == r.num_objects_manifest &&
2694 	    l.num_omap_bytes == r.num_omap_bytes &&
2695 	    l.num_omap_keys == r.num_omap_keys &&
2696 	    l.num_objects_repaired == r.num_objects_repaired;
2697 	}
2698 	
2699 	// -- object_stat_collection_t --
2700 	
2701 	void object_stat_collection_t::dump(Formatter *f) const
2702 	{
2703 	  f->open_object_section("stat_sum");
2704 	  sum.dump(f);
2705 	  f->close_section();
2706 	}
2707 	
/// Serialize this collection into @p bl as a versioned struct (v2, compat v2).
void object_stat_collection_t::encode(ceph::buffer::list& bl) const
{
  ENCODE_START(2, 2, bl);
  encode(sum, bl);
  // 32-bit zero placeholder: decode() reads a map<string,object_stat_sum_t>
  // at this position and discards it, so this is presumably the element
  // count of an always-empty map kept only for wire compatibility.
  encode((__u32)0, bl);
  ENCODE_FINISH(bl);
}
2715 	
/// Deserialize a collection from @p bl; only `sum` is retained.
void object_stat_collection_t::decode(ceph::buffer::list::const_iterator& bl)
{
  DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
  decode(sum, bl);
  {
    // A map follows `sum` in the encoding.  It is decoded into a scratch
    // variable solely to advance the iterator and is then thrown away.
    map<string,object_stat_sum_t> cat_sum;
    decode(cat_sum, bl);
  }
  DECODE_FINISH(bl);
}
2726 	
2727 	void object_stat_collection_t::generate_test_instances(list<object_stat_collection_t*>& o)
2728 	{
2729 	  object_stat_collection_t a;
2730 	  o.push_back(new object_stat_collection_t(a));
2731 	  list<object_stat_sum_t*> l;
2732 	  object_stat_sum_t::generate_test_instances(l);
2733 	  for (auto p = l.begin(); p != l.end(); ++p) {
2734 	    a.add(**p);
2735 	    o.push_back(new object_stat_collection_t(a));
2736 	  }
2737 	}
2738 	
2739 	
2740 	// -- pg_stat_t --
2741 	
2742 	bool pg_stat_t::is_acting_osd(int32_t osd, bool primary) const
2743 	{
2744 	  if (primary && osd == acting_primary) {
2745 	    return true;
2746 	  } else if (!primary) {
2747 	    for(auto it = acting.cbegin(); it != acting.cend(); ++it)
2748 	    {
2749 	      if (*it == osd)
2750 	        return true;
2751 	    }
2752 	  }
2753 	  return false;
2754 	}
2755 	
2756 	void pg_stat_t::dump(Formatter *f) const
2757 	{
2758 	  f->dump_stream("version") << version;
2759 	  f->dump_stream("reported_seq") << reported_seq;
2760 	  f->dump_stream("reported_epoch") << reported_epoch;
2761 	  f->dump_string("state", pg_state_string(state));
2762 	  f->dump_stream("last_fresh") << last_fresh;
2763 	  f->dump_stream("last_change") << last_change;
2764 	  f->dump_stream("last_active") << last_active;
2765 	  f->dump_stream("last_peered") << last_peered;
2766 	  f->dump_stream("last_clean") << last_clean;
2767 	  f->dump_stream("last_became_active") << last_became_active;
2768 	  f->dump_stream("last_became_peered") << last_became_peered;
2769 	  f->dump_stream("last_unstale") << last_unstale;
2770 	  f->dump_stream("last_undegraded") << last_undegraded;
2771 	  f->dump_stream("last_fullsized") << last_fullsized;
2772 	  f->dump_unsigned("mapping_epoch", mapping_epoch);
2773 	  f->dump_stream("log_start") << log_start;
2774 	  f->dump_stream("ondisk_log_start") << ondisk_log_start;
2775 	  f->dump_unsigned("created", created);
2776 	  f->dump_unsigned("last_epoch_clean", last_epoch_clean);
2777 	  f->dump_stream("parent") << parent;
2778 	  f->dump_unsigned("parent_split_bits", parent_split_bits);
2779 	  f->dump_stream("last_scrub") << last_scrub;
2780 	  f->dump_stream("last_scrub_stamp") << last_scrub_stamp;
2781 	  f->dump_stream("last_deep_scrub") << last_deep_scrub;
2782 	  f->dump_stream("last_deep_scrub_stamp") << last_deep_scrub_stamp;
2783 	  f->dump_stream("last_clean_scrub_stamp") << last_clean_scrub_stamp;
2784 	  f->dump_int("log_size", log_size);
2785 	  f->dump_int("ondisk_log_size", ondisk_log_size);
2786 	  f->dump_bool("stats_invalid", stats_invalid);
2787 	  f->dump_bool("dirty_stats_invalid", dirty_stats_invalid);
2788 	  f->dump_bool("omap_stats_invalid", omap_stats_invalid);
2789 	  f->dump_bool("hitset_stats_invalid", hitset_stats_invalid);
2790 	  f->dump_bool("hitset_bytes_stats_invalid", hitset_bytes_stats_invalid);
2791 	  f->dump_bool("pin_stats_invalid", pin_stats_invalid);
2792 	  f->dump_bool("manifest_stats_invalid", manifest_stats_invalid);
2793 	  f->dump_unsigned("snaptrimq_len", snaptrimq_len);
2794 	  stats.dump(f);
2795 	  f->open_array_section("up");
2796 	  for (auto p = up.cbegin(); p != up.cend(); ++p)
2797 	    f->dump_int("osd", *p);
2798 	  f->close_section();
2799 	  f->open_array_section("acting");
2800 	  for (auto p = acting.cbegin(); p != acting.cend(); ++p)
2801 	    f->dump_int("osd", *p);
2802 	  f->close_section();
2803 	  f->open_array_section("avail_no_missing");
2804 	  for (auto p = avail_no_missing.cbegin(); p != avail_no_missing.cend(); ++p)
2805 	    f->dump_stream("shard") << *p;
2806 	  f->close_section();
2807 	  f->open_array_section("object_location_counts");
2808 	  for (auto p = object_location_counts.cbegin(); p != object_location_counts.cend(); ++p) {
2809 	    f->open_object_section("entry");
2810 	    f->dump_stream("shards") << p->first;
2811 	    f->dump_int("objects", p->second);
2812 	    f->close_section();
2813 	  }
2814 	  f->close_section();
2815 	  f->open_array_section("blocked_by");
2816 	  for (auto p = blocked_by.cbegin(); p != blocked_by.cend(); ++p)
2817 	    f->dump_int("osd", *p);
2818 	  f->close_section();
2819 	  f->dump_int("up_primary", up_primary);
2820 	  f->dump_int("acting_primary", acting_primary);
2821 	  f->open_array_section("purged_snaps");
2822 	  for (auto i = purged_snaps.begin(); i != purged_snaps.end(); ++i) {
2823 	    f->open_object_section("interval");
2824 	    f->dump_stream("start") << i.get_start();
2825 	    f->dump_stream("length") << i.get_len();
2826 	    f->close_section();
2827 	  }
2828 	  f->close_section();
2829 	}
2830 	
2831 	void pg_stat_t::dump_brief(Formatter *f) const
2832 	{
2833 	  f->dump_string("state", pg_state_string(state));
2834 	  f->open_array_section("up");
2835 	  for (auto p = up.cbegin(); p != up.cend(); ++p)
2836 	    f->dump_int("osd", *p);
2837 	  f->close_section();
2838 	  f->open_array_section("acting");
2839 	  for (auto p = acting.cbegin(); p != acting.cend(); ++p)
2840 	    f->dump_int("osd", *p);
2841 	  f->close_section();
2842 	  f->dump_int("up_primary", up_primary);
2843 	  f->dump_int("acting_primary", acting_primary);
2844 	}
2845 	
/**
 * Serialize this PG stat record into @p bl as struct version 26, compat 22.
 *
 * Fields are appended in a fixed order that decode() mirrors; new fields are
 * only ever added at the tail so that decode() can gate them on struct_v.
 */
void pg_stat_t::encode(ceph::buffer::list &bl) const
{
  ENCODE_START(26, 22, bl);
  encode(version, bl);
  encode(reported_seq, bl);
  encode(reported_epoch, bl);
  // Only the low 32 bits of state go here, for older peers; the upper 32
  // bits are written further down as top_state.
  encode((__u32)state, bl);   // for older peers
  encode(log_start, bl);
  encode(ondisk_log_start, bl);
  encode(created, bl);
  encode(last_epoch_clean, bl);
  encode(parent, bl);
  encode(parent_split_bits, bl);
  encode(last_scrub, bl);
  encode(last_scrub_stamp, bl);
  encode(stats, bl);
  encode(log_size, bl);
  encode(ondisk_log_size, bl);
  encode(up, bl);
  encode(acting, bl);
  encode(last_fresh, bl);
  encode(last_change, bl);
  encode(last_active, bl);
  encode(last_clean, bl);
  encode(last_unstale, bl);
  encode(mapping_epoch, bl);
  encode(last_deep_scrub, bl);
  encode(last_deep_scrub_stamp, bl);
  encode(stats_invalid, bl);
  encode(last_clean_scrub_stamp, bl);
  encode(last_became_active, bl);
  encode(dirty_stats_invalid, bl);
  encode(up_primary, bl);
  encode(acting_primary, bl);
  encode(omap_stats_invalid, bl);
  encode(hitset_stats_invalid, bl);
  encode(blocked_by, bl);
  encode(last_undegraded, bl);
  encode(last_fullsized, bl);
  encode(hitset_bytes_stats_invalid, bl);
  encode(last_peered, bl);
  encode(last_became_peered, bl);
  encode(pin_stats_invalid, bl);
  // Everything from here on is read by decode() only when struct_v >= 23.
  encode(snaptrimq_len, bl);
  // Upper 32 bits of state (decode() reads these when struct_v >= 24).
  __u32 top_state = (state >> 32);
  encode(top_state, bl);
  encode(purged_snaps, bl);
  encode(manifest_stats_invalid, bl);
  encode(avail_no_missing, bl);
  encode(object_location_counts, bl);
  ENCODE_FINISH(bl);
}
2898 	
2899 	void pg_stat_t::decode(ceph::buffer::list::const_iterator &bl)
2900 	{
2901 	  bool tmp;
2902 	  uint32_t old_state;
2903 	  DECODE_START(26, bl);
2904 	  decode(version, bl);
2905 	  decode(reported_seq, bl);
2906 	  decode(reported_epoch, bl);
2907 	  decode(old_state, bl);
2908 	  decode(log_start, bl);
2909 	  decode(ondisk_log_start, bl);
2910 	  decode(created, bl);
2911 	  decode(last_epoch_clean, bl);
2912 	  decode(parent, bl);
2913 	  decode(parent_split_bits, bl);
2914 	  decode(last_scrub, bl);
2915 	  decode(last_scrub_stamp, bl);
2916 	  decode(stats, bl);
2917 	  decode(log_size, bl);
2918 	  decode(ondisk_log_size, bl);
2919 	  decode(up, bl);
2920 	  decode(acting, bl);
2921 	  decode(last_fresh, bl);
2922 	  decode(last_change, bl);
2923 	  decode(last_active, bl);
2924 	  decode(last_clean, bl);
2925 	  decode(last_unstale, bl);
2926 	  decode(mapping_epoch, bl);
2927 	  decode(last_deep_scrub, bl);
2928 	  decode(last_deep_scrub_stamp, bl);
2929 	  decode(tmp, bl);
2930 	  stats_invalid = tmp;
2931 	  decode(last_clean_scrub_stamp, bl);
2932 	  decode(last_became_active, bl);
2933 	  decode(tmp, bl);
2934 	  dirty_stats_invalid = tmp;
2935 	  decode(up_primary, bl);
2936 	  decode(acting_primary, bl);
2937 	  decode(tmp, bl);
2938 	  omap_stats_invalid = tmp;
2939 	  decode(tmp, bl);
2940 	  hitset_stats_invalid = tmp;
2941 	  decode(blocked_by, bl);
2942 	  decode(last_undegraded, bl);
2943 	  decode(last_fullsized, bl);
2944 	  decode(tmp, bl);
2945 	  hitset_bytes_stats_invalid = tmp;
2946 	  decode(last_peered, bl);
2947 	  decode(last_became_peered, bl);
2948 	  decode(tmp, bl);
2949 	  pin_stats_invalid = tmp;
2950 	  if (struct_v >= 23) {
2951 	    decode(snaptrimq_len, bl);
2952 	    if (struct_v >= 24) {
2953 	      __u32 top_state;
2954 	      decode(top_state, bl);
2955 	      state = (uint64_t)old_state | ((uint64_t)top_state << 32);
2956 	      decode(purged_snaps, bl);
2957 	    } else {
2958 	      state = old_state;
2959 	    }
2960 	    if (struct_v >= 25) {
2961 	      decode(tmp, bl);
2962 	      manifest_stats_invalid = tmp;
2963 	    } else {
2964 	      manifest_stats_invalid = true;
2965 	    }
2966 	    if (struct_v >= 26) {
2967 	      decode(avail_no_missing, bl);
2968 	      decode(object_location_counts, bl);
2969 	    }
2970 	  }
2971 	  DECODE_FINISH(bl);
2972 	}
2973 	
2974 	void pg_stat_t::generate_test_instances(list<pg_stat_t*>& o)
2975 	{
2976 	  pg_stat_t a;
2977 	  o.push_back(new pg_stat_t(a));
2978 	
2979 	  a.version = eversion_t(1, 3);
2980 	  a.reported_epoch = 1;
2981 	  a.reported_seq = 2;
2982 	  a.state = 123;
2983 	  a.mapping_epoch = 998;
2984 	  a.last_fresh = utime_t(1002, 1);
2985 	  a.last_change = utime_t(1002, 2);
2986 	  a.last_active = utime_t(1002, 3);
2987 	  a.last_clean = utime_t(1002, 4);
2988 	  a.last_unstale = utime_t(1002, 5);
2989 	  a.last_undegraded = utime_t(1002, 7);
2990 	  a.last_fullsized = utime_t(1002, 8);
2991 	  a.log_start = eversion_t(1, 4);
2992 	  a.ondisk_log_start = eversion_t(1, 5);
2993 	  a.created = 6;
2994 	  a.last_epoch_clean = 7;
2995 	  a.parent = pg_t(1, 2);
2996 	  a.parent_split_bits = 12;
2997 	  a.last_scrub = eversion_t(9, 10);
2998 	  a.last_scrub_stamp = utime_t(11, 12);
2999 	  a.last_deep_scrub = eversion_t(13, 14);
3000 	  a.last_deep_scrub_stamp = utime_t(15, 16);
3001 	  a.last_clean_scrub_stamp = utime_t(17, 18);
3002 	  a.snaptrimq_len = 1048576;
3003 	  list<object_stat_collection_t*> l;
3004 	  object_stat_collection_t::generate_test_instances(l);
3005 	  a.stats = *l.back();
3006 	  a.log_size = 99;
3007 	  a.ondisk_log_size = 88;
3008 	  a.up.push_back(123);
3009 	  a.up_primary = 123;
3010 	  a.acting.push_back(456);
3011 	  a.avail_no_missing.push_back(pg_shard_t(456, shard_id_t::NO_SHARD));
3012 	  set<pg_shard_t> sset = { pg_shard_t(0), pg_shard_t(1) };
3013 	  a.object_location_counts.insert(make_pair(sset, 10));
3014 	  sset.insert(pg_shard_t(2));
3015 	  a.object_location_counts.insert(make_pair(sset, 5));
3016 	  a.acting_primary = 456;
3017 	  o.push_back(new pg_stat_t(a));
3018 	
3019 	  a.up.push_back(124);
3020 	  a.up_primary = 124;
3021 	  a.acting.push_back(124);
3022 	  a.acting_primary = 124;
3023 	  a.blocked_by.push_back(155);
3024 	  a.blocked_by.push_back(156);
3025 	  o.push_back(new pg_stat_t(a));
3026 	}
3027 	
3028 	bool operator==(const pg_stat_t& l, const pg_stat_t& r)
3029 	{
3030 	  return
3031 	    l.version == r.version &&
3032 	    l.reported_seq == r.reported_seq &&
3033 	    l.reported_epoch == r.reported_epoch &&
3034 	    l.state == r.state &&
3035 	    l.last_fresh == r.last_fresh &&
3036 	    l.last_change == r.last_change &&
3037 	    l.last_active == r.last_active &&
3038 	    l.last_peered == r.last_peered &&
3039 	    l.last_clean == r.last_clean &&
3040 	    l.last_unstale == r.last_unstale &&
3041 	    l.last_undegraded == r.last_undegraded &&
3042 	    l.last_fullsized == r.last_fullsized &&
3043 	    l.log_start == r.log_start &&
3044 	    l.ondisk_log_start == r.ondisk_log_start &&
3045 	    l.created == r.created &&
3046 	    l.last_epoch_clean == r.last_epoch_clean &&
3047 	    l.parent == r.parent &&
3048 	    l.parent_split_bits == r.parent_split_bits &&
3049 	    l.last_scrub == r.last_scrub &&
3050 	    l.last_deep_scrub == r.last_deep_scrub &&
3051 	    l.last_scrub_stamp == r.last_scrub_stamp &&
3052 	    l.last_deep_scrub_stamp == r.last_deep_scrub_stamp &&
3053 	    l.last_clean_scrub_stamp == r.last_clean_scrub_stamp &&
3054 	    l.stats == r.stats &&
3055 	    l.stats_invalid == r.stats_invalid &&
3056 	    l.log_size == r.log_size &&
3057 	    l.ondisk_log_size == r.ondisk_log_size &&
3058 	    l.up == r.up &&
3059 	    l.acting == r.acting &&
3060 	    l.avail_no_missing == r.avail_no_missing &&
3061 	    l.object_location_counts == r.object_location_counts &&
3062 	    l.mapping_epoch == r.mapping_epoch &&
3063 	    l.blocked_by == r.blocked_by &&
3064 	    l.last_became_active == r.last_became_active &&
3065 	    l.last_became_peered == r.last_became_peered &&
3066 	    l.dirty_stats_invalid == r.dirty_stats_invalid &&
3067 	    l.omap_stats_invalid == r.omap_stats_invalid &&
3068 	    l.hitset_stats_invalid == r.hitset_stats_invalid &&
3069 	    l.hitset_bytes_stats_invalid == r.hitset_bytes_stats_invalid &&
3070 	    l.up_primary == r.up_primary &&
3071 	    l.acting_primary == r.acting_primary &&
3072 	    l.pin_stats_invalid == r.pin_stats_invalid &&
3073 	    l.manifest_stats_invalid == r.manifest_stats_invalid &&
3074 	    l.purged_snaps == r.purged_snaps &&
3075 	    l.snaptrimq_len == r.snaptrimq_len;
3076 	}
3077 	
3078 	// -- store_statfs_t --
3079 	
3080 	bool store_statfs_t::operator==(const store_statfs_t& other) const
3081 	{
3082 	  return total == other.total
3083 	    && available == other.available
3084 	    && allocated == other.allocated
3085 	    && internally_reserved == other.internally_reserved
3086 	    && data_stored == other.data_stored
3087 	    && data_compressed == other.data_compressed
3088 	    && data_compressed_allocated == other.data_compressed_allocated
3089 	    && data_compressed_original == other.data_compressed_original
3090 	    && omap_allocated == other.omap_allocated
3091 	    && internal_metadata == other.internal_metadata;
3092 	}
3093 	
3094 	void store_statfs_t::dump(Formatter *f) const
3095 	{
3096 	  f->dump_int("total", total);
3097 	  f->dump_int("available", available);
3098 	  f->dump_int("internally_reserved", internally_reserved);
3099 	  f->dump_int("allocated", allocated);
3100 	  f->dump_int("data_stored", data_stored);
3101 	  f->dump_int("data_compressed", data_compressed);
3102 	  f->dump_int("data_compressed_allocated", data_compressed_allocated);
3103 	  f->dump_int("data_compressed_original", data_compressed_original);
3104 	  f->dump_int("omap_allocated", omap_allocated);
3105 	  f->dump_int("internal_metadata", internal_metadata);
3106 	}
3107 	
3108 	ostream& operator<<(ostream& out, const store_statfs_t &s)
3109 	{
3110 	  out << std::hex
3111 	      << "store_statfs(0x" << s.available
3112 	      << "/0x"  << s.internally_reserved
3113 	      << "/0x"  << s.total
3114 	      << ", data 0x" << s.data_stored
3115 	      << "/0x"  << s.allocated
3116 	      << ", compress 0x" << s.data_compressed
3117 	      << "/0x"  << s.data_compressed_allocated
3118 	      << "/0x"  << s.data_compressed_original
3119 	      << ", omap 0x" << s.omap_allocated
3120 	      << ", meta 0x" << s.internal_metadata
3121 	      << std::dec
3122 	      << ")";
3123 	  return out;
3124 	}
3125 	
3126 	void store_statfs_t::generate_test_instances(list<store_statfs_t*>& o)
3127 	{
3128 	  store_statfs_t a;
3129 	  o.push_back(new store_statfs_t(a));
3130 	  a.total = 234;
3131 	  a.available = 123;
3132 	  a.internally_reserved = 33;
3133 	  a.allocated = 32;
3134 	  a.data_stored = 44;
3135 	  a.data_compressed = 21;
3136 	  a.data_compressed_allocated = 12;
3137 	  a.data_compressed_original = 13;
3138 	  a.omap_allocated = 14;
3139 	  a.internal_metadata = 15;
3140 	  o.push_back(new store_statfs_t(a));
3141 	}
3142 	
3143 	// -- pool_stat_t --
3144 	
3145 	void pool_stat_t::dump(Formatter *f) const
3146 	{
3147 	  stats.dump(f);
3148 	  f->open_object_section("store_stats");
3149 	  store_stats.dump(f);
3150 	  f->close_section();
3151 	  f->dump_int("log_size", log_size);
3152 	  f->dump_int("ondisk_log_size", ondisk_log_size);
3153 	  f->dump_int("up", up);
3154 	  f->dump_int("acting", acting);
3155 	  f->dump_int("num_store_stats", num_store_stats);
3156 	}
3157 	
/**
 * Serialize the pool statistics into @p bl.
 *
 * @param bl       destination buffer
 * @param features feature bits; selects between the two wire formats below
 */
void pool_stat_t::encode(ceph::buffer::list &bl, uint64_t features) const
{
  using ceph::encode;
  // Without CEPH_FEATURE_OSDENC: a bare version byte (4) followed by stats
  // and the two log sizes, with no ENCODE_START/ENCODE_FINISH framing.
  if ((features & CEPH_FEATURE_OSDENC) == 0) {
    __u8 v = 4;
    encode(v, bl);
    encode(stats, bl);
    encode(log_size, bl);
    encode(ondisk_log_size, bl);
    return;
  }

  // Current format: struct version 7, compat 5.  decode() reads up/acting
  // when struct_v >= 6 and the store stats when struct_v >= 7.
  ENCODE_START(7, 5, bl);
  encode(stats, bl);
  encode(log_size, bl);
  encode(ondisk_log_size, bl);
  encode(up, bl);
  encode(acting, bl);
  encode(store_stats, bl);
  encode(num_store_stats, bl);
  ENCODE_FINISH(bl);
}
3180 	
/**
 * Deserialize pool statistics from @p bl.
 *
 * Encodings with struct_v >= 4 carry `stats` as a whole; fields added later
 * are reset to empty/zero when the encoding predates them.  Older encodings
 * (struct_v < 4) list individual counters, which are decoded straight into
 * `stats.sum`.
 */
void pool_stat_t::decode(ceph::buffer::list::const_iterator &bl)
{
  DECODE_START_LEGACY_COMPAT_LEN(7, 5, 5, bl);
  if (struct_v >= 4) {
    decode(stats, bl);
    decode(log_size, bl);
    decode(ondisk_log_size, bl);
    if (struct_v >= 6) {
      decode(up, bl);
      decode(acting, bl);
    } else {
      up = 0;
      acting = 0;
    }
    if (struct_v >= 7) {
      decode(store_stats, bl);
      decode(num_store_stats, bl);
    } else {
      store_stats.reset();
      num_store_stats = 0;
    }

  } else {
    decode(stats.sum.num_bytes, bl);
    // num_kb is read only to advance the iterator; the value is not stored.
    uint64_t num_kb;
    decode(num_kb, bl);
    decode(stats.sum.num_objects, bl);
    decode(stats.sum.num_object_clones, bl);
    decode(stats.sum.num_object_copies, bl);
    decode(stats.sum.num_objects_missing_on_primary, bl);
    decode(stats.sum.num_objects_degraded, bl);
    decode(log_size, bl);
    decode(ondisk_log_size, bl);
    if (struct_v >= 2) {
      decode(stats.sum.num_rd, bl);
      decode(stats.sum.num_rd_kb, bl);
      decode(stats.sum.num_wr, bl);
      decode(stats.sum.num_wr_kb, bl);
    }
    if (struct_v >= 3) {
      decode(stats.sum.num_objects_unfound, bl);
    }
    // NOTE(review): this legacy branch does not touch up, acting,
    // store_stats or num_store_stats; they keep their prior values.
  }
  DECODE_FINISH(bl);
}
3226 	
3227 	void pool_stat_t::generate_test_instances(list<pool_stat_t*>& o)
3228 	{
3229 	  pool_stat_t a;
3230 	  o.push_back(new pool_stat_t(a));
3231 	
3232 	  list<object_stat_collection_t*> l;
3233 	  object_stat_collection_t::generate_test_instances(l);
3234 	  list<store_statfs_t*> ll;
3235 	  store_statfs_t::generate_test_instances(ll);
3236 	  a.stats = *l.back();
3237 	  a.store_stats = *ll.back();
3238 	  a.log_size = 123;
3239 	  a.ondisk_log_size = 456;
3240 	  a.acting = 3;
3241 	  a.up = 4;
3242 	  a.num_store_stats = 1;
3243 	  o.push_back(new pool_stat_t(a));
3244 	}
3245 	
3246 	
3247 	// -- pg_history_t --
3248 	
/**
 * Serialize the PG history into @p bl as struct version 10, compat 4.
 *
 * The field order is mirrored by decode(), which gates the later fields on
 * the encoded struct version.
 */
void pg_history_t::encode(ceph::buffer::list &bl) const
{
  ENCODE_START(10, 4, bl);
  encode(epoch_created, bl);
  encode(last_epoch_started, bl);
  encode(last_epoch_clean, bl);
  encode(last_epoch_split, bl);
  encode(same_interval_since, bl);
  encode(same_up_since, bl);
  encode(same_primary_since, bl);
  encode(last_scrub, bl);
  encode(last_scrub_stamp, bl);
  encode(last_deep_scrub, bl);
  encode(last_deep_scrub_stamp, bl);
  encode(last_clean_scrub_stamp, bl);
  encode(last_epoch_marked_full, bl);
  encode(last_interval_started, bl);
  encode(last_interval_clean, bl);
  encode(epoch_pool_created, bl);
  encode(prior_readable_until_ub, bl);
  ENCODE_FINISH(bl);
}
3271 	
/**
 * Deserialize the PG history from @p bl.
 *
 * Fields introduced in later struct versions are decoded only when present.
 * Where an older encoding lacks a field, some are approximated from fields
 * that are available (see the else branches); the rest are left untouched.
 */
void pg_history_t::decode(ceph::buffer::list::const_iterator &bl)
{
  DECODE_START_LEGACY_COMPAT_LEN(10, 4, 4, bl);
  decode(epoch_created, bl);
  decode(last_epoch_started, bl);
  if (struct_v >= 3)
    decode(last_epoch_clean, bl);
  else
    last_epoch_clean = last_epoch_started;  // careful, it's a lie!
  decode(last_epoch_split, bl);
  decode(same_interval_since, bl);
  decode(same_up_since, bl);
  decode(same_primary_since, bl);
  if (struct_v >= 2) {
    decode(last_scrub, bl);
    decode(last_scrub_stamp, bl);
  }
  if (struct_v >= 5) {
    decode(last_deep_scrub, bl);
    decode(last_deep_scrub_stamp, bl);
  }
  if (struct_v >= 6) {
    decode(last_clean_scrub_stamp, bl);
  }
  if (struct_v >= 7) {
    decode(last_epoch_marked_full, bl);
  }
  if (struct_v >= 8) {
    decode(last_interval_started, bl);
    decode(last_interval_clean, bl);
  } else {
    // Pre-v8 encodings have no interval fields.  Derive them by clamping the
    // corresponding epoch to same_interval_since.
    if (last_epoch_started >= same_interval_since) {
      last_interval_started = same_interval_since;
    } else {
      last_interval_started = last_epoch_started; // best guess
    }
    if (last_epoch_clean >= same_interval_since) {
      last_interval_clean = same_interval_since;
    } else {
      last_interval_clean = last_epoch_clean; // best guess
    }
  }
  if (struct_v >= 9) {
    decode(epoch_pool_created, bl);
  } else {
    // Pre-v9: fall back to the PG's own creation epoch.
    epoch_pool_created = epoch_created;
  }
  if (struct_v >= 10) {
    decode(prior_readable_until_ub, bl);
  }
  DECODE_FINISH(bl);
}
3324 	
3325 	void pg_history_t::dump(Formatter *f) const
3326 	{
3327 	  f->dump_int("epoch_created", epoch_created);
3328 	  f->dump_int("epoch_pool_created", epoch_pool_created);
3329 	  f->dump_int("last_epoch_started", last_epoch_started);
3330 	  f->dump_int("last_interval_started", last_interval_started);
3331 	  f->dump_int("last_epoch_clean", last_epoch_clean);
3332 	  f->dump_int("last_interval_clean", last_interval_clean);
3333 	  f->dump_int("last_epoch_split", last_epoch_split);
3334 	  f->dump_int("last_epoch_marked_full", last_epoch_marked_full);
3335 	  f->dump_int("same_up_since", same_up_since);
3336 	  f->dump_int("same_interval_since", same_interval_since);
3337 	  f->dump_int("same_primary_since", same_primary_since);
3338 	  f->dump_stream("last_scrub") << last_scrub;
3339 	  f->dump_stream("last_scrub_stamp") << last_scrub_stamp;
3340 	  f->dump_stream("last_deep_scrub") << last_deep_scrub;
3341 	  f->dump_stream("last_deep_scrub_stamp") << last_deep_scrub_stamp;
3342 	  f->dump_stream("last_clean_scrub_stamp") << last_clean_scrub_stamp;
3343 	  f->dump_float(
3344 	    "prior_readable_until_ub",
3345 	    std::chrono::duration<double>(prior_readable_until_ub).count());
3346 	}
3347 	
3348 	void pg_history_t::generate_test_instances(list<pg_history_t*>& o)
3349 	{
3350 	  o.push_back(new pg_history_t);
3351 	  o.push_back(new pg_history_t);
3352 	  o.back()->epoch_created = 1;
3353 	  o.back()->epoch_pool_created = 1;
3354 	  o.back()->last_epoch_started = 2;
3355 	  o.back()->last_interval_started = 2;
3356 	  o.back()->last_epoch_clean = 3;
3357 	  o.back()->last_interval_clean = 2;
3358 	  o.back()->last_epoch_split = 4;
3359 	  o.back()->prior_readable_until_ub = make_timespan(3.1415);
3360 	  o.back()->same_up_since = 5;
3361 	  o.back()->same_interval_since = 6;
3362 	  o.back()->same_primary_since = 7;
3363 	  o.back()->last_scrub = eversion_t(8, 9);
3364 	  o.back()->last_scrub_stamp = utime_t(10, 11);
3365 	  o.back()->last_deep_scrub = eversion_t(12, 13);
3366 	  o.back()->last_deep_scrub_stamp = utime_t(14, 15);
3367 	  o.back()->last_clean_scrub_stamp = utime_t(16, 17);
3368 	  o.back()->last_epoch_marked_full = 18;
3369 	}
3370 	
3371 	
3372 	// -- pg_info_t --
3373 	
/**
 * Serialize the PG info into @p bl as struct version 32, compat 26.
 *
 * Two slots hold fixed placeholder values rather than member data (see the
 * inline comments); decode() reads and discards them.
 */
void pg_info_t::encode(ceph::buffer::list &bl) const
{
  ENCODE_START(32, 26, bl);
  encode(pgid.pgid, bl);
  encode(last_update, bl);
  encode(last_complete, bl);
  encode(log_tail, bl);
  encode(hobject_t(), bl);  // old (nibblewise) last_backfill
  encode(stats, bl);
  history.encode(bl);
  encode(purged_snaps, bl);
  encode(last_epoch_started, bl);
  encode(last_user_version, bl);
  encode(hit_set, bl);
  // The shard half of pgid is written separately from pgid.pgid above.
  encode(pgid.shard, bl);
  encode(last_backfill, bl);
  encode(true, bl); // was last_backfill_bitwise
  encode(last_interval_started, bl);
  ENCODE_FINISH(bl);
}
3394 	
/**
 * Deserialize the PG info from @p bl, reading fields in the order encode()
 * writes them.  The two legacy slots are decoded into scratch variables and
 * dropped.
 */
void pg_info_t::decode(ceph::buffer::list::const_iterator &bl)
{
  DECODE_START(32, bl);
  decode(pgid.pgid, bl);
  decode(last_update, bl);
  decode(last_complete, bl);
  decode(log_tail, bl);
  {
    // Legacy last_backfill slot: consumed to advance the iterator, unused.
    hobject_t old_last_backfill;
    decode(old_last_backfill, bl);
  }
  decode(stats, bl);
  history.decode(bl);
  decode(purged_snaps, bl);
  decode(last_epoch_started, bl);
  decode(last_user_version, bl);
  decode(hit_set, bl);
  decode(pgid.shard, bl);
  decode(last_backfill, bl);
  {
    bool last_backfill_bitwise;
    decode(last_backfill_bitwise, bl);
    // note: we may see a false value here since the default value for
    // the member was false, so it often didn't get set to true until
    // peering progressed.
  }
  if (struct_v >= 32) {
    decode(last_interval_started, bl);
  } else {
    // Older encodings lack the field; fall back to last_epoch_started.
    last_interval_started = last_epoch_started;
  }
  DECODE_FINISH(bl);
}
3428 	
3429 	// -- pg_info_t --
3430 	
3431 	void pg_info_t::dump(Formatter *f) const
3432 	{
3433 	  f->dump_stream("pgid") << pgid;
3434 	  f->dump_stream("last_update") << last_update;
3435 	  f->dump_stream("last_complete") << last_complete;
3436 	  f->dump_stream("log_tail") << log_tail;
3437 	  f->dump_int("last_user_version", last_user_version);
3438 	  f->dump_stream("last_backfill") << last_backfill;
3439 	  f->open_array_section("purged_snaps");
3440 	  for (interval_set<snapid_t>::const_iterator i=purged_snaps.begin();
3441 	       i != purged_snaps.end();
3442 	       ++i) {
3443 	    f->open_object_section("purged_snap_interval");
3444 	    f->dump_stream("start") << i.get_start();
3445 	    f->dump_stream("length") << i.get_len();
3446 	    f->close_section();
3447 	  }
3448 	  f->close_section();
3449 	  f->open_object_section("history");
3450 	  history.dump(f);
3451 	  f->close_section();
3452 	  f->open_object_section("stats");
3453 	  stats.dump(f);
3454 	  f->close_section();
3455 	
3456 	  f->dump_int("empty", is_empty());
3457 	  f->dump_int("dne", dne());
3458 	  f->dump_int("incomplete", is_incomplete());
3459 	  f->dump_int("last_epoch_started", last_epoch_started);
3460 	
3461 	  f->open_object_section("hit_set_history");
3462 	  hit_set.dump(f);
3463 	  f->close_section();
3464 	}
3465 	
3466 	void pg_info_t::generate_test_instances(list<pg_info_t*>& o)
3467 	{
3468 	  o.push_back(new pg_info_t);
3469 	  o.push_back(new pg_info_t);
3470 	  list<pg_history_t*> h;
3471 	  pg_history_t::generate_test_instances(h);
3472 	  o.back()->history = *h.back();
3473 	  o.back()->pgid = spg_t(pg_t(1, 2), shard_id_t::NO_SHARD);
3474 	  o.back()->last_update = eversion_t(3, 4);
3475 	  o.back()->last_complete = eversion_t(5, 6);
3476 	  o.back()->last_user_version = 2;
3477 	  o.back()->log_tail = eversion_t(7, 8);
3478 	  o.back()->last_backfill = hobject_t(object_t("objname"), "key", 123, 456, -1, "");
3479 	  {
3480 	    list<pg_stat_t*> s;
3481 	    pg_stat_t::generate_test_instances(s);
3482 	    o.back()->stats = *s.back();
3483 	  }
3484 	  {
3485 	    list<pg_hit_set_history_t*> s;
3486 	    pg_hit_set_history_t::generate_test_instances(s);
3487 	    o.back()->hit_set = *s.back();
3488 	  }
3489 	}
3490 	
3491 	// -- pg_notify_t --
/// Serialize the notify message into @p bl as struct version 3, compat 2.
void pg_notify_t::encode(ceph::buffer::list &bl) const
{
  ENCODE_START(3, 2, bl);
  encode(query_epoch, bl);
  encode(epoch_sent, bl);
  encode(info, bl);
  encode(to, bl);
  encode(from, bl);
  // past_intervals is read by decode() only when struct_v >= 3.
  encode(past_intervals, bl);
  ENCODE_FINISH(bl);
}
3503 	
/// Deserialize the notify message from @p bl, mirroring encode().
void pg_notify_t::decode(ceph::buffer::list::const_iterator &bl)
{
  DECODE_START(3, bl);
  decode(query_epoch, bl);
  decode(epoch_sent, bl);
  decode(info, bl);
  decode(to, bl);
  decode(from, bl);
  // Encodings older than v3 carry no past_intervals; the member is then left
  // as it was.
  if (struct_v >= 3) {
    decode(past_intervals, bl);
  }
  DECODE_FINISH(bl);
}
3517 	
3518 	void pg_notify_t::dump(Formatter *f) const
3519 	{
3520 	  f->dump_int("from", from);
3521 	  f->dump_int("to", to);
3522 	  f->dump_unsigned("query_epoch", query_epoch);
3523 	  f->dump_unsigned("epoch_sent", epoch_sent);
3524 	  {
3525 	    f->open_object_section("info");
3526 	    info.dump(f);
3527 	    f->close_section();
3528 	  }
3529 	  f->dump_object("past_intervals", past_intervals);
3530 	}
3531 	
3532 	void pg_notify_t::generate_test_instances(list<pg_notify_t*>& o)
3533 	{
3534 	  o.push_back(new pg_notify_t(shard_id_t(3), shard_id_t::NO_SHARD, 1, 1,
3535 				      pg_info_t(), PastIntervals()));
3536 	  o.push_back(new pg_notify_t(shard_id_t(0), shard_id_t(0), 3, 10,
3537 				      pg_info_t(), PastIntervals()));
3538 	}
3539 	
3540 	ostream &operator<<(ostream &lhs, const pg_notify_t &notify)
3541 	{
3542 	  lhs << "(query:" << notify.query_epoch
3543 	      << " sent:" << notify.epoch_sent
3544 	      << " " << notify.info;
3545 	  if (notify.from != shard_id_t::NO_SHARD ||
3546 	      notify.to != shard_id_t::NO_SHARD)
3547 	    lhs << " " << (unsigned)notify.from
3548 		<< "->" << (unsigned)notify.to;
3549 	  lhs << " " << notify.past_intervals;
3550 	  return lhs << ")";
3551 	}
3552 	
3553 	// -- pg_interval_t --
3554 	
/// Serialize the interval into @p bl as struct version 4, compat 2.
void PastIntervals::pg_interval_t::encode(ceph::buffer::list& bl) const
{
  ENCODE_START(4, 2, bl);
  encode(first, bl);
  encode(last, bl);
  encode(up, bl);
  encode(acting, bl);
  encode(maybe_went_rw, bl);
  // decode() reads primary when struct_v >= 3 and up_primary when >= 4.
  encode(primary, bl);
  encode(up_primary, bl);
  ENCODE_FINISH(bl);
}
3567 	
/**
 * Deserialize the interval from @p bl.
 *
 * For encodings that predate the primary fields, the primaries are taken
 * from the first entry of the acting/up vectors when those are non-empty;
 * otherwise they are left unchanged.
 */
void PastIntervals::pg_interval_t::decode(ceph::buffer::list::const_iterator& bl)
{
  DECODE_START_LEGACY_COMPAT_LEN(4, 2, 2, bl);
  decode(first, bl);
  decode(last, bl);
  decode(up, bl);
  decode(acting, bl);
  decode(maybe_went_rw, bl);
  if (struct_v >= 3) {
    decode(primary, bl);
  } else {
    // Pre-v3: use the first acting OSD as primary.
    if (acting.size())
      primary = acting[0];
  }
  if (struct_v >= 4) {
    decode(up_primary, bl);
  } else {
    // Pre-v4: use the first up OSD as up_primary.
    if (up.size())
      up_primary = up[0];
  }
  DECODE_FINISH(bl);
}
3590 	
3591 	void PastIntervals::pg_interval_t::dump(Formatter *f) const
3592 	{
3593 	  f->dump_unsigned("first", first);
3594 	  f->dump_unsigned("last", last);
3595 	  f->dump_int("maybe_went_rw", maybe_went_rw ? 1 : 0);
3596 	  f->open_array_section("up");
3597 	  for (auto p = up.cbegin(); p != up.cend(); ++p)
3598 	    f->dump_int("osd", *p);
3599 	  f->close_section();
3600 	  f->open_array_section("acting");
3601 	  for (auto p = acting.cbegin(); p != acting.cend(); ++p)
3602 	    f->dump_int("osd", *p);
3603 	  f->close_section();
3604 	  f->dump_int("primary", primary);
3605 	  f->dump_int("up_primary", up_primary);
3606 	}
3607 	
3608 	void PastIntervals::pg_interval_t::generate_test_instances(list<pg_interval_t*>& o)
3609 	{
3610 	  o.push_back(new pg_interval_t);
3611 	  o.push_back(new pg_interval_t);
3612 	  o.back()->up.push_back(1);
3613 	  o.back()->acting.push_back(2);
3614 	  o.back()->acting.push_back(3);
3615 	  o.back()->first = 4;
3616 	  o.back()->last = 5;
3617 	  o.back()->maybe_went_rw = true;
3618 	}
3619 	
3620 	WRITE_CLASS_ENCODER(PastIntervals::pg_interval_t)
3621 	
3622 	
3623 	/**
3624 	 * pi_compact_rep
3625 	 *
3626 	 * PastIntervals only needs to be able to answer two questions:
3627 	 * 1) Where should the primary look for unfound objects?
3628 	 * 2) List a set of subsets of the OSDs such that contacting at least
3629 	 *    one from each subset guarantees we speak to at least one witness
3630 	 *    of any completed write.
3631 	 *
3632 	 * Crucially, 2) does not require keeping *all* past intervals.  Certainly,
3633 	 * we don't need to keep any where maybe_went_rw would be false.  We also
3634 	 * needn't keep two intervals where the actingset in one is a subset
3635 	 * of the other (only need to keep the smaller of the two sets).  In order
3636 	 * to accurately trim the set of intervals as last_epoch_started changes
3637 	 * without rebuilding the set from scratch, we'll retain the larger set
 * if it is in an older interval.
3639 	 */
3640 	struct compact_interval_t {
3641 	  epoch_t first;
3642 	  epoch_t last;
3643 	  set<pg_shard_t> acting;
3644 	  bool supersedes(const compact_interval_t &other) {
3645 	    for (auto &&i: acting) {
3646 	      if (!other.acting.count(i))
3647 		return false;
3648 	    }
3649 	    return true;
3650 	  }
3651 	  void dump(Formatter *f) const {
3652 	    f->open_object_section("compact_interval_t");
3653 	    f->dump_stream("first") << first;
3654 	    f->dump_stream("last") << last;
3655 	    f->dump_stream("acting") << acting;
3656 	    f->close_section();
3657 	  }
3658 	  void encode(ceph::buffer::list &bl) const {
3659 	    ENCODE_START(1, 1, bl);
3660 	    encode(first, bl);
3661 	    encode(last, bl);
3662 	    encode(acting, bl);
3663 	    ENCODE_FINISH(bl);
3664 	  }
3665 	  void decode(ceph::buffer::list::const_iterator &bl) {
3666 	    DECODE_START(1, bl);
3667 	    decode(first, bl);
3668 	    decode(last, bl);
3669 	    decode(acting, bl);
3670 	    DECODE_FINISH(bl);
3671 	  }
3672 	  static void generate_test_instances(list<compact_interval_t*> & o) {
3673 	    /* Not going to be used, we'll generate pi_compact_rep directly */
3674 	  }
3675 	};
3676 	ostream &operator<<(ostream &o, const compact_interval_t &rhs)
3677 	{
3678 	  return o << "([" << rhs.first << "," << rhs.last
3679 		   << "] acting " << rhs.acting << ")";
3680 	}
3681 	WRITE_CLASS_ENCODER(compact_interval_t)
3682 	
3683 	class pi_compact_rep : public PastIntervals::interval_rep {
3684 	  epoch_t first = 0;
3685 	  epoch_t last = 0; // inclusive
3686 	  set<pg_shard_t> all_participants;
3687 	  list<compact_interval_t> intervals;
3688 	  pi_compact_rep(
3689 	    bool ec_pool,
3690 	    std::list<PastIntervals::pg_interval_t> &&intervals) {
3691 	    for (auto &&i: intervals)
3692 	      add_interval(ec_pool, i);
3693 	  }
3694 	public:
3695 	  pi_compact_rep() = default;
3696 	  pi_compact_rep(const pi_compact_rep &) = default;
3697 	  pi_compact_rep(pi_compact_rep &&) = default;
3698 	  pi_compact_rep &operator=(const pi_compact_rep &) = default;
3699 	  pi_compact_rep &operator=(pi_compact_rep &&) = default;
3700 	
3701 	  size_t size() const override { return intervals.size(); }
3702 	  bool empty() const override {
3703 	    return first > last || (first == 0 && last == 0);
3704 	  }
3705 	  void clear() override {
3706 	    *this = pi_compact_rep();
3707 	  }
3708 	  pair<epoch_t, epoch_t> get_bounds() const override {
3709 	    return make_pair(first, last + 1);
3710 	  }
3711 	  void adjust_start_backwards(epoch_t last_epoch_clean) {
3712 	    first = last_epoch_clean;
3713 	  }
3714 	
3715 	  set<pg_shard_t> get_all_participants(
3716 	    bool ec_pool) const override {
3717 	    return all_participants;
3718 	  }
3719 	  void add_interval(
3720 	    bool ec_pool, const PastIntervals::pg_interval_t &interval) override {
3721 	    if (first == 0)
3722 	      first = interval.first;
3723 	    ceph_assert(interval.last > last);
3724 	    last = interval.last;
3725 	    set<pg_shard_t> acting;
3726 	    for (unsigned i = 0; i < interval.acting.size(); ++i) {
3727 	      if (interval.acting[i] == CRUSH_ITEM_NONE)
3728 		continue;
3729 	      acting.insert(
3730 		pg_shard_t(
3731 		  interval.acting[i],
3732 		  ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD));
3733 	    }
3734 	    all_participants.insert(acting.begin(), acting.end());
3735 	    if (!interval.maybe_went_rw)
3736 	      return;
3737 	    intervals.push_back(
3738 	      compact_interval_t{interval.first, interval.last, acting});
3739 	    auto plast = intervals.end();
3740 	    --plast;
3741 	    for (auto cur = intervals.begin(); cur != plast; ) {
3742 	      if (plast->supersedes(*cur)) {
3743 		intervals.erase(cur++);
3744 	      } else {
3745 		++cur;
3746 	      }
3747 	    }
3748 	  }
3749 	  unique_ptr<PastIntervals::interval_rep> clone() const override {
3750 	    return unique_ptr<PastIntervals::interval_rep>(new pi_compact_rep(*this));
3751 	  }
3752 	  ostream &print(ostream &out) const override {
3753 	    return out << "([" << first << "," << last
3754 		       << "] all_participants=" << all_participants
3755 		       << " intervals=" << intervals << ")";
3756 	  }
3757 	  void encode(ceph::buffer::list &bl) const override {
3758 	    ENCODE_START(1, 1, bl);
3759 	    encode(first, bl);
3760 	    encode(last, bl);
3761 	    encode(all_participants, bl);
3762 	    encode(intervals, bl);
3763 	    ENCODE_FINISH(bl);
3764 	  }
3765 	  void decode(ceph::buffer::list::const_iterator &bl) override {
3766 	    DECODE_START(1, bl);
3767 	    decode(first, bl);
3768 	    decode(last, bl);
3769 	    decode(all_participants, bl);
3770 	    decode(intervals, bl);
3771 	    DECODE_FINISH(bl);
3772 	  }
3773 	  void dump(Formatter *f) const override {
3774 	    f->open_object_section("PastIntervals::compact_rep");
3775 	    f->dump_stream("first") << first;
3776 	    f->dump_stream("last") << last;
3777 	    f->open_array_section("all_participants");
3778 	    for (auto& i : all_participants) {
3779 	      f->dump_object("pg_shard", i);
3780 	    }
3781 	    f->close_section();
3782 	    f->open_array_section("intervals");
3783 	    for (auto &&i: intervals) {
3784 	      i.dump(f);
3785 	    }
3786 	    f->close_section();
3787 	    f->close_section();
3788 	  }
3789 	  static void generate_test_instances(list<pi_compact_rep*> &o) {
3790 	    using ival = PastIntervals::pg_interval_t;
3791 	    using ivallst = std::list<ival>;
3792 	    o.push_back(
3793 	      new pi_compact_rep(
3794 		true, ivallst
3795 		{ ival{{0, 1, 2}, {0, 1, 2}, 10, 20,  true, 0, 0}
3796 		, ival{{   1, 2}, {   1, 2}, 21, 30,  true, 1, 1}
3797 		, ival{{      2}, {      2}, 31, 35, false, 2, 2}
3798 		, ival{{0,    2}, {0,    2}, 36, 50,  true, 0, 0}
3799 		}));
3800 	    o.push_back(
3801 	      new pi_compact_rep(
3802 		false, ivallst
3803 		{ ival{{0, 1, 2}, {0, 1, 2}, 10, 20,  true, 0, 0}
3804 		, ival{{   1, 2}, {   1, 2}, 21, 30,  true, 1, 1}
3805 		, ival{{      2}, {      2}, 31, 35, false, 2, 2}
3806 		, ival{{0,    2}, {0,    2}, 36, 50,  true, 0, 0}
3807 		}));
3808 	    o.push_back(
3809 	      new pi_compact_rep(
3810 		true, ivallst
3811 		{ ival{{2, 1, 0}, {2, 1, 0}, 10, 20,  true, 1, 1}
3812 		, ival{{   0, 2}, {   0, 2}, 21, 30,  true, 0, 0}
3813 		, ival{{   0, 2}, {2,    0}, 31, 35,  true, 2, 2}
3814 		, ival{{   0, 2}, {   0, 2}, 36, 50,  true, 0, 0}
3815 		}));
3816 	  }
3817 	  void iterate_mayberw_back_to(
3818 	    epoch_t les,
3819 	    std::function<void(epoch_t, const set<pg_shard_t> &)> &&f) const override {
3820 	    for (auto i = intervals.rbegin(); i != intervals.rend(); ++i) {
3821 	      if (i->last < les)
3822 		break;
3823 	      f(i->first, i->acting);
3824 	    }
3825 	  }
3826 	  virtual ~pi_compact_rep() override {}
3827 	};
3828 	WRITE_CLASS_ENCODER(pi_compact_rep)
3829 	
3830 	PastIntervals::PastIntervals()
3831 	{
3832 	  past_intervals.reset(new pi_compact_rep);
3833 	}
3834 	
3835 	PastIntervals::PastIntervals(const PastIntervals &rhs)
3836 	  : past_intervals(rhs.past_intervals ?
3837 			   rhs.past_intervals->clone() :
3838 			   nullptr) {}
3839 	
3840 	PastIntervals &PastIntervals::operator=(const PastIntervals &rhs)
3841 	{
3842 	  PastIntervals other(rhs);
3843 	  swap(other);
3844 	  return *this;
3845 	}
3846 	
3847 	ostream& operator<<(ostream& out, const PastIntervals &i)
3848 	{
3849 	  if (i.past_intervals) {
3850 	    return i.past_intervals->print(out);
3851 	  } else {
3852 	    return out << "(empty)";
3853 	  }
3854 	}
3855 	
3856 	ostream& operator<<(ostream& out, const PastIntervals::PriorSet &i)
3857 	{
3858 	  return out << "PriorSet("
3859 		     << "ec_pool: " << i.ec_pool
3860 		     << ", probe: " << i.probe
3861 		     << ", down: " << i.down
3862 		     << ", blocked_by: " << i.blocked_by
3863 		     << ", pg_down: " << i.pg_down
3864 		     << ")";
3865 	}
3866 	
3867 	void PastIntervals::decode(ceph::buffer::list::const_iterator &bl)
3868 	{
3869 	  DECODE_START(1, bl);
3870 	  __u8 type = 0;
3871 	  decode(type, bl);
3872 	  switch (type) {
3873 	  case 0:
3874 	    break;
3875 	  case 1:
3876 	    ceph_abort_msg("pi_simple_rep support removed post-luminous");
3877 	    break;
3878 	  case 2:
3879 	    past_intervals.reset(new pi_compact_rep);
3880 	    past_intervals->decode(bl);
3881 	    break;
3882 	  }
3883 	  DECODE_FINISH(bl);
3884 	}
3885 	
3886 	void PastIntervals::generate_test_instances(list<PastIntervals*> &o)
3887 	{
3888 	  {
3889 	    list<pi_compact_rep *> compact;
3890 	    pi_compact_rep::generate_test_instances(compact);
3891 	    for (auto &&i: compact) {
3892 	      // takes ownership of contents
3893 	      o.push_back(new PastIntervals(i));
3894 	    }
3895 	  }
3896 	  return;
3897 	}
3898 	
3899 	bool PastIntervals::is_new_interval(
3900 	  int old_acting_primary,
3901 	  int new_acting_primary,
3902 	  const vector<int> &old_acting,
3903 	  const vector<int> &new_acting,
3904 	  int old_up_primary,
3905 	  int new_up_primary,
3906 	  const vector<int> &old_up,
3907 	  const vector<int> &new_up,
3908 	  int old_size,
3909 	  int new_size,
3910 	  int old_min_size,
3911 	  int new_min_size,
3912 	  unsigned old_pg_num,
3913 	  unsigned new_pg_num,
3914 	  unsigned old_pg_num_pending,
3915 	  unsigned new_pg_num_pending,
3916 	  bool old_sort_bitwise,
3917 	  bool new_sort_bitwise,
3918 	  bool old_recovery_deletes,
3919 	  bool new_recovery_deletes,
3920 	  pg_t pgid) {
3921 	  return old_acting_primary != new_acting_primary ||
3922 	    new_acting != old_acting ||
3923 	    old_up_primary != new_up_primary ||
3924 	    new_up != old_up ||
3925 	    old_min_size != new_min_size ||
3926 	    old_size != new_size ||
3927 	    pgid.is_split(old_pg_num, new_pg_num, 0) ||
3928 	    // (is or was) pre-merge source
3929 	    pgid.is_merge_source(old_pg_num_pending, new_pg_num_pending, 0) ||
3930 	    pgid.is_merge_source(new_pg_num_pending, old_pg_num_pending, 0) ||
3931 	    // merge source
3932 	    pgid.is_merge_source(old_pg_num, new_pg_num, 0) ||
3933 	    // (is or was) pre-merge target
3934 	    pgid.is_merge_target(old_pg_num_pending, new_pg_num_pending) ||
3935 	    pgid.is_merge_target(new_pg_num_pending, old_pg_num_pending) ||
3936 	    // merge target
3937 	    pgid.is_merge_target(old_pg_num, new_pg_num) ||
3938 	    old_sort_bitwise != new_sort_bitwise ||
3939 	    old_recovery_deletes != new_recovery_deletes;
3940 	}
3941 	
3942 	bool PastIntervals::is_new_interval(
3943 	  int old_acting_primary,
3944 	  int new_acting_primary,
3945 	  const vector<int> &old_acting,
3946 	  const vector<int> &new_acting,
3947 	  int old_up_primary,
3948 	  int new_up_primary,
3949 	  const vector<int> &old_up,
3950 	  const vector<int> &new_up,
3951 	  const OSDMap *osdmap,
3952 	  const OSDMap *lastmap,
3953 	  pg_t pgid)
3954 	{
3955 	  const pg_pool_t *plast = lastmap->get_pg_pool(pgid.pool());
3956 	  if (!plast) {
3957 	    return false; // after pool is deleted there are no more interval changes
3958 	  }
3959 	  const pg_pool_t *pi = osdmap->get_pg_pool(pgid.pool());
3960 	  if (!pi) {
3961 	    return true;  // pool was deleted this epoch -> (final!) interval change
3962 	  }
3963 	  return
3964 	    is_new_interval(old_acting_primary,
3965 			    new_acting_primary,
3966 			    old_acting,
3967 			    new_acting,
3968 			    old_up_primary,
3969 			    new_up_primary,
3970 			    old_up,
3971 			    new_up,
3972 			    plast->size,
3973 			    pi->size,
3974 			    plast->min_size,
3975 			    pi->min_size,
3976 			    plast->get_pg_num(),
3977 			    pi->get_pg_num(),
3978 			    plast->get_pg_num_pending(),
3979 			    pi->get_pg_num_pending(),
3980 			    lastmap->test_flag(CEPH_OSDMAP_SORTBITWISE),
3981 			    osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE),
3982 			    lastmap->test_flag(CEPH_OSDMAP_RECOVERY_DELETES),
3983 			    osdmap->test_flag(CEPH_OSDMAP_RECOVERY_DELETES),
3984 			    pgid);
3985 	}
3986 	
3987 	bool PastIntervals::check_new_interval(
3988 	  int old_acting_primary,
3989 	  int new_acting_primary,
3990 	  const vector<int> &old_acting,
3991 	  const vector<int> &new_acting,
3992 	  int old_up_primary,
3993 	  int new_up_primary,
3994 	  const vector<int> &old_up,
3995 	  const vector<int> &new_up,
3996 	  epoch_t same_interval_since,
3997 	  epoch_t last_epoch_clean,
3998 	  const OSDMap *osdmap,
3999 	  const OSDMap *lastmap,
4000 	  pg_t pgid,
4001 	  const IsPGRecoverablePredicate &could_have_gone_active,
4002 	  PastIntervals *past_intervals,
4003 	  std::ostream *out)
4004 	{
4005 	  /*
4006 	   * We have to be careful to gracefully deal with situations like
4007 	   * so. Say we have a power outage or something that takes out both
4008 	   * OSDs, but the monitor doesn't mark them down in the same epoch.
4009 	   * The history may look like
4010 	   *
4011 	   *  1: A B
4012 	   *  2:   B
4013 	   *  3:       let's say B dies for good, too (say, from the power spike) 
4014 	   *  4: A
4015 	   *
4016 	   * which makes it look like B may have applied updates to the PG
4017 	   * that we need in order to proceed.  This sucks...
4018 	   *
4019 	   * To minimize the risk of this happening, we CANNOT go active if
4020 	   * _any_ OSDs in the prior set are down until we send an MOSDAlive
4021 	   * to the monitor such that the OSDMap sets osd_up_thru to an epoch.
4022 	   * Then, we have something like
4023 	   *
4024 	   *  1: A B
4025 	   *  2:   B   up_thru[B]=0
4026 	   *  3:
4027 	   *  4: A
4028 	   *
4029 	   * -> we can ignore B, bc it couldn't have gone active (up_thru still 0).
4030 	   *
4031 	   * or,
4032 	   *
4033 	   *  1: A B
4034 	   *  2:   B   up_thru[B]=0
4035 	   *  3:   B   up_thru[B]=2
4036 	   *  4:
4037 	   *  5: A    
4038 	   *
4039 	   * -> we must wait for B, bc it was alive through 2, and could have
4040 	   *    written to the pg.
4041 	   *
4042 	   * If B is really dead, then an administrator will need to manually
4043 	   * intervene by marking the OSD as "lost."
4044 	   */
4045 	
4046 	  // remember past interval
4047 	  //  NOTE: a change in the up set primary triggers an interval
4048 	  //  change, even though the interval members in the pg_interval_t
4049 	  //  do not change.
4050 	  ceph_assert(past_intervals);
4051 	  ceph_assert(past_intervals->past_intervals);
4052 	  if (is_new_interval(
4053 		old_acting_primary,
4054 		new_acting_primary,
4055 		old_acting,
4056 		new_acting,
4057 		old_up_primary,
4058 		new_up_primary,
4059 		old_up,
4060 		new_up,
4061 		osdmap,
4062 		lastmap,
4063 		pgid)) {
4064 	    pg_interval_t i;
4065 	    i.first = same_interval_since;
4066 	    i.last = osdmap->get_epoch() - 1;
4067 	    ceph_assert(i.first <= i.last);
4068 	    i.acting = old_acting;
4069 	    i.up = old_up;
4070 	    i.primary = old_acting_primary;
4071 	    i.up_primary = old_up_primary;
4072 	
4073 	    unsigned num_acting = 0;
4074 	    for (auto p = i.acting.cbegin(); p != i.acting.cend(); ++p)
4075 	      if (*p != CRUSH_ITEM_NONE)
4076 		++num_acting;
4077 	
4078 	    ceph_assert(lastmap->get_pools().count(pgid.pool()));
4079 	    const pg_pool_t& old_pg_pool = lastmap->get_pools().find(pgid.pool())->second;
4080 	    set<pg_shard_t> old_acting_shards;
4081 	    old_pg_pool.convert_to_pg_shards(old_acting, &old_acting_shards);
4082 	
4083 	    if (num_acting &&
4084 		i.primary != -1 &&
4085 		num_acting >= old_pg_pool.min_size &&
4086 	        could_have_gone_active(old_acting_shards)) {
4087 	      if (out)
4088 		*out << __func__ << " " << i
4089 		     << " up_thru " << lastmap->get_up_thru(i.primary)
4090 		     << " up_from " << lastmap->get_up_from(i.primary)
4091 		     << " last_epoch_clean " << last_epoch_clean;
4092 	      if (lastmap->get_up_thru(i.primary) >= i.first &&
4093 		  lastmap->get_up_from(i.primary) <= i.first) {
4094 		i.maybe_went_rw = true;
4095 		if (out)
4096 		  *out << " " << i
4097 		       << " : primary up " << lastmap->get_up_from(i.primary)
4098 		       << "-" << lastmap->get_up_thru(i.primary)
4099 		       << " includes interval"
4100 	               << std::endl;
4101 	      } else if (last_epoch_clean >= i.first &&
4102 			 last_epoch_clean <= i.last) {
4103 		// If the last_epoch_clean is included in this interval, then
4104 		// the pg must have been rw (for recovery to have completed).
4105 		// This is important because we won't know the _real_
4106 		// first_epoch because we stop at last_epoch_clean, and we
4107 		// don't want the oldest interval to randomly have
4108 		// maybe_went_rw false depending on the relative up_thru vs
4109 		// last_epoch_clean timing.
4110 		i.maybe_went_rw = true;
4111 		if (out)
4112 		  *out << " " << i
4113 		       << " : includes last_epoch_clean " << last_epoch_clean
4114 		       << " and presumed to have been rw"
4115 		       << std::endl;
4116 	      } else {
4117 		i.maybe_went_rw = false;
4118 		if (out)
4119 		  *out << " " << i
4120 		       << " : primary up " << lastmap->get_up_from(i.primary)
4121 		       << "-" << lastmap->get_up_thru(i.primary)
4122 		       << " does not include interval"
4123 	               << std::endl;
4124 	      }
4125 	    } else {
4126 	      i.maybe_went_rw = false;
4127 	      if (out)
4128 		*out << __func__ << " " << i << " : acting set is too small" << std::endl;
4129 	    }
4130 	    past_intervals->past_intervals->add_interval(old_pg_pool.is_erasure(), i);
4131 	    return true;
4132 	  } else {
4133 	    return false;
4134 	  }
4135 	}
4136 	
4137 	
4138 	// true if the given map affects the prior set
4139 	bool PastIntervals::PriorSet::affected_by_map(
4140 	  const OSDMap &osdmap,
4141 	  const DoutPrefixProvider *dpp) const
4142 	{
4143 	  for (auto p = probe.begin(); p != probe.end(); ++p) {
4144 	    int o = p->osd;
4145 	
4146 	    // did someone in the prior set go down?
4147 	    if (osdmap.is_down(o) && down.count(o) == 0) {
4148 	      ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " now down" << dendl;
4149 	      return true;
4150 	    }
4151 	
4152 	    // did a down osd in cur get (re)marked as lost?
4153 	    auto r = blocked_by.find(o);
4154 	    if (r != blocked_by.end()) {
4155 	      if (!osdmap.exists(o)) {
4156 		ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " no longer exists" << dendl;
4157 		return true;
4158 	      }
4159 	      if (osdmap.get_info(o).lost_at != r->second) {
4160 		ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " (re)marked as lost" << dendl;
4161 		return true;
4162 	      }
4163 	    }
4164 	  }
4165 	
4166 	  // did someone in the prior down set go up?
4167 	  for (auto p = down.cbegin(); p != down.cend(); ++p) {
4168 	    int o = *p;
4169 	
4170 	    if (osdmap.is_up(o)) {
(1) Event exp_primary_expr: expected an expression
(2) Event caretline: ^
4171 	      ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " now up" << dendl;
4172 	      return true;
4173 	    }
4174 	
4175 	    // did someone in the prior set get lost or destroyed?
4176 	    if (!osdmap.exists(o)) {
4177 	      ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " no longer exists" << dendl;
4178 	      return true;
4179 	    }
4180 	    // did a down osd in down get (re)marked as lost?
4181 	    auto r = blocked_by.find(o);
4182 	    if (r != blocked_by.end()) {
4183 	      if (osdmap.get_info(o).lost_at != r->second) {
4184 	        ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " (re)marked as lost" << dendl;
4185 	        return true;
4186 	      }
4187 	    }
4188 	  }
4189 	
4190 	  return false;
4191 	}
4192 	
4193 	ostream& operator<<(ostream& out, const PastIntervals::pg_interval_t& i)
4194 	{
4195 	  out << "interval(" << i.first << "-" << i.last
4196 	      << " up " << i.up << "(" << i.up_primary << ")"
4197 	      << " acting " << i.acting << "(" << i.primary << ")";
4198 	  if (i.maybe_went_rw)
4199 	    out << " maybe_went_rw";
4200 	  out << ")";
4201 	  return out;
4202 	}
4203 	
4204 	
4205 	
4206 	// -- pg_query_t --
4207 	
4208 	void pg_query_t::encode(ceph::buffer::list &bl, uint64_t features) const {
4209 	  ENCODE_START(3, 3, bl);
4210 	  encode(type, bl);
4211 	  encode(since, bl);
4212 	  history.encode(bl);
4213 	  encode(epoch_sent, bl);
4214 	  encode(to, bl);
4215 	  encode(from, bl);
4216 	  ENCODE_FINISH(bl);
4217 	}
4218 	
4219 	void pg_query_t::decode(ceph::buffer::list::const_iterator &bl) {
4220 	  DECODE_START(3, bl);
4221 	  decode(type, bl);
4222 	  decode(since, bl);
4223 	  history.decode(bl);
4224 	  decode(epoch_sent, bl);
4225 	  decode(to, bl);
4226 	  decode(from, bl);
4227 	  DECODE_FINISH(bl);
4228 	}
4229 	
4230 	void pg_query_t::dump(Formatter *f) const
4231 	{
4232 	  f->dump_int("from", from);
4233 	  f->dump_int("to", to);
4234 	  f->dump_string("type", get_type_name());
4235 	  f->dump_stream("since") << since;
4236 	  f->dump_stream("epoch_sent") << epoch_sent;
4237 	  f->open_object_section("history");
4238 	  history.dump(f);
4239 	  f->close_section();
4240 	}
4241 	void pg_query_t::generate_test_instances(list<pg_query_t*>& o)
4242 	{
4243 	  o.push_back(new pg_query_t());
4244 	  list<pg_history_t*> h;
4245 	  pg_history_t::generate_test_instances(h);
4246 	  o.push_back(new pg_query_t(pg_query_t::INFO, shard_id_t(1), shard_id_t(2), *h.back(), 4));
4247 	  o.push_back(new pg_query_t(pg_query_t::MISSING, shard_id_t(2), shard_id_t(3), *h.back(), 4));
4248 	  o.push_back(new pg_query_t(pg_query_t::LOG, shard_id_t(0), shard_id_t(0),
4249 				     eversion_t(4, 5), *h.back(), 4));
4250 	  o.push_back(new pg_query_t(pg_query_t::FULLLOG,
4251 				     shard_id_t::NO_SHARD, shard_id_t::NO_SHARD,
4252 				     *h.back(), 5));
4253 	}
4254 	
4255 	// -- pg_lease_t --
4256 	
4257 	void pg_lease_t::encode(bufferlist& bl) const
4258 	{
4259 	  ENCODE_START(1, 1, bl);
4260 	  encode(readable_until, bl);
4261 	  encode(readable_until_ub, bl);
4262 	  encode(interval, bl);
4263 	  ENCODE_FINISH(bl);
4264 	}
4265 	
4266 	void pg_lease_t::decode(bufferlist::const_iterator& p)
4267 	{
4268 	  DECODE_START(1, p);
4269 	  decode(readable_until, p);
4270 	  decode(readable_until_ub, p);
4271 	  decode(interval, p);
4272 	  DECODE_FINISH(p);
4273 	}
4274 	
4275 	void pg_lease_t::dump(Formatter *f) const
4276 	{
4277 	  f->dump_stream("readable_until") << readable_until;
4278 	  f->dump_stream("readable_until_ub") << readable_until_ub;
4279 	  f->dump_stream("interval") << interval;
4280 	}
4281 	
4282 	void pg_lease_t::generate_test_instances(std::list<pg_lease_t*>& o)
4283 	{
4284 	  o.push_back(new pg_lease_t());
4285 	  o.push_back(new pg_lease_t());
4286 	  o.back()->readable_until = make_timespan(1.5);
4287 	  o.back()->readable_until_ub = make_timespan(3.4);
4288 	  o.back()->interval = make_timespan(1.0);
4289 	}
4290 	
4291 	// -- pg_lease_ack_t --
4292 	
4293 	void pg_lease_ack_t::encode(bufferlist& bl) const
4294 	{
4295 	  ENCODE_START(1, 1, bl);
4296 	  encode(readable_until_ub, bl);
4297 	  ENCODE_FINISH(bl);
4298 	}
4299 	
4300 	void pg_lease_ack_t::decode(bufferlist::const_iterator& p)
4301 	{
4302 	  DECODE_START(1, p);
4303 	  decode(readable_until_ub, p);
4304 	  DECODE_FINISH(p);
4305 	}
4306 	
4307 	void pg_lease_ack_t::dump(Formatter *f) const
4308 	{
4309 	  f->dump_stream("readable_until_ub") << readable_until_ub;
4310 	}
4311 	
4312 	void pg_lease_ack_t::generate_test_instances(std::list<pg_lease_ack_t*>& o)
4313 	{
4314 	  o.push_back(new pg_lease_ack_t());
4315 	  o.push_back(new pg_lease_ack_t());
4316 	  o.back()->readable_until_ub = make_timespan(3.4);
4317 	}
4318 	
4319 	
4320 	// -- ObjectModDesc --
4321 	void ObjectModDesc::visit(Visitor *visitor) const
4322 	{
4323 	  auto bp = bl.cbegin();
4324 	  try {
4325 	    while (!bp.end()) {
4326 	      DECODE_START(max_required_version, bp);
4327 	      uint8_t code;
4328 	      decode(code, bp);
4329 	      switch (code) {
4330 	      case APPEND: {
4331 		uint64_t size;
4332 		decode(size, bp);
4333 		visitor->append(size);
4334 		break;
4335 	      }
4336 	      case SETATTRS: {
4337 		map<string, std::optional<ceph::buffer::list> > attrs;
4338 		decode(attrs, bp);
4339 		visitor->setattrs(attrs);
4340 		break;
4341 	      }
4342 	      case DELETE: {
4343 		version_t old_version;
4344 		decode(old_version, bp);
4345 		visitor->rmobject(old_version);
4346 		break;
4347 	      }
4348 	      case CREATE: {
4349 		visitor->create();
4350 		break;
4351 	      }
4352 	      case UPDATE_SNAPS: {
4353 		set<snapid_t> snaps;
4354 		decode(snaps, bp);
4355 		visitor->update_snaps(snaps);
4356 		break;
4357 	      }
4358 	      case TRY_DELETE: {
4359 		version_t old_version;
4360 		decode(old_version, bp);
4361 		visitor->try_rmobject(old_version);
4362 		break;
4363 	      }
4364 	      case ROLLBACK_EXTENTS: {
4365 		vector<pair<uint64_t, uint64_t> > extents;
4366 		version_t gen;
4367 		decode(gen, bp);
4368 		decode(extents, bp);
4369 		visitor->rollback_extents(gen,extents);
4370 		break;
4371 	      }
4372 	      default:
4373 		ceph_abort_msg("Invalid rollback code");
4374 	      }
4375 	      DECODE_FINISH(bp);
4376 	    }
4377 	  } catch (...) {
4378 	    ceph_abort_msg("Invalid encoding");
4379 	  }
4380 	}
4381 	
4382 	struct DumpVisitor : public ObjectModDesc::Visitor {
4383 	  Formatter *f;
4384 	  explicit DumpVisitor(Formatter *f) : f(f) {}
4385 	  void append(uint64_t old_size) override {
4386 	    f->open_object_section("op");
4387 	    f->dump_string("code", "APPEND");
4388 	    f->dump_unsigned("old_size", old_size);
4389 	    f->close_section();
4390 	  }
4391 	  void setattrs(map<string, std::optional<ceph::buffer::list> > &attrs) override {
4392 	    f->open_object_section("op");
4393 	    f->dump_string("code", "SETATTRS");
4394 	    f->open_array_section("attrs");
4395 	    for (auto i = attrs.begin(); i != attrs.end(); ++i) {
4396 	      f->dump_string("attr_name", i->first);
4397 	    }
4398 	    f->close_section();
4399 	    f->close_section();
4400 	  }
4401 	  void rmobject(version_t old_version) override {
4402 	    f->open_object_section("op");
4403 	    f->dump_string("code", "RMOBJECT");
4404 	    f->dump_unsigned("old_version", old_version);
4405 	    f->close_section();
4406 	  }
4407 	  void try_rmobject(version_t old_version) override {
4408 	    f->open_object_section("op");
4409 	    f->dump_string("code", "TRY_RMOBJECT");
4410 	    f->dump_unsigned("old_version", old_version);
4411 	    f->close_section();
4412 	  }
4413 	  void create() override {
4414 	    f->open_object_section("op");
4415 	    f->dump_string("code", "CREATE");
4416 	    f->close_section();
4417 	  }
4418 	  void update_snaps(const set<snapid_t> &snaps) override {
4419 	    f->open_object_section("op");
4420 	    f->dump_string("code", "UPDATE_SNAPS");
4421 	    f->dump_stream("snaps") << snaps;
4422 	    f->close_section();
4423 	  }
4424 	  void rollback_extents(
4425 	    version_t gen,
4426 	    const vector<pair<uint64_t, uint64_t> > &extents) override {
4427 	    f->open_object_section("op");
4428 	    f->dump_string("code", "ROLLBACK_EXTENTS");
4429 	    f->dump_unsigned("gen", gen);
4430 	    f->dump_stream("snaps") << extents;
4431 	    f->close_section();
4432 	  }
4433 	};
4434 	
4435 	void ObjectModDesc::dump(Formatter *f) const
4436 	{
4437 	  f->open_object_section("object_mod_desc");
4438 	  f->dump_bool("can_local_rollback", can_local_rollback);
4439 	  f->dump_bool("rollback_info_completed", rollback_info_completed);
4440 	  {
4441 	    f->open_array_section("ops");
4442 	    DumpVisitor vis(f);
4443 	    visit(&vis);
4444 	    f->close_section();
4445 	  }
4446 	  f->close_section();
4447 	}
4448 	
4449 	void ObjectModDesc::generate_test_instances(list<ObjectModDesc*>& o)
4450 	{
4451 	  map<string, std::optional<ceph::buffer::list> > attrs;
4452 	  attrs[OI_ATTR];
4453 	  attrs[SS_ATTR];
4454 	  attrs["asdf"];
4455 	  o.push_back(new ObjectModDesc());
4456 	  o.back()->append(100);
4457 	  o.back()->setattrs(attrs);
4458 	  o.push_back(new ObjectModDesc());
4459 	  o.back()->rmobject(1001);
4460 	  o.push_back(new ObjectModDesc());
4461 	  o.back()->create();
4462 	  o.back()->setattrs(attrs);
4463 	  o.push_back(new ObjectModDesc());
4464 	  o.back()->create();
4465 	  o.back()->setattrs(attrs);
4466 	  o.back()->mark_unrollbackable();
4467 	  o.back()->append(1000);
4468 	}
4469 	
4470 	void ObjectModDesc::encode(ceph::buffer::list &_bl) const
4471 	{
4472 	  ENCODE_START(max_required_version, max_required_version, _bl);
4473 	  encode(can_local_rollback, _bl);
4474 	  encode(rollback_info_completed, _bl);
4475 	  encode(bl, _bl);
4476 	  ENCODE_FINISH(_bl);
4477 	}
4478 	void ObjectModDesc::decode(ceph::buffer::list::const_iterator &_bl)
4479 	{
4480 	  DECODE_START(2, _bl);
4481 	  max_required_version = struct_v;
4482 	  decode(can_local_rollback, _bl);
4483 	  decode(rollback_info_completed, _bl);
4484 	  decode(bl, _bl);
4485 	  // ensure bl does not pin a larger ceph::buffer in memory
4486 	  bl.rebuild();
4487 	  bl.reassign_to_mempool(mempool::mempool_osd_pglog);
4488 	  DECODE_FINISH(_bl);
4489 	}
4490 	
4491 	std::atomic<int32_t> ObjectCleanRegions::max_num_intervals = {10};
4492 	
4493 	void ObjectCleanRegions::set_max_num_intervals(int32_t num)
4494 	{
4495 	  max_num_intervals = num;
4496 	}
4497 	
4498 	void ObjectCleanRegions::trim()
4499 	{
4500 	  while(clean_offsets.num_intervals() > max_num_intervals) {
4501 	    typename interval_set<uint64_t>::iterator shortest_interval = clean_offsets.begin();
4502 	    if (shortest_interval == clean_offsets.end())
4503 	      break;
4504 	    for (typename interval_set<uint64_t>::iterator it = clean_offsets.begin();
4505 	        it != clean_offsets.end();
4506 	        ++it) {
4507 	      if (it.get_len() < shortest_interval.get_len())
4508 	        shortest_interval = it;
4509 	    }
4510 	    clean_offsets.erase(shortest_interval);
4511 	  }
4512 	}
4513 	
4514 	void ObjectCleanRegions::merge(const ObjectCleanRegions &other)
4515 	{
4516 	  clean_offsets.intersection_of(other.clean_offsets);
4517 	  clean_omap = clean_omap && other.clean_omap;
4518 	  trim();
4519 	}
4520 	
4521 	void ObjectCleanRegions::mark_data_region_dirty(uint64_t offset, uint64_t len)
4522 	{
4523 	  interval_set<uint64_t> clean_region;
4524 	  clean_region.insert(0, (uint64_t)-1);
4525 	  clean_region.erase(offset, len);
4526 	  clean_offsets.intersection_of(clean_region);
4527 	  trim();
4528 	}
4529 	
// Record that omap is no longer clean (see omap_is_dirty()).
void ObjectCleanRegions::mark_omap_dirty()
{
  clean_omap = false;
}
4534 	
// Flag the object as new; object_is_exist() reports the inverse of this flag.
void ObjectCleanRegions::mark_object_new()
{
  new_object = true;
}
4539 	
// Mark everything dirty: the whole data range (0 .. (uint64_t)-1), omap,
// and the new-object flag.
void ObjectCleanRegions::mark_fully_dirty()
{
  mark_data_region_dirty(0, (uint64_t)-1);
  mark_omap_dirty();
  mark_object_new();
}
4546 	
4547 	interval_set<uint64_t> ObjectCleanRegions::get_dirty_regions() const
4548 	{
4549 	   interval_set<uint64_t> dirty_region;
4550 	   dirty_region.insert(0, (uint64_t)-1);
4551 	   dirty_region.subtract(clean_offsets);
4552 	   return dirty_region;
4553 	}
4554 	
// True once mark_omap_dirty() (or a merge with a dirty-omap peer) has
// cleared clean_omap.
bool ObjectCleanRegions::omap_is_dirty() const
{
  return !clean_omap;
}
4559 	
// True unless the object has been flagged as new via mark_object_new().
bool ObjectCleanRegions::object_is_exist() const
{
  return !new_object;
}
4564 	
/**
 * Serialize the clean interval set and both flags into bl.
 * Field order must match ObjectCleanRegions::decode().
 */
void ObjectCleanRegions::encode(bufferlist &bl) const
{
  ENCODE_START(1, 1, bl);
  using ceph::encode;
  encode(clean_offsets, bl);
  encode(clean_omap, bl);
  encode(new_object, bl);
  ENCODE_FINISH(bl);
}
4574 	
/**
 * Deserialize the clean interval set and both flags from bl, in the same
 * order ObjectCleanRegions::encode() writes them.
 */
void ObjectCleanRegions::decode(bufferlist::const_iterator &bl)
{
  DECODE_START(1, bl);
  using ceph::decode;
  decode(clean_offsets, bl);
  decode(clean_omap, bl);
  decode(new_object, bl);
  DECODE_FINISH(bl);
}
4584 	
/**
 * Emit the state to a Formatter inside its own "object_clean_regions"
 * section: clean_offsets as a streamed value, the two flags as booleans.
 */
void ObjectCleanRegions::dump(Formatter *f) const
{
  f->open_object_section("object_clean_regions");
  f->dump_stream("clean_offsets") << clean_offsets;
  f->dump_bool("clean_omap", clean_omap);
  f->dump_bool("new_object", new_object);
  f->close_section();
}
4593 	
4594 	void ObjectCleanRegions::generate_test_instances(list<ObjectCleanRegions*>& o)
4595 	{
4596 	  o.push_back(new ObjectCleanRegions());
4597 	  o.push_back(new ObjectCleanRegions());
4598 	  o.back()->mark_data_region_dirty(4096, 40960);
4599 	  o.back()->mark_omap_dirty();
4600 	  o.back()->mark_object_new();
4601 	}
4602 	
4603 	ostream& operator<<(ostream& out, const ObjectCleanRegions& ocr)
4604 	{
4605 	  return out << "clean_offsets: " << ocr.clean_offsets
4606 	             << ", clean_omap: " << ocr.clean_omap
4607 	             << ", new_object: " << ocr.new_object;
4608 	}
4609 	
4610 	// -- pg_log_entry_t --
4611 	
// The key name of a log entry is the key name of its version.
string pg_log_entry_t::get_key_name() const
{
  return version.get_key_name();
}
4616 	
4617 	void pg_log_entry_t::encode_with_checksum(ceph::buffer::list& bl) const
4618 	{
4619 	  using ceph::encode;
4620 	  ceph::buffer::list ebl(sizeof(*this)*2);
4621 	  this->encode(ebl);
4622 	  __u32 crc = ebl.crc32c(0);
4623 	  encode(ebl, bl);
4624 	  encode(crc, bl);
4625 	}
4626 	
4627 	void pg_log_entry_t::decode_with_checksum(ceph::buffer::list::const_iterator& p)
4628 	{
4629 	  using ceph::decode;
4630 	  ceph::buffer::list bl;
4631 	  decode(bl, p);
4632 	  __u32 crc;
4633 	  decode(crc, p);
4634 	  if (crc != bl.crc32c(0))
4635 	    throw ceph::buffer::malformed_input("bad checksum on pg_log_entry_t");
4636 	  auto q = bl.cbegin();
4637 	  this->decode(q);
4638 	}
4639 	
/**
 * Serialize this log entry into bl.
 *
 * Several fields are written conditionally on `op`; decode() applies the
 * same conditions in the same order, so any change here has to be mirrored
 * there.
 */
void pg_log_entry_t::encode(ceph::buffer::list &bl) const
{
  ENCODE_START(14, 4, bl);
  encode(op, bl);
  encode(soid, bl);
  encode(version, bl);

  /**
   * Added with reverting_to:
   * Previous code used prior_version to encode
   * what we now call reverting_to.  This will
   * allow older code to decode reverting_to
   * into prior_version as expected.
   */
  if (op == LOST_REVERT)
    encode(reverting_to, bl);
  else
    encode(prior_version, bl);

  encode(reqid, bl);
  encode(mtime, bl);
  // For LOST_REVERT the slot above held reverting_to, so prior_version is
  // written separately here.
  if (op == LOST_REVERT)
    encode(prior_version, bl);
  encode(snaps, bl);
  encode(user_version, bl);
  encode(mod_desc, bl);
  encode(extra_reqids, bl);
  // ERROR entries carry return_code at this position; every other op
  // carries it after clean_regions (below).
  if (op == ERROR)
    encode(return_code, bl);
  // the per-reqid return codes are only written when there are extra reqids
  if (!extra_reqids.empty())
    encode(extra_reqid_return_codes, bl);
  encode(clean_regions, bl);
  if (op != ERROR)
    encode(return_code, bl);
  encode(op_returns, bl);
  ENCODE_FINISH(bl);
}
4677 	
/**
 * Deserialize a log entry from bl, accepting every struct version up to 14.
 *
 * Each `struct_v` test below gates a field that only exists from that
 * version on; for older inputs the field is either left untouched or given
 * the fallback shown in the corresponding else branch.
 */
void pg_log_entry_t::decode(ceph::buffer::list::const_iterator &bl)
{
  DECODE_START_LEGACY_COMPAT_LEN(14, 4, 4, bl);
  decode(op, bl);
  if (struct_v < 2) {
    // v1 stored an sobject_t; carry over oid and snap and flag the hash
    // as invalid
    sobject_t old_soid;
    decode(old_soid, bl);
    soid.oid = old_soid.oid;
    soid.snap = old_soid.snap;
    invalid_hash = true;
  } else {
    decode(soid, bl);
  }
  if (struct_v < 3)
    invalid_hash = true;
  decode(version, bl);

  // From v6 on, LOST_REVERT entries store reverting_to in this slot and
  // prior_version further down; otherwise this slot is prior_version.
  if (struct_v >= 6 && op == LOST_REVERT)
    decode(reverting_to, bl);
  else
    decode(prior_version, bl);

  decode(reqid, bl);

  decode(mtime, bl);
  if (struct_v < 5)
    invalid_pool = true;

  if (op == LOST_REVERT) {
    if (struct_v >= 6) {
      decode(prior_version, bl);
    } else {
      // pre-v6: the value decoded into prior_version above was really
      // reverting_to
      reverting_to = prior_version;
    }
  }
  if (struct_v >= 7 ||  // for v >= 7, this is for all ops.
      op == CLONE) {    // for v < 7, it's only present for CLONE.
    decode(snaps, bl);
    // ensure snaps does not pin a larger ceph::buffer in memory
    snaps.rebuild();
    snaps.reassign_to_mempool(mempool::mempool_osd_pglog);
  }

  if (struct_v >= 8)
    decode(user_version, bl);
  else
    user_version = version.version;

  if (struct_v >= 9)
    decode(mod_desc, bl);
  else
    mod_desc.mark_unrollbackable();
  if (struct_v >= 10)
    decode(extra_reqids, bl);
  // ERROR entries carry return_code here (v11+); other ops carry it after
  // clean_regions (v14+, below)
  if (struct_v >= 11 && op == ERROR)
    decode(return_code, bl);
  if (struct_v >= 12 && !extra_reqids.empty())
    decode(extra_reqid_return_codes, bl);
  if (struct_v >= 13)
    decode(clean_regions, bl);
  else
    clean_regions.mark_fully_dirty();
  if (struct_v >= 14) {
    if (op != ERROR) {
      decode(return_code, bl);
    }
    decode(op_returns, bl);
  }
  DECODE_FINISH(bl);
}
4748 	
4749 	void pg_log_entry_t::dump(Formatter *f) const
4750 	{
4751 	  f->dump_string("op", get_op_name());
4752 	  f->dump_stream("object") << soid;
4753 	  f->dump_stream("version") << version;
4754 	  f->dump_stream("prior_version") << prior_version;
4755 	  f->dump_stream("reqid") << reqid;
4756 	  f->open_array_section("extra_reqids");
4757 	  uint32_t idx = 0;
4758 	  for (auto p = extra_reqids.begin();
4759 	       p != extra_reqids.end();
4760 	       ++idx, ++p) {
4761 	    f->open_object_section("extra_reqid");
4762 	    f->dump_stream("reqid") << p->first;
4763 	    f->dump_stream("user_version") << p->second;
4764 	    auto it = extra_reqid_return_codes.find(idx);
4765 	    if (it != extra_reqid_return_codes.end()) {
4766 	      f->dump_int("return_code", it->second);
4767 	    }
4768 	    f->close_section();
4769 	  }
4770 	  f->close_section();
4771 	  f->dump_stream("mtime") << mtime;
4772 	  f->dump_int("return_code", return_code);
4773 	  if (!op_returns.empty()) {
4774 	    f->open_array_section("op_returns");
4775 	    for (auto& i : op_returns) {
4776 	      f->dump_object("op", i);
4777 	    }
4778 	    f->close_section();
4779 	  }
4780 	  if (snaps.length() > 0) {
4781 	    vector<snapid_t> v;
4782 	    ceph::buffer::list c = snaps;
4783 	    auto p = c.cbegin();
4784 	    try {
4785 	      using ceph::decode;
4786 	      decode(v, p);
4787 	    } catch (...) {
4788 	      v.clear();
4789 	    }
4790 	    f->open_object_section("snaps");
4791 	    for (auto p = v.begin(); p != v.end(); ++p)
4792 	      f->dump_unsigned("snap", *p);
4793 	    f->close_section();
4794 	  }
4795 	  {
4796 	    f->open_object_section("mod_desc");
4797 	    mod_desc.dump(f);
4798 	    f->close_section();
4799 	  }
4800 	  {
4801 	    f->open_object_section("clean_regions");
4802 	    clean_regions.dump(f);
4803 	    f->close_section();
4804 	  }
4805 	}
4806 	
4807 	void pg_log_entry_t::generate_test_instances(list<pg_log_entry_t*>& o)
4808 	{
4809 	  o.push_back(new pg_log_entry_t());
4810 	  hobject_t oid(object_t("objname"), "key", 123, 456, 0, "");
4811 	  o.push_back(new pg_log_entry_t(MODIFY, oid, eversion_t(1,2), eversion_t(3,4),
4812 					 1, osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
4813 					 utime_t(8,9), 0));
4814 	  o.push_back(new pg_log_entry_t(ERROR, oid, eversion_t(1,2), eversion_t(3,4),
4815 					 1, osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
4816 					 utime_t(8,9), -ENOENT));
4817 	}
4818 	
4819 	ostream& operator<<(ostream& out, const pg_log_entry_t& e)
4820 	{
4821 	  out << e.version << " (" << e.prior_version << ") "
4822 	      << std::left << std::setw(8) << e.get_op_name() << ' '
4823 	      << e.soid << " by " << e.reqid << " " << e.mtime
4824 	      << " " << e.return_code;
4825 	  if (!e.op_returns.empty()) {
4826 	    out << " " << e.op_returns;
4827 	  }
4828 	  if (e.snaps.length()) {
4829 	    vector<snapid_t> snaps;
4830 	    ceph::buffer::list c = e.snaps;
4831 	    auto p = c.cbegin();
4832 	    try {
4833 	      decode(snaps, p);
4834 	    } catch (...) {
4835 	      snaps.clear();
4836 	    }
4837 	    out << " snaps " << snaps;
4838 	  }
4839 	  out << " ObjectCleanRegions " << e.clean_regions;
4840 	  return out;
4841 	}
4842 	
4843 	// -- pg_log_dup_t --
4844 	
4845 	std::string pg_log_dup_t::get_key_name() const
4846 	{
4847 	  static const char prefix[] = "dup_";
4848 	  std::string key(36, ' ');
4849 	  memcpy(&key[0], prefix, 4);
4850 	  version.get_key_name(&key[4]);
4851 	  key.resize(35); // remove the null terminator
4852 	  return key;
4853 	}
4854 	
/**
 * Serialize this dup entry into bl.  op_returns is the last field written
 * and is the one decode() only reads for struct_v >= 2.
 */
void pg_log_dup_t::encode(ceph::buffer::list &bl) const
{
  ENCODE_START(2, 1, bl);
  encode(reqid, bl);
  encode(version, bl);
  encode(user_version, bl);
  encode(return_code, bl);
  encode(op_returns, bl);
  ENCODE_FINISH(bl);
}
4865 	
/**
 * Deserialize a dup entry from bl.  op_returns is only present from
 * struct version 2 on; for older input it is left as it was.
 */
void pg_log_dup_t::decode(ceph::buffer::list::const_iterator &bl)
{
  DECODE_START(2, bl);
  decode(reqid, bl);
  decode(version, bl);
  decode(user_version, bl);
  decode(return_code, bl);
  if (struct_v >= 2) {
    decode(op_returns, bl);
  }
  DECODE_FINISH(bl);
}
4878 	
/**
 * Emit this dup entry to a Formatter.  All scalar fields go through
 * dump_stream; the "op_returns" array is only emitted when non-empty.
 */
void pg_log_dup_t::dump(Formatter *f) const
{
  f->dump_stream("reqid") << reqid;
  f->dump_stream("version") << version;
  f->dump_stream("user_version") << user_version;
  f->dump_stream("return_code") << return_code;
  if (!op_returns.empty()) {
    f->open_array_section("op_returns");
    for (auto& i : op_returns) {
      f->dump_object("op", i);
    }
    f->close_section();
  }
}
4893 	
4894 	void pg_log_dup_t::generate_test_instances(list<pg_log_dup_t*>& o)
4895 	{
4896 	  o.push_back(new pg_log_dup_t());
4897 	  o.push_back(new pg_log_dup_t(eversion_t(1,2),
4898 				       1,
4899 				       osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
4900 				       0));
4901 	  o.push_back(new pg_log_dup_t(eversion_t(1,2),
4902 				       2,
4903 				       osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
4904 				       -ENOENT));
4905 	}
4906 	
4907 	
4908 	std::ostream& operator<<(std::ostream& out, const pg_log_dup_t& e) {
4909 	  out << "log_dup(reqid=" << e.reqid <<
4910 	    " v=" << e.version << " uv=" << e.user_version <<
4911 	    " rc=" << e.return_code;
4912 	  if (!e.op_returns.empty()) {
4913 	    out << " " << e.op_returns;
4914 	  }
4915 	  return out << ")";
4916 	}
4917 	
4918 	
4919 	// -- pg_log_t --
4920 	
4921 	// out: pg_log_t that only has entries that apply to import_pgid using curmap
4922 	// reject: Entries rejected from "in" are in the reject.log.  Other fields not set.
4923 	void pg_log_t::filter_log(spg_t import_pgid, const OSDMap &curmap,
4924 	  const string &hit_set_namespace, const pg_log_t &in,
4925 	  pg_log_t &out, pg_log_t &reject)
4926 	{
4927 	  out = in;
4928 	  out.log.clear();
4929 	  reject.log.clear();
4930 	
4931 	  for (auto i = in.log.cbegin(); i != in.log.cend(); ++i) {
4932 	
4933 	    // Reject pg log entries for temporary objects
4934 	    if (i->soid.is_temp()) {
4935 	      reject.log.push_back(*i);
4936 	      continue;
4937 	    }
4938 	
4939 	    if (i->soid.nspace != hit_set_namespace) {
4940 	      object_t oid = i->soid.oid;
4941 	      object_locator_t loc(i->soid);
4942 	      pg_t raw_pgid = curmap.object_locator_to_pg(oid, loc);
4943 	      pg_t pgid = curmap.raw_pg_to_pg(raw_pgid);
4944 	
4945 	      if (import_pgid.pgid == pgid) {
4946 	        out.log.push_back(*i);
4947 	      } else {
4948 	        reject.log.push_back(*i);
4949 	      }
4950 	    } else {
4951 	      out.log.push_back(*i);
4952 	    }
4953 	  }
4954 	}
4955 	
/**
 * Serialize the log (head, tail, entries, rollback markers and dups) into
 * bl.  Field order must match pg_log_t::decode().
 */
void pg_log_t::encode(ceph::buffer::list& bl) const
{
  ENCODE_START(7, 3, bl);
  encode(head, bl);
  encode(tail, bl);
  encode(log, bl);
  encode(can_rollback_to, bl);
  encode(rollback_info_trimmed_to, bl);
  encode(dups, bl);
  ENCODE_FINISH(bl);
}
4967 	 
4968 	void pg_log_t::decode(ceph::buffer::list::const_iterator &bl, int64_t pool)
4969 	{
4970 	  DECODE_START_LEGACY_COMPAT_LEN(7, 3, 3, bl);
4971 	  decode(head, bl);
4972 	  decode(tail, bl);
4973 	  if (struct_v < 2) {
4974 	    bool backlog;
4975 	    decode(backlog, bl);
4976 	  }
4977 	  decode(log, bl);
4978 	  if (struct_v >= 5)
4979 	    decode(can_rollback_to, bl);
4980 	
4981 	  if (struct_v >= 6)
4982 	    decode(rollback_info_trimmed_to, bl);
4983 	  else
4984 	    rollback_info_trimmed_to = tail;
4985 	
4986 	  if (struct_v >= 7)
4987 	    decode(dups, bl);
4988 	
4989 	  DECODE_FINISH(bl);
4990 	
4991 	  // handle hobject_t format change
4992 	  if (struct_v < 4) {
4993 	    for (auto i = log.begin(); i != log.end(); ++i) {
4994 	      if (!i->soid.is_max() && i->soid.pool == -1)
4995 		i->soid.pool = pool;
4996 	    }
4997 	  }
4998 	}
4999 	
5000 	void pg_log_t::dump(Formatter *f) const
5001 	{
5002 	  f->dump_stream("head") << head;
5003 	  f->dump_stream("tail") << tail;
5004 	  f->open_array_section("log");
5005 	  for (auto p = log.cbegin(); p != log.cend(); ++p) {
5006 	    f->open_object_section("entry");
5007 	    p->dump(f);
5008 	    f->close_section();
5009 	  }
5010 	  f->close_section();
5011 	  f->open_array_section("dups");
5012 	  for (const auto& entry : dups) {
5013 	    f->open_object_section("entry");
5014 	    entry.dump(f);
5015 	    f->close_section();
5016 	  }
5017 	  f->close_section();
5018 	}
5019 	
5020 	void pg_log_t::generate_test_instances(list<pg_log_t*>& o)
5021 	{
5022 	  o.push_back(new pg_log_t);
5023 	
5024 	  // this is nonsensical:
5025 	  o.push_back(new pg_log_t);
5026 	  o.back()->head = eversion_t(1,2);
5027 	  o.back()->tail = eversion_t(3,4);
5028 	  list<pg_log_entry_t*> e;
5029 	  pg_log_entry_t::generate_test_instances(e);
5030 	  for (auto p = e.begin(); p != e.end(); ++p)
5031 	    o.back()->log.push_back(**p);
5032 	}
5033 	
5034 	static void _handle_dups(CephContext* cct, pg_log_t &target, const pg_log_t &other, unsigned maxdups)
5035 	{
5036 	  auto earliest_dup_version =
5037 		        target.head.version < maxdups ? 0u : target.head.version - maxdups + 1;
5038 	  lgeneric_subdout(cct, osd, 20) << "copy_up_to/copy_after earliest_dup_version " << earliest_dup_version << dendl;
5039 	
5040 	  for (auto d = other.dups.cbegin(); d != other.dups.cend(); ++d) {
5041 	    if (d->version.version >= earliest_dup_version) {
5042 	      lgeneric_subdout(cct, osd, 20)
5043 		      << "copy_up_to/copy_after copy dup version "
5044 		      << d->version << dendl;
5045 	      target.dups.push_back(pg_log_dup_t(*d));
5046 	    }
5047 	  }
5048 	
5049 	  for (auto i = other.log.cbegin(); i != other.log.cend(); ++i) {
5050 	    ceph_assert(i->version > other.tail);
5051 	    if (i->version > target.tail)
5052 	      break;
5053 	    if (i->version.version >= earliest_dup_version) {
5054 	      lgeneric_subdout(cct, osd, 20)
5055 			<< "copy_up_to/copy_after copy dup from log version "
5056 			<< i->version << dendl;
5057 	      target.dups.push_back(pg_log_dup_t(*i));
5058 	    }
5059 	  }
5060 	}
5061 	
5062 	
5063 	void pg_log_t::copy_after(CephContext* cct, const pg_log_t &other, eversion_t v)
5064 	{
5065 	  can_rollback_to = other.can_rollback_to;
5066 	  head = other.head;
5067 	  tail = other.tail;
5068 	  lgeneric_subdout(cct, osd, 20) << __func__ << " v " << v << dendl;
5069 	  for (auto i = other.log.crbegin(); i != other.log.crend(); ++i) {
5070 	    ceph_assert(i->version > other.tail);
5071 	    if (i->version <= v) {
5072 	      // make tail accurate.
5073 	      tail = i->version;
5074 	      break;
5075 	    }
5076 	    lgeneric_subdout(cct, osd, 20) << __func__ << " copy log version " << i->version << dendl;
5077 	    log.push_front(*i);
5078 	  }
5079 	  _handle_dups(cct, *this, other, cct->_conf->osd_pg_log_dups_tracked);
5080 	}
5081 	
5082 	void pg_log_t::copy_up_to(CephContext* cct, const pg_log_t &other, int max)
5083 	{
5084 	  can_rollback_to = other.can_rollback_to;
5085 	  int n = 0;
5086 	  head = other.head;
5087 	  tail = other.tail;
5088 	  lgeneric_subdout(cct, osd, 20) << __func__ << " max " << max << dendl;
5089 	  for (auto i = other.log.crbegin(); i != other.log.crend(); ++i) {
5090 	    ceph_assert(i->version > other.tail);
5091 	    if (n++ >= max) {
5092 	      tail = i->version;
5093 	      break;
5094 	    }
5095 	    lgeneric_subdout(cct, osd, 20) << __func__ << " copy log version " << i->version << dendl;
5096 	    log.push_front(*i);
5097 	  }
5098 	  _handle_dups(cct, *this, other, cct->_conf->osd_pg_log_dups_tracked);
5099 	}
5100 	
5101 	ostream& pg_log_t::print(ostream& out) const
5102 	{
5103 	  out << *this << std::endl;
5104 	  for (auto p = log.cbegin(); p != log.cend(); ++p)
5105 	    out << *p << std::endl;
5106 	  for (const auto& entry : dups) {
5107 	    out << " dup entry: " << entry << std::endl;
5108 	  }
5109 	  return out;
5110 	}
5111 	
5112 	// -- pg_missing_t --
5113 	
5114 	ostream& operator<<(ostream& out, const pg_missing_item& i)
5115 	{
5116 	  out << i.need;
5117 	  if (i.have != eversion_t())
5118 	    out << "(" << i.have << ")";
5119 	  out << " flags = " << i.flag_str()
5120 	      << " " << i.clean_regions;
5121 	  return out;
5122 	}
5123 	
5124 	// -- object_copy_cursor_t --
5125 	
/**
 * Serialize the copy cursor (attr/data/omap progress) into bl.
 * Field order must match object_copy_cursor_t::decode().
 */
void object_copy_cursor_t::encode(ceph::buffer::list& bl) const
{
  ENCODE_START(1, 1, bl);
  encode(attr_complete, bl);
  encode(data_offset, bl);
  encode(data_complete, bl);
  encode(omap_offset, bl);
  encode(omap_complete, bl);
  ENCODE_FINISH(bl);
}
5136 	
/**
 * Deserialize the copy cursor from bl, in the same order
 * object_copy_cursor_t::encode() writes it.
 */
void object_copy_cursor_t::decode(ceph::buffer::list::const_iterator &bl)
{
  DECODE_START(1, bl);
  decode(attr_complete, bl);
  decode(data_offset, bl);
  decode(data_complete, bl);
  decode(omap_offset, bl);
  decode(omap_complete, bl);
  DECODE_FINISH(bl);
}
5147 	
/**
 * Emit the cursor to a Formatter.  The *_complete flags are cast to int and
 * emitted via dump_unsigned; omap_offset is emitted as a string.
 */
void object_copy_cursor_t::dump(Formatter *f) const
{
  f->dump_unsigned("attr_complete", (int)attr_complete);
  f->dump_unsigned("data_offset", data_offset);
  f->dump_unsigned("data_complete", (int)data_complete);
  f->dump_string("omap_offset", omap_offset);
  f->dump_unsigned("omap_complete", (int)omap_complete);
}
5156 	
5157 	void object_copy_cursor_t::generate_test_instances(list<object_copy_cursor_t*>& o)
5158 	{
5159 	  o.push_back(new object_copy_cursor_t);
5160 	  o.push_back(new object_copy_cursor_t);
5161 	  o.back()->attr_complete = true;
5162 	  o.back()->data_offset = 123;
5163 	  o.push_back(new object_copy_cursor_t);
5164 	  o.back()->attr_complete = true;
5165 	  o.back()->data_complete = true;
5166 	  o.back()->omap_offset = "foo";
5167 	  o.push_back(new object_copy_cursor_t);
5168 	  o.back()->attr_complete = true;
5169 	  o.back()->data_complete = true;
5170 	  o.back()->omap_complete = true;
5171 	}
5172 	
5173 	// -- object_copy_data_t --
5174 	
/**
 * Serialize the copy payload into bl using the current layout (the
 * `struct_v >= 5` branch of decode()).
 *
 * The `features` argument is not referenced in this body.
 */
void object_copy_data_t::encode(ceph::buffer::list& bl, uint64_t features) const
{
  ENCODE_START(8, 5, bl);
  encode(size, bl);
  encode(mtime, bl);
  encode(attrs, bl);
  encode(data, bl);
  encode(omap_data, bl);
  encode(cursor, bl);
  encode(omap_header, bl);
  encode(snaps, bl);
  encode(snap_seq, bl);
  encode(flags, bl);
  encode(data_digest, bl);
  encode(omap_digest, bl);
  encode(reqids, bl);
  encode(truncate_seq, bl);
  encode(truncate_size, bl);
  encode(reqid_return_codes, bl);
  ENCODE_FINISH(bl);
}
5196 	
/**
 * Deserialize the copy payload from bl.
 *
 * Two layouts are understood: the old one (struct_v < 5), which carried an
 * unused category string and the omap as a decoded map, and the current
 * one, which carries omap_data as an opaque buffer.
 */
void object_copy_data_t::decode(ceph::buffer::list::const_iterator& bl)
{
  DECODE_START(8, bl);
  if (struct_v < 5) {
    // old
    decode(size, bl);
    decode(mtime, bl);
    {
      string category;
      decode(category, bl);  // no longer used
    }
    decode(attrs, bl);
    decode(data, bl);
    {
      // the old layout stored the omap as a map; re-encode it into
      // omap_data (left empty when the map is empty)
      map<string,ceph::buffer::list> omap;
      decode(omap, bl);
      omap_data.clear();
      if (!omap.empty()) {
	using ceph::encode;
	encode(omap, omap_data);
      }
    }
    decode(cursor, bl);
    if (struct_v >= 2)
      decode(omap_header, bl);
    if (struct_v >= 3) {
      decode(snaps, bl);
      decode(snap_seq, bl);
    } else {
      snaps.clear();
      snap_seq = 0;
    }
    if (struct_v >= 4) {
      decode(flags, bl);
      decode(data_digest, bl);
      decode(omap_digest, bl);
    }
  } else {
    // current
    decode(size, bl);
    decode(mtime, bl);
    decode(attrs, bl);
    decode(data, bl);
    decode(omap_data, bl);
    decode(cursor, bl);
    decode(omap_header, bl);
    decode(snaps, bl);
    decode(snap_seq, bl);
    if (struct_v >= 4) {
      decode(flags, bl);
      decode(data_digest, bl);
      decode(omap_digest, bl);
    }
    if (struct_v >= 6) {
      decode(reqids, bl);
    }
    if (struct_v >= 7) {
      decode(truncate_seq, bl);
      decode(truncate_size, bl);
    }
    if (struct_v >= 8) {
      decode(reqid_return_codes, bl);
    }
  }
  DECODE_FINISH(bl);
}
5263 	
5264 	void object_copy_data_t::generate_test_instances(list<object_copy_data_t*>& o)
5265 	{
5266 	  o.push_back(new object_copy_data_t());
5267 	
5268 	  list<object_copy_cursor_t*> cursors;
5269 	  object_copy_cursor_t::generate_test_instances(cursors);
5270 	  auto ci = cursors.begin();
5271 	  o.back()->cursor = **(ci++);
5272 	
5273 	  o.push_back(new object_copy_data_t());
5274 	  o.back()->cursor = **(ci++);
5275 	
5276 	  o.push_back(new object_copy_data_t());
5277 	  o.back()->size = 1234;
5278 	  o.back()->mtime.set_from_double(1234);
5279 	  ceph::buffer::ptr bp("there", 5);
5280 	  ceph::buffer::list bl;
5281 	  bl.push_back(bp);
5282 	  o.back()->attrs["hello"] = bl;
5283 	  ceph::buffer::ptr bp2("not", 3);
5284 	  ceph::buffer::list bl2;
5285 	  bl2.push_back(bp2);
5286 	  map<string,ceph::buffer::list> omap;
5287 	  omap["why"] = bl2;
5288 	  using ceph::encode;
5289 	  encode(omap, o.back()->omap_data);
5290 	  ceph::buffer::ptr databp("iamsomedatatocontain", 20);
5291 	  o.back()->data.push_back(databp);
5292 	  o.back()->omap_header.append("this is an omap header");
5293 	  o.back()->snaps.push_back(123);
5294 	  o.back()->reqids.push_back(make_pair(osd_reqid_t(), version_t()));
5295 	}
5296 	
5297 	void object_copy_data_t::dump(Formatter *f) const
5298 	{
5299 	  f->open_object_section("cursor");
5300 	  cursor.dump(f);
5301 	  f->close_section(); // cursor
5302 	  f->dump_int("size", size);
5303 	  f->dump_stream("mtime") << mtime;
5304 	  /* we should really print out the attrs here, but ceph::buffer::list
5305 	     const-correctness prevents that */
5306 	  f->dump_int("attrs_size", attrs.size());
5307 	  f->dump_int("flags", flags);
5308 	  f->dump_unsigned("data_digest", data_digest);
5309 	  f->dump_unsigned("omap_digest", omap_digest);
5310 	  f->dump_int("omap_data_length", omap_data.length());
5311 	  f->dump_int("omap_header_length", omap_header.length());
5312 	  f->dump_int("data_length", data.length());
5313 	  f->open_array_section("snaps");
5314 	  for (auto p = snaps.cbegin(); p != snaps.cend(); ++p)
5315 	    f->dump_unsigned("snap", *p);
5316 	  f->close_section();
5317 	  f->open_array_section("reqids");
5318 	  uint32_t idx = 0;
5319 	  for (auto p = reqids.begin();
5320 	       p != reqids.end();
5321 	       ++idx, ++p) {
5322 	    f->open_object_section("extra_reqid");
5323 	    f->dump_stream("reqid") << p->first;
5324 	    f->dump_stream("user_version") << p->second;
5325 	    auto it = reqid_return_codes.find(idx);
5326 	    if (it != reqid_return_codes.end()) {
5327 	      f->dump_int("return_code", it->second);
5328 	    }
5329 	    f->close_section();
5330 	  }
5331 	  f->close_section();
5332 	}
5333 	
5334 	// -- pg_create_t --
5335 	
/**
 * Serialize created, parent and split_bits into bl.
 * Field order must match pg_create_t::decode().
 */
void pg_create_t::encode(ceph::buffer::list &bl) const
{
  ENCODE_START(1, 1, bl);
  encode(created, bl);
  encode(parent, bl);
  encode(split_bits, bl);
  ENCODE_FINISH(bl);
}
5344 	
/**
 * Deserialize created, parent and split_bits from bl, in the order
 * pg_create_t::encode() writes them.
 */
void pg_create_t::decode(ceph::buffer::list::const_iterator &bl)
{
  DECODE_START(1, bl);
  decode(created, bl);
  decode(parent, bl);
  decode(split_bits, bl);
  DECODE_FINISH(bl);
}
5353 	
// Emit created (unsigned), parent (streamed) and split_bits (int) to a
// Formatter.
void pg_create_t::dump(Formatter *f) const
{
  f->dump_unsigned("created", created);
  f->dump_stream("parent") << parent;
  f->dump_int("split_bits", split_bits);
}
5360 	
// Append sample instances to o: a default one and one built from
// (1, pg_t(3, 4), 2).  The pointers are heap-allocated; the caller owns them.
void pg_create_t::generate_test_instances(list<pg_create_t*>& o)
{
  o.push_back(new pg_create_t);
  o.push_back(new pg_create_t(1, pg_t(3, 4), 2));
}
5366 	
5367 	
5368 	// -- pg_hit_set_info_t --
5369 	
/**
 * Serialize begin, end, version and using_gmt into bl.  using_gmt is the
 * field decode() only reads for struct_v >= 2.
 */
void pg_hit_set_info_t::encode(ceph::buffer::list& bl) const
{
  ENCODE_START(2, 1, bl);
  encode(begin, bl);
  encode(end, bl);
  encode(version, bl);
  encode(using_gmt, bl);
  ENCODE_FINISH(bl);
}
5379 	
/**
 * Deserialize a hit set info record from p.  using_gmt only exists from
 * struct version 2 on and is set to false for older input.
 */
void pg_hit_set_info_t::decode(ceph::buffer::list::const_iterator& p)
{
  DECODE_START(2, p);
  decode(begin, p);
  decode(end, p);
  decode(version, p);
  if (struct_v >= 2) {
    decode(using_gmt, p);
  } else {
    using_gmt = false;
  }
  DECODE_FINISH(p);
}
5393 	
// Emit all four fields to a Formatter via dump_stream (including the
// using_gmt flag).
void pg_hit_set_info_t::dump(Formatter *f) const
{
  f->dump_stream("begin") << begin;
  f->dump_stream("end") << end;
  f->dump_stream("version") << version;
  f->dump_stream("using_gmt") << using_gmt;
}
5401 	
5402 	void pg_hit_set_info_t::generate_test_instances(list<pg_hit_set_info_t*>& ls)
5403 	{
5404 	  ls.push_back(new pg_hit_set_info_t);
5405 	  ls.push_back(new pg_hit_set_info_t);
5406 	  ls.back()->begin = utime_t(1, 2);
5407 	  ls.back()->end = utime_t(3, 4);
5408 	}
5409 	
5410 	
5411 	// -- pg_hit_set_history_t --
5412 	
/**
 * Serialize the hit set history into bl.
 *
 * Between current_last_update and history, a default-constructed utime_t
 * and pg_hit_set_info_t are written as placeholders; decode() reads and
 * discards values at the same positions.
 */
void pg_hit_set_history_t::encode(ceph::buffer::list& bl) const
{
  ENCODE_START(1, 1, bl);
  encode(current_last_update, bl);
  {
    utime_t dummy_stamp;
    encode(dummy_stamp, bl);
  }
  {
    pg_hit_set_info_t dummy_info;
    encode(dummy_info, bl);
  }
  encode(history, bl);
  ENCODE_FINISH(bl);
}
5428 	
/**
 * Deserialize the hit set history from p.
 *
 * The utime_t and pg_hit_set_info_t that follow current_last_update are
 * decoded into locals and dropped; only current_last_update and history
 * are kept.
 */
void pg_hit_set_history_t::decode(ceph::buffer::list::const_iterator& p)
{
  DECODE_START(1, p);
  decode(current_last_update, p);
  {
    utime_t dummy_stamp;
    decode(dummy_stamp, p);
  }
  {
    pg_hit_set_info_t dummy_info;
    decode(dummy_info, p);
  }
  decode(history, p);
  DECODE_FINISH(p);
}
5444 	
5445 	void pg_hit_set_history_t::dump(Formatter *f) const
5446 	{
5447 	  f->dump_stream("current_last_update") << current_last_update;
5448 	  f->open_array_section("history");
5449 	  for (auto p = history.cbegin(); p != history.cend(); ++p) {
5450 	    f->open_object_section("info");
5451 	    p->dump(f);
5452 	    f->close_section();
5453 	  }
5454 	  f->close_section();
5455 	}
5456 	
5457 	void pg_hit_set_history_t::generate_test_instances(list<pg_hit_set_history_t*>& ls)
5458 	{
5459 	  ls.push_back(new pg_hit_set_history_t);
5460 	  ls.push_back(new pg_hit_set_history_t);
5461 	  ls.back()->current_last_update = eversion_t(1, 2);
5462 	  ls.back()->history.push_back(pg_hit_set_info_t());
5463 	}
5464 	
5465 	// -- OSDSuperblock --
5466 	
/**
 * Serialize the superblock into bl.
 *
 * Two legacy slots are still written as zero placeholders (see the inline
 * comments); decode() reads and discards values at those positions.
 */
void OSDSuperblock::encode(ceph::buffer::list &bl) const
{
  ENCODE_START(9, 5, bl);
  encode(cluster_fsid, bl);
  encode(whoami, bl);
  encode(current_epoch, bl);
  encode(oldest_map, bl);
  encode(newest_map, bl);
  encode(weight, bl);
  compat_features.encode(bl);
  encode(clean_thru, bl);
  encode(mounted, bl);
  encode(osd_fsid, bl);
  encode((epoch_t)0, bl);  // epoch_t last_epoch_marked_full
  encode((uint32_t)0, bl);  // map<int64_t,epoch_t> pool_last_epoch_marked_full
  encode(purged_snaps_last, bl);
  encode(last_purged_snaps_scrub, bl);
  ENCODE_FINISH(bl);
}
5486 	
/**
 * Deserialize the superblock from bl, accepting struct versions up to 9.
 *
 * Each `struct_v` test gates a field that only exists in some versions;
 * fields that are no longer kept are decoded into locals and dropped.
 */
void OSDSuperblock::decode(ceph::buffer::list::const_iterator &bl)
{
  DECODE_START_LEGACY_COMPAT_LEN(9, 5, 5, bl);
  if (struct_v < 3) {
    // versions before 3 started with a magic string; read and discard it
    string magic;
    decode(magic, bl);
  }
  decode(cluster_fsid, bl);
  decode(whoami, bl);
  decode(current_epoch, bl);
  decode(oldest_map, bl);
  decode(newest_map, bl);
  decode(weight, bl);
  if (struct_v >= 2) {
    compat_features.decode(bl);
  } else { //upgrade it!
    compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
  }
  decode(clean_thru, bl);
  decode(mounted, bl);
  if (struct_v >= 4)
    decode(osd_fsid, bl);
  if (struct_v >= 6) {
    // legacy field: decoded to advance the iterator, value unused
    epoch_t last_map_marked_full;
    decode(last_map_marked_full, bl);
  }
  if (struct_v >= 7) {
    // legacy field: decoded to advance the iterator, value unused
    map<int64_t,epoch_t> pool_last_map_marked_full;
    decode(pool_last_map_marked_full, bl);
  }
  if (struct_v >= 9) {
    decode(purged_snaps_last, bl);
    decode(last_purged_snaps_scrub, bl);
  } else {
    // only purged_snaps_last is reset here; last_purged_snaps_scrub keeps
    // whatever value the object already had
    purged_snaps_last = 0;
  }
  DECODE_FINISH(bl);
}
5525 	
/**
 * Emit the superblock to a Formatter.  compat_features is dumped inside a
 * "compat" object section; `mounted` is emitted under the key
 * "last_epoch_mounted".
 */
void OSDSuperblock::dump(Formatter *f) const
{
  f->dump_stream("cluster_fsid") << cluster_fsid;
  f->dump_stream("osd_fsid") << osd_fsid;
  f->dump_int("whoami", whoami);
  f->dump_int("current_epoch", current_epoch);
  f->dump_int("oldest_map", oldest_map);
  f->dump_int("newest_map", newest_map);
  f->dump_float("weight", weight);
  f->open_object_section("compat");
  compat_features.dump(f);
  f->close_section();
  f->dump_int("clean_thru", clean_thru);
  f->dump_int("last_epoch_mounted", mounted);
  f->dump_unsigned("purged_snaps_last", purged_snaps_last);
  f->dump_stream("last_purged_snaps_scrub") << last_purged_snaps_scrub;
}
5543 	
5544 	void OSDSuperblock::generate_test_instances(list<OSDSuperblock*>& o)
5545 	{
5546 	  OSDSuperblock z;
5547 	  o.push_back(new OSDSuperblock(z));
5548 	  z.cluster_fsid.parse("01010101-0101-0101-0101-010101010101");
5549 	  z.osd_fsid.parse("02020202-0202-0202-0202-020202020202");
5550 	  z.whoami = 3;
5551 	  z.current_epoch = 4;
5552 	  z.oldest_map = 5;
5553 	  z.newest_map = 9;
5554 	  z.mounted = 8;
5555 	  z.clean_thru = 7;
5556 	  o.push_back(new OSDSuperblock(z));
5557 	  o.push_back(new OSDSuperblock(z));
5558 	}
5559 	
5560 	// -- SnapSet --
5561 	
// Append the encoding of this SnapSet to bl.
// The legacy head_exists slot is still written (always as true) so the
// field order matches what decode() skips over.
void SnapSet::encode(ceph::buffer::list& bl) const
{
  ENCODE_START(3, 2, bl);
  encode(seq, bl);
  encode(true, bl);  // head_exists
  encode(snaps, bl);
  encode(clones, bl);
  encode(clone_overlap, bl);
  encode(clone_size, bl);
  encode(clone_snaps, bl);
  ENCODE_FINISH(bl);
}
5574 	
// Decode a SnapSet from bl.
// clone_snaps is only present when struct_v >= 3; for older encodings it is
// cleared.
void SnapSet::decode(ceph::buffer::list::const_iterator& bl)
{
  DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
  decode(seq, bl);
  bl.advance(1u);  // skip legacy head_exists (always true)
  decode(snaps, bl);
  decode(clones, bl);
  decode(clone_overlap, bl);
  decode(clone_size, bl);
  if (struct_v >= 3) {
    decode(clone_snaps, bl);
  } else {
    clone_snaps.clear();
  }
  DECODE_FINISH(bl);
}
5591 	
5592 	void SnapSet::dump(Formatter *f) const
5593 	{
5594 	  f->dump_unsigned("seq", seq);
5595 	  f->open_array_section("clones");
5596 	  for (auto p = clones.cbegin(); p != clones.cend(); ++p) {
5597 	    f->open_object_section("clone");
5598 	    f->dump_unsigned("snap", *p);
5599 	    auto cs = clone_size.find(*p);
5600 	    if (cs != clone_size.end())
5601 	      f->dump_unsigned("size", cs->second);
5602 	    else
5603 	      f->dump_string("size", "????");
5604 	    auto co = clone_overlap.find(*p);
5605 	    if (co != clone_overlap.end())
5606 	      f->dump_stream("overlap") << co->second;
5607 	    else
5608 	      f->dump_stream("overlap") << "????";
5609 	    auto q = clone_snaps.find(*p);
5610 	    if (q != clone_snaps.end()) {
5611 	      f->open_array_section("snaps");
5612 	      for (auto s : q->second) {
5613 		f->dump_unsigned("snap", s);
5614 	      }
5615 	      f->close_section();
5616 	    }
5617 	    f->close_section();
5618 	  }
5619 	  f->close_section();
5620 	}
5621 	
5622 	void SnapSet::generate_test_instances(list<SnapSet*>& o)
5623 	{
5624 	  o.push_back(new SnapSet);
5625 	  o.push_back(new SnapSet);
5626 	  o.back()->seq = 123;
5627 	  o.back()->snaps.push_back(123);
5628 	  o.back()->snaps.push_back(12);
5629 	  o.push_back(new SnapSet);
5630 	  o.back()->seq = 123;
5631 	  o.back()->snaps.push_back(123);
5632 	  o.back()->snaps.push_back(12);
5633 	  o.back()->clones.push_back(12);
5634 	  o.back()->clone_size[12] = 12345;
5635 	  o.back()->clone_overlap[12];
5636 	  o.back()->clone_snaps[12] = {12, 10, 8};
5637 	}
5638 	
5639 	ostream& operator<<(ostream& out, const SnapSet& cs)
5640 	{
5641 	  return out << cs.seq << "=" << cs.snaps << ":"
5642 		     << cs.clone_snaps;
5643 	}
5644 	
5645 	void SnapSet::from_snap_set(const librados::snap_set_t& ss, bool legacy)
5646 	{
5647 	  // NOTE: our reconstruction of snaps (and the snapc) is not strictly
5648 	  // correct: it will not include snaps that still logically exist
5649 	  // but for which there was no clone that is defined.  For all
5650 	  // practical purposes this doesn't matter, since we only use that
5651 	  // information to clone on the OSD, and we have already moved
5652 	  // forward past that part of the object history.
5653 	
5654 	  seq = ss.seq;
5655 	  set<snapid_t> _snaps;
5656 	  set<snapid_t> _clones;
5657 	  for (auto p = ss.clones.cbegin(); p != ss.clones.cend(); ++p) {
5658 	    if (p->cloneid != librados::SNAP_HEAD) {
5659 	      _clones.insert(p->cloneid);
5660 	      _snaps.insert(p->snaps.begin(), p->snaps.end());
5661 	      clone_size[p->cloneid] = p->size;
5662 	      clone_overlap[p->cloneid];  // the entry must exist, even if it's empty.
5663 	      for (auto q = p->overlap.cbegin(); q != p->overlap.cend(); ++q)
5664 		clone_overlap[p->cloneid].insert(q->first, q->second);
5665 	      if (!legacy) {
5666 		// p->snaps is ascending; clone_snaps is descending
5667 		vector<snapid_t>& v = clone_snaps[p->cloneid];
5668 		for (auto q = p->snaps.rbegin(); q != p->snaps.rend(); ++q) {
5669 		  v.push_back(*q);
5670 		}
5671 	      }
5672 	    }
5673 	  }
5674 	
5675 	  // ascending
5676 	  clones.clear();
5677 	  clones.reserve(_clones.size());
5678 	  for (auto p = _clones.begin(); p != _clones.end(); ++p)
5679 	    clones.push_back(*p);
5680 	
5681 	  // descending
5682 	  snaps.clear();
5683 	  snaps.reserve(_snaps.size());
5684 	  for (auto p = _snaps.rbegin();
5685 	       p != _snaps.rend(); ++p)
5686 	    snaps.push_back(*p);
5687 	}
5688 	
5689 	uint64_t SnapSet::get_clone_bytes(snapid_t clone) const
5690 	{
5691 	  ceph_assert(clone_size.count(clone));
5692 	  uint64_t size = clone_size.find(clone)->second;
5693 	  ceph_assert(clone_overlap.count(clone));
5694 	  const interval_set<uint64_t> &overlap = clone_overlap.find(clone)->second;
5695 	  ceph_assert(size >= (uint64_t)overlap.size());
5696 	  return size - overlap.size();
5697 	}
5698 	
5699 	void SnapSet::filter(const pg_pool_t &pinfo)
5700 	{
5701 	  vector<snapid_t> oldsnaps;
5702 	  oldsnaps.swap(snaps);
5703 	  for (auto i = oldsnaps.cbegin(); i != oldsnaps.cend(); ++i) {
5704 	    if (!pinfo.is_removed_snap(*i))
5705 	      snaps.push_back(*i);
5706 	  }
5707 	}
5708 	
5709 	SnapSet SnapSet::get_filtered(const pg_pool_t &pinfo) const
5710 	{
5711 	  SnapSet ss = *this;
5712 	  ss.filter(pinfo);
5713 	  return ss;
5714 	}
5715 	
5716 	// -- watch_info_t --
5717 	
// Append the encoding of this watch_info_t to bl.
// features is only passed through to the encoding of addr.
void watch_info_t::encode(ceph::buffer::list& bl, uint64_t features) const
{
  ENCODE_START(4, 3, bl);
  encode(cookie, bl);
  encode(timeout_seconds, bl);
  encode(addr, bl, features);
  ENCODE_FINISH(bl);
}
5726 	
// Decode a watch_info_t from bl.
// addr is only read when struct_v >= 4; for older encodings it is left
// untouched.
void watch_info_t::decode(ceph::buffer::list::const_iterator& bl)
{
  DECODE_START_LEGACY_COMPAT_LEN(4, 3, 3, bl);
  decode(cookie, bl);
  if (struct_v < 2) {
    // v1 has an extra uint64_t after the cookie; read and drop it
    uint64_t ver;
    decode(ver, bl);
  }
  decode(timeout_seconds, bl);
  if (struct_v >= 4) {
    decode(addr, bl);
  }
  DECODE_FINISH(bl);
}
5741 	
// Write cookie and timeout_seconds to the formatter f, followed by addr
// inside its own "addr" object section.
void watch_info_t::dump(Formatter *f) const
{
  f->dump_unsigned("cookie", cookie);
  f->dump_unsigned("timeout_seconds", timeout_seconds);
  f->open_object_section("addr");
  addr.dump(f);
  f->close_section();
}
5750 	
5751 	void watch_info_t::generate_test_instances(list<watch_info_t*>& o)
5752 	{
5753 	  o.push_back(new watch_info_t);
5754 	  o.push_back(new watch_info_t);
5755 	  o.back()->cookie = 123;
5756 	  o.back()->timeout_seconds = 99;
5757 	  entity_addr_t ea;
5758 	  ea.set_type(entity_addr_t::TYPE_LEGACY);
5759 	  ea.set_nonce(1);
5760 	  ea.set_family(AF_INET);
5761 	  ea.set_in4_quad(0, 127);
5762 	  ea.set_in4_quad(1, 0);
5763 	  ea.set_in4_quad(2, 1);
5764 	  ea.set_in4_quad(3, 2);
5765 	  ea.set_port(2);
5766 	  o.back()->addr = ea;
5767 	}
5768 	
5769 	// -- chunk_info_t --
5770 	
// Append the encoding of this chunk_info_t to bl, in the order
// offset, length, oid, flags.
void chunk_info_t::encode(ceph::buffer::list& bl) const
{
  ENCODE_START(1, 1, bl);
  encode(offset, bl);
  encode(length, bl);
  encode(oid, bl);
  // flags goes on the wire as a __u32; decode() casts it back to cflag_t
  __u32 _flags = flags;
  encode(_flags, bl);
  ENCODE_FINISH(bl);
}
5781 	
// Decode a chunk_info_t from bl, in the order offset, length, oid, flags.
void chunk_info_t::decode(ceph::buffer::list::const_iterator& bl)
{
  DECODE_START(1, bl);
  decode(offset, bl);
  decode(length, bl);
  decode(oid, bl);
  // flags is read as a __u32 and cast to cflag_t without validation
  __u32 _flags;
  decode(_flags, bl);
  flags = (cflag_t)_flags;
  DECODE_FINISH(bl);
}
5793 	
// Write length, oid (in its own "oid" object section) and flags to the
// formatter f.  The offset member is not emitted by this function.
void chunk_info_t::dump(Formatter *f) const
{
  f->dump_unsigned("length", length);
  f->open_object_section("oid");
  oid.dump(f);
  f->close_section();
  f->dump_unsigned("flags", flags);
}
5802 	
5803 	ostream& operator<<(ostream& out, const chunk_info_t& ci)
5804 	{
5805 	  return out << "(len: " << ci.length << " oid: " << ci.oid
5806 		     << " offset: " << ci.offset
5807 		     << " flags: " << ci.get_flag_string(ci.flags) << ")";
5808 	}
5809 	
5810 	// -- object_manifest_t --
5811 	
// Append the encoding of this manifest to bl: the type, followed by the
// payload that belongs to that type (nothing for TYPE_NONE).
// Aborts if type is not one of the handled values.
void object_manifest_t::encode(ceph::buffer::list& bl) const
{
  ENCODE_START(1, 1, bl);
  encode(type, bl);
  switch (type) {
    case TYPE_NONE: break;
    case TYPE_REDIRECT: 
      encode(redirect_target, bl);
      break;
    case TYPE_CHUNKED:
      encode(chunk_map, bl);
      break;
    default:
      ceph_abort();
  }
  ENCODE_FINISH(bl);
}
5829 	
// Decode a manifest from bl: the type, followed by the payload that belongs
// to that type (nothing for TYPE_NONE).
//
// NOTE(review): an unrecognized type value read from the buffer reaches
// ceph_abort() below instead of being reported as a decode error; confirm
// that aborting on such input is intended.
void object_manifest_t::decode(ceph::buffer::list::const_iterator& bl)
{
  DECODE_START(1, bl);
  decode(type, bl);
  switch (type) {
    case TYPE_NONE: break;
    case TYPE_REDIRECT: 
      decode(redirect_target, bl);
      break;
    case TYPE_CHUNKED:
      decode(chunk_map, bl);
      break;
    default:
      ceph_abort();
  }
  DECODE_FINISH(bl);
}
5847 	
5848 	void object_manifest_t::dump(Formatter *f) const
5849 	{
5850 	  f->dump_unsigned("type", type);
5851 	  if (type == TYPE_REDIRECT) {
5852 	    f->open_object_section("redirect_target");
5853 	    redirect_target.dump(f);
5854 	    f->close_section();
5855 	  } else if (type == TYPE_CHUNKED) {
5856 	    f->open_array_section("chunk_map");
5857 	    for (auto& p : chunk_map) {
5858 	      f->open_object_section("chunk");
5859 	      f->dump_unsigned("offset", p.first);
5860 	      p.second.dump(f);
5861 	      f->close_section();
5862 	    }
5863 	    f->close_section();
5864 	  }
5865 	}
5866 	
5867 	void object_manifest_t::generate_test_instances(list<object_manifest_t*>& o)
5868 	{
5869 	  o.push_back(new object_manifest_t());
5870 	  o.back()->type = TYPE_REDIRECT;
5871 	}
5872 	
5873 	ostream& operator<<(ostream& out, const object_manifest_t& om)
5874 	{
5875 	  out << "manifest(" << om.get_type_name();
5876 	  if (om.is_redirect()) {
5877 	    out << " " << om.redirect_target;
5878 	  } else if (om.is_chunked()) {
5879 	    out << " " << om.chunk_map;
5880 	  }
5881 	  out << ")";
5882 	  return out;
5883 	}
5884 	
5885 	// -- object_info_t --
5886 	
5887 	void object_info_t::copy_user_bits(const object_info_t& other)
5888 	{
5889 	  // these bits are copied from head->clone.
5890 	  size = other.size;
5891 	  mtime = other.mtime;
5892 	  local_mtime = other.local_mtime;
5893 	  last_reqid = other.last_reqid;
5894 	  truncate_seq = other.truncate_seq;
5895 	  truncate_size = other.truncate_size;
5896 	  flags = other.flags;
5897 	  user_version = other.user_version;
5898 	  data_digest = other.data_digest;
5899 	  omap_digest = other.omap_digest;
5900 	}
5901 	
// Append the encoding of this object_info_t to bl.
//
// Several slots exist only to keep the field layout that decode() expects:
// placeholders are written for the retired category, wrlock_by and
// legacy_snaps fields, and the watchers are written twice (once in the old
// form keyed by entity name only, once in the current form).
// features is passed through to the encoding of both watcher maps.
void object_info_t::encode(ceph::buffer::list& bl, uint64_t features) const
{
  object_locator_t myoloc(soid);
  // old-style watcher map: keyed by the entity name part of the watchers key
  map<entity_name_t, watch_info_t> old_watchers;
  for (auto i = watchers.cbegin(); i != watchers.cend(); ++i) {
    old_watchers.insert(make_pair(i->first.second, i->second));
  }
  ENCODE_START(17, 8, bl);
  encode(soid, bl);
  encode(myoloc, bl);	//Retained for compatibility
  encode((__u32)0, bl); // was category, no longer used
  encode(version, bl);
  encode(prior_version, bl);
  encode(last_reqid, bl);
  encode(size, bl);
  encode(mtime, bl);
  // which placeholder is written depends on whether this is a head object;
  // decode() makes the same choice from soid.snap
  if (soid.snap == CEPH_NOSNAP)
    encode(osd_reqid_t(), bl);  // used to be wrlock_by
  else
    encode((uint32_t)0, bl);    // was legacy_snaps
  encode(truncate_seq, bl);
  encode(truncate_size, bl);
  // decode() reads this slot as the legacy one-byte flags value
  encode(is_lost(), bl);
  encode(old_watchers, bl, features);
  /* shenanigans to avoid breaking backwards compatibility in the disk format.
   * When we can, switch this out for simply putting the version_t on disk. */
  eversion_t user_eversion(0, user_version);
  encode(user_eversion, bl);
  encode(test_flag(FLAG_USES_TMAP), bl);
  encode(watchers, bl, features);
  __u32 _flags = flags;
  encode(_flags, bl);
  encode(local_mtime, bl);
  encode(data_digest, bl);
  encode(omap_digest, bl);
  encode(expected_object_size, bl);
  encode(expected_write_size, bl);
  encode(alloc_hint_flags, bl);
  // the manifest is only present on the wire when has_manifest() is true
  if (has_manifest()) {
    encode(manifest, bl);
  }
  ENCODE_FINISH(bl);
}
5945 	
// Decode an object_info_t from bl.
//
// Retired fields (category, wrlock_by, legacy_snaps) are decoded into
// temporaries and dropped.  Fields that older encodings lack are given
// fallback values in the else-branches below.
void object_info_t::decode(ceph::buffer::list::const_iterator& bl)
{
  object_locator_t myoloc;
  DECODE_START_LEGACY_COMPAT_LEN(17, 8, 8, bl);
  map<entity_name_t, watch_info_t> old_watchers;
  decode(soid, bl);
  decode(myoloc, bl);
  {
    string category;
    decode(category, bl);  // no longer used
  }
  decode(version, bl);
  decode(prior_version, bl);
  decode(last_reqid, bl);
  decode(size, bl);
  decode(mtime, bl);
  // the type stored in this slot depends on whether soid is a head object
  if (soid.snap == CEPH_NOSNAP) {
    osd_reqid_t wrlock_by;
    decode(wrlock_by, bl);
  } else {
    vector<snapid_t> legacy_snaps;
    decode(legacy_snaps, bl);
  }
  decode(truncate_seq, bl);
  decode(truncate_size, bl);

  // if this is struct_v >= 13, we will overwrite this
  // below since this field is just here for backwards
  // compatibility
  __u8 lo;
  decode(lo, bl);
  flags = (flag_t)lo;

  decode(old_watchers, bl);
  // user_version is stored wrapped in an eversion_t; only its version part
  // is kept
  eversion_t user_eversion;
  decode(user_eversion, bl);
  user_version = user_eversion.version;

  if (struct_v >= 9) {
    bool uses_tmap = false;
    decode(uses_tmap, bl);
    if (uses_tmap)
      set_flag(FLAG_USES_TMAP);
  } else {
    // encodings older than v9 have no such field; the flag is always set
    set_flag(FLAG_USES_TMAP);
  }
  if (struct_v < 10)
    soid.pool = myoloc.pool;
  if (struct_v >= 11) {
    decode(watchers, bl);
  } else {
    // rebuild the current watcher map, keyed by (cookie, entity name), from
    // the old-style map
    for (auto i = old_watchers.begin(); i != old_watchers.end(); ++i) {
      watchers.insert(
	make_pair(
	  make_pair(i->second.cookie, i->first), i->second));
    }
  }
  if (struct_v >= 13) {
    // full-width flags; replaces the one-byte value read above
    __u32 _flags;
    decode(_flags, bl);
    flags = (flag_t)_flags;
  }
  if (struct_v >= 14) {
    decode(local_mtime, bl);
  } else {
    local_mtime = utime_t();
  }
  if (struct_v >= 15) {
    decode(data_digest, bl);
    decode(omap_digest, bl);
  } else {
    // no digests in older encodings: reset both values and clear their flags
    data_digest = omap_digest = -1;
    clear_flag(FLAG_DATA_DIGEST);
    clear_flag(FLAG_OMAP_DIGEST);
  }
  if (struct_v >= 16) {
    decode(expected_object_size, bl);
    decode(expected_write_size, bl);
    decode(alloc_hint_flags, bl);
  } else {
    expected_object_size = 0;
    expected_write_size = 0;
    alloc_hint_flags = 0;
  }
  if (struct_v >= 17) {
    // has_manifest() is evaluated after flags has been decoded above
    if (has_manifest()) {
      decode(manifest, bl);
    }
  }
  DECODE_FINISH(bl);
}
6037 	
6038 	void object_info_t::dump(Formatter *f) const
6039 	{
6040 	  f->open_object_section("oid");
6041 	  soid.dump(f);
6042 	  f->close_section();
6043 	  f->dump_stream("version") << version;
6044 	  f->dump_stream("prior_version") << prior_version;
6045 	  f->dump_stream("last_reqid") << last_reqid;
6046 	  f->dump_unsigned("user_version", user_version);
6047 	  f->dump_unsigned("size", size);
6048 	  f->dump_stream("mtime") << mtime;
6049 	  f->dump_stream("local_mtime") << local_mtime;
6050 	  f->dump_unsigned("lost", (int)is_lost());
6051 	  vector<string> sv = get_flag_vector(flags);
6052 	  f->open_array_section("flags");
6053 	  for (auto str: sv)
6054 	    f->dump_string("flags", str);
6055 	  f->close_section();
6056 	  f->dump_unsigned("truncate_seq", truncate_seq);
6057 	  f->dump_unsigned("truncate_size", truncate_size);
6058 	  f->dump_format("data_digest", "0x%08x", data_digest);
6059 	  f->dump_format("omap_digest", "0x%08x", omap_digest);
6060 	  f->dump_unsigned("expected_object_size", expected_object_size);
6061 	  f->dump_unsigned("expected_write_size", expected_write_size);
6062 	  f->dump_unsigned("alloc_hint_flags", alloc_hint_flags);
6063 	  f->dump_object("manifest", manifest);
6064 	  f->open_object_section("watchers");
6065 	  for (auto p = watchers.cbegin(); p != watchers.cend(); ++p) {
6066 	    stringstream ss;
6067 	    ss << p->first.second;
6068 	    f->open_object_section(ss.str().c_str());
6069 	    p->second.dump(f);
6070 	    f->close_section();
6071 	  }
6072 	  f->close_section();
6073 	}
6074 	
6075 	void object_info_t::generate_test_instances(list<object_info_t*>& o)
6076 	{
6077 	  o.push_back(new object_info_t());
6078 	  
6079 	  // fixme
6080 	}
6081 	
6082 	
6083 	ostream& operator<<(ostream& out, const object_info_t& oi)
6084 	{
6085 	  out << oi.soid << "(" << oi.version
6086 	      << " " << oi.last_reqid;
6087 	  if (oi.flags)
6088 	    out << " " << oi.get_flag_string();
6089 	  out << " s " << oi.size;
6090 	  out << " uv " << oi.user_version;
6091 	  if (oi.is_data_digest())
6092 	    out << " dd " << std::hex << oi.data_digest << std::dec;
6093 	  if (oi.is_omap_digest())
6094 	    out << " od " << std::hex << oi.omap_digest << std::dec;
6095 	  out << " alloc_hint [" << oi.expected_object_size
6096 	      << " " << oi.expected_write_size
6097 	      << " " << oi.alloc_hint_flags << "]";
6098 	  if (oi.has_manifest())
6099 	    out << " " << oi.manifest;
6100 	  out << ")";
6101 	  return out;
6102 	}
6103 	
6104 	// -- ObjectRecovery --
// Append the encoding of this ObjectRecoveryProgress to bl.
// Note the field order: omap_recovered_to precedes omap_complete.
// The error member is not part of the encoding.
void ObjectRecoveryProgress::encode(ceph::buffer::list &bl) const
{
  ENCODE_START(1, 1, bl);
  encode(first, bl);
  encode(data_complete, bl);
  encode(data_recovered_to, bl);
  encode(omap_recovered_to, bl);
  encode(omap_complete, bl);
  ENCODE_FINISH(bl);
}
6115 	
// Decode an ObjectRecoveryProgress from bl, in the same field order that
// encode() writes.  The error member is not read from the buffer.
void ObjectRecoveryProgress::decode(ceph::buffer::list::const_iterator &bl)
{
  DECODE_START(1, bl);
  decode(first, bl);
  decode(data_complete, bl);
  decode(data_recovered_to, bl);
  decode(omap_recovered_to, bl);
  decode(omap_complete, bl);
  DECODE_FINISH(bl);
}
6126 	
6127 	ostream &operator<<(ostream &out, const ObjectRecoveryProgress &prog)
6128 	{
6129 	  return prog.print(out);
6130 	}
6131 	
6132 	void ObjectRecoveryProgress::generate_test_instances(
6133 	  list<ObjectRecoveryProgress*>& o)
6134 	{
6135 	  o.push_back(new ObjectRecoveryProgress);
6136 	  o.back()->first = false;
6137 	  o.back()->data_complete = true;
6138 	  o.back()->omap_complete = true;
6139 	  o.back()->data_recovered_to = 100;
6140 	
6141 	  o.push_back(new ObjectRecoveryProgress);
6142 	  o.back()->first = true;
6143 	  o.back()->data_complete = false;
6144 	  o.back()->omap_complete = false;
6145 	  o.back()->data_recovered_to = 0;
6146 	}
6147 	
6148 	ostream &ObjectRecoveryProgress::print(ostream &out) const
6149 	{
6150 	  return out << "ObjectRecoveryProgress("
6151 		     << ( first ? "" : "!" ) << "first, "
6152 		     << "data_recovered_to:" << data_recovered_to
6153 		     << ", data_complete:" << ( data_complete ? "true" : "false" )
6154 		     << ", omap_recovered_to:" << omap_recovered_to
6155 		     << ", omap_complete:" << ( omap_complete ? "true" : "false" )
6156 		     << ", error:" << ( error ? "true" : "false" )
6157 		     << ")";
6158 	}
6159 	
// Write the progress fields to the formatter f.  The boolean members are
// emitted as integers under keys ending in '?'; the error member is not
// emitted.
void ObjectRecoveryProgress::dump(Formatter *f) const
{
  f->dump_int("first?", first);
  f->dump_int("data_complete?", data_complete);
  f->dump_unsigned("data_recovered_to", data_recovered_to);
  f->dump_int("omap_complete?", omap_complete);
  f->dump_string("omap_recovered_to", omap_recovered_to);
}
6168 	
// Append the encoding of this ObjectRecoveryInfo to bl.
// features is only passed through to the encoding of oi.
void ObjectRecoveryInfo::encode(ceph::buffer::list &bl, uint64_t features) const
{
  ENCODE_START(3, 1, bl);
  encode(soid, bl);
  encode(version, bl);
  encode(size, bl);
  encode(oi, bl, features);
  encode(ss, bl);
  encode(copy_subset, bl);
  encode(clone_subset, bl);
  encode(object_exist, bl);
  ENCODE_FINISH(bl);
}
6182 	
// Decode an ObjectRecoveryInfo from bl.
//
// pool is only used for encodings with struct_v < 2: there, soid and every
// key of clone_subset that is not "max" and has pool == -1 gets its pool
// set to this value.
void ObjectRecoveryInfo::decode(ceph::buffer::list::const_iterator &bl,
				int64_t pool)
{
  DECODE_START(3, bl);
  decode(soid, bl);
  decode(version, bl);
  decode(size, bl);
  decode(oi, bl);
  decode(ss, bl);
  decode(copy_subset, bl);
  decode(clone_subset, bl);
  // object_exist was added in v3; older encodings default it to false
  if (struct_v > 2)
    decode(object_exist, bl);
  else
    object_exist = false;
  DECODE_FINISH(bl);
  if (struct_v < 2) {
    if (!soid.is_max() && soid.pool == -1)
      soid.pool = pool;
    // rebuild clone_subset under the corrected keys, moving the interval
    // sets across with swap() rather than copying them
    map<hobject_t, interval_set<uint64_t>> tmp;
    tmp.swap(clone_subset);
    for (auto i = tmp.begin(); i != tmp.end(); ++i) {
      hobject_t first(i->first);
      if (!first.is_max() && first.pool == -1)
	first.pool = pool;
      clone_subset[first].swap(i->second);
    }
  }
}
6212 	
6213 	void ObjectRecoveryInfo::generate_test_instances(
6214 	  list<ObjectRecoveryInfo*>& o)
6215 	{
6216 	  o.push_back(new ObjectRecoveryInfo);
6217 	  o.back()->soid = hobject_t(sobject_t("key", CEPH_NOSNAP));
6218 	  o.back()->version = eversion_t(0,0);
6219 	  o.back()->size = 100;
6220 	  o.back()->object_exist = false;
6221 	}
6222 	
6223 	
6224 	void ObjectRecoveryInfo::dump(Formatter *f) const
6225 	{
6226 	  f->dump_stream("object") << soid;
6227 	  f->dump_stream("at_version") << version;
6228 	  f->dump_stream("size") << size;
6229 	  {
6230 	    f->open_object_section("object_info");
6231 	    oi.dump(f);
6232 	    f->close_section();
6233 	  }
6234 	  {
6235 	    f->open_object_section("snapset");
6236 	    ss.dump(f);
6237 	    f->close_section();
6238 	  }
6239 	  f->dump_stream("copy_subset") << copy_subset;
6240 	  f->dump_stream("clone_subset") << clone_subset;
6241 	  f->dump_stream("object_exist") << object_exist;
6242 	}
6243 	
6244 	ostream& operator<<(ostream& out, const ObjectRecoveryInfo &inf)
6245 	{
6246 	  return inf.print(out);
6247 	}
6248 	
6249 	ostream &ObjectRecoveryInfo::print(ostream &out) const
6250 	{
6251 	  return out << "ObjectRecoveryInfo("
6252 		     << soid << "@" << version
6253 		     << ", size: " << size
6254 		     << ", copy_subset: " << copy_subset
6255 		     << ", clone_subset: " << clone_subset
6256 		     << ", snapset: " << ss
6257 		     << ", object_exist: " << object_exist
6258 		     << ")";
6259 	}
6260 	
6261 	// -- PushReplyOp --
6262 	void PushReplyOp::generate_test_instances(list<PushReplyOp*> &o)
6263 	{
6264 	  o.push_back(new PushReplyOp);
6265 	  o.push_back(new PushReplyOp);
6266 	  o.back()->soid = hobject_t(sobject_t("asdf", 2));
6267 	  o.push_back(new PushReplyOp);
6268 	  o.back()->soid = hobject_t(sobject_t("asdf", CEPH_NOSNAP));
6269 	}
6270 	
// Append the encoding of this PushReplyOp to bl; soid is the only field.
void PushReplyOp::encode(ceph::buffer::list &bl) const
{
  ENCODE_START(1, 1, bl);
  encode(soid, bl);
  ENCODE_FINISH(bl);
}
6277 	
// Decode a PushReplyOp from bl; soid is the only field.
void PushReplyOp::decode(ceph::buffer::list::const_iterator &bl)
{
  DECODE_START(1, bl);
  decode(soid, bl);
  DECODE_FINISH(bl);
}
6284 	
// Write soid to the formatter f under the key "soid".
void PushReplyOp::dump(Formatter *f) const
{
  f->dump_stream("soid") << soid;
}
6289 	
6290 	ostream &PushReplyOp::print(ostream &out) const
6291 	{
6292 	  return out
6293 	    << "PushReplyOp(" << soid
6294 	    << ")";
6295 	}
6296 	
6297 	ostream& operator<<(ostream& out, const PushReplyOp &op)
6298 	{
6299 	  return op.print(out);
6300 	}
6301 	
6302 	uint64_t PushReplyOp::cost(CephContext *cct) const
6303 	{
6304 	
6305 	  return cct->_conf->osd_push_per_object_cost +
6306 	    cct->_conf->osd_recovery_max_chunk;
6307 	}
6308 	
6309 	// -- PullOp --
6310 	void PullOp::generate_test_instances(list<PullOp*> &o)
6311 	{
6312 	  o.push_back(new PullOp);
6313 	  o.push_back(new PullOp);
6314 	  o.back()->soid = hobject_t(sobject_t("asdf", 2));
6315 	  o.back()->recovery_info.version = eversion_t(3, 10);
6316 	  o.push_back(new PullOp);
6317 	  o.back()->soid = hobject_t(sobject_t("asdf", CEPH_NOSNAP));
6318 	  o.back()->recovery_info.version = eversion_t(0, 0);
6319 	}
6320 	
// Append the encoding of this PullOp to bl.
// features is only passed through to the encoding of recovery_info.
void PullOp::encode(ceph::buffer::list &bl, uint64_t features) const
{
  ENCODE_START(1, 1, bl);
  encode(soid, bl);
  encode(recovery_info, bl, features);
  encode(recovery_progress, bl);
  ENCODE_FINISH(bl);
}
6329 	
// Decode a PullOp from bl: soid, recovery_info, recovery_progress.
void PullOp::decode(ceph::buffer::list::const_iterator &bl)
{
  DECODE_START(1, bl);
  decode(soid, bl);
  decode(recovery_info, bl);
  decode(recovery_progress, bl);
  DECODE_FINISH(bl);
}
6338 	
6339 	void PullOp::dump(Formatter *f) const
6340 	{
6341 	  f->dump_stream("soid") << soid;
6342 	  {
6343 	    f->open_object_section("recovery_info");
6344 	    recovery_info.dump(f);
6345 	    f->close_section();
6346 	  }
6347 	  {
6348 	    f->open_object_section("recovery_progress");
6349 	    recovery_progress.dump(f);
6350 	    f->close_section();
6351 	  }
6352 	}
6353 	
6354 	ostream &PullOp::print(ostream &out) const
6355 	{
6356 	  return out
6357 	    << "PullOp(" << soid
6358 	    << ", recovery_info: " << recovery_info
6359 	    << ", recovery_progress: " << recovery_progress
6360 	    << ")";
6361 	}
6362 	
6363 	ostream& operator<<(ostream& out, const PullOp &op)
6364 	{
6365 	  return op.print(out);
6366 	}
6367 	
6368 	uint64_t PullOp::cost(CephContext *cct) const
6369 	{
6370 	  return cct->_conf->osd_push_per_object_cost +
6371 	    cct->_conf->osd_recovery_max_chunk;
6372 	}
6373 	
6374 	// -- PushOp --
6375 	void PushOp::generate_test_instances(list<PushOp*> &o)
6376 	{
6377 	  o.push_back(new PushOp);
6378 	  o.push_back(new PushOp);
6379 	  o.back()->soid = hobject_t(sobject_t("asdf", 2));
6380 	  o.back()->version = eversion_t(3, 10);
6381 	  o.push_back(new PushOp);
6382 	  o.back()->soid = hobject_t(sobject_t("asdf", CEPH_NOSNAP));
6383 	  o.back()->version = eversion_t(0, 0);
6384 	}
6385 	
// Append the encoding of this PushOp to bl.
// Note the field order: after_progress is written before before_progress.
// features is only passed through to the encoding of recovery_info.
void PushOp::encode(ceph::buffer::list &bl, uint64_t features) const
{
  ENCODE_START(1, 1, bl);
  encode(soid, bl);
  encode(version, bl);
  encode(data, bl);
  encode(data_included, bl);
  encode(omap_header, bl);
  encode(omap_entries, bl);
  encode(attrset, bl);
  encode(recovery_info, bl, features);
  encode(after_progress, bl);
  encode(before_progress, bl);
  ENCODE_FINISH(bl);
}
6401 	
// Decode a PushOp from bl, in the same field order that encode() writes
// (after_progress comes before before_progress).
void PushOp::decode(ceph::buffer::list::const_iterator &bl)
{
  DECODE_START(1, bl);
  decode(soid, bl);
  decode(version, bl);
  decode(data, bl);
  decode(data_included, bl);
  decode(omap_header, bl);
  decode(omap_entries, bl);
  decode(attrset, bl);
  decode(recovery_info, bl);
  decode(after_progress, bl);
  decode(before_progress, bl);
  DECODE_FINISH(bl);
}
6417 	
6418 	void PushOp::dump(Formatter *f) const
6419 	{
6420 	  f->dump_stream("soid") << soid;
6421 	  f->dump_stream("version") << version;
6422 	  f->dump_int("data_len", data.length());
6423 	  f->dump_stream("data_included") << data_included;
6424 	  f->dump_int("omap_header_len", omap_header.length());
6425 	  f->dump_int("omap_entries_len", omap_entries.size());
6426 	  f->dump_int("attrset_len", attrset.size());
6427 	  {
6428 	    f->open_object_section("recovery_info");
6429 	    recovery_info.dump(f);
6430 	    f->close_section();
6431 	  }
6432 	  {
6433 	    f->open_object_section("after_progress");
6434 	    after_progress.dump(f);
6435 	    f->close_section();
6436 	  }
6437 	  {
6438 	    f->open_object_section("before_progress");
6439 	    before_progress.dump(f);
6440 	    f->close_section();
6441 	  }
6442 	}
6443 	
6444 	ostream &PushOp::print(ostream &out) const
6445 	{
6446 	  return out
6447 	    << "PushOp(" << soid
6448 	    << ", version: " << version
6449 	    << ", data_included: " << data_included
6450 	    << ", data_size: " << data.length()
6451 	    << ", omap_header_size: " << omap_header.length()
6452 	    << ", omap_entries_size: " << omap_entries.size()
6453 	    << ", attrset_size: " << attrset.size()
6454 	    << ", recovery_info: " << recovery_info
6455 	    << ", after_progress: " << after_progress
6456 	    << ", before_progress: " << before_progress
6457 	    << ")";
6458 	}
6459 	
6460 	ostream& operator<<(ostream& out, const PushOp &op)
6461 	{
6462 	  return op.print(out);
6463 	}
6464 	
6465 	uint64_t PushOp::cost(CephContext *cct) const
6466 	{
6467 	  uint64_t cost = data_included.size();
6468 	  for (auto i = omap_entries.cbegin(); i != omap_entries.cend(); ++i) {
6469 	    cost += i->second.length();
6470 	  }
6471 	  cost += cct->_conf->osd_push_per_object_cost;
6472 	  return cost;
6473 	}
6474 	
6475 	// -- ScrubMap --
6476 	
6477 	void ScrubMap::merge_incr(const ScrubMap &l)
6478 	{
6479 	  ceph_assert(valid_through == l.incr_since);
6480 	  valid_through = l.valid_through;
6481 	
6482 	  for (auto p = l.objects.cbegin(); p != l.objects.cend(); ++p){
6483 	    if (p->second.negative) {
6484 	      auto q = objects.find(p->first);
6485 	      if (q != objects.end()) {
6486 		objects.erase(q);
6487 	      }
6488 	    } else {
6489 	      objects[p->first] = p->second;
6490 	    }
6491 	  }
6492 	}          
6493 	
// Append the encoding of this ScrubMap to bl.
// Two retired slots are still written as empty placeholders so that the
// layout matches what decode() reads: the former attrs map and the old log
// buffer.
void ScrubMap::encode(ceph::buffer::list& bl) const
{
  ENCODE_START(3, 2, bl);
  encode(objects, bl);
  encode((__u32)0, bl); // used to be attrs; now deprecated
  ceph::buffer::list old_logbl;  // not used
  encode(old_logbl, bl);
  encode(valid_through, bl);
  encode(incr_since, bl);
  ENCODE_FINISH(bl);
}
6505 	
// Decode a ScrubMap from bl.
//
// pool is only used for encodings with struct_v < 3: there, every key of
// objects that is not "max" and has pool == -1 gets its pool set to this
// value.
void ScrubMap::decode(ceph::buffer::list::const_iterator& bl, int64_t pool)
{
  DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
  decode(objects, bl);
  {
    // read and dropped
    map<string,string> attrs;  // deprecated
    decode(attrs, bl);
  }
  ceph::buffer::list old_logbl;   // not used
  decode(old_logbl, bl);
  decode(valid_through, bl);
  decode(incr_since, bl);
  DECODE_FINISH(bl);

  // handle hobject_t upgrade
  if (struct_v < 3) {
    // rebuild objects under the corrected keys
    map<hobject_t, object> tmp;
    tmp.swap(objects);
    for (auto i = tmp.begin(); i != tmp.end(); ++i) {
      hobject_t first(i->first);
      if (!first.is_max() && first.pool == -1)
	first.pool = pool;
      objects[first] = i->second;
    }
  }
}
6532 	
6533 	void ScrubMap::dump(Formatter *f) const
6534 	{
6535 	  f->dump_stream("valid_through") << valid_through;
6536 	  f->dump_stream("incremental_since") << incr_since;
6537 	  f->open_array_section("objects");
6538 	  for (auto p = objects.cbegin(); p != objects.cend(); ++p) {
6539 	    f->open_object_section("object");
6540 	    f->dump_string("name", p->first.oid.name);
6541 	    f->dump_unsigned("hash", p->first.get_hash());
6542 	    f->dump_string("key", p->first.get_key());
6543 	    f->dump_int("snapid", p->first.snap);
6544 	    p->second.dump(f);
6545 	    f->close_section();
6546 	  }
6547 	  f->close_section();
6548 	}
6549 	
6550 	void ScrubMap::generate_test_instances(list<ScrubMap*>& o)
6551 	{
6552 	  o.push_back(new ScrubMap);
6553 	  o.push_back(new ScrubMap);
6554 	  o.back()->valid_through = eversion_t(1, 2);
6555 	  o.back()->incr_since = eversion_t(3, 4);
6556 	  list<object*> obj;
6557 	  object::generate_test_instances(obj);
6558 	  o.back()->objects[hobject_t(object_t("foo"), "fookey", 123, 456, 0, "")] = *obj.back();
6559 	  obj.pop_back();
6560 	  o.back()->objects[hobject_t(object_t("bar"), string(), 123, 456, 0, "")] = *obj.back();
6561 	}
6562 	
6563 	// -- ScrubMap::object --
6564 	
// Serialize one scrub-map object entry into bl.
//
// Field order is part of the on-wire format and must stay in step with
// ScrubMap::object::decode().
void ScrubMap::object::encode(ceph::buffer::list& bl) const
{
  // Single combined error flag, written ahead of the individual flags.
  // decode() uses it to set read_error when none of the individual flags
  // ended up set (presumably for peers that only know the combined flag).
  bool compat_read_error = read_error || ec_hash_mismatch || ec_size_mismatch;
  ENCODE_START(10, 7, bl);
  encode(size, bl);
  encode(negative, bl);
  encode(attrs, bl);
  encode(digest, bl);
  encode(digest_present, bl);
  // Two zero placeholders keep the slots of fields that no longer exist;
  // decode() reads and discards both.
  encode((uint32_t)0, bl);  // obsolete nlinks
  encode((uint32_t)0, bl);  // snapcolls
  encode(omap_digest, bl);
  encode(omap_digest_present, bl);
  encode(compat_read_error, bl);
  encode(stat_error, bl);
  // The next three are read by decode() only when struct_v >= 8.
  encode(read_error, bl);
  encode(ec_hash_mismatch, bl);
  encode(ec_size_mismatch, bl);
  // Read by decode() only when struct_v >= 9.
  encode(large_omap_object_found, bl);
  encode(large_omap_object_key_count, bl);
  encode(large_omap_object_value_size, bl);
  // Read by decode() only when struct_v >= 10.
  encode(object_omap_bytes, bl);
  encode(object_omap_keys, bl);
  ENCODE_FINISH(bl);
}
6590 	
// Deserialize one scrub-map object entry from bl.
//
// Boolean members are decoded into the scratch variable `tmp` and then
// assigned, rather than being decoded in place.  Fields introduced in later
// struct versions are read only when struct_v is high enough; otherwise the
// corresponding members are left untouched.
void ScrubMap::object::decode(ceph::buffer::list::const_iterator& bl)
{
  DECODE_START(10, bl);
  decode(size, bl);
  bool tmp, compat_read_error = false;
  decode(tmp, bl);
  negative = tmp;
  decode(attrs, bl);
  decode(digest, bl);
  decode(tmp, bl);
  digest_present = tmp;
  {
    // Slots of two removed fields: consumed from the stream and discarded.
    uint32_t nlinks;
    decode(nlinks, bl);
    set<snapid_t> snapcolls;
    decode(snapcolls, bl);
  }
  decode(omap_digest, bl);
  decode(tmp, bl);
  omap_digest_present = tmp;
  decode(compat_read_error, bl);
  decode(tmp, bl);
  stat_error = tmp;
  // Individual error flags exist only from struct_v 8 on.
  if (struct_v >= 8) {
    decode(tmp, bl);
    read_error = tmp;
    decode(tmp, bl);
    ec_hash_mismatch = tmp;
    decode(tmp, bl);
    ec_size_mismatch = tmp;
  }
  // If older encoder found a read_error, set read_error
  // (i.e. the combined flag was set but none of the individual ones is).
  if (compat_read_error && !read_error && !ec_hash_mismatch && !ec_size_mismatch)
    read_error = true;
  // Large-omap reporting fields exist only from struct_v 9 on.
  if (struct_v >= 9) {
    decode(tmp, bl);
    large_omap_object_found = tmp;
    decode(large_omap_object_key_count, bl);
    decode(large_omap_object_value_size, bl);
  }
  // Per-object omap byte/key counters exist only from struct_v 10 on.
  if (struct_v >= 10) {
    decode(object_omap_bytes, bl);
    decode(object_omap_keys, bl);
  }
  DECODE_FINISH(bl);
}
6637 	
6638 	void ScrubMap::object::dump(Formatter *f) const
6639 	{
6640 	  f->dump_int("size", size);
6641 	  f->dump_int("negative", negative);
6642 	  f->open_array_section("attrs");
6643 	  for (auto p = attrs.cbegin(); p != attrs.cend(); ++p) {
6644 	    f->open_object_section("attr");
6645 	    f->dump_string("name", p->first);
6646 	    f->dump_int("length", p->second.length());
6647 	    f->close_section();
6648 	  }
6649 	  f->close_section();
6650 	}
6651 	
6652 	void ScrubMap::object::generate_test_instances(list<object*>& o)
6653 	{
6654 	  o.push_back(new object);
6655 	  o.push_back(new object);
6656 	  o.back()->negative = true;
6657 	  o.push_back(new object);
6658 	  o.back()->size = 123;
6659 	  o.back()->attrs["foo"] = ceph::buffer::copy("foo", 3);
6660 	  o.back()->attrs["bar"] = ceph::buffer::copy("barval", 6);
6661 	}
6662 	
6663 	// -- OSDOp --
6664 	
6665 	ostream& operator<<(ostream& out, const OSDOp& op)
6666 	{
6667 	  out << ceph_osd_op_name(op.op.op);
6668 	  if (ceph_osd_op_type_data(op.op.op)) {
6669 	    // data extent
6670 	    switch (op.op.op) {
6671 	    case CEPH_OSD_OP_ASSERT_VER:
6672 	      out << " v" << op.op.assert_ver.ver;
6673 	      break;
6674 	    case CEPH_OSD_OP_TRUNCATE:
6675 	      out << " " << op.op.extent.offset;
6676 	      break;
6677 	    case CEPH_OSD_OP_MASKTRUNC:
6678 	    case CEPH_OSD_OP_TRIMTRUNC:
6679 	      out << " " << op.op.extent.truncate_seq << "@"
6680 		  << (int64_t)op.op.extent.truncate_size;
6681 	      break;
6682 	    case CEPH_OSD_OP_ROLLBACK:
6683 	      out << " " << snapid_t(op.op.snap.snapid);
6684 	      break;
6685 	    case CEPH_OSD_OP_WATCH:
6686 	      out << " " << ceph_osd_watch_op_name(op.op.watch.op)
6687 		  << " cookie " << op.op.watch.cookie;
6688 	      if (op.op.watch.gen)
6689 		out << " gen " << op.op.watch.gen;
6690 	      break;
6691 	    case CEPH_OSD_OP_NOTIFY:
6692 	      out << " cookie " << op.op.notify.cookie;
6693 	      break;
6694 	    case CEPH_OSD_OP_COPY_GET:
6695 	      out << " max " << op.op.copy_get.max;
6696 	      break;
6697 	    case CEPH_OSD_OP_COPY_FROM:
6698 	      out << " ver " << op.op.copy_from.src_version;
6699 	      break;
6700 	    case CEPH_OSD_OP_SETALLOCHINT:
6701 	      out << " object_size " << op.op.alloc_hint.expected_object_size
6702 	          << " write_size " << op.op.alloc_hint.expected_write_size;
6703 	      break;
6704 	    case CEPH_OSD_OP_READ:
6705 	    case CEPH_OSD_OP_SPARSE_READ:
6706 	    case CEPH_OSD_OP_SYNC_READ:
6707 	    case CEPH_OSD_OP_WRITE:
6708 	    case CEPH_OSD_OP_WRITEFULL:
6709 	    case CEPH_OSD_OP_ZERO:
6710 	    case CEPH_OSD_OP_APPEND:
6711 	    case CEPH_OSD_OP_MAPEXT:
6712 	    case CEPH_OSD_OP_CMPEXT:
6713 	      out << " " << op.op.extent.offset << "~" << op.op.extent.length;
6714 	      if (op.op.extent.truncate_seq)
6715 		out << " [" << op.op.extent.truncate_seq << "@"
6716 		    << (int64_t)op.op.extent.truncate_size << "]";
6717 	      if (op.op.flags)
6718 		out << " [" << ceph_osd_op_flag_string(op.op.flags) << "]";
6719 	    default:
6720 	      // don't show any arg info
6721 	      break;
6722 	    }
6723 	  } else if (ceph_osd_op_type_attr(op.op.op)) {
6724 	    // xattr name
6725 	    if (op.op.xattr.name_len && op.indata.length()) {
6726 	      out << " ";
6727 	      op.indata.write(0, op.op.xattr.name_len, out);
6728 	    }
6729 	    if (op.op.xattr.value_len)
6730 	      out << " (" << op.op.xattr.value_len << ")";
6731 	    if (op.op.op == CEPH_OSD_OP_CMPXATTR)
6732 	      out << " op " << (int)op.op.xattr.cmp_op
6733 		  << " mode " << (int)op.op.xattr.cmp_mode;
6734 	  } else if (ceph_osd_op_type_exec(op.op.op)) {
6735 	    // class.method
6736 	    if (op.op.cls.class_len && op.indata.length()) {
6737 	      out << " ";
6738 	      op.indata.write(0, op.op.cls.class_len, out);
6739 	      out << ".";
6740 	      op.indata.write(op.op.cls.class_len, op.op.cls.method_len, out);
6741 	    }
6742 	  } else if (ceph_osd_op_type_pg(op.op.op)) {
6743 	    switch (op.op.op) {
6744 	    case CEPH_OSD_OP_PGLS:
6745 	    case CEPH_OSD_OP_PGLS_FILTER:
6746 	    case CEPH_OSD_OP_PGNLS:
6747 	    case CEPH_OSD_OP_PGNLS_FILTER:
6748 	      out << " start_epoch " << op.op.pgls.start_epoch;
6749 	      break;
6750 	    case CEPH_OSD_OP_PG_HITSET_LS:
6751 	      break;
6752 	    case CEPH_OSD_OP_PG_HITSET_GET:
6753 	      out << " " << utime_t(op.op.hit_set_get.stamp);
6754 	      break;
6755 	    case CEPH_OSD_OP_SCRUBLS:
6756 	      break;
6757 	    }
6758 	  }
6759 	  if (op.indata.length()) {
6760 	    out << " in=" << op.indata.length() << "b";
6761 	  }
6762 	  if (op.outdata.length()) {
6763 	    out << " out=" << op.outdata.length() << "b";
6764 	  }
6765 	  return out;
6766 	}
6767 	
6768 	
6769 	void OSDOp::split_osd_op_vector_in_data(vector<OSDOp>& ops, ceph::buffer::list& in)
6770 	{
6771 	  ceph::buffer::list::iterator datap = in.begin();
6772 	  for (unsigned i = 0; i < ops.size(); i++) {
6773 	    if (ops[i].op.payload_len) {
6774 	      datap.copy(ops[i].op.payload_len, ops[i].indata);
6775 	    }
6776 	  }
6777 	}
6778 	
6779 	void OSDOp::merge_osd_op_vector_in_data(vector<OSDOp>& ops, ceph::buffer::list& out)
6780 	{
6781 	  for (unsigned i = 0; i < ops.size(); i++) {
6782 	    if (ops[i].indata.length()) {
6783 	      ops[i].op.payload_len = ops[i].indata.length();
6784 	      out.append(ops[i].indata);
6785 	    }
6786 	  }
6787 	}
6788 	
6789 	void OSDOp::split_osd_op_vector_out_data(vector<OSDOp>& ops, ceph::buffer::list& in)
6790 	{
6791 	  auto datap = in.begin();
6792 	  for (unsigned i = 0; i < ops.size(); i++) {
6793 	    if (ops[i].op.payload_len) {
6794 	      datap.copy(ops[i].op.payload_len, ops[i].outdata);
6795 	    }
6796 	  }
6797 	}
6798 	
6799 	void OSDOp::merge_osd_op_vector_out_data(vector<OSDOp>& ops, ceph::buffer::list& out)
6800 	{
6801 	  for (unsigned i = 0; i < ops.size(); i++) {
6802 	    ops[i].op.payload_len = ops[i].outdata.length();
6803 	    if (ops[i].outdata.length()) {
6804 	      out.append(ops[i].outdata);
6805 	    }
6806 	  }
6807 	}
6808 	
6809 	void OSDOp::clear_data(vector<OSDOp>& ops)
6810 	{
6811 	  for (unsigned i = 0; i < ops.size(); i++) {
6812 	    OSDOp& op = ops[i];
6813 	    op.outdata.clear();
6814 	    if (ceph_osd_op_type_attr(op.op.op) &&
6815 	        op.op.xattr.name_len &&
6816 		op.indata.length() >= op.op.xattr.name_len) {
6817 	      ceph::buffer::ptr bp(op.op.xattr.name_len);
6818 	      ceph::buffer::list bl;
6819 	      bl.append(bp);
6820 	      bl.copy_in(0, op.op.xattr.name_len, op.indata);
6821 	      op.indata.claim(bl);
6822 	    } else if (ceph_osd_op_type_exec(op.op.op) &&
6823 	               op.op.cls.class_len &&
6824 		       op.indata.length() >
6825 		         (op.op.cls.class_len + op.op.cls.method_len)) {
6826 	      __u8 len = op.op.cls.class_len + op.op.cls.method_len;
6827 	      ceph::buffer::ptr bp(len);
6828 	      ceph::buffer::list bl;
6829 	      bl.append(bp);
6830 	      bl.copy_in(0, len, op.indata);
6831 	      op.indata.claim(bl);
6832 	    } else {
6833 	      op.indata.clear();
6834 	    }
6835 	  }
6836 	}
6837 	
// Fill *km with the key/value pairs needed to persist a PG's info.
//
// cct:               not referenced in this function body.
// km:                output map; encoded values are appended under epoch_key,
//                    fastinfo_key, info_key and biginfo_key as applicable.
// epoch:             encoded under epoch_key when dirty_epoch is set.
// info:              the current info to persist.  Its purged_snaps member is
//                    temporarily swapped out while info is encoded and
//                    swapped back afterwards.
// last_written_info: caller's copy of what was last written; updated here.
// past_intervals:    encoded into the biginfo_key value when dirty_big_info.
// dirty_big_info:    forces the full write and adds the biginfo_key value.
// dirty_epoch:       see epoch.
// try_fast_info:     allows the fastinfo-only path described below.
// logger:            optional perf counters; bumped when non-null.
// dpp:               optional log prefix provider; used only to log a failed
//                    fastinfo attempt.
//
// Returns 0 on every path.
int prepare_info_keymap(
  CephContext* cct,
  map<string,bufferlist> *km,
  epoch_t epoch,
  pg_info_t &info,
  pg_info_t &last_written_info,
  PastIntervals &past_intervals,
  bool dirty_big_info,
  bool dirty_epoch,
  bool try_fast_info,
  PerfCounters *logger,
  DoutPrefixProvider *dpp)
{
  if (dirty_epoch) {
    encode(epoch, (*km)[string(epoch_key)]);
  }

  if (logger)
    logger->inc(l_osd_pg_info);

  // try to do info efficiently?
  // Fast path: build a pg_fast_info_t from info and apply it to
  // last_written_info.  If that reproduces info exactly, only the fastinfo
  // value is written and we are done.
  if (!dirty_big_info && try_fast_info &&
      info.last_update > last_written_info.last_update) {
    pg_fast_info_t fast;
    fast.populate_from(info);
    bool did = fast.try_apply_to(&last_written_info);
    ceph_assert(did);  // we verified last_update increased above
    if (info == last_written_info) {
      encode(fast, (*km)[string(fastinfo_key)]);
      if (logger)
	logger->inc(l_osd_pg_fastinfo);
      return 0;
    }
    // The fast info did not reproduce info: log both structures (level 30)
    // and fall through to the full write below.
    if (dpp) {
      ldpp_dout(dpp, 30) << __func__ << " fastinfo failed, info:\n";
      {
	JSONFormatter jf(true);
	jf.dump_object("info", info);
	jf.flush(*_dout);
      }
      {
	*_dout << "\nlast_written_info:\n";
	JSONFormatter jf(true);
	jf.dump_object("last_written_info", last_written_info);
	jf.flush(*_dout);
      }
      *_dout << dendl;
    }
  }

  // Full write: from here on last_written_info mirrors info (this also
  // overwrites whatever try_apply_to() changed above).
  last_written_info = info;

  // info.  store purged_snaps separately.
  // purged_snaps is swapped out so the info_key value is encoded with an
  // empty set, then swapped back so info is unchanged for the caller.
  interval_set<snapid_t> purged_snaps;
  purged_snaps.swap(info.purged_snaps);
  encode(info, (*km)[string(info_key)]);
  purged_snaps.swap(info.purged_snaps);

  if (dirty_big_info) {
    // potentially big stuff
    // biginfo value layout: past_intervals followed by info.purged_snaps.
    bufferlist& bigbl = (*km)[string(biginfo_key)];
    encode(past_intervals, bigbl);
    encode(info.purged_snaps, bigbl);
    //dout(20) << "write_info bigbl " << bigbl.length() << dendl;
    if (logger)
      logger->inc(l_osd_pg_biginfo);
  }

  return 0;
}
6908 	
6909 	void create_pg_collection(
6910 	  ceph::os::Transaction& t, spg_t pgid, int bits)
6911 	{
6912 	  coll_t coll(pgid);
6913 	  t.create_collection(coll, bits);
6914 	}
6915 	
6916 	void init_pg_ondisk(
6917 	  ceph::os::Transaction& t,
6918 	  spg_t pgid,
6919 	  const pg_pool_t *pool)
6920 	{
6921 	  coll_t coll(pgid);
6922 	  if (pool) {
6923 	    // Give a hint to the PG collection
6924 	    bufferlist hint;
6925 	    uint32_t pg_num = pool->get_pg_num();
6926 	    uint64_t expected_num_objects_pg = pool->expected_num_objects / pg_num;
6927 	    encode(pg_num, hint);
6928 	    encode(expected_num_objects_pg, hint);
6929 	    uint32_t hint_type = ceph::os::Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS;
6930 	    t.collection_hint(coll, hint_type, hint);
6931 	  }
6932 	
6933 	  ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
6934 	  t.touch(coll, pgmeta_oid);
6935 	  map<string,bufferlist> values;
6936 	  __u8 struct_v = pg_latest_struct_v;
6937 	  encode(struct_v, values[string(infover_key)]);
6938 	  t.omap_setkeys(coll, pgmeta_oid, values);
6939 	}
6940 	
6941 	PGLSFilter::PGLSFilter() : cct(nullptr)
6942 	{
6943 	}
6944 	
6945 	PGLSFilter::~PGLSFilter()
6946 	{
6947 	}
6948 	
6949 	int PGLSPlainFilter::init(ceph::bufferlist::const_iterator &params)
6950 	{
6951 	  try {
6952 	    decode(xattr, params);
6953 	    decode(val, params);
6954 	  } catch (buffer::error &e) {
6955 	    return -EINVAL;
6956 	  }
6957 	  return 0;
6958 	}
6959 	
6960 	bool PGLSPlainFilter::filter(const hobject_t& obj,
6961 	                             const ceph::bufferlist& xattr_data) const
6962 	{
6963 	  return xattr_data.contents_equal(val.c_str(), val.size());
6964 	}
6965