1    	// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2    	// vim: ts=8 sw=2 smarttab
3    	
4    	#include <boost/algorithm/string.hpp>
5    	
6    	#include "PGMap.h"
7    	
8    	#define dout_subsys ceph_subsys_mon
9    	#include "common/debug.h"
10   	#include "common/Clock.h"
11   	#include "common/Formatter.h"
12   	#include "global/global_context.h"
13   	#include "include/ceph_features.h"
14   	#include "include/stringify.h"
15   	
16   	#include "osd/osd_types.h"
17   	#include "osd/OSDMap.h"
18   	#include <boost/range/adaptor/reversed.hpp>
19   	
20   	#define dout_context g_ceph_context
21   	
22   	using std::list;
23   	using std::make_pair;
24   	using std::map;
25   	using std::pair;
26   	using std::ostream;
27   	using std::ostringstream;
28   	using std::set;
29   	using std::string;
30   	using std::stringstream;
31   	using std::vector;
32   	
33   	using ceph::bufferlist;
34   	
35   	MEMPOOL_DEFINE_OBJECT_FACTORY(PGMapDigest, pgmap_digest, pgmap);
36   	MEMPOOL_DEFINE_OBJECT_FACTORY(PGMap, pgmap, pgmap);
37   	MEMPOOL_DEFINE_OBJECT_FACTORY(PGMap::Incremental, pgmap_inc, pgmap);
38   	
39   	
40   	// ---------------------
41   	// PGMapDigest
42   	
43   	void PGMapDigest::encode(bufferlist& bl, uint64_t features) const
44   	{
45   	  // NOTE: see PGMap::encode_digest
46   	  uint8_t v = 4;
47   	  if (!HAVE_FEATURE(features, SERVER_MIMIC)) {
48   	    v = 1;
49   	  } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
50   	    v = 3;
51   	  }
52   	  ENCODE_START(v, 1, bl);
53   	  encode(num_pg, bl);
54   	  encode(num_pg_active, bl);
55   	  encode(num_pg_unknown, bl);
56   	  encode(num_osd, bl);
57   	  encode(pg_pool_sum, bl, features);
58   	  encode(pg_sum, bl, features);
59   	  encode(osd_sum, bl, features);
60   	  if (v >= 2) {
61   	    encode(num_pg_by_state, bl);
62   	  } else {
63   	    uint32_t n = num_pg_by_state.size();
64   	    encode(n, bl);
65   	    for (auto p : num_pg_by_state) {
66   	      encode((uint32_t)p.first, bl);
67   	      encode(p.second, bl);
68   	    }
69   	  }
70   	  encode(num_pg_by_osd, bl);
71   	  encode(num_pg_by_pool, bl);
72   	  encode(osd_last_seq, bl);
73   	  encode(per_pool_sum_delta, bl, features);
74   	  encode(per_pool_sum_deltas_stamps, bl);
75   	  encode(pg_sum_delta, bl, features);
76   	  encode(stamp_delta, bl);
77   	  encode(avail_space_by_rule, bl);
78   	  if (struct_v >= 3) {
79   	    encode(purged_snaps, bl);
80   	  }
81   	  if (struct_v >= 4) {
82   	    encode(osd_sum_by_class, bl, features);
83   	  }
84   	  ENCODE_FINISH(bl);
85   	}
86   	
87   	void PGMapDigest::decode(bufferlist::const_iterator& p)
88   	{
89   	  DECODE_START(4, p);
90   	  decode(num_pg, p);
91   	  decode(num_pg_active, p);
92   	  decode(num_pg_unknown, p);
93   	  decode(num_osd, p);
94   	  decode(pg_pool_sum, p);
95   	  decode(pg_sum, p);
96   	  decode(osd_sum, p);
97   	  if (struct_v >= 2) {
98   	    decode(num_pg_by_state, p);
99   	  } else {
100  	    map<int32_t, int32_t> nps;
101  	    decode(nps, p);
102  	    num_pg_by_state.clear();
103  	    for (auto i : nps) {
104  	      num_pg_by_state[i.first] = i.second;
105  	    }
106  	  }
107  	  decode(num_pg_by_osd, p);
108  	  decode(num_pg_by_pool, p);
109  	  decode(osd_last_seq, p);
110  	  decode(per_pool_sum_delta, p);
111  	  decode(per_pool_sum_deltas_stamps, p);
112  	  decode(pg_sum_delta, p);
113  	  decode(stamp_delta, p);
114  	  decode(avail_space_by_rule, p);
115  	  if (struct_v >= 3) {
116  	    decode(purged_snaps, p);
117  	  }
118  	  if (struct_v >= 4) {
119  	    decode(osd_sum_by_class, p);
120  	  }
121  	  DECODE_FINISH(p);
122  	}
123  	
124  	void PGMapDigest::dump(ceph::Formatter *f) const
125  	{
126  	  f->dump_unsigned("num_pg", num_pg);
127  	  f->dump_unsigned("num_pg_active", num_pg_active);
128  	  f->dump_unsigned("num_pg_unknown", num_pg_unknown);
129  	  f->dump_unsigned("num_osd", num_osd);
130  	  f->dump_object("pool_sum", pg_sum);
131  	  f->dump_object("osd_sum", osd_sum);
132  	
133  	  f->open_object_section("osd_sum_by_class");
134  	  for (auto& i : osd_sum_by_class) {
135  	    f->dump_object(i.first.c_str(), i.second);
136  	  }
137  	  f->close_section();
138  	
139  	  f->open_array_section("pool_stats");
140  	  for (auto& p : pg_pool_sum) {
141  	    f->open_object_section("pool_stat");
142  	    f->dump_int("poolid", p.first);
143  	    auto q = num_pg_by_pool.find(p.first);
144  	    if (q != num_pg_by_pool.end())
145  	      f->dump_unsigned("num_pg", q->second);
146  	    p.second.dump(f);
147  	    f->close_section();
148  	  }
149  	  f->close_section();
150  	  f->open_array_section("osd_stats");
151  	  int i = 0;
152  	  // TODO: this isn't really correct since we can dump non-existent OSDs
153  	  // I dunno what osd_last_seq is set to in that case...
154  	  for (auto& p : osd_last_seq) {
155  	    f->open_object_section("osd_stat");
156  	    f->dump_int("osd", i);
157  	    f->dump_unsigned("seq", p);
158  	    f->close_section();
159  	    ++i;
160  	  }
161  	  f->close_section();
162  	  f->open_array_section("num_pg_by_state");
163  	  for (auto& p : num_pg_by_state) {
164  	    f->open_object_section("count");
165  	    f->dump_string("state", pg_state_string(p.first));
166  	    f->dump_unsigned("num", p.second);
167  	    f->close_section();
168  	  }
169  	  f->close_section();
170  	  f->open_array_section("num_pg_by_osd");
171  	  for (auto& p : num_pg_by_osd) {
172  	    f->open_object_section("count");
173  	    f->dump_unsigned("osd", p.first);
174  	    f->dump_unsigned("num_primary_pg", p.second.primary);
175  	    f->dump_unsigned("num_acting_pg", p.second.acting);
176  	    f->dump_unsigned("num_up_not_acting_pg", p.second.up_not_acting);
177  	    f->close_section();
178  	  }
179  	  f->close_section();
180  	  f->open_array_section("purged_snaps");
181  	  for (auto& j : purged_snaps) {
182  	    f->open_object_section("pool");
183  	    f->dump_int("pool", j.first);
184  	    f->open_object_section("purged_snaps");
185  	    for (auto i = j.second.begin(); i != j.second.end(); ++i) {
186  	      f->open_object_section("interval");
187  	      f->dump_stream("start") << i.get_start();
188  	      f->dump_stream("length") << i.get_len();
189  	      f->close_section();
190  	    }
191  	    f->close_section();
192  	    f->close_section();
193  	  }
194  	  f->close_section();
195  	}
196  	
197  	void PGMapDigest::generate_test_instances(list<PGMapDigest*>& ls)
198  	{
199  	  ls.push_back(new PGMapDigest);
200  	}
201  	
// Render a value as text with two digits after the decimal point.
// Anything below 0.01 (including negative values) is rendered as a bare "0".
inline std::string percentify(const float& a) {
  if (a < 0.01) {
    return "0";
  }
  std::ostringstream text;
  text << std::fixed << std::setprecision(2) << a;
  return text.str();
}
210  	
211  	void PGMapDigest::print_summary(ceph::Formatter *f, ostream *out) const
212  	{
213  	  if (f)
214  	    f->open_array_section("pgs_by_state");
215  	
216  	  // list is descending numeric order (by count)
217  	  std::multimap<int,int> state_by_count;  // count -> state
218  	  for (auto p = num_pg_by_state.begin();
219  	       p != num_pg_by_state.end();
220  	       ++p) {
221  	    state_by_count.insert(make_pair(p->second, p->first));
222  	  }
223  	  if (f) {
224  	    for (auto p = state_by_count.rbegin();
225  	         p != state_by_count.rend();
226  	         ++p)
227  	    {
228  	      f->open_object_section("pgs_by_state_element");
229  	      f->dump_string("state_name", pg_state_string(p->second));
230  	      f->dump_unsigned("count", p->first);
231  	      f->close_section();
232  	    }
233  	  }
234  	  if (f)
235  	    f->close_section();
236  	
237  	  if (f) {
238  	    f->dump_unsigned("num_pgs", num_pg);
239  	    f->dump_unsigned("num_pools", pg_pool_sum.size());
240  	    f->dump_unsigned("num_objects", pg_sum.stats.sum.num_objects);
241  	    f->dump_unsigned("data_bytes", pg_sum.stats.sum.num_bytes);
242  	    f->dump_unsigned("bytes_used", osd_sum.statfs.get_used_raw());
243  	    f->dump_unsigned("bytes_avail", osd_sum.statfs.available);
244  	    f->dump_unsigned("bytes_total", osd_sum.statfs.total);
245  	  } else {
246  	    *out << "    pools:   " << pg_pool_sum.size() << " pools, "
247  	         << num_pg << " pgs\n";
248  	    *out << "    objects: " << si_u_t(pg_sum.stats.sum.num_objects) << " objects, "
249  	         << byte_u_t(pg_sum.stats.sum.num_bytes) << "\n";
250  	    *out << "    usage:   "
251  	         << byte_u_t(osd_sum.statfs.get_used_raw()) << " used, "
252  	         << byte_u_t(osd_sum.statfs.available) << " / "
253  	         << byte_u_t(osd_sum.statfs.total) << " avail\n";
254  	    *out << "    pgs:     ";
255  	  }
256  	
257  	  bool pad = false;
258  	
259  	  if (num_pg_unknown > 0) {
260  	    float p = (float)num_pg_unknown / (float)num_pg;
261  	    if (f) {
262  	      f->dump_float("unknown_pgs_ratio", p);
263  	    } else {
264  	      char b[20];
265  	      snprintf(b, sizeof(b), "%.3lf", p * 100.0);
266  	      *out << b << "% pgs unknown\n";
267  	      pad = true;
268  	    }
269  	  }
270  	
271  	  int num_pg_inactive = num_pg - num_pg_active - num_pg_unknown;
272  	  if (num_pg_inactive > 0) {
273  	    float p = (float)num_pg_inactive / (float)num_pg;
274  	    if (f) {
275  	      f->dump_float("inactive_pgs_ratio", p);
276  	    } else {
277  	      if (pad) {
278  	        *out << "             ";
279  	      }
280  	      char b[20];
281  	      snprintf(b, sizeof(b), "%.3f", p * 100.0);
282  	      *out << b << "% pgs not active\n";
283  	      pad = true;
284  	    }
285  	  }
286  	
287  	  list<string> sl;
288  	  overall_recovery_summary(f, &sl);
289  	  if (!f && !sl.empty()) {
290  	    for (auto p = sl.begin(); p != sl.end(); ++p) {
291  	      if (pad) {
292  	        *out << "             ";
293  	      }
294  	      *out << *p << "\n";
295  	      pad = true;
296  	    }
297  	  }
298  	  sl.clear();
299  	
300  	  if (!f) {
301  	    unsigned max_width = 1;
302  	    for (auto p = state_by_count.rbegin(); p != state_by_count.rend(); ++p)
303  	    {
304  	      std::stringstream ss;
305  	      ss << p->first;
306  	      max_width = std::max<size_t>(ss.str().size(), max_width);
307  	    }
308  	
309  	    for (auto p = state_by_count.rbegin(); p != state_by_count.rend(); ++p)
310  	    {
311  	      if (pad) {
312  	        *out << "             ";
313  	      }
314  	      pad = true;
315  	      out->setf(std::ios::left);
316  	      *out << std::setw(max_width) << p->first
317  	           << " " << pg_state_string(p->second) << "\n";
318  	      out->unsetf(std::ios::left);
319  	    }
320  	  }
321  	
322  	  ostringstream ss_rec_io;
323  	  overall_recovery_rate_summary(f, &ss_rec_io);
324  	  ostringstream ss_client_io;
325  	  overall_client_io_rate_summary(f, &ss_client_io);
326  	  ostringstream ss_cache_io;
327  	  overall_cache_io_rate_summary(f, &ss_cache_io);
328  	
329  	  if (!f && (ss_client_io.str().length() || ss_rec_io.str().length()
330  	             || ss_cache_io.str().length())) {
331  	    *out << "\n \n";
332  	    *out << "  io:\n";
333  	  }
334  	
335  	  if (!f && ss_client_io.str().length())
336  	    *out << "    client:   " << ss_client_io.str() << "\n";
337  	  if (!f && ss_rec_io.str().length())
338  	    *out << "    recovery: " << ss_rec_io.str() << "\n";
339  	  if (!f && ss_cache_io.str().length())
340  	    *out << "    cache:    " << ss_cache_io.str() << "\n";
341  	}
342  	
343  	void PGMapDigest::print_oneline_summary(ceph::Formatter *f, ostream *out) const
344  	{
345  	  std::stringstream ss;
346  	
347  	  if (f)
348  	    f->open_array_section("num_pg_by_state");
349  	  for (auto p = num_pg_by_state.begin();
350  	       p != num_pg_by_state.end();
351  	       ++p) {
352  	    if (f) {
353  	      f->open_object_section("state");
354  	      f->dump_string("name", pg_state_string(p->first));
355  	      f->dump_unsigned("num", p->second);
356  	      f->close_section();
357  	    }
358  	    if (p != num_pg_by_state.begin())
359  	      ss << ", ";
360  	    ss << p->second << " " << pg_state_string(p->first);
361  	  }
362  	  if (f)
363  	    f->close_section();
364  	
365  	  string states = ss.str();
366  	  if (out)
367  	    *out << num_pg << " pgs: "
368  	         << states << "; "
369  	         << byte_u_t(pg_sum.stats.sum.num_bytes) << " data, "
370  	         << byte_u_t(osd_sum.statfs.get_used()) << " used, "
371  	         << byte_u_t(osd_sum.statfs.available) << " / "
372  	         << byte_u_t(osd_sum.statfs.total) << " avail";
373  	  if (f) {
374  	    f->dump_unsigned("num_pgs", num_pg);
375  	    f->dump_unsigned("num_bytes", pg_sum.stats.sum.num_bytes);
376  	    f->dump_int("total_bytes", osd_sum.statfs.total);
377  	    f->dump_int("total_avail_bytes", osd_sum.statfs.available);
378  	    f->dump_int("total_used_bytes", osd_sum.statfs.get_used());
379  	    f->dump_int("total_used_raw_bytes", osd_sum.statfs.get_used_raw());
380  	  }
381  	
382  	  // make non-negative; we can get negative values if osds send
383  	  // uncommitted stats and then "go backward" or if they are just
384  	  // buggy/wrong.
385  	  pool_stat_t pos_delta = pg_sum_delta;
386  	  pos_delta.floor(0);
387  	  if (pos_delta.stats.sum.num_rd ||
388  	      pos_delta.stats.sum.num_wr) {
389  	    if (out)
390  	      *out << "; ";
391  	    if (pos_delta.stats.sum.num_rd) {
392  	      int64_t rd = (pos_delta.stats.sum.num_rd_kb << 10) / (double)stamp_delta;
393  	      if (out)
394  		*out << byte_u_t(rd) << "/s rd, ";
395  	      if (f)
396  		f->dump_unsigned("read_bytes_sec", rd);
397  	    }
398  	    if (pos_delta.stats.sum.num_wr) {
399  	      int64_t wr = (pos_delta.stats.sum.num_wr_kb << 10) / (double)stamp_delta;
400  	      if (out)
401  		*out << byte_u_t(wr) << "/s wr, ";
402  	      if (f)
403  		f->dump_unsigned("write_bytes_sec", wr);
404  	    }
405  	    int64_t iops = (pos_delta.stats.sum.num_rd + pos_delta.stats.sum.num_wr) / (double)stamp_delta;
406  	    if (out)
407  	      *out << si_u_t(iops) << " op/s";
408  	    if (f)
409  	      f->dump_unsigned("io_sec", iops);
410  	  }
411  	
412  	  list<string> sl;
413  	  overall_recovery_summary(f, &sl);
414  	  if (out)
415  	    for (auto p = sl.begin(); p != sl.end(); ++p)
416  	      *out << "; " << *p;
417  	  std::stringstream ssr;
418  	  overall_recovery_rate_summary(f, &ssr);
419  	  if (out && ssr.str().length())
420  	    *out << "; " << ssr.str() << " recovering";
421  	}
422  	
423  	void PGMapDigest::get_recovery_stats(
424  	    double *misplaced_ratio,
425  	    double *degraded_ratio,
426  	    double *inactive_pgs_ratio,
427  	    double *unknown_pgs_ratio) const
428  	{
429  	  if (pg_sum.stats.sum.num_objects_degraded &&
430  	      pg_sum.stats.sum.num_object_copies > 0) {
431  	    *degraded_ratio = (double)pg_sum.stats.sum.num_objects_degraded /
432  	      (double)pg_sum.stats.sum.num_object_copies;
433  	  } else {
434  	    *degraded_ratio = 0;
435  	  }
436  	  if (pg_sum.stats.sum.num_objects_misplaced &&
437  	      pg_sum.stats.sum.num_object_copies > 0) {
438  	    *misplaced_ratio = (double)pg_sum.stats.sum.num_objects_misplaced /
439  	      (double)pg_sum.stats.sum.num_object_copies;
440  	  } else {
441  	    *misplaced_ratio = 0;
442  	  }
443  	  if (num_pg > 0) {
444  	    int num_pg_inactive = num_pg - num_pg_active - num_pg_unknown;
445  	    *inactive_pgs_ratio = (double)num_pg_inactive / (double)num_pg;
446  	    *unknown_pgs_ratio = (double)num_pg_unknown / (double)num_pg;
447  	 } else {
448  	    *inactive_pgs_ratio = 0;
449  	    *unknown_pgs_ratio = 0;
450  	  }
451  	}
452  	
453  	void PGMapDigest::recovery_summary(ceph::Formatter *f, list<string> *psl,
454  	                             const pool_stat_t& pool_sum) const
455  	{
456  	  if (pool_sum.stats.sum.num_objects_degraded && pool_sum.stats.sum.num_object_copies > 0) {
457  	    double pc = (double)pool_sum.stats.sum.num_objects_degraded /
458  	                (double)pool_sum.stats.sum.num_object_copies * (double)100.0;
459  	    char b[20];
460  	    snprintf(b, sizeof(b), "%.3lf", pc);
461  	    if (f) {
462  	      f->dump_unsigned("degraded_objects", pool_sum.stats.sum.num_objects_degraded);
463  	      f->dump_unsigned("degraded_total", pool_sum.stats.sum.num_object_copies);
464  	      f->dump_float("degraded_ratio", pc / 100.0);
465  	    } else {
466  	      ostringstream ss;
467  	      ss << pool_sum.stats.sum.num_objects_degraded
468  	         << "/" << pool_sum.stats.sum.num_object_copies << " objects degraded (" << b << "%)";
469  	      psl->push_back(ss.str());
470  	    }
471  	  }
472  	  if (pool_sum.stats.sum.num_objects_misplaced && pool_sum.stats.sum.num_object_copies > 0) {
473  	    double pc = (double)pool_sum.stats.sum.num_objects_misplaced /
474  	                (double)pool_sum.stats.sum.num_object_copies * (double)100.0;
475  	    char b[20];
476  	    snprintf(b, sizeof(b), "%.3lf", pc);
477  	    if (f) {
478  	      f->dump_unsigned("misplaced_objects", pool_sum.stats.sum.num_objects_misplaced);
479  	      f->dump_unsigned("misplaced_total", pool_sum.stats.sum.num_object_copies);
480  	      f->dump_float("misplaced_ratio", pc / 100.0);
481  	    } else {
482  	      ostringstream ss;
483  	      ss << pool_sum.stats.sum.num_objects_misplaced
484  	         << "/" << pool_sum.stats.sum.num_object_copies << " objects misplaced (" << b << "%)";
485  	      psl->push_back(ss.str());
486  	    }
487  	  }
488  	  if (pool_sum.stats.sum.num_objects_unfound && pool_sum.stats.sum.num_objects) {
489  	    double pc = (double)pool_sum.stats.sum.num_objects_unfound /
490  	                (double)pool_sum.stats.sum.num_objects * (double)100.0;
491  	    char b[20];
492  	    snprintf(b, sizeof(b), "%.3lf", pc);
493  	    if (f) {
494  	      f->dump_unsigned("unfound_objects", pool_sum.stats.sum.num_objects_unfound);
495  	      f->dump_unsigned("unfound_total", pool_sum.stats.sum.num_objects);
496  	      f->dump_float("unfound_ratio", pc / 100.0);
497  	    } else {
498  	      ostringstream ss;
499  	      ss << pool_sum.stats.sum.num_objects_unfound
500  	         << "/" << pool_sum.stats.sum.num_objects << " objects unfound (" << b << "%)";
501  	      psl->push_back(ss.str());
502  	    }
503  	  }
504  	}
505  	
506  	void PGMapDigest::recovery_rate_summary(ceph::Formatter *f, ostream *out,
507  	                                  const pool_stat_t& delta_sum,
508  	                                  utime_t delta_stamp) const
509  	{
510  	  // make non-negative; we can get negative values if osds send
511  	  // uncommitted stats and then "go backward" or if they are just
512  	  // buggy/wrong.
513  	  pool_stat_t pos_delta = delta_sum;
514  	  pos_delta.floor(0);
515  	  if (pos_delta.stats.sum.num_objects_recovered ||
516  	      pos_delta.stats.sum.num_bytes_recovered ||
517  	      pos_delta.stats.sum.num_keys_recovered) {
518  	    int64_t objps = pos_delta.stats.sum.num_objects_recovered / (double)delta_stamp;
519  	    int64_t bps = pos_delta.stats.sum.num_bytes_recovered / (double)delta_stamp;
520  	    int64_t kps = pos_delta.stats.sum.num_keys_recovered / (double)delta_stamp;
521  	    if (f) {
522  	      f->dump_int("recovering_objects_per_sec", objps);
523  	      f->dump_int("recovering_bytes_per_sec", bps);
524  	      f->dump_int("recovering_keys_per_sec", kps);
525  	      f->dump_int("num_objects_recovered", pos_delta.stats.sum.num_objects_recovered);
526  	      f->dump_int("num_bytes_recovered", pos_delta.stats.sum.num_bytes_recovered);
527  	      f->dump_int("num_keys_recovered", pos_delta.stats.sum.num_keys_recovered);
528  	    } else {
529  	      *out << byte_u_t(bps) << "/s";
530  	      if (pos_delta.stats.sum.num_keys_recovered)
531  		*out << ", " << si_u_t(kps) << " keys/s";
532  	      *out << ", " << si_u_t(objps) << " objects/s";
533  	    }
534  	  }
535  	}
536  	
537  	void PGMapDigest::overall_recovery_rate_summary(ceph::Formatter *f, ostream *out) const
538  	{
539  	  recovery_rate_summary(f, out, pg_sum_delta, stamp_delta);
540  	}
541  	
542  	void PGMapDigest::overall_recovery_summary(ceph::Formatter *f, list<string> *psl) const
543  	{
544  	  recovery_summary(f, psl, pg_sum);
545  	}
546  	
547  	void PGMapDigest::pool_recovery_rate_summary(ceph::Formatter *f, ostream *out,
548  	                                       uint64_t poolid) const
549  	{
550  	  auto p = per_pool_sum_delta.find(poolid);
551  	  if (p == per_pool_sum_delta.end())
552  	    return;
553  	
554  	  auto ts = per_pool_sum_deltas_stamps.find(p->first);
555  	  ceph_assert(ts != per_pool_sum_deltas_stamps.end());
556  	  recovery_rate_summary(f, out, p->second.first, ts->second);
557  	}
558  	
559  	void PGMapDigest::pool_recovery_summary(ceph::Formatter *f, list<string> *psl,
560  	                                  uint64_t poolid) const
561  	{
562  	  auto p = pg_pool_sum.find(poolid);
563  	  if (p == pg_pool_sum.end())
564  	    return;
565  	
566  	  recovery_summary(f, psl, p->second);
567  	}
568  	
569  	void PGMapDigest::client_io_rate_summary(ceph::Formatter *f, ostream *out,
570  	                                   const pool_stat_t& delta_sum,
571  	                                   utime_t delta_stamp) const
572  	{
573  	  pool_stat_t pos_delta = delta_sum;
574  	  pos_delta.floor(0);
575  	  if (pos_delta.stats.sum.num_rd ||
576  	      pos_delta.stats.sum.num_wr) {
577  	    if (pos_delta.stats.sum.num_rd) {
578  	      int64_t rd = (pos_delta.stats.sum.num_rd_kb << 10) / (double)delta_stamp;
579  	      if (f) {
580  		f->dump_int("read_bytes_sec", rd);
581  	      } else {
582  		*out << byte_u_t(rd) << "/s rd, ";
583  	      }
584  	    }
585  	    if (pos_delta.stats.sum.num_wr) {
586  	      int64_t wr = (pos_delta.stats.sum.num_wr_kb << 10) / (double)delta_stamp;
587  	      if (f) {
588  		f->dump_int("write_bytes_sec", wr);
589  	      } else {
590  		*out << byte_u_t(wr) << "/s wr, ";
591  	      }
592  	    }
593  	    int64_t iops_rd = pos_delta.stats.sum.num_rd / (double)delta_stamp;
594  	    int64_t iops_wr = pos_delta.stats.sum.num_wr / (double)delta_stamp;
595  	    if (f) {
596  	      f->dump_int("read_op_per_sec", iops_rd);
597  	      f->dump_int("write_op_per_sec", iops_wr);
598  	    } else {
599  	      *out << si_u_t(iops_rd) << " op/s rd, " << si_u_t(iops_wr) << " op/s wr";
600  	    }
601  	  }
602  	}
603  	
604  	void PGMapDigest::overall_client_io_rate_summary(ceph::Formatter *f, ostream *out) const
605  	{
606  	  client_io_rate_summary(f, out, pg_sum_delta, stamp_delta);
607  	}
608  	
609  	void PGMapDigest::pool_client_io_rate_summary(ceph::Formatter *f, ostream *out,
610  	                                        uint64_t poolid) const
611  	{
612  	  auto p = per_pool_sum_delta.find(poolid);
613  	  if (p == per_pool_sum_delta.end())
614  	    return;
615  	
616  	  auto ts = per_pool_sum_deltas_stamps.find(p->first);
617  	  ceph_assert(ts != per_pool_sum_deltas_stamps.end());
618  	  client_io_rate_summary(f, out, p->second.first, ts->second);
619  	}
620  	
621  	void PGMapDigest::cache_io_rate_summary(ceph::Formatter *f, ostream *out,
622  	                                  const pool_stat_t& delta_sum,
623  	                                  utime_t delta_stamp) const
624  	{
625  	  pool_stat_t pos_delta = delta_sum;
626  	  pos_delta.floor(0);
627  	  bool have_output = false;
628  	
629  	  if (pos_delta.stats.sum.num_flush) {
630  	    int64_t flush = (pos_delta.stats.sum.num_flush_kb << 10) / (double)delta_stamp;
631  	    if (f) {
632  	      f->dump_int("flush_bytes_sec", flush);
633  	    } else {
634  	      *out << byte_u_t(flush) << "/s flush";
635  	      have_output = true;
636  	    }
637  	  }
638  	  if (pos_delta.stats.sum.num_evict) {
639  	    int64_t evict = (pos_delta.stats.sum.num_evict_kb << 10) / (double)delta_stamp;
640  	    if (f) {
641  	      f->dump_int("evict_bytes_sec", evict);
642  	    } else {
643  	      if (have_output)
644  		*out << ", ";
645  	      *out << byte_u_t(evict) << "/s evict";
646  	      have_output = true;
647  	    }
648  	  }
649  	  if (pos_delta.stats.sum.num_promote) {
650  	    int64_t promote = pos_delta.stats.sum.num_promote / (double)delta_stamp;
651  	    if (f) {
652  	      f->dump_int("promote_op_per_sec", promote);
653  	    } else {
654  	      if (have_output)
655  		*out << ", ";
656  	      *out << si_u_t(promote) << " op/s promote";
657  	      have_output = true;
658  	    }
659  	  }
660  	  if (pos_delta.stats.sum.num_flush_mode_low) {
661  	    if (f) {
662  	      f->dump_int("num_flush_mode_low", pos_delta.stats.sum.num_flush_mode_low);
663  	    } else {
664  	      if (have_output)
665  		*out << ", ";
666  	      *out << si_u_t(pos_delta.stats.sum.num_flush_mode_low) << " PGs flushing";
667  	      have_output = true;
668  	    }
669  	  }
670  	  if (pos_delta.stats.sum.num_flush_mode_high) {
671  	    if (f) {
672  	      f->dump_int("num_flush_mode_high", pos_delta.stats.sum.num_flush_mode_high);
673  	    } else {
674  	      if (have_output)
675  		*out << ", ";
676  	      *out << si_u_t(pos_delta.stats.sum.num_flush_mode_high) << " PGs flushing (high)";
677  	      have_output = true;
678  	    }
679  	  }
680  	  if (pos_delta.stats.sum.num_evict_mode_some) {
681  	    if (f) {
682  	      f->dump_int("num_evict_mode_some", pos_delta.stats.sum.num_evict_mode_some);
683  	    } else {
684  	      if (have_output)
685  		*out << ", ";
686  	      *out << si_u_t(pos_delta.stats.sum.num_evict_mode_some) << " PGs evicting";
687  	      have_output = true;
688  	    }
689  	  }
690  	  if (pos_delta.stats.sum.num_evict_mode_full) {
691  	    if (f) {
692  	      f->dump_int("num_evict_mode_full", pos_delta.stats.sum.num_evict_mode_full);
693  	    } else {
694  	      if (have_output)
695  		*out << ", ";
696  	      *out << si_u_t(pos_delta.stats.sum.num_evict_mode_full) << " PGs evicting (full)";
697  	    }
698  	  }
699  	}
700  	
701  	void PGMapDigest::overall_cache_io_rate_summary(ceph::Formatter *f, ostream *out) const
702  	{
703  	  cache_io_rate_summary(f, out, pg_sum_delta, stamp_delta);
704  	}
705  	
706  	void PGMapDigest::pool_cache_io_rate_summary(ceph::Formatter *f, ostream *out,
707  	                                       uint64_t poolid) const
708  	{
709  	  auto p = per_pool_sum_delta.find(poolid);
710  	  if (p == per_pool_sum_delta.end())
711  	    return;
712  	
713  	  auto ts = per_pool_sum_deltas_stamps.find(p->first);
714  	  ceph_assert(ts != per_pool_sum_deltas_stamps.end());
715  	  cache_io_rate_summary(f, out, p->second.first, ts->second);
716  	}
717  	
718  	ceph_statfs PGMapDigest::get_statfs(OSDMap &osdmap,
719  					    boost::optional<int64_t> data_pool) const
720  	{
721  	  ceph_statfs statfs;
722  	  bool filter = false;
723  	  object_stat_sum_t sum;
724  	
725  	  if (data_pool) {
726  	    auto i = pg_pool_sum.find(*data_pool);
727  	    if (i != pg_pool_sum.end()) {
728  	      sum = i->second.stats.sum;
729  	      filter = true;
730  	    }
731  	  }
732  	
733  	  if (filter) {
734  	    statfs.kb_used = (sum.num_bytes >> 10);
735  	    statfs.kb_avail = get_pool_free_space(osdmap, *data_pool) >> 10;
736  	    statfs.num_objects = sum.num_objects;
737  	    statfs.kb = statfs.kb_used + statfs.kb_avail;
738  	  } else {
739  	    // these are in KB.
740  	    statfs.kb = osd_sum.statfs.kb();
741  	    statfs.kb_used = osd_sum.statfs.kb_used_raw();
742  	    statfs.kb_avail = osd_sum.statfs.kb_avail();
743  	    statfs.num_objects = pg_sum.stats.sum.num_objects;
744  	  }
745  	
746  	  return statfs;
747  	}
748  	
749  	void PGMapDigest::dump_pool_stats_full(
750  	  const OSDMap &osd_map,
751  	  stringstream *ss,
752  	  ceph::Formatter *f,
753  	  bool verbose) const
754  	{
755  	  TextTable tbl;
756  	
757  	  if (f) {
758  	    f->open_array_section("pools");
759  	  } else {
760  	    tbl.define_column("POOL", TextTable::LEFT, TextTable::LEFT);
761  	    tbl.define_column("ID", TextTable::LEFT, TextTable::RIGHT);
762  	    tbl.define_column("STORED", TextTable::LEFT, TextTable::RIGHT);
763  	    if (verbose) {
764  	      tbl.define_column("(DATA)", TextTable::LEFT, TextTable::RIGHT);
765  	      tbl.define_column("(OMAP)", TextTable::LEFT, TextTable::RIGHT);
766  	    }
767  	    tbl.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
768  	    tbl.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
769  	    if (verbose) {
770  	      tbl.define_column("(DATA)", TextTable::LEFT, TextTable::RIGHT);
771  	      tbl.define_column("(OMAP)", TextTable::LEFT, TextTable::RIGHT);
772  	    }
773  	    tbl.define_column("%USED", TextTable::LEFT, TextTable::RIGHT);
774  	    tbl.define_column("MAX AVAIL", TextTable::LEFT, TextTable::RIGHT);
775  	
776  	    if (verbose) {
777  	      tbl.define_column("QUOTA OBJECTS", TextTable::LEFT, TextTable::LEFT);
778  	      tbl.define_column("QUOTA BYTES", TextTable::LEFT, TextTable::LEFT);
779  	      tbl.define_column("DIRTY", TextTable::LEFT, TextTable::RIGHT);
780  	      tbl.define_column("USED COMPR", TextTable::LEFT, TextTable::RIGHT);
781  	      tbl.define_column("UNDER COMPR", TextTable::LEFT, TextTable::RIGHT);
782  	    }
783  	  }
784  	
785  	  map<int,uint64_t> avail_by_rule;
786  	  for (auto p = osd_map.get_pools().begin();
787  	       p != osd_map.get_pools().end(); ++p) {
788  	    int64_t pool_id = p->first;
789  	    if ((pool_id < 0) || (pg_pool_sum.count(pool_id) == 0))
790  	      continue;
791  	
792  	    const string& pool_name = osd_map.get_pool_name(pool_id);
793  	    const pool_stat_t &stat = pg_pool_sum.at(pool_id);
794  	
795  	    const pg_pool_t *pool = osd_map.get_pg_pool(pool_id);
796  	    int ruleno = osd_map.crush->find_rule(pool->get_crush_rule(),
797  	                                         pool->get_type(),
798  	                                         pool->get_size());
799  	    int64_t avail;
800  	    if (avail_by_rule.count(ruleno) == 0) {
801  	      // FIXME: we don't guarantee avail_space_by_rule is up-to-date before this function is invoked
802  	      avail = get_rule_avail(ruleno);
803  	      if (avail < 0)
804  		avail = 0;
805  	      avail_by_rule[ruleno] = avail;
806  	    } else {
807  	      avail = avail_by_rule[ruleno];
808  	    }
809  	    if (f) {
810  	      f->open_object_section("pool");
811  	      f->dump_string("name", pool_name);
812  	      f->dump_int("id", pool_id);
813  	      f->open_object_section("stats");
814  	    } else {
815  	      tbl << pool_name
816  	          << pool_id;
817  	    }
818  	    float raw_used_rate = osd_map.pool_raw_used_rate(pool_id);
819  	    bool per_pool = use_per_pool_stats();
820  	    bool per_pool_omap = use_per_pool_omap_stats();
821  	    dump_object_stat_sum(tbl, f, stat, avail, raw_used_rate, verbose, per_pool,
822  				 per_pool_omap, pool);
823  	    if (f) {
824  	      f->close_section();  // stats
825  	      f->close_section();  // pool
826  	    } else {
827  	      tbl << TextTable::endrow;
828  	    }
829  	  }
830  	  if (f)
831  	    f->close_section();
832  	  else {
833  	    ceph_assert(ss != nullptr);
834  	    *ss << "POOLS:\n";
835  	    tbl.set_indent(4);
836  	    *ss << tbl;
837  	  }
838  	}
839  	
840  	void PGMapDigest::dump_cluster_stats(stringstream *ss,
841  					     ceph::Formatter *f,
842  					     bool verbose) const
843  	{
844  	  if (f) {
845  	    f->open_object_section("stats");
846  	    f->dump_int("total_bytes", osd_sum.statfs.total);
847  	    f->dump_int("total_avail_bytes", osd_sum.statfs.available);
848  	    f->dump_int("total_used_bytes", osd_sum.statfs.get_used());
849  	    f->dump_int("total_used_raw_bytes", osd_sum.statfs.get_used_raw());
850  	    f->dump_float("total_used_raw_ratio", osd_sum.statfs.get_used_raw_ratio());
851  	    f->dump_unsigned("num_osds", osd_sum.num_osds);
852  	    f->dump_unsigned("num_per_pool_osds", osd_sum.num_per_pool_osds);
853  	    f->dump_unsigned("num_per_pool_omap_osds", osd_sum.num_per_pool_omap_osds);
854  	    f->close_section();
855  	    f->open_object_section("stats_by_class");
856  	    for (auto& i : osd_sum_by_class) {
857  	      f->open_object_section(i.first.c_str());
858  	      f->dump_int("total_bytes", i.second.statfs.total);
859  	      f->dump_int("total_avail_bytes", i.second.statfs.available);
860  	      f->dump_int("total_used_bytes", i.second.statfs.get_used());
861  	      f->dump_int("total_used_raw_bytes", i.second.statfs.get_used_raw());
862  	      f->dump_float("total_used_raw_ratio",
863  			    i.second.statfs.get_used_raw_ratio());
864  	      f->close_section();
865  	    }
866  	    f->close_section();
867  	  } else {
868  	    ceph_assert(ss != nullptr);
869  	    TextTable tbl;
870  	    tbl.define_column("CLASS", TextTable::LEFT, TextTable::LEFT);
871  	    tbl.define_column("SIZE", TextTable::LEFT, TextTable::RIGHT);
872  	    tbl.define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
873  	    tbl.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
874  	    tbl.define_column("RAW USED", TextTable::LEFT, TextTable::RIGHT);
875  	    tbl.define_column("%RAW USED", TextTable::LEFT, TextTable::RIGHT);
876  	
877  	
878  	    for (auto& i : osd_sum_by_class) {
879  	      tbl << i.first;
880  	      tbl << stringify(byte_u_t(i.second.statfs.total))
881  		  << stringify(byte_u_t(i.second.statfs.available))
882  		  << stringify(byte_u_t(i.second.statfs.get_used()))
883  		  << stringify(byte_u_t(i.second.statfs.get_used_raw()))
884  		  << percentify(i.second.statfs.get_used_raw_ratio()*100.0)
885  		  << TextTable::endrow;
886  	    }
887  	    tbl << "TOTAL";
888  	    tbl << stringify(byte_u_t(osd_sum.statfs.total))
889  	        << stringify(byte_u_t(osd_sum.statfs.available))
890  	        << stringify(byte_u_t(osd_sum.statfs.get_used()))
891  	        << stringify(byte_u_t(osd_sum.statfs.get_used_raw()))
892  		<< percentify(osd_sum.statfs.get_used_raw_ratio()*100.0)
893  		<< TextTable::endrow;
894  	
895  	    *ss << "RAW STORAGE:\n";
896  	    tbl.set_indent(4);
897  	    *ss << tbl;
898  	  }
899  	}
900  	
// Emit the usage figures of a single pool, either as formatter fields (when
// f is non-null) or as cells appended to the current row of tbl.
//
//  tbl            text table receiving the cells when f is null
//  f              optional formatter; when set, tbl is left untouched
//  pool_stat      aggregated stats of the pool
//  avail          raw available bytes for the pool (see the note below)
//  raw_used_rate  raw-to-stored multiplier; passed by value and adjusted
//                 locally for degraded objects
//  verbose        also emit data/omap breakdown, quota, I/O and compression
//                 fields
//  per_pool       forwarded to the pool_stat data-bytes accessors
//  per_pool_omap  forwarded to the pool_stat omap-bytes accessors
//  pool           only dereferenced when verbose is set (quota fields)
//
// The caller is expected to open/close the enclosing section or table row;
// this function neither opens a section nor emits TextTable::endrow.
void PGMapDigest::dump_object_stat_sum(
  TextTable &tbl, ceph::Formatter *f,
  const pool_stat_t &pool_stat, uint64_t avail,
  float raw_used_rate, bool verbose, bool per_pool, bool per_pool_omap,
  const pg_pool_t *pool)
{
  const object_stat_sum_t &sum = pool_stat.stats.sum;
  const store_statfs_t statfs = pool_stat.store_stats;

  // scale the rate by the fraction of object copies that are not degraded
  if (sum.num_object_copies > 0) {
    raw_used_rate *= (float)(sum.num_object_copies - sum.num_objects_degraded) / sum.num_object_copies;
  }

  uint64_t used_data_bytes = pool_stat.get_allocated_data_bytes(per_pool);
  uint64_t used_omap_bytes = pool_stat.get_allocated_omap_bytes(per_pool_omap);
  uint64_t used_bytes = used_data_bytes + used_omap_bytes;

  // fraction used, in [0, 1]: used_bytes / (used_bytes + avail); reported as
  // completely full when nothing is available but some bytes are used
  float used = 0.0;
  // note avail passed in is raw_avail, calc raw_used here.
  if (avail) {
    used = used_bytes;
    used /= used + avail;
  } else if (used_bytes) {
    used = 1.0;
  }
  // convert raw availability into usable availability; a zero rate yields 0
  // instead of dividing by zero
  auto avail_res = raw_used_rate ? avail / raw_used_rate : 0;
  // an approximation for actually stored user data
  auto stored_data_normalized = pool_stat.get_user_data_bytes(
    raw_used_rate, per_pool);
  auto stored_omap_normalized = pool_stat.get_user_omap_bytes(
    raw_used_rate, per_pool_omap);
  auto stored_normalized = stored_data_normalized + stored_omap_normalized;
  // same, amplified by replication or EC
  auto stored_raw = stored_normalized * raw_used_rate;
  if (f) {
    f->dump_int("stored", stored_normalized);
    if (verbose) {
      f->dump_int("stored_data", stored_data_normalized);
      f->dump_int("stored_omap", stored_omap_normalized);
    }
    f->dump_int("objects", sum.num_objects);
    f->dump_int("kb_used", shift_round_up(used_bytes, 10));
    f->dump_int("bytes_used", used_bytes);
    if (verbose) {
      f->dump_int("data_bytes_used", used_data_bytes);
      f->dump_int("omap_bytes_used", used_omap_bytes);
    }
    // note: emitted as a fraction here, while the text table shows percent
    f->dump_float("percent_used", used);
    f->dump_unsigned("max_avail", avail_res);
    if (verbose) {
      f->dump_int("quota_objects", pool->quota_max_objects);
      f->dump_int("quota_bytes", pool->quota_max_bytes);
      f->dump_int("dirty", sum.num_objects_dirty);
      f->dump_int("rd", sum.num_rd);
      f->dump_int("rd_bytes", sum.num_rd_kb * 1024ull);
      f->dump_int("wr", sum.num_wr);
      f->dump_int("wr_bytes", sum.num_wr_kb * 1024ull);
      f->dump_int("compress_bytes_used", statfs.data_compressed_allocated);
      f->dump_int("compress_under_bytes", statfs.data_compressed_original);
      // Stored by user amplified by replication
      f->dump_int("stored_raw", stored_raw);
    }
  } else {
    // text output: the cell order below must match the column order defined
    // by the caller that owns tbl
    tbl << stringify(byte_u_t(stored_normalized));
    if (verbose) {
      tbl << stringify(byte_u_t(stored_data_normalized));
      tbl << stringify(byte_u_t(stored_omap_normalized));
    }
    tbl << stringify(si_u_t(sum.num_objects));
    tbl << stringify(byte_u_t(used_bytes));
    if (verbose) {
      tbl << stringify(byte_u_t(used_data_bytes));
      tbl << stringify(byte_u_t(used_omap_bytes));
    }
    tbl << percentify(used*100);
    tbl << stringify(byte_u_t(avail_res));
    if (verbose) {
      // a quota of 0 is displayed as "N/A"
      if (pool->quota_max_objects == 0)
        tbl << "N/A";
      else
        tbl << stringify(si_u_t(pool->quota_max_objects));

      if (pool->quota_max_bytes == 0)
        tbl << "N/A";
      else
        tbl << stringify(byte_u_t(pool->quota_max_bytes));

      tbl << stringify(si_u_t(sum.num_objects_dirty))
	  << stringify(byte_u_t(statfs.data_compressed_allocated))
	  << stringify(byte_u_t(statfs.data_compressed_original))
	  ;
    }
  }
}
995  	
996  	int64_t PGMapDigest::get_pool_free_space(const OSDMap &osd_map,
997  						 int64_t poolid) const
998  	{
999  	  const pg_pool_t *pool = osd_map.get_pg_pool(poolid);
1000 	  int ruleno = osd_map.crush->find_rule(pool->get_crush_rule(),
1001 						pool->get_type(),
1002 						pool->get_size());
1003 	  int64_t avail;
1004 	  avail = get_rule_avail(ruleno);
1005 	  if (avail < 0)
1006 	    avail = 0;
1007 	
1008 	  return avail / osd_map.pool_raw_used_rate(poolid);
1009 	}
1010 	
1011 	int64_t PGMap::get_rule_avail(const OSDMap& osdmap, int ruleno) const
1012 	{
1013 	  map<int,float> wm;
1014 	  int r = osdmap.crush->get_rule_weight_osd_map(ruleno, &wm);
1015 	  if (r < 0) {
1016 	    return r;
1017 	  }
1018 	  if (wm.empty()) {
1019 	    return 0;
1020 	  }
1021 	
1022 	  float fratio = osdmap.get_full_ratio();
1023 	
1024 	  int64_t min = -1;
1025 	  for (auto p = wm.begin(); p != wm.end(); ++p) {
1026 	    auto osd_info = osd_stat.find(p->first);
1027 	    if (osd_info != osd_stat.end()) {
1028 	      if (osd_info->second.statfs.total == 0 || p->second == 0) {
1029 		// osd must be out, hence its stats have been zeroed
1030 		// (unless we somehow managed to have a disk with size 0...)
1031 		//
1032 		// (p->second == 0), if osd weight is 0, no need to
1033 		// calculate proj below.
1034 		continue;
1035 	      }
1036 	      double unusable = (double)osd_info->second.statfs.kb() *
1037 		(1.0 - fratio);
1038 	      double avail = std::max(0.0, (double)osd_info->second.statfs.kb_avail() - unusable);
1039 	      avail *= 1024.0;
1040 	      int64_t proj = (int64_t)(avail / (double)p->second);
1041 	      if (min < 0 || proj < min) {
1042 		min = proj;
1043 	      }
1044 	    } else {
1045 	      if (osdmap.is_up(p->first)) {
1046 	        // This is a level 4 rather than an error, because we might have
1047 	        // only just started, and not received the first stats message yet.
1048 	        dout(4) << "OSD " << p->first << " is up, but has no stats" << dendl;
1049 	      }
1050 	    }
1051 	  }
1052 	  return min;
1053 	}
1054 	
1055 	void PGMap::get_rules_avail(const OSDMap& osdmap,
1056 				    std::map<int,int64_t> *avail_map) const
1057 	{
1058 	  avail_map->clear();
1059 	  for (auto p : osdmap.get_pools()) {
1060 	    int64_t pool_id = p.first;
1061 	    if ((pool_id < 0) || (pg_pool_sum.count(pool_id) == 0))
1062 	      continue;
1063 	    const pg_pool_t *pool = osdmap.get_pg_pool(pool_id);
1064 	    int ruleno = osdmap.crush->find_rule(pool->get_crush_rule(),
1065 						 pool->get_type(),
1066 						 pool->get_size());
1067 	    if (avail_map->count(ruleno) == 0)
1068 	      (*avail_map)[ruleno] = get_rule_avail(osdmap, ruleno);
1069 	  }
1070 	}
1071 	
1072 	// ---------------------
1073 	// PGMap
1074 	
1075 	void PGMap::Incremental::dump(ceph::Formatter *f) const
1076 	{
1077 	  f->dump_unsigned("version", version);
1078 	  f->dump_stream("stamp") << stamp;
1079 	  f->dump_unsigned("osdmap_epoch", osdmap_epoch);
1080 	  f->dump_unsigned("pg_scan_epoch", pg_scan);
1081 	
1082 	  f->open_array_section("pg_stat_updates");
1083 	  for (auto p = pg_stat_updates.begin(); p != pg_stat_updates.end(); ++p) {
1084 	    f->open_object_section("pg_stat");
1085 	    f->dump_stream("pgid") << p->first;
1086 	    p->second.dump(f);
1087 	    f->close_section();
1088 	  }
1089 	  f->close_section();
1090 	
1091 	  f->open_array_section("osd_stat_updates");
1092 	  for (auto p = osd_stat_updates.begin(); p != osd_stat_updates.end(); ++p) {
1093 	    f->open_object_section("osd_stat");
1094 	    f->dump_int("osd", p->first);
1095 	    p->second.dump(f);
1096 	    f->close_section();
1097 	  }
1098 	  f->close_section();
1099 	  f->open_array_section("pool_statfs_updates");
1100 	  for (auto p = pool_statfs_updates.begin(); p != pool_statfs_updates.end(); ++p) {
1101 	    f->open_object_section("pool_statfs");
1102 	    f->dump_stream("poolid/osd") << p->first;
1103 	    p->second.dump(f);
1104 	    f->close_section();
1105 	  }
1106 	  f->close_section();
1107 	
1108 	  f->open_array_section("osd_stat_removals");
1109 	  for (auto p = osd_stat_rm.begin(); p != osd_stat_rm.end(); ++p)
1110 	    f->dump_int("osd", *p);
1111 	  f->close_section();
1112 	
1113 	  f->open_array_section("pg_removals");
1114 	  for (auto p = pg_remove.begin(); p != pg_remove.end(); ++p)
1115 	    f->dump_stream("pgid") << *p;
1116 	  f->close_section();
1117 	}
1118 	
1119 	void PGMap::Incremental::generate_test_instances(list<PGMap::Incremental*>& o)
1120 	{
1121 	  o.push_back(new Incremental);
1122 	  o.push_back(new Incremental);
1123 	  o.back()->version = 1;
1124 	  o.back()->stamp = utime_t(123,345);
1125 	  o.push_back(new Incremental);
1126 	  o.back()->version = 2;
1127 	  o.back()->pg_stat_updates[pg_t(1,2)] = pg_stat_t();
1128 	  o.back()->osd_stat_updates[5] = osd_stat_t();
1129 	  o.push_back(new Incremental);
1130 	  o.back()->version = 3;
1131 	  o.back()->osdmap_epoch = 1;
1132 	  o.back()->pg_scan = 2;
1133 	  o.back()->pg_stat_updates[pg_t(4,5)] = pg_stat_t();
1134 	  o.back()->osd_stat_updates[6] = osd_stat_t();
1135 	  o.back()->pg_remove.insert(pg_t(1,2));
1136 	  o.back()->osd_stat_rm.insert(5);
1137 	  o.back()->pool_statfs_updates[std::make_pair(1234,4)] = store_statfs_t();
1138 	}
1139 	
1140 	// --
1141 	
1142 	void PGMap::apply_incremental(CephContext *cct, const Incremental& inc)
1143 	{
1144 	  ceph_assert(inc.version == version+1);
1145 	  version++;
1146 	
1147 	  pool_stat_t pg_sum_old = pg_sum;
1148 	  mempool::pgmap::unordered_map<int32_t, pool_stat_t> pg_pool_sum_old;
1149 	  pg_pool_sum_old = pg_pool_sum;
1150 	
1151 	  for (auto p = inc.pg_stat_updates.begin();
1152 	       p != inc.pg_stat_updates.end();
1153 	       ++p) {
1154 	    const pg_t &update_pg(p->first);
1155 	    auto update_pool = update_pg.pool();
1156 	    const pg_stat_t &update_stat(p->second);
1157 	
1158 	    auto pg_stat_iter = pg_stat.find(update_pg);
1159 	    pool_stat_t &pool_sum_ref = pg_pool_sum[update_pool];
1160 	    if (pg_stat_iter == pg_stat.end()) {
1161 	      pg_stat.insert(make_pair(update_pg, update_stat));
1162 	    } else {
1163 	      stat_pg_sub(update_pg, pg_stat_iter->second);
1164 	      pool_sum_ref.sub(pg_stat_iter->second);
1165 	      pg_stat_iter->second = update_stat;
1166 	    }
1167 	    stat_pg_add(update_pg, update_stat);
1168 	    pool_sum_ref.add(update_stat);
1169 	  }
1170 	
1171 	  for (auto p = inc.pool_statfs_updates.begin();
1172 	       p != inc.pool_statfs_updates.end();
1173 	       ++p) {
1174 	    auto update_pool = p->first.first;
1175 	    auto update_osd =  p->first.second;
1176 	    auto& statfs_inc = p->second;
1177 	
1178 	    auto pool_statfs_iter =
1179 	      pool_statfs.find(std::make_pair(update_pool, update_osd));
1180 	    pool_stat_t &pool_sum_ref = pg_pool_sum[update_pool];
1181 	    if (pool_statfs_iter == pool_statfs.end()) {
1182 	      pool_statfs.emplace(std::make_pair(update_pool, update_osd), statfs_inc);
1183 	    } else {
1184 	      pool_sum_ref.sub(pool_statfs_iter->second);
1185 	      pool_statfs_iter->second = statfs_inc;
1186 	    }
1187 	    pool_sum_ref.add(statfs_inc);
1188 	  }
1189 	
1190 	  for (auto p = inc.get_osd_stat_updates().begin();
1191 	       p != inc.get_osd_stat_updates().end();
1192 	       ++p) {
1193 	    int osd = p->first;
1194 	    const osd_stat_t &new_stats(p->second);
1195 	
1196 	    auto t = osd_stat.find(osd);
1197 	    if (t == osd_stat.end()) {
1198 	      osd_stat.insert(make_pair(osd, new_stats));
1199 	    } else {
1200 	      stat_osd_sub(t->first, t->second);
1201 	      t->second = new_stats;
1202 	    }
1203 	    stat_osd_add(osd, new_stats);
1204 	  }
1205 	  set<int64_t> deleted_pools;
1206 	  for (auto p = inc.pg_remove.begin();
1207 	       p != inc.pg_remove.end();
1208 	       ++p) {
1209 	    const pg_t &removed_pg(*p);
1210 	    auto s = pg_stat.find(removed_pg);
1211 	    bool pool_erased = false;
1212 	    if (s != pg_stat.end()) {
1213 	      pool_erased = stat_pg_sub(removed_pg, s->second);
1214 	      pg_stat.erase(s);
1215 	      if (pool_erased) {
1216 	        deleted_pools.insert(removed_pg.pool());
1217 	      }
1218 	    }
1219 	  }
1220 	
1221 	  for (auto p = inc.get_osd_stat_rm().begin();
1222 	       p != inc.get_osd_stat_rm().end();
1223 	       ++p) {
1224 	    auto t = osd_stat.find(*p);
1225 	    if (t != osd_stat.end()) {
1226 	      stat_osd_sub(t->first, t->second);
1227 	      osd_stat.erase(t);
1228 	    }
1229 	    for (auto i = pool_statfs.begin();  i != pool_statfs.end(); ++i) {
1230 	      if (i->first.second == *p) {
1231 		pg_pool_sum[i->first.first].sub(i->second);
1232 		pool_statfs.erase(i);
1233 	      }
1234 	    }
1235 	  }
1236 	
1237 	  // skip calculating delta while sum was not synchronized
1238 	  if (!stamp.is_zero() && !pg_sum_old.stats.sum.is_zero()) {
1239 	    utime_t delta_t;
1240 	    delta_t = inc.stamp;
1241 	    delta_t -= stamp;
1242 	    // calculate a delta, and average over the last 2 deltas.
1243 	    pool_stat_t d = pg_sum;
1244 	    d.stats.sub(pg_sum_old.stats);
1245 	    pg_sum_deltas.push_back(make_pair(d, delta_t));
1246 	    stamp_delta += delta_t;
1247 	    pg_sum_delta.stats.add(d.stats);
1248 	    auto smooth_intervals =
1249 	      cct ? cct->_conf.get_val<uint64_t>("mon_stat_smooth_intervals") : 1;
1250 	    while (pg_sum_deltas.size() > smooth_intervals) {
1251 	      pg_sum_delta.stats.sub(pg_sum_deltas.front().first.stats);
1252 	      stamp_delta -= pg_sum_deltas.front().second;
1253 	      pg_sum_deltas.pop_front();
1254 	    }
1255 	  }
1256 	  stamp = inc.stamp;
1257 	
1258 	  update_pool_deltas(cct, inc.stamp, pg_pool_sum_old);
1259 	
1260 	  for (auto p : deleted_pools) {
1261 	    if (cct)
1262 	      dout(20) << " deleted pool " << p << dendl;
1263 	    deleted_pool(p);
1264 	  }
1265 	
1266 	  if (inc.osdmap_epoch)
1267 	    last_osdmap_epoch = inc.osdmap_epoch;
1268 	  if (inc.pg_scan)
1269 	    last_pg_scan = inc.pg_scan;
1270 	}
1271 	
1272 	void PGMap::calc_stats()
1273 	{
1274 	  num_pg = 0;
1275 	  num_pg_active = 0;
1276 	  num_pg_unknown = 0;
1277 	  num_osd = 0;
1278 	  pg_pool_sum.clear();
1279 	  num_pg_by_pool.clear();
1280 	  pg_by_osd.clear();
1281 	  pg_sum = pool_stat_t();
1282 	  osd_sum = osd_stat_t();
1283 	  osd_sum_by_class.clear();
1284 	  num_pg_by_state.clear();
1285 	  num_pg_by_pool_state.clear();
1286 	  num_pg_by_osd.clear();
1287 	
1288 	  for (auto p = pg_stat.begin();
1289 	       p != pg_stat.end();
1290 	       ++p) {
1291 	    auto pg = p->first;
1292 	    stat_pg_add(pg, p->second);
1293 	    pg_pool_sum[pg.pool()].add(p->second);
1294 	  }
1295 	  for (auto p = pool_statfs.begin();
1296 	       p != pool_statfs.end();
1297 	       ++p) {
1298 	    auto pool = p->first.first;
1299 	    pg_pool_sum[pool].add(p->second);
1300 	  }
1301 	  for (auto p = osd_stat.begin();
1302 	       p != osd_stat.end();
1303 	       ++p)
1304 	    stat_osd_add(p->first, p->second);
1305 	}
1306 	
1307 	void PGMap::stat_pg_add(const pg_t &pgid, const pg_stat_t &s,
1308 	                        bool sameosds)
1309 	{
1310 	  auto pool = pgid.pool();
1311 	  pg_sum.add(s);
1312 	
1313 	  num_pg++;
1314 	  num_pg_by_state[s.state]++;
1315 	  num_pg_by_pool_state[pgid.pool()][s.state]++;
1316 	  num_pg_by_pool[pool]++;
1317 	
1318 	  if ((s.state & PG_STATE_CREATING) &&
1319 	      s.parent_split_bits == 0) {
1320 	    creating_pgs.insert(pgid);
1321 	    if (s.acting_primary >= 0) {
1322 	      creating_pgs_by_osd_epoch[s.acting_primary][s.mapping_epoch].insert(pgid);
1323 	    }
1324 	  }
1325 	
1326 	  if (s.state & PG_STATE_ACTIVE) {
1327 	    ++num_pg_active;
1328 	  }
1329 	  if (s.state == 0) {
1330 	    ++num_pg_unknown;
1331 	  }
1332 	
1333 	  if (sameosds)
1334 	    return;
1335 	
1336 	  for (auto p = s.blocked_by.begin();
1337 	       p != s.blocked_by.end();
1338 	       ++p) {
1339 	    ++blocked_by_sum[*p];
1340 	  }
1341 	
1342 	  for (auto p = s.acting.begin(); p != s.acting.end(); ++p) {
1343 	    pg_by_osd[*p].insert(pgid);
1344 	    num_pg_by_osd[*p].acting++;
1345 	  }
1346 	  for (auto p = s.up.begin(); p != s.up.end(); ++p) {
1347 	    auto& t = pg_by_osd[*p];
1348 	    if (t.find(pgid) == t.end()) {
1349 	      t.insert(pgid);
1350 	      num_pg_by_osd[*p].up_not_acting++;
1351 	    }
1352 	  }
1353 	
1354 	  if (s.up_primary >= 0) {
1355 	    num_pg_by_osd[s.up_primary].primary++;
1356 	  }
1357 	}
1358 	
1359 	bool PGMap::stat_pg_sub(const pg_t &pgid, const pg_stat_t &s,
1360 	                        bool sameosds)
1361 	{
1362 	  bool pool_erased = false;
1363 	  pg_sum.sub(s);
1364 	
1365 	  num_pg--;
1366 	  int end = --num_pg_by_state[s.state];
1367 	  ceph_assert(end >= 0);
1368 	  if (end == 0)
1369 	    num_pg_by_state.erase(s.state);
1370 	  if (--num_pg_by_pool_state[pgid.pool()][s.state] == 0) {
1371 	    num_pg_by_pool_state[pgid.pool()].erase(s.state);
1372 	  }
1373 	  end = --num_pg_by_pool[pgid.pool()];
1374 	  if (end == 0) {
1375 	    pool_erased = true;
1376 	  }
1377 	
1378 	  if ((s.state & PG_STATE_CREATING) &&
1379 	      s.parent_split_bits == 0) {
1380 	    creating_pgs.erase(pgid);
1381 	    if (s.acting_primary >= 0) {
1382 	      map<epoch_t,set<pg_t> >& r = creating_pgs_by_osd_epoch[s.acting_primary];
1383 	      r[s.mapping_epoch].erase(pgid);
1384 	      if (r[s.mapping_epoch].empty())
1385 		r.erase(s.mapping_epoch);
1386 	      if (r.empty())
1387 		creating_pgs_by_osd_epoch.erase(s.acting_primary);
1388 	    }
1389 	  }
1390 	
1391 	  if (s.state & PG_STATE_ACTIVE) {
1392 	    --num_pg_active;
1393 	  }
1394 	  if (s.state == 0) {
1395 	    --num_pg_unknown;
1396 	  }
1397 	
1398 	  if (sameosds)
1399 	    return pool_erased;
1400 	
1401 	  for (auto p = s.blocked_by.begin();
1402 	       p != s.blocked_by.end();
1403 	       ++p) {
1404 	    auto q = blocked_by_sum.find(*p);
1405 	    ceph_assert(q != blocked_by_sum.end());
1406 	    --q->second;
1407 	    if (q->second == 0)
1408 	      blocked_by_sum.erase(q);
1409 	  }
1410 	
1411 	  set<int32_t> actingset;
1412 	  for (auto p = s.acting.begin(); p != s.acting.end(); ++p) {
1413 	    actingset.insert(*p);
1414 	    auto& oset = pg_by_osd[*p];
1415 	    oset.erase(pgid);
1416 	    if (oset.empty())
1417 	      pg_by_osd.erase(*p);
1418 	    auto it = num_pg_by_osd.find(*p);
1419 	    if (it != num_pg_by_osd.end() && it->second.acting > 0)
1420 	      it->second.acting--;
1421 	  }
1422 	  for (auto p = s.up.begin(); p != s.up.end(); ++p) {
1423 	    auto& oset = pg_by_osd[*p];
1424 	    oset.erase(pgid);
1425 	    if (oset.empty())
1426 	      pg_by_osd.erase(*p);
1427 	    if (actingset.count(*p))
1428 	      continue;
1429 	    auto it = num_pg_by_osd.find(*p);
1430 	    if (it != num_pg_by_osd.end() && it->second.up_not_acting > 0)
1431 	      it->second.up_not_acting--;
1432 	  }
1433 	
1434 	  if (s.up_primary >= 0) {
1435 	    auto it = num_pg_by_osd.find(s.up_primary);
1436 	    if (it != num_pg_by_osd.end() && it->second.primary > 0)
1437 	      it->second.primary--;
1438 	  }
1439 	  return pool_erased;
1440 	}
1441 	
1442 	void PGMap::calc_purged_snaps()
1443 	{
1444 	  purged_snaps.clear();
1445 	  set<int64_t> unknown;
1446 	  for (auto& i : pg_stat) {
1447 	    if (i.second.state == 0) {
1448 	      unknown.insert(i.first.pool());
1449 	      purged_snaps.erase(i.first.pool());
1450 	      continue;
1451 	    } else if (unknown.count(i.first.pool())) {
1452 	      continue;
1453 	    }
1454 	    auto j = purged_snaps.find(i.first.pool());
1455 	    if (j == purged_snaps.end()) {
1456 	      // base case
1457 	      purged_snaps[i.first.pool()] = i.second.purged_snaps;
1458 	    } else {
1459 	      j->second.intersection_of(i.second.purged_snaps);
1460 	    }
1461 	  }
1462 	}
1463 	
1464 	void PGMap::calc_osd_sum_by_class(const OSDMap& osdmap)
1465 	{
1466 	  osd_sum_by_class.clear();
1467 	  for (auto& i : osd_stat) {
1468 	    const char *class_name = osdmap.crush->get_item_class(i.first);
1469 	    if (class_name) {
1470 	      osd_sum_by_class[class_name].add(i.second);
1471 	    }
1472 	  }
1473 	}
1474 	
1475 	void PGMap::stat_osd_add(int osd, const osd_stat_t &s)
1476 	{
1477 	  num_osd++;
1478 	  osd_sum.add(s);
1479 	  if (osd >= (int)osd_last_seq.size()) {
1480 	    osd_last_seq.resize(osd + 1);
1481 	  }
1482 	  osd_last_seq[osd] = s.seq;
1483 	}
1484 	
// Remove the contribution of OSD `osd` with stats `s`; the inverse of
// stat_osd_add().  The OSD must already have a slot in osd_last_seq
// (asserted); that slot is reset to 0 rather than removed.
void PGMap::stat_osd_sub(int osd, const osd_stat_t &s)
{
  num_osd--;
  osd_sum.sub(s);
  ceph_assert(osd < (int)osd_last_seq.size());
  osd_last_seq[osd] = 0;
}
1492 	
// Refresh the digest fields that are derived from the OSDMap and the PG
// stats (avail_space_by_rule, osd_sum_by_class, purged_snaps), then encode
// the PGMapDigest part of this object into bl.
void PGMap::encode_digest(const OSDMap& osdmap,
			  bufferlist& bl, uint64_t features)
{
  get_rules_avail(osdmap, &avail_space_by_rule);
  calc_osd_sum_by_class(osdmap);
  calc_purged_snaps();
  // qualified call: encode only the PGMapDigest base, not the full PGMap
  PGMapDigest::encode(bl, features);
}
1501 	
// Encode the persistent state of the map into bl.  Only the primary data
// is written; the derived sums are not.  The field order here must match
// PGMap::decode().
void PGMap::encode(bufferlist &bl, uint64_t features) const
{
  ENCODE_START(8, 8, bl);
  encode(version, bl);
  encode(pg_stat, bl);
  encode(osd_stat, bl, features);
  encode(last_osdmap_epoch, bl);
  encode(last_pg_scan, bl);
  encode(stamp, bl);
  encode(pool_statfs, bl, features);
  ENCODE_FINISH(bl);
}
1514 	
// Decode the persistent state written by PGMap::encode() (same field
// order), then rebuild all derived sums and counters.
void PGMap::decode(bufferlist::const_iterator &bl)
{
  DECODE_START(8, bl);
  decode(version, bl);
  decode(pg_stat, bl);
  decode(osd_stat, bl);
  decode(last_osdmap_epoch, bl);
  decode(last_pg_scan, bl);
  decode(stamp, bl);
  decode(pool_statfs, bl);
  DECODE_FINISH(bl);

  // the aggregates are not part of the encoding; recompute them
  calc_stats();
}
1529 	
// Dump the whole map to a formatter: basic fields and sums, full (not
// brief) per-PG stats, per-pool stats and per-OSD stats, in that order.
void PGMap::dump(ceph::Formatter *f) const
{
  dump_basic(f);
  dump_pg_stats(f, false);
  dump_pool_stats(f);
  dump_osd_stats(f);
}
1537 	
// Dump the map's scalar header fields, the aggregate PG and OSD sums
// (each in its own object section) and the current PG stats delta.
void PGMap::dump_basic(ceph::Formatter *f) const
{
  f->dump_unsigned("version", version);
  f->dump_stream("stamp") << stamp;
  f->dump_unsigned("last_osdmap_epoch", last_osdmap_epoch);
  f->dump_unsigned("last_pg_scan", last_pg_scan);

  f->open_object_section("pg_stats_sum");
  pg_sum.dump(f);
  f->close_section();

  f->open_object_section("osd_stats_sum");
  osd_sum.dump(f);
  f->close_section();

  dump_delta(f);
}
1555 	
// Dump the accumulated PG stats delta (pg_sum_delta) together with the
// time span it covers (stamp_delta) as a "pg_stats_delta" object.
void PGMap::dump_delta(ceph::Formatter *f) const
{
  f->open_object_section("pg_stats_delta");
  pg_sum_delta.dump(f);
  f->dump_stream("stamp_delta") << stamp_delta;
  f->close_section();
}
1563 	
1564 	void PGMap::dump_pg_stats(ceph::Formatter *f, bool brief) const
1565 	{
1566 	  f->open_array_section("pg_stats");
1567 	  for (auto i = pg_stat.begin();
1568 	       i != pg_stat.end();
1569 	       ++i) {
1570 	    f->open_object_section("pg_stat");
1571 	    f->dump_stream("pgid") << i->first;
1572 	    if (brief)
1573 	      i->second.dump_brief(f);
1574 	    else
1575 	      i->second.dump(f);
1576 	    f->close_section();
1577 	  }
1578 	  f->close_section();
1579 	}
1580 	
1581 	void PGMap::dump_pool_stats(ceph::Formatter *f) const
1582 	{
1583 	  f->open_array_section("pool_stats");
1584 	  for (auto p = pg_pool_sum.begin();
1585 	       p != pg_pool_sum.end();
1586 	       ++p) {
1587 	    f->open_object_section("pool_stat");
1588 	    f->dump_int("poolid", p->first);
1589 	    auto q = num_pg_by_pool.find(p->first);
1590 	    if (q != num_pg_by_pool.end())
1591 	      f->dump_unsigned("num_pg", q->second);
1592 	    p->second.dump(f);
1593 	    f->close_section();
1594 	  }
1595 	  f->close_section();
1596 	}
1597 	
1598 	void PGMap::dump_osd_stats(ceph::Formatter *f) const
1599 	{
1600 	  f->open_array_section("osd_stats");
1601 	  for (auto q = osd_stat.begin();
1602 	       q != osd_stat.end();
1603 	       ++q) {
1604 	    f->open_object_section("osd_stat");
1605 	    f->dump_int("osd", q->first);
1606 	    q->second.dump(f);
1607 	    f->close_section();
1608 	  }
1609 	  f->close_section();
1610 	}
1611 	
1612 	void PGMap::dump_pg_stats_plain(
1613 	  ostream& ss,
1614 	  const mempool::pgmap::unordered_map<pg_t, pg_stat_t>& pg_stats,
1615 	  bool brief) const
1616 	{
1617 	  TextTable tab;
1618 	
1619 	  if (brief){
1620 	    tab.define_column("PG_STAT", TextTable::LEFT, TextTable::LEFT);
1621 	    tab.define_column("STATE", TextTable::LEFT, TextTable::RIGHT);
1622 	    tab.define_column("UP", TextTable::LEFT, TextTable::RIGHT);
1623 	    tab.define_column("UP_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1624 	    tab.define_column("ACTING", TextTable::LEFT, TextTable::RIGHT);
1625 	    tab.define_column("ACTING_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1626 	  }
1627 	  else {
1628 	    tab.define_column("PG_STAT", TextTable::LEFT, TextTable::LEFT);
1629 	    tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
1630 	    tab.define_column("MISSING_ON_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1631 	    tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT);
1632 	    tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT);
1633 	    tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT);
1634 	    tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT);
1635 	    tab.define_column("OMAP_BYTES*", TextTable::LEFT, TextTable::RIGHT);
1636 	    tab.define_column("OMAP_KEYS*", TextTable::LEFT, TextTable::RIGHT);
1637 	    tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT);
1638 	    tab.define_column("DISK_LOG", TextTable::LEFT, TextTable::RIGHT);
1639 	    tab.define_column("STATE", TextTable::LEFT, TextTable::RIGHT);
1640 	    tab.define_column("STATE_STAMP", TextTable::LEFT, TextTable::RIGHT);
1641 	    tab.define_column("VERSION", TextTable::LEFT, TextTable::RIGHT);
1642 	    tab.define_column("REPORTED", TextTable::LEFT, TextTable::RIGHT);
1643 	    tab.define_column("UP", TextTable::LEFT, TextTable::RIGHT);
1644 	    tab.define_column("UP_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1645 	    tab.define_column("ACTING", TextTable::LEFT, TextTable::RIGHT);
1646 	    tab.define_column("ACTING_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1647 	    tab.define_column("LAST_SCRUB", TextTable::LEFT, TextTable::RIGHT);
1648 	    tab.define_column("SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
1649 	    tab.define_column("LAST_DEEP_SCRUB", TextTable::LEFT, TextTable::RIGHT);
1650 	    tab.define_column("DEEP_SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
1651 	    tab.define_column("SNAPTRIMQ_LEN", TextTable::LEFT, TextTable::RIGHT);
1652 	  }
1653 	
1654 	  for (auto i = pg_stats.begin();
1655 	       i != pg_stats.end(); ++i) {
1656 	    const pg_stat_t &st(i->second);
1657 	    if (brief) {
1658 	      tab << i->first
1659 	          << pg_state_string(st.state)
1660 	          << st.up
1661 	          << st.up_primary
1662 	          << st.acting
1663 	          << st.acting_primary
1664 	          << TextTable::endrow;
1665 	    } else {
1666 	      ostringstream reported;
1667 	      reported << st.reported_epoch << ":" << st.reported_seq;
1668 	
1669 	      tab << i->first
1670 	          << st.stats.sum.num_objects
1671 	          << st.stats.sum.num_objects_missing_on_primary
1672 	          << st.stats.sum.num_objects_degraded
1673 	          << st.stats.sum.num_objects_misplaced
1674 	          << st.stats.sum.num_objects_unfound
1675 	          << st.stats.sum.num_bytes
1676 	          << st.stats.sum.num_omap_bytes
1677 	          << st.stats.sum.num_omap_keys
1678 	          << st.log_size
1679 	          << st.ondisk_log_size
1680 	          << pg_state_string(st.state)
1681 	          << st.last_change
1682 	          << st.version
1683 	          << reported.str()
1684 	          << pg_vector_string(st.up)
1685 	          << st.up_primary
1686 	          << pg_vector_string(st.acting)
1687 	          << st.acting_primary
1688 	          << st.last_scrub
1689 	          << st.last_scrub_stamp
1690 	          << st.last_deep_scrub
1691 	          << st.last_deep_scrub_stamp
1692 	          << st.snaptrimq_len
1693 	          << TextTable::endrow;
1694 	    }
1695 	  }
1696 	
1697 	  ss << tab;
1698 	}
1699 	
// Plain-text dump of the whole map: basic fields, full per-PG stats,
// per-pool stats and the PG sum (both without column headings), then the
// per-OSD stats.
void PGMap::dump(ostream& ss) const
{
  dump_basic(ss);
  dump_pg_stats(ss, false);
  dump_pool_stats(ss, false);
  dump_pg_sum_stats(ss, false);
  dump_osd_stats(ss);
}
1708 	
// Plain-text counterpart of dump_basic(Formatter*): writes the four scalar
// header fields as "<name> <value>" lines.  Unlike the formatter variant,
// no sums or deltas are written here.
void PGMap::dump_basic(ostream& ss) const
{
  ss << "version " << version << std::endl;
  ss << "stamp " << stamp << std::endl;
  ss << "last_osdmap_epoch " << last_osdmap_epoch << std::endl;
  ss << "last_pg_scan " << last_pg_scan << std::endl;
}
1716 	
// Plain-text dump of the stats of every PG in this map; `brief` selects the
// short table layout of dump_pg_stats_plain().
void PGMap::dump_pg_stats(ostream& ss, bool brief) const
{
  dump_pg_stats_plain(ss, pg_stat, brief);
}
1721 	
1722 	void PGMap::dump_pool_stats(ostream& ss, bool header) const
1723 	{
1724 	  TextTable tab;
1725 	
1726 	  if (header) {
1727 	    tab.define_column("POOLID", TextTable::LEFT, TextTable::LEFT);
1728 	    tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
1729 	    tab.define_column("MISSING_ON_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1730 	    tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT);
1731 	    tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT);
1732 	    tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT);
1733 	    tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT);
1734 	    tab.define_column("OMAP_BYTES*", TextTable::LEFT, TextTable::RIGHT);
1735 	    tab.define_column("OMAP_KEYS*", TextTable::LEFT, TextTable::RIGHT);
1736 	    tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT);
1737 	    tab.define_column("DISK_LOG", TextTable::LEFT, TextTable::RIGHT);
1738 	  } else {
1739 	    tab.define_column("", TextTable::LEFT, TextTable::LEFT);
1740 	    tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1741 	    tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1742 	    tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1743 	    tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1744 	    tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1745 	    tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1746 	    tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1747 	    tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1748 	    tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1749 	    tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1750 	  }
1751 	
1752 	  for (auto p = pg_pool_sum.begin();
1753 	       p != pg_pool_sum.end();
1754 	       ++p) {
1755 	    tab << p->first
1756 	        << p->second.stats.sum.num_objects
1757 	        << p->second.stats.sum.num_objects_missing_on_primary
1758 	        << p->second.stats.sum.num_objects_degraded
1759 	        << p->second.stats.sum.num_objects_misplaced
1760 	        << p->second.stats.sum.num_objects_unfound
1761 	        << p->second.stats.sum.num_bytes
1762 	        << p->second.stats.sum.num_omap_bytes
1763 	        << p->second.stats.sum.num_omap_keys
1764 	        << p->second.log_size
1765 	        << p->second.ondisk_log_size
1766 	        << TextTable::endrow;
1767 	  }
1768 	
1769 	  ss << tab;
1770 	}
1771 	
1772 	void PGMap::dump_pg_sum_stats(ostream& ss, bool header) const
1773 	{
1774 	  TextTable tab;
1775 	
1776 	  if (header) {
1777 	    tab.define_column("PG_STAT", TextTable::LEFT, TextTable::LEFT);
1778 	    tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
1779 	    tab.define_column("MISSING_ON_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1780 	    tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT);
1781 	    tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT);
1782 	    tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT);
1783 	    tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT);
1784 	    tab.define_column("OMAP_BYTES*", TextTable::LEFT, TextTable::RIGHT);
1785 	    tab.define_column("OMAP_KEYS*", TextTable::LEFT, TextTable::RIGHT);
1786 	    tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT);
1787 	    tab.define_column("DISK_LOG", TextTable::LEFT, TextTable::RIGHT);
1788 	  } else {
1789 	    tab.define_column("", TextTable::LEFT, TextTable::LEFT);
1790 	    tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1791 	    tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1792 	    tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1793 	    tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1794 	    tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1795 	    tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1796 	    tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1797 	    tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1798 	    tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1799 	    tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1800 	  };
1801 	
1802 	  tab << "sum"
1803 	      << pg_sum.stats.sum.num_objects
1804 	      << pg_sum.stats.sum.num_objects_missing_on_primary
1805 	      << pg_sum.stats.sum.num_objects_degraded
1806 	      << pg_sum.stats.sum.num_objects_misplaced
1807 	      << pg_sum.stats.sum.num_objects_unfound
1808 	      << pg_sum.stats.sum.num_bytes
1809 	      << pg_sum.stats.sum.num_omap_bytes
1810 	      << pg_sum.stats.sum.num_omap_keys
1811 	      << pg_sum.log_size
1812 	      << pg_sum.ondisk_log_size
1813 	      << TextTable::endrow;
1814 	
1815 	  ss << tab;
1816 	}
1817 	
1818 	void PGMap::dump_osd_stats(ostream& ss) const
1819 	{
1820 	  TextTable tab;
1821 	
1822 	  tab.define_column("OSD_STAT", TextTable::LEFT, TextTable::LEFT);
1823 	  tab.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
1824 	  tab.define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
1825 	  tab.define_column("USED_RAW", TextTable::LEFT, TextTable::RIGHT);
1826 	  tab.define_column("TOTAL", TextTable::LEFT, TextTable::RIGHT);
1827 	  tab.define_column("HB_PEERS", TextTable::LEFT, TextTable::RIGHT);
1828 	  tab.define_column("PG_SUM", TextTable::LEFT, TextTable::RIGHT);
1829 	  tab.define_column("PRIMARY_PG_SUM", TextTable::LEFT, TextTable::RIGHT);
1830 	
1831 	  for (auto p = osd_stat.begin();
1832 	       p != osd_stat.end();
1833 	       ++p) {
1834 	    tab << p->first
1835 	        << byte_u_t(p->second.statfs.get_used())
1836 	        << byte_u_t(p->second.statfs.available)
1837 	        << byte_u_t(p->second.statfs.get_used_raw())
1838 	        << byte_u_t(p->second.statfs.total)
1839 	        << p->second.hb_peers
1840 	        << get_num_pg_by_osd(p->first)
1841 	        << get_num_primary_pg_by_osd(p->first)
1842 	        << TextTable::endrow;
1843 	  }
1844 	
1845 	  tab << "sum"
1846 	      << byte_u_t(osd_sum.statfs.get_used())
1847 	      << byte_u_t(osd_sum.statfs.available)
1848 	      << byte_u_t(osd_sum.statfs.get_used_raw())
1849 	      << byte_u_t(osd_sum.statfs.total)
1850 	      << TextTable::endrow;
1851 	
1852 	  ss << tab;
1853 	}
1854 	
1855 	void PGMap::dump_osd_sum_stats(ostream& ss) const
1856 	{
1857 	  TextTable tab;
1858 	
1859 	  tab.define_column("OSD_STAT", TextTable::LEFT, TextTable::LEFT);
1860 	  tab.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
1861 	  tab.define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
1862 	  tab.define_column("USED_RAW", TextTable::LEFT, TextTable::RIGHT);
1863 	  tab.define_column("TOTAL", TextTable::LEFT, TextTable::RIGHT);
1864 	
1865 	  tab << "sum"
1866 	      << byte_u_t(osd_sum.statfs.get_used())
1867 	      << byte_u_t(osd_sum.statfs.available)
1868 	      << byte_u_t(osd_sum.statfs.get_used_raw())
1869 	      << byte_u_t(osd_sum.statfs.total)
1870 	      << TextTable::endrow;
1871 	
1872 	  ss << tab;
1873 	}
1874 	
1875 	void PGMap::get_stuck_stats(
1876 	  int types, const utime_t cutoff,
1877 	  mempool::pgmap::unordered_map<pg_t, pg_stat_t>& stuck_pgs) const
1878 	{
1879 	  ceph_assert(types != 0);
1880 	  for (auto i = pg_stat.begin();
1881 	       i != pg_stat.end();
1882 	       ++i) {
1883 	    utime_t val = cutoff; // don't care about >= cutoff so that is infinity
1884 	
1885 	    if ((types & STUCK_INACTIVE) && !(i->second.state & PG_STATE_ACTIVE)) {
1886 	      if (i->second.last_active < val)
1887 		val = i->second.last_active;
1888 	    }
1889 	
1890 	    if ((types & STUCK_UNCLEAN) && !(i->second.state & PG_STATE_CLEAN)) {
1891 	      if (i->second.last_clean < val)
1892 		val = i->second.last_clean;
1893 	    }
1894 	
1895 	    if ((types & STUCK_DEGRADED) && (i->second.state & PG_STATE_DEGRADED)) {
1896 	      if (i->second.last_undegraded < val)
1897 		val = i->second.last_undegraded;
1898 	    }
1899 	
1900 	    if ((types & STUCK_UNDERSIZED) && (i->second.state & PG_STATE_UNDERSIZED)) {
1901 	      if (i->second.last_fullsized < val)
1902 		val = i->second.last_fullsized;
1903 	    }
1904 	
1905 	    if ((types & STUCK_STALE) && (i->second.state & PG_STATE_STALE)) {
1906 	      if (i->second.last_unstale < val)
1907 		val = i->second.last_unstale;
1908 	    }
1909 	
1910 	    // val is now the earliest any of the requested stuck states began
1911 	    if (val < cutoff) {
1912 	      stuck_pgs[i->first] = i->second;
1913 	    }
1914 	  }
1915 	}
1916 	
1917 	bool PGMap::get_stuck_counts(const utime_t cutoff, map<string, int>& note) const
1918 	{
1919 	  int inactive = 0;
1920 	  int unclean = 0;
1921 	  int degraded = 0;
1922 	  int undersized = 0;
1923 	  int stale = 0;
1924 	
1925 	  for (auto i = pg_stat.begin();
1926 	       i != pg_stat.end();
1927 	       ++i) {
1928 	    if (! (i->second.state & PG_STATE_ACTIVE)) {
1929 	      if (i->second.last_active < cutoff)
1930 	        ++inactive;
1931 	    }
1932 	    if (! (i->second.state & PG_STATE_CLEAN)) {
1933 	      if (i->second.last_clean < cutoff)
1934 	        ++unclean;
1935 	    }
1936 	    if (i->second.state & PG_STATE_DEGRADED) {
1937 	      if (i->second.last_undegraded < cutoff)
1938 	        ++degraded;
1939 	    }
1940 	    if (i->second.state & PG_STATE_UNDERSIZED) {
1941 	      if (i->second.last_fullsized < cutoff)
1942 	        ++undersized;
1943 	    }
1944 	    if (i->second.state & PG_STATE_STALE) {
1945 	      if (i->second.last_unstale < cutoff)
1946 	        ++stale;
1947 	    }
1948 	  }
1949 	
1950 	  if (inactive)
1951 	    note["stuck inactive"] = inactive;
1952 	
1953 	  if (unclean)
1954 	    note["stuck unclean"] = unclean;
1955 	
1956 	  if (undersized)
1957 	    note["stuck undersized"] = undersized;
1958 	
1959 	  if (degraded)
1960 	    note["stuck degraded"] = degraded;
1961 	
1962 	  if (stale)
1963 	    note["stuck stale"] = stale;
1964 	
1965 	  return inactive || unclean || undersized || degraded || stale;
1966 	}
1967 	
1968 	void PGMap::dump_stuck(ceph::Formatter *f, int types, utime_t cutoff) const
1969 	{
1970 	  mempool::pgmap::unordered_map<pg_t, pg_stat_t> stuck_pg_stats;
1971 	  get_stuck_stats(types, cutoff, stuck_pg_stats);
1972 	  f->open_array_section("stuck_pg_stats");
1973 	  for (auto i = stuck_pg_stats.begin();
1974 	       i != stuck_pg_stats.end();
1975 	       ++i) {
1976 	    f->open_object_section("pg_stat");
1977 	    f->dump_stream("pgid") << i->first;
1978 	    i->second.dump(f);
1979 	    f->close_section();
1980 	  }
1981 	  f->close_section();
1982 	}
1983 	
1984 	void PGMap::dump_stuck_plain(ostream& ss, int types, utime_t cutoff) const
1985 	{
1986 	  mempool::pgmap::unordered_map<pg_t, pg_stat_t> stuck_pg_stats;
1987 	  get_stuck_stats(types, cutoff, stuck_pg_stats);
1988 	  if (!stuck_pg_stats.empty())
1989 	    dump_pg_stats_plain(ss, stuck_pg_stats, true);
1990 	}
1991 	
1992 	int PGMap::dump_stuck_pg_stats(
1993 	  stringstream &ds,
1994 	  ceph::Formatter *f,
1995 	  int threshold,
1996 	  vector<string>& args) const
1997 	{
1998 	  int stuck_types = 0;
1999 	
2000 	  for (auto i = args.begin(); i != args.end(); ++i) {
2001 	    if (*i == "inactive")
2002 	      stuck_types |= PGMap::STUCK_INACTIVE;
2003 	    else if (*i == "unclean")
2004 	      stuck_types |= PGMap::STUCK_UNCLEAN;
2005 	    else if (*i == "undersized")
2006 	      stuck_types |= PGMap::STUCK_UNDERSIZED;
2007 	    else if (*i == "degraded")
2008 	      stuck_types |= PGMap::STUCK_DEGRADED;
2009 	    else if (*i == "stale")
2010 	      stuck_types |= PGMap::STUCK_STALE;
2011 	    else {
2012 	      ds << "Unknown type: " << *i << std::endl;
2013 	      return -EINVAL;
2014 	    }
2015 	  }
2016 	
2017 	  utime_t now(ceph_clock_now());
2018 	  utime_t cutoff = now - utime_t(threshold, 0);
2019 	
2020 	  if (!f) {
2021 	    dump_stuck_plain(ds, stuck_types, cutoff);
2022 	  } else {
2023 	    dump_stuck(f, stuck_types, cutoff);
2024 	    f->flush(ds);
2025 	  }
2026 	
2027 	  return 0;
2028 	}
2029 	
2030 	void PGMap::dump_osd_perf_stats(ceph::Formatter *f) const
2031 	{
2032 	  f->open_array_section("osd_perf_infos");
2033 	  for (auto i = osd_stat.begin();
2034 	       i != osd_stat.end();
2035 	       ++i) {
2036 	    f->open_object_section("osd");
2037 	    f->dump_int("id", i->first);
2038 	    {
2039 	      f->open_object_section("perf_stats");
2040 	      i->second.os_perf_stat.dump(f);
2041 	      f->close_section();
2042 	    }
2043 	    f->close_section();
2044 	  }
2045 	  f->close_section();
2046 	}
2047 	void PGMap::print_osd_perf_stats(std::ostream *ss) const
2048 	{
2049 	  TextTable tab;
2050 	  tab.define_column("osd", TextTable::LEFT, TextTable::RIGHT);
2051 	  tab.define_column("commit_latency(ms)", TextTable::LEFT, TextTable::RIGHT);
2052 	  tab.define_column("apply_latency(ms)", TextTable::LEFT, TextTable::RIGHT);
2053 	  for (auto i = osd_stat.begin();
2054 	       i != osd_stat.end();
2055 	       ++i) {
2056 	    tab << i->first;
2057 	    tab << i->second.os_perf_stat.os_commit_latency_ns / 1000000ull;
2058 	    tab << i->second.os_perf_stat.os_apply_latency_ns / 1000000ull;
2059 	    tab << TextTable::endrow;
2060 	  }
2061 	  (*ss) << tab;
2062 	}
2063 	
2064 	void PGMap::dump_osd_blocked_by_stats(ceph::Formatter *f) const
2065 	{
2066 	  f->open_array_section("osd_blocked_by_infos");
2067 	  for (auto i = blocked_by_sum.begin();
2068 	       i != blocked_by_sum.end();
2069 	       ++i) {
2070 	    f->open_object_section("osd");
2071 	    f->dump_int("id", i->first);
2072 	    f->dump_int("num_blocked", i->second);
2073 	    f->close_section();
2074 	  }
2075 	  f->close_section();
2076 	}
2077 	void PGMap::print_osd_blocked_by_stats(std::ostream *ss) const
2078 	{
2079 	  TextTable tab;
2080 	  tab.define_column("osd", TextTable::LEFT, TextTable::RIGHT);
2081 	  tab.define_column("num_blocked", TextTable::LEFT, TextTable::RIGHT);
2082 	  for (auto i = blocked_by_sum.begin();
2083 	       i != blocked_by_sum.end();
2084 	       ++i) {
2085 	    tab << i->first;
2086 	    tab << i->second;
2087 	    tab << TextTable::endrow;
2088 	  }
2089 	  (*ss) << tab;
2090 	}
2091 	
2092 	
2093 	/**
2094 	 * update aggregated delta
2095 	 *
2096 	 * @param cct               ceph context
2097 	 * @param ts                Timestamp for the stats being delta'ed
2098 	 * @param old_pool_sum      Previous stats sum
2099 	 * @param last_ts           Last timestamp for pool
2100 	 * @param result_pool_sum   Resulting stats
2101 	 * @param result_pool_delta Resulting pool delta
2102 	 * @param result_ts_delta   Resulting timestamp delta
2103 	 * @param delta_avg_list    List of last N computed deltas, used to average
2104 	 */
2105 	void PGMap::update_delta(
2106 	  CephContext *cct,
2107 	  const utime_t ts,
2108 	  const pool_stat_t& old_pool_sum,
2109 	  utime_t *last_ts,
2110 	  const pool_stat_t& current_pool_sum,
2111 	  pool_stat_t *result_pool_delta,
2112 	  utime_t *result_ts_delta,
2113 	  mempool::pgmap::list<pair<pool_stat_t,utime_t> > *delta_avg_list)
2114 	{
2115 	  /* @p ts is the timestamp we want to associate with the data
2116 	   * in @p old_pool_sum, and on which we will base ourselves to
2117 	   * calculate the delta, stored in 'delta_t'.
2118 	   */
2119 	  utime_t delta_t;
2120 	  delta_t = ts;         // start with the provided timestamp
2121 	  delta_t -= *last_ts;  // take the last timestamp we saw
2122 	  *last_ts = ts;        // @p ts becomes the last timestamp we saw
2123 	
2124 	  // adjust delta_t, quick start if there is no update in a long period
2125 	  delta_t = std::min(delta_t,
2126 	                    utime_t(2 * (cct ? cct->_conf->mon_delta_reset_interval : 10), 0));
2127 	
2128 	  // calculate a delta, and average over the last 6 deltas by default.
2129 	  /* start by taking a copy of our current @p result_pool_sum, and by
2130 	   * taking out the stats from @p old_pool_sum.  This generates a stats
2131 	   * delta.  Stash this stats delta in @p delta_avg_list, along with the
2132 	   * timestamp delta for these results.
2133 	   */
2134 	  pool_stat_t d = current_pool_sum;
2135 	  d.stats.sub(old_pool_sum.stats);
2136 	
2137 	  /* Aggregate current delta, and take out the last seen delta (if any) to
2138 	   * average it out.
2139 	   * Skip calculating delta while sum was not synchronized.
2140 	   */
2141 	  if(!old_pool_sum.stats.sum.is_zero()) {
2142 	    delta_avg_list->push_back(make_pair(d,delta_t));
2143 	    *result_ts_delta += delta_t;
2144 	    result_pool_delta->stats.add(d.stats);
2145 	  }
2146 	  size_t s = cct ? cct->_conf.get_val<uint64_t>("mon_stat_smooth_intervals") : 1;
2147 	  while (delta_avg_list->size() > s) {
2148 	    result_pool_delta->stats.sub(delta_avg_list->front().first.stats);
2149 	    *result_ts_delta -= delta_avg_list->front().second;
2150 	    delta_avg_list->pop_front();
2151 	  }
2152 	}
2153 	
2154 	/**
2155 	 * Update a given pool's deltas
2156 	 *
2157 	 * @param cct           Ceph Context
2158 	 * @param ts            Timestamp for the stats being delta'ed
2159 	 * @param pool          Pool's id
2160 	 * @param old_pool_sum  Previous stats sum
2161 	 */
2162 	void PGMap::update_one_pool_delta(
2163 	  CephContext *cct,
2164 	  const utime_t ts,
2165 	  const int64_t pool,
2166 	  const pool_stat_t& old_pool_sum)
2167 	{
2168 	  if (per_pool_sum_deltas.count(pool) == 0) {
2169 	    ceph_assert(per_pool_sum_deltas_stamps.count(pool) == 0);
2170 	    ceph_assert(per_pool_sum_delta.count(pool) == 0);
2171 	  }
2172 	
2173 	  auto& sum_delta = per_pool_sum_delta[pool];
2174 	
2175 	  update_delta(cct, ts, old_pool_sum, &sum_delta.second, pg_pool_sum[pool],
2176 	               &sum_delta.first, &per_pool_sum_deltas_stamps[pool],
2177 	               &per_pool_sum_deltas[pool]);
2178 	}
2179 	
2180 	/**
2181 	 * Update pools' deltas
2182 	 *
2183 	 * @param cct               CephContext
2184 	 * @param ts                Timestamp for the stats being delta'ed
2185 	 * @param pg_pool_sum_old   Map of pool stats for delta calcs.
2186 	 */
2187 	void PGMap::update_pool_deltas(
2188 	  CephContext *cct, const utime_t ts,
2189 	  const mempool::pgmap::unordered_map<int32_t,pool_stat_t>& pg_pool_sum_old)
2190 	{
2191 	  for (auto it = pg_pool_sum_old.begin();
2192 	       it != pg_pool_sum_old.end(); ++it) {
2193 	    update_one_pool_delta(cct, ts, it->first, it->second);
2194 	  }
2195 	}
2196 	
2197 	void PGMap::clear_delta()
2198 	{
2199 	  pg_sum_delta = pool_stat_t();
2200 	  pg_sum_deltas.clear();
2201 	  stamp_delta = utime_t();
2202 	}
2203 	
2204 	void PGMap::generate_test_instances(list<PGMap*>& o)
2205 	{
2206 	  o.push_back(new PGMap);
2207 	  list<Incremental*> inc;
2208 	  Incremental::generate_test_instances(inc);
2209 	  delete inc.front();
2210 	  inc.pop_front();
2211 	  while (!inc.empty()) {
2212 	    PGMap *pmp = new PGMap();
2213 	    *pmp = *o.back();
2214 	    o.push_back(pmp);
2215 	    o.back()->apply_incremental(NULL, *inc.front());
2216 	    delete inc.front();
2217 	    inc.pop_front();
2218 	  }
2219 	}
2220 	
2221 	void PGMap::get_filtered_pg_stats(uint64_t state, int64_t poolid, int64_t osdid,
2222 	                                  bool primary, set<pg_t>& pgs) const
2223 	{
2224 	  for (auto i = pg_stat.begin();
2225 	       i != pg_stat.end();
2226 	       ++i) {
2227 	    if ((poolid >= 0) && (poolid != i->first.pool()))
2228 	      continue;
2229 	    if ((osdid >= 0) && !(i->second.is_acting_osd(osdid,primary)))
2230 	      continue;
2231 	    if (state == (uint64_t)-1 ||                 // "all"
2232 		(i->second.state & state) ||             // matches a state bit
2233 		(state == 0 && i->second.state == 0)) {  // matches "unknown" (== 0)
2234 	      pgs.insert(i->first);
2235 	    }
2236 	  }
2237 	}
2238 	
2239 	void PGMap::dump_filtered_pg_stats(ceph::Formatter *f, set<pg_t>& pgs) const
2240 	{
2241 	  f->open_array_section("pg_stats");
2242 	  for (auto i = pgs.begin(); i != pgs.end(); ++i) {
2243 	    const pg_stat_t& st = pg_stat.at(*i);
2244 	    f->open_object_section("pg_stat");
2245 	    f->dump_stream("pgid") << *i;
2246 	    st.dump(f);
2247 	    f->close_section();
2248 	  }
2249 	  f->close_section();
2250 	}
2251 	
2252 	void PGMap::dump_filtered_pg_stats(ostream& ss, set<pg_t>& pgs) const
2253 	{
2254 	  TextTable tab;
2255 	  utime_t now = ceph_clock_now();
2256 	
2257 	  tab.define_column("PG", TextTable::LEFT, TextTable::LEFT);
2258 	  tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
2259 	  tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT);
2260 	  tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT);
2261 	  tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT);
2262 	  tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT);
2263 	  tab.define_column("OMAP_BYTES*", TextTable::LEFT, TextTable::RIGHT);
2264 	  tab.define_column("OMAP_KEYS*", TextTable::LEFT, TextTable::RIGHT);
2265 	  tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT);
2266 	  tab.define_column("STATE", TextTable::LEFT, TextTable::RIGHT);
2267 	  tab.define_column("SINCE", TextTable::LEFT, TextTable::RIGHT);
2268 	  tab.define_column("VERSION", TextTable::LEFT, TextTable::RIGHT);
2269 	  tab.define_column("REPORTED", TextTable::LEFT, TextTable::RIGHT);
2270 	  tab.define_column("UP", TextTable::LEFT, TextTable::RIGHT);
2271 	  tab.define_column("ACTING", TextTable::LEFT, TextTable::RIGHT);
2272 	  tab.define_column("SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
2273 	  tab.define_column("DEEP_SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
2274 	
2275 	  for (auto i = pgs.begin(); i != pgs.end(); ++i) {
2276 	    const pg_stat_t& st = pg_stat.at(*i);
2277 	
2278 	    ostringstream reported;
2279 	    reported << st.reported_epoch << ":" << st.reported_seq;
2280 	
2281 	    ostringstream upstr, actingstr;
2282 	    upstr << st.up << 'p' << st.up_primary;
2283 	    actingstr << st.acting << 'p' << st.acting_primary;
2284 	    tab << *i
2285 	        << st.stats.sum.num_objects
2286 	        << st.stats.sum.num_objects_degraded
2287 	        << st.stats.sum.num_objects_misplaced
2288 	        << st.stats.sum.num_objects_unfound
2289 	        << st.stats.sum.num_bytes
2290 	        << st.stats.sum.num_omap_bytes
2291 	        << st.stats.sum.num_omap_keys
2292 	        << st.log_size
2293 	        << pg_state_string(st.state)
2294 	        << utimespan_str(now - st.last_change)
2295 	        << st.version
2296 	        << reported.str()
2297 	        << upstr.str()
2298 	        << actingstr.str()
2299 	        << st.last_scrub_stamp
2300 	        << st.last_deep_scrub_stamp
2301 	        << TextTable::endrow;
2302 	  }
2303 	
2304 	  ss << tab;
2305 	}
2306 	
2307 	void PGMap::dump_pool_stats_and_io_rate(int64_t poolid, const OSDMap &osd_map,
2308 	                                        ceph::Formatter *f,
2309 	                                        stringstream *rs) const {
2310 	  string pool_name = osd_map.get_pool_name(poolid);
2311 	  if (f) {
2312 	    f->open_object_section("pool");
2313 	    f->dump_string("pool_name", pool_name.c_str());
2314 	    f->dump_int("pool_id", poolid);
2315 	    f->open_object_section("recovery");
2316 	  }
2317 	  list<string> sl;
2318 	  stringstream tss;
2319 	  pool_recovery_summary(f, &sl, poolid);
2320 	  if (!f && !sl.empty()) {
2321 	    for (auto &p : sl)
2322 	      tss << "  " << p << "\n";
2323 	  }
2324 	  if (f) {
2325 	    f->close_section(); // object section recovery
2326 	    f->open_object_section("recovery_rate");
2327 	  }
2328 	  ostringstream rss;
2329 	  pool_recovery_rate_summary(f, &rss, poolid);
2330 	  if (!f && !rss.str().empty())
2331 	    tss << "  recovery io " << rss.str() << "\n";
2332 	  if (f) {
2333 	    f->close_section(); // object section recovery_rate
2334 	    f->open_object_section("client_io_rate");
2335 	  }
2336 	  rss.clear();
2337 	  rss.str("");
2338 	  pool_client_io_rate_summary(f, &rss, poolid);
2339 	  if (!f && !rss.str().empty())
2340 	    tss << "  client io " << rss.str() << "\n";
2341 	  // dump cache tier IO rate for cache pool
2342 	  const pg_pool_t *pool = osd_map.get_pg_pool(poolid);
2343 	  if (pool->is_tier()) {
2344 	    if (f) {
2345 	      f->close_section(); // object section client_io_rate
2346 	      f->open_object_section("cache_io_rate");
2347 	    }
2348 	    rss.clear();
2349 	    rss.str("");
2350 	    pool_cache_io_rate_summary(f, &rss, poolid);
2351 	    if (!f && !rss.str().empty())
2352 	      tss << "  cache tier io " << rss.str() << "\n";
2353 	  }
2354 	  if (f) {
2355 	    f->close_section(); // object section cache_io_rate
2356 	    f->close_section(); // object section pool
2357 	  } else {
2358 	    *rs << "pool " << pool_name << " id " << poolid << "\n";
2359 	    if (!tss.str().empty())
2360 	      *rs << tss.str() << "\n";
2361 	    else
2362 	      *rs << "  nothing is going on\n\n";
2363 	  }
2364 	}
2365 	
2366 	void PGMap::get_health_checks(
2367 	  CephContext *cct,
2368 	  const OSDMap& osdmap,
2369 	  health_check_map_t *checks) const
2370 	{
2371 	  utime_t now = ceph_clock_now();
2372 	  const auto max = cct->_conf.get_val<uint64_t>("mon_health_max_detail");
2373 	  const auto& pools = osdmap.get_pools();
2374 	
2375 	  typedef enum pg_consequence_t {
2376 	    UNAVAILABLE = 1,   // Client IO to the pool may block
2377 	    DEGRADED = 2,      // Fewer than the requested number of replicas are present
2378 	    BACKFILL_FULL = 3, // Backfill is blocked for space considerations
2379 	                       // This may or may not be a deadlock condition.
2380 	    DAMAGED = 4,        // The data may be missing or inconsistent on disk and
2381 	                       //  requires repair
2382 	    RECOVERY_FULL = 5  // Recovery is blocked because OSDs are full
2383 	  } pg_consequence_t;
2384 	
2385 	  // For a given PG state, how should it be reported at the pool level?
2386 	  class PgStateResponse {
2387 	    public:
2388 	    pg_consequence_t consequence;
2389 	    typedef std::function< utime_t(const pg_stat_t&) > stuck_cb;
2390 	    stuck_cb stuck_since;
2391 	    bool invert;
2392 	
2393 	    PgStateResponse(const pg_consequence_t& c, stuck_cb&& s)
2394 	      : consequence(c), stuck_since(std::move(s)), invert(false)
2395 	    {
2396 	    }
2397 	
2398 	    PgStateResponse(const pg_consequence_t& c, stuck_cb&& s, bool i)
2399 	      : consequence(c), stuck_since(std::move(s)), invert(i)
2400 	    {
2401 	    }
2402 	  };
2403 	
2404 	  // Record the PG state counts that contributed to a reported pool state
2405 	  class PgCauses {
2406 	    public:
2407 	    // Map of PG_STATE_* to number of pgs in that state.
2408 	    std::map<unsigned, unsigned> states;
2409 	
2410 	    // List of all PG IDs that had a state contributing
2411 	    // to this health condition.
2412 	    std::set<pg_t> pgs;
2413 	
2414 	    std::map<pg_t, std::string> pg_messages;
2415 	  };
2416 	
2417 	  // Map of PG state to how to respond to it
2418 	  std::map<unsigned, PgStateResponse> state_to_response = {
2419 	    // Immediate reports
2420 	    { PG_STATE_INCONSISTENT,     {DAMAGED,     {}} },
2421 	    { PG_STATE_INCOMPLETE,       {UNAVAILABLE, {}} },
2422 	    { PG_STATE_SNAPTRIM_ERROR,   {DAMAGED,     {}} },
2423 	    { PG_STATE_RECOVERY_UNFOUND, {DAMAGED,     {}} },
2424 	    { PG_STATE_BACKFILL_UNFOUND, {DAMAGED,     {}} },
2425 	    { PG_STATE_BACKFILL_TOOFULL, {BACKFILL_FULL, {}} },
2426 	    { PG_STATE_RECOVERY_TOOFULL, {RECOVERY_FULL, {}} },
2427 	    { PG_STATE_DEGRADED,         {DEGRADED,    {}} },
2428 	    { PG_STATE_DOWN,             {UNAVAILABLE, {}} },
2429 	    // Delayed (wait until stuck) reports
2430 	    { PG_STATE_PEERING,          {UNAVAILABLE, [](const pg_stat_t &p){return p.last_peered;}    } },
2431 	    { PG_STATE_UNDERSIZED,       {DEGRADED,    [](const pg_stat_t &p){return p.last_fullsized;} } },
2432 	    { PG_STATE_STALE,            {UNAVAILABLE, [](const pg_stat_t &p){return p.last_unstale;}   } },
2433 	    // Delayed and inverted reports
2434 	    { PG_STATE_ACTIVE,           {UNAVAILABLE, [](const pg_stat_t &p){return p.last_active;}, true} }
2435 	  };
2436 	
2437 	  // Specialized state printer that takes account of inversion of
2438 	  // ACTIVE, CLEAN checks.
2439 	  auto state_name = [](const uint64_t &state) {
2440 	    // Special cases for the states that are inverted checks
2441 	    if (state == PG_STATE_CLEAN) {
2442 	      return std::string("unclean");
2443 	    } else if (state == PG_STATE_ACTIVE) {
2444 	      return std::string("inactive");
2445 	    } else {
2446 	      return pg_state_string(state);
2447 	    }
2448 	  };
2449 	
2450 	  // Map of what is wrong to information about why, implicitly also stores
2451 	  // the list of what is wrong.
2452 	  std::map<pg_consequence_t, PgCauses> detected;
2453 	
2454 	  // Optimisation: trim down the number of checks to apply based on
2455 	  // the summary counters
2456 	  std::map<unsigned, PgStateResponse> possible_responses;
2457 	  for (const auto &i : num_pg_by_state) {
2458 	    for (const auto &j : state_to_response) {
2459 	      if (!j.second.invert) {
2460 	        // Check for normal tests by seeing if any pgs have the flag
2461 	        if (i.first & j.first) {
2462 	          possible_responses.insert(j);
2463 	        }
2464 	      }
2465 	    }
2466 	  }
2467 	
2468 	  for (const auto &j : state_to_response) {
2469 	    if (j.second.invert) {
2470 	      // Check for inverted tests by seeing if not-all pgs have the flag
2471 	      const auto &found = num_pg_by_state.find(j.first);
2472 	      if (found == num_pg_by_state.end() || found->second != num_pg) {
2473 	        possible_responses.insert(j);
2474 	      }
2475 	    }
2476 	  }
2477 	
2478 	  utime_t cutoff = now - utime_t(cct->_conf.get_val<int64_t>("mon_pg_stuck_threshold"), 0);
2479 	  // Loop over all PGs, if there are any possibly-unhealthy states in there
2480 	  if (!possible_responses.empty()) {
2481 	    for (const auto& i : pg_stat) {
2482 	      const auto &pg_id = i.first;
2483 	      const auto &pg_info = i.second;
2484 	
2485 	      for (const auto &j : state_to_response) {
2486 	        const auto &pg_response_state = j.first;
2487 	        const auto &pg_response = j.second;
2488 	
2489 	        // Apply the state test
2490 	        if (!(bool(pg_info.state & pg_response_state) != pg_response.invert)) {
2491 	          continue;
2492 	        }
2493 	
2494 	        // Apply stuckness test if needed
2495 	        if (pg_response.stuck_since) {
2496 	          // Delayed response, check for stuckness
2497 	          utime_t last_whatever = pg_response.stuck_since(pg_info);
2498 	          if (last_whatever >= cutoff) {
2499 	            // Not stuck enough, ignore.
2500 	            continue;
2501 	          } else {
2502 	
2503 	          }
2504 	        }
2505 	
2506 	        auto &causes = detected[pg_response.consequence];
2507 	        causes.states[pg_response_state]++;
2508 	        causes.pgs.insert(pg_id);
2509 	
2510 	        // Don't bother composing detail string if we have already recorded
2511 	        // too many
2512 	        if (causes.pg_messages.size() > max) {
2513 	          continue;
2514 	        }
2515 	
2516 	        std::ostringstream ss;
2517 	        if (pg_response.stuck_since) {
2518 	          utime_t since = pg_response.stuck_since(pg_info);
2519 	          ss << "pg " << pg_id << " is stuck " << state_name(pg_response_state);
2520 	          if (since == utime_t()) {
2521 	            ss << " since forever";
2522 	          } else {
2523 	            utime_t dur = now - since;
2524 	            ss << " for " << utimespan_str(dur);
2525 	          }
2526 	          ss << ", current state " << pg_state_string(pg_info.state)
2527 	             << ", last acting " << pg_info.acting;
2528 	        } else {
2529 	          ss << "pg " << pg_id << " is "
2530 	             << pg_state_string(pg_info.state);
2531 	          ss << ", acting " << pg_info.acting;
2532 	          if (pg_info.stats.sum.num_objects_unfound) {
2533 	            ss << ", " << pg_info.stats.sum.num_objects_unfound
2534 	               << " unfound";
2535 	          }
2536 	        }
2537 	
2538 	        if (pg_info.state & PG_STATE_INCOMPLETE) {
2539 	          const pg_pool_t *pi = osdmap.get_pg_pool(pg_id.pool());
2540 	          if (pi && pi->min_size > 1) {
2541 	            ss << " (reducing pool "
2542 	               << osdmap.get_pool_name(pg_id.pool())
2543 	               << " min_size from " << (int)pi->min_size
2544 	               << " may help; search ceph.com/docs for 'incomplete')";
2545 	          }
2546 	        }
2547 	
2548 	        causes.pg_messages[pg_id] = ss.str();
2549 	      }
2550 	    }
2551 	  } else {
2552 	    dout(10) << __func__ << " skipping loop over PGs: counters look OK" << dendl;
2553 	  }
2554 	
2555 	  for (const auto &i : detected) {
2556 	    std::string health_code;
2557 	    health_status_t sev;
2558 	    std::string summary;
2559 	    switch(i.first) {
2560 	      case UNAVAILABLE:
2561 	        health_code = "PG_AVAILABILITY";
2562 	        sev = HEALTH_WARN;
2563 	        summary = "Reduced data availability: ";
2564 	        break;
2565 	      case DEGRADED:
2566 	        health_code = "PG_DEGRADED";
2567 	        summary = "Degraded data redundancy: ";
2568 	        sev = HEALTH_WARN;
2569 	        break;
2570 	      case BACKFILL_FULL:
2571 	        health_code = "PG_BACKFILL_FULL";
2572 	        summary = "Low space hindering backfill (add storage if this doesn't resolve itself): ";
2573 	        sev = HEALTH_WARN;
2574 	        break;
2575 	      case DAMAGED:
2576 	        health_code = "PG_DAMAGED";
2577 	        summary = "Possible data damage: ";
2578 	        sev = HEALTH_ERR;
2579 	        break;
2580 	      case RECOVERY_FULL:
2581 	        health_code = "PG_RECOVERY_FULL";
2582 	        summary = "Full OSDs blocking recovery: ";
2583 	        sev = HEALTH_ERR;
2584 	        break;
2585 	      default:
2586 	        ceph_abort();
2587 	    }
2588 	
2589 	    if (i.first == DEGRADED) {
2590 	      if (pg_sum.stats.sum.num_objects_degraded &&
2591 	          pg_sum.stats.sum.num_object_copies > 0) {
2592 	        double pc = (double)pg_sum.stats.sum.num_objects_degraded /
2593 	          (double)pg_sum.stats.sum.num_object_copies * (double)100.0;
2594 	        char b[20];
2595 	        snprintf(b, sizeof(b), "%.3lf", pc);
2596 	        ostringstream ss;
2597 	        ss << pg_sum.stats.sum.num_objects_degraded
2598 	           << "/" << pg_sum.stats.sum.num_object_copies << " objects degraded ("
2599 	           << b << "%)";
2600 	
2601 	        // Throw in a comma for the benefit of the following PG counts
2602 	        summary += ss.str() + ", ";
2603 	      }
2604 	    }
2605 	
2606 	    // Compose summary message saying how many PGs in what states led
2607 	    // to this health check failing
2608 	    std::vector<std::string> pg_msgs;
2609 	    int64_t count = 0;
2610 	    for (const auto &j : i.second.states) {
2611 	      std::ostringstream msg;
2612 	      msg << j.second << (j.second > 1 ? " pgs " : " pg ") << state_name(j.first);
2613 	      pg_msgs.push_back(msg.str());
2614 	      count += j.second;
2615 	    }
2616 	    summary += joinify(pg_msgs.begin(), pg_msgs.end(), std::string(", "));
2617 	
2618 	    health_check_t *check = &checks->add(
2619 	        health_code,
2620 	        sev,
2621 	        summary,
2622 		count);
2623 	
2624 	    // Compose list of PGs contributing to this health check failing
2625 	    for (const auto &j : i.second.pg_messages) {
2626 	      check->detail.push_back(j.second);
2627 	    }
2628 	  }
2629 	
2630 	  // OSD_SCRUB_ERRORS
2631 	  if (pg_sum.stats.sum.num_scrub_errors) {
2632 	    ostringstream ss;
2633 	    ss << pg_sum.stats.sum.num_scrub_errors << " scrub errors";
2634 	    checks->add("OSD_SCRUB_ERRORS", HEALTH_ERR, ss.str(),
2635 			pg_sum.stats.sum.num_scrub_errors);
2636 	  }
2637 	
2638 	  // LARGE_OMAP_OBJECTS
2639 	  if (pg_sum.stats.sum.num_large_omap_objects) {
2640 	    list<string> detail;
2641 	    for (auto &pool : pools) {
2642 	      const string& pool_name = osdmap.get_pool_name(pool.first);
2643 	      auto it2 = pg_pool_sum.find(pool.first);
2644 	      if (it2 == pg_pool_sum.end()) {
2645 	        continue;
2646 	      }
2647 	      const pool_stat_t *pstat = &it2->second;
2648 	      if (pstat == nullptr) {
2649 	        continue;
2650 	      }
2651 	      const object_stat_sum_t& sum = pstat->stats.sum;
2652 	      if (sum.num_large_omap_objects) {
2653 	        stringstream ss;
2654 	        ss << sum.num_large_omap_objects << " large objects found in pool "
2655 	           << "'" << pool_name << "'";
2656 	        detail.push_back(ss.str());
2657 	      }
2658 	    }
2659 	    if (!detail.empty()) {
2660 	      ostringstream ss;
2661 	      ss << pg_sum.stats.sum.num_large_omap_objects << " large omap objects";
2662 	      auto& d = checks->add("LARGE_OMAP_OBJECTS", HEALTH_WARN, ss.str(),
2663 				    pg_sum.stats.sum.num_large_omap_objects);
2664 	      stringstream tip;
2665 	      tip << "Search the cluster log for 'Large omap object found' for more "
2666 	          << "details.";
2667 	      detail.push_back(tip.str());
2668 	      d.detail.swap(detail);
2669 	    }
2670 	  }
2671 	
2672 	  // CACHE_POOL_NEAR_FULL
2673 	  {
2674 	    list<string> detail;
2675 	    unsigned num_pools = 0;
2676 	    for (auto& p : pools) {
2677 	      if ((!p.second.target_max_objects && !p.second.target_max_bytes) ||
2678 		  !pg_pool_sum.count(p.first)) {
2679 		continue;
2680 	      }
2681 	      bool nearfull = false;
2682 	      const string& name = osdmap.get_pool_name(p.first);
2683 	      const pool_stat_t& st = get_pg_pool_sum_stat(p.first);
2684 	      uint64_t ratio = p.second.cache_target_full_ratio_micro +
2685 		((1000000 - p.second.cache_target_full_ratio_micro) *
2686 		 cct->_conf->mon_cache_target_full_warn_ratio);
2687 	      if (p.second.target_max_objects &&
2688 		  (uint64_t)(st.stats.sum.num_objects -
2689 			     st.stats.sum.num_objects_hit_set_archive) >
2690 		  p.second.target_max_objects * (ratio / 1000000.0)) {
2691 		ostringstream ss;
2692 		ss << "cache pool '" << name << "' with "
2693 		   << si_u_t(st.stats.sum.num_objects)
2694 		   << " objects at/near target max "
2695 		   << si_u_t(p.second.target_max_objects) << " objects";
2696 		detail.push_back(ss.str());
2697 		nearfull = true;
2698 	      }
2699 	      if (p.second.target_max_bytes &&
2700 		  (uint64_t)(st.stats.sum.num_bytes -
2701 			     st.stats.sum.num_bytes_hit_set_archive) >
2702 		  p.second.target_max_bytes * (ratio / 1000000.0)) {
2703 		ostringstream ss;
2704 		ss << "cache pool '" << name
2705 		   << "' with " << byte_u_t(st.stats.sum.num_bytes)
2706 		   << " at/near target max "
2707 		   << byte_u_t(p.second.target_max_bytes);
2708 		detail.push_back(ss.str());
2709 		nearfull = true;
2710 	      }
2711 	      if (nearfull) {
2712 		++num_pools;
2713 	      }
2714 	    }
2715 	    if (!detail.empty()) {
2716 	      ostringstream ss;
2717 	      ss << num_pools << " cache pools at or near target size";
2718 	      auto& d = checks->add("CACHE_POOL_NEAR_FULL", HEALTH_WARN, ss.str(),
2719 				    num_pools);
2720 	      d.detail.swap(detail);
2721 	    }
2722 	  }
2723 	
2724 	  // TOO_FEW_PGS
2725 	  unsigned num_in = osdmap.get_num_in_osds();
2726 	  auto sum_pg_up = std::max(static_cast<size_t>(pg_sum.up), pg_stat.size());
2727 	  const auto min_pg_per_osd =
2728 	    cct->_conf.get_val<uint64_t>("mon_pg_warn_min_per_osd");
2729 	  if (num_in && min_pg_per_osd > 0 && osdmap.get_pools().size() > 0) {
2730 	    auto per = sum_pg_up / num_in;
2731 	    if (per < min_pg_per_osd && per) {
2732 	      ostringstream ss;
2733 	      ss << "too few PGs per OSD (" << per
2734 		 << " < min " << min_pg_per_osd << ")";
2735 	      checks->add("TOO_FEW_PGS", HEALTH_WARN, ss.str(),
2736 			  min_pg_per_osd - per);
2737 	    }
2738 	  }
2739 	
2740 	  // TOO_MANY_PGS
2741 	  auto max_pg_per_osd = cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd");
2742 	  if (num_in && max_pg_per_osd > 0) {
2743 	    auto per = sum_pg_up / num_in;
2744 	    if (per > max_pg_per_osd) {
2745 	      ostringstream ss;
2746 	      ss << "too many PGs per OSD (" << per
2747 		 << " > max " << max_pg_per_osd << ")";
2748 	      checks->add("TOO_MANY_PGS", HEALTH_WARN, ss.str(),
2749 			  per - max_pg_per_osd);
2750 	    }
2751 	  }
2752 	
2753 	  // TOO_FEW_OSDS
2754 	  auto warn_too_few_osds = cct->_conf.get_val<bool>("mon_warn_on_too_few_osds");
2755 	  auto osd_pool_default_size = cct->_conf.get_val<uint64_t>("osd_pool_default_size");
2756 	  if (warn_too_few_osds && osdmap.get_num_osds() < osd_pool_default_size) {
2757 	    ostringstream ss;
2758 	    ss << "OSD count " << osdmap.get_num_osds()
2759 		 << " < osd_pool_default_size " << osd_pool_default_size;
2760 	    checks->add("TOO_FEW_OSDS", HEALTH_WARN, ss.str(),
2761 			osd_pool_default_size - osdmap.get_num_osds());
2762 	  }
2763 	
2764 	  // SLOW_PING_TIME
2765 	  // Convert milliseconds to microseconds
2766 	  auto warn_slow_ping_time = cct->_conf.get_val<double>("mon_warn_on_slow_ping_time") * 1000;
2767 	  auto grace = cct->_conf.get_val<int64_t>("osd_heartbeat_grace");
2768 	  if (warn_slow_ping_time == 0) {
2769 	    double ratio = cct->_conf.get_val<double>("mon_warn_on_slow_ping_ratio");
2770 	    warn_slow_ping_time = grace;
2771 	    warn_slow_ping_time *= 1000000 * ratio; // Seconds of grace to microseconds at ratio
2772 	  }
2773 	  if (warn_slow_ping_time > 0) {
2774 	
2775 	    struct mon_ping_item_t {
2776 	      uint32_t pingtime;
2777 	      int from;
2778 	      int to;
2779 	      bool improving;
2780 	
2781 	      bool operator<(const mon_ping_item_t& rhs) const {
2782 	        if (pingtime < rhs.pingtime)
2783 	          return true;
2784 	        if (pingtime > rhs.pingtime)
2785 	          return false;
2786 	        if (from < rhs.from)
2787 	          return true;
2788 	        if (from > rhs.from)
2789 	          return false;
2790 	        return to < rhs.to;
2791 	      }
2792 	    };
2793 	
2794 	    list<string> detail_back;
2795 	    list<string> detail_front;
2796 	    set<mon_ping_item_t> back_sorted, front_sorted;
2797 	    for (auto i : osd_stat) {
2798 	      for (auto j : i.second.hb_pingtime) {
2799 	
2800 		// Maybe source info is old
2801 		if (now.sec() - j.second.last_update > grace * 60)
2802 		  continue;
2803 	
2804 		mon_ping_item_t back;
2805 		back.pingtime = std::max(j.second.back_pingtime[0], j.second.back_pingtime[1]);
2806 		back.pingtime = std::max(back.pingtime, j.second.back_pingtime[2]);
2807 		back.from = i.first;
2808 		back.to = j.first;
2809 		if (back.pingtime > warn_slow_ping_time) {
2810 		  back.improving = (j.second.back_pingtime[0] < j.second.back_pingtime[1]
(1) Event original: "j.second.back_pingtime" looks like the original copy.
Also see events: [copy_paste_error][remediation]
2811 				    && j.second.back_pingtime[1] < j.second.back_pingtime[2]);
2812 		  back_sorted.emplace(back);
2813 		}
2814 	
2815 		mon_ping_item_t front;
2816 		front.pingtime = std::max(j.second.front_pingtime[0], j.second.front_pingtime[1]);
2817 		front.pingtime = std::max(front.pingtime, j.second.front_pingtime[2]);
2818 		front.from = i.first;
2819 		front.to = j.first;
2820 		if (front.pingtime > warn_slow_ping_time) {
2821 		  front.improving = (j.second.front_pingtime[0] < j.second.front_pingtime[1]
(2) Event copy_paste_error: "back_pingtime" in "j.second.back_pingtime" looks like a copy-paste error.
(3) Event remediation: Should it say "front_pingtime" instead?
Also see events: [original]
2822 				     && j.second.front_pingtime[1] < j.second.back_pingtime[2]);
2823 		  front_sorted.emplace(front);
2824 		}
2825 	      }
2826 	    }
2827 	    int max_detail = 10;
2828 	    for (auto &sback : boost::adaptors::reverse(back_sorted)) {
2829 	      ostringstream ss;
2830 	      if (max_detail == 0) {
2831 		ss << "Truncated long network list.  Use ceph daemon mgr.# dump_osd_network for more information";
2832 	        detail_back.push_back(ss.str());
2833 	        break;
2834 	      }
2835 	      max_detail--;
2836 	      ss << "Slow heartbeat ping on back interface from osd." << sback.from
2837 	         << (osdmap.is_down(sback.from) ? " (down)" : "")
2838 		 << " to osd." << sback.to
2839 	         << (osdmap.is_down(sback.to) ? " (down)" : "")
2840 		 << " " << fixed_u_to_string(sback.pingtime, 3) << " msec"
2841 		 << (sback.improving ? " possibly improving" : "");
2842 	      detail_back.push_back(ss.str());
2843 	    }
2844 	    max_detail = 10;
2845 	    for (auto &sfront : boost::adaptors::reverse(front_sorted)) {
2846 	      ostringstream ss;
2847 	      if (max_detail == 0) {
2848 		ss << "Truncated long network list.  Use ceph daemon mgr.# dump_osd_network for more information";
2849 	        detail_front.push_back(ss.str());
2850 	        break;
2851 	      }
2852 	      max_detail--;
2853 	      ss << "Slow heartbeat ping on front interface from osd." << sfront.from
2854 	         << (osdmap.is_down(sfront.from) ? " (down)" : "")
2855 	         << " to osd." << sfront.to
2856 	         << (osdmap.is_down(sfront.to) ? " (down)" : "")
2857 		 << " " << fixed_u_to_string(sfront.pingtime, 3) << " msec"
2858 		 << (sfront.improving ? " possibly improving" : "");
2859 	      detail_front.push_back(ss.str());
2860 	    }
2861 	    if (detail_back.size() != 0) {
2862 	      ostringstream ss;
2863 	      ss << "Long heartbeat ping times on back interface seen, longest is "
2864 		 << fixed_u_to_string(back_sorted.rbegin()->pingtime, 3) << " msec";
2865 	      auto& d = checks->add("OSD_SLOW_PING_TIME_BACK", HEALTH_WARN, ss.str(),
2866 			      back_sorted.size());
2867 	      d.detail.swap(detail_back);
2868 	    }
2869 	    if (detail_front.size() != 0) {
2870 	      ostringstream ss;
2871 	      ss << "Long heartbeat ping times on front interface seen, longest is "
2872 		 << fixed_u_to_string(front_sorted.rbegin()->pingtime, 3) << " msec";
2873 	      auto& d = checks->add("OSD_SLOW_PING_TIME_FRONT", HEALTH_WARN, ss.str(),
2874 			      front_sorted.size());
2875 	      d.detail.swap(detail_front);
2876 	    }
2877 	  }
2878 	
2879 	  // SMALLER_PGP_NUM
2880 	  // MANY_OBJECTS_PER_PG
2881 	  if (!pg_stat.empty()) {
2882 	    list<string> pgp_detail, many_detail;
2883 	    const auto mon_pg_warn_min_objects =
2884 	      cct->_conf.get_val<int64_t>("mon_pg_warn_min_objects");
2885 	    const auto mon_pg_warn_min_pool_objects =
2886 	      cct->_conf.get_val<int64_t>("mon_pg_warn_min_pool_objects");
2887 	    const auto mon_pg_warn_max_object_skew =
2888 	      cct->_conf.get_val<double>("mon_pg_warn_max_object_skew");
2889 	    for (auto p = pg_pool_sum.begin();
2890 	         p != pg_pool_sum.end();
2891 	         ++p) {
2892 	      const pg_pool_t *pi = osdmap.get_pg_pool(p->first);
2893 	      if (!pi)
2894 		continue;   // in case osdmap changes haven't propagated to PGMap yet
2895 	      const string& name = osdmap.get_pool_name(p->first);
2896 	      // NOTE: we use pg_num_target and pgp_num_target for the purposes of
2897 	      // the warnings.  If the cluster is failing to converge on the target
2898 	      // values that is a separate issue!
2899 	      if (pi->get_pg_num_target() > pi->get_pgp_num_target() &&
2900 		  !(name.find(".DELETED") != string::npos &&
2901 		    cct->_conf->mon_fake_pool_delete)) {
2902 		ostringstream ss;
2903 		ss << "pool " << name << " pg_num "
2904 		   << pi->get_pg_num_target()
2905 		   << " > pgp_num " << pi->get_pgp_num_target();
2906 		pgp_detail.push_back(ss.str());
2907 	      }
2908 	      int average_objects_per_pg = pg_sum.stats.sum.num_objects / pg_stat.size();
2909 	      if (average_objects_per_pg > 0 &&
2910 	          pg_sum.stats.sum.num_objects >= mon_pg_warn_min_objects &&
2911 	          p->second.stats.sum.num_objects >= mon_pg_warn_min_pool_objects) {
2912 		int objects_per_pg = p->second.stats.sum.num_objects /
2913 		  pi->get_pg_num_target();
2914 		float ratio = (float)objects_per_pg / (float)average_objects_per_pg;
2915 		if (mon_pg_warn_max_object_skew > 0 &&
2916 		    ratio > mon_pg_warn_max_object_skew) {
2917 		  ostringstream ss;
2918 		  ss << "pool " << name << " objects per pg ("
2919 		     << objects_per_pg << ") is more than " << ratio
2920 		     << " times cluster average ("
2921 		     << average_objects_per_pg << ")";
2922 		  many_detail.push_back(ss.str());
2923 		}
2924 	      }
2925 	    }
2926 	    if (!pgp_detail.empty()) {
2927 	      ostringstream ss;
2928 	      ss << pgp_detail.size() << " pools have pg_num > pgp_num";
2929 	      auto& d = checks->add("SMALLER_PGP_NUM", HEALTH_WARN, ss.str(),
2930 				    pgp_detail.size());
2931 	      d.detail.swap(pgp_detail);
2932 	    }
2933 	    if (!many_detail.empty()) {
2934 	      ostringstream ss;
2935 	      ss << many_detail.size() << " pools have many more objects per pg than"
2936 		 << " average";
2937 	      auto& d = checks->add("MANY_OBJECTS_PER_PG", HEALTH_WARN, ss.str(),
2938 				    many_detail.size());
2939 	      d.detail.swap(many_detail);
2940 	    }
2941 	  }
2942 	
2943 	  // POOL_FULL
2944 	  // POOL_NEAR_FULL
2945 	  {
2946 	    float warn_threshold = (float)g_conf().get_val<int64_t>("mon_pool_quota_warn_threshold")/100;
2947 	    float crit_threshold = (float)g_conf().get_val<int64_t>("mon_pool_quota_crit_threshold")/100;
2948 	    list<string> full_detail, nearfull_detail;
2949 	    unsigned full_pools = 0, nearfull_pools = 0;
2950 	    for (auto it : pools) {
2951 	      auto it2 = pg_pool_sum.find(it.first);
2952 	      if (it2 == pg_pool_sum.end()) {
2953 		continue;
2954 	      }
2955 	      const pool_stat_t *pstat = &it2->second;
2956 	      const object_stat_sum_t& sum = pstat->stats.sum;
2957 	      const string& pool_name = osdmap.get_pool_name(it.first);
2958 	      const pg_pool_t &pool = it.second;
2959 	      bool full = false, nearfull = false;
2960 	      if (pool.quota_max_objects > 0) {
2961 		stringstream ss;
2962 		if ((uint64_t)sum.num_objects >= pool.quota_max_objects) {
2963 		} else if (crit_threshold > 0 &&
2964 			   sum.num_objects >= pool.quota_max_objects*crit_threshold) {
2965 		  ss << "pool '" << pool_name
2966 		     << "' has " << sum.num_objects << " objects"
2967 		     << " (max " << pool.quota_max_objects << ")";
2968 		  full_detail.push_back(ss.str());
2969 		  full = true;
2970 		} else if (warn_threshold > 0 &&
2971 			   sum.num_objects >= pool.quota_max_objects*warn_threshold) {
2972 		  ss << "pool '" << pool_name
2973 		     << "' has " << sum.num_objects << " objects"
2974 		     << " (max " << pool.quota_max_objects << ")";
2975 		  nearfull_detail.push_back(ss.str());
2976 		  nearfull = true;
2977 		}
2978 	      }
2979 	      if (pool.quota_max_bytes > 0) {
2980 		stringstream ss;
2981 		if ((uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
2982 		} else if (crit_threshold > 0 &&
2983 			   sum.num_bytes >= pool.quota_max_bytes*crit_threshold) {
2984 		  ss << "pool '" << pool_name
2985 		     << "' has " << byte_u_t(sum.num_bytes)
2986 		     << " (max " << byte_u_t(pool.quota_max_bytes) << ")";
2987 		  full_detail.push_back(ss.str());
2988 		  full = true;
2989 		} else if (warn_threshold > 0 &&
2990 			   sum.num_bytes >= pool.quota_max_bytes*warn_threshold) {
2991 		  ss << "pool '" << pool_name
2992 		     << "' has " << byte_u_t(sum.num_bytes)
2993 		     << " (max " << byte_u_t(pool.quota_max_bytes) << ")";
2994 		  nearfull_detail.push_back(ss.str());
2995 		  nearfull = true;
2996 		}
2997 	      }
2998 	      if (full) {
2999 		++full_pools;
3000 	      }
3001 	      if (nearfull) {
3002 		++nearfull_pools;
3003 	      }
3004 	    }
3005 	    if (full_pools) {
3006 	      ostringstream ss;
3007 	      ss << full_pools << " pools full";
3008 	      auto& d = checks->add("POOL_FULL", HEALTH_ERR, ss.str(), full_pools);
3009 	      d.detail.swap(full_detail);
3010 	    }
3011 	    if (nearfull_pools) {
3012 	      ostringstream ss;
3013 	      ss << nearfull_pools << " pools nearfull";
3014 	      auto& d = checks->add("POOL_NEAR_FULL", HEALTH_WARN, ss.str(), nearfull_pools);
3015 	      d.detail.swap(nearfull_detail);
3016 	    }
3017 	  }
3018 	
3019 	  // OBJECT_MISPLACED
3020 	  if (pg_sum.stats.sum.num_objects_misplaced &&
3021 	      pg_sum.stats.sum.num_object_copies > 0 &&
3022 	      cct->_conf->mon_warn_on_misplaced) {
3023 	    double pc = (double)pg_sum.stats.sum.num_objects_misplaced /
3024 	      (double)pg_sum.stats.sum.num_object_copies * (double)100.0;
3025 	    char b[20];
3026 	    snprintf(b, sizeof(b), "%.3lf", pc);
3027 	    ostringstream ss;
3028 	    ss << pg_sum.stats.sum.num_objects_misplaced
3029 	       << "/" << pg_sum.stats.sum.num_object_copies << " objects misplaced ("
3030 	       << b << "%)";
3031 	    checks->add("OBJECT_MISPLACED", HEALTH_WARN, ss.str(),
3032 			pg_sum.stats.sum.num_objects_misplaced);
3033 	  }
3034 	
3035 	  // OBJECT_UNFOUND
3036 	  if (pg_sum.stats.sum.num_objects_unfound &&
3037 	      pg_sum.stats.sum.num_objects) {
3038 	    double pc = (double)pg_sum.stats.sum.num_objects_unfound /
3039 	      (double)pg_sum.stats.sum.num_objects * (double)100.0;
3040 	    char b[20];
3041 	    snprintf(b, sizeof(b), "%.3lf", pc);
3042 	    ostringstream ss;
3043 	    ss << pg_sum.stats.sum.num_objects_unfound
3044 	       << "/" << pg_sum.stats.sum.num_objects << " objects unfound (" << b << "%)";
3045 	    auto& d = checks->add("OBJECT_UNFOUND", HEALTH_WARN, ss.str(),
3046 				  pg_sum.stats.sum.num_objects_unfound);
3047 	
3048 	    for (auto& p : pg_stat) {
3049 	      if (p.second.stats.sum.num_objects_unfound) {
3050 		ostringstream ss;
3051 		ss << "pg " << p.first
3052 		   << " has " << p.second.stats.sum.num_objects_unfound
3053 		   << " unfound objects";
3054 		d.detail.push_back(ss.str());
3055 		if (d.detail.size() > max) {
3056 		  d.detail.push_back("(additional pgs left out for brevity)");
3057 		  break;
3058 		}
3059 	      }
3060 	    }
3061 	  }
3062 	
3063 	  // REQUEST_SLOW
3064 	  // REQUEST_STUCK
3065 	  // SLOW_OPS unifies them in mimic.
3066 	  if (osdmap.require_osd_release < ceph_release_t::mimic &&
3067 	      cct->_conf->mon_osd_warn_op_age > 0 &&
3068 	      !osd_sum.op_queue_age_hist.h.empty() &&
3069 	      osd_sum.op_queue_age_hist.upper_bound() / 1000.0 >
3070 	      cct->_conf->mon_osd_warn_op_age) {
3071 	    list<string> warn_detail, error_detail;
3072 	    unsigned warn = 0, error = 0;
3073 	    float err_age =
3074 	      cct->_conf->mon_osd_warn_op_age * cct->_conf->mon_osd_err_op_age_ratio;
3075 	    const pow2_hist_t& h = osd_sum.op_queue_age_hist;
3076 	    for (unsigned i = h.h.size() - 1; i > 0; --i) {
3077 	      float ub = (float)(1 << i) / 1000.0;
3078 	      if (ub < cct->_conf->mon_osd_warn_op_age)
3079 		break;
3080 	      if (h.h[i]) {
3081 		ostringstream ss;
3082 		ss << h.h[i] << " ops are blocked > " << ub << " sec";
3083 		if (ub > err_age) {
3084 		  error += h.h[i];
3085 		  error_detail.push_back(ss.str());
3086 		} else {
3087 		  warn += h.h[i];
3088 		  warn_detail.push_back(ss.str());
3089 		}
3090 	      }
3091 	    }
3092 	
3093 	    map<float,set<int>> warn_osd_by_max; // max -> osds
3094 	    map<float,set<int>> error_osd_by_max; // max -> osds
3095 	    if (!warn_detail.empty() || !error_detail.empty()) {
3096 	      for (auto& p : osd_stat) {
3097 		const pow2_hist_t& h = p.second.op_queue_age_hist;
3098 		for (unsigned i = h.h.size() - 1; i > 0; --i) {
3099 		  float ub = (float)(1 << i) / 1000.0;
3100 		  if (ub < cct->_conf->mon_osd_warn_op_age)
3101 		    break;
3102 		  if (h.h[i]) {
3103 		    if (ub > err_age) {
3104 		      error_osd_by_max[ub].insert(p.first);
3105 		    } else {
3106 		      warn_osd_by_max[ub].insert(p.first);
3107 		    }
3108 		    break;
3109 		  }
3110 		}
3111 	      }
3112 	    }
3113 	
3114 	    if (!warn_detail.empty()) {
3115 	      ostringstream ss;
3116 	      ss << warn << " slow requests are blocked > "
3117 		 << cct->_conf->mon_osd_warn_op_age << " sec";
3118 	      auto& d = checks->add("REQUEST_SLOW", HEALTH_WARN, ss.str(), warn);
3119 	      d.detail.swap(warn_detail);
3120 	      int left = max;
3121 	      for (auto& p : warn_osd_by_max) {
3122 		ostringstream ss;
3123 		if (p.second.size() > 1) {
3124 		  ss << "osds " << p.second
3125 	             << " have blocked requests > " << p.first << " sec";
3126 		} else {
3127 		  ss << "osd." << *p.second.begin()
3128 	             << " has blocked requests > " << p.first << " sec";
3129 		}
3130 		d.detail.push_back(ss.str());
3131 		if (--left == 0) {
3132 		  break;
3133 		}
3134 	      }
3135 	    }
3136 	    if (!error_detail.empty()) {
3137 	      ostringstream ss;
3138 	      ss << error << " stuck requests are blocked > "
3139 		 << err_age << " sec";
3140 	      auto& d = checks->add("REQUEST_STUCK", HEALTH_ERR, ss.str(), error);
3141 	      d.detail.swap(error_detail);
3142 	      int left = max;
3143 	      for (auto& p : error_osd_by_max) {
3144 		ostringstream ss;
3145 		if (p.second.size() > 1) {
3146 		  ss << "osds " << p.second
3147 	             << " have stuck requests > " << p.first << " sec";
3148 		} else {
3149 		  ss << "osd." << *p.second.begin()
3150 	             << " has stuck requests > " << p.first << " sec";
3151 		}
3152 		d.detail.push_back(ss.str());
3153 		if (--left == 0) {
3154 		  break;
3155 		}
3156 	      }
3157 	    }
3158 	  }
3159 	
3160 	  // OBJECT_STORE_WARN
3161 	  if (osd_sum.os_alerts.size()) {
3162 	    map<string, pair<size_t, list<string>>> os_alerts_sum;
3163 	
3164 	    for (auto& a : osd_sum.os_alerts) {
3165 	      int left = max;
3166 	      string s0 = " osd.";
3167 	      s0 += stringify(a.first);
3168 	      for (auto& aa : a.second) {
3169 	        string s(s0);
3170 	        s += " ";
3171 	        s += aa.second;
3172 	        auto it = os_alerts_sum.find(aa.first);
3173 	        if (it == os_alerts_sum.end()) {
3174 	          list<string> d;
3175 	          d.emplace_back(s);
3176 	          os_alerts_sum.emplace(aa.first, std::make_pair(1, d));
3177 	        } else {
3178 	          auto& p = it->second;
3179 	          ++p.first;
3180 	          p.second.emplace_back(s);
3181 	        }
3182 		if (--left == 0) {
3183 		  break;
3184 		}
3185 	      }
3186 	    }
3187 	
3188 	    for (auto& asum : os_alerts_sum) {
3189 	      string summary = stringify(asum.second.first) + " OSD(s)";
3190 	      if (asum.first == "BLUEFS_SPILLOVER") {
3191 		summary += " experiencing BlueFS spillover";
3192 	      } else if (asum.first == "BLUESTORE_NO_COMPRESSION") {
3193 		summary += " have broken BlueStore compression";
3194 	      } else if (asum.first == "BLUESTORE_LEGACY_STATFS") {
3195 		summary += " reporting legacy (not per-pool) BlueStore stats";
3196 	      } else if (asum.first == "BLUESTORE_DISK_SIZE_MISMATCH") {
3197 		summary += " have dangerous mismatch between BlueStore block device and free list sizes";
3198 	      } else if (asum.first == "BLUESTORE_NO_PER_POOL_OMAP") {
3199 		summary += " reporting legacy (not per-pool) BlueStore omap usage stats";
3200 	      }
3201 	      auto& d = checks->add(asum.first, HEALTH_WARN, summary, asum.second.first);
3202 	      for (auto& s : asum.second.second) {
3203 	        d.detail.push_back(s);
3204 	      }
3205 	    }
3206 	  }
3207 	  // PG_NOT_SCRUBBED
3208 	  // PG_NOT_DEEP_SCRUBBED
3209 	  if (cct->_conf->mon_warn_pg_not_scrubbed_ratio ||
3210 	        cct->_conf->mon_warn_pg_not_deep_scrubbed_ratio) {
3211 	    list<string> detail, deep_detail;
3212 	    int detail_max = max, deep_detail_max = max;
3213 	    int detail_more = 0, deep_detail_more = 0;
3214 	    int detail_total = 0, deep_detail_total = 0;
3215 	    for (auto& p : pg_stat) {
3216 	      int64_t pnum =  p.first.pool();
3217 	      auto pool = osdmap.get_pg_pool(pnum);
3218 	      if (!pool)
3219 	        continue;
3220 	      if (cct->_conf->mon_warn_pg_not_scrubbed_ratio) {
3221 	        double scrub_max_interval = 0;
3222 	        pool->opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &scrub_max_interval);
3223 	        if (scrub_max_interval <= 0) {
3224 	          scrub_max_interval = cct->_conf->osd_scrub_max_interval;
3225 	        }
3226 	        const double age = (cct->_conf->mon_warn_pg_not_scrubbed_ratio * scrub_max_interval) +
3227 	          scrub_max_interval;
3228 	        utime_t cutoff = now;
3229 	        cutoff -= age;
3230 	        if (p.second.last_scrub_stamp < cutoff) {
3231 	          if (detail_max > 0) {
3232 	            ostringstream ss;
3233 	            ss << "pg " << p.first << " not scrubbed since "
3234 	               << p.second.last_scrub_stamp;
3235 	            detail.push_back(ss.str());
3236 	            --detail_max;
3237 	          } else {
3238 	            ++detail_more;
3239 	          }
3240 	          ++detail_total;
3241 	        }
3242 	      }
3243 	      if (cct->_conf->mon_warn_pg_not_deep_scrubbed_ratio) {
3244 	        double deep_scrub_interval = 0;
3245 	        pool->opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &deep_scrub_interval);
3246 	        if (deep_scrub_interval <= 0) {
3247 	          deep_scrub_interval = cct->_conf->osd_deep_scrub_interval;
3248 	        }
3249 	        double deep_age = (cct->_conf->mon_warn_pg_not_deep_scrubbed_ratio * deep_scrub_interval) +
3250 	          deep_scrub_interval;
3251 	        utime_t deep_cutoff = now;
3252 	        deep_cutoff -= deep_age;
3253 	        if (p.second.last_deep_scrub_stamp < deep_cutoff) {
3254 	          if (deep_detail_max > 0) {
3255 	            ostringstream ss;
3256 	            ss << "pg " << p.first << " not deep-scrubbed since "
3257 	               << p.second.last_deep_scrub_stamp;
3258 	            deep_detail.push_back(ss.str());
3259 	            --deep_detail_max;
3260 	          } else {
3261 	            ++deep_detail_more;
3262 	          }
3263 	          ++deep_detail_total;
3264 	        }
3265 	      }
3266 	    }
3267 	    if (detail_total) {
3268 	      ostringstream ss;
3269 	      ss << detail_total << " pgs not scrubbed in time";
3270 	      auto& d = checks->add("PG_NOT_SCRUBBED", HEALTH_WARN, ss.str(), detail_total);
3271 	
3272 	      if (!detail.empty()) {
3273 	        d.detail.swap(detail);
3274 	
3275 	        if (detail_more) {
3276 	          ostringstream ss;
3277 	          ss << detail_more << " more pgs... ";
3278 	          d.detail.push_back(ss.str());
3279 	        }
3280 	      }
3281 	    }
3282 	    if (deep_detail_total) {
3283 	      ostringstream ss;
3284 	      ss << deep_detail_total << " pgs not deep-scrubbed in time";
3285 	      auto& d = checks->add("PG_NOT_DEEP_SCRUBBED", HEALTH_WARN, ss.str(),
3286 				    deep_detail_total);
3287 	
3288 	      if (!deep_detail.empty()) {
3289 	        d.detail.swap(deep_detail);
3290 	
3291 	        if (deep_detail_more) {
3292 	          ostringstream ss;
3293 	          ss << deep_detail_more << " more pgs... ";
3294 	          d.detail.push_back(ss.str());
3295 	        }
3296 	      }
3297 	    }
3298 	  }
3299 	
3300 	  // POOL_APP
3301 	  if (g_conf().get_val<bool>("mon_warn_on_pool_no_app")) {
3302 	    list<string> detail;
3303 	    for (auto &it : pools) {
3304 	      const pg_pool_t &pool = it.second;
3305 	      const string& pool_name = osdmap.get_pool_name(it.first);
3306 	      auto it2 = pg_pool_sum.find(it.first);
3307 	      if (it2 == pg_pool_sum.end()) {
3308 	        continue;
3309 	      }
3310 	      const pool_stat_t *pstat = &it2->second;
3311 	      if (pstat == nullptr) {
3312 	        continue;
3313 	      }
3314 	      const object_stat_sum_t& sum = pstat->stats.sum;
3315 	      // application metadata is not encoded until luminous is minimum
3316 	      // required release
3317 	      if (sum.num_objects > 0 && pool.application_metadata.empty() &&
3318 	          !pool.is_tier()) {
3319 	        stringstream ss;
3320 	        ss << "application not enabled on pool '" << pool_name << "'";
3321 	        detail.push_back(ss.str());
3322 	      }
3323 	    }
3324 	    if (!detail.empty()) {
3325 	      ostringstream ss;
3326 	      ss << detail.size() << " pool(s) do not have an application enabled";
3327 	      auto& d = checks->add("POOL_APP_NOT_ENABLED", HEALTH_WARN, ss.str(),
3328 				    detail.size());
3329 	      stringstream tip;
3330 	      tip << "use 'ceph osd pool application enable <pool-name> "
3331 	          << "<app-name>', where <app-name> is 'cephfs', 'rbd', 'rgw', "
3332 	          << "or freeform for custom applications.";
3333 	      detail.push_back(tip.str());
3334 	      d.detail.swap(detail);
3335 	    }
3336 	  }
3337 	
3338 	  // PG_SLOW_SNAP_TRIMMING
3339 	  if (!pg_stat.empty() && cct->_conf->mon_osd_snap_trim_queue_warn_on > 0) {
3340 	    uint32_t snapthreshold = cct->_conf->mon_osd_snap_trim_queue_warn_on;
3341 	    uint64_t snaptrimq_exceeded = 0;
3342 	    uint32_t longest_queue = 0;
3343 	    const pg_t* longest_q_pg = nullptr;
3344 	    list<string> detail;
3345 	
3346 	    for (auto& i: pg_stat) {
3347 	      uint32_t current_len = i.second.snaptrimq_len;
3348 	      if (current_len >= snapthreshold) {
3349 	        snaptrimq_exceeded++;
3350 	        if (longest_queue <= current_len) {
3351 	          longest_q_pg = &i.first;
3352 	          longest_queue = current_len;
3353 	        }
3354 	        if (detail.size() < max - 1) {
3355 	          stringstream ss;
3356 	          ss << "snap trim queue for pg " << i.first << " at " << current_len;
3357 	          detail.push_back(ss.str());
3358 	          continue;
3359 	        }
3360 	        if (detail.size() < max) {
3361 	          detail.push_back("...more pgs affected");
3362 	          continue;
3363 	        }
3364 	      }
3365 	    }
3366 	
3367 	    if (snaptrimq_exceeded) {
3368 	      {
3369 	         ostringstream ss;
3370 	         ss << "longest queue on pg " << *longest_q_pg << " at " << longest_queue;
3371 	         detail.push_back(ss.str());
3372 	      }
3373 	
3374 	      stringstream ss;
3375 	      ss << "snap trim queue for " << snaptrimq_exceeded << " pg(s) >= " << snapthreshold << " (mon_osd_snap_trim_queue_warn_on)";
3376 	      auto& d = checks->add("PG_SLOW_SNAP_TRIMMING", HEALTH_WARN, ss.str(),
3377 				    snaptrimq_exceeded);
3378 	      detail.push_back("try decreasing \"osd snap trim sleep\" and/or increasing \"osd pg max concurrent snap trims\".");
3379 	      d.detail.swap(detail);
3380 	    }
3381 	  }
3382 	}
3383 	
3384 	int process_pg_map_command(
3385 	  const string& orig_prefix,
3386 	  const cmdmap_t& orig_cmdmap,
3387 	  const PGMap& pg_map,
3388 	  const OSDMap& osdmap,
3389 	  ceph::Formatter *f,
3390 	  stringstream *ss,
3391 	  bufferlist *odata)
3392 	{
3393 	  string prefix = orig_prefix;
3394 	  auto cmdmap = orig_cmdmap;
3395 	
3396 	  string omap_stats_note =
3397 	      "\n* NOTE: Omap statistics are gathered during deep scrub and "
3398 	      "may be inaccurate soon afterwards depending on utilisation. See "
3399 	      "http://docs.ceph.com/docs/master/dev/placement-group/#omap-statistics "
3400 	      "for further details.\n";
3401 	  bool omap_stats_note_required = false;
3402 	
3403 	  // perhaps these would be better in the parsing, but it's weird
3404 	  bool primary = false;
3405 	  if (prefix == "pg dump_json") {
3406 	    vector<string> v;
3407 	    v.push_back(string("all"));
3408 	    cmd_putval(g_ceph_context, cmdmap, "format", string("json"));
3409 	    cmd_putval(g_ceph_context, cmdmap, "dumpcontents", v);
3410 	    prefix = "pg dump";
3411 	  } else if (prefix == "pg dump_pools_json") {
3412 	    vector<string> v;
3413 	    v.push_back(string("pools"));
3414 	    cmd_putval(g_ceph_context, cmdmap, "format", string("json"));
3415 	    cmd_putval(g_ceph_context, cmdmap, "dumpcontents", v);
3416 	    prefix = "pg dump";
3417 	  } else if (prefix == "pg ls-by-primary") {
3418 	    primary = true;
3419 	    prefix = "pg ls";
3420 	  } else if (prefix == "pg ls-by-osd") {
3421 	    prefix = "pg ls";
3422 	  } else if (prefix == "pg ls-by-pool") {
3423 	    prefix = "pg ls";
3424 	    string poolstr;
3425 	    cmd_getval(g_ceph_context, cmdmap, "poolstr", poolstr);
3426 	    int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
3427 	    if (pool < 0) {
3428 	      *ss << "pool " << poolstr << " does not exist";
3429 	      return -ENOENT;
3430 	    }
3431 	    cmd_putval(g_ceph_context, cmdmap, "pool", pool);
3432 	  }
3433 	
3434 	  stringstream ds;
3435 	  if (prefix == "pg stat") {
3436 	    if (f) {
3437 	      f->open_object_section("pg_summary");
3438 	      pg_map.print_oneline_summary(f, NULL);
3439 	      f->close_section();
3440 	      f->flush(ds);
3441 	    } else {
3442 	      ds << pg_map;
3443 	    }
3444 	    odata->append(ds);
3445 	    return 0;
3446 	  }
3447 	
3448 	  if (prefix == "pg getmap") {
3449 	    pg_map.encode(*odata);
3450 	    *ss << "got pgmap version " << pg_map.version;
3451 	    return 0;
3452 	  }
3453 	
3454 	  if (prefix == "pg dump") {
3455 	    string val;
3456 	    vector<string> dumpcontents;
3457 	    set<string> what;
3458 	    if (cmd_getval(g_ceph_context, cmdmap, "dumpcontents", dumpcontents)) {
3459 	      copy(dumpcontents.begin(), dumpcontents.end(),
3460 	           inserter(what, what.end()));
3461 	    }
3462 	    if (what.empty())
3463 	      what.insert("all");
3464 	    if (f) {
3465 	      if (what.count("all")) {
3466 		f->open_object_section("pg_map");
3467 		pg_map.dump(f);
3468 		f->close_section();
3469 	      } else if (what.count("summary") || what.count("sum")) {
3470 		f->open_object_section("pg_map");
3471 		pg_map.dump_basic(f);
3472 		f->close_section();
3473 	      } else {
3474 		if (what.count("pools")) {
3475 		  pg_map.dump_pool_stats(f);
3476 		}
3477 		if (what.count("osds")) {
3478 		  pg_map.dump_osd_stats(f);
3479 		}
3480 		if (what.count("pgs")) {
3481 		  pg_map.dump_pg_stats(f, false);
3482 		}
3483 		if (what.count("pgs_brief")) {
3484 		  pg_map.dump_pg_stats(f, true);
3485 		}
3486 		if (what.count("delta")) {
3487 		  f->open_object_section("delta");
3488 		  pg_map.dump_delta(f);
3489 		  f->close_section();
3490 		}
3491 	      }
3492 	      f->flush(*odata);
3493 	    } else {
3494 	      if (what.count("all")) {
3495 		pg_map.dump(ds);
3496 	        omap_stats_note_required = true;
3497 	      } else if (what.count("summary") || what.count("sum")) {
3498 		pg_map.dump_basic(ds);
3499 		pg_map.dump_pg_sum_stats(ds, true);
3500 		pg_map.dump_osd_sum_stats(ds);
3501 	        omap_stats_note_required = true;
3502 	      } else {
3503 		if (what.count("pgs_brief")) {
3504 		  pg_map.dump_pg_stats(ds, true);
3505 		}
3506 		bool header = true;
3507 		if (what.count("pgs")) {
3508 		  pg_map.dump_pg_stats(ds, false);
3509 		  header = false;
3510 	          omap_stats_note_required = true;
3511 		}
3512 		if (what.count("pools")) {
3513 		  pg_map.dump_pool_stats(ds, header);
3514 	          omap_stats_note_required = true;
3515 		}
3516 		if (what.count("osds")) {
3517 		  pg_map.dump_osd_stats(ds);
3518 		}
3519 	      }
3520 	      odata->append(ds);
3521 	      if (omap_stats_note_required) {
3522 	        odata->append(omap_stats_note);
3523 	      }
3524 	    }
3525 	    *ss << "dumped " << what;
3526 	    return 0;
3527 	  }
3528 	
3529 	  if (prefix == "pg ls") {
3530 	    int64_t osd = -1;
3531 	    int64_t pool = -1;
3532 	    vector<string>states;
3533 	    set<pg_t> pgs;
3534 	    cmd_getval(g_ceph_context, cmdmap, "pool", pool);
3535 	    cmd_getval(g_ceph_context, cmdmap, "osd", osd);
3536 	    cmd_getval(g_ceph_context, cmdmap, "states", states);
3537 	    if (pool >= 0 && !osdmap.have_pg_pool(pool)) {
3538 	      *ss << "pool " << pool << " does not exist";
3539 	      return -ENOENT;
3540 	    }
3541 	    if (osd >= 0 && !osdmap.is_up(osd)) {
3542 	      *ss << "osd " << osd << " is not up";
3543 	      return -EAGAIN;
3544 	    }
3545 	    if (states.empty())
3546 	      states.push_back("all");
3547 	
3548 	    uint64_t state = 0;
3549 	
3550 	    while (!states.empty()) {
3551 	      string state_str = states.back();
3552 	
3553 	      if (state_str == "all") {
3554 	        state = -1;
3555 	        break;
3556 	      } else {
3557 	        auto filter = pg_string_state(state_str);
3558 	        if (!filter) {
3559 	          *ss << "'" << state_str << "' is not a valid pg state,"
3560 	              << " available choices: " << pg_state_string(0xFFFFFFFF);
3561 	          return -EINVAL;
3562 	        }
3563 	        state |= *filter;
3564 	      }
3565 	
3566 	      states.pop_back();
3567 	    }
3568 	
3569 	    pg_map.get_filtered_pg_stats(state, pool, osd, primary, pgs);
3570 	
3571 	    if (f && !pgs.empty()) {
3572 	      pg_map.dump_filtered_pg_stats(f, pgs);
3573 	      f->flush(*odata);
3574 	    } else if (!pgs.empty()) {
3575 	      pg_map.dump_filtered_pg_stats(ds, pgs);
3576 	      odata->append(ds);
3577 	      odata->append(omap_stats_note);
3578 	    }
3579 	    return 0;
3580 	  }
3581 	
3582 	  if (prefix == "pg dump_stuck") {
3583 	    vector<string> stuckop_vec;
3584 	    cmd_getval(g_ceph_context, cmdmap, "stuckops", stuckop_vec);
3585 	    if (stuckop_vec.empty())
3586 	      stuckop_vec.push_back("unclean");
3587 	    int64_t threshold;
3588 	    cmd_getval(g_ceph_context, cmdmap, "threshold", threshold,
3589 	               g_conf().get_val<int64_t>("mon_pg_stuck_threshold"));
3590 	
3591 	    if (pg_map.dump_stuck_pg_stats(ds, f, (int)threshold, stuckop_vec) < 0) {
3592 	      *ss << "failed";
3593 	    } else {
3594 	      *ss << "ok";
3595 	    }
3596 	    odata->append(ds);
3597 	    return 0;
3598 	  }
3599 	
3600 	  if (prefix == "pg debug") {
3601 	    string debugop;
3602 	    cmd_getval(g_ceph_context, cmdmap, "debugop", debugop,
3603 		       string("unfound_objects_exist"));
3604 	    if (debugop == "unfound_objects_exist") {
3605 	      bool unfound_objects_exist = false;
3606 	      for (const auto& p : pg_map.pg_stat) {
3607 		if (p.second.stats.sum.num_objects_unfound > 0) {
3608 		  unfound_objects_exist = true;
3609 		  break;
3610 		}
3611 	      }
3612 	      if (unfound_objects_exist)
3613 		ds << "TRUE";
3614 	      else
3615 		ds << "FALSE";
3616 	      odata->append(ds);
3617 	      return 0;
3618 	    }
3619 	    if (debugop == "degraded_pgs_exist") {
3620 	      bool degraded_pgs_exist = false;
3621 	      for (const auto& p : pg_map.pg_stat) {
3622 		if (p.second.stats.sum.num_objects_degraded > 0) {
3623 		  degraded_pgs_exist = true;
3624 		  break;
3625 		}
3626 	      }
3627 	      if (degraded_pgs_exist)
3628 		ds << "TRUE";
3629 	      else
3630 		ds << "FALSE";
3631 	      odata->append(ds);
3632 	      return 0;
3633 	    }
3634 	  }
3635 	
3636 	  if (prefix == "osd perf") {
3637 	    if (f) {
3638 	      f->open_object_section("osdstats");
3639 	      pg_map.dump_osd_perf_stats(f);
3640 	      f->close_section();
3641 	      f->flush(ds);
3642 	    } else {
3643 	      pg_map.print_osd_perf_stats(&ds);
3644 	    }
3645 	    odata->append(ds);
3646 	    return 0;
3647 	  }
3648 	
3649 	  if (prefix == "osd blocked-by") {
3650 	    if (f) {
3651 	      f->open_object_section("osd_blocked_by");
3652 	      pg_map.dump_osd_blocked_by_stats(f);
3653 	      f->close_section();
3654 	      f->flush(ds);
3655 	    } else {
3656 	      pg_map.print_osd_blocked_by_stats(&ds);
3657 	    }
3658 	    odata->append(ds);
3659 	    return 0;
3660 	  }
3661 	
3662 	  return -EOPNOTSUPP;
3663 	}
3664 	
3665 	void PGMapUpdater::check_osd_map(
3666 	  CephContext *cct,
3667 	  const OSDMap& osdmap,
3668 	  const PGMap& pgmap,
3669 	  PGMap::Incremental *pending_inc)
3670 	{
3671 	  for (auto& p : pgmap.osd_stat) {
3672 	    if (!osdmap.exists(p.first)) {
3673 	      // remove osd_stat
3674 	      pending_inc->rm_stat(p.first);
3675 	    } else if (osdmap.is_out(p.first)) {
3676 	      // zero osd_stat
3677 	      if (p.second.statfs.total != 0) {
3678 		pending_inc->stat_osd_out(p.first);
3679 	      }
3680 	    } else if (!osdmap.is_up(p.first)) {
3681 	      // zero the op_queue_age_hist
3682 	      if (!p.second.op_queue_age_hist.empty()) {
3683 		pending_inc->stat_osd_down_up(p.first, pgmap);
3684 	      }
3685 	    }
3686 	  }
3687 	
3688 	  // deleted pgs (pools)?
3689 	  for (auto& p : pgmap.pg_pool_sum) {
3690 	    if (!osdmap.have_pg_pool(p.first)) {
3691 	      ldout(cct, 10) << __func__ << " pool " << p.first << " gone, removing pgs"
3692 			     << dendl;
3693 	      for (auto& q : pgmap.pg_stat) {
3694 		if (q.first.pool() == p.first) {
3695 		  pending_inc->pg_remove.insert(q.first);
3696 		}
3697 	      }
3698 	      auto q = pending_inc->pg_stat_updates.begin();
3699 	      while (q != pending_inc->pg_stat_updates.end()) {
3700 		if (q->first.pool() == p.first) {
3701 		  q = pending_inc->pg_stat_updates.erase(q);
3702 		} else {
3703 		  ++q;
3704 		}
3705 	      }
3706 	    }
3707 	  }
3708 	
3709 	  // new (split or new pool) or merged pgs?
3710 	  map<int64_t,unsigned> new_pg_num;
3711 	  for (auto& p : osdmap.get_pools()) {
3712 	    int64_t poolid = p.first;
3713 	    const pg_pool_t& pi = p.second;
3714 	    auto q = pgmap.num_pg_by_pool.find(poolid);
3715 	    unsigned my_pg_num = 0;
3716 	    if (q != pgmap.num_pg_by_pool.end())
3717 	      my_pg_num = q->second;
3718 	    unsigned pg_num = pi.get_pg_num();
3719 	    new_pg_num[poolid] = pg_num;
3720 	    if (my_pg_num < pg_num) {
3721 	      ldout(cct,10) << __func__ << " pool " << poolid << " pg_num " << pg_num
3722 			    << " > my pg_num " << my_pg_num << dendl;
3723 	      for (unsigned ps = my_pg_num; ps < pg_num; ++ps) {
3724 		pg_t pgid(ps, poolid);
3725 		if (pending_inc->pg_stat_updates.count(pgid) == 0) {
3726 		  ldout(cct,20) << __func__ << " adding " << pgid << dendl;
3727 		  pg_stat_t &stats = pending_inc->pg_stat_updates[pgid];
3728 		  stats.last_fresh = osdmap.get_modified();
3729 		  stats.last_active = osdmap.get_modified();
3730 		  stats.last_change = osdmap.get_modified();
3731 		  stats.last_peered = osdmap.get_modified();
3732 		  stats.last_clean = osdmap.get_modified();
3733 		  stats.last_unstale = osdmap.get_modified();
3734 		  stats.last_undegraded = osdmap.get_modified();
3735 		  stats.last_fullsized = osdmap.get_modified();
3736 		  stats.last_scrub_stamp = osdmap.get_modified();
3737 		  stats.last_deep_scrub_stamp = osdmap.get_modified();
3738 		  stats.last_clean_scrub_stamp = osdmap.get_modified();
3739 		}
3740 	      }
3741 	    } else if (my_pg_num > pg_num) {
3742 	      ldout(cct,10) << __func__ << " pool " << poolid << " pg_num " << pg_num
3743 			    << " < my pg_num " << my_pg_num << dendl;
3744 	      for (unsigned i = pg_num; i < my_pg_num; ++i) {
3745 		pg_t pgid(i, poolid);
3746 		ldout(cct,20) << __func__ << " removing merged " << pgid << dendl;
3747 		if (pgmap.pg_stat.count(pgid)) {
3748 		  pending_inc->pg_remove.insert(pgid);
3749 		}
3750 		pending_inc->pg_stat_updates.erase(pgid);
3751 	      }
3752 	    }
3753 	  }
3754 	  auto i = pending_inc->pg_stat_updates.begin();
3755 	  while (i != pending_inc->pg_stat_updates.end()) {
3756 	    auto j = new_pg_num.find(i->first.pool());
3757 	    if (j == new_pg_num.end() ||
3758 		i->first.ps() >= j->second) {
3759 	      ldout(cct,20) << __func__ << " removing pending update to old "
3760 			    << i->first << dendl;
3761 	      i = pending_inc->pg_stat_updates.erase(i);
3762 	    } else {
3763 	      ++i;
3764 	    }
3765 	  }
3766 	}
3767 	
3768 	static void _try_mark_pg_stale(
3769 	  const OSDMap& osdmap,
3770 	  pg_t pgid,
3771 	  const pg_stat_t& cur,
3772 	  PGMap::Incremental *pending_inc)
3773 	{
3774 	  if ((cur.state & PG_STATE_STALE) == 0 &&
3775 	      cur.acting_primary != -1 &&
3776 	      osdmap.is_down(cur.acting_primary)) {
3777 	    pg_stat_t *newstat;
3778 	    auto q = pending_inc->pg_stat_updates.find(pgid);
3779 	    if (q != pending_inc->pg_stat_updates.end()) {
3780 	      if ((q->second.acting_primary == cur.acting_primary) ||
3781 		  ((q->second.state & PG_STATE_STALE) == 0 &&
3782 		   q->second.acting_primary != -1 &&
3783 		   osdmap.is_down(q->second.acting_primary))) {
3784 		newstat = &q->second;
3785 	      } else {
3786 		// pending update is no longer down or already stale
3787 		return;
3788 	      }
3789 	    } else {
3790 	      newstat = &pending_inc->pg_stat_updates[pgid];
3791 	      *newstat = cur;
3792 	    }
3793 	    dout(10) << __func__ << " marking pg " << pgid
3794 		     << " stale (acting_primary " << newstat->acting_primary
3795 		     << ")" << dendl;
3796 	    newstat->state |= PG_STATE_STALE;
3797 	    newstat->last_unstale = ceph_clock_now();
3798 	  }
3799 	}
3800 	
3801 	void PGMapUpdater::check_down_pgs(
3802 	    const OSDMap &osdmap,
3803 	    const PGMap &pg_map,
3804 	    bool check_all,
3805 	    const set<int>& need_check_down_pg_osds,
3806 	    PGMap::Incremental *pending_inc)
3807 	{
3808 	  // if a large number of osds changed state, just iterate over the whole
3809 	  // pg map.
3810 	  if (need_check_down_pg_osds.size() > (unsigned)osdmap.get_num_osds() *
3811 	      g_conf().get_val<double>("mon_pg_check_down_all_threshold")) {
3812 	    check_all = true;
3813 	  }
3814 	
3815 	  if (check_all) {
3816 	    for (const auto& p : pg_map.pg_stat) {
3817 	      _try_mark_pg_stale(osdmap, p.first, p.second, pending_inc);
3818 	    }
3819 	  } else {
3820 	    for (auto osd : need_check_down_pg_osds) {
3821 	      if (osdmap.is_down(osd)) {
3822 		auto p = pg_map.pg_by_osd.find(osd);
3823 		if (p == pg_map.pg_by_osd.end()) {
3824 		  continue;
3825 		}
3826 		for (auto pgid : p->second) {
3827 		  const pg_stat_t &stat = pg_map.pg_stat.at(pgid);
3828 		  ceph_assert(stat.acting_primary == osd);
3829 		  _try_mark_pg_stale(osdmap, pgid, stat, pending_inc);
3830 		}
3831 	      }
3832 	    }
3833 	  }
3834 	}
3835 	
3836 	int reweight::by_utilization(
3837 	    const OSDMap &osdmap,
3838 	    const PGMap &pgm,
3839 	    int oload,
3840 	    double max_changef,
3841 	    int max_osds,
3842 	    bool by_pg, const set<int64_t> *pools,
3843 	    bool no_increasing,
3844 	    mempool::osdmap::map<int32_t, uint32_t>* new_weights,
3845 	    std::stringstream *ss,
3846 	    std::string *out_str,
3847 	    ceph::Formatter *f)
3848 	{
3849 	  if (oload <= 100) {
3850 	    *ss << "You must give a percentage higher than 100. "
3851 	      "The reweighting threshold will be calculated as <average-utilization> "
3852 	      "times <input-percentage>. For example, an argument of 200 would "
3853 	      "reweight OSDs which are twice as utilized as the average OSD.\n";
3854 	    return -EINVAL;
3855 	  }
3856 	
3857 	  vector<int> pgs_by_osd(osdmap.get_max_osd());
3858 	
3859 	  // Avoid putting a small number (or 0) in the denominator when calculating
3860 	  // average_util
3861 	  double average_util;
3862 	  if (by_pg) {
3863 	    // by pg mapping
3864 	    double weight_sum = 0.0;      // sum up the crush weights
3865 	    unsigned num_pg_copies = 0;
3866 	    int num_osds = 0;
3867 	    for (const auto& pg : pgm.pg_stat) {
3868 	      if (pools && pools->count(pg.first.pool()) == 0)
3869 		continue;
3870 	      for (const auto acting : pg.second.acting) {
3871 	        if (!osdmap.exists(acting)) {
3872 	          continue;
3873 	        }
3874 		if (acting >= (int)pgs_by_osd.size())
3875 		  pgs_by_osd.resize(acting);
3876 		if (pgs_by_osd[acting] == 0) {
3877 	          if (osdmap.crush->get_item_weightf(acting) <= 0) {
3878 	            //skip if we currently can not identify item
3879 	            continue;
3880 	          }
3881 		  weight_sum += osdmap.crush->get_item_weightf(acting);
3882 		  ++num_osds;
3883 		}
3884 		++pgs_by_osd[acting];
3885 		++num_pg_copies;
3886 	      }
3887 	    }
3888 	
3889 	    if (!num_osds || (num_pg_copies / num_osds < g_conf()->mon_reweight_min_pgs_per_osd)) {
3890 	      *ss << "Refusing to reweight: we only have " << num_pg_copies
3891 		  << " PGs across " << num_osds << " osds!\n";
3892 	      return -EDOM;
3893 	    }
3894 	
3895 	    average_util = (double)num_pg_copies / weight_sum;
3896 	  } else {
3897 	    // by osd utilization
3898 	    int num_osd = std::max<size_t>(1, pgm.osd_stat.size());
3899 	    if ((uint64_t)pgm.osd_sum.statfs.total / num_osd
3900 		< g_conf()->mon_reweight_min_bytes_per_osd) {
3901 	      *ss << "Refusing to reweight: we only have " << pgm.osd_sum.statfs.kb()
3902 		  << " kb across all osds!\n";
3903 	      return -EDOM;
3904 	    }
3905 	    if ((uint64_t)pgm.osd_sum.statfs.get_used_raw() / num_osd
3906 		< g_conf()->mon_reweight_min_bytes_per_osd) {
3907 	      *ss << "Refusing to reweight: we only have "
3908 		  << pgm.osd_sum.statfs.kb_used_raw()
3909 		  << " kb used across all osds!\n";
3910 	      return -EDOM;
3911 	    }
3912 	
3913 	    average_util = (double)pgm.osd_sum.statfs.get_used_raw() /
3914 	      (double)pgm.osd_sum.statfs.total;
3915 	  }
3916 	
3917 	  // adjust down only if we are above the threshold
3918 	  const double overload_util = average_util * (double)oload / 100.0;
3919 	
3920 	  // but aggressively adjust weights up whenever possible.
3921 	  const double underload_util = average_util;
3922 	
3923 	  const unsigned max_change = (unsigned)(max_changef * (double)0x10000);
3924 	
3925 	  ostringstream oss;
3926 	  if (f) {
3927 	    f->open_object_section("reweight_by_utilization");
3928 	    f->dump_int("overload_min", oload);
3929 	    f->dump_float("max_change", max_changef);
3930 	    f->dump_int("max_change_osds", max_osds);
3931 	    f->dump_float("average_utilization", average_util);
3932 	    f->dump_float("overload_utilization", overload_util);
3933 	  } else {
3934 	    oss << "oload " << oload << "\n";
3935 	    oss << "max_change " << max_changef << "\n";
3936 	    oss << "max_change_osds " << max_osds << "\n";
3937 	    oss.precision(4);
3938 	    oss << "average_utilization " << std::fixed << average_util << "\n";
3939 	    oss << "overload_utilization " << overload_util << "\n";
3940 	  }
3941 	  int num_changed = 0;
3942 	
3943 	  // precompute util for each OSD
3944 	  std::vector<std::pair<int, float> > util_by_osd;
3945 	  for (const auto& p : pgm.osd_stat) {
3946 	    std::pair<int, float> osd_util;
3947 	    osd_util.first = p.first;
3948 	    if (by_pg) {
3949 	      if (p.first >= (int)pgs_by_osd.size() ||
3950 	        pgs_by_osd[p.first] == 0) {
3951 	        // skip if this OSD does not contain any pg
3952 	        // belonging to the specified pool(s).
3953 	        continue;
3954 	      }
3955 	
3956 	      if (osdmap.crush->get_item_weightf(p.first) <= 0) {
3957 	        // skip if we are unable to locate item.
3958 	        continue;
3959 	      }
3960 	
3961 	      osd_util.second =
3962 		pgs_by_osd[p.first] / osdmap.crush->get_item_weightf(p.first);
3963 	    } else {
3964 	      osd_util.second =
3965 		(double)p.second.statfs.get_used_raw() / (double)p.second.statfs.total;
3966 	    }
3967 	    util_by_osd.push_back(osd_util);
3968 	  }
3969 	
3970 	  // sort by absolute deviation from the mean utilization,
3971 	  // in descending order.
3972 	  std::sort(util_by_osd.begin(), util_by_osd.end(),
3973 	    [average_util](std::pair<int, float> l, std::pair<int, float> r) {
3974 	      return abs(l.second - average_util) > abs(r.second - average_util);
3975 	    }
3976 	  );
3977 	
3978 	  if (f)
3979 	    f->open_array_section("reweights");
3980 	
3981 	  for (const auto& p : util_by_osd) {
3982 	    unsigned weight = osdmap.get_weight(p.first);
3983 	    if (weight == 0) {
3984 	      // skip if OSD is currently out
3985 	      continue;
3986 	    }
3987 	    float util = p.second;
3988 	
3989 	    if (util >= overload_util) {
3990 	      // Assign a lower weight to overloaded OSDs. The current weight
3991 	      // is a factor to take into account the original weights,
3992 	      // to represent e.g. differing storage capacities
3993 	      unsigned new_weight = (unsigned)((average_util / util) * (float)weight);
3994 	      if (weight > max_change)
3995 		new_weight = std::max(new_weight, weight - max_change);
3996 	      new_weights->insert({p.first, new_weight});
3997 	      if (f) {
3998 		f->open_object_section("osd");
3999 		f->dump_int("osd", p.first);
4000 		f->dump_float("weight", (float)weight / (float)0x10000);
4001 		f->dump_float("new_weight", (float)new_weight / (float)0x10000);
4002 		f->close_section();
4003 	      } else {
4004 	        oss << "osd." << p.first << " weight "
4005 	            << (float)weight / (float)0x10000 << " -> "
4006 	            << (float)new_weight / (float)0x10000 << "\n";
4007 	      }
4008 	      if (++num_changed >= max_osds)
4009 		break;
4010 	    }
4011 	    if (!no_increasing && util <= underload_util) {
4012 	      // assign a higher weight.. if we can.
4013 	      unsigned new_weight = (unsigned)((average_util / util) * (float)weight);
4014 	      new_weight = std::min(new_weight, weight + max_change);
4015 	      if (new_weight > 0x10000)
4016 		new_weight = 0x10000;
4017 	      if (new_weight > weight) {
4018 		new_weights->insert({p.first, new_weight});
4019 	        oss << "osd." << p.first << " weight "
4020 	            << (float)weight / (float)0x10000 << " -> "
4021 	            << (float)new_weight / (float)0x10000 << "\n";
4022 		if (++num_changed >= max_osds)
4023 		  break;
4024 	      }
4025 	    }
4026 	  }
4027 	  if (f) {
4028 	    f->close_section();
4029 	  }
4030 	
4031 	  OSDMap newmap;
4032 	  newmap.deepish_copy_from(osdmap);
4033 	  OSDMap::Incremental newinc;
4034 	  newinc.fsid = newmap.get_fsid();
4035 	  newinc.epoch = newmap.get_epoch() + 1;
4036 	  newinc.new_weight = *new_weights;
4037 	  newmap.apply_incremental(newinc);
4038 	
4039 	  osdmap.summarize_mapping_stats(&newmap, pools, out_str, f);
4040 	
4041 	  if (f) {
4042 	    f->close_section();
4043 	  } else {
4044 	    *out_str += "\n";
4045 	    *out_str += oss.str();
4046 	  }
4047 	  return num_changed;
4048 	}
4049