1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3
4 #include <boost/algorithm/string.hpp>
5
6 #include "PGMap.h"
7
8 #define dout_subsys ceph_subsys_mon
9 #include "common/debug.h"
10 #include "common/Clock.h"
11 #include "common/Formatter.h"
12 #include "global/global_context.h"
13 #include "include/ceph_features.h"
14 #include "include/stringify.h"
15
16 #include "osd/osd_types.h"
17 #include "osd/OSDMap.h"
18 #include <boost/range/adaptor/reversed.hpp>
19
20 #define dout_context g_ceph_context
21
22 using std::list;
23 using std::make_pair;
24 using std::map;
25 using std::pair;
26 using std::ostream;
27 using std::ostringstream;
28 using std::set;
29 using std::string;
30 using std::stringstream;
31 using std::vector;
32
33 using ceph::bufferlist;
34
35 MEMPOOL_DEFINE_OBJECT_FACTORY(PGMapDigest, pgmap_digest, pgmap);
36 MEMPOOL_DEFINE_OBJECT_FACTORY(PGMap, pgmap, pgmap);
37 MEMPOOL_DEFINE_OBJECT_FACTORY(PGMap::Incremental, pgmap_inc, pgmap);
38
39
40 // ---------------------
41 // PGMapDigest
42
43 void PGMapDigest::encode(bufferlist& bl, uint64_t features) const
44 {
45 // NOTE: see PGMap::encode_digest
46 uint8_t v = 4;
47 if (!HAVE_FEATURE(features, SERVER_MIMIC)) {
48 v = 1;
49 } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
50 v = 3;
51 }
52 ENCODE_START(v, 1, bl);
53 encode(num_pg, bl);
54 encode(num_pg_active, bl);
55 encode(num_pg_unknown, bl);
56 encode(num_osd, bl);
57 encode(pg_pool_sum, bl, features);
58 encode(pg_sum, bl, features);
59 encode(osd_sum, bl, features);
60 if (v >= 2) {
61 encode(num_pg_by_state, bl);
62 } else {
63 uint32_t n = num_pg_by_state.size();
64 encode(n, bl);
65 for (auto p : num_pg_by_state) {
66 encode((uint32_t)p.first, bl);
67 encode(p.second, bl);
68 }
69 }
70 encode(num_pg_by_osd, bl);
71 encode(num_pg_by_pool, bl);
72 encode(osd_last_seq, bl);
73 encode(per_pool_sum_delta, bl, features);
74 encode(per_pool_sum_deltas_stamps, bl);
75 encode(pg_sum_delta, bl, features);
76 encode(stamp_delta, bl);
77 encode(avail_space_by_rule, bl);
78 if (struct_v >= 3) {
79 encode(purged_snaps, bl);
80 }
81 if (struct_v >= 4) {
82 encode(osd_sum_by_class, bl, features);
83 }
84 ENCODE_FINISH(bl);
85 }
86
87 void PGMapDigest::decode(bufferlist::const_iterator& p)
88 {
89 DECODE_START(4, p);
90 decode(num_pg, p);
91 decode(num_pg_active, p);
92 decode(num_pg_unknown, p);
93 decode(num_osd, p);
94 decode(pg_pool_sum, p);
95 decode(pg_sum, p);
96 decode(osd_sum, p);
97 if (struct_v >= 2) {
98 decode(num_pg_by_state, p);
99 } else {
100 map<int32_t, int32_t> nps;
101 decode(nps, p);
102 num_pg_by_state.clear();
103 for (auto i : nps) {
104 num_pg_by_state[i.first] = i.second;
105 }
106 }
107 decode(num_pg_by_osd, p);
108 decode(num_pg_by_pool, p);
109 decode(osd_last_seq, p);
110 decode(per_pool_sum_delta, p);
111 decode(per_pool_sum_deltas_stamps, p);
112 decode(pg_sum_delta, p);
113 decode(stamp_delta, p);
114 decode(avail_space_by_rule, p);
115 if (struct_v >= 3) {
116 decode(purged_snaps, p);
117 }
118 if (struct_v >= 4) {
119 decode(osd_sum_by_class, p);
120 }
121 DECODE_FINISH(p);
122 }
123
124 void PGMapDigest::dump(ceph::Formatter *f) const
125 {
126 f->dump_unsigned("num_pg", num_pg);
127 f->dump_unsigned("num_pg_active", num_pg_active);
128 f->dump_unsigned("num_pg_unknown", num_pg_unknown);
129 f->dump_unsigned("num_osd", num_osd);
130 f->dump_object("pool_sum", pg_sum);
131 f->dump_object("osd_sum", osd_sum);
132
133 f->open_object_section("osd_sum_by_class");
134 for (auto& i : osd_sum_by_class) {
135 f->dump_object(i.first.c_str(), i.second);
136 }
137 f->close_section();
138
139 f->open_array_section("pool_stats");
140 for (auto& p : pg_pool_sum) {
141 f->open_object_section("pool_stat");
142 f->dump_int("poolid", p.first);
143 auto q = num_pg_by_pool.find(p.first);
144 if (q != num_pg_by_pool.end())
145 f->dump_unsigned("num_pg", q->second);
146 p.second.dump(f);
147 f->close_section();
148 }
149 f->close_section();
150 f->open_array_section("osd_stats");
151 int i = 0;
152 // TODO: this isn't really correct since we can dump non-existent OSDs
153 // I dunno what osd_last_seq is set to in that case...
154 for (auto& p : osd_last_seq) {
155 f->open_object_section("osd_stat");
156 f->dump_int("osd", i);
157 f->dump_unsigned("seq", p);
158 f->close_section();
159 ++i;
160 }
161 f->close_section();
162 f->open_array_section("num_pg_by_state");
163 for (auto& p : num_pg_by_state) {
164 f->open_object_section("count");
165 f->dump_string("state", pg_state_string(p.first));
166 f->dump_unsigned("num", p.second);
167 f->close_section();
168 }
169 f->close_section();
170 f->open_array_section("num_pg_by_osd");
171 for (auto& p : num_pg_by_osd) {
172 f->open_object_section("count");
173 f->dump_unsigned("osd", p.first);
174 f->dump_unsigned("num_primary_pg", p.second.primary);
175 f->dump_unsigned("num_acting_pg", p.second.acting);
176 f->dump_unsigned("num_up_not_acting_pg", p.second.up_not_acting);
177 f->close_section();
178 }
179 f->close_section();
180 f->open_array_section("purged_snaps");
181 for (auto& j : purged_snaps) {
182 f->open_object_section("pool");
183 f->dump_int("pool", j.first);
184 f->open_object_section("purged_snaps");
185 for (auto i = j.second.begin(); i != j.second.end(); ++i) {
186 f->open_object_section("interval");
187 f->dump_stream("start") << i.get_start();
188 f->dump_stream("length") << i.get_len();
189 f->close_section();
190 }
191 f->close_section();
192 f->close_section();
193 }
194 f->close_section();
195 }
196
197 void PGMapDigest::generate_test_instances(list<PGMapDigest*>& ls)
198 {
199 ls.push_back(new PGMapDigest);
200 }
201
// Render a as text with two digits after the decimal point.
// Anything below 0.01 (negative values included) is shown as a plain "0".
inline std::string percentify(const float& a) {
  if (a < 0.01) {
    return "0";
  }
  std::stringstream formatted;
  formatted << std::fixed << std::setprecision(2) << a;
  return formatted.str();
}
210
211 void PGMapDigest::print_summary(ceph::Formatter *f, ostream *out) const
212 {
213 if (f)
214 f->open_array_section("pgs_by_state");
215
216 // list is descending numeric order (by count)
217 std::multimap<int,int> state_by_count; // count -> state
218 for (auto p = num_pg_by_state.begin();
219 p != num_pg_by_state.end();
220 ++p) {
221 state_by_count.insert(make_pair(p->second, p->first));
222 }
223 if (f) {
224 for (auto p = state_by_count.rbegin();
225 p != state_by_count.rend();
226 ++p)
227 {
228 f->open_object_section("pgs_by_state_element");
229 f->dump_string("state_name", pg_state_string(p->second));
230 f->dump_unsigned("count", p->first);
231 f->close_section();
232 }
233 }
234 if (f)
235 f->close_section();
236
237 if (f) {
238 f->dump_unsigned("num_pgs", num_pg);
239 f->dump_unsigned("num_pools", pg_pool_sum.size());
240 f->dump_unsigned("num_objects", pg_sum.stats.sum.num_objects);
241 f->dump_unsigned("data_bytes", pg_sum.stats.sum.num_bytes);
242 f->dump_unsigned("bytes_used", osd_sum.statfs.get_used_raw());
243 f->dump_unsigned("bytes_avail", osd_sum.statfs.available);
244 f->dump_unsigned("bytes_total", osd_sum.statfs.total);
245 } else {
246 *out << " pools: " << pg_pool_sum.size() << " pools, "
247 << num_pg << " pgs\n";
248 *out << " objects: " << si_u_t(pg_sum.stats.sum.num_objects) << " objects, "
249 << byte_u_t(pg_sum.stats.sum.num_bytes) << "\n";
250 *out << " usage: "
251 << byte_u_t(osd_sum.statfs.get_used_raw()) << " used, "
252 << byte_u_t(osd_sum.statfs.available) << " / "
253 << byte_u_t(osd_sum.statfs.total) << " avail\n";
254 *out << " pgs: ";
255 }
256
257 bool pad = false;
258
259 if (num_pg_unknown > 0) {
260 float p = (float)num_pg_unknown / (float)num_pg;
261 if (f) {
262 f->dump_float("unknown_pgs_ratio", p);
263 } else {
264 char b[20];
265 snprintf(b, sizeof(b), "%.3lf", p * 100.0);
266 *out << b << "% pgs unknown\n";
267 pad = true;
268 }
269 }
270
271 int num_pg_inactive = num_pg - num_pg_active - num_pg_unknown;
272 if (num_pg_inactive > 0) {
273 float p = (float)num_pg_inactive / (float)num_pg;
274 if (f) {
275 f->dump_float("inactive_pgs_ratio", p);
276 } else {
277 if (pad) {
278 *out << " ";
279 }
280 char b[20];
281 snprintf(b, sizeof(b), "%.3f", p * 100.0);
282 *out << b << "% pgs not active\n";
283 pad = true;
284 }
285 }
286
287 list<string> sl;
288 overall_recovery_summary(f, &sl);
289 if (!f && !sl.empty()) {
290 for (auto p = sl.begin(); p != sl.end(); ++p) {
291 if (pad) {
292 *out << " ";
293 }
294 *out << *p << "\n";
295 pad = true;
296 }
297 }
298 sl.clear();
299
300 if (!f) {
301 unsigned max_width = 1;
302 for (auto p = state_by_count.rbegin(); p != state_by_count.rend(); ++p)
303 {
304 std::stringstream ss;
305 ss << p->first;
306 max_width = std::max<size_t>(ss.str().size(), max_width);
307 }
308
309 for (auto p = state_by_count.rbegin(); p != state_by_count.rend(); ++p)
310 {
311 if (pad) {
312 *out << " ";
313 }
314 pad = true;
315 out->setf(std::ios::left);
316 *out << std::setw(max_width) << p->first
317 << " " << pg_state_string(p->second) << "\n";
318 out->unsetf(std::ios::left);
319 }
320 }
321
322 ostringstream ss_rec_io;
323 overall_recovery_rate_summary(f, &ss_rec_io);
324 ostringstream ss_client_io;
325 overall_client_io_rate_summary(f, &ss_client_io);
326 ostringstream ss_cache_io;
327 overall_cache_io_rate_summary(f, &ss_cache_io);
328
329 if (!f && (ss_client_io.str().length() || ss_rec_io.str().length()
330 || ss_cache_io.str().length())) {
331 *out << "\n \n";
332 *out << " io:\n";
333 }
334
335 if (!f && ss_client_io.str().length())
336 *out << " client: " << ss_client_io.str() << "\n";
337 if (!f && ss_rec_io.str().length())
338 *out << " recovery: " << ss_rec_io.str() << "\n";
339 if (!f && ss_cache_io.str().length())
340 *out << " cache: " << ss_cache_io.str() << "\n";
341 }
342
343 void PGMapDigest::print_oneline_summary(ceph::Formatter *f, ostream *out) const
344 {
345 std::stringstream ss;
346
347 if (f)
348 f->open_array_section("num_pg_by_state");
349 for (auto p = num_pg_by_state.begin();
350 p != num_pg_by_state.end();
351 ++p) {
352 if (f) {
353 f->open_object_section("state");
354 f->dump_string("name", pg_state_string(p->first));
355 f->dump_unsigned("num", p->second);
356 f->close_section();
357 }
358 if (p != num_pg_by_state.begin())
359 ss << ", ";
360 ss << p->second << " " << pg_state_string(p->first);
361 }
362 if (f)
363 f->close_section();
364
365 string states = ss.str();
366 if (out)
367 *out << num_pg << " pgs: "
368 << states << "; "
369 << byte_u_t(pg_sum.stats.sum.num_bytes) << " data, "
370 << byte_u_t(osd_sum.statfs.get_used()) << " used, "
371 << byte_u_t(osd_sum.statfs.available) << " / "
372 << byte_u_t(osd_sum.statfs.total) << " avail";
373 if (f) {
374 f->dump_unsigned("num_pgs", num_pg);
375 f->dump_unsigned("num_bytes", pg_sum.stats.sum.num_bytes);
376 f->dump_int("total_bytes", osd_sum.statfs.total);
377 f->dump_int("total_avail_bytes", osd_sum.statfs.available);
378 f->dump_int("total_used_bytes", osd_sum.statfs.get_used());
379 f->dump_int("total_used_raw_bytes", osd_sum.statfs.get_used_raw());
380 }
381
382 // make non-negative; we can get negative values if osds send
383 // uncommitted stats and then "go backward" or if they are just
384 // buggy/wrong.
385 pool_stat_t pos_delta = pg_sum_delta;
386 pos_delta.floor(0);
387 if (pos_delta.stats.sum.num_rd ||
388 pos_delta.stats.sum.num_wr) {
389 if (out)
390 *out << "; ";
391 if (pos_delta.stats.sum.num_rd) {
392 int64_t rd = (pos_delta.stats.sum.num_rd_kb << 10) / (double)stamp_delta;
393 if (out)
394 *out << byte_u_t(rd) << "/s rd, ";
395 if (f)
396 f->dump_unsigned("read_bytes_sec", rd);
397 }
398 if (pos_delta.stats.sum.num_wr) {
399 int64_t wr = (pos_delta.stats.sum.num_wr_kb << 10) / (double)stamp_delta;
400 if (out)
401 *out << byte_u_t(wr) << "/s wr, ";
402 if (f)
403 f->dump_unsigned("write_bytes_sec", wr);
404 }
405 int64_t iops = (pos_delta.stats.sum.num_rd + pos_delta.stats.sum.num_wr) / (double)stamp_delta;
406 if (out)
407 *out << si_u_t(iops) << " op/s";
408 if (f)
409 f->dump_unsigned("io_sec", iops);
410 }
411
412 list<string> sl;
413 overall_recovery_summary(f, &sl);
414 if (out)
415 for (auto p = sl.begin(); p != sl.end(); ++p)
416 *out << "; " << *p;
417 std::stringstream ssr;
418 overall_recovery_rate_summary(f, &ssr);
419 if (out && ssr.str().length())
420 *out << "; " << ssr.str() << " recovering";
421 }
422
423 void PGMapDigest::get_recovery_stats(
424 double *misplaced_ratio,
425 double *degraded_ratio,
426 double *inactive_pgs_ratio,
427 double *unknown_pgs_ratio) const
428 {
429 if (pg_sum.stats.sum.num_objects_degraded &&
430 pg_sum.stats.sum.num_object_copies > 0) {
431 *degraded_ratio = (double)pg_sum.stats.sum.num_objects_degraded /
432 (double)pg_sum.stats.sum.num_object_copies;
433 } else {
434 *degraded_ratio = 0;
435 }
436 if (pg_sum.stats.sum.num_objects_misplaced &&
437 pg_sum.stats.sum.num_object_copies > 0) {
438 *misplaced_ratio = (double)pg_sum.stats.sum.num_objects_misplaced /
439 (double)pg_sum.stats.sum.num_object_copies;
440 } else {
441 *misplaced_ratio = 0;
442 }
443 if (num_pg > 0) {
444 int num_pg_inactive = num_pg - num_pg_active - num_pg_unknown;
445 *inactive_pgs_ratio = (double)num_pg_inactive / (double)num_pg;
446 *unknown_pgs_ratio = (double)num_pg_unknown / (double)num_pg;
447 } else {
448 *inactive_pgs_ratio = 0;
449 *unknown_pgs_ratio = 0;
450 }
451 }
452
453 void PGMapDigest::recovery_summary(ceph::Formatter *f, list<string> *psl,
454 const pool_stat_t& pool_sum) const
455 {
456 if (pool_sum.stats.sum.num_objects_degraded && pool_sum.stats.sum.num_object_copies > 0) {
457 double pc = (double)pool_sum.stats.sum.num_objects_degraded /
458 (double)pool_sum.stats.sum.num_object_copies * (double)100.0;
459 char b[20];
460 snprintf(b, sizeof(b), "%.3lf", pc);
461 if (f) {
462 f->dump_unsigned("degraded_objects", pool_sum.stats.sum.num_objects_degraded);
463 f->dump_unsigned("degraded_total", pool_sum.stats.sum.num_object_copies);
464 f->dump_float("degraded_ratio", pc / 100.0);
465 } else {
466 ostringstream ss;
467 ss << pool_sum.stats.sum.num_objects_degraded
468 << "/" << pool_sum.stats.sum.num_object_copies << " objects degraded (" << b << "%)";
469 psl->push_back(ss.str());
470 }
471 }
472 if (pool_sum.stats.sum.num_objects_misplaced && pool_sum.stats.sum.num_object_copies > 0) {
473 double pc = (double)pool_sum.stats.sum.num_objects_misplaced /
474 (double)pool_sum.stats.sum.num_object_copies * (double)100.0;
475 char b[20];
476 snprintf(b, sizeof(b), "%.3lf", pc);
477 if (f) {
478 f->dump_unsigned("misplaced_objects", pool_sum.stats.sum.num_objects_misplaced);
479 f->dump_unsigned("misplaced_total", pool_sum.stats.sum.num_object_copies);
480 f->dump_float("misplaced_ratio", pc / 100.0);
481 } else {
482 ostringstream ss;
483 ss << pool_sum.stats.sum.num_objects_misplaced
484 << "/" << pool_sum.stats.sum.num_object_copies << " objects misplaced (" << b << "%)";
485 psl->push_back(ss.str());
486 }
487 }
488 if (pool_sum.stats.sum.num_objects_unfound && pool_sum.stats.sum.num_objects) {
489 double pc = (double)pool_sum.stats.sum.num_objects_unfound /
490 (double)pool_sum.stats.sum.num_objects * (double)100.0;
491 char b[20];
492 snprintf(b, sizeof(b), "%.3lf", pc);
493 if (f) {
494 f->dump_unsigned("unfound_objects", pool_sum.stats.sum.num_objects_unfound);
495 f->dump_unsigned("unfound_total", pool_sum.stats.sum.num_objects);
496 f->dump_float("unfound_ratio", pc / 100.0);
497 } else {
498 ostringstream ss;
499 ss << pool_sum.stats.sum.num_objects_unfound
500 << "/" << pool_sum.stats.sum.num_objects << " objects unfound (" << b << "%)";
501 psl->push_back(ss.str());
502 }
503 }
504 }
505
506 void PGMapDigest::recovery_rate_summary(ceph::Formatter *f, ostream *out,
507 const pool_stat_t& delta_sum,
508 utime_t delta_stamp) const
509 {
510 // make non-negative; we can get negative values if osds send
511 // uncommitted stats and then "go backward" or if they are just
512 // buggy/wrong.
513 pool_stat_t pos_delta = delta_sum;
514 pos_delta.floor(0);
515 if (pos_delta.stats.sum.num_objects_recovered ||
516 pos_delta.stats.sum.num_bytes_recovered ||
517 pos_delta.stats.sum.num_keys_recovered) {
518 int64_t objps = pos_delta.stats.sum.num_objects_recovered / (double)delta_stamp;
519 int64_t bps = pos_delta.stats.sum.num_bytes_recovered / (double)delta_stamp;
520 int64_t kps = pos_delta.stats.sum.num_keys_recovered / (double)delta_stamp;
521 if (f) {
522 f->dump_int("recovering_objects_per_sec", objps);
523 f->dump_int("recovering_bytes_per_sec", bps);
524 f->dump_int("recovering_keys_per_sec", kps);
525 f->dump_int("num_objects_recovered", pos_delta.stats.sum.num_objects_recovered);
526 f->dump_int("num_bytes_recovered", pos_delta.stats.sum.num_bytes_recovered);
527 f->dump_int("num_keys_recovered", pos_delta.stats.sum.num_keys_recovered);
528 } else {
529 *out << byte_u_t(bps) << "/s";
530 if (pos_delta.stats.sum.num_keys_recovered)
531 *out << ", " << si_u_t(kps) << " keys/s";
532 *out << ", " << si_u_t(objps) << " objects/s";
533 }
534 }
535 }
536
537 void PGMapDigest::overall_recovery_rate_summary(ceph::Formatter *f, ostream *out) const
538 {
539 recovery_rate_summary(f, out, pg_sum_delta, stamp_delta);
540 }
541
542 void PGMapDigest::overall_recovery_summary(ceph::Formatter *f, list<string> *psl) const
543 {
544 recovery_summary(f, psl, pg_sum);
545 }
546
547 void PGMapDigest::pool_recovery_rate_summary(ceph::Formatter *f, ostream *out,
548 uint64_t poolid) const
549 {
550 auto p = per_pool_sum_delta.find(poolid);
551 if (p == per_pool_sum_delta.end())
552 return;
553
554 auto ts = per_pool_sum_deltas_stamps.find(p->first);
555 ceph_assert(ts != per_pool_sum_deltas_stamps.end());
556 recovery_rate_summary(f, out, p->second.first, ts->second);
557 }
558
559 void PGMapDigest::pool_recovery_summary(ceph::Formatter *f, list<string> *psl,
560 uint64_t poolid) const
561 {
562 auto p = pg_pool_sum.find(poolid);
563 if (p == pg_pool_sum.end())
564 return;
565
566 recovery_summary(f, psl, p->second);
567 }
568
569 void PGMapDigest::client_io_rate_summary(ceph::Formatter *f, ostream *out,
570 const pool_stat_t& delta_sum,
571 utime_t delta_stamp) const
572 {
573 pool_stat_t pos_delta = delta_sum;
574 pos_delta.floor(0);
575 if (pos_delta.stats.sum.num_rd ||
576 pos_delta.stats.sum.num_wr) {
577 if (pos_delta.stats.sum.num_rd) {
578 int64_t rd = (pos_delta.stats.sum.num_rd_kb << 10) / (double)delta_stamp;
579 if (f) {
580 f->dump_int("read_bytes_sec", rd);
581 } else {
582 *out << byte_u_t(rd) << "/s rd, ";
583 }
584 }
585 if (pos_delta.stats.sum.num_wr) {
586 int64_t wr = (pos_delta.stats.sum.num_wr_kb << 10) / (double)delta_stamp;
587 if (f) {
588 f->dump_int("write_bytes_sec", wr);
589 } else {
590 *out << byte_u_t(wr) << "/s wr, ";
591 }
592 }
593 int64_t iops_rd = pos_delta.stats.sum.num_rd / (double)delta_stamp;
594 int64_t iops_wr = pos_delta.stats.sum.num_wr / (double)delta_stamp;
595 if (f) {
596 f->dump_int("read_op_per_sec", iops_rd);
597 f->dump_int("write_op_per_sec", iops_wr);
598 } else {
599 *out << si_u_t(iops_rd) << " op/s rd, " << si_u_t(iops_wr) << " op/s wr";
600 }
601 }
602 }
603
604 void PGMapDigest::overall_client_io_rate_summary(ceph::Formatter *f, ostream *out) const
605 {
606 client_io_rate_summary(f, out, pg_sum_delta, stamp_delta);
607 }
608
609 void PGMapDigest::pool_client_io_rate_summary(ceph::Formatter *f, ostream *out,
610 uint64_t poolid) const
611 {
612 auto p = per_pool_sum_delta.find(poolid);
613 if (p == per_pool_sum_delta.end())
614 return;
615
616 auto ts = per_pool_sum_deltas_stamps.find(p->first);
617 ceph_assert(ts != per_pool_sum_deltas_stamps.end());
618 client_io_rate_summary(f, out, p->second.first, ts->second);
619 }
620
621 void PGMapDigest::cache_io_rate_summary(ceph::Formatter *f, ostream *out,
622 const pool_stat_t& delta_sum,
623 utime_t delta_stamp) const
624 {
625 pool_stat_t pos_delta = delta_sum;
626 pos_delta.floor(0);
627 bool have_output = false;
628
629 if (pos_delta.stats.sum.num_flush) {
630 int64_t flush = (pos_delta.stats.sum.num_flush_kb << 10) / (double)delta_stamp;
631 if (f) {
632 f->dump_int("flush_bytes_sec", flush);
633 } else {
634 *out << byte_u_t(flush) << "/s flush";
635 have_output = true;
636 }
637 }
638 if (pos_delta.stats.sum.num_evict) {
639 int64_t evict = (pos_delta.stats.sum.num_evict_kb << 10) / (double)delta_stamp;
640 if (f) {
641 f->dump_int("evict_bytes_sec", evict);
642 } else {
643 if (have_output)
644 *out << ", ";
645 *out << byte_u_t(evict) << "/s evict";
646 have_output = true;
647 }
648 }
649 if (pos_delta.stats.sum.num_promote) {
650 int64_t promote = pos_delta.stats.sum.num_promote / (double)delta_stamp;
651 if (f) {
652 f->dump_int("promote_op_per_sec", promote);
653 } else {
654 if (have_output)
655 *out << ", ";
656 *out << si_u_t(promote) << " op/s promote";
657 have_output = true;
658 }
659 }
660 if (pos_delta.stats.sum.num_flush_mode_low) {
661 if (f) {
662 f->dump_int("num_flush_mode_low", pos_delta.stats.sum.num_flush_mode_low);
663 } else {
664 if (have_output)
665 *out << ", ";
666 *out << si_u_t(pos_delta.stats.sum.num_flush_mode_low) << " PGs flushing";
667 have_output = true;
668 }
669 }
670 if (pos_delta.stats.sum.num_flush_mode_high) {
671 if (f) {
672 f->dump_int("num_flush_mode_high", pos_delta.stats.sum.num_flush_mode_high);
673 } else {
674 if (have_output)
675 *out << ", ";
676 *out << si_u_t(pos_delta.stats.sum.num_flush_mode_high) << " PGs flushing (high)";
677 have_output = true;
678 }
679 }
680 if (pos_delta.stats.sum.num_evict_mode_some) {
681 if (f) {
682 f->dump_int("num_evict_mode_some", pos_delta.stats.sum.num_evict_mode_some);
683 } else {
684 if (have_output)
685 *out << ", ";
686 *out << si_u_t(pos_delta.stats.sum.num_evict_mode_some) << " PGs evicting";
687 have_output = true;
688 }
689 }
690 if (pos_delta.stats.sum.num_evict_mode_full) {
691 if (f) {
692 f->dump_int("num_evict_mode_full", pos_delta.stats.sum.num_evict_mode_full);
693 } else {
694 if (have_output)
695 *out << ", ";
696 *out << si_u_t(pos_delta.stats.sum.num_evict_mode_full) << " PGs evicting (full)";
697 }
698 }
699 }
700
701 void PGMapDigest::overall_cache_io_rate_summary(ceph::Formatter *f, ostream *out) const
702 {
703 cache_io_rate_summary(f, out, pg_sum_delta, stamp_delta);
704 }
705
706 void PGMapDigest::pool_cache_io_rate_summary(ceph::Formatter *f, ostream *out,
707 uint64_t poolid) const
708 {
709 auto p = per_pool_sum_delta.find(poolid);
710 if (p == per_pool_sum_delta.end())
711 return;
712
713 auto ts = per_pool_sum_deltas_stamps.find(p->first);
714 ceph_assert(ts != per_pool_sum_deltas_stamps.end());
715 cache_io_rate_summary(f, out, p->second.first, ts->second);
716 }
717
718 ceph_statfs PGMapDigest::get_statfs(OSDMap &osdmap,
719 boost::optional<int64_t> data_pool) const
720 {
721 ceph_statfs statfs;
722 bool filter = false;
723 object_stat_sum_t sum;
724
725 if (data_pool) {
726 auto i = pg_pool_sum.find(*data_pool);
727 if (i != pg_pool_sum.end()) {
728 sum = i->second.stats.sum;
729 filter = true;
730 }
731 }
732
733 if (filter) {
734 statfs.kb_used = (sum.num_bytes >> 10);
735 statfs.kb_avail = get_pool_free_space(osdmap, *data_pool) >> 10;
736 statfs.num_objects = sum.num_objects;
737 statfs.kb = statfs.kb_used + statfs.kb_avail;
738 } else {
739 // these are in KB.
740 statfs.kb = osd_sum.statfs.kb();
741 statfs.kb_used = osd_sum.statfs.kb_used_raw();
742 statfs.kb_avail = osd_sum.statfs.kb_avail();
743 statfs.num_objects = pg_sum.stats.sum.num_objects;
744 }
745
746 return statfs;
747 }
748
749 void PGMapDigest::dump_pool_stats_full(
750 const OSDMap &osd_map,
751 stringstream *ss,
752 ceph::Formatter *f,
753 bool verbose) const
754 {
755 TextTable tbl;
756
757 if (f) {
758 f->open_array_section("pools");
759 } else {
760 tbl.define_column("POOL", TextTable::LEFT, TextTable::LEFT);
761 tbl.define_column("ID", TextTable::LEFT, TextTable::RIGHT);
762 tbl.define_column("STORED", TextTable::LEFT, TextTable::RIGHT);
763 if (verbose) {
764 tbl.define_column("(DATA)", TextTable::LEFT, TextTable::RIGHT);
765 tbl.define_column("(OMAP)", TextTable::LEFT, TextTable::RIGHT);
766 }
767 tbl.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
768 tbl.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
769 if (verbose) {
770 tbl.define_column("(DATA)", TextTable::LEFT, TextTable::RIGHT);
771 tbl.define_column("(OMAP)", TextTable::LEFT, TextTable::RIGHT);
772 }
773 tbl.define_column("%USED", TextTable::LEFT, TextTable::RIGHT);
774 tbl.define_column("MAX AVAIL", TextTable::LEFT, TextTable::RIGHT);
775
776 if (verbose) {
777 tbl.define_column("QUOTA OBJECTS", TextTable::LEFT, TextTable::LEFT);
778 tbl.define_column("QUOTA BYTES", TextTable::LEFT, TextTable::LEFT);
779 tbl.define_column("DIRTY", TextTable::LEFT, TextTable::RIGHT);
780 tbl.define_column("USED COMPR", TextTable::LEFT, TextTable::RIGHT);
781 tbl.define_column("UNDER COMPR", TextTable::LEFT, TextTable::RIGHT);
782 }
783 }
784
785 map<int,uint64_t> avail_by_rule;
786 for (auto p = osd_map.get_pools().begin();
787 p != osd_map.get_pools().end(); ++p) {
788 int64_t pool_id = p->first;
789 if ((pool_id < 0) || (pg_pool_sum.count(pool_id) == 0))
790 continue;
791
792 const string& pool_name = osd_map.get_pool_name(pool_id);
793 const pool_stat_t &stat = pg_pool_sum.at(pool_id);
794
795 const pg_pool_t *pool = osd_map.get_pg_pool(pool_id);
796 int ruleno = osd_map.crush->find_rule(pool->get_crush_rule(),
797 pool->get_type(),
798 pool->get_size());
799 int64_t avail;
800 if (avail_by_rule.count(ruleno) == 0) {
801 // FIXME: we don't guarantee avail_space_by_rule is up-to-date before this function is invoked
802 avail = get_rule_avail(ruleno);
803 if (avail < 0)
804 avail = 0;
805 avail_by_rule[ruleno] = avail;
806 } else {
807 avail = avail_by_rule[ruleno];
808 }
809 if (f) {
810 f->open_object_section("pool");
811 f->dump_string("name", pool_name);
812 f->dump_int("id", pool_id);
813 f->open_object_section("stats");
814 } else {
815 tbl << pool_name
816 << pool_id;
817 }
818 float raw_used_rate = osd_map.pool_raw_used_rate(pool_id);
819 bool per_pool = use_per_pool_stats();
820 bool per_pool_omap = use_per_pool_omap_stats();
821 dump_object_stat_sum(tbl, f, stat, avail, raw_used_rate, verbose, per_pool,
822 per_pool_omap, pool);
823 if (f) {
824 f->close_section(); // stats
825 f->close_section(); // pool
826 } else {
827 tbl << TextTable::endrow;
828 }
829 }
830 if (f)
831 f->close_section();
832 else {
833 ceph_assert(ss != nullptr);
834 *ss << "POOLS:\n";
835 tbl.set_indent(4);
836 *ss << tbl;
837 }
838 }
839
840 void PGMapDigest::dump_cluster_stats(stringstream *ss,
841 ceph::Formatter *f,
842 bool verbose) const
843 {
844 if (f) {
845 f->open_object_section("stats");
846 f->dump_int("total_bytes", osd_sum.statfs.total);
847 f->dump_int("total_avail_bytes", osd_sum.statfs.available);
848 f->dump_int("total_used_bytes", osd_sum.statfs.get_used());
849 f->dump_int("total_used_raw_bytes", osd_sum.statfs.get_used_raw());
850 f->dump_float("total_used_raw_ratio", osd_sum.statfs.get_used_raw_ratio());
851 f->dump_unsigned("num_osds", osd_sum.num_osds);
852 f->dump_unsigned("num_per_pool_osds", osd_sum.num_per_pool_osds);
853 f->dump_unsigned("num_per_pool_omap_osds", osd_sum.num_per_pool_omap_osds);
854 f->close_section();
855 f->open_object_section("stats_by_class");
856 for (auto& i : osd_sum_by_class) {
857 f->open_object_section(i.first.c_str());
858 f->dump_int("total_bytes", i.second.statfs.total);
859 f->dump_int("total_avail_bytes", i.second.statfs.available);
860 f->dump_int("total_used_bytes", i.second.statfs.get_used());
861 f->dump_int("total_used_raw_bytes", i.second.statfs.get_used_raw());
862 f->dump_float("total_used_raw_ratio",
863 i.second.statfs.get_used_raw_ratio());
864 f->close_section();
865 }
866 f->close_section();
867 } else {
868 ceph_assert(ss != nullptr);
869 TextTable tbl;
870 tbl.define_column("CLASS", TextTable::LEFT, TextTable::LEFT);
871 tbl.define_column("SIZE", TextTable::LEFT, TextTable::RIGHT);
872 tbl.define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
873 tbl.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
874 tbl.define_column("RAW USED", TextTable::LEFT, TextTable::RIGHT);
875 tbl.define_column("%RAW USED", TextTable::LEFT, TextTable::RIGHT);
876
877
878 for (auto& i : osd_sum_by_class) {
879 tbl << i.first;
880 tbl << stringify(byte_u_t(i.second.statfs.total))
881 << stringify(byte_u_t(i.second.statfs.available))
882 << stringify(byte_u_t(i.second.statfs.get_used()))
883 << stringify(byte_u_t(i.second.statfs.get_used_raw()))
884 << percentify(i.second.statfs.get_used_raw_ratio()*100.0)
885 << TextTable::endrow;
886 }
887 tbl << "TOTAL";
888 tbl << stringify(byte_u_t(osd_sum.statfs.total))
889 << stringify(byte_u_t(osd_sum.statfs.available))
890 << stringify(byte_u_t(osd_sum.statfs.get_used()))
891 << stringify(byte_u_t(osd_sum.statfs.get_used_raw()))
892 << percentify(osd_sum.statfs.get_used_raw_ratio()*100.0)
893 << TextTable::endrow;
894
895 *ss << "RAW STORAGE:\n";
896 tbl.set_indent(4);
897 *ss << tbl;
898 }
899 }
900
901 void PGMapDigest::dump_object_stat_sum(
902 TextTable &tbl, ceph::Formatter *f,
903 const pool_stat_t &pool_stat, uint64_t avail,
904 float raw_used_rate, bool verbose, bool per_pool, bool per_pool_omap,
905 const pg_pool_t *pool)
906 {
907 const object_stat_sum_t &sum = pool_stat.stats.sum;
908 const store_statfs_t statfs = pool_stat.store_stats;
909
910 if (sum.num_object_copies > 0) {
911 raw_used_rate *= (float)(sum.num_object_copies - sum.num_objects_degraded) / sum.num_object_copies;
912 }
913
914 uint64_t used_data_bytes = pool_stat.get_allocated_data_bytes(per_pool);
915 uint64_t used_omap_bytes = pool_stat.get_allocated_omap_bytes(per_pool_omap);
916 uint64_t used_bytes = used_data_bytes + used_omap_bytes;
917
918 float used = 0.0;
919 // note avail passed in is raw_avail, calc raw_used here.
920 if (avail) {
921 used = used_bytes;
922 used /= used + avail;
923 } else if (used_bytes) {
924 used = 1.0;
925 }
926 auto avail_res = raw_used_rate ? avail / raw_used_rate : 0;
927 // an approximation for actually stored user data
928 auto stored_data_normalized = pool_stat.get_user_data_bytes(
929 raw_used_rate, per_pool);
930 auto stored_omap_normalized = pool_stat.get_user_omap_bytes(
931 raw_used_rate, per_pool_omap);
932 auto stored_normalized = stored_data_normalized + stored_omap_normalized;
933 // same, amplied by replication or EC
934 auto stored_raw = stored_normalized * raw_used_rate;
935 if (f) {
936 f->dump_int("stored", stored_normalized);
937 if (verbose) {
938 f->dump_int("stored_data", stored_data_normalized);
939 f->dump_int("stored_omap", stored_omap_normalized);
940 }
941 f->dump_int("objects", sum.num_objects);
942 f->dump_int("kb_used", shift_round_up(used_bytes, 10));
943 f->dump_int("bytes_used", used_bytes);
944 if (verbose) {
945 f->dump_int("data_bytes_used", used_data_bytes);
946 f->dump_int("omap_bytes_used", used_omap_bytes);
947 }
948 f->dump_float("percent_used", used);
949 f->dump_unsigned("max_avail", avail_res);
950 if (verbose) {
951 f->dump_int("quota_objects", pool->quota_max_objects);
952 f->dump_int("quota_bytes", pool->quota_max_bytes);
953 f->dump_int("dirty", sum.num_objects_dirty);
954 f->dump_int("rd", sum.num_rd);
955 f->dump_int("rd_bytes", sum.num_rd_kb * 1024ull);
956 f->dump_int("wr", sum.num_wr);
957 f->dump_int("wr_bytes", sum.num_wr_kb * 1024ull);
958 f->dump_int("compress_bytes_used", statfs.data_compressed_allocated);
959 f->dump_int("compress_under_bytes", statfs.data_compressed_original);
960 // Stored by user amplified by replication
961 f->dump_int("stored_raw", stored_raw);
962 }
963 } else {
964 tbl << stringify(byte_u_t(stored_normalized));
965 if (verbose) {
966 tbl << stringify(byte_u_t(stored_data_normalized));
967 tbl << stringify(byte_u_t(stored_omap_normalized));
968 }
969 tbl << stringify(si_u_t(sum.num_objects));
970 tbl << stringify(byte_u_t(used_bytes));
971 if (verbose) {
972 tbl << stringify(byte_u_t(used_data_bytes));
973 tbl << stringify(byte_u_t(used_omap_bytes));
974 }
975 tbl << percentify(used*100);
976 tbl << stringify(byte_u_t(avail_res));
977 if (verbose) {
978 if (pool->quota_max_objects == 0)
979 tbl << "N/A";
980 else
981 tbl << stringify(si_u_t(pool->quota_max_objects));
982
983 if (pool->quota_max_bytes == 0)
984 tbl << "N/A";
985 else
986 tbl << stringify(byte_u_t(pool->quota_max_bytes));
987
988 tbl << stringify(si_u_t(sum.num_objects_dirty))
989 << stringify(byte_u_t(statfs.data_compressed_allocated))
990 << stringify(byte_u_t(statfs.data_compressed_original))
991 ;
992 }
993 }
994 }
995
996 int64_t PGMapDigest::get_pool_free_space(const OSDMap &osd_map,
997 int64_t poolid) const
998 {
999 const pg_pool_t *pool = osd_map.get_pg_pool(poolid);
1000 int ruleno = osd_map.crush->find_rule(pool->get_crush_rule(),
1001 pool->get_type(),
1002 pool->get_size());
1003 int64_t avail;
1004 avail = get_rule_avail(ruleno);
1005 if (avail < 0)
1006 avail = 0;
1007
1008 return avail / osd_map.pool_raw_used_rate(poolid);
1009 }
1010
1011 int64_t PGMap::get_rule_avail(const OSDMap& osdmap, int ruleno) const
1012 {
1013 map<int,float> wm;
1014 int r = osdmap.crush->get_rule_weight_osd_map(ruleno, &wm);
1015 if (r < 0) {
1016 return r;
1017 }
1018 if (wm.empty()) {
1019 return 0;
1020 }
1021
1022 float fratio = osdmap.get_full_ratio();
1023
1024 int64_t min = -1;
1025 for (auto p = wm.begin(); p != wm.end(); ++p) {
1026 auto osd_info = osd_stat.find(p->first);
1027 if (osd_info != osd_stat.end()) {
1028 if (osd_info->second.statfs.total == 0 || p->second == 0) {
1029 // osd must be out, hence its stats have been zeroed
1030 // (unless we somehow managed to have a disk with size 0...)
1031 //
1032 // (p->second == 0), if osd weight is 0, no need to
1033 // calculate proj below.
1034 continue;
1035 }
1036 double unusable = (double)osd_info->second.statfs.kb() *
1037 (1.0 - fratio);
1038 double avail = std::max(0.0, (double)osd_info->second.statfs.kb_avail() - unusable);
1039 avail *= 1024.0;
1040 int64_t proj = (int64_t)(avail / (double)p->second);
1041 if (min < 0 || proj < min) {
1042 min = proj;
1043 }
1044 } else {
1045 if (osdmap.is_up(p->first)) {
1046 // This is a level 4 rather than an error, because we might have
1047 // only just started, and not received the first stats message yet.
1048 dout(4) << "OSD " << p->first << " is up, but has no stats" << dendl;
1049 }
1050 }
1051 }
1052 return min;
1053 }
1054
1055 void PGMap::get_rules_avail(const OSDMap& osdmap,
1056 std::map<int,int64_t> *avail_map) const
1057 {
1058 avail_map->clear();
1059 for (auto p : osdmap.get_pools()) {
1060 int64_t pool_id = p.first;
1061 if ((pool_id < 0) || (pg_pool_sum.count(pool_id) == 0))
1062 continue;
1063 const pg_pool_t *pool = osdmap.get_pg_pool(pool_id);
1064 int ruleno = osdmap.crush->find_rule(pool->get_crush_rule(),
1065 pool->get_type(),
1066 pool->get_size());
1067 if (avail_map->count(ruleno) == 0)
1068 (*avail_map)[ruleno] = get_rule_avail(osdmap, ruleno);
1069 }
1070 }
1071
1072 // ---------------------
1073 // PGMap
1074
1075 void PGMap::Incremental::dump(ceph::Formatter *f) const
1076 {
1077 f->dump_unsigned("version", version);
1078 f->dump_stream("stamp") << stamp;
1079 f->dump_unsigned("osdmap_epoch", osdmap_epoch);
1080 f->dump_unsigned("pg_scan_epoch", pg_scan);
1081
1082 f->open_array_section("pg_stat_updates");
1083 for (auto p = pg_stat_updates.begin(); p != pg_stat_updates.end(); ++p) {
1084 f->open_object_section("pg_stat");
1085 f->dump_stream("pgid") << p->first;
1086 p->second.dump(f);
1087 f->close_section();
1088 }
1089 f->close_section();
1090
1091 f->open_array_section("osd_stat_updates");
1092 for (auto p = osd_stat_updates.begin(); p != osd_stat_updates.end(); ++p) {
1093 f->open_object_section("osd_stat");
1094 f->dump_int("osd", p->first);
1095 p->second.dump(f);
1096 f->close_section();
1097 }
1098 f->close_section();
1099 f->open_array_section("pool_statfs_updates");
1100 for (auto p = pool_statfs_updates.begin(); p != pool_statfs_updates.end(); ++p) {
1101 f->open_object_section("pool_statfs");
1102 f->dump_stream("poolid/osd") << p->first;
1103 p->second.dump(f);
1104 f->close_section();
1105 }
1106 f->close_section();
1107
1108 f->open_array_section("osd_stat_removals");
1109 for (auto p = osd_stat_rm.begin(); p != osd_stat_rm.end(); ++p)
1110 f->dump_int("osd", *p);
1111 f->close_section();
1112
1113 f->open_array_section("pg_removals");
1114 for (auto p = pg_remove.begin(); p != pg_remove.end(); ++p)
1115 f->dump_stream("pgid") << *p;
1116 f->close_section();
1117 }
1118
1119 void PGMap::Incremental::generate_test_instances(list<PGMap::Incremental*>& o)
1120 {
1121 o.push_back(new Incremental);
1122 o.push_back(new Incremental);
1123 o.back()->version = 1;
1124 o.back()->stamp = utime_t(123,345);
1125 o.push_back(new Incremental);
1126 o.back()->version = 2;
1127 o.back()->pg_stat_updates[pg_t(1,2)] = pg_stat_t();
1128 o.back()->osd_stat_updates[5] = osd_stat_t();
1129 o.push_back(new Incremental);
1130 o.back()->version = 3;
1131 o.back()->osdmap_epoch = 1;
1132 o.back()->pg_scan = 2;
1133 o.back()->pg_stat_updates[pg_t(4,5)] = pg_stat_t();
1134 o.back()->osd_stat_updates[6] = osd_stat_t();
1135 o.back()->pg_remove.insert(pg_t(1,2));
1136 o.back()->osd_stat_rm.insert(5);
1137 o.back()->pool_statfs_updates[std::make_pair(1234,4)] = store_statfs_t();
1138 }
1139
1140 // --
1141
1142 void PGMap::apply_incremental(CephContext *cct, const Incremental& inc)
1143 {
1144 ceph_assert(inc.version == version+1);
1145 version++;
1146
1147 pool_stat_t pg_sum_old = pg_sum;
1148 mempool::pgmap::unordered_map<int32_t, pool_stat_t> pg_pool_sum_old;
1149 pg_pool_sum_old = pg_pool_sum;
1150
1151 for (auto p = inc.pg_stat_updates.begin();
1152 p != inc.pg_stat_updates.end();
1153 ++p) {
1154 const pg_t &update_pg(p->first);
1155 auto update_pool = update_pg.pool();
1156 const pg_stat_t &update_stat(p->second);
1157
1158 auto pg_stat_iter = pg_stat.find(update_pg);
1159 pool_stat_t &pool_sum_ref = pg_pool_sum[update_pool];
1160 if (pg_stat_iter == pg_stat.end()) {
1161 pg_stat.insert(make_pair(update_pg, update_stat));
1162 } else {
1163 stat_pg_sub(update_pg, pg_stat_iter->second);
1164 pool_sum_ref.sub(pg_stat_iter->second);
1165 pg_stat_iter->second = update_stat;
1166 }
1167 stat_pg_add(update_pg, update_stat);
1168 pool_sum_ref.add(update_stat);
1169 }
1170
1171 for (auto p = inc.pool_statfs_updates.begin();
1172 p != inc.pool_statfs_updates.end();
1173 ++p) {
1174 auto update_pool = p->first.first;
1175 auto update_osd = p->first.second;
1176 auto& statfs_inc = p->second;
1177
1178 auto pool_statfs_iter =
1179 pool_statfs.find(std::make_pair(update_pool, update_osd));
1180 pool_stat_t &pool_sum_ref = pg_pool_sum[update_pool];
1181 if (pool_statfs_iter == pool_statfs.end()) {
1182 pool_statfs.emplace(std::make_pair(update_pool, update_osd), statfs_inc);
1183 } else {
1184 pool_sum_ref.sub(pool_statfs_iter->second);
1185 pool_statfs_iter->second = statfs_inc;
1186 }
1187 pool_sum_ref.add(statfs_inc);
1188 }
1189
1190 for (auto p = inc.get_osd_stat_updates().begin();
1191 p != inc.get_osd_stat_updates().end();
1192 ++p) {
1193 int osd = p->first;
1194 const osd_stat_t &new_stats(p->second);
1195
1196 auto t = osd_stat.find(osd);
1197 if (t == osd_stat.end()) {
1198 osd_stat.insert(make_pair(osd, new_stats));
1199 } else {
1200 stat_osd_sub(t->first, t->second);
1201 t->second = new_stats;
1202 }
1203 stat_osd_add(osd, new_stats);
1204 }
1205 set<int64_t> deleted_pools;
1206 for (auto p = inc.pg_remove.begin();
1207 p != inc.pg_remove.end();
1208 ++p) {
1209 const pg_t &removed_pg(*p);
1210 auto s = pg_stat.find(removed_pg);
1211 bool pool_erased = false;
1212 if (s != pg_stat.end()) {
1213 pool_erased = stat_pg_sub(removed_pg, s->second);
1214 pg_stat.erase(s);
1215 if (pool_erased) {
1216 deleted_pools.insert(removed_pg.pool());
1217 }
1218 }
1219 }
1220
1221 for (auto p = inc.get_osd_stat_rm().begin();
1222 p != inc.get_osd_stat_rm().end();
1223 ++p) {
1224 auto t = osd_stat.find(*p);
1225 if (t != osd_stat.end()) {
1226 stat_osd_sub(t->first, t->second);
1227 osd_stat.erase(t);
1228 }
1229 for (auto i = pool_statfs.begin(); i != pool_statfs.end(); ++i) {
1230 if (i->first.second == *p) {
1231 pg_pool_sum[i->first.first].sub(i->second);
1232 pool_statfs.erase(i);
1233 }
1234 }
1235 }
1236
1237 // skip calculating delta while sum was not synchronized
1238 if (!stamp.is_zero() && !pg_sum_old.stats.sum.is_zero()) {
1239 utime_t delta_t;
1240 delta_t = inc.stamp;
1241 delta_t -= stamp;
1242 // calculate a delta, and average over the last 2 deltas.
1243 pool_stat_t d = pg_sum;
1244 d.stats.sub(pg_sum_old.stats);
1245 pg_sum_deltas.push_back(make_pair(d, delta_t));
1246 stamp_delta += delta_t;
1247 pg_sum_delta.stats.add(d.stats);
1248 auto smooth_intervals =
1249 cct ? cct->_conf.get_val<uint64_t>("mon_stat_smooth_intervals") : 1;
1250 while (pg_sum_deltas.size() > smooth_intervals) {
1251 pg_sum_delta.stats.sub(pg_sum_deltas.front().first.stats);
1252 stamp_delta -= pg_sum_deltas.front().second;
1253 pg_sum_deltas.pop_front();
1254 }
1255 }
1256 stamp = inc.stamp;
1257
1258 update_pool_deltas(cct, inc.stamp, pg_pool_sum_old);
1259
1260 for (auto p : deleted_pools) {
1261 if (cct)
1262 dout(20) << " deleted pool " << p << dendl;
1263 deleted_pool(p);
1264 }
1265
1266 if (inc.osdmap_epoch)
1267 last_osdmap_epoch = inc.osdmap_epoch;
1268 if (inc.pg_scan)
1269 last_pg_scan = inc.pg_scan;
1270 }
1271
1272 void PGMap::calc_stats()
1273 {
1274 num_pg = 0;
1275 num_pg_active = 0;
1276 num_pg_unknown = 0;
1277 num_osd = 0;
1278 pg_pool_sum.clear();
1279 num_pg_by_pool.clear();
1280 pg_by_osd.clear();
1281 pg_sum = pool_stat_t();
1282 osd_sum = osd_stat_t();
1283 osd_sum_by_class.clear();
1284 num_pg_by_state.clear();
1285 num_pg_by_pool_state.clear();
1286 num_pg_by_osd.clear();
1287
1288 for (auto p = pg_stat.begin();
1289 p != pg_stat.end();
1290 ++p) {
1291 auto pg = p->first;
1292 stat_pg_add(pg, p->second);
1293 pg_pool_sum[pg.pool()].add(p->second);
1294 }
1295 for (auto p = pool_statfs.begin();
1296 p != pool_statfs.end();
1297 ++p) {
1298 auto pool = p->first.first;
1299 pg_pool_sum[pool].add(p->second);
1300 }
1301 for (auto p = osd_stat.begin();
1302 p != osd_stat.end();
1303 ++p)
1304 stat_osd_add(p->first, p->second);
1305 }
1306
1307 void PGMap::stat_pg_add(const pg_t &pgid, const pg_stat_t &s,
1308 bool sameosds)
1309 {
1310 auto pool = pgid.pool();
1311 pg_sum.add(s);
1312
1313 num_pg++;
1314 num_pg_by_state[s.state]++;
1315 num_pg_by_pool_state[pgid.pool()][s.state]++;
1316 num_pg_by_pool[pool]++;
1317
1318 if ((s.state & PG_STATE_CREATING) &&
1319 s.parent_split_bits == 0) {
1320 creating_pgs.insert(pgid);
1321 if (s.acting_primary >= 0) {
1322 creating_pgs_by_osd_epoch[s.acting_primary][s.mapping_epoch].insert(pgid);
1323 }
1324 }
1325
1326 if (s.state & PG_STATE_ACTIVE) {
1327 ++num_pg_active;
1328 }
1329 if (s.state == 0) {
1330 ++num_pg_unknown;
1331 }
1332
1333 if (sameosds)
1334 return;
1335
1336 for (auto p = s.blocked_by.begin();
1337 p != s.blocked_by.end();
1338 ++p) {
1339 ++blocked_by_sum[*p];
1340 }
1341
1342 for (auto p = s.acting.begin(); p != s.acting.end(); ++p) {
1343 pg_by_osd[*p].insert(pgid);
1344 num_pg_by_osd[*p].acting++;
1345 }
1346 for (auto p = s.up.begin(); p != s.up.end(); ++p) {
1347 auto& t = pg_by_osd[*p];
1348 if (t.find(pgid) == t.end()) {
1349 t.insert(pgid);
1350 num_pg_by_osd[*p].up_not_acting++;
1351 }
1352 }
1353
1354 if (s.up_primary >= 0) {
1355 num_pg_by_osd[s.up_primary].primary++;
1356 }
1357 }
1358
1359 bool PGMap::stat_pg_sub(const pg_t &pgid, const pg_stat_t &s,
1360 bool sameosds)
1361 {
1362 bool pool_erased = false;
1363 pg_sum.sub(s);
1364
1365 num_pg--;
1366 int end = --num_pg_by_state[s.state];
1367 ceph_assert(end >= 0);
1368 if (end == 0)
1369 num_pg_by_state.erase(s.state);
1370 if (--num_pg_by_pool_state[pgid.pool()][s.state] == 0) {
1371 num_pg_by_pool_state[pgid.pool()].erase(s.state);
1372 }
1373 end = --num_pg_by_pool[pgid.pool()];
1374 if (end == 0) {
1375 pool_erased = true;
1376 }
1377
1378 if ((s.state & PG_STATE_CREATING) &&
1379 s.parent_split_bits == 0) {
1380 creating_pgs.erase(pgid);
1381 if (s.acting_primary >= 0) {
1382 map<epoch_t,set<pg_t> >& r = creating_pgs_by_osd_epoch[s.acting_primary];
1383 r[s.mapping_epoch].erase(pgid);
1384 if (r[s.mapping_epoch].empty())
1385 r.erase(s.mapping_epoch);
1386 if (r.empty())
1387 creating_pgs_by_osd_epoch.erase(s.acting_primary);
1388 }
1389 }
1390
1391 if (s.state & PG_STATE_ACTIVE) {
1392 --num_pg_active;
1393 }
1394 if (s.state == 0) {
1395 --num_pg_unknown;
1396 }
1397
1398 if (sameosds)
1399 return pool_erased;
1400
1401 for (auto p = s.blocked_by.begin();
1402 p != s.blocked_by.end();
1403 ++p) {
1404 auto q = blocked_by_sum.find(*p);
1405 ceph_assert(q != blocked_by_sum.end());
1406 --q->second;
1407 if (q->second == 0)
1408 blocked_by_sum.erase(q);
1409 }
1410
1411 set<int32_t> actingset;
1412 for (auto p = s.acting.begin(); p != s.acting.end(); ++p) {
1413 actingset.insert(*p);
1414 auto& oset = pg_by_osd[*p];
1415 oset.erase(pgid);
1416 if (oset.empty())
1417 pg_by_osd.erase(*p);
1418 auto it = num_pg_by_osd.find(*p);
1419 if (it != num_pg_by_osd.end() && it->second.acting > 0)
1420 it->second.acting--;
1421 }
1422 for (auto p = s.up.begin(); p != s.up.end(); ++p) {
1423 auto& oset = pg_by_osd[*p];
1424 oset.erase(pgid);
1425 if (oset.empty())
1426 pg_by_osd.erase(*p);
1427 if (actingset.count(*p))
1428 continue;
1429 auto it = num_pg_by_osd.find(*p);
1430 if (it != num_pg_by_osd.end() && it->second.up_not_acting > 0)
1431 it->second.up_not_acting--;
1432 }
1433
1434 if (s.up_primary >= 0) {
1435 auto it = num_pg_by_osd.find(s.up_primary);
1436 if (it != num_pg_by_osd.end() && it->second.primary > 0)
1437 it->second.primary--;
1438 }
1439 return pool_erased;
1440 }
1441
1442 void PGMap::calc_purged_snaps()
1443 {
1444 purged_snaps.clear();
1445 set<int64_t> unknown;
1446 for (auto& i : pg_stat) {
1447 if (i.second.state == 0) {
1448 unknown.insert(i.first.pool());
1449 purged_snaps.erase(i.first.pool());
1450 continue;
1451 } else if (unknown.count(i.first.pool())) {
1452 continue;
1453 }
1454 auto j = purged_snaps.find(i.first.pool());
1455 if (j == purged_snaps.end()) {
1456 // base case
1457 purged_snaps[i.first.pool()] = i.second.purged_snaps;
1458 } else {
1459 j->second.intersection_of(i.second.purged_snaps);
1460 }
1461 }
1462 }
1463
1464 void PGMap::calc_osd_sum_by_class(const OSDMap& osdmap)
1465 {
1466 osd_sum_by_class.clear();
1467 for (auto& i : osd_stat) {
1468 const char *class_name = osdmap.crush->get_item_class(i.first);
1469 if (class_name) {
1470 osd_sum_by_class[class_name].add(i.second);
1471 }
1472 }
1473 }
1474
1475 void PGMap::stat_osd_add(int osd, const osd_stat_t &s)
1476 {
1477 num_osd++;
1478 osd_sum.add(s);
1479 if (osd >= (int)osd_last_seq.size()) {
1480 osd_last_seq.resize(osd + 1);
1481 }
1482 osd_last_seq[osd] = s.seq;
1483 }
1484
1485 void PGMap::stat_osd_sub(int osd, const osd_stat_t &s)
1486 {
1487 num_osd--;
1488 osd_sum.sub(s);
1489 ceph_assert(osd < (int)osd_last_seq.size());
1490 osd_last_seq[osd] = 0;
1491 }
1492
1493 void PGMap::encode_digest(const OSDMap& osdmap,
1494 bufferlist& bl, uint64_t features)
1495 {
1496 get_rules_avail(osdmap, &avail_space_by_rule);
1497 calc_osd_sum_by_class(osdmap);
1498 calc_purged_snaps();
1499 PGMapDigest::encode(bl, features);
1500 }
1501
1502 void PGMap::encode(bufferlist &bl, uint64_t features) const
1503 {
1504 ENCODE_START(8, 8, bl);
1505 encode(version, bl);
1506 encode(pg_stat, bl);
1507 encode(osd_stat, bl, features);
1508 encode(last_osdmap_epoch, bl);
1509 encode(last_pg_scan, bl);
1510 encode(stamp, bl);
1511 encode(pool_statfs, bl, features);
1512 ENCODE_FINISH(bl);
1513 }
1514
1515 void PGMap::decode(bufferlist::const_iterator &bl)
1516 {
1517 DECODE_START(8, bl);
1518 decode(version, bl);
1519 decode(pg_stat, bl);
1520 decode(osd_stat, bl);
1521 decode(last_osdmap_epoch, bl);
1522 decode(last_pg_scan, bl);
1523 decode(stamp, bl);
1524 decode(pool_statfs, bl);
1525 DECODE_FINISH(bl);
1526
1527 calc_stats();
1528 }
1529
1530 void PGMap::dump(ceph::Formatter *f) const
1531 {
1532 dump_basic(f);
1533 dump_pg_stats(f, false);
1534 dump_pool_stats(f);
1535 dump_osd_stats(f);
1536 }
1537
1538 void PGMap::dump_basic(ceph::Formatter *f) const
1539 {
1540 f->dump_unsigned("version", version);
1541 f->dump_stream("stamp") << stamp;
1542 f->dump_unsigned("last_osdmap_epoch", last_osdmap_epoch);
1543 f->dump_unsigned("last_pg_scan", last_pg_scan);
1544
1545 f->open_object_section("pg_stats_sum");
1546 pg_sum.dump(f);
1547 f->close_section();
1548
1549 f->open_object_section("osd_stats_sum");
1550 osd_sum.dump(f);
1551 f->close_section();
1552
1553 dump_delta(f);
1554 }
1555
1556 void PGMap::dump_delta(ceph::Formatter *f) const
1557 {
1558 f->open_object_section("pg_stats_delta");
1559 pg_sum_delta.dump(f);
1560 f->dump_stream("stamp_delta") << stamp_delta;
1561 f->close_section();
1562 }
1563
1564 void PGMap::dump_pg_stats(ceph::Formatter *f, bool brief) const
1565 {
1566 f->open_array_section("pg_stats");
1567 for (auto i = pg_stat.begin();
1568 i != pg_stat.end();
1569 ++i) {
1570 f->open_object_section("pg_stat");
1571 f->dump_stream("pgid") << i->first;
1572 if (brief)
1573 i->second.dump_brief(f);
1574 else
1575 i->second.dump(f);
1576 f->close_section();
1577 }
1578 f->close_section();
1579 }
1580
1581 void PGMap::dump_pool_stats(ceph::Formatter *f) const
1582 {
1583 f->open_array_section("pool_stats");
1584 for (auto p = pg_pool_sum.begin();
1585 p != pg_pool_sum.end();
1586 ++p) {
1587 f->open_object_section("pool_stat");
1588 f->dump_int("poolid", p->first);
1589 auto q = num_pg_by_pool.find(p->first);
1590 if (q != num_pg_by_pool.end())
1591 f->dump_unsigned("num_pg", q->second);
1592 p->second.dump(f);
1593 f->close_section();
1594 }
1595 f->close_section();
1596 }
1597
1598 void PGMap::dump_osd_stats(ceph::Formatter *f) const
1599 {
1600 f->open_array_section("osd_stats");
1601 for (auto q = osd_stat.begin();
1602 q != osd_stat.end();
1603 ++q) {
1604 f->open_object_section("osd_stat");
1605 f->dump_int("osd", q->first);
1606 q->second.dump(f);
1607 f->close_section();
1608 }
1609 f->close_section();
1610 }
1611
1612 void PGMap::dump_pg_stats_plain(
1613 ostream& ss,
1614 const mempool::pgmap::unordered_map<pg_t, pg_stat_t>& pg_stats,
1615 bool brief) const
1616 {
1617 TextTable tab;
1618
1619 if (brief){
1620 tab.define_column("PG_STAT", TextTable::LEFT, TextTable::LEFT);
1621 tab.define_column("STATE", TextTable::LEFT, TextTable::RIGHT);
1622 tab.define_column("UP", TextTable::LEFT, TextTable::RIGHT);
1623 tab.define_column("UP_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1624 tab.define_column("ACTING", TextTable::LEFT, TextTable::RIGHT);
1625 tab.define_column("ACTING_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1626 }
1627 else {
1628 tab.define_column("PG_STAT", TextTable::LEFT, TextTable::LEFT);
1629 tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
1630 tab.define_column("MISSING_ON_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1631 tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT);
1632 tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT);
1633 tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT);
1634 tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT);
1635 tab.define_column("OMAP_BYTES*", TextTable::LEFT, TextTable::RIGHT);
1636 tab.define_column("OMAP_KEYS*", TextTable::LEFT, TextTable::RIGHT);
1637 tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT);
1638 tab.define_column("DISK_LOG", TextTable::LEFT, TextTable::RIGHT);
1639 tab.define_column("STATE", TextTable::LEFT, TextTable::RIGHT);
1640 tab.define_column("STATE_STAMP", TextTable::LEFT, TextTable::RIGHT);
1641 tab.define_column("VERSION", TextTable::LEFT, TextTable::RIGHT);
1642 tab.define_column("REPORTED", TextTable::LEFT, TextTable::RIGHT);
1643 tab.define_column("UP", TextTable::LEFT, TextTable::RIGHT);
1644 tab.define_column("UP_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1645 tab.define_column("ACTING", TextTable::LEFT, TextTable::RIGHT);
1646 tab.define_column("ACTING_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1647 tab.define_column("LAST_SCRUB", TextTable::LEFT, TextTable::RIGHT);
1648 tab.define_column("SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
1649 tab.define_column("LAST_DEEP_SCRUB", TextTable::LEFT, TextTable::RIGHT);
1650 tab.define_column("DEEP_SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
1651 tab.define_column("SNAPTRIMQ_LEN", TextTable::LEFT, TextTable::RIGHT);
1652 }
1653
1654 for (auto i = pg_stats.begin();
1655 i != pg_stats.end(); ++i) {
1656 const pg_stat_t &st(i->second);
1657 if (brief) {
1658 tab << i->first
1659 << pg_state_string(st.state)
1660 << st.up
1661 << st.up_primary
1662 << st.acting
1663 << st.acting_primary
1664 << TextTable::endrow;
1665 } else {
1666 ostringstream reported;
1667 reported << st.reported_epoch << ":" << st.reported_seq;
1668
1669 tab << i->first
1670 << st.stats.sum.num_objects
1671 << st.stats.sum.num_objects_missing_on_primary
1672 << st.stats.sum.num_objects_degraded
1673 << st.stats.sum.num_objects_misplaced
1674 << st.stats.sum.num_objects_unfound
1675 << st.stats.sum.num_bytes
1676 << st.stats.sum.num_omap_bytes
1677 << st.stats.sum.num_omap_keys
1678 << st.log_size
1679 << st.ondisk_log_size
1680 << pg_state_string(st.state)
1681 << st.last_change
1682 << st.version
1683 << reported.str()
1684 << pg_vector_string(st.up)
1685 << st.up_primary
1686 << pg_vector_string(st.acting)
1687 << st.acting_primary
1688 << st.last_scrub
1689 << st.last_scrub_stamp
1690 << st.last_deep_scrub
1691 << st.last_deep_scrub_stamp
1692 << st.snaptrimq_len
1693 << TextTable::endrow;
1694 }
1695 }
1696
1697 ss << tab;
1698 }
1699
1700 void PGMap::dump(ostream& ss) const
1701 {
1702 dump_basic(ss);
1703 dump_pg_stats(ss, false);
1704 dump_pool_stats(ss, false);
1705 dump_pg_sum_stats(ss, false);
1706 dump_osd_stats(ss);
1707 }
1708
1709 void PGMap::dump_basic(ostream& ss) const
1710 {
1711 ss << "version " << version << std::endl;
1712 ss << "stamp " << stamp << std::endl;
1713 ss << "last_osdmap_epoch " << last_osdmap_epoch << std::endl;
1714 ss << "last_pg_scan " << last_pg_scan << std::endl;
1715 }
1716
1717 void PGMap::dump_pg_stats(ostream& ss, bool brief) const
1718 {
1719 dump_pg_stats_plain(ss, pg_stat, brief);
1720 }
1721
1722 void PGMap::dump_pool_stats(ostream& ss, bool header) const
1723 {
1724 TextTable tab;
1725
1726 if (header) {
1727 tab.define_column("POOLID", TextTable::LEFT, TextTable::LEFT);
1728 tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
1729 tab.define_column("MISSING_ON_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1730 tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT);
1731 tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT);
1732 tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT);
1733 tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT);
1734 tab.define_column("OMAP_BYTES*", TextTable::LEFT, TextTable::RIGHT);
1735 tab.define_column("OMAP_KEYS*", TextTable::LEFT, TextTable::RIGHT);
1736 tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT);
1737 tab.define_column("DISK_LOG", TextTable::LEFT, TextTable::RIGHT);
1738 } else {
1739 tab.define_column("", TextTable::LEFT, TextTable::LEFT);
1740 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1741 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1742 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1743 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1744 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1745 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1746 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1747 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1748 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1749 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1750 }
1751
1752 for (auto p = pg_pool_sum.begin();
1753 p != pg_pool_sum.end();
1754 ++p) {
1755 tab << p->first
1756 << p->second.stats.sum.num_objects
1757 << p->second.stats.sum.num_objects_missing_on_primary
1758 << p->second.stats.sum.num_objects_degraded
1759 << p->second.stats.sum.num_objects_misplaced
1760 << p->second.stats.sum.num_objects_unfound
1761 << p->second.stats.sum.num_bytes
1762 << p->second.stats.sum.num_omap_bytes
1763 << p->second.stats.sum.num_omap_keys
1764 << p->second.log_size
1765 << p->second.ondisk_log_size
1766 << TextTable::endrow;
1767 }
1768
1769 ss << tab;
1770 }
1771
1772 void PGMap::dump_pg_sum_stats(ostream& ss, bool header) const
1773 {
1774 TextTable tab;
1775
1776 if (header) {
1777 tab.define_column("PG_STAT", TextTable::LEFT, TextTable::LEFT);
1778 tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
1779 tab.define_column("MISSING_ON_PRIMARY", TextTable::LEFT, TextTable::RIGHT);
1780 tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT);
1781 tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT);
1782 tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT);
1783 tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT);
1784 tab.define_column("OMAP_BYTES*", TextTable::LEFT, TextTable::RIGHT);
1785 tab.define_column("OMAP_KEYS*", TextTable::LEFT, TextTable::RIGHT);
1786 tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT);
1787 tab.define_column("DISK_LOG", TextTable::LEFT, TextTable::RIGHT);
1788 } else {
1789 tab.define_column("", TextTable::LEFT, TextTable::LEFT);
1790 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1791 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1792 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1793 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1794 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1795 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1796 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1797 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1798 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1799 tab.define_column("", TextTable::LEFT, TextTable::RIGHT);
1800 };
1801
1802 tab << "sum"
1803 << pg_sum.stats.sum.num_objects
1804 << pg_sum.stats.sum.num_objects_missing_on_primary
1805 << pg_sum.stats.sum.num_objects_degraded
1806 << pg_sum.stats.sum.num_objects_misplaced
1807 << pg_sum.stats.sum.num_objects_unfound
1808 << pg_sum.stats.sum.num_bytes
1809 << pg_sum.stats.sum.num_omap_bytes
1810 << pg_sum.stats.sum.num_omap_keys
1811 << pg_sum.log_size
1812 << pg_sum.ondisk_log_size
1813 << TextTable::endrow;
1814
1815 ss << tab;
1816 }
1817
1818 void PGMap::dump_osd_stats(ostream& ss) const
1819 {
1820 TextTable tab;
1821
1822 tab.define_column("OSD_STAT", TextTable::LEFT, TextTable::LEFT);
1823 tab.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
1824 tab.define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
1825 tab.define_column("USED_RAW", TextTable::LEFT, TextTable::RIGHT);
1826 tab.define_column("TOTAL", TextTable::LEFT, TextTable::RIGHT);
1827 tab.define_column("HB_PEERS", TextTable::LEFT, TextTable::RIGHT);
1828 tab.define_column("PG_SUM", TextTable::LEFT, TextTable::RIGHT);
1829 tab.define_column("PRIMARY_PG_SUM", TextTable::LEFT, TextTable::RIGHT);
1830
1831 for (auto p = osd_stat.begin();
1832 p != osd_stat.end();
1833 ++p) {
1834 tab << p->first
1835 << byte_u_t(p->second.statfs.get_used())
1836 << byte_u_t(p->second.statfs.available)
1837 << byte_u_t(p->second.statfs.get_used_raw())
1838 << byte_u_t(p->second.statfs.total)
1839 << p->second.hb_peers
1840 << get_num_pg_by_osd(p->first)
1841 << get_num_primary_pg_by_osd(p->first)
1842 << TextTable::endrow;
1843 }
1844
1845 tab << "sum"
1846 << byte_u_t(osd_sum.statfs.get_used())
1847 << byte_u_t(osd_sum.statfs.available)
1848 << byte_u_t(osd_sum.statfs.get_used_raw())
1849 << byte_u_t(osd_sum.statfs.total)
1850 << TextTable::endrow;
1851
1852 ss << tab;
1853 }
1854
1855 void PGMap::dump_osd_sum_stats(ostream& ss) const
1856 {
1857 TextTable tab;
1858
1859 tab.define_column("OSD_STAT", TextTable::LEFT, TextTable::LEFT);
1860 tab.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
1861 tab.define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
1862 tab.define_column("USED_RAW", TextTable::LEFT, TextTable::RIGHT);
1863 tab.define_column("TOTAL", TextTable::LEFT, TextTable::RIGHT);
1864
1865 tab << "sum"
1866 << byte_u_t(osd_sum.statfs.get_used())
1867 << byte_u_t(osd_sum.statfs.available)
1868 << byte_u_t(osd_sum.statfs.get_used_raw())
1869 << byte_u_t(osd_sum.statfs.total)
1870 << TextTable::endrow;
1871
1872 ss << tab;
1873 }
1874
1875 void PGMap::get_stuck_stats(
1876 int types, const utime_t cutoff,
1877 mempool::pgmap::unordered_map<pg_t, pg_stat_t>& stuck_pgs) const
1878 {
1879 ceph_assert(types != 0);
1880 for (auto i = pg_stat.begin();
1881 i != pg_stat.end();
1882 ++i) {
1883 utime_t val = cutoff; // don't care about >= cutoff so that is infinity
1884
1885 if ((types & STUCK_INACTIVE) && !(i->second.state & PG_STATE_ACTIVE)) {
1886 if (i->second.last_active < val)
1887 val = i->second.last_active;
1888 }
1889
1890 if ((types & STUCK_UNCLEAN) && !(i->second.state & PG_STATE_CLEAN)) {
1891 if (i->second.last_clean < val)
1892 val = i->second.last_clean;
1893 }
1894
1895 if ((types & STUCK_DEGRADED) && (i->second.state & PG_STATE_DEGRADED)) {
1896 if (i->second.last_undegraded < val)
1897 val = i->second.last_undegraded;
1898 }
1899
1900 if ((types & STUCK_UNDERSIZED) && (i->second.state & PG_STATE_UNDERSIZED)) {
1901 if (i->second.last_fullsized < val)
1902 val = i->second.last_fullsized;
1903 }
1904
1905 if ((types & STUCK_STALE) && (i->second.state & PG_STATE_STALE)) {
1906 if (i->second.last_unstale < val)
1907 val = i->second.last_unstale;
1908 }
1909
1910 // val is now the earliest any of the requested stuck states began
1911 if (val < cutoff) {
1912 stuck_pgs[i->first] = i->second;
1913 }
1914 }
1915 }
1916
1917 bool PGMap::get_stuck_counts(const utime_t cutoff, map<string, int>& note) const
1918 {
1919 int inactive = 0;
1920 int unclean = 0;
1921 int degraded = 0;
1922 int undersized = 0;
1923 int stale = 0;
1924
1925 for (auto i = pg_stat.begin();
1926 i != pg_stat.end();
1927 ++i) {
1928 if (! (i->second.state & PG_STATE_ACTIVE)) {
1929 if (i->second.last_active < cutoff)
1930 ++inactive;
1931 }
1932 if (! (i->second.state & PG_STATE_CLEAN)) {
1933 if (i->second.last_clean < cutoff)
1934 ++unclean;
1935 }
1936 if (i->second.state & PG_STATE_DEGRADED) {
1937 if (i->second.last_undegraded < cutoff)
1938 ++degraded;
1939 }
1940 if (i->second.state & PG_STATE_UNDERSIZED) {
1941 if (i->second.last_fullsized < cutoff)
1942 ++undersized;
1943 }
1944 if (i->second.state & PG_STATE_STALE) {
1945 if (i->second.last_unstale < cutoff)
1946 ++stale;
1947 }
1948 }
1949
1950 if (inactive)
1951 note["stuck inactive"] = inactive;
1952
1953 if (unclean)
1954 note["stuck unclean"] = unclean;
1955
1956 if (undersized)
1957 note["stuck undersized"] = undersized;
1958
1959 if (degraded)
1960 note["stuck degraded"] = degraded;
1961
1962 if (stale)
1963 note["stuck stale"] = stale;
1964
1965 return inactive || unclean || undersized || degraded || stale;
1966 }
1967
1968 void PGMap::dump_stuck(ceph::Formatter *f, int types, utime_t cutoff) const
1969 {
1970 mempool::pgmap::unordered_map<pg_t, pg_stat_t> stuck_pg_stats;
1971 get_stuck_stats(types, cutoff, stuck_pg_stats);
1972 f->open_array_section("stuck_pg_stats");
1973 for (auto i = stuck_pg_stats.begin();
1974 i != stuck_pg_stats.end();
1975 ++i) {
1976 f->open_object_section("pg_stat");
1977 f->dump_stream("pgid") << i->first;
1978 i->second.dump(f);
1979 f->close_section();
1980 }
1981 f->close_section();
1982 }
1983
1984 void PGMap::dump_stuck_plain(ostream& ss, int types, utime_t cutoff) const
1985 {
1986 mempool::pgmap::unordered_map<pg_t, pg_stat_t> stuck_pg_stats;
1987 get_stuck_stats(types, cutoff, stuck_pg_stats);
1988 if (!stuck_pg_stats.empty())
1989 dump_pg_stats_plain(ss, stuck_pg_stats, true);
1990 }
1991
1992 int PGMap::dump_stuck_pg_stats(
1993 stringstream &ds,
1994 ceph::Formatter *f,
1995 int threshold,
1996 vector<string>& args) const
1997 {
1998 int stuck_types = 0;
1999
2000 for (auto i = args.begin(); i != args.end(); ++i) {
2001 if (*i == "inactive")
2002 stuck_types |= PGMap::STUCK_INACTIVE;
2003 else if (*i == "unclean")
2004 stuck_types |= PGMap::STUCK_UNCLEAN;
2005 else if (*i == "undersized")
2006 stuck_types |= PGMap::STUCK_UNDERSIZED;
2007 else if (*i == "degraded")
2008 stuck_types |= PGMap::STUCK_DEGRADED;
2009 else if (*i == "stale")
2010 stuck_types |= PGMap::STUCK_STALE;
2011 else {
2012 ds << "Unknown type: " << *i << std::endl;
2013 return -EINVAL;
2014 }
2015 }
2016
2017 utime_t now(ceph_clock_now());
2018 utime_t cutoff = now - utime_t(threshold, 0);
2019
2020 if (!f) {
2021 dump_stuck_plain(ds, stuck_types, cutoff);
2022 } else {
2023 dump_stuck(f, stuck_types, cutoff);
2024 f->flush(ds);
2025 }
2026
2027 return 0;
2028 }
2029
2030 void PGMap::dump_osd_perf_stats(ceph::Formatter *f) const
2031 {
2032 f->open_array_section("osd_perf_infos");
2033 for (auto i = osd_stat.begin();
2034 i != osd_stat.end();
2035 ++i) {
2036 f->open_object_section("osd");
2037 f->dump_int("id", i->first);
2038 {
2039 f->open_object_section("perf_stats");
2040 i->second.os_perf_stat.dump(f);
2041 f->close_section();
2042 }
2043 f->close_section();
2044 }
2045 f->close_section();
2046 }
2047 void PGMap::print_osd_perf_stats(std::ostream *ss) const
2048 {
2049 TextTable tab;
2050 tab.define_column("osd", TextTable::LEFT, TextTable::RIGHT);
2051 tab.define_column("commit_latency(ms)", TextTable::LEFT, TextTable::RIGHT);
2052 tab.define_column("apply_latency(ms)", TextTable::LEFT, TextTable::RIGHT);
2053 for (auto i = osd_stat.begin();
2054 i != osd_stat.end();
2055 ++i) {
2056 tab << i->first;
2057 tab << i->second.os_perf_stat.os_commit_latency_ns / 1000000ull;
2058 tab << i->second.os_perf_stat.os_apply_latency_ns / 1000000ull;
2059 tab << TextTable::endrow;
2060 }
2061 (*ss) << tab;
2062 }
2063
2064 void PGMap::dump_osd_blocked_by_stats(ceph::Formatter *f) const
2065 {
2066 f->open_array_section("osd_blocked_by_infos");
2067 for (auto i = blocked_by_sum.begin();
2068 i != blocked_by_sum.end();
2069 ++i) {
2070 f->open_object_section("osd");
2071 f->dump_int("id", i->first);
2072 f->dump_int("num_blocked", i->second);
2073 f->close_section();
2074 }
2075 f->close_section();
2076 }
2077 void PGMap::print_osd_blocked_by_stats(std::ostream *ss) const
2078 {
2079 TextTable tab;
2080 tab.define_column("osd", TextTable::LEFT, TextTable::RIGHT);
2081 tab.define_column("num_blocked", TextTable::LEFT, TextTable::RIGHT);
2082 for (auto i = blocked_by_sum.begin();
2083 i != blocked_by_sum.end();
2084 ++i) {
2085 tab << i->first;
2086 tab << i->second;
2087 tab << TextTable::endrow;
2088 }
2089 (*ss) << tab;
2090 }
2091
2092
2093 /**
2094 * update aggregated delta
2095 *
2096 * @param cct ceph context
2097 * @param ts Timestamp for the stats being delta'ed
2098 * @param old_pool_sum Previous stats sum
2099 * @param last_ts Last timestamp for pool
2100 * @param result_pool_sum Resulting stats
2101 * @param result_pool_delta Resulting pool delta
2102 * @param result_ts_delta Resulting timestamp delta
2103 * @param delta_avg_list List of last N computed deltas, used to average
2104 */
2105 void PGMap::update_delta(
2106 CephContext *cct,
2107 const utime_t ts,
2108 const pool_stat_t& old_pool_sum,
2109 utime_t *last_ts,
2110 const pool_stat_t& current_pool_sum,
2111 pool_stat_t *result_pool_delta,
2112 utime_t *result_ts_delta,
2113 mempool::pgmap::list<pair<pool_stat_t,utime_t> > *delta_avg_list)
2114 {
2115 /* @p ts is the timestamp we want to associate with the data
2116 * in @p old_pool_sum, and on which we will base ourselves to
2117 * calculate the delta, stored in 'delta_t'.
2118 */
2119 utime_t delta_t;
2120 delta_t = ts; // start with the provided timestamp
2121 delta_t -= *last_ts; // take the last timestamp we saw
2122 *last_ts = ts; // @p ts becomes the last timestamp we saw
2123
2124 // adjust delta_t, quick start if there is no update in a long period
2125 delta_t = std::min(delta_t,
2126 utime_t(2 * (cct ? cct->_conf->mon_delta_reset_interval : 10), 0));
2127
2128 // calculate a delta, and average over the last 6 deltas by default.
2129 /* start by taking a copy of our current @p result_pool_sum, and by
2130 * taking out the stats from @p old_pool_sum. This generates a stats
2131 * delta. Stash this stats delta in @p delta_avg_list, along with the
2132 * timestamp delta for these results.
2133 */
2134 pool_stat_t d = current_pool_sum;
2135 d.stats.sub(old_pool_sum.stats);
2136
2137 /* Aggregate current delta, and take out the last seen delta (if any) to
2138 * average it out.
2139 * Skip calculating delta while sum was not synchronized.
2140 */
2141 if(!old_pool_sum.stats.sum.is_zero()) {
2142 delta_avg_list->push_back(make_pair(d,delta_t));
2143 *result_ts_delta += delta_t;
2144 result_pool_delta->stats.add(d.stats);
2145 }
2146 size_t s = cct ? cct->_conf.get_val<uint64_t>("mon_stat_smooth_intervals") : 1;
2147 while (delta_avg_list->size() > s) {
2148 result_pool_delta->stats.sub(delta_avg_list->front().first.stats);
2149 *result_ts_delta -= delta_avg_list->front().second;
2150 delta_avg_list->pop_front();
2151 }
2152 }
2153
2154 /**
2155 * Update a given pool's deltas
2156 *
2157 * @param cct Ceph Context
2158 * @param ts Timestamp for the stats being delta'ed
2159 * @param pool Pool's id
2160 * @param old_pool_sum Previous stats sum
2161 */
2162 void PGMap::update_one_pool_delta(
2163 CephContext *cct,
2164 const utime_t ts,
2165 const int64_t pool,
2166 const pool_stat_t& old_pool_sum)
2167 {
2168 if (per_pool_sum_deltas.count(pool) == 0) {
2169 ceph_assert(per_pool_sum_deltas_stamps.count(pool) == 0);
2170 ceph_assert(per_pool_sum_delta.count(pool) == 0);
2171 }
2172
2173 auto& sum_delta = per_pool_sum_delta[pool];
2174
2175 update_delta(cct, ts, old_pool_sum, &sum_delta.second, pg_pool_sum[pool],
2176 &sum_delta.first, &per_pool_sum_deltas_stamps[pool],
2177 &per_pool_sum_deltas[pool]);
2178 }
2179
2180 /**
2181 * Update pools' deltas
2182 *
2183 * @param cct CephContext
2184 * @param ts Timestamp for the stats being delta'ed
2185 * @param pg_pool_sum_old Map of pool stats for delta calcs.
2186 */
2187 void PGMap::update_pool_deltas(
2188 CephContext *cct, const utime_t ts,
2189 const mempool::pgmap::unordered_map<int32_t,pool_stat_t>& pg_pool_sum_old)
2190 {
2191 for (auto it = pg_pool_sum_old.begin();
2192 it != pg_pool_sum_old.end(); ++it) {
2193 update_one_pool_delta(cct, ts, it->first, it->second);
2194 }
2195 }
2196
2197 void PGMap::clear_delta()
2198 {
2199 pg_sum_delta = pool_stat_t();
2200 pg_sum_deltas.clear();
2201 stamp_delta = utime_t();
2202 }
2203
2204 void PGMap::generate_test_instances(list<PGMap*>& o)
2205 {
2206 o.push_back(new PGMap);
2207 list<Incremental*> inc;
2208 Incremental::generate_test_instances(inc);
2209 delete inc.front();
2210 inc.pop_front();
2211 while (!inc.empty()) {
2212 PGMap *pmp = new PGMap();
2213 *pmp = *o.back();
2214 o.push_back(pmp);
2215 o.back()->apply_incremental(NULL, *inc.front());
2216 delete inc.front();
2217 inc.pop_front();
2218 }
2219 }
2220
2221 void PGMap::get_filtered_pg_stats(uint64_t state, int64_t poolid, int64_t osdid,
2222 bool primary, set<pg_t>& pgs) const
2223 {
2224 for (auto i = pg_stat.begin();
2225 i != pg_stat.end();
2226 ++i) {
2227 if ((poolid >= 0) && (poolid != i->first.pool()))
2228 continue;
2229 if ((osdid >= 0) && !(i->second.is_acting_osd(osdid,primary)))
2230 continue;
2231 if (state == (uint64_t)-1 || // "all"
2232 (i->second.state & state) || // matches a state bit
2233 (state == 0 && i->second.state == 0)) { // matches "unknown" (== 0)
2234 pgs.insert(i->first);
2235 }
2236 }
2237 }
2238
2239 void PGMap::dump_filtered_pg_stats(ceph::Formatter *f, set<pg_t>& pgs) const
2240 {
2241 f->open_array_section("pg_stats");
2242 for (auto i = pgs.begin(); i != pgs.end(); ++i) {
2243 const pg_stat_t& st = pg_stat.at(*i);
2244 f->open_object_section("pg_stat");
2245 f->dump_stream("pgid") << *i;
2246 st.dump(f);
2247 f->close_section();
2248 }
2249 f->close_section();
2250 }
2251
2252 void PGMap::dump_filtered_pg_stats(ostream& ss, set<pg_t>& pgs) const
2253 {
2254 TextTable tab;
2255 utime_t now = ceph_clock_now();
2256
2257 tab.define_column("PG", TextTable::LEFT, TextTable::LEFT);
2258 tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
2259 tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT);
2260 tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT);
2261 tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT);
2262 tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT);
2263 tab.define_column("OMAP_BYTES*", TextTable::LEFT, TextTable::RIGHT);
2264 tab.define_column("OMAP_KEYS*", TextTable::LEFT, TextTable::RIGHT);
2265 tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT);
2266 tab.define_column("STATE", TextTable::LEFT, TextTable::RIGHT);
2267 tab.define_column("SINCE", TextTable::LEFT, TextTable::RIGHT);
2268 tab.define_column("VERSION", TextTable::LEFT, TextTable::RIGHT);
2269 tab.define_column("REPORTED", TextTable::LEFT, TextTable::RIGHT);
2270 tab.define_column("UP", TextTable::LEFT, TextTable::RIGHT);
2271 tab.define_column("ACTING", TextTable::LEFT, TextTable::RIGHT);
2272 tab.define_column("SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
2273 tab.define_column("DEEP_SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
2274
2275 for (auto i = pgs.begin(); i != pgs.end(); ++i) {
2276 const pg_stat_t& st = pg_stat.at(*i);
2277
2278 ostringstream reported;
2279 reported << st.reported_epoch << ":" << st.reported_seq;
2280
2281 ostringstream upstr, actingstr;
2282 upstr << st.up << 'p' << st.up_primary;
2283 actingstr << st.acting << 'p' << st.acting_primary;
2284 tab << *i
2285 << st.stats.sum.num_objects
2286 << st.stats.sum.num_objects_degraded
2287 << st.stats.sum.num_objects_misplaced
2288 << st.stats.sum.num_objects_unfound
2289 << st.stats.sum.num_bytes
2290 << st.stats.sum.num_omap_bytes
2291 << st.stats.sum.num_omap_keys
2292 << st.log_size
2293 << pg_state_string(st.state)
2294 << utimespan_str(now - st.last_change)
2295 << st.version
2296 << reported.str()
2297 << upstr.str()
2298 << actingstr.str()
2299 << st.last_scrub_stamp
2300 << st.last_deep_scrub_stamp
2301 << TextTable::endrow;
2302 }
2303
2304 ss << tab;
2305 }
2306
2307 void PGMap::dump_pool_stats_and_io_rate(int64_t poolid, const OSDMap &osd_map,
2308 ceph::Formatter *f,
2309 stringstream *rs) const {
2310 string pool_name = osd_map.get_pool_name(poolid);
2311 if (f) {
2312 f->open_object_section("pool");
2313 f->dump_string("pool_name", pool_name.c_str());
2314 f->dump_int("pool_id", poolid);
2315 f->open_object_section("recovery");
2316 }
2317 list<string> sl;
2318 stringstream tss;
2319 pool_recovery_summary(f, &sl, poolid);
2320 if (!f && !sl.empty()) {
2321 for (auto &p : sl)
2322 tss << " " << p << "\n";
2323 }
2324 if (f) {
2325 f->close_section(); // object section recovery
2326 f->open_object_section("recovery_rate");
2327 }
2328 ostringstream rss;
2329 pool_recovery_rate_summary(f, &rss, poolid);
2330 if (!f && !rss.str().empty())
2331 tss << " recovery io " << rss.str() << "\n";
2332 if (f) {
2333 f->close_section(); // object section recovery_rate
2334 f->open_object_section("client_io_rate");
2335 }
2336 rss.clear();
2337 rss.str("");
2338 pool_client_io_rate_summary(f, &rss, poolid);
2339 if (!f && !rss.str().empty())
2340 tss << " client io " << rss.str() << "\n";
2341 // dump cache tier IO rate for cache pool
2342 const pg_pool_t *pool = osd_map.get_pg_pool(poolid);
2343 if (pool->is_tier()) {
2344 if (f) {
2345 f->close_section(); // object section client_io_rate
2346 f->open_object_section("cache_io_rate");
2347 }
2348 rss.clear();
2349 rss.str("");
2350 pool_cache_io_rate_summary(f, &rss, poolid);
2351 if (!f && !rss.str().empty())
2352 tss << " cache tier io " << rss.str() << "\n";
2353 }
2354 if (f) {
2355 f->close_section(); // object section cache_io_rate
2356 f->close_section(); // object section pool
2357 } else {
2358 *rs << "pool " << pool_name << " id " << poolid << "\n";
2359 if (!tss.str().empty())
2360 *rs << tss.str() << "\n";
2361 else
2362 *rs << " nothing is going on\n\n";
2363 }
2364 }
2365
2366 void PGMap::get_health_checks(
2367 CephContext *cct,
2368 const OSDMap& osdmap,
2369 health_check_map_t *checks) const
2370 {
2371 utime_t now = ceph_clock_now();
2372 const auto max = cct->_conf.get_val<uint64_t>("mon_health_max_detail");
2373 const auto& pools = osdmap.get_pools();
2374
2375 typedef enum pg_consequence_t {
2376 UNAVAILABLE = 1, // Client IO to the pool may block
2377 DEGRADED = 2, // Fewer than the requested number of replicas are present
2378 BACKFILL_FULL = 3, // Backfill is blocked for space considerations
2379 // This may or may not be a deadlock condition.
2380 DAMAGED = 4, // The data may be missing or inconsistent on disk and
2381 // requires repair
2382 RECOVERY_FULL = 5 // Recovery is blocked because OSDs are full
2383 } pg_consequence_t;
2384
2385 // For a given PG state, how should it be reported at the pool level?
2386 class PgStateResponse {
2387 public:
2388 pg_consequence_t consequence;
2389 typedef std::function< utime_t(const pg_stat_t&) > stuck_cb;
2390 stuck_cb stuck_since;
2391 bool invert;
2392
2393 PgStateResponse(const pg_consequence_t& c, stuck_cb&& s)
2394 : consequence(c), stuck_since(std::move(s)), invert(false)
2395 {
2396 }
2397
2398 PgStateResponse(const pg_consequence_t& c, stuck_cb&& s, bool i)
2399 : consequence(c), stuck_since(std::move(s)), invert(i)
2400 {
2401 }
2402 };
2403
2404 // Record the PG state counts that contributed to a reported pool state
2405 class PgCauses {
2406 public:
2407 // Map of PG_STATE_* to number of pgs in that state.
2408 std::map<unsigned, unsigned> states;
2409
2410 // List of all PG IDs that had a state contributing
2411 // to this health condition.
2412 std::set<pg_t> pgs;
2413
2414 std::map<pg_t, std::string> pg_messages;
2415 };
2416
2417 // Map of PG state to how to respond to it
2418 std::map<unsigned, PgStateResponse> state_to_response = {
2419 // Immediate reports
2420 { PG_STATE_INCONSISTENT, {DAMAGED, {}} },
2421 { PG_STATE_INCOMPLETE, {UNAVAILABLE, {}} },
2422 { PG_STATE_SNAPTRIM_ERROR, {DAMAGED, {}} },
2423 { PG_STATE_RECOVERY_UNFOUND, {DAMAGED, {}} },
2424 { PG_STATE_BACKFILL_UNFOUND, {DAMAGED, {}} },
2425 { PG_STATE_BACKFILL_TOOFULL, {BACKFILL_FULL, {}} },
2426 { PG_STATE_RECOVERY_TOOFULL, {RECOVERY_FULL, {}} },
2427 { PG_STATE_DEGRADED, {DEGRADED, {}} },
2428 { PG_STATE_DOWN, {UNAVAILABLE, {}} },
2429 // Delayed (wait until stuck) reports
2430 { PG_STATE_PEERING, {UNAVAILABLE, [](const pg_stat_t &p){return p.last_peered;} } },
2431 { PG_STATE_UNDERSIZED, {DEGRADED, [](const pg_stat_t &p){return p.last_fullsized;} } },
2432 { PG_STATE_STALE, {UNAVAILABLE, [](const pg_stat_t &p){return p.last_unstale;} } },
2433 // Delayed and inverted reports
2434 { PG_STATE_ACTIVE, {UNAVAILABLE, [](const pg_stat_t &p){return p.last_active;}, true} }
2435 };
2436
2437 // Specialized state printer that takes account of inversion of
2438 // ACTIVE, CLEAN checks.
2439 auto state_name = [](const uint64_t &state) {
2440 // Special cases for the states that are inverted checks
2441 if (state == PG_STATE_CLEAN) {
2442 return std::string("unclean");
2443 } else if (state == PG_STATE_ACTIVE) {
2444 return std::string("inactive");
2445 } else {
2446 return pg_state_string(state);
2447 }
2448 };
2449
2450 // Map of what is wrong to information about why, implicitly also stores
2451 // the list of what is wrong.
2452 std::map<pg_consequence_t, PgCauses> detected;
2453
2454 // Optimisation: trim down the number of checks to apply based on
2455 // the summary counters
2456 std::map<unsigned, PgStateResponse> possible_responses;
2457 for (const auto &i : num_pg_by_state) {
2458 for (const auto &j : state_to_response) {
2459 if (!j.second.invert) {
2460 // Check for normal tests by seeing if any pgs have the flag
2461 if (i.first & j.first) {
2462 possible_responses.insert(j);
2463 }
2464 }
2465 }
2466 }
2467
2468 for (const auto &j : state_to_response) {
2469 if (j.second.invert) {
2470 // Check for inverted tests by seeing if not-all pgs have the flag
2471 const auto &found = num_pg_by_state.find(j.first);
2472 if (found == num_pg_by_state.end() || found->second != num_pg) {
2473 possible_responses.insert(j);
2474 }
2475 }
2476 }
2477
2478 utime_t cutoff = now - utime_t(cct->_conf.get_val<int64_t>("mon_pg_stuck_threshold"), 0);
2479 // Loop over all PGs, if there are any possibly-unhealthy states in there
2480 if (!possible_responses.empty()) {
2481 for (const auto& i : pg_stat) {
2482 const auto &pg_id = i.first;
2483 const auto &pg_info = i.second;
2484
2485 for (const auto &j : state_to_response) {
2486 const auto &pg_response_state = j.first;
2487 const auto &pg_response = j.second;
2488
2489 // Apply the state test
2490 if (!(bool(pg_info.state & pg_response_state) != pg_response.invert)) {
2491 continue;
2492 }
2493
2494 // Apply stuckness test if needed
2495 if (pg_response.stuck_since) {
2496 // Delayed response, check for stuckness
2497 utime_t last_whatever = pg_response.stuck_since(pg_info);
2498 if (last_whatever >= cutoff) {
2499 // Not stuck enough, ignore.
2500 continue;
2501 } else {
2502
2503 }
2504 }
2505
2506 auto &causes = detected[pg_response.consequence];
2507 causes.states[pg_response_state]++;
2508 causes.pgs.insert(pg_id);
2509
2510 // Don't bother composing detail string if we have already recorded
2511 // too many
2512 if (causes.pg_messages.size() > max) {
2513 continue;
2514 }
2515
2516 std::ostringstream ss;
2517 if (pg_response.stuck_since) {
2518 utime_t since = pg_response.stuck_since(pg_info);
2519 ss << "pg " << pg_id << " is stuck " << state_name(pg_response_state);
2520 if (since == utime_t()) {
2521 ss << " since forever";
2522 } else {
2523 utime_t dur = now - since;
2524 ss << " for " << utimespan_str(dur);
2525 }
2526 ss << ", current state " << pg_state_string(pg_info.state)
2527 << ", last acting " << pg_info.acting;
2528 } else {
2529 ss << "pg " << pg_id << " is "
2530 << pg_state_string(pg_info.state);
2531 ss << ", acting " << pg_info.acting;
2532 if (pg_info.stats.sum.num_objects_unfound) {
2533 ss << ", " << pg_info.stats.sum.num_objects_unfound
2534 << " unfound";
2535 }
2536 }
2537
2538 if (pg_info.state & PG_STATE_INCOMPLETE) {
2539 const pg_pool_t *pi = osdmap.get_pg_pool(pg_id.pool());
2540 if (pi && pi->min_size > 1) {
2541 ss << " (reducing pool "
2542 << osdmap.get_pool_name(pg_id.pool())
2543 << " min_size from " << (int)pi->min_size
2544 << " may help; search ceph.com/docs for 'incomplete')";
2545 }
2546 }
2547
2548 causes.pg_messages[pg_id] = ss.str();
2549 }
2550 }
2551 } else {
2552 dout(10) << __func__ << " skipping loop over PGs: counters look OK" << dendl;
2553 }
2554
2555 for (const auto &i : detected) {
2556 std::string health_code;
2557 health_status_t sev;
2558 std::string summary;
2559 switch(i.first) {
2560 case UNAVAILABLE:
2561 health_code = "PG_AVAILABILITY";
2562 sev = HEALTH_WARN;
2563 summary = "Reduced data availability: ";
2564 break;
2565 case DEGRADED:
2566 health_code = "PG_DEGRADED";
2567 summary = "Degraded data redundancy: ";
2568 sev = HEALTH_WARN;
2569 break;
2570 case BACKFILL_FULL:
2571 health_code = "PG_BACKFILL_FULL";
2572 summary = "Low space hindering backfill (add storage if this doesn't resolve itself): ";
2573 sev = HEALTH_WARN;
2574 break;
2575 case DAMAGED:
2576 health_code = "PG_DAMAGED";
2577 summary = "Possible data damage: ";
2578 sev = HEALTH_ERR;
2579 break;
2580 case RECOVERY_FULL:
2581 health_code = "PG_RECOVERY_FULL";
2582 summary = "Full OSDs blocking recovery: ";
2583 sev = HEALTH_ERR;
2584 break;
2585 default:
2586 ceph_abort();
2587 }
2588
2589 if (i.first == DEGRADED) {
2590 if (pg_sum.stats.sum.num_objects_degraded &&
2591 pg_sum.stats.sum.num_object_copies > 0) {
2592 double pc = (double)pg_sum.stats.sum.num_objects_degraded /
2593 (double)pg_sum.stats.sum.num_object_copies * (double)100.0;
2594 char b[20];
2595 snprintf(b, sizeof(b), "%.3lf", pc);
2596 ostringstream ss;
2597 ss << pg_sum.stats.sum.num_objects_degraded
2598 << "/" << pg_sum.stats.sum.num_object_copies << " objects degraded ("
2599 << b << "%)";
2600
2601 // Throw in a comma for the benefit of the following PG counts
2602 summary += ss.str() + ", ";
2603 }
2604 }
2605
2606 // Compose summary message saying how many PGs in what states led
2607 // to this health check failing
2608 std::vector<std::string> pg_msgs;
2609 int64_t count = 0;
2610 for (const auto &j : i.second.states) {
2611 std::ostringstream msg;
2612 msg << j.second << (j.second > 1 ? " pgs " : " pg ") << state_name(j.first);
2613 pg_msgs.push_back(msg.str());
2614 count += j.second;
2615 }
2616 summary += joinify(pg_msgs.begin(), pg_msgs.end(), std::string(", "));
2617
2618 health_check_t *check = &checks->add(
2619 health_code,
2620 sev,
2621 summary,
2622 count);
2623
2624 // Compose list of PGs contributing to this health check failing
2625 for (const auto &j : i.second.pg_messages) {
2626 check->detail.push_back(j.second);
2627 }
2628 }
2629
2630 // OSD_SCRUB_ERRORS
2631 if (pg_sum.stats.sum.num_scrub_errors) {
2632 ostringstream ss;
2633 ss << pg_sum.stats.sum.num_scrub_errors << " scrub errors";
2634 checks->add("OSD_SCRUB_ERRORS", HEALTH_ERR, ss.str(),
2635 pg_sum.stats.sum.num_scrub_errors);
2636 }
2637
2638 // LARGE_OMAP_OBJECTS
2639 if (pg_sum.stats.sum.num_large_omap_objects) {
2640 list<string> detail;
2641 for (auto &pool : pools) {
2642 const string& pool_name = osdmap.get_pool_name(pool.first);
2643 auto it2 = pg_pool_sum.find(pool.first);
2644 if (it2 == pg_pool_sum.end()) {
2645 continue;
2646 }
2647 const pool_stat_t *pstat = &it2->second;
2648 if (pstat == nullptr) {
2649 continue;
2650 }
2651 const object_stat_sum_t& sum = pstat->stats.sum;
2652 if (sum.num_large_omap_objects) {
2653 stringstream ss;
2654 ss << sum.num_large_omap_objects << " large objects found in pool "
2655 << "'" << pool_name << "'";
2656 detail.push_back(ss.str());
2657 }
2658 }
2659 if (!detail.empty()) {
2660 ostringstream ss;
2661 ss << pg_sum.stats.sum.num_large_omap_objects << " large omap objects";
2662 auto& d = checks->add("LARGE_OMAP_OBJECTS", HEALTH_WARN, ss.str(),
2663 pg_sum.stats.sum.num_large_omap_objects);
2664 stringstream tip;
2665 tip << "Search the cluster log for 'Large omap object found' for more "
2666 << "details.";
2667 detail.push_back(tip.str());
2668 d.detail.swap(detail);
2669 }
2670 }
2671
2672 // CACHE_POOL_NEAR_FULL
2673 {
2674 list<string> detail;
2675 unsigned num_pools = 0;
2676 for (auto& p : pools) {
2677 if ((!p.second.target_max_objects && !p.second.target_max_bytes) ||
2678 !pg_pool_sum.count(p.first)) {
2679 continue;
2680 }
2681 bool nearfull = false;
2682 const string& name = osdmap.get_pool_name(p.first);
2683 const pool_stat_t& st = get_pg_pool_sum_stat(p.first);
2684 uint64_t ratio = p.second.cache_target_full_ratio_micro +
2685 ((1000000 - p.second.cache_target_full_ratio_micro) *
2686 cct->_conf->mon_cache_target_full_warn_ratio);
2687 if (p.second.target_max_objects &&
2688 (uint64_t)(st.stats.sum.num_objects -
2689 st.stats.sum.num_objects_hit_set_archive) >
2690 p.second.target_max_objects * (ratio / 1000000.0)) {
2691 ostringstream ss;
2692 ss << "cache pool '" << name << "' with "
2693 << si_u_t(st.stats.sum.num_objects)
2694 << " objects at/near target max "
2695 << si_u_t(p.second.target_max_objects) << " objects";
2696 detail.push_back(ss.str());
2697 nearfull = true;
2698 }
2699 if (p.second.target_max_bytes &&
2700 (uint64_t)(st.stats.sum.num_bytes -
2701 st.stats.sum.num_bytes_hit_set_archive) >
2702 p.second.target_max_bytes * (ratio / 1000000.0)) {
2703 ostringstream ss;
2704 ss << "cache pool '" << name
2705 << "' with " << byte_u_t(st.stats.sum.num_bytes)
2706 << " at/near target max "
2707 << byte_u_t(p.second.target_max_bytes);
2708 detail.push_back(ss.str());
2709 nearfull = true;
2710 }
2711 if (nearfull) {
2712 ++num_pools;
2713 }
2714 }
2715 if (!detail.empty()) {
2716 ostringstream ss;
2717 ss << num_pools << " cache pools at or near target size";
2718 auto& d = checks->add("CACHE_POOL_NEAR_FULL", HEALTH_WARN, ss.str(),
2719 num_pools);
2720 d.detail.swap(detail);
2721 }
2722 }
2723
2724 // TOO_FEW_PGS
2725 unsigned num_in = osdmap.get_num_in_osds();
2726 auto sum_pg_up = std::max(static_cast<size_t>(pg_sum.up), pg_stat.size());
2727 const auto min_pg_per_osd =
2728 cct->_conf.get_val<uint64_t>("mon_pg_warn_min_per_osd");
2729 if (num_in && min_pg_per_osd > 0 && osdmap.get_pools().size() > 0) {
2730 auto per = sum_pg_up / num_in;
2731 if (per < min_pg_per_osd && per) {
2732 ostringstream ss;
2733 ss << "too few PGs per OSD (" << per
2734 << " < min " << min_pg_per_osd << ")";
2735 checks->add("TOO_FEW_PGS", HEALTH_WARN, ss.str(),
2736 min_pg_per_osd - per);
2737 }
2738 }
2739
2740 // TOO_MANY_PGS
2741 auto max_pg_per_osd = cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd");
2742 if (num_in && max_pg_per_osd > 0) {
2743 auto per = sum_pg_up / num_in;
2744 if (per > max_pg_per_osd) {
2745 ostringstream ss;
2746 ss << "too many PGs per OSD (" << per
2747 << " > max " << max_pg_per_osd << ")";
2748 checks->add("TOO_MANY_PGS", HEALTH_WARN, ss.str(),
2749 per - max_pg_per_osd);
2750 }
2751 }
2752
2753 // TOO_FEW_OSDS
2754 auto warn_too_few_osds = cct->_conf.get_val<bool>("mon_warn_on_too_few_osds");
2755 auto osd_pool_default_size = cct->_conf.get_val<uint64_t>("osd_pool_default_size");
2756 if (warn_too_few_osds && osdmap.get_num_osds() < osd_pool_default_size) {
2757 ostringstream ss;
2758 ss << "OSD count " << osdmap.get_num_osds()
2759 << " < osd_pool_default_size " << osd_pool_default_size;
2760 checks->add("TOO_FEW_OSDS", HEALTH_WARN, ss.str(),
2761 osd_pool_default_size - osdmap.get_num_osds());
2762 }
2763
2764 // SLOW_PING_TIME
2765 // Convert milliseconds to microseconds
2766 auto warn_slow_ping_time = cct->_conf.get_val<double>("mon_warn_on_slow_ping_time") * 1000;
2767 auto grace = cct->_conf.get_val<int64_t>("osd_heartbeat_grace");
2768 if (warn_slow_ping_time == 0) {
2769 double ratio = cct->_conf.get_val<double>("mon_warn_on_slow_ping_ratio");
2770 warn_slow_ping_time = grace;
2771 warn_slow_ping_time *= 1000000 * ratio; // Seconds of grace to microseconds at ratio
2772 }
2773 if (warn_slow_ping_time > 0) {
2774
2775 struct mon_ping_item_t {
2776 uint32_t pingtime;
2777 int from;
2778 int to;
2779 bool improving;
2780
2781 bool operator<(const mon_ping_item_t& rhs) const {
2782 if (pingtime < rhs.pingtime)
2783 return true;
2784 if (pingtime > rhs.pingtime)
2785 return false;
2786 if (from < rhs.from)
2787 return true;
2788 if (from > rhs.from)
2789 return false;
2790 return to < rhs.to;
2791 }
2792 };
2793
2794 list<string> detail_back;
2795 list<string> detail_front;
2796 set<mon_ping_item_t> back_sorted, front_sorted;
2797 for (auto i : osd_stat) {
2798 for (auto j : i.second.hb_pingtime) {
2799
2800 // Maybe source info is old
2801 if (now.sec() - j.second.last_update > grace * 60)
2802 continue;
2803
2804 mon_ping_item_t back;
2805 back.pingtime = std::max(j.second.back_pingtime[0], j.second.back_pingtime[1]);
2806 back.pingtime = std::max(back.pingtime, j.second.back_pingtime[2]);
2807 back.from = i.first;
2808 back.to = j.first;
2809 if (back.pingtime > warn_slow_ping_time) {
2810 back.improving = (j.second.back_pingtime[0] < j.second.back_pingtime[1]
2811 && j.second.back_pingtime[1] < j.second.back_pingtime[2]);
2812 back_sorted.emplace(back);
2813 }
2814
2815 mon_ping_item_t front;
2816 front.pingtime = std::max(j.second.front_pingtime[0], j.second.front_pingtime[1]);
2817 front.pingtime = std::max(front.pingtime, j.second.front_pingtime[2]);
2818 front.from = i.first;
2819 front.to = j.first;
2820 if (front.pingtime > warn_slow_ping_time) {
2821 front.improving = (j.second.front_pingtime[0] < j.second.front_pingtime[1]
(2) Event copy_paste_error: |
"back_pingtime" in "j.second.back_pingtime" looks like a copy-paste error. |
(3) Event remediation: |
Should it say "front_pingtime" instead? |
Also see events: |
[original] |
2822 && j.second.front_pingtime[1] < j.second.back_pingtime[2]);
2823 front_sorted.emplace(front);
2824 }
2825 }
2826 }
2827 int max_detail = 10;
2828 for (auto &sback : boost::adaptors::reverse(back_sorted)) {
2829 ostringstream ss;
2830 if (max_detail == 0) {
2831 ss << "Truncated long network list. Use ceph daemon mgr.# dump_osd_network for more information";
2832 detail_back.push_back(ss.str());
2833 break;
2834 }
2835 max_detail--;
2836 ss << "Slow heartbeat ping on back interface from osd." << sback.from
2837 << (osdmap.is_down(sback.from) ? " (down)" : "")
2838 << " to osd." << sback.to
2839 << (osdmap.is_down(sback.to) ? " (down)" : "")
2840 << " " << fixed_u_to_string(sback.pingtime, 3) << " msec"
2841 << (sback.improving ? " possibly improving" : "");
2842 detail_back.push_back(ss.str());
2843 }
2844 max_detail = 10;
2845 for (auto &sfront : boost::adaptors::reverse(front_sorted)) {
2846 ostringstream ss;
2847 if (max_detail == 0) {
2848 ss << "Truncated long network list. Use ceph daemon mgr.# dump_osd_network for more information";
2849 detail_front.push_back(ss.str());
2850 break;
2851 }
2852 max_detail--;
2853 ss << "Slow heartbeat ping on front interface from osd." << sfront.from
2854 << (osdmap.is_down(sfront.from) ? " (down)" : "")
2855 << " to osd." << sfront.to
2856 << (osdmap.is_down(sfront.to) ? " (down)" : "")
2857 << " " << fixed_u_to_string(sfront.pingtime, 3) << " msec"
2858 << (sfront.improving ? " possibly improving" : "");
2859 detail_front.push_back(ss.str());
2860 }
2861 if (detail_back.size() != 0) {
2862 ostringstream ss;
2863 ss << "Long heartbeat ping times on back interface seen, longest is "
2864 << fixed_u_to_string(back_sorted.rbegin()->pingtime, 3) << " msec";
2865 auto& d = checks->add("OSD_SLOW_PING_TIME_BACK", HEALTH_WARN, ss.str(),
2866 back_sorted.size());
2867 d.detail.swap(detail_back);
2868 }
2869 if (detail_front.size() != 0) {
2870 ostringstream ss;
2871 ss << "Long heartbeat ping times on front interface seen, longest is "
2872 << fixed_u_to_string(front_sorted.rbegin()->pingtime, 3) << " msec";
2873 auto& d = checks->add("OSD_SLOW_PING_TIME_FRONT", HEALTH_WARN, ss.str(),
2874 front_sorted.size());
2875 d.detail.swap(detail_front);
2876 }
2877 }
2878
2879 // SMALLER_PGP_NUM
2880 // MANY_OBJECTS_PER_PG
2881 if (!pg_stat.empty()) {
2882 list<string> pgp_detail, many_detail;
2883 const auto mon_pg_warn_min_objects =
2884 cct->_conf.get_val<int64_t>("mon_pg_warn_min_objects");
2885 const auto mon_pg_warn_min_pool_objects =
2886 cct->_conf.get_val<int64_t>("mon_pg_warn_min_pool_objects");
2887 const auto mon_pg_warn_max_object_skew =
2888 cct->_conf.get_val<double>("mon_pg_warn_max_object_skew");
2889 for (auto p = pg_pool_sum.begin();
2890 p != pg_pool_sum.end();
2891 ++p) {
2892 const pg_pool_t *pi = osdmap.get_pg_pool(p->first);
2893 if (!pi)
2894 continue; // in case osdmap changes haven't propagated to PGMap yet
2895 const string& name = osdmap.get_pool_name(p->first);
2896 // NOTE: we use pg_num_target and pgp_num_target for the purposes of
2897 // the warnings. If the cluster is failing to converge on the target
2898 // values that is a separate issue!
2899 if (pi->get_pg_num_target() > pi->get_pgp_num_target() &&
2900 !(name.find(".DELETED") != string::npos &&
2901 cct->_conf->mon_fake_pool_delete)) {
2902 ostringstream ss;
2903 ss << "pool " << name << " pg_num "
2904 << pi->get_pg_num_target()
2905 << " > pgp_num " << pi->get_pgp_num_target();
2906 pgp_detail.push_back(ss.str());
2907 }
2908 int average_objects_per_pg = pg_sum.stats.sum.num_objects / pg_stat.size();
2909 if (average_objects_per_pg > 0 &&
2910 pg_sum.stats.sum.num_objects >= mon_pg_warn_min_objects &&
2911 p->second.stats.sum.num_objects >= mon_pg_warn_min_pool_objects) {
2912 int objects_per_pg = p->second.stats.sum.num_objects /
2913 pi->get_pg_num_target();
2914 float ratio = (float)objects_per_pg / (float)average_objects_per_pg;
2915 if (mon_pg_warn_max_object_skew > 0 &&
2916 ratio > mon_pg_warn_max_object_skew) {
2917 ostringstream ss;
2918 ss << "pool " << name << " objects per pg ("
2919 << objects_per_pg << ") is more than " << ratio
2920 << " times cluster average ("
2921 << average_objects_per_pg << ")";
2922 many_detail.push_back(ss.str());
2923 }
2924 }
2925 }
2926 if (!pgp_detail.empty()) {
2927 ostringstream ss;
2928 ss << pgp_detail.size() << " pools have pg_num > pgp_num";
2929 auto& d = checks->add("SMALLER_PGP_NUM", HEALTH_WARN, ss.str(),
2930 pgp_detail.size());
2931 d.detail.swap(pgp_detail);
2932 }
2933 if (!many_detail.empty()) {
2934 ostringstream ss;
2935 ss << many_detail.size() << " pools have many more objects per pg than"
2936 << " average";
2937 auto& d = checks->add("MANY_OBJECTS_PER_PG", HEALTH_WARN, ss.str(),
2938 many_detail.size());
2939 d.detail.swap(many_detail);
2940 }
2941 }
2942
2943 // POOL_FULL
2944 // POOL_NEAR_FULL
2945 {
2946 float warn_threshold = (float)g_conf().get_val<int64_t>("mon_pool_quota_warn_threshold")/100;
2947 float crit_threshold = (float)g_conf().get_val<int64_t>("mon_pool_quota_crit_threshold")/100;
2948 list<string> full_detail, nearfull_detail;
2949 unsigned full_pools = 0, nearfull_pools = 0;
2950 for (auto it : pools) {
2951 auto it2 = pg_pool_sum.find(it.first);
2952 if (it2 == pg_pool_sum.end()) {
2953 continue;
2954 }
2955 const pool_stat_t *pstat = &it2->second;
2956 const object_stat_sum_t& sum = pstat->stats.sum;
2957 const string& pool_name = osdmap.get_pool_name(it.first);
2958 const pg_pool_t &pool = it.second;
2959 bool full = false, nearfull = false;
2960 if (pool.quota_max_objects > 0) {
2961 stringstream ss;
2962 if ((uint64_t)sum.num_objects >= pool.quota_max_objects) {
2963 } else if (crit_threshold > 0 &&
2964 sum.num_objects >= pool.quota_max_objects*crit_threshold) {
2965 ss << "pool '" << pool_name
2966 << "' has " << sum.num_objects << " objects"
2967 << " (max " << pool.quota_max_objects << ")";
2968 full_detail.push_back(ss.str());
2969 full = true;
2970 } else if (warn_threshold > 0 &&
2971 sum.num_objects >= pool.quota_max_objects*warn_threshold) {
2972 ss << "pool '" << pool_name
2973 << "' has " << sum.num_objects << " objects"
2974 << " (max " << pool.quota_max_objects << ")";
2975 nearfull_detail.push_back(ss.str());
2976 nearfull = true;
2977 }
2978 }
2979 if (pool.quota_max_bytes > 0) {
2980 stringstream ss;
2981 if ((uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
2982 } else if (crit_threshold > 0 &&
2983 sum.num_bytes >= pool.quota_max_bytes*crit_threshold) {
2984 ss << "pool '" << pool_name
2985 << "' has " << byte_u_t(sum.num_bytes)
2986 << " (max " << byte_u_t(pool.quota_max_bytes) << ")";
2987 full_detail.push_back(ss.str());
2988 full = true;
2989 } else if (warn_threshold > 0 &&
2990 sum.num_bytes >= pool.quota_max_bytes*warn_threshold) {
2991 ss << "pool '" << pool_name
2992 << "' has " << byte_u_t(sum.num_bytes)
2993 << " (max " << byte_u_t(pool.quota_max_bytes) << ")";
2994 nearfull_detail.push_back(ss.str());
2995 nearfull = true;
2996 }
2997 }
2998 if (full) {
2999 ++full_pools;
3000 }
3001 if (nearfull) {
3002 ++nearfull_pools;
3003 }
3004 }
3005 if (full_pools) {
3006 ostringstream ss;
3007 ss << full_pools << " pools full";
3008 auto& d = checks->add("POOL_FULL", HEALTH_ERR, ss.str(), full_pools);
3009 d.detail.swap(full_detail);
3010 }
3011 if (nearfull_pools) {
3012 ostringstream ss;
3013 ss << nearfull_pools << " pools nearfull";
3014 auto& d = checks->add("POOL_NEAR_FULL", HEALTH_WARN, ss.str(), nearfull_pools);
3015 d.detail.swap(nearfull_detail);
3016 }
3017 }
3018
3019 // OBJECT_MISPLACED
3020 if (pg_sum.stats.sum.num_objects_misplaced &&
3021 pg_sum.stats.sum.num_object_copies > 0 &&
3022 cct->_conf->mon_warn_on_misplaced) {
3023 double pc = (double)pg_sum.stats.sum.num_objects_misplaced /
3024 (double)pg_sum.stats.sum.num_object_copies * (double)100.0;
3025 char b[20];
3026 snprintf(b, sizeof(b), "%.3lf", pc);
3027 ostringstream ss;
3028 ss << pg_sum.stats.sum.num_objects_misplaced
3029 << "/" << pg_sum.stats.sum.num_object_copies << " objects misplaced ("
3030 << b << "%)";
3031 checks->add("OBJECT_MISPLACED", HEALTH_WARN, ss.str(),
3032 pg_sum.stats.sum.num_objects_misplaced);
3033 }
3034
3035 // OBJECT_UNFOUND
3036 if (pg_sum.stats.sum.num_objects_unfound &&
3037 pg_sum.stats.sum.num_objects) {
3038 double pc = (double)pg_sum.stats.sum.num_objects_unfound /
3039 (double)pg_sum.stats.sum.num_objects * (double)100.0;
3040 char b[20];
3041 snprintf(b, sizeof(b), "%.3lf", pc);
3042 ostringstream ss;
3043 ss << pg_sum.stats.sum.num_objects_unfound
3044 << "/" << pg_sum.stats.sum.num_objects << " objects unfound (" << b << "%)";
3045 auto& d = checks->add("OBJECT_UNFOUND", HEALTH_WARN, ss.str(),
3046 pg_sum.stats.sum.num_objects_unfound);
3047
3048 for (auto& p : pg_stat) {
3049 if (p.second.stats.sum.num_objects_unfound) {
3050 ostringstream ss;
3051 ss << "pg " << p.first
3052 << " has " << p.second.stats.sum.num_objects_unfound
3053 << " unfound objects";
3054 d.detail.push_back(ss.str());
3055 if (d.detail.size() > max) {
3056 d.detail.push_back("(additional pgs left out for brevity)");
3057 break;
3058 }
3059 }
3060 }
3061 }
3062
3063 // REQUEST_SLOW
3064 // REQUEST_STUCK
3065 // SLOW_OPS unifies them in mimic.
3066 if (osdmap.require_osd_release < ceph_release_t::mimic &&
3067 cct->_conf->mon_osd_warn_op_age > 0 &&
3068 !osd_sum.op_queue_age_hist.h.empty() &&
3069 osd_sum.op_queue_age_hist.upper_bound() / 1000.0 >
3070 cct->_conf->mon_osd_warn_op_age) {
3071 list<string> warn_detail, error_detail;
3072 unsigned warn = 0, error = 0;
3073 float err_age =
3074 cct->_conf->mon_osd_warn_op_age * cct->_conf->mon_osd_err_op_age_ratio;
3075 const pow2_hist_t& h = osd_sum.op_queue_age_hist;
3076 for (unsigned i = h.h.size() - 1; i > 0; --i) {
3077 float ub = (float)(1 << i) / 1000.0;
3078 if (ub < cct->_conf->mon_osd_warn_op_age)
3079 break;
3080 if (h.h[i]) {
3081 ostringstream ss;
3082 ss << h.h[i] << " ops are blocked > " << ub << " sec";
3083 if (ub > err_age) {
3084 error += h.h[i];
3085 error_detail.push_back(ss.str());
3086 } else {
3087 warn += h.h[i];
3088 warn_detail.push_back(ss.str());
3089 }
3090 }
3091 }
3092
3093 map<float,set<int>> warn_osd_by_max; // max -> osds
3094 map<float,set<int>> error_osd_by_max; // max -> osds
3095 if (!warn_detail.empty() || !error_detail.empty()) {
3096 for (auto& p : osd_stat) {
3097 const pow2_hist_t& h = p.second.op_queue_age_hist;
3098 for (unsigned i = h.h.size() - 1; i > 0; --i) {
3099 float ub = (float)(1 << i) / 1000.0;
3100 if (ub < cct->_conf->mon_osd_warn_op_age)
3101 break;
3102 if (h.h[i]) {
3103 if (ub > err_age) {
3104 error_osd_by_max[ub].insert(p.first);
3105 } else {
3106 warn_osd_by_max[ub].insert(p.first);
3107 }
3108 break;
3109 }
3110 }
3111 }
3112 }
3113
3114 if (!warn_detail.empty()) {
3115 ostringstream ss;
3116 ss << warn << " slow requests are blocked > "
3117 << cct->_conf->mon_osd_warn_op_age << " sec";
3118 auto& d = checks->add("REQUEST_SLOW", HEALTH_WARN, ss.str(), warn);
3119 d.detail.swap(warn_detail);
3120 int left = max;
3121 for (auto& p : warn_osd_by_max) {
3122 ostringstream ss;
3123 if (p.second.size() > 1) {
3124 ss << "osds " << p.second
3125 << " have blocked requests > " << p.first << " sec";
3126 } else {
3127 ss << "osd." << *p.second.begin()
3128 << " has blocked requests > " << p.first << " sec";
3129 }
3130 d.detail.push_back(ss.str());
3131 if (--left == 0) {
3132 break;
3133 }
3134 }
3135 }
3136 if (!error_detail.empty()) {
3137 ostringstream ss;
3138 ss << error << " stuck requests are blocked > "
3139 << err_age << " sec";
3140 auto& d = checks->add("REQUEST_STUCK", HEALTH_ERR, ss.str(), error);
3141 d.detail.swap(error_detail);
3142 int left = max;
3143 for (auto& p : error_osd_by_max) {
3144 ostringstream ss;
3145 if (p.second.size() > 1) {
3146 ss << "osds " << p.second
3147 << " have stuck requests > " << p.first << " sec";
3148 } else {
3149 ss << "osd." << *p.second.begin()
3150 << " has stuck requests > " << p.first << " sec";
3151 }
3152 d.detail.push_back(ss.str());
3153 if (--left == 0) {
3154 break;
3155 }
3156 }
3157 }
3158 }
3159
3160 // OBJECT_STORE_WARN
3161 if (osd_sum.os_alerts.size()) {
3162 map<string, pair<size_t, list<string>>> os_alerts_sum;
3163
3164 for (auto& a : osd_sum.os_alerts) {
3165 int left = max;
3166 string s0 = " osd.";
3167 s0 += stringify(a.first);
3168 for (auto& aa : a.second) {
3169 string s(s0);
3170 s += " ";
3171 s += aa.second;
3172 auto it = os_alerts_sum.find(aa.first);
3173 if (it == os_alerts_sum.end()) {
3174 list<string> d;
3175 d.emplace_back(s);
3176 os_alerts_sum.emplace(aa.first, std::make_pair(1, d));
3177 } else {
3178 auto& p = it->second;
3179 ++p.first;
3180 p.second.emplace_back(s);
3181 }
3182 if (--left == 0) {
3183 break;
3184 }
3185 }
3186 }
3187
3188 for (auto& asum : os_alerts_sum) {
3189 string summary = stringify(asum.second.first) + " OSD(s)";
3190 if (asum.first == "BLUEFS_SPILLOVER") {
3191 summary += " experiencing BlueFS spillover";
3192 } else if (asum.first == "BLUESTORE_NO_COMPRESSION") {
3193 summary += " have broken BlueStore compression";
3194 } else if (asum.first == "BLUESTORE_LEGACY_STATFS") {
3195 summary += " reporting legacy (not per-pool) BlueStore stats";
3196 } else if (asum.first == "BLUESTORE_DISK_SIZE_MISMATCH") {
3197 summary += " have dangerous mismatch between BlueStore block device and free list sizes";
3198 } else if (asum.first == "BLUESTORE_NO_PER_POOL_OMAP") {
3199 summary += " reporting legacy (not per-pool) BlueStore omap usage stats";
3200 }
3201 auto& d = checks->add(asum.first, HEALTH_WARN, summary, asum.second.first);
3202 for (auto& s : asum.second.second) {
3203 d.detail.push_back(s);
3204 }
3205 }
3206 }
3207 // PG_NOT_SCRUBBED
3208 // PG_NOT_DEEP_SCRUBBED
3209 if (cct->_conf->mon_warn_pg_not_scrubbed_ratio ||
3210 cct->_conf->mon_warn_pg_not_deep_scrubbed_ratio) {
3211 list<string> detail, deep_detail;
3212 int detail_max = max, deep_detail_max = max;
3213 int detail_more = 0, deep_detail_more = 0;
3214 int detail_total = 0, deep_detail_total = 0;
3215 for (auto& p : pg_stat) {
3216 int64_t pnum = p.first.pool();
3217 auto pool = osdmap.get_pg_pool(pnum);
3218 if (!pool)
3219 continue;
3220 if (cct->_conf->mon_warn_pg_not_scrubbed_ratio) {
3221 double scrub_max_interval = 0;
3222 pool->opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &scrub_max_interval);
3223 if (scrub_max_interval <= 0) {
3224 scrub_max_interval = cct->_conf->osd_scrub_max_interval;
3225 }
3226 const double age = (cct->_conf->mon_warn_pg_not_scrubbed_ratio * scrub_max_interval) +
3227 scrub_max_interval;
3228 utime_t cutoff = now;
3229 cutoff -= age;
3230 if (p.second.last_scrub_stamp < cutoff) {
3231 if (detail_max > 0) {
3232 ostringstream ss;
3233 ss << "pg " << p.first << " not scrubbed since "
3234 << p.second.last_scrub_stamp;
3235 detail.push_back(ss.str());
3236 --detail_max;
3237 } else {
3238 ++detail_more;
3239 }
3240 ++detail_total;
3241 }
3242 }
3243 if (cct->_conf->mon_warn_pg_not_deep_scrubbed_ratio) {
3244 double deep_scrub_interval = 0;
3245 pool->opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &deep_scrub_interval);
3246 if (deep_scrub_interval <= 0) {
3247 deep_scrub_interval = cct->_conf->osd_deep_scrub_interval;
3248 }
3249 double deep_age = (cct->_conf->mon_warn_pg_not_deep_scrubbed_ratio * deep_scrub_interval) +
3250 deep_scrub_interval;
3251 utime_t deep_cutoff = now;
3252 deep_cutoff -= deep_age;
3253 if (p.second.last_deep_scrub_stamp < deep_cutoff) {
3254 if (deep_detail_max > 0) {
3255 ostringstream ss;
3256 ss << "pg " << p.first << " not deep-scrubbed since "
3257 << p.second.last_deep_scrub_stamp;
3258 deep_detail.push_back(ss.str());
3259 --deep_detail_max;
3260 } else {
3261 ++deep_detail_more;
3262 }
3263 ++deep_detail_total;
3264 }
3265 }
3266 }
3267 if (detail_total) {
3268 ostringstream ss;
3269 ss << detail_total << " pgs not scrubbed in time";
3270 auto& d = checks->add("PG_NOT_SCRUBBED", HEALTH_WARN, ss.str(), detail_total);
3271
3272 if (!detail.empty()) {
3273 d.detail.swap(detail);
3274
3275 if (detail_more) {
3276 ostringstream ss;
3277 ss << detail_more << " more pgs... ";
3278 d.detail.push_back(ss.str());
3279 }
3280 }
3281 }
3282 if (deep_detail_total) {
3283 ostringstream ss;
3284 ss << deep_detail_total << " pgs not deep-scrubbed in time";
3285 auto& d = checks->add("PG_NOT_DEEP_SCRUBBED", HEALTH_WARN, ss.str(),
3286 deep_detail_total);
3287
3288 if (!deep_detail.empty()) {
3289 d.detail.swap(deep_detail);
3290
3291 if (deep_detail_more) {
3292 ostringstream ss;
3293 ss << deep_detail_more << " more pgs... ";
3294 d.detail.push_back(ss.str());
3295 }
3296 }
3297 }
3298 }
3299
3300 // POOL_APP
3301 if (g_conf().get_val<bool>("mon_warn_on_pool_no_app")) {
3302 list<string> detail;
3303 for (auto &it : pools) {
3304 const pg_pool_t &pool = it.second;
3305 const string& pool_name = osdmap.get_pool_name(it.first);
3306 auto it2 = pg_pool_sum.find(it.first);
3307 if (it2 == pg_pool_sum.end()) {
3308 continue;
3309 }
3310 const pool_stat_t *pstat = &it2->second;
3311 if (pstat == nullptr) {
3312 continue;
3313 }
3314 const object_stat_sum_t& sum = pstat->stats.sum;
3315 // application metadata is not encoded until luminous is minimum
3316 // required release
3317 if (sum.num_objects > 0 && pool.application_metadata.empty() &&
3318 !pool.is_tier()) {
3319 stringstream ss;
3320 ss << "application not enabled on pool '" << pool_name << "'";
3321 detail.push_back(ss.str());
3322 }
3323 }
3324 if (!detail.empty()) {
3325 ostringstream ss;
3326 ss << detail.size() << " pool(s) do not have an application enabled";
3327 auto& d = checks->add("POOL_APP_NOT_ENABLED", HEALTH_WARN, ss.str(),
3328 detail.size());
3329 stringstream tip;
3330 tip << "use 'ceph osd pool application enable <pool-name> "
3331 << "<app-name>', where <app-name> is 'cephfs', 'rbd', 'rgw', "
3332 << "or freeform for custom applications.";
3333 detail.push_back(tip.str());
3334 d.detail.swap(detail);
3335 }
3336 }
3337
3338 // PG_SLOW_SNAP_TRIMMING
3339 if (!pg_stat.empty() && cct->_conf->mon_osd_snap_trim_queue_warn_on > 0) {
3340 uint32_t snapthreshold = cct->_conf->mon_osd_snap_trim_queue_warn_on;
3341 uint64_t snaptrimq_exceeded = 0;
3342 uint32_t longest_queue = 0;
3343 const pg_t* longest_q_pg = nullptr;
3344 list<string> detail;
3345
3346 for (auto& i: pg_stat) {
3347 uint32_t current_len = i.second.snaptrimq_len;
3348 if (current_len >= snapthreshold) {
3349 snaptrimq_exceeded++;
3350 if (longest_queue <= current_len) {
3351 longest_q_pg = &i.first;
3352 longest_queue = current_len;
3353 }
3354 if (detail.size() < max - 1) {
3355 stringstream ss;
3356 ss << "snap trim queue for pg " << i.first << " at " << current_len;
3357 detail.push_back(ss.str());
3358 continue;
3359 }
3360 if (detail.size() < max) {
3361 detail.push_back("...more pgs affected");
3362 continue;
3363 }
3364 }
3365 }
3366
3367 if (snaptrimq_exceeded) {
3368 {
3369 ostringstream ss;
3370 ss << "longest queue on pg " << *longest_q_pg << " at " << longest_queue;
3371 detail.push_back(ss.str());
3372 }
3373
3374 stringstream ss;
3375 ss << "snap trim queue for " << snaptrimq_exceeded << " pg(s) >= " << snapthreshold << " (mon_osd_snap_trim_queue_warn_on)";
3376 auto& d = checks->add("PG_SLOW_SNAP_TRIMMING", HEALTH_WARN, ss.str(),
3377 snaptrimq_exceeded);
3378 detail.push_back("try decreasing \"osd snap trim sleep\" and/or increasing \"osd pg max concurrent snap trims\".");
3379 d.detail.swap(detail);
3380 }
3381 }
3382 }
3383
3384 int process_pg_map_command(
3385 const string& orig_prefix,
3386 const cmdmap_t& orig_cmdmap,
3387 const PGMap& pg_map,
3388 const OSDMap& osdmap,
3389 ceph::Formatter *f,
3390 stringstream *ss,
3391 bufferlist *odata)
3392 {
3393 string prefix = orig_prefix;
3394 auto cmdmap = orig_cmdmap;
3395
3396 string omap_stats_note =
3397 "\n* NOTE: Omap statistics are gathered during deep scrub and "
3398 "may be inaccurate soon afterwards depending on utilisation. See "
3399 "http://docs.ceph.com/docs/master/dev/placement-group/#omap-statistics "
3400 "for further details.\n";
3401 bool omap_stats_note_required = false;
3402
3403 // perhaps these would be better in the parsing, but it's weird
3404 bool primary = false;
3405 if (prefix == "pg dump_json") {
3406 vector<string> v;
3407 v.push_back(string("all"));
3408 cmd_putval(g_ceph_context, cmdmap, "format", string("json"));
3409 cmd_putval(g_ceph_context, cmdmap, "dumpcontents", v);
3410 prefix = "pg dump";
3411 } else if (prefix == "pg dump_pools_json") {
3412 vector<string> v;
3413 v.push_back(string("pools"));
3414 cmd_putval(g_ceph_context, cmdmap, "format", string("json"));
3415 cmd_putval(g_ceph_context, cmdmap, "dumpcontents", v);
3416 prefix = "pg dump";
3417 } else if (prefix == "pg ls-by-primary") {
3418 primary = true;
3419 prefix = "pg ls";
3420 } else if (prefix == "pg ls-by-osd") {
3421 prefix = "pg ls";
3422 } else if (prefix == "pg ls-by-pool") {
3423 prefix = "pg ls";
3424 string poolstr;
3425 cmd_getval(g_ceph_context, cmdmap, "poolstr", poolstr);
3426 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
3427 if (pool < 0) {
3428 *ss << "pool " << poolstr << " does not exist";
3429 return -ENOENT;
3430 }
3431 cmd_putval(g_ceph_context, cmdmap, "pool", pool);
3432 }
3433
3434 stringstream ds;
3435 if (prefix == "pg stat") {
3436 if (f) {
3437 f->open_object_section("pg_summary");
3438 pg_map.print_oneline_summary(f, NULL);
3439 f->close_section();
3440 f->flush(ds);
3441 } else {
3442 ds << pg_map;
3443 }
3444 odata->append(ds);
3445 return 0;
3446 }
3447
3448 if (prefix == "pg getmap") {
3449 pg_map.encode(*odata);
3450 *ss << "got pgmap version " << pg_map.version;
3451 return 0;
3452 }
3453
3454 if (prefix == "pg dump") {
3455 string val;
3456 vector<string> dumpcontents;
3457 set<string> what;
3458 if (cmd_getval(g_ceph_context, cmdmap, "dumpcontents", dumpcontents)) {
3459 copy(dumpcontents.begin(), dumpcontents.end(),
3460 inserter(what, what.end()));
3461 }
3462 if (what.empty())
3463 what.insert("all");
3464 if (f) {
3465 if (what.count("all")) {
3466 f->open_object_section("pg_map");
3467 pg_map.dump(f);
3468 f->close_section();
3469 } else if (what.count("summary") || what.count("sum")) {
3470 f->open_object_section("pg_map");
3471 pg_map.dump_basic(f);
3472 f->close_section();
3473 } else {
3474 if (what.count("pools")) {
3475 pg_map.dump_pool_stats(f);
3476 }
3477 if (what.count("osds")) {
3478 pg_map.dump_osd_stats(f);
3479 }
3480 if (what.count("pgs")) {
3481 pg_map.dump_pg_stats(f, false);
3482 }
3483 if (what.count("pgs_brief")) {
3484 pg_map.dump_pg_stats(f, true);
3485 }
3486 if (what.count("delta")) {
3487 f->open_object_section("delta");
3488 pg_map.dump_delta(f);
3489 f->close_section();
3490 }
3491 }
3492 f->flush(*odata);
3493 } else {
3494 if (what.count("all")) {
3495 pg_map.dump(ds);
3496 omap_stats_note_required = true;
3497 } else if (what.count("summary") || what.count("sum")) {
3498 pg_map.dump_basic(ds);
3499 pg_map.dump_pg_sum_stats(ds, true);
3500 pg_map.dump_osd_sum_stats(ds);
3501 omap_stats_note_required = true;
3502 } else {
3503 if (what.count("pgs_brief")) {
3504 pg_map.dump_pg_stats(ds, true);
3505 }
3506 bool header = true;
3507 if (what.count("pgs")) {
3508 pg_map.dump_pg_stats(ds, false);
3509 header = false;
3510 omap_stats_note_required = true;
3511 }
3512 if (what.count("pools")) {
3513 pg_map.dump_pool_stats(ds, header);
3514 omap_stats_note_required = true;
3515 }
3516 if (what.count("osds")) {
3517 pg_map.dump_osd_stats(ds);
3518 }
3519 }
3520 odata->append(ds);
3521 if (omap_stats_note_required) {
3522 odata->append(omap_stats_note);
3523 }
3524 }
3525 *ss << "dumped " << what;
3526 return 0;
3527 }
3528
3529 if (prefix == "pg ls") {
3530 int64_t osd = -1;
3531 int64_t pool = -1;
3532 vector<string>states;
3533 set<pg_t> pgs;
3534 cmd_getval(g_ceph_context, cmdmap, "pool", pool);
3535 cmd_getval(g_ceph_context, cmdmap, "osd", osd);
3536 cmd_getval(g_ceph_context, cmdmap, "states", states);
3537 if (pool >= 0 && !osdmap.have_pg_pool(pool)) {
3538 *ss << "pool " << pool << " does not exist";
3539 return -ENOENT;
3540 }
3541 if (osd >= 0 && !osdmap.is_up(osd)) {
3542 *ss << "osd " << osd << " is not up";
3543 return -EAGAIN;
3544 }
3545 if (states.empty())
3546 states.push_back("all");
3547
3548 uint64_t state = 0;
3549
3550 while (!states.empty()) {
3551 string state_str = states.back();
3552
3553 if (state_str == "all") {
3554 state = -1;
3555 break;
3556 } else {
3557 auto filter = pg_string_state(state_str);
3558 if (!filter) {
3559 *ss << "'" << state_str << "' is not a valid pg state,"
3560 << " available choices: " << pg_state_string(0xFFFFFFFF);
3561 return -EINVAL;
3562 }
3563 state |= *filter;
3564 }
3565
3566 states.pop_back();
3567 }
3568
3569 pg_map.get_filtered_pg_stats(state, pool, osd, primary, pgs);
3570
3571 if (f && !pgs.empty()) {
3572 pg_map.dump_filtered_pg_stats(f, pgs);
3573 f->flush(*odata);
3574 } else if (!pgs.empty()) {
3575 pg_map.dump_filtered_pg_stats(ds, pgs);
3576 odata->append(ds);
3577 odata->append(omap_stats_note);
3578 }
3579 return 0;
3580 }
3581
3582 if (prefix == "pg dump_stuck") {
3583 vector<string> stuckop_vec;
3584 cmd_getval(g_ceph_context, cmdmap, "stuckops", stuckop_vec);
3585 if (stuckop_vec.empty())
3586 stuckop_vec.push_back("unclean");
3587 int64_t threshold;
3588 cmd_getval(g_ceph_context, cmdmap, "threshold", threshold,
3589 g_conf().get_val<int64_t>("mon_pg_stuck_threshold"));
3590
3591 if (pg_map.dump_stuck_pg_stats(ds, f, (int)threshold, stuckop_vec) < 0) {
3592 *ss << "failed";
3593 } else {
3594 *ss << "ok";
3595 }
3596 odata->append(ds);
3597 return 0;
3598 }
3599
3600 if (prefix == "pg debug") {
3601 string debugop;
3602 cmd_getval(g_ceph_context, cmdmap, "debugop", debugop,
3603 string("unfound_objects_exist"));
3604 if (debugop == "unfound_objects_exist") {
3605 bool unfound_objects_exist = false;
3606 for (const auto& p : pg_map.pg_stat) {
3607 if (p.second.stats.sum.num_objects_unfound > 0) {
3608 unfound_objects_exist = true;
3609 break;
3610 }
3611 }
3612 if (unfound_objects_exist)
3613 ds << "TRUE";
3614 else
3615 ds << "FALSE";
3616 odata->append(ds);
3617 return 0;
3618 }
3619 if (debugop == "degraded_pgs_exist") {
3620 bool degraded_pgs_exist = false;
3621 for (const auto& p : pg_map.pg_stat) {
3622 if (p.second.stats.sum.num_objects_degraded > 0) {
3623 degraded_pgs_exist = true;
3624 break;
3625 }
3626 }
3627 if (degraded_pgs_exist)
3628 ds << "TRUE";
3629 else
3630 ds << "FALSE";
3631 odata->append(ds);
3632 return 0;
3633 }
3634 }
3635
3636 if (prefix == "osd perf") {
3637 if (f) {
3638 f->open_object_section("osdstats");
3639 pg_map.dump_osd_perf_stats(f);
3640 f->close_section();
3641 f->flush(ds);
3642 } else {
3643 pg_map.print_osd_perf_stats(&ds);
3644 }
3645 odata->append(ds);
3646 return 0;
3647 }
3648
3649 if (prefix == "osd blocked-by") {
3650 if (f) {
3651 f->open_object_section("osd_blocked_by");
3652 pg_map.dump_osd_blocked_by_stats(f);
3653 f->close_section();
3654 f->flush(ds);
3655 } else {
3656 pg_map.print_osd_blocked_by_stats(&ds);
3657 }
3658 odata->append(ds);
3659 return 0;
3660 }
3661
3662 return -EOPNOTSUPP;
3663 }
3664
3665 void PGMapUpdater::check_osd_map(
3666 CephContext *cct,
3667 const OSDMap& osdmap,
3668 const PGMap& pgmap,
3669 PGMap::Incremental *pending_inc)
3670 {
3671 for (auto& p : pgmap.osd_stat) {
3672 if (!osdmap.exists(p.first)) {
3673 // remove osd_stat
3674 pending_inc->rm_stat(p.first);
3675 } else if (osdmap.is_out(p.first)) {
3676 // zero osd_stat
3677 if (p.second.statfs.total != 0) {
3678 pending_inc->stat_osd_out(p.first);
3679 }
3680 } else if (!osdmap.is_up(p.first)) {
3681 // zero the op_queue_age_hist
3682 if (!p.second.op_queue_age_hist.empty()) {
3683 pending_inc->stat_osd_down_up(p.first, pgmap);
3684 }
3685 }
3686 }
3687
3688 // deleted pgs (pools)?
3689 for (auto& p : pgmap.pg_pool_sum) {
3690 if (!osdmap.have_pg_pool(p.first)) {
3691 ldout(cct, 10) << __func__ << " pool " << p.first << " gone, removing pgs"
3692 << dendl;
3693 for (auto& q : pgmap.pg_stat) {
3694 if (q.first.pool() == p.first) {
3695 pending_inc->pg_remove.insert(q.first);
3696 }
3697 }
3698 auto q = pending_inc->pg_stat_updates.begin();
3699 while (q != pending_inc->pg_stat_updates.end()) {
3700 if (q->first.pool() == p.first) {
3701 q = pending_inc->pg_stat_updates.erase(q);
3702 } else {
3703 ++q;
3704 }
3705 }
3706 }
3707 }
3708
3709 // new (split or new pool) or merged pgs?
3710 map<int64_t,unsigned> new_pg_num;
3711 for (auto& p : osdmap.get_pools()) {
3712 int64_t poolid = p.first;
3713 const pg_pool_t& pi = p.second;
3714 auto q = pgmap.num_pg_by_pool.find(poolid);
3715 unsigned my_pg_num = 0;
3716 if (q != pgmap.num_pg_by_pool.end())
3717 my_pg_num = q->second;
3718 unsigned pg_num = pi.get_pg_num();
3719 new_pg_num[poolid] = pg_num;
3720 if (my_pg_num < pg_num) {
3721 ldout(cct,10) << __func__ << " pool " << poolid << " pg_num " << pg_num
3722 << " > my pg_num " << my_pg_num << dendl;
3723 for (unsigned ps = my_pg_num; ps < pg_num; ++ps) {
3724 pg_t pgid(ps, poolid);
3725 if (pending_inc->pg_stat_updates.count(pgid) == 0) {
3726 ldout(cct,20) << __func__ << " adding " << pgid << dendl;
3727 pg_stat_t &stats = pending_inc->pg_stat_updates[pgid];
3728 stats.last_fresh = osdmap.get_modified();
3729 stats.last_active = osdmap.get_modified();
3730 stats.last_change = osdmap.get_modified();
3731 stats.last_peered = osdmap.get_modified();
3732 stats.last_clean = osdmap.get_modified();
3733 stats.last_unstale = osdmap.get_modified();
3734 stats.last_undegraded = osdmap.get_modified();
3735 stats.last_fullsized = osdmap.get_modified();
3736 stats.last_scrub_stamp = osdmap.get_modified();
3737 stats.last_deep_scrub_stamp = osdmap.get_modified();
3738 stats.last_clean_scrub_stamp = osdmap.get_modified();
3739 }
3740 }
3741 } else if (my_pg_num > pg_num) {
3742 ldout(cct,10) << __func__ << " pool " << poolid << " pg_num " << pg_num
3743 << " < my pg_num " << my_pg_num << dendl;
3744 for (unsigned i = pg_num; i < my_pg_num; ++i) {
3745 pg_t pgid(i, poolid);
3746 ldout(cct,20) << __func__ << " removing merged " << pgid << dendl;
3747 if (pgmap.pg_stat.count(pgid)) {
3748 pending_inc->pg_remove.insert(pgid);
3749 }
3750 pending_inc->pg_stat_updates.erase(pgid);
3751 }
3752 }
3753 }
3754 auto i = pending_inc->pg_stat_updates.begin();
3755 while (i != pending_inc->pg_stat_updates.end()) {
3756 auto j = new_pg_num.find(i->first.pool());
3757 if (j == new_pg_num.end() ||
3758 i->first.ps() >= j->second) {
3759 ldout(cct,20) << __func__ << " removing pending update to old "
3760 << i->first << dendl;
3761 i = pending_inc->pg_stat_updates.erase(i);
3762 } else {
3763 ++i;
3764 }
3765 }
3766 }
3767
3768 static void _try_mark_pg_stale(
3769 const OSDMap& osdmap,
3770 pg_t pgid,
3771 const pg_stat_t& cur,
3772 PGMap::Incremental *pending_inc)
3773 {
3774 if ((cur.state & PG_STATE_STALE) == 0 &&
3775 cur.acting_primary != -1 &&
3776 osdmap.is_down(cur.acting_primary)) {
3777 pg_stat_t *newstat;
3778 auto q = pending_inc->pg_stat_updates.find(pgid);
3779 if (q != pending_inc->pg_stat_updates.end()) {
3780 if ((q->second.acting_primary == cur.acting_primary) ||
3781 ((q->second.state & PG_STATE_STALE) == 0 &&
3782 q->second.acting_primary != -1 &&
3783 osdmap.is_down(q->second.acting_primary))) {
3784 newstat = &q->second;
3785 } else {
3786 // pending update is no longer down or already stale
3787 return;
3788 }
3789 } else {
3790 newstat = &pending_inc->pg_stat_updates[pgid];
3791 *newstat = cur;
3792 }
3793 dout(10) << __func__ << " marking pg " << pgid
3794 << " stale (acting_primary " << newstat->acting_primary
3795 << ")" << dendl;
3796 newstat->state |= PG_STATE_STALE;
3797 newstat->last_unstale = ceph_clock_now();
3798 }
3799 }
3800
3801 void PGMapUpdater::check_down_pgs(
3802 const OSDMap &osdmap,
3803 const PGMap &pg_map,
3804 bool check_all,
3805 const set<int>& need_check_down_pg_osds,
3806 PGMap::Incremental *pending_inc)
3807 {
3808 // if a large number of osds changed state, just iterate over the whole
3809 // pg map.
3810 if (need_check_down_pg_osds.size() > (unsigned)osdmap.get_num_osds() *
3811 g_conf().get_val<double>("mon_pg_check_down_all_threshold")) {
3812 check_all = true;
3813 }
3814
3815 if (check_all) {
3816 for (const auto& p : pg_map.pg_stat) {
3817 _try_mark_pg_stale(osdmap, p.first, p.second, pending_inc);
3818 }
3819 } else {
3820 for (auto osd : need_check_down_pg_osds) {
3821 if (osdmap.is_down(osd)) {
3822 auto p = pg_map.pg_by_osd.find(osd);
3823 if (p == pg_map.pg_by_osd.end()) {
3824 continue;
3825 }
3826 for (auto pgid : p->second) {
3827 const pg_stat_t &stat = pg_map.pg_stat.at(pgid);
3828 ceph_assert(stat.acting_primary == osd);
3829 _try_mark_pg_stale(osdmap, pgid, stat, pending_inc);
3830 }
3831 }
3832 }
3833 }
3834 }
3835
3836 int reweight::by_utilization(
3837 const OSDMap &osdmap,
3838 const PGMap &pgm,
3839 int oload,
3840 double max_changef,
3841 int max_osds,
3842 bool by_pg, const set<int64_t> *pools,
3843 bool no_increasing,
3844 mempool::osdmap::map<int32_t, uint32_t>* new_weights,
3845 std::stringstream *ss,
3846 std::string *out_str,
3847 ceph::Formatter *f)
3848 {
3849 if (oload <= 100) {
3850 *ss << "You must give a percentage higher than 100. "
3851 "The reweighting threshold will be calculated as <average-utilization> "
3852 "times <input-percentage>. For example, an argument of 200 would "
3853 "reweight OSDs which are twice as utilized as the average OSD.\n";
3854 return -EINVAL;
3855 }
3856
3857 vector<int> pgs_by_osd(osdmap.get_max_osd());
3858
3859 // Avoid putting a small number (or 0) in the denominator when calculating
3860 // average_util
3861 double average_util;
3862 if (by_pg) {
3863 // by pg mapping
3864 double weight_sum = 0.0; // sum up the crush weights
3865 unsigned num_pg_copies = 0;
3866 int num_osds = 0;
3867 for (const auto& pg : pgm.pg_stat) {
3868 if (pools && pools->count(pg.first.pool()) == 0)
3869 continue;
3870 for (const auto acting : pg.second.acting) {
3871 if (!osdmap.exists(acting)) {
3872 continue;
3873 }
3874 if (acting >= (int)pgs_by_osd.size())
3875 pgs_by_osd.resize(acting);
3876 if (pgs_by_osd[acting] == 0) {
3877 if (osdmap.crush->get_item_weightf(acting) <= 0) {
3878 //skip if we currently can not identify item
3879 continue;
3880 }
3881 weight_sum += osdmap.crush->get_item_weightf(acting);
3882 ++num_osds;
3883 }
3884 ++pgs_by_osd[acting];
3885 ++num_pg_copies;
3886 }
3887 }
3888
3889 if (!num_osds || (num_pg_copies / num_osds < g_conf()->mon_reweight_min_pgs_per_osd)) {
3890 *ss << "Refusing to reweight: we only have " << num_pg_copies
3891 << " PGs across " << num_osds << " osds!\n";
3892 return -EDOM;
3893 }
3894
3895 average_util = (double)num_pg_copies / weight_sum;
3896 } else {
3897 // by osd utilization
3898 int num_osd = std::max<size_t>(1, pgm.osd_stat.size());
3899 if ((uint64_t)pgm.osd_sum.statfs.total / num_osd
3900 < g_conf()->mon_reweight_min_bytes_per_osd) {
3901 *ss << "Refusing to reweight: we only have " << pgm.osd_sum.statfs.kb()
3902 << " kb across all osds!\n";
3903 return -EDOM;
3904 }
3905 if ((uint64_t)pgm.osd_sum.statfs.get_used_raw() / num_osd
3906 < g_conf()->mon_reweight_min_bytes_per_osd) {
3907 *ss << "Refusing to reweight: we only have "
3908 << pgm.osd_sum.statfs.kb_used_raw()
3909 << " kb used across all osds!\n";
3910 return -EDOM;
3911 }
3912
3913 average_util = (double)pgm.osd_sum.statfs.get_used_raw() /
3914 (double)pgm.osd_sum.statfs.total;
3915 }
3916
3917 // adjust down only if we are above the threshold
3918 const double overload_util = average_util * (double)oload / 100.0;
3919
3920 // but aggressively adjust weights up whenever possible.
3921 const double underload_util = average_util;
3922
3923 const unsigned max_change = (unsigned)(max_changef * (double)0x10000);
3924
3925 ostringstream oss;
3926 if (f) {
3927 f->open_object_section("reweight_by_utilization");
3928 f->dump_int("overload_min", oload);
3929 f->dump_float("max_change", max_changef);
3930 f->dump_int("max_change_osds", max_osds);
3931 f->dump_float("average_utilization", average_util);
3932 f->dump_float("overload_utilization", overload_util);
3933 } else {
3934 oss << "oload " << oload << "\n";
3935 oss << "max_change " << max_changef << "\n";
3936 oss << "max_change_osds " << max_osds << "\n";
3937 oss.precision(4);
3938 oss << "average_utilization " << std::fixed << average_util << "\n";
3939 oss << "overload_utilization " << overload_util << "\n";
3940 }
3941 int num_changed = 0;
3942
3943 // precompute util for each OSD
3944 std::vector<std::pair<int, float> > util_by_osd;
3945 for (const auto& p : pgm.osd_stat) {
3946 std::pair<int, float> osd_util;
3947 osd_util.first = p.first;
3948 if (by_pg) {
3949 if (p.first >= (int)pgs_by_osd.size() ||
3950 pgs_by_osd[p.first] == 0) {
3951 // skip if this OSD does not contain any pg
3952 // belonging to the specified pool(s).
3953 continue;
3954 }
3955
3956 if (osdmap.crush->get_item_weightf(p.first) <= 0) {
3957 // skip if we are unable to locate item.
3958 continue;
3959 }
3960
3961 osd_util.second =
3962 pgs_by_osd[p.first] / osdmap.crush->get_item_weightf(p.first);
3963 } else {
3964 osd_util.second =
3965 (double)p.second.statfs.get_used_raw() / (double)p.second.statfs.total;
3966 }
3967 util_by_osd.push_back(osd_util);
3968 }
3969
3970 // sort by absolute deviation from the mean utilization,
3971 // in descending order.
3972 std::sort(util_by_osd.begin(), util_by_osd.end(),
3973 [average_util](std::pair<int, float> l, std::pair<int, float> r) {
3974 return abs(l.second - average_util) > abs(r.second - average_util);
3975 }
3976 );
3977
3978 if (f)
3979 f->open_array_section("reweights");
3980
3981 for (const auto& p : util_by_osd) {
3982 unsigned weight = osdmap.get_weight(p.first);
3983 if (weight == 0) {
3984 // skip if OSD is currently out
3985 continue;
3986 }
3987 float util = p.second;
3988
3989 if (util >= overload_util) {
3990 // Assign a lower weight to overloaded OSDs. The current weight
3991 // is a factor to take into account the original weights,
3992 // to represent e.g. differing storage capacities
3993 unsigned new_weight = (unsigned)((average_util / util) * (float)weight);
3994 if (weight > max_change)
3995 new_weight = std::max(new_weight, weight - max_change);
3996 new_weights->insert({p.first, new_weight});
3997 if (f) {
3998 f->open_object_section("osd");
3999 f->dump_int("osd", p.first);
4000 f->dump_float("weight", (float)weight / (float)0x10000);
4001 f->dump_float("new_weight", (float)new_weight / (float)0x10000);
4002 f->close_section();
4003 } else {
4004 oss << "osd." << p.first << " weight "
4005 << (float)weight / (float)0x10000 << " -> "
4006 << (float)new_weight / (float)0x10000 << "\n";
4007 }
4008 if (++num_changed >= max_osds)
4009 break;
4010 }
4011 if (!no_increasing && util <= underload_util) {
4012 // assign a higher weight.. if we can.
4013 unsigned new_weight = (unsigned)((average_util / util) * (float)weight);
4014 new_weight = std::min(new_weight, weight + max_change);
4015 if (new_weight > 0x10000)
4016 new_weight = 0x10000;
4017 if (new_weight > weight) {
4018 new_weights->insert({p.first, new_weight});
4019 oss << "osd." << p.first << " weight "
4020 << (float)weight / (float)0x10000 << " -> "
4021 << (float)new_weight / (float)0x10000 << "\n";
4022 if (++num_changed >= max_osds)
4023 break;
4024 }
4025 }
4026 }
4027 if (f) {
4028 f->close_section();
4029 }
4030
4031 OSDMap newmap;
4032 newmap.deepish_copy_from(osdmap);
4033 OSDMap::Incremental newinc;
4034 newinc.fsid = newmap.get_fsid();
4035 newinc.epoch = newmap.get_epoch() + 1;
4036 newinc.new_weight = *new_weights;
4037 newmap.apply_incremental(newinc);
4038
4039 osdmap.summarize_mapping_stats(&newmap, pools, out_str, f);
4040
4041 if (f) {
4042 f->close_section();
4043 } else {
4044 *out_str += "\n";
4045 *out_str += oss.str();
4046 }
4047 return num_changed;
4048 }
4049