1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2011 New Dream Network
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
15 *
16 */
17
18 #include <list>
19 #include <map>
20 #include <ostream>
21 #include <sstream>
22 #include <set>
23 #include <string>
24 #include <utility>
25 #include <vector>
26
27
28 #include <boost/assign/list_of.hpp>
29
30 #include "include/ceph_features.h"
31 #include "include/encoding.h"
32 #include "include/stringify.h"
33 extern "C" {
34 #include "crush/hash.h"
35 }
36
37 #include "common/Formatter.h"
38 #include "OSDMap.h"
39 #include "osd_types.h"
40 #include "os/Transaction.h"
41
42 using std::list;
43 using std::make_pair;
44 using std::map;
45 using std::ostream;
46 using std::ostringstream;
47 using std::pair;
48 using std::set;
49 using std::string;
50 using std::stringstream;
51 using std::unique_ptr;
52 using std::vector;
53
54 using ceph::decode;
55 using ceph::decode_nohead;
56 using ceph::encode;
57 using ceph::encode_nohead;
58 using ceph::Formatter;
59
60 using namespace std::literals;
61
62 const char *ceph_osd_flag_name(unsigned flag)
63 {
64 switch (flag) {
65 case CEPH_OSD_FLAG_ACK: return "ack";
66 case CEPH_OSD_FLAG_ONNVRAM: return "onnvram";
67 case CEPH_OSD_FLAG_ONDISK: return "ondisk";
68 case CEPH_OSD_FLAG_RETRY: return "retry";
69 case CEPH_OSD_FLAG_READ: return "read";
70 case CEPH_OSD_FLAG_WRITE: return "write";
71 case CEPH_OSD_FLAG_ORDERSNAP: return "ordersnap";
72 case CEPH_OSD_FLAG_PEERSTAT_OLD: return "peerstat_old";
73 case CEPH_OSD_FLAG_BALANCE_READS: return "balance_reads";
74 case CEPH_OSD_FLAG_PARALLELEXEC: return "parallelexec";
75 case CEPH_OSD_FLAG_PGOP: return "pgop";
76 case CEPH_OSD_FLAG_EXEC: return "exec";
77 case CEPH_OSD_FLAG_EXEC_PUBLIC: return "exec_public";
78 case CEPH_OSD_FLAG_LOCALIZE_READS: return "localize_reads";
79 case CEPH_OSD_FLAG_RWORDERED: return "rwordered";
80 case CEPH_OSD_FLAG_IGNORE_CACHE: return "ignore_cache";
81 case CEPH_OSD_FLAG_SKIPRWLOCKS: return "skiprwlocks";
82 case CEPH_OSD_FLAG_IGNORE_OVERLAY: return "ignore_overlay";
83 case CEPH_OSD_FLAG_FLUSH: return "flush";
84 case CEPH_OSD_FLAG_MAP_SNAP_CLONE: return "map_snap_clone";
85 case CEPH_OSD_FLAG_ENFORCE_SNAPC: return "enforce_snapc";
86 case CEPH_OSD_FLAG_REDIRECTED: return "redirected";
87 case CEPH_OSD_FLAG_KNOWN_REDIR: return "known_if_redirected";
88 case CEPH_OSD_FLAG_FULL_TRY: return "full_try";
89 case CEPH_OSD_FLAG_FULL_FORCE: return "full_force";
90 case CEPH_OSD_FLAG_IGNORE_REDIRECT: return "ignore_redirect";
91 case CEPH_OSD_FLAG_RETURNVEC: return "returnvec";
92 default: return "???";
93 }
94 }
95
96 string ceph_osd_flag_string(unsigned flags)
97 {
98 string s;
99 for (unsigned i=0; i<32; ++i) {
100 if (flags & (1u<<i)) {
101 if (s.length())
102 s += "+";
103 s += ceph_osd_flag_name(1u << i);
104 }
105 }
106 if (s.length())
107 return s;
108 return string("-");
109 }
110
111 const char * ceph_osd_op_flag_name(unsigned flag)
112 {
113 const char *name;
114
115 switch(flag) {
116 case CEPH_OSD_OP_FLAG_EXCL:
117 name = "excl";
118 break;
119 case CEPH_OSD_OP_FLAG_FAILOK:
120 name = "failok";
121 break;
122 case CEPH_OSD_OP_FLAG_FADVISE_RANDOM:
123 name = "fadvise_random";
124 break;
125 case CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL:
126 name = "fadvise_sequential";
127 break;
128 case CEPH_OSD_OP_FLAG_FADVISE_WILLNEED:
129 name = "favise_willneed";
130 break;
131 case CEPH_OSD_OP_FLAG_FADVISE_DONTNEED:
132 name = "fadvise_dontneed";
133 break;
134 case CEPH_OSD_OP_FLAG_FADVISE_NOCACHE:
135 name = "fadvise_nocache";
136 break;
137 case CEPH_OSD_OP_FLAG_WITH_REFERENCE:
138 name = "with_reference";
139 break;
140 case CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE:
141 name = "bypass_clean_cache";
142 break;
143 default:
144 name = "???";
145 };
146
147 return name;
148 }
149
150 string ceph_osd_op_flag_string(unsigned flags)
151 {
152 string s;
153 for (unsigned i=0; i<32; ++i) {
154 if (flags & (1u<<i)) {
155 if (s.length())
156 s += "+";
157 s += ceph_osd_op_flag_name(1u << i);
158 }
159 }
160 if (s.length())
161 return s;
162 return string("-");
163 }
164
165 string ceph_osd_alloc_hint_flag_string(unsigned flags)
166 {
167 string s;
168 for (unsigned i=0; i<32; ++i) {
169 if (flags & (1u<<i)) {
170 if (s.length())
171 s += "+";
172 s += ceph_osd_alloc_hint_flag_name(1u << i);
173 }
174 }
175 if (s.length())
176 return s;
177 return string("-");
178 }
179
180 void pg_shard_t::encode(ceph::buffer::list &bl) const
181 {
182 ENCODE_START(1, 1, bl);
183 encode(osd, bl);
184 encode(shard, bl);
185 ENCODE_FINISH(bl);
186 }
187 void pg_shard_t::decode(ceph::buffer::list::const_iterator &bl)
188 {
189 DECODE_START(1, bl);
190 decode(osd, bl);
191 decode(shard, bl);
192 DECODE_FINISH(bl);
193 }
194
195 ostream &operator<<(ostream &lhs, const pg_shard_t &rhs)
196 {
197 if (rhs.is_undefined())
198 return lhs << "?";
199 if (rhs.shard == shard_id_t::NO_SHARD)
200 return lhs << rhs.get_osd();
201 return lhs << rhs.get_osd() << '(' << (unsigned)(rhs.shard) << ')';
202 }
203
204 void dump(Formatter* f, const osd_alerts_t& alerts)
205 {
206 for (auto& a : alerts) {
207 string s0 = " osd: ";
208 s0 += stringify(a.first);
209 string s;
210 for (auto& aa : a.second) {
211 s = s0;
212 s += " ";
213 s += aa.first;
214 s += ":";
215 s += aa.second;
216 f->dump_string("alert", s);
217 }
218 }
219 }
220
221 // -- osd_reqid_t --
222 void osd_reqid_t::dump(Formatter *f) const
223 {
224 f->dump_stream("name") << name;
225 f->dump_int("inc", inc);
226 f->dump_unsigned("tid", tid);
227 }
228
229 void osd_reqid_t::generate_test_instances(list<osd_reqid_t*>& o)
230 {
231 o.push_back(new osd_reqid_t);
232 o.push_back(new osd_reqid_t(entity_name_t::CLIENT(123), 1, 45678));
233 }
234
235 // -- object_locator_t --
236
237 void object_locator_t::encode(ceph::buffer::list& bl) const
238 {
239 // verify that nobody's corrupted the locator
240 ceph_assert(hash == -1 || key.empty());
241 __u8 encode_compat = 3;
242 ENCODE_START(6, encode_compat, bl);
243 encode(pool, bl);
244 int32_t preferred = -1; // tell old code there is no preferred osd (-1).
245 encode(preferred, bl);
246 encode(key, bl);
247 encode(nspace, bl);
248 encode(hash, bl);
249 if (hash != -1)
250 encode_compat = std::max<std::uint8_t>(encode_compat, 6); // need to interpret the hash
251 ENCODE_FINISH_NEW_COMPAT(bl, encode_compat);
252 }
253
254 void object_locator_t::decode(ceph::buffer::list::const_iterator& p)
255 {
256 DECODE_START_LEGACY_COMPAT_LEN(6, 3, 3, p);
257 if (struct_v < 2) {
258 int32_t op;
259 decode(op, p);
260 pool = op;
261 int16_t pref;
262 decode(pref, p);
263 } else {
264 decode(pool, p);
265 int32_t preferred;
266 decode(preferred, p);
267 }
268 decode(key, p);
269 if (struct_v >= 5)
270 decode(nspace, p);
271 if (struct_v >= 6)
272 decode(hash, p);
273 else
274 hash = -1;
275 DECODE_FINISH(p);
276 // verify that nobody's corrupted the locator
277 ceph_assert(hash == -1 || key.empty());
278 }
279
280 void object_locator_t::dump(Formatter *f) const
281 {
282 f->dump_int("pool", pool);
283 f->dump_string("key", key);
284 f->dump_string("namespace", nspace);
285 f->dump_int("hash", hash);
286 }
287
288 void object_locator_t::generate_test_instances(list<object_locator_t*>& o)
289 {
290 o.push_back(new object_locator_t);
291 o.push_back(new object_locator_t(123));
292 o.push_back(new object_locator_t(123, 876));
293 o.push_back(new object_locator_t(1, "n2"));
294 o.push_back(new object_locator_t(1234, "", "key"));
295 o.push_back(new object_locator_t(12, "n1", "key2"));
296 }
297
298 // -- request_redirect_t --
299 void request_redirect_t::encode(ceph::buffer::list& bl) const
300 {
301 ENCODE_START(1, 1, bl);
302 encode(redirect_locator, bl);
303 encode(redirect_object, bl);
304 // legacy of the removed osd_instructions member
305 encode((uint32_t)0, bl);
306 ENCODE_FINISH(bl);
307 }
308
309 void request_redirect_t::decode(ceph::buffer::list::const_iterator& bl)
310 {
311 DECODE_START(1, bl);
312 uint32_t legacy_osd_instructions_len;
313 decode(redirect_locator, bl);
314 decode(redirect_object, bl);
315 decode(legacy_osd_instructions_len, bl);
316 if (legacy_osd_instructions_len) {
317 bl.advance(legacy_osd_instructions_len);
318 }
319 DECODE_FINISH(bl);
320 }
321
322 void request_redirect_t::dump(Formatter *f) const
323 {
324 f->dump_string("object", redirect_object);
325 f->open_object_section("locator");
326 redirect_locator.dump(f);
327 f->close_section(); // locator
328 }
329
330 void request_redirect_t::generate_test_instances(list<request_redirect_t*>& o)
331 {
332 object_locator_t loc(1, "redir_obj");
333 o.push_back(new request_redirect_t());
334 o.push_back(new request_redirect_t(loc, 0));
335 o.push_back(new request_redirect_t(loc, "redir_obj"));
336 o.push_back(new request_redirect_t(loc));
337 }
338
339 void objectstore_perf_stat_t::dump(Formatter *f) const
340 {
341 // *_ms values just for compatibility.
342 f->dump_float("commit_latency_ms", os_commit_latency_ns / 1000000.0);
343 f->dump_float("apply_latency_ms", os_apply_latency_ns / 1000000.0);
344 f->dump_unsigned("commit_latency_ns", os_commit_latency_ns);
345 f->dump_unsigned("apply_latency_ns", os_apply_latency_ns);
346 }
347
348 void objectstore_perf_stat_t::encode(ceph::buffer::list &bl, uint64_t features) const
349 {
350 uint8_t target_v = 2;
351 if (!HAVE_FEATURE(features, OS_PERF_STAT_NS)) {
352 target_v = 1;
353 }
354 ENCODE_START(target_v, target_v, bl);
355 if (target_v >= 2) {
356 encode(os_commit_latency_ns, bl);
357 encode(os_apply_latency_ns, bl);
358 } else {
359 constexpr auto NS_PER_MS = std::chrono::nanoseconds(1ms).count();
360 uint32_t commit_latency_ms = os_commit_latency_ns / NS_PER_MS;
361 uint32_t apply_latency_ms = os_apply_latency_ns / NS_PER_MS;
362 encode(commit_latency_ms, bl); // for compatibility with older monitor.
363 encode(apply_latency_ms, bl); // for compatibility with older monitor.
364 }
365 ENCODE_FINISH(bl);
366 }
367
368 void objectstore_perf_stat_t::decode(ceph::buffer::list::const_iterator &bl)
369 {
370 DECODE_START(2, bl);
371 if (struct_v >= 2) {
372 decode(os_commit_latency_ns, bl);
373 decode(os_apply_latency_ns, bl);
374 } else {
375 uint32_t commit_latency_ms;
376 uint32_t apply_latency_ms;
377 decode(commit_latency_ms, bl);
378 decode(apply_latency_ms, bl);
379 constexpr auto NS_PER_MS = std::chrono::nanoseconds(1ms).count();
380 os_commit_latency_ns = commit_latency_ms * NS_PER_MS;
381 os_apply_latency_ns = apply_latency_ms * NS_PER_MS;
382 }
383 DECODE_FINISH(bl);
384 }
385
386 void objectstore_perf_stat_t::generate_test_instances(std::list<objectstore_perf_stat_t*>& o)
387 {
388 o.push_back(new objectstore_perf_stat_t());
389 o.push_back(new objectstore_perf_stat_t());
390 o.back()->os_commit_latency_ns = 20000000;
391 o.back()->os_apply_latency_ns = 30000000;
392 }
393
394 // -- osd_stat_t --
395 void osd_stat_t::dump(Formatter *f) const
396 {
397 f->dump_unsigned("up_from", up_from);
398 f->dump_unsigned("seq", seq);
399 f->dump_unsigned("num_pgs", num_pgs);
400 f->dump_unsigned("num_osds", num_osds);
401 f->dump_unsigned("num_per_pool_osds", num_per_pool_osds);
402 f->dump_unsigned("num_per_pool_omap_osds", num_per_pool_omap_osds);
403
404 /// dump legacy stats fields to ensure backward compatibility.
405 f->dump_unsigned("kb", statfs.kb());
406 f->dump_unsigned("kb_used", statfs.kb_used_raw());
407 f->dump_unsigned("kb_used_data", statfs.kb_used_data());
408 f->dump_unsigned("kb_used_omap", statfs.kb_used_omap());
409 f->dump_unsigned("kb_used_meta", statfs.kb_used_internal_metadata());
410 f->dump_unsigned("kb_avail", statfs.kb_avail());
411 ////////////////////
412
413 f->open_object_section("statfs");
414 statfs.dump(f);
415 f->close_section();
416 f->open_array_section("hb_peers");
417 for (auto p : hb_peers)
418 f->dump_int("osd", p);
419 f->close_section();
420 f->dump_int("snap_trim_queue_len", snap_trim_queue_len);
421 f->dump_int("num_snap_trimming", num_snap_trimming);
422 f->dump_int("num_shards_repaired", num_shards_repaired);
423 f->open_object_section("op_queue_age_hist");
424 op_queue_age_hist.dump(f);
425 f->close_section();
426 f->open_object_section("perf_stat");
427 os_perf_stat.dump(f);
428 f->close_section();
429 f->open_array_section("alerts");
430 ::dump(f, os_alerts);
431 f->close_section();
432 f->open_array_section("network_ping_times");
433 for (auto &i : hb_pingtime) {
434 f->open_object_section("entry");
435 f->dump_int("osd", i.first);
436 const time_t lu(i.second.last_update);
437 char buffer[26];
438 string lustr(ctime_r(&lu, buffer));
439 lustr.pop_back(); // Remove trailing \n
440 f->dump_string("last update", lustr);
441 f->open_array_section("interfaces");
442 f->open_object_section("interface");
443 f->dump_string("interface", "back");
444 f->open_object_section("average");
445 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(i.second.back_pingtime[0],3).c_str());
446 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(i.second.back_pingtime[1],3).c_str());
447 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(i.second.back_pingtime[2],3).c_str());
448 f->close_section(); // average
449 f->open_object_section("min");
450 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(i.second.back_min[0],3).c_str());
451 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(i.second.back_min[1],3).c_str());
452 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(i.second.back_min[2],3).c_str());
453 f->close_section(); // min
454 f->open_object_section("max");
455 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(i.second.back_max[0],3).c_str());
456 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(i.second.back_max[1],3).c_str());
457 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(i.second.back_max[2],3).c_str());
458 f->close_section(); // max
459 f->dump_format_unquoted("last", "%s", fixed_u_to_string(i.second.back_last,3).c_str());
460 f->close_section(); // interface
461
462 if (i.second.front_pingtime[0] != 0) {
463 f->open_object_section("interface");
464 f->dump_string("interface", "front");
465 f->open_object_section("average");
466 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(i.second.front_pingtime[0],3).c_str());
467 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(i.second.front_pingtime[1],3).c_str());
468 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(i.second.front_pingtime[2],3).c_str());
469 f->close_section(); // average
470 f->open_object_section("min");
471 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(i.second.front_min[0],3).c_str());
472 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(i.second.front_min[1],3).c_str());
473 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(i.second.front_min[2],3).c_str());
474 f->close_section(); // min
475 f->open_object_section("max");
476 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(i.second.front_max[0],3).c_str());
477 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(i.second.front_max[1],3).c_str());
478 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(i.second.front_max[2],3).c_str());
479 f->close_section(); // max
480 f->dump_format_unquoted("last", "%s", fixed_u_to_string(i.second.front_last,3).c_str());
481 f->close_section(); // interface
482 }
483 f->close_section(); // interfaces
484 f->close_section(); // entry
485 }
486 f->close_section(); // network_ping_time
487 }
488
489 void osd_stat_t::encode(ceph::buffer::list &bl, uint64_t features) const
490 {
491 ENCODE_START(14, 2, bl);
492
493 //////// for compatibility ////////
494 int64_t kb = statfs.kb();
495 int64_t kb_used = statfs.kb_used_raw();
496 int64_t kb_avail = statfs.kb_avail();
497 encode(kb, bl);
498 encode(kb_used, bl);
499 encode(kb_avail, bl);
500 ///////////////////////////////////
501
502 encode(snap_trim_queue_len, bl);
503 encode(num_snap_trimming, bl);
504 encode(hb_peers, bl);
505 encode((uint32_t)0, bl);
506 encode(op_queue_age_hist, bl);
507 encode(os_perf_stat, bl, features);
508 encode(up_from, bl);
509 encode(seq, bl);
510 encode(num_pgs, bl);
511
512 //////// for compatibility ////////
513 int64_t kb_used_data = statfs.kb_used_data();
514 int64_t kb_used_omap = statfs.kb_used_omap();
515 int64_t kb_used_meta = statfs.kb_used_internal_metadata();
516 encode(kb_used_data, bl);
517 encode(kb_used_omap, bl);
518 encode(kb_used_meta, bl);
519 encode(statfs, bl);
520 ///////////////////////////////////
521 encode(os_alerts, bl);
522 encode(num_shards_repaired, bl);
523 encode(num_osds, bl);
524 encode(num_per_pool_osds, bl);
525 encode(num_per_pool_omap_osds, bl);
526
527 // hb_pingtime map
528 encode((int)hb_pingtime.size(), bl);
529 for (auto i : hb_pingtime) {
530 encode(i.first, bl); // osd
531 encode(i.second.last_update, bl);
532 encode(i.second.back_pingtime[0], bl);
533 encode(i.second.back_pingtime[1], bl);
534 encode(i.second.back_pingtime[2], bl);
535 encode(i.second.back_min[0], bl);
536 encode(i.second.back_min[1], bl);
537 encode(i.second.back_min[2], bl);
538 encode(i.second.back_max[0], bl);
539 encode(i.second.back_max[1], bl);
540 encode(i.second.back_max[2], bl);
541 encode(i.second.back_last, bl);
542 encode(i.second.front_pingtime[0], bl);
543 encode(i.second.front_pingtime[1], bl);
544 encode(i.second.front_pingtime[2], bl);
545 encode(i.second.front_min[0], bl);
546 encode(i.second.front_min[1], bl);
547 encode(i.second.front_min[2], bl);
548 encode(i.second.front_max[0], bl);
549 encode(i.second.front_max[1], bl);
550 encode(i.second.front_max[2], bl);
551 encode(i.second.front_last, bl);
552 }
553 ENCODE_FINISH(bl);
554 }
555
556 void osd_stat_t::decode(ceph::buffer::list::const_iterator &bl)
557 {
558 int64_t kb, kb_used,kb_avail;
559 int64_t kb_used_data, kb_used_omap, kb_used_meta;
560 DECODE_START_LEGACY_COMPAT_LEN(14, 2, 2, bl);
561 decode(kb, bl);
562 decode(kb_used, bl);
563 decode(kb_avail, bl);
564 decode(snap_trim_queue_len, bl);
565 decode(num_snap_trimming, bl);
566 decode(hb_peers, bl);
567 vector<int> num_hb_out;
568 decode(num_hb_out, bl);
569 if (struct_v >= 3)
570 decode(op_queue_age_hist, bl);
571 if (struct_v >= 4)
572 decode(os_perf_stat, bl);
573 if (struct_v >= 6) {
574 decode(up_from, bl);
575 decode(seq, bl);
576 }
577 if (struct_v >= 7) {
578 decode(num_pgs, bl);
579 }
580 if (struct_v >= 8) {
581 decode(kb_used_data, bl);
582 decode(kb_used_omap, bl);
583 decode(kb_used_meta, bl);
584 } else {
585 kb_used_data = kb_used;
586 kb_used_omap = 0;
587 kb_used_meta = 0;
588 }
589 if (struct_v >= 9) {
590 decode(statfs, bl);
591 } else {
592 statfs.reset();
593 statfs.total = kb << 10;
594 statfs.available = kb_avail << 10;
595 // actually it's totally unexpected to have ststfs.total < statfs.available
596 // here but unfortunately legacy generate_test_instances produced such a
597 // case hence inserting some handling rather than assert
598 statfs.internally_reserved =
599 statfs.total > statfs.available ? statfs.total - statfs.available : 0;
600 kb_used <<= 10;
601 if ((int64_t)statfs.internally_reserved > kb_used) {
602 statfs.internally_reserved -= kb_used;
603 } else {
604 statfs.internally_reserved = 0;
605 }
606 statfs.allocated = kb_used_data << 10;
607 statfs.omap_allocated = kb_used_omap << 10;
608 statfs.internal_metadata = kb_used_meta << 10;
609 }
610 if (struct_v >= 10) {
611 decode(os_alerts, bl);
612 } else {
613 os_alerts.clear();
614 }
615 if (struct_v >= 11) {
616 decode(num_shards_repaired, bl);
617 } else {
618 num_shards_repaired = 0;
619 }
620 if (struct_v >= 12) {
621 decode(num_osds, bl);
622 decode(num_per_pool_osds, bl);
623 } else {
624 num_osds = 0;
625 num_per_pool_osds = 0;
626 }
627 if (struct_v >= 13) {
628 decode(num_per_pool_omap_osds, bl);
629 } else {
630 num_per_pool_omap_osds = 0;
631 }
632 hb_pingtime.clear();
633 if (struct_v >= 14) {
634 int count;
635 decode(count, bl);
636 for (int i = 0 ; i < count ; i++) {
637 int osd;
638 decode(osd, bl);
639 struct Interfaces ifs;
640 decode(ifs.last_update, bl);
641 decode(ifs.back_pingtime[0],bl);
642 decode(ifs.back_pingtime[1], bl);
643 decode(ifs.back_pingtime[2], bl);
644 decode(ifs.back_min[0],bl);
645 decode(ifs.back_min[1], bl);
646 decode(ifs.back_min[2], bl);
647 decode(ifs.back_max[0],bl);
648 decode(ifs.back_max[1], bl);
649 decode(ifs.back_max[2], bl);
650 decode(ifs.back_last, bl);
651 decode(ifs.front_pingtime[0], bl);
652 decode(ifs.front_pingtime[1], bl);
653 decode(ifs.front_pingtime[2], bl);
654 decode(ifs.front_min[0], bl);
655 decode(ifs.front_min[1], bl);
656 decode(ifs.front_min[2], bl);
657 decode(ifs.front_max[0], bl);
658 decode(ifs.front_max[1], bl);
659 decode(ifs.front_max[2], bl);
660 decode(ifs.front_last, bl);
661 hb_pingtime[osd] = ifs;
662 }
663 }
664 DECODE_FINISH(bl);
665 }
666
667 void osd_stat_t::generate_test_instances(std::list<osd_stat_t*>& o)
668 {
669 o.push_back(new osd_stat_t);
670
671 o.push_back(new osd_stat_t);
672 list<store_statfs_t*> ll;
673 store_statfs_t::generate_test_instances(ll);
674 o.back()->statfs = *ll.back();
675 o.back()->hb_peers.push_back(7);
676 o.back()->snap_trim_queue_len = 8;
677 o.back()->num_snap_trimming = 99;
678 o.back()->num_shards_repaired = 101;
679 o.back()->os_alerts[0].emplace(
680 "some alert", "some alert details");
681 o.back()->os_alerts[1].emplace(
682 "some alert2", "some alert2 details");
683 struct Interfaces gen_interfaces = {
684 123456789, { 1000, 900, 800 }, { 990, 890, 790 }, { 1010, 910, 810 }, 1001,
685 { 1100, 1000, 900 }, { 1090, 990, 890 }, { 1110, 1010, 910 }, 1101 };
686 o.back()->hb_pingtime[20] = gen_interfaces;
687 gen_interfaces = {
688 987654321, { 100, 200, 300 }, { 90, 190, 290 }, { 110, 210, 310 }, 101 };
689 o.back()->hb_pingtime[30] = gen_interfaces;
690 }
691
692 // -- pg_t --
693
694 int pg_t::print(char *o, int maxlen) const
695 {
696 return snprintf(o, maxlen, "%llu.%x", (unsigned long long)pool(), ps());
697 }
698
699 bool pg_t::parse(const char *s)
700 {
701 uint64_t ppool;
702 uint32_t pseed;
703 int r = sscanf(s, "%llu.%x", (long long unsigned *)&ppool, &pseed);
704 if (r < 2)
705 return false;
706 m_pool = ppool;
707 m_seed = pseed;
708 return true;
709 }
710
711 bool spg_t::parse(const char *s)
712 {
713 shard = shard_id_t::NO_SHARD;
714 uint64_t ppool;
715 uint32_t pseed;
716 uint32_t pshard;
717 int r = sscanf(s, "%llu.%x", (long long unsigned *)&ppool, &pseed);
718 if (r < 2)
719 return false;
720 pgid.set_pool(ppool);
721 pgid.set_ps(pseed);
722
723 const char *p = strchr(s, 's');
724 if (p) {
725 r = sscanf(p, "s%u", &pshard);
726 if (r == 1) {
727 shard = shard_id_t(pshard);
728 } else {
729 return false;
730 }
731 }
732 return true;
733 }
734
735 char *spg_t::calc_name(char *buf, const char *suffix_backwords) const
736 {
737 while (*suffix_backwords)
738 *--buf = *suffix_backwords++;
739
740 if (!is_no_shard()) {
741 buf = ritoa<uint8_t, 10>((uint8_t)shard.id, buf);
742 *--buf = 's';
743 }
744
745 return pgid.calc_name(buf, "");
746 }
747
748 ostream& operator<<(ostream& out, const spg_t &pg)
749 {
750 char buf[spg_t::calc_name_buf_size];
751 buf[spg_t::calc_name_buf_size - 1] = '\0';
752 out << pg.calc_name(buf + spg_t::calc_name_buf_size - 1, "");
753 return out;
754 }
755
756 pg_t pg_t::get_ancestor(unsigned old_pg_num) const
757 {
758 int old_bits = cbits(old_pg_num);
759 int old_mask = (1 << old_bits) - 1;
760 pg_t ret = *this;
761 ret.m_seed = ceph_stable_mod(m_seed, old_pg_num, old_mask);
762 return ret;
763 }
764
765 bool pg_t::is_split(unsigned old_pg_num, unsigned new_pg_num, set<pg_t> *children) const
766 {
767 //ceph_assert(m_seed < old_pg_num);
768 if (m_seed >= old_pg_num) {
769 // degenerate case
770 return false;
771 }
772 if (new_pg_num <= old_pg_num)
773 return false;
774
775 bool split = false;
776 if (true) {
777 unsigned old_bits = cbits(old_pg_num);
778 unsigned old_mask = (1 << old_bits) - 1;
779 for (unsigned n = 1; ; n++) {
780 unsigned next_bit = (n << (old_bits-1));
781 unsigned s = next_bit | m_seed;
782
783 if (s < old_pg_num || s == m_seed)
784 continue;
785 if (s >= new_pg_num)
786 break;
787 if ((unsigned)ceph_stable_mod(s, old_pg_num, old_mask) == m_seed) {
788 split = true;
789 if (children)
790 children->insert(pg_t(s, m_pool));
791 }
792 }
793 }
794 if (false) {
795 // brute force
796 int old_bits = cbits(old_pg_num);
797 int old_mask = (1 << old_bits) - 1;
798 for (unsigned x = old_pg_num; x < new_pg_num; ++x) {
799 unsigned o = ceph_stable_mod(x, old_pg_num, old_mask);
800 if (o == m_seed) {
801 split = true;
802 children->insert(pg_t(x, m_pool));
803 }
804 }
805 }
806 return split;
807 }
808
809 unsigned pg_t::get_split_bits(unsigned pg_num) const {
810 if (pg_num == 1)
811 return 0;
812 ceph_assert(pg_num > 1);
813
814 // Find unique p such that pg_num \in [2^(p-1), 2^p)
815 unsigned p = cbits(pg_num);
816 ceph_assert(p); // silence coverity #751330
817
818 if ((m_seed % (1<<(p-1))) < (pg_num % (1<<(p-1))))
819 return p;
820 else
821 return p - 1;
822 }
823
824 bool pg_t::is_merge_source(
825 unsigned old_pg_num,
826 unsigned new_pg_num,
827 pg_t *parent) const
828 {
829 if (m_seed < old_pg_num &&
830 m_seed >= new_pg_num) {
831 if (parent) {
832 pg_t t = *this;
833 while (t.m_seed >= new_pg_num) {
834 t = t.get_parent();
835 }
836 *parent = t;
837 }
838 return true;
839 }
840 return false;
841 }
842
843 pg_t pg_t::get_parent() const
844 {
845 unsigned bits = cbits(m_seed);
846 ceph_assert(bits);
847 pg_t retval = *this;
848 retval.m_seed &= ~((~0)<<(bits - 1));
849 return retval;
850 }
851
852 hobject_t pg_t::get_hobj_start() const
853 {
854 return hobject_t(object_t(), string(), 0, m_seed, m_pool,
855 string());
856 }
857
858 hobject_t pg_t::get_hobj_end(unsigned pg_num) const
859 {
860 // note: this assumes a bitwise sort; with the legacy nibblewise
861 // sort a PG did not always cover a single contiguous range of the
862 // (bit-reversed) hash range.
863 unsigned bits = get_split_bits(pg_num);
864 uint64_t rev_start = hobject_t::_reverse_bits(m_seed);
865 uint64_t rev_end = (rev_start | (0xffffffff >> bits)) + 1;
866 if (rev_end >= 0x100000000) {
867 ceph_assert(rev_end == 0x100000000);
868 return hobject_t::get_max();
869 } else {
870 return hobject_t(object_t(), string(), CEPH_NOSNAP,
871 hobject_t::_reverse_bits(rev_end), m_pool,
872 string());
873 }
874 }
875
876 void pg_t::dump(Formatter *f) const
877 {
878 f->dump_unsigned("pool", m_pool);
879 f->dump_unsigned("seed", m_seed);
880 }
881
882 void pg_t::generate_test_instances(list<pg_t*>& o)
883 {
884 o.push_back(new pg_t);
885 o.push_back(new pg_t(1, 2));
886 o.push_back(new pg_t(13123, 3));
887 o.push_back(new pg_t(131223, 4));
888 }
889
890 char *pg_t::calc_name(char *buf, const char *suffix_backwords) const
891 {
892 while (*suffix_backwords)
893 *--buf = *suffix_backwords++;
894
895 buf = ritoa<uint32_t, 16>(m_seed, buf);
896
897 *--buf = '.';
898
899 return ritoa<uint64_t, 10>(m_pool, buf);
900 }
901
902 ostream& operator<<(ostream& out, const pg_t &pg)
903 {
904 char buf[pg_t::calc_name_buf_size];
905 buf[pg_t::calc_name_buf_size - 1] = '\0';
906 out << pg.calc_name(buf + pg_t::calc_name_buf_size - 1, "");
907 return out;
908 }
909
910
911 // -- coll_t --
912
913 void coll_t::calc_str()
914 {
915 switch (type) {
916 case TYPE_META:
917 strcpy(_str_buff, "meta");
918 _str = _str_buff;
919 break;
920 case TYPE_PG:
921 _str_buff[spg_t::calc_name_buf_size - 1] = '\0';
922 _str = pgid.calc_name(_str_buff + spg_t::calc_name_buf_size - 1, "daeh_");
923 break;
924 case TYPE_PG_TEMP:
925 _str_buff[spg_t::calc_name_buf_size - 1] = '\0';
926 _str = pgid.calc_name(_str_buff + spg_t::calc_name_buf_size - 1, "PMET_");
927 break;
928 default:
929 ceph_abort_msg("unknown collection type");
930 }
931 }
932
933 bool coll_t::parse(const std::string& s)
934 {
935 if (s == "meta") {
936 type = TYPE_META;
937 pgid = spg_t();
938 removal_seq = 0;
939 calc_str();
940 ceph_assert(s == _str);
941 return true;
942 }
943 if (s.find("_head") == s.length() - 5 &&
944 pgid.parse(s.substr(0, s.length() - 5))) {
945 type = TYPE_PG;
946 removal_seq = 0;
947 calc_str();
948 ceph_assert(s == _str);
949 return true;
950 }
951 if (s.find("_TEMP") == s.length() - 5 &&
952 pgid.parse(s.substr(0, s.length() - 5))) {
953 type = TYPE_PG_TEMP;
954 removal_seq = 0;
955 calc_str();
956 ceph_assert(s == _str);
957 return true;
958 }
959 return false;
960 }
961
962 void coll_t::encode(ceph::buffer::list& bl) const
963 {
964 using ceph::encode;
965 // when changing this, remember to update encoded_size() too.
966 if (is_temp()) {
967 // can't express this as v2...
968 __u8 struct_v = 3;
969 encode(struct_v, bl);
970 encode(to_str(), bl);
971 } else {
972 __u8 struct_v = 2;
973 encode(struct_v, bl);
974 encode((__u8)type, bl);
975 encode(pgid, bl);
976 snapid_t snap = CEPH_NOSNAP;
977 encode(snap, bl);
978 }
979 }
980
981 size_t coll_t::encoded_size() const
982 {
983 size_t r = sizeof(__u8);
984 if (is_temp()) {
985 // v3
986 r += sizeof(__u32);
987 if (_str) {
988 r += strlen(_str);
989 }
990 } else {
991 // v2
992 // 1. type
993 r += sizeof(__u8);
994 // 2. pgid
995 // - encoding header
996 r += sizeof(ceph_le32) + 2 * sizeof(__u8);
997 // - pg_t
998 r += sizeof(__u8) + sizeof(uint64_t) + 2 * sizeof(uint32_t);
999 // - shard_id_t
1000 r += sizeof(int8_t);
1001 // 3. snapid_t
1002 r += sizeof(uint64_t);
1003 }
1004
1005 return r;
1006 }
1007
1008 void coll_t::decode(ceph::buffer::list::const_iterator& bl)
1009 {
1010 using ceph::decode;
1011 __u8 struct_v;
1012 decode(struct_v, bl);
1013 switch (struct_v) {
1014 case 1:
1015 {
1016 snapid_t snap;
1017 decode(pgid, bl);
1018 decode(snap, bl);
1019
1020 // infer the type
1021 if (pgid == spg_t() && snap == 0) {
1022 type = TYPE_META;
1023 } else {
1024 type = TYPE_PG;
1025 }
1026 removal_seq = 0;
1027 }
1028 break;
1029
1030 case 2:
1031 {
1032 __u8 _type;
1033 snapid_t snap;
1034 decode(_type, bl);
1035 decode(pgid, bl);
1036 decode(snap, bl);
1037 type = (type_t)_type;
1038 removal_seq = 0;
1039 }
1040 break;
1041
1042 case 3:
1043 {
1044 string str;
1045 decode(str, bl);
1046 bool ok = parse(str);
1047 if (!ok)
1048 throw std::domain_error(std::string("unable to parse pg ") + str);
1049 }
1050 break;
1051
1052 default:
1053 {
1054 ostringstream oss;
1055 oss << "coll_t::decode(): don't know how to decode version "
1056 << struct_v;
1057 throw std::domain_error(oss.str());
1058 }
1059 }
1060 }
1061
1062 void coll_t::dump(Formatter *f) const
1063 {
1064 f->dump_unsigned("type_id", (unsigned)type);
1065 if (type != TYPE_META)
1066 f->dump_stream("pgid") << pgid;
1067 f->dump_string("name", to_str());
1068 }
1069
1070 void coll_t::generate_test_instances(list<coll_t*>& o)
1071 {
1072 o.push_back(new coll_t());
1073 o.push_back(new coll_t(spg_t(pg_t(1, 0), shard_id_t::NO_SHARD)));
1074 o.push_back(new coll_t(o.back()->get_temp()));
1075 o.push_back(new coll_t(spg_t(pg_t(3, 2), shard_id_t(12))));
1076 o.push_back(new coll_t(o.back()->get_temp()));
1077 o.push_back(new coll_t());
1078 }
1079
1080 // ---
1081
1082 std::string pg_vector_string(const vector<int32_t> &a)
1083 {
1084 ostringstream oss;
1085 oss << "[";
1086 for (auto i = a.cbegin(); i != a.cend(); ++i) {
1087 if (i != a.begin())
1088 oss << ",";
1089 if (*i != CRUSH_ITEM_NONE)
1090 oss << *i;
1091 else
1092 oss << "NONE";
1093 }
1094 oss << "]";
1095 return oss.str();
1096 }
1097
1098 std::string pg_state_string(uint64_t state)
1099 {
1100 ostringstream oss;
1101 if (state & PG_STATE_STALE)
1102 oss << "stale+";
1103 if (state & PG_STATE_CREATING)
1104 oss << "creating+";
1105 if (state & PG_STATE_ACTIVE)
1106 oss << "active+";
1107 if (state & PG_STATE_ACTIVATING)
1108 oss << "activating+";
1109 if (state & PG_STATE_CLEAN)
1110 oss << "clean+";
1111 if (state & PG_STATE_RECOVERY_WAIT)
1112 oss << "recovery_wait+";
1113 if (state & PG_STATE_RECOVERY_TOOFULL)
1114 oss << "recovery_toofull+";
1115 if (state & PG_STATE_RECOVERING)
1116 oss << "recovering+";
1117 if (state & PG_STATE_FORCED_RECOVERY)
1118 oss << "forced_recovery+";
1119 if (state & PG_STATE_DOWN)
1120 oss << "down+";
1121 if (state & PG_STATE_RECOVERY_UNFOUND)
1122 oss << "recovery_unfound+";
1123 if (state & PG_STATE_BACKFILL_UNFOUND)
1124 oss << "backfill_unfound+";
1125 if (state & PG_STATE_UNDERSIZED)
1126 oss << "undersized+";
1127 if (state & PG_STATE_DEGRADED)
1128 oss << "degraded+";
1129 if (state & PG_STATE_REMAPPED)
1130 oss << "remapped+";
1131 if (state & PG_STATE_PREMERGE)
1132 oss << "premerge+";
1133 if (state & PG_STATE_SCRUBBING)
1134 oss << "scrubbing+";
1135 if (state & PG_STATE_DEEP_SCRUB)
1136 oss << "deep+";
1137 if (state & PG_STATE_INCONSISTENT)
1138 oss << "inconsistent+";
1139 if (state & PG_STATE_PEERING)
1140 oss << "peering+";
1141 if (state & PG_STATE_REPAIR)
1142 oss << "repair+";
1143 if (state & PG_STATE_BACKFILL_WAIT)
1144 oss << "backfill_wait+";
1145 if (state & PG_STATE_BACKFILLING)
1146 oss << "backfilling+";
1147 if (state & PG_STATE_FORCED_BACKFILL)
1148 oss << "forced_backfill+";
1149 if (state & PG_STATE_BACKFILL_TOOFULL)
1150 oss << "backfill_toofull+";
1151 if (state & PG_STATE_INCOMPLETE)
1152 oss << "incomplete+";
1153 if (state & PG_STATE_PEERED)
1154 oss << "peered+";
1155 if (state & PG_STATE_SNAPTRIM)
1156 oss << "snaptrim+";
1157 if (state & PG_STATE_SNAPTRIM_WAIT)
1158 oss << "snaptrim_wait+";
1159 if (state & PG_STATE_SNAPTRIM_ERROR)
1160 oss << "snaptrim_error+";
1161 if (state & PG_STATE_FAILED_REPAIR)
1162 oss << "failed_repair+";
1163 if (state & PG_STATE_LAGGY)
1164 oss << "laggy+";
1165 if (state & PG_STATE_WAIT)
1166 oss << "wait+";
1167 string ret(oss.str());
1168 if (ret.length() > 0)
1169 ret.resize(ret.length() - 1);
1170 else
1171 ret = "unknown";
1172 return ret;
1173 }
1174
1175 std::optional<uint64_t> pg_string_state(const std::string& state)
1176 {
1177 std::optional<uint64_t> type;
1178 if (state == "active")
1179 type = PG_STATE_ACTIVE;
1180 else if (state == "clean")
1181 type = PG_STATE_CLEAN;
1182 else if (state == "down")
1183 type = PG_STATE_DOWN;
1184 else if (state == "recovery_unfound")
1185 type = PG_STATE_RECOVERY_UNFOUND;
1186 else if (state == "backfill_unfound")
1187 type = PG_STATE_BACKFILL_UNFOUND;
1188 else if (state == "premerge")
1189 type = PG_STATE_PREMERGE;
1190 else if (state == "scrubbing")
1191 type = PG_STATE_SCRUBBING;
1192 else if (state == "degraded")
1193 type = PG_STATE_DEGRADED;
1194 else if (state == "inconsistent")
1195 type = PG_STATE_INCONSISTENT;
1196 else if (state == "peering")
1197 type = PG_STATE_PEERING;
1198 else if (state == "repair")
1199 type = PG_STATE_REPAIR;
1200 else if (state == "recovering")
1201 type = PG_STATE_RECOVERING;
1202 else if (state == "forced_recovery")
1203 type = PG_STATE_FORCED_RECOVERY;
1204 else if (state == "backfill_wait")
1205 type = PG_STATE_BACKFILL_WAIT;
1206 else if (state == "incomplete")
1207 type = PG_STATE_INCOMPLETE;
1208 else if (state == "stale")
1209 type = PG_STATE_STALE;
1210 else if (state == "remapped")
1211 type = PG_STATE_REMAPPED;
1212 else if (state == "deep")
1213 type = PG_STATE_DEEP_SCRUB;
1214 else if (state == "backfilling")
1215 type = PG_STATE_BACKFILLING;
1216 else if (state == "forced_backfill")
1217 type = PG_STATE_FORCED_BACKFILL;
1218 else if (state == "backfill_toofull")
1219 type = PG_STATE_BACKFILL_TOOFULL;
1220 else if (state == "recovery_wait")
1221 type = PG_STATE_RECOVERY_WAIT;
1222 else if (state == "recovery_toofull")
1223 type = PG_STATE_RECOVERY_TOOFULL;
1224 else if (state == "undersized")
1225 type = PG_STATE_UNDERSIZED;
1226 else if (state == "activating")
1227 type = PG_STATE_ACTIVATING;
1228 else if (state == "peered")
1229 type = PG_STATE_PEERED;
1230 else if (state == "snaptrim")
1231 type = PG_STATE_SNAPTRIM;
1232 else if (state == "snaptrim_wait")
1233 type = PG_STATE_SNAPTRIM_WAIT;
1234 else if (state == "snaptrim_error")
1235 type = PG_STATE_SNAPTRIM_ERROR;
1236 else if (state == "creating")
1237 type = PG_STATE_CREATING;
1238 else if (state == "failed_repair")
1239 type = PG_STATE_FAILED_REPAIR;
1240 else if (state == "laggy")
1241 type = PG_STATE_LAGGY;
1242 else if (state == "wait")
1243 type = PG_STATE_WAIT;
1244 else if (state == "unknown")
1245 type = 0;
1246 else
1247 type = std::nullopt;
1248 return type;
1249 }
1250
1251 // -- eversion_t --
1252 string eversion_t::get_key_name() const
1253 {
1254 std::string key(32, ' ');
1255 get_key_name(&key[0]);
1256 key.resize(31); // remove the null terminator
1257 return key;
1258 }
1259
1260 // -- pool_snap_info_t --
1261 void pool_snap_info_t::dump(Formatter *f) const
1262 {
1263 f->dump_unsigned("snapid", snapid);
1264 f->dump_stream("stamp") << stamp;
1265 f->dump_string("name", name);
1266 }
1267
1268 void pool_snap_info_t::encode(ceph::buffer::list& bl, uint64_t features) const
1269 {
1270 using ceph::encode;
1271 if ((features & CEPH_FEATURE_PGPOOL3) == 0) {
1272 __u8 struct_v = 1;
1273 encode(struct_v, bl);
1274 encode(snapid, bl);
1275 encode(stamp, bl);
1276 encode(name, bl);
1277 return;
1278 }
1279 ENCODE_START(2, 2, bl);
1280 encode(snapid, bl);
1281 encode(stamp, bl);
1282 encode(name, bl);
1283 ENCODE_FINISH(bl);
1284 }
1285
// Deserialize a pool_snap_info_t from bl.  The fields are read in the
// order snapid, stamp, name; version handling is delegated to the
// DECODE_START_LEGACY_COMPAT_LEN / DECODE_FINISH macros.
void pool_snap_info_t::decode(ceph::buffer::list::const_iterator& bl)
{
  DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
  decode(snapid, bl);
  decode(stamp, bl);
  decode(name, bl);
  DECODE_FINISH(bl);
}
1294
1295 void pool_snap_info_t::generate_test_instances(list<pool_snap_info_t*>& o)
1296 {
1297 o.push_back(new pool_snap_info_t);
1298 o.push_back(new pool_snap_info_t);
1299 o.back()->snapid = 1;
1300 o.back()->stamp = utime_t(1, 2);
1301 o.back()->name = "foo";
1302 }
1303
1304 // -- pool_opts_t --
1305
1306 typedef std::map<std::string, pool_opts_t::opt_desc_t> opt_mapping_t;
1307 static opt_mapping_t opt_mapping = boost::assign::map_list_of
1308 ("scrub_min_interval", pool_opts_t::opt_desc_t(
1309 pool_opts_t::SCRUB_MIN_INTERVAL, pool_opts_t::DOUBLE))
1310 ("scrub_max_interval", pool_opts_t::opt_desc_t(
1311 pool_opts_t::SCRUB_MAX_INTERVAL, pool_opts_t::DOUBLE))
1312 ("deep_scrub_interval", pool_opts_t::opt_desc_t(
1313 pool_opts_t::DEEP_SCRUB_INTERVAL, pool_opts_t::DOUBLE))
1314 ("recovery_priority", pool_opts_t::opt_desc_t(
1315 pool_opts_t::RECOVERY_PRIORITY, pool_opts_t::INT))
1316 ("recovery_op_priority", pool_opts_t::opt_desc_t(
1317 pool_opts_t::RECOVERY_OP_PRIORITY, pool_opts_t::INT))
1318 ("scrub_priority", pool_opts_t::opt_desc_t(
1319 pool_opts_t::SCRUB_PRIORITY, pool_opts_t::INT))
1320 ("compression_mode", pool_opts_t::opt_desc_t(
1321 pool_opts_t::COMPRESSION_MODE, pool_opts_t::STR))
1322 ("compression_algorithm", pool_opts_t::opt_desc_t(
1323 pool_opts_t::COMPRESSION_ALGORITHM, pool_opts_t::STR))
1324 ("compression_required_ratio", pool_opts_t::opt_desc_t(
1325 pool_opts_t::COMPRESSION_REQUIRED_RATIO, pool_opts_t::DOUBLE))
1326 ("compression_max_blob_size", pool_opts_t::opt_desc_t(
1327 pool_opts_t::COMPRESSION_MAX_BLOB_SIZE, pool_opts_t::INT))
1328 ("compression_min_blob_size", pool_opts_t::opt_desc_t(
1329 pool_opts_t::COMPRESSION_MIN_BLOB_SIZE, pool_opts_t::INT))
1330 ("csum_type", pool_opts_t::opt_desc_t(
1331 pool_opts_t::CSUM_TYPE, pool_opts_t::INT))
1332 ("csum_max_block", pool_opts_t::opt_desc_t(
1333 pool_opts_t::CSUM_MAX_BLOCK, pool_opts_t::INT))
1334 ("csum_min_block", pool_opts_t::opt_desc_t(
1335 pool_opts_t::CSUM_MIN_BLOCK, pool_opts_t::INT))
1336 ("fingerprint_algorithm", pool_opts_t::opt_desc_t(
1337 pool_opts_t::FINGERPRINT_ALGORITHM, pool_opts_t::STR))
1338 ("pg_num_min", pool_opts_t::opt_desc_t(
1339 pool_opts_t::PG_NUM_MIN, pool_opts_t::INT))
1340 ("target_size_bytes", pool_opts_t::opt_desc_t(
1341 pool_opts_t::TARGET_SIZE_BYTES, pool_opts_t::INT))
1342 ("target_size_ratio", pool_opts_t::opt_desc_t(
1343 pool_opts_t::TARGET_SIZE_RATIO, pool_opts_t::DOUBLE))
1344 ("pg_autoscale_bias", pool_opts_t::opt_desc_t(
1345 pool_opts_t::PG_AUTOSCALE_BIAS, pool_opts_t::DOUBLE))
1346 ("read_lease_interval", pool_opts_t::opt_desc_t(
1347 pool_opts_t::READ_LEASE_INTERVAL, pool_opts_t::DOUBLE));
1348
1349 bool pool_opts_t::is_opt_name(const std::string& name)
1350 {
1351 return opt_mapping.count(name);
1352 }
1353
1354 pool_opts_t::opt_desc_t pool_opts_t::get_opt_desc(const std::string& name)
1355 {
1356 auto i = opt_mapping.find(name);
1357 ceph_assert(i != opt_mapping.end());
1358 return i->second;
1359 }
1360
1361 bool pool_opts_t::is_set(pool_opts_t::key_t key) const
1362 {
1363 return opts.count(key);
1364 }
1365
1366 const pool_opts_t::value_t& pool_opts_t::get(pool_opts_t::key_t key) const
1367 {
1368 auto i = opts.find(key);
1369 ceph_assert(i != opts.end());
1370 return i->second;
1371 }
1372
1373 bool pool_opts_t::unset(pool_opts_t::key_t key) {
1374 return opts.erase(key) > 0;
1375 }
1376
1377 class pool_opts_dumper_t : public boost::static_visitor<> {
1378 public:
1379 pool_opts_dumper_t(const std::string& name_, Formatter* f_) :
1380 name(name_.c_str()), f(f_) {}
1381
1382 void operator()(std::string s) const {
1383 f->dump_string(name, s);
1384 }
1385 void operator()(int64_t i) const {
1386 f->dump_int(name, i);
1387 }
1388 void operator()(double d) const {
1389 f->dump_float(name, d);
1390 }
1391
1392 private:
1393 const char* name;
1394 Formatter* f;
1395 };
1396
1397 void pool_opts_t::dump(const std::string& name, Formatter* f) const
1398 {
1399 const opt_desc_t& desc = get_opt_desc(name);
1400 auto i = opts.find(desc.key);
1401 if (i == opts.end()) {
1402 return;
1403 }
1404 boost::apply_visitor(pool_opts_dumper_t(name, f), i->second);
1405 }
1406
1407 void pool_opts_t::dump(Formatter* f) const
1408 {
1409 for (auto i = opt_mapping.cbegin(); i != opt_mapping.cend(); ++i) {
1410 const std::string& name = i->first;
1411 const opt_desc_t& desc = i->second;
1412 auto j = opts.find(desc.key);
1413 if (j == opts.end()) {
1414 continue;
1415 }
1416 boost::apply_visitor(pool_opts_dumper_t(name, f), j->second);
1417 }
1418 }
1419
1420 class pool_opts_encoder_t : public boost::static_visitor<> {
1421 public:
1422 explicit pool_opts_encoder_t(ceph::buffer::list& bl_, uint64_t features)
1423 : bl(bl_),
1424 features(features) {}
1425
1426 void operator()(const std::string &s) const {
1427 encode(static_cast<int32_t>(pool_opts_t::STR), bl);
1428 encode(s, bl);
1429 }
1430 void operator()(int64_t i) const {
1431 encode(static_cast<int32_t>(pool_opts_t::INT), bl);
1432 if (HAVE_FEATURE(features, SERVER_NAUTILUS)) {
1433 encode(i, bl);
1434 } else {
1435 encode(static_cast<int32_t>(i), bl);
1436 }
1437 }
1438 void operator()(double d) const {
1439 encode(static_cast<int32_t>(pool_opts_t::DOUBLE), bl);
1440 encode(d, bl);
1441 }
1442
1443 private:
1444 ceph::buffer::list& bl;
1445 uint64_t features;
1446 };
1447
1448 void pool_opts_t::encode(ceph::buffer::list& bl, uint64_t features) const
1449 {
1450 unsigned v = 2;
1451 if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
1452 v = 1;
1453 }
1454 ENCODE_START(v, 1, bl);
1455 uint32_t n = static_cast<uint32_t>(opts.size());
1456 encode(n, bl);
1457 for (auto i = opts.cbegin(); i != opts.cend(); ++i) {
1458 encode(static_cast<int32_t>(i->first), bl);
1459 boost::apply_visitor(pool_opts_encoder_t(bl, features), i->second);
1460 }
1461 ENCODE_FINISH(bl);
1462 }
1463
1464 void pool_opts_t::decode(ceph::buffer::list::const_iterator& bl)
1465 {
1466 DECODE_START(1, bl);
1467 __u32 n;
1468 decode(n, bl);
1469 opts.clear();
1470 while (n--) {
1471 int32_t k, t;
1472 decode(k, bl);
1473 decode(t, bl);
1474 if (t == STR) {
1475 std::string s;
1476 decode(s, bl);
1477 opts[static_cast<key_t>(k)] = s;
1478 } else if (t == INT) {
1479 int64_t i;
1480 if (struct_v >= 2) {
1481 decode(i, bl);
1482 } else {
1483 int ii;
1484 decode(ii, bl);
1485 i = ii;
1486 }
1487 opts[static_cast<key_t>(k)] = i;
1488 } else if (t == DOUBLE) {
1489 double d;
1490 decode(d, bl);
1491 opts[static_cast<key_t>(k)] = d;
1492 } else {
1493 ceph_assert(!"invalid type");
1494 }
1495 }
1496 DECODE_FINISH(bl);
1497 }
1498
1499 ostream& operator<<(ostream& out, const pool_opts_t& opts)
1500 {
1501 for (auto i = opt_mapping.begin(); i != opt_mapping.end(); ++i) {
1502 const std::string& name = i->first;
1503 const pool_opts_t::opt_desc_t& desc = i->second;
1504 auto j = opts.opts.find(desc.key);
1505 if (j == opts.opts.end()) {
1506 continue;
1507 }
1508 out << " " << name << " " << j->second;
1509 }
1510 return out;
1511 }
1512
1513 // -- pg_pool_t --
1514
1515 const char *pg_pool_t::APPLICATION_NAME_CEPHFS("cephfs");
1516 const char *pg_pool_t::APPLICATION_NAME_RBD("rbd");
1517 const char *pg_pool_t::APPLICATION_NAME_RGW("rgw");
1518
1519 void pg_pool_t::dump(Formatter *f) const
1520 {
1521 f->dump_stream("create_time") << get_create_time();
1522 f->dump_unsigned("flags", get_flags());
1523 f->dump_string("flags_names", get_flags_string());
1524 f->dump_int("type", get_type());
1525 f->dump_int("size", get_size());
1526 f->dump_int("min_size", get_min_size());
1527 f->dump_int("crush_rule", get_crush_rule());
1528 f->dump_int("object_hash", get_object_hash());
1529 f->dump_string("pg_autoscale_mode",
1530 get_pg_autoscale_mode_name(pg_autoscale_mode));
1531 f->dump_unsigned("pg_num", get_pg_num());
1532 f->dump_unsigned("pg_placement_num", get_pgp_num());
1533 f->dump_unsigned("pg_placement_num_target", get_pgp_num_target());
1534 f->dump_unsigned("pg_num_target", get_pg_num_target());
1535 f->dump_unsigned("pg_num_pending", get_pg_num_pending());
1536 f->dump_object("last_pg_merge_meta", last_pg_merge_meta);
1537 f->dump_stream("last_change") << get_last_change();
1538 f->dump_stream("last_force_op_resend") << get_last_force_op_resend();
1539 f->dump_stream("last_force_op_resend_prenautilus")
1540 << get_last_force_op_resend_prenautilus();
1541 f->dump_stream("last_force_op_resend_preluminous")
1542 << get_last_force_op_resend_preluminous();
1543 f->dump_unsigned("auid", get_auid());
1544 f->dump_string("snap_mode", is_pool_snaps_mode() ? "pool" : "selfmanaged");
1545 f->dump_unsigned("snap_seq", get_snap_seq());
1546 f->dump_unsigned("snap_epoch", get_snap_epoch());
1547 f->open_array_section("pool_snaps");
1548 for (auto p = snaps.cbegin(); p != snaps.cend(); ++p) {
1549 f->open_object_section("pool_snap_info");
1550 p->second.dump(f);
1551 f->close_section();
1552 }
1553 f->close_section();
1554 f->dump_stream("removed_snaps") << removed_snaps;
1555 f->dump_unsigned("quota_max_bytes", quota_max_bytes);
1556 f->dump_unsigned("quota_max_objects", quota_max_objects);
1557 f->open_array_section("tiers");
1558 for (auto p = tiers.cbegin(); p != tiers.cend(); ++p)
1559 f->dump_unsigned("pool_id", *p);
1560 f->close_section();
1561 f->dump_int("tier_of", tier_of);
1562 f->dump_int("read_tier", read_tier);
1563 f->dump_int("write_tier", write_tier);
1564 f->dump_string("cache_mode", get_cache_mode_name());
1565 f->dump_unsigned("target_max_bytes", target_max_bytes);
1566 f->dump_unsigned("target_max_objects", target_max_objects);
1567 f->dump_unsigned("cache_target_dirty_ratio_micro",
1568 cache_target_dirty_ratio_micro);
1569 f->dump_unsigned("cache_target_dirty_high_ratio_micro",
1570 cache_target_dirty_high_ratio_micro);
1571 f->dump_unsigned("cache_target_full_ratio_micro",
1572 cache_target_full_ratio_micro);
1573 f->dump_unsigned("cache_min_flush_age", cache_min_flush_age);
1574 f->dump_unsigned("cache_min_evict_age", cache_min_evict_age);
1575 f->dump_string("erasure_code_profile", erasure_code_profile);
1576 f->open_object_section("hit_set_params");
1577 hit_set_params.dump(f);
1578 f->close_section(); // hit_set_params
1579 f->dump_unsigned("hit_set_period", hit_set_period);
1580 f->dump_unsigned("hit_set_count", hit_set_count);
1581 f->dump_bool("use_gmt_hitset", use_gmt_hitset);
1582 f->dump_unsigned("min_read_recency_for_promote", min_read_recency_for_promote);
1583 f->dump_unsigned("min_write_recency_for_promote", min_write_recency_for_promote);
1584 f->dump_unsigned("hit_set_grade_decay_rate", hit_set_grade_decay_rate);
1585 f->dump_unsigned("hit_set_search_last_n", hit_set_search_last_n);
1586 f->open_array_section("grade_table");
1587 for (unsigned i = 0; i < hit_set_count; ++i)
1588 f->dump_unsigned("value", get_grade(i));
1589 f->close_section();
1590 f->dump_unsigned("stripe_width", get_stripe_width());
1591 f->dump_unsigned("expected_num_objects", expected_num_objects);
1592 f->dump_bool("fast_read", fast_read);
1593 f->open_object_section("options");
1594 opts.dump(f);
1595 f->close_section(); // options
1596 f->open_object_section("application_metadata");
1597 for (auto &app_pair : application_metadata) {
1598 f->open_object_section(app_pair.first.c_str());
1599 for (auto &kv_pair : app_pair.second) {
1600 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
1601 }
1602 f->close_section(); // application
1603 }
1604 f->close_section(); // application_metadata
1605 }
1606
1607 void pg_pool_t::convert_to_pg_shards(const vector<int> &from, set<pg_shard_t>* to) const {
1608 for (size_t i = 0; i < from.size(); ++i) {
1609 if (from[i] != CRUSH_ITEM_NONE) {
1610 to->insert(
1611 pg_shard_t(
1612 from[i],
1613 is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
1614 }
1615 }
1616 }
1617
1618 void pg_pool_t::calc_pg_masks()
1619 {
1620 pg_num_mask = (1 << cbits(pg_num-1)) - 1;
1621 pgp_num_mask = (1 << cbits(pgp_num-1)) - 1;
1622 }
1623
1624 unsigned pg_pool_t::get_pg_num_divisor(pg_t pgid) const
1625 {
1626 if (pg_num == pg_num_mask + 1)
1627 return pg_num; // power-of-2 split
1628 unsigned mask = pg_num_mask >> 1;
1629 if ((pgid.ps() & mask) < (pg_num & mask))
1630 return pg_num_mask + 1; // smaller bin size (already split)
1631 else
1632 return (pg_num_mask + 1) >> 1; // bigger bin (not yet split)
1633 }
1634
1635 bool pg_pool_t::is_pending_merge(pg_t pgid, bool *target) const
1636 {
1637 if (pg_num_pending >= pg_num) {
1638 return false;
1639 }
1640 if (pgid.ps() >= pg_num_pending && pgid.ps() < pg_num) {
1641 if (target) {
1642 *target = false;
1643 }
1644 return true;
1645 }
1646 for (unsigned ps = pg_num_pending; ps < pg_num; ++ps) {
1647 if (pg_t(ps, pgid.pool()).get_parent() == pgid) {
1648 if (target) {
1649 *target = true;
1650 }
1651 return true;
1652 }
1653 }
1654 return false;
1655 }
1656
1657 /*
1658 * we have two snap modes:
1659 * - pool snaps
1660 * - snap existence/non-existence defined by snaps[] and snap_seq
1661 * - user managed snaps
1662 * - existence tracked by librados user
1663 */
1664 bool pg_pool_t::is_pool_snaps_mode() const
1665 {
1666 return has_flag(FLAG_POOL_SNAPS);
1667 }
1668
1669 bool pg_pool_t::is_unmanaged_snaps_mode() const
1670 {
1671 return has_flag(FLAG_SELFMANAGED_SNAPS);
1672 }
1673
1674 bool pg_pool_t::is_removed_snap(snapid_t s) const
1675 {
1676 if (is_pool_snaps_mode())
1677 return s <= get_snap_seq() && snaps.count(s) == 0;
1678 else
1679 return removed_snaps.contains(s);
1680 }
1681
1682 snapid_t pg_pool_t::snap_exists(const char *s) const
1683 {
1684 for (auto p = snaps.cbegin(); p != snaps.cend(); ++p)
1685 if (p->second.name == s)
1686 return p->second.snapid;
1687 return 0;
1688 }
1689
1690 void pg_pool_t::add_snap(const char *n, utime_t stamp)
1691 {
1692 ceph_assert(!is_unmanaged_snaps_mode());
1693 flags |= FLAG_POOL_SNAPS;
1694 snapid_t s = get_snap_seq() + 1;
1695 snap_seq = s;
1696 snaps[s].snapid = s;
1697 snaps[s].name = n;
1698 snaps[s].stamp = stamp;
1699 }
1700
1701 uint64_t pg_pool_t::add_unmanaged_snap(bool preoctopus_compat)
1702 {
1703 ceph_assert(!is_pool_snaps_mode());
1704 if (snap_seq == 0) {
1705 if (preoctopus_compat) {
1706 // kludge for pre-mimic tracking of pool vs selfmanaged snaps. after
1707 // mimic this field is not decoded but our flag is set; pre-mimic, we
1708 // have a non-empty removed_snaps to signifiy a non-pool-snaps pool.
1709 removed_snaps.insert(snapid_t(1));
1710 }
1711 snap_seq = 1;
1712 }
1713 flags |= FLAG_SELFMANAGED_SNAPS;
1714 snap_seq = snap_seq + 1;
1715 return snap_seq;
1716 }
1717
1718 void pg_pool_t::remove_snap(snapid_t s)
1719 {
1720 ceph_assert(snaps.count(s));
1721 snaps.erase(s);
1722 snap_seq = snap_seq + 1;
1723 }
1724
1725 void pg_pool_t::remove_unmanaged_snap(snapid_t s, bool preoctopus_compat)
1726 {
1727 ceph_assert(is_unmanaged_snaps_mode());
1728 ++snap_seq;
1729 if (preoctopus_compat) {
1730 removed_snaps.insert(s);
1731 // try to add in the new seq, just to try to keep the interval_set contiguous
1732 if (!removed_snaps.contains(get_snap_seq())) {
1733 removed_snaps.insert(get_snap_seq());
1734 }
1735 }
1736 }
1737
1738 SnapContext pg_pool_t::get_snap_context() const
1739 {
1740 vector<snapid_t> s(snaps.size());
1741 unsigned i = 0;
1742 for (auto p = snaps.crbegin(); p != snaps.crend(); ++p)
1743 s[i++] = p->first;
1744 return SnapContext(get_snap_seq(), s);
1745 }
1746
1747 uint32_t pg_pool_t::hash_key(const string& key, const string& ns) const
1748 {
1749 if (ns.empty())
1750 return ceph_str_hash(object_hash, key.data(), key.length());
1751 int nsl = ns.length();
1752 int len = key.length() + nsl + 1;
1753 char buf[len];
1754 memcpy(&buf[0], ns.data(), nsl);
1755 buf[nsl] = '\037';
1756 memcpy(&buf[nsl+1], key.data(), key.length());
1757 return ceph_str_hash(object_hash, &buf[0], len);
1758 }
1759
1760 uint32_t pg_pool_t::raw_hash_to_pg(uint32_t v) const
1761 {
1762 return ceph_stable_mod(v, pg_num, pg_num_mask);
1763 }
1764
1765 /*
1766 * map a raw pg (with full precision ps) into an actual pg, for storage
1767 */
1768 pg_t pg_pool_t::raw_pg_to_pg(pg_t pg) const
1769 {
1770 pg.set_ps(ceph_stable_mod(pg.ps(), pg_num, pg_num_mask));
1771 return pg;
1772 }
1773
1774 /*
1775 * map raw pg (full precision ps) into a placement seed. include
1776 * pool id in that value so that different pools don't use the same
1777 * seeds.
1778 */
1779 ps_t pg_pool_t::raw_pg_to_pps(pg_t pg) const
1780 {
1781 if (flags & FLAG_HASHPSPOOL) {
1782 // Hash the pool id so that pool PGs do not overlap.
1783 return
1784 crush_hash32_2(CRUSH_HASH_RJENKINS1,
1785 ceph_stable_mod(pg.ps(), pgp_num, pgp_num_mask),
1786 pg.pool());
1787 } else {
1788 // Legacy behavior; add ps and pool together. This is not a great
1789 // idea because the PGs from each pool will essentially overlap on
1790 // top of each other: 0.5 == 1.4 == 2.3 == ...
1791 return
1792 ceph_stable_mod(pg.ps(), pgp_num, pgp_num_mask) +
1793 pg.pool();
1794 }
1795 }
1796
1797 uint32_t pg_pool_t::get_random_pg_position(pg_t pg, uint32_t seed) const
1798 {
1799 uint32_t r = crush_hash32_2(CRUSH_HASH_RJENKINS1, seed, 123);
1800 if (pg_num == pg_num_mask + 1) {
1801 r &= ~pg_num_mask;
1802 } else {
1803 unsigned smaller_mask = pg_num_mask >> 1;
1804 if ((pg.ps() & smaller_mask) < (pg_num & smaller_mask)) {
1805 r &= ~pg_num_mask;
1806 } else {
1807 r &= ~smaller_mask;
1808 }
1809 }
1810 r |= pg.ps();
1811 return r;
1812 }
1813
// Serialize this pool into bl in a layout chosen from the peer's feature
// bits.  Four layouts are produced, checked in this order:
//   1. no CEPH_FEATURE_PGPOOL3      -> unframed struct_v 2
//   2. no CEPH_FEATURE_OSDENC       -> unframed struct_v 4
//   3. no CEPH_FEATURE_OSD_POOLRESEND -> framed version 14
//   4. otherwise                    -> framed version 21, 24, 26, 27 or 29
//      depending on further feature bits; later fields are appended only
//      when the chosen version is high enough.
// Field order within each layout is the wire format and must not change.
void pg_pool_t::encode(ceph::buffer::list& bl, uint64_t features) const
{
  using ceph::encode;
  if ((features & CEPH_FEATURE_PGPOOL3) == 0) {
    // this encoding matches the old struct ceph_pg_pool
    __u8 struct_v = 2;
    encode(struct_v, bl);
    encode(type, bl);
    encode(size, bl);
    encode(crush_rule, bl);
    encode(object_hash, bl);
    encode(pg_num, bl);
    encode(pgp_num, bl);
    __u32 lpg_num = 0, lpgp_num = 0;  // tell old code that there are no localized pgs.
    encode(lpg_num, bl);
    encode(lpgp_num, bl);
    encode(last_change, bl);
    encode(snap_seq, bl);
    encode(snap_epoch, bl);

    // element counts are written up front here; the containers themselves
    // follow below without their own length headers (encode_nohead)
    __u32 n = snaps.size();
    encode(n, bl);
    n = removed_snaps.num_intervals();
    encode(n, bl);

    encode(auid, bl);

    encode_nohead(snaps, bl, features);
    encode_nohead(removed_snaps, bl);
    return;
  }

  if ((features & CEPH_FEATURE_OSDENC) == 0) {
    // unframed version 4: same leading fields, then snaps/removed_snaps
    // with their headers, auid, flags and a zero crash_replay_interval
    __u8 struct_v = 4;
    encode(struct_v, bl);
    encode(type, bl);
    encode(size, bl);
    encode(crush_rule, bl);
    encode(object_hash, bl);
    encode(pg_num, bl);
    encode(pgp_num, bl);
    __u32 lpg_num = 0, lpgp_num = 0;  // tell old code that there are no localized pgs.
    encode(lpg_num, bl);
    encode(lpgp_num, bl);
    encode(last_change, bl);
    encode(snap_seq, bl);
    encode(snap_epoch, bl);
    encode(snaps, bl, features);
    encode(removed_snaps, bl);
    encode(auid, bl);
    encode(flags, bl);
    encode((uint32_t)0, bl); // crash_replay_interval
    return;
  }

  if ((features & CEPH_FEATURE_OSD_POOLRESEND) == 0) {
    // we simply added last_force_op_resend here, which is a fully
    // backward compatible change.  however, encoding the same map
    // differently between monitors triggers scrub noise (even though
    // they are decodable without the feature), so let's be pedantic
    // about it.
    ENCODE_START(14, 5, bl);
    encode(type, bl);
    encode(size, bl);
    encode(crush_rule, bl);
    encode(object_hash, bl);
    encode(pg_num, bl);
    encode(pgp_num, bl);
    __u32 lpg_num = 0, lpgp_num = 0;  // tell old code that there are no localized pgs.
    encode(lpg_num, bl);
    encode(lpgp_num, bl);
    encode(last_change, bl);
    encode(snap_seq, bl);
    encode(snap_epoch, bl);
    encode(snaps, bl, features);
    encode(removed_snaps, bl);
    encode(auid, bl);
    encode(flags, bl);
    encode((uint32_t)0, bl); // crash_replay_interval
    encode(min_size, bl);
    encode(quota_max_bytes, bl);
    encode(quota_max_objects, bl);
    encode(tiers, bl);
    encode(tier_of, bl);
    __u8 c = cache_mode;
    encode(c, bl);
    encode(read_tier, bl);
    encode(write_tier, bl);
    encode(properties, bl);
    encode(hit_set_params, bl);
    encode(hit_set_period, bl);
    encode(hit_set_count, bl);
    encode(stripe_width, bl);
    encode(target_max_bytes, bl);
    encode(target_max_objects, bl);
    encode(cache_target_dirty_ratio_micro, bl);
    encode(cache_target_full_ratio_micro, bl);
    encode(cache_min_flush_age, bl);
    encode(cache_min_evict_age, bl);
    encode(erasure_code_profile, bl);
    ENCODE_FINISH(bl);
    return;
  }

  // Pick the newest version the peer's features allow (default 29).
  uint8_t v = 29;
  // NOTE: any new encoding dependencies must be reflected by
  // SIGNIFICANT_FEATURES
  if (!(features & CEPH_FEATURE_NEW_OSDOP_ENCODING)) {
    // this was the first post-hammer thing we added; if it's missing, encode
    // like hammer.
    v = 21;
  } else if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
    v = 24;
  } else if (!HAVE_FEATURE(features, SERVER_MIMIC)) {
    v = 26;
  } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
    v = 27;
  }

  ENCODE_START(v, 5, bl);
  encode(type, bl);
  encode(size, bl);
  encode(crush_rule, bl);
  encode(object_hash, bl);
  encode(pg_num, bl);
  encode(pgp_num, bl);
  __u32 lpg_num = 0, lpgp_num = 0;  // tell old code that there are no localized pgs.
  encode(lpg_num, bl);
  encode(lpgp_num, bl);
  encode(last_change, bl);
  encode(snap_seq, bl);
  encode(snap_epoch, bl);
  encode(snaps, bl, features);
  encode(removed_snaps, bl);
  encode(auid, bl);
  if (v >= 27) {
    encode(flags, bl);
  } else {
    // for versions below 27, the snap-mode and creating flags are masked
    // out of the encoded flags word
    auto tmp = flags;
    tmp &= ~(FLAG_SELFMANAGED_SNAPS | FLAG_POOL_SNAPS | FLAG_CREATING);
    encode(tmp, bl);
  }
  encode((uint32_t)0, bl); // crash_replay_interval
  encode(min_size, bl);
  encode(quota_max_bytes, bl);
  encode(quota_max_objects, bl);
  encode(tiers, bl);
  encode(tier_of, bl);
  __u8 c = cache_mode;
  encode(c, bl);
  encode(read_tier, bl);
  encode(write_tier, bl);
  encode(properties, bl);
  encode(hit_set_params, bl);
  encode(hit_set_period, bl);
  encode(hit_set_count, bl);
  encode(stripe_width, bl);
  encode(target_max_bytes, bl);
  encode(target_max_objects, bl);
  encode(cache_target_dirty_ratio_micro, bl);
  encode(cache_target_full_ratio_micro, bl);
  encode(cache_min_flush_age, bl);
  encode(cache_min_evict_age, bl);
  encode(erasure_code_profile, bl);
  encode(last_force_op_resend_preluminous, bl);
  encode(min_read_recency_for_promote, bl);
  encode(expected_num_objects, bl);
  // Everything below is appended only when the selected version includes it.
  if (v >= 19) {
    encode(cache_target_dirty_high_ratio_micro, bl);
  }
  if (v >= 20) {
    encode(min_write_recency_for_promote, bl);
  }
  if (v >= 21) {
    encode(use_gmt_hitset, bl);
  }
  if (v >= 22) {
    encode(fast_read, bl);
  }
  if (v >= 23) {
    encode(hit_set_grade_decay_rate, bl);
    encode(hit_set_search_last_n, bl);
  }
  if (v >= 24) {
    encode(opts, bl, features);
  }
  if (v >= 25) {
    encode(last_force_op_resend_prenautilus, bl);
  }
  if (v >= 26) {
    encode(application_metadata, bl);
  }
  if (v >= 27) {
    encode(create_time, bl);
  }
  if (v >= 28) {
    encode(pg_num_target, bl);
    encode(pgp_num_target, bl);
    encode(pg_num_pending, bl);
    encode((epoch_t)0, bl);  // pg_num_dec_last_epoch_started from 14.1.[01]
    encode((epoch_t)0, bl);  // pg_num_dec_last_epoch_clean from 14.1.[01]
    encode(last_force_op_resend, bl);
    encode(pg_autoscale_mode, bl);
  }
  if (v >= 29) {
    encode(last_pg_merge_meta, bl);
  }
  ENCODE_FINISH(bl);
}
2023
2024 void pg_pool_t::decode(ceph::buffer::list::const_iterator& bl)
2025 {
2026 DECODE_START_LEGACY_COMPAT_LEN(29, 5, 5, bl);
2027 decode(type, bl);
2028 decode(size, bl);
2029 decode(crush_rule, bl);
2030 decode(object_hash, bl);
2031 decode(pg_num, bl);
2032 decode(pgp_num, bl);
2033 {
2034 __u32 lpg_num, lpgp_num;
2035 decode(lpg_num, bl);
2036 decode(lpgp_num, bl);
2037 }
2038 decode(last_change, bl);
2039 decode(snap_seq, bl);
2040 decode(snap_epoch, bl);
2041
2042 if (struct_v >= 3) {
2043 decode(snaps, bl);
2044 decode(removed_snaps, bl);
2045 decode(auid, bl);
2046 } else {
2047 __u32 n, m;
2048 decode(n, bl);
2049 decode(m, bl);
2050 decode(auid, bl);
2051 decode_nohead(n, snaps, bl);
2052 decode_nohead(m, removed_snaps, bl);
2053 }
2054
2055 if (struct_v >= 4) {
2056 decode(flags, bl);
2057 uint32_t crash_replay_interval;
2058 decode(crash_replay_interval, bl);
2059 } else {
2060 flags = 0;
2061 }
2062 // upgrade path for selfmanaged vs pool snaps
2063 if (snap_seq > 0 && (flags & (FLAG_SELFMANAGED_SNAPS|FLAG_POOL_SNAPS)) == 0) {
2064 if (!removed_snaps.empty()) {
2065 flags |= FLAG_SELFMANAGED_SNAPS;
2066 } else {
2067 flags |= FLAG_POOL_SNAPS;
2068 }
2069 }
2070 if (struct_v >= 7) {
2071 decode(min_size, bl);
2072 } else {
2073 min_size = size - size/2;
2074 }
2075 if (struct_v >= 8) {
2076 decode(quota_max_bytes, bl);
2077 decode(quota_max_objects, bl);
2078 }
2079 if (struct_v >= 9) {
2080 decode(tiers, bl);
2081 decode(tier_of, bl);
2082 __u8 v;
2083 decode(v, bl);
2084 cache_mode = (cache_mode_t)v;
2085 decode(read_tier, bl);
2086 decode(write_tier, bl);
2087 }
2088 if (struct_v >= 10) {
2089 decode(properties, bl);
2090 }
2091 if (struct_v >= 11) {
2092 decode(hit_set_params, bl);
2093 decode(hit_set_period, bl);
2094 decode(hit_set_count, bl);
2095 } else {
2096 pg_pool_t def;
2097 hit_set_period = def.hit_set_period;
2098 hit_set_count = def.hit_set_count;
2099 }
2100 if (struct_v >= 12) {
2101 decode(stripe_width, bl);
2102 } else {
2103 set_stripe_width(0);
2104 }
2105 if (struct_v >= 13) {
2106 decode(target_max_bytes, bl);
2107 decode(target_max_objects, bl);
2108 decode(cache_target_dirty_ratio_micro, bl);
2109 decode(cache_target_full_ratio_micro, bl);
2110 decode(cache_min_flush_age, bl);
2111 decode(cache_min_evict_age, bl);
2112 } else {
2113 target_max_bytes = 0;
2114 target_max_objects = 0;
2115 cache_target_dirty_ratio_micro = 0;
2116 cache_target_full_ratio_micro = 0;
2117 cache_min_flush_age = 0;
2118 cache_min_evict_age = 0;
2119 }
2120 if (struct_v >= 14) {
2121 decode(erasure_code_profile, bl);
2122 }
2123 if (struct_v >= 15) {
2124 decode(last_force_op_resend_preluminous, bl);
2125 } else {
2126 last_force_op_resend_preluminous = 0;
2127 }
2128 if (struct_v >= 16) {
2129 decode(min_read_recency_for_promote, bl);
2130 } else {
2131 min_read_recency_for_promote = 1;
2132 }
2133 if (struct_v >= 17) {
2134 decode(expected_num_objects, bl);
2135 } else {
2136 expected_num_objects = 0;
2137 }
2138 if (struct_v >= 19) {
2139 decode(cache_target_dirty_high_ratio_micro, bl);
2140 } else {
2141 cache_target_dirty_high_ratio_micro = cache_target_dirty_ratio_micro;
2142 }
2143 if (struct_v >= 20) {
2144 decode(min_write_recency_for_promote, bl);
2145 } else {
2146 min_write_recency_for_promote = 1;
2147 }
2148 if (struct_v >= 21) {
2149 decode(use_gmt_hitset, bl);
2150 } else {
2151 use_gmt_hitset = false;
2152 }
2153 if (struct_v >= 22) {
2154 decode(fast_read, bl);
2155 } else {
2156 fast_read = false;
2157 }
2158 if (struct_v >= 23) {
2159 decode(hit_set_grade_decay_rate, bl);
2160 decode(hit_set_search_last_n, bl);
2161 } else {
2162 hit_set_grade_decay_rate = 0;
2163 hit_set_search_last_n = 1;
2164 }
2165 if (struct_v >= 24) {
2166 decode(opts, bl);
2167 }
2168 if (struct_v >= 25) {
2169 decode(last_force_op_resend_prenautilus, bl);
2170 } else {
2171 last_force_op_resend_prenautilus = last_force_op_resend_preluminous;
2172 }
2173 if (struct_v >= 26) {
2174 decode(application_metadata, bl);
2175 }
2176 if (struct_v >= 27) {
2177 decode(create_time, bl);
2178 }
2179 if (struct_v >= 28) {
2180 decode(pg_num_target, bl);
2181 decode(pgp_num_target, bl);
2182 decode(pg_num_pending, bl);
2183 epoch_t old_merge_last_epoch_clean, old_merge_last_epoch_started;
2184 decode(old_merge_last_epoch_started, bl);
2185 decode(old_merge_last_epoch_clean, bl);
2186 decode(last_force_op_resend, bl);
2187 decode(pg_autoscale_mode, bl);
2188 if (struct_v >= 29) {
2189 decode(last_pg_merge_meta, bl);
2190 } else {
2191 last_pg_merge_meta.last_epoch_clean = old_merge_last_epoch_clean;
2192 last_pg_merge_meta.last_epoch_started = old_merge_last_epoch_started;
2193 }
2194 } else {
2195 pg_num_target = pg_num;
2196 pgp_num_target = pgp_num;
2197 pg_num_pending = pg_num;
2198 last_force_op_resend = last_force_op_resend_prenautilus;
2199 pg_autoscale_mode = pg_autoscale_mode_t::WARN; // default to warn on upgrade
2200 }
2201 DECODE_FINISH(bl);
2202 calc_pg_masks();
2203 calc_grade_table();
2204 }
2205
2206 void pg_pool_t::generate_test_instances(list<pg_pool_t*>& o)
2207 {
2208 pg_pool_t a;
2209 o.push_back(new pg_pool_t(a));
2210
2211 a.create_time = utime_t(4,5);
2212 a.type = TYPE_REPLICATED;
2213 a.size = 2;
2214 a.crush_rule = 3;
2215 a.object_hash = 4;
2216 a.pg_num = 6;
2217 a.pgp_num = 4;
2218 a.pgp_num_target = 4;
2219 a.pg_num_target = 5;
2220 a.pg_num_pending = 5;
2221 a.last_pg_merge_meta.last_epoch_started = 2;
2222 a.last_pg_merge_meta.last_epoch_clean = 2;
2223 a.last_change = 9;
2224 a.last_force_op_resend = 123823;
2225 a.last_force_op_resend_preluminous = 123824;
2226 a.snap_seq = 10;
2227 a.snap_epoch = 11;
2228 a.flags = FLAG_POOL_SNAPS;
2229 a.auid = 12;
2230 a.quota_max_bytes = 473;
2231 a.quota_max_objects = 474;
2232 o.push_back(new pg_pool_t(a));
2233
2234 a.snaps[3].name = "asdf";
2235 a.snaps[3].snapid = 3;
2236 a.snaps[3].stamp = utime_t(123, 4);
2237 a.snaps[6].name = "qwer";
2238 a.snaps[6].snapid = 6;
2239 a.snaps[6].stamp = utime_t(23423, 4);
2240 o.push_back(new pg_pool_t(a));
2241
2242 a.flags = FLAG_SELFMANAGED_SNAPS;
2243 a.snaps.clear();
2244 a.removed_snaps.insert(2);
2245 a.quota_max_bytes = 2473;
2246 a.quota_max_objects = 4374;
2247 a.tiers.insert(0);
2248 a.tiers.insert(1);
2249 a.tier_of = 2;
2250 a.cache_mode = CACHEMODE_WRITEBACK;
2251 a.read_tier = 1;
2252 a.write_tier = 1;
2253 a.hit_set_params = HitSet::Params(new BloomHitSet::Params);
2254 a.hit_set_period = 3600;
2255 a.hit_set_count = 8;
2256 a.min_read_recency_for_promote = 1;
2257 a.min_write_recency_for_promote = 1;
2258 a.hit_set_grade_decay_rate = 50;
2259 a.hit_set_search_last_n = 1;
2260 a.calc_grade_table();
2261 a.set_stripe_width(12345);
2262 a.target_max_bytes = 1238132132;
2263 a.target_max_objects = 1232132;
2264 a.cache_target_dirty_ratio_micro = 187232;
2265 a.cache_target_dirty_high_ratio_micro = 309856;
2266 a.cache_target_full_ratio_micro = 987222;
2267 a.cache_min_flush_age = 231;
2268 a.cache_min_evict_age = 2321;
2269 a.erasure_code_profile = "profile in osdmap";
2270 a.expected_num_objects = 123456;
2271 a.fast_read = false;
2272 a.application_metadata = {{"rbd", {{"key", "value"}}}};
2273 o.push_back(new pg_pool_t(a));
2274 }
2275
2276 ostream& operator<<(ostream& out, const pg_pool_t& p)
2277 {
2278 out << p.get_type_name();
2279 if (p.get_type_name() == "erasure") {
2280 out << " profile " << p.erasure_code_profile;
2281 }
2282 out << " size " << p.get_size()
2283 << " min_size " << p.get_min_size()
2284 << " crush_rule " << p.get_crush_rule()
2285 << " object_hash " << p.get_object_hash_name()
2286 << " pg_num " << p.get_pg_num()
2287 << " pgp_num " << p.get_pgp_num();
2288 if (p.get_pg_num_target() != p.get_pg_num()) {
2289 out << " pg_num_target " << p.get_pg_num_target();
2290 }
2291 if (p.get_pgp_num_target() != p.get_pgp_num()) {
2292 out << " pgp_num_target " << p.get_pgp_num_target();
2293 }
2294 if (p.get_pg_num_pending() != p.get_pg_num()) {
2295 out << " pg_num_pending " << p.get_pg_num_pending();
2296 }
2297 if (p.pg_autoscale_mode != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
2298 out << " autoscale_mode " << p.get_pg_autoscale_mode_name(p.pg_autoscale_mode);
2299 }
2300 out << " last_change " << p.get_last_change();
2301 if (p.get_last_force_op_resend() ||
2302 p.get_last_force_op_resend_prenautilus() ||
2303 p.get_last_force_op_resend_preluminous())
2304 out << " lfor " << p.get_last_force_op_resend() << "/"
2305 << p.get_last_force_op_resend_prenautilus() << "/"
2306 << p.get_last_force_op_resend_preluminous();
2307 if (p.get_auid())
2308 out << " owner " << p.get_auid();
2309 if (p.flags)
2310 out << " flags " << p.get_flags_string();
2311 if (p.quota_max_bytes)
2312 out << " max_bytes " << p.quota_max_bytes;
2313 if (p.quota_max_objects)
2314 out << " max_objects " << p.quota_max_objects;
2315 if (!p.tiers.empty())
2316 out << " tiers " << p.tiers;
2317 if (p.is_tier())
2318 out << " tier_of " << p.tier_of;
2319 if (p.has_read_tier())
2320 out << " read_tier " << p.read_tier;
2321 if (p.has_write_tier())
2322 out << " write_tier " << p.write_tier;
2323 if (p.cache_mode)
2324 out << " cache_mode " << p.get_cache_mode_name();
2325 if (p.target_max_bytes)
2326 out << " target_bytes " << p.target_max_bytes;
2327 if (p.target_max_objects)
2328 out << " target_objects " << p.target_max_objects;
2329 if (p.hit_set_params.get_type() != HitSet::TYPE_NONE) {
2330 out << " hit_set " << p.hit_set_params
2331 << " " << p.hit_set_period << "s"
2332 << " x" << p.hit_set_count << " decay_rate "
2333 << p.hit_set_grade_decay_rate
2334 << " search_last_n " << p.hit_set_search_last_n;
2335 }
2336 if (p.min_read_recency_for_promote)
2337 out << " min_read_recency_for_promote " << p.min_read_recency_for_promote;
2338 if (p.min_write_recency_for_promote)
2339 out << " min_write_recency_for_promote " << p.min_write_recency_for_promote;
2340 out << " stripe_width " << p.get_stripe_width();
2341 if (p.expected_num_objects)
2342 out << " expected_num_objects " << p.expected_num_objects;
2343 if (p.fast_read)
2344 out << " fast_read " << p.fast_read;
2345 out << p.opts;
2346 if (!p.application_metadata.empty()) {
2347 out << " application ";
2348 for (auto it = p.application_metadata.begin();
2349 it != p.application_metadata.end(); ++it) {
2350 if (it != p.application_metadata.begin())
2351 out << ",";
2352 out << it->first;
2353 }
2354 }
2355 return out;
2356 }
2357
2358
2359 // -- object_stat_sum_t --
2360
2361 void object_stat_sum_t::dump(Formatter *f) const
2362 {
2363 f->dump_int("num_bytes", num_bytes);
2364 f->dump_int("num_objects", num_objects);
2365 f->dump_int("num_object_clones", num_object_clones);
2366 f->dump_int("num_object_copies", num_object_copies);
2367 f->dump_int("num_objects_missing_on_primary", num_objects_missing_on_primary);
2368 f->dump_int("num_objects_missing", num_objects_missing);
2369 f->dump_int("num_objects_degraded", num_objects_degraded);
2370 f->dump_int("num_objects_misplaced", num_objects_misplaced);
2371 f->dump_int("num_objects_unfound", num_objects_unfound);
2372 f->dump_int("num_objects_dirty", num_objects_dirty);
2373 f->dump_int("num_whiteouts", num_whiteouts);
2374 f->dump_int("num_read", num_rd);
2375 f->dump_int("num_read_kb", num_rd_kb);
2376 f->dump_int("num_write", num_wr);
2377 f->dump_int("num_write_kb", num_wr_kb);
2378 f->dump_int("num_scrub_errors", num_scrub_errors);
2379 f->dump_int("num_shallow_scrub_errors", num_shallow_scrub_errors);
2380 f->dump_int("num_deep_scrub_errors", num_deep_scrub_errors);
2381 f->dump_int("num_objects_recovered", num_objects_recovered);
2382 f->dump_int("num_bytes_recovered", num_bytes_recovered);
2383 f->dump_int("num_keys_recovered", num_keys_recovered);
2384 f->dump_int("num_objects_omap", num_objects_omap);
2385 f->dump_int("num_objects_hit_set_archive", num_objects_hit_set_archive);
2386 f->dump_int("num_bytes_hit_set_archive", num_bytes_hit_set_archive);
2387 f->dump_int("num_flush", num_flush);
2388 f->dump_int("num_flush_kb", num_flush_kb);
2389 f->dump_int("num_evict", num_evict);
2390 f->dump_int("num_evict_kb", num_evict_kb);
2391 f->dump_int("num_promote", num_promote);
2392 f->dump_int("num_flush_mode_high", num_flush_mode_high);
2393 f->dump_int("num_flush_mode_low", num_flush_mode_low);
2394 f->dump_int("num_evict_mode_some", num_evict_mode_some);
2395 f->dump_int("num_evict_mode_full", num_evict_mode_full);
2396 f->dump_int("num_objects_pinned", num_objects_pinned);
2397 f->dump_int("num_legacy_snapsets", num_legacy_snapsets);
2398 f->dump_int("num_large_omap_objects", num_large_omap_objects);
2399 f->dump_int("num_objects_manifest", num_objects_manifest);
2400 f->dump_int("num_omap_bytes", num_omap_bytes);
2401 f->dump_int("num_omap_keys", num_omap_keys);
2402 f->dump_int("num_objects_repaired", num_objects_repaired);
2403 }
2404
// Append the encoded form of all counters to `bl`, wrapped in an
// ENCODE_START(20, 14, ...) / ENCODE_FINISH envelope.
//
// Two mutually exclusive bodies are compiled depending on
// CEPH_LITTLE_ENDIAN: a single raw memory append, or one encode() call per
// counter.  decode() reads the counters back in the per-counter order used
// by the #else branch.
2405 void object_stat_sum_t::encode(ceph::buffer::list& bl) const
2406 {
2407 ENCODE_START(20, 14, bl);
2408 #if defined(CEPH_LITTLE_ENDIAN)
// Fast path: append sizeof(object_stat_sum_t) raw bytes starting at the
// address of num_bytes.
// NOTE(review): presumably this relies on the struct's in-memory layout
// (member order, sizes, no padding) matching the field sequence of the
// #else branch -- confirm against the struct declaration before adding or
// reordering members.
2409 bl.append((char *)(&num_bytes), sizeof(object_stat_sum_t));
2410 #else
// Portable path: encode each counter individually.  This order is the wire
// order and differs from the order used by dump()/add()/sub().
2411 encode(num_bytes, bl);
2412 encode(num_objects, bl);
2413 encode(num_object_clones, bl);
2414 encode(num_object_copies, bl);
2415 encode(num_objects_missing_on_primary, bl);
2416 encode(num_objects_degraded, bl);
2417 encode(num_objects_unfound, bl);
2418 encode(num_rd, bl);
2419 encode(num_rd_kb, bl);
2420 encode(num_wr, bl);
2421 encode(num_wr_kb, bl);
2422 encode(num_scrub_errors, bl);
2423 encode(num_objects_recovered, bl);
2424 encode(num_bytes_recovered, bl);
2425 encode(num_keys_recovered, bl);
2426 encode(num_shallow_scrub_errors, bl);
2427 encode(num_deep_scrub_errors, bl);
2428 encode(num_objects_dirty, bl);
2429 encode(num_whiteouts, bl);
2430 encode(num_objects_omap, bl);
2431 encode(num_objects_hit_set_archive, bl);
2432 encode(num_objects_misplaced, bl);
2433 encode(num_bytes_hit_set_archive, bl);
2434 encode(num_flush, bl);
2435 encode(num_flush_kb, bl);
2436 encode(num_evict, bl);
2437 encode(num_evict_kb, bl);
2438 encode(num_promote, bl);
2439 encode(num_flush_mode_high, bl);
2440 encode(num_flush_mode_low, bl);
2441 encode(num_evict_mode_some, bl);
2442 encode(num_evict_mode_full, bl);
2443 encode(num_objects_pinned, bl);
2444 encode(num_objects_missing, bl);
2445 encode(num_legacy_snapsets, bl);
2446 encode(num_large_omap_objects, bl);
2447 encode(num_objects_manifest, bl);
2448 encode(num_omap_bytes, bl);
2449 encode(num_omap_keys, bl);
2450 encode(num_objects_repaired, bl);
2451 #endif
2452 ENCODE_FINISH(bl);
2453 }
2454
// Read the counters back from `bl`.
//
// When built with CEPH_LITTLE_ENDIAN and the encoded struct_v equals
// STAT_SUM_DECODE_VERSION, the whole object is filled with one raw copy.
// In every other case the counters are decoded one by one, in the same
// order as the per-counter branch of encode(); counters introduced in
// struct_v 16..20 are read only when the encoding is new enough.
2455 void object_stat_sum_t::decode(ceph::buffer::list::const_iterator& bl)
2456 {
// set once the raw-copy fast path has consumed the payload
2457 bool decode_finish = false;
2458 static const int STAT_SUM_DECODE_VERSION = 20;
2459 DECODE_START(STAT_SUM_DECODE_VERSION, bl);
2460 #if defined(CEPH_LITTLE_ENDIAN)
// Fast path, taken only for an exact version match: copy
// sizeof(object_stat_sum_t) bytes over this object starting at num_bytes.
// NOTE(review): presumably valid only while the member layout matches what
// encode() appended -- confirm against the struct declaration.
2461 if (struct_v == STAT_SUM_DECODE_VERSION) {
2462 bl.copy(sizeof(object_stat_sum_t), (char*)(&num_bytes));
2463 decode_finish = true;
2464 }
2465 #endif
2466 if (!decode_finish) {
2467 decode(num_bytes, bl);
2468 decode(num_objects, bl);
2469 decode(num_object_clones, bl);
2470 decode(num_object_copies, bl);
2471 decode(num_objects_missing_on_primary, bl);
2472 decode(num_objects_degraded, bl);
2473 decode(num_objects_unfound, bl);
2474 decode(num_rd, bl);
2475 decode(num_rd_kb, bl);
2476 decode(num_wr, bl);
2477 decode(num_wr_kb, bl);
2478 decode(num_scrub_errors, bl);
2479 decode(num_objects_recovered, bl);
2480 decode(num_bytes_recovered, bl);
2481 decode(num_keys_recovered, bl);
2482 decode(num_shallow_scrub_errors, bl);
2483 decode(num_deep_scrub_errors, bl);
2484 decode(num_objects_dirty, bl);
2485 decode(num_whiteouts, bl);
2486 decode(num_objects_omap, bl);
2487 decode(num_objects_hit_set_archive, bl);
2488 decode(num_objects_misplaced, bl);
2489 decode(num_bytes_hit_set_archive, bl);
2490 decode(num_flush, bl);
2491 decode(num_flush_kb, bl);
2492 decode(num_evict, bl);
2493 decode(num_evict_kb, bl);
2494 decode(num_promote, bl);
2495 decode(num_flush_mode_high, bl);
2496 decode(num_flush_mode_low, bl);
2497 decode(num_evict_mode_some, bl);
2498 decode(num_evict_mode_full, bl);
2499 decode(num_objects_pinned, bl);
2500 decode(num_objects_missing, bl);
// Counters below were added in later struct versions.  Only
// num_legacy_snapsets gets an explicit fallback; the others are simply not
// touched when the encoding predates them.
2501 if (struct_v >= 16) {
2502 decode(num_legacy_snapsets, bl);
2503 } else {
2504 num_legacy_snapsets = num_object_clones; // upper bound
2505 }
2506 if (struct_v >= 17) {
2507 decode(num_large_omap_objects, bl);
2508 }
2509 if (struct_v >= 18) {
2510 decode(num_objects_manifest, bl);
2511 }
2512 if (struct_v >= 19) {
2513 decode(num_omap_bytes, bl);
2514 decode(num_omap_keys, bl);
2515 }
2516 if (struct_v >= 20) {
2517 decode(num_objects_repaired, bl);
2518 }
2519 }
2520 DECODE_FINISH(bl);
2521 }
2522
2523 void object_stat_sum_t::generate_test_instances(list<object_stat_sum_t*>& o)
2524 {
2525 object_stat_sum_t a;
2526
2527 a.num_bytes = 1;
2528 a.num_objects = 3;
2529 a.num_object_clones = 4;
2530 a.num_object_copies = 5;
2531 a.num_objects_missing_on_primary = 6;
2532 a.num_objects_missing = 123;
2533 a.num_objects_degraded = 7;
2534 a.num_objects_unfound = 8;
2535 a.num_rd = 9; a.num_rd_kb = 10;
2536 a.num_wr = 11; a.num_wr_kb = 12;
2537 a.num_objects_recovered = 14;
2538 a.num_bytes_recovered = 15;
2539 a.num_keys_recovered = 16;
2540 a.num_deep_scrub_errors = 17;
2541 a.num_shallow_scrub_errors = 18;
2542 a.num_scrub_errors = a.num_deep_scrub_errors + a.num_shallow_scrub_errors;
2543 a.num_objects_dirty = 21;
2544 a.num_whiteouts = 22;
2545 a.num_objects_misplaced = 1232;
2546 a.num_objects_hit_set_archive = 2;
2547 a.num_bytes_hit_set_archive = 27;
2548 a.num_flush = 5;
2549 a.num_flush_kb = 6;
2550 a.num_evict = 7;
2551 a.num_evict_kb = 8;
2552 a.num_promote = 9;
2553 a.num_flush_mode_high = 0;
2554 a.num_flush_mode_low = 1;
2555 a.num_evict_mode_some = 1;
2556 a.num_evict_mode_full = 0;
2557 a.num_objects_pinned = 20;
2558 a.num_large_omap_objects = 5;
2559 a.num_objects_manifest = 2;
2560 a.num_omap_bytes = 20000;
2561 a.num_omap_keys = 200;
2562 a.num_objects_repaired = 300;
2563 o.push_back(new object_stat_sum_t(a));
2564 }
2565
2566 void object_stat_sum_t::add(const object_stat_sum_t& o)
2567 {
2568 num_bytes += o.num_bytes;
2569 num_objects += o.num_objects;
2570 num_object_clones += o.num_object_clones;
2571 num_object_copies += o.num_object_copies;
2572 num_objects_missing_on_primary += o.num_objects_missing_on_primary;
2573 num_objects_missing += o.num_objects_missing;
2574 num_objects_degraded += o.num_objects_degraded;
2575 num_objects_misplaced += o.num_objects_misplaced;
2576 num_rd += o.num_rd;
2577 num_rd_kb += o.num_rd_kb;
2578 num_wr += o.num_wr;
2579 num_wr_kb += o.num_wr_kb;
2580 num_objects_unfound += o.num_objects_unfound;
2581 num_scrub_errors += o.num_scrub_errors;
2582 num_shallow_scrub_errors += o.num_shallow_scrub_errors;
2583 num_deep_scrub_errors += o.num_deep_scrub_errors;
2584 num_objects_recovered += o.num_objects_recovered;
2585 num_bytes_recovered += o.num_bytes_recovered;
2586 num_keys_recovered += o.num_keys_recovered;
2587 num_objects_dirty += o.num_objects_dirty;
2588 num_whiteouts += o.num_whiteouts;
2589 num_objects_omap += o.num_objects_omap;
2590 num_objects_hit_set_archive += o.num_objects_hit_set_archive;
2591 num_bytes_hit_set_archive += o.num_bytes_hit_set_archive;
2592 num_flush += o.num_flush;
2593 num_flush_kb += o.num_flush_kb;
2594 num_evict += o.num_evict;
2595 num_evict_kb += o.num_evict_kb;
2596 num_promote += o.num_promote;
2597 num_flush_mode_high += o.num_flush_mode_high;
2598 num_flush_mode_low += o.num_flush_mode_low;
2599 num_evict_mode_some += o.num_evict_mode_some;
2600 num_evict_mode_full += o.num_evict_mode_full;
2601 num_objects_pinned += o.num_objects_pinned;
2602 num_legacy_snapsets += o.num_legacy_snapsets;
2603 num_large_omap_objects += o.num_large_omap_objects;
2604 num_objects_manifest += o.num_objects_manifest;
2605 num_omap_bytes += o.num_omap_bytes;
2606 num_omap_keys += o.num_omap_keys;
2607 num_objects_repaired += o.num_objects_repaired;
2608 }
2609
2610 void object_stat_sum_t::sub(const object_stat_sum_t& o)
2611 {
2612 num_bytes -= o.num_bytes;
2613 num_objects -= o.num_objects;
2614 num_object_clones -= o.num_object_clones;
2615 num_object_copies -= o.num_object_copies;
2616 num_objects_missing_on_primary -= o.num_objects_missing_on_primary;
2617 num_objects_missing -= o.num_objects_missing;
2618 num_objects_degraded -= o.num_objects_degraded;
2619 num_objects_misplaced -= o.num_objects_misplaced;
2620 num_rd -= o.num_rd;
2621 num_rd_kb -= o.num_rd_kb;
2622 num_wr -= o.num_wr;
2623 num_wr_kb -= o.num_wr_kb;
2624 num_objects_unfound -= o.num_objects_unfound;
2625 num_scrub_errors -= o.num_scrub_errors;
2626 num_shallow_scrub_errors -= o.num_shallow_scrub_errors;
2627 num_deep_scrub_errors -= o.num_deep_scrub_errors;
2628 num_objects_recovered -= o.num_objects_recovered;
2629 num_bytes_recovered -= o.num_bytes_recovered;
2630 num_keys_recovered -= o.num_keys_recovered;
2631 num_objects_dirty -= o.num_objects_dirty;
2632 num_whiteouts -= o.num_whiteouts;
2633 num_objects_omap -= o.num_objects_omap;
2634 num_objects_hit_set_archive -= o.num_objects_hit_set_archive;
2635 num_bytes_hit_set_archive -= o.num_bytes_hit_set_archive;
2636 num_flush -= o.num_flush;
2637 num_flush_kb -= o.num_flush_kb;
2638 num_evict -= o.num_evict;
2639 num_evict_kb -= o.num_evict_kb;
2640 num_promote -= o.num_promote;
2641 num_flush_mode_high -= o.num_flush_mode_high;
2642 num_flush_mode_low -= o.num_flush_mode_low;
2643 num_evict_mode_some -= o.num_evict_mode_some;
2644 num_evict_mode_full -= o.num_evict_mode_full;
2645 num_objects_pinned -= o.num_objects_pinned;
2646 num_legacy_snapsets -= o.num_legacy_snapsets;
2647 num_large_omap_objects -= o.num_large_omap_objects;
2648 num_objects_manifest -= o.num_objects_manifest;
2649 num_omap_bytes -= o.num_omap_bytes;
2650 num_omap_keys -= o.num_omap_keys;
2651 num_objects_repaired -= o.num_objects_repaired;
2652 }
2653
2654 bool operator==(const object_stat_sum_t& l, const object_stat_sum_t& r)
2655 {
2656 return
2657 l.num_bytes == r.num_bytes &&
2658 l.num_objects == r.num_objects &&
2659 l.num_object_clones == r.num_object_clones &&
2660 l.num_object_copies == r.num_object_copies &&
2661 l.num_objects_missing_on_primary == r.num_objects_missing_on_primary &&
2662 l.num_objects_missing == r.num_objects_missing &&
2663 l.num_objects_degraded == r.num_objects_degraded &&
2664 l.num_objects_misplaced == r.num_objects_misplaced &&
2665 l.num_objects_unfound == r.num_objects_unfound &&
2666 l.num_rd == r.num_rd &&
2667 l.num_rd_kb == r.num_rd_kb &&
2668 l.num_wr == r.num_wr &&
2669 l.num_wr_kb == r.num_wr_kb &&
2670 l.num_scrub_errors == r.num_scrub_errors &&
2671 l.num_shallow_scrub_errors == r.num_shallow_scrub_errors &&
2672 l.num_deep_scrub_errors == r.num_deep_scrub_errors &&
2673 l.num_objects_recovered == r.num_objects_recovered &&
2674 l.num_bytes_recovered == r.num_bytes_recovered &&
2675 l.num_keys_recovered == r.num_keys_recovered &&
2676 l.num_objects_dirty == r.num_objects_dirty &&
2677 l.num_whiteouts == r.num_whiteouts &&
2678 l.num_objects_omap == r.num_objects_omap &&
2679 l.num_objects_hit_set_archive == r.num_objects_hit_set_archive &&
2680 l.num_bytes_hit_set_archive == r.num_bytes_hit_set_archive &&
2681 l.num_flush == r.num_flush &&
2682 l.num_flush_kb == r.num_flush_kb &&
2683 l.num_evict == r.num_evict &&
2684 l.num_evict_kb == r.num_evict_kb &&
2685 l.num_promote == r.num_promote &&
2686 l.num_flush_mode_high == r.num_flush_mode_high &&
2687 l.num_flush_mode_low == r.num_flush_mode_low &&
2688 l.num_evict_mode_some == r.num_evict_mode_some &&
2689 l.num_evict_mode_full == r.num_evict_mode_full &&
2690 l.num_objects_pinned == r.num_objects_pinned &&
2691 l.num_legacy_snapsets == r.num_legacy_snapsets &&
2692 l.num_large_omap_objects == r.num_large_omap_objects &&
2693 l.num_objects_manifest == r.num_objects_manifest &&
2694 l.num_omap_bytes == r.num_omap_bytes &&
2695 l.num_omap_keys == r.num_omap_keys &&
2696 l.num_objects_repaired == r.num_objects_repaired;
2697 }
2698
2699 // -- object_stat_collection_t --
2700
2701 void object_stat_collection_t::dump(Formatter *f) const
2702 {
2703 f->open_object_section("stat_sum");
2704 sum.dump(f);
2705 f->close_section();
2706 }
2707
2708 void object_stat_collection_t::encode(ceph::buffer::list& bl) const
2709 {
2710 ENCODE_START(2, 2, bl);
2711 encode(sum, bl);
2712 encode((__u32)0, bl);
2713 ENCODE_FINISH(bl);
2714 }
2715
2716 void object_stat_collection_t::decode(ceph::buffer::list::const_iterator& bl)
2717 {
2718 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
2719 decode(sum, bl);
2720 {
2721 map<string,object_stat_sum_t> cat_sum;
2722 decode(cat_sum, bl);
2723 }
2724 DECODE_FINISH(bl);
2725 }
2726
2727 void object_stat_collection_t::generate_test_instances(list<object_stat_collection_t*>& o)
2728 {
2729 object_stat_collection_t a;
2730 o.push_back(new object_stat_collection_t(a));
2731 list<object_stat_sum_t*> l;
2732 object_stat_sum_t::generate_test_instances(l);
2733 for (auto p = l.begin(); p != l.end(); ++p) {
2734 a.add(**p);
2735 o.push_back(new object_stat_collection_t(a));
2736 }
2737 }
2738
2739
2740 // -- pg_stat_t --
2741
2742 bool pg_stat_t::is_acting_osd(int32_t osd, bool primary) const
2743 {
2744 if (primary && osd == acting_primary) {
2745 return true;
2746 } else if (!primary) {
2747 for(auto it = acting.cbegin(); it != acting.cend(); ++it)
2748 {
2749 if (*it == osd)
2750 return true;
2751 }
2752 }
2753 return false;
2754 }
2755
2756 void pg_stat_t::dump(Formatter *f) const
2757 {
2758 f->dump_stream("version") << version;
2759 f->dump_stream("reported_seq") << reported_seq;
2760 f->dump_stream("reported_epoch") << reported_epoch;
2761 f->dump_string("state", pg_state_string(state));
2762 f->dump_stream("last_fresh") << last_fresh;
2763 f->dump_stream("last_change") << last_change;
2764 f->dump_stream("last_active") << last_active;
2765 f->dump_stream("last_peered") << last_peered;
2766 f->dump_stream("last_clean") << last_clean;
2767 f->dump_stream("last_became_active") << last_became_active;
2768 f->dump_stream("last_became_peered") << last_became_peered;
2769 f->dump_stream("last_unstale") << last_unstale;
2770 f->dump_stream("last_undegraded") << last_undegraded;
2771 f->dump_stream("last_fullsized") << last_fullsized;
2772 f->dump_unsigned("mapping_epoch", mapping_epoch);
2773 f->dump_stream("log_start") << log_start;
2774 f->dump_stream("ondisk_log_start") << ondisk_log_start;
2775 f->dump_unsigned("created", created);
2776 f->dump_unsigned("last_epoch_clean", last_epoch_clean);
2777 f->dump_stream("parent") << parent;
2778 f->dump_unsigned("parent_split_bits", parent_split_bits);
2779 f->dump_stream("last_scrub") << last_scrub;
2780 f->dump_stream("last_scrub_stamp") << last_scrub_stamp;
2781 f->dump_stream("last_deep_scrub") << last_deep_scrub;
2782 f->dump_stream("last_deep_scrub_stamp") << last_deep_scrub_stamp;
2783 f->dump_stream("last_clean_scrub_stamp") << last_clean_scrub_stamp;
2784 f->dump_int("log_size", log_size);
2785 f->dump_int("ondisk_log_size", ondisk_log_size);
2786 f->dump_bool("stats_invalid", stats_invalid);
2787 f->dump_bool("dirty_stats_invalid", dirty_stats_invalid);
2788 f->dump_bool("omap_stats_invalid", omap_stats_invalid);
2789 f->dump_bool("hitset_stats_invalid", hitset_stats_invalid);
2790 f->dump_bool("hitset_bytes_stats_invalid", hitset_bytes_stats_invalid);
2791 f->dump_bool("pin_stats_invalid", pin_stats_invalid);
2792 f->dump_bool("manifest_stats_invalid", manifest_stats_invalid);
2793 f->dump_unsigned("snaptrimq_len", snaptrimq_len);
2794 stats.dump(f);
2795 f->open_array_section("up");
2796 for (auto p = up.cbegin(); p != up.cend(); ++p)
2797 f->dump_int("osd", *p);
2798 f->close_section();
2799 f->open_array_section("acting");
2800 for (auto p = acting.cbegin(); p != acting.cend(); ++p)
2801 f->dump_int("osd", *p);
2802 f->close_section();
2803 f->open_array_section("avail_no_missing");
2804 for (auto p = avail_no_missing.cbegin(); p != avail_no_missing.cend(); ++p)
2805 f->dump_stream("shard") << *p;
2806 f->close_section();
2807 f->open_array_section("object_location_counts");
2808 for (auto p = object_location_counts.cbegin(); p != object_location_counts.cend(); ++p) {
2809 f->open_object_section("entry");
2810 f->dump_stream("shards") << p->first;
2811 f->dump_int("objects", p->second);
2812 f->close_section();
2813 }
2814 f->close_section();
2815 f->open_array_section("blocked_by");
2816 for (auto p = blocked_by.cbegin(); p != blocked_by.cend(); ++p)
2817 f->dump_int("osd", *p);
2818 f->close_section();
2819 f->dump_int("up_primary", up_primary);
2820 f->dump_int("acting_primary", acting_primary);
2821 f->open_array_section("purged_snaps");
2822 for (auto i = purged_snaps.begin(); i != purged_snaps.end(); ++i) {
2823 f->open_object_section("interval");
2824 f->dump_stream("start") << i.get_start();
2825 f->dump_stream("length") << i.get_len();
2826 f->close_section();
2827 }
2828 f->close_section();
2829 }
2830
2831 void pg_stat_t::dump_brief(Formatter *f) const
2832 {
2833 f->dump_string("state", pg_state_string(state));
2834 f->open_array_section("up");
2835 for (auto p = up.cbegin(); p != up.cend(); ++p)
2836 f->dump_int("osd", *p);
2837 f->close_section();
2838 f->open_array_section("acting");
2839 for (auto p = acting.cbegin(); p != acting.cend(); ++p)
2840 f->dump_int("osd", *p);
2841 f->close_section();
2842 f->dump_int("up_primary", up_primary);
2843 f->dump_int("acting_primary", acting_primary);
2844 }
2845
// Append the encoded form of this PG stat record to `bl`, wrapped in an
// ENCODE_START(26, 22, ...) / ENCODE_FINISH envelope.
//
// The sequence of encode() calls below is the wire order; decode() reads
// the fields back in exactly this sequence, so fields must only ever be
// appended at the end.
2846 void pg_stat_t::encode(ceph::buffer::list &bl) const
2847 {
2848 ENCODE_START(26, 22, bl);
2849 encode(version, bl);
2850 encode(reported_seq, bl);
2851 encode(reported_epoch, bl);
// Only the low 32 bits of state go here; the upper 32 bits are encoded
// separately as top_state further down.
2852 encode((__u32)state, bl); // for older peers
2853 encode(log_start, bl);
2854 encode(ondisk_log_start, bl);
2855 encode(created, bl);
2856 encode(last_epoch_clean, bl);
2857 encode(parent, bl);
2858 encode(parent_split_bits, bl);
2859 encode(last_scrub, bl);
2860 encode(last_scrub_stamp, bl);
2861 encode(stats, bl);
2862 encode(log_size, bl);
2863 encode(ondisk_log_size, bl);
2864 encode(up, bl);
2865 encode(acting, bl);
2866 encode(last_fresh, bl);
2867 encode(last_change, bl);
2868 encode(last_active, bl);
2869 encode(last_clean, bl);
2870 encode(last_unstale, bl);
2871 encode(mapping_epoch, bl);
2872 encode(last_deep_scrub, bl);
2873 encode(last_deep_scrub_stamp, bl);
2874 encode(stats_invalid, bl);
2875 encode(last_clean_scrub_stamp, bl);
2876 encode(last_became_active, bl);
2877 encode(dirty_stats_invalid, bl);
2878 encode(up_primary, bl);
2879 encode(acting_primary, bl);
2880 encode(omap_stats_invalid, bl);
2881 encode(hitset_stats_invalid, bl);
2882 encode(blocked_by, bl);
2883 encode(last_undegraded, bl);
2884 encode(last_fullsized, bl);
2885 encode(hitset_bytes_stats_invalid, bl);
2886 encode(last_peered, bl);
2887 encode(last_became_peered, bl);
2888 encode(pin_stats_invalid, bl);
// Everything from here on is read by decode() only when struct_v is new
// enough (23 for snaptrimq_len, 24 for top_state and purged_snaps, 25 for
// manifest_stats_invalid, 26 for the last two fields).
2889 encode(snaptrimq_len, bl);
// upper 32 bits of state; decode() recombines them with the low word above
2890 __u32 top_state = (state >> 32);
2891 encode(top_state, bl);
2892 encode(purged_snaps, bl);
2893 encode(manifest_stats_invalid, bl);
2894 encode(avail_no_missing, bl);
2895 encode(object_location_counts, bl);
2896 ENCODE_FINISH(bl);
2897 }
2898
2899 void pg_stat_t::decode(ceph::buffer::list::const_iterator &bl)
2900 {
2901 bool tmp;
2902 uint32_t old_state;
2903 DECODE_START(26, bl);
2904 decode(version, bl);
2905 decode(reported_seq, bl);
2906 decode(reported_epoch, bl);
2907 decode(old_state, bl);
2908 decode(log_start, bl);
2909 decode(ondisk_log_start, bl);
2910 decode(created, bl);
2911 decode(last_epoch_clean, bl);
2912 decode(parent, bl);
2913 decode(parent_split_bits, bl);
2914 decode(last_scrub, bl);
2915 decode(last_scrub_stamp, bl);
2916 decode(stats, bl);
2917 decode(log_size, bl);
2918 decode(ondisk_log_size, bl);
2919 decode(up, bl);
2920 decode(acting, bl);
2921 decode(last_fresh, bl);
2922 decode(last_change, bl);
2923 decode(last_active, bl);
2924 decode(last_clean, bl);
2925 decode(last_unstale, bl);
2926 decode(mapping_epoch, bl);
2927 decode(last_deep_scrub, bl);
2928 decode(last_deep_scrub_stamp, bl);
2929 decode(tmp, bl);
2930 stats_invalid = tmp;
2931 decode(last_clean_scrub_stamp, bl);
2932 decode(last_became_active, bl);
2933 decode(tmp, bl);
2934 dirty_stats_invalid = tmp;
2935 decode(up_primary, bl);
2936 decode(acting_primary, bl);
2937 decode(tmp, bl);
2938 omap_stats_invalid = tmp;
2939 decode(tmp, bl);
2940 hitset_stats_invalid = tmp;
2941 decode(blocked_by, bl);
2942 decode(last_undegraded, bl);
2943 decode(last_fullsized, bl);
2944 decode(tmp, bl);
2945 hitset_bytes_stats_invalid = tmp;
2946 decode(last_peered, bl);
2947 decode(last_became_peered, bl);
2948 decode(tmp, bl);
2949 pin_stats_invalid = tmp;
2950 if (struct_v >= 23) {
2951 decode(snaptrimq_len, bl);
2952 if (struct_v >= 24) {
2953 __u32 top_state;
2954 decode(top_state, bl);
2955 state = (uint64_t)old_state | ((uint64_t)top_state << 32);
2956 decode(purged_snaps, bl);
2957 } else {
2958 state = old_state;
2959 }
2960 if (struct_v >= 25) {
2961 decode(tmp, bl);
2962 manifest_stats_invalid = tmp;
2963 } else {
2964 manifest_stats_invalid = true;
2965 }
2966 if (struct_v >= 26) {
2967 decode(avail_no_missing, bl);
2968 decode(object_location_counts, bl);
2969 }
2970 }
2971 DECODE_FINISH(bl);
2972 }
2973
2974 void pg_stat_t::generate_test_instances(list<pg_stat_t*>& o)
2975 {
2976 pg_stat_t a;
2977 o.push_back(new pg_stat_t(a));
2978
2979 a.version = eversion_t(1, 3);
2980 a.reported_epoch = 1;
2981 a.reported_seq = 2;
2982 a.state = 123;
2983 a.mapping_epoch = 998;
2984 a.last_fresh = utime_t(1002, 1);
2985 a.last_change = utime_t(1002, 2);
2986 a.last_active = utime_t(1002, 3);
2987 a.last_clean = utime_t(1002, 4);
2988 a.last_unstale = utime_t(1002, 5);
2989 a.last_undegraded = utime_t(1002, 7);
2990 a.last_fullsized = utime_t(1002, 8);
2991 a.log_start = eversion_t(1, 4);
2992 a.ondisk_log_start = eversion_t(1, 5);
2993 a.created = 6;
2994 a.last_epoch_clean = 7;
2995 a.parent = pg_t(1, 2);
2996 a.parent_split_bits = 12;
2997 a.last_scrub = eversion_t(9, 10);
2998 a.last_scrub_stamp = utime_t(11, 12);
2999 a.last_deep_scrub = eversion_t(13, 14);
3000 a.last_deep_scrub_stamp = utime_t(15, 16);
3001 a.last_clean_scrub_stamp = utime_t(17, 18);
3002 a.snaptrimq_len = 1048576;
3003 list<object_stat_collection_t*> l;
3004 object_stat_collection_t::generate_test_instances(l);
3005 a.stats = *l.back();
3006 a.log_size = 99;
3007 a.ondisk_log_size = 88;
3008 a.up.push_back(123);
3009 a.up_primary = 123;
3010 a.acting.push_back(456);
3011 a.avail_no_missing.push_back(pg_shard_t(456, shard_id_t::NO_SHARD));
3012 set<pg_shard_t> sset = { pg_shard_t(0), pg_shard_t(1) };
3013 a.object_location_counts.insert(make_pair(sset, 10));
3014 sset.insert(pg_shard_t(2));
3015 a.object_location_counts.insert(make_pair(sset, 5));
3016 a.acting_primary = 456;
3017 o.push_back(new pg_stat_t(a));
3018
3019 a.up.push_back(124);
3020 a.up_primary = 124;
3021 a.acting.push_back(124);
3022 a.acting_primary = 124;
3023 a.blocked_by.push_back(155);
3024 a.blocked_by.push_back(156);
3025 o.push_back(new pg_stat_t(a));
3026 }
3027
3028 bool operator==(const pg_stat_t& l, const pg_stat_t& r)
3029 {
3030 return
3031 l.version == r.version &&
3032 l.reported_seq == r.reported_seq &&
3033 l.reported_epoch == r.reported_epoch &&
3034 l.state == r.state &&
3035 l.last_fresh == r.last_fresh &&
3036 l.last_change == r.last_change &&
3037 l.last_active == r.last_active &&
3038 l.last_peered == r.last_peered &&
3039 l.last_clean == r.last_clean &&
3040 l.last_unstale == r.last_unstale &&
3041 l.last_undegraded == r.last_undegraded &&
3042 l.last_fullsized == r.last_fullsized &&
3043 l.log_start == r.log_start &&
3044 l.ondisk_log_start == r.ondisk_log_start &&
3045 l.created == r.created &&
3046 l.last_epoch_clean == r.last_epoch_clean &&
3047 l.parent == r.parent &&
3048 l.parent_split_bits == r.parent_split_bits &&
3049 l.last_scrub == r.last_scrub &&
3050 l.last_deep_scrub == r.last_deep_scrub &&
3051 l.last_scrub_stamp == r.last_scrub_stamp &&
3052 l.last_deep_scrub_stamp == r.last_deep_scrub_stamp &&
3053 l.last_clean_scrub_stamp == r.last_clean_scrub_stamp &&
3054 l.stats == r.stats &&
3055 l.stats_invalid == r.stats_invalid &&
3056 l.log_size == r.log_size &&
3057 l.ondisk_log_size == r.ondisk_log_size &&
3058 l.up == r.up &&
3059 l.acting == r.acting &&
3060 l.avail_no_missing == r.avail_no_missing &&
3061 l.object_location_counts == r.object_location_counts &&
3062 l.mapping_epoch == r.mapping_epoch &&
3063 l.blocked_by == r.blocked_by &&
3064 l.last_became_active == r.last_became_active &&
3065 l.last_became_peered == r.last_became_peered &&
3066 l.dirty_stats_invalid == r.dirty_stats_invalid &&
3067 l.omap_stats_invalid == r.omap_stats_invalid &&
3068 l.hitset_stats_invalid == r.hitset_stats_invalid &&
3069 l.hitset_bytes_stats_invalid == r.hitset_bytes_stats_invalid &&
3070 l.up_primary == r.up_primary &&
3071 l.acting_primary == r.acting_primary &&
3072 l.pin_stats_invalid == r.pin_stats_invalid &&
3073 l.manifest_stats_invalid == r.manifest_stats_invalid &&
3074 l.purged_snaps == r.purged_snaps &&
3075 l.snaptrimq_len == r.snaptrimq_len;
3076 }
3077
3078 // -- store_statfs_t --
3079
3080 bool store_statfs_t::operator==(const store_statfs_t& other) const
3081 {
3082 return total == other.total
3083 && available == other.available
3084 && allocated == other.allocated
3085 && internally_reserved == other.internally_reserved
3086 && data_stored == other.data_stored
3087 && data_compressed == other.data_compressed
3088 && data_compressed_allocated == other.data_compressed_allocated
3089 && data_compressed_original == other.data_compressed_original
3090 && omap_allocated == other.omap_allocated
3091 && internal_metadata == other.internal_metadata;
3092 }
3093
3094 void store_statfs_t::dump(Formatter *f) const
3095 {
3096 f->dump_int("total", total);
3097 f->dump_int("available", available);
3098 f->dump_int("internally_reserved", internally_reserved);
3099 f->dump_int("allocated", allocated);
3100 f->dump_int("data_stored", data_stored);
3101 f->dump_int("data_compressed", data_compressed);
3102 f->dump_int("data_compressed_allocated", data_compressed_allocated);
3103 f->dump_int("data_compressed_original", data_compressed_original);
3104 f->dump_int("omap_allocated", omap_allocated);
3105 f->dump_int("internal_metadata", internal_metadata);
3106 }
3107
3108 ostream& operator<<(ostream& out, const store_statfs_t &s)
3109 {
3110 out << std::hex
3111 << "store_statfs(0x" << s.available
3112 << "/0x" << s.internally_reserved
3113 << "/0x" << s.total
3114 << ", data 0x" << s.data_stored
3115 << "/0x" << s.allocated
3116 << ", compress 0x" << s.data_compressed
3117 << "/0x" << s.data_compressed_allocated
3118 << "/0x" << s.data_compressed_original
3119 << ", omap 0x" << s.omap_allocated
3120 << ", meta 0x" << s.internal_metadata
3121 << std::dec
3122 << ")";
3123 return out;
3124 }
3125
3126 void store_statfs_t::generate_test_instances(list<store_statfs_t*>& o)
3127 {
3128 store_statfs_t a;
3129 o.push_back(new store_statfs_t(a));
3130 a.total = 234;
3131 a.available = 123;
3132 a.internally_reserved = 33;
3133 a.allocated = 32;
3134 a.data_stored = 44;
3135 a.data_compressed = 21;
3136 a.data_compressed_allocated = 12;
3137 a.data_compressed_original = 13;
3138 a.omap_allocated = 14;
3139 a.internal_metadata = 15;
3140 o.push_back(new store_statfs_t(a));
3141 }
3142
3143 // -- pool_stat_t --
3144
3145 void pool_stat_t::dump(Formatter *f) const
3146 {
3147 stats.dump(f);
3148 f->open_object_section("store_stats");
3149 store_stats.dump(f);
3150 f->close_section();
3151 f->dump_int("log_size", log_size);
3152 f->dump_int("ondisk_log_size", ondisk_log_size);
3153 f->dump_int("up", up);
3154 f->dump_int("acting", acting);
3155 f->dump_int("num_store_stats", num_store_stats);
3156 }
3157
3158 void pool_stat_t::encode(ceph::buffer::list &bl, uint64_t features) const
3159 {
3160 using ceph::encode;
3161 if ((features & CEPH_FEATURE_OSDENC) == 0) {
3162 __u8 v = 4;
3163 encode(v, bl);
3164 encode(stats, bl);
3165 encode(log_size, bl);
3166 encode(ondisk_log_size, bl);
3167 return;
3168 }
3169
3170 ENCODE_START(7, 5, bl);
3171 encode(stats, bl);
3172 encode(log_size, bl);
3173 encode(ondisk_log_size, bl);
3174 encode(up, bl);
3175 encode(acting, bl);
3176 encode(store_stats, bl);
3177 encode(num_store_stats, bl);
3178 ENCODE_FINISH(bl);
3179 }
3180
3181 void pool_stat_t::decode(ceph::buffer::list::const_iterator &bl)
3182 {
3183 DECODE_START_LEGACY_COMPAT_LEN(7, 5, 5, bl);
3184 if (struct_v >= 4) {
3185 decode(stats, bl);
3186 decode(log_size, bl);
3187 decode(ondisk_log_size, bl);
3188 if (struct_v >= 6) {
3189 decode(up, bl);
3190 decode(acting, bl);
3191 } else {
3192 up = 0;
3193 acting = 0;
3194 }
3195 if (struct_v >= 7) {
3196 decode(store_stats, bl);
3197 decode(num_store_stats, bl);
3198 } else {
3199 store_stats.reset();
3200 num_store_stats = 0;
3201 }
3202
3203 } else {
3204 decode(stats.sum.num_bytes, bl);
3205 uint64_t num_kb;
3206 decode(num_kb, bl);
3207 decode(stats.sum.num_objects, bl);
3208 decode(stats.sum.num_object_clones, bl);
3209 decode(stats.sum.num_object_copies, bl);
3210 decode(stats.sum.num_objects_missing_on_primary, bl);
3211 decode(stats.sum.num_objects_degraded, bl);
3212 decode(log_size, bl);
3213 decode(ondisk_log_size, bl);
3214 if (struct_v >= 2) {
3215 decode(stats.sum.num_rd, bl);
3216 decode(stats.sum.num_rd_kb, bl);
3217 decode(stats.sum.num_wr, bl);
3218 decode(stats.sum.num_wr_kb, bl);
3219 }
3220 if (struct_v >= 3) {
3221 decode(stats.sum.num_objects_unfound, bl);
3222 }
3223 }
3224 DECODE_FINISH(bl);
3225 }
3226
3227 void pool_stat_t::generate_test_instances(list<pool_stat_t*>& o)
3228 {
3229 pool_stat_t a;
3230 o.push_back(new pool_stat_t(a));
3231
3232 list<object_stat_collection_t*> l;
3233 object_stat_collection_t::generate_test_instances(l);
3234 list<store_statfs_t*> ll;
3235 store_statfs_t::generate_test_instances(ll);
3236 a.stats = *l.back();
3237 a.store_stats = *ll.back();
3238 a.log_size = 123;
3239 a.ondisk_log_size = 456;
3240 a.acting = 3;
3241 a.up = 4;
3242 a.num_store_stats = 1;
3243 o.push_back(new pool_stat_t(a));
3244 }
3245
3246
3247 // -- pg_history_t --
3248
/**
 * Encode this pg_history_t into @p bl.
 *
 * Fields are appended in a fixed order that pg_history_t::decode() reads
 * back; new fields must only be added at the end, with a version bump.
 */
void pg_history_t::encode(ceph::buffer::list &bl) const
{
  // NOTE(review): arguments are presumably (struct_v, compat_v, bl); the
  // macro is defined outside this file -- confirm.
  ENCODE_START(10, 4, bl);
  encode(epoch_created, bl);
  encode(last_epoch_started, bl);
  encode(last_epoch_clean, bl);
  encode(last_epoch_split, bl);
  encode(same_interval_since, bl);
  encode(same_up_since, bl);
  encode(same_primary_since, bl);
  encode(last_scrub, bl);
  encode(last_scrub_stamp, bl);
  // the fields below are each gated on a struct_v check in decode()
  encode(last_deep_scrub, bl);
  encode(last_deep_scrub_stamp, bl);
  encode(last_clean_scrub_stamp, bl);
  encode(last_epoch_marked_full, bl);
  encode(last_interval_started, bl);
  encode(last_interval_clean, bl);
  encode(epoch_pool_created, bl);
  encode(prior_readable_until_ub, bl);
  ENCODE_FINISH(bl);
}
3271
3272 void pg_history_t::decode(ceph::buffer::list::const_iterator &bl)
3273 {
3274 DECODE_START_LEGACY_COMPAT_LEN(10, 4, 4, bl);
3275 decode(epoch_created, bl);
3276 decode(last_epoch_started, bl);
3277 if (struct_v >= 3)
3278 decode(last_epoch_clean, bl);
3279 else
3280 last_epoch_clean = last_epoch_started; // careful, it's a lie!
3281 decode(last_epoch_split, bl);
3282 decode(same_interval_since, bl);
3283 decode(same_up_since, bl);
3284 decode(same_primary_since, bl);
3285 if (struct_v >= 2) {
3286 decode(last_scrub, bl);
3287 decode(last_scrub_stamp, bl);
3288 }
3289 if (struct_v >= 5) {
3290 decode(last_deep_scrub, bl);
3291 decode(last_deep_scrub_stamp, bl);
3292 }
3293 if (struct_v >= 6) {
3294 decode(last_clean_scrub_stamp, bl);
3295 }
3296 if (struct_v >= 7) {
3297 decode(last_epoch_marked_full, bl);
3298 }
3299 if (struct_v >= 8) {
3300 decode(last_interval_started, bl);
3301 decode(last_interval_clean, bl);
3302 } else {
3303 if (last_epoch_started >= same_interval_since) {
3304 last_interval_started = same_interval_since;
3305 } else {
3306 last_interval_started = last_epoch_started; // best guess
3307 }
3308 if (last_epoch_clean >= same_interval_since) {
3309 last_interval_clean = same_interval_since;
3310 } else {
3311 last_interval_clean = last_epoch_clean; // best guess
3312 }
3313 }
3314 if (struct_v >= 9) {
3315 decode(epoch_pool_created, bl);
3316 } else {
3317 epoch_pool_created = epoch_created;
3318 }
3319 if (struct_v >= 10) {
3320 decode(prior_readable_until_ub, bl);
3321 }
3322 DECODE_FINISH(bl);
3323 }
3324
3325 void pg_history_t::dump(Formatter *f) const
3326 {
3327 f->dump_int("epoch_created", epoch_created);
3328 f->dump_int("epoch_pool_created", epoch_pool_created);
3329 f->dump_int("last_epoch_started", last_epoch_started);
3330 f->dump_int("last_interval_started", last_interval_started);
3331 f->dump_int("last_epoch_clean", last_epoch_clean);
3332 f->dump_int("last_interval_clean", last_interval_clean);
3333 f->dump_int("last_epoch_split", last_epoch_split);
3334 f->dump_int("last_epoch_marked_full", last_epoch_marked_full);
3335 f->dump_int("same_up_since", same_up_since);
3336 f->dump_int("same_interval_since", same_interval_since);
3337 f->dump_int("same_primary_since", same_primary_since);
3338 f->dump_stream("last_scrub") << last_scrub;
3339 f->dump_stream("last_scrub_stamp") << last_scrub_stamp;
3340 f->dump_stream("last_deep_scrub") << last_deep_scrub;
3341 f->dump_stream("last_deep_scrub_stamp") << last_deep_scrub_stamp;
3342 f->dump_stream("last_clean_scrub_stamp") << last_clean_scrub_stamp;
3343 f->dump_float(
3344 "prior_readable_until_ub",
3345 std::chrono::duration<double>(prior_readable_until_ub).count());
3346 }
3347
3348 void pg_history_t::generate_test_instances(list<pg_history_t*>& o)
3349 {
3350 o.push_back(new pg_history_t);
3351 o.push_back(new pg_history_t);
3352 o.back()->epoch_created = 1;
3353 o.back()->epoch_pool_created = 1;
3354 o.back()->last_epoch_started = 2;
3355 o.back()->last_interval_started = 2;
3356 o.back()->last_epoch_clean = 3;
3357 o.back()->last_interval_clean = 2;
3358 o.back()->last_epoch_split = 4;
3359 o.back()->prior_readable_until_ub = make_timespan(3.1415);
3360 o.back()->same_up_since = 5;
3361 o.back()->same_interval_since = 6;
3362 o.back()->same_primary_since = 7;
3363 o.back()->last_scrub = eversion_t(8, 9);
3364 o.back()->last_scrub_stamp = utime_t(10, 11);
3365 o.back()->last_deep_scrub = eversion_t(12, 13);
3366 o.back()->last_deep_scrub_stamp = utime_t(14, 15);
3367 o.back()->last_clean_scrub_stamp = utime_t(16, 17);
3368 o.back()->last_epoch_marked_full = 18;
3369 }
3370
3371
3372 // -- pg_info_t --
3373
3374 void pg_info_t::encode(ceph::buffer::list &bl) const
3375 {
3376 ENCODE_START(32, 26, bl);
3377 encode(pgid.pgid, bl);
3378 encode(last_update, bl);
3379 encode(last_complete, bl);
3380 encode(log_tail, bl);
3381 encode(hobject_t(), bl); // old (nibblewise) last_backfill
3382 encode(stats, bl);
3383 history.encode(bl);
3384 encode(purged_snaps, bl);
3385 encode(last_epoch_started, bl);
3386 encode(last_user_version, bl);
3387 encode(hit_set, bl);
3388 encode(pgid.shard, bl);
3389 encode(last_backfill, bl);
3390 encode(true, bl); // was last_backfill_bitwise
3391 encode(last_interval_started, bl);
3392 ENCODE_FINISH(bl);
3393 }
3394
3395 void pg_info_t::decode(ceph::buffer::list::const_iterator &bl)
3396 {
3397 DECODE_START(32, bl);
3398 decode(pgid.pgid, bl);
3399 decode(last_update, bl);
3400 decode(last_complete, bl);
3401 decode(log_tail, bl);
3402 {
3403 hobject_t old_last_backfill;
3404 decode(old_last_backfill, bl);
3405 }
3406 decode(stats, bl);
3407 history.decode(bl);
3408 decode(purged_snaps, bl);
3409 decode(last_epoch_started, bl);
3410 decode(last_user_version, bl);
3411 decode(hit_set, bl);
3412 decode(pgid.shard, bl);
3413 decode(last_backfill, bl);
3414 {
3415 bool last_backfill_bitwise;
3416 decode(last_backfill_bitwise, bl);
3417 // note: we may see a false value here since the default value for
3418 // the member was false, so it often didn't get set to true until
3419 // peering progressed.
3420 }
3421 if (struct_v >= 32) {
3422 decode(last_interval_started, bl);
3423 } else {
3424 last_interval_started = last_epoch_started;
3425 }
3426 DECODE_FINISH(bl);
3427 }
3428
3429 // -- pg_info_t --
3430
3431 void pg_info_t::dump(Formatter *f) const
3432 {
3433 f->dump_stream("pgid") << pgid;
3434 f->dump_stream("last_update") << last_update;
3435 f->dump_stream("last_complete") << last_complete;
3436 f->dump_stream("log_tail") << log_tail;
3437 f->dump_int("last_user_version", last_user_version);
3438 f->dump_stream("last_backfill") << last_backfill;
3439 f->open_array_section("purged_snaps");
3440 for (interval_set<snapid_t>::const_iterator i=purged_snaps.begin();
3441 i != purged_snaps.end();
3442 ++i) {
3443 f->open_object_section("purged_snap_interval");
3444 f->dump_stream("start") << i.get_start();
3445 f->dump_stream("length") << i.get_len();
3446 f->close_section();
3447 }
3448 f->close_section();
3449 f->open_object_section("history");
3450 history.dump(f);
3451 f->close_section();
3452 f->open_object_section("stats");
3453 stats.dump(f);
3454 f->close_section();
3455
3456 f->dump_int("empty", is_empty());
3457 f->dump_int("dne", dne());
3458 f->dump_int("incomplete", is_incomplete());
3459 f->dump_int("last_epoch_started", last_epoch_started);
3460
3461 f->open_object_section("hit_set_history");
3462 hit_set.dump(f);
3463 f->close_section();
3464 }
3465
3466 void pg_info_t::generate_test_instances(list<pg_info_t*>& o)
3467 {
3468 o.push_back(new pg_info_t);
3469 o.push_back(new pg_info_t);
3470 list<pg_history_t*> h;
3471 pg_history_t::generate_test_instances(h);
3472 o.back()->history = *h.back();
3473 o.back()->pgid = spg_t(pg_t(1, 2), shard_id_t::NO_SHARD);
3474 o.back()->last_update = eversion_t(3, 4);
3475 o.back()->last_complete = eversion_t(5, 6);
3476 o.back()->last_user_version = 2;
3477 o.back()->log_tail = eversion_t(7, 8);
3478 o.back()->last_backfill = hobject_t(object_t("objname"), "key", 123, 456, -1, "");
3479 {
3480 list<pg_stat_t*> s;
3481 pg_stat_t::generate_test_instances(s);
3482 o.back()->stats = *s.back();
3483 }
3484 {
3485 list<pg_hit_set_history_t*> s;
3486 pg_hit_set_history_t::generate_test_instances(s);
3487 o.back()->hit_set = *s.back();
3488 }
3489 }
3490
3491 // -- pg_notify_t --
/**
 * Encode this pg_notify_t into @p bl.
 *
 * The field order matches what pg_notify_t::decode() reads.
 */
void pg_notify_t::encode(ceph::buffer::list &bl) const
{
  ENCODE_START(3, 2, bl);
  encode(query_epoch, bl);
  encode(epoch_sent, bl);
  encode(info, bl);
  encode(to, bl);
  encode(from, bl);
  // decode() only reads past_intervals when struct_v >= 3
  encode(past_intervals, bl);
  ENCODE_FINISH(bl);
}
3503
/**
 * Decode a pg_notify_t from @p bl.
 *
 * past_intervals is only present for struct_v >= 3; for older encodings
 * the member is left as it was before the call.
 */
void pg_notify_t::decode(ceph::buffer::list::const_iterator &bl)
{
  DECODE_START(3, bl);
  decode(query_epoch, bl);
  decode(epoch_sent, bl);
  decode(info, bl);
  decode(to, bl);
  decode(from, bl);
  if (struct_v >= 3) {
    decode(past_intervals, bl);
  }
  DECODE_FINISH(bl);
}
3517
3518 void pg_notify_t::dump(Formatter *f) const
3519 {
3520 f->dump_int("from", from);
3521 f->dump_int("to", to);
3522 f->dump_unsigned("query_epoch", query_epoch);
3523 f->dump_unsigned("epoch_sent", epoch_sent);
3524 {
3525 f->open_object_section("info");
3526 info.dump(f);
3527 f->close_section();
3528 }
3529 f->dump_object("past_intervals", past_intervals);
3530 }
3531
3532 void pg_notify_t::generate_test_instances(list<pg_notify_t*>& o)
3533 {
3534 o.push_back(new pg_notify_t(shard_id_t(3), shard_id_t::NO_SHARD, 1, 1,
3535 pg_info_t(), PastIntervals()));
3536 o.push_back(new pg_notify_t(shard_id_t(0), shard_id_t(0), 3, 10,
3537 pg_info_t(), PastIntervals()));
3538 }
3539
3540 ostream &operator<<(ostream &lhs, const pg_notify_t ¬ify)
3541 {
3542 lhs << "(query:" << notify.query_epoch
3543 << " sent:" << notify.epoch_sent
3544 << " " << notify.info;
3545 if (notify.from != shard_id_t::NO_SHARD ||
3546 notify.to != shard_id_t::NO_SHARD)
3547 lhs << " " << (unsigned)notify.from
3548 << "->" << (unsigned)notify.to;
3549 lhs << " " << notify.past_intervals;
3550 return lhs << ")";
3551 }
3552
3553 // -- pg_interval_t --
3554
/**
 * Encode this interval into @p bl.
 *
 * The field order matches what pg_interval_t::decode() reads.
 */
void PastIntervals::pg_interval_t::encode(ceph::buffer::list& bl) const
{
  ENCODE_START(4, 2, bl);
  encode(first, bl);
  encode(last, bl);
  encode(up, bl);
  encode(acting, bl);
  encode(maybe_went_rw, bl);
  // decode() reads primary only for struct_v >= 3, up_primary for >= 4
  encode(primary, bl);
  encode(up_primary, bl);
  ENCODE_FINISH(bl);
}
3567
3568 void PastIntervals::pg_interval_t::decode(ceph::buffer::list::const_iterator& bl)
3569 {
3570 DECODE_START_LEGACY_COMPAT_LEN(4, 2, 2, bl);
3571 decode(first, bl);
3572 decode(last, bl);
3573 decode(up, bl);
3574 decode(acting, bl);
3575 decode(maybe_went_rw, bl);
3576 if (struct_v >= 3) {
3577 decode(primary, bl);
3578 } else {
3579 if (acting.size())
3580 primary = acting[0];
3581 }
3582 if (struct_v >= 4) {
3583 decode(up_primary, bl);
3584 } else {
3585 if (up.size())
3586 up_primary = up[0];
3587 }
3588 DECODE_FINISH(bl);
3589 }
3590
3591 void PastIntervals::pg_interval_t::dump(Formatter *f) const
3592 {
3593 f->dump_unsigned("first", first);
3594 f->dump_unsigned("last", last);
3595 f->dump_int("maybe_went_rw", maybe_went_rw ? 1 : 0);
3596 f->open_array_section("up");
3597 for (auto p = up.cbegin(); p != up.cend(); ++p)
3598 f->dump_int("osd", *p);
3599 f->close_section();
3600 f->open_array_section("acting");
3601 for (auto p = acting.cbegin(); p != acting.cend(); ++p)
3602 f->dump_int("osd", *p);
3603 f->close_section();
3604 f->dump_int("primary", primary);
3605 f->dump_int("up_primary", up_primary);
3606 }
3607
3608 void PastIntervals::pg_interval_t::generate_test_instances(list<pg_interval_t*>& o)
3609 {
3610 o.push_back(new pg_interval_t);
3611 o.push_back(new pg_interval_t);
3612 o.back()->up.push_back(1);
3613 o.back()->acting.push_back(2);
3614 o.back()->acting.push_back(3);
3615 o.back()->first = 4;
3616 o.back()->last = 5;
3617 o.back()->maybe_went_rw = true;
3618 }
3619
3620 WRITE_CLASS_ENCODER(PastIntervals::pg_interval_t)
3621
3622
3623 /**
3624 * pi_compact_rep
3625 *
3626 * PastIntervals only needs to be able to answer two questions:
3627 * 1) Where should the primary look for unfound objects?
3628 * 2) List a set of subsets of the OSDs such that contacting at least
3629 * one from each subset guarantees we speak to at least one witness
3630 * of any completed write.
3631 *
3632 * Crucially, 2) does not require keeping *all* past intervals. Certainly,
3633 * we don't need to keep any where maybe_went_rw would be false. We also
3634 * needn't keep two intervals where the actingset in one is a subset
3635 * of the other (only need to keep the smaller of the two sets). In order
3636 * to accurately trim the set of intervals as last_epoch_started changes
3637 * without rebuilding the set from scratch, we'll retain the larger set
 * if it is in an older interval.
3639 */
3640 struct compact_interval_t {
3641 epoch_t first;
3642 epoch_t last;
3643 set<pg_shard_t> acting;
3644 bool supersedes(const compact_interval_t &other) {
3645 for (auto &&i: acting) {
3646 if (!other.acting.count(i))
3647 return false;
3648 }
3649 return true;
3650 }
3651 void dump(Formatter *f) const {
3652 f->open_object_section("compact_interval_t");
3653 f->dump_stream("first") << first;
3654 f->dump_stream("last") << last;
3655 f->dump_stream("acting") << acting;
3656 f->close_section();
3657 }
3658 void encode(ceph::buffer::list &bl) const {
3659 ENCODE_START(1, 1, bl);
3660 encode(first, bl);
3661 encode(last, bl);
3662 encode(acting, bl);
3663 ENCODE_FINISH(bl);
3664 }
3665 void decode(ceph::buffer::list::const_iterator &bl) {
3666 DECODE_START(1, bl);
3667 decode(first, bl);
3668 decode(last, bl);
3669 decode(acting, bl);
3670 DECODE_FINISH(bl);
3671 }
3672 static void generate_test_instances(list<compact_interval_t*> & o) {
3673 /* Not going to be used, we'll generate pi_compact_rep directly */
3674 }
3675 };
3676 ostream &operator<<(ostream &o, const compact_interval_t &rhs)
3677 {
3678 return o << "([" << rhs.first << "," << rhs.last
3679 << "] acting " << rhs.acting << ")";
3680 }
3681 WRITE_CLASS_ENCODER(compact_interval_t)
3682
3683 class pi_compact_rep : public PastIntervals::interval_rep {
3684 epoch_t first = 0;
3685 epoch_t last = 0; // inclusive
3686 set<pg_shard_t> all_participants;
3687 list<compact_interval_t> intervals;
3688 pi_compact_rep(
3689 bool ec_pool,
3690 std::list<PastIntervals::pg_interval_t> &&intervals) {
3691 for (auto &&i: intervals)
3692 add_interval(ec_pool, i);
3693 }
3694 public:
3695 pi_compact_rep() = default;
3696 pi_compact_rep(const pi_compact_rep &) = default;
3697 pi_compact_rep(pi_compact_rep &&) = default;
3698 pi_compact_rep &operator=(const pi_compact_rep &) = default;
3699 pi_compact_rep &operator=(pi_compact_rep &&) = default;
3700
3701 size_t size() const override { return intervals.size(); }
3702 bool empty() const override {
3703 return first > last || (first == 0 && last == 0);
3704 }
3705 void clear() override {
3706 *this = pi_compact_rep();
3707 }
3708 pair<epoch_t, epoch_t> get_bounds() const override {
3709 return make_pair(first, last + 1);
3710 }
3711 void adjust_start_backwards(epoch_t last_epoch_clean) {
3712 first = last_epoch_clean;
3713 }
3714
3715 set<pg_shard_t> get_all_participants(
3716 bool ec_pool) const override {
3717 return all_participants;
3718 }
3719 void add_interval(
3720 bool ec_pool, const PastIntervals::pg_interval_t &interval) override {
3721 if (first == 0)
3722 first = interval.first;
3723 ceph_assert(interval.last > last);
3724 last = interval.last;
3725 set<pg_shard_t> acting;
3726 for (unsigned i = 0; i < interval.acting.size(); ++i) {
3727 if (interval.acting[i] == CRUSH_ITEM_NONE)
3728 continue;
3729 acting.insert(
3730 pg_shard_t(
3731 interval.acting[i],
3732 ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD));
3733 }
3734 all_participants.insert(acting.begin(), acting.end());
3735 if (!interval.maybe_went_rw)
3736 return;
3737 intervals.push_back(
3738 compact_interval_t{interval.first, interval.last, acting});
3739 auto plast = intervals.end();
3740 --plast;
3741 for (auto cur = intervals.begin(); cur != plast; ) {
3742 if (plast->supersedes(*cur)) {
3743 intervals.erase(cur++);
3744 } else {
3745 ++cur;
3746 }
3747 }
3748 }
3749 unique_ptr<PastIntervals::interval_rep> clone() const override {
3750 return unique_ptr<PastIntervals::interval_rep>(new pi_compact_rep(*this));
3751 }
3752 ostream &print(ostream &out) const override {
3753 return out << "([" << first << "," << last
3754 << "] all_participants=" << all_participants
3755 << " intervals=" << intervals << ")";
3756 }
3757 void encode(ceph::buffer::list &bl) const override {
3758 ENCODE_START(1, 1, bl);
3759 encode(first, bl);
3760 encode(last, bl);
3761 encode(all_participants, bl);
3762 encode(intervals, bl);
3763 ENCODE_FINISH(bl);
3764 }
3765 void decode(ceph::buffer::list::const_iterator &bl) override {
3766 DECODE_START(1, bl);
3767 decode(first, bl);
3768 decode(last, bl);
3769 decode(all_participants, bl);
3770 decode(intervals, bl);
3771 DECODE_FINISH(bl);
3772 }
3773 void dump(Formatter *f) const override {
3774 f->open_object_section("PastIntervals::compact_rep");
3775 f->dump_stream("first") << first;
3776 f->dump_stream("last") << last;
3777 f->open_array_section("all_participants");
3778 for (auto& i : all_participants) {
3779 f->dump_object("pg_shard", i);
3780 }
3781 f->close_section();
3782 f->open_array_section("intervals");
3783 for (auto &&i: intervals) {
3784 i.dump(f);
3785 }
3786 f->close_section();
3787 f->close_section();
3788 }
3789 static void generate_test_instances(list<pi_compact_rep*> &o) {
3790 using ival = PastIntervals::pg_interval_t;
3791 using ivallst = std::list<ival>;
3792 o.push_back(
3793 new pi_compact_rep(
3794 true, ivallst
3795 { ival{{0, 1, 2}, {0, 1, 2}, 10, 20, true, 0, 0}
3796 , ival{{ 1, 2}, { 1, 2}, 21, 30, true, 1, 1}
3797 , ival{{ 2}, { 2}, 31, 35, false, 2, 2}
3798 , ival{{0, 2}, {0, 2}, 36, 50, true, 0, 0}
3799 }));
3800 o.push_back(
3801 new pi_compact_rep(
3802 false, ivallst
3803 { ival{{0, 1, 2}, {0, 1, 2}, 10, 20, true, 0, 0}
3804 , ival{{ 1, 2}, { 1, 2}, 21, 30, true, 1, 1}
3805 , ival{{ 2}, { 2}, 31, 35, false, 2, 2}
3806 , ival{{0, 2}, {0, 2}, 36, 50, true, 0, 0}
3807 }));
3808 o.push_back(
3809 new pi_compact_rep(
3810 true, ivallst
3811 { ival{{2, 1, 0}, {2, 1, 0}, 10, 20, true, 1, 1}
3812 , ival{{ 0, 2}, { 0, 2}, 21, 30, true, 0, 0}
3813 , ival{{ 0, 2}, {2, 0}, 31, 35, true, 2, 2}
3814 , ival{{ 0, 2}, { 0, 2}, 36, 50, true, 0, 0}
3815 }));
3816 }
3817 void iterate_mayberw_back_to(
3818 epoch_t les,
3819 std::function<void(epoch_t, const set<pg_shard_t> &)> &&f) const override {
3820 for (auto i = intervals.rbegin(); i != intervals.rend(); ++i) {
3821 if (i->last < les)
3822 break;
3823 f(i->first, i->acting);
3824 }
3825 }
3826 virtual ~pi_compact_rep() override {}
3827 };
3828 WRITE_CLASS_ENCODER(pi_compact_rep)
3829
3830 PastIntervals::PastIntervals()
3831 {
3832 past_intervals.reset(new pi_compact_rep);
3833 }
3834
3835 PastIntervals::PastIntervals(const PastIntervals &rhs)
3836 : past_intervals(rhs.past_intervals ?
3837 rhs.past_intervals->clone() :
3838 nullptr) {}
3839
3840 PastIntervals &PastIntervals::operator=(const PastIntervals &rhs)
3841 {
3842 PastIntervals other(rhs);
3843 swap(other);
3844 return *this;
3845 }
3846
3847 ostream& operator<<(ostream& out, const PastIntervals &i)
3848 {
3849 if (i.past_intervals) {
3850 return i.past_intervals->print(out);
3851 } else {
3852 return out << "(empty)";
3853 }
3854 }
3855
3856 ostream& operator<<(ostream& out, const PastIntervals::PriorSet &i)
3857 {
3858 return out << "PriorSet("
3859 << "ec_pool: " << i.ec_pool
3860 << ", probe: " << i.probe
3861 << ", down: " << i.down
3862 << ", blocked_by: " << i.blocked_by
3863 << ", pg_down: " << i.pg_down
3864 << ")";
3865 }
3866
3867 void PastIntervals::decode(ceph::buffer::list::const_iterator &bl)
3868 {
3869 DECODE_START(1, bl);
3870 __u8 type = 0;
3871 decode(type, bl);
3872 switch (type) {
3873 case 0:
3874 break;
3875 case 1:
3876 ceph_abort_msg("pi_simple_rep support removed post-luminous");
3877 break;
3878 case 2:
3879 past_intervals.reset(new pi_compact_rep);
3880 past_intervals->decode(bl);
3881 break;
3882 }
3883 DECODE_FINISH(bl);
3884 }
3885
3886 void PastIntervals::generate_test_instances(list<PastIntervals*> &o)
3887 {
3888 {
3889 list<pi_compact_rep *> compact;
3890 pi_compact_rep::generate_test_instances(compact);
3891 for (auto &&i: compact) {
3892 // takes ownership of contents
3893 o.push_back(new PastIntervals(i));
3894 }
3895 }
3896 return;
3897 }
3898
3899 bool PastIntervals::is_new_interval(
3900 int old_acting_primary,
3901 int new_acting_primary,
3902 const vector<int> &old_acting,
3903 const vector<int> &new_acting,
3904 int old_up_primary,
3905 int new_up_primary,
3906 const vector<int> &old_up,
3907 const vector<int> &new_up,
3908 int old_size,
3909 int new_size,
3910 int old_min_size,
3911 int new_min_size,
3912 unsigned old_pg_num,
3913 unsigned new_pg_num,
3914 unsigned old_pg_num_pending,
3915 unsigned new_pg_num_pending,
3916 bool old_sort_bitwise,
3917 bool new_sort_bitwise,
3918 bool old_recovery_deletes,
3919 bool new_recovery_deletes,
3920 pg_t pgid) {
3921 return old_acting_primary != new_acting_primary ||
3922 new_acting != old_acting ||
3923 old_up_primary != new_up_primary ||
3924 new_up != old_up ||
3925 old_min_size != new_min_size ||
3926 old_size != new_size ||
3927 pgid.is_split(old_pg_num, new_pg_num, 0) ||
3928 // (is or was) pre-merge source
3929 pgid.is_merge_source(old_pg_num_pending, new_pg_num_pending, 0) ||
3930 pgid.is_merge_source(new_pg_num_pending, old_pg_num_pending, 0) ||
3931 // merge source
3932 pgid.is_merge_source(old_pg_num, new_pg_num, 0) ||
3933 // (is or was) pre-merge target
3934 pgid.is_merge_target(old_pg_num_pending, new_pg_num_pending) ||
3935 pgid.is_merge_target(new_pg_num_pending, old_pg_num_pending) ||
3936 // merge target
3937 pgid.is_merge_target(old_pg_num, new_pg_num) ||
3938 old_sort_bitwise != new_sort_bitwise ||
3939 old_recovery_deletes != new_recovery_deletes;
3940 }
3941
3942 bool PastIntervals::is_new_interval(
3943 int old_acting_primary,
3944 int new_acting_primary,
3945 const vector<int> &old_acting,
3946 const vector<int> &new_acting,
3947 int old_up_primary,
3948 int new_up_primary,
3949 const vector<int> &old_up,
3950 const vector<int> &new_up,
3951 const OSDMap *osdmap,
3952 const OSDMap *lastmap,
3953 pg_t pgid)
3954 {
3955 const pg_pool_t *plast = lastmap->get_pg_pool(pgid.pool());
3956 if (!plast) {
3957 return false; // after pool is deleted there are no more interval changes
3958 }
3959 const pg_pool_t *pi = osdmap->get_pg_pool(pgid.pool());
3960 if (!pi) {
3961 return true; // pool was deleted this epoch -> (final!) interval change
3962 }
3963 return
3964 is_new_interval(old_acting_primary,
3965 new_acting_primary,
3966 old_acting,
3967 new_acting,
3968 old_up_primary,
3969 new_up_primary,
3970 old_up,
3971 new_up,
3972 plast->size,
3973 pi->size,
3974 plast->min_size,
3975 pi->min_size,
3976 plast->get_pg_num(),
3977 pi->get_pg_num(),
3978 plast->get_pg_num_pending(),
3979 pi->get_pg_num_pending(),
3980 lastmap->test_flag(CEPH_OSDMAP_SORTBITWISE),
3981 osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE),
3982 lastmap->test_flag(CEPH_OSDMAP_RECOVERY_DELETES),
3983 osdmap->test_flag(CEPH_OSDMAP_RECOVERY_DELETES),
3984 pgid);
3985 }
3986
3987 bool PastIntervals::check_new_interval(
3988 int old_acting_primary,
3989 int new_acting_primary,
3990 const vector<int> &old_acting,
3991 const vector<int> &new_acting,
3992 int old_up_primary,
3993 int new_up_primary,
3994 const vector<int> &old_up,
3995 const vector<int> &new_up,
3996 epoch_t same_interval_since,
3997 epoch_t last_epoch_clean,
3998 const OSDMap *osdmap,
3999 const OSDMap *lastmap,
4000 pg_t pgid,
4001 const IsPGRecoverablePredicate &could_have_gone_active,
4002 PastIntervals *past_intervals,
4003 std::ostream *out)
4004 {
4005 /*
4006 * We have to be careful to gracefully deal with situations like
4007 * so. Say we have a power outage or something that takes out both
4008 * OSDs, but the monitor doesn't mark them down in the same epoch.
4009 * The history may look like
4010 *
4011 * 1: A B
4012 * 2: B
4013 * 3: let's say B dies for good, too (say, from the power spike)
4014 * 4: A
4015 *
4016 * which makes it look like B may have applied updates to the PG
4017 * that we need in order to proceed. This sucks...
4018 *
4019 * To minimize the risk of this happening, we CANNOT go active if
4020 * _any_ OSDs in the prior set are down until we send an MOSDAlive
4021 * to the monitor such that the OSDMap sets osd_up_thru to an epoch.
4022 * Then, we have something like
4023 *
4024 * 1: A B
4025 * 2: B up_thru[B]=0
4026 * 3:
4027 * 4: A
4028 *
4029 * -> we can ignore B, bc it couldn't have gone active (up_thru still 0).
4030 *
4031 * or,
4032 *
4033 * 1: A B
4034 * 2: B up_thru[B]=0
4035 * 3: B up_thru[B]=2
4036 * 4:
4037 * 5: A
4038 *
4039 * -> we must wait for B, bc it was alive through 2, and could have
4040 * written to the pg.
4041 *
4042 * If B is really dead, then an administrator will need to manually
4043 * intervene by marking the OSD as "lost."
4044 */
4045
4046 // remember past interval
4047 // NOTE: a change in the up set primary triggers an interval
4048 // change, even though the interval members in the pg_interval_t
4049 // do not change.
4050 ceph_assert(past_intervals);
4051 ceph_assert(past_intervals->past_intervals);
4052 if (is_new_interval(
4053 old_acting_primary,
4054 new_acting_primary,
4055 old_acting,
4056 new_acting,
4057 old_up_primary,
4058 new_up_primary,
4059 old_up,
4060 new_up,
4061 osdmap,
4062 lastmap,
4063 pgid)) {
4064 pg_interval_t i;
4065 i.first = same_interval_since;
4066 i.last = osdmap->get_epoch() - 1;
4067 ceph_assert(i.first <= i.last);
4068 i.acting = old_acting;
4069 i.up = old_up;
4070 i.primary = old_acting_primary;
4071 i.up_primary = old_up_primary;
4072
4073 unsigned num_acting = 0;
4074 for (auto p = i.acting.cbegin(); p != i.acting.cend(); ++p)
4075 if (*p != CRUSH_ITEM_NONE)
4076 ++num_acting;
4077
4078 ceph_assert(lastmap->get_pools().count(pgid.pool()));
4079 const pg_pool_t& old_pg_pool = lastmap->get_pools().find(pgid.pool())->second;
4080 set<pg_shard_t> old_acting_shards;
4081 old_pg_pool.convert_to_pg_shards(old_acting, &old_acting_shards);
4082
4083 if (num_acting &&
4084 i.primary != -1 &&
4085 num_acting >= old_pg_pool.min_size &&
4086 could_have_gone_active(old_acting_shards)) {
4087 if (out)
4088 *out << __func__ << " " << i
4089 << " up_thru " << lastmap->get_up_thru(i.primary)
4090 << " up_from " << lastmap->get_up_from(i.primary)
4091 << " last_epoch_clean " << last_epoch_clean;
4092 if (lastmap->get_up_thru(i.primary) >= i.first &&
4093 lastmap->get_up_from(i.primary) <= i.first) {
4094 i.maybe_went_rw = true;
4095 if (out)
4096 *out << " " << i
4097 << " : primary up " << lastmap->get_up_from(i.primary)
4098 << "-" << lastmap->get_up_thru(i.primary)
4099 << " includes interval"
4100 << std::endl;
4101 } else if (last_epoch_clean >= i.first &&
4102 last_epoch_clean <= i.last) {
4103 // If the last_epoch_clean is included in this interval, then
4104 // the pg must have been rw (for recovery to have completed).
4105 // This is important because we won't know the _real_
4106 // first_epoch because we stop at last_epoch_clean, and we
4107 // don't want the oldest interval to randomly have
4108 // maybe_went_rw false depending on the relative up_thru vs
4109 // last_epoch_clean timing.
4110 i.maybe_went_rw = true;
4111 if (out)
4112 *out << " " << i
4113 << " : includes last_epoch_clean " << last_epoch_clean
4114 << " and presumed to have been rw"
4115 << std::endl;
4116 } else {
4117 i.maybe_went_rw = false;
4118 if (out)
4119 *out << " " << i
4120 << " : primary up " << lastmap->get_up_from(i.primary)
4121 << "-" << lastmap->get_up_thru(i.primary)
4122 << " does not include interval"
4123 << std::endl;
4124 }
4125 } else {
4126 i.maybe_went_rw = false;
4127 if (out)
4128 *out << __func__ << " " << i << " : acting set is too small" << std::endl;
4129 }
4130 past_intervals->past_intervals->add_interval(old_pg_pool.is_erasure(), i);
4131 return true;
4132 } else {
4133 return false;
4134 }
4135 }
4136
4137
4138 // true if the given map affects the prior set
4139 bool PastIntervals::PriorSet::affected_by_map(
4140 const OSDMap &osdmap,
4141 const DoutPrefixProvider *dpp) const
4142 {
4143 for (auto p = probe.begin(); p != probe.end(); ++p) {
4144 int o = p->osd;
4145
4146 // did someone in the prior set go down?
4147 if (osdmap.is_down(o) && down.count(o) == 0) {
(1) Event exp_primary_expr: |
expected an expression |
(2) Event caretline: |
^ |
4148 ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " now down" << dendl;
4149 return true;
4150 }
4151
4152 // did a down osd in cur get (re)marked as lost?
4153 auto r = blocked_by.find(o);
4154 if (r != blocked_by.end()) {
4155 if (!osdmap.exists(o)) {
4156 ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " no longer exists" << dendl;
4157 return true;
4158 }
4159 if (osdmap.get_info(o).lost_at != r->second) {
4160 ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " (re)marked as lost" << dendl;
4161 return true;
4162 }
4163 }
4164 }
4165
4166 // did someone in the prior down set go up?
4167 for (auto p = down.cbegin(); p != down.cend(); ++p) {
4168 int o = *p;
4169
4170 if (osdmap.is_up(o)) {
4171 ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " now up" << dendl;
4172 return true;
4173 }
4174
4175 // did someone in the prior set get lost or destroyed?
4176 if (!osdmap.exists(o)) {
4177 ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " no longer exists" << dendl;
4178 return true;
4179 }
4180 // did a down osd in down get (re)marked as lost?
4181 auto r = blocked_by.find(o);
4182 if (r != blocked_by.end()) {
4183 if (osdmap.get_info(o).lost_at != r->second) {
4184 ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " (re)marked as lost" << dendl;
4185 return true;
4186 }
4187 }
4188 }
4189
4190 return false;
4191 }
4192
4193 ostream& operator<<(ostream& out, const PastIntervals::pg_interval_t& i)
4194 {
4195 out << "interval(" << i.first << "-" << i.last
4196 << " up " << i.up << "(" << i.up_primary << ")"
4197 << " acting " << i.acting << "(" << i.primary << ")";
4198 if (i.maybe_went_rw)
4199 out << " maybe_went_rw";
4200 out << ")";
4201 return out;
4202 }
4203
4204
4205
4206 // -- pg_query_t --
4207
4208 void pg_query_t::encode(ceph::buffer::list &bl, uint64_t features) const {
4209 ENCODE_START(3, 3, bl);
4210 encode(type, bl);
4211 encode(since, bl);
4212 history.encode(bl);
4213 encode(epoch_sent, bl);
4214 encode(to, bl);
4215 encode(from, bl);
4216 ENCODE_FINISH(bl);
4217 }
4218
4219 void pg_query_t::decode(ceph::buffer::list::const_iterator &bl) {
4220 DECODE_START(3, bl);
4221 decode(type, bl);
4222 decode(since, bl);
4223 history.decode(bl);
4224 decode(epoch_sent, bl);
4225 decode(to, bl);
4226 decode(from, bl);
4227 DECODE_FINISH(bl);
4228 }
4229
4230 void pg_query_t::dump(Formatter *f) const
4231 {
4232 f->dump_int("from", from);
4233 f->dump_int("to", to);
4234 f->dump_string("type", get_type_name());
4235 f->dump_stream("since") << since;
4236 f->dump_stream("epoch_sent") << epoch_sent;
4237 f->open_object_section("history");
4238 history.dump(f);
4239 f->close_section();
4240 }
4241 void pg_query_t::generate_test_instances(list<pg_query_t*>& o)
4242 {
4243 o.push_back(new pg_query_t());
4244 list<pg_history_t*> h;
4245 pg_history_t::generate_test_instances(h);
4246 o.push_back(new pg_query_t(pg_query_t::INFO, shard_id_t(1), shard_id_t(2), *h.back(), 4));
4247 o.push_back(new pg_query_t(pg_query_t::MISSING, shard_id_t(2), shard_id_t(3), *h.back(), 4));
4248 o.push_back(new pg_query_t(pg_query_t::LOG, shard_id_t(0), shard_id_t(0),
4249 eversion_t(4, 5), *h.back(), 4));
4250 o.push_back(new pg_query_t(pg_query_t::FULLLOG,
4251 shard_id_t::NO_SHARD, shard_id_t::NO_SHARD,
4252 *h.back(), 5));
4253 }
4254
4255 // -- pg_lease_t --
4256
4257 void pg_lease_t::encode(bufferlist& bl) const
4258 {
4259 ENCODE_START(1, 1, bl);
4260 encode(readable_until, bl);
4261 encode(readable_until_ub, bl);
4262 encode(interval, bl);
4263 ENCODE_FINISH(bl);
4264 }
4265
4266 void pg_lease_t::decode(bufferlist::const_iterator& p)
4267 {
4268 DECODE_START(1, p);
4269 decode(readable_until, p);
4270 decode(readable_until_ub, p);
4271 decode(interval, p);
4272 DECODE_FINISH(p);
4273 }
4274
4275 void pg_lease_t::dump(Formatter *f) const
4276 {
4277 f->dump_stream("readable_until") << readable_until;
4278 f->dump_stream("readable_until_ub") << readable_until_ub;
4279 f->dump_stream("interval") << interval;
4280 }
4281
4282 void pg_lease_t::generate_test_instances(std::list<pg_lease_t*>& o)
4283 {
4284 o.push_back(new pg_lease_t());
4285 o.push_back(new pg_lease_t());
4286 o.back()->readable_until = make_timespan(1.5);
4287 o.back()->readable_until_ub = make_timespan(3.4);
4288 o.back()->interval = make_timespan(1.0);
4289 }
4290
4291 // -- pg_lease_ack_t --
4292
4293 void pg_lease_ack_t::encode(bufferlist& bl) const
4294 {
4295 ENCODE_START(1, 1, bl);
4296 encode(readable_until_ub, bl);
4297 ENCODE_FINISH(bl);
4298 }
4299
4300 void pg_lease_ack_t::decode(bufferlist::const_iterator& p)
4301 {
4302 DECODE_START(1, p);
4303 decode(readable_until_ub, p);
4304 DECODE_FINISH(p);
4305 }
4306
4307 void pg_lease_ack_t::dump(Formatter *f) const
4308 {
4309 f->dump_stream("readable_until_ub") << readable_until_ub;
4310 }
4311
4312 void pg_lease_ack_t::generate_test_instances(std::list<pg_lease_ack_t*>& o)
4313 {
4314 o.push_back(new pg_lease_ack_t());
4315 o.push_back(new pg_lease_ack_t());
4316 o.back()->readable_until_ub = make_timespan(3.4);
4317 }
4318
4319
4320 // -- ObjectModDesc --
4321 void ObjectModDesc::visit(Visitor *visitor) const
4322 {
4323 auto bp = bl.cbegin();
4324 try {
4325 while (!bp.end()) {
4326 DECODE_START(max_required_version, bp);
4327 uint8_t code;
4328 decode(code, bp);
4329 switch (code) {
4330 case APPEND: {
4331 uint64_t size;
4332 decode(size, bp);
4333 visitor->append(size);
4334 break;
4335 }
4336 case SETATTRS: {
4337 map<string, std::optional<ceph::buffer::list> > attrs;
4338 decode(attrs, bp);
4339 visitor->setattrs(attrs);
4340 break;
4341 }
4342 case DELETE: {
4343 version_t old_version;
4344 decode(old_version, bp);
4345 visitor->rmobject(old_version);
4346 break;
4347 }
4348 case CREATE: {
4349 visitor->create();
4350 break;
4351 }
4352 case UPDATE_SNAPS: {
4353 set<snapid_t> snaps;
4354 decode(snaps, bp);
4355 visitor->update_snaps(snaps);
4356 break;
4357 }
4358 case TRY_DELETE: {
4359 version_t old_version;
4360 decode(old_version, bp);
4361 visitor->try_rmobject(old_version);
4362 break;
4363 }
4364 case ROLLBACK_EXTENTS: {
4365 vector<pair<uint64_t, uint64_t> > extents;
4366 version_t gen;
4367 decode(gen, bp);
4368 decode(extents, bp);
4369 visitor->rollback_extents(gen,extents);
4370 break;
4371 }
4372 default:
4373 ceph_abort_msg("Invalid rollback code");
4374 }
4375 DECODE_FINISH(bp);
4376 }
4377 } catch (...) {
4378 ceph_abort_msg("Invalid encoding");
4379 }
4380 }
4381
4382 struct DumpVisitor : public ObjectModDesc::Visitor {
4383 Formatter *f;
4384 explicit DumpVisitor(Formatter *f) : f(f) {}
4385 void append(uint64_t old_size) override {
4386 f->open_object_section("op");
4387 f->dump_string("code", "APPEND");
4388 f->dump_unsigned("old_size", old_size);
4389 f->close_section();
4390 }
4391 void setattrs(map<string, std::optional<ceph::buffer::list> > &attrs) override {
4392 f->open_object_section("op");
4393 f->dump_string("code", "SETATTRS");
4394 f->open_array_section("attrs");
4395 for (auto i = attrs.begin(); i != attrs.end(); ++i) {
4396 f->dump_string("attr_name", i->first);
4397 }
4398 f->close_section();
4399 f->close_section();
4400 }
4401 void rmobject(version_t old_version) override {
4402 f->open_object_section("op");
4403 f->dump_string("code", "RMOBJECT");
4404 f->dump_unsigned("old_version", old_version);
4405 f->close_section();
4406 }
4407 void try_rmobject(version_t old_version) override {
4408 f->open_object_section("op");
4409 f->dump_string("code", "TRY_RMOBJECT");
4410 f->dump_unsigned("old_version", old_version);
4411 f->close_section();
4412 }
4413 void create() override {
4414 f->open_object_section("op");
4415 f->dump_string("code", "CREATE");
4416 f->close_section();
4417 }
4418 void update_snaps(const set<snapid_t> &snaps) override {
4419 f->open_object_section("op");
4420 f->dump_string("code", "UPDATE_SNAPS");
4421 f->dump_stream("snaps") << snaps;
4422 f->close_section();
4423 }
4424 void rollback_extents(
4425 version_t gen,
4426 const vector<pair<uint64_t, uint64_t> > &extents) override {
4427 f->open_object_section("op");
4428 f->dump_string("code", "ROLLBACK_EXTENTS");
4429 f->dump_unsigned("gen", gen);
4430 f->dump_stream("snaps") << extents;
4431 f->close_section();
4432 }
4433 };
4434
4435 void ObjectModDesc::dump(Formatter *f) const
4436 {
4437 f->open_object_section("object_mod_desc");
4438 f->dump_bool("can_local_rollback", can_local_rollback);
4439 f->dump_bool("rollback_info_completed", rollback_info_completed);
4440 {
4441 f->open_array_section("ops");
4442 DumpVisitor vis(f);
4443 visit(&vis);
4444 f->close_section();
4445 }
4446 f->close_section();
4447 }
4448
4449 void ObjectModDesc::generate_test_instances(list<ObjectModDesc*>& o)
4450 {
4451 map<string, std::optional<ceph::buffer::list> > attrs;
4452 attrs[OI_ATTR];
4453 attrs[SS_ATTR];
4454 attrs["asdf"];
4455 o.push_back(new ObjectModDesc());
4456 o.back()->append(100);
4457 o.back()->setattrs(attrs);
4458 o.push_back(new ObjectModDesc());
4459 o.back()->rmobject(1001);
4460 o.push_back(new ObjectModDesc());
4461 o.back()->create();
4462 o.back()->setattrs(attrs);
4463 o.push_back(new ObjectModDesc());
4464 o.back()->create();
4465 o.back()->setattrs(attrs);
4466 o.back()->mark_unrollbackable();
4467 o.back()->append(1000);
4468 }
4469
4470 void ObjectModDesc::encode(ceph::buffer::list &_bl) const
4471 {
4472 ENCODE_START(max_required_version, max_required_version, _bl);
4473 encode(can_local_rollback, _bl);
4474 encode(rollback_info_completed, _bl);
4475 encode(bl, _bl);
4476 ENCODE_FINISH(_bl);
4477 }
4478 void ObjectModDesc::decode(ceph::buffer::list::const_iterator &_bl)
4479 {
4480 DECODE_START(2, _bl);
4481 max_required_version = struct_v;
4482 decode(can_local_rollback, _bl);
4483 decode(rollback_info_completed, _bl);
4484 decode(bl, _bl);
4485 // ensure bl does not pin a larger ceph::buffer in memory
4486 bl.rebuild();
4487 bl.reassign_to_mempool(mempool::mempool_osd_pglog);
4488 DECODE_FINISH(_bl);
4489 }
4490
4491 std::atomic<int32_t> ObjectCleanRegions::max_num_intervals = {10};
4492
4493 void ObjectCleanRegions::set_max_num_intervals(int32_t num)
4494 {
4495 max_num_intervals = num;
4496 }
4497
4498 void ObjectCleanRegions::trim()
4499 {
4500 while(clean_offsets.num_intervals() > max_num_intervals) {
4501 typename interval_set<uint64_t>::iterator shortest_interval = clean_offsets.begin();
4502 if (shortest_interval == clean_offsets.end())
4503 break;
4504 for (typename interval_set<uint64_t>::iterator it = clean_offsets.begin();
4505 it != clean_offsets.end();
4506 ++it) {
4507 if (it.get_len() < shortest_interval.get_len())
4508 shortest_interval = it;
4509 }
4510 clean_offsets.erase(shortest_interval);
4511 }
4512 }
4513
4514 void ObjectCleanRegions::merge(const ObjectCleanRegions &other)
4515 {
4516 clean_offsets.intersection_of(other.clean_offsets);
4517 clean_omap = clean_omap && other.clean_omap;
4518 trim();
4519 }
4520
4521 void ObjectCleanRegions::mark_data_region_dirty(uint64_t offset, uint64_t len)
4522 {
4523 interval_set<uint64_t> clean_region;
4524 clean_region.insert(0, (uint64_t)-1);
4525 clean_region.erase(offset, len);
4526 clean_offsets.intersection_of(clean_region);
4527 trim();
4528 }
4529
4530 void ObjectCleanRegions::mark_omap_dirty()
4531 {
4532 clean_omap = false;
4533 }
4534
4535 void ObjectCleanRegions::mark_object_new()
4536 {
4537 new_object = true;
4538 }
4539
4540 void ObjectCleanRegions::mark_fully_dirty()
4541 {
4542 mark_data_region_dirty(0, (uint64_t)-1);
4543 mark_omap_dirty();
4544 mark_object_new();
4545 }
4546
4547 interval_set<uint64_t> ObjectCleanRegions::get_dirty_regions() const
4548 {
4549 interval_set<uint64_t> dirty_region;
4550 dirty_region.insert(0, (uint64_t)-1);
4551 dirty_region.subtract(clean_offsets);
4552 return dirty_region;
4553 }
4554
4555 bool ObjectCleanRegions::omap_is_dirty() const
4556 {
4557 return !clean_omap;
4558 }
4559
4560 bool ObjectCleanRegions::object_is_exist() const
4561 {
4562 return !new_object;
4563 }
4564
4565 void ObjectCleanRegions::encode(bufferlist &bl) const
4566 {
4567 ENCODE_START(1, 1, bl);
4568 using ceph::encode;
4569 encode(clean_offsets, bl);
4570 encode(clean_omap, bl);
4571 encode(new_object, bl);
4572 ENCODE_FINISH(bl);
4573 }
4574
4575 void ObjectCleanRegions::decode(bufferlist::const_iterator &bl)
4576 {
4577 DECODE_START(1, bl);
4578 using ceph::decode;
4579 decode(clean_offsets, bl);
4580 decode(clean_omap, bl);
4581 decode(new_object, bl);
4582 DECODE_FINISH(bl);
4583 }
4584
4585 void ObjectCleanRegions::dump(Formatter *f) const
4586 {
4587 f->open_object_section("object_clean_regions");
4588 f->dump_stream("clean_offsets") << clean_offsets;
4589 f->dump_bool("clean_omap", clean_omap);
4590 f->dump_bool("new_object", new_object);
4591 f->close_section();
4592 }
4593
4594 void ObjectCleanRegions::generate_test_instances(list<ObjectCleanRegions*>& o)
4595 {
4596 o.push_back(new ObjectCleanRegions());
4597 o.push_back(new ObjectCleanRegions());
4598 o.back()->mark_data_region_dirty(4096, 40960);
4599 o.back()->mark_omap_dirty();
4600 o.back()->mark_object_new();
4601 }
4602
4603 ostream& operator<<(ostream& out, const ObjectCleanRegions& ocr)
4604 {
4605 return out << "clean_offsets: " << ocr.clean_offsets
4606 << ", clean_omap: " << ocr.clean_omap
4607 << ", new_object: " << ocr.new_object;
4608 }
4609
4610 // -- pg_log_entry_t --
4611
4612 string pg_log_entry_t::get_key_name() const
4613 {
4614 return version.get_key_name();
4615 }
4616
4617 void pg_log_entry_t::encode_with_checksum(ceph::buffer::list& bl) const
4618 {
4619 using ceph::encode;
4620 ceph::buffer::list ebl(sizeof(*this)*2);
4621 this->encode(ebl);
4622 __u32 crc = ebl.crc32c(0);
4623 encode(ebl, bl);
4624 encode(crc, bl);
4625 }
4626
4627 void pg_log_entry_t::decode_with_checksum(ceph::buffer::list::const_iterator& p)
4628 {
4629 using ceph::decode;
4630 ceph::buffer::list bl;
4631 decode(bl, p);
4632 __u32 crc;
4633 decode(crc, p);
4634 if (crc != bl.crc32c(0))
4635 throw ceph::buffer::malformed_input("bad checksum on pg_log_entry_t");
4636 auto q = bl.cbegin();
4637 this->decode(q);
4638 }
4639
4640 void pg_log_entry_t::encode(ceph::buffer::list &bl) const
4641 {
4642 ENCODE_START(14, 4, bl);
4643 encode(op, bl);
4644 encode(soid, bl);
4645 encode(version, bl);
4646
4647 /**
4648 * Added with reverting_to:
4649 * Previous code used prior_version to encode
4650 * what we now call reverting_to. This will
4651 * allow older code to decode reverting_to
4652 * into prior_version as expected.
4653 */
4654 if (op == LOST_REVERT)
4655 encode(reverting_to, bl);
4656 else
4657 encode(prior_version, bl);
4658
4659 encode(reqid, bl);
4660 encode(mtime, bl);
4661 if (op == LOST_REVERT)
4662 encode(prior_version, bl);
4663 encode(snaps, bl);
4664 encode(user_version, bl);
4665 encode(mod_desc, bl);
4666 encode(extra_reqids, bl);
4667 if (op == ERROR)
4668 encode(return_code, bl);
4669 if (!extra_reqids.empty())
4670 encode(extra_reqid_return_codes, bl);
4671 encode(clean_regions, bl);
4672 if (op != ERROR)
4673 encode(return_code, bl);
4674 encode(op_returns, bl);
4675 ENCODE_FINISH(bl);
4676 }
4677
4678 void pg_log_entry_t::decode(ceph::buffer::list::const_iterator &bl)
4679 {
4680 DECODE_START_LEGACY_COMPAT_LEN(14, 4, 4, bl);
4681 decode(op, bl);
4682 if (struct_v < 2) {
4683 sobject_t old_soid;
4684 decode(old_soid, bl);
4685 soid.oid = old_soid.oid;
4686 soid.snap = old_soid.snap;
4687 invalid_hash = true;
4688 } else {
4689 decode(soid, bl);
4690 }
4691 if (struct_v < 3)
4692 invalid_hash = true;
4693 decode(version, bl);
4694
4695 if (struct_v >= 6 && op == LOST_REVERT)
4696 decode(reverting_to, bl);
4697 else
4698 decode(prior_version, bl);
4699
4700 decode(reqid, bl);
4701
4702 decode(mtime, bl);
4703 if (struct_v < 5)
4704 invalid_pool = true;
4705
4706 if (op == LOST_REVERT) {
4707 if (struct_v >= 6) {
4708 decode(prior_version, bl);
4709 } else {
4710 reverting_to = prior_version;
4711 }
4712 }
4713 if (struct_v >= 7 || // for v >= 7, this is for all ops.
4714 op == CLONE) { // for v < 7, it's only present for CLONE.
4715 decode(snaps, bl);
4716 // ensure snaps does not pin a larger ceph::buffer in memory
4717 snaps.rebuild();
4718 snaps.reassign_to_mempool(mempool::mempool_osd_pglog);
4719 }
4720
4721 if (struct_v >= 8)
4722 decode(user_version, bl);
4723 else
4724 user_version = version.version;
4725
4726 if (struct_v >= 9)
4727 decode(mod_desc, bl);
4728 else
4729 mod_desc.mark_unrollbackable();
4730 if (struct_v >= 10)
4731 decode(extra_reqids, bl);
4732 if (struct_v >= 11 && op == ERROR)
4733 decode(return_code, bl);
4734 if (struct_v >= 12 && !extra_reqids.empty())
4735 decode(extra_reqid_return_codes, bl);
4736 if (struct_v >= 13)
4737 decode(clean_regions, bl);
4738 else
4739 clean_regions.mark_fully_dirty();
4740 if (struct_v >= 14) {
4741 if (op != ERROR) {
4742 decode(return_code, bl);
4743 }
4744 decode(op_returns, bl);
4745 }
4746 DECODE_FINISH(bl);
4747 }
4748
4749 void pg_log_entry_t::dump(Formatter *f) const
4750 {
4751 f->dump_string("op", get_op_name());
4752 f->dump_stream("object") << soid;
4753 f->dump_stream("version") << version;
4754 f->dump_stream("prior_version") << prior_version;
4755 f->dump_stream("reqid") << reqid;
4756 f->open_array_section("extra_reqids");
4757 uint32_t idx = 0;
4758 for (auto p = extra_reqids.begin();
4759 p != extra_reqids.end();
4760 ++idx, ++p) {
4761 f->open_object_section("extra_reqid");
4762 f->dump_stream("reqid") << p->first;
4763 f->dump_stream("user_version") << p->second;
4764 auto it = extra_reqid_return_codes.find(idx);
4765 if (it != extra_reqid_return_codes.end()) {
4766 f->dump_int("return_code", it->second);
4767 }
4768 f->close_section();
4769 }
4770 f->close_section();
4771 f->dump_stream("mtime") << mtime;
4772 f->dump_int("return_code", return_code);
4773 if (!op_returns.empty()) {
4774 f->open_array_section("op_returns");
4775 for (auto& i : op_returns) {
4776 f->dump_object("op", i);
4777 }
4778 f->close_section();
4779 }
4780 if (snaps.length() > 0) {
4781 vector<snapid_t> v;
4782 ceph::buffer::list c = snaps;
4783 auto p = c.cbegin();
4784 try {
4785 using ceph::decode;
4786 decode(v, p);
4787 } catch (...) {
4788 v.clear();
4789 }
4790 f->open_object_section("snaps");
4791 for (auto p = v.begin(); p != v.end(); ++p)
4792 f->dump_unsigned("snap", *p);
4793 f->close_section();
4794 }
4795 {
4796 f->open_object_section("mod_desc");
4797 mod_desc.dump(f);
4798 f->close_section();
4799 }
4800 {
4801 f->open_object_section("clean_regions");
4802 clean_regions.dump(f);
4803 f->close_section();
4804 }
4805 }
4806
4807 void pg_log_entry_t::generate_test_instances(list<pg_log_entry_t*>& o)
4808 {
4809 o.push_back(new pg_log_entry_t());
4810 hobject_t oid(object_t("objname"), "key", 123, 456, 0, "");
4811 o.push_back(new pg_log_entry_t(MODIFY, oid, eversion_t(1,2), eversion_t(3,4),
4812 1, osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
4813 utime_t(8,9), 0));
4814 o.push_back(new pg_log_entry_t(ERROR, oid, eversion_t(1,2), eversion_t(3,4),
4815 1, osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
4816 utime_t(8,9), -ENOENT));
4817 }
4818
4819 ostream& operator<<(ostream& out, const pg_log_entry_t& e)
4820 {
4821 out << e.version << " (" << e.prior_version << ") "
4822 << std::left << std::setw(8) << e.get_op_name() << ' '
4823 << e.soid << " by " << e.reqid << " " << e.mtime
4824 << " " << e.return_code;
4825 if (!e.op_returns.empty()) {
4826 out << " " << e.op_returns;
4827 }
4828 if (e.snaps.length()) {
4829 vector<snapid_t> snaps;
4830 ceph::buffer::list c = e.snaps;
4831 auto p = c.cbegin();
4832 try {
4833 decode(snaps, p);
4834 } catch (...) {
4835 snaps.clear();
4836 }
4837 out << " snaps " << snaps;
4838 }
4839 out << " ObjectCleanRegions " << e.clean_regions;
4840 return out;
4841 }
4842
4843 // -- pg_log_dup_t --
4844
4845 std::string pg_log_dup_t::get_key_name() const
4846 {
4847 static const char prefix[] = "dup_";
4848 std::string key(36, ' ');
4849 memcpy(&key[0], prefix, 4);
4850 version.get_key_name(&key[4]);
4851 key.resize(35); // remove the null terminator
4852 return key;
4853 }
4854
4855 void pg_log_dup_t::encode(ceph::buffer::list &bl) const
4856 {
4857 ENCODE_START(2, 1, bl);
4858 encode(reqid, bl);
4859 encode(version, bl);
4860 encode(user_version, bl);
4861 encode(return_code, bl);
4862 encode(op_returns, bl);
4863 ENCODE_FINISH(bl);
4864 }
4865
4866 void pg_log_dup_t::decode(ceph::buffer::list::const_iterator &bl)
4867 {
4868 DECODE_START(2, bl);
4869 decode(reqid, bl);
4870 decode(version, bl);
4871 decode(user_version, bl);
4872 decode(return_code, bl);
4873 if (struct_v >= 2) {
4874 decode(op_returns, bl);
4875 }
4876 DECODE_FINISH(bl);
4877 }
4878
4879 void pg_log_dup_t::dump(Formatter *f) const
4880 {
4881 f->dump_stream("reqid") << reqid;
4882 f->dump_stream("version") << version;
4883 f->dump_stream("user_version") << user_version;
4884 f->dump_stream("return_code") << return_code;
4885 if (!op_returns.empty()) {
4886 f->open_array_section("op_returns");
4887 for (auto& i : op_returns) {
4888 f->dump_object("op", i);
4889 }
4890 f->close_section();
4891 }
4892 }
4893
4894 void pg_log_dup_t::generate_test_instances(list<pg_log_dup_t*>& o)
4895 {
4896 o.push_back(new pg_log_dup_t());
4897 o.push_back(new pg_log_dup_t(eversion_t(1,2),
4898 1,
4899 osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
4900 0));
4901 o.push_back(new pg_log_dup_t(eversion_t(1,2),
4902 2,
4903 osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
4904 -ENOENT));
4905 }
4906
4907
4908 std::ostream& operator<<(std::ostream& out, const pg_log_dup_t& e) {
4909 out << "log_dup(reqid=" << e.reqid <<
4910 " v=" << e.version << " uv=" << e.user_version <<
4911 " rc=" << e.return_code;
4912 if (!e.op_returns.empty()) {
4913 out << " " << e.op_returns;
4914 }
4915 return out << ")";
4916 }
4917
4918
4919 // -- pg_log_t --
4920
4921 // out: pg_log_t that only has entries that apply to import_pgid using curmap
4922 // reject: Entries rejected from "in" are in the reject.log. Other fields not set.
4923 void pg_log_t::filter_log(spg_t import_pgid, const OSDMap &curmap,
4924 const string &hit_set_namespace, const pg_log_t &in,
4925 pg_log_t &out, pg_log_t &reject)
4926 {
4927 out = in;
4928 out.log.clear();
4929 reject.log.clear();
4930
4931 for (auto i = in.log.cbegin(); i != in.log.cend(); ++i) {
4932
4933 // Reject pg log entries for temporary objects
4934 if (i->soid.is_temp()) {
4935 reject.log.push_back(*i);
4936 continue;
4937 }
4938
4939 if (i->soid.nspace != hit_set_namespace) {
4940 object_t oid = i->soid.oid;
4941 object_locator_t loc(i->soid);
4942 pg_t raw_pgid = curmap.object_locator_to_pg(oid, loc);
4943 pg_t pgid = curmap.raw_pg_to_pg(raw_pgid);
4944
4945 if (import_pgid.pgid == pgid) {
4946 out.log.push_back(*i);
4947 } else {
4948 reject.log.push_back(*i);
4949 }
4950 } else {
4951 out.log.push_back(*i);
4952 }
4953 }
4954 }
4955
4956 void pg_log_t::encode(ceph::buffer::list& bl) const
4957 {
4958 ENCODE_START(7, 3, bl);
4959 encode(head, bl);
4960 encode(tail, bl);
4961 encode(log, bl);
4962 encode(can_rollback_to, bl);
4963 encode(rollback_info_trimmed_to, bl);
4964 encode(dups, bl);
4965 ENCODE_FINISH(bl);
4966 }
4967
4968 void pg_log_t::decode(ceph::buffer::list::const_iterator &bl, int64_t pool)
4969 {
4970 DECODE_START_LEGACY_COMPAT_LEN(7, 3, 3, bl);
4971 decode(head, bl);
4972 decode(tail, bl);
4973 if (struct_v < 2) {
4974 bool backlog;
4975 decode(backlog, bl);
4976 }
4977 decode(log, bl);
4978 if (struct_v >= 5)
4979 decode(can_rollback_to, bl);
4980
4981 if (struct_v >= 6)
4982 decode(rollback_info_trimmed_to, bl);
4983 else
4984 rollback_info_trimmed_to = tail;
4985
4986 if (struct_v >= 7)
4987 decode(dups, bl);
4988
4989 DECODE_FINISH(bl);
4990
4991 // handle hobject_t format change
4992 if (struct_v < 4) {
4993 for (auto i = log.begin(); i != log.end(); ++i) {
4994 if (!i->soid.is_max() && i->soid.pool == -1)
4995 i->soid.pool = pool;
4996 }
4997 }
4998 }
4999
5000 void pg_log_t::dump(Formatter *f) const
5001 {
5002 f->dump_stream("head") << head;
5003 f->dump_stream("tail") << tail;
5004 f->open_array_section("log");
5005 for (auto p = log.cbegin(); p != log.cend(); ++p) {
5006 f->open_object_section("entry");
5007 p->dump(f);
5008 f->close_section();
5009 }
5010 f->close_section();
5011 f->open_array_section("dups");
5012 for (const auto& entry : dups) {
5013 f->open_object_section("entry");
5014 entry.dump(f);
5015 f->close_section();
5016 }
5017 f->close_section();
5018 }
5019
5020 void pg_log_t::generate_test_instances(list<pg_log_t*>& o)
5021 {
5022 o.push_back(new pg_log_t);
5023
5024 // this is nonsensical:
5025 o.push_back(new pg_log_t);
5026 o.back()->head = eversion_t(1,2);
5027 o.back()->tail = eversion_t(3,4);
5028 list<pg_log_entry_t*> e;
5029 pg_log_entry_t::generate_test_instances(e);
5030 for (auto p = e.begin(); p != e.end(); ++p)
5031 o.back()->log.push_back(**p);
5032 }
5033
5034 static void _handle_dups(CephContext* cct, pg_log_t &target, const pg_log_t &other, unsigned maxdups)
5035 {
5036 auto earliest_dup_version =
5037 target.head.version < maxdups ? 0u : target.head.version - maxdups + 1;
5038 lgeneric_subdout(cct, osd, 20) << "copy_up_to/copy_after earliest_dup_version " << earliest_dup_version << dendl;
5039
5040 for (auto d = other.dups.cbegin(); d != other.dups.cend(); ++d) {
5041 if (d->version.version >= earliest_dup_version) {
5042 lgeneric_subdout(cct, osd, 20)
5043 << "copy_up_to/copy_after copy dup version "
5044 << d->version << dendl;
5045 target.dups.push_back(pg_log_dup_t(*d));
5046 }
5047 }
5048
5049 for (auto i = other.log.cbegin(); i != other.log.cend(); ++i) {
5050 ceph_assert(i->version > other.tail);
5051 if (i->version > target.tail)
5052 break;
5053 if (i->version.version >= earliest_dup_version) {
5054 lgeneric_subdout(cct, osd, 20)
5055 << "copy_up_to/copy_after copy dup from log version "
5056 << i->version << dendl;
5057 target.dups.push_back(pg_log_dup_t(*i));
5058 }
5059 }
5060 }
5061
5062
5063 void pg_log_t::copy_after(CephContext* cct, const pg_log_t &other, eversion_t v)
5064 {
5065 can_rollback_to = other.can_rollback_to;
5066 head = other.head;
5067 tail = other.tail;
5068 lgeneric_subdout(cct, osd, 20) << __func__ << " v " << v << dendl;
5069 for (auto i = other.log.crbegin(); i != other.log.crend(); ++i) {
5070 ceph_assert(i->version > other.tail);
5071 if (i->version <= v) {
5072 // make tail accurate.
5073 tail = i->version;
5074 break;
5075 }
5076 lgeneric_subdout(cct, osd, 20) << __func__ << " copy log version " << i->version << dendl;
5077 log.push_front(*i);
5078 }
5079 _handle_dups(cct, *this, other, cct->_conf->osd_pg_log_dups_tracked);
5080 }
5081
5082 void pg_log_t::copy_up_to(CephContext* cct, const pg_log_t &other, int max)
5083 {
5084 can_rollback_to = other.can_rollback_to;
5085 int n = 0;
5086 head = other.head;
5087 tail = other.tail;
5088 lgeneric_subdout(cct, osd, 20) << __func__ << " max " << max << dendl;
5089 for (auto i = other.log.crbegin(); i != other.log.crend(); ++i) {
5090 ceph_assert(i->version > other.tail);
5091 if (n++ >= max) {
5092 tail = i->version;
5093 break;
5094 }
5095 lgeneric_subdout(cct, osd, 20) << __func__ << " copy log version " << i->version << dendl;
5096 log.push_front(*i);
5097 }
5098 _handle_dups(cct, *this, other, cct->_conf->osd_pg_log_dups_tracked);
5099 }
5100
5101 ostream& pg_log_t::print(ostream& out) const
5102 {
5103 out << *this << std::endl;
5104 for (auto p = log.cbegin(); p != log.cend(); ++p)
5105 out << *p << std::endl;
5106 for (const auto& entry : dups) {
5107 out << " dup entry: " << entry << std::endl;
5108 }
5109 return out;
5110 }
5111
5112 // -- pg_missing_t --
5113
5114 ostream& operator<<(ostream& out, const pg_missing_item& i)
5115 {
5116 out << i.need;
5117 if (i.have != eversion_t())
5118 out << "(" << i.have << ")";
5119 out << " flags = " << i.flag_str()
5120 << " " << i.clean_regions;
5121 return out;
5122 }
5123
5124 // -- object_copy_cursor_t --
5125
5126 void object_copy_cursor_t::encode(ceph::buffer::list& bl) const
5127 {
5128 ENCODE_START(1, 1, bl);
5129 encode(attr_complete, bl);
5130 encode(data_offset, bl);
5131 encode(data_complete, bl);
5132 encode(omap_offset, bl);
5133 encode(omap_complete, bl);
5134 ENCODE_FINISH(bl);
5135 }
5136
5137 void object_copy_cursor_t::decode(ceph::buffer::list::const_iterator &bl)
5138 {
5139 DECODE_START(1, bl);
5140 decode(attr_complete, bl);
5141 decode(data_offset, bl);
5142 decode(data_complete, bl);
5143 decode(omap_offset, bl);
5144 decode(omap_complete, bl);
5145 DECODE_FINISH(bl);
5146 }
5147
5148 void object_copy_cursor_t::dump(Formatter *f) const
5149 {
5150 f->dump_unsigned("attr_complete", (int)attr_complete);
5151 f->dump_unsigned("data_offset", data_offset);
5152 f->dump_unsigned("data_complete", (int)data_complete);
5153 f->dump_string("omap_offset", omap_offset);
5154 f->dump_unsigned("omap_complete", (int)omap_complete);
5155 }
5156
5157 void object_copy_cursor_t::generate_test_instances(list<object_copy_cursor_t*>& o)
5158 {
5159 o.push_back(new object_copy_cursor_t);
5160 o.push_back(new object_copy_cursor_t);
5161 o.back()->attr_complete = true;
5162 o.back()->data_offset = 123;
5163 o.push_back(new object_copy_cursor_t);
5164 o.back()->attr_complete = true;
5165 o.back()->data_complete = true;
5166 o.back()->omap_offset = "foo";
5167 o.push_back(new object_copy_cursor_t);
5168 o.back()->attr_complete = true;
5169 o.back()->data_complete = true;
5170 o.back()->omap_complete = true;
5171 }
5172
5173 // -- object_copy_data_t --
5174
5175 void object_copy_data_t::encode(ceph::buffer::list& bl, uint64_t features) const
5176 {
5177 ENCODE_START(8, 5, bl);
5178 encode(size, bl);
5179 encode(mtime, bl);
5180 encode(attrs, bl);
5181 encode(data, bl);
5182 encode(omap_data, bl);
5183 encode(cursor, bl);
5184 encode(omap_header, bl);
5185 encode(snaps, bl);
5186 encode(snap_seq, bl);
5187 encode(flags, bl);
5188 encode(data_digest, bl);
5189 encode(omap_digest, bl);
5190 encode(reqids, bl);
5191 encode(truncate_seq, bl);
5192 encode(truncate_size, bl);
5193 encode(reqid_return_codes, bl);
5194 ENCODE_FINISH(bl);
5195 }
5196
5197 void object_copy_data_t::decode(ceph::buffer::list::const_iterator& bl)
5198 {
5199 DECODE_START(8, bl);
5200 if (struct_v < 5) {
5201 // old
5202 decode(size, bl);
5203 decode(mtime, bl);
5204 {
5205 string category;
5206 decode(category, bl); // no longer used
5207 }
5208 decode(attrs, bl);
5209 decode(data, bl);
5210 {
5211 map<string,ceph::buffer::list> omap;
5212 decode(omap, bl);
5213 omap_data.clear();
5214 if (!omap.empty()) {
5215 using ceph::encode;
5216 encode(omap, omap_data);
5217 }
5218 }
5219 decode(cursor, bl);
5220 if (struct_v >= 2)
5221 decode(omap_header, bl);
5222 if (struct_v >= 3) {
5223 decode(snaps, bl);
5224 decode(snap_seq, bl);
5225 } else {
5226 snaps.clear();
5227 snap_seq = 0;
5228 }
5229 if (struct_v >= 4) {
5230 decode(flags, bl);
5231 decode(data_digest, bl);
5232 decode(omap_digest, bl);
5233 }
5234 } else {
5235 // current
5236 decode(size, bl);
5237 decode(mtime, bl);
5238 decode(attrs, bl);
5239 decode(data, bl);
5240 decode(omap_data, bl);
5241 decode(cursor, bl);
5242 decode(omap_header, bl);
5243 decode(snaps, bl);
5244 decode(snap_seq, bl);
5245 if (struct_v >= 4) {
5246 decode(flags, bl);
5247 decode(data_digest, bl);
5248 decode(omap_digest, bl);
5249 }
5250 if (struct_v >= 6) {
5251 decode(reqids, bl);
5252 }
5253 if (struct_v >= 7) {
5254 decode(truncate_seq, bl);
5255 decode(truncate_size, bl);
5256 }
5257 if (struct_v >= 8) {
5258 decode(reqid_return_codes, bl);
5259 }
5260 }
5261 DECODE_FINISH(bl);
5262 }
5263
5264 void object_copy_data_t::generate_test_instances(list<object_copy_data_t*>& o)
5265 {
5266 o.push_back(new object_copy_data_t());
5267
5268 list<object_copy_cursor_t*> cursors;
5269 object_copy_cursor_t::generate_test_instances(cursors);
5270 auto ci = cursors.begin();
5271 o.back()->cursor = **(ci++);
5272
5273 o.push_back(new object_copy_data_t());
5274 o.back()->cursor = **(ci++);
5275
5276 o.push_back(new object_copy_data_t());
5277 o.back()->size = 1234;
5278 o.back()->mtime.set_from_double(1234);
5279 ceph::buffer::ptr bp("there", 5);
5280 ceph::buffer::list bl;
5281 bl.push_back(bp);
5282 o.back()->attrs["hello"] = bl;
5283 ceph::buffer::ptr bp2("not", 3);
5284 ceph::buffer::list bl2;
5285 bl2.push_back(bp2);
5286 map<string,ceph::buffer::list> omap;
5287 omap["why"] = bl2;
5288 using ceph::encode;
5289 encode(omap, o.back()->omap_data);
5290 ceph::buffer::ptr databp("iamsomedatatocontain", 20);
5291 o.back()->data.push_back(databp);
5292 o.back()->omap_header.append("this is an omap header");
5293 o.back()->snaps.push_back(123);
5294 o.back()->reqids.push_back(make_pair(osd_reqid_t(), version_t()));
5295 }
5296
5297 void object_copy_data_t::dump(Formatter *f) const
5298 {
5299 f->open_object_section("cursor");
5300 cursor.dump(f);
5301 f->close_section(); // cursor
5302 f->dump_int("size", size);
5303 f->dump_stream("mtime") << mtime;
5304 /* we should really print out the attrs here, but ceph::buffer::list
5305 const-correctness prevents that */
5306 f->dump_int("attrs_size", attrs.size());
5307 f->dump_int("flags", flags);
5308 f->dump_unsigned("data_digest", data_digest);
5309 f->dump_unsigned("omap_digest", omap_digest);
5310 f->dump_int("omap_data_length", omap_data.length());
5311 f->dump_int("omap_header_length", omap_header.length());
5312 f->dump_int("data_length", data.length());
5313 f->open_array_section("snaps");
5314 for (auto p = snaps.cbegin(); p != snaps.cend(); ++p)
5315 f->dump_unsigned("snap", *p);
5316 f->close_section();
5317 f->open_array_section("reqids");
5318 uint32_t idx = 0;
5319 for (auto p = reqids.begin();
5320 p != reqids.end();
5321 ++idx, ++p) {
5322 f->open_object_section("extra_reqid");
5323 f->dump_stream("reqid") << p->first;
5324 f->dump_stream("user_version") << p->second;
5325 auto it = reqid_return_codes.find(idx);
5326 if (it != reqid_return_codes.end()) {
5327 f->dump_int("return_code", it->second);
5328 }
5329 f->close_section();
5330 }
5331 f->close_section();
5332 }
5333
5334 // -- pg_create_t --
5335
5336 void pg_create_t::encode(ceph::buffer::list &bl) const
5337 {
5338 ENCODE_START(1, 1, bl);
5339 encode(created, bl);
5340 encode(parent, bl);
5341 encode(split_bits, bl);
5342 ENCODE_FINISH(bl);
5343 }
5344
5345 void pg_create_t::decode(ceph::buffer::list::const_iterator &bl)
5346 {
5347 DECODE_START(1, bl);
5348 decode(created, bl);
5349 decode(parent, bl);
5350 decode(split_bits, bl);
5351 DECODE_FINISH(bl);
5352 }
5353
5354 void pg_create_t::dump(Formatter *f) const
5355 {
5356 f->dump_unsigned("created", created);
5357 f->dump_stream("parent") << parent;
5358 f->dump_int("split_bits", split_bits);
5359 }
5360
5361 void pg_create_t::generate_test_instances(list<pg_create_t*>& o)
5362 {
5363 o.push_back(new pg_create_t);
5364 o.push_back(new pg_create_t(1, pg_t(3, 4), 2));
5365 }
5366
5367
5368 // -- pg_hit_set_info_t --
5369
5370 void pg_hit_set_info_t::encode(ceph::buffer::list& bl) const
5371 {
5372 ENCODE_START(2, 1, bl);
5373 encode(begin, bl);
5374 encode(end, bl);
5375 encode(version, bl);
5376 encode(using_gmt, bl);
5377 ENCODE_FINISH(bl);
5378 }
5379
5380 void pg_hit_set_info_t::decode(ceph::buffer::list::const_iterator& p)
5381 {
5382 DECODE_START(2, p);
5383 decode(begin, p);
5384 decode(end, p);
5385 decode(version, p);
5386 if (struct_v >= 2) {
5387 decode(using_gmt, p);
5388 } else {
5389 using_gmt = false;
5390 }
5391 DECODE_FINISH(p);
5392 }
5393
5394 void pg_hit_set_info_t::dump(Formatter *f) const
5395 {
5396 f->dump_stream("begin") << begin;
5397 f->dump_stream("end") << end;
5398 f->dump_stream("version") << version;
5399 f->dump_stream("using_gmt") << using_gmt;
5400 }
5401
5402 void pg_hit_set_info_t::generate_test_instances(list<pg_hit_set_info_t*>& ls)
5403 {
5404 ls.push_back(new pg_hit_set_info_t);
5405 ls.push_back(new pg_hit_set_info_t);
5406 ls.back()->begin = utime_t(1, 2);
5407 ls.back()->end = utime_t(3, 4);
5408 }
5409
5410
5411 // -- pg_hit_set_history_t --
5412
5413 void pg_hit_set_history_t::encode(ceph::buffer::list& bl) const
5414 {
5415 ENCODE_START(1, 1, bl);
5416 encode(current_last_update, bl);
5417 {
5418 utime_t dummy_stamp;
5419 encode(dummy_stamp, bl);
5420 }
5421 {
5422 pg_hit_set_info_t dummy_info;
5423 encode(dummy_info, bl);
5424 }
5425 encode(history, bl);
5426 ENCODE_FINISH(bl);
5427 }
5428
5429 void pg_hit_set_history_t::decode(ceph::buffer::list::const_iterator& p)
5430 {
5431 DECODE_START(1, p);
5432 decode(current_last_update, p);
5433 {
5434 utime_t dummy_stamp;
5435 decode(dummy_stamp, p);
5436 }
5437 {
5438 pg_hit_set_info_t dummy_info;
5439 decode(dummy_info, p);
5440 }
5441 decode(history, p);
5442 DECODE_FINISH(p);
5443 }
5444
5445 void pg_hit_set_history_t::dump(Formatter *f) const
5446 {
5447 f->dump_stream("current_last_update") << current_last_update;
5448 f->open_array_section("history");
5449 for (auto p = history.cbegin(); p != history.cend(); ++p) {
5450 f->open_object_section("info");
5451 p->dump(f);
5452 f->close_section();
5453 }
5454 f->close_section();
5455 }
5456
5457 void pg_hit_set_history_t::generate_test_instances(list<pg_hit_set_history_t*>& ls)
5458 {
5459 ls.push_back(new pg_hit_set_history_t);
5460 ls.push_back(new pg_hit_set_history_t);
5461 ls.back()->current_last_update = eversion_t(1, 2);
5462 ls.back()->history.push_back(pg_hit_set_info_t());
5463 }
5464
5465 // -- OSDSuperblock --
5466
5467 void OSDSuperblock::encode(ceph::buffer::list &bl) const
5468 {
5469 ENCODE_START(9, 5, bl);
5470 encode(cluster_fsid, bl);
5471 encode(whoami, bl);
5472 encode(current_epoch, bl);
5473 encode(oldest_map, bl);
5474 encode(newest_map, bl);
5475 encode(weight, bl);
5476 compat_features.encode(bl);
5477 encode(clean_thru, bl);
5478 encode(mounted, bl);
5479 encode(osd_fsid, bl);
5480 encode((epoch_t)0, bl); // epoch_t last_epoch_marked_full
5481 encode((uint32_t)0, bl); // map<int64_t,epoch_t> pool_last_epoch_marked_full
5482 encode(purged_snaps_last, bl);
5483 encode(last_purged_snaps_scrub, bl);
5484 ENCODE_FINISH(bl);
5485 }
5486
5487 void OSDSuperblock::decode(ceph::buffer::list::const_iterator &bl)
5488 {
5489 DECODE_START_LEGACY_COMPAT_LEN(9, 5, 5, bl);
5490 if (struct_v < 3) {
5491 string magic;
5492 decode(magic, bl);
5493 }
5494 decode(cluster_fsid, bl);
5495 decode(whoami, bl);
5496 decode(current_epoch, bl);
5497 decode(oldest_map, bl);
5498 decode(newest_map, bl);
5499 decode(weight, bl);
5500 if (struct_v >= 2) {
5501 compat_features.decode(bl);
5502 } else { //upgrade it!
5503 compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
5504 }
5505 decode(clean_thru, bl);
5506 decode(mounted, bl);
5507 if (struct_v >= 4)
5508 decode(osd_fsid, bl);
5509 if (struct_v >= 6) {
5510 epoch_t last_map_marked_full;
5511 decode(last_map_marked_full, bl);
5512 }
5513 if (struct_v >= 7) {
5514 map<int64_t,epoch_t> pool_last_map_marked_full;
5515 decode(pool_last_map_marked_full, bl);
5516 }
5517 if (struct_v >= 9) {
5518 decode(purged_snaps_last, bl);
5519 decode(last_purged_snaps_scrub, bl);
5520 } else {
5521 purged_snaps_last = 0;
5522 }
5523 DECODE_FINISH(bl);
5524 }
5525
5526 void OSDSuperblock::dump(Formatter *f) const
5527 {
5528 f->dump_stream("cluster_fsid") << cluster_fsid;
5529 f->dump_stream("osd_fsid") << osd_fsid;
5530 f->dump_int("whoami", whoami);
5531 f->dump_int("current_epoch", current_epoch);
5532 f->dump_int("oldest_map", oldest_map);
5533 f->dump_int("newest_map", newest_map);
5534 f->dump_float("weight", weight);
5535 f->open_object_section("compat");
5536 compat_features.dump(f);
5537 f->close_section();
5538 f->dump_int("clean_thru", clean_thru);
5539 f->dump_int("last_epoch_mounted", mounted);
5540 f->dump_unsigned("purged_snaps_last", purged_snaps_last);
5541 f->dump_stream("last_purged_snaps_scrub") << last_purged_snaps_scrub;
5542 }
5543
5544 void OSDSuperblock::generate_test_instances(list<OSDSuperblock*>& o)
5545 {
5546 OSDSuperblock z;
5547 o.push_back(new OSDSuperblock(z));
5548 z.cluster_fsid.parse("01010101-0101-0101-0101-010101010101");
5549 z.osd_fsid.parse("02020202-0202-0202-0202-020202020202");
5550 z.whoami = 3;
5551 z.current_epoch = 4;
5552 z.oldest_map = 5;
5553 z.newest_map = 9;
5554 z.mounted = 8;
5555 z.clean_thru = 7;
5556 o.push_back(new OSDSuperblock(z));
5557 o.push_back(new OSDSuperblock(z));
5558 }
5559
5560 // -- SnapSet --
5561
5562 void SnapSet::encode(ceph::buffer::list& bl) const
5563 {
5564 ENCODE_START(3, 2, bl);
5565 encode(seq, bl);
5566 encode(true, bl); // head_exists
5567 encode(snaps, bl);
5568 encode(clones, bl);
5569 encode(clone_overlap, bl);
5570 encode(clone_size, bl);
5571 encode(clone_snaps, bl);
5572 ENCODE_FINISH(bl);
5573 }
5574
5575 void SnapSet::decode(ceph::buffer::list::const_iterator& bl)
5576 {
5577 DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
5578 decode(seq, bl);
5579 bl.advance(1u); // skip legacy head_exists (always true)
5580 decode(snaps, bl);
5581 decode(clones, bl);
5582 decode(clone_overlap, bl);
5583 decode(clone_size, bl);
5584 if (struct_v >= 3) {
5585 decode(clone_snaps, bl);
5586 } else {
5587 clone_snaps.clear();
5588 }
5589 DECODE_FINISH(bl);
5590 }
5591
5592 void SnapSet::dump(Formatter *f) const
5593 {
5594 f->dump_unsigned("seq", seq);
5595 f->open_array_section("clones");
5596 for (auto p = clones.cbegin(); p != clones.cend(); ++p) {
5597 f->open_object_section("clone");
5598 f->dump_unsigned("snap", *p);
5599 auto cs = clone_size.find(*p);
5600 if (cs != clone_size.end())
5601 f->dump_unsigned("size", cs->second);
5602 else
5603 f->dump_string("size", "????");
5604 auto co = clone_overlap.find(*p);
5605 if (co != clone_overlap.end())
5606 f->dump_stream("overlap") << co->second;
5607 else
5608 f->dump_stream("overlap") << "????";
5609 auto q = clone_snaps.find(*p);
5610 if (q != clone_snaps.end()) {
5611 f->open_array_section("snaps");
5612 for (auto s : q->second) {
5613 f->dump_unsigned("snap", s);
5614 }
5615 f->close_section();
5616 }
5617 f->close_section();
5618 }
5619 f->close_section();
5620 }
5621
5622 void SnapSet::generate_test_instances(list<SnapSet*>& o)
5623 {
5624 o.push_back(new SnapSet);
5625 o.push_back(new SnapSet);
5626 o.back()->seq = 123;
5627 o.back()->snaps.push_back(123);
5628 o.back()->snaps.push_back(12);
5629 o.push_back(new SnapSet);
5630 o.back()->seq = 123;
5631 o.back()->snaps.push_back(123);
5632 o.back()->snaps.push_back(12);
5633 o.back()->clones.push_back(12);
5634 o.back()->clone_size[12] = 12345;
5635 o.back()->clone_overlap[12];
5636 o.back()->clone_snaps[12] = {12, 10, 8};
5637 }
5638
5639 ostream& operator<<(ostream& out, const SnapSet& cs)
5640 {
5641 return out << cs.seq << "=" << cs.snaps << ":"
5642 << cs.clone_snaps;
5643 }
5644
5645 void SnapSet::from_snap_set(const librados::snap_set_t& ss, bool legacy)
5646 {
5647 // NOTE: our reconstruction of snaps (and the snapc) is not strictly
5648 // correct: it will not include snaps that still logically exist
5649 // but for which there was no clone that is defined. For all
5650 // practical purposes this doesn't matter, since we only use that
5651 // information to clone on the OSD, and we have already moved
5652 // forward past that part of the object history.
5653
5654 seq = ss.seq;
5655 set<snapid_t> _snaps;
5656 set<snapid_t> _clones;
5657 for (auto p = ss.clones.cbegin(); p != ss.clones.cend(); ++p) {
5658 if (p->cloneid != librados::SNAP_HEAD) {
5659 _clones.insert(p->cloneid);
5660 _snaps.insert(p->snaps.begin(), p->snaps.end());
5661 clone_size[p->cloneid] = p->size;
5662 clone_overlap[p->cloneid]; // the entry must exist, even if it's empty.
5663 for (auto q = p->overlap.cbegin(); q != p->overlap.cend(); ++q)
5664 clone_overlap[p->cloneid].insert(q->first, q->second);
5665 if (!legacy) {
5666 // p->snaps is ascending; clone_snaps is descending
5667 vector<snapid_t>& v = clone_snaps[p->cloneid];
5668 for (auto q = p->snaps.rbegin(); q != p->snaps.rend(); ++q) {
5669 v.push_back(*q);
5670 }
5671 }
5672 }
5673 }
5674
5675 // ascending
5676 clones.clear();
5677 clones.reserve(_clones.size());
5678 for (auto p = _clones.begin(); p != _clones.end(); ++p)
5679 clones.push_back(*p);
5680
5681 // descending
5682 snaps.clear();
5683 snaps.reserve(_snaps.size());
5684 for (auto p = _snaps.rbegin();
5685 p != _snaps.rend(); ++p)
5686 snaps.push_back(*p);
5687 }
5688
5689 uint64_t SnapSet::get_clone_bytes(snapid_t clone) const
5690 {
5691 ceph_assert(clone_size.count(clone));
5692 uint64_t size = clone_size.find(clone)->second;
5693 ceph_assert(clone_overlap.count(clone));
5694 const interval_set<uint64_t> &overlap = clone_overlap.find(clone)->second;
5695 ceph_assert(size >= (uint64_t)overlap.size());
5696 return size - overlap.size();
5697 }
5698
5699 void SnapSet::filter(const pg_pool_t &pinfo)
5700 {
5701 vector<snapid_t> oldsnaps;
5702 oldsnaps.swap(snaps);
5703 for (auto i = oldsnaps.cbegin(); i != oldsnaps.cend(); ++i) {
5704 if (!pinfo.is_removed_snap(*i))
5705 snaps.push_back(*i);
5706 }
5707 }
5708
5709 SnapSet SnapSet::get_filtered(const pg_pool_t &pinfo) const
5710 {
5711 SnapSet ss = *this;
5712 ss.filter(pinfo);
5713 return ss;
5714 }
5715
5716 // -- watch_info_t --
5717
5718 void watch_info_t::encode(ceph::buffer::list& bl, uint64_t features) const
5719 {
5720 ENCODE_START(4, 3, bl);
5721 encode(cookie, bl);
5722 encode(timeout_seconds, bl);
5723 encode(addr, bl, features);
5724 ENCODE_FINISH(bl);
5725 }
5726
5727 void watch_info_t::decode(ceph::buffer::list::const_iterator& bl)
5728 {
5729 DECODE_START_LEGACY_COMPAT_LEN(4, 3, 3, bl);
5730 decode(cookie, bl);
5731 if (struct_v < 2) {
5732 uint64_t ver;
5733 decode(ver, bl);
5734 }
5735 decode(timeout_seconds, bl);
5736 if (struct_v >= 4) {
5737 decode(addr, bl);
5738 }
5739 DECODE_FINISH(bl);
5740 }
5741
5742 void watch_info_t::dump(Formatter *f) const
5743 {
5744 f->dump_unsigned("cookie", cookie);
5745 f->dump_unsigned("timeout_seconds", timeout_seconds);
5746 f->open_object_section("addr");
5747 addr.dump(f);
5748 f->close_section();
5749 }
5750
5751 void watch_info_t::generate_test_instances(list<watch_info_t*>& o)
5752 {
5753 o.push_back(new watch_info_t);
5754 o.push_back(new watch_info_t);
5755 o.back()->cookie = 123;
5756 o.back()->timeout_seconds = 99;
5757 entity_addr_t ea;
5758 ea.set_type(entity_addr_t::TYPE_LEGACY);
5759 ea.set_nonce(1);
5760 ea.set_family(AF_INET);
5761 ea.set_in4_quad(0, 127);
5762 ea.set_in4_quad(1, 0);
5763 ea.set_in4_quad(2, 1);
5764 ea.set_in4_quad(3, 2);
5765 ea.set_port(2);
5766 o.back()->addr = ea;
5767 }
5768
5769 // -- chunk_info_t --
5770
5771 void chunk_info_t::encode(ceph::buffer::list& bl) const
5772 {
5773 ENCODE_START(1, 1, bl);
5774 encode(offset, bl);
5775 encode(length, bl);
5776 encode(oid, bl);
5777 __u32 _flags = flags;
5778 encode(_flags, bl);
5779 ENCODE_FINISH(bl);
5780 }
5781
5782 void chunk_info_t::decode(ceph::buffer::list::const_iterator& bl)
5783 {
5784 DECODE_START(1, bl);
5785 decode(offset, bl);
5786 decode(length, bl);
5787 decode(oid, bl);
5788 __u32 _flags;
5789 decode(_flags, bl);
5790 flags = (cflag_t)_flags;
5791 DECODE_FINISH(bl);
5792 }
5793
5794 void chunk_info_t::dump(Formatter *f) const
5795 {
5796 f->dump_unsigned("length", length);
5797 f->open_object_section("oid");
5798 oid.dump(f);
5799 f->close_section();
5800 f->dump_unsigned("flags", flags);
5801 }
5802
5803 ostream& operator<<(ostream& out, const chunk_info_t& ci)
5804 {
5805 return out << "(len: " << ci.length << " oid: " << ci.oid
5806 << " offset: " << ci.offset
5807 << " flags: " << ci.get_flag_string(ci.flags) << ")";
5808 }
5809
5810 // -- object_manifest_t --
5811
5812 void object_manifest_t::encode(ceph::buffer::list& bl) const
5813 {
5814 ENCODE_START(1, 1, bl);
5815 encode(type, bl);
5816 switch (type) {
5817 case TYPE_NONE: break;
5818 case TYPE_REDIRECT:
5819 encode(redirect_target, bl);
5820 break;
5821 case TYPE_CHUNKED:
5822 encode(chunk_map, bl);
5823 break;
5824 default:
5825 ceph_abort();
5826 }
5827 ENCODE_FINISH(bl);
5828 }
5829
5830 void object_manifest_t::decode(ceph::buffer::list::const_iterator& bl)
5831 {
5832 DECODE_START(1, bl);
5833 decode(type, bl);
5834 switch (type) {
5835 case TYPE_NONE: break;
5836 case TYPE_REDIRECT:
5837 decode(redirect_target, bl);
5838 break;
5839 case TYPE_CHUNKED:
5840 decode(chunk_map, bl);
5841 break;
5842 default:
5843 ceph_abort();
5844 }
5845 DECODE_FINISH(bl);
5846 }
5847
5848 void object_manifest_t::dump(Formatter *f) const
5849 {
5850 f->dump_unsigned("type", type);
5851 if (type == TYPE_REDIRECT) {
5852 f->open_object_section("redirect_target");
5853 redirect_target.dump(f);
5854 f->close_section();
5855 } else if (type == TYPE_CHUNKED) {
5856 f->open_array_section("chunk_map");
5857 for (auto& p : chunk_map) {
5858 f->open_object_section("chunk");
5859 f->dump_unsigned("offset", p.first);
5860 p.second.dump(f);
5861 f->close_section();
5862 }
5863 f->close_section();
5864 }
5865 }
5866
5867 void object_manifest_t::generate_test_instances(list<object_manifest_t*>& o)
5868 {
5869 o.push_back(new object_manifest_t());
5870 o.back()->type = TYPE_REDIRECT;
5871 }
5872
5873 ostream& operator<<(ostream& out, const object_manifest_t& om)
5874 {
5875 out << "manifest(" << om.get_type_name();
5876 if (om.is_redirect()) {
5877 out << " " << om.redirect_target;
5878 } else if (om.is_chunked()) {
5879 out << " " << om.chunk_map;
5880 }
5881 out << ")";
5882 return out;
5883 }
5884
5885 // -- object_info_t --
5886
5887 void object_info_t::copy_user_bits(const object_info_t& other)
5888 {
5889 // these bits are copied from head->clone.
5890 size = other.size;
5891 mtime = other.mtime;
5892 local_mtime = other.local_mtime;
5893 last_reqid = other.last_reqid;
5894 truncate_seq = other.truncate_seq;
5895 truncate_size = other.truncate_size;
5896 flags = other.flags;
5897 user_version = other.user_version;
5898 data_digest = other.data_digest;
5899 omap_digest = other.omap_digest;
5900 }
5901
5902 void object_info_t::encode(ceph::buffer::list& bl, uint64_t features) const
5903 {
5904 object_locator_t myoloc(soid);
5905 map<entity_name_t, watch_info_t> old_watchers;
5906 for (auto i = watchers.cbegin(); i != watchers.cend(); ++i) {
5907 old_watchers.insert(make_pair(i->first.second, i->second));
5908 }
5909 ENCODE_START(17, 8, bl);
5910 encode(soid, bl);
5911 encode(myoloc, bl); //Retained for compatibility
5912 encode((__u32)0, bl); // was category, no longer used
5913 encode(version, bl);
5914 encode(prior_version, bl);
5915 encode(last_reqid, bl);
5916 encode(size, bl);
5917 encode(mtime, bl);
5918 if (soid.snap == CEPH_NOSNAP)
5919 encode(osd_reqid_t(), bl); // used to be wrlock_by
5920 else
5921 encode((uint32_t)0, bl); // was legacy_snaps
5922 encode(truncate_seq, bl);
5923 encode(truncate_size, bl);
5924 encode(is_lost(), bl);
5925 encode(old_watchers, bl, features);
5926 /* shenanigans to avoid breaking backwards compatibility in the disk format.
5927 * When we can, switch this out for simply putting the version_t on disk. */
5928 eversion_t user_eversion(0, user_version);
5929 encode(user_eversion, bl);
5930 encode(test_flag(FLAG_USES_TMAP), bl);
5931 encode(watchers, bl, features);
5932 __u32 _flags = flags;
5933 encode(_flags, bl);
5934 encode(local_mtime, bl);
5935 encode(data_digest, bl);
5936 encode(omap_digest, bl);
5937 encode(expected_object_size, bl);
5938 encode(expected_write_size, bl);
5939 encode(alloc_hint_flags, bl);
5940 if (has_manifest()) {
5941 encode(manifest, bl);
5942 }
5943 ENCODE_FINISH(bl);
5944 }
5945
5946 void object_info_t::decode(ceph::buffer::list::const_iterator& bl)
5947 {
5948 object_locator_t myoloc;
5949 DECODE_START_LEGACY_COMPAT_LEN(17, 8, 8, bl);
5950 map<entity_name_t, watch_info_t> old_watchers;
5951 decode(soid, bl);
5952 decode(myoloc, bl);
5953 {
5954 string category;
5955 decode(category, bl); // no longer used
5956 }
5957 decode(version, bl);
5958 decode(prior_version, bl);
5959 decode(last_reqid, bl);
5960 decode(size, bl);
5961 decode(mtime, bl);
5962 if (soid.snap == CEPH_NOSNAP) {
5963 osd_reqid_t wrlock_by;
5964 decode(wrlock_by, bl);
5965 } else {
5966 vector<snapid_t> legacy_snaps;
5967 decode(legacy_snaps, bl);
5968 }
5969 decode(truncate_seq, bl);
5970 decode(truncate_size, bl);
5971
5972 // if this is struct_v >= 13, we will overwrite this
5973 // below since this field is just here for backwards
5974 // compatibility
5975 __u8 lo;
5976 decode(lo, bl);
5977 flags = (flag_t)lo;
5978
5979 decode(old_watchers, bl);
5980 eversion_t user_eversion;
5981 decode(user_eversion, bl);
5982 user_version = user_eversion.version;
5983
5984 if (struct_v >= 9) {
5985 bool uses_tmap = false;
5986 decode(uses_tmap, bl);
5987 if (uses_tmap)
5988 set_flag(FLAG_USES_TMAP);
5989 } else {
5990 set_flag(FLAG_USES_TMAP);
5991 }
5992 if (struct_v < 10)
5993 soid.pool = myoloc.pool;
5994 if (struct_v >= 11) {
5995 decode(watchers, bl);
5996 } else {
5997 for (auto i = old_watchers.begin(); i != old_watchers.end(); ++i) {
5998 watchers.insert(
5999 make_pair(
6000 make_pair(i->second.cookie, i->first), i->second));
6001 }
6002 }
6003 if (struct_v >= 13) {
6004 __u32 _flags;
6005 decode(_flags, bl);
6006 flags = (flag_t)_flags;
6007 }
6008 if (struct_v >= 14) {
6009 decode(local_mtime, bl);
6010 } else {
6011 local_mtime = utime_t();
6012 }
6013 if (struct_v >= 15) {
6014 decode(data_digest, bl);
6015 decode(omap_digest, bl);
6016 } else {
6017 data_digest = omap_digest = -1;
6018 clear_flag(FLAG_DATA_DIGEST);
6019 clear_flag(FLAG_OMAP_DIGEST);
6020 }
6021 if (struct_v >= 16) {
6022 decode(expected_object_size, bl);
6023 decode(expected_write_size, bl);
6024 decode(alloc_hint_flags, bl);
6025 } else {
6026 expected_object_size = 0;
6027 expected_write_size = 0;
6028 alloc_hint_flags = 0;
6029 }
6030 if (struct_v >= 17) {
6031 if (has_manifest()) {
6032 decode(manifest, bl);
6033 }
6034 }
6035 DECODE_FINISH(bl);
6036 }
6037
6038 void object_info_t::dump(Formatter *f) const
6039 {
6040 f->open_object_section("oid");
6041 soid.dump(f);
6042 f->close_section();
6043 f->dump_stream("version") << version;
6044 f->dump_stream("prior_version") << prior_version;
6045 f->dump_stream("last_reqid") << last_reqid;
6046 f->dump_unsigned("user_version", user_version);
6047 f->dump_unsigned("size", size);
6048 f->dump_stream("mtime") << mtime;
6049 f->dump_stream("local_mtime") << local_mtime;
6050 f->dump_unsigned("lost", (int)is_lost());
6051 vector<string> sv = get_flag_vector(flags);
6052 f->open_array_section("flags");
6053 for (auto str: sv)
6054 f->dump_string("flags", str);
6055 f->close_section();
6056 f->dump_unsigned("truncate_seq", truncate_seq);
6057 f->dump_unsigned("truncate_size", truncate_size);
6058 f->dump_format("data_digest", "0x%08x", data_digest);
6059 f->dump_format("omap_digest", "0x%08x", omap_digest);
6060 f->dump_unsigned("expected_object_size", expected_object_size);
6061 f->dump_unsigned("expected_write_size", expected_write_size);
6062 f->dump_unsigned("alloc_hint_flags", alloc_hint_flags);
6063 f->dump_object("manifest", manifest);
6064 f->open_object_section("watchers");
6065 for (auto p = watchers.cbegin(); p != watchers.cend(); ++p) {
6066 stringstream ss;
6067 ss << p->first.second;
6068 f->open_object_section(ss.str().c_str());
6069 p->second.dump(f);
6070 f->close_section();
6071 }
6072 f->close_section();
6073 }
6074
6075 void object_info_t::generate_test_instances(list<object_info_t*>& o)
6076 {
6077 o.push_back(new object_info_t());
6078
6079 // fixme
6080 }
6081
6082
6083 ostream& operator<<(ostream& out, const object_info_t& oi)
6084 {
6085 out << oi.soid << "(" << oi.version
6086 << " " << oi.last_reqid;
6087 if (oi.flags)
6088 out << " " << oi.get_flag_string();
6089 out << " s " << oi.size;
6090 out << " uv " << oi.user_version;
6091 if (oi.is_data_digest())
6092 out << " dd " << std::hex << oi.data_digest << std::dec;
6093 if (oi.is_omap_digest())
6094 out << " od " << std::hex << oi.omap_digest << std::dec;
6095 out << " alloc_hint [" << oi.expected_object_size
6096 << " " << oi.expected_write_size
6097 << " " << oi.alloc_hint_flags << "]";
6098 if (oi.has_manifest())
6099 out << " " << oi.manifest;
6100 out << ")";
6101 return out;
6102 }
6103
6104 // -- ObjectRecovery --
6105 void ObjectRecoveryProgress::encode(ceph::buffer::list &bl) const
6106 {
6107 ENCODE_START(1, 1, bl);
6108 encode(first, bl);
6109 encode(data_complete, bl);
6110 encode(data_recovered_to, bl);
6111 encode(omap_recovered_to, bl);
6112 encode(omap_complete, bl);
6113 ENCODE_FINISH(bl);
6114 }
6115
6116 void ObjectRecoveryProgress::decode(ceph::buffer::list::const_iterator &bl)
6117 {
6118 DECODE_START(1, bl);
6119 decode(first, bl);
6120 decode(data_complete, bl);
6121 decode(data_recovered_to, bl);
6122 decode(omap_recovered_to, bl);
6123 decode(omap_complete, bl);
6124 DECODE_FINISH(bl);
6125 }
6126
6127 ostream &operator<<(ostream &out, const ObjectRecoveryProgress &prog)
6128 {
6129 return prog.print(out);
6130 }
6131
6132 void ObjectRecoveryProgress::generate_test_instances(
6133 list<ObjectRecoveryProgress*>& o)
6134 {
6135 o.push_back(new ObjectRecoveryProgress);
6136 o.back()->first = false;
6137 o.back()->data_complete = true;
6138 o.back()->omap_complete = true;
6139 o.back()->data_recovered_to = 100;
6140
6141 o.push_back(new ObjectRecoveryProgress);
6142 o.back()->first = true;
6143 o.back()->data_complete = false;
6144 o.back()->omap_complete = false;
6145 o.back()->data_recovered_to = 0;
6146 }
6147
6148 ostream &ObjectRecoveryProgress::print(ostream &out) const
6149 {
6150 return out << "ObjectRecoveryProgress("
6151 << ( first ? "" : "!" ) << "first, "
6152 << "data_recovered_to:" << data_recovered_to
6153 << ", data_complete:" << ( data_complete ? "true" : "false" )
6154 << ", omap_recovered_to:" << omap_recovered_to
6155 << ", omap_complete:" << ( omap_complete ? "true" : "false" )
6156 << ", error:" << ( error ? "true" : "false" )
6157 << ")";
6158 }
6159
6160 void ObjectRecoveryProgress::dump(Formatter *f) const
6161 {
6162 f->dump_int("first?", first);
6163 f->dump_int("data_complete?", data_complete);
6164 f->dump_unsigned("data_recovered_to", data_recovered_to);
6165 f->dump_int("omap_complete?", omap_complete);
6166 f->dump_string("omap_recovered_to", omap_recovered_to);
6167 }
6168
6169 void ObjectRecoveryInfo::encode(ceph::buffer::list &bl, uint64_t features) const
6170 {
6171 ENCODE_START(3, 1, bl);
6172 encode(soid, bl);
6173 encode(version, bl);
6174 encode(size, bl);
6175 encode(oi, bl, features);
6176 encode(ss, bl);
6177 encode(copy_subset, bl);
6178 encode(clone_subset, bl);
6179 encode(object_exist, bl);
6180 ENCODE_FINISH(bl);
6181 }
6182
6183 void ObjectRecoveryInfo::decode(ceph::buffer::list::const_iterator &bl,
6184 int64_t pool)
6185 {
6186 DECODE_START(3, bl);
6187 decode(soid, bl);
6188 decode(version, bl);
6189 decode(size, bl);
6190 decode(oi, bl);
6191 decode(ss, bl);
6192 decode(copy_subset, bl);
6193 decode(clone_subset, bl);
6194 if (struct_v > 2)
6195 decode(object_exist, bl);
6196 else
6197 object_exist = false;
6198 DECODE_FINISH(bl);
6199 if (struct_v < 2) {
6200 if (!soid.is_max() && soid.pool == -1)
6201 soid.pool = pool;
6202 map<hobject_t, interval_set<uint64_t>> tmp;
6203 tmp.swap(clone_subset);
6204 for (auto i = tmp.begin(); i != tmp.end(); ++i) {
6205 hobject_t first(i->first);
6206 if (!first.is_max() && first.pool == -1)
6207 first.pool = pool;
6208 clone_subset[first].swap(i->second);
6209 }
6210 }
6211 }
6212
6213 void ObjectRecoveryInfo::generate_test_instances(
6214 list<ObjectRecoveryInfo*>& o)
6215 {
6216 o.push_back(new ObjectRecoveryInfo);
6217 o.back()->soid = hobject_t(sobject_t("key", CEPH_NOSNAP));
6218 o.back()->version = eversion_t(0,0);
6219 o.back()->size = 100;
6220 o.back()->object_exist = false;
6221 }
6222
6223
6224 void ObjectRecoveryInfo::dump(Formatter *f) const
6225 {
6226 f->dump_stream("object") << soid;
6227 f->dump_stream("at_version") << version;
6228 f->dump_stream("size") << size;
6229 {
6230 f->open_object_section("object_info");
6231 oi.dump(f);
6232 f->close_section();
6233 }
6234 {
6235 f->open_object_section("snapset");
6236 ss.dump(f);
6237 f->close_section();
6238 }
6239 f->dump_stream("copy_subset") << copy_subset;
6240 f->dump_stream("clone_subset") << clone_subset;
6241 f->dump_stream("object_exist") << object_exist;
6242 }
6243
6244 ostream& operator<<(ostream& out, const ObjectRecoveryInfo &inf)
6245 {
6246 return inf.print(out);
6247 }
6248
6249 ostream &ObjectRecoveryInfo::print(ostream &out) const
6250 {
6251 return out << "ObjectRecoveryInfo("
6252 << soid << "@" << version
6253 << ", size: " << size
6254 << ", copy_subset: " << copy_subset
6255 << ", clone_subset: " << clone_subset
6256 << ", snapset: " << ss
6257 << ", object_exist: " << object_exist
6258 << ")";
6259 }
6260
6261 // -- PushReplyOp --
6262 void PushReplyOp::generate_test_instances(list<PushReplyOp*> &o)
6263 {
6264 o.push_back(new PushReplyOp);
6265 o.push_back(new PushReplyOp);
6266 o.back()->soid = hobject_t(sobject_t("asdf", 2));
6267 o.push_back(new PushReplyOp);
6268 o.back()->soid = hobject_t(sobject_t("asdf", CEPH_NOSNAP));
6269 }
6270
6271 void PushReplyOp::encode(ceph::buffer::list &bl) const
6272 {
6273 ENCODE_START(1, 1, bl);
6274 encode(soid, bl);
6275 ENCODE_FINISH(bl);
6276 }
6277
6278 void PushReplyOp::decode(ceph::buffer::list::const_iterator &bl)
6279 {
6280 DECODE_START(1, bl);
6281 decode(soid, bl);
6282 DECODE_FINISH(bl);
6283 }
6284
6285 void PushReplyOp::dump(Formatter *f) const
6286 {
6287 f->dump_stream("soid") << soid;
6288 }
6289
6290 ostream &PushReplyOp::print(ostream &out) const
6291 {
6292 return out
6293 << "PushReplyOp(" << soid
6294 << ")";
6295 }
6296
6297 ostream& operator<<(ostream& out, const PushReplyOp &op)
6298 {
6299 return op.print(out);
6300 }
6301
6302 uint64_t PushReplyOp::cost(CephContext *cct) const
6303 {
6304
6305 return cct->_conf->osd_push_per_object_cost +
6306 cct->_conf->osd_recovery_max_chunk;
6307 }
6308
6309 // -- PullOp --
6310 void PullOp::generate_test_instances(list<PullOp*> &o)
6311 {
6312 o.push_back(new PullOp);
6313 o.push_back(new PullOp);
6314 o.back()->soid = hobject_t(sobject_t("asdf", 2));
6315 o.back()->recovery_info.version = eversion_t(3, 10);
6316 o.push_back(new PullOp);
6317 o.back()->soid = hobject_t(sobject_t("asdf", CEPH_NOSNAP));
6318 o.back()->recovery_info.version = eversion_t(0, 0);
6319 }
6320
6321 void PullOp::encode(ceph::buffer::list &bl, uint64_t features) const
6322 {
6323 ENCODE_START(1, 1, bl);
6324 encode(soid, bl);
6325 encode(recovery_info, bl, features);
6326 encode(recovery_progress, bl);
6327 ENCODE_FINISH(bl);
6328 }
6329
6330 void PullOp::decode(ceph::buffer::list::const_iterator &bl)
6331 {
6332 DECODE_START(1, bl);
6333 decode(soid, bl);
6334 decode(recovery_info, bl);
6335 decode(recovery_progress, bl);
6336 DECODE_FINISH(bl);
6337 }
6338
6339 void PullOp::dump(Formatter *f) const
6340 {
6341 f->dump_stream("soid") << soid;
6342 {
6343 f->open_object_section("recovery_info");
6344 recovery_info.dump(f);
6345 f->close_section();
6346 }
6347 {
6348 f->open_object_section("recovery_progress");
6349 recovery_progress.dump(f);
6350 f->close_section();
6351 }
6352 }
6353
6354 ostream &PullOp::print(ostream &out) const
6355 {
6356 return out
6357 << "PullOp(" << soid
6358 << ", recovery_info: " << recovery_info
6359 << ", recovery_progress: " << recovery_progress
6360 << ")";
6361 }
6362
6363 ostream& operator<<(ostream& out, const PullOp &op)
6364 {
6365 return op.print(out);
6366 }
6367
6368 uint64_t PullOp::cost(CephContext *cct) const
6369 {
6370 return cct->_conf->osd_push_per_object_cost +
6371 cct->_conf->osd_recovery_max_chunk;
6372 }
6373
6374 // -- PushOp --
6375 void PushOp::generate_test_instances(list<PushOp*> &o)
6376 {
6377 o.push_back(new PushOp);
6378 o.push_back(new PushOp);
6379 o.back()->soid = hobject_t(sobject_t("asdf", 2));
6380 o.back()->version = eversion_t(3, 10);
6381 o.push_back(new PushOp);
6382 o.back()->soid = hobject_t(sobject_t("asdf", CEPH_NOSNAP));
6383 o.back()->version = eversion_t(0, 0);
6384 }
6385
6386 void PushOp::encode(ceph::buffer::list &bl, uint64_t features) const
6387 {
6388 ENCODE_START(1, 1, bl);
6389 encode(soid, bl);
6390 encode(version, bl);
6391 encode(data, bl);
6392 encode(data_included, bl);
6393 encode(omap_header, bl);
6394 encode(omap_entries, bl);
6395 encode(attrset, bl);
6396 encode(recovery_info, bl, features);
6397 encode(after_progress, bl);
6398 encode(before_progress, bl);
6399 ENCODE_FINISH(bl);
6400 }
6401
6402 void PushOp::decode(ceph::buffer::list::const_iterator &bl)
6403 {
6404 DECODE_START(1, bl);
6405 decode(soid, bl);
6406 decode(version, bl);
6407 decode(data, bl);
6408 decode(data_included, bl);
6409 decode(omap_header, bl);
6410 decode(omap_entries, bl);
6411 decode(attrset, bl);
6412 decode(recovery_info, bl);
6413 decode(after_progress, bl);
6414 decode(before_progress, bl);
6415 DECODE_FINISH(bl);
6416 }
6417
6418 void PushOp::dump(Formatter *f) const
6419 {
6420 f->dump_stream("soid") << soid;
6421 f->dump_stream("version") << version;
6422 f->dump_int("data_len", data.length());
6423 f->dump_stream("data_included") << data_included;
6424 f->dump_int("omap_header_len", omap_header.length());
6425 f->dump_int("omap_entries_len", omap_entries.size());
6426 f->dump_int("attrset_len", attrset.size());
6427 {
6428 f->open_object_section("recovery_info");
6429 recovery_info.dump(f);
6430 f->close_section();
6431 }
6432 {
6433 f->open_object_section("after_progress");
6434 after_progress.dump(f);
6435 f->close_section();
6436 }
6437 {
6438 f->open_object_section("before_progress");
6439 before_progress.dump(f);
6440 f->close_section();
6441 }
6442 }
6443
6444 ostream &PushOp::print(ostream &out) const
6445 {
6446 return out
6447 << "PushOp(" << soid
6448 << ", version: " << version
6449 << ", data_included: " << data_included
6450 << ", data_size: " << data.length()
6451 << ", omap_header_size: " << omap_header.length()
6452 << ", omap_entries_size: " << omap_entries.size()
6453 << ", attrset_size: " << attrset.size()
6454 << ", recovery_info: " << recovery_info
6455 << ", after_progress: " << after_progress
6456 << ", before_progress: " << before_progress
6457 << ")";
6458 }
6459
6460 ostream& operator<<(ostream& out, const PushOp &op)
6461 {
6462 return op.print(out);
6463 }
6464
6465 uint64_t PushOp::cost(CephContext *cct) const
6466 {
6467 uint64_t cost = data_included.size();
6468 for (auto i = omap_entries.cbegin(); i != omap_entries.cend(); ++i) {
6469 cost += i->second.length();
6470 }
6471 cost += cct->_conf->osd_push_per_object_cost;
6472 return cost;
6473 }
6474
6475 // -- ScrubMap --
6476
6477 void ScrubMap::merge_incr(const ScrubMap &l)
6478 {
6479 ceph_assert(valid_through == l.incr_since);
6480 valid_through = l.valid_through;
6481
6482 for (auto p = l.objects.cbegin(); p != l.objects.cend(); ++p){
6483 if (p->second.negative) {
6484 auto q = objects.find(p->first);
6485 if (q != objects.end()) {
6486 objects.erase(q);
6487 }
6488 } else {
6489 objects[p->first] = p->second;
6490 }
6491 }
6492 }
6493
6494 void ScrubMap::encode(ceph::buffer::list& bl) const
6495 {
6496 ENCODE_START(3, 2, bl);
6497 encode(objects, bl);
6498 encode((__u32)0, bl); // used to be attrs; now deprecated
6499 ceph::buffer::list old_logbl; // not used
6500 encode(old_logbl, bl);
6501 encode(valid_through, bl);
6502 encode(incr_since, bl);
6503 ENCODE_FINISH(bl);
6504 }
6505
6506 void ScrubMap::decode(ceph::buffer::list::const_iterator& bl, int64_t pool)
6507 {
6508 DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
6509 decode(objects, bl);
6510 {
6511 map<string,string> attrs; // deprecated
6512 decode(attrs, bl);
6513 }
6514 ceph::buffer::list old_logbl; // not used
6515 decode(old_logbl, bl);
6516 decode(valid_through, bl);
6517 decode(incr_since, bl);
6518 DECODE_FINISH(bl);
6519
6520 // handle hobject_t upgrade
6521 if (struct_v < 3) {
6522 map<hobject_t, object> tmp;
6523 tmp.swap(objects);
6524 for (auto i = tmp.begin(); i != tmp.end(); ++i) {
6525 hobject_t first(i->first);
6526 if (!first.is_max() && first.pool == -1)
6527 first.pool = pool;
6528 objects[first] = i->second;
6529 }
6530 }
6531 }
6532
6533 void ScrubMap::dump(Formatter *f) const
6534 {
6535 f->dump_stream("valid_through") << valid_through;
6536 f->dump_stream("incremental_since") << incr_since;
6537 f->open_array_section("objects");
6538 for (auto p = objects.cbegin(); p != objects.cend(); ++p) {
6539 f->open_object_section("object");
6540 f->dump_string("name", p->first.oid.name);
6541 f->dump_unsigned("hash", p->first.get_hash());
6542 f->dump_string("key", p->first.get_key());
6543 f->dump_int("snapid", p->first.snap);
6544 p->second.dump(f);
6545 f->close_section();
6546 }
6547 f->close_section();
6548 }
6549
6550 void ScrubMap::generate_test_instances(list<ScrubMap*>& o)
6551 {
6552 o.push_back(new ScrubMap);
6553 o.push_back(new ScrubMap);
6554 o.back()->valid_through = eversion_t(1, 2);
6555 o.back()->incr_since = eversion_t(3, 4);
6556 list<object*> obj;
6557 object::generate_test_instances(obj);
6558 o.back()->objects[hobject_t(object_t("foo"), "fookey", 123, 456, 0, "")] = *obj.back();
6559 obj.pop_back();
6560 o.back()->objects[hobject_t(object_t("bar"), string(), 123, 456, 0, "")] = *obj.back();
6561 }
6562
6563 // -- ScrubMap::object --
6564
6565 void ScrubMap::object::encode(ceph::buffer::list& bl) const
6566 {
6567 bool compat_read_error = read_error || ec_hash_mismatch || ec_size_mismatch;
6568 ENCODE_START(10, 7, bl);
6569 encode(size, bl);
6570 encode(negative, bl);
6571 encode(attrs, bl);
6572 encode(digest, bl);
6573 encode(digest_present, bl);
6574 encode((uint32_t)0, bl); // obsolete nlinks
6575 encode((uint32_t)0, bl); // snapcolls
6576 encode(omap_digest, bl);
6577 encode(omap_digest_present, bl);
6578 encode(compat_read_error, bl);
6579 encode(stat_error, bl);
6580 encode(read_error, bl);
6581 encode(ec_hash_mismatch, bl);
6582 encode(ec_size_mismatch, bl);
6583 encode(large_omap_object_found, bl);
6584 encode(large_omap_object_key_count, bl);
6585 encode(large_omap_object_value_size, bl);
6586 encode(object_omap_bytes, bl);
6587 encode(object_omap_keys, bl);
6588 ENCODE_FINISH(bl);
6589 }
6590
6591 void ScrubMap::object::decode(ceph::buffer::list::const_iterator& bl)
6592 {
6593 DECODE_START(10, bl);
6594 decode(size, bl);
6595 bool tmp, compat_read_error = false;
6596 decode(tmp, bl);
6597 negative = tmp;
6598 decode(attrs, bl);
6599 decode(digest, bl);
6600 decode(tmp, bl);
6601 digest_present = tmp;
6602 {
6603 uint32_t nlinks;
6604 decode(nlinks, bl);
6605 set<snapid_t> snapcolls;
6606 decode(snapcolls, bl);
6607 }
6608 decode(omap_digest, bl);
6609 decode(tmp, bl);
6610 omap_digest_present = tmp;
6611 decode(compat_read_error, bl);
6612 decode(tmp, bl);
6613 stat_error = tmp;
6614 if (struct_v >= 8) {
6615 decode(tmp, bl);
6616 read_error = tmp;
6617 decode(tmp, bl);
6618 ec_hash_mismatch = tmp;
6619 decode(tmp, bl);
6620 ec_size_mismatch = tmp;
6621 }
6622 // If older encoder found a read_error, set read_error
6623 if (compat_read_error && !read_error && !ec_hash_mismatch && !ec_size_mismatch)
6624 read_error = true;
6625 if (struct_v >= 9) {
6626 decode(tmp, bl);
6627 large_omap_object_found = tmp;
6628 decode(large_omap_object_key_count, bl);
6629 decode(large_omap_object_value_size, bl);
6630 }
6631 if (struct_v >= 10) {
6632 decode(object_omap_bytes, bl);
6633 decode(object_omap_keys, bl);
6634 }
6635 DECODE_FINISH(bl);
6636 }
6637
6638 void ScrubMap::object::dump(Formatter *f) const
6639 {
6640 f->dump_int("size", size);
6641 f->dump_int("negative", negative);
6642 f->open_array_section("attrs");
6643 for (auto p = attrs.cbegin(); p != attrs.cend(); ++p) {
6644 f->open_object_section("attr");
6645 f->dump_string("name", p->first);
6646 f->dump_int("length", p->second.length());
6647 f->close_section();
6648 }
6649 f->close_section();
6650 }
6651
6652 void ScrubMap::object::generate_test_instances(list<object*>& o)
6653 {
6654 o.push_back(new object);
6655 o.push_back(new object);
6656 o.back()->negative = true;
6657 o.push_back(new object);
6658 o.back()->size = 123;
6659 o.back()->attrs["foo"] = ceph::buffer::copy("foo", 3);
6660 o.back()->attrs["bar"] = ceph::buffer::copy("barval", 6);
6661 }
6662
6663 // -- OSDOp --
6664
6665 ostream& operator<<(ostream& out, const OSDOp& op)
6666 {
6667 out << ceph_osd_op_name(op.op.op);
6668 if (ceph_osd_op_type_data(op.op.op)) {
6669 // data extent
6670 switch (op.op.op) {
6671 case CEPH_OSD_OP_ASSERT_VER:
6672 out << " v" << op.op.assert_ver.ver;
6673 break;
6674 case CEPH_OSD_OP_TRUNCATE:
6675 out << " " << op.op.extent.offset;
6676 break;
6677 case CEPH_OSD_OP_MASKTRUNC:
6678 case CEPH_OSD_OP_TRIMTRUNC:
6679 out << " " << op.op.extent.truncate_seq << "@"
6680 << (int64_t)op.op.extent.truncate_size;
6681 break;
6682 case CEPH_OSD_OP_ROLLBACK:
6683 out << " " << snapid_t(op.op.snap.snapid);
6684 break;
6685 case CEPH_OSD_OP_WATCH:
6686 out << " " << ceph_osd_watch_op_name(op.op.watch.op)
6687 << " cookie " << op.op.watch.cookie;
6688 if (op.op.watch.gen)
6689 out << " gen " << op.op.watch.gen;
6690 break;
6691 case CEPH_OSD_OP_NOTIFY:
6692 out << " cookie " << op.op.notify.cookie;
6693 break;
6694 case CEPH_OSD_OP_COPY_GET:
6695 out << " max " << op.op.copy_get.max;
6696 break;
6697 case CEPH_OSD_OP_COPY_FROM:
6698 out << " ver " << op.op.copy_from.src_version;
6699 break;
6700 case CEPH_OSD_OP_SETALLOCHINT:
6701 out << " object_size " << op.op.alloc_hint.expected_object_size
6702 << " write_size " << op.op.alloc_hint.expected_write_size;
6703 break;
6704 case CEPH_OSD_OP_READ:
6705 case CEPH_OSD_OP_SPARSE_READ:
6706 case CEPH_OSD_OP_SYNC_READ:
6707 case CEPH_OSD_OP_WRITE:
6708 case CEPH_OSD_OP_WRITEFULL:
6709 case CEPH_OSD_OP_ZERO:
6710 case CEPH_OSD_OP_APPEND:
6711 case CEPH_OSD_OP_MAPEXT:
6712 case CEPH_OSD_OP_CMPEXT:
6713 out << " " << op.op.extent.offset << "~" << op.op.extent.length;
6714 if (op.op.extent.truncate_seq)
6715 out << " [" << op.op.extent.truncate_seq << "@"
6716 << (int64_t)op.op.extent.truncate_size << "]";
6717 if (op.op.flags)
6718 out << " [" << ceph_osd_op_flag_string(op.op.flags) << "]";
6719 default:
6720 // don't show any arg info
6721 break;
6722 }
6723 } else if (ceph_osd_op_type_attr(op.op.op)) {
6724 // xattr name
6725 if (op.op.xattr.name_len && op.indata.length()) {
6726 out << " ";
6727 op.indata.write(0, op.op.xattr.name_len, out);
6728 }
6729 if (op.op.xattr.value_len)
6730 out << " (" << op.op.xattr.value_len << ")";
6731 if (op.op.op == CEPH_OSD_OP_CMPXATTR)
6732 out << " op " << (int)op.op.xattr.cmp_op
6733 << " mode " << (int)op.op.xattr.cmp_mode;
6734 } else if (ceph_osd_op_type_exec(op.op.op)) {
6735 // class.method
6736 if (op.op.cls.class_len && op.indata.length()) {
6737 out << " ";
6738 op.indata.write(0, op.op.cls.class_len, out);
6739 out << ".";
6740 op.indata.write(op.op.cls.class_len, op.op.cls.method_len, out);
6741 }
6742 } else if (ceph_osd_op_type_pg(op.op.op)) {
6743 switch (op.op.op) {
6744 case CEPH_OSD_OP_PGLS:
6745 case CEPH_OSD_OP_PGLS_FILTER:
6746 case CEPH_OSD_OP_PGNLS:
6747 case CEPH_OSD_OP_PGNLS_FILTER:
6748 out << " start_epoch " << op.op.pgls.start_epoch;
6749 break;
6750 case CEPH_OSD_OP_PG_HITSET_LS:
6751 break;
6752 case CEPH_OSD_OP_PG_HITSET_GET:
6753 out << " " << utime_t(op.op.hit_set_get.stamp);
6754 break;
6755 case CEPH_OSD_OP_SCRUBLS:
6756 break;
6757 }
6758 }
6759 if (op.indata.length()) {
6760 out << " in=" << op.indata.length() << "b";
6761 }
6762 if (op.outdata.length()) {
6763 out << " out=" << op.outdata.length() << "b";
6764 }
6765 return out;
6766 }
6767
6768
6769 void OSDOp::split_osd_op_vector_in_data(vector<OSDOp>& ops, ceph::buffer::list& in)
6770 {
6771 ceph::buffer::list::iterator datap = in.begin();
6772 for (unsigned i = 0; i < ops.size(); i++) {
6773 if (ops[i].op.payload_len) {
6774 datap.copy(ops[i].op.payload_len, ops[i].indata);
6775 }
6776 }
6777 }
6778
6779 void OSDOp::merge_osd_op_vector_in_data(vector<OSDOp>& ops, ceph::buffer::list& out)
6780 {
6781 for (unsigned i = 0; i < ops.size(); i++) {
6782 if (ops[i].indata.length()) {
6783 ops[i].op.payload_len = ops[i].indata.length();
6784 out.append(ops[i].indata);
6785 }
6786 }
6787 }
6788
6789 void OSDOp::split_osd_op_vector_out_data(vector<OSDOp>& ops, ceph::buffer::list& in)
6790 {
6791 auto datap = in.begin();
6792 for (unsigned i = 0; i < ops.size(); i++) {
6793 if (ops[i].op.payload_len) {
6794 datap.copy(ops[i].op.payload_len, ops[i].outdata);
6795 }
6796 }
6797 }
6798
6799 void OSDOp::merge_osd_op_vector_out_data(vector<OSDOp>& ops, ceph::buffer::list& out)
6800 {
6801 for (unsigned i = 0; i < ops.size(); i++) {
6802 ops[i].op.payload_len = ops[i].outdata.length();
6803 if (ops[i].outdata.length()) {
6804 out.append(ops[i].outdata);
6805 }
6806 }
6807 }
6808
6809 void OSDOp::clear_data(vector<OSDOp>& ops)
6810 {
6811 for (unsigned i = 0; i < ops.size(); i++) {
6812 OSDOp& op = ops[i];
6813 op.outdata.clear();
6814 if (ceph_osd_op_type_attr(op.op.op) &&
6815 op.op.xattr.name_len &&
6816 op.indata.length() >= op.op.xattr.name_len) {
6817 ceph::buffer::ptr bp(op.op.xattr.name_len);
6818 ceph::buffer::list bl;
6819 bl.append(bp);
6820 bl.copy_in(0, op.op.xattr.name_len, op.indata);
6821 op.indata.claim(bl);
6822 } else if (ceph_osd_op_type_exec(op.op.op) &&
6823 op.op.cls.class_len &&
6824 op.indata.length() >
6825 (op.op.cls.class_len + op.op.cls.method_len)) {
6826 __u8 len = op.op.cls.class_len + op.op.cls.method_len;
6827 ceph::buffer::ptr bp(len);
6828 ceph::buffer::list bl;
6829 bl.append(bp);
6830 bl.copy_in(0, len, op.indata);
6831 op.indata.claim(bl);
6832 } else {
6833 op.indata.clear();
6834 }
6835 }
6836 }
6837
// Fill *km with the key/value pairs needed to persist a PG's info.
//
//  - If dirty_epoch is set, `epoch` is encoded under epoch_key.
//  - If `logger` is non-null, l_osd_pg_info is incremented (and
//    l_osd_pg_fastinfo / l_osd_pg_biginfo on the respective paths).
//  - Fast path: when !dirty_big_info, try_fast_info is set and
//    info.last_update is newer than last_written_info.last_update, a
//    pg_fast_info_t built from `info` is applied to last_written_info. If
//    that makes last_written_info equal to `info`, only fastinfo_key is
//    written and the function returns.
//  - Otherwise last_written_info is overwritten with `info`, `info` is
//    encoded under info_key with purged_snaps temporarily swapped out, and,
//    if dirty_big_info, past_intervals and info.purged_snaps are encoded
//    under biginfo_key.
//
// Always returns 0. `cct` is not referenced directly in this body.
int prepare_info_keymap(
  CephContext* cct,
  map<string,bufferlist> *km,
  epoch_t epoch,
  pg_info_t &info,
  pg_info_t &last_written_info,
  PastIntervals &past_intervals,
  bool dirty_big_info,
  bool dirty_epoch,
  bool try_fast_info,
  PerfCounters *logger,
  DoutPrefixProvider *dpp)
{
  if (dirty_epoch) {
    encode(epoch, (*km)[string(epoch_key)]);
  }

  if (logger)
    logger->inc(l_osd_pg_info);

  // try to do info efficiently?
  if (!dirty_big_info && try_fast_info &&
      info.last_update > last_written_info.last_update) {
    pg_fast_info_t fast;
    fast.populate_from(info);
    // note: this mutates last_written_info in place
    bool did = fast.try_apply_to(&last_written_info);
    ceph_assert(did);  // we verified last_update increased above
    if (info == last_written_info) {
      // the fast-info delta fully describes the change; write only that
      encode(fast, (*km)[string(fastinfo_key)]);
      if (logger)
	logger->inc(l_osd_pg_fastinfo);
      return 0;
    }
    // Fast path did not reproduce `info`: log both structures at level 30.
    // NOTE(review): the statements from ldpp_dout through dendl use the
    // _dout stream, which presumably only exists inside the scope the macro
    // opens -- keep this sequence together and in order.
    if (dpp) {
      ldpp_dout(dpp, 30) << __func__ << " fastinfo failed, info:\n";
      {
	JSONFormatter jf(true);
	jf.dump_object("info", info);
	jf.flush(*_dout);
      }
      {
	*_dout << "\nlast_written_info:\n";
	JSONFormatter jf(true);
	jf.dump_object("last_written_info", last_written_info);
	jf.flush(*_dout);
      }
      *_dout << dendl;
    }
  }

  // full write: whatever the fast path did to last_written_info is replaced
  last_written_info = info;

  // info.  store purged_snaps separately.
  // (swap them out for the encode, then swap them back so `info` is
  // unchanged on return)
  interval_set<snapid_t> purged_snaps;
  purged_snaps.swap(info.purged_snaps);
  encode(info, (*km)[string(info_key)]);
  purged_snaps.swap(info.purged_snaps);

  if (dirty_big_info) {
    // potentially big stuff
    bufferlist& bigbl = (*km)[string(biginfo_key)];
    encode(past_intervals, bigbl);
    encode(info.purged_snaps, bigbl);
    //dout(20) << "write_info bigbl " << bigbl.length() << dendl;
    if (logger)
      logger->inc(l_osd_pg_biginfo);
  }

  return 0;
}
6908
6909 void create_pg_collection(
6910 ceph::os::Transaction& t, spg_t pgid, int bits)
6911 {
6912 coll_t coll(pgid);
6913 t.create_collection(coll, bits);
6914 }
6915
6916 void init_pg_ondisk(
6917 ceph::os::Transaction& t,
6918 spg_t pgid,
6919 const pg_pool_t *pool)
6920 {
6921 coll_t coll(pgid);
6922 if (pool) {
6923 // Give a hint to the PG collection
6924 bufferlist hint;
6925 uint32_t pg_num = pool->get_pg_num();
6926 uint64_t expected_num_objects_pg = pool->expected_num_objects / pg_num;
6927 encode(pg_num, hint);
6928 encode(expected_num_objects_pg, hint);
6929 uint32_t hint_type = ceph::os::Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS;
6930 t.collection_hint(coll, hint_type, hint);
6931 }
6932
6933 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
6934 t.touch(coll, pgmeta_oid);
6935 map<string,bufferlist> values;
6936 __u8 struct_v = pg_latest_struct_v;
6937 encode(struct_v, values[string(infover_key)]);
6938 t.omap_setkeys(coll, pgmeta_oid, values);
6939 }
6940
6941 PGLSFilter::PGLSFilter() : cct(nullptr)
6942 {
6943 }
6944
6945 PGLSFilter::~PGLSFilter()
6946 {
6947 }
6948
6949 int PGLSPlainFilter::init(ceph::bufferlist::const_iterator ¶ms)
6950 {
6951 try {
6952 decode(xattr, params);
6953 decode(val, params);
6954 } catch (buffer::error &e) {
6955 return -EINVAL;
6956 }
6957 return 0;
6958 }
6959
6960 bool PGLSPlainFilter::filter(const hobject_t& obj,
6961 const ceph::bufferlist& xattr_data) const
6962 {
6963 return xattr_data.contents_equal(val.c_str(), val.size());
6964 }
6965