// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
 *
 * Author: Loic Dachary <loic@dachary.org>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */

#ifndef CEPH_OSD_TYPES_H
#define CEPH_OSD_TYPES_H

#include <atomic>
#include <sstream>
#include <cstdio>
#include <memory>
#include <string_view>

#include <boost/scoped_ptr.hpp>
#include <boost/optional/optional_io.hpp>
#include <boost/variant.hpp>
#include <boost/smart_ptr/local_shared_ptr.hpp>

#include "include/rados/rados_types.hpp"
#include "include/mempool.h"

#include "msg/msg_types.h"
#include "include/types.h"
#include "include/utime.h"
#include "include/CompatSet.h"
#include "common/ceph_context.h"
#include "common/histogram.h"
#include "include/interval_set.h"
#include "include/inline_memory.h"
#include "common/Formatter.h"
#include "common/bloom_filter.hpp"
#include "common/hobject.h"
#include "common/snap_types.h"
#include "HitSet.h"
#include "Watch.h"
#include "include/cmp.h"
#include "librados/ListObjectImpl.h"
#include "compressor/Compressor.h"
#include "osd_perf_counters.h"

#define CEPH_OSD_ONDISK_MAGIC "ceph osd volume v026"

#define CEPH_OSD_FEATURE_INCOMPAT_BASE CompatSet::Feature(1, "initial feature set(~v.18)")
#define CEPH_OSD_FEATURE_INCOMPAT_PGINFO CompatSet::Feature(2, "pginfo object")
#define CEPH_OSD_FEATURE_INCOMPAT_OLOC CompatSet::Feature(3, "object locator")
#define CEPH_OSD_FEATURE_INCOMPAT_LEC CompatSet::Feature(4, "last_epoch_clean")
#define CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES CompatSet::Feature(5, "categories")
#define CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL CompatSet::Feature(6, "hobjectpool")
#define CEPH_OSD_FEATURE_INCOMPAT_BIGINFO CompatSet::Feature(7, "biginfo")
#define CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO CompatSet::Feature(8, "leveldbinfo")
#define CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG CompatSet::Feature(9, "leveldblog")
#define CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER CompatSet::Feature(10, "snapmapper")
#define CEPH_OSD_FEATURE_INCOMPAT_SHARDS CompatSet::Feature(11, "sharded objects")
#define CEPH_OSD_FEATURE_INCOMPAT_HINTS CompatSet::Feature(12, "transaction hints")
#define CEPH_OSD_FEATURE_INCOMPAT_PGMETA CompatSet::Feature(13, "pg meta object")
#define CEPH_OSD_FEATURE_INCOMPAT_MISSING CompatSet::Feature(14, "explicit missing set")
#define CEPH_OSD_FEATURE_INCOMPAT_FASTINFO CompatSet::Feature(15, "fastinfo pg attr")
#define CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES CompatSet::Feature(16, "deletes in missing set")
#define CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2 CompatSet::Feature(17, "new snapmapper key structure")


/// pool priority range set by user
#define OSD_POOL_PRIORITY_MAX 10
#define OSD_POOL_PRIORITY_MIN (-OSD_POOL_PRIORITY_MAX)

/// min recovery priority for MBackfillReserve
#define OSD_RECOVERY_PRIORITY_MIN 0

/// base backfill priority for MBackfillReserve
#define OSD_BACKFILL_PRIORITY_BASE 100

/// base backfill priority for MBackfillReserve (degraded PG)
#define OSD_BACKFILL_DEGRADED_PRIORITY_BASE 140

/// base recovery priority for MBackfillReserve
#define OSD_RECOVERY_PRIORITY_BASE 180

/// base backfill priority for MBackfillReserve (inactive PG)
#define OSD_BACKFILL_INACTIVE_PRIORITY_BASE 220

/// base recovery priority for MRecoveryReserve (inactive PG)
#define OSD_RECOVERY_INACTIVE_PRIORITY_BASE 220

/// max manually/automatically set recovery priority for MBackfillReserve
#define OSD_RECOVERY_PRIORITY_MAX 253

/// backfill priority for MBackfillReserve, when forced manually
#define OSD_BACKFILL_PRIORITY_FORCED 254

/// recovery priority for MRecoveryReserve, when forced manually
#define OSD_RECOVERY_PRIORITY_FORCED 255

/// priority for pg deletion when osd is not fullish
#define OSD_DELETE_PRIORITY_NORMAL 179

/// priority for pg deletion when osd is approaching full
#define OSD_DELETE_PRIORITY_FULLISH 219

/// priority for pg deletion when osd is full
#define OSD_DELETE_PRIORITY_FULL 255

static std::map<int, int> max_prio_map = {
  {OSD_BACKFILL_PRIORITY_BASE, OSD_BACKFILL_DEGRADED_PRIORITY_BASE - 1},
  {OSD_BACKFILL_DEGRADED_PRIORITY_BASE, OSD_RECOVERY_PRIORITY_BASE - 1},
  {OSD_RECOVERY_PRIORITY_BASE, OSD_BACKFILL_INACTIVE_PRIORITY_BASE - 1},
  {OSD_RECOVERY_INACTIVE_PRIORITY_BASE, OSD_RECOVERY_PRIORITY_MAX},
  {OSD_BACKFILL_INACTIVE_PRIORITY_BASE, OSD_RECOVERY_PRIORITY_MAX}
};
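// Illustrative sketch (cap_prio is hypothetical, not part of this header):
// max_prio_map caps a computed priority to the top of its band, so e.g. a
// boosted backfill priority never bleeds into the degraded-backfill band:
//
//   int cap_prio(int base, int prio) {
//     auto it = max_prio_map.find(base);
//     return (it == max_prio_map.end()) ? prio : std::min(prio, it->second);
//   }
//
//   cap_prio(OSD_BACKFILL_PRIORITY_BASE, 150) == 139 under the mapping above.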

typedef hobject_t collection_list_handle_t;

/// convert a single CEPH_OSD_FLAG_* to a std::string
const char *ceph_osd_flag_name(unsigned flag);
/// convert a single CEPH_OSD_OP_FLAG_* to a std::string
const char *ceph_osd_op_flag_name(unsigned flag);

/// convert CEPH_OSD_FLAG_* op flags to a std::string
std::string ceph_osd_flag_string(unsigned flags);
/// convert CEPH_OSD_OP_FLAG_* op flags to a std::string
std::string ceph_osd_op_flag_string(unsigned flags);
/// convert CEPH_OSD_ALLOC_HINT_FLAG_* op flags to a std::string
std::string ceph_osd_alloc_hint_flag_string(unsigned flags);

typedef std::map<std::string,std::string> osd_alert_list_t;
/// map osd id -> alert_list_t
typedef std::map<int, osd_alert_list_t> osd_alerts_t;
void dump(ceph::Formatter* f, const osd_alerts_t& alerts);


typedef interval_set<
  snapid_t,
  mempool::osdmap::flat_map<snapid_t,snapid_t>> snap_interval_set_t;


/**
 * osd request identifier
 *
 * caller name + incarnation# + tid uniquely identify this request.
 */
struct osd_reqid_t {
  entity_name_t name; // who
  ceph_tid_t    tid;
  int32_t       inc;  // incarnation

  osd_reqid_t()
    : tid(0), inc(0)
  {}
  osd_reqid_t(const entity_name_t& a, int i, ceph_tid_t t)
    : name(a), tid(t), inc(i)
  {}

  DENC(osd_reqid_t, v, p) {
    DENC_START(2, 2, p);
    denc(v.name, p);
    denc(v.tid, p);
    denc(v.inc, p);
    DENC_FINISH(p);
  }
  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<osd_reqid_t*>& o);
};
WRITE_CLASS_DENC(osd_reqid_t)
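// Usage sketch (hypothetical values): constructing and printing an
// osd_reqid_t; entity_name_t::CLIENT comes from msg/msg_types.h, and the
// operator<< defined below this struct prints "<name>.<inc>:<tid>".
//
//   osd_reqid_t rid(entity_name_t::CLIENT(4123), 3, 42);
//   std::cout << rid;   // -> "client.4123.3:42"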


struct pg_shard_t {
  static const int32_t NO_OSD = 0x7fffffff;
  int32_t osd;
  shard_id_t shard;
  pg_shard_t() : osd(-1), shard(shard_id_t::NO_SHARD) {}
  explicit pg_shard_t(int osd) : osd(osd), shard(shard_id_t::NO_SHARD) {}
  pg_shard_t(int osd, shard_id_t shard) : osd(osd), shard(shard) {}
  bool is_undefined() const {
    return osd == -1;
  }
  std::string get_osd() const { return (osd == NO_OSD ? "NONE" : std::to_string(osd)); }
  void encode(ceph::buffer::list &bl) const;
  void decode(ceph::buffer::list::const_iterator &bl);
  void dump(ceph::Formatter *f) const {
    f->dump_unsigned("osd", osd);
    if (shard != shard_id_t::NO_SHARD) {
      f->dump_unsigned("shard", shard);
    }
  }
};
WRITE_CLASS_ENCODER(pg_shard_t)
WRITE_EQ_OPERATORS_2(pg_shard_t, osd, shard)
WRITE_CMP_OPERATORS_2(pg_shard_t, osd, shard)
std::ostream& operator<<(std::ostream &lhs, const pg_shard_t &rhs);

class IsPGRecoverablePredicate {
public:
  /**
   * have encodes the shards available
   */
  virtual bool operator()(const std::set<pg_shard_t> &have) const = 0;
  virtual ~IsPGRecoverablePredicate() {}
};

class IsPGReadablePredicate {
public:
  /**
   * have encodes the shards available
   */
  virtual bool operator()(const std::set<pg_shard_t> &have) const = 0;
  virtual ~IsPGReadablePredicate() {}
};

inline std::ostream& operator<<(std::ostream& out, const osd_reqid_t& r) {
  return out << r.name << "." << r.inc << ":" << r.tid;
}

inline bool operator==(const osd_reqid_t& l, const osd_reqid_t& r) {
  return (l.name == r.name) && (l.inc == r.inc) && (l.tid == r.tid);
}
inline bool operator!=(const osd_reqid_t& l, const osd_reqid_t& r) {
  return (l.name != r.name) || (l.inc != r.inc) || (l.tid != r.tid);
}
// lexicographic order on (name, inc, tid); the nested comparisons keep this
// a strict weak ordering
inline bool operator<(const osd_reqid_t& l, const osd_reqid_t& r) {
  return (l.name < r.name) ||
    (l.name == r.name && ((l.inc < r.inc) ||
                          (l.inc == r.inc && l.tid < r.tid)));
}
inline bool operator<=(const osd_reqid_t& l, const osd_reqid_t& r) {
  return (l.name < r.name) ||
    (l.name == r.name && ((l.inc < r.inc) ||
                          (l.inc == r.inc && l.tid <= r.tid)));
}
inline bool operator>(const osd_reqid_t& l, const osd_reqid_t& r) { return !(l <= r); }
inline bool operator>=(const osd_reqid_t& l, const osd_reqid_t& r) { return !(l < r); }

namespace std {
  template<> struct hash<osd_reqid_t> {
    size_t operator()(const osd_reqid_t &r) const {
      static hash<uint64_t> H;
      return H(r.name.num() ^ r.tid ^ r.inc);
    }
  };
} // namespace std


// -----

// a locator constrains the placement of an object: mainly, which pool
// it goes in.
struct object_locator_t {
  // You specify either the hash or the key -- not both
  int64_t pool;        ///< pool id
  std::string key;     ///< key std::string (if non-empty)
  std::string nspace;  ///< namespace
  int64_t hash;        ///< hash position (if >= 0)

  object_locator_t()
    : pool(-1), hash(-1) {}
  explicit object_locator_t(int64_t po)
    : pool(po), hash(-1) {}
  explicit object_locator_t(int64_t po, int64_t ps)
    : pool(po), hash(ps) {}
  explicit object_locator_t(int64_t po, std::string ns)
    : pool(po), nspace(ns), hash(-1) {}
  explicit object_locator_t(int64_t po, std::string ns, int64_t ps)
    : pool(po), nspace(ns), hash(ps) {}
  explicit object_locator_t(int64_t po, std::string ns, std::string s)
    : pool(po), key(s), nspace(ns), hash(-1) {}
  explicit object_locator_t(const hobject_t& soid)
    : pool(soid.pool), key(soid.get_key()), nspace(soid.nspace), hash(-1) {}

  int64_t get_pool() const {
    return pool;
  }

  void clear() {
    pool = -1;
    key = "";
    nspace = "";
    hash = -1;
  }

  bool empty() const {
    return pool == -1;
  }

  void encode(ceph::buffer::list& bl) const;
  void decode(ceph::buffer::list::const_iterator& p);
  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<object_locator_t*>& o);
};
WRITE_CLASS_ENCODER(object_locator_t)

inline bool operator==(const object_locator_t& l, const object_locator_t& r) {
  return l.pool == r.pool && l.key == r.key && l.nspace == r.nspace && l.hash == r.hash;
}
inline bool operator!=(const object_locator_t& l, const object_locator_t& r) {
  return !(l == r);
}

inline std::ostream& operator<<(std::ostream& out, const object_locator_t& loc)
{
  out << "@" << loc.pool;
  if (loc.nspace.length())
    out << ";" << loc.nspace;
  if (loc.key.length())
    out << ":" << loc.key;
  return out;
}
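// Example output of the operator above (hypothetical values):
//
//   object_locator_t loc(2, "mynamespace", "mykey");
//   std::cout << loc;   // -> "@2;mynamespace:mykey"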

struct request_redirect_t {
private:
  object_locator_t redirect_locator; ///< this is authoritative
  std::string redirect_object; ///< If non-empty, the request goes to this object name

  friend std::ostream& operator<<(std::ostream& out, const request_redirect_t& redir);
public:

  request_redirect_t() {}
  explicit request_redirect_t(const object_locator_t& orig, int64_t rpool) :
      redirect_locator(orig) { redirect_locator.pool = rpool; }
  explicit request_redirect_t(const object_locator_t& rloc) :
      redirect_locator(rloc) {}
  explicit request_redirect_t(const object_locator_t& orig,
                              const std::string& robj) :
      redirect_locator(orig), redirect_object(robj) {}

  bool empty() const { return redirect_locator.empty() &&
                              redirect_object.empty(); }

  void combine_with_locator(object_locator_t& orig, std::string& obj) const {
    orig = redirect_locator;
    if (!redirect_object.empty())
      obj = redirect_object;
  }

  void encode(ceph::buffer::list& bl) const;
  void decode(ceph::buffer::list::const_iterator& bl);
  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<request_redirect_t*>& o);
};
WRITE_CLASS_ENCODER(request_redirect_t)

inline std::ostream& operator<<(std::ostream& out, const request_redirect_t& redir) {
  out << "object " << redir.redirect_object << ", locator{" << redir.redirect_locator << "}";
  return out;
}
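// Minimal usage sketch (hypothetical values): a redirect rewrites the
// locator (and optionally the object name) the client originally computed:
//
//   object_locator_t oloc(1);
//   std::string oid = "foo";
//   request_redirect_t redir(object_locator_t(2), "bar");
//   redir.combine_with_locator(oloc, oid);  // oloc -> pool 2, oid -> "bar"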

// Internal OSD op flags - set by the OSD based on the op types
enum {
  CEPH_OSD_RMW_FLAG_READ = (1 << 1),
  CEPH_OSD_RMW_FLAG_WRITE = (1 << 2),
  CEPH_OSD_RMW_FLAG_CLASS_READ = (1 << 3),
  CEPH_OSD_RMW_FLAG_CLASS_WRITE = (1 << 4),
  CEPH_OSD_RMW_FLAG_PGOP = (1 << 5),
  CEPH_OSD_RMW_FLAG_CACHE = (1 << 6),
  CEPH_OSD_RMW_FLAG_FORCE_PROMOTE = (1 << 7),
  CEPH_OSD_RMW_FLAG_SKIP_HANDLE_CACHE = (1 << 8),
  CEPH_OSD_RMW_FLAG_SKIP_PROMOTE = (1 << 9),
  CEPH_OSD_RMW_FLAG_RWORDERED = (1 << 10),
  CEPH_OSD_RMW_FLAG_RETURNVEC = (1 << 11),
};


// pg stuff

#define OSD_SUPERBLOCK_GOBJECT ghobject_t(hobject_t(sobject_t(object_t("osd_superblock"), 0)))

// placement seed (a hash value)
typedef uint32_t ps_t;

// old (v1) pg_t encoding (wrap old struct ceph_pg)
struct old_pg_t {
  ceph_pg v;
  void encode(ceph::buffer::list& bl) const {
    ceph::encode_raw(v, bl);
  }
  void decode(ceph::buffer::list::const_iterator& bl) {
    ceph::decode_raw(v, bl);
  }
};
WRITE_CLASS_ENCODER(old_pg_t)

// placement group id
struct pg_t {
  uint64_t m_pool;
  uint32_t m_seed;

  pg_t() : m_pool(0), m_seed(0) {}
  pg_t(ps_t seed, uint64_t pool) :
    m_pool(pool), m_seed(seed) {}
  // cppcheck-suppress noExplicitConstructor
  pg_t(const ceph_pg& cpg) :
    m_pool(cpg.pool), m_seed(cpg.ps) {}

  // cppcheck-suppress noExplicitConstructor
  pg_t(const old_pg_t& opg) {
    *this = opg.v;
  }

  old_pg_t get_old_pg() const {
    old_pg_t o;
    ceph_assert(m_pool < 0xffffffffull);
    o.v.pool = m_pool;
    o.v.ps = m_seed;
    o.v.preferred = (__s16)-1;
    return o;
  }

  ps_t ps() const {
    return m_seed;
  }
  int64_t pool() const {
    return m_pool;
  }

  static const uint8_t calc_name_buf_size = 36;  // max length for max values len("18446744073709551615.ffffffff") + future suffix len("_head") + '\0'
  char *calc_name(char *buf, const char *suffix_backwords) const;

  void set_ps(ps_t p) {
    m_seed = p;
  }
  void set_pool(uint64_t p) {
    m_pool = p;
  }

  pg_t get_parent() const;
  pg_t get_ancestor(unsigned old_pg_num) const;

  int print(char *o, int maxlen) const;
  bool parse(const char *s);

  bool is_split(unsigned old_pg_num, unsigned new_pg_num, std::set<pg_t> *pchildren) const;

  bool is_merge_source(unsigned old_pg_num, unsigned new_pg_num, pg_t *parent) const;
  bool is_merge_target(unsigned old_pg_num, unsigned new_pg_num) const {
    return ps() < new_pg_num && is_split(new_pg_num, old_pg_num, nullptr);
  }

  /**
   * Returns b such that for all objects o:
   *   (~((~0)<<b) & o.hash) == 0 iff o is in the pg for *this
   */
  unsigned get_split_bits(unsigned pg_num) const;
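  // Worked example of the mask in the comment above (assumed values): if
  // get_split_bits(pg_num) returns b = 4, then ~((~0u) << 4) == 0xf, i.e.
  // only the low 4 bits of an object's hash decide whether it belongs to
  // this pg; the higher hash bits are ignored for membership.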

  bool contains(int bits, const ghobject_t& oid) const {
    return
      (int64_t)m_pool == oid.hobj.get_logical_pool() &&
      oid.match(bits, ps());
  }
  bool contains(int bits, const hobject_t& oid) const {
    return
      (int64_t)m_pool == oid.get_logical_pool() &&
      oid.match(bits, ps());
  }

  hobject_t get_hobj_start() const;
  hobject_t get_hobj_end(unsigned pg_num) const;

  void encode(ceph::buffer::list& bl) const {
    using ceph::encode;
    __u8 v = 1;
    encode(v, bl);
    encode(m_pool, bl);
    encode(m_seed, bl);
    encode((int32_t)-1, bl);  // was preferred
  }
  void decode(ceph::buffer::list::const_iterator& bl) {
    using ceph::decode;
    __u8 v;
    decode(v, bl);
    decode(m_pool, bl);
    decode(m_seed, bl);
    bl.advance(sizeof(int32_t));  // was preferred
  }
  void decode_old(ceph::buffer::list::const_iterator& bl) {
    using ceph::decode;
    old_pg_t opg;
    decode(opg, bl);
    *this = opg;
  }
  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<pg_t*>& o);
};
WRITE_CLASS_ENCODER(pg_t)

inline bool operator<(const pg_t& l, const pg_t& r) {
  return l.pool() < r.pool() ||
    (l.pool() == r.pool() && (l.ps() < r.ps()));
}
inline bool operator<=(const pg_t& l, const pg_t& r) {
  return l.pool() < r.pool() ||
    (l.pool() == r.pool() && (l.ps() <= r.ps()));
}
inline bool operator==(const pg_t& l, const pg_t& r) {
  return l.pool() == r.pool() &&
    l.ps() == r.ps();
}
inline bool operator!=(const pg_t& l, const pg_t& r) {
  return l.pool() != r.pool() ||
    l.ps() != r.ps();
}
inline bool operator>(const pg_t& l, const pg_t& r) {
  return l.pool() > r.pool() ||
    (l.pool() == r.pool() && (l.ps() > r.ps()));
}
inline bool operator>=(const pg_t& l, const pg_t& r) {
  return l.pool() > r.pool() ||
    (l.pool() == r.pool() && (l.ps() >= r.ps()));
}

std::ostream& operator<<(std::ostream& out, const pg_t &pg);

namespace std {
  template<> struct hash< pg_t >
  {
    size_t operator()( const pg_t& x ) const
    {
      static hash<uint32_t> H;
      // xor (s32)-1 in there to preserve original m_preferred result (paranoia!)
      return H((x.pool() & 0xffffffff) ^ (x.pool() >> 32) ^ x.ps() ^ (int32_t)(-1));
    }
  };
} // namespace std

struct spg_t {
  pg_t pgid;
  shard_id_t shard;
  spg_t() : shard(shard_id_t::NO_SHARD) {}
  spg_t(pg_t pgid, shard_id_t shard) : pgid(pgid), shard(shard) {}
  explicit spg_t(pg_t pgid) : pgid(pgid), shard(shard_id_t::NO_SHARD) {}
  unsigned get_split_bits(unsigned pg_num) const {
    return pgid.get_split_bits(pg_num);
  }
  spg_t get_parent() const {
    return spg_t(pgid.get_parent(), shard);
  }
  ps_t ps() const {
    return pgid.ps();
  }
  uint64_t pool() const {
    return pgid.pool();
  }
  void reset_shard(shard_id_t s) {
    shard = s;
  }

  static const uint8_t calc_name_buf_size = pg_t::calc_name_buf_size + 4;  // 36 + len('s') + len("255")
  char *calc_name(char *buf, const char *suffix_backwords) const;

  bool parse(const char *s);
  bool parse(const std::string& s) {
    return parse(s.c_str());
  }

  spg_t get_ancestor(unsigned old_pg_num) const {
    return spg_t(pgid.get_ancestor(old_pg_num), shard);
  }

  bool is_split(unsigned old_pg_num, unsigned new_pg_num,
                std::set<spg_t> *pchildren) const {
    std::set<pg_t> _children;
    std::set<pg_t> *children = pchildren ? &_children : nullptr;
    bool is_split = pgid.is_split(old_pg_num, new_pg_num, children);
    if (pchildren && is_split) {
      for (std::set<pg_t>::iterator i = _children.begin();
           i != _children.end();
           ++i) {
        pchildren->insert(spg_t(*i, shard));
      }
    }
    return is_split;
  }
  bool is_merge_target(unsigned old_pg_num, unsigned new_pg_num) const {
    return pgid.is_merge_target(old_pg_num, new_pg_num);
  }
  bool is_merge_source(unsigned old_pg_num, unsigned new_pg_num,
                       spg_t *parent) const {
    spg_t out = *this;
    bool r = pgid.is_merge_source(old_pg_num, new_pg_num, &out.pgid);
    if (r && parent) {
      *parent = out;
    }
    return r;
  }

  bool is_no_shard() const {
    return shard == shard_id_t::NO_SHARD;
  }

  ghobject_t make_pgmeta_oid() const {
    return ghobject_t::make_pgmeta(pgid.pool(), pgid.ps(), shard);
  }

  void encode(ceph::buffer::list &bl) const {
    ENCODE_START(1, 1, bl);
    encode(pgid, bl);
    encode(shard, bl);
    ENCODE_FINISH(bl);
  }
  void decode(ceph::buffer::list::const_iterator& bl) {
    DECODE_START(1, bl);
    decode(pgid, bl);
    decode(shard, bl);
    DECODE_FINISH(bl);
  }

  ghobject_t make_temp_ghobject(const std::string& name) const {
    return ghobject_t(
      hobject_t(object_t(name), "", CEPH_NOSNAP,
                pgid.ps(),
                hobject_t::get_temp_pool(pgid.pool()),
                ""),
      ghobject_t::NO_GEN,
      shard);
  }

  unsigned hash_to_shard(unsigned num_shards) const {
    return ps() % num_shards;
  }
};
WRITE_CLASS_ENCODER(spg_t)
WRITE_EQ_OPERATORS_2(spg_t, pgid, shard)
WRITE_CMP_OPERATORS_2(spg_t, pgid, shard)
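// Usage sketch (hypothetical values; the string form is an assumption based
// on calc_name/parse, which are defined out of line):
//
//   spg_t s(pg_t(0x2a, 1), shard_id_t(0));  // pool 1, seed 0x2a, shard 0
//   spg_t t;
//   t.parse("1.2as0");  // assumed textual form "<pool>.<seed-hex>s<shard>"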

namespace std {
  template<> struct hash< spg_t >
  {
    size_t operator()( const spg_t& x ) const
    {
      static hash<uint32_t> H;
      return H(hash<pg_t>()(x.pgid) ^ x.shard);
    }
  };
} // namespace std

std::ostream& operator<<(std::ostream& out, const spg_t &pg);

// ----------------------

class coll_t {
  enum type_t {
    TYPE_META = 0,
    TYPE_LEGACY_TEMP = 1,  /* no longer used */
    TYPE_PG = 2,
    TYPE_PG_TEMP = 3,
  };
  type_t type;
  spg_t pgid;
  uint64_t removal_seq;  // note: deprecated, not encoded

  char _str_buff[spg_t::calc_name_buf_size];
  char *_str;

  void calc_str();

  coll_t(type_t t, spg_t p, uint64_t r)
    : type(t), pgid(p), removal_seq(r) {
    calc_str();
  }

public:
  coll_t() : type(TYPE_META), removal_seq(0)
  {
    calc_str();
  }

  coll_t(const coll_t& other)
    : type(other.type), pgid(other.pgid), removal_seq(other.removal_seq) {
    calc_str();
  }

  explicit coll_t(spg_t pgid)
    : type(TYPE_PG), pgid(pgid), removal_seq(0)
  {
    calc_str();
  }

  coll_t& operator=(const coll_t& rhs)
  {
    this->type = rhs.type;
    this->pgid = rhs.pgid;
    this->removal_seq = rhs.removal_seq;
    this->calc_str();
    return *this;
  }

  // named constructors
  static coll_t meta() {
    return coll_t();
  }
  static coll_t pg(spg_t p) {
    return coll_t(p);
  }

  std::string to_str() const {
    return std::string(_str);
  }
  const char *c_str() const {
    return _str;
  }

  bool parse(const std::string& s);

  bool operator<(const coll_t &rhs) const {
    return type < rhs.type ||
      (type == rhs.type && pgid < rhs.pgid);
  }

  bool is_meta() const {
    return type == TYPE_META;
  }
  bool is_pg_prefix(spg_t *pgid_) const {
    if (type == TYPE_PG || type == TYPE_PG_TEMP) {
      *pgid_ = pgid;
      return true;
    }
    return false;
  }
  bool is_pg() const {
    return type == TYPE_PG;
  }
  bool is_pg(spg_t *pgid_) const {
    if (type == TYPE_PG) {
      *pgid_ = pgid;
      return true;
    }
    return false;
  }
  bool is_temp() const {
    return type == TYPE_PG_TEMP;
  }
  bool is_temp(spg_t *pgid_) const {
    if (type == TYPE_PG_TEMP) {
      *pgid_ = pgid;
      return true;
    }
    return false;
  }
  int64_t pool() const {
    return pgid.pool();
  }

  void encode(ceph::buffer::list& bl) const;
  void decode(ceph::buffer::list::const_iterator& bl);
  size_t encoded_size() const;

  inline bool operator==(const coll_t& rhs) const {
    // meta collections compare equal on type alone; pg collections
    // also compare the pgid
    if (type != rhs.type)
      return false;
    if (type == TYPE_META)
      return true;
    return pgid == rhs.pgid;
  }
  inline bool operator!=(const coll_t& rhs) const {
    return !(*this == rhs);
  }

  // get a TEMP collection that corresponds to the current collection,
  // which we presume is a pg collection.
  coll_t get_temp() const {
    ceph_assert(type == TYPE_PG);
    return coll_t(TYPE_PG_TEMP, pgid, 0);
  }

  ghobject_t get_min_hobj() const {
    ghobject_t o;
    switch (type) {
    case TYPE_PG:
      o.hobj.pool = pgid.pool();
      o.set_shard(pgid.shard);
      break;
    case TYPE_META:
      o.hobj.pool = -1;
      break;
    default:
      break;
    }
    return o;
  }

  unsigned hash_to_shard(unsigned num_shards) const {
    if (type == TYPE_PG)
      return pgid.hash_to_shard(num_shards);
    return 0;  // whatever.
  }

  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<coll_t*>& o);
};

WRITE_CLASS_ENCODER(coll_t)

inline std::ostream& operator<<(std::ostream& out, const coll_t& c) {
  out << c.to_str();
  return out;
}
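// Note on the printed form (illustrative; the actual strings come from
// calc_str(), which is defined out of line): the meta collection is expected
// to print as "meta", pg collections as "<pgid>_head", and their temp
// counterparts as "<pgid>_TEMP".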

namespace std {
  template<> struct hash<coll_t> {
    size_t operator()(const coll_t &c) const {
      size_t h = 0;
      std::string str(c.to_str());
      std::string::const_iterator end(str.end());
      for (std::string::const_iterator s = str.begin(); s != end; ++s) {
        h += *s;
        h += (h << 10);
        h ^= (h >> 6);
      }
      h += (h << 3);
      h ^= (h >> 11);
      h += (h << 15);
      return h;
    }
  };
} // namespace std

inline std::ostream& operator<<(std::ostream& out, const ceph_object_layout &ol)
{
  out << pg_t(ol.ol_pgid);
  int su = ol.ol_stripe_unit;
  if (su)
    out << ".su=" << su;
  return out;
}



// compound rados version type
/* WARNING: If you add a member to eversion_t, make sure the encode/decode
 * functions still work correctly.  On little-endian machines the struct is
 * written raw, so there must be no padding on either 32-bit or 64-bit
 * machines.
 */
class eversion_t {
public:
  version_t version;
  epoch_t epoch;
  __u32 __pad;
  eversion_t() : version(0), epoch(0), __pad(0) {}
  eversion_t(epoch_t e, version_t v) : version(v), epoch(e), __pad(0) {}

  // cppcheck-suppress noExplicitConstructor
  eversion_t(const ceph_eversion& ce) :
    version(ce.version),
    epoch(ce.epoch),
    __pad(0) { }

  explicit eversion_t(ceph::buffer::list& bl) : __pad(0) { decode(bl); }

  static const eversion_t& max() {
    static const eversion_t max(-1,-1);
    return max;
  }

  operator ceph_eversion() {
    ceph_eversion c;
    c.epoch = epoch;
    c.version = version;
    return c;
  }

  std::string get_key_name() const;

  // key must point to the beginning of a block of 32 chars
  inline void get_key_name(char* key) const {
    // Below is the equivalent of sprintf(key, "%010u.%020llu", epoch, version);
    key[31] = 0;
    ritoa<uint64_t, 10, 20>(version, key + 31);
    key[10] = '.';
    ritoa<uint32_t, 10, 10>(epoch, key + 10);
  }
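  // Example key produced by the routine above (derived from the format
  // string in the comment; values are hypothetical):
  //
  //   char buf[32];
  //   eversion_t(43, 21).get_key_name(buf);
  //   // buf == "0000000043.00000000000000000021"
  //   // (epoch zero-padded to 10 digits, a dot, version to 20 digits)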

  void encode(ceph::buffer::list &bl) const {
#if defined(CEPH_LITTLE_ENDIAN)
    bl.append((char *)this, sizeof(version_t) + sizeof(epoch_t));
#else
    using ceph::encode;
    encode(version, bl);
    encode(epoch, bl);
#endif
  }
  void decode(ceph::buffer::list::const_iterator &bl) {
#if defined(CEPH_LITTLE_ENDIAN)
    bl.copy(sizeof(version_t) + sizeof(epoch_t), (char *)this);
#else
    using ceph::decode;
    decode(version, bl);
    decode(epoch, bl);
#endif
  }
  void decode(ceph::buffer::list& bl) {
    auto p = std::cbegin(bl);
    decode(p);
  }
};
WRITE_CLASS_ENCODER(eversion_t)

inline bool operator==(const eversion_t& l, const eversion_t& r) {
  return (l.epoch == r.epoch) && (l.version == r.version);
}
inline bool operator!=(const eversion_t& l, const eversion_t& r) {
  return (l.epoch != r.epoch) || (l.version != r.version);
}
inline bool operator<(const eversion_t& l, const eversion_t& r) {
  return (l.epoch == r.epoch) ? (l.version < r.version) : (l.epoch < r.epoch);
}
inline bool operator<=(const eversion_t& l, const eversion_t& r) {
  return (l.epoch == r.epoch) ? (l.version <= r.version) : (l.epoch <= r.epoch);
}
inline bool operator>(const eversion_t& l, const eversion_t& r) {
  return (l.epoch == r.epoch) ? (l.version > r.version) : (l.epoch > r.epoch);
}
inline bool operator>=(const eversion_t& l, const eversion_t& r) {
  return (l.epoch == r.epoch) ? (l.version >= r.version) : (l.epoch >= r.epoch);
}
inline std::ostream& operator<<(std::ostream& out, const eversion_t& e) {
  return out << e.epoch << "'" << e.version;
}
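// e.g. eversion_t(43, 21) prints as "43'21" (epoch, apostrophe, version).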

/**
 * objectstore_perf_stat_t
 *
 * current perf information about the osd
 */
struct objectstore_perf_stat_t {
  // cur_op_latency is in ns since double add/sub are not associative
  uint64_t os_commit_latency_ns;
  uint64_t os_apply_latency_ns;

  objectstore_perf_stat_t() :
    os_commit_latency_ns(0), os_apply_latency_ns(0) {}

  bool operator==(const objectstore_perf_stat_t &r) const {
    return os_commit_latency_ns == r.os_commit_latency_ns &&
      os_apply_latency_ns == r.os_apply_latency_ns;
  }

  void add(const objectstore_perf_stat_t &o) {
    os_commit_latency_ns += o.os_commit_latency_ns;
    os_apply_latency_ns += o.os_apply_latency_ns;
  }
  void sub(const objectstore_perf_stat_t &o) {
    os_commit_latency_ns -= o.os_commit_latency_ns;
    os_apply_latency_ns -= o.os_apply_latency_ns;
  }
  void dump(ceph::Formatter *f) const;
  void encode(ceph::buffer::list &bl, uint64_t features) const;
  void decode(ceph::buffer::list::const_iterator &bl);
  static void generate_test_instances(std::list<objectstore_perf_stat_t*>& o);
};
WRITE_CLASS_ENCODER_FEATURES(objectstore_perf_stat_t)

/*
 * pg states
 */
#define PG_STATE_CREATING         (1ULL << 0)  // creating
#define PG_STATE_ACTIVE           (1ULL << 1)  // i am active.  (primary: replicas too)
#define PG_STATE_CLEAN            (1ULL << 2)  // peers are complete, clean of stray replicas.
#define PG_STATE_DOWN             (1ULL << 4)  // a needed replica is down, PG offline
#define PG_STATE_RECOVERY_UNFOUND (1ULL << 5)  // recovery stopped due to unfound
#define PG_STATE_BACKFILL_UNFOUND (1ULL << 6)  // backfill stopped due to unfound
#define PG_STATE_PREMERGE         (1ULL << 7)  // i am preparing to merge
#define PG_STATE_SCRUBBING        (1ULL << 8)  // scrubbing
//#define PG_STATE_SCRUBQ         (1ULL << 9)  // queued for scrub
#define PG_STATE_DEGRADED         (1ULL << 10) // pg contains objects with reduced redundancy
#define PG_STATE_INCONSISTENT     (1ULL << 11) // pg replicas are inconsistent (but shouldn't be)
#define PG_STATE_PEERING          (1ULL << 12) // pg is (re)peering
#define PG_STATE_REPAIR           (1ULL << 13) // pg should repair on next scrub
#define PG_STATE_RECOVERING       (1ULL << 14) // pg is recovering/migrating objects
#define PG_STATE_BACKFILL_WAIT    (1ULL << 15) // [active] reserving backfill
#define PG_STATE_INCOMPLETE       (1ULL << 16) // incomplete content, peering failed.
#define PG_STATE_STALE            (1ULL << 17) // our state for this pg is stale, unknown.
#define PG_STATE_REMAPPED         (1ULL << 18) // pg is explicitly remapped to different OSDs than CRUSH
#define PG_STATE_DEEP_SCRUB       (1ULL << 19) // deep scrub: check CRC32 on files
#define PG_STATE_BACKFILLING      (1ULL << 20) // [active] backfilling pg content
#define PG_STATE_BACKFILL_TOOFULL (1ULL << 21) // backfill can't proceed: too full
#define PG_STATE_RECOVERY_WAIT    (1ULL << 22) // waiting for recovery reservations
#define PG_STATE_UNDERSIZED       (1ULL << 23) // pg acting < pool size
#define PG_STATE_ACTIVATING       (1ULL << 24) // pg is peered but not yet active
#define PG_STATE_PEERED           (1ULL << 25) // peered, cannot go active, can recover
#define PG_STATE_SNAPTRIM         (1ULL << 26) // trimming snaps
#define PG_STATE_SNAPTRIM_WAIT    (1ULL << 27) // queued to trim snaps
#define PG_STATE_RECOVERY_TOOFULL (1ULL << 28) // recovery can't proceed: too full
#define PG_STATE_SNAPTRIM_ERROR   (1ULL << 29) // error stopped trimming snaps
#define PG_STATE_FORCED_RECOVERY  (1ULL << 30) // force recovery of this pg before any other
#define PG_STATE_FORCED_BACKFILL  (1ULL << 31) // force backfill of this pg before any other
#define PG_STATE_FAILED_REPAIR    (1ULL << 32) // a repair failed to fix all errors
#define PG_STATE_LAGGY            (1ULL << 33) // PG is laggy/unreadable due to slow/delayed pings
#define PG_STATE_WAIT             (1ULL << 34) // PG is waiting for prior intervals' readable period to expire

std::string pg_state_string(uint64_t state);
std::string pg_vector_string(const std::vector<int32_t> &a);
std::optional<uint64_t> pg_string_state(const std::string& state);


/*
 * pool_snap_info_t
 *
 * attributes for a single pool snapshot.
 */
struct pool_snap_info_t {
  snapid_t snapid;
  utime_t stamp;
  std::string name;

  void dump(ceph::Formatter *f) const;
  void encode(ceph::buffer::list& bl, uint64_t features) const;
  void decode(ceph::buffer::list::const_iterator& bl);
  static void generate_test_instances(std::list<pool_snap_info_t*>& o);
};
WRITE_CLASS_ENCODER_FEATURES(pool_snap_info_t)

inline std::ostream& operator<<(std::ostream& out, const pool_snap_info_t& si) {
  return out << si.snapid << '(' << si.name << ' ' << si.stamp << ')';
}


/*
 * pool_opts_t
 *
 * pool options.
 */

class pool_opts_t {
public:
  enum key_t {
    SCRUB_MIN_INTERVAL,
    SCRUB_MAX_INTERVAL,
    DEEP_SCRUB_INTERVAL,
    RECOVERY_PRIORITY,
    RECOVERY_OP_PRIORITY,
    SCRUB_PRIORITY,
    COMPRESSION_MODE,
    COMPRESSION_ALGORITHM,
    COMPRESSION_REQUIRED_RATIO,
    COMPRESSION_MAX_BLOB_SIZE,
    COMPRESSION_MIN_BLOB_SIZE,
    CSUM_TYPE,
    CSUM_MAX_BLOCK,
    CSUM_MIN_BLOCK,
    FINGERPRINT_ALGORITHM,
    PG_NUM_MIN,         // min pg_num
    TARGET_SIZE_BYTES,  // total bytes in pool
    TARGET_SIZE_RATIO,  // fraction of total cluster
    PG_AUTOSCALE_BIAS,
    READ_LEASE_INTERVAL,
  };

  enum type_t {
    STR,
    INT,
    DOUBLE,
  };

  struct opt_desc_t {
    key_t key;
    type_t type;

    opt_desc_t(key_t k, type_t t) : key(k), type(t) {}

    bool operator==(const opt_desc_t& rhs) const {
      return key == rhs.key && type == rhs.type;
    }
  };

  typedef boost::variant<std::string,int64_t,double> value_t;

  static bool is_opt_name(const std::string& name);
  static opt_desc_t get_opt_desc(const std::string& name);

  pool_opts_t() : opts() {}

  bool is_set(key_t key) const;

  template<typename T>
  void set(key_t key, const T &val) {
    value_t value = val;
    opts[key] = value;
  }

  template<typename T>
  bool get(key_t key, T *val) const {
    opts_t::const_iterator i = opts.find(key);
    if (i == opts.end()) {
      return false;
    }
    *val = boost::get<T>(i->second);
    return true;
  }

  const value_t& get(key_t key) const;

  bool unset(key_t key);

  void dump(const std::string& name, ceph::Formatter *f) const;

  void dump(ceph::Formatter *f) const;
  void encode(ceph::buffer::list &bl, uint64_t features) const;
  void decode(ceph::buffer::list::const_iterator &bl);

private:
  typedef std::map<key_t, value_t> opts_t;
  opts_t opts;

  friend std::ostream& operator<<(std::ostream& out, const pool_opts_t& opts);
};
WRITE_CLASS_ENCODER_FEATURES(pool_opts_t)
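// Usage sketch (hypothetical values): option values are typed via the
// boost::variant above, so get() must use the same type the value was
// stored with:
//
//   pool_opts_t opts;
//   opts.set(pool_opts_t::RECOVERY_PRIORITY, static_cast<int64_t>(5));
//   int64_t prio = 0;
//   opts.get(pool_opts_t::RECOVERY_PRIORITY, &prio);  // true; prio == 5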

struct pg_merge_meta_t {
  pg_t source_pgid;
  epoch_t ready_epoch = 0;
  epoch_t last_epoch_started = 0;
  epoch_t last_epoch_clean = 0;
  eversion_t source_version;
  eversion_t target_version;

  void encode(ceph::buffer::list& bl) const {
    ENCODE_START(1, 1, bl);
    encode(source_pgid, bl);
    encode(ready_epoch, bl);
    encode(last_epoch_started, bl);
    encode(last_epoch_clean, bl);
    encode(source_version, bl);
    encode(target_version, bl);
    ENCODE_FINISH(bl);
  }
  void decode(ceph::buffer::list::const_iterator& p) {
    DECODE_START(1, p);
    decode(source_pgid, p);
    decode(ready_epoch, p);
    decode(last_epoch_started, p);
    decode(last_epoch_clean, p);
    decode(source_version, p);
    decode(target_version, p);
    DECODE_FINISH(p);
  }
  void dump(ceph::Formatter *f) const {
    f->dump_stream("source_pgid") << source_pgid;
    f->dump_unsigned("ready_epoch", ready_epoch);
    f->dump_unsigned("last_epoch_started", last_epoch_started);
    f->dump_unsigned("last_epoch_clean", last_epoch_clean);
    f->dump_stream("source_version") << source_version;
    f->dump_stream("target_version") << target_version;
  }
};
WRITE_CLASS_ENCODER(pg_merge_meta_t)

/*
 * pg_pool
 */
struct pg_pool_t {
  static const char *APPLICATION_NAME_CEPHFS;
  static const char *APPLICATION_NAME_RBD;
  static const char *APPLICATION_NAME_RGW;

  enum {
    TYPE_REPLICATED = 1,  // replication
    //TYPE_RAID4 = 2,     // raid4 (never implemented)
    TYPE_ERASURE = 3,     // erasure-coded
  };
  static std::string_view get_type_name(int t) {
    switch (t) {
    case TYPE_REPLICATED: return "replicated";
    //case TYPE_RAID4: return "raid4";
    case TYPE_ERASURE: return "erasure";
    default: return "???";
    }
  }
  std::string_view get_type_name() const {
    return get_type_name(type);
  }

  enum {
    FLAG_HASHPSPOOL = 1<<0,    // hash pg seed and pool together (instead of adding)
    FLAG_FULL = 1<<1,          // pool is full
    FLAG_EC_OVERWRITES = 1<<2, // enables overwrites; once enabled, cannot be disabled
    FLAG_INCOMPLETE_CLONES = 1<<3, // may have incomplete clones (because we are/were an overlay)
    FLAG_NODELETE = 1<<4,      // pool can't be deleted
    FLAG_NOPGCHANGE = 1<<5,    // pool's pg and pgp num can't be changed
    FLAG_NOSIZECHANGE = 1<<6,  // pool's size and min size can't be changed
    FLAG_WRITE_FADVISE_DONTNEED = 1<<7, // write mode with LIBRADOS_OP_FLAG_FADVISE_DONTNEED
    FLAG_NOSCRUB = 1<<8,       // block periodic scrub
    FLAG_NODEEP_SCRUB = 1<<9,  // block periodic deep-scrub
    FLAG_FULL_QUOTA = 1<<10,   // pool is currently running out of quota; will set FLAG_FULL too
    FLAG_NEARFULL = 1<<11,     // pool is nearfull
    FLAG_BACKFILLFULL = 1<<12, // pool is backfillfull
    FLAG_SELFMANAGED_SNAPS = 1<<13, // pool uses selfmanaged snaps
    FLAG_POOL_SNAPS = 1<<14,   // pool has pool snaps
    FLAG_CREATING = 1<<15,     // initial pool PGs are being created
  };

  static const char *get_flag_name(int f) {
    switch (f) {
    case FLAG_HASHPSPOOL: return "hashpspool";
    case FLAG_FULL: return "full";
    case FLAG_EC_OVERWRITES: return "ec_overwrites";
    case FLAG_INCOMPLETE_CLONES: return "incomplete_clones";
    case FLAG_NODELETE: return "nodelete";
    case FLAG_NOPGCHANGE: return "nopgchange";
    case FLAG_NOSIZECHANGE: return "nosizechange";
    case FLAG_WRITE_FADVISE_DONTNEED: return "write_fadvise_dontneed";
    case FLAG_NOSCRUB: return "noscrub";
    case FLAG_NODEEP_SCRUB: return "nodeep-scrub";
    case FLAG_FULL_QUOTA: return "full_quota";
    case FLAG_NEARFULL: return "nearfull";
    case FLAG_BACKFILLFULL: return "backfillfull";
    case FLAG_SELFMANAGED_SNAPS: return "selfmanaged_snaps";
    case FLAG_POOL_SNAPS: return "pool_snaps";
    case FLAG_CREATING: return "creating";
    default: return "???";
    }
  }
  static std::string get_flags_string(uint64_t f) {
    std::string s;
    for (unsigned n = 0; f && n < 64; ++n) {
      if (f & (1ull << n)) {
        if (s.length())
          s += ",";
        s += get_flag_name(1ull << n);
      }
    }
    return s;
  }
  std::string get_flags_string() const {
    return get_flags_string(flags);
  }
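  // Example: get_flags_string(FLAG_HASHPSPOOL | FLAG_NODELETE) returns
  // "hashpspool,nodelete" (flags are emitted in ascending bit order).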
  static uint64_t get_flag_by_name(const std::string& name) {
    if (name == "hashpspool")
      return FLAG_HASHPSPOOL;
    if (name == "full")
      return FLAG_FULL;
    if (name == "ec_overwrites")
      return FLAG_EC_OVERWRITES;
    if (name == "incomplete_clones")
      return FLAG_INCOMPLETE_CLONES;
    if (name == "nodelete")
      return FLAG_NODELETE;
    if (name == "nopgchange")
      return FLAG_NOPGCHANGE;
    if (name == "nosizechange")
      return FLAG_NOSIZECHANGE;
    if (name == "write_fadvise_dontneed")
      return FLAG_WRITE_FADVISE_DONTNEED;
    if (name == "noscrub")
      return FLAG_NOSCRUB;
    if (name == "nodeep-scrub")
      return FLAG_NODEEP_SCRUB;
    if (name == "full_quota")
      return FLAG_FULL_QUOTA;
    if (name == "nearfull")
      return FLAG_NEARFULL;
    if (name == "backfillfull")
      return FLAG_BACKFILLFULL;
    if (name == "selfmanaged_snaps")
      return FLAG_SELFMANAGED_SNAPS;
    if (name == "pool_snaps")
      return FLAG_POOL_SNAPS;
    if (name == "creating")
      return FLAG_CREATING;
    return 0;
  }

  /// converts the acting/up vector to a set of pg shards
  void convert_to_pg_shards(const std::vector<int> &from, std::set<pg_shard_t>* to) const;

  typedef enum {
    CACHEMODE_NONE = 0,        ///< no caching
    CACHEMODE_WRITEBACK = 1,   ///< write to cache, flush later
    CACHEMODE_FORWARD = 2,     ///< forward if not in cache
    CACHEMODE_READONLY = 3,    ///< handle reads, forward writes [not strongly consistent]
    CACHEMODE_READFORWARD = 4, ///< forward reads, write to cache, flush later
    CACHEMODE_READPROXY = 5,   ///< proxy reads, write to cache, flush later
    CACHEMODE_PROXY = 6,       ///< proxy if not in cache
  } cache_mode_t;
  static const char *get_cache_mode_name(cache_mode_t m) {
    switch (m) {
    case CACHEMODE_NONE: return "none";
    case CACHEMODE_WRITEBACK: return "writeback";
    case CACHEMODE_FORWARD: return "forward";
    case CACHEMODE_READONLY: return "readonly";
    case CACHEMODE_READFORWARD: return "readforward";
    case CACHEMODE_READPROXY: return "readproxy";
    case CACHEMODE_PROXY: return "proxy";
    default: return "unknown";
    }
  }
  static cache_mode_t get_cache_mode_from_str(const std::string& s) {
    if (s == "none")
      return CACHEMODE_NONE;
    if (s == "writeback")
      return CACHEMODE_WRITEBACK;
    if (s == "forward")
      return CACHEMODE_FORWARD;
    if (s == "readonly")
      return CACHEMODE_READONLY;
    if (s == "readforward")
      return CACHEMODE_READFORWARD;
    if (s == "readproxy")
      return CACHEMODE_READPROXY;
    if (s == "proxy")
      return CACHEMODE_PROXY;
    return (cache_mode_t)-1;
  }
  const char *get_cache_mode_name() const {
    return get_cache_mode_name(cache_mode);
  }
  bool cache_mode_requires_hit_set() const {
    switch (cache_mode) {
    case CACHEMODE_NONE:
    case CACHEMODE_FORWARD:
    case CACHEMODE_READONLY:
    case CACHEMODE_PROXY:
      return false;
    case CACHEMODE_WRITEBACK:
    case CACHEMODE_READFORWARD:
    case CACHEMODE_READPROXY:
      return true;
    default:
      ceph_abort_msg("implement me");
    }
  }

  enum class pg_autoscale_mode_t : uint8_t {
    OFF = 0,
    WARN = 1,
    ON = 2,
    UNKNOWN = UINT8_MAX,
  };
  static const char *get_pg_autoscale_mode_name(pg_autoscale_mode_t m) {
    switch (m) {
    case pg_autoscale_mode_t::OFF: return "off";
    case pg_autoscale_mode_t::ON: return "on";
    case pg_autoscale_mode_t::WARN: return "warn";
    default: return "???";
    }
  }
  static pg_autoscale_mode_t get_pg_autoscale_mode_by_name(const std::string& m) {
    if (m == "off") {
      return pg_autoscale_mode_t::OFF;
    }
    if (m == "warn") {
      return pg_autoscale_mode_t::WARN;
    }
    if (m == "on") {
      return pg_autoscale_mode_t::ON;
    }
    return pg_autoscale_mode_t::UNKNOWN;
  }

  utime_t create_time;
  uint64_t flags = 0;           ///< FLAG_*
  __u8 type = 0;                ///< TYPE_*
  __u8 size = 0, min_size = 0;  ///< number of osds in each pg
  __u8 crush_rule = 0;          ///< crush placement rule
  __u8 object_hash = 0;         ///< hash mapping object name to ps
  pg_autoscale_mode_t pg_autoscale_mode = pg_autoscale_mode_t::UNKNOWN;

private:
  __u32 pg_num = 0, pgp_num = 0;  ///< number of pgs
  __u32 pg_num_pending = 0;       ///< pg_num we are about to merge down to
  __u32 pg_num_target = 0;        ///< pg_num we should converge toward
  __u32 pgp_num_target = 0;       ///< pgp_num we should converge toward

public:
  std::map<std::string, std::string> properties;  ///< OBSOLETE
  std::string erasure_code_profile; ///< name of the erasure code profile in OSDMap
  epoch_t last_change = 0;      ///< most recent epoch changed, excluding snapshot changes

  /// last epoch that forced clients to resend
  epoch_t last_force_op_resend = 0;
  /// last epoch that forced clients to resend (pre-nautilus clients only)
  epoch_t last_force_op_resend_prenautilus = 0;
  /// last epoch that forced clients to resend (pre-luminous clients only)
  epoch_t last_force_op_resend_preluminous = 0;

  /// metadata for the most recent PG merge
  pg_merge_meta_t last_pg_merge_meta;

  snapid_t snap_seq = 0;        ///< seq for per-pool snapshot
  epoch_t snap_epoch = 0;       ///< osdmap epoch of last snap
  uint64_t auid = 0;            ///< who owns the pg

  uint64_t quota_max_bytes = 0;   ///< maximum number of bytes for this pool
  uint64_t quota_max_objects = 0; ///< maximum number of objects for this pool

  /*
   * Pool snaps (global to this pool).  These define a SnapContext for
   * the pool, unless the client manually specifies an alternate
   * context.
   */
  std::map<snapid_t, pool_snap_info_t> snaps;
  /*
   * Alternatively, if we are defining non-pool snaps (e.g. via the
   * Ceph MDS), we must track @removed_snaps (since @snaps is not
   * used).  Snaps and removed_snaps are to be used exclusively of
   * each other!
   */
  interval_set<snapid_t> removed_snaps;

  unsigned pg_num_mask = 0, pgp_num_mask = 0;

  std::set<uint64_t> tiers;   ///< pools that are tiers of us
  int64_t tier_of = -1;       ///< pool for which we are a tier
  // Note that write wins for read+write ops
  int64_t read_tier = -1;     ///< pool/tier for objecter to direct reads to
  int64_t write_tier = -1;    ///< pool/tier for objecter to direct writes to
  cache_mode_t cache_mode = CACHEMODE_NONE;  ///< cache pool mode

  bool is_tier() const { return tier_of >= 0; }
  bool has_tiers() const { return !tiers.empty(); }
  void clear_tier() {
    tier_of = -1;
    clear_read_tier();
    clear_write_tier();
    clear_tier_tunables();
  }
  bool has_read_tier() const { return read_tier >= 0; }
  void clear_read_tier() { read_tier = -1; }
  bool has_write_tier() const { return write_tier >= 0; }
  void clear_write_tier() { write_tier = -1; }
  void clear_tier_tunables() {
    if (cache_mode != CACHEMODE_NONE)
      flags |= FLAG_INCOMPLETE_CLONES;
    cache_mode = CACHEMODE_NONE;

    target_max_bytes = 0;
    target_max_objects = 0;
    cache_target_dirty_ratio_micro = 0;
    cache_target_dirty_high_ratio_micro = 0;
    cache_target_full_ratio_micro = 0;
    hit_set_params = HitSet::Params();
    hit_set_period = 0;
    hit_set_count = 0;
    hit_set_grade_decay_rate = 0;
    hit_set_search_last_n = 0;
    grade_table.resize(0);
  }

  uint64_t target_max_bytes = 0;   ///< tiering: target max pool size
  uint64_t target_max_objects = 0; ///< tiering: target max pool size

  uint32_t cache_target_dirty_ratio_micro = 0;      ///< cache: fraction of target to leave dirty
  uint32_t cache_target_dirty_high_ratio_micro = 0; ///< cache: fraction of target to flush with high speed
  uint32_t cache_target_full_ratio_micro = 0;       ///< cache: fraction of target to fill before we evict in earnest

  uint32_t cache_min_flush_age = 0;  ///< minimum age (seconds) before we can flush
  uint32_t cache_min_evict_age = 0;  ///< minimum age (seconds) before we can evict

  HitSet::Params hit_set_params;  ///< The HitSet params to use on this pool
  uint32_t hit_set_period = 0;    ///< periodicity of HitSet segments (seconds)
  uint32_t hit_set_count = 0;     ///< number of periods to retain
  bool use_gmt_hitset = true;     ///< use gmt to name the hitset archive object
  uint32_t min_read_recency_for_promote = 0;  ///< minimum number of HitSets to check before promote on read
  uint32_t min_write_recency_for_promote = 0; ///< minimum number of HitSets to check before promote on write
  uint32_t hit_set_grade_decay_rate = 0; ///< the current hit_set carries the highest priority for
                                         ///< object temperature; each subsequent hit_set's priority
                                         ///< decays by this rate relative to the previous one
  uint32_t hit_set_search_last_n = 0;    ///< accumulate at most N hit_sets for temperature

  uint32_t stripe_width = 0;  ///< erasure coded stripe size in bytes

  uint64_t expected_num_objects = 0; ///< expected number of objects on this pool; a value of 0
                                     ///< indicates the user did not specify an expected value
  bool fast_read = false;     ///< whether fast read is enabled on the pool

  pool_opts_t opts;           ///< options

  typedef enum {
    TYPE_FINGERPRINT_NONE = 0,
    TYPE_FINGERPRINT_SHA1 = 1,
    TYPE_FINGERPRINT_SHA256 = 2,
    TYPE_FINGERPRINT_SHA512 = 3,
  } fingerprint_t;
  static fingerprint_t get_fingerprint_from_str(const std::string& s) {
    if (s == "none")
      return TYPE_FINGERPRINT_NONE;
    if (s == "sha1")
      return TYPE_FINGERPRINT_SHA1;
    if (s == "sha256")
      return TYPE_FINGERPRINT_SHA256;
    if (s == "sha512")
      return TYPE_FINGERPRINT_SHA512;
    return (fingerprint_t)-1;
  }
  fingerprint_t get_fingerprint_type() const {
    std::string fp_str;
    opts.get(pool_opts_t::FINGERPRINT_ALGORITHM, &fp_str);
    return get_fingerprint_from_str(fp_str);
  }
  const char *get_fingerprint_name() const {
    std::string fp_str;
    fingerprint_t fp_t;
    opts.get(pool_opts_t::FINGERPRINT_ALGORITHM, &fp_str);
    fp_t = get_fingerprint_from_str(fp_str);
    return get_fingerprint_name(fp_t);
  }
  static const char *get_fingerprint_name(fingerprint_t m) {
    switch (m) {
    case TYPE_FINGERPRINT_NONE: return "none";
    case TYPE_FINGERPRINT_SHA1: return "sha1";
    case TYPE_FINGERPRINT_SHA256: return "sha256";
    case TYPE_FINGERPRINT_SHA512: return "sha512";
    default: return "unknown";
    }
  }

  /// application -> key/value metadata
  std::map<std::string, std::map<std::string, std::string>> application_metadata;

private:
  std::vector<uint32_t> grade_table;

public:
  uint32_t get_grade(unsigned i) const {
    if (grade_table.size() <= i)
      return 0;
    return grade_table[i];
  }
  void calc_grade_table() {
    unsigned v = 1000000;
    grade_table.resize(hit_set_count);
    for (unsigned i = 0; i < hit_set_count; i++) {
      v = v * (1 - (hit_set_grade_decay_rate / 100.0));
      grade_table[i] = v;
    }
  }
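  // Worked example (assumed values): with hit_set_count = 3 and
  // hit_set_grade_decay_rate = 20, the loop above yields
  // grade_table == {800000, 640000, 512000}: each HitSet is weighted at
  // 80% of the one before it.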

  pg_pool_t() = default;

  void dump(ceph::Formatter *f) const;

  const utime_t &get_create_time() const { return create_time; }
  uint64_t get_flags() const { return flags; }
  bool has_flag(uint64_t f) const { return flags & f; }
  void set_flag(uint64_t f) { flags |= f; }
  void unset_flag(uint64_t f) { flags &= ~f; }

  bool require_rollback() const {
    return is_erasure();
  }

  /// true if incomplete clones may be present
  bool allow_incomplete_clones() const {
    return cache_mode != CACHEMODE_NONE || has_flag(FLAG_INCOMPLETE_CLONES);
  }

  unsigned get_type() const { return type; }
  unsigned get_size() const { return size; }
  unsigned get_min_size() const { return min_size; }
  int get_crush_rule() const { return crush_rule; }
  int get_object_hash() const { return object_hash; }
  const char *get_object_hash_name() const {
    return ceph_str_hash_name(get_object_hash());
  }
  epoch_t get_last_change() const { return last_change; }
  epoch_t get_last_force_op_resend() const { return last_force_op_resend; }
  epoch_t get_last_force_op_resend_prenautilus() const {
    return last_force_op_resend_prenautilus;
  }
  epoch_t get_last_force_op_resend_preluminous() const {
    return last_force_op_resend_preluminous;
  }
  epoch_t get_snap_epoch() const { return snap_epoch; }
  snapid_t get_snap_seq() const { return snap_seq; }
  uint64_t get_auid() const { return auid; }

  void set_snap_seq(snapid_t s) { snap_seq = s; }
  void set_snap_epoch(epoch_t e) { snap_epoch = e; }

  void set_stripe_width(uint32_t s) { stripe_width = s; }
  uint32_t get_stripe_width() const { return stripe_width; }

  bool is_replicated() const { return get_type() == TYPE_REPLICATED; }
  bool is_erasure() const { return get_type() == TYPE_ERASURE; }

  bool supports_omap() const {
    return get_type() != TYPE_ERASURE;
  }

  bool requires_aligned_append() const {
    return is_erasure() && !has_flag(FLAG_EC_OVERWRITES);
  }
  uint64_t required_alignment() const { return stripe_width; }

  bool allows_ecoverwrites() const {
    return has_flag(FLAG_EC_OVERWRITES);
  }

  bool can_shift_osds() const {
    switch (get_type()) {
    case TYPE_REPLICATED:
      return true;
    case TYPE_ERASURE:
      return false;
    default:
      ceph_abort_msg("unhandled pool type");
    }
  }

  unsigned get_pg_num() const { return pg_num; }
  unsigned get_pgp_num() const { return pgp_num; }
  unsigned get_pg_num_target() const { return pg_num_target; }
  unsigned get_pgp_num_target() const { return pgp_num_target; }
  unsigned get_pg_num_pending() const { return pg_num_pending; }

  unsigned get_pg_num_mask() const { return pg_num_mask; }
  unsigned get_pgp_num_mask() const { return pgp_num_mask; }

  // if pg_num is not a power of two, pgs are not equally sized.
  // return, for a given pg, the fraction (denominator) of the total
  // pool size that it represents.
  unsigned get_pg_num_divisor(pg_t pgid) const;

  bool is_pending_merge(pg_t pgid, bool *target) const;

  void set_pg_num(int p) {
    pg_num = p;
    pg_num_pending = p;
    calc_pg_masks();
  }
  void set_pgp_num(int p) {
    pgp_num = p;
    calc_pg_masks();
  }
  void set_pg_num_pending(int p) {
    pg_num_pending = p;
    calc_pg_masks();
  }
  void set_pg_num_target(int p) {
    pg_num_target = p;
  }
  void set_pgp_num_target(int p) {
    pgp_num_target = p;
  }
  void dec_pg_num(pg_t source_pgid,
                  epoch_t ready_epoch,
                  eversion_t source_version,
                  eversion_t target_version,
                  epoch_t last_epoch_started,
                  epoch_t last_epoch_clean) {
    --pg_num;
    last_pg_merge_meta.source_pgid = source_pgid;
    last_pg_merge_meta.ready_epoch = ready_epoch;
    last_pg_merge_meta.source_version = source_version;
    last_pg_merge_meta.target_version = target_version;
    last_pg_merge_meta.last_epoch_started = last_epoch_started;
    last_pg_merge_meta.last_epoch_clean = last_epoch_clean;
    calc_pg_masks();
  }

  void set_quota_max_bytes(uint64_t m) {
    quota_max_bytes = m;
  }
  uint64_t get_quota_max_bytes() {
    return quota_max_bytes;
  }

  void set_quota_max_objects(uint64_t m) {
    quota_max_objects = m;
  }
  uint64_t get_quota_max_objects() {
    return quota_max_objects;
  }

  void set_last_force_op_resend(uint64_t t) {
    last_force_op_resend = t;
    last_force_op_resend_prenautilus = t;
    last_force_op_resend_preluminous = t;
  }

  void calc_pg_masks();

  /*
   * we have two snap modes:
   *  - pool global snaps
   *    - snap existence/non-existence defined by snaps[] and snap_seq
   *  - user managed snaps
   *    - removal governed by removed_snaps
   *
   * we know which mode we're using based on whether removed_snaps is empty.
   * If nothing has been created, both functions report false.
   */
  bool is_pool_snaps_mode() const;
  bool is_unmanaged_snaps_mode() const;
  bool is_removed_snap(snapid_t s) const;
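  // Sketch of the mode check described above (illustrative; the definitions
  // are out of line): a pool that has ever taken a pool snap reports
  // is_pool_snaps_mode(), one that has allocated self-managed snapids
  // reports is_unmanaged_snaps_mode(), and a freshly created pool reports
  // neither.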

  snapid_t snap_exists(const char *s) const;
  void add_snap(const char *n, utime_t stamp);
  uint64_t add_unmanaged_snap(bool preoctopus_compat);
  void remove_snap(snapid_t s);
  void remove_unmanaged_snap(snapid_t s, bool preoctopus_compat);

  SnapContext get_snap_context() const;

  /// hash an object name+namespace key to a hash position
  uint32_t hash_key(const std::string& key, const std::string& ns) const;

  /// round a hash position down to a pg num
  uint32_t raw_hash_to_pg(uint32_t v) const;

  /*
   * map a raw pg (with full precision ps) into an actual pg, for storage
   */
  pg_t raw_pg_to_pg(pg_t pg) const;

  /*
   * map raw pg (full precision ps) into a placement seed.  include
   * pool id in that value so that different pools don't use the same
   * seeds.
   */
  ps_t raw_pg_to_pps(pg_t pg) const;

  /// choose a random hash position within a pg
  uint32_t get_random_pg_position(pg_t pgid, uint32_t seed) const;

  void encode(ceph::buffer::list& bl, uint64_t features) const;
  void decode(ceph::buffer::list::const_iterator& bl);

  static void generate_test_instances(std::list<pg_pool_t*>& o);
};
WRITE_CLASS_ENCODER_FEATURES(pg_pool_t)

std::ostream& operator<<(std::ostream& out, const pg_pool_t& p);


/**
 * a summation of object stats
 *
 * This is just a container for object stats; we don't know what for.
 *
 * If you add members to object_stat_sum_t, make sure there is no
 * padding among the members.  You should also update the
 * padding_check function.
 */
struct object_stat_sum_t {
  /**************************************************************************
   * WARNING: be sure to update operator==, floor, and split when
   * adding/removing fields!
   **************************************************************************/
  int64_t num_bytes;    // in bytes
  int64_t num_objects;
  int64_t num_object_clones;
  int64_t num_object_copies;  // num_objects * num_replicas
  int64_t num_objects_missing_on_primary;
  int64_t num_objects_degraded;
  int64_t num_objects_unfound;
  int64_t num_rd;
  int64_t num_rd_kb;
  int64_t num_wr;
  int64_t num_wr_kb;
  int64_t num_scrub_errors;  // total deep and shallow scrub errors
  int64_t num_objects_recovered;
  int64_t num_bytes_recovered;
  int64_t num_keys_recovered;
  int64_t num_shallow_scrub_errors;
  int64_t num_deep_scrub_errors;
  int64_t num_objects_dirty;
  int64_t num_whiteouts;
  int64_t num_objects_omap;
  int64_t num_objects_hit_set_archive;
  int64_t num_objects_misplaced;
  int64_t num_bytes_hit_set_archive;
  int64_t num_flush;
  int64_t num_flush_kb;
  int64_t num_evict;
  int64_t num_evict_kb;
  int64_t num_promote;
  int32_t num_flush_mode_high;  // 1 when in high flush mode, otherwise 0
  int32_t num_flush_mode_low;   // 1 when in low flush mode, otherwise 0
  int32_t num_evict_mode_some;  // 1 when in evict some mode, otherwise 0
  int32_t num_evict_mode_full;  // 1 when in evict full mode, otherwise 0
  int64_t num_objects_pinned;
  int64_t num_objects_missing;
  int64_t num_legacy_snapsets;  ///< upper bound on pre-luminous-style SnapSets
  int64_t num_large_omap_objects = 0;
  int64_t num_objects_manifest = 0;
  int64_t num_omap_bytes = 0;
  int64_t num_omap_keys = 0;
  int64_t num_objects_repaired = 0;

  object_stat_sum_t()
    : num_bytes(0),
      num_objects(0), num_object_clones(0), num_object_copies(0),
      num_objects_missing_on_primary(0), num_objects_degraded(0),
      num_objects_unfound(0),
      num_rd(0), num_rd_kb(0), num_wr(0), num_wr_kb(0),
      num_scrub_errors(0),
      num_objects_recovered(0),
      num_bytes_recovered(0),
      num_keys_recovered(0),
      num_shallow_scrub_errors(0),
      num_deep_scrub_errors(0),
      num_objects_dirty(0),
      num_whiteouts(0),
      num_objects_omap(0),
      num_objects_hit_set_archive(0),
1806 num_objects_misplaced(0),
1807 num_bytes_hit_set_archive(0),
1808 num_flush(0),
1809 num_flush_kb(0),
1810 num_evict(0),
1811 num_evict_kb(0),
1812 num_promote(0),
1813 num_flush_mode_high(0), num_flush_mode_low(0),
1814 num_evict_mode_some(0), num_evict_mode_full(0),
1815 num_objects_pinned(0),
1816 num_objects_missing(0),
1817 num_legacy_snapsets(0)
1818 {}
1819
1820 void floor(int64_t f) {
1821 #define FLOOR(x) if (x < f) x = f
1822 FLOOR(num_bytes);
1823 FLOOR(num_objects);
1824 FLOOR(num_object_clones);
1825 FLOOR(num_object_copies);
1826 FLOOR(num_objects_missing_on_primary);
1827 FLOOR(num_objects_missing);
1828 FLOOR(num_objects_degraded);
1829 FLOOR(num_objects_misplaced);
1830 FLOOR(num_objects_unfound);
1831 FLOOR(num_rd);
1832 FLOOR(num_rd_kb);
1833 FLOOR(num_wr);
1834 FLOOR(num_wr_kb);
1835 FLOOR(num_large_omap_objects);
1836 FLOOR(num_objects_manifest);
1837 FLOOR(num_omap_bytes);
1838 FLOOR(num_omap_keys);
1839 FLOOR(num_shallow_scrub_errors);
1840 FLOOR(num_deep_scrub_errors);
1841 num_scrub_errors = num_shallow_scrub_errors + num_deep_scrub_errors;
1842 FLOOR(num_objects_recovered);
1843 FLOOR(num_bytes_recovered);
1844 FLOOR(num_keys_recovered);
1845 FLOOR(num_objects_dirty);
1846 FLOOR(num_whiteouts);
1847 FLOOR(num_objects_omap);
1848 FLOOR(num_objects_hit_set_archive);
1849 FLOOR(num_bytes_hit_set_archive);
1850 FLOOR(num_flush);
1851 FLOOR(num_flush_kb);
1852 FLOOR(num_evict);
1853 FLOOR(num_evict_kb);
1854 FLOOR(num_promote);
1855 FLOOR(num_flush_mode_high);
1856 FLOOR(num_flush_mode_low);
1857 FLOOR(num_evict_mode_some);
1858 FLOOR(num_evict_mode_full);
1859 FLOOR(num_objects_pinned);
1860 FLOOR(num_legacy_snapsets);
1861 FLOOR(num_objects_repaired);
1862 #undef FLOOR
1863 }
1864
1865 void split(std::vector<object_stat_sum_t> &out) const {
1866 #define SPLIT(PARAM) \
1867 for (unsigned i = 0; i < out.size(); ++i) { \
1868 out[i].PARAM = PARAM / out.size(); \
1869 if (i < (PARAM % out.size())) { \
1870 out[i].PARAM++; \
1871 } \
1872 }
1873 #define SPLIT_PRESERVE_NONZERO(PARAM) \
1874 for (unsigned i = 0; i < out.size(); ++i) { \
1875 if (PARAM) \
1876 out[i].PARAM = 1 + PARAM / out.size(); \
1877 else \
1878 out[i].PARAM = 0; \
1879 }
1880
1881 SPLIT(num_bytes);
1882 SPLIT(num_objects);
1883 SPLIT(num_object_clones);
1884 SPLIT(num_object_copies);
1885 SPLIT(num_objects_missing_on_primary);
1886 SPLIT(num_objects_missing);
1887 SPLIT(num_objects_degraded);
1888 SPLIT(num_objects_misplaced);
1889 SPLIT(num_objects_unfound);
1890 SPLIT(num_rd);
1891 SPLIT(num_rd_kb);
1892 SPLIT(num_wr);
1893 SPLIT(num_wr_kb);
1894 SPLIT(num_large_omap_objects);
1895 SPLIT(num_objects_manifest);
1896 SPLIT(num_omap_bytes);
1897 SPLIT(num_omap_keys);
1898 SPLIT(num_objects_repaired);
1899 SPLIT_PRESERVE_NONZERO(num_shallow_scrub_errors);
1900 SPLIT_PRESERVE_NONZERO(num_deep_scrub_errors);
1901 for (unsigned i = 0; i < out.size(); ++i) {
1902 out[i].num_scrub_errors = out[i].num_shallow_scrub_errors +
1903 out[i].num_deep_scrub_errors;
1904 }
1905 SPLIT(num_objects_recovered);
1906 SPLIT(num_bytes_recovered);
1907 SPLIT(num_keys_recovered);
1908 SPLIT(num_objects_dirty);
1909 SPLIT(num_whiteouts);
1910 SPLIT(num_objects_omap);
1911 SPLIT(num_objects_hit_set_archive);
1912 SPLIT(num_bytes_hit_set_archive);
1913 SPLIT(num_flush);
1914 SPLIT(num_flush_kb);
1915 SPLIT(num_evict);
1916 SPLIT(num_evict_kb);
1917 SPLIT(num_promote);
1918 SPLIT(num_flush_mode_high);
1919 SPLIT(num_flush_mode_low);
1920 SPLIT(num_evict_mode_some);
1921 SPLIT(num_evict_mode_full);
1922 SPLIT(num_objects_pinned);
1923 SPLIT_PRESERVE_NONZERO(num_legacy_snapsets);
1924 #undef SPLIT
1925 #undef SPLIT_PRESERVE_NONZERO
1926 }
1927
1928 void clear() {
1929 memset(this, 0, sizeof(*this));
1930 }
1931
1932 void calc_copies(int nrep) {
1933 num_object_copies = nrep * num_objects;
1934 }
1935
1936 bool is_zero() const {
1937 return mem_is_zero((char*)this, sizeof(*this));
1938 }
1939
1940 void add(const object_stat_sum_t& o);
1941 void sub(const object_stat_sum_t& o);
1942
1943 void dump(ceph::Formatter *f) const;
1944 void padding_check() {
1945 static_assert(
1946 sizeof(object_stat_sum_t) ==
1947 sizeof(num_bytes) +
1948 sizeof(num_objects) +
1949 sizeof(num_object_clones) +
1950 sizeof(num_object_copies) +
1951 sizeof(num_objects_missing_on_primary) +
1952 sizeof(num_objects_degraded) +
1953 sizeof(num_objects_unfound) +
1954 sizeof(num_rd) +
1955 sizeof(num_rd_kb) +
1956 sizeof(num_wr) +
1957 sizeof(num_wr_kb) +
1958 sizeof(num_scrub_errors) +
1959 sizeof(num_large_omap_objects) +
1960 sizeof(num_objects_manifest) +
1961 sizeof(num_omap_bytes) +
1962 sizeof(num_omap_keys) +
1963 sizeof(num_objects_repaired) +
1964 sizeof(num_objects_recovered) +
1965 sizeof(num_bytes_recovered) +
1966 sizeof(num_keys_recovered) +
1967 sizeof(num_shallow_scrub_errors) +
1968 sizeof(num_deep_scrub_errors) +
1969 sizeof(num_objects_dirty) +
1970 sizeof(num_whiteouts) +
1971 sizeof(num_objects_omap) +
1972 sizeof(num_objects_hit_set_archive) +
1973 sizeof(num_objects_misplaced) +
1974 sizeof(num_bytes_hit_set_archive) +
1975 sizeof(num_flush) +
1976 sizeof(num_flush_kb) +
1977 sizeof(num_evict) +
1978 sizeof(num_evict_kb) +
1979 sizeof(num_promote) +
1980 sizeof(num_flush_mode_high) +
1981 sizeof(num_flush_mode_low) +
1982 sizeof(num_evict_mode_some) +
1983 sizeof(num_evict_mode_full) +
1984 sizeof(num_objects_pinned) +
1985 sizeof(num_objects_missing) +
1986 sizeof(num_legacy_snapsets)
1987 ,
1988 "object_stat_sum_t have padding");
1989 }
1990 void encode(ceph::buffer::list& bl) const;
1991 void decode(ceph::buffer::list::const_iterator& bl);
1992 static void generate_test_instances(std::list<object_stat_sum_t*>& o);
1993 };
1994 WRITE_CLASS_ENCODER(object_stat_sum_t)
1995
1996 bool operator==(const object_stat_sum_t& l, const object_stat_sum_t& r);
1997
1998 /**
1999 * a collection of object stat sums
2000 *
2001 * This is a collection of stat sums over different categories.
2002 */
2003 struct object_stat_collection_t {
2004 /**************************************************************************
2005 * WARNING: be sure to update the operator== when adding/removing fields! *
2006 **************************************************************************/
2007 object_stat_sum_t sum;
2008
2009 void calc_copies(int nrep) {
2010 sum.calc_copies(nrep);
2011 }
2012
2013 void dump(ceph::Formatter *f) const;
2014 void encode(ceph::buffer::list& bl) const;
2015 void decode(ceph::buffer::list::const_iterator& bl);
2016 static void generate_test_instances(std::list<object_stat_collection_t*>& o);
2017
2018 bool is_zero() const {
2019 return sum.is_zero();
2020 }
2021
2022 void clear() {
2023 sum.clear();
2024 }
2025
2026 void floor(int64_t f) {
2027 sum.floor(f);
2028 }
2029
2030 void add(const object_stat_sum_t& o) {
2031 sum.add(o);
2032 }
2033
2034 void add(const object_stat_collection_t& o) {
2035 sum.add(o.sum);
2036 }
2037 void sub(const object_stat_collection_t& o) {
2038 sum.sub(o.sum);
2039 }
2040 };
2041 WRITE_CLASS_ENCODER(object_stat_collection_t)
2042
2043 inline bool operator==(const object_stat_collection_t& l,
2044 const object_stat_collection_t& r) {
2045 return l.sum == r.sum;
2046 }
2047
2048
2049 /** pg_stat
2050 * aggregate stats for a single PG.
2051 */
2052 struct pg_stat_t {
2053 /**************************************************************************
2054 * WARNING: be sure to update the operator== when adding/removing fields! *
2055 **************************************************************************/
2056 eversion_t version;
2057 version_t reported_seq; // sequence number
2058 epoch_t reported_epoch; // epoch of this report
2059 uint64_t state;
2060 utime_t last_fresh; // last reported
2061 utime_t last_change; // new state != previous state
2062 utime_t last_active; // state & PG_STATE_ACTIVE
2063 utime_t last_peered; // state & PG_STATE_ACTIVE || state & PG_STATE_PEERED
2064 utime_t last_clean; // state & PG_STATE_CLEAN
2065 utime_t last_unstale; // (state & PG_STATE_STALE) == 0
2066 utime_t last_undegraded; // (state & PG_STATE_DEGRADED) == 0
2067 utime_t last_fullsized; // (state & PG_STATE_UNDERSIZED) == 0
2068
2069 eversion_t log_start; // (log_start,version]
2070 eversion_t ondisk_log_start; // there may be more on disk
2071
2072 epoch_t created;
2073 epoch_t last_epoch_clean;
2074 pg_t parent;
2075 __u32 parent_split_bits;
2076
2077 eversion_t last_scrub;
2078 eversion_t last_deep_scrub;
2079 utime_t last_scrub_stamp;
2080 utime_t last_deep_scrub_stamp;
2081 utime_t last_clean_scrub_stamp;
2082
2083 object_stat_collection_t stats;
2084
2085 int64_t log_size;
2086 int64_t ondisk_log_size; // >= active_log_size
2087
2088 std::vector<int32_t> up, acting;
2089 std::vector<pg_shard_t> avail_no_missing;
2090 std::map< std::set<pg_shard_t>, int32_t > object_location_counts;
2091 epoch_t mapping_epoch;
2092
2093 std::vector<int32_t> blocked_by; ///< osds on which the pg is blocked
2094
2095 interval_set<snapid_t> purged_snaps; ///< recently removed snaps that we've purged
2096
2097 utime_t last_became_active;
2098 utime_t last_became_peered;
2099
2100 /// up, acting primaries
2101 int32_t up_primary;
2102 int32_t acting_primary;
2103
2104 // snaptrimq.size() is 64bit, but let's be serious - anything over 50k is
2105 // absurd already, so cap it to 2^32 and save 4 bytes at the same time
2106 uint32_t snaptrimq_len;
2107
2108 bool stats_invalid:1;
2109 /// true if num_objects_dirty is not accurate (because it was not
2110 /// maintained starting from pool creation)
2111 bool dirty_stats_invalid:1;
2112 bool omap_stats_invalid:1;
2113 bool hitset_stats_invalid:1;
2114 bool hitset_bytes_stats_invalid:1;
2115 bool pin_stats_invalid:1;
2116 bool manifest_stats_invalid:1;
2117
2118 pg_stat_t()
2119 : reported_seq(0),
2120 reported_epoch(0),
2121 state(0),
2122 created(0), last_epoch_clean(0),
2123 parent_split_bits(0),
2124 log_size(0), ondisk_log_size(0),
2125 mapping_epoch(0),
2126 up_primary(-1),
2127 acting_primary(-1),
2128 snaptrimq_len(0),
2129 stats_invalid(false),
2130 dirty_stats_invalid(false),
2131 omap_stats_invalid(false),
2132 hitset_stats_invalid(false),
2133 hitset_bytes_stats_invalid(false),
2134 pin_stats_invalid(false),
2135 manifest_stats_invalid(false)
2136 { }
2137
2138 epoch_t get_effective_last_epoch_clean() const {
2139 if (state & PG_STATE_CLEAN) {
2140 // we are clean as of this report, and should thus take the
2141 // reported epoch
2142 return reported_epoch;
2143 } else {
2144 return last_epoch_clean;
2145 }
2146 }
2147
2148 std::pair<epoch_t, version_t> get_version_pair() const {
2149 return { reported_epoch, reported_seq };
2150 }
2151
2152 void floor(int64_t f) {
2153 stats.floor(f);
2154 if (log_size < f)
2155 log_size = f;
2156 if (ondisk_log_size < f)
2157 ondisk_log_size = f;
2158 if (snaptrimq_len < f)
2159 snaptrimq_len = f;
2160 }
2161
2162 void add_sub_invalid_flags(const pg_stat_t& o) {
2163 // adding (or subtracting!) invalid stats render our stats invalid too
2164 stats_invalid |= o.stats_invalid;
2165 dirty_stats_invalid |= o.dirty_stats_invalid;
2166 omap_stats_invalid |= o.omap_stats_invalid;
2167 hitset_stats_invalid |= o.hitset_stats_invalid;
2168 hitset_bytes_stats_invalid |= o.hitset_bytes_stats_invalid;
2169 pin_stats_invalid |= o.pin_stats_invalid;
2170 manifest_stats_invalid |= o.manifest_stats_invalid;
2171 }
2172 void add(const pg_stat_t& o) {
2173 stats.add(o.stats);
2174 log_size += o.log_size;
2175 ondisk_log_size += o.ondisk_log_size;
2176 snaptrimq_len = std::min((uint64_t)snaptrimq_len + o.snaptrimq_len,
2177 (uint64_t)(1ull << 31));
2178 add_sub_invalid_flags(o);
2179 }
2180 void sub(const pg_stat_t& o) {
2181 stats.sub(o.stats);
2182 log_size -= o.log_size;
2183 ondisk_log_size -= o.ondisk_log_size;
2184 if (o.snaptrimq_len < snaptrimq_len) {
2185 snaptrimq_len -= o.snaptrimq_len;
2186 } else {
2187 snaptrimq_len = 0;
2188 }
2189 add_sub_invalid_flags(o);
2190 }
2191
2192 bool is_acting_osd(int32_t osd, bool primary) const;
2193 void dump(ceph::Formatter *f) const;
2194 void dump_brief(ceph::Formatter *f) const;
2195 void encode(ceph::buffer::list &bl) const;
2196 void decode(ceph::buffer::list::const_iterator &bl);
2197 static void generate_test_instances(std::list<pg_stat_t*>& o);
2198 };
2199 WRITE_CLASS_ENCODER(pg_stat_t)
2200
2201 bool operator==(const pg_stat_t& l, const pg_stat_t& r);
2202
2203 /** store_statfs_t
2204 * ObjectStore full statfs information
2205 */
2206 struct store_statfs_t
2207 {
2208 uint64_t total = 0; ///< Total bytes
2209 uint64_t available = 0; ///< Free bytes available
2210 uint64_t internally_reserved = 0; ///< Bytes reserved for internal purposes
2211
2212 int64_t allocated = 0; ///< Bytes allocated by the store
2213
2214 int64_t data_stored = 0; ///< Bytes actually stored by the user
2215 int64_t data_compressed = 0; ///< Bytes stored after compression
2216 int64_t data_compressed_allocated = 0; ///< Bytes allocated for compressed data
2217 int64_t data_compressed_original = 0; ///< Bytes that were compressed
2218
2219 int64_t omap_allocated = 0; ///< approx usage of omap data
2220 int64_t internal_metadata = 0; ///< approx usage of internal metadata
2221
2222 void reset() {
2223 *this = store_statfs_t();
2224 }
2225 void floor(int64_t f) {
2226 #define FLOOR(x) if (int64_t(x) < f) x = f
2227 FLOOR(total);
2228 FLOOR(available);
2229 FLOOR(internally_reserved);
2230 FLOOR(allocated);
2231 FLOOR(data_stored);
2232 FLOOR(data_compressed);
2233 FLOOR(data_compressed_allocated);
2234 FLOOR(data_compressed_original);
2235
2236 FLOOR(omap_allocated);
2237 FLOOR(internal_metadata);
2238 #undef FLOOR
2239 }
2240
2241 bool operator ==(const store_statfs_t& other) const;
2242 bool is_zero() const {
2243 return *this == store_statfs_t();
2244 }
2245
2246 uint64_t get_used() const {
2247 return total - available - internally_reserved;
2248 }
2249
2250 // this accumulates both actually used and statfs's internally_reserved
2251 uint64_t get_used_raw() const {
2252 return total - available;
2253 }
2254
2255 float get_used_raw_ratio() const {
2256 if (total) {
2257 return (float)get_used_raw() / (float)total;
2258 } else {
2259 return 0.0;
2260 }
2261 }
2262
2263 // helpers to ease legacy code porting
2264 uint64_t kb_avail() const {
2265 return available >> 10;
2266 }
2267 uint64_t kb() const {
2268 return total >> 10;
2269 }
2270 uint64_t kb_used() const {
2271 return (total - available - internally_reserved) >> 10;
2272 }
2273 uint64_t kb_used_raw() const {
2274 return get_used_raw() >> 10;
2275 }
2276
2277 uint64_t kb_used_data() const {
2278 return allocated >> 10;
2279 }
2280 uint64_t kb_used_omap() const {
2281 return omap_allocated >> 10;
2282 }
2283
2284 uint64_t kb_used_internal_metadata() const {
2285 return internal_metadata >> 10;
2286 }
2287
2288 void add(const store_statfs_t& o) {
2289 total += o.total;
2290 available += o.available;
2291 internally_reserved += o.internally_reserved;
2292 allocated += o.allocated;
2293 data_stored += o.data_stored;
2294 data_compressed += o.data_compressed;
2295 data_compressed_allocated += o.data_compressed_allocated;
2296 data_compressed_original += o.data_compressed_original;
2297 omap_allocated += o.omap_allocated;
2298 internal_metadata += o.internal_metadata;
2299 }
2300 void sub(const store_statfs_t& o) {
2301 total -= o.total;
2302 available -= o.available;
2303 internally_reserved -= o.internally_reserved;
2304 allocated -= o.allocated;
2305 data_stored -= o.data_stored;
2306 data_compressed -= o.data_compressed;
2307 data_compressed_allocated -= o.data_compressed_allocated;
2308 data_compressed_original -= o.data_compressed_original;
2309 omap_allocated -= o.omap_allocated;
2310 internal_metadata -= o.internal_metadata;
2311 }
2312 void dump(ceph::Formatter *f) const;
2313 DENC(store_statfs_t, v, p) {
2314 DENC_START(1, 1, p);
2315 denc(v.total, p);
2316 denc(v.available, p);
2317 denc(v.internally_reserved, p);
2318 denc(v.allocated, p);
2319 denc(v.data_stored, p);
2320 denc(v.data_compressed, p);
2321 denc(v.data_compressed_allocated, p);
2322 denc(v.data_compressed_original, p);
2323 denc(v.omap_allocated, p);
2324 denc(v.internal_metadata, p);
2325 DENC_FINISH(p);
2326 }
2327 static void generate_test_instances(std::list<store_statfs_t*>& o);
2328 };
2329 WRITE_CLASS_DENC(store_statfs_t)
2330
2331 std::ostream &operator<<(std::ostream &lhs, const store_statfs_t &rhs);
2332
2333 /** osd_stat
2334 * aggregate stats for an osd
2335 */
2336 struct osd_stat_t {
2337 store_statfs_t statfs;
2338 std::vector<int> hb_peers;
2339 int32_t snap_trim_queue_len, num_snap_trimming;
2340 uint64_t num_shards_repaired;
2341
2342 pow2_hist_t op_queue_age_hist;
2343
2344 objectstore_perf_stat_t os_perf_stat;
2345 osd_alerts_t os_alerts;
2346
2347 epoch_t up_from = 0;
2348 uint64_t seq = 0;
2349
2350 uint32_t num_pgs = 0;
2351
2352 uint32_t num_osds = 0;
2353 uint32_t num_per_pool_osds = 0;
2354 uint32_t num_per_pool_omap_osds = 0;
2355
2356 struct Interfaces {
2357 uint32_t last_update; // in seconds
2358 uint32_t back_pingtime[3];
2359 uint32_t back_min[3];
2360 uint32_t back_max[3];
2361 uint32_t back_last;
2362 uint32_t front_pingtime[3];
2363 uint32_t front_min[3];
2364 uint32_t front_max[3];
2365 uint32_t front_last;
2366 };
2367 map<int, Interfaces> hb_pingtime; ///< map of osd id to Interfaces
2368
2369 osd_stat_t() : snap_trim_queue_len(0), num_snap_trimming(0),
2370 num_shards_repaired(0) {}
2371
2372 void add(const osd_stat_t& o) {
2373 statfs.add(o.statfs);
2374 snap_trim_queue_len += o.snap_trim_queue_len;
2375 num_snap_trimming += o.num_snap_trimming;
2376 num_shards_repaired += o.num_shards_repaired;
2377 op_queue_age_hist.add(o.op_queue_age_hist);
2378 os_perf_stat.add(o.os_perf_stat);
2379 num_pgs += o.num_pgs;
2380 num_osds += o.num_osds;
2381 num_per_pool_osds += o.num_per_pool_osds;
2382 num_per_pool_omap_osds += o.num_per_pool_omap_osds;
2383 for (const auto& a : o.os_alerts) {
2384 auto& target = os_alerts[a.first];
2385 for (auto& i : a.second) {
2386 target.emplace(i.first, i.second);
2387 }
2388 }
2389 }
2390 void sub(const osd_stat_t& o) {
2391 statfs.sub(o.statfs);
2392 snap_trim_queue_len -= o.snap_trim_queue_len;
2393 num_snap_trimming -= o.num_snap_trimming;
2394 num_shards_repaired -= o.num_shards_repaired;
2395 op_queue_age_hist.sub(o.op_queue_age_hist);
2396 os_perf_stat.sub(o.os_perf_stat);
2397 num_pgs -= o.num_pgs;
2398 num_osds -= o.num_osds;
2399 num_per_pool_osds -= o.num_per_pool_osds;
2400 num_per_pool_omap_osds -= o.num_per_pool_omap_osds;
2401 for (const auto& a : o.os_alerts) {
2402 auto& target = os_alerts[a.first];
2403 for (auto& i : a.second) {
2404 target.erase(i.first);
2405 }
2406 if (target.empty()) {
2407 os_alerts.erase(a.first);
2408 }
2409 }
2410 }
2411 void dump(ceph::Formatter *f) const;
2412 void encode(ceph::buffer::list &bl, uint64_t features) const;
2413 void decode(ceph::buffer::list::const_iterator &bl);
2414 static void generate_test_instances(std::list<osd_stat_t*>& o);
2415 };
2416 WRITE_CLASS_ENCODER_FEATURES(osd_stat_t)
2417
2418 inline bool operator==(const osd_stat_t& l, const osd_stat_t& r) {
2419 return l.statfs == r.statfs &&
2420 l.snap_trim_queue_len == r.snap_trim_queue_len &&
2421 l.num_snap_trimming == r.num_snap_trimming &&
2422 l.num_shards_repaired == r.num_shards_repaired &&
2423 l.hb_peers == r.hb_peers &&
2424 l.op_queue_age_hist == r.op_queue_age_hist &&
2425 l.os_perf_stat == r.os_perf_stat &&
2426 l.num_pgs == r.num_pgs &&
2427 l.num_osds == r.num_osds &&
2428 l.num_per_pool_osds == r.num_per_pool_osds &&
2429 l.num_per_pool_omap_osds == r.num_per_pool_omap_osds;
2430 }
2431 inline bool operator!=(const osd_stat_t& l, const osd_stat_t& r) {
2432 return !(l == r);
2433 }
2434
2435 inline std::ostream& operator<<(std::ostream& out, const osd_stat_t& s) {
2436 return out << "osd_stat(" << s.statfs << ", "
2437 << "peers " << s.hb_peers
2438 << " op hist " << s.op_queue_age_hist.h
2439 << ")";
2440 }
2441
2442 /*
2443 * summation over an entire pool
2444 */
2445 struct pool_stat_t {
2446 object_stat_collection_t stats;
2447 store_statfs_t store_stats;
2448 int64_t log_size;
2449 int64_t ondisk_log_size; // >= active_log_size
2450 int32_t up; ///< number of up replicas or shards
2451 int32_t acting; ///< number of acting replicas or shards
2452 int32_t num_store_stats; ///< amount of store_stats accumulated
2453
2454 pool_stat_t() : log_size(0), ondisk_log_size(0), up(0), acting(0),
2455 num_store_stats(0)
2456 { }
2457
2458 void floor(int64_t f) {
2459 stats.floor(f);
2460 store_stats.floor(f);
2461 if (log_size < f)
2462 log_size = f;
2463 if (ondisk_log_size < f)
2464 ondisk_log_size = f;
2465 if (up < f)
2466 up = f;
2467 if (acting < f)
2468 acting = f;
2469 if (num_store_stats < f)
2470 num_store_stats = f;
2471 }
2472
2473 void add(const store_statfs_t& o) {
2474 store_stats.add(o);
2475 ++num_store_stats;
2476 }
2477 void sub(const store_statfs_t& o) {
2478 store_stats.sub(o);
2479 --num_store_stats;
2480 }
2481
2482 void add(const pg_stat_t& o) {
2483 stats.add(o.stats);
2484 log_size += o.log_size;
2485 ondisk_log_size += o.ondisk_log_size;
2486 up += o.up.size();
2487 acting += o.acting.size();
2488 }
2489 void sub(const pg_stat_t& o) {
2490 stats.sub(o.stats);
2491 log_size -= o.log_size;
2492 ondisk_log_size -= o.ondisk_log_size;
2493 up -= o.up.size();
2494 acting -= o.acting.size();
2495 }
2496
2497 bool is_zero() const {
2498 return (stats.is_zero() &&
2499 store_stats.is_zero() &&
2500 log_size == 0 &&
2501 ondisk_log_size == 0 &&
2502 up == 0 &&
2503 acting == 0 &&
2504 num_store_stats == 0);
2505 }
2506
2507 // helper accessors to retrieve used/netto bytes depending on the
2508 // collection method: new per-pool objectstore report or legacy PG
2509 // summation at OSD.
2510 // In legacy mode used and netto values are the same. But for new per-pool
2511 // collection 'used' provides amount of space ALLOCATED at all related OSDs
2512 // and 'netto' is amount of stored user data.
2513 uint64_t get_allocated_data_bytes(bool per_pool) const {
2514 if (per_pool) {
2515 return store_stats.allocated;
2516 } else {
2517 // legacy mode, use numbers from 'stats'
2518 return stats.sum.num_bytes + stats.sum.num_bytes_hit_set_archive;
2519 }
2520 }
2521 uint64_t get_allocated_omap_bytes(bool per_pool_omap) const {
2522 if (per_pool_omap) {
2523 return store_stats.omap_allocated;
2524 } else {
2525 // omap is not broken out by pool by nautilus bluestore; report the
2526 // scrub value. this will be imprecise in that it won't account for
2527 // any storage overhead/efficiency.
2528 return stats.sum.num_omap_bytes;
2529 }
2530 }
2531 uint64_t get_user_data_bytes(float raw_used_rate, ///< space amp factor
2532 bool per_pool) const {
2533 // NOTE: we need the space amp factor so that we can work backwards from
2534 // the raw utilization to the amount of data that the user actually stored.
2535 if (per_pool) {
2536 return raw_used_rate ? store_stats.data_stored / raw_used_rate : 0;
2537 } else {
2538 // legacy mode, use numbers from 'stats'. note that we do NOT use the
2539 // raw_used_rate factor here because we are working from the PG stats
2540 // directly.
2541 return stats.sum.num_bytes + stats.sum.num_bytes_hit_set_archive;
2542 }
2543 }
2544 uint64_t get_user_omap_bytes(float raw_used_rate, ///< space amp factor
2545 bool per_pool_omap) const {
2546 if (per_pool_omap) {
2547 return raw_used_rate ? store_stats.omap_allocated / raw_used_rate : 0;
2548 } else {
2549 // omap usage is lazily reported during scrub; this value may lag.
2550 return stats.sum.num_omap_bytes;
2551 }
2552 }
2553
2554 void dump(ceph::Formatter *f) const;
2555 void encode(ceph::buffer::list &bl, uint64_t features) const;
2556 void decode(ceph::buffer::list::const_iterator &bl);
2557 static void generate_test_instances(std::list<pool_stat_t*>& o);
2558 };
2559 WRITE_CLASS_ENCODER_FEATURES(pool_stat_t)
2560
2561
2562 // -----------------------------------------
2563
2564 /**
2565 * pg_hit_set_info_t - information about a single recorded HitSet
2566 *
2567 * Track basic metadata about a HitSet, like the number of insertions
2568 * and the time range it covers.
2569 */
2570 struct pg_hit_set_info_t {
2571 utime_t begin, end; ///< time interval
2572 eversion_t version; ///< version this HitSet object was written
2573 bool using_gmt; ///< use gmt for creating the hit_set archive object name
2574
2575 friend bool operator==(const pg_hit_set_info_t& l,
2576 const pg_hit_set_info_t& r) {
2577 return
2578 l.begin == r.begin &&
2579 l.end == r.end &&
2580 l.version == r.version &&
2581 l.using_gmt == r.using_gmt;
2582 }
2583
2584 explicit pg_hit_set_info_t(bool using_gmt = true)
2585 : using_gmt(using_gmt) {}
2586
2587 void encode(ceph::buffer::list &bl) const;
2588 void decode(ceph::buffer::list::const_iterator &bl);
2589 void dump(ceph::Formatter *f) const;
2590 static void generate_test_instances(std::list<pg_hit_set_info_t*>& o);
2591 };
2592 WRITE_CLASS_ENCODER(pg_hit_set_info_t)
2593
2594 /**
2595 * pg_hit_set_history_t - information about a history of hitsets
2596 *
2597 * Include information about the currently accumulating hit set as well
2598 * as archived/historical ones.
2599 */
2600 struct pg_hit_set_history_t {
2601 eversion_t current_last_update; ///< last version inserted into current set
2602 std::list<pg_hit_set_info_t> history; ///< archived sets, sorted oldest -> newest
2603
2604 friend bool operator==(const pg_hit_set_history_t& l,
2605 const pg_hit_set_history_t& r) {
2606 return
2607 l.current_last_update == r.current_last_update &&
2608 l.history == r.history;
2609 }
2610
2611 void encode(ceph::buffer::list &bl) const;
2612 void decode(ceph::buffer::list::const_iterator &bl);
2613 void dump(ceph::Formatter *f) const;
2614 static void generate_test_instances(std::list<pg_hit_set_history_t*>& o);
2615 };
2616 WRITE_CLASS_ENCODER(pg_hit_set_history_t)
2617
2618
2619 // -----------------------------------------
2620
2621 /**
2622 * pg_history_t - information about recent pg peering/mapping history
2623 *
2624 * This is aggressively shared between OSDs to bound the amount of past
2625 * history they need to worry about.
2626 */
2627 struct pg_history_t {
2628 epoch_t epoch_created = 0; // epoch in which *pg* was created (pool or pg)
2629 epoch_t epoch_pool_created = 0; // epoch in which *pool* was created
2630 // (note: may be pg creation epoch for
2631 // pre-luminous clusters)
2632 epoch_t last_epoch_started = 0;; // lower bound on last epoch started (anywhere, not necessarily locally)
2633 epoch_t last_interval_started = 0;; // first epoch of last_epoch_started interval
2634 epoch_t last_epoch_clean = 0;; // lower bound on last epoch the PG was completely clean.
2635 epoch_t last_interval_clean = 0;; // first epoch of last_epoch_clean interval
2636 epoch_t last_epoch_split = 0;; // as parent or child
2637 epoch_t last_epoch_marked_full = 0;; // pool or cluster
2638
2639 /**
2640 * In the event of a map discontinuity, same_*_since may reflect the first
2641 * map the osd has seen in the new map sequence rather than the actual start
2642 * of the interval. This is ok since a discontinuity at epoch e means there
2643 * must have been a clean interval between e and now and that we cannot be
2644 * in the active set during the interval containing e.
2645 */
2646 epoch_t same_up_since = 0;; // same acting set since
2647 epoch_t same_interval_since = 0;; // same acting AND up set since
2648 epoch_t same_primary_since = 0;; // same primary at least back through this epoch.
2649
2650 eversion_t last_scrub;
2651 eversion_t last_deep_scrub;
2652 utime_t last_scrub_stamp;
2653 utime_t last_deep_scrub_stamp;
2654 utime_t last_clean_scrub_stamp;
2655
2656 /// upper bound on how long prior interval readable (relative to encode time)
2657 ceph::timespan prior_readable_until_ub = ceph::timespan::zero();
2658
2659 friend bool operator==(const pg_history_t& l, const pg_history_t& r) {
2660 return
2661 l.epoch_created == r.epoch_created &&
2662 l.epoch_pool_created == r.epoch_pool_created &&
2663 l.last_epoch_started == r.last_epoch_started &&
2664 l.last_interval_started == r.last_interval_started &&
2665 l.last_epoch_clean == r.last_epoch_clean &&
2666 l.last_interval_clean == r.last_interval_clean &&
2667 l.last_epoch_split == r.last_epoch_split &&
2668 l.last_epoch_marked_full == r.last_epoch_marked_full &&
2669 l.same_up_since == r.same_up_since &&
2670 l.same_interval_since == r.same_interval_since &&
2671 l.same_primary_since == r.same_primary_since &&
2672 l.last_scrub == r.last_scrub &&
2673 l.last_deep_scrub == r.last_deep_scrub &&
2674 l.last_scrub_stamp == r.last_scrub_stamp &&
2675 l.last_deep_scrub_stamp == r.last_deep_scrub_stamp &&
2676 l.last_clean_scrub_stamp == r.last_clean_scrub_stamp &&
2677 l.prior_readable_until_ub == r.prior_readable_until_ub;
2678 }
2679
2680 pg_history_t() {}
2681 pg_history_t(epoch_t created, utime_t stamp)
2682 : epoch_created(created),
2683 epoch_pool_created(created),
2684 same_up_since(created),
2685 same_interval_since(created),
2686 same_primary_since(created),
2687 last_scrub_stamp(stamp),
2688 last_deep_scrub_stamp(stamp),
2689 last_clean_scrub_stamp(stamp) {}
2690
2691 bool merge(const pg_history_t &other) {
2692 // Here, we only update the fields which cannot be calculated from the OSDmap.
2693 bool modified = false;
2694 if (epoch_created < other.epoch_created) {
2695 epoch_created = other.epoch_created;
2696 modified = true;
2697 }
2698 if (epoch_pool_created < other.epoch_pool_created) {
2699 // FIXME: for jewel compat only; this should either be 0 or always the
2700 // same value across all pg instances.
2701 epoch_pool_created = other.epoch_pool_created;
2702 modified = true;
2703 }
2704 if (last_epoch_started < other.last_epoch_started) {
2705 last_epoch_started = other.last_epoch_started;
2706 modified = true;
2707 }
2708 if (last_interval_started < other.last_interval_started) {
2709 last_interval_started = other.last_interval_started;
2710 // if we are learning about a newer *started* interval, our
2711 // readable_until_ub is obsolete
2712 prior_readable_until_ub = other.prior_readable_until_ub;
2713 modified = true;
2714 } else if (other.last_interval_started == last_interval_started &&
2715 other.prior_readable_until_ub < prior_readable_until_ub) {
2716 // if other is the *same* interval, than pull our upper bound in
2717 // if they have a tighter bound.
2718 prior_readable_until_ub = other.prior_readable_until_ub;
2719 modified = true;
2720 }
2721 if (last_epoch_clean < other.last_epoch_clean) {
2722 last_epoch_clean = other.last_epoch_clean;
2723 modified = true;
2724 }
2725 if (last_interval_clean < other.last_interval_clean) {
2726 last_interval_clean = other.last_interval_clean;
2727 modified = true;
2728 }
2729 if (last_epoch_split < other.last_epoch_split) {
2730 last_epoch_split = other.last_epoch_split;
2731 modified = true;
2732 }
2733 if (last_epoch_marked_full < other.last_epoch_marked_full) {
2734 last_epoch_marked_full = other.last_epoch_marked_full;
2735 modified = true;
2736 }
2737 if (other.last_scrub > last_scrub) {
2738 last_scrub = other.last_scrub;
2739 modified = true;
2740 }
2741 if (other.last_scrub_stamp > last_scrub_stamp) {
2742 last_scrub_stamp = other.last_scrub_stamp;
2743 modified = true;
2744 }
2745 if (other.last_deep_scrub > last_deep_scrub) {
2746 last_deep_scrub = other.last_deep_scrub;
2747 modified = true;
2748 }
2749 if (other.last_deep_scrub_stamp > last_deep_scrub_stamp) {
2750 last_deep_scrub_stamp = other.last_deep_scrub_stamp;
2751 modified = true;
2752 }
2753 if (other.last_clean_scrub_stamp > last_clean_scrub_stamp) {
2754 last_clean_scrub_stamp = other.last_clean_scrub_stamp;
2755 modified = true;
2756 }
2757 return modified;
2758 }
2759
2760 void encode(ceph::buffer::list& bl) const;
2761 void decode(ceph::buffer::list::const_iterator& p);
2762 void dump(ceph::Formatter *f) const;
2763 static void generate_test_instances(std::list<pg_history_t*>& o);
2764
2765 ceph::signedspan refresh_prior_readable_until_ub(
2766 ceph::signedspan now, ///< now, relative to osd startup_time
2767 ceph::signedspan ub) { ///< ub, relative to osd startup_time
2768 if (now >= ub) {
2769 // prior interval(s) are unreadable; we can zero the upper bound
2770 prior_readable_until_ub = ceph::signedspan::zero();
2771 return ceph::signedspan::zero();
2772 } else {
2773 prior_readable_until_ub = ub - now;
2774 return ub;
2775 }
2776 }
2777 ceph::signedspan get_prior_readable_until_ub(ceph::signedspan now) {
2778 if (prior_readable_until_ub == ceph::signedspan::zero()) {
2779 return ceph::signedspan::zero();
2780 }
2781 return now + prior_readable_until_ub;
2782 }
2783 };
2784 WRITE_CLASS_ENCODER(pg_history_t)
2785
2786 inline std::ostream& operator<<(std::ostream& out, const pg_history_t& h) {
2787 out << "ec=" << h.epoch_created << "/" << h.epoch_pool_created
2788 << " lis/c=" << h.last_interval_started
2789 << "/" << h.last_interval_clean
2790 << " les/c/f=" << h.last_epoch_started << "/" << h.last_epoch_clean
2791 << "/" << h.last_epoch_marked_full
2792 << " sis=" << h.same_interval_since;
2793 if (h.prior_readable_until_ub != ceph::timespan::zero()) {
2794 out << " pruub=" << h.prior_readable_until_ub;
2795 }
2796 return out;
2797 }
2798
2799
2800 /**
2801 * pg_info_t - summary of PG statistics.
2802 *
2803 * some notes:
2804 * - last_complete implies we have all objects that existed as of that
2805 * stamp, OR a newer object, OR have already applied a later delete.
2806 * - if last_complete >= log.bottom, then we know pg contents thru log.head.
2807 * otherwise, we have no idea what the pg is supposed to contain.
2808 */
2809 struct pg_info_t {
2810 spg_t pgid;
2811 eversion_t last_update; ///< last object version applied to store.
2812 eversion_t last_complete; ///< last version pg was complete through.
2813 epoch_t last_epoch_started; ///< last epoch at which this pg started on this osd
2814 epoch_t last_interval_started; ///< first epoch of last_epoch_started interval
2815
2816 version_t last_user_version; ///< last user object version applied to store
2817
2818 eversion_t log_tail; ///< oldest log entry.
2819
2820 hobject_t last_backfill; ///< objects >= this and < last_complete may be missing
2821
2822 interval_set<snapid_t> purged_snaps;
2823
2824 pg_stat_t stats;
2825
2826 pg_history_t history;
2827 pg_hit_set_history_t hit_set;
2828
2829 friend bool operator==(const pg_info_t& l, const pg_info_t& r) {
2830 return
2831 l.pgid == r.pgid &&
2832 l.last_update == r.last_update &&
2833 l.last_complete == r.last_complete &&
2834 l.last_epoch_started == r.last_epoch_started &&
2835 l.last_interval_started == r.last_interval_started &&
2836 l.last_user_version == r.last_user_version &&
2837 l.log_tail == r.log_tail &&
2838 l.last_backfill == r.last_backfill &&
2839 l.purged_snaps == r.purged_snaps &&
2840 l.stats == r.stats &&
2841 l.history == r.history &&
2842 l.hit_set == r.hit_set;
2843 }
2844
2845 pg_info_t()
2846 : last_epoch_started(0),
2847 last_interval_started(0),
2848 last_user_version(0),
2849 last_backfill(hobject_t::get_max())
2850 { }
2851 // cppcheck-suppress noExplicitConstructor
2852 pg_info_t(spg_t p)
2853 : pgid(p),
2854 last_epoch_started(0),
2855 last_interval_started(0),
2856 last_user_version(0),
2857 last_backfill(hobject_t::get_max())
2858 { }
2859
2860 void set_last_backfill(hobject_t pos) {
2861 last_backfill = pos;
2862 }
2863
2864 bool is_empty() const { return last_update.version == 0; }
2865 bool dne() const { return history.epoch_created == 0; }
2866
2867 bool has_missing() const { return last_complete != last_update; }
2868 bool is_incomplete() const { return !last_backfill.is_max(); }
2869
2870 void encode(ceph::buffer::list& bl) const;
2871 void decode(ceph::buffer::list::const_iterator& p);
2872 void dump(ceph::Formatter *f) const;
2873 static void generate_test_instances(std::list<pg_info_t*>& o);
2874 };
2875 WRITE_CLASS_ENCODER(pg_info_t)
2876
2877 inline std::ostream& operator<<(std::ostream& out, const pg_info_t& pgi)
2878 {
2879 out << pgi.pgid << "(";
2880 if (pgi.dne())
2881 out << " DNE";
2882 if (pgi.is_empty())
2883 out << " empty";
2884 else {
2885 out << " v " << pgi.last_update;
2886 if (pgi.last_complete != pgi.last_update)
2887 out << " lc " << pgi.last_complete;
2888 out << " (" << pgi.log_tail << "," << pgi.last_update << "]";
2889 }
2890 if (pgi.is_incomplete())
2891 out << " lb " << pgi.last_backfill;
2892 //out << " c " << pgi.epoch_created;
2893 out << " local-lis/les=" << pgi.last_interval_started
2894 << "/" << pgi.last_epoch_started;
2895 out << " n=" << pgi.stats.stats.sum.num_objects;
2896 out << " " << pgi.history
2897 << ")";
2898 return out;
2899 }
2900
2901 /**
2902 * pg_fast_info_t - common pg_info_t fields
2903 *
2904 * These are the fields of pg_info_t (and children) that are updated for
2905 * most IO operations.
2906 *
2907 * ** WARNING **
2908 * Because we rely on these fields to be applied to the normal
2909 * info struct, adding a new field here that is not also new in info
2910 * means that we must set an incompat OSD feature bit!
2911 */
2912 struct pg_fast_info_t {
2913 eversion_t last_update;
2914 eversion_t last_complete;
2915 version_t last_user_version;
2916 struct { // pg_stat_t stats
2917 eversion_t version;
2918 version_t reported_seq;
2919 utime_t last_fresh;
2920 utime_t last_active;
2921 utime_t last_peered;
2922 utime_t last_clean;
2923 utime_t last_unstale;
2924 utime_t last_undegraded;
2925 utime_t last_fullsized;
2926 int64_t log_size; // (also ondisk_log_size, which has the same value)
2927 struct { // object_stat_collection_t stats;
2928 struct { // objct_stat_sum_t sum
2929 int64_t num_bytes; // in bytes
2930 int64_t num_objects;
2931 int64_t num_object_copies;
2932 int64_t num_rd;
2933 int64_t num_rd_kb;
2934 int64_t num_wr;
2935 int64_t num_wr_kb;
2936 int64_t num_objects_dirty;
2937 } sum;
2938 } stats;
2939 } stats;
2940
2941 void populate_from(const pg_info_t& info) {
2942 last_update = info.last_update;
2943 last_complete = info.last_complete;
2944 last_user_version = info.last_user_version;
2945 stats.version = info.stats.version;
2946 stats.reported_seq = info.stats.reported_seq;
2947 stats.last_fresh = info.stats.last_fresh;
2948 stats.last_active = info.stats.last_active;
2949 stats.last_peered = info.stats.last_peered;
2950 stats.last_clean = info.stats.last_clean;
2951 stats.last_unstale = info.stats.last_unstale;
2952 stats.last_undegraded = info.stats.last_undegraded;
2953 stats.last_fullsized = info.stats.last_fullsized;
2954 stats.log_size = info.stats.log_size;
2955 stats.stats.sum.num_bytes = info.stats.stats.sum.num_bytes;
2956 stats.stats.sum.num_objects = info.stats.stats.sum.num_objects;
2957 stats.stats.sum.num_object_copies = info.stats.stats.sum.num_object_copies;
2958 stats.stats.sum.num_rd = info.stats.stats.sum.num_rd;
2959 stats.stats.sum.num_rd_kb = info.stats.stats.sum.num_rd_kb;
2960 stats.stats.sum.num_wr = info.stats.stats.sum.num_wr;
2961 stats.stats.sum.num_wr_kb = info.stats.stats.sum.num_wr_kb;
2962 stats.stats.sum.num_objects_dirty = info.stats.stats.sum.num_objects_dirty;
2963 }
2964
2965 bool try_apply_to(pg_info_t* info) {
2966 if (last_update <= info->last_update)
2967 return false;
2968 info->last_update = last_update;
2969 info->last_complete = last_complete;
2970 info->last_user_version = last_user_version;
2971 info->stats.version = stats.version;
2972 info->stats.reported_seq = stats.reported_seq;
2973 info->stats.last_fresh = stats.last_fresh;
2974 info->stats.last_active = stats.last_active;
2975 info->stats.last_peered = stats.last_peered;
2976 info->stats.last_clean = stats.last_clean;
2977 info->stats.last_unstale = stats.last_unstale;
2978 info->stats.last_undegraded = stats.last_undegraded;
2979 info->stats.last_fullsized = stats.last_fullsized;
2980 info->stats.log_size = stats.log_size;
2981 info->stats.ondisk_log_size = stats.log_size;
2982 info->stats.stats.sum.num_bytes = stats.stats.sum.num_bytes;
2983 info->stats.stats.sum.num_objects = stats.stats.sum.num_objects;
2984 info->stats.stats.sum.num_object_copies = stats.stats.sum.num_object_copies;
2985 info->stats.stats.sum.num_rd = stats.stats.sum.num_rd;
2986 info->stats.stats.sum.num_rd_kb = stats.stats.sum.num_rd_kb;
2987 info->stats.stats.sum.num_wr = stats.stats.sum.num_wr;
2988 info->stats.stats.sum.num_wr_kb = stats.stats.sum.num_wr_kb;
2989 info->stats.stats.sum.num_objects_dirty = stats.stats.sum.num_objects_dirty;
2990 return true;
2991 }
2992
2993 void encode(ceph::buffer::list& bl) const {
2994 ENCODE_START(1, 1, bl);
2995 encode(last_update, bl);
2996 encode(last_complete, bl);
2997 encode(last_user_version, bl);
2998 encode(stats.version, bl);
2999 encode(stats.reported_seq, bl);
3000 encode(stats.last_fresh, bl);
3001 encode(stats.last_active, bl);
3002 encode(stats.last_peered, bl);
3003 encode(stats.last_clean, bl);
3004 encode(stats.last_unstale, bl);
3005 encode(stats.last_undegraded, bl);
3006 encode(stats.last_fullsized, bl);
3007 encode(stats.log_size, bl);
3008 encode(stats.stats.sum.num_bytes, bl);
3009 encode(stats.stats.sum.num_objects, bl);
3010 encode(stats.stats.sum.num_object_copies, bl);
3011 encode(stats.stats.sum.num_rd, bl);
3012 encode(stats.stats.sum.num_rd_kb, bl);
3013 encode(stats.stats.sum.num_wr, bl);
3014 encode(stats.stats.sum.num_wr_kb, bl);
3015 encode(stats.stats.sum.num_objects_dirty, bl);
3016 ENCODE_FINISH(bl);
3017 }
3018 void decode(ceph::buffer::list::const_iterator& p) {
3019 DECODE_START(1, p);
3020 decode(last_update, p);
3021 decode(last_complete, p);
3022 decode(last_user_version, p);
3023 decode(stats.version, p);
3024 decode(stats.reported_seq, p);
3025 decode(stats.last_fresh, p);
3026 decode(stats.last_active, p);
3027 decode(stats.last_peered, p);
3028 decode(stats.last_clean, p);
3029 decode(stats.last_unstale, p);
3030 decode(stats.last_undegraded, p);
3031 decode(stats.last_fullsized, p);
3032 decode(stats.log_size, p);
3033 decode(stats.stats.sum.num_bytes, p);
3034 decode(stats.stats.sum.num_objects, p);
3035 decode(stats.stats.sum.num_object_copies, p);
3036 decode(stats.stats.sum.num_rd, p);
3037 decode(stats.stats.sum.num_rd_kb, p);
3038 decode(stats.stats.sum.num_wr, p);
3039 decode(stats.stats.sum.num_wr_kb, p);
3040 decode(stats.stats.sum.num_objects_dirty, p);
3041 DECODE_FINISH(p);
3042 }
3043 };
3044 WRITE_CLASS_ENCODER(pg_fast_info_t)
3045
3046
3047 class OSDMap;
3048 /**
3049 * PastIntervals -- information needed to determine the PriorSet and
3050 * the might_have_unfound set
3051 */
3052 class PastIntervals {
3053 #ifdef WITH_SEASTAR
3054 using OSDMapRef = boost::local_shared_ptr<const OSDMap>;
3055 #else
3056 using OSDMapRef = std::shared_ptr<const OSDMap>;
3057 #endif
3058 public:
3059 struct pg_interval_t {
3060 std::vector<int32_t> up, acting;
3061 epoch_t first, last;
3062 bool maybe_went_rw;
3063 int32_t primary;
3064 int32_t up_primary;
3065
3066 pg_interval_t()
3067 : first(0), last(0),
3068 maybe_went_rw(false),
3069 primary(-1),
3070 up_primary(-1)
3071 {}
3072
3073 pg_interval_t(
3074 std::vector<int32_t> &&up,
3075 std::vector<int32_t> &&acting,
3076 epoch_t first,
3077 epoch_t last,
3078 bool maybe_went_rw,
3079 int32_t primary,
3080 int32_t up_primary)
3081 : up(up), acting(acting), first(first), last(last),
3082 maybe_went_rw(maybe_went_rw), primary(primary), up_primary(up_primary)
3083 {}
3084
3085 void encode(ceph::buffer::list& bl) const;
3086 void decode(ceph::buffer::list::const_iterator& bl);
3087 void dump(ceph::Formatter *f) const;
3088 static void generate_test_instances(std::list<pg_interval_t*>& o);
3089 };
3090
3091 PastIntervals();
3092 PastIntervals(PastIntervals &&rhs) = default;
3093 PastIntervals &operator=(PastIntervals &&rhs) = default;
3094
3095 PastIntervals(const PastIntervals &rhs);
3096 PastIntervals &operator=(const PastIntervals &rhs);
3097
3098 class interval_rep {
3099 public:
3100 virtual size_t size() const = 0;
3101 virtual bool empty() const = 0;
3102 virtual void clear() = 0;
3103 virtual std::pair<epoch_t, epoch_t> get_bounds() const = 0;
3104 virtual std::set<pg_shard_t> get_all_participants(
3105 bool ec_pool) const = 0;
3106 virtual void add_interval(bool ec_pool, const pg_interval_t &interval) = 0;
3107 virtual std::unique_ptr<interval_rep> clone() const = 0;
3108 virtual std::ostream &print(std::ostream &out) const = 0;
3109 virtual void encode(ceph::buffer::list &bl) const = 0;
3110 virtual void decode(ceph::buffer::list::const_iterator &bl) = 0;
3111 virtual void dump(ceph::Formatter *f) const = 0;
3112 virtual void iterate_mayberw_back_to(
3113 epoch_t les,
3114 std::function<void(epoch_t, const std::set<pg_shard_t> &)> &&f) const = 0;
3115
3116 virtual bool has_full_intervals() const { return false; }
3117 virtual void iterate_all_intervals(
3118 std::function<void(const pg_interval_t &)> &&f) const {
3119 ceph_assert(!has_full_intervals());
3120 ceph_abort_msg("not valid for this implementation");
3121 }
3122 virtual void adjust_start_backwards(epoch_t last_epoch_clean) = 0;
3123
3124 virtual ~interval_rep() {}
3125 };
3126 friend class pi_compact_rep;
3127 private:
3128
3129 std::unique_ptr<interval_rep> past_intervals;
3130
3131 explicit PastIntervals(interval_rep *rep) : past_intervals(rep) {}
3132
3133 public:
3134 void add_interval(bool ec_pool, const pg_interval_t &interval) {
3135 ceph_assert(past_intervals);
3136 return past_intervals->add_interval(ec_pool, interval);
3137 }
3138
3139 void encode(ceph::buffer::list &bl) const {
3140 ENCODE_START(1, 1, bl);
3141 if (past_intervals) {
3142 __u8 type = 2;
3143 encode(type, bl);
3144 past_intervals->encode(bl);
3145 } else {
3146 encode((__u8)0, bl);
3147 }
3148 ENCODE_FINISH(bl);
3149 }
3150
3151 void decode(ceph::buffer::list::const_iterator &bl);
3152
3153 void dump(ceph::Formatter *f) const {
3154 ceph_assert(past_intervals);
3155 past_intervals->dump(f);
3156 }
3157 static void generate_test_instances(std::list<PastIntervals *> & o);
3158
3159 /**
3160 * Determines whether there is an interval change
3161 */
3162 static bool is_new_interval(
3163 int old_acting_primary,
3164 int new_acting_primary,
3165 const std::vector<int> &old_acting,
3166 const std::vector<int> &new_acting,
3167 int old_up_primary,
3168 int new_up_primary,
3169 const std::vector<int> &old_up,
3170 const std::vector<int> &new_up,
3171 int old_size,
3172 int new_size,
3173 int old_min_size,
3174 int new_min_size,
3175 unsigned old_pg_num,
3176 unsigned new_pg_num,
3177 unsigned old_pg_num_pending,
3178 unsigned new_pg_num_pending,
3179 bool old_sort_bitwise,
3180 bool new_sort_bitwise,
3181 bool old_recovery_deletes,
3182 bool new_recovery_deletes,
3183 pg_t pgid
3184 );
3185
3186 /**
3187 * Determines whether there is an interval change
3188 */
3189 static bool is_new_interval(
3190 int old_acting_primary, ///< [in] primary as of lastmap
3191 int new_acting_primary, ///< [in] primary as of lastmap
3192 const std::vector<int> &old_acting, ///< [in] acting as of lastmap
3193 const std::vector<int> &new_acting, ///< [in] acting as of osdmap
3194 int old_up_primary, ///< [in] up primary of lastmap
3195 int new_up_primary, ///< [in] up primary of osdmap
3196 const std::vector<int> &old_up, ///< [in] up as of lastmap
3197 const std::vector<int> &new_up, ///< [in] up as of osdmap
3198 const OSDMap *osdmap, ///< [in] current map
3199 const OSDMap *lastmap, ///< [in] last map
3200 pg_t pgid ///< [in] pgid for pg
3201 );
3202
3203 /**
3204 * Integrates a new map into *past_intervals, returns true
3205 * if an interval was closed out.
3206 */
3207 static bool check_new_interval(
3208 int old_acting_primary, ///< [in] primary as of lastmap
3209 int new_acting_primary, ///< [in] primary as of osdmap
3210 const std::vector<int> &old_acting, ///< [in] acting as of lastmap
3211 const std::vector<int> &new_acting, ///< [in] acting as of osdmap
3212 int old_up_primary, ///< [in] up primary of lastmap
3213 int new_up_primary, ///< [in] up primary of osdmap
3214 const std::vector<int> &old_up, ///< [in] up as of lastmap
3215 const std::vector<int> &new_up, ///< [in] up as of osdmap
3216 epoch_t same_interval_since, ///< [in] as of osdmap
3217 epoch_t last_epoch_clean, ///< [in] current
3218 const OSDMap *osdmap, ///< [in] current map
3219 const OSDMap *lastmap, ///< [in] last map
3220 pg_t pgid, ///< [in] pgid for pg
3221 const IsPGRecoverablePredicate &could_have_gone_active, ///< [in] predicate whether the pg can be active
3222 PastIntervals *past_intervals, ///< [out] intervals
3223 std::ostream *out = 0 ///< [out] debug ostream
3224 );
3225 static bool check_new_interval(
3226 int old_acting_primary, ///< [in] primary as of lastmap
3227 int new_acting_primary, ///< [in] primary as of osdmap
3228 const std::vector<int> &old_acting, ///< [in] acting as of lastmap
3229 const std::vector<int> &new_acting, ///< [in] acting as of osdmap
3230 int old_up_primary, ///< [in] up primary of lastmap
3231 int new_up_primary, ///< [in] up primary of osdmap
3232 const std::vector<int> &old_up, ///< [in] up as of lastmap
3233 const std::vector<int> &new_up, ///< [in] up as of osdmap
3234 epoch_t same_interval_since, ///< [in] as of osdmap
3235 epoch_t last_epoch_clean, ///< [in] current
3236 OSDMapRef osdmap, ///< [in] current map
3237 OSDMapRef lastmap, ///< [in] last map
3238 pg_t pgid, ///< [in] pgid for pg
3239 const IsPGRecoverablePredicate &could_have_gone_active, ///< [in] predicate whether the pg can be active
3240 PastIntervals *past_intervals, ///< [out] intervals
3241 std::ostream *out = 0 ///< [out] debug ostream
3242 ) {
3243 return check_new_interval(
3244 old_acting_primary, new_acting_primary,
3245 old_acting, new_acting,
3246 old_up_primary, new_up_primary,
3247 old_up, new_up,
3248 same_interval_since, last_epoch_clean,
3249 osdmap.get(), lastmap.get(),
3250 pgid,
3251 could_have_gone_active,
3252 past_intervals,
3253 out);
3254 }
3255
3256 friend std::ostream& operator<<(std::ostream& out, const PastIntervals &i);
3257
3258 template <typename F>
3259 void iterate_mayberw_back_to(
3260 epoch_t les,
3261 F &&f) const {
3262 ceph_assert(past_intervals);
3263 past_intervals->iterate_mayberw_back_to(les, std::forward<F>(f));
3264 }
3265 void clear() {
3266 ceph_assert(past_intervals);
3267 past_intervals->clear();
3268 }
3269
3270 /**
3271 * Should return a value which gives an indication of the amount
3272 * of state contained
3273 */
3274 size_t size() const {
3275 ceph_assert(past_intervals);
3276 return past_intervals->size();
3277 }
3278
3279 bool empty() const {
3280 ceph_assert(past_intervals);
3281 return past_intervals->empty();
3282 }
3283
3284 void swap(PastIntervals &other) {
3285 using std::swap;
3286 swap(other.past_intervals, past_intervals);
3287 }
3288
3289 /**
3290 * Return all shards which have been in the acting set back to the
3291 * latest epoch to which we have trimmed except for pg_whoami
3292 */
3293 std::set<pg_shard_t> get_might_have_unfound(
3294 pg_shard_t pg_whoami,
3295 bool ec_pool) const {
3296 ceph_assert(past_intervals);
3297 auto ret = past_intervals->get_all_participants(ec_pool);
3298 ret.erase(pg_whoami);
3299 return ret;
3300 }
3301
3302 /**
3303 * Return all shards which we might want to talk to for peering
3304 */
3305 std::set<pg_shard_t> get_all_probe(
3306 bool ec_pool) const {
3307 ceph_assert(past_intervals);
3308 return past_intervals->get_all_participants(ec_pool);
3309 }
3310
3311 /* Return the set of epochs [start, end) represented by the
3312 * past_interval set.
3313 */
3314 std::pair<epoch_t, epoch_t> get_bounds() const {
3315 ceph_assert(past_intervals);
3316 return past_intervals->get_bounds();
3317 }
3318
3319 void adjust_start_backwards(epoch_t last_epoch_clean) {
3320 ceph_assert(past_intervals);
3321 past_intervals->adjust_start_backwards(last_epoch_clean);
3322 }
3323
3324 enum osd_state_t {
3325 UP,
3326 DOWN,
3327 DNE,
3328 LOST
3329 };
3330 struct PriorSet {
3331 bool ec_pool = false;
3332 std::set<pg_shard_t> probe; ///< current+prior OSDs we need to probe.
3333 std::set<int> down; ///< down osds that would normally be in @a probe and might be interesting.
3334 std::map<int, epoch_t> blocked_by; ///< current lost_at values for any OSDs in cur set for which (re)marking them lost would affect cur set
3335
3336 bool pg_down = false; ///< some down osds are included in @a cur; the DOWN pg state bit should be set.
3337 const IsPGRecoverablePredicate* pcontdec = nullptr;
3338
3339 PriorSet() = default;
3340 PriorSet(PriorSet &&) = default;
3341 PriorSet &operator=(PriorSet &&) = default;
3342
3343 PriorSet &operator=(const PriorSet &) = delete;
3344 PriorSet(const PriorSet &) = delete;
3345
3346 bool operator==(const PriorSet &rhs) const {
3347 return (ec_pool == rhs.ec_pool) &&
3348 (probe == rhs.probe) &&
3349 (down == rhs.down) &&
3350 (blocked_by == rhs.blocked_by) &&
3351 (pg_down == rhs.pg_down);
3352 }
3353
3354 bool affected_by_map(
3355 const OSDMap &osdmap,
3356 const DoutPrefixProvider *dpp) const;
3357
3358 // For verifying tests
3359 PriorSet(
3360 bool ec_pool,
3361 std::set<pg_shard_t> probe,
3362 std::set<int> down,
3363 std::map<int, epoch_t> blocked_by,
3364 bool pg_down,
3365 const IsPGRecoverablePredicate *pcontdec)
3366 : ec_pool(ec_pool), probe(probe), down(down), blocked_by(blocked_by),
3367 pg_down(pg_down), pcontdec(pcontdec) {}
3368
3369 private:
3370 template <typename F>
3371 PriorSet(
3372 const PastIntervals &past_intervals,
3373 bool ec_pool,
3374 epoch_t last_epoch_started,
3375 const IsPGRecoverablePredicate *c,
3376 F f,
3377 const std::vector<int> &up,
3378 const std::vector<int> &acting,
3379 const DoutPrefixProvider *dpp);
3380
3381 friend class PastIntervals;
3382 };
3383
3384 template <typename... Args>
3385 PriorSet get_prior_set(Args&&... args) const {
3386 return PriorSet(*this, std::forward<Args>(args)...);
3387 }
3388 };
3389 WRITE_CLASS_ENCODER(PastIntervals)
3390
3391 std::ostream& operator<<(std::ostream& out, const PastIntervals::pg_interval_t& i);
3392 std::ostream& operator<<(std::ostream& out, const PastIntervals &i);
3393 std::ostream& operator<<(std::ostream& out, const PastIntervals::PriorSet &i);
3394
3395 template <typename F>
3396 PastIntervals::PriorSet::PriorSet(
3397 const PastIntervals &past_intervals,
3398 bool ec_pool,
3399 epoch_t last_epoch_started,
3400 const IsPGRecoverablePredicate *c,
3401 F f,
3402 const std::vector<int> &up,
3403 const std::vector<int> &acting,
3404 const DoutPrefixProvider *dpp)
3405 : ec_pool(ec_pool), pg_down(false), pcontdec(c)
3406 {
3407 /*
3408 * We have to be careful to gracefully deal with situations like
3409 * so. Say we have a power outage or something that takes out both
3410 * OSDs, but the monitor doesn't mark them down in the same epoch.
3411 * The history may look like
3412 *
3413 * 1: A B
3414 * 2: B
3415 * 3: let's say B dies for good, too (say, from the power spike)
3416 * 4: A
3417 *
3418 * which makes it look like B may have applied updates to the PG
3419 * that we need in order to proceed. This sucks...
3420 *
3421 * To minimize the risk of this happening, we CANNOT go active if
3422 * _any_ OSDs in the prior set are down until we send an MOSDAlive
3423 * to the monitor such that the OSDMap sets osd_up_thru to an epoch.
3424 * Then, we have something like
3425 *
3426 * 1: A B
3427 * 2: B up_thru[B]=0
3428 * 3:
3429 * 4: A
3430 *
3431 * -> we can ignore B, because it couldn't have gone active (alive_thru
3432 * still 0).
3433 *
3434 * or,
3435 *
3436 * 1: A B
3437 * 2: B up_thru[B]=0
3438 * 3: B up_thru[B]=2
3439 * 4:
3440 * 5: A
3441 *
3442 * -> we must wait for B, because it was alive through 2, and could have
3443 * written to the pg.
3444 *
3445 * If B is really dead, then an administrator will need to manually
3446 * intervene by marking the OSD as "lost."
3447 */
3448
3449 // Include current acting and up nodes... not because they may
3450 // contain old data (this interval hasn't gone active, obviously),
3451 // but because we want their pg_info to inform choose_acting(), and
3452 // so that we know what they do/do not have explicitly before
3453 // sending them any new info/logs/whatever.
3454 for (unsigned i = 0; i < acting.size(); i++) {
3455 if (acting[i] != 0x7fffffff /* CRUSH_ITEM_NONE, can't import crush.h here */)
3456 probe.insert(pg_shard_t(acting[i], ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD));
3457 }
3458 // It may be possible to exclude the up nodes, but let's keep them in
3459 // there for now.
3460 for (unsigned i = 0; i < up.size(); i++) {
3461 if (up[i] != 0x7fffffff /* CRUSH_ITEM_NONE, can't import crush.h here */)
3462 probe.insert(pg_shard_t(up[i], ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD));
3463 }
3464
3465 std::set<pg_shard_t> all_probe = past_intervals.get_all_probe(ec_pool);
3466 ldpp_dout(dpp, 10) << "build_prior all_probe " << all_probe << dendl;
3467 for (auto &&i: all_probe) {
3468 switch (f(0, i.osd, nullptr)) {
3469 case UP: {
3470 probe.insert(i);
3471 break;
3472 }
3473 case DNE:
3474 case LOST:
3475 case DOWN: {
3476 down.insert(i.osd);
3477 break;
3478 }
3479 }
3480 }
3481
3482 past_intervals.iterate_mayberw_back_to(
3483 last_epoch_started,
3484 [&](epoch_t start, const std::set<pg_shard_t> &acting) {
3485 ldpp_dout(dpp, 10) << "build_prior maybe_rw interval:" << start
3486 << ", acting: " << acting << dendl;
3487
3488 // look at candidate osds during this interval. each falls into
3489 // one of three categories: up, down (but potentially
3490 // interesting), or lost (down, but we won't wait for it).
3491 std::set<pg_shard_t> up_now;
3492 std::map<int, epoch_t> candidate_blocked_by;
3493 // any candidates down now (that might have useful data)
3494 bool any_down_now = false;
3495
3496 // consider ACTING osds
3497 for (auto &&so: acting) {
3498 epoch_t lost_at = 0;
3499 switch (f(start, so.osd, &lost_at)) {
3500 case UP: {
3501 // include past acting osds if they are up.
3502 up_now.insert(so);
3503 break;
3504 }
3505 case DNE: {
3506 ldpp_dout(dpp, 10) << "build_prior prior osd." << so.osd
3507 << " no longer exists" << dendl;
3508 break;
3509 }
3510 case LOST: {
3511 ldpp_dout(dpp, 10) << "build_prior prior osd." << so.osd
3512 << " is down, but lost_at " << lost_at << dendl;
3513 up_now.insert(so);
3514 break;
3515 }
3516 case DOWN: {
3517 ldpp_dout(dpp, 10) << "build_prior prior osd." << so.osd
3518 << " is down" << dendl;
3519 candidate_blocked_by[so.osd] = lost_at;
3520 any_down_now = true;
3521 break;
3522 }
3523 }
3524 }
3525
3526 // if not enough osds survived this interval, and we may have gone rw,
3527 // then we need to wait for one of those osds to recover to
3528 // ensure that we haven't lost any information.
3529 if (!(*pcontdec)(up_now) && any_down_now) {
3530 // fixme: how do we identify a "clean" shutdown anyway?
3531 ldpp_dout(dpp, 10) << "build_prior possibly went active+rw,"
3532 << " insufficient up; including down osds" << dendl;
3533 ceph_assert(!candidate_blocked_by.empty());
3534 pg_down = true;
3535 blocked_by.insert(
3536 candidate_blocked_by.begin(),
3537 candidate_blocked_by.end());
3538 }
3539 });
3540
3541 ldpp_dout(dpp, 10) << "build_prior final: probe " << probe
3542 << " down " << down
3543 << " blocked_by " << blocked_by
3544 << (pg_down ? " pg_down":"")
3545 << dendl;
3546 }
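/*
 * Illustrative sketch (not part of this header): get_prior_set() forwards
 * its arguments to the private constructor above, so callers supply a
 * callable f with the shape
 *
 *   PastIntervals::osd_state_t f(epoch_t start, int osd, epoch_t *lost_at);
 *
 * classifying each osd for the interval beginning at 'start' and
 * optionally reporting its lost_at epoch. A hypothetical OSDMap-backed
 * caller (names like 'osdmap' and 'recoverable' are assumptions):
 *
 *   auto prior = past_intervals.get_prior_set(
 *     ec_pool, last_epoch_started, &recoverable,
 *     [&](epoch_t start, int osd, epoch_t *lost_at) {
 *       if (!osdmap->exists(osd))
 *         return PastIntervals::DNE;
 *       if (osdmap->is_up(osd))
 *         return PastIntervals::UP;
 *       if (lost_at)
 *         *lost_at = osdmap->get_info(osd).lost_at;
 *       return osdmap->get_info(osd).lost_at > start ?
 *         PastIntervals::LOST : PastIntervals::DOWN;
 *     },
 *     up, acting, dpp);
 */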
3547
3548 struct pg_notify_t {
3549 epoch_t query_epoch;
3550 epoch_t epoch_sent;
3551 pg_info_t info;
3552 shard_id_t to;
3553 shard_id_t from;
3554 PastIntervals past_intervals;
3555 pg_notify_t() :
3556 query_epoch(0), epoch_sent(0), to(shard_id_t::NO_SHARD),
3557 from(shard_id_t::NO_SHARD) {}
3558 pg_notify_t(
3559 shard_id_t to,
3560 shard_id_t from,
3561 epoch_t query_epoch,
3562 epoch_t epoch_sent,
3563 const pg_info_t &info,
3564 const PastIntervals& pi)
3565 : query_epoch(query_epoch),
3566 epoch_sent(epoch_sent),
3567 info(info), to(to), from(from),
3568 past_intervals(pi) {
3569 ceph_assert(from == info.pgid.shard);
3570 }
3571 void encode(ceph::buffer::list &bl) const;
3572 void decode(ceph::buffer::list::const_iterator &p);
3573 void dump(ceph::Formatter *f) const;
3574 static void generate_test_instances(std::list<pg_notify_t*> &o);
3575 };
3576 WRITE_CLASS_ENCODER(pg_notify_t)
3577 std::ostream &operator<<(std::ostream &lhs, const pg_notify_t &notify);
3578
3579
3580 /**
3581 * pg_query_t - used to ask a peer for information about a pg.
3582 *
3583 * note: if version=0, type=LOG, then we just provide our full log.
3584 */
3585 struct pg_query_t {
3586 enum {
3587 INFO = 0,
3588 LOG = 1,
3589 MISSING = 4,
3590 FULLLOG = 5,
3591 };
3592 std::string_view get_type_name() const {
3593 switch (type) {
3594 case INFO: return "info";
3595 case LOG: return "log";
3596 case MISSING: return "missing";
3597 case FULLLOG: return "fulllog";
3598 default: return "???";
3599 }
3600 }
3601
3602 __s32 type;
3603 eversion_t since;
3604 pg_history_t history;
3605 epoch_t epoch_sent;
3606 shard_id_t to;
3607 shard_id_t from;
3608
3609 pg_query_t() : type(-1), epoch_sent(0), to(shard_id_t::NO_SHARD),
3610 from(shard_id_t::NO_SHARD) {}
3611 pg_query_t(
3612 int t,
3613 shard_id_t to,
3614 shard_id_t from,
3615 const pg_history_t& h,
3616 epoch_t epoch_sent)
3617 : type(t),
3618 history(h),
3619 epoch_sent(epoch_sent),
3620 to(to), from(from) {
3621 ceph_assert(t != LOG);
3622 }
3623 pg_query_t(
3624 int t,
3625 shard_id_t to,
3626 shard_id_t from,
3627 eversion_t s,
3628 const pg_history_t& h,
3629 epoch_t epoch_sent)
3630 : type(t), since(s), history(h),
3631 epoch_sent(epoch_sent), to(to), from(from) {
3632 ceph_assert(t == LOG);
3633 }
3634
3635 void encode(ceph::buffer::list &bl, uint64_t features) const;
3636 void decode(ceph::buffer::list::const_iterator &bl);
3637
3638 void dump(ceph::Formatter *f) const;
3639 static void generate_test_instances(std::list<pg_query_t*>& o);
3640 };
3641 WRITE_CLASS_ENCODER_FEATURES(pg_query_t)
3642
3643 inline std::ostream& operator<<(std::ostream& out, const pg_query_t& q) {
3644 out << "query(" << q.get_type_name() << " " << q.since;
3645 if (q.type == pg_query_t::LOG)
3646 out << " " << q.history;
3647 out << " epoch_sent " << q.epoch_sent;
3648 out << ")";
3649 return out;
3650 }
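/*
 * Illustrative sketch (not part of this header): the two constructors
 * above enforce that 'since' is only meaningful for LOG queries.
 * 'cur_epoch' below is assumed to be the sender's current OSDMap epoch.
 *
 *   pg_history_t h;
 *   pg_query_t q_info(pg_query_t::INFO, shard_id_t::NO_SHARD,
 *                     shard_id_t::NO_SHARD, h, cur_epoch);
 *   pg_query_t q_log(pg_query_t::LOG, shard_id_t::NO_SHARD,
 *                    shard_id_t::NO_SHARD, eversion_t(10, 42), h,
 *                    cur_epoch);  // log entries after version 10'42
 */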
3651
3652 /**
3653 * pg_lease_t - readable lease metadata, from primary -> non-primary
3654 *
3655 * This metadata serves to increase either or both of the lease expiration
3656 * and upper bound on the non-primary.
3657 */
3658 struct pg_lease_t {
3659 /// pg readable_until value; replicas must not be readable beyond this
3660 ceph::signedspan readable_until = ceph::signedspan::zero();
3661
3662 /// upper bound on any acting osd's readable_until
3663 ceph::signedspan readable_until_ub = ceph::signedspan::zero();
3664
3665 /// duration of the lease (in case clock deltas aren't available)
3666 ceph::signedspan interval = ceph::signedspan::zero();
3667
3668 pg_lease_t() {}
3669 pg_lease_t(ceph::signedspan ru, ceph::signedspan ruub,
3670 ceph::signedspan i)
3671 : readable_until(ru),
3672 readable_until_ub(ruub),
3673 interval(i) {}
3674
3675 void encode(ceph::buffer::list &bl) const;
3676 void decode(ceph::buffer::list::const_iterator &bl);
3677 void dump(ceph::Formatter *f) const;
3678 static void generate_test_instances(std::list<pg_lease_t*>& o);
3679
3680 friend ostream& operator<<(ostream& out, const pg_lease_t& l) {
3681 return out << "pg_lease(ru " << l.readable_until
3682 << " ub " << l.readable_until_ub
3683 << " int " << l.interval << ")";
3684 }
3685 };
3686 WRITE_CLASS_ENCODER(pg_lease_t)
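/*
 * Illustrative sketch (an assumption, not the actual renewal code): a
 * primary could build a lease by projecting 'interval' past its local
 * clock; 'now()' and 'skew_bound' are hypothetical.
 *
 *   ceph::signedspan t = now();
 *   pg_lease_t lease(t + interval,               // readable_until
 *                    t + interval + skew_bound,  // readable_until_ub
 *                    interval);
 *
 * A replica must stop serving reads once its clock passes readable_until.
 */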
3687
3688 /**
3689 * pg_lease_ack_t - lease ack, from non-primary -> primary
3690 *
3691 * This metadata acknowledges to the primary what a non-primary's noted
3692 * upper bound is.
3693 */
3694 struct pg_lease_ack_t {
3695 /// highest upper bound non-primary has recorded (primary's clock)
3696 ceph::signedspan readable_until_ub = ceph::signedspan::zero();
3697
3698 pg_lease_ack_t() {}
3699 pg_lease_ack_t(ceph::signedspan ub)
3700 : readable_until_ub(ub) {}
3701
3702 void encode(ceph::buffer::list &bl) const;
3703 void decode(ceph::buffer::list::const_iterator &bl);
3704 void dump(ceph::Formatter *f) const;
3705 static void generate_test_instances(std::list<pg_lease_ack_t*>& o);
3706
3707 friend ostream& operator<<(ostream& out, const pg_lease_ack_t& l) {
3708 return out << "pg_lease_ack(ruub " << l.readable_until_ub << ")";
3709 }
3710 };
3711 WRITE_CLASS_ENCODER(pg_lease_ack_t)
3712
3713
3714
3715 class PGBackend;
3716 class ObjectModDesc {
3717 bool can_local_rollback;
3718 bool rollback_info_completed;
3719
3720 // version required to decode, reflected in encode/decode version
3721 __u8 max_required_version = 1;
3722 public:
3723 class Visitor {
3724 public:
3725 virtual void append(uint64_t old_offset) {}
3726 virtual void setattrs(std::map<std::string, std::optional<ceph::buffer::list>> &attrs) {}
3727 virtual void rmobject(version_t old_version) {}
3728 /**
3729 * Used to support the unfound_lost_delete log event: if the stashed
3730 * version exists, we unstash it, otherwise, we do nothing. This way
3731 * each replica rolls back to whatever state it had prior to the attempt
3732 * at mark unfound lost delete
3733 */
3734 virtual void try_rmobject(version_t old_version) {
3735 rmobject(old_version);
3736 }
3737 virtual void create() {}
3738 virtual void update_snaps(const std::set<snapid_t> &old_snaps) {}
3739 virtual void rollback_extents(
3740 version_t gen,
3741 const std::vector<std::pair<uint64_t, uint64_t> > &extents) {}
3742 virtual ~Visitor() {}
3743 };
3744 void visit(Visitor *visitor) const;
3745 mutable ceph::buffer::list bl;
3746 enum ModID {
3747 APPEND = 1,
3748 SETATTRS = 2,
3749 DELETE = 3,
3750 CREATE = 4,
3751 UPDATE_SNAPS = 5,
3752 TRY_DELETE = 6,
3753 ROLLBACK_EXTENTS = 7
3754 };
3755 ObjectModDesc() : can_local_rollback(true), rollback_info_completed(false) {
3756 bl.reassign_to_mempool(mempool::mempool_osd_pglog);
3757 }
3758 void claim(ObjectModDesc &other) {
3759 bl.clear();
3760 bl.claim(other.bl);
3761 can_local_rollback = other.can_local_rollback;
3762 rollback_info_completed = other.rollback_info_completed;
3763 }
3764 void claim_append(ObjectModDesc &other) {
3765 if (!can_local_rollback || rollback_info_completed)
3766 return;
3767 if (!other.can_local_rollback) {
3768 mark_unrollbackable();
3769 return;
3770 }
3771 bl.claim_append(other.bl);
3772 rollback_info_completed = other.rollback_info_completed;
3773 }
3774 void swap(ObjectModDesc &other) {
3775 bl.swap(other.bl);
3776
3777 using std::swap;
3778 swap(other.can_local_rollback, can_local_rollback);
3779 swap(other.rollback_info_completed, rollback_info_completed);
3780 swap(other.max_required_version, max_required_version);
3781 }
3782 void append_id(ModID id) {
3783 using ceph::encode;
3784 uint8_t _id(id);
3785 encode(_id, bl);
3786 }
3787 void append(uint64_t old_size) {
3788 if (!can_local_rollback || rollback_info_completed)
3789 return;
3790 ENCODE_START(1, 1, bl);
3791 append_id(APPEND);
3792 encode(old_size, bl);
3793 ENCODE_FINISH(bl);
3794 }
3795 void setattrs(std::map<std::string, std::optional<ceph::buffer::list>> &old_attrs) {
3796 if (!can_local_rollback || rollback_info_completed)
3797 return;
3798 ENCODE_START(1, 1, bl);
3799 append_id(SETATTRS);
3800 encode(old_attrs, bl);
3801 ENCODE_FINISH(bl);
3802 }
3803 bool rmobject(version_t deletion_version) {
3804 if (!can_local_rollback || rollback_info_completed)
3805 return false;
3806 ENCODE_START(1, 1, bl);
3807 append_id(DELETE);
3808 encode(deletion_version, bl);
3809 ENCODE_FINISH(bl);
3810 rollback_info_completed = true;
3811 return true;
3812 }
3813 bool try_rmobject(version_t deletion_version) {
3814 if (!can_local_rollback || rollback_info_completed)
3815 return false;
3816 ENCODE_START(1, 1, bl);
3817 append_id(TRY_DELETE);
3818 encode(deletion_version, bl);
3819 ENCODE_FINISH(bl);
3820 rollback_info_completed = true;
3821 return true;
3822 }
3823 void create() {
3824 if (!can_local_rollback || rollback_info_completed)
3825 return;
3826 rollback_info_completed = true;
3827 ENCODE_START(1, 1, bl);
3828 append_id(CREATE);
3829 ENCODE_FINISH(bl);
3830 }
3831 void update_snaps(const std::set<snapid_t> &old_snaps) {
3832 if (!can_local_rollback || rollback_info_completed)
3833 return;
3834 ENCODE_START(1, 1, bl);
3835 append_id(UPDATE_SNAPS);
3836 encode(old_snaps, bl);
3837 ENCODE_FINISH(bl);
3838 }
3839 void rollback_extents(
3840 version_t gen, const std::vector<std::pair<uint64_t, uint64_t> > &extents) {
3841 ceph_assert(can_local_rollback);
3842 ceph_assert(!rollback_info_completed);
3843 if (max_required_version < 2)
3844 max_required_version = 2;
3845 ENCODE_START(2, 2, bl);
3846 append_id(ROLLBACK_EXTENTS);
3847 encode(gen, bl);
3848 encode(extents, bl);
3849 ENCODE_FINISH(bl);
3850 }
3851
3852 // cannot be rolled back
3853 void mark_unrollbackable() {
3854 can_local_rollback = false;
3855 bl.clear();
3856 }
3857 bool can_rollback() const {
3858 return can_local_rollback;
3859 }
3860 bool empty() const {
3861 return can_local_rollback && (bl.length() == 0);
3862 }
3863
3864 bool requires_kraken() const {
3865 return max_required_version >= 2;
3866 }
3867
3868 /**
3869 * Create fresh copy of bl bytes to avoid keeping large buffers around
3870 * in the case that bl contains ptrs which point into a much larger
3871 * message buffer
3872 */
3873 void trim_bl() const {
3874 if (bl.length() > 0)
3875 bl.rebuild();
3876 }
3877 void encode(ceph::buffer::list &bl) const;
3878 void decode(ceph::buffer::list::const_iterator &bl);
3879 void dump(ceph::Formatter *f) const;
3880 static void generate_test_instances(std::list<ObjectModDesc*>& o);
3881 };
3882 WRITE_CLASS_ENCODER(ObjectModDesc)
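/*
 * Illustrative sketch (not part of this header): rollback state is
 * recorded before mutating an object and later replayed through a
 * Visitor if the log entry must be rolled back.
 *
 *   ObjectModDesc desc;
 *   desc.append(old_size);      // remember pre-append size
 *   std::map<std::string, std::optional<ceph::buffer::list>> old_attrs;
 *   desc.setattrs(old_attrs);   // remember overwritten xattrs
 *
 *   struct Undo : ObjectModDesc::Visitor {
 *     void append(uint64_t old_offset) override {
 *       // truncate the object back to old_offset
 *     }
 *   } undo;
 *   if (desc.can_rollback())
 *     desc.visit(&undo);
 */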
3883
3884 class ObjectCleanRegions {
3885 private:
3886 bool new_object;
3887 bool clean_omap;
3888 interval_set<uint64_t> clean_offsets;
3889 static std::atomic<int32_t> max_num_intervals;
3890
3891 /**
3892 * trim the number of intervals if clean_offsets.num_intervals()
3893 * exceeds the given upper bound max_num_intervals
3894 * e.g. max_num_intervals=2, clean_offsets:{[5~10], [20~5]}
3895 * then a new interval [30~10] will evict the shortest one [20~5]
3896 * finally, clean_offsets becomes {[5~10], [30~10]}
3897 */
3898 void trim();
3899 friend ostream& operator<<(ostream& out, const ObjectCleanRegions& ocr);
3900 public:
3901 ObjectCleanRegions() : new_object(false), clean_omap(true) {
3902 clean_offsets.insert(0, (uint64_t)-1);
3903 }
3904 ObjectCleanRegions(uint64_t offset, uint64_t len, bool co)
3905 : new_object(false), clean_omap(co) {
3906 clean_offsets.insert(offset, len);
3907 }
3908 bool operator==(const ObjectCleanRegions &orc) const {
3909 return new_object == orc.new_object && clean_omap == orc.clean_omap && clean_offsets == orc.clean_offsets;
3910 }
3911 static void set_max_num_intervals(int32_t num);
3912 void merge(const ObjectCleanRegions &other);
3913 void mark_data_region_dirty(uint64_t offset, uint64_t len);
3914 void mark_omap_dirty();
3915 void mark_object_new();
3916 void mark_fully_dirty();
3917 interval_set<uint64_t> get_dirty_regions() const;
3918 bool omap_is_dirty() const;
3919 bool object_is_exist() const;
3920
3921 void encode(bufferlist &bl) const;
3922 void decode(bufferlist::const_iterator &bl);
3923 void dump(Formatter *f) const;
3924 static void generate_test_instances(list<ObjectCleanRegions*>& o);
3925 };
3926 WRITE_CLASS_ENCODER(ObjectCleanRegions)
3927 ostream& operator<<(ostream& out, const ObjectCleanRegions& ocr);
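/*
 * Illustrative sketch (not part of this header): a default-constructed
 * ObjectCleanRegions treats the whole object as clean; writes then punch
 * dirty holes that recovery reads back as intervals.
 *
 *   ObjectCleanRegions cr;
 *   cr.mark_data_region_dirty(4096, 8192);  // bytes [4096, 12288) dirty
 *   cr.mark_omap_dirty();
 *   interval_set<uint64_t> dirty = cr.get_dirty_regions();
 */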
3928
3929
3930 struct OSDOp {
3931 ceph_osd_op op;
3932 sobject_t soid;
3933
3934 ceph::buffer::list indata, outdata;
3935 errorcode32_t rval = 0;
3936
3937 OSDOp() {
3938 memset(&op, 0, sizeof(ceph_osd_op));
3939 }
3940
3941 OSDOp(const int op_code) {
3942 memset(&op, 0, sizeof(ceph_osd_op));
3943 op.op = op_code;
3944 }
3945
3946 /**
3947 * split a ceph::buffer::list into constituent indata members of a vector of OSDOps
3948 *
3949 * @param ops [out] vector of OSDOps
3950 * @param in [in] combined data buffer
3951 */
3952 static void split_osd_op_vector_in_data(std::vector<OSDOp>& ops, ceph::buffer::list& in);
3953
3954 /**
3955 * merge indata members of a vector of OSDOp into a single ceph::buffer::list
3956 *
3957 * Notably this also encodes certain other OSDOp data into the data
3958 * buffer, including the sobject_t soid.
3959 *
3960 * @param ops [in] vector of OSDOps
3961 * @param out [out] combined data buffer
3962 */
3963 static void merge_osd_op_vector_in_data(std::vector<OSDOp>& ops, ceph::buffer::list& out);
3964
3965 /**
3966 * split a ceph::buffer::list into constituent outdata members of a vector of OSDOps
3967 *
3968 * @param ops [out] vector of OSDOps
3969 * @param in [in] combined data buffer
3970 */
3971 static void split_osd_op_vector_out_data(std::vector<OSDOp>& ops, ceph::buffer::list& in);
3972
3973 /**
3974 * merge outdata members of a vector of OSDOps into a single ceph::buffer::list
3975 *
3976 * @param ops [in] vector of OSDOps
3977 * @param out [out] combined data buffer
3978 */
3979 static void merge_osd_op_vector_out_data(std::vector<OSDOp>& ops, ceph::buffer::list& out);
3980
3981 /**
3982 * Clear data as much as possible, leave minimal data for historical op dump
3983 *
3984 * @param ops [in] vector of OSDOps
3985 */
3986 static void clear_data(std::vector<OSDOp>& ops);
3987 };
3988 std::ostream& operator<<(std::ostream& out, const OSDOp& op);
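/*
 * Illustrative sketch (not part of this header): the merge/split helpers
 * round-trip per-op payloads through one combined buffer, e.g. when
 * marshalling an op vector into a message and back.
 *
 *   std::vector<OSDOp> ops(2);
 *   ops[0].indata.append("payload0");
 *   ops[1].indata.append("payload1");
 *   ceph::buffer::list combined;
 *   OSDOp::merge_osd_op_vector_in_data(ops, combined);
 *   // ...ship 'combined' over the wire...
 *   OSDOp::split_osd_op_vector_in_data(ops, combined);
 */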
3989
3990
3991 struct pg_log_op_return_item_t {
3992 int32_t rval;
3993 bufferlist bl;
3994 void encode(bufferlist& p) const {
3995 using ceph::encode;
3996 encode(rval, p);
3997 encode(bl, p);
3998 }
3999 void decode(bufferlist::const_iterator& p) {
4000 using ceph::decode;
4001 decode(rval, p);
4002 decode(bl, p);
4003 }
4004 void dump(Formatter *f) const {
4005 f->dump_int("rval", rval);
4006 f->dump_unsigned("bl_length", bl.length());
4007 }
4008 friend bool operator==(const pg_log_op_return_item_t& lhs,
4009 const pg_log_op_return_item_t& rhs) {
4010 return lhs.rval == rhs.rval &&
4011 lhs.bl.contents_equal(rhs.bl);
4012 }
4013 friend bool operator!=(const pg_log_op_return_item_t& lhs,
4014 const pg_log_op_return_item_t& rhs) {
4015 return !(lhs == rhs);
4016 }
4017 friend ostream& operator<<(ostream& out, const pg_log_op_return_item_t& i) {
4018 return out << "r=" << i.rval << "+" << i.bl.length() << "b";
4019 }
4020 };
4021 WRITE_CLASS_ENCODER(pg_log_op_return_item_t)
4022
4023 /**
4024 * pg_log_entry_t - single entry/event in pg log
4025 *
4026 */
4027 struct pg_log_entry_t {
4028 enum {
4029 MODIFY = 1, // some unspecified modification (but not *all* modifications)
4030 CLONE = 2, // cloned object from head
4031 DELETE = 3, // deleted object
4032 //BACKLOG = 4, // event invented by generate_backlog [obsolete]
4033 LOST_REVERT = 5, // lost new version, revert to an older version.
4034 LOST_DELETE = 6, // lost new version, revert to no object (deleted).
4035 LOST_MARK = 7, // lost new version, now EIO
4036 PROMOTE = 8, // promoted object from another tier
4037 CLEAN = 9, // mark an object clean
4038 ERROR = 10, // write that returned an error
4039 };
4040 static const char *get_op_name(int op) {
4041 switch (op) {
4042 case MODIFY:
4043 return "modify";
4044 case PROMOTE:
4045 return "promote";
4046 case CLONE:
4047 return "clone";
4048 case DELETE:
4049 return "delete";
4050 case LOST_REVERT:
4051 return "l_revert";
4052 case LOST_DELETE:
4053 return "l_delete";
4054 case LOST_MARK:
4055 return "l_mark";
4056 case CLEAN:
4057 return "clean";
4058 case ERROR:
4059 return "error";
4060 default:
4061 return "unknown";
4062 }
4063 }
4064 const char *get_op_name() const {
4065 return get_op_name(op);
4066 }
4067
4068 // describes state for a locally-rollbackable entry
4069 ObjectModDesc mod_desc;
4070 ceph::buffer::list snaps; // only for clone entries
4071 hobject_t soid;
4072 osd_reqid_t reqid; // caller+tid to uniquely identify request
4073 mempool::osd_pglog::vector<std::pair<osd_reqid_t, version_t> > extra_reqids;
4074
4075 /// map extra_reqids by index to error return code (if any)
4076 mempool::osd_pglog::map<uint32_t, int> extra_reqid_return_codes;
4077
4078 eversion_t version, prior_version, reverting_to;
4079 version_t user_version; // the user version for this entry
4080 utime_t mtime; // this is the _user_ mtime, mind you
4081 int32_t return_code; // only stored for ERRORs for dup detection
4082
4083 vector<pg_log_op_return_item_t> op_returns;
4084
4085 __s32 op;
4086 bool invalid_hash; // only when decoding sobject_t based entries
4087 bool invalid_pool; // only when decoding pool-less hobject based entries
4088 ObjectCleanRegions clean_regions;
4089
4090 pg_log_entry_t()
4091 : user_version(0), return_code(0), op(0),
4092 invalid_hash(false), invalid_pool(false) {
4093 snaps.reassign_to_mempool(mempool::mempool_osd_pglog);
4094 }
4095 pg_log_entry_t(int _op, const hobject_t& _soid,
4096 const eversion_t& v, const eversion_t& pv,
4097 version_t uv,
4098 const osd_reqid_t& rid, const utime_t& mt,
4099 int return_code)
4100 : soid(_soid), reqid(rid), version(v), prior_version(pv), user_version(uv),
4101 mtime(mt), return_code(return_code), op(_op),
4102 invalid_hash(false), invalid_pool(false) {
4103 snaps.reassign_to_mempool(mempool::mempool_osd_pglog);
4104 }
4105
4106 bool is_clone() const { return op == CLONE; }
4107 bool is_modify() const { return op == MODIFY; }
4108 bool is_promote() const { return op == PROMOTE; }
4109 bool is_clean() const { return op == CLEAN; }
4110 bool is_lost_revert() const { return op == LOST_REVERT; }
4111 bool is_lost_delete() const { return op == LOST_DELETE; }
4112 bool is_lost_mark() const { return op == LOST_MARK; }
4113 bool is_error() const { return op == ERROR; }
4114
4115 bool is_update() const {
4116 return
4117 is_clone() || is_modify() || is_promote() || is_clean() ||
4118 is_lost_revert() || is_lost_mark();
4119 }
4120 bool is_delete() const {
4121 return op == DELETE || op == LOST_DELETE;
4122 }
4123
4124 bool can_rollback() const {
4125 return mod_desc.can_rollback();
4126 }
4127
4128 void mark_unrollbackable() {
4129 mod_desc.mark_unrollbackable();
4130 }
4131
4132 bool requires_kraken() const {
4133 return mod_desc.requires_kraken();
4134 }
4135
4136 // Errors are only used for dup detection, whereas
4137 // the index by objects is used by recovery, copy_get,
4138 // and other facilities that don't expect or need to
4139 // be aware of error entries.
4140 bool object_is_indexed() const {
4141 return !is_error();
4142 }
4143
4144 bool reqid_is_indexed() const {
4145 return reqid != osd_reqid_t() &&
4146 (op == MODIFY || op == DELETE || op == ERROR);
4147 }
4148
4149 void set_op_returns(std::vector<OSDOp>& ops) {
4150 op_returns.resize(ops.size());
4151 for (unsigned i = 0; i < ops.size(); ++i) {
4152 op_returns[i].rval = ops[i].rval;
4153 op_returns[i].bl = ops[i].outdata;
4154 }
4155 }
4156
4157 std::string get_key_name() const;
4158 void encode_with_checksum(ceph::buffer::list& bl) const;
4159 void decode_with_checksum(ceph::buffer::list::const_iterator& p);
4160
4161 void encode(ceph::buffer::list &bl) const;
4162 void decode(ceph::buffer::list::const_iterator &bl);
4163 void dump(ceph::Formatter *f) const;
4164 static void generate_test_instances(std::list<pg_log_entry_t*>& o);
4165
4166 };
4167 WRITE_CLASS_ENCODER(pg_log_entry_t)
4168
4169 std::ostream& operator<<(std::ostream& out, const pg_log_entry_t& e);
4170
4171 struct pg_log_dup_t {
4172 osd_reqid_t reqid; // caller+tid to uniquely identify request
4173 eversion_t version;
4174 version_t user_version; // the user version for this entry
4175 int32_t return_code; // only stored for ERRORs for dup detection
4176
4177 vector<pg_log_op_return_item_t> op_returns;
4178
4179 pg_log_dup_t()
4180 : user_version(0), return_code(0)
4181 {}
4182 explicit pg_log_dup_t(const pg_log_entry_t& entry)
4183 : reqid(entry.reqid), version(entry.version),
4184 user_version(entry.user_version),
4185 return_code(entry.return_code),
4186 op_returns(entry.op_returns)
4187 {}
4188 pg_log_dup_t(const eversion_t& v, version_t uv,
4189 const osd_reqid_t& rid, int return_code)
4190 : reqid(rid), version(v), user_version(uv),
4191 return_code(return_code)
4192 {}
4193
4194 std::string get_key_name() const;
4195 void encode(ceph::buffer::list &bl) const;
4196 void decode(ceph::buffer::list::const_iterator &bl);
4197 void dump(ceph::Formatter *f) const;
4198 static void generate_test_instances(std::list<pg_log_dup_t*>& o);
4199
4200 bool operator==(const pg_log_dup_t &rhs) const {
4201 return reqid == rhs.reqid &&
4202 version == rhs.version &&
4203 user_version == rhs.user_version &&
4204 return_code == rhs.return_code &&
4205 op_returns == rhs.op_returns;
4206 }
4207 bool operator!=(const pg_log_dup_t &rhs) const {
4208 return !(*this == rhs);
4209 }
4210
4211 friend std::ostream& operator<<(std::ostream& out, const pg_log_dup_t& e);
4212 };
4213 WRITE_CLASS_ENCODER(pg_log_dup_t)
4214
4215 std::ostream& operator<<(std::ostream& out, const pg_log_dup_t& e);
4216
4217 /**
4218 * pg_log_t - incremental log of recent pg changes.
4219 *
4220 * serves as a recovery queue for recent changes.
4221 */
4222 struct pg_log_t {
4223 /*
4224 * head - newest entry (update|delete)
4225 * tail - entry previous to oldest (update|delete) for which we have
4226 * complete negative information.
4227 * i.e. we can infer pg contents for any store whose last_update >= tail.
4228 */
4229 eversion_t head; // newest entry
4230 eversion_t tail; // version prior to oldest
4231
4232 protected:
4233 // We can rollback rollback-able entries > can_rollback_to
4234 eversion_t can_rollback_to;
4235
4236 // always <= can_rollback_to, indicates how far stashed rollback
4237 // data can be found
4238 eversion_t rollback_info_trimmed_to;
4239
4240 public:
4241 // the actual log
4242 mempool::osd_pglog::list<pg_log_entry_t> log;
4243
4244 // entries just for dup op detection ordered oldest to newest
4245 mempool::osd_pglog::list<pg_log_dup_t> dups;
4246
4247 pg_log_t() = default;
4248 pg_log_t(const eversion_t &last_update,
4249 const eversion_t &log_tail,
4250 const eversion_t &can_rollback_to,
4251 const eversion_t &rollback_info_trimmed_to,
4252 mempool::osd_pglog::list<pg_log_entry_t> &&entries,
4253 mempool::osd_pglog::list<pg_log_dup_t> &&dup_entries)
4254 : head(last_update), tail(log_tail), can_rollback_to(can_rollback_to),
4255 rollback_info_trimmed_to(rollback_info_trimmed_to),
4256 log(std::move(entries)), dups(std::move(dup_entries)) {}
4257 pg_log_t(const eversion_t &last_update,
4258 const eversion_t &log_tail,
4259 const eversion_t &can_rollback_to,
4260 const eversion_t &rollback_info_trimmed_to,
4261 const std::list<pg_log_entry_t> &entries,
4262 const std::list<pg_log_dup_t> &dup_entries)
4263 : head(last_update), tail(log_tail), can_rollback_to(can_rollback_to),
4264 rollback_info_trimmed_to(rollback_info_trimmed_to) {
4265 for (auto &&entry: entries) {
4266 log.push_back(entry);
4267 }
4268 for (auto &&entry: dup_entries) {
4269 dups.push_back(entry);
4270 }
4271 }
4272
4273 void clear() {
4274 eversion_t z;
4275 rollback_info_trimmed_to = can_rollback_to = head = tail = z;
4276 log.clear();
4277 dups.clear();
4278 }
4279
4280 eversion_t get_rollback_info_trimmed_to() const {
4281 return rollback_info_trimmed_to;
4282 }
4283 eversion_t get_can_rollback_to() const {
4284 return can_rollback_to;
4285 }
4286
4287
4288 pg_log_t split_out_child(pg_t child_pgid, unsigned split_bits) {
4289 mempool::osd_pglog::list<pg_log_entry_t> oldlog, childlog;
4290 oldlog.swap(log);
4291
4292 eversion_t old_tail;
4293 unsigned mask = ~((~0)<<split_bits);
4294 for (auto i = oldlog.begin();
4295 i != oldlog.end();
4296 ) {
4297 if ((i->soid.get_hash() & mask) == child_pgid.m_seed) {
4298 childlog.push_back(*i);
4299 } else {
4300 log.push_back(*i);
4301 }
4302 oldlog.erase(i++);
4303 }
4304
4305 // osd_reqid is unique, so it doesn't matter if there are extra
4306 // dup entries in each pg. To avoid storing oid with the dup
4307 // entries, just copy the whole list.
4308 auto childdups(dups);
4309
4310 return pg_log_t(
4311 head,
4312 tail,
4313 can_rollback_to,
4314 rollback_info_trimmed_to,
4315 std::move(childlog),
4316 std::move(childdups));
4317 }
4318
4319 mempool::osd_pglog::list<pg_log_entry_t> rewind_from_head(eversion_t newhead) {
4320 ceph_assert(newhead >= tail);
4321
4322 mempool::osd_pglog::list<pg_log_entry_t>::iterator p = log.end();
4323 mempool::osd_pglog::list<pg_log_entry_t> divergent;
4324 while (true) {
4325 if (p == log.begin()) {
4326 // yikes, the whole thing is divergent!
4327 using std::swap;
4328 swap(divergent, log);
4329 break;
4330 }
4331 --p;
4332 if (p->version.version <= newhead.version) {
4333 /*
4334 * look at eversion.version here. we want to avoid a situation like:
4335 * our log: 100'10 (0'0) m 10000004d3a.00000000/head by client4225.1:18529
4336 * new log: 122'10 (0'0) m 10000004d3a.00000000/head by client4225.1:18529
4337 * lower_bound = 100'9
4338 * i.e, same request, different version. If the eversion.version is > the
4339 * lower_bound, we it is divergent.
4340 */
4341 ++p;
4342 divergent.splice(divergent.begin(), log, p, log.end());
4343 break;
4344 }
4345 ceph_assert(p->version > newhead);
4346 }
4347 head = newhead;
4348
4349 if (can_rollback_to > newhead)
4350 can_rollback_to = newhead;
4351
4352 if (rollback_info_trimmed_to > newhead)
4353 rollback_info_trimmed_to = newhead;
4354
4355 return divergent;
4356 }
4357
4358 void merge_from(const std::vector<pg_log_t*>& slogs, eversion_t last_update) {
4359 log.clear();
4360
4361 // sort and merge dups
4362 std::multimap<eversion_t,pg_log_dup_t> sorted;
4363 for (auto& d : dups) {
4364 sorted.emplace(d.version, d);
4365 }
4366 for (auto l : slogs) {
4367 for (auto& d : l->dups) {
4368 sorted.emplace(d.version, d);
4369 }
4370 }
4371 dups.clear();
4372 for (auto& i : sorted) {
4373 dups.push_back(i.second);
4374 }
4375
4376 head = last_update;
4377 tail = last_update;
4378 can_rollback_to = last_update;
4379 rollback_info_trimmed_to = last_update;
4380 }
4381
4382 bool empty() const {
4383 return log.empty();
4384 }
4385
4386 bool null() const {
4387 return head.version == 0 && head.epoch == 0;
4388 }
4389
4390 uint64_t approx_size() const {
4391 return head.version - tail.version;
4392 }
4393
4394 static void filter_log(spg_t import_pgid, const OSDMap &curmap,
4395 const std::string &hit_set_namespace, const pg_log_t &in,
4396 pg_log_t &out, pg_log_t &reject);
4397
4398 /**
4399 * copy entries from the tail of another pg_log_t
4400 *
4401 * @param other pg_log_t to copy from
4402 * @param from copy entries after this version
4403 */
4404 void copy_after(CephContext* cct, const pg_log_t &other, eversion_t from);
4405
4406 /**
4407 * copy up to N entries
4408 *
4409 * @param other source log
4410 * @param max max number of entries to copy
4411 */
4412 void copy_up_to(CephContext* cct, const pg_log_t &other, int max);
4413
4414 std::ostream& print(std::ostream& out) const;
4415
4416 void encode(ceph::buffer::list &bl) const;
4417 void decode(ceph::buffer::list::const_iterator &bl, int64_t pool = -1);
4418 void dump(ceph::Formatter *f) const;
4419 static void generate_test_instances(std::list<pg_log_t*>& o);
4420 };
4421 WRITE_CLASS_ENCODER(pg_log_t)
4422
4423 inline std::ostream& operator<<(std::ostream& out, const pg_log_t& log)
4424 {
4425 out << "log((" << log.tail << "," << log.head << "], crt="
4426 << log.get_can_rollback_to() << ")";
4427 return out;
4428 }
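/*
 * Illustrative sketch (not part of this header): rewind_from_head() is
 * how a divergent log is trimmed back to an authoritative head; the
 * values below are hypothetical.
 *
 *   // suppose log covers (0'0, 20'8] and the authoritative head is 20'5
 *   auto divergent = log.rewind_from_head(eversion_t(20, 5));
 *   // log.head is now 20'5; 'divergent' holds the entries newer than
 *   // 20'5, oldest first, for the caller to deal with
 */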
4429
4430
4431 /**
4432 * pg_missing_t - summary of missing objects.
4433 *
4434 * kept in memory, as a supplement to pg_log_t
4435 * also used to pass missing info in messages.
4436 */
4437 struct pg_missing_item {
4438 eversion_t need, have;
4439 ObjectCleanRegions clean_regions;
4440 enum missing_flags_t {
4441 FLAG_NONE = 0,
4442 FLAG_DELETE = 1,
4443 } flags;
4444 pg_missing_item() : flags(FLAG_NONE) {}
4445 explicit pg_missing_item(eversion_t n) : need(n), flags(FLAG_NONE) {} // have no old version
4446 pg_missing_item(eversion_t n, eversion_t h, bool is_delete=false, bool old_style = false) :
4447 need(n), have(h) {
4448 set_delete(is_delete);
4449 if (old_style)
4450 clean_regions.mark_fully_dirty();
4451 }
4452
4453 void encode(ceph::buffer::list& bl, uint64_t features) const {
4454 using ceph::encode;
4455 if (HAVE_FEATURE(features, SERVER_OCTOPUS)) {
4456 // encoding a zeroed eversion_t to differentiate between OSD_RECOVERY_DELETES,
4457 // SERVER_OCTOPUS and legacy unversioned encoding - a need value of 0'0 is not
4458 // possible. This can be replaced with the legacy encoding
4459 encode(eversion_t(), bl);
4460 encode(eversion_t(-1, -1), bl);
4461 encode(need, bl);
4462 encode(have, bl);
4463 encode(static_cast<uint8_t>(flags), bl);
4464 encode(clean_regions, bl);
4465 } else {
4466 encode(eversion_t(), bl);
4467 encode(need, bl);
4468 encode(have, bl);
4469 encode(static_cast<uint8_t>(flags), bl);
4470 }
4471 }
4472 void decode(ceph::buffer::list::const_iterator& bl) {
4473 using ceph::decode;
4474 eversion_t e, l;
4475 decode(e, bl);
4476 decode(l, bl);
4477 if (l == eversion_t(-1, -1)) {
4478 // SERVER_OCTOPUS (and later) encoding: the full item follows
4479 decode(need, bl);
4480 decode(have, bl);
4481 uint8_t f;
4482 decode(f, bl);
4483 flags = static_cast<missing_flags_t>(f);
4484 decode(clean_regions, bl);
4485 } else {
4486 // OSD_RECOVERY_DELETES encoding: 'need' was already decoded into l
4487 need = l;
4488 decode(have, bl);
4489 uint8_t f;
4490 decode(f, bl);
4491 flags = static_cast<missing_flags_t>(f);
4492 clean_regions.mark_fully_dirty();
4493 }
4494 }
4495
4496 void set_delete(bool is_delete) {
4497 flags = is_delete ? FLAG_DELETE : FLAG_NONE;
4498 }
4499
4500 bool is_delete() const {
4501 return (flags & FLAG_DELETE) == FLAG_DELETE;
4502 }
4503
4504 std::string flag_str() const {
4505 if (flags == FLAG_NONE) {
4506 return "none";
4507 } else {
4508 return "delete";
4509 }
4510 }
4511
4512 void dump(ceph::Formatter *f) const {
4513 f->dump_stream("need") << need;
4514 f->dump_stream("have") << have;
4515 f->dump_stream("flags") << flag_str();
4516 f->dump_stream("clean_regions") << clean_regions;
4517 }
4518 static void generate_test_instances(std::list<pg_missing_item*>& o) {
4519 o.push_back(new pg_missing_item);
4520 o.push_back(new pg_missing_item);
4521 o.back()->need = eversion_t(1, 2);
4522 o.back()->have = eversion_t(1, 1);
4523 o.push_back(new pg_missing_item);
4524 o.back()->need = eversion_t(3, 5);
4525 o.back()->have = eversion_t(3, 4);
4526 o.back()->clean_regions.mark_data_region_dirty(4096, 8192);
4527 o.back()->clean_regions.mark_omap_dirty();
4528 o.back()->flags = FLAG_DELETE;
4529 }
4530 bool operator==(const pg_missing_item &rhs) const {
4531 return need == rhs.need && have == rhs.have && flags == rhs.flags;
4532 }
4533 bool operator!=(const pg_missing_item &rhs) const {
4534 return !(*this == rhs);
4535 }
4536 };
4537 WRITE_CLASS_ENCODER_FEATURES(pg_missing_item)
4538 std::ostream& operator<<(std::ostream& out, const pg_missing_item &item);
4539
4540 class pg_missing_const_i {
4541 public:
4542 virtual const std::map<hobject_t, pg_missing_item> &
4543 get_items() const = 0;
4544 virtual const std::map<version_t, hobject_t> &get_rmissing() const = 0;
4545 virtual bool get_may_include_deletes() const = 0;
4546 virtual unsigned int num_missing() const = 0;
4547 virtual bool have_missing() const = 0;
4548 virtual bool is_missing(const hobject_t& oid, pg_missing_item *out = nullptr) const = 0;
4549 virtual bool is_missing(const hobject_t& oid, eversion_t v) const = 0;
4550 virtual ~pg_missing_const_i() {}
4551 };
4552
4553
4554 template <bool Track>
4555 class ChangeTracker {
4556 public:
4557 void changed(const hobject_t &obj) {}
4558 template <typename F>
4559 void get_changed(F &&f) const {}
4560 void flush() {}
4561 bool is_clean() const {
4562 return true;
4563 }
4564 };
4565 template <>
4566 class ChangeTracker<true> {
4567 std::set<hobject_t> _changed;
4568 public:
4569 void changed(const hobject_t &obj) {
4570 _changed.insert(obj);
4571 }
4572 template <typename F>
4573 void get_changed(F &&f) const {
4574 for (auto const &i: _changed) {
4575 f(i);
4576 }
4577 }
4578 void flush() {
4579 _changed.clear();
4580 }
4581 bool is_clean() const {
4582 return _changed.empty();
4583 }
4584 };
4585
4586 template <bool TrackChanges>
4587 class pg_missing_set : public pg_missing_const_i {
4588 using item = pg_missing_item;
4589 std::map<hobject_t, item> missing; // oid -> (need v, have v)
4590 std::map<version_t, hobject_t> rmissing; // v -> oid
4591 ChangeTracker<TrackChanges> tracker;
4592
4593 public:
4594 pg_missing_set() = default;
4595
4596 template <typename missing_type>
4597 pg_missing_set(const missing_type &m) {
4598 missing = m.get_items();
4599 rmissing = m.get_rmissing();
4600 may_include_deletes = m.get_may_include_deletes();
4601 for (auto &&i: missing)
4602 tracker.changed(i.first);
4603 }
4604
4605 bool may_include_deletes = false;
4606
4607 const std::map<hobject_t, item> &get_items() const override {
4608 return missing;
4609 }
4610 const std::map<version_t, hobject_t> &get_rmissing() const override {
4611 return rmissing;
4612 }
4613 bool get_may_include_deletes() const override {
4614 return may_include_deletes;
4615 }
4616 unsigned int num_missing() const override {
4617 return missing.size();
4618 }
4619 bool have_missing() const override {
4620 return !missing.empty();
4621 }
4622 void merge(const pg_log_entry_t& e) {
4623 auto miter = missing.find(e.soid);
4624 if (miter != missing.end() && miter->second.have != eversion_t() && e.version > miter->second.have)
4625 miter->second.clean_regions.merge(e.clean_regions);
4626 }
4627 bool is_missing(const hobject_t& oid, pg_missing_item *out = nullptr) const override {
4628 auto iter = missing.find(oid);
4629 if (iter == missing.end())
4630 return false;
4631 if (out)
4632 *out = iter->second;
4633 return true;
4634 }
4635 bool is_missing(const hobject_t& oid, eversion_t v) const override {
4636 std::map<hobject_t, item>::const_iterator m =
4637 missing.find(oid);
4638 if (m == missing.end())
4639 return false;
4640 const item &item(m->second);
4641 if (item.need > v)
4642 return false;
4643 return true;
4644 }
4645 eversion_t get_oldest_need() const {
4646 if (missing.empty()) {
4647 return eversion_t();
4648 }
4649 auto it = missing.find(rmissing.begin()->second);
4650 ceph_assert(it != missing.end());
4651 return it->second.need;
4652 }
4653
4654 void claim(pg_missing_set& o) {
4655 static_assert(!TrackChanges, "Can't use claim with TrackChanges");
4656 missing.swap(o.missing);
4657 rmissing.swap(o.rmissing);
4658 }
4659
4660 /*
4661 * this needs to be called in log order as we extend the log. it
4662 * assumes missing is accurate up through the previous log entry.
4663 */
4664 void add_next_event(const pg_log_entry_t& e) {
4665 std::map<hobject_t, item>::iterator missing_it;
4666 missing_it = missing.find(e.soid);
4667 bool is_missing_divergent_item = missing_it != missing.end();
4668 if (e.prior_version == eversion_t() || e.is_clone()) {
4669 // new object.
4670 if (is_missing_divergent_item) { // use iterator
4671 rmissing.erase(missing_it->second.need.version);
4672 // .have = nil
4673 missing_it->second = item(e.version, eversion_t(), e.is_delete());
4674 missing_it->second.clean_regions.mark_fully_dirty();
4675 } else {
4676 // create new element in missing map
4677 // .have = nil
4678 missing[e.soid] = item(e.version, eversion_t(), e.is_delete());
4679 missing[e.soid].clean_regions.mark_fully_dirty();
4680 }
4681 } else if (is_missing_divergent_item) {
4682 // already missing (prior).
4683 rmissing.erase((missing_it->second).need.version);
4684 missing_it->second.need = e.version; // leave .have unchanged.
4685 missing_it->second.set_delete(e.is_delete());
4686 if (e.is_lost_revert())
4687 missing_it->second.clean_regions.mark_fully_dirty();
4688 else
4689 missing_it->second.clean_regions.merge(e.clean_regions);
4690 } else {
4691 // not missing, we must have prior_version (if any)
4692 ceph_assert(!is_missing_divergent_item);
4693 missing[e.soid] = item(e.version, e.prior_version, e.is_delete());
4694 if (e.is_lost_revert())
4695 missing[e.soid].clean_regions.mark_fully_dirty();
4696 else
4697 missing[e.soid].clean_regions = e.clean_regions;
4698 }
4699 rmissing[e.version.version] = e.soid;
4700 tracker.changed(e.soid);
4701 }
4702
4703 void revise_need(hobject_t oid, eversion_t need, bool is_delete) {
4704 auto p = missing.find(oid);
4705 if (p != missing.end()) {
4706 rmissing.erase((p->second).need.version);
4707 p->second.need = need; // do not adjust .have
4708 p->second.set_delete(is_delete);
4709 p->second.clean_regions.mark_fully_dirty();
4710 } else {
4711 missing[oid] = item(need, eversion_t(), is_delete);
4712 missing[oid].clean_regions.mark_fully_dirty();
4713 }
4714 rmissing[need.version] = oid;
4715
4716 tracker.changed(oid);
4717 }
4718
4719 void revise_have(hobject_t oid, eversion_t have) {
4720 auto p = missing.find(oid);
4721 if (p != missing.end()) {
4722 tracker.changed(oid);
4723 (p->second).have = have;
4724 }
4725 }
4726
4727 void mark_fully_dirty(const hobject_t& oid) {
4728 auto p = missing.find(oid);
4729 if (p != missing.end()) {
4730 tracker.changed(oid);
4731 (p->second).clean_regions.mark_fully_dirty();
4732 }
4733 }
4734
4735 void add(const hobject_t& oid, eversion_t need, eversion_t have,
4736 bool is_delete) {
4737 missing[oid] = item(need, have, is_delete, true);
4738 rmissing[need.version] = oid;
4739 tracker.changed(oid);
4740 }
4741
4742 void add(const hobject_t& oid, pg_missing_item&& item) {
4743 rmissing[item.need.version] = oid;
4744 missing.insert({oid, std::move(item)});
4745 tracker.changed(oid);
4746 }
4747
4748 void rm(const hobject_t& oid, eversion_t v) {
4749 std::map<hobject_t, item>::iterator p = missing.find(oid);
4750 if (p != missing.end() && p->second.need <= v)
4751 rm(p);
4752 }
4753
4754 void rm(std::map<hobject_t, item>::const_iterator m) {
4755 tracker.changed(m->first);
4756 rmissing.erase(m->second.need.version);
4757 missing.erase(m);
4758 }
4759
4760 void got(const hobject_t& oid, eversion_t v) {
4761 std::map<hobject_t, item>::iterator p = missing.find(oid);
4762 ceph_assert(p != missing.end());
4763 ceph_assert(p->second.need <= v || p->second.is_delete());
4764 got(p);
4765 }
4766
4767 void got(std::map<hobject_t, item>::const_iterator m) {
4768 tracker.changed(m->first);
4769 rmissing.erase(m->second.need.version);
4770 missing.erase(m);
4771 }
4772
4773 void split_into(
4774 pg_t child_pgid,
4775 unsigned split_bits,
4776 pg_missing_set *omissing) {
4777 omissing->may_include_deletes = may_include_deletes;
4778 unsigned mask = ~((~0)<<split_bits);
4779 for (std::map<hobject_t, item>::iterator i = missing.begin();
4780 i != missing.end();
4781 ) {
4782 if ((i->first.get_hash() & mask) == child_pgid.m_seed) {
4783 omissing->add(i->first, i->second.need, i->second.have,
4784 i->second.is_delete());
4785 rm(i++);
4786 } else {
4787 ++i;
4788 }
4789 }
4790 }
4791
4792 void clear() {
4793 for (auto const &i: missing)
4794 tracker.changed(i.first);
4795 missing.clear();
4796 rmissing.clear();
4797 }
4798
4799 void encode(ceph::buffer::list &bl, uint64_t features) const {
4800 ENCODE_START(5, 2, bl)
4801 encode(missing, bl, features);
4802 encode(may_include_deletes, bl);
4803 ENCODE_FINISH(bl);
4804 }
4805 void decode(ceph::buffer::list::const_iterator &bl, int64_t pool = -1) {
4806 for (auto const &i: missing)
4807 tracker.changed(i.first);
4808 DECODE_START_LEGACY_COMPAT_LEN(5, 2, 2, bl);
4809 decode(missing, bl);
4810 if (struct_v >= 4) {
4811 decode(may_include_deletes, bl);
4812 }
4813 DECODE_FINISH(bl);
4814
4815 if (struct_v < 3) {
4816 // Handle hobject_t upgrade
4817 std::map<hobject_t, item> tmp;
4818 for (std::map<hobject_t, item>::iterator i =
4819 missing.begin();
4820 i != missing.end();
4821 ) {
4822 if (!i->first.is_max() && i->first.pool == -1) {
4823 hobject_t to_insert(i->first);
4824 to_insert.pool = pool;
4825 tmp[to_insert] = i->second;
4826 missing.erase(i++);
4827 } else {
4828 ++i;
4829 }
4830 }
4831 missing.insert(tmp.begin(), tmp.end());
4832 }
4833
4834 for (std::map<hobject_t,item>::iterator it =
4835 missing.begin();
4836 it != missing.end();
4837 ++it)
4838 rmissing[it->second.need.version] = it->first;
4839 for (auto const &i: missing)
4840 tracker.changed(i.first);
4841 }
4842 void dump(ceph::Formatter *f) const {
4843 f->open_array_section("missing");
4844 for (std::map<hobject_t,item>::const_iterator p =
4845 missing.begin(); p != missing.end(); ++p) {
4846 f->open_object_section("item");
4847 f->dump_stream("object") << p->first;
4848 p->second.dump(f);
4849 f->close_section();
4850 }
4851 f->close_section();
4852 f->dump_bool("may_include_deletes", may_include_deletes);
4853 }
4854 template <typename F>
4855 void filter_objects(F &&f) {
4856 for (auto i = missing.begin(); i != missing.end();) {
4857 if (f(i->first)) {
4858 rm(i++);
4859 } else {
4860 ++i;
4861 }
4862 }
4863 }
4864 static void generate_test_instances(std::list<pg_missing_set*>& o) {
4865 o.push_back(new pg_missing_set);
4866 o.back()->may_include_deletes = true;
4867 o.push_back(new pg_missing_set);
4868 o.back()->add(
4869 hobject_t(object_t("foo"), "foo", 123, 456, 0, ""),
4870 eversion_t(5, 6), eversion_t(5, 1), false);
4871 o.back()->may_include_deletes = true;
4872 o.push_back(new pg_missing_set);
4873 o.back()->add(
4874 hobject_t(object_t("foo"), "foo", 123, 456, 0, ""),
4875 eversion_t(5, 6), eversion_t(5, 1), true);
4876 o.back()->may_include_deletes = true;
4877 }
4878 template <typename F>
4879 void get_changed(F &&f) const {
4880 tracker.get_changed(f);
4881 }
4882 void flush() {
4883 tracker.flush();
4884 }
4885 bool is_clean() const {
4886 return tracker.is_clean();
4887 }
4888 template <typename missing_t>
4889 bool debug_verify_from_init(
4890 const missing_t &init_missing,
4891 std::ostream *oss) const {
4892 if (!TrackChanges)
4893 return true;
4894 auto check_missing(init_missing.get_items());
4895 tracker.get_changed([&](const hobject_t &hoid) {
4896 check_missing.erase(hoid);
4897 if (missing.count(hoid)) {
4898 check_missing.insert(*(missing.find(hoid)));
4899 }
4900 });
4901 bool ok = true;
4902 if (check_missing.size() != missing.size()) {
4903 if (oss) {
4904 *oss << "Size mismatch, check: " << check_missing.size()
4905 << ", actual: " << missing.size() << "\n";
4906 }
4907 ok = false;
4908 }
4909 for (auto &i: missing) {
4910 if (!check_missing.count(i.first)) {
4911 if (oss)
4912 *oss << "check_missing missing " << i.first << "\n";
4913 ok = false;
4914 } else if (check_missing[i.first] != i.second) {
4915 if (oss)
4916 *oss << "check_missing missing item mismatch on " << i.first
4917 << ", check: " << check_missing[i.first]
4918 << ", actual: " << i.second << "\n";
4919 ok = false;
4920 }
4921 }
4922 if (oss && !ok) {
4923 *oss << "check_missing: " << check_missing << "\n";
4924 std::set<hobject_t> changed;
4925 tracker.get_changed([&](const hobject_t &hoid) { changed.insert(hoid); });
4926 *oss << "changed: " << changed << "\n";
4927 }
4928 return ok;
4929 }
4930 };
4931 template <bool TrackChanges>
4932 void encode(
4933 const pg_missing_set<TrackChanges> &c, ceph::buffer::list &bl, uint64_t features=0) {
4934 ENCODE_DUMP_PRE();
4935 c.encode(bl, features);
4936 ENCODE_DUMP_POST(cl);
4937 }
4938 template <bool TrackChanges>
4939 void decode(pg_missing_set<TrackChanges> &c, ceph::buffer::list::const_iterator &p) {
4940 c.decode(p);
4941 }
4942 template <bool TrackChanges>
4943 std::ostream& operator<<(std::ostream& out, const pg_missing_set<TrackChanges> &missing)
4944 {
4945 out << "missing(" << missing.num_missing()
4946 << " may_include_deletes = " << missing.may_include_deletes;
4947 //if (missing.num_lost()) out << ", " << missing.num_lost() << " lost";
4948 out << ")";
4949 return out;
4950 }
4951
4952 using pg_missing_t = pg_missing_set<false>;
4953 using pg_missing_tracker_t = pg_missing_set<true>;
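/*
 * Illustrative sketch (not part of this header): the tracking variant
 * remembers which oids changed so a caller can persist only deltas;
 * 'oid' is a hypothetical object.
 *
 *   pg_missing_tracker_t missing;
 *   missing.add(oid, eversion_t(5, 6), eversion_t(5, 1), false);
 *   missing.got(oid, eversion_t(5, 6));  // recovered
 *   missing.get_changed([&](const hobject_t &o) {
 *     // queue a persistence update for o
 *   });
 *   missing.flush();  // tracker is clean again
 */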
4954
4955
4956 /**
4957 * pg list objects response format
4958 *
4959 */
4960 struct pg_nls_response_t {
4961 collection_list_handle_t handle;
4962 std::list<librados::ListObjectImpl> entries;
4963
4964 void encode(ceph::buffer::list& bl) const {
4965 ENCODE_START(1, 1, bl);
4966 encode(handle, bl);
4967 __u32 n = (__u32)entries.size();
4968 encode(n, bl);
4969 for (std::list<librados::ListObjectImpl>::const_iterator i = entries.begin(); i != entries.end(); ++i) {
4970 encode(i->nspace, bl);
4971 encode(i->oid, bl);
4972 encode(i->locator, bl);
4973 }
4974 ENCODE_FINISH(bl);
4975 }
4976 void decode(ceph::buffer::list::const_iterator& bl) {
4977 DECODE_START(1, bl);
4978 decode(handle, bl);
4979 __u32 n;
4980 decode(n, bl);
4981 entries.clear();
4982 while (n--) {
4983 librados::ListObjectImpl i;
4984 decode(i.nspace, bl);
4985 decode(i.oid, bl);
4986 decode(i.locator, bl);
4987 entries.push_back(i);
4988 }
4989 DECODE_FINISH(bl);
4990 }
4991 void dump(ceph::Formatter *f) const {
4992 f->dump_stream("handle") << handle;
4993 f->open_array_section("entries");
4994 for (std::list<librados::ListObjectImpl>::const_iterator p = entries.begin(); p != entries.end(); ++p) {
4995 f->open_object_section("object");
4996 f->dump_string("namespace", p->nspace);
4997 f->dump_string("object", p->oid);
4998 f->dump_string("key", p->locator);
4999 f->close_section();
5000 }
5001 f->close_section();
5002 }
5003 static void generate_test_instances(std::list<pg_nls_response_t*>& o) {
5004 o.push_back(new pg_nls_response_t);
5005 o.push_back(new pg_nls_response_t);
5006 o.back()->handle = hobject_t(object_t("hi"), "key", 1, 2, -1, "");
5007 o.back()->entries.push_back(librados::ListObjectImpl("", "one", ""));
5008 o.back()->entries.push_back(librados::ListObjectImpl("", "two", "twokey"));
5009 o.back()->entries.push_back(librados::ListObjectImpl("", "three", ""));
5010 o.push_back(new pg_nls_response_t);
5011 o.back()->handle = hobject_t(object_t("hi"), "key", 3, 4, -1, "");
5012 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1one", ""));
5013 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1two", "n1twokey"));
5014 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1three", ""));
5015 o.push_back(new pg_nls_response_t);
5016 o.back()->handle = hobject_t(object_t("hi"), "key", 5, 6, -1, "");
5017 o.back()->entries.push_back(librados::ListObjectImpl("", "one", ""));
5018 o.back()->entries.push_back(librados::ListObjectImpl("", "two", "twokey"));
5019 o.back()->entries.push_back(librados::ListObjectImpl("", "three", ""));
5020 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1one", ""));
5021 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1two", "n1twokey"));
5022 o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1three", ""));
5023 }
5024 };
5025
5026 WRITE_CLASS_ENCODER(pg_nls_response_t)
5027
5028 // For backwards compatibility with older OSD requests
5029 struct pg_ls_response_t {
5030 collection_list_handle_t handle;
5031 std::list<std::pair<object_t, std::string> > entries;
5032
5033 void encode(ceph::buffer::list& bl) const {
5034 using ceph::encode;
5035 __u8 v = 1;
5036 encode(v, bl);
5037 encode(handle, bl);
5038 encode(entries, bl);
5039 }
5040 void decode(ceph::buffer::list::const_iterator& bl) {
5041 using ceph::decode;
5042 __u8 v;
5043 decode(v, bl);
5044 ceph_assert(v == 1);
5045 decode(handle, bl);
5046 decode(entries, bl);
5047 }
5048 void dump(ceph::Formatter *f) const {
5049 f->dump_stream("handle") << handle;
5050 f->open_array_section("entries");
5051 for (std::list<std::pair<object_t, std::string> >::const_iterator p = entries.begin(); p != entries.end(); ++p) {
5052 f->open_object_section("object");
5053 f->dump_stream("object") << p->first;
5054 f->dump_string("key", p->second);
5055 f->close_section();
5056 }
5057 f->close_section();
5058 }
5059 static void generate_test_instances(std::list<pg_ls_response_t*>& o) {
5060 o.push_back(new pg_ls_response_t);
5061 o.push_back(new pg_ls_response_t);
5062 o.back()->handle = hobject_t(object_t("hi"), "key", 1, 2, -1, "");
5063 o.back()->entries.push_back(std::make_pair(object_t("one"), std::string()));
5064 o.back()->entries.push_back(std::make_pair(object_t("two"), std::string("twokey")));
5065 }
5066 };
5067
5068 WRITE_CLASS_ENCODER(pg_ls_response_t)
5069
5070 /**
5071 * object_copy_cursor_t
5072 */
5073 struct object_copy_cursor_t {
5074 uint64_t data_offset;
5075 std::string omap_offset;
5076 bool attr_complete;
5077 bool data_complete;
5078 bool omap_complete;
5079
5080 object_copy_cursor_t()
5081 : data_offset(0),
5082 attr_complete(false),
5083 data_complete(false),
5084 omap_complete(false)
5085 {}
5086
5087 bool is_initial() const {
5088 return !attr_complete && data_offset == 0 && omap_offset.empty();
5089 }
5090 bool is_complete() const {
5091 return attr_complete && data_complete && omap_complete;
5092 }
5093
5094 static void generate_test_instances(std::list<object_copy_cursor_t*>& o);
5095 void encode(ceph::buffer::list& bl) const;
5096 void decode(ceph::buffer::list::const_iterator &bl);
5097 void dump(ceph::Formatter *f) const;
5098 };
5099 WRITE_CLASS_ENCODER(object_copy_cursor_t)
5100
5101 /**
5102 * object_copy_data_t
5103 *
5104 * Return data from a copy request. The semantics are a little strange
5105 * as a result of the encoding's heritage.
5106 *
5107 * In particular, the sender unconditionally fills in the cursor (from what
5108 * it receives and sends), the size, and the mtime, but is responsible for
5109 * figuring out whether it should put any data in the attrs, data, or
5110 * omap members (corresponding to xattrs, object data, and the omap entries)
5111 * based on external data (the client includes a max amount to return with
5112 * the copy request). The client then looks into the attrs, data, and/or omap
5113 * based on the contents of the cursor.
5114 */
5115 struct object_copy_data_t {
5116 enum {
5117 FLAG_DATA_DIGEST = 1<<0,
5118 FLAG_OMAP_DIGEST = 1<<1,
5119 };
5120 object_copy_cursor_t cursor;
5121 uint64_t size;
5122 utime_t mtime;
5123 uint32_t data_digest, omap_digest;
5124 uint32_t flags;
5125 std::map<std::string, ceph::buffer::list> attrs;
5126 ceph::buffer::list data;
5127 ceph::buffer::list omap_header;
5128 ceph::buffer::list omap_data;
5129
5130 /// which snaps we are defined for (if a snap and not the head)
5131 std::vector<snapid_t> snaps;
5132 /// latest snap seq for the object (if head)
5133 snapid_t snap_seq;
5134
5135 /// recent reqids on this object
5136 mempool::osd_pglog::vector<std::pair<osd_reqid_t, version_t> > reqids;
5137
5138 /// map reqids by index to error return code (if any)
5139 mempool::osd_pglog::map<uint32_t, int> reqid_return_codes;
5140
5141 uint64_t truncate_seq;
5142 uint64_t truncate_size;
5143
5144 public:
5145 object_copy_data_t() :
5146 size((uint64_t)-1), data_digest(-1),
5147 omap_digest(-1), flags(0),
5148 truncate_seq(0),
5149 truncate_size(0) {}
5150
5151 static void generate_test_instances(std::list<object_copy_data_t*>& o);
5152 void encode(ceph::buffer::list& bl, uint64_t features) const;
5153 void decode(ceph::buffer::list::const_iterator& bl);
5154 void dump(ceph::Formatter *f) const;
5155 };
5156 WRITE_CLASS_ENCODER_FEATURES(object_copy_data_t)
5157
5158 /**
5159 * pg creation info
5160 */
5161 struct pg_create_t {
5162 epoch_t created; // epoch pg created
5163 pg_t parent; // split from parent (if != pg_t())
5164 __s32 split_bits;
5165
5166 pg_create_t()
5167 : created(0), split_bits(0) {}
5168 pg_create_t(unsigned c, pg_t p, int s)
5169 : created(c), parent(p), split_bits(s) {}
5170
5171 void encode(ceph::buffer::list &bl) const;
5172 void decode(ceph::buffer::list::const_iterator &bl);
5173 void dump(ceph::Formatter *f) const;
5174 static void generate_test_instances(std::list<pg_create_t*>& o);
5175 };
5176 WRITE_CLASS_ENCODER(pg_create_t)
5177
5178 // -----------------------------------------
5179
5180 class ObjectExtent {
5181 /**
5182 * ObjectExtents are used for specifying IO behavior against RADOS
5183 * objects when one is using the ObjectCacher.
5184 *
5185 * To use this in a real system, *every member* must be filled
5186 * out correctly. In particular, make sure to initialize the
5187 * oloc correctly, as its default values are deliberate poison
5188 * and will cause internal ObjectCacher asserts.
5189 *
5190 * Similarly, your buffer_extents vector *must* specify a total
5191 * size equal to your length. If the buffer_extents inadvertently
5192 * contain less space than the length member specifies, you
5193 * will get unintelligible asserts deep in the ObjectCacher.
5194 *
5195 * If you are trying to do testing and don't care about actual
5196 * RADOS function, the simplest thing to do is to initialize
5197 * the ObjectExtent (truncate_size can be 0), create a single entry
5198 * in buffer_extents matching the length, and set oloc.pool to 0.
5199 */
5200 public:
5201 object_t oid; // object id
5202 uint64_t objectno;
5203 uint64_t offset; // in object
5204 uint64_t length; // in object
5205 uint64_t truncate_size; // in object
5206
5207 object_locator_t oloc; // object locator (pool etc)
5208
5209 std::vector<std::pair<uint64_t,uint64_t> > buffer_extents; // off -> len. extents in buffer being mapped (may be fragmented bc of striping!)
5210
5211 ObjectExtent() : objectno(0), offset(0), length(0), truncate_size(0) {}
5212 ObjectExtent(object_t o, uint64_t ono, uint64_t off, uint64_t l, uint64_t ts) :
5213 oid(o), objectno(ono), offset(off), length(l), truncate_size(ts) { }
5214 };
5215
5216 inline std::ostream& operator<<(std::ostream& out, const ObjectExtent &ex)
5217 {
5218 return out << "extent("
5219 << ex.oid << " (" << ex.objectno << ") in " << ex.oloc
5220 << " " << ex.offset << "~" << ex.length
5221 << " -> " << ex.buffer_extents
5222 << ")";
5223 }
5224
5225
5226 // ---------------------------------------
5227
5228 class OSDSuperblock {
5229 public:
5230 uuid_d cluster_fsid, osd_fsid;
5231 int32_t whoami = -1; // my role in this fs.
5232 epoch_t current_epoch = 0; // most recent epoch
5233 epoch_t oldest_map = 0, newest_map = 0; // oldest/newest maps we have.
5234 double weight = 0.0;
5235
5236 CompatSet compat_features;
5237
5238 // last interval over which i mounted and was then active
5239 epoch_t mounted = 0; // last epoch i mounted
5240 epoch_t clean_thru = 0; // epoch i was active and clean thru
5241
5242 epoch_t purged_snaps_last = 0;
5243 utime_t last_purged_snaps_scrub;
5244
5245 void encode(ceph::buffer::list &bl) const;
5246 void decode(ceph::buffer::list::const_iterator &bl);
5247 void dump(ceph::Formatter *f) const;
5248 static void generate_test_instances(std::list<OSDSuperblock*>& o);
5249 };
5250 WRITE_CLASS_ENCODER(OSDSuperblock)
5251
5252 inline std::ostream& operator<<(std::ostream& out, const OSDSuperblock& sb)
5253 {
5254 return out << "sb(" << sb.cluster_fsid
5255 << " osd." << sb.whoami
5256 << " " << sb.osd_fsid
5257 << " e" << sb.current_epoch
5258 << " [" << sb.oldest_map << "," << sb.newest_map << "]"
5259 << " lci=[" << sb.mounted << "," << sb.clean_thru << "]"
5260 << ")";
5261 }
5262
5263
5264 // -------
5265
5266
5267
5268
5269
5270
5271 /*
5272 * attached to object head. describes most recent snap context, and
5273 * set of existing clones.
5274 */
5275 struct SnapSet {
5276 snapid_t seq;
5277 // NOTE: this is for pre-octopus compatibility only! remove in Q release
5278 std::vector<snapid_t> snaps; // descending
5279 std::vector<snapid_t> clones; // ascending
5280 std::map<snapid_t, interval_set<uint64_t> > clone_overlap; // overlap w/ next newest
5281 std::map<snapid_t, uint64_t> clone_size;
5282 std::map<snapid_t, std::vector<snapid_t>> clone_snaps; // descending
5283
5284 SnapSet() : seq(0) {}
5285 explicit SnapSet(ceph::buffer::list& bl) {
5286 auto p = std::cbegin(bl);
5287 decode(p);
5288 }
5289
5290 /// populate SnapSet from a librados::snap_set_t
5291 void from_snap_set(const librados::snap_set_t& ss, bool legacy);
5292
5293 /// get space accounted to clone
5294 uint64_t get_clone_bytes(snapid_t clone) const;
5295
5296 void encode(ceph::buffer::list& bl) const;
5297 void decode(ceph::buffer::list::const_iterator& bl);
5298 void dump(ceph::Formatter *f) const;
5299 static void generate_test_instances(std::list<SnapSet*>& o);
5300
5301 SnapContext get_ssc_as_of(snapid_t as_of) const {
5302 SnapContext out;
5303 out.seq = as_of;
5304 for (auto p = clone_snaps.rbegin();
5305 p != clone_snaps.rend();
5306 ++p) {
5307 for (auto snap : p->second) {
5308 if (snap <= as_of) {
5309 out.snaps.push_back(snap);
5310 }
5311 }
5312 }
5313 return out;
5314 }
5315
5316
5317 SnapSet get_filtered(const pg_pool_t &pinfo) const;
5318 void filter(const pg_pool_t &pinfo);
5319 };
5320 WRITE_CLASS_ENCODER(SnapSet)
5321
5322 std::ostream& operator<<(std::ostream& out, const SnapSet& cs);
5323
5324
5325
5326 #define OI_ATTR "_"
5327 #define SS_ATTR "snapset"
5328
5329 struct watch_info_t {
5330 uint64_t cookie;
5331 uint32_t timeout_seconds;
5332 entity_addr_t addr;
5333
5334 watch_info_t() : cookie(0), timeout_seconds(0) { }
5335 watch_info_t(uint64_t c, uint32_t t, const entity_addr_t& a) : cookie(c), timeout_seconds(t), addr(a) {}
5336
5337 void encode(ceph::buffer::list& bl, uint64_t features) const;
5338 void decode(ceph::buffer::list::const_iterator& bl);
5339 void dump(ceph::Formatter *f) const;
5340 static void generate_test_instances(std::list<watch_info_t*>& o);
5341 };
5342 WRITE_CLASS_ENCODER_FEATURES(watch_info_t)
5343
5344 static inline bool operator==(const watch_info_t& l, const watch_info_t& r) {
5345 return l.cookie == r.cookie && l.timeout_seconds == r.timeout_seconds
5346 && l.addr == r.addr;
5347 }
5348
5349 static inline std::ostream& operator<<(std::ostream& out, const watch_info_t& w) {
5350 return out << "watch(cookie " << w.cookie << " " << w.timeout_seconds << "s"
5351 << " " << w.addr << ")";
5352 }
5353
5354 struct notify_info_t {
5355 uint64_t cookie;
5356 uint64_t notify_id;
5357 uint32_t timeout;
5358 ceph::buffer::list bl;
5359 };
5360
5361 static inline std::ostream& operator<<(std::ostream& out, const notify_info_t& n) {
5362 return out << "notify(cookie " << n.cookie
5363 << " notify" << n.notify_id
5364 << " " << n.timeout << "s)";
5365 }
5366
5367 struct chunk_info_t {
5368 typedef enum {
5369 FLAG_DIRTY = 1,
5370 FLAG_MISSING = 2,
5371 FLAG_HAS_REFERENCE = 4,
5372 FLAG_HAS_FINGERPRINT = 8,
5373 } cflag_t;
5374 uint32_t offset;
5375 uint32_t length;
5376 hobject_t oid;
5377 cflag_t flags; // FLAG_*
5378
5379 chunk_info_t() : offset(0), length(0), flags((cflag_t)0) { }
5380
5381 static std::string get_flag_string(uint64_t flags) {
5382 std::string r;
5383 if (flags & FLAG_DIRTY) {
5384 r += "|dirty";
5385 }
5386 if (flags & FLAG_MISSING) {
5387 r += "|missing";
5388 }
5389 if (flags & FLAG_HAS_REFERENCE) {
5390 r += "|has_reference";
5391 }
5392 if (flags & FLAG_HAS_FINGERPRINT) {
5393 r += "|has_fingerprint";
5394 }
5395 if (r.length())
5396 return r.substr(1);
5397 return r;
5398 }
5399 bool test_flag(cflag_t f) const {
5400 return (flags & f) == f;
5401 }
5402 void set_flag(cflag_t f) {
5403 flags = (cflag_t)(flags | f);
5404 }
5405 void set_flags(cflag_t f) {
5406 flags = f;
5407 }
5408 void clear_flag(cflag_t f) {
5409 flags = (cflag_t)(flags & ~f);
5410 }
5411 void clear_flags() {
5412 flags = (cflag_t)0;
5413 }
5414 bool is_dirty() const {
5415 return test_flag(FLAG_DIRTY);
5416 }
5417 bool is_missing() const {
5418 return test_flag(FLAG_MISSING);
5419 }
5420 bool has_reference() const {
5421 return test_flag(FLAG_HAS_REFERENCE);
5422 }
5423 bool has_fingerprint() const {
5424 return test_flag(FLAG_HAS_FINGERPRINT);
5425 }
5426 void encode(ceph::buffer::list &bl) const;
5427 void decode(ceph::buffer::list::const_iterator &bl);
5428 void dump(ceph::Formatter *f) const;
5429 friend std::ostream& operator<<(std::ostream& out, const chunk_info_t& ci);
5430 };
5431 WRITE_CLASS_ENCODER(chunk_info_t)
5432 std::ostream& operator<<(std::ostream& out, const chunk_info_t& ci);
5433
5434 struct object_info_t;
5435 struct object_manifest_t {
5436 enum {
5437 TYPE_NONE = 0,
5438 TYPE_REDIRECT = 1,
5439 TYPE_CHUNKED = 2,
5440 };
5441 uint8_t type; // redirect, chunked, ...
5442 hobject_t redirect_target;
5443 std::map<uint64_t, chunk_info_t> chunk_map;
5444
5445 object_manifest_t() : type(0) { }
5446 object_manifest_t(uint8_t type, const hobject_t& redirect_target)
5447 : type(type), redirect_target(redirect_target) { }
5448
5449 bool is_empty() const {
5450 return type == TYPE_NONE;
5451 }
5452 bool is_redirect() const {
5453 return type == TYPE_REDIRECT;
5454 }
5455 bool is_chunked() const {
5456 return type == TYPE_CHUNKED;
5457 }
5458 static std::string_view get_type_name(uint8_t m) {
5459 switch (m) {
5460 case TYPE_NONE: return "none";
5461 case TYPE_REDIRECT: return "redirect";
5462 case TYPE_CHUNKED: return "chunked";
5463 default: return "unknown";
5464 }
5465 }
5466 std::string_view get_type_name() const {
5467 return get_type_name(type);
5468 }
5469 void clear() {
5470 type = 0;
5471 redirect_target = hobject_t();
5472 chunk_map.clear();
5473 }
5474 static void generate_test_instances(std::list<object_manifest_t*>& o);
5475 void encode(ceph::buffer::list &bl) const;
5476 void decode(ceph::buffer::list::const_iterator &bl);
5477 void dump(ceph::Formatter *f) const;
5478 friend std::ostream& operator<<(std::ostream& out, const object_info_t& oi);
5479 };
5480 WRITE_CLASS_ENCODER(object_manifest_t)
5481 std::ostream& operator<<(std::ostream& out, const object_manifest_t& oi);
5482
5483 struct object_info_t {
5484 hobject_t soid;
5485 eversion_t version, prior_version;
5486 version_t user_version;
5487 osd_reqid_t last_reqid;
5488
5489 uint64_t size;
5490 utime_t mtime;
5491 utime_t local_mtime; // local mtime
5492
5493 // note: these are currently encoded into a total 16 bits; see
5494 // encode()/decode() for the weirdness.
5495 typedef enum {
5496 FLAG_LOST = 1<<0,
5497 FLAG_WHITEOUT = 1<<1, // object logically does not exist
5498 FLAG_DIRTY = 1<<2, // object has been modified since last flushed or undirtied
5499 FLAG_OMAP = 1<<3, // has (or may have) some/any omap data
5500 FLAG_DATA_DIGEST = 1<<4, // has data crc
5501 FLAG_OMAP_DIGEST = 1<<5, // has omap crc
5502 FLAG_CACHE_PIN = 1<<6, // pin the object in cache tier
5503 FLAG_MANIFEST = 1<<7, // has manifest
5504 FLAG_USES_TMAP = 1<<8, // deprecated; no longer used
5505 FLAG_REDIRECT_HAS_REFERENCE = 1<<9, // has reference
5506 } flag_t;
5507
5508 flag_t flags;
5509
5510 static std::string get_flag_string(flag_t flags) {
5511 std::string s;
5512 std::vector<std::string> sv = get_flag_vector(flags);
5513 for (auto ss : sv) {
5514 s += std::string("|") + ss;
5515 }
5516 if (s.length())
5517 return s.substr(1);
5518 return s;
5519 }
5520 static std::vector<std::string> get_flag_vector(flag_t flags) {
5521 std::vector<std::string> sv;
5522 if (flags & FLAG_LOST)
5523 sv.insert(sv.end(), "lost");
5524 if (flags & FLAG_WHITEOUT)
5525 sv.insert(sv.end(), "whiteout");
5526 if (flags & FLAG_DIRTY)
5527 sv.insert(sv.end(), "dirty");
5528 if (flags & FLAG_USES_TMAP)
5529 sv.insert(sv.end(), "uses_tmap");
5530 if (flags & FLAG_OMAP)
5531 sv.insert(sv.end(), "omap");
5532 if (flags & FLAG_DATA_DIGEST)
5533 sv.insert(sv.end(), "data_digest");
5534 if (flags & FLAG_OMAP_DIGEST)
5535 sv.insert(sv.end(), "omap_digest");
5536 if (flags & FLAG_CACHE_PIN)
5537 sv.insert(sv.end(), "cache_pin");
5538 if (flags & FLAG_MANIFEST)
5539 sv.insert(sv.end(), "manifest");
5540 if (flags & FLAG_REDIRECT_HAS_REFERENCE)
5541 sv.insert(sv.end(), "redirect_has_reference");
5542 return sv;
5543 }
5544 std::string get_flag_string() const {
5545 return get_flag_string(flags);
5546 }
5547
5548 uint64_t truncate_seq, truncate_size;
5549
5550 std::map<std::pair<uint64_t, entity_name_t>, watch_info_t> watchers;
5551
5552 // opportunistic checksums; may or may not be present
5553 __u32 data_digest; ///< data crc32c
5554 __u32 omap_digest; ///< omap crc32c
5555
5556 // alloc hint attribute
5557 uint64_t expected_object_size, expected_write_size;
5558 uint32_t alloc_hint_flags;
5559
5560 struct object_manifest_t manifest;
5561
5562 void copy_user_bits(const object_info_t& other);
5563
5564 bool test_flag(flag_t f) const {
5565 return (flags & f) == f;
5566 }
5567 void set_flag(flag_t f) {
5568 flags = (flag_t)(flags | f);
5569 }
5570 void clear_flag(flag_t f) {
5571 flags = (flag_t)(flags & ~f);
5572 }
5573 bool is_lost() const {
5574 return test_flag(FLAG_LOST);
5575 }
5576 bool is_whiteout() const {
5577 return test_flag(FLAG_WHITEOUT);
5578 }
5579 bool is_dirty() const {
5580 return test_flag(FLAG_DIRTY);
5581 }
5582 bool is_omap() const {
5583 return test_flag(FLAG_OMAP);
5584 }
5585 bool is_data_digest() const {
5586 return test_flag(FLAG_DATA_DIGEST);
5587 }
5588 bool is_omap_digest() const {
5589 return test_flag(FLAG_OMAP_DIGEST);
5590 }
5591 bool is_cache_pinned() const {
5592 return test_flag(FLAG_CACHE_PIN);
5593 }
5594 bool has_manifest() const {
5595 return test_flag(FLAG_MANIFEST);
5596 }
5597 void set_data_digest(__u32 d) {
5598 set_flag(FLAG_DATA_DIGEST);
5599 data_digest = d;
5600 }
5601 void set_omap_digest(__u32 d) {
5602 set_flag(FLAG_OMAP_DIGEST);
5603 omap_digest = d;
5604 }
5605 void clear_data_digest() {
5606 clear_flag(FLAG_DATA_DIGEST);
5607 data_digest = -1;
5608 }
5609 void clear_omap_digest() {
5610 clear_flag(FLAG_OMAP_DIGEST);
5611 omap_digest = -1;
5612 }
5613 void new_object() {
5614 clear_data_digest();
5615 clear_omap_digest();
5616 }
5617
5618 void encode(ceph::buffer::list& bl, uint64_t features) const;
5619 void decode(ceph::buffer::list::const_iterator& bl);
5620 void decode(ceph::buffer::list& bl) {
5621 auto p = std::cbegin(bl);
5622 decode(p);
5623 }
5624 void dump(ceph::Formatter *f) const;
5625 static void generate_test_instances(std::list<object_info_t*>& o);
5626
5627 explicit object_info_t()
5628 : user_version(0), size(0), flags((flag_t)0),
5629 truncate_seq(0), truncate_size(0),
5630 data_digest(-1), omap_digest(-1),
5631 expected_object_size(0), expected_write_size(0),
5632 alloc_hint_flags(0)
5633 {}
5634
5635 explicit object_info_t(const hobject_t& s)
5636 : soid(s),
5637 user_version(0), size(0), flags((flag_t)0),
5638 truncate_seq(0), truncate_size(0),
5639 data_digest(-1), omap_digest(-1),
5640 expected_object_size(0), expected_write_size(0),
5641 alloc_hint_flags(0)
5642 {}
5643
5644 explicit object_info_t(ceph::buffer::list& bl) {
5645 decode(bl);
5646 }
5647 };
5648 WRITE_CLASS_ENCODER_FEATURES(object_info_t)
5649
5650 std::ostream& operator<<(std::ostream& out, const object_info_t& oi);
5651
5652
5653
5654 // Object recovery
5655 struct ObjectRecoveryInfo {
5656 hobject_t soid;
5657 eversion_t version;
5658 uint64_t size;
5659 object_info_t oi;
5660 SnapSet ss; // only populated if soid is_snap()
5661 interval_set<uint64_t> copy_subset;
5662 std::map<hobject_t, interval_set<uint64_t>> clone_subset;
5663 bool object_exist;
5664
5665 ObjectRecoveryInfo() : size(0), object_exist(true) { }
5666
5667 static void generate_test_instances(std::list<ObjectRecoveryInfo*>& o);
5668 void encode(ceph::buffer::list &bl, uint64_t features) const;
5669 void decode(ceph::buffer::list::const_iterator &bl, int64_t pool = -1);
5670 std::ostream &print(std::ostream &out) const;
5671 void dump(ceph::Formatter *f) const;
5672 };
5673 WRITE_CLASS_ENCODER_FEATURES(ObjectRecoveryInfo)
5674 std::ostream& operator<<(std::ostream& out, const ObjectRecoveryInfo &inf);
5675
5676 struct ObjectRecoveryProgress {
5677 uint64_t data_recovered_to;
5678 std::string omap_recovered_to;
5679 bool first;
5680 bool data_complete;
5681 bool omap_complete;
5682 bool error = false;
5683
5684 ObjectRecoveryProgress()
5685 : data_recovered_to(0),
5686 first(true),
5687 data_complete(false), omap_complete(false) { }
5688
5689 bool is_complete(const ObjectRecoveryInfo& info) const {
5690 return (data_recovered_to >= (
5691 info.copy_subset.empty() ?
5692 0 : info.copy_subset.range_end())) &&
5693 omap_complete;
5694 }
5695
5696 static void generate_test_instances(std::list<ObjectRecoveryProgress*>& o);
5697 void encode(ceph::buffer::list &bl) const;
5698 void decode(ceph::buffer::list::const_iterator &bl);
5699 std::ostream &print(std::ostream &out) const;
5700 void dump(ceph::Formatter *f) const;
5701 };
5702 WRITE_CLASS_ENCODER(ObjectRecoveryProgress)
5703 std::ostream& operator<<(std::ostream& out, const ObjectRecoveryProgress &prog);
5704
5705 struct PushReplyOp {
5706 hobject_t soid;
5707
5708 static void generate_test_instances(std::list<PushReplyOp*>& o);
5709 void encode(ceph::buffer::list &bl) const;
5710 void decode(ceph::buffer::list::const_iterator &bl);
5711 std::ostream &print(std::ostream &out) const;
5712 void dump(ceph::Formatter *f) const;
5713
5714 uint64_t cost(CephContext *cct) const;
5715 };
5716 WRITE_CLASS_ENCODER(PushReplyOp)
5717 std::ostream& operator<<(std::ostream& out, const PushReplyOp &op);
5718
5719 struct PullOp {
5720 hobject_t soid;
5721
5722 ObjectRecoveryInfo recovery_info;
5723 ObjectRecoveryProgress recovery_progress;
5724
5725 static void generate_test_instances(std::list<PullOp*>& o);
5726 void encode(ceph::buffer::list &bl, uint64_t features) const;
5727 void decode(ceph::buffer::list::const_iterator &bl);
5728 std::ostream &print(std::ostream &out) const;
5729 void dump(ceph::Formatter *f) const;
5730
5731 uint64_t cost(CephContext *cct) const;
5732 };
5733 WRITE_CLASS_ENCODER_FEATURES(PullOp)
5734 std::ostream& operator<<(std::ostream& out, const PullOp &op);
5735
5736 struct PushOp {
5737 hobject_t soid;
5738 eversion_t version;
5739 ceph::buffer::list data;
5740 interval_set<uint64_t> data_included;
5741 ceph::buffer::list omap_header;
5742 std::map<std::string, ceph::buffer::list> omap_entries;
5743 std::map<std::string, ceph::buffer::list> attrset;
5744
5745 ObjectRecoveryInfo recovery_info;
5746 ObjectRecoveryProgress before_progress;
5747 ObjectRecoveryProgress after_progress;
5748
5749 static void generate_test_instances(std::list<PushOp*>& o);
5750 void encode(ceph::buffer::list &bl, uint64_t features) const;
5751 void decode(ceph::buffer::list::const_iterator &bl);
5752 std::ostream &print(std::ostream &out) const;
5753 void dump(ceph::Formatter *f) const;
5754
5755 uint64_t cost(CephContext *cct) const;
5756 };
5757 WRITE_CLASS_ENCODER_FEATURES(PushOp)
5758 std::ostream& operator<<(std::ostream& out, const PushOp &op);
5759
5760
5761 /*
5762 * summarize pg contents for purposes of a scrub
5763 */
5764 struct ScrubMap {
5765 struct object {
5766 std::map<std::string, ceph::buffer::ptr> attrs;
5767 uint64_t size;
5768 __u32 omap_digest; ///< omap crc32c
5769 __u32 digest; ///< data crc32c
5770 bool negative:1;
5771 bool digest_present:1;
5772 bool omap_digest_present:1;
5773 bool read_error:1;
5774 bool stat_error:1;
5775 bool ec_hash_mismatch:1;
5776 bool ec_size_mismatch:1;
5777 bool large_omap_object_found:1;
5778 uint64_t large_omap_object_key_count = 0;
5779 uint64_t large_omap_object_value_size = 0;
5780 uint64_t object_omap_bytes = 0;
5781 uint64_t object_omap_keys = 0;
5782
5783 object() :
5784 // Init invalid size so it won't match if we get a stat EIO error
5785 size(-1), omap_digest(0), digest(0),
5786 negative(false), digest_present(false), omap_digest_present(false),
5787 read_error(false), stat_error(false), ec_hash_mismatch(false),
5788 ec_size_mismatch(false), large_omap_object_found(false) {}
5789
5790 void encode(ceph::buffer::list& bl) const;
5791 void decode(ceph::buffer::list::const_iterator& bl);
5792 void dump(ceph::Formatter *f) const;
5793 static void generate_test_instances(std::list<object*>& o);
5794 };
5795 WRITE_CLASS_ENCODER(object)
5796
5797 std::map<hobject_t,object> objects;
5798 eversion_t valid_through;
5799 eversion_t incr_since;
5800 bool has_large_omap_object_errors:1;
5801 bool has_omap_keys:1;
5802
5803 void merge_incr(const ScrubMap &l);
5804 void clear_from(const hobject_t& start) {
5805 objects.erase(objects.lower_bound(start), objects.end());
5806 }
5807 void insert(const ScrubMap &r) {
5808 objects.insert(r.objects.begin(), r.objects.end());
5809 }
5810 void swap(ScrubMap &r) {
5811 using std::swap;
5812 swap(objects, r.objects);
5813 swap(valid_through, r.valid_through);
5814 swap(incr_since, r.incr_since);
5815 }
5816
5817 void encode(ceph::buffer::list& bl) const;
5818 void decode(ceph::buffer::list::const_iterator& bl, int64_t pool=-1);
5819 void dump(ceph::Formatter *f) const;
5820 static void generate_test_instances(std::list<ScrubMap*>& o);
5821 };
5822 WRITE_CLASS_ENCODER(ScrubMap::object)
5823 WRITE_CLASS_ENCODER(ScrubMap)
5824
5825 struct ScrubMapBuilder {
5826 bool deep = false;
5827 std::vector<hobject_t> ls;
5828 size_t pos = 0;
5829 int64_t data_pos = 0;
5830 std::string omap_pos;
5831 int ret = 0;
5832 ceph::buffer::hash data_hash, omap_hash; ///< accumulatinng hash value
5833 uint64_t omap_keys = 0;
5834 uint64_t omap_bytes = 0;
5835
5836 bool empty() {
5837 return ls.empty();
5838 }
5839 bool done() {
5840 return pos >= ls.size();
5841 }
5842 void reset() {
5843 *this = ScrubMapBuilder();
5844 }
5845
5846 bool data_done() {
5847 return data_pos < 0;
5848 }
5849
5850 void next_object() {
5851 ++pos;
5852 data_pos = 0;
5853 omap_pos.clear();
5854 omap_keys = 0;
5855 omap_bytes = 0;
5856 }
5857
5858 friend std::ostream& operator<<(std::ostream& out, const ScrubMapBuilder& pos) {
5859 out << "(" << pos.pos << "/" << pos.ls.size();
5860 if (pos.pos < pos.ls.size()) {
5861 out << " " << pos.ls[pos.pos];
5862 }
5863 if (pos.data_pos < 0) {
5864 out << " byte " << pos.data_pos;
5865 }
5866 if (!pos.omap_pos.empty()) {
5867 out << " key " << pos.omap_pos;
5868 }
5869 if (pos.deep) {
5870 out << " deep";
5871 }
5872 if (pos.ret) {
5873 out << " ret " << pos.ret;
5874 }
5875 return out << ")";
5876 }
5877 };
5878
5879 struct watch_item_t {
5880 entity_name_t name;
5881 uint64_t cookie;
5882 uint32_t timeout_seconds;
5883 entity_addr_t addr;
5884
5885 watch_item_t() : cookie(0), timeout_seconds(0) { }
5886 watch_item_t(entity_name_t name, uint64_t cookie, uint32_t timeout,
5887 const entity_addr_t& addr)
5888 : name(name), cookie(cookie), timeout_seconds(timeout),
5889 addr(addr) { }
5890
5891 void encode(ceph::buffer::list &bl, uint64_t features) const {
5892 ENCODE_START(2, 1, bl);
5893 encode(name, bl);
5894 encode(cookie, bl);
5895 encode(timeout_seconds, bl);
5896 encode(addr, bl, features);
5897 ENCODE_FINISH(bl);
5898 }
5899 void decode(ceph::buffer::list::const_iterator &bl) {
5900 DECODE_START(2, bl);
5901 decode(name, bl);
5902 decode(cookie, bl);
5903 decode(timeout_seconds, bl);
5904 if (struct_v >= 2) {
5905 decode(addr, bl);
5906 }
5907 DECODE_FINISH(bl);
5908 }
5909 void dump(ceph::Formatter *f) const {
5910 f->dump_stream("watcher") << name;
5911 f->dump_int("cookie", cookie);
5912 f->dump_int("timeout", timeout_seconds);
5913 f->open_object_section("addr");
5914 addr.dump(f);
5915 f->close_section();
5916 }
5917 static void generate_test_instances(std::list<watch_item_t*>& o) {
5918 entity_addr_t ea;
5919 ea.set_type(entity_addr_t::TYPE_LEGACY);
5920 ea.set_nonce(1000);
5921 ea.set_family(AF_INET);
5922 ea.set_in4_quad(0, 127);
5923 ea.set_in4_quad(1, 0);
5924 ea.set_in4_quad(2, 0);
5925 ea.set_in4_quad(3, 1);
5926 ea.set_port(1024);
5927 o.push_back(new watch_item_t(entity_name_t(entity_name_t::TYPE_CLIENT, 1), 10, 30, ea));
5928 ea.set_nonce(1001);
5929 ea.set_in4_quad(3, 2);
5930 ea.set_port(1025);
5931 o.push_back(new watch_item_t(entity_name_t(entity_name_t::TYPE_CLIENT, 2), 20, 60, ea));
5932 }
5933 };
5934 WRITE_CLASS_ENCODER_FEATURES(watch_item_t)
5935
5936 struct obj_watch_item_t {
5937 hobject_t obj;
5938 watch_item_t wi;
5939 };
5940
5941 /**
5942 * obj list watch response format
5943 *
5944 */
5945 struct obj_list_watch_response_t {
5946 std::list<watch_item_t> entries;
5947
5948 void encode(ceph::buffer::list& bl, uint64_t features) const {
5949 ENCODE_START(1, 1, bl);
5950 encode(entries, bl, features);
5951 ENCODE_FINISH(bl);
5952 }
5953 void decode(ceph::buffer::list::const_iterator& bl) {
5954 DECODE_START(1, bl);
5955 decode(entries, bl);
5956 DECODE_FINISH(bl);
5957 }
5958 void dump(ceph::Formatter *f) const {
5959 f->open_array_section("entries");
5960 for (std::list<watch_item_t>::const_iterator p = entries.begin(); p != entries.end(); ++p) {
5961 f->open_object_section("watch");
5962 p->dump(f);
5963 f->close_section();
5964 }
5965 f->close_section();
5966 }
5967 static void generate_test_instances(std::list<obj_list_watch_response_t*>& o) {
5968 entity_addr_t ea;
5969 o.push_back(new obj_list_watch_response_t);
5970 o.push_back(new obj_list_watch_response_t);
5971 std::list<watch_item_t*> test_watchers;
5972 watch_item_t::generate_test_instances(test_watchers);
5973 for (auto &e : test_watchers) {
5974 o.back()->entries.push_back(*e);
5975 delete e;
5976 }
5977 }
5978 };
5979 WRITE_CLASS_ENCODER_FEATURES(obj_list_watch_response_t)
5980
5981 struct clone_info {
5982 snapid_t cloneid;
5983 std::vector<snapid_t> snaps; // ascending
5984 std::vector< std::pair<uint64_t,uint64_t> > overlap;
5985 uint64_t size;
5986
5987 clone_info() : cloneid(CEPH_NOSNAP), size(0) {}
5988
5989 void encode(ceph::buffer::list& bl) const {
5990 ENCODE_START(1, 1, bl);
5991 encode(cloneid, bl);
5992 encode(snaps, bl);
5993 encode(overlap, bl);
5994 encode(size, bl);
5995 ENCODE_FINISH(bl);
5996 }
5997 void decode(ceph::buffer::list::const_iterator& bl) {
5998 DECODE_START(1, bl);
5999 decode(cloneid, bl);
6000 decode(snaps, bl);
6001 decode(overlap, bl);
6002 decode(size, bl);
6003 DECODE_FINISH(bl);
6004 }
6005 void dump(ceph::Formatter *f) const {
6006 if (cloneid == CEPH_NOSNAP)
6007 f->dump_string("cloneid", "HEAD");
6008 else
6009 f->dump_unsigned("cloneid", cloneid.val);
6010 f->open_array_section("snapshots");
6011 for (std::vector<snapid_t>::const_iterator p = snaps.begin(); p != snaps.end(); ++p) {
6012 f->open_object_section("snap");
6013 f->dump_unsigned("id", p->val);
6014 f->close_section();
6015 }
6016 f->close_section();
6017 f->open_array_section("overlaps");
6018 for (std::vector< std::pair<uint64_t,uint64_t> >::const_iterator q = overlap.begin();
6019 q != overlap.end(); ++q) {
6020 f->open_object_section("overlap");
6021 f->dump_unsigned("offset", q->first);
6022 f->dump_unsigned("length", q->second);
6023 f->close_section();
6024 }
6025 f->close_section();
6026 f->dump_unsigned("size", size);
6027 }
6028 static void generate_test_instances(std::list<clone_info*>& o) {
6029 o.push_back(new clone_info);
6030 o.push_back(new clone_info);
6031 o.back()->cloneid = 1;
6032 o.back()->snaps.push_back(1);
6033 o.back()->overlap.push_back(std::pair<uint64_t,uint64_t>(0,4096));
6034 o.back()->overlap.push_back(std::pair<uint64_t,uint64_t>(8192,4096));
6035 o.back()->size = 16384;
6036 o.push_back(new clone_info);
6037 o.back()->cloneid = CEPH_NOSNAP;
6038 o.back()->size = 32768;
6039 }
6040 };
6041 WRITE_CLASS_ENCODER(clone_info)
6042
6043 /**
6044 * obj list snaps response format
6045 *
6046 */
6047 struct obj_list_snap_response_t {
6048 std::vector<clone_info> clones; // ascending
6049 snapid_t seq;
6050
6051 void encode(ceph::buffer::list& bl) const {
6052 ENCODE_START(2, 1, bl);
6053 encode(clones, bl);
6054 encode(seq, bl);
6055 ENCODE_FINISH(bl);
6056 }
6057 void decode(ceph::buffer::list::const_iterator& bl) {
6058 DECODE_START(2, bl);
6059 decode(clones, bl);
6060 if (struct_v >= 2)
6061 decode(seq, bl);
6062 else
6063 seq = CEPH_NOSNAP;
6064 DECODE_FINISH(bl);
6065 }
6066 void dump(ceph::Formatter *f) const {
6067 f->open_array_section("clones");
6068 for (std::vector<clone_info>::const_iterator p = clones.begin(); p != clones.end(); ++p) {
6069 f->open_object_section("clone");
6070 p->dump(f);
6071 f->close_section();
6072 }
6073 f->dump_unsigned("seq", seq);
6074 f->close_section();
6075 }
6076 static void generate_test_instances(std::list<obj_list_snap_response_t*>& o) {
6077 o.push_back(new obj_list_snap_response_t);
6078 o.push_back(new obj_list_snap_response_t);
6079 clone_info cl;
6080 cl.cloneid = 1;
6081 cl.snaps.push_back(1);
6082 cl.overlap.push_back(std::pair<uint64_t,uint64_t>(0,4096));
6083 cl.overlap.push_back(std::pair<uint64_t,uint64_t>(8192,4096));
6084 cl.size = 16384;
6085 o.back()->clones.push_back(cl);
6086 cl.cloneid = CEPH_NOSNAP;
6087 cl.snaps.clear();
6088 cl.overlap.clear();
6089 cl.size = 32768;
6090 o.back()->clones.push_back(cl);
6091 o.back()->seq = 123;
6092 }
6093 };
6094
6095 WRITE_CLASS_ENCODER(obj_list_snap_response_t)
6096
6097 // PromoteCounter
6098
6099 struct PromoteCounter {
6100 std::atomic<unsigned long long> attempts{0};
6101 std::atomic<unsigned long long> objects{0};
6102 std::atomic<unsigned long long> bytes{0};
6103
6104 void attempt() {
6105 attempts++;
6106 }
6107
6108 void finish(uint64_t size) {
6109 objects++;
6110 bytes += size;
6111 }
6112
6113 void sample_and_attenuate(uint64_t *a, uint64_t *o, uint64_t *b) {
6114 *a = attempts;
6115 *o = objects;
6116 *b = bytes;
6117 attempts = *a / 2;
6118 objects = *o / 2;
6119 bytes = *b / 2;
6120 }
6121 };
6122
6123 struct pool_pg_num_history_t {
6124 /// last epoch updated
6125 epoch_t epoch = 0;
6126 /// poolid -> epoch -> pg_num
6127 std::map<int64_t, std::map<epoch_t,uint32_t>> pg_nums;
6128 /// pair(epoch, poolid)
6129 std::set<std::pair<epoch_t,int64_t>> deleted_pools;
6130
6131 void log_pg_num_change(epoch_t epoch, int64_t pool, uint32_t pg_num) {
6132 pg_nums[pool][epoch] = pg_num;
6133 }
6134 void log_pool_delete(epoch_t epoch, int64_t pool) {
6135 deleted_pools.insert(std::make_pair(epoch, pool));
6136 }
6137
6138 /// prune history based on oldest osdmap epoch in the cluster
6139 void prune(epoch_t oldest_epoch) {
6140 auto i = deleted_pools.begin();
6141 while (i != deleted_pools.end()) {
6142 if (i->first >= oldest_epoch) {
6143 break;
6144 }
6145 pg_nums.erase(i->second);
6146 i = deleted_pools.erase(i);
6147 }
6148 for (auto& j : pg_nums) {
6149 auto k = j.second.lower_bound(oldest_epoch);
6150 // keep this and the entry before it (just to be paranoid)
6151 if (k != j.second.begin()) {
6152 --k;
6153 j.second.erase(j.second.begin(), k);
6154 }
6155 }
6156 }
6157
6158 void encode(ceph::buffer::list& bl) const {
6159 ENCODE_START(1, 1, bl);
6160 encode(epoch, bl);
6161 encode(pg_nums, bl);
6162 encode(deleted_pools, bl);
6163 ENCODE_FINISH(bl);
6164 }
6165 void decode(ceph::buffer::list::const_iterator& p) {
6166 DECODE_START(1, p);
6167 decode(epoch, p);
6168 decode(pg_nums, p);
6169 decode(deleted_pools, p);
6170 DECODE_FINISH(p);
6171 }
6172 void dump(ceph::Formatter *f) const {
6173 f->dump_unsigned("epoch", epoch);
6174 f->open_object_section("pools");
6175 for (auto& i : pg_nums) {
6176 f->open_object_section("pool");
6177 f->dump_unsigned("pool_id", i.first);
6178 f->open_array_section("changes");
6179 for (auto& j : i.second) {
6180 f->open_object_section("change");
6181 f->dump_unsigned("epoch", j.first);
6182 f->dump_unsigned("pg_num", j.second);
6183 f->close_section();
6184 }
6185 f->close_section();
6186 f->close_section();
6187 }
6188 f->close_section();
6189 f->open_array_section("deleted_pools");
6190 for (auto& i : deleted_pools) {
6191 f->open_object_section("deletion");
6192 f->dump_unsigned("pool_id", i.second);
6193 f->dump_unsigned("epoch", i.first);
6194 f->close_section();
6195 }
6196 f->close_section();
6197 }
6198 static void generate_test_instances(std::list<pool_pg_num_history_t*>& ls) {
6199 ls.push_back(new pool_pg_num_history_t);
6200 }
6201 friend std::ostream& operator<<(std::ostream& out, const pool_pg_num_history_t& h) {
6202 return out << "pg_num_history(e" << h.epoch
6203 << " pg_nums " << h.pg_nums
6204 << " deleted_pools " << h.deleted_pools
6205 << ")";
6206 }
6207 };
6208 WRITE_CLASS_ENCODER(pool_pg_num_history_t)
6209
6210 // prefix pgmeta_oid keys with _ so that PGLog::read_log_and_missing() can
6211 // easily skip them
6212 static const string_view infover_key = "_infover"sv;
6213 static const string_view info_key = "_info"sv;
6214 static const string_view biginfo_key = "_biginfo"sv;
6215 static const string_view epoch_key = "_epoch"sv;
6216 static const string_view fastinfo_key = "_fastinfo"sv;
6217
6218 static const __u8 pg_latest_struct_v = 10;
6219 // v10 is the new past_intervals encoding
6220 // v9 was fastinfo_key addition
6221 // v8 was the move to a per-pg pgmeta object
6222 // v7 was SnapMapper addition in 86658392516d5175b2756659ef7ffaaf95b0f8ad
6223 // (first appeared in cuttlefish).
6224 static const __u8 pg_compat_struct_v = 10;
6225
6226 int prepare_info_keymap(
6227 CephContext* cct,
6228 map<string,bufferlist> *km,
6229 epoch_t epoch,
6230 pg_info_t &info,
6231 pg_info_t &last_written_info,
6232 PastIntervals &past_intervals,
6233 bool dirty_big_info,
6234 bool dirty_epoch,
6235 bool try_fast_info,
6236 PerfCounters *logger = nullptr,
6237 DoutPrefixProvider *dpp = nullptr);
6238
6239 namespace ceph::os {
6240 class Transaction;
6241 };
6242
6243 void create_pg_collection(
6244 ceph::os::Transaction& t, spg_t pgid, int bits);
6245
6246 void init_pg_ondisk(
6247 ceph::os::Transaction& t, spg_t pgid, const pg_pool_t *pool);
6248
6249 // omap specific stats
6250 struct omap_stat_t {
6251 int large_omap_objects;
6252 int64_t omap_bytes;
6253 int64_t omap_keys;
6254 };
6255
6256 // filter for pg listings
6257 class PGLSFilter {
6258 CephContext* cct;
6259 protected:
6260 std::string xattr;
6261 public:
6262 PGLSFilter();
6263 virtual ~PGLSFilter();
6264 virtual bool filter(const hobject_t &obj,
6265 const ceph::buffer::list& xattr_data) const = 0;
6266
6267 /**
6268 * Arguments passed from the RADOS client. Implementations must
6269 * handle any encoding errors, and return an appropriate error code,
6270 * or 0 on valid input.
6271 */
6272 virtual int init(ceph::buffer::list::const_iterator ¶ms) = 0;
6273
6274 /**
6275 * xattr key, or empty string. If non-empty, this xattr will be fetched
6276 * and the value passed into ::filter
6277 */
6278 virtual const std::string& get_xattr() const { return xattr; }
6279
6280 /**
6281 * If true, objects without the named xattr (if xattr name is not empty)
6282 * will be rejected without calling ::filter
6283 */
6284 virtual bool reject_empty_xattr() const { return true; }
6285 };
6286
6287 class PGLSPlainFilter : public PGLSFilter {
6288 std::string val;
6289 public:
6290 int init(ceph::bufferlist::const_iterator ¶ms) override;
6291 ~PGLSPlainFilter() override {}
6292 bool filter(const hobject_t& obj,
6293 const ceph::bufferlist& xattr_data) const override;
6294 };
6295
6296
6297 #endif
6298