1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16 #include "FSMap.h"
17
18 #include "common/StackStringStream.h"
19
20 #include <sstream>
21 #ifdef WITH_SEASTAR
22 #include "crimson/common/config_proxy.h"
23 #else
24 #include "common/config_proxy.h"
25 #endif
26 #include "global/global_context.h"
27 #include "mon/health_check.h"
28
29 using std::stringstream;
30
31 void Filesystem::dump(Formatter *f) const
32 {
33 f->open_object_section("mdsmap");
34 mds_map.dump(f);
35 f->close_section();
36 f->dump_int("id", fscid);
37 }
38
39 void FSMap::dump(Formatter *f) const
40 {
41 f->dump_int("epoch", epoch);
42 // Use 'default' naming to match 'set-default' CLI
43 f->dump_int("default_fscid", legacy_client_fscid);
44
45 f->open_object_section("compat");
46 compat.dump(f);
47 f->close_section();
48
49 f->open_object_section("feature_flags");
50 f->dump_bool("enable_multiple", enable_multiple);
51 f->dump_bool("ever_enabled_multiple", ever_enabled_multiple);
52 f->close_section();
53
54 f->open_array_section("standbys");
55 for (const auto &i : standby_daemons) {
56 f->open_object_section("info");
57 i.second.dump(f);
58 f->dump_int("epoch", standby_epochs.at(i.first));
59 f->close_section();
60 }
61 f->close_section();
62
63 f->open_array_section("filesystems");
64 for (const auto &fs : filesystems) {
65 f->open_object_section("filesystem");
66 fs.second->dump(f);
67 f->close_section();
68 }
69 f->close_section();
70 }
71
72 FSMap &FSMap::operator=(const FSMap &rhs)
73 {
74 epoch = rhs.epoch;
75 next_filesystem_id = rhs.next_filesystem_id;
76 legacy_client_fscid = rhs.legacy_client_fscid;
77 compat = rhs.compat;
78 enable_multiple = rhs.enable_multiple;
79 mds_roles = rhs.mds_roles;
80 standby_daemons = rhs.standby_daemons;
81 standby_epochs = rhs.standby_epochs;
82
83 filesystems.clear();
84 for (const auto &i : rhs.filesystems) {
85 const auto &fs = i.second;
86 filesystems[fs->fscid] = std::make_shared<Filesystem>(*fs);
87 }
88
89 return *this;
90 }
91
92 void FSMap::generate_test_instances(std::list<FSMap*>& ls)
93 {
94 FSMap *m = new FSMap();
95
96 std::list<MDSMap*> mds_map_instances;
97 MDSMap::generate_test_instances(mds_map_instances);
98
99 int k = 20;
100 for (auto i : mds_map_instances) {
101 auto fs = Filesystem::create();
102 fs->fscid = k++;
103 fs->mds_map = *i;
104 delete i;
105 m->filesystems[fs->fscid] = fs;
106 }
107 mds_map_instances.clear();
108
109 ls.push_back(m);
110 }
111
112 void FSMap::print(ostream& out) const
113 {
114 out << "e" << epoch << std::endl;
115 out << "enable_multiple, ever_enabled_multiple: " << enable_multiple << ","
116 << ever_enabled_multiple << std::endl;
117 out << "compat: " << compat << std::endl;
118 out << "legacy client fscid: " << legacy_client_fscid << std::endl;
119 out << " " << std::endl;
120
121 if (filesystems.empty()) {
122 out << "No filesystems configured" << std::endl;
123 }
124
125 for (const auto& p : filesystems) {
126 p.second->print(out);
127 out << " " << std::endl << " " << std::endl; // Space out a bit
128 }
129
130 if (!standby_daemons.empty()) {
131 out << "Standby daemons:" << std::endl << " " << std::endl;
132 }
133
134 for (const auto &p : standby_daemons) {
135 p.second.print_summary(out);
136 out << std::endl;
137 }
138 }
139
140 void FSMap::print_summary(Formatter *f, ostream *out) const
141 {
142 if (f) {
143 f->dump_unsigned("epoch", get_epoch());
144 for (const auto &p : filesystems) {
145 auto& fs = p.second;
146 f->dump_unsigned("id", fs->fscid);
147 f->dump_unsigned("up", fs->mds_map.up.size());
148 f->dump_unsigned("in", fs->mds_map.in.size());
149 f->dump_unsigned("max", fs->mds_map.max_mds);
150 }
151 } else {
152 auto count = filesystems.size();
153 if (count <= 3) {
154 bool first = true;
155 for (const auto& p : filesystems) {
156 const auto& fs = p.second;
157 if (!first) {
158 *out << " ";
159 }
160 if (fs->mds_map.is_degraded()) {
161 *out << fs->mds_map.fs_name << ":" << fs->mds_map.up.size() << "/" << fs->mds_map.in.size();
162 } else {
163 *out << fs->mds_map.fs_name << ":" << fs->mds_map.in.size();
164 }
165 first = false;
166 }
167 } else {
168 *out << count << " fs";
169 unsigned degraded = 0;
170 CachedStackStringStream css;
171 *css << " (degraded: ";
172 for (const auto& p : filesystems) {
173 const auto& fs = p.second;
174 if (fs->mds_map.is_degraded()) {
175 degraded++;
176 if (degraded <= 3) {
177 *css << fs->mds_map.fs_name << ":" << fs->mds_map.up.size() << "/" << fs->mds_map.in.size();
178 }
179 }
180 }
181 if (degraded > 0) {
182 if (degraded <= 3) {
183 *css << ")";
184 *out << css->strv();
185 } else {
186 *out << " (degraded: " << degraded << " fs)";
187 }
188 }
189 }
190 }
191
192 if (f) {
193 f->open_array_section("by_rank");
194 }
195
196 std::map<MDSMap::DaemonState,unsigned> by_state;
197 std::map<mds_role_t, std::pair<MDSMap::DaemonState, std::string>> by_rank;
198 by_state[MDSMap::DaemonState::STATE_STANDBY] = standby_daemons.size();
199 for (const auto& [gid, fscid] : mds_roles) {
200 if (fscid == FS_CLUSTER_ID_NONE)
201 continue;
202
203 const auto& info = filesystems.at(fscid)->mds_map.get_info_gid(gid);
204 auto s = std::string(ceph_mds_state_name(info.state));
205 if (info.laggy()) {
206 s += "(laggy or crashed)";
207 }
208
209 if (f) {
210 f->open_object_section("mds");
211 f->dump_unsigned("filesystem_id", fscid);
212 f->dump_unsigned("rank", info.rank);
213 f->dump_string("name", info.name);
214 f->dump_string("status", s);
215 f->dump_unsigned("gid", gid);
216 f->close_section();
217 } else if (info.state != MDSMap::DaemonState::STATE_STANDBY_REPLAY) {
218 by_rank[mds_role_t(fscid, info.rank)] = std::make_pair(info.state, info.name + "=" + s);
219 }
220 by_state[info.state]++;
221 }
222
223 if (f) {
224 f->close_section();
225 } else {
226 if (0 < by_rank.size() && by_rank.size() < 5) {
227 if (filesystems.size() > 1) {
228 // Disambiguate filesystems
229 std::map<std::string, std::string> pretty;
230 for (const auto& [role,status] : by_rank) {
231 const auto &fs_name = filesystems.at(role.fscid)->mds_map.fs_name;
232 CachedStackStringStream css;
233 *css << fs_name << ":" << role.rank;
234 pretty.emplace(std::piecewise_construct, std::forward_as_tuple(css->strv()), std::forward_as_tuple(status.second));
235 --by_state[status.first]; /* already printed! */
236 }
237 *out << " " << pretty;
238 } else {
239 // Omit FSCID in output when only one filesystem exists
240 std::map<mds_rank_t, std::string> shortened;
241 for (const auto& [role,status] : by_rank) {
242 shortened[role.rank] = status.second;
243 --by_state[status.first]; /* already printed! */
244 }
245 *out << " " << shortened;
246 }
247 }
248 for (const auto& [state, count] : by_state) {
249 if (count > 0) {
250 auto s = std::string_view(ceph_mds_state_name(state));
251 *out << " " << count << " " << s;
252 }
253 }
254 }
255
256 if (f) {
257 const auto state = MDSMap::DaemonState::STATE_STANDBY;
258 auto&& name = ceph_mds_state_name(state);
259 auto count = standby_daemons.size();
260 f->dump_unsigned(name, count);
261 }
262
263 size_t failed = 0;
264 size_t damaged = 0;
265 for (const auto& p : filesystems) {
266 auto& fs = p.second;
267 failed += fs->mds_map.failed.size();
268 damaged += fs->mds_map.damaged.size();
269 }
270
271 if (failed > 0) {
272 if (f) {
273 f->dump_unsigned("failed", failed);
274 } else {
275 *out << ", " << failed << " failed";
276 }
277 }
278
279 if (damaged > 0) {
280 if (f) {
281 f->dump_unsigned("damaged", damaged);
282 } else {
283 *out << ", " << damaged << " damaged";
284 }
285 }
286 //if (stopped.size())
287 //out << ", " << stopped.size() << " stopped";
288 }
289
290 mds_gid_t Filesystem::get_standby_replay(mds_gid_t who) const
291 {
292 for (const auto &i : mds_map.mds_info) {
293 const auto &info = i.second;
294 if (info.state == MDSMap::STATE_STANDBY_REPLAY
295 && info.rank == mds_map.mds_info.at(who).rank) {
296 return info.global_id;
297 }
298 }
299 return MDS_GID_NONE;
300 }
301
302 Filesystem::ref FSMap::create_filesystem(std::string_view name,
303 int64_t metadata_pool, int64_t data_pool, uint64_t features)
304 {
305 auto fs = Filesystem::create();
306 fs->mds_map.epoch = epoch;
307 fs->mds_map.fs_name = name;
308 fs->mds_map.data_pools.push_back(data_pool);
309 fs->mds_map.metadata_pool = metadata_pool;
310 fs->mds_map.cas_pool = -1;
311 fs->mds_map.compat = compat;
312 fs->mds_map.created = ceph_clock_now();
313 fs->mds_map.modified = ceph_clock_now();
314 fs->mds_map.enabled = true;
315 fs->fscid = next_filesystem_id++;
316 // ANONYMOUS is only for upgrades from legacy mdsmaps, we should
317 // have initialized next_filesystem_id such that it's never used here.
318 ceph_assert(fs->fscid != FS_CLUSTER_ID_ANONYMOUS);
319 filesystems[fs->fscid] = fs;
320
321 // Created first filesystem? Set it as the one
322 // for legacy clients to use
323 if (filesystems.size() == 1) {
324 legacy_client_fscid = fs->fscid;
325 }
326
327 return fs;
328 }
329
330 Filesystem::const_ref FSMap::get_filesystem(std::string_view name) const
331 {
332 for (const auto& p : filesystems) {
333 if (p.second->mds_map.fs_name == name) {
334 return p.second;
335 }
336 }
337 return nullptr;
338 }
339
340 std::vector<Filesystem::const_ref> FSMap::get_filesystems(void) const
341 {
342 std::vector<Filesystem::const_ref> ret;
343 for (const auto& p : filesystems) {
344 ret.push_back(p.second);
345 }
346 return ret;
347 }
348
349 void FSMap::reset_filesystem(fs_cluster_id_t fscid)
350 {
351 auto fs = get_filesystem(fscid);
352 auto new_fs = Filesystem::create();
353
354 // Populate rank 0 as existing (so don't go into CREATING)
355 // but failed (so that next available MDS is assigned the rank)
356 new_fs->mds_map.in.insert(mds_rank_t(0));
357 new_fs->mds_map.failed.insert(mds_rank_t(0));
358
359 // Carry forward what makes sense
360 new_fs->fscid = fs->fscid;
361 new_fs->mds_map.inline_data_enabled = fs->mds_map.inline_data_enabled;
362 new_fs->mds_map.data_pools = fs->mds_map.data_pools;
363 new_fs->mds_map.metadata_pool = fs->mds_map.metadata_pool;
364 new_fs->mds_map.cas_pool = fs->mds_map.cas_pool;
365 new_fs->mds_map.fs_name = fs->mds_map.fs_name;
366 new_fs->mds_map.compat = compat;
367 new_fs->mds_map.created = ceph_clock_now();
368 new_fs->mds_map.modified = ceph_clock_now();
369 new_fs->mds_map.standby_count_wanted = fs->mds_map.standby_count_wanted;
370 new_fs->mds_map.enabled = true;
371
372 // Remember mds ranks that have ever started. (They should load old inotable
373 // instead of creating new one if they start again.)
374 new_fs->mds_map.stopped.insert(fs->mds_map.in.begin(), fs->mds_map.in.end());
375 new_fs->mds_map.stopped.insert(fs->mds_map.stopped.begin(), fs->mds_map.stopped.end());
376 new_fs->mds_map.stopped.erase(mds_rank_t(0));
377
378 // Persist the new FSMap
379 filesystems[new_fs->fscid] = new_fs;
380 }
381
382 void FSMap::get_health(list<pair<health_status_t,string> >& summary,
383 list<pair<health_status_t,string> > *detail) const
384 {
385 mds_rank_t standby_count_wanted = 0;
386 for (const auto &i : filesystems) {
387 const auto &fs = i.second;
388
389 // TODO: move get_health up into here so that we can qualify
390 // all the messages with what filesystem they're talking about
391 fs->mds_map.get_health(summary, detail);
392
393 standby_count_wanted = std::max(standby_count_wanted, fs->mds_map.get_standby_count_wanted((mds_rank_t)standby_daemons.size()));
394 }
395
396 if (standby_count_wanted) {
397 std::ostringstream oss;
398 oss << "insufficient standby daemons available: have " << standby_daemons.size() << "; want " << standby_count_wanted << " more";
399 summary.push_back(make_pair(HEALTH_WARN, oss.str()));
400 }
401 }
402
403 bool FSMap::check_health(void)
404 {
405 bool changed = false;
406 for (auto &i : filesystems) {
407 changed |= i.second->mds_map.check_health((mds_rank_t)standby_daemons.size());
408 }
409 return changed;
410 }
411
412 void FSMap::get_health_checks(health_check_map_t *checks) const
413 {
414 mds_rank_t standby_count_wanted = 0;
415 for (const auto &i : filesystems) {
416 const auto &fs = i.second;
417 health_check_map_t fschecks;
418
419 fs->mds_map.get_health_checks(&fschecks);
420
421 // Some of the failed ranks might be transient (i.e. there are standbys
422 // ready to replace them). We will report only on "stuck" failed, i.e.
423 // ranks which are failed and have no standby replacement available.
424 std::set<mds_rank_t> stuck_failed;
425
426 for (const auto &rank : fs->mds_map.failed) {
427 auto&& replacement = find_replacement_for({fs->fscid, rank}, {});
428 if (replacement == MDS_GID_NONE) {
429 stuck_failed.insert(rank);
430 }
431 }
432
433 // FS_WITH_FAILED_MDS
434 if (!stuck_failed.empty()) {
435 health_check_t& fscheck = checks->get_or_add(
436 "FS_WITH_FAILED_MDS", HEALTH_WARN,
437 "%num% filesystem%plurals% %hasorhave% a failed mds daemon", 1);
438 ostringstream ss;
439 ss << "fs " << fs->mds_map.fs_name << " has " << stuck_failed.size()
440 << " failed mds" << (stuck_failed.size() > 1 ? "s" : "");
441 fscheck.detail.push_back(ss.str()); }
442
443 checks->merge(fschecks);
444 standby_count_wanted = std::max(
445 standby_count_wanted,
446 fs->mds_map.get_standby_count_wanted((mds_rank_t)standby_daemons.size()));
447 }
448
449 // MDS_INSUFFICIENT_STANDBY
450 if (standby_count_wanted) {
451 std::ostringstream oss, dss;
452 oss << "insufficient standby MDS daemons available";
453 auto& d = checks->get_or_add("MDS_INSUFFICIENT_STANDBY", HEALTH_WARN, oss.str(), 1);
454 dss << "have " << standby_daemons.size() << "; want " << standby_count_wanted
455 << " more";
456 d.detail.push_back(dss.str());
457 }
458 }
459
460 void FSMap::update_compat(const CompatSet &c)
461 {
462 // We could do something more complicated here to enable
463 // different filesystems to be served by different MDS versions,
464 // but this is a lot simpler because it doesn't require us to
465 // track the compat versions for standby daemons.
466 compat = c;
467 for (const auto &i : filesystems) {
468 MDSMap &mds_map = i.second->mds_map;
469 mds_map.compat = c;
470 mds_map.epoch = epoch;
471 }
472 }
473
474 void FSMap::encode(bufferlist& bl, uint64_t features) const
475 {
476 ENCODE_START(7, 6, bl);
477 encode(epoch, bl);
478 encode(next_filesystem_id, bl);
479 encode(legacy_client_fscid, bl);
480 encode(compat, bl);
481 encode(enable_multiple, bl);
482 {
483 std::vector<Filesystem::ref> v;
484 v.reserve(filesystems.size());
485 for (auto& p : filesystems) v.emplace_back(p.second);
486 encode(v, bl, features);
487 }
488 encode(mds_roles, bl);
489 encode(standby_daemons, bl, features);
490 encode(standby_epochs, bl);
491 encode(ever_enabled_multiple, bl);
492 ENCODE_FINISH(bl);
493 }
494
495 void FSMap::decode(bufferlist::const_iterator& p)
496 {
497 // The highest MDSMap encoding version before we changed the
498 // MDSMonitor to store an FSMap instead of an MDSMap was
499 // 5, so anything older than 6 is decoded as an MDSMap,
500 // and anything newer is decoded as an FSMap.
501 DECODE_START_LEGACY_COMPAT_LEN_16(7, 4, 4, p);
502 if (struct_v < 6) {
503 // Because the mon used to store an MDSMap where we now
504 // store an FSMap, FSMap knows how to decode the legacy
505 // MDSMap format (it never needs to encode it though).
506 MDSMap legacy_mds_map;
507
508 // Decoding an MDSMap (upgrade)
509 decode(epoch, p);
510 decode(legacy_mds_map.flags, p);
511 decode(legacy_mds_map.last_failure, p);
512 decode(legacy_mds_map.root, p);
513 decode(legacy_mds_map.session_timeout, p);
514 decode(legacy_mds_map.session_autoclose, p);
515 decode(legacy_mds_map.max_file_size, p);
516 decode(legacy_mds_map.max_mds, p);
517 decode(legacy_mds_map.mds_info, p);
518 if (struct_v < 3) {
519 __u32 n;
520 decode(n, p);
521 while (n--) {
522 __u32 m;
523 decode(m, p);
524 legacy_mds_map.data_pools.push_back(m);
525 }
526 __s32 s;
527 decode(s, p);
528 legacy_mds_map.cas_pool = s;
529 } else {
530 decode(legacy_mds_map.data_pools, p);
531 decode(legacy_mds_map.cas_pool, p);
532 }
533
534 // kclient ignores everything from here
535 __u16 ev = 1;
536 if (struct_v >= 2)
537 decode(ev, p);
538 if (ev >= 3)
539 decode(legacy_mds_map.compat, p);
540 else
541 legacy_mds_map.compat = MDSMap::get_compat_set_base();
542 if (ev < 5) {
543 __u32 n;
544 decode(n, p);
545 legacy_mds_map.metadata_pool = n;
546 } else {
547 decode(legacy_mds_map.metadata_pool, p);
548 }
549 decode(legacy_mds_map.created, p);
550 decode(legacy_mds_map.modified, p);
551 decode(legacy_mds_map.tableserver, p);
552 decode(legacy_mds_map.in, p);
553 std::map<mds_rank_t,int32_t> inc; // Legacy field, parse and drop
554 decode(inc, p);
555 decode(legacy_mds_map.up, p);
556 decode(legacy_mds_map.failed, p);
557 decode(legacy_mds_map.stopped, p);
558 if (ev >= 4)
559 decode(legacy_mds_map.last_failure_osd_epoch, p);
560 if (ev >= 6) {
561 if (ev < 10) {
562 // previously this was a bool about snaps, not a flag map
563 bool flag;
564 decode(flag, p);
565 legacy_mds_map.ever_allowed_features = flag ?
566 CEPH_MDSMAP_ALLOW_SNAPS : 0;
567 decode(flag, p);
568 legacy_mds_map.explicitly_allowed_features = flag ?
569 CEPH_MDSMAP_ALLOW_SNAPS : 0;
570 } else {
571 decode(legacy_mds_map.ever_allowed_features, p);
572 decode(legacy_mds_map.explicitly_allowed_features, p);
573 }
574 } else {
575 legacy_mds_map.ever_allowed_features = 0;
576 legacy_mds_map.explicitly_allowed_features = 0;
577 }
578 if (ev >= 7)
579 decode(legacy_mds_map.inline_data_enabled, p);
580
581 if (ev >= 8) {
582 ceph_assert(struct_v >= 5);
583 decode(legacy_mds_map.enabled, p);
584 decode(legacy_mds_map.fs_name, p);
585 } else {
586 legacy_mds_map.fs_name = "default";
587 if (epoch > 1) {
588 // If an MDS has ever been started, epoch will be greater than 1,
589 // assume filesystem is enabled.
590 legacy_mds_map.enabled = true;
591 } else {
592 // Upgrading from a cluster that never used an MDS, switch off
593 // filesystem until it's explicitly enabled.
594 legacy_mds_map.enabled = false;
595 }
596 }
597
598 if (ev >= 9) {
599 decode(legacy_mds_map.damaged, p);
600 }
601
602 // We're upgrading, populate filesystems from the legacy fields
603 filesystems.clear();
604 standby_daemons.clear();
605 standby_epochs.clear();
606 mds_roles.clear();
607 compat = legacy_mds_map.compat;
608 enable_multiple = false;
609
610 // Synthesise a Filesystem from legacy_mds_map, if enabled
611 if (legacy_mds_map.enabled) {
612 // Construct a Filesystem from the legacy MDSMap
613 auto migrate_fs = Filesystem::create();
614 migrate_fs->fscid = FS_CLUSTER_ID_ANONYMOUS;
615 migrate_fs->mds_map = legacy_mds_map;
616 migrate_fs->mds_map.epoch = epoch;
617 filesystems[migrate_fs->fscid] = migrate_fs;
618
619 // List of GIDs that had invalid states
620 std::set<mds_gid_t> drop_gids;
621
622 // Construct mds_roles, standby_daemons, and remove
623 // standbys from the MDSMap in the Filesystem.
624 for (const auto& [gid, info] : migrate_fs->mds_map.mds_info) {
625 if (info.state == MDSMap::STATE_STANDBY_REPLAY) {
626 /* drop any legacy standby-replay daemons */
627 drop_gids.insert(gid);
628 } else if (info.rank == MDS_RANK_NONE) {
629 if (info.state != MDSMap::STATE_STANDBY) {
630 // Old MDSMaps can have down:dne here, which
631 // is invalid in an FSMap (#17837)
632 drop_gids.insert(gid);
633 } else {
634 insert(info); // into standby_daemons
635 }
636 } else {
637 mds_roles[gid] = migrate_fs->fscid;
638 }
639 }
(1) Event parameter_hidden: |
declaration hides parameter "p" (declared at line 495) |
(2) Event caretline: |
^ |
640 for (const auto &p : standby_daemons) {
641 // Erase from this Filesystem's MDSMap, because it has
642 // been copied into FSMap::Standby_daemons above
643 migrate_fs->mds_map.mds_info.erase(p.first);
644 }
645 for (const auto &gid : drop_gids) {
646 // Throw away all info for this MDS because it was identified
647 // as having invalid state above.
648 migrate_fs->mds_map.mds_info.erase(gid);
649 }
650
651 legacy_client_fscid = migrate_fs->fscid;
652 } else {
653 legacy_client_fscid = FS_CLUSTER_ID_NONE;
654 }
655 } else {
656 decode(epoch, p);
657 decode(next_filesystem_id, p);
658 decode(legacy_client_fscid, p);
659 decode(compat, p);
660 decode(enable_multiple, p);
661 {
662 std::vector<Filesystem::ref> v;
663 decode(v, p);
664 filesystems.clear();
665 for (auto& ref : v) {
666 auto em = filesystems.emplace(std::piecewise_construct, std::forward_as_tuple(ref->fscid), std::forward_as_tuple(std::move(ref)));
667 ceph_assert(em.second);
668 }
669 }
670 decode(mds_roles, p);
671 decode(standby_daemons, p);
672 decode(standby_epochs, p);
673 if (struct_v >= 7) {
674 decode(ever_enabled_multiple, p);
675 }
676 }
677
678 DECODE_FINISH(p);
679 }
680
681 void FSMap::sanitize(const std::function<bool(int64_t pool)>& pool_exists)
682 {
683 for (auto &fs : filesystems) {
684 fs.second->mds_map.sanitize(pool_exists);
685 }
686 }
687
688 void Filesystem::encode(bufferlist& bl, uint64_t features) const
689 {
690 ENCODE_START(1, 1, bl);
691 encode(fscid, bl);
692 bufferlist mdsmap_bl;
693 mds_map.encode(mdsmap_bl, features);
694 encode(mdsmap_bl, bl);
695 ENCODE_FINISH(bl);
696 }
697
698 void Filesystem::decode(bufferlist::const_iterator& p)
699 {
700 DECODE_START(1, p);
701 decode(fscid, p);
702 bufferlist mdsmap_bl;
703 decode(mdsmap_bl, p);
704 auto mdsmap_bl_iter = mdsmap_bl.cbegin();
705 mds_map.decode(mdsmap_bl_iter);
706 DECODE_FINISH(p);
707 }
708
709 int FSMap::parse_filesystem(
710 std::string_view ns_str,
711 Filesystem::const_ref* result
712 ) const
713 {
714 std::string ns_err;
715 std::string s(ns_str);
716 fs_cluster_id_t fscid = strict_strtol(s.c_str(), 10, &ns_err);
717 if (!ns_err.empty() || filesystems.count(fscid) == 0) {
718 for (auto &fs : filesystems) {
719 if (fs.second->mds_map.fs_name == s) {
720 *result = std::const_pointer_cast<const Filesystem>(fs.second);
721 return 0;
722 }
723 }
724 return -ENOENT;
725 } else {
726 *result = get_filesystem(fscid);
727 return 0;
728 }
729 }
730
731 void Filesystem::print(std::ostream &out) const
732 {
733 out << "Filesystem '" << mds_map.fs_name
734 << "' (" << fscid << ")" << std::endl;
735 mds_map.print(out);
736 }
737
738 bool FSMap::is_any_degraded() const
739 {
740 for (auto& i : filesystems) {
741 if (i.second->mds_map.is_degraded()) {
742 return true;
743 }
744 }
745 return false;
746 }
747
748 std::map<mds_gid_t, MDSMap::mds_info_t> FSMap::get_mds_info() const
749 {
750 std::map<mds_gid_t, MDSMap::mds_info_t> result;
751 for (const auto &i : standby_daemons) {
752 result[i.first] = i.second;
753 }
754
755 for (const auto &i : filesystems) {
756 const auto &fs_info = i.second->mds_map.get_mds_info();
757 for (const auto &j : fs_info) {
758 result[j.first] = j.second;
759 }
760 }
761
762 return result;
763 }
764
765 mds_gid_t FSMap::get_available_standby() const
766 {
767 for (const auto& [gid, info] : standby_daemons) {
768 ceph_assert(info.rank == MDS_RANK_NONE);
769 ceph_assert(info.state == MDSMap::STATE_STANDBY);
770
771 if (info.laggy() || info.is_frozen()) {
772 continue;
773 }
774
775 return gid;
776 }
777 return MDS_GID_NONE;
778 }
779
780 mds_gid_t FSMap::find_mds_gid_by_name(std::string_view s) const
781 {
782 const auto info = get_mds_info();
783 for (const auto &p : info) {
784 if (p.second.name == s) {
785 return p.first;
786 }
787 }
788 return MDS_GID_NONE;
789 }
790
791 const MDSMap::mds_info_t* FSMap::find_by_name(std::string_view name) const
792 {
793 std::map<mds_gid_t, MDSMap::mds_info_t> result;
794 for (const auto &i : standby_daemons) {
795 if (i.second.name == name) {
796 return &(i.second);
797 }
798 }
799
800 for (const auto &i : filesystems) {
801 const auto &fs_info = i.second->mds_map.get_mds_info();
802 for (const auto &j : fs_info) {
803 if (j.second.name == name) {
804 return &(j.second);
805 }
806 }
807 }
808
809 return nullptr;
810 }
811
812 mds_gid_t FSMap::find_replacement_for(mds_role_t role, std::string_view name) const
813 {
814 auto&& fs = get_filesystem(role.fscid);
815
816 // First see if we have a STANDBY_REPLAY
817 for (const auto& [gid, info] : fs->mds_map.mds_info) {
818 if (info.rank == role.rank && info.state == MDSMap::STATE_STANDBY_REPLAY) {
819 if (info.is_frozen()) {
820 /* the standby-replay is frozen, do nothing! */
821 return MDS_GID_NONE;
822 } else {
823 return gid;
824 }
825 }
826 }
827
828 return get_available_standby();
829 }
830
831 void FSMap::sanity() const
832 {
833 if (legacy_client_fscid != FS_CLUSTER_ID_NONE) {
834 ceph_assert(filesystems.count(legacy_client_fscid) == 1);
835 }
836
837 for (const auto &i : filesystems) {
838 auto fs = i.second;
839 ceph_assert(fs->mds_map.compat.compare(compat) == 0);
840 ceph_assert(fs->fscid == i.first);
841 for (const auto &j : fs->mds_map.mds_info) {
842 ceph_assert(j.second.rank != MDS_RANK_NONE);
843 ceph_assert(mds_roles.count(j.first) == 1);
844 ceph_assert(standby_daemons.count(j.first) == 0);
845 ceph_assert(standby_epochs.count(j.first) == 0);
846 ceph_assert(mds_roles.at(j.first) == i.first);
847 if (j.second.state != MDSMap::STATE_STANDBY_REPLAY) {
848 ceph_assert(fs->mds_map.up.at(j.second.rank) == j.first);
849 ceph_assert(fs->mds_map.failed.count(j.second.rank) == 0);
850 ceph_assert(fs->mds_map.damaged.count(j.second.rank) == 0);
851 }
852 }
853
854 for (const auto &j : fs->mds_map.up) {
855 mds_rank_t rank = j.first;
856 ceph_assert(fs->mds_map.in.count(rank) == 1);
857 mds_gid_t gid = j.second;
858 ceph_assert(fs->mds_map.mds_info.count(gid) == 1);
859 }
860 }
861
862 for (const auto &i : standby_daemons) {
863 ceph_assert(i.second.state == MDSMap::STATE_STANDBY);
864 ceph_assert(i.second.rank == MDS_RANK_NONE);
865 ceph_assert(i.second.global_id == i.first);
866 ceph_assert(standby_epochs.count(i.first) == 1);
867 ceph_assert(mds_roles.count(i.first) == 1);
868 ceph_assert(mds_roles.at(i.first) == FS_CLUSTER_ID_NONE);
869 }
870
871 for (const auto &i : standby_epochs) {
872 ceph_assert(standby_daemons.count(i.first) == 1);
873 }
874
875 for (const auto &i : mds_roles) {
876 if (i.second == FS_CLUSTER_ID_NONE) {
877 ceph_assert(standby_daemons.count(i.first) == 1);
878 } else {
879 ceph_assert(filesystems.count(i.second) == 1);
880 ceph_assert(filesystems.at(i.second)->mds_map.mds_info.count(i.first) == 1);
881 }
882 }
883 }
884
885 void FSMap::promote(
886 mds_gid_t standby_gid,
887 Filesystem& filesystem,
888 mds_rank_t assigned_rank)
889 {
890 ceph_assert(gid_exists(standby_gid));
891 bool is_standby_replay = mds_roles.at(standby_gid) != FS_CLUSTER_ID_NONE;
892 if (!is_standby_replay) {
893 ceph_assert(standby_daemons.count(standby_gid));
894 ceph_assert(standby_daemons.at(standby_gid).state == MDSMap::STATE_STANDBY);
895 }
896
897 MDSMap &mds_map = filesystem.mds_map;
898
899 // Insert daemon state to Filesystem
900 if (!is_standby_replay) {
901 mds_map.mds_info[standby_gid] = standby_daemons.at(standby_gid);
902 } else {
903 ceph_assert(mds_map.mds_info.count(standby_gid));
904 ceph_assert(mds_map.mds_info.at(standby_gid).state == MDSMap::STATE_STANDBY_REPLAY);
905 ceph_assert(mds_map.mds_info.at(standby_gid).rank == assigned_rank);
906 }
907 MDSMap::mds_info_t &info = mds_map.mds_info[standby_gid];
908
909 if (mds_map.stopped.erase(assigned_rank)) {
910 // The cluster is being expanded with a stopped rank
911 info.state = MDSMap::STATE_STARTING;
912 } else if (!mds_map.is_in(assigned_rank)) {
913 // The cluster is being expanded with a new rank
914 info.state = MDSMap::STATE_CREATING;
915 } else {
916 // An existing rank is being assigned to a replacement
917 info.state = MDSMap::STATE_REPLAY;
918 mds_map.failed.erase(assigned_rank);
919 }
920 info.rank = assigned_rank;
921 info.inc = epoch;
922 mds_roles[standby_gid] = filesystem.fscid;
923
924 // Update the rank state in Filesystem
925 mds_map.in.insert(assigned_rank);
926 mds_map.up[assigned_rank] = standby_gid;
927
928 // Remove from the list of standbys
929 if (!is_standby_replay) {
930 standby_daemons.erase(standby_gid);
931 standby_epochs.erase(standby_gid);
932 }
933
934 // Indicate that Filesystem has been modified
935 mds_map.epoch = epoch;
936 }
937
938 void FSMap::assign_standby_replay(
939 const mds_gid_t standby_gid,
940 const fs_cluster_id_t leader_ns,
941 const mds_rank_t leader_rank)
942 {
943 ceph_assert(mds_roles.at(standby_gid) == FS_CLUSTER_ID_NONE);
944 ceph_assert(gid_exists(standby_gid));
945 ceph_assert(!gid_has_rank(standby_gid));
946 ceph_assert(standby_daemons.count(standby_gid));
947
948 // Insert to the filesystem
949 auto fs = filesystems.at(leader_ns);
950 fs->mds_map.mds_info[standby_gid] = standby_daemons.at(standby_gid);
951 fs->mds_map.mds_info[standby_gid].rank = leader_rank;
952 fs->mds_map.mds_info[standby_gid].state = MDSMap::STATE_STANDBY_REPLAY;
953 mds_roles[standby_gid] = leader_ns;
954
955 // Remove from the list of standbys
956 standby_daemons.erase(standby_gid);
957 standby_epochs.erase(standby_gid);
958
959 // Indicate that Filesystem has been modified
960 fs->mds_map.epoch = epoch;
961 }
962
963 void FSMap::erase(mds_gid_t who, epoch_t blacklist_epoch)
964 {
965 if (mds_roles.at(who) == FS_CLUSTER_ID_NONE) {
966 standby_daemons.erase(who);
967 standby_epochs.erase(who);
968 } else {
969 auto &fs = filesystems.at(mds_roles.at(who));
970 const auto &info = fs->mds_map.mds_info.at(who);
971 if (info.state != MDSMap::STATE_STANDBY_REPLAY) {
972 if (info.state == MDSMap::STATE_CREATING) {
973 // If this gid didn't make it past CREATING, then forget
974 // the rank ever existed so that next time it's handed out
975 // to a gid it'll go back into CREATING.
976 fs->mds_map.in.erase(info.rank);
977 } else {
978 // Put this rank into the failed list so that the next available
979 // STANDBY will pick it up.
980 fs->mds_map.failed.insert(info.rank);
981 }
982 ceph_assert(fs->mds_map.up.at(info.rank) == info.global_id);
983 fs->mds_map.up.erase(info.rank);
984 }
985 fs->mds_map.mds_info.erase(who);
986 fs->mds_map.last_failure_osd_epoch = blacklist_epoch;
987 fs->mds_map.epoch = epoch;
988 }
989
990 mds_roles.erase(who);
991 }
992
993 void FSMap::damaged(mds_gid_t who, epoch_t blacklist_epoch)
994 {
995 ceph_assert(mds_roles.at(who) != FS_CLUSTER_ID_NONE);
996 auto fs = filesystems.at(mds_roles.at(who));
997 mds_rank_t rank = fs->mds_map.mds_info[who].rank;
998
999 erase(who, blacklist_epoch);
1000 fs->mds_map.failed.erase(rank);
1001 fs->mds_map.damaged.insert(rank);
1002
1003 ceph_assert(fs->mds_map.epoch == epoch);
1004 }
1005
1006 /**
1007 * Update to indicate that the rank `rank` is to be removed
1008 * from the damaged list of the filesystem `fscid`
1009 */
1010 bool FSMap::undamaged(const fs_cluster_id_t fscid, const mds_rank_t rank)
1011 {
1012 auto fs = filesystems.at(fscid);
1013
1014 if (fs->mds_map.damaged.erase(rank)) {
1015 fs->mds_map.failed.insert(rank);
1016 fs->mds_map.epoch = epoch;
1017 return true;
1018 } else {
1019 return false;
1020 }
1021 }
1022
1023 void FSMap::insert(const MDSMap::mds_info_t &new_info)
1024 {
1025 ceph_assert(new_info.state == MDSMap::STATE_STANDBY);
1026 ceph_assert(new_info.rank == MDS_RANK_NONE);
1027 mds_roles[new_info.global_id] = FS_CLUSTER_ID_NONE;
1028 standby_daemons[new_info.global_id] = new_info;
1029 standby_epochs[new_info.global_id] = epoch;
1030 }
1031
1032 std::vector<mds_gid_t> FSMap::stop(mds_gid_t who)
1033 {
1034 ceph_assert(mds_roles.at(who) != FS_CLUSTER_ID_NONE);
1035 auto fs = filesystems.at(mds_roles.at(who));
1036 const auto &info = fs->mds_map.mds_info.at(who);
1037 fs->mds_map.up.erase(info.rank);
1038 fs->mds_map.in.erase(info.rank);
1039 fs->mds_map.stopped.insert(info.rank);
1040
1041 // Also drop any standby replays that were following this rank
1042 std::vector<mds_gid_t> standbys;
1043 for (const auto &i : fs->mds_map.mds_info) {
1044 const auto &other_gid = i.first;
1045 const auto &other_info = i.second;
1046 if (other_info.rank == info.rank
1047 && other_info.state == MDSMap::STATE_STANDBY_REPLAY) {
1048 standbys.push_back(other_gid);
1049 erase(other_gid, 0);
1050 }
1051 }
1052
1053 fs->mds_map.mds_info.erase(who);
1054 mds_roles.erase(who);
1055
1056 fs->mds_map.epoch = epoch;
1057
1058 return standbys;
1059 }
1060
1061
1062 /**
1063 * Given one of the following forms:
1064 * <fs name>:<rank>
1065 * <fs id>:<rank>
1066 * <rank>
1067 *
1068 * Parse into a mds_role_t. The rank-only form is only valid
1069 * if legacy_client_ns is set.
1070 */
1071 int FSMap::parse_role(
1072 std::string_view role_str,
1073 mds_role_t *role,
1074 std::ostream &ss) const
1075 {
1076 size_t colon_pos = role_str.find(":");
1077 size_t rank_pos;
1078 Filesystem::const_ref fs;
1079 if (colon_pos == std::string::npos) {
1080 if (legacy_client_fscid == FS_CLUSTER_ID_NONE) {
1081 ss << "No filesystem selected";
1082 return -ENOENT;
1083 }
1084 fs = get_filesystem(legacy_client_fscid);
1085 rank_pos = 0;
1086 } else {
1087 if (parse_filesystem(role_str.substr(0, colon_pos), &fs) < 0) {
1088 ss << "Invalid filesystem";
1089 return -ENOENT;
1090 }
1091 rank_pos = colon_pos+1;
1092 }
1093
1094 mds_rank_t rank;
1095 std::string err;
1096 std::string rank_str(role_str.substr(rank_pos));
1097 long rank_i = strict_strtol(rank_str.c_str(), 10, &err);
1098 if (rank_i < 0 || !err.empty()) {
1099 ss << "Invalid rank '" << rank_str << "'";
1100 return -EINVAL;
1101 } else {
1102 rank = rank_i;
1103 }
1104
1105 if (fs->mds_map.in.count(rank) == 0) {
1106 ss << "Rank '" << rank << "' not found";
1107 return -ENOENT;
1108 }
1109
1110 *role = {fs->fscid, rank};
1111
1112 return 0;
1113 }
1114
1115 bool FSMap::pool_in_use(int64_t poolid) const
1116 {
1117 for (auto const &i : filesystems) {
1118 if (i.second->mds_map.is_data_pool(poolid)
1119 || i.second->mds_map.metadata_pool == poolid) {
1120 return true;
1121 }
1122 }
1123 return false;
1124 }
1125