1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include "common/debug.h"
16 #include "mon/health_check.h"
17
18 #include "MDSMap.h"
19
20 #include <sstream>
21 using std::stringstream;
22
23 #define dout_context g_ceph_context
24 #define dout_subsys ceph_subsys_
25
26 // features
27 CompatSet MDSMap::get_compat_set_all() {
28 CompatSet::FeatureSet feature_compat;
29 CompatSet::FeatureSet feature_ro_compat;
30 CompatSet::FeatureSet feature_incompat;
31 feature_incompat.insert(MDS_FEATURE_INCOMPAT_BASE);
32 feature_incompat.insert(MDS_FEATURE_INCOMPAT_CLIENTRANGES);
33 feature_incompat.insert(MDS_FEATURE_INCOMPAT_FILELAYOUT);
34 feature_incompat.insert(MDS_FEATURE_INCOMPAT_DIRINODE);
35 feature_incompat.insert(MDS_FEATURE_INCOMPAT_ENCODING);
36 feature_incompat.insert(MDS_FEATURE_INCOMPAT_OMAPDIRFRAG);
37 feature_incompat.insert(MDS_FEATURE_INCOMPAT_INLINE);
38 feature_incompat.insert(MDS_FEATURE_INCOMPAT_NOANCHOR);
39 feature_incompat.insert(MDS_FEATURE_INCOMPAT_FILE_LAYOUT_V2);
40 feature_incompat.insert(MDS_FEATURE_INCOMPAT_SNAPREALM_V2);
41
42 return CompatSet(feature_compat, feature_ro_compat, feature_incompat);
43 }
44
45 CompatSet MDSMap::get_compat_set_default() {
46 CompatSet::FeatureSet feature_compat;
47 CompatSet::FeatureSet feature_ro_compat;
48 CompatSet::FeatureSet feature_incompat;
49 feature_incompat.insert(MDS_FEATURE_INCOMPAT_BASE);
50 feature_incompat.insert(MDS_FEATURE_INCOMPAT_CLIENTRANGES);
51 feature_incompat.insert(MDS_FEATURE_INCOMPAT_FILELAYOUT);
52 feature_incompat.insert(MDS_FEATURE_INCOMPAT_DIRINODE);
53 feature_incompat.insert(MDS_FEATURE_INCOMPAT_ENCODING);
54 feature_incompat.insert(MDS_FEATURE_INCOMPAT_OMAPDIRFRAG);
55 feature_incompat.insert(MDS_FEATURE_INCOMPAT_NOANCHOR);
56 feature_incompat.insert(MDS_FEATURE_INCOMPAT_FILE_LAYOUT_V2);
57 feature_incompat.insert(MDS_FEATURE_INCOMPAT_SNAPREALM_V2);
58
59 return CompatSet(feature_compat, feature_ro_compat, feature_incompat);
60 }
61
62 // base (pre v0.20)
63 CompatSet MDSMap::get_compat_set_base() {
64 CompatSet::FeatureSet feature_compat_base;
65 CompatSet::FeatureSet feature_incompat_base;
66 feature_incompat_base.insert(MDS_FEATURE_INCOMPAT_BASE);
67 CompatSet::FeatureSet feature_ro_compat_base;
68
69 return CompatSet(feature_compat_base, feature_ro_compat_base, feature_incompat_base);
70 }
71
72 void MDSMap::mds_info_t::dump(Formatter *f) const
73 {
74 f->dump_unsigned("gid", global_id);
75 f->dump_string("name", name);
76 f->dump_int("rank", rank);
77 f->dump_int("incarnation", inc);
78 f->dump_stream("state") << ceph_mds_state_name(state);
79 f->dump_int("state_seq", state_seq);
80 f->dump_stream("addr") << addrs.get_legacy_str();
81 f->dump_object("addrs", addrs);
82 if (laggy_since != utime_t())
83 f->dump_stream("laggy_since") << laggy_since;
84
85 f->open_array_section("export_targets");
86 for (set<mds_rank_t>::iterator p = export_targets.begin();
87 p != export_targets.end(); ++p) {
88 f->dump_int("mds", *p);
89 }
90 f->close_section();
91 f->dump_unsigned("features", mds_features);
92 f->dump_unsigned("flags", flags);
93 }
94
95 void MDSMap::mds_info_t::print_summary(ostream &out) const
96 {
97 out << global_id << ":\t"
98 << addrs
99 << " '" << name << "'"
100 << " mds." << rank
101 << "." << inc
102 << " " << ceph_mds_state_name(state)
103 << " seq " << state_seq;
104 if (laggy()) {
105 out << " laggy since " << laggy_since;
106 }
107 if (!export_targets.empty()) {
108 out << " export_targets=" << export_targets;
109 }
110 if (is_frozen()) {
111 out << " frozen";
112 }
113 }
114
115 void MDSMap::mds_info_t::generate_test_instances(std::list<mds_info_t*>& ls)
116 {
117 mds_info_t *sample = new mds_info_t();
118 ls.push_back(sample);
119 sample = new mds_info_t();
120 sample->global_id = 1;
121 sample->name = "test_instance";
122 sample->rank = 0;
123 ls.push_back(sample);
124 }
125
126 void MDSMap::dump(Formatter *f) const
127 {
128 f->dump_int("epoch", epoch);
129 f->dump_unsigned("flags", flags);
130 f->dump_unsigned("ever_allowed_features", ever_allowed_features);
131 f->dump_unsigned("explicitly_allowed_features", explicitly_allowed_features);
132 f->dump_stream("created") << created;
133 f->dump_stream("modified") << modified;
134 f->dump_int("tableserver", tableserver);
135 f->dump_int("root", root);
136 f->dump_int("session_timeout", session_timeout);
137 f->dump_int("session_autoclose", session_autoclose);
138 f->dump_stream("min_compat_client") << ceph::to_integer<int>(min_compat_client) << " ("
139 << min_compat_client << ")";
140 f->dump_int("max_file_size", max_file_size);
141 f->dump_int("last_failure", last_failure);
142 f->dump_int("last_failure_osd_epoch", last_failure_osd_epoch);
143 f->open_object_section("compat");
144 compat.dump(f);
145 f->close_section();
146 f->dump_int("max_mds", max_mds);
147 f->open_array_section("in");
148 for (set<mds_rank_t>::const_iterator p = in.begin(); p != in.end(); ++p)
149 f->dump_int("mds", *p);
150 f->close_section();
151 f->open_object_section("up");
152 for (map<mds_rank_t,mds_gid_t>::const_iterator p = up.begin(); p != up.end(); ++p) {
153 char s[14];
154 sprintf(s, "mds_%d", int(p->first));
155 f->dump_int(s, p->second);
156 }
157 f->close_section();
158 f->open_array_section("failed");
159 for (set<mds_rank_t>::const_iterator p = failed.begin(); p != failed.end(); ++p)
160 f->dump_int("mds", *p);
161 f->close_section();
162 f->open_array_section("damaged");
163 for (set<mds_rank_t>::const_iterator p = damaged.begin(); p != damaged.end(); ++p)
164 f->dump_int("mds", *p);
165 f->close_section();
166 f->open_array_section("stopped");
167 for (set<mds_rank_t>::const_iterator p = stopped.begin(); p != stopped.end(); ++p)
168 f->dump_int("mds", *p);
169 f->close_section();
170 f->open_object_section("info");
171 for (map<mds_gid_t,mds_info_t>::const_iterator p = mds_info.begin(); p != mds_info.end(); ++p) {
172 char s[25]; // 'gid_' + len(str(ULLONG_MAX)) + '\0'
173 sprintf(s, "gid_%llu", (long long unsigned)p->first);
174 f->open_object_section(s);
175 p->second.dump(f);
176 f->close_section();
177 }
178 f->close_section();
179 f->open_array_section("data_pools");
180 for (const auto p: data_pools)
181 f->dump_int("pool", p);
182 f->close_section();
183 f->dump_int("metadata_pool", metadata_pool);
184 f->dump_bool("enabled", enabled);
185 f->dump_string("fs_name", fs_name);
186 f->dump_string("balancer", balancer);
187 f->dump_int("standby_count_wanted", std::max(0, standby_count_wanted));
188 }
189
190 void MDSMap::generate_test_instances(std::list<MDSMap*>& ls)
191 {
192 MDSMap *m = new MDSMap();
193 m->max_mds = 1;
194 m->data_pools.push_back(0);
195 m->metadata_pool = 1;
196 m->cas_pool = 2;
197 m->compat = get_compat_set_all();
198
199 // these aren't the defaults, just in case anybody gets confused
200 m->session_timeout = 61;
201 m->session_autoclose = 301;
202 m->max_file_size = 1<<24;
203 ls.push_back(m);
204 }
205
206 void MDSMap::print(ostream& out) const
207 {
208 out << "fs_name\t" << fs_name << "\n";
209 out << "epoch\t" << epoch << "\n";
210 out << "flags\t" << hex << flags << dec << "\n";
211 out << "created\t" << created << "\n";
212 out << "modified\t" << modified << "\n";
213 out << "tableserver\t" << tableserver << "\n";
214 out << "root\t" << root << "\n";
215 out << "session_timeout\t" << session_timeout << "\n"
216 << "session_autoclose\t" << session_autoclose << "\n";
217 out << "max_file_size\t" << max_file_size << "\n";
218 out << "min_compat_client\t" << ceph::to_integer<int>(min_compat_client) << " ("
219 << min_compat_client << ")\n";
220 out << "last_failure\t" << last_failure << "\n"
221 << "last_failure_osd_epoch\t" << last_failure_osd_epoch << "\n";
222 out << "compat\t" << compat << "\n";
223 out << "max_mds\t" << max_mds << "\n";
224 out << "in\t" << in << "\n"
225 << "up\t" << up << "\n"
226 << "failed\t" << failed << "\n"
227 << "damaged\t" << damaged << "\n"
228 << "stopped\t" << stopped << "\n";
229 out << "data_pools\t" << data_pools << "\n";
230 out << "metadata_pool\t" << metadata_pool << "\n";
231 out << "inline_data\t" << (inline_data_enabled ? "enabled" : "disabled") << "\n";
232 out << "balancer\t" << balancer << "\n";
233 out << "standby_count_wanted\t" << std::max(0, standby_count_wanted) << "\n";
234
235 multimap< pair<mds_rank_t, unsigned>, mds_gid_t > foo;
236 for (const auto &p : mds_info) {
237 foo.insert(std::make_pair(
238 std::make_pair(p.second.rank, p.second.inc-1), p.first));
239 }
240
241 for (const auto &p : foo) {
242 const mds_info_t& info = mds_info.at(p.second);
243 info.print_summary(out);
244 out << "\n";
245 }
246 }
247
248
249
250 void MDSMap::print_summary(Formatter *f, ostream *out) const
251 {
252 map<mds_rank_t,string> by_rank;
253 map<string,int> by_state;
254
255 if (f) {
256 f->dump_unsigned("epoch", get_epoch());
257 f->dump_unsigned("up", up.size());
258 f->dump_unsigned("in", in.size());
259 f->dump_unsigned("max", max_mds);
260 } else {
261 *out << "e" << get_epoch() << ": " << up.size() << "/" << in.size() << "/" << max_mds << " up";
262 }
263
264 if (f)
265 f->open_array_section("by_rank");
266 for (const auto &p : mds_info) {
267 string s = ceph_mds_state_name(p.second.state);
268 if (p.second.laggy())
269 s += "(laggy or crashed)";
270
271 if (p.second.rank >= 0 && p.second.state != MDSMap::STATE_STANDBY_REPLAY) {
272 if (f) {
273 f->open_object_section("mds");
274 f->dump_unsigned("rank", p.second.rank);
275 f->dump_string("name", p.second.name);
276 f->dump_string("status", s);
277 f->close_section();
278 } else {
279 by_rank[p.second.rank] = p.second.name + "=" + s;
280 }
281 } else {
282 by_state[s]++;
283 }
284 }
285 if (f) {
286 f->close_section();
287 } else {
288 if (!by_rank.empty())
289 *out << " " << by_rank;
290 }
291
292 for (map<string,int>::reverse_iterator p = by_state.rbegin(); p != by_state.rend(); ++p) {
293 if (f) {
294 f->dump_unsigned(p->first.c_str(), p->second);
295 } else {
296 *out << ", " << p->second << " " << p->first;
297 }
298 }
299
300 if (!failed.empty()) {
301 if (f) {
302 f->dump_unsigned("failed", failed.size());
303 } else {
304 *out << ", " << failed.size() << " failed";
305 }
306 }
307
308 if (!damaged.empty()) {
309 if (f) {
310 f->dump_unsigned("damaged", damaged.size());
311 } else {
312 *out << ", " << damaged.size() << " damaged";
313 }
314 }
315 //if (stopped.size())
316 //out << ", " << stopped.size() << " stopped";
317 }
318
319 void MDSMap::get_health(list<pair<health_status_t,string> >& summary,
320 list<pair<health_status_t,string> > *detail) const
321 {
322 if (!failed.empty()) {
323 std::ostringstream oss;
324 oss << "mds rank"
325 << ((failed.size() > 1) ? "s ":" ")
326 << failed
327 << ((failed.size() > 1) ? " have":" has")
328 << " failed";
329 summary.push_back(make_pair(HEALTH_ERR, oss.str()));
330 if (detail) {
331 for (set<mds_rank_t>::const_iterator p = failed.begin(); p != failed.end(); ++p) {
332 std::ostringstream oss;
333 oss << "mds." << *p << " has failed";
334 detail->push_back(make_pair(HEALTH_ERR, oss.str()));
335 }
336 }
337 }
338
339 if (!damaged.empty()) {
340 std::ostringstream oss;
341 oss << "mds rank"
342 << ((damaged.size() > 1) ? "s ":" ")
343 << damaged
344 << ((damaged.size() > 1) ? " are":" is")
345 << " damaged";
346 summary.push_back(make_pair(HEALTH_ERR, oss.str()));
347 if (detail) {
348 for (set<mds_rank_t>::const_iterator p = damaged.begin(); p != damaged.end(); ++p) {
349 std::ostringstream oss;
350 oss << "mds." << *p << " is damaged";
351 detail->push_back(make_pair(HEALTH_ERR, oss.str()));
352 }
353 }
354 }
355
356 if (is_degraded()) {
357 summary.push_back(make_pair(HEALTH_WARN, "mds cluster is degraded"));
358 if (detail) {
359 detail->push_back(make_pair(HEALTH_WARN, "mds cluster is degraded"));
360 for (mds_rank_t i = mds_rank_t(0); i< get_max_mds(); i++) {
361 if (!is_up(i))
362 continue;
363 mds_gid_t gid = up.find(i)->second;
364 map<mds_gid_t,mds_info_t>::const_iterator info = mds_info.find(gid);
365 stringstream ss;
366 if (is_resolve(i))
367 ss << "mds." << info->second.name << " at " << info->second.addrs
368 << " rank " << i << " is resolving";
369 if (is_replay(i))
370 ss << "mds." << info->second.name << " at " << info->second.addrs
371 << " rank " << i << " is replaying journal";
372 if (is_rejoin(i))
373 ss << "mds." << info->second.name << " at " << info->second.addrs
374 << " rank " << i << " is rejoining";
375 if (is_reconnect(i))
376 ss << "mds." << info->second.name << " at " << info->second.addrs
377 << " rank " << i << " is reconnecting to clients";
378 if (ss.str().length())
379 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
380 }
381 }
382 }
383
384 {
385 stringstream ss;
386 ss << fs_name << " max_mds " << max_mds;
387 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
388 }
389
390 if ((mds_rank_t)up.size() < max_mds) {
391 stringstream ss;
392 ss << fs_name << " has " << up.size()
393 << " active MDS(s), but has max_mds of " << max_mds;
394 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
395 }
396
397 map<mds_gid_t, mds_info_t>::const_iterator m_end = mds_info.end();
398 set<string> laggy;
399 for (const auto &u : up) {
400 map<mds_gid_t, mds_info_t>::const_iterator m = mds_info.find(u.second);
401 if (m == m_end) {
402 std::cerr << "Up rank " << u.first << " GID " << u.second << " not found!" << std::endl;
403 }
404 ceph_assert(m != m_end);
405 const mds_info_t &mds_info(m->second);
406 if (mds_info.laggy()) {
407 laggy.insert(mds_info.name);
408 if (detail) {
409 std::ostringstream oss;
410 oss << "mds." << mds_info.name << " at " << mds_info.addrs
411 << " is laggy/unresponsive";
412 detail->push_back(make_pair(HEALTH_WARN, oss.str()));
413 }
414 }
415 }
416
417 if (!laggy.empty()) {
418 std::ostringstream oss;
419 oss << "mds " << laggy
420 << ((laggy.size() > 1) ? " are":" is")
421 << " laggy";
422 summary.push_back(make_pair(HEALTH_WARN, oss.str()));
423 }
424
425 if (get_max_mds() > 1 &&
426 was_snaps_ever_allowed() && !allows_multimds_snaps()) {
427 std::ostringstream oss;
428 oss << "multi-active mds while there are snapshots possibly created by pre-mimic MDS";
429 summary.push_back(make_pair(HEALTH_WARN, oss.str()));
430 }
431 }
432
433 void MDSMap::get_health_checks(health_check_map_t *checks) const
434 {
435 // MDS_DAMAGE
436 if (!damaged.empty()) {
437 health_check_t& check = checks->get_or_add("MDS_DAMAGE", HEALTH_ERR,
438 "%num% mds daemon%plurals% damaged",
439 damaged.size());
440 for (auto p : damaged) {
441 std::ostringstream oss;
442 oss << "fs " << fs_name << " mds." << p << " is damaged";
443 check.detail.push_back(oss.str());
444 }
445 }
446
447 // FS_DEGRADED
448 if (is_degraded()) {
449 health_check_t& fscheck = checks->get_or_add(
450 "FS_DEGRADED", HEALTH_WARN,
451 "%num% filesystem%plurals% %isorare% degraded", 1);
452 ostringstream ss;
453 ss << "fs " << fs_name << " is degraded";
454 fscheck.detail.push_back(ss.str());
455
456 list<string> detail;
457 for (mds_rank_t i = mds_rank_t(0); i< get_max_mds(); i++) {
458 if (!is_up(i))
459 continue;
460 mds_gid_t gid = up.find(i)->second;
461 map<mds_gid_t,mds_info_t>::const_iterator info = mds_info.find(gid);
462 stringstream ss;
463 ss << "fs " << fs_name << " mds." << info->second.name << " at "
464 << info->second.addrs << " rank " << i;
465 if (is_resolve(i))
466 ss << " is resolving";
467 if (is_replay(i))
468 ss << " is replaying journal";
469 if (is_rejoin(i))
470 ss << " is rejoining";
471 if (is_reconnect(i))
472 ss << " is reconnecting to clients";
473 if (ss.str().length())
474 detail.push_back(ss.str());
475 }
476 }
477
478 // MDS_UP_LESS_THAN_MAX
479 if ((mds_rank_t)get_num_in_mds() < get_max_mds()) {
480 health_check_t& check = checks->add(
481 "MDS_UP_LESS_THAN_MAX", HEALTH_WARN,
482 "%num% filesystem%plurals% %isorare% online with fewer MDS than max_mds", 1);
483 stringstream ss;
484 ss << "fs " << fs_name << " has " << get_num_in_mds()
485 << " MDS online, but wants " << get_max_mds();
486 check.detail.push_back(ss.str());
487 }
488
489 // MDS_ALL_DOWN
490 if ((mds_rank_t)get_num_up_mds() == 0 && get_max_mds() > 0) {
491 health_check_t &check = checks->add(
492 "MDS_ALL_DOWN", HEALTH_ERR,
493 "%num% filesystem%plurals% %isorare% offline", 1);
494 stringstream ss;
495 ss << "fs " << fs_name << " is offline because no MDS is active for it.";
496 check.detail.push_back(ss.str());
497 }
498
499 if (get_max_mds() > 1 &&
500 was_snaps_ever_allowed() && !allows_multimds_snaps()) {
501 health_check_t &check = checks->add(
502 "MULTIMDS_WITH_OLDSNAPS", HEALTH_ERR,
503 "%num% filesystem%plurals% %isorare% multi-active mds with old snapshots", 1);
504 stringstream ss;
505 ss << "multi-active mds while there are snapshots possibly created by pre-mimic MDS";
506 check.detail.push_back(ss.str());
507 }
508
509 if (get_inline_data_enabled()) {
510 health_check_t &check = checks->add(
511 "FS_INLINE_DATA_DEPRECATED", HEALTH_WARN,
512 "%num% filesystem%plurals% with deprecated feature inline_data", 1);
513 stringstream ss;
514 ss << "fs " << fs_name << " has deprecated feature inline_data enabled.";
515 check.detail.push_back(ss.str());
516 }
517 }
518
519 void MDSMap::mds_info_t::encode_versioned(bufferlist& bl, uint64_t features) const
520 {
521 __u8 v = 9;
522 if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
523 v = 7;
524 }
525 ENCODE_START(v, 4, bl);
526 encode(global_id, bl);
527 encode(name, bl);
528 encode(rank, bl);
529 encode(inc, bl);
530 encode((int32_t)state, bl);
531 encode(state_seq, bl);
532 if (v < 8) {
533 encode(addrs.legacy_addr(), bl, features);
534 } else {
535 encode(addrs, bl, features);
536 }
537 encode(laggy_since, bl);
538 encode(MDS_RANK_NONE, bl); /* standby_for_rank */
539 encode(std::string(), bl); /* standby_for_name */
540 encode(export_targets, bl);
541 encode(mds_features, bl);
542 encode(FS_CLUSTER_ID_NONE, bl); /* standby_for_fscid */
543 encode(false, bl);
544 if (v >= 9) {
545 encode(flags, bl);
546 }
547 ENCODE_FINISH(bl);
548 }
549
550 void MDSMap::mds_info_t::encode_unversioned(bufferlist& bl) const
551 {
552 __u8 struct_v = 3;
553 using ceph::encode;
(2) Event overrun-buffer-val: |
Overrunning buffer pointed to by "struct_v" of 1 bytes by passing it to a function which accesses it at byte offset 7. [details] |
Also see events: |
[assignment] |
554 encode(struct_v, bl);
555 encode(global_id, bl);
556 encode(name, bl);
557 encode(rank, bl);
558 encode(inc, bl);
559 encode((int32_t)state, bl);
560 encode(state_seq, bl);
561 encode(addrs.legacy_addr(), bl, 0);
562 encode(laggy_since, bl);
563 encode(MDS_RANK_NONE, bl);
564 encode(std::string(), bl);
565 encode(export_targets, bl);
566 }
567
568 void MDSMap::mds_info_t::decode(bufferlist::const_iterator& bl)
569 {
570 DECODE_START_LEGACY_COMPAT_LEN(9, 4, 4, bl);
571 decode(global_id, bl);
572 decode(name, bl);
573 decode(rank, bl);
574 decode(inc, bl);
575 decode((int32_t&)(state), bl);
576 decode(state_seq, bl);
577 decode(addrs, bl);
578 decode(laggy_since, bl);
579 {
580 mds_rank_t standby_for_rank;
581 decode(standby_for_rank, bl);
582 }
583 {
584 std::string standby_for_name;
585 decode(standby_for_name, bl);
586 }
587 if (struct_v >= 2)
588 decode(export_targets, bl);
589 if (struct_v >= 5)
590 decode(mds_features, bl);
591 if (struct_v >= 6) {
592 fs_cluster_id_t standby_for_fscid;
593 decode(standby_for_fscid, bl);
594 }
595 if (struct_v >= 7) {
596 bool standby_replay;
597 decode(standby_replay, bl);
598 }
599 if (struct_v >= 9) {
600 decode(flags, bl);
601 }
602 DECODE_FINISH(bl);
603 }
604
605 std::string MDSMap::mds_info_t::human_name() const
606 {
607 // Like "daemon mds.myhost restarted", "Activating daemon mds.myhost"
608 std::ostringstream out;
609 out << "daemon mds." << name;
610 return out.str();
611 }
612
613 void MDSMap::encode(bufferlist& bl, uint64_t features) const
614 {
615 std::map<mds_rank_t,int32_t> inc; // Legacy field, fake it so that
616 // old-mon peers have something sane
617 // during upgrade
618 for (const auto rank : in) {
619 inc.insert(std::make_pair(rank, epoch));
620 }
621
622 using ceph::encode;
623 if ((features & CEPH_FEATURE_PGID64) == 0) {
624 __u16 v = 2;
625 encode(v, bl);
626 encode(epoch, bl);
627 encode(flags, bl);
628 encode(last_failure, bl);
629 encode(root, bl);
630 encode(session_timeout, bl);
631 encode(session_autoclose, bl);
632 encode(max_file_size, bl);
633 encode(max_mds, bl);
634 __u32 n = mds_info.size();
635 encode(n, bl);
636 for (map<mds_gid_t, mds_info_t>::const_iterator i = mds_info.begin();
637 i != mds_info.end(); ++i) {
638 encode(i->first, bl);
639 encode(i->second, bl, features);
640 }
641 n = data_pools.size();
642 encode(n, bl);
643 for (const auto p: data_pools) {
644 n = p;
645 encode(n, bl);
646 }
647
648 int32_t m = cas_pool;
649 encode(m, bl);
650 return;
651 } else if ((features & CEPH_FEATURE_MDSENC) == 0) {
652 __u16 v = 3;
653 encode(v, bl);
654 encode(epoch, bl);
655 encode(flags, bl);
656 encode(last_failure, bl);
657 encode(root, bl);
658 encode(session_timeout, bl);
659 encode(session_autoclose, bl);
660 encode(max_file_size, bl);
661 encode(max_mds, bl);
662 __u32 n = mds_info.size();
663 encode(n, bl);
664 for (map<mds_gid_t, mds_info_t>::const_iterator i = mds_info.begin();
665 i != mds_info.end(); ++i) {
666 encode(i->first, bl);
667 encode(i->second, bl, features);
668 }
669 encode(data_pools, bl);
670 encode(cas_pool, bl);
671
672 // kclient ignores everything from here
673 __u16 ev = 5;
674 encode(ev, bl);
675 encode(compat, bl);
676 encode(metadata_pool, bl);
677 encode(created, bl);
678 encode(modified, bl);
679 encode(tableserver, bl);
680 encode(in, bl);
681 encode(inc, bl);
682 encode(up, bl);
683 encode(failed, bl);
684 encode(stopped, bl);
685 encode(last_failure_osd_epoch, bl);
686 return;
687 }
688
689 ENCODE_START(5, 4, bl);
690 encode(epoch, bl);
691 encode(flags, bl);
692 encode(last_failure, bl);
693 encode(root, bl);
694 encode(session_timeout, bl);
695 encode(session_autoclose, bl);
696 encode(max_file_size, bl);
697 encode(max_mds, bl);
698 encode(mds_info, bl, features);
699 encode(data_pools, bl);
700 encode(cas_pool, bl);
701
702 // kclient ignores everything from here
703 __u16 ev = 15;
704 encode(ev, bl);
705 encode(compat, bl);
706 encode(metadata_pool, bl);
707 encode(created, bl);
708 encode(modified, bl);
709 encode(tableserver, bl);
710 encode(in, bl);
711 encode(inc, bl);
712 encode(up, bl);
713 encode(failed, bl);
714 encode(stopped, bl);
715 encode(last_failure_osd_epoch, bl);
716 encode(ever_allowed_features, bl);
717 encode(explicitly_allowed_features, bl);
718 encode(inline_data_enabled, bl);
719 encode(enabled, bl);
720 encode(fs_name, bl);
721 encode(damaged, bl);
722 encode(balancer, bl);
723 encode(standby_count_wanted, bl);
724 encode(old_max_mds, bl);
725 encode(min_compat_client, bl);
726 ENCODE_FINISH(bl);
727 }
728
729 void MDSMap::sanitize(const std::function<bool(int64_t pool)>& pool_exists)
730 {
731 /* Before we did stricter checking, it was possible to remove a data pool
732 * without also deleting it from the MDSMap. Check for that here after
733 * decoding the data pools.
734 */
735
736 for (auto it = data_pools.begin(); it != data_pools.end();) {
737 if (!pool_exists(*it)) {
738 dout(0) << "removed non-existant data pool " << *it << " from MDSMap" << dendl;
739 it = data_pools.erase(it);
740 } else {
741 it++;
742 }
743 }
744 }
745
746 void MDSMap::decode(bufferlist::const_iterator& p)
747 {
748 std::map<mds_rank_t,int32_t> inc; // Legacy field, parse and drop
749
750 cached_up_features = 0;
751 DECODE_START_LEGACY_COMPAT_LEN_16(5, 4, 4, p);
752 decode(epoch, p);
753 decode(flags, p);
754 decode(last_failure, p);
755 decode(root, p);
756 decode(session_timeout, p);
757 decode(session_autoclose, p);
758 decode(max_file_size, p);
759 decode(max_mds, p);
760 decode(mds_info, p);
761 if (struct_v < 3) {
762 __u32 n;
763 decode(n, p);
764 while (n--) {
765 __u32 m;
766 decode(m, p);
767 data_pools.push_back(m);
768 }
769 __s32 s;
770 decode(s, p);
771 cas_pool = s;
772 } else {
773 decode(data_pools, p);
774 decode(cas_pool, p);
775 }
776
777 // kclient ignores everything from here
778 __u16 ev = 1;
779 if (struct_v >= 2)
780 decode(ev, p);
781 if (ev >= 3)
782 decode(compat, p);
783 else
784 compat = get_compat_set_base();
785 if (ev < 5) {
786 __u32 n;
787 decode(n, p);
788 metadata_pool = n;
789 } else {
790 decode(metadata_pool, p);
791 }
792 decode(created, p);
793 decode(modified, p);
794 decode(tableserver, p);
795 decode(in, p);
796 decode(inc, p);
797 decode(up, p);
798 decode(failed, p);
799 decode(stopped, p);
800 if (ev >= 4)
801 decode(last_failure_osd_epoch, p);
802 if (ev >= 6) {
803 if (ev < 10) {
804 // previously this was a bool about snaps, not a flag map
805 bool flag;
806 decode(flag, p);
807 ever_allowed_features = flag ? CEPH_MDSMAP_ALLOW_SNAPS : 0;
808 decode(flag, p);
809 explicitly_allowed_features = flag ? CEPH_MDSMAP_ALLOW_SNAPS : 0;
810 } else {
811 decode(ever_allowed_features, p);
812 decode(explicitly_allowed_features, p);
813 }
814 } else {
815 ever_allowed_features = 0;
816 explicitly_allowed_features = 0;
817 }
818 if (ev >= 7)
819 decode(inline_data_enabled, p);
820
821 if (ev >= 8) {
822 ceph_assert(struct_v >= 5);
823 decode(enabled, p);
824 decode(fs_name, p);
825 } else {
826 if (epoch > 1) {
827 // If an MDS has ever been started, epoch will be greater than 1,
828 // assume filesystem is enabled.
829 enabled = true;
830 } else {
831 // Upgrading from a cluster that never used an MDS, switch off
832 // filesystem until it's explicitly enabled.
833 enabled = false;
834 }
835 }
836
837 if (ev >= 9) {
838 decode(damaged, p);
839 }
840
841 if (ev >= 11) {
842 decode(balancer, p);
843 }
844
845 if (ev >= 12) {
846 decode(standby_count_wanted, p);
847 }
848
849 if (ev >= 13) {
850 decode(old_max_mds, p);
851 }
852
853 if (ev == 14) {
854 int8_t r;
855 decode(r, p);
856 if (r < 0) {
857 min_compat_client = ceph_release_t::unknown;
858 } else {
859 min_compat_client = ceph_release_t{static_cast<uint8_t>(r)};
860 }
861 } else if (ev > 14) {
862 decode(min_compat_client, p);
863 }
864
865 DECODE_FINISH(p);
866 }
867
868 MDSMap::availability_t MDSMap::is_cluster_available() const
869 {
870 if (epoch == 0) {
871 // If I'm a client, this means I'm looking at an MDSMap instance
872 // that was never actually initialized from the mons. Client should
873 // wait.
874 return TRANSIENT_UNAVAILABLE;
875 }
876
877 // If a rank is marked damage (unavailable until operator intervenes)
878 if (damaged.size()) {
879 return STUCK_UNAVAILABLE;
880 }
881
882 // If no ranks are created (filesystem not initialized)
883 if (in.empty()) {
884 return STUCK_UNAVAILABLE;
885 }
886
887 for (const auto rank : in) {
888 if (up.count(rank) && mds_info.at(up.at(rank)).laggy()) {
889 // This might only be transient, but because we can't see
890 // standbys, we have no way of knowing whether there is a
891 // standby available to replace the laggy guy.
892 return STUCK_UNAVAILABLE;
893 }
894 }
895
896 if (get_num_mds(CEPH_MDS_STATE_ACTIVE) > 0) {
897 // Nobody looks stuck, so indicate to client they should go ahead
898 // and try mounting if anybody is active. This may include e.g.
899 // one MDS failing over and another active: the client should
900 // proceed to start talking to the active one and let the
901 // transiently-unavailable guy catch up later.
902 return AVAILABLE;
903 } else {
904 // Nothing indicating we were stuck, but nobody active (yet)
905 //return TRANSIENT_UNAVAILABLE;
906
907 // Because we don't have standbys in the MDSMap any more, we can't
908 // reliably indicate transient vs. stuck, so always say stuck so
909 // that the client doesn't block.
910 return STUCK_UNAVAILABLE;
911 }
912 }
913
914 bool MDSMap::state_transition_valid(DaemonState prev, DaemonState next)
915 {
916 bool state_valid = true;
917 if (next != prev) {
918 if (prev == MDSMap::STATE_REPLAY) {
919 if (next != MDSMap::STATE_RESOLVE && next != MDSMap::STATE_RECONNECT) {
920 state_valid = false;
921 }
922 } else if (prev == MDSMap::STATE_REJOIN) {
923 if (next != MDSMap::STATE_ACTIVE &&
924 next != MDSMap::STATE_CLIENTREPLAY &&
925 next != MDSMap::STATE_STOPPED) {
926 state_valid = false;
927 }
928 } else if (prev >= MDSMap::STATE_RESOLVE && prev < MDSMap::STATE_ACTIVE) {
929 // Once I have entered replay, the only allowable transitions are to
930 // the next next along in the sequence.
931 if (next != prev + 1) {
932 state_valid = false;
933 }
934 }
935 }
936
937 return state_valid;
938 }
939
940 bool MDSMap::check_health(mds_rank_t standby_daemon_count)
941 {
942 std::set<mds_rank_t> standbys;
943 get_standby_replay_mds_set(standbys);
944 std::set<mds_rank_t> actives;
945 get_active_mds_set(actives);
946 mds_rank_t standbys_avail = (mds_rank_t)standbys.size()+standby_daemon_count;
947
948 /* If there are standby daemons available/replaying and
949 * standby_count_wanted is unset (default), then we set it to 1. This will
950 * happen during health checks by the mons. Also, during initial creation
951 * of the FS we will have no actives so we don't want to change the default
952 * yet.
953 */
954 if (standby_count_wanted == -1 && actives.size() > 0 && standbys_avail > 0) {
955 set_standby_count_wanted(1);
956 return true;
957 }
958 return false;
959 }
960