1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2011 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
#include <signal.h>

#include <shared_mutex>

#include "HeartbeatMap.h"
#include "ceph_context.h"
#include "common/errno.h"
#include "common/valgrind.h"
#include "debug.h"
22
23 #define dout_subsys ceph_subsys_heartbeatmap
24 #undef dout_prefix
25 #define dout_prefix *_dout << "heartbeat_map "
26
27 namespace ceph {
28
29 HeartbeatMap::HeartbeatMap(CephContext *cct)
30 : m_cct(cct),
31 m_unhealthy_workers(0),
32 m_total_workers(0)
33 {
34 }
35
(1) Event exn_spec_violation: |
An exception of type "_ZN5boost16exception_detail10clone_implINS0_19error_info_injectorINSt8ios_base7failureB5cxx11EEEEE" is thrown but the throw list "throw()" doesn't allow it to be thrown. This will cause a call to unexpected() which usually calls terminate(). |
Also see events: |
[fun_call_w_exception] |
36 HeartbeatMap::~HeartbeatMap()
37 {
(2) Event fun_call_w_exception: |
Called function throws an exception of type "_ZN5boost16exception_detail10clone_implINS0_19error_info_injectorINSt8ios_base7failureB5cxx11EEEEE". [details] |
Also see events: |
[exn_spec_violation] |
38 ceph_assert(m_workers.empty());
39 }
40
41 heartbeat_handle_d *HeartbeatMap::add_worker(const string& name, pthread_t thread_id)
42 {
43 std::unique_lock locker{m_rwlock};
44 ldout(m_cct, 10) << "add_worker '" << name << "'" << dendl;
45 heartbeat_handle_d *h = new heartbeat_handle_d(name);
46 ANNOTATE_BENIGN_RACE_SIZED(&h->timeout, sizeof(h->timeout),
47 "heartbeat_handle_d timeout");
48 ANNOTATE_BENIGN_RACE_SIZED(&h->suicide_timeout, sizeof(h->suicide_timeout),
49 "heartbeat_handle_d suicide_timeout");
50 m_workers.push_front(h);
51 h->list_item = m_workers.begin();
52 h->thread_id = thread_id;
53 return h;
54 }
55
56 void HeartbeatMap::remove_worker(const heartbeat_handle_d *h)
57 {
58 std::unique_lock locker{m_rwlock};
59 ldout(m_cct, 10) << "remove_worker '" << h->name << "'" << dendl;
60 m_workers.erase(h->list_item);
61 delete h;
62 }
63
64 bool HeartbeatMap::_check(const heartbeat_handle_d *h, const char *who,
65 ceph::coarse_mono_clock::rep now)
66 {
67 bool healthy = true;
68 auto was = h->timeout.load();
69 if (was && was < now) {
70 ldout(m_cct, 1) << who << " '" << h->name << "'"
71 << " had timed out after " << h->grace << dendl;
72 healthy = false;
73 }
74 was = h->suicide_timeout;
75 if (was && was < now) {
76 ldout(m_cct, 1) << who << " '" << h->name << "'"
77 << " had suicide timed out after " << h->suicide_grace << dendl;
78 pthread_kill(h->thread_id, SIGABRT);
79 sleep(1);
80 ceph_abort_msg("hit suicide timeout");
81 }
82 return healthy;
83 }
84
85 void HeartbeatMap::reset_timeout(heartbeat_handle_d *h,
86 ceph::coarse_mono_clock::rep grace,
87 ceph::coarse_mono_clock::rep suicide_grace)
88 {
89 ldout(m_cct, 20) << "reset_timeout '" << h->name << "' grace " << grace
90 << " suicide " << suicide_grace << dendl;
91 auto now = chrono::duration_cast<chrono::seconds>(
92 ceph::coarse_mono_clock::now().time_since_epoch()).count();
93 _check(h, "reset_timeout", now);
94
95 h->timeout = now + grace;
96 h->grace = grace;
97
98 if (suicide_grace)
99 h->suicide_timeout = now + suicide_grace;
100 else
101 h->suicide_timeout = 0;
102 h->suicide_grace = suicide_grace;
103 }
104
105 void HeartbeatMap::clear_timeout(heartbeat_handle_d *h)
106 {
107 ldout(m_cct, 20) << "clear_timeout '" << h->name << "'" << dendl;
108 auto now = chrono::duration_cast<std::chrono::seconds>(
109 ceph::coarse_mono_clock::now().time_since_epoch()).count();
110 _check(h, "clear_timeout", now);
111 h->timeout = 0;
112 h->suicide_timeout = 0;
113 }
114
115 bool HeartbeatMap::is_healthy()
116 {
117 int unhealthy = 0;
118 int total = 0;
119 m_rwlock.lock_shared();
120 auto now = ceph::coarse_mono_clock::now();
121 if (m_cct->_conf->heartbeat_inject_failure) {
122 ldout(m_cct, 0) << "is_healthy injecting failure for next " << m_cct->_conf->heartbeat_inject_failure << " seconds" << dendl;
123 m_inject_unhealthy_until = now + std::chrono::seconds(m_cct->_conf->heartbeat_inject_failure);
124 m_cct->_conf.set_val("heartbeat_inject_failure", "0");
125 }
126
127 bool healthy = true;
128 if (now < m_inject_unhealthy_until) {
129 auto sec = std::chrono::duration_cast<std::chrono::seconds>(m_inject_unhealthy_until - now).count();
130 ldout(m_cct, 0) << "is_healthy = false, injected failure for next "
131 << sec << " seconds" << dendl;
132 healthy = false;
133 }
134
135 for (list<heartbeat_handle_d*>::iterator p = m_workers.begin();
136 p != m_workers.end();
137 ++p) {
138 heartbeat_handle_d *h = *p;
139 auto epoch = chrono::duration_cast<chrono::seconds>(now.time_since_epoch()).count();
140 if (!_check(h, "is_healthy", epoch)) {
141 healthy = false;
142 unhealthy++;
143 }
144 total++;
145 }
146 m_rwlock.unlock_shared();
147
148 m_unhealthy_workers = unhealthy;
149 m_total_workers = total;
150
151 ldout(m_cct, 20) << "is_healthy = " << (healthy ? "healthy" : "NOT HEALTHY")
152 << ", total workers: " << total << ", number of unhealthy: " << unhealthy << dendl;
153 return healthy;
154 }
155
156 int HeartbeatMap::get_unhealthy_workers() const
157 {
158 return m_unhealthy_workers;
159 }
160
161 int HeartbeatMap::get_total_workers() const
162 {
163 return m_total_workers;
164 }
165
166 void HeartbeatMap::check_touch_file()
167 {
168 string path = m_cct->_conf->heartbeat_file;
169 if (path.length() && is_healthy()) {
170 int fd = ::open(path.c_str(), O_WRONLY|O_CREAT|O_CLOEXEC, 0644);
171 if (fd >= 0) {
172 ::utimes(path.c_str(), NULL);
173 ::close(fd);
174 } else {
175 ldout(m_cct, 0) << "unable to touch " << path << ": "
176 << cpp_strerror(errno) << dendl;
177 }
178 }
179 }
180
181 }
182