1    	// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
2    	// vim: ts=8 sw=2 smarttab
3    	/*
4    	 * Ceph - scalable distributed file system
5    	 *
6    	 * Copyright (C) 2011 Sage Weil <sage@newdream.net>
7    	 *
8    	 * This is free software; you can redistribute it and/or
9    	 * modify it under the terms of the GNU Lesser General Public
10   	 * License version 2.1, as published by the Free Software 
11   	 * Foundation.  See file COPYING.
12   	 * 
13   	 */
14   	
#include <signal.h>
#include <stdlib.h>

#include <shared_mutex>

#include "HeartbeatMap.h"
#include "ceph_context.h"
#include "common/errno.h"
#include "common/valgrind.h"
#include "debug.h"
22   	
23   	#define dout_subsys ceph_subsys_heartbeatmap
24   	#undef dout_prefix
25   	#define dout_prefix *_dout << "heartbeat_map "
26   	
27   	namespace ceph {
28   	
29   	HeartbeatMap::HeartbeatMap(CephContext *cct)
30   	  : m_cct(cct),
31   	    m_unhealthy_workers(0),
32   	    m_total_workers(0)
33   	{
34   	}
35   	
(1) Event exn_spec_violation: An exception of type "_ZN5boost16exception_detail10clone_implINS0_19error_info_injectorINSt8ios_base7failureB5cxx11EEEEE" is thrown but the throw list "throw()" doesn't allow it to be thrown. This will cause a call to unexpected() which usually calls terminate().
Also see events: [fun_call_w_exception]
36   	HeartbeatMap::~HeartbeatMap()
37   	{
(2) Event fun_call_w_exception: Called function throws an exception of type "_ZN5boost16exception_detail10clone_implINS0_19error_info_injectorINSt8ios_base7failureB5cxx11EEEEE". [details]
Also see events: [exn_spec_violation]
38   	  ceph_assert(m_workers.empty());
39   	}
40   	
41   	heartbeat_handle_d *HeartbeatMap::add_worker(const string& name, pthread_t thread_id)
42   	{
43   	  std::unique_lock locker{m_rwlock};
44   	  ldout(m_cct, 10) << "add_worker '" << name << "'" << dendl;
45   	  heartbeat_handle_d *h = new heartbeat_handle_d(name);
46   	  ANNOTATE_BENIGN_RACE_SIZED(&h->timeout, sizeof(h->timeout),
47   	                             "heartbeat_handle_d timeout");
48   	  ANNOTATE_BENIGN_RACE_SIZED(&h->suicide_timeout, sizeof(h->suicide_timeout),
49   	                             "heartbeat_handle_d suicide_timeout");
50   	  m_workers.push_front(h);
51   	  h->list_item = m_workers.begin();
52   	  h->thread_id = thread_id;
53   	  return h;
54   	}
55   	
56   	void HeartbeatMap::remove_worker(const heartbeat_handle_d *h)
57   	{
58   	  std::unique_lock locker{m_rwlock};
59   	  ldout(m_cct, 10) << "remove_worker '" << h->name << "'" << dendl;
60   	  m_workers.erase(h->list_item);
61   	  delete h;
62   	}
63   	
64   	bool HeartbeatMap::_check(const heartbeat_handle_d *h, const char *who,
65   				  ceph::coarse_mono_clock::rep now)
66   	{
67   	  bool healthy = true;
68   	  auto was = h->timeout.load();
69   	  if (was && was < now) {
70   	    ldout(m_cct, 1) << who << " '" << h->name << "'"
71   			    << " had timed out after " << h->grace << dendl;
72   	    healthy = false;
73   	  }
74   	  was = h->suicide_timeout;
75   	  if (was && was < now) {
76   	    ldout(m_cct, 1) << who << " '" << h->name << "'"
77   			    << " had suicide timed out after " << h->suicide_grace << dendl;
78   	    pthread_kill(h->thread_id, SIGABRT);
79   	    sleep(1);
80   	    ceph_abort_msg("hit suicide timeout");
81   	  }
82   	  return healthy;
83   	}
84   	
85   	void HeartbeatMap::reset_timeout(heartbeat_handle_d *h,
86   					 ceph::coarse_mono_clock::rep grace,
87   					 ceph::coarse_mono_clock::rep suicide_grace)
88   	{
89   	  ldout(m_cct, 20) << "reset_timeout '" << h->name << "' grace " << grace
90   			   << " suicide " << suicide_grace << dendl;
91   	  auto now = chrono::duration_cast<chrono::seconds>(
92   		       ceph::coarse_mono_clock::now().time_since_epoch()).count();
93   	  _check(h, "reset_timeout", now);
94   	
95   	  h->timeout = now + grace;
96   	  h->grace = grace;
97   	
98   	  if (suicide_grace)
99   	    h->suicide_timeout = now + suicide_grace;
100  	  else
101  	    h->suicide_timeout = 0;
102  	  h->suicide_grace = suicide_grace;
103  	}
104  	
105  	void HeartbeatMap::clear_timeout(heartbeat_handle_d *h)
106  	{
107  	  ldout(m_cct, 20) << "clear_timeout '" << h->name << "'" << dendl;
108  	  auto now = chrono::duration_cast<std::chrono::seconds>(
109  		       ceph::coarse_mono_clock::now().time_since_epoch()).count();
110  	  _check(h, "clear_timeout", now);
111  	  h->timeout = 0;
112  	  h->suicide_timeout = 0;
113  	}
114  	
115  	bool HeartbeatMap::is_healthy()
116  	{
117  	  int unhealthy = 0;
118  	  int total = 0;
119  	  m_rwlock.lock_shared();
120  	  auto now = ceph::coarse_mono_clock::now();
121  	  if (m_cct->_conf->heartbeat_inject_failure) {
122  	    ldout(m_cct, 0) << "is_healthy injecting failure for next " << m_cct->_conf->heartbeat_inject_failure << " seconds" << dendl;
123  	    m_inject_unhealthy_until = now + std::chrono::seconds(m_cct->_conf->heartbeat_inject_failure);
124  	    m_cct->_conf.set_val("heartbeat_inject_failure", "0");
125  	  }
126  	
127  	  bool healthy = true;
128  	  if (now < m_inject_unhealthy_until) {
129  	    auto sec = std::chrono::duration_cast<std::chrono::seconds>(m_inject_unhealthy_until - now).count();
130  	    ldout(m_cct, 0) << "is_healthy = false, injected failure for next "
131  	                    << sec << " seconds" << dendl;
132  	    healthy = false;
133  	  }
134  	
135  	  for (list<heartbeat_handle_d*>::iterator p = m_workers.begin();
136  	       p != m_workers.end();
137  	       ++p) {
138  	    heartbeat_handle_d *h = *p;
139  	    auto epoch = chrono::duration_cast<chrono::seconds>(now.time_since_epoch()).count();
140  	    if (!_check(h, "is_healthy", epoch)) {
141  	      healthy = false;
142  	      unhealthy++;
143  	    }
144  	    total++;
145  	  }
146  	  m_rwlock.unlock_shared();
147  	
148  	  m_unhealthy_workers = unhealthy;
149  	  m_total_workers = total;
150  	
151  	  ldout(m_cct, 20) << "is_healthy = " << (healthy ? "healthy" : "NOT HEALTHY")
152  	    << ", total workers: " << total << ", number of unhealthy: " << unhealthy << dendl;
153  	  return healthy;
154  	}
155  	
156  	int HeartbeatMap::get_unhealthy_workers() const
157  	{
158  	  return m_unhealthy_workers;
159  	}
160  	
161  	int HeartbeatMap::get_total_workers() const
162  	{
163  	  return m_total_workers;
164  	}
165  	
166  	void HeartbeatMap::check_touch_file()
167  	{
168  	  string path = m_cct->_conf->heartbeat_file;
169  	  if (path.length() && is_healthy()) {
170  	    int fd = ::open(path.c_str(), O_WRONLY|O_CREAT|O_CLOEXEC, 0644);
171  	    if (fd >= 0) {
172  	      ::utimes(path.c_str(), NULL);
173  	      ::close(fd);
174  	    } else {
175  	      ldout(m_cct, 0) << "unable to touch " << path << ": "
176  	                     << cpp_strerror(errno) << dendl;
177  	    }
178  	  }
179  	}
180  	
181  	}
182