1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2016 XSKY <haomai@xsky.com>
7 *
8 * Author: Haomai Wang <haomaiwang@gmail.com>
9 *
10 * This is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License version 2.1, as published by the Free Software
13 * Foundation. See file COPYING.
14 *
15 */
16
17 #ifndef CEPH_INFINIBAND_H
18 #define CEPH_INFINIBAND_H
19
20 #include <boost/pool/pool.hpp>
21 // need this because boost messes with ceph log/assert definitions
22 #include "include/ceph_assert.h"
23
24 #include <infiniband/verbs.h>
25 #include <rdma/rdma_cma.h>
26
27 #include <atomic>
28 #include <string>
29 #include <vector>
30
31 #include "include/int_types.h"
32 #include "include/page.h"
33 #include "common/debug.h"
34 #include "common/errno.h"
35 #include "common/ceph_mutex.h"
36 #include "common/perf_counters.h"
37 #include "msg/msg_types.h"
38 #include "msg/async/net_handler.h"
39
40 #define HUGE_PAGE_SIZE_2MB (2 * 1024 * 1024)
41 #define ALIGN_TO_PAGE_2MB(x) \
42 (((x) + (HUGE_PAGE_SIZE_2MB - 1)) & ~(HUGE_PAGE_SIZE_2MB - 1))
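// A couple of worked values for the macro above (illustrative only):
//   ALIGN_TO_PAGE_2MB(1)               == 2 * 1024 * 1024
//   ALIGN_TO_PAGE_2MB(5 * 1024 * 1024) == 6 * 1024 * 1024
//   ALIGN_TO_PAGE_2MB(4 * 1024 * 1024) == 4 * 1024 * 1024   // already aligned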
43
44 #define PSN_LEN 24
45 #define PSN_MSK ((1 << PSN_LEN) - 1)
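// PSNs are 24-bit quantities, so PSN arithmetic must wrap modulo 2^24,
// e.g. (0xFFFFFF + 1) & PSN_MSK == 0.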
46
47 #define BEACON_WRID 0xDEADBEEF
48
49 struct ib_cm_meta_t {
50 uint16_t lid;
51 uint32_t local_qpn;
52 uint32_t psn;
53 uint32_t peer_qpn;
54 union ibv_gid gid;
55 } __attribute__((packed));
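// Note: with the packed attribute, sizeof(ib_cm_meta_t) is
// 2 + 4 + 4 + 4 + 16 = 30 bytes (union ibv_gid is 16 bytes). This is the
// connection metadata exchanged out of band during connection setup; see
// QueuePair::send_cm_meta()/recv_cm_meta() below.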
56
57 class RDMAStack;
58 class CephContext;
59
60 class Port {
61 struct ibv_context* ctxt;
62 int port_num;
63 struct ibv_port_attr port_attr;
64 uint16_t lid;
65 int gid_idx = 0;
66 union ibv_gid gid;
67
68 public:
69 explicit Port(CephContext *cct, struct ibv_context* ictxt, uint8_t ipn);
70 uint16_t get_lid() { return lid; }
71 ibv_gid get_gid() { return gid; }
72 int get_port_num() { return port_num; }
73 ibv_port_attr* get_port_attr() { return &port_attr; }
74 int get_gid_idx() { return gid_idx; }
75 };
76
77
78 class Device {
79 ibv_device *device;
80 const char* name;
81 uint8_t port_cnt = 0;
82 public:
83 explicit Device(CephContext *c, ibv_device* ib_dev);
84 explicit Device(CephContext *c, ibv_context *ib_ctx);
(1) Event exn_spec_violation:
An exception of type
"boost::exception_detail::clone_impl<boost::exception_detail::error_info_injector<std::ios_base::failure>>"
is thrown, but the throw list "throw()" doesn't allow it to be thrown. This will
cause a call to unexpected(), which usually calls terminate().
Also see events: [fun_call_w_exception]
85 ~Device() {
86 if (active_port) {
87 delete active_port;
(2) Event fun_call_w_exception:
The called function throws an exception of type
"boost::exception_detail::clone_impl<boost::exception_detail::error_info_injector<std::ios_base::failure>>".
Also see events: [exn_spec_violation]
88 ceph_assert(ibv_close_device(ctxt) == 0);
89 }
90 }
91 const char* get_name() { return name;}
92 uint16_t get_lid() { return active_port->get_lid(); }
93 ibv_gid get_gid() { return active_port->get_gid(); }
94 int get_gid_idx() { return active_port->get_gid_idx(); }
95 void binding_port(CephContext *c, int port_num);
96 struct ibv_context *ctxt;
97 ibv_device_attr device_attr;
98 Port* active_port;
99 };
100
101
102 class DeviceList {
103 struct ibv_device ** device_list;
104 struct ibv_context ** device_context_list;
105 int num;
106 Device** devices;
107 public:
108 explicit DeviceList(CephContext *cct): device_list(nullptr), device_context_list(nullptr),
109 num(0), devices(nullptr) {
110 device_list = ibv_get_device_list(&num);
111 ceph_assert(device_list);
112 ceph_assert(num);
113 if (cct->_conf->ms_async_rdma_cm) {
114 device_context_list = rdma_get_devices(NULL);
115 ceph_assert(device_context_list);
116 }
117 devices = new Device*[num];
118
119 for (int i = 0; i < num; ++i) {
120 if (cct->_conf->ms_async_rdma_cm) {
121 devices[i] = new Device(cct, device_context_list[i]);
122 } else {
123 devices[i] = new Device(cct, device_list[i]);
124 }
125 }
126 }
127 ~DeviceList() {
128 for (int i=0; i < num; ++i) {
129 delete devices[i];
130 }
131 delete []devices;
132 ibv_free_device_list(device_list);
133 rdma_free_devices(device_context_list);
134 }
135
136 Device* get_device(const char* device_name) {
137 for (int i = 0; i < num; ++i) {
138 if (!strlen(device_name) || !strcmp(device_name, devices[i]->get_name())) {
139 return devices[i];
140 }
141 }
142 return NULL;
143 }
144 };
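// Illustrative lookup behaviour of get_device() above (the device name used
// here is hypothetical): an empty name matches the first enumerated device,
// otherwise the name must match exactly.
//
//   DeviceList devs(cct);
//   Device *any = devs.get_device("");        // first device in the list
//   Device *dev = devs.get_device("mlx5_0");  // NULL if no such device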
145
146 // stat counters
147 enum {
148 l_msgr_rdma_dispatcher_first = 94000,
149
150 l_msgr_rdma_polling,
151 l_msgr_rdma_inflight_tx_chunks,
152 l_msgr_rdma_rx_bufs_in_use,
153 l_msgr_rdma_rx_bufs_total,
154
155 l_msgr_rdma_tx_total_wc,
156 l_msgr_rdma_tx_total_wc_errors,
157 l_msgr_rdma_tx_wc_retry_errors,
158 l_msgr_rdma_tx_wc_wr_flush_errors,
159
160 l_msgr_rdma_rx_total_wc,
161 l_msgr_rdma_rx_total_wc_errors,
162 l_msgr_rdma_rx_fin,
163
164 l_msgr_rdma_handshake_errors,
165
166 l_msgr_rdma_total_async_events,
167 l_msgr_rdma_async_last_wqe_events,
168
169 l_msgr_rdma_created_queue_pair,
170 l_msgr_rdma_active_queue_pair,
171
172 l_msgr_rdma_dispatcher_last,
173 };
174
175 enum {
176 l_msgr_rdma_first = 95000,
177
178 l_msgr_rdma_tx_no_mem,
179 l_msgr_rdma_tx_parital_mem,
180 l_msgr_rdma_tx_failed,
181
182 l_msgr_rdma_tx_chunks,
183 l_msgr_rdma_tx_bytes,
184 l_msgr_rdma_rx_chunks,
185 l_msgr_rdma_rx_bytes,
186 l_msgr_rdma_pending_sent_conns,
187
188 l_msgr_rdma_last,
189 };
190
191 class RDMADispatcher;
192
193 class Infiniband {
194 public:
195 class ProtectionDomain {
196 public:
197 explicit ProtectionDomain(CephContext *cct, Device *device);
198 ~ProtectionDomain();
199
200 ibv_pd* const pd;
201 };
202
203 class QueuePair;
204 class MemoryManager {
205 public:
206 class Chunk {
207 public:
208 Chunk(ibv_mr* m, uint32_t bytes, char* buffer, uint32_t offset = 0, uint32_t bound = 0, uint32_t lkey = 0, QueuePair* qp = nullptr);
209 ~Chunk();
210
211 uint32_t get_offset();
212 uint32_t get_size() const;
213 void prepare_read(uint32_t b);
214 uint32_t get_bound();
215 uint32_t read(char* buf, uint32_t len);
216 uint32_t write(char* buf, uint32_t len);
217 bool full();
218 void reset_read_chunk();
219 void reset_write_chunk();
220 void set_qp(QueuePair *qp) { this->qp = qp; }
221 void clear_qp() { set_qp(nullptr); }
222 QueuePair* get_qp() { return qp; }
223
224 public:
225 ibv_mr* mr;
226 QueuePair *qp;
227 uint32_t lkey;
228 uint32_t bytes;
229 uint32_t offset;
230 uint32_t bound;
231 char* buffer; // TODO: remove buffer/refactor TX
232 char data[0];
233 };
234
235 class Cluster {
236 public:
237 Cluster(MemoryManager& m, uint32_t s);
238 ~Cluster();
239
240 int fill(uint32_t num);
241 void take_back(std::vector<Chunk*> &ck);
242 int get_buffers(std::vector<Chunk*> &chunks, size_t bytes);
243 Chunk *get_chunk_by_buffer(const char *c) {
244 uint32_t idx = (c - base) / buffer_size;
245 Chunk *chunk = chunk_base + idx;
246 return chunk;
247 }
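// Example of the index computation above (illustrative numbers): with
// buffer_size == 8192, a pointer c == base + 2 * 8192 + 100 gives
// idx == 2, i.e. the chunk at chunk_base + 2.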
248 bool is_my_buffer(const char *c) const {
249 return c >= base && c < end;
250 }
251
252 MemoryManager& manager;
253 uint32_t buffer_size;
254 uint32_t num_chunk = 0;
255 ceph::mutex lock = ceph::make_mutex("cluster_lock");
256 std::vector<Chunk*> free_chunks;
257 char *base = nullptr;
258 char *end = nullptr;
259 Chunk* chunk_base = nullptr;
260 };
261
262 class MemPoolContext {
263 PerfCounters *perf_logger;
264
265 public:
266 MemoryManager *manager;
267 unsigned n_bufs_allocated;
268 // true if it is possible to alloc
269 // more memory for the pool
270 explicit MemPoolContext(MemoryManager *m) :
271 perf_logger(nullptr),
272 manager(m),
273 n_bufs_allocated(0) {}
274 bool can_alloc(unsigned nbufs);
275 void update_stats(int val);
276 void set_stat_logger(PerfCounters *logger);
277 };
278
279 class PoolAllocator {
280 struct mem_info {
281 ibv_mr *mr;
282 MemPoolContext *ctx;
283 unsigned nbufs;
284 Chunk chunks[0];
285 };
286 public:
287 typedef std::size_t size_type;
288 typedef std::ptrdiff_t difference_type;
289
290 static char * malloc(const size_type bytes);
291 static void free(char * const block);
292
293 static MemPoolContext *g_ctx;
294 static ceph::mutex lock;
295 };
296
297 /**
298 * modify boost pool so that it is possible to
299 * have a thread-safe 'context' when allocating/freeing
300 * the memory. It is needed to allow different pool
301 * configurations and bookkeeping per CephContext and
302 * also to be able to use the same allocator for both the
303 * RX and TX pools (see the sketch after this class).
304 * TODO: use boost pool to allocate TX chunks too
305 */
306 class mem_pool : public boost::pool<PoolAllocator> {
307 private:
308 MemPoolContext *ctx;
309 void *slow_malloc();
310
311 public:
312 ceph::mutex lock = ceph::make_mutex("mem_pool_lock");
313 explicit mem_pool(MemPoolContext *ctx, const size_type nrequested_size,
314 const size_type nnext_size = 32,
315 const size_type nmax_size = 0) :
316 pool(nrequested_size, nnext_size, nmax_size),
317 ctx(ctx) { }
318
319 void *malloc() {
320 if (!store().empty())
321 return (store().malloc)();
322 // need to alloc more memory...
323 // slow path code
324 return slow_malloc();
325 }
326 };
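// A minimal sketch of how the static allocator can be handed a per-call
// context in this design (an assumption for illustration only, not the
// actual slow_malloc() implementation in Infiniband.cc): publish the
// MemPoolContext through PoolAllocator::g_ctx while holding
// PoolAllocator::lock, then let boost::pool grow itself through
// PoolAllocator::malloc(), which can consult g_ctx for limits and stats.
//
//   // hypothetical helper, names not part of this header's API:
//   void *alloc_with_ctx(boost::pool<PoolAllocator> &p, MemPoolContext *ctx) {
//     std::lock_guard l{PoolAllocator::lock};  // serialize access to g_ctx
//     PoolAllocator::g_ctx = ctx;              // context for this allocation
//     void *ret = p.malloc();                  // may call PoolAllocator::malloc()
//     PoolAllocator::g_ctx = nullptr;
//     return ret;
//   }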
327
328 MemoryManager(CephContext *c, Device *d, ProtectionDomain *p);
329 ~MemoryManager();
330
331 void* malloc(size_t size);
332 void free(void *ptr);
333
334 void create_tx_pool(uint32_t size, uint32_t tx_num);
335 void return_tx(std::vector<Chunk*> &chunks);
336 int get_send_buffers(std::vector<Chunk*> &c, size_t bytes);
337 bool is_tx_buffer(const char* c) { return send->is_my_buffer(c); }
338 Chunk *get_tx_chunk_by_buffer(const char *c) {
339 return send->get_chunk_by_buffer(c);
340 }
341 uint32_t get_tx_buffer_size() const {
342 return send->buffer_size;
343 }
344
345 Chunk *get_rx_buffer() {
346 std::lock_guard l{rxbuf_pool.lock};
347 return reinterpret_cast<Chunk *>(rxbuf_pool.malloc());
348 }
349
350 void release_rx_buffer(Chunk *chunk) {
351 std::lock_guard l{rxbuf_pool.lock};
352 chunk->clear_qp();
353 rxbuf_pool.free(chunk);
354 }
355
356 void set_rx_stat_logger(PerfCounters *logger) {
357 rxbuf_pool_ctx.set_stat_logger(logger);
358 }
359
360 CephContext *cct;
361 private:
362 // TODO: Cluster -> TxPool txbuf_pool
363 // chunk layout fix
364 //
365 Cluster* send = nullptr; // SEND
366 Device *device;
367 ProtectionDomain *pd;
368 MemPoolContext rxbuf_pool_ctx;
369 mem_pool rxbuf_pool;
370
371
372 void* huge_pages_malloc(size_t size);
373 void huge_pages_free(void *ptr);
374 };
375
376 private:
377 uint32_t tx_queue_len = 0;
378 uint32_t rx_queue_len = 0;
379 uint32_t max_sge = 0;
380 uint8_t ib_physical_port = 0;
381 MemoryManager* memory_manager = nullptr;
382 ibv_srq* srq = nullptr; // shared receive work queue
383 Device *device = NULL;
384 ProtectionDomain *pd = NULL;
385 DeviceList *device_list = nullptr;
386 CephContext *cct;
387 ceph::mutex lock = ceph::make_mutex("IB lock");
388 bool initialized = false;
389 const std::string &device_name;
390 uint8_t port_num;
391 bool support_srq = false;
392
393 public:
394 explicit Infiniband(CephContext *c);
395 ~Infiniband();
396 void init();
397 static void verify_prereq(CephContext *cct);
398
399 class CompletionChannel {
400 static const uint32_t MAX_ACK_EVENT = 5000;
401 CephContext *cct;
402 Infiniband& infiniband;
403 ibv_comp_channel *channel;
404 ibv_cq *cq;
405 uint32_t cq_events_that_need_ack;
406
407 public:
408 CompletionChannel(CephContext *c, Infiniband &ib);
409 ~CompletionChannel();
410 int init();
411 bool get_cq_event();
412 int get_fd() { return channel->fd; }
413 ibv_comp_channel* get_channel() { return channel; }
414 void bind_cq(ibv_cq *c) { cq = c; }
415 void ack_events();
416 };
417
418 // this class encapsulates the creation, use, and destruction of an RC
419 // completion queue.
420 //
421 // You need to call init() and it will create a cq and associate it with the completion channel
422 class CompletionQueue {
423 public:
424 CompletionQueue(CephContext *c, Infiniband &ib,
425 const uint32_t qd, CompletionChannel *cc)
426 : cct(c), infiniband(ib), channel(cc), cq(NULL), queue_depth(qd) {}
427 ~CompletionQueue();
428 int init();
429 int poll_cq(int num_entries, ibv_wc *ret_wc_array);
430
431 ibv_cq* get_cq() const { return cq; }
432 int rearm_notify(bool solicited_only=true);
433 CompletionChannel* get_cc() const { return channel; }
434 private:
435 CephContext *cct;
436 Infiniband& infiniband; // Infiniband to which this CQ belongs
437 CompletionChannel *channel;
438 ibv_cq *cq;
439 uint32_t queue_depth;
440 };
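// A rough sketch of how CompletionChannel and CompletionQueue are driven
// together (illustrative only, error handling omitted; see
// create_comp_channel()/create_comp_queue() further down):
//
//   CompletionChannel *cc = ib.create_comp_channel(cct);
//   CompletionQueue *cq = ib.create_comp_queue(cct, cc);
//   // wait for readability on cc->get_fd(), then:
//   if (cc->get_cq_event()) {
//     ibv_wc wc[32];
//     int n = cq->poll_cq(32, wc);   // reap up to 32 completions into wc[0..n)
//     cq->rearm_notify();            // request the next completion event
//     cc->ack_events();              // acknowledge accumulated CQ events
//   }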
441
442 // this class encapsulates the creation, use, and destruction of an RC
443 // queue pair.
444 //
445 // you need to call init() and it will create a qp and bring it to the INIT state.
446 // after obtaining the lid, qpn, and psn of the remote queue pair, one must
447 // bring the queue pair through RTR to RTS via modify_qp_to_rtr() and modify_qp_to_rts().
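// A rough outline of the bring-up sequence implied by the comment above
// (illustrative only, error checking omitted; the connection metadata is
// exchanged out of band through the socket_fd arguments):
//
//   qp->init();                       // create the verbs QP and move it to INIT
//   qp->send_cm_meta(cct, socket_fd); // send local lid/qpn/psn/gid to the peer
//   qp->recv_cm_meta(cct, socket_fd); // learn the peer's lid/qpn/psn/gid
//   qp->modify_qp_to_rtr();           // INIT -> RTR using the peer's metadata
//   qp->modify_qp_to_rts();           // RTR -> RTS, ready to post sends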
448 class QueuePair {
449 public:
450 typedef MemoryManager::Chunk Chunk;
451 QueuePair(CephContext *c, Infiniband& infiniband, ibv_qp_type type,
452 int ib_physical_port, ibv_srq *srq,
453 Infiniband::CompletionQueue* txcq,
454 Infiniband::CompletionQueue* rxcq,
455 uint32_t tx_queue_len, uint32_t max_recv_wr, struct rdma_cm_id *cid, uint32_t q_key = 0);
456 ~QueuePair();
457
458 int modify_qp_to_error();
459 int modify_qp_to_rts();
460 int modify_qp_to_rtr();
461 int modify_qp_to_init();
462 int init();
463
464 /**
465 * Get the initial packet sequence number for this QueuePair.
466 * This is randomly generated on creation. It should not be confused
467 * with the remote side's PSN, which is learned from the peer during connection setup.
468 */
469 uint32_t get_initial_psn() const { return initial_psn; };
470 /**
471 * Get the local queue pair number for this QueuePair.
472 * QPNs are analogous to UDP/TCP port numbers.
473 */
474 uint32_t get_local_qp_number() const { return qp->qp_num; };
475 /**
476 * Get the remote queue pair number for this QueuePair, as learned during connection setup.
477 * QPNs are analogous to UDP/TCP port numbers.
478 */
479 int get_remote_qp_number(uint32_t *rqp) const;
480 /**
481 * Get the remote infiniband address for this QueuePair, as learned during connection setup.
482 * LIDs are "local IDs" in infiniband terminology. They are short, locally
483 * routable addresses.
484 */
485 int get_remote_lid(uint16_t *lid) const;
486 /**
487 * Get the state of a QueuePair.
488 */
489 int get_state() const;
490 /*
491 * send/receive connection management meta data
492 */
493 int send_cm_meta(CephContext *cct, int socket_fd);
494 int recv_cm_meta(CephContext *cct, int socket_fd);
495 void wire_gid_to_gid(const char *wgid, ib_cm_meta_t* cm_meta_data);
496 void gid_to_wire_gid(const ib_cm_meta_t& cm_meta_data, char wgid[]);
497 ibv_qp* get_qp() const { return qp; }
498 Infiniband::CompletionQueue* get_tx_cq() const { return txcq; }
499 Infiniband::CompletionQueue* get_rx_cq() const { return rxcq; }
500 int to_dead();
501 bool is_dead() const { return dead; }
502 ib_cm_meta_t& get_peer_cm_meta() { return peer_cm_meta; }
503 ib_cm_meta_t& get_local_cm_meta() { return local_cm_meta; }
504 void add_rq_wr(Chunk* chunk)
505 {
506 if (srq) return;
507
508 std::lock_guard l{lock};
509 recv_queue.push_back(chunk);
510 }
511
512 void remove_rq_wr(Chunk* chunk) {
513 if (srq) return;
514
515 std::lock_guard l{lock};
516 auto it = std::find(recv_queue.begin(), recv_queue.end(), chunk);
517 ceph_assert(it != recv_queue.end());
518 recv_queue.erase(it);
519 }
520 ibv_srq* get_srq() const { return srq; }
521
522 private:
523 CephContext *cct;
524 Infiniband& infiniband; // Infiniband to which this QP belongs
525 ibv_qp_type type; // QP type (IBV_QPT_RC, etc.)
526 ibv_context* ctxt; // device context of the HCA to use
527 int ib_physical_port;
528 ibv_pd* pd; // protection domain
529 ibv_srq* srq; // shared receive queue
530 ibv_qp* qp; // infiniband verbs QP handle
531 struct rdma_cm_id *cm_id;
532 ib_cm_meta_t peer_cm_meta;
533 ib_cm_meta_t local_cm_meta;
534 Infiniband::CompletionQueue* txcq;
535 Infiniband::CompletionQueue* rxcq;
536 uint32_t initial_psn; // initial packet sequence number
537 uint32_t max_send_wr;
538 uint32_t max_recv_wr;
539 uint32_t q_key;
540 bool dead;
541 std::vector<Chunk*> recv_queue;
542 ceph::mutex lock = ceph::make_mutex("queue_pair_lock");
543 };
544
545 public:
546 typedef MemoryManager::Cluster Cluster;
547 typedef MemoryManager::Chunk Chunk;
548 QueuePair* create_queue_pair(CephContext *c, CompletionQueue*, CompletionQueue*,
549 ibv_qp_type type, struct rdma_cm_id *cm_id);
550 ibv_srq* create_shared_receive_queue(uint32_t max_wr, uint32_t max_sge);
551 // post rx buffers to srq, return number of buffers actually posted
552 int post_chunks_to_rq(int num, QueuePair *qp = nullptr);
553 void post_chunk_to_pool(Chunk* chunk) {
554 QueuePair *qp = chunk->get_qp();
555 if (qp != nullptr) {
556 qp->remove_rq_wr(chunk);
557 }
558 get_memory_manager()->release_rx_buffer(chunk);
559 }
560 int get_tx_buffers(std::vector<Chunk*> &c, size_t bytes);
561 CompletionChannel *create_comp_channel(CephContext *c);
562 CompletionQueue *create_comp_queue(CephContext *c, CompletionChannel *cc=NULL);
563 uint8_t get_ib_physical_port() { return ib_physical_port; }
564 uint16_t get_lid() { return device->get_lid(); }
565 ibv_gid get_gid() { return device->get_gid(); }
566 MemoryManager* get_memory_manager() { return memory_manager; }
567 Device* get_device() { return device; }
568 int get_async_fd() { return device->ctxt->async_fd; }
569 bool is_tx_buffer(const char* c) { return memory_manager->is_tx_buffer(c);}
570 Chunk *get_tx_chunk_by_buffer(const char *c) { return memory_manager->get_tx_chunk_by_buffer(c); }
571 static const char* wc_status_to_string(int status);
572 static const char* qp_state_string(int status);
573 uint32_t get_rx_queue_len() const { return rx_queue_len; }
574 };
575
576 #endif
577