1    	// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
2    	// vim: ts=8 sw=2 smarttab
3    	/*
4    	 * Ceph - scalable distributed file system
5    	 *
6    	 * Copyright (C) 2016 XSKY <haomai@xsky.com>
7    	 *
8    	 * Author: Haomai Wang <haomaiwang@gmail.com>
9    	 *
10   	 * This is free software; you can redistribute it and/or
11   	 * modify it under the terms of the GNU Lesser General Public
12   	 * License version 2.1, as published by the Free Software
13   	 * Foundation.  See file COPYING.
14   	 *
15   	 */
16   	
17   	#ifndef CEPH_INFINIBAND_H
18   	#define CEPH_INFINIBAND_H
19   	
20   	#include <boost/pool/pool.hpp>
21   	// need this because boost messes with ceph log/assert definitions
22   	#include "include/ceph_assert.h"
23   	
24   	#include <infiniband/verbs.h>
25   	#include <rdma/rdma_cma.h>
26   	
#include <algorithm>
27   	#include <atomic>
#include <cstring>
28   	#include <string>
29   	#include <vector>
30   	
31   	#include "include/int_types.h"
32   	#include "include/page.h"
33   	#include "common/debug.h"
34   	#include "common/errno.h"
35   	#include "common/ceph_mutex.h"
36   	#include "common/perf_counters.h"
37   	#include "msg/msg_types.h"
38   	#include "msg/async/net_handler.h"
39   	
40   	#define HUGE_PAGE_SIZE_2MB (2 * 1024 * 1024)
41   	#define ALIGN_TO_PAGE_2MB(x) \
42   	    (((x) + (HUGE_PAGE_SIZE_2MB - 1)) & ~(HUGE_PAGE_SIZE_2MB - 1))
43   	
44   	#define PSN_LEN 24
45   	#define PSN_MSK ((1 << PSN_LEN) - 1)
46   	
47   	#define BEACON_WRID 0xDEADBEEF
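
// Illustrative values for the helpers above (they follow directly from the
// definitions): ALIGN_TO_PAGE_2MB rounds a size up to the next 2 MiB huge-page
// boundary, and PSN_MSK confines packet sequence numbers to their 24-bit field.
//
//   ALIGN_TO_PAGE_2MB(1)                  == 0x200000   (one 2 MiB page)
//   ALIGN_TO_PAGE_2MB(HUGE_PAGE_SIZE_2MB) == 0x200000   (already aligned)
//   PSN_MSK                               == 0x00ffffff (24-bit mask)
//
// BEACON_WRID is just a recognizable sentinel wr_id value, presumably so the
// completion of a "beacon" work request can be told apart from normal traffic.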
48   	
49   	struct ib_cm_meta_t {
50   	  uint16_t lid;
51   	  uint32_t local_qpn;
52   	  uint32_t psn;
53   	  uint32_t peer_qpn;
54   	  union ibv_gid gid;
55   	} __attribute__((packed));
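
// ib_cm_meta_t is the connection-handshake blob a QueuePair exchanges with its
// peer (see QueuePair::send_cm_meta()/recv_cm_meta() below): the local LID,
// QPN, starting PSN and GID, plus the peer's QPN once it is known. It is
// packed so that both ends agree on the wire layout.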
56   	
57   	class RDMAStack;
58   	class CephContext;
59   	
60   	class Port {
61   	  struct ibv_context* ctxt;
62   	  int port_num;
63   	  struct ibv_port_attr port_attr;
64   	  uint16_t lid;
65   	  int gid_idx = 0;
66   	  union ibv_gid gid;
67   	
68   	 public:
69   	  explicit Port(CephContext *cct, struct ibv_context* ictxt, uint8_t ipn);
70   	  uint16_t get_lid() { return lid; }
71   	  ibv_gid  get_gid() { return gid; }
72   	  int get_port_num() { return port_num; }
73   	  ibv_port_attr* get_port_attr() { return &port_attr; }
74   	  int get_gid_idx() { return gid_idx; }
75   	};
76   	
77   	
78   	class Device {
79   	  ibv_device *device;
80   	  const char* name;
81   	  uint8_t  port_cnt = 0;
82   	 public:
83   	  explicit Device(CephContext *c, ibv_device* ib_dev);
84   	  explicit Device(CephContext *c, ibv_context *ib_ctx);
85   	  ~Device() {
86   	    if (active_port) {
87   	      delete active_port;
88   	      ceph_assert(ibv_close_device(ctxt) == 0);
89   	    }
90   	  }
91   	  const char* get_name() { return name;}
92   	  uint16_t get_lid() { return active_port->get_lid(); }
93   	  ibv_gid get_gid() { return active_port->get_gid(); }
94   	  int get_gid_idx() { return active_port->get_gid_idx(); }
95   	  void binding_port(CephContext *c, int port_num);
96   	  struct ibv_context *ctxt;
97   	  ibv_device_attr device_attr;
98   	  Port* active_port;
99   	};
100  	
101  	
102  	class DeviceList {
103  	  struct ibv_device ** device_list;
104  	  struct ibv_context ** device_context_list;
105  	  int num;
106  	  Device** devices;
107  	 public:
108  	  explicit DeviceList(CephContext *cct): device_list(nullptr), device_context_list(nullptr),
109  	                                         num(0), devices(nullptr) {
110  	    device_list = ibv_get_device_list(&num);
111  	    ceph_assert(device_list);
112  	    ceph_assert(num);
113  	    if (cct->_conf->ms_async_rdma_cm) {
114  	        device_context_list = rdma_get_devices(NULL);
115  	        ceph_assert(device_context_list);
116  	    }
117  	    devices = new Device*[num];
118  	
119  	    for (int i = 0; i < num; ++i) {
120  	      if (cct->_conf->ms_async_rdma_cm) {
121  	         devices[i] = new Device(cct, device_context_list[i]);
122  	      } else {
123  	         devices[i] = new Device(cct, device_list[i]);
124  	      }
125  	    }
126  	  }
127  	  ~DeviceList() {
128  	    for (int i=0; i < num; ++i) {
129  	      delete devices[i];
130  	    }
131  	    delete []devices;
132  	    ibv_free_device_list(device_list);
133  	    rdma_free_devices(device_context_list);
134  	  }
135  	
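  // Note: an empty device_name matches the first device returned by
  // ibv_get_device_list()/rdma_get_devices(), so configurations that do not
  // pin a specific HCA simply use the first one found.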
136  	  Device* get_device(const char* device_name) {
137  	    for (int i = 0; i < num; ++i) {
138  	      if (!strlen(device_name) || !strcmp(device_name, devices[i]->get_name())) {
139  	        return devices[i];
140  	      }
141  	    }
142  	    return NULL;
143  	  }
144  	};
145  	
146  	// stat counters
147  	enum {
148  	  l_msgr_rdma_dispatcher_first = 94000,
149  	
150  	  l_msgr_rdma_polling,
151  	  l_msgr_rdma_inflight_tx_chunks,
152  	  l_msgr_rdma_rx_bufs_in_use,
153  	  l_msgr_rdma_rx_bufs_total,
154  	
155  	  l_msgr_rdma_tx_total_wc,
156  	  l_msgr_rdma_tx_total_wc_errors,
157  	  l_msgr_rdma_tx_wc_retry_errors,
158  	  l_msgr_rdma_tx_wc_wr_flush_errors,
159  	
160  	  l_msgr_rdma_rx_total_wc,
161  	  l_msgr_rdma_rx_total_wc_errors,
162  	  l_msgr_rdma_rx_fin,
163  	
164  	  l_msgr_rdma_handshake_errors,
165  	
166  	  l_msgr_rdma_total_async_events,
167  	  l_msgr_rdma_async_last_wqe_events,
168  	
169  	  l_msgr_rdma_created_queue_pair,
170  	  l_msgr_rdma_active_queue_pair,
171  	
172  	  l_msgr_rdma_dispatcher_last,
173  	};
174  	
175  	enum {
176  	  l_msgr_rdma_first = 95000,
177  	
178  	  l_msgr_rdma_tx_no_mem,
179  	  l_msgr_rdma_tx_parital_mem,
180  	  l_msgr_rdma_tx_failed,
181  	
182  	  l_msgr_rdma_tx_chunks,
183  	  l_msgr_rdma_tx_bytes,
184  	  l_msgr_rdma_rx_chunks,
185  	  l_msgr_rdma_rx_bytes,
186  	  l_msgr_rdma_pending_sent_conns,
187  	
188  	  l_msgr_rdma_last,
189  	};
190  	
191  	class RDMADispatcher;
192  	
193  	class Infiniband {
194  	 public:
195  	  class ProtectionDomain {
196  	   public:
197  	    explicit ProtectionDomain(CephContext *cct, Device *device);
198  	    ~ProtectionDomain();
199  	
200  	    ibv_pd* const pd;
201  	  };
202  	
203  	  class QueuePair;
204  	  class MemoryManager {
205  	   public:
206  	    class Chunk {
207  	     public:
208  	      Chunk(ibv_mr* m, uint32_t bytes, char* buffer, uint32_t offset = 0, uint32_t bound = 0, uint32_t lkey = 0, QueuePair* qp = nullptr);
209  	      ~Chunk();
210  	
211  	      uint32_t get_offset();
212  	      uint32_t get_size() const;
213  	      void prepare_read(uint32_t b);
214  	      uint32_t get_bound();
215  	      uint32_t read(char* buf, uint32_t len);
216  	      uint32_t write(char* buf, uint32_t len);
217  	      bool full();
218  	      void reset_read_chunk();
219  	      void reset_write_chunk();
220  	      void set_qp(QueuePair *qp) { this->qp = qp; }
221  	      void clear_qp() { set_qp(nullptr); }
222  	      QueuePair* get_qp() { return qp; }
223  	
224  	     public:
225  	      ibv_mr* mr;
226  	      QueuePair *qp;
227  	      uint32_t lkey;
228  	      uint32_t bytes;
229  	      uint32_t offset;
230  	      uint32_t bound;
231  	      char* buffer; // TODO: remove buffer/refactor TX
232  	      char  data[0];
233  	    };
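
    // Rough semantics (illustrative): a Chunk is a bounded buffer over a
    // registered memory region. For RX, prepare_read(n) publishes how many
    // bytes the HCA delivered and read() then drains them; for TX, write()
    // fills the buffer up to its capacity and full() tells the caller to grab
    // another chunk. reset_read_chunk()/reset_write_chunk() recycle the chunk.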
234  	
235  	    class Cluster {
236  	     public:
237  	      Cluster(MemoryManager& m, uint32_t s);
238  	      ~Cluster();
239  	
240  	      int fill(uint32_t num);
241  	      void take_back(std::vector<Chunk*> &ck);
242  	      int get_buffers(std::vector<Chunk*> &chunks, size_t bytes);
243  	      Chunk *get_chunk_by_buffer(const char *c) {
244  	        uint32_t idx = (c - base) / buffer_size;
245  	        Chunk *chunk = chunk_base + idx;
246  	        return chunk;
247  	      }
248  	      bool is_my_buffer(const char *c) const {
249  	        return c >= base && c < end;
250  	      }
251  	
252  	      MemoryManager& manager;
253  	      uint32_t buffer_size;
254  	      uint32_t num_chunk = 0;
255  	      ceph::mutex lock = ceph::make_mutex("cluster_lock");
256  	      std::vector<Chunk*> free_chunks;
257  	      char *base = nullptr;
258  	      char *end = nullptr;
259  	      Chunk* chunk_base = nullptr;
260  	    };
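
    // The Cluster carves its chunks out of one contiguous buffer region, so
    // is_my_buffer() is a simple range check and get_chunk_by_buffer() maps a
    // payload pointer back to its owning Chunk with plain pointer arithmetic:
    // idx = (c - base) / buffer_size, chunk = chunk_base + idx.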
261  	
262  	    class MemPoolContext {
263  	      PerfCounters *perf_logger;
264  	
265  	     public:
266  	      MemoryManager *manager;
267  	      unsigned n_bufs_allocated;
268  	      // can_alloc(): true if it is possible to allocate more memory for the pool
270  	      explicit MemPoolContext(MemoryManager *m) :
271  	        perf_logger(nullptr),
272  	        manager(m),
273  	        n_bufs_allocated(0) {}
274  	      bool can_alloc(unsigned nbufs);
275  	      void update_stats(int val);
276  	      void set_stat_logger(PerfCounters *logger);
277  	    };
278  	
279  	    class PoolAllocator {
280  	      struct mem_info {
281  	        ibv_mr   *mr;
282  	        MemPoolContext *ctx;
283  	        unsigned nbufs;
284  	        Chunk    chunks[0];
285  	      };
286  	     public:
287  	      typedef std::size_t size_type;
288  	      typedef std::ptrdiff_t difference_type;
289  	
290  	      static char * malloc(const size_type bytes);
291  	      static void free(char * const block);
292  	
293  	      static MemPoolContext  *g_ctx;
294  	      static ceph::mutex lock;
295  	    };
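
    // PoolAllocator implements the boost::pool UserAllocator concept (static
    // malloc()/free() plus the size_type/difference_type typedefs), so
    // boost::pool<PoolAllocator> calls back into it whenever the pool has to
    // grow or shrink. Because those hooks are static, the per-pool
    // MemPoolContext is passed through g_ctx under 'lock'; judging from
    // mem_info, each block handed back to boost is fronted by its ibv_mr
    // registration, its owning context and a Chunk array for the buffers it
    // holds.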
296  	
297  	    /**
298  	     * Modified boost::pool that carries a thread-safe 'context' when
299  	     * allocating/freeing memory. This allows different pool configurations
300  	     * and bookkeeping per CephContext, and lets the same allocator serve
301  	     * both the RX and TX pools.
304  	     * TODO: use boost pool to allocate TX chunks too
305  	     */
306  	    class mem_pool : public boost::pool<PoolAllocator> {
307  	     private:
308  	      MemPoolContext *ctx;
309  	      void *slow_malloc();
310  	
311  	     public:
312  	      ceph::mutex lock = ceph::make_mutex("mem_pool_lock");
313  	      explicit mem_pool(MemPoolContext *ctx, const size_type nrequested_size,
314  	          const size_type nnext_size = 32,
315  	          const size_type nmax_size = 0) :
316  	        pool(nrequested_size, nnext_size, nmax_size),
317  	        ctx(ctx) { }
318  	
319  	      void *malloc() {
320  	        if (!store().empty())
321  	          return (store().malloc)();
322  	        // need to alloc more memory...
323  	        // slow path code
324  	        return slow_malloc();
325  	      }
326  	    };
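
    // mem_pool::malloc() is the fast path: if the pool's free list (store())
    // still has buffers, one is popped without growing the pool. Otherwise
    // slow_malloc() (implemented out of line) takes over, presumably
    // installing this pool's MemPoolContext as PoolAllocator::g_ctx before
    // asking boost to allocate another block.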
327  	
328  	    MemoryManager(CephContext *c, Device *d, ProtectionDomain *p);
329  	    ~MemoryManager();
330  	
331  	    void* malloc(size_t size);
332  	    void  free(void *ptr);
333  	
334  	    void create_tx_pool(uint32_t size, uint32_t tx_num);
335  	    void return_tx(std::vector<Chunk*> &chunks);
336  	    int get_send_buffers(std::vector<Chunk*> &c, size_t bytes);
337  	    bool is_tx_buffer(const char* c) { return send->is_my_buffer(c); }
338  	    Chunk *get_tx_chunk_by_buffer(const char *c) {
339  	      return send->get_chunk_by_buffer(c);
340  	    }
341  	    uint32_t get_tx_buffer_size() const {
342  	      return send->buffer_size;
343  	    }
344  	
345  	    Chunk *get_rx_buffer() {
346  	       std::lock_guard l{rxbuf_pool.lock};
347  	       return reinterpret_cast<Chunk *>(rxbuf_pool.malloc());
348  	    }
349  	
350  	    void release_rx_buffer(Chunk *chunk) {
351  	      std::lock_guard l{rxbuf_pool.lock};
352  	      chunk->clear_qp();
353  	      rxbuf_pool.free(chunk);
354  	    }
355  	
356  	    void set_rx_stat_logger(PerfCounters *logger) {
357  	      rxbuf_pool_ctx.set_stat_logger(logger);
358  	    }
359  	
360  	    CephContext  *cct;
361  	   private:
362  	    // TODO: Cluster -> TxPool txbuf_pool
363  	    // chunk layout fix
365  	    Cluster* send = nullptr;   // SEND (TX) buffer pool
366  	    Device *device;
367  	    ProtectionDomain *pd;
368  	    MemPoolContext rxbuf_pool_ctx;
369  	    mem_pool     rxbuf_pool;
370  	
371  	
372  	    void* huge_pages_malloc(size_t size);
373  	    void  huge_pages_free(void *ptr);
374  	  };
375  	
376  	 private:
377  	  uint32_t tx_queue_len = 0;
378  	  uint32_t rx_queue_len = 0;
379  	  uint32_t max_sge = 0;
380  	  uint8_t  ib_physical_port = 0;
381  	  MemoryManager* memory_manager = nullptr;
382  	  ibv_srq* srq = nullptr;             // shared receive work queue
383  	  Device *device = NULL;
384  	  ProtectionDomain *pd = NULL;
385  	  DeviceList *device_list = nullptr;
386  	  CephContext *cct;
387  	  ceph::mutex lock = ceph::make_mutex("IB lock");
388  	  bool initialized = false;
389  	  const std::string &device_name;
390  	  uint8_t port_num;
391  	  bool support_srq = false;
392  	
393  	 public:
394  	  explicit Infiniband(CephContext *c);
395  	  ~Infiniband();
396  	  void init();
397  	  static void verify_prereq(CephContext *cct);
398  	
399  	  class CompletionChannel {
400  	    static const uint32_t MAX_ACK_EVENT = 5000;
401  	    CephContext *cct;
402  	    Infiniband& infiniband;
403  	    ibv_comp_channel *channel;
404  	    ibv_cq *cq;
405  	    uint32_t cq_events_that_need_ack;
406  	
407  	   public:
408  	    CompletionChannel(CephContext *c, Infiniband &ib);
409  	    ~CompletionChannel();
410  	    int init();
411  	    bool get_cq_event();
412  	    int get_fd() { return channel->fd; }
413  	    ibv_comp_channel* get_channel() { return channel; }
414  	    void bind_cq(ibv_cq *c) { cq = c; }
415  	    void ack_events();
416  	  };
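
  // CompletionChannel wraps an ibv_comp_channel: get_fd() exposes the
  // channel's file descriptor to the event loop, get_cq_event() consumes one
  // notification, and cq_events_that_need_ack batches acknowledgements so that
  // ack_events() can retire up to MAX_ACK_EVENT of them with a single
  // ibv_ack_cq_events() call instead of paying the cost per event.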
417  	
418  	  // this class encapsulates the creation, use, and destruction of an RC
419  	  // completion queue.
420  	  //
421  	  // You need to call init(); it will create a CQ and associate it with the completion channel.
422  	  class CompletionQueue {
423  	   public:
424  	    CompletionQueue(CephContext *c, Infiniband &ib,
425  	                    const uint32_t qd, CompletionChannel *cc)
426  	      : cct(c), infiniband(ib), channel(cc), cq(NULL), queue_depth(qd) {}
427  	    ~CompletionQueue();
428  	    int init();
429  	    int poll_cq(int num_entries, ibv_wc *ret_wc_array);
430  	
431  	    ibv_cq* get_cq() const { return cq; }
432  	    int rearm_notify(bool solicited_only=true);
433  	    CompletionChannel* get_cc() const { return channel; }
434  	   private:
435  	    CephContext *cct;
436  	    Infiniband&  infiniband;     // Infiniband to which this CQ belongs
437  	    CompletionChannel *channel;
438  	    ibv_cq *cq;
439  	    uint32_t queue_depth;
440  	  };
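
  // A rough polling sketch (illustrative only; error handling omitted):
  //
  //   ibv_wc wc[32];
  //   int n = cq->poll_cq(32, wc);   // harvest up to 32 completions
  //   // ... dispatch each wc[i] by wr_id / status ...
  //   cq->rearm_notify();            // request the next notification, then
  //                                  // block on cq->get_cc()->get_fd()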
441  	
442  	  // this class encapsulates the creation, use, and destruction of an RC
443  	  // queue pair.
444  	  //
445  	  // You need to call init(); it will create a QP and bring it to the INIT state.
446  	  // After obtaining the LID, QPN, and PSN of the remote queue pair, bring the
447  	  // QP through RTR to RTS via modify_qp_to_rtr() and modify_qp_to_rts().
448  	  class QueuePair {
449  	   public:
450  	    typedef MemoryManager::Chunk Chunk;
451  	    QueuePair(CephContext *c, Infiniband& infiniband, ibv_qp_type type,
452  	              int ib_physical_port,  ibv_srq *srq,
453  	              Infiniband::CompletionQueue* txcq,
454  	              Infiniband::CompletionQueue* rxcq,
455  	              uint32_t tx_queue_len, uint32_t max_recv_wr, struct rdma_cm_id *cid, uint32_t q_key = 0);
456  	    ~QueuePair();
457  	
458  	    int modify_qp_to_error();
459  	    int modify_qp_to_rts();
460  	    int modify_qp_to_rtr();
461  	    int modify_qp_to_init();
462  	    int init();
463  	
464  	    /**
465  	     * Get the initial packet sequence number for this QueuePair.
466  	     * This is randomly generated on creation. It should not be confused
467  	     * with the remote side's PSN, which is learned during connection setup.
468  	     */
469  	    uint32_t get_initial_psn() const { return initial_psn; };
470  	    /**
471  	     * Get the local queue pair number for this QueuePair.
472  	     * QPNs are analogous to UDP/TCP port numbers.
473  	     */
474  	    uint32_t get_local_qp_number() const { return qp->qp_num; };
475  	    /**
476  	     * Get the remote queue pair number for this QueuePair, as learned during connection setup.
477  	     * QPNs are analogous to UDP/TCP port numbers.
478  	     */
479  	    int get_remote_qp_number(uint32_t *rqp) const;
480  	    /**
481  	     * Get the remote InfiniBand address (LID) for this QueuePair, as learned
482  	     * during connection setup. LIDs are "local IDs" in InfiniBand terminology:
483  	     * short, locally routable addresses.
484  	     */
485  	    int get_remote_lid(uint16_t *lid) const;
486  	    /**
487  	     * Get the state of a QueuePair.
488  	     */
489  	    int get_state() const;
490  	    /*
491  	     * send/receive connection management meta data
492  	     */
493  	    int send_cm_meta(CephContext *cct, int socket_fd);
494  	    int recv_cm_meta(CephContext *cct, int socket_fd);
495  	    void wire_gid_to_gid(const char *wgid, ib_cm_meta_t* cm_meta_data);
496  	    void gid_to_wire_gid(const ib_cm_meta_t& cm_meta_data, char wgid[]);
497  	    ibv_qp* get_qp() const { return qp; }
498  	    Infiniband::CompletionQueue* get_tx_cq() const { return txcq; }
499  	    Infiniband::CompletionQueue* get_rx_cq() const { return rxcq; }
500  	    int to_dead();
501  	    bool is_dead() const { return dead; }
502  	    ib_cm_meta_t& get_peer_cm_meta() { return peer_cm_meta; }
503  	    ib_cm_meta_t& get_local_cm_meta() { return local_cm_meta; }
504  	    void add_rq_wr(Chunk* chunk)
505  	    {
506  	      if (srq) return;
507  	
508  	      std::lock_guard l{lock};
509  	      recv_queue.push_back(chunk);
510  	    }
511  	
512  	    void remove_rq_wr(Chunk* chunk) {
513  	      if (srq) return;
514  	
515  	      std::lock_guard l{lock};
516  	      auto it = std::find(recv_queue.begin(), recv_queue.end(), chunk);
517  	      ceph_assert(it != recv_queue.end());
518  	      recv_queue.erase(it);
519  	    }
520  	    ibv_srq* get_srq() const { return srq; }
521  	
522  	   private:
523  	    CephContext  *cct;
524  	    Infiniband&  infiniband;     // Infiniband to which this QP belongs
525  	    ibv_qp_type  type;           // QP type (IBV_QPT_RC, etc.)
526  	    ibv_context* ctxt;           // device context of the HCA to use
527  	    int ib_physical_port;
528  	    ibv_pd*      pd;             // protection domain
529  	    ibv_srq*     srq;            // shared receive queue
530  	    ibv_qp*      qp;             // infiniband verbs QP handle
531  	    struct rdma_cm_id *cm_id;
532  	    ib_cm_meta_t peer_cm_meta;
533  	    ib_cm_meta_t local_cm_meta;
534  	    Infiniband::CompletionQueue* txcq;
535  	    Infiniband::CompletionQueue* rxcq;
536  	    uint32_t     initial_psn;    // initial packet sequence number
537  	    uint32_t     max_send_wr;
538  	    uint32_t     max_recv_wr;
539  	    uint32_t     q_key;
540  	    bool dead;
541  	    std::vector<Chunk*> recv_queue;
542  	    ceph::mutex lock = ceph::make_mutex("queue_pair_lock");
543  	  };
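
  // Rough connection sketch for the TCP-handshake (non RDMA-CM) path
  // (illustrative only; error handling omitted):
  //
  //   QueuePair *qp = ib.create_queue_pair(cct, tx_cq, rx_cq, IBV_QPT_RC, nullptr);
  //   qp->init();                      // QP created and moved to INIT
  //   qp->send_cm_meta(cct, tcp_fd);   // ship local lid/qpn/psn/gid to the peer
  //   qp->recv_cm_meta(cct, tcp_fd);   // learn the peer's
  //   qp->modify_qp_to_rtr();          // ready to receive
  //   qp->modify_qp_to_rts();          // ready to send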
544  	
545  	 public:
546  	  typedef MemoryManager::Cluster Cluster;
547  	  typedef MemoryManager::Chunk Chunk;
548  	  QueuePair* create_queue_pair(CephContext *c, CompletionQueue*, CompletionQueue*,
549  	      ibv_qp_type type, struct rdma_cm_id *cm_id);
550  	  ibv_srq* create_shared_receive_queue(uint32_t max_wr, uint32_t max_sge);
551  	  // post rx buffers to srq, return number of buffers actually posted
552  	  int post_chunks_to_rq(int num, QueuePair *qp = nullptr);
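  // post_chunk_to_pool(): when SRQ is not supported each rx chunk is also
  // tracked in its owning QueuePair's recv_queue (add_rq_wr()/remove_rq_wr()),
  // so that bookkeeping is undone before the buffer goes back to the rx pool.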
553  	  void post_chunk_to_pool(Chunk* chunk) {
554  	    QueuePair *qp = chunk->get_qp();
555  	    if (qp != nullptr) {
556  	      qp->remove_rq_wr(chunk);
557  	    }
558  	    get_memory_manager()->release_rx_buffer(chunk);
559  	  }
560  	  int get_tx_buffers(std::vector<Chunk*> &c, size_t bytes);
561  	  CompletionChannel *create_comp_channel(CephContext *c);
562  	  CompletionQueue *create_comp_queue(CephContext *c, CompletionChannel *cc=NULL);
563  	  uint8_t get_ib_physical_port() { return ib_physical_port; }
564  	  uint16_t get_lid() { return device->get_lid(); }
565  	  ibv_gid get_gid() { return device->get_gid(); }
566  	  MemoryManager* get_memory_manager() { return memory_manager; }
567  	  Device* get_device() { return device; }
568  	  int get_async_fd() { return device->ctxt->async_fd; }
569  	  bool is_tx_buffer(const char* c) { return memory_manager->is_tx_buffer(c);}
570  	  Chunk *get_tx_chunk_by_buffer(const char *c) { return memory_manager->get_tx_chunk_by_buffer(c); }
571  	  static const char* wc_status_to_string(int status);
572  	  static const char* qp_state_string(int status);
573  	  uint32_t get_rx_queue_len() const { return rx_queue_len; }
574  	};
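
// A rough lifecycle sketch (illustrative only; roughly how the async RDMA
// stack is expected to drive this class, error handling omitted):
//
//   Infiniband::verify_prereq(cct);   // sanity-check system prerequisites
//   Infiniband ib(cct);
//   ib.init();                        // open device, PD, SRQ and memory pools
//   auto *cc = ib.create_comp_channel(cct);
//   auto *cq = ib.create_comp_queue(cct, cc);
//   ib.post_chunks_to_rq(ib.get_rx_queue_len());   // pre-post receive buffers
//   std::vector<Infiniband::Chunk*> tx;
//   ib.get_tx_buffers(tx, bytes_needed);           // fill chunks, then post sends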
575  	
576  	#endif
577