1    	// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2    	// vim: ts=8 sw=2 smarttab
3    	#ifndef CEPH_OS_BLUESTORE_BLUEFS_H
4    	#define CEPH_OS_BLUESTORE_BLUEFS_H
5    	
6    	#include <atomic>
7    	#include <mutex>
8    	
9    	#include "bluefs_types.h"
10   	#include "BlockDevice.h"
11   	
12   	#include "common/RefCountedObj.h"
13   	#include "common/ceph_context.h"
14   	#include "global/global_context.h"
15   	
16   	#include "boost/intrusive/list.hpp"
17   	
18   	class PerfCounters;
19   	
20   	class Allocator;
21   	
22   	enum {
     	  // Consecutive l_bluefs_* identifiers, numbered upwards from 732600.
     	  // NOTE(review): presumably PerfCounters indices bracketed by
     	  // l_bluefs_first / l_bluefs_last -- confirm against _init_logger().
23   	  l_bluefs_first = 732600,
24   	  l_bluefs_gift_bytes,
25   	  l_bluefs_reclaim_bytes,
26   	  l_bluefs_db_total_bytes,
27   	  l_bluefs_db_used_bytes,
28   	  l_bluefs_wal_total_bytes,
29   	  l_bluefs_wal_used_bytes,
30   	  l_bluefs_slow_total_bytes,
31   	  l_bluefs_slow_used_bytes,
32   	  l_bluefs_num_files,
33   	  l_bluefs_log_bytes,
34   	  l_bluefs_log_compactions,
35   	  l_bluefs_logged_bytes,
36   	  l_bluefs_files_written_wal,
37   	  l_bluefs_files_written_sst,
38   	  l_bluefs_bytes_written_wal,
39   	  l_bluefs_bytes_written_sst,
40   	  l_bluefs_bytes_written_slow,
41   	  l_bluefs_max_bytes_wal,
42   	  l_bluefs_max_bytes_db,
43   	  l_bluefs_max_bytes_slow,
44   	  l_bluefs_read_random_count,
45   	  l_bluefs_read_random_bytes,
46   	  l_bluefs_read_random_disk_count,
47   	  l_bluefs_read_random_disk_bytes,
48   	  l_bluefs_read_random_buffer_count,
49   	  l_bluefs_read_random_buffer_bytes,
50   	  l_bluefs_read_count,
51   	  l_bluefs_read_bytes,
52   	  l_bluefs_read_prefetch_count,
53   	  l_bluefs_read_prefetch_bytes,
54   	
     	  // must stay the final enumerator
55   	  l_bluefs_last,
56   	};
57   	
58   	class BlueFSDeviceExpander {
     	  // Pure-virtual callback interface.  BlueFS stores a pointer to an
     	  // implementation in slow_dev_expander (see set_slow_device_expander()).
59   	protected:
     	  // Protected and non-virtual: code outside the hierarchy cannot delete
     	  // an implementation through a BlueFSDeviceExpander pointer.
60   	  ~BlueFSDeviceExpander() {}
61   	public:
     	  // NOTE(review): judging by the parameter names this receives BlueFS's
     	  // free and total space and returns how much to grow by -- confirm
     	  // units and semantics with the implementer.
62   	  virtual uint64_t get_recommended_expansion_delta(uint64_t bluefs_free,
63   	    uint64_t bluefs_total) = 0;
     	  // NOTE(review): presumably fills `extents` with at least min_size and
     	  // at most size worth of space, returning 0 on success -- confirm with
     	  // the implementer.
64   	  virtual int allocate_freespace(
65   	    uint64_t min_size,
66   	    uint64_t size,
67   	    PExtentVector& extents) = 0;
68   	  /** Reports amount of space that can be transferred to BlueFS.
69   	   * This gives either the current state, when alloc_size is the allocation
70   	   * size BlueFS currently uses, or a simulation when alloc_size differs.
71   	   *
72   	   * @param alloc_size allocation unit size to check
73   	   */
74   	  virtual size_t available_freespace(uint64_t alloc_size) = 0;
75   	};
76   	
77   	class BlueFS {
78   	public:
79   	  CephContext* cct;
80   	  static constexpr unsigned MAX_BDEV = 5;
81   	  static constexpr unsigned BDEV_WAL = 0;
82   	  static constexpr unsigned BDEV_DB = 1;
83   	  static constexpr unsigned BDEV_SLOW = 2;
84   	  static constexpr unsigned BDEV_NEWWAL = 3;
85   	  static constexpr unsigned BDEV_NEWDB = 4;
86   	
87   	  enum {
88   	    WRITER_UNKNOWN,
89   	    WRITER_WAL,
90   	    WRITER_SST,
91   	  };
92   	
93   	  struct File : public RefCountedObject {
     	    // In-memory state for a single BlueFS file.  Derives from
     	    // RefCountedObject and is handled through FileRef; the constructor
     	    // is private.
94   	    MEMPOOL_CLASS_HELPERS();
95   	
     	    // file node; FileWriter inspects fnode.ino when it is constructed
96   	    bluefs_fnode_t fnode;
     	    // NOTE(review): separate from the RefCountedObject count; presumably
     	    // a link count maintained via _drop_link() -- confirm in BlueFS.cc.
97   	    int refs;
     	    // files sharing a dirty_seq are grouped into one list in
     	    // BlueFS::dirty_files
98   	    uint64_t dirty_seq;
     	    // must be false again by the time the File is destroyed
99   	    bool locked;
100  	    bool deleted;
     	    // hook that links this File into a dirty_file_list_t
101  	    boost::intrusive::list_member_hook<> dirty_item;
102  	
     	    // num_readers / num_writers are incremented by the FileReader /
     	    // FileWriter constructors and decremented by their destructors.
103  	    std::atomic_int num_readers, num_writers;
     	    // NOTE(review): not modified anywhere in this header; presumably
     	    // counts reads in progress -- confirm in BlueFS.cc.
104  	    std::atomic_int num_reading;
105  	
106  	  private:
107  	    FRIEND_MAKE_REF(File);
     	    // Every counter starts at 0 and every flag at false.
108  	    File()
109  	      :
110  		refs(0),
111  		dirty_seq(0),
112  		locked(false),
113  		deleted(false),
114  		num_readers(0),
115  		num_writers(0),
116  		num_reading(0)
117  	      {}
     	    // Asserts that all three atomic counters are zero and that the file
     	    // is not locked.
     	    // NOTE(review): the analyzer events interleaved in this listing say
     	    // a function called from this destructor can throw, and that the
     	    // destructor's throw list does not allow it, which ends in
     	    // unexpected()/terminate().  Presumably tolerable because a failed
     	    // assertion is fatal anyway -- confirm before suppressing.
(1) Event exn_spec_violation: An exception of type "_ZN5boost16exception_detail10clone_implINS0_19error_info_injectorINSt8ios_base7failureB5cxx11EEEEE" is thrown but the throw list "throw()" doesn't allow it to be thrown. This will cause a call to unexpected() which usually calls terminate().
Also see events: [fun_call_w_exception]
118  	    ~File() override {
119  	      ceph_assert(num_readers.load() == 0);
120  	      ceph_assert(num_writers.load() == 0);
121  	      ceph_assert(num_reading.load() == 0);
(2) Event fun_call_w_exception: Called function throws an exception of type "_ZN5boost16exception_detail10clone_implINS0_19error_info_injectorINSt8ios_base7failureB5cxx11EEEEE". [details]
Also see events: [exn_spec_violation]
122  	      ceph_assert(!locked);
123  	    }
124  	  };
125  	  using FileRef = ceph::ref_t<File>;
126  	
127  	  typedef boost::intrusive::list<
128  	      File,
129  	      boost::intrusive::member_hook<
130  	        File,
131  		boost::intrusive::list_member_hook<>,
132  		&File::dirty_item> > dirty_file_list_t;
133  	
134  	  struct Dir : public RefCountedObject {
     	    // One directory: a name -> FileRef map.  Derives from
     	    // RefCountedObject and is handled through DirRef; the constructor
     	    // is private.
135  	    MEMPOOL_CLASS_HELPERS();
136  	
     	    // entries of this directory, keyed by string
137  	    mempool::bluefs::map<string,FileRef> file_map;
138  	
139  	  private:
140  	    FRIEND_MAKE_REF(Dir);
141  	    Dir() = default;
142  	  };
143  	  using DirRef = ceph::ref_t<Dir>;
144  	
145  	  struct FileWriter {
     	    // Write handle for one File: holds the data buffered for appending
     	    // plus a per-device IOContext pointer and dirty flag.
146  	    MEMPOOL_CLASS_HELPERS();
147  	
148  	    FileRef file;
149  	    uint64_t pos = 0;       ///< start offset for buffer
150  	    bufferlist buffer;      ///< new data to write (at end of file)
151  	    bufferlist tail_block;  ///< existing partial block at end of file, if any
152  	    bufferlist::page_aligned_appender buffer_appender;  ///< for const char* only
153  	    int writer_type = 0;    ///< WRITER_*
154  	    int write_hint = WRITE_LIFE_NOT_SET;
155  	
156  	    ceph::mutex lock = ceph::make_mutex("BlueFS::FileWriter::lock");
157  	    std::array<IOContext*,MAX_BDEV> iocv; ///< for each bdev
158  	    std::array<bool, MAX_BDEV> dirty_devs;
159  	
     	    // Increments file->num_writers, fills iocv with nullptr and
     	    // dirty_devs with false, and switches write_hint to
     	    // WRITE_LIFE_MEDIUM when the file's ino is 1.  The appender is
     	    // obtained from `buffer`, passing bluefs_alloc_size / CEPH_PAGE_SIZE.
160  	    FileWriter(FileRef f)
161  	      : file(f),
162  		buffer_appender(buffer.get_page_aligned_appender(
163  				  g_conf()->bluefs_alloc_size / CEPH_PAGE_SIZE)) {
164  	      ++file->num_writers;
165  	      iocv.fill(nullptr);
166  	      dirty_devs.fill(false);
167  	      if (f->fnode.ino == 1) {
168  		write_hint = WRITE_LIFE_MEDIUM;
169  	      }
170  	    }
171  	    // NOTE: caller must call BlueFS::close_writer()
     	    // The destructor itself only decrements file->num_writers.
172  	    ~FileWriter() {
173  	      --file->num_writers;
174  	    }
175  	
176  	    // note: BlueRocksEnv uses this append exclusively, so it's safe
177  	    // to use buffer_appender exclusively here (e.g., its notion of
178  	    // offset will remain accurate).
179  	    void append(const char *buf, size_t len) {
180  	      buffer_appender.append(buf, len);
181  	    }
182  	
183  	    // note: used internally only, for ino 1 or 0.
     	    // Bypasses buffer_appender and appends to `buffer` directly via
     	    // claim_append(bl).
184  	    void append(bufferlist& bl) {
185  	      buffer.claim_append(bl);
186  	    }
187  	
     	    // Flushes buffer_appender first, then returns pos + buffer.length().
188  	    uint64_t get_effective_write_pos() {
189  	      buffer_appender.flush();
190  	      return pos + buffer.length();
191  	    }
192  	  };
193  	
194  	  struct FileReaderBuffer {
     	    // Reader-side state: a prefetch buffer positioned at a logical
     	    // offset, plus the current read position.
195  	    MEMPOOL_CLASS_HELPERS();
196  	
197  	    uint64_t bl_off;        ///< prefetch buffer logical offset
198  	    bufferlist bl;          ///< prefetch buffer
199  	    uint64_t pos;           ///< current logical offset
200  	    uint64_t max_prefetch;  ///< max allowed prefetch
201  	
     	    // Starts with an empty buffer at offset 0 and position 0.
     	    // @param mpf  value stored in max_prefetch
202  	    explicit FileReaderBuffer(uint64_t mpf)
203  	      : bl_off(0),
204  		pos(0),
205  		max_prefetch(mpf) {}
206  	
     	    // Logical offset one past the last buffered byte.
207  	    uint64_t get_buf_end() {
208  	      return bl_off + bl.length();
209  	    }
     	    // Number of buffered bytes from logical offset p to the end of the
     	    // buffer, or 0 when p lies outside [bl_off, bl_off + bl.length()).
210  	    uint64_t get_buf_remaining(uint64_t p) {
211  	      if (p >= bl_off && p < bl_off + bl.length())
212  		return bl_off + bl.length() - p;
213  	      return 0;
214  	    }
215  	
     	    // Advances pos by n; the buffer itself is untouched.
216  	    void skip(size_t n) {
217  	      pos += n;
218  	    }
     	    // Sets pos to an absolute offset; the buffer itself is untouched.
219  	    void seek(uint64_t offset) {
220  	      pos = offset;
221  	    }
222  	  };
223  	
224  	  struct FileReader {
     	    // Read handle for one File; its lifetime is reflected in
     	    // file->num_readers.
225  	    MEMPOOL_CLASS_HELPERS();
226  	
227  	    FileRef file;
228  	    FileReaderBuffer buf;
     	    // copied from the constructor's `rand` argument
229  	    bool random;
230  	    bool ignore_eof;        ///< used when reading our log file
231  	
     	    // NOTE(review): created with an empty name and three `false`
     	    // arguments; their meaning is defined by ceph::make_shared_mutex --
     	    // confirm there.
232  	    ceph::shared_mutex lock {
233  	     ceph::make_shared_mutex(std::string(), false, false, false)
234  	    };
235  	
236  	
     	    // @param f     file to read; its num_readers is incremented
     	    // @param mpf   forwarded to the FileReaderBuffer constructor
     	    // @param rand  stored in `random`
     	    // @param ie    stored in `ignore_eof`
237  	    FileReader(FileRef f, uint64_t mpf, bool rand, bool ie)
238  	      : file(f),
239  		buf(mpf),
240  		random(rand),
241  		ignore_eof(ie) {
242  	      ++file->num_readers;
243  	    }
     	    // Undoes the constructor's num_readers increment.
244  	    ~FileReader() {
245  	      --file->num_readers;
246  	    }
247  	  };
248  	
249  	  struct FileLock {
     	    // Handle type passed through lock_file() / unlock_file(); it only
     	    // holds a reference to the file concerned.
250  	    MEMPOOL_CLASS_HELPERS();
251  	
252  	    FileRef file;
253  	    explicit FileLock(FileRef f) : file(f) {}
254  	  };
255  	
256  	private:
257  	  ceph::mutex lock = ceph::make_mutex("BlueFS::lock");
258  	
259  	  PerfCounters *logger = nullptr;
260  	
261  	  uint64_t max_bytes[MAX_BDEV] = {0};
262  	  uint64_t max_bytes_pcounters[MAX_BDEV] = {
263  	    l_bluefs_max_bytes_wal,
264  	    l_bluefs_max_bytes_db,
265  	    l_bluefs_max_bytes_slow,
266  	  };
267  	
268  	  // cache
269  	  mempool::bluefs::map<string, DirRef> dir_map;              ///< dirname -> Dir
270  	  mempool::bluefs::unordered_map<uint64_t,FileRef> file_map; ///< ino -> File
271  	
272  	  // map of dirty files, files of same dirty_seq are grouped into list.
273  	  map<uint64_t, dirty_file_list_t> dirty_files;
274  	
275  	  bluefs_super_t super;        ///< latest superblock (as last written)
276  	  uint64_t ino_last = 0;       ///< last assigned ino (this one is in use)
277  	  uint64_t log_seq = 0;        ///< last used log seq (by current pending log_t)
278  	  uint64_t log_seq_stable = 0; ///< last stable/synced log seq
279  	  FileWriter *log_writer = 0;  ///< writer for the log
280  	  bluefs_transaction_t log_t;  ///< pending, unwritten log transaction
281  	  bool log_flushing = false;   ///< true while flushing the log
282  	  ceph::condition_variable log_cond;
283  	
284  	  uint64_t new_log_jump_to = 0;
285  	  uint64_t old_log_jump_to = 0;
286  	  FileRef new_log = nullptr;
287  	  FileWriter *new_log_writer = nullptr;
288  	
289  	  /*
290  	   * There are up to 3 block devices:
291  	   *
292  	   *  BDEV_DB   db/      - the primary db device
293  	   *  BDEV_WAL  db.wal/  - a small, fast device, specifically for the WAL
294  	   *  BDEV_SLOW db.slow/ - a big, slow device, to spill over to as BDEV_DB fills
     	   *
     	   * Two further ids, BDEV_NEWWAL and BDEV_NEWDB, are declared alongside
     	   * these, which is why MAX_BDEV is 5 rather than 3.
295  	   */
296  	  vector<BlockDevice*> bdev;                  ///< block devices we can use
297  	  vector<IOContext*> ioc;                     ///< IOContexts for bdevs
298  	  vector<interval_set<uint64_t> > block_all;  ///< extents in bdev we own
299  	  vector<Allocator*> alloc;                   ///< allocators for bdevs
300  	  vector<uint64_t> alloc_size;                ///< alloc size for each device
301  	  vector<interval_set<uint64_t>> pending_release; ///< extents to release
302  	
     	  // NOTE(review): this array has 3 slots while MAX_BDEV is 5; verify
     	  // that it is never indexed with BDEV_NEWWAL or BDEV_NEWDB.
303  	  BlockDevice::aio_callback_t discard_cb[3]; //discard callbacks for each dev
304  	
305  	  BlueFSDeviceExpander* slow_dev_expander = nullptr;
306  	
307  	  class SocketHook;
308  	  SocketHook* asok_hook = nullptr;
309  	
310  	  void _init_logger();
311  	  void _shutdown_logger();
312  	  void _update_logger_stats();
313  	
314  	  void _init_alloc();
315  	  void _stop_alloc();
316  	
317  	  void _pad_bl(bufferlist& bl);  ///< pad bufferlist to block size w/ zeros
318  	
319  	  FileRef _get_file(uint64_t ino);
320  	  void _drop_link(FileRef f);
321  	
322  	  int _get_slow_device_id() { return bdev[BDEV_SLOW] ? BDEV_SLOW : BDEV_DB; }
     	  ///< yields BDEV_SLOW when bdev[BDEV_SLOW] is non-null, otherwise BDEV_DB
323  	  const char* get_device_name(unsigned id);
324  	  int _expand_slow_device(uint64_t min_size, PExtentVector& extents);
325  	  int _allocate(uint8_t bdev, uint64_t len,
326  			bluefs_fnode_t* node);
327  	  int _allocate_without_fallback(uint8_t id, uint64_t len,
328  					 PExtentVector* extents);
329  	
330  	  int _flush_range(FileWriter *h, uint64_t offset, uint64_t length);
331  	  int _flush(FileWriter *h, bool force);
332  	  int _fsync(FileWriter *h, std::unique_lock<ceph::mutex>& l);
333  	
334  	#ifdef HAVE_LIBAIO
335  	  void _claim_completed_aios(FileWriter *h, list<aio_t> *ls);
336  	  void wait_for_aio(FileWriter *h);  // safe to call without a lock
337  	#endif
338  	
339  	  int _flush_and_sync_log(std::unique_lock<ceph::mutex>& l,
340  				  uint64_t want_seq = 0,
341  				  uint64_t jump_to = 0);
342  	  uint64_t _estimate_log_size();
343  	  bool _should_compact_log();
344  	
345  	  enum {
346  	    REMOVE_DB = 1,
347  	    REMOVE_WAL = 2,
348  	    RENAME_SLOW2DB = 4,
349  	    RENAME_DB2SLOW = 8,
350  	  };
351  	  void _compact_log_dump_metadata(bluefs_transaction_t *t,
352  					  int flags);
353  	  void _compact_log_sync();
354  	  void _compact_log_async(std::unique_lock<ceph::mutex>& l);
355  	
356  	  void _rewrite_log_and_layout_sync(bool allocate_with_fallback,
357  					    int super_dev,
358  					    int log_dev,
359  					    int new_log_dev,
360  					    int flags,
361  					    std::optional<bluefs_layout_t> layout);
362  	
363  	  //void _aio_finish(void *priv);
364  	
365  	  void _flush_bdev_safely(FileWriter *h);
366  	  void flush_bdev();  // this is safe to call without a lock
367  	  void flush_bdev(std::array<bool, MAX_BDEV>& dirty_bdevs);  // this is safe to call without a lock
368  	
369  	  int _preallocate(FileRef f, uint64_t off, uint64_t len);
370  	  int _truncate(FileWriter *h, uint64_t off);
371  	
372  	  int _read(
373  	    FileReader *h,   ///< [in] read from here
374  	    FileReaderBuffer *buf, ///< [in] reader state
375  	    uint64_t offset, ///< [in] offset
376  	    size_t len,      ///< [in] this many bytes
377  	    bufferlist *outbl,   ///< [out] optional: reference the result here
378  	    char *out);      ///< [out] optional: or copy it here
379  	  int _read_random(
380  	    FileReader *h,   ///< [in] read from here
381  	    uint64_t offset, ///< [in] offset
382  	    uint64_t len,    ///< [in] this many bytes
383  	    char *out);      ///< [out] optional: or copy it here
384  	
385  	  void _invalidate_cache(FileRef f, uint64_t offset, uint64_t length);
386  	
387  	  int _open_super();
388  	  int _write_super(int dev);
389  	  int _replay(bool noop, bool to_stdout = false); ///< replay journal
390  	
391  	  FileWriter *_create_writer(FileRef f);
392  	  void _close_writer(FileWriter *h);
393  	
394  	  // always put the super in the second 4k block.  FIXME should this be
395  	  // block size independent?
     	  // Hence the fixed offset of 4096 returned here.
396  	  unsigned get_super_offset() {
397  	    return 4096;
398  	  }
399  	  unsigned get_super_length() {
     	    // fixed length, same value as get_super_offset()
400  	    return 4096;
401  	  }
402  	
403  	  void _add_block_extent(unsigned bdev, uint64_t offset, uint64_t len);
404  	
405  	public:
406  	  BlueFS(CephContext* cct);
407  	  ~BlueFS();
408  	
409  	  // the super is always stored on bdev 0
410  	  int mkfs(uuid_d osd_uuid, const bluefs_layout_t& layout);
411  	  int mount();
412  	  int maybe_verify_layout(const bluefs_layout_t& layout) const;
413  	  void umount();
414  	  int prepare_new_device(int id, const bluefs_layout_t& layout);
415  	  
416  	  int log_dump();
417  	
418  	  void collect_metadata(map<string,string> *pm, unsigned skip_bdev_id);
419  	  void get_devices(set<string> *ls);
420  	  uint64_t get_alloc_size(int id) {
     	    // Plain lookup in alloc_size: no bounds check and no locking here.
421  	    return alloc_size[id];
422  	  }
423  	  int fsck();
424  	
425  	  int device_migrate_to_new(
426  	    CephContext *cct,
427  	    const set<int>& devs_source,
428  	    int dev_target,
429  	    const bluefs_layout_t& layout);
430  	  int device_migrate_to_existing(
431  	    CephContext *cct,
432  	    const set<int>& devs_source,
433  	    int dev_target,
434  	    const bluefs_layout_t& layout);
435  	
436  	  uint64_t get_used();
437  	  uint64_t get_total(unsigned id);
438  	  uint64_t get_free(unsigned id);
439  	  void get_usage(vector<pair<uint64_t,uint64_t>> *usage); // [<free,total> ...]
440  	  void dump_perf_counters(Formatter *f);
441  	
442  	  void dump_block_extents(ostream& out);
443  	
444  	  /// get current extents that we own for given block device
445  	  int get_block_extents(unsigned id, interval_set<uint64_t> *extents);
446  	
447  	  int open_for_write(
448  	    const string& dir,
449  	    const string& file,
450  	    FileWriter **h,
451  	    bool overwrite);
452  	
453  	  int open_for_read(
454  	    const string& dir,
455  	    const string& file,
456  	    FileReader **h,
457  	    bool random = false);
458  	
459  	  void close_writer(FileWriter *h) {
     	    // Locked public wrapper: holds BlueFS::lock while _close_writer()
     	    // runs.
460  	    std::lock_guard l(lock);
461  	    _close_writer(h);
462  	  }
463  	
464  	  int rename(const string& old_dir, const string& old_file,
465  		     const string& new_dir, const string& new_file);
466  	
467  	  int readdir(const string& dirname, vector<string> *ls);
468  	
469  	  int unlink(const string& dirname, const string& filename);
470  	  int mkdir(const string& dirname);
471  	  int rmdir(const string& dirname);
472  	  bool wal_is_rotational();
473  	
474  	  bool dir_exists(const string& dirname);
475  	  int stat(const string& dirname, const string& filename,
476  		   uint64_t *size, utime_t *mtime);
477  	
478  	  int lock_file(const string& dirname, const string& filename, FileLock **p);
479  	  int unlock_file(FileLock *l);
480  	
481  	  void flush_log();
482  	  void compact_log();
483  	
484  	  /// sync any uncommitted state to disk
485  	  void sync_metadata();
486  	
487  	  void set_slow_device_expander(BlueFSDeviceExpander* a) {
     	    // Stores the raw pointer only; it is assigned without taking
     	    // BlueFS::lock.
488  	    slow_dev_expander = a;
489  	  }
490  	  int add_block_device(unsigned bdev, const string& path, bool trim,
491  			       bool shared_with_bluestore=false);
492  	  bool bdev_support_label(unsigned id);
493  	  uint64_t get_block_device_size(unsigned bdev);
494  	
495  	  /// gift more block space
     	  /// Under BlueFS::lock: records the extent through _add_block_extent(),
     	  /// then calls _flush_and_sync_log() and asserts that it returned 0.
496  	  void add_block_extent(unsigned bdev, uint64_t offset, uint64_t len) {
     	    // unique_lock rather than lock_guard: _flush_and_sync_log() takes
     	    // the lock object by reference
497  	    std::unique_lock l(lock);
498  	    _add_block_extent(bdev, offset, len);
499  	    int r = _flush_and_sync_log(l);
500  	    ceph_assert(r == 0);
501  	  }
502  	
503  	  /// reclaim block space
504  	  int reclaim_blocks(unsigned bdev, uint64_t want,
505  			     PExtentVector *extents);
506  	
507  	  // handler for discard event
508  	  void handle_discard(unsigned dev, interval_set<uint64_t>& to_release);
509  	
510  	  void flush(FileWriter *h) {
     	    // Locked wrapper around _flush(h, false), i.e. force is false.
     	    // The int that _flush() returns is discarded.
511  	    std::lock_guard l(lock);
512  	    _flush(h, false);
513  	  }
514  	  void flush_range(FileWriter *h, uint64_t offset, uint64_t length) {
     	    // Locked wrapper around _flush_range(); the int it returns is
     	    // discarded.
515  	    std::lock_guard l(lock);
516  	    _flush_range(h, offset, length);
517  	  }
518  	  int fsync(FileWriter *h) {
     	    // Acquires BlueFS::lock as a unique_lock and hands it to _fsync(),
     	    // whose result is returned unchanged.
519  	    std::unique_lock l(lock);
520  	    return _fsync(h, l);
521  	  }
522  	  int read(FileReader *h, FileReaderBuffer *buf, uint64_t offset, size_t len,
523  		   bufferlist *outbl, char *out) {
     	    // Unlocked pass-through to _read(); all arguments and the return
     	    // value are forwarded unchanged.
524  	    // no need to hold the global lock here; we only touch h and
525  	    // h->file, and read vs write or delete is already protected (via
526  	    // atomics and asserts).
527  	    return _read(h, buf, offset, len, outbl, out);
528  	  }
529  	  int read_random(FileReader *h, uint64_t offset, size_t len,
530  			  char *out) {
     	    // Unlocked pass-through to _read_random(); note that `len` is a
     	    // size_t here but a uint64_t in _read_random()'s signature.
531  	    // no need to hold the global lock here; we only touch h and
532  	    // h->file, and read vs write or delete is already protected (via
533  	    // atomics and asserts).
534  	    return _read_random(h, offset, len, out);
535  	  }
536  	  void invalidate_cache(FileRef f, uint64_t offset, uint64_t len) {
     	    // Locked wrapper: holds BlueFS::lock while _invalidate_cache() runs.
537  	    std::lock_guard l(lock);
538  	    _invalidate_cache(f, offset, len);
539  	  }
540  	  int preallocate(FileRef f, uint64_t offset, uint64_t len) {
     	    // Locked wrapper: returns whatever _preallocate() returns.
541  	    std::lock_guard l(lock);
542  	    return _preallocate(f, offset, len);
543  	  }
544  	  int truncate(FileWriter *h, uint64_t offset) {
     	    // Locked wrapper: returns whatever _truncate() returns.
545  	    std::lock_guard l(lock);
546  	    return _truncate(h, offset);
547  	  }
548  	
549  	};
550  	
551  	#endif
552