// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
#ifndef CEPH_OS_BLUESTORE_BLUEFS_H
#define CEPH_OS_BLUESTORE_BLUEFS_H

#include <atomic>
#include <mutex>

#include "bluefs_types.h"
#include "BlockDevice.h"

#include "common/RefCountedObj.h"
#include "common/ceph_context.h"
#include "global/global_context.h"

#include "boost/intrusive/list.hpp"

class PerfCounters;

class Allocator;

enum {
  l_bluefs_first = 732600,
  l_bluefs_gift_bytes,
  l_bluefs_reclaim_bytes,
  l_bluefs_db_total_bytes,
  l_bluefs_db_used_bytes,
  l_bluefs_wal_total_bytes,
  l_bluefs_wal_used_bytes,
  l_bluefs_slow_total_bytes,
  l_bluefs_slow_used_bytes,
  l_bluefs_num_files,
  l_bluefs_log_bytes,
  l_bluefs_log_compactions,
  l_bluefs_logged_bytes,
  l_bluefs_files_written_wal,
  l_bluefs_files_written_sst,
  l_bluefs_bytes_written_wal,
  l_bluefs_bytes_written_sst,
  l_bluefs_bytes_written_slow,
  l_bluefs_max_bytes_wal,
  l_bluefs_max_bytes_db,
  l_bluefs_max_bytes_slow,
  l_bluefs_read_random_count,
  l_bluefs_read_random_bytes,
  l_bluefs_read_random_disk_count,
  l_bluefs_read_random_disk_bytes,
  l_bluefs_read_random_buffer_count,
  l_bluefs_read_random_buffer_bytes,
  l_bluefs_read_count,
  l_bluefs_read_bytes,
  l_bluefs_read_prefetch_count,
  l_bluefs_read_prefetch_bytes,

  l_bluefs_last,
};

class BlueFSDeviceExpander {
protected:
  ~BlueFSDeviceExpander() {}
public:
  virtual uint64_t get_recommended_expansion_delta(uint64_t bluefs_free,
    uint64_t bluefs_total) = 0;
  virtual int allocate_freespace(
    uint64_t min_size,
    uint64_t size,
    PExtentVector& extents) = 0;
  /** Reports the amount of space that can be transferred to BlueFS.
   * This reflects either the current state (when alloc_size equals the
   * allocation unit BlueFS currently uses) or a simulation (when alloc_size
   * differs).
   * @param alloc_size allocation unit size to check
   */
  virtual size_t available_freespace(uint64_t alloc_size) = 0;
};
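
/* Illustrative sketch only (not compiled as part of this header): the expander
 * is implemented by the hosting store (BlueStore in the tree); a minimal,
 * hypothetical implementer might look roughly like this:
 *
 *   struct SimpleExpander : public BlueFSDeviceExpander {
 *     uint64_t get_recommended_expansion_delta(uint64_t bluefs_free,
 *                                              uint64_t bluefs_total) override {
 *       // policy is up to the implementer; e.g. gift 1 GiB once <25% is free
 *       return (bluefs_free * 4 < bluefs_total) ? (1ull << 30) : 0;
 *     }
 *     int allocate_freespace(uint64_t min_size, uint64_t size,
 *                            PExtentVector& extents) override {
 *       // carve extents out of the store's own allocator for BlueFS to use
 *       return -ENOSPC;  // placeholder
 *     }
 *     size_t available_freespace(uint64_t alloc_size) override {
 *       return 0;        // placeholder
 *     }
 *   };
 */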

class BlueFS {
public:
  CephContext* cct;
  static constexpr unsigned MAX_BDEV = 5;
  static constexpr unsigned BDEV_WAL = 0;
  static constexpr unsigned BDEV_DB = 1;
  static constexpr unsigned BDEV_SLOW = 2;
  static constexpr unsigned BDEV_NEWWAL = 3;
  static constexpr unsigned BDEV_NEWDB = 4;

  enum {
    WRITER_UNKNOWN,
    WRITER_WAL,
    WRITER_SST,
  };

  struct File : public RefCountedObject {
    MEMPOOL_CLASS_HELPERS();

    bluefs_fnode_t fnode;
    int refs;
    uint64_t dirty_seq;
    bool locked;
    bool deleted;
    boost::intrusive::list_member_hook<> dirty_item;

    std::atomic_int num_readers, num_writers;
    std::atomic_int num_reading;

  private:
    FRIEND_MAKE_REF(File);
    File()
      : refs(0),
        dirty_seq(0),
        locked(false),
        deleted(false),
        num_readers(0),
        num_writers(0),
        num_reading(0)
      {}
    // Coverity event (1) exn_spec_violation: an exception of type
    // "_ZN5boost16exception_detail10clone_implINS0_19error_info_injectorINSt8ios_base7failureB5cxx11EEEEE"
    // is thrown but the throw list "throw()" doesn't allow it to be thrown.
    // This will cause a call to unexpected(), which usually calls terminate().
    // Also see event: [fun_call_w_exception]
    ~File() override {
      ceph_assert(num_readers.load() == 0);
      ceph_assert(num_writers.load() == 0);
      ceph_assert(num_reading.load() == 0);
      // Coverity event (2) fun_call_w_exception: the called function throws an
      // exception of type
      // "_ZN5boost16exception_detail10clone_implINS0_19error_info_injectorINSt8ios_base7failureB5cxx11EEEEE".
      // Also see event: [exn_spec_violation]
      ceph_assert(!locked);
    }
  };
  using FileRef = ceph::ref_t<File>;

  typedef boost::intrusive::list<
    File,
    boost::intrusive::member_hook<
      File,
      boost::intrusive::list_member_hook<>,
      &File::dirty_item> > dirty_file_list_t;

  struct Dir : public RefCountedObject {
    MEMPOOL_CLASS_HELPERS();

    mempool::bluefs::map<string,FileRef> file_map;

  private:
    FRIEND_MAKE_REF(Dir);
    Dir() = default;
  };
  using DirRef = ceph::ref_t<Dir>;

  struct FileWriter {
    MEMPOOL_CLASS_HELPERS();

    FileRef file;
    uint64_t pos = 0;       ///< start offset for buffer
    bufferlist buffer;      ///< new data to write (at end of file)
    bufferlist tail_block;  ///< existing partial block at end of file, if any
    bufferlist::page_aligned_appender buffer_appender;  ///< for const char* only
    int writer_type = 0;    ///< WRITER_*
    int write_hint = WRITE_LIFE_NOT_SET;

    ceph::mutex lock = ceph::make_mutex("BlueFS::FileWriter::lock");
    std::array<IOContext*,MAX_BDEV> iocv;  ///< for each bdev
    std::array<bool, MAX_BDEV> dirty_devs;

    FileWriter(FileRef f)
      : file(f),
        buffer_appender(buffer.get_page_aligned_appender(
                          g_conf()->bluefs_alloc_size / CEPH_PAGE_SIZE)) {
      ++file->num_writers;
      iocv.fill(nullptr);
      dirty_devs.fill(false);
      if (f->fnode.ino == 1) {
        write_hint = WRITE_LIFE_MEDIUM;
      }
    }
    // NOTE: caller must call BlueFS::close_writer()
    ~FileWriter() {
      --file->num_writers;
    }

    // note: BlueRocksEnv uses this append exclusively, so it's safe
    // to use buffer_appender exclusively here (e.g., its notion of
    // offset will remain accurate).
    void append(const char *buf, size_t len) {
      buffer_appender.append(buf, len);
    }

    // note: used internally only, for ino 1 or 0.
    void append(bufferlist& bl) {
      buffer.claim_append(bl);
    }

    uint64_t get_effective_write_pos() {
      buffer_appender.flush();
      return pos + buffer.length();
    }
  };
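
  /* Illustrative example only (not compiled as part of this header): the
   * typical write path through the public API, assuming an already-mounted
   * BlueFS instance `fs` and a data buffer `buf`/`len`:
   *
   *   BlueFS::FileWriter *h = nullptr;
   *   int r = fs.open_for_write("db", "000001.sst", &h, false);
   *   ceph_assert(r == 0);
   *   h->append(buf, len);   // buffered in h->buffer via buffer_appender
   *   fs.fsync(h);           // flush the data and sync the BlueFS log
   *   fs.close_writer(h);    // caller must close; see the note on ~FileWriter
   */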

  struct FileReaderBuffer {
    MEMPOOL_CLASS_HELPERS();

    uint64_t bl_off;        ///< prefetch buffer logical offset
    bufferlist bl;          ///< prefetch buffer
    uint64_t pos;           ///< current logical offset
    uint64_t max_prefetch;  ///< max allowed prefetch

    explicit FileReaderBuffer(uint64_t mpf)
      : bl_off(0),
        pos(0),
        max_prefetch(mpf) {}

    uint64_t get_buf_end() {
      return bl_off + bl.length();
    }
    uint64_t get_buf_remaining(uint64_t p) {
      if (p >= bl_off && p < bl_off + bl.length())
        return bl_off + bl.length() - p;
      return 0;
    }

    void skip(size_t n) {
      pos += n;
    }
    void seek(uint64_t offset) {
      pos = offset;
    }
  };
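
  /* Worked example (illustrative values): with bl_off == 4096 and
   * bl.length() == 8192, the buffered window covers [4096, 12288), so
   * get_buf_end() == 12288, get_buf_remaining(6144) == 6144, and
   * get_buf_remaining(16384) == 0 because that offset is outside the window.
   */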

  struct FileReader {
    MEMPOOL_CLASS_HELPERS();

    FileRef file;
    FileReaderBuffer buf;
    bool random;
    bool ignore_eof;  ///< used when reading our log file

    ceph::shared_mutex lock {
      ceph::make_shared_mutex(std::string(), false, false, false)
    };

    FileReader(FileRef f, uint64_t mpf, bool rand, bool ie)
      : file(f),
        buf(mpf),
        random(rand),
        ignore_eof(ie) {
      ++file->num_readers;
    }
    ~FileReader() {
      --file->num_readers;
    }
  };
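
  /* Illustrative example only: the typical read path, assuming a mounted
   * BlueFS instance `fs` and an existing file:
   *
   *   BlueFS::FileReader *h = nullptr;
   *   int r = fs.open_for_read("db", "000001.sst", &h, false);
   *   ceph_assert(r == 0);
   *   bufferlist bl;
   *   r = fs.read(h, &h->buf, 0, 4096, &bl, nullptr);  // bytes read on success
   *   delete h;  // drops the reader's reference on the underlying File
   */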

  struct FileLock {
    MEMPOOL_CLASS_HELPERS();

    FileRef file;
    explicit FileLock(FileRef f) : file(f) {}
  };

private:
  ceph::mutex lock = ceph::make_mutex("BlueFS::lock");

  PerfCounters *logger = nullptr;

  uint64_t max_bytes[MAX_BDEV] = {0};
  uint64_t max_bytes_pcounters[MAX_BDEV] = {
    l_bluefs_max_bytes_wal,
    l_bluefs_max_bytes_db,
    l_bluefs_max_bytes_slow,
  };

  // cache
  mempool::bluefs::map<string, DirRef> dir_map;              ///< dirname -> Dir
  mempool::bluefs::unordered_map<uint64_t,FileRef> file_map; ///< ino -> File

  // map of dirty files; files with the same dirty_seq are grouped into a list
  map<uint64_t, dirty_file_list_t> dirty_files;

  bluefs_super_t super;              ///< latest superblock (as last written)
  uint64_t ino_last = 0;             ///< last assigned ino (this one is in use)
  uint64_t log_seq = 0;              ///< last used log seq (by current pending log_t)
  uint64_t log_seq_stable = 0;       ///< last stable/synced log seq
  FileWriter *log_writer = nullptr;  ///< writer for the log
  bluefs_transaction_t log_t;        ///< pending, unwritten log transaction
  bool log_flushing = false;         ///< true while flushing the log
  ceph::condition_variable log_cond;

  uint64_t new_log_jump_to = 0;
  uint64_t old_log_jump_to = 0;
  FileRef new_log = nullptr;
  FileWriter *new_log_writer = nullptr;

  /*
   * There are up to 3 block devices:
   *
   *  BDEV_DB   db/      - the primary db device
   *  BDEV_WAL  db.wal/  - a small, fast device, specifically for the WAL
   *  BDEV_SLOW db.slow/ - a big, slow device, to spill over to as BDEV_DB fills
   */
  vector<BlockDevice*> bdev;                  ///< block devices we can use
  vector<IOContext*> ioc;                     ///< IOContexts for bdevs
  vector<interval_set<uint64_t> > block_all;  ///< extents in bdev we own
  vector<Allocator*> alloc;                   ///< allocators for bdevs
  vector<uint64_t> alloc_size;                ///< alloc size for each device
  vector<interval_set<uint64_t>> pending_release;  ///< extents to release

  BlockDevice::aio_callback_t discard_cb[3];  // discard callbacks for each dev

  BlueFSDeviceExpander* slow_dev_expander = nullptr;

  class SocketHook;
  SocketHook* asok_hook = nullptr;

  void _init_logger();
  void _shutdown_logger();
  void _update_logger_stats();

  void _init_alloc();
  void _stop_alloc();

  void _pad_bl(bufferlist& bl);  ///< pad bufferlist to block size w/ zeros

  FileRef _get_file(uint64_t ino);
  void _drop_link(FileRef f);

  int _get_slow_device_id() { return bdev[BDEV_SLOW] ? BDEV_SLOW : BDEV_DB; }
  const char* get_device_name(unsigned id);
  int _expand_slow_device(uint64_t min_size, PExtentVector& extents);
  int _allocate(uint8_t bdev, uint64_t len,
                bluefs_fnode_t* node);
  int _allocate_without_fallback(uint8_t id, uint64_t len,
                                 PExtentVector* extents);

  int _flush_range(FileWriter *h, uint64_t offset, uint64_t length);
  int _flush(FileWriter *h, bool force);
  int _fsync(FileWriter *h, std::unique_lock<ceph::mutex>& l);

#ifdef HAVE_LIBAIO
  void _claim_completed_aios(FileWriter *h, list<aio_t> *ls);
  void wait_for_aio(FileWriter *h);  // safe to call without a lock
#endif

  int _flush_and_sync_log(std::unique_lock<ceph::mutex>& l,
                          uint64_t want_seq = 0,
                          uint64_t jump_to = 0);
  uint64_t _estimate_log_size();
  bool _should_compact_log();

  enum {
    REMOVE_DB = 1,
    REMOVE_WAL = 2,
    RENAME_SLOW2DB = 4,
    RENAME_DB2SLOW = 8,
  };
  void _compact_log_dump_metadata(bluefs_transaction_t *t,
                                  int flags);
  void _compact_log_sync();
  void _compact_log_async(std::unique_lock<ceph::mutex>& l);

  void _rewrite_log_and_layout_sync(bool allocate_with_fallback,
                                    int super_dev,
                                    int log_dev,
                                    int new_log_dev,
                                    int flags,
                                    std::optional<bluefs_layout_t> layout);

  //void _aio_finish(void *priv);

  void _flush_bdev_safely(FileWriter *h);
  void flush_bdev();  // this is safe to call without a lock
  void flush_bdev(std::array<bool, MAX_BDEV>& dirty_bdevs);  // this is safe to call without a lock

  int _preallocate(FileRef f, uint64_t off, uint64_t len);
  int _truncate(FileWriter *h, uint64_t off);

  int _read(
    FileReader *h,          ///< [in] read from here
    FileReaderBuffer *buf,  ///< [in] reader state
    uint64_t offset,        ///< [in] offset
    size_t len,             ///< [in] this many bytes
    bufferlist *outbl,      ///< [out] optional: reference the result here
    char *out);             ///< [out] optional: or copy it here
  int _read_random(
    FileReader *h,          ///< [in] read from here
    uint64_t offset,        ///< [in] offset
    uint64_t len,           ///< [in] this many bytes
    char *out);             ///< [out] optional: or copy it here

  void _invalidate_cache(FileRef f, uint64_t offset, uint64_t length);

  int _open_super();
  int _write_super(int dev);
  int _replay(bool noop, bool to_stdout = false);  ///< replay journal

  FileWriter *_create_writer(FileRef f);
  void _close_writer(FileWriter *h);

  // always put the super in the second 4k block.  FIXME should this be
  // block size independent?
  unsigned get_super_offset() {
    return 4096;
  }
  unsigned get_super_length() {
    return 4096;
  }

  void _add_block_extent(unsigned bdev, uint64_t offset, uint64_t len);

public:
  BlueFS(CephContext* cct);
  ~BlueFS();

  // the super is always stored on bdev 0
  int mkfs(uuid_d osd_uuid, const bluefs_layout_t& layout);
  int mount();
  int maybe_verify_layout(const bluefs_layout_t& layout) const;
  void umount();
  int prepare_new_device(int id, const bluefs_layout_t& layout);

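  /* Illustrative example only (details such as the initial extent gift and the
   * exact bluefs_layout_t are elided): a standalone instance is typically
   * created and opened roughly like this:
   *
   *   BlueFS fs(cct);
   *   fs.add_block_device(BlueFS::BDEV_DB, path_db, false);
   *   // ... gift an initial extent on BDEV_DB via add_block_extent() ...
   *   int r = fs.mkfs(osd_uuid, layout);  // writes the superblock on bdev 0
   *   ceph_assert(r == 0);
   *   r = fs.mount();
   *   ceph_assert(r == 0);
   *   // ... use the filesystem ...
   *   fs.umount();
   */
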
  int log_dump();

  void collect_metadata(map<string,string> *pm, unsigned skip_bdev_id);
  void get_devices(set<string> *ls);
  uint64_t get_alloc_size(int id) {
    return alloc_size[id];
  }
  int fsck();

  int device_migrate_to_new(
    CephContext *cct,
    const set<int>& devs_source,
    int dev_target,
    const bluefs_layout_t& layout);
  int device_migrate_to_existing(
    CephContext *cct,
    const set<int>& devs_source,
    int dev_target,
    const bluefs_layout_t& layout);

  uint64_t get_used();
  uint64_t get_total(unsigned id);
  uint64_t get_free(unsigned id);
  void get_usage(vector<pair<uint64_t,uint64_t>> *usage);  // [<free,total> ...]
  void dump_perf_counters(Formatter *f);

  void dump_block_extents(ostream& out);

  /// get current extents that we own for given block device
  int get_block_extents(unsigned id, interval_set<uint64_t> *extents);

  int open_for_write(
    const string& dir,
    const string& file,
    FileWriter **h,
    bool overwrite);

  int open_for_read(
    const string& dir,
    const string& file,
    FileReader **h,
    bool random = false);

  void close_writer(FileWriter *h) {
    std::lock_guard l(lock);
    _close_writer(h);
  }

  int rename(const string& old_dir, const string& old_file,
             const string& new_dir, const string& new_file);

  int readdir(const string& dirname, vector<string> *ls);

  int unlink(const string& dirname, const string& filename);
  int mkdir(const string& dirname);
  int rmdir(const string& dirname);
  bool wal_is_rotational();

  bool dir_exists(const string& dirname);
  int stat(const string& dirname, const string& filename,
           uint64_t *size, utime_t *mtime);

  int lock_file(const string& dirname, const string& filename, FileLock **p);
  int unlock_file(FileLock *l);
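
  /* Illustrative example only: advisory file locking through the public API,
   * assuming a mounted instance `fs`:
   *
   *   BlueFS::FileLock *l = nullptr;
   *   if (fs.lock_file("db", "LOCK", &l) == 0) {
   *     // ... exclusive section ...
   *     fs.unlock_file(l);
   *   }
   */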

  void flush_log();
  void compact_log();

  /// sync any uncommitted state to disk
  void sync_metadata();

  void set_slow_device_expander(BlueFSDeviceExpander* a) {
    slow_dev_expander = a;
  }
  int add_block_device(unsigned bdev, const string& path, bool trim,
                       bool shared_with_bluestore=false);
  bool bdev_support_label(unsigned id);
  uint64_t get_block_device_size(unsigned bdev);

  /// gift more block space
  void add_block_extent(unsigned bdev, uint64_t offset, uint64_t len) {
    std::unique_lock l(lock);
    _add_block_extent(bdev, offset, len);
    int r = _flush_and_sync_log(l);
    ceph_assert(r == 0);
  }

  /// reclaim block space
  int reclaim_blocks(unsigned bdev, uint64_t want,
                     PExtentVector *extents);
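
  /* Illustrative example only: gifting space to BlueFS and taking some back,
   * assuming a mounted instance `fs` and placeholder offset/length/want values:
   *
   *   fs.add_block_extent(BlueFS::BDEV_SLOW, offset, length);  // gift [offset, offset+length)
   *   ...
   *   PExtentVector released;
   *   int r = fs.reclaim_blocks(BlueFS::BDEV_SLOW, want, &released);
   *   // on success, `released` lists extents BlueFS has given up
   */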

  // handler for discard event
  void handle_discard(unsigned dev, interval_set<uint64_t>& to_release);

  void flush(FileWriter *h) {
    std::lock_guard l(lock);
    _flush(h, false);
  }
  void flush_range(FileWriter *h, uint64_t offset, uint64_t length) {
    std::lock_guard l(lock);
    _flush_range(h, offset, length);
  }
  int fsync(FileWriter *h) {
    std::unique_lock l(lock);
    return _fsync(h, l);
  }
  int read(FileReader *h, FileReaderBuffer *buf, uint64_t offset, size_t len,
           bufferlist *outbl, char *out) {
    // no need to hold the global lock here; we only touch h and
    // h->file, and read vs write or delete is already protected (via
    // atomics and asserts).
    return _read(h, buf, offset, len, outbl, out);
  }
  int read_random(FileReader *h, uint64_t offset, size_t len,
                  char *out) {
    // no need to hold the global lock here; we only touch h and
    // h->file, and read vs write or delete is already protected (via
    // atomics and asserts).
    return _read_random(h, offset, len, out);
  }
  void invalidate_cache(FileRef f, uint64_t offset, uint64_t len) {
    std::lock_guard l(lock);
    _invalidate_cache(f, offset, len);
  }
  int preallocate(FileRef f, uint64_t offset, uint64_t len) {
    std::lock_guard l(lock);
    return _preallocate(f, offset, len);
  }
  int truncate(FileWriter *h, uint64_t offset) {
    std::lock_guard l(lock);
    return _truncate(h, offset);
  }

};

#endif