Bug Summary

File: home/bhubbard/working/src/ceph/src/rocksdb/env/io_posix.cc
Warning: line 629, column 5
Null pointer passed as an argument to a 'nonnull' parameter (the destination 'dst_' of memcpy in PosixMmapFile::Append)

Annotated Source Code

[?] Use j/k keys for keyboard navigation

1// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2// This source code is licensed under both the GPLv2 (found in the
3// COPYING file in the root directory) and Apache 2.0 License
4// (found in the LICENSE.Apache file in the root directory).
5//
6// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7// Use of this source code is governed by a BSD-style license that can be
8// found in the LICENSE file. See the AUTHORS file for names of contributors.
9
10#ifdef ROCKSDB_LIB_IO_POSIX1
11#include "env/io_posix.h"
12#include <errno(*__errno_location ()).h>
13#include <fcntl.h>
14#include <algorithm>
15#if defined(OS_LINUX1)
16#include <linux/fs.h>
17#endif
18#include <stdio.h>
19#include <stdlib.h>
20#include <string.h>
21#include <sys/ioctl.h>
22#include <sys/mman.h>
23#include <sys/stat.h>
24#include <sys/types.h>
25#ifdef OS_LINUX1
26#include <sys/statfs.h>
27#include <sys/syscall.h>
28#include <sys/sysmacros.h>
29#endif
30#include "env/posix_logger.h"
31#include "monitoring/iostats_context_imp.h"
32#include "port/port.h"
33#include "rocksdb/slice.h"
34#include "util/coding.h"
35#include "util/string_util.h"
36#include "util/sync_point.h"
37
38#if defined(OS_LINUX1) && !defined(F_SET_RW_HINT(1024 + 12))
39#define F_LINUX_SPECIFIC_BASE1024 1024
40#define F_SET_RW_HINT(1024 + 12) (F_LINUX_SPECIFIC_BASE1024 + 12)
41#endif
42
43namespace rocksdb {
44
// Portability shim around posix_fadvise(). When OS_LINUX is defined the call
// is forwarded unchanged and its return value is handed back; otherwise the
// arguments are ignored and 0 is returned.
int Fadvise(int fd, off_t offset, size_t len, int advice) {
#ifndef OS_LINUX
  // No fadvise on this platform: mark the parameters as used and do nothing.
  static_cast<void>(fd);
  static_cast<void>(offset);
  static_cast<void>(len);
  static_cast<void>(advice);
  return 0;
#else
  return posix_fadvise(fd, offset, len, advice);
#endif
}
58
59namespace {
60size_t GetLogicalBufferSize(int __attribute__((__unused__)) fd) {
61#ifdef OS_LINUX1
62 struct stat buf;
63 int result = fstat(fd, &buf);
64 if (result == -1) {
65 return kDefaultPageSize;
66 }
67 if (major(buf.st_dev)gnu_dev_major (buf.st_dev) == 0) {
68 // Unnamed devices (e.g. non-device mounts), reserved as null device number.
69 // These don't have an entry in /sys/dev/block/. Return a sensible default.
70 return kDefaultPageSize;
71 }
72
73 // Reading queue/logical_block_size does not require special permissions.
74 const int kBufferSize = 100;
75 char path[kBufferSize];
76 char real_path[PATH_MAX4096 + 1];
77 snprintf(path, kBufferSize, "/sys/dev/block/%u:%u", major(buf.st_dev)gnu_dev_major (buf.st_dev),
78 minor(buf.st_dev)gnu_dev_minor (buf.st_dev));
79 if (realpath(path, real_path) == nullptr) {
80 return kDefaultPageSize;
81 }
82 std::string device_dir(real_path);
83 if (!device_dir.empty() && device_dir.back() == '/') {
84 device_dir.pop_back();
85 }
86 // NOTE: sda3 and nvme0n1p1 do not have a `queue/` subdir, only the parent sda
87 // and nvme0n1 have it.
88 // $ ls -al '/sys/dev/block/8:3'
89 // lrwxrwxrwx. 1 root root 0 Jun 26 01:38 /sys/dev/block/8:3 ->
90 // ../../block/sda/sda3
91 // $ ls -al '/sys/dev/block/259:4'
92 // lrwxrwxrwx 1 root root 0 Jan 31 16:04 /sys/dev/block/259:4 ->
93 // ../../devices/pci0000:17/0000:17:00.0/0000:18:00.0/nvme/nvme0/nvme0n1/nvme0n1p1
94 size_t parent_end = device_dir.rfind('/', device_dir.length() - 1);
95 if (parent_end == std::string::npos) {
96 return kDefaultPageSize;
97 }
98 size_t parent_begin = device_dir.rfind('/', parent_end - 1);
99 if (parent_begin == std::string::npos) {
100 return kDefaultPageSize;
101 }
102 std::string parent =
103 device_dir.substr(parent_begin + 1, parent_end - parent_begin - 1);
104 std::string child = device_dir.substr(parent_end + 1, std::string::npos);
105 if (parent != "block" &&
106 (child.compare(0, 4, "nvme") || child.find('p') != std::string::npos)) {
107 device_dir = device_dir.substr(0, parent_end);
108 }
109 std::string fname = device_dir + "/queue/logical_block_size";
110 FILE* fp;
111 size_t size = 0;
112 fp = fopen(fname.c_str(), "r");
113 if (fp != nullptr) {
114 char* line = nullptr;
115 size_t len = 0;
116 if (getline(&line, &len, fp) != -1) {
117 sscanf(line, "%zu", &size);
118 }
119 free(line);
120 fclose(fp);
121 }
122 if (size != 0 && (size & (size - 1)) == 0) {
123 return size;
124 }
125#endif
126 return kDefaultPageSize;
127}
128} // namespace
129
130/*
131 * DirectIOHelper
132 */
133#ifndef NDEBUG1
134namespace {
135
// True when the byte count or offset `off` is an exact multiple of
// `sector_size`.
bool IsSectorAligned(const size_t off, size_t sector_size) {
  const size_t remainder = off % sector_size;
  return remainder == 0;
}

// True when the address stored in `ptr` is an exact multiple of
// `sector_size`.
bool IsSectorAligned(const void* ptr, size_t sector_size) {
  const uintptr_t address = uintptr_t(ptr);
  return address % sector_size == 0;
}
143
144}
145#endif
146
147/*
148 * PosixSequentialFile
149 */
150PosixSequentialFile::PosixSequentialFile(const std::string& fname, FILE* file,
151 int fd, const EnvOptions& options)
152 : filename_(fname),
153 file_(file),
154 fd_(fd),
155 use_direct_io_(options.use_direct_reads),
156 logical_sector_size_(GetLogicalBufferSize(fd_)) {
157 assert(!options.use_direct_reads || !options.use_mmap_reads)(static_cast<void> (0));
158}
159
160PosixSequentialFile::~PosixSequentialFile() {
161 if (!use_direct_io()) {
162 assert(file_)(static_cast<void> (0));
163 fclose(file_);
164 } else {
165 assert(fd_)(static_cast<void> (0));
166 close(fd_);
167 }
168}
169
170Status PosixSequentialFile::Read(size_t n, Slice* result, char* scratch) {
171 assert(result != nullptr && !use_direct_io())(static_cast<void> (0));
172 Status s;
173 size_t r = 0;
174 do {
175 r = fread_unlocked(scratch, 1, n, file_);
176 } while (r == 0 && ferror(file_) && errno(*__errno_location ()) == EINTR4);
177 *result = Slice(scratch, r);
178 if (r < n) {
179 if (feof(file_)) {
180 // We leave status as ok if we hit the end of the file
181 // We also clear the error so that the reads can continue
182 // if a new data is written to the file
183 clearerr(file_);
184 } else {
185 // A partial read with an error: return a non-ok status
186 s = IOError("While reading file sequentially", filename_, errno(*__errno_location ()));
187 }
188 }
189 return s;
190}
191
192Status PosixSequentialFile::PositionedRead(uint64_t offset, size_t n,
193 Slice* result, char* scratch) {
194 assert(use_direct_io())(static_cast<void> (0));
195 assert(IsSectorAligned(offset, GetRequiredBufferAlignment()))(static_cast<void> (0));
196 assert(IsSectorAligned(n, GetRequiredBufferAlignment()))(static_cast<void> (0));
197 assert(IsSectorAligned(scratch, GetRequiredBufferAlignment()))(static_cast<void> (0));
198
199 Status s;
200 ssize_t r = -1;
201 size_t left = n;
202 char* ptr = scratch;
203 while (left > 0) {
204 r = pread(fd_, ptr, left, static_cast<off_t>(offset));
205 if (r <= 0) {
206 if (r == -1 && errno(*__errno_location ()) == EINTR4) {
207 continue;
208 }
209 break;
210 }
211 ptr += r;
212 offset += r;
213 left -= r;
214 if (r % static_cast<ssize_t>(GetRequiredBufferAlignment()) != 0) {
215 // Bytes reads don't fill sectors. Should only happen at the end
216 // of the file.
217 break;
218 }
219 }
220 if (r < 0) {
221 // An error: return a non-ok status
222 s = IOError(
223 "While pread " + ToString(n) + " bytes from offset " + ToString(offset),
224 filename_, errno(*__errno_location ()));
225 }
226 *result = Slice(scratch, (r < 0) ? 0 : n - left);
227 return s;
228}
229
230Status PosixSequentialFile::Skip(uint64_t n) {
231 if (fseek(file_, static_cast<long int>(n), SEEK_CUR1)) {
232 return IOError("While fseek to skip " + ToString(n) + " bytes", filename_,
233 errno(*__errno_location ()));
234 }
235 return Status::OK();
236}
237
238Status PosixSequentialFile::InvalidateCache(size_t offset, size_t length) {
239#ifndef OS_LINUX1
240 (void)offset;
241 (void)length;
242 return Status::OK();
243#else
244 if (!use_direct_io()) {
245 // free OS pages
246 int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED4);
247 if (ret != 0) {
248 return IOError("While fadvise NotNeeded offset " + ToString(offset) +
249 " len " + ToString(length),
250 filename_, errno(*__errno_location ()));
251 }
252 }
253 return Status::OK();
254#endif
255}
256
257/*
258 * PosixRandomAccessFile
259 */
260#if defined(OS_LINUX1)
261size_t PosixHelper::GetUniqueIdFromFile(int fd, char* id, size_t max_size) {
262 if (max_size < kMaxVarint64Length * 3) {
263 return 0;
264 }
265
266 struct stat buf;
267 int result = fstat(fd, &buf);
268 if (result == -1) {
269 return 0;
270 }
271
272 long version = 0;
273 result = ioctl(fd, FS_IOC_GETVERSION(((2U) << (((0 +8)+8)+14)) | ((('v')) << (0 +8)) |
(((1)) << 0) | ((((sizeof(long)))) << ((0 +8)+8)
))
, &version);
274 TEST_SYNC_POINT_CALLBACK("GetUniqueIdFromFile:FS_IOC_GETVERSION", &result);
275 if (result == -1) {
276 return 0;
277 }
278 uint64_t uversion = (uint64_t)version;
279
280 char* rid = id;
281 rid = EncodeVarint64(rid, buf.st_dev);
282 rid = EncodeVarint64(rid, buf.st_ino);
283 rid = EncodeVarint64(rid, uversion);
284 assert(rid >= id)(static_cast<void> (0));
285 return static_cast<size_t>(rid - id);
286}
287#endif
288
289#if defined(OS_MACOSX) || defined(OS_AIX)
290size_t PosixHelper::GetUniqueIdFromFile(int fd, char* id, size_t max_size) {
291 if (max_size < kMaxVarint64Length * 3) {
292 return 0;
293 }
294
295 struct stat buf;
296 int result = fstat(fd, &buf);
297 if (result == -1) {
298 return 0;
299 }
300
301 char* rid = id;
302 rid = EncodeVarint64(rid, buf.st_dev);
303 rid = EncodeVarint64(rid, buf.st_ino);
304 rid = EncodeVarint64(rid, buf.st_gen);
305 assert(rid >= id)(static_cast<void> (0));
306 return static_cast<size_t>(rid - id);
307}
308#endif
309/*
310 * PosixRandomAccessFile
311 *
312 * pread() based random-access
313 */
314PosixRandomAccessFile::PosixRandomAccessFile(const std::string& fname, int fd,
315 const EnvOptions& options)
316 : filename_(fname),
317 fd_(fd),
318 use_direct_io_(options.use_direct_reads),
319 logical_sector_size_(GetLogicalBufferSize(fd_)) {
320 assert(!options.use_direct_reads || !options.use_mmap_reads)(static_cast<void> (0));
321 assert(!options.use_mmap_reads || sizeof(void*) < 8)(static_cast<void> (0));
322}
323
324PosixRandomAccessFile::~PosixRandomAccessFile() { close(fd_); }
325
326Status PosixRandomAccessFile::Read(uint64_t offset, size_t n, Slice* result,
327 char* scratch) const {
328 if (use_direct_io()) {
329 assert(IsSectorAligned(offset, GetRequiredBufferAlignment()))(static_cast<void> (0));
330 assert(IsSectorAligned(n, GetRequiredBufferAlignment()))(static_cast<void> (0));
331 assert(IsSectorAligned(scratch, GetRequiredBufferAlignment()))(static_cast<void> (0));
332 }
333 Status s;
334 ssize_t r = -1;
335 size_t left = n;
336 char* ptr = scratch;
337 while (left > 0) {
338 r = pread(fd_, ptr, left, static_cast<off_t>(offset));
339 if (r <= 0) {
340 if (r == -1 && errno(*__errno_location ()) == EINTR4) {
341 continue;
342 }
343 break;
344 }
345 ptr += r;
346 offset += r;
347 left -= r;
348 if (use_direct_io() &&
349 r % static_cast<ssize_t>(GetRequiredBufferAlignment()) != 0) {
350 // Bytes reads don't fill sectors. Should only happen at the end
351 // of the file.
352 break;
353 }
354 }
355 if (r < 0) {
356 // An error: return a non-ok status
357 s = IOError(
358 "While pread offset " + ToString(offset) + " len " + ToString(n),
359 filename_, errno(*__errno_location ()));
360 }
361 *result = Slice(scratch, (r < 0) ? 0 : n - left);
362 return s;
363}
364
365Status PosixRandomAccessFile::Prefetch(uint64_t offset, size_t n) {
366 Status s;
367 if (!use_direct_io()) {
368 ssize_t r = 0;
369#ifdef OS_LINUX1
370 r = readahead(fd_, offset, n);
371#endif
372#ifdef OS_MACOSX
373 radvisory advice;
374 advice.ra_offset = static_cast<off_t>(offset);
375 advice.ra_count = static_cast<int>(n);
376 r = fcntl(fd_, F_RDADVISE, &advice);
377#endif
378 if (r == -1) {
379 s = IOError("While prefetching offset " + ToString(offset) + " len " +
380 ToString(n),
381 filename_, errno(*__errno_location ()));
382 }
383 }
384 return s;
385}
386
387#if defined(OS_LINUX1) || defined(OS_MACOSX) || defined(OS_AIX)
388size_t PosixRandomAccessFile::GetUniqueId(char* id, size_t max_size) const {
389 return PosixHelper::GetUniqueIdFromFile(fd_, id, max_size);
390}
391#endif
392
393void PosixRandomAccessFile::Hint(AccessPattern pattern) {
394 if (use_direct_io()) {
395 return;
396 }
397 switch (pattern) {
398 case NORMAL:
399 Fadvise(fd_, 0, 0, POSIX_FADV_NORMAL0);
400 break;
401 case RANDOM:
402 Fadvise(fd_, 0, 0, POSIX_FADV_RANDOM1);
403 break;
404 case SEQUENTIAL:
405 Fadvise(fd_, 0, 0, POSIX_FADV_SEQUENTIAL2);
406 break;
407 case WILLNEED:
408 Fadvise(fd_, 0, 0, POSIX_FADV_WILLNEED3);
409 break;
410 case DONTNEED:
411 Fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED4);
412 break;
413 default:
414 assert(false)(static_cast<void> (0));
415 break;
416 }
417}
418
419Status PosixRandomAccessFile::InvalidateCache(size_t offset, size_t length) {
420 if (use_direct_io()) {
421 return Status::OK();
422 }
423#ifndef OS_LINUX1
424 (void)offset;
425 (void)length;
426 return Status::OK();
427#else
428 // free OS pages
429 int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED4);
430 if (ret == 0) {
431 return Status::OK();
432 }
433 return IOError("While fadvise NotNeeded offset " + ToString(offset) +
434 " len " + ToString(length),
435 filename_, errno(*__errno_location ()));
436#endif
437}
438
439/*
440 * PosixMmapReadableFile
441 *
442 * mmap() based random-access
443 */
444// base[0,length-1] contains the mmapped contents of the file.
445PosixMmapReadableFile::PosixMmapReadableFile(const int fd,
446 const std::string& fname,
447 void* base, size_t length,
448 const EnvOptions& options)
449 : fd_(fd), filename_(fname), mmapped_region_(base), length_(length) {
450#ifdef NDEBUG1
451 (void)options;
452#endif
453 fd_ = fd_ + 0; // suppress the warning for used variables
454 assert(options.use_mmap_reads)(static_cast<void> (0));
455 assert(!options.use_direct_reads)(static_cast<void> (0));
456}
457
458PosixMmapReadableFile::~PosixMmapReadableFile() {
459 int ret = munmap(mmapped_region_, length_);
460 if (ret != 0) {
461 fprintf(stdoutstdout, "failed to munmap %p length %" ROCKSDB_PRIszt"zu" " \n",
462 mmapped_region_, length_);
463 }
464 close(fd_);
465}
466
467Status PosixMmapReadableFile::Read(uint64_t offset, size_t n, Slice* result,
468 char* /*scratch*/) const {
469 Status s;
470 if (offset > length_) {
471 *result = Slice();
472 return IOError("While mmap read offset " + ToString(offset) +
473 " larger than file length " + ToString(length_),
474 filename_, EINVAL22);
475 } else if (offset + n > length_) {
476 n = static_cast<size_t>(length_ - offset);
477 }
478 *result = Slice(reinterpret_cast<char*>(mmapped_region_) + offset, n);
479 return s;
480}
481
482Status PosixMmapReadableFile::InvalidateCache(size_t offset, size_t length) {
483#ifndef OS_LINUX1
484 (void)offset;
485 (void)length;
486 return Status::OK();
487#else
488 // free OS pages
489 int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED4);
490 if (ret == 0) {
491 return Status::OK();
492 }
493 return IOError("While fadvise not needed. Offset " + ToString(offset) +
494 " len" + ToString(length),
495 filename_, errno(*__errno_location ()));
496#endif
497}
498
499/*
500 * PosixMmapFile
501 *
502 * We preallocate up to an extra megabyte and use memcpy to append new
503 * data to the file. This is safe since we either properly close the
504 * file before reading from it, or for log files, the reading code
505 * knows enough to skip zero suffixes.
506 */
507Status PosixMmapFile::UnmapCurrentRegion() {
508 TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0", rocksdb_kill_odds);
509 if (base_ != nullptr) {
16
Assuming the condition is true
17
Taking true branch
510 int munmap_status = munmap(base_, limit_ - base_);
511 if (munmap_status != 0) {
18
Assuming 'munmap_status' is equal to 0
19
Taking false branch
512 return IOError("While munmap", filename_, munmap_status);
513 }
514 file_offset_ += limit_ - base_;
515 base_ = nullptr;
516 limit_ = nullptr;
517 last_sync_ = nullptr;
518 dst_ = nullptr;
20
Null pointer value stored to field 'dst_'
519
520 // Increase the amount we map the next time, but capped at 1MB
521 if (map_size_ < (1 << 20)) {
21
Assuming the condition is false
22
Taking false branch
522 map_size_ *= 2;
523 }
524 }
525 return Status::OK();
526}
527
528Status PosixMmapFile::MapNewRegion() {
529#ifdef ROCKSDB_FALLOCATE_PRESENT1
530 assert(base_ == nullptr)(static_cast<void> (0));
531 TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0", rocksdb_kill_odds);
532 // we can't fallocate with FALLOC_FL_KEEP_SIZE here
533 if (allow_fallocate_) {
534 IOSTATS_TIMER_GUARD(allocate_nanos)PerfStepTimer iostats_step_timer_allocate_nanos(&(iostats_context
.allocate_nanos)); iostats_step_timer_allocate_nanos.Start();
;
535 int alloc_status = fallocate(fd_, 0, file_offset_, map_size_);
536 if (alloc_status != 0) {
537 // fallback to posix_fallocate
538 alloc_status = posix_fallocate(fd_, file_offset_, map_size_);
539 }
540 if (alloc_status != 0) {
541 return Status::IOError("Error allocating space to file : " + filename_ +
542 "Error : " + strerror(alloc_status));
543 }
544 }
545
546 TEST_KILL_RANDOM("PosixMmapFile::Append:1", rocksdb_kill_odds);
547 void* ptr = mmap(nullptr, map_size_, PROT_READ0x1 | PROT_WRITE0x2, MAP_SHARED0x01, fd_,
548 file_offset_);
549 if (ptr == MAP_FAILED((void *) -1)) {
550 return Status::IOError("MMap failed on " + filename_);
551 }
552 TEST_KILL_RANDOM("PosixMmapFile::Append:2", rocksdb_kill_odds);
553
554 base_ = reinterpret_cast<char*>(ptr);
555 limit_ = base_ + map_size_;
556 dst_ = base_;
557 last_sync_ = base_;
558 return Status::OK();
559#else
560 return Status::NotSupported("This platform doesn't support fallocate()");
561#endif
562}
563
564Status PosixMmapFile::Msync() {
565 if (dst_ == last_sync_) {
566 return Status::OK();
567 }
568 // Find the beginnings of the pages that contain the first and last
569 // bytes to be synced.
570 size_t p1 = TruncateToPageBoundary(last_sync_ - base_);
571 size_t p2 = TruncateToPageBoundary(dst_ - base_ - 1);
572 last_sync_ = dst_;
573 TEST_KILL_RANDOM("PosixMmapFile::Msync:0", rocksdb_kill_odds);
574 if (msync(base_ + p1, p2 - p1 + page_size_, MS_SYNC4) < 0) {
575 return IOError("While msync", filename_, errno(*__errno_location ()));
576 }
577 return Status::OK();
578}
579
580PosixMmapFile::PosixMmapFile(const std::string& fname, int fd, size_t page_size,
581 const EnvOptions& options)
582 : filename_(fname),
583 fd_(fd),
584 page_size_(page_size),
585 map_size_(Roundup(65536, page_size)),
586 base_(nullptr),
587 limit_(nullptr),
588 dst_(nullptr),
589 last_sync_(nullptr),
590 file_offset_(0) {
591#ifdef ROCKSDB_FALLOCATE_PRESENT1
592 allow_fallocate_ = options.allow_fallocate;
593 fallocate_with_keep_size_ = options.fallocate_with_keep_size;
594#else
595 (void)options;
596#endif
597 assert((page_size & (page_size - 1)) == 0)(static_cast<void> (0));
598 assert(options.use_mmap_writes)(static_cast<void> (0));
599 assert(!options.use_direct_writes)(static_cast<void> (0));
600}
601
602PosixMmapFile::~PosixMmapFile() {
603 if (fd_ >= 0) {
604 PosixMmapFile::Close();
605 }
606}
607
608Status PosixMmapFile::Append(const Slice& data) {
609 const char* src = data.data();
610 size_t left = data.size();
611 while (left > 0) {
1
Assuming 'left' is > 0
2
Loop condition is true. Entering loop body
6
Assuming 'left' is > 0
7
Loop condition is true. Entering loop body
11
Assuming 'left' is > 0
12
Loop condition is true. Entering loop body
612 assert(base_ <= dst_)(static_cast<void> (0));
613 assert(dst_ <= limit_)(static_cast<void> (0));
614 size_t avail = limit_ - dst_;
615 if (avail == 0) {
3
Assuming 'avail' is not equal to 0
4
Taking false branch
8
Assuming 'avail' is not equal to 0
9
Taking false branch
13
Assuming 'avail' is equal to 0
14
Taking true branch
616 Status s = UnmapCurrentRegion();
15
Calling 'PosixMmapFile::UnmapCurrentRegion'
23
Returning from 'PosixMmapFile::UnmapCurrentRegion'
617 if (!s.ok()) {
24
Taking false branch
618 return s;
619 }
620 s = MapNewRegion();
621 if (!s.ok()) {
25
Taking false branch
622 return s;
623 }
624 TEST_KILL_RANDOM("PosixMmapFile::Append:0", rocksdb_kill_odds);
625 }
626
627 size_t n = (left <= avail) ? left : avail;
5
'?' condition is false
10
'?' condition is false
26
'?' condition is false
628 assert(dst_)(static_cast<void> (0));
629 memcpy(dst_, src, n);
27
Null pointer passed as an argument to a 'nonnull' parameter
630 dst_ += n;
631 src += n;
632 left -= n;
633 }
634 return Status::OK();
635}
636
637Status PosixMmapFile::Close() {
638 Status s;
639 size_t unused = limit_ - dst_;
640
641 s = UnmapCurrentRegion();
642 if (!s.ok()) {
643 s = IOError("While closing mmapped file", filename_, errno(*__errno_location ()));
644 } else if (unused > 0) {
645 // Trim the extra space at the end of the file
646 if (ftruncate(fd_, file_offset_ - unused) < 0) {
647 s = IOError("While ftruncating mmaped file", filename_, errno(*__errno_location ()));
648 }
649 }
650
651 if (close(fd_) < 0) {
652 if (s.ok()) {
653 s = IOError("While closing mmapped file", filename_, errno(*__errno_location ()));
654 }
655 }
656
657 fd_ = -1;
658 base_ = nullptr;
659 limit_ = nullptr;
660 return s;
661}
662
663Status PosixMmapFile::Flush() { return Status::OK(); }
664
665Status PosixMmapFile::Sync() {
666 if (fdatasync(fd_) < 0) {
667 return IOError("While fdatasync mmapped file", filename_, errno(*__errno_location ()));
668 }
669
670 return Msync();
671}
672
673/**
674 * Flush data as well as metadata to stable storage.
675 */
676Status PosixMmapFile::Fsync() {
677 if (fsync(fd_) < 0) {
678 return IOError("While fsync mmaped file", filename_, errno(*__errno_location ()));
679 }
680
681 return Msync();
682}
683
684/**
685 * Get the size of valid data in the file. This will not match the
686 * size that is returned from the filesystem because we use mmap
687 * to extend file by map_size every time.
688 */
689uint64_t PosixMmapFile::GetFileSize() {
690 size_t used = dst_ - base_;
691 return file_offset_ + used;
692}
693
694Status PosixMmapFile::InvalidateCache(size_t offset, size_t length) {
695#ifndef OS_LINUX1
696 (void)offset;
697 (void)length;
698 return Status::OK();
699#else
700 // free OS pages
701 int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED4);
702 if (ret == 0) {
703 return Status::OK();
704 }
705 return IOError("While fadvise NotNeeded mmapped file", filename_, errno(*__errno_location ()));
706#endif
707}
708
709#ifdef ROCKSDB_FALLOCATE_PRESENT1
710Status PosixMmapFile::Allocate(uint64_t offset, uint64_t len) {
711 assert(offset <= std::numeric_limits<off_t>::max())(static_cast<void> (0));
712 assert(len <= std::numeric_limits<off_t>::max())(static_cast<void> (0));
713 TEST_KILL_RANDOM("PosixMmapFile::Allocate:0", rocksdb_kill_odds);
714 int alloc_status = 0;
715 if (allow_fallocate_) {
716 alloc_status = fallocate(
717 fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE1 : 0,
718 static_cast<off_t>(offset), static_cast<off_t>(len));
719 }
720 if (alloc_status == 0) {
721 return Status::OK();
722 } else {
723 return IOError(
724 "While fallocate offset " + ToString(offset) + " len " + ToString(len),
725 filename_, errno(*__errno_location ()));
726 }
727}
728#endif
729
730/*
731 * PosixWritableFile
732 *
733 * Use posix write to write data to a file.
734 */
735PosixWritableFile::PosixWritableFile(const std::string& fname, int fd,
736 const EnvOptions& options)
737 : filename_(fname),
738 use_direct_io_(options.use_direct_writes),
739 fd_(fd),
740 filesize_(0),
741 logical_sector_size_(GetLogicalBufferSize(fd_)) {
742#ifdef ROCKSDB_FALLOCATE_PRESENT1
743 allow_fallocate_ = options.allow_fallocate;
744 fallocate_with_keep_size_ = options.fallocate_with_keep_size;
745#endif
746 assert(!options.use_mmap_writes)(static_cast<void> (0));
747}
748
749PosixWritableFile::~PosixWritableFile() {
750 if (fd_ >= 0) {
751 PosixWritableFile::Close();
752 }
753}
754
755Status PosixWritableFile::Append(const Slice& data) {
756 if (use_direct_io()) {
757 assert(IsSectorAligned(data.size(), GetRequiredBufferAlignment()))(static_cast<void> (0));
758 assert(IsSectorAligned(data.data(), GetRequiredBufferAlignment()))(static_cast<void> (0));
759 }
760 const char* src = data.data();
761 size_t left = data.size();
762 while (left != 0) {
763 ssize_t done = write(fd_, src, left);
764 if (done < 0) {
765 if (errno(*__errno_location ()) == EINTR4) {
766 continue;
767 }
768 return IOError("While appending to file", filename_, errno(*__errno_location ()));
769 }
770 left -= done;
771 src += done;
772 }
773 filesize_ += data.size();
774 return Status::OK();
775}
776
777Status PosixWritableFile::PositionedAppend(const Slice& data, uint64_t offset) {
778 if (use_direct_io()) {
779 assert(IsSectorAligned(offset, GetRequiredBufferAlignment()))(static_cast<void> (0));
780 assert(IsSectorAligned(data.size(), GetRequiredBufferAlignment()))(static_cast<void> (0));
781 assert(IsSectorAligned(data.data(), GetRequiredBufferAlignment()))(static_cast<void> (0));
782 }
783 assert(offset <= std::numeric_limits<off_t>::max())(static_cast<void> (0));
784 const char* src = data.data();
785 size_t left = data.size();
786 while (left != 0) {
787 ssize_t done = pwrite(fd_, src, left, static_cast<off_t>(offset));
788 if (done < 0) {
789 if (errno(*__errno_location ()) == EINTR4) {
790 continue;
791 }
792 return IOError("While pwrite to file at offset " + ToString(offset),
793 filename_, errno(*__errno_location ()));
794 }
795 left -= done;
796 offset += done;
797 src += done;
798 }
799 filesize_ = offset;
800 return Status::OK();
801}
802
803Status PosixWritableFile::Truncate(uint64_t size) {
804 Status s;
805 int r = ftruncate(fd_, size);
806 if (r < 0) {
807 s = IOError("While ftruncate file to size " + ToString(size), filename_,
808 errno(*__errno_location ()));
809 } else {
810 filesize_ = size;
811 }
812 return s;
813}
814
815Status PosixWritableFile::Close() {
816 Status s;
817
818 size_t block_size;
819 size_t last_allocated_block;
820 GetPreallocationStatus(&block_size, &last_allocated_block);
821 if (last_allocated_block > 0) {
822 // trim the extra space preallocated at the end of the file
823 // NOTE(ljin): we probably don't want to surface failure as an IOError,
824 // but it will be nice to log these errors.
825 int dummy __attribute__((__unused__));
826 dummy = ftruncate(fd_, filesize_);
827#if defined(ROCKSDB_FALLOCATE_PRESENT1) && defined(FALLOC_FL_PUNCH_HOLE2) && \
828 !defined(TRAVIS)
829 // in some file systems, ftruncate only trims trailing space if the
830 // new file size is smaller than the current size. Calling fallocate
831 // with FALLOC_FL_PUNCH_HOLE flag to explicitly release these unused
832 // blocks. FALLOC_FL_PUNCH_HOLE is supported on at least the following
833 // filesystems:
834 // XFS (since Linux 2.6.38)
835 // ext4 (since Linux 3.0)
836 // Btrfs (since Linux 3.7)
837 // tmpfs (since Linux 3.5)
838 // We ignore error since failure of this operation does not affect
839 // correctness.
840 // TRAVIS - this code does not work on TRAVIS filesystems.
841 // the FALLOC_FL_KEEP_SIZE option is expected to not change the size
842 // of the file, but it does. Simple strace report will show that.
843 // While we work with Travis-CI team to figure out if this is a
844 // quirk of Docker/AUFS, we will comment this out.
845 struct stat file_stats;
846 int result = fstat(fd_, &file_stats);
847 // After ftruncate, we check whether ftruncate has the correct behavior.
848 // If not, we should hack it with FALLOC_FL_PUNCH_HOLE
849 if (result == 0 &&
850 (file_stats.st_size + file_stats.st_blksize - 1) /
851 file_stats.st_blksize !=
852 file_stats.st_blocks / (file_stats.st_blksize / 512)) {
853 IOSTATS_TIMER_GUARD(allocate_nanos)PerfStepTimer iostats_step_timer_allocate_nanos(&(iostats_context
.allocate_nanos)); iostats_step_timer_allocate_nanos.Start();
;
854 if (allow_fallocate_) {
855 fallocate(fd_, FALLOC_FL_KEEP_SIZE1 | FALLOC_FL_PUNCH_HOLE2, filesize_,
856 block_size * last_allocated_block - filesize_);
857 }
858 }
859#endif
860 }
861
862 if (close(fd_) < 0) {
863 s = IOError("While closing file after writing", filename_, errno(*__errno_location ()));
864 }
865 fd_ = -1;
866 return s;
867}
868
869// write out the cached data to the OS cache
870Status PosixWritableFile::Flush() { return Status::OK(); }
871
872Status PosixWritableFile::Sync() {
873 if (fdatasync(fd_) < 0) {
874 return IOError("While fdatasync", filename_, errno(*__errno_location ()));
875 }
876 return Status::OK();
877}
878
879Status PosixWritableFile::Fsync() {
880 if (fsync(fd_) < 0) {
881 return IOError("While fsync", filename_, errno(*__errno_location ()));
882 }
883 return Status::OK();
884}
885
886bool PosixWritableFile::IsSyncThreadSafe() const { return true; }
887
888uint64_t PosixWritableFile::GetFileSize() { return filesize_; }
889
890void PosixWritableFile::SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) {
891#ifdef OS_LINUX1
892// Suppress Valgrind "Unimplemented functionality" error.
893#ifndef ROCKSDB_VALGRIND_RUN
894 if (hint == write_hint_) {
895 return;
896 }
897 if (fcntl(fd_, F_SET_RW_HINT(1024 + 12), &hint) == 0) {
898 write_hint_ = hint;
899 }
900#else
901 (void)hint;
902#endif // ROCKSDB_VALGRIND_RUN
903#else
904 (void)hint;
905#endif // OS_LINUX
906}
907
908Status PosixWritableFile::InvalidateCache(size_t offset, size_t length) {
909 if (use_direct_io()) {
910 return Status::OK();
911 }
912#ifndef OS_LINUX1
913 (void)offset;
914 (void)length;
915 return Status::OK();
916#else
917 // free OS pages
918 int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED4);
919 if (ret == 0) {
920 return Status::OK();
921 }
922 return IOError("While fadvise NotNeeded", filename_, errno(*__errno_location ()));
923#endif
924}
925
926#ifdef ROCKSDB_FALLOCATE_PRESENT1
927Status PosixWritableFile::Allocate(uint64_t offset, uint64_t len) {
928 assert(offset <= std::numeric_limits<off_t>::max())(static_cast<void> (0));
929 assert(len <= std::numeric_limits<off_t>::max())(static_cast<void> (0));
930 TEST_KILL_RANDOM("PosixWritableFile::Allocate:0", rocksdb_kill_odds);
931 IOSTATS_TIMER_GUARD(allocate_nanos)PerfStepTimer iostats_step_timer_allocate_nanos(&(iostats_context
.allocate_nanos)); iostats_step_timer_allocate_nanos.Start();
;
932 int alloc_status = 0;
933 if (allow_fallocate_) {
934 alloc_status = fallocate(
935 fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE1 : 0,
936 static_cast<off_t>(offset), static_cast<off_t>(len));
937 }
938 if (alloc_status == 0) {
939 return Status::OK();
940 } else {
941 return IOError(
942 "While fallocate offset " + ToString(offset) + " len " + ToString(len),
943 filename_, errno(*__errno_location ()));
944 }
945}
946#endif
947
948#ifdef ROCKSDB_RANGESYNC_PRESENT1
949Status PosixWritableFile::RangeSync(uint64_t offset, uint64_t nbytes) {
950 assert(offset <= std::numeric_limits<off_t>::max())(static_cast<void> (0));
951 assert(nbytes <= std::numeric_limits<off_t>::max())(static_cast<void> (0));
952 if (sync_file_range(fd_, static_cast<off_t>(offset),
953 static_cast<off_t>(nbytes), SYNC_FILE_RANGE_WRITE2) == 0) {
954 return Status::OK();
955 } else {
956 return IOError("While sync_file_range offset " + ToString(offset) +
957 " bytes " + ToString(nbytes),
958 filename_, errno(*__errno_location ()));
959 }
960}
961#endif
962
963#ifdef OS_LINUX1
964size_t PosixWritableFile::GetUniqueId(char* id, size_t max_size) const {
965 return PosixHelper::GetUniqueIdFromFile(fd_, id, max_size);
966}
967#endif
968
969/*
970 * PosixRandomRWFile
971 */
972
973PosixRandomRWFile::PosixRandomRWFile(const std::string& fname, int fd,
974 const EnvOptions& /*options*/)
975 : filename_(fname), fd_(fd) {}
976
977PosixRandomRWFile::~PosixRandomRWFile() {
978 if (fd_ >= 0) {
979 Close();
980 }
981}
982
983Status PosixRandomRWFile::Write(uint64_t offset, const Slice& data) {
984 const char* src = data.data();
985 size_t left = data.size();
986 while (left != 0) {
987 ssize_t done = pwrite(fd_, src, left, offset);
988 if (done < 0) {
989 // error while writing to file
990 if (errno(*__errno_location ()) == EINTR4) {
991 // write was interrupted, try again.
992 continue;
993 }
994 return IOError(
995 "While write random read/write file at offset " + ToString(offset),
996 filename_, errno(*__errno_location ()));
997 }
998
999 // Wrote `done` bytes
1000 left -= done;
1001 offset += done;
1002 src += done;
1003 }
1004
1005 return Status::OK();
1006}
1007
1008Status PosixRandomRWFile::Read(uint64_t offset, size_t n, Slice* result,
1009 char* scratch) const {
1010 size_t left = n;
1011 char* ptr = scratch;
1012 while (left > 0) {
1013 ssize_t done = pread(fd_, ptr, left, offset);
1014 if (done < 0) {
1015 // error while reading from file
1016 if (errno(*__errno_location ()) == EINTR4) {
1017 // read was interrupted, try again.
1018 continue;
1019 }
1020 return IOError("While reading random read/write file offset " +
1021 ToString(offset) + " len " + ToString(n),
1022 filename_, errno(*__errno_location ()));
1023 } else if (done == 0) {
1024 // Nothing more to read
1025 break;
1026 }
1027
1028 // Read `done` bytes
1029 ptr += done;
1030 offset += done;
1031 left -= done;
1032 }
1033
1034 *result = Slice(scratch, n - left);
1035 return Status::OK();
1036}
1037
1038Status PosixRandomRWFile::Flush() { return Status::OK(); }
1039
1040Status PosixRandomRWFile::Sync() {
1041 if (fdatasync(fd_) < 0) {
1042 return IOError("While fdatasync random read/write file", filename_, errno(*__errno_location ()));
1043 }
1044 return Status::OK();
1045}
1046
1047Status PosixRandomRWFile::Fsync() {
1048 if (fsync(fd_) < 0) {
1049 return IOError("While fsync random read/write file", filename_, errno(*__errno_location ()));
1050 }
1051 return Status::OK();
1052}
1053
1054Status PosixRandomRWFile::Close() {
1055 if (close(fd_) < 0) {
1056 return IOError("While close random read/write file", filename_, errno(*__errno_location ()));
1057 }
1058 fd_ = -1;
1059 return Status::OK();
1060}
1061
1062PosixMemoryMappedFileBuffer::~PosixMemoryMappedFileBuffer() {
1063 // TODO should have error handling though not much we can do...
1064 munmap(this->base_, length_);
1065}
1066
1067/*
1068 * PosixDirectory
1069 */
1070
1071PosixDirectory::~PosixDirectory() { close(fd_); }
1072
1073Status PosixDirectory::Fsync() {
1074#ifndef OS_AIX
1075 if (fsync(fd_) == -1) {
1076 return IOError("While fsync", "a directory", errno(*__errno_location ()));
1077 }
1078#endif
1079 return Status::OK();
1080}
1081} // namespace rocksdb
1082#endif