| File: | home/bhubbard/working/src/ceph/src/rocksdb/env/io_posix.cc | 
| Warning: | line 629, column 5 Null pointer passed as an argument to a 'nonnull' parameter  | 
[?] Use j/k keys for keyboard navigation
| 1 | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. | |||
| 2 | // This source code is licensed under both the GPLv2 (found in the | |||
| 3 | // COPYING file in the root directory) and Apache 2.0 License | |||
| 4 | // (found in the LICENSE.Apache file in the root directory). | |||
| 5 | // | |||
| 6 | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |||
| 7 | // Use of this source code is governed by a BSD-style license that can be | |||
| 8 | // found in the LICENSE file. See the AUTHORS file for names of contributors. | |||
| 9 | ||||
| 10 | #ifdef ROCKSDB_LIB_IO_POSIX1 | |||
| 11 | #include "env/io_posix.h" | |||
| 12 | #include <errno(*__errno_location ()).h> | |||
| 13 | #include <fcntl.h> | |||
| 14 | #include <algorithm> | |||
| 15 | #if defined(OS_LINUX1) | |||
| 16 | #include <linux/fs.h> | |||
| 17 | #endif | |||
| 18 | #include <stdio.h> | |||
| 19 | #include <stdlib.h> | |||
| 20 | #include <string.h> | |||
| 21 | #include <sys/ioctl.h> | |||
| 22 | #include <sys/mman.h> | |||
| 23 | #include <sys/stat.h> | |||
| 24 | #include <sys/types.h> | |||
| 25 | #ifdef OS_LINUX1 | |||
| 26 | #include <sys/statfs.h> | |||
| 27 | #include <sys/syscall.h> | |||
| 28 | #include <sys/sysmacros.h> | |||
| 29 | #endif | |||
| 30 | #include "env/posix_logger.h" | |||
| 31 | #include "monitoring/iostats_context_imp.h" | |||
| 32 | #include "port/port.h" | |||
| 33 | #include "rocksdb/slice.h" | |||
| 34 | #include "util/coding.h" | |||
| 35 | #include "util/string_util.h" | |||
| 36 | #include "util/sync_point.h" | |||
| 37 | ||||
| 38 | #if defined(OS_LINUX1) && !defined(F_SET_RW_HINT(1024 + 12)) | |||
| 39 | #define F_LINUX_SPECIFIC_BASE1024 1024 | |||
| 40 | #define F_SET_RW_HINT(1024 + 12) (F_LINUX_SPECIFIC_BASE1024 + 12) | |||
| 41 | #endif | |||
| 42 | ||||
| 43 | namespace rocksdb { | |||
| 44 | ||||
// Portability shim around posix_fadvise(). When the build does not define
// OS_LINUX the arguments are ignored and 0 is returned; otherwise the result
// of posix_fadvise() is passed straight through to the caller.
int Fadvise(int fd, off_t offset, size_t len, int advice) {
#ifndef OS_LINUX
  // No fadvise on this platform: consume the arguments and do nothing.
  static_cast<void>(fd);
  static_cast<void>(offset);
  static_cast<void>(len);
  static_cast<void>(advice);
  return 0;
#else
  return posix_fadvise(fd, offset, len, advice);
#endif
}
| 58 | ||||
| 59 | namespace { | |||
| 60 | size_t GetLogicalBufferSize(int __attribute__((__unused__)) fd) { | |||
| 61 | #ifdef OS_LINUX1 | |||
| 62 | struct stat buf; | |||
| 63 | int result = fstat(fd, &buf); | |||
| 64 | if (result == -1) { | |||
| 65 | return kDefaultPageSize; | |||
| 66 | } | |||
| 67 | if (major(buf.st_dev)gnu_dev_major (buf.st_dev) == 0) { | |||
| 68 | // Unnamed devices (e.g. non-device mounts), reserved as null device number. | |||
| 69 | // These don't have an entry in /sys/dev/block/. Return a sensible default. | |||
| 70 | return kDefaultPageSize; | |||
| 71 | } | |||
| 72 | ||||
| 73 | // Reading queue/logical_block_size does not require special permissions. | |||
| 74 | const int kBufferSize = 100; | |||
| 75 | char path[kBufferSize]; | |||
| 76 | char real_path[PATH_MAX4096 + 1]; | |||
| 77 | snprintf(path, kBufferSize, "/sys/dev/block/%u:%u", major(buf.st_dev)gnu_dev_major (buf.st_dev), | |||
| 78 | minor(buf.st_dev)gnu_dev_minor (buf.st_dev)); | |||
| 79 | if (realpath(path, real_path) == nullptr) { | |||
| 80 | return kDefaultPageSize; | |||
| 81 | } | |||
| 82 | std::string device_dir(real_path); | |||
| 83 | if (!device_dir.empty() && device_dir.back() == '/') { | |||
| 84 | device_dir.pop_back(); | |||
| 85 | } | |||
| 86 | // NOTE: sda3 and nvme0n1p1 do not have a `queue/` subdir, only the parent sda | |||
| 87 | // and nvme0n1 have it. | |||
| 88 | // $ ls -al '/sys/dev/block/8:3' | |||
| 89 | // lrwxrwxrwx. 1 root root 0 Jun 26 01:38 /sys/dev/block/8:3 -> | |||
| 90 | // ../../block/sda/sda3 | |||
| 91 | // $ ls -al '/sys/dev/block/259:4' | |||
| 92 | // lrwxrwxrwx 1 root root 0 Jan 31 16:04 /sys/dev/block/259:4 -> | |||
| 93 | // ../../devices/pci0000:17/0000:17:00.0/0000:18:00.0/nvme/nvme0/nvme0n1/nvme0n1p1 | |||
| 94 | size_t parent_end = device_dir.rfind('/', device_dir.length() - 1); | |||
| 95 | if (parent_end == std::string::npos) { | |||
| 96 | return kDefaultPageSize; | |||
| 97 | } | |||
| 98 | size_t parent_begin = device_dir.rfind('/', parent_end - 1); | |||
| 99 | if (parent_begin == std::string::npos) { | |||
| 100 | return kDefaultPageSize; | |||
| 101 | } | |||
| 102 | std::string parent = | |||
| 103 | device_dir.substr(parent_begin + 1, parent_end - parent_begin - 1); | |||
| 104 | std::string child = device_dir.substr(parent_end + 1, std::string::npos); | |||
| 105 | if (parent != "block" && | |||
| 106 | (child.compare(0, 4, "nvme") || child.find('p') != std::string::npos)) { | |||
| 107 | device_dir = device_dir.substr(0, parent_end); | |||
| 108 | } | |||
| 109 | std::string fname = device_dir + "/queue/logical_block_size"; | |||
| 110 | FILE* fp; | |||
| 111 | size_t size = 0; | |||
| 112 | fp = fopen(fname.c_str(), "r"); | |||
| 113 | if (fp != nullptr) { | |||
| 114 | char* line = nullptr; | |||
| 115 | size_t len = 0; | |||
| 116 | if (getline(&line, &len, fp) != -1) { | |||
| 117 | sscanf(line, "%zu", &size); | |||
| 118 | } | |||
| 119 | free(line); | |||
| 120 | fclose(fp); | |||
| 121 | } | |||
| 122 | if (size != 0 && (size & (size - 1)) == 0) { | |||
| 123 | return size; | |||
| 124 | } | |||
| 125 | #endif | |||
| 126 | return kDefaultPageSize; | |||
| 127 | } | |||
| 128 | } // namespace | |||
| 129 | ||||
| 130 | /* | |||
| 131 | * DirectIOHelper | |||
| 132 | */ | |||
#ifndef NDEBUG
namespace {

// Debug-build helpers: both overloads report whether their first argument is
// an exact multiple of `sector_size`.

// Overload for byte offsets and lengths.
bool IsSectorAligned(const size_t off, size_t sector_size) {
  const size_t remainder = off % sector_size;
  return remainder == 0;
}

// Overload for buffer addresses.
bool IsSectorAligned(const void* ptr, size_t sector_size) {
  const uintptr_t address = reinterpret_cast<uintptr_t>(ptr);
  return (address % sector_size) == 0;
}

}  // namespace
#endif
| 146 | ||||
| 147 | /* | |||
| 148 | * PosixSequentialFile | |||
| 149 | */ | |||
| 150 | PosixSequentialFile::PosixSequentialFile(const std::string& fname, FILE* file, | |||
| 151 | int fd, const EnvOptions& options) | |||
| 152 | : filename_(fname), | |||
| 153 | file_(file), | |||
| 154 | fd_(fd), | |||
| 155 | use_direct_io_(options.use_direct_reads), | |||
| 156 | logical_sector_size_(GetLogicalBufferSize(fd_)) { | |||
| 157 | assert(!options.use_direct_reads || !options.use_mmap_reads)(static_cast<void> (0)); | |||
| 158 | } | |||
| 159 | ||||
| 160 | PosixSequentialFile::~PosixSequentialFile() { | |||
| 161 | if (!use_direct_io()) { | |||
| 162 | assert(file_)(static_cast<void> (0)); | |||
| 163 | fclose(file_); | |||
| 164 | } else { | |||
| 165 | assert(fd_)(static_cast<void> (0)); | |||
| 166 | close(fd_); | |||
| 167 | } | |||
| 168 | } | |||
| 169 | ||||
| 170 | Status PosixSequentialFile::Read(size_t n, Slice* result, char* scratch) { | |||
| 171 | assert(result != nullptr && !use_direct_io())(static_cast<void> (0)); | |||
| 172 | Status s; | |||
| 173 | size_t r = 0; | |||
| 174 | do { | |||
| 175 | r = fread_unlocked(scratch, 1, n, file_); | |||
| 176 | } while (r == 0 && ferror(file_) && errno(*__errno_location ()) == EINTR4); | |||
| 177 | *result = Slice(scratch, r); | |||
| 178 | if (r < n) { | |||
| 179 | if (feof(file_)) { | |||
| 180 | // We leave status as ok if we hit the end of the file | |||
| 181 | // We also clear the error so that the reads can continue | |||
| 182 | // if a new data is written to the file | |||
| 183 | clearerr(file_); | |||
| 184 | } else { | |||
| 185 | // A partial read with an error: return a non-ok status | |||
| 186 | s = IOError("While reading file sequentially", filename_, errno(*__errno_location ())); | |||
| 187 | } | |||
| 188 | } | |||
| 189 | return s; | |||
| 190 | } | |||
| 191 | ||||
| 192 | Status PosixSequentialFile::PositionedRead(uint64_t offset, size_t n, | |||
| 193 | Slice* result, char* scratch) { | |||
| 194 | assert(use_direct_io())(static_cast<void> (0)); | |||
| 195 | assert(IsSectorAligned(offset, GetRequiredBufferAlignment()))(static_cast<void> (0)); | |||
| 196 | assert(IsSectorAligned(n, GetRequiredBufferAlignment()))(static_cast<void> (0)); | |||
| 197 | assert(IsSectorAligned(scratch, GetRequiredBufferAlignment()))(static_cast<void> (0)); | |||
| 198 | ||||
| 199 | Status s; | |||
| 200 | ssize_t r = -1; | |||
| 201 | size_t left = n; | |||
| 202 | char* ptr = scratch; | |||
| 203 | while (left > 0) { | |||
| 204 | r = pread(fd_, ptr, left, static_cast<off_t>(offset)); | |||
| 205 | if (r <= 0) { | |||
| 206 | if (r == -1 && errno(*__errno_location ()) == EINTR4) { | |||
| 207 | continue; | |||
| 208 | } | |||
| 209 | break; | |||
| 210 | } | |||
| 211 | ptr += r; | |||
| 212 | offset += r; | |||
| 213 | left -= r; | |||
| 214 | if (r % static_cast<ssize_t>(GetRequiredBufferAlignment()) != 0) { | |||
| 215 | // Bytes reads don't fill sectors. Should only happen at the end | |||
| 216 | // of the file. | |||
| 217 | break; | |||
| 218 | } | |||
| 219 | } | |||
| 220 | if (r < 0) { | |||
| 221 | // An error: return a non-ok status | |||
| 222 | s = IOError( | |||
| 223 | "While pread " + ToString(n) + " bytes from offset " + ToString(offset), | |||
| 224 | filename_, errno(*__errno_location ())); | |||
| 225 | } | |||
| 226 | *result = Slice(scratch, (r < 0) ? 0 : n - left); | |||
| 227 | return s; | |||
| 228 | } | |||
| 229 | ||||
| 230 | Status PosixSequentialFile::Skip(uint64_t n) { | |||
| 231 | if (fseek(file_, static_cast<long int>(n), SEEK_CUR1)) { | |||
| 232 | return IOError("While fseek to skip " + ToString(n) + " bytes", filename_, | |||
| 233 | errno(*__errno_location ())); | |||
| 234 | } | |||
| 235 | return Status::OK(); | |||
| 236 | } | |||
| 237 | ||||
| 238 | Status PosixSequentialFile::InvalidateCache(size_t offset, size_t length) { | |||
| 239 | #ifndef OS_LINUX1 | |||
| 240 | (void)offset; | |||
| 241 | (void)length; | |||
| 242 | return Status::OK(); | |||
| 243 | #else | |||
| 244 | if (!use_direct_io()) { | |||
| 245 | // free OS pages | |||
| 246 | int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED4); | |||
| 247 | if (ret != 0) { | |||
| 248 | return IOError("While fadvise NotNeeded offset " + ToString(offset) + | |||
| 249 | " len " + ToString(length), | |||
| 250 | filename_, errno(*__errno_location ())); | |||
| 251 | } | |||
| 252 | } | |||
| 253 | return Status::OK(); | |||
| 254 | #endif | |||
| 255 | } | |||
| 256 | ||||
| 257 | /* | |||
| 258 | * PosixRandomAccessFile | |||
| 259 | */ | |||
#if defined(OS_LINUX)
// Writes an identifier for the file behind `fd` into `id`: the varint64
// encodings of st_dev, st_ino and the FS_IOC_GETVERSION value, in that order.
// Returns the number of bytes written, or 0 when `max_size` is smaller than
// three maximum-length varints or when fstat()/ioctl() fails.
size_t PosixHelper::GetUniqueIdFromFile(int fd, char* id, size_t max_size) {
  if (max_size < kMaxVarint64Length * 3) {
    return 0;
  }

  struct stat st;
  int result = fstat(fd, &st);
  if (result == -1) {
    return 0;
  }

  long version = 0;
  result = ioctl(fd, FS_IOC_GETVERSION, &version);
  // The sync point receives a pointer to `result`, so it must be consulted
  // only after this callback.
  TEST_SYNC_POINT_CALLBACK("GetUniqueIdFromFile:FS_IOC_GETVERSION", &result);
  if (result == -1) {
    return 0;
  }
  const uint64_t uversion = static_cast<uint64_t>(version);

  char* end = EncodeVarint64(id, st.st_dev);
  end = EncodeVarint64(end, st.st_ino);
  end = EncodeVarint64(end, uversion);
  assert(end >= id);
  return static_cast<size_t>(end - id);
}
#endif
| 288 | ||||
#if defined(OS_MACOSX) || defined(OS_AIX)
// Writes an identifier for the file behind `fd` into `id`: the varint64
// encodings of st_dev, st_ino and st_gen, in that order. Returns the number
// of bytes written, or 0 when `max_size` is smaller than three maximum-length
// varints or when fstat() fails.
size_t PosixHelper::GetUniqueIdFromFile(int fd, char* id, size_t max_size) {
  if (max_size < kMaxVarint64Length * 3) {
    return 0;
  }

  struct stat st;
  if (fstat(fd, &st) == -1) {
    return 0;
  }

  char* end = EncodeVarint64(id, st.st_dev);
  end = EncodeVarint64(end, st.st_ino);
  end = EncodeVarint64(end, st.st_gen);
  assert(end >= id);
  return static_cast<size_t>(end - id);
}
#endif
| 309 | /* | |||
| 310 | * PosixRandomAccessFile | |||
| 311 | * | |||
| 312 | * pread() based random-access | |||
| 313 | */ | |||
| 314 | PosixRandomAccessFile::PosixRandomAccessFile(const std::string& fname, int fd, | |||
| 315 | const EnvOptions& options) | |||
| 316 | : filename_(fname), | |||
| 317 | fd_(fd), | |||
| 318 | use_direct_io_(options.use_direct_reads), | |||
| 319 | logical_sector_size_(GetLogicalBufferSize(fd_)) { | |||
| 320 | assert(!options.use_direct_reads || !options.use_mmap_reads)(static_cast<void> (0)); | |||
| 321 | assert(!options.use_mmap_reads || sizeof(void*) < 8)(static_cast<void> (0)); | |||
| 322 | } | |||
| 323 | ||||
| 324 | PosixRandomAccessFile::~PosixRandomAccessFile() { close(fd_); } | |||
| 325 | ||||
| 326 | Status PosixRandomAccessFile::Read(uint64_t offset, size_t n, Slice* result, | |||
| 327 | char* scratch) const { | |||
| 328 | if (use_direct_io()) { | |||
| 329 | assert(IsSectorAligned(offset, GetRequiredBufferAlignment()))(static_cast<void> (0)); | |||
| 330 | assert(IsSectorAligned(n, GetRequiredBufferAlignment()))(static_cast<void> (0)); | |||
| 331 | assert(IsSectorAligned(scratch, GetRequiredBufferAlignment()))(static_cast<void> (0)); | |||
| 332 | } | |||
| 333 | Status s; | |||
| 334 | ssize_t r = -1; | |||
| 335 | size_t left = n; | |||
| 336 | char* ptr = scratch; | |||
| 337 | while (left > 0) { | |||
| 338 | r = pread(fd_, ptr, left, static_cast<off_t>(offset)); | |||
| 339 | if (r <= 0) { | |||
| 340 | if (r == -1 && errno(*__errno_location ()) == EINTR4) { | |||
| 341 | continue; | |||
| 342 | } | |||
| 343 | break; | |||
| 344 | } | |||
| 345 | ptr += r; | |||
| 346 | offset += r; | |||
| 347 | left -= r; | |||
| 348 | if (use_direct_io() && | |||
| 349 | r % static_cast<ssize_t>(GetRequiredBufferAlignment()) != 0) { | |||
| 350 | // Bytes reads don't fill sectors. Should only happen at the end | |||
| 351 | // of the file. | |||
| 352 | break; | |||
| 353 | } | |||
| 354 | } | |||
| 355 | if (r < 0) { | |||
| 356 | // An error: return a non-ok status | |||
| 357 | s = IOError( | |||
| 358 | "While pread offset " + ToString(offset) + " len " + ToString(n), | |||
| 359 | filename_, errno(*__errno_location ())); | |||
| 360 | } | |||
| 361 | *result = Slice(scratch, (r < 0) ? 0 : n - left); | |||
| 362 | return s; | |||
| 363 | } | |||
| 364 | ||||
| 365 | Status PosixRandomAccessFile::Prefetch(uint64_t offset, size_t n) { | |||
| 366 | Status s; | |||
| 367 | if (!use_direct_io()) { | |||
| 368 | ssize_t r = 0; | |||
| 369 | #ifdef OS_LINUX1 | |||
| 370 | r = readahead(fd_, offset, n); | |||
| 371 | #endif | |||
| 372 | #ifdef OS_MACOSX | |||
| 373 | radvisory advice; | |||
| 374 | advice.ra_offset = static_cast<off_t>(offset); | |||
| 375 | advice.ra_count = static_cast<int>(n); | |||
| 376 | r = fcntl(fd_, F_RDADVISE, &advice); | |||
| 377 | #endif | |||
| 378 | if (r == -1) { | |||
| 379 | s = IOError("While prefetching offset " + ToString(offset) + " len " + | |||
| 380 | ToString(n), | |||
| 381 | filename_, errno(*__errno_location ())); | |||
| 382 | } | |||
| 383 | } | |||
| 384 | return s; | |||
| 385 | } | |||
| 386 | ||||
#if defined(OS_LINUX) || defined(OS_MACOSX) || defined(OS_AIX)
// Forwards to PosixHelper::GetUniqueIdFromFile() for this object's
// descriptor and returns its result unchanged.
size_t PosixRandomAccessFile::GetUniqueId(char* id, size_t max_size) const {
  const size_t id_length = PosixHelper::GetUniqueIdFromFile(fd_, id, max_size);
  return id_length;
}
#endif
| 392 | ||||
| 393 | void PosixRandomAccessFile::Hint(AccessPattern pattern) { | |||
| 394 | if (use_direct_io()) { | |||
| 395 | return; | |||
| 396 | } | |||
| 397 | switch (pattern) { | |||
| 398 | case NORMAL: | |||
| 399 | Fadvise(fd_, 0, 0, POSIX_FADV_NORMAL0); | |||
| 400 | break; | |||
| 401 | case RANDOM: | |||
| 402 | Fadvise(fd_, 0, 0, POSIX_FADV_RANDOM1); | |||
| 403 | break; | |||
| 404 | case SEQUENTIAL: | |||
| 405 | Fadvise(fd_, 0, 0, POSIX_FADV_SEQUENTIAL2); | |||
| 406 | break; | |||
| 407 | case WILLNEED: | |||
| 408 | Fadvise(fd_, 0, 0, POSIX_FADV_WILLNEED3); | |||
| 409 | break; | |||
| 410 | case DONTNEED: | |||
| 411 | Fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED4); | |||
| 412 | break; | |||
| 413 | default: | |||
| 414 | assert(false)(static_cast<void> (0)); | |||
| 415 | break; | |||
| 416 | } | |||
| 417 | } | |||
| 418 | ||||
| 419 | Status PosixRandomAccessFile::InvalidateCache(size_t offset, size_t length) { | |||
| 420 | if (use_direct_io()) { | |||
| 421 | return Status::OK(); | |||
| 422 | } | |||
| 423 | #ifndef OS_LINUX1 | |||
| 424 | (void)offset; | |||
| 425 | (void)length; | |||
| 426 | return Status::OK(); | |||
| 427 | #else | |||
| 428 | // free OS pages | |||
| 429 | int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED4); | |||
| 430 | if (ret == 0) { | |||
| 431 | return Status::OK(); | |||
| 432 | } | |||
| 433 | return IOError("While fadvise NotNeeded offset " + ToString(offset) + | |||
| 434 | " len " + ToString(length), | |||
| 435 | filename_, errno(*__errno_location ())); | |||
| 436 | #endif | |||
| 437 | } | |||
| 438 | ||||
| 439 | /* | |||
| 440 | * PosixMmapReadableFile | |||
| 441 | * | |||
| 442 | * mmap() based random-access | |||
| 443 | */ | |||
| 444 | // base[0,length-1] contains the mmapped contents of the file. | |||
| 445 | PosixMmapReadableFile::PosixMmapReadableFile(const int fd, | |||
| 446 | const std::string& fname, | |||
| 447 | void* base, size_t length, | |||
| 448 | const EnvOptions& options) | |||
| 449 | : fd_(fd), filename_(fname), mmapped_region_(base), length_(length) { | |||
| 450 | #ifdef NDEBUG1 | |||
| 451 | (void)options; | |||
| 452 | #endif | |||
| 453 | fd_ = fd_ + 0; // suppress the warning for used variables | |||
| 454 | assert(options.use_mmap_reads)(static_cast<void> (0)); | |||
| 455 | assert(!options.use_direct_reads)(static_cast<void> (0)); | |||
| 456 | } | |||
| 457 | ||||
| 458 | PosixMmapReadableFile::~PosixMmapReadableFile() { | |||
| 459 | int ret = munmap(mmapped_region_, length_); | |||
| 460 | if (ret != 0) { | |||
| 461 | fprintf(stdoutstdout, "failed to munmap %p length %" ROCKSDB_PRIszt"zu" " \n", | |||
| 462 | mmapped_region_, length_); | |||
| 463 | } | |||
| 464 | close(fd_); | |||
| 465 | } | |||
| 466 | ||||
| 467 | Status PosixMmapReadableFile::Read(uint64_t offset, size_t n, Slice* result, | |||
| 468 | char* /*scratch*/) const { | |||
| 469 | Status s; | |||
| 470 | if (offset > length_) { | |||
| 471 | *result = Slice(); | |||
| 472 | return IOError("While mmap read offset " + ToString(offset) + | |||
| 473 | " larger than file length " + ToString(length_), | |||
| 474 | filename_, EINVAL22); | |||
| 475 | } else if (offset + n > length_) { | |||
| 476 | n = static_cast<size_t>(length_ - offset); | |||
| 477 | } | |||
| 478 | *result = Slice(reinterpret_cast<char*>(mmapped_region_) + offset, n); | |||
| 479 | return s; | |||
| 480 | } | |||
| 481 | ||||
| 482 | Status PosixMmapReadableFile::InvalidateCache(size_t offset, size_t length) { | |||
| 483 | #ifndef OS_LINUX1 | |||
| 484 | (void)offset; | |||
| 485 | (void)length; | |||
| 486 | return Status::OK(); | |||
| 487 | #else | |||
| 488 | // free OS pages | |||
| 489 | int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED4); | |||
| 490 | if (ret == 0) { | |||
| 491 | return Status::OK(); | |||
| 492 | } | |||
| 493 | return IOError("While fadvise not needed. Offset " + ToString(offset) + | |||
| 494 | " len" + ToString(length), | |||
| 495 | filename_, errno(*__errno_location ())); | |||
| 496 | #endif | |||
| 497 | } | |||
| 498 | ||||
| 499 | /* | |||
| 500 | * PosixMmapFile | |||
| 501 | * | |||
| 502 | * We preallocate up to an extra megabyte and use memcpy to append new | |||
| 503 | * data to the file. This is safe since we either properly close the | |||
| 504 | * file before reading from it, or for log files, the reading code | |||
| 505 | * knows enough to skip zero suffixes. | |||
| 506 | */ | |||
| 507 | Status PosixMmapFile::UnmapCurrentRegion() { | |||
| 508 | TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0", rocksdb_kill_odds); | |||
| 509 | if (base_ != nullptr) { | |||
| 510 | int munmap_status = munmap(base_, limit_ - base_); | |||
| 511 | if (munmap_status != 0) { | |||
| 512 | return IOError("While munmap", filename_, munmap_status); | |||
| 513 | } | |||
| 514 | file_offset_ += limit_ - base_; | |||
| 515 | base_ = nullptr; | |||
| 516 | limit_ = nullptr; | |||
| 517 | last_sync_ = nullptr; | |||
| 518 | dst_ = nullptr; | |||
| 519 | ||||
| 520 | // Increase the amount we map the next time, but capped at 1MB | |||
| 521 | if (map_size_ < (1 << 20)) { | |||
| 522 | map_size_ *= 2; | |||
| 523 | } | |||
| 524 | } | |||
| 525 | return Status::OK(); | |||
| 526 | } | |||
| 527 | ||||
| 528 | Status PosixMmapFile::MapNewRegion() { | |||
| 529 | #ifdef ROCKSDB_FALLOCATE_PRESENT1 | |||
| 530 | assert(base_ == nullptr)(static_cast<void> (0)); | |||
| 531 | TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0", rocksdb_kill_odds); | |||
| 532 | // we can't fallocate with FALLOC_FL_KEEP_SIZE here | |||
| 533 | if (allow_fallocate_) { | |||
| 534 |     IOSTATS_TIMER_GUARD(allocate_nanos)PerfStepTimer iostats_step_timer_allocate_nanos(&(iostats_context .allocate_nanos)); iostats_step_timer_allocate_nanos.Start();;  | |||
| 535 | int alloc_status = fallocate(fd_, 0, file_offset_, map_size_); | |||
| 536 | if (alloc_status != 0) { | |||
| 537 | // fallback to posix_fallocate | |||
| 538 | alloc_status = posix_fallocate(fd_, file_offset_, map_size_); | |||
| 539 | } | |||
| 540 | if (alloc_status != 0) { | |||
| 541 | return Status::IOError("Error allocating space to file : " + filename_ + | |||
| 542 | "Error : " + strerror(alloc_status)); | |||
| 543 | } | |||
| 544 | } | |||
| 545 | ||||
| 546 | TEST_KILL_RANDOM("PosixMmapFile::Append:1", rocksdb_kill_odds); | |||
| 547 | void* ptr = mmap(nullptr, map_size_, PROT_READ0x1 | PROT_WRITE0x2, MAP_SHARED0x01, fd_, | |||
| 548 | file_offset_); | |||
| 549 | if (ptr == MAP_FAILED((void *) -1)) { | |||
| 550 | return Status::IOError("MMap failed on " + filename_); | |||
| 551 | } | |||
| 552 | TEST_KILL_RANDOM("PosixMmapFile::Append:2", rocksdb_kill_odds); | |||
| 553 | ||||
| 554 | base_ = reinterpret_cast<char*>(ptr); | |||
| 555 | limit_ = base_ + map_size_; | |||
| 556 | dst_ = base_; | |||
| 557 | last_sync_ = base_; | |||
| 558 | return Status::OK(); | |||
| 559 | #else | |||
| 560 | return Status::NotSupported("This platform doesn't support fallocate()"); | |||
| 561 | #endif | |||
| 562 | } | |||
| 563 | ||||
| 564 | Status PosixMmapFile::Msync() { | |||
| 565 | if (dst_ == last_sync_) { | |||
| 566 | return Status::OK(); | |||
| 567 | } | |||
| 568 | // Find the beginnings of the pages that contain the first and last | |||
| 569 | // bytes to be synced. | |||
| 570 | size_t p1 = TruncateToPageBoundary(last_sync_ - base_); | |||
| 571 | size_t p2 = TruncateToPageBoundary(dst_ - base_ - 1); | |||
| 572 | last_sync_ = dst_; | |||
| 573 | TEST_KILL_RANDOM("PosixMmapFile::Msync:0", rocksdb_kill_odds); | |||
| 574 | if (msync(base_ + p1, p2 - p1 + page_size_, MS_SYNC4) < 0) { | |||
| 575 | return IOError("While msync", filename_, errno(*__errno_location ())); | |||
| 576 | } | |||
| 577 | return Status::OK(); | |||
| 578 | } | |||
| 579 | ||||
| 580 | PosixMmapFile::PosixMmapFile(const std::string& fname, int fd, size_t page_size, | |||
| 581 | const EnvOptions& options) | |||
| 582 | : filename_(fname), | |||
| 583 | fd_(fd), | |||
| 584 | page_size_(page_size), | |||
| 585 | map_size_(Roundup(65536, page_size)), | |||
| 586 | base_(nullptr), | |||
| 587 | limit_(nullptr), | |||
| 588 | dst_(nullptr), | |||
| 589 | last_sync_(nullptr), | |||
| 590 | file_offset_(0) { | |||
| 591 | #ifdef ROCKSDB_FALLOCATE_PRESENT1 | |||
| 592 | allow_fallocate_ = options.allow_fallocate; | |||
| 593 | fallocate_with_keep_size_ = options.fallocate_with_keep_size; | |||
| 594 | #else | |||
| 595 | (void)options; | |||
| 596 | #endif | |||
| 597 | assert((page_size & (page_size - 1)) == 0)(static_cast<void> (0)); | |||
| 598 | assert(options.use_mmap_writes)(static_cast<void> (0)); | |||
| 599 | assert(!options.use_direct_writes)(static_cast<void> (0)); | |||
| 600 | } | |||
| 601 | ||||
| 602 | PosixMmapFile::~PosixMmapFile() { | |||
| 603 | if (fd_ >= 0) { | |||
| 604 | PosixMmapFile::Close(); | |||
| 605 | } | |||
| 606 | } | |||
| 607 | ||||
| 608 | Status PosixMmapFile::Append(const Slice& data) { | |||
| 609 | const char* src = data.data(); | |||
| 610 | size_t left = data.size(); | |||
| 611 | while (left > 0) { | |||
  | ||||
| 612 | assert(base_ <= dst_)(static_cast<void> (0)); | |||
| 613 | assert(dst_ <= limit_)(static_cast<void> (0)); | |||
| 614 | size_t avail = limit_ - dst_; | |||
| 615 | if (avail == 0) { | |||
| 616 | Status s = UnmapCurrentRegion(); | |||
| 617 | if (!s.ok()) { | |||
| 618 | return s; | |||
| 619 | } | |||
| 620 | s = MapNewRegion(); | |||
| 621 | if (!s.ok()) { | |||
| 622 | return s; | |||
| 623 | } | |||
| 624 | TEST_KILL_RANDOM("PosixMmapFile::Append:0", rocksdb_kill_odds); | |||
| 625 | } | |||
| 626 | ||||
| 627 | size_t n = (left <= avail) ? left : avail; | |||
| 628 | assert(dst_)(static_cast<void> (0)); | |||
| 629 | memcpy(dst_, src, n); | |||
  | ||||
| 630 | dst_ += n; | |||
| 631 | src += n; | |||
| 632 | left -= n; | |||
| 633 | } | |||
| 634 | return Status::OK(); | |||
| 635 | } | |||
| 636 | ||||
| 637 | Status PosixMmapFile::Close() { | |||
| 638 | Status s; | |||
| 639 | size_t unused = limit_ - dst_; | |||
| 640 | ||||
| 641 | s = UnmapCurrentRegion(); | |||
| 642 | if (!s.ok()) { | |||
| 643 | s = IOError("While closing mmapped file", filename_, errno(*__errno_location ())); | |||
| 644 | } else if (unused > 0) { | |||
| 645 | // Trim the extra space at the end of the file | |||
| 646 | if (ftruncate(fd_, file_offset_ - unused) < 0) { | |||
| 647 | s = IOError("While ftruncating mmaped file", filename_, errno(*__errno_location ())); | |||
| 648 | } | |||
| 649 | } | |||
| 650 | ||||
| 651 | if (close(fd_) < 0) { | |||
| 652 | if (s.ok()) { | |||
| 653 | s = IOError("While closing mmapped file", filename_, errno(*__errno_location ())); | |||
| 654 | } | |||
| 655 | } | |||
| 656 | ||||
| 657 | fd_ = -1; | |||
| 658 | base_ = nullptr; | |||
| 659 | limit_ = nullptr; | |||
| 660 | return s; | |||
| 661 | } | |||
| 662 | ||||
| 663 | Status PosixMmapFile::Flush() { return Status::OK(); } | |||
| 664 | ||||
| 665 | Status PosixMmapFile::Sync() { | |||
| 666 | if (fdatasync(fd_) < 0) { | |||
| 667 | return IOError("While fdatasync mmapped file", filename_, errno(*__errno_location ())); | |||
| 668 | } | |||
| 669 | ||||
| 670 | return Msync(); | |||
| 671 | } | |||
| 672 | ||||
| 673 | /** | |||
| 674 | * Flush data as well as metadata to stable storage. | |||
| 675 | */ | |||
| 676 | Status PosixMmapFile::Fsync() { | |||
| 677 | if (fsync(fd_) < 0) { | |||
| 678 | return IOError("While fsync mmaped file", filename_, errno(*__errno_location ())); | |||
| 679 | } | |||
| 680 | ||||
| 681 | return Msync(); | |||
| 682 | } | |||
| 683 | ||||
| 684 | /** | |||
| 685 | * Get the size of valid data in the file. This will not match the | |||
| 686 | * size that is returned from the filesystem because we use mmap | |||
| 687 | * to extend file by map_size every time. | |||
| 688 | */ | |||
| 689 | uint64_t PosixMmapFile::GetFileSize() { | |||
| 690 | size_t used = dst_ - base_; | |||
| 691 | return file_offset_ + used; | |||
| 692 | } | |||
| 693 | ||||
| 694 | Status PosixMmapFile::InvalidateCache(size_t offset, size_t length) { | |||
| 695 | #ifndef OS_LINUX1 | |||
| 696 | (void)offset; | |||
| 697 | (void)length; | |||
| 698 | return Status::OK(); | |||
| 699 | #else | |||
| 700 | // free OS pages | |||
| 701 | int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED4); | |||
| 702 | if (ret == 0) { | |||
| 703 | return Status::OK(); | |||
| 704 | } | |||
| 705 | return IOError("While fadvise NotNeeded mmapped file", filename_, errno(*__errno_location ())); | |||
| 706 | #endif | |||
| 707 | } | |||
| 708 | ||||
#ifdef ROCKSDB_FALLOCATE_PRESENT
// Reserves `len` bytes at `offset` with fallocate(), passing
// FALLOC_FL_KEEP_SIZE when fallocate_with_keep_size_ is set. When fallocate
// is not allowed nothing is done and OK is returned. Returns an IOError built
// from errno when fallocate() fails.
Status PosixMmapFile::Allocate(uint64_t offset, uint64_t len) {
  assert(offset <= std::numeric_limits<off_t>::max());
  assert(len <= std::numeric_limits<off_t>::max());
  TEST_KILL_RANDOM("PosixMmapFile::Allocate:0", rocksdb_kill_odds);
  int alloc_status = 0;
  if (allow_fallocate_) {
    const int mode = fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0;
    alloc_status = fallocate(fd_, mode, static_cast<off_t>(offset),
                             static_cast<off_t>(len));
  }
  if (alloc_status != 0) {
    return IOError(
        "While fallocate offset " + ToString(offset) + " len " + ToString(len),
        filename_, errno);
  }
  return Status::OK();
}
#endif
| 729 | ||||
| 730 | /* | |||
| 731 | * PosixWritableFile | |||
| 732 | * | |||
| 733 | * Use posix write to write data to a file. | |||
| 734 | */ | |||
| 735 | PosixWritableFile::PosixWritableFile(const std::string& fname, int fd, | |||
| 736 | const EnvOptions& options) | |||
| 737 | : filename_(fname), | |||
| 738 | use_direct_io_(options.use_direct_writes), | |||
| 739 | fd_(fd), | |||
| 740 | filesize_(0), | |||
| 741 | logical_sector_size_(GetLogicalBufferSize(fd_)) { | |||
| 742 | #ifdef ROCKSDB_FALLOCATE_PRESENT1 | |||
| 743 | allow_fallocate_ = options.allow_fallocate; | |||
| 744 | fallocate_with_keep_size_ = options.fallocate_with_keep_size; | |||
| 745 | #endif | |||
| 746 | assert(!options.use_mmap_writes)(static_cast<void> (0)); | |||
| 747 | } | |||
| 748 | ||||
| 749 | PosixWritableFile::~PosixWritableFile() { | |||
| 750 | if (fd_ >= 0) { | |||
| 751 | PosixWritableFile::Close(); | |||
| 752 | } | |||
| 753 | } | |||
| 754 | ||||
| 755 | Status PosixWritableFile::Append(const Slice& data) { | |||
| 756 | if (use_direct_io()) { | |||
| 757 | assert(IsSectorAligned(data.size(), GetRequiredBufferAlignment()))(static_cast<void> (0)); | |||
| 758 | assert(IsSectorAligned(data.data(), GetRequiredBufferAlignment()))(static_cast<void> (0)); | |||
| 759 | } | |||
| 760 | const char* src = data.data(); | |||
| 761 | size_t left = data.size(); | |||
| 762 | while (left != 0) { | |||
| 763 | ssize_t done = write(fd_, src, left); | |||
| 764 | if (done < 0) { | |||
| 765 | if (errno(*__errno_location ()) == EINTR4) { | |||
| 766 | continue; | |||
| 767 | } | |||
| 768 | return IOError("While appending to file", filename_, errno(*__errno_location ())); | |||
| 769 | } | |||
| 770 | left -= done; | |||
| 771 | src += done; | |||
| 772 | } | |||
| 773 | filesize_ += data.size(); | |||
| 774 | return Status::OK(); | |||
| 775 | } | |||
| 776 | ||||
| 777 | Status PosixWritableFile::PositionedAppend(const Slice& data, uint64_t offset) { | |||
| 778 | if (use_direct_io()) { | |||
| 779 | assert(IsSectorAligned(offset, GetRequiredBufferAlignment()))(static_cast<void> (0)); | |||
| 780 | assert(IsSectorAligned(data.size(), GetRequiredBufferAlignment()))(static_cast<void> (0)); | |||
| 781 | assert(IsSectorAligned(data.data(), GetRequiredBufferAlignment()))(static_cast<void> (0)); | |||
| 782 | } | |||
| 783 | assert(offset <= std::numeric_limits<off_t>::max())(static_cast<void> (0)); | |||
| 784 | const char* src = data.data(); | |||
| 785 | size_t left = data.size(); | |||
| 786 | while (left != 0) { | |||
| 787 | ssize_t done = pwrite(fd_, src, left, static_cast<off_t>(offset)); | |||
| 788 | if (done < 0) { | |||
| 789 | if (errno(*__errno_location ()) == EINTR4) { | |||
| 790 | continue; | |||
| 791 | } | |||
| 792 | return IOError("While pwrite to file at offset " + ToString(offset), | |||
| 793 | filename_, errno(*__errno_location ())); | |||
| 794 | } | |||
| 795 | left -= done; | |||
| 796 | offset += done; | |||
| 797 | src += done; | |||
| 798 | } | |||
| 799 | filesize_ = offset; | |||
| 800 | return Status::OK(); | |||
| 801 | } | |||
| 802 | ||||
| 803 | Status PosixWritableFile::Truncate(uint64_t size) { | |||
| 804 | Status s; | |||
| 805 | int r = ftruncate(fd_, size); | |||
| 806 | if (r < 0) { | |||
| 807 | s = IOError("While ftruncate file to size " + ToString(size), filename_, | |||
| 808 | errno(*__errno_location ())); | |||
| 809 | } else { | |||
| 810 | filesize_ = size; | |||
| 811 | } | |||
| 812 | return s; | |||
| 813 | } | |||
| 814 | ||||
| 815 | Status PosixWritableFile::Close() { | |||
| 816 | Status s; | |||
| 817 | ||||
| 818 | size_t block_size; | |||
| 819 | size_t last_allocated_block; | |||
| 820 | GetPreallocationStatus(&block_size, &last_allocated_block); | |||
| 821 | if (last_allocated_block > 0) { | |||
| 822 | // trim the extra space preallocated at the end of the file | |||
| 823 | // NOTE(ljin): we probably don't want to surface failure as an IOError, | |||
| 824 | // but it will be nice to log these errors. | |||
| 825 | int dummy __attribute__((__unused__)); | |||
| 826 | dummy = ftruncate(fd_, filesize_); | |||
| 827 | #if defined(ROCKSDB_FALLOCATE_PRESENT1) && defined(FALLOC_FL_PUNCH_HOLE2) && \ | |||
| 828 | !defined(TRAVIS) | |||
| 829 | // in some file systems, ftruncate only trims trailing space if the | |||
| 830 | // new file size is smaller than the current size. Calling fallocate | |||
| 831 | // with FALLOC_FL_PUNCH_HOLE flag to explicitly release these unused | |||
| 832 | // blocks. FALLOC_FL_PUNCH_HOLE is supported on at least the following | |||
| 833 | // filesystems: | |||
| 834 | // XFS (since Linux 2.6.38) | |||
| 835 | // ext4 (since Linux 3.0) | |||
| 836 | // Btrfs (since Linux 3.7) | |||
| 837 | // tmpfs (since Linux 3.5) | |||
| 838 | // We ignore error since failure of this operation does not affect | |||
| 839 | // correctness. | |||
| 840 | // TRAVIS - this code does not work on TRAVIS filesystems. | |||
| 841 | // the FALLOC_FL_KEEP_SIZE option is expected to not change the size | |||
| 842 | // of the file, but it does. Simple strace report will show that. | |||
| 843 | // While we work with Travis-CI team to figure out if this is a | |||
| 844 | // quirk of Docker/AUFS, we will comment this out. | |||
| 845 | struct stat file_stats; | |||
| 846 | int result = fstat(fd_, &file_stats); | |||
| 847 | // After ftruncate, we check whether ftruncate has the correct behavior. | |||
| 848 | // If not, we should hack it with FALLOC_FL_PUNCH_HOLE | |||
| 849 | if (result == 0 && | |||
| 850 | (file_stats.st_size + file_stats.st_blksize - 1) / | |||
| 851 | file_stats.st_blksize != | |||
| 852 | file_stats.st_blocks / (file_stats.st_blksize / 512)) { | |||
| 853 |       IOSTATS_TIMER_GUARD(allocate_nanos)PerfStepTimer iostats_step_timer_allocate_nanos(&(iostats_context .allocate_nanos)); iostats_step_timer_allocate_nanos.Start();;  | |||
| 854 | if (allow_fallocate_) { | |||
| 855 | fallocate(fd_, FALLOC_FL_KEEP_SIZE1 | FALLOC_FL_PUNCH_HOLE2, filesize_, | |||
| 856 | block_size * last_allocated_block - filesize_); | |||
| 857 | } | |||
| 858 | } | |||
| 859 | #endif | |||
| 860 | } | |||
| 861 | ||||
| 862 | if (close(fd_) < 0) { | |||
| 863 | s = IOError("While closing file after writing", filename_, errno(*__errno_location ())); | |||
| 864 | } | |||
| 865 | fd_ = -1; | |||
| 866 | return s; | |||
| 867 | } | |||
| 868 | ||||
| 869 | // write out the cached data to the OS cache | |||
| 870 | Status PosixWritableFile::Flush() { return Status::OK(); } | |||
| 871 | ||||
| 872 | Status PosixWritableFile::Sync() { | |||
| 873 | if (fdatasync(fd_) < 0) { | |||
| 874 | return IOError("While fdatasync", filename_, errno(*__errno_location ())); | |||
| 875 | } | |||
| 876 | return Status::OK(); | |||
| 877 | } | |||
| 878 | ||||
| 879 | Status PosixWritableFile::Fsync() { | |||
| 880 | if (fsync(fd_) < 0) { | |||
| 881 | return IOError("While fsync", filename_, errno(*__errno_location ())); | |||
| 882 | } | |||
| 883 | return Status::OK(); | |||
| 884 | } | |||
| 885 | ||||
| 886 | bool PosixWritableFile::IsSyncThreadSafe() const { return true; } | |||
| 887 | ||||
| 888 | uint64_t PosixWritableFile::GetFileSize() { return filesize_; } | |||
| 889 | ||||
| 890 | void PosixWritableFile::SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) { | |||
| 891 | #ifdef OS_LINUX1 | |||
| 892 | // Suppress Valgrind "Unimplemented functionality" error. | |||
| 893 | #ifndef ROCKSDB_VALGRIND_RUN | |||
| 894 | if (hint == write_hint_) { | |||
| 895 | return; | |||
| 896 | } | |||
| 897 | if (fcntl(fd_, F_SET_RW_HINT(1024 + 12), &hint) == 0) { | |||
| 898 | write_hint_ = hint; | |||
| 899 | } | |||
| 900 | #else | |||
| 901 | (void)hint; | |||
| 902 | #endif // ROCKSDB_VALGRIND_RUN | |||
| 903 | #else | |||
| 904 | (void)hint; | |||
| 905 | #endif // OS_LINUX | |||
| 906 | } | |||
| 907 | ||||
| 908 | Status PosixWritableFile::InvalidateCache(size_t offset, size_t length) { | |||
| 909 | if (use_direct_io()) { | |||
| 910 | return Status::OK(); | |||
| 911 | } | |||
| 912 | #ifndef OS_LINUX1 | |||
| 913 | (void)offset; | |||
| 914 | (void)length; | |||
| 915 | return Status::OK(); | |||
| 916 | #else | |||
| 917 | // free OS pages | |||
| 918 | int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED4); | |||
| 919 | if (ret == 0) { | |||
| 920 | return Status::OK(); | |||
| 921 | } | |||
| 922 | return IOError("While fadvise NotNeeded", filename_, errno(*__errno_location ())); | |||
| 923 | #endif | |||
| 924 | } | |||
| 925 | ||||
#ifdef ROCKSDB_FALLOCATE_PRESENT
// Preallocates `len` bytes starting at `offset` with fallocate().
//
// When fallocate is disabled for this file the call succeeds without doing
// anything. FALLOC_FL_KEEP_SIZE is passed when fallocate_with_keep_size_ is
// set. Both arguments are asserted to fit in off_t. A failing fallocate() is
// returned as an IOError that includes the offset and length.
Status PosixWritableFile::Allocate(uint64_t offset, uint64_t len) {
  assert(offset <= std::numeric_limits<off_t>::max());
  assert(len <= std::numeric_limits<off_t>::max());
  TEST_KILL_RANDOM("PosixWritableFile::Allocate:0", rocksdb_kill_odds);
  IOSTATS_TIMER_GUARD(allocate_nanos);
  if (allow_fallocate_) {
    const int mode = fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0;
    const int rc = fallocate(fd_, mode, static_cast<off_t>(offset),
                             static_cast<off_t>(len));
    if (rc != 0) {
      return IOError(
          "While fallocate offset " + ToString(offset) + " len " + ToString(len),
          filename_, errno);
    }
  }
  return Status::OK();
}
#endif
| 947 | ||||
#ifdef ROCKSDB_RANGESYNC_PRESENT
// Calls sync_file_range() with SYNC_FILE_RANGE_WRITE for `nbytes` bytes
// starting at `offset`.
//
// Both arguments are asserted to fit in off_t. A non-zero result is returned
// as an IOError that includes the offset and byte count.
Status PosixWritableFile::RangeSync(uint64_t offset, uint64_t nbytes) {
  assert(offset <= std::numeric_limits<off_t>::max());
  assert(nbytes <= std::numeric_limits<off_t>::max());
  const int rc = sync_file_range(fd_, static_cast<off_t>(offset),
                                 static_cast<off_t>(nbytes),
                                 SYNC_FILE_RANGE_WRITE);
  if (rc != 0) {
    return IOError("While sync_file_range offset " + ToString(offset) +
                       " bytes " + ToString(nbytes),
                   filename_, errno);
  }
  return Status::OK();
}
#endif
| 962 | ||||
#ifdef OS_LINUX
// Forwards to PosixHelper::GetUniqueIdFromFile() for this file's descriptor,
// passing the caller's buffer `id` and its capacity `max_size`, and returns
// whatever that helper returns.
size_t PosixWritableFile::GetUniqueId(char* id, size_t max_size) const {
  const size_t id_len = PosixHelper::GetUniqueIdFromFile(fd_, id, max_size);
  return id_len;
}
#endif
| 968 | ||||
| 969 | /* | |||
| 970 | * PosixRandomRWFile | |||
| 971 | */ | |||
| 972 | ||||
| 973 | PosixRandomRWFile::PosixRandomRWFile(const std::string& fname, int fd, | |||
| 974 | const EnvOptions& /*options*/) | |||
| 975 | : filename_(fname), fd_(fd) {} | |||
| 976 | ||||
| 977 | PosixRandomRWFile::~PosixRandomRWFile() { | |||
| 978 | if (fd_ >= 0) { | |||
| 979 | Close(); | |||
| 980 | } | |||
| 981 | } | |||
| 982 | ||||
| 983 | Status PosixRandomRWFile::Write(uint64_t offset, const Slice& data) { | |||
| 984 | const char* src = data.data(); | |||
| 985 | size_t left = data.size(); | |||
| 986 | while (left != 0) { | |||
| 987 | ssize_t done = pwrite(fd_, src, left, offset); | |||
| 988 | if (done < 0) { | |||
| 989 | // error while writing to file | |||
| 990 | if (errno(*__errno_location ()) == EINTR4) { | |||
| 991 | // write was interrupted, try again. | |||
| 992 | continue; | |||
| 993 | } | |||
| 994 | return IOError( | |||
| 995 | "While write random read/write file at offset " + ToString(offset), | |||
| 996 | filename_, errno(*__errno_location ())); | |||
| 997 | } | |||
| 998 | ||||
| 999 | // Wrote `done` bytes | |||
| 1000 | left -= done; | |||
| 1001 | offset += done; | |||
| 1002 | src += done; | |||
| 1003 | } | |||
| 1004 | ||||
| 1005 | return Status::OK(); | |||
| 1006 | } | |||
| 1007 | ||||
| 1008 | Status PosixRandomRWFile::Read(uint64_t offset, size_t n, Slice* result, | |||
| 1009 | char* scratch) const { | |||
| 1010 | size_t left = n; | |||
| 1011 | char* ptr = scratch; | |||
| 1012 | while (left > 0) { | |||
| 1013 | ssize_t done = pread(fd_, ptr, left, offset); | |||
| 1014 | if (done < 0) { | |||
| 1015 | // error while reading from file | |||
| 1016 | if (errno(*__errno_location ()) == EINTR4) { | |||
| 1017 | // read was interrupted, try again. | |||
| 1018 | continue; | |||
| 1019 | } | |||
| 1020 | return IOError("While reading random read/write file offset " + | |||
| 1021 | ToString(offset) + " len " + ToString(n), | |||
| 1022 | filename_, errno(*__errno_location ())); | |||
| 1023 | } else if (done == 0) { | |||
| 1024 | // Nothing more to read | |||
| 1025 | break; | |||
| 1026 | } | |||
| 1027 | ||||
| 1028 | // Read `done` bytes | |||
| 1029 | ptr += done; | |||
| 1030 | offset += done; | |||
| 1031 | left -= done; | |||
| 1032 | } | |||
| 1033 | ||||
| 1034 | *result = Slice(scratch, n - left); | |||
| 1035 | return Status::OK(); | |||
| 1036 | } | |||
| 1037 | ||||
| 1038 | Status PosixRandomRWFile::Flush() { return Status::OK(); } | |||
| 1039 | ||||
| 1040 | Status PosixRandomRWFile::Sync() { | |||
| 1041 | if (fdatasync(fd_) < 0) { | |||
| 1042 | return IOError("While fdatasync random read/write file", filename_, errno(*__errno_location ())); | |||
| 1043 | } | |||
| 1044 | return Status::OK(); | |||
| 1045 | } | |||
| 1046 | ||||
| 1047 | Status PosixRandomRWFile::Fsync() { | |||
| 1048 | if (fsync(fd_) < 0) { | |||
| 1049 | return IOError("While fsync random read/write file", filename_, errno(*__errno_location ())); | |||
| 1050 | } | |||
| 1051 | return Status::OK(); | |||
| 1052 | } | |||
| 1053 | ||||
| 1054 | Status PosixRandomRWFile::Close() { | |||
| 1055 | if (close(fd_) < 0) { | |||
| 1056 | return IOError("While close random read/write file", filename_, errno(*__errno_location ())); | |||
| 1057 | } | |||
| 1058 | fd_ = -1; | |||
| 1059 | return Status::OK(); | |||
| 1060 | } | |||
| 1061 | ||||
| 1062 | PosixMemoryMappedFileBuffer::~PosixMemoryMappedFileBuffer() { | |||
| 1063 | // TODO should have error handling though not much we can do... | |||
| 1064 | munmap(this->base_, length_); | |||
| 1065 | } | |||
| 1066 | ||||
| 1067 | /* | |||
| 1068 | * PosixDirectory | |||
| 1069 | */ | |||
| 1070 | ||||
| 1071 | PosixDirectory::~PosixDirectory() { close(fd_); } | |||
| 1072 | ||||
| 1073 | Status PosixDirectory::Fsync() { | |||
| 1074 | #ifndef OS_AIX | |||
| 1075 | if (fsync(fd_) == -1) { | |||
| 1076 | return IOError("While fsync", "a directory", errno(*__errno_location ())); | |||
| 1077 | } | |||
| 1078 | #endif | |||
| 1079 | return Status::OK(); | |||
| 1080 | } | |||
| 1081 | } // namespace rocksdb | |||
| 1082 | #endif |