ObjectStore獲取文件系統的fsid

ceph version: Kraken
ObjectStore獲取文件系統的fsid。OSD在用戶態又構造了一層自己的文件系統來管理數據,並爲其分配了唯一標識UUID。該UUID是其文件系統元信息中的一員,底層使用的驅動不同,其保存的位置也不同:BlueStore保存在塊設備的第一個塊中,FileStore保存在日誌設備的第一個塊中。

獲取fsid方法:

/**
 * Identify which object store backend owns a device and fetch its fsid.
 *
 * BlueStore is probed first (only when built with HAVE_LIBAIO); if that
 * does not succeed, FileStore is probed by treating the device as a journal.
 *
 * @param cct   context handed to the backends and used for logging
 * @param path  path of the device to probe
 * @param fsid  [out] filled in by the first backend that returns 0
 * @return 0 when a backend recognized the device, -EINVAL otherwise
 *         (the backend-specific error code is not propagated)
 */
int ObjectStore::probe_block_device_fsid(
  CephContext *cct,
  const string& path,
  uuid_d *fsid)
{
  int r;

  // BlueStore takes priority over FileStore.
#if defined(HAVE_LIBAIO)
  // first try bluestore -- it has a crc on its header and will fail
  // reliably.
  r = BlueStore::get_block_device_fsid(cct, path, fsid);
  if (r == 0) {
    lgeneric_dout(cct, 0) << __func__ << " " << path << " is bluestore, "
                          << *fsid << dendl;
    return r;
  }
#endif

  // okay, try FileStore (journal).
  r = FileStore::get_block_device_fsid(cct, path, fsid);
  if (r == 0) {
    lgeneric_dout(cct, 0) << __func__ << " " << path << " is filestore, "
                          << *fsid << dendl;
    return r;
  }

  return -EINVAL;
}
BlueStore 獲取OSD文件系統的uuid:該uuid保存在bluestore_bdev_label_t結構的osd_uuid字段中,該結構序列化後存放在塊設備的第一個塊中。
int BlueStore::get_block_device_fsid(CephContext* cct, const string& path,                                                                                        
                                ¦ ¦ ¦uuid_d *fsid)                                                                                                                
{                                                                                                                                                                 
  bluestore_bdev_label_t label;                                                                                                                                   
  int r = _read_bdev_label(cct, path, &label);                                                                                                                                               
  if (r < 0)                                                                                                                                                      
  ¦ return r;                                                                                                                                                     
  *fsid = label.osd_uuid;                                                                                                                                         
  return 0;                                                                                                                                                       
}
讀取第一個block,反序列化得到label
/**
 * Read and validate the BlueStore label at the start of a device.
 *
 * The first BDEV_LABEL_BLOCK_SIZE bytes are read, the label is decoded, and
 * a crc32c over the encoded label bytes is compared with the checksum stored
 * right after the label.
 *
 * @param cct    context (used by the logging macros)
 * @param path   path of the device to open read-only
 * @param label  [out] decoded label
 * @return 0 on success; -errno if open fails; the negative value returned by
 *         read_fd if the read fails; -EINVAL if decoding throws;
 *         -EIO if the checksum does not match
 */
int BlueStore::_read_bdev_label(CephContext* cct, string path,
                                bluestore_bdev_label_t *label)
{
  dout(10) << __func__ << dendl;

  // Open the device.
  int fd = ::open(path.c_str(), O_RDONLY);
  if (fd < 0) {
    fd = -errno;
    derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
         << dendl;
    return fd;
  }

  // Read the label block from the start of the device (the original note
  // gives BDEV_LABEL_BLOCK_SIZE as 4096, i.e. the first block).
  bufferlist bl;
  int r = bl.read_fd(fd, BDEV_LABEL_BLOCK_SIZE);
  // The descriptor is no longer needed whether or not the read succeeded.
  VOID_TEMP_FAILURE_RETRY(::close(fd));
  if (r < 0) {
    derr << __func__ << " failed to read from " << path
         << ": " << cpp_strerror(r) << dendl;
    return r;
  }

  // Decode the label and verify its integrity.
  uint32_t crc, expected_crc;
  bufferlist::iterator p = bl.begin();
  try {
    ::decode(*label, p);
    // Checksum exactly the bytes consumed by the label: [0, p.get_off()).
    bufferlist t;
    t.substr_of(bl, 0, p.get_off());
    crc = t.crc32c(-1);
    // The stored checksum follows the label.
    ::decode(expected_crc, p);
  }
  catch (const buffer::error& e) {
    derr << __func__ << " unable to decode label at offset " << p.get_off()
         << ": " << e.what()
         << dendl;
    return -EINVAL;
  }
  if (crc != expected_crc) {
    derr << __func__ << " bad crc on label, expected " << expected_crc
         << " != actual " << crc << dendl;
    return -EIO;
  }
  dout(10) << __func__ << " got " << *label << dendl;
  return 0;
}

FileStore 獲取osd文件系統的OSD uuid

/**
 * Fetch the fsid recorded in the FileStore journal found at a path.
 *
 * @param cct   context handed to the temporary FileJournal
 * @param path  path of the journal file or device
 * @param fsid  [in,out] current value seeds the temporary journal object;
 *              overwritten by peek_fsid with the fsid read from the header
 * @return whatever FileJournal::peek_fsid returns
 */
int FileStore::get_block_device_fsid(CephContext* cct, const string& path,
                                     uuid_d *fsid)
{
  // A throwaway journal object is enough for a one-off header read.  It is
  // set up so that we don't try to use aio or direct_io (and get annoying
  // error messages from failing to do so); performance implications should
  // be irrelevant for this use.
  FileJournal probe(cct, *fsid, 0, 0, path.c_str(), false, false);
  const int rc = probe.peek_fsid(*fsid);
  return rc;
}

// Read the fsid out of the journal header without otherwise using the
// journal.
//
// This can not be used on an active journal: the journal must not already be
// open (fd == -1 is asserted), and it is closed again before returning.
//
// @param fsid  [out] set to header.fsid when the header was read successfully
// @return the non-zero result of _open() if opening failed; otherwise the
//         result of read_header() (negative on failure)
int FileJournal::peek_fsid(uuid_d& fsid)
{
  assert(fd == -1);
  int r = _open(false, false);  // read-only, do not create
  if (r)
    return r;
  r = read_header(&header);
  if (r >= 0)
    fsid = header.fsid;
  // Close on both the success and the failure path.
  close();
  return r;
}


int FileJournal::_open(bool forwrite, bool create)
{
  int flags, ret;

  if (forwrite) {
  ¦ flags = O_RDWR;
  ¦ if (directio)
  ¦ ¦ flags |= O_DIRECT | O_DSYNC;
  } else {
  ¦ flags = O_RDONLY;
  }
  if (create)
  ¦ flags |= O_CREAT;

  if (fd >= 0) {
  ¦ if (TEMP_FAILURE_RETRY(::close(fd))) {
  ¦ ¦ int err = errno;
  ¦ ¦ derr << "FileJournal::_open: error closing old fd: "
        ¦ ¦<< cpp_strerror(err) << dendl;
  ¦ }
  }
  //打開日誌設備
  fd = TEMP_FAILURE_RETRY(::open(fn.c_str(), flags, 0644));
  if (fd < 0) {
  ¦ int err = errno;
  ¦ dout(2) << "FileJournal::_open unable to open journal "
        ¦ ¦ << fn << ": " << cpp_strerror(err) << dendl;
  ¦ return -err;
  }
//獲取指定文件的元信息,讀取初始化日誌文件(或設備)的相關數據(大小,塊大小)
  struct stat st;
  ret = ::fstat(fd, &st);
  if (ret) {
  ¦ ret = errno;
  ¦ derr << "FileJournal::_open: unable to fstat journal: " << cpp_strerror(ret) << dendl;
  ¦ ret = -ret;
  ¦ goto out_fd;
  } 
  //判斷是常規文件還是裸塊設備
  if (S_ISBLK(st.st_mode)) {
  ¦ ret = _open_block_device();
  } else if (S_ISREG(st.st_mode)) {
  ¦ if (aio && !force_aio) {
  ¦ ¦ derr << "FileJournal::_open: disabling aio for non-block journal.  Use "
        ¦ ¦<< "journal_force_aio to force use of aio anyway" << dendl;
  ¦ ¦ aio = false;
  ¦ }
  ¦ ret = _open_file(st.st_size, st.st_blksize, create);
  } else {
  ¦ derr << "FileJournal::_open: wrong journal file type: " << st.st_mode
        ¦<< dendl;
  ¦ ret = -EINVAL;
  }

  if (ret)
  ¦ goto out_fd;
//初始化libaio
#ifdef HAVE_LIBAIO
  if (aio) {
  ¦ aio_ctx = 0;
  ¦ ret = io_setup(128, &aio_ctx);
  ¦ if (ret < 0) {
  ¦ ¦ switch (ret) {
        // Contrary to naive expectations -EAGIAN means ...
        case -EAGAIN:
        ¦ derr << "FileJournal::_open: user's limit of aio events exceeded. "
        ¦ ¦ ¦ ¦<< "Try increasing /proc/sys/fs/aio-max-nr" << dendl;
        ¦ break;
        default:
        ¦ derr << "FileJournal::_open: unable to setup io_context " << cpp_strerror(-ret) << dendl;
        ¦ break;
  ¦ ¦ }
  ¦ ¦ goto out_fd;
  ¦ }
  }
#endif

  /* We really want max_size to be a multiple of block_size. */
  max_size -= max_size % block_size;

  dout(1) << "_open " << fn << " fd " << fd
        ¦ << ": " << max_size
        ¦ << " bytes, block size " << block_size
        ¦ << " bytes, directio = " << directio
        ¦ << ", aio = " << aio
        ¦ << dendl;
  return 0;

 out_fd:
  VOID_TEMP_FAILURE_RETRY(::close(fd));
  fd = -1;
  return ret;
}
獲取塊設備的大小

獲取塊設備大小,檢查是否大於最小日誌大小要求。

int FileJournal::_open_block_device()
{
  int64_t bdev_sz = 0;
  int ret = get_block_device_size(fd, &bdev_sz);
  if (ret) {
  ¦ dout(0) << __func__ << ": failed to read block device size." << dendl;
  ¦ return -EIO;
  }

  /* Check for bdev_sz too small */
  if (bdev_sz < ONE_MEG) {
  ¦ dout(0) << __func__ << ": your block device must be at least "
  ¦ ¦ << ONE_MEG << " bytes to be used for a Ceph journal." << dendl;
  ¦ return -EINVAL;
  }

  dout(10) << __func__ << ": ignoring osd journal size. "
        ¦ ¦<< "We'll use the entire block device (size: " << bdev_sz << ")"
        ¦ ¦<< dendl;
  max_size = bdev_sz;

  block_size = cct->_conf->journal_block_size;

  if (cct->_conf->journal_discard) {
  //獲取磁盤對discard的支持(/sys/block/sdb/queue/discard_granularity)
  ¦ discard = block_device_support_discard(fn.c_str());
  ¦ dout(10) << fn << " support discard: " << (int)discard << dendl;
  }

  return 0;
}
// Query the size, in bytes, of the block device open on fd.
//
// Prefers BLKGETSIZE64 (byte count written straight into *psize); falls back
// to BLKGETSIZE, which reports 512-byte sectors.
//
// @param fd     descriptor of the block device
// @param psize  [out] device size in bytes; written only when the ioctl
//               succeeds
// @return the non-negative ioctl result on success, -errno on failure
int get_block_device_size(int fd, int64_t *psize)
{
#ifdef BLKGETSIZE64
  int ret = ::ioctl(fd, BLKGETSIZE64, psize);
#elif defined(BLKGETSIZE)
  unsigned long sectors = 0;
  int ret = ::ioctl(fd, BLKGETSIZE, &sectors);
  // Do not clobber the caller's value with a bogus size on failure.
  if (ret >= 0)
    *psize = sectors * 512ULL;
#else
// cppcheck-suppress preprocessorErrorDirective
# error "Linux configuration error (get_block_device_size)"
#endif
  if (ret < 0)
    ret = -errno;
  return ret;
}
當OSD日誌保存在常規文件(而非塊設備)中時,會使用該方法來打開該日誌文件。
int FileJournal::_open_file(int64_t oldsize, blksize_t blksize,                                                                                                                              
                            bool create)
{
  int ret;
  //配置日誌文件的大小
  int64_t conf_journal_sz(cct->_conf->osd_journal_size);
  conf_journal_sz <<= 20;

  if ((cct->_conf->osd_journal_size == 0) && (oldsize < ONE_MEG)) {
  ¦ derr << "I'm sorry, I don't know how large of a journal to create."
        ¦<< "Please specify a block device to use as the journal OR "
        ¦<< "set osd_journal_size in your ceph.conf" << dendl;
  ¦ return -EINVAL;
  }

  if (create && (oldsize < conf_journal_sz)) {
  ¦ uint64_t newsize(conf_journal_sz);
  ¦ dout(10) <<  __func__ << " _open extending to " << newsize << " bytes" << dendl;
  //擴展日誌文件大小,但是該方法只分配了虛擬的空間,即沒有實際的數據塊
  ¦ ret = ::ftruncate(fd, newsize);
  ¦ if (ret < 0) {
  ¦ ¦ int err = errno;
  ¦ ¦ derr << "FileJournal::_open_file : unable to extend journal to "
        ¦ ¦<< newsize << " bytes: " << cpp_strerror(err) << dendl;
  ¦ ¦ return -err;
  ¦ }
#ifdef HAVE_POSIX_FALLOCATE
//爲文件分配實際的磁盤空間,以防止磁盤空間不足導致寫入失敗。
  ¦ ret = ::posix_fallocate(fd, 0, newsize);
  ¦ if (ret) {
  ¦ ¦ derr << "FileJournal::_open_file : unable to preallocation journal to "
        ¦ ¦<< newsize << " bytes: " << cpp_strerror(ret) << dendl;
  ¦ ¦ return -ret;
  ¦ }
  ¦ max_size = newsize;
#elif defined(__APPLE__)
  ¦ fstore_t store;
  ¦ store.fst_flags = F_ALLOCATECONTIG;
  ¦ store.fst_posmode = F_PEOFPOSMODE;
  ¦ store.fst_offset = 0;
  ¦ store.fst_length = newsize;
//同上
  ¦ ret = ::fcntl(fd, F_PREALLOCATE, &store);
  ¦ if (ret == -1) {
  ¦ ¦ ret = -errno;
  ¦ ¦ derr << "FileJournal::_open_file : unable to preallocation journal to "
        ¦ ¦<< newsize << " bytes: " << cpp_strerror(ret) << dendl;
  ¦ ¦ return ret;
  ¦ }
  ¦ max_size = newsize;
#else
# error "Journal pre-allocation not supported on platform."
#endif
  }
  else {
  ¦ max_size = oldsize;
  }
  block_size = cct->_conf->journal_block_size;
//初始化日誌空間,通過填充‘0’
  if (create && cct->_conf->journal_zero_on_create) {
  ¦ derr << "FileJournal::_open_file : zeroing journal" << dendl;
  ¦ uint64_t write_size = 1 << 20;
  ¦ char *buf;
  //申請一塊block_size內存對其的write_size大小的內存空間。
  ¦ ret = ::posix_memalign((void **)&buf, block_size, write_size);
  ¦ if (ret != 0) {
  ¦ ¦ return -ret;
  ¦ }
  ¦ memset(static_cast<void*>(buf), 0, write_size);
  ¦ uint64_t i = 0;
  ¦ for (; (i + write_size) <= (uint64_t)max_size; i += write_size) {
  ¦ ¦ ret = ::pwrite(fd, static_cast<void*>(buf), write_size, i);
  ¦ ¦ if (ret < 0) {
        free(buf);
        return -errno;
  ¦ ¦ }
  ¦ }
  ¦ if (i < (uint64_t)max_size) {
  ¦ ¦ ret = ::pwrite(fd, static_cast<void*>(buf), max_size - i, i);
  ¦ ¦ if (ret < 0) {
        free(buf);
        return -errno;
  ¦ ¦ }
  ¦ }
  ¦ free(buf);
  }


  dout(10) << "_open journal is not a block device, NOT checking disk "
  ¦ ¦ ¦ ¦ ¦<< "write cache on '" << fn << "'" << dendl;

  return 0;
}
讀取日誌的頭,該頭在日誌的第一個塊中
/**
 * Read and decode the journal header stored at offset 0 of the journal.
 *
 * Up to block_size bytes are read into a page-aligned buffer; on a short
 * read the unread tail is zero-filled before decoding.
 *
 * @param hdr  [out] decoded header; flags values above 3 are reset to 0
 * @return 0 on success, -errno if pread fails, -EINVAL if decoding throws
 */
int FileJournal::read_header(header_t *hdr) const
{
  dout(10) << "read_header" << dendl;
  bufferlist bl;

  buffer::ptr bp = buffer::create_page_aligned(block_size);
  char* bpdata = bp.c_str();
  int r = ::pread(fd, bpdata, bp.length(), 0);

  if (r < 0) {
    int err = errno;
    dout(0) << "read_header got " << cpp_strerror(err) << dendl;
    return -err;
  }

  // don't use bp.zero() here, because it also invalidates
  // crc cache (which is not yet populated anyway)
  if (bp.length() != (size_t)r) {
    // r will be always less or equal than bp.length
    bpdata += r;
    memset(bpdata, 0, bp.length() - r);
  }

  bl.push_back(std::move(bp));

  try {
    bufferlist::iterator p = bl.begin();
    ::decode(*hdr, p);
  }
  catch (const buffer::error&) {
    derr << "read_header error decoding journal header" << dendl;
    return -EINVAL;
  }

  /*
   * Unfortunately we weren't initializing the flags field for new
   * journals!  Aie.  This is safe(ish) now that we have only one
   * flag.  Probably around when we add the next flag we need to
   * remove this or else this (eventually old) code will clobber newer
   * code's flags.
   */
  if (hdr->flags > 3) {
    derr << "read_header appears to have gibberish flags; assuming 0" << dendl;
    hdr->flags = 0;
  }

  print_header(*hdr);

  return 0;
}

void FileJournal::print_header(const header_t &header) const                                                                                                                                 
{
  dout(10) << "header: block_size " << header.block_size
        ¦ ¦<< " alignment " << header.alignment
        ¦ ¦<< " max_size " << header.max_size
        ¦ ¦<< dendl;
  dout(10) << "header: start " << header.start << dendl;
  dout(10) << " write_pos " << write_pos << dendl;
} 
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章