From 177cb1e136678cf9a53147cbb03b536b9ef0f63a Mon Sep 17 00:00:00 2001 From: caijieming Date: Thu, 3 Aug 2017 22:51:31 +0800 Subject: [PATCH 01/19] issue=1258, Tcache support block-level cache evict --- src/io/tablet_io.cc | 19 +- src/io/utils_leveldb.cc | 18 + src/io/utils_leveldb.h | 2 + src/leveldb/db/builder.cc | 3 - src/leveldb/db/db_impl.cc | 3 - src/leveldb/include/leveldb/block_cache.h | 101 ++ src/leveldb/include/leveldb/cache.h | 27 + src/leveldb/include/leveldb/slice.h | 6 + src/leveldb/util/block_cache.cc | 1252 +++++++++++++++++++++ src/leveldb/util/cache.cc | 193 +++- src/leveldb/util/coding_test.cc | 11 + src/sdk/sdk_zk.cc | 3 - src/tabletnode/tabletnode_impl.cc | 60 +- src/tera_flags.cc | 3 +- 14 files changed, 1629 insertions(+), 72 deletions(-) create mode 100644 src/leveldb/include/leveldb/block_cache.h create mode 100644 src/leveldb/util/block_cache.cc diff --git a/src/io/tablet_io.cc b/src/io/tablet_io.cc index 81222e447..6de53b462 100644 --- a/src/io/tablet_io.cc +++ b/src/io/tablet_io.cc @@ -59,11 +59,11 @@ DECLARE_bool(tera_leveldb_ignore_corruption_in_compaction); DECLARE_bool(tera_leveldb_use_file_lock); DECLARE_int32(tera_tabletnode_scan_pack_max_size); -DECLARE_bool(tera_tabletnode_cache_enabled); DECLARE_int32(tera_leveldb_env_local_seek_latency); DECLARE_int32(tera_leveldb_env_dfs_seek_latency); DECLARE_int32(tera_memenv_table_cache_size); DECLARE_bool(tera_use_flash_for_memenv); +DECLARE_bool(tera_tabletnode_block_cache_enabled); DECLARE_bool(tera_tablet_use_memtable_on_leveldb); DECLARE_int64(tera_tablet_memtable_ldb_write_buffer_size); @@ -1676,18 +1676,25 @@ void TabletIO::SetupOptionsForLG() { lg_info->env = LeveldbMockEnv(); } else if (store == MemoryStore) { if (FLAGS_tera_use_flash_for_memenv) { - lg_info->env = LeveldbFlashEnv(); + if (FLAGS_tera_tabletnode_block_cache_enabled) { + LOG(INFO) << "MemLG[" << lg_i << "] activate TCache"; + lg_info->env = io::DefaultBlockCacheEnv(); + } else { + lg_info->env = LeveldbFlashEnv(); + 
} } else { lg_info->env = LeveldbMemEnv(); } lg_info->seek_latency = 0; lg_info->block_cache = m_memory_cache; } else if (store == FlashStore) { - if (!FLAGS_tera_tabletnode_cache_enabled) { - lg_info->env = LeveldbFlashEnv(); + if (FLAGS_tera_tabletnode_block_cache_enabled) { + //LOG(INFO) << "activate block-level Cache store"; + //lg_info->env = leveldb::EnvThreeLevelCache(); + LOG(INFO) << "FlashLG[" << lg_i << "] activate TCache"; + lg_info->env = io::DefaultBlockCacheEnv(); } else { - LOG(INFO) << "activate block-level Cache store"; - lg_info->env = leveldb::EnvThreeLevelCache(); + lg_info->env = LeveldbFlashEnv(); } lg_info->seek_latency = FLAGS_tera_leveldb_env_local_seek_latency; } else { diff --git a/src/io/utils_leveldb.cc b/src/io/utils_leveldb.cc index 253e23f56..3d3249e1d 100644 --- a/src/io/utils_leveldb.cc +++ b/src/io/utils_leveldb.cc @@ -18,6 +18,7 @@ #include "leveldb/comparator.h" #include "leveldb/env_dfs.h" #include "leveldb/env_flash.h" +#include "leveldb/block_cache.h" #include "leveldb/env_inmem.h" #include "leveldb/env_mock.h" #include "leveldb/table_utils.h" @@ -31,6 +32,7 @@ DECLARE_string(tera_leveldb_env_hdfs2_nameservice_list); DECLARE_string(tera_tabletnode_path_prefix); DECLARE_string(tera_dfs_so_path); DECLARE_string(tera_dfs_conf); +DECLARE_int32(tera_leveldb_block_cache_env_num_thread); namespace tera { namespace io { @@ -66,6 +68,21 @@ leveldb::Env* LeveldbBaseEnv() { } } +// Tcache: default env +static pthread_once_t block_cache_once = PTHREAD_ONCE_INIT; +static leveldb::Env* default_block_cache_env; +static void InitDefaultBlockCacheEnv() { + default_block_cache_env = new leveldb::BlockCacheEnv(LeveldbBaseEnv()); + default_block_cache_env->SetBackgroundThreads(FLAGS_tera_leveldb_block_cache_env_num_thread); + LOG(INFO) << "init block cache, thread num " << FLAGS_tera_leveldb_block_cache_env_num_thread; +} + +leveldb::Env* DefaultBlockCacheEnv() { + pthread_once(&block_cache_once, InitDefaultBlockCacheEnv); + return 
default_block_cache_env; +} + +// mem env leveldb::Env* LeveldbMemEnv() { static Mutex mutex; static leveldb::Env* mem_env = NULL; @@ -78,6 +95,7 @@ leveldb::Env* LeveldbMemEnv() { return mem_env; } +// flash env leveldb::Env* LeveldbFlashEnv() { static Mutex mutex; static leveldb::Env* flash_env = NULL; diff --git a/src/io/utils_leveldb.h b/src/io/utils_leveldb.h index f77847db9..39e5d73c1 100644 --- a/src/io/utils_leveldb.h +++ b/src/io/utils_leveldb.h @@ -18,6 +18,8 @@ void InitDfsEnv(); // return the base env leveldb used (dfs/local), singleton leveldb::Env* LeveldbBaseEnv(); +leveldb::Env* DefaultBlockCacheEnv(); // ssd + base + // return the mem env leveldb used, singleton leveldb::Env* LeveldbMemEnv(); diff --git a/src/leveldb/db/builder.cc b/src/leveldb/db/builder.cc index fdbae74af..5bce6f796 100644 --- a/src/leveldb/db/builder.cc +++ b/src/leveldb/db/builder.cc @@ -137,9 +137,6 @@ Status BuildTable(const std::string& dbname, delete builder; // Finish and check for file errors - if (s.ok()) { - s = file->Sync(); - } if (s.ok()) { s = file->Close(); } diff --git a/src/leveldb/db/db_impl.cc b/src/leveldb/db/db_impl.cc index c076008de..7d72b617b 100644 --- a/src/leveldb/db/db_impl.cc +++ b/src/leveldb/db/db_impl.cc @@ -1179,9 +1179,6 @@ Status DBImpl::FinishCompactionOutputFile(CompactionState* compact, compact->builder = NULL; // Finish and check for file errors - if (s.ok()) { - s = compact->outfile->Sync(); - } if (s.ok()) { s = compact->outfile->Close(); } diff --git a/src/leveldb/include/leveldb/block_cache.h b/src/leveldb/include/leveldb/block_cache.h new file mode 100644 index 000000000..a48022bb7 --- /dev/null +++ b/src/leveldb/include/leveldb/block_cache.h @@ -0,0 +1,101 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#ifndef STOREAGE_LEVELDB_UTIL_BLOCK_CACHE_H +#define STOREAGE_LEVELDB_UTIL_BLOCK_CACHE_H + +#include "leveldb/env.h" +#include "leveldb/options.h" +#include "leveldb/status.h" + +namespace leveldb { +///////////////////////////////////////////// +// Tcache +///////////////////////////////////////////// +extern uint64_t kBlockSize; +extern uint64_t kDataSetSize; +extern uint64_t kFidBatchNum; +extern uint64_t kCacheSize; +extern uint64_t kMetaBlockSize; +extern uint64_t kMetaTableSize; +extern uint64_t kWriteBufferSize; + +struct BlockCacheOptions { + Options opts; + std::string cache_dir; + uint64_t block_size; + uint64_t dataset_size; + uint64_t fid_batch_num; + uint64_t cache_size; + uint64_t dataset_num; + uint64_t meta_block_cache_size; + uint64_t meta_table_cache_size; + uint64_t write_buffer_size; + Env* env; + Env* cache_env; + + BlockCacheOptions() + : block_size(kBlockSize), + dataset_size(kDataSetSize), + fid_batch_num(kFidBatchNum), + cache_size(kCacheSize), + meta_block_cache_size(kMetaBlockSize), + meta_table_cache_size(kMetaTableSize), + write_buffer_size(kWriteBufferSize), + env(NULL) { + dataset_num = cache_size / dataset_size + 1; + } +}; + +class BlockCacheImpl; + +class BlockCacheEnv : public EnvWrapper { +public: + BlockCacheEnv(Env* base); + + ~BlockCacheEnv(); + + virtual Status FileExists(const std::string& fname); + + virtual Status GetChildren(const std::string& path, + std::vector* result); + + virtual Status DeleteFile(const std::string& fname); + + virtual Status CreateDir(const std::string& name); + + virtual Status DeleteDir(const std::string& name); + + virtual Status CopyFile(const std::string& from, + const std::string& to); + + virtual Status GetFileSize(const std::string& fname, uint64_t* size); + + virtual Status RenameFile(const std::string& src, const std::string& target); + + virtual Status LockFile(const std::string& fname, FileLock** lock); + + virtual Status UnlockFile(FileLock* lock); + + virtual Status 
NewSequentialFile(const std::string& fname, + SequentialFile** result); // never cache log + + // cache relatively + virtual Status NewRandomAccessFile(const std::string& fname, + RandomAccessFile** result); // cache Pread + + virtual Status NewWritableFile(const std::string& fname, + WritableFile** result); // cache Append + virtual Status LoadCache(const BlockCacheOptions& opts, const std::string& cache_dir); + +private: + std::vector cache_vec_; + Env* dfs_env_; +}; + +Env* NewBlockCacheEnv(Env* base); + +} // leveldb +#endif + diff --git a/src/leveldb/include/leveldb/cache.h b/src/leveldb/include/leveldb/cache.h index 636811b65..2299b2528 100644 --- a/src/leveldb/include/leveldb/cache.h +++ b/src/leveldb/include/leveldb/cache.h @@ -29,9 +29,36 @@ namespace leveldb { class Cache; +// An entry is a variable length heap-allocated structure. Entries +// are kept in a circular doubly linked list ordered by access time. +struct LRUHandle { + void* value; + void (*deleter)(const Slice&, void* value); + LRUHandle* next_hash; + LRUHandle* next; + LRUHandle* prev; + size_t charge; // TODO(opt): Only allow uint32_t? + size_t key_length; + uint32_t refs; + uint32_t hash; // Hash of key(); used for fast sharding and comparisons + uint64_t cache_id; // cache id, user spec + char key_data[1]; // Beginning of key + + Slice key() const { + // For cheaper lookups, we allow a temporary Handle object + // to store a pointer to a key in "value". + if (next == this) { + return *(reinterpret_cast(value)); + } else { + return Slice(key_data, key_length); + } + } +}; + // Create a new cache with a fixed size capacity. This implementation // of Cache uses a least-recently-used eviction policy. 
extern Cache* NewLRUCache(size_t capacity); +extern Cache* New2QCache(size_t capacity); class Cache { public: diff --git a/src/leveldb/include/leveldb/slice.h b/src/leveldb/include/leveldb/slice.h index 4f1eea30e..286f303f7 100644 --- a/src/leveldb/include/leveldb/slice.h +++ b/src/leveldb/include/leveldb/slice.h @@ -68,6 +68,12 @@ class Slice { size_ -= n; } + // Drop the last "n" bytes from this slice. + void remove_suffix(size_t n) { + assert(n <= size()); + size_ -= n; + } + // Return a string that contains the copy of the referenced data. std::string ToString() const { return std::string(data_, size_); } diff --git a/src/leveldb/util/block_cache.cc b/src/leveldb/util/block_cache.cc new file mode 100644 index 000000000..8401cb1c0 --- /dev/null +++ b/src/leveldb/util/block_cache.cc @@ -0,0 +1,1252 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "leveldb/block_cache.h" + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "db/table_cache.h" +#include "leveldb/db.h" +#include "leveldb/cache.h" +#include "leveldb/env.h" +#include "leveldb/iterator.h" +#include "leveldb/options.h" +#include "leveldb/status.h" +#include "leveldb/table_utils.h" +#include "leveldb/write_batch.h" +#include "port/port.h" +#include "util/coding.h" +#include "util/hash.h" +#include "util/mutexlock.h" +#include "util/string_ext.h" +#include "util/thread_pool.h" + +namespace leveldb { + +///////////////////////////////////////////// +// Tcache +///////////////////////////////////////////// +uint64_t kBlockSize = 4096UL; +uint64_t kDataSetSize = 134217728UL; +uint64_t kFidBatchNum = 200000UL; +uint64_t kCacheSize = 350000000000UL; +uint64_t kMetaBlockSize = 2000UL; +uint64_t kMetaTableSize = 500UL; +uint64_t kWriteBufferSize = 1048576UL; + +class BlockCacheWritableFile; +class BlockCacheRandomAccessFile; +class 
BlockCacheImpl; + +// Each SSD will New a BlockCache +// block state +uint64_t kCacheBlockValid = 1; +struct CacheBlock { + uint64_t fid; + uint64_t block_idx; + uint64_t sid; + uint64_t cache_block_idx; + uint64_t state; + port::CondVar cv; + Slice data_block; + bool data_block_alloc; + uint64_t data_block_refs; + LRUHandle* handle; + Status s; + + CacheBlock(port::Mutex* mu) + : fid(0), + block_idx(0), + sid(0xffffffffffffffff), + cache_block_idx(0xffffffffffffffff), + state(!kCacheBlockValid), + cv(mu), + data_block_alloc(false), + data_block_refs(0), + handle(NULL) { + } + + // access in cache lock + void GetDataBlock(uint64_t block_size, Slice data) { + if (data_block_refs == 0) { // first one alloc mem + assert(data_block.size() == 0); + assert(data_block_alloc == false); + if (data.size() == 0) { + char* buf = new char[block_size]; + data = Slice(buf, block_size); + data_block_alloc = true; + } + data_block = data; + } + ++data_block_refs; + } + + // access in cache lock + void ReleaseDataBlock() { + --data_block_refs; + if (data_block_refs == 0) { + if (data_block_alloc) { + char* data = (char*)data_block.data(); + delete[] data; + data_block_alloc = false; + } + data_block = Slice(); + } + } + + void DecodeFrom(Slice record) { + fid = DecodeFixed64(record.data()); + record.remove_prefix(sizeof(uint64_t)); + block_idx = DecodeFixed64(record.data()); + record.remove_prefix(sizeof(uint64_t)); + state = DecodeFixed64(record.data()); + return; + } + + const std::string Encode() { + std::string r; + PutFixed64(&r, fid); + PutFixed64(&r, block_idx); + PutFixed64(&r, state); + return r; + } + + const std::string ToString() { + std::stringstream ss; + ss << "CacheBlock: fid: " << fid << ", block_idx: " << block_idx + << ", sid: " << sid << ", cache_block_idx: " << cache_block_idx + << ", state " << state; + return ss.str(); + } +}; + +struct DataSet { + Cache* cache; + int fd; +}; + +class BlockCacheImpl { +public: + BlockCacheImpl(const BlockCacheOptions& 
options); + + ~BlockCacheImpl(); + + const std::string& WorkPath(); + + Status LoadCache(); // init cache + + Status NewWritableFile(const std::string& fname, + WritableFile** result); + + Status NewRandomAccessFile(const std::string& fname, + RandomAccessFile** result); // cache Pread + static void BlockDeleter(const Slice& key, void* v); + +private: + friend struct DataSet; + struct LockContent; + + Status LockAndPut(LockContent& lc); + + Status FillCache(CacheBlock* block); + + Status ReadCache(CacheBlock* block); + + uint64_t AllocFileId(); // no more than fid_batch_num + + uint64_t FileId(const std::string& fname); + + DataSet* GetDataSet(uint64_t sid); + + CacheBlock* GetAndAllocBlock(uint64_t fid, uint64_t block_idx); + + Status LogRecord(CacheBlock* block); + + Status ReleaseBlock(CacheBlock* block); + +private: + friend class BlockCacheWritableFile; + friend class BlockCacheRandomAccessFile; + friend struct CacheBlock; + + BlockCacheOptions options_; + std::string work_path_; + Env* dfs_env_; + //Env* posix_env_; + + port::Mutex mu_; + // key lock list + struct Waiter { + port::CondVar cv; + int wait_num; + bool done; + Waiter(port::Mutex* mu):cv(mu), wait_num(0), done(false) {} + }; + typedef std::map LockKeyMap; + LockKeyMap lock_key_; + + uint64_t new_fid_; + uint64_t prev_fid_; + + enum LockKeyType { + kDBKey = 0, + kDataSetKey = 1, + }; + struct LockContent { + int type; + + // DB key + Slice db_lock_key; + Slice db_lock_val; + std::string* db_val; + + // data set id + uint64_t sid; + DataSet* data_set; + }; + typedef std::map DataSetMap; + DataSetMap data_set_map_; + + //WritableFile* logfile_; + //log::Writer* log_; + DB* db_; // store meta + ThreadPool bg_fill_; + ThreadPool bg_read_; + ThreadPool bg_flush_; +}; + +// Must insure not init more than twice +Env* NewBlockCacheEnv(Env* base) { + return new BlockCacheEnv(base); +} + +BlockCacheEnv::BlockCacheEnv(Env* base) + : EnvWrapper(NewPosixEnv()), dfs_env_(base) { + 
//target()->SetBackgroundThreads(30); +} + +BlockCacheEnv::~BlockCacheEnv() {} + +Status BlockCacheEnv::FileExists(const std::string& fname) { + return dfs_env_->FileExists(fname); +} + +Status BlockCacheEnv::GetChildren(const std::string& path, + std::vector* result) { + return dfs_env_->GetChildren(path, result); +} + +Status BlockCacheEnv::DeleteFile(const std::string& fname) { + return dfs_env_->DeleteFile(fname); +} + +Status BlockCacheEnv::CreateDir(const std::string& name) { + return dfs_env_->CreateDir(name); +} + +Status BlockCacheEnv::DeleteDir(const std::string& name) { + return dfs_env_->DeleteDir(name); +} + +Status BlockCacheEnv::CopyFile(const std::string& from, + const std::string& to) { + return dfs_env_->CopyFile(from, to); +} + +Status BlockCacheEnv::GetFileSize(const std::string& fname, uint64_t* size) { + return dfs_env_->GetFileSize(fname, size); +} + +Status BlockCacheEnv::RenameFile(const std::string& src, const std::string& target) { + return dfs_env_->RenameFile(src, target); +} + +Status BlockCacheEnv::LockFile(const std::string& fname, FileLock** lock) { + return dfs_env_->LockFile(fname, lock); +} + +Status BlockCacheEnv::UnlockFile(FileLock* lock) { + return dfs_env_->UnlockFile(lock); +} + +Status BlockCacheEnv::LoadCache(const BlockCacheOptions& opts, const std::string& cache_dir) { + BlockCacheOptions options = opts; + options.cache_dir = cache_dir; + options.env = dfs_env_; + options.cache_env = this->target(); + BlockCacheImpl* cache = new BlockCacheImpl(options); + Status s = cache->LoadCache(); + assert(s.ok()); + cache_vec_.push_back(cache); // no need lock + return s; +} + +Status BlockCacheEnv::NewSequentialFile(const std::string& fname, + SequentialFile** result) { + return dfs_env_->NewSequentialFile(fname, result); +} + +Status BlockCacheEnv::NewWritableFile(const std::string& fname, + WritableFile** result) { + if (fname.rfind(".sst") != fname.size() - 4) { + return dfs_env_->NewWritableFile(fname, result); + } + + // 
cache sst file + uint32_t hash = (Hash(fname.c_str(), fname.size(), 13)) % cache_vec_.size(); + BlockCacheImpl* cache = cache_vec_[hash]; + Status s = cache->NewWritableFile(fname, result); + Log("[block_cache %s] open file write: %s, hash: %u, status: %s\n", + cache->WorkPath().c_str(), fname.c_str(), hash, s.ToString().c_str()); + return s; +} + +Status BlockCacheEnv::NewRandomAccessFile(const std::string& fname, + RandomAccessFile** result) { + uint32_t hash = (Hash(fname.c_str(), fname.size(), 13)) % cache_vec_.size(); + BlockCacheImpl* cache = cache_vec_[hash]; + Status s = cache->NewRandomAccessFile(fname, result); + Log("[block_cache %s] open file read: %s, hash: %u, status: %s\n", + cache->WorkPath().c_str(), fname.c_str(), hash, s.ToString().c_str()); + return s; +} + +class BlockCacheWriteBuffer { +public: + BlockCacheWriteBuffer(const std::string& path, + const std::string& file, + int block_size) + : offset_(0), + block_size_(block_size), + block_idx_(0), + tmp_storage_(NULL), + path_(path), + file_(file) { + } + + ~BlockCacheWriteBuffer() { + assert(block_list_.size() == 0); + } + + uint32_t NumFullBlock() { // use for BGFlush + MutexLock l(&mu_); + if (block_list_.size() > 1) { + return block_list_.size() - 1; + } + return 0; + } + + Status Append(const Slice& data) { + MutexLock l(&mu_); + if (tmp_storage_ == NULL) { + tmp_storage_ = new std::string(); + tmp_storage_->resize(0); + block_list_.push_back(tmp_storage_); + } + uint32_t begin = offset_ / block_size_; + uint32_t end = (offset_ + data.size()) / block_size_; + if (begin == end) { // in the same block + tmp_storage_->append(data.data(), data.size()); + } else { + uint32_t tmp_size = block_size_ - (offset_ % block_size_); + tmp_storage_->append(data.data(), tmp_size); + assert(tmp_storage_->size() == block_size_); + Slice buf(data.data() + tmp_size, data.size() - tmp_size); + for (uint32_t i = begin + 1; i <= end; ++i) { + tmp_storage_ = new std::string(); + tmp_storage_->resize(0); + 
block_list_.push_back(tmp_storage_); + if (i < end) { // last block + tmp_storage_->append(buf.data(), block_size_); + buf.remove_prefix(block_size_); + } else { // last block + tmp_storage_->append(buf.data(), buf.size()); + } + Log("[%s] add tmp_storage %s: offset: %lu, buf_size: %lu\n", + path_.c_str(), + file_.c_str(), + offset_, + buf.size()); + } + } + offset_ += data.size(); + Log("[%s] add record: %s, begin: %u, end: %u, offset: %lu, data_size: %lu, block_size: %u\n", + path_.c_str(), + file_.c_str(), + begin, end, + offset_, data.size(), block_size_); + return Status::OK(); + } + + std::string* PopFrontBlock(uint64_t* block_idx) { + MutexLock l(&mu_); + std::string* block = block_list_.front(); + if (block_list_.size() == 0) { + return NULL; + } + block_list_.pop_front(); + *block_idx = block_idx_; + block_idx_++; + return block; + } + + std::string* PopBackBlock(uint64_t* block_idx) { + MutexLock l(&mu_); + if (block_list_.size() == 0) { + return NULL; + } + std::string* block = block_list_.back(); + block_list_.pop_back(); + *block_idx = offset_ / block_size_; + return block; + } + + void ReleaseBlock(std::string* block) { + delete block; + } + +private: + port::Mutex mu_; + uint64_t offset_; + uint32_t block_size_; + uint64_t block_idx_; + std::string* tmp_storage_; + std::list block_list_; // kBlockSize + std::string path_; + std::string file_; +}; + +class BlockCacheWritableFile : public WritableFile { +public: + BlockCacheWritableFile(BlockCacheImpl* c, const std::string& fname, Status* s) + : cache_(c), + bg_cv_(&c->mu_), + bg_block_flush_(0), + write_buffer_(cache_->WorkPath(), fname, cache_->options_.block_size), + fname_(fname) { // file open + *s = cache_->dfs_env_->NewWritableFile(fname_, &dfs_file_); + Log("[%s] dfs open: %s, block_size: %lu, status: %s\n", + cache_->WorkPath().c_str(), + fname.c_str(), + cache_->options_.block_size, + s->ToString().c_str()); + return; + } + + ~BlockCacheWritableFile() { + if (dfs_file_ != NULL) { + Log("[%s] 
dfs close for release %s\n", cache_->WorkPath().c_str(), fname_.c_str()); + dfs_file_->Close(); + delete dfs_file_; + dfs_file_ = NULL; + } + + Log("[%s] begin release %s\n", cache_->WorkPath().c_str(), fname_.c_str()); + MutexLock lockgard(&cache_->mu_); + uint64_t block_idx; + std::string* block_data = write_buffer_.PopBackBlock(&block_idx); + if (block_data == NULL) { + Log("[%s] end release(nothing) %s\n", cache_->WorkPath().c_str(), fname_.c_str()); + return; + } + FillCache(block_data, block_idx); + + while (bg_block_flush_ > 0) { + bg_cv_.Wait(); + } + Log("[%s] end release %s\n", cache_->WorkPath().c_str(), fname_.c_str()); + return; + } + + Status Append(const Slice& data) { + Status s = dfs_file_->Append(data); + if (!s.ok()) { + Log("[%s] dfs append fail: %s, status: %s\n", + cache_->WorkPath().c_str(), + fname_.c_str(), + s.ToString().c_str()); + return s; + } + write_buffer_.Append(data); + + MutexLock lockgard(&cache_->mu_); + MaybeScheduleBGFlush(); + return Status::OK(); + } + + Status Close() { + Log("[%s] begin close %s\n", cache_->WorkPath().c_str(), fname_.c_str()); + Status s = dfs_file_->Close(); + delete dfs_file_; + dfs_file_ = NULL; + + MutexLock lockgard(&cache_->mu_); + uint64_t block_idx; + std::string* block_data = write_buffer_.PopBackBlock(&block_idx); + if (block_data == NULL) { + Log("[%s] end close state error: %s\n", cache_->WorkPath().c_str(), fname_.c_str()); + return s; + } + FillCache(block_data, block_idx); + + while (bg_block_flush_ > 0) { + bg_cv_.Wait(); + } + Log("[%s] end close %s, status %s\n", cache_->WorkPath().c_str(), fname_.c_str(), + s.ToString().c_str()); + return s; + } + + Status Flush() { + Log("[%s] dfs flush: %s\n", cache_->WorkPath().c_str(), fname_.c_str()); + return dfs_file_->Flush(); + } + + Status Sync() { + Log("[%s] dfs sync: %s\n", cache_->WorkPath().c_str(), fname_.c_str()); + return dfs_file_->Sync(); + } + +private: + void MaybeScheduleBGFlush() { + cache_->mu_.AssertHeld(); + Log("[%s] Maybe 
schedule BGFlush: %s, bg_block_flush: %u, block_nr: %u\n", + cache_->WorkPath().c_str(), + fname_.c_str(), + bg_block_flush_, + write_buffer_.NumFullBlock()); + while (bg_block_flush_ < write_buffer_.NumFullBlock()) { + bg_block_flush_++; + cache_->bg_flush_.Schedule(&BlockCacheWritableFile::BGFlushFunc, this, 10); + } + } + + static void BGFlushFunc(void* arg) { + reinterpret_cast(arg)->BGFlush(); + } + void BGFlush() { + Log("[%s] begin BGFlush: %s\n", cache_->WorkPath().c_str(), fname_.c_str()); + MutexLock lockgard(&cache_->mu_); + if (write_buffer_.NumFullBlock() == 0) { + return; + } + + uint64_t block_idx; + std::string* block_data = write_buffer_.PopFrontBlock(&block_idx); + assert(block_data != NULL); + FillCache(block_data, block_idx); + + bg_block_flush_--; + MaybeScheduleBGFlush(); + bg_cv_.Signal(); + return; + } + + Status FillCache(std::string* block_data, uint64_t block_idx) { + cache_->mu_.AssertHeld(); + uint64_t fid = cache_->FileId(fname_); + CacheBlock* block = cache_->GetAndAllocBlock(fid, block_idx); + assert(block->state != kCacheBlockValid); + block->GetDataBlock(cache_->options_.block_size, Slice(*block_data)); + cache_->mu_.Unlock(); + + // Do io without lock + cache_->LogRecord(block); + cache_->FillCache(block); + + cache_->mu_.Lock(); + block->state = kCacheBlockValid; + cache_->ReleaseBlock(block); + write_buffer_.ReleaseBlock(block_data); + return Status::OK(); + } + +private: + BlockCacheImpl* cache_; + //port::AtomicPointer shutting_down_; + port::CondVar bg_cv_; // Signalled when background work finishes + WritableFile* dfs_file_; + // protected by cache_.mu_ + uint32_t bg_block_flush_; + BlockCacheWriteBuffer write_buffer_; + std::string fname_; +}; + +class BlockCacheRandomAccessFile : public RandomAccessFile { +public: + BlockCacheRandomAccessFile(BlockCacheImpl* c, const std::string& fname, Status* s) + : cache_(c), + fname_(fname) { + *s = cache_->dfs_env_->NewRandomAccessFile(fname_, &dfs_file_); + Log("[%s] dfs open for 
read: %s, block_size: %lu, status: %s\n", + cache_->WorkPath().c_str(), + fname.c_str(), + cache_->options_.block_size, + s->ToString().c_str()); + return; + } + + ~BlockCacheRandomAccessFile() { + delete dfs_file_; + return; + } + + Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + MutexLock lockgard(&cache_->mu_); + uint64_t fid = cache_->FileId(fname_); + uint64_t begin = offset / cache_->options_.block_size; + uint64_t end = (offset + n) / cache_->options_.block_size; + assert(begin <= end); + std::vector c_miss; + std::vector c_locked; + std::vector c_valid; + std::vector block_queue; + + Log("[%s] begin pread %s, size %lu, offset %lu, fid %lu, start_block %lu, end_block %lu" + ", block_size %lu\n", + cache_->WorkPath().c_str(), fname_.c_str(), n, offset, fid, + begin, end, cache_->options_.block_size); + for (uint64_t block_idx = begin; block_idx <= end; ++block_idx) { + CacheBlock* block = cache_->GetAndAllocBlock(fid, block_idx); + assert(block->fid == fid && block->block_idx == block_idx); + block->GetDataBlock(cache_->options_.block_size, Slice()); + block_queue.push_back(block); // sort by block_idx + + if (block->state != kCacheBlockValid && block->handle->refs == 2) { // first one access this block + c_miss.push_back(block); + } else if (block->state == kCacheBlockValid && block->handle->refs == 2) { // frist one access this block + c_valid.push_back(block); + } else { + c_locked.push_back(block); + } + Log("[%s] queue block: %s, refs %u, data_block_refs %lu, alloc %u\n", + cache_->WorkPath().c_str(), block->ToString().c_str(), + block->handle->refs, block->data_block_refs, + block->data_block_alloc); + } + cache_->mu_.Unlock(); + + // async read miss data + for (uint32_t i = 0; i < c_miss.size(); ++i) { + CacheBlock* block = c_miss[i]; + AsyncDfsReader* reader = new AsyncDfsReader; + reader->file = const_cast(this); + reader->block = block; + Log("[%s] pread in miss list, %s\n", + cache_->WorkPath().c_str(), + 
block->ToString().c_str()); + cache_->bg_read_.Schedule(&BlockCacheRandomAccessFile::AsyncDfsRead, reader, 10); + } + + // async read valid data + for (uint32_t i = 0; i < c_valid.size(); ++i) { + CacheBlock* block = c_valid[i]; + AsyncCacheReader* reader = new AsyncCacheReader; + reader->file = const_cast(this); + reader->block = block; + Log("[%s] pread in valid list, %s\n", + cache_->WorkPath().c_str(), + block->ToString().c_str()); + cache_->bg_read_.Schedule(&BlockCacheRandomAccessFile::AsyncCacheRead, reader, 10); + } + + // wait dfs read done and async cache file + for (uint32_t i = 0; i < c_miss.size(); ++i) { + MutexLock lockgard(&cache_->mu_); + CacheBlock* block = c_miss[i]; + block->cv.Wait(); + Log("[%s] pread in miss list(dfs done), %s\n", + cache_->WorkPath().c_str(), + block->ToString().c_str()); + } + + for (uint32_t i = 0; i < c_miss.size(); ++i) { + CacheBlock* block = c_miss[i]; + AsyncCacheWriter* writer = new AsyncCacheWriter; + writer->file = const_cast(this); + writer->block = block; + Log("[%s] pread in miss list(fill cache), %s\n", + cache_->WorkPath().c_str(), + block->ToString().c_str()); + cache_->bg_fill_.Schedule(&BlockCacheRandomAccessFile::AsyncCacheWrite, writer, 10); + } + + for (uint32_t i = 0; i < c_miss.size(); ++i) { // wait cache fill finish + MutexLock lockgard(&cache_->mu_); + CacheBlock* block = c_miss[i]; + block->cv.Wait(); + Log("[%s] pread in miss list(fill cache done), %s\n", + cache_->WorkPath().c_str(), + block->ToString().c_str()); + } + + // wait cache read done + for (uint32_t i = 0; i < c_valid.size(); ++i) { + MutexLock lockgard(&cache_->mu_); + CacheBlock* block = c_valid[i]; + block->cv.Wait(); + Log("[%s] pread in valid list(done), %s\n", + cache_->WorkPath().c_str(), + block->ToString().c_str()); + } + + // wait other async read finish + for (uint32_t i = 0; i < c_locked.size(); ++i) { + MutexLock lockgard(&cache_->mu_); + CacheBlock* block = c_locked[i]; + while (block->state != kCacheBlockValid) { + 
block->cv.Wait(); + } + } + + // fill user mem + size_t msize = 0; + for (uint64_t block_idx = begin; block_idx <= end; ++block_idx) { + CacheBlock* block = block_queue[block_idx - begin]; + Slice data_block = block->data_block; + if (block_idx == begin) { + data_block.remove_prefix(offset % cache_->options_.block_size); + } + if (block_idx == end) { + data_block.remove_suffix(cache_->options_.block_size - (n + offset) % cache_->options_.block_size); + } + memcpy(scratch + msize, data_block.data(), data_block.size()); + msize += data_block.size(); + Log("[%s] fill user data, %s, prefix %lu, suffix %lu, msize %lu, offset %lu\n", + cache_->WorkPath().c_str(), fname_.c_str(), + block_idx == begin ? offset % cache_->options_.block_size: 0, + block_idx == end ? cache_->options_.block_size - (n + offset) % cache_->options_.block_size + : cache_->options_.block_size, + msize, offset); + } + assert(msize == n); + *result = Slice(scratch, n); + + cache_->mu_.Lock(); + for (uint32_t i = 0; i < c_miss.size(); ++i) { + CacheBlock* block = c_miss[i]; + block->state = kCacheBlockValid; + Log("[%s] wakeup for miss, %s\n", cache_->WorkPath().c_str(), block->ToString().c_str()); + cache_->ReleaseBlock(block); + } + for (uint32_t i = 0; i < c_valid.size(); ++i) { + CacheBlock* block = c_valid[i]; + block->state = kCacheBlockValid; + Log("[%s] wakeup for valid, %s\n", cache_->WorkPath().c_str(), block->ToString().c_str()); + cache_->ReleaseBlock(block); + } + for (uint32_t i = 0; i < c_locked.size(); ++i) { + CacheBlock* block = c_locked[i]; + block->state = kCacheBlockValid; + Log("[%s] wakeup for lock, %s\n", cache_->WorkPath().c_str(), block->ToString().c_str()); + cache_->ReleaseBlock(block); + } + + Log("[%s] end pread %s, size %lu, offset %lu, fid %lu, start_block %lu, end_block %lu" + ", block_size %lu\n", + cache_->WorkPath().c_str(), fname_.c_str(), n, offset, fid, + begin, end, cache_->options_.block_size); + return Status::OK(); + } + +private: + struct AsyncDfsReader { + 
BlockCacheRandomAccessFile* file; + CacheBlock* block; + }; + static void AsyncDfsRead(void* arg) { + AsyncDfsReader* reader = (AsyncDfsReader*)arg; + reader->file->HandleDfsRead(reader); + delete reader; + return; + } + void HandleDfsRead(AsyncDfsReader* reader) { + Status s; + CacheBlock* block = reader->block; + char* scratch = (char*)(block->data_block.data()); + Slice result; + uint64_t offset = block->block_idx * cache_->options_.block_size; + size_t n = cache_->options_.block_size; + s = dfs_file_->Read(offset, n, &result, scratch); + Log("[%s] cache async.dfs read, %s" + ", offset %lu, size %lu, status %s, res %lu\n", + cache_->WorkPath().c_str(), block->ToString().c_str(), + offset, n, + s.ToString().c_str(), result.size()); + + MutexLock lockgard(&cache_->mu_); + block->cv.SignalAll(); + return; + } + + struct AsyncCacheReader { + BlockCacheRandomAccessFile* file; + CacheBlock* block; + }; + static void AsyncCacheRead(void* arg) { + AsyncCacheReader* reader = (AsyncCacheReader*)arg; + reader->file->HandleCacheRead(reader); + delete reader; + return; + } + void HandleCacheRead(AsyncCacheReader* reader) { + CacheBlock* block = reader->block; + cache_->ReadCache(block); + + MutexLock lockgard(&cache_->mu_); + block->cv.SignalAll(); + return; + } + + struct AsyncCacheWriter { + BlockCacheRandomAccessFile* file; + CacheBlock* block; + }; + static void AsyncCacheWrite(void* arg) { + AsyncCacheWriter* writer = (AsyncCacheWriter*)arg; + writer->file->HandleCacheWrite(writer); + delete writer; + return; + } + void HandleCacheWrite(AsyncCacheWriter* writer) { + CacheBlock* block = writer->block; + Log("[%s] handle cache write, %s\n", + cache_->WorkPath().c_str(), + block->ToString().c_str()); + cache_->LogRecord(block); + cache_->FillCache(block); + + MutexLock lockgard(&cache_->mu_); + block->cv.SignalAll(); + return; + } + +private: + BlockCacheImpl* cache_; + RandomAccessFile* dfs_file_; + std::string fname_; +}; + +// Tcache impl 
+BlockCacheImpl::BlockCacheImpl(const BlockCacheOptions& options) + : options_(options), + dfs_env_(options.env), + new_fid_(0), + prev_fid_(0), + db_(NULL) { + bg_fill_.SetBackgroundThreads(30); + bg_read_.SetBackgroundThreads(30); + bg_flush_.SetBackgroundThreads(30); +} + +BlockCacheImpl::~BlockCacheImpl() {} + +Status BlockCacheImpl::NewWritableFile(const std::string& fname, + WritableFile** result) { + Status s; + BlockCacheWritableFile* file = new BlockCacheWritableFile(this, fname, &s); + *result = NULL; + if (s.ok()) { + *result = (WritableFile*)file; + } + return s; +} + +Status BlockCacheImpl::NewRandomAccessFile(const std::string& fname, + RandomAccessFile** result) { + Status s; + BlockCacheRandomAccessFile* file = new BlockCacheRandomAccessFile(this, fname, &s); + *result = NULL; + if (s.ok()) { + *result = (RandomAccessFile*)file; + } + return s; +} + +void BlockCacheImpl::BlockDeleter(const Slice& key, void* v) { + CacheBlock* block = (CacheBlock*)v; + Log("Evict blockcache: %s\n", block->ToString().c_str()); + delete block; + return; +} + +// if lock succ, put lock_val, else get newer value +Status BlockCacheImpl::LockAndPut(LockContent& lc) { + mu_.AssertHeld(); + Status s; + std::string key; + if (lc.type == kDBKey) { + key = lc.db_lock_key.ToString(); + } else if (lc.type == kDataSetKey) { + key = "DS#"; + PutFixed64(&key, lc.sid); + } else { + return Status::NotSupported("key type error"); + } + Log("[%s] trylock key: %s\n", + this->WorkPath().c_str(), + key.c_str()); + + Waiter* w = NULL; + LockKeyMap::iterator it = lock_key_.find(key); + if (it != lock_key_.end()){ + w = it->second; + w->wait_num ++; + while (!w->done) { + w->cv.Wait(); + } + mu_.Unlock(); + + if (lc.type == kDBKey) { + ReadOptions r_opts; + s = db_->Get(r_opts, key, lc.db_val); + Log("[%s] get lock key: %s, val: %s, status: %s\n", + this->WorkPath().c_str(), + key.c_str(), + lc.db_val->c_str(), + s.ToString().c_str()); + } else if (lc.type == kDataSetKey) { + lc.data_set = 
data_set_map_[lc.sid]; + Log("[%s] get dataset sid: %lu\n", + this->WorkPath().c_str(), + lc.sid); + } + + mu_.Lock(); + if (--w->wait_num == 0) { + // last thread wait for open + lock_key_.erase(key); + Log("[%s] wait done %s, delete cv\n", + this->WorkPath().c_str(), + key.c_str()); + delete w; + } else { + Log("[%s] wait done %s, not last\n", + this->WorkPath().c_str(), + key.c_str()); + } + } else { + w = new Waiter(&mu_); + w->wait_num = 1; + lock_key_[key] = w; + mu_.Unlock(); + + if (lc.type == kDBKey) { + WriteOptions w_opts; + s = db_->Put(w_opts, key, lc.db_lock_val); + if (s.ok()) { + lc.db_val->append(lc.db_lock_val.data(), lc.db_lock_val.size()); + } + Log("[%s] put kDBKey: %s, status %s\n", + this->WorkPath().c_str(), + key.c_str(), + s.ToString().c_str()); + } else if (lc.type == kDataSetKey) { + std::string end_ds = "DS#"; + PutFixed64(&end_ds, lc.sid + 1); + lc.data_set = new DataSet; + lc.data_set->cache = New2QCache((options_.dataset_size / options_.block_size) + 1);// number of blocks in DS + std::string file = options_.cache_dir + "/" + Uint64ToString(lc.sid); + lc.data_set->fd = open(file.c_str(), O_RDWR | O_CREAT, 0644); + assert(lc.data_set->fd > 0); + Log("[%s] begin new dataset, sid: %lu, file: %s, cs: %lu, fd: %d\n", + this->WorkPath().c_str(), + lc.sid, file.c_str(), (options_.dataset_size / options_.block_size) + 1, + lc.data_set->fd); + + // reload hash lru + ReadOptions s_opts; + leveldb::Iterator* db_it = db_->NewIterator(s_opts); + for (db_it->Seek(key); + db_it->Valid() && db_it->key().ToString() < end_ds; + db_it->Next()) { + Slice lkey = db_it->key(); + lkey.remove_prefix(3 + sizeof(uint64_t));// remove DS#sid + //Slice lval = db_it->value(); + + CacheBlock* block = new CacheBlock(&mu_); + block->DecodeFrom(db_it->value()); // get fid and block_idx + std::string hkey; + PutFixed64(&hkey, block->fid); + PutFixed64(&hkey, block->block_idx); + block->sid = lc.sid; + block->cache_block_idx = DecodeFixed64(lkey.data()); + Log("[%s] 
insert cacheblock into 2QLru, %s\n", + this->WorkPath().c_str(), + block->ToString().c_str()); + LRUHandle* handle = (LRUHandle*)(lc.data_set->cache->Insert(hkey, block, 1, &BlockCacheImpl::BlockDeleter)); + assert(handle != NULL); + handle->cache_id = block->cache_block_idx; + lc.data_set->cache->Release((Cache::Handle*)handle); + } + delete db_it; + + mu_.Lock(); + data_set_map_[lc.sid] = lc.data_set; + mu_.Unlock(); + } + + mu_.Lock(); + if (--w->wait_num == 0) { + lock_key_.erase(key); + Log("[%s] put done %s, no wait thread\n", + this->WorkPath().c_str(), + key.c_str()); + delete w; + } else { + Log("[%s] put done %s, signal all wait thread\n", + this->WorkPath().c_str(), + key.c_str()); + w->done = true; + w->cv.SignalAll(); + } + } + return s; +} + +const std::string& BlockCacheImpl::WorkPath() { + return work_path_; +} + +Status BlockCacheImpl::LoadCache() { + // open meta file + work_path_ = options_.cache_dir; + std::string dbname = options_.cache_dir + "/meta/"; + options_.opts.env = options_.cache_env; // local write + options_.opts.filter_policy = NewBloomFilterPolicy(10); + options_.opts.block_cache = leveldb::NewLRUCache(options_.meta_block_cache_size * 1024UL * 1024); + options_.opts.table_cache = new leveldb::TableCache(options_.meta_table_cache_size * 1024UL * 1024); + options_.opts.write_buffer_size = options_.write_buffer_size; + options_.opts.info_log = Logger::DefaultLogger(); + Log("[block_cache %s] open meta db: block_cache: %lu, table_cache: %lu\n", + dbname.c_str(), + options_.meta_block_cache_size, + options_.meta_table_cache_size); + Status s = DB::Open(options_.opts, dbname, &db_); + assert(s.ok()); + + // recover fid + std::string key = "FID#"; + std::string val; + ReadOptions r_opts; + s = db_->Get(r_opts, key, &val); + if (!s.ok()) { + prev_fid_ = 0; + } else { + prev_fid_ = DecodeFixed64(val.c_str()); + } + new_fid_ = prev_fid_ + options_.fid_batch_num; + Log("[block_cache %s]: reuse block cache: prev_fid: %lu, new_fid: %lu\n", + 
dbname.c_str(), prev_fid_, new_fid_); + s = Status::OK(); + return s; +} + +Status BlockCacheImpl::FillCache(CacheBlock* block) { + MutexLock l(&mu_); + uint64_t sid = block->sid; + uint64_t cache_block_idx = block->cache_block_idx; + int fd = (data_set_map_[sid])->fd; + mu_.Unlock(); + + // do io without lock + ssize_t res = pwrite(fd, block->data_block.data(), block->data_block.size(), + cache_block_idx * options_.block_size); + Log("[%s] fillcache: sid %lu, dataset.fd %d, datablock size %lu, cb_idx %lu, %s, res %ld\n", + this->WorkPath().c_str(), sid, fd, block->data_block.size(), + cache_block_idx, + block->ToString().c_str(), + res); + + mu_.Lock(); + if (res < 0) { + return Status::Corruption("FillCache error"); + } + return Status::OK(); +} + +Status BlockCacheImpl::ReadCache(CacheBlock* block) { + MutexLock l(&mu_); + uint64_t sid = block->sid; + uint64_t cache_block_idx = block->cache_block_idx; + int fd = (data_set_map_[sid])->fd; + mu_.Unlock(); + + // do io without lock + ssize_t res = pread(fd, (char*)block->data_block.data(), block->data_block.size(), + cache_block_idx * options_.block_size); + Log("[%s] readcache: sid %lu, dataset.fd %d, datablock size %lu, cb_idx %lu, %s, res %ld\n", + this->WorkPath().c_str(), sid, fd, block->data_block.size(), + cache_block_idx, + block->ToString().c_str(), + res); + + mu_.Lock(); + if (res < 0) { + return Status::Corruption("ReadCache error"); + } + return Status::OK(); +} + +uint64_t BlockCacheImpl::AllocFileId() { // no more than fid_batch_num + mu_.AssertHeld(); + uint64_t fid = ++new_fid_; + while (new_fid_ - prev_fid_ >= options_.fid_batch_num) { + std::string key = "FID#"; + std::string lock_val; + PutFixed64(&lock_val, new_fid_); + std::string val; + + LockContent lc; + lc.type = kDBKey; + lc.db_lock_key = key; + lc.db_lock_val = lock_val; + lc.db_val = &val; + Status s = LockAndPut(lc); + if (s.ok()) { + prev_fid_ = DecodeFixed64(val.c_str()); + } + Log("[%s] alloc fid: key %s, new_fid: %lu, prev_fid: 
%lu\n", + this->WorkPath().c_str(), + key.c_str(), + new_fid_, + prev_fid_); + } + return fid; +} + +uint64_t BlockCacheImpl::FileId(const std::string& fname) { + mu_.AssertHeld(); + uint64_t fid = 0; + std::string key = "FNAME#" + fname; + mu_.Unlock(); + + ReadOptions r_opts; + std::string val; + Status s = db_->Get(r_opts, key, &val); + if (!s.ok()) { // not exist + MutexLock l(&mu_); + fid = AllocFileId(); + std::string v; + PutFixed64(&val, fid); + + LockContent lc; + lc.type = kDBKey; + lc.db_lock_key = key; + lc.db_lock_val = val; + lc.db_val = &v; + Log("[%s] alloc fid: %lu, key: %s", + this->WorkPath().c_str(), + fid, key.c_str()); + s = LockAndPut(lc); + assert(s.ok()); + fid = DecodeFixed64(v.c_str()); + } else { // fid in cache + fid = DecodeFixed64(val.c_str()); + } + Log("[%s] Fid: %lu, fname: %s\n", + this->WorkPath().c_str(), + fid, fname.c_str()); + + mu_.Lock(); + return fid; +} + +DataSet* BlockCacheImpl::GetDataSet(uint64_t sid) { + mu_.AssertHeld(); + DataSet* set = NULL; + + DataSetMap::iterator it = data_set_map_.find(sid); + if (it == data_set_map_.end()) { + LockContent lc; + lc.type = kDataSetKey; + lc.sid = sid; + lc.data_set = NULL; + Status s = LockAndPut(lc); + set = lc.data_set; + } else { + Log("[%s] get dataset from memcache, sid %lu\n", + this->WorkPath().c_str(), sid); + set = it->second; + } + return set; +} + +CacheBlock* BlockCacheImpl::GetAndAllocBlock(uint64_t fid, uint64_t block_idx) { + mu_.AssertHeld(); + std::string key; + PutFixed64(&key, fid); + PutFixed64(&key, block_idx); + uint32_t hash = Hash(key.c_str(), key.size(), 7); + uint64_t sid = hash % options_.dataset_num; + + Log("[%s] alloc block, try get dataset, fid: %lu, block_idx: %lu, hash: %u, sid %lu, dataset_num: %lu\n", + this->WorkPath().c_str(), fid, block_idx, hash, sid, options_.dataset_num); + CacheBlock* block = NULL; + DataSet* ds = GetDataSet(sid); // get and alloc ds + Cache* cache = ds->cache; + LRUHandle* h = (LRUHandle*)cache->Lookup(key); + if (h == 
NULL) { + block = new CacheBlock(&mu_); + h = (LRUHandle*)cache->Insert(key, block, 1, &BlockCacheImpl::BlockDeleter); + assert(h != NULL); + block->fid = fid; + block->block_idx = block_idx; + block->sid = sid; + block->cache_block_idx = h->cache_id; + Log("[%s] new blockcache: %s\n", this->WorkPath().c_str(), block->ToString().c_str()); + assert(block->state != kCacheBlockValid); + } else { + block = reinterpret_cast(cache->Value((Cache::Handle*)h)); + Log("[%s] get block from memcache, %s\n", + this->WorkPath().c_str(), block->ToString().c_str()); + } + block->handle = h; + return block; +} + +Status BlockCacheImpl::LogRecord(CacheBlock* block) { + std::string key = "DS#"; + PutFixed64(&key, block->sid); + PutFixed64(&key, block->cache_block_idx); + leveldb::WriteBatch batch; + batch.Put(key, block->Encode()); + return db_->Write(leveldb::WriteOptions(), &batch); +} + +Status BlockCacheImpl::ReleaseBlock(CacheBlock* block) { + mu_.AssertHeld(); + Status s; + std::string key = "DS#"; + PutFixed64(&key, block->sid); + PutFixed64(&key, block->cache_block_idx); + leveldb::WriteBatch batch; + batch.Put(key, block->Encode()); + + LRUHandle* h = block->handle; + DataSet* ds = GetDataSet(block->sid); // get and alloc ds + block->ReleaseDataBlock(); + block->handle = NULL; + block->cv.SignalAll(); + ds->cache->Release((Cache::Handle*)h); + mu_.Unlock(); + + // TODO: dump meta into memtable + Log("[%s] release block: %s\n", this->WorkPath().c_str(), block->ToString().c_str()); + s = db_->Write(leveldb::WriteOptions(), &batch); + mu_.Lock(); + return s; +} + +} // namespace leveldb + diff --git a/src/leveldb/util/cache.cc b/src/leveldb/util/cache.cc index 6eab478a1..99c2dfa90 100644 --- a/src/leveldb/util/cache.cc +++ b/src/leveldb/util/cache.cc @@ -25,31 +25,6 @@ namespace { // LRU cache implementation -// An entry is a variable length heap-allocated structure. Entries -// are kept in a circular doubly linked list ordered by access time. 
-struct LRUHandle { - void* value; - void (*deleter)(const Slice&, void* value); - LRUHandle* next_hash; - LRUHandle* next; - LRUHandle* prev; - size_t charge; // TODO(opt): Only allow uint32_t? - size_t key_length; - uint32_t refs; - uint32_t hash; // Hash of key(); used for fast sharding and comparisons - char key_data[1]; // Beginning of key - - Slice key() const { - // For cheaper lookups, we allow a temporary Handle object - // to store a pointer to a key in "value". - if (next == this) { - return *(reinterpret_cast(value)); - } else { - return Slice(key_data, key_length); - } - } -}; - // We provide our own simple hash table since it removes a whole bunch // of porting hacks and is also faster than some of the built-in hash // table implementations in some of the compiler/runtime combinations @@ -286,6 +261,170 @@ size_t LRUCache::TotalCharge() { return usage_; } +class LRU2QCache: public Cache { + public: + explicit LRU2QCache(size_t capacity) + : capacity_(capacity), + usage_(0) { + // Make empty circular linked list + lru_.next = &lru_; + lru_.prev = &lru_; + } + + ~LRU2QCache() {} + + // Like Cache methods, but with an extra "hash" parameter. 
+ Cache::Handle* Insert(const Slice& key, void* value, size_t charge, + void (*deleter)(const Slice& key, void* value)) { + const uint32_t hash = HashSlice(key); + MutexLock l(&mutex_); + LRUHandle* e = NULL; + e = table_.Lookup(key, hash); + if (e != NULL) { + return reinterpret_cast(NULL); + } + + if (usage_ < capacity_) { // cache full + e = reinterpret_cast( + malloc(sizeof(LRUHandle)-1 + key.size())); + e->value = value; + e->deleter = deleter; + e->charge = 1; + e->key_length = key.size(); + e->hash = hash; + e->refs = 2; // One from LRUCache, one for the returned handle + e->cache_id = usage_; + memcpy(e->key_data, key.data(), key.size()); + + assert(table_.Insert(e) == NULL); + LRU_Append(e); + usage_++; + return reinterpret_cast(e); + } + + // cache full, reuse item + LRUHandle* old = lru_.next; + while (old != &lru_) { + if (old->refs > 1) { + old = old->next; + continue; + } + e = reinterpret_cast( + malloc(sizeof(LRUHandle)-1 + key.size())); + e->value = value; + e->deleter = deleter; + e->charge = 1; + e->key_length = key.size(); + e->hash = hash; + e->refs = 2; // One from LRUCache, one for the returned handle + e->cache_id = old->cache_id; + memcpy(e->key_data, key.data(), key.size()); + + LRU_Remove(old); + table_.Remove(old->key(), old->hash); + Unref(old); + + assert(table_.Insert(e) == NULL); + LRU_Append(e); + return reinterpret_cast(e); + } + // TODO: try wait finish + return reinterpret_cast(NULL); + } + + Cache::Handle* Lookup(const Slice& key) { + const uint32_t hash = HashSlice(key); + MutexLock l(&mutex_); + LRUHandle* e = table_.Lookup(key, hash); + if (e != NULL) { + e->refs++; + LRU_Remove(e); + LRU_Append(e); + } + return reinterpret_cast(e); + } + + void Erase(const Slice& key) { + const uint32_t hash = HashSlice(key); + MutexLock l(&mutex_); + LRUHandle* e = table_.Remove(key, hash); + if (e != NULL) { + LRU_Remove(e); + Unref(e); + } + } + + void Release(Cache::Handle* handle) { + MutexLock l(&mutex_); + 
Unref(reinterpret_cast(handle)); + } + + void* Value(Cache::Handle* handle) { + return reinterpret_cast(handle)->value; + } + + uint64_t NewId() { + return 0; + } + + double HitRate(bool force_clear = false) { + return 99.9999; + } + + size_t Entries() { + MutexLock l(&mutex_); + return usage_; + } + + size_t TotalCharge() { + MutexLock l(&mutex_); + return usage_; + } + + private: + void LRU_Remove(LRUHandle* e) { + e->next->prev = e->prev; + e->prev->next = e->next; + } + + void LRU_Append(LRUHandle* e) { + // Make "e" newest entry by inserting just before lru_ + e->next = &lru_; + e->prev = lru_.prev; + e->prev->next = e; + e->next->prev = e; + } + + void Unref(LRUHandle* e) { + assert(e->refs > 0); + e->refs--; + if (e->refs <= 0) { + usage_ -= e->charge; + (*e->deleter)(e->key(), e->value); + free(e); + } + } + + inline uint32_t HashSlice(const Slice& s) { + return Hash(s.data(), s.size(), 0); + } + + // Initialized before use. + size_t capacity_; + + // mutex_ protects the following state. + port::Mutex mutex_; + size_t usage_; + + // Dummy head of LRU list. + // lru.prev is newest entry, lru.next is oldest entry. 
+ //LRUHandle hot_lru_; + //LRUHandle cold_lru_; + LRUHandle lru_; + + HandleTable table_; +}; + static const int kNumShardBits = 4; static const int kNumShards = 1 << kNumShardBits; @@ -382,4 +521,8 @@ Cache* NewLRUCache(size_t capacity) { return new ShardedLRUCache(capacity); } +Cache* New2QCache(size_t capacity) { + return new LRU2QCache(capacity); +} + } // namespace leveldb diff --git a/src/leveldb/util/coding_test.cc b/src/leveldb/util/coding_test.cc index fc8fbf5c9..17848377b 100644 --- a/src/leveldb/util/coding_test.cc +++ b/src/leveldb/util/coding_test.cc @@ -219,6 +219,17 @@ TEST(Coding, PutLG_ugly) { ASSERT_EQ(a_slice.ToString(), b_slice.ToString()); } +TEST(Coding, PutFixed64Cmp) { + std::string sa, sb; + PutFixed64(&sa, 100); + PutFixed64(&sb, 50); + ASSERT_TRUE(sa > sb); + uint64_t a = DecodeFixed64(sa.c_str()); + uint64_t b = DecodeFixed64(sb.c_str()); + ASSERT_TRUE(a == 100); + ASSERT_TRUE(b == 50); +} + } // namespace leveldb int main(int argc, char** argv) { diff --git a/src/sdk/sdk_zk.cc b/src/sdk/sdk_zk.cc index e08bb6c9b..5f7b8c8f6 100644 --- a/src/sdk/sdk_zk.cc +++ b/src/sdk/sdk_zk.cc @@ -60,9 +60,6 @@ std::string ClusterFinder::ClusterId() { std::string name = Name(); std::string authority = Authority(); std::string path = Path(); - if (name.empty() || authority.empty() || path.empty()) { - LOG(FATAL) << "cluster name/authority/path must be non-empty"; - } std::string cluster_id = name + "://" + authority; if (path[0] != '/') { cluster_id += "/"; diff --git a/src/tabletnode/tabletnode_impl.cc b/src/tabletnode/tabletnode_impl.cc index c472b9732..4d7919fd6 100644 --- a/src/tabletnode/tabletnode_impl.cc +++ b/src/tabletnode/tabletnode_impl.cc @@ -14,9 +14,11 @@ #include "db/filename.h" #include "db/table_cache.h" +#include "common/base/string_ext.h" #include "common/thread.h" #include "io/io_utils.h" #include "io/utils_leveldb.h" +#include "leveldb/block_cache.h" #include "leveldb/cache.h" #include "leveldb/env_cache.h" #include 
"leveldb/env_dfs.h" @@ -68,7 +70,7 @@ DECLARE_string(tera_tabletnode_path_prefix); // cache-related DECLARE_int32(tera_memenv_block_cache_size); -DECLARE_bool(tera_tabletnode_cache_enabled); +DECLARE_bool(tera_tabletnode_block_cache_enabled); DECLARE_string(tera_tabletnode_cache_paths); DECLARE_int32(tera_tabletnode_cache_block_size); DECLARE_string(tera_tabletnode_cache_name); @@ -150,11 +152,7 @@ TabletNodeImpl::TabletNodeImpl() sysinfo_.SetProcessStartTime(get_micros()); } -TabletNodeImpl::~TabletNodeImpl() { - if (FLAGS_tera_tabletnode_cache_enabled) { - leveldb::ThreeLevelCacheEnv::RemoveCachePaths(); - } -} +TabletNodeImpl::~TabletNodeImpl() {} bool TabletNodeImpl::Init() { if (FLAGS_tera_zk_enabled) { @@ -179,32 +177,32 @@ bool TabletNodeImpl::Init() { } void TabletNodeImpl::InitCacheSystem() { - if (!FLAGS_tera_tabletnode_cache_enabled) { - // compitable with legacy FlashEnv - leveldb::FlashEnv* flash_env = (leveldb::FlashEnv*)io::LeveldbFlashEnv(); - flash_env->SetFlashPath(FLAGS_tera_tabletnode_cache_paths, - FLAGS_tera_io_cache_path_vanish_allowed); - flash_env->SetUpdateFlashThreadNumber(FLAGS_tera_tabletnode_cache_update_thread_num); - flash_env->SetIfForceReadFromCache(FLAGS_tera_tabletnode_cache_force_read_from_cache); - return; - } + if (FLAGS_tera_tabletnode_block_cache_enabled) { + LOG(INFO) << "Tcache: set flash path: " << FLAGS_tera_tabletnode_cache_paths; + std::vector path_list; + SplitString(FLAGS_tera_tabletnode_cache_paths, ";", &path_list); + + leveldb::Env* posix_env = leveldb::Env::Default(); + for (uint32_t i = 0; i < path_list.size(); ++i) { + posix_env->CreateDir(path_list[i]); + } - LOG(INFO) << "activate new cache system"; - // new cache mechanism - leveldb::ThreeLevelCacheEnv::SetCachePaths(FLAGS_tera_tabletnode_cache_paths); - leveldb::ThreeLevelCacheEnv::s_mem_cache_size_in_KB_ = FLAGS_tera_tabletnode_cache_mem_size; - leveldb::ThreeLevelCacheEnv::s_disk_cache_size_in_MB_ = FLAGS_tera_tabletnode_cache_disk_size; - 
leveldb::ThreeLevelCacheEnv::s_block_size_ = FLAGS_tera_tabletnode_cache_block_size; - leveldb::ThreeLevelCacheEnv::s_disk_cache_file_num_ = FLAGS_tera_tabletnode_cache_disk_filenum; - leveldb::ThreeLevelCacheEnv::s_disk_cache_file_name_ = FLAGS_tera_tabletnode_cache_name; - - if (FLAGS_tera_tabletnode_cache_log_level < 3) { - LEVELDB_SET_LOG_LEVEL(WARNING); - } else if (FLAGS_tera_tabletnode_cache_log_level < 4) { - LEVELDB_SET_LOG_LEVEL(INFO); - } else { - LEVELDB_SET_LOG_LEVEL(DEBUG); + LOG(INFO) << "activate Tcache system"; + leveldb::Env* block_cache_env = io::DefaultBlockCacheEnv(); + for (uint32_t i = 0; i < path_list.size(); ++i) { + leveldb::BlockCacheOptions opts; + LOG(INFO) << "load cache: " << path_list[i]; + reinterpret_cast(block_cache_env)->LoadCache(opts, path_list[i] + "/block_cache/"); + } + return; } + // compitable with legacy FlashEnv + leveldb::FlashEnv* flash_env = (leveldb::FlashEnv*)io::LeveldbFlashEnv(); + flash_env->SetFlashPath(FLAGS_tera_tabletnode_cache_paths, + FLAGS_tera_io_cache_path_vanish_allowed); + flash_env->SetUpdateFlashThreadNumber(FLAGS_tera_tabletnode_cache_update_thread_num); + flash_env->SetIfForceReadFromCache(FLAGS_tera_tabletnode_cache_force_read_from_cache); + return; } bool TabletNodeImpl::Exit() { @@ -1070,7 +1068,7 @@ void TabletNodeImpl::UpdateMetaTableCallback(const SplitTabletRequest* rpc_reque * ------------------------------------------ */ void TabletNodeImpl::GarbageCollect() { - if (FLAGS_tera_tabletnode_cache_enabled) { + if (FLAGS_tera_tabletnode_block_cache_enabled) { return; } int64_t start_ms = get_micros(); diff --git a/src/tera_flags.cc b/src/tera_flags.cc index 59d7b7a9e..6a0a14ce5 100644 --- a/src/tera_flags.cc +++ b/src/tera_flags.cc @@ -64,6 +64,7 @@ DEFINE_int32(tera_leveldb_env_dfs_seek_latency, 10000000, "the random access lat DEFINE_int32(tera_memenv_table_cache_size, 100, "the max open file number in leveldb table_cache"); DEFINE_int32(tera_memenv_block_cache_size, 10000, "block cache size 
for leveldb which do not use share block cache"); DEFINE_bool(tera_use_flash_for_memenv, true, "Use flashenv for memery lg"); +DEFINE_int32(tera_leveldb_block_cache_env_num_thread, 30, "thread num of Tcache"); DEFINE_string(tera_leveldb_compact_strategy, "default", "the default strategy to drive consum compaction, should be [default|LG|dummy]"); DEFINE_bool(tera_leveldb_verify_checksums, true, "enable verify data read from storage against checksums"); @@ -201,7 +202,7 @@ DEFINE_string(tera_tabletnode_cpu_affinity_set, "1,2", "the cpu set of cpu affin DEFINE_bool(tera_tabletnode_hang_detect_enabled, false, "enable detect read/write hang"); DEFINE_int32(tera_tabletnode_hang_detect_threshold, 60000, "read/write hang detect threshold (in ms)"); -DEFINE_bool(tera_tabletnode_cache_enabled, false, "enable three-level cache mechasism"); +DEFINE_bool(tera_tabletnode_block_cache_enabled, true, "enable Tcache mechasism"); DEFINE_string(tera_tabletnode_cache_paths, "../data/cache/", "paths for cached data storage. 
Mutiple definition like: \"./path1/;./path2/\""); DEFINE_int32(tera_tabletnode_cache_block_size, 8192, "the block size of cache system"); DEFINE_string(tera_tabletnode_cache_name, "tera.cache", "prefix name for cache name"); From 3256bd290b4e8c6cf8806b3006ecdc7a4a591d35 Mon Sep 17 00:00:00 2001 From: caijieming Date: Mon, 7 Aug 2017 20:35:04 +0800 Subject: [PATCH 02/19] issue=1258, Tcache support block-level cache evict --- src/leveldb/util/block_cache.cc | 131 ++++++++++++++++++++------------ 1 file changed, 84 insertions(+), 47 deletions(-) diff --git a/src/leveldb/util/block_cache.cc b/src/leveldb/util/block_cache.cc index 8401cb1c0..c9ae949be 100644 --- a/src/leveldb/util/block_cache.cc +++ b/src/leveldb/util/block_cache.cc @@ -49,13 +49,17 @@ class BlockCacheImpl; // Each SSD will New a BlockCache // block state -uint64_t kCacheBlockValid = 1; +uint64_t kCacheBlockValid = 0x1; +uint64_t kCacheBlockLocked = 0x2; +uint64_t kCacheBlockDfsRead = 0x4; +uint64_t kCacheBlockCacheRead = 0x8; +uint64_t kCacheBlockCacheFill = 0x10; struct CacheBlock { uint64_t fid; uint64_t block_idx; uint64_t sid; uint64_t cache_block_idx; - uint64_t state; + volatile uint64_t state; port::CondVar cv; Slice data_block; bool data_block_alloc; @@ -68,13 +72,31 @@ struct CacheBlock { block_idx(0), sid(0xffffffffffffffff), cache_block_idx(0xffffffffffffffff), - state(!kCacheBlockValid), + state(0), cv(mu), data_block_alloc(false), data_block_refs(0), handle(NULL) { } + bool Test(uint64_t c_state) { + return (state & c_state) == c_state; + } + + void Clear(uint64_t c_state) { + state &= ~c_state; + } + + void Set(uint64_t c_state) { + state |= c_state; + } + + void WaitOnClear(uint64_t c_state) { // access in lock + while (Test(c_state)) { + cv.Wait(); + } + } + // access in cache lock void GetDataBlock(uint64_t block_size, Slice data) { if (data_block_refs == 0) { // first one alloc mem @@ -122,7 +144,7 @@ struct CacheBlock { const std::string ToString() { std::stringstream ss; - ss << 
"CacheBlock: fid: " << fid << ", block_idx: " << block_idx + ss << "CacheBlock(" << (uint64_t)this << "): fid: " << fid << ", block_idx: " << block_idx << ", sid: " << sid << ", cache_block_idx: " << cache_block_idx << ", state " << state; return ss.str(); @@ -339,10 +361,13 @@ class BlockCacheWriteBuffer { uint32_t NumFullBlock() { // use for BGFlush MutexLock l(&mu_); - if (block_list_.size() > 1) { + if (block_list_.size() == 0) { + return 0; + } else if ((block_list_.back())->size() < block_size_) { return block_list_.size() - 1; + } else { + return block_list_.size(); } - return 0; } Status Append(const Slice& data) { @@ -389,10 +414,14 @@ class BlockCacheWriteBuffer { std::string* PopFrontBlock(uint64_t* block_idx) { MutexLock l(&mu_); - std::string* block = block_list_.front(); if (block_list_.size() == 0) { return NULL; } + std::string* block = block_list_.front(); + assert(block->size() <= block_size_); + if (block->size() != block_size_) { + return NULL; + } block_list_.pop_front(); *block_idx = block_idx_; block_idx_++; @@ -431,6 +460,7 @@ class BlockCacheWritableFile : public WritableFile { : cache_(c), bg_cv_(&c->mu_), bg_block_flush_(0), + pending_block_num_(0), write_buffer_(cache_->WorkPath(), fname, cache_->options_.block_size), fname_(fname) { // file open *s = cache_->dfs_env_->NewWritableFile(fname_, &dfs_file_); @@ -454,11 +484,9 @@ class BlockCacheWritableFile : public WritableFile { MutexLock lockgard(&cache_->mu_); uint64_t block_idx; std::string* block_data = write_buffer_.PopBackBlock(&block_idx); - if (block_data == NULL) { - Log("[%s] end release(nothing) %s\n", cache_->WorkPath().c_str(), fname_.c_str()); - return; + if (block_data != NULL) { + FillCache(block_data, block_idx); } - FillCache(block_data, block_idx); while (bg_block_flush_ > 0) { bg_cv_.Wait(); @@ -492,11 +520,9 @@ class BlockCacheWritableFile : public WritableFile { MutexLock lockgard(&cache_->mu_); uint64_t block_idx; std::string* block_data = 
write_buffer_.PopBackBlock(&block_idx); - if (block_data == NULL) { - Log("[%s] end close state error: %s\n", cache_->WorkPath().c_str(), fname_.c_str()); - return s; + if (block_data != NULL) { + FillCache(block_data, block_idx); } - FillCache(block_data, block_idx); while (bg_block_flush_ > 0) { bg_cv_.Wait(); @@ -524,7 +550,7 @@ class BlockCacheWritableFile : public WritableFile { fname_.c_str(), bg_block_flush_, write_buffer_.NumFullBlock()); - while (bg_block_flush_ < write_buffer_.NumFullBlock()) { + while (bg_block_flush_ < (write_buffer_.NumFullBlock() + pending_block_num_)) { bg_block_flush_++; cache_->bg_flush_.Schedule(&BlockCacheWritableFile::BGFlushFunc, this, 10); } @@ -536,14 +562,13 @@ class BlockCacheWritableFile : public WritableFile { void BGFlush() { Log("[%s] begin BGFlush: %s\n", cache_->WorkPath().c_str(), fname_.c_str()); MutexLock lockgard(&cache_->mu_); - if (write_buffer_.NumFullBlock() == 0) { - return; - } - uint64_t block_idx; std::string* block_data = write_buffer_.PopFrontBlock(&block_idx); - assert(block_data != NULL); - FillCache(block_data, block_idx); + if (block_data != NULL) { + pending_block_num_++; + FillCache(block_data, block_idx); + pending_block_num_--; + } bg_block_flush_--; MaybeScheduleBGFlush(); @@ -555,7 +580,7 @@ class BlockCacheWritableFile : public WritableFile { cache_->mu_.AssertHeld(); uint64_t fid = cache_->FileId(fname_); CacheBlock* block = cache_->GetAndAllocBlock(fid, block_idx); - assert(block->state != kCacheBlockValid); + block->state = 0; block->GetDataBlock(cache_->options_.block_size, Slice(*block_data)); cache_->mu_.Unlock(); @@ -577,6 +602,7 @@ class BlockCacheWritableFile : public WritableFile { WritableFile* dfs_file_; // protected by cache_.mu_ uint32_t bg_block_flush_; + uint32_t pending_block_num_; BlockCacheWriteBuffer write_buffer_; std::string fname_; }; @@ -622,13 +648,17 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { block->GetDataBlock(cache_->options_.block_size, 
Slice()); block_queue.push_back(block); // sort by block_idx - if (block->state != kCacheBlockValid && block->handle->refs == 2) { // first one access this block - c_miss.push_back(block); - } else if (block->state == kCacheBlockValid && block->handle->refs == 2) { // frist one access this block + if (!block->Test(kCacheBlockLocked) && + block->Test(kCacheBlockValid)) { + block->Set(kCacheBlockLocked | kCacheBlockCacheRead); c_valid.push_back(block); + } else if (!block->Test(kCacheBlockLocked)) { + block->Set(kCacheBlockLocked | kCacheBlockDfsRead); + c_miss.push_back(block); } else { c_locked.push_back(block); } + Log("[%s] queue block: %s, refs %u, data_block_refs %lu, alloc %u\n", cache_->WorkPath().c_str(), block->ToString().c_str(), block->handle->refs, block->data_block_refs, @@ -660,11 +690,25 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { cache_->bg_read_.Schedule(&BlockCacheRandomAccessFile::AsyncCacheRead, reader, 10); } + // wait async cache read done + for (uint32_t i = 0; i < c_valid.size(); ++i) { + MutexLock lockgard(&cache_->mu_); + CacheBlock* block = c_valid[i]; + block->WaitOnClear(kCacheBlockCacheRead); + block->Set(kCacheBlockValid); + block->Clear(kCacheBlockLocked); + block->cv.SignalAll(); + Log("[%s] pread in valid list(done), %s\n", + cache_->WorkPath().c_str(), + block->ToString().c_str()); + } + // wait dfs read done and async cache file for (uint32_t i = 0; i < c_miss.size(); ++i) { MutexLock lockgard(&cache_->mu_); CacheBlock* block = c_miss[i]; - block->cv.Wait(); + block->WaitOnClear(kCacheBlockDfsRead); + block->Set(kCacheBlockCacheFill); Log("[%s] pread in miss list(dfs done), %s\n", cache_->WorkPath().c_str(), block->ToString().c_str()); @@ -684,29 +728,21 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { for (uint32_t i = 0; i < c_miss.size(); ++i) { // wait cache fill finish MutexLock lockgard(&cache_->mu_); CacheBlock* block = c_miss[i]; - block->cv.Wait(); + 
block->WaitOnClear(kCacheBlockCacheFill); + block->Set(kCacheBlockValid); + block->Clear(kCacheBlockLocked); + block->cv.SignalAll(); Log("[%s] pread in miss list(fill cache done), %s\n", cache_->WorkPath().c_str(), block->ToString().c_str()); } - // wait cache read done - for (uint32_t i = 0; i < c_valid.size(); ++i) { - MutexLock lockgard(&cache_->mu_); - CacheBlock* block = c_valid[i]; - block->cv.Wait(); - Log("[%s] pread in valid list(done), %s\n", - cache_->WorkPath().c_str(), - block->ToString().c_str()); - } - // wait other async read finish for (uint32_t i = 0; i < c_locked.size(); ++i) { MutexLock lockgard(&cache_->mu_); CacheBlock* block = c_locked[i]; - while (block->state != kCacheBlockValid) { - block->cv.Wait(); - } + block->WaitOnClear(kCacheBlockLocked); + assert((block->state & kCacheBlockValid) == kCacheBlockValid); } // fill user mem @@ -735,19 +771,16 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { cache_->mu_.Lock(); for (uint32_t i = 0; i < c_miss.size(); ++i) { CacheBlock* block = c_miss[i]; - block->state = kCacheBlockValid; Log("[%s] wakeup for miss, %s\n", cache_->WorkPath().c_str(), block->ToString().c_str()); cache_->ReleaseBlock(block); } for (uint32_t i = 0; i < c_valid.size(); ++i) { CacheBlock* block = c_valid[i]; - block->state = kCacheBlockValid; Log("[%s] wakeup for valid, %s\n", cache_->WorkPath().c_str(), block->ToString().c_str()); cache_->ReleaseBlock(block); } for (uint32_t i = 0; i < c_locked.size(); ++i) { CacheBlock* block = c_locked[i]; - block->state = kCacheBlockValid; Log("[%s] wakeup for lock, %s\n", cache_->WorkPath().c_str(), block->ToString().c_str()); cache_->ReleaseBlock(block); } @@ -785,6 +818,7 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { s.ToString().c_str(), result.size()); MutexLock lockgard(&cache_->mu_); + block->Clear(kCacheBlockDfsRead); block->cv.SignalAll(); return; } @@ -804,7 +838,10 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { 
cache_->ReadCache(block); MutexLock lockgard(&cache_->mu_); + block->Clear(kCacheBlockCacheRead); block->cv.SignalAll(); + //Log("[%s] async.cacheread signal, %s\n", cache_->WorkPath().c_str(), + // block->ToString().c_str()); return; } @@ -827,6 +864,7 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { cache_->FillCache(block); MutexLock lockgard(&cache_->mu_); + block->Clear(kCacheBlockCacheFill); block->cv.SignalAll(); return; } @@ -987,6 +1025,7 @@ Status BlockCacheImpl::LockAndPut(LockContent& lc) { LRUHandle* handle = (LRUHandle*)(lc.data_set->cache->Insert(hkey, block, 1, &BlockCacheImpl::BlockDeleter)); assert(handle != NULL); handle->cache_id = block->cache_block_idx; + block->handle = handle; lc.data_set->cache->Release((Cache::Handle*)handle); } delete db_it; @@ -1204,14 +1243,13 @@ CacheBlock* BlockCacheImpl::GetAndAllocBlock(uint64_t fid, uint64_t block_idx) { block->block_idx = block_idx; block->sid = sid; block->cache_block_idx = h->cache_id; + block->handle = h; Log("[%s] new blockcache: %s\n", this->WorkPath().c_str(), block->ToString().c_str()); - assert(block->state != kCacheBlockValid); } else { block = reinterpret_cast(cache->Value((Cache::Handle*)h)); Log("[%s] get block from memcache, %s\n", this->WorkPath().c_str(), block->ToString().c_str()); } - block->handle = h; return block; } @@ -1236,7 +1274,6 @@ Status BlockCacheImpl::ReleaseBlock(CacheBlock* block) { LRUHandle* h = block->handle; DataSet* ds = GetDataSet(block->sid); // get and alloc ds block->ReleaseDataBlock(); - block->handle = NULL; block->cv.SignalAll(); ds->cache->Release((Cache::Handle*)h); mu_.Unlock(); From c5e3f83587ee66c15ccf5688cb81de305fb56e48 Mon Sep 17 00:00:00 2001 From: caijieming Date: Mon, 7 Aug 2017 23:22:20 +0800 Subject: [PATCH 03/19] issue=1258, Tcache support block-level cache evict --- src/leveldb/util/block_cache.cc | 132 ++++++++++++++++++++------------ 1 file changed, 82 insertions(+), 50 deletions(-) diff --git 
a/src/leveldb/util/block_cache.cc b/src/leveldb/util/block_cache.cc index c9ae949be..ee99c8cb5 100644 --- a/src/leveldb/util/block_cache.cc +++ b/src/leveldb/util/block_cache.cc @@ -37,7 +37,7 @@ namespace leveldb { ///////////////////////////////////////////// uint64_t kBlockSize = 4096UL; uint64_t kDataSetSize = 134217728UL; -uint64_t kFidBatchNum = 200000UL; +uint64_t kFidBatchNum = 100000UL; uint64_t kCacheSize = 350000000000UL; uint64_t kMetaBlockSize = 2000UL; uint64_t kMetaTableSize = 500UL; @@ -234,6 +234,39 @@ class BlockCacheImpl { // data set id uint64_t sid; DataSet* data_set; + + const std::string Encode() { + if (type == kDBKey) { + return db_lock_key.ToString(); + } else if (type == kDataSetKey) { + std::string key = "DS#"; + PutFixed64(&key, sid); + return key; + } + return ""; + } + + const std::string KeyToString() { + if (type == kDBKey) { + return db_lock_key.ToString(); + } else if (type == kDataSetKey) { + std::stringstream ss; + ss << "DS#" << sid; + return ss.str(); + } else { + return ""; + } + } + + const std::string ValToString() { + if (type == kDBKey) { + uint64_t val = DecodeFixed64(db_lock_val.data()); + std::stringstream ss; + ss << val; + return ss.str(); + } + return ""; + } }; typedef std::map DataSetMap; DataSetMap data_set_map_; @@ -923,17 +956,12 @@ Status BlockCacheImpl::LockAndPut(LockContent& lc) { mu_.AssertHeld(); Status s; std::string key; - if (lc.type == kDBKey) { - key = lc.db_lock_key.ToString(); - } else if (lc.type == kDataSetKey) { - key = "DS#"; - PutFixed64(&key, lc.sid); - } else { + if ((key = lc.Encode()) == "") { return Status::NotSupported("key type error"); } - Log("[%s] trylock key: %s\n", - this->WorkPath().c_str(), - key.c_str()); + //Log("[%s] trylock key: %s\n", + // this->WorkPath().c_str(), + // key.c_str()); Waiter* w = NULL; LockKeyMap::iterator it = lock_key_.find(key); @@ -948,30 +976,30 @@ Status BlockCacheImpl::LockAndPut(LockContent& lc) { if (lc.type == kDBKey) { ReadOptions r_opts; s = 
db_->Get(r_opts, key, lc.db_val); - Log("[%s] get lock key: %s, val: %s, status: %s\n", - this->WorkPath().c_str(), - key.c_str(), - lc.db_val->c_str(), - s.ToString().c_str()); + //Log("[%s] get lock key: %s, val: %s, status: %s\n", + // this->WorkPath().c_str(), + // key.c_str(), + // lc.db_val->c_str(), + // s.ToString().c_str()); } else if (lc.type == kDataSetKey) { lc.data_set = data_set_map_[lc.sid]; - Log("[%s] get dataset sid: %lu\n", - this->WorkPath().c_str(), - lc.sid); + //Log("[%s] get dataset sid: %lu\n", + // this->WorkPath().c_str(), + // lc.sid); } mu_.Lock(); if (--w->wait_num == 0) { // last thread wait for open lock_key_.erase(key); - Log("[%s] wait done %s, delete cv\n", - this->WorkPath().c_str(), - key.c_str()); + //Log("[%s] wait done %s, delete cv\n", + // this->WorkPath().c_str(), + // key.c_str()); delete w; } else { - Log("[%s] wait done %s, not last\n", - this->WorkPath().c_str(), - key.c_str()); + //Log("[%s] wait done %s, not last\n", + // this->WorkPath().c_str(), + // key.c_str()); } } else { w = new Waiter(&mu_); @@ -985,9 +1013,10 @@ Status BlockCacheImpl::LockAndPut(LockContent& lc) { if (s.ok()) { lc.db_val->append(lc.db_lock_val.data(), lc.db_lock_val.size()); } - Log("[%s] put kDBKey: %s, status %s\n", + Log("[%s] Insert db key : %s, val %s, status %s\n", this->WorkPath().c_str(), - key.c_str(), + lc.KeyToString().c_str(), + lc.ValToString().c_str(), s.ToString().c_str()); } else if (lc.type == kDataSetKey) { std::string end_ds = "DS#"; @@ -997,9 +1026,10 @@ Status BlockCacheImpl::LockAndPut(LockContent& lc) { std::string file = options_.cache_dir + "/" + Uint64ToString(lc.sid); lc.data_set->fd = open(file.c_str(), O_RDWR | O_CREAT, 0644); assert(lc.data_set->fd > 0); - Log("[%s] begin new dataset, sid: %lu, file: %s, cs: %lu, fd: %d\n", + Log("[%s] New DataSet %s, file: %s, nr_block: %lu, fd: %d\n", this->WorkPath().c_str(), - lc.sid, file.c_str(), (options_.dataset_size / options_.block_size) + 1, + lc.KeyToString().c_str(), 
+ file.c_str(), (options_.dataset_size / options_.block_size) + 1, lc.data_set->fd); // reload hash lru @@ -1019,8 +1049,10 @@ Status BlockCacheImpl::LockAndPut(LockContent& lc) { PutFixed64(&hkey, block->block_idx); block->sid = lc.sid; block->cache_block_idx = DecodeFixed64(lkey.data()); - Log("[%s] insert cacheblock into 2QLru, %s\n", + block->state = (block->Test(kCacheBlockValid)) ? kCacheBlockValid : 0; + Log("[%s] Recovery %s, insert cacheblock into 2QLru, %s\n", this->WorkPath().c_str(), + lc.KeyToString().c_str(), block->ToString().c_str()); LRUHandle* handle = (LRUHandle*)(lc.data_set->cache->Insert(hkey, block, 1, &BlockCacheImpl::BlockDeleter)); assert(handle != NULL); @@ -1038,14 +1070,14 @@ Status BlockCacheImpl::LockAndPut(LockContent& lc) { mu_.Lock(); if (--w->wait_num == 0) { lock_key_.erase(key); - Log("[%s] put done %s, no wait thread\n", - this->WorkPath().c_str(), - key.c_str()); + //Log("[%s] put done %s, no wait thread\n", + // this->WorkPath().c_str(), + // key.c_str()); delete w; } else { - Log("[%s] put done %s, signal all wait thread\n", - this->WorkPath().c_str(), - key.c_str()); + //Log("[%s] put done %s, signal all wait thread\n", + // this->WorkPath().c_str(), + // key.c_str()); w->done = true; w->cv.SignalAll(); } @@ -1184,9 +1216,9 @@ uint64_t BlockCacheImpl::FileId(const std::string& fname) { lc.db_lock_key = key; lc.db_lock_val = val; lc.db_val = &v; - Log("[%s] alloc fid: %lu, key: %s", - this->WorkPath().c_str(), - fid, key.c_str()); + //Log("[%s] alloc fid: %lu, key: %s", + // this->WorkPath().c_str(), + // fid, key.c_str()); s = LockAndPut(lc); assert(s.ok()); fid = DecodeFixed64(v.c_str()); @@ -1214,8 +1246,8 @@ DataSet* BlockCacheImpl::GetDataSet(uint64_t sid) { Status s = LockAndPut(lc); set = lc.data_set; } else { - Log("[%s] get dataset from memcache, sid %lu\n", - this->WorkPath().c_str(), sid); + //Log("[%s] get dataset from memcache, sid %lu\n", + // this->WorkPath().c_str(), sid); set = it->second; } return set; @@ 
-1236,19 +1268,19 @@ CacheBlock* BlockCacheImpl::GetAndAllocBlock(uint64_t fid, uint64_t block_idx) { Cache* cache = ds->cache; LRUHandle* h = (LRUHandle*)cache->Lookup(key); if (h == NULL) { - block = new CacheBlock(&mu_); - h = (LRUHandle*)cache->Insert(key, block, 1, &BlockCacheImpl::BlockDeleter); - assert(h != NULL); - block->fid = fid; - block->block_idx = block_idx; - block->sid = sid; - block->cache_block_idx = h->cache_id; - block->handle = h; - Log("[%s] new blockcache: %s\n", this->WorkPath().c_str(), block->ToString().c_str()); + block = new CacheBlock(&mu_); + h = (LRUHandle*)cache->Insert(key, block, 1, &BlockCacheImpl::BlockDeleter); + assert(h != NULL); + block->fid = fid; + block->block_idx = block_idx; + block->sid = sid; + block->cache_block_idx = h->cache_id; + block->handle = h; + Log("[%s] new blockcache: %s\n", this->WorkPath().c_str(), block->ToString().c_str()); } else { block = reinterpret_cast(cache->Value((Cache::Handle*)h)); Log("[%s] get block from memcache, %s\n", - this->WorkPath().c_str(), block->ToString().c_str()); + this->WorkPath().c_str(), block->ToString().c_str()); } return block; } From 9b00baa49086f165c8d2f9ae88fff5b14ed23f70 Mon Sep 17 00:00:00 2001 From: caijieming Date: Tue, 8 Aug 2017 01:19:42 +0800 Subject: [PATCH 04/19] issue=1258, Tcache support block-level cache evict --- src/leveldb/util/block_cache.cc | 126 ++++++++++++++++++-------------- 1 file changed, 71 insertions(+), 55 deletions(-) diff --git a/src/leveldb/util/block_cache.cc b/src/leveldb/util/block_cache.cc index ee99c8cb5..10a1e61b7 100644 --- a/src/leveldb/util/block_cache.cc +++ b/src/leveldb/util/block_cache.cc @@ -54,6 +54,7 @@ uint64_t kCacheBlockLocked = 0x2; uint64_t kCacheBlockDfsRead = 0x4; uint64_t kCacheBlockCacheRead = 0x8; uint64_t kCacheBlockCacheFill = 0x10; + struct CacheBlock { uint64_t fid; uint64_t block_idx; @@ -193,7 +194,7 @@ class BlockCacheImpl { Status LogRecord(CacheBlock* block); - Status ReleaseBlock(CacheBlock* block); + 
Status ReleaseBlock(CacheBlock* block, bool need_sync); private: friend class BlockCacheWritableFile; @@ -441,7 +442,7 @@ class BlockCacheWriteBuffer { path_.c_str(), file_.c_str(), begin, end, - offset_, data.size(), block_size_); + offset_ - data.size() , data.size(), block_size_); return Status::OK(); } @@ -502,6 +503,9 @@ class BlockCacheWritableFile : public WritableFile { fname.c_str(), cache_->options_.block_size, s->ToString().c_str()); + + MutexLock lockgard(&cache_->mu_); + fid_ = cache_->FileId(fname_); return; } @@ -578,11 +582,11 @@ class BlockCacheWritableFile : public WritableFile { private: void MaybeScheduleBGFlush() { cache_->mu_.AssertHeld(); - Log("[%s] Maybe schedule BGFlush: %s, bg_block_flush: %u, block_nr: %u\n", - cache_->WorkPath().c_str(), - fname_.c_str(), - bg_block_flush_, - write_buffer_.NumFullBlock()); + //Log("[%s] Maybe schedule BGFlush: %s, bg_block_flush: %u, block_nr: %u\n", + // cache_->WorkPath().c_str(), + // fname_.c_str(), + // bg_block_flush_, + // write_buffer_.NumFullBlock()); while (bg_block_flush_ < (write_buffer_.NumFullBlock() + pending_block_num_)) { bg_block_flush_++; cache_->bg_flush_.Schedule(&BlockCacheWritableFile::BGFlushFunc, this, 10); @@ -593,7 +597,7 @@ class BlockCacheWritableFile : public WritableFile { reinterpret_cast(arg)->BGFlush(); } void BGFlush() { - Log("[%s] begin BGFlush: %s\n", cache_->WorkPath().c_str(), fname_.c_str()); + Log("[%s] Begin BGFlush: %s\n", cache_->WorkPath().c_str(), fname_.c_str()); MutexLock lockgard(&cache_->mu_); uint64_t block_idx; std::string* block_data = write_buffer_.PopFrontBlock(&block_idx); @@ -611,7 +615,7 @@ class BlockCacheWritableFile : public WritableFile { Status FillCache(std::string* block_data, uint64_t block_idx) { cache_->mu_.AssertHeld(); - uint64_t fid = cache_->FileId(fname_); + uint64_t fid = fid_; CacheBlock* block = cache_->GetAndAllocBlock(fid, block_idx); block->state = 0; block->GetDataBlock(cache_->options_.block_size, Slice(*block_data)); @@ 
-623,7 +627,7 @@ class BlockCacheWritableFile : public WritableFile { cache_->mu_.Lock(); block->state = kCacheBlockValid; - cache_->ReleaseBlock(block); + cache_->ReleaseBlock(block, true); write_buffer_.ReleaseBlock(block_data); return Status::OK(); } @@ -638,6 +642,7 @@ class BlockCacheWritableFile : public WritableFile { uint32_t pending_block_num_; BlockCacheWriteBuffer write_buffer_; std::string fname_; + uint64_t fid_; }; class BlockCacheRandomAccessFile : public RandomAccessFile { @@ -651,6 +656,9 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { fname.c_str(), cache_->options_.block_size, s->ToString().c_str()); + + MutexLock lockgard(&cache_->mu_); + fid_ = cache_->FileId(fname_); return; } @@ -661,20 +669,22 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const { - MutexLock lockgard(&cache_->mu_); - uint64_t fid = cache_->FileId(fname_); + Status s; uint64_t begin = offset / cache_->options_.block_size; uint64_t end = (offset + n) / cache_->options_.block_size; assert(begin <= end); + uint64_t fid = fid_; std::vector c_miss; std::vector c_locked; std::vector c_valid; std::vector block_queue; - Log("[%s] begin pread %s, size %lu, offset %lu, fid %lu, start_block %lu, end_block %lu" + Log("[%s] Begin Pread %s, size %lu, offset %lu, fid %lu, start_block %lu, end_block %lu" ", block_size %lu\n", cache_->WorkPath().c_str(), fname_.c_str(), n, offset, fid, begin, end, cache_->options_.block_size); + + MutexLock lockgard(&cache_->mu_); for (uint64_t block_idx = begin; block_idx <= end; ++block_idx) { CacheBlock* block = cache_->GetAndAllocBlock(fid, block_idx); assert(block->fid == fid && block->block_idx == block_idx); @@ -692,7 +702,7 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { c_locked.push_back(block); } - Log("[%s] queue block: %s, refs %u, data_block_refs %lu, alloc %u\n", + Log("[%s] Queue block: %s, refs %u, data_block_refs %lu, alloc 
%u\n", cache_->WorkPath().c_str(), block->ToString().c_str(), block->handle->refs, block->data_block_refs, block->data_block_alloc); @@ -705,9 +715,9 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { AsyncDfsReader* reader = new AsyncDfsReader; reader->file = const_cast(this); reader->block = block; - Log("[%s] pread in miss list, %s\n", - cache_->WorkPath().c_str(), - block->ToString().c_str()); + //Log("[%s] pread in miss list, %s\n", + // cache_->WorkPath().c_str(), + // block->ToString().c_str()); cache_->bg_read_.Schedule(&BlockCacheRandomAccessFile::AsyncDfsRead, reader, 10); } @@ -717,9 +727,9 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { AsyncCacheReader* reader = new AsyncCacheReader; reader->file = const_cast(this); reader->block = block; - Log("[%s] pread in valid list, %s\n", - cache_->WorkPath().c_str(), - block->ToString().c_str()); + //Log("[%s] pread in valid list, %s\n", + // cache_->WorkPath().c_str(), + // block->ToString().c_str()); cache_->bg_read_.Schedule(&BlockCacheRandomAccessFile::AsyncCacheRead, reader, 10); } @@ -731,7 +741,7 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { block->Set(kCacheBlockValid); block->Clear(kCacheBlockLocked); block->cv.SignalAll(); - Log("[%s] pread in valid list(done), %s\n", + Log("[%s] cache read done, %s\n", cache_->WorkPath().c_str(), block->ToString().c_str()); } @@ -742,7 +752,7 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { CacheBlock* block = c_miss[i]; block->WaitOnClear(kCacheBlockDfsRead); block->Set(kCacheBlockCacheFill); - Log("[%s] pread in miss list(dfs done), %s\n", + Log("[%s] dfs read done, %s\n", cache_->WorkPath().c_str(), block->ToString().c_str()); } @@ -752,9 +762,9 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { AsyncCacheWriter* writer = new AsyncCacheWriter; writer->file = const_cast(this); writer->block = block; - Log("[%s] pread in miss list(fill cache), %s\n", - cache_->WorkPath().c_str(), - 
block->ToString().c_str()); + //Log("[%s] pread in miss list(fill cache), %s\n", + // cache_->WorkPath().c_str(), + // block->ToString().c_str()); cache_->bg_fill_.Schedule(&BlockCacheRandomAccessFile::AsyncCacheWrite, writer, 10); } @@ -765,7 +775,7 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { block->Set(kCacheBlockValid); block->Clear(kCacheBlockLocked); block->cv.SignalAll(); - Log("[%s] pread in miss list(fill cache done), %s\n", + Log("[%s] cache fill done, %s\n", cache_->WorkPath().c_str(), block->ToString().c_str()); } @@ -776,6 +786,9 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { CacheBlock* block = c_locked[i]; block->WaitOnClear(kCacheBlockLocked); assert((block->state & kCacheBlockValid) == kCacheBlockValid); + Log("[%s] wait locked done, %s\n", + cache_->WorkPath().c_str(), + block->ToString().c_str()); } // fill user mem @@ -791,7 +804,7 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { } memcpy(scratch + msize, data_block.data(), data_block.size()); msize += data_block.size(); - Log("[%s] fill user data, %s, prefix %lu, suffix %lu, msize %lu, offset %lu\n", + Log("[%s] Fill user data, %s, prefix %lu, suffix %lu, msize %lu, offset %lu\n", cache_->WorkPath().c_str(), fname_.c_str(), block_idx == begin ? offset % cache_->options_.block_size: 0, block_idx == end ? 
cache_->options_.block_size - (n + offset) % cache_->options_.block_size @@ -804,25 +817,26 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { cache_->mu_.Lock(); for (uint32_t i = 0; i < c_miss.size(); ++i) { CacheBlock* block = c_miss[i]; - Log("[%s] wakeup for miss, %s\n", cache_->WorkPath().c_str(), block->ToString().c_str()); - cache_->ReleaseBlock(block); + //Log("[%s] wakeup for miss, %s\n", cache_->WorkPath().c_str(), block->ToString().c_str()); + cache_->ReleaseBlock(block, true); } for (uint32_t i = 0; i < c_valid.size(); ++i) { CacheBlock* block = c_valid[i]; - Log("[%s] wakeup for valid, %s\n", cache_->WorkPath().c_str(), block->ToString().c_str()); - cache_->ReleaseBlock(block); + //Log("[%s] wakeup for valid, %s\n", cache_->WorkPath().c_str(), block->ToString().c_str()); + cache_->ReleaseBlock(block, false); } for (uint32_t i = 0; i < c_locked.size(); ++i) { CacheBlock* block = c_locked[i]; - Log("[%s] wakeup for lock, %s\n", cache_->WorkPath().c_str(), block->ToString().c_str()); - cache_->ReleaseBlock(block); + //Log("[%s] wakeup for lock, %s\n", cache_->WorkPath().c_str(), block->ToString().c_str()); + cache_->ReleaseBlock(block, false); } - Log("[%s] end pread %s, size %lu, offset %lu, fid %lu, start_block %lu, end_block %lu" + Log("[%s] End Pread %s, size %lu, offset %lu, fid %lu, res %lu, status %s, start_block %lu, end_block %lu" ", block_size %lu\n", cache_->WorkPath().c_str(), fname_.c_str(), n, offset, fid, + result->size(), s.ToString().c_str(), begin, end, cache_->options_.block_size); - return Status::OK(); + return s; } private: @@ -844,7 +858,7 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { uint64_t offset = block->block_idx * cache_->options_.block_size; size_t n = cache_->options_.block_size; s = dfs_file_->Read(offset, n, &result, scratch); - Log("[%s] cache async.dfs read, %s" + Log("[%s] dfs read, %s" ", offset %lu, size %lu, status %s, res %lu\n", cache_->WorkPath().c_str(), block->ToString().c_str(), 
offset, n, @@ -890,9 +904,9 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { } void HandleCacheWrite(AsyncCacheWriter* writer) { CacheBlock* block = writer->block; - Log("[%s] handle cache write, %s\n", - cache_->WorkPath().c_str(), - block->ToString().c_str()); + //Log("[%s] cache fill, %s\n", + // cache_->WorkPath().c_str(), + // block->ToString().c_str()); cache_->LogRecord(block); cache_->FillCache(block); @@ -906,6 +920,7 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { BlockCacheImpl* cache_; RandomAccessFile* dfs_file_; std::string fname_; + uint64_t fid_; }; // Tcache impl @@ -1133,7 +1148,7 @@ Status BlockCacheImpl::FillCache(CacheBlock* block) { // do io without lock ssize_t res = pwrite(fd, block->data_block.data(), block->data_block.size(), cache_block_idx * options_.block_size); - Log("[%s] fillcache: sid %lu, dataset.fd %d, datablock size %lu, cb_idx %lu, %s, res %ld\n", + Log("[%s] cache fill: sid %lu, dataset.fd %d, datablock size %lu, cb_idx %lu, %s, res %ld\n", this->WorkPath().c_str(), sid, fd, block->data_block.size(), cache_block_idx, block->ToString().c_str(), @@ -1156,7 +1171,7 @@ Status BlockCacheImpl::ReadCache(CacheBlock* block) { // do io without lock ssize_t res = pread(fd, (char*)block->data_block.data(), block->data_block.size(), cache_block_idx * options_.block_size); - Log("[%s] readcache: sid %lu, dataset.fd %d, datablock size %lu, cb_idx %lu, %s, res %ld\n", + Log("[%s] cache read: sid %lu, dataset.fd %d, datablock size %lu, cb_idx %lu, %s, res %ld\n", this->WorkPath().c_str(), sid, fd, block->data_block.size(), cache_block_idx, block->ToString().c_str(), @@ -1261,8 +1276,8 @@ CacheBlock* BlockCacheImpl::GetAndAllocBlock(uint64_t fid, uint64_t block_idx) { uint32_t hash = Hash(key.c_str(), key.size(), 7); uint64_t sid = hash % options_.dataset_num; - Log("[%s] alloc block, try get dataset, fid: %lu, block_idx: %lu, hash: %u, sid %lu, dataset_num: %lu\n", - this->WorkPath().c_str(), fid, block_idx, 
hash, sid, options_.dataset_num); + //Log("[%s] alloc block, try get dataset, fid: %lu, block_idx: %lu, hash: %u, sid %lu, dataset_num: %lu\n", + // this->WorkPath().c_str(), fid, block_idx, hash, sid, options_.dataset_num); CacheBlock* block = NULL; DataSet* ds = GetDataSet(sid); // get and alloc ds Cache* cache = ds->cache; @@ -1276,11 +1291,14 @@ CacheBlock* BlockCacheImpl::GetAndAllocBlock(uint64_t fid, uint64_t block_idx) { block->sid = sid; block->cache_block_idx = h->cache_id; block->handle = h; - Log("[%s] new blockcache: %s\n", this->WorkPath().c_str(), block->ToString().c_str()); + Log("[%s] Alloc Block: %s, sid %lu, fid %lu, block_idx %lu, hash %u, dataset_nr %lu\n", + this->WorkPath().c_str(), + block->ToString().c_str(), + sid, fid, block_idx, hash, options_.dataset_num); } else { block = reinterpret_cast(cache->Value((Cache::Handle*)h)); - Log("[%s] get block from memcache, %s\n", - this->WorkPath().c_str(), block->ToString().c_str()); + //Log("[%s] get block from memcache, %s\n", + // this->WorkPath().c_str(), block->ToString().c_str()); } return block; } @@ -1294,26 +1312,24 @@ Status BlockCacheImpl::LogRecord(CacheBlock* block) { return db_->Write(leveldb::WriteOptions(), &batch); } -Status BlockCacheImpl::ReleaseBlock(CacheBlock* block) { +Status BlockCacheImpl::ReleaseBlock(CacheBlock* block, bool need_sync) { mu_.AssertHeld(); Status s; - std::string key = "DS#"; - PutFixed64(&key, block->sid); - PutFixed64(&key, block->cache_block_idx); - leveldb::WriteBatch batch; - batch.Put(key, block->Encode()); + mu_.Unlock(); + if (need_sync) { + s = LogRecord(block); + } + + mu_.Lock(); LRUHandle* h = block->handle; DataSet* ds = GetDataSet(block->sid); // get and alloc ds block->ReleaseDataBlock(); block->cv.SignalAll(); ds->cache->Release((Cache::Handle*)h); - mu_.Unlock(); // TODO: dump meta into memtable Log("[%s] release block: %s\n", this->WorkPath().c_str(), block->ToString().c_str()); - s = db_->Write(leveldb::WriteOptions(), &batch); - 
mu_.Lock(); return s; } From 170318fe6cad5becae33d4036cc082d25194e295 Mon Sep 17 00:00:00 2001 From: caijieming Date: Tue, 8 Aug 2017 02:19:29 +0800 Subject: [PATCH 05/19] issue=1258, Tcache support block-level cache evict --- src/leveldb/util/block_cache.cc | 61 +++++++++++++++++++++++---------- 1 file changed, 43 insertions(+), 18 deletions(-) diff --git a/src/leveldb/util/block_cache.cc b/src/leveldb/util/block_cache.cc index 10a1e61b7..0b54e6df8 100644 --- a/src/leveldb/util/block_cache.cc +++ b/src/leveldb/util/block_cache.cc @@ -147,7 +147,7 @@ struct CacheBlock { std::stringstream ss; ss << "CacheBlock(" << (uint64_t)this << "): fid: " << fid << ", block_idx: " << block_idx << ", sid: " << sid << ", cache_block_idx: " << cache_block_idx - << ", state " << state; + << ", state " << state << ", status " << s.ToString(); return ss.str(); } }; @@ -615,6 +615,7 @@ class BlockCacheWritableFile : public WritableFile { Status FillCache(std::string* block_data, uint64_t block_idx) { cache_->mu_.AssertHeld(); + Status s; uint64_t fid = fid_; CacheBlock* block = cache_->GetAndAllocBlock(fid, block_idx); block->state = 0; @@ -622,14 +623,18 @@ class BlockCacheWritableFile : public WritableFile { cache_->mu_.Unlock(); // Do io without lock - cache_->LogRecord(block); - cache_->FillCache(block); + block->s = cache_->LogRecord(block); + if (block->s.ok()) { + block->s = cache_->FillCache(block); + } cache_->mu_.Lock(); - block->state = kCacheBlockValid; - cache_->ReleaseBlock(block, true); + if (block->s.ok()) { + block->state = kCacheBlockValid; + } + s = cache_->ReleaseBlock(block, true); write_buffer_.ReleaseBlock(block_data); - return Status::OK(); + return s; } private: @@ -738,7 +743,10 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { MutexLock lockgard(&cache_->mu_); CacheBlock* block = c_valid[i]; block->WaitOnClear(kCacheBlockCacheRead); - block->Set(kCacheBlockValid); + assert(block->Test(kCacheBlockValid)); + if (!block->s.ok() && s.ok()) { + s = 
block->s; // degrade read + } block->Clear(kCacheBlockLocked); block->cv.SignalAll(); Log("[%s] cache read done, %s\n", @@ -752,6 +760,9 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { CacheBlock* block = c_miss[i]; block->WaitOnClear(kCacheBlockDfsRead); block->Set(kCacheBlockCacheFill); + if (!block->s.ok() && s.ok()) { + s = block->s; // degrade read + } Log("[%s] dfs read done, %s\n", cache_->WorkPath().c_str(), block->ToString().c_str()); @@ -772,7 +783,11 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { MutexLock lockgard(&cache_->mu_); CacheBlock* block = c_miss[i]; block->WaitOnClear(kCacheBlockCacheFill); - block->Set(kCacheBlockValid); + if (block->s.ok()) { + block->Set(kCacheBlockValid); + } else if (s.ok()) { + s = block->s; // degrade read + } block->Clear(kCacheBlockLocked); block->cv.SignalAll(); Log("[%s] cache fill done, %s\n", @@ -785,7 +800,6 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { MutexLock lockgard(&cache_->mu_); CacheBlock* block = c_locked[i]; block->WaitOnClear(kCacheBlockLocked); - assert((block->state & kCacheBlockValid) == kCacheBlockValid); Log("[%s] wait locked done, %s\n", cache_->WorkPath().c_str(), block->ToString().c_str()); @@ -804,8 +818,10 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { } memcpy(scratch + msize, data_block.data(), data_block.size()); msize += data_block.size(); - Log("[%s] Fill user data, %s, prefix %lu, suffix %lu, msize %lu, offset %lu\n", + Log("[%s] Fill user data, %s, fill_offset %lu, fill_size %lu, prefix %lu, suffix %lu, msize %lu, offset %lu\n", cache_->WorkPath().c_str(), fname_.c_str(), + block_idx * cache_->options_.block_size + (block_idx == begin ? offset % cache_->options_.block_size: 0), + data_block.size(), block_idx == begin ? offset % cache_->options_.block_size: 0, block_idx == end ? 
cache_->options_.block_size - (n + offset) % cache_->options_.block_size : cache_->options_.block_size, @@ -831,6 +847,13 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { cache_->ReleaseBlock(block, false); } + if (!s.ok()) { + s = dfs_file_->Read(offset, n, result, scratch); + Log("[%s] Pread degrade %s, offset %lu, size %lu, status %s\n", + cache_->WorkPath().c_str(), fname_.c_str(), + offset, n, s.ToString().c_str()); + } + Log("[%s] End Pread %s, size %lu, offset %lu, fid %lu, res %lu, status %s, start_block %lu, end_block %lu" ", block_size %lu\n", cache_->WorkPath().c_str(), fname_.c_str(), n, offset, fid, @@ -857,12 +880,12 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { Slice result; uint64_t offset = block->block_idx * cache_->options_.block_size; size_t n = cache_->options_.block_size; - s = dfs_file_->Read(offset, n, &result, scratch); + block->s = dfs_file_->Read(offset, n, &result, scratch); Log("[%s] dfs read, %s" ", offset %lu, size %lu, status %s, res %lu\n", cache_->WorkPath().c_str(), block->ToString().c_str(), offset, n, - s.ToString().c_str(), result.size()); + block->s.ToString().c_str(), result.size()); MutexLock lockgard(&cache_->mu_); block->Clear(kCacheBlockDfsRead); @@ -882,7 +905,7 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { } void HandleCacheRead(AsyncCacheReader* reader) { CacheBlock* block = reader->block; - cache_->ReadCache(block); + block->s = cache_->ReadCache(block); MutexLock lockgard(&cache_->mu_); block->Clear(kCacheBlockCacheRead); @@ -907,8 +930,10 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { //Log("[%s] cache fill, %s\n", // cache_->WorkPath().c_str(), // block->ToString().c_str()); - cache_->LogRecord(block); - cache_->FillCache(block); + block->s = cache_->LogRecord(block); + if (block->s.ok()) { + block->s = cache_->FillCache(block); + } MutexLock lockgard(&cache_->mu_); block->Clear(kCacheBlockCacheFill); @@ -1318,6 +1343,7 @@ Status 
BlockCacheImpl::ReleaseBlock(CacheBlock* block, bool need_sync) { mu_.Unlock(); if (need_sync) { + // TODO: dump meta into memtable s = LogRecord(block); } @@ -1325,11 +1351,10 @@ Status BlockCacheImpl::ReleaseBlock(CacheBlock* block, bool need_sync) { LRUHandle* h = block->handle; DataSet* ds = GetDataSet(block->sid); // get and alloc ds block->ReleaseDataBlock(); + Log("[%s] release block: %s\n", this->WorkPath().c_str(), block->ToString().c_str()); + block->s = Status::OK(); // clear io status block->cv.SignalAll(); ds->cache->Release((Cache::Handle*)h); - - // TODO: dump meta into memtable - Log("[%s] release block: %s\n", this->WorkPath().c_str(), block->ToString().c_str()); return s; } From a1e51d3bc7c11a3e31658d2f2f898e686a91d65b Mon Sep 17 00:00:00 2001 From: caijieming Date: Tue, 8 Aug 2017 02:56:43 +0800 Subject: [PATCH 06/19] issue=1258, Tcache support block-level cache evict --- src/leveldb/util/block_cache.cc | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/src/leveldb/util/block_cache.cc b/src/leveldb/util/block_cache.cc index 0b54e6df8..603ca1f0a 100644 --- a/src/leveldb/util/block_cache.cc +++ b/src/leveldb/util/block_cache.cc @@ -617,7 +617,14 @@ class BlockCacheWritableFile : public WritableFile { cache_->mu_.AssertHeld(); Status s; uint64_t fid = fid_; - CacheBlock* block = cache_->GetAndAllocBlock(fid, block_idx); + CacheBlock* block = NULL; + while ((block = cache_->GetAndAllocBlock(fid, block_idx)) == NULL) { + Log("[%s] fill cache for write %s, fid %lu, block_idx %lu, wait 10ms after retry\n", + cache_->WorkPath().c_str(), fname_.c_str(), + fid, block_idx); + port::CondVar cv(&cache_->mu_); + cv.Wait(10); // timewait 10ms retry + } block->state = 0; block->GetDataBlock(cache_->options_.block_size, Slice(*block_data)); cache_->mu_.Unlock(); @@ -691,7 +698,14 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { MutexLock lockgard(&cache_->mu_); for (uint64_t block_idx = begin; block_idx <= 
end; ++block_idx) { - CacheBlock* block = cache_->GetAndAllocBlock(fid, block_idx); + CacheBlock* block = NULL; + while ((block = cache_->GetAndAllocBlock(fid, block_idx)) == NULL) { + Log("[%s] fill cache for read %s, fid %lu, block_idx %lu, wait 10ms after retry\n", + cache_->WorkPath().c_str(), fname_.c_str(), + fid, block_idx); + port::CondVar cv(&cache_->mu_); + cv.Wait(10); // timewait 10ms retry + } assert(block->fid == fid && block->block_idx == block_idx); block->GetDataBlock(cache_->options_.block_size, Slice()); block_queue.push_back(block); // sort by block_idx @@ -1310,7 +1324,10 @@ CacheBlock* BlockCacheImpl::GetAndAllocBlock(uint64_t fid, uint64_t block_idx) { if (h == NULL) { block = new CacheBlock(&mu_); h = (LRUHandle*)cache->Insert(key, block, 1, &BlockCacheImpl::BlockDeleter); - assert(h != NULL); + if (h == NULL) { + delete block; + return NULL; + } block->fid = fid; block->block_idx = block_idx; block->sid = sid; From b74989698a45f819fa44f8dc43756e71e1b3f21f Mon Sep 17 00:00:00 2001 From: caijieming Date: Tue, 22 Aug 2017 01:04:32 +0800 Subject: [PATCH 07/19] issue=1258, Tcache support block-level cache evict bugfix: 1. cache reload core 2. support aio engine 3. 
cache fill TEST PASS --- src/io/utils_leveldb.cc | 8 +- src/leveldb/Makefile | 2 +- src/leveldb/include/leveldb/block_cache.h | 4 +- src/leveldb/include/leveldb/statistics.h | 393 ++++++++++++++++++++++ src/leveldb/util/block_cache.cc | 281 ++++++++++++---- src/leveldb/util/cache.cc | 17 +- src/leveldb/util/statistics.cc | 115 +++++++ src/tera_flags.cc | 2 +- 8 files changed, 735 insertions(+), 87 deletions(-) create mode 100644 src/leveldb/include/leveldb/statistics.h create mode 100644 src/leveldb/util/statistics.cc diff --git a/src/io/utils_leveldb.cc b/src/io/utils_leveldb.cc index 3d3249e1d..c6d16e2a8 100644 --- a/src/io/utils_leveldb.cc +++ b/src/io/utils_leveldb.cc @@ -15,10 +15,10 @@ #include "common/file/file_path.h" #include "common/mutex.h" #include "io/timekey_comparator.h" +#include "leveldb/block_cache.h" #include "leveldb/comparator.h" #include "leveldb/env_dfs.h" #include "leveldb/env_flash.h" -#include "leveldb/block_cache.h" #include "leveldb/env_inmem.h" #include "leveldb/env_mock.h" #include "leveldb/table_utils.h" @@ -32,7 +32,7 @@ DECLARE_string(tera_leveldb_env_hdfs2_nameservice_list); DECLARE_string(tera_tabletnode_path_prefix); DECLARE_string(tera_dfs_so_path); DECLARE_string(tera_dfs_conf); -DECLARE_int32(tera_leveldb_block_cache_env_num_thread); +DECLARE_int32(tera_leveldb_block_cache_env_thread_num); namespace tera { namespace io { @@ -73,8 +73,8 @@ static pthread_once_t block_cache_once = PTHREAD_ONCE_INIT; static leveldb::Env* default_block_cache_env; static void InitDefaultBlockCacheEnv() { default_block_cache_env = new leveldb::BlockCacheEnv(LeveldbBaseEnv()); - default_block_cache_env->SetBackgroundThreads(FLAGS_tera_leveldb_block_cache_env_num_thread); - LOG(INFO) << "init block cache, thread num " << FLAGS_tera_leveldb_block_cache_env_num_thread; + default_block_cache_env->SetBackgroundThreads(FLAGS_tera_leveldb_block_cache_env_thread_num); + LOG(INFO) << "init block cache, thread num " << 
FLAGS_tera_leveldb_block_cache_env_thread_num; } leveldb::Env* DefaultBlockCacheEnv() { diff --git a/src/leveldb/Makefile b/src/leveldb/Makefile index c9162d2eb..9073e98a5 100644 --- a/src/leveldb/Makefile +++ b/src/leveldb/Makefile @@ -7,7 +7,7 @@ # to switch between compilation modes. # OPT ?= -O2 -DNDEBUG # (A) Production use (optimized mode) -OPT ?= -g2 -Wall -Werror # (B) Debug mode, w/ full line-level debugging symbols +OPT ?= -std=gnu++11 -g2 -Wall -Werror # (B) Debug mode, w/ full line-level debugging symbols # OPT ?= -O2 -g2 -DNDEBUG # (C) Profiling mode: opt, but w/debugging symbols #----------------------------------------------- diff --git a/src/leveldb/include/leveldb/block_cache.h b/src/leveldb/include/leveldb/block_cache.h index a48022bb7..021964db4 100644 --- a/src/leveldb/include/leveldb/block_cache.h +++ b/src/leveldb/include/leveldb/block_cache.h @@ -2,8 +2,8 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. -#ifndef STOREAGE_LEVELDB_UTIL_BLOCK_CACHE_H -#define STOREAGE_LEVELDB_UTIL_BLOCK_CACHE_H +#ifndef STOREAGE_LEVELDB_UTIL_BLOCK_CACHE_H_ +#define STOREAGE_LEVELDB_UTIL_BLOCK_CACHE_H_ #include "leveldb/env.h" #include "leveldb/options.h" diff --git a/src/leveldb/include/leveldb/statistics.h b/src/leveldb/include/leveldb/statistics.h new file mode 100644 index 000000000..81d4a4729 --- /dev/null +++ b/src/leveldb/include/leveldb/statistics.h @@ -0,0 +1,393 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef STORAGE_LEVELDB_INCLUDE_STATISTICS_H_ +#define STORAGE_LEVELDB_INCLUDE_STATISTICS_H_ + +#include +#include + +#include +#include + +namespace leveldb { + +/** + * Keep adding ticker's here. + * 1. Any ticker should be added before TICKER_ENUM_MAX. + * 2. Add a readable string in TickersNameMap below for the newly added ticker. 
+ */ +enum Tickers : uint32_t { + // total block cache misses + // REQUIRES: BLOCK_CACHE_MISS == BLOCK_CACHE_INDEX_MISS + + // BLOCK_CACHE_FILTER_MISS + + // BLOCK_CACHE_DATA_MISS; + BLOCK_CACHE_MISS = 0, + // total block cache hit + // REQUIRES: BLOCK_CACHE_HIT == BLOCK_CACHE_INDEX_HIT + + // BLOCK_CACHE_FILTER_HIT + + // BLOCK_CACHE_DATA_HIT; + BLOCK_CACHE_HIT, + // # of blocks added to block cache. + BLOCK_CACHE_ADD, + // # of failures when adding blocks to block cache. + BLOCK_CACHE_ADD_FAILURES, + // # of times cache miss when accessing index block from block cache. + BLOCK_CACHE_INDEX_MISS, + // # of times cache hit when accessing index block from block cache. + BLOCK_CACHE_INDEX_HIT, + // # of bytes of index blocks inserted into cache + BLOCK_CACHE_INDEX_BYTES_INSERT, + // # of bytes of index block erased from cache + BLOCK_CACHE_INDEX_BYTES_EVICT, + // # of times cache miss when accessing filter block from block cache. + BLOCK_CACHE_FILTER_MISS, + // # of times cache hit when accessing filter block from block cache. + BLOCK_CACHE_FILTER_HIT, + // # of bytes of bloom filter blocks inserted into cache + BLOCK_CACHE_FILTER_BYTES_INSERT, + // # of bytes of bloom filter block erased from cache + BLOCK_CACHE_FILTER_BYTES_EVICT, + // # of times cache miss when accessing data block from block cache. + BLOCK_CACHE_DATA_MISS, + // # of times cache hit when accessing data block from block cache. + BLOCK_CACHE_DATA_HIT, + // # of bytes read from cache. + BLOCK_CACHE_BYTES_READ, + // # of bytes written into cache. + BLOCK_CACHE_BYTES_WRITE, + + // # of times bloom filter has avoided file reads. + BLOOM_FILTER_USEFUL, + + // # persistent cache hit + PERSISTENT_CACHE_HIT, + // # persistent cache miss + PERSISTENT_CACHE_MISS, + + // # of memtable hits. + MEMTABLE_HIT, + // # of memtable misses. 
+ MEMTABLE_MISS, + + // # of Get() queries served by L0 + GET_HIT_L0, + // # of Get() queries served by L1 + GET_HIT_L1, + // # of Get() queries served by L2 and up + GET_HIT_L2_AND_UP, + + /** + * COMPACTION_KEY_DROP_* count the reasons for key drop during compaction + * There are 3 reasons currently. + * Overwritten by a newer entry; deleted (obsolete); dropped by the user function. + */ + COMPACTION_KEY_DROP_NEWER_ENTRY, // key was written with a newer value. + COMPACTION_KEY_DROP_OBSOLETE, // The key is obsolete. + COMPACTION_KEY_DROP_USER, // user compaction function has dropped the key. + + // Number of keys written to the database via the Put and Write calls + NUMBER_KEYS_WRITTEN, + // Number of Keys read, + NUMBER_KEYS_READ, + // Number keys updated, if inplace update is enabled + NUMBER_KEYS_UPDATED, + // The number of uncompressed bytes issued by DB::Put(), DB::Delete(), + // DB::Merge(), and DB::Write(). + BYTES_WRITTEN, + // The number of uncompressed bytes read from DB::Get(). It could be + // either from memtables, cache, or table files. + // For the number of logical bytes read from DB::MultiGet(), + // please use NUMBER_MULTIGET_BYTES_READ. + BYTES_READ, + // The number of calls to seek/next/prev + NUMBER_DB_SEEK, + NUMBER_DB_NEXT, + NUMBER_DB_PREV, + // The number of calls to seek/next/prev that returned data + NUMBER_DB_SEEK_FOUND, + NUMBER_DB_NEXT_FOUND, + NUMBER_DB_PREV_FOUND, + // The number of uncompressed bytes read from an iterator. + // Includes size of key and value. + ITER_BYTES_READ, + NO_FILE_CLOSES, + NO_FILE_OPENS, + NO_FILE_ERRORS, + // DEPRECATED Time system had to wait to do L0-L1 compactions + STALL_L0_SLOWDOWN_MICROS, + // DEPRECATED Time system had to wait to move memtable to L1. + STALL_MEMTABLE_COMPACTION_MICROS, + // DEPRECATED write throttle because of too many files in L0 + STALL_L0_NUM_FILES_MICROS, + // Writer has to wait for compaction or flush to finish. + STALL_MICROS, + // The wait time for db mutex. + // Disabled by default. 
To enable it set stats level to kAll + DB_MUTEX_WAIT_MICROS, + RATE_LIMIT_DELAY_MILLIS, + NO_ITERATORS, // number of iterators currently open + + // Number of MultiGet calls, keys read, and bytes read + NUMBER_MULTIGET_CALLS, + NUMBER_MULTIGET_KEYS_READ, + NUMBER_MULTIGET_BYTES_READ, + + // Number of delete records that were not required to be + // written to storage because key does not exist + NUMBER_FILTERED_DELETES, + NUMBER_MERGE_FAILURES, + SEQUENCE_NUMBER, + + // number of times bloom was checked before creating iterator on a + // file, and the number of times the check was useful in avoiding + // iterator creation (and thus likely IOPs). + BLOOM_FILTER_PREFIX_CHECKED, + BLOOM_FILTER_PREFIX_USEFUL, + + // Number of times we had to reseek inside an iteration to skip + // over large number of keys with same userkey. + NUMBER_OF_RESEEKS_IN_ITERATION, + + // Record the number of calls to GetUpdatesSince. Useful to keep track of + // transaction log iterator refreshes + GET_UPDATES_SINCE_CALLS, + BLOCK_CACHE_COMPRESSED_MISS, // miss in the compressed block cache + BLOCK_CACHE_COMPRESSED_HIT, // hit in the compressed block cache + // Number of blocks added to compressed block cache + BLOCK_CACHE_COMPRESSED_ADD, + // Number of failures when adding blocks to compressed block cache + BLOCK_CACHE_COMPRESSED_ADD_FAILURES, + WAL_FILE_SYNCED, // Number of times WAL sync is done + WAL_FILE_BYTES, // Number of bytes written to WAL + + // Writes can be processed by requesting thread or by the thread at the + // head of the writers queue. + WRITE_DONE_BY_SELF, + WRITE_DONE_BY_OTHER, // Equivalent to writes done for others + WRITE_TIMEDOUT, // Number of writes ending up with timed-out. 
+ WRITE_WITH_WAL, // Number of Write calls that request WAL + COMPACT_READ_BYTES, // Bytes read during compaction + COMPACT_WRITE_BYTES, // Bytes written during compaction + FLUSH_WRITE_BYTES, // Bytes written during flush + + // Number of table's properties loaded directly from file, without creating + // table reader object. + NUMBER_DIRECT_LOAD_TABLE_PROPERTIES, + NUMBER_SUPERVERSION_ACQUIRES, + NUMBER_SUPERVERSION_RELEASES, + NUMBER_SUPERVERSION_CLEANUPS, + NUMBER_BLOCK_NOT_COMPRESSED, + MERGE_OPERATION_TOTAL_TIME, + FILTER_OPERATION_TOTAL_TIME, + + // Row cache. + ROW_CACHE_HIT, + ROW_CACHE_MISS, + + TICKER_ENUM_MAX +}; + +// The order of items listed in Tickers should be the same as +// the order listed in TickersNameMap +const std::vector<std::pair<Tickers, std::string> > TickersNameMap = { + {BLOCK_CACHE_MISS, "leveldb.block.cache.miss"}, + {BLOCK_CACHE_HIT, "leveldb.block.cache.hit"}, + {BLOCK_CACHE_ADD, "leveldb.block.cache.add"}, + {BLOCK_CACHE_ADD_FAILURES, "leveldb.block.cache.add.failures"}, + {BLOCK_CACHE_INDEX_MISS, "leveldb.block.cache.index.miss"}, + {BLOCK_CACHE_INDEX_HIT, "leveldb.block.cache.index.hit"}, + {BLOCK_CACHE_INDEX_BYTES_INSERT, "leveldb.block.cache.index.bytes.insert"}, + {BLOCK_CACHE_INDEX_BYTES_EVICT, "leveldb.block.cache.index.bytes.evict"}, + {BLOCK_CACHE_FILTER_MISS, "leveldb.block.cache.filter.miss"}, + {BLOCK_CACHE_FILTER_HIT, "leveldb.block.cache.filter.hit"}, + {BLOCK_CACHE_FILTER_BYTES_INSERT, + "leveldb.block.cache.filter.bytes.insert"}, + {BLOCK_CACHE_FILTER_BYTES_EVICT, "leveldb.block.cache.filter.bytes.evict"}, + {BLOCK_CACHE_DATA_MISS, "leveldb.block.cache.data.miss"}, + {BLOCK_CACHE_DATA_HIT, "leveldb.block.cache.data.hit"}, + {BLOCK_CACHE_BYTES_READ, "leveldb.block.cache.bytes.read"}, + {BLOCK_CACHE_BYTES_WRITE, "leveldb.block.cache.bytes.write"}, + {BLOOM_FILTER_USEFUL, "leveldb.bloom.filter.useful"}, + {MEMTABLE_HIT, "leveldb.memtable.hit"}, + {MEMTABLE_MISS, "leveldb.memtable.miss"}, + {GET_HIT_L0, "leveldb.l0.hit"}, + {GET_HIT_L1, 
"leveldb.l1.hit"}, + {GET_HIT_L2_AND_UP, "leveldb.l2andup.hit"}, + {COMPACTION_KEY_DROP_NEWER_ENTRY, "leveldb.compaction.key.drop.new"}, + {COMPACTION_KEY_DROP_OBSOLETE, "leveldb.compaction.key.drop.obsolete"}, + {COMPACTION_KEY_DROP_USER, "leveldb.compaction.key.drop.user"}, + {NUMBER_KEYS_WRITTEN, "leveldb.number.keys.written"}, + {NUMBER_KEYS_READ, "leveldb.number.keys.read"}, + {NUMBER_KEYS_UPDATED, "leveldb.number.keys.updated"}, + {BYTES_WRITTEN, "leveldb.bytes.written"}, + {BYTES_READ, "leveldb.bytes.read"}, + {NUMBER_DB_SEEK, "leveldb.number.db.seek"}, + {NUMBER_DB_NEXT, "leveldb.number.db.next"}, + {NUMBER_DB_PREV, "leveldb.number.db.prev"}, + {NUMBER_DB_SEEK_FOUND, "leveldb.number.db.seek.found"}, + {NUMBER_DB_NEXT_FOUND, "leveldb.number.db.next.found"}, + {NUMBER_DB_PREV_FOUND, "leveldb.number.db.prev.found"}, + {ITER_BYTES_READ, "leveldb.db.iter.bytes.read"}, + {NO_FILE_CLOSES, "leveldb.no.file.closes"}, + {NO_FILE_OPENS, "leveldb.no.file.opens"}, + {NO_FILE_ERRORS, "leveldb.no.file.errors"}, + {STALL_L0_SLOWDOWN_MICROS, "leveldb.l0.slowdown.micros"}, + {STALL_MEMTABLE_COMPACTION_MICROS, "leveldb.memtable.compaction.micros"}, + {STALL_L0_NUM_FILES_MICROS, "leveldb.l0.num.files.stall.micros"}, + {STALL_MICROS, "leveldb.stall.micros"}, + {DB_MUTEX_WAIT_MICROS, "leveldb.db.mutex.wait.micros"}, + {RATE_LIMIT_DELAY_MILLIS, "leveldb.rate.limit.delay.millis"}, + {NO_ITERATORS, "leveldb.num.iterators"}, + {NUMBER_MULTIGET_CALLS, "leveldb.number.multiget.get"}, + {NUMBER_MULTIGET_KEYS_READ, "leveldb.number.multiget.keys.read"}, + {NUMBER_MULTIGET_BYTES_READ, "leveldb.number.multiget.bytes.read"}, + {NUMBER_FILTERED_DELETES, "leveldb.number.deletes.filtered"}, + {NUMBER_MERGE_FAILURES, "leveldb.number.merge.failures"}, + {SEQUENCE_NUMBER, "leveldb.sequence.number"}, + {BLOOM_FILTER_PREFIX_CHECKED, "leveldb.bloom.filter.prefix.checked"}, + {BLOOM_FILTER_PREFIX_USEFUL, "leveldb.bloom.filter.prefix.useful"}, + {NUMBER_OF_RESEEKS_IN_ITERATION, 
"leveldb.number.reseeks.iteration"}, + {GET_UPDATES_SINCE_CALLS, "leveldb.getupdatessince.calls"}, + {BLOCK_CACHE_COMPRESSED_MISS, "leveldb.block.cachecompressed.miss"}, + {BLOCK_CACHE_COMPRESSED_HIT, "leveldb.block.cachecompressed.hit"}, + {BLOCK_CACHE_COMPRESSED_ADD, "leveldb.block.cachecompressed.add"}, + {BLOCK_CACHE_COMPRESSED_ADD_FAILURES, + "leveldb.block.cachecompressed.add.failures"}, + {WAL_FILE_SYNCED, "leveldb.wal.synced"}, + {WAL_FILE_BYTES, "leveldb.wal.bytes"}, + {WRITE_DONE_BY_SELF, "leveldb.write.self"}, + {WRITE_DONE_BY_OTHER, "leveldb.write.other"}, + {WRITE_WITH_WAL, "leveldb.write.wal"}, + {FLUSH_WRITE_BYTES, "leveldb.flush.write.bytes"}, + {COMPACT_READ_BYTES, "leveldb.compact.read.bytes"}, + {COMPACT_WRITE_BYTES, "leveldb.compact.write.bytes"}, + {NUMBER_DIRECT_LOAD_TABLE_PROPERTIES, + "leveldb.number.direct.load.table.properties"}, + {NUMBER_SUPERVERSION_ACQUIRES, "leveldb.number.superversion_acquires"}, + {NUMBER_SUPERVERSION_RELEASES, "leveldb.number.superversion_releases"}, + {NUMBER_SUPERVERSION_CLEANUPS, "leveldb.number.superversion_cleanups"}, + {NUMBER_BLOCK_NOT_COMPRESSED, "leveldb.number.block.not_compressed"}, + {MERGE_OPERATION_TOTAL_TIME, "leveldb.merge.operation.time.nanos"}, + {FILTER_OPERATION_TOTAL_TIME, "leveldb.filter.operation.time.nanos"}, + {ROW_CACHE_HIT, "leveldb.row.cache.hit"}, + {ROW_CACHE_MISS, "leveldb.row.cache.miss"}, +}; + +/** + * Keep adding histogram's here. 
+ * Any histogram should have value less than HISTOGRAM_ENUM_MAX + * Add a new Histogram by assigning it the current value of HISTOGRAM_ENUM_MAX + * Add a string representation in HistogramsNameMap below + * And increment HISTOGRAM_ENUM_MAX + */ +enum Histograms : uint32_t { + DB_GET = 0, + DB_WRITE, + COMPACTION_TIME, + SUBCOMPACTION_SETUP_TIME, + TABLE_SYNC_MICROS, + COMPACTION_OUTFILE_SYNC_MICROS, + WAL_FILE_SYNC_MICROS, + MANIFEST_FILE_SYNC_MICROS, + // TIME SPENT IN IO DURING TABLE OPEN + TABLE_OPEN_IO_MICROS, + DB_MULTIGET, + READ_BLOCK_COMPACTION_MICROS, + READ_BLOCK_GET_MICROS, + WRITE_RAW_BLOCK_MICROS, + STALL_L0_SLOWDOWN_COUNT, + STALL_MEMTABLE_COMPACTION_COUNT, + STALL_L0_NUM_FILES_COUNT, + HARD_RATE_LIMIT_DELAY_COUNT, + SOFT_RATE_LIMIT_DELAY_COUNT, + NUM_FILES_IN_SINGLE_COMPACTION, + DB_SEEK, + WRITE_STALL, + SST_READ_MICROS, + // The number of subcompactions actually scheduled during a compaction + NUM_SUBCOMPACTIONS_SCHEDULED, + // Value size distribution in each operation + BYTES_PER_READ, + BYTES_PER_WRITE, + BYTES_PER_MULTIGET, + // tera block cache spec + TERA_BLOCK_CACHE_PREAD_QUEUE, + TERA_BLOCK_CACHE_PREAD_SSD_READ, + TERA_BLOCK_CACHE_PREAD_FILL_USER_DATA, + TERA_BLOCK_CACHE_PREAD_RELEASE_BLOCK, + HISTOGRAM_ENUM_MAX, // TODO(ldemailly): enforce HistogramsNameMap match +}; + +const std::vector<std::pair<Histograms, std::string> > HistogramsNameMap = { + {DB_GET, "leveldb.db.get.micros"}, + {DB_WRITE, "leveldb.db.write.micros"}, + {COMPACTION_TIME, "leveldb.compaction.times.micros"}, + {SUBCOMPACTION_SETUP_TIME, "leveldb.subcompaction.setup.times.micros"}, + {TABLE_SYNC_MICROS, "leveldb.table.sync.micros"}, + {COMPACTION_OUTFILE_SYNC_MICROS, "leveldb.compaction.outfile.sync.micros"}, + {WAL_FILE_SYNC_MICROS, "leveldb.wal.file.sync.micros"}, + {MANIFEST_FILE_SYNC_MICROS, "leveldb.manifest.file.sync.micros"}, + {TABLE_OPEN_IO_MICROS, "leveldb.table.open.io.micros"}, + {DB_MULTIGET, "leveldb.db.multiget.micros"}, + {READ_BLOCK_COMPACTION_MICROS, 
"leveldb.read.block.compaction.micros"}, + {READ_BLOCK_GET_MICROS, "leveldb.read.block.get.micros"}, + {WRITE_RAW_BLOCK_MICROS, "leveldb.write.raw.block.micros"}, + {STALL_L0_SLOWDOWN_COUNT, "leveldb.l0.slowdown.count"}, + {STALL_MEMTABLE_COMPACTION_COUNT, "leveldb.memtable.compaction.count"}, + {STALL_L0_NUM_FILES_COUNT, "leveldb.num.files.stall.count"}, + {HARD_RATE_LIMIT_DELAY_COUNT, "leveldb.hard.rate.limit.delay.count"}, + {SOFT_RATE_LIMIT_DELAY_COUNT, "leveldb.soft.rate.limit.delay.count"}, + {NUM_FILES_IN_SINGLE_COMPACTION, "leveldb.numfiles.in.singlecompaction"}, + {DB_SEEK, "leveldb.db.seek.micros"}, + {WRITE_STALL, "leveldb.db.write.stall"}, + {SST_READ_MICROS, "leveldb.sst.read.micros"}, + {NUM_SUBCOMPACTIONS_SCHEDULED, "leveldb.num.subcompactions.scheduled"}, + {BYTES_PER_READ, "leveldb.bytes.per.read"}, + {BYTES_PER_WRITE, "leveldb.bytes.per.write"}, + {BYTES_PER_MULTIGET, "leveldb.bytes.per.multiget"}, + {TERA_BLOCK_CACHE_PREAD_QUEUE, "tera.block_cache.pread_queue"}, + {TERA_BLOCK_CACHE_PREAD_SSD_READ, "tera.block_cache.pread_ssd_read"}, + {TERA_BLOCK_CACHE_PREAD_FILL_USER_DATA, "tera.block_cache.pread_fill_user_data"}, + {TERA_BLOCK_CACHE_PREAD_RELEASE_BLOCK, "tera.block_cache.pread_release_block"}, +}; + +struct HistogramData { + double median; // median value + double percentile95; + double percentile99; // 99th percentile + double average; + double standard_deviation; +}; + +// Analyze the performance of a db +class Statistics { + public: + virtual ~Statistics() {} + + virtual int64_t GetTickerCount(uint32_t ticker_type) = 0; + virtual void RecordTick(uint32_t ticker_type, uint64_t count = 0) = 0; + virtual void SetTickerCount(uint32_t ticker_type, uint64_t count) = 0; + + virtual void GetHistogramData(uint32_t type, + HistogramData* const data) = 0; + virtual std::string GetBriefHistogramString(uint32_t type) { return ""; } + virtual std::string GetHistogramString(uint32_t type) const { return ""; } + virtual void MeasureTime(uint32_t histogram_type, uint64_t time) 
= 0; + virtual void ClearHistogram(uint32_t type) = 0; + + // String representation of the statistic object. + virtual std::string ToString() { + // Do nothing by default + return std::string("ToString(): not implemented"); + } + virtual void ClearAll() = 0; +}; + +// Create a concrete DBStatistics object +Statistics* CreateDBStatistics(); + +} // namespace leveldb + +#endif // STORAGE_LEVELDB_INCLUDE_STATISTICS_H_ diff --git a/src/leveldb/util/block_cache.cc b/src/leveldb/util/block_cache.cc index 603ca1f0a..14722f98a 100644 --- a/src/leveldb/util/block_cache.cc +++ b/src/leveldb/util/block_cache.cc @@ -4,6 +4,7 @@ #include "leveldb/block_cache.h" +#include #include #include #include @@ -20,6 +21,7 @@ #include "leveldb/env.h" #include "leveldb/iterator.h" #include "leveldb/options.h" +#include "leveldb/statistics.h" #include "leveldb/status.h" #include "leveldb/table_utils.h" #include "leveldb/write_batch.h" @@ -174,6 +176,10 @@ class BlockCacheImpl { RandomAccessFile** result); // cache Pread static void BlockDeleter(const Slice& key, void* v); + static void BGControlThreadFunc(void* arg); + + Status DeleteFile(const std::string& fname); + private: friend struct DataSet; struct LockContent; @@ -182,7 +188,7 @@ class BlockCacheImpl { Status FillCache(CacheBlock* block); - Status ReadCache(CacheBlock* block); + Status ReadCache(CacheBlock* block, struct aiocb* aio_context); uint64_t AllocFileId(); // no more than fid_batch_num @@ -196,6 +202,8 @@ class BlockCacheImpl { Status ReleaseBlock(CacheBlock* block, bool need_sync); + void BGControlThread(); + private: friend class BlockCacheWritableFile; friend class BlockCacheRandomAccessFile; @@ -223,6 +231,7 @@ class BlockCacheImpl { enum LockKeyType { kDBKey = 0, kDataSetKey = 1, + kDeleteDBKey = 2, }; struct LockContent { int type; @@ -237,7 +246,7 @@ class BlockCacheImpl { DataSet* data_set; const std::string Encode() { - if (type == kDBKey) { + if (type == kDBKey || type == kDeleteDBKey) { return 
db_lock_key.ToString(); } else if (type == kDataSetKey) { std::string key = "DS#"; @@ -248,7 +257,7 @@ class BlockCacheImpl { } const std::string KeyToString() { - if (type == kDBKey) { + if (type == kDBKey || type == kDeleteDBKey) { return db_lock_key.ToString(); } else if (type == kDataSetKey) { std::stringstream ss; @@ -272,12 +281,14 @@ class BlockCacheImpl { typedef std::map DataSetMap; DataSetMap data_set_map_; + Statistics* stat_; //WritableFile* logfile_; //log::Writer* log_; DB* db_; // store meta ThreadPool bg_fill_; ThreadPool bg_read_; ThreadPool bg_flush_; + ThreadPool bg_control_; }; // Must insure not init more than twice @@ -302,6 +313,11 @@ Status BlockCacheEnv::GetChildren(const std::string& path, } Status BlockCacheEnv::DeleteFile(const std::string& fname) { + if (fname.rfind(".sst") == fname.size() - 4) { + uint32_t hash = (Hash(fname.c_str(), fname.size(), 13)) % cache_vec_.size(); + BlockCacheImpl* cache = cache_vec_[hash]; + cache->DeleteFile(fname); + } return dfs_env_->DeleteFile(fname); } @@ -371,8 +387,8 @@ Status BlockCacheEnv::NewRandomAccessFile(const std::string& fname, uint32_t hash = (Hash(fname.c_str(), fname.size(), 13)) % cache_vec_.size(); BlockCacheImpl* cache = cache_vec_[hash]; Status s = cache->NewRandomAccessFile(fname, result); - Log("[block_cache %s] open file read: %s, hash: %u, status: %s\n", - cache->WorkPath().c_str(), fname.c_str(), hash, s.ToString().c_str()); + //Log("[block_cache %s] open file read: %s, hash: %u, status: %s\n", + // cache->WorkPath().c_str(), fname.c_str(), hash, s.ToString().c_str()); return s; } @@ -663,14 +679,15 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { : cache_(c), fname_(fname) { *s = cache_->dfs_env_->NewRandomAccessFile(fname_, &dfs_file_); - Log("[%s] dfs open for read: %s, block_size: %lu, status: %s\n", - cache_->WorkPath().c_str(), - fname.c_str(), - cache_->options_.block_size, - s->ToString().c_str()); + //Log("[%s] dfs open for read: %s, block_size: %lu, 
status: %s\n", + // cache_->WorkPath().c_str(), + // fname.c_str(), + // cache_->options_.block_size, + // s->ToString().c_str()); MutexLock lockgard(&cache_->mu_); fid_ = cache_->FileId(fname_); + aio_enabled_ = true; return; } @@ -691,11 +708,12 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { std::vector c_valid; std::vector block_queue; - Log("[%s] Begin Pread %s, size %lu, offset %lu, fid %lu, start_block %lu, end_block %lu" - ", block_size %lu\n", - cache_->WorkPath().c_str(), fname_.c_str(), n, offset, fid, - begin, end, cache_->options_.block_size); + //Log("[%s] Begin Pread %s, size %lu, offset %lu, fid %lu, start_block %lu, end_block %lu" + // ", block_size %lu\n", + // cache_->WorkPath().c_str(), fname_.c_str(), n, offset, fid, + // begin, end, cache_->options_.block_size); + uint64_t start_ts = cache_->options_.cache_env->NowMicros(); MutexLock lockgard(&cache_->mu_); for (uint64_t block_idx = begin; block_idx <= end; ++block_idx) { CacheBlock* block = NULL; @@ -721,12 +739,14 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { c_locked.push_back(block); } - Log("[%s] Queue block: %s, refs %u, data_block_refs %lu, alloc %u\n", - cache_->WorkPath().c_str(), block->ToString().c_str(), - block->handle->refs, block->data_block_refs, - block->data_block_alloc); + //Log("[%s] Queue block: %s, refs %u, data_block_refs %lu, alloc %u\n", + // cache_->WorkPath().c_str(), block->ToString().c_str(), + // block->handle->refs, block->data_block_refs, + // block->data_block_alloc); } cache_->mu_.Unlock(); + uint64_t queue_ts = cache_->options_.cache_env->NowMicros(); + cache_->stat_->MeasureTime(TERA_BLOCK_CACHE_PREAD_QUEUE, queue_ts - start_ts); // async read miss data for (uint32_t i = 0; i < c_miss.size(); ++i) { @@ -739,6 +759,7 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { // block->ToString().c_str()); cache_->bg_read_.Schedule(&BlockCacheRandomAccessFile::AsyncDfsRead, reader, 10); } + //uint64_t miss_read_sched_ts = 
cache_->options_.cache_env->NowMicros(); // async read valid data for (uint32_t i = 0; i < c_valid.size(); ++i) { @@ -749,8 +770,13 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { //Log("[%s] pread in valid list, %s\n", // cache_->WorkPath().c_str(), // block->ToString().c_str()); - cache_->bg_read_.Schedule(&BlockCacheRandomAccessFile::AsyncCacheRead, reader, 10); + if (aio_enabled_) { + AioCacheRead(reader); + } else { + cache_->bg_read_.Schedule(&BlockCacheRandomAccessFile::AsyncCacheRead, reader, 10); + } } + //uint64_t ssd_read_sched_ts = cache_->options_.cache_env->NowMicros(); // wait async cache read done for (uint32_t i = 0; i < c_valid.size(); ++i) { @@ -763,10 +789,12 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { } block->Clear(kCacheBlockLocked); block->cv.SignalAll(); - Log("[%s] cache read done, %s\n", - cache_->WorkPath().c_str(), - block->ToString().c_str()); + //Log("[%s] cache read done, %s\n", + // cache_->WorkPath().c_str(), + // block->ToString().c_str()); } + uint64_t ssd_read_ts = cache_->options_.cache_env->NowMicros(); + cache_->stat_->MeasureTime(TERA_BLOCK_CACHE_PREAD_SSD_READ, ssd_read_ts - queue_ts); // wait dfs read done and async cache file for (uint32_t i = 0; i < c_miss.size(); ++i) { @@ -781,6 +809,7 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { cache_->WorkPath().c_str(), block->ToString().c_str()); } + //uint64_t dfs_read_ts = cache_->options_.cache_env->NowMicros(); for (uint32_t i = 0; i < c_miss.size(); ++i) { CacheBlock* block = c_miss[i]; @@ -792,6 +821,7 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { // block->ToString().c_str()); cache_->bg_fill_.Schedule(&BlockCacheRandomAccessFile::AsyncCacheWrite, writer, 10); } + //uint64_t ssd_write_sched_ts = cache_->options_.cache_env->NowMicros(); for (uint32_t i = 0; i < c_miss.size(); ++i) { // wait cache fill finish MutexLock lockgard(&cache_->mu_); @@ -804,20 +834,22 @@ class BlockCacheRandomAccessFile : 
public RandomAccessFile { } block->Clear(kCacheBlockLocked); block->cv.SignalAll(); - Log("[%s] cache fill done, %s\n", - cache_->WorkPath().c_str(), - block->ToString().c_str()); + //Log("[%s] cache fill done, %s\n", + // cache_->WorkPath().c_str(), + // block->ToString().c_str()); } + //uint64_t ssd_write_ts = cache_->options_.cache_env->NowMicros(); // wait other async read finish for (uint32_t i = 0; i < c_locked.size(); ++i) { MutexLock lockgard(&cache_->mu_); CacheBlock* block = c_locked[i]; block->WaitOnClear(kCacheBlockLocked); - Log("[%s] wait locked done, %s\n", - cache_->WorkPath().c_str(), - block->ToString().c_str()); + //Log("[%s] wait locked done, %s\n", + // cache_->WorkPath().c_str(), + // block->ToString().c_str()); } + uint64_t wait_unlock_ts = cache_->options_.cache_env->NowMicros(); // fill user mem size_t msize = 0; @@ -832,17 +864,19 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { } memcpy(scratch + msize, data_block.data(), data_block.size()); msize += data_block.size(); - Log("[%s] Fill user data, %s, fill_offset %lu, fill_size %lu, prefix %lu, suffix %lu, msize %lu, offset %lu\n", - cache_->WorkPath().c_str(), fname_.c_str(), - block_idx * cache_->options_.block_size + (block_idx == begin ? offset % cache_->options_.block_size: 0), - data_block.size(), - block_idx == begin ? offset % cache_->options_.block_size: 0, - block_idx == end ? cache_->options_.block_size - (n + offset) % cache_->options_.block_size - : cache_->options_.block_size, - msize, offset); + //Log("[%s] Fill user data, %s, fill_offset %lu, fill_size %lu, prefix %lu, suffix %lu, msize %lu, offset %lu\n", + // cache_->WorkPath().c_str(), fname_.c_str(), + // block_idx * cache_->options_.block_size + (block_idx == begin ? offset % cache_->options_.block_size: 0), + // data_block.size(), + // block_idx == begin ? offset % cache_->options_.block_size: 0, + // block_idx == end ? 
cache_->options_.block_size - (n + offset) % cache_->options_.block_size + // : cache_->options_.block_size, + // msize, offset); } assert(msize == n); *result = Slice(scratch, n); + uint64_t fill_user_data_ts = cache_->options_.cache_env->NowMicros(); + cache_->stat_->MeasureTime(TERA_BLOCK_CACHE_PREAD_FILL_USER_DATA, fill_user_data_ts - wait_unlock_ts); cache_->mu_.Lock(); for (uint32_t i = 0; i < c_miss.size(); ++i) { @@ -860,6 +894,8 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { //Log("[%s] wakeup for lock, %s\n", cache_->WorkPath().c_str(), block->ToString().c_str()); cache_->ReleaseBlock(block, false); } + uint64_t release_cache_block_ts = cache_->options_.cache_env->NowMicros(); + cache_->stat_->MeasureTime(TERA_BLOCK_CACHE_PREAD_RELEASE_BLOCK, release_cache_block_ts - fill_user_data_ts); if (!s.ok()) { s = dfs_file_->Read(offset, n, result, scratch); @@ -868,11 +904,11 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { offset, n, s.ToString().c_str()); } - Log("[%s] End Pread %s, size %lu, offset %lu, fid %lu, res %lu, status %s, start_block %lu, end_block %lu" - ", block_size %lu\n", - cache_->WorkPath().c_str(), fname_.c_str(), n, offset, fid, - result->size(), s.ToString().c_str(), - begin, end, cache_->options_.block_size); + //Log("[%s] Done Pread %s, size %lu, offset %lu, fid %lu, res %lu, status %s, start_block %lu, end_block %lu" + // ", block_size %lu\n", + // cache_->WorkPath().c_str(), fname_.c_str(), n, offset, fid, + // result->size(), s.ToString().c_str(), + // begin, end, cache_->options_.block_size); return s; } @@ -895,11 +931,13 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { uint64_t offset = block->block_idx * cache_->options_.block_size; size_t n = cache_->options_.block_size; block->s = dfs_file_->Read(offset, n, &result, scratch); - Log("[%s] dfs read, %s" - ", offset %lu, size %lu, status %s, res %lu\n", - cache_->WorkPath().c_str(), block->ToString().c_str(), - offset, n, - 
block->s.ToString().c_str(), result.size()); + if (!block->s.ok()) { + Log("[%s] dfs read, %s" + ", offset %lu, size %lu, status %s, res %lu\n", + cache_->WorkPath().c_str(), block->ToString().c_str(), + offset, n, + block->s.ToString().c_str(), result.size()); + } MutexLock lockgard(&cache_->mu_); block->Clear(kCacheBlockDfsRead); @@ -910,23 +948,58 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { struct AsyncCacheReader { BlockCacheRandomAccessFile* file; CacheBlock* block; + + // aio spec + struct aiocb aio_context; }; + + // use use thread module to enhance sync io static void AsyncCacheRead(void* arg) { AsyncCacheReader* reader = (AsyncCacheReader*)arg; reader->file->HandleCacheRead(reader); delete reader; - return; } void HandleCacheRead(AsyncCacheReader* reader) { CacheBlock* block = reader->block; - block->s = cache_->ReadCache(block); + block->s = cache_->ReadCache(block, NULL); MutexLock lockgard(&cache_->mu_); block->Clear(kCacheBlockCacheRead); block->cv.SignalAll(); //Log("[%s] async.cacheread signal, %s\n", cache_->WorkPath().c_str(), // block->ToString().c_str()); - return; + } + + // support aio engine + static void AioCacheReadCallback(sigval_t sigval) { // kernel create thread + AsyncCacheReader* reader = (AsyncCacheReader*)sigval.sival_ptr; + reader->file->HandleAioCacheReadCallback(reader); + delete reader; + } + void HandleAioCacheReadCallback(AsyncCacheReader* reader) { + CacheBlock* block = reader->block; + assert(aio_error(&reader->aio_context) == 0); + //while (aio_error(reader->aio_context) == EINPROGRESS); + ssize_t res = aio_return(&reader->aio_context); + block->s = res < 0? 
Status::Corruption("AioReadCache error") : Status::OK(); + + MutexLock lockgard(&cache_->mu_); + block->Clear(kCacheBlockCacheRead); + block->cv.SignalAll(); + if (!block->s.ok()) { + Log("[%s] aio.cacheread signal, %s\n", cache_->WorkPath().c_str(), + block->ToString().c_str()); + } + } + void AioCacheRead(AsyncCacheReader* reader) const { + // setup sigevent + memset((char*)(&reader->aio_context), 0, sizeof(struct aiocb)); + reader->aio_context.aio_sigevent.sigev_notify = SIGEV_THREAD; + reader->aio_context.aio_sigevent.sigev_notify_function = &BlockCacheRandomAccessFile::AioCacheReadCallback; + reader->aio_context.aio_sigevent.sigev_notify_attributes = NULL; + reader->aio_context.aio_sigevent.sigev_value.sival_ptr = reader; + + cache_->ReadCache(reader->block, &reader->aio_context); } struct AsyncCacheWriter { @@ -960,6 +1033,7 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { RandomAccessFile* dfs_file_; std::string fname_; uint64_t fid_; + bool aio_enabled_; }; // Tcache impl @@ -972,9 +1046,35 @@ BlockCacheImpl::BlockCacheImpl(const BlockCacheOptions& options) bg_fill_.SetBackgroundThreads(30); bg_read_.SetBackgroundThreads(30); bg_flush_.SetBackgroundThreads(30); + bg_control_.SetBackgroundThreads(2); + stat_ = CreateDBStatistics(); } -BlockCacheImpl::~BlockCacheImpl() {} +BlockCacheImpl::~BlockCacheImpl() { + delete stat_; +} + +void BlockCacheImpl::BGControlThreadFunc(void* arg) { + reinterpret_cast(arg)->BGControlThread(); +} + +void BlockCacheImpl::BGControlThread() { + Log("[%s] statistics: %s", this->WorkPath().c_str(), + stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_QUEUE).c_str()); + Log("[%s] statistics: %s", this->WorkPath().c_str(), + stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_SSD_READ).c_str()); + Log("[%s] statistics: %s", this->WorkPath().c_str(), + stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_FILL_USER_DATA).c_str()); + Log("[%s] statistics: %s", this->WorkPath().c_str(), + 
stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_RELEASE_BLOCK).c_str()); + + // resched after 1s + stat_->ClearHistogram(TERA_BLOCK_CACHE_PREAD_QUEUE); + stat_->ClearHistogram(TERA_BLOCK_CACHE_PREAD_SSD_READ); + stat_->ClearHistogram(TERA_BLOCK_CACHE_PREAD_FILL_USER_DATA); + stat_->ClearHistogram(TERA_BLOCK_CACHE_PREAD_RELEASE_BLOCK); + bg_control_.Schedule(&BlockCacheImpl::BGControlThreadFunc, this, 10, 10000); +} Status BlockCacheImpl::NewWritableFile(const std::string& fname, WritableFile** result) { @@ -1072,9 +1172,15 @@ Status BlockCacheImpl::LockAndPut(LockContent& lc) { lc.KeyToString().c_str(), lc.ValToString().c_str(), s.ToString().c_str()); + } else if (lc.type == kDeleteDBKey) { + WriteOptions w_opts; + s = db_->Delete(w_opts, key); + Log("[%s] Delete db key : %s, val %s, status %s\n", + this->WorkPath().c_str(), + lc.KeyToString().c_str(), + lc.ValToString().c_str(), + s.ToString().c_str()); } else if (lc.type == kDataSetKey) { - std::string end_ds = "DS#"; - PutFixed64(&end_ds, lc.sid + 1); lc.data_set = new DataSet; lc.data_set->cache = New2QCache((options_.dataset_size / options_.block_size) + 1);// number of blocks in DS std::string file = options_.cache_dir + "/" + Uint64ToString(lc.sid); @@ -1090,26 +1196,32 @@ Status BlockCacheImpl::LockAndPut(LockContent& lc) { ReadOptions s_opts; leveldb::Iterator* db_it = db_->NewIterator(s_opts); for (db_it->Seek(key); - db_it->Valid() && db_it->key().ToString() < end_ds; + db_it->Valid() && db_it->key().starts_with("DS#"); db_it->Next()) { Slice lkey = db_it->key(); - lkey.remove_prefix(3 + sizeof(uint64_t));// remove DS#sid + uint64_t sid, cbi; + lkey.remove_prefix(3);// lkey = DS#, sid, cbi + sid = DecodeFixed64(lkey.data()); + lkey.remove_prefix(sizeof(uint64_t)); + cbi = DecodeFixed64(lkey.data()); //Slice lval = db_it->value(); + if (sid != lc.sid) { + break; + } CacheBlock* block = new CacheBlock(&mu_); block->DecodeFrom(db_it->value()); // get fid and block_idx std::string hkey; 
PutFixed64(&hkey, block->fid); PutFixed64(&hkey, block->block_idx); - block->sid = lc.sid; - block->cache_block_idx = DecodeFixed64(lkey.data()); + block->sid = sid; + block->cache_block_idx = cbi; block->state = (block->Test(kCacheBlockValid)) ? kCacheBlockValid : 0; Log("[%s] Recovery %s, insert cacheblock into 2QLru, %s\n", this->WorkPath().c_str(), lc.KeyToString().c_str(), block->ToString().c_str()); LRUHandle* handle = (LRUHandle*)(lc.data_set->cache->Insert(hkey, block, 1, &BlockCacheImpl::BlockDeleter)); - assert(handle != NULL); handle->cache_id = block->cache_block_idx; block->handle = handle; lc.data_set->cache->Release((Cache::Handle*)handle); @@ -1173,6 +1285,8 @@ Status BlockCacheImpl::LoadCache() { new_fid_ = prev_fid_ + options_.fid_batch_num; Log("[block_cache %s]: reuse block cache: prev_fid: %lu, new_fid: %lu\n", dbname.c_str(), prev_fid_, new_fid_); + + bg_control_.Schedule(&BlockCacheImpl::BGControlThreadFunc, this, 10, 10000); s = Status::OK(); return s; } @@ -1200,7 +1314,7 @@ Status BlockCacheImpl::FillCache(CacheBlock* block) { return Status::OK(); } -Status BlockCacheImpl::ReadCache(CacheBlock* block) { +Status BlockCacheImpl::ReadCache(CacheBlock* block, struct aiocb* aio_context) { MutexLock l(&mu_); uint64_t sid = block->sid; uint64_t cache_block_idx = block->cache_block_idx; @@ -1208,13 +1322,25 @@ Status BlockCacheImpl::ReadCache(CacheBlock* block) { mu_.Unlock(); // do io without lock - ssize_t res = pread(fd, (char*)block->data_block.data(), block->data_block.size(), - cache_block_idx * options_.block_size); - Log("[%s] cache read: sid %lu, dataset.fd %d, datablock size %lu, cb_idx %lu, %s, res %ld\n", - this->WorkPath().c_str(), sid, fd, block->data_block.size(), - cache_block_idx, - block->ToString().c_str(), - res); + ssize_t res = 0; + if (aio_context != NULL) { // support aio engine + aio_context->aio_fildes = fd; + aio_context->aio_buf = (char*)block->data_block.data(); + aio_context->aio_nbytes = block->data_block.size(); + 
aio_context->aio_offset = cache_block_idx * options_.block_size; + res = aio_read(aio_context); + } else { + res = pread(fd, (char*)block->data_block.data(), block->data_block.size(), + cache_block_idx * options_.block_size); + } + + if (res < 0) { + Log("[%s] cache read: sid %lu, dataset.fd %d, datablock size %lu, cb_idx %lu, %s, res %ld\n", + this->WorkPath().c_str(), sid, fd, block->data_block.size(), + cache_block_idx, + block->ToString().c_str(), + res); + } mu_.Lock(); if (res < 0) { @@ -1279,14 +1405,31 @@ uint64_t BlockCacheImpl::FileId(const std::string& fname) { } else { // fid in cache fid = DecodeFixed64(val.c_str()); } - Log("[%s] Fid: %lu, fname: %s\n", - this->WorkPath().c_str(), - fid, fname.c_str()); + //Log("[%s] Fid: %lu, fname: %s\n", + // this->WorkPath().c_str(), + // fid, fname.c_str()); mu_.Lock(); return fid; } +Status BlockCacheImpl::DeleteFile(const std::string& fname) { + Status s; + std::string key = "FNAME#" + fname; + ReadOptions r_opts; + std::string val; + //s = db_->Get(r_opts, key, &val); + //if (!s.ok()) { // not exist + { + MutexLock l(&mu_); + LockContent lc; + lc.type = kDeleteDBKey; + lc.db_lock_key = key; + s = LockAndPut(lc); + } + return s; +} + DataSet* BlockCacheImpl::GetDataSet(uint64_t sid) { mu_.AssertHeld(); DataSet* set = NULL; @@ -1368,7 +1511,7 @@ Status BlockCacheImpl::ReleaseBlock(CacheBlock* block, bool need_sync) { LRUHandle* h = block->handle; DataSet* ds = GetDataSet(block->sid); // get and alloc ds block->ReleaseDataBlock(); - Log("[%s] release block: %s\n", this->WorkPath().c_str(), block->ToString().c_str()); + //Log("[%s] release block: %s\n", this->WorkPath().c_str(), block->ToString().c_str()); block->s = Status::OK(); // clear io status block->cv.SignalAll(); ds->cache->Release((Cache::Handle*)h); diff --git a/src/leveldb/util/cache.cc b/src/leveldb/util/cache.cc index 99c2dfa90..97e070bf3 100644 --- a/src/leveldb/util/cache.cc +++ b/src/leveldb/util/cache.cc @@ -274,17 +274,15 @@ class LRU2QCache: 
public Cache { ~LRU2QCache() {} // Like Cache methods, but with an extra "hash" parameter. - Cache::Handle* Insert(const Slice& key, void* value, size_t charge, - void (*deleter)(const Slice& key, void* value)) { + Cache::Handle* Insert(const Slice& key, void* value, size_t cache_id, + void (*deleter)(const Slice& key, void* value)) { const uint32_t hash = HashSlice(key); MutexLock l(&mutex_); LRUHandle* e = NULL; e = table_.Lookup(key, hash); - if (e != NULL) { - return reinterpret_cast(NULL); - } + assert(e == NULL); - if (usage_ < capacity_) { // cache full + if (usage_ < capacity_) { // cache not full e = reinterpret_cast( malloc(sizeof(LRUHandle)-1 + key.size())); e->value = value; @@ -296,8 +294,8 @@ class LRU2QCache: public Cache { e->cache_id = usage_; memcpy(e->key_data, key.data(), key.size()); - assert(table_.Insert(e) == NULL); LRU_Append(e); + assert(table_.Insert(e) == NULL); usage_++; return reinterpret_cast(e); } @@ -324,12 +322,11 @@ class LRU2QCache: public Cache { table_.Remove(old->key(), old->hash); Unref(old); - assert(table_.Insert(e) == NULL); LRU_Append(e); + assert(table_.Insert(e) == NULL); return reinterpret_cast(e); } - // TODO: try wait finish - return reinterpret_cast(NULL); + return NULL; } Cache::Handle* Lookup(const Slice& key) { diff --git a/src/leveldb/util/statistics.cc b/src/leveldb/util/statistics.cc new file mode 100644 index 000000000..ee383a055 --- /dev/null +++ b/src/leveldb/util/statistics.cc @@ -0,0 +1,115 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include "leveldb/statistics.h" +#include +#include "util/histogram.h" +#include "../utils/counter.h" + +namespace leveldb { + +class StatisticsImpl : public Statistics { +public: + StatisticsImpl() {} + + ~StatisticsImpl() {} + + virtual int64_t GetTickerCount(uint32_t ticker_type) { + return counter_[ticker_type].Get(); + } + + virtual void RecordTick(uint32_t ticker_type, uint64_t count = 0) { + counter_[ticker_type].Add(count); + } + + virtual void SetTickerCount(uint32_t ticker_type, uint64_t count) { + counter_[ticker_type].Set(count); + } + + virtual void MeasureTime(uint32_t type, uint64_t time) { + hist_[type].Add(time); + } + + virtual void GetHistogramData(uint32_t type, + HistogramData* const data) { + data->median = hist_[type].Median(); + data->percentile95 = hist_[type].Percentile(95); + data->percentile99 = hist_[type].Percentile(99); + data->average = hist_[type].Average(); + data->standard_deviation = hist_[type].StandardDeviation(); + } + + virtual std::string GetHistogramString(uint32_t type) const { + return hist_[type].ToString(); + } + + virtual std::string GetBriefHistogramString(uint32_t type) { + assert(HistogramsNameMap[type].first == type); + + std::string res; + char buffer[200]; + HistogramData hData; + GetHistogramData(type, &hData); + snprintf(buffer, + 200, + "%s statistics Percentiles :=> 50 : %f 95 : %f 99 : %f\n", + HistogramsNameMap[type].second.c_str(), + hData.median, + hData.percentile95, + hData.percentile99); + res.append(buffer); + res.shrink_to_fit(); + return res; + } + + void ClearHistogram(uint32_t type) { + hist_[type].Clear(); + } + + // String representation of the statistic object. 
+ virtual std::string ToString() { + std::string res; + res.reserve(20000); + for (uint32_t i = 0; i < TickersNameMap.size(); i++) { + char buffer[200]; + snprintf(buffer, 200, "%s COUNT : %lu\n", + TickersNameMap[i].second.c_str(), GetTickerCount(TickersNameMap[i].first)); + res.append(buffer); + } + for (uint32_t i = 0; i < HistogramsNameMap.size(); i++) { + char buffer[200]; + HistogramData hData; + GetHistogramData(HistogramsNameMap[i].first, &hData); + snprintf(buffer, + 200, + "%s statistics Percentiles :=> 50 : %f 95 : %f 99 : %f\n", + HistogramsNameMap[i].second.c_str(), + hData.median, + hData.percentile95, + hData.percentile99); + res.append(buffer); + } + res.shrink_to_fit(); + return res; + } + + void ClearAll() { + for (uint32_t i = 0; i < TICKER_ENUM_MAX; i++) { + counter_[i].Clear(); + } + for (uint32_t i = 0; i < HISTOGRAM_ENUM_MAX; i++) { + hist_[i].Clear(); + } + } + +private: + tera::Counter counter_[TICKER_ENUM_MAX]; + Histogram hist_[HISTOGRAM_ENUM_MAX]; +}; + +Statistics* CreateDBStatistics() { + return new StatisticsImpl; +} + +} // namespace leveldb diff --git a/src/tera_flags.cc b/src/tera_flags.cc index 6a0a14ce5..e4f77130a 100644 --- a/src/tera_flags.cc +++ b/src/tera_flags.cc @@ -64,7 +64,7 @@ DEFINE_int32(tera_leveldb_env_dfs_seek_latency, 10000000, "the random access lat DEFINE_int32(tera_memenv_table_cache_size, 100, "the max open file number in leveldb table_cache"); DEFINE_int32(tera_memenv_block_cache_size, 10000, "block cache size for leveldb which do not use share block cache"); DEFINE_bool(tera_use_flash_for_memenv, true, "Use flashenv for memery lg"); -DEFINE_int32(tera_leveldb_block_cache_env_num_thread, 30, "thread num of Tcache"); +DEFINE_int32(tera_leveldb_block_cache_env_thread_num, 30, "thread num of Tcache"); DEFINE_string(tera_leveldb_compact_strategy, "default", "the default strategy to drive consum compaction, should be [default|LG|dummy]"); DEFINE_bool(tera_leveldb_verify_checksums, true, "enable verify data read 
from storage against checksums"); From 716270e59fef753042882276b06fdf571f733cfd Mon Sep 17 00:00:00 2001 From: caijieming Date: Tue, 22 Aug 2017 01:41:15 +0800 Subject: [PATCH 08/19] issue=1258, Tcache support block-level cache evict bugfix: 1. cache reload core 2. support aio engine 3. cache fill TEST PASS --- src/leveldb/Makefile | 4 ++-- src/leveldb/util/block_cache.cc | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/leveldb/Makefile b/src/leveldb/Makefile index 9073e98a5..72e322d16 100644 --- a/src/leveldb/Makefile +++ b/src/leveldb/Makefile @@ -7,7 +7,7 @@ # to switch between compilation modes. # OPT ?= -O2 -DNDEBUG # (A) Production use (optimized mode) -OPT ?= -std=gnu++11 -g2 -Wall -Werror # (B) Debug mode, w/ full line-level debugging symbols +OPT ?= -g2 -Wall -Werror # (B) Debug mode, w/ full line-level debugging symbols # OPT ?= -O2 -g2 -DNDEBUG # (C) Profiling mode: opt, but w/debugging symbols #----------------------------------------------- @@ -19,7 +19,7 @@ include ../../depends.mk include build_config.mk CFLAGS += -I. -I./include $(PLATFORM_CCFLAGS) $(OPT) -CXXFLAGS += -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT) +CXXFLAGS += -I. 
-I./include $(PLATFORM_CXXFLAGS) $(OPT) -std=gnu++11 LDFLAGS += $(PLATFORM_LDFLAGS) -L$(SNAPPY_LIBDIR) -lrt -ldl -lsnappy LIBS += $(PLATFORM_LIBS) diff --git a/src/leveldb/util/block_cache.cc b/src/leveldb/util/block_cache.cc index 14722f98a..9d90f219a 100644 --- a/src/leveldb/util/block_cache.cc +++ b/src/leveldb/util/block_cache.cc @@ -641,7 +641,7 @@ class BlockCacheWritableFile : public WritableFile { port::CondVar cv(&cache_->mu_); cv.Wait(10); // timewait 10ms retry } - block->state = 0; + assert(block->state == 0); block->GetDataBlock(cache_->options_.block_size, Slice(*block_data)); cache_->mu_.Unlock(); @@ -1119,7 +1119,7 @@ Status BlockCacheImpl::LockAndPut(LockContent& lc) { Waiter* w = NULL; LockKeyMap::iterator it = lock_key_.find(key); - if (it != lock_key_.end()){ + if (it != lock_key_.end()) { w = it->second; w->wait_num ++; while (!w->done) { From e516c241a9454a2a1979603a7ccac90c2d979911 Mon Sep 17 00:00:00 2001 From: caijieming Date: Tue, 22 Aug 2017 01:47:18 +0800 Subject: [PATCH 09/19] issue=1258, Tcache support block-level cache evict --- src/leveldb/util/block_cache.cc | 3 +-- src/tabletnode/tabletnode_impl.cc | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/leveldb/util/block_cache.cc b/src/leveldb/util/block_cache.cc index 9d90f219a..65f218eb4 100644 --- a/src/leveldb/util/block_cache.cc +++ b/src/leveldb/util/block_cache.cc @@ -174,6 +174,7 @@ class BlockCacheImpl { Status NewRandomAccessFile(const std::string& fname, RandomAccessFile** result); // cache Pread + static void BlockDeleter(const Slice& key, void* v); static void BGControlThreadFunc(void* arg); @@ -424,7 +425,6 @@ class BlockCacheWriteBuffer { MutexLock l(&mu_); if (tmp_storage_ == NULL) { tmp_storage_ = new std::string(); - tmp_storage_->resize(0); block_list_.push_back(tmp_storage_); } uint32_t begin = offset_ / block_size_; @@ -438,7 +438,6 @@ class BlockCacheWriteBuffer { Slice buf(data.data() + tmp_size, data.size() - tmp_size); for (uint32_t i = 
begin + 1; i <= end; ++i) { tmp_storage_ = new std::string(); - tmp_storage_->resize(0); block_list_.push_back(tmp_storage_); if (i < end) { // last block tmp_storage_->append(buf.data(), block_size_); diff --git a/src/tabletnode/tabletnode_impl.cc b/src/tabletnode/tabletnode_impl.cc index 4d7919fd6..0cfd97c86 100644 --- a/src/tabletnode/tabletnode_impl.cc +++ b/src/tabletnode/tabletnode_impl.cc @@ -199,7 +199,7 @@ void TabletNodeImpl::InitCacheSystem() { // compitable with legacy FlashEnv leveldb::FlashEnv* flash_env = (leveldb::FlashEnv*)io::LeveldbFlashEnv(); flash_env->SetFlashPath(FLAGS_tera_tabletnode_cache_paths, - FLAGS_tera_io_cache_path_vanish_allowed); + FLAGS_tera_io_cache_path_vanish_allowed); flash_env->SetUpdateFlashThreadNumber(FLAGS_tera_tabletnode_cache_update_thread_num); flash_env->SetIfForceReadFromCache(FLAGS_tera_tabletnode_cache_force_read_from_cache); return; From a94781f95f555caf9ee343259d396923d3dcbc16 Mon Sep 17 00:00:00 2001 From: caijieming Date: Tue, 22 Aug 2017 01:57:54 +0800 Subject: [PATCH 10/19] issue=1258, Tcache support block-level cache evict --- src/tabletnode/tabletnode_impl.cc | 4 ++-- src/tera_flags.cc | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/tabletnode/tabletnode_impl.cc b/src/tabletnode/tabletnode_impl.cc index 0cfd97c86..52077a718 100644 --- a/src/tabletnode/tabletnode_impl.cc +++ b/src/tabletnode/tabletnode_impl.cc @@ -178,7 +178,7 @@ bool TabletNodeImpl::Init() { void TabletNodeImpl::InitCacheSystem() { if (FLAGS_tera_tabletnode_block_cache_enabled) { - LOG(INFO) << "Tcache: set flash path: " << FLAGS_tera_tabletnode_cache_paths; + LOG(INFO) << "t-cache: set flash path: " << FLAGS_tera_tabletnode_cache_paths; std::vector path_list; SplitString(FLAGS_tera_tabletnode_cache_paths, ";", &path_list); @@ -187,7 +187,7 @@ void TabletNodeImpl::InitCacheSystem() { posix_env->CreateDir(path_list[i]); } - LOG(INFO) << "activate Tcache system"; + LOG(INFO) << "activate t-cache system"; 
leveldb::Env* block_cache_env = io::DefaultBlockCacheEnv(); for (uint32_t i = 0; i < path_list.size(); ++i) { leveldb::BlockCacheOptions opts; diff --git a/src/tera_flags.cc b/src/tera_flags.cc index e4f77130a..949c36b98 100644 --- a/src/tera_flags.cc +++ b/src/tera_flags.cc @@ -64,7 +64,7 @@ DEFINE_int32(tera_leveldb_env_dfs_seek_latency, 10000000, "the random access lat DEFINE_int32(tera_memenv_table_cache_size, 100, "the max open file number in leveldb table_cache"); DEFINE_int32(tera_memenv_block_cache_size, 10000, "block cache size for leveldb which do not use share block cache"); DEFINE_bool(tera_use_flash_for_memenv, true, "Use flashenv for memery lg"); -DEFINE_int32(tera_leveldb_block_cache_env_thread_num, 30, "thread num of Tcache"); +DEFINE_int32(tera_leveldb_block_cache_env_thread_num, 30, "thread num of t-cache"); DEFINE_string(tera_leveldb_compact_strategy, "default", "the default strategy to drive consum compaction, should be [default|LG|dummy]"); DEFINE_bool(tera_leveldb_verify_checksums, true, "enable verify data read from storage against checksums"); @@ -202,7 +202,7 @@ DEFINE_string(tera_tabletnode_cpu_affinity_set, "1,2", "the cpu set of cpu affin DEFINE_bool(tera_tabletnode_hang_detect_enabled, false, "enable detect read/write hang"); DEFINE_int32(tera_tabletnode_hang_detect_threshold, 60000, "read/write hang detect threshold (in ms)"); -DEFINE_bool(tera_tabletnode_block_cache_enabled, true, "enable Tcache mechasism"); +DEFINE_bool(tera_tabletnode_block_cache_enabled, true, "enable t-cache mechasism"); DEFINE_string(tera_tabletnode_cache_paths, "../data/cache/", "paths for cached data storage. 
Mutiple definition like: \"./path1/;./path2/\""); DEFINE_int32(tera_tabletnode_cache_block_size, 8192, "the block size of cache system"); DEFINE_string(tera_tabletnode_cache_name, "tera.cache", "prefix name for cache name"); From 1c4682e58974a1d1c36928698ef7e8ae222f0e64 Mon Sep 17 00:00:00 2001 From: caijieming Date: Tue, 22 Aug 2017 16:08:21 +0800 Subject: [PATCH 11/19] issue=1258, Tcache support block-level cache evict --- src/leveldb/include/leveldb/statistics.h | 311 +---------------------- src/leveldb/util/block_cache.cc | 114 ++++----- src/leveldb/util/statistics.cc | 7 +- 3 files changed, 67 insertions(+), 365 deletions(-) diff --git a/src/leveldb/include/leveldb/statistics.h b/src/leveldb/include/leveldb/statistics.h index 81d4a4729..db4133e99 100644 --- a/src/leveldb/include/leveldb/statistics.h +++ b/src/leveldb/include/leveldb/statistics.h @@ -19,260 +19,12 @@ namespace leveldb { * 2. Add a readable string in TickersNameMap below for the newly added ticker. */ enum Tickers : uint32_t { - // total block cache misses - // REQUIRES: BLOCK_CACHE_MISS == BLOCK_CACHE_INDEX_MISS + - // BLOCK_CACHE_FILTER_MISS + - // BLOCK_CACHE_DATA_MISS; - BLOCK_CACHE_MISS = 0, - // total block cache hit - // REQUIRES: BLOCK_CACHE_HIT == BLOCK_CACHE_INDEX_HIT + - // BLOCK_CACHE_FILTER_HIT + - // BLOCK_CACHE_DATA_HIT; - BLOCK_CACHE_HIT, - // # of blocks added to block cache. - BLOCK_CACHE_ADD, - // # of failures when adding blocks to block cache. - BLOCK_CACHE_ADD_FAILURES, - // # of times cache miss when accessing index block from block cache. - BLOCK_CACHE_INDEX_MISS, - // # of times cache hit when accessing index block from block cache. - BLOCK_CACHE_INDEX_HIT, - // # of bytes of index blocks inserted into cache - BLOCK_CACHE_INDEX_BYTES_INSERT, - // # of bytes of index block erased from cache - BLOCK_CACHE_INDEX_BYTES_EVICT, - // # of times cache miss when accessing filter block from block cache. 
- BLOCK_CACHE_FILTER_MISS, - // # of times cache hit when accessing filter block from block cache. - BLOCK_CACHE_FILTER_HIT, - // # of bytes of bloom filter blocks inserted into cache - BLOCK_CACHE_FILTER_BYTES_INSERT, - // # of bytes of bloom filter block erased from cache - BLOCK_CACHE_FILTER_BYTES_EVICT, - // # of times cache miss when accessing data block from block cache. - BLOCK_CACHE_DATA_MISS, - // # of times cache hit when accessing data block from block cache. - BLOCK_CACHE_DATA_HIT, - // # of bytes read from cache. - BLOCK_CACHE_BYTES_READ, - // # of bytes written into cache. - BLOCK_CACHE_BYTES_WRITE, - - // # of times bloom filter has avoided file reads. - BLOOM_FILTER_USEFUL, - - // # persistent cache hit - PERSISTENT_CACHE_HIT, - // # persistent cache miss - PERSISTENT_CACHE_MISS, - - // # of memtable hits. - MEMTABLE_HIT, - // # of memtable misses. - MEMTABLE_MISS, - - // # of Get() queries served by L0 - GET_HIT_L0, - // # of Get() queries served by L1 - GET_HIT_L1, - // # of Get() queries served by L2 and up - GET_HIT_L2_AND_UP, - - /** - * COMPACTION_KEY_DROP_* count the reasons for key drop during compaction - * There are 3 reasons currently. - * 覆盖写;删除;用户函数删除 - */ - COMPACTION_KEY_DROP_NEWER_ENTRY, // key was written with a newer value. - COMPACTION_KEY_DROP_OBSOLETE, // The key is obsolete. - COMPACTION_KEY_DROP_USER, // user compaction function has dropped the key. - - // Number of keys written to the database via the Put and Write call's - NUMBER_KEYS_WRITTEN, - // Number of Keys read, - NUMBER_KEYS_READ, - // Number keys updated, if inplace update is enabled - NUMBER_KEYS_UPDATED, - // The number of uncompressed bytes issued by DB::Put(), DB::Delete(), - // DB::Merge(), and DB::Write(). - BYTES_WRITTEN, - // The number of uncompressed bytes read from DB::Get(). It could be - // either from memtables, cache, or table files. - // For the number of logical bytes read from DB::MultiGet(), - // please use NUMBER_MULTIGET_BYTES_READ. 
- BYTES_READ, - // The number of calls to seek/next/prev - NUMBER_DB_SEEK, - NUMBER_DB_NEXT, - NUMBER_DB_PREV, - // The number of calls to seek/next/prev that returned data - NUMBER_DB_SEEK_FOUND, - NUMBER_DB_NEXT_FOUND, - NUMBER_DB_PREV_FOUND, - // The number of uncompressed bytes read from an iterator. - // Includes size of key and value. - ITER_BYTES_READ, - NO_FILE_CLOSES, - NO_FILE_OPENS, - NO_FILE_ERRORS, - // DEPRECATED Time system had to wait to do LO-L1 compactions - STALL_L0_SLOWDOWN_MICROS, - // DEPRECATED Time system had to wait to move memtable to L1. - STALL_MEMTABLE_COMPACTION_MICROS, - // DEPRECATED write throttle because of too many files in L0 - STALL_L0_NUM_FILES_MICROS, - // Writer has to wait for compaction or flush to finish. - STALL_MICROS, - // The wait time for db mutex. - // Disabled by default. To enable it set stats level to kAll - DB_MUTEX_WAIT_MICROS, - RATE_LIMIT_DELAY_MILLIS, - NO_ITERATORS, // number of iterators currently open - - // Number of MultiGet calls, keys read, and bytes read - NUMBER_MULTIGET_CALLS, - NUMBER_MULTIGET_KEYS_READ, - NUMBER_MULTIGET_BYTES_READ, - - // Number of deletes records that were not required to be - // written to storage because key does not exist - NUMBER_FILTERED_DELETES, - NUMBER_MERGE_FAILURES, - SEQUENCE_NUMBER, - - // number of times bloom was checked before creating iterator on a - // file, and the number of times the check was useful in avoiding - // iterator creation (and thus likely IOPs). - BLOOM_FILTER_PREFIX_CHECKED, - BLOOM_FILTER_PREFIX_USEFUL, - - // Number of times we had to reseek inside an iteration to skip - // over large number of keys with same userkey. - NUMBER_OF_RESEEKS_IN_ITERATION, - - // Record the number of calls to GetUpadtesSince. 
Useful to keep track of - // transaction log iterator refreshes - GET_UPDATES_SINCE_CALLS, - BLOCK_CACHE_COMPRESSED_MISS, // miss in the compressed block cache - BLOCK_CACHE_COMPRESSED_HIT, // hit in the compressed block cache - // Number of blocks added to comopressed block cache - BLOCK_CACHE_COMPRESSED_ADD, - // Number of failures when adding blocks to compressed block cache - BLOCK_CACHE_COMPRESSED_ADD_FAILURES, - WAL_FILE_SYNCED, // Number of times WAL sync is done - WAL_FILE_BYTES, // Number of bytes written to WAL - - // Writes can be processed by requesting thread or by the thread at the - // head of the writers queue. - WRITE_DONE_BY_SELF, - WRITE_DONE_BY_OTHER, // Equivalent to writes done for others - WRITE_TIMEDOUT, // Number of writes ending up with timed-out. - WRITE_WITH_WAL, // Number of Write calls that request WAL - COMPACT_READ_BYTES, // Bytes read during compaction - COMPACT_WRITE_BYTES, // Bytes written during compaction - FLUSH_WRITE_BYTES, // Bytes written during flush - - // Number of table's properties loaded directly from file, without creating - // table reader object. - NUMBER_DIRECT_LOAD_TABLE_PROPERTIES, - NUMBER_SUPERVERSION_ACQUIRES, - NUMBER_SUPERVERSION_RELEASES, - NUMBER_SUPERVERSION_CLEANUPS, - NUMBER_BLOCK_NOT_COMPRESSED, - MERGE_OPERATION_TOTAL_TIME, - FILTER_OPERATION_TOTAL_TIME, - - // Row cache. 
- ROW_CACHE_HIT, - ROW_CACHE_MISS, - TICKER_ENUM_MAX }; // The order of items listed in Tickers should be the same as // the order listed in TickersNameMap const std::vector > TickersNameMap = { - {BLOCK_CACHE_MISS, "leveldb.block.cache.miss"}, - {BLOCK_CACHE_HIT, "leveldb.block.cache.hit"}, - {BLOCK_CACHE_ADD, "leveldb.block.cache.add"}, - {BLOCK_CACHE_ADD_FAILURES, "leveldb.block.cache.add.failures"}, - {BLOCK_CACHE_INDEX_MISS, "leveldb.block.cache.index.miss"}, - {BLOCK_CACHE_INDEX_HIT, "leveldb.block.cache.index.hit"}, - {BLOCK_CACHE_INDEX_BYTES_INSERT, "leveldb.block.cache.index.bytes.insert"}, - {BLOCK_CACHE_INDEX_BYTES_EVICT, "leveldb.block.cache.index.bytes.evict"}, - {BLOCK_CACHE_FILTER_MISS, "leveldb.block.cache.filter.miss"}, - {BLOCK_CACHE_FILTER_HIT, "leveldb.block.cache.filter.hit"}, - {BLOCK_CACHE_FILTER_BYTES_INSERT, - "leveldb.block.cache.filter.bytes.insert"}, - {BLOCK_CACHE_FILTER_BYTES_EVICT, "leveldb.block.cache.filter.bytes.evict"}, - {BLOCK_CACHE_DATA_MISS, "leveldb.block.cache.data.miss"}, - {BLOCK_CACHE_DATA_HIT, "leveldb.block.cache.data.hit"}, - {BLOCK_CACHE_BYTES_READ, "leveldb.block.cache.bytes.read"}, - {BLOCK_CACHE_BYTES_WRITE, "leveldb.block.cache.bytes.write"}, - {BLOOM_FILTER_USEFUL, "leveldb.bloom.filter.useful"}, - {MEMTABLE_HIT, "leveldb.memtable.hit"}, - {MEMTABLE_MISS, "leveldb.memtable.miss"}, - {GET_HIT_L0, "leveldb.l0.hit"}, - {GET_HIT_L1, "leveldb.l1.hit"}, - {GET_HIT_L2_AND_UP, "leveldb.l2andup.hit"}, - {COMPACTION_KEY_DROP_NEWER_ENTRY, "leveldb.compaction.key.drop.new"}, - {COMPACTION_KEY_DROP_OBSOLETE, "leveldb.compaction.key.drop.obsolete"}, - {COMPACTION_KEY_DROP_USER, "leveldb.compaction.key.drop.user"}, - {NUMBER_KEYS_WRITTEN, "leveldb.number.keys.written"}, - {NUMBER_KEYS_READ, "leveldb.number.keys.read"}, - {NUMBER_KEYS_UPDATED, "leveldb.number.keys.updated"}, - {BYTES_WRITTEN, "leveldb.bytes.written"}, - {BYTES_READ, "leveldb.bytes.read"}, - {NUMBER_DB_SEEK, "leveldb.number.db.seek"}, - {NUMBER_DB_NEXT, 
"leveldb.number.db.next"}, - {NUMBER_DB_PREV, "leveldb.number.db.prev"}, - {NUMBER_DB_SEEK_FOUND, "leveldb.number.db.seek.found"}, - {NUMBER_DB_NEXT_FOUND, "leveldb.number.db.next.found"}, - {NUMBER_DB_PREV_FOUND, "leveldb.number.db.prev.found"}, - {ITER_BYTES_READ, "leveldb.db.iter.bytes.read"}, - {NO_FILE_CLOSES, "leveldb.no.file.closes"}, - {NO_FILE_OPENS, "leveldb.no.file.opens"}, - {NO_FILE_ERRORS, "leveldb.no.file.errors"}, - {STALL_L0_SLOWDOWN_MICROS, "leveldb.l0.slowdown.micros"}, - {STALL_MEMTABLE_COMPACTION_MICROS, "leveldb.memtable.compaction.micros"}, - {STALL_L0_NUM_FILES_MICROS, "leveldb.l0.num.files.stall.micros"}, - {STALL_MICROS, "leveldb.stall.micros"}, - {DB_MUTEX_WAIT_MICROS, "leveldb.db.mutex.wait.micros"}, - {RATE_LIMIT_DELAY_MILLIS, "leveldb.rate.limit.delay.millis"}, - {NO_ITERATORS, "leveldb.num.iterators"}, - {NUMBER_MULTIGET_CALLS, "leveldb.number.multiget.get"}, - {NUMBER_MULTIGET_KEYS_READ, "leveldb.number.multiget.keys.read"}, - {NUMBER_MULTIGET_BYTES_READ, "leveldb.number.multiget.bytes.read"}, - {NUMBER_FILTERED_DELETES, "leveldb.number.deletes.filtered"}, - {NUMBER_MERGE_FAILURES, "leveldb.number.merge.failures"}, - {SEQUENCE_NUMBER, "leveldb.sequence.number"}, - {BLOOM_FILTER_PREFIX_CHECKED, "leveldb.bloom.filter.prefix.checked"}, - {BLOOM_FILTER_PREFIX_USEFUL, "leveldb.bloom.filter.prefix.useful"}, - {NUMBER_OF_RESEEKS_IN_ITERATION, "leveldb.number.reseeks.iteration"}, - {GET_UPDATES_SINCE_CALLS, "leveldb.getupdatessince.calls"}, - {BLOCK_CACHE_COMPRESSED_MISS, "leveldb.block.cachecompressed.miss"}, - {BLOCK_CACHE_COMPRESSED_HIT, "leveldb.block.cachecompressed.hit"}, - {BLOCK_CACHE_COMPRESSED_ADD, "leveldb.block.cachecompressed.add"}, - {BLOCK_CACHE_COMPRESSED_ADD_FAILURES, - "leveldb.block.cachecompressed.add.failures"}, - {WAL_FILE_SYNCED, "leveldb.wal.synced"}, - {WAL_FILE_BYTES, "leveldb.wal.bytes"}, - {WRITE_DONE_BY_SELF, "leveldb.write.self"}, - {WRITE_DONE_BY_OTHER, "leveldb.write.other"}, - {WRITE_WITH_WAL, 
"leveldb.write.wal"}, - {FLUSH_WRITE_BYTES, "leveldb.flush.write.bytes"}, - {COMPACT_READ_BYTES, "leveldb.compact.read.bytes"}, - {COMPACT_WRITE_BYTES, "leveldb.compact.write.bytes"}, - {NUMBER_DIRECT_LOAD_TABLE_PROPERTIES, - "leveldb.number.direct.load.table.properties"}, - {NUMBER_SUPERVERSION_ACQUIRES, "leveldb.number.superversion_acquires"}, - {NUMBER_SUPERVERSION_RELEASES, "leveldb.number.superversion_releases"}, - {NUMBER_SUPERVERSION_CLEANUPS, "leveldb.number.superversion_cleanups"}, - {NUMBER_BLOCK_NOT_COMPRESSED, "leveldb.number.block.not_compressed"}, - {MERGE_OPERATION_TOTAL_TIME, "leveldb.merge.operation.time.nanos"}, - {FILTER_OPERATION_TOTAL_TIME, "leveldb.filter.operation.time.nanos"}, - {ROW_CACHE_HIT, "leveldb.row.cache.hit"}, - {ROW_CACHE_MISS, "leveldb.row.cache.miss"}, }; /** @@ -283,74 +35,25 @@ const std::vector > TickersNameMap = { * And increment HISTOGRAM_ENUM_MAX */ enum Histograms : uint32_t { - DB_GET = 0, - DB_WRITE, - COMPACTION_TIME, - SUBCOMPACTION_SETUP_TIME, - TABLE_SYNC_MICROS, - COMPACTION_OUTFILE_SYNC_MICROS, - WAL_FILE_SYNC_MICROS, - MANIFEST_FILE_SYNC_MICROS, - // TIME SPENT IN IO DURING TABLE OPEN - TABLE_OPEN_IO_MICROS, - DB_MULTIGET, - READ_BLOCK_COMPACTION_MICROS, - READ_BLOCK_GET_MICROS, - WRITE_RAW_BLOCK_MICROS, - STALL_L0_SLOWDOWN_COUNT, - STALL_MEMTABLE_COMPACTION_COUNT, - STALL_L0_NUM_FILES_COUNT, - HARD_RATE_LIMIT_DELAY_COUNT, - SOFT_RATE_LIMIT_DELAY_COUNT, - NUM_FILES_IN_SINGLE_COMPACTION, - DB_SEEK, - WRITE_STALL, - SST_READ_MICROS, - // The number of subcompactions actually scheduled during a compaction - NUM_SUBCOMPACTIONS_SCHEDULED, - // Value size distribution in each operation - BYTES_PER_READ, - BYTES_PER_WRITE, - BYTES_PER_MULTIGET, // tera block cache spec - TERA_BLOCK_CACHE_PREAD_QUEUE, + TERA_BLOCK_CACHE_PREAD_QUEUE = 0, TERA_BLOCK_CACHE_PREAD_SSD_READ, TERA_BLOCK_CACHE_PREAD_FILL_USER_DATA, TERA_BLOCK_CACHE_PREAD_RELEASE_BLOCK, + TERA_BLOCK_CACHE_LOCKMAP_DS_RELOAD_NR, + TERA_BLOCK_CACHE_PREAD_GET_BLOCK, 
+ TERA_BLOCK_CACHE_PREAD_BLOCK_NR, HISTOGRAM_ENUM_MAX, // TODO(ldemailly): enforce HistogramsNameMap match }; const std::vector > HistogramsNameMap = { - {DB_GET, "leveldb.db.get.micros"}, - {DB_WRITE, "leveldb.db.write.micros"}, - {COMPACTION_TIME, "leveldb.compaction.times.micros"}, - {SUBCOMPACTION_SETUP_TIME, "leveldb.subcompaction.setup.times.micros"}, - {TABLE_SYNC_MICROS, "leveldb.table.sync.micros"}, - {COMPACTION_OUTFILE_SYNC_MICROS, "leveldb.compaction.outfile.sync.micros"}, - {WAL_FILE_SYNC_MICROS, "leveldb.wal.file.sync.micros"}, - {MANIFEST_FILE_SYNC_MICROS, "leveldb.manifest.file.sync.micros"}, - {TABLE_OPEN_IO_MICROS, "leveldb.table.open.io.micros"}, - {DB_MULTIGET, "leveldb.db.multiget.micros"}, - {READ_BLOCK_COMPACTION_MICROS, "leveldb.read.block.compaction.micros"}, - {READ_BLOCK_GET_MICROS, "leveldb.read.block.get.micros"}, - {WRITE_RAW_BLOCK_MICROS, "leveldb.write.raw.block.micros"}, - {STALL_L0_SLOWDOWN_COUNT, "leveldb.l0.slowdown.count"}, - {STALL_MEMTABLE_COMPACTION_COUNT, "leveldb.memtable.compaction.count"}, - {STALL_L0_NUM_FILES_COUNT, "leveldb.num.files.stall.count"}, - {HARD_RATE_LIMIT_DELAY_COUNT, "leveldb.hard.rate.limit.delay.count"}, - {SOFT_RATE_LIMIT_DELAY_COUNT, "leveldb.soft.rate.limit.delay.count"}, - {NUM_FILES_IN_SINGLE_COMPACTION, "leveldb.numfiles.in.singlecompaction"}, - {DB_SEEK, "leveldb.db.seek.micros"}, - {WRITE_STALL, "leveldb.db.write.stall"}, - {SST_READ_MICROS, "leveldb.sst.read.micros"}, - {NUM_SUBCOMPACTIONS_SCHEDULED, "leveldb.num.subcompactions.scheduled"}, - {BYTES_PER_READ, "leveldb.bytes.per.read"}, - {BYTES_PER_WRITE, "leveldb.bytes.per.write"}, - {BYTES_PER_MULTIGET, "leveldb.bytes.per.multiget"}, {TERA_BLOCK_CACHE_PREAD_QUEUE, "tera.block_cache.pread_queue"}, {TERA_BLOCK_CACHE_PREAD_SSD_READ, "tera.block_cache.pread_ssd_read"}, {TERA_BLOCK_CACHE_PREAD_FILL_USER_DATA, "tera.block_cache.pread_fill_user_data"}, {TERA_BLOCK_CACHE_PREAD_RELEASE_BLOCK, "tera.block_cache.pread_release_block"}, + 
{TERA_BLOCK_CACHE_LOCKMAP_DS_RELOAD_NR, "tera.block_cache.lockmap_ds_reload_nr"}, + {TERA_BLOCK_CACHE_PREAD_GET_BLOCK, "tera.block_cache.pread_get_block"}, + {TERA_BLOCK_CACHE_PREAD_BLOCK_NR, "tera.block_cache.pread_block_nr"}, }; struct HistogramData { diff --git a/src/leveldb/util/block_cache.cc b/src/leveldb/util/block_cache.cc index 65f218eb4..0ad4b6f35 100644 --- a/src/leveldb/util/block_cache.cc +++ b/src/leveldb/util/block_cache.cc @@ -524,28 +524,7 @@ class BlockCacheWritableFile : public WritableFile { return; } - ~BlockCacheWritableFile() { - if (dfs_file_ != NULL) { - Log("[%s] dfs close for release %s\n", cache_->WorkPath().c_str(), fname_.c_str()); - dfs_file_->Close(); - delete dfs_file_; - dfs_file_ = NULL; - } - - Log("[%s] begin release %s\n", cache_->WorkPath().c_str(), fname_.c_str()); - MutexLock lockgard(&cache_->mu_); - uint64_t block_idx; - std::string* block_data = write_buffer_.PopBackBlock(&block_idx); - if (block_data != NULL) { - FillCache(block_data, block_idx); - } - - while (bg_block_flush_ > 0) { - bg_cv_.Wait(); - } - Log("[%s] end release %s\n", cache_->WorkPath().c_str(), fname_.c_str()); - return; - } + ~BlockCacheWritableFile() { Close(); } Status Append(const Slice& data) { Status s = dfs_file_->Append(data); @@ -564,10 +543,12 @@ class BlockCacheWritableFile : public WritableFile { } Status Close() { - Log("[%s] begin close %s\n", cache_->WorkPath().c_str(), fname_.c_str()); - Status s = dfs_file_->Close(); - delete dfs_file_; - dfs_file_ = NULL; + Status s; + if (dfs_file_ != NULL) { + s = dfs_file_->Close(); + delete dfs_file_; + dfs_file_ = NULL; + } MutexLock lockgard(&cache_->mu_); uint64_t block_idx; @@ -579,8 +560,8 @@ class BlockCacheWritableFile : public WritableFile { while (bg_block_flush_ > 0) { bg_cv_.Wait(); } - Log("[%s] end close %s, status %s\n", cache_->WorkPath().c_str(), fname_.c_str(), - s.ToString().c_str()); + //Log("[%s] end close %s, status %s\n", cache_->WorkPath().c_str(), fname_.c_str(), + // 
s.ToString().c_str()); return s; } @@ -640,7 +621,7 @@ class BlockCacheWritableFile : public WritableFile { port::CondVar cv(&cache_->mu_); cv.Wait(10); // timewait 10ms retry } - assert(block->state == 0); + block->state = 0; block->GetDataBlock(cache_->options_.block_size, Slice(*block_data)); cache_->mu_.Unlock(); @@ -686,7 +667,7 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { MutexLock lockgard(&cache_->mu_); fid_ = cache_->FileId(fname_); - aio_enabled_ = true; + aio_enabled_ = false; return; } @@ -715,6 +696,7 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { uint64_t start_ts = cache_->options_.cache_env->NowMicros(); MutexLock lockgard(&cache_->mu_); for (uint64_t block_idx = begin; block_idx <= end; ++block_idx) { + uint64_t get_block_ts = cache_->options_.cache_env->NowMicros(); CacheBlock* block = NULL; while ((block = cache_->GetAndAllocBlock(fid, block_idx)) == NULL) { Log("[%s] fill cache for read %s, fid %lu, block_idx %lu, wait 10ms after retry\n", @@ -742,10 +724,13 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { // cache_->WorkPath().c_str(), block->ToString().c_str(), // block->handle->refs, block->data_block_refs, // block->data_block_alloc); + cache_->stat_->MeasureTime(TERA_BLOCK_CACHE_PREAD_GET_BLOCK, + cache_->options_.cache_env->NowMicros() - get_block_ts); } cache_->mu_.Unlock(); uint64_t queue_ts = cache_->options_.cache_env->NowMicros(); cache_->stat_->MeasureTime(TERA_BLOCK_CACHE_PREAD_QUEUE, queue_ts - start_ts); + cache_->stat_->MeasureTime(TERA_BLOCK_CACHE_PREAD_BLOCK_NR, end - begin + 1); // async read miss data for (uint32_t i = 0; i < c_miss.size(); ++i) { @@ -1058,21 +1043,24 @@ void BlockCacheImpl::BGControlThreadFunc(void* arg) { } void BlockCacheImpl::BGControlThread() { - Log("[%s] statistics: %s", this->WorkPath().c_str(), - stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_QUEUE).c_str()); - Log("[%s] statistics: %s", this->WorkPath().c_str(), - 
stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_SSD_READ).c_str()); - Log("[%s] statistics: %s", this->WorkPath().c_str(), - stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_FILL_USER_DATA).c_str()); - Log("[%s] statistics: %s", this->WorkPath().c_str(), - stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_RELEASE_BLOCK).c_str()); + Log("[%s] statistics: %s, %s, %s, %s, %s, %s, %s\n", this->WorkPath().c_str(), + stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_QUEUE).c_str(), + stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_SSD_READ).c_str(), + stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_FILL_USER_DATA).c_str(), + stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_RELEASE_BLOCK).c_str(), + stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_LOCKMAP_DS_RELOAD_NR).c_str(), + stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_GET_BLOCK).c_str(), + stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_BLOCK_NR).c_str()); // resched after 1s stat_->ClearHistogram(TERA_BLOCK_CACHE_PREAD_QUEUE); stat_->ClearHistogram(TERA_BLOCK_CACHE_PREAD_SSD_READ); stat_->ClearHistogram(TERA_BLOCK_CACHE_PREAD_FILL_USER_DATA); stat_->ClearHistogram(TERA_BLOCK_CACHE_PREAD_RELEASE_BLOCK); - bg_control_.Schedule(&BlockCacheImpl::BGControlThreadFunc, this, 10, 10000); + stat_->ClearHistogram(TERA_BLOCK_CACHE_LOCKMAP_DS_RELOAD_NR); + stat_->ClearHistogram(TERA_BLOCK_CACHE_PREAD_GET_BLOCK); + stat_->ClearHistogram(TERA_BLOCK_CACHE_PREAD_BLOCK_NR); + bg_control_.Schedule(&BlockCacheImpl::BGControlThreadFunc, this, 10, 6000); } Status BlockCacheImpl::NewWritableFile(const std::string& fname, @@ -1192,6 +1180,7 @@ Status BlockCacheImpl::LockAndPut(LockContent& lc) { lc.data_set->fd); // reload hash lru + uint64_t total_items = 0; ReadOptions s_opts; leveldb::Iterator* db_it = db_->NewIterator(s_opts); for (db_it->Seek(key); @@ -1207,6 +1196,7 @@ Status BlockCacheImpl::LockAndPut(LockContent& lc) { if (sid != lc.sid) { break; } + total_items++; CacheBlock* 
block = new CacheBlock(&mu_); block->DecodeFrom(db_it->value()); // get fid and block_idx @@ -1216,16 +1206,17 @@ Status BlockCacheImpl::LockAndPut(LockContent& lc) { block->sid = sid; block->cache_block_idx = cbi; block->state = (block->Test(kCacheBlockValid)) ? kCacheBlockValid : 0; - Log("[%s] Recovery %s, insert cacheblock into 2QLru, %s\n", - this->WorkPath().c_str(), - lc.KeyToString().c_str(), - block->ToString().c_str()); + //Log("[%s] Recovery %s, insert cacheblock into 2QLru, %s\n", + // this->WorkPath().c_str(), + // lc.KeyToString().c_str(), + // block->ToString().c_str()); LRUHandle* handle = (LRUHandle*)(lc.data_set->cache->Insert(hkey, block, 1, &BlockCacheImpl::BlockDeleter)); handle->cache_id = block->cache_block_idx; block->handle = handle; lc.data_set->cache->Release((Cache::Handle*)handle); } delete db_it; + stat_->MeasureTime(TERA_BLOCK_CACHE_LOCKMAP_DS_RELOAD_NR, total_items); mu_.Lock(); data_set_map_[lc.sid] = lc.data_set; @@ -1285,7 +1276,7 @@ Status BlockCacheImpl::LoadCache() { Log("[block_cache %s]: reuse block cache: prev_fid: %lu, new_fid: %lu\n", dbname.c_str(), prev_fid_, new_fid_); - bg_control_.Schedule(&BlockCacheImpl::BGControlThreadFunc, this, 10, 10000); + bg_control_.Schedule(&BlockCacheImpl::BGControlThreadFunc, this, 10, 6000); s = Status::OK(); return s; } @@ -1462,24 +1453,33 @@ CacheBlock* BlockCacheImpl::GetAndAllocBlock(uint64_t fid, uint64_t block_idx) { CacheBlock* block = NULL; DataSet* ds = GetDataSet(sid); // get and alloc ds Cache* cache = ds->cache; + mu_.Unlock(); + LRUHandle* h = (LRUHandle*)cache->Lookup(key); if (h == NULL) { - block = new CacheBlock(&mu_); - h = (LRUHandle*)cache->Insert(key, block, 1, &BlockCacheImpl::BlockDeleter); + mu_.Lock(); + h = (LRUHandle*)cache->Lookup(key); if (h == NULL) { - delete block; - return NULL; + block = new CacheBlock(&mu_); + h = (LRUHandle*)cache->Insert(key, block, 1, &BlockCacheImpl::BlockDeleter); + if (h == NULL) { + delete block; + return NULL; + } + block->fid = 
fid; + block->block_idx = block_idx; + block->sid = sid; + block->cache_block_idx = h->cache_id; + block->handle = h; + Log("[%s] Alloc Block: %s, sid %lu, fid %lu, block_idx %lu, hash %u, dataset_nr %lu\n", + this->WorkPath().c_str(), + block->ToString().c_str(), + sid, fid, block_idx, hash, options_.dataset_num); + } else { + block = reinterpret_cast(cache->Value((Cache::Handle*)h)); } - block->fid = fid; - block->block_idx = block_idx; - block->sid = sid; - block->cache_block_idx = h->cache_id; - block->handle = h; - Log("[%s] Alloc Block: %s, sid %lu, fid %lu, block_idx %lu, hash %u, dataset_nr %lu\n", - this->WorkPath().c_str(), - block->ToString().c_str(), - sid, fid, block_idx, hash, options_.dataset_num); } else { + mu_.Lock(); block = reinterpret_cast(cache->Value((Cache::Handle*)h)); //Log("[%s] get block from memcache, %s\n", // this->WorkPath().c_str(), block->ToString().c_str()); diff --git a/src/leveldb/util/statistics.cc b/src/leveldb/util/statistics.cc index ee383a055..74352ae3a 100644 --- a/src/leveldb/util/statistics.cc +++ b/src/leveldb/util/statistics.cc @@ -53,11 +53,10 @@ class StatisticsImpl : public Statistics { GetHistogramData(type, &hData); snprintf(buffer, 200, - "%s statistics Percentiles :=> 50 : %f 95 : %f 99 : %f\n", + "%s :=> %f(%f)", HistogramsNameMap[type].second.c_str(), - hData.median, - hData.percentile95, - hData.percentile99); + hData.average, + hData.percentile99 - hData.median); res.append(buffer); res.shrink_to_fit(); return res; From 749296e36083fa64dd3db493e3879dec025f317e Mon Sep 17 00:00:00 2001 From: caijieming Date: Thu, 24 Aug 2017 00:18:20 +0800 Subject: [PATCH 12/19] issue=1258, Tcache support block-level cache evict --- src/leveldb/include/leveldb/block_cache.h | 2 +- src/leveldb/include/leveldb/statistics.h | 12 +- src/leveldb/util/block_cache.cc | 167 +++++++++++++--------- src/leveldb/util/statistics.cc | 2 +- 4 files changed, 110 insertions(+), 73 deletions(-) diff --git 
a/src/leveldb/include/leveldb/block_cache.h b/src/leveldb/include/leveldb/block_cache.h index 021964db4..e331e05f4 100644 --- a/src/leveldb/include/leveldb/block_cache.h +++ b/src/leveldb/include/leveldb/block_cache.h @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. diff --git a/src/leveldb/include/leveldb/statistics.h b/src/leveldb/include/leveldb/statistics.h index db4133e99..62916892f 100644 --- a/src/leveldb/include/leveldb/statistics.h +++ b/src/leveldb/include/leveldb/statistics.h @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. @@ -43,6 +43,11 @@ enum Histograms : uint32_t { TERA_BLOCK_CACHE_LOCKMAP_DS_RELOAD_NR, TERA_BLOCK_CACHE_PREAD_GET_BLOCK, TERA_BLOCK_CACHE_PREAD_BLOCK_NR, + TERA_BLOCK_CACHE_GET_DS, + TERA_BLOCK_CACHE_DS_LRU_LOOKUP, + TERA_BLOCK_CACHE_PREAD_WAIT_UNLOCK, + TERA_BLOCK_CACHE_ALLOC_FID, + TERA_BLOCK_CACHE_GET_FID, HISTOGRAM_ENUM_MAX, // TODO(ldemailly): enforce HistogramsNameMap match }; @@ -54,6 +59,11 @@ const std::vector > HistogramsNameMap = { {TERA_BLOCK_CACHE_LOCKMAP_DS_RELOAD_NR, "tera.block_cache.lockmap_ds_reload_nr"}, {TERA_BLOCK_CACHE_PREAD_GET_BLOCK, "tera.block_cache.pread_get_block"}, {TERA_BLOCK_CACHE_PREAD_BLOCK_NR, "tera.block_cache.pread_block_nr"}, + {TERA_BLOCK_CACHE_GET_DS, "tera.block_cache.get_ds"}, + {TERA_BLOCK_CACHE_DS_LRU_LOOKUP, "tera.block_cache.ds_lru_lookup"}, + {TERA_BLOCK_CACHE_PREAD_WAIT_UNLOCK, "tera.block_cache.pread_wait_unlock"}, + {TERA_BLOCK_CACHE_ALLOC_FID, "tera.block_cache.alloc_fid"}, + {TERA_BLOCK_CACHE_GET_FID, "tera.block_cache.get_fid"}, }; struct HistogramData { diff --git a/src/leveldb/util/block_cache.cc 
b/src/leveldb/util/block_cache.cc index 0ad4b6f35..797088f84 100644 --- a/src/leveldb/util/block_cache.cc +++ b/src/leveldb/util/block_cache.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. @@ -38,7 +38,7 @@ namespace leveldb { // Tcache ///////////////////////////////////////////// uint64_t kBlockSize = 4096UL; -uint64_t kDataSetSize = 134217728UL; +uint64_t kDataSetSize = 128UL << 20; uint64_t kFidBatchNum = 100000UL; uint64_t kCacheSize = 350000000000UL; uint64_t kMetaBlockSize = 2000UL; @@ -63,6 +63,7 @@ struct CacheBlock { uint64_t sid; uint64_t cache_block_idx; volatile uint64_t state; + port::Mutex mu; port::CondVar cv; Slice data_block; bool data_block_alloc; @@ -70,31 +71,35 @@ struct CacheBlock { LRUHandle* handle; Status s; - CacheBlock(port::Mutex* mu) + CacheBlock() : fid(0), block_idx(0), sid(0xffffffffffffffff), cache_block_idx(0xffffffffffffffff), state(0), - cv(mu), + cv(&mu), data_block_alloc(false), data_block_refs(0), handle(NULL) { } bool Test(uint64_t c_state) { + mu.AssertHeld(); return (state & c_state) == c_state; } void Clear(uint64_t c_state) { + mu.AssertHeld(); state &= ~c_state; } void Set(uint64_t c_state) { + mu.AssertHeld(); state |= c_state; } void WaitOnClear(uint64_t c_state) { // access in lock + mu.AssertHeld(); while (Test(c_state)) { cv.Wait(); } @@ -507,7 +512,7 @@ class BlockCacheWritableFile : public WritableFile { public: BlockCacheWritableFile(BlockCacheImpl* c, const std::string& fname, Status* s) : cache_(c), - bg_cv_(&c->mu_), + bg_cv_(&mu_), bg_block_flush_(0), pending_block_num_(0), write_buffer_(cache_->WorkPath(), fname, cache_->options_.block_size), @@ -519,7 +524,6 @@ class BlockCacheWritableFile : public WritableFile { cache_->options_.block_size, s->ToString().c_str()); - MutexLock lockgard(&cache_->mu_); fid_ = 
cache_->FileId(fname_); return; } @@ -537,7 +541,7 @@ class BlockCacheWritableFile : public WritableFile { } write_buffer_.Append(data); - MutexLock lockgard(&cache_->mu_); + MutexLock lockgard(&mu_); MaybeScheduleBGFlush(); return Status::OK(); } @@ -550,13 +554,13 @@ class BlockCacheWritableFile : public WritableFile { dfs_file_ = NULL; } - MutexLock lockgard(&cache_->mu_); uint64_t block_idx; std::string* block_data = write_buffer_.PopBackBlock(&block_idx); if (block_data != NULL) { FillCache(block_data, block_idx); } + MutexLock lockgard(&mu_); while (bg_block_flush_ > 0) { bg_cv_.Wait(); } @@ -577,7 +581,7 @@ class BlockCacheWritableFile : public WritableFile { private: void MaybeScheduleBGFlush() { - cache_->mu_.AssertHeld(); + mu_.AssertHeld(); //Log("[%s] Maybe schedule BGFlush: %s, bg_block_flush: %u, block_nr: %u\n", // cache_->WorkPath().c_str(), // fname_.c_str(), @@ -594,12 +598,15 @@ class BlockCacheWritableFile : public WritableFile { } void BGFlush() { Log("[%s] Begin BGFlush: %s\n", cache_->WorkPath().c_str(), fname_.c_str()); - MutexLock lockgard(&cache_->mu_); + MutexLock lockgard(&mu_); uint64_t block_idx; std::string* block_data = write_buffer_.PopFrontBlock(&block_idx); if (block_data != NULL) { pending_block_num_++; + mu_.Unlock(); + FillCache(block_data, block_idx); + mu_.Lock(); pending_block_num_--; } @@ -610,7 +617,6 @@ class BlockCacheWritableFile : public WritableFile { } Status FillCache(std::string* block_data, uint64_t block_idx) { - cache_->mu_.AssertHeld(); Status s; uint64_t fid = fid_; CacheBlock* block = NULL; @@ -618,22 +624,22 @@ class BlockCacheWritableFile : public WritableFile { Log("[%s] fill cache for write %s, fid %lu, block_idx %lu, wait 10ms after retry\n", cache_->WorkPath().c_str(), fname_.c_str(), fid, block_idx); - port::CondVar cv(&cache_->mu_); - cv.Wait(10); // timewait 10ms retry + cache_->options_.cache_env->SleepForMicroseconds(10000); } + + block->mu.Lock(); block->state = 0; 
block->GetDataBlock(cache_->options_.block_size, Slice(*block_data)); - cache_->mu_.Unlock(); + block->mu.Unlock(); // Do io without lock block->s = cache_->LogRecord(block); if (block->s.ok()) { block->s = cache_->FillCache(block); - } - - cache_->mu_.Lock(); - if (block->s.ok()) { - block->state = kCacheBlockValid; + if (block->s.ok()) { + MutexLock l(&block->mu); + block->state = kCacheBlockValid; + } } s = cache_->ReleaseBlock(block, true); write_buffer_.ReleaseBlock(block_data); @@ -643,6 +649,7 @@ class BlockCacheWritableFile : public WritableFile { private: BlockCacheImpl* cache_; //port::AtomicPointer shutting_down_; + port::Mutex mu_; port::CondVar bg_cv_; // Signalled when background work finishes WritableFile* dfs_file_; // protected by cache_.mu_ @@ -665,7 +672,6 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { // cache_->options_.block_size, // s->ToString().c_str()); - MutexLock lockgard(&cache_->mu_); fid_ = cache_->FileId(fname_); aio_enabled_ = false; return; @@ -673,7 +679,6 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { ~BlockCacheRandomAccessFile() { delete dfs_file_; - return; } Status Read(uint64_t offset, size_t n, Slice* result, @@ -694,7 +699,6 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { // begin, end, cache_->options_.block_size); uint64_t start_ts = cache_->options_.cache_env->NowMicros(); - MutexLock lockgard(&cache_->mu_); for (uint64_t block_idx = begin; block_idx <= end; ++block_idx) { uint64_t get_block_ts = cache_->options_.cache_env->NowMicros(); CacheBlock* block = NULL; @@ -702,13 +706,13 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { Log("[%s] fill cache for read %s, fid %lu, block_idx %lu, wait 10ms after retry\n", cache_->WorkPath().c_str(), fname_.c_str(), fid, block_idx); - port::CondVar cv(&cache_->mu_); - cv.Wait(10); // timewait 10ms retry + cache_->options_.cache_env->SleepForMicroseconds(10000); } + + block->mu.Lock(); assert(block->fid == fid && 
block->block_idx == block_idx); block->GetDataBlock(cache_->options_.block_size, Slice()); block_queue.push_back(block); // sort by block_idx - if (!block->Test(kCacheBlockLocked) && block->Test(kCacheBlockValid)) { block->Set(kCacheBlockLocked | kCacheBlockCacheRead); @@ -719,6 +723,7 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { } else { c_locked.push_back(block); } + block->mu.Unlock(); //Log("[%s] Queue block: %s, refs %u, data_block_refs %lu, alloc %u\n", // cache_->WorkPath().c_str(), block->ToString().c_str(), @@ -727,7 +732,6 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { cache_->stat_->MeasureTime(TERA_BLOCK_CACHE_PREAD_GET_BLOCK, cache_->options_.cache_env->NowMicros() - get_block_ts); } - cache_->mu_.Unlock(); uint64_t queue_ts = cache_->options_.cache_env->NowMicros(); cache_->stat_->MeasureTime(TERA_BLOCK_CACHE_PREAD_QUEUE, queue_ts - start_ts); cache_->stat_->MeasureTime(TERA_BLOCK_CACHE_PREAD_BLOCK_NR, end - begin + 1); @@ -764,8 +768,8 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { // wait async cache read done for (uint32_t i = 0; i < c_valid.size(); ++i) { - MutexLock lockgard(&cache_->mu_); CacheBlock* block = c_valid[i]; + block->mu.Lock(); block->WaitOnClear(kCacheBlockCacheRead); assert(block->Test(kCacheBlockValid)); if (!block->s.ok() && s.ok()) { @@ -773,6 +777,7 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { } block->Clear(kCacheBlockLocked); block->cv.SignalAll(); + block->mu.Unlock(); //Log("[%s] cache read done, %s\n", // cache_->WorkPath().c_str(), // block->ToString().c_str()); @@ -782,13 +787,14 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { // wait dfs read done and async cache file for (uint32_t i = 0; i < c_miss.size(); ++i) { - MutexLock lockgard(&cache_->mu_); CacheBlock* block = c_miss[i]; + block->mu.Lock(); block->WaitOnClear(kCacheBlockDfsRead); block->Set(kCacheBlockCacheFill); if (!block->s.ok() && s.ok()) { s = block->s; // degrade read 
} + block->mu.Unlock(); Log("[%s] dfs read done, %s\n", cache_->WorkPath().c_str(), block->ToString().c_str()); @@ -808,8 +814,8 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { //uint64_t ssd_write_sched_ts = cache_->options_.cache_env->NowMicros(); for (uint32_t i = 0; i < c_miss.size(); ++i) { // wait cache fill finish - MutexLock lockgard(&cache_->mu_); CacheBlock* block = c_miss[i]; + block->mu.Lock(); block->WaitOnClear(kCacheBlockCacheFill); if (block->s.ok()) { block->Set(kCacheBlockValid); @@ -818,22 +824,25 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { } block->Clear(kCacheBlockLocked); block->cv.SignalAll(); + block->mu.Unlock(); //Log("[%s] cache fill done, %s\n", // cache_->WorkPath().c_str(), // block->ToString().c_str()); } - //uint64_t ssd_write_ts = cache_->options_.cache_env->NowMicros(); + uint64_t ssd_write_ts = cache_->options_.cache_env->NowMicros(); // wait other async read finish for (uint32_t i = 0; i < c_locked.size(); ++i) { - MutexLock lockgard(&cache_->mu_); CacheBlock* block = c_locked[i]; + block->mu.Lock(); block->WaitOnClear(kCacheBlockLocked); + block->mu.Unlock(); //Log("[%s] wait locked done, %s\n", // cache_->WorkPath().c_str(), // block->ToString().c_str()); } uint64_t wait_unlock_ts = cache_->options_.cache_env->NowMicros(); + cache_->stat_->MeasureTime(TERA_BLOCK_CACHE_PREAD_WAIT_UNLOCK, wait_unlock_ts - ssd_write_ts); // fill user mem size_t msize = 0; @@ -862,7 +871,6 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { uint64_t fill_user_data_ts = cache_->options_.cache_env->NowMicros(); cache_->stat_->MeasureTime(TERA_BLOCK_CACHE_PREAD_FILL_USER_DATA, fill_user_data_ts - wait_unlock_ts); - cache_->mu_.Lock(); for (uint32_t i = 0; i < c_miss.size(); ++i) { CacheBlock* block = c_miss[i]; //Log("[%s] wakeup for miss, %s\n", cache_->WorkPath().c_str(), block->ToString().c_str()); @@ -923,9 +931,10 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { 
block->s.ToString().c_str(), result.size()); } - MutexLock lockgard(&cache_->mu_); + block->mu.Lock(); block->Clear(kCacheBlockDfsRead); block->cv.SignalAll(); + block->mu.Unlock(); return; } @@ -947,9 +956,10 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { CacheBlock* block = reader->block; block->s = cache_->ReadCache(block, NULL); - MutexLock lockgard(&cache_->mu_); + block->mu.Lock(); block->Clear(kCacheBlockCacheRead); block->cv.SignalAll(); + block->mu.Unlock(); //Log("[%s] async.cacheread signal, %s\n", cache_->WorkPath().c_str(), // block->ToString().c_str()); } @@ -966,14 +976,15 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { //while (aio_error(reader->aio_context) == EINPROGRESS); ssize_t res = aio_return(&reader->aio_context); block->s = res < 0? Status::Corruption("AioReadCache error") : Status::OK(); - - MutexLock lockgard(&cache_->mu_); - block->Clear(kCacheBlockCacheRead); - block->cv.SignalAll(); if (!block->s.ok()) { Log("[%s] aio.cacheread signal, %s\n", cache_->WorkPath().c_str(), block->ToString().c_str()); } + + block->mu.Lock(); + block->Clear(kCacheBlockCacheRead); + block->cv.SignalAll(); + block->mu.Unlock(); } void AioCacheRead(AsyncCacheReader* reader) const { // setup sigevent @@ -1006,9 +1017,10 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { block->s = cache_->FillCache(block); } - MutexLock lockgard(&cache_->mu_); + block->mu.Lock(); block->Clear(kCacheBlockCacheFill); block->cv.SignalAll(); + block->mu.Unlock(); return; } @@ -1043,16 +1055,27 @@ void BlockCacheImpl::BGControlThreadFunc(void* arg) { } void BlockCacheImpl::BGControlThread() { - Log("[%s] statistics: %s, %s, %s, %s, %s, %s, %s\n", this->WorkPath().c_str(), + Log("[%s] statistics: " + "%s, %s, %s, %s, %s, " + "%s, %s, %s, %s, %s, " + "%s, %s\n", + this->WorkPath().c_str(), stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_QUEUE).c_str(), stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_SSD_READ).c_str(), 
stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_FILL_USER_DATA).c_str(), stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_RELEASE_BLOCK).c_str(), stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_LOCKMAP_DS_RELOAD_NR).c_str(), + stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_GET_BLOCK).c_str(), - stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_BLOCK_NR).c_str()); + stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_BLOCK_NR).c_str(), + stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_GET_DS).c_str(), + stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_DS_LRU_LOOKUP).c_str(), + stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_WAIT_UNLOCK).c_str(), + + stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_ALLOC_FID).c_str(), + stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_GET_FID).c_str()); - // resched after 1s + // resched after 6s stat_->ClearHistogram(TERA_BLOCK_CACHE_PREAD_QUEUE); stat_->ClearHistogram(TERA_BLOCK_CACHE_PREAD_SSD_READ); stat_->ClearHistogram(TERA_BLOCK_CACHE_PREAD_FILL_USER_DATA); @@ -1060,6 +1083,11 @@ void BlockCacheImpl::BGControlThread() { stat_->ClearHistogram(TERA_BLOCK_CACHE_LOCKMAP_DS_RELOAD_NR); stat_->ClearHistogram(TERA_BLOCK_CACHE_PREAD_GET_BLOCK); stat_->ClearHistogram(TERA_BLOCK_CACHE_PREAD_BLOCK_NR); + stat_->ClearHistogram(TERA_BLOCK_CACHE_GET_DS); + stat_->ClearHistogram(TERA_BLOCK_CACHE_DS_LRU_LOOKUP); + stat_->ClearHistogram(TERA_BLOCK_CACHE_PREAD_WAIT_UNLOCK); + stat_->ClearHistogram(TERA_BLOCK_CACHE_ALLOC_FID); + stat_->ClearHistogram(TERA_BLOCK_CACHE_GET_FID); bg_control_.Schedule(&BlockCacheImpl::BGControlThreadFunc, this, 10, 6000); } @@ -1198,7 +1226,7 @@ Status BlockCacheImpl::LockAndPut(LockContent& lc) { } total_items++; - CacheBlock* block = new CacheBlock(&mu_); + CacheBlock* block = new CacheBlock; block->DecodeFrom(db_it->value()); // get fid and block_idx std::string hkey; PutFixed64(&hkey, block->fid); @@ -1282,7 +1310,7 @@ Status BlockCacheImpl::LoadCache() { } Status 
BlockCacheImpl::FillCache(CacheBlock* block) { - MutexLock l(&mu_); + mu_.Lock(); uint64_t sid = block->sid; uint64_t cache_block_idx = block->cache_block_idx; int fd = (data_set_map_[sid])->fd; @@ -1297,7 +1325,6 @@ Status BlockCacheImpl::FillCache(CacheBlock* block) { block->ToString().c_str(), res); - mu_.Lock(); if (res < 0) { return Status::Corruption("FillCache error"); } @@ -1305,7 +1332,7 @@ Status BlockCacheImpl::FillCache(CacheBlock* block) { } Status BlockCacheImpl::ReadCache(CacheBlock* block, struct aiocb* aio_context) { - MutexLock l(&mu_); + mu_.Lock(); uint64_t sid = block->sid; uint64_t cache_block_idx = block->cache_block_idx; int fd = (data_set_map_[sid])->fd; @@ -1330,10 +1357,6 @@ Status BlockCacheImpl::ReadCache(CacheBlock* block, struct aiocb* aio_context) { cache_block_idx, block->ToString().c_str(), res); - } - - mu_.Lock(); - if (res < 0) { return Status::Corruption("ReadCache error"); } return Status::OK(); @@ -1341,6 +1364,7 @@ Status BlockCacheImpl::ReadCache(CacheBlock* block, struct aiocb* aio_context) { uint64_t BlockCacheImpl::AllocFileId() { // no more than fid_batch_num mu_.AssertHeld(); + uint64_t start_ts = options_.cache_env->NowMicros(); uint64_t fid = ++new_fid_; while (new_fid_ - prev_fid_ >= options_.fid_batch_num) { std::string key = "FID#"; @@ -1363,17 +1387,18 @@ uint64_t BlockCacheImpl::AllocFileId() { // no more than fid_batch_num new_fid_, prev_fid_); } + stat_->MeasureTime(TERA_BLOCK_CACHE_ALLOC_FID, + options_.cache_env->NowMicros() - start_ts); return fid; } uint64_t BlockCacheImpl::FileId(const std::string& fname) { - mu_.AssertHeld(); uint64_t fid = 0; std::string key = "FNAME#" + fname; - mu_.Unlock(); - + uint64_t start_ts = options_.cache_env->NowMicros(); ReadOptions r_opts; std::string val; + Status s = db_->Get(r_opts, key, &val); if (!s.ok()) { // not exist MutexLock l(&mu_); @@ -1395,11 +1420,12 @@ uint64_t BlockCacheImpl::FileId(const std::string& fname) { } else { // fid in cache fid = 
DecodeFixed64(val.c_str()); } + //Log("[%s] Fid: %lu, fname: %s\n", // this->WorkPath().c_str(), // fid, fname.c_str()); - - mu_.Lock(); + stat_->MeasureTime(TERA_BLOCK_CACHE_GET_FID, + options_.cache_env->NowMicros() - start_ts); return fid; } @@ -1421,9 +1447,10 @@ Status BlockCacheImpl::DeleteFile(const std::string& fname) { } DataSet* BlockCacheImpl::GetDataSet(uint64_t sid) { - mu_.AssertHeld(); DataSet* set = NULL; + uint64_t start_ts = options_.cache_env->NowMicros(); + MutexLock l(&mu_); DataSetMap::iterator it = data_set_map_.find(sid); if (it == data_set_map_.end()) { LockContent lc; @@ -1437,11 +1464,12 @@ DataSet* BlockCacheImpl::GetDataSet(uint64_t sid) { // this->WorkPath().c_str(), sid); set = it->second; } + stat_->MeasureTime(TERA_BLOCK_CACHE_GET_DS, + options_.cache_env->NowMicros() - start_ts); return set; } CacheBlock* BlockCacheImpl::GetAndAllocBlock(uint64_t fid, uint64_t block_idx) { - mu_.AssertHeld(); std::string key; PutFixed64(&key, fid); PutFixed64(&key, block_idx); @@ -1453,14 +1481,14 @@ CacheBlock* BlockCacheImpl::GetAndAllocBlock(uint64_t fid, uint64_t block_idx) { CacheBlock* block = NULL; DataSet* ds = GetDataSet(sid); // get and alloc ds Cache* cache = ds->cache; - mu_.Unlock(); + uint64_t start_ts = options_.cache_env->NowMicros(); LRUHandle* h = (LRUHandle*)cache->Lookup(key); if (h == NULL) { - mu_.Lock(); + MutexLock l(&mu_); h = (LRUHandle*)cache->Lookup(key); if (h == NULL) { - block = new CacheBlock(&mu_); + block = new CacheBlock; h = (LRUHandle*)cache->Insert(key, block, 1, &BlockCacheImpl::BlockDeleter); if (h == NULL) { delete block; @@ -1479,11 +1507,12 @@ CacheBlock* BlockCacheImpl::GetAndAllocBlock(uint64_t fid, uint64_t block_idx) { block = reinterpret_cast(cache->Value((Cache::Handle*)h)); } } else { - mu_.Lock(); block = reinterpret_cast(cache->Value((Cache::Handle*)h)); //Log("[%s] get block from memcache, %s\n", // this->WorkPath().c_str(), block->ToString().c_str()); } + 
stat_->MeasureTime(TERA_BLOCK_CACHE_DS_LRU_LOOKUP, + options_.cache_env->NowMicros() - start_ts); return block; } @@ -1497,22 +1526,20 @@ Status BlockCacheImpl::LogRecord(CacheBlock* block) { } Status BlockCacheImpl::ReleaseBlock(CacheBlock* block, bool need_sync) { - mu_.AssertHeld(); Status s; - - mu_.Unlock(); - if (need_sync) { - // TODO: dump meta into memtable + if (need_sync) { // TODO: dump meta into memtable s = LogRecord(block); } - mu_.Lock(); - LRUHandle* h = block->handle; - DataSet* ds = GetDataSet(block->sid); // get and alloc ds + block->mu.Lock(); block->ReleaseDataBlock(); - //Log("[%s] release block: %s\n", this->WorkPath().c_str(), block->ToString().c_str()); block->s = Status::OK(); // clear io status block->cv.SignalAll(); + block->mu.Unlock(); + + //Log("[%s] release block: %s\n", this->WorkPath().c_str(), block->ToString().c_str()); + LRUHandle* h = block->handle; + DataSet* ds = GetDataSet(block->sid); // get and alloc ds ds->cache->Release((Cache::Handle*)h); return s; } diff --git a/src/leveldb/util/statistics.cc b/src/leveldb/util/statistics.cc index 74352ae3a..130b06311 100644 --- a/src/leveldb/util/statistics.cc +++ b/src/leveldb/util/statistics.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. 
From ae0298a8631c58c22a938decf9b25763f9dcc15a Mon Sep 17 00:00:00 2001 From: caijieming Date: Fri, 25 Aug 2017 13:52:07 +0800 Subject: [PATCH 13/19] issue=1258, Tcache support block-level cache evict --- src/leveldb/util/block_cache.cc | 53 +++++++++++++-------------------- src/leveldb/util/cache.cc | 25 ++++++++++------ 2 files changed, 37 insertions(+), 41 deletions(-) diff --git a/src/leveldb/util/block_cache.cc b/src/leveldb/util/block_cache.cc index 797088f84..1cd0ed54e 100644 --- a/src/leveldb/util/block_cache.cc +++ b/src/leveldb/util/block_cache.cc @@ -35,7 +35,7 @@ namespace leveldb { ///////////////////////////////////////////// -// Tcache +// t-cache impl ///////////////////////////////////////////// uint64_t kBlockSize = 4096UL; uint64_t kDataSetSize = 128UL << 20; @@ -1239,6 +1239,7 @@ Status BlockCacheImpl::LockAndPut(LockContent& lc) { // lc.KeyToString().c_str(), // block->ToString().c_str()); LRUHandle* handle = (LRUHandle*)(lc.data_set->cache->Insert(hkey, block, 1, &BlockCacheImpl::BlockDeleter)); + assert((uint64_t)(lc.data_set->cache->Value((Cache::Handle*)handle)) == (uint64_t)block); handle->cache_id = block->cache_block_idx; block->handle = handle; lc.data_set->cache->Release((Cache::Handle*)handle); @@ -1381,11 +1382,11 @@ uint64_t BlockCacheImpl::AllocFileId() { // no more than fid_batch_num if (s.ok()) { prev_fid_ = DecodeFixed64(val.c_str()); } - Log("[%s] alloc fid: key %s, new_fid: %lu, prev_fid: %lu\n", - this->WorkPath().c_str(), - key.c_str(), - new_fid_, - prev_fid_); + //Log("[%s] alloc fid: key %s, new_fid: %lu, prev_fid: %lu\n", + // this->WorkPath().c_str(), + // key.c_str(), + // new_fid_, + // prev_fid_); } stat_->MeasureTime(TERA_BLOCK_CACHE_ALLOC_FID, options_.cache_env->NowMicros() - start_ts); @@ -1483,33 +1484,21 @@ CacheBlock* BlockCacheImpl::GetAndAllocBlock(uint64_t fid, uint64_t block_idx) { Cache* cache = ds->cache; uint64_t start_ts = options_.cache_env->NowMicros(); - LRUHandle* h = (LRUHandle*)cache->Lookup(key); 
- if (h == NULL) { - MutexLock l(&mu_); - h = (LRUHandle*)cache->Lookup(key); - if (h == NULL) { - block = new CacheBlock; - h = (LRUHandle*)cache->Insert(key, block, 1, &BlockCacheImpl::BlockDeleter); - if (h == NULL) { - delete block; - return NULL; - } - block->fid = fid; - block->block_idx = block_idx; - block->sid = sid; - block->cache_block_idx = h->cache_id; - block->handle = h; - Log("[%s] Alloc Block: %s, sid %lu, fid %lu, block_idx %lu, hash %u, dataset_nr %lu\n", - this->WorkPath().c_str(), - block->ToString().c_str(), - sid, fid, block_idx, hash, options_.dataset_num); - } else { - block = reinterpret_cast(cache->Value((Cache::Handle*)h)); - } + block = new CacheBlock; + LRUHandle* h = (LRUHandle*)cache->Insert(key, block, 1, &BlockCacheImpl::BlockDeleter); + if (h != NULL && ((uint64_t)(cache->Value((Cache::Handle*)h)) == (uint64_t)block)) { + block->fid = fid; + block->block_idx = block_idx; + block->sid = sid; + block->cache_block_idx = h->cache_id; + block->handle = h; + //Log("[%s] Alloc Block: %s, sid %lu, fid %lu, block_idx %lu, hash %u, dataset_nr %lu\n", + // this->WorkPath().c_str(), + // block->ToString().c_str(), + // sid, fid, block_idx, hash, options_.dataset_num); } else { - block = reinterpret_cast(cache->Value((Cache::Handle*)h)); - //Log("[%s] get block from memcache, %s\n", - // this->WorkPath().c_str(), block->ToString().c_str()); + delete block; + block = (h == NULL) ? NULL: reinterpret_cast(cache->Value((Cache::Handle*)h)); } stat_->MeasureTime(TERA_BLOCK_CACHE_DS_LRU_LOOKUP, options_.cache_env->NowMicros() - start_ts); diff --git a/src/leveldb/util/cache.cc b/src/leveldb/util/cache.cc index 97e070bf3..2595e7879 100644 --- a/src/leveldb/util/cache.cc +++ b/src/leveldb/util/cache.cc @@ -274,13 +274,16 @@ class LRU2QCache: public Cache { ~LRU2QCache() {} // Like Cache methods, but with an extra "hash" parameter. + // Notice: insert if absent,if exist, return the old one. 
Cache::Handle* Insert(const Slice& key, void* value, size_t cache_id, void (*deleter)(const Slice& key, void* value)) { const uint32_t hash = HashSlice(key); MutexLock l(&mutex_); LRUHandle* e = NULL; - e = table_.Lookup(key, hash); - assert(e == NULL); + e = (LRUHandle*)DoLookup(key, hash); + if (e != NULL) { + return reinterpret_cast(e); + } if (usage_ < capacity_) { // cache not full e = reinterpret_cast( @@ -332,13 +335,7 @@ class LRU2QCache: public Cache { Cache::Handle* Lookup(const Slice& key) { const uint32_t hash = HashSlice(key); MutexLock l(&mutex_); - LRUHandle* e = table_.Lookup(key, hash); - if (e != NULL) { - e->refs++; - LRU_Remove(e); - LRU_Append(e); - } - return reinterpret_cast(e); + return DoLookup(key, hash); } void Erase(const Slice& key) { @@ -379,6 +376,16 @@ class LRU2QCache: public Cache { } private: + Cache::Handle* DoLookup(const Slice& key, uint32_t hash) { + LRUHandle* e = table_.Lookup(key, hash); + if (e != NULL) { + e->refs++; + LRU_Remove(e); + LRU_Append(e); + } + return reinterpret_cast(e); + } + void LRU_Remove(LRUHandle* e) { e->next->prev = e->prev; e->prev->next = e->next; From 415d02651ddd71c03da7298f43c4730f2f71d08d Mon Sep 17 00:00:00 2001 From: caijieming Date: Mon, 28 Aug 2017 19:07:39 +0800 Subject: [PATCH 14/19] issue=1258, Tcache support block-level cache evict --- src/leveldb/include/leveldb/block_cache.h | 3 + src/leveldb/include/leveldb/statistics.h | 2 + src/leveldb/util/block_cache.cc | 165 ++++++++++++++-------- src/leveldb/util/cache.cc | 8 +- 4 files changed, 119 insertions(+), 59 deletions(-) diff --git a/src/leveldb/include/leveldb/block_cache.h b/src/leveldb/include/leveldb/block_cache.h index e331e05f4..7c2d2b965 100644 --- a/src/leveldb/include/leveldb/block_cache.h +++ b/src/leveldb/include/leveldb/block_cache.h @@ -84,6 +84,9 @@ class BlockCacheEnv : public EnvWrapper { // cache relatively virtual Status NewRandomAccessFile(const std::string& fname, RandomAccessFile** result); // cache Pread + virtual 
Status NewRandomAccessFile(const std::string& fname, + uint64_t fsize, + RandomAccessFile** result); // cache Pread virtual Status NewWritableFile(const std::string& fname, WritableFile** result); // cache Append diff --git a/src/leveldb/include/leveldb/statistics.h b/src/leveldb/include/leveldb/statistics.h index 62916892f..ed2898996 100644 --- a/src/leveldb/include/leveldb/statistics.h +++ b/src/leveldb/include/leveldb/statistics.h @@ -48,6 +48,7 @@ enum Histograms : uint32_t { TERA_BLOCK_CACHE_PREAD_WAIT_UNLOCK, TERA_BLOCK_CACHE_ALLOC_FID, TERA_BLOCK_CACHE_GET_FID, + TERA_BLOCK_CACHE_EVICT_NR, HISTOGRAM_ENUM_MAX, // TODO(ldemailly): enforce HistogramsNameMap match }; @@ -64,6 +65,7 @@ const std::vector > HistogramsNameMap = { {TERA_BLOCK_CACHE_PREAD_WAIT_UNLOCK, "tera.block_cache.pread_wait_unlock"}, {TERA_BLOCK_CACHE_ALLOC_FID, "tera.block_cache.alloc_fid"}, {TERA_BLOCK_CACHE_GET_FID, "tera.block_cache.get_fid"}, + {TERA_BLOCK_CACHE_EVICT_NR, "tera.block_cache.evict_nr"}, }; struct HistogramData { diff --git a/src/leveldb/util/block_cache.cc b/src/leveldb/util/block_cache.cc index 1cd0ed54e..2d664b681 100644 --- a/src/leveldb/util/block_cache.cc +++ b/src/leveldb/util/block_cache.cc @@ -15,6 +15,8 @@ #include #include +#include "../utils/counter.h" + #include "db/table_cache.h" #include "leveldb/db.h" #include "leveldb/cache.h" @@ -34,6 +36,8 @@ namespace leveldb { +::tera::Counter tera_block_cache_evict_counter; + ///////////////////////////////////////////// // t-cache impl ///////////////////////////////////////////// @@ -160,6 +164,7 @@ struct CacheBlock { }; struct DataSet { + port::Mutex mu; Cache* cache; int fd; }; @@ -178,6 +183,7 @@ class BlockCacheImpl { WritableFile** result); Status NewRandomAccessFile(const std::string& fname, + uint64_t fsize, RandomAccessFile** result); // cache Pread static void BlockDeleter(const Slice& key, void* v); @@ -363,7 +369,6 @@ Status BlockCacheEnv::LoadCache(const BlockCacheOptions& opts, const std::string 
options.cache_env = this->target(); BlockCacheImpl* cache = new BlockCacheImpl(options); Status s = cache->LoadCache(); - assert(s.ok()); cache_vec_.push_back(cache); // no need lock return s; } @@ -383,18 +388,37 @@ Status BlockCacheEnv::NewWritableFile(const std::string& fname, uint32_t hash = (Hash(fname.c_str(), fname.size(), 13)) % cache_vec_.size(); BlockCacheImpl* cache = cache_vec_[hash]; Status s = cache->NewWritableFile(fname, result); - Log("[block_cache %s] open file write: %s, hash: %u, status: %s\n", - cache->WorkPath().c_str(), fname.c_str(), hash, s.ToString().c_str()); + if (!s.ok()) { + Log("[block_cache %s] open file write fail: %s, hash: %u, status: %s\n", + cache->WorkPath().c_str(), fname.c_str(), hash, s.ToString().c_str()); + } return s; } Status BlockCacheEnv::NewRandomAccessFile(const std::string& fname, RandomAccessFile** result) { + //uint32_t hash = (Hash(fname.c_str(), fname.size(), 13)) % cache_vec_.size(); + //BlockCacheImpl* cache = cache_vec_[hash]; + //Status s = cache->NewRandomAccessFile(fname, result); + //if (!s.ok()) { + // Log("[block_cache %s] open file read fail: %s, hash: %u, status: %s\n", + // cache->WorkPath().c_str(), fname.c_str(), hash, s.ToString().c_str()); + //} + //return s; + abort(); + return Status::OK(); +} + +Status BlockCacheEnv::NewRandomAccessFile(const std::string& fname, + uint64_t fsize, + RandomAccessFile** result) { uint32_t hash = (Hash(fname.c_str(), fname.size(), 13)) % cache_vec_.size(); BlockCacheImpl* cache = cache_vec_[hash]; - Status s = cache->NewRandomAccessFile(fname, result); - //Log("[block_cache %s] open file read: %s, hash: %u, status: %s\n", - // cache->WorkPath().c_str(), fname.c_str(), hash, s.ToString().c_str()); + Status s = cache->NewRandomAccessFile(fname, fsize, result); + if (!s.ok()) { + Log("[block_cache %s] open file read fail: %s, hash: %u, status: %s, fsize %lu\n", + cache->WorkPath().c_str(), fname.c_str(), hash, s.ToString().c_str(), fsize); + } return s; } @@ -450,19 
+474,19 @@ class BlockCacheWriteBuffer { } else { // last block tmp_storage_->append(buf.data(), buf.size()); } - Log("[%s] add tmp_storage %s: offset: %lu, buf_size: %lu\n", - path_.c_str(), - file_.c_str(), - offset_, - buf.size()); + //Log("[%s] add tmp_storage %s: offset: %lu, buf_size: %lu\n", + // path_.c_str(), + // file_.c_str(), + // offset_, + // buf.size()); } } offset_ += data.size(); - Log("[%s] add record: %s, begin: %u, end: %u, offset: %lu, data_size: %lu, block_size: %u\n", - path_.c_str(), - file_.c_str(), - begin, end, - offset_ - data.size() , data.size(), block_size_); + //Log("[%s] add record: %s, begin: %u, end: %u, offset: %lu, data_size: %lu, block_size: %u\n", + // path_.c_str(), + // file_.c_str(), + // begin, end, + // offset_ - data.size() , data.size(), block_size_); return Status::OK(); } @@ -518,12 +542,14 @@ class BlockCacheWritableFile : public WritableFile { write_buffer_(cache_->WorkPath(), fname, cache_->options_.block_size), fname_(fname) { // file open *s = cache_->dfs_env_->NewWritableFile(fname_, &dfs_file_); - Log("[%s] dfs open: %s, block_size: %lu, status: %s\n", - cache_->WorkPath().c_str(), - fname.c_str(), - cache_->options_.block_size, - s->ToString().c_str()); - + if (!s->ok()) { + Log("[%s] dfs open: %s, block_size: %lu, status: %s\n", + cache_->WorkPath().c_str(), + fname.c_str(), + cache_->options_.block_size, + s->ToString().c_str()); + } + bg_status_ = *s; fid_ = cache_->FileId(fname_); return; } @@ -543,11 +569,11 @@ class BlockCacheWritableFile : public WritableFile { MutexLock lockgard(&mu_); MaybeScheduleBGFlush(); - return Status::OK(); + return s; } Status Close() { - Status s; + Status s, s1; if (dfs_file_ != NULL) { s = dfs_file_->Close(); delete dfs_file_; @@ -557,25 +583,28 @@ class BlockCacheWritableFile : public WritableFile { uint64_t block_idx; std::string* block_data = write_buffer_.PopBackBlock(&block_idx); if (block_data != NULL) { - FillCache(block_data, block_idx); + s1 = FillCache(block_data, 
block_idx); } MutexLock lockgard(&mu_); while (bg_block_flush_ > 0) { bg_cv_.Wait(); } + if (bg_status_.ok()) { + bg_status_ = s.ok() ? s1: s; + } //Log("[%s] end close %s, status %s\n", cache_->WorkPath().c_str(), fname_.c_str(), // s.ToString().c_str()); - return s; + return bg_status_; } Status Flush() { - Log("[%s] dfs flush: %s\n", cache_->WorkPath().c_str(), fname_.c_str()); + //Log("[%s] dfs flush: %s\n", cache_->WorkPath().c_str(), fname_.c_str()); return dfs_file_->Flush(); } Status Sync() { - Log("[%s] dfs sync: %s\n", cache_->WorkPath().c_str(), fname_.c_str()); + //Log("[%s] dfs sync: %s\n", cache_->WorkPath().c_str(), fname_.c_str()); return dfs_file_->Sync(); } @@ -597,7 +626,8 @@ class BlockCacheWritableFile : public WritableFile { reinterpret_cast(arg)->BGFlush(); } void BGFlush() { - Log("[%s] Begin BGFlush: %s\n", cache_->WorkPath().c_str(), fname_.c_str()); + //Log("[%s] Begin BGFlush: %s\n", cache_->WorkPath().c_str(), fname_.c_str()); + Status s; MutexLock lockgard(&mu_); uint64_t block_idx; std::string* block_data = write_buffer_.PopFrontBlock(&block_idx); @@ -605,11 +635,12 @@ class BlockCacheWritableFile : public WritableFile { pending_block_num_++; mu_.Unlock(); - FillCache(block_data, block_idx); + s = FillCache(block_data, block_idx); mu_.Lock(); pending_block_num_--; } + bg_status_ = bg_status_.ok() ? 
s: bg_status_; bg_block_flush_--; MaybeScheduleBGFlush(); bg_cv_.Signal(); @@ -651,6 +682,7 @@ class BlockCacheWritableFile : public WritableFile { //port::AtomicPointer shutting_down_; port::Mutex mu_; port::CondVar bg_cv_; // Signalled when background work finishes + Status bg_status_; WritableFile* dfs_file_; // protected by cache_.mu_ uint32_t bg_block_flush_; @@ -662,9 +694,11 @@ class BlockCacheWritableFile : public WritableFile { class BlockCacheRandomAccessFile : public RandomAccessFile { public: - BlockCacheRandomAccessFile(BlockCacheImpl* c, const std::string& fname, Status* s) + BlockCacheRandomAccessFile(BlockCacheImpl* c, const std::string& fname, + uint64_t fsize, Status* s) : cache_(c), - fname_(fname) { + fname_(fname), + fsize_(fsize) { *s = cache_->dfs_env_->NewRandomAccessFile(fname_, &dfs_file_); //Log("[%s] dfs open for read: %s, block_size: %lu, status: %s\n", // cache_->WorkPath().c_str(), @@ -795,9 +829,9 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { s = block->s; // degrade read } block->mu.Unlock(); - Log("[%s] dfs read done, %s\n", - cache_->WorkPath().c_str(), - block->ToString().c_str()); + //Log("[%s] dfs read done, %s\n", + // cache_->WorkPath().c_str(), + // block->ToString().c_str()); } //uint64_t dfs_read_ts = cache_->options_.cache_env->NowMicros(); @@ -1029,6 +1063,7 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { RandomAccessFile* dfs_file_; std::string fname_; uint64_t fid_; + uint64_t fsize_; bool aio_enabled_; }; @@ -1055,10 +1090,13 @@ void BlockCacheImpl::BGControlThreadFunc(void* arg) { } void BlockCacheImpl::BGControlThread() { + stat_->MeasureTime(TERA_BLOCK_CACHE_EVICT_NR, + tera_block_cache_evict_counter.Clear()); + Log("[%s] statistics: " "%s, %s, %s, %s, %s, " "%s, %s, %s, %s, %s, " - "%s, %s\n", + "%s, %s, %s\n", this->WorkPath().c_str(), stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_QUEUE).c_str(), stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_SSD_READ).c_str(), @@ 
-1073,7 +1111,8 @@ void BlockCacheImpl::BGControlThread() { stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_WAIT_UNLOCK).c_str(), stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_ALLOC_FID).c_str(), - stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_GET_FID).c_str()); + stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_GET_FID).c_str(), + stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_EVICT_NR).c_str()); // resched after 6s stat_->ClearHistogram(TERA_BLOCK_CACHE_PREAD_QUEUE); @@ -1088,6 +1127,7 @@ void BlockCacheImpl::BGControlThread() { stat_->ClearHistogram(TERA_BLOCK_CACHE_PREAD_WAIT_UNLOCK); stat_->ClearHistogram(TERA_BLOCK_CACHE_ALLOC_FID); stat_->ClearHistogram(TERA_BLOCK_CACHE_GET_FID); + stat_->ClearHistogram(TERA_BLOCK_CACHE_EVICT_NR); bg_control_.Schedule(&BlockCacheImpl::BGControlThreadFunc, this, 10, 6000); } @@ -1103,9 +1143,10 @@ Status BlockCacheImpl::NewWritableFile(const std::string& fname, } Status BlockCacheImpl::NewRandomAccessFile(const std::string& fname, + uint64_t fsize, RandomAccessFile** result) { Status s; - BlockCacheRandomAccessFile* file = new BlockCacheRandomAccessFile(this, fname, &s); + BlockCacheRandomAccessFile* file = new BlockCacheRandomAccessFile(this, fname, fsize, &s); *result = NULL; if (s.ok()) { *result = (RandomAccessFile*)file; @@ -1115,8 +1156,9 @@ Status BlockCacheImpl::NewRandomAccessFile(const std::string& fname, void BlockCacheImpl::BlockDeleter(const Slice& key, void* v) { CacheBlock* block = (CacheBlock*)v; - Log("Evict blockcache: %s\n", block->ToString().c_str()); + //Log("Evict blockcache: %s\n", block->ToString().c_str()); delete block; + tera_block_cache_evict_counter.Inc(); return; } @@ -1238,9 +1280,9 @@ Status BlockCacheImpl::LockAndPut(LockContent& lc) { // this->WorkPath().c_str(), // lc.KeyToString().c_str(), // block->ToString().c_str()); - LRUHandle* handle = (LRUHandle*)(lc.data_set->cache->Insert(hkey, block, 1, &BlockCacheImpl::BlockDeleter)); + LRUHandle* handle = 
(LRUHandle*)(lc.data_set->cache->Insert(hkey, block, cbi, &BlockCacheImpl::BlockDeleter)); assert((uint64_t)(lc.data_set->cache->Value((Cache::Handle*)handle)) == (uint64_t)block); - handle->cache_id = block->cache_block_idx; + assert(handle->cache_id == block->cache_block_idx); block->handle = handle; lc.data_set->cache->Release((Cache::Handle*)handle); } @@ -1320,11 +1362,11 @@ Status BlockCacheImpl::FillCache(CacheBlock* block) { // do io without lock ssize_t res = pwrite(fd, block->data_block.data(), block->data_block.size(), cache_block_idx * options_.block_size); - Log("[%s] cache fill: sid %lu, dataset.fd %d, datablock size %lu, cb_idx %lu, %s, res %ld\n", - this->WorkPath().c_str(), sid, fd, block->data_block.size(), - cache_block_idx, - block->ToString().c_str(), - res); + //Log("[%s] cache fill: sid %lu, dataset.fd %d, datablock size %lu, cb_idx %lu, %s, res %ld\n", + // this->WorkPath().c_str(), sid, fd, block->data_block.size(), + // cache_block_idx, + // block->ToString().c_str(), + // res); if (res < 0) { return Status::Corruption("FillCache error"); @@ -1484,22 +1526,31 @@ CacheBlock* BlockCacheImpl::GetAndAllocBlock(uint64_t fid, uint64_t block_idx) { Cache* cache = ds->cache; uint64_t start_ts = options_.cache_env->NowMicros(); - block = new CacheBlock; - LRUHandle* h = (LRUHandle*)cache->Insert(key, block, 1, &BlockCacheImpl::BlockDeleter); - if (h != NULL && ((uint64_t)(cache->Value((Cache::Handle*)h)) == (uint64_t)block)) { + ds->mu.Lock(); + LRUHandle* h = (LRUHandle*)cache->Lookup(key); + if (h == NULL) { + block = new CacheBlock; block->fid = fid; block->block_idx = block_idx; block->sid = sid; - block->cache_block_idx = h->cache_id; - block->handle = h; - //Log("[%s] Alloc Block: %s, sid %lu, fid %lu, block_idx %lu, hash %u, dataset_nr %lu\n", - // this->WorkPath().c_str(), - // block->ToString().c_str(), - // sid, fid, block_idx, hash, options_.dataset_num); + h = (LRUHandle*)cache->Insert(key, block, 0xffffffffffffffff, 
&BlockCacheImpl::BlockDeleter); + if (h != NULL) { + assert((uint64_t)(cache->Value((Cache::Handle*)h)) == (uint64_t)block); + block->cache_block_idx = h->cache_id; + block->handle = h; + //Log("[%s] Alloc Block: %s, sid %lu, fid %lu, block_idx %lu, hash %u, dataset_nr %lu\n", + // this->WorkPath().c_str(), + // block->ToString().c_str(), + // sid, fid, block_idx, hash, options_.dataset_num); + } else { + delete block; + block = NULL; + assert(0); + } } else { - delete block; - block = (h == NULL) ? NULL: reinterpret_cast(cache->Value((Cache::Handle*)h)); + block = reinterpret_cast(cache->Value((Cache::Handle*)h)); } + ds->mu.Unlock(); stat_->MeasureTime(TERA_BLOCK_CACHE_DS_LRU_LOOKUP, options_.cache_env->NowMicros() - start_ts); return block; diff --git a/src/leveldb/util/cache.cc b/src/leveldb/util/cache.cc index 2595e7879..c3a4b7aea 100644 --- a/src/leveldb/util/cache.cc +++ b/src/leveldb/util/cache.cc @@ -265,7 +265,8 @@ class LRU2QCache: public Cache { public: explicit LRU2QCache(size_t capacity) : capacity_(capacity), - usage_(0) { + usage_(0), + max_cache_id_(0) { // Make empty circular linked list lru_.next = &lru_; lru_.prev = &lru_; @@ -294,14 +295,16 @@ class LRU2QCache: public Cache { e->key_length = key.size(); e->hash = hash; e->refs = 2; // One from LRUCache, one for the returned handle - e->cache_id = usage_; + e->cache_id = cache_id == 0xffffffffffffffff ? usage_: cache_id; memcpy(e->key_data, key.data(), key.size()); + max_cache_id_ = max_cache_id_ < e->cache_id ? e->cache_id : max_cache_id_; LRU_Append(e); assert(table_.Insert(e) == NULL); usage_++; return reinterpret_cast(e); } + assert(max_cache_id_ + 1 == usage_); // cache full, reuse item LRUHandle* old = lru_.next; @@ -419,6 +422,7 @@ class LRU2QCache: public Cache { // mutex_ protects the following state. port::Mutex mutex_; size_t usage_; + uint64_t max_cache_id_; // Dummy head of LRU list. // lru.prev is newest entry, lru.next is oldest entry. 
From 9794ff408b81a4f7f0c686eb6d919efe96819bad Mon Sep 17 00:00:00 2001 From: caijieming Date: Tue, 29 Aug 2017 21:37:24 +0800 Subject: [PATCH 15/19] issue=1258, Tcache support block-level cache evict --- src/leveldb/util/block_cache.cc | 34 ++++++++++++++++++++++++----- src/tabletnode/remote_tabletnode.cc | 2 +- 2 files changed, 30 insertions(+), 6 deletions(-) diff --git a/src/leveldb/util/block_cache.cc b/src/leveldb/util/block_cache.cc index 2d664b681..02cce0ef4 100644 --- a/src/leveldb/util/block_cache.cc +++ b/src/leveldb/util/block_cache.cc @@ -14,6 +14,7 @@ #include #include +#include #include "../utils/counter.h" @@ -229,10 +230,12 @@ class BlockCacheImpl { port::Mutex mu_; // key lock list struct Waiter { + int wait_num; // protected by BlockCacheImpl.mu_ + + port::Mutex mu; port::CondVar cv; - int wait_num; bool done; - Waiter(port::Mutex* mu):cv(mu), wait_num(0), done(false) {} + Waiter(): wait_num(0), cv(&mu), done(false) {} }; typedef std::map LockKeyMap; LockKeyMap lock_key_; @@ -290,7 +293,7 @@ class BlockCacheImpl { return ""; } }; - typedef std::map DataSetMap; + typedef std::unordered_map DataSetMap; DataSetMap data_set_map_; Statistics* stat_; @@ -1114,6 +1117,17 @@ void BlockCacheImpl::BGControlThread() { stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_GET_FID).c_str(), stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_EVICT_NR).c_str()); + Log("[%s] statistics(meta): " + "table_cache: %lf/%lu/%lu, " + "block_cache: %lf/%lu/%lu\n", + this->WorkPath().c_str(), + options_.opts.table_cache->HitRate(true), + options_.opts.table_cache->TableEntries(), + options_.opts.table_cache->ByteSize(), + options_.opts.block_cache->HitRate(true), + options_.opts.block_cache->Entries(), + options_.opts.block_cache->TotalCharge()); + // resched after 6s stat_->ClearHistogram(TERA_BLOCK_CACHE_PREAD_QUEUE); stat_->ClearHistogram(TERA_BLOCK_CACHE_PREAD_SSD_READ); @@ -1179,10 +1193,13 @@ Status BlockCacheImpl::LockAndPut(LockContent& lc) { if (it != lock_key_.end()) { w 
= it->second; w->wait_num ++; + mu_.Unlock(); + + w->mu.Lock(); while (!w->done) { w->cv.Wait(); } - mu_.Unlock(); + w->mu.Unlock(); if (lc.type == kDBKey) { ReadOptions r_opts; @@ -1193,7 +1210,9 @@ Status BlockCacheImpl::LockAndPut(LockContent& lc) { // lc.db_val->c_str(), // s.ToString().c_str()); } else if (lc.type == kDataSetKey) { + mu_.Lock(); lc.data_set = data_set_map_[lc.sid]; + mu_.Unlock(); //Log("[%s] get dataset sid: %lu\n", // this->WorkPath().c_str(), // lc.sid); @@ -1213,7 +1232,7 @@ Status BlockCacheImpl::LockAndPut(LockContent& lc) { // key.c_str()); } } else { - w = new Waiter(&mu_); + w = new Waiter; w->wait_num = 1; lock_key_[key] = w; mu_.Unlock(); @@ -1302,11 +1321,16 @@ Status BlockCacheImpl::LockAndPut(LockContent& lc) { // key.c_str()); delete w; } else { + mu_.Unlock(); //Log("[%s] put done %s, signal all wait thread\n", // this->WorkPath().c_str(), // key.c_str()); + w->mu.Lock(); w->done = true; w->cv.SignalAll(); + w->mu.Unlock(); + + mu_.Lock(); } } return s; diff --git a/src/tabletnode/remote_tabletnode.cc b/src/tabletnode/remote_tabletnode.cc index 2d95a0e5a..a59061369 100644 --- a/src/tabletnode/remote_tabletnode.cc +++ b/src/tabletnode/remote_tabletnode.cc @@ -322,7 +322,7 @@ void RemoteTabletNode::DoReadTablet(google::protobuf::RpcController* controller, int64_t read_timeout = request->client_timeout_ms() * 1000; // ms -> us int64_t detal = get_micros() - start_micros; if (detal > read_timeout) { - VLOG(5) << "timeout, drop read request for:" << request->tablet_name() + VLOG(8) << "timeout, drop read request for:" << request->tablet_name() << ", detal(in us):" << detal << ", read_timeout(in us):" << read_timeout; is_read_timeout = true; From 13813f3a6aaf238df2ca5af04816240730681b07 Mon Sep 17 00:00:00 2001 From: caijieming Date: Wed, 30 Aug 2017 15:29:04 +0800 Subject: [PATCH 16/19] issue=1258, t-cache support block-level cache evict 1. 
lock optimize --- src/leveldb/util/block_cache.cc | 67 +++++++++++++++++++-------------- 1 file changed, 39 insertions(+), 28 deletions(-) diff --git a/src/leveldb/util/block_cache.cc b/src/leveldb/util/block_cache.cc index 02cce0ef4..52ff00e9b 100644 --- a/src/leveldb/util/block_cache.cc +++ b/src/leveldb/util/block_cache.cc @@ -74,6 +74,7 @@ struct CacheBlock { bool data_block_alloc; uint64_t data_block_refs; LRUHandle* handle; + LRUHandle* data_set_handle; Status s; CacheBlock() @@ -85,7 +86,8 @@ struct CacheBlock { cv(&mu), data_block_alloc(false), data_block_refs(0), - handle(NULL) { + handle(NULL), + data_set_handle(NULL) { } bool Test(uint64_t c_state) { @@ -165,9 +167,12 @@ struct CacheBlock { }; struct DataSet { + LRUHandle* h; port::Mutex mu; Cache* cache; int fd; + + DataSet(): h(NULL), cache(NULL), fd(-1) {} }; class BlockCacheImpl { @@ -293,8 +298,7 @@ class BlockCacheImpl { return ""; } }; - typedef std::unordered_map DataSetMap; - DataSetMap data_set_map_; + Cache* data_set_cache_; Statistics* stat_; //WritableFile* logfile_; @@ -1210,9 +1214,8 @@ Status BlockCacheImpl::LockAndPut(LockContent& lc) { // lc.db_val->c_str(), // s.ToString().c_str()); } else if (lc.type == kDataSetKey) { - mu_.Lock(); - lc.data_set = data_set_map_[lc.sid]; - mu_.Unlock(); + lc.data_set = GetDataSet(lc.sid); + assert(lc.data_set != NULL); //Log("[%s] get dataset sid: %lu\n", // this->WorkPath().c_str(), // lc.sid); @@ -1308,9 +1311,11 @@ Status BlockCacheImpl::LockAndPut(LockContent& lc) { delete db_it; stat_->MeasureTime(TERA_BLOCK_CACHE_LOCKMAP_DS_RELOAD_NR, total_items); - mu_.Lock(); - data_set_map_[lc.sid] = lc.data_set; - mu_.Unlock(); + std::string ds_key; + PutFixed64(&ds_key, lc.sid); + LRUHandle* ds_handle = (LRUHandle*)data_set_cache_->Insert(ds_key, lc.data_set, 1, NULL); + assert(ds_handle != NULL); + lc.data_set->h = ds_handle; } mu_.Lock(); @@ -1356,6 +1361,7 @@ Status BlockCacheImpl::LoadCache() { options_.meta_table_cache_size); Status s = 
DB::Open(options_.opts, dbname, &db_); assert(s.ok()); + data_set_cache_ = leveldb::NewLRUCache(128 * options_.dataset_num + 1); // recover fid std::string key = "FID#"; @@ -1377,20 +1383,20 @@ Status BlockCacheImpl::LoadCache() { } Status BlockCacheImpl::FillCache(CacheBlock* block) { - mu_.Lock(); - uint64_t sid = block->sid; uint64_t cache_block_idx = block->cache_block_idx; - int fd = (data_set_map_[sid])->fd; - mu_.Unlock(); + DataSet* ds = reinterpret_cast(data_set_cache_->Value((Cache::Handle*)block->data_set_handle)); + int fd = ds->fd; // do io without lock ssize_t res = pwrite(fd, block->data_block.data(), block->data_block.size(), cache_block_idx * options_.block_size); - //Log("[%s] cache fill: sid %lu, dataset.fd %d, datablock size %lu, cb_idx %lu, %s, res %ld\n", - // this->WorkPath().c_str(), sid, fd, block->data_block.size(), - // cache_block_idx, - // block->ToString().c_str(), - // res); + if (res < 0) { + Log("[%s] cache fill: sid %lu, dataset.fd %d, datablock size %lu, cb_idx %lu, %s, res %ld\n", + this->WorkPath().c_str(), block->sid, fd, block->data_block.size(), + cache_block_idx, + block->ToString().c_str(), + res); + } if (res < 0) { return Status::Corruption("FillCache error"); @@ -1399,11 +1405,9 @@ Status BlockCacheImpl::FillCache(CacheBlock* block) { } Status BlockCacheImpl::ReadCache(CacheBlock* block, struct aiocb* aio_context) { - mu_.Lock(); - uint64_t sid = block->sid; uint64_t cache_block_idx = block->cache_block_idx; - int fd = (data_set_map_[sid])->fd; - mu_.Unlock(); + DataSet* ds = reinterpret_cast(data_set_cache_->Value((Cache::Handle*)block->data_set_handle)); + int fd = ds->fd; // do io without lock ssize_t res = 0; @@ -1420,7 +1424,7 @@ Status BlockCacheImpl::ReadCache(CacheBlock* block, struct aiocb* aio_context) { if (res < 0) { Log("[%s] cache read: sid %lu, dataset.fd %d, datablock size %lu, cb_idx %lu, %s, res %ld\n", - this->WorkPath().c_str(), sid, fd, block->data_block.size(), + this->WorkPath().c_str(), 
block->sid, fd, block->data_block.size(), cache_block_idx, block->ToString().c_str(), res); @@ -1514,12 +1518,14 @@ Status BlockCacheImpl::DeleteFile(const std::string& fname) { } DataSet* BlockCacheImpl::GetDataSet(uint64_t sid) { + std::string key; + PutFixed64(&key, sid); DataSet* set = NULL; uint64_t start_ts = options_.cache_env->NowMicros(); - MutexLock l(&mu_); - DataSetMap::iterator it = data_set_map_.find(sid); - if (it == data_set_map_.end()) { + LRUHandle* h = (LRUHandle*)data_set_cache_->Lookup(key); + if (h == NULL) { + MutexLock l(&mu_); LockContent lc; lc.type = kDataSetKey; lc.sid = sid; @@ -1529,7 +1535,8 @@ DataSet* BlockCacheImpl::GetDataSet(uint64_t sid) { } else { //Log("[%s] get dataset from memcache, sid %lu\n", // this->WorkPath().c_str(), sid); - set = it->second; + set = reinterpret_cast(data_set_cache_->Value((Cache::Handle*)h)); + assert(set->h == h); } stat_->MeasureTime(TERA_BLOCK_CACHE_GET_DS, options_.cache_env->NowMicros() - start_ts); @@ -1562,6 +1569,7 @@ CacheBlock* BlockCacheImpl::GetAndAllocBlock(uint64_t fid, uint64_t block_idx) { assert((uint64_t)(cache->Value((Cache::Handle*)h)) == (uint64_t)block); block->cache_block_idx = h->cache_id; block->handle = h; + block->data_set_handle = ds->h; //Log("[%s] Alloc Block: %s, sid %lu, fid %lu, block_idx %lu, hash %u, dataset_nr %lu\n", // this->WorkPath().c_str(), // block->ToString().c_str(), @@ -1573,8 +1581,11 @@ CacheBlock* BlockCacheImpl::GetAndAllocBlock(uint64_t fid, uint64_t block_idx) { } } else { block = reinterpret_cast(cache->Value((Cache::Handle*)h)); + block->data_set_handle = block->data_set_handle == NULL? 
ds->h: block->data_set_handle; } ds->mu.Unlock(); + + data_set_cache_->Release((Cache::Handle*)ds->h); stat_->MeasureTime(TERA_BLOCK_CACHE_DS_LRU_LOOKUP, options_.cache_env->NowMicros() - start_ts); return block; @@ -1603,7 +1614,7 @@ Status BlockCacheImpl::ReleaseBlock(CacheBlock* block, bool need_sync) { //Log("[%s] release block: %s\n", this->WorkPath().c_str(), block->ToString().c_str()); LRUHandle* h = block->handle; - DataSet* ds = GetDataSet(block->sid); // get and alloc ds + DataSet* ds = reinterpret_cast(data_set_cache_->Value((Cache::Handle*)block->data_set_handle)); ds->cache->Release((Cache::Handle*)h); return s; } From d95181fd9b78f1588eb40ef2d30e81fc559f0efc Mon Sep 17 00:00:00 2001 From: caijieming Date: Wed, 30 Aug 2017 23:27:53 +0800 Subject: [PATCH 17/19] issue=1258, t-cache support block-level cache evict 1. lock optimize --- src/leveldb/util/block_cache.cc | 255 ++++++++++++++++++-------------- 1 file changed, 148 insertions(+), 107 deletions(-) diff --git a/src/leveldb/util/block_cache.cc b/src/leveldb/util/block_cache.cc index 52ff00e9b..7b8a6b09d 100644 --- a/src/leveldb/util/block_cache.cc +++ b/src/leveldb/util/block_cache.cc @@ -204,6 +204,12 @@ class BlockCacheImpl { Status LockAndPut(LockContent& lc); + Status GetContentAfterWait(LockContent& lc); + + Status PutContentAfterLock(LockContent& lc); + + Status ReloadDataSet(LockContent& lc); + Status FillCache(CacheBlock* block); Status ReadCache(CacheBlock* block, struct aiocb* aio_context); @@ -241,6 +247,17 @@ class BlockCacheImpl { port::CondVar cv; bool done; Waiter(): wait_num(0), cv(&mu), done(false) {} + + void Wait() { + MutexLock l(&mu); + while (!done) { cv.Wait(); } + } + + void SignalAll() { + MutexLock l(&mu); + done = true; + cv.SignalAll(); + } }; typedef std::map LockKeyMap; LockKeyMap lock_key_; @@ -936,7 +953,6 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { cache_->WorkPath().c_str(), fname_.c_str(), offset, n, s.ToString().c_str()); } - //Log("[%s] Done 
Pread %s, size %lu, offset %lu, fid %lu, res %lu, status %s, start_block %lu, end_block %lu" // ", block_size %lu\n", // cache_->WorkPath().c_str(), fname_.c_str(), n, offset, fid, @@ -1005,7 +1021,7 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { // block->ToString().c_str()); } - // support aio engine + // support posix aio engine static void AioCacheReadCallback(sigval_t sigval) { // kernel create thread AsyncCacheReader* reader = (AsyncCacheReader*)sigval.sival_ptr; reader->file->HandleAioCacheReadCallback(reader); @@ -1074,7 +1090,7 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { bool aio_enabled_; }; -// Tcache impl +// t-cache implementation BlockCacheImpl::BlockCacheImpl(const BlockCacheOptions& options) : options_(options), dfs_env_(options.env), @@ -1198,29 +1214,9 @@ Status BlockCacheImpl::LockAndPut(LockContent& lc) { w = it->second; w->wait_num ++; mu_.Unlock(); + w->Wait(); - w->mu.Lock(); - while (!w->done) { - w->cv.Wait(); - } - w->mu.Unlock(); - - if (lc.type == kDBKey) { - ReadOptions r_opts; - s = db_->Get(r_opts, key, lc.db_val); - //Log("[%s] get lock key: %s, val: %s, status: %s\n", - // this->WorkPath().c_str(), - // key.c_str(), - // lc.db_val->c_str(), - // s.ToString().c_str()); - } else if (lc.type == kDataSetKey) { - lc.data_set = GetDataSet(lc.sid); - assert(lc.data_set != NULL); - //Log("[%s] get dataset sid: %lu\n", - // this->WorkPath().c_str(), - // lc.sid); - } - + s = GetContentAfterWait(lc); mu_.Lock(); if (--w->wait_num == 0) { // last thread wait for open @@ -1240,84 +1236,7 @@ Status BlockCacheImpl::LockAndPut(LockContent& lc) { lock_key_[key] = w; mu_.Unlock(); - if (lc.type == kDBKey) { - WriteOptions w_opts; - s = db_->Put(w_opts, key, lc.db_lock_val); - if (s.ok()) { - lc.db_val->append(lc.db_lock_val.data(), lc.db_lock_val.size()); - } - Log("[%s] Insert db key : %s, val %s, status %s\n", - this->WorkPath().c_str(), - lc.KeyToString().c_str(), - lc.ValToString().c_str(), - 
s.ToString().c_str()); - } else if (lc.type == kDeleteDBKey) { - WriteOptions w_opts; - s = db_->Delete(w_opts, key); - Log("[%s] Delete db key : %s, val %s, status %s\n", - this->WorkPath().c_str(), - lc.KeyToString().c_str(), - lc.ValToString().c_str(), - s.ToString().c_str()); - } else if (lc.type == kDataSetKey) { - lc.data_set = new DataSet; - lc.data_set->cache = New2QCache((options_.dataset_size / options_.block_size) + 1);// number of blocks in DS - std::string file = options_.cache_dir + "/" + Uint64ToString(lc.sid); - lc.data_set->fd = open(file.c_str(), O_RDWR | O_CREAT, 0644); - assert(lc.data_set->fd > 0); - Log("[%s] New DataSet %s, file: %s, nr_block: %lu, fd: %d\n", - this->WorkPath().c_str(), - lc.KeyToString().c_str(), - file.c_str(), (options_.dataset_size / options_.block_size) + 1, - lc.data_set->fd); - - // reload hash lru - uint64_t total_items = 0; - ReadOptions s_opts; - leveldb::Iterator* db_it = db_->NewIterator(s_opts); - for (db_it->Seek(key); - db_it->Valid() && db_it->key().starts_with("DS#"); - db_it->Next()) { - Slice lkey = db_it->key(); - uint64_t sid, cbi; - lkey.remove_prefix(3);// lkey = DS#, sid, cbi - sid = DecodeFixed64(lkey.data()); - lkey.remove_prefix(sizeof(uint64_t)); - cbi = DecodeFixed64(lkey.data()); - //Slice lval = db_it->value(); - if (sid != lc.sid) { - break; - } - total_items++; - - CacheBlock* block = new CacheBlock; - block->DecodeFrom(db_it->value()); // get fid and block_idx - std::string hkey; - PutFixed64(&hkey, block->fid); - PutFixed64(&hkey, block->block_idx); - block->sid = sid; - block->cache_block_idx = cbi; - block->state = (block->Test(kCacheBlockValid)) ? 
kCacheBlockValid : 0; - //Log("[%s] Recovery %s, insert cacheblock into 2QLru, %s\n", - // this->WorkPath().c_str(), - // lc.KeyToString().c_str(), - // block->ToString().c_str()); - LRUHandle* handle = (LRUHandle*)(lc.data_set->cache->Insert(hkey, block, cbi, &BlockCacheImpl::BlockDeleter)); - assert((uint64_t)(lc.data_set->cache->Value((Cache::Handle*)handle)) == (uint64_t)block); - assert(handle->cache_id == block->cache_block_idx); - block->handle = handle; - lc.data_set->cache->Release((Cache::Handle*)handle); - } - delete db_it; - stat_->MeasureTime(TERA_BLOCK_CACHE_LOCKMAP_DS_RELOAD_NR, total_items); - - std::string ds_key; - PutFixed64(&ds_key, lc.sid); - LRUHandle* ds_handle = (LRUHandle*)data_set_cache_->Insert(ds_key, lc.data_set, 1, NULL); - assert(ds_handle != NULL); - lc.data_set->h = ds_handle; - } - + s = PutContentAfterLock(lc); mu_.Lock(); if (--w->wait_num == 0) { lock_key_.erase(key); @@ -1330,10 +1249,7 @@ Status BlockCacheImpl::LockAndPut(LockContent& lc) { //Log("[%s] put done %s, signal all wait thread\n", // this->WorkPath().c_str(), // key.c_str()); - w->mu.Lock(); - w->done = true; - w->cv.SignalAll(); - w->mu.Unlock(); + w->SignalAll(); mu_.Lock(); } @@ -1341,6 +1257,131 @@ Status BlockCacheImpl::LockAndPut(LockContent& lc) { return s; } +Status BlockCacheImpl::GetContentAfterWait(LockContent& lc) { + Status s; + std::string key = lc.Encode(); + + if (lc.type == kDBKey) { + ReadOptions r_opts; + s = db_->Get(r_opts, key, lc.db_val); + //Log("[%s] get lock key: %s, val: %s, status: %s\n", + // this->WorkPath().c_str(), + // key.c_str(), + // lc.db_val->c_str(), + // s.ToString().c_str()); + } else if (lc.type == kDataSetKey) { + std::string ds_key; + PutFixed64(&ds_key, lc.sid); + LRUHandle* ds_handle = (LRUHandle*)data_set_cache_->Lookup(ds_key); + lc.data_set = reinterpret_cast(data_set_cache_->Value((Cache::Handle*)ds_handle)); + assert(ds_handle == lc.data_set->h); + //Log("[%s] get dataset sid: %lu\n", + // this->WorkPath().c_str(), 
+ // lc.sid); + } + return s; +} + +Status BlockCacheImpl::PutContentAfterLock(LockContent& lc) { + Status s; + std::string key = lc.Encode(); + + if (lc.type == kDBKey) { + WriteOptions w_opts; + s = db_->Put(w_opts, key, lc.db_lock_val); + if (s.ok()) { + lc.db_val->append(lc.db_lock_val.data(), lc.db_lock_val.size()); + } + Log("[%s] Insert db key : %s, val %s, status %s\n", + this->WorkPath().c_str(), + lc.KeyToString().c_str(), + lc.ValToString().c_str(), + s.ToString().c_str()); + } else if (lc.type == kDeleteDBKey) { + WriteOptions w_opts; + s = db_->Delete(w_opts, key); + Log("[%s] Delete db key : %s, val %s, status %s\n", + this->WorkPath().c_str(), + lc.KeyToString().c_str(), + lc.ValToString().c_str(), + s.ToString().c_str()); + } else if (lc.type == kDataSetKey) { // cannot double insert + std::string ds_key; + PutFixed64(&ds_key, lc.sid); + LRUHandle* ds_handle = (LRUHandle*)data_set_cache_->Lookup(ds_key); + if (ds_handle != NULL) { + lc.data_set = reinterpret_cast(data_set_cache_->Value((Cache::Handle*)ds_handle)); + assert(ds_handle == lc.data_set->h); + } else { + s = ReloadDataSet(lc); + } + } + return s; +} + +Status BlockCacheImpl::ReloadDataSet(LockContent& lc) { + Status s; + std::string key = lc.Encode(); + + lc.data_set = new DataSet; + lc.data_set->cache = New2QCache((options_.dataset_size / options_.block_size) + 1);// number of blocks in DS + std::string file = options_.cache_dir + "/" + Uint64ToString(lc.sid); + lc.data_set->fd = open(file.c_str(), O_RDWR | O_CREAT, 0644); + assert(lc.data_set->fd > 0); + Log("[%s] New DataSet %s, file: %s, nr_block: %lu, fd: %d\n", + this->WorkPath().c_str(), + lc.KeyToString().c_str(), + file.c_str(), (options_.dataset_size / options_.block_size) + 1, + lc.data_set->fd); + + // reload hash lru + uint64_t total_items = 0; + ReadOptions s_opts; + leveldb::Iterator* db_it = db_->NewIterator(s_opts); + for (db_it->Seek(key); + db_it->Valid() && db_it->key().starts_with("DS#"); + db_it->Next()) { + Slice 
lkey = db_it->key(); + uint64_t sid, cbi; + lkey.remove_prefix(3);// lkey = DS#, sid, cbi + sid = DecodeFixed64(lkey.data()); + lkey.remove_prefix(sizeof(uint64_t)); + cbi = DecodeFixed64(lkey.data()); + //Slice lval = db_it->value(); + if (sid != lc.sid) { + break; + } + total_items++; + + CacheBlock* block = new CacheBlock; + block->DecodeFrom(db_it->value()); // get fid and block_idx + std::string hkey; + PutFixed64(&hkey, block->fid); + PutFixed64(&hkey, block->block_idx); + block->sid = sid; + block->cache_block_idx = cbi; + block->state = (block->Test(kCacheBlockValid)) ? kCacheBlockValid : 0; + //Log("[%s] Recovery %s, insert cacheblock into 2QLru, %s\n", + // this->WorkPath().c_str(), + // lc.KeyToString().c_str(), + // block->ToString().c_str()); + LRUHandle* handle = (LRUHandle*)(lc.data_set->cache->Insert(hkey, block, cbi, &BlockCacheImpl::BlockDeleter)); + assert((uint64_t)(lc.data_set->cache->Value((Cache::Handle*)handle)) == (uint64_t)block); + assert(handle->cache_id == block->cache_block_idx); + block->handle = handle; + lc.data_set->cache->Release((Cache::Handle*)handle); + } + delete db_it; + stat_->MeasureTime(TERA_BLOCK_CACHE_LOCKMAP_DS_RELOAD_NR, total_items); + + std::string ds_key; + PutFixed64(&ds_key, lc.sid); + LRUHandle* ds_handle = (LRUHandle*)data_set_cache_->Insert(ds_key, lc.data_set, 1, NULL); + assert(ds_handle != NULL); + lc.data_set->h = ds_handle; + return s; +} + const std::string& BlockCacheImpl::WorkPath() { return work_path_; } From 6394ef70df98c9390019f69fac355524c955188d Mon Sep 17 00:00:00 2001 From: caijieming Date: Tue, 24 Oct 2017 23:50:40 +0800 Subject: [PATCH 18/19] issue=1258, t-cache support block-level cache evict bugfix for cache evict --- src/leveldb/db/table_cache.cc | 2 +- src/leveldb/include/leveldb/statistics.h | 8 ++- src/leveldb/table/format.cc | 5 +- src/leveldb/util/block_cache.cc | 78 +++++++++++++----------- src/leveldb/util/cache.cc | 12 ++-- src/tabletnode/tabletnode_impl.cc | 2 +- 6 files changed, 64 
insertions(+), 43 deletions(-) diff --git a/src/leveldb/db/table_cache.cc b/src/leveldb/db/table_cache.cc index e6af0d97b..c9cdb77ea 100644 --- a/src/leveldb/db/table_cache.cc +++ b/src/leveldb/db/table_cache.cc @@ -93,7 +93,7 @@ Status TableCache::FindTable(const std::string& dbname, const Options* options, if (!s.ok()) { assert(table == NULL); - fprintf(stderr, "open sstable file failed: [%s]\n", fname.c_str()); + fprintf(stderr, "open sstable file failed: [%s] %s\n", fname.c_str(), s.ToString().c_str()); delete file; // We do not cache error results so that if the error is transient, // or somebody repairs the file, we recover automatically. diff --git a/src/leveldb/include/leveldb/statistics.h b/src/leveldb/include/leveldb/statistics.h index ed2898996..235192db2 100644 --- a/src/leveldb/include/leveldb/statistics.h +++ b/src/leveldb/include/leveldb/statistics.h @@ -43,12 +43,14 @@ enum Histograms : uint32_t { TERA_BLOCK_CACHE_LOCKMAP_DS_RELOAD_NR, TERA_BLOCK_CACHE_PREAD_GET_BLOCK, TERA_BLOCK_CACHE_PREAD_BLOCK_NR, - TERA_BLOCK_CACHE_GET_DS, + TERA_BLOCK_CACHE_GET_DATA_SET, TERA_BLOCK_CACHE_DS_LRU_LOOKUP, TERA_BLOCK_CACHE_PREAD_WAIT_UNLOCK, TERA_BLOCK_CACHE_ALLOC_FID, TERA_BLOCK_CACHE_GET_FID, TERA_BLOCK_CACHE_EVICT_NR, + TERA_BLOCK_CACHE_PREAD_DFS_READ, + TERA_BLOCK_CACHE_PREAD_SSD_WRITE, HISTOGRAM_ENUM_MAX, // TODO(ldemailly): enforce HistogramsNameMap match }; @@ -60,12 +62,14 @@ const std::vector > HistogramsNameMap = { {TERA_BLOCK_CACHE_LOCKMAP_DS_RELOAD_NR, "tera.block_cache.lockmap_ds_reload_nr"}, {TERA_BLOCK_CACHE_PREAD_GET_BLOCK, "tera.block_cache.pread_get_block"}, {TERA_BLOCK_CACHE_PREAD_BLOCK_NR, "tera.block_cache.pread_block_nr"}, - {TERA_BLOCK_CACHE_GET_DS, "tera.block_cache.get_ds"}, + {TERA_BLOCK_CACHE_GET_DATA_SET, "tera.block_cache.get_data_set"}, {TERA_BLOCK_CACHE_DS_LRU_LOOKUP, "tera.block_cache.ds_lru_lookup"}, {TERA_BLOCK_CACHE_PREAD_WAIT_UNLOCK, "tera.block_cache.pread_wait_unlock"}, {TERA_BLOCK_CACHE_ALLOC_FID, 
"tera.block_cache.alloc_fid"}, {TERA_BLOCK_CACHE_GET_FID, "tera.block_cache.get_fid"}, {TERA_BLOCK_CACHE_EVICT_NR, "tera.block_cache.evict_nr"}, + {TERA_BLOCK_CACHE_PREAD_DFS_READ, "tera.block_cache.pread_dfs_read"}, + {TERA_BLOCK_CACHE_PREAD_SSD_WRITE, "tera.block_cache.pread_ssd_write"}, }; struct HistogramData { diff --git a/src/leveldb/table/format.cc b/src/leveldb/table/format.cc index f4e2e5259..c226a152a 100644 --- a/src/leveldb/table/format.cc +++ b/src/leveldb/table/format.cc @@ -97,7 +97,10 @@ Status ReadBlock(RandomAccessFile* file, const uint32_t actual = crc32c::Value(data, n + 1); if (actual != crc) { delete[] buf; - s = Status::Corruption("block checksum mismatch"); + char err[128] = {'\0'}; + sprintf(err, "block checksum mismatch: crc %u, actual %u, offset %lu, size %lu", + crc, actual, handle.offset(), n + kBlockTrailerSize); + s = Status::Corruption(Slice(err, strlen(err))); return s; } } diff --git a/src/leveldb/util/block_cache.cc b/src/leveldb/util/block_cache.cc index 7b8a6b09d..94bccd116 100644 --- a/src/leveldb/util/block_cache.cc +++ b/src/leveldb/util/block_cache.cc @@ -323,6 +323,7 @@ class BlockCacheImpl { DB* db_; // store meta ThreadPool bg_fill_; ThreadPool bg_read_; + ThreadPool bg_dfs_read_; ThreadPool bg_flush_; ThreadPool bg_control_; }; @@ -492,17 +493,18 @@ class BlockCacheWriteBuffer { for (uint32_t i = begin + 1; i <= end; ++i) { tmp_storage_ = new std::string(); block_list_.push_back(tmp_storage_); - if (i < end) { // last block + if (i < end) { tmp_storage_->append(buf.data(), block_size_); buf.remove_prefix(block_size_); } else { // last block tmp_storage_->append(buf.data(), buf.size()); + buf.remove_prefix(buf.size()); } - //Log("[%s] add tmp_storage %s: offset: %lu, buf_size: %lu\n", + //Log("[%s] add tmp_storage %s: offset: %lu, buf_size: %lu, idx %u\n", // path_.c_str(), // file_.c_str(), // offset_, - // buf.size()); + // buf.size(), i); } } offset_ += data.size(); @@ -803,7 +805,7 @@ class BlockCacheRandomAccessFile 
: public RandomAccessFile { //Log("[%s] pread in miss list, %s\n", // cache_->WorkPath().c_str(), // block->ToString().c_str()); - cache_->bg_read_.Schedule(&BlockCacheRandomAccessFile::AsyncDfsRead, reader, 10); + cache_->bg_dfs_read_.Schedule(&BlockCacheRandomAccessFile::AsyncDfsRead, reader, 10); } //uint64_t miss_read_sched_ts = cache_->options_.cache_env->NowMicros(); @@ -857,7 +859,8 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { // cache_->WorkPath().c_str(), // block->ToString().c_str()); } - //uint64_t dfs_read_ts = cache_->options_.cache_env->NowMicros(); + uint64_t dfs_read_ts = cache_->options_.cache_env->NowMicros(); + cache_->stat_->MeasureTime(TERA_BLOCK_CACHE_PREAD_DFS_READ, dfs_read_ts - ssd_read_ts); for (uint32_t i = 0; i < c_miss.size(); ++i) { CacheBlock* block = c_miss[i]; @@ -869,7 +872,8 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { // block->ToString().c_str()); cache_->bg_fill_.Schedule(&BlockCacheRandomAccessFile::AsyncCacheWrite, writer, 10); } - //uint64_t ssd_write_sched_ts = cache_->options_.cache_env->NowMicros(); + uint64_t ssd_write_sched_ts = cache_->options_.cache_env->NowMicros(); + //cache_->stat_->MeasureTime(TERA_BLOCK_CACHE_PREAD_SSD_WRITE_SCHED, ssd_write_sched_ts - dfs_read_ts); for (uint32_t i = 0; i < c_miss.size(); ++i) { // wait cache fill finish CacheBlock* block = c_miss[i]; @@ -888,6 +892,7 @@ class BlockCacheRandomAccessFile : public RandomAccessFile { // block->ToString().c_str()); } uint64_t ssd_write_ts = cache_->options_.cache_env->NowMicros(); + cache_->stat_->MeasureTime(TERA_BLOCK_CACHE_PREAD_SSD_WRITE, ssd_write_ts - ssd_write_sched_ts); // wait other async read finish for (uint32_t i = 0; i < c_locked.size(); ++i) { @@ -1099,6 +1104,7 @@ BlockCacheImpl::BlockCacheImpl(const BlockCacheOptions& options) db_(NULL) { bg_fill_.SetBackgroundThreads(30); bg_read_.SetBackgroundThreads(30); + bg_dfs_read_.SetBackgroundThreads(30); bg_flush_.SetBackgroundThreads(30); 
bg_control_.SetBackgroundThreads(2); stat_ = CreateDBStatistics(); @@ -1119,20 +1125,22 @@ void BlockCacheImpl::BGControlThread() { Log("[%s] statistics: " "%s, %s, %s, %s, %s, " "%s, %s, %s, %s, %s, " - "%s, %s, %s\n", + "%s, %s, %s, %s, %s\n", this->WorkPath().c_str(), stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_QUEUE).c_str(), stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_SSD_READ).c_str(), + stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_DFS_READ).c_str(), + stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_SSD_WRITE).c_str(), stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_FILL_USER_DATA).c_str(), + stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_RELEASE_BLOCK).c_str(), stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_LOCKMAP_DS_RELOAD_NR).c_str(), - stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_GET_BLOCK).c_str(), stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_BLOCK_NR).c_str(), - stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_GET_DS).c_str(), + stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_GET_DATA_SET).c_str(), + stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_DS_LRU_LOOKUP).c_str(), stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_WAIT_UNLOCK).c_str(), - stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_ALLOC_FID).c_str(), stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_GET_FID).c_str(), stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_EVICT_NR).c_str()); @@ -1151,12 +1159,14 @@ void BlockCacheImpl::BGControlThread() { // resched after 6s stat_->ClearHistogram(TERA_BLOCK_CACHE_PREAD_QUEUE); stat_->ClearHistogram(TERA_BLOCK_CACHE_PREAD_SSD_READ); + stat_->ClearHistogram(TERA_BLOCK_CACHE_PREAD_DFS_READ); + stat_->ClearHistogram(TERA_BLOCK_CACHE_PREAD_SSD_WRITE); stat_->ClearHistogram(TERA_BLOCK_CACHE_PREAD_FILL_USER_DATA); stat_->ClearHistogram(TERA_BLOCK_CACHE_PREAD_RELEASE_BLOCK); stat_->ClearHistogram(TERA_BLOCK_CACHE_LOCKMAP_DS_RELOAD_NR); stat_->ClearHistogram(TERA_BLOCK_CACHE_PREAD_GET_BLOCK); 
stat_->ClearHistogram(TERA_BLOCK_CACHE_PREAD_BLOCK_NR); - stat_->ClearHistogram(TERA_BLOCK_CACHE_GET_DS); + stat_->ClearHistogram(TERA_BLOCK_CACHE_GET_DATA_SET); stat_->ClearHistogram(TERA_BLOCK_CACHE_DS_LRU_LOOKUP); stat_->ClearHistogram(TERA_BLOCK_CACHE_PREAD_WAIT_UNLOCK); stat_->ClearHistogram(TERA_BLOCK_CACHE_ALLOC_FID); @@ -1292,19 +1302,19 @@ Status BlockCacheImpl::PutContentAfterLock(LockContent& lc) { if (s.ok()) { lc.db_val->append(lc.db_lock_val.data(), lc.db_lock_val.size()); } - Log("[%s] Insert db key : %s, val %s, status %s\n", - this->WorkPath().c_str(), - lc.KeyToString().c_str(), - lc.ValToString().c_str(), - s.ToString().c_str()); + //Log("[%s] Insert db key : %s, val %s, status %s\n", + // this->WorkPath().c_str(), + // lc.KeyToString().c_str(), + // lc.ValToString().c_str(), + // s.ToString().c_str()); } else if (lc.type == kDeleteDBKey) { WriteOptions w_opts; s = db_->Delete(w_opts, key); - Log("[%s] Delete db key : %s, val %s, status %s\n", - this->WorkPath().c_str(), - lc.KeyToString().c_str(), - lc.ValToString().c_str(), - s.ToString().c_str()); + //Log("[%s] Delete db key : %s, val %s, status %s\n", + // this->WorkPath().c_str(), + // lc.KeyToString().c_str(), + // lc.ValToString().c_str(), + // s.ToString().c_str()); } else if (lc.type == kDataSetKey) { // cannot double insert std::string ds_key; PutFixed64(&ds_key, lc.sid); @@ -1329,18 +1339,18 @@ Status BlockCacheImpl::ReloadDataSet(LockContent& lc) { lc.data_set->fd = open(file.c_str(), O_RDWR | O_CREAT, 0644); assert(lc.data_set->fd > 0); Log("[%s] New DataSet %s, file: %s, nr_block: %lu, fd: %d\n", - this->WorkPath().c_str(), - lc.KeyToString().c_str(), - file.c_str(), (options_.dataset_size / options_.block_size) + 1, - lc.data_set->fd); + this->WorkPath().c_str(), + lc.KeyToString().c_str(), + file.c_str(), (options_.dataset_size / options_.block_size) + 1, + lc.data_set->fd); // reload hash lru uint64_t total_items = 0; ReadOptions s_opts; leveldb::Iterator* db_it = 
db_->NewIterator(s_opts); for (db_it->Seek(key); - db_it->Valid() && db_it->key().starts_with("DS#"); - db_it->Next()) { + db_it->Valid() && db_it->key().starts_with("DS#"); + db_it->Next()) { Slice lkey = db_it->key(); uint64_t sid, cbi; lkey.remove_prefix(3);// lkey = DS#, sid, cbi @@ -1389,7 +1399,7 @@ const std::string& BlockCacheImpl::WorkPath() { Status BlockCacheImpl::LoadCache() { // open meta file work_path_ = options_.cache_dir; - std::string dbname = options_.cache_dir + "/meta/"; + std::string dbname = options_.cache_dir + "/meta"; options_.opts.env = options_.cache_env; // local write options_.opts.filter_policy = NewBloomFilterPolicy(10); options_.opts.block_cache = leveldb::NewLRUCache(options_.meta_block_cache_size * 1024UL * 1024); @@ -1431,15 +1441,13 @@ Status BlockCacheImpl::FillCache(CacheBlock* block) { // do io without lock ssize_t res = pwrite(fd, block->data_block.data(), block->data_block.size(), cache_block_idx * options_.block_size); + if (res < 0) { Log("[%s] cache fill: sid %lu, dataset.fd %d, datablock size %lu, cb_idx %lu, %s, res %ld\n", this->WorkPath().c_str(), block->sid, fd, block->data_block.size(), cache_block_idx, block->ToString().c_str(), res); - } - - if (res < 0) { return Status::Corruption("FillCache error"); } return Status::OK(); @@ -1579,7 +1587,7 @@ DataSet* BlockCacheImpl::GetDataSet(uint64_t sid) { set = reinterpret_cast(data_set_cache_->Value((Cache::Handle*)h)); assert(set->h == h); } - stat_->MeasureTime(TERA_BLOCK_CACHE_GET_DS, + stat_->MeasureTime(TERA_BLOCK_CACHE_GET_DATA_SET, options_.cache_env->NowMicros() - start_ts); return set; } @@ -1611,10 +1619,12 @@ CacheBlock* BlockCacheImpl::GetAndAllocBlock(uint64_t fid, uint64_t block_idx) { block->cache_block_idx = h->cache_id; block->handle = h; block->data_set_handle = ds->h; - //Log("[%s] Alloc Block: %s, sid %lu, fid %lu, block_idx %lu, hash %u, dataset_nr %lu\n", + //Log("[%s] Alloc Block: %s, sid %lu, fid %lu, block_idx %lu, hash %u, usage: %lu/%lu\n", // 
this->WorkPath().c_str(), // block->ToString().c_str(), - // sid, fid, block_idx, hash, options_.dataset_num); + // sid, fid, block_idx, hash, + // cache->TotalCharge(), + // options_.dataset_size / options_.block_size + 1); } else { delete block; block = NULL; diff --git a/src/leveldb/util/cache.cc b/src/leveldb/util/cache.cc index c3a4b7aea..b7c7ca4e0 100644 --- a/src/leveldb/util/cache.cc +++ b/src/leveldb/util/cache.cc @@ -281,10 +281,11 @@ class LRU2QCache: public Cache { const uint32_t hash = HashSlice(key); MutexLock l(&mutex_); LRUHandle* e = NULL; - e = (LRUHandle*)DoLookup(key, hash); - if (e != NULL) { - return reinterpret_cast(e); - } + //e = (LRUHandle*)DoLookup(key, hash); + //if (e != NULL) { + // assert(0); + // return reinterpret_cast(e); + //} if (usage_ < capacity_) { // cache not full e = reinterpret_cast( @@ -305,6 +306,8 @@ class LRU2QCache: public Cache { return reinterpret_cast(e); } assert(max_cache_id_ + 1 == usage_); + assert(usage_ == capacity_); + //fprintf(stderr, "%lu, usage %lu, capacity %lu\n", (uint64_t)this, usage_, capacity_); // cache full, reuse item LRUHandle* old = lru_.next; @@ -330,6 +333,7 @@ class LRU2QCache: public Cache { LRU_Append(e); assert(table_.Insert(e) == NULL); + usage_++; return reinterpret_cast(e); } return NULL; diff --git a/src/tabletnode/tabletnode_impl.cc b/src/tabletnode/tabletnode_impl.cc index 52077a718..ff65562a3 100644 --- a/src/tabletnode/tabletnode_impl.cc +++ b/src/tabletnode/tabletnode_impl.cc @@ -192,7 +192,7 @@ void TabletNodeImpl::InitCacheSystem() { for (uint32_t i = 0; i < path_list.size(); ++i) { leveldb::BlockCacheOptions opts; LOG(INFO) << "load cache: " << path_list[i]; - reinterpret_cast(block_cache_env)->LoadCache(opts, path_list[i] + "/block_cache/"); + reinterpret_cast(block_cache_env)->LoadCache(opts, path_list[i] + "/block_cache"); } return; } From 9dc16656891b70e26ff8b4d572e8dd4fa60cd672 Mon Sep 17 00:00:00 2001 From: caijieming Date: Wed, 25 Oct 2017 11:03:32 +0800 Subject: 
[PATCH 19/19] issue=1258, t-cache support block-level cache evict bugfix for cache evict --- src/leveldb/include/leveldb/block_cache.h | 2 ++ src/leveldb/util/block_cache.cc | 2 ++ 2 files changed, 4 insertions(+) diff --git a/src/leveldb/include/leveldb/block_cache.h b/src/leveldb/include/leveldb/block_cache.h index 7c2d2b965..ebd1b0cf1 100644 --- a/src/leveldb/include/leveldb/block_cache.h +++ b/src/leveldb/include/leveldb/block_cache.h @@ -1,6 +1,8 @@ // Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. +// +// Author: caijieming@baidu.com #ifndef STOREAGE_LEVELDB_UTIL_BLOCK_CACHE_H_ #define STOREAGE_LEVELDB_UTIL_BLOCK_CACHE_H_ diff --git a/src/leveldb/util/block_cache.cc b/src/leveldb/util/block_cache.cc index 94bccd116..ab5421e6e 100644 --- a/src/leveldb/util/block_cache.cc +++ b/src/leveldb/util/block_cache.cc @@ -1,6 +1,8 @@ // Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. +// +// Author: caijieming@baidu.com #include "leveldb/block_cache.h"