From 177cb1e136678cf9a53147cbb03b536b9ef0f63a Mon Sep 17 00:00:00 2001
From: caijieming <caijieming@baidu.com>
Date: Thu, 3 Aug 2017 22:51:31 +0800
Subject: [PATCH 01/19] issue=1258, Tcache support block-level cache evict

---
 src/io/tablet_io.cc                       |   19 +-
 src/io/utils_leveldb.cc                   |   18 +
 src/io/utils_leveldb.h                    |    2 +
 src/leveldb/db/builder.cc                 |    3 -
 src/leveldb/db/db_impl.cc                 |    3 -
 src/leveldb/include/leveldb/block_cache.h |  101 ++
 src/leveldb/include/leveldb/cache.h       |   27 +
 src/leveldb/include/leveldb/slice.h       |    6 +
 src/leveldb/util/block_cache.cc           | 1252 +++++++++++++++++++++
 src/leveldb/util/cache.cc                 |  193 +++-
 src/leveldb/util/coding_test.cc           |   11 +
 src/sdk/sdk_zk.cc                         |    3 -
 src/tabletnode/tabletnode_impl.cc         |   60 +-
 src/tera_flags.cc                         |    3 +-
 14 files changed, 1629 insertions(+), 72 deletions(-)
 create mode 100644 src/leveldb/include/leveldb/block_cache.h
 create mode 100644 src/leveldb/util/block_cache.cc

diff --git a/src/io/tablet_io.cc b/src/io/tablet_io.cc
index 81222e447..6de53b462 100644
--- a/src/io/tablet_io.cc
+++ b/src/io/tablet_io.cc
@@ -59,11 +59,11 @@ DECLARE_bool(tera_leveldb_ignore_corruption_in_compaction);
 DECLARE_bool(tera_leveldb_use_file_lock);
 
 DECLARE_int32(tera_tabletnode_scan_pack_max_size);
-DECLARE_bool(tera_tabletnode_cache_enabled);
 DECLARE_int32(tera_leveldb_env_local_seek_latency);
 DECLARE_int32(tera_leveldb_env_dfs_seek_latency);
 DECLARE_int32(tera_memenv_table_cache_size);
 DECLARE_bool(tera_use_flash_for_memenv);
+DECLARE_bool(tera_tabletnode_block_cache_enabled);
 
 DECLARE_bool(tera_tablet_use_memtable_on_leveldb);
 DECLARE_int64(tera_tablet_memtable_ldb_write_buffer_size);
@@ -1676,18 +1676,25 @@ void TabletIO::SetupOptionsForLG() {
             lg_info->env = LeveldbMockEnv();
         } else if (store == MemoryStore) {
             if (FLAGS_tera_use_flash_for_memenv) {
-                lg_info->env = LeveldbFlashEnv();
+                if (FLAGS_tera_tabletnode_block_cache_enabled) {
+                    LOG(INFO) << "MemLG[" << lg_i << "] activate TCache";
+                    lg_info->env = io::DefaultBlockCacheEnv();
+                } else {
+                    lg_info->env = LeveldbFlashEnv();
+                }
             } else {
                 lg_info->env = LeveldbMemEnv();
             }
             lg_info->seek_latency = 0;
             lg_info->block_cache = m_memory_cache;
         } else if (store == FlashStore) {
-            if (!FLAGS_tera_tabletnode_cache_enabled) {
-                lg_info->env = LeveldbFlashEnv();
+            if (FLAGS_tera_tabletnode_block_cache_enabled) {
+                //LOG(INFO) << "activate block-level Cache store";
+                //lg_info->env = leveldb::EnvThreeLevelCache();
+                LOG(INFO) << "FlashLG[" << lg_i << "] activate TCache";
+                lg_info->env = io::DefaultBlockCacheEnv();
             } else {
-                LOG(INFO) << "activate block-level Cache store";
-                lg_info->env = leveldb::EnvThreeLevelCache();
+                lg_info->env = LeveldbFlashEnv();
             }
             lg_info->seek_latency = FLAGS_tera_leveldb_env_local_seek_latency;
         } else {
diff --git a/src/io/utils_leveldb.cc b/src/io/utils_leveldb.cc
index 253e23f56..3d3249e1d 100644
--- a/src/io/utils_leveldb.cc
+++ b/src/io/utils_leveldb.cc
@@ -18,6 +18,7 @@
 #include "leveldb/comparator.h"
 #include "leveldb/env_dfs.h"
 #include "leveldb/env_flash.h"
+#include "leveldb/block_cache.h"
 #include "leveldb/env_inmem.h"
 #include "leveldb/env_mock.h"
 #include "leveldb/table_utils.h"
@@ -31,6 +32,7 @@ DECLARE_string(tera_leveldb_env_hdfs2_nameservice_list);
 DECLARE_string(tera_tabletnode_path_prefix);
 DECLARE_string(tera_dfs_so_path);
 DECLARE_string(tera_dfs_conf);
+DECLARE_int32(tera_leveldb_block_cache_env_num_thread);
 
 namespace tera {
 namespace io {
@@ -66,6 +68,21 @@ leveldb::Env* LeveldbBaseEnv() {
     }
 }
 
+// Tcache: default env
+static pthread_once_t block_cache_once = PTHREAD_ONCE_INIT;
+static leveldb::Env* default_block_cache_env;
+static void InitDefaultBlockCacheEnv() {
+    default_block_cache_env = new leveldb::BlockCacheEnv(LeveldbBaseEnv());
+    default_block_cache_env->SetBackgroundThreads(FLAGS_tera_leveldb_block_cache_env_num_thread);
+    LOG(INFO) << "init block cache, thread num " << FLAGS_tera_leveldb_block_cache_env_num_thread;
+}
+
+leveldb::Env* DefaultBlockCacheEnv() {
+    pthread_once(&block_cache_once, InitDefaultBlockCacheEnv);
+    return default_block_cache_env;
+}
+
+// mem env
 leveldb::Env* LeveldbMemEnv() {
     static Mutex mutex;
     static leveldb::Env* mem_env = NULL;
@@ -78,6 +95,7 @@ leveldb::Env* LeveldbMemEnv() {
     return mem_env;
 }
 
+// flash env
 leveldb::Env* LeveldbFlashEnv() {
     static Mutex mutex;
     static leveldb::Env* flash_env = NULL;
diff --git a/src/io/utils_leveldb.h b/src/io/utils_leveldb.h
index f77847db9..39e5d73c1 100644
--- a/src/io/utils_leveldb.h
+++ b/src/io/utils_leveldb.h
@@ -18,6 +18,8 @@ void InitDfsEnv();
 // return the base env leveldb used (dfs/local), singleton
 leveldb::Env* LeveldbBaseEnv();
 
+leveldb::Env* DefaultBlockCacheEnv(); // ssd + base
+
 // return the mem env leveldb used, singleton
 leveldb::Env* LeveldbMemEnv();
 
diff --git a/src/leveldb/db/builder.cc b/src/leveldb/db/builder.cc
index fdbae74af..5bce6f796 100644
--- a/src/leveldb/db/builder.cc
+++ b/src/leveldb/db/builder.cc
@@ -137,9 +137,6 @@ Status BuildTable(const std::string& dbname,
     delete builder;
 
     // Finish and check for file errors
-    if (s.ok()) {
-      s = file->Sync();
-    }
     if (s.ok()) {
       s = file->Close();
     }
diff --git a/src/leveldb/db/db_impl.cc b/src/leveldb/db/db_impl.cc
index c076008de..7d72b617b 100644
--- a/src/leveldb/db/db_impl.cc
+++ b/src/leveldb/db/db_impl.cc
@@ -1179,9 +1179,6 @@ Status DBImpl::FinishCompactionOutputFile(CompactionState* compact,
   compact->builder = NULL;
 
   // Finish and check for file errors
-  if (s.ok()) {
-    s = compact->outfile->Sync();
-  }
   if (s.ok()) {
     s = compact->outfile->Close();
   }
diff --git a/src/leveldb/include/leveldb/block_cache.h b/src/leveldb/include/leveldb/block_cache.h
new file mode 100644
index 000000000..a48022bb7
--- /dev/null
+++ b/src/leveldb/include/leveldb/block_cache.h
@@ -0,0 +1,101 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef  STOREAGE_LEVELDB_UTIL_BLOCK_CACHE_H
+#define  STOREAGE_LEVELDB_UTIL_BLOCK_CACHE_H
+
+#include "leveldb/env.h"
+#include "leveldb/options.h"
+#include "leveldb/status.h"
+
+namespace leveldb {
+/////////////////////////////////////////////
+// Tcache
+/////////////////////////////////////////////
+extern uint64_t kBlockSize;
+extern uint64_t kDataSetSize;
+extern uint64_t kFidBatchNum;
+extern uint64_t kCacheSize;
+extern uint64_t kMetaBlockSize;
+extern uint64_t kMetaTableSize;
+extern uint64_t kWriteBufferSize;
+
+struct BlockCacheOptions { // tunables for one block cache instance; sizes default to the k* globals
+    Options opts;
+    std::string cache_dir; // overwritten by BlockCacheEnv::LoadCache with its cache_dir argument
+    uint64_t block_size;
+    uint64_t dataset_size;
+    uint64_t fid_batch_num;
+    uint64_t cache_size;
+    uint64_t dataset_num; // derived in the constructor: cache_size / dataset_size + 1
+    uint64_t meta_block_cache_size;
+    uint64_t meta_table_cache_size;
+    uint64_t write_buffer_size;
+    Env* env; // set to the wrapped dfs env by BlockCacheEnv::LoadCache
+    Env* cache_env; // set to the env wrapper's target() by BlockCacheEnv::LoadCache
+
+    BlockCacheOptions()
+    : block_size(kBlockSize),
+      dataset_size(kDataSetSize),
+      fid_batch_num(kFidBatchNum),
+      cache_size(kCacheSize),
+      meta_block_cache_size(kMetaBlockSize),
+      meta_table_cache_size(kMetaTableSize),
+      write_buffer_size(kWriteBufferSize),
+      env(NULL), cache_env(NULL) { // both env pointers start out NULL, never indeterminate
+          dataset_num = cache_size / dataset_size + 1;
+    }
+};
+
+class BlockCacheImpl;
+
+class BlockCacheEnv : public EnvWrapper {
+public:
+    BlockCacheEnv(Env* base);
+
+    ~BlockCacheEnv();
+
+    virtual Status FileExists(const std::string& fname);
+
+    virtual Status GetChildren(const std::string& path,
+                               std::vector<std::string>* result);
+
+    virtual Status DeleteFile(const std::string& fname);
+
+    virtual Status CreateDir(const std::string& name);
+
+    virtual Status DeleteDir(const std::string& name);
+
+    virtual Status CopyFile(const std::string& from,
+                            const std::string& to);
+
+    virtual Status GetFileSize(const std::string& fname, uint64_t* size);
+
+    virtual Status RenameFile(const std::string& src, const std::string& target);
+
+    virtual Status LockFile(const std::string& fname, FileLock** lock);
+
+    virtual Status UnlockFile(FileLock* lock);
+
+    virtual Status NewSequentialFile(const std::string& fname,
+                                     SequentialFile** result); // never cache log
+
+    // cache relatively
+    virtual Status NewRandomAccessFile(const std::string& fname,
+                                       RandomAccessFile** result); // cache Pread
+
+    virtual Status NewWritableFile(const std::string& fname,
+                                   WritableFile** result); // cache Append
+    virtual Status LoadCache(const BlockCacheOptions& opts, const std::string& cache_dir);
+
+private:
+    std::vector<BlockCacheImpl*> cache_vec_;
+    Env* dfs_env_;
+};
+
+Env* NewBlockCacheEnv(Env* base);
+
+} // leveldb
+#endif
+
diff --git a/src/leveldb/include/leveldb/cache.h b/src/leveldb/include/leveldb/cache.h
index 636811b65..2299b2528 100644
--- a/src/leveldb/include/leveldb/cache.h
+++ b/src/leveldb/include/leveldb/cache.h
@@ -29,9 +29,36 @@ namespace leveldb {
 
 class Cache;
 
+// An entry is a variable length heap-allocated structure.  Entries
+// are kept in a circular doubly linked list ordered by access time.
+struct LRUHandle {
+  void* value;
+  void (*deleter)(const Slice&, void* value);
+  LRUHandle* next_hash;
+  LRUHandle* next;
+  LRUHandle* prev;
+  size_t charge;      // TODO(opt): Only allow uint32_t?
+  size_t key_length;
+  uint32_t refs;
+  uint32_t hash;      // Hash of key(); used for fast sharding and comparisons
+  uint64_t cache_id; // cache id, user spec
+  char key_data[1];   // Beginning of key
+
+  Slice key() const {
+    // For cheaper lookups, we allow a temporary Handle object
+    // to store a pointer to a key in "value".
+    if (next == this) {
+      return *(reinterpret_cast<Slice*>(value));
+    } else {
+      return Slice(key_data, key_length);
+    }
+  }
+};
+
 // Create a new cache with a fixed size capacity.  This implementation
 // of Cache uses a least-recently-used eviction policy.
 extern Cache* NewLRUCache(size_t capacity);
+extern Cache* New2QCache(size_t capacity);
 
 class Cache {
  public:
diff --git a/src/leveldb/include/leveldb/slice.h b/src/leveldb/include/leveldb/slice.h
index 4f1eea30e..286f303f7 100644
--- a/src/leveldb/include/leveldb/slice.h
+++ b/src/leveldb/include/leveldb/slice.h
@@ -68,6 +68,12 @@ class Slice {
     size_ -= n;
   }
 
+  // Drop the last "n" bytes from this slice.
+  void remove_suffix(size_t n) {
+    assert(n <= size());
+    size_ -= n;
+  }
+
   // Return a string that contains the copy of the referenced data.
   std::string ToString() const { return std::string(data_, size_); }
 
diff --git a/src/leveldb/util/block_cache.cc b/src/leveldb/util/block_cache.cc
new file mode 100644
index 000000000..8401cb1c0
--- /dev/null
+++ b/src/leveldb/util/block_cache.cc
@@ -0,0 +1,1252 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "leveldb/block_cache.h"
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <list>
+#include <sstream>
+
+#include "db/table_cache.h"
+#include "leveldb/db.h"
+#include "leveldb/cache.h"
+#include "leveldb/env.h"
+#include "leveldb/iterator.h"
+#include "leveldb/options.h"
+#include "leveldb/status.h"
+#include "leveldb/table_utils.h"
+#include "leveldb/write_batch.h"
+#include "port/port.h"
+#include "util/coding.h"
+#include "util/hash.h"
+#include "util/mutexlock.h"
+#include "util/string_ext.h"
+#include "util/thread_pool.h"
+
+namespace leveldb {
+
+/////////////////////////////////////////////
+// Tcache
+/////////////////////////////////////////////
+uint64_t kBlockSize = 4096UL;
+uint64_t kDataSetSize = 134217728UL;
+uint64_t kFidBatchNum = 200000UL;
+uint64_t kCacheSize = 350000000000UL;
+uint64_t kMetaBlockSize = 2000UL;
+uint64_t kMetaTableSize = 500UL;
+uint64_t kWriteBufferSize = 1048576UL;
+
+class BlockCacheWritableFile;
+class BlockCacheRandomAccessFile;
+class BlockCacheImpl;
+
+// Each SSD will New a BlockCache
+// block state
+uint64_t kCacheBlockValid = 1;
+struct CacheBlock {
+    uint64_t fid;
+    uint64_t block_idx;
+    uint64_t sid;
+    uint64_t cache_block_idx;
+    uint64_t state;
+    port::CondVar cv;
+    Slice data_block;
+    bool data_block_alloc;
+    uint64_t data_block_refs;
+    LRUHandle* handle;
+    Status s;
+
+    CacheBlock(port::Mutex* mu)
+    : fid(0),
+      block_idx(0),
+      sid(0xffffffffffffffff),
+      cache_block_idx(0xffffffffffffffff),
+      state(!kCacheBlockValid),
+      cv(mu),
+      data_block_alloc(false),
+      data_block_refs(0),
+      handle(NULL) {
+    }
+
+    // access in cache lock
+    void GetDataBlock(uint64_t block_size, Slice data) {
+        if (data_block_refs == 0) { // first one alloc mem
+            assert(data_block.size() == 0);
+            assert(data_block_alloc == false);
+            if (data.size() == 0) {
+                char* buf = new char[block_size];
+                data = Slice(buf, block_size);
+                data_block_alloc = true;
+            }
+            data_block = data;
+        }
+        ++data_block_refs;
+    }
+
+    // access in cache lock
+    void ReleaseDataBlock() {
+        --data_block_refs;
+        if (data_block_refs == 0) {
+            if (data_block_alloc) {
+                char* data = (char*)data_block.data();
+                delete[] data;
+                data_block_alloc = false;
+            }
+            data_block = Slice();
+        }
+    }
+
+    void DecodeFrom(Slice record) {
+        fid = DecodeFixed64(record.data());
+        record.remove_prefix(sizeof(uint64_t));
+        block_idx = DecodeFixed64(record.data());
+        record.remove_prefix(sizeof(uint64_t));
+        state = DecodeFixed64(record.data());
+        return;
+    }
+
+    const std::string Encode() {
+        std::string r;
+        PutFixed64(&r, fid);
+        PutFixed64(&r, block_idx);
+        PutFixed64(&r, state);
+        return r;
+    }
+
+    const std::string ToString() {
+        std::stringstream ss;
+        ss << "CacheBlock: fid: " << fid << ", block_idx: " << block_idx
+           << ", sid: " << sid << ", cache_block_idx: " << cache_block_idx
+           << ", state " << state;
+        return ss.str();
+    }
+};
+
+struct DataSet {
+    Cache* cache;
+    int fd;
+};
+
+class BlockCacheImpl {
+public:
+    BlockCacheImpl(const BlockCacheOptions& options);
+
+    ~BlockCacheImpl();
+
+    const std::string& WorkPath();
+
+    Status LoadCache(); // init cache
+
+    Status NewWritableFile(const std::string& fname,
+                           WritableFile** result);
+
+    Status NewRandomAccessFile(const std::string& fname,
+                               RandomAccessFile** result); // cache Pread
+    static void BlockDeleter(const Slice& key, void* v);
+
+private:
+    friend struct DataSet;
+    struct LockContent;
+
+    Status LockAndPut(LockContent& lc);
+
+    Status FillCache(CacheBlock* block);
+
+    Status ReadCache(CacheBlock* block);
+
+    uint64_t AllocFileId(); // no more than fid_batch_num
+
+    uint64_t FileId(const std::string& fname);
+
+    DataSet* GetDataSet(uint64_t sid);
+
+    CacheBlock* GetAndAllocBlock(uint64_t fid, uint64_t block_idx);
+
+    Status LogRecord(CacheBlock* block);
+
+    Status ReleaseBlock(CacheBlock* block);
+
+private:
+    friend class BlockCacheWritableFile;
+    friend class BlockCacheRandomAccessFile;
+    friend struct CacheBlock;
+
+    BlockCacheOptions options_;
+    std::string work_path_;
+    Env* dfs_env_;
+    //Env* posix_env_;
+
+    port::Mutex mu_;
+    // key lock list
+    struct Waiter {
+        port::CondVar cv;
+        int wait_num;
+        bool done;
+        Waiter(port::Mutex* mu):cv(mu), wait_num(0), done(false) {}
+    };
+    typedef std::map<std::string, Waiter*> LockKeyMap;
+    LockKeyMap lock_key_;
+
+    uint64_t new_fid_;
+    uint64_t prev_fid_;
+
+    enum LockKeyType {
+        kDBKey = 0,
+        kDataSetKey = 1,
+    };
+    struct LockContent {
+        int type;
+
+        // DB key
+        Slice db_lock_key;
+        Slice db_lock_val;
+        std::string* db_val;
+
+        // data set id
+        uint64_t sid;
+        DataSet* data_set;
+    };
+    typedef std::map<uint64_t, DataSet*> DataSetMap;
+    DataSetMap data_set_map_;
+
+    //WritableFile* logfile_;
+    //log::Writer* log_;
+    DB* db_; // store meta
+    ThreadPool bg_fill_;
+    ThreadPool bg_read_;
+    ThreadPool bg_flush_;
+};
+
+// Must insure not init more than twice
+Env* NewBlockCacheEnv(Env* base) {
+    return new BlockCacheEnv(base);
+}
+
+BlockCacheEnv::BlockCacheEnv(Env* base)
+  : EnvWrapper(NewPosixEnv()), dfs_env_(base) {
+    //target()->SetBackgroundThreads(30);
+}
+
+BlockCacheEnv::~BlockCacheEnv() {}
+
+Status BlockCacheEnv::FileExists(const std::string& fname) {
+    return dfs_env_->FileExists(fname);
+}
+
+Status BlockCacheEnv::GetChildren(const std::string& path,
+                                  std::vector<std::string>* result) {
+    return dfs_env_->GetChildren(path, result);
+}
+
+Status BlockCacheEnv::DeleteFile(const std::string& fname) {
+    return dfs_env_->DeleteFile(fname);
+}
+
+Status BlockCacheEnv::CreateDir(const std::string& name) {
+    return dfs_env_->CreateDir(name);
+}
+
+Status BlockCacheEnv::DeleteDir(const std::string& name) {
+    return dfs_env_->DeleteDir(name);
+}
+
+Status BlockCacheEnv::CopyFile(const std::string& from,
+                               const std::string& to) {
+    return dfs_env_->CopyFile(from, to);
+}
+
+Status BlockCacheEnv::GetFileSize(const std::string& fname, uint64_t* size) {
+    return dfs_env_->GetFileSize(fname, size);
+}
+
+Status BlockCacheEnv::RenameFile(const std::string& src, const std::string& target) {
+    return dfs_env_->RenameFile(src, target);
+}
+
+Status BlockCacheEnv::LockFile(const std::string& fname, FileLock** lock) {
+    return dfs_env_->LockFile(fname, lock);
+}
+
+Status BlockCacheEnv::UnlockFile(FileLock* lock) {
+    return dfs_env_->UnlockFile(lock);
+}
+
+Status BlockCacheEnv::LoadCache(const BlockCacheOptions& opts, const std::string& cache_dir) {
+    BlockCacheOptions options = opts;
+    options.cache_dir = cache_dir;
+    options.env = dfs_env_;
+    options.cache_env = this->target();
+    BlockCacheImpl* cache = new BlockCacheImpl(options);
+    Status s = cache->LoadCache();
+    assert(s.ok());
+    cache_vec_.push_back(cache); // no need lock
+    return s;
+}
+
+Status BlockCacheEnv::NewSequentialFile(const std::string& fname,
+                                        SequentialFile** result) {
+    return dfs_env_->NewSequentialFile(fname, result);
+}
+
+Status BlockCacheEnv::NewWritableFile(const std::string& fname,
+                                      WritableFile** result) {
+    // Only "*.sst" files go through the cache; everything else is written to dfs directly.
+    // The explicit length check keeps size() - 4 from wrapping around to npos on short names.
+    if (fname.size() < 4 || fname.compare(fname.size() - 4, 4, ".sst") != 0) {
+        return dfs_env_->NewWritableFile(fname, result);
+    }
+    uint32_t hash = (Hash(fname.c_str(), fname.size(), 13)) % cache_vec_.size();
+    BlockCacheImpl* cache = cache_vec_[hash];
+    Status s = cache->NewWritableFile(fname, result);
+    Log("[block_cache %s] open file write: %s, hash: %u, status: %s\n",
+        cache->WorkPath().c_str(), fname.c_str(), hash, s.ToString().c_str());
+    return s;
+}
+
+Status BlockCacheEnv::NewRandomAccessFile(const std::string& fname,
+                                          RandomAccessFile** result) {
+    uint32_t hash = (Hash(fname.c_str(), fname.size(), 13)) % cache_vec_.size();
+    BlockCacheImpl* cache = cache_vec_[hash];
+    Status s = cache->NewRandomAccessFile(fname, result);
+    Log("[block_cache %s] open file read: %s, hash: %u, status: %s\n",
+        cache->WorkPath().c_str(), fname.c_str(), hash, s.ToString().c_str());
+    return s;
+}
+
+class BlockCacheWriteBuffer {
+public:
+    BlockCacheWriteBuffer(const std::string& path,
+                          const std::string& file,
+                          int block_size)
+        : offset_(0),
+        block_size_(block_size),
+        block_idx_(0),
+        tmp_storage_(NULL),
+        path_(path),
+        file_(file) {
+    }
+
+    ~BlockCacheWriteBuffer() {
+        assert(block_list_.size() == 0);
+    }
+
+    uint32_t NumFullBlock() { // use for BGFlush
+        MutexLock l(&mu_);
+        if (block_list_.size() > 1) {
+            return block_list_.size() - 1;
+        }
+        return 0;
+    }
+
+    Status Append(const Slice& data) { // buffers data, splitting it into strings of at most block_size_ bytes
+        MutexLock l(&mu_);
+        if (tmp_storage_ == NULL) { // first append: create the initial block
+            tmp_storage_ = new std::string();
+            tmp_storage_->resize(0);
+            block_list_.push_back(tmp_storage_);
+        }
+        uint32_t begin = offset_ / block_size_; // block index of the current write offset
+        uint32_t end = (offset_ + data.size()) / block_size_; // block index of the offset after this write
+        if (begin == end) { // in the same block
+            tmp_storage_->append(data.data(), data.size());
+        } else {
+            uint32_t tmp_size = block_size_ - (offset_ % block_size_); // room left in the current block
+            tmp_storage_->append(data.data(), tmp_size);
+            assert(tmp_storage_->size() == block_size_);
+            Slice buf(data.data() + tmp_size, data.size() - tmp_size);
+            for (uint32_t i = begin + 1; i <= end; ++i) {
+                tmp_storage_ = new std::string();
+                tmp_storage_->resize(0);
+                block_list_.push_back(tmp_storage_);
+                if (i < end) { // full intermediate block
+                    tmp_storage_->append(buf.data(), block_size_);
+                    buf.remove_prefix(block_size_);
+                } else { // last block: takes whatever remains (may be empty)
+                    tmp_storage_->append(buf.data(), buf.size());
+                }
+                Log("[%s] add tmp_storage %s: offset: %lu, buf_size: %lu\n",
+                    path_.c_str(),
+                    file_.c_str(),
+                    offset_,
+                    buf.size());
+            }
+        }
+        offset_ += data.size();
+        Log("[%s] add record: %s, begin: %u, end: %u, offset: %lu, data_size: %lu, block_size: %u\n",
+            path_.c_str(),
+            file_.c_str(),
+            begin, end,
+            offset_, data.size(), block_size_);
+        return Status::OK();
+    }
+
+    std::string* PopFrontBlock(uint64_t* block_idx) { // removes the oldest block; NULL if none is buffered
+        MutexLock l(&mu_);
+        if (block_list_.empty()) { // must be tested before front(): front() on an empty list is undefined
+            return NULL;
+        }
+        std::string* block = block_list_.front();
+        block_list_.pop_front();
+        *block_idx = block_idx_; // sequence number of the popped block, counting front pops from 0
+        block_idx_++;
+        return block; // caller hands the string back through ReleaseBlock() when done
+    }
+
+    std::string* PopBackBlock(uint64_t* block_idx) {
+        MutexLock l(&mu_);
+        if (block_list_.size() == 0) {
+            return NULL;
+        }
+        std::string* block = block_list_.back();
+        block_list_.pop_back();
+        *block_idx = offset_ / block_size_;
+        return block;
+    }
+
+    void ReleaseBlock(std::string* block) {
+        delete block;
+    }
+
+private:
+    port::Mutex mu_;
+    uint64_t offset_;
+    uint32_t block_size_;
+    uint64_t block_idx_;
+    std::string* tmp_storage_;
+    std::list<std::string*> block_list_; // kBlockSize
+    std::string path_;
+    std::string file_;
+};
+
+class BlockCacheWritableFile : public WritableFile {
+public:
+    BlockCacheWritableFile(BlockCacheImpl* c, const std::string& fname, Status* s)
+        : cache_(c),
+          bg_cv_(&c->mu_),
+          dfs_file_(NULL), // stays NULL if the dfs open below fails, so the destructor can test it safely
+          bg_block_flush_(0),
+          write_buffer_(cache_->WorkPath(), fname, cache_->options_.block_size),
+          fname_(fname) { // file open
+        *s = cache_->dfs_env_->NewWritableFile(fname_, &dfs_file_); // open status is reported through *s
+        Log("[%s] dfs open: %s, block_size: %lu, status: %s\n",
+            cache_->WorkPath().c_str(),
+            fname.c_str(),
+            cache_->options_.block_size,
+            s->ToString().c_str());
+    }
+
+    ~BlockCacheWritableFile() {
+        if (dfs_file_ != NULL) {
+            Log("[%s] dfs close for release %s\n", cache_->WorkPath().c_str(), fname_.c_str());
+            dfs_file_->Close();
+            delete dfs_file_;
+            dfs_file_ = NULL;
+        }
+
+        Log("[%s] begin release %s\n", cache_->WorkPath().c_str(), fname_.c_str());
+        MutexLock lockgard(&cache_->mu_);
+        uint64_t block_idx;
+        std::string* block_data = write_buffer_.PopBackBlock(&block_idx);
+        if (block_data == NULL) {
+            Log("[%s] end release(nothing) %s\n", cache_->WorkPath().c_str(), fname_.c_str());
+            return;
+        }
+        FillCache(block_data, block_idx);
+
+        while (bg_block_flush_ > 0) {
+            bg_cv_.Wait();
+        }
+        Log("[%s] end release %s\n", cache_->WorkPath().c_str(), fname_.c_str());
+        return;
+    }
+
+    Status Append(const Slice& data) {
+        Status s = dfs_file_->Append(data);
+        if (!s.ok()) {
+            Log("[%s] dfs append fail: %s, status: %s\n",
+                cache_->WorkPath().c_str(),
+                fname_.c_str(),
+                s.ToString().c_str());
+            return s;
+        }
+        write_buffer_.Append(data);
+
+        MutexLock lockgard(&cache_->mu_);
+        MaybeScheduleBGFlush();
+        return Status::OK();
+    }
+
+    Status Close() {
+        Log("[%s] begin close %s\n", cache_->WorkPath().c_str(), fname_.c_str());
+        Status s = dfs_file_->Close();
+        delete dfs_file_;
+        dfs_file_ = NULL;
+
+        MutexLock lockgard(&cache_->mu_);
+        uint64_t block_idx;
+        std::string* block_data = write_buffer_.PopBackBlock(&block_idx);
+        if (block_data == NULL) {
+            Log("[%s] end close state error: %s\n", cache_->WorkPath().c_str(), fname_.c_str());
+            return s;
+        }
+        FillCache(block_data, block_idx);
+
+        while (bg_block_flush_ > 0) {
+            bg_cv_.Wait();
+        }
+        Log("[%s] end close %s, status %s\n", cache_->WorkPath().c_str(), fname_.c_str(),
+            s.ToString().c_str());
+        return s;
+    }
+
+    Status Flush() {
+        Log("[%s] dfs flush: %s\n", cache_->WorkPath().c_str(), fname_.c_str());
+        return dfs_file_->Flush();
+    }
+
+    Status Sync() {
+        Log("[%s] dfs sync: %s\n", cache_->WorkPath().c_str(), fname_.c_str());
+        return dfs_file_->Sync();
+    }
+
+private:
+    void MaybeScheduleBGFlush() {
+        cache_->mu_.AssertHeld();
+        Log("[%s] Maybe schedule BGFlush: %s, bg_block_flush: %u, block_nr: %u\n",
+            cache_->WorkPath().c_str(),
+            fname_.c_str(),
+            bg_block_flush_,
+            write_buffer_.NumFullBlock());
+        while (bg_block_flush_ < write_buffer_.NumFullBlock()) {
+            bg_block_flush_++;
+            cache_->bg_flush_.Schedule(&BlockCacheWritableFile::BGFlushFunc, this, 10);
+        }
+    }
+
+    static void BGFlushFunc(void* arg) {
+        reinterpret_cast<BlockCacheWritableFile*>(arg)->BGFlush();
+    }
+    void BGFlush() {
+        Log("[%s] begin BGFlush: %s\n", cache_->WorkPath().c_str(), fname_.c_str());
+        MutexLock lockgard(&cache_->mu_);
+        if (write_buffer_.NumFullBlock() == 0) {
+            return;
+        }
+
+        uint64_t block_idx;
+        std::string* block_data = write_buffer_.PopFrontBlock(&block_idx);
+        assert(block_data != NULL);
+        FillCache(block_data, block_idx);
+
+        bg_block_flush_--;
+        MaybeScheduleBGFlush();
+        bg_cv_.Signal();
+        return;
+    }
+
+    Status FillCache(std::string* block_data, uint64_t block_idx) {
+        cache_->mu_.AssertHeld();
+        uint64_t fid = cache_->FileId(fname_);
+        CacheBlock* block = cache_->GetAndAllocBlock(fid, block_idx);
+        assert(block->state != kCacheBlockValid);
+        block->GetDataBlock(cache_->options_.block_size, Slice(*block_data));
+        cache_->mu_.Unlock();
+
+        // Do io without lock
+        cache_->LogRecord(block);
+        cache_->FillCache(block);
+
+        cache_->mu_.Lock();
+        block->state = kCacheBlockValid;
+        cache_->ReleaseBlock(block);
+        write_buffer_.ReleaseBlock(block_data);
+        return Status::OK();
+    }
+
+private:
+    BlockCacheImpl* cache_;
+    //port::AtomicPointer shutting_down_;
+    port::CondVar bg_cv_;          // Signalled when background work finishes
+    WritableFile* dfs_file_;
+    // protected by cache_.mu_
+    uint32_t bg_block_flush_;
+    BlockCacheWriteBuffer write_buffer_;
+    std::string fname_;
+};
+
+class BlockCacheRandomAccessFile : public RandomAccessFile {
+public:
+    BlockCacheRandomAccessFile(BlockCacheImpl* c, const std::string& fname, Status* s)
+    : cache_(c),
+      fname_(fname) {
+        *s = cache_->dfs_env_->NewRandomAccessFile(fname_, &dfs_file_);
+        Log("[%s] dfs open for read: %s, block_size: %lu, status: %s\n",
+            cache_->WorkPath().c_str(),
+            fname.c_str(),
+            cache_->options_.block_size,
+            s->ToString().c_str());
+        return;
+    }
+
+    ~BlockCacheRandomAccessFile() {
+        delete dfs_file_;
+        return;
+    }
+
+    Status Read(uint64_t offset, size_t n, Slice* result,
+                char* scratch) const {  // copies bytes [offset, offset + n) into scratch, block by block
+        MutexLock lockgard(&cache_->mu_);
+        uint64_t fid = cache_->FileId(fname_);
+        uint64_t begin = offset / cache_->options_.block_size;
+        uint64_t end = (offset + n) / cache_->options_.block_size;  // inclusive; contributes 0 bytes when offset + n is block-aligned
+        assert(begin <= end);
+        std::vector<CacheBlock*> c_miss;       // not valid yet and no other user: loaded from DFS by this call
+        std::vector<CacheBlock*> c_locked;     // some other user also holds a reference
+        std::vector<CacheBlock*> c_valid;      // valid and no other user: re-read from the cache file
+        std::vector<CacheBlock*> block_queue;  // every block of the request, in block_idx order
+
+        Log("[%s] begin pread %s, size %lu, offset %lu, fid %lu, start_block %lu, end_block %lu"
+            ", block_size %lu\n",
+            cache_->WorkPath().c_str(), fname_.c_str(), n, offset, fid,
+            begin, end, cache_->options_.block_size);
+        for (uint64_t block_idx = begin; block_idx <= end; ++block_idx) {
+            CacheBlock* block = cache_->GetAndAllocBlock(fid, block_idx);
+            assert(block->fid == fid && block->block_idx == block_idx);
+            block->GetDataBlock(cache_->options_.block_size, Slice());
+            block_queue.push_back(block); // kept in block_idx order
+
+            if (block->state != kCacheBlockValid && block->handle->refs == 2) { // refs == 2: the cache plus this caller only
+                c_miss.push_back(block);
+            } else if (block->state == kCacheBlockValid && block->handle->refs == 2) { // first one to access this block
+                c_valid.push_back(block);
+            } else {
+                c_locked.push_back(block);
+            }
+            Log("[%s] queue block: %s, refs %u, data_block_refs %lu, alloc %u\n",
+                cache_->WorkPath().c_str(), block->ToString().c_str(),
+                block->handle->refs, block->data_block_refs,
+                block->data_block_alloc);
+        }
+        cache_->mu_.Unlock();  // the scheduling and copying below run without mu_
+
+        // async read miss data
+        for (uint32_t i = 0; i < c_miss.size(); ++i) {
+            CacheBlock* block = c_miss[i];
+            AsyncDfsReader* reader = new AsyncDfsReader;  // freed by AsyncDfsRead
+            reader->file = const_cast<BlockCacheRandomAccessFile*>(this);
+            reader->block = block;
+            Log("[%s] pread in miss list, %s\n",
+                cache_->WorkPath().c_str(),
+                block->ToString().c_str());
+            cache_->bg_read_.Schedule(&BlockCacheRandomAccessFile::AsyncDfsRead, reader, 10);
+        }
+
+        // async read valid data
+        for (uint32_t i = 0; i < c_valid.size(); ++i) {
+            CacheBlock* block = c_valid[i];
+            AsyncCacheReader* reader = new AsyncCacheReader;  // freed by AsyncCacheRead
+            reader->file = const_cast<BlockCacheRandomAccessFile*>(this);
+            reader->block = block;
+            Log("[%s] pread in valid list, %s\n",
+                cache_->WorkPath().c_str(),
+                block->ToString().c_str());
+            cache_->bg_read_.Schedule(&BlockCacheRandomAccessFile::AsyncCacheRead, reader, 10);
+        }
+        // NOTE(review): the Wait() calls below have no predicate; a SignalAll() sent before this
+        // thread starts waiting looks like it would be missed -- confirm. (wait dfs read done)
+        for (uint32_t i = 0; i < c_miss.size(); ++i) {
+            MutexLock lockgard(&cache_->mu_);
+            CacheBlock* block = c_miss[i];
+            block->cv.Wait();
+            Log("[%s] pread in miss list(dfs done), %s\n",
+                cache_->WorkPath().c_str(),
+                block->ToString().c_str());
+        }
+
+        for (uint32_t i = 0; i < c_miss.size(); ++i) {
+            CacheBlock* block = c_miss[i];
+            AsyncCacheWriter* writer = new AsyncCacheWriter;  // freed by AsyncCacheWrite
+            writer->file = const_cast<BlockCacheRandomAccessFile*>(this);
+            writer->block = block;
+            Log("[%s] pread in miss list(fill cache), %s\n",
+                cache_->WorkPath().c_str(),
+                block->ToString().c_str());
+            cache_->bg_fill_.Schedule(&BlockCacheRandomAccessFile::AsyncCacheWrite, writer, 10);
+        }
+
+        for (uint32_t i = 0; i < c_miss.size(); ++i) { // wait cache fill finish
+            MutexLock lockgard(&cache_->mu_);
+            CacheBlock* block = c_miss[i];
+            block->cv.Wait();
+            Log("[%s] pread in miss list(fill cache done), %s\n",
+                cache_->WorkPath().c_str(),
+                block->ToString().c_str());
+        }
+
+        // wait cache read done
+        for (uint32_t i = 0; i < c_valid.size(); ++i) {
+            MutexLock lockgard(&cache_->mu_);
+            CacheBlock* block = c_valid[i];
+            block->cv.Wait();
+            Log("[%s] pread in valid list(done), %s\n",
+                cache_->WorkPath().c_str(),
+                block->ToString().c_str());
+        }
+
+        // wait other async read finish (another user of the block marks it valid)
+        for (uint32_t i = 0; i < c_locked.size(); ++i) {
+            MutexLock lockgard(&cache_->mu_);
+            CacheBlock* block = c_locked[i];
+            while (block->state != kCacheBlockValid) {
+                block->cv.Wait();
+            }
+        }
+
+        // fill user mem: trim the first and last block to the requested byte range
+        size_t msize = 0;
+        for (uint64_t block_idx = begin; block_idx <= end; ++block_idx) {
+            CacheBlock* block = block_queue[block_idx - begin];
+            Slice data_block = block->data_block;
+            if (block_idx == begin) {
+                data_block.remove_prefix(offset % cache_->options_.block_size);
+            }
+            if (block_idx == end) {
+                data_block.remove_suffix(cache_->options_.block_size - (n + offset) % cache_->options_.block_size);
+            }
+            memcpy(scratch + msize, data_block.data(), data_block.size());
+            msize += data_block.size();
+            Log("[%s] fill user data, %s, prefix %lu, suffix %lu, msize %lu, offset %lu\n",
+                cache_->WorkPath().c_str(), fname_.c_str(),
+                block_idx == begin ? offset % cache_->options_.block_size: 0,
+                block_idx == end ? cache_->options_.block_size - (n + offset) % cache_->options_.block_size
+                                 : cache_->options_.block_size,
+                msize, offset);
+        }
+        assert(msize == n);
+        *result = Slice(scratch, n);
+
+        cache_->mu_.Lock();  // ReleaseBlock() asserts that mu_ is held
+        for (uint32_t i = 0; i < c_miss.size(); ++i) {
+            CacheBlock* block = c_miss[i];
+            block->state = kCacheBlockValid;
+            Log("[%s] wakeup for miss, %s\n", cache_->WorkPath().c_str(), block->ToString().c_str());
+            cache_->ReleaseBlock(block);
+        }
+        for (uint32_t i = 0; i < c_valid.size(); ++i) {
+            CacheBlock* block = c_valid[i];
+            block->state = kCacheBlockValid;
+            Log("[%s] wakeup for valid, %s\n", cache_->WorkPath().c_str(), block->ToString().c_str());
+            cache_->ReleaseBlock(block);
+        }
+        for (uint32_t i = 0; i < c_locked.size(); ++i) {
+            CacheBlock* block = c_locked[i];
+            block->state = kCacheBlockValid;
+            Log("[%s] wakeup for lock, %s\n", cache_->WorkPath().c_str(), block->ToString().c_str());
+            cache_->ReleaseBlock(block);
+        }
+
+        Log("[%s] end pread %s, size %lu, offset %lu, fid %lu, start_block %lu, end_block %lu"
+            ", block_size %lu\n",
+            cache_->WorkPath().c_str(), fname_.c_str(), n, offset, fid,
+            begin, end, cache_->options_.block_size);
+        return Status::OK();  // NOTE(review): always OK; no background I/O status is consulted in this method
+    }
+
+private:
+    struct AsyncDfsReader {  // heap-allocated argument for AsyncDfsRead
+        BlockCacheRandomAccessFile* file;  // file whose HandleDfsRead() runs the request
+        CacheBlock* block;  // block whose buffer receives the data
+    };
+    // Scheduler entry point: unpack the request, run it on its file, then free it.
+    static void AsyncDfsRead(void* arg) {
+        AsyncDfsReader* request = static_cast<AsyncDfsReader*>(arg);
+        request->file->HandleDfsRead(request);
+        delete request;
+    }
+    // Background task: read one whole block from DFS into block->data_block, then wake waiters.
+    void HandleDfsRead(AsyncDfsReader* reader) {
+        CacheBlock* block = reader->block;
+        char* scratch = (char*)(block->data_block.data());
+        Slice result;
+        uint64_t offset = block->block_idx * cache_->options_.block_size;
+        size_t n = cache_->options_.block_size;
+        Status s = dfs_file_->Read(offset, n, &result, scratch);
+        if (s.ok() && result.data() != scratch) {  // Read() may return data outside scratch
+            memmove(scratch, result.data(), result.size());
+        }
+        Log("[%s] cache async.dfs read, %s"
+            ", offset %lu, size %lu, status %s, res %lu\n",
+            cache_->WorkPath().c_str(), block->ToString().c_str(),
+            offset, n, s.ToString().c_str(), result.size());
+        MutexLock lockgard(&cache_->mu_);
+        block->cv.SignalAll();
+    }
+
+    struct AsyncCacheReader {  // heap-allocated argument for AsyncCacheRead
+        BlockCacheRandomAccessFile* file;  // file whose HandleCacheRead() runs the request
+        CacheBlock* block;  // block to load from the cache file
+    };
+    // Scheduler entry point: unpack the request, run it on its file, then free it.
+    static void AsyncCacheRead(void* arg) {
+        AsyncCacheReader* request = static_cast<AsyncCacheReader*>(arg);
+        request->file->HandleCacheRead(request);
+        delete request;
+    }
+    // Background task: load the block through BlockCacheImpl::ReadCache, then wake waiters.
+    void HandleCacheRead(AsyncCacheReader* reader) {
+        CacheBlock* const target = reader->block;
+        cache_->ReadCache(target);
+
+        MutexLock lockgard(&cache_->mu_);
+        target->cv.SignalAll();
+    }
+
+    struct AsyncCacheWriter {  // heap-allocated argument for AsyncCacheWrite
+        BlockCacheRandomAccessFile* file;  // file whose HandleCacheWrite() runs the request
+        CacheBlock* block;  // block to persist into the cache
+    };
+    // Scheduler entry point: unpack the request, run it on its file, then free it.
+    static void AsyncCacheWrite(void* arg) {
+        AsyncCacheWriter* request = static_cast<AsyncCacheWriter*>(arg);
+        request->file->HandleCacheWrite(request);
+        delete request;
+    }
+    // Background task: write the block's metadata record (LogRecord) and its data
+    // (FillCache), then wake every thread waiting on the block.
+    void HandleCacheWrite(AsyncCacheWriter* writer) {
+        CacheBlock* const target = writer->block;
+        Log("[%s] handle cache write, %s\n",
+            cache_->WorkPath().c_str(), target->ToString().c_str());
+        cache_->LogRecord(target);
+        cache_->FillCache(target);
+
+        MutexLock lockgard(&cache_->mu_);
+        target->cv.SignalAll();
+    }
+
+private:
+    BlockCacheImpl* cache_;
+    RandomAccessFile* dfs_file_;
+    std::string fname_;
+};
+
+// Tcache impl
+BlockCacheImpl::BlockCacheImpl(const BlockCacheOptions& options)
+    : options_(options),
+      dfs_env_(options.env),
+      new_fid_(0),   // set for real by LoadCache()
+      prev_fid_(0),  // set for real by LoadCache()
+      db_(NULL) {    // opened by LoadCache()
+    bg_fill_.SetBackgroundThreads(30);  // runs AsyncCacheWrite tasks
+    bg_read_.SetBackgroundThreads(30);  // runs AsyncDfsRead / AsyncCacheRead tasks
+    bg_flush_.SetBackgroundThreads(30);
+}
+
+BlockCacheImpl::~BlockCacheImpl() {}  // NOTE(review): db_ and the DataSet objects are not released here -- confirm this is intended
+
+// Creates a BlockCacheWritableFile for `fname`. *result is the new file when the
+// returned status is OK and NULL otherwise.
+Status BlockCacheImpl::NewWritableFile(const std::string& fname,
+                                       WritableFile** result) {
+    Status open_status;
+    BlockCacheWritableFile* wf = new BlockCacheWritableFile(this, fname, &open_status);
+    // NOTE(review): on failure `wf` is not freed here -- confirm whether that leaks.
+    *result = open_status.ok() ? (WritableFile*)wf : NULL;
+    return open_status;
+}
+
+// Creates a BlockCacheRandomAccessFile for `fname`. *result is the new file when
+// the returned status is OK and NULL otherwise.
+Status BlockCacheImpl::NewRandomAccessFile(const std::string& fname,
+                                           RandomAccessFile** result) {
+    Status open_status;
+    BlockCacheRandomAccessFile* raf = new BlockCacheRandomAccessFile(this, fname, &open_status);
+    // NOTE(review): on failure `raf` is not freed here -- confirm whether that leaks.
+    *result = open_status.ok() ? (RandomAccessFile*)raf : NULL;
+    return open_status;
+}
+
+// Deleter passed to Cache::Insert for every CacheBlock: logs the block and frees it.
+void BlockCacheImpl::BlockDeleter(const Slice& key, void* v) {
+    CacheBlock* evicted = static_cast<CacheBlock*>(v);
+    Log("Evict blockcache: %s\n", evicted->ToString().c_str());
+    delete evicted;
+}
+
+// if lock succ, put lock_val, else get newer value
+// Serializes work on one key (a DB key, or "DS#"+sid for a dataset). The first
+// caller for a key performs the Put / dataset load; concurrent callers wait for it
+// and then read the result. mu_ must be held on entry and is held again on return.
+Status BlockCacheImpl::LockAndPut(LockContent& lc) {
+    mu_.AssertHeld();
+    Status s;
+    std::string key;
+    if (lc.type == kDBKey) {
+        key = lc.db_lock_key.ToString();
+    } else if (lc.type == kDataSetKey) {
+        key = "DS#";
+        PutFixed64(&key, lc.sid);
+    } else {
+        return Status::NotSupported("key type error");
+    }
+    Log("[%s] trylock key: %s\n",
+        this->WorkPath().c_str(), key.c_str());
+
+    Waiter* w = NULL;
+    LockKeyMap::iterator it = lock_key_.find(key);
+    if (it != lock_key_.end()){
+        w = it->second;
+        w->wait_num ++;
+        while (!w->done) {
+            w->cv.Wait();
+        }
+
+        if (lc.type == kDBKey) {
+            mu_.Unlock();  // the DB read does not need mu_
+            ReadOptions r_opts;
+            s = db_->Get(r_opts, key, lc.db_val);
+            Log("[%s] get lock key: %s, val: %s, status: %s\n",
+                this->WorkPath().c_str(),
+                key.c_str(),
+                lc.db_val->c_str(),
+                s.ToString().c_str());
+            mu_.Lock();
+        } else if (lc.type == kDataSetKey) {
+            // data_set_map_ is written under mu_, so it is read here while mu_ is still held.
+            lc.data_set = data_set_map_[lc.sid];
+            Log("[%s] get dataset sid: %lu\n",
+                this->WorkPath().c_str(),
+                lc.sid);
+        }
+        if (--w->wait_num == 0) {
+            // last waiter: remove the key and free the Waiter
+            lock_key_.erase(key);
+            Log("[%s] wait done %s, delete cv\n",
+                this->WorkPath().c_str(), key.c_str());
+            delete w;
+        } else {
+            Log("[%s] wait done %s, not last\n",
+                this->WorkPath().c_str(), key.c_str());
+        }
+    } else {
+        w = new Waiter(&mu_);
+        w->wait_num = 1;
+        lock_key_[key] = w;
+        mu_.Unlock();
+
+        if (lc.type == kDBKey) {
+            WriteOptions w_opts;
+            s = db_->Put(w_opts, key, lc.db_lock_val);
+            if (s.ok()) {
+                lc.db_val->append(lc.db_lock_val.data(), lc.db_lock_val.size());
+            }
+            Log("[%s] put kDBKey: %s, status %s\n",
+                this->WorkPath().c_str(),
+                key.c_str(),
+                s.ToString().c_str());
+        } else if (lc.type == kDataSetKey) {
+            // Fixed64-encoded sids do not sort numerically as bytes, so the reload scan below
+            // is bounded by the "DS#"+sid key prefix instead of a "DS#"+(sid+1) end key.
+            lc.data_set = new DataSet;
+            lc.data_set->cache = New2QCache((options_.dataset_size / options_.block_size) + 1);// number of blocks in DS
+            std::string file = options_.cache_dir + "/" + Uint64ToString(lc.sid);
+            lc.data_set->fd = open(file.c_str(), O_RDWR | O_CREAT, 0644);
+            assert(lc.data_set->fd > 0);
+            Log("[%s] begin new dataset, sid: %lu, file: %s, cs: %lu, fd: %d\n",
+                this->WorkPath().c_str(),
+                lc.sid, file.c_str(), (options_.dataset_size / options_.block_size) + 1,
+                lc.data_set->fd);
+
+            // reload hash lru
+            ReadOptions s_opts;
+            leveldb::Iterator* db_it = db_->NewIterator(s_opts);
+            for (db_it->Seek(key);
+                 db_it->Valid() && db_it->key().starts_with(key);
+                 db_it->Next()) {
+                Slice lkey = db_it->key();
+                lkey.remove_prefix(3 + sizeof(uint64_t));// remove DS#sid
+                // what is left of lkey is the fixed64 cache_block_idx
+
+                CacheBlock* block = new CacheBlock(&mu_);
+                block->DecodeFrom(db_it->value()); // get fid and block_idx
+                std::string hkey;
+                PutFixed64(&hkey, block->fid);
+                PutFixed64(&hkey, block->block_idx);
+                block->sid = lc.sid;
+                block->cache_block_idx = DecodeFixed64(lkey.data());
+                Log("[%s] insert cacheblock into 2QLru, %s\n",
+                    this->WorkPath().c_str(),
+                    block->ToString().c_str());
+                LRUHandle* handle = (LRUHandle*)(lc.data_set->cache->Insert(hkey, block, 1, &BlockCacheImpl::BlockDeleter));
+                assert(handle != NULL);
+                handle->cache_id = block->cache_block_idx;
+                lc.data_set->cache->Release((Cache::Handle*)handle);
+            }
+            delete db_it;
+
+            mu_.Lock();
+            data_set_map_[lc.sid] = lc.data_set;
+            mu_.Unlock();
+        }
+
+        mu_.Lock();
+        if (--w->wait_num == 0) {
+            lock_key_.erase(key);
+            Log("[%s] put done %s, no wait thread\n",
+                this->WorkPath().c_str(),
+                key.c_str());
+            delete w;
+        } else {
+            Log("[%s] put done %s, signal all wait thread\n",
+                this->WorkPath().c_str(),
+                key.c_str());
+            w->done = true;
+            w->cv.SignalAll();
+        }
+    }
+    return s;
+}
+
+const std::string& BlockCacheImpl::WorkPath() {  // used as the "[%s]" prefix of this cache's log lines
+    return work_path_;  // assigned from options_.cache_dir in LoadCache()
+}
+
+// Opens the metadata DB under <cache_dir>/meta/ and restores the file-id counters.
+// Returns the DB::Open error when the DB cannot be opened, OK otherwise.
+Status BlockCacheImpl::LoadCache() {
+    // open meta file
+    work_path_ = options_.cache_dir;
+    std::string dbname = options_.cache_dir + "/meta/";
+    options_.opts.env = options_.cache_env; // local write
+    options_.opts.filter_policy = NewBloomFilterPolicy(10);
+    options_.opts.block_cache = leveldb::NewLRUCache(options_.meta_block_cache_size * 1024UL * 1024);
+    options_.opts.table_cache = new leveldb::TableCache(options_.meta_table_cache_size * 1024UL * 1024);
+    options_.opts.write_buffer_size = options_.write_buffer_size;
+    options_.opts.info_log = Logger::DefaultLogger();
+    Log("[block_cache %s] open meta db: block_cache: %lu, table_cache: %lu\n",
+        dbname.c_str(),
+        options_.meta_block_cache_size,
+        options_.meta_table_cache_size);
+    Status s = DB::Open(options_.opts, dbname, &db_);
+    if (!s.ok()) {  // assert() vanishes under NDEBUG; never go on to use an unopened db_
+        return s;
+    }
+
+    // recover fid
+    std::string key = "FID#";
+    std::string val;
+    ReadOptions r_opts;
+    s = db_->Get(r_opts, key, &val);
+    // a missing or truncated record restarts the persisted counter at 0
+    prev_fid_ = (s.ok() && val.size() >= sizeof(uint64_t)) ? DecodeFixed64(val.data()) : 0;
+    new_fid_ = prev_fid_ + options_.fid_batch_num;
+    Log("[block_cache %s]: reuse block cache: prev_fid: %lu, new_fid: %lu\n",
+        dbname.c_str(), prev_fid_, new_fid_);
+    return Status::OK();
+}
+
+// Writes block->data_block to the block's slot in its dataset's cache file.
+// mu_ is taken only to look up the dataset fd; the pwrite itself runs unlocked.
+Status BlockCacheImpl::FillCache(CacheBlock* block) {
+    MutexLock l(&mu_);
+    uint64_t sid = block->sid;
+    uint64_t cache_block_idx = block->cache_block_idx;
+    int fd = (data_set_map_[sid])->fd;
+    mu_.Unlock();
+
+    // do io without lock
+    ssize_t res = pwrite(fd, block->data_block.data(), block->data_block.size(),
+                         cache_block_idx * options_.block_size);
+    Log("[%s] fillcache: sid %lu, dataset.fd %d, datablock size %lu, cb_idx %lu, %s, res %ld\n",
+        this->WorkPath().c_str(), sid, fd, block->data_block.size(),
+        cache_block_idx, block->ToString().c_str(), res);
+
+    mu_.Lock();  // re-acquire so the guard's release on return stays balanced
+    if (res != static_cast<ssize_t>(block->data_block.size())) {  // error or short write
+        return Status::Corruption("FillCache error");
+    }
+    return Status::OK();
+}
+
+// Reads the block's slot of its dataset's cache file into block->data_block.
+// mu_ is taken only to look up the dataset fd; the pread itself runs unlocked.
+Status BlockCacheImpl::ReadCache(CacheBlock* block) {
+    MutexLock l(&mu_);
+    uint64_t sid = block->sid;
+    uint64_t cache_block_idx = block->cache_block_idx;
+    int fd = (data_set_map_[sid])->fd;
+    mu_.Unlock();
+
+    // do io without lock
+    ssize_t res = pread(fd, (char*)block->data_block.data(), block->data_block.size(),
+                         cache_block_idx * options_.block_size);
+    Log("[%s] readcache: sid %lu, dataset.fd %d, datablock size %lu, cb_idx %lu, %s, res %ld\n",
+        this->WorkPath().c_str(), sid, fd, block->data_block.size(),
+        cache_block_idx, block->ToString().c_str(), res);
+
+    mu_.Lock();  // re-acquire so the guard's release on return stays balanced
+    if (res != static_cast<ssize_t>(block->data_block.size())) {  // error or short read
+        return Status::Corruption("ReadCache error");
+    }
+    return Status::OK();
+}
+
+// Hands out the next file id. Once the in-memory counter is fid_batch_num or more
+// ahead of prev_fid_, the current counter is stored under "FID#" via LockAndPut.
+uint64_t BlockCacheImpl::AllocFileId() {
+    mu_.AssertHeld();
+    const uint64_t allocated = ++new_fid_;
+    while (new_fid_ - prev_fid_ >= options_.fid_batch_num) {
+        std::string fid_key = "FID#";
+        std::string encoded_fid;
+        PutFixed64(&encoded_fid, new_fid_);
+        std::string stored;
+
+        LockContent request;
+        request.type = kDBKey;
+        request.db_lock_key = fid_key;
+        request.db_lock_val = encoded_fid;
+        request.db_val = &stored;
+        // NOTE(review): when LockAndPut fails, prev_fid_ is not advanced here and the loop retries -- confirm it cannot spin.
+        if (LockAndPut(request).ok()) {
+            prev_fid_ = DecodeFixed64(stored.c_str());
+        }
+        Log("[%s] alloc fid: key %s, new_fid: %lu, prev_fid: %lu\n",
+            this->WorkPath().c_str(), fid_key.c_str(), new_fid_, prev_fid_);
+    }
+    return allocated;
+}
+
+// Maps a file name to its numeric file id, allocating a new id and storing it under
+// "FNAME#"+fname when the meta DB lookup fails. mu_ is held on entry and on return.
+uint64_t BlockCacheImpl::FileId(const std::string& fname) {
+    mu_.AssertHeld();
+    uint64_t fid = 0;
+    std::string key = "FNAME#" + fname;
+    mu_.Unlock();
+
+    ReadOptions r_opts;
+    std::string val;
+    Status s = db_->Get(r_opts, key, &val);
+    if (!s.ok()) { // not exist
+        MutexLock l(&mu_);
+        fid = AllocFileId();
+        std::string v;
+        PutFixed64(&val, fid);
+
+        LockContent lc;
+        lc.type = kDBKey;
+        lc.db_lock_key = key;
+        lc.db_lock_val = val;
+        lc.db_val = &v;
+        Log("[%s] alloc fid: %lu, key: %s\n",
+            this->WorkPath().c_str(), fid, key.c_str());
+        s = LockAndPut(lc);
+        assert(s.ok());
+        fid = DecodeFixed64(v.c_str());  // the value actually stored for this name
+    } else { // fid in cache
+        fid = DecodeFixed64(val.c_str());
+    }
+    Log("[%s] Fid: %lu, fname: %s\n",
+        this->WorkPath().c_str(), fid, fname.c_str());
+
+    mu_.Lock();
+    return fid;
+}
+
+// Returns the DataSet registered for `sid`; when none is registered yet it is
+// obtained through LockAndPut. mu_ must be held by the caller.
+DataSet* BlockCacheImpl::GetDataSet(uint64_t sid) {
+    mu_.AssertHeld();
+
+    DataSetMap::iterator found = data_set_map_.find(sid);
+    if (found != data_set_map_.end()) {
+        Log("[%s] get dataset from memcache, sid %lu\n",
+            this->WorkPath().c_str(), sid);
+        return found->second;
+    }
+
+    LockContent request;
+    request.type = kDataSetKey;
+    request.sid = sid;
+    request.data_set = NULL;
+    Status s = LockAndPut(request);
+    return request.data_set;
+}
+
+// Looks up the CacheBlock for (fid, block_idx), creating and inserting it when it is
+// not cached yet, and records the cache handle on the block. mu_ must be held.
+CacheBlock* BlockCacheImpl::GetAndAllocBlock(uint64_t fid, uint64_t block_idx) {
+    mu_.AssertHeld();
+    std::string lookup_key;
+    PutFixed64(&lookup_key, fid);
+    PutFixed64(&lookup_key, block_idx);
+    const uint32_t hash = Hash(lookup_key.c_str(), lookup_key.size(), 7);
+    const uint64_t sid = hash % options_.dataset_num;  // dataset this block belongs to
+    Log("[%s] alloc block, try get dataset, fid: %lu, block_idx: %lu, hash: %u, sid %lu, dataset_num: %lu\n",
+        this->WorkPath().c_str(), fid, block_idx, hash, sid, options_.dataset_num);
+    Cache* cache = GetDataSet(sid)->cache; // get and alloc ds
+    CacheBlock* block = NULL;
+    LRUHandle* h = (LRUHandle*)cache->Lookup(lookup_key);
+    if (h != NULL) {
+        block = reinterpret_cast<CacheBlock*>(cache->Value((Cache::Handle*)h));
+        Log("[%s] get block from memcache, %s\n",
+            this->WorkPath().c_str(), block->ToString().c_str());
+    } else {
+        block = new CacheBlock(&mu_);
+        h = (LRUHandle*)cache->Insert(lookup_key, block, 1, &BlockCacheImpl::BlockDeleter);
+        assert(h != NULL);  // NOTE(review): Insert can return NULL when every entry is still referenced
+        block->fid = fid;
+        block->block_idx = block_idx;
+        block->sid = sid;
+        block->cache_block_idx = h->cache_id;
+        Log("[%s] new blockcache: %s\n", this->WorkPath().c_str(), block->ToString().c_str());
+        assert(block->state != kCacheBlockValid);
+    }
+    block->handle = h;
+    return block;
+}
+
+Status BlockCacheImpl::LogRecord(CacheBlock* block) {  // stores block->Encode() under "DS#"+sid+cache_block_idx
+    std::string record_key("DS#");
+    PutFixed64(&record_key, block->sid);
+    PutFixed64(&record_key, block->cache_block_idx);
+    leveldb::WriteBatch update;
+    update.Put(record_key, block->Encode());
+    return db_->Write(leveldb::WriteOptions(), &update);
+}
+
+// Drops the caller's reference on `block`, wakes waiters and rewrites its meta record; mu_ is released around the DB write.
+Status BlockCacheImpl::ReleaseBlock(CacheBlock* block) {
+    mu_.AssertHeld();
+    std::string key = "DS#";
+    PutFixed64(&key, block->sid);
+    PutFixed64(&key, block->cache_block_idx);
+    leveldb::WriteBatch batch;
+    batch.Put(key, block->Encode());
+
+    LRUHandle* h = block->handle;
+    DataSet* ds = GetDataSet(block->sid); // get and alloc ds
+    block->ReleaseDataBlock();
+    block->handle = NULL;
+    block->cv.SignalAll();
+    // Log first: after Release() and Unlock() the cache may evict and delete `block`.
+    Log("[%s] release block: %s\n", this->WorkPath().c_str(), block->ToString().c_str());
+    ds->cache->Release((Cache::Handle*)h);
+    mu_.Unlock();
+    // TODO: dump meta into memtable
+    Status s = db_->Write(leveldb::WriteOptions(), &batch);
+    mu_.Lock();
+    return s;
+}
+
+}  // namespace leveldb
+
diff --git a/src/leveldb/util/cache.cc b/src/leveldb/util/cache.cc
index 6eab478a1..99c2dfa90 100644
--- a/src/leveldb/util/cache.cc
+++ b/src/leveldb/util/cache.cc
@@ -25,31 +25,6 @@ namespace {
 
 // LRU cache implementation
 
-// An entry is a variable length heap-allocated structure.  Entries
-// are kept in a circular doubly linked list ordered by access time.
-struct LRUHandle {
-  void* value;
-  void (*deleter)(const Slice&, void* value);
-  LRUHandle* next_hash;
-  LRUHandle* next;
-  LRUHandle* prev;
-  size_t charge;      // TODO(opt): Only allow uint32_t?
-  size_t key_length;
-  uint32_t refs;
-  uint32_t hash;      // Hash of key(); used for fast sharding and comparisons
-  char key_data[1];   // Beginning of key
-
-  Slice key() const {
-    // For cheaper lookups, we allow a temporary Handle object
-    // to store a pointer to a key in "value".
-    if (next == this) {
-      return *(reinterpret_cast<Slice*>(value));
-    } else {
-      return Slice(key_data, key_length);
-    }
-  }
-};
-
 // We provide our own simple hash table since it removes a whole bunch
 // of porting hacks and is also faster than some of the built-in hash
 // table implementations in some of the compiler/runtime combinations
@@ -286,6 +261,170 @@ size_t LRUCache::TotalCharge() {
   return usage_;
 }
 
+class LRU2QCache: public Cache {
+ public:
+  explicit LRU2QCache(size_t capacity)
+    : capacity_(capacity),
+      usage_(0) {
+     // Make empty circular linked list
+    lru_.next = &lru_;
+    lru_.prev = &lru_;
+  }
+
+  ~LRU2QCache() {}
+
+  // Inserts key->value and returns a handle holding one reference, or NULL when
+  // the key is already cached or the cache is full and every entry is still
+  // referenced by a caller. Each entry carries a slot number (cache_id): fresh
+  // slots are handed out until capacity_ is reached, then the slot of the oldest
+  // unreferenced entry is recycled. `charge` is ignored (every entry counts 1).
+  Cache::Handle* Insert(const Slice& key, void* value, size_t charge,
+                         void (*deleter)(const Slice& key, void* value)) {
+    const uint32_t hash = HashSlice(key);
+    MutexLock l(&mutex_);
+    if (table_.Lookup(key, hash) != NULL) {
+      return reinterpret_cast<Cache::Handle*>(NULL);
+    }
+
+    if (usage_ < capacity_) { // cache not full: hand out a fresh slot
+      LRUHandle* e = NewHandle(key, hash, value, deleter, usage_);
+      Link(e);
+      return reinterpret_cast<Cache::Handle*>(e);
+    }
+
+    // cache full, reuse item
+    for (LRUHandle* old = lru_.next; old != &lru_; old = old->next) {
+      if (old->refs > 1) {
+        continue;  // still referenced by a caller
+      }
+      LRUHandle* e = NewHandle(key, hash, value, deleter, old->cache_id);
+      LRU_Remove(old);
+      table_.Remove(old->key(), old->hash);
+      Unref(old);  // frees `old` and drops usage_ by one; Link() adds it back
+      Link(e);
+      return reinterpret_cast<Cache::Handle*>(e);
+    }
+    // TODO: try wait finish
+    return reinterpret_cast<Cache::Handle*>(NULL);
+  }
+
+  Cache::Handle* Lookup(const Slice& key) {
+    const uint32_t hash = HashSlice(key);
+    MutexLock l(&mutex_);
+    LRUHandle* e = table_.Lookup(key, hash);
+    if (e != NULL) {
+        e->refs++;
+        LRU_Remove(e);
+        LRU_Append(e);
+    }
+    return reinterpret_cast<Cache::Handle*>(e);
+  }
+
+  void Erase(const Slice& key) {
+    const uint32_t hash = HashSlice(key);
+    MutexLock l(&mutex_);
+    LRUHandle* e = table_.Remove(key, hash);
+    if (e != NULL) {
+      LRU_Remove(e);
+      Unref(e);
+    }
+  }
+
+  void Release(Cache::Handle* handle) {
+    MutexLock l(&mutex_);
+    Unref(reinterpret_cast<LRUHandle*>(handle));
+  }
+
+  void* Value(Cache::Handle* handle) {
+    return reinterpret_cast<LRUHandle*>(handle)->value;
+  }
+
+  uint64_t NewId() {
+    return 0;
+  }
+
+  double HitRate(bool force_clear = false) {
+    return 99.9999;
+  }
+
+  size_t Entries() {
+    MutexLock l(&mutex_);
+    return usage_;
+  }
+
+  size_t TotalCharge() {
+    MutexLock l(&mutex_);
+    return usage_;
+  }
+
+ private:
+  // Allocates a handle for `key`; refs starts at 2 and the slot number is `cache_id`.
+  static LRUHandle* NewHandle(const Slice& key, uint32_t hash, void* value,
+                              void (*deleter)(const Slice&, void*), uint64_t cache_id) {
+    LRUHandle* e = reinterpret_cast<LRUHandle*>(malloc(sizeof(LRUHandle)-1 + key.size()));
+    e->value = value;
+    e->deleter = deleter;
+    e->charge = 1;
+    e->key_length = key.size();
+    e->hash = hash;
+    e->refs = 2;  // One from LRUCache, one for the returned handle
+    e->cache_id = cache_id;
+    memcpy(e->key_data, key.data(), key.size());
+    return e;
+  }
+
+  // Adds a new entry to the hash table and the LRU list and counts it in usage_.
+  void Link(LRUHandle* e) {
+    LRUHandle* displaced = table_.Insert(e);  // kept out of assert(): NDEBUG would drop the call
+    assert(displaced == NULL);
+    (void)displaced;
+    LRU_Append(e);
+    usage_++;
+  }
+
+  void LRU_Remove(LRUHandle* e) {
+    e->next->prev = e->prev;
+    e->prev->next = e->next;
+  }
+
+  void LRU_Append(LRUHandle* e) {
+    // Make "e" newest entry by inserting just before lru_
+    e->next = &lru_;
+    e->prev = lru_.prev;
+    e->prev->next = e;
+    e->next->prev = e;
+  }
+
+  void Unref(LRUHandle* e) {
+    assert(e->refs > 0);
+    e->refs--;
+    if (e->refs <= 0) {
+      usage_ -= e->charge;
+      (*e->deleter)(e->key(), e->value);
+      free(e);
+    }
+  }
+
+  inline uint32_t HashSlice(const Slice& s) {
+    return Hash(s.data(), s.size(), 0);
+  }
+
+  // Initialized before use.
+  size_t capacity_;
+
+  // mutex_ protects the following state.
+  port::Mutex mutex_;
+  size_t usage_;
+
+  // Dummy head of LRU list.
+  // lru.prev is newest entry, lru.next is oldest entry.
+  //LRUHandle hot_lru_;
+  //LRUHandle cold_lru_;
+  LRUHandle lru_;
+
+  HandleTable table_;
+};
+
 static const int kNumShardBits = 4;
 static const int kNumShards = 1 << kNumShardBits;
 
@@ -382,4 +521,8 @@ Cache* NewLRUCache(size_t capacity) {
   return new ShardedLRUCache(capacity);
 }
 
+Cache* New2QCache(size_t capacity) {  // capacity is a number of entries: LRU2QCache counts every entry as 1
+  return new LRU2QCache(capacity);
+}
+
 }  // namespace leveldb
diff --git a/src/leveldb/util/coding_test.cc b/src/leveldb/util/coding_test.cc
index fc8fbf5c9..17848377b 100644
--- a/src/leveldb/util/coding_test.cc
+++ b/src/leveldb/util/coding_test.cc
@@ -219,6 +219,17 @@ TEST(Coding, PutLG_ugly) {
     ASSERT_EQ(a_slice.ToString(), b_slice.ToString());
 }
 
+TEST(Coding, PutFixed64Cmp) {  // compares two fixed64 encodings as strings, then checks the decode round trip
+    std::string sa, sb;
+    PutFixed64(&sa, 100);
+    PutFixed64(&sb, 50);
+    ASSERT_TRUE(sa > sb);  // NOTE(review): presumably holds only because both values fit in one byte -- confirm the byte order
+    uint64_t a = DecodeFixed64(sa.c_str());
+    uint64_t b = DecodeFixed64(sb.c_str());
+    ASSERT_TRUE(a == 100);
+    ASSERT_TRUE(b == 50);
+}
+
 }  // namespace leveldb
 
 int main(int argc, char** argv) {
diff --git a/src/sdk/sdk_zk.cc b/src/sdk/sdk_zk.cc
index e08bb6c9b..5f7b8c8f6 100644
--- a/src/sdk/sdk_zk.cc
+++ b/src/sdk/sdk_zk.cc
@@ -60,9 +60,6 @@ std::string ClusterFinder::ClusterId() {
     std::string name = Name();
     std::string authority = Authority();
     std::string path = Path();
-    if (name.empty() || authority.empty() || path.empty()) {
-        LOG(FATAL) << "cluster name/authority/path must be non-empty";
-    }
     std::string cluster_id = name + "://" + authority;
     if (path[0] != '/') {
         cluster_id += "/";
diff --git a/src/tabletnode/tabletnode_impl.cc b/src/tabletnode/tabletnode_impl.cc
index c472b9732..4d7919fd6 100644
--- a/src/tabletnode/tabletnode_impl.cc
+++ b/src/tabletnode/tabletnode_impl.cc
@@ -14,9 +14,11 @@
 
 #include "db/filename.h"
 #include "db/table_cache.h"
+#include "common/base/string_ext.h"
 #include "common/thread.h"
 #include "io/io_utils.h"
 #include "io/utils_leveldb.h"
+#include "leveldb/block_cache.h"
 #include "leveldb/cache.h"
 #include "leveldb/env_cache.h"
 #include "leveldb/env_dfs.h"
@@ -68,7 +70,7 @@ DECLARE_string(tera_tabletnode_path_prefix);
 
 // cache-related
 DECLARE_int32(tera_memenv_block_cache_size);
-DECLARE_bool(tera_tabletnode_cache_enabled);
+DECLARE_bool(tera_tabletnode_block_cache_enabled);
 DECLARE_string(tera_tabletnode_cache_paths);
 DECLARE_int32(tera_tabletnode_cache_block_size);
 DECLARE_string(tera_tabletnode_cache_name);
@@ -150,11 +152,7 @@ TabletNodeImpl::TabletNodeImpl()
     sysinfo_.SetProcessStartTime(get_micros());
 }
 
-TabletNodeImpl::~TabletNodeImpl() {
-    if (FLAGS_tera_tabletnode_cache_enabled) {
-        leveldb::ThreeLevelCacheEnv::RemoveCachePaths();
-    }
-}
+TabletNodeImpl::~TabletNodeImpl() {}
 
 bool TabletNodeImpl::Init() {
     if (FLAGS_tera_zk_enabled) {
@@ -179,32 +177,32 @@ bool TabletNodeImpl::Init() {
 }
 
 void TabletNodeImpl::InitCacheSystem() {
-    if (!FLAGS_tera_tabletnode_cache_enabled) {
-        // compitable with legacy FlashEnv
-        leveldb::FlashEnv* flash_env = (leveldb::FlashEnv*)io::LeveldbFlashEnv();
-        flash_env->SetFlashPath(FLAGS_tera_tabletnode_cache_paths,
-                                FLAGS_tera_io_cache_path_vanish_allowed);
-        flash_env->SetUpdateFlashThreadNumber(FLAGS_tera_tabletnode_cache_update_thread_num);
-        flash_env->SetIfForceReadFromCache(FLAGS_tera_tabletnode_cache_force_read_from_cache);
-        return;
-    }
+    if (FLAGS_tera_tabletnode_block_cache_enabled) {
+        LOG(INFO) << "Tcache: set flash path: " << FLAGS_tera_tabletnode_cache_paths;
+        std::vector<std::string> path_list;
+        SplitString(FLAGS_tera_tabletnode_cache_paths, ";", &path_list);
+
+        leveldb::Env* posix_env = leveldb::Env::Default();
+        for (uint32_t i = 0; i < path_list.size(); ++i) {
+            posix_env->CreateDir(path_list[i]);
+        }
 
-    LOG(INFO) << "activate new cache system";
-    // new cache mechanism
-    leveldb::ThreeLevelCacheEnv::SetCachePaths(FLAGS_tera_tabletnode_cache_paths);
-    leveldb::ThreeLevelCacheEnv::s_mem_cache_size_in_KB_ = FLAGS_tera_tabletnode_cache_mem_size;
-    leveldb::ThreeLevelCacheEnv::s_disk_cache_size_in_MB_ = FLAGS_tera_tabletnode_cache_disk_size;
-    leveldb::ThreeLevelCacheEnv::s_block_size_ = FLAGS_tera_tabletnode_cache_block_size;
-    leveldb::ThreeLevelCacheEnv::s_disk_cache_file_num_ = FLAGS_tera_tabletnode_cache_disk_filenum;
-    leveldb::ThreeLevelCacheEnv::s_disk_cache_file_name_ = FLAGS_tera_tabletnode_cache_name;
-
-    if (FLAGS_tera_tabletnode_cache_log_level < 3) {
-        LEVELDB_SET_LOG_LEVEL(WARNING);
-    } else if (FLAGS_tera_tabletnode_cache_log_level < 4) {
-        LEVELDB_SET_LOG_LEVEL(INFO);
-    } else {
-        LEVELDB_SET_LOG_LEVEL(DEBUG);
+        LOG(INFO) << "activate Tcache system";
+        leveldb::Env* block_cache_env = io::DefaultBlockCacheEnv();
+        for (uint32_t i = 0; i < path_list.size(); ++i) {
+            leveldb::BlockCacheOptions opts;
+            LOG(INFO) << "load cache: " << path_list[i];
+            reinterpret_cast<leveldb::BlockCacheEnv*>(block_cache_env)->LoadCache(opts, path_list[i] + "/block_cache/");
+        }
+        return;
     }
+    // compitable with legacy FlashEnv
+    leveldb::FlashEnv* flash_env = (leveldb::FlashEnv*)io::LeveldbFlashEnv();
+    flash_env->SetFlashPath(FLAGS_tera_tabletnode_cache_paths,
+            FLAGS_tera_io_cache_path_vanish_allowed);
+    flash_env->SetUpdateFlashThreadNumber(FLAGS_tera_tabletnode_cache_update_thread_num);
+    flash_env->SetIfForceReadFromCache(FLAGS_tera_tabletnode_cache_force_read_from_cache);
+    return;
 }
 
 bool TabletNodeImpl::Exit() {
@@ -1070,7 +1068,7 @@ void TabletNodeImpl::UpdateMetaTableCallback(const SplitTabletRequest* rpc_reque
  * ------------------------------------------
  */
 void TabletNodeImpl::GarbageCollect() {
-    if (FLAGS_tera_tabletnode_cache_enabled) {
+    if (FLAGS_tera_tabletnode_block_cache_enabled) {
         return;
     }
     int64_t start_ms = get_micros();
diff --git a/src/tera_flags.cc b/src/tera_flags.cc
index 59d7b7a9e..6a0a14ce5 100644
--- a/src/tera_flags.cc
+++ b/src/tera_flags.cc
@@ -64,6 +64,7 @@ DEFINE_int32(tera_leveldb_env_dfs_seek_latency, 10000000, "the random access lat
 DEFINE_int32(tera_memenv_table_cache_size, 100, "the max open file number in leveldb table_cache");
 DEFINE_int32(tera_memenv_block_cache_size, 10000, "block cache size for leveldb which do not use share block cache");
 DEFINE_bool(tera_use_flash_for_memenv, true, "Use flashenv for memery lg");
+DEFINE_int32(tera_leveldb_block_cache_env_num_thread, 30, "thread num of Tcache");
 
 DEFINE_string(tera_leveldb_compact_strategy, "default", "the default strategy to drive consum compaction, should be [default|LG|dummy]");
 DEFINE_bool(tera_leveldb_verify_checksums, true, "enable verify data read from storage against checksums");
@@ -201,7 +202,7 @@ DEFINE_string(tera_tabletnode_cpu_affinity_set, "1,2", "the cpu set of cpu affin
 DEFINE_bool(tera_tabletnode_hang_detect_enabled, false, "enable detect read/write hang");
 DEFINE_int32(tera_tabletnode_hang_detect_threshold, 60000, "read/write hang detect threshold (in ms)");
 
-DEFINE_bool(tera_tabletnode_cache_enabled, false, "enable three-level cache mechasism");
+DEFINE_bool(tera_tabletnode_block_cache_enabled, true, "enable Tcache mechasism");
 DEFINE_string(tera_tabletnode_cache_paths, "../data/cache/", "paths for cached data storage. Mutiple definition like: \"./path1/;./path2/\"");
 DEFINE_int32(tera_tabletnode_cache_block_size, 8192, "the block size of cache system");
 DEFINE_string(tera_tabletnode_cache_name, "tera.cache", "prefix name for cache name");

From 3256bd290b4e8c6cf8806b3006ecdc7a4a591d35 Mon Sep 17 00:00:00 2001
From: caijieming <caijieming@baidu.com>
Date: Mon, 7 Aug 2017 20:35:04 +0800
Subject: [PATCH 02/19] issue=1258, Tcache support block-level cache evict

---
 src/leveldb/util/block_cache.cc | 131 ++++++++++++++++++++------------
 1 file changed, 84 insertions(+), 47 deletions(-)

diff --git a/src/leveldb/util/block_cache.cc b/src/leveldb/util/block_cache.cc
index 8401cb1c0..c9ae949be 100644
--- a/src/leveldb/util/block_cache.cc
+++ b/src/leveldb/util/block_cache.cc
@@ -49,13 +49,17 @@ class BlockCacheImpl;
 
 // Each SSD will New a BlockCache
 // block state
-uint64_t kCacheBlockValid = 1;
+uint64_t kCacheBlockValid = 0x1;
+uint64_t kCacheBlockLocked = 0x2;
+uint64_t kCacheBlockDfsRead = 0x4;
+uint64_t kCacheBlockCacheRead = 0x8;
+uint64_t kCacheBlockCacheFill = 0x10;
 struct CacheBlock {
     uint64_t fid;
     uint64_t block_idx;
     uint64_t sid;
     uint64_t cache_block_idx;
-    uint64_t state;
+    volatile uint64_t state;
     port::CondVar cv;
     Slice data_block;
     bool data_block_alloc;
@@ -68,13 +72,31 @@ struct CacheBlock {
       block_idx(0),
       sid(0xffffffffffffffff),
       cache_block_idx(0xffffffffffffffff),
-      state(!kCacheBlockValid),
+      state(0),
       cv(mu),
       data_block_alloc(false),
       data_block_refs(0),
       handle(NULL) {
     }
 
+    bool Test(uint64_t c_state) { // true only when every bit of c_state is set in state
+        return (state & c_state) == c_state;
+    }
+
+    void Clear(uint64_t c_state) { // turns off every bit of c_state in state
+        state &= ~c_state;
+    }
+
+    void Set(uint64_t c_state) { // turns on every bit of c_state in state
+        state |= c_state;
+    }
+
+    void WaitOnClear(uint64_t c_state) { // access in lock; returns once Test(c_state) is false
+        while (Test(c_state)) { // re-checked after every wakeup; ends as soon as any bit of c_state is cleared
+            cv.Wait();
+        }
+    }
+
     // access in cache lock
     void GetDataBlock(uint64_t block_size, Slice data) {
         if (data_block_refs == 0) { // first one alloc mem
@@ -122,7 +144,7 @@ struct CacheBlock {
 
     const std::string ToString() {
         std::stringstream ss;
-        ss << "CacheBlock: fid: " << fid << ", block_idx: " << block_idx
+        ss << "CacheBlock(" << (uint64_t)this << "): fid: " << fid << ", block_idx: " << block_idx
            << ", sid: " << sid << ", cache_block_idx: " << cache_block_idx
            << ", state " << state;
         return ss.str();
@@ -339,10 +361,13 @@ class BlockCacheWriteBuffer {
 
     uint32_t NumFullBlock() { // use for BGFlush
         MutexLock l(&mu_);
-        if (block_list_.size() > 1) {
+        if (block_list_.size() == 0) {
+            return 0;
+        } else if ((block_list_.back())->size() < block_size_) {
             return block_list_.size() - 1;
+        } else {
+            return block_list_.size();
         }
-        return 0;
     }
 
     Status Append(const Slice& data) {
@@ -389,10 +414,14 @@ class BlockCacheWriteBuffer {
 
     std::string* PopFrontBlock(uint64_t* block_idx) {
         MutexLock l(&mu_);
-        std::string* block = block_list_.front();
         if (block_list_.size() == 0) {
             return NULL;
         }
+        std::string* block = block_list_.front();
+        assert(block->size() <= block_size_);
+        if (block->size() != block_size_) {
+            return NULL;
+        }
         block_list_.pop_front();
         *block_idx = block_idx_;
         block_idx_++;
@@ -431,6 +460,7 @@ class BlockCacheWritableFile : public WritableFile {
         : cache_(c),
           bg_cv_(&c->mu_),
           bg_block_flush_(0),
+          pending_block_num_(0),
           write_buffer_(cache_->WorkPath(), fname, cache_->options_.block_size),
           fname_(fname) { // file open
         *s = cache_->dfs_env_->NewWritableFile(fname_, &dfs_file_);
@@ -454,11 +484,9 @@ class BlockCacheWritableFile : public WritableFile {
         MutexLock lockgard(&cache_->mu_);
         uint64_t block_idx;
         std::string* block_data = write_buffer_.PopBackBlock(&block_idx);
-        if (block_data == NULL) {
-            Log("[%s] end release(nothing) %s\n", cache_->WorkPath().c_str(), fname_.c_str());
-            return;
+        if (block_data != NULL) {
+            FillCache(block_data, block_idx);
         }
-        FillCache(block_data, block_idx);
 
         while (bg_block_flush_ > 0) {
             bg_cv_.Wait();
@@ -492,11 +520,9 @@ class BlockCacheWritableFile : public WritableFile {
         MutexLock lockgard(&cache_->mu_);
         uint64_t block_idx;
         std::string* block_data = write_buffer_.PopBackBlock(&block_idx);
-        if (block_data == NULL) {
-            Log("[%s] end close state error: %s\n", cache_->WorkPath().c_str(), fname_.c_str());
-            return s;
+        if (block_data != NULL) {
+            FillCache(block_data, block_idx);
         }
-        FillCache(block_data, block_idx);
 
         while (bg_block_flush_ > 0) {
             bg_cv_.Wait();
@@ -524,7 +550,7 @@ class BlockCacheWritableFile : public WritableFile {
             fname_.c_str(),
             bg_block_flush_,
             write_buffer_.NumFullBlock());
-        while (bg_block_flush_ < write_buffer_.NumFullBlock()) {
+        while (bg_block_flush_ < (write_buffer_.NumFullBlock() + pending_block_num_)) {
             bg_block_flush_++;
             cache_->bg_flush_.Schedule(&BlockCacheWritableFile::BGFlushFunc, this, 10);
         }
@@ -536,14 +562,13 @@ class BlockCacheWritableFile : public WritableFile {
     void BGFlush() {
         Log("[%s] begin BGFlush: %s\n", cache_->WorkPath().c_str(), fname_.c_str());
         MutexLock lockgard(&cache_->mu_);
-        if (write_buffer_.NumFullBlock() == 0) {
-            return;
-        }
-
         uint64_t block_idx;
         std::string* block_data = write_buffer_.PopFrontBlock(&block_idx);
-        assert(block_data != NULL);
-        FillCache(block_data, block_idx);
+        if (block_data != NULL) {
+            pending_block_num_++;
+            FillCache(block_data, block_idx);
+            pending_block_num_--;
+        }
 
         bg_block_flush_--;
         MaybeScheduleBGFlush();
@@ -555,7 +580,7 @@ class BlockCacheWritableFile : public WritableFile {
         cache_->mu_.AssertHeld();
         uint64_t fid = cache_->FileId(fname_);
         CacheBlock* block = cache_->GetAndAllocBlock(fid, block_idx);
-        assert(block->state != kCacheBlockValid);
+        block->state = 0;
         block->GetDataBlock(cache_->options_.block_size, Slice(*block_data));
         cache_->mu_.Unlock();
 
@@ -577,6 +602,7 @@ class BlockCacheWritableFile : public WritableFile {
     WritableFile* dfs_file_;
     // protected by cache_.mu_
     uint32_t bg_block_flush_;
+    uint32_t pending_block_num_;
     BlockCacheWriteBuffer write_buffer_;
     std::string fname_;
 };
@@ -622,13 +648,17 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
             block->GetDataBlock(cache_->options_.block_size, Slice());
             block_queue.push_back(block); // sort by block_idx
 
-            if (block->state != kCacheBlockValid && block->handle->refs == 2) { // first one access this block
-                c_miss.push_back(block);
-            } else if (block->state == kCacheBlockValid && block->handle->refs == 2) { // frist one access this block
+            if (!block->Test(kCacheBlockLocked) &&
+                block->Test(kCacheBlockValid)) {
+                block->Set(kCacheBlockLocked | kCacheBlockCacheRead);
                 c_valid.push_back(block);
+            } else if (!block->Test(kCacheBlockLocked)) {
+                block->Set(kCacheBlockLocked | kCacheBlockDfsRead);
+                c_miss.push_back(block);
             } else {
                 c_locked.push_back(block);
             }
+
             Log("[%s] queue block: %s, refs %u, data_block_refs %lu, alloc %u\n",
                 cache_->WorkPath().c_str(), block->ToString().c_str(),
                 block->handle->refs, block->data_block_refs,
@@ -660,11 +690,25 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
             cache_->bg_read_.Schedule(&BlockCacheRandomAccessFile::AsyncCacheRead, reader, 10);
         }
 
+        // wait async cache read done
+        for (uint32_t i = 0; i < c_valid.size(); ++i) {
+            MutexLock lockgard(&cache_->mu_);
+            CacheBlock* block = c_valid[i];
+            block->WaitOnClear(kCacheBlockCacheRead);
+            block->Set(kCacheBlockValid);
+            block->Clear(kCacheBlockLocked);
+            block->cv.SignalAll();
+            Log("[%s] pread in valid list(done), %s\n",
+                cache_->WorkPath().c_str(),
+                block->ToString().c_str());
+        }
+
         // wait dfs read done and async cache file
         for (uint32_t i = 0; i < c_miss.size(); ++i) {
             MutexLock lockgard(&cache_->mu_);
             CacheBlock* block = c_miss[i];
-            block->cv.Wait();
+            block->WaitOnClear(kCacheBlockDfsRead);
+            block->Set(kCacheBlockCacheFill);
             Log("[%s] pread in miss list(dfs done), %s\n",
                 cache_->WorkPath().c_str(),
                 block->ToString().c_str());
@@ -684,29 +728,21 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
         for (uint32_t i = 0; i < c_miss.size(); ++i) { // wait cache fill finish
             MutexLock lockgard(&cache_->mu_);
             CacheBlock* block = c_miss[i];
-            block->cv.Wait();
+            block->WaitOnClear(kCacheBlockCacheFill);
+            block->Set(kCacheBlockValid);
+            block->Clear(kCacheBlockLocked);
+            block->cv.SignalAll();
             Log("[%s] pread in miss list(fill cache done), %s\n",
                 cache_->WorkPath().c_str(),
                 block->ToString().c_str());
         }
 
-        // wait cache read done
-        for (uint32_t i = 0; i < c_valid.size(); ++i) {
-            MutexLock lockgard(&cache_->mu_);
-            CacheBlock* block = c_valid[i];
-            block->cv.Wait();
-            Log("[%s] pread in valid list(done), %s\n",
-                cache_->WorkPath().c_str(),
-                block->ToString().c_str());
-        }
-
         // wait other async read finish
         for (uint32_t i = 0; i < c_locked.size(); ++i) {
             MutexLock lockgard(&cache_->mu_);
             CacheBlock* block = c_locked[i];
-            while (block->state != kCacheBlockValid) {
-                block->cv.Wait();
-            }
+            block->WaitOnClear(kCacheBlockLocked);
+            assert((block->state & kCacheBlockValid) == kCacheBlockValid);
         }
 
         // fill user mem
@@ -735,19 +771,16 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
         cache_->mu_.Lock();
         for (uint32_t i = 0; i < c_miss.size(); ++i) {
             CacheBlock* block = c_miss[i];
-            block->state = kCacheBlockValid;
             Log("[%s] wakeup for miss, %s\n", cache_->WorkPath().c_str(), block->ToString().c_str());
             cache_->ReleaseBlock(block);
         }
         for (uint32_t i = 0; i < c_valid.size(); ++i) {
             CacheBlock* block = c_valid[i];
-            block->state = kCacheBlockValid;
             Log("[%s] wakeup for valid, %s\n", cache_->WorkPath().c_str(), block->ToString().c_str());
             cache_->ReleaseBlock(block);
         }
         for (uint32_t i = 0; i < c_locked.size(); ++i) {
             CacheBlock* block = c_locked[i];
-            block->state = kCacheBlockValid;
             Log("[%s] wakeup for lock, %s\n", cache_->WorkPath().c_str(), block->ToString().c_str());
             cache_->ReleaseBlock(block);
         }
@@ -785,6 +818,7 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
             s.ToString().c_str(), result.size());
 
         MutexLock lockgard(&cache_->mu_);
+        block->Clear(kCacheBlockDfsRead);
         block->cv.SignalAll();
         return;
     }
@@ -804,7 +838,10 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
         cache_->ReadCache(block);
 
         MutexLock lockgard(&cache_->mu_);
+        block->Clear(kCacheBlockCacheRead);
         block->cv.SignalAll();
+        //Log("[%s] async.cacheread signal, %s\n", cache_->WorkPath().c_str(),
+        //    block->ToString().c_str());
         return;
     }
 
@@ -827,6 +864,7 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
         cache_->FillCache(block);
 
         MutexLock lockgard(&cache_->mu_);
+        block->Clear(kCacheBlockCacheFill);
         block->cv.SignalAll();
         return;
     }
@@ -987,6 +1025,7 @@ Status BlockCacheImpl::LockAndPut(LockContent& lc) {
                 LRUHandle* handle = (LRUHandle*)(lc.data_set->cache->Insert(hkey, block, 1, &BlockCacheImpl::BlockDeleter));
                 assert(handle != NULL);
                 handle->cache_id = block->cache_block_idx;
+                block->handle = handle;
                 lc.data_set->cache->Release((Cache::Handle*)handle);
             }
             delete db_it;
@@ -1204,14 +1243,13 @@ CacheBlock* BlockCacheImpl::GetAndAllocBlock(uint64_t fid, uint64_t block_idx) {
        block->block_idx = block_idx;
        block->sid = sid;
        block->cache_block_idx = h->cache_id;
+       block->handle = h;
        Log("[%s] new blockcache: %s\n", this->WorkPath().c_str(), block->ToString().c_str());
-       assert(block->state != kCacheBlockValid);
     } else {
         block = reinterpret_cast<CacheBlock*>(cache->Value((Cache::Handle*)h));
         Log("[%s] get block from memcache, %s\n",
             this->WorkPath().c_str(), block->ToString().c_str());
     }
-    block->handle = h;
     return block;
 }
 
@@ -1236,7 +1274,6 @@ Status BlockCacheImpl::ReleaseBlock(CacheBlock* block) {
     LRUHandle* h = block->handle;
     DataSet* ds = GetDataSet(block->sid); // get and alloc ds
     block->ReleaseDataBlock();
-    block->handle = NULL;
     block->cv.SignalAll();
     ds->cache->Release((Cache::Handle*)h);
     mu_.Unlock();

From c5e3f83587ee66c15ccf5688cb81de305fb56e48 Mon Sep 17 00:00:00 2001
From: caijieming <caijieming@baidu.com>
Date: Mon, 7 Aug 2017 23:22:20 +0800
Subject: [PATCH 03/19] issue=1258, Tcache support block-level cache evict

---
 src/leveldb/util/block_cache.cc | 132 ++++++++++++++++++++------------
 1 file changed, 82 insertions(+), 50 deletions(-)

diff --git a/src/leveldb/util/block_cache.cc b/src/leveldb/util/block_cache.cc
index c9ae949be..ee99c8cb5 100644
--- a/src/leveldb/util/block_cache.cc
+++ b/src/leveldb/util/block_cache.cc
@@ -37,7 +37,7 @@ namespace leveldb {
 /////////////////////////////////////////////
 uint64_t kBlockSize = 4096UL;
 uint64_t kDataSetSize = 134217728UL;
-uint64_t kFidBatchNum = 200000UL;
+uint64_t kFidBatchNum = 100000UL;
 uint64_t kCacheSize = 350000000000UL;
 uint64_t kMetaBlockSize = 2000UL;
 uint64_t kMetaTableSize = 500UL;
@@ -234,6 +234,39 @@ class BlockCacheImpl {
         // data set id
         uint64_t sid;
         DataSet* data_set;
+
+        const std::string Encode() { // builds the key string for this entry from type (db_lock_key or "DS#" + sid)
+            if (type == kDBKey) {
+                return db_lock_key.ToString();
+            } else if (type == kDataSetKey) {
+                std::string key = "DS#";
+                PutFixed64(&key, sid); // NOTE(review): presumably appends sid in fixed-width encoded form, not as decimal text — confirm
+                return key;
+            }
+            return ""; // type is neither kDBKey nor kDataSetKey
+        }
+
+        const std::string KeyToString() { // text form of the key; sid is streamed as a number after "DS#"
+            if (type == kDBKey) {
+                return db_lock_key.ToString();
+            } else if (type == kDataSetKey) {
+                std::stringstream ss;
+                ss << "DS#" << sid;
+                return ss.str();
+            } else {
+                return ""; // type is neither kDBKey nor kDataSetKey
+            }
+        }
+
+        const std::string ValToString() { // text form of db_lock_val decoded as one 64-bit number; "" unless type is kDBKey
+            if (type == kDBKey) {
+                uint64_t val = DecodeFixed64(db_lock_val.data()); // NOTE(review): no length check here — assumes db_lock_val holds at least 8 bytes; confirm
+                std::stringstream ss;
+                ss << val;
+                return ss.str();
+            }
+            return "";
+        }
     };
     typedef std::map<uint64_t, DataSet*> DataSetMap;
     DataSetMap data_set_map_;
@@ -923,17 +956,12 @@ Status BlockCacheImpl::LockAndPut(LockContent& lc) {
     mu_.AssertHeld();
     Status s;
     std::string key;
-    if (lc.type == kDBKey) {
-        key = lc.db_lock_key.ToString();
-    } else if (lc.type == kDataSetKey) {
-        key = "DS#";
-        PutFixed64(&key, lc.sid);
-    } else {
+    if ((key = lc.Encode()) == "") {
         return Status::NotSupported("key type error");
     }
-    Log("[%s] trylock key: %s\n",
-        this->WorkPath().c_str(),
-        key.c_str());
+    //Log("[%s] trylock key: %s\n",
+    //    this->WorkPath().c_str(),
+    //    key.c_str());
 
     Waiter* w = NULL;
     LockKeyMap::iterator it = lock_key_.find(key);
@@ -948,30 +976,30 @@ Status BlockCacheImpl::LockAndPut(LockContent& lc) {
         if (lc.type == kDBKey) {
             ReadOptions r_opts;
             s = db_->Get(r_opts, key, lc.db_val);
-            Log("[%s] get lock key: %s, val: %s, status: %s\n",
-                this->WorkPath().c_str(),
-                key.c_str(),
-                lc.db_val->c_str(),
-                s.ToString().c_str());
+            //Log("[%s] get lock key: %s, val: %s, status: %s\n",
+            //    this->WorkPath().c_str(),
+            //    key.c_str(),
+            //    lc.db_val->c_str(),
+            //    s.ToString().c_str());
         } else if (lc.type == kDataSetKey) {
             lc.data_set = data_set_map_[lc.sid];
-            Log("[%s] get dataset sid: %lu\n",
-                this->WorkPath().c_str(),
-                lc.sid);
+            //Log("[%s] get dataset sid: %lu\n",
+            //    this->WorkPath().c_str(),
+            //    lc.sid);
         }
 
         mu_.Lock();
         if (--w->wait_num == 0) {
             // last thread wait for open
             lock_key_.erase(key);
-            Log("[%s] wait done %s, delete cv\n",
-                this->WorkPath().c_str(),
-                key.c_str());
+            //Log("[%s] wait done %s, delete cv\n",
+            //    this->WorkPath().c_str(),
+            //    key.c_str());
             delete w;
         } else {
-            Log("[%s] wait done %s, not last\n",
-                this->WorkPath().c_str(),
-                key.c_str());
+            //Log("[%s] wait done %s, not last\n",
+            //    this->WorkPath().c_str(),
+            //    key.c_str());
         }
     } else {
         w = new Waiter(&mu_);
@@ -985,9 +1013,10 @@ Status BlockCacheImpl::LockAndPut(LockContent& lc) {
             if (s.ok()) {
                 lc.db_val->append(lc.db_lock_val.data(), lc.db_lock_val.size());
             }
-            Log("[%s] put kDBKey: %s, status %s\n",
+            Log("[%s] Insert db key : %s, val %s, status %s\n",
                 this->WorkPath().c_str(),
-                key.c_str(),
+                lc.KeyToString().c_str(),
+                lc.ValToString().c_str(),
                 s.ToString().c_str());
         } else if (lc.type == kDataSetKey) {
             std::string end_ds = "DS#";
@@ -997,9 +1026,10 @@ Status BlockCacheImpl::LockAndPut(LockContent& lc) {
             std::string file = options_.cache_dir + "/" + Uint64ToString(lc.sid);
             lc.data_set->fd = open(file.c_str(), O_RDWR | O_CREAT, 0644);
             assert(lc.data_set->fd > 0);
-            Log("[%s] begin new dataset, sid: %lu, file: %s, cs: %lu, fd: %d\n",
+            Log("[%s] New DataSet %s, file: %s, nr_block: %lu, fd: %d\n",
                 this->WorkPath().c_str(),
-                lc.sid, file.c_str(), (options_.dataset_size / options_.block_size) + 1,
+                lc.KeyToString().c_str(),
+                file.c_str(), (options_.dataset_size / options_.block_size) + 1,
                 lc.data_set->fd);
 
             // reload hash lru
@@ -1019,8 +1049,10 @@ Status BlockCacheImpl::LockAndPut(LockContent& lc) {
                 PutFixed64(&hkey, block->block_idx);
                 block->sid = lc.sid;
                 block->cache_block_idx = DecodeFixed64(lkey.data());
-                Log("[%s] insert cacheblock into 2QLru, %s\n",
+                block->state = (block->Test(kCacheBlockValid)) ? kCacheBlockValid : 0;
+                Log("[%s] Recovery %s, insert cacheblock into 2QLru, %s\n",
                     this->WorkPath().c_str(),
+                    lc.KeyToString().c_str(),
                     block->ToString().c_str());
                 LRUHandle* handle = (LRUHandle*)(lc.data_set->cache->Insert(hkey, block, 1, &BlockCacheImpl::BlockDeleter));
                 assert(handle != NULL);
@@ -1038,14 +1070,14 @@ Status BlockCacheImpl::LockAndPut(LockContent& lc) {
         mu_.Lock();
         if (--w->wait_num == 0) {
             lock_key_.erase(key);
-            Log("[%s] put done %s, no wait thread\n",
-                this->WorkPath().c_str(),
-                key.c_str());
+            //Log("[%s] put done %s, no wait thread\n",
+            //    this->WorkPath().c_str(),
+            //    key.c_str());
             delete w;
         } else {
-            Log("[%s] put done %s, signal all wait thread\n",
-                this->WorkPath().c_str(),
-                key.c_str());
+            //Log("[%s] put done %s, signal all wait thread\n",
+            //    this->WorkPath().c_str(),
+            //    key.c_str());
             w->done = true;
             w->cv.SignalAll();
         }
@@ -1184,9 +1216,9 @@ uint64_t BlockCacheImpl::FileId(const std::string& fname) {
         lc.db_lock_key = key;
         lc.db_lock_val = val;
         lc.db_val = &v;
-        Log("[%s] alloc fid: %lu, key: %s",
-            this->WorkPath().c_str(),
-            fid, key.c_str());
+        //Log("[%s] alloc fid: %lu, key: %s",
+        //    this->WorkPath().c_str(),
+        //    fid, key.c_str());
         s = LockAndPut(lc);
         assert(s.ok());
         fid = DecodeFixed64(v.c_str());
@@ -1214,8 +1246,8 @@ DataSet* BlockCacheImpl::GetDataSet(uint64_t sid) {
         Status s = LockAndPut(lc);
         set = lc.data_set;
     } else {
-        Log("[%s] get dataset from memcache, sid %lu\n",
-            this->WorkPath().c_str(), sid);
+        //Log("[%s] get dataset from memcache, sid %lu\n",
+        //    this->WorkPath().c_str(), sid);
         set = it->second;
     }
     return set;
@@ -1236,19 +1268,19 @@ CacheBlock* BlockCacheImpl::GetAndAllocBlock(uint64_t fid, uint64_t block_idx) {
     Cache* cache = ds->cache;
     LRUHandle* h = (LRUHandle*)cache->Lookup(key);
     if (h == NULL) {
-       block = new CacheBlock(&mu_);
-       h = (LRUHandle*)cache->Insert(key, block, 1, &BlockCacheImpl::BlockDeleter);
-       assert(h != NULL);
-       block->fid = fid;
-       block->block_idx = block_idx;
-       block->sid = sid;
-       block->cache_block_idx = h->cache_id;
-       block->handle = h;
-       Log("[%s] new blockcache: %s\n", this->WorkPath().c_str(), block->ToString().c_str());
+        block = new CacheBlock(&mu_);
+        h = (LRUHandle*)cache->Insert(key, block, 1, &BlockCacheImpl::BlockDeleter);
+        assert(h != NULL);
+        block->fid = fid;
+        block->block_idx = block_idx;
+        block->sid = sid;
+        block->cache_block_idx = h->cache_id;
+        block->handle = h;
+        Log("[%s] new blockcache: %s\n", this->WorkPath().c_str(), block->ToString().c_str());
     } else {
         block = reinterpret_cast<CacheBlock*>(cache->Value((Cache::Handle*)h));
         Log("[%s] get block from memcache, %s\n",
-            this->WorkPath().c_str(), block->ToString().c_str());
+                this->WorkPath().c_str(), block->ToString().c_str());
     }
     return block;
 }

From 9b00baa49086f165c8d2f9ae88fff5b14ed23f70 Mon Sep 17 00:00:00 2001
From: caijieming <caijieming@baidu.com>
Date: Tue, 8 Aug 2017 01:19:42 +0800
Subject: [PATCH 04/19] issue=1258, Tcache support block-level cache evict

---
 src/leveldb/util/block_cache.cc | 126 ++++++++++++++++++--------------
 1 file changed, 71 insertions(+), 55 deletions(-)

diff --git a/src/leveldb/util/block_cache.cc b/src/leveldb/util/block_cache.cc
index ee99c8cb5..10a1e61b7 100644
--- a/src/leveldb/util/block_cache.cc
+++ b/src/leveldb/util/block_cache.cc
@@ -54,6 +54,7 @@ uint64_t kCacheBlockLocked = 0x2;
 uint64_t kCacheBlockDfsRead = 0x4;
 uint64_t kCacheBlockCacheRead = 0x8;
 uint64_t kCacheBlockCacheFill = 0x10;
+
 struct CacheBlock {
     uint64_t fid;
     uint64_t block_idx;
@@ -193,7 +194,7 @@ class BlockCacheImpl {
 
     Status LogRecord(CacheBlock* block);
 
-    Status ReleaseBlock(CacheBlock* block);
+    Status ReleaseBlock(CacheBlock* block, bool need_sync);
 
 private:
     friend class BlockCacheWritableFile;
@@ -441,7 +442,7 @@ class BlockCacheWriteBuffer {
             path_.c_str(),
             file_.c_str(),
             begin, end,
-            offset_, data.size(), block_size_);
+            offset_ - data.size() , data.size(), block_size_);
         return Status::OK();
     }
 
@@ -502,6 +503,9 @@ class BlockCacheWritableFile : public WritableFile {
             fname.c_str(),
             cache_->options_.block_size,
             s->ToString().c_str());
+
+        MutexLock lockgard(&cache_->mu_);
+        fid_ = cache_->FileId(fname_);
         return;
     }
 
@@ -578,11 +582,11 @@ class BlockCacheWritableFile : public WritableFile {
 private:
     void MaybeScheduleBGFlush() {
         cache_->mu_.AssertHeld();
-        Log("[%s] Maybe schedule BGFlush: %s, bg_block_flush: %u, block_nr: %u\n",
-            cache_->WorkPath().c_str(),
-            fname_.c_str(),
-            bg_block_flush_,
-            write_buffer_.NumFullBlock());
+        //Log("[%s] Maybe schedule BGFlush: %s, bg_block_flush: %u, block_nr: %u\n",
+        //    cache_->WorkPath().c_str(),
+        //    fname_.c_str(),
+        //    bg_block_flush_,
+        //    write_buffer_.NumFullBlock());
         while (bg_block_flush_ < (write_buffer_.NumFullBlock() + pending_block_num_)) {
             bg_block_flush_++;
             cache_->bg_flush_.Schedule(&BlockCacheWritableFile::BGFlushFunc, this, 10);
@@ -593,7 +597,7 @@ class BlockCacheWritableFile : public WritableFile {
         reinterpret_cast<BlockCacheWritableFile*>(arg)->BGFlush();
     }
     void BGFlush() {
-        Log("[%s] begin BGFlush: %s\n", cache_->WorkPath().c_str(), fname_.c_str());
+        Log("[%s] Begin BGFlush: %s\n", cache_->WorkPath().c_str(), fname_.c_str());
         MutexLock lockgard(&cache_->mu_);
         uint64_t block_idx;
         std::string* block_data = write_buffer_.PopFrontBlock(&block_idx);
@@ -611,7 +615,7 @@ class BlockCacheWritableFile : public WritableFile {
 
     Status FillCache(std::string* block_data, uint64_t block_idx) {
         cache_->mu_.AssertHeld();
-        uint64_t fid = cache_->FileId(fname_);
+        uint64_t fid = fid_;
         CacheBlock* block = cache_->GetAndAllocBlock(fid, block_idx);
         block->state = 0;
         block->GetDataBlock(cache_->options_.block_size, Slice(*block_data));
@@ -623,7 +627,7 @@ class BlockCacheWritableFile : public WritableFile {
 
         cache_->mu_.Lock();
         block->state = kCacheBlockValid;
-        cache_->ReleaseBlock(block);
+        cache_->ReleaseBlock(block, true);
         write_buffer_.ReleaseBlock(block_data);
         return Status::OK();
     }
@@ -638,6 +642,7 @@ class BlockCacheWritableFile : public WritableFile {
     uint32_t pending_block_num_;
     BlockCacheWriteBuffer write_buffer_;
     std::string fname_;
+    uint64_t fid_;
 };
 
 class BlockCacheRandomAccessFile : public RandomAccessFile {
@@ -651,6 +656,9 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
             fname.c_str(),
             cache_->options_.block_size,
             s->ToString().c_str());
+
+        MutexLock lockgard(&cache_->mu_);
+        fid_ = cache_->FileId(fname_);
         return;
     }
 
@@ -661,20 +669,22 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
 
     Status Read(uint64_t offset, size_t n, Slice* result,
                 char* scratch) const {
-        MutexLock lockgard(&cache_->mu_);
-        uint64_t fid = cache_->FileId(fname_);
+        Status s;
         uint64_t begin = offset / cache_->options_.block_size;
         uint64_t end = (offset + n) / cache_->options_.block_size;
         assert(begin <= end);
+        uint64_t fid = fid_;
         std::vector<CacheBlock*> c_miss;
         std::vector<CacheBlock*> c_locked;
         std::vector<CacheBlock*> c_valid;
         std::vector<CacheBlock*> block_queue;
 
-        Log("[%s] begin pread %s, size %lu, offset %lu, fid %lu, start_block %lu, end_block %lu"
+        Log("[%s] Begin Pread %s, size %lu, offset %lu, fid %lu, start_block %lu, end_block %lu"
             ", block_size %lu\n",
             cache_->WorkPath().c_str(), fname_.c_str(), n, offset, fid,
             begin, end, cache_->options_.block_size);
+
+        MutexLock lockgard(&cache_->mu_);
         for (uint64_t block_idx = begin; block_idx <= end; ++block_idx) {
             CacheBlock* block = cache_->GetAndAllocBlock(fid, block_idx);
             assert(block->fid == fid && block->block_idx == block_idx);
@@ -692,7 +702,7 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
                 c_locked.push_back(block);
             }
 
-            Log("[%s] queue block: %s, refs %u, data_block_refs %lu, alloc %u\n",
+            Log("[%s] Queue block: %s, refs %u, data_block_refs %lu, alloc %u\n",
                 cache_->WorkPath().c_str(), block->ToString().c_str(),
                 block->handle->refs, block->data_block_refs,
                 block->data_block_alloc);
@@ -705,9 +715,9 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
             AsyncDfsReader* reader = new AsyncDfsReader;
             reader->file = const_cast<BlockCacheRandomAccessFile*>(this);
             reader->block = block;
-            Log("[%s] pread in miss list, %s\n",
-                cache_->WorkPath().c_str(),
-                block->ToString().c_str());
+            //Log("[%s] pread in miss list, %s\n",
+            //    cache_->WorkPath().c_str(),
+            //    block->ToString().c_str());
             cache_->bg_read_.Schedule(&BlockCacheRandomAccessFile::AsyncDfsRead, reader, 10);
         }
 
@@ -717,9 +727,9 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
             AsyncCacheReader* reader = new AsyncCacheReader;
             reader->file = const_cast<BlockCacheRandomAccessFile*>(this);
             reader->block = block;
-            Log("[%s] pread in valid list, %s\n",
-                cache_->WorkPath().c_str(),
-                block->ToString().c_str());
+            //Log("[%s] pread in valid list, %s\n",
+            //    cache_->WorkPath().c_str(),
+            //    block->ToString().c_str());
             cache_->bg_read_.Schedule(&BlockCacheRandomAccessFile::AsyncCacheRead, reader, 10);
         }
 
@@ -731,7 +741,7 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
             block->Set(kCacheBlockValid);
             block->Clear(kCacheBlockLocked);
             block->cv.SignalAll();
-            Log("[%s] pread in valid list(done), %s\n",
+            Log("[%s] cache read done, %s\n",
                 cache_->WorkPath().c_str(),
                 block->ToString().c_str());
         }
@@ -742,7 +752,7 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
             CacheBlock* block = c_miss[i];
             block->WaitOnClear(kCacheBlockDfsRead);
             block->Set(kCacheBlockCacheFill);
-            Log("[%s] pread in miss list(dfs done), %s\n",
+            Log("[%s] dfs read done, %s\n",
                 cache_->WorkPath().c_str(),
                 block->ToString().c_str());
         }
@@ -752,9 +762,9 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
             AsyncCacheWriter* writer = new AsyncCacheWriter;
             writer->file = const_cast<BlockCacheRandomAccessFile*>(this);
             writer->block = block;
-            Log("[%s] pread in miss list(fill cache), %s\n",
-                cache_->WorkPath().c_str(),
-                block->ToString().c_str());
+            //Log("[%s] pread in miss list(fill cache), %s\n",
+            //    cache_->WorkPath().c_str(),
+            //    block->ToString().c_str());
             cache_->bg_fill_.Schedule(&BlockCacheRandomAccessFile::AsyncCacheWrite, writer, 10);
         }
 
@@ -765,7 +775,7 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
             block->Set(kCacheBlockValid);
             block->Clear(kCacheBlockLocked);
             block->cv.SignalAll();
-            Log("[%s] pread in miss list(fill cache done), %s\n",
+            Log("[%s] cache fill done, %s\n",
                 cache_->WorkPath().c_str(),
                 block->ToString().c_str());
         }
@@ -776,6 +786,9 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
             CacheBlock* block = c_locked[i];
             block->WaitOnClear(kCacheBlockLocked);
             assert((block->state & kCacheBlockValid) == kCacheBlockValid);
+            Log("[%s] wait locked done, %s\n",
+                cache_->WorkPath().c_str(),
+                block->ToString().c_str());
         }
 
         // fill user mem
@@ -791,7 +804,7 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
             }
             memcpy(scratch + msize, data_block.data(), data_block.size());
             msize += data_block.size();
-            Log("[%s] fill user data, %s, prefix %lu, suffix %lu, msize %lu, offset %lu\n",
+            Log("[%s] Fill user data, %s, prefix %lu, suffix %lu, msize %lu, offset %lu\n",
                 cache_->WorkPath().c_str(), fname_.c_str(),
                 block_idx == begin ? offset % cache_->options_.block_size: 0,
                 block_idx == end ? cache_->options_.block_size - (n + offset) % cache_->options_.block_size
@@ -804,25 +817,26 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
         cache_->mu_.Lock();
         for (uint32_t i = 0; i < c_miss.size(); ++i) {
             CacheBlock* block = c_miss[i];
-            Log("[%s] wakeup for miss, %s\n", cache_->WorkPath().c_str(), block->ToString().c_str());
-            cache_->ReleaseBlock(block);
+            //Log("[%s] wakeup for miss, %s\n", cache_->WorkPath().c_str(), block->ToString().c_str());
+            cache_->ReleaseBlock(block, true);
         }
         for (uint32_t i = 0; i < c_valid.size(); ++i) {
             CacheBlock* block = c_valid[i];
-            Log("[%s] wakeup for valid, %s\n", cache_->WorkPath().c_str(), block->ToString().c_str());
-            cache_->ReleaseBlock(block);
+            //Log("[%s] wakeup for valid, %s\n", cache_->WorkPath().c_str(), block->ToString().c_str());
+            cache_->ReleaseBlock(block, false);
         }
         for (uint32_t i = 0; i < c_locked.size(); ++i) {
             CacheBlock* block = c_locked[i];
-            Log("[%s] wakeup for lock, %s\n", cache_->WorkPath().c_str(), block->ToString().c_str());
-            cache_->ReleaseBlock(block);
+            //Log("[%s] wakeup for lock, %s\n", cache_->WorkPath().c_str(), block->ToString().c_str());
+            cache_->ReleaseBlock(block, false);
         }
 
-        Log("[%s] end pread %s, size %lu, offset %lu, fid %lu, start_block %lu, end_block %lu"
+        Log("[%s] End Pread %s, size %lu, offset %lu, fid %lu, res %lu, status %s, start_block %lu, end_block %lu"
             ", block_size %lu\n",
             cache_->WorkPath().c_str(), fname_.c_str(), n, offset, fid,
+            result->size(), s.ToString().c_str(),
             begin, end, cache_->options_.block_size);
-        return Status::OK();
+        return s;
     }
 
 private:
@@ -844,7 +858,7 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
         uint64_t offset = block->block_idx * cache_->options_.block_size;
         size_t n = cache_->options_.block_size;
         s = dfs_file_->Read(offset, n, &result, scratch);
-        Log("[%s] cache async.dfs read, %s"
+        Log("[%s] dfs read, %s"
             ", offset %lu, size %lu, status %s, res %lu\n",
             cache_->WorkPath().c_str(), block->ToString().c_str(),
             offset, n,
@@ -890,9 +904,9 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
     }
     void HandleCacheWrite(AsyncCacheWriter* writer) {
         CacheBlock* block = writer->block;
-        Log("[%s] handle cache write, %s\n",
-            cache_->WorkPath().c_str(),
-            block->ToString().c_str());
+        //Log("[%s] cache fill, %s\n",
+        //    cache_->WorkPath().c_str(),
+        //    block->ToString().c_str());
         cache_->LogRecord(block);
         cache_->FillCache(block);
 
@@ -906,6 +920,7 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
     BlockCacheImpl* cache_;
     RandomAccessFile* dfs_file_;
     std::string fname_;
+    uint64_t fid_;
 };
 
 // Tcache impl
@@ -1133,7 +1148,7 @@ Status BlockCacheImpl::FillCache(CacheBlock* block) {
     // do io without lock
     ssize_t res = pwrite(fd, block->data_block.data(), block->data_block.size(),
                          cache_block_idx * options_.block_size);
-    Log("[%s] fillcache: sid %lu, dataset.fd %d, datablock size %lu, cb_idx %lu, %s, res %ld\n",
+    Log("[%s] cache fill: sid %lu, dataset.fd %d, datablock size %lu, cb_idx %lu, %s, res %ld\n",
         this->WorkPath().c_str(), sid, fd, block->data_block.size(),
         cache_block_idx,
         block->ToString().c_str(),
@@ -1156,7 +1171,7 @@ Status BlockCacheImpl::ReadCache(CacheBlock* block) {
     // do io without lock
     ssize_t res = pread(fd, (char*)block->data_block.data(), block->data_block.size(),
                          cache_block_idx * options_.block_size);
-    Log("[%s] readcache: sid %lu, dataset.fd %d, datablock size %lu, cb_idx %lu, %s, res %ld\n",
+    Log("[%s] cache read: sid %lu, dataset.fd %d, datablock size %lu, cb_idx %lu, %s, res %ld\n",
         this->WorkPath().c_str(), sid, fd, block->data_block.size(),
         cache_block_idx,
         block->ToString().c_str(),
@@ -1261,8 +1276,8 @@ CacheBlock* BlockCacheImpl::GetAndAllocBlock(uint64_t fid, uint64_t block_idx) {
     uint32_t hash = Hash(key.c_str(), key.size(), 7);
     uint64_t sid = hash % options_.dataset_num;
 
-    Log("[%s] alloc block, try get dataset, fid: %lu, block_idx: %lu, hash: %u, sid %lu, dataset_num: %lu\n",
-        this->WorkPath().c_str(), fid, block_idx, hash, sid, options_.dataset_num);
+    //Log("[%s] alloc block, try get dataset, fid: %lu, block_idx: %lu, hash: %u, sid %lu, dataset_num: %lu\n",
+    //    this->WorkPath().c_str(), fid, block_idx, hash, sid, options_.dataset_num);
     CacheBlock* block = NULL;
     DataSet* ds = GetDataSet(sid); // get and alloc ds
     Cache* cache = ds->cache;
@@ -1276,11 +1291,14 @@ CacheBlock* BlockCacheImpl::GetAndAllocBlock(uint64_t fid, uint64_t block_idx) {
         block->sid = sid;
         block->cache_block_idx = h->cache_id;
         block->handle = h;
-        Log("[%s] new blockcache: %s\n", this->WorkPath().c_str(), block->ToString().c_str());
+        Log("[%s] Alloc Block: %s, sid %lu, fid %lu, block_idx %lu, hash %u, dataset_nr %lu\n",
+            this->WorkPath().c_str(),
+            block->ToString().c_str(),
+            sid, fid, block_idx, hash, options_.dataset_num);
     } else {
         block = reinterpret_cast<CacheBlock*>(cache->Value((Cache::Handle*)h));
-        Log("[%s] get block from memcache, %s\n",
-                this->WorkPath().c_str(), block->ToString().c_str());
+        //Log("[%s] get block from memcache, %s\n",
+        //        this->WorkPath().c_str(), block->ToString().c_str());
     }
     return block;
 }
@@ -1294,26 +1312,24 @@ Status BlockCacheImpl::LogRecord(CacheBlock* block) {
     return db_->Write(leveldb::WriteOptions(), &batch);
 }
 
-Status BlockCacheImpl::ReleaseBlock(CacheBlock* block) {
+Status BlockCacheImpl::ReleaseBlock(CacheBlock* block, bool need_sync) {
     mu_.AssertHeld();
     Status s;
-    std::string key = "DS#";
-    PutFixed64(&key, block->sid);
-    PutFixed64(&key, block->cache_block_idx);
-    leveldb::WriteBatch batch;
-    batch.Put(key, block->Encode());
 
+    mu_.Unlock();
+    if (need_sync) {
+        s = LogRecord(block);
+    }
+
+    mu_.Lock();
     LRUHandle* h = block->handle;
     DataSet* ds = GetDataSet(block->sid); // get and alloc ds
     block->ReleaseDataBlock();
     block->cv.SignalAll();
     ds->cache->Release((Cache::Handle*)h);
-    mu_.Unlock();
 
     // TODO: dump meta into memtable
     Log("[%s] release block: %s\n", this->WorkPath().c_str(), block->ToString().c_str());
-    s = db_->Write(leveldb::WriteOptions(), &batch);
-    mu_.Lock();
     return s;
 }
 

From 170318fe6cad5becae33d4036cc082d25194e295 Mon Sep 17 00:00:00 2001
From: caijieming <caijieming@baidu.com>
Date: Tue, 8 Aug 2017 02:19:29 +0800
Subject: [PATCH 05/19] issue=1258, Tcache support block-level cache evict

---
 src/leveldb/util/block_cache.cc | 61 +++++++++++++++++++++++----------
 1 file changed, 43 insertions(+), 18 deletions(-)

diff --git a/src/leveldb/util/block_cache.cc b/src/leveldb/util/block_cache.cc
index 10a1e61b7..0b54e6df8 100644
--- a/src/leveldb/util/block_cache.cc
+++ b/src/leveldb/util/block_cache.cc
@@ -147,7 +147,7 @@ struct CacheBlock {
         std::stringstream ss;
         ss << "CacheBlock(" << (uint64_t)this << "): fid: " << fid << ", block_idx: " << block_idx
            << ", sid: " << sid << ", cache_block_idx: " << cache_block_idx
-           << ", state " << state;
+           << ", state " << state << ", status " << s.ToString();
         return ss.str();
     }
 };
@@ -615,6 +615,7 @@ class BlockCacheWritableFile : public WritableFile {
 
     Status FillCache(std::string* block_data, uint64_t block_idx) {
         cache_->mu_.AssertHeld();
+        Status s;
         uint64_t fid = fid_;
         CacheBlock* block = cache_->GetAndAllocBlock(fid, block_idx);
         block->state = 0;
@@ -622,14 +623,18 @@ class BlockCacheWritableFile : public WritableFile {
         cache_->mu_.Unlock();
 
         // Do io without lock
-        cache_->LogRecord(block);
-        cache_->FillCache(block);
+        block->s = cache_->LogRecord(block);
+        if (block->s.ok()) {
+            block->s = cache_->FillCache(block);
+        }
 
         cache_->mu_.Lock();
-        block->state = kCacheBlockValid;
-        cache_->ReleaseBlock(block, true);
+        if (block->s.ok()) {
+            block->state = kCacheBlockValid;
+        }
+        s = cache_->ReleaseBlock(block, true);
         write_buffer_.ReleaseBlock(block_data);
-        return Status::OK();
+        return s;
     }
 
 private:
@@ -738,7 +743,10 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
             MutexLock lockgard(&cache_->mu_);
             CacheBlock* block = c_valid[i];
             block->WaitOnClear(kCacheBlockCacheRead);
-            block->Set(kCacheBlockValid);
+            assert(block->Test(kCacheBlockValid));
+            if (!block->s.ok() && s.ok()) {
+                s = block->s; // degrade read
+            }
             block->Clear(kCacheBlockLocked);
             block->cv.SignalAll();
             Log("[%s] cache read done, %s\n",
@@ -752,6 +760,9 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
             CacheBlock* block = c_miss[i];
             block->WaitOnClear(kCacheBlockDfsRead);
             block->Set(kCacheBlockCacheFill);
+            if (!block->s.ok() && s.ok()) {
+                s = block->s; // degrade read
+            }
             Log("[%s] dfs read done, %s\n",
                 cache_->WorkPath().c_str(),
                 block->ToString().c_str());
@@ -772,7 +783,11 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
             MutexLock lockgard(&cache_->mu_);
             CacheBlock* block = c_miss[i];
             block->WaitOnClear(kCacheBlockCacheFill);
-            block->Set(kCacheBlockValid);
+            if (block->s.ok()) {
+                block->Set(kCacheBlockValid);
+            } else if (s.ok()) {
+                s = block->s; // degrade read
+            }
             block->Clear(kCacheBlockLocked);
             block->cv.SignalAll();
             Log("[%s] cache fill done, %s\n",
@@ -785,7 +800,6 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
             MutexLock lockgard(&cache_->mu_);
             CacheBlock* block = c_locked[i];
             block->WaitOnClear(kCacheBlockLocked);
-            assert((block->state & kCacheBlockValid) == kCacheBlockValid);
             Log("[%s] wait locked done, %s\n",
                 cache_->WorkPath().c_str(),
                 block->ToString().c_str());
@@ -804,8 +818,10 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
             }
             memcpy(scratch + msize, data_block.data(), data_block.size());
             msize += data_block.size();
-            Log("[%s] Fill user data, %s, prefix %lu, suffix %lu, msize %lu, offset %lu\n",
+            Log("[%s] Fill user data, %s, fill_offset %lu, fill_size %lu, prefix %lu, suffix %lu, msize %lu, offset %lu\n",
                 cache_->WorkPath().c_str(), fname_.c_str(),
+                block_idx * cache_->options_.block_size + (block_idx == begin ? offset % cache_->options_.block_size: 0),
+                data_block.size(),
                 block_idx == begin ? offset % cache_->options_.block_size: 0,
                 block_idx == end ? cache_->options_.block_size - (n + offset) % cache_->options_.block_size
                                  : cache_->options_.block_size,
@@ -831,6 +847,13 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
             cache_->ReleaseBlock(block, false);
         }
 
+        if (!s.ok()) {
+            s = dfs_file_->Read(offset, n, result, scratch);
+            Log("[%s] Pread degrade %s, offset %lu, size %lu, status %s\n",
+                cache_->WorkPath().c_str(), fname_.c_str(),
+                offset, n, s.ToString().c_str());
+        }
+
         Log("[%s] End Pread %s, size %lu, offset %lu, fid %lu, res %lu, status %s, start_block %lu, end_block %lu"
             ", block_size %lu\n",
             cache_->WorkPath().c_str(), fname_.c_str(), n, offset, fid,
@@ -857,12 +880,12 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
         Slice result;
         uint64_t offset = block->block_idx * cache_->options_.block_size;
         size_t n = cache_->options_.block_size;
-        s = dfs_file_->Read(offset, n, &result, scratch);
+        block->s = dfs_file_->Read(offset, n, &result, scratch);
         Log("[%s] dfs read, %s"
             ", offset %lu, size %lu, status %s, res %lu\n",
             cache_->WorkPath().c_str(), block->ToString().c_str(),
             offset, n,
-            s.ToString().c_str(), result.size());
+            block->s.ToString().c_str(), result.size());
 
         MutexLock lockgard(&cache_->mu_);
         block->Clear(kCacheBlockDfsRead);
@@ -882,7 +905,7 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
     }
     void HandleCacheRead(AsyncCacheReader* reader) {
         CacheBlock* block = reader->block;
-        cache_->ReadCache(block);
+        block->s = cache_->ReadCache(block);
 
         MutexLock lockgard(&cache_->mu_);
         block->Clear(kCacheBlockCacheRead);
@@ -907,8 +930,10 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
         //Log("[%s] cache fill, %s\n",
         //    cache_->WorkPath().c_str(),
         //    block->ToString().c_str());
-        cache_->LogRecord(block);
-        cache_->FillCache(block);
+        block->s = cache_->LogRecord(block);
+        if (block->s.ok()) {
+            block->s = cache_->FillCache(block);
+        }
 
         MutexLock lockgard(&cache_->mu_);
         block->Clear(kCacheBlockCacheFill);
@@ -1318,6 +1343,7 @@ Status BlockCacheImpl::ReleaseBlock(CacheBlock* block, bool need_sync) {
 
     mu_.Unlock();
     if (need_sync) {
+        // TODO: dump meta into memtable
         s = LogRecord(block);
     }
 
@@ -1325,11 +1351,10 @@ Status BlockCacheImpl::ReleaseBlock(CacheBlock* block, bool need_sync) {
     LRUHandle* h = block->handle;
     DataSet* ds = GetDataSet(block->sid); // get and alloc ds
     block->ReleaseDataBlock();
+    Log("[%s] release block: %s\n", this->WorkPath().c_str(), block->ToString().c_str());
+    block->s = Status::OK(); // clear io status
     block->cv.SignalAll();
     ds->cache->Release((Cache::Handle*)h);
-
-    // TODO: dump meta into memtable
-    Log("[%s] release block: %s\n", this->WorkPath().c_str(), block->ToString().c_str());
     return s;
 }
 

From a1e51d3bc7c11a3e31658d2f2f898e686a91d65b Mon Sep 17 00:00:00 2001
From: caijieming <caijieming@baidu.com>
Date: Tue, 8 Aug 2017 02:56:43 +0800
Subject: [PATCH 06/19] issue=1258, Tcache support block-level cache evict

---
 src/leveldb/util/block_cache.cc | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/src/leveldb/util/block_cache.cc b/src/leveldb/util/block_cache.cc
index 0b54e6df8..603ca1f0a 100644
--- a/src/leveldb/util/block_cache.cc
+++ b/src/leveldb/util/block_cache.cc
@@ -617,7 +617,14 @@ class BlockCacheWritableFile : public WritableFile {
         cache_->mu_.AssertHeld();
         Status s;
         uint64_t fid = fid_;
-        CacheBlock* block = cache_->GetAndAllocBlock(fid, block_idx);
+        CacheBlock* block = NULL;
+        while ((block = cache_->GetAndAllocBlock(fid, block_idx)) == NULL) {
+            Log("[%s] fill cache for write %s, fid %lu, block_idx %lu, wait 10ms after retry\n",
+                cache_->WorkPath().c_str(), fname_.c_str(),
+                fid, block_idx);
+            port::CondVar cv(&cache_->mu_);
+            cv.Wait(10); // timewait 10ms retry
+        }
         block->state = 0;
         block->GetDataBlock(cache_->options_.block_size, Slice(*block_data));
         cache_->mu_.Unlock();
@@ -691,7 +698,14 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
 
         MutexLock lockgard(&cache_->mu_);
         for (uint64_t block_idx = begin; block_idx <= end; ++block_idx) {
-            CacheBlock* block = cache_->GetAndAllocBlock(fid, block_idx);
+            CacheBlock* block = NULL;
+            while ((block = cache_->GetAndAllocBlock(fid, block_idx)) == NULL) {
+                Log("[%s] fill cache for read %s, fid %lu, block_idx %lu, wait 10ms after retry\n",
+                    cache_->WorkPath().c_str(), fname_.c_str(),
+                    fid, block_idx);
+                port::CondVar cv(&cache_->mu_);
+                cv.Wait(10); // timewait 10ms retry
+            }
             assert(block->fid == fid && block->block_idx == block_idx);
             block->GetDataBlock(cache_->options_.block_size, Slice());
             block_queue.push_back(block); // sort by block_idx
@@ -1310,7 +1324,10 @@ CacheBlock* BlockCacheImpl::GetAndAllocBlock(uint64_t fid, uint64_t block_idx) {
     if (h == NULL) {
         block = new CacheBlock(&mu_);
         h = (LRUHandle*)cache->Insert(key, block, 1, &BlockCacheImpl::BlockDeleter);
-        assert(h != NULL);
+        if (h == NULL) {
+            delete block;
+            return NULL;
+        }
         block->fid = fid;
         block->block_idx = block_idx;
         block->sid = sid;

From b74989698a45f819fa44f8dc43756e71e1b3f21f Mon Sep 17 00:00:00 2001
From: caijieming <caijieming@baidu.com>
Date: Tue, 22 Aug 2017 01:04:32 +0800
Subject: [PATCH 07/19] issue=1258, Tcache support block-level cache evict

bugfix:
1. cache reload core
2. support aio engine
3. cache fill TEST PASS
---
 src/io/utils_leveldb.cc                   |   8 +-
 src/leveldb/Makefile                      |   2 +-
 src/leveldb/include/leveldb/block_cache.h |   4 +-
 src/leveldb/include/leveldb/statistics.h  | 393 ++++++++++++++++++++++
 src/leveldb/util/block_cache.cc           | 281 ++++++++++++----
 src/leveldb/util/cache.cc                 |  17 +-
 src/leveldb/util/statistics.cc            | 115 +++++++
 src/tera_flags.cc                         |   2 +-
 8 files changed, 735 insertions(+), 87 deletions(-)
 create mode 100644 src/leveldb/include/leveldb/statistics.h
 create mode 100644 src/leveldb/util/statistics.cc

diff --git a/src/io/utils_leveldb.cc b/src/io/utils_leveldb.cc
index 3d3249e1d..c6d16e2a8 100644
--- a/src/io/utils_leveldb.cc
+++ b/src/io/utils_leveldb.cc
@@ -15,10 +15,10 @@
 #include "common/file/file_path.h"
 #include "common/mutex.h"
 #include "io/timekey_comparator.h"
+#include "leveldb/block_cache.h"
 #include "leveldb/comparator.h"
 #include "leveldb/env_dfs.h"
 #include "leveldb/env_flash.h"
-#include "leveldb/block_cache.h"
 #include "leveldb/env_inmem.h"
 #include "leveldb/env_mock.h"
 #include "leveldb/table_utils.h"
@@ -32,7 +32,7 @@ DECLARE_string(tera_leveldb_env_hdfs2_nameservice_list);
 DECLARE_string(tera_tabletnode_path_prefix);
 DECLARE_string(tera_dfs_so_path);
 DECLARE_string(tera_dfs_conf);
-DECLARE_int32(tera_leveldb_block_cache_env_num_thread);
+DECLARE_int32(tera_leveldb_block_cache_env_thread_num);
 
 namespace tera {
 namespace io {
@@ -73,8 +73,8 @@ static pthread_once_t block_cache_once = PTHREAD_ONCE_INIT;
 static leveldb::Env* default_block_cache_env;
 static void InitDefaultBlockCacheEnv() {
     default_block_cache_env = new leveldb::BlockCacheEnv(LeveldbBaseEnv());
-    default_block_cache_env->SetBackgroundThreads(FLAGS_tera_leveldb_block_cache_env_num_thread);
-    LOG(INFO) << "init block cache, thread num " << FLAGS_tera_leveldb_block_cache_env_num_thread;
+    default_block_cache_env->SetBackgroundThreads(FLAGS_tera_leveldb_block_cache_env_thread_num);
+    LOG(INFO) << "init block cache, thread num " << FLAGS_tera_leveldb_block_cache_env_thread_num;
 }
 
 leveldb::Env* DefaultBlockCacheEnv() {
diff --git a/src/leveldb/Makefile b/src/leveldb/Makefile
index c9162d2eb..9073e98a5 100644
--- a/src/leveldb/Makefile
+++ b/src/leveldb/Makefile
@@ -7,7 +7,7 @@
 # to switch between compilation modes.
 
 # OPT ?= -O2 -DNDEBUG       # (A) Production use (optimized mode)
-OPT ?= -g2 -Wall -Werror    # (B) Debug mode, w/ full line-level debugging symbols
+OPT ?= -std=gnu++11 -g2 -Wall -Werror    # (B) Debug mode, w/ full line-level debugging symbols
 # OPT ?= -O2 -g2 -DNDEBUG # (C) Profiling mode: opt, but w/debugging symbols
 #-----------------------------------------------
 
diff --git a/src/leveldb/include/leveldb/block_cache.h b/src/leveldb/include/leveldb/block_cache.h
index a48022bb7..021964db4 100644
--- a/src/leveldb/include/leveldb/block_cache.h
+++ b/src/leveldb/include/leveldb/block_cache.h
@@ -2,8 +2,8 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 
-#ifndef  STOREAGE_LEVELDB_UTIL_BLOCK_CACHE_H
-#define  STOREAGE_LEVELDB_UTIL_BLOCK_CACHE_H
+#ifndef  STOREAGE_LEVELDB_UTIL_BLOCK_CACHE_H_
+#define  STOREAGE_LEVELDB_UTIL_BLOCK_CACHE_H_
 
 #include "leveldb/env.h"
 #include "leveldb/options.h"
diff --git a/src/leveldb/include/leveldb/statistics.h b/src/leveldb/include/leveldb/statistics.h
new file mode 100644
index 000000000..81d4a4729
--- /dev/null
+++ b/src/leveldb/include/leveldb/statistics.h
@@ -0,0 +1,393 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef STORAGE_LEVELDB_INCLUDE_STATISTICS_H_
+#define STORAGE_LEVELDB_INCLUDE_STATISTICS_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <string>
+#include <vector>
+
+namespace leveldb {
+
+/**
+ * Keep adding tickers here.
+ *  1. Any ticker should be added before TICKER_ENUM_MAX.
+ *  2. Add a readable string in TickersNameMap below for the newly added ticker.
+ */
+enum Tickers : uint32_t {
+  // total block cache misses
+  // REQUIRES: BLOCK_CACHE_MISS == BLOCK_CACHE_INDEX_MISS +
+  //                               BLOCK_CACHE_FILTER_MISS +
+  //                               BLOCK_CACHE_DATA_MISS;
+  BLOCK_CACHE_MISS = 0,
+  // total block cache hit
+  // REQUIRES: BLOCK_CACHE_HIT == BLOCK_CACHE_INDEX_HIT +
+  //                              BLOCK_CACHE_FILTER_HIT +
+  //                              BLOCK_CACHE_DATA_HIT;
+  BLOCK_CACHE_HIT,
+  // # of blocks added to block cache.
+  BLOCK_CACHE_ADD,
+  // # of failures when adding blocks to block cache.
+  BLOCK_CACHE_ADD_FAILURES,
+  // # of times cache miss when accessing index block from block cache.
+  BLOCK_CACHE_INDEX_MISS,
+  // # of times cache hit when accessing index block from block cache.
+  BLOCK_CACHE_INDEX_HIT,
+  // # of bytes of index blocks inserted into cache
+  BLOCK_CACHE_INDEX_BYTES_INSERT,
+  // # of bytes of index block erased from cache
+  BLOCK_CACHE_INDEX_BYTES_EVICT,
+  // # of times cache miss when accessing filter block from block cache.
+  BLOCK_CACHE_FILTER_MISS,
+  // # of times cache hit when accessing filter block from block cache.
+  BLOCK_CACHE_FILTER_HIT,
+  // # of bytes of bloom filter blocks inserted into cache
+  BLOCK_CACHE_FILTER_BYTES_INSERT,
+  // # of bytes of bloom filter block erased from cache
+  BLOCK_CACHE_FILTER_BYTES_EVICT,
+  // # of times cache miss when accessing data block from block cache.
+  BLOCK_CACHE_DATA_MISS,
+  // # of times cache hit when accessing data block from block cache.
+  BLOCK_CACHE_DATA_HIT,
+  // # of bytes read from cache.
+  BLOCK_CACHE_BYTES_READ,
+  // # of bytes written into cache.
+  BLOCK_CACHE_BYTES_WRITE,
+
+  // # of times bloom filter has avoided file reads.
+  BLOOM_FILTER_USEFUL,
+
+  // # persistent cache hit
+  PERSISTENT_CACHE_HIT,
+  // # persistent cache miss
+  PERSISTENT_CACHE_MISS,
+
+  // # of memtable hits.
+  MEMTABLE_HIT,
+  // # of memtable misses.
+  MEMTABLE_MISS,
+
+  // # of Get() queries served by L0
+  GET_HIT_L0,
+  // # of Get() queries served by L1
+  GET_HIT_L1,
+  // # of Get() queries served by L2 and up
+  GET_HIT_L2_AND_UP,
+
+  /**
+   * COMPACTION_KEY_DROP_* count the reasons for key drop during compaction
+   * There are 3 reasons currently.
+   * Overwritten by a newer write; deleted; dropped by a user function.
+   */
+  COMPACTION_KEY_DROP_NEWER_ENTRY,  // key was written with a newer value.
+  COMPACTION_KEY_DROP_OBSOLETE,     // The key is obsolete.
+  COMPACTION_KEY_DROP_USER,  // user compaction function has dropped the key.
+
+  // Number of keys written to the database via the Put and Write call's
+  NUMBER_KEYS_WRITTEN,
+  // Number of Keys read,
+  NUMBER_KEYS_READ,
+  // Number keys updated, if inplace update is enabled
+  NUMBER_KEYS_UPDATED,
+  // The number of uncompressed bytes issued by DB::Put(), DB::Delete(),
+  // DB::Merge(), and DB::Write().
+  BYTES_WRITTEN,
+  // The number of uncompressed bytes read from DB::Get().  It could be
+  // either from memtables, cache, or table files.
+  // For the number of logical bytes read from DB::MultiGet(),
+  // please use NUMBER_MULTIGET_BYTES_READ.
+  BYTES_READ,
+  // The number of calls to seek/next/prev
+  NUMBER_DB_SEEK,
+  NUMBER_DB_NEXT,
+  NUMBER_DB_PREV,
+  // The number of calls to seek/next/prev that returned data
+  NUMBER_DB_SEEK_FOUND,
+  NUMBER_DB_NEXT_FOUND,
+  NUMBER_DB_PREV_FOUND,
+  // The number of uncompressed bytes read from an iterator.
+  // Includes size of key and value.
+  ITER_BYTES_READ,
+  NO_FILE_CLOSES,
+  NO_FILE_OPENS,
+  NO_FILE_ERRORS,
+  // DEPRECATED Time system had to wait to do L0-L1 compactions
+  STALL_L0_SLOWDOWN_MICROS,
+  // DEPRECATED Time system had to wait to move memtable to L1.
+  STALL_MEMTABLE_COMPACTION_MICROS,
+  // DEPRECATED write throttle because of too many files in L0
+  STALL_L0_NUM_FILES_MICROS,
+  // Writer has to wait for compaction or flush to finish.
+  STALL_MICROS,
+  // The wait time for db mutex.
+  // Disabled by default. To enable it set stats level to kAll
+  DB_MUTEX_WAIT_MICROS,
+  RATE_LIMIT_DELAY_MILLIS,
+  NO_ITERATORS,  // number of iterators currently open
+
+  // Number of MultiGet calls, keys read, and bytes read
+  NUMBER_MULTIGET_CALLS,
+  NUMBER_MULTIGET_KEYS_READ,
+  NUMBER_MULTIGET_BYTES_READ,
+
+  // Number of delete records that were not required to be
+  // written to storage because the key does not exist
+  NUMBER_FILTERED_DELETES,
+  NUMBER_MERGE_FAILURES,
+  SEQUENCE_NUMBER,
+
+  // number of times bloom was checked before creating iterator on a
+  // file, and the number of times the check was useful in avoiding
+  // iterator creation (and thus likely IOPs).
+  BLOOM_FILTER_PREFIX_CHECKED,
+  BLOOM_FILTER_PREFIX_USEFUL,
+
+  // Number of times we had to reseek inside an iteration to skip
+  // over large number of keys with same userkey.
+  NUMBER_OF_RESEEKS_IN_ITERATION,
+
+  // Record the number of calls to GetUpdatesSince. Useful to keep track of
+  // transaction log iterator refreshes
+  GET_UPDATES_SINCE_CALLS,
+  BLOCK_CACHE_COMPRESSED_MISS,  // miss in the compressed block cache
+  BLOCK_CACHE_COMPRESSED_HIT,   // hit in the compressed block cache
+  // Number of blocks added to compressed block cache
+  BLOCK_CACHE_COMPRESSED_ADD,
+  // Number of failures when adding blocks to compressed block cache
+  BLOCK_CACHE_COMPRESSED_ADD_FAILURES,
+  WAL_FILE_SYNCED,  // Number of times WAL sync is done
+  WAL_FILE_BYTES,   // Number of bytes written to WAL
+
+  // Writes can be processed by requesting thread or by the thread at the
+  // head of the writers queue.
+  WRITE_DONE_BY_SELF,
+  WRITE_DONE_BY_OTHER,  // Equivalent to writes done for others
+  WRITE_TIMEDOUT,       // Number of writes ending up with timed-out.
+  WRITE_WITH_WAL,       // Number of Write calls that request WAL
+  COMPACT_READ_BYTES,   // Bytes read during compaction
+  COMPACT_WRITE_BYTES,  // Bytes written during compaction
+  FLUSH_WRITE_BYTES,    // Bytes written during flush
+
+  // Number of table's properties loaded directly from file, without creating
+  // table reader object.
+  NUMBER_DIRECT_LOAD_TABLE_PROPERTIES,
+  NUMBER_SUPERVERSION_ACQUIRES,
+  NUMBER_SUPERVERSION_RELEASES,
+  NUMBER_SUPERVERSION_CLEANUPS,
+  NUMBER_BLOCK_NOT_COMPRESSED,
+  MERGE_OPERATION_TOTAL_TIME,
+  FILTER_OPERATION_TOTAL_TIME,
+
+  // Row cache.
+  ROW_CACHE_HIT,
+  ROW_CACHE_MISS,
+
+  TICKER_ENUM_MAX
+};
+
+// Stat name for each ticker. Entries are kept in the same order as the Tickers
+// enum so the two lists can be checked against each other line by line.
+const std::vector<std::pair<Tickers, std::string> > TickersNameMap = {
+    {BLOCK_CACHE_MISS, "leveldb.block.cache.miss"},
+    {BLOCK_CACHE_HIT, "leveldb.block.cache.hit"},
+    {BLOCK_CACHE_ADD, "leveldb.block.cache.add"},
+    {BLOCK_CACHE_ADD_FAILURES, "leveldb.block.cache.add.failures"},
+    {BLOCK_CACHE_INDEX_MISS, "leveldb.block.cache.index.miss"},
+    {BLOCK_CACHE_INDEX_HIT, "leveldb.block.cache.index.hit"},
+    {BLOCK_CACHE_INDEX_BYTES_INSERT, "leveldb.block.cache.index.bytes.insert"},
+    {BLOCK_CACHE_INDEX_BYTES_EVICT, "leveldb.block.cache.index.bytes.evict"},
+    {BLOCK_CACHE_FILTER_MISS, "leveldb.block.cache.filter.miss"},
+    {BLOCK_CACHE_FILTER_HIT, "leveldb.block.cache.filter.hit"},
+    {BLOCK_CACHE_FILTER_BYTES_INSERT, "leveldb.block.cache.filter.bytes.insert"},
+    {BLOCK_CACHE_FILTER_BYTES_EVICT, "leveldb.block.cache.filter.bytes.evict"},
+    {BLOCK_CACHE_DATA_MISS, "leveldb.block.cache.data.miss"},
+    {BLOCK_CACHE_DATA_HIT, "leveldb.block.cache.data.hit"},
+    {BLOCK_CACHE_BYTES_READ, "leveldb.block.cache.bytes.read"},
+    {BLOCK_CACHE_BYTES_WRITE, "leveldb.block.cache.bytes.write"},
+    {BLOOM_FILTER_USEFUL, "leveldb.bloom.filter.useful"},
+    {MEMTABLE_HIT, "leveldb.memtable.hit"},
+    {MEMTABLE_MISS, "leveldb.memtable.miss"},
+    {GET_HIT_L0, "leveldb.l0.hit"},
+    {GET_HIT_L1, "leveldb.l1.hit"},
+    {GET_HIT_L2_AND_UP, "leveldb.l2andup.hit"},
+    {COMPACTION_KEY_DROP_NEWER_ENTRY, "leveldb.compaction.key.drop.new"},
+    {COMPACTION_KEY_DROP_OBSOLETE, "leveldb.compaction.key.drop.obsolete"},
+    {COMPACTION_KEY_DROP_USER, "leveldb.compaction.key.drop.user"},
+    {NUMBER_KEYS_WRITTEN, "leveldb.number.keys.written"},
+    {NUMBER_KEYS_READ, "leveldb.number.keys.read"},
+    {NUMBER_KEYS_UPDATED, "leveldb.number.keys.updated"},
+    {BYTES_WRITTEN, "leveldb.bytes.written"},
+    {BYTES_READ, "leveldb.bytes.read"},
+    {NUMBER_DB_SEEK, "leveldb.number.db.seek"},
+    {NUMBER_DB_NEXT, "leveldb.number.db.next"},
+    {NUMBER_DB_PREV, "leveldb.number.db.prev"},
+    {NUMBER_DB_SEEK_FOUND, "leveldb.number.db.seek.found"},
+    {NUMBER_DB_NEXT_FOUND, "leveldb.number.db.next.found"},
+    {NUMBER_DB_PREV_FOUND, "leveldb.number.db.prev.found"},
+    {ITER_BYTES_READ, "leveldb.db.iter.bytes.read"},
+    {NO_FILE_CLOSES, "leveldb.no.file.closes"},
+    {NO_FILE_OPENS, "leveldb.no.file.opens"},
+    {NO_FILE_ERRORS, "leveldb.no.file.errors"},
+    {STALL_L0_SLOWDOWN_MICROS, "leveldb.l0.slowdown.micros"},
+    {STALL_MEMTABLE_COMPACTION_MICROS, "leveldb.memtable.compaction.micros"},
+    {STALL_L0_NUM_FILES_MICROS, "leveldb.l0.num.files.stall.micros"},
+    {STALL_MICROS, "leveldb.stall.micros"},
+    {DB_MUTEX_WAIT_MICROS, "leveldb.db.mutex.wait.micros"},
+    {RATE_LIMIT_DELAY_MILLIS, "leveldb.rate.limit.delay.millis"},
+    {NO_ITERATORS, "leveldb.num.iterators"},
+    {NUMBER_MULTIGET_CALLS, "leveldb.number.multiget.get"},
+    {NUMBER_MULTIGET_KEYS_READ, "leveldb.number.multiget.keys.read"},
+    {NUMBER_MULTIGET_BYTES_READ, "leveldb.number.multiget.bytes.read"},
+    {NUMBER_FILTERED_DELETES, "leveldb.number.deletes.filtered"},
+    {NUMBER_MERGE_FAILURES, "leveldb.number.merge.failures"},
+    {SEQUENCE_NUMBER, "leveldb.sequence.number"},
+    {BLOOM_FILTER_PREFIX_CHECKED, "leveldb.bloom.filter.prefix.checked"},
+    {BLOOM_FILTER_PREFIX_USEFUL, "leveldb.bloom.filter.prefix.useful"},
+    {NUMBER_OF_RESEEKS_IN_ITERATION, "leveldb.number.reseeks.iteration"},
+    {GET_UPDATES_SINCE_CALLS, "leveldb.getupdatessince.calls"},
+    {BLOCK_CACHE_COMPRESSED_MISS, "leveldb.block.cachecompressed.miss"},
+    {BLOCK_CACHE_COMPRESSED_HIT, "leveldb.block.cachecompressed.hit"},
+    {BLOCK_CACHE_COMPRESSED_ADD, "leveldb.block.cachecompressed.add"},
+    {BLOCK_CACHE_COMPRESSED_ADD_FAILURES,
+     "leveldb.block.cachecompressed.add.failures"},
+    {WAL_FILE_SYNCED, "leveldb.wal.synced"},
+    {WAL_FILE_BYTES, "leveldb.wal.bytes"},
+    {WRITE_DONE_BY_SELF, "leveldb.write.self"},
+    {WRITE_DONE_BY_OTHER, "leveldb.write.other"},
+    {WRITE_TIMEDOUT, "leveldb.write.timeout"},  // was missing from the map
+    {WRITE_WITH_WAL, "leveldb.write.wal"},
+    {COMPACT_READ_BYTES, "leveldb.compact.read.bytes"},
+    {COMPACT_WRITE_BYTES, "leveldb.compact.write.bytes"},
+    {FLUSH_WRITE_BYTES, "leveldb.flush.write.bytes"},
+    {NUMBER_DIRECT_LOAD_TABLE_PROPERTIES,
+     "leveldb.number.direct.load.table.properties"},
+    {NUMBER_SUPERVERSION_ACQUIRES, "leveldb.number.superversion_acquires"},
+    {NUMBER_SUPERVERSION_RELEASES, "leveldb.number.superversion_releases"},
+    {NUMBER_SUPERVERSION_CLEANUPS, "leveldb.number.superversion_cleanups"},
+    {NUMBER_BLOCK_NOT_COMPRESSED, "leveldb.number.block.not_compressed"},
+    {MERGE_OPERATION_TOTAL_TIME, "leveldb.merge.operation.time.nanos"},
+    {FILTER_OPERATION_TOTAL_TIME, "leveldb.filter.operation.time.nanos"},
+    {ROW_CACHE_HIT, "leveldb.row.cache.hit"},
+    {ROW_CACHE_MISS, "leveldb.row.cache.miss"},
+};
+
+/**
+ * Keep adding histograms here.
+ * Any histogram should have a value less than HISTOGRAM_ENUM_MAX.
+ * Add a new histogram by assigning it the current value of HISTOGRAM_ENUM_MAX,
+ * add a string representation in HistogramsNameMap below,
+ * and increment HISTOGRAM_ENUM_MAX.
+ */
+enum Histograms : uint32_t {
+  DB_GET = 0,
+  DB_WRITE,
+  COMPACTION_TIME,
+  SUBCOMPACTION_SETUP_TIME,
+  TABLE_SYNC_MICROS,
+  COMPACTION_OUTFILE_SYNC_MICROS,
+  WAL_FILE_SYNC_MICROS,
+  MANIFEST_FILE_SYNC_MICROS,
+  // Time spent in IO during table open
+  TABLE_OPEN_IO_MICROS,
+  DB_MULTIGET,
+  READ_BLOCK_COMPACTION_MICROS,
+  READ_BLOCK_GET_MICROS,
+  WRITE_RAW_BLOCK_MICROS,
+  STALL_L0_SLOWDOWN_COUNT,
+  STALL_MEMTABLE_COMPACTION_COUNT,
+  STALL_L0_NUM_FILES_COUNT,
+  HARD_RATE_LIMIT_DELAY_COUNT,
+  SOFT_RATE_LIMIT_DELAY_COUNT,
+  NUM_FILES_IN_SINGLE_COMPACTION,
+  DB_SEEK,
+  WRITE_STALL,
+  SST_READ_MICROS,
+  // The number of subcompactions actually scheduled during a compaction
+  NUM_SUBCOMPACTIONS_SCHEDULED,
+  // Value size distribution in each operation
+  BYTES_PER_READ,
+  BYTES_PER_WRITE,
+  BYTES_PER_MULTIGET,
+  // Tera block cache specific histograms (pread stages)
+  TERA_BLOCK_CACHE_PREAD_QUEUE,
+  TERA_BLOCK_CACHE_PREAD_SSD_READ,
+  TERA_BLOCK_CACHE_PREAD_FILL_USER_DATA,
+  TERA_BLOCK_CACHE_PREAD_RELEASE_BLOCK,
+  HISTOGRAM_ENUM_MAX,  // TODO(ldemailly): enforce HistogramsNameMap match
+};
+
+const std::vector<std::pair<Histograms, std::string> > HistogramsNameMap = {
+    {DB_GET, "leveldb.db.get.micros"},
+    {DB_WRITE, "leveldb.db.write.micros"},
+    {COMPACTION_TIME, "leveldb.compaction.times.micros"},
+    {SUBCOMPACTION_SETUP_TIME, "leveldb.subcompaction.setup.times.micros"},
+    {TABLE_SYNC_MICROS, "leveldb.table.sync.micros"},
+    {COMPACTION_OUTFILE_SYNC_MICROS, "leveldb.compaction.outfile.sync.micros"},
+    {WAL_FILE_SYNC_MICROS, "leveldb.wal.file.sync.micros"},
+    {MANIFEST_FILE_SYNC_MICROS, "leveldb.manifest.file.sync.micros"},
+    {TABLE_OPEN_IO_MICROS, "leveldb.table.open.io.micros"},
+    {DB_MULTIGET, "leveldb.db.multiget.micros"},
+    {READ_BLOCK_COMPACTION_MICROS, "leveldb.read.block.compaction.micros"},
+    {READ_BLOCK_GET_MICROS, "leveldb.read.block.get.micros"},
+    {WRITE_RAW_BLOCK_MICROS, "leveldb.write.raw.block.micros"},
+    {STALL_L0_SLOWDOWN_COUNT, "leveldb.l0.slowdown.count"},
+    {STALL_MEMTABLE_COMPACTION_COUNT, "leveldb.memtable.compaction.count"},
+    {STALL_L0_NUM_FILES_COUNT, "leveldb.num.files.stall.count"},
+    {HARD_RATE_LIMIT_DELAY_COUNT, "leveldb.hard.rate.limit.delay.count"},
+    {SOFT_RATE_LIMIT_DELAY_COUNT, "leveldb.soft.rate.limit.delay.count"},
+    {NUM_FILES_IN_SINGLE_COMPACTION, "leveldb.numfiles.in.singlecompaction"},
+    {DB_SEEK, "leveldb.db.seek.micros"},
+    {WRITE_STALL, "leveldb.db.write.stall"},
+    {SST_READ_MICROS, "leveldb.sst.read.micros"},
+    {NUM_SUBCOMPACTIONS_SCHEDULED, "leveldb.num.subcompactions.scheduled"},
+    {BYTES_PER_READ, "leveldb.bytes.per.read"},
+    {BYTES_PER_WRITE, "leveldb.bytes.per.write"},
+    {BYTES_PER_MULTIGET, "leveldb.bytes.per.multiget"},
+    {TERA_BLOCK_CACHE_PREAD_QUEUE, "tera.block_cache.pread_queue"},  // tera entries use the "tera." prefix
+    {TERA_BLOCK_CACHE_PREAD_SSD_READ, "tera.block_cache.pread_ssd_read"},
+    {TERA_BLOCK_CACHE_PREAD_FILL_USER_DATA, "tera.block_cache.pread_fill_user_data"},
+    {TERA_BLOCK_CACHE_PREAD_RELEASE_BLOCK, "tera.block_cache.pread_release_block"},
+};  // one name per Histograms value, listed in enum order
+
+struct HistogramData {  // summary filled in through Statistics::GetHistogramData
+  double median; // median value
+  double percentile95;
+  double percentile99; // 99th percentile
+  double average;
+  double standard_deviation;
+};
+
+// Interface for recording and querying the tickers and histograms of a db.
+class Statistics {
+ public:
+  virtual ~Statistics() {}
+
+  virtual int64_t GetTickerCount(uint32_t ticker_type) = 0;  // ticker_type: presumably a Tickers value
+  virtual void RecordTick(uint32_t ticker_type, uint64_t count = 0) = 0;
+  virtual void SetTickerCount(uint32_t ticker_type, uint64_t count) = 0;
+
+  virtual void GetHistogramData(uint32_t type,
+                                HistogramData* const data) = 0;
+  virtual std::string GetBriefHistogramString(uint32_t type) { return ""; }  // default: empty; non-const
+  virtual std::string GetHistogramString(uint32_t type) const { return ""; }  // default: empty
+  virtual void MeasureTime(uint32_t histogram_type, uint64_t time) = 0;
+  virtual void ClearHistogram(uint32_t type) = 0;
+
+  // String representation of the statistics object.
+  virtual std::string ToString() {
+    // The base class only returns a placeholder; implementations may override.
+    return std::string("ToString(): not implemented");
+  }
+  virtual void ClearAll() = 0;
+};
+
+// Create a concrete DBStatistics object
+Statistics* CreateDBStatistics();
+
+}  // namespace leveldb
+
+#endif  // STORAGE_LEVELDB_INCLUDE_STATISTICS_H_
diff --git a/src/leveldb/util/block_cache.cc b/src/leveldb/util/block_cache.cc
index 603ca1f0a..14722f98a 100644
--- a/src/leveldb/util/block_cache.cc
+++ b/src/leveldb/util/block_cache.cc
@@ -4,6 +4,7 @@
 
 #include "leveldb/block_cache.h"
 
+#include <aio.h>
 #include <errno.h>
 #include <fcntl.h>
 #include <stdio.h>
@@ -20,6 +21,7 @@
 #include "leveldb/env.h"
 #include "leveldb/iterator.h"
 #include "leveldb/options.h"
+#include "leveldb/statistics.h"
 #include "leveldb/status.h"
 #include "leveldb/table_utils.h"
 #include "leveldb/write_batch.h"
@@ -174,6 +176,10 @@ class BlockCacheImpl {
                                RandomAccessFile** result); // cache Pread
     static void BlockDeleter(const Slice& key, void* v);
 
+    static void BGControlThreadFunc(void* arg);
+
+    Status DeleteFile(const std::string& fname);
+
 private:
     friend struct DataSet;
     struct LockContent;
@@ -182,7 +188,7 @@ class BlockCacheImpl {
 
     Status FillCache(CacheBlock* block);
 
-    Status ReadCache(CacheBlock* block);
+    Status ReadCache(CacheBlock* block, struct aiocb* aio_context);
 
     uint64_t AllocFileId(); // no more than fid_batch_num
 
@@ -196,6 +202,8 @@ class BlockCacheImpl {
 
     Status ReleaseBlock(CacheBlock* block, bool need_sync);
 
+    void BGControlThread();
+
 private:
     friend class BlockCacheWritableFile;
     friend class BlockCacheRandomAccessFile;
@@ -223,6 +231,7 @@ class BlockCacheImpl {
     enum LockKeyType {
         kDBKey = 0,
         kDataSetKey = 1,
+        kDeleteDBKey = 2,
     };
     struct LockContent {
         int type;
@@ -237,7 +246,7 @@ class BlockCacheImpl {
         DataSet* data_set;
 
         const std::string Encode() {
-            if (type == kDBKey) {
+            if (type == kDBKey || type == kDeleteDBKey) {
                 return db_lock_key.ToString();
             } else if (type == kDataSetKey) {
                 std::string key = "DS#";
@@ -248,7 +257,7 @@ class BlockCacheImpl {
         }
 
         const std::string KeyToString() {
-            if (type == kDBKey) {
+            if (type == kDBKey || type == kDeleteDBKey) {
                 return db_lock_key.ToString();
             } else if (type == kDataSetKey) {
                 std::stringstream ss;
@@ -272,12 +281,14 @@ class BlockCacheImpl {
     typedef std::map<uint64_t, DataSet*> DataSetMap;
     DataSetMap data_set_map_;
 
+    Statistics* stat_;
     //WritableFile* logfile_;
     //log::Writer* log_;
     DB* db_; // store meta
     ThreadPool bg_fill_;
     ThreadPool bg_read_;
     ThreadPool bg_flush_;
+    ThreadPool bg_control_;
 };
 
 // Must insure not init more than twice
@@ -302,6 +313,11 @@ Status BlockCacheEnv::GetChildren(const std::string& path,
 }
 
 Status BlockCacheEnv::DeleteFile(const std::string& fname) {
+    // notify the cache shard of ".sst" deletes; size guard keeps size()-4 from wrapping to npos
+    if (fname.size() >= 4 && fname.compare(fname.size() - 4, 4, ".sst") == 0) {
+        uint32_t hash = (Hash(fname.c_str(), fname.size(), 13)) % cache_vec_.size(); // same shard pick as NewRandomAccessFile
+        cache_vec_[hash]->DeleteFile(fname); // status not checked; the dfs result below is what callers get
+    }
     return dfs_env_->DeleteFile(fname);
 }
 
@@ -371,8 +387,8 @@ Status BlockCacheEnv::NewRandomAccessFile(const std::string& fname,
     uint32_t hash = (Hash(fname.c_str(), fname.size(), 13)) % cache_vec_.size();
     BlockCacheImpl* cache = cache_vec_[hash];
     Status s = cache->NewRandomAccessFile(fname, result);
-    Log("[block_cache %s] open file read: %s, hash: %u, status: %s\n",
-        cache->WorkPath().c_str(), fname.c_str(), hash, s.ToString().c_str());
+    //Log("[block_cache %s] open file read: %s, hash: %u, status: %s\n",
+    //    cache->WorkPath().c_str(), fname.c_str(), hash, s.ToString().c_str());
     return s;
 }
 
@@ -663,14 +679,15 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
     : cache_(c),
       fname_(fname) {
         *s = cache_->dfs_env_->NewRandomAccessFile(fname_, &dfs_file_);
-        Log("[%s] dfs open for read: %s, block_size: %lu, status: %s\n",
-            cache_->WorkPath().c_str(),
-            fname.c_str(),
-            cache_->options_.block_size,
-            s->ToString().c_str());
+        //Log("[%s] dfs open for read: %s, block_size: %lu, status: %s\n",
+        //    cache_->WorkPath().c_str(),
+        //    fname.c_str(),
+        //    cache_->options_.block_size,
+        //    s->ToString().c_str());
 
         MutexLock lockgard(&cache_->mu_);
         fid_ = cache_->FileId(fname_);
+        aio_enabled_ = true;
         return;
     }
 
@@ -691,11 +708,12 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
         std::vector<CacheBlock*> c_valid;
         std::vector<CacheBlock*> block_queue;
 
-        Log("[%s] Begin Pread %s, size %lu, offset %lu, fid %lu, start_block %lu, end_block %lu"
-            ", block_size %lu\n",
-            cache_->WorkPath().c_str(), fname_.c_str(), n, offset, fid,
-            begin, end, cache_->options_.block_size);
+        //Log("[%s] Begin Pread %s, size %lu, offset %lu, fid %lu, start_block %lu, end_block %lu"
+        //    ", block_size %lu\n",
+        //    cache_->WorkPath().c_str(), fname_.c_str(), n, offset, fid,
+        //    begin, end, cache_->options_.block_size);
 
+        uint64_t start_ts = cache_->options_.cache_env->NowMicros();
         MutexLock lockgard(&cache_->mu_);
         for (uint64_t block_idx = begin; block_idx <= end; ++block_idx) {
             CacheBlock* block = NULL;
@@ -721,12 +739,14 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
                 c_locked.push_back(block);
             }
 
-            Log("[%s] Queue block: %s, refs %u, data_block_refs %lu, alloc %u\n",
-                cache_->WorkPath().c_str(), block->ToString().c_str(),
-                block->handle->refs, block->data_block_refs,
-                block->data_block_alloc);
+            //Log("[%s] Queue block: %s, refs %u, data_block_refs %lu, alloc %u\n",
+            //    cache_->WorkPath().c_str(), block->ToString().c_str(),
+            //    block->handle->refs, block->data_block_refs,
+            //    block->data_block_alloc);
         }
         cache_->mu_.Unlock();
+        uint64_t queue_ts = cache_->options_.cache_env->NowMicros();
+        cache_->stat_->MeasureTime(TERA_BLOCK_CACHE_PREAD_QUEUE, queue_ts - start_ts);
 
         // async read miss data
         for (uint32_t i = 0; i < c_miss.size(); ++i) {
@@ -739,6 +759,7 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
             //    block->ToString().c_str());
             cache_->bg_read_.Schedule(&BlockCacheRandomAccessFile::AsyncDfsRead, reader, 10);
         }
+        //uint64_t miss_read_sched_ts = cache_->options_.cache_env->NowMicros();
 
         // async read valid data
         for (uint32_t i = 0; i < c_valid.size(); ++i) {
@@ -749,8 +770,13 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
             //Log("[%s] pread in valid list, %s\n",
             //    cache_->WorkPath().c_str(),
             //    block->ToString().c_str());
-            cache_->bg_read_.Schedule(&BlockCacheRandomAccessFile::AsyncCacheRead, reader, 10);
+            if (aio_enabled_) {
+                AioCacheRead(reader);
+            } else {
+                cache_->bg_read_.Schedule(&BlockCacheRandomAccessFile::AsyncCacheRead, reader, 10);
+            }
         }
+        //uint64_t ssd_read_sched_ts = cache_->options_.cache_env->NowMicros();
 
         // wait async cache read done
         for (uint32_t i = 0; i < c_valid.size(); ++i) {
@@ -763,10 +789,12 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
             }
             block->Clear(kCacheBlockLocked);
             block->cv.SignalAll();
-            Log("[%s] cache read done, %s\n",
-                cache_->WorkPath().c_str(),
-                block->ToString().c_str());
+            //Log("[%s] cache read done, %s\n",
+            //    cache_->WorkPath().c_str(),
+            //    block->ToString().c_str());
         }
+        uint64_t ssd_read_ts = cache_->options_.cache_env->NowMicros();
+        cache_->stat_->MeasureTime(TERA_BLOCK_CACHE_PREAD_SSD_READ, ssd_read_ts - queue_ts);
 
         // wait dfs read done and async cache file
         for (uint32_t i = 0; i < c_miss.size(); ++i) {
@@ -781,6 +809,7 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
                 cache_->WorkPath().c_str(),
                 block->ToString().c_str());
         }
+        //uint64_t dfs_read_ts = cache_->options_.cache_env->NowMicros();
 
         for (uint32_t i = 0; i < c_miss.size(); ++i) {
             CacheBlock* block = c_miss[i];
@@ -792,6 +821,7 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
             //    block->ToString().c_str());
             cache_->bg_fill_.Schedule(&BlockCacheRandomAccessFile::AsyncCacheWrite, writer, 10);
         }
+        //uint64_t ssd_write_sched_ts = cache_->options_.cache_env->NowMicros();
 
         for (uint32_t i = 0; i < c_miss.size(); ++i) { // wait cache fill finish
             MutexLock lockgard(&cache_->mu_);
@@ -804,20 +834,22 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
             }
             block->Clear(kCacheBlockLocked);
             block->cv.SignalAll();
-            Log("[%s] cache fill done, %s\n",
-                cache_->WorkPath().c_str(),
-                block->ToString().c_str());
+            //Log("[%s] cache fill done, %s\n",
+            //    cache_->WorkPath().c_str(),
+            //    block->ToString().c_str());
         }
+        //uint64_t ssd_write_ts = cache_->options_.cache_env->NowMicros();
 
         // wait other async read finish
         for (uint32_t i = 0; i < c_locked.size(); ++i) {
             MutexLock lockgard(&cache_->mu_);
             CacheBlock* block = c_locked[i];
             block->WaitOnClear(kCacheBlockLocked);
-            Log("[%s] wait locked done, %s\n",
-                cache_->WorkPath().c_str(),
-                block->ToString().c_str());
+            //Log("[%s] wait locked done, %s\n",
+            //    cache_->WorkPath().c_str(),
+            //    block->ToString().c_str());
         }
+        uint64_t wait_unlock_ts = cache_->options_.cache_env->NowMicros();
 
         // fill user mem
         size_t msize = 0;
@@ -832,17 +864,19 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
             }
             memcpy(scratch + msize, data_block.data(), data_block.size());
             msize += data_block.size();
-            Log("[%s] Fill user data, %s, fill_offset %lu, fill_size %lu, prefix %lu, suffix %lu, msize %lu, offset %lu\n",
-                cache_->WorkPath().c_str(), fname_.c_str(),
-                block_idx * cache_->options_.block_size + (block_idx == begin ? offset % cache_->options_.block_size: 0),
-                data_block.size(),
-                block_idx == begin ? offset % cache_->options_.block_size: 0,
-                block_idx == end ? cache_->options_.block_size - (n + offset) % cache_->options_.block_size
-                                 : cache_->options_.block_size,
-                msize, offset);
+            //Log("[%s] Fill user data, %s, fill_offset %lu, fill_size %lu, prefix %lu, suffix %lu, msize %lu, offset %lu\n",
+            //    cache_->WorkPath().c_str(), fname_.c_str(),
+            //    block_idx * cache_->options_.block_size + (block_idx == begin ? offset % cache_->options_.block_size: 0),
+            //    data_block.size(),
+            //    block_idx == begin ? offset % cache_->options_.block_size: 0,
+            //    block_idx == end ? cache_->options_.block_size - (n + offset) % cache_->options_.block_size
+            //                     : cache_->options_.block_size,
+            //    msize, offset);
         }
         assert(msize == n);
         *result = Slice(scratch, n);
+        uint64_t fill_user_data_ts = cache_->options_.cache_env->NowMicros();
+        cache_->stat_->MeasureTime(TERA_BLOCK_CACHE_PREAD_FILL_USER_DATA, fill_user_data_ts - wait_unlock_ts);
 
         cache_->mu_.Lock();
         for (uint32_t i = 0; i < c_miss.size(); ++i) {
@@ -860,6 +894,8 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
             //Log("[%s] wakeup for lock, %s\n", cache_->WorkPath().c_str(), block->ToString().c_str());
             cache_->ReleaseBlock(block, false);
         }
+        uint64_t release_cache_block_ts = cache_->options_.cache_env->NowMicros();
+        cache_->stat_->MeasureTime(TERA_BLOCK_CACHE_PREAD_RELEASE_BLOCK, release_cache_block_ts - fill_user_data_ts);
 
         if (!s.ok()) {
             s = dfs_file_->Read(offset, n, result, scratch);
@@ -868,11 +904,11 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
                 offset, n, s.ToString().c_str());
         }
 
-        Log("[%s] End Pread %s, size %lu, offset %lu, fid %lu, res %lu, status %s, start_block %lu, end_block %lu"
-            ", block_size %lu\n",
-            cache_->WorkPath().c_str(), fname_.c_str(), n, offset, fid,
-            result->size(), s.ToString().c_str(),
-            begin, end, cache_->options_.block_size);
+        //Log("[%s] Done Pread %s, size %lu, offset %lu, fid %lu, res %lu, status %s, start_block %lu, end_block %lu"
+        //    ", block_size %lu\n",
+        //    cache_->WorkPath().c_str(), fname_.c_str(), n, offset, fid,
+        //    result->size(), s.ToString().c_str(),
+        //    begin, end, cache_->options_.block_size);
         return s;
     }
 
@@ -895,11 +931,13 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
         uint64_t offset = block->block_idx * cache_->options_.block_size;
         size_t n = cache_->options_.block_size;
         block->s = dfs_file_->Read(offset, n, &result, scratch);
-        Log("[%s] dfs read, %s"
-            ", offset %lu, size %lu, status %s, res %lu\n",
-            cache_->WorkPath().c_str(), block->ToString().c_str(),
-            offset, n,
-            block->s.ToString().c_str(), result.size());
+        if (!block->s.ok()) {
+            Log("[%s] dfs read, %s"
+                ", offset %lu, size %lu, status %s, res %lu\n",
+                cache_->WorkPath().c_str(), block->ToString().c_str(),
+                offset, n,
+                block->s.ToString().c_str(), result.size());
+        }
 
         MutexLock lockgard(&cache_->mu_);
         block->Clear(kCacheBlockDfsRead);
@@ -910,23 +948,58 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
     struct AsyncCacheReader {
         BlockCacheRandomAccessFile* file;
         CacheBlock* block;
+
+        // aio spec
+        struct aiocb aio_context;
     };
+
+    // use the thread pool to run sync io in the background
     static void AsyncCacheRead(void* arg) {
         AsyncCacheReader* reader = (AsyncCacheReader*)arg;
         reader->file->HandleCacheRead(reader);
         delete reader;
-        return;
     }
     void HandleCacheRead(AsyncCacheReader* reader) {
         CacheBlock* block = reader->block;
-        block->s = cache_->ReadCache(block);
+        block->s = cache_->ReadCache(block, NULL);
 
         MutexLock lockgard(&cache_->mu_);
         block->Clear(kCacheBlockCacheRead);
         block->cv.SignalAll();
         //Log("[%s] async.cacheread signal, %s\n", cache_->WorkPath().c_str(),
         //    block->ToString().c_str());
-        return;
+    }
+
+    // aio engine: completion hook registered through SIGEV_THREAD in AioCacheRead
+    static void AioCacheReadCallback(sigval_t sigval) { // invoked on a notification thread
+        AsyncCacheReader* const ctx = static_cast<AsyncCacheReader*>(sigval.sival_ptr);
+        ctx->file->HandleAioCacheReadCallback(ctx);
+        delete ctx; // the reader object is released here once the completion was handled
+    }
+    void HandleAioCacheReadCallback(AsyncCacheReader* reader) {
+        CacheBlock* block = reader->block;
+        // A failed read is a runtime condition: record it in block->s instead of assert()ing.
+        int err = aio_error(&reader->aio_context); // 0 once the request completed successfully
+        ssize_t res = (err == EINPROGRESS) ? -1 : aio_return(&reader->aio_context);
+        block->s = (err != 0 || res < 0) ? Status::Corruption("AioReadCache error") : Status::OK();
+
+        MutexLock lockgard(&cache_->mu_);
+        block->Clear(kCacheBlockCacheRead);
+        block->cv.SignalAll(); // wake whoever waits on this block's condition variable
+        if (!block->s.ok()) {
+            Log("[%s] aio.cacheread signal, %s\n", cache_->WorkPath().c_str(),
+                block->ToString().c_str());
+        }
+    }
+    void AioCacheRead(AsyncCacheReader* reader) const {
+        // zero the aiocb, then request thread-style completion notification carrying `reader`
+        memset((char*)(&reader->aio_context), 0, sizeof(struct aiocb));
+        reader->aio_context.aio_sigevent.sigev_notify = SIGEV_THREAD;
+        reader->aio_context.aio_sigevent.sigev_notify_function = &BlockCacheRandomAccessFile::AioCacheReadCallback;
+        reader->aio_context.aio_sigevent.sigev_notify_attributes = NULL;
+        reader->aio_context.aio_sigevent.sigev_value.sival_ptr = reader;
+        // NOTE(review): returned Status is dropped; if submission fails the callback presumably never runs -- confirm
+        cache_->ReadCache(reader->block, &reader->aio_context);
     }
 
     struct AsyncCacheWriter {
@@ -960,6 +1033,7 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
     RandomAccessFile* dfs_file_;
     std::string fname_;
     uint64_t fid_;
+    bool aio_enabled_;
 };
 
 // Tcache impl
@@ -972,9 +1046,35 @@ BlockCacheImpl::BlockCacheImpl(const BlockCacheOptions& options)
     bg_fill_.SetBackgroundThreads(30);
     bg_read_.SetBackgroundThreads(30);
     bg_flush_.SetBackgroundThreads(30);
+    bg_control_.SetBackgroundThreads(2);
+    stat_ = CreateDBStatistics();
 }
 
-BlockCacheImpl::~BlockCacheImpl() {}
+BlockCacheImpl::~BlockCacheImpl() {
+    delete stat_; // allocated with CreateDBStatistics() in the constructor
+}
+
+void BlockCacheImpl::BGControlThreadFunc(void* arg) {
+    static_cast<BlockCacheImpl*>(arg)->BGControlThread(); // arg is the cache handed to Schedule()
+}
+
+void BlockCacheImpl::BGControlThread() {
+    // Pread-stage histograms: each round logs all of them, resets them, then re-arms itself.
+    static const uint32_t kPreadHistograms[] = {
+        TERA_BLOCK_CACHE_PREAD_QUEUE, TERA_BLOCK_CACHE_PREAD_SSD_READ,
+        TERA_BLOCK_CACHE_PREAD_FILL_USER_DATA, TERA_BLOCK_CACHE_PREAD_RELEASE_BLOCK,
+    };
+    const size_t kNumHistograms = sizeof(kPreadHistograms) / sizeof(kPreadHistograms[0]);
+    for (size_t idx = 0; idx < kNumHistograms; ++idx) { // report everything before any reset
+        Log("[%s] statistics: %s", this->WorkPath().c_str(),
+            stat_->GetBriefHistogramString(kPreadHistograms[idx]).c_str());
+    }
+    for (size_t idx = 0; idx < kNumHistograms; ++idx) {
+        stat_->ClearHistogram(kPreadHistograms[idx]);
+    }
+    // re-arm with the same priority/delay arguments used for the first Schedule in LoadCache
+    bg_control_.Schedule(&BlockCacheImpl::BGControlThreadFunc, this, 10, 10000);
+}
 
 Status BlockCacheImpl::NewWritableFile(const std::string& fname,
                                        WritableFile** result) {
@@ -1072,9 +1172,15 @@ Status BlockCacheImpl::LockAndPut(LockContent& lc) {
                 lc.KeyToString().c_str(),
                 lc.ValToString().c_str(),
                 s.ToString().c_str());
+        } else if (lc.type == kDeleteDBKey) {
+            WriteOptions w_opts;
+            s = db_->Delete(w_opts, key);
+            Log("[%s] Delete db key : %s, val %s, status %s\n",
+                this->WorkPath().c_str(),
+                lc.KeyToString().c_str(),
+                lc.ValToString().c_str(),
+                s.ToString().c_str());
         } else if (lc.type == kDataSetKey) {
-            std::string end_ds = "DS#";
-            PutFixed64(&end_ds, lc.sid + 1);
             lc.data_set = new DataSet;
             lc.data_set->cache = New2QCache((options_.dataset_size / options_.block_size) + 1);// number of blocks in DS
             std::string file = options_.cache_dir + "/" + Uint64ToString(lc.sid);
@@ -1090,26 +1196,32 @@ Status BlockCacheImpl::LockAndPut(LockContent& lc) {
             ReadOptions s_opts;
             leveldb::Iterator* db_it = db_->NewIterator(s_opts);
             for (db_it->Seek(key);
-                 db_it->Valid() && db_it->key().ToString() < end_ds;
+                 db_it->Valid() && db_it->key().starts_with("DS#");
                  db_it->Next()) {
                 Slice lkey = db_it->key();
-                lkey.remove_prefix(3 + sizeof(uint64_t));// remove DS#sid
+                uint64_t sid, cbi;
+                lkey.remove_prefix(3);// lkey = DS#, sid, cbi
+                sid = DecodeFixed64(lkey.data());
+                lkey.remove_prefix(sizeof(uint64_t));
+                cbi = DecodeFixed64(lkey.data());
                 //Slice lval = db_it->value();
+                if (sid != lc.sid) {
+                    break;
+                }
 
                 CacheBlock* block = new CacheBlock(&mu_);
                 block->DecodeFrom(db_it->value()); // get fid and block_idx
                 std::string hkey;
                 PutFixed64(&hkey, block->fid);
                 PutFixed64(&hkey, block->block_idx);
-                block->sid = lc.sid;
-                block->cache_block_idx = DecodeFixed64(lkey.data());
+                block->sid = sid;
+                block->cache_block_idx = cbi;
                 block->state = (block->Test(kCacheBlockValid)) ? kCacheBlockValid : 0;
                 Log("[%s] Recovery %s, insert cacheblock into 2QLru, %s\n",
                     this->WorkPath().c_str(),
                     lc.KeyToString().c_str(),
                     block->ToString().c_str());
                 LRUHandle* handle = (LRUHandle*)(lc.data_set->cache->Insert(hkey, block, 1, &BlockCacheImpl::BlockDeleter));
-                assert(handle != NULL);
                 handle->cache_id = block->cache_block_idx;
                 block->handle = handle;
                 lc.data_set->cache->Release((Cache::Handle*)handle);
@@ -1173,6 +1285,8 @@ Status BlockCacheImpl::LoadCache() {
     new_fid_ = prev_fid_ + options_.fid_batch_num;
     Log("[block_cache %s]: reuse block cache: prev_fid: %lu, new_fid: %lu\n",
         dbname.c_str(), prev_fid_, new_fid_);
+
+    bg_control_.Schedule(&BlockCacheImpl::BGControlThreadFunc, this, 10, 10000);
     s = Status::OK();
     return s;
 }
@@ -1200,7 +1314,7 @@ Status BlockCacheImpl::FillCache(CacheBlock* block) {
     return Status::OK();
 }
 
-Status BlockCacheImpl::ReadCache(CacheBlock* block) {
+Status BlockCacheImpl::ReadCache(CacheBlock* block, struct aiocb* aio_context) {
     MutexLock l(&mu_);
     uint64_t sid = block->sid;
     uint64_t cache_block_idx = block->cache_block_idx;
@@ -1208,13 +1322,25 @@ Status BlockCacheImpl::ReadCache(CacheBlock* block) {
     mu_.Unlock();
 
     // do io without lock
-    ssize_t res = pread(fd, (char*)block->data_block.data(), block->data_block.size(),
-                         cache_block_idx * options_.block_size);
-    Log("[%s] cache read: sid %lu, dataset.fd %d, datablock size %lu, cb_idx %lu, %s, res %ld\n",
-        this->WorkPath().c_str(), sid, fd, block->data_block.size(),
-        cache_block_idx,
-        block->ToString().c_str(),
-        res);
+    ssize_t res = 0;
+    if (aio_context != NULL) { // support aio engine
+        aio_context->aio_fildes = fd;
+        aio_context->aio_buf = (char*)block->data_block.data();
+        aio_context->aio_nbytes = block->data_block.size();
+        aio_context->aio_offset = cache_block_idx * options_.block_size;
+        res = aio_read(aio_context);
+    } else {
+        res = pread(fd, (char*)block->data_block.data(), block->data_block.size(),
+                    cache_block_idx * options_.block_size);
+    }
+
+    if (res < 0) {
+        Log("[%s] cache read: sid %lu, dataset.fd %d, datablock size %lu, cb_idx %lu, %s, res %ld\n",
+            this->WorkPath().c_str(), sid, fd, block->data_block.size(),
+            cache_block_idx,
+            block->ToString().c_str(),
+            res);
+    }
 
     mu_.Lock();
     if (res < 0) {
@@ -1279,14 +1405,31 @@ uint64_t BlockCacheImpl::FileId(const std::string& fname) {
     } else { // fid in cache
         fid = DecodeFixed64(val.c_str());
     }
-    Log("[%s] Fid: %lu, fname: %s\n",
-        this->WorkPath().c_str(),
-        fid, fname.c_str());
+    //Log("[%s] Fid: %lu, fname: %s\n",
+    //    this->WorkPath().c_str(),
+    //    fid, fname.c_str());
 
     mu_.Lock();
     return fid;
 }
 
+// Issues a kDeleteDBKey request for the key "FNAME#" + fname through
+// LockAndPut() and returns the Status that call reports.
+// fname: file name whose "FNAME#"-prefixed key the request targets.
+// mu_ is taken here and held for the duration of LockAndPut().
+// NOTE(review): presumably this removes the fname -> fid record that
+// FileId() looks up -- confirm against LockAndPut()'s kDeleteDBKey path.
+Status BlockCacheImpl::DeleteFile(const std::string& fname) {
+    // Named local: must outlive lc in case db_lock_key only references it.
+    std::string key = "FNAME#" + fname;
+    MutexLock l(&mu_);
+    LockContent lc;
+    lc.type = kDeleteDBKey;
+    lc.db_lock_key = key;
+    Status s = LockAndPut(lc);
+    return s;
+}
+
 DataSet* BlockCacheImpl::GetDataSet(uint64_t sid) {
     mu_.AssertHeld();
     DataSet* set = NULL;
@@ -1368,7 +1511,7 @@ Status BlockCacheImpl::ReleaseBlock(CacheBlock* block, bool need_sync) {
     LRUHandle* h = block->handle;
     DataSet* ds = GetDataSet(block->sid); // get and alloc ds
     block->ReleaseDataBlock();
-    Log("[%s] release block: %s\n", this->WorkPath().c_str(), block->ToString().c_str());
+    //Log("[%s] release block: %s\n", this->WorkPath().c_str(), block->ToString().c_str());
     block->s = Status::OK(); // clear io status
     block->cv.SignalAll();
     ds->cache->Release((Cache::Handle*)h);
diff --git a/src/leveldb/util/cache.cc b/src/leveldb/util/cache.cc
index 99c2dfa90..97e070bf3 100644
--- a/src/leveldb/util/cache.cc
+++ b/src/leveldb/util/cache.cc
@@ -274,17 +274,15 @@ class LRU2QCache: public Cache {
   ~LRU2QCache() {}
 
   // Like Cache methods, but with an extra "hash" parameter.
-  Cache::Handle* Insert(const Slice& key, void* value, size_t charge,
-                         void (*deleter)(const Slice& key, void* value)) {
+  Cache::Handle* Insert(const Slice& key, void* value, size_t cache_id,
+                        void (*deleter)(const Slice& key, void* value)) {
     const uint32_t hash = HashSlice(key);
     MutexLock l(&mutex_);
     LRUHandle* e = NULL;
     e = table_.Lookup(key, hash);
-    if (e != NULL) {
-      return reinterpret_cast<Cache::Handle*>(NULL);
-    }
+    assert(e == NULL);
 
-    if (usage_ < capacity_) { // cache full
+    if (usage_ < capacity_) { // cache not full
       e = reinterpret_cast<LRUHandle*>(
           malloc(sizeof(LRUHandle)-1 + key.size()));
       e->value = value;
@@ -296,8 +294,8 @@ class LRU2QCache: public Cache {
       e->cache_id = usage_;
       memcpy(e->key_data, key.data(), key.size());
 
-      assert(table_.Insert(e) == NULL);
       LRU_Append(e);
+      if (table_.Insert(e) != NULL) { assert(false); }  // Insert() kept outside assert(): NDEBUG strips assert's argument
       usage_++;
       return reinterpret_cast<Cache::Handle*>(e);
     }
@@ -324,12 +322,11 @@ class LRU2QCache: public Cache {
       table_.Remove(old->key(), old->hash);
       Unref(old);
 
-      assert(table_.Insert(e) == NULL);
       LRU_Append(e);
+      if (table_.Insert(e) != NULL) { assert(false); }  // Insert() kept outside assert(): NDEBUG strips assert's argument
       return reinterpret_cast<Cache::Handle*>(e);
     }
-    // TODO: try wait finish
-    return reinterpret_cast<Cache::Handle*>(NULL);
+    return NULL;
   }
 
   Cache::Handle* Lookup(const Slice& key) {
diff --git a/src/leveldb/util/statistics.cc b/src/leveldb/util/statistics.cc
new file mode 100644
index 000000000..ee383a055
--- /dev/null
+++ b/src/leveldb/util/statistics.cc
@@ -0,0 +1,115 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "leveldb/statistics.h"
+#include <assert.h>
+#include "util/histogram.h"
+#include "../utils/counter.h"
+
+namespace leveldb {
+
+class StatisticsImpl : public Statistics {
+public:
+  StatisticsImpl() {}
+
+  ~StatisticsImpl() {}
+
+  virtual int64_t GetTickerCount(uint32_t ticker_type) {
+    return counter_[ticker_type].Get();
+  }
+
+  virtual void RecordTick(uint32_t ticker_type, uint64_t count = 0) {
+    counter_[ticker_type].Add(count);
+  }
+
+  virtual void SetTickerCount(uint32_t ticker_type, uint64_t count) {
+    counter_[ticker_type].Set(count);
+  }
+
+  virtual void MeasureTime(uint32_t type, uint64_t time) {
+    hist_[type].Add(time);
+  }
+
+  virtual void GetHistogramData(uint32_t type,
+                                HistogramData* const data) {
+    data->median = hist_[type].Median();
+    data->percentile95 = hist_[type].Percentile(95);
+    data->percentile99 = hist_[type].Percentile(99);
+    data->average = hist_[type].Average();
+    data->standard_deviation = hist_[type].StandardDeviation();
+  }
+
+  virtual std::string GetHistogramString(uint32_t type) const {
+    return hist_[type].ToString();
+  }
+
+  virtual std::string GetBriefHistogramString(uint32_t type) {
+    assert(HistogramsNameMap[type].first == type);
+
+    std::string res;
+    char buffer[200];
+    HistogramData hData;
+    GetHistogramData(type, &hData);
+    snprintf(buffer,
+            200,
+            "%s statistics Percentiles :=> 50 : %f 95 : %f 99 : %f\n",
+            HistogramsNameMap[type].second.c_str(),
+            hData.median,
+            hData.percentile95,
+            hData.percentile99);
+    res.append(buffer);
+    res.shrink_to_fit();
+    return res;
+  }
+
+  void ClearHistogram(uint32_t type) {
+    hist_[type].Clear();
+  }
+
+  // String representation of the statistic object.
+  virtual std::string ToString() {
+    std::string res;
+    res.reserve(20000);
+    for (uint32_t i = 0; i < TickersNameMap.size(); i++) {
+      char buffer[200];
+      snprintf(buffer, 200, "%s COUNT : %lu\n",
+               TickersNameMap[i].second.c_str(), GetTickerCount(TickersNameMap[i].first));
+      res.append(buffer);
+    }
+    for (uint32_t i = 0; i < HistogramsNameMap.size(); i++) {
+      char buffer[200];
+      HistogramData hData;
+      GetHistogramData(HistogramsNameMap[i].first, &hData);
+      snprintf(buffer,
+               200,
+               "%s statistics Percentiles :=> 50 : %f 95 : %f 99 : %f\n",
+               HistogramsNameMap[i].second.c_str(),
+               hData.median,
+               hData.percentile95,
+               hData.percentile99);
+      res.append(buffer);
+    }
+    res.shrink_to_fit();
+    return res;
+  }
+
+  void ClearAll() {
+    for (uint32_t i = 0; i < TICKER_ENUM_MAX; i++) {
+      counter_[i].Clear();
+    }
+    for (uint32_t i = 0; i < HISTOGRAM_ENUM_MAX; i++) {
+      hist_[i].Clear();
+    }
+  }
+
+private:
+  tera::Counter counter_[TICKER_ENUM_MAX];
+  Histogram hist_[HISTOGRAM_ENUM_MAX];
+};
+
+Statistics* CreateDBStatistics() {
+  return new StatisticsImpl;
+}
+
+} // namespace leveldb
diff --git a/src/tera_flags.cc b/src/tera_flags.cc
index 6a0a14ce5..e4f77130a 100644
--- a/src/tera_flags.cc
+++ b/src/tera_flags.cc
@@ -64,7 +64,7 @@ DEFINE_int32(tera_leveldb_env_dfs_seek_latency, 10000000, "the random access lat
 DEFINE_int32(tera_memenv_table_cache_size, 100, "the max open file number in leveldb table_cache");
 DEFINE_int32(tera_memenv_block_cache_size, 10000, "block cache size for leveldb which do not use share block cache");
 DEFINE_bool(tera_use_flash_for_memenv, true, "Use flashenv for memery lg");
-DEFINE_int32(tera_leveldb_block_cache_env_num_thread, 30, "thread num of Tcache");
+DEFINE_int32(tera_leveldb_block_cache_env_thread_num, 30, "thread num of Tcache");
 
 DEFINE_string(tera_leveldb_compact_strategy, "default", "the default strategy to drive consum compaction, should be [default|LG|dummy]");
 DEFINE_bool(tera_leveldb_verify_checksums, true, "enable verify data read from storage against checksums");

From 716270e59fef753042882276b06fdf571f733cfd Mon Sep 17 00:00:00 2001
From: caijieming <caijieming@baidu.com>
Date: Tue, 22 Aug 2017 01:41:15 +0800
Subject: [PATCH 08/19] issue=1258, Tcache support block-level cache evict

bugfix:
1. cache reload core
2. support aio engine
3. cache fill TEST PASS
---
 src/leveldb/Makefile            | 4 ++--
 src/leveldb/util/block_cache.cc | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/leveldb/Makefile b/src/leveldb/Makefile
index 9073e98a5..72e322d16 100644
--- a/src/leveldb/Makefile
+++ b/src/leveldb/Makefile
@@ -7,7 +7,7 @@
 # to switch between compilation modes.
 
 # OPT ?= -O2 -DNDEBUG       # (A) Production use (optimized mode)
-OPT ?= -std=gnu++11 -g2 -Wall -Werror    # (B) Debug mode, w/ full line-level debugging symbols
+OPT ?= -g2 -Wall -Werror    # (B) Debug mode, w/ full line-level debugging symbols
 # OPT ?= -O2 -g2 -DNDEBUG # (C) Profiling mode: opt, but w/debugging symbols
 #-----------------------------------------------
 
@@ -19,7 +19,7 @@ include ../../depends.mk
 include build_config.mk
 
 CFLAGS += -I. -I./include $(PLATFORM_CCFLAGS) $(OPT)
-CXXFLAGS += -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT)
+CXXFLAGS += -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT) -std=gnu++11
 
 LDFLAGS += $(PLATFORM_LDFLAGS) -L$(SNAPPY_LIBDIR) -lrt -ldl -lsnappy
 LIBS += $(PLATFORM_LIBS)
diff --git a/src/leveldb/util/block_cache.cc b/src/leveldb/util/block_cache.cc
index 14722f98a..9d90f219a 100644
--- a/src/leveldb/util/block_cache.cc
+++ b/src/leveldb/util/block_cache.cc
@@ -641,7 +641,7 @@ class BlockCacheWritableFile : public WritableFile {
             port::CondVar cv(&cache_->mu_);
             cv.Wait(10); // timewait 10ms retry
         }
-        block->state = 0;
+        assert(block->state == 0);
         block->GetDataBlock(cache_->options_.block_size, Slice(*block_data));
         cache_->mu_.Unlock();
 
@@ -1119,7 +1119,7 @@ Status BlockCacheImpl::LockAndPut(LockContent& lc) {
 
     Waiter* w = NULL;
     LockKeyMap::iterator it = lock_key_.find(key);
-    if (it != lock_key_.end()){
+    if (it != lock_key_.end()) {
         w = it->second;
         w->wait_num ++;
         while (!w->done) {

From e516c241a9454a2a1979603a7ccac90c2d979911 Mon Sep 17 00:00:00 2001
From: caijieming <caijieming@baidu.com>
Date: Tue, 22 Aug 2017 01:47:18 +0800
Subject: [PATCH 09/19] issue=1258, Tcache support block-level cache evict

---
 src/leveldb/util/block_cache.cc   | 3 +--
 src/tabletnode/tabletnode_impl.cc | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/leveldb/util/block_cache.cc b/src/leveldb/util/block_cache.cc
index 9d90f219a..65f218eb4 100644
--- a/src/leveldb/util/block_cache.cc
+++ b/src/leveldb/util/block_cache.cc
@@ -174,6 +174,7 @@ class BlockCacheImpl {
 
     Status NewRandomAccessFile(const std::string& fname,
                                RandomAccessFile** result); // cache Pread
+
     static void BlockDeleter(const Slice& key, void* v);
 
     static void BGControlThreadFunc(void* arg);
@@ -424,7 +425,6 @@ class BlockCacheWriteBuffer {
         MutexLock l(&mu_);
         if (tmp_storage_ == NULL) {
             tmp_storage_ = new std::string();
-            tmp_storage_->resize(0);
             block_list_.push_back(tmp_storage_);
         }
         uint32_t begin = offset_ / block_size_;
@@ -438,7 +438,6 @@ class BlockCacheWriteBuffer {
             Slice buf(data.data() + tmp_size, data.size() - tmp_size);
             for (uint32_t i = begin + 1; i <= end; ++i) {
                 tmp_storage_ = new std::string();
-                tmp_storage_->resize(0);
                 block_list_.push_back(tmp_storage_);
                 if (i < end) { // last block
                     tmp_storage_->append(buf.data(), block_size_);
diff --git a/src/tabletnode/tabletnode_impl.cc b/src/tabletnode/tabletnode_impl.cc
index 4d7919fd6..0cfd97c86 100644
--- a/src/tabletnode/tabletnode_impl.cc
+++ b/src/tabletnode/tabletnode_impl.cc
@@ -199,7 +199,7 @@ void TabletNodeImpl::InitCacheSystem() {
     // compitable with legacy FlashEnv
     leveldb::FlashEnv* flash_env = (leveldb::FlashEnv*)io::LeveldbFlashEnv();
     flash_env->SetFlashPath(FLAGS_tera_tabletnode_cache_paths,
-            FLAGS_tera_io_cache_path_vanish_allowed);
+                            FLAGS_tera_io_cache_path_vanish_allowed);
     flash_env->SetUpdateFlashThreadNumber(FLAGS_tera_tabletnode_cache_update_thread_num);
     flash_env->SetIfForceReadFromCache(FLAGS_tera_tabletnode_cache_force_read_from_cache);
     return;

From a94781f95f555caf9ee343259d396923d3dcbc16 Mon Sep 17 00:00:00 2001
From: caijieming <caijieming@baidu.com>
Date: Tue, 22 Aug 2017 01:57:54 +0800
Subject: [PATCH 10/19] issue=1258, Tcache support block-level cache evict

---
 src/tabletnode/tabletnode_impl.cc | 4 ++--
 src/tera_flags.cc                 | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/tabletnode/tabletnode_impl.cc b/src/tabletnode/tabletnode_impl.cc
index 0cfd97c86..52077a718 100644
--- a/src/tabletnode/tabletnode_impl.cc
+++ b/src/tabletnode/tabletnode_impl.cc
@@ -178,7 +178,7 @@ bool TabletNodeImpl::Init() {
 
 void TabletNodeImpl::InitCacheSystem() {
     if (FLAGS_tera_tabletnode_block_cache_enabled) {
-        LOG(INFO) << "Tcache: set flash path: " << FLAGS_tera_tabletnode_cache_paths;
+        LOG(INFO) << "t-cache: set flash path: " << FLAGS_tera_tabletnode_cache_paths;
         std::vector<std::string> path_list;
         SplitString(FLAGS_tera_tabletnode_cache_paths, ";", &path_list);
 
@@ -187,7 +187,7 @@ void TabletNodeImpl::InitCacheSystem() {
             posix_env->CreateDir(path_list[i]);
         }
 
-        LOG(INFO) << "activate Tcache system";
+        LOG(INFO) << "activate t-cache system";
         leveldb::Env* block_cache_env = io::DefaultBlockCacheEnv();
         for (uint32_t i = 0; i < path_list.size(); ++i) {
             leveldb::BlockCacheOptions opts;
diff --git a/src/tera_flags.cc b/src/tera_flags.cc
index e4f77130a..949c36b98 100644
--- a/src/tera_flags.cc
+++ b/src/tera_flags.cc
@@ -64,7 +64,7 @@ DEFINE_int32(tera_leveldb_env_dfs_seek_latency, 10000000, "the random access lat
 DEFINE_int32(tera_memenv_table_cache_size, 100, "the max open file number in leveldb table_cache");
 DEFINE_int32(tera_memenv_block_cache_size, 10000, "block cache size for leveldb which do not use share block cache");
 DEFINE_bool(tera_use_flash_for_memenv, true, "Use flashenv for memery lg");
-DEFINE_int32(tera_leveldb_block_cache_env_thread_num, 30, "thread num of Tcache");
+DEFINE_int32(tera_leveldb_block_cache_env_thread_num, 30, "thread num of t-cache");
 
 DEFINE_string(tera_leveldb_compact_strategy, "default", "the default strategy to drive consum compaction, should be [default|LG|dummy]");
 DEFINE_bool(tera_leveldb_verify_checksums, true, "enable verify data read from storage against checksums");
@@ -202,7 +202,7 @@ DEFINE_string(tera_tabletnode_cpu_affinity_set, "1,2", "the cpu set of cpu affin
 DEFINE_bool(tera_tabletnode_hang_detect_enabled, false, "enable detect read/write hang");
 DEFINE_int32(tera_tabletnode_hang_detect_threshold, 60000, "read/write hang detect threshold (in ms)");
 
-DEFINE_bool(tera_tabletnode_block_cache_enabled, true, "enable Tcache mechasism");
+DEFINE_bool(tera_tabletnode_block_cache_enabled, true, "enable t-cache mechasism");
 DEFINE_string(tera_tabletnode_cache_paths, "../data/cache/", "paths for cached data storage. Mutiple definition like: \"./path1/;./path2/\"");
 DEFINE_int32(tera_tabletnode_cache_block_size, 8192, "the block size of cache system");
 DEFINE_string(tera_tabletnode_cache_name, "tera.cache", "prefix name for cache name");

From 1c4682e58974a1d1c36928698ef7e8ae222f0e64 Mon Sep 17 00:00:00 2001
From: caijieming <caijieming@baidu.com>
Date: Tue, 22 Aug 2017 16:08:21 +0800
Subject: [PATCH 11/19] issue=1258, Tcache support block-level cache evict

---
 src/leveldb/include/leveldb/statistics.h | 311 +----------------------
 src/leveldb/util/block_cache.cc          | 114 ++++-----
 src/leveldb/util/statistics.cc           |   7 +-
 3 files changed, 67 insertions(+), 365 deletions(-)

diff --git a/src/leveldb/include/leveldb/statistics.h b/src/leveldb/include/leveldb/statistics.h
index 81d4a4729..db4133e99 100644
--- a/src/leveldb/include/leveldb/statistics.h
+++ b/src/leveldb/include/leveldb/statistics.h
@@ -19,260 +19,12 @@ namespace leveldb {
  *  2. Add a readable string in TickersNameMap below for the newly added ticker.
  */
 enum Tickers : uint32_t {
-  // total block cache misses
-  // REQUIRES: BLOCK_CACHE_MISS == BLOCK_CACHE_INDEX_MISS +
-  //                               BLOCK_CACHE_FILTER_MISS +
-  //                               BLOCK_CACHE_DATA_MISS;
-  BLOCK_CACHE_MISS = 0,
-  // total block cache hit
-  // REQUIRES: BLOCK_CACHE_HIT == BLOCK_CACHE_INDEX_HIT +
-  //                              BLOCK_CACHE_FILTER_HIT +
-  //                              BLOCK_CACHE_DATA_HIT;
-  BLOCK_CACHE_HIT,
-  // # of blocks added to block cache.
-  BLOCK_CACHE_ADD,
-  // # of failures when adding blocks to block cache.
-  BLOCK_CACHE_ADD_FAILURES,
-  // # of times cache miss when accessing index block from block cache.
-  BLOCK_CACHE_INDEX_MISS,
-  // # of times cache hit when accessing index block from block cache.
-  BLOCK_CACHE_INDEX_HIT,
-  // # of bytes of index blocks inserted into cache
-  BLOCK_CACHE_INDEX_BYTES_INSERT,
-  // # of bytes of index block erased from cache
-  BLOCK_CACHE_INDEX_BYTES_EVICT,
-  // # of times cache miss when accessing filter block from block cache.
-  BLOCK_CACHE_FILTER_MISS,
-  // # of times cache hit when accessing filter block from block cache.
-  BLOCK_CACHE_FILTER_HIT,
-  // # of bytes of bloom filter blocks inserted into cache
-  BLOCK_CACHE_FILTER_BYTES_INSERT,
-  // # of bytes of bloom filter block erased from cache
-  BLOCK_CACHE_FILTER_BYTES_EVICT,
-  // # of times cache miss when accessing data block from block cache.
-  BLOCK_CACHE_DATA_MISS,
-  // # of times cache hit when accessing data block from block cache.
-  BLOCK_CACHE_DATA_HIT,
-  // # of bytes read from cache.
-  BLOCK_CACHE_BYTES_READ,
-  // # of bytes written into cache.
-  BLOCK_CACHE_BYTES_WRITE,
-
-  // # of times bloom filter has avoided file reads.
-  BLOOM_FILTER_USEFUL,
-
-  // # persistent cache hit
-  PERSISTENT_CACHE_HIT,
-  // # persistent cache miss
-  PERSISTENT_CACHE_MISS,
-
-  // # of memtable hits.
-  MEMTABLE_HIT,
-  // # of memtable misses.
-  MEMTABLE_MISS,
-
-  // # of Get() queries served by L0
-  GET_HIT_L0,
-  // # of Get() queries served by L1
-  GET_HIT_L1,
-  // # of Get() queries served by L2 and up
-  GET_HIT_L2_AND_UP,
-
-  /**
-   * COMPACTION_KEY_DROP_* count the reasons for key drop during compaction
-   * There are 3 reasons currently.
-   * 覆盖写；删除；用户函数删除
-   */
-  COMPACTION_KEY_DROP_NEWER_ENTRY,  // key was written with a newer value.
-  COMPACTION_KEY_DROP_OBSOLETE,     // The key is obsolete.
-  COMPACTION_KEY_DROP_USER,  // user compaction function has dropped the key.
-
-  // Number of keys written to the database via the Put and Write call's
-  NUMBER_KEYS_WRITTEN,
-  // Number of Keys read,
-  NUMBER_KEYS_READ,
-  // Number keys updated, if inplace update is enabled
-  NUMBER_KEYS_UPDATED,
-  // The number of uncompressed bytes issued by DB::Put(), DB::Delete(),
-  // DB::Merge(), and DB::Write().
-  BYTES_WRITTEN,
-  // The number of uncompressed bytes read from DB::Get().  It could be
-  // either from memtables, cache, or table files.
-  // For the number of logical bytes read from DB::MultiGet(),
-  // please use NUMBER_MULTIGET_BYTES_READ.
-  BYTES_READ,
-  // The number of calls to seek/next/prev
-  NUMBER_DB_SEEK,
-  NUMBER_DB_NEXT,
-  NUMBER_DB_PREV,
-  // The number of calls to seek/next/prev that returned data
-  NUMBER_DB_SEEK_FOUND,
-  NUMBER_DB_NEXT_FOUND,
-  NUMBER_DB_PREV_FOUND,
-  // The number of uncompressed bytes read from an iterator.
-  // Includes size of key and value.
-  ITER_BYTES_READ,
-  NO_FILE_CLOSES,
-  NO_FILE_OPENS,
-  NO_FILE_ERRORS,
-  // DEPRECATED Time system had to wait to do LO-L1 compactions
-  STALL_L0_SLOWDOWN_MICROS,
-  // DEPRECATED Time system had to wait to move memtable to L1.
-  STALL_MEMTABLE_COMPACTION_MICROS,
-  // DEPRECATED write throttle because of too many files in L0
-  STALL_L0_NUM_FILES_MICROS,
-  // Writer has to wait for compaction or flush to finish.
-  STALL_MICROS,
-  // The wait time for db mutex.
-  // Disabled by default. To enable it set stats level to kAll
-  DB_MUTEX_WAIT_MICROS,
-  RATE_LIMIT_DELAY_MILLIS,
-  NO_ITERATORS,  // number of iterators currently open
-
-  // Number of MultiGet calls, keys read, and bytes read
-  NUMBER_MULTIGET_CALLS,
-  NUMBER_MULTIGET_KEYS_READ,
-  NUMBER_MULTIGET_BYTES_READ,
-
-  // Number of deletes records that were not required to be
-  // written to storage because key does not exist
-  NUMBER_FILTERED_DELETES,
-  NUMBER_MERGE_FAILURES,
-  SEQUENCE_NUMBER,
-
-  // number of times bloom was checked before creating iterator on a
-  // file, and the number of times the check was useful in avoiding
-  // iterator creation (and thus likely IOPs).
-  BLOOM_FILTER_PREFIX_CHECKED,
-  BLOOM_FILTER_PREFIX_USEFUL,
-
-  // Number of times we had to reseek inside an iteration to skip
-  // over large number of keys with same userkey.
-  NUMBER_OF_RESEEKS_IN_ITERATION,
-
-  // Record the number of calls to GetUpadtesSince. Useful to keep track of
-  // transaction log iterator refreshes
-  GET_UPDATES_SINCE_CALLS,
-  BLOCK_CACHE_COMPRESSED_MISS,  // miss in the compressed block cache
-  BLOCK_CACHE_COMPRESSED_HIT,   // hit in the compressed block cache
-  // Number of blocks added to comopressed block cache
-  BLOCK_CACHE_COMPRESSED_ADD,
-  // Number of failures when adding blocks to compressed block cache
-  BLOCK_CACHE_COMPRESSED_ADD_FAILURES,
-  WAL_FILE_SYNCED,  // Number of times WAL sync is done
-  WAL_FILE_BYTES,   // Number of bytes written to WAL
-
-  // Writes can be processed by requesting thread or by the thread at the
-  // head of the writers queue.
-  WRITE_DONE_BY_SELF,
-  WRITE_DONE_BY_OTHER,  // Equivalent to writes done for others
-  WRITE_TIMEDOUT,       // Number of writes ending up with timed-out.
-  WRITE_WITH_WAL,       // Number of Write calls that request WAL
-  COMPACT_READ_BYTES,   // Bytes read during compaction
-  COMPACT_WRITE_BYTES,  // Bytes written during compaction
-  FLUSH_WRITE_BYTES,    // Bytes written during flush
-
-  // Number of table's properties loaded directly from file, without creating
-  // table reader object.
-  NUMBER_DIRECT_LOAD_TABLE_PROPERTIES,
-  NUMBER_SUPERVERSION_ACQUIRES,
-  NUMBER_SUPERVERSION_RELEASES,
-  NUMBER_SUPERVERSION_CLEANUPS,
-  NUMBER_BLOCK_NOT_COMPRESSED,
-  MERGE_OPERATION_TOTAL_TIME,
-  FILTER_OPERATION_TOTAL_TIME,
-
-  // Row cache.
-  ROW_CACHE_HIT,
-  ROW_CACHE_MISS,
-
   TICKER_ENUM_MAX
 };
 
 // The order of items listed in  Tickers should be the same as
 // the order listed in TickersNameMap
 const std::vector<std::pair<Tickers, std::string> > TickersNameMap = {
-    {BLOCK_CACHE_MISS, "leveldb.block.cache.miss"},
-    {BLOCK_CACHE_HIT, "leveldb.block.cache.hit"},
-    {BLOCK_CACHE_ADD, "leveldb.block.cache.add"},
-    {BLOCK_CACHE_ADD_FAILURES, "leveldb.block.cache.add.failures"},
-    {BLOCK_CACHE_INDEX_MISS, "leveldb.block.cache.index.miss"},
-    {BLOCK_CACHE_INDEX_HIT, "leveldb.block.cache.index.hit"},
-    {BLOCK_CACHE_INDEX_BYTES_INSERT, "leveldb.block.cache.index.bytes.insert"},
-    {BLOCK_CACHE_INDEX_BYTES_EVICT, "leveldb.block.cache.index.bytes.evict"},
-    {BLOCK_CACHE_FILTER_MISS, "leveldb.block.cache.filter.miss"},
-    {BLOCK_CACHE_FILTER_HIT, "leveldb.block.cache.filter.hit"},
-    {BLOCK_CACHE_FILTER_BYTES_INSERT,
-     "leveldb.block.cache.filter.bytes.insert"},
-    {BLOCK_CACHE_FILTER_BYTES_EVICT, "leveldb.block.cache.filter.bytes.evict"},
-    {BLOCK_CACHE_DATA_MISS, "leveldb.block.cache.data.miss"},
-    {BLOCK_CACHE_DATA_HIT, "leveldb.block.cache.data.hit"},
-    {BLOCK_CACHE_BYTES_READ, "leveldb.block.cache.bytes.read"},
-    {BLOCK_CACHE_BYTES_WRITE, "leveldb.block.cache.bytes.write"},
-    {BLOOM_FILTER_USEFUL, "leveldb.bloom.filter.useful"},
-    {MEMTABLE_HIT, "leveldb.memtable.hit"},
-    {MEMTABLE_MISS, "leveldb.memtable.miss"},
-    {GET_HIT_L0, "leveldb.l0.hit"},
-    {GET_HIT_L1, "leveldb.l1.hit"},
-    {GET_HIT_L2_AND_UP, "leveldb.l2andup.hit"},
-    {COMPACTION_KEY_DROP_NEWER_ENTRY, "leveldb.compaction.key.drop.new"},
-    {COMPACTION_KEY_DROP_OBSOLETE, "leveldb.compaction.key.drop.obsolete"},
-    {COMPACTION_KEY_DROP_USER, "leveldb.compaction.key.drop.user"},
-    {NUMBER_KEYS_WRITTEN, "leveldb.number.keys.written"},
-    {NUMBER_KEYS_READ, "leveldb.number.keys.read"},
-    {NUMBER_KEYS_UPDATED, "leveldb.number.keys.updated"},
-    {BYTES_WRITTEN, "leveldb.bytes.written"},
-    {BYTES_READ, "leveldb.bytes.read"},
-    {NUMBER_DB_SEEK, "leveldb.number.db.seek"},
-    {NUMBER_DB_NEXT, "leveldb.number.db.next"},
-    {NUMBER_DB_PREV, "leveldb.number.db.prev"},
-    {NUMBER_DB_SEEK_FOUND, "leveldb.number.db.seek.found"},
-    {NUMBER_DB_NEXT_FOUND, "leveldb.number.db.next.found"},
-    {NUMBER_DB_PREV_FOUND, "leveldb.number.db.prev.found"},
-    {ITER_BYTES_READ, "leveldb.db.iter.bytes.read"},
-    {NO_FILE_CLOSES, "leveldb.no.file.closes"},
-    {NO_FILE_OPENS, "leveldb.no.file.opens"},
-    {NO_FILE_ERRORS, "leveldb.no.file.errors"},
-    {STALL_L0_SLOWDOWN_MICROS, "leveldb.l0.slowdown.micros"},
-    {STALL_MEMTABLE_COMPACTION_MICROS, "leveldb.memtable.compaction.micros"},
-    {STALL_L0_NUM_FILES_MICROS, "leveldb.l0.num.files.stall.micros"},
-    {STALL_MICROS, "leveldb.stall.micros"},
-    {DB_MUTEX_WAIT_MICROS, "leveldb.db.mutex.wait.micros"},
-    {RATE_LIMIT_DELAY_MILLIS, "leveldb.rate.limit.delay.millis"},
-    {NO_ITERATORS, "leveldb.num.iterators"},
-    {NUMBER_MULTIGET_CALLS, "leveldb.number.multiget.get"},
-    {NUMBER_MULTIGET_KEYS_READ, "leveldb.number.multiget.keys.read"},
-    {NUMBER_MULTIGET_BYTES_READ, "leveldb.number.multiget.bytes.read"},
-    {NUMBER_FILTERED_DELETES, "leveldb.number.deletes.filtered"},
-    {NUMBER_MERGE_FAILURES, "leveldb.number.merge.failures"},
-    {SEQUENCE_NUMBER, "leveldb.sequence.number"},
-    {BLOOM_FILTER_PREFIX_CHECKED, "leveldb.bloom.filter.prefix.checked"},
-    {BLOOM_FILTER_PREFIX_USEFUL, "leveldb.bloom.filter.prefix.useful"},
-    {NUMBER_OF_RESEEKS_IN_ITERATION, "leveldb.number.reseeks.iteration"},
-    {GET_UPDATES_SINCE_CALLS, "leveldb.getupdatessince.calls"},
-    {BLOCK_CACHE_COMPRESSED_MISS, "leveldb.block.cachecompressed.miss"},
-    {BLOCK_CACHE_COMPRESSED_HIT, "leveldb.block.cachecompressed.hit"},
-    {BLOCK_CACHE_COMPRESSED_ADD, "leveldb.block.cachecompressed.add"},
-    {BLOCK_CACHE_COMPRESSED_ADD_FAILURES,
-     "leveldb.block.cachecompressed.add.failures"},
-    {WAL_FILE_SYNCED, "leveldb.wal.synced"},
-    {WAL_FILE_BYTES, "leveldb.wal.bytes"},
-    {WRITE_DONE_BY_SELF, "leveldb.write.self"},
-    {WRITE_DONE_BY_OTHER, "leveldb.write.other"},
-    {WRITE_WITH_WAL, "leveldb.write.wal"},
-    {FLUSH_WRITE_BYTES, "leveldb.flush.write.bytes"},
-    {COMPACT_READ_BYTES, "leveldb.compact.read.bytes"},
-    {COMPACT_WRITE_BYTES, "leveldb.compact.write.bytes"},
-    {NUMBER_DIRECT_LOAD_TABLE_PROPERTIES,
-     "leveldb.number.direct.load.table.properties"},
-    {NUMBER_SUPERVERSION_ACQUIRES, "leveldb.number.superversion_acquires"},
-    {NUMBER_SUPERVERSION_RELEASES, "leveldb.number.superversion_releases"},
-    {NUMBER_SUPERVERSION_CLEANUPS, "leveldb.number.superversion_cleanups"},
-    {NUMBER_BLOCK_NOT_COMPRESSED, "leveldb.number.block.not_compressed"},
-    {MERGE_OPERATION_TOTAL_TIME, "leveldb.merge.operation.time.nanos"},
-    {FILTER_OPERATION_TOTAL_TIME, "leveldb.filter.operation.time.nanos"},
-    {ROW_CACHE_HIT, "leveldb.row.cache.hit"},
-    {ROW_CACHE_MISS, "leveldb.row.cache.miss"},
 };
 
 /**
@@ -283,74 +35,25 @@ const std::vector<std::pair<Tickers, std::string> > TickersNameMap = {
  * And increment HISTOGRAM_ENUM_MAX
  */
 enum Histograms : uint32_t {
-  DB_GET = 0,
-  DB_WRITE,
-  COMPACTION_TIME,
-  SUBCOMPACTION_SETUP_TIME,
-  TABLE_SYNC_MICROS,
-  COMPACTION_OUTFILE_SYNC_MICROS,
-  WAL_FILE_SYNC_MICROS,
-  MANIFEST_FILE_SYNC_MICROS,
-  // TIME SPENT IN IO DURING TABLE OPEN
-  TABLE_OPEN_IO_MICROS,
-  DB_MULTIGET,
-  READ_BLOCK_COMPACTION_MICROS,
-  READ_BLOCK_GET_MICROS,
-  WRITE_RAW_BLOCK_MICROS,
-  STALL_L0_SLOWDOWN_COUNT,
-  STALL_MEMTABLE_COMPACTION_COUNT,
-  STALL_L0_NUM_FILES_COUNT,
-  HARD_RATE_LIMIT_DELAY_COUNT,
-  SOFT_RATE_LIMIT_DELAY_COUNT,
-  NUM_FILES_IN_SINGLE_COMPACTION,
-  DB_SEEK,
-  WRITE_STALL,
-  SST_READ_MICROS,
-  // The number of subcompactions actually scheduled during a compaction
-  NUM_SUBCOMPACTIONS_SCHEDULED,
-  // Value size distribution in each operation
-  BYTES_PER_READ,
-  BYTES_PER_WRITE,
-  BYTES_PER_MULTIGET,
   // tera block cache spec
-  TERA_BLOCK_CACHE_PREAD_QUEUE,
+  TERA_BLOCK_CACHE_PREAD_QUEUE = 0,
   TERA_BLOCK_CACHE_PREAD_SSD_READ,
   TERA_BLOCK_CACHE_PREAD_FILL_USER_DATA,
   TERA_BLOCK_CACHE_PREAD_RELEASE_BLOCK,
+  TERA_BLOCK_CACHE_LOCKMAP_DS_RELOAD_NR,
+  TERA_BLOCK_CACHE_PREAD_GET_BLOCK,
+  TERA_BLOCK_CACHE_PREAD_BLOCK_NR,
   HISTOGRAM_ENUM_MAX,  // TODO(ldemailly): enforce HistogramsNameMap match
 };
 
 const std::vector<std::pair<Histograms, std::string> > HistogramsNameMap = {
-    {DB_GET, "leveldb.db.get.micros"},
-    {DB_WRITE, "leveldb.db.write.micros"},
-    {COMPACTION_TIME, "leveldb.compaction.times.micros"},
-    {SUBCOMPACTION_SETUP_TIME, "leveldb.subcompaction.setup.times.micros"},
-    {TABLE_SYNC_MICROS, "leveldb.table.sync.micros"},
-    {COMPACTION_OUTFILE_SYNC_MICROS, "leveldb.compaction.outfile.sync.micros"},
-    {WAL_FILE_SYNC_MICROS, "leveldb.wal.file.sync.micros"},
-    {MANIFEST_FILE_SYNC_MICROS, "leveldb.manifest.file.sync.micros"},
-    {TABLE_OPEN_IO_MICROS, "leveldb.table.open.io.micros"},
-    {DB_MULTIGET, "leveldb.db.multiget.micros"},
-    {READ_BLOCK_COMPACTION_MICROS, "leveldb.read.block.compaction.micros"},
-    {READ_BLOCK_GET_MICROS, "leveldb.read.block.get.micros"},
-    {WRITE_RAW_BLOCK_MICROS, "leveldb.write.raw.block.micros"},
-    {STALL_L0_SLOWDOWN_COUNT, "leveldb.l0.slowdown.count"},
-    {STALL_MEMTABLE_COMPACTION_COUNT, "leveldb.memtable.compaction.count"},
-    {STALL_L0_NUM_FILES_COUNT, "leveldb.num.files.stall.count"},
-    {HARD_RATE_LIMIT_DELAY_COUNT, "leveldb.hard.rate.limit.delay.count"},
-    {SOFT_RATE_LIMIT_DELAY_COUNT, "leveldb.soft.rate.limit.delay.count"},
-    {NUM_FILES_IN_SINGLE_COMPACTION, "leveldb.numfiles.in.singlecompaction"},
-    {DB_SEEK, "leveldb.db.seek.micros"},
-    {WRITE_STALL, "leveldb.db.write.stall"},
-    {SST_READ_MICROS, "leveldb.sst.read.micros"},
-    {NUM_SUBCOMPACTIONS_SCHEDULED, "leveldb.num.subcompactions.scheduled"},
-    {BYTES_PER_READ, "leveldb.bytes.per.read"},
-    {BYTES_PER_WRITE, "leveldb.bytes.per.write"},
-    {BYTES_PER_MULTIGET, "leveldb.bytes.per.multiget"},
     {TERA_BLOCK_CACHE_PREAD_QUEUE, "tera.block_cache.pread_queue"},
     {TERA_BLOCK_CACHE_PREAD_SSD_READ, "tera.block_cache.pread_ssd_read"},
     {TERA_BLOCK_CACHE_PREAD_FILL_USER_DATA, "tera.block_cache.pread_fill_user_data"},
     {TERA_BLOCK_CACHE_PREAD_RELEASE_BLOCK, "tera.block_cache.pread_release_block"},
+    {TERA_BLOCK_CACHE_LOCKMAP_DS_RELOAD_NR, "tera.block_cache.lockmap_ds_reload_nr"},
+    {TERA_BLOCK_CACHE_PREAD_GET_BLOCK, "tera.block_cache.pread_get_block"},
+    {TERA_BLOCK_CACHE_PREAD_BLOCK_NR, "tera.block_cache.pread_block_nr"},
 };
 
 struct HistogramData {
diff --git a/src/leveldb/util/block_cache.cc b/src/leveldb/util/block_cache.cc
index 65f218eb4..0ad4b6f35 100644
--- a/src/leveldb/util/block_cache.cc
+++ b/src/leveldb/util/block_cache.cc
@@ -524,28 +524,7 @@ class BlockCacheWritableFile : public WritableFile {
         return;
     }
 
-    ~BlockCacheWritableFile() {
-        if (dfs_file_ != NULL) {
-            Log("[%s] dfs close for release %s\n", cache_->WorkPath().c_str(), fname_.c_str());
-            dfs_file_->Close();
-            delete dfs_file_;
-            dfs_file_ = NULL;
-        }
-
-        Log("[%s] begin release %s\n", cache_->WorkPath().c_str(), fname_.c_str());
-        MutexLock lockgard(&cache_->mu_);
-        uint64_t block_idx;
-        std::string* block_data = write_buffer_.PopBackBlock(&block_idx);
-        if (block_data != NULL) {
-            FillCache(block_data, block_idx);
-        }
-
-        while (bg_block_flush_ > 0) {
-            bg_cv_.Wait();
-        }
-        Log("[%s] end release %s\n", cache_->WorkPath().c_str(), fname_.c_str());
-        return;
-    }
+    ~BlockCacheWritableFile() { Close(); }
 
     Status Append(const Slice& data) {
         Status s = dfs_file_->Append(data);
@@ -564,10 +543,12 @@ class BlockCacheWritableFile : public WritableFile {
     }
 
     Status Close() {
-        Log("[%s] begin close %s\n", cache_->WorkPath().c_str(), fname_.c_str());
-        Status s = dfs_file_->Close();
-        delete dfs_file_;
-        dfs_file_ = NULL;
+        Status s;
+        if (dfs_file_ != NULL) {
+            s = dfs_file_->Close();
+            delete dfs_file_;
+            dfs_file_ = NULL;
+        }
 
         MutexLock lockgard(&cache_->mu_);
         uint64_t block_idx;
@@ -579,8 +560,8 @@ class BlockCacheWritableFile : public WritableFile {
         while (bg_block_flush_ > 0) {
             bg_cv_.Wait();
         }
-        Log("[%s] end close %s, status %s\n", cache_->WorkPath().c_str(), fname_.c_str(),
-            s.ToString().c_str());
+        //Log("[%s] end close %s, status %s\n", cache_->WorkPath().c_str(), fname_.c_str(),
+        //    s.ToString().c_str());
         return s;
     }
 
@@ -640,7 +621,7 @@ class BlockCacheWritableFile : public WritableFile {
             port::CondVar cv(&cache_->mu_);
             cv.Wait(10); // timewait 10ms retry
         }
-        assert(block->state == 0);
+        block->state = 0;
         block->GetDataBlock(cache_->options_.block_size, Slice(*block_data));
         cache_->mu_.Unlock();
 
@@ -686,7 +667,7 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
 
         MutexLock lockgard(&cache_->mu_);
         fid_ = cache_->FileId(fname_);
-        aio_enabled_ = true;
+        aio_enabled_ = false;
         return;
     }
 
@@ -715,6 +696,7 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
         uint64_t start_ts = cache_->options_.cache_env->NowMicros();
         MutexLock lockgard(&cache_->mu_);
         for (uint64_t block_idx = begin; block_idx <= end; ++block_idx) {
+            uint64_t get_block_ts = cache_->options_.cache_env->NowMicros();
             CacheBlock* block = NULL;
             while ((block = cache_->GetAndAllocBlock(fid, block_idx)) == NULL) {
                 Log("[%s] fill cache for read %s, fid %lu, block_idx %lu, wait 10ms after retry\n",
@@ -742,10 +724,13 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
             //    cache_->WorkPath().c_str(), block->ToString().c_str(),
             //    block->handle->refs, block->data_block_refs,
             //    block->data_block_alloc);
+            cache_->stat_->MeasureTime(TERA_BLOCK_CACHE_PREAD_GET_BLOCK,
+                                       cache_->options_.cache_env->NowMicros() - get_block_ts);
         }
         cache_->mu_.Unlock();
         uint64_t queue_ts = cache_->options_.cache_env->NowMicros();
         cache_->stat_->MeasureTime(TERA_BLOCK_CACHE_PREAD_QUEUE, queue_ts - start_ts);
+        cache_->stat_->MeasureTime(TERA_BLOCK_CACHE_PREAD_BLOCK_NR, end - begin + 1);
 
         // async read miss data
         for (uint32_t i = 0; i < c_miss.size(); ++i) {
@@ -1058,21 +1043,24 @@ void BlockCacheImpl::BGControlThreadFunc(void* arg) {
 }
 
 void BlockCacheImpl::BGControlThread() {
-    Log("[%s] statistics: %s", this->WorkPath().c_str(),
-        stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_QUEUE).c_str());
-    Log("[%s] statistics: %s", this->WorkPath().c_str(),
-        stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_SSD_READ).c_str());
-    Log("[%s] statistics: %s", this->WorkPath().c_str(),
-        stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_FILL_USER_DATA).c_str());
-    Log("[%s] statistics: %s", this->WorkPath().c_str(),
-        stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_RELEASE_BLOCK).c_str());
+    Log("[%s] statistics: %s, %s, %s, %s, %s, %s, %s\n", this->WorkPath().c_str(),
+        stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_QUEUE).c_str(),
+        stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_SSD_READ).c_str(),
+        stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_FILL_USER_DATA).c_str(),
+        stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_RELEASE_BLOCK).c_str(),
+        stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_LOCKMAP_DS_RELOAD_NR).c_str(),
+        stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_GET_BLOCK).c_str(),
+        stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_BLOCK_NR).c_str());
 
     // resched after 1s
     stat_->ClearHistogram(TERA_BLOCK_CACHE_PREAD_QUEUE);
     stat_->ClearHistogram(TERA_BLOCK_CACHE_PREAD_SSD_READ);
     stat_->ClearHistogram(TERA_BLOCK_CACHE_PREAD_FILL_USER_DATA);
     stat_->ClearHistogram(TERA_BLOCK_CACHE_PREAD_RELEASE_BLOCK);
-    bg_control_.Schedule(&BlockCacheImpl::BGControlThreadFunc, this, 10, 10000);
+    stat_->ClearHistogram(TERA_BLOCK_CACHE_LOCKMAP_DS_RELOAD_NR);
+    stat_->ClearHistogram(TERA_BLOCK_CACHE_PREAD_GET_BLOCK);
+    stat_->ClearHistogram(TERA_BLOCK_CACHE_PREAD_BLOCK_NR);
+    bg_control_.Schedule(&BlockCacheImpl::BGControlThreadFunc, this, 10, 6000);
 }
 
 Status BlockCacheImpl::NewWritableFile(const std::string& fname,
@@ -1192,6 +1180,7 @@ Status BlockCacheImpl::LockAndPut(LockContent& lc) {
                 lc.data_set->fd);
 
             // reload hash lru
+            uint64_t total_items = 0;
             ReadOptions s_opts;
             leveldb::Iterator* db_it = db_->NewIterator(s_opts);
             for (db_it->Seek(key);
@@ -1207,6 +1196,7 @@ Status BlockCacheImpl::LockAndPut(LockContent& lc) {
                 if (sid != lc.sid) {
                     break;
                 }
+                total_items++;
 
                 CacheBlock* block = new CacheBlock(&mu_);
                 block->DecodeFrom(db_it->value()); // get fid and block_idx
@@ -1216,16 +1206,17 @@ Status BlockCacheImpl::LockAndPut(LockContent& lc) {
                 block->sid = sid;
                 block->cache_block_idx = cbi;
                 block->state = (block->Test(kCacheBlockValid)) ? kCacheBlockValid : 0;
-                Log("[%s] Recovery %s, insert cacheblock into 2QLru, %s\n",
-                    this->WorkPath().c_str(),
-                    lc.KeyToString().c_str(),
-                    block->ToString().c_str());
+                //Log("[%s] Recovery %s, insert cacheblock into 2QLru, %s\n",
+                //    this->WorkPath().c_str(),
+                //    lc.KeyToString().c_str(),
+                //    block->ToString().c_str());
                 LRUHandle* handle = (LRUHandle*)(lc.data_set->cache->Insert(hkey, block, 1, &BlockCacheImpl::BlockDeleter));
                 handle->cache_id = block->cache_block_idx;
                 block->handle = handle;
                 lc.data_set->cache->Release((Cache::Handle*)handle);
             }
             delete db_it;
+            stat_->MeasureTime(TERA_BLOCK_CACHE_LOCKMAP_DS_RELOAD_NR, total_items);
 
             mu_.Lock();
             data_set_map_[lc.sid] = lc.data_set;
@@ -1285,7 +1276,7 @@ Status BlockCacheImpl::LoadCache() {
     Log("[block_cache %s]: reuse block cache: prev_fid: %lu, new_fid: %lu\n",
         dbname.c_str(), prev_fid_, new_fid_);
 
-    bg_control_.Schedule(&BlockCacheImpl::BGControlThreadFunc, this, 10, 10000);
+    bg_control_.Schedule(&BlockCacheImpl::BGControlThreadFunc, this, 10, 6000);
     s = Status::OK();
     return s;
 }
@@ -1462,24 +1453,33 @@ CacheBlock* BlockCacheImpl::GetAndAllocBlock(uint64_t fid, uint64_t block_idx) {
     CacheBlock* block = NULL;
     DataSet* ds = GetDataSet(sid); // get and alloc ds
     Cache* cache = ds->cache;
+    mu_.Unlock();
+
     LRUHandle* h = (LRUHandle*)cache->Lookup(key);
     if (h == NULL) {
-        block = new CacheBlock(&mu_);
-        h = (LRUHandle*)cache->Insert(key, block, 1, &BlockCacheImpl::BlockDeleter);
+        mu_.Lock();
+        h = (LRUHandle*)cache->Lookup(key);
         if (h == NULL) {
-            delete block;
-            return NULL;
+            block = new CacheBlock(&mu_);
+            h = (LRUHandle*)cache->Insert(key, block, 1, &BlockCacheImpl::BlockDeleter);
+            if (h == NULL) {
+                delete block;
+                return NULL;
+            }
+            block->fid = fid;
+            block->block_idx = block_idx;
+            block->sid = sid;
+            block->cache_block_idx = h->cache_id;
+            block->handle = h;
+            Log("[%s] Alloc Block: %s, sid %lu, fid %lu, block_idx %lu, hash %u, dataset_nr %lu\n",
+                    this->WorkPath().c_str(),
+                    block->ToString().c_str(),
+                    sid, fid, block_idx, hash, options_.dataset_num);
+        } else {
+            block = reinterpret_cast<CacheBlock*>(cache->Value((Cache::Handle*)h));
         }
-        block->fid = fid;
-        block->block_idx = block_idx;
-        block->sid = sid;
-        block->cache_block_idx = h->cache_id;
-        block->handle = h;
-        Log("[%s] Alloc Block: %s, sid %lu, fid %lu, block_idx %lu, hash %u, dataset_nr %lu\n",
-            this->WorkPath().c_str(),
-            block->ToString().c_str(),
-            sid, fid, block_idx, hash, options_.dataset_num);
     } else {
+        mu_.Lock();
         block = reinterpret_cast<CacheBlock*>(cache->Value((Cache::Handle*)h));
         //Log("[%s] get block from memcache, %s\n",
         //        this->WorkPath().c_str(), block->ToString().c_str());
diff --git a/src/leveldb/util/statistics.cc b/src/leveldb/util/statistics.cc
index ee383a055..74352ae3a 100644
--- a/src/leveldb/util/statistics.cc
+++ b/src/leveldb/util/statistics.cc
@@ -53,11 +53,10 @@ class StatisticsImpl : public Statistics {
     GetHistogramData(type, &hData);
     snprintf(buffer,
             200,
-            "%s statistics Percentiles :=> 50 : %f 95 : %f 99 : %f\n",
+            "%s :=> %f(%f)",
             HistogramsNameMap[type].second.c_str(),
-            hData.median,
-            hData.percentile95,
-            hData.percentile99);
+            hData.average,
+            hData.percentile99 - hData.median);
     res.append(buffer);
     res.shrink_to_fit();
     return res;

From 749296e36083fa64dd3db493e3879dec025f317e Mon Sep 17 00:00:00 2001
From: caijieming <caijieming@baidu.com>
Date: Thu, 24 Aug 2017 00:18:20 +0800
Subject: [PATCH 12/19] issue=1258, Tcache support block-level cache evict

---
 src/leveldb/include/leveldb/block_cache.h |   2 +-
 src/leveldb/include/leveldb/statistics.h  |  12 +-
 src/leveldb/util/block_cache.cc           | 167 +++++++++++++---------
 src/leveldb/util/statistics.cc            |   2 +-
 4 files changed, 110 insertions(+), 73 deletions(-)

diff --git a/src/leveldb/include/leveldb/block_cache.h b/src/leveldb/include/leveldb/block_cache.h
index 021964db4..e331e05f4 100644
--- a/src/leveldb/include/leveldb/block_cache.h
+++ b/src/leveldb/include/leveldb/block_cache.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 
diff --git a/src/leveldb/include/leveldb/statistics.h b/src/leveldb/include/leveldb/statistics.h
index db4133e99..62916892f 100644
--- a/src/leveldb/include/leveldb/statistics.h
+++ b/src/leveldb/include/leveldb/statistics.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 
@@ -43,6 +43,11 @@ enum Histograms : uint32_t {
   TERA_BLOCK_CACHE_LOCKMAP_DS_RELOAD_NR,
   TERA_BLOCK_CACHE_PREAD_GET_BLOCK,
   TERA_BLOCK_CACHE_PREAD_BLOCK_NR,
+  TERA_BLOCK_CACHE_GET_DS,
+  TERA_BLOCK_CACHE_DS_LRU_LOOKUP,
+  TERA_BLOCK_CACHE_PREAD_WAIT_UNLOCK,
+  TERA_BLOCK_CACHE_ALLOC_FID,
+  TERA_BLOCK_CACHE_GET_FID,
   HISTOGRAM_ENUM_MAX,  // TODO(ldemailly): enforce HistogramsNameMap match
 };
 
@@ -54,6 +59,11 @@ const std::vector<std::pair<Histograms, std::string> > HistogramsNameMap = {
     {TERA_BLOCK_CACHE_LOCKMAP_DS_RELOAD_NR, "tera.block_cache.lockmap_ds_reload_nr"},
     {TERA_BLOCK_CACHE_PREAD_GET_BLOCK, "tera.block_cache.pread_get_block"},
     {TERA_BLOCK_CACHE_PREAD_BLOCK_NR, "tera.block_cache.pread_block_nr"},
+    {TERA_BLOCK_CACHE_GET_DS, "tera.block_cache.get_ds"},
+    {TERA_BLOCK_CACHE_DS_LRU_LOOKUP, "tera.block_cache.ds_lru_lookup"},
+    {TERA_BLOCK_CACHE_PREAD_WAIT_UNLOCK, "tera.block_cache.pread_wait_unlock"},
+    {TERA_BLOCK_CACHE_ALLOC_FID, "tera.block_cache.alloc_fid"},
+    {TERA_BLOCK_CACHE_GET_FID, "tera.block_cache.get_fid"},
 };
 
 struct HistogramData {
diff --git a/src/leveldb/util/block_cache.cc b/src/leveldb/util/block_cache.cc
index 0ad4b6f35..797088f84 100644
--- a/src/leveldb/util/block_cache.cc
+++ b/src/leveldb/util/block_cache.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 
@@ -38,7 +38,7 @@ namespace leveldb {
 // Tcache
 /////////////////////////////////////////////
 uint64_t kBlockSize = 4096UL;
-uint64_t kDataSetSize = 134217728UL;
+uint64_t kDataSetSize = 128UL << 20;
 uint64_t kFidBatchNum = 100000UL;
 uint64_t kCacheSize = 350000000000UL;
 uint64_t kMetaBlockSize = 2000UL;
@@ -63,6 +63,7 @@ struct CacheBlock {
     uint64_t sid;
     uint64_t cache_block_idx;
     volatile uint64_t state;
+    port::Mutex mu;
     port::CondVar cv;
     Slice data_block;
     bool data_block_alloc;
@@ -70,31 +71,35 @@ struct CacheBlock {
     LRUHandle* handle;
     Status s;
 
-    CacheBlock(port::Mutex* mu)
+    CacheBlock()
     : fid(0),
       block_idx(0),
       sid(0xffffffffffffffff),
       cache_block_idx(0xffffffffffffffff),
       state(0),
-      cv(mu),
+      cv(&mu),
       data_block_alloc(false),
       data_block_refs(0),
       handle(NULL) {
     }
 
     bool Test(uint64_t c_state) {
+        mu.AssertHeld();
         return (state & c_state) == c_state;
     }
 
     void Clear(uint64_t c_state) {
+        mu.AssertHeld();
         state &= ~c_state;
     }
 
     void Set(uint64_t c_state) {
+        mu.AssertHeld();
         state |= c_state;
     }
 
     void WaitOnClear(uint64_t c_state) { // access in lock
+        mu.AssertHeld();
         while (Test(c_state)) {
             cv.Wait();
         }
@@ -507,7 +512,7 @@ class BlockCacheWritableFile : public WritableFile {
 public:
     BlockCacheWritableFile(BlockCacheImpl* c, const std::string& fname, Status* s)
         : cache_(c),
-          bg_cv_(&c->mu_),
+          bg_cv_(&mu_),
           bg_block_flush_(0),
           pending_block_num_(0),
           write_buffer_(cache_->WorkPath(), fname, cache_->options_.block_size),
@@ -519,7 +524,6 @@ class BlockCacheWritableFile : public WritableFile {
             cache_->options_.block_size,
             s->ToString().c_str());
 
-        MutexLock lockgard(&cache_->mu_);
         fid_ = cache_->FileId(fname_);
         return;
     }
@@ -537,7 +541,7 @@ class BlockCacheWritableFile : public WritableFile {
         }
         write_buffer_.Append(data);
 
-        MutexLock lockgard(&cache_->mu_);
+        MutexLock lockgard(&mu_);
         MaybeScheduleBGFlush();
         return Status::OK();
     }
@@ -550,13 +554,13 @@ class BlockCacheWritableFile : public WritableFile {
             dfs_file_ = NULL;
         }
 
-        MutexLock lockgard(&cache_->mu_);
         uint64_t block_idx;
         std::string* block_data = write_buffer_.PopBackBlock(&block_idx);
         if (block_data != NULL) {
             FillCache(block_data, block_idx);
         }
 
+        MutexLock lockgard(&mu_);
         while (bg_block_flush_ > 0) {
             bg_cv_.Wait();
         }
@@ -577,7 +581,7 @@ class BlockCacheWritableFile : public WritableFile {
 
 private:
     void MaybeScheduleBGFlush() {
-        cache_->mu_.AssertHeld();
+        mu_.AssertHeld();
         //Log("[%s] Maybe schedule BGFlush: %s, bg_block_flush: %u, block_nr: %u\n",
         //    cache_->WorkPath().c_str(),
         //    fname_.c_str(),
@@ -594,12 +598,15 @@ class BlockCacheWritableFile : public WritableFile {
     }
     void BGFlush() {
         Log("[%s] Begin BGFlush: %s\n", cache_->WorkPath().c_str(), fname_.c_str());
-        MutexLock lockgard(&cache_->mu_);
+        MutexLock lockgard(&mu_);
         uint64_t block_idx;
         std::string* block_data = write_buffer_.PopFrontBlock(&block_idx);
         if (block_data != NULL) {
             pending_block_num_++;
+            mu_.Unlock();
+
             FillCache(block_data, block_idx);
+            mu_.Lock();
             pending_block_num_--;
         }
 
@@ -610,7 +617,6 @@ class BlockCacheWritableFile : public WritableFile {
     }
 
     Status FillCache(std::string* block_data, uint64_t block_idx) {
-        cache_->mu_.AssertHeld();
         Status s;
         uint64_t fid = fid_;
         CacheBlock* block = NULL;
@@ -618,22 +624,22 @@ class BlockCacheWritableFile : public WritableFile {
             Log("[%s] fill cache for write %s, fid %lu, block_idx %lu, wait 10ms after retry\n",
                 cache_->WorkPath().c_str(), fname_.c_str(),
                 fid, block_idx);
-            port::CondVar cv(&cache_->mu_);
-            cv.Wait(10); // timewait 10ms retry
+            cache_->options_.cache_env->SleepForMicroseconds(10000);
         }
+
+        block->mu.Lock();
         block->state = 0;
         block->GetDataBlock(cache_->options_.block_size, Slice(*block_data));
-        cache_->mu_.Unlock();
+        block->mu.Unlock();
 
         // Do io without lock
         block->s = cache_->LogRecord(block);
         if (block->s.ok()) {
             block->s = cache_->FillCache(block);
-        }
-
-        cache_->mu_.Lock();
-        if (block->s.ok()) {
-            block->state = kCacheBlockValid;
+            if (block->s.ok()) {
+                MutexLock l(&block->mu);
+                block->state = kCacheBlockValid;
+            }
         }
         s = cache_->ReleaseBlock(block, true);
         write_buffer_.ReleaseBlock(block_data);
@@ -643,6 +649,7 @@ class BlockCacheWritableFile : public WritableFile {
 private:
     BlockCacheImpl* cache_;
     //port::AtomicPointer shutting_down_;
+    port::Mutex mu_;
     port::CondVar bg_cv_;          // Signalled when background work finishes
     WritableFile* dfs_file_;
     // protected by cache_.mu_
@@ -665,7 +672,6 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
         //    cache_->options_.block_size,
         //    s->ToString().c_str());
 
-        MutexLock lockgard(&cache_->mu_);
         fid_ = cache_->FileId(fname_);
         aio_enabled_ = false;
         return;
@@ -673,7 +679,6 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
 
     ~BlockCacheRandomAccessFile() {
         delete dfs_file_;
-        return;
     }
 
     Status Read(uint64_t offset, size_t n, Slice* result,
@@ -694,7 +699,6 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
         //    begin, end, cache_->options_.block_size);
 
         uint64_t start_ts = cache_->options_.cache_env->NowMicros();
-        MutexLock lockgard(&cache_->mu_);
         for (uint64_t block_idx = begin; block_idx <= end; ++block_idx) {
             uint64_t get_block_ts = cache_->options_.cache_env->NowMicros();
             CacheBlock* block = NULL;
@@ -702,13 +706,13 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
                 Log("[%s] fill cache for read %s, fid %lu, block_idx %lu, wait 10ms after retry\n",
                     cache_->WorkPath().c_str(), fname_.c_str(),
                     fid, block_idx);
-                port::CondVar cv(&cache_->mu_);
-                cv.Wait(10); // timewait 10ms retry
+                cache_->options_.cache_env->SleepForMicroseconds(10000);
             }
+
+            block->mu.Lock();
             assert(block->fid == fid && block->block_idx == block_idx);
             block->GetDataBlock(cache_->options_.block_size, Slice());
             block_queue.push_back(block); // sort by block_idx
-
             if (!block->Test(kCacheBlockLocked) &&
                 block->Test(kCacheBlockValid)) {
                 block->Set(kCacheBlockLocked | kCacheBlockCacheRead);
@@ -719,6 +723,7 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
             } else {
                 c_locked.push_back(block);
             }
+            block->mu.Unlock();
 
             //Log("[%s] Queue block: %s, refs %u, data_block_refs %lu, alloc %u\n",
             //    cache_->WorkPath().c_str(), block->ToString().c_str(),
@@ -727,7 +732,6 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
             cache_->stat_->MeasureTime(TERA_BLOCK_CACHE_PREAD_GET_BLOCK,
                                        cache_->options_.cache_env->NowMicros() - get_block_ts);
         }
-        cache_->mu_.Unlock();
         uint64_t queue_ts = cache_->options_.cache_env->NowMicros();
         cache_->stat_->MeasureTime(TERA_BLOCK_CACHE_PREAD_QUEUE, queue_ts - start_ts);
         cache_->stat_->MeasureTime(TERA_BLOCK_CACHE_PREAD_BLOCK_NR, end - begin + 1);
@@ -764,8 +768,8 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
 
         // wait async cache read done
         for (uint32_t i = 0; i < c_valid.size(); ++i) {
-            MutexLock lockgard(&cache_->mu_);
             CacheBlock* block = c_valid[i];
+            block->mu.Lock();
             block->WaitOnClear(kCacheBlockCacheRead);
             assert(block->Test(kCacheBlockValid));
             if (!block->s.ok() && s.ok()) {
@@ -773,6 +777,7 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
             }
             block->Clear(kCacheBlockLocked);
             block->cv.SignalAll();
+            block->mu.Unlock();
             //Log("[%s] cache read done, %s\n",
             //    cache_->WorkPath().c_str(),
             //    block->ToString().c_str());
@@ -782,13 +787,14 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
 
         // wait dfs read done and async cache file
         for (uint32_t i = 0; i < c_miss.size(); ++i) {
-            MutexLock lockgard(&cache_->mu_);
             CacheBlock* block = c_miss[i];
+            block->mu.Lock();
             block->WaitOnClear(kCacheBlockDfsRead);
             block->Set(kCacheBlockCacheFill);
             if (!block->s.ok() && s.ok()) {
                 s = block->s; // degrade read
             }
+            block->mu.Unlock();
             Log("[%s] dfs read done, %s\n",
                 cache_->WorkPath().c_str(),
                 block->ToString().c_str());
@@ -808,8 +814,8 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
         //uint64_t ssd_write_sched_ts = cache_->options_.cache_env->NowMicros();
 
         for (uint32_t i = 0; i < c_miss.size(); ++i) { // wait cache fill finish
-            MutexLock lockgard(&cache_->mu_);
             CacheBlock* block = c_miss[i];
+            block->mu.Lock();
             block->WaitOnClear(kCacheBlockCacheFill);
             if (block->s.ok()) {
                 block->Set(kCacheBlockValid);
@@ -818,22 +824,25 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
             }
             block->Clear(kCacheBlockLocked);
             block->cv.SignalAll();
+            block->mu.Unlock();
             //Log("[%s] cache fill done, %s\n",
             //    cache_->WorkPath().c_str(),
             //    block->ToString().c_str());
         }
-        //uint64_t ssd_write_ts = cache_->options_.cache_env->NowMicros();
+        uint64_t ssd_write_ts = cache_->options_.cache_env->NowMicros();
 
         // wait other async read finish
         for (uint32_t i = 0; i < c_locked.size(); ++i) {
-            MutexLock lockgard(&cache_->mu_);
             CacheBlock* block = c_locked[i];
+            block->mu.Lock();
             block->WaitOnClear(kCacheBlockLocked);
+            block->mu.Unlock();
             //Log("[%s] wait locked done, %s\n",
             //    cache_->WorkPath().c_str(),
             //    block->ToString().c_str());
         }
         uint64_t wait_unlock_ts = cache_->options_.cache_env->NowMicros();
+        cache_->stat_->MeasureTime(TERA_BLOCK_CACHE_PREAD_WAIT_UNLOCK, wait_unlock_ts - ssd_write_ts);
 
         // fill user mem
         size_t msize = 0;
@@ -862,7 +871,6 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
         uint64_t fill_user_data_ts = cache_->options_.cache_env->NowMicros();
         cache_->stat_->MeasureTime(TERA_BLOCK_CACHE_PREAD_FILL_USER_DATA, fill_user_data_ts - wait_unlock_ts);
 
-        cache_->mu_.Lock();
         for (uint32_t i = 0; i < c_miss.size(); ++i) {
             CacheBlock* block = c_miss[i];
             //Log("[%s] wakeup for miss, %s\n", cache_->WorkPath().c_str(), block->ToString().c_str());
@@ -923,9 +931,10 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
                 block->s.ToString().c_str(), result.size());
         }
 
-        MutexLock lockgard(&cache_->mu_);
+        block->mu.Lock();
         block->Clear(kCacheBlockDfsRead);
         block->cv.SignalAll();
+        block->mu.Unlock();
         return;
     }
 
@@ -947,9 +956,10 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
         CacheBlock* block = reader->block;
         block->s = cache_->ReadCache(block, NULL);
 
-        MutexLock lockgard(&cache_->mu_);
+        block->mu.Lock();
         block->Clear(kCacheBlockCacheRead);
         block->cv.SignalAll();
+        block->mu.Unlock();
         //Log("[%s] async.cacheread signal, %s\n", cache_->WorkPath().c_str(),
         //    block->ToString().c_str());
     }
@@ -966,14 +976,15 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
         //while (aio_error(reader->aio_context) == EINPROGRESS);
         ssize_t res = aio_return(&reader->aio_context);
         block->s = res < 0? Status::Corruption("AioReadCache error") : Status::OK();
-
-        MutexLock lockgard(&cache_->mu_);
-        block->Clear(kCacheBlockCacheRead);
-        block->cv.SignalAll();
         if (!block->s.ok()) {
             Log("[%s] aio.cacheread signal, %s\n", cache_->WorkPath().c_str(),
                 block->ToString().c_str());
         }
+
+        block->mu.Lock();
+        block->Clear(kCacheBlockCacheRead);
+        block->cv.SignalAll();
+        block->mu.Unlock();
     }
     void AioCacheRead(AsyncCacheReader* reader) const {
         // setup sigevent
@@ -1006,9 +1017,10 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
             block->s = cache_->FillCache(block);
         }
 
-        MutexLock lockgard(&cache_->mu_);
+        block->mu.Lock();
         block->Clear(kCacheBlockCacheFill);
         block->cv.SignalAll();
+        block->mu.Unlock();
         return;
     }
 
@@ -1043,16 +1055,27 @@ void BlockCacheImpl::BGControlThreadFunc(void* arg) {
 }
 
 void BlockCacheImpl::BGControlThread() {
-    Log("[%s] statistics: %s, %s, %s, %s, %s, %s, %s\n", this->WorkPath().c_str(),
+    Log("[%s] statistics: "
+        "%s, %s, %s, %s, %s, "
+        "%s, %s, %s, %s, %s, "
+        "%s, %s\n",
+        this->WorkPath().c_str(),
         stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_QUEUE).c_str(),
         stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_SSD_READ).c_str(),
         stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_FILL_USER_DATA).c_str(),
         stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_RELEASE_BLOCK).c_str(),
         stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_LOCKMAP_DS_RELOAD_NR).c_str(),
+
         stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_GET_BLOCK).c_str(),
-        stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_BLOCK_NR).c_str());
+        stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_BLOCK_NR).c_str(),
+        stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_GET_DS).c_str(),
+        stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_DS_LRU_LOOKUP).c_str(),
+        stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_WAIT_UNLOCK).c_str(),
+
+        stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_ALLOC_FID).c_str(),
+        stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_GET_FID).c_str());
 
-    // resched after 1s
+    // resched after 6s
     stat_->ClearHistogram(TERA_BLOCK_CACHE_PREAD_QUEUE);
     stat_->ClearHistogram(TERA_BLOCK_CACHE_PREAD_SSD_READ);
     stat_->ClearHistogram(TERA_BLOCK_CACHE_PREAD_FILL_USER_DATA);
@@ -1060,6 +1083,11 @@ void BlockCacheImpl::BGControlThread() {
     stat_->ClearHistogram(TERA_BLOCK_CACHE_LOCKMAP_DS_RELOAD_NR);
     stat_->ClearHistogram(TERA_BLOCK_CACHE_PREAD_GET_BLOCK);
     stat_->ClearHistogram(TERA_BLOCK_CACHE_PREAD_BLOCK_NR);
+    stat_->ClearHistogram(TERA_BLOCK_CACHE_GET_DS);
+    stat_->ClearHistogram(TERA_BLOCK_CACHE_DS_LRU_LOOKUP);
+    stat_->ClearHistogram(TERA_BLOCK_CACHE_PREAD_WAIT_UNLOCK);
+    stat_->ClearHistogram(TERA_BLOCK_CACHE_ALLOC_FID);
+    stat_->ClearHistogram(TERA_BLOCK_CACHE_GET_FID);
     bg_control_.Schedule(&BlockCacheImpl::BGControlThreadFunc, this, 10, 6000);
 }
 
@@ -1198,7 +1226,7 @@ Status BlockCacheImpl::LockAndPut(LockContent& lc) {
                 }
                 total_items++;
 
-                CacheBlock* block = new CacheBlock(&mu_);
+                CacheBlock* block = new CacheBlock;
                 block->DecodeFrom(db_it->value()); // get fid and block_idx
                 std::string hkey;
                 PutFixed64(&hkey, block->fid);
@@ -1282,7 +1310,7 @@ Status BlockCacheImpl::LoadCache() {
 }
 
 Status BlockCacheImpl::FillCache(CacheBlock* block) {
-    MutexLock l(&mu_);
+    mu_.Lock();
     uint64_t sid = block->sid;
     uint64_t cache_block_idx = block->cache_block_idx;
     int fd = (data_set_map_[sid])->fd;
@@ -1297,7 +1325,6 @@ Status BlockCacheImpl::FillCache(CacheBlock* block) {
         block->ToString().c_str(),
         res);
 
-    mu_.Lock();
     if (res < 0) {
         return Status::Corruption("FillCache error");
     }
@@ -1305,7 +1332,7 @@ Status BlockCacheImpl::FillCache(CacheBlock* block) {
 }
 
 Status BlockCacheImpl::ReadCache(CacheBlock* block, struct aiocb* aio_context) {
-    MutexLock l(&mu_);
+    mu_.Lock();
     uint64_t sid = block->sid;
     uint64_t cache_block_idx = block->cache_block_idx;
     int fd = (data_set_map_[sid])->fd;
@@ -1330,10 +1357,6 @@ Status BlockCacheImpl::ReadCache(CacheBlock* block, struct aiocb* aio_context) {
             cache_block_idx,
             block->ToString().c_str(),
             res);
-    }
-
-    mu_.Lock();
-    if (res < 0) {
         return Status::Corruption("ReadCache error");
     }
     return Status::OK();
@@ -1341,6 +1364,7 @@ Status BlockCacheImpl::ReadCache(CacheBlock* block, struct aiocb* aio_context) {
 
 uint64_t BlockCacheImpl::AllocFileId() { // no more than fid_batch_num
     mu_.AssertHeld();
+    uint64_t start_ts = options_.cache_env->NowMicros();
     uint64_t fid = ++new_fid_;
     while (new_fid_ - prev_fid_ >= options_.fid_batch_num) {
         std::string key = "FID#";
@@ -1363,17 +1387,18 @@ uint64_t BlockCacheImpl::AllocFileId() { // no more than fid_batch_num
             new_fid_,
             prev_fid_);
     }
+    stat_->MeasureTime(TERA_BLOCK_CACHE_ALLOC_FID,
+                       options_.cache_env->NowMicros() - start_ts);
     return fid;
 }
 
 uint64_t BlockCacheImpl::FileId(const std::string& fname) {
-    mu_.AssertHeld();
     uint64_t fid = 0;
     std::string key = "FNAME#" + fname;
-    mu_.Unlock();
-
+    uint64_t start_ts = options_.cache_env->NowMicros();
     ReadOptions r_opts;
     std::string val;
+
     Status s = db_->Get(r_opts, key, &val);
     if (!s.ok()) { // not exist
         MutexLock l(&mu_);
@@ -1395,11 +1420,12 @@ uint64_t BlockCacheImpl::FileId(const std::string& fname) {
     } else { // fid in cache
         fid = DecodeFixed64(val.c_str());
     }
+
     //Log("[%s] Fid: %lu, fname: %s\n",
     //    this->WorkPath().c_str(),
     //    fid, fname.c_str());
-
-    mu_.Lock();
+    stat_->MeasureTime(TERA_BLOCK_CACHE_GET_FID,
+                       options_.cache_env->NowMicros() - start_ts);
     return fid;
 }
 
@@ -1421,9 +1447,10 @@ Status BlockCacheImpl::DeleteFile(const std::string& fname) {
 }
 
 DataSet* BlockCacheImpl::GetDataSet(uint64_t sid) {
-    mu_.AssertHeld();
     DataSet* set = NULL;
+    uint64_t start_ts = options_.cache_env->NowMicros();
 
+    MutexLock l(&mu_);
     DataSetMap::iterator it = data_set_map_.find(sid);
     if (it == data_set_map_.end()) {
         LockContent lc;
@@ -1437,11 +1464,12 @@ DataSet* BlockCacheImpl::GetDataSet(uint64_t sid) {
         //    this->WorkPath().c_str(), sid);
         set = it->second;
     }
+    stat_->MeasureTime(TERA_BLOCK_CACHE_GET_DS,
+                       options_.cache_env->NowMicros() - start_ts);
     return set;
 }
 
 CacheBlock* BlockCacheImpl::GetAndAllocBlock(uint64_t fid, uint64_t block_idx) {
-    mu_.AssertHeld();
     std::string key;
     PutFixed64(&key, fid);
     PutFixed64(&key, block_idx);
@@ -1453,14 +1481,14 @@ CacheBlock* BlockCacheImpl::GetAndAllocBlock(uint64_t fid, uint64_t block_idx) {
     CacheBlock* block = NULL;
     DataSet* ds = GetDataSet(sid); // get and alloc ds
     Cache* cache = ds->cache;
-    mu_.Unlock();
 
+    uint64_t start_ts = options_.cache_env->NowMicros();
     LRUHandle* h = (LRUHandle*)cache->Lookup(key);
     if (h == NULL) {
-        mu_.Lock();
+        MutexLock l(&mu_);
         h = (LRUHandle*)cache->Lookup(key);
         if (h == NULL) {
-            block = new CacheBlock(&mu_);
+            block = new CacheBlock;
             h = (LRUHandle*)cache->Insert(key, block, 1, &BlockCacheImpl::BlockDeleter);
             if (h == NULL) {
                 delete block;
@@ -1479,11 +1507,12 @@ CacheBlock* BlockCacheImpl::GetAndAllocBlock(uint64_t fid, uint64_t block_idx) {
             block = reinterpret_cast<CacheBlock*>(cache->Value((Cache::Handle*)h));
         }
     } else {
-        mu_.Lock();
         block = reinterpret_cast<CacheBlock*>(cache->Value((Cache::Handle*)h));
         //Log("[%s] get block from memcache, %s\n",
         //        this->WorkPath().c_str(), block->ToString().c_str());
     }
+    stat_->MeasureTime(TERA_BLOCK_CACHE_DS_LRU_LOOKUP,
+                       options_.cache_env->NowMicros() - start_ts);
     return block;
 }
 
@@ -1497,22 +1526,20 @@ Status BlockCacheImpl::LogRecord(CacheBlock* block) {
 }
 
 Status BlockCacheImpl::ReleaseBlock(CacheBlock* block, bool need_sync) {
-    mu_.AssertHeld();
     Status s;
-
-    mu_.Unlock();
-    if (need_sync) {
-        // TODO: dump meta into memtable
+    if (need_sync) { // TODO: dump meta into memtable
         s = LogRecord(block);
     }
 
-    mu_.Lock();
-    LRUHandle* h = block->handle;
-    DataSet* ds = GetDataSet(block->sid); // get and alloc ds
+    block->mu.Lock();
     block->ReleaseDataBlock();
-    //Log("[%s] release block: %s\n", this->WorkPath().c_str(), block->ToString().c_str());
     block->s = Status::OK(); // clear io status
     block->cv.SignalAll();
+    block->mu.Unlock();
+
+    //Log("[%s] release block: %s\n", this->WorkPath().c_str(), block->ToString().c_str());
+    LRUHandle* h = block->handle;
+    DataSet* ds = GetDataSet(block->sid); // get and alloc ds
     ds->cache->Release((Cache::Handle*)h);
     return s;
 }
diff --git a/src/leveldb/util/statistics.cc b/src/leveldb/util/statistics.cc
index 74352ae3a..130b06311 100644
--- a/src/leveldb/util/statistics.cc
+++ b/src/leveldb/util/statistics.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 

From ae0298a8631c58c22a938decf9b25763f9dcc15a Mon Sep 17 00:00:00 2001
From: caijieming <caijieming@baidu.com>
Date: Fri, 25 Aug 2017 13:52:07 +0800
Subject: [PATCH 13/19] issue=1258, Tcache support block-level cache evict

---
 src/leveldb/util/block_cache.cc | 53 +++++++++++++--------------------
 src/leveldb/util/cache.cc       | 25 ++++++++++------
 2 files changed, 37 insertions(+), 41 deletions(-)

diff --git a/src/leveldb/util/block_cache.cc b/src/leveldb/util/block_cache.cc
index 797088f84..1cd0ed54e 100644
--- a/src/leveldb/util/block_cache.cc
+++ b/src/leveldb/util/block_cache.cc
@@ -35,7 +35,7 @@
 namespace leveldb {
 
 /////////////////////////////////////////////
-// Tcache
+// t-cache impl
 /////////////////////////////////////////////
 uint64_t kBlockSize = 4096UL;
 uint64_t kDataSetSize = 128UL << 20;
@@ -1239,6 +1239,7 @@ Status BlockCacheImpl::LockAndPut(LockContent& lc) {
                 //    lc.KeyToString().c_str(),
                 //    block->ToString().c_str());
                 LRUHandle* handle = (LRUHandle*)(lc.data_set->cache->Insert(hkey, block, 1, &BlockCacheImpl::BlockDeleter));
+                assert((uint64_t)(lc.data_set->cache->Value((Cache::Handle*)handle)) == (uint64_t)block);
                 handle->cache_id = block->cache_block_idx;
                 block->handle = handle;
                 lc.data_set->cache->Release((Cache::Handle*)handle);
@@ -1381,11 +1382,11 @@ uint64_t BlockCacheImpl::AllocFileId() { // no more than fid_batch_num
         if (s.ok()) {
             prev_fid_ = DecodeFixed64(val.c_str());
         }
-        Log("[%s] alloc fid: key %s, new_fid: %lu, prev_fid: %lu\n",
-            this->WorkPath().c_str(),
-            key.c_str(),
-            new_fid_,
-            prev_fid_);
+        //Log("[%s] alloc fid: key %s, new_fid: %lu, prev_fid: %lu\n",
+        //    this->WorkPath().c_str(),
+        //    key.c_str(),
+        //    new_fid_,
+        //    prev_fid_);
     }
     stat_->MeasureTime(TERA_BLOCK_CACHE_ALLOC_FID,
                        options_.cache_env->NowMicros() - start_ts);
@@ -1483,33 +1484,21 @@ CacheBlock* BlockCacheImpl::GetAndAllocBlock(uint64_t fid, uint64_t block_idx) {
     Cache* cache = ds->cache;
 
     uint64_t start_ts = options_.cache_env->NowMicros();
-    LRUHandle* h = (LRUHandle*)cache->Lookup(key);
-    if (h == NULL) {
-        MutexLock l(&mu_);
-        h = (LRUHandle*)cache->Lookup(key);
-        if (h == NULL) {
-            block = new CacheBlock;
-            h = (LRUHandle*)cache->Insert(key, block, 1, &BlockCacheImpl::BlockDeleter);
-            if (h == NULL) {
-                delete block;
-                return NULL;
-            }
-            block->fid = fid;
-            block->block_idx = block_idx;
-            block->sid = sid;
-            block->cache_block_idx = h->cache_id;
-            block->handle = h;
-            Log("[%s] Alloc Block: %s, sid %lu, fid %lu, block_idx %lu, hash %u, dataset_nr %lu\n",
-                    this->WorkPath().c_str(),
-                    block->ToString().c_str(),
-                    sid, fid, block_idx, hash, options_.dataset_num);
-        } else {
-            block = reinterpret_cast<CacheBlock*>(cache->Value((Cache::Handle*)h));
-        }
+    block = new CacheBlock;
+    LRUHandle* h = (LRUHandle*)cache->Insert(key, block, 1, &BlockCacheImpl::BlockDeleter);
+    if (h != NULL && ((uint64_t)(cache->Value((Cache::Handle*)h)) == (uint64_t)block)) {
+        block->fid = fid;
+        block->block_idx = block_idx;
+        block->sid = sid;
+        block->cache_block_idx = h->cache_id;
+        block->handle = h;
+        //Log("[%s] Alloc Block: %s, sid %lu, fid %lu, block_idx %lu, hash %u, dataset_nr %lu\n",
+        //    this->WorkPath().c_str(),
+        //    block->ToString().c_str(),
+        //    sid, fid, block_idx, hash, options_.dataset_num);
     } else {
-        block = reinterpret_cast<CacheBlock*>(cache->Value((Cache::Handle*)h));
-        //Log("[%s] get block from memcache, %s\n",
-        //        this->WorkPath().c_str(), block->ToString().c_str());
+        delete block;
+        block = (h == NULL) ? NULL: reinterpret_cast<CacheBlock*>(cache->Value((Cache::Handle*)h));
     }
     stat_->MeasureTime(TERA_BLOCK_CACHE_DS_LRU_LOOKUP,
                        options_.cache_env->NowMicros() - start_ts);
diff --git a/src/leveldb/util/cache.cc b/src/leveldb/util/cache.cc
index 97e070bf3..2595e7879 100644
--- a/src/leveldb/util/cache.cc
+++ b/src/leveldb/util/cache.cc
@@ -274,13 +274,16 @@ class LRU2QCache: public Cache {
   ~LRU2QCache() {}
 
   // Like Cache methods, but with an extra "hash" parameter.
+  // Note: insert-if-absent. If the key already exists, the existing handle is returned (with an extra reference) and `value` is not stored.
   Cache::Handle* Insert(const Slice& key, void* value, size_t cache_id,
                         void (*deleter)(const Slice& key, void* value)) {
     const uint32_t hash = HashSlice(key);
     MutexLock l(&mutex_);
     LRUHandle* e = NULL;
-    e = table_.Lookup(key, hash);
-    assert(e == NULL);
+    e = (LRUHandle*)DoLookup(key, hash);
+    if (e != NULL) {
+        return reinterpret_cast<Cache::Handle*>(e);
+    }
 
     if (usage_ < capacity_) { // cache not full
       e = reinterpret_cast<LRUHandle*>(
@@ -332,13 +335,7 @@ class LRU2QCache: public Cache {
   Cache::Handle* Lookup(const Slice& key) {
     const uint32_t hash = HashSlice(key);
     MutexLock l(&mutex_);
-    LRUHandle* e = table_.Lookup(key, hash);
-    if (e != NULL) {
-        e->refs++;
-        LRU_Remove(e);
-        LRU_Append(e);
-    }
-    return reinterpret_cast<Cache::Handle*>(e);
+    return DoLookup(key, hash);
   }
 
   void Erase(const Slice& key) {
@@ -379,6 +376,16 @@ class LRU2QCache: public Cache {
   }
 
  private:
+  Cache::Handle* DoLookup(const Slice& key, uint32_t hash) { // lookup shared by Insert() and Lookup(); both take mutex_ before calling
+    LRUHandle* e = table_.Lookup(key, hash);
+    if (e != NULL) {
+        e->refs++; // hit: one more reference, held by the handle returned below
+        LRU_Remove(e); // unlink and re-append; presumably moves e to the newest end of lru_ (LRU_Append body not shown here)
+        LRU_Append(e);
+    }
+    return reinterpret_cast<Cache::Handle*>(e); // NULL when the key is not in table_
+  }
+
   void LRU_Remove(LRUHandle* e) {
     e->next->prev = e->prev;
     e->prev->next = e->next;

From 415d02651ddd71c03da7298f43c4730f2f71d08d Mon Sep 17 00:00:00 2001
From: caijieming <caijieming@baidu.com>
Date: Mon, 28 Aug 2017 19:07:39 +0800
Subject: [PATCH 14/19] issue=1258, Tcache support block-level cache evict

---
 src/leveldb/include/leveldb/block_cache.h |   3 +
 src/leveldb/include/leveldb/statistics.h  |   2 +
 src/leveldb/util/block_cache.cc           | 165 ++++++++++++++--------
 src/leveldb/util/cache.cc                 |   8 +-
 4 files changed, 119 insertions(+), 59 deletions(-)

diff --git a/src/leveldb/include/leveldb/block_cache.h b/src/leveldb/include/leveldb/block_cache.h
index e331e05f4..7c2d2b965 100644
--- a/src/leveldb/include/leveldb/block_cache.h
+++ b/src/leveldb/include/leveldb/block_cache.h
@@ -84,6 +84,9 @@ class BlockCacheEnv : public EnvWrapper {
     // cache relatively
     virtual Status NewRandomAccessFile(const std::string& fname,
                                        RandomAccessFile** result); // cache Pread
+    virtual Status NewRandomAccessFile(const std::string& fname,
+                                       uint64_t fsize,
+                                       RandomAccessFile** result); // cache Pread
 
     virtual Status NewWritableFile(const std::string& fname,
                                    WritableFile** result); // cache Append
diff --git a/src/leveldb/include/leveldb/statistics.h b/src/leveldb/include/leveldb/statistics.h
index 62916892f..ed2898996 100644
--- a/src/leveldb/include/leveldb/statistics.h
+++ b/src/leveldb/include/leveldb/statistics.h
@@ -48,6 +48,7 @@ enum Histograms : uint32_t {
   TERA_BLOCK_CACHE_PREAD_WAIT_UNLOCK,
   TERA_BLOCK_CACHE_ALLOC_FID,
   TERA_BLOCK_CACHE_GET_FID,
+  TERA_BLOCK_CACHE_EVICT_NR,
   HISTOGRAM_ENUM_MAX,  // TODO(ldemailly): enforce HistogramsNameMap match
 };
 
@@ -64,6 +65,7 @@ const std::vector<std::pair<Histograms, std::string> > HistogramsNameMap = {
     {TERA_BLOCK_CACHE_PREAD_WAIT_UNLOCK, "tera.block_cache.pread_wait_unlock"},
     {TERA_BLOCK_CACHE_ALLOC_FID, "tera.block_cache.alloc_fid"},
     {TERA_BLOCK_CACHE_GET_FID, "tera.block_cache.get_fid"},
+    {TERA_BLOCK_CACHE_EVICT_NR, "tera.block_cache.evict_nr"},
 };
 
 struct HistogramData {
diff --git a/src/leveldb/util/block_cache.cc b/src/leveldb/util/block_cache.cc
index 1cd0ed54e..2d664b681 100644
--- a/src/leveldb/util/block_cache.cc
+++ b/src/leveldb/util/block_cache.cc
@@ -15,6 +15,8 @@
 #include <list>
 #include <sstream>
 
+#include "../utils/counter.h"
+
 #include "db/table_cache.h"
 #include "leveldb/db.h"
 #include "leveldb/cache.h"
@@ -34,6 +36,8 @@
 
 namespace leveldb {
 
+::tera::Counter tera_block_cache_evict_counter;
+
 /////////////////////////////////////////////
 // t-cache impl
 /////////////////////////////////////////////
@@ -160,6 +164,7 @@ struct CacheBlock {
 };
 
 struct DataSet {
+    port::Mutex mu;
     Cache* cache;
     int fd;
 };
@@ -178,6 +183,7 @@ class BlockCacheImpl {
                            WritableFile** result);
 
     Status NewRandomAccessFile(const std::string& fname,
+                               uint64_t fsize,
                                RandomAccessFile** result); // cache Pread
 
     static void BlockDeleter(const Slice& key, void* v);
@@ -363,7 +369,6 @@ Status BlockCacheEnv::LoadCache(const BlockCacheOptions& opts, const std::string
     options.cache_env = this->target();
     BlockCacheImpl* cache = new BlockCacheImpl(options);
     Status s = cache->LoadCache();
-    assert(s.ok());
     cache_vec_.push_back(cache); // no need lock
     return s;
 }
@@ -383,18 +388,37 @@ Status BlockCacheEnv::NewWritableFile(const std::string& fname,
     uint32_t hash = (Hash(fname.c_str(), fname.size(), 13)) % cache_vec_.size();
     BlockCacheImpl* cache = cache_vec_[hash];
     Status s = cache->NewWritableFile(fname, result);
-    Log("[block_cache %s] open file write: %s, hash: %u, status: %s\n",
-        cache->WorkPath().c_str(), fname.c_str(), hash, s.ToString().c_str());
+    if (!s.ok()) {
+        Log("[block_cache %s] open file write fail: %s, hash: %u, status: %s\n",
+             cache->WorkPath().c_str(), fname.c_str(), hash, s.ToString().c_str());
+    }
     return s;
 }
 
 Status BlockCacheEnv::NewRandomAccessFile(const std::string& fname,
                                           RandomAccessFile** result) {
+    //uint32_t hash = (Hash(fname.c_str(), fname.size(), 13)) % cache_vec_.size();
+    //BlockCacheImpl* cache = cache_vec_[hash];
+    //Status s = cache->NewRandomAccessFile(fname, result);
+    //if (!s.ok()) {
+    //    Log("[block_cache %s] open file read fail: %s, hash: %u, status: %s\n",
+    //         cache->WorkPath().c_str(), fname.c_str(), hash, s.ToString().c_str());
+    //}
+    //return s;
+    abort();
+    return Status::OK();
+}
+
+Status BlockCacheEnv::NewRandomAccessFile(const std::string& fname,
+                                          uint64_t fsize,
+                                          RandomAccessFile** result) {
     uint32_t hash = (Hash(fname.c_str(), fname.size(), 13)) % cache_vec_.size();
     BlockCacheImpl* cache = cache_vec_[hash];
-    Status s = cache->NewRandomAccessFile(fname, result);
-    //Log("[block_cache %s] open file read: %s, hash: %u, status: %s\n",
-    //    cache->WorkPath().c_str(), fname.c_str(), hash, s.ToString().c_str());
+    Status s = cache->NewRandomAccessFile(fname, fsize, result);
+    if (!s.ok()) {
+        Log("[block_cache %s] open file read fail: %s, hash: %u, status: %s, fsize %lu\n",
+             cache->WorkPath().c_str(), fname.c_str(), hash, s.ToString().c_str(), fsize);
+    }
     return s;
 }
 
@@ -450,19 +474,19 @@ class BlockCacheWriteBuffer {
                 } else { // last block
                     tmp_storage_->append(buf.data(), buf.size());
                 }
-                Log("[%s] add tmp_storage %s: offset: %lu, buf_size: %lu\n",
-                    path_.c_str(),
-                    file_.c_str(),
-                    offset_,
-                    buf.size());
+                //Log("[%s] add tmp_storage %s: offset: %lu, buf_size: %lu\n",
+                //    path_.c_str(),
+                //    file_.c_str(),
+                //    offset_,
+                //    buf.size());
             }
         }
         offset_ += data.size();
-        Log("[%s] add record: %s, begin: %u, end: %u, offset: %lu, data_size: %lu, block_size: %u\n",
-            path_.c_str(),
-            file_.c_str(),
-            begin, end,
-            offset_ - data.size() , data.size(), block_size_);
+        //Log("[%s] add record: %s, begin: %u, end: %u, offset: %lu, data_size: %lu, block_size: %u\n",
+        //    path_.c_str(),
+        //    file_.c_str(),
+        //    begin, end,
+        //    offset_ - data.size() , data.size(), block_size_);
         return Status::OK();
     }
 
@@ -518,12 +542,14 @@ class BlockCacheWritableFile : public WritableFile {
           write_buffer_(cache_->WorkPath(), fname, cache_->options_.block_size),
           fname_(fname) { // file open
         *s = cache_->dfs_env_->NewWritableFile(fname_, &dfs_file_);
-        Log("[%s] dfs open: %s, block_size: %lu, status: %s\n",
-            cache_->WorkPath().c_str(),
-            fname.c_str(),
-            cache_->options_.block_size,
-            s->ToString().c_str());
-
+        if (!s->ok()) {
+            Log("[%s] dfs open: %s, block_size: %lu, status: %s\n",
+                 cache_->WorkPath().c_str(),
+                 fname.c_str(),
+                 cache_->options_.block_size,
+                 s->ToString().c_str());
+        }
+        bg_status_ = *s;
         fid_ = cache_->FileId(fname_);
         return;
     }
@@ -543,11 +569,11 @@ class BlockCacheWritableFile : public WritableFile {
 
         MutexLock lockgard(&mu_);
         MaybeScheduleBGFlush();
-        return Status::OK();
+        return s;
     }
 
     Status Close() {
-        Status s;
+        Status s, s1;
         if (dfs_file_ != NULL) {
             s = dfs_file_->Close();
             delete dfs_file_;
@@ -557,25 +583,28 @@ class BlockCacheWritableFile : public WritableFile {
         uint64_t block_idx;
         std::string* block_data = write_buffer_.PopBackBlock(&block_idx);
         if (block_data != NULL) {
-            FillCache(block_data, block_idx);
+            s1 = FillCache(block_data, block_idx);
         }
 
         MutexLock lockgard(&mu_);
         while (bg_block_flush_ > 0) {
             bg_cv_.Wait();
         }
+        if (bg_status_.ok()) {
+            bg_status_ = s.ok() ? s1: s;
+        }
         //Log("[%s] end close %s, status %s\n", cache_->WorkPath().c_str(), fname_.c_str(),
         //    s.ToString().c_str());
-        return s;
+        return bg_status_;
     }
 
     Status Flush() {
-        Log("[%s] dfs flush: %s\n", cache_->WorkPath().c_str(), fname_.c_str());
+        //Log("[%s] dfs flush: %s\n", cache_->WorkPath().c_str(), fname_.c_str());
         return dfs_file_->Flush();
     }
 
     Status Sync() {
-        Log("[%s] dfs sync: %s\n", cache_->WorkPath().c_str(), fname_.c_str());
+        //Log("[%s] dfs sync: %s\n", cache_->WorkPath().c_str(), fname_.c_str());
         return dfs_file_->Sync();
     }
 
@@ -597,7 +626,8 @@ class BlockCacheWritableFile : public WritableFile {
         reinterpret_cast<BlockCacheWritableFile*>(arg)->BGFlush();
     }
     void BGFlush() {
-        Log("[%s] Begin BGFlush: %s\n", cache_->WorkPath().c_str(), fname_.c_str());
+        //Log("[%s] Begin BGFlush: %s\n", cache_->WorkPath().c_str(), fname_.c_str());
+        Status s;
         MutexLock lockgard(&mu_);
         uint64_t block_idx;
         std::string* block_data = write_buffer_.PopFrontBlock(&block_idx);
@@ -605,11 +635,12 @@ class BlockCacheWritableFile : public WritableFile {
             pending_block_num_++;
             mu_.Unlock();
 
-            FillCache(block_data, block_idx);
+            s = FillCache(block_data, block_idx);
             mu_.Lock();
             pending_block_num_--;
         }
 
+        bg_status_ = bg_status_.ok() ? s: bg_status_;
         bg_block_flush_--;
         MaybeScheduleBGFlush();
         bg_cv_.Signal();
@@ -651,6 +682,7 @@ class BlockCacheWritableFile : public WritableFile {
     //port::AtomicPointer shutting_down_;
     port::Mutex mu_;
     port::CondVar bg_cv_;          // Signalled when background work finishes
+    Status bg_status_;
     WritableFile* dfs_file_;
     // protected by cache_.mu_
     uint32_t bg_block_flush_;
@@ -662,9 +694,11 @@ class BlockCacheWritableFile : public WritableFile {
 
 class BlockCacheRandomAccessFile : public RandomAccessFile {
 public:
-    BlockCacheRandomAccessFile(BlockCacheImpl* c, const std::string& fname, Status* s)
+    BlockCacheRandomAccessFile(BlockCacheImpl* c, const std::string& fname,
+                               uint64_t fsize, Status* s)
     : cache_(c),
-      fname_(fname) {
+      fname_(fname),
+      fsize_(fsize) {
         *s = cache_->dfs_env_->NewRandomAccessFile(fname_, &dfs_file_);
         //Log("[%s] dfs open for read: %s, block_size: %lu, status: %s\n",
         //    cache_->WorkPath().c_str(),
@@ -795,9 +829,9 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
                 s = block->s; // degrade read
             }
             block->mu.Unlock();
-            Log("[%s] dfs read done, %s\n",
-                cache_->WorkPath().c_str(),
-                block->ToString().c_str());
+            //Log("[%s] dfs read done, %s\n",
+            //    cache_->WorkPath().c_str(),
+            //    block->ToString().c_str());
         }
         //uint64_t dfs_read_ts = cache_->options_.cache_env->NowMicros();
 
@@ -1029,6 +1063,7 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
     RandomAccessFile* dfs_file_;
     std::string fname_;
     uint64_t fid_;
+    uint64_t fsize_;
     bool aio_enabled_;
 };
 
@@ -1055,10 +1090,13 @@ void BlockCacheImpl::BGControlThreadFunc(void* arg) {
 }
 
 void BlockCacheImpl::BGControlThread() {
+    stat_->MeasureTime(TERA_BLOCK_CACHE_EVICT_NR,
+                       tera_block_cache_evict_counter.Clear());
+
     Log("[%s] statistics: "
         "%s, %s, %s, %s, %s, "
         "%s, %s, %s, %s, %s, "
-        "%s, %s\n",
+        "%s, %s, %s\n",
         this->WorkPath().c_str(),
         stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_QUEUE).c_str(),
         stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_SSD_READ).c_str(),
@@ -1073,7 +1111,8 @@ void BlockCacheImpl::BGControlThread() {
         stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_WAIT_UNLOCK).c_str(),
 
         stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_ALLOC_FID).c_str(),
-        stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_GET_FID).c_str());
+        stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_GET_FID).c_str(),
+        stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_EVICT_NR).c_str());
 
     // resched after 6s
     stat_->ClearHistogram(TERA_BLOCK_CACHE_PREAD_QUEUE);
@@ -1088,6 +1127,7 @@ void BlockCacheImpl::BGControlThread() {
     stat_->ClearHistogram(TERA_BLOCK_CACHE_PREAD_WAIT_UNLOCK);
     stat_->ClearHistogram(TERA_BLOCK_CACHE_ALLOC_FID);
     stat_->ClearHistogram(TERA_BLOCK_CACHE_GET_FID);
+    stat_->ClearHistogram(TERA_BLOCK_CACHE_EVICT_NR);
     bg_control_.Schedule(&BlockCacheImpl::BGControlThreadFunc, this, 10, 6000);
 }
 
@@ -1103,9 +1143,10 @@ Status BlockCacheImpl::NewWritableFile(const std::string& fname,
 }
 
 Status BlockCacheImpl::NewRandomAccessFile(const std::string& fname,
+                                           uint64_t fsize,
                                            RandomAccessFile** result) {
     Status s;
-    BlockCacheRandomAccessFile* file = new BlockCacheRandomAccessFile(this, fname, &s);
+    BlockCacheRandomAccessFile* file = new BlockCacheRandomAccessFile(this, fname, fsize, &s);
     *result = NULL;
     if (s.ok()) {
         *result = (RandomAccessFile*)file;
@@ -1115,8 +1156,9 @@ Status BlockCacheImpl::NewRandomAccessFile(const std::string& fname,
 
 void BlockCacheImpl::BlockDeleter(const Slice& key, void* v) {
     CacheBlock* block = (CacheBlock*)v;
-    Log("Evict blockcache: %s\n", block->ToString().c_str());
+    //Log("Evict blockcache: %s\n", block->ToString().c_str());
     delete block;
+    tera_block_cache_evict_counter.Inc();
     return;
 }
 
@@ -1238,9 +1280,9 @@ Status BlockCacheImpl::LockAndPut(LockContent& lc) {
                 //    this->WorkPath().c_str(),
                 //    lc.KeyToString().c_str(),
                 //    block->ToString().c_str());
-                LRUHandle* handle = (LRUHandle*)(lc.data_set->cache->Insert(hkey, block, 1, &BlockCacheImpl::BlockDeleter));
+                LRUHandle* handle = (LRUHandle*)(lc.data_set->cache->Insert(hkey, block, cbi, &BlockCacheImpl::BlockDeleter));
                 assert((uint64_t)(lc.data_set->cache->Value((Cache::Handle*)handle)) == (uint64_t)block);
-                handle->cache_id = block->cache_block_idx;
+                assert(handle->cache_id == block->cache_block_idx);
                 block->handle = handle;
                 lc.data_set->cache->Release((Cache::Handle*)handle);
             }
@@ -1320,11 +1362,11 @@ Status BlockCacheImpl::FillCache(CacheBlock* block) {
     // do io without lock
     ssize_t res = pwrite(fd, block->data_block.data(), block->data_block.size(),
                          cache_block_idx * options_.block_size);
-    Log("[%s] cache fill: sid %lu, dataset.fd %d, datablock size %lu, cb_idx %lu, %s, res %ld\n",
-        this->WorkPath().c_str(), sid, fd, block->data_block.size(),
-        cache_block_idx,
-        block->ToString().c_str(),
-        res);
+    //Log("[%s] cache fill: sid %lu, dataset.fd %d, datablock size %lu, cb_idx %lu, %s, res %ld\n",
+    //    this->WorkPath().c_str(), sid, fd, block->data_block.size(),
+    //    cache_block_idx,
+    //    block->ToString().c_str(),
+    //    res);
 
     if (res < 0) {
         return Status::Corruption("FillCache error");
@@ -1484,22 +1526,31 @@ CacheBlock* BlockCacheImpl::GetAndAllocBlock(uint64_t fid, uint64_t block_idx) {
     Cache* cache = ds->cache;
 
     uint64_t start_ts = options_.cache_env->NowMicros();
-    block = new CacheBlock;
-    LRUHandle* h = (LRUHandle*)cache->Insert(key, block, 1, &BlockCacheImpl::BlockDeleter);
-    if (h != NULL && ((uint64_t)(cache->Value((Cache::Handle*)h)) == (uint64_t)block)) {
+    ds->mu.Lock();
+    LRUHandle* h = (LRUHandle*)cache->Lookup(key);
+    if (h == NULL) {
+        block = new CacheBlock;
         block->fid = fid;
         block->block_idx = block_idx;
         block->sid = sid;
-        block->cache_block_idx = h->cache_id;
-        block->handle = h;
-        //Log("[%s] Alloc Block: %s, sid %lu, fid %lu, block_idx %lu, hash %u, dataset_nr %lu\n",
-        //    this->WorkPath().c_str(),
-        //    block->ToString().c_str(),
-        //    sid, fid, block_idx, hash, options_.dataset_num);
+        h = (LRUHandle*)cache->Insert(key, block, 0xffffffffffffffff, &BlockCacheImpl::BlockDeleter);
+        if (h != NULL) {
+            assert((uint64_t)(cache->Value((Cache::Handle*)h)) == (uint64_t)block);
+            block->cache_block_idx = h->cache_id;
+            block->handle = h;
+            //Log("[%s] Alloc Block: %s, sid %lu, fid %lu, block_idx %lu, hash %u, dataset_nr %lu\n",
+            //    this->WorkPath().c_str(),
+            //    block->ToString().c_str(),
+            //    sid, fid, block_idx, hash, options_.dataset_num);
+        } else {
+            delete block;
+            block = NULL;
+            assert(0);
+        }
     } else {
-        delete block;
-        block = (h == NULL) ? NULL: reinterpret_cast<CacheBlock*>(cache->Value((Cache::Handle*)h));
+        block = reinterpret_cast<CacheBlock*>(cache->Value((Cache::Handle*)h));
     }
+    ds->mu.Unlock();
     stat_->MeasureTime(TERA_BLOCK_CACHE_DS_LRU_LOOKUP,
                        options_.cache_env->NowMicros() - start_ts);
     return block;
diff --git a/src/leveldb/util/cache.cc b/src/leveldb/util/cache.cc
index 2595e7879..c3a4b7aea 100644
--- a/src/leveldb/util/cache.cc
+++ b/src/leveldb/util/cache.cc
@@ -265,7 +265,8 @@ class LRU2QCache: public Cache {
  public:
   explicit LRU2QCache(size_t capacity)
     : capacity_(capacity),
-      usage_(0) {
+      usage_(0),
+      max_cache_id_(0) {
      // Make empty circular linked list
     lru_.next = &lru_;
     lru_.prev = &lru_;
@@ -294,14 +295,16 @@ class LRU2QCache: public Cache {
       e->key_length = key.size();
       e->hash = hash;
       e->refs = 2;  // One from LRUCache, one for the returned handle
-      e->cache_id = usage_;
+      e->cache_id = cache_id == 0xffffffffffffffff ? usage_: cache_id;
       memcpy(e->key_data, key.data(), key.size());
+      max_cache_id_ = max_cache_id_ < e->cache_id ? e->cache_id : max_cache_id_;
 
       LRU_Append(e);
       assert(table_.Insert(e) == NULL);
       usage_++;
       return reinterpret_cast<Cache::Handle*>(e);
     }
+    assert(max_cache_id_ + 1 == usage_);
 
     // cache full, reuse item
     LRUHandle* old = lru_.next;
@@ -419,6 +422,7 @@ class LRU2QCache: public Cache {
   // mutex_ protects the following state.
   port::Mutex mutex_;
   size_t usage_;
+  uint64_t max_cache_id_;
 
   // Dummy head of LRU list.
   // lru.prev is newest entry, lru.next is oldest entry.

From 9794ff408b81a4f7f0c686eb6d919efe96819bad Mon Sep 17 00:00:00 2001
From: caijieming <caijieming@baidu.com>
Date: Tue, 29 Aug 2017 21:37:24 +0800
Subject: [PATCH 15/19] issue=1258, Tcache support block-level cache evict

---
 src/leveldb/util/block_cache.cc     | 34 ++++++++++++++++++++++++-----
 src/tabletnode/remote_tabletnode.cc |  2 +-
 2 files changed, 30 insertions(+), 6 deletions(-)

diff --git a/src/leveldb/util/block_cache.cc b/src/leveldb/util/block_cache.cc
index 2d664b681..02cce0ef4 100644
--- a/src/leveldb/util/block_cache.cc
+++ b/src/leveldb/util/block_cache.cc
@@ -14,6 +14,7 @@
 
 #include <list>
 #include <sstream>
+#include <unordered_map>
 
 #include "../utils/counter.h"
 
@@ -229,10 +230,12 @@ class BlockCacheImpl {
     port::Mutex mu_;
     // key lock list
     struct Waiter {
+        int wait_num; // protected by BlockCacheImpl.mu_
+
+        port::Mutex mu;
         port::CondVar cv;
-        int wait_num;
         bool done;
-        Waiter(port::Mutex* mu):cv(mu), wait_num(0), done(false) {}
+        Waiter(): wait_num(0), cv(&mu), done(false) {}
     };
     typedef std::map<std::string, Waiter*> LockKeyMap;
     LockKeyMap lock_key_;
@@ -290,7 +293,7 @@ class BlockCacheImpl {
             return "";
         }
     };
-    typedef std::map<uint64_t, DataSet*> DataSetMap;
+    typedef std::unordered_map<uint64_t, DataSet*> DataSetMap;
     DataSetMap data_set_map_;
 
     Statistics* stat_;
@@ -1114,6 +1117,17 @@ void BlockCacheImpl::BGControlThread() {
         stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_GET_FID).c_str(),
         stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_EVICT_NR).c_str());
 
+    Log("[%s] statistics(meta): "
+        "table_cache: %lf/%lu/%lu, "
+        "block_cache: %lf/%lu/%lu\n",
+        this->WorkPath().c_str(),
+        options_.opts.table_cache->HitRate(true),
+        options_.opts.table_cache->TableEntries(),
+        options_.opts.table_cache->ByteSize(),
+        options_.opts.block_cache->HitRate(true),
+        options_.opts.block_cache->Entries(),
+        options_.opts.block_cache->TotalCharge());
+
     // resched after 6s
     stat_->ClearHistogram(TERA_BLOCK_CACHE_PREAD_QUEUE);
     stat_->ClearHistogram(TERA_BLOCK_CACHE_PREAD_SSD_READ);
@@ -1179,10 +1193,13 @@ Status BlockCacheImpl::LockAndPut(LockContent& lc) {
     if (it != lock_key_.end()) {
         w = it->second;
         w->wait_num ++;
+        mu_.Unlock();
+
+        w->mu.Lock();
         while (!w->done) {
             w->cv.Wait();
         }
-        mu_.Unlock();
+        w->mu.Unlock();
 
         if (lc.type == kDBKey) {
             ReadOptions r_opts;
@@ -1193,7 +1210,9 @@ Status BlockCacheImpl::LockAndPut(LockContent& lc) {
             //    lc.db_val->c_str(),
             //    s.ToString().c_str());
         } else if (lc.type == kDataSetKey) {
+            mu_.Lock();
             lc.data_set = data_set_map_[lc.sid];
+            mu_.Unlock();
             //Log("[%s] get dataset sid: %lu\n",
             //    this->WorkPath().c_str(),
             //    lc.sid);
@@ -1213,7 +1232,7 @@ Status BlockCacheImpl::LockAndPut(LockContent& lc) {
             //    key.c_str());
         }
     } else {
-        w = new Waiter(&mu_);
+        w = new Waiter;
         w->wait_num = 1;
         lock_key_[key] = w;
         mu_.Unlock();
@@ -1302,11 +1321,16 @@ Status BlockCacheImpl::LockAndPut(LockContent& lc) {
             //    key.c_str());
             delete w;
         } else {
+            mu_.Unlock();
             //Log("[%s] put done %s, signal all wait thread\n",
             //    this->WorkPath().c_str(),
             //    key.c_str());
+            w->mu.Lock();
             w->done = true;
             w->cv.SignalAll();
+            w->mu.Unlock();
+
+            mu_.Lock();
         }
     }
     return s;
diff --git a/src/tabletnode/remote_tabletnode.cc b/src/tabletnode/remote_tabletnode.cc
index 2d95a0e5a..a59061369 100644
--- a/src/tabletnode/remote_tabletnode.cc
+++ b/src/tabletnode/remote_tabletnode.cc
@@ -322,7 +322,7 @@ void RemoteTabletNode::DoReadTablet(google::protobuf::RpcController* controller,
         int64_t read_timeout = request->client_timeout_ms() * 1000; // ms -> us
         int64_t detal = get_micros() - start_micros;
         if (detal > read_timeout) {
-            VLOG(5) << "timeout, drop read request for:" << request->tablet_name()
+            VLOG(8) << "timeout, drop read request for:" << request->tablet_name()
                 << ", detal(in us):" << detal
                 << ", read_timeout(in us):" << read_timeout;
             is_read_timeout = true;

From 13813f3a6aaf238df2ca5af04816240730681b07 Mon Sep 17 00:00:00 2001
From: caijieming <caijieming@baidu.com>
Date: Wed, 30 Aug 2017 15:29:04 +0800
Subject: [PATCH 16/19] issue=1258, t-cache support block-level cache evict

1. lock optimization: keep DataSets in an LRU cache (data_set_cache_) instead of the mu_-protected data_set_map_
---
 src/leveldb/util/block_cache.cc | 67 +++++++++++++++++++--------------
 1 file changed, 39 insertions(+), 28 deletions(-)

diff --git a/src/leveldb/util/block_cache.cc b/src/leveldb/util/block_cache.cc
index 02cce0ef4..52ff00e9b 100644
--- a/src/leveldb/util/block_cache.cc
+++ b/src/leveldb/util/block_cache.cc
@@ -74,6 +74,7 @@ struct CacheBlock {
     bool data_block_alloc;
     uint64_t data_block_refs;
     LRUHandle* handle;
+    LRUHandle* data_set_handle;
     Status s;
 
     CacheBlock()
@@ -85,7 +86,8 @@ struct CacheBlock {
       cv(&mu),
       data_block_alloc(false),
       data_block_refs(0),
-      handle(NULL) {
+      handle(NULL),
+      data_set_handle(NULL) {
     }
 
     bool Test(uint64_t c_state) {
@@ -165,9 +167,12 @@ struct CacheBlock {
 };
 
 struct DataSet {
+    LRUHandle* h;
     port::Mutex mu;
     Cache* cache;
     int fd;
+
+    DataSet(): h(NULL), cache(NULL), fd(-1) {}
 };
 
 class BlockCacheImpl {
@@ -293,8 +298,7 @@ class BlockCacheImpl {
             return "";
         }
     };
-    typedef std::unordered_map<uint64_t, DataSet*> DataSetMap;
-    DataSetMap data_set_map_;
+    Cache* data_set_cache_;
 
     Statistics* stat_;
     //WritableFile* logfile_;
@@ -1210,9 +1214,8 @@ Status BlockCacheImpl::LockAndPut(LockContent& lc) {
             //    lc.db_val->c_str(),
             //    s.ToString().c_str());
         } else if (lc.type == kDataSetKey) {
-            mu_.Lock();
-            lc.data_set = data_set_map_[lc.sid];
-            mu_.Unlock();
+            lc.data_set = GetDataSet(lc.sid);
+            assert(lc.data_set != NULL);
             //Log("[%s] get dataset sid: %lu\n",
             //    this->WorkPath().c_str(),
             //    lc.sid);
@@ -1308,9 +1311,11 @@ Status BlockCacheImpl::LockAndPut(LockContent& lc) {
             delete db_it;
             stat_->MeasureTime(TERA_BLOCK_CACHE_LOCKMAP_DS_RELOAD_NR, total_items);
 
-            mu_.Lock();
-            data_set_map_[lc.sid] = lc.data_set;
-            mu_.Unlock();
+            std::string ds_key;
+            PutFixed64(&ds_key, lc.sid);
+            LRUHandle* ds_handle = (LRUHandle*)data_set_cache_->Insert(ds_key, lc.data_set, 1, NULL);
+            assert(ds_handle != NULL);
+            lc.data_set->h = ds_handle;
         }
 
         mu_.Lock();
@@ -1356,6 +1361,7 @@ Status BlockCacheImpl::LoadCache() {
         options_.meta_table_cache_size);
     Status s = DB::Open(options_.opts, dbname, &db_);
     assert(s.ok());
+    data_set_cache_ = leveldb::NewLRUCache(128 * options_.dataset_num + 1);
 
     // recover fid
     std::string key = "FID#";
@@ -1377,20 +1383,20 @@ Status BlockCacheImpl::LoadCache() {
 }
 
 Status BlockCacheImpl::FillCache(CacheBlock* block) {
-    mu_.Lock();
-    uint64_t sid = block->sid;
     uint64_t cache_block_idx = block->cache_block_idx;
-    int fd = (data_set_map_[sid])->fd;
-    mu_.Unlock();
+    DataSet* ds = reinterpret_cast<DataSet*>(data_set_cache_->Value((Cache::Handle*)block->data_set_handle));
+    int fd = ds->fd;
 
     // do io without lock
     ssize_t res = pwrite(fd, block->data_block.data(), block->data_block.size(),
                          cache_block_idx * options_.block_size);
-    //Log("[%s] cache fill: sid %lu, dataset.fd %d, datablock size %lu, cb_idx %lu, %s, res %ld\n",
-    //    this->WorkPath().c_str(), sid, fd, block->data_block.size(),
-    //    cache_block_idx,
-    //    block->ToString().c_str(),
-    //    res);
+    if (res < 0) {
+        Log("[%s] cache fill: sid %lu, dataset.fd %d, datablock size %lu, cb_idx %lu, %s, res %ld\n",
+            this->WorkPath().c_str(), block->sid, fd, block->data_block.size(),
+            cache_block_idx,
+            block->ToString().c_str(),
+            res);
+    }
 
     if (res < 0) {
         return Status::Corruption("FillCache error");
@@ -1399,11 +1405,9 @@ Status BlockCacheImpl::FillCache(CacheBlock* block) {
 }
 
 Status BlockCacheImpl::ReadCache(CacheBlock* block, struct aiocb* aio_context) {
-    mu_.Lock();
-    uint64_t sid = block->sid;
     uint64_t cache_block_idx = block->cache_block_idx;
-    int fd = (data_set_map_[sid])->fd;
-    mu_.Unlock();
+    DataSet* ds = reinterpret_cast<DataSet*>(data_set_cache_->Value((Cache::Handle*)block->data_set_handle));
+    int fd = ds->fd;
 
     // do io without lock
     ssize_t res = 0;
@@ -1420,7 +1424,7 @@ Status BlockCacheImpl::ReadCache(CacheBlock* block, struct aiocb* aio_context) {
 
     if (res < 0) {
         Log("[%s] cache read: sid %lu, dataset.fd %d, datablock size %lu, cb_idx %lu, %s, res %ld\n",
-            this->WorkPath().c_str(), sid, fd, block->data_block.size(),
+            this->WorkPath().c_str(), block->sid, fd, block->data_block.size(),
             cache_block_idx,
             block->ToString().c_str(),
             res);
@@ -1514,12 +1518,14 @@ Status BlockCacheImpl::DeleteFile(const std::string& fname) {
 }
 
 DataSet* BlockCacheImpl::GetDataSet(uint64_t sid) {
+    std::string key;
+    PutFixed64(&key, sid);
     DataSet* set = NULL;
     uint64_t start_ts = options_.cache_env->NowMicros();
 
-    MutexLock l(&mu_);
-    DataSetMap::iterator it = data_set_map_.find(sid);
-    if (it == data_set_map_.end()) {
+    LRUHandle* h = (LRUHandle*)data_set_cache_->Lookup(key);
+    if (h == NULL) {
+        MutexLock l(&mu_);
         LockContent lc;
         lc.type = kDataSetKey;
         lc.sid = sid;
@@ -1529,7 +1535,8 @@ DataSet* BlockCacheImpl::GetDataSet(uint64_t sid) {
     } else {
         //Log("[%s] get dataset from memcache, sid %lu\n",
         //    this->WorkPath().c_str(), sid);
-        set = it->second;
+        set = reinterpret_cast<DataSet*>(data_set_cache_->Value((Cache::Handle*)h));
+        assert(set->h == h);
     }
     stat_->MeasureTime(TERA_BLOCK_CACHE_GET_DS,
                        options_.cache_env->NowMicros() - start_ts);
@@ -1562,6 +1569,7 @@ CacheBlock* BlockCacheImpl::GetAndAllocBlock(uint64_t fid, uint64_t block_idx) {
             assert((uint64_t)(cache->Value((Cache::Handle*)h)) == (uint64_t)block);
             block->cache_block_idx = h->cache_id;
             block->handle = h;
+            block->data_set_handle = ds->h;
             //Log("[%s] Alloc Block: %s, sid %lu, fid %lu, block_idx %lu, hash %u, dataset_nr %lu\n",
             //    this->WorkPath().c_str(),
             //    block->ToString().c_str(),
@@ -1573,8 +1581,11 @@ CacheBlock* BlockCacheImpl::GetAndAllocBlock(uint64_t fid, uint64_t block_idx) {
         }
     } else {
         block = reinterpret_cast<CacheBlock*>(cache->Value((Cache::Handle*)h));
+        block->data_set_handle = block->data_set_handle == NULL? ds->h: block->data_set_handle;
     }
     ds->mu.Unlock();
+
+    data_set_cache_->Release((Cache::Handle*)ds->h);
     stat_->MeasureTime(TERA_BLOCK_CACHE_DS_LRU_LOOKUP,
                        options_.cache_env->NowMicros() - start_ts);
     return block;
@@ -1603,7 +1614,7 @@ Status BlockCacheImpl::ReleaseBlock(CacheBlock* block, bool need_sync) {
 
     //Log("[%s] release block: %s\n", this->WorkPath().c_str(), block->ToString().c_str());
     LRUHandle* h = block->handle;
-    DataSet* ds = GetDataSet(block->sid); // get and alloc ds
+    DataSet* ds = reinterpret_cast<DataSet*>(data_set_cache_->Value((Cache::Handle*)block->data_set_handle));
     ds->cache->Release((Cache::Handle*)h);
     return s;
 }

From d95181fd9b78f1588eb40ef2d30e81fc559f0efc Mon Sep 17 00:00:00 2001
From: caijieming <caijieming@baidu.com>
Date: Wed, 30 Aug 2017 23:27:53 +0800
Subject: [PATCH 17/19] issue=1258, t-cache support block-level cache evict

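1. lock optimization: split LockAndPut into GetContentAfterWait/PutContentAfterLock/ReloadDataSet and add Waiter::Wait/SignalAll helpers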
---
 src/leveldb/util/block_cache.cc | 255 ++++++++++++++++++--------------
 1 file changed, 148 insertions(+), 107 deletions(-)

diff --git a/src/leveldb/util/block_cache.cc b/src/leveldb/util/block_cache.cc
index 52ff00e9b..7b8a6b09d 100644
--- a/src/leveldb/util/block_cache.cc
+++ b/src/leveldb/util/block_cache.cc
@@ -204,6 +204,12 @@ class BlockCacheImpl {
 
     Status LockAndPut(LockContent& lc);
 
+    Status GetContentAfterWait(LockContent& lc);
+
+    Status PutContentAfterLock(LockContent& lc);
+
+    Status ReloadDataSet(LockContent& lc);
+
     Status FillCache(CacheBlock* block);
 
     Status ReadCache(CacheBlock* block, struct aiocb* aio_context);
@@ -241,6 +247,17 @@ class BlockCacheImpl {
         port::CondVar cv;
         bool done;
         Waiter(): wait_num(0), cv(&mu), done(false) {}
+
+        void Wait() {
+            MutexLock l(&mu);
+            while (!done) { cv.Wait(); }
+        }
+
+        void SignalAll() {
+            MutexLock l(&mu);
+            done = true;
+            cv.SignalAll();
+        }
     };
     typedef std::map<std::string, Waiter*> LockKeyMap;
     LockKeyMap lock_key_;
@@ -936,7 +953,6 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
                 cache_->WorkPath().c_str(), fname_.c_str(),
                 offset, n, s.ToString().c_str());
         }
-
         //Log("[%s] Done Pread %s, size %lu, offset %lu, fid %lu, res %lu, status %s, start_block %lu, end_block %lu"
         //    ", block_size %lu\n",
         //    cache_->WorkPath().c_str(), fname_.c_str(), n, offset, fid,
@@ -1005,7 +1021,7 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
         //    block->ToString().c_str());
     }
 
-    // support aio engine
+    // support posix aio engine
     static void AioCacheReadCallback(sigval_t sigval) { // kernel create thread
         AsyncCacheReader* reader = (AsyncCacheReader*)sigval.sival_ptr;
         reader->file->HandleAioCacheReadCallback(reader);
@@ -1074,7 +1090,7 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
     bool aio_enabled_;
 };
 
-// Tcache impl
+// t-cache implementation
 BlockCacheImpl::BlockCacheImpl(const BlockCacheOptions& options)
     : options_(options),
       dfs_env_(options.env),
@@ -1198,29 +1214,9 @@ Status BlockCacheImpl::LockAndPut(LockContent& lc) {
         w = it->second;
         w->wait_num ++;
         mu_.Unlock();
+        w->Wait();
 
-        w->mu.Lock();
-        while (!w->done) {
-            w->cv.Wait();
-        }
-        w->mu.Unlock();
-
-        if (lc.type == kDBKey) {
-            ReadOptions r_opts;
-            s = db_->Get(r_opts, key, lc.db_val);
-            //Log("[%s] get lock key: %s, val: %s, status: %s\n",
-            //    this->WorkPath().c_str(),
-            //    key.c_str(),
-            //    lc.db_val->c_str(),
-            //    s.ToString().c_str());
-        } else if (lc.type == kDataSetKey) {
-            lc.data_set = GetDataSet(lc.sid);
-            assert(lc.data_set != NULL);
-            //Log("[%s] get dataset sid: %lu\n",
-            //    this->WorkPath().c_str(),
-            //    lc.sid);
-        }
-
+        s = GetContentAfterWait(lc);
         mu_.Lock();
         if (--w->wait_num == 0) {
             // last thread wait for open
@@ -1240,84 +1236,7 @@ Status BlockCacheImpl::LockAndPut(LockContent& lc) {
         lock_key_[key] = w;
         mu_.Unlock();
 
-        if (lc.type == kDBKey) {
-            WriteOptions w_opts;
-            s = db_->Put(w_opts, key, lc.db_lock_val);
-            if (s.ok()) {
-                lc.db_val->append(lc.db_lock_val.data(), lc.db_lock_val.size());
-            }
-            Log("[%s] Insert db key : %s, val %s, status %s\n",
-                this->WorkPath().c_str(),
-                lc.KeyToString().c_str(),
-                lc.ValToString().c_str(),
-                s.ToString().c_str());
-        } else if (lc.type == kDeleteDBKey) {
-            WriteOptions w_opts;
-            s = db_->Delete(w_opts, key);
-            Log("[%s] Delete db key : %s, val %s, status %s\n",
-                this->WorkPath().c_str(),
-                lc.KeyToString().c_str(),
-                lc.ValToString().c_str(),
-                s.ToString().c_str());
-        } else if (lc.type == kDataSetKey) {
-            lc.data_set = new DataSet;
-            lc.data_set->cache = New2QCache((options_.dataset_size / options_.block_size) + 1);// number of blocks in DS
-            std::string file = options_.cache_dir + "/" + Uint64ToString(lc.sid);
-            lc.data_set->fd = open(file.c_str(), O_RDWR | O_CREAT, 0644);
-            assert(lc.data_set->fd > 0);
-            Log("[%s] New DataSet %s, file: %s, nr_block: %lu, fd: %d\n",
-                this->WorkPath().c_str(),
-                lc.KeyToString().c_str(),
-                file.c_str(), (options_.dataset_size / options_.block_size) + 1,
-                lc.data_set->fd);
-
-            // reload hash lru
-            uint64_t total_items = 0;
-            ReadOptions s_opts;
-            leveldb::Iterator* db_it = db_->NewIterator(s_opts);
-            for (db_it->Seek(key);
-                 db_it->Valid() && db_it->key().starts_with("DS#");
-                 db_it->Next()) {
-                Slice lkey = db_it->key();
-                uint64_t sid, cbi;
-                lkey.remove_prefix(3);// lkey = DS#, sid, cbi
-                sid = DecodeFixed64(lkey.data());
-                lkey.remove_prefix(sizeof(uint64_t));
-                cbi = DecodeFixed64(lkey.data());
-                //Slice lval = db_it->value();
-                if (sid != lc.sid) {
-                    break;
-                }
-                total_items++;
-
-                CacheBlock* block = new CacheBlock;
-                block->DecodeFrom(db_it->value()); // get fid and block_idx
-                std::string hkey;
-                PutFixed64(&hkey, block->fid);
-                PutFixed64(&hkey, block->block_idx);
-                block->sid = sid;
-                block->cache_block_idx = cbi;
-                block->state = (block->Test(kCacheBlockValid)) ? kCacheBlockValid : 0;
-                //Log("[%s] Recovery %s, insert cacheblock into 2QLru, %s\n",
-                //    this->WorkPath().c_str(),
-                //    lc.KeyToString().c_str(),
-                //    block->ToString().c_str());
-                LRUHandle* handle = (LRUHandle*)(lc.data_set->cache->Insert(hkey, block, cbi, &BlockCacheImpl::BlockDeleter));
-                assert((uint64_t)(lc.data_set->cache->Value((Cache::Handle*)handle)) == (uint64_t)block);
-                assert(handle->cache_id == block->cache_block_idx);
-                block->handle = handle;
-                lc.data_set->cache->Release((Cache::Handle*)handle);
-            }
-            delete db_it;
-            stat_->MeasureTime(TERA_BLOCK_CACHE_LOCKMAP_DS_RELOAD_NR, total_items);
-
-            std::string ds_key;
-            PutFixed64(&ds_key, lc.sid);
-            LRUHandle* ds_handle = (LRUHandle*)data_set_cache_->Insert(ds_key, lc.data_set, 1, NULL);
-            assert(ds_handle != NULL);
-            lc.data_set->h = ds_handle;
-        }
-
+        s = PutContentAfterLock(lc);
         mu_.Lock();
         if (--w->wait_num == 0) {
             lock_key_.erase(key);
@@ -1330,10 +1249,7 @@ Status BlockCacheImpl::LockAndPut(LockContent& lc) {
             //Log("[%s] put done %s, signal all wait thread\n",
             //    this->WorkPath().c_str(),
             //    key.c_str());
-            w->mu.Lock();
-            w->done = true;
-            w->cv.SignalAll();
-            w->mu.Unlock();
+            w->SignalAll();
 
             mu_.Lock();
         }
@@ -1341,6 +1257,131 @@ Status BlockCacheImpl::LockAndPut(LockContent& lc) {
     return s;
 }
 
+Status BlockCacheImpl::GetContentAfterWait(LockContent& lc) {
+    Status s;
+    std::string key = lc.Encode();
+
+    if (lc.type == kDBKey) {
+        ReadOptions r_opts;
+        s = db_->Get(r_opts, key, lc.db_val);
+        //Log("[%s] get lock key: %s, val: %s, status: %s\n",
+        //    this->WorkPath().c_str(),
+        //    key.c_str(),
+        //    lc.db_val->c_str(),
+        //    s.ToString().c_str());
+    } else if (lc.type == kDataSetKey) {
+        std::string ds_key;
+        PutFixed64(&ds_key, lc.sid);
+        LRUHandle* ds_handle = (LRUHandle*)data_set_cache_->Lookup(ds_key);
+        lc.data_set = reinterpret_cast<DataSet*>(data_set_cache_->Value((Cache::Handle*)ds_handle));
+        assert(ds_handle == lc.data_set->h);
+        //Log("[%s] get dataset sid: %lu\n",
+        //    this->WorkPath().c_str(),
+        //    lc.sid);
+    }
+    return s;
+}
+
+Status BlockCacheImpl::PutContentAfterLock(LockContent& lc) {
+    Status s;
+    std::string key = lc.Encode();
+
+    if (lc.type == kDBKey) {
+        WriteOptions w_opts;
+        s = db_->Put(w_opts, key, lc.db_lock_val);
+        if (s.ok()) {
+            lc.db_val->append(lc.db_lock_val.data(), lc.db_lock_val.size());
+        }
+        Log("[%s] Insert db key : %s, val %s, status %s\n",
+            this->WorkPath().c_str(),
+            lc.KeyToString().c_str(),
+            lc.ValToString().c_str(),
+            s.ToString().c_str());
+    } else if (lc.type == kDeleteDBKey) {
+        WriteOptions w_opts;
+        s = db_->Delete(w_opts, key);
+        Log("[%s] Delete db key : %s, val %s, status %s\n",
+            this->WorkPath().c_str(),
+            lc.KeyToString().c_str(),
+            lc.ValToString().c_str(),
+            s.ToString().c_str());
+    } else if (lc.type == kDataSetKey) { // cannot double insert
+        std::string ds_key;
+        PutFixed64(&ds_key, lc.sid);
+        LRUHandle* ds_handle = (LRUHandle*)data_set_cache_->Lookup(ds_key);
+        if (ds_handle != NULL) {
+            lc.data_set = reinterpret_cast<DataSet*>(data_set_cache_->Value((Cache::Handle*)ds_handle));
+            assert(ds_handle == lc.data_set->h);
+        } else {
+            s = ReloadDataSet(lc);
+        }
+    }
+    return s;
+}
+
+Status BlockCacheImpl::ReloadDataSet(LockContent& lc) {
+    Status s;
+    std::string key = lc.Encode();
+
+    lc.data_set = new DataSet;
+    lc.data_set->cache = New2QCache((options_.dataset_size / options_.block_size) + 1);// number of blocks in DS
+    std::string file = options_.cache_dir + "/" + Uint64ToString(lc.sid);
+    lc.data_set->fd = open(file.c_str(), O_RDWR | O_CREAT, 0644);
+    assert(lc.data_set->fd > 0);
+    Log("[%s] New DataSet %s, file: %s, nr_block: %lu, fd: %d\n",
+            this->WorkPath().c_str(),
+            lc.KeyToString().c_str(),
+            file.c_str(), (options_.dataset_size / options_.block_size) + 1,
+            lc.data_set->fd);
+
+    // reload hash lru
+    uint64_t total_items = 0;
+    ReadOptions s_opts;
+    leveldb::Iterator* db_it = db_->NewIterator(s_opts);
+    for (db_it->Seek(key);
+            db_it->Valid() && db_it->key().starts_with("DS#");
+            db_it->Next()) {
+        Slice lkey = db_it->key();
+        uint64_t sid, cbi;
+        lkey.remove_prefix(3);// lkey = DS#, sid, cbi
+        sid = DecodeFixed64(lkey.data());
+        lkey.remove_prefix(sizeof(uint64_t));
+        cbi = DecodeFixed64(lkey.data());
+        //Slice lval = db_it->value();
+        if (sid != lc.sid) {
+            break;
+        }
+        total_items++;
+
+        CacheBlock* block = new CacheBlock;
+        block->DecodeFrom(db_it->value()); // get fid and block_idx
+        std::string hkey;
+        PutFixed64(&hkey, block->fid);
+        PutFixed64(&hkey, block->block_idx);
+        block->sid = sid;
+        block->cache_block_idx = cbi;
+        block->state = (block->Test(kCacheBlockValid)) ? kCacheBlockValid : 0;
+        //Log("[%s] Recovery %s, insert cacheblock into 2QLru, %s\n",
+        //    this->WorkPath().c_str(),
+        //    lc.KeyToString().c_str(),
+        //    block->ToString().c_str());
+        LRUHandle* handle = (LRUHandle*)(lc.data_set->cache->Insert(hkey, block, cbi, &BlockCacheImpl::BlockDeleter));
+        assert((uint64_t)(lc.data_set->cache->Value((Cache::Handle*)handle)) == (uint64_t)block);
+        assert(handle->cache_id == block->cache_block_idx);
+        block->handle = handle;
+        lc.data_set->cache->Release((Cache::Handle*)handle);
+    }
+    delete db_it;
+    stat_->MeasureTime(TERA_BLOCK_CACHE_LOCKMAP_DS_RELOAD_NR, total_items);
+
+    std::string ds_key;
+    PutFixed64(&ds_key, lc.sid);
+    LRUHandle* ds_handle = (LRUHandle*)data_set_cache_->Insert(ds_key, lc.data_set, 1, NULL);
+    assert(ds_handle != NULL);
+    lc.data_set->h = ds_handle;
+    return s;
+}
+
 const std::string& BlockCacheImpl::WorkPath() {
     return work_path_;
 }

From 6394ef70df98c9390019f69fac355524c955188d Mon Sep 17 00:00:00 2001
From: caijieming <caijieming@baidu.com>
Date: Tue, 24 Oct 2017 23:50:40 +0800
Subject: [PATCH 18/19] issue=1258, t-cache support block-level cache evict

bugfix for cache eviction
---
 src/leveldb/db/table_cache.cc            |  2 +-
 src/leveldb/include/leveldb/statistics.h |  8 ++-
 src/leveldb/table/format.cc              |  5 +-
 src/leveldb/util/block_cache.cc          | 78 +++++++++++++-----------
 src/leveldb/util/cache.cc                | 12 ++--
 src/tabletnode/tabletnode_impl.cc        |  2 +-
 6 files changed, 64 insertions(+), 43 deletions(-)

diff --git a/src/leveldb/db/table_cache.cc b/src/leveldb/db/table_cache.cc
index e6af0d97b..c9cdb77ea 100644
--- a/src/leveldb/db/table_cache.cc
+++ b/src/leveldb/db/table_cache.cc
@@ -93,7 +93,7 @@ Status TableCache::FindTable(const std::string& dbname, const Options* options,
 
       if (!s.ok()) {
         assert(table == NULL);
-        fprintf(stderr, "open sstable file failed: [%s]\n", fname.c_str());
+        fprintf(stderr, "open sstable file failed: [%s] %s\n", fname.c_str(), s.ToString().c_str());
         delete file;
         // We do not cache error results so that if the error is transient,
         // or somebody repairs the file, we recover automatically.
diff --git a/src/leveldb/include/leveldb/statistics.h b/src/leveldb/include/leveldb/statistics.h
index ed2898996..235192db2 100644
--- a/src/leveldb/include/leveldb/statistics.h
+++ b/src/leveldb/include/leveldb/statistics.h
@@ -43,12 +43,14 @@ enum Histograms : uint32_t {
   TERA_BLOCK_CACHE_LOCKMAP_DS_RELOAD_NR,
   TERA_BLOCK_CACHE_PREAD_GET_BLOCK,
   TERA_BLOCK_CACHE_PREAD_BLOCK_NR,
-  TERA_BLOCK_CACHE_GET_DS,
+  TERA_BLOCK_CACHE_GET_DATA_SET,
   TERA_BLOCK_CACHE_DS_LRU_LOOKUP,
   TERA_BLOCK_CACHE_PREAD_WAIT_UNLOCK,
   TERA_BLOCK_CACHE_ALLOC_FID,
   TERA_BLOCK_CACHE_GET_FID,
   TERA_BLOCK_CACHE_EVICT_NR,
+  TERA_BLOCK_CACHE_PREAD_DFS_READ,
+  TERA_BLOCK_CACHE_PREAD_SSD_WRITE,
   HISTOGRAM_ENUM_MAX,  // TODO(ldemailly): enforce HistogramsNameMap match
 };
 
@@ -60,12 +62,14 @@ const std::vector<std::pair<Histograms, std::string> > HistogramsNameMap = {
     {TERA_BLOCK_CACHE_LOCKMAP_DS_RELOAD_NR, "tera.block_cache.lockmap_ds_reload_nr"},
     {TERA_BLOCK_CACHE_PREAD_GET_BLOCK, "tera.block_cache.pread_get_block"},
     {TERA_BLOCK_CACHE_PREAD_BLOCK_NR, "tera.block_cache.pread_block_nr"},
-    {TERA_BLOCK_CACHE_GET_DS, "tera.block_cache.get_ds"},
+    {TERA_BLOCK_CACHE_GET_DATA_SET, "tera.block_cache.get_data_set"},
     {TERA_BLOCK_CACHE_DS_LRU_LOOKUP, "tera.block_cache.ds_lru_lookup"},
     {TERA_BLOCK_CACHE_PREAD_WAIT_UNLOCK, "tera.block_cache.pread_wait_unlock"},
     {TERA_BLOCK_CACHE_ALLOC_FID, "tera.block_cache.alloc_fid"},
     {TERA_BLOCK_CACHE_GET_FID, "tera.block_cache.get_fid"},
     {TERA_BLOCK_CACHE_EVICT_NR, "tera.block_cache.evict_nr"},
+    {TERA_BLOCK_CACHE_PREAD_DFS_READ, "tera.block_cache.pread_dfs_read"},
+    {TERA_BLOCK_CACHE_PREAD_SSD_WRITE, "tera.block_cache.pread_ssd_write"},
 };
 
 struct HistogramData {
diff --git a/src/leveldb/table/format.cc b/src/leveldb/table/format.cc
index f4e2e5259..c226a152a 100644
--- a/src/leveldb/table/format.cc
+++ b/src/leveldb/table/format.cc
@@ -97,7 +97,10 @@ Status ReadBlock(RandomAccessFile* file,
     const uint32_t actual = crc32c::Value(data, n + 1);
     if (actual != crc) {
       delete[] buf;
-      s = Status::Corruption("block checksum mismatch");
+      char err[128] = {'\0'};
+      sprintf(err, "block checksum mismatch: crc %u, actual %u, offset %lu, size %lu",
+              crc, actual, handle.offset(), n + kBlockTrailerSize);
+      s = Status::Corruption(Slice(err, strlen(err)));
       return s;
     }
   }
diff --git a/src/leveldb/util/block_cache.cc b/src/leveldb/util/block_cache.cc
index 7b8a6b09d..94bccd116 100644
--- a/src/leveldb/util/block_cache.cc
+++ b/src/leveldb/util/block_cache.cc
@@ -323,6 +323,7 @@ class BlockCacheImpl {
     DB* db_; // store meta
     ThreadPool bg_fill_;
     ThreadPool bg_read_;
+    ThreadPool bg_dfs_read_;
     ThreadPool bg_flush_;
     ThreadPool bg_control_;
 };
@@ -492,17 +493,18 @@ class BlockCacheWriteBuffer {
             for (uint32_t i = begin + 1; i <= end; ++i) {
                 tmp_storage_ = new std::string();
                 block_list_.push_back(tmp_storage_);
-                if (i < end) { // last block
+                if (i < end) {
                     tmp_storage_->append(buf.data(), block_size_);
                     buf.remove_prefix(block_size_);
                 } else { // last block
                     tmp_storage_->append(buf.data(), buf.size());
+                    buf.remove_prefix(buf.size());
                 }
-                //Log("[%s] add tmp_storage %s: offset: %lu, buf_size: %lu\n",
+                //Log("[%s] add tmp_storage %s: offset: %lu, buf_size: %lu, idx %u\n",
                 //    path_.c_str(),
                 //    file_.c_str(),
                 //    offset_,
-                //    buf.size());
+                //    buf.size(), i);
             }
         }
         offset_ += data.size();
@@ -803,7 +805,7 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
             //Log("[%s] pread in miss list, %s\n",
             //    cache_->WorkPath().c_str(),
             //    block->ToString().c_str());
-            cache_->bg_read_.Schedule(&BlockCacheRandomAccessFile::AsyncDfsRead, reader, 10);
+            cache_->bg_dfs_read_.Schedule(&BlockCacheRandomAccessFile::AsyncDfsRead, reader, 10);
         }
         //uint64_t miss_read_sched_ts = cache_->options_.cache_env->NowMicros();
 
@@ -857,7 +859,8 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
             //    cache_->WorkPath().c_str(),
             //    block->ToString().c_str());
         }
-        //uint64_t dfs_read_ts = cache_->options_.cache_env->NowMicros();
+        uint64_t dfs_read_ts = cache_->options_.cache_env->NowMicros();
+        cache_->stat_->MeasureTime(TERA_BLOCK_CACHE_PREAD_DFS_READ, dfs_read_ts - ssd_read_ts);
 
         for (uint32_t i = 0; i < c_miss.size(); ++i) {
             CacheBlock* block = c_miss[i];
@@ -869,7 +872,8 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
             //    block->ToString().c_str());
             cache_->bg_fill_.Schedule(&BlockCacheRandomAccessFile::AsyncCacheWrite, writer, 10);
         }
-        //uint64_t ssd_write_sched_ts = cache_->options_.cache_env->NowMicros();
+        uint64_t ssd_write_sched_ts = cache_->options_.cache_env->NowMicros();
+        //cache_->stat_->MeasureTime(TERA_BLOCK_CACHE_PREAD_SSD_WRITE_SCHED, ssd_write_sched_ts - dfs_read_ts);
 
         for (uint32_t i = 0; i < c_miss.size(); ++i) { // wait cache fill finish
             CacheBlock* block = c_miss[i];
@@ -888,6 +892,7 @@ class BlockCacheRandomAccessFile : public RandomAccessFile {
             //    block->ToString().c_str());
         }
         uint64_t ssd_write_ts = cache_->options_.cache_env->NowMicros();
+        cache_->stat_->MeasureTime(TERA_BLOCK_CACHE_PREAD_SSD_WRITE, ssd_write_ts - ssd_write_sched_ts);
 
         // wait other async read finish
         for (uint32_t i = 0; i < c_locked.size(); ++i) {
@@ -1099,6 +1104,7 @@ BlockCacheImpl::BlockCacheImpl(const BlockCacheOptions& options)
       db_(NULL) {
     bg_fill_.SetBackgroundThreads(30);
     bg_read_.SetBackgroundThreads(30);
+    bg_dfs_read_.SetBackgroundThreads(30);
     bg_flush_.SetBackgroundThreads(30);
     bg_control_.SetBackgroundThreads(2);
     stat_ = CreateDBStatistics();
@@ -1119,20 +1125,22 @@ void BlockCacheImpl::BGControlThread() {
     Log("[%s] statistics: "
         "%s, %s, %s, %s, %s, "
         "%s, %s, %s, %s, %s, "
-        "%s, %s, %s\n",
+        "%s, %s, %s, %s, %s\n",
         this->WorkPath().c_str(),
         stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_QUEUE).c_str(),
         stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_SSD_READ).c_str(),
+        stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_DFS_READ).c_str(),
+        stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_SSD_WRITE).c_str(),
         stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_FILL_USER_DATA).c_str(),
+
         stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_RELEASE_BLOCK).c_str(),
         stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_LOCKMAP_DS_RELOAD_NR).c_str(),
-
         stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_GET_BLOCK).c_str(),
         stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_BLOCK_NR).c_str(),
-        stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_GET_DS).c_str(),
+        stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_GET_DATA_SET).c_str(),
+
         stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_DS_LRU_LOOKUP).c_str(),
         stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_PREAD_WAIT_UNLOCK).c_str(),
-
         stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_ALLOC_FID).c_str(),
         stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_GET_FID).c_str(),
         stat_->GetBriefHistogramString(TERA_BLOCK_CACHE_EVICT_NR).c_str());
@@ -1151,12 +1159,14 @@ void BlockCacheImpl::BGControlThread() {
     // resched after 6s
     stat_->ClearHistogram(TERA_BLOCK_CACHE_PREAD_QUEUE);
     stat_->ClearHistogram(TERA_BLOCK_CACHE_PREAD_SSD_READ);
+    stat_->ClearHistogram(TERA_BLOCK_CACHE_PREAD_DFS_READ);
+    stat_->ClearHistogram(TERA_BLOCK_CACHE_PREAD_SSD_WRITE);
     stat_->ClearHistogram(TERA_BLOCK_CACHE_PREAD_FILL_USER_DATA);
     stat_->ClearHistogram(TERA_BLOCK_CACHE_PREAD_RELEASE_BLOCK);
     stat_->ClearHistogram(TERA_BLOCK_CACHE_LOCKMAP_DS_RELOAD_NR);
     stat_->ClearHistogram(TERA_BLOCK_CACHE_PREAD_GET_BLOCK);
     stat_->ClearHistogram(TERA_BLOCK_CACHE_PREAD_BLOCK_NR);
-    stat_->ClearHistogram(TERA_BLOCK_CACHE_GET_DS);
+    stat_->ClearHistogram(TERA_BLOCK_CACHE_GET_DATA_SET);
     stat_->ClearHistogram(TERA_BLOCK_CACHE_DS_LRU_LOOKUP);
     stat_->ClearHistogram(TERA_BLOCK_CACHE_PREAD_WAIT_UNLOCK);
     stat_->ClearHistogram(TERA_BLOCK_CACHE_ALLOC_FID);
@@ -1292,19 +1302,19 @@ Status BlockCacheImpl::PutContentAfterLock(LockContent& lc) {
         if (s.ok()) {
             lc.db_val->append(lc.db_lock_val.data(), lc.db_lock_val.size());
         }
-        Log("[%s] Insert db key : %s, val %s, status %s\n",
-            this->WorkPath().c_str(),
-            lc.KeyToString().c_str(),
-            lc.ValToString().c_str(),
-            s.ToString().c_str());
+        //Log("[%s] Insert db key : %s, val %s, status %s\n",
+        //    this->WorkPath().c_str(),
+        //    lc.KeyToString().c_str(),
+        //    lc.ValToString().c_str(),
+        //    s.ToString().c_str());
     } else if (lc.type == kDeleteDBKey) {
         WriteOptions w_opts;
         s = db_->Delete(w_opts, key);
-        Log("[%s] Delete db key : %s, val %s, status %s\n",
-            this->WorkPath().c_str(),
-            lc.KeyToString().c_str(),
-            lc.ValToString().c_str(),
-            s.ToString().c_str());
+        //Log("[%s] Delete db key : %s, val %s, status %s\n",
+        //    this->WorkPath().c_str(),
+        //    lc.KeyToString().c_str(),
+        //    lc.ValToString().c_str(),
+        //    s.ToString().c_str());
     } else if (lc.type == kDataSetKey) { // cannot double insert
         std::string ds_key;
         PutFixed64(&ds_key, lc.sid);
@@ -1329,18 +1339,18 @@ Status BlockCacheImpl::ReloadDataSet(LockContent& lc) {
     lc.data_set->fd = open(file.c_str(), O_RDWR | O_CREAT, 0644);
     assert(lc.data_set->fd > 0);
     Log("[%s] New DataSet %s, file: %s, nr_block: %lu, fd: %d\n",
-            this->WorkPath().c_str(),
-            lc.KeyToString().c_str(),
-            file.c_str(), (options_.dataset_size / options_.block_size) + 1,
-            lc.data_set->fd);
+        this->WorkPath().c_str(),
+        lc.KeyToString().c_str(),
+        file.c_str(), (options_.dataset_size / options_.block_size) + 1,
+        lc.data_set->fd);
 
     // reload hash lru
     uint64_t total_items = 0;
     ReadOptions s_opts;
     leveldb::Iterator* db_it = db_->NewIterator(s_opts);
     for (db_it->Seek(key);
-            db_it->Valid() && db_it->key().starts_with("DS#");
-            db_it->Next()) {
+         db_it->Valid() && db_it->key().starts_with("DS#");
+         db_it->Next()) {
         Slice lkey = db_it->key();
         uint64_t sid, cbi;
         lkey.remove_prefix(3);// lkey = DS#, sid, cbi
@@ -1389,7 +1399,7 @@ const std::string& BlockCacheImpl::WorkPath() {
 Status BlockCacheImpl::LoadCache() {
     // open meta file
     work_path_ = options_.cache_dir;
-    std::string dbname = options_.cache_dir + "/meta/";
+    std::string dbname = options_.cache_dir + "/meta";
     options_.opts.env = options_.cache_env; // local write
     options_.opts.filter_policy = NewBloomFilterPolicy(10);
     options_.opts.block_cache = leveldb::NewLRUCache(options_.meta_block_cache_size * 1024UL * 1024);
@@ -1431,15 +1441,13 @@ Status BlockCacheImpl::FillCache(CacheBlock* block) {
     // do io without lock
     ssize_t res = pwrite(fd, block->data_block.data(), block->data_block.size(),
                          cache_block_idx * options_.block_size);
+
     if (res < 0) {
         Log("[%s] cache fill: sid %lu, dataset.fd %d, datablock size %lu, cb_idx %lu, %s, res %ld\n",
             this->WorkPath().c_str(), block->sid, fd, block->data_block.size(),
             cache_block_idx,
             block->ToString().c_str(),
             res);
-    }
-
-    if (res < 0) {
         return Status::Corruption("FillCache error");
     }
     return Status::OK();
@@ -1579,7 +1587,7 @@ DataSet* BlockCacheImpl::GetDataSet(uint64_t sid) {
         set = reinterpret_cast<DataSet*>(data_set_cache_->Value((Cache::Handle*)h));
         assert(set->h == h);
     }
-    stat_->MeasureTime(TERA_BLOCK_CACHE_GET_DS,
+    stat_->MeasureTime(TERA_BLOCK_CACHE_GET_DATA_SET,
                        options_.cache_env->NowMicros() - start_ts);
     return set;
 }
@@ -1611,10 +1619,12 @@ CacheBlock* BlockCacheImpl::GetAndAllocBlock(uint64_t fid, uint64_t block_idx) {
             block->cache_block_idx = h->cache_id;
             block->handle = h;
             block->data_set_handle = ds->h;
-            //Log("[%s] Alloc Block: %s, sid %lu, fid %lu, block_idx %lu, hash %u, dataset_nr %lu\n",
+            //Log("[%s] Alloc Block: %s, sid %lu, fid %lu, block_idx %lu, hash %u, usage: %lu/%lu\n",
             //    this->WorkPath().c_str(),
             //    block->ToString().c_str(),
-            //    sid, fid, block_idx, hash, options_.dataset_num);
+            //    sid, fid, block_idx, hash,
+            //    cache->TotalCharge(),
+            //    options_.dataset_size / options_.block_size + 1);
         } else {
             delete block;
             block = NULL;
diff --git a/src/leveldb/util/cache.cc b/src/leveldb/util/cache.cc
index c3a4b7aea..b7c7ca4e0 100644
--- a/src/leveldb/util/cache.cc
+++ b/src/leveldb/util/cache.cc
@@ -281,10 +281,11 @@ class LRU2QCache: public Cache {
     const uint32_t hash = HashSlice(key);
     MutexLock l(&mutex_);
     LRUHandle* e = NULL;
-    e = (LRUHandle*)DoLookup(key, hash);
-    if (e != NULL) {
-        return reinterpret_cast<Cache::Handle*>(e);
-    }
+    //e = (LRUHandle*)DoLookup(key, hash);
+    //if (e != NULL) {
+    //    assert(0);
+    //    return reinterpret_cast<Cache::Handle*>(e);
+    //}
 
     if (usage_ < capacity_) { // cache not full
       e = reinterpret_cast<LRUHandle*>(
@@ -305,6 +306,8 @@ class LRU2QCache: public Cache {
       return reinterpret_cast<Cache::Handle*>(e);
     }
     assert(max_cache_id_ + 1 == usage_);
+    assert(usage_ == capacity_);
+    //fprintf(stderr, "%lu, usage %lu, capacity %lu\n", (uint64_t)this, usage_, capacity_);
 
     // cache full, reuse item
     LRUHandle* old = lru_.next;
@@ -330,6 +333,7 @@ class LRU2QCache: public Cache {
 
       LRU_Append(e);
       assert(table_.Insert(e) == NULL);
+      usage_++;
       return reinterpret_cast<Cache::Handle*>(e);
     }
     return NULL;
diff --git a/src/tabletnode/tabletnode_impl.cc b/src/tabletnode/tabletnode_impl.cc
index 52077a718..ff65562a3 100644
--- a/src/tabletnode/tabletnode_impl.cc
+++ b/src/tabletnode/tabletnode_impl.cc
@@ -192,7 +192,7 @@ void TabletNodeImpl::InitCacheSystem() {
         for (uint32_t i = 0; i < path_list.size(); ++i) {
             leveldb::BlockCacheOptions opts;
             LOG(INFO) << "load cache: " << path_list[i];
-            reinterpret_cast<leveldb::BlockCacheEnv*>(block_cache_env)->LoadCache(opts, path_list[i] + "/block_cache/");
+            reinterpret_cast<leveldb::BlockCacheEnv*>(block_cache_env)->LoadCache(opts, path_list[i] + "/block_cache");
         }
         return;
     }

From 9dc16656891b70e26ff8b4d572e8dd4fa60cd672 Mon Sep 17 00:00:00 2001
From: caijieming <caijieming@baidu.com>
Date: Wed, 25 Oct 2017 11:03:32 +0800
Subject: [PATCH 19/19] issue=1258, t-cache support block-level cache evict

Bug fix for cache eviction.
---
 src/leveldb/include/leveldb/block_cache.h | 2 ++
 src/leveldb/util/block_cache.cc           | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/src/leveldb/include/leveldb/block_cache.h b/src/leveldb/include/leveldb/block_cache.h
index 7c2d2b965..ebd1b0cf1 100644
--- a/src/leveldb/include/leveldb/block_cache.h
+++ b/src/leveldb/include/leveldb/block_cache.h
@@ -1,6 +1,8 @@
 // Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
+//
+// Author: caijieming@baidu.com
 
 #ifndef  STOREAGE_LEVELDB_UTIL_BLOCK_CACHE_H_
 #define  STOREAGE_LEVELDB_UTIL_BLOCK_CACHE_H_
diff --git a/src/leveldb/util/block_cache.cc b/src/leveldb/util/block_cache.cc
index 94bccd116..ab5421e6e 100644
--- a/src/leveldb/util/block_cache.cc
+++ b/src/leveldb/util/block_cache.cc
@@ -1,6 +1,8 @@
 // Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
+//
+// Author: caijieming@baidu.com
 
 #include "leveldb/block_cache.h"