diff --git a/.travis.yml b/.travis.yml index 7fbbc6b9e..20116d62a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,11 +1,15 @@ -language: cpp -compiler: gcc env: global: - secure: "EBGwhqHaPbERmOAPA7a1IprZZdFjEZqnuekgkNTBtzmGTaIYuh1BbSNGmVtnj3DuXuqAusiYN6olW2lMax15Fqw3Mwh++vh6DJFQ4wePImCzot7D4fTcopmNS2yoPl0IeyL/sLyQrxjflBfoTzw6DUZAXiU55gGB1faqCAfM5sQ=" + - CC=gcc-4.8 + - CXX=g++-4.8 addons: apt: + sources: + - ubuntu-toolchain-r-test packages: + - gcc-4.8 + - g++-4.8 - gdb - apport coverity_scan: diff --git a/Makefile b/Makefile index a2dff08e7..71cd57b9b 100644 --- a/Makefile +++ b/Makefile @@ -4,13 +4,17 @@ include depends.mk OPT ?= -g2 -Wall -Werror # (B) Debug mode, w/ full line-level debugging symbols # OPT ?= -O2 -g2 -DNDEBUG # (C) Profiling mode: opt, but w/debugging symbols -CC = cc -CXX = g++ +ifndef CXX + CXX = g++ +endif +ifndef CC + CC = gcc +endif INCPATH += -I./src -I./include -I./src/leveldb/include -I./src/leveldb \ -I./src/sdk/java/native-src $(DEPS_INCPATH) CFLAGS += $(OPT) $(INCPATH) -fPIC -fvisibility=hidden # hide internal symbol of tera -CXXFLAGS += $(CFLAGS) +CXXFLAGS += -std=gnu++11 $(CFLAGS) LDFLAGS += -rdynamic $(DEPS_LDPATH) $(DEPS_LDFLAGS) -lpthread -lrt -lz -ldl \ -lreadline -lncurses SO_LDFLAGS += -rdynamic $(DEPS_LDPATH) $(SO_DEPS_LDFLAGS) -lpthread -lrt -lz -ldl \ @@ -32,7 +36,8 @@ OTHER_SRC := $(wildcard src/zk/*.cc) $(wildcard src/utils/*.cc) $(VERSION_SRC) \ src/tera_flags.cc COMMON_SRC := $(wildcard src/common/base/*.cc) $(wildcard src/common/net/*.cc) \ $(wildcard src/common/file/*.cc) $(wildcard src/common/file/recordio/*.cc) \ - $(wildcard src/common/console/*.cc) + $(wildcard src/common/console/*.cc) +SERVER_WRAPPER_SRC := src/tera_main_wrapper.cc SERVER_SRC := src/tera_main.cc src/tera_entry.cc CLIENT_SRC := src/teracli_main.cc TEST_CLIENT_SRC := src/tera_test_main.cc @@ -41,7 +46,7 @@ MONITOR_SRC := src/monitor/teramo_main.cc MARK_SRC := src/benchmark/mark.cc src/benchmark/mark_main.cc TEST_SRC := src/utils/test/prop_tree_test.cc src/utils/test/tprinter_test.cc \ src/io/test/tablet_io_test.cc src/io/test/tablet_scanner_test.cc \ - src/master/test/master_impl_test.cc src/io/test/load_test.cc + src/master/test/master_impl_test.cc src/io/test/load_test.cc TEST_OUTPUT := test_output UNITTEST_OUTPUT := $(TEST_OUTPUT)/unittest @@ -54,6 +59,7 @@ PROTO_OBJ := $(PROTO_SRC:.cc=.o) JNI_TERA_OBJ := $(JNI_TERA_SRC:.cc=.o) OTHER_OBJ := $(OTHER_SRC:.cc=.o) COMMON_OBJ := $(COMMON_SRC:.cc=.o) +SERVER_WRAPPER_OBJ := $(SERVER_WRAPPER_SRC:.cc=.o) SERVER_OBJ := $(SERVER_SRC:.cc=.o) CLIENT_OBJ := $(CLIENT_SRC:.cc=.o) TEST_CLIENT_OBJ := $(TEST_CLIENT_SRC:.cc=.o) @@ -64,11 +70,12 @@ HTTP_OBJ := $(HTTP_SRC:.cc=.o) TEST_OBJ := $(TEST_SRC:.cc=.o) ALL_OBJ := $(MASTER_OBJ) $(TABLETNODE_OBJ) $(IO_OBJ) $(SDK_OBJ) $(PROTO_OBJ) \ $(JNI_TERA_OBJ) $(OTHER_OBJ) $(COMMON_OBJ) $(SERVER_OBJ) $(CLIENT_OBJ) \ - $(TEST_CLIENT_OBJ) $(TERA_C_OBJ) $(MONITOR_OBJ) $(MARK_OBJ) $(TEST_OBJ) + $(TEST_CLIENT_OBJ) $(TERA_C_OBJ) $(MONITOR_OBJ) $(MARK_OBJ) $(TEST_OBJ) \ + $(SERVER_WRAPPER_OBJ) LEVELDB_LIB := src/leveldb/libleveldb.a -LEVELDB_UTIL := src/leveldb/util/histogram.o +LEVELDB_UTIL := src/leveldb/util/histogram.o src/leveldb/port/port_posix.o -PROGRAM = tera_main teracli teramo tera_test +PROGRAM = tera_main tera_master tabletserver teracli teramo tera_test LIBRARY = libtera.a SOLIBRARY = libtera.so TERA_C_SO = libtera_c.so @@ -110,8 +117,15 @@ cleanall: $(MAKE) clean rm -rf build -tera_main: $(SERVER_OBJ) $(MASTER_OBJ) $(TABLETNODE_OBJ) $(IO_OBJ) $(SDK_OBJ) \ - $(PROTO_OBJ) $(OTHER_OBJ) $(COMMON_OBJ) $(LEVELDB_LIB) +tera_main: src/tera_main_wrapper.o src/version.o src/tera_flags.o + $(CXX) -o $@ $^ $(LDFLAGS) + +tera_master: $(SERVER_OBJ) $(MASTER_OBJ) $(IO_OBJ) $(SDK_OBJ) \ + $(PROTO_OBJ) $(OTHER_OBJ) $(COMMON_OBJ) $(LEVELDB_LIB) + $(CXX) -o $@ $^ $(LDFLAGS) + +tabletserver: $(SERVER_OBJ) $(TABLETNODE_OBJ) $(IO_OBJ) $(SDK_OBJ) \ + $(PROTO_OBJ) $(OTHER_OBJ) $(COMMON_OBJ) $(LEVELDB_LIB) $(CXX) -o $@ $^ $(LDFLAGS) libtera.a: $(SDK_OBJ) $(PROTO_OBJ) $(OTHER_OBJ) $(COMMON_OBJ) $(LEVELDB_UTIL) @@ -142,7 +156,7 @@ libjni_tera.so: $(JNI_TERA_OBJ) $(LIBRARY) $(CXX) -o $@ $^ $(SO_LDFLAGS) src/leveldb/libleveldb.a: FORCE - $(MAKE) -C src/leveldb + CC=$(CC) CXX=$(CXX) $(MAKE) -C src/leveldb tera_bench: @@ -174,8 +188,8 @@ tablet_scanner_test: src/io/test/tablet_scanner_test.o src/tabletnode/tabletnode $(IO_OBJ) $(PROTO_OBJ) $(OTHER_OBJ) $(COMMON_OBJ) $(LEVELDB_LIB) $(CXX) -o $@ $^ $(LDFLAGS) -master_impl_test: src/master/test/master_impl_test.o src/tera_entry.o $(MASTER_OBJ) $(TABLETNODE_OBJ) $(IO_OBJ) $(SDK_OBJ) \ - $(PROTO_OBJ) $(OTHER_OBJ) $(COMMON_OBJ) $(LEVELDB_LIB) +master_impl_test: src/master/test/master_impl_test.o src/tera_entry.cc $(MASTER_OBJ) $(IO_OBJ) $(SDK_OBJ) \ + $(PROTO_OBJ) $(OTHER_OBJ) $(COMMON_OBJ) $(LEVELDB_LIB) $(CXX) -o $@ $^ $(LDFLAGS) $(ALL_OBJ): %.o: %.cc $(PROTO_OUT_H) diff --git a/build.conf.template b/build.conf.template index 8d91cf747..bda1a85ca 100755 --- a/build.conf.template +++ b/build.conf.template @@ -9,7 +9,7 @@ MIRROR=$1 BOOST_VERSION=1_58_0 PROTOBUF_VERSION=2.6.1 SNAPPY_VERSION=1.1.3 -SOFA_PBRPC_VERSION=1.1.1 +SOFA_PBRPC_VERSION=1.1.3 ZOOKEEPER_VERSION=3.4.9 GFLAGS_VERSION=2.1.2 GLOG_VERSION=0.3.3 @@ -28,7 +28,7 @@ if [ $MIRROR == "china" ]; then GFLAGS_URL=https://github.com/schuhschuh/gflags/archive/v${GFLAGS_VERSION}.tar.gz GLOG_URL=https://github.com/google/glog/archive/v${GLOG_VERSION}.tar.gz GTEST_URL=https://github.com/google/googletest/archive/release-${GTEST_VERSION}.tar.gz - LIBUNWIND_URL=http://mirrors.163.com/gentoo/distfiles/libunwind-${LIBUNWIND_VERSION}.tar.gz + LIBUNWIND_URL=http://repository.timesys.com/buildsources/l/libunwind/libunwind-${LIBUNWIND_VERSION}/libunwind-${LIBUNWIND_VERSION}.tar.gz GPERFTOOLS_URL=https://github.com/00k/gperftools/raw/master/gperftools-${GPERFTOOLS_VERSION}.tar.gz INS_URL=https://github.com/baidu/ins/archive/${INS_VERSION}.tar.gz NOSE_URL=http://mirrors.163.com/gentoo/distfiles/nose-${NOSE_VERSION}.tar.gz diff --git a/doc/cn/cluster_setup.md b/doc/cn/cluster_setup.md index 6fc4daf54..8950bee99 100644 --- a/doc/cn/cluster_setup.md +++ b/doc/cn/cluster_setup.md @@ -47,12 +47,12 @@ Tera集群搭建 * 在master节点上,执行以下命令 ``` cd ${tera_prefix}/bin - nohup ./tera_main --flagfile=../conf/tera.flag --tera_role=master &> ../log/master.stderr & + nohup ./tera_master --flagfile=../conf/tera.flag &> ../log/master.stderr & ``` * 在TabletServer节点上,执行以下命令 ``` cd ${tera_prefix}/bin - nohup ./tera_main --flagfile=../conf/tera.flag --tera_role=tabletnode &> ../log/tabletserver.stderr & + nohup ./tabletserver --flagfile=../conf/tera.flag &> ../log/tabletserver.stderr & ``` 5. 停止tera - * 用kill命令杀掉tera_main对应的进程即可 + * 用kill命令杀掉tera_master和tabletserver对应的进程即可 diff --git a/doc/cn/onebox.md b/doc/cn/onebox.md index f2bf68fc9..60a0546e7 100644 --- a/doc/cn/onebox.md +++ b/doc/cn/onebox.md @@ -8,7 +8,7 @@ ## 准备工作 1. 完成Tera的编译,请参考:https://github.com/baidu/tera/wiki/Build-Manual -2. 将编译生成的tera_main, teracli两个二进制文件放入example/onebox/bin +2. 将编译生成的tera_master, tabletserver, teracli三个二进制文件放入example/onebox/bin 3. 如有需要,通过修改example/onebox/bin/config中的选项配置tabletnode个数 4. 进入example/onebox/bin/目录 diff --git a/doc/en/onebox.md b/doc/en/onebox.md index b366d0f3d..5d63f8b73 100644 --- a/doc/en/onebox.md +++ b/doc/en/onebox.md @@ -9,7 +9,7 @@ Your can run Tera in pseudo-distributed mode which means that Tera runs on a sin 1. git clone https://github.com/baidu/tera.git 1. cd tera 1. ./build.sh -1. cp {tera_main,teracli} example/onebox/bin +1. cp {tera_master,tabletserver,teracli} example/onebox/bin 1. cd example/onebox/bin diff --git a/doc/ubuntu_install_onebox_and_cluster.md b/doc/ubuntu_install_onebox_and_cluster.md index 55c0bd474..16d7b7f9e 100644 --- a/doc/ubuntu_install_onebox_and_cluster.md +++ b/doc/ubuntu_install_onebox_and_cluster.md @@ -41,12 +41,15 @@ ubuntu安装tera(单机和集群) 4). 提示错误:fatal error: readline/history.h: sudo apt-get install libreadline-dev - 5). 提示缺少ncurses包:、 + 5). 提示缺少ncurses包: sudo apt-get install libncurses5-dev + 6). 提示 "cmake: command not found": + sudo apt-get install cmake + 3. 等待编译结束 & 单机体验 - 将编译生成的tera_main和teracli文件copy到example/onebox/bin目录下,进入目录执行:sh launch_tera.sh。然后执行./teracli进终端交互。Have fun! + 将编译生成的tera_master,tabletserver和teracli文件copy到example/onebox/bin目录下,进入目录执行:sh launch_tera.sh。然后执行./teracli进终端交互。Have fun! 详见:https://github.com/baidu/tera/blob/master/doc/onebox-cn.md @@ -259,10 +262,10 @@ ubuntu安装tera(单机和集群) > 在zookeeper目录中执行:./bin/zkServer.sh status,找到leader那台机器,即master > 进入tera_root的bin目录, 在master上执行: - nohup ./tera_main --flagfile=../conf/tera.flag --tera_role=master &> ../log/master.stderr & + nohup ./tera_master --flagfile=../conf/tera.flag &> ../log/master.stderr & 在其他两台slave机器上执行: - nohup ./tera_main --flagfile=../conf/tera.flag --tera_role=tabletnode &> ../log/tabletserver.stderr & + nohup ./tabletserver --flagfile=../conf/tera.flag &> ../log/tabletserver.stderr & > 在任意一台机器上的tera_root/bin目录中执行:./teracli Have Fun! diff --git a/example/docker/master b/example/docker/master index 87d74626b..38f6ed238 100755 --- a/example/docker/master +++ b/example/docker/master @@ -3,4 +3,4 @@ export CLASSPATH=.:$CLASSPATH:$JAVA_HOME/lib:$JAVA_HOME/jre/lib:$JAVA_HOME/jre/lib/amd64/:/home/ubuntu/leiliyuan/hadoop-1.2.1/lib:$(hadoop classpath) cd /opt/tera/bin -nohup ./tera_main --flagfile=../conf/tera.flag --tera_local_addr $1 --tera_role=master &> ../../share/log/master.stderr & +nohup ./tera_master --flagfile=../conf/tera.flag --tera_local_addr $1 &> ../../share/log/master.stderr & diff --git a/example/docker/tabletnode b/example/docker/tabletnode index 326afd7bd..4995e59b1 100755 --- a/example/docker/tabletnode +++ b/example/docker/tabletnode @@ -2,4 +2,4 @@ export CLASSPATH=.:$CLASSPATH:$JAVA_HOME/lib:$JAVA_HOME/jre/lib:$JAVA_HOME/jre/lib/amd64/:/home/ubuntu/leiliyuan/hadoop-1.2.1/lib:$(hadoop classpath) cd /opt/tera/bin -nohup ./tera_main --flagfile=../conf/tera.flag --tera_local_addr $1 --tera_role=tabletnode &> ../../share/log/tabletserver.stderr & +nohup ./tabletserver --flagfile=../conf/tera.flag --tera_local_addr $1 &> ../../share/log/tabletserver.stderr & diff --git a/example/onebox/bin/kill_tera.sh b/example/onebox/bin/kill_tera.sh index fde9dbfed..047de86bb 100755 --- a/example/onebox/bin/kill_tera.sh +++ b/example/onebox/bin/kill_tera.sh @@ -1,8 +1,13 @@ #!/bin/bash source ./config -for ((i=0; i<=${TABLETNODE_NUM}; i++)); do - PID=`ps x | grep tera_main | grep $((PORT+i)) | awk '{print $1}'`; +PID=`ps x | grep tera_master | grep $PORT | awk '{print $1}'`; +if [ ${PID}"x" != "x" ]; then + kill -9 $PID; +fi + +for ((i=1; i<=${TABLETNODE_NUM}; i++)); do + PID=`ps x | grep tabletserver | grep $((PORT+i)) | awk '{print $1}'`; if [ ${PID}"x" != "x" ]; then kill -9 $PID; fi diff --git a/example/onebox/bin/launch_tera.sh b/example/onebox/bin/launch_tera.sh index b0004153c..c6d5f2fba 100755 --- a/example/onebox/bin/launch_tera.sh +++ b/example/onebox/bin/launch_tera.sh @@ -33,9 +33,8 @@ for ((i=1; i<=$TABLETNODE_NUM; i++)); do if [ ! -x $CACHE_PATH ];then mkdir -p $CACHE_PATH fi - ${CURRENT_DIR}/tera_main \ + ${CURRENT_DIR}/tabletserver \ --flagfile=${CURRENT_DIR}/../conf/tera.flag \ - --tera_role=tabletnode \ --tera_tabletnode_port=$((PORT+i)) \ --tera_leveldb_log_path=${LEVELDB_LOG_FILE} \ --tera_tabletnode_cache_paths=${CACHE_PATH} \ @@ -53,9 +52,8 @@ MASTER_LOG_FILE=${CURRENT_DIR}/../log/master.stderr if [ -f ${MASTER_LOG_FILE} ];then mv ${MASTER_LOG_FILE} ${MASTER_LOG_FILE}.${TIME} fi -${CURRENT_DIR}/tera_main \ +${CURRENT_DIR}/tera_master \ --flagfile=${CURRENT_DIR}/../conf/tera.flag \ - --tera_role=master \ --tera_master_port=${PORT} \ --tera_fake_zk_path_prefix=${FAKE_ZK_PATH_PREFIX} \ --tera_log_prefix=master &> ${MASTER_LOG_FILE} > > #系统架构 系统主要由Tabletserver、Master和ClientSDK三部分构成。其中Tabletserver是核心服务器,承载着所有的数据管理与访问;Master是系统的仲裁者,负责表格的创建、schema更新与负载均衡;ClientSDK包含供管理员使用的命令行工具teracli和给用户使用的SDK。 -表格被按RowKey全局排序,并横向切分成多个Tablet,每个Tablet负责服务RowKey的一个区间,表格又被纵向且分为多个LocalityGroup,一个Tablet的多个Localitygroup在物理上单独存储,可以选择不同的存储介质,以优化访问效率。 +表格被按RowKey全局排序,并横向切分成多个Tablet,每个Tablet负责服务RowKey的一个区间,表格又被纵向切分为多个LocalityGroup,一个Tablet的多个Localitygroup在物理上单独存储,可以选择不同的存储介质,以优化访问效率。 ![架构图](resources/images/arch.png) diff --git a/src/benchmark/mark.cc b/src/benchmark/mark.cc index d3741f71e..a0081e2e4 100644 --- a/src/benchmark/mark.cc +++ b/src/benchmark/mark.cc @@ -74,7 +74,7 @@ void Adapter::Write(const std::string& row, if (FLAGS_verify) { add_checksum(row, family, qualifier, &value); } - row_mu->Put(family, qualifier, timestamp, value); + row_mu->Put(family, qualifier, value, (int64_t)timestamp); if (FLAGS_verify) { remove_checksum(&value); } diff --git a/src/benchmark/mark_main.cc b/src/benchmark/mark_main.cc index e32bc216c..36ae66c4b 100644 --- a/src/benchmark/mark_main.cc +++ b/src/benchmark/mark_main.cc @@ -424,6 +424,7 @@ void print_summary(Statistic* marker, double duration) { print_opt(marker); std::streamsize precision = std::cout.precision(); + std::ios::fmtflags flag(std::cout.flags()); std::cout.precision(3); std::cout << " Summary: " << std::fixed << duration << " s\n" << " total: " << finish_size << " bytes " @@ -434,6 +435,7 @@ void print_summary(Statistic* marker, double duration) { << (double)success_size / 1048576 / duration << " MB/s" << std::endl; std::cout.precision(precision); + std::cout.flags(flag); } void print_summary_proc(Adapter* adapter, double duration) { diff --git a/src/common/thread.h b/src/common/thread.h index 188730725..24e6842e1 100644 --- a/src/common/thread.h +++ b/src/common/thread.h @@ -7,16 +7,15 @@ #ifndef TERA_COMMON_THREAD_H_ #define TERA_COMMON_THREAD_H_ +#include #include -#include - namespace common { class Thread { public: Thread() : tid_(0) {} - bool Start(boost::function thread_proc) { + bool Start(std::function thread_proc) { user_proc_ = thread_proc; int ret = pthread_create(&tid_, NULL, ProcWrapper, this); return (ret == 0); @@ -33,7 +32,7 @@ class Thread { } private: - boost::function user_proc_; + std::function user_proc_; pthread_t tid_; }; diff --git a/src/common/thread_attributes.h b/src/common/thread_attributes.h index c9f18d405..d415feb47 100644 --- a/src/common/thread_attributes.h +++ b/src/common/thread_attributes.h @@ -40,7 +40,7 @@ class ThreadAttributes { } bool SetCpuMask(int32_t cpu_id) { - if (cpu_id < 0 || cpu_id > cpu_num_) { + if (cpu_id < 0 || cpu_id >= cpu_num_) { return false; } diff --git a/src/common/thread_pool.h b/src/common/thread_pool.h index 7dabd67a8..934b98fd1 100644 --- a/src/common/thread_pool.h +++ b/src/common/thread_pool.h @@ -8,12 +8,13 @@ #define TERA_COMMON_THREAD_POOL_H_ #include +#include #include #include #include #include #include -#include + #include "mutex.h" #include "timer.h" @@ -60,7 +61,8 @@ class ThreadPool { bool Stop(bool wait) { if (wait) { while (pending_num_ > 0) { - usleep(10000); + struct timespec ts = {0, 10000000}; + nanosleep(&ts, NULL); } } @@ -77,7 +79,7 @@ class ThreadPool { } // Task definition. - typedef boost::function Task; + typedef std::function Task; // Add a task to the thread pool. void AddTask(const Task& task) { diff --git a/src/io/default_compact_strategy.cc b/src/io/default_compact_strategy.cc index b9c665fbc..a37b74661 100644 --- a/src/io/default_compact_strategy.cc +++ b/src/io/default_compact_strategy.cc @@ -403,6 +403,37 @@ bool DefaultCompactStrategy::DropByLifeTime(int32_t cf_idx, int64_t timestamp) c } } +bool DefaultCompactStrategy::CheckTag(const Slice& tera_key, bool* del_tag, int64_t* ttl_tag) { + *del_tag = false; + *ttl_tag = -1; + Slice key, col, qual; + int64_t ts = -1; + leveldb::TeraKeyType type; + + if (!raw_key_operator_->ExtractTeraKey(tera_key, &key, &col, &qual, &ts, &type)) { + LOG(WARNING) << "invalid tera key: " << tera_key.ToString(); + return false; + } + + if (type == leveldb::TKT_DEL || + type == leveldb::TKT_DEL_COLUMN || + type == leveldb::TKT_DEL_QUALIFIERS || + type == leveldb::TKT_DEL_QUALIFIER) { + *del_tag = true; + } + int32_t cf = -1; + int64_t ttl = -1; + if (!DropIllegalColumnFamily(col.ToString(), &cf) && + schema_.column_families(cf).time_to_live() > 0) { + ttl = schema_.column_families(cf).time_to_live(); + *ttl_tag = ts + ttl * 1000000LL; + } + VLOG(11) << "default strategy, del " << *del_tag << ", key_ts " << ts + << ", ttl " << ttl + << ", ttl_tag " << *ttl_tag; + return true; +} + bool DefaultCompactStrategy::CheckCompactLowerBound(const Slice& cur_key, const std::string& lower_bound) { if (lower_bound.empty()) { diff --git a/src/io/default_compact_strategy.h b/src/io/default_compact_strategy.h index 196ab1dbf..495caa5f9 100644 --- a/src/io/default_compact_strategy.h +++ b/src/io/default_compact_strategy.h @@ -6,6 +6,7 @@ #define TERA_IO_DEFAULT_COMPACT_STRATEGY_H_ #include "leveldb/compact_strategy.h" +#include "leveldb/slice.h" #include "common/mutex.h" #include "io/io_utils.h" @@ -31,6 +32,7 @@ class DefaultCompactStrategy : public leveldb::CompactStrategy { virtual const char* Name() const; virtual void SetSnapshot(uint64_t snapshot); + virtual bool CheckTag(const leveldb::Slice& tera_key, bool* del_tag, int64_t* ttl_tag); virtual bool ScanMergedValue(leveldb::Iterator* it, std::string* merged_value, diff --git a/src/io/tablet_io.cc b/src/io/tablet_io.cc index 249bc5970..9e92c121b 100644 --- a/src/io/tablet_io.cc +++ b/src/io/tablet_io.cc @@ -25,6 +25,7 @@ #include "leveldb/env_inmem.h" #include "leveldb/env_mock.h" #include "leveldb/filter_policy.h" +#include "leveldb/raw_key_operator.h" #include "types.h" #include "utils/counter.h" #include "utils/scan_filter.h" @@ -37,6 +38,8 @@ DECLARE_int64(tera_tablet_log_file_size); DECLARE_int64(tera_tablet_max_write_buffer_size); DECLARE_int64(tera_tablet_write_block_size); DECLARE_int32(tera_tablet_level0_file_limit); +DECLARE_int32(tera_tablet_ttl_percentage); +DECLARE_int32(tera_tablet_del_percentage); DECLARE_int32(tera_tablet_max_block_log_number); DECLARE_int64(tera_tablet_write_log_time_out); DECLARE_bool(tera_log_async_mode); @@ -53,6 +56,7 @@ DECLARE_int32(tera_tabletnode_retry_period); DECLARE_string(tera_leveldb_compact_strategy); DECLARE_bool(tera_leveldb_verify_checksums); DECLARE_bool(tera_leveldb_ignore_corruption_in_compaction); +DECLARE_bool(tera_leveldb_use_file_lock); DECLARE_int32(tera_tabletnode_scan_pack_max_size); DECLARE_bool(tera_tabletnode_cache_enabled); @@ -65,16 +69,25 @@ DECLARE_bool(tera_tablet_use_memtable_on_leveldb); DECLARE_int64(tera_tablet_memtable_ldb_write_buffer_size); DECLARE_int64(tera_tablet_memtable_ldb_block_size); -extern tera::Counter row_read_delay; +tera::Counter row_read_delay; namespace tera { namespace io { -TabletIO::TabletIO(const std::string& key_start, const std::string& key_end) +std::ostream& operator << (std::ostream& o, const TabletIO& tablet_io) { + o << tablet_io.short_path_ + << " [" << DebugString(tablet_io.start_key_) + << ", " << DebugString(tablet_io.end_key_) << "]"; + return o; +} + +TabletIO::TabletIO(const std::string& key_start, const std::string& key_end, + const std::string& path) : async_writer_(NULL), scan_context_manager_(NULL), start_key_(key_start), end_key_(key_end), + short_path_(path), compact_status_(kTableNotCompact), status_(kNotInit), ref_count_(1), db_ref_count_(0), db_(NULL), @@ -213,6 +226,8 @@ bool TabletIO::Load(const TableSchema& schema, ldb_options_.key_start = raw_start_key_; ldb_options_.key_end = raw_end_key_; ldb_options_.l0_slowdown_writes_trigger = FLAGS_tera_tablet_level0_file_limit; + ldb_options_.ttl_percentage = FLAGS_tera_tablet_ttl_percentage; + ldb_options_.del_percentage = FLAGS_tera_tablet_del_percentage; ldb_options_.block_size = FLAGS_tera_tablet_write_block_size * 1024; ldb_options_.max_block_log_number = FLAGS_tera_tablet_max_block_log_number; ldb_options_.write_log_time_out = FLAGS_tera_tablet_write_log_time_out; @@ -232,8 +247,15 @@ bool TabletIO::Load(const TableSchema& schema, if (kv_only_ && table_schema_.raw_key() == TTLKv) { ldb_options_.filter_policy = leveldb::NewTTLKvBloomFilterPolicy(10); - } else { + } else if (kv_only_) { ldb_options_.filter_policy = leveldb::NewBloomFilterPolicy(10); + } else if (table_schema_.raw_key() == Readable) { + ldb_options_.filter_policy = + leveldb::NewRowKeyBloomFilterPolicy(10, leveldb::ReadableRawKeyOperator()); + } else { + CHECK_EQ(table_schema_.raw_key(), Binary); + ldb_options_.filter_policy = + leveldb::NewRowKeyBloomFilterPolicy(10, leveldb::BinaryRawKeyOperator()); } ldb_options_.block_cache = block_cache; ldb_options_.table_cache = table_cache; @@ -253,6 +275,7 @@ bool TabletIO::Load(const TableSchema& schema, } ldb_options_.verify_checksums_in_compaction = FLAGS_tera_leveldb_verify_checksums; ldb_options_.ignore_corruption_in_compaction = FLAGS_tera_leveldb_ignore_corruption_in_compaction; + ldb_options_.use_file_lock = FLAGS_tera_leveldb_use_file_lock; ldb_options_.disable_wal = table_schema_.disable_wal(); SetupOptionsForLG(); @@ -262,7 +285,8 @@ bool TabletIO::Load(const TableSchema& schema, } tablet_path_ = path_prefix + path; - LOG(INFO) << "[Load] Start Open " << tablet_path_; + LOG(INFO) << "[Load] Start Open " << tablet_path_ + << ", kv_only " << kv_only_ << ", raw_key_operator " << key_operator_->Name(); // recover snapshot for (std::map::iterator it = snapshots.begin(); it != snapshots.end(); ++it) { id_to_snapshot_num_[it->first] = it->second; @@ -479,7 +503,7 @@ bool TabletIO::Split(std::string* split_key, StatusCode* status) { } } -bool TabletIO::Compact(int lg_no, StatusCode* status) { +bool TabletIO::Compact(int lg_no, StatusCode* status, CompactionType type) { { MutexLock lock(&mutex_); if (status_ != kReady) { @@ -493,30 +517,15 @@ bool TabletIO::Compact(int lg_no, StatusCode* status) { db_ref_count_++; } CHECK_NOTNULL(db_); - db_->CompactRange(NULL, NULL, lg_no); - - { - MutexLock lock(&mutex_); - compact_status_ = kTableCompacted; - db_ref_count_--; - } - return true; -} - -bool TabletIO::CompactMinor(StatusCode* status) { - { - MutexLock lock(&mutex_); - if (status_ != kReady) { - SetStatusCode(status_, status); - return false; - } - db_ref_count_++; + if (type == kManualCompaction) { + db_->CompactRange(NULL, NULL, lg_no); + } else if (type == kMinorCompaction) { + db_->MinorCompact(); } - CHECK_NOTNULL(db_); - db_->MinorCompact(); { MutexLock lock(&mutex_); + compact_status_ = kTableCompacted; db_ref_count_--; } return true; @@ -635,9 +644,10 @@ bool TabletIO::Read(const leveldb::Slice& key, std::string* value, return true; } -StatusCode TabletIO::InitedScanInterator(const std::string& start_tera_key, - const ScanOptions& scan_options, - leveldb::Iterator** scan_it) { +StatusCode TabletIO::InitedScanIterator(const std::string& start_tera_key, + const std::string& end_row_key, + const ScanOptions& scan_options, + leveldb::Iterator** scan_it) { leveldb::Slice start_key, start_col, start_qual; key_operator_->ExtractTeraKey(start_tera_key, &start_key, &start_col, &start_qual, NULL, NULL); @@ -653,6 +663,10 @@ StatusCode TabletIO::InitedScanInterator(const std::string& start_tera_key, } } read_option.rollbacks = rollbacks_; + // single row scan + if (start_key.ToString() + '\0' == end_row_key) { + SetupSingleRowIteratorOptions(start_key.ToString(), &read_option); + } *scan_it = db_->NewIterator(read_option); TearDownIteratorOptions(&read_option); @@ -676,7 +690,7 @@ bool TabletIO::LowLevelScan(const std::string& start_tera_key, bool* is_complete, StatusCode* status) { leveldb::Iterator* it = NULL; - StatusCode ret_code = InitedScanInterator(start_tera_key, scan_options, &it); + StatusCode ret_code = InitedScanIterator(start_tera_key, end_row_key, scan_options, &it); if (ret_code != kTabletNodeOk) { SetStatusCode(ret_code, status); return false; @@ -844,8 +858,11 @@ inline bool TabletIO::LowLevelScan(const std::string& start_tera_key, int64_t now_time = GetTimeStampInMs(); int64_t time_out = now_time + scan_options.timeout; KeyValuePair next_start_kv_pair; - VLOG(9) << "ll-scan timeout set to be " << scan_options.timeout; + VLOG(9) << "ll-scan timeout set to be " << scan_options.timeout + << ", start_tera_key " << DebugString(start_tera_key) + << ", end_row_key " << DebugString(end_row_key); + *is_complete = false; for (; it->Valid();) { bool has_merged = false; std::string merged_value; @@ -872,12 +889,7 @@ inline bool TabletIO::LowLevelScan(const std::string& start_tera_key, if (end_row_key.size() && key.compare(end_row_key) >= 0) { // scan finished - break; - } - - if (now_time > time_out) { - VLOG(9) << "ll-scan timeout. Mark next start key: " << DebugString(tera_key.ToString()); - MakeKvPair(key, col, qual, ts, "", &next_start_kv_pair); + *is_complete = true; break; } @@ -919,6 +931,12 @@ inline bool TabletIO::LowLevelScan(const std::string& start_tera_key, *read_row_count += 1; ProcessRowBuffer(row_buf, scan_options, value_list, &buffer_size, &number_limit); row_buf.clear(); + + if (now_time > time_out && (next_start_point != NULL)) { + VLOG(9) << "ll-scan timeout. Mark next start key: " << DebugString(tera_key.ToString()); + MakeKvPair(key, col, qual, ts, "", next_start_point); + break; + } } // max version filter @@ -935,8 +953,7 @@ inline bool TabletIO::LowLevelScan(const std::string& start_tera_key, last_qual.assign(qual.data(), qual.size()); version_num = 1; int64_t merged_num = 0; - has_merged = - compact_strategy->ScanMergedValue(it, &merged_value, &merged_num); + has_merged = compact_strategy->ScanMergedValue(it, &merged_value, &merged_num); if (has_merged) { counter_.low_read_cell.Add(merged_num - 1); value = merged_value; @@ -970,6 +987,7 @@ inline bool TabletIO::LowLevelScan(const std::string& start_tera_key, it->Next(); } } + *is_complete = !it->Valid() ? true : *is_complete; if (ScanWithFilter(scan_options) && it->Valid() @@ -981,30 +999,13 @@ inline bool TabletIO::LowLevelScan(const std::string& start_tera_key, ProcessRowBuffer(row_buf, scan_options, value_list, &buffer_size, &number_limit); } - leveldb::Status it_status; - if (!it->Valid()) { - it_status = it->status(); - } - - if (!it_status.ok()) { - SetStatusCode(it_status, status); - VLOG(10) << "ll-seek fail: " << "tablet=[" << tablet_path_ << - "] status=[" << StatusCodeToString(*status); + if (!it->Valid() && !(it->status().ok())) { + SetStatusCode(it->status(), status); + VLOG(10) << "ll-scan fail: " << "tablet=[" << tablet_path_ << "], " + << "status=[" << StatusCodeToString(*status) << "]"; return false; } - - // check if scan finished SetStatusCode(kTabletNodeOk, status); - if ((buffer_size < scan_options.max_size) && - (number_limit < scan_options.number_limit) && - (now_time <= time_out)) { - *is_complete = true; - } else { - if (now_time > time_out && next_start_point) { - next_start_point->CopyFrom(next_start_kv_pair); - } - *is_complete = false; - } return true; } @@ -1038,6 +1039,7 @@ bool TabletIO::LowLevelSeek(const std::string& row_key, } } read_option.rollbacks = rollbacks_; + SetupSingleRowIteratorOptions(row_key, &read_option); leveldb::Iterator* it_data = db_->NewIterator(read_option); TearDownIteratorOptions(&read_option); @@ -1441,12 +1443,10 @@ bool TabletIO::HandleScan(const ScanTabletRequest* request, // first rpc init iterator and scan parameter if (context->it == NULL) { - std::string start_tera_key; - std::string end_row_key; SetupScanInternalTeraKey(request, &(context->start_tera_key), &(context->end_row_key)); SetupScanRowOptions(request, &(context->scan_options)); - context->ret_code = InitedScanInterator(context->start_tera_key, context->scan_options, - &(context->it)); + context->ret_code = InitedScanIterator(context->start_tera_key, context->end_row_key, + context->scan_options, &(context->it)); context->compact_strategy = ldb_options_.compact_strategy_factory->NewInstance(); } // schedule scan context @@ -1796,6 +1796,18 @@ void TabletIO::SetupIteratorOptions(const ScanOptions& scan_options, } } +void TabletIO::SetupSingleRowIteratorOptions(const std::string& row_key, + leveldb::ReadOptions* opts) { + std::string row_start_key, row_end_key; + key_operator_->EncodeTeraKey(row_key, "", "", kLatestTs, + leveldb::TKT_FORSEEK, &row_start_key); + key_operator_->EncodeTeraKey(row_key + '\0', "", "", kLatestTs, + leveldb::TKT_FORSEEK, &row_end_key); + opts->read_single_row = true; + opts->row_start_key = row_start_key; + opts->row_end_key = row_end_key; +} + void TabletIO::TearDownIteratorOptions(leveldb::ReadOptions* opts) { if (opts->target_lgs) { delete opts->target_lgs; diff --git a/src/io/tablet_io.h b/src/io/tablet_io.h index 421b26428..ba5cd99cf 100644 --- a/src/io/tablet_io.h +++ b/src/io/tablet_io.h @@ -5,15 +5,13 @@ #ifndef TERA_IO_TABLET_IO_H_ #define TERA_IO_TABLET_IO_H_ +#include #include #include #include #include #include -#include -#include - #include "common/base/scoped_ptr.h" #include "common/mutex.h" #include "io/tablet_scanner.h" @@ -40,6 +38,11 @@ class ScanContextManager; class TabletIO { public: + enum CompactionType { + kManualCompaction = 1, + kMinorCompaction = 2, + }; + enum TabletStatus { kNotInit = kTabletNotInit, kReady = kTabletReady, @@ -63,11 +66,14 @@ class TabletIO { tera::Counter write_size; }; - typedef boost::function*, - std::vector*)> WriteCallback; + typedef std::function*, + std::vector*)> WriteCallback; + + friend std::ostream& operator << (std::ostream& o, const TabletIO& tablet_io); public: - TabletIO(const std::string& key_start, const std::string& key_end); + TabletIO(const std::string& key_start, const std::string& key_end, + const std::string& path); virtual ~TabletIO(); // for testing @@ -96,8 +102,7 @@ class TabletIO { StatusCode* status = NULL); virtual bool Unload(StatusCode* status = NULL); virtual bool Split(std::string* split_key, StatusCode* status = NULL); - virtual bool Compact(int lg_no = -1, StatusCode* status = NULL); - bool CompactMinor(StatusCode* status = NULL); + virtual bool Compact(int lg_no = -1, StatusCode* status = NULL, CompactionType type = kManualCompaction); bool Destroy(StatusCode* status = NULL); virtual bool GetDataSize(uint64_t* size, std::vector* lgsize = NULL, StatusCode* status = NULL); @@ -181,6 +186,8 @@ class TabletIO { void SetupIteratorOptions(const ScanOptions& scan_options, leveldb::ReadOptions* leveldb_opts); + void SetupSingleRowIteratorOptions(const std::string& row_key, + leveldb::ReadOptions* opts); void TearDownIteratorOptions(leveldb::ReadOptions* opts); void ProcessRowBuffer(std::list& row_buf, @@ -189,9 +196,10 @@ class TabletIO { uint32_t* buffer_size, int64_t* number_limit); - StatusCode InitedScanInterator(const std::string& start_tera_key, - const ScanOptions& scan_options, - leveldb::Iterator** scan_it); + StatusCode InitedScanIterator(const std::string& start_tera_key, + const std::string& end_row_key, + const ScanOptions& scan_options, + leveldb::Iterator** scan_it); bool ScanRowsRestricted(const ScanTabletRequest* request, ScanTabletResponse* response, @@ -249,6 +257,7 @@ class TabletIO { std::string tablet_path_; const std::string start_key_; const std::string end_key_; + const std::string short_path_; std::string raw_start_key_; std::string raw_end_key_; CompactStatus compact_status_; diff --git a/src/io/tablet_writer.cc b/src/io/tablet_writer.cc index d377d5f15..066e8bf33 100644 --- a/src/io/tablet_writer.cc +++ b/src/io/tablet_writer.cc @@ -6,7 +6,6 @@ #include -#include #include #include @@ -56,7 +55,7 @@ void TabletWriter::Start() { stopped_ = false; } LOG(INFO) << "start tablet writer ..."; - thread_.Start(boost::bind(&TabletWriter::DoWork, this)); + thread_.Start(std::bind(&TabletWriter::DoWork, this)); ThisThread::Yield(); } @@ -193,177 +192,241 @@ bool TabletWriter::SwapActiveBuffer(bool force) { return true; } -void TabletWriter::BatchRequest(const std::vector& row_mutation_vec, +void TabletWriter::BatchRequest(WriteTaskBuffer* task_buffer, leveldb::WriteBatch* batch) { int64_t timestamp_old = 0; - - for (uint32_t i = 0; i < row_mutation_vec.size(); ++i) { - const RowMutationSequence& row_mu = *row_mutation_vec[i]; - const std::string& row_key = row_mu.row_key(); - int32_t mu_num = row_mu.mutation_sequence().size(); - if (mu_num == 0) { - continue; - } - if (tablet_->KvOnly()) { - // only the last mutation take effect for kv - const Mutation& mu = row_mu.mutation_sequence().Get(mu_num - 1); - std::string tera_key; - if (tablet_->GetSchema().raw_key() == TTLKv) { // TTL-KV - if (mu.ttl() == -1) { // never expires - tablet_->GetRawKeyOperator()->EncodeTeraKey(row_key, "", "", - kLatestTs, leveldb::TKT_FORSEEK, &tera_key); - } else { // no check of overflow risk ... - tablet_->GetRawKeyOperator()->EncodeTeraKey(row_key, "", "", - get_micros() / 1000000 + mu.ttl(), leveldb::TKT_FORSEEK, &tera_key); - } - } else { // Readable-KV - tera_key.assign(row_key); + for (uint32_t task_idx = 0; task_idx < task_buffer->size(); ++task_idx) { + WriteTask& task = (*task_buffer)[task_idx]; + const std::vector& row_mutation_vec = *(task.row_mutation_vec); + std::vector* status_vec = task.status_vec; + + for (uint32_t i = 0; i < row_mutation_vec.size(); ++i) { + StatusCode* status = &((*status_vec)[i]); + const RowMutationSequence& row_mu = *row_mutation_vec[i]; + const std::string& row_key = row_mu.row_key(); + int32_t mu_num = row_mu.mutation_sequence().size(); + if (*status != kTabletNodeOk) { + VLOG(11) << "batch write fail, row " << DebugString(row_key) + << ", status " << StatusCodeToString(*status); + continue; } - if (mu.type() == kPut) { - batch->Put(tera_key, mu.value()); - } else { - batch->Delete(tera_key); + if (mu_num == 0) { + continue; } - } else { - for (int32_t t = 0; t < mu_num; ++t) { - const Mutation& mu = row_mu.mutation_sequence().Get(t); + if (tablet_->KvOnly()) { + // only the last mutation take effect for kv + const Mutation& mu = row_mu.mutation_sequence().Get(mu_num - 1); std::string tera_key; - leveldb::TeraKeyType type = leveldb::TKT_VALUE; - switch (mu.type()) { - case kDeleteRow: - type = leveldb::TKT_DEL; - break; - case kDeleteFamily: - type = leveldb::TKT_DEL_COLUMN; - break; - case kDeleteColumn: - type = leveldb::TKT_DEL_QUALIFIER; - break; - case kDeleteColumns: - type = leveldb::TKT_DEL_QUALIFIERS; - break; - case kAdd: - type = leveldb::TKT_ADD; - break; - case kAddInt64: - type = leveldb::TKT_ADDINT64; - break; - case kPutIfAbsent: - type = leveldb::TKT_PUT_IFABSENT; - break; - case kAppend: - type = leveldb::TKT_APPEND; - break; - default: - break; + if (tablet_->GetSchema().raw_key() == TTLKv) { // TTL-KV + if (mu.ttl() == -1) { // never expires + tablet_->GetRawKeyOperator()->EncodeTeraKey(row_key, "", "", + kLatestTs, leveldb::TKT_FORSEEK, &tera_key); + } else { // no check of overflow risk ... + tablet_->GetRawKeyOperator()->EncodeTeraKey(row_key, "", "", + get_micros() / 1000000 + mu.ttl(), leveldb::TKT_FORSEEK, &tera_key); + } + } else { // Readable-KV + tera_key.assign(row_key); } - int64_t timestamp = get_unique_micros(timestamp_old); - timestamp_old = timestamp; - if (!tablet_->GetSchema().enable_txn() && - leveldb::TeraKey::IsTypeAllowUserSetTimestamp(type) && - mu.has_timestamp() && mu.timestamp() < timestamp) { - timestamp = mu.timestamp(); + if (mu.type() == kPut) { + batch->Put(tera_key, mu.value()); + } else { + batch->Delete(tera_key); } - tablet_->GetRawKeyOperator()->EncodeTeraKey(row_key, mu.family(), mu.qualifier(), - timestamp, type, &tera_key); - uint32_t lg_id = 0; - size_t lg_num = tablet_->ldb_options_.exist_lg_list->size(); - if (lg_num > 1) { - if (type != leveldb::TKT_DEL) { - lg_id = tablet_->GetLGidByCFName(mu.family()); - leveldb::PutFixed32LGId(&tera_key, lg_id); - VLOG(10) << "Batch Request, key: " << DebugString(row_key) - << " family: " << mu.family() << ", lg_id: " << lg_id; - batch->Put(tera_key, mu.value()); - } else { - // put row_del mark to all LGs - for (lg_id = 0; lg_id < lg_num; ++lg_id) { - std::string tera_key_tmp = tera_key; - leveldb::PutFixed32LGId(&tera_key_tmp, lg_id); + } else { + for (int32_t t = 0; t < mu_num; ++t) { + const Mutation& mu = row_mu.mutation_sequence().Get(t); + std::string tera_key; + leveldb::TeraKeyType type = leveldb::TKT_VALUE; + switch (mu.type()) { + case kDeleteRow: + type = leveldb::TKT_DEL; + break; + case kDeleteFamily: + type = leveldb::TKT_DEL_COLUMN; + break; + case kDeleteColumn: + type = leveldb::TKT_DEL_QUALIFIER; + break; + case kDeleteColumns: + type = leveldb::TKT_DEL_QUALIFIERS; + break; + case kAdd: + type = leveldb::TKT_ADD; + break; + case kAddInt64: + type = leveldb::TKT_ADDINT64; + break; + case kPutIfAbsent: + type = leveldb::TKT_PUT_IFABSENT; + break; + case kAppend: + type = leveldb::TKT_APPEND; + break; + default: + break; + } + int64_t timestamp = get_unique_micros(timestamp_old); + timestamp_old = timestamp; + if (!tablet_->GetSchema().enable_txn() && + leveldb::TeraKey::IsTypeAllowUserSetTimestamp(type) && + mu.has_timestamp() && mu.timestamp() < timestamp) { + timestamp = mu.timestamp(); + } + tablet_->GetRawKeyOperator()->EncodeTeraKey(row_key, mu.family(), mu.qualifier(), + timestamp, type, &tera_key); + uint32_t lg_id = 0; + size_t lg_num = tablet_->ldb_options_.exist_lg_list->size(); + if (lg_num > 1) { + if (type != leveldb::TKT_DEL) { + lg_id = tablet_->GetLGidByCFName(mu.family()); + leveldb::PutFixed32LGId(&tera_key, lg_id); VLOG(10) << "Batch Request, key: " << DebugString(row_key) << " family: " << mu.family() << ", lg_id: " << lg_id; - batch->Put(tera_key_tmp, mu.value()); + batch->Put(tera_key, mu.value()); + } else { + // put row_del mark to all LGs + for (lg_id = 0; lg_id < lg_num; ++lg_id) { + std::string tera_key_tmp = tera_key; + leveldb::PutFixed32LGId(&tera_key_tmp, lg_id); + VLOG(10) << "Batch Request, key: " << DebugString(row_key) + << " family: " << mu.family() << ", lg_id: " << lg_id; + batch->Put(tera_key_tmp, mu.value()); + } } + } else { + VLOG(10) << "Batch Request, key: " << DebugString(row_key) + << " family: " << mu.family() << ", qualifier " << mu.qualifier() + << ", ts " << timestamp << ", type " << type << ", lg_id: " << lg_id; + batch->Put(tera_key, mu.value()); } - } else { - VLOG(10) << "Batch Request, key: " << DebugString(row_key) - << " family: " << mu.family() << ", lg_id: " << lg_id; - batch->Put(tera_key, mu.value()); } } } } + return; +} + +void TabletWriter::FinishTask(WriteTaskBuffer* task_buffer, StatusCode status) { + for (uint32_t task_idx = 0; task_idx < task_buffer->size(); ++task_idx) { + WriteTask& task = (*task_buffer)[task_idx]; + tablet_->GetCounter().write_rows.Add(task.row_mutation_vec->size()); + for (uint32_t i = 0; i < task.row_mutation_vec->size(); i++) { + tablet_->GetCounter().write_kvs.Add((*task.row_mutation_vec)[i]->mutation_sequence_size()); + // set batch_write status for row_mu + if ((*task.status_vec)[i] == kTabletNodeOk) { + (*task.status_vec)[i] = status; + } + } + task.callback(task.row_mutation_vec, task.status_vec); + } + return; } -bool TabletWriter::CheckConflict(const RowMutationSequence& row_mu, - std::set* commit_row_key_set, - StatusCode* status) { +// set status to kTxnFail, if transaction conflicts. +bool TabletWriter::CheckSingleRowTxnConflict(const RowMutationSequence& row_mu, + std::set* commit_row_key_set, + StatusCode* status) { const std::string& row_key = row_mu.row_key(); if (row_mu.txn_read_info().has_read()) { if (!tablet_->GetSchema().enable_txn()) { VLOG(10) << "txn of row " << DebugString(row_key) << " is interrupted: txn not enabled"; SetStatusCode(kTxnFail, status); - return false; + return true; } if (commit_row_key_set->find(row_key) != commit_row_key_set->end()) { VLOG(10) << "txn of row " << DebugString(row_key) << " is interrupted: found same row in one batch"; SetStatusCode(kTxnFail, status); - return false; + return true; } if (!tablet_->SingleRowTxnCheck(row_key, row_mu.txn_read_info(), status)) { VLOG(10) << "txn of row " << DebugString(row_key) << " is interrupted: check fail, status: " << StatusCodeToString(*status); - return false; + return true; } VLOG(10) << "txn of row " << DebugString(row_key) << " check pass"; } commit_row_key_set->insert(row_key); - return true; + return false; } -void TabletWriter::FinishTask(const WriteTask& task, StatusCode status) { - int32_t row_num = task.row_mutation_vec->size(); - tablet_->GetCounter().write_rows.Add(row_num); - for (int32_t i = 0; i < row_num; i++) { - tablet_->GetCounter().write_kvs.Add((*task.row_mutation_vec)[i]->mutation_sequence_size()); - if ((*task.status_vec)[i] == kTabletNodeOk) { - (*task.status_vec)[i] = status; +bool TabletWriter::CheckIllegalRowArg(const RowMutationSequence& row_mu, + const std::set& cf_set, + StatusCode* status) { + // check arguments + if (row_mu.row_key().size() >= 64 * 1024) { + SetStatusCode(kTableInvalidArg, status); + return true; + } + for (int32_t i = 0; i < row_mu.mutation_sequence().size(); ++i) { + const Mutation& mu = row_mu.mutation_sequence(i); + if (mu.value().size() >= 32 * 1024 * 1024) { + SetStatusCode(kTableInvalidArg, status); + return true; + } + if (!tablet_->KvOnly()) { + if (mu.qualifier().size() >= 64 * 1024) { // 64KB + SetStatusCode(kTableInvalidArg, status); + return true; + } + if (mu.type() != kDeleteRow && + (cf_set.find(mu.family()) == cf_set.end())) { + SetStatusCode(kTableInvalidArg, status); + VLOG(11) << "batch write check, illegal cf, row " << DebugString(row_mu.row_key()) + << ", cf " << mu.family() << ", qu " << mu.qualifier() + << ", ts " << mu.timestamp() << ", type " << mu.type() + << ", cf_set.size " << cf_set.size() + << ", status " << StatusCodeToString(*status); + return true; + } } } - task.callback(task.row_mutation_vec, task.status_vec); + return false; } -StatusCode TabletWriter::FlushToDiskBatch(WriteTaskBuffer* task_buffer) { - size_t task_num = task_buffer->size(); - leveldb::WriteBatch batch; +void TabletWriter::CheckRows(WriteTaskBuffer* task_buffer) { + std::set cf_set; + TableSchema schema = tablet_->GetSchema(); + for (int32_t cf_idx = 0; cf_idx < schema.column_families_size(); ++cf_idx) { + cf_set.insert(schema.column_families(cf_idx).name()); + } std::set commit_row_key_set; - std::vector commit_row_mu_vec; - for (size_t i = 0; i < task_num; ++i) { - WriteTask& task = (*task_buffer)[i]; + for (uint32_t task_idx = 0; task_idx < task_buffer->size(); ++task_idx) { + WriteTask& task = (*task_buffer)[task_idx]; std::vector& row_mutation_vec = *task.row_mutation_vec; std::vector& status_vec = *task.status_vec; - for (size_t j = 0; j < row_mutation_vec.size(); ++j) { - const RowMutationSequence* row_mu = row_mutation_vec[j]; - if (CheckConflict(*row_mu, &commit_row_key_set, &status_vec[j])) { - commit_row_mu_vec.push_back(row_mu); - status_vec[j] = kTabletNodeOk; + + for (uint32_t row_idx = 0; row_idx < row_mutation_vec.size(); ++row_idx) { + const RowMutationSequence* row_mu = row_mutation_vec[row_idx]; + if(CheckSingleRowTxnConflict(*row_mu, &commit_row_key_set, &status_vec[row_idx])) { + continue; } + if (CheckIllegalRowArg(*row_mu, cf_set, &status_vec[row_idx])) { + continue; + } + status_vec[row_idx] = kTabletNodeOk; } } - BatchRequest(commit_row_mu_vec, &batch); + return; +} +StatusCode TabletWriter::FlushToDiskBatch(WriteTaskBuffer* task_buffer) { + int64_t ts = get_micros(); + CheckRows(task_buffer); + + leveldb::WriteBatch batch; + BatchRequest(task_buffer, &batch); StatusCode status = kTabletNodeOk; const bool disable_wal = false; tablet_->WriteBatch(&batch, disable_wal, FLAGS_tera_sync_log, &status); batch.Clear(); - for (size_t i = 0; i < task_num; i++) { - FinishTask((*task_buffer)[i], status); - } - VLOG(7) << "finish a batch: " << task_num; + + FinishTask(task_buffer, status); + VLOG(7) << "finish a batch: " << task_buffer->size() << ", use " << get_micros() - ts; return status; } diff --git a/src/io/tablet_writer.h b/src/io/tablet_writer.h index ae327b9ad..561db7b1d 100644 --- a/src/io/tablet_writer.h +++ b/src/io/tablet_writer.h @@ -5,6 +5,8 @@ #ifndef TERA_TABLETNODE_TABLET_WRITER_H_ #define TERA_TABLETNODE_TABLET_WRITER_H_ +#include + #include "common/event.h" #include "common/mutex.h" #include "common/thread.h" @@ -23,8 +25,8 @@ class TabletIO; class TabletWriter { public: - typedef boost::function*, \ - std::vector*)> WriteCallback; + typedef std::function*, \ + std::vector*)> WriteCallback; struct WriteTask { std::vector* row_mutation_vec; @@ -50,13 +52,17 @@ class TabletWriter { void DoWork(); bool SwapActiveBuffer(bool force); /// 把一个request打到一个leveldbbatch里去, request是原子的, batch也是, so .. - void BatchRequest(const std::vector& row_mutation_vec, + void BatchRequest(WriteTaskBuffer* task_buffer, leveldb::WriteBatch* batch); - bool CheckConflict(const RowMutationSequence& row_mu, - std::set* commit_row_key_set, - StatusCode* status = NULL); + bool CheckSingleRowTxnConflict(const RowMutationSequence& row_mu, + std::set* commit_row_key_set, + StatusCode* status); + bool CheckIllegalRowArg(const RowMutationSequence& row_mu, + const std::set& cf_set, + StatusCode* status); + void CheckRows(WriteTaskBuffer* task_buffer); /// 任务完成, 执行回调 - void FinishTask(const WriteTask& task, StatusCode status); + void FinishTask(WriteTaskBuffer* task_buffer, StatusCode status); /// 将buffer刷到磁盘(leveldb), 并sync StatusCode FlushToDiskBatch(WriteTaskBuffer* task_buffer); diff --git a/src/io/test/load_test.cc b/src/io/test/load_test.cc index fe0867d09..714758a5f 100644 --- a/src/io/test/load_test.cc +++ b/src/io/test/load_test.cc @@ -95,7 +95,7 @@ TEST_F(TabletIOTest, General) { std::string key_end = ""; StatusCode status; - TabletIO tablet(key_start, key_end); + TabletIO tablet(key_start, key_end, tablet_path); leveldb::MockEnv* env = (leveldb::MockEnv*)LeveldbMockEnv(); env->SetPrefix(mock_env_prefix); tablet.SetMockEnv(env); @@ -136,7 +136,7 @@ TEST_F(TabletIOTest, CurrentLost) { std::string key_end = ""; StatusCode status; - TabletIO tablet(key_start, key_end); + TabletIO tablet(key_start, key_end, tablet_path); leveldb::MockEnv* env = (leveldb::MockEnv*)LeveldbMockEnv(); env->SetPrefix(mock_env_prefix); env->SetGetChildrenCallback(DropCurrent); @@ -167,7 +167,7 @@ TEST_F(TabletIOTest, CurrentReadFailed) { std::string key_end = ""; StatusCode status; - TabletIO tablet(key_start, key_end); + TabletIO tablet(key_start, key_end, tablet_path); leveldb::MockEnv* env = (leveldb::MockEnv*)LeveldbMockEnv(); env->SetPrefix(mock_env_prefix); env->SetNewSequentialFileFailedCallback(CannotReadCurrent); @@ -204,7 +204,7 @@ TEST_F(TabletIOTest, CurrentCorrupted) { std::string key_end = ""; StatusCode status; - TabletIO tablet(key_start, key_end); + TabletIO tablet(key_start, key_end, tablet_path); leveldb::MockEnv* env = (leveldb::MockEnv*)LeveldbMockEnv(); env->SetPrefix(mock_env_prefix); @@ -242,7 +242,7 @@ TEST_F(TabletIOTest, ManifestLost) { std::string key_end = ""; StatusCode status; - TabletIO tablet(key_start, key_end); + TabletIO tablet(key_start, key_end, tablet_path); leveldb::MockEnv* env = (leveldb::MockEnv*)LeveldbMockEnv(); env->SetPrefix(mock_env_prefix); @@ -273,7 +273,7 @@ TEST_F(TabletIOTest, ManifestReadFailed) { std::string key_end = ""; StatusCode status; - TabletIO tablet(key_start, key_end); + TabletIO tablet(key_start, key_end, tablet_path); leveldb::MockEnv* env = (leveldb::MockEnv*)LeveldbMockEnv(); env->SetPrefix(mock_env_prefix); env->SetNewSequentialFileFailedCallback(CannotReadManifest); @@ -310,7 +310,7 @@ TEST_F(TabletIOTest, ManifestCorrupted) { std::string key_end = ""; StatusCode status; - TabletIO tablet(key_start, key_end); + TabletIO tablet(key_start, key_end, tablet_path); leveldb::MockEnv* env = (leveldb::MockEnv*)LeveldbMockEnv(); env->SetPrefix(mock_env_prefix); @@ -341,7 +341,7 @@ TEST_F(TabletIOTest, SstLost) { std::string key_end = ""; StatusCode status; - TabletIO tablet(key_start, key_end); + TabletIO tablet(key_start, key_end, tablet_path); leveldb::MockEnv* env = (leveldb::MockEnv*)LeveldbMockEnv(); env->SetPrefix(mock_env_prefix); @@ -364,7 +364,7 @@ TEST_F(TabletIOTest, SstLostButIgnore) { std::string key_end = ""; StatusCode status; - TabletIO tablet(key_start, key_end); + TabletIO tablet(key_start, key_end, tablet_path); leveldb::MockEnv* env = (leveldb::MockEnv*)LeveldbMockEnv(); std::string fname = mock_env_prefix + tablet_path + "/0/__oops"; @@ -386,6 +386,7 @@ TEST_F(TabletIOTest, SstLostButIgnore) { empty_snaphsots_, empty_rollback_, ldb_logger, NULL, NULL, &status)); env->ResetMock(); + close(fd); } //#endif diff --git a/src/io/test/tablet_io_test.cc b/src/io/test/tablet_io_test.cc index 491a9c3e2..90da431f9 100644 --- a/src/io/test/tablet_io_test.cc +++ b/src/io/test/tablet_io_test.cc @@ -20,6 +20,7 @@ #include "proto/status_code.pb.h" #include "utils/timer.h" #include "utils/utils_cmd.h" +#include "utils/string_util.h" #include "io/tablet_scanner.h" DECLARE_string(tera_tabletnode_path_prefix); @@ -47,8 +48,8 @@ class TabletIOTest : public ::testing::Test { } ~TabletIOTest() { - std::string cmd = std::string("rm -rf ") + working_dir; - system(cmd.c_str()); + std::string cmd = std::string("rm -rf ") + working_dir; + system(cmd.c_str()); } const TableSchema& GetTableSchema() { @@ -90,7 +91,7 @@ TEST_F(TabletIOTest, General) { std::string key_end = ""; StatusCode status; - TabletIO tablet(key_start, key_end); + TabletIO tablet(key_start, key_end, tablet_path); EXPECT_TRUE(tablet.Load(TableSchema(), tablet_path, std::vector(), empty_snaphsots_, empty_rollback_, NULL, NULL, NULL, &status)); @@ -115,7 +116,7 @@ TEST_F(TabletIOTest, Split) { StatusCode status; uint64_t size = 0; - TabletIO tablet(key_start, key_end); + TabletIO tablet(key_start, key_end, tablet_path); EXPECT_TRUE(tablet.Load(TableSchema(), tablet_path, std::vector(), empty_snaphsots_, empty_rollback_, NULL, NULL, NULL, &status)); @@ -136,7 +137,7 @@ TEST_F(TabletIOTest, Split) { // open tablet for other key scope key_start = "5000"; key_end = "8000"; - TabletIO other_tablet(key_start, key_end); + TabletIO other_tablet(key_start, key_end, tablet_path); EXPECT_TRUE(other_tablet.Load(TableSchema(), tablet_path, std::vector(), empty_snaphsots_, empty_rollback_, NULL, NULL, NULL, &status)); other_tablet.GetDataSize(&size, NULL, &status); @@ -152,7 +153,7 @@ TEST_F(TabletIOTest, Split) { key_start = ""; key_end = "5000"; - TabletIO l_tablet(key_start, key_end); + TabletIO l_tablet(key_start, key_end, tablet_path); EXPECT_TRUE(l_tablet.Load(TableSchema(), tablet_path, std::vector(), empty_snaphsots_, empty_rollback_, NULL, NULL, NULL, &status)); l_tablet.GetDataSize(&size, NULL, &status); @@ -162,7 +163,7 @@ TEST_F(TabletIOTest, Split) { key_start = "8000"; key_end = ""; - TabletIO r_tablet(key_start, key_end); + TabletIO r_tablet(key_start, key_end, tablet_path); EXPECT_TRUE(r_tablet.Load(TableSchema(), tablet_path, std::vector(), empty_snaphsots_, empty_rollback_, NULL, NULL, NULL, &status)); r_tablet.GetDataSize(&size, NULL, &status); @@ -179,7 +180,7 @@ TEST_F(TabletIOTest, SplitAndCheckSize) { StatusCode status; uint64_t size = 0; - TabletIO tablet(key_start, key_end); + TabletIO tablet(key_start, key_end, tablet_path); EXPECT_TRUE(tablet.Load(TableSchema(), tablet_path, std::vector(), empty_snaphsots_, empty_rollback_, NULL, NULL, NULL, &status)); @@ -199,7 +200,7 @@ TEST_F(TabletIOTest, SplitAndCheckSize) { EXPECT_TRUE(tablet.Unload()); // open from split key to check scope size - TabletIO l_tablet(key_start, split_key); + TabletIO l_tablet(key_start, split_key, tablet_path); EXPECT_TRUE(l_tablet.Load(TableSchema(), tablet_path, std::vector(), empty_snaphsots_, empty_rollback_, NULL, NULL, NULL, &status)); l_tablet.GetDataSize(&size, NULL, &status); @@ -207,7 +208,7 @@ TEST_F(TabletIOTest, SplitAndCheckSize) { << "]: size = " << size; EXPECT_TRUE(l_tablet.Unload()); - TabletIO r_tablet(split_key, key_end); + TabletIO r_tablet(split_key, key_end, tablet_path); EXPECT_TRUE(r_tablet.Load(TableSchema(), tablet_path, std::vector(), empty_snaphsots_, empty_rollback_, NULL, NULL, NULL, &status)); r_tablet.GetDataSize(&size, NULL, &status); @@ -224,7 +225,7 @@ TEST_F(TabletIOTest, OverWrite) { std::string key_end = ""; StatusCode status; - TabletIO tablet(key_start, key_end); + TabletIO tablet(key_start, key_end, tablet_path); EXPECT_TRUE(tablet.Load(TableSchema(), tablet_path, std::vector(), empty_snaphsots_, empty_rollback_, NULL, NULL, NULL, &status)); @@ -250,7 +251,7 @@ TEST_F(TabletIOTest, Compact) { std::string key_end = ""; StatusCode status; - TabletIO tablet(key_start, key_end); + TabletIO tablet(key_start, key_end, tablet_path); EXPECT_TRUE(tablet.Load(TableSchema(), tablet_path, std::vector(), empty_snaphsots_, empty_rollback_, NULL, NULL, NULL, &status)); @@ -266,7 +267,7 @@ TEST_F(TabletIOTest, Compact) { // open another scope std::string new_key_start = StringFormat("%011llu", 5); // NumberToString(500); std::string new_key_end = StringFormat("%011llu", 50); // NumberToString(800); - TabletIO new_tablet(new_key_start, new_key_end); + TabletIO new_tablet(new_key_start, new_key_end, tablet_path); EXPECT_TRUE(new_tablet.Load(TableSchema(), tablet_path, std::vector(), empty_snaphsots_, empty_rollback_, NULL, NULL, NULL, &status)); EXPECT_TRUE(new_tablet.Compact(0, &status)); @@ -296,7 +297,7 @@ TEST_F(TabletIOTest, LowLevelScan) { std::string key_end = ""; StatusCode status; - TabletIO tablet(key_start, key_end); + TabletIO tablet(key_start, key_end, tablet_path); EXPECT_TRUE(tablet.Load(GetTableSchema(), tablet_path, std::vector(), empty_snaphsots_, empty_rollback_, NULL, NULL, NULL, &status)); @@ -379,7 +380,7 @@ TEST_F(TabletIOTest, SplitToSubTable) { StatusCode status; uint64_t size = 0; - TabletIO tablet(key_start, key_end); + TabletIO tablet(key_start, key_end, tablet_path); EXPECT_TRUE(tablet.Load(TableSchema(), tablet_path, std::vector(), empty_snaphsots_, empty_rollback_, NULL, NULL, NULL, &status)); @@ -415,7 +416,7 @@ TEST_F(TabletIOTest, SplitToSubTable) { parent_tablet.push_back(1); // 1. load sub-table 1 - TabletIO l_tablet(key_start, split_key); + TabletIO l_tablet(key_start, split_key, split_path_1); EXPECT_TRUE(l_tablet.Load(TableSchema(), split_path_1, parent_tablet, empty_snaphsots_, empty_rollback_, NULL, NULL, NULL, &status)); l_tablet.GetDataSize(&size, NULL, &status); @@ -433,7 +434,7 @@ TEST_F(TabletIOTest, SplitToSubTable) { EXPECT_TRUE(l_tablet.Unload()); // 2. load sub-table 2 - TabletIO r_tablet(split_key, key_end); + TabletIO r_tablet(split_key, key_end, split_path_2); EXPECT_TRUE(r_tablet.Load(TableSchema(), split_path_2, parent_tablet, empty_snaphsots_, empty_rollback_, NULL, NULL, NULL, &status)); r_tablet.GetDataSize(&size, NULL, &status); @@ -503,7 +504,7 @@ TEST_F(TabletIOTest, FindAverageKey) { //ASSERT_EQ(ave, "\xb0"); ASSERT_LT(start, ave); ASSERT_NE(ave, start); - std::cout << start << ", " << ave << ", " << std::endl; + std::cout << DebugString(start) << ", " << DebugString(ave) << ", " << std::endl; start = "000000000000001480186993"; end = "000000000000002147352684"; @@ -542,6 +543,71 @@ TEST_F(TabletIOTest, FindAverageKey) { ASSERT_TRUE(TabletIO::FindAverageKey(start, end, &ave)); ASSERT_EQ(ave, "a\xff\xff\x80"); } + +TEST_F(TabletIOTest, RowBloomFilter) { + const int32_t NR = 10000; + const int32_t CR = 10; + std::string tablet_path = working_dir + "row_bloomfilter"; + std::string key_start = ""; + std::string key_end = ""; + StatusCode status; + + TabletIO tablet(key_start, key_end, tablet_path); + EXPECT_TRUE(tablet.Load(GetTableSchema(), tablet_path, std::vector(), + empty_snaphsots_, empty_rollback_, NULL, NULL, NULL, &status)); + + // prepare data + leveldb::WriteBatch batch; + for (int32_t i = 0; i < NR; i++) { + char buf[16]; + snprintf(buf, sizeof(buf), "%06d", i); + std::string row(buf); + + for (int32_t j = 0; j < CR; j++) { + char buf[16]; + snprintf(buf, sizeof(buf), "%03d", j); + std::string col(buf); + + std::string tera_key; + tablet.GetRawKeyOperator()->EncodeTeraKey(row, "column", col, get_micros(), + leveldb::TKT_VALUE, &tera_key); + batch.Put(tera_key, ""); + } + } + ASSERT_TRUE(tablet.WriteBatch(&batch, false, true, NULL)); + + // read and verify + for (int32_t i = 0; i < NR; i++) { + char buf[16]; + snprintf(buf, sizeof(buf), "%06d", i); + std::string row(buf); + + std::string start_tera_key; + tablet.GetRawKeyOperator()->EncodeTeraKey(row, "", "", kLatestTs, leveldb::TKT_FORSEEK, + &start_tera_key); + std::string end_row_key = row + '\0'; + + RowResult value_list; + KeyValuePair next_start_point; + uint32_t read_row_count = 0; + uint32_t read_bytes = 0; + bool is_complete = false; + ASSERT_TRUE(tablet.LowLevelScan(start_tera_key, end_row_key, ScanOptions(), &value_list, + &next_start_point, &read_row_count, &read_bytes, + &is_complete, NULL)); + ASSERT_EQ(value_list.key_values_size(), CR); + for (int32_t j = 0; j < CR; j++) { + char buf[16]; + snprintf(buf, sizeof(buf), "%03d", j); + std::string col(buf); + + const KeyValuePair& kv = value_list.key_values(j); + EXPECT_EQ(kv.key(), row); + EXPECT_EQ(kv.qualifier(), col); + } + } +} + } // namespace io } // namespace tera diff --git a/src/io/test/tablet_scanner_test.cc b/src/io/test/tablet_scanner_test.cc index 125e2354a..a53f2d52a 100644 --- a/src/io/test/tablet_scanner_test.cc +++ b/src/io/test/tablet_scanner_test.cc @@ -4,9 +4,9 @@ #include "io/tablet_io.h" +#include #include -#include #include "gflags/gflags.h" #include "glog/logging.h" #include "gtest/gtest.h" @@ -39,6 +39,10 @@ const std::string working_dir = "testdata/"; class TabletScannerTest : public ::testing::Test { public: TabletScannerTest() { + session_id_ = 0; + last_key_= 0; + done_cnt_ = 0; + std::string cmd = std::string("mkdir -p ") + working_dir; FLAGS_tera_tabletnode_path_prefix = "./"; system(cmd.c_str()); @@ -219,7 +223,7 @@ TEST_F(TabletScannerTest, General) { std::string key_end = ""; StatusCode status; - TabletIO tablet(key_start, key_end); + TabletIO tablet(key_start, key_end, tablet_path); EXPECT_TRUE(tablet.Load(GetTableSchema(), tablet_path, std::vector(), empty_snaphsots_, empty_rollback_, NULL, NULL, NULL, &status)); @@ -240,7 +244,7 @@ TEST_F(TabletScannerTest, CacheEvict) { std::string key_end = ""; StatusCode status; - TabletIO tablet(key_start, key_end); + TabletIO tablet(key_start, key_end, tablet_path); EXPECT_TRUE(tablet.Load(GetTableSchema(), tablet_path, std::vector(), empty_snaphsots_, empty_rollback_, NULL, NULL, NULL, &status)); @@ -251,7 +255,7 @@ TEST_F(TabletScannerTest, CacheEvict) { ThreadPool pool(nr_thread); for (uint32_t i = 0; i < nr_thread; i++) { ThreadPool::Task task = - boost::bind(&TabletScannerTest::MultiScan, this, &tablet); + std::bind(&TabletScannerTest::MultiScan, this, &tablet); pool.AddTask(task); } pool.Stop(true); diff --git a/src/io/ttlkv_compact_strategy.cc b/src/io/ttlkv_compact_strategy.cc index 0ca904d1f..1edbfc907 100644 --- a/src/io/ttlkv_compact_strategy.cc +++ b/src/io/ttlkv_compact_strategy.cc @@ -29,6 +29,17 @@ void KvCompactStrategy::SetSnapshot(uint64_t snapshot) { snapshot_ = snapshot; } +bool KvCompactStrategy::CheckTag(const Slice& tera_key, bool* del_tag, int64_t* ttl_tag) { + *del_tag = false; + leveldb::Slice row_key; + int64_t expire_timestamp; + raw_key_operator_->ExtractTeraKey(tera_key, &row_key, NULL, NULL, + &expire_timestamp, NULL); + *ttl_tag = (expire_timestamp > 0 && expire_timestamp != kLatestTs) ? (expire_timestamp * 1000000LL): -1; + VLOG(11) << "CheckTag, expire " << expire_timestamp << ", ttl_tag " << *ttl_tag; + return true; +} + bool KvCompactStrategy::Drop(const leveldb::Slice& tera_key, uint64_t n, const std::string& lower_bound) { leveldb::Slice row_key; diff --git a/src/io/ttlkv_compact_strategy.h b/src/io/ttlkv_compact_strategy.h index 19f13013d..5e756d3c4 100644 --- a/src/io/ttlkv_compact_strategy.h +++ b/src/io/ttlkv_compact_strategy.h @@ -8,6 +8,7 @@ #include "common/mutex.h" #include "leveldb/compact_strategy.h" #include "leveldb/raw_key_operator.h" +#include "leveldb/slice.h" #include "proto/table_schema.pb.h" namespace tera { @@ -18,6 +19,7 @@ class KvCompactStrategy : public leveldb::CompactStrategy { KvCompactStrategy(const TableSchema& schema); virtual ~KvCompactStrategy(); + virtual bool CheckTag(const leveldb::Slice& tera_key, bool* del_tag, int64_t* ttl_tag); virtual bool Drop(const leveldb::Slice& k, uint64_t n, const std::string& lower_bound); diff --git a/src/leveldb/db/builder.cc b/src/leveldb/db/builder.cc index 72605756a..3d0293b94 100644 --- a/src/leveldb/db/builder.cc +++ b/src/leveldb/db/builder.cc @@ -8,6 +8,8 @@ #include "db/builder.h" +#include + #include "db/filename.h" #include "db/dbformat.h" #include "db/table_cache.h" @@ -28,6 +30,9 @@ Status BuildTable(const std::string& dbname, uint64_t* saved_size, uint64_t smallest_snapshot) { Status s; + int64_t del_num = 0; // statistic: delete tag's percentage in sst + std::vector ttls; // use for calculate timeout percentage + int64_t entries = 0; meta->file_size = 0; iter->SeekToFirst(); @@ -46,48 +51,57 @@ Status BuildTable(const std::string& dbname, compact_strategy->SetSnapshot(snapshot); } + ParsedInternalKey ikey; TableBuilder* builder = new TableBuilder(options, file); meta->smallest.DecodeFrom(iter->key()); for (;iter->Valid();) { Slice key = iter->key(); // no-length-prefix-key + assert(ParseInternalKey(key, &ikey)); - const char* entry = key.data(); - Slice raw_key(entry, key.size() - 8); - - const uint64_t tag = DecodeFixed64(entry + key.size() - 8); - const uint64_t sequence_id = tag >> 8; bool has_atom_merged = false; - - if (static_cast(tag & 0xff) == kTypeValue && compact_strategy && sequence_id <= snapshot) { - bool drop = compact_strategy->Drop(raw_key, sequence_id); + if (ikey.type == kTypeValue && compact_strategy && ikey.sequence <= snapshot) { + bool drop = compact_strategy->Drop(ikey.user_key, ikey.sequence); if (drop) { - iter->Next(); -// Log(options.info_log, "[Memtable Drop] sequence_id: %llu, raw_key: %s", -// sequence_id, entry); - continue; // drop it before build - } - else { - std::string merged_value; - std::string merged_key; - has_atom_merged = compact_strategy->MergeAtomicOPs(iter, &merged_value, - &merged_key); - if (has_atom_merged) { - meta->largest.DecodeFrom(Slice(merged_key)); - builder->Add(Slice(merged_key), Slice(merged_value)); - } + iter->Next(); + // Log(options.info_log, "[Memtable Drop] sequence_id: %llu, raw_key: %s", + // ikey.sequence, ikey.user_key); + continue; // drop it before build + } else { + std::string merged_value; + std::string merged_key; + has_atom_merged = compact_strategy->MergeAtomicOPs(iter, &merged_value, + &merged_key); + if (has_atom_merged) { + meta->largest.DecodeFrom(Slice(merged_key)); + builder->Add(Slice(merged_key), Slice(merged_value)); + } } } if (!has_atom_merged) { - meta->largest.DecodeFrom(key); - builder->Add(key, iter->value()); - iter->Next(); + bool del_tag = false; + int64_t ttl = -1; + compact_strategy && compact_strategy->CheckTag(ikey.user_key, &del_tag, &ttl); + if (ikey.type == kTypeDeletion || del_tag) { + //Log(options_.info_log, "[%s] add del_tag %d, key_type %d\n", + // dbname_.c_str(), del_tag, ikey.type); + del_num++; + } else if (ttl > 0) { // del tag has not ttl + //Log(options_.info_log, "[%s] add ttl_tag %ld\n", + // dbname_.c_str(), ttl); + ttls.push_back(ttl); + } + + meta->largest.DecodeFrom(key); + builder->Add(key, iter->value()); + iter->Next(); } - //Log(options.info_log, "[Memtable Not Drop] sequence_id: %llu, raw_key: %s", sequence_id, entry); + // Log(options.info_log, "[Memtable Not Drop] sequence_id: %llu, raw_key: %s", + // ikey.sequence, ikey.user_key); } if (compact_strategy) { - delete compact_strategy; + delete compact_strategy; } // Finish and check for builder errors @@ -98,6 +112,24 @@ Status BuildTable(const std::string& dbname, meta->file_size = builder->FileSize(); assert(meta->file_size > 0); *saved_size = builder->SavedSize(); + + // update ttl/del information + entries = builder->NumEntries(); + std::sort(ttls.begin(), ttls.end()); + uint32_t idx = ttls.size() * options.ttl_percentage / 100 ; + meta->del_percentage = del_num * 100 / entries; /* delete tag percentage */ + meta->check_ttl_ts = ((ttls.size() > 0) && (idx < ttls.size())) ? ttls[idx] : 0; /* sst's check ttl's time */ + meta->ttl_percentage = ((ttls.size() > 0) && (idx < ttls.size())) ? idx * 100 / ttls.size() : 0; /* ttl tag percentage */ + Log(options.info_log, "[%s] (mem dump) AddFile, number #%u, entries %ld, del_nr %lu" + ", ttl_nr %lu, del_p %lu, ttl_check_ts %lu, ttl_p %lu\n", + dbname.c_str(), + (unsigned int) meta->number, + entries, + del_num, + ttls.size(), + meta->del_percentage, + meta->check_ttl_ts, + meta->ttl_percentage); } } else { builder->Abandon(); diff --git a/src/leveldb/db/corruption_test.cc b/src/leveldb/db/corruption_test.cc index 73aa397f0..e2be33d54 100644 --- a/src/leveldb/db/corruption_test.cc +++ b/src/leveldb/db/corruption_test.cc @@ -214,7 +214,7 @@ TEST(CorruptionTest, Recovery) { TEST(CorruptionTest, RecoverWriteError) { env_.writable_file_error_ = true; Status s = TryReopen(); - ASSERT_TRUE(!s.ok()); + ASSERT_TRUE(s.ok()); } TEST(CorruptionTest, NewFileErrorDuringWrite) { diff --git a/src/leveldb/db/db.cc b/src/leveldb/db/db.cc index 01a7d51c4..727ca506e 100644 --- a/src/leveldb/db/db.cc +++ b/src/leveldb/db/db.cc @@ -92,7 +92,6 @@ Status DestroyLG(const std::string& lgname, const Options& options) { } env->DeleteDir(lgname + "/lost"); env->UnlockFile(lock); // Ignore error since state is already gone - env->DeleteFile(lockname); env->DeleteDir(lgname); // Ignore error in case dir contains other files return result; @@ -134,9 +133,11 @@ Status DestroyDB(const std::string& dbname, const Options& opt) { } lg_opt.compression = lg_info->compression; delete lg_info; + info_it->second = NULL; } } else if (options.lg_info_list) { delete options.lg_info_list; + options.lg_info_list = NULL; } Status lg_ret = DestroyLG(lgname, lg_opt); if (!lg_ret.ok()) { @@ -166,7 +167,6 @@ Status DestroyDB(const std::string& dbname, const Options& opt) { } env->DeleteDir(dbname + "/lost"); env->UnlockFile(lock); // Ignore error since state is already gone - env->DeleteFile(lockname); env->DeleteDir(dbname); // Ignore error in case dir contains other files return result; diff --git a/src/leveldb/db/db_bench.cc b/src/leveldb/db/db_bench.cc index a27480aa1..44be2ec02 100644 --- a/src/leveldb/db/db_bench.cc +++ b/src/leveldb/db/db_bench.cc @@ -467,7 +467,7 @@ class Benchmark { write_options_.disable_wal = FLAGS_disable_wal; void (Benchmark::*method)(ThreadState*) = NULL; - bool fresh_db = false; + //bool fresh_db = false; int num_threads = FLAGS_threads; if (name == Slice("fillseq")) { @@ -547,18 +547,18 @@ class Benchmark { } } - if (fresh_db) { - if (FLAGS_use_existing_db) { - fprintf(stdout, "%-12s : skipped (--use_existing_db is true)\n", - name.ToString().c_str()); - method = NULL; - } else { - delete db_; - db_ = NULL; - DestroyDB(FLAGS_db, Options()); - Open(); - } - } + //if (fresh_db) { + // if (FLAGS_use_existing_db) { + // fprintf(stdout, "%-12s : skipped (--use_existing_db is true)\n", + // name.ToString().c_str()); + // method = NULL; + // } else { + // delete db_; + // db_ = NULL; + // DestroyDB(FLAGS_db, Options()); + // Open(); + // } + //} if (method != NULL) { RunBenchmark(num_threads, name, method); diff --git a/src/leveldb/db/db_impl.cc b/src/leveldb/db/db_impl.cc index 4b30f81bd..72fd8082e 100644 --- a/src/leveldb/db/db_impl.cc +++ b/src/leveldb/db/db_impl.cc @@ -54,7 +54,7 @@ struct DBImpl::Writer { WriteBatch* batch; port::CondVar cv; - explicit Writer(port::Mutex* mu) : cv(mu) { } + explicit Writer(port::Mutex* mu) : batch(NULL), cv(mu) { } }; struct DBImpl::CompactionState { @@ -70,7 +70,15 @@ struct DBImpl::CompactionState { struct Output { uint64_t number; uint64_t file_size; + int64_t del_num; // statistic: delete tag's percentage in sst + std::vector ttls; // use for calculate timeout percentage + int64_t entries; InternalKey smallest, largest; + + Output(): number(0), + file_size(0), + del_num(0), + entries(0) {} }; std::vector outputs; @@ -145,9 +153,9 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname) owns_info_log_(options_.info_log != options.info_log), owns_block_cache_(options_.block_cache != options.block_cache), dbname_(dbname), + db_lock_(NULL), table_cache_(options_.table_cache), owns_table_cache_(options_.table_cache == NULL), - db_lock_(NULL), shutting_down_(NULL), bg_cv_(&mutex_), writting_mem_cv_(&mutex_), @@ -160,6 +168,7 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname) bound_log_size_(0), bg_compaction_scheduled_(false), bg_compaction_score_(0), + bg_compaction_timeout_(0), bg_schedule_id_(0), manual_compaction_(NULL), consecutive_compaction_errors_(0), @@ -188,7 +197,7 @@ Status DBImpl::Shutdown1() { Log(options_.info_log, "[%s] wait bg compact finish", dbname_.c_str()); if (bg_compaction_scheduled_) { - env_->ReSchedule(bg_schedule_id_, kDumpMemTableUrgentScore); + env_->ReSchedule(bg_schedule_id_, kDumpMemTableUrgentScore, 0); } while (bg_compaction_scheduled_) { bg_cv_.Wait(); @@ -242,9 +251,6 @@ DBImpl::~DBImpl() { Shutdown2(); } } - if (db_lock_ != NULL) { - env_->UnlockFile(db_lock_); - } delete versions_; if (mem_ != NULL) mem_->Unref(); @@ -261,6 +267,9 @@ DBImpl::~DBImpl() { if (owns_block_cache_) { delete options_.block_cache; } + if (db_lock_) { + env_->UnlockFile(db_lock_); + } } Status DBImpl::NewDB() { @@ -383,6 +392,45 @@ void DBImpl::DeleteObsoleteFiles() { } } +// Returns: +// Status OK: iff *exists == true -> exists +// iff *exists == false -> not exists +// Status not OK: +// 1). Status::Corruption -> CURRENT lost, +// 2). Status::IOError -> Maybe request timeout, don't use *exists +Status DBImpl::ParentCurrentStatus(uint64_t parent_no, bool* exists) { + assert(exists != NULL); + std::string current = + CurrentFileName(RealDbName(dbname_, parent_no)); + Status s = env_->FileExists(current); + if (s.ok()) { + *exists = true; + return s; + } else if (s.IsNotFound()) { + *exists = false; + if (options_.ignore_corruption_in_open) { + // Drop all data in parent tablet + Log(options_.info_log, "[%s] parent tablet(%ld) CURRENT error(drop all data): %s", + dbname_.c_str(), + static_cast(parent_no), + s.ToString().c_str()); + return Status::OK(); // Data lost, reopen it as a new db + } else { + Log(options_.info_log, "[%s] parent tablet(%ld) CURRENT error: %s", + dbname_.c_str(), + static_cast(parent_no), + s.ToString().c_str()); + return Status::Corruption("parent CURRENT lost"); + } + } else { + // Maybe request timeout, should retry open + Log(options_.info_log, "[%s] parent tablet(%ld) CURRENT timeout", + dbname_.c_str(), + static_cast(parent_no)); + return Status::IOError("parent CURRENT timeout"); + } +} + // Returns: // OK: iff *exists == true -> exists // iff *exists == false -> not exists @@ -451,41 +499,48 @@ Status DBImpl::DbExists(bool* exists) { return Status::OK(); } else if (options_.parent_tablets.size() == 1) { // This is a new db generated by splitting - std::string current = - CurrentFileName(RealDbName(dbname_, options_.parent_tablets[0])); - s = env_->FileExists(current); - if (!s.ok()) { - // maybe lost, maybe timeout, maybe permission denied, maybe ... - Log(options_.info_log, "[%s] parent tablet(%ld) current error: %s", - dbname_.c_str(), - static_cast(options_.parent_tablets[0]), s.ToString().c_str()); - return Status::IOError("parent CURRENT error"); - } - *exists = true; - return s; + // We expect parent tablet exists + return ParentCurrentStatus(options_.parent_tablets[0], exists); } else if (options_.parent_tablets.size() == 2) { // This is a new db generated by merging - std::string current0 = - CurrentFileName(RealDbName(dbname_, options_.parent_tablets[0])); - s = env_->FileExists(current0); + // We expect parent tablets exist + bool parent0_exists = true; + uint64_t parent0 = options_.parent_tablets[0]; + s = ParentCurrentStatus(options_.parent_tablets[0], &parent0_exists); if (!s.ok()) { - Log(options_.info_log, "[%s] parent tablet-0(%ld) current error: %s", - dbname_.c_str(), - static_cast(options_.parent_tablets[0]), s.ToString().c_str()); - return Status::IOError("parent CURRENT error"); + return s; } - std::string current1 = - CurrentFileName(RealDbName(dbname_, options_.parent_tablets[1])); - s = env_->FileExists(current1); + bool parent1_exists = true; + uint64_t parent1 = options_.parent_tablets[1]; + s = ParentCurrentStatus(options_.parent_tablets[1], &parent1_exists); if (!s.ok()) { - Log(options_.info_log, "[%s] parent tablet-1(%ld) current error: %s", - dbname_.c_str(), - static_cast(options_.parent_tablets[1]), s.ToString().c_str()); - return Status::IOError("parent CURRENT error"); + return s; } - *exists = true; - return Status::OK(); + + assert((parent0_exists && parent1_exists) || options_.ignore_corruption_in_open); + + if (parent0_exists && parent1_exists) { + *exists = true; + } else if (parent0_exists) { + *exists = true; + options_.parent_tablets.resize(0); + options_.parent_tablets.push_back(parent0); + Log(options_.info_log, "[%s] ignore parent(%ld) lost", + dbname_.c_str(), parent1); + } else if (parent1_exists) { + *exists = true; + options_.parent_tablets.resize(0); + options_.parent_tablets.push_back(parent1); + Log(options_.info_log, "[%s] ignore parent(%ld) lost", + dbname_.c_str(), parent0); + } else { + // Parents data lost, open this db as an empty db + *exists = false; + Log(options_.info_log, "[%s] ignore all parents(%ld, %ld) lost", + dbname_.c_str(), parent0, parent1); + } + return s; } else { assert(false); } @@ -513,21 +568,22 @@ Status DBImpl::Recover(VersionEdit* edit) { return s; } } else if (s.ok()) { - // db exists, do nothing + // Directory exists, do nothing } else { - // unknown status + // Unknown status return s; } } - assert(db_lock_ == NULL); - Status s = env_->LockFile(LockFileName(dbname_), &db_lock_); - if (!s.ok()) { - return s; + if (options_.use_file_lock) { + Status s = env_->LockFile(LockFileName(dbname_), &db_lock_); + if (!s.ok()) { + return s; + } } bool db_exists; - s = DbExists(&db_exists); + Status s = DbExists(&db_exists); if (!s.ok()) { return s; } @@ -563,10 +619,27 @@ Status DBImpl::Recover(VersionEdit* edit) { std::string path = RealDbName(dbname_, *it_tablet); Log(options_.info_log, "[%s] GetChildren(%s)", dbname_.c_str(), path.c_str()); std::vector filenames; - if (!env_->GetChildren(path, &filenames).ok()) { - Log(options_.info_log, "[%s] GetChildren(%s) fail: %s", + s = env_->GetChildren(path, &filenames); + if (s.ok()) { + // Do nothing + } else if (s.IsTimeOut()) { + // Should retry open + Log(options_.info_log, "[%s] GetChildren(%s) timeout: %s", dbname_.c_str(), path.c_str(), s.ToString().c_str()); - return Status::IOError("GetChildren failed"); + return Status::TimeOut("GetChildren timeout"); + } else { + // Cannot read the directory + if (options_.ignore_corruption_in_open) { + Log(options_.info_log, "[%s] GetChildren(%s) fail: %s, still open!", + dbname_.c_str(), path.c_str(), s.ToString().c_str()); + // Reset the status + s = Status::OK(); + continue; + } else { + Log(options_.info_log, "[%s] GetChildren(%s) fail: %s", + dbname_.c_str(), path.c_str(), s.ToString().c_str()); + return Status::IOError("GetChildren fail"); + } } uint64_t number; FileType type; @@ -621,18 +694,9 @@ Status DBImpl::WriteLevel0Table(MemTable* mem, VersionEdit* edit, &saved_size, smallest_snapshot); mutex_.Lock(); } - - VersionSet::LevelSummaryStorage tmp; - Log(options_.info_log, "[%s] Level-0 table #%u: %lld (+ %lld ) bytes %s, %s", - dbname_.c_str(), (unsigned int) meta.number, - (unsigned long long) meta.file_size, - (unsigned long long) saved_size, - s.ToString().c_str(), - versions_->LevelSummary(&tmp)); delete iter; pending_outputs_.erase(meta.number); - // Note that if file_size is zero, the file has been deleted and // should not be added to the manifest. int level = 0; @@ -642,8 +706,16 @@ Status DBImpl::WriteLevel0Table(MemTable* mem, VersionEdit* edit, if (base != NULL) { level = base->PickLevelForMemTableOutput(min_user_key, max_user_key); } - edit->AddFile(level, meta.number, meta.file_size, meta.smallest, meta.largest); + edit->AddFile(level, meta); } + VersionSet::LevelSummaryStorage tmp; + Log(options_.info_log, "[%s] Level-0 table #%u: dump-level %d, %lld (+ %lld ) bytes %s, %s", + dbname_.c_str(), (unsigned int) meta.number, + level, + (unsigned long long) meta.file_size, + (unsigned long long) saved_size, + s.ToString().c_str(), + versions_->LevelSummary(&tmp)); CompactionStats stats; stats.micros = env_->NowMicros() - start_micros; @@ -863,28 +935,43 @@ void DBImpl::MaybeScheduleCompaction() { if (shutting_down_.Acquire_Load()) { // DB is being deleted; no more background compactions } else { - double score = versions_->CompactionScore(); + uint64_t timeout = 0; + double score = versions_->CompactionScore(&timeout); if (manual_compaction_ != NULL) { - score = kManualCompactScore; + score = kManualCompactScore; + timeout = 0; } if (imm_ != NULL) { - score = kDumpMemTableScore; + score = kDumpMemTableScore; + timeout = 0; } if (score > 0) { - if (bg_compaction_scheduled_ && score <= bg_compaction_score_) { - // Already scheduled - } else if (bg_compaction_scheduled_) { - env_->ReSchedule(bg_schedule_id_, score); - Log(options_.info_log, "[%s] ReSchedule Compact[%ld] score= %.2f", - dbname_.c_str(), bg_schedule_id_, score); - bg_compaction_score_ = score; - } else { - bg_schedule_id_ = env_->Schedule(&DBImpl::BGWork, this, score); - Log(options_.info_log, "[%s] Schedule Compact[%ld] score= %.2f", - dbname_.c_str(), bg_schedule_id_, score); - bg_compaction_score_ = score; - bg_compaction_scheduled_ = true; + if (!bg_compaction_scheduled_) { + bg_schedule_id_ = env_->Schedule(&DBImpl::BGWork, this, score, timeout); + Log(options_.info_log, "[%s] Schedule Compact[%ld] score= %.2f, timeout=%lu", + dbname_.c_str(), bg_schedule_id_, score, timeout); + bg_compaction_score_ = score; + bg_compaction_timeout_ = timeout; + bg_compaction_scheduled_ = true; + assert(score <= 1 || timeout == 0); // if score > 1, then timeout MUST be 0 + } else { + // use the same way to compute priority score, like util/thread_pool.h + bool need_resched = false; + if (timeout != bg_compaction_timeout_) { + need_resched = timeout < bg_compaction_timeout_; + } else if (score != bg_compaction_score_) { + need_resched = score > bg_compaction_score_; } + + if (need_resched) { + env_->ReSchedule(bg_schedule_id_, score, timeout); + Log(options_.info_log, "[%s] ReSchedule Compact[%ld] score= %.2f, timeout=%lu", + dbname_.c_str(), bg_schedule_id_, score, timeout); + bg_compaction_score_ = score; + bg_compaction_timeout_ = timeout; + assert(score <= 1 || timeout == 0); // if score > 1, then timeout MUST be 0 + } + } } else { // No work to be done } @@ -972,14 +1059,14 @@ Status DBImpl::BackgroundCompaction() { assert(c->num_input_files(0) == 1); FileMetaData* f = c->input(0, 0); c->edit()->DeleteFile(c->level(), *f); - c->edit()->AddFile(c->level() + 1, *f); + c->edit()->AddFile(c->output_level(), *f); status = versions_->LogAndApply(c->edit(), &mutex_); VersionSet::LevelSummaryStorage tmp; - Log(options_.info_log, "[%s] Moved #%08u, %08u to level-%d %lld bytes %s: %s\n", + Log(options_.info_log, "[%s] Moved #%08u, #%u to level-%d %lld bytes %s: %s\n", dbname_.c_str(), static_cast(f->number >> 32 & 0x7fffffff), //tablet number static_cast(f->number & 0xffffffff), //sst number - c->level() + 1, + c->output_level(), static_cast(f->file_size), status.ToString().c_str(), versions_->LevelSummary(&tmp)); @@ -1078,6 +1165,7 @@ Status DBImpl::FinishCompactionOutputFile(CompactionState* compact, s = input->status(); } const uint64_t current_entries = compact->builder->NumEntries(); + compact->current_output()->entries = current_entries; if (s.ok()) { s = compact->builder->Finish(); } else { @@ -1124,7 +1212,6 @@ Status DBImpl::FinishCompactionOutputFile(CompactionState* compact, return s; } - Status DBImpl::InstallCompactionResults(CompactionState* compact) { mutex_.AssertHeld(); Log(options_.info_log, "[%s] Compacted %d@%d + %d@%d files => %lld bytes", @@ -1132,17 +1219,36 @@ Status DBImpl::InstallCompactionResults(CompactionState* compact) { compact->compaction->num_input_files(0), compact->compaction->level(), compact->compaction->num_input_files(1), - compact->compaction->level() + 1, + compact->compaction->output_level(), static_cast(compact->total_bytes)); - // Add compaction outputs + // Add compaction outputs, skip file without entries compact->compaction->AddInputDeletions(compact->compaction->edit()); - const int level = compact->compaction->level(); for (size_t i = 0; i < compact->outputs.size(); i++) { - const CompactionState::Output& out = compact->outputs[i]; + CompactionState::Output& out = compact->outputs[i]; + if (out.entries <= 0) { + continue; + } + + std::sort(out.ttls.begin(), out.ttls.end()); + uint32_t idx = out.ttls.size() * options_.ttl_percentage / 100 ; compact->compaction->edit()->AddFile( - level + 1, BuildFullFileNumber(dbname_, out.number), - out.file_size, out.smallest, out.largest); + compact->compaction->output_level(), BuildFullFileNumber(dbname_, out.number), + out.file_size, out.smallest, out.largest, + out.del_num * 100 / out.entries /* delete tag percentage */, + ((out.ttls.size() > 0) && (idx < out.ttls.size())) ? out.ttls[idx] : 0 /* sst's check ttl's time */, + ((out.ttls.size() > 0) && (idx < out.ttls.size())) ? idx * 100 / out.ttls.size() : 0 /* delete tag percentage */); + Log(options_.info_log, "[%s] AddFile, level %d, number #%lu, entries %ld, del_nr %lu" + ", ttl_nr %lu, del_p %lu, ttl_check_ts %lu, ttl_p %lu\n", + dbname_.c_str(), + compact->compaction->output_level(), + out.number, + out.entries, + out.del_num, + out.ttls.size(), + out.del_num * 100 / out.entries, + ((out.ttls.size() > 0) && (idx < out.ttls.size())) ? out.ttls[idx] : 0, + ((out.ttls.size() > 0) && (idx < out.ttls.size())) ? idx * 100 / out.ttls.size() : 0); } return versions_->LogAndApply(compact->compaction->edit(), &mutex_); } @@ -1156,7 +1262,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) { compact->compaction->num_input_files(0), compact->compaction->level(), compact->compaction->num_input_files(1), - compact->compaction->level() + 1); + compact->compaction->output_level()); assert(versions_->NumLevelFiles(compact->compaction->level()) > 0); assert(compact->builder == NULL); @@ -1206,7 +1312,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) { Slice key = input->key(); if (compact->compaction->ShouldStopBefore(key) && - compact->builder != NULL) { + compact->builder != NULL) { // should not overlap level() + 2 too much status = FinishCompactionOutputFile(compact, input); if (!status.ok()) { break; @@ -1251,7 +1357,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) { } else if (compact_strategy && ikey.sequence <= compact->smallest_snapshot) { std::string lower_bound; if (options_.drop_base_level_del_in_compaction) { - lower_bound = compact->compaction->drop_lower_bound(); + lower_bound = compact->compaction->drop_lower_bound(); } drop = compact_strategy->Drop(ikey.user_key, ikey.sequence, lower_bound); } @@ -1295,7 +1401,20 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) { } if (!has_atom_merged) { - compact->builder->Add(key, input->value()); + // check del tag and ttl tag + bool del_tag = false; + int64_t ttl = -1; + compact_strategy && compact_strategy->CheckTag(ikey.user_key, &del_tag, &ttl); + if (ikey.type == kTypeDeletion || del_tag) { + //Log(options_.info_log, "[%s] add del_tag %d, key_type %d\n", + // dbname_.c_str(), del_tag, ikey.type); + compact->current_output()->del_num++; + } else if (ttl > 0) { // del tag has not ttl + //Log(options_.info_log, "[%s] add ttl_tag %ld\n", + // dbname_.c_str(), ttl); + compact->current_output()->ttls.push_back(ttl); + } + compact->builder->Add(key, input->value()); } // Close output file if it is big enough if (compact->builder->FileSize() >= @@ -1345,7 +1464,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) { } mutex_.Lock(); - stats_[compact->compaction->level() + 1].Add(stats); + stats_[compact->compaction->output_level()].Add(stats); if (status.ok()) { status = InstallCompactionResults(compact); @@ -1533,7 +1652,8 @@ bool DBImpl::BusyWrite() { void DBImpl::Workload(double* write_workload) { MutexLock l(&mutex_); - double wwl = versions_->CompactionScore(); + uint64_t timeout = 0; + double wwl = versions_->CompactionScore(&timeout); if (wwl >= 0) { *write_workload = wwl; } else { diff --git a/src/leveldb/db/db_impl.h b/src/leveldb/db/db_impl.h index 01d72e099..05b1ae623 100644 --- a/src/leveldb/db/db_impl.h +++ b/src/leveldb/db/db_impl.h @@ -133,6 +133,14 @@ class DBImpl : public DB { Status InstallCompactionResults(CompactionState* compact) EXCLUSIVE_LOCKS_REQUIRED(mutex_); + // Returns: + // Status OK: iff *exists == true -> exists + // iff *exists == false -> not exists + // Status not OK: + // 1). Status::Corruption -> CURRENT lost, + // 2). Status::IOError -> Maybe request timeout, don't use *exists + Status ParentCurrentStatus(uint64_t parent_no, bool* exists); + State state_; // tera-specific @@ -154,14 +162,13 @@ class DBImpl : public DB { bool owns_info_log_; bool owns_block_cache_; const std::string dbname_; + // Lock over the persistent DB state. Non-NULL iff successfully acquired. + FileLock* db_lock_; // table_cache_ provides its own synchronization TableCache* table_cache_; bool owns_table_cache_; - // Lock over the persistent DB state. Non-NULL iff successfully acquired. - FileLock* db_lock_; - // State below is protected by mutex_ port::Mutex mutex_; port::AtomicPointer shutting_down_; @@ -191,6 +198,7 @@ class DBImpl : public DB { // Has a background compaction been scheduled or is running? bool bg_compaction_scheduled_; double bg_compaction_score_; + uint64_t bg_compaction_timeout_; int64_t bg_schedule_id_; // Information for a manual compaction diff --git a/src/leveldb/db/db_table.cc b/src/leveldb/db/db_table.cc index 9ce7024d0..e0e35f7a2 100644 --- a/src/leveldb/db/db_table.cc +++ b/src/leveldb/db/db_table.cc @@ -105,7 +105,7 @@ DBTable::DBTable(const Options& options, const std::string& dbname) : state_(kNotOpen), shutting_down_(NULL), bg_cv_(&mutex_), bg_cv_timer_(&mutex_), bg_cv_sleeper_(&mutex_), options_(InitDefaultOptions(options, dbname)), - dbname_(dbname), env_(options.env), + dbname_(dbname), env_(options.env), db_lock_(NULL), created_own_lg_list_(options_.exist_lg_list != options.exist_lg_list), created_own_info_log_(options_.info_log != options.info_log), created_own_compact_strategy_(options_.compact_strategy_factory != options.compact_strategy_factory), @@ -216,12 +216,22 @@ DBTable::~DBTable() { delete options_.info_log; } delete tmp_batch_; + if (db_lock_) { + env_->UnlockFile(db_lock_); + } } Status DBTable::Init() { std::vector lg_edits; - Status s; Log(options_.info_log, "[%s] start Init()", dbname_.c_str()); + Status s; + if (options_.use_file_lock) { + s = env_->LockFile(LockFileName(dbname_), &db_lock_); + if (!s.ok()) { + Log(options_.info_log, "[%s] Get db lock fail", dbname_.c_str()); + return s; + } + } MutexLock lock(&mutex_); uint64_t min_log_sequence = kMaxSequenceNumber; diff --git a/src/leveldb/db/db_table.h b/src/leveldb/db/db_table.h index 0b18bb22b..4fa0a11c4 100644 --- a/src/leveldb/db/db_table.h +++ b/src/leveldb/db/db_table.h @@ -24,6 +24,7 @@ namespace leveldb { class DBImpl; class MemTable; +class FileLock; class DBTable : public DB { public: @@ -179,6 +180,8 @@ class DBTable : public DB { const Options options_; const std::string dbname_; Env* const env_; + // Lock over the persistent DB state. Non-NULL iff successfully acquired. + FileLock* db_lock_; bool created_own_lg_list_; bool created_own_info_log_; bool created_own_compact_strategy_; diff --git a/src/leveldb/db/db_test.cc b/src/leveldb/db/db_test.cc index 768aba78b..7c25f2de6 100644 --- a/src/leveldb/db/db_test.cc +++ b/src/leveldb/db/db_test.cc @@ -85,6 +85,9 @@ class SpecialEnv : public EnvWrapper { AtomicCounter sleep_counter_; AtomicCounter sleep_time_counter_; + AtomicCounter sync_retry_c_; + AtomicCounter write_retry_c_; + explicit SpecialEnv(Env* base) : EnvWrapper(base) { delay_sstable_sync_.Release_Store(NULL); no_space_.Release_Store(NULL); @@ -131,7 +134,9 @@ class SpecialEnv : public EnvWrapper { ManifestFile(SpecialEnv* env, WritableFile* b) : env_(env), base_(b) { } ~ManifestFile() { delete base_; } Status Append(const Slice& data) { - if (env_->manifest_write_error_.Acquire_Load() != NULL) { + env_->write_retry_c_.Increment(); + if (env_->manifest_write_error_.Acquire_Load() != NULL && + env_->write_retry_c_.Read() < 10) { return Status::IOError("simulated writer error"); } else { return base_->Append(data); @@ -140,7 +145,9 @@ class SpecialEnv : public EnvWrapper { Status Close() { return base_->Close(); } Status Flush() { return base_->Flush(); } Status Sync() { - if (env_->manifest_sync_error_.Acquire_Load() != NULL) { + env_->sync_retry_c_.Increment(); + if (env_->manifest_sync_error_.Acquire_Load() != NULL && + env_->sync_retry_c_.Read() < 10) { return Status::IOError("simulated sync error"); } else { return base_->Sync(); @@ -1280,8 +1287,11 @@ TEST(DBTest, DeletionMarkers2) { ASSERT_OK(dbfull()->TEST_CompactMemTable()); // Moves to level last-2 ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]"); dbfull()->TEST_CompactRange(last-2, NULL, NULL); + sleep(3); // del compaction stragety will be auto trigger. + // DEL kept: "last" file overlaps - ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]"); + //ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]"); + ASSERT_EQ(AllEntriesFor("foo"), "[ ]"); dbfull()->TEST_CompactRange(last-1, NULL, NULL); // Merging last-1 w/ last, so we are the base level for "foo", so // DEL is removed. (as is v1). diff --git a/src/leveldb/db/version_edit.cc b/src/leveldb/db/version_edit.cc index d8ea1a77c..fc95284e6 100644 --- a/src/leveldb/db/version_edit.cc +++ b/src/leveldb/db/version_edit.cc @@ -15,7 +15,7 @@ namespace leveldb { // Tag numbers for serialized VersionEdit. These numbers are written to -// disk and should not be changed. max tag number = 4096, min tag number = 1 +// disk and should not be changed. max tag number = 1<<20, min tag number = 1 enum Tag { kComparator = 1, kLogNumber = 2, @@ -28,7 +28,9 @@ enum Tag { kPrevLogNumber = 9, kNewFile = 10, kDeletedFile = 11, - // no more than 4096 + kNewFileInfo = 12, + + // no more than 1<<20 kMaxTag = 1 << 20, }; @@ -135,6 +137,16 @@ void VersionEdit::EncodeTo(std::string* dst) const { PutVarint32(dst, str.size() + kMaxTag); PutVarint32(dst, kNewFile); dst->append(str.data(), str.size()); + + // add statictis info + str.clear(); + PutVarint64(&str, f.del_percentage); + PutVarint64(&str, f.ttl_percentage); + PutVarint64(&str, f.check_ttl_ts); + + PutVarint32(dst, str.size() + kMaxTag); + PutVarint32(dst, kNewFileInfo); + dst->append(str.data(), str.size()); } } @@ -272,6 +284,30 @@ Status VersionEdit::DecodeFrom(const Slice& src) { } else { f.largest_fake = true; } + + // new file format parser + Slice file_ptr = input; + uint32_t file_tag; + GetVarint32(&file_ptr, &file_tag); + if (file_tag > kMaxTag) { + // file_tag - kMaxTag; + GetVarint32(&file_ptr, &tag); + } + switch (tag) { + case kNewFileInfo: + GetVarint32(&input, &tag);// ignore len + GetVarint32(&input, &tag);// ignore tag + GetVarint64(&input, &f.del_percentage); + GetVarint64(&input, &f.ttl_percentage); + GetVarint64(&input, &f.check_ttl_ts); + break; + + default: + fprintf(stderr, "NewFile %lu without info, skip tag %d, len %d\n", + f.number & 0xffffffff, + tag, file_tag); + break; + } new_files_.push_back(std::make_pair(level, f)); } else { msg = "new-file entry 1"; @@ -368,6 +404,12 @@ std::string VersionEdit::DebugString() const { r.append(f.smallest.DebugString()); r.append(" .. "); r.append(f.largest.DebugString()); + r.append(" del_percentage "); + AppendNumberTo(&r, f.del_percentage); + r.append(" ttl_percentage "); + AppendNumberTo(&r, f.ttl_percentage); + r.append(" ttl_check_ts "); + AppendNumberTo(&r, f.check_ttl_ts); } r.append("\n}\n"); return r; diff --git a/src/leveldb/db/version_edit.h b/src/leveldb/db/version_edit.h index c01d11ff8..0c64728d0 100644 --- a/src/leveldb/db/version_edit.h +++ b/src/leveldb/db/version_edit.h @@ -23,6 +23,9 @@ class VersionSetBuilder; struct FileMetaData { int refs; int allowed_seeks; // Seeks allowed until compaction + uint64_t check_ttl_ts; // statistic: Descripe this sst file when to timeout check + uint64_t ttl_percentage; // statistic: By default, if 50% entry timeout, will trigger compaction + uint64_t del_percentage; // statistic: delete tag's percentage in sst uint64_t number; uint64_t file_size; // File size in bytes uint64_t data_size; // data_size <= file_size @@ -34,6 +37,10 @@ struct FileMetaData { FileMetaData() : refs(0), allowed_seeks(1 << 30), + check_ttl_ts(0), + ttl_percentage(0), + del_percentage(0), + number(0), file_size(0), data_size(0), smallest_fake(false), @@ -143,12 +150,18 @@ class VersionEdit { void AddFile(int level, uint64_t file, uint64_t file_size, const InternalKey& smallest, - const InternalKey& largest) { + const InternalKey& largest, + uint64_t del_percentage = 0, + uint64_t check_ttl_ts = 0, + uint64_t ttl_percentage = 0) { FileMetaData f; f.number = file; f.file_size = file_size; f.smallest = smallest; f.largest = largest; + f.del_percentage = del_percentage; + f.ttl_percentage = ttl_percentage; + f.check_ttl_ts = check_ttl_ts; new_files_.push_back(std::make_pair(level, f)); } diff --git a/src/leveldb/db/version_edit_test.cc b/src/leveldb/db/version_edit_test.cc index 193680459..c728af4cc 100644 --- a/src/leveldb/db/version_edit_test.cc +++ b/src/leveldb/db/version_edit_test.cc @@ -12,7 +12,7 @@ namespace leveldb { // Tag numbers for serialized VersionEdit. These numbers are written to -// disk and should not be changed. max tag number = 4096, min tag number = 1 +// disk and should not be changed. max tag number = 1<<20, min tag number = 1 enum Tag { kComparator = 1, kLogNumber = 2, @@ -25,7 +25,9 @@ enum Tag { kPrevLogNumber = 9, kNewFile = 10, kDeletedFile = 11, - // no more than 4096 + kNewFileInfo = 12, + + // no more than 1<<20 kMaxTag = 1 << 20, }; enum EditTestTag { @@ -34,7 +36,9 @@ enum EditTestTag { class VersionEditTest: public VersionEdit { public: - VersionEditTest() : has_error_tag_(false) {} + VersionEditTest() : has_error_tag_(false) { + Clear(); + } void AddErrorTag(const std::string& str) { has_error_tag_ = true; error_code_ = str; @@ -108,10 +112,10 @@ static void TestEncodeDecode(const VersionEditTest& edit) { } static void CreateEditContent(VersionEditTest* edit) { for (int i = 0; i < 5; i++) { - TestEncodeDecode(*edit); - edit->AddFile(i, 100 + i, 200 + i, - InternalKey("aoo", 300 + i, kTypeValue), - InternalKey("zoo", 400 + i, kTypeDeletion)); + TestEncodeDecode(*edit); + edit->AddFile(i, 100 + i, 200 + i, + InternalKey("aoo", 300 + i, kTypeValue), + InternalKey("zoo", 400 + i, kTypeDeletion)); edit->DeleteFile(i, 500 + i); edit->SetCompactPointer(i, InternalKey("x00", 600 + i, kTypeValue)); } @@ -129,6 +133,30 @@ static void CreateEditContentV2(VersionEditTest* edit) { edit->SetLastSequence(900); TestEncodeDecode(*edit); } +static void CreateEditWithTtlInfo(VersionEditTest* edit) { + for (int i = 0; i < 5; i++) { + TestEncodeDecode(*edit); + edit->AddFile(i, 100 + i, 200 + i, + InternalKey("apple", 300 + i, kTypeValue), + InternalKey("zookeeper", 400 + i, kTypeDeletion), + 20 + i/* del percentage */, + 1000000000 + i/* timeout */, + 50 + i/* del percentage */); + edit->DeleteFile(i, 500 + i); + edit->SetCompactPointer(i, InternalKey("x00", 600 + i, kTypeValue)); + } + + edit->SetComparatorName("test_nil_cmp"); + edit->SetLogNumber(700); + edit->SetNextFile(800); + edit->SetLastSequence(900); + TestEncodeDecode(*edit); +} +TEST(VersionEditTest, EncodeFileInfoTag) { + VersionEditTest edit; + CreateEditWithTtlInfo(&edit); + fprintf(stderr, "%s\n", edit.DebugString().c_str()); +} TEST(VersionEditTest, OldFormatRead) { VersionEditTest edit; CreateEditContentV2(&edit); diff --git a/src/leveldb/db/version_set.cc b/src/leveldb/db/version_set.cc index 6cd513a26..a0c816c01 100644 --- a/src/leveldb/db/version_set.cc +++ b/src/leveldb/db/version_set.cc @@ -173,14 +173,27 @@ class Version::LevelFileNumIterator : public Iterator { public: LevelFileNumIterator(const InternalKeyComparator& icmp, const std::vector* flist, - const std::string& dbname) + const std::string& dbname, + const ReadOptions& opts) : icmp_(icmp), flist_(flist), dbname_(dbname), - index_(flist->size()) { // Marks as invalid + index_(flist->size()), // Marks as invalid + read_single_row_(opts.read_single_row), + row_start_key_(opts.row_start_key, kMaxSequenceNumber, kValueTypeForSeek), + row_end_key_(opts.row_end_key, kMaxSequenceNumber, kValueTypeForSeek) { } virtual bool Valid() const { - return index_ < flist_->size(); + if (index_ >= flist_->size()) { + return false; + } + FileMetaData* f = (*flist_)[index_]; + if (read_single_row_ && + (icmp_.InternalKeyComparator::Compare(f->largest.Encode(), row_start_key_.Encode()) < 0 || + icmp_.InternalKeyComparator::Compare(f->smallest.Encode(), row_end_key_.Encode()) >= 0)) { + return false; + } + return true; } virtual void Seek(const Slice& target) { index_ = FindFile(icmp_, *flist_, target); @@ -227,6 +240,9 @@ class Version::LevelFileNumIterator : public Iterator { const std::vector* const flist_; const std::string dbname_; uint32_t index_; + bool read_single_row_; + InternalKey row_start_key_; + InternalKey row_end_key_; // Backing store for value(). Holds the file number and size. mutable std::string value_buf_; @@ -257,7 +273,7 @@ Iterator* Version::NewConcatenatingIterator(const ReadOptions& options, ReadOptions opts = options; opts.db_opt = vset_->options_; return NewTwoLevelIterator( - new LevelFileNumIterator(vset_->icmp_, &files_[level], vset_->dbname_), + new LevelFileNumIterator(vset_->icmp_, &files_[level], vset_->dbname_, opts), &GetFileIterator, vset_->table_cache_, opts); } @@ -1021,24 +1037,27 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu) { last_switch_manifest_ + switch_interval < env_->NowMicros()) { force_switch_manifest_ = true; } - // timeout cause switch or failure cause switch - if (force_switch_manifest_) { - manifest_file_number_ = NewFileNumber(); - last_switch_manifest_ = env_->NowMicros(); - } - std::string new_manifest_file; + int retry_count = 0; Status s; // Unlock during expensive MANIFEST log write - { + do { + s = Status::OK(); + std::string new_manifest_file; + // timeout cause switch or failure cause switch + if (force_switch_manifest_) { + manifest_file_number_ = NewFileNumber(); + last_switch_manifest_ = env_->NowMicros(); + } mu->Unlock(); + if (force_switch_manifest_) { delete descriptor_log_; delete descriptor_file_; descriptor_log_ = NULL; descriptor_file_ = NULL; Log(options_->info_log, "[%s] force switch MANIFEST to %lu", - dbname_.c_str(), manifest_file_number_); + dbname_.c_str(), manifest_file_number_); force_switch_manifest_ = false; } @@ -1052,9 +1071,13 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu) { if (s.ok()) { descriptor_log_ = new log::Writer(descriptor_file_); s = WriteSnapshot(descriptor_log_); + if (!s.ok()) { + Log(options_->info_log, "[%s][dfs error] writesnapshot MANIFEST[%s] error, status[%s].\n", + dbname_.c_str(), new_manifest_file.c_str(), s.ToString().c_str()); + } } else { Log(options_->info_log, "[%s][dfs error] open MANIFEST[%s] error, status[%s].\n", - dbname_.c_str(), new_manifest_file.c_str(), s.ToString().c_str()); + dbname_.c_str(), new_manifest_file.c_str(), s.ToString().c_str()); } } @@ -1065,48 +1088,48 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu) { s = descriptor_log_->AddRecord(record); if (s.ok()) { s = descriptor_file_->Sync(); - } - if (!s.ok()) { - Log(options_->info_log, "[%s][dfs error] MANIFEST sync error: %s\n", + if (!s.ok()) { + Log(options_->info_log, "[%s][dfs error] MANIFEST sync error: %s\n", dbname_.c_str(), s.ToString().c_str()); - if (ManifestContains(record)) { - Log(options_->info_log, - "[%s] MANIFEST contains log record despite error; advancing to new " - "version to prevent mismatch between in-memory and logged state", - dbname_.c_str()); - s = Status::OK(); } + } else { + Log(options_->info_log, "[%s][dfs error] AddRecord MANIFEST error: %s\n", + dbname_.c_str(), s.ToString().c_str()); } } - // If we just created a new descriptor file, install it by writing a - // new CURRENT file that points to it. if (s.ok() && !new_manifest_file.empty()) { s = SetCurrentFile(env_, dbname_, manifest_file_number_); if (s.ok()) { Log(options_->info_log, "[%s] set CURRENT to %llu\n", - dbname_.c_str(), static_cast(manifest_file_number_)); + dbname_.c_str(), static_cast(manifest_file_number_)); } else { Log(options_->info_log, "[%s][dfs error] set CURRENT error: %s\n", - dbname_.c_str(), s.ToString().c_str()); + dbname_.c_str(), s.ToString().c_str()); } - // No need to double-check MANIFEST in case of error since it - // will be discarded below. } - // write manifest error, cause switch if (!s.ok()) { + force_switch_manifest_ = true; if (!new_manifest_file.empty()) { - delete descriptor_log_; - delete descriptor_file_; - descriptor_log_ = NULL; - descriptor_file_ = NULL; env_->DeleteFile(new_manifest_file); } } + // retry until success + if (force_switch_manifest_) { + retry_count++; + int sec = 1; + for (int i = 1; i < retry_count && i < 4; i++) { + sec *= 2; + } + Log(options_->info_log, "[%s] Waiting after %d, LogAndApply sync error: %s, retry: %d", + dbname_.c_str(), sec, s.ToString().c_str(), retry_count); + env_->SleepForMicroseconds(sec * 1000000); + } + mu->Lock(); - } + } while (force_switch_manifest_); // Install the new version if (s.ok()) { @@ -1221,15 +1244,15 @@ Status VersionSet::ReadCurrentFile(uint64_t tablet, std::string* dscname) { } } if (manifest_set.size() < 1) { - Log(options_->info_log, "[%s] none available manifest file.", + Log(options_->info_log, "[%s] none available manifest file", dbname_.c_str()); ArchiveFile(env_, CurrentFileName(pdbname)); - return Status::Corruption("DB has none available manifest file."); + return Status::Corruption("DB has none available manifest file"); } // select the largest manifest number std::set::reverse_iterator it = manifest_set.rbegin(); *dscname = pdbname + "/" + *it; - Log(options_->info_log, "[%s] use backup manifest: %s.", + Log(options_->info_log, "[%s] use backup manifest: %s", dbname_.c_str(), dscname->c_str()); return Status::OK(); } @@ -1257,12 +1280,12 @@ Status VersionSet::Recover() { return s; } } else if (parent_size == 1) { - Log(options_->info_log, "[%s] generated by splitting, parent tablet: %llu", + Log(options_->info_log, "[%s] generated by splitting/merging, parent tablet: %llu", dbname_.c_str(), static_cast(options_->parent_tablets[0])); dscname.resize(1); s = ReadCurrentFile(options_->parent_tablets[0], &dscname[0]); if (!s.ok()) { - Log(options_->info_log, "[%s] fail to read current (split): %ld.", + Log(options_->info_log, "[%s] fail to read current (split/merge): %ld.", dbname_.c_str(), options_->parent_tablets[0]); return s; } @@ -1427,9 +1450,12 @@ Status VersionSet::Recover() { FileMetaData* f = files[i]; ModifyFileSize(f); // Debug - Log(options_->info_log, "[%s] recover: %s, level: %d, s: %d %s, l: %d %s\n", + Log(options_->info_log, "[%s] recover: %s, level: %d, del_p: %lu, check_ttl_ts %lu, ttl_p %lu, s: %d %s, l: %d %s\n", dbname_.c_str(), FileNumberDebugString(f->number).c_str(), level, + f->del_percentage, + f->check_ttl_ts, + f->ttl_percentage, f->smallest_fake, f->smallest.user_key().ToString().data(), f->largest_fake, f->largest.user_key().ToString().data()); } @@ -1487,8 +1513,13 @@ void VersionSet::Finalize(Version* v) { // Precomputed best level for next compaction int best_level = -1; double best_score = -1; + int best_del_level = -1; + int best_del_idx = -1; + int best_ttl_level = -1; + int best_ttl_idx = -1; - for (int level = 0; level < config::kNumLevels-1; level++) { + int base_level = -1; + for (int level = config::kNumLevels - 1; level >= 0; level--) { double score; if (level == 0) { // We treat level-0 specially by bounding the number of files @@ -1514,14 +1545,63 @@ void VersionSet::Finalize(Version* v) { / MaxBytesForLevel(level, options_->sst_size); } - if (score > best_score) { + // locate base level + if (v->files_[level].size() > 0 && base_level < 0) { + base_level = level; + } + + // size compaction does not allow trigger by base level + if ((score > best_score) && (level < config::kNumLevels - 1)) { best_level = level; best_score = score; } + + for (size_t i = 0; i < v->files_[level].size(); i++) { + FileMetaData* f = v->files_[level][i]; + // del compaction does not allow trigger by base level + if ((level > 0) && (level < base_level) && + (f->del_percentage > options_->del_percentage) && + (best_del_level < 0 || + v->files_[best_del_level][best_del_idx]->del_percentage < f->del_percentage)) { + best_del_level = level; + best_del_idx = i; + } + + // ttl compaction can trigger in base level + if ((f->check_ttl_ts > 0) && + (best_ttl_level < 0 || + v->files_[best_ttl_level][best_ttl_idx]->check_ttl_ts > f->check_ttl_ts)) { + best_ttl_level = level; + best_ttl_idx = i; + } + } } v->compaction_level_ = best_level; v->compaction_score_ = best_score; + if (best_del_level >= 0) { + v->del_trigger_compact_ = v->files_[best_del_level][best_del_idx]; + v->del_trigger_compact_level_ = best_del_level; + Log(options_->info_log, + "[%s] del_strategy(current), level %d, num #%lu, file_size %lu, del_p %lu\n", + dbname_.c_str(), + v->del_trigger_compact_level_, + (v->del_trigger_compact_->number) & 0xffffffff, + v->del_trigger_compact_->file_size, + v->del_trigger_compact_->del_percentage); + } + if (best_ttl_level >= 0) { + v->ttl_trigger_compact_ = v->files_[best_ttl_level][best_ttl_idx]; + v->ttl_trigger_compact_level_ = best_ttl_level; + Log(options_->info_log, + "[%s] ttl_strategy(current), level %d, num #%lu, file_size %lu, ttl_p %lu, check_ts %lu\n", + dbname_.c_str(), + v->ttl_trigger_compact_level_, + (v->ttl_trigger_compact_->number) & 0xffffffff, + v->ttl_trigger_compact_->file_size, + v->ttl_trigger_compact_->ttl_percentage, + v->ttl_trigger_compact_->check_ttl_ts); + } } Status VersionSet::WriteSnapshot(log::Writer* log) { @@ -1751,7 +1831,7 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) { } else { // Create concatenating iterator for the files from this level list[num++] = NewTwoLevelIterator( - new Version::LevelFileNumIterator(icmp_, &c->inputs_[which], dbname_), + new Version::LevelFileNumIterator(icmp_, &c->inputs_[which], dbname_, options), &GetFileIterator, table_cache_, options); } } @@ -1762,6 +1842,34 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) { return result; } +// timeout for micro_second +double VersionSet::CompactionScore(uint64_t* timeout) const { + *timeout = 0; + uint64_t ts = env_->NowMicros(); + Version* v = current_; + if (v->compaction_score_ >= 1) { + return v->compaction_score_; + } else if (v->del_trigger_compact_ != NULL && + v->del_trigger_compact_->del_percentage > options_->del_percentage) { + return (double)(v->del_trigger_compact_->del_percentage / 100.0); + } else if (v->ttl_trigger_compact_ != NULL && + ts >= v->ttl_trigger_compact_->check_ttl_ts) { + return (double)((v->ttl_trigger_compact_->ttl_percentage + 1) / 100.0); + } else if (v->file_to_compact_ != NULL) { + return 0.1f; + } + + // delay task + if (v->ttl_trigger_compact_ != NULL && + ts < v->ttl_trigger_compact_->check_ttl_ts) { + *timeout = (v->ttl_trigger_compact_->check_ttl_ts - ts + 1000000) / 1000; + return (double)((v->ttl_trigger_compact_->ttl_percentage + 1) / 100.0); + } + + // nothing to do + return -1.0; +} + Compaction* VersionSet::PickCompaction() { Compaction* c; int level; @@ -1770,6 +1878,8 @@ Compaction* VersionSet::PickCompaction() { // the compactions triggered by seeks. const bool size_compaction = (current_->compaction_score_ >= 1); const bool seek_compaction = (current_->file_to_compact_ != NULL); + const bool del_compaction = (current_->del_trigger_compact_ != NULL); + const bool ttl_compaction = (current_->ttl_trigger_compact_ != NULL); if (size_compaction) { level = current_->compaction_level_; assert(level >= 0); @@ -1790,9 +1900,46 @@ Compaction* VersionSet::PickCompaction() { c->inputs_[0].push_back(current_->files_[level][0]); } } else if (seek_compaction) { + // compaction trigger by seek percentage + // TODO: multithread should lock it level = current_->file_to_compact_level_; c = new Compaction(level); c->inputs_[0].push_back(current_->file_to_compact_); + } else if (del_compaction) { + // compaction trigger by delete tags percentage; + // TODO: multithread should lock it + level = current_->del_trigger_compact_level_; + assert(level >= 0); + assert(level+1 < config::kNumLevels); + c = new Compaction(level); + c->SetNonTrivial(true); + c->inputs_[0].push_back(current_->del_trigger_compact_); + Log(options_->info_log, + "[%s] compact trigger by del stragety, level %d, num #%lu, file_size %lu, del_p %lu\n", + dbname_.c_str(), + current_->del_trigger_compact_level_, + (current_->del_trigger_compact_->number) & 0xffffffff, + current_->del_trigger_compact_->file_size, + current_->del_trigger_compact_->del_percentage); + } else if (ttl_compaction) { + // compaction trigger by ttl tags percentage + // TODO: multithread should lock it + level = current_->ttl_trigger_compact_level_; + assert(level >= 0); + c = new Compaction(level); + c->SetNonTrivial(true); + c->inputs_[0].push_back(current_->ttl_trigger_compact_); + if (level == config::kNumLevels - 1) {// level in last level + c->set_output_level(level); + } + Log(options_->info_log, + "[%s] compact trigger by ttl stragety, level %d, num #%lu, file_size %lu, ttl_p %lu, check_ts %lu\n", + dbname_.c_str(), + current_->ttl_trigger_compact_level_, + (current_->ttl_trigger_compact_->number) & 0xffffffff, + current_->ttl_trigger_compact_->file_size, + current_->ttl_trigger_compact_->ttl_percentage, + current_->ttl_trigger_compact_->check_ttl_ts); } else { return NULL; } @@ -1800,7 +1947,7 @@ Compaction* VersionSet::PickCompaction() { c->input_version_ = current_; c->input_version_->Ref(); c->max_output_file_size_ = - MaxFileSizeForLevel(level + 1, current_->vset_->options_->sst_size); + MaxFileSizeForLevel(c->output_level(), current_->vset_->options_->sst_size); // Files in level 0 may overlap each other, so pick up all overlapping ones if (level == 0) { @@ -1812,18 +1959,22 @@ Compaction* VersionSet::PickCompaction() { current_->GetOverlappingInputs(0, &smallest, &largest, &c->inputs_[0]); assert(!c->inputs_[0].empty()); } - SetupOtherInputs(c); - + // tera-specific: calculate the smallest rowkey which overlap with file not + // in this compaction. + SetupCompactionBoundary(c); return c; } void VersionSet::SetupOtherInputs(Compaction* c) { + if (c->level() == c->output_level()) { // self level compaction, should select next level + return; + } const int level = c->level(); InternalKey smallest, largest; GetRange(c->inputs_[0], &smallest, &largest); - current_->GetOverlappingInputs(level+1, &smallest, &largest, &c->inputs_[1]); + current_->GetOverlappingInputs(c->output_level(), &smallest, &largest, &c->inputs_[1]); // Get entire range covered by compaction InternalKey all_start, all_limit; @@ -1843,7 +1994,7 @@ void VersionSet::SetupOtherInputs(Compaction* c) { InternalKey new_start, new_limit; GetRange(expanded0, &new_start, &new_limit); std::vector expanded1; - current_->GetOverlappingInputs(level+1, &new_start, &new_limit, + current_->GetOverlappingInputs(c->output_level(), &new_start, &new_limit, &expanded1); if (expanded1.size() == c->inputs_[1].size()) { Log(options_->info_log, @@ -1867,8 +2018,8 @@ void VersionSet::SetupOtherInputs(Compaction* c) { // Compute the set of grandparent files that overlap this compaction // (parent == level+1; grandparent == level+2) - if (level + 2 < config::kNumLevels) { - current_->GetOverlappingInputs(level + 2, &all_start, &all_limit, + if (c->output_level() + 1 < config::kNumLevels) { + current_->GetOverlappingInputs(c->output_level() + 1, &all_start, &all_limit, &c->grandparents_); } @@ -1886,10 +2037,7 @@ void VersionSet::SetupOtherInputs(Compaction* c) { // key range next time. compact_pointer_[level] = largest.Encode().ToString(); c->edit_.SetCompactPointer(level, largest); - - // tera-specific: calculate the smallest rowkey which overlap with file not - // in this compaction. - SetupCompactionBoundary(c); + return; } void VersionSet::SetupCompactionBoundary(Compaction* c) { @@ -1905,15 +2053,20 @@ void VersionSet::SetupCompactionBoundary(Compaction* c) { } base_level--; } - if (base_level > c->level_ + 1) { + if (base_level > c->output_level()) { // not base level return; } - // do not need calculate input[1]. + // consider case: + // level: sst: 100---------200 sst:200--------300 + // level + 1: sst: 100------------------------250 + // if use 250 as lower_bound, then 200's del tag may miss + // could not calculate input[1].large key. int input0_size = c->inputs_[0].size(); FileMetaData* last_file = c->inputs_[0][input0_size - 1]; c->set_drop_lower_bound(last_file->largest.user_key().ToString()); + return; } Compaction* VersionSet::CompactRange( @@ -1948,19 +2101,24 @@ Compaction* VersionSet::CompactRange( c->input_version_ = current_; c->input_version_->Ref(); c->max_output_file_size_ = - MaxFileSizeForLevel(level + 1, current_->vset_->options_->sst_size); + MaxFileSizeForLevel(c->output_level(), current_->vset_->options_->sst_size); c->inputs_[0] = inputs; SetupOtherInputs(c); + // tera-specific: calculate the smallest rowkey which overlap with file not + // in this compaction. + SetupCompactionBoundary(c); return c; } Compaction::Compaction(int level) : level_(level), + output_level_(level + 1), max_output_file_size_(0), input_version_(NULL), grandparent_index_(0), seen_key_(false), - overlapped_bytes_(0) { + overlapped_bytes_(0), + force_non_trivial_(false) { for (int i = 0; i < config::kNumLevels; i++) { level_ptrs_[i] = 0; } @@ -1972,7 +2130,13 @@ Compaction::~Compaction() { } } +void Compaction::SetNonTrivial(bool non_trivial) { + force_non_trivial_ = non_trivial; +} bool Compaction::IsTrivialMove() const { + if (force_non_trivial_) { + return false; + } // Avoid a move if there is lots of overlapping grandparent data. // Otherwise, the move could create a parent file that will require // a very expensive merge later on. @@ -1993,7 +2157,7 @@ void Compaction::AddInputDeletions(VersionEdit* edit) { bool Compaction::IsBaseLevelForKey(const Slice& user_key) { // Maybe use binary search to find right entry instead of linear search? const Comparator* user_cmp = input_version_->vset_->icmp_.user_comparator(); - for (int lvl = level_ + 2; lvl < config::kNumLevels; lvl++) { + for (int lvl = output_level_ + 1; lvl < config::kNumLevels; lvl++) { const std::vector& files = input_version_->files_[lvl]; for (; level_ptrs_[lvl] < files.size(); ) { FileMetaData* f = files[level_ptrs_[lvl]]; diff --git a/src/leveldb/db/version_set.h b/src/leveldb/db/version_set.h index 676cc64ee..5a01d8dba 100644 --- a/src/leveldb/db/version_set.h +++ b/src/leveldb/db/version_set.h @@ -137,6 +137,13 @@ class Version { FileMetaData* file_to_compact_; int file_to_compact_level_; + // ttl strategy: ttl trigger compaction + FileMetaData* ttl_trigger_compact_; + int ttl_trigger_compact_level_; + // del strategy: delete trigger compaction + FileMetaData* del_trigger_compact_; + int del_trigger_compact_level_; + // Level that should be compacted next and its compaction score. // Score < 1 means compaction is not strictly needed. These fields // are initialized by Finalize(). @@ -147,6 +154,10 @@ class Version { : vset_(vset), next_(this), prev_(this), refs_(0), file_to_compact_(NULL), file_to_compact_level_(-1), + ttl_trigger_compact_(NULL), + ttl_trigger_compact_level_(-1), + del_trigger_compact_(NULL), + del_trigger_compact_level_(-1), compaction_score_(-1), compaction_level_(-1) { } @@ -220,6 +231,7 @@ class VersionSet { // being compacted, or zero if there is no such log file. uint64_t PrevLogNumber() const { return prev_log_number_; } + double CompactionScore(uint64_t* timeout) const; // Pick level and inputs for a new compaction. // Returns NULL if there is no compaction to be done. // Otherwise returns a pointer to a heap-allocated object that @@ -243,22 +255,6 @@ class VersionSet { // The caller should delete the iterator when no longer needed. Iterator* MakeInputIterator(Compaction* c); - // Returns true iff some level needs a compaction. - bool NeedsCompaction() const { - Version* v = current_; - return (v->compaction_score_ >= 1) || (v->file_to_compact_ != NULL); - } - - double CompactionScore() const { - Version* v = current_; - if (v->compaction_score_ >= 1) { - return v->compaction_score_; - } else if (v->file_to_compact_ != NULL) { - return 0.1f; - } - return -1.0; - } - // Add all files listed in any live version to *live. // May also mutate some internal state. void AddLiveFiles(std::set* live); @@ -343,6 +339,8 @@ class Compaction { // Return the level that is being compacted. Inputs from "level" // and "level+1" will be merged to produce a set of "level+1" files. int level() const { return level_; } + void set_output_level(int output_level) {output_level_ = output_level;} + int output_level() const { return output_level_; } // Return the object that holds the edits to the descriptor done // by this compaction. @@ -357,6 +355,7 @@ class Compaction { // Maximum size of files to build during this compaction. uint64_t MaxOutputFileSize() const { return max_output_file_size_; } + void SetNonTrivial(bool non_trivial); // Is this a trivial compaction that can be implemented by just // moving a single input file to the next level (no merging or splitting) bool IsTrivialMove() const; @@ -389,6 +388,7 @@ class Compaction { explicit Compaction(int level); int level_; + int output_level_; // compact ouputfile should step into output_level_, use for self level compaction uint64_t max_output_file_size_; Version* input_version_; VersionEdit edit_; @@ -397,7 +397,7 @@ class Compaction { std::vector inputs_[2]; // The two sets of inputs // State used to check for number of of overlapping grandparent files - // (parent == level_ + 1, grandparent == level_ + 2) + // (parent == output_level_ + 1, grandparent == output_level_ + 2) std::vector grandparents_; size_t grandparent_index_; // Index in grandparent_starts_ bool seen_key_; // Some output key has been seen @@ -417,6 +417,9 @@ class Compaction { // If delete mark is not less than this lower_bound, do not drop it. // If compaction is not on base level, this is an empty string. std::string drop_lower_bound_; + + // support self compaction + bool force_non_trivial_; }; } // namespace leveldb diff --git a/src/leveldb/include/leveldb/compact_strategy.h b/src/leveldb/include/leveldb/compact_strategy.h index 07122912e..e29fbfd0e 100644 --- a/src/leveldb/include/leveldb/compact_strategy.h +++ b/src/leveldb/include/leveldb/compact_strategy.h @@ -39,6 +39,8 @@ class CompactStrategy { // are protected by snpashot virtual void SetSnapshot(uint64_t snapshot) = 0; + virtual bool CheckTag(const Slice& tera_key, bool* del_tag, int64_t* ttl_tag) = 0; + virtual const char* Name() const = 0; }; @@ -72,6 +74,12 @@ class DummyCompactStrategy : public CompactStrategy { int64_t* merged_num) { return false; } + + virtual bool CheckTag(const Slice& tera_key, bool* del_tag, int64_t* ttl_tag) { + *del_tag = false; + *ttl_tag = -1; + return true; + } }; // each strategy object has its own inner status or context, diff --git a/src/leveldb/include/leveldb/dfs.h b/src/leveldb/include/leveldb/dfs.h index caf1672e9..b0151a7b7 100644 --- a/src/leveldb/include/leveldb/dfs.h +++ b/src/leveldb/include/leveldb/dfs.h @@ -64,6 +64,10 @@ class Dfs { /// Returns 0 on success. virtual int32_t ListDirectory(const std::string& path, std::vector* result) = 0; + /// Returns 0 on success. + virtual int32_t LockDirectory(const std::string& path) = 0; + /// Returns 0 on success. + virtual int32_t UnlockDirectory(const std::string& path) = 0; /// Returns DfsFile handler on success, NULL on error.WithTime virtual DfsFile* OpenFile(const std::string& filename, int32_t flags) = 0; /// Returns Dfs handler on success, NULL on error. diff --git a/src/leveldb/include/leveldb/filter_policy.h b/src/leveldb/include/leveldb/filter_policy.h index 389a8c9d7..4ba4706f8 100644 --- a/src/leveldb/include/leveldb/filter_policy.h +++ b/src/leveldb/include/leveldb/filter_policy.h @@ -22,6 +22,8 @@ #include +#include "leveldb/raw_key_operator.h" + namespace leveldb { class Slice; @@ -70,6 +72,9 @@ class FilterPolicy { extern const FilterPolicy* NewBloomFilterPolicy(int bits_per_key); // bloomfilter for ttl-kv mode. extern const FilterPolicy* NewTTLKvBloomFilterPolicy(int bits_per_key); +// for bigtable mode +extern const FilterPolicy* NewRowKeyBloomFilterPolicy(int bits_per_key, + const RawKeyOperator* raw_key_operator); } diff --git a/src/leveldb/include/leveldb/options.h b/src/leveldb/include/leveldb/options.h index e4057ef1c..be78d0d30 100644 --- a/src/leveldb/include/leveldb/options.h +++ b/src/leveldb/include/leveldb/options.h @@ -292,6 +292,9 @@ struct Options { // Default: false bool ignore_corruption_in_compaction; + // If true, env::FileLock will be called during leveldb's load + bool use_file_lock; + // disable write-ahead-log bool disable_wal; @@ -306,6 +309,14 @@ struct Options { // 3). ignore sst lost bool ignore_corruption_in_open; + // Statistic: By default, if 10% entry timeout, will trigger compaction + // Default: 10 % + uint64_t ttl_percentage; + + // Statistic: delete tag's percentage in sst + // Default: 10 % + uint64_t del_percentage; + // Create an Options object with default values for all fields. Options(); }; @@ -337,6 +348,11 @@ struct ReadOptions { // Default: NULL std::set* target_lgs; + // if read a single row, optimization may be applied to this read + bool read_single_row; + std::string row_start_key; // start key of this row + std::string row_end_key; // start key of next row + // db option const Options* db_opt; @@ -345,6 +361,7 @@ struct ReadOptions { fill_cache(true), snapshot(kMaxSequenceNumber), target_lgs(NULL), + read_single_row(false), db_opt(db_option) { } ReadOptions() { diff --git a/src/leveldb/include/leveldb/raw_key_operator.h b/src/leveldb/include/leveldb/raw_key_operator.h index ed790b16c..c8868c237 100644 --- a/src/leveldb/include/leveldb/raw_key_operator.h +++ b/src/leveldb/include/leveldb/raw_key_operator.h @@ -29,6 +29,7 @@ class RawKeyOperator { TeraKeyType* type) const = 0; virtual int Compare(const Slice& key1, const Slice& key2) const = 0; + virtual const char* Name() const = 0; }; const RawKeyOperator* ReadableRawKeyOperator(); diff --git a/src/leveldb/include/nfs.h b/src/leveldb/include/nfs.h index 7dde59dfb..6c90e3f1e 100644 --- a/src/leveldb/include/nfs.h +++ b/src/leveldb/include/nfs.h @@ -386,6 +386,34 @@ struct ::dirent* Readdir(NFSDIR* dir); */ int Closedir(NFSDIR* dir); +/** + * @brief Set Dir Owner. only dir owner has create delete permission. + * @param dir + * @return + * 0 - on success + * -1 - on error + * @errno When error, the nfs errno will be set appropriately: + * NFSE_BANNED - dir is lock + * EINVAL - invalid argument + * ETIMEDOUT - access nfs service timeout + * EIO - other error + */ +int SetDirOwner(const char* path); + +/** + * @brief clear dir Owner. + * @param dir + * @return + * 0 - on success + * -1 - on error + * @errno When error, the nfs errno will be set appropriately: + * NFSE_BANNED - dir is lock + * EINVAL - invalid argument + * ETIMEDOUT - access nfs service timeout + * EIO - other error + */ +int ClearDirOwner(const char* path); + /** * @brief Create a file. The default mode is 0666. * @param path @@ -470,6 +498,20 @@ NFSFILE* Open(const char* path, const char* mode); */ int Close(NFSFILE* stream); +/** + * @brief Force Close a file. + * @param path + * @return + * 0 - on success + * -1 - on error + * Usually used to force close the file "write opened" by other client + * If file is opened by self, will close and clear open info from NFSCLient, but not NFSFILE, will memory leak + * If file is opened by other client, force release will close it, if other is writing, will reopen and .. + * Must be used very caseful + * @errno the same to Close + */ +int ForceRelease(const char* path); + /** * @brief Read size bytes to the buf pointed by ptr from the file stream. * Libnfs will assume that the offset is the finished offset you read last time. diff --git a/src/leveldb/port/port_posix.cc b/src/leveldb/port/port_posix.cc index 3bd18263f..37ba0c3bb 100644 --- a/src/leveldb/port/port_posix.cc +++ b/src/leveldb/port/port_posix.cc @@ -59,12 +59,12 @@ void CondVar::Wait() { PthreadCall("wait", pthread_cond_wait(&cv_, &mu_->mu_)); } -bool CondVar::Wait(int32_t wait_millisec) { +bool CondVar::Wait(int64_t wait_millisec) { assert(wait_millisec >= 0); struct timespec ts; struct timeval tp; gettimeofday(&tp, NULL); - uint32_t usec = tp.tv_usec + wait_millisec * 1000; + uint64_t usec = tp.tv_usec + wait_millisec * 1000; ts.tv_sec = tp.tv_sec + usec / 1000000; ts.tv_nsec = (usec % 1000000) * 1000; return (0 == pthread_cond_timedwait(&cv_, &mu_->mu_, &ts)); diff --git a/src/leveldb/port/port_posix.h b/src/leveldb/port/port_posix.h index 676fdd27c..bb80c31f9 100644 --- a/src/leveldb/port/port_posix.h +++ b/src/leveldb/port/port_posix.h @@ -109,7 +109,7 @@ class CondVar { explicit CondVar(Mutex* mu); ~CondVar(); void Wait(); - bool Wait(int32_t wait_millisec); + bool Wait(int64_t wait_millisec); void Signal(); void SignalAll(); private: diff --git a/src/leveldb/table/table.cc b/src/leveldb/table/table.cc index 12a92b4f9..c135bb15e 100644 --- a/src/leveldb/table/table.cc +++ b/src/leveldb/table/table.cc @@ -137,6 +137,132 @@ class TableIter : public Iterator { std::string largest_; }; +class IndexBlockIter : public Iterator { + public: + IndexBlockIter(const ReadOptions& opts, + Block* index_block, + FilterBlockReader* filter) + : valid_(false), + iter_(index_block->NewIterator(opts.db_opt->comparator)), + comparator_(opts.db_opt->comparator), + filter_(filter), + read_single_row_(opts.read_single_row), + row_start_key_(opts.row_start_key, kMaxSequenceNumber, kValueTypeForSeek), + row_end_key_(opts.row_end_key, kMaxSequenceNumber, kValueTypeForSeek) { + } + virtual ~IndexBlockIter() { + delete iter_; + } + virtual void Seek(const Slice& target) { + iter_->Seek(target); + SkipUnmatchedBlocksForward(); + } + virtual void SeekToFirst() { + iter_->SeekToFirst(); + SkipUnmatchedBlocksForward(); + } + virtual void SeekToLast() { + iter_->SeekToLast(); + SkipUnmatchedBlocksBackward(); + } + virtual void Next() { + iter_->Next(); + SkipUnmatchedBlocksForward(); + } + virtual void Prev() { + iter_->Prev(); + SkipUnmatchedBlocksBackward(); + } + virtual bool Valid() const { + return valid_; + } + virtual Slice key() const { + assert(Valid()); + return iter_->key(); + } + virtual Slice value() const { + assert(Valid()); + return iter_->value(); + } + virtual Status status() const { + return iter_->status(); + } + + private: + void SkipUnmatchedBlocksForward() { + valid_ = false; + while (iter_->Valid()) { + if (!read_single_row_) { + valid_ = true; + break; + } + if (!valid_index_key_.empty() && comparator_->Compare(iter_->key(), valid_index_key_) > 0) { + //Log("bloomfilter: skip block by range"); + break; + } + if (comparator_->Compare(iter_->key(), row_end_key_.Encode()) >= 0 && + (valid_index_key_.empty() || comparator_->Compare(iter_->key(), valid_index_key_) < 0)) { + valid_index_key_ = iter_->key().ToString(); + } + if (CheckFilter()) { + valid_ = true; + //Log("bloomfilter: valid block"); + break; + } + //Log("bloomfilter: skip block by bloom"); + iter_->Next(); + } + } + void SkipUnmatchedBlocksBackward() { + valid_ = false; + while (iter_->Valid()) { + if (!read_single_row_) { + valid_ = true; + break; + } + if (comparator_->Compare(iter_->key(), row_start_key_.Encode()) < 0) { + //Log("bloomfilter: skip block by range"); + break; + } + if (comparator_->Compare(iter_->key(), row_end_key_.Encode()) >= 0 && + (valid_index_key_.empty() || comparator_->Compare(iter_->key(), valid_index_key_) < 0)) { + valid_index_key_ = iter_->key().ToString(); + } + if (CheckFilter()) { + valid_ = true; + //Log("bloomfilter: valid block"); + break; + } + //Log("bloomfilter: skip block by bloom"); + iter_->Prev(); + } + } + bool CheckFilter() { + assert(iter_->Valid()); + Slice handle_value = iter_->value(); + BlockHandle handle; + if (!read_single_row_ || + filter_ == NULL || + !handle.DecodeFrom(&handle_value).ok() || + filter_->KeyMayMatch(handle.offset(), row_start_key_.Encode())) { + return true; + } + return false; + } + + private: + bool valid_; + Iterator* iter_; + const Comparator* comparator_; + FilterBlockReader* filter_; + bool read_single_row_; + InternalKey row_start_key_; + InternalKey row_end_key_; + + // smallest index key which is larger than row end key + std::string valid_index_key_; +}; + Status Table::Open(const Options& options, RandomAccessFile* file, uint64_t size, @@ -314,9 +440,7 @@ Iterator* Table::BlockReader(void* arg, } Iterator* Table::NewIterator(const ReadOptions& options) const { - return NewTwoLevelIterator( - rep_->index_block->NewIterator(options.db_opt->comparator), - &Table::BlockReader, const_cast(this), options); + return NewIterator(options, Slice(), Slice()); } Iterator* Table::NewIterator(const ReadOptions& options, @@ -324,9 +448,9 @@ Iterator* Table::NewIterator(const ReadOptions& options, const Slice& largest) const { return new TableIter( NewTwoLevelIterator( - rep_->index_block->NewIterator(options.db_opt->comparator), + new IndexBlockIter(options, rep_->index_block, rep_->filter), &Table::BlockReader, const_cast(this), options), - options.db_opt->comparator, smallest, largest); + options.db_opt->comparator, smallest, largest); } Status Table::InternalGet(const ReadOptions& options, const Slice& k, diff --git a/src/leveldb/util/bloom.cc b/src/leveldb/util/bloom.cc index cae5135e3..46bc022ee 100644 --- a/src/leveldb/util/bloom.cc +++ b/src/leveldb/util/bloom.cc @@ -8,6 +8,7 @@ #include "leveldb/filter_policy.h" +#include "leveldb/raw_key_operator.h" #include "leveldb/slice.h" #include "util/hash.h" @@ -99,6 +100,47 @@ class BloomFilterPolicy : public FilterPolicy { return true; } }; + +class RowKeyBloomFilterPolicy : public BloomFilterPolicy { + private: + const RawKeyOperator* raw_key_operator_; + + public: + explicit RowKeyBloomFilterPolicy(int bits_per_key, BloomHashMethod hash_method, + const RawKeyOperator* raw_key_operator) + : BloomFilterPolicy(bits_per_key, hash_method), + raw_key_operator_(raw_key_operator) { + } + + virtual const char* Name() const { + return "tera.RowKeyBloomFilter"; + } + + virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const { + Slice* row_keys = new Slice[n]; + int row_key_num = 0; + + for (int i = 0; i < n; i++) { + Slice row_key; + raw_key_operator_->ExtractTeraKey(keys[i], &row_key, NULL, NULL, NULL, NULL); + if (row_key_num == 0 || row_key.compare(row_keys[row_key_num - 1]) != 0) { + row_keys[row_key_num++] = row_key; + } + } + BloomFilterPolicy::CreateFilter(row_keys, row_key_num, dst); + delete[] row_keys; + } + + virtual bool KeyMayMatch(const Slice& key, const Slice& bloom_filter) const { + Slice row_key; + if (raw_key_operator_->ExtractTeraKey(key, &row_key, NULL, NULL, NULL, NULL)) { + return BloomFilterPolicy::KeyMayMatch(row_key, bloom_filter); + } else { + return true; + } + } +}; + } const FilterPolicy* NewBloomFilterPolicy(int bits_per_key) { @@ -109,4 +151,8 @@ const FilterPolicy* NewTTLKvBloomFilterPolicy(int bits_per_key) { return new BloomFilterPolicy(bits_per_key, TTLKvBloomHash); } +const FilterPolicy* NewRowKeyBloomFilterPolicy(int bits_per_key, const RawKeyOperator* raw_key_operator) { + return new RowKeyBloomFilterPolicy(bits_per_key, BuiltInBloomHash, raw_key_operator); +} + } // namespace leveldb diff --git a/src/leveldb/util/env_dfs.cc b/src/leveldb/util/env_dfs.cc index 8450c12a6..53fde1804 100644 --- a/src/leveldb/util/env_dfs.cc +++ b/src/leveldb/util/env_dfs.cc @@ -308,6 +308,12 @@ class DfsWritableFile: public WritableFile { } }; +class DfsFileLock : public FileLock { +public: + DfsFileLock(const std::string& path) : dir_path_(path) {} + std::string dir_path_; +}; + DfsEnv::DfsEnv(Dfs* dfs) : EnvWrapper(Env::Default()), dfs_(dfs) { } @@ -394,11 +400,16 @@ Status DfsEnv::GetChildren(const std::string& path, std::vector* re { tera::AutoCounter ac(&dfs_list_hang_counter, "ListDirectory", path.c_str()); dfs_list_counter.Inc(); - if (0 != dfs_->ListDirectory(path, result)) { - Log("GetChildren call with path not exists: %s\n", path.data()); + if (0 == dfs_->ListDirectory(path, result)) { + return Status::OK(); + } + if (errno == ETIMEDOUT) { + Log("[env_dfs] GetChildren timeout: %s\n", path.c_str()); + return Status::TimeOut("ListDirectory", path); + } else { + Log("[env_dfs] GetChildren call with path not exists: %s\n", path.data()); return Status::IOError("Path not exist", path); } - return Status::OK(); } bool DfsEnv::CheckDelete(const std::string& fname, std::vector* flags) @@ -490,13 +501,29 @@ Status DfsEnv::RenameFile(const std::string& src, const std::string& target) Status DfsEnv::LockFile(const std::string& fname, FileLock** lock) { - *lock = NULL; + std::size_t found = fname.find_last_of("/"); + if (found == std::string::npos) { + return Status::IOError("lock path error: " + fname); + } + std::string dir_path(fname.c_str(), found); + if (dfs_->LockDirectory(dir_path) != 0) { + return Status::IOError("lock " + dir_path); + } + *lock = new DfsFileLock(dir_path); return Status::OK(); } Status DfsEnv::UnlockFile(FileLock* lock) { - return Status::OK(); + if (DfsFileLock* dfs_lock = dynamic_cast(lock)) { + const std::string& dir_path = dfs_lock->dir_path_; + dfs_->UnlockDirectory(dir_path.c_str()); + delete lock; + return Status::OK(); + } else { + Log("[env_dfs]: wrong file lock at %p\n", lock); + abort(); + } } static bool inited = false; diff --git a/src/leveldb/util/env_posix.cc b/src/leveldb/util/env_posix.cc index ae3d59bb2..2918a73f1 100644 --- a/src/leveldb/util/env_posix.cc +++ b/src/leveldb/util/env_posix.cc @@ -753,6 +753,7 @@ class PosixEnv : public Env { } locks_.Remove(my_lock->name_); close(my_lock->fd_); + remove(my_lock->name_.c_str()); delete my_lock; return result; } diff --git a/src/leveldb/util/hdfs.cc b/src/leveldb/util/hdfs.cc index b1c4aab32..b90fea36e 100644 --- a/src/leveldb/util/hdfs.cc +++ b/src/leveldb/util/hdfs.cc @@ -223,5 +223,16 @@ int32_t Hdfs::ListDirectory(const std::string& path, return 0; } +int32_t Hdfs::LockDirectory(const std::string& path) { + // no implementation + return -1; +} + +int32_t Hdfs::UnlockDirectory(const std::string& path) { + // no implementation + return -1; } + +} + /* vim: set expandtab ts=2 sw=2 sts=2 tw=100: */ diff --git a/src/leveldb/util/hdfs.h b/src/leveldb/util/hdfs.h index 57479b3c0..81ed269ac 100644 --- a/src/leveldb/util/hdfs.h +++ b/src/leveldb/util/hdfs.h @@ -46,6 +46,8 @@ class Hdfs : public Dfs { int32_t Rename(const std::string& from, const std::string& to); int32_t Copy(const std::string& from, const std::string& to); int32_t ListDirectory(const std::string& path, std::vector* result); + int32_t LockDirectory(const std::string& path); + int32_t UnlockDirectory(const std::string& path); DfsFile* OpenFile(const std::string& filename, int32_t flags); private: @@ -88,6 +90,8 @@ class Hdfs2 : public Dfs { int32_t Rename(const std::string& from, const std::string& to); int32_t Copy(const std::string& from, const std::string& to); int32_t ListDirectory(const std::string& path, std::vector* result); + int32_t LockDirectory(const std::string& path); + int32_t UnlockDirectory(const std::string& path); DfsFile* OpenFile(const std::string& filename, int32_t flags); private: diff --git a/src/leveldb/util/hdfs2.cc b/src/leveldb/util/hdfs2.cc index 357743253..fa3a8902c 100644 --- a/src/leveldb/util/hdfs2.cc +++ b/src/leveldb/util/hdfs2.cc @@ -247,6 +247,17 @@ void* Hdfs2::GetFSHandle(const std::string& path) { return fs_list_[index]; } +int32_t Hdfs2::LockDirectory(const std::string& path) { + // no implementation + return -1; +} + +int32_t Hdfs2::UnlockDirectory(const std::string& path) { + // no implementation + return -1; +} + + } // namespace leveldb /* vim: set expandtab ts=2 sw=2 sts=2 tw=100: */ diff --git a/src/leveldb/util/histogram.cc b/src/leveldb/util/histogram.cc index 12fc1e0b8..d636f9c73 100644 --- a/src/leveldb/util/histogram.cc +++ b/src/leveldb/util/histogram.cc @@ -35,6 +35,7 @@ const double Histogram::kBucketLimit[kNumBuckets] = { }; void Histogram::Clear() { + MutexLock lock(&mutex_); min_ = kBucketLimit[kNumBuckets-1]; max_ = 0; num_ = 0; @@ -46,6 +47,7 @@ void Histogram::Clear() { } void Histogram::Add(double value) { + MutexLock lock(&mutex_); // Linear search is fast enough for our usage in db_bench int b = 0; while (b < kNumBuckets - 1 && kBucketLimit[b] <= value) { @@ -60,6 +62,7 @@ void Histogram::Add(double value) { } void Histogram::Merge(const Histogram& other) { + MutexLock lock(&mutex_); if (other.min_ < min_) min_ = other.min_; if (other.max_ > max_) max_ = other.max_; num_ += other.num_; @@ -75,6 +78,7 @@ double Histogram::Median() const { } double Histogram::Percentile(double p) const { + MutexLock lock(&mutex_); double threshold = num_ * (p / 100.0); double sum = 0; for (int b = 0; b < kNumBuckets; b++) { @@ -96,11 +100,13 @@ double Histogram::Percentile(double p) const { } double Histogram::Average() const { + MutexLock lock(&mutex_); if (num_ == 0.0) return 0; return sum_ / num_; } double Histogram::StandardDeviation() const { + MutexLock lock(&mutex_); if (num_ == 0.0) return 0; double variance = (sum_squares_ * num_ - sum_ * sum_) / (num_ * num_); return sqrt(variance); diff --git a/src/leveldb/util/histogram.h b/src/leveldb/util/histogram.h index ecdc726e6..f60602efd 100644 --- a/src/leveldb/util/histogram.h +++ b/src/leveldb/util/histogram.h @@ -10,12 +10,14 @@ #define STORAGE_LEVELDB_UTIL_HISTOGRAM_H_ #include +#include "port/port_posix.h" +#include "util/mutexlock.h" namespace leveldb { class Histogram { public: - Histogram() { } + Histogram() { Clear(); } ~Histogram() { } void Clear(); @@ -25,6 +27,7 @@ class Histogram { std::string ToString() const; private: + mutable port::Mutex mutex_; double min_; double max_; double num_; diff --git a/src/leveldb/util/nfs.cc b/src/leveldb/util/nfs.cc index 44a3f3853..4ebe8ab32 100644 --- a/src/leveldb/util/nfs.cc +++ b/src/leveldb/util/nfs.cc @@ -27,6 +27,8 @@ static int (*nfsRmdir)(const char* path); static nfs::NFSDIR* (*nfsOpendir)(const char* path); static struct ::dirent* (*nfsReaddir)(nfs::NFSDIR* dir); static int (*nfsClosedir)(nfs::NFSDIR* dir); +static int (*nfsSetDirOwner)(const char* path); +static int (*nfsClearDirOwner)(const char* path); static int (*nfsStat)(const char* path, struct ::stat* stat); static int (*nfsUnlink)(const char* path); @@ -35,6 +37,7 @@ static int (*nfsRename)(const char* oldpath, const char* newpath); static nfs::NFSFILE* (*nfsOpen)(const char* path, const char* mode); static int (*nfsClose)(nfs::NFSFILE* stream); +static int (*nfsForceRelease)(const char* path); static ssize_t (*nfsRead)(nfs::NFSFILE* stream, void* ptr, size_t size); static ssize_t (*nfsPRead)(nfs::NFSFILE* stream, void* ptr, size_t size, @@ -57,6 +60,19 @@ void* ResolveSymbol(void* dl, const char* sym) { fprintf(stderr, "libnfs.so does not support federation\n"); return NULL; } + if (strcmp(sym,"SetDirOwner") == 0 && error != NULL) { + fprintf(stderr, "libnfs.so does not support SetDirOwner\n"); + return NULL; + } + if (strcmp(sym,"ClearDirOwner") == 0 && error != NULL) { + fprintf(stderr, "libnfs.so does not support ClearDirOwner\n"); + return NULL; + } + + if (strcmp(sym,"ForceRelease") == 0 && error != NULL) { + fprintf(stderr, "libnfs.so does not support ForceRelease\n"); + return NULL; + } if (error != NULL) { fprintf(stderr, "resolve symbol %s from libnfs.so error: %s\n", sym, error); @@ -84,12 +100,15 @@ void Nfs::LoadSymbol() { *(void**)(&nfsOpendir) = ResolveSymbol(dl, "Opendir"); *(void**)(&nfsReaddir) = ResolveSymbol(dl, "Readdir"); *(void**)(&nfsClosedir) = ResolveSymbol(dl, "Closedir"); + *(void**)(&nfsSetDirOwner) = ResolveSymbol(dl, "SetDirOwner"); + *(void**)(&nfsClearDirOwner) = ResolveSymbol(dl, "ClearDirOwner"); *(void**)(&nfsStat) = ResolveSymbol(dl, "Stat"); *(void**)(&nfsUnlink) = ResolveSymbol(dl, "Unlink"); *(void**)(&nfsAccess) = ResolveSymbol(dl, "Access"); *(void**)(&nfsRename) = ResolveSymbol(dl, "Rename"); *(void**)(&nfsOpen) = ResolveSymbol(dl, "Open"); *(void**)(&nfsClose) = ResolveSymbol(dl, "Close"); + *(void**)(&nfsForceRelease) = ResolveSymbol(dl, "ForceRelease"); *(void**)(&nfsRead) = ResolveSymbol(dl, "Read"); *(void**)(&nfsPRead) = ResolveSymbol(dl, "PRead"); *(void**)(&nfsWrite) = ResolveSymbol(dl, "Write"); @@ -257,7 +276,9 @@ int32_t Nfs::Exists(const std::string& filename) { int32_t retval = (*nfsAccess)(filename.c_str(), F_OK); if (retval != 0) { errno = (*nfsGetErrno)(); + int errno_saved = errno; fprintf(stderr, "[%s] Exists %s fail: %d\n", common::timer::get_curtime_str().c_str(), filename.c_str(), errno); + errno = errno_saved; } return retval; } @@ -314,7 +335,9 @@ int32_t Nfs::ListDirectory(const std::string& path, nfs::NFSDIR* dir = (*nfsOpendir)(path.c_str()); if (NULL == dir) { errno = (*nfsGetErrno)(); + int errno_saved = errno; fprintf(stderr, "[%s] Opendir %s fail: %d\n", common::timer::get_curtime_str().c_str(), path.c_str(), errno); + errno = errno_saved; return -1; } struct ::dirent* dir_info = NULL; @@ -325,14 +348,51 @@ int32_t Nfs::ListDirectory(const std::string& path, } } errno = (*nfsGetErrno)(); + int errno_saved = errno; if (0 != errno) { fprintf(stderr, "[%s] List %s error: %d\n", common::timer::get_curtime_str().c_str(), path.c_str(), errno); (*nfsClosedir)(dir); + errno = errno_saved; return -1; } (*nfsClosedir)(dir); return 0; } +int32_t Nfs::LockDirectory(const std::string& path) { + int ret = (*nfsSetDirOwner)(path.c_str()); + if (ret != 0) { + fprintf(stderr, "[LockDirectory] lock dir %s fail, errno: %d", + path.c_str(), errno); + return -1; + } + + std::vector files; + ret = ListDirectory(path, &files); + if (ret != 0) { + fprintf(stderr, "[LockDirectory] list dir %s fail, errno: %d", + path.c_str(), errno); + return -1; + } + + for (size_t i = 0; i < files.size(); i++) { + if (files[i].find(".log") != std::string::npos || + files[i].find("MANIFEST") != std::string::npos) { + std::string file_name = path + "/" + files[i]; + ret = (*nfsForceRelease)(file_name.c_str()); + if (ret != 0) { + fprintf(stderr, "[LockDirectory] force release file %s fail, errno: %d", + file_name.c_str(), errno); + return -1; + } + } + } + return 0; +} + +int32_t Nfs::UnlockDirectory(const std::string& path) { + return (*nfsClearDirOwner)(path.c_str()); +} + } /* vim: set expandtab ts=2 sw=2 sts=2 tw=100: */ diff --git a/src/leveldb/util/nfs.h b/src/leveldb/util/nfs.h index e7ffc8352..b80dd0316 100644 --- a/src/leveldb/util/nfs.h +++ b/src/leveldb/util/nfs.h @@ -48,6 +48,8 @@ class Nfs : public Dfs { int32_t Rename(const std::string& from, const std::string& to); int32_t Copy(const std::string& from, const std::string& to); int32_t ListDirectory(const std::string& path, std::vector* result); + int32_t LockDirectory(const std::string& path); + int32_t UnlockDirectory(const std::string& path); DfsFile* OpenFile(const std::string& filename, int32_t flags); private: Nfs(); diff --git a/src/leveldb/util/nfs_wrapper.h b/src/leveldb/util/nfs_wrapper.h index fde4ebd63..b51fe7cb6 100644 --- a/src/leveldb/util/nfs_wrapper.h +++ b/src/leveldb/util/nfs_wrapper.h @@ -40,6 +40,10 @@ struct ::dirent* Readdir(nfs::NFSDIR* dir); int Closedir(nfs::NFSDIR* dir); +int SetDirOwner(const char* path); + +int ClearDirOwner(const char* path); + int Create(const char* path); int Unlink(const char* path); @@ -48,6 +52,8 @@ nfs::NFSFILE* Open(const char* path, const char* mode); int Close(nfs::NFSFILE* stream); +int ForceRelease(const char* path); + ssize_t Read(nfs::NFSFILE* stream, void* ptr, size_t size); ssize_t PRead(nfs::NFSFILE* stream, void* ptr, size_t size, uint64_t offset); diff --git a/src/leveldb/util/options.cc b/src/leveldb/util/options.cc index 98bea7af3..ecd11b57e 100644 --- a/src/leveldb/util/options.cc +++ b/src/leveldb/util/options.cc @@ -49,8 +49,11 @@ Options::Options() sst_size(kDefaultSstSize), verify_checksums_in_compaction(false), ignore_corruption_in_compaction(false), + use_file_lock(true), disable_wal(false), - ignore_corruption_in_open(false) { + ignore_corruption_in_open(false), + ttl_percentage(99), + del_percentage(20) { } } // namespace leveldb diff --git a/src/leveldb/util/raw_key_operator.cc b/src/leveldb/util/raw_key_operator.cc index d78c40f86..9d5b5d3dc 100644 --- a/src/leveldb/util/raw_key_operator.cc +++ b/src/leveldb/util/raw_key_operator.cc @@ -102,6 +102,10 @@ class ReadableRawKeyOperatorImpl : public RawKeyOperator { virtual int Compare(const Slice& key1, const Slice& key2) const { return key1.compare(key2); } + + const char* Name() const { + return "tera.RawKeyOperator.readable"; + } }; /** @@ -228,6 +232,10 @@ class BinaryRawKeyOperatorImpl : public RawKeyOperator { Slice ts_type2(data2 + size2 - 12, 8); return ts_type1.compare(ts_type2); } + + const char* Name() const { + return "tera.RawKeyOperator.binary"; + } }; // support KV-pair with TTL, Key's format : @@ -267,6 +275,10 @@ class KvRawKeyOperatorImpl : public RawKeyOperator { leveldb::Slice key2_rowkey(key2.data(), key2.size() - sizeof(int64_t)); return key1_rowkey.compare(key2_rowkey); } + + const char* Name() const { + return "tera.RawKeyOperator.kv"; + } }; static pthread_once_t once = PTHREAD_ONCE_INIT; diff --git a/src/leveldb/util/string_ext.cc b/src/leveldb/util/string_ext.cc index b34f77f42..1f8c67510 100644 --- a/src/leveldb/util/string_ext.cc +++ b/src/leveldb/util/string_ext.cc @@ -59,21 +59,15 @@ void SplitStringEnd(const std::string& full, std::string* begin_part, void SplitStringStart(const std::string& full, std::string* begin_part, std::string* end_part, std::string delim) { std::string::size_type pos = full.find_first_of(delim); - if (pos != std::string::npos && pos != 0) { + if (pos == std::string::npos || (pos < full.size() - 1)) { if (end_part) { *end_part = full.substr(pos + 1); } + } + if (pos != std::string::npos && pos >= 1) { if (begin_part) { *begin_part = full.substr(0, pos); } - } else if (pos == 0) { - if (end_part) { - *end_part = full; - } - } else if (pos != std::string::npos) { - if (begin_part) { - *begin_part = full; - } } } diff --git a/src/leveldb/util/testutil.h b/src/leveldb/util/testutil.h index 9b452b378..281f25299 100644 --- a/src/leveldb/util/testutil.h +++ b/src/leveldb/util/testutil.h @@ -42,7 +42,7 @@ class ErrorEnv : public EnvWrapper { virtual Status NewWritableFile(const std::string& fname, WritableFile** result) { - if (writable_file_error_) { + if (writable_file_error_ && num_writable_file_errors_ < 10) { ++num_writable_file_errors_; *result = NULL; return Status::IOError(fname, "fake error"); diff --git a/src/master/availability.cc b/src/master/availability.cc index 1e011a5ca..998c14a8e 100644 --- a/src/master/availability.cc +++ b/src/master/availability.cc @@ -24,7 +24,7 @@ DECLARE_string(tera_master_meta_table_path); namespace tera { namespace master { -TabletAvailability::TabletAvailability(boost::shared_ptr t) +TabletAvailability::TabletAvailability(std::shared_ptr t) : tablet_manager_(t) { start_ts_ = get_micros(); } @@ -34,22 +34,41 @@ void TabletAvailability::AddNotReadyTablet(const std::string& path) { int64_t ts = get_micros(); tablets_.insert(std::pair(path, ts)); + if (tablets_hist_cost_[path].start_ts > 0) { + VLOG(10) << "notready again " << path; + return; + } + tablets_hist_cost_[path].start_ts = ts; tablets_hist_cost_[path].notready_num++; + VLOG(10) << "addnotready " << path + << ", total_cost " << tablets_hist_cost_[path].total + << ", start_ts " << tablets_hist_cost_[path].start_ts + << ", notready " << tablets_hist_cost_[path].notready_num + << ", reready " << tablets_hist_cost_[path].reready_num; } void TabletAvailability::EraseNotReadyTablet(const std::string& path) { MutexLock lock(&mutex_); tablets_.erase(path); + if (tablets_hist_cost_.find(path) == tablets_hist_cost_.end() || + tablets_hist_cost_[path].start_ts == 0) { + VLOG(10) << "reready again " << path; + return; + } + int64_t ts = get_micros(); if (tablets_hist_cost_[path].start_ts > 0) { tablets_hist_cost_[path].total += ts - tablets_hist_cost_[path].start_ts; - } else { - tablets_hist_cost_[path].total += ts - start_ts_; } tablets_hist_cost_[path].start_ts = 0; tablets_hist_cost_[path].reready_num++; + VLOG(10) << "delnotready " << path + << ", total_cost " << tablets_hist_cost_[path].total + << ", start_ts " << tablets_hist_cost_[path].start_ts + << ", notready " << tablets_hist_cost_[path].notready_num + << ", reready " << tablets_hist_cost_[path].reready_num; } static std::string GetNameFromPath(const std::string& path) { @@ -113,7 +132,6 @@ void TabletAvailability::LogAvailability() { int64_t total_time = 0, all_time = start - start_ts_; start_ts_ = start; - int64_t nr_notready_tablets = tablets_hist_cost_.size(); int64_t total_notready = 0, total_reready = 0; std::map::iterator stat_it; for (stat_it = tablets_hist_cost_.begin(); @@ -136,6 +154,7 @@ void TabletAvailability::LogAvailability() { tablets_hist_cost_.erase(stat_it++); } } + int64_t nr_notready_tablets = tablets_hist_cost_.size(); LOG(INFO) << "[availability][tablet_staticstic] time_interval: " << all_time / 1000 << ", notready_time: " << total_time / 1000 << ", total_time: " << (all_time * all_tablets) / 1000 diff --git a/src/master/availability.h b/src/master/availability.h index 17becbd84..ddbe6a5f4 100644 --- a/src/master/availability.h +++ b/src/master/availability.h @@ -23,14 +23,14 @@ struct TimeStatistic { }; class TabletAvailability { public: - TabletAvailability(boost::shared_ptr t); + TabletAvailability(std::shared_ptr t); void LogAvailability(); void AddNotReadyTablet(const std::string& id); void EraseNotReadyTablet(const std::string& id); private: Mutex mutex_; - boost::shared_ptr tablet_manager_; + std::shared_ptr tablet_manager_; std::map tablets_; int64_t start_ts_; diff --git a/src/master/gc_strategy.cc b/src/master/gc_strategy.cc index f5eed5d20..9eba59fcf 100644 --- a/src/master/gc_strategy.cc +++ b/src/master/gc_strategy.cc @@ -5,7 +5,6 @@ #include "master/gc_strategy.h" #include -#include #include "db/filename.h" #include "io/utils_leveldb.h" @@ -18,7 +17,7 @@ DECLARE_int32(tera_garbage_collect_debug_log); namespace tera { namespace master { -BatchGcStrategy::BatchGcStrategy (boost::shared_ptr tablet_manager) +BatchGcStrategy::BatchGcStrategy (std::shared_ptr tablet_manager) : tablet_manager_(tablet_manager), file_total_num_(0), file_delete_num_(0) {} @@ -38,7 +37,7 @@ bool BatchGcStrategy::PreQuery () { continue; } GcTabletSet& tablet_set = gc_tablets_[tables[i]->GetTableName()]; - if (!tables[i]->GetTabletsForGc(&tablet_set.first, &tablet_set.second)) { + if (!tables[i]->GetTabletsForGc(&tablet_set.first, &tablet_set.second, false)) { // tablet not ready or there is none dead tablets gc_tablets_.erase(tables[i]->GetTableName()); continue; @@ -140,7 +139,7 @@ void BatchGcStrategy::CollectDeadTabletsFiles() { } } -void BatchGcStrategy::CollectSingleDeadTablet(const std::string& tablename, uint64_t tabletnum) { +bool BatchGcStrategy::CollectSingleDeadTablet(const std::string& tablename, uint64_t tabletnum) { std::string tablepath = FLAGS_tera_tabletnode_path_prefix + tablename; std::string tablet_path = leveldb::GetTabletPathFromNum(tablepath, tabletnum); leveldb::Env* env = io::LeveldbBaseEnv(); @@ -150,7 +149,7 @@ void BatchGcStrategy::CollectSingleDeadTablet(const std::string& tablename, uint if (children.size() == 0) { LOG(INFO) << "[gc] delete empty tablet dir: " << tablet_path; env->DeleteDir(tablet_path); - return; + return false; } for (size_t lg = 0; lg < children.size(); ++lg) { std::string lg_path = tablet_path + "/" + children[lg]; @@ -203,6 +202,7 @@ void BatchGcStrategy::CollectSingleDeadTablet(const std::string& tablename, uint file_set[lg_num].insert(full_number); } } + return true; } void BatchGcStrategy::DeleteObsoleteFiles() { @@ -223,7 +223,7 @@ void BatchGcStrategy::DeleteObsoleteFiles() { } } -IncrementalGcStrategy::IncrementalGcStrategy(boost::shared_ptr tablet_manager) +IncrementalGcStrategy::IncrementalGcStrategy(std::shared_ptr tablet_manager) : tablet_manager_(tablet_manager), last_gc_time_(std::numeric_limits::max()), max_ts_(std::numeric_limits::max()) {} @@ -241,7 +241,9 @@ bool IncrementalGcStrategy::PreQuery () { live_tablet_files_.insert(std::make_pair(table_name, tablet_files)); std::set live_tablets, dead_tablets; - tables[i]->GetTabletsForGc(&live_tablets, &dead_tablets); + if (!tables[i]->GetTabletsForGc(&live_tablets, &dead_tablets, true)) { + continue; + } std::set::iterator it; // update dead tablets for (it = dead_tablets.begin(); it != dead_tablets.end(); ++it) { @@ -249,8 +251,13 @@ bool IncrementalGcStrategy::PreQuery () { TabletFileSet tablet_file_set(get_micros() / 1000000, 0); bool ret = temp_tablet_files.insert(std::make_pair(*it, tablet_file_set)).second; if (ret) { - VLOG(12) << "[gc] newly dead talbet " << table_name << " " << *it; - CollectSingleDeadTablet(table_name, *it); + VLOG(10) << "[gc] newly dead talbet: " << leveldb::GetTabletPathFromNum(table_name, *it); + if (!CollectSingleDeadTablet(table_name, *it)) { + // collect from DFS fails, so rollback memory status, retry in the next time + assert(dead_tablet_files_[table_name].erase(*it) == 1); + } + } else { + VLOG(20) << "[gc] old dead talbet: " << leveldb::GetTabletPathFromNum(table_name, *it); } } @@ -302,8 +309,8 @@ void IncrementalGcStrategy::ProcessQueryCallbackForGc(QueryResponse* response) { ready_tables.find(table_name) == ready_tables.end()) { continue; } - VLOG(12) << "[gc] see live table " << table_name; int64_t tablet_number = static_cast(leveldb::GetTabletNumFromPath(meta.path())); + VLOG(15) << "[gc] see live tablet " << leveldb::GetTabletPathFromNum(table_name, tablet_number); if (live_tablet_files_[table_name].find(tablet_number) == live_tablet_files_[table_name].end()) continue; live_tablet_files_[table_name][tablet_number].ready_time_ = get_micros() / 1000000; } @@ -326,7 +333,7 @@ void IncrementalGcStrategy::ProcessQueryCallbackForGc(QueryResponse* response) { leveldb::ParseFullFileNumber(file_number, &tablet_number, &file); if (dead_tablet_files_[table_name].find(tablet_number) == dead_tablet_files_[table_name].end()) { - VLOG(12) << "[gc] skip newly dead tablet " << tablet_number; + VLOG(12) << "[gc] skip live tablet " << tablet_number; continue; } TabletFileSet tablet_file_set; @@ -335,9 +342,11 @@ void IncrementalGcStrategy::ProcessQueryCallbackForGc(QueryResponse* response) { LgFileSet lg_files; temp_tablet_file_set.files_.insert(std::make_pair(lg_no, lg_files)); temp_tablet_file_set.files_[lg_no].live_files_.insert(file_number); - VLOG(12) << "[gc] insert live file " << tablet_number << "/" << lg_no << "/" << file; + VLOG(12) << "[gc] insert live file " << leveldb::GetTabletPathFromNum(table_name, tablet_number) << "/" << lg_no << "/" << file; const LgFileSet& check = ((dead_tablet_files_[table_name][tablet_number]).files_)[lg_no]; - CHECK(check.storage_files_.find(file_number) != check.storage_files_.end()) << "[gc] insert error"; + if (check.storage_files_.find(file_number) == check.storage_files_.end()) { + LOG(WARNING) << "[gc] insert error " << leveldb::GetTabletPathFromNum(table_name, tablet_number) << "/" << lg_no << "/" << file; + } } } // update live files in dead tablets @@ -346,7 +355,7 @@ void IncrementalGcStrategy::ProcessQueryCallbackForGc(QueryResponse* response) { for (; tablet_it != temp_tablet_files.end(); ++tablet_it) { uint64_t tablet_number = tablet_it->first; if (dead_tablets.find(tablet_number) == dead_tablets.end()) { - VLOG(12) << "[gc] skip tablet " << table_name << "/" << tablet_number; + VLOG(12) << "[gc] skip live tablet " << table_name << "/" << tablet_number; continue; } std::map& live_lg = (tablet_it->second).files_; @@ -359,7 +368,7 @@ void IncrementalGcStrategy::ProcessQueryCallbackForGc(QueryResponse* response) { for (std::set::iterator it = live_lg[lg_no].live_files_.begin(); it != live_lg[lg_no].live_files_.end(); ++it) { dead_lg[lg_no].live_files_.insert(*it); } - VLOG(12) << "[gc] copy " << tablet_number << "-" << lg_no; + VLOG(12) << "[gc] dead tablet's live lg: " << leveldb::GetTabletPathFromNum(table_name, tablet_number) << "/" << lg_no; } } } @@ -407,17 +416,20 @@ void IncrementalGcStrategy::DeleteTableFiles(const std::string& table_name) { } } - VLOG(12) << "[gc] earliest ready time " << earliest_ready_time; + if (earliest_ready_time != max_ts_) { + VLOG(12) << "[gc] earliest ready time " << earliest_ready_time << " : " << common::timer::get_time_str(earliest_ready_time); + } else { + VLOG(12) << "[gc] " << table_name << "'s tablets not ready"; + } std::set gc_tablets; for (tablet_it = dead_tablets.begin(); tablet_it != dead_tablets.end(); ++tablet_it) { if (tablet_it->second.dead_time_ < earliest_ready_time) { gc_tablets.insert(tablet_it->first); - VLOG(12) << "[gc] push back gc tablet " << tablet_it->first; + VLOG(12) << "[gc] will gc tablet: " << leveldb::GetTabletPathFromNum(table_name, tablet_it->first); } } - std::set::iterator gc_it = gc_tablets.begin(); - for (; gc_it != gc_tablets.end();) { + for (std::set::iterator gc_it = gc_tablets.begin(); gc_it != gc_tablets.end();) { std::map& lg_files = dead_tablets[*gc_it].files_; std::map::iterator lg_it = lg_files.begin(); std::string tablet_path = leveldb::GetTabletPathFromNum(table_path, *gc_it); @@ -434,14 +446,20 @@ void IncrementalGcStrategy::DeleteTableFiles(const std::string& table_name) { for (std::set::iterator it = lg_file_set.live_files_.begin(); it != lg_file_set.live_files_.end(); ++it) { uint64_t file_no; leveldb::ParseFullFileNumber(*it, NULL, &file_no); - debug_str += " " + boost::lexical_cast(file_no); + debug_str += " " + std::to_string(file_no); } - VLOG(12) << "[gc] live = " << debug_str; + // VLOG(12) << "[gc] live = " << debug_str; LOG(INFO) << "[gc] delete: " << file_path; - env->DeleteFile(file_path); - lg_file_set.storage_files_.erase(file_it++); + if (env->DeleteFile(file_path).ok()) { + lg_file_set.storage_files_.erase(file_it++); + } else { + ++file_it; + // do nothing, try to delete next time + // TODO: if retry times > MAX ? + // TODO: if failed due to timeout but delete ok in DFS, it will always retry + } } else { - file_it++; + ++file_it; } } if (lg_file_set.storage_files_.size() == 0) { @@ -449,42 +467,58 @@ void IncrementalGcStrategy::DeleteTableFiles(const std::string& table_name) { uint64_t full_number = *(lg_file_set.live_files_.begin()); uint64_t tablet_number, file_number; leveldb::ParseFullFileNumber(full_number, &tablet_number, &file_number); - LOG(ERROR) << "still has live files: " << tablet_number << "/" << lg_it->first << "/" << file_number; - assert(0); + LOG(ERROR) << "[gc] empty tablet still has live files: " << tablet_number << "/" << lg_it->first << "/" << file_number; + } else { + std::string lg_str = std::to_string(lg_it->first); + std::string lg_path = tablet_path + "/" + lg_str; + LOG(INFO) << "[gc] delete empty lg dir: " << lg_path; + if (io::DeleteEnvDir(lg_path)) { + lg_files.erase(lg_it++); + } else { + ++lg_it; + // do nothing, try to delete next time + // TODO: iff retry times > MAX ? + // TODO: if failed due to timeout but delete ok in DFS, it will always retry + } } - std::string lg_str = boost::lexical_cast(lg_it->first); - std::string lg_path = tablet_path + "/" + lg_str; - LOG(INFO) << "[gc] delete empty lg dir: " << lg_path; - env->DeleteDir(lg_path); - lg_files.erase(lg_it++); } else { - lg_it++; + ++lg_it; } } if (lg_files.size() == 0) { LOG(INFO) << "[gc] delete empty tablet dir: " << tablet_path; - env->DeleteDir(tablet_path); - dead_tablets.erase(*gc_it); + if (env->DeleteDir(tablet_path).ok()) { + dead_tablets.erase(*gc_it); + } else { + LOG(ERROR) << "[gc] rm dir fail: " << tablet_path; + // do nothing, try to delete next time + // TODO: iff retry times > MAX ? + // TODO: if failed due to timeout but delete ok in DFS, it will always retry + } } else { // clear live_files_ in dead_tablets for next round of gc for (lg_it = lg_files.begin(); lg_it != lg_files.end(); ++lg_it) { - VLOG(12) << "[gc] clear live_files_ " << *gc_it << "/" << lg_it->first; + VLOG(12) << "[gc] clear live_files_(lg_no/file_no): " << *gc_it << "/" << lg_it->first; lg_it->second.live_files_.clear(); } - VLOG(12) << "[gc] update dead_time_ "; dead_tablets[*gc_it].dead_time_ = get_micros() / 1000000; + VLOG(12) << "[gc] update dead_time_ " << dead_tablets[*gc_it].dead_time_ << " " << common::timer::get_time_str(dead_tablets[*gc_it].dead_time_); } gc_it++; } } -void IncrementalGcStrategy::CollectSingleDeadTablet(const std::string& tablename, uint64_t tabletnum) { +bool IncrementalGcStrategy::CollectSingleDeadTablet(const std::string& tablename, uint64_t tabletnum) { std::string tablepath = FLAGS_tera_tabletnode_path_prefix + tablename; std::string tablet_path = leveldb::GetTabletPathFromNum(tablepath, tabletnum); leveldb::Env* env = io::LeveldbBaseEnv(); std::vector children; - env->GetChildren(tablet_path, &children); + leveldb::Status s = env->GetChildren(tablet_path, &children); + if (!s.ok()) { + LOG(ERROR) << "[gc] list directory fail: " << tablet_path; + return false; + } list_count_.Inc(); for (size_t lg = 0; lg < children.size(); ++lg) { @@ -508,7 +542,7 @@ void IncrementalGcStrategy::CollectSingleDeadTablet(const std::string& tablename env->GetChildren(lg_path, &files); list_count_.Inc(); - int64_t lg_no = boost::lexical_cast(children[lg]); + int64_t lg_no = std::stoll(children[lg]); std::map& tablet_files = dead_tablet_files_[tablename][tabletnum].files_; LgFileSet lg_file_set; tablet_files.insert(std::make_pair(lg_no, lg_file_set)); @@ -519,8 +553,7 @@ void IncrementalGcStrategy::CollectSingleDeadTablet(const std::string& tablename number = 0; if (!ParseFileName(files[f], &number, &type) || type != leveldb::kTableFile) { - // only keep sst, delete rest files - io::DeleteEnvDir(file_path); + // skip manifest/CURRENT continue; } @@ -528,6 +561,7 @@ void IncrementalGcStrategy::CollectSingleDeadTablet(const std::string& tablename temp_lg_files_set.storage_files_.insert(full_number); } } + return true; } void IncrementalGcStrategy::DEBUG_print_files(bool print_dead) { @@ -557,7 +591,7 @@ void IncrementalGcStrategy::DEBUG_print_files(bool print_dead) { for (std::set::iterator it = f.begin(); it != f.end(); ++it) { uint64_t file_no; leveldb::ParseFullFileNumber(*it, NULL, &file_no); - debug_str += " " + boost::lexical_cast(file_no); + debug_str += " " + std::to_string(file_no); } LOG(INFO) << "[gc] lg stor -- " << lg_it->first << "-" << (lg_it->second).storage_files_.size() << debug_str; f = (lg_it->second).live_files_; @@ -565,7 +599,7 @@ void IncrementalGcStrategy::DEBUG_print_files(bool print_dead) { for (std::set::iterator it = f.begin(); it != f.end(); ++it) { uint64_t file_no; leveldb::ParseFullFileNumber(*it, NULL, &file_no); - debug_str += " " + boost::lexical_cast(file_no); + debug_str += " " + std::to_string(file_no); } LOG(INFO) << "[gc] lg live -- " << lg_it->first << "-" << (lg_it->second).live_files_.size() << debug_str; } diff --git a/src/master/gc_strategy.h b/src/master/gc_strategy.h index 8c0fe5267..cccbd91b0 100644 --- a/src/master/gc_strategy.h +++ b/src/master/gc_strategy.h @@ -35,7 +35,7 @@ class GcStrategy { class BatchGcStrategy : public GcStrategy { public: - BatchGcStrategy (boost::shared_ptr tablet_manager); + BatchGcStrategy (std::shared_ptr tablet_manager); virtual ~BatchGcStrategy() {} // get file system image before query @@ -51,10 +51,10 @@ class BatchGcStrategy : public GcStrategy { private: void CollectDeadTabletsFiles(); - void CollectSingleDeadTablet(const std::string& tablename, uint64_t tabletnum); + bool CollectSingleDeadTablet(const std::string& tablename, uint64_t tabletnum); void DeleteObsoleteFiles(); - boost::shared_ptr tablet_manager_; + std::shared_ptr tablet_manager_; // tabletnode garbage clean // first: live tablet, second: dead tablet @@ -70,7 +70,7 @@ class BatchGcStrategy : public GcStrategy { class IncrementalGcStrategy : public GcStrategy{ public: - IncrementalGcStrategy(boost::shared_ptr tablet_manager); + IncrementalGcStrategy(std::shared_ptr tablet_manager); virtual ~IncrementalGcStrategy() {} // get dead tablets @@ -87,7 +87,7 @@ class IncrementalGcStrategy : public GcStrategy{ private: void DEBUG_print_files(bool print_dead); - void CollectSingleDeadTablet(const std::string& tablename, uint64_t tabletnum); + bool CollectSingleDeadTablet(const std::string& tablename, uint64_t tabletnum); void DeleteTableFiles(const std::string& table_name); struct LgFileSet { @@ -112,7 +112,7 @@ class IncrementalGcStrategy : public GcStrategy{ typedef std::map TabletFiles; // tablet_number -> files typedef std::map TableFiles; // table_name -> files mutable Mutex gc_mutex_; - boost::shared_ptr tablet_manager_; + std::shared_ptr tablet_manager_; int64_t last_gc_time_; TableFiles dead_tablet_files_; TableFiles live_tablet_files_; diff --git a/src/master/master_entry.cc b/src/master/master_entry.cc index 872b9876e..1e958c028 100644 --- a/src/master/master_entry.cc +++ b/src/master/master_entry.cc @@ -16,6 +16,14 @@ DECLARE_string(tera_master_port); DECLARE_int32(tera_master_rpc_server_max_inflow); DECLARE_int32(tera_master_rpc_server_max_outflow); +std::string GetTeraEntryName() { + return "master"; +} + +tera::TeraEntry* GetTeraEntry() { + return new tera::master::MasterEntry(); +} + namespace tera { namespace master { diff --git a/src/master/master_impl.cc b/src/master/master_impl.cc index 4627b533c..4d606dfac 100644 --- a/src/master/master_impl.cc +++ b/src/master/master_impl.cc @@ -6,8 +6,8 @@ #include "tabletnode/tablet_manager.h" #include -#include #include +#include #include #include @@ -106,6 +106,8 @@ DECLARE_int32(tera_master_schema_update_retry_times); DECLARE_int64(tera_master_availability_check_period); DECLARE_bool(tera_master_availability_check_enabled); +using namespace std::placeholders; + namespace tera { namespace master { @@ -147,10 +149,10 @@ MasterImpl::MasterImpl() if (FLAGS_tera_master_gc_strategy == "default") { LOG(INFO) << "[gc] gc strategy is BatchGcStrategy"; - gc_strategy_ = boost::shared_ptr(new BatchGcStrategy(tablet_manager_)); + gc_strategy_ = std::shared_ptr(new BatchGcStrategy(tablet_manager_)); } else if (FLAGS_tera_master_gc_strategy == "incremental") { LOG(INFO) << "[gc] gc strategy is IncrementalGcStrategy"; - gc_strategy_ = boost::shared_ptr(new IncrementalGcStrategy(tablet_manager_)); + gc_strategy_ = std::shared_ptr(new IncrementalGcStrategy(tablet_manager_)); } else { LOG(ERROR) << "Unknown gc strategy"; } @@ -174,7 +176,7 @@ bool MasterImpl::Init() { LOG(INFO) << "[acl] " << (FLAGS_tera_acl_enabled ? "enabled" : "disabled"); SetMasterStatus(kIsSecondary); - thread_pool_->AddTask(boost::bind(&MasterImpl::InitAsync, this)); + thread_pool_->AddTask(std::bind(&MasterImpl::InitAsync, this)); return true; } @@ -539,7 +541,8 @@ bool MasterImpl::LoadMetaTable(const std::string& meta_tablet_addr, lg->set_store_type(MemoryStore); tablet_manager_->AddTablet(FLAGS_tera_master_meta_table_name, "", "", FLAGS_tera_master_meta_table_path, meta_tablet_addr, - schema, kTableReady, 0, &meta_tablet); + schema, kTableNotInit, 0, &meta_tablet); + meta_tablet->SetStatus(kTableReady); return true; } uint32_t record_size = response.results().key_values_size(); @@ -891,7 +894,7 @@ void MasterImpl::DisableTable(const DisableTableRequest* request, WriteClosure* closure = NewClosure(this, &MasterImpl::UpdateTableRecordForDisableCallback, table, FLAGS_tera_master_meta_retry_times, response, done); - BatchWriteMetaTableAsync(boost::bind(&Table::ToMetaTableKeyValue, table, _1, _2), + BatchWriteMetaTableAsync(std::bind(&Table::ToMetaTableKeyValue, table, _1, _2), false, closure); } @@ -933,7 +936,7 @@ void MasterImpl::EnableTable(const EnableTableRequest* request, WriteClosure* closure = NewClosure(this, &MasterImpl::UpdateTableRecordForEnableCallback, table, FLAGS_tera_master_meta_retry_times, response, done); - BatchWriteMetaTableAsync(boost::bind(&Table::ToMetaTableKeyValue, table, _1, _2), + BatchWriteMetaTableAsync(std::bind(&Table::ToMetaTableKeyValue, table, _1, _2), false, closure); } @@ -1008,7 +1011,7 @@ void MasterImpl::UpdateTable(const UpdateTableRequest* request, WriteClosure* closure = NewClosure(this, &MasterImpl::UpdateTableRecordForUpdateCallback, table, FLAGS_tera_master_meta_retry_times, response, done); - BatchWriteMetaTableAsync(boost::bind(&Table::ToMetaTableKeyValue, table, _1, _2), + BatchWriteMetaTableAsync(std::bind(&Table::ToMetaTableKeyValue, table, _1, _2), false, closure); return; } @@ -1282,7 +1285,7 @@ void MasterImpl::AddUserInfoToMetaCallback(UserPtr user_ptr, WriteClosure* done = NewClosure(this, &MasterImpl::AddUserInfoToMetaCallback, user_ptr, retry_times - 1, rpc_request, rpc_response, rpc_done); - SuspendMetaOperation(boost::bind(&User::ToMetaTableKeyValue, user_ptr, _1, _2), + SuspendMetaOperation(std::bind(&User::ToMetaTableKeyValue, user_ptr, _1, _2), rpc_request->op_type() == kDeleteUser, done); } return; @@ -1400,7 +1403,7 @@ void MasterImpl::OperateUser(const OperateUserRequest* request, WriteClosure* closure = NewClosure(this, &MasterImpl::AddUserInfoToMetaCallback, user_ptr, FLAGS_tera_master_meta_retry_times, request, response, done); - BatchWriteMetaTableAsync(boost::bind(&User::ToMetaTableKeyValue, user_ptr, _1, _2), + BatchWriteMetaTableAsync(std::bind(&User::ToMetaTableKeyValue, user_ptr, _1, _2), is_delete, closure); } @@ -1678,7 +1681,7 @@ void MasterImpl::ScheduleQueryTabletNode() { LOG(INFO) << "schedule query tabletnodes after " << schedule_delay << "ms."; - ThreadPool::Task task = boost::bind(&MasterImpl::QueryTabletNode, this); + ThreadPool::Task task = std::bind(&MasterImpl::QueryTabletNode, this); query_tabletnode_timer_id_ = thread_pool_->DelayTask(schedule_delay, task); } @@ -1714,7 +1717,7 @@ void MasterImpl::ScheduleLoadBalance() { } ThreadPool::Task task = - boost::bind(&MasterImpl::LoadBalance, this); + std::bind(static_cast(&MasterImpl::LoadBalance), this); thread_pool_->AddTask(task); } @@ -1960,7 +1963,7 @@ void MasterImpl::ReleaseCacheWrapper() { void MasterImpl::EnableReleaseCacheTimer() { assert(release_cache_timer_id_ == kInvalidTimerId); ThreadPool::Task task = - boost::bind(&MasterImpl::ReleaseCacheWrapper, this); + std::bind(&MasterImpl::ReleaseCacheWrapper, this); int64_t timeout_period = 1000LL * FLAGS_tera_master_cache_release_period; release_cache_timer_id_ = thread_pool_->DelayTask( @@ -2052,96 +2055,38 @@ void MasterImpl::AddTabletNode(const std::string& tabletnode_addr, } } - tabletnode_manager_->AddTabletNode(tabletnode_addr, tabletnode_uuid); - QueryClosure* done = - NewClosure(this, &MasterImpl::TabletNodeRecoveryCallback, tabletnode_addr); - QueryTabletNodeAsync(tabletnode_addr, - FLAGS_tera_master_collect_info_timeout, false, done); -} - -void MasterImpl::TabletNodeRecoveryCallback(std::string addr, - QueryRequest* request, - QueryResponse* response, - bool failed, int error_code) { - TabletNodePtr node; - if (!tabletnode_manager_->FindTabletNode(addr, &node)) { - LOG(WARNING) << "fail to query: server down, id: " - << request->sequence_id() << ", server: " << addr; - delete request; - delete response; - return; - } - - if (failed || response->status() != kTabletNodeOk) { - if (failed) { - LOG(WARNING) << "fail to query: " - << sofa::pbrpc::RpcErrorCodeToString(error_code) - << ", id: " << request->sequence_id() << ", server: " << addr; - } else { - LOG(WARNING) << "fail to query: " - << StatusCodeToString(response->status()) - << ", id: " << request->sequence_id() << ", server: " << addr; - } - int32_t fail_count = node->IncQueryFailCount(); - if (fail_count >= FLAGS_tera_master_collect_info_retry_times) { - LOG(ERROR) << kSms << "fail to query " << addr - << " for " << fail_count << " times"; - TryKickTabletNode(addr); - } else { - ThreadPool::Task task = - boost::bind(&MasterImpl::RetryQueryNewTabletNode, this, addr); - thread_pool_->DelayTask(FLAGS_tera_master_collect_info_retry_period, task); - } - delete request; - delete response; - return; - } - node->ResetQueryFailCount(); - - // New tabletnode should not have any tablet. - uint32_t meta_num = response->tabletmeta_list().meta_size(); - if (meta_num > 0) { - LOG(WARNING) << "new tabletnode " << addr << " has " << meta_num << " tablets"; - TryKickTabletNode(node->GetAddr()); - delete request; - delete response; - return; - } + TabletNodePtr node = tabletnode_manager_->AddTabletNode(tabletnode_addr, tabletnode_uuid); // update tabletnode info timeval update_time; gettimeofday(&update_time, NULL); TabletNode state; - state.addr_ = addr; - state.report_status_ = response->tabletnode_info().status_t(); - state.info_ = response->tabletnode_info(); - state.info_.set_addr(addr); - state.load_ = response->tabletnode_info().load(); + state.addr_ = tabletnode_addr; + state.report_status_ = kTabletNodeReady; + state.info_.set_addr(tabletnode_addr); state.data_size_ = 0; state.qps_ = 0; state.update_time_ = update_time.tv_sec * 1000 + update_time.tv_usec / 1000; - tabletnode_manager_->UpdateTabletNode(addr, state); + tabletnode_manager_->UpdateTabletNode(tabletnode_addr, state); NodeState old_state; node->SetState(kReady, &old_state); - delete request; - delete response; // If all tabletnodes restart in one zk callback, // master will not enter restore/wait state; // meta table must be scheduled to load from here. TabletPtr meta_tablet; if (tablet_manager_->FindTablet(FLAGS_tera_master_meta_table_name, "", - &meta_tablet) + &meta_tablet) && meta_tablet->GetStatus() == kTableOffLine) { - LOG(INFO) << "try load meta tablet on new ts: " << addr; + LOG(INFO) << "try load meta tablet on new ts: " << tabletnode_addr; TryLoadTablet(meta_tablet); } // load offline tablets std::vector tablet_list; - tablet_manager_->FindTablet(addr, + tablet_manager_->FindTablet(tabletnode_addr, &tablet_list, true); // need disabled table/tablets std::vector::iterator it = tablet_list.begin(); @@ -2152,19 +2097,13 @@ void MasterImpl::TabletNodeRecoveryCallback(std::string addr, } if (tablet->GetStatus() == kTableOffLine) { LOG(INFO) << "try load, " << tablet; - TryLoadTablet(tablet, addr); + TryLoadTablet(tablet, tabletnode_addr); } } TryLeaveSafeMode(); } -void MasterImpl::RetryQueryNewTabletNode(std::string addr) { - QueryClosure* done = - NewClosure(this, &MasterImpl::TabletNodeRecoveryCallback, addr); - QueryTabletNodeAsync(addr, FLAGS_tera_master_collect_info_timeout, false, done); -} - void MasterImpl::DeleteTabletNode(const std::string& tabletnode_addr) { tabletnode_manager_->DelTabletNode(tabletnode_addr); // possible status: running, readonly, wait. @@ -2219,7 +2158,7 @@ void MasterImpl::DeleteTabletNode(const std::string& tabletnode_addr) { << "(ms) later"; MutexLock lock(&tabletnode_timer_mutex_); ThreadPool::Task task = - boost::bind(&MasterImpl::TryMovePendingTablets, this, tabletnode_addr); + std::bind(&MasterImpl::TryMovePendingTablets, this, tabletnode_addr); int64_t timer_id = thread_pool_->DelayTask( FLAGS_tera_master_tabletnode_timeout, task); tabletnode_timer_id_map_[tabletnode_addr] = timer_id; @@ -2566,7 +2505,6 @@ void MasterImpl::LoadTabletCallback(TabletPtr tablet, int32_t retry, // success if (!failed && (status == kTabletNodeOk || status == kTabletReady)) { LOG(INFO) << "load tablet success, " << tablet; - tablet->SetLoadTime(get_micros()); tablet->SetStatusIf(kTableReady, kTableOnLoad); tablet_availability_->EraseNotReadyTablet(tablet->GetPath()); if (tablet->GetTableName() == FLAGS_tera_master_meta_table_name) { @@ -2586,7 +2524,7 @@ void MasterImpl::LoadTabletCallback(TabletPtr tablet, int32_t retry, WriteClosure* done = NewClosure(this, &MasterImpl::UpdateMetaForLoadCallback, next_tablet, FLAGS_tera_master_meta_retry_times); - BatchWriteMetaTableAsync(boost::bind(&Tablet::ToMetaTableKeyValue, next_tablet, _1, _2), + BatchWriteMetaTableAsync(std::bind(&Tablet::ToMetaTableKeyValue, next_tablet, _1, _2), false, done); break; } @@ -2629,7 +2567,7 @@ void MasterImpl::LoadTabletCallback(TabletPtr tablet, int32_t retry, // retry ThreadPool::Task task = - boost::bind(&MasterImpl::RetryLoadTablet, this, tablet, retry + 1); + std::bind(&MasterImpl::RetryLoadTablet, this, tablet, retry + 1); thread_pool_->DelayTask( FLAGS_tera_master_control_tabletnode_retry_period, task); } @@ -2727,7 +2665,6 @@ void MasterImpl::UnloadTabletCallback(TabletPtr tablet, int32_t retry, // success if (!failed && (status == kTabletNodeOk || status == kKeyNotInRange)) { LOG(INFO) << "unload tablet success, " << tablet; - tablet_availability_->AddNotReadyTablet(tablet->GetPath()); if (tablet->GetMergeParam() != NULL) { CHECK(tablet->GetStatus() == kTableUnLoading); MergeTabletUnloadCallback(tablet); @@ -2749,7 +2686,7 @@ void MasterImpl::UnloadTabletCallback(TabletPtr tablet, int32_t retry, WriteClosure* done = NewClosure(this, &MasterImpl::UpdateMetaForLoadCallback, next_tablet, FLAGS_tera_master_meta_retry_times); - BatchWriteMetaTableAsync(boost::bind(&Tablet::ToMetaTableKeyValue, next_tablet, _1, _2), + BatchWriteMetaTableAsync(std::bind(&Tablet::ToMetaTableKeyValue, next_tablet, _1, _2), false, done); break; } @@ -2803,7 +2740,7 @@ void MasterImpl::UnloadTabletCallback(TabletPtr tablet, int32_t retry, // retry ThreadPool::Task task = - boost::bind(&MasterImpl::RetryUnloadTablet, this, tablet, retry - 1); + std::bind(&MasterImpl::RetryUnloadTablet, this, tablet, retry - 1); thread_pool_->DelayTask( FLAGS_tera_master_control_tabletnode_retry_period, task); } @@ -3350,7 +3287,7 @@ void MasterImpl::UpdateSchemaCallback(std::string table_name, NewClosure(this, &MasterImpl::UpdateSchemaCallback, tablet->GetTableName(), tablet->GetPath(), tablet->GetKeyStart(), tablet->GetKeyEnd(), retry_times + 1); ThreadPool::Task task = - boost::bind(&MasterImpl::NoticeTabletNodeSchemaUpdatedAsync, this, tablet, done); + std::bind(&MasterImpl::NoticeTabletNodeSchemaUpdatedAsync, this, tablet, done); thread_pool_->DelayTask(FLAGS_tera_master_schema_update_retry_period * 1000, task); } return; @@ -3426,7 +3363,7 @@ void MasterImpl::QueryTabletNodeAsync(std::string addr, int32_t timeout, void MasterImpl::QueryTabletNodeCallback(std::string addr, QueryRequest* request, QueryResponse* response, bool failed, int error_code) { - int64_t start_query = get_micros(); + int64_t query_callback_start = get_micros(); TabletNodePtr node; if (!tabletnode_manager_->FindTabletNode(addr, &node)) { LOG(WARNING) << "fail to query: server down, id: " @@ -3458,70 +3395,84 @@ void MasterImpl::QueryTabletNodeCallback(std::string addr, QueryRequest* request const std::string& key_start = meta.key_range().key_start(); const std::string& key_end = meta.key_range().key_end(); - tabletnode::TabletRange range(table_name, key_start, key_end); - std::map::iterator it = tablet_map.find(range); - if (it != tablet_map.end()) { - LOG(WARNING) << "query found ts has more than one table_name+startkey item: " - << table_name << ", " << DebugString(key_start) << ", " << DebugString(key_end); - } else { - tablet_map[range] = 1; + std::vector tablets; + if (!tablet_manager_->FindOverlappedTablets(table_name, key_start, key_end, &tablets)) { + LOG(WARNING) << "[query] table not exist, tablet: " << meta.path() + << " [" << DebugString(key_start) + << ", " << DebugString(key_end) + << "] @ " << meta.server_addr() + << " status: " << meta.status(); + continue; } - TabletPtr tablet; - if (meta.status() != kTableReady) { - VLOG(30) << "non-ready tablet: " << meta.table_name() - << ", path: " << meta.path() - << ", range: [" << DebugString(key_start) - << ", " << DebugString(key_end) - << "], size: " << meta.size() - << ", addr: " << meta.server_addr() - << ", status: " << meta.status(); - } else if (tablet_manager_->FindTablet(table_name, key_start, &tablet)) { - int64_t pre_time = tablet->SetUpdateTime(start_query); - int64_t load_time = tablet->LoadTime(); - if (load_time < start_query_time_ && pre_time > start_query_time_) { - LOG(ERROR) << "caution: one tablet multi-loaded, path " - << tablet->GetPath() << ", addr " << meta.server_addr() - << " vs " << tablet->GetServerAddr() - << ", start_query " << start_query_time_ - << ", pre_update " << pre_time - << ", cur_time " << start_query; - } - if (tablet->Verify(table_name, key_start, key_end, meta.path(), - meta.server_addr())) { - if (tablet->GetTable()->GetStatus() == kTableDisable) { - if (tablet->SetStatusIf(kTableUnLoading, kTableReady)) { - UnloadClosure* done = - NewClosure(this, &MasterImpl::UnloadTabletCallback, tablet, - FLAGS_tera_master_impl_retry_times); - UnloadTabletAsync(tablet, done); - LOG(INFO) << "Unload disable tablet: " << tablet->GetPath(); - } else { - LOG(INFO) << "Discard disable tablet: " << tablet->GetPath() - << ", status: " << tablet->GetStatus(); - } - } else { - tablet->UpdateSize(meta); - tablet->SetCounter(counter); - tablet->SetCompactStatus(meta.compact_status()); - ClearUnusedSnapshots(tablet, meta); + if (tablets.size() > 1) { + bool any_tablet_load_before_query = false; + for (uint32_t j = 0; j < tablets.size(); ++j) { + if (tablets[j]->ReadyTime() < start_query_time_) { + any_tablet_load_before_query = true; + break; } + } + if (any_tablet_load_before_query) { + LOG(ERROR) << "[query] range error tablet: " << meta.path() + << " [" << DebugString(key_start) + << ", " << DebugString(key_end) + << "] @ " << meta.server_addr() + << " status: " << meta.status(); } else { - VLOG(10) << "fail to verify tablet: " << meta.table_name() - << ", path: " << meta.path() - << ", range: [" << DebugString(key_start) + VLOG(20) << "[query] ignore mutable tablet: " << meta.path() + << " [" << DebugString(key_start) << ", " << DebugString(key_end) - << "], size: " << meta.size() - << ", addr: " << meta.server_addr(); + << "] @ " << meta.server_addr() + << " status: " << meta.status(); } - VLOG(30) << "[query] " << tablet; - } else { - LOG(WARNING) << "fail to find tablet: " << meta.table_name() - << ", path: " << meta.path() - << ", range: [" << DebugString(key_start) + continue; + } + + CHECK_EQ(tablets.size(), 1u); + TabletPtr tablet = tablets[0]; + if (tablet->ReadyTime() >= start_query_time_) { + VLOG(20) << "[query] ignore mutable tablet: " << meta.path() + << " [" << DebugString(key_start) + << ", " << DebugString(key_end) + << "] @ " << meta.server_addr() + << " status: " << meta.status(); + } else if (tablet->GetKeyStart() != key_start || tablet->GetKeyEnd() != key_end) { + LOG(ERROR) << "[query] range error tablet: " << meta.path() + << " [" << DebugString(key_start) << ", " << DebugString(key_end) - << "], size: " << meta.size() - << ", addr: " << meta.server_addr(); + << "] @ " << meta.server_addr(); + } else if (tablet->GetPath() != meta.path()) { + LOG(ERROR) << "[query] path error tablet: " << meta.path() + << "] @ " << meta.server_addr() + << " should be " << tablet->GetPath(); + } else if (kTableReady != meta.status()) { + LOG(ERROR) << "[query] status error tablet: " << meta.path() + << "] @ " << meta.server_addr() + << " should be kTabletReady"; + } else if (tablet->GetServerAddr() != meta.server_addr()) { + LOG(ERROR) << "[query] addr error tablet: " << meta.path() + << " @ " << meta.server_addr() + << " should @ " << tablet->GetServerAddr(); + } else if (tablet->GetTable()->GetStatus() == kTableDisable) { + if (tablet->SetStatusIf(kTableUnLoading, kTableReady)) { + UnloadClosure* done = + NewClosure(this, &MasterImpl::UnloadTabletCallback, tablet, + FLAGS_tera_master_impl_retry_times); + UnloadTabletAsync(tablet, done); + LOG(INFO) << "Unload disable tablet: " << tablet->GetPath(); + } else { + LOG(INFO) << "Discard disable tablet: " << tablet->GetPath() + << ", status: " << tablet->GetStatus(); + } + } else { + VLOG(20) << "[query] OK tablet: " << meta.path() + << "] @ " << meta.server_addr(); + tablet->SetUpdateTime(query_callback_start); + tablet->UpdateSize(meta); + tablet->SetCounter(counter); + tablet->SetCompactStatus(meta.compact_status()); + ClearUnusedSnapshots(tablet, meta); } } @@ -3546,12 +3497,12 @@ void MasterImpl::QueryTabletNodeCallback(std::string addr, QueryRequest* request std::vector::iterator it; for (it = tablet_list.begin(); it != tablet_list.end(); ++it) { TabletPtr tablet = *it; - tabletnode::TabletRange range(tablet->GetTableName(), tablet->GetKeyStart(), - tablet->GetKeyEnd()); - if (tablet_map.find(range) == tablet_map.end()) { - LOG(INFO) << "master load tablet, but ts not: addr " << addr - << ", " << tablet; - continue; + if (tablet->UpdateTime() != query_callback_start) { + if (tablet->ReadyTime() < start_query_time_) { + LOG(ERROR) << "[query] missed tablet: " << tablet; + } else { + VLOG(20) << "[query] ignore mutable missed tablet: " << tablet; + } } TabletStatus tablet_status = tablet->GetStatus(); @@ -3606,7 +3557,7 @@ void MasterImpl::QueryTabletNodeCallback(std::string addr, QueryRequest* request delete response; VLOG(20) << "query tabletnode finish " << addr << ", id " << query_tabletnode_timer_id_ - << ", callback cost " << (get_micros() - start_query) / 1000 << "ms."; + << ", callback cost " << (get_micros() - query_callback_start) / 1000 << "ms."; } void MasterImpl::CollectTabletInfoCallback(std::string addr, @@ -3673,8 +3624,8 @@ void MasterImpl::CollectTabletInfoCallback(std::string addr, TryKickTabletNode(addr); } else { ThreadPool::Task task = - boost::bind(&MasterImpl::RetryCollectTabletInfo, this, addr, - tablet_list, finish_counter, mutex); + std::bind(&MasterImpl::RetryCollectTabletInfo, this, addr, + tablet_list, finish_counter, mutex); thread_pool_->DelayTask(FLAGS_tera_master_collect_info_retry_period, task); delete request; @@ -3723,6 +3674,7 @@ void MasterImpl::SplitTabletAsync(TabletPtr tablet) { LOG(INFO) << "SplitTabletAsync id: " << request->sequence_id() << ", " << tablet; + tablet_availability_->AddNotReadyTablet(tablet->GetPath()); node_client.SplitTablet(request, response, done); } @@ -3795,6 +3747,7 @@ void MasterImpl::SplitTabletCallback(TabletPtr tablet, LOG(ERROR) << "ts refused to split tablet: " << StatusCodeToString(status) << ", " << tablet << ", tablet status " << StatusCodeToString(tablet->GetStatus()); + tablet_availability_->EraseNotReadyTablet(tablet->GetPath()); return; } @@ -3840,7 +3793,7 @@ void MasterImpl::TryLoadTablet(TabletPtr tablet, std::string server_addr) { LOG(INFO) << "load tablet " << tablet << " on " << server_addr << " " << FLAGS_tera_master_tabletnode_timeout << "(ms) later"; ThreadPool::Task task = - boost::bind(&MasterImpl::TryMovePendingTablet, this, tablet); + std::bind(&MasterImpl::TryMovePendingTablet, this, tablet); thread_pool_->DelayTask(FLAGS_tera_master_tabletnode_timeout, task); return; } else if (GetMasterStatus() == kIsRunning) { @@ -3916,7 +3869,7 @@ void MasterImpl::TryLoadTablet(TabletPtr tablet, std::string server_addr) { WriteClosure* done = NewClosure(this, &MasterImpl::UpdateMetaForLoadCallback, next_tablet, FLAGS_tera_master_meta_retry_times); - BatchWriteMetaTableAsync(boost::bind(&Tablet::ToMetaTableKeyValue, next_tablet, _1, _2), + BatchWriteMetaTableAsync(std::bind(&Tablet::ToMetaTableKeyValue, next_tablet, _1, _2), false, done); break; } @@ -3930,7 +3883,7 @@ void MasterImpl::TryLoadTablet(TabletPtr tablet, std::string server_addr) { WriteClosure* done = NewClosure(this, &MasterImpl::UpdateMetaForLoadCallback, tablet, FLAGS_tera_master_meta_retry_times); - BatchWriteMetaTableAsync(boost::bind(&Tablet::ToMetaTableKeyValue, tablet, _1, _2), + BatchWriteMetaTableAsync(std::bind(&Tablet::ToMetaTableKeyValue, tablet, _1, _2), false, done); return; } @@ -4033,11 +3986,18 @@ bool MasterImpl::TryMergeTablet(TabletPtr tablet) { } TabletPtr tablet2; - if (!tablet_manager_->PickMergeTablet(tablet, &tablet2) || - tablet2->GetStatus() != kTableReady || + if (!tablet_manager_->PickMergeTablet(tablet, &tablet2)) { + VLOG(20) << "[merge] merge failed, no proper tablet for " << tablet; + return false; + } + + if (tablet2->GetStatus() != kTableReady || tablet2->IsBusy() || tablet2->GetCounter().write_workload() >= 1) { - VLOG(20) << "[merge] merge failed, none proper tablet"; + VLOG(20) << "[merge] merge failed, none proper tablet." + << " status:" << tablet2->GetStatus() + << " isbusy:" << tablet2->IsBusy() + << " write workload:" << tablet2->GetCounter().write_workload(); return false; } @@ -4048,26 +4008,41 @@ bool MasterImpl::TryMergeTablet(TabletPtr tablet) { } void MasterImpl::MergeTabletAsync(TabletPtr tablet_p1, TabletPtr tablet_p2) { - if (tablet_p1->SetStatusIf(kTableUnLoading, kTableReady) && - tablet_p2->SetStatusIf(kTableUnLoading, kTableReady)) { - MutexPtr mu(new Mutex()); - MergeParam* param1 = new MergeParam(mu, tablet_p2); - MergeParam* param2 = new MergeParam(mu, tablet_p1); - tablet_p1->SetMergeParam(param1); - tablet_p2->SetMergeParam(param2); - UnloadClosure* done1 = - NewClosure(this, &MasterImpl::UnloadTabletCallback, tablet_p1, - FLAGS_tera_master_impl_retry_times); - UnloadClosure* done2 = - NewClosure(this, &MasterImpl::UnloadTabletCallback, tablet_p2, - FLAGS_tera_master_impl_retry_times); - UnloadTabletAsync(tablet_p1, done1); - UnloadTabletAsync(tablet_p2, done2); - } else { - LOG(WARNING) << "[merge] tablet not ready, merge failed and rollback."; - tablet_p1->SetStatusIf(kTableReady, kTableUnLoading); - tablet_p2->SetStatusIf(kTableReady, kTableUnLoading); + bool switch_ok = false; + + // prepare + switch_ok = tablet_p1->SetStatusIf(kTableUnLoading, kTableReady); + if (!switch_ok) { + // why this tablet is not Ready? maybe someone changes it's state + LOG(WARNING) << "[merge] tablet not ready, merge failed:" << tablet_p1; + return; } + switch_ok = tablet_p2->SetStatusIf(kTableUnLoading, kTableReady); + if (!switch_ok) { + // why this tablet is not Ready? maybe someone changes it's state + LOG(WARNING) << "[merge] tablet not ready, merge failed:" << tablet_p2; + // rollback + CHECK(tablet_p1->SetStatusIf(kTableReady, kTableUnLoading)); + return; + } + + // commit + MutexPtr mu(new Mutex()); + MergeParam* param1 = new MergeParam(mu, tablet_p2); + MergeParam* param2 = new MergeParam(mu, tablet_p1); + tablet_p1->SetMergeParam(param1); + tablet_p2->SetMergeParam(param2); + UnloadClosure* done1 = + NewClosure(this, &MasterImpl::UnloadTabletCallback, tablet_p1, + FLAGS_tera_master_impl_retry_times); + UnloadClosure* done2 = + NewClosure(this, &MasterImpl::UnloadTabletCallback, tablet_p2, + FLAGS_tera_master_impl_retry_times); + + tablet_availability_->AddNotReadyTablet(tablet_p1->GetPath()); + tablet_availability_->AddNotReadyTablet(tablet_p2->GetPath()); + UnloadTabletAsync(tablet_p1, done1); + UnloadTabletAsync(tablet_p2, done2); } void MasterImpl::MergeTabletAsyncPhase2(TabletPtr tablet_p1, TabletPtr tablet_p2) { @@ -4243,6 +4218,7 @@ void MasterImpl::MergeTabletWriteMetaCallback(TabletPtr tablet_c, tablet_manager_->AddTablet(new_meta, TableSchema(), &tablet_c); DeleteTablet(tablet_p1); } + tablet_availability_->AddNotReadyTablet(tablet_c->GetPath()); ProcessOffLineTablet(tablet_c); TryLoadTablet(tablet_c); delete request; @@ -4277,11 +4253,11 @@ void MasterImpl::BatchWriteMetaTableAsync(TablePtr table, std::vector meta_entries; TablePtr null_ptr; if (table != null_ptr) { - meta_entries.push_back(boost::bind(&Table::ToMetaTableKeyValue, table, _1, _2)); + meta_entries.push_back(std::bind(&Table::ToMetaTableKeyValue, table, _1, _2)); } if (tablets.size() != 0) { for (size_t i = 0; i < tablets.size(); ++i) { - meta_entries.push_back(boost::bind(&Tablet::ToMetaTableKeyValue, tablets[i], _1, _2)); + meta_entries.push_back(std::bind(&Tablet::ToMetaTableKeyValue, tablets[i], _1, _2)); } } BatchWriteMetaTableAsync(meta_entries, is_delete, done); @@ -4411,7 +4387,7 @@ void MasterImpl::UpdateTableRecordForDisableCallback(TablePtr table, int32_t ret WriteClosure* done = NewClosure(this, &MasterImpl::UpdateTableRecordForDisableCallback, table, retry_times - 1, rpc_response, rpc_done); - SuspendMetaOperation(boost::bind(&Table::ToMetaTableKeyValue, table, _1, _2), + SuspendMetaOperation(std::bind(&Table::ToMetaTableKeyValue, table, _1, _2), false, done); } return; @@ -4467,7 +4443,7 @@ void MasterImpl::UpdateTableRecordForEnableCallback(TablePtr table, int32_t retr WriteClosure* done = NewClosure(this, &MasterImpl::UpdateTableRecordForEnableCallback, table, retry_times - 1, rpc_response, rpc_done); - SuspendMetaOperation(boost::bind(&Table::ToMetaTableKeyValue, table, _1, _2), + SuspendMetaOperation(std::bind(&Table::ToMetaTableKeyValue, table, _1, _2), false, done); } return; @@ -4522,7 +4498,7 @@ void MasterImpl::UpdateTableRecordForUpdateCallback(TablePtr table, int32_t retr WriteClosure* done = NewClosure(this, &MasterImpl::UpdateTableRecordForUpdateCallback, table, retry_times - 1, rpc_response, rpc_done); - SuspendMetaOperation(boost::bind(&Table::ToMetaTableKeyValue, table, _1, _2), + SuspendMetaOperation(std::bind(&Table::ToMetaTableKeyValue, table, _1, _2), false, done); } return; @@ -4586,7 +4562,7 @@ void MasterImpl::UpdateTableRecordForRenameCallback(TablePtr table, int32_t retr NewClosure(this, &MasterImpl::UpdateTableRecordForRenameCallback, table, retry_times - 1, rpc_response, rpc_done, old_alias, new_alias); - SuspendMetaOperation(boost::bind(&Table::ToMetaTableKeyValue, table, _1, _2), + SuspendMetaOperation(std::bind(&Table::ToMetaTableKeyValue, table, _1, _2), false, done); } return; @@ -4628,7 +4604,7 @@ void MasterImpl::UpdateTabletRecordCallback(TabletPtr tablet, int32_t retry_time WriteClosure* done = NewClosure(this, &MasterImpl::UpdateTabletRecordCallback, tablet, retry_times - 1); - SuspendMetaOperation(boost::bind(&Tablet::ToMetaTableKeyValue, tablet, _1, _2), + SuspendMetaOperation(std::bind(&Tablet::ToMetaTableKeyValue, tablet, _1, _2), false, done); } return; @@ -4677,7 +4653,7 @@ void MasterImpl::UpdateMetaForLoadCallback(TabletPtr tablet, int32_t retry_times WriteClosure* done = NewClosure(this, &MasterImpl::UpdateMetaForLoadCallback, next_tablet, FLAGS_tera_master_meta_retry_times); - BatchWriteMetaTableAsync(boost::bind(&Tablet::ToMetaTableKeyValue, next_tablet, _1, _2), + BatchWriteMetaTableAsync(std::bind(&Tablet::ToMetaTableKeyValue, next_tablet, _1, _2), false, done); break; } @@ -4688,7 +4664,7 @@ void MasterImpl::UpdateMetaForLoadCallback(TabletPtr tablet, int32_t retry_times WriteClosure* done = NewClosure(this, &MasterImpl::UpdateMetaForLoadCallback, tablet, retry_times - 1); - SuspendMetaOperation(boost::bind(&Tablet::ToMetaTableKeyValue, tablet, _1, _2), + SuspendMetaOperation(std::bind(&Tablet::ToMetaTableKeyValue, tablet, _1, _2), false, done); } return; @@ -4882,7 +4858,10 @@ void MasterImpl::ScanMetaCallbackForSplit(TabletPtr tablet, first_meta.set_status(kTableOffLine); tablet_manager_->AddTablet(first_meta, TableSchema(), &first_tablet); - LOG(INFO) << "try load child tablets, \nfirst: " << first_meta.ShortDebugString() + tablet_availability_->AddNotReadyTablet(first_tablet->GetPath()); + tablet_availability_->AddNotReadyTablet(second_tablet->GetPath()); + LOG(INFO) << "split finish, " << tablet << ", try load child tablets, " + << "\nfirst: " << first_meta.ShortDebugString() << "\nsecond: " << second_meta.ShortDebugString(); ProcessOffLineTablet(first_tablet); TryLoadTablet(first_tablet, server_addr); @@ -4983,10 +4962,10 @@ void MasterImpl::SuspendMetaOperation(TablePtr table, const std::vector meta_entries; TablePtr null_ptr; if (table != null_ptr) { - meta_entries.push_back(boost::bind(&Table::ToMetaTableKeyValue, table, _1, _2)); + meta_entries.push_back(std::bind(&Table::ToMetaTableKeyValue, table, _1, _2)); } for (size_t i = 0; i < tablets.size(); ++i) { - meta_entries.push_back(boost::bind(&Tablet::ToMetaTableKeyValue, tablets[i], _1, _2)); + meta_entries.push_back(std::bind(&Tablet::ToMetaTableKeyValue, tablets[i], _1, _2)); } SuspendMetaOperation(meta_entries, is_delete, done); } @@ -5087,6 +5066,7 @@ void MasterImpl::TryMoveTablet(TabletPtr tablet, const std::string& server_addr, tabletnode_manager_->FindTabletNode(server_addr, &node)) { node->PlanToMoveIn(); } + tablet_availability_->AddNotReadyTablet(tablet->GetPath()); UnloadClosure* done = NewClosure(this, &MasterImpl::UnloadTabletCallback, tablet, FLAGS_tera_master_impl_retry_times); @@ -5203,7 +5183,7 @@ void MasterImpl::ScheduleTabletNodeGc() { mutex_.AssertHeld(); LOG(INFO) << "[gc] ScheduleTabletNodeGcTimer"; ThreadPool::Task task = - boost::bind(&MasterImpl::DoTabletNodeGc, this); + std::bind(&MasterImpl::DoTabletNodeGc, this); gc_timer_id_ = thread_pool_->DelayTask( FLAGS_tera_master_gc_period, task); } @@ -5227,7 +5207,7 @@ void MasterImpl::DoAvailableCheck() { void MasterImpl::ScheduleAvailableCheck() { mutex_.AssertHeld(); ThreadPool::Task task = - boost::bind(&MasterImpl::DoAvailableCheck, this); + std::bind(&MasterImpl::DoAvailableCheck, this); thread_pool_->DelayTask( FLAGS_tera_master_availability_check_period * 1000, task); } @@ -5357,7 +5337,7 @@ void MasterImpl::RenameTable(const RenameTableRequest* request, NewClosure(this, &MasterImpl::UpdateTableRecordForRenameCallback, table, FLAGS_tera_master_meta_retry_times, response, done, old_alias, new_alias); - BatchWriteMetaTableAsync(boost::bind(&Table::ToMetaTableKeyValue, table, _1, _2), + BatchWriteMetaTableAsync(std::bind(&Table::ToMetaTableKeyValue, table, _1, _2), false, closure); } @@ -5371,7 +5351,7 @@ void MasterImpl::RefreshTableCounter() { // Set refresh interval as query-interval / 2, because each table counter // changed after query callback reached. - ThreadPool::Task task = boost::bind(&MasterImpl::RefreshTableCounter, this); + ThreadPool::Task task = std::bind(&MasterImpl::RefreshTableCounter, this); thread_pool_->DelayTask(FLAGS_tera_master_query_tabletnode_period / 2, task); LOG(INFO) << "RefreshTableCounter, cost: " << ((get_micros() - start) / 1000) << "ms."; diff --git a/src/master/master_impl.h b/src/master/master_impl.h index b5a96d70c..b142a1b6f 100644 --- a/src/master/master_impl.h +++ b/src/master/master_impl.h @@ -165,8 +165,8 @@ class MasterImpl { typedef Closure SplitClosure; typedef Closure WriteClosure; typedef Closure ScanClosure; - typedef boost::function ToMetaFunc; - typedef boost::shared_ptr MutexPtr; + typedef std::function ToMetaFunc; + typedef std::shared_ptr MutexPtr; enum MetaTaskType { kWrite = 0, @@ -244,8 +244,6 @@ class MasterImpl { const std::string& key_start, const std::string& key_end, const std::string& server_addr, StatusCode* status); - void UnloadTabletAsync(std::string table_name, std::string key_start, - std::string server_addr, int32_t retry); void RetryLoadTablet(TabletPtr tablet, int32_t retry_times); void RetryUnloadTablet(TabletPtr tablet, int32_t retry_times); @@ -272,7 +270,7 @@ class MasterImpl { int error_code); bool RemoveTablet(const TabletMeta& meta, StatusCode* status); - void UnloadTabletAsync(TabletPtr tablet, UnloadClosure* done); + virtual void UnloadTabletAsync(TabletPtr tablet, UnloadClosure* done); void UnloadTabletCallback(TabletPtr tablet, int32_t retry, UnloadTabletRequest* request, UnloadTabletResponse* response, bool failed, @@ -357,13 +355,9 @@ class MasterImpl { QueryRequest* request, QueryResponse* response, bool failed, int error_code); - void TabletNodeRecoveryCallback(std::string addr, QueryRequest* request, - QueryResponse* response, bool failed, - int error_code); void RetryCollectTabletInfo(std::string addr, std::vector* tablet_list, sem_t* finish_counter, Mutex* mutex); - void RetryQueryNewTabletNode(std::string addr); void SplitTabletAsync(TabletPtr tablet); void SplitTabletCallback(TabletPtr tablet, SplitTabletRequest* request, @@ -576,9 +570,9 @@ class MasterImpl { mutable Mutex tabletnode_mutex_; bool restored_; - boost::shared_ptr tablet_manager_; - boost::shared_ptr tabletnode_manager_; - boost::shared_ptr user_manager_; + std::shared_ptr tablet_manager_; + std::shared_ptr tabletnode_manager_; + std::shared_ptr user_manager_; scoped_ptr zk_adapter_; scoped_ptr size_scheduler_; scoped_ptr load_scheduler_; @@ -619,11 +613,11 @@ class MasterImpl { bool gc_enabled_; int64_t gc_timer_id_; bool gc_query_enable_; - boost::shared_ptr gc_strategy_; + std::shared_ptr gc_strategy_; std::map alias_; mutable Mutex alias_mutex_; - boost::shared_ptr tablet_availability_; + std::shared_ptr tablet_availability_; }; } // namespace master diff --git a/src/master/remote_master.cc b/src/master/remote_master.cc index 6f8c21e23..25f496926 100644 --- a/src/master/remote_master.cc +++ b/src/master/remote_master.cc @@ -4,7 +4,7 @@ #include "master/remote_master.h" -#include +#include #include "common/base/closure.h" #include "gflags/gflags.h" @@ -33,8 +33,7 @@ void RemoteMaster::GetSnapshot(google::protobuf::RpcController* controller, google::protobuf::Closure* done) { LOG(INFO) << "accept RPC (GetSnapshot): " << tera::utils::GetRemoteAddress(controller); ThreadPool::Task callback = - boost::bind(&RemoteMaster::DoGetSnapshot, this, controller, - request, response, done); + std::bind(&RemoteMaster::DoGetSnapshot, this, controller, request, response, done); thread_pool_->AddTask(callback); } @@ -44,8 +43,7 @@ void RemoteMaster::DelSnapshot(google::protobuf::RpcController* controller, google::protobuf::Closure* done) { LOG(INFO) << "accept RPC (DelSnapshot): " << tera::utils::GetRemoteAddress(controller); ThreadPool::Task callback = - boost::bind(&RemoteMaster::DoDelSnapshot, this, controller, - request, response, done); + std::bind(&RemoteMaster::DoDelSnapshot, this, controller, request, response, done); thread_pool_->AddTask(callback); } @@ -55,8 +53,7 @@ void RemoteMaster::GetRollback(google::protobuf::RpcController* controller, google::protobuf::Closure* done) { LOG(INFO) << "accept RPC (Rollback): " << tera::utils::GetRemoteAddress(controller); ThreadPool::Task callback = - boost::bind(&RemoteMaster::DoRollback, this, controller, - request, response, done); + std::bind(&RemoteMaster::DoRollback, this, controller, request, response, done); thread_pool_->AddTask(callback); } @@ -66,8 +63,7 @@ void RemoteMaster::CreateTable(google::protobuf::RpcController* controller, google::protobuf::Closure* done) { LOG(INFO) << "accept RPC (CreateTable): " << tera::utils::GetRemoteAddress(controller); ThreadPool::Task callback = - boost::bind(&RemoteMaster::DoCreateTable, this, controller, - request, response, done); + std::bind(&RemoteMaster::DoCreateTable, this, controller, request, response, done); thread_pool_->AddTask(callback); } @@ -77,8 +73,7 @@ void RemoteMaster::DeleteTable(google::protobuf::RpcController* controller, google::protobuf::Closure* done) { LOG(INFO) << "accept RPC (DeleteTable): " << tera::utils::GetRemoteAddress(controller); ThreadPool::Task callback = - boost::bind(&RemoteMaster::DoDeleteTable, this, controller, - request, response, done); + std::bind(&RemoteMaster::DoDeleteTable, this, controller, request, response, done); thread_pool_->AddTask(callback); } @@ -88,8 +83,7 @@ void RemoteMaster::DisableTable(google::protobuf::RpcController* controller, google::protobuf::Closure* done) { LOG(INFO) << "accept RPC (DisableTable): " << tera::utils::GetRemoteAddress(controller); ThreadPool::Task callback = - boost::bind(&RemoteMaster::DoDisableTable, this, controller, - request, response, done); + std::bind(&RemoteMaster::DoDisableTable, this, controller, request, response, done); thread_pool_->AddTask(callback); } @@ -99,8 +93,7 @@ void RemoteMaster::EnableTable(google::protobuf::RpcController* controller, google::protobuf::Closure* done) { LOG(INFO) << "accept RPC (EnableTable): " << tera::utils::GetRemoteAddress(controller); ThreadPool::Task callback = - boost::bind(&RemoteMaster::DoEnableTable, this, controller, - request, response, done); + std::bind(&RemoteMaster::DoEnableTable, this, controller, request, response, done); thread_pool_->AddTask(callback); } @@ -110,8 +103,7 @@ void RemoteMaster::UpdateTable(google::protobuf::RpcController* controller, google::protobuf::Closure* done) { LOG(INFO) << "accept RPC (UpdateTable): " << tera::utils::GetRemoteAddress(controller); ThreadPool::Task callback = - boost::bind(&RemoteMaster::DoUpdateTable, this, controller, - request, response, done); + std::bind(&RemoteMaster::DoUpdateTable, this, controller, request, response, done); thread_pool_->AddTask(callback); } @@ -121,8 +113,7 @@ void RemoteMaster::UpdateCheck(google::protobuf::RpcController* controller, google::protobuf::Closure* done) { LOG(INFO) << "accept RPC (UpdateCheck): " << tera::utils::GetRemoteAddress(controller); ThreadPool::Task callback = - boost::bind(&RemoteMaster::DoUpdateCheck, this, controller, - request, response, done); + std::bind(&RemoteMaster::DoUpdateCheck, this, controller, request, response, done); thread_pool_->AddTask(callback); } @@ -132,8 +123,7 @@ void RemoteMaster::CompactTable(google::protobuf::RpcController* controller, google::protobuf::Closure* done) { LOG(INFO) << "accept RPC (CompactTable): " << tera::utils::GetRemoteAddress(controller); ThreadPool::Task callback = - boost::bind(&RemoteMaster::DoCompactTable, this, controller, - request, response, done); + std::bind(&RemoteMaster::DoCompactTable, this, controller, request, response, done); thread_pool_->AddTask(callback); } @@ -143,8 +133,7 @@ void RemoteMaster::SearchTable(google::protobuf::RpcController* controller, google::protobuf::Closure* done) { LOG(INFO) << "accept RPC (SearchTable): " << tera::utils::GetRemoteAddress(controller); ThreadPool::Task callback = - boost::bind(&RemoteMaster::DoSearchTable, this, controller, - request, response, done); + std::bind(&RemoteMaster::DoSearchTable, this, controller, request, response, done); thread_pool_->AddTask(callback); } @@ -154,8 +143,7 @@ void RemoteMaster::ShowTables(google::protobuf::RpcController* controller, google::protobuf::Closure* done) { LOG(INFO) << "accept RPC (ShowTables): " << tera::utils::GetRemoteAddress(controller); ThreadPool::Task callback = - boost::bind(&RemoteMaster::DoShowTables, this, controller, - request, response, done); + std::bind(&RemoteMaster::DoShowTables, this, controller, request, response, done); thread_pool_->AddTask(callback); } @@ -165,8 +153,7 @@ void RemoteMaster::ShowTabletNodes(google::protobuf::RpcController* controller, google::protobuf::Closure* done) { LOG(INFO) << "accept RPC (ShowTabletNodes): " << tera::utils::GetRemoteAddress(controller); ThreadPool::Task callback = - boost::bind(&RemoteMaster::DoShowTabletNodes, this, controller, - request, response, done); + std::bind(&RemoteMaster::DoShowTabletNodes, this, controller, request, response, done); thread_pool_->AddTask(callback); } @@ -176,8 +163,7 @@ void RemoteMaster::CmdCtrl(google::protobuf::RpcController* controller, google::protobuf::Closure* done) { LOG(INFO) << "accept RPC (CmdCtrl): " << tera::utils::GetRemoteAddress(controller); ThreadPool::Task callback = - boost::bind(&RemoteMaster::DoCmdCtrl, this, controller, - request, response, done); + std::bind(&RemoteMaster::DoCmdCtrl, this, controller, request, response, done); thread_pool_->AddTask(callback); } @@ -187,8 +173,7 @@ void RemoteMaster::OperateUser(google::protobuf::RpcController* controller, google::protobuf::Closure* done) { LOG(INFO) << "accept RPC (OperateUser): " << tera::utils::GetRemoteAddress(controller); ThreadPool::Task callback = - boost::bind(&RemoteMaster::DoOperateUser, this, controller, - request, response, done); + std::bind(&RemoteMaster::DoOperateUser, this, controller, request, response, done); thread_pool_->AddTask(callback); } diff --git a/src/master/tablet_manager.cc b/src/master/tablet_manager.cc index b05691e31..6960eb534 100644 --- a/src/master/tablet_manager.cc +++ b/src/master/tablet_manager.cc @@ -45,12 +45,10 @@ namespace master { std::ostream& operator << (std::ostream& o, const Tablet& tablet) { MutexLock lock(&tablet.mutex_); - o << "table: " << tablet.meta_.table_name() << ", range: [" + o << tablet.meta_.path() << " [" << DebugString(tablet.meta_.key_range().key_start()) << ", " - << DebugString(tablet.meta_.key_range().key_end()) << "], path: " - << tablet.meta_.path() << ", server: " - << tablet.meta_.server_addr() << ", serverid: " - << tablet.server_id_; + << DebugString(tablet.meta_.key_range().key_end()) << "] @ " + << tablet.meta_.server_addr() << "/" << tablet.server_id_; return o; } @@ -62,14 +60,14 @@ std::ostream& operator << (std::ostream& o, const TabletPtr& tablet) { Tablet::Tablet(const TabletMeta& meta) : meta_(meta), update_time_(common::timer::get_micros()), - load_time_(std::numeric_limits::max()), + ready_time_(std::numeric_limits::max()), merge_param_(NULL) {} Tablet::Tablet(const TabletMeta& meta, TablePtr table) : meta_(meta), table_(table), update_time_(common::timer::get_micros()), - load_time_(std::numeric_limits::max()), + ready_time_(std::numeric_limits::max()), merge_param_(NULL) {} Tablet::~Tablet() { @@ -251,6 +249,9 @@ bool Tablet::SetStatus(TabletStatus new_status, TabletStatus* old_status) { } if (CheckStatusSwitch(meta_.status(), new_status)) { meta_.set_status(new_status); + if (new_status == kTableReady) { + ready_time_ = get_micros(); + } return true; } return false; @@ -265,6 +266,9 @@ bool Tablet::SetStatusIf(TabletStatus new_status, TabletStatus if_status, if (meta_.status() == if_status && CheckStatusSwitch(meta_.status(), new_status)) { meta_.set_status(new_status); + if (new_status == kTableReady) { + ready_time_ = get_micros(); + } return true; } return false; @@ -283,6 +287,9 @@ bool Tablet::SetStatusIf(TabletStatus new_status, TabletStatus if_status, if (meta_.status() == if_status && table_->status_ == if_table_status && CheckStatusSwitch(meta_.status(), new_status)) { meta_.set_status(new_status); + if (new_status == kTableReady) { + ready_time_ = get_micros(); + } return true; } return false; @@ -311,6 +318,9 @@ bool Tablet::SetAddrAndStatus(const std::string& server_addr, if (CheckStatusSwitch(meta_.status(), new_status)) { meta_.set_status(new_status); meta_.set_server_addr(server_addr); + if (new_status == kTableReady) { + ready_time_ = get_micros(); + } return true; } return false; @@ -327,6 +337,9 @@ bool Tablet::SetAddrAndStatusIf(const std::string& server_addr, && CheckStatusSwitch(meta_.status(), new_status)) { meta_.set_status(new_status); meta_.set_server_addr(server_addr); + if (new_status == kTableReady) { + ready_time_ = get_micros(); + } return true; } return false; @@ -344,22 +357,15 @@ int64_t Tablet::SetUpdateTime(int64_t timestamp) { return ts; } -int64_t Tablet::LoadTime() { +int64_t Tablet::ReadyTime() { MutexLock lock(&mutex_); if (meta_.status() != kTableReady) { return std::numeric_limits::max(); } else { - return load_time_; + return ready_time_; } } -int64_t Tablet::SetLoadTime(int64_t timestamp) { - MutexLock lock(&mutex_); - int64_t ts = load_time_; - load_time_ = timestamp; - return ts; -} - int32_t Tablet::AddSnapshot(uint64_t snapshot) { MutexLock lock(&mutex_); meta_.add_snapshot_list(snapshot); @@ -779,25 +785,36 @@ uint64_t Table::GetNextTabletNo() { } bool Table::GetTabletsForGc(std::set* live_tablets, - std::set* dead_tablets) { + std::set* dead_tablets, + bool ignore_not_ready) { MutexLock lock(&mutex_); + + std::vector children; + leveldb::Env* env = io::LeveldbBaseEnv(); + std::string table_path = FLAGS_tera_tabletnode_path_prefix + name_; + mutex_.Unlock(); + leveldb::Status s = env->GetChildren(table_path, &children); + mutex_.Lock(); + if (!s.ok()) { + LOG(ERROR) << "[gc] fail to list directory: " << table_path; + return false; + } + std::vector tablet_list; Table::TabletList::iterator it = tablets_list_.begin(); for (; it != tablets_list_.end(); ++it) { TabletPtr tablet = it->second; if (tablet->GetStatus() != kTableReady) { - // any tablet not ready, stop gc - return false; + if (!ignore_not_ready) { + // any tablet not ready, stop gc + return false; + } } const std::string& path = tablet->GetPath(); live_tablets->insert(leveldb::GetTabletNumFromPath(path)); - VLOG(10) << "[gc] add live tablet: " << path; + VLOG(20) << "[gc] add live tablet: " << path; } - std::vector children; - leveldb::Env* env = io::LeveldbBaseEnv(); - std::string table_path = FLAGS_tera_tabletnode_path_prefix + name_; - env->GetChildren(table_path, &children); for (size_t i = 0; i < children.size(); ++i) { if (children[i].size() < 5) { // skip directory . and .. @@ -1162,6 +1179,42 @@ void TabletManager::FindTablet(const std::string& server_addr, mutex_.Unlock(); } +bool TabletManager::FindOverlappedTablets(const std::string& table_name, + const std::string& key_start, + const std::string& key_end, + std::vector* tablets, + StatusCode* ret_status) { + // lock table list + mutex_.Lock(); + + // search table + TableList::iterator it = all_tables_.find(table_name); + if (it == all_tables_.end()) { + mutex_.Unlock(); + VLOG(5) << "table: " << table_name << " not exist"; + SetStatusCode(kTableNotFound, ret_status); + return false; + } + Table& table = *it->second; + + // lock table + table.mutex_.Lock(); + mutex_.Unlock(); + + // search tablet + Table::TabletList::iterator it2 = table.tablets_list_.upper_bound(key_start); + CHECK(it2 != table.tablets_list_.begin()); + --it2; + while (it2 != table.tablets_list_.end() && + (key_end.empty() || it2->second->meta_.key_range().key_start() < key_end)) { + tablets->push_back(it2->second); + ++it2; + } + table.mutex_.Unlock(); + CHECK_GT(tablets->size(), 0u); + return true; +} + bool TabletManager::FindTable(const std::string& table_name, std::vector* tablet_meta_list, StatusCode* ret_status) { @@ -1671,25 +1724,25 @@ bool TabletManager::GetMetaTabletAddr(std::string* addr) { } bool TabletManager::PickMergeTablet(TabletPtr& tablet, TabletPtr* tablet2) { - MutexLock lock(&mutex_); std::string table_name = tablet->GetTableName(); + mutex_.Lock(); // search table TableList::iterator it = all_tables_.find(table_name); if (it == all_tables_.end()) { + mutex_.Unlock(); LOG(ERROR) << "[merge] table: " << table_name << " not exist"; return false; } Table& table = *it->second; + MutexLock table_lock(&table.mutex_); + mutex_.Unlock(); + if (table.tablets_list_.size() < 2) { VLOG(20) << "[merge] table: " << table_name << " only have 1 tablet."; return false; } - // make sure no other thread ref this table - table.mutex_.Lock(); - table.mutex_.Unlock(); - // search tablet Table::TabletList::iterator it2 = table.tablets_list_.find(tablet->GetKeyStart()); if (it2 == table.tablets_list_.end()) { diff --git a/src/master/tablet_manager.h b/src/master/tablet_manager.h index fa5fdc569..058fc5678 100644 --- a/src/master/tablet_manager.h +++ b/src/master/tablet_manager.h @@ -8,13 +8,12 @@ #include #include #include +#include #include #include #include #include -#include - #include "common/base/closure.h" #include "common/mutex.h" #include "common/thread_pool.h" @@ -52,7 +51,7 @@ namespace master { class MasterImpl; class Table; -typedef boost::shared_ptr TablePtr; +typedef std::shared_ptr
TablePtr; class Tablet { friend class TabletManager; @@ -127,8 +126,7 @@ class Tablet { int64_t UpdateTime(); int64_t SetUpdateTime(int64_t timestamp); - int64_t LoadTime(); - int64_t SetLoadTime(int64_t timestamp); + int64_t ReadyTime(); void* GetMergeParam(); void SetMergeParam(void* merge_param); @@ -144,7 +142,7 @@ class Tablet { TabletMeta meta_; TablePtr table_; int64_t update_time_; - int64_t load_time_; + int64_t ready_time_; std::string server_id_; std::string expect_server_addr_; std::list counter_list_; @@ -168,7 +166,7 @@ class Tablet { void* merge_param_; }; -typedef class boost::shared_ptr TabletPtr; +typedef class std::shared_ptr TabletPtr; std::ostream& operator << (std::ostream& o, const TabletPtr& tablet); std::ostream& operator << (std::ostream& o, const TablePtr& table); @@ -202,7 +200,8 @@ class Table { void ToMeta(TableMeta* meta); uint64_t GetNextTabletNo(); bool GetTabletsForGc(std::set* live_tablets, - std::set* dead_tablets); + std::set* dead_tablets, + bool ignore_not_ready); void RefreshCounter(); int64_t GetTabletsCount(); bool GetSchemaIsSyncing(); @@ -287,6 +286,12 @@ class TabletManager { std::vector* tablet_meta_list, bool need_disabled_tables); + bool FindOverlappedTablets(const std::string& table_name, + const std::string& key_start, + const std::string& key_end, + std::vector* tablets, + StatusCode* ret_status = NULL); + bool FindTable(const std::string& table_name, std::vector* tablet_meta_list, StatusCode* ret_status = NULL); diff --git a/src/master/tabletnode_manager.cc b/src/master/tabletnode_manager.cc index be3a239d8..d10bb835d 100644 --- a/src/master/tabletnode_manager.cc +++ b/src/master/tabletnode_manager.cc @@ -22,6 +22,8 @@ TabletNode::TabletNode() : state_(kOffLine), update_time_(0), query_fail_count_(0), onload_count_(0), onsplit_count_(0), plan_move_in_count_(0) { info_.set_addr(""); + info_.set_status_m(NodeStateToString(state_)); + info_.set_timestamp(get_micros()); } TabletNode::TabletNode(const std::string& addr, const std::string& uuid) @@ -30,6 +32,8 @@ TabletNode::TabletNode(const std::string& addr, const std::string& uuid) update_time_(0), query_fail_count_(0), onload_count_(0), onsplit_count_(0), plan_move_in_count_(0) { info_.set_addr(addr); + info_.set_status_m(NodeStateToString(state_)); + info_.set_timestamp(get_micros()); } TabletNode::TabletNode(const TabletNode& t) { @@ -319,19 +323,20 @@ TabletNodeManager::~TabletNodeManager() { MutexLock lock(&mutex_); } -void TabletNodeManager::AddTabletNode(const std::string& addr, - const std::string& uuid) { +TabletNodePtr TabletNodeManager::AddTabletNode(const std::string& addr, + const std::string& uuid) { MutexLock lock(&mutex_); TabletNodePtr null_ptr; std::pair ret = tabletnode_list_.insert( std::pair(addr, null_ptr)); if (!ret.second) { LOG(ERROR) << "tabletnode [" << addr << "] exists"; - return; + return ret.first->second; } TabletNodePtr& state = ret.first->second; state.reset(new TabletNode(addr, uuid)); LOG(INFO) << "add tabletnode : " << addr << ", id : " << uuid; + return state; } void TabletNodeManager::DelTabletNode(const std::string& addr) { @@ -384,7 +389,7 @@ void TabletNodeManager::UpdateTabletNode(const std::string& addr, CounterWeightedSum(state.info_.scan_pending(), node->average_counter_.scan_pending_); node->average_counter_.row_read_delay_ = - CounterWeightedSum(state.info_.extra_info(1).value(), + CounterWeightedSum(state.info_.extra_info_size() > 1 ? state.info_.extra_info(1).value() : 0, node->average_counter_.row_read_delay_); VLOG(15) << "update tabletnode : " << addr; } diff --git a/src/master/tabletnode_manager.h b/src/master/tabletnode_manager.h index a77d32ac5..327d77668 100644 --- a/src/master/tabletnode_manager.h +++ b/src/master/tabletnode_manager.h @@ -7,11 +7,10 @@ #include #include +#include #include #include -#include - #include "common/mutex.h" #include "common/thread_pool.h" @@ -117,7 +116,7 @@ struct TabletNode { TabletNode& operator=(const TabletNode& t); }; -typedef boost::shared_ptr TabletNodePtr; +typedef std::shared_ptr TabletNodePtr; class WorkloadGetter; class Scheduler; @@ -128,7 +127,7 @@ class TabletNodeManager { explicit TabletNodeManager(MasterImpl* master_impl); ~TabletNodeManager(); - void AddTabletNode(const std::string& addr, const std::string& uuid); + TabletNodePtr AddTabletNode(const std::string& addr, const std::string& uuid); void DelTabletNode(const std::string& addr); void UpdateTabletNode(const std::string& addr, const TabletNode& info); bool FindTabletNode(const std::string& addr, TabletNodePtr* info); diff --git a/src/master/test/master_impl_test.cc b/src/master/test/master_impl_test.cc index 6cec6bd12..1c212c20d 100644 --- a/src/master/test/master_impl_test.cc +++ b/src/master/test/master_impl_test.cc @@ -91,6 +91,55 @@ class MasterImplTest : public ::testing::Test, public MasterImpl { UnloadTabletCallback(tablet_p2, retry, request, response, failed, error_code); EXPECT_TRUE(merge_enter_phase2); } + + TabletPtr MakeTabletPtr(const std::string& start, const std::string& end, TablePtr table) { + TabletMeta meta; + meta.mutable_key_range()->set_key_start(start); + meta.mutable_key_range()->set_key_end(end); + TabletPtr tablet(new Tablet(meta, table)); + return tablet; + } + + // This unload function will not send unload request + // Tablet will stay in kTableUnLoading status forever + // It can be used to simulate a slow unload + virtual void UnloadTabletAsync(TabletPtr tablet, UnloadClosure* done) { + LOG(ERROR) << "dummy UnloadTabletAsync..."; + } + + void MergeTabletBorkenTest() { + TablePtr table(new Table("mergetest")); + TabletPtr t1 = MakeTabletPtr("", "a", table); + t1->SetStatus(kTableReady); + + TabletPtr t2 = MakeTabletPtr("a", "z", table); + t2->SetStatus(kTableReady); + + TabletPtr t3 = MakeTabletPtr("z", "", table); + t3->SetStatus(kTableReady); + + LOG(ERROR) << t1->GetStatus() << ";" << t2->GetStatus() << ";" << t3->GetStatus(); + + MergeTabletAsync(t1, t2); + LOG(ERROR) << t1->GetStatus() << ";" << t2->GetStatus() << ";" << t3->GetStatus(); + EXPECT_TRUE((t1->GetStatus() == kTableUnLoading) + && (t2->GetStatus() == kTableUnLoading) + && (t3->GetStatus() == kTableReady)); + + // t2 & t3's merge should fail since t1 & t2 is merging + MergeTabletAsync(t2, t3); + LOG(ERROR) << t1->GetStatus() << ";" << t2->GetStatus() << ";" << t3->GetStatus(); + EXPECT_TRUE((t1->GetStatus() == kTableUnLoading) + && (t2->GetStatus() == kTableUnLoading) + && (t3->GetStatus() == kTableReady)); + + // t3 & t2's merge should fail since t1 & t2 is merging + MergeTabletAsync(t3, t2); + LOG(ERROR) << t1->GetStatus() << ";" << t2->GetStatus() << ";" << t3->GetStatus(); + EXPECT_TRUE((t1->GetStatus() == kTableUnLoading) + && (t2->GetStatus() == kTableUnLoading) + && (t3->GetStatus() == kTableReady)); + } }; TEST_F(MasterImplTest, SplitTest) { @@ -101,6 +150,10 @@ TEST_F(MasterImplTest, MergeTest) { MergeTabletTest(); } +TEST_F(MasterImplTest, MergeTabletBorkenTest) { + MergeTabletBorkenTest(); +} + } // master } // tera diff --git a/src/master/user_manager.cc b/src/master/user_manager.cc index 0b31c4a77..cc290dc14 100644 --- a/src/master/user_manager.cc +++ b/src/master/user_manager.cc @@ -54,7 +54,7 @@ void User::ToMetaTableKeyValue(std::string* packed_key, bool UserManager::AddUser(const std::string& user_name, const UserInfo& user_info) { MutexLock locker(&mutex_); - boost::shared_ptr user(new User(user_name, user_info)); + std::shared_ptr user(new User(user_name, user_info)); user->SetUserInfo(user_info); std::pair ret = diff --git a/src/master/user_manager.h b/src/master/user_manager.h index cd33aed29..08899996b 100644 --- a/src/master/user_manager.h +++ b/src/master/user_manager.h @@ -5,7 +5,7 @@ #ifndef TERA_MASTER_USER_MANAGER_H_ #define TERA_MASTER_USER_MANAGER_H_ -#include +#include #include "common/base/scoped_ptr.h" #include "common/mutex.h" @@ -34,7 +34,7 @@ class User { std::string name_; UserInfo user_info_; }; -typedef boost::shared_ptr UserPtr; +typedef std::shared_ptr UserPtr; class UserManager { public: diff --git a/src/master/workload_scheduler.cc b/src/master/workload_scheduler.cc index 0ca2ec48d..f0f70540c 100644 --- a/src/master/workload_scheduler.cc +++ b/src/master/workload_scheduler.cc @@ -4,6 +4,8 @@ #include "master/workload_scheduler.h" +#include + #include "glog/logging.h" #include "master/tablet_manager.h" diff --git a/src/proto/rpc_client.h b/src/proto/rpc_client.h index a787a5f98..657b40d82 100644 --- a/src/proto/rpc_client.h +++ b/src/proto/rpc_client.h @@ -7,7 +7,7 @@ #include -#include +#include #include #include @@ -41,6 +41,7 @@ class RpcClientBase { public: static void SetOption(int32_t max_inflow, int32_t max_outflow, int32_t pending_buffer_size, int32_t thread_num) { + channel_options_.create_with_init = false; if (-1 != max_inflow) { rpc_client_options_.max_throughput_in = max_inflow; } @@ -75,9 +76,15 @@ class RpcClientBase { if (it != rpc_channel_list_.end()) { rpc_channel_ = it->second; } else { - rpc_channel_ = rpc_channel_list_[server_addr] - = new sofa::pbrpc::RpcChannel(&rpc_client_, server_addr, - channel_options_); + sofa::pbrpc::RpcChannel* c = new sofa::pbrpc::RpcChannel(&rpc_client_, + server_addr, + channel_options_); + if (c->Init()) { + rpc_channel_ = rpc_channel_list_[server_addr] = c; + } else { + delete c; + rpc_channel_ = NULL; + } } mutex_.Unlock(); } @@ -119,7 +126,11 @@ class RpcClient : public RpcClientBase { } */ RpcClientBase::ResetClient(server_addr); - server_client_.reset(new ServerType(rpc_channel_)); + if (rpc_channel_ == NULL) { + server_client_.reset(NULL); + } else { + server_client_.reset(new ServerType(rpc_channel_)); + } server_addr_ = server_addr; // VLOG(5) << "reset connected address to: " << server_addr; } @@ -139,7 +150,7 @@ class RpcClient : public RpcClientBase { // async call ThreadPool::Task callback = - boost::bind(&RpcClient::template UserCallback, + std::bind(&RpcClient::template UserCallback, request, response, closure, true, (int)sofa::pbrpc::RPC_ERROR_RESOLVE_ADDRESS); thread_pool->AddTask(callback); @@ -193,7 +204,7 @@ class RpcClient : public RpcClientBase { // async call ThreadPool::Task done = - boost::bind(&RpcClient::template UserCallback, + std::bind(&RpcClient::template UserCallback, request, response, closure, failed, error); thread_pool->AddTask(done); } diff --git a/src/sdk/http/http.cc b/src/sdk/http/http.cc index 75dd8365a..562c647f3 100644 --- a/src/sdk/http/http.cc +++ b/src/sdk/http/http.cc @@ -4,7 +4,7 @@ #include -#include "boost/bind.hpp" +#include "functional" #include "common/mutex.h" #include "common/thread_pool.h" #include "gflags/gflags.h" @@ -80,7 +80,7 @@ class HttpProxyImpl : public tera::http::HttpProxy { VLOG(25) << "accept RPC (Get)"; read_request_counter.Add(1); common::ThreadPool::Task callback = - boost::bind(&HttpProxyImpl::DoGet, this, controller, request, response, done); + std::bind(&HttpProxyImpl::DoGet, this, controller, request, response, done); request_pool_->AddTask(callback); } virtual void DoGet(google::protobuf::RpcController* controller, @@ -95,7 +95,7 @@ class HttpProxyImpl : public tera::http::HttpProxy { VLOG(25) << "accept RPC (Put)"; write_request_counter.Add(1); common::ThreadPool::Task callback = - boost::bind(&HttpProxyImpl::DoPut, this, controller, request, response, done); + std::bind(&HttpProxyImpl::DoPut, this, controller, request, response, done); request_pool_->AddTask(callback); } virtual void DoPut(google::protobuf::RpcController* controller, @@ -236,7 +236,7 @@ void HttpProxyImpl::LogCounter() { LOG(INFO) << "[read] request: " << read_request_counter.Clear() << " response: " << read_response_counter.Clear(); common::ThreadPool::Task callback = - boost::bind(&HttpProxyImpl::LogCounter, this); + std::bind(&HttpProxyImpl::LogCounter, this); ctrl_pool_->DelayTask(1000, callback); } diff --git a/src/sdk/mutate_impl.cc b/src/sdk/mutate_impl.cc index 1b2779923..a90f850d8 100644 --- a/src/sdk/mutate_impl.cc +++ b/src/sdk/mutate_impl.cc @@ -21,6 +21,7 @@ RowMutationImpl::RowMutationImpl(Table* table, const std::string& row_key) finish_cond_(&finish_mutex_), commit_times_(0), on_finish_callback_(NULL), + start_ts_(get_micros()), txn_(NULL) { SetErrorIfInvalid(row_key, kRowkey); } diff --git a/src/sdk/python/TeraSdk.py b/src/sdk/python/TeraSdk.py index 01b3f1b2b..07d07b889 100644 --- a/src/sdk/python/TeraSdk.py +++ b/src/sdk/python/TeraSdk.py @@ -324,7 +324,8 @@ def PutKV(self, value, ttl): value(string): cell的值 ttl: value 过期时间 """ - lib.tera_row_mutation_put_kv(self.mutation, value, c_uint64(len(value)), c_int32(ttl)) + lib.tera_row_mutation_put_kv(self.mutation, value, + c_uint64(len(value)), c_int32(ttl)) def Put(self, cf, qu, value): """ 写入(修改)这一行上 @@ -339,6 +340,12 @@ def Put(self, cf, qu, value): qu, c_uint64(len(qu)), value, c_uint64(len(value))) + def PutWithTimestamp(self, cf, qu, timestamp, value): + lib.tera_row_mutation_put_with_timestamp(self.mutation, cf, + qu, c_uint64(len(qu)), + timestamp, + value, c_uint64(len(value))) + def DeleteColumnAllVersions(self, cf, qu): """ 删除这一行上 ColumnFamily为, Qualifier为的cell的所有版本 @@ -950,7 +957,7 @@ def init_function_prototype_for_table(): POINTER(c_char_p)] lib.tera_table_put.restype = c_bool - lib.tera_table_put_kv.argtypes = [c_void_p, c_char_p, c_uint64, + lib.tera_table_put_kv.argtypes = [c_void_p, c_char_p, c_uint64, c_char_p, c_uint64, c_int32, POINTER(c_char_p)] lib.tera_table_put_kv.restype = c_bool @@ -999,7 +1006,8 @@ def init_function_prototype_for_table(): def init_function_prototype_for_row_mutation(): """ row_mutation""" - lib.tera_row_mutation_put_kv.argtypes = [c_void_p, c_char_p, c_uint64, c_int32] + lib.tera_row_mutation_put_kv.argtypes = [c_void_p, c_char_p, + c_uint64, c_int32] lib.tera_row_mutation_put_kv.restype = None lib.tera_row_mutation_put.argtypes = [c_void_p, c_char_p, @@ -1007,6 +1015,12 @@ def init_function_prototype_for_row_mutation(): c_char_p, c_uint64] lib.tera_row_mutation_put.restype = None + lib.tera_row_mutation_put_with_timestamp.argtypes = [c_void_p, c_char_p, + c_char_p, c_uint64, + c_int64, + c_void_p, c_uint64] + lib.tera_row_mutation_put_with_timestamp.restype = None + lib.tera_row_mutation_put_int64.argtypes = [c_void_p, c_char_p, c_char_p, c_uint64, c_int64] lib.tera_row_mutation_put_int64.restype = None diff --git a/src/sdk/python/sample.py b/src/sdk/python/sample.py index e5c735223..a95efa74e 100644 --- a/src/sdk/python/sample.py +++ b/src/sdk/python/sample.py @@ -78,6 +78,8 @@ def main(): # async get async_get(table) + put_get_with_timestamp(table) + table.Close() client.Close() print("main() done\n") @@ -170,6 +172,35 @@ def async_get(table): time.sleep(0.01) +def put_get_with_timestamp(table): + print("\nput_get_with_timestamp") + key = "nput_get_with_timestamp" + mu = table.NewRowMutation(key) + mu.PutWithTimestamp("cf0", "qu0", 42, "value") + table.ApplyMutation(mu) + while not table.IsPutFinished(): + time.sleep(0.01) + + reader = table.NewRowReader(key) + reader.AddColumn("cf0", "qu0") + table.ApplyReader(reader) + while not table.IsGetFinished(): + time.sleep(0.01) + + status = reader.GetStatus() + if status.GetReasonNumber() != Status.OK: + print(status.GetReasonString()) + return + while not reader.Done(): + row = reader.RowKey() + column = reader.Family() + ":" + reader.Qualifier() + timestamp = str(reader.Timestamp()) + assert reader.Timestamp() == 42 + val = reader.Value() + print row + ":" + column + ":" + timestamp + ":" + val + reader.Next() + + def put_get_int64(table, rowkey, cf, qu, value): try: table.PutInt64(rowkey, cf, qu, value) diff --git a/src/sdk/read_impl.cc b/src/sdk/read_impl.cc index 02cd52f4b..352e645b0 100644 --- a/src/sdk/read_impl.cc +++ b/src/sdk/read_impl.cc @@ -25,6 +25,7 @@ RowReaderImpl::RowReaderImpl(TableImpl* table, const std::string& row_key) result_pos_(0), commit_times_(0), on_finish_callback_(NULL), + start_ts_(get_micros()), txn_(NULL) { } diff --git a/src/sdk/scan_impl.cc b/src/sdk/scan_impl.cc index 271c68a3e..d18e186f3 100644 --- a/src/sdk/scan_impl.cc +++ b/src/sdk/scan_impl.cc @@ -4,7 +4,7 @@ #include "sdk/scan_impl.h" -#include +#include #include "common/this_thread.h" #include "common/base/closure.h" @@ -22,6 +22,9 @@ DECLARE_bool(tera_sdk_batch_scan_enabled); DECLARE_int64(tera_sdk_scan_number_limit); DECLARE_int64(tera_sdk_scan_buffer_size); DECLARE_int32(tera_sdk_max_batch_scan_req); +DECLARE_int32(tera_sdk_batch_scan_max_retry); +DECLARE_int64(tera_sdk_scan_timeout); +DECLARE_int64(batch_scan_delay_retry_in_us); namespace tera { @@ -69,10 +72,13 @@ std::string ResultStreamImpl::GetNextStartPoint(const std::string& str) { /////////////////////////////////////// ResultStreamBatchImpl::ResultStreamBatchImpl(TableImpl* table, ScanDescImpl* scan_desc) : ResultStreamImpl(table, scan_desc), - cv_(&mu_), ref_count_(1) { + cv_(&mu_), session_retry_(0), ref_count_(1) { // do something startup sliding_window_.resize(FLAGS_tera_sdk_max_batch_scan_req); session_end_key_ = scan_desc_impl_->GetStartRowKey(); + slot_last_key_.set_key(session_end_key_); + slot_last_key_.set_timestamp(INT64_MAX); + mu_.Lock(); ScanSessionReset(); mu_.Unlock(); @@ -86,7 +92,7 @@ void ResultStreamBatchImpl::GetRpcHandle(ScanTabletRequest** request_ptr, MutexLock mutex(&mu_); (*request_ptr)->set_part_of_session(part_of_session_); (*request_ptr)->set_session_id((int64_t)session_id_); - VLOG(28) << "Get rpc handle, part_of_session_ " << part_of_session_ + VLOG(28) << "get rpc handle, part_of_session_ " << part_of_session_ << ", session_id_ " << session_id_ << ", response " << (uint64_t)(*response_ptr); } @@ -98,7 +104,8 @@ void ResultStreamBatchImpl::ReleaseRpcHandle(ScanTabletRequest* request, MutexLock mutex(&mu_); ref_count_--; - VLOG(28) << "release rpc handle and wakeup, ref_count_ " << ref_count_; + VLOG(28) << "release rpc handle and wakeup, ref_count_ " << ref_count_ + << ", response " << (uint64_t)(response); cv_.Signal(); } @@ -147,7 +154,7 @@ void ResultStreamBatchImpl::OnFinish(ScanTabletRequest* request, ScanSlot* slot = &(sliding_window_[slot_idx]); if (slot->state_ == SCANSLOT_INVALID) { slot->state_ = SCANSLOT_VALID; - slot->cell_ = response->results(); + slot->cell_.CopyFrom(response->results()); VLOG(28) << "cache scan result, session_id " << session_id_ << ", slot_idx " << slot_idx << ", kv.size() " << slot->cell_.key_values_size() << ", resp.kv.size() " << response->results().key_values_size(); @@ -172,6 +179,26 @@ ResultStreamBatchImpl::~ResultStreamBatchImpl() { while (ref_count_ != 0) { cv_.Wait();} } +void ResultStreamBatchImpl::ComputeStartKey(const KeyValuePair& kv, KeyValuePair* start_key) { + if (scan_desc_impl_->IsKvOnlyTable()) { // kv, set next key + start_key->set_key(GetNextStartPoint(kv.key())); + start_key->set_column_family(kv.column_family()); + start_key->set_qualifier(kv.qualifier()); + start_key->set_timestamp(kv.timestamp()); + } else if (kv.timestamp() == 0) { // table timestamp == 0 + start_key->set_key(kv.key()); + start_key->set_column_family(kv.column_family()); + start_key->set_qualifier(GetNextStartPoint(kv.qualifier())); + start_key->set_timestamp(INT64_MAX); + } else { // table has timestamp > 0 + start_key->set_key(kv.key()); + start_key->set_column_family(kv.column_family()); + start_key->set_qualifier(kv.qualifier()); + start_key->set_timestamp(kv.timestamp() - 1); + } + return; +} + void ResultStreamBatchImpl::ScanSessionReset() { mu_.AssertHeld(); // reset session parameter @@ -193,8 +220,11 @@ void ResultStreamBatchImpl::ScanSessionReset() { } ref_count_ += FLAGS_tera_sdk_max_batch_scan_req; - scan_desc_impl_->SetStart(session_end_key_); - VLOG(28) << "scan session reset, start key " << session_end_key_ + KeyValuePair start_key; + ComputeStartKey(slot_last_key_, &start_key); + scan_desc_impl_->SetStart(start_key.key(), start_key.column_family(), + start_key.qualifier(), start_key.timestamp()); + VLOG(28) << "scan session reset, start key " << start_key.key() << ", ref_count " << ref_count_; mu_.Unlock(); // do io, release lock @@ -208,6 +238,10 @@ void ResultStreamBatchImpl::ScanSessionReset() { void ResultStreamBatchImpl::ClearAndScanNextSlot(bool scan_next) { mu_.AssertHeld(); ScanSlot* slot = &(sliding_window_[sliding_window_idx_]); + assert(next_idx_ == slot->cell_.key_values_size()); + if (next_idx_ > 0) { // update last slot kv_pair + slot_last_key_.CopyFrom(slot->cell_.key_values(next_idx_ - 1)); + } slot->cell_.Clear(); slot->state_ = SCANSLOT_INVALID; next_idx_ = 0; @@ -235,23 +269,42 @@ bool ResultStreamBatchImpl::Done(ErrorCode* error) { // 2. ts not available, or // 3. rpc not available, or ScanSlot* slot = &(sliding_window_[sliding_window_idx_]); - while(slot->state_ == SCANSLOT_INVALID) { + while (slot->state_ == SCANSLOT_INVALID) { // stale results_id, re-enable another scan req if (session_error_ != kTabletNodeOk) { // TODO: kKeyNotInRange, do reset session - LOG(WARNING) << "scan done: session error " << StatusCodeToString(session_error_); + LOG(WARNING) << "[RETRY " << ++session_retry_ << "] scan session error: " + << StatusCodeToString(session_error_) + << ", data_idx " << session_data_idx_ << ", slice_idx " << sliding_window_idx_; + assert(session_done_); + if (session_retry_ <= FLAGS_tera_sdk_batch_scan_max_retry) { + break; + } + + // give up scan, report session error if (error) { error->SetFailed(ErrorCode::kSystem, StatusCodeToString(session_error_)); } return true; } if (ref_count_ == 1) { - // ts refuse scan... - LOG(WARNING) << "ts refuse scan, scan later...\n"; + // check wether ts refuse scan + if (error) { + error->SetFailed(ErrorCode::kSystem, StatusCodeToString(session_error_)); + } + LOG(WARNING) << "[CHECK]: ts refuse scan, scan later.\n"; return true; } cv_.Wait(); } + if (slot->state_ == SCANSLOT_INVALID) { // TODO: error break, maybe delay retry + while (ref_count_ > 1) { cv_.Wait();} + cv_.TimeWaitInUs(FLAGS_batch_scan_delay_retry_in_us, "BatchScanRetryTimeWait"); + ScanSessionReset(); + continue; + } + + // slot valid if (next_idx_ < slot->cell_.key_values_size()) { break; } VLOG(28) << "session_done_ " << session_done_ << ", session_data_idx_ " @@ -276,6 +329,8 @@ bool ResultStreamBatchImpl::Done(ErrorCode* error) { } // scan next tablet + slot_last_key_.set_key(session_end_key_); + slot_last_key_.set_timestamp(INT64_MAX); ScanSessionReset(); } return false; @@ -366,7 +421,7 @@ bool ResultStreamSyncImpl::Done(ErrorCode* err) { kv.qualifier(), kv.timestamp()); } else if (kv.timestamp() == 0) { scan_desc_impl_->SetStart(kv.key(), kv.column_family(), - GetNextStartPoint(kv.qualifier()), kv.timestamp()); + GetNextStartPoint(kv.qualifier()), INT64_MAX); } else { scan_desc_impl_->SetStart(kv.key(), kv.column_family(), kv.qualifier(), kv.timestamp() - 1); @@ -487,7 +542,7 @@ ScanDescImpl::ScanDescImpl(const string& rowkey) number_limit_(FLAGS_tera_sdk_scan_number_limit), is_async_(FLAGS_tera_sdk_batch_scan_enabled), max_version_(1), - pack_interval_(5000), + pack_interval_(FLAGS_tera_sdk_scan_timeout), snapshot_(0), value_converter_(&DefaultValueConverter) { SetStart(rowkey); diff --git a/src/sdk/scan_impl.h b/src/sdk/scan_impl.h index 97eca7e6a..2d808044f 100644 --- a/src/sdk/scan_impl.h +++ b/src/sdk/scan_impl.h @@ -91,12 +91,14 @@ class ResultStreamBatchImpl : public ResultStreamImpl { ScanTabletResponse* response); // scan callback private: void ClearAndScanNextSlot(bool scan_next); + void ComputeStartKey(const KeyValuePair& kv, KeyValuePair* start_key); void ScanSessionReset(); private: mutable Mutex mu_; CondVar cv_; + int32_t session_retry_; int32_t ref_count_; // use for scan_imple destory // session control @@ -106,6 +108,7 @@ class ResultStreamBatchImpl : public ResultStreamImpl { uint32_t session_data_idx_; // current result id wait bool part_of_session_; // TODO, should be deleted std::string session_end_key_; + KeyValuePair slot_last_key_; uint32_t session_last_idx_; // if session done, point to the last data_idx // sliding window control diff --git a/src/sdk/schema_impl.h b/src/sdk/schema_impl.h index 784b5dfdd..a68a09f77 100644 --- a/src/sdk/schema_impl.h +++ b/src/sdk/schema_impl.h @@ -191,6 +191,9 @@ class TableDescImpl { void SetAlias(const std::string& alias); std::string Alias() const; + static const std::string DEFAULT_LG_NAME; + static const std::string DEFAULT_CF_NAME; + private: typedef std::map LGMap; typedef std::map CFMap; @@ -202,8 +205,6 @@ class TableDescImpl { int32_t next_lg_id_; int32_t next_cf_id_; std::vector snapshots_; - static const std::string DEFAULT_LG_NAME; - static const std::string DEFAULT_CF_NAME; RawKeyType raw_key_type_; int64_t split_size_; int64_t merge_size_; diff --git a/src/sdk/sdk_task.cc b/src/sdk/sdk_task.cc index adf3318fb..ce1d64e2d 100644 --- a/src/sdk/sdk_task.cc +++ b/src/sdk/sdk_task.cc @@ -38,14 +38,6 @@ void SdkTask::ExcludeOtherRef() { CHECK_EQ(ref_, 1); } -int64_t GetSdkTaskId(SdkTask* task) { - return task->GetId(); -} - -uint64_t GetSdkTaskDueTime(SdkTask* task) { - return task->DueTime(); -} - SdkTimeoutManager::SdkTimeoutManager(ThreadPool* thread_pool) : thread_pool_(thread_pool), timeout_precision_(FLAGS_tera_sdk_timeout_precision), @@ -53,7 +45,7 @@ SdkTimeoutManager::SdkTimeoutManager(ThreadPool* thread_pool) bg_exit_(false), bg_cond_(&bg_mutex_), bg_func_id_(0), - bg_func_(boost::bind(&SdkTimeoutManager::CheckTimeout, this)) { + bg_func_(std::bind(&SdkTimeoutManager::CheckTimeout, this)) { if (timeout_precision_ <= 0) { timeout_precision_ = 1; } @@ -94,10 +86,11 @@ bool SdkTimeoutManager::PutTask(SdkTask* task, int64_t timeout, Mutex& mutex = mutex_shard_[shard_id]; MutexLock l(&mutex); - std::pair insert_ret; - insert_ret = map.insert(task); + std::pair insert_ret; + insert_ret = map.id_hash_map.insert(std::pair(task_id, task)); bool insert_success = insert_ret.second; if (insert_success) { + map.due_time_map.insert(task); task->IncRef(); } return insert_success; @@ -109,10 +102,9 @@ SdkTask* SdkTimeoutManager::GetTask(int64_t task_id) { Mutex& mutex = mutex_shard_[shard_id]; MutexLock l(&mutex); - TaskIdIndex& id_index = map.get(); - TaskIdIndex::iterator it = id_index.find(task_id); - if (it != id_index.end()) { - SdkTask* task = *it; + IdHashMap::iterator it = map.id_hash_map.find(task_id); + if (it != map.id_hash_map.end()) { + SdkTask* task = it->second; CHECK_EQ(task->GetId(), task_id); task->IncRef(); return task; @@ -127,12 +119,12 @@ SdkTask* SdkTimeoutManager::PopTask(int64_t task_id) { Mutex& mutex = mutex_shard_[shard_id]; MutexLock l(&mutex); - TaskIdIndex& id_index = map.get(); - TaskIdIndex::iterator it = id_index.find(task_id); - if (it != id_index.end()) { - SdkTask* task = *it; + IdHashMap::iterator it = map.id_hash_map.find(task_id); + if (it != map.id_hash_map.end()) { + SdkTask* task = it->second; CHECK_EQ(task->GetId(), task_id); - id_index.erase(it); + map.id_hash_map.erase(it); + map.due_time_map.erase(task); return task; } else { return NULL; @@ -146,16 +138,16 @@ void SdkTimeoutManager::CheckTimeout() { Mutex& mutex = mutex_shard_[shard_id]; MutexLock l(&mutex); - while (!map.empty()) { - TaskDueTimeIndex& due_time_index = map.get(); - TaskDueTimeIndex::iterator it = due_time_index.begin(); + while (!map.due_time_map.empty()) { + DueTimeMap::iterator it = map.due_time_map.begin(); SdkTask* task = *it; if (task->DueTime() > (uint64_t)now_ms) { break; } - due_time_index.erase(it); + map.due_time_map.erase(it); + map.id_hash_map.erase(task->GetId()); mutex.Unlock(); - thread_pool_->AddTask(boost::bind(&SdkTimeoutManager::RunTimeoutFunc, this, task)); + thread_pool_->AddTask(std::bind(&SdkTimeoutManager::RunTimeoutFunc, this, task)); mutex.Lock(); } } diff --git a/src/sdk/sdk_task.h b/src/sdk/sdk_task.h index e231a1a44..58f61f65a 100644 --- a/src/sdk/sdk_task.h +++ b/src/sdk/sdk_task.h @@ -5,12 +5,9 @@ #ifndef TERA_SDK_SDK_TASK_H_ #define TERA_SDK_SDK_TASK_H_ -#include -#include -#include -#include -#include -#include +#include +#include +#include #include "common/base/stdint.h" #include "common/mutex.h" @@ -23,7 +20,7 @@ namespace tera { class SdkTask { public: - typedef boost::function TimeoutFunc; + typedef std::function TimeoutFunc; enum TYPE { READ, MUTATION, @@ -77,9 +74,11 @@ class SdkTask { typedef void (*StatCallback)(Table* table, SdkTask* task); -int64_t GetSdkTaskId(SdkTask* task); - -uint64_t GetSdkTaskDueTime(SdkTask* task); +struct SdkTaskDueTimeComp { + bool operator() (SdkTask* lhs, SdkTask* rhs) { + return lhs->DueTime() < rhs->DueTime(); + } +}; class SdkTimeoutManager { public: @@ -101,24 +100,14 @@ class SdkTimeoutManager { private: const static uint32_t kShardBits = 6; const static uint32_t kShardNum = (1 << kShardBits); - typedef boost::multi_index_container< - SdkTask*, - boost::multi_index::indexed_by< - // hashed on SdkTask::id_ - boost::multi_index::hashed_unique< - boost::multi_index::global_fun >, - - // sort by less on SdkTask::due_time_ms_ - boost::multi_index::ordered_non_unique< - boost::multi_index::global_fun > - > - > TaskMap; - enum { - INDEX_BY_ID = 0, - INDEX_BY_DUE_TIME = 1, + + typedef std::multiset DueTimeMap; + typedef std::unordered_map IdHashMap; + struct TaskMap { + DueTimeMap due_time_map; + IdHashMap id_hash_map; }; - typedef TaskMap::nth_index::type TaskIdIndex; - typedef TaskMap::nth_index::type TaskDueTimeIndex; + TaskMap map_shard_[kShardNum]; mutable Mutex mutex_shard_[kShardNum]; ThreadPool* thread_pool_; diff --git a/src/sdk/sdk_utils.cc b/src/sdk/sdk_utils.cc index 5d963a479..175bc7245 100644 --- a/src/sdk/sdk_utils.cc +++ b/src/sdk/sdk_utils.cc @@ -16,6 +16,7 @@ #include "glog/logging.h" #include "utils/string_util.h" +#include "sdk/schema_impl.h" #include "sdk/filter_utils.h" DECLARE_int64(tera_tablet_write_block_size); @@ -563,6 +564,15 @@ bool CheckTableDescrptor(const TableDescriptor& desc, ErrorCode* err) { } return false; } + if ((desc.RawKey() == kReadable || desc.RawKey() == kBinary)) { + if (desc.ColumnFamilyNum() == 0) { + ss << "kBinary/kReadable MUST have cf"; + if (err != NULL) { + err->SetFailed(ErrorCode::kBadParam, ss.str()); + } + return false; + } + } return true; } @@ -664,8 +674,17 @@ bool UpdateKvTableProperties(const PropTree::Node* table_node, TableDescriptor* LocalityGroupDescriptor* lg_desc = const_cast(table_desc->LocalityGroup("kv")); if (lg_desc == NULL) { - LOG(ERROR) << "[update] fail to get locality group: kv"; - return false; + LOG(ERROR) << "[update][WARNING] can not get locality group: kv(kv table)"; + + // maybe this is a old kv table, it's LocalityGroup name is TableDescImpl::DEFAULT_LG_NAME + lg_desc = + const_cast(table_desc->LocalityGroup(TableDescImpl::DEFAULT_LG_NAME)); + if (lg_desc == NULL) { + LOG(ERROR) << "[update] fail to get locality group: " << TableDescImpl::DEFAULT_LG_NAME; + return false; + } else { + LOG(ERROR) << "[update][WARNING] it seems this is a old-style kv table"; + } } for (std::map::const_iterator i = table_node->properties_.begin(); i != table_node->properties_.end(); ++i) { @@ -755,9 +774,9 @@ bool FillTableDescriptor(PropTree& schema_tree, TableDescriptor* table_desc) { // simple table mode, have 1 default lg // e.g. table1{cf1, cf2, cf3} LocalityGroupDescriptor* lg_desc; - lg_desc = table_desc->AddLocalityGroup("lg0"); + lg_desc = table_desc->AddLocalityGroup(TableDescImpl::DEFAULT_LG_NAME); if (lg_desc == NULL) { - LOG(ERROR) << "fail to add locality group: lg0"; + LOG(ERROR) << "fail to add locality group: " << TableDescImpl::DEFAULT_LG_NAME; return false; } // add all column families and properties @@ -931,9 +950,15 @@ bool ParseDelimiterFile(const string& filename, std::vector* delims) { bool is_delim_error = false; for (size_t i = 1; i < delimiters.size(); i++) { if (delimiters[i] <= delimiters[i-1]) { - LOG(ERROR) << "delimiter error: line: " << i + 1 - << ", [" << delimiters[i] << "]"; + LOG(ERROR) << "line[" << i << "]" << " SHOULD less than line[" << i + 1 + << "] (bitwise comparison, maybe LC_ALL=C if you use command sort(1))"; + LOG(ERROR) << "line[" << i << "]: (" << delimiters[i-1] << ")"; + LOG(ERROR) << "line[" << i + 1 << "]: (" << delimiters[i] << ")"; is_delim_error = true; + // just print the 1st invalid input case, + // if print all invalid input, + // it will print too many log to read/understand + break; } } if (is_delim_error) { diff --git a/src/sdk/single_row_txn.cc b/src/sdk/single_row_txn.cc index 5895ae894..7c870f592 100644 --- a/src/sdk/single_row_txn.cc +++ b/src/sdk/single_row_txn.cc @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. -#include +#include #include "common/thread_pool.h" #include "common/base/string_format.h" @@ -45,7 +45,7 @@ void SingleRowTxn::ApplyMutation(RowMutation* row_mu) { } if (row_mu->IsAsync()) { - ThreadPool::Task task = boost::bind(&RowMutationImpl::RunCallback, row_mu_impl); + ThreadPool::Task task = std::bind(&RowMutationImpl::RunCallback, row_mu_impl); thread_pool_->AddTask(task); } } @@ -79,7 +79,7 @@ void SingleRowTxn::Get(RowReader* row_reader) { } if (reader_impl->GetError().GetType() != ErrorCode::kOK) { if (is_async) { - ThreadPool::Task task = boost::bind(&RowReaderImpl::RunCallback, reader_impl); + ThreadPool::Task task = std::bind(&RowReaderImpl::RunCallback, reader_impl); thread_pool_->AddTask(task); } return; @@ -171,7 +171,7 @@ void SingleRowTxn::Commit() { table_->ApplyMutation(&mutation_buffer_); } else { if (user_commit_callback_ != NULL) { - ThreadPool::Task task = boost::bind(user_commit_callback_, this); + ThreadPool::Task task = std::bind(user_commit_callback_, this); thread_pool_->AddTask(task); } } diff --git a/src/sdk/table_impl.cc b/src/sdk/table_impl.cc index 175d03d97..815a61cf6 100644 --- a/src/sdk/table_impl.cc +++ b/src/sdk/table_impl.cc @@ -3,24 +3,23 @@ // found in the LICENSE file. #include "table_impl.h" -#include "tera.h" #include #include +#include #include #include #include #include +#include #include #include -#include #include #include "common/base/string_format.h" #include "common/file/file_path.h" #include "common/file/recordio/record_io.h" - #include "io/coding.h" #include "proto/kv_helper.h" #include "proto/proto_helper.h" @@ -32,6 +31,7 @@ #include "sdk/scan_impl.h" #include "sdk/schema_impl.h" #include "sdk/sdk_zk.h" +#include "tera.h" #include "utils/crypt.h" #include "utils/string_util.h" #include "utils/timer.h" @@ -59,6 +59,8 @@ DECLARE_bool(tera_sdk_perf_counter_enabled); DECLARE_int64(tera_sdk_perf_counter_log_interval); DECLARE_int32(tera_rpc_timeout_period); +using namespace std::placeholders; + namespace tera { TableImpl::TableImpl(const std::string& table_name, @@ -135,10 +137,14 @@ void OpStatCallback(Table* table, SdkTask* task) { void TableImpl::ApplyMutation(RowMutation* row_mu) { perf_counter_.user_mu_cnt.Add(1); ((RowMutationImpl*)row_mu)->Prepare(OpStatCallback); - if (row_mu->GetError().GetType() != ErrorCode::kOK) { + if (row_mu->GetError().GetType() != ErrorCode::kOK) { // local check fail + if (!((RowMutationImpl*)row_mu)->IsAsync()) { + ((RowMutationImpl*)row_mu)->RunCallback(); + return; + } ThreadPool::Task task = - boost::bind(&RowMutationImpl::RunCallback, - static_cast(row_mu)); + std::bind(&RowMutationImpl::RunCallback, + static_cast(row_mu)); thread_pool_->AddTask(task); return; } @@ -152,10 +158,14 @@ void TableImpl::ApplyMutation(const std::vector& row_mutations) { for (uint32_t i = 0; i < row_mutations.size(); i++) { perf_counter_.user_mu_cnt.Add(1); ((RowMutationImpl*)row_mutations[i])->Prepare(OpStatCallback); - if (row_mutations[i]->GetError().GetType() != ErrorCode::kOK) { + if (row_mutations[i]->GetError().GetType() != ErrorCode::kOK) { // local check fail + if (!((RowMutationImpl*)row_mutations[i])->IsAsync()) { + ((RowMutationImpl*)row_mutations[i])->RunCallback(); + continue; + } ThreadPool::Task task = - boost::bind(&RowMutationImpl::RunCallback, - static_cast(row_mutations[i])); + std::bind(&RowMutationImpl::RunCallback, + static_cast(row_mutations[i])); thread_pool_->AddTask(task); continue; } @@ -178,6 +188,7 @@ bool TableImpl::Put(const std::string& row_key, const std::string& family, row_mu->Put(family, qualifier, value); ApplyMutation(row_mu); *err = row_mu->GetError(); + delete row_mu; return (err->GetType() == ErrorCode::kOK ? true : false); } @@ -188,6 +199,7 @@ bool TableImpl::Put(const std::string& row_key, const std::string& family, row_mu->Put(family, qualifier, timestamp, value); ApplyMutation(row_mu); *err = row_mu->GetError(); + delete row_mu; return (err->GetType() == ErrorCode::kOK ? true : false); } @@ -198,6 +210,7 @@ bool TableImpl::Put(const std::string& row_key, const std::string& family, row_mu->Put(family, qualifier, value, ttl); ApplyMutation(row_mu); *err = row_mu->GetError(); + delete row_mu; return (err->GetType() == ErrorCode::kOK ? true : false); } @@ -208,6 +221,7 @@ bool TableImpl::Put(const std::string& row_key, const std::string& family, row_mu->Put(family, qualifier, timestamp, value, ttl); ApplyMutation(row_mu); *err = row_mu->GetError(); + delete row_mu; return (err->GetType() == ErrorCode::kOK ? true : false); } @@ -217,6 +231,7 @@ bool TableImpl::Add(const std::string& row_key, const std::string& family, row_mu->Add(family, qualifier, delta); ApplyMutation(row_mu); *err = row_mu->GetError(); + delete row_mu; return (err->GetType() == ErrorCode::kOK ? true : false); } @@ -226,6 +241,7 @@ bool TableImpl::AddInt64(const std::string& row_key, const std::string& family, row_mu->AddInt64(family, qualifier, delta); ApplyMutation(row_mu); *err = row_mu->GetError(); + delete row_mu; return (err->GetType() == ErrorCode::kOK ? true : false); } @@ -236,6 +252,7 @@ bool TableImpl::PutIfAbsent(const std::string& row_key, const std::string& famil row_mu->PutIfAbsent(family, qualifier, value); ApplyMutation(row_mu); *err = row_mu->GetError(); + delete row_mu; return (err->GetType() == ErrorCode::kOK ? true : false); } @@ -246,6 +263,7 @@ bool TableImpl::Append(const std::string& row_key, const std::string& family, row_mu->Append(family, qualifier, value); ApplyMutation(row_mu); *err = row_mu->GetError(); + delete row_mu; return (err->GetType() == ErrorCode::kOK ? true : false); } @@ -335,8 +353,10 @@ bool TableImpl::Get(const std::string& row_key, const std::string& family, *err = row_reader->GetError(); if (err->GetType() == ErrorCode::kOK) { *value = row_reader->Value(); + delete row_reader; return true; } + delete row_reader; return false; } @@ -344,7 +364,8 @@ ResultStream* TableImpl::Scan(const ScanDescriptor& desc, ErrorCode* err) { ScanDescImpl * impl = desc.GetImpl(); impl->SetTableSchema(table_schema_); ResultStream * results = NULL; - if (desc.IsAsync() && (table_schema_.raw_key() != GeneralKv)) { + if (desc.IsAsync() && + (table_schema_.raw_key() == Binary || table_schema_.raw_key() == Readable)) { VLOG(6) << "activate async-scan"; results = new ResultStreamBatchImpl(this, impl); } else { @@ -425,6 +446,10 @@ void TableImpl::CommitScan(ScanTask* scan_task, column_family->CopyFrom(*(impl->GetColumnFamily(i))); } + VLOG(20) << "table " << request->table_name() + << ", start_key " << request->start() + << ", end_key " << request->end() + << ", scan to " << server_addr; request->set_timestamp(common::timer::get_micros()); Closure* done = NewClosure(this, &TableImpl::ScanCallBack, scan_task); @@ -464,9 +489,10 @@ void TableImpl::ScanCallBack(ScanTask* scan_task, } scan_task->SetInternalError(err); - if (err == kTabletNodeOk - || err == kSnapshotNotExist - || scan_task->RetryTimes() >= static_cast(FLAGS_tera_sdk_retry_times)) { + if (err == kTabletNodeOk || + err == kSnapshotNotExist || + stream->GetScanDesc()->IsAsync() || // batch scan retry internal + scan_task->RetryTimes() >= static_cast(FLAGS_tera_sdk_retry_times)) { if (err == kKeyNotInRange || err == kConnectError) { ScheduleUpdateMeta(stream->GetScanDesc()->GetStartRowKey(), scan_task->GetMetaTimeStamp()); @@ -479,7 +505,8 @@ void TableImpl::ScanCallBack(ScanTask* scan_task, } else { scan_task->IncRetryTimes(); ThreadPool::Task retry_task = - boost::bind(&TableImpl::ScanTabletAsync, this, scan_task, false); + std::bind(static_cast(&TableImpl::ScanTabletAsync), + this, scan_task, false); CHECK(scan_task->RetryTimes() > 0); int64_t retry_interval = static_cast(pow(FLAGS_tera_sdk_delay_send_internal, @@ -554,7 +581,7 @@ void TableImpl::DistributeMutations(const std::vector& mu_list } else { row_timeout = row_mutation->TimeOut() > 0 ? row_mutation->TimeOut() : timeout_; } - SdkTask::TimeoutFunc task = boost::bind(&TableImpl::MutationTimeout, this, _1); + SdkTask::TimeoutFunc task = std::bind(&TableImpl::MutationTimeout, this, _1); task_pool_.PutTask(row_mutation, row_timeout, task); } @@ -570,7 +597,7 @@ void TableImpl::DistributeMutations(const std::vector& mu_list cur_commit_pending_counter_.Sub(row_mutation->MutationNum()); row_mutation->SetError(ErrorCode::kBusy, "pending too much mutations, try it later."); ThreadPool::Task task = - boost::bind(&TableImpl::BreakRequest, this, row_mutation->GetId()); + std::bind(&TableImpl::BreakRequest, this, row_mutation->GetId()); row_mutation->DecRef(); thread_pool_->AddTask(task); continue; @@ -640,8 +667,8 @@ void TableImpl::PackMutations(const std::string& server_addr, mutation_batch = &mutation_batch_map_[server_addr]; mutation_batch->sequence_num = mutation_batch_seq_++; mutation_batch->row_id_list = new std::vector; - ThreadPool::Task task = boost::bind(&TableImpl::MutationBatchTimeout, this, - server_addr, mutation_batch->sequence_num); + ThreadPool::Task task = std::bind(&TableImpl::MutationBatchTimeout, this, + server_addr, mutation_batch->sequence_num); int64_t timer_id = thread_pool_->DelayTask(write_commit_timeout_, task); mutation_batch->timer_id = timer_id; mutation_batch->byte_size = 0; @@ -789,7 +816,7 @@ void TableImpl::MutateCallBack(std::vector* mu_id_list, err = response->row_status_list(i); } - if (err == kTabletNodeOk || err == kTxnFail) { + if (err == kTabletNodeOk || err == kTxnFail || err == kTableInvalidArg) { perf_counter_.mutate_ok_cnt.Inc(); SdkTask* task = task_pool_.PopTask(mu_id); if (task == NULL) { @@ -801,8 +828,10 @@ void TableImpl::MutateCallBack(std::vector* mu_id_list, RowMutationImpl* row_mutation = (RowMutationImpl*)task; if (err == kTabletNodeOk) { row_mutation->SetError(ErrorCode::kOK); - } else { + } else if (err == kTxnFail) { row_mutation->SetError(ErrorCode::kTxnFail, "transaction commit fail"); + } else { + row_mutation->SetError(ErrorCode::kBadParam, "illegal arg error"); } // only for flow control @@ -856,7 +885,7 @@ void TableImpl::MutateCallBack(std::vector* mu_id_list, int64_t retry_interval = static_cast(pow(FLAGS_tera_sdk_delay_send_internal, it->first) * 1000); ThreadPool::Task retry_task = - boost::bind(&TableImpl::DistributeMutationsById, this, it->second); + std::bind(&TableImpl::DistributeMutationsById, this, it->second); thread_pool_->DelayTask(retry_interval, retry_task); } @@ -938,7 +967,7 @@ void TableImpl::DistributeReaders(const std::vector& row_reader_ if (row_reader->IsAsync()) { row_timeout = row_reader->TimeOut() > 0 ? row_reader->TimeOut() : timeout_; } - SdkTask::TimeoutFunc task = boost::bind(&TableImpl::ReaderTimeout, this, _1); + SdkTask::TimeoutFunc task = std::bind(&TableImpl::ReaderTimeout, this, _1); task_pool_.PutTask(row_reader, row_timeout, task); } @@ -954,7 +983,7 @@ void TableImpl::DistributeReaders(const std::vector& row_reader_ cur_reader_pending_counter_.Dec(); row_reader->SetError(ErrorCode::kBusy, "pending too much readers, try it later."); ThreadPool::Task task = - boost::bind(&TableImpl::BreakRequest, this, row_reader->GetId()); + std::bind(&TableImpl::BreakRequest, this, row_reader->GetId()); row_reader->DecRef(); thread_pool_->AddTask(task); continue; @@ -1005,8 +1034,8 @@ void TableImpl::PackReaders(const std::string& server_addr, reader_buffer = &reader_batch_map_[server_addr]; reader_buffer->sequence_num = reader_batch_seq_++; reader_buffer->row_id_list = new std::vector; - ThreadPool::Task task = boost::bind(&TableImpl::ReaderBatchTimeout, this, - server_addr, reader_buffer->sequence_num); + ThreadPool::Task task = std::bind(&TableImpl::ReaderBatchTimeout, this, + server_addr, reader_buffer->sequence_num); uint64_t timer_id = thread_pool_->DelayTask(read_commit_timeout_, task); reader_buffer->timer_id = timer_id; } @@ -1092,6 +1121,7 @@ void TableImpl::CommitReaders(const std::string server_addr, row_reader->AddCommitTimes(); row_reader->DecRef(); } + VLOG(20) << "commit " << reader_list.size() << " reads to " << server_addr; request->set_timestamp(common::timer::get_micros()); Closure* done = NewClosure(this, &TableImpl::ReaderCallBack, reader_id_list); @@ -1205,7 +1235,7 @@ void TableImpl::ReaderCallBack(std::vector* reader_id_list, int64_t retry_interval = static_cast(pow(FLAGS_tera_sdk_delay_send_internal, it->first) * 1000); ThreadPool::Task retry_task = - boost::bind(&TableImpl::DistributeReadersById, this, it->second); + std::bind(&TableImpl::DistributeReadersById, this, it->second); thread_pool_->DelayTask(retry_interval, retry_task); } @@ -1319,9 +1349,9 @@ bool TableImpl::GetTabletAddrOrScheduleUpdateMeta(const std::string& row, VLOG(10) << "update meta in " << update_interval << " (ms) for key:" << row; node->status = DELAY_UPDATE; ThreadPool::Task delay_task = - boost::bind(&TableImpl::DelayUpdateMeta, this, - node->meta.key_range().key_start(), - node->meta.key_range().key_end()); + std::bind(&TableImpl::DelayUpdateMeta, this, + node->meta.key_range().key_start(), + node->meta.key_range().key_end()); thread_pool_->DelayTask(update_interval, delay_task); } return false; @@ -1436,8 +1466,8 @@ void TableImpl::ScanMetaTableAsync(const std::string& key_start, const std::stri VLOG(6) << "root is empty"; ThreadPool::Task retry_task = - boost::bind(&TableImpl::ScanMetaTableAsyncInLock, this, key_start, key_end, - expand_key_end, true); + std::bind(&TableImpl::ScanMetaTableAsyncInLock, this, key_start, key_end, + expand_key_end, true); thread_pool_->DelayTask(FLAGS_tera_sdk_update_meta_internal, retry_task); return; } @@ -1495,8 +1525,8 @@ void TableImpl::ScanMetaTableCallBack(std::string key_start, GiveupUpdateTabletMeta(key_start, key_end); } ThreadPool::Task retry_task = - boost::bind(&TableImpl::ScanMetaTableAsyncInLock, this, key_start, key_end, - expand_key_end, true); + std::bind(&TableImpl::ScanMetaTableAsyncInLock, this, key_start, key_end, + expand_key_end, true); thread_pool_->DelayTask(FLAGS_tera_sdk_update_meta_internal, retry_task); delete request; delete response; @@ -1740,9 +1770,9 @@ void TableImpl::ScheduleUpdateMeta(const std::string& row, } else { node->status = DELAY_UPDATE; ThreadPool::Task delay_task = - boost::bind(&TableImpl::DelayUpdateMeta, this, - node->meta.key_range().key_start(), - node->meta.key_range().key_end()); + std::bind(&TableImpl::DelayUpdateMeta, this, + node->meta.key_range().key_start(), + node->meta.key_range().key_end()); thread_pool_->DelayTask(update_interval, delay_task); } } @@ -1784,7 +1814,7 @@ void TableImpl::ReadTableMetaAsync(ErrorCode* ret_err, int32_t retry_times, int64_t retry_interval = static_cast(pow(FLAGS_tera_sdk_delay_send_internal, retry_times) * 1000); ThreadPool::Task retry_task = - boost::bind(&TableImpl::ReadTableMetaAsync, this, ret_err, retry_times + 1, true); + std::bind(&TableImpl::ReadTableMetaAsync, this, ret_err, retry_times + 1, true); thread_pool_->DelayTask(retry_interval, retry_task); } return; @@ -1872,7 +1902,7 @@ void TableImpl::ReadTableMetaCallBack(ErrorCode* ret_err, int64_t retry_interval = static_cast(pow(FLAGS_tera_sdk_delay_send_internal, retry_times) * 1000); ThreadPool::Task retry_task = - boost::bind(&TableImpl::ReadTableMetaAsync, this, ret_err, retry_times + 1, true); + std::bind(&TableImpl::ReadTableMetaAsync, this, ret_err, retry_times + 1, true); thread_pool_->DelayTask(retry_interval, retry_task); } @@ -1954,12 +1984,12 @@ void TableImpl::DoDumpCookie() { void TableImpl::DumpCookie() { DoDumpCookie(); - ThreadPool::Task task = boost::bind(&TableImpl::DumpCookie, this); + ThreadPool::Task task = std::bind(&TableImpl::DumpCookie, this); AddDelayTask(FLAGS_tera_sdk_cookie_update_interval * 1000LL, task); } void TableImpl::EnableCookieUpdateTimer() { - ThreadPool::Task task = boost::bind(&TableImpl::DumpCookie, this); + ThreadPool::Task task = std::bind(&TableImpl::DumpCookie, this); AddDelayTask(FLAGS_tera_sdk_cookie_update_interval * 1000LL, task); } @@ -1977,18 +2007,10 @@ std::string TableImpl::GetCookieFileName(const std::string& tablename, return fname.str(); } -static int64_t CalcAverage(Counter& sum, Counter& cnt, int64_t interval) { - if (cnt.Get() == 0 || interval == 0) { - return 0; - } else { - return sum.Clear() * 1000 / cnt.Clear() / interval / 1000; - } -} - void TableImpl::DumpPerfCounterLogDelay() { DoDumpPerfCounterLog(); ThreadPool::Task task = - boost::bind(&TableImpl::DumpPerfCounterLogDelay, this); + std::bind(&TableImpl::DumpPerfCounterLogDelay, this); AddDelayTask(FLAGS_tera_sdk_perf_counter_log_interval * 1000, task); } @@ -2000,14 +2022,12 @@ void TableImpl::DoDumpPerfCounterLog() { } void TableImpl::PerfCounter::DoDumpPerfCounterLog(const std::string& log_prefix) { - int64_t ts = common::timer::get_micros(); - int64_t interval = (ts - start_time) / 1000; LOG(INFO) << log_prefix << "[delay](ms)" - << " get meta: " << CalcAverage(get_meta, get_meta_cnt, interval) - << " callback: " << CalcAverage(user_callback, user_callback_cnt, interval) - << " rpc_r: " << CalcAverage(rpc_r, rpc_r_cnt, interval) - << " rpc_w: " << CalcAverage(rpc_w, rpc_w_cnt, interval) - << " rpc_s: " << CalcAverage(rpc_s, rpc_s_cnt, interval); + << " get meta: " << (get_meta_cnt.Get() > 0 ? get_meta.Clear() / get_meta_cnt.Clear() / 1000 : 0) + << " callback: " << (user_callback_cnt.Get() > 0 ? user_callback.Clear() / user_callback_cnt.Clear() / 1000 : 0) + << " rpc_r: " << (rpc_r_cnt.Get() > 0 ? rpc_r.Clear() / rpc_r_cnt.Clear() / 1000 : 0) + << " rpc_w: " << (rpc_w_cnt.Get() > 0 ? rpc_w.Clear() / rpc_w_cnt.Clear() / 1000 : 0) + << " rpc_s: " << (rpc_s_cnt.Get() > 0 ? rpc_s.Clear() / rpc_s_cnt.Clear() / 1000 : 0); LOG(INFO) << log_prefix << "[mutation]" << " all: " << mutate_cnt.Clear() @@ -2029,11 +2049,11 @@ void TableImpl::PerfCounter::DoDumpPerfCounterLog(const std::string& log_prefix) << " cnt: " << user_mu_cnt.Clear() << " suc: " << user_mu_suc.Clear() << " fail: " << user_mu_fail.Clear(); - LOG(INFO) << log_prefix << "[user_mu_cost]" + LOG(INFO) << log_prefix << "[user_mu_cost]" << std::fixed << std::setprecision(2) << " cost_ave: " << hist_mu_cost.Average() - << " cost_50: " << hist_mu_cost.Percentile(0.5) - << " cost_90: " << hist_mu_cost.Percentile(0.9) - << " cost_99: " << hist_mu_cost.Percentile(0.99); + << " cost_50: " << hist_mu_cost.Percentile(50) + << " cost_90: " << hist_mu_cost.Percentile(90) + << " cost_99: " << hist_mu_cost.Percentile(99); hist_mu_cost.Clear(); LOG(INFO) << log_prefix << "[user_rd]" @@ -2041,39 +2061,45 @@ void TableImpl::PerfCounter::DoDumpPerfCounterLog(const std::string& log_prefix) << " suc: " << user_read_suc.Clear() << " notfound: " << user_read_notfound.Clear() << " fail: " << user_read_fail.Clear(); - LOG(INFO) << log_prefix << "[user_rd_cost]" + LOG(INFO) << log_prefix << "[user_rd_cost]" << std::fixed << std::setprecision(2) << " cost_ave: " << hist_read_cost.Average() - << " cost_50: " << hist_read_cost.Percentile(0.5) - << " cost_90: " << hist_read_cost.Percentile(0.9) - << " cost_99: " << hist_read_cost.Percentile(0.99); + << " cost_50: " << hist_read_cost.Percentile(50) + << " cost_90: " << hist_read_cost.Percentile(90) + << " cost_99: " << hist_read_cost.Percentile(99); hist_read_cost.Clear(); } void TableImpl::DelayTaskWrapper(ThreadPool::Task task, int64_t task_id) { + task(task_id); { MutexLock lock(&delay_task_id_mutex_); - if (delay_task_ids_.erase(task_id) == 0) { - // this task has been canceled - return; - } + delay_task_ids_.erase(task_id); } - task(task_id); } + int64_t TableImpl::AddDelayTask(int64_t delay_time, ThreadPool::Task task) { MutexLock lock(&delay_task_id_mutex_); ThreadPool::Task t = - boost::bind(&TableImpl::DelayTaskWrapper, this, task, _1); + std::bind(&TableImpl::DelayTaskWrapper, this, task, _1); int64_t t_id = thread_pool_->DelayTask(delay_time, t); delay_task_ids_.insert(t_id); return t_id; } + void TableImpl::ClearDelayTask() { MutexLock lock(&delay_task_id_mutex_); std::set::iterator it = delay_task_ids_.begin(); - for (; it != delay_task_ids_.end(); ++it) { - thread_pool_->CancelTask(*it); + while (it != delay_task_ids_.end()) { + int64_t task_id = *it; + // may deadlock, MUST unlock + delay_task_id_mutex_.Unlock(); + bool cancelled = thread_pool_->CancelTask(*it); + delay_task_id_mutex_.Lock(); + if (cancelled) { + delay_task_ids_.erase(task_id); + } + it = delay_task_ids_.begin(); } - delay_task_ids_.clear(); } void TableImpl::BreakRequest(int64_t task_id) { diff --git a/src/sdk/tera_easy.cc b/src/sdk/tera_easy.cc index b0f5255e0..c0758eb1d 100644 --- a/src/sdk/tera_easy.cc +++ b/src/sdk/tera_easy.cc @@ -6,8 +6,8 @@ #include "tera_easy.h" +#include #include -#include #include #include @@ -29,7 +29,7 @@ class TableImpl : public Table { : table_(table), client_(client), scanner_(NULL) { - ThreadPool::Task task = boost::bind(&TableImpl::PrintStatus, this); + ThreadPool::Task task = std::bind(&TableImpl::PrintStatus, this); thread_pool_.DelayTask(1000, task); } @@ -204,7 +204,7 @@ class TableImpl : public Table { << ", pending size " << s_pending_size_.Get() << ", success " << s_write_succ_num_.Clear() << ", fail " << s_write_fail_num_.Clear(); - ThreadPool::Task task = boost::bind(&TableImpl::PrintStatus, this); + ThreadPool::Task task = std::bind(&TableImpl::PrintStatus, this); thread_pool_.DelayTask(1000, task); } diff --git a/src/tabletnode/remote_tabletnode.cc b/src/tabletnode/remote_tabletnode.cc index 89549f081..2d95a0e5a 100644 --- a/src/tabletnode/remote_tabletnode.cc +++ b/src/tabletnode/remote_tabletnode.cc @@ -4,7 +4,7 @@ #include "tabletnode/remote_tabletnode.h" -#include +#include #include "gflags/gflags.h" #include "glog/logging.h" @@ -84,8 +84,8 @@ void RemoteTabletNode::LoadTablet(google::protobuf::RpcController* controller, uint64_t id = request->sequence_id(); LOG(INFO) << "accept RPC (LoadTablet) id: " << id << ", src: " << tera::utils::GetRemoteAddress(controller); ThreadPool::Task callback = - boost::bind(&RemoteTabletNode::DoLoadTablet, this, controller, - request, response, done); + std::bind(&RemoteTabletNode::DoLoadTablet, this, controller, + request, response, done); ctrl_thread_pool_->AddTask(callback); } @@ -96,8 +96,8 @@ void RemoteTabletNode::UnloadTablet(google::protobuf::RpcController* controller, uint64_t id = request->sequence_id(); LOG(INFO) << "accept RPC (UnloadTablet) id: " << id << ", src: " << tera::utils::GetRemoteAddress(controller); ThreadPool::Task callback = - boost::bind(&RemoteTabletNode::DoUnloadTablet, this, controller, - request, response, done); + std::bind(&RemoteTabletNode::DoUnloadTablet, this, controller, + request, response, done); ctrl_thread_pool_->AddTask(callback); } @@ -127,8 +127,8 @@ void RemoteTabletNode::ReadTablet(google::protobuf::RpcController* controller, ReadRpc* rpc = new ReadRpc(controller, request, response, done, timer, start_micros); read_rpc_schedule_->EnqueueRpc(request->tablet_name(), rpc); - read_thread_pool_->AddTask(boost::bind(&RemoteTabletNode::DoScheduleRpc, this, - read_rpc_schedule_.get())); + read_thread_pool_->AddTask(std::bind(&RemoteTabletNode::DoScheduleRpc, this, + read_rpc_schedule_.get())); } } @@ -155,8 +155,8 @@ void RemoteTabletNode::WriteTablet(google::protobuf::RpcController* controller, WriteRpcTimer* timer = new WriteRpcTimer(request, response, done, start_micros); RpcTimerList::Instance()->Push(timer); ThreadPool::Task callback = - boost::bind(&RemoteTabletNode::DoWriteTablet, this, - controller, request, response, done, timer); + std::bind(&RemoteTabletNode::DoWriteTablet, this, + controller, request, response, done, timer); write_thread_pool_->AddTask(callback); } } @@ -175,8 +175,8 @@ void RemoteTabletNode::ScanTablet(google::protobuf::RpcController* controller, scan_pending_counter.Inc(); ScanRpc* rpc = new ScanRpc(controller, request, response, done); scan_rpc_schedule_->EnqueueRpc(request->table_name(), rpc); - scan_thread_pool_->AddTask(boost::bind(&RemoteTabletNode::DoScheduleRpc, - this, scan_rpc_schedule_.get())); + scan_thread_pool_->AddTask(std::bind(&RemoteTabletNode::DoScheduleRpc, + this, scan_rpc_schedule_.get())); } } @@ -187,8 +187,8 @@ void RemoteTabletNode::GetSnapshot(google::protobuf::RpcController* controller, uint64_t id = request->sequence_id(); LOG(INFO) << "accept RPC (GetSnapshot) id: " << id << ", src: " << tera::utils::GetRemoteAddress(controller); ThreadPool::Task callback = - boost::bind(&RemoteTabletNode::DoGetSnapshot, this, controller, - request, response, done); + std::bind(&RemoteTabletNode::DoGetSnapshot, this, controller, + request, response, done); write_thread_pool_->AddPriorityTask(callback); } @@ -199,8 +199,8 @@ void RemoteTabletNode::ReleaseSnapshot(google::protobuf::RpcController* controll uint64_t id = request->sequence_id(); LOG(INFO) << "accept RPC (ReleaseSnapshot) id: " << id << ", src: " << tera::utils::GetRemoteAddress(controller); ThreadPool::Task callback = - boost::bind(&RemoteTabletNode::DoReleaseSnapshot, this, controller, - request, response, done); + std::bind(&RemoteTabletNode::DoReleaseSnapshot, this, controller, + request, response, done); write_thread_pool_->AddPriorityTask(callback); } @@ -211,8 +211,8 @@ void RemoteTabletNode::Rollback(google::protobuf::RpcController* controller, uint64_t id = request->sequence_id(); LOG(INFO) << "accept RPC (Rollback) id: " << id << ", src: " << tera::utils::GetRemoteAddress(controller); ThreadPool::Task callback = - boost::bind(&RemoteTabletNode::DoRollback, this, controller, - request, response, done); + std::bind(&RemoteTabletNode::DoRollback, this, controller, + request, response, done); write_thread_pool_->AddPriorityTask(callback); } @@ -224,8 +224,8 @@ void RemoteTabletNode::Query(google::protobuf::RpcController* controller, uint64_t id = request->sequence_id(); LOG(INFO) << "accept RPC (Query) id: " << id << ", src: " << tera::utils::GetRemoteAddress(controller); ThreadPool::Task callback = - boost::bind(&RemoteTabletNode::DoQuery, this, controller, - request, response, done); + std::bind(&RemoteTabletNode::DoQuery, this, controller, + request, response, done); ctrl_thread_pool_->AddPriorityTask(callback); } @@ -237,8 +237,8 @@ void RemoteTabletNode::CmdCtrl(google::protobuf::RpcController* controller, LOG(INFO) << "accept RPC (CmdCtrl) id: " << id << ", [" << request->command() << "] src: " << tera::utils::GetRemoteAddress(controller); ThreadPool::Task callback = - boost::bind(&RemoteTabletNode::DoCmdCtrl, this, controller, - request, response, done); + std::bind(&RemoteTabletNode::DoCmdCtrl, this, controller, + request, response, done); ctrl_thread_pool_->AddPriorityTask(callback); } @@ -249,8 +249,8 @@ void RemoteTabletNode::SplitTablet(google::protobuf::RpcController* controller, uint64_t id = request->sequence_id(); LOG(INFO) << "accept RPC (SplitTablet) id: " << id << ", src: " << tera::utils::GetRemoteAddress(controller); ThreadPool::Task callback = - boost::bind(&RemoteTabletNode::DoSplitTablet, this, controller, - request, response, done); + std::bind(&RemoteTabletNode::DoSplitTablet, this, controller, + request, response, done); ctrl_thread_pool_->AddTask(callback); } @@ -262,8 +262,8 @@ void RemoteTabletNode::CompactTablet(google::protobuf::RpcController* controller LOG(INFO) << "accept RPC (CompactTablet) id: " << id << ", src: " << tera::utils::GetRemoteAddress(controller); compact_pending_counter.Inc(); ThreadPool::Task callback = - boost::bind(&RemoteTabletNode::DoCompactTablet, this, controller, - request, response, done); + std::bind(&RemoteTabletNode::DoCompactTablet, this, controller, + request, response, done); compact_thread_pool_->AddTask(callback); } @@ -274,8 +274,8 @@ void RemoteTabletNode::Update(google::protobuf::RpcController* controller, uint64_t id = request->sequence_id(); LOG(INFO) << "accept RPC (Update) id: " << id << ", src: " << tera::utils::GetRemoteAddress(controller); ThreadPool::Task callback = - boost::bind(&RemoteTabletNode::DoUpdate, this, controller, - request, response, done); + std::bind(&RemoteTabletNode::DoUpdate, this, controller, + request, response, done); ctrl_thread_pool_->AddTask(callback); } diff --git a/src/tabletnode/tablet_manager.cc b/src/tabletnode/tablet_manager.cc index 726386771..158a6e255 100644 --- a/src/tabletnode/tablet_manager.cc +++ b/src/tabletnode/tablet_manager.cc @@ -40,7 +40,7 @@ bool TabletManager::AddTablet(const std::string& table_name, SetStatusCode(kTableExist, status); return false; } - *tablet_io = tablet_list_[tablet_range] = new io::TabletIO(key_start, key_end); + *tablet_io = tablet_list_[tablet_range] = new io::TabletIO(key_start, key_end, table_path); (*tablet_io)->AddRef(); return true; } diff --git a/src/tabletnode/tabletnode_entry.cc b/src/tabletnode/tabletnode_entry.cc index 8580d278d..37ac8409f 100644 --- a/src/tabletnode/tabletnode_entry.cc +++ b/src/tabletnode/tabletnode_entry.cc @@ -34,6 +34,14 @@ DECLARE_int32(tera_tabletnode_hang_detect_threshold); DECLARE_int32(tera_tabletnode_rpc_server_max_inflow); DECLARE_int32(tera_tabletnode_rpc_server_max_outflow); +std::string GetTeraEntryName() { + return "tabletnode"; +} + +tera::TeraEntry* GetTeraEntry() { + return new tera::tabletnode::TabletNodeEntry(); +} + namespace tera { namespace tabletnode { @@ -74,11 +82,9 @@ bool TabletNodeEntry::StartServer() { } void TabletNodeEntry::ShutdownServer() { + tabletnode_impl_->Exit(); LOG(INFO) << "shut down server"; - // StopServer要保证调用后, 不会再调用serveice的任何方法. rpc_server_->Stop(); - tabletnode_impl_->Exit(); - tabletnode_impl_.reset(); LOG(INFO) << "TabletNodeEntry stop done!"; } diff --git a/src/tabletnode/tabletnode_impl.cc b/src/tabletnode/tabletnode_impl.cc index b66fd9521..0ed32fd14 100644 --- a/src/tabletnode/tabletnode_impl.cc +++ b/src/tabletnode/tabletnode_impl.cc @@ -4,16 +4,17 @@ #include "tabletnode/tabletnode_impl.h" +#include #include #include -#include #include #include #include #include "db/filename.h" #include "db/table_cache.h" +#include "common/thread.h" #include "io/io_utils.h" #include "io/utils_leveldb.h" #include "leveldb/cache.h" @@ -87,6 +88,8 @@ DECLARE_int64(tera_tabletnode_tcm_cache_size); DECLARE_string(flagfile); +using namespace std::placeholders; + extern tera::Counter range_error_counter; extern tera::Counter rand_read_delay; @@ -163,7 +166,7 @@ bool TabletNodeImpl::Init() { } SetTabletNodeStatus(kIsIniting); - thread_pool_->AddTask(boost::bind(&TabletNodeZkAdapterBase::Init, zk_adapter_.get())); + thread_pool_->AddTask(std::bind(&TabletNodeZkAdapterBase::Init, zk_adapter_.get())); return true; } @@ -197,23 +200,49 @@ void TabletNodeImpl::InitCacheSystem() { } bool TabletNodeImpl::Exit() { - thread_pool_.reset(); + std::vector tablet_ios; + tablet_manager_->GetAllTablets(&tablet_ios); - std::vector tablet_meta_list; - tablet_manager_->GetAllTabletMeta(&tablet_meta_list); - std::vector::iterator it = tablet_meta_list.begin(); - for (; it != tablet_meta_list.end(); ++it) { - TabletMeta*& tablet_meta = *it; - StatusCode status = kTabletNodeOk; - bool ret = UnloadTablet(tablet_meta->table_name(), - tablet_meta->key_range().key_start(), - tablet_meta->key_range().key_end(), &status); - LOG(INFO) << "unload tablet [" << tablet_meta->path() << "] return " << ret; - delete tablet_meta; + std::vector unload_threads; + unload_threads.resize(tablet_ios.size()); + + Counter worker_count; + worker_count.Set(tablet_ios.size()); + + for (uint32_t i = 0; i < tablet_ios.size(); ++i) { + io::TabletIO* tablet_io = tablet_ios[i]; + common::Thread& thread = unload_threads[i]; + thread.Start(std::bind(&TabletNodeImpl::UnloadTabletProc, + this, tablet_io, &worker_count)); + } + int64_t print_ms_ = get_millis(); + int64_t left = 0; + while ((left = worker_count.Get()) > 0) { + if (get_millis() - print_ms_ > 1000) { + LOG(INFO) << "[Exit] " << left << " tablets are still unloading ..."; + print_ms_ = get_millis(); + } + ThisThread::Sleep(100); + } + for (uint32_t i = 0; i < tablet_ios.size(); ++i) { + unload_threads[i].Join(); } return true; } +void TabletNodeImpl::UnloadTabletProc(io::TabletIO* tablet_io, Counter* worker_count) { + LOG(INFO) << "begin to unload tablet: " << *tablet_io; + StatusCode status; + if (!tablet_io->Unload(&status)) { + LOG(ERROR) << "fail to unload tablet: " << *tablet_io + << ", status: " << StatusCodeToString(status); + } else { + LOG(INFO) << "unload tablet success: " << *tablet_io; + } + tablet_io->DecRef(); + worker_count->Dec(); +} + void TabletNodeImpl::LoadTablet(const LoadTabletRequest* request, LoadTabletResponse* response, google::protobuf::Closure* done) { @@ -488,34 +517,6 @@ void TabletNodeImpl::WriteTablet(const WriteTabletRequest* request, return; } - // check arguments - for (int32_t i = 0; i < row_num; i++) { - const RowMutationSequence& mu_seq = request->row_list(i); - if (mu_seq.row_key().size() >= 64 * 1024) { // 64KB - response->set_status(kTableNotSupport); - done->Run(); - if (NULL != timer) { - RpcTimerList::Instance()->Erase(timer); - delete timer; - } - return; - } - int32_t mu_num = mu_seq.mutation_sequence_size(); - for (int32_t k = 0; k < mu_num; k++) { - const Mutation& mu = mu_seq.mutation_sequence(k); - if ((mu.qualifier().size() >= 64 * 1024) // 64KB - || (mu.value().size() >= 32 * 1024 * 1024)) { // 32MB - response->set_status(kTableNotSupport); - done->Run(); - if (NULL != timer) { - RpcTimerList::Instance()->Erase(timer); - delete timer; - } - return; - } - } - } - Counter* row_done_counter = new Counter; for (int32_t i = 0; i < row_num; i++) { io::TabletIO* tablet_io = tablet_manager_->GetTablet( @@ -555,8 +556,8 @@ void TabletNodeImpl::WriteTablet(const WriteTabletRequest* request, } else if (!tablet_io->Write(&tablet_task->row_mutation_vec, &tablet_task->row_status_vec, request->is_instant(), - boost::bind(&TabletNodeImpl::WriteTabletCallback, this, - tablet_task, _1, _2), + std::bind(&TabletNodeImpl::WriteTabletCallback, this, + tablet_task, _1, _2), &status)) { tablet_io->DecRef(); WriteTabletFail(tablet_task, status); @@ -1224,7 +1225,7 @@ void TabletNodeImpl::ReleaseMallocCache() { void TabletNodeImpl::EnableReleaseMallocCacheTimer(int32_t expand_factor) { assert(release_cache_timer_id_ == kInvalidTimerId); ThreadPool::Task task = - boost::bind(&TabletNodeImpl::ReleaseMallocCache, this); + std::bind(&TabletNodeImpl::ReleaseMallocCache, this); int64_t timeout_period = expand_factor * 1000LL * FLAGS_tera_tabletnode_tcm_cache_release_period; release_cache_timer_id_ = thread_pool_->DelayTask(timeout_period, task); diff --git a/src/tabletnode/tabletnode_impl.h b/src/tabletnode/tabletnode_impl.h index d0b5e722d..4b6fa7808 100644 --- a/src/tabletnode/tabletnode_impl.h +++ b/src/tabletnode/tabletnode_impl.h @@ -180,6 +180,9 @@ class TabletNodeImpl { const std::set active_tablets); bool ApplySchema(const UpdateRequest* request); + + void UnloadTabletProc(io::TabletIO* tablet_io, Counter* worker_count); + private: mutable Mutex status_mutex_; TabletNodeStatus status_; diff --git a/src/tabletnode/tabletnode_sysinfo.cc b/src/tabletnode/tabletnode_sysinfo.cc index c307c9712..4c1575b97 100644 --- a/src/tabletnode/tabletnode_sysinfo.cc +++ b/src/tabletnode/tabletnode_sysinfo.cc @@ -85,7 +85,7 @@ extern tera::Counter ssd_write_size_counter; } tera::Counter rand_read_delay; -tera::Counter row_read_delay; +extern tera::Counter row_read_delay; tera::Counter range_error_counter; tera::Counter read_pending_counter; tera::Counter write_pending_counter; @@ -295,9 +295,9 @@ static long long ProcessCpuTick() { // return number of cpu(cores) static int GetCpuCount() { -#ifdef _SC_NPROCESSORS_ONLN +#if defined(_SC_NPROCESSORS_ONLN) return sysconf(_SC_NPROCESSORS_ONLN); -#endif +#else FILE *fp = fopen("/proc/stat", "r"); if (fp == NULL) { return 1; @@ -323,6 +323,7 @@ static int GetCpuCount() { fclose(fp); free(aline); return i-1 > 0 ? i-1 : 1; +#endif } // irix_on == 1 --> irix mode on diff --git a/src/tera_c.cc b/src/tera_c.cc index ccc703db2..fd3fb2994 100644 --- a/src/tera_c.cc +++ b/src/tera_c.cc @@ -359,7 +359,7 @@ bool tera_table_is_get_finished(tera_table_t* table) { return table->rep->IsGetFinished(); } -void tera_row_mutation_put_kv(tera_row_mutation_t* mu, +void tera_row_mutation_put_kv(tera_row_mutation_t* mu, const char* val, uint64_t vallen, int32_t ttl) { mu->rep->Put(std::string(val, vallen), ttl); } @@ -376,6 +376,13 @@ void tera_row_mutation_put(tera_row_mutation_t* mu, const char* cf, mu->rep->Put(cf, std::string(qu, qulen), std::string(val, vallen)); } +void tera_row_mutation_put_with_timestamp(tera_row_mutation_t* mu, const char* cf, + const char* qu, uint64_t qulen, + int64_t timestamp, + const char* val, uint64_t vallen) { + mu->rep->Put(cf, std::string(qu, qulen), std::string(val, vallen), (int64_t)timestamp); +} + void tera_row_mutation_delete_column(tera_row_mutation_t* mu, const char* cf, const char* qu, uint64_t qulen) { mu->rep->DeleteColumn(cf, std::string(qu, qulen)); diff --git a/src/tera_flags.cc b/src/tera_flags.cc index beeceddc9..d5c51031a 100644 --- a/src/tera_flags.cc +++ b/src/tera_flags.cc @@ -67,6 +67,7 @@ DEFINE_bool(tera_use_flash_for_memenv, true, "Use flashenv for memery lg"); DEFINE_string(tera_leveldb_compact_strategy, "default", "the default strategy to drive consum compaction, should be [default|LG|dummy]"); DEFINE_bool(tera_leveldb_verify_checksums, true, "enable verify data read from storage against checksums"); DEFINE_bool(tera_leveldb_ignore_corruption_in_compaction, false, "skip corruption blocks of sst file in compaction"); +DEFINE_bool(tera_leveldb_use_file_lock, false, "hold file lock during loading leveldb"); DEFINE_int32(tera_rpc_client_max_inflow, -1, "the max input flow (in MB/s) for rpc-client, -1 means no limit"); DEFINE_int32(tera_rpc_client_max_outflow, -1, "the max input flow (in MB/s) for rpc-client, -1 means no limit"); @@ -178,6 +179,8 @@ DEFINE_int32(tera_tabletnode_scan_pack_max_size, 10240, "the max size(KB) of the DEFINE_int32(tera_asyncwriter_pending_limit, 10000, "the max pending data size (KB) in async writer"); DEFINE_bool(tera_enable_level0_limit, true, "enable level0 limit"); DEFINE_int32(tera_tablet_level0_file_limit, 20000, "the max level0 file num before write busy"); +DEFINE_int32(tera_tablet_ttl_percentage, 99, "percentage of ttl tag in sst file begin to trigger compaction"); +DEFINE_int32(tera_tablet_del_percentage, 20, "percentage of del tag in sst file begin to trigger compaction"); DEFINE_int32(tera_asyncwriter_sync_interval, 100, "the interval (in ms) to sync write buffer to disk"); DEFINE_int32(tera_asyncwriter_sync_size_threshold, 1024, "force sync per X KB"); DEFINE_int32(tera_asyncwriter_batch_size, 1024, "write batch to leveldb per X KB"); @@ -258,10 +261,13 @@ DEFINE_int32(tera_sdk_cookie_update_interval, 600, "the interval of cookie updat DEFINE_bool(tera_sdk_perf_counter_enabled, true, "enable performance counter log"); DEFINE_int64(tera_sdk_perf_counter_log_interval, 60, "the interval period (in sec) of performance counter log dumping"); -DEFINE_bool(tera_sdk_batch_scan_enabled, false, "enable batch scan"); +DEFINE_bool(tera_sdk_batch_scan_enabled, true, "enable batch scan"); DEFINE_int64(tera_sdk_scan_buffer_size, 65536, "default buffer limit for scan"); DEFINE_int64(tera_sdk_scan_number_limit, 1000000000, "default number limit for scan"); -DEFINE_int32(tera_sdk_max_batch_scan_req, 10, "the max number of concurrent scan req"); +DEFINE_int32(tera_sdk_max_batch_scan_req, 30, "the max number of concurrent scan req"); +DEFINE_int32(tera_sdk_batch_scan_max_retry, 60, "the max retry times for session scan"); +DEFINE_int64(tera_sdk_scan_timeout, 30000, "scan timeout"); +DEFINE_int64(batch_scan_delay_retry_in_us, 1000000, "timewait in us before retry batch scan"); DEFINE_string(tera_ins_addr_list, "", "the ins cluster addr. e.g. abc.com:1234,abb.com:1234"); DEFINE_string(tera_ins_root_path, "", "root path on ins. e.g /ps/sandbox"); diff --git a/src/tera_main.cc b/src/tera_main.cc index 5088f3036..2331436b9 100644 --- a/src/tera_main.cc +++ b/src/tera_main.cc @@ -8,43 +8,29 @@ #include #include "common/base/scoped_ptr.h" -#include "master/master_entry.h" -#include "tabletnode/tabletnode_entry.h" #include "tera_entry.h" #include "utils/utils_cmd.h" #include "version.h" -DECLARE_string(tera_role); DECLARE_string(tera_log_prefix); DECLARE_string(tera_local_addr); +extern std::string GetTeraEntryName(); +extern tera::TeraEntry* GetTeraEntry(); + volatile sig_atomic_t g_quit = 0; static void SignalIntHandler(int sig) { g_quit = 1; } -tera::TeraEntry* SwitchTeraEntry() { - const std::string& server_name = FLAGS_tera_role; - - if (server_name == "master") { - return new tera::master::MasterEntry(); - } else if (server_name == "tabletnode") { - return new tera::tabletnode::TabletNodeEntry(); - } - LOG(ERROR) << "FLAGS_tera_role should be one of (" - << "master | tabletnode" - << "), not : " << FLAGS_tera_role; - return NULL; -} - int main(int argc, char** argv) { ::google::ParseCommandLineFlags(&argc, &argv, true); ::google::InitGoogleLogging(argv[0]); if (!FLAGS_tera_log_prefix.empty()) { tera::utils::SetupLog(FLAGS_tera_log_prefix); } else { - tera::utils::SetupLog(FLAGS_tera_role); + tera::utils::SetupLog(GetTeraEntryName()); } if (argc > 1) { @@ -58,7 +44,7 @@ int main(int argc, char** argv) { signal(SIGINT, SignalIntHandler); signal(SIGTERM, SignalIntHandler); - scoped_ptr entry(SwitchTeraEntry()); + scoped_ptr entry(GetTeraEntry()); if (entry.get() == NULL) { return -1; } @@ -72,10 +58,6 @@ int main(int argc, char** argv) { LOG(ERROR) << "Server run error ,and then exit now "; break; } - // jvm会抢注这个, 时刻准备着抢回来 - signal(SIGINT, SignalIntHandler); - signal(SIGTERM, SignalIntHandler); - // signal(SIGSEGV, SIG_DFL); // 如果这个被改回SIG_DFL, jvm会core, 不知道为啥... } if (g_quit) { LOG(INFO) << "received interrupt signal from user, will stop"; diff --git a/src/tera_main_wrapper.cc b/src/tera_main_wrapper.cc new file mode 100644 index 000000000..8ce87b54a --- /dev/null +++ b/src/tera_main_wrapper.cc @@ -0,0 +1,48 @@ +// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include +#include +#include +#include +#include + +#include + +#include "version.h" + +DECLARE_string(tera_role); + +int main(int argc, char** argv) { + if (argc > 1 && strcmp(argv[1], "version") == 0) { + PrintSystemVersion(); + return 0; + } + + google::AllowCommandLineReparsing(); + google::ParseCommandLineFlags(&argc, &argv, false); + + const char* program = NULL; + if (FLAGS_tera_role == "master") { + program = "./tera_master"; + } else if (FLAGS_tera_role == "tabletnode") { + program = "./tabletserver"; + } else { + std::cerr << "FLAGS_tera_role should be one of (master | tabletnode)" << std::endl; + return -1; + } + + std::vector myargv; + myargv.resize(argc + 1); + myargv[0] = (char*)"tera_main"; + for (int i = 1; i < argc; i++) { + myargv[i] = argv[i]; + } + myargv[argc] = NULL; + if (-1 == execv(program, &myargv[0])) { + std::cerr << "execv " << program << " error: " << errno << std::endl; + return -1; + } + return 0; +} diff --git a/src/tera_test_main.cc b/src/tera_test_main.cc index 60ea163e4..f7fb788c7 100644 --- a/src/tera_test_main.cc +++ b/src/tera_test_main.cc @@ -3,16 +3,15 @@ // found in the LICENSE file. // -#include -#include -#include - #include +#include #include #include #include +#include +#include +#include -#include #include #include @@ -68,7 +67,7 @@ class KeySet { // gen row keys while (keys_.size() < key_num) { std::stringstream ss; - ss << (uint64_t)(rand() * rand()) << "abcdefghijklmnopqrstuvwxyz"; + ss << ((uint64_t)rand()) * ((uint64_t)rand()) << "abcdefghijklmnopqrstuvwxyz"; std::string key = ss.str(); keys_[key] = 0; keys_stat_[key] = false; @@ -295,7 +294,7 @@ int32_t SharedTableImplTest(int32_t argc, char** argv, ErrorCode* err) { ThreadPool thread_pool(100); for (int i = 0; i < 1000000; ++i) { ThreadPool::Task task = - boost::bind(&SharedTableImplTask, client, err); + std::bind(&SharedTableImplTask, client, err); thread_pool.AddTask(task); } while (thread_pool.PendingNum() > 0) { diff --git a/src/teracli_main.cc b/src/teracli_main.cc index 71769bb09..b53228d66 100644 --- a/src/teracli_main.cc +++ b/src/teracli_main.cc @@ -13,9 +13,10 @@ #include #include #include +#include +#include #include -#include #include #include @@ -52,7 +53,6 @@ DECLARE_int64(tera_sdk_status_timeout); DEFINE_int32(tera_client_batch_put_num, 1000, "num of each batch in batch put mode"); DEFINE_int32(tera_client_scan_package_size, 1024, "the package size (in KB) of each scan request"); -DEFINE_int64(scan_pack_interval, 5000, "scan timeout"); DEFINE_int64(snapshot, 0, "read | scan snapshot"); DEFINE_string(rollback_switch, "close", "Pandora's box, do not open"); DEFINE_string(rollback_name, "", "rollback operation's name"); @@ -62,6 +62,7 @@ DEFINE_int32(concurrency, 1, "concurrency for compact table."); DEFINE_int64(timestamp, -1, "timestamp."); DEFINE_string(tablets_file, "", "tablet set file"); +DEFINE_bool(readable, true, "readable input"); DEFINE_bool(printable, true, "printable output"); DEFINE_bool(print_data, true, "is print data when scan"); DEFINE_bool(rowkey_count, false, "is print rowkey count when scan"); @@ -84,13 +85,19 @@ tera::TPrinter::PrintOpt g_printer_opt; using namespace tera; -typedef boost::shared_ptr
TablePtr; -typedef boost::shared_ptr TableImplPtr; +typedef std::shared_ptr
TablePtr; +typedef std::shared_ptr TableImplPtr; +typedef std::map CommandTable; /// global variables of single-row-txn used in interactive mode tera::Transaction* g_row_txn = NULL; Table* g_row_txn_table = NULL; +static CommandTable& GetCommandTable(){ + static CommandTable command_table; + return command_table; +} + const char* builtin_cmd_list[] = { "create", "create [] \n\ @@ -301,6 +308,10 @@ static void PrintCmdHelpInfo(const char* msg) { } } +static void PrintCmdHelpInfo(const std::string& msg) { + PrintCmdHelpInfo(msg.c_str()); +} + static void PrintAllCmd() { std::cout << "there is cmd list:" << std::endl; int count = sizeof(builtin_cmd_list)/sizeof(char*); @@ -350,7 +361,7 @@ static void PrintUnknownCmdHelpInfo(const char* msg) { PrintAllCmd(); } -int32_t CreateOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { +int32_t CreateOp(Client* client, int32_t argc, std::string* argv, ErrorCode* err) { if (argc < 3) { PrintCmdHelpInfo(argv[1]); return -1; @@ -381,7 +392,7 @@ int32_t CreateOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { return 0; } -int32_t CreateByFileOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { +int32_t CreateByFileOp(Client* client, int32_t argc, std::string* argv, ErrorCode* err) { if (argc < 3) { PrintCmdHelpInfo(argv[1]); return -1; @@ -412,7 +423,7 @@ int32_t CreateByFileOp(Client* client, int32_t argc, char** argv, ErrorCode* err return 0; } -int32_t UpdateCheckOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { +int32_t UpdateCheckOp(Client* client, int32_t argc, std::string* argv, ErrorCode* err) { if (argc != 3) { PrintCmdHelpInfo(argv[1]); return -1; @@ -426,7 +437,7 @@ int32_t UpdateCheckOp(Client* client, int32_t argc, char** argv, ErrorCode* err) return 0; } -int32_t UpdateOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { +int32_t UpdateOp(Client* client, int32_t argc, std::string* argv, ErrorCode* err) { if (argc != 3) { PrintCmdHelpInfo(argv[1]); return -1; @@ -459,7 +470,7 @@ int32_t UpdateOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { return 0; } -int32_t DropOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { +int32_t DropOp(Client* client, int32_t argc, std::string* argv, ErrorCode* err) { if (argc < 3) { PrintCmdHelpInfo(argv[1]); return -1; @@ -473,7 +484,7 @@ int32_t DropOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { return 0; } -int32_t EnableOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { +int32_t EnableOp(Client* client, int32_t argc, std::string* argv, ErrorCode* err) { if (argc < 3) { PrintCmdHelpInfo(argv[1]); return -1; @@ -487,7 +498,7 @@ int32_t EnableOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { return 0; } -int32_t DisableOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { +int32_t DisableOp(Client* client, int32_t argc, std::string* argv, ErrorCode* err) { if (argc < 3) { PrintCmdHelpInfo(argv[1]); return -1; @@ -549,7 +560,7 @@ void ParseCfQualifier(const std::string& input, std::string* columnfamily, } } -int32_t PutInt64Op(Client* client, int32_t argc, char** argv, ErrorCode* err) { +int32_t PutInt64Op(Client* client, int32_t argc, std::string* argv, ErrorCode* err) { if (argc != 5 && argc != 6) { LOG(ERROR) << "args number error: " << argc << ", need 5 | 6."; PrintCmdHelpInfo(argv[1]); @@ -586,7 +597,7 @@ int32_t PutInt64Op(Client* client, int32_t argc, char** argv, ErrorCode* err) { return 0; } -int32_t PutCounterOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { +int32_t PutCounterOp(Client* client, int32_t argc, std::string* argv, ErrorCode* err) { if (argc != 5 && argc != 6) { LOG(ERROR) << "args number error: " << argc << ", need 5 | 6."; PrintCmdHelpInfo(argv[1]); @@ -625,7 +636,7 @@ int32_t PutCounterOp(Client* client, int32_t argc, char** argv, ErrorCode* err) return 0; } -int32_t PutOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { +int32_t PutOp(Client* client, int32_t argc, std::string* argv, ErrorCode* err) { if (argc != 5 && argc != 6) { LOG(ERROR) << "args number error: " << argc << ", need 5 | 6."; PrintCmdHelpInfo(argv[1]); @@ -669,7 +680,7 @@ int32_t PutOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { return 0; } -int32_t PutTTLOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { +int32_t PutTTLOp(Client* client, int32_t argc, std::string* argv, ErrorCode* err) { if (argc != 6 && argc != 7) { LOG(ERROR) << "args number error: " << argc << ", need 5 | 6."; PrintCmdHelpInfo(argv[1]); @@ -691,11 +702,11 @@ int32_t PutTTLOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { if (argc == 6) { // use table as kv value = argv[4]; - ttl = atoi(argv[5]); + ttl = atoi(argv[5].c_str()); } else if (argc == 7) { ParseCfQualifier(argv[4], &columnfamily, &qualifier); value = argv[5]; - ttl = atoi(argv[6]); + ttl = atoi(argv[6].c_str()); } if (!table->Put(rowkey, columnfamily, qualifier, value, ttl, err)) { LOG(ERROR) << "fail to put record to table: " << tablename; @@ -704,7 +715,7 @@ int32_t PutTTLOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { return 0; } -int32_t AppendOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { +int32_t AppendOp(Client* client, int32_t argc, std::string* argv, ErrorCode* err) { if (argc != 5 && argc != 6) { LOG(ERROR) << "args number error: " << argc << ", need 5 | 6."; PrintCmdHelpInfo(argv[1]); @@ -736,7 +747,7 @@ int32_t AppendOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { return 0; } -int32_t PutIfAbsentOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { +int32_t PutIfAbsentOp(Client* client, int32_t argc, std::string* argv, ErrorCode* err) { if (argc != 5 && argc != 6) { LOG(ERROR) << "args number error: " << argc << ", need 5 | 6."; PrintCmdHelpInfo(argv[1]); @@ -768,7 +779,7 @@ int32_t PutIfAbsentOp(Client* client, int32_t argc, char** argv, ErrorCode* err) return 0; } -int32_t AddOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { +int32_t AddOp(Client* client, int32_t argc, std::string* argv, ErrorCode* err) { if (argc != 5 && argc != 6) { LOG(ERROR)<< "args number error: " << argc << ", need 5 | 6."; PrintCmdHelpInfo(argv[1]); @@ -805,7 +816,7 @@ int32_t AddOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { return 0; } -int32_t AddInt64Op(Client* client, int32_t argc, char** argv, ErrorCode* err) { +int32_t AddInt64Op(Client* client, int32_t argc, std::string* argv, ErrorCode* err) { if (argc != 5 && argc != 6) { LOG(ERROR)<< "args number error: " << argc << ", need 5 | 6."; PrintCmdHelpInfo(argv[1]); @@ -842,7 +853,7 @@ int32_t AddInt64Op(Client* client, int32_t argc, char** argv, ErrorCode* err) { return 0; } -int32_t GetInt64Op(Client* client, int32_t argc, char** argv, ErrorCode* err) { +int32_t GetInt64Op(Client* client, int32_t argc, std::string* argv, ErrorCode* err) { if (argc != 4 && argc != 5) { LOG(ERROR) << "args number error: " << argc << ", need 5 | 6."; PrintCmdHelpInfo(argv[1]); @@ -883,7 +894,7 @@ std::string PrintableFormatter(const std::string& value) { } } -int32_t GetOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { +int32_t GetOp(Client* client, int32_t argc, std::string* argv, ErrorCode* err) { if (argc != 4 && argc != 5) { LOG(ERROR) << "args number error: " << argc << ", need 5 | 6."; PrintCmdHelpInfo(argv[1]); @@ -933,7 +944,7 @@ int32_t GetOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { return 0; } -int32_t GetCounterOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { +int32_t GetCounterOp(Client* client, int32_t argc, std::string* argv, ErrorCode* err) { if (argc != 4 && argc != 5) { LOG(ERROR) << "args number error: " << argc << ", need 5 | 6."; PrintCmdHelpInfo(argv[1]); @@ -973,7 +984,7 @@ int32_t GetCounterOp(Client* client, int32_t argc, char** argv, ErrorCode* err) } -int32_t DeleteOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { +int32_t DeleteOp(Client* client, int32_t argc, std::string* argv, ErrorCode* err) { if (argc != 4 && argc != 5) { PrintCmdHelpInfo("delete"); return -1; @@ -1040,7 +1051,6 @@ int32_t DeleteOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { int32_t ScanRange(TablePtr& table, ScanDescriptor& desc, ErrorCode* err) { desc.SetBufferSize(FLAGS_tera_client_scan_package_size << 10); desc.SetAsync(FLAGS_tera_sdk_batch_scan_enabled); - desc.SetPackInterval(FLAGS_scan_pack_interval); desc.SetSnapshot(FLAGS_snapshot); ResultStream* result_stream; @@ -1097,7 +1107,7 @@ int32_t ScanRange(TablePtr& table, ScanDescriptor& desc, ErrorCode* err) { return 0; } -int32_t ScanOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { +int32_t ScanOp(Client* client, int32_t argc, std::string* argv, ErrorCode* err) { if (argc != 5 && argc != 6) { PrintCmdHelpInfo("scan"); return -1; @@ -1238,8 +1248,8 @@ int32_t ShowTabletList(const TabletMetaList& tablet_list, bool is_server_addr, b uint64_t size = meta.size(); row.push_back(BytesNumberToString(size)); - row.push_back(DebugString(meta.key_range().key_start()).substr(0, 20)); - row.push_back(DebugString(meta.key_range().key_end()).substr(0, 20)); + row.push_back(DebugString(meta.key_range().key_start())); + row.push_back(DebugString(meta.key_range().key_end())); printer.AddRow(row); } } @@ -1644,7 +1654,7 @@ int32_t ShowTabletNodesInfo(Client* client, bool is_x, ErrorCode* err) { return 0; } -int32_t ShowTabletNodesOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { +int32_t ShowTabletNodesOp(Client* client, int32_t argc, std::string* argv, ErrorCode* err) { if (argc < 2) { LOG(ERROR) << "args number error: " << argc << ", need >2."; PrintCmdHelpInfo(argv[1]); @@ -1662,7 +1672,7 @@ int32_t ShowTabletNodesOp(Client* client, int32_t argc, char** argv, ErrorCode* return ret_val; } -int32_t ShowOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { +int32_t ShowOp(Client* client, int32_t argc, std::string* argv, ErrorCode* err) { if (argc < 2) { LOG(ERROR) << "args number error: " << argc << ", need >2."; PrintCmdHelpInfo(argv[1]); @@ -1682,7 +1692,7 @@ int32_t ShowOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { return ret_val; } -int32_t ShowSchemaOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { +int32_t ShowSchemaOp(Client* client, int32_t argc, std::string* argv, ErrorCode* err) { if (argc < 3) { PrintCmdHelpInfo("showschema"); return -1; @@ -1725,7 +1735,7 @@ void BatchPutCallBack(RowMutation* mutation) { delete mutation; } -int32_t BatchPutOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { +int32_t BatchPutOp(Client* client, int32_t argc, std::string* argv, ErrorCode* err) { if (argc != 4) { LOG(ERROR) << "args number error: " << argc << ", need 4."; PrintCmdHelpInfo(argv[1]); @@ -1753,10 +1763,18 @@ int32_t BatchPutOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { LOG(ERROR) << "input file format error, skip it: " << buf; continue; } - std::string& rowkey = input_v[0]; + std::string rowkey = input_v[0]; + if (FLAGS_readable && !ParseDebugString(input_v[0], &rowkey)) { + LOG(ERROR) << "input file format error, skip it: " << buf; + continue; + } std::string family; std::string qualifier; - std::string& value = input_v[input_v.size() - 1]; + std::string value = input_v[input_v.size() - 1]; + if (FLAGS_readable && !ParseDebugString(input_v[input_v.size() - 1], &value)) { + LOG(ERROR) << "input file format error, skip it: " << buf; + continue; + } RowMutation* mutation = table->NewRowMutation(rowkey); if (input_v.size() == 2) { // for kv mode @@ -1779,7 +1797,7 @@ int32_t BatchPutOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { return 0; } -int32_t BatchPutInt64Op(Client* client, int32_t argc, char** argv, ErrorCode* err) { +int32_t BatchPutInt64Op(Client* client, int32_t argc, std::string* argv, ErrorCode* err) { if (argc != 4) { LOG(ERROR) << "args number error: " << argc << ", need 4."; PrintCmdHelpInfo(argv[1]); @@ -1807,10 +1825,18 @@ int32_t BatchPutInt64Op(Client* client, int32_t argc, char** argv, ErrorCode* er LOG(ERROR) << "input file format error, skip it: " << buf; continue; } - std::string& rowkey = input_v[0]; + std::string rowkey = input_v[0]; + if (FLAGS_readable && !ParseDebugString(input_v[0], &rowkey)) { + LOG(ERROR) << "input file format error, skip it: " << buf; + continue; + } std::string family; std::string qualifier; - std::string& value = input_v[input_v.size() - 1]; + std::string value = input_v[input_v.size() - 1]; + if (FLAGS_readable && !ParseDebugString(input_v[input_v.size() - 1], &value)) { + LOG(ERROR) << "input file format error, skip it: " << buf; + continue; + } RowMutation* mutation = table->NewRowMutation(rowkey); int64_t value_int; if (!StringToNumber(value.c_str(), &value_int)) { @@ -1865,7 +1891,7 @@ void BatchGetCallBack(RowReader* reader) { delete reader; } -int32_t BatchGetOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { +int32_t BatchGetOp(Client* client, int32_t argc, std::string* argv, ErrorCode* err) { if (argc != 4 && argc != 5) { LOG(ERROR) << "args number error: " << argc << ", need 4 | 5."; PrintCmdHelpInfo(argv[1]); @@ -1899,7 +1925,11 @@ int32_t BatchGetOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { LOG(ERROR) << "input file format error: " << buf; continue; } - std::string& rowkey = input_v[0]; + std::string rowkey = input_v[0]; + if (FLAGS_readable && !ParseDebugString(input_v[0], &rowkey)) { + LOG(ERROR) << "input file format error, skip it: " << buf; + continue; + } RowReader* reader = table->NewRowReader(rowkey); for (size_t i = 1; i < input_v.size(); ++i) { std::string& cfqu = input_v[i]; @@ -1955,7 +1985,7 @@ void BatchGetInt64CallBack(RowReader* reader) { delete reader; } -int32_t BatchGetInt64Op(Client* client, int32_t argc, char** argv, ErrorCode* err) { +int32_t BatchGetInt64Op(Client* client, int32_t argc, std::string* argv, ErrorCode* err) { if (argc != 4 && argc != 5) { LOG(ERROR) << "args number error: " << argc << ", need 4 | 5."; PrintCmdHelpInfo(argv[1]); @@ -1989,7 +2019,11 @@ int32_t BatchGetInt64Op(Client* client, int32_t argc, char** argv, ErrorCode* er LOG(ERROR) << "input file format error: " << buf; continue; } - std::string& rowkey = input_v[0]; + std::string rowkey = input_v[0]; + if (FLAGS_readable && !ParseDebugString(input_v[0], &rowkey)) { + LOG(ERROR) << "input file format error, skip it: " << buf; + continue; + } if (input_v.size() == 1) { // only rowkey explicit, scan all records out ScanDescriptor desc(rowkey); @@ -2070,7 +2104,7 @@ int32_t GetRandomNumKey(int32_t key_size,std::string *p_key){ return 0; } -int32_t SnapshotOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { +int32_t SnapshotOp(Client* client, int32_t argc, std::string* argv, ErrorCode* err) { if (argc < 4) { PrintCmdHelpInfo(argv[1]); return -1; @@ -2078,19 +2112,19 @@ int32_t SnapshotOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { std::string tablename = argv[2]; uint64_t snapshot = 0; - if (argc == 5 && strcmp(argv[3], "del") == 0) { + if (argc == 5 && strcmp(argv[3].c_str(), "del") == 0) { if (!client->DelSnapshot(tablename, FLAGS_snapshot, err)) { LOG(ERROR) << "fail to del snapshot: " << FLAGS_snapshot << " ," << err->ToString(); return -1; } std::cout << "Del snapshot " << snapshot << std::endl; - } else if (strcmp(argv[3], "create") == 0) { + } else if (strcmp(argv[3].c_str(), "create") == 0) { if (!client->GetSnapshot(tablename, &snapshot, err)) { LOG(ERROR) << "fail to get snapshot: " << err->ToString(); return -1; } std::cout << "new snapshot: " << snapshot << std::endl; - } else if (FLAGS_rollback_switch == "open" && strcmp(argv[3], "rollback") == 0) { + } else if (FLAGS_rollback_switch == "open" && strcmp(argv[3].c_str(), "rollback") == 0) { if (FLAGS_snapshot == 0) { std::cerr << "missing or invalid --snapshot option" << std::endl; return -1; @@ -2110,7 +2144,7 @@ int32_t SnapshotOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { return 0; } -int32_t SafeModeOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { +int32_t SafeModeOp(Client* client, int32_t argc, std::string* argv, ErrorCode* err) { if (argc < 3) { PrintCmdHelpInfo(argv[1]); return -1; @@ -2142,7 +2176,7 @@ int32_t SafeModeOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { return 0; } -int32_t CookieOp(int32_t argc, char** argv) { +int32_t CookieOp(Client*, int32_t argc, std::string* argv, ErrorCode*) { std::string command; if (argc == 4) { command = argv[2]; @@ -2160,7 +2194,7 @@ int32_t CookieOp(int32_t argc, char** argv) { } // e.g. ./teracli kick : -int32_t KickTabletServerOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { +int32_t KickTabletServerOp(Client* client, int32_t argc, std::string* argv, ErrorCode* err) { if ((argc != 3)) { PrintCmdHelpInfo(argv[1]); return -1; @@ -2176,7 +2210,7 @@ int32_t KickTabletServerOp(Client* client, int32_t argc, char** argv, ErrorCode* return 0; } -int32_t ReloadConfigOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { +int32_t ReloadConfigOp(Client* client, int32_t argc, std::string* argv, ErrorCode* err) { if ((argc != 4) || (std::string(argv[2]) != "config")) { PrintCmdHelpInfo(argv[1]); return -1; @@ -2251,7 +2285,7 @@ int32_t CompactTablet(TabletInfo& tablet, int lg) { return 0; } -int32_t CompactTabletOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { +int32_t CompactTabletOp(Client* client, int32_t argc, std::string* argv, ErrorCode* err) { if (argc != 4) { PrintCmdHelpInfo(argv[1]); return -1; @@ -2324,7 +2358,7 @@ bool GetTabletInfo(Client* client, const std::string& tablename, return true; } -int32_t ScanTabletOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { +int32_t ScanTabletOp(Client* client, int32_t argc, std::string* argv, ErrorCode* err) { if (argc < 4) { PrintCmdHelpInfo(argv[1]); return -1; @@ -2367,7 +2401,7 @@ int32_t ScanTabletOp(Client* client, int32_t argc, char** argv, ErrorCode* err) return ret; } -int32_t TabletOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { +int32_t TabletOp(Client* client, int32_t argc, std::string* argv, ErrorCode* err) { if ((argc != 4) && (argc != 5)) { PrintCmdHelpInfo(argv[1]); return -1; @@ -2403,7 +2437,7 @@ int32_t TabletOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { return 0; } -int32_t RenameOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { +int32_t RenameOp(Client* client, int32_t argc, std::string* argv, ErrorCode* err) { if (argc != 4 ) { PrintCmdHelpInfo(argv[1]); return -1; @@ -2485,7 +2519,7 @@ bool FiltrateTabletsByFile(std::vector& tablet_list) { return true; } -int32_t CompactOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { +int32_t CompactOp(Client* client, int32_t argc, std::string* argv, ErrorCode* err) { if (argc != 3) { PrintCmdHelpInfo(argv[1]); return -1; @@ -2511,7 +2545,7 @@ int32_t CompactOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { std::vector::iterator tablet_it = tablet_list.begin(); for (; tablet_it != tablet_list.end(); ++tablet_it) { ThreadPool::Task task = - boost::bind(&CompactTablet, *tablet_it, FLAGS_lg); + std::bind(&CompactTablet, *tablet_it, FLAGS_lg); thread_pool.AddTask(task); } while (thread_pool.PendingNum() > 0) { @@ -2524,7 +2558,7 @@ int32_t CompactOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { return 0; } -int32_t FindMasterOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { +int32_t FindMasterOp(Client* client, int32_t argc, std::string* argv, ErrorCode* err) { if (argc != 2) { PrintCmdHelpInfo(argv[1]); return -1; @@ -2534,7 +2568,7 @@ int32_t FindMasterOp(Client* client, int32_t argc, char** argv, ErrorCode* err) return 0; } -int32_t FindTsOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { +int32_t FindTsOp(Client* client, int32_t argc, std::string* argv, ErrorCode* err) { if (argc != 3 && argc != 4) { PrintCmdHelpInfo(argv[1]); return -1; @@ -2766,7 +2800,7 @@ int32_t ProcessMeta(const std::string& op, const TableMetaList& table_list, return 0; } -int32_t MetaOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { +int32_t MetaOp(Client* client, int32_t argc, std::string* argv, ErrorCode* err) { if (argc != 4 && argc != 3) { PrintCmdHelpInfo(argv[1]); return -1; @@ -2804,7 +2838,7 @@ int32_t MetaOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { return 0; } -int32_t FindTabletOp(int32_t argc, char** argv, ErrorCode* err) { +int32_t FindTabletOp(Client*, int32_t argc, std::string* argv, ErrorCode* err) { if ((argc != 4) && (argc != 5)) { PrintCmdHelpInfo(argv[1]); return -1; @@ -2889,7 +2923,7 @@ int32_t FindTabletOp(int32_t argc, char** argv, ErrorCode* err) { return 0; } -int32_t Meta2Op(Client *client, int32_t argc, char** argv) { +int32_t Meta2Op(Client*, int32_t argc, std::string* argv, ErrorCode*) { if (argc < 3) { PrintCmdHelpInfo("meta"); return -1; @@ -3022,7 +3056,7 @@ static int32_t DeleteUserFromGroup(Client* client, const std::string& user, return 0; } -int32_t UserOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { +int32_t UserOp(Client* client, int32_t argc, std::string* argv, ErrorCode* err) { if (argc < 4) { PrintCmdHelpInfo(argv[1]); return -1; @@ -3045,7 +3079,7 @@ int32_t UserOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { return -1; } -int32_t RangeOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { +int32_t RangeOp(Client* client, int32_t argc, std::string* argv, ErrorCode* err) { if (argc != 3) { PrintCmdHelpInfo(argv[1]); return -1; @@ -3082,7 +3116,7 @@ int32_t RangeOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { return 0; } -int StartRowTxnOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { +int StartRowTxnOp(Client* client, int32_t argc, std::string* argv, ErrorCode* err) { if (argc != 5) { PrintCmdHelpInfo(argv[1]); return -1; @@ -3111,7 +3145,7 @@ int StartRowTxnOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { return 0; } -int CommitRowTxnOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { +int CommitRowTxnOp(Client* client, int32_t argc, std::string* argv, ErrorCode* err) { if (argc != 3) { PrintCmdHelpInfo(argv[1]); return -1; @@ -3130,7 +3164,7 @@ int CommitRowTxnOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { return 0; } -int TxnOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { +int TxnOp(Client* client, int32_t argc, std::string* argv, ErrorCode* err) { if (argc < 3) { LOG(ERROR) << "args number error: " << argc << ", need > 2"; PrintCmdHelpInfo(argv[1]); @@ -3148,7 +3182,7 @@ int TxnOp(Client* client, int32_t argc, char** argv, ErrorCode* err) { } } -int32_t HelpOp(int32_t argc, char** argv) { +int32_t HelpOp(Client*, int32_t argc, std::string* argv, ErrorCode*) { if (argc == 2) { PrintAllCmd(); } else if (argc == 3) { @@ -3159,105 +3193,99 @@ int32_t HelpOp(int32_t argc, char** argv) { return 0; } -int ExecuteCommand(Client* client, int argc, char* argv[]) { +int32_t HelpOp(int32_t argc, char** argv) { + std::vector argv_svec(argv, argv + argc); + return HelpOp(NULL, argc, &argv_svec[0], NULL); +} + +bool ParseCommand(int argc, char** arg_list, std::vector* parsed_arg_list) { + for (int i = 0; i < argc; i++) { + std::string parsed_arg = arg_list[i]; + if (FLAGS_readable && !ParseDebugString(arg_list[i], &parsed_arg)) { + std::cout << "invalid debug format of argument: " << arg_list[i] << std::endl; + return false; + } + parsed_arg_list->push_back(parsed_arg); + } + return true; +} + +static void InitializeCommandTable(){ + CommandTable& command_table = GetCommandTable(); + command_table["create"] = CreateOp; + command_table["createbyfile"] = CreateByFileOp; + command_table["update"] = UpdateOp; + command_table["update-check"] = UpdateCheckOp; + command_table["drop"] = DropOp; + command_table["enable"] = EnableOp; + command_table["disable"] = DisableOp; + command_table["show"] = ShowOp; + command_table["showx"] = ShowOp; + command_table["showall"] = ShowOp; + command_table["showschema"] = ShowSchemaOp; + command_table["showschemax"] = ShowSchemaOp; + command_table["showts"] = ShowTabletNodesOp; + command_table["showtsx"] = ShowTabletNodesOp; + command_table["put"] = PutOp; + command_table["putint64"] = PutInt64Op; + command_table["put-ttl"] = PutTTLOp; + command_table["put_counter"] = PutCounterOp; + command_table["add"] = AddOp; + command_table["addint64"] = AddInt64Op; + command_table["putif"] = PutIfAbsentOp; + command_table["append"] = AppendOp; + command_table["get"] = GetOp; + command_table["getint64"] = GetInt64Op; + command_table["get_counter"] = GetCounterOp; + command_table["delete"] = DeleteOp; + command_table["delete1v"] = DeleteOp; + command_table["batchput"] = BatchPutOp; + command_table["batchputint64"] = BatchPutInt64Op; + command_table["batchget"] = BatchGetOp; + command_table["batchgetint64"] = BatchGetInt64Op; + command_table["scan"] = ScanOp; + command_table["scanallv"] = ScanOp; + command_table["safemode"] = SafeModeOp; + command_table["tablet"] = TabletOp; + command_table["rename"] = RenameOp; + command_table["meta"] = MetaOp; + command_table["compact"] = CompactOp; + command_table["findmaster"] = FindMasterOp; + command_table["findts"] = FindTsOp; + command_table["findtablet"] = FindTabletOp; + command_table["meta2"] = Meta2Op; + command_table["user"] = UserOp; + command_table["reload"] = ReloadConfigOp; + command_table["kick"] = KickTabletServerOp; + command_table["cookie"] = CookieOp; + command_table["snapshot"] = SnapshotOp; + command_table["range"] = RangeOp; + command_table["rangex"] = RangeOp; + command_table["txn"] = TxnOp; + command_table["help"] = HelpOp; +} + +int ExecuteCommand(Client* client, int argc, char** arg_list) { int ret = 0; ErrorCode error_code; + + std::vector parsed_arg_list; + if (!ParseCommand(argc, arg_list, &parsed_arg_list)) { + return 1; + } + std::string* argv = &parsed_arg_list[0]; + + CommandTable& command_table = GetCommandTable(); std::string cmd = argv[1]; - if (cmd == "create") { - ret = CreateOp(client, argc, argv, &error_code); - } else if (cmd == "createbyfile") { - ret = CreateByFileOp(client, argc, argv, &error_code); - } else if (cmd == "update") { - ret = UpdateOp(client, argc, argv, &error_code); - } else if (cmd == "update-check") { - ret = UpdateCheckOp(client, argc, argv, &error_code); - } else if (cmd == "drop") { - ret = DropOp(client, argc, argv, &error_code); - } else if (cmd == "enable") { - ret = EnableOp(client, argc, argv, &error_code); - } else if (cmd == "disable") { - ret = DisableOp(client, argc, argv, &error_code); - } else if (cmd == "show" || cmd == "showx" || cmd == "showall") { - ret = ShowOp(client, argc, argv, &error_code); - } else if (cmd == "showschema" || cmd == "showschemax") { - ret = ShowSchemaOp(client, argc, argv, &error_code); - } else if (cmd == "showts" || cmd == "showtsx") { - ret = ShowTabletNodesOp(client, argc, argv, &error_code); - } else if (cmd == "put") { - ret = PutOp(client, argc, argv, &error_code); - } else if (cmd == "putint64") { - ret = PutInt64Op(client, argc, argv, &error_code); - } else if (cmd == "put-ttl") { - ret = PutTTLOp(client, argc, argv, &error_code); - } else if (cmd == "put_counter") { - ret = PutCounterOp(client, argc, argv, &error_code); - } else if (cmd == "add") { - ret = AddOp(client, argc, argv, &error_code); - } else if (cmd == "addint64") { - ret = AddInt64Op(client, argc, argv, &error_code); - } else if (cmd == "putif") { - ret = PutIfAbsentOp(client, argc, argv, &error_code); - } else if (cmd == "append") { - ret = AppendOp(client, argc, argv, &error_code); - } else if (cmd == "get") { - ret = GetOp(client, argc, argv, &error_code); - } else if (cmd == "getint64") { - ret = GetInt64Op(client, argc, argv, &error_code); - } else if (cmd == "get_counter") { - ret = GetCounterOp(client, argc, argv, &error_code); - } else if (cmd == "delete" || cmd == "delete1v") { - ret = DeleteOp(client, argc, argv, &error_code); - } else if (cmd == "batchput") { - ret = BatchPutOp(client, argc, argv, &error_code); - } else if (cmd == "batchputint64") { - ret = BatchPutInt64Op(client, argc, argv, &error_code); - } else if (cmd == "batchget") { - ret = BatchGetOp(client, argc, argv, &error_code); - } else if (cmd == "batchgetint64") { - ret = BatchGetInt64Op(client, argc, argv, &error_code); - } else if (cmd == "scan" || cmd == "scanallv") { - ret = ScanOp(client, argc, argv, &error_code); - } else if (cmd == "safemode") { - ret = SafeModeOp(client, argc, argv, &error_code); - } else if (cmd == "tablet") { - ret = TabletOp(client, argc, argv, &error_code); - } else if (cmd == "rename") { - ret = RenameOp(client, argc, argv, &error_code); - } else if (cmd == "meta") { - ret = MetaOp(client, argc, argv, &error_code); - } else if (cmd == "compact") { - ret = CompactOp(client, argc, argv, &error_code); - } else if (cmd == "findmaster") { - // get master addr(hostname:port) - ret = FindMasterOp(client, argc, argv, &error_code); - } else if (cmd == "findts") { - // get tabletnode addr from a key - ret = FindTsOp(client, argc, argv, &error_code); - } else if (cmd == "findtablet") { - ret = FindTabletOp(argc, argv, &error_code); - } else if (cmd == "meta2") { - ret = Meta2Op(client, argc, argv); - } else if (cmd == "user") { - ret = UserOp(client, argc, argv, &error_code); - } else if (cmd == "reload") { - ret = ReloadConfigOp(client, argc, argv, &error_code); - } else if (cmd == "kick") { - ret = KickTabletServerOp(client, argc, argv, &error_code); - } else if (cmd == "cookie") { - ret = CookieOp(argc, argv); - } else if (cmd == "snapshot") { - ret = SnapshotOp(client, argc, argv, &error_code); - } else if (cmd == "range" || cmd == "rangex") { - ret = RangeOp(client, argc, argv, &error_code); - } else if (cmd == "txn") { - ret = TxnOp(client, argc, argv, &error_code); - } else if (cmd == "version") { + if (cmd == "version") { PrintSystemVersion(); - } else if (cmd == "help") { - HelpOp(argc, argv); + } else if (command_table.find(cmd) != command_table.end()) { + ret = command_table[cmd](client, argc, argv, &error_code); } else { - PrintUnknownCmdHelpInfo(argv[1]); + PrintUnknownCmdHelpInfo(argv[1].c_str()); + ret = 1; } + if (error_code.GetType() != ErrorCode::kOK) { LOG(ERROR) << "fail reason: " << error_code.ToString(); } @@ -3283,6 +3311,8 @@ int main(int argc, char* argv[]) { } g_printer_opt.print_head = FLAGS_stdout_is_tty; + InitializeCommandTable(); + int ret = 0; if (argc == 1) { char* line = NULL; diff --git a/src/utils/config_utils.cc b/src/utils/config_utils.cc index 96a7178c5..fe74f2572 100644 --- a/src/utils/config_utils.cc +++ b/src/utils/config_utils.cc @@ -21,6 +21,7 @@ bool LoadFlagFile(const std::string& file) { argv[1] = const_cast(flag.c_str()); argv[2] = NULL; ::google::ParseCommandLineFlags(&argc, &argv, false); + argv[1] = NULL; delete[] argv; return true; } diff --git a/src/utils/string_util.cc b/src/utils/string_util.cc index 40bb0ea2a..4e0cf1865 100644 --- a/src/utils/string_util.cc +++ b/src/utils/string_util.cc @@ -13,7 +13,11 @@ namespace tera { bool IsVisible(char c) { - return (c >= 0x20 && c <= 0x7E); + return (c >= 0x21 && c <= 0x7E); // exclude space (0x20) +} + +char IsHex(uint8_t i) { + return ((i >= '0' && i <= '9') || (i >= 'a' && i <= 'f') || (i >= 'A' && i <= 'F')); } char ToHex(uint8_t i) { @@ -26,6 +30,18 @@ char ToHex(uint8_t i) { return j; } +char ToBinary(uint8_t i) { + char j = 0; + if (i >= '0' && i <= '9') { + j = i - '0'; + } else if (i >= 'a' && i <= 'f') { + j = i - 'a' + 10; + } else { + j = i - 'A' + 10; + } + return j; +} + std::string DebugString(const std::string& src) { size_t src_len = src.size(); std::string dst; @@ -47,6 +63,69 @@ std::string DebugString(const std::string& src) { return dst.substr(0, j); } +bool ParseDebugString(const std::string& src, std::string* dst) { + size_t src_len = src.size(); + std::string tmp; + tmp.resize(src_len); + + int state = 0; // 0: normal, 1: \, 2: \x, 3: \x[0-9a-fAZ-F] + char bin_char = 0; + size_t j = 0; + for (size_t i = 0; i < src_len; i++) { + uint8_t c = src[i]; + if (!IsVisible(c) && !isspace(c)) { + return false; + } + switch (state) { + case 0: + if (c == '\\') { + state = 1; + } else { + tmp[j++] = c; + } + break; + case 1: + if (c == 'x') { + state = 2; + } else if (c == '\\') { + tmp[j++] = '\\'; + state = 0; + } else { + return false; + } + break; + case 2: + if (!IsHex(c)) { + return false; + } else { + bin_char |= (ToBinary(c) << 4); + state = 3; + } + break; + case 3: + if (!IsHex(c)) { + return false; + } else { + bin_char |= ToBinary(c) & 0xF; + tmp[j++] = bin_char; + bin_char = 0; + state = 0; + } + break; + default: + abort(); + break; + } + } + + if (state != 0) { + return false; + } + + dst->assign(tmp.substr(0, j)); + return true; +} + bool IsValidTableName(const std::string& str) { return IsValidName(str); } @@ -72,7 +151,7 @@ bool IsValidName(const std::string& str) { for (size_t i = 0; i < str.size(); ++i) { char c = str[i]; if (!(isdigit(c) || isupper(c) || islower(c) - || (c == '_') || (c == '.') || (c == '-'))) { + || (c == '_') || (c == '.') || (c == '-') || (c == '#'))) { return false; } } diff --git a/src/utils/string_util.h b/src/utils/string_util.h index 09049b844..bc1f409d4 100644 --- a/src/utils/string_util.h +++ b/src/utils/string_util.h @@ -12,7 +12,11 @@ namespace tera { extern const size_t kNameLenMin; extern const size_t kNameLenMax; + // binary string to debug string std::string DebugString(const std::string& src); + // debug string to binary string + bool ParseDebugString(const std::string& src, std::string* dst); + bool IsValidName(const std::string& str); bool IsValidTableName(const std::string& str); bool IsValidGroupName(const std::string& name); diff --git a/src/utils/test/counter_test.cc b/src/utils/test/counter_test.cc index 49d02cff7..526f9cae6 100644 --- a/src/utils/test/counter_test.cc +++ b/src/utils/test/counter_test.cc @@ -5,7 +5,7 @@ #include #include -#include +#include #include "gtest/gtest.h" @@ -69,17 +69,17 @@ TEST(CounterTest, Basic) { Counter counter; ThreadPool* pool = new ThreadPool(thread_num); for (int i = 0; i < thread_num / 4; ++i) { - boost::function callback = - boost::bind(&callback_add, &counter); + std::function callback = + std::bind(&callback_add, &counter); pool->AddTask(callback); - callback = boost::bind(&callback_sub, &counter); + callback = std::bind(&callback_sub, &counter); pool->AddTask(callback); - callback = boost::bind(&callback_inc, &counter); + callback = std::bind(&callback_inc, &counter); pool->AddTask(callback); - callback = boost::bind(&callback_dec, &counter); + callback = std::bind(&callback_dec, &counter); pool->AddTask(callback); MutexLock locker(&mutex); @@ -99,14 +99,14 @@ TEST(CounterTest, Clear) { Counter counter; ThreadPool* pool = new ThreadPool(thread_num); for (int i = 0; i < thread_num / 3; ++i) { - boost::function callback = - boost::bind(&callback_add, &counter); + std::function callback = + std::bind(&callback_add, &counter); pool->AddTask(callback); - callback = boost::bind(&callback_inc, &counter); + callback = std::bind(&callback_inc, &counter); pool->AddTask(callback); - callback = boost::bind(&callback_clear, &counter); + callback = std::bind(&callback_clear, &counter); pool->AddTask(callback); MutexLock lock(&mutex); diff --git a/src/zk/zk_adapter.cc b/src/zk/zk_adapter.cc index 5a46dce08..926005a82 100644 --- a/src/zk/zk_adapter.cc +++ b/src/zk/zk_adapter.cc @@ -7,8 +7,8 @@ #include "zk/zk_adapter.h" #include +#include -#include #include #include "common/this_thread.h" @@ -816,7 +816,7 @@ void ZooKeeperAdapter::SessionEventCallBack(int state) { LOG(INFO) << "disconnect from zk server, enable timer: " << session_timeout_ << " ms"; ThreadPool::Task task = - boost::bind(&ZooKeeperAdapter::SessionTimeoutWrapper, this); + std::bind(&ZooKeeperAdapter::SessionTimeoutWrapper, this); session_timer_id_ = thread_pool_.DelayTask(session_timeout_, task); } session_id_ = -1; diff --git a/test/testcase/common.py b/test/testcase/common.py old mode 100755 new mode 100644 index 897415b66..56fa734f7 --- a/test/testcase/common.py +++ b/test/testcase/common.py @@ -32,6 +32,7 @@ def runcmd(cmd, ignore_status=False): print "stderr: " print err print "returncode: %d" % p.returncode + p.wait() ret = p.returncode if not ignore_status: assert( ret == 0 ) @@ -83,7 +84,7 @@ def cleanup(): for f in files: if f.endswith('.out'): os.remove(f) - + def print_debug_msg(sid=0, msg=""): """ provide general print interface @@ -180,10 +181,11 @@ def createbyfile(schema, deli=''): def rowread_table(table_name, file_path): allv = 'scan' - + flags = '--printable=false' + tmpfile = 'tmp.file' - scan_cmd = '{teracli} {op} {table_name} "" "" > {out}'.format( - teracli=const.teracli_binary, op=allv, table_name=table_name, out=tmpfile) + scan_cmd = '{teracli} {flags} {op} {table_name} "" "" > {out}'.format( + teracli=const.teracli_binary, flags=flags, op=allv, table_name=table_name, out=tmpfile) print scan_cmd ret = subprocess.Popen(scan_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) ret.communicate() @@ -196,9 +198,9 @@ def rowread_table(table_name, file_path): print awk_cmd ret = subprocess.Popen(awk_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) ret.communicate() - - rowread_cmd = 'while read line; do {teracli} get {table_name} $line; done < {out1} > {output}'.format( - teracli=const.teracli_binary, table_name=table_name, out1=tmpfile2, output=file_path) + + rowread_cmd = 'while read line; do {teracli} {flags} get {table_name} $line; done < {out1} > {output}'.format( + teracli=const.teracli_binary, flags=flags, table_name=table_name, out1=tmpfile2, output=file_path) ret = subprocess.Popen(rowread_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) ret.communicate() @@ -287,16 +289,16 @@ def scan_table(table_name, file_path, allversion, snapshot=0, is_async=False): allv += 'scanallv' else: allv += 'scan' - + if is_async is True: - async_flag = '--tera_sdk_batch_scan_enabled=true --v=30 ' + async_flag = '--tera_sdk_batch_scan_enabled=true --v=30 --printable=false' else: - async_flag = '--tera_sdk_batch_scan_enabled=false' - + async_flag = '--tera_sdk_batch_scan_enabled=false --printable=false' + snapshot_args = '' if snapshot != 0: snapshot_args += '--snapshot={snapshot}'.format(snapshot=snapshot) - + scan_cmd = '{teracli} {flags} {op} {table_name} "" "" {snapshot} > {out}'.format( teracli=const.teracli_binary, flags=async_flag, op=allv, table_name=table_name, snapshot=snapshot_args, out=file_path) runcmd(scan_cmd) diff --git a/test/testcase/shell_script/launch_ts_first.sh b/test/testcase/shell_script/launch_ts_first.sh index 9f716402b..02ceb6502 100755 --- a/test/testcase/shell_script/launch_ts_first.sh +++ b/test/testcase/shell_script/launch_ts_first.sh @@ -25,9 +25,8 @@ MASTER_LOG_FILE=${CURRENT_DIR}/../log/master.stderr if [ -f ${MASTER_LOG_FILE} ];then mv ${MASTER_LOG_FILE} ${MASTER_LOG_FILE}.${TIME} fi -${CURRENT_DIR}/tera_main \ +${CURRENT_DIR}/tera_master \ --flagfile=${CURRENT_DIR}/../conf/tera.flag \ - --tera_role=master \ --tera_master_port=${PORT} \ --tera_fake_zk_path_prefix=${FAKE_ZK_PATH_PREFIX} \ --tera_log_prefix=master &> ${MASTER_LOG_FILE}