diff --git a/src/build_log.cc b/src/build_log.cc
index 52c7c84f85..073d2fe81e 100644
--- a/src/build_log.cc
+++ b/src/build_log.cc
@@ -53,63 +53,14 @@ using namespace std;
 namespace {
 
 const char kFileSignature[] = "# ninja log v%d\n";
-const int kOldestSupportedVersion = 6;
-const int kCurrentVersion = 6;
-
-// 64bit MurmurHash2, by Austin Appleby
-#if defined(_MSC_VER)
-#define BIG_CONSTANT(x) (x)
-#else // defined(_MSC_VER)
-#define BIG_CONSTANT(x) (x##LLU)
-#endif // !defined(_MSC_VER)
-inline
-uint64_t MurmurHash64A(const void* key, size_t len) {
-  static const uint64_t seed = 0xDECAFBADDECAFBADull;
-  const uint64_t m = BIG_CONSTANT(0xc6a4a7935bd1e995);
-  const int r = 47;
-  uint64_t h = seed ^ (len * m);
-  const unsigned char* data = static_cast<const unsigned char*>(key);
-  while (len >= 8) {
-    uint64_t k;
-    memcpy(&k, data, sizeof k);
-    k *= m;
-    k ^= k >> r;
-    k *= m;
-    h ^= k;
-    h *= m;
-    data += 8;
-    len -= 8;
-  }
-  switch (len & 7)
-  {
-  case 7: h ^= uint64_t(data[6]) << 48;
-          NINJA_FALLTHROUGH;
-  case 6: h ^= uint64_t(data[5]) << 40;
-          NINJA_FALLTHROUGH;
-  case 5: h ^= uint64_t(data[4]) << 32;
-          NINJA_FALLTHROUGH;
-  case 4: h ^= uint64_t(data[3]) << 24;
-          NINJA_FALLTHROUGH;
-  case 3: h ^= uint64_t(data[2]) << 16;
-          NINJA_FALLTHROUGH;
-  case 2: h ^= uint64_t(data[1]) << 8;
-          NINJA_FALLTHROUGH;
-  case 1: h ^= uint64_t(data[0]);
-          h *= m;
-  };
-  h ^= h >> r;
-  h *= m;
-  h ^= h >> r;
-  return h;
-}
-#undef BIG_CONSTANT
-
+const int kOldestSupportedVersion = 7;
+const int kCurrentVersion = 7;
 
 }  // namespace
 
 // static
 uint64_t BuildLog::LogEntry::HashCommand(StringPiece command) {
-  return MurmurHash64A(command.str_, command.len_);
+  return rapidhash(command.str_, command.len_);
 }
 
 BuildLog::LogEntry::LogEntry(const string& output)
diff --git a/src/build_log_test.cc b/src/build_log_test.cc
index 12c2dc742c..630b1f1a92 100644
--- a/src/build_log_test.cc
+++ b/src/build_log_test.cc
@@ -104,7 +104,7 @@ TEST_F(BuildLogTest, FirstWriteAddsSignature) {
 
 TEST_F(BuildLogTest, DoubleEntry) {
   FILE* f = fopen(kTestFilename, "wb");
-  fprintf(f, "# ninja log v6\n");
+  fprintf(f, "# ninja log v7\n");
   fprintf(f, "0\t1\t2\tout\t%" PRIx64 "\n",
           BuildLog::LogEntry::HashCommand("command abc"));
   fprintf(f, "0\t1\t2\tout\t%" PRIx64 "\n",
@@ -177,7 +177,7 @@ TEST_F(BuildLogTest, ObsoleteOldVersion) {
 
 TEST_F(BuildLogTest, SpacesInOutput) {
   FILE* f = fopen(kTestFilename, "wb");
-  fprintf(f, "# ninja log v6\n");
+  fprintf(f, "# ninja log v7\n");
   fprintf(f, "123\t456\t456\tout with space\t%" PRIx64 "\n",
           BuildLog::LogEntry::HashCommand("command"));
   fclose(f);
@@ -200,10 +200,10 @@ TEST_F(BuildLogTest, DuplicateVersionHeader) {
   // build log on Windows. This shouldn't crash, and the second version header
   // should be ignored.
   FILE* f = fopen(kTestFilename, "wb");
-  fprintf(f, "# ninja log v6\n");
+  fprintf(f, "# ninja log v7\n");
   fprintf(f, "123\t456\t456\tout\t%" PRIx64 "\n",
           BuildLog::LogEntry::HashCommand("command"));
-  fprintf(f, "# ninja log v6\n");
+  fprintf(f, "# ninja log v7\n");
   fprintf(f, "456\t789\t789\tout2\t%" PRIx64 "\n",
           BuildLog::LogEntry::HashCommand("command2"));
   fclose(f);
@@ -252,7 +252,7 @@ struct TestDiskInterface : public DiskInterface {
 
 TEST_F(BuildLogTest, Restat) {
   FILE* f = fopen(kTestFilename, "wb");
-  fprintf(f, "# ninja log v6\n"
+  fprintf(f, "# ninja log v7\n"
              "1\t2\t3\tout\tcommand\n");
   fclose(f);
   std::string err;
@@ -280,7 +280,7 @@ TEST_F(BuildLogTest, VeryLongInputLine) {
   // Ninja's build log buffer is currently 256kB. Lines longer than that are
   // silently ignored, but don't affect parsing of other lines.
   FILE* f = fopen(kTestFilename, "wb");
-  fprintf(f, "# ninja log v6\n");
+  fprintf(f, "# ninja log v7\n");
   fprintf(f, "123\t456\t456\tout\tcommand start");
   for (size_t i = 0; i < (512 << 10) / strlen(" more_command"); ++i)
     fputs(" more_command", f);
diff --git a/src/deps_log.cc b/src/deps_log.cc
index ec0a4898a2..364d54b896 100644
--- a/src/deps_log.cc
+++ b/src/deps_log.cc
@@ -185,15 +185,13 @@ LoadStatus DepsLog::Load(const string& path, State* state, string* err) {
     return LOAD_SUCCESS;
   }
 
-  long offset;
+  long offset = ftell(f);
   bool read_failed = false;
   int unique_dep_record_count = 0;
   int total_dep_record_count = 0;
   for (;;) {
-    offset = ftell(f);
-
     unsigned size;
-    if (fread(&size, 4, 1, f) < 1) {
+    if (fread(&size, sizeof(size), 1, f) < 1) {
       if (!feof(f))
         read_failed = true;
       break;
@@ -205,6 +203,7 @@ LoadStatus DepsLog::Load(const string& path, State* state, string* err) {
       read_failed = true;
       break;
     }
+    offset += size + sizeof(size);
 
     if (is_deps) {
       if ((size % 4) != 0) {
diff --git a/src/eval_env.cc b/src/eval_env.cc
index 796a3264d1..cbc935acc2 100644
--- a/src/eval_env.cc
+++ b/src/eval_env.cc
@@ -99,6 +99,10 @@ string BindingEnv::LookupWithFallback(const string& var,
 }
 
 string EvalString::Evaluate(Env* env) const {
+  if (parsed_.empty()) {
+    return single_token_;
+  }
+
   string result;
   for (TokenList::const_iterator i = parsed_.begin(); i != parsed_.end(); ++i) {
     if (i->second == RAW)
@@ -110,40 +114,57 @@ string EvalString::Evaluate(Env* env) const {
 }
 
 void EvalString::AddText(StringPiece text) {
-  // Add it to the end of an existing RAW token if possible.
-  if (!parsed_.empty() && parsed_.back().second == RAW) {
-    parsed_.back().first.append(text.str_, text.len_);
+  if (parsed_.empty()) {
+    single_token_.append(text.begin(), text.end());
+  } else if (!parsed_.empty() && parsed_.back().second == RAW) {
+    parsed_.back().first.append(text.begin(), text.end());
   } else {
-    parsed_.push_back(make_pair(text.AsString(), RAW));
+    parsed_.push_back(std::make_pair(text.AsString(), RAW));
   }
 }
+
 void EvalString::AddSpecial(StringPiece text) {
-  parsed_.push_back(make_pair(text.AsString(), SPECIAL));
+  if (parsed_.empty() && !single_token_.empty()) {
+    // Going from one to two tokens, so we can no longer apply
+    // our single_token_ optimization and need to push everything
+    // onto the vector.
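+    // (Example: after AddText("foo"), a following AddSpecial("bar") must
+    // first move "foo" into parsed_ as a RAW token before the SPECIAL
+    // token is appended.)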
+    parsed_.push_back(std::make_pair(std::move(single_token_), RAW));
+  }
+  parsed_.push_back(std::make_pair(text.AsString(), SPECIAL));
 }
 
 string EvalString::Serialize() const {
   string result;
-  for (TokenList::const_iterator i = parsed_.begin();
-       i != parsed_.end(); ++i) {
+  if (parsed_.empty() && !single_token_.empty()) {
     result.append("[");
-    if (i->second == SPECIAL)
-      result.append("$");
-    result.append(i->first);
+    result.append(single_token_);
     result.append("]");
+  } else {
+    for (const auto& pair : parsed_) {
+      result.append("[");
+      if (pair.second == SPECIAL)
+        result.append("$");
+      result.append(pair.first.begin(), pair.first.end());
+      result.append("]");
+    }
   }
   return result;
 }
 
 string EvalString::Unparse() const {
   string result;
-  for (TokenList::const_iterator i = parsed_.begin();
-       i != parsed_.end(); ++i) {
-    bool special = (i->second == SPECIAL);
-    if (special)
-      result.append("${");
-    result.append(i->first);
-    if (special)
-      result.append("}");
+  if (parsed_.empty() && !single_token_.empty()) {
+    result.append(single_token_.begin(), single_token_.end());
+  } else {
+    for (TokenList::const_iterator i = parsed_.begin();
+         i != parsed_.end(); ++i) {
+      bool special = (i->second == SPECIAL);
+      if (special)
+        result.append("${");
+      result.append(i->first.begin(), i->first.end());
+      if (special)
+        result.append("}");
+    }
   }
   return result;
 }
diff --git a/src/eval_env.h b/src/eval_env.h
index 677dc217a2..ae6d8bc898 100644
--- a/src/eval_env.h
+++ b/src/eval_env.h
@@ -39,8 +39,8 @@ struct EvalString {
   /// @return The string with variables not expanded.
   std::string Unparse() const;
 
-  void Clear() { parsed_.clear(); }
-  bool empty() const { return parsed_.empty(); }
+  void Clear() { parsed_.clear(); single_token_.clear(); }
+  bool empty() const { return parsed_.empty() && single_token_.empty(); }
 
   void AddText(StringPiece text);
   void AddSpecial(StringPiece text);
@@ -53,6 +53,12 @@ struct EvalString {
   enum TokenType { RAW, SPECIAL };
   typedef std::vector<std::pair<std::string, TokenType> > TokenList;
   TokenList parsed_;
+
+  // If we hold only a single RAW token, then we keep it here instead of
+  // pushing it on TokenList. This saves a bunch of allocations for
+  // what is a common case. If parsed_ is nonempty, then this value
+  // must be ignored.
+  std::string single_token_;
 };
 
 /// An invocable build command and associated metadata (description, etc.).
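Note on the EvalString change above: single_token_ is purely an internal fast path, so every public operation must behave exactly as it did when all tokens lived in parsed_. A minimal sketch of that invariant, using only the EvalString API shown in these hunks (hypothetical values, not part of the patch):

    #include <cassert>
    #include "eval_env.h"

    int main() {
      EvalString es;
      es.AddText("gcc ");   // first RAW text: stored in single_token_,
      es.AddText("-c");     // appended in place, no vector allocation
      assert(es.Unparse() == "gcc -c");
      es.AddSpecial("in");  // second token: "gcc -c" migrates into parsed_
      assert(es.Unparse() == "gcc -c${in}");
      return 0;
    }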
diff --git a/src/graph.cc b/src/graph.cc
index f04ffb47c8..c1276daefb 100644
--- a/src/graph.cc
+++ b/src/graph.cc
@@ -740,12 +740,13 @@ bool ImplicitDepLoader::LoadDepsFromLog(Edge* edge, string* err) {
     return false;
   }
 
-  vector<Node*>::iterator implicit_dep =
-      PreallocateSpace(edge, deps->node_count);
-  for (int i = 0; i < deps->node_count; ++i, ++implicit_dep) {
-    Node* node = deps->nodes[i];
-    *implicit_dep = node;
-    node->AddOutEdge(edge);
+  Node** nodes = deps->nodes;
+  size_t node_count = deps->node_count;
+  edge->inputs_.insert(edge->inputs_.end() - edge->order_only_deps_,
+                       nodes, nodes + node_count);
+  edge->implicit_deps_ += node_count;
+  for (size_t i = 0; i < node_count; ++i) {
+    nodes[i]->AddOutEdge(edge);
   }
   return true;
 }
diff --git a/src/hash_map.h b/src/hash_map.h
index 3f465338ac..4361c80a35 100644
--- a/src/hash_map.h
+++ b/src/hash_map.h
@@ -20,40 +20,8 @@
 #include "string_piece.h"
 #include "util.h"
 
-// MurmurHash2, by Austin Appleby
-static inline
-unsigned int MurmurHash2(const void* key, size_t len) {
-  static const unsigned int seed = 0xDECAFBAD;
-  const unsigned int m = 0x5bd1e995;
-  const int r = 24;
-  unsigned int h = seed ^ len;
-  const unsigned char* data = static_cast<const unsigned char*>(key);
-  while (len >= 4) {
-    unsigned int k;
-    memcpy(&k, data, sizeof k);
-    k *= m;
-    k ^= k >> r;
-    k *= m;
-    h *= m;
-    h ^= k;
-    data += 4;
-    len -= 4;
-  }
-  switch (len) {
-  case 3: h ^= data[2] << 16;
-          NINJA_FALLTHROUGH;
-  case 2: h ^= data[1] << 8;
-          NINJA_FALLTHROUGH;
-  case 1: h ^= data[0];
-          h *= m;
-  };
-  h ^= h >> 13;
-  h *= m;
-  h ^= h >> 15;
-  return h;
-}
-
-#include <unordered_map>
+#include "third_party/emhash/hash_table8.hpp"
+#include "third_party/rapidhash/rapidhash.h"
 
 namespace std {
 template<>
@@ -62,7 +30,7 @@ struct hash<StringPiece> {
   typedef size_t result_type;
 
   size_t operator()(StringPiece key) const {
-    return MurmurHash2(key.str_, key.len_);
+    return rapidhash(key.str_, key.len_);
   }
 };
 }
@@ -73,7 +41,7 @@ struct hash<StringPiece> {
 /// mapping StringPiece => Foo*.
 template<typename V>
 struct ExternalStringHashMap {
-  typedef std::unordered_map<StringPiece, V> Type;
+  typedef emhash8::HashMap<StringPiece, V> Type;
 };
 
 #endif // NINJA_MAP_H_
diff --git a/src/manifest_parser.cc b/src/manifest_parser.cc
index c4b2980164..373dc65a83 100644
--- a/src/manifest_parser.cc
+++ b/src/manifest_parser.cc
@@ -209,14 +209,16 @@ bool ManifestParser::ParseDefault(string* err) {
 }
 
 bool ManifestParser::ParseEdge(string* err) {
-  vector<EvalString> ins, outs, validations;
+  ins_.clear();
+  outs_.clear();
+  validations_.clear();
 
   {
     EvalString out;
     if (!lexer_.ReadPath(&out, err))
       return false;
     while (!out.empty()) {
-      outs.push_back(out);
+      outs_.push_back(std::move(out));
       out.Clear();
 
       if (!lexer_.ReadPath(&out, err))
@@ -233,12 +235,12 @@ bool ManifestParser::ParseEdge(string* err) {
         return false;
       if (out.empty())
         break;
-      outs.push_back(out);
+      outs_.push_back(std::move(out));
       ++implicit_outs;
     }
   }
 
-  if (outs.empty())
+  if (outs_.empty())
     return lexer_.Error("expected path", err);
 
   if (!ExpectToken(Lexer::COLON, err))
@@ -259,7 +261,7 @@ bool ManifestParser::ParseEdge(string* err) {
       return false;
     if (in.empty())
       break;
-    ins.push_back(in);
+    ins_.push_back(std::move(in));
   }
 
   // Add all implicit deps, counting how many as we go.
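Note on the hash_map.h hunks above: ExternalStringHashMap keeps the same shape, so call sites need no updates; only the underlying container (emhash8::HashMap) and the StringPiece hash (rapidhash) change. A minimal usage sketch (hypothetical key and value type, not part of the patch):

    #include <cassert>
    #include "hash_map.h"

    int main() {
      ExternalStringHashMap<int>::Type map;
      map["out/foo.o"] = 1;  // StringPiece key, hashed with rapidhash
      assert(map.find("out/foo.o") != map.end());
      return 0;
    }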
@@ -271,7 +273,7 @@ bool ManifestParser::ParseEdge(string* err) {
         return false;
       if (in.empty())
         break;
-      ins.push_back(in);
+      ins_.push_back(std::move(in));
       ++implicit;
     }
   }
@@ -285,7 +287,7 @@ bool ManifestParser::ParseEdge(string* err) {
         return false;
       if (in.empty())
         break;
-      ins.push_back(in);
+      ins_.push_back(std::move(in));
       ++order_only;
     }
   }
@@ -298,7 +300,7 @@ bool ManifestParser::ParseEdge(string* err) {
         return false;
       if (validation.empty())
         break;
-      validations.push_back(validation);
+      validations_.push_back(std::move(validation));
     }
   }
 
@@ -329,9 +331,9 @@ bool ManifestParser::ParseEdge(string* err) {
     edge->pool_ = pool;
   }
 
-  edge->outputs_.reserve(outs.size());
-  for (size_t i = 0, e = outs.size(); i != e; ++i) {
-    string path = outs[i].Evaluate(env);
+  edge->outputs_.reserve(outs_.size());
+  for (size_t i = 0, e = outs_.size(); i != e; ++i) {
+    string path = outs_[i].Evaluate(env);
     if (path.empty())
       return lexer_.Error("empty path", err);
     uint64_t slash_bits;
@@ -351,8 +353,8 @@ bool ManifestParser::ParseEdge(string* err) {
   }
   edge->implicit_outs_ = implicit_outs;
 
-  edge->inputs_.reserve(ins.size());
-  for (vector<EvalString>::iterator i = ins.begin(); i != ins.end(); ++i) {
+  edge->inputs_.reserve(ins_.size());
+  for (vector<EvalString>::iterator i = ins_.begin(); i != ins_.end(); ++i) {
     string path = i->Evaluate(env);
     if (path.empty())
       return lexer_.Error("empty path", err);
@@ -363,9 +365,9 @@ bool ManifestParser::ParseEdge(string* err) {
   edge->implicit_deps_ = implicit;
   edge->order_only_deps_ = order_only;
 
-  edge->validations_.reserve(validations.size());
-  for (std::vector<EvalString>::iterator v = validations.begin();
-       v != validations.end(); ++v) {
+  edge->validations_.reserve(validations_.size());
+  for (std::vector<EvalString>::iterator v = validations_.begin();
+       v != validations_.end(); ++v) {
     string path = v->Evaluate(env);
     if (path.empty())
       return lexer_.Error("empty path", err);
@@ -419,14 +421,16 @@ bool ManifestParser::ParseFileInclude(bool new_scope, string* err) {
     return false;
   string path = eval.Evaluate(env_);
 
-  ManifestParser subparser(state_, file_reader_, options_);
+  if (subparser_ == nullptr) {
+    subparser_.reset(new ManifestParser(state_, file_reader_, options_));
+  }
   if (new_scope) {
-    subparser.env_ = new BindingEnv(env_);
+    subparser_->env_ = new BindingEnv(env_);
   } else {
-    subparser.env_ = env_;
+    subparser_->env_ = env_;
   }
 
-  if (!subparser.Load(path, err, &lexer_))
+  if (!subparser_->Load(path, err, &lexer_))
     return false;
 
   if (!ExpectToken(Lexer::NEWLINE, err))
diff --git a/src/manifest_parser.h b/src/manifest_parser.h
index db6812dce4..ce37759676 100644
--- a/src/manifest_parser.h
+++ b/src/manifest_parser.h
@@ -17,6 +17,9 @@
 
 #include "parser.h"
 
+#include <memory>
+#include <vector>
+
 struct BindingEnv;
 struct EvalString;
 
@@ -63,6 +66,12 @@ struct ManifestParser : public Parser {
   BindingEnv* env_;
   ManifestParserOptions options_;
   bool quiet_;
+
+  // ins_/outs_/validations_ are reused across invocations to ParseEdge(),
+  // to save on the otherwise constant memory reallocation.
+  // subparser_ is reused solely to get better reuse out of ins_/outs_/validations_.
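+  // (A fresh ManifestParser per include would also recreate those vectors,
+  // throwing away the capacity they grew while parsing earlier files.)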
+  std::unique_ptr<ManifestParser> subparser_;
+  std::vector<EvalString> ins_, outs_, validations_;
 };
 
 #endif // NINJA_MANIFEST_PARSER_H_
diff --git a/src/string_piece.h b/src/string_piece.h
index 1c0bee6e1c..7e7367c20a 100644
--- a/src/string_piece.h
+++ b/src/string_piece.h
@@ -63,6 +63,10 @@ struct StringPiece {
     return len_;
   }
 
+  size_t empty() const {
+    return len_ == 0;
+  }
+
   const char* str_;
   size_t len_;
 };
diff --git a/src/third_party/emhash/README.ninja b/src/third_party/emhash/README.ninja
new file mode 100644
index 0000000000..12ead4e545
--- /dev/null
+++ b/src/third_party/emhash/README.ninja
@@ -0,0 +1,8 @@
+Description: emhash8::HashMap for C++14/17
+Version: 1.6.5 (commit bdebddbdce1b473bbc189178fd523ef4a876ea01)
+URL: https://github.com/ktprime/emhash
+Copyright: Copyright (c) 2021-2024 Huang Yuanbing & bailuzhou AT 163.com
+SPDX-License-Identifier: MIT
+Local changes:
+ - Added includes for _mm_prefetch on MinGW.
+ - Fixed some spelling errors to appease the linter.
diff --git a/src/third_party/emhash/hash_table8.hpp b/src/third_party/emhash/hash_table8.hpp
new file mode 100644
index 0000000000..36b7218381
--- /dev/null
+++ b/src/third_party/emhash/hash_table8.hpp
@@ -0,0 +1,1834 @@
+// emhash8::HashMap for C++14/17
+// version 1.6.5
+// https://github.com/ktprime/emhash/blob/master/hash_table8.hpp
+//
+// Licensed under the MIT License <http://opensource.org/licenses/MIT>.
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2021-2024 Huang Yuanbing & bailuzhou AT 163.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#undef EMH_NEW +#undef EMH_EMPTY + +// likely/unlikely +#if defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__clang__) +# define EMH_LIKELY(condition) __builtin_expect(condition, 1) +# define EMH_UNLIKELY(condition) __builtin_expect(condition, 0) +#else +# define EMH_LIKELY(condition) condition +# define EMH_UNLIKELY(condition) condition +#endif + +#define EMH_EMPTY(n) (0 > (int)(_index[n].next)) +#define EMH_EQHASH(n, key_hash) (((size_type)(key_hash) & ~_mask) == (_index[n].slot & ~_mask)) +//#define EMH_EQHASH(n, key_hash) ((size_type)(key_hash - _index[n].slot) & ~_mask) == 0 +#define EMH_NEW(key, val, bucket, key_hash) \ + new(_pairs + _num_filled) value_type(key, val); \ + _etail = bucket; \ + _index[bucket] = {bucket, _num_filled++ | ((size_type)(key_hash) & ~_mask)} + +#if _WIN32 && defined(_M_IX86) +#include +#endif + +namespace emhash8 { + +struct DefaultPolicy { + static constexpr float load_factor = 0.80f; + static constexpr float min_load_factor = 0.20f; + static constexpr size_t cacheline_size = 64U; +}; + +template, + typename EqT = std::equal_to, + typename Allocator = std::allocator>, //never used + typename Policy = DefaultPolicy> //never used +class HashMap +{ +#ifndef EMH_DEFAULT_LOAD_FACTOR + constexpr static float EMH_DEFAULT_LOAD_FACTOR = 0.80f; +#endif + constexpr static float EMH_MIN_LOAD_FACTOR = 0.25f; //< 0.5 + constexpr static uint32_t EMH_CACHE_LINE_SIZE = 64; //debug only + +public: + using htype = HashMap; + using value_type = std::pair; + using key_type = KeyT; + using mapped_type = ValueT; + //using dPolicy = Policy; + +#ifdef EMH_SMALL_TYPE + using size_type = uint16_t; +#elif EMH_SIZE_TYPE == 0 + using size_type = uint32_t; +#else + using size_type = size_t; +#endif + + using hasher = HashT; + using key_equal = EqT; + + constexpr static size_type INACTIVE = 0-1u; + //constexpr uint32_t END = 0-0x1u; + constexpr static size_type EAD = 2; + + struct Index + { + size_type next; + size_type slot; + }; + + class const_iterator; + class iterator + { + public: + using iterator_category = std::bidirectional_iterator_tag; + using difference_type = std::ptrdiff_t; + using value_type = typename htype::value_type; + using pointer = value_type*; + using const_pointer = const value_type* ; + using reference = value_type&; + using const_reference = const value_type&; + + iterator() : kv_(nullptr) {} + iterator(const_iterator& cit) { + kv_ = cit.kv_; + } + + iterator(const htype* hash_map, size_type bucket) { + kv_ = hash_map->_pairs + (int)bucket; + } + + iterator& operator++() + { + kv_ ++; + return *this; + } + + iterator operator++(int) + { + auto cur = *this; kv_ ++; + return cur; + } + + iterator& operator--() + { + kv_ --; + return *this; + } + + iterator operator--(int) + { + auto cur = *this; kv_ --; + return cur; + } + + reference operator*() const { return *kv_; } + pointer operator->() const { return kv_; } + + bool operator == (const iterator& rhs) const { return kv_ == rhs.kv_; } + bool operator != (const iterator& rhs) const { return kv_ != rhs.kv_; } + bool operator == (const const_iterator& rhs) const { return kv_ == rhs.kv_; } + bool operator != 
(const const_iterator& rhs) const { return kv_ != rhs.kv_; } + + public: + value_type* kv_; + }; + + class const_iterator + { + public: + using iterator_category = std::bidirectional_iterator_tag; + using value_type = typename htype::value_type; + using difference_type = std::ptrdiff_t; + using pointer = value_type*; + using const_pointer = const value_type*; + using reference = value_type&; + using const_reference = const value_type&; + + const_iterator(const iterator& it) { + kv_ = it.kv_; + } + + const_iterator (const htype* hash_map, size_type bucket) { + kv_ = hash_map->_pairs + (int)bucket; + } + + const_iterator& operator++() + { + kv_ ++; + return *this; + } + + const_iterator operator++(int) + { + auto cur = *this; kv_ ++; + return cur; + } + + const_iterator& operator--() + { + kv_ --; + return *this; + } + + const_iterator operator--(int) + { + auto cur = *this; kv_ --; + return cur; + } + + const_reference operator*() const { return *kv_; } + const_pointer operator->() const { return kv_; } + + bool operator == (const iterator& rhs) const { return kv_ == rhs.kv_; } + bool operator != (const iterator& rhs) const { return kv_ != rhs.kv_; } + bool operator == (const const_iterator& rhs) const { return kv_ == rhs.kv_; } + bool operator != (const const_iterator& rhs) const { return kv_ != rhs.kv_; } + public: + const value_type* kv_; + }; + + void init(size_type bucket, float mlf = EMH_DEFAULT_LOAD_FACTOR) + { + _pairs = nullptr; + _index = nullptr; + _mask = _num_buckets = 0; + _num_filled = 0; + _mlf = (uint32_t)((1 << 27) / EMH_DEFAULT_LOAD_FACTOR); + max_load_factor(mlf); + rehash(bucket); + } + + HashMap(size_type bucket = 2, float mlf = EMH_DEFAULT_LOAD_FACTOR) + { + init(bucket, mlf); + } + + HashMap(const HashMap& rhs) + { + if (rhs.load_factor() > EMH_MIN_LOAD_FACTOR) { + _pairs = alloc_bucket((size_type)(rhs._num_buckets * rhs.max_load_factor()) + 4); + _index = alloc_index(rhs._num_buckets); + clone(rhs); + } else { + init(rhs._num_filled + 2, rhs.max_load_factor()); + for (auto it = rhs.begin(); it != rhs.end(); ++it) + insert_unique(it->first, it->second); + } + } + + HashMap(HashMap&& rhs) noexcept + { + init(0); + *this = std::move(rhs); + } + + HashMap(std::initializer_list ilist) + { + init((size_type)ilist.size()); + for (auto it = ilist.begin(); it != ilist.end(); ++it) + do_insert(*it); + } + + template + HashMap(InputIt first, InputIt last, size_type bucket_count=4) + { + init(std::distance(first, last) + bucket_count); + for (; first != last; ++first) + emplace(*first); + } + + HashMap& operator=(const HashMap& rhs) + { + if (this == &rhs) + return *this; + + if (rhs.load_factor() < EMH_MIN_LOAD_FACTOR) { + clear(); free(_pairs); _pairs = nullptr; + rehash(rhs._num_filled + 2); + for (auto it = rhs.begin(); it != rhs.end(); ++it) + insert_unique(it->first, it->second); + return *this; + } + + clearkv(); + + if (_num_buckets != rhs._num_buckets) { + free(_pairs); free(_index); + _index = alloc_index(rhs._num_buckets); + _pairs = alloc_bucket((size_type)(rhs._num_buckets * rhs.max_load_factor()) + 4); + } + + clone(rhs); + return *this; + } + + HashMap& operator=(HashMap&& rhs) noexcept + { + if (this != &rhs) { + swap(rhs); + rhs.clear(); + } + return *this; + } + + template + bool operator == (const Con& rhs) const + { + if (size() != rhs.size()) + return false; + + for (auto it = begin(), last = end(); it != last; ++it) { + auto oi = rhs.find(it->first); + if (oi == rhs.end() || it->second != oi->second) + return false; + } + return true; + } + + template + 
bool operator != (const Con& rhs) const { return !(*this == rhs); } + + ~HashMap() noexcept + { + clearkv(); + free(_pairs); + free(_index); + _index = nullptr; + _pairs = nullptr; + } + + void clone(const HashMap& rhs) + { + _hasher = rhs._hasher; +// _eq = rhs._eq; + _num_buckets = rhs._num_buckets; + _num_filled = rhs._num_filled; + _mlf = rhs._mlf; + _last = rhs._last; + _mask = rhs._mask; +#if EMH_HIGH_LOAD + _ehead = rhs._ehead; +#endif + _etail = rhs._etail; + + auto opairs = rhs._pairs; + memcpy((char*)_index, (char*)rhs._index, (_num_buckets + EAD) * sizeof(Index)); + + if (is_copy_trivially()) { + memcpy((char*)_pairs, (char*)opairs, _num_filled * sizeof(value_type)); + } else { + for (size_type slot = 0; slot < _num_filled; slot++) + new(_pairs + slot) value_type(opairs[slot]); + } + } + + void swap(HashMap& rhs) + { + // std::swap(_eq, rhs._eq); + std::swap(_hasher, rhs._hasher); + std::swap(_pairs, rhs._pairs); + std::swap(_index, rhs._index); + std::swap(_num_buckets, rhs._num_buckets); + std::swap(_num_filled, rhs._num_filled); + std::swap(_mask, rhs._mask); + std::swap(_mlf, rhs._mlf); + std::swap(_last, rhs._last); +#if EMH_HIGH_LOAD + std::swap(_ehead, rhs._ehead); +#endif + std::swap(_etail, rhs._etail); + } + + // ------------------------------------------------------------- + iterator first() const { return {this, 0}; } + iterator last() const { return {this, _num_filled - 1}; } + + value_type& front() { return _pairs[0]; } + const value_type& front() const { return _pairs[0]; } + value_type& back() { return _pairs[_num_filled - 1]; } + const value_type& back() const { return _pairs[_num_filled - 1]; } + + void pop_front() { erase(begin()); } //TODO. only erase first without move last + void pop_back() { erase(last()); } + + iterator begin() { return first(); } + const_iterator cbegin() const { return first(); } + const_iterator begin() const { return first(); } + + iterator end() { return {this, _num_filled}; } + const_iterator cend() const { return {this, _num_filled}; } + const_iterator end() const { return cend(); } + + const value_type* values() const { return _pairs; } + const Index* index() const { return _index; } + + size_type size() const { return _num_filled; } + bool empty() const { return _num_filled == 0; } + size_type bucket_count() const { return _num_buckets; } + + /// Returns average number of elements per bucket. + float load_factor() const { return static_cast(_num_filled) / (_mask + 1); } + + HashT& hash_function() const { return _hasher; } + EqT& key_eq() const { return _eq; } + + void max_load_factor(float mlf) + { + if (mlf < 0.992 && mlf > EMH_MIN_LOAD_FACTOR) { + _mlf = (uint32_t)((1 << 27) / mlf); + if (_num_buckets > 0) rehash(_num_buckets); + } + } + + constexpr float max_load_factor() const { return (1 << 27) / (float)_mlf; } + constexpr size_type max_size() const { return (1ull << (sizeof(size_type) * 8 - 1)); } + constexpr size_type max_bucket_count() const { return max_size(); } + +#if EMH_STATIS + //Returns the bucket number where the element with key k is located. + size_type bucket(const KeyT& key) const + { + const auto bucket = hash_bucket(key); + const auto next_bucket = _index[bucket].next; + if ((int)next_bucket < 0) + return 0; + else if (bucket == next_bucket) + return bucket + 1; + + return hash_main(bucket) + 1; + } + + //Returns the number of elements in bucket n. 
+ size_type bucket_size(const size_type bucket) const + { + auto next_bucket = _index[bucket].next; + if ((int)next_bucket < 0) + return 0; + + next_bucket = hash_main(bucket); + size_type ibucket_size = 1; + + //iterator each item in current main bucket + while (true) { + const auto nbucket = _index[next_bucket].next; + if (nbucket == next_bucket) { + break; + } + ibucket_size ++; + next_bucket = nbucket; + } + return ibucket_size; + } + + size_type get_main_bucket(const size_type bucket) const + { + auto next_bucket = _index[bucket].next; + if ((int)next_bucket < 0) + return INACTIVE; + + return hash_main(bucket); + } + + size_type get_diss(size_type bucket, size_type next_bucket, const size_type slots) const + { + auto pbucket = reinterpret_cast(&_pairs[bucket]); + auto pnext = reinterpret_cast(&_pairs[next_bucket]); + if (pbucket / EMH_CACHE_LINE_SIZE == pnext / EMH_CACHE_LINE_SIZE) + return 0; + size_type diff = pbucket > pnext ? (pbucket - pnext) : (pnext - pbucket); + if (diff / EMH_CACHE_LINE_SIZE < slots - 1) + return diff / EMH_CACHE_LINE_SIZE + 1; + return slots - 1; + } + + int get_bucket_info(const size_type bucket, size_type steps[], const size_type slots) const + { + auto next_bucket = _index[bucket].next; + if ((int)next_bucket < 0) + return -1; + + const auto main_bucket = hash_main(bucket); + if (next_bucket == main_bucket) + return 1; + else if (main_bucket != bucket) + return 0; + + steps[get_diss(bucket, next_bucket, slots)] ++; + size_type ibucket_size = 2; + //find a empty and linked it to tail + while (true) { + const auto nbucket = _index[next_bucket].next; + if (nbucket == next_bucket) + break; + + steps[get_diss(nbucket, next_bucket, slots)] ++; + ibucket_size ++; + next_bucket = nbucket; + } + return (int)ibucket_size; + } + + void dump_statics() const + { + const size_type slots = 128; + size_type buckets[slots + 1] = {0}; + size_type steps[slots + 1] = {0}; + for (size_type bucket = 0; bucket < _num_buckets; ++bucket) { + auto bsize = get_bucket_info(bucket, steps, slots); + if (bsize > 0) + buckets[bsize] ++; + } + + size_type sumb = 0, collision = 0, sumc = 0, finds = 0, sumn = 0; + puts("============== buckets size ration ========="); + for (size_type i = 0; i < sizeof(buckets) / sizeof(buckets[0]); i++) { + const auto bucketsi = buckets[i]; + if (bucketsi == 0) + continue; + sumb += bucketsi; + sumn += bucketsi * i; + collision += bucketsi * (i - 1); + finds += bucketsi * i * (i + 1) / 2; + printf(" %2u %8u %2.2lf| %.2lf\n", i, bucketsi, bucketsi * 100.0 * i / _num_filled, sumn * 100.0 / _num_filled); + } + + puts("========== collision miss ration ==========="); + for (size_type i = 0; i < sizeof(steps) / sizeof(steps[0]); i++) { + sumc += steps[i]; + if (steps[i] <= 2) + continue; + printf(" %2u %8u %.2lf %.2lf\n", i, steps[i], steps[i] * 100.0 / collision, sumc * 100.0 / collision); + } + + if (sumb == 0) return; + printf(" _num_filled/bucket_size/packed collision/cache_miss/hit_find = %u/%.2lf/%zd/ %.2lf%%/%.2lf%%/%.2lf\n", + _num_filled, _num_filled * 1.0 / sumb, sizeof(value_type), (collision * 100.0 / _num_filled), (collision - steps[0]) * 100.0 / _num_filled, finds * 1.0 / _num_filled); + assert(sumn == _num_filled); + assert(sumc == collision); + puts("============== buckets size end ============="); + } +#endif + + void pack_zero(ValueT zero) + { + _pairs[_num_filled] = {KeyT(), zero}; + } + + // ------------------------------------------------------------ + template + iterator find(const K& key) noexcept + { + return {this, 
find_filled_slot(key)}; + } + + template + const_iterator find(const K& key) const noexcept + { + return {this, find_filled_slot(key)}; + } + + template + ValueT& at(const K& key) + { + const auto slot = find_filled_slot(key); + //throw + return _pairs[slot].second; + } + + template + const ValueT& at(const K& key) const + { + const auto slot = find_filled_slot(key); + //throw + return _pairs[slot].second; + } + + const ValueT& index(const uint32_t index) const + { + return _pairs[index].second; + } + + ValueT& index(const uint32_t index) + { + return _pairs[index].second; + } + + template + bool contains(const K& key) const noexcept + { + return find_filled_slot(key) != _num_filled; + } + + template + size_type count(const K& key) const noexcept + { + return find_filled_slot(key) == _num_filled ? 0 : 1; + //return find_sorted_bucket(key) == END ? 0 : 1; + //return find_hash_bucket(key) == END ? 0 : 1; + } + + template + std::pair equal_range(const K& key) + { + const auto found = find(key); + if (found.second == _num_filled) + return { found, found }; + else + return { found, std::next(found) }; + } + + void merge(HashMap& rhs) + { + if (empty()) { + *this = std::move(rhs); + return; + } + + for (auto rit = rhs.begin(); rit != rhs.end(); ) { + auto fit = find(rit->first); + if (fit == end()) { + insert_unique(rit->first, std::move(rit->second)); + rit = rhs.erase(rit); + } else { + ++rit; + } + } + } + + /// Returns the matching ValueT or nullptr if k isn't found. + bool try_get(const KeyT& key, ValueT& val) const noexcept + { + const auto slot = find_filled_slot(key); + const auto found = slot != _num_filled; + if (found) { + val = _pairs[slot].second; + } + return found; + } + + /// Returns the matching ValueT or nullptr if k isn't found. + ValueT* try_get(const KeyT& key) noexcept + { + const auto slot = find_filled_slot(key); + return slot != _num_filled ? &_pairs[slot].second : nullptr; + } + + /// Const version of the above + ValueT* try_get(const KeyT& key) const noexcept + { + const auto slot = find_filled_slot(key); + return slot != _num_filled ? &_pairs[slot].second : nullptr; + } + + /// set value if key exist + bool try_set(const KeyT& key, const ValueT& val) noexcept + { + const auto slot = find_filled_slot(key); + if (slot == _num_filled) + return false; + + _pairs[slot].second = val; + return true; + } + + /// set value if key exist + bool try_set(const KeyT& key, ValueT&& val) noexcept + { + const auto slot = find_filled_slot(key); + if (slot == _num_filled) + return false; + + _pairs[slot].second = std::move(val); + return true; + } + + /// Convenience function. + ValueT get_or_return_default(const KeyT& key) const noexcept + { + const auto slot = find_filled_slot(key); + return slot == _num_filled ? 
ValueT() : _pairs[slot].second; + } + + // ----------------------------------------------------- + std::pair do_insert(const value_type& value) noexcept + { + const auto key_hash = hash_key(value.first); + const auto bucket = find_or_allocate(value.first, key_hash); + const auto bempty = EMH_EMPTY(bucket); + if (bempty) { + EMH_NEW(value.first, value.second, bucket, key_hash); + } + + const auto slot = _index[bucket].slot & _mask; + return { {this, slot}, bempty }; + } + + std::pair do_insert(value_type&& value) noexcept + { + const auto key_hash = hash_key(value.first); + const auto bucket = find_or_allocate(value.first, key_hash); + const auto bempty = EMH_EMPTY(bucket); + if (bempty) { + EMH_NEW(std::move(value.first), std::move(value.second), bucket, key_hash); + } + + const auto slot = _index[bucket].slot & _mask; + return { {this, slot}, bempty }; + } + + template + std::pair do_insert(K&& key, V&& val) noexcept + { + const auto key_hash = hash_key(key); + const auto bucket = find_or_allocate(key, key_hash); + const auto bempty = EMH_EMPTY(bucket); + if (bempty) { + EMH_NEW(std::forward(key), std::forward(val), bucket, key_hash); + } + + const auto slot = _index[bucket].slot & _mask; + return { {this, slot}, bempty }; + } + + template + std::pair do_assign(K&& key, V&& val) noexcept + { + check_expand_need(); + const auto key_hash = hash_key(key); + const auto bucket = find_or_allocate(key, key_hash); + const auto bempty = EMH_EMPTY(bucket); + if (bempty) { + EMH_NEW(std::forward(key), std::forward(val), bucket, key_hash); + } else { + _pairs[_index[bucket].slot & _mask].second = std::move(val); + } + + const auto slot = _index[bucket].slot & _mask; + return { {this, slot}, bempty }; + } + + std::pair insert(const value_type& p) + { + check_expand_need(); + return do_insert(p); + } + + std::pair insert(value_type && p) + { + check_expand_need(); + return do_insert(std::move(p)); + } + + void insert(std::initializer_list ilist) + { + reserve(ilist.size() + _num_filled, false); + for (auto it = ilist.begin(); it != ilist.end(); ++it) + do_insert(*it); + } + + template + void insert(Iter first, Iter last) + { + reserve(std::distance(first, last) + _num_filled, false); + for (; first != last; ++first) + do_insert(first->first, first->second); + } + +#if 0 + template + void insert_unique(Iter begin, Iter end) + { + reserve(std::distance(begin, end) + _num_filled, false); + for (; begin != end; ++begin) { + insert_unique(*begin); + } + } +#endif + + template + size_type insert_unique(K&& key, V&& val) + { + check_expand_need(); + const auto key_hash = hash_key(key); + auto bucket = find_unique_bucket(key_hash); + EMH_NEW(std::forward(key), std::forward(val), bucket, key_hash); + return bucket; + } + + size_type insert_unique(value_type&& value) + { + return insert_unique(std::move(value.first), std::move(value.second)); + } + + size_type insert_unique(const value_type& value) + { + return insert_unique(value.first, value.second); + } + + template + std::pair emplace(Args&&... args) noexcept + { + check_expand_need(); + return do_insert(std::forward(args)...); + } + + //no any optimize for position + template + iterator emplace_hint(const_iterator hint, Args&&... args) + { + (void)hint; + check_expand_need(); + return do_insert(std::forward(args)...).first; + } + + template + std::pair try_emplace(const KeyT& k, Args&&... args) + { + check_expand_need(); + return do_insert(k, std::forward(args)...); + } + + template + std::pair try_emplace(KeyT&& k, Args&&... 
args) + { + check_expand_need(); + return do_insert(std::move(k), std::forward(args)...); + } + + template + size_type emplace_unique(Args&&... args) + { + return insert_unique(std::forward(args)...); + } + + std::pair insert_or_assign(const KeyT& key, ValueT&& val) { return do_assign(key, std::forward(val)); } + std::pair insert_or_assign(KeyT&& key, ValueT&& val) { return do_assign(std::move(key), std::forward(val)); } + + /// Return the old value or ValueT() if it didn't exist. + ValueT set_get(const KeyT& key, const ValueT& val) + { + check_expand_need(); + const auto key_hash = hash_key(key); + const auto bucket = find_or_allocate(key, key_hash); + if (EMH_EMPTY(bucket)) { + EMH_NEW(key, val, bucket, key_hash); + return ValueT(); + } else { + const auto slot = _index[bucket].slot & _mask; + ValueT old_value(val); + std::swap(_pairs[slot].second, old_value); + return old_value; + } + } + + /// Like std::map::operator[]. + ValueT& operator[](const KeyT& key) noexcept + { + check_expand_need(); + const auto key_hash = hash_key(key); + const auto bucket = find_or_allocate(key, key_hash); + if (EMH_EMPTY(bucket)) { + /* Check if inserting a value rather than overwriting an old entry */ + EMH_NEW(key, std::move(ValueT()), bucket, key_hash); + } + + const auto slot = _index[bucket].slot & _mask; + return _pairs[slot].second; + } + + ValueT& operator[](KeyT&& key) noexcept + { + check_expand_need(); + const auto key_hash = hash_key(key); + const auto bucket = find_or_allocate(key, key_hash); + if (EMH_EMPTY(bucket)) { + EMH_NEW(std::move(key), std::move(ValueT()), bucket, key_hash); + } + + const auto slot = _index[bucket].slot & _mask; + return _pairs[slot].second; + } + + /// Erase an element from the hash table. + /// return 0 if element was not found + size_type erase(const KeyT& key) noexcept + { + const auto key_hash = hash_key(key); + const auto sbucket = find_filled_bucket(key, key_hash); + if (sbucket == INACTIVE) + return 0; + + const auto main_bucket = key_hash & _mask; + erase_slot(sbucket, (size_type)main_bucket); + return 1; + } + + //iterator erase(const_iterator begin_it, const_iterator end_it) + iterator erase(const const_iterator& cit) noexcept + { + const auto slot = (size_type)(cit.kv_ - _pairs); + size_type main_bucket; + const auto sbucket = find_slot_bucket(slot, main_bucket); //TODO + erase_slot(sbucket, main_bucket); + return {this, slot}; + } + + //only last >= first + iterator erase(const_iterator first, const_iterator last) noexcept + { + auto esize = long(last.kv_ - first.kv_); + auto tsize = long((_pairs + _num_filled) - last.kv_); //last to tail size + auto next = first; + while (tsize -- > 0) { + if (esize-- <= 0) + break; + next = ++erase(next); + } + + //fast erase from last + next = this->last(); + while (esize -- > 0) + next = --erase(next); + + return {this, size_type(next.kv_ - _pairs)}; + } + + template + size_type erase_if(Pred pred) + { + auto old_size = size(); + for (auto it = begin(); it != end();) { + if (pred(*it)) + it = erase(it); + else + ++it; + } + return old_size - size(); + } + + static constexpr bool is_triviall_destructable() + { +#if __cplusplus >= 201402L || _MSC_VER > 1600 + return !(std::is_trivially_destructible::value && std::is_trivially_destructible::value); +#else + return !(std::is_pod::value && std::is_pod::value); +#endif + } + + static constexpr bool is_copy_trivially() + { +#if __cplusplus >= 201103L || _MSC_VER > 1600 + return (std::is_trivially_copyable::value && std::is_trivially_copyable::value); +#else + return 
(std::is_pod::value && std::is_pod::value); +#endif + } + + void clearkv() + { + if (is_triviall_destructable()) { + while (_num_filled --) + _pairs[_num_filled].~value_type(); + } + } + + /// Remove all elements, keeping full capacity. + void clear() noexcept + { + clearkv(); + + if (_num_filled > 0) + memset((char*)_index, INACTIVE, sizeof(_index[0]) * _num_buckets); + + _last = _num_filled = 0; + _etail = INACTIVE; + +#if EMH_HIGH_LOAD + _ehead = 0; +#endif + } + + void shrink_to_fit(const float min_factor = EMH_DEFAULT_LOAD_FACTOR / 4) + { + if (load_factor() < min_factor && bucket_count() > 10) //safe guard + rehash(_num_filled + 1); + } + +#if EMH_HIGH_LOAD + #define EMH_PREVET(i, n) i[n].slot + void set_empty() + { + auto prev = 0; + for (int32_t bucket = 1; bucket < _num_buckets; ++bucket) { + if (EMH_EMPTY(bucket)) { + if (prev != 0) { + EMH_PREVET(_index, bucket) = prev; + _index[_prev].next = -bucket; + } + else + _ehead = bucket; + prev = bucket; + } + } + + EMH_PREVET(_index, _ehead) = prev; + _index[_prev].next = 0-_ehead; + _ehead = 0-_index[_ehead].next; + } + + void clear_empty() + { + auto prev = EMH_PREVET(_index, _ehead); + while (prev != _ehead) { + _index[_prev].next = INACTIVE; + prev = EMH_PREVET(_index, prev); + } + _index[_ehead].next = INACTIVE; + _ehead = 0; + } + + //prev-ehead->next + size_type pop_empty(const size_type bucket) + { + const auto prev_bucket = EMH_PREVET(_index, bucket); + const int next_bucket = 0-_index[bucket].next; + + EMH_PREVET(_index, next_bucket) = prev_bucket; + _index[prev_bucket].next = -next_bucket; + + _ehead = next_bucket; + return bucket; + } + + //ehead->bucket->next + void push_empty(const int32_t bucket) + { + const int next_bucket = 0-_index[_ehead].next; + assert(next_bucket > 0); + + EMH_PREVET(_index, bucket) = _ehead; + _index[bucket].next = -next_bucket; + + EMH_PREVET(_index, next_bucket) = bucket; + _index[_ehead].next = -bucket; + // _ehead = bucket; + } +#endif + + /// Make room for this many elements + bool reserve(uint64_t num_elems, bool force) + { + (void)force; +#if EMH_HIGH_LOAD == 0 + const auto required_buckets = num_elems * _mlf >> 27; + if (EMH_LIKELY(required_buckets < _mask)) // && !force + return false; + +#elif EMH_HIGH_LOAD + const auto required_buckets = num_elems + num_elems * 1 / 9; + if (EMH_LIKELY(required_buckets < _mask)) + return false; + + else if (_num_buckets < 16 && _num_filled < _num_buckets) + return false; + + else if (_num_buckets > EMH_HIGH_LOAD) { + if (_ehead == 0) { + set_empty(); + return false; + } else if (/*_num_filled + 100 < _num_buckets && */_index[_ehead].next != 0-_ehead) { + return false; + } + } +#endif +#if EMH_STATIS + if (_num_filled > EMH_STATIS) dump_statics(); +#endif + + //assert(required_buckets < max_size()); + rehash(required_buckets + 2); + return true; + } + + static value_type* alloc_bucket(size_type num_buckets) + { +#ifdef EMH_ALLOC + auto new_pairs = aligned_alloc(32, (uint64_t)num_buckets * sizeof(value_type)); +#else + auto new_pairs = malloc((uint64_t)num_buckets * sizeof(value_type)); +#endif + return (value_type *)(new_pairs); + } + + static Index* alloc_index(size_type num_buckets) + { + auto new_index = (char*)malloc((uint64_t)(EAD + num_buckets) * sizeof(Index)); + return (Index *)(new_index); + } + + bool reserve(size_type required_buckets) noexcept + { + if (_num_filled != required_buckets) + return reserve(required_buckets, true); + + _last = 0; +#if EMH_HIGH_LOAD + _ehead = 0; +#endif + +#if EMH_SORT + std::sort(_pairs, _pairs + _num_filled, 
[this](const value_type & l, const value_type & r) { + const auto hashl = (size_type)hash_key(l.first) & _mask, hashr = (size_type)hash_key(r.first) & _mask; + return hashl < hashr; + //return l.first < r.first; + }); +#endif + + memset((char*)_index, INACTIVE, sizeof(_index[0]) * _num_buckets); + for (size_type slot = 0; slot < _num_filled; slot++) { + const auto& key = _pairs[slot].first; + const auto key_hash = hash_key(key); + const auto bucket = size_type(key_hash & _mask); + auto& next_bucket = _index[bucket].next; + if ((int)next_bucket < 0) + _index[bucket] = {1, slot | ((size_type)(key_hash) & ~_mask)}; + else { + _index[bucket].slot |= (size_type)(key_hash) & ~_mask; + next_bucket ++; + } + } + return true; + } + + void rebuild(size_type num_buckets) noexcept + { + free(_index); + auto new_pairs = (value_type*)alloc_bucket((size_type)(num_buckets * max_load_factor()) + 4); + if (is_copy_trivially()) { + if (_pairs) + memcpy((char*)new_pairs, (char*)_pairs, _num_filled * sizeof(value_type)); + } else { + for (size_type slot = 0; slot < _num_filled; slot++) { + new(new_pairs + slot) value_type(std::move(_pairs[slot])); + if (is_triviall_destructable()) + _pairs[slot].~value_type(); + } + } + free(_pairs); + _pairs = new_pairs; + _index = (Index*)alloc_index (num_buckets); + + memset((char*)_index, INACTIVE, sizeof(_index[0]) * num_buckets); + memset((char*)(_index + num_buckets), 0, sizeof(_index[0]) * EAD); + } + + void rehash(uint64_t required_buckets) + { + if (required_buckets < _num_filled) + return; + + assert(required_buckets < max_size()); + auto num_buckets = _num_filled > (1u << 16) ? (1u << 16) : 4u; + while (num_buckets < required_buckets) { num_buckets *= 2; } +#if EMH_SAVE_MEM + if (sizeof(KeyT) < sizeof(size_type) && num_buckets >= (1ul << (2 * 8))) + num_buckets = 2ul << (sizeof(KeyT) * 8); +#endif + +#if EMH_REHASH_LOG + auto last = _last; + size_type collision = 0; +#endif + +#if EMH_HIGH_LOAD + _ehead = 0; +#endif + _last = 0; + + _mask = num_buckets - 1; +#if EMH_PACK_TAIL > 1 + _last = _mask; + num_buckets += num_buckets * EMH_PACK_TAIL / 100; //add more 5-10% +#endif + _num_buckets = num_buckets; + + rebuild(num_buckets); + +#ifdef EMH_SORT + std::sort(_pairs, _pairs + _num_filled, [this](const value_type & l, const value_type & r) { + const auto hashl = hash_key(l.first), hashr = hash_key(r.first); + auto diff = int64_t((hashl & _mask) - (hashr & _mask)); + if (diff != 0) + return diff < 0; + return hashl < hashr; +// return l.first < r.first; + }); +#endif + + _etail = INACTIVE; + for (size_type slot = 0; slot < _num_filled; ++slot) { + const auto& key = _pairs[slot].first; + const auto key_hash = hash_key(key); + const auto bucket = find_unique_bucket(key_hash); + _index[bucket] = { bucket, slot | ((size_type)(key_hash) & ~_mask) }; + +#if EMH_REHASH_LOG + if (bucket != hash_main(bucket)) + collision ++; +#endif + } + +#if EMH_REHASH_LOG + if (_num_filled > EMH_REHASH_LOG) { + auto mbucket = _num_filled - collision; + char buff[255] = {0}; + sprintf(buff, " _num_filled/aver_size/K.V/pack/collision|last = %u/%.2lf/%s.%s/%zd|%.2lf%%,%.2lf%%", + _num_filled, double (_num_filled) / mbucket, typeid(KeyT).name(), typeid(ValueT).name(), sizeof(_pairs[0]), collision * 100.0 / _num_filled, last * 100.0 / _num_buckets); +#ifdef EMH_LOG + static uint32_t ihashs = 0; EMH_LOG() << "hash_nums = " << ihashs ++ << "|" <<__FUNCTION__ << "|" << buff << endl; +#else + puts(buff); +#endif + } +#endif + } + +private: + // Can we fit another element? 
+ bool check_expand_need() + { + return reserve(_num_filled, false); + } + + static void prefetch_heap_block(char* ctrl) + { + // Prefetch the heap-allocated memory region to resolve potential TLB + // misses. This is intended to overlap with execution of calculating the hash for a key. +#if __linux__ + __builtin_prefetch(static_cast(ctrl)); +#elif _WIN32 && defined(_M_IX86) + _mm_prefetch((const char*)ctrl, _MM_HINT_T0); +#endif + } + + size_type slot_to_bucket(const size_type slot) const noexcept + { + size_type main_bucket; + return find_slot_bucket(slot, main_bucket); //TODO + } + + //very slow + void erase_slot(const size_type sbucket, const size_type main_bucket) noexcept + { + const auto slot = _index[sbucket].slot & _mask; + const auto ebucket = erase_bucket(sbucket, main_bucket); + const auto last_slot = --_num_filled; + if (EMH_LIKELY(slot != last_slot)) { + const auto last_bucket = (_etail == INACTIVE || ebucket == _etail) + ? slot_to_bucket(last_slot) : _etail; + + _pairs[slot] = std::move(_pairs[last_slot]); + _index[last_bucket].slot = slot | (_index[last_bucket].slot & ~_mask); + } + + if (is_triviall_destructable()) + _pairs[last_slot].~value_type(); + + _etail = INACTIVE; + _index[ebucket] = {INACTIVE, 0}; +#if EMH_HIGH_LOAD + if (_ehead) { + if (10 * _num_filled < 8 * _num_buckets) + clear_empty(); + else if (ebucket) + push_empty(ebucket); + } +#endif + } + + size_type erase_bucket(const size_type bucket, const size_type main_bucket) noexcept + { + const auto next_bucket = _index[bucket].next; + if (bucket == main_bucket) { + if (main_bucket != next_bucket) { + const auto nbucket = _index[next_bucket].next; + _index[main_bucket] = { + (nbucket == next_bucket) ? main_bucket : nbucket, + _index[next_bucket].slot + }; + } + return next_bucket; + } + + const auto prev_bucket = find_prev_bucket(main_bucket, bucket); + _index[prev_bucket].next = (bucket == next_bucket) ? 
prev_bucket : next_bucket; + return bucket; + } + + // Find the slot with this key, or return bucket size + size_type find_slot_bucket(const size_type slot, size_type& main_bucket) const + { + const auto key_hash = hash_key(_pairs[slot].first); + const auto bucket = main_bucket = size_type(key_hash & _mask); + if (slot == (_index[bucket].slot & _mask)) + return bucket; + + auto next_bucket = _index[bucket].next; + while (true) { + if (EMH_LIKELY(slot == (_index[next_bucket].slot & _mask))) + return next_bucket; + next_bucket = _index[next_bucket].next; + } + + return INACTIVE; + } + + // Find the slot with this key, or return bucket size + size_type find_filled_bucket(const KeyT& key, uint64_t key_hash) const noexcept + { + const auto bucket = size_type(key_hash & _mask); + auto next_bucket = _index[bucket].next; + if (EMH_UNLIKELY((int)next_bucket < 0)) + return INACTIVE; + + const auto slot = _index[bucket].slot & _mask; + //prefetch_heap_block((char*)&_pairs[slot]); + if (EMH_EQHASH(bucket, key_hash)) { + if (EMH_LIKELY(_eq(key, _pairs[slot].first))) + return bucket; + } + if (next_bucket == bucket) + return INACTIVE; + + while (true) { + if (EMH_EQHASH(next_bucket, key_hash)) { + const auto slot = _index[next_bucket].slot & _mask; + if (EMH_LIKELY(_eq(key, _pairs[slot].first))) + return next_bucket; + } + + const auto nbucket = _index[next_bucket].next; + if (nbucket == next_bucket) + return INACTIVE; + next_bucket = nbucket; + } + + return INACTIVE; + } + + // Find the slot with this key, or return bucket size + template + size_type find_filled_slot(const K& key) const noexcept + { + const auto key_hash = hash_key(key); + const auto bucket = size_type(key_hash & _mask); + auto next_bucket = _index[bucket].next; + if ((int)next_bucket < 0) + return _num_filled; + + const auto slot = _index[bucket].slot & _mask; + //prefetch_heap_block((char*)&_pairs[slot]); + if (EMH_EQHASH(bucket, key_hash)) { + if (EMH_LIKELY(_eq(key, _pairs[slot].first))) + return slot; + } + if (next_bucket == bucket) + return _num_filled; + + while (true) { + if (EMH_EQHASH(next_bucket, key_hash)) { + const auto slot = _index[next_bucket].slot & _mask; + if (EMH_LIKELY(_eq(key, _pairs[slot].first))) + return slot; + } + + const auto nbucket = _index[next_bucket].next; + if (nbucket == next_bucket) + return _num_filled; + next_bucket = nbucket; + } + + return _num_filled; + } + +#if EMH_SORT + size_type find_hash_bucket(const KeyT& key) const noexcept + { + const auto key_hash = hash_key(key); + const auto bucket = size_type(key_hash & _mask); + const auto next_bucket = _index[bucket].next; + if ((int)next_bucket < 0) + return END; + + auto slot = _index[bucket].slot & _mask; + if (_eq(key, _pairs[slot++].first)) + return slot; + else if (next_bucket == bucket) + return END; + + while (true) { + const auto& okey = _pairs[slot++].first; + if (_eq(key, okey)) + return slot; + + const auto hasho = hash_key(okey); + if ((hasho & _mask) != bucket) + break; + else if (hasho > key_hash) + break; + else if (EMH_UNLIKELY(slot >= _num_filled)) + break; + } + + return END; + } + + //only for find/can not insert + size_type find_sorted_bucket(const KeyT& key) const noexcept + { + const auto key_hash = hash_key(key); + const auto bucket = size_type(key_hash & _mask); + const auto slots = (int)(_index[bucket].next); //TODO + if (slots < 0 /**|| key < _pairs[slot].first*/) + return END; + + const auto slot = _index[bucket].slot & _mask; + auto ormask = _index[bucket].slot & ~_mask; + auto hmask = (size_type)(key_hash) & ~_mask; + 
if ((hmask | ormask) != ormask) + return END; + + if (_eq(key, _pairs[slot].first)) + return slot; + else if (slots == 1 || key < _pairs[slot].first) + return END; + +#if EMH_SORT + if (key < _pairs[slot].first || key > _pairs[slots + slot - 1].first) + return END; +#endif + + for (size_type i = 1; i < slots; ++i) { + const auto& okey = _pairs[slot + i].first; + if (_eq(key, okey)) + return slot + i; + // else if (okey > key) + // return END; + } + + return END; + } +#endif + + //kick out bucket and find empty to occpuy + //it will break the origin link and relink again. + //before: main_bucket-->prev_bucket --> bucket --> next_bucket + //atfer : main_bucket-->prev_bucket --> (removed)--> new_bucket--> next_bucket + size_type kickout_bucket(const size_type kmain, const size_type bucket) noexcept + { + const auto next_bucket = _index[bucket].next; + const auto new_bucket = find_empty_bucket(next_bucket, 2); + const auto prev_bucket = find_prev_bucket(kmain, bucket); + + const auto last = next_bucket == bucket ? new_bucket : next_bucket; + _index[new_bucket] = {last, _index[bucket].slot}; + + _index[prev_bucket].next = new_bucket; + _index[bucket].next = INACTIVE; + + return bucket; + } + + /* + ** inserts a new key into a hash table; first, check whether key's main + ** bucket/position is free. If not, check whether colliding node/bucket is in its main + ** position or not: if it is not, move colliding bucket to an empty place and + ** put new key in its main position; otherwise (colliding bucket is in its main + ** position), new key goes to an empty position. + */ + template + size_type find_or_allocate(const K& key, uint64_t key_hash) noexcept + { + const auto bucket = size_type(key_hash & _mask); + auto next_bucket = _index[bucket].next; + prefetch_heap_block((char*)&_pairs[bucket]); + if ((int)next_bucket < 0) { +#if EMH_HIGH_LOAD + if (next_bucket != INACTIVE) + pop_empty(bucket); +#endif + return bucket; + } + + const auto slot = _index[bucket].slot & _mask; + if (EMH_EQHASH(bucket, key_hash)) + if (EMH_LIKELY(_eq(key, _pairs[slot].first))) + return bucket; + + //check current bucket_key is in main bucket or not + const auto kmain = hash_bucket(_pairs[slot].first); + if (kmain != bucket) + return kickout_bucket(kmain, bucket); + else if (next_bucket == bucket) + return _index[next_bucket].next = find_empty_bucket(next_bucket, 1); + + uint32_t csize = 1; + //find next linked bucket and check key + while (true) { + const auto eslot = _index[next_bucket].slot & _mask; + if (EMH_EQHASH(next_bucket, key_hash)) { + if (EMH_LIKELY(_eq(key, _pairs[eslot].first))) + return next_bucket; + } + + csize += 1; + const auto nbucket = _index[next_bucket].next; + if (nbucket == next_bucket) + break; + next_bucket = nbucket; + } + + //find a empty and link it to tail + const auto new_bucket = find_empty_bucket(next_bucket, csize); + prefetch_heap_block((char*)&_pairs[new_bucket]); + return _index[next_bucket].next = new_bucket; + } + + size_type find_unique_bucket(uint64_t key_hash) noexcept + { + const auto bucket = size_type(key_hash & _mask); + auto next_bucket = _index[bucket].next; + if ((int)next_bucket < 0) { +#if EMH_HIGH_LOAD + if (next_bucket != INACTIVE) + pop_empty(bucket); +#endif + return bucket; + } + + //check current bucket_key is in main bucket or not + const auto kmain = hash_main(bucket); + if (EMH_UNLIKELY(kmain != bucket)) + return kickout_bucket(kmain, bucket); + else if (EMH_UNLIKELY(next_bucket != bucket)) + next_bucket = find_last_bucket(next_bucket); + + return 
_index[next_bucket].next = find_empty_bucket(next_bucket, 2); + } + + /*** + Different probing techniques usually provide a trade-off between memory locality and avoidance of clustering. + Since Robin Hood hashing is relatively resilient to clustering (both primary and secondary), linear probing is the most cache friendly alternativeis typically used. + + It's the core algorithm of this hash map with highly optimization/benchmark. + normally linear probing is inefficient with high load factor, it use a new 3-way linear + probing strategy to search empty slot. from benchmark even the load factor > 0.9, it's more 2-3 timer fast than + one-way search strategy. + + 1. linear or quadratic probing a few cache line for less cache miss from input slot "bucket_from". + 2. the first search slot from member variant "_last", init with 0 + 3. the second search slot from calculated pos "(_num_filled + _last) & _mask", it's like a rand value + */ + // key is not in this mavalue. Find a place to put it. + size_type find_empty_bucket(const size_type bucket_from, uint32_t csize) noexcept + { + (void)csize; +#if EMH_HIGH_LOAD + if (_ehead) + return pop_empty(_ehead); +#endif + + auto bucket = bucket_from; + if (EMH_EMPTY(++bucket) || EMH_EMPTY(++bucket)) + return bucket; + +#ifdef EMH_QUADRATIC + constexpr size_type linear_probe_length = 2 * EMH_CACHE_LINE_SIZE / sizeof(Index);//16 + for (size_type offset = csize + 2, step = 4; offset <= linear_probe_length; ) { + bucket = (bucket_from + offset) & _mask; + if (EMH_EMPTY(bucket) || EMH_EMPTY(++bucket)) + return bucket; + offset += step; //7/8. 12. 16 + } +#else + constexpr size_type quadratic_probe_length = 6u; + for (size_type offset = 4u, step = 3u; step < quadratic_probe_length; ) { + bucket = (bucket_from + offset) & _mask; + if (EMH_EMPTY(bucket) || EMH_EMPTY(++bucket)) + return bucket; + offset += step++; + } +#endif + +#if EMH_PREFETCH + __builtin_prefetch(static_cast(_index + _last + 1), 0, EMH_PREFETCH); +#endif + + for (;;) { +#if EMH_PACK_TAIL + //find empty bucket and skip next + if (EMH_EMPTY(_last++))// || EMH_EMPTY(_last++)) + return _last++ - 1; + + if (EMH_UNLIKELY(_last >= _num_buckets)) + _last = 0; + + auto medium = (_mask / 4 + _last++) & _mask; + if (EMH_EMPTY(medium)) + return medium; +#else + _last &= _mask; + if (EMH_EMPTY(++_last))// || EMH_EMPTY(++_last)) + return _last; + + auto medium = (_num_buckets / 2 + _last) & _mask; + if (EMH_EMPTY(medium))// || EMH_EMPTY(++medium)) + return medium; +#endif + } + + return 0; + } + + size_type find_last_bucket(size_type main_bucket) const + { + auto next_bucket = _index[main_bucket].next; + if (next_bucket == main_bucket) + return main_bucket; + + while (true) { + const auto nbucket = _index[next_bucket].next; + if (nbucket == next_bucket) + return next_bucket; + next_bucket = nbucket; + } + } + + size_type find_prev_bucket(const size_type main_bucket, const size_type bucket) const + { + auto next_bucket = _index[main_bucket].next; + if (next_bucket == bucket) + return main_bucket; + + while (true) { + const auto nbucket = _index[next_bucket].next; + if (nbucket == bucket) + return next_bucket; + next_bucket = nbucket; + } + } + + size_type hash_bucket(const KeyT& key) const noexcept + { + return (size_type)hash_key(key) & _mask; + } + + size_type hash_main(const size_type bucket) const noexcept + { + const auto slot = _index[bucket].slot & _mask; + return (size_type)hash_key(_pairs[slot].first) & _mask; + } + +#if EMH_INT_HASH + static constexpr uint64_t KC = UINT64_C(11400714819323198485); 
+
+    size_type find_last_bucket(size_type main_bucket) const
+    {
+        auto next_bucket = _index[main_bucket].next;
+        if (next_bucket == main_bucket)
+            return main_bucket;
+
+        while (true) {
+            const auto nbucket = _index[next_bucket].next;
+            if (nbucket == next_bucket)
+                return next_bucket;
+            next_bucket = nbucket;
+        }
+    }
+
+    size_type find_prev_bucket(const size_type main_bucket, const size_type bucket) const
+    {
+        auto next_bucket = _index[main_bucket].next;
+        if (next_bucket == bucket)
+            return main_bucket;
+
+        while (true) {
+            const auto nbucket = _index[next_bucket].next;
+            if (nbucket == bucket)
+                return next_bucket;
+            next_bucket = nbucket;
+        }
+    }
+
+    size_type hash_bucket(const KeyT& key) const noexcept
+    {
+        return (size_type)hash_key(key) & _mask;
+    }
+
+    size_type hash_main(const size_type bucket) const noexcept
+    {
+        const auto slot = _index[bucket].slot & _mask;
+        return (size_type)hash_key(_pairs[slot].first) & _mask;
+    }
+
+#if EMH_INT_HASH
+    static constexpr uint64_t KC = UINT64_C(11400714819323198485);
+    static uint64_t hash64(uint64_t key)
+    {
+#if __SIZEOF_INT128__ && EMH_INT_HASH == 1
+        __uint128_t r = key; r *= KC;
+        return (uint64_t)(r >> 64) + (uint64_t)r;
+#elif EMH_INT_HASH == 2
+        //MurmurHash3Mixer
+        uint64_t h = key;
+        h ^= h >> 33;
+        h *= 0xff51afd7ed558ccd;
+        h ^= h >> 33;
+        h *= 0xc4ceb9fe1a85ec53;
+        h ^= h >> 33;
+        return h;
+#elif _WIN64 && EMH_INT_HASH == 1
+        uint64_t high;
+        return _umul128(key, KC, &high) + high;
+#elif EMH_INT_HASH == 3
+        auto ror  = (key >> 32) | (key << 32);
+        auto low  = key * 0xA24BAED4963EE407ull;
+        auto high = ror * 0x9FB21C651E98DF25ull;
+        auto mix  = low + high;
+        return mix;
+#elif EMH_INT_HASH == 1
+        uint64_t r = key * UINT64_C(0xca4bcaa75ec3f625);
+        return (r >> 32) + r;
+#elif EMH_WYHASH64
+        return wyhash64(key, KC);
+#else
+        uint64_t x = key;
+        x = (x ^ (x >> 30)) * UINT64_C(0xbf58476d1ce4e5b9);
+        x = (x ^ (x >> 27)) * UINT64_C(0x94d049bb133111eb);
+        x = x ^ (x >> 31);
+        return x;
+#endif
+    }
+#endif
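+
+    /*
+     * [Editorial note, not part of the imported emhash sources.]
+     * KC == 11400714819323198485 == 0x9E3779B97F4A7C15 == floor(2^64 / phi),
+     * the classic Fibonacci-hashing multiplier. The EMH_INT_HASH == 1
+     * variants compute the 128-bit product key * KC and fold the high half
+     * back into the low half, e.g. in portable C++:
+     *
+     *   __uint128_t r = (__uint128_t)key * KC;
+     *   uint64_t    h = (uint64_t)(r >> 64) + (uint64_t)r;
+     *
+     * The fold matters because hash_bucket() keeps only the low bits
+     * (hash & _mask); without it, the upper bits of the product would
+     * never influence the chosen bucket.
+     */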
+
+#if EMH_WYHASH_HASH
+    //#define WYHASH_CONDOM 1
+    static uint64_t wymix(uint64_t A, uint64_t B)
+    {
+#if defined(__SIZEOF_INT128__)
+        __uint128_t r = A; r *= B;
+#if WYHASH_CONDOM2
+        A ^= (uint64_t)r; B ^= (uint64_t)(r >> 64);
+#else
+        A = (uint64_t)r; B = (uint64_t)(r >> 64);
+#endif
+
+#elif defined(_MSC_VER) && defined(_M_X64)
+#if WYHASH_CONDOM2
+        uint64_t a, b;
+        a = _umul128(A, B, &b);
+        A ^= a; B ^= b;
+#else
+        A = _umul128(A, B, &B);
+#endif
+#else
+        uint64_t ha = A >> 32, hb = B >> 32, la = (uint32_t)A, lb = (uint32_t)B, hi, lo;
+        uint64_t rh = ha * hb, rm0 = ha * lb, rm1 = hb * la, rl = la * lb, t = rl + (rm0 << 32), c = t < rl;
+        lo = t + (rm1 << 32); c += lo < t; hi = rh + (rm0 >> 32) + (rm1 >> 32) + c;
+#if WYHASH_CONDOM2
+        A ^= lo; B ^= hi;
+#else
+        A = lo; B = hi;
+#endif
+#endif
+        return A ^ B;
+    }
+
+    //multiply and xor mix function, aka MUM
+    static inline uint64_t wyr8(const uint8_t *p) { uint64_t v; memcpy(&v, p, 8); return v; }
+    static inline uint64_t wyr4(const uint8_t *p) { uint32_t v; memcpy(&v, p, 4); return v; }
+    static inline uint64_t wyr3(const uint8_t *p, size_t k) {
+        return (((uint64_t)p[0]) << 16) | (((uint64_t)p[k >> 1]) << 8) | p[k - 1];
+    }
+
+    inline static const uint64_t secret[4] = {
+        0x2d358dccaa6c78a5ull, 0x8bb84b93962eacc9ull,
+        0x4b33a62ed433d4a3ull, 0x4d5a2da51de1aa47ull};
+public:
+    //wyhash main function https://github.com/wangyi-fudan/wyhash
+    static uint64_t wyhashstr(const char *key, const size_t len)
+    {
+        uint64_t a = 0, b = 0, seed = secret[0];
+        const uint8_t *p = (const uint8_t*)key;
+        if (EMH_LIKELY(len <= 16)) {
+            if (EMH_LIKELY(len >= 4)) {
+                const auto half = (len >> 3) << 2;
+                a = (wyr4(p) << 32U) | wyr4(p + half); p += len - 4;
+                b = (wyr4(p) << 32U) | wyr4(p - half);
+            } else if (len) {
+                a = wyr3(p, len);
+            }
+        } else {
+            size_t i = len;
+            if (EMH_UNLIKELY(i > 48)) {
+                uint64_t see1 = seed, see2 = seed;
+                do {
+                    seed = wymix(wyr8(p + 0) ^ secret[1], wyr8(p + 8) ^ seed);
+                    see1 = wymix(wyr8(p + 16) ^ secret[2], wyr8(p + 24) ^ see1);
+                    see2 = wymix(wyr8(p + 32) ^ secret[3], wyr8(p + 40) ^ see2);
+                    p += 48; i -= 48;
+                } while (EMH_LIKELY(i > 48));
+                seed ^= see1 ^ see2;
+            }
+            while (i > 16) {
+                seed = wymix(wyr8(p) ^ secret[1], wyr8(p + 8) ^ seed);
+                i -= 16; p += 16;
+            }
+            a = wyr8(p + i - 16);
+            b = wyr8(p + i - 8);
+        }
+
+        return wymix(secret[1] ^ len, wymix(a ^ secret[1], b ^ seed));
+    }
+#endif
+
+private:
+    template<typename UType, typename std::enable_if<std::is_integral<UType>::value, uint32_t>::type = 0>
+    inline uint64_t hash_key(const UType key) const
+    {
+#if EMH_INT_HASH
+        return hash64(key);
+#elif EMH_IDENTITY_HASH
+        return key + (key >> 24);
+#else
+        return _hasher(key);
+#endif
+    }
+
+    template<typename UType, typename std::enable_if<std::is_same<UType, std::string>::value, uint32_t>::type = 0>
+    inline uint64_t hash_key(const UType& key) const
+    {
+#if EMH_WYHASH_HASH
+        return wyhashstr(key.data(), key.size());
+#else
+        return _hasher(key);
+#endif
+    }
+
+    template<typename UType, typename std::enable_if<!std::is_integral<UType>::value && !std::is_same<UType, std::string>::value, uint32_t>::type = 0>
+    inline uint64_t hash_key(const UType& key) const
+    {
+        return _hasher(key);
+    }
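+
+    /*
+     * [Editorial note, not part of the imported emhash sources.]
+     * The three hash_key() overloads above dispatch via std::enable_if
+     * (SFINAE) on the key type: integral keys use hash64() when
+     * EMH_INT_HASH is enabled, std::string keys use wyhashstr() when
+     * EMH_WYHASH_HASH is enabled, and any other key type falls back to the
+     * user-supplied HashT functor. So a map keyed on uint64_t never touches
+     * the string hash, and a map keyed on a custom struct always uses the
+     * caller's hasher.
+     */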
+
+private:
+    Index* _index;
+    value_type* _pairs;
+
+    HashT _hasher;
+    EqT _eq;
+    uint32_t _mlf;
+    size_type _mask;
+    size_type _num_buckets;
+    size_type _num_filled;
+    size_type _last;
+#if EMH_HIGH_LOAD
+    size_type _ehead;
+#endif
+    size_type _etail;
+};
+} // namespace emhash
+
diff --git a/src/third_party/rapidhash/README.ninja b/src/third_party/rapidhash/README.ninja
new file mode 100644
index 0000000000..1d74b67c1f
--- /dev/null
+++ b/src/third_party/rapidhash/README.ninja
@@ -0,0 +1,7 @@
+Description: Very fast, high quality, platform-independent hashing algorithm.
+Version: commit 4a6b2570e868536be84800353efd92c699f37d2c
+URL: https://github.com/Nicoshev/rapidhash
+Copyright: Copyright (C) 2024 Nicolas De Carli, Based on 'wyhash', by Wang Yi
+SPDX-License-Identifier: BSD-2-Clause
+Local changes:
+ - Changed to UNIX line endings
diff --git a/src/third_party/rapidhash/rapidhash.h b/src/third_party/rapidhash/rapidhash.h
new file mode 100755
index 0000000000..463f733d85
--- /dev/null
+++ b/src/third_party/rapidhash/rapidhash.h
@@ -0,0 +1,323 @@
+/*
+ * rapidhash - Very fast, high quality, platform-independent hashing algorithm.
+ * Copyright (C) 2024 Nicolas De Carli
+ *
+ * Based on 'wyhash', by Wang Yi
+ *
+ * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following disclaimer
+ *      in the documentation and/or other materials provided with the
+ *      distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * You can contact the author at:
+ *   - rapidhash source repository: https://github.com/Nicoshev/rapidhash
+ */
+
+/*
+ * Includes.
+ */
+#include <stdint.h>
+#include <string.h>
+#if defined(_MSC_VER)
+  #include <intrin.h>
+  #if defined(_M_X64) && !defined(_M_ARM64EC)
+    #pragma intrinsic(_umul128)
+  #endif
+#endif
+
+/*
+ * C++ macros.
+ *
+ * RAPIDHASH_INLINE can be overridden to be stronger than a hint, i.e. by adding __attribute__((always_inline)).
+ */
+#ifdef __cplusplus
+  #define RAPIDHASH_NOEXCEPT noexcept
+  #define RAPIDHASH_CONSTEXPR constexpr
+  #ifndef RAPIDHASH_INLINE
+    #define RAPIDHASH_INLINE inline
+  #endif
+#else
+  #define RAPIDHASH_NOEXCEPT
+  #define RAPIDHASH_CONSTEXPR static const
+  #ifndef RAPIDHASH_INLINE
+    #define RAPIDHASH_INLINE static inline
+  #endif
+#endif
+
+/*
+ * Protection macro, alters behaviour of rapid_mum multiplication function.
+ *
+ * RAPIDHASH_FAST: Normal behavior, max speed.
+ * RAPIDHASH_PROTECTED: Extra protection against entropy loss.
+ */
+#ifndef RAPIDHASH_PROTECTED
+  #define RAPIDHASH_FAST
+#elif defined(RAPIDHASH_FAST)
+  #error "cannot define RAPIDHASH_PROTECTED and RAPIDHASH_FAST simultaneously."
+#endif
+
+/*
+ * Unrolling macros, changes code definition for main hash function.
+ *
+ * RAPIDHASH_COMPACT: Legacy variant, each loop processes 48 bytes.
+ * RAPIDHASH_UNROLLED: Unrolled variant, each loop processes 96 bytes.
+ *
+ * Most modern CPUs should benefit from having RAPIDHASH_UNROLLED.
+ *
+ * These macros do not alter the output hash.
+ */
+#ifndef RAPIDHASH_COMPACT
+  #define RAPIDHASH_UNROLLED
+#elif defined(RAPIDHASH_UNROLLED)
+  #error "cannot define RAPIDHASH_COMPACT and RAPIDHASH_UNROLLED simultaneously."
+#endif
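+
+/*
+ * [Editorial note, not part of upstream rapidhash.]
+ * Both switches above are compile-time options, e.g. building with
+ * -DRAPIDHASH_PROTECTED or -DRAPIDHASH_COMPACT. Note the asymmetry:
+ * RAPIDHASH_COMPACT only changes loop structure and leaves the resulting
+ * hashes identical, while RAPIDHASH_PROTECTED changes rapid_mum() itself
+ * and therefore produces different hash values than the default build.
+ */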
+
+/*
+ * Likely and unlikely macros.
+ */
+#if defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__clang__)
+  #define _likely_(x)  __builtin_expect(x,1)
+  #define _unlikely_(x)  __builtin_expect(x,0)
+#else
+  #define _likely_(x) (x)
+  #define _unlikely_(x) (x)
+#endif
+
+/*
+ * Endianness macros.
+ */
+#ifndef RAPIDHASH_LITTLE_ENDIAN
+  #if defined(_WIN32) || defined(__LITTLE_ENDIAN__) || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+    #define RAPIDHASH_LITTLE_ENDIAN
+  #elif defined(__BIG_ENDIAN__) || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    #define RAPIDHASH_BIG_ENDIAN
+  #else
+    #warning "could not determine endianness! Falling back to little endian."
+    #define RAPIDHASH_LITTLE_ENDIAN
+  #endif
+#endif
+
+/*
+ * Default seed.
+ */
+#define RAPID_SEED (0xbdd89aa982704029ull)
+
+/*
+ * Default secret parameters.
+ */
+RAPIDHASH_CONSTEXPR uint64_t rapid_secret[3] = {0x2d358dccaa6c78a5ull, 0x8bb84b93962eacc9ull, 0x4b33a62ed433d4a3ull};
+
+/*
+ * 64*64 -> 128bit multiply function.
+ *
+ * @param A Address of 64-bit number.
+ * @param B Address of 64-bit number.
+ *
+ * Calculates 128-bit C = *A * *B.
+ *
+ * When RAPIDHASH_FAST is defined:
+ * Overwrites A contents with C's low 64 bits.
+ * Overwrites B contents with C's high 64 bits.
+ *
+ * When RAPIDHASH_PROTECTED is defined:
+ * Xors and overwrites A contents with C's low 64 bits.
+ * Xors and overwrites B contents with C's high 64 bits.
+ */
+RAPIDHASH_INLINE void rapid_mum(uint64_t *A, uint64_t *B) RAPIDHASH_NOEXCEPT {
+#if defined(__SIZEOF_INT128__)
+  __uint128_t r=*A; r*=*B;
+  #ifdef RAPIDHASH_PROTECTED
+  *A^=(uint64_t)r; *B^=(uint64_t)(r>>64);
+  #else
+  *A=(uint64_t)r; *B=(uint64_t)(r>>64);
+  #endif
+#elif defined(_MSC_VER) && (defined(_WIN64) || defined(_M_HYBRID_CHPE_ARM64))
+  #if defined(_M_X64)
+    #ifdef RAPIDHASH_PROTECTED
+    uint64_t a, b;
+    a=_umul128(*A,*B,&b);
+    *A^=a; *B^=b;
+    #else
+    *A=_umul128(*A,*B,B);
+    #endif
+  #else
+    #ifdef RAPIDHASH_PROTECTED
+    uint64_t a, b;
+    b = __umulh(*A, *B);
+    a = *A * *B;
+    *A^=a; *B^=b;
+    #else
+    uint64_t c = __umulh(*A, *B);
+    *A = *A * *B;
+    *B = c;
+    #endif
+  #endif
+#else
+  uint64_t ha=*A>>32, hb=*B>>32, la=(uint32_t)*A, lb=(uint32_t)*B, hi, lo;
+  uint64_t rh=ha*hb, rm0=ha*lb, rm1=hb*la, rl=la*lb, t=rl+(rm0<<32), c=t<rl;
+  lo=t+(rm1<<32); c+=lo<t; hi=rh+(rm0>>32)+(rm1>>32)+c;
+  #ifdef RAPIDHASH_PROTECTED
+  *A^=lo; *B^=hi;
+  #else
+  *A=lo; *B=hi;
+  #endif
+#endif
+}
+
+/*
+ * Multiply and xor mix function.
+ *
+ * @param A 64-bit number.
+ * @param B 64-bit number.
+ *
+ * Calculates 128-bit C = A * B.
+ * Returns 64-bit xor between high and low 64 bits of C.
+ */
+RAPIDHASH_INLINE uint64_t rapid_mix(uint64_t A, uint64_t B) RAPIDHASH_NOEXCEPT { rapid_mum(&A,&B); return A^B; }
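+
+/*
+ * [Editorial note, not part of upstream rapidhash.]
+ * rapid_mix() is the same MUM (MUltiply and Mix) primitive as wymix() in
+ * the emhash header above: the two 64-bit inputs are multiplied into a
+ * 128-bit product and its halves are xored together. The
+ * RAPIDHASH_PROTECTED build first xors the halves back into the inputs,
+ * so a zero operand cannot erase previously accumulated state:
+ * rapid_mix(0, x) is 0 in the default (fast) build but x in the
+ * protected build.
+ */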
+
+/*
+ * Read functions.
+ */
+#ifdef RAPIDHASH_LITTLE_ENDIAN
+RAPIDHASH_INLINE uint64_t rapid_read64(const uint8_t *p) RAPIDHASH_NOEXCEPT { uint64_t v; memcpy(&v, p, sizeof(uint64_t)); return v;}
+RAPIDHASH_INLINE uint64_t rapid_read32(const uint8_t *p) RAPIDHASH_NOEXCEPT { uint32_t v; memcpy(&v, p, sizeof(uint32_t)); return v;}
+#elif defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__clang__)
+RAPIDHASH_INLINE uint64_t rapid_read64(const uint8_t *p) RAPIDHASH_NOEXCEPT { uint64_t v; memcpy(&v, p, sizeof(uint64_t)); return __builtin_bswap64(v);}
+RAPIDHASH_INLINE uint64_t rapid_read32(const uint8_t *p) RAPIDHASH_NOEXCEPT { uint32_t v; memcpy(&v, p, sizeof(uint32_t)); return __builtin_bswap32(v);}
+#elif defined(_MSC_VER)
+RAPIDHASH_INLINE uint64_t rapid_read64(const uint8_t *p) RAPIDHASH_NOEXCEPT { uint64_t v; memcpy(&v, p, sizeof(uint64_t)); return _byteswap_uint64(v);}
+RAPIDHASH_INLINE uint64_t rapid_read32(const uint8_t *p) RAPIDHASH_NOEXCEPT { uint32_t v; memcpy(&v, p, sizeof(uint32_t)); return _byteswap_ulong(v);}
+#else
+RAPIDHASH_INLINE uint64_t rapid_read64(const uint8_t *p) RAPIDHASH_NOEXCEPT {
+  uint64_t v; memcpy(&v, p, 8);
+  return (((v >> 56) & 0xff)| ((v >> 40) & 0xff00)| ((v >> 24) & 0xff0000)| ((v >> 8) & 0xff000000)| ((v << 8) & 0xff00000000)| ((v << 24) & 0xff0000000000)| ((v << 40) & 0xff000000000000)| ((v << 56) & 0xff00000000000000));
+}
+RAPIDHASH_INLINE uint64_t rapid_read32(const uint8_t *p) RAPIDHASH_NOEXCEPT {
+  uint32_t v; memcpy(&v, p, 4);
+  return (((v >> 24) & 0xff)| ((v >> 8) & 0xff00)| ((v << 8) & 0xff0000)| ((v << 24) & 0xff000000));
+}
+#endif
+
+/*
+ * Reads and combines 3 bytes of input.
+ *
+ * @param p Buffer to read from.
+ * @param k Length of @p, in bytes.
+ *
+ * Always reads and combines 3 bytes from memory.
+ * Guarantees to read each buffer position at least once.
+ *
+ * Returns a 64-bit value containing all three bytes read.
+ */
+RAPIDHASH_INLINE uint64_t rapid_readSmall(const uint8_t *p, size_t k) RAPIDHASH_NOEXCEPT { return (((uint64_t)p[0])<<56)|(((uint64_t)p[k>>1])<<32)|p[k-1];}
+
+/*
+ * rapidhash main function.
+ *
+ * @param key Buffer to be hashed.
+ * @param len @key length, in bytes.
+ * @param seed 64-bit seed used to alter the hash result predictably.
+ * @param secret Triplet of 64-bit secrets used to alter hash result predictably.
+ *
+ * Returns a 64-bit hash.
+ */
+RAPIDHASH_INLINE uint64_t rapidhash_internal(const void *key, size_t len, uint64_t seed, const uint64_t* secret) RAPIDHASH_NOEXCEPT {
+  const uint8_t *p=(const uint8_t *)key; seed^=rapid_mix(seed^secret[0],secret[1])^len;  uint64_t a, b;
+  if(_likely_(len<=16)){
+    if(_likely_(len>=4)){
+      const uint8_t * plast = p + len - 4;
+      a = (rapid_read32(p) << 32) | rapid_read32(plast);
+      const uint64_t delta = ((len&24)>>(len>>3));
+      b = ((rapid_read32(p + delta) << 32) | rapid_read32(plast - delta)); }
+    else if(_likely_(len>0)){ a=rapid_readSmall(p,len); b=0;}
+    else a=b=0;
+  }
+  else{
+    size_t i=len;
+    if(_unlikely_(i>48)){
+      uint64_t see1=seed, see2=seed;
+#ifdef RAPIDHASH_UNROLLED
+      while(_likely_(i>=96)){
+        seed=rapid_mix(rapid_read64(p)^secret[0],rapid_read64(p+8)^seed);
+        see1=rapid_mix(rapid_read64(p+16)^secret[1],rapid_read64(p+24)^see1);
+        see2=rapid_mix(rapid_read64(p+32)^secret[2],rapid_read64(p+40)^see2);
+        seed=rapid_mix(rapid_read64(p+48)^secret[0],rapid_read64(p+56)^seed);
+        see1=rapid_mix(rapid_read64(p+64)^secret[1],rapid_read64(p+72)^see1);
+        see2=rapid_mix(rapid_read64(p+80)^secret[2],rapid_read64(p+88)^see2);
+        p+=96; i-=96;
+      }
+      if(_unlikely_(i>=48)){
+        seed=rapid_mix(rapid_read64(p)^secret[0],rapid_read64(p+8)^seed);
+        see1=rapid_mix(rapid_read64(p+16)^secret[1],rapid_read64(p+24)^see1);
+        see2=rapid_mix(rapid_read64(p+32)^secret[2],rapid_read64(p+40)^see2);
+        p+=48; i-=48;
+      }
+#else
+      do {
+        seed=rapid_mix(rapid_read64(p)^secret[0],rapid_read64(p+8)^seed);
+        see1=rapid_mix(rapid_read64(p+16)^secret[1],rapid_read64(p+24)^see1);
+        see2=rapid_mix(rapid_read64(p+32)^secret[2],rapid_read64(p+40)^see2);
+        p+=48; i-=48;
+      } while (_likely_(i>=48));
+#endif
+      seed^=see1^see2;
+    }
+    if(i>16){
+      seed=rapid_mix(rapid_read64(p)^secret[2],rapid_read64(p+8)^seed^secret[1]);
+      if(i>32)
+        seed=rapid_mix(rapid_read64(p+16)^secret[2],rapid_read64(p+24)^seed);
+    }
+    a=rapid_read64(p+i-16); b=rapid_read64(p+i-8);
+  }
+  a^=secret[1]; b^=seed; rapid_mum(&a,&b);
+  return rapid_mix(a^secret[0]^len,b^secret[1]);
+}
+
+/*
+ * rapidhash default seeded hash function.
+ *
+ * @param key Buffer to be hashed.
+ * @param len @key length, in bytes.
+ * @param seed 64-bit seed used to alter the hash result predictably.
+ *
+ * Calls rapidhash_internal using provided parameters and default secrets.
+ *
+ * Returns a 64-bit hash.
+ */
+RAPIDHASH_INLINE uint64_t rapidhash_withSeed(const void *key, size_t len, uint64_t seed) RAPIDHASH_NOEXCEPT {
+  return rapidhash_internal(key, len, seed, rapid_secret);
+}
+
+/*
+ * rapidhash default hash function.
+ *
+ * @param key Buffer to be hashed.
+ * @param len @key length, in bytes.
+ *
+ * Calls rapidhash_withSeed using provided parameters and the default seed.
+ *
+ * Returns a 64-bit hash.
+ */
+RAPIDHASH_INLINE uint64_t rapidhash(const void *key, size_t len) RAPIDHASH_NOEXCEPT {
+  return rapidhash_withSeed(key, len, RAPID_SEED);
+}
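+
+/*
+ * [Editorial usage sketch, not part of upstream rapidhash.]
+ * Both entry points take a raw byte buffer and a length; equal input and
+ * seed give the same 64-bit value on every platform and endianness:
+ *
+ *   #include "rapidhash.h"
+ *
+ *   const char cmd[] = "cc -c foo.c -o foo.o";          // hypothetical input
+ *   uint64_t h1 = rapidhash(cmd, sizeof(cmd) - 1);
+ *   uint64_t h2 = rapidhash_withSeed(cmd, sizeof(cmd) - 1, RAPID_SEED);
+ *   // h1 == h2: rapidhash() simply forwards the default seed.
+ */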