From d85cfed45295c78cbd3f3a3c7bd4a186b279f31f Mon Sep 17 00:00:00 2001 From: "Steinar H. Gunderson" Date: Fri, 1 Nov 2024 11:39:06 +0100 Subject: [PATCH 1/8] Reuse some vectors in ManifestParser. For a no-op build of Chromium (Linux, Zen 2), this reduces time spent from 5.76 to 5.48 seconds. --- src/manifest_parser.cc | 44 +++++++++++++++++++++++------------------- src/manifest_parser.h | 9 +++++++++ 2 files changed, 33 insertions(+), 20 deletions(-) diff --git a/src/manifest_parser.cc b/src/manifest_parser.cc index c4b2980164..373dc65a83 100644 --- a/src/manifest_parser.cc +++ b/src/manifest_parser.cc @@ -209,14 +209,16 @@ bool ManifestParser::ParseDefault(string* err) { } bool ManifestParser::ParseEdge(string* err) { - vector ins, outs, validations; + ins_.clear(); + outs_.clear(); + validations_.clear(); { EvalString out; if (!lexer_.ReadPath(&out, err)) return false; while (!out.empty()) { - outs.push_back(out); + outs_.push_back(std::move(out)); out.Clear(); if (!lexer_.ReadPath(&out, err)) @@ -233,12 +235,12 @@ bool ManifestParser::ParseEdge(string* err) { return false; if (out.empty()) break; - outs.push_back(out); + outs_.push_back(std::move(out)); ++implicit_outs; } } - if (outs.empty()) + if (outs_.empty()) return lexer_.Error("expected path", err); if (!ExpectToken(Lexer::COLON, err)) @@ -259,7 +261,7 @@ bool ManifestParser::ParseEdge(string* err) { return false; if (in.empty()) break; - ins.push_back(in); + ins_.push_back(std::move(in)); } // Add all implicit deps, counting how many as we go. @@ -271,7 +273,7 @@ bool ManifestParser::ParseEdge(string* err) { return false; if (in.empty()) break; - ins.push_back(in); + ins_.push_back(std::move(in)); ++implicit; } } @@ -285,7 +287,7 @@ bool ManifestParser::ParseEdge(string* err) { return false; if (in.empty()) break; - ins.push_back(in); + ins_.push_back(std::move(in)); ++order_only; } } @@ -298,7 +300,7 @@ bool ManifestParser::ParseEdge(string* err) { return false; if (validation.empty()) break; - validations.push_back(validation); + validations_.push_back(std::move(validation)); } } @@ -329,9 +331,9 @@ bool ManifestParser::ParseEdge(string* err) { edge->pool_ = pool; } - edge->outputs_.reserve(outs.size()); - for (size_t i = 0, e = outs.size(); i != e; ++i) { - string path = outs[i].Evaluate(env); + edge->outputs_.reserve(outs_.size()); + for (size_t i = 0, e = outs_.size(); i != e; ++i) { + string path = outs_[i].Evaluate(env); if (path.empty()) return lexer_.Error("empty path", err); uint64_t slash_bits; @@ -351,8 +353,8 @@ bool ManifestParser::ParseEdge(string* err) { } edge->implicit_outs_ = implicit_outs; - edge->inputs_.reserve(ins.size()); - for (vector::iterator i = ins.begin(); i != ins.end(); ++i) { + edge->inputs_.reserve(ins_.size()); + for (vector::iterator i = ins_.begin(); i != ins_.end(); ++i) { string path = i->Evaluate(env); if (path.empty()) return lexer_.Error("empty path", err); @@ -363,9 +365,9 @@ bool ManifestParser::ParseEdge(string* err) { edge->implicit_deps_ = implicit; edge->order_only_deps_ = order_only; - edge->validations_.reserve(validations.size()); - for (std::vector::iterator v = validations.begin(); - v != validations.end(); ++v) { + edge->validations_.reserve(validations_.size()); + for (std::vector::iterator v = validations_.begin(); + v != validations_.end(); ++v) { string path = v->Evaluate(env); if (path.empty()) return lexer_.Error("empty path", err); @@ -419,14 +421,16 @@ bool ManifestParser::ParseFileInclude(bool new_scope, string* err) { return false; string path = eval.Evaluate(env_); 
- ManifestParser subparser(state_, file_reader_, options_); + if (subparser_ == nullptr) { + subparser_.reset(new ManifestParser(state_, file_reader_, options_)); + } if (new_scope) { - subparser.env_ = new BindingEnv(env_); + subparser_->env_ = new BindingEnv(env_); } else { - subparser.env_ = env_; + subparser_->env_ = env_; } - if (!subparser.Load(path, err, &lexer_)) + if (!subparser_->Load(path, err, &lexer_)) return false; if (!ExpectToken(Lexer::NEWLINE, err)) diff --git a/src/manifest_parser.h b/src/manifest_parser.h index db6812dce4..ce37759676 100644 --- a/src/manifest_parser.h +++ b/src/manifest_parser.h @@ -17,6 +17,9 @@ #include "parser.h" +#include +#include + struct BindingEnv; struct EvalString; @@ -63,6 +66,12 @@ struct ManifestParser : public Parser { BindingEnv* env_; ManifestParserOptions options_; bool quiet_; + + // ins_/out_/validations_ are reused across invocations to ParseEdge(), + // to save on the otherwise constant memory reallocation. + // subparser_ is reused solely to get better reuse out ins_/outs_/validation_. + std::unique_ptr subparser_; + std::vector ins_, outs_, validations_; }; #endif // NINJA_MANIFEST_PARSER_H_ From 40efd00fc08c8117294c48c76646a11130f45c69 Mon Sep 17 00:00:00 2001 From: "Steinar H. Gunderson" Date: Thu, 31 Oct 2024 15:59:10 +0100 Subject: [PATCH 2/8] Apply a short-vector optimization to EvalString. This very often holds only a single RAW token, so we do not need to allocate elements on an std::vector for it in the common case. For a no-op build of Chromium (Linux, Zen 2), this reduces time spent from 5.48 to 5.14 seconds. Note that this opens up for a potential optimization where EvalString::Evaluate() could just return a StringPiece, without making a std::string out of it (which requires allocation; this is about 5% of remaining runtime). However, this would also require that CanonicalizePath() somehow learned to work with StringPiece (presumably allocating a new StringPiece if and only if changes were needed). --- src/eval_env.cc | 57 +++++++++++++++++++++++++++++++--------------- src/eval_env.h | 10 ++++++-- src/string_piece.h | 4 ++++ 3 files changed, 51 insertions(+), 20 deletions(-) diff --git a/src/eval_env.cc b/src/eval_env.cc index 796a3264d1..cbc935acc2 100644 --- a/src/eval_env.cc +++ b/src/eval_env.cc @@ -99,6 +99,10 @@ string BindingEnv::LookupWithFallback(const string& var, } string EvalString::Evaluate(Env* env) const { + if (parsed_.empty()) { + return single_token_; + } + string result; for (TokenList::const_iterator i = parsed_.begin(); i != parsed_.end(); ++i) { if (i->second == RAW) @@ -110,40 +114,57 @@ string EvalString::Evaluate(Env* env) const { } void EvalString::AddText(StringPiece text) { - // Add it to the end of an existing RAW token if possible. - if (!parsed_.empty() && parsed_.back().second == RAW) { - parsed_.back().first.append(text.str_, text.len_); + if (parsed_.empty()) { + single_token_.append(text.begin(), text.end()); + } else if (!parsed_.empty() && parsed_.back().second == RAW) { + parsed_.back().first.append(text.begin(), text.end()); } else { - parsed_.push_back(make_pair(text.AsString(), RAW)); + parsed_.push_back(std::make_pair(text.AsString(), RAW)); } } + void EvalString::AddSpecial(StringPiece text) { - parsed_.push_back(make_pair(text.AsString(), SPECIAL)); + if (parsed_.empty() && !single_token_.empty()) { + // Going from one to two tokens, so we can no longer apply + // our single_token_ optimization and need to push everything + // onto the vector. 
+ parsed_.push_back(std::make_pair(std::move(single_token_), RAW)); + } + parsed_.push_back(std::make_pair(text.AsString(), SPECIAL)); } string EvalString::Serialize() const { string result; - for (TokenList::const_iterator i = parsed_.begin(); - i != parsed_.end(); ++i) { + if (parsed_.empty() && !single_token_.empty()) { result.append("["); - if (i->second == SPECIAL) - result.append("$"); - result.append(i->first); + result.append(single_token_); result.append("]"); + } else { + for (const auto& pair : parsed_) { + result.append("["); + if (pair.second == SPECIAL) + result.append("$"); + result.append(pair.first.begin(), pair.first.end()); + result.append("]"); + } } return result; } string EvalString::Unparse() const { string result; - for (TokenList::const_iterator i = parsed_.begin(); - i != parsed_.end(); ++i) { - bool special = (i->second == SPECIAL); - if (special) - result.append("${"); - result.append(i->first); - if (special) - result.append("}"); + if (parsed_.empty() && !single_token_.empty()) { + result.append(single_token_.begin(), single_token_.end()); + } else { + for (TokenList::const_iterator i = parsed_.begin(); + i != parsed_.end(); ++i) { + bool special = (i->second == SPECIAL); + if (special) + result.append("${"); + result.append(i->first.begin(), i->first.end()); + if (special) + result.append("}"); + } } return result; } diff --git a/src/eval_env.h b/src/eval_env.h index 677dc217a2..ae6d8bc898 100644 --- a/src/eval_env.h +++ b/src/eval_env.h @@ -39,8 +39,8 @@ struct EvalString { /// @return The string with variables not expanded. std::string Unparse() const; - void Clear() { parsed_.clear(); } - bool empty() const { return parsed_.empty(); } + void Clear() { parsed_.clear(); single_token_.clear(); } + bool empty() const { return parsed_.empty() && single_token_.empty(); } void AddText(StringPiece text); void AddSpecial(StringPiece text); @@ -53,6 +53,12 @@ struct EvalString { enum TokenType { RAW, SPECIAL }; typedef std::vector > TokenList; TokenList parsed_; + + // If we hold only a single RAW token, then we keep it here instead of + // pushing it on TokenList. This saves a bunch of allocations for + // what is a common case. If parsed_ is nonempty, then this value + // must be ignored. + std::string single_token_; }; /// An invocable build command and associated metadata (description, etc.). diff --git a/src/string_piece.h b/src/string_piece.h index 1c0bee6e1c..7e7367c20a 100644 --- a/src/string_piece.h +++ b/src/string_piece.h @@ -63,6 +63,10 @@ struct StringPiece { return len_; } + size_t empty() const { + return len_ == 0; + } + const char* str_; size_t len_; }; From f1a2f421859742f45c3b964a579221c869ef276c Mon Sep 17 00:00:00 2001 From: "Steinar H. Gunderson" Date: Fri, 1 Nov 2024 12:32:57 +0100 Subject: [PATCH 3/8] Switch hash tables to emhash8::HashMap. This is much faster than std::unordered_map, and also slightly faster than phmap::flat_hash_map that was included in PR #2468. It is MIT-licensed, and we just include the .h file wholesale. I haven't done a detailed test of all the various unordered_maps out there, but this is the overall highest-ranking contender on https://martin.ankerl.com/2022/08/27/hashmap-bench-01/ except for ankerl::unordered_dense::map, which requires C++17. For a no-op build of Chromium (Linux, Zen 2), this reduces time spent from 5.14 to 4.62 seconds. 
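Purely as an illustration of why this is a drop-in swap (the snippet below is
not part of the patch, and its names are invented for the example):
emhash8::HashMap covers the subset of the std::unordered_map interface that
ninja reaches through ExternalStringHashMap -- find(), insert(), emplace(),
operator[] and iteration -- so only the typedef in hash_map.h changes and the
existing call sites keep compiling unchanged. "Foo" just follows the
"mapping StringPiece => Foo*" wording already present in hash_map.h; "paths"
is a made-up variable name for this sketch.

    #include <utility>
    #include "hash_map.h"  // ExternalStringHashMap, StringPiece hashing

    // Hypothetical value type, mirroring the "Foo*" example in hash_map.h.
    struct Foo { int flags = 0; };

    void sketch() {
      ExternalStringHashMap<Foo*>::Type paths;  // now emhash8::HashMap underneath
      Foo foo;
      // Same call pattern as with the old std::unordered_map typedef:
      paths.insert(std::make_pair(StringPiece("out/foo.o"), &foo));
      ExternalStringHashMap<Foo*>::Type::iterator it =
          paths.find(StringPiece("out/foo.o"));
      if (it != paths.end())
        it->second->flags |= 1;
    }

The iterator that find() returns is emhash8's own bidirectional iterator over
its packed key/value array rather than a node-based one, but it is used
exactly as above, which is why no caller needs to change.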
--- src/hash_map.h | 5 +- src/third_party/emhash/README.ninja | 7 + src/third_party/emhash/hash_table8.hpp | 1830 ++++++++++++++++++++++++ 3 files changed, 1839 insertions(+), 3 deletions(-) create mode 100644 src/third_party/emhash/README.ninja create mode 100644 src/third_party/emhash/hash_table8.hpp diff --git a/src/hash_map.h b/src/hash_map.h index 3f465338ac..5585b1dc51 100644 --- a/src/hash_map.h +++ b/src/hash_map.h @@ -19,6 +19,7 @@ #include #include "string_piece.h" #include "util.h" +#include "third_party/emhash/hash_table8.hpp" // MurmurHash2, by Austin Appleby static inline @@ -53,8 +54,6 @@ unsigned int MurmurHash2(const void* key, size_t len) { return h; } -#include - namespace std { template<> struct hash { @@ -73,7 +72,7 @@ struct hash { /// mapping StringPiece => Foo*. template struct ExternalStringHashMap { - typedef std::unordered_map Type; + typedef emhash8::HashMap Type; }; #endif // NINJA_MAP_H_ diff --git a/src/third_party/emhash/README.ninja b/src/third_party/emhash/README.ninja new file mode 100644 index 0000000000..8a236c2d94 --- /dev/null +++ b/src/third_party/emhash/README.ninja @@ -0,0 +1,7 @@ +Description: emhash8::HashMap for C++14/17 +Version: 1.6.5 (commit bdebddbdce1b473bbc189178fd523ef4a876ea01) +URL: https://github.com/ktprime/emhash +Copyright: Copyright (c) 2021-2024 Huang Yuanbing & bailuzhou AT 163.com +SPDX-License-Identifier: MIT +Local changes: + - None. diff --git a/src/third_party/emhash/hash_table8.hpp b/src/third_party/emhash/hash_table8.hpp new file mode 100644 index 0000000000..ba121cbdfc --- /dev/null +++ b/src/third_party/emhash/hash_table8.hpp @@ -0,0 +1,1830 @@ +// emhash8::HashMap for C++14/17 +// version 1.6.5 +// https://github.com/ktprime/emhash/blob/master/hash_table8.hpp +// +// Licensed under the MIT License . +// SPDX-License-Identifier: MIT +// Copyright (c) 2021-2024 Huang Yuanbing & bailuzhou AT 163.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#undef EMH_NEW +#undef EMH_EMPTY + +// likely/unlikely +#if defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__clang__) +# define EMH_LIKELY(condition) __builtin_expect(condition, 1) +# define EMH_UNLIKELY(condition) __builtin_expect(condition, 0) +#else +# define EMH_LIKELY(condition) condition +# define EMH_UNLIKELY(condition) condition +#endif + +#define EMH_EMPTY(n) (0 > (int)(_index[n].next)) +#define EMH_EQHASH(n, key_hash) (((size_type)(key_hash) & ~_mask) == (_index[n].slot & ~_mask)) +//#define EMH_EQHASH(n, key_hash) ((size_type)(key_hash - _index[n].slot) & ~_mask) == 0 +#define EMH_NEW(key, val, bucket, key_hash) \ + new(_pairs + _num_filled) value_type(key, val); \ + _etail = bucket; \ + _index[bucket] = {bucket, _num_filled++ | ((size_type)(key_hash) & ~_mask)} + +namespace emhash8 { + +struct DefaultPolicy { + static constexpr float load_factor = 0.80f; + static constexpr float min_load_factor = 0.20f; + static constexpr size_t cacheline_size = 64U; +}; + +template, + typename EqT = std::equal_to, + typename Allocator = std::allocator>, //never used + typename Policy = DefaultPolicy> //never used +class HashMap +{ +#ifndef EMH_DEFAULT_LOAD_FACTOR + constexpr static float EMH_DEFAULT_LOAD_FACTOR = 0.80f; +#endif + constexpr static float EMH_MIN_LOAD_FACTOR = 0.25f; //< 0.5 + constexpr static uint32_t EMH_CACHE_LINE_SIZE = 64; //debug only + +public: + using htype = HashMap; + using value_type = std::pair; + using key_type = KeyT; + using mapped_type = ValueT; + //using dPolicy = Policy; + +#ifdef EMH_SMALL_TYPE + using size_type = uint16_t; +#elif EMH_SIZE_TYPE == 0 + using size_type = uint32_t; +#else + using size_type = size_t; +#endif + + using hasher = HashT; + using key_equal = EqT; + + constexpr static size_type INACTIVE = 0-1u; + //constexpr uint32_t END = 0-0x1u; + constexpr static size_type EAD = 2; + + struct Index + { + size_type next; + size_type slot; + }; + + class const_iterator; + class iterator + { + public: + using iterator_category = std::bidirectional_iterator_tag; + using difference_type = std::ptrdiff_t; + using value_type = typename htype::value_type; + using pointer = value_type*; + using const_pointer = const value_type* ; + using reference = value_type&; + using const_reference = const value_type&; + + iterator() : kv_(nullptr) {} + iterator(const_iterator& cit) { + kv_ = cit.kv_; + } + + iterator(const htype* hash_map, size_type bucket) { + kv_ = hash_map->_pairs + (int)bucket; + } + + iterator& operator++() + { + kv_ ++; + return *this; + } + + iterator operator++(int) + { + auto cur = *this; kv_ ++; + return cur; + } + + iterator& operator--() + { + kv_ --; + return *this; + } + + iterator operator--(int) + { + auto cur = *this; kv_ --; + return cur; + } + + reference operator*() const { return *kv_; } + pointer operator->() const { return kv_; } + + bool operator == (const iterator& rhs) const { return kv_ == rhs.kv_; } + bool operator != (const iterator& rhs) const { return kv_ != rhs.kv_; } + bool operator == (const const_iterator& rhs) const { return kv_ == rhs.kv_; } + bool operator != (const const_iterator& rhs) const { return kv_ != 
rhs.kv_; } + + public: + value_type* kv_; + }; + + class const_iterator + { + public: + using iterator_category = std::bidirectional_iterator_tag; + using value_type = typename htype::value_type; + using difference_type = std::ptrdiff_t; + using pointer = value_type*; + using const_pointer = const value_type*; + using reference = value_type&; + using const_reference = const value_type&; + + const_iterator(const iterator& it) { + kv_ = it.kv_; + } + + const_iterator (const htype* hash_map, size_type bucket) { + kv_ = hash_map->_pairs + (int)bucket; + } + + const_iterator& operator++() + { + kv_ ++; + return *this; + } + + const_iterator operator++(int) + { + auto cur = *this; kv_ ++; + return cur; + } + + const_iterator& operator--() + { + kv_ --; + return *this; + } + + const_iterator operator--(int) + { + auto cur = *this; kv_ --; + return cur; + } + + const_reference operator*() const { return *kv_; } + const_pointer operator->() const { return kv_; } + + bool operator == (const iterator& rhs) const { return kv_ == rhs.kv_; } + bool operator != (const iterator& rhs) const { return kv_ != rhs.kv_; } + bool operator == (const const_iterator& rhs) const { return kv_ == rhs.kv_; } + bool operator != (const const_iterator& rhs) const { return kv_ != rhs.kv_; } + public: + const value_type* kv_; + }; + + void init(size_type bucket, float mlf = EMH_DEFAULT_LOAD_FACTOR) + { + _pairs = nullptr; + _index = nullptr; + _mask = _num_buckets = 0; + _num_filled = 0; + _mlf = (uint32_t)((1 << 27) / EMH_DEFAULT_LOAD_FACTOR); + max_load_factor(mlf); + rehash(bucket); + } + + HashMap(size_type bucket = 2, float mlf = EMH_DEFAULT_LOAD_FACTOR) + { + init(bucket, mlf); + } + + HashMap(const HashMap& rhs) + { + if (rhs.load_factor() > EMH_MIN_LOAD_FACTOR) { + _pairs = alloc_bucket((size_type)(rhs._num_buckets * rhs.max_load_factor()) + 4); + _index = alloc_index(rhs._num_buckets); + clone(rhs); + } else { + init(rhs._num_filled + 2, rhs.max_load_factor()); + for (auto it = rhs.begin(); it != rhs.end(); ++it) + insert_unique(it->first, it->second); + } + } + + HashMap(HashMap&& rhs) noexcept + { + init(0); + *this = std::move(rhs); + } + + HashMap(std::initializer_list ilist) + { + init((size_type)ilist.size()); + for (auto it = ilist.begin(); it != ilist.end(); ++it) + do_insert(*it); + } + + template + HashMap(InputIt first, InputIt last, size_type bucket_count=4) + { + init(std::distance(first, last) + bucket_count); + for (; first != last; ++first) + emplace(*first); + } + + HashMap& operator=(const HashMap& rhs) + { + if (this == &rhs) + return *this; + + if (rhs.load_factor() < EMH_MIN_LOAD_FACTOR) { + clear(); free(_pairs); _pairs = nullptr; + rehash(rhs._num_filled + 2); + for (auto it = rhs.begin(); it != rhs.end(); ++it) + insert_unique(it->first, it->second); + return *this; + } + + clearkv(); + + if (_num_buckets != rhs._num_buckets) { + free(_pairs); free(_index); + _index = alloc_index(rhs._num_buckets); + _pairs = alloc_bucket((size_type)(rhs._num_buckets * rhs.max_load_factor()) + 4); + } + + clone(rhs); + return *this; + } + + HashMap& operator=(HashMap&& rhs) noexcept + { + if (this != &rhs) { + swap(rhs); + rhs.clear(); + } + return *this; + } + + template + bool operator == (const Con& rhs) const + { + if (size() != rhs.size()) + return false; + + for (auto it = begin(), last = end(); it != last; ++it) { + auto oi = rhs.find(it->first); + if (oi == rhs.end() || it->second != oi->second) + return false; + } + return true; + } + + template + bool operator != (const Con& rhs) const { return 
!(*this == rhs); } + + ~HashMap() noexcept + { + clearkv(); + free(_pairs); + free(_index); + _index = nullptr; + _pairs = nullptr; + } + + void clone(const HashMap& rhs) + { + _hasher = rhs._hasher; +// _eq = rhs._eq; + _num_buckets = rhs._num_buckets; + _num_filled = rhs._num_filled; + _mlf = rhs._mlf; + _last = rhs._last; + _mask = rhs._mask; +#if EMH_HIGH_LOAD + _ehead = rhs._ehead; +#endif + _etail = rhs._etail; + + auto opairs = rhs._pairs; + memcpy((char*)_index, (char*)rhs._index, (_num_buckets + EAD) * sizeof(Index)); + + if (is_copy_trivially()) { + memcpy((char*)_pairs, (char*)opairs, _num_filled * sizeof(value_type)); + } else { + for (size_type slot = 0; slot < _num_filled; slot++) + new(_pairs + slot) value_type(opairs[slot]); + } + } + + void swap(HashMap& rhs) + { + // std::swap(_eq, rhs._eq); + std::swap(_hasher, rhs._hasher); + std::swap(_pairs, rhs._pairs); + std::swap(_index, rhs._index); + std::swap(_num_buckets, rhs._num_buckets); + std::swap(_num_filled, rhs._num_filled); + std::swap(_mask, rhs._mask); + std::swap(_mlf, rhs._mlf); + std::swap(_last, rhs._last); +#if EMH_HIGH_LOAD + std::swap(_ehead, rhs._ehead); +#endif + std::swap(_etail, rhs._etail); + } + + // ------------------------------------------------------------- + iterator first() const { return {this, 0}; } + iterator last() const { return {this, _num_filled - 1}; } + + value_type& front() { return _pairs[0]; } + const value_type& front() const { return _pairs[0]; } + value_type& back() { return _pairs[_num_filled - 1]; } + const value_type& back() const { return _pairs[_num_filled - 1]; } + + void pop_front() { erase(begin()); } //TODO. only erase first without move last + void pop_back() { erase(last()); } + + iterator begin() { return first(); } + const_iterator cbegin() const { return first(); } + const_iterator begin() const { return first(); } + + iterator end() { return {this, _num_filled}; } + const_iterator cend() const { return {this, _num_filled}; } + const_iterator end() const { return cend(); } + + const value_type* values() const { return _pairs; } + const Index* index() const { return _index; } + + size_type size() const { return _num_filled; } + bool empty() const { return _num_filled == 0; } + size_type bucket_count() const { return _num_buckets; } + + /// Returns average number of elements per bucket. + float load_factor() const { return static_cast(_num_filled) / (_mask + 1); } + + HashT& hash_function() const { return _hasher; } + EqT& key_eq() const { return _eq; } + + void max_load_factor(float mlf) + { + if (mlf < 0.992 && mlf > EMH_MIN_LOAD_FACTOR) { + _mlf = (uint32_t)((1 << 27) / mlf); + if (_num_buckets > 0) rehash(_num_buckets); + } + } + + constexpr float max_load_factor() const { return (1 << 27) / (float)_mlf; } + constexpr size_type max_size() const { return (1ull << (sizeof(size_type) * 8 - 1)); } + constexpr size_type max_bucket_count() const { return max_size(); } + +#if EMH_STATIS + //Returns the bucket number where the element with key k is located. + size_type bucket(const KeyT& key) const + { + const auto bucket = hash_bucket(key); + const auto next_bucket = _index[bucket].next; + if ((int)next_bucket < 0) + return 0; + else if (bucket == next_bucket) + return bucket + 1; + + return hash_main(bucket) + 1; + } + + //Returns the number of elements in bucket n. 
+ size_type bucket_size(const size_type bucket) const + { + auto next_bucket = _index[bucket].next; + if ((int)next_bucket < 0) + return 0; + + next_bucket = hash_main(bucket); + size_type ibucket_size = 1; + + //iterator each item in current main bucket + while (true) { + const auto nbucket = _index[next_bucket].next; + if (nbucket == next_bucket) { + break; + } + ibucket_size ++; + next_bucket = nbucket; + } + return ibucket_size; + } + + size_type get_main_bucket(const size_type bucket) const + { + auto next_bucket = _index[bucket].next; + if ((int)next_bucket < 0) + return INACTIVE; + + return hash_main(bucket); + } + + size_type get_diss(size_type bucket, size_type next_bucket, const size_type slots) const + { + auto pbucket = reinterpret_cast(&_pairs[bucket]); + auto pnext = reinterpret_cast(&_pairs[next_bucket]); + if (pbucket / EMH_CACHE_LINE_SIZE == pnext / EMH_CACHE_LINE_SIZE) + return 0; + size_type diff = pbucket > pnext ? (pbucket - pnext) : (pnext - pbucket); + if (diff / EMH_CACHE_LINE_SIZE < slots - 1) + return diff / EMH_CACHE_LINE_SIZE + 1; + return slots - 1; + } + + int get_bucket_info(const size_type bucket, size_type steps[], const size_type slots) const + { + auto next_bucket = _index[bucket].next; + if ((int)next_bucket < 0) + return -1; + + const auto main_bucket = hash_main(bucket); + if (next_bucket == main_bucket) + return 1; + else if (main_bucket != bucket) + return 0; + + steps[get_diss(bucket, next_bucket, slots)] ++; + size_type ibucket_size = 2; + //find a empty and linked it to tail + while (true) { + const auto nbucket = _index[next_bucket].next; + if (nbucket == next_bucket) + break; + + steps[get_diss(nbucket, next_bucket, slots)] ++; + ibucket_size ++; + next_bucket = nbucket; + } + return (int)ibucket_size; + } + + void dump_statics() const + { + const size_type slots = 128; + size_type buckets[slots + 1] = {0}; + size_type steps[slots + 1] = {0}; + for (size_type bucket = 0; bucket < _num_buckets; ++bucket) { + auto bsize = get_bucket_info(bucket, steps, slots); + if (bsize > 0) + buckets[bsize] ++; + } + + size_type sumb = 0, collision = 0, sumc = 0, finds = 0, sumn = 0; + puts("============== buckets size ration ========="); + for (size_type i = 0; i < sizeof(buckets) / sizeof(buckets[0]); i++) { + const auto bucketsi = buckets[i]; + if (bucketsi == 0) + continue; + sumb += bucketsi; + sumn += bucketsi * i; + collision += bucketsi * (i - 1); + finds += bucketsi * i * (i + 1) / 2; + printf(" %2u %8u %2.2lf| %.2lf\n", i, bucketsi, bucketsi * 100.0 * i / _num_filled, sumn * 100.0 / _num_filled); + } + + puts("========== collision miss ration ==========="); + for (size_type i = 0; i < sizeof(steps) / sizeof(steps[0]); i++) { + sumc += steps[i]; + if (steps[i] <= 2) + continue; + printf(" %2u %8u %.2lf %.2lf\n", i, steps[i], steps[i] * 100.0 / collision, sumc * 100.0 / collision); + } + + if (sumb == 0) return; + printf(" _num_filled/bucket_size/packed collision/cache_miss/hit_find = %u/%.2lf/%zd/ %.2lf%%/%.2lf%%/%.2lf\n", + _num_filled, _num_filled * 1.0 / sumb, sizeof(value_type), (collision * 100.0 / _num_filled), (collision - steps[0]) * 100.0 / _num_filled, finds * 1.0 / _num_filled); + assert(sumn == _num_filled); + assert(sumc == collision); + puts("============== buckets size end ============="); + } +#endif + + void pack_zero(ValueT zero) + { + _pairs[_num_filled] = {KeyT(), zero}; + } + + // ------------------------------------------------------------ + template + iterator find(const K& key) noexcept + { + return {this, 
find_filled_slot(key)}; + } + + template + const_iterator find(const K& key) const noexcept + { + return {this, find_filled_slot(key)}; + } + + template + ValueT& at(const K& key) + { + const auto slot = find_filled_slot(key); + //throw + return _pairs[slot].second; + } + + template + const ValueT& at(const K& key) const + { + const auto slot = find_filled_slot(key); + //throw + return _pairs[slot].second; + } + + const ValueT& index(const uint32_t index) const + { + return _pairs[index].second; + } + + ValueT& index(const uint32_t index) + { + return _pairs[index].second; + } + + template + bool contains(const K& key) const noexcept + { + return find_filled_slot(key) != _num_filled; + } + + template + size_type count(const K& key) const noexcept + { + return find_filled_slot(key) == _num_filled ? 0 : 1; + //return find_sorted_bucket(key) == END ? 0 : 1; + //return find_hash_bucket(key) == END ? 0 : 1; + } + + template + std::pair equal_range(const K& key) + { + const auto found = find(key); + if (found.second == _num_filled) + return { found, found }; + else + return { found, std::next(found) }; + } + + void merge(HashMap& rhs) + { + if (empty()) { + *this = std::move(rhs); + return; + } + + for (auto rit = rhs.begin(); rit != rhs.end(); ) { + auto fit = find(rit->first); + if (fit == end()) { + insert_unique(rit->first, std::move(rit->second)); + rit = rhs.erase(rit); + } else { + ++rit; + } + } + } + + /// Returns the matching ValueT or nullptr if k isn't found. + bool try_get(const KeyT& key, ValueT& val) const noexcept + { + const auto slot = find_filled_slot(key); + const auto found = slot != _num_filled; + if (found) { + val = _pairs[slot].second; + } + return found; + } + + /// Returns the matching ValueT or nullptr if k isn't found. + ValueT* try_get(const KeyT& key) noexcept + { + const auto slot = find_filled_slot(key); + return slot != _num_filled ? &_pairs[slot].second : nullptr; + } + + /// Const version of the above + ValueT* try_get(const KeyT& key) const noexcept + { + const auto slot = find_filled_slot(key); + return slot != _num_filled ? &_pairs[slot].second : nullptr; + } + + /// set value if key exist + bool try_set(const KeyT& key, const ValueT& val) noexcept + { + const auto slot = find_filled_slot(key); + if (slot == _num_filled) + return false; + + _pairs[slot].second = val; + return true; + } + + /// set value if key exist + bool try_set(const KeyT& key, ValueT&& val) noexcept + { + const auto slot = find_filled_slot(key); + if (slot == _num_filled) + return false; + + _pairs[slot].second = std::move(val); + return true; + } + + /// Convenience function. + ValueT get_or_return_default(const KeyT& key) const noexcept + { + const auto slot = find_filled_slot(key); + return slot == _num_filled ? 
ValueT() : _pairs[slot].second; + } + + // ----------------------------------------------------- + std::pair do_insert(const value_type& value) noexcept + { + const auto key_hash = hash_key(value.first); + const auto bucket = find_or_allocate(value.first, key_hash); + const auto bempty = EMH_EMPTY(bucket); + if (bempty) { + EMH_NEW(value.first, value.second, bucket, key_hash); + } + + const auto slot = _index[bucket].slot & _mask; + return { {this, slot}, bempty }; + } + + std::pair do_insert(value_type&& value) noexcept + { + const auto key_hash = hash_key(value.first); + const auto bucket = find_or_allocate(value.first, key_hash); + const auto bempty = EMH_EMPTY(bucket); + if (bempty) { + EMH_NEW(std::move(value.first), std::move(value.second), bucket, key_hash); + } + + const auto slot = _index[bucket].slot & _mask; + return { {this, slot}, bempty }; + } + + template + std::pair do_insert(K&& key, V&& val) noexcept + { + const auto key_hash = hash_key(key); + const auto bucket = find_or_allocate(key, key_hash); + const auto bempty = EMH_EMPTY(bucket); + if (bempty) { + EMH_NEW(std::forward(key), std::forward(val), bucket, key_hash); + } + + const auto slot = _index[bucket].slot & _mask; + return { {this, slot}, bempty }; + } + + template + std::pair do_assign(K&& key, V&& val) noexcept + { + check_expand_need(); + const auto key_hash = hash_key(key); + const auto bucket = find_or_allocate(key, key_hash); + const auto bempty = EMH_EMPTY(bucket); + if (bempty) { + EMH_NEW(std::forward(key), std::forward(val), bucket, key_hash); + } else { + _pairs[_index[bucket].slot & _mask].second = std::move(val); + } + + const auto slot = _index[bucket].slot & _mask; + return { {this, slot}, bempty }; + } + + std::pair insert(const value_type& p) + { + check_expand_need(); + return do_insert(p); + } + + std::pair insert(value_type && p) + { + check_expand_need(); + return do_insert(std::move(p)); + } + + void insert(std::initializer_list ilist) + { + reserve(ilist.size() + _num_filled, false); + for (auto it = ilist.begin(); it != ilist.end(); ++it) + do_insert(*it); + } + + template + void insert(Iter first, Iter last) + { + reserve(std::distance(first, last) + _num_filled, false); + for (; first != last; ++first) + do_insert(first->first, first->second); + } + +#if 0 + template + void insert_unique(Iter begin, Iter end) + { + reserve(std::distance(begin, end) + _num_filled, false); + for (; begin != end; ++begin) { + insert_unique(*begin); + } + } +#endif + + template + size_type insert_unique(K&& key, V&& val) + { + check_expand_need(); + const auto key_hash = hash_key(key); + auto bucket = find_unique_bucket(key_hash); + EMH_NEW(std::forward(key), std::forward(val), bucket, key_hash); + return bucket; + } + + size_type insert_unique(value_type&& value) + { + return insert_unique(std::move(value.first), std::move(value.second)); + } + + size_type insert_unique(const value_type& value) + { + return insert_unique(value.first, value.second); + } + + template + std::pair emplace(Args&&... args) noexcept + { + check_expand_need(); + return do_insert(std::forward(args)...); + } + + //no any optimize for position + template + iterator emplace_hint(const_iterator hint, Args&&... args) + { + (void)hint; + check_expand_need(); + return do_insert(std::forward(args)...).first; + } + + template + std::pair try_emplace(const KeyT& k, Args&&... args) + { + check_expand_need(); + return do_insert(k, std::forward(args)...); + } + + template + std::pair try_emplace(KeyT&& k, Args&&... 
args) + { + check_expand_need(); + return do_insert(std::move(k), std::forward(args)...); + } + + template + size_type emplace_unique(Args&&... args) + { + return insert_unique(std::forward(args)...); + } + + std::pair insert_or_assign(const KeyT& key, ValueT&& val) { return do_assign(key, std::forward(val)); } + std::pair insert_or_assign(KeyT&& key, ValueT&& val) { return do_assign(std::move(key), std::forward(val)); } + + /// Return the old value or ValueT() if it didn't exist. + ValueT set_get(const KeyT& key, const ValueT& val) + { + check_expand_need(); + const auto key_hash = hash_key(key); + const auto bucket = find_or_allocate(key, key_hash); + if (EMH_EMPTY(bucket)) { + EMH_NEW(key, val, bucket, key_hash); + return ValueT(); + } else { + const auto slot = _index[bucket].slot & _mask; + ValueT old_value(val); + std::swap(_pairs[slot].second, old_value); + return old_value; + } + } + + /// Like std::map::operator[]. + ValueT& operator[](const KeyT& key) noexcept + { + check_expand_need(); + const auto key_hash = hash_key(key); + const auto bucket = find_or_allocate(key, key_hash); + if (EMH_EMPTY(bucket)) { + /* Check if inserting a value rather than overwriting an old entry */ + EMH_NEW(key, std::move(ValueT()), bucket, key_hash); + } + + const auto slot = _index[bucket].slot & _mask; + return _pairs[slot].second; + } + + ValueT& operator[](KeyT&& key) noexcept + { + check_expand_need(); + const auto key_hash = hash_key(key); + const auto bucket = find_or_allocate(key, key_hash); + if (EMH_EMPTY(bucket)) { + EMH_NEW(std::move(key), std::move(ValueT()), bucket, key_hash); + } + + const auto slot = _index[bucket].slot & _mask; + return _pairs[slot].second; + } + + /// Erase an element from the hash table. + /// return 0 if element was not found + size_type erase(const KeyT& key) noexcept + { + const auto key_hash = hash_key(key); + const auto sbucket = find_filled_bucket(key, key_hash); + if (sbucket == INACTIVE) + return 0; + + const auto main_bucket = key_hash & _mask; + erase_slot(sbucket, (size_type)main_bucket); + return 1; + } + + //iterator erase(const_iterator begin_it, const_iterator end_it) + iterator erase(const const_iterator& cit) noexcept + { + const auto slot = (size_type)(cit.kv_ - _pairs); + size_type main_bucket; + const auto sbucket = find_slot_bucket(slot, main_bucket); //TODO + erase_slot(sbucket, main_bucket); + return {this, slot}; + } + + //only last >= first + iterator erase(const_iterator first, const_iterator last) noexcept + { + auto esize = long(last.kv_ - first.kv_); + auto tsize = long((_pairs + _num_filled) - last.kv_); //last to tail size + auto next = first; + while (tsize -- > 0) { + if (esize-- <= 0) + break; + next = ++erase(next); + } + + //fast erase from last + next = this->last(); + while (esize -- > 0) + next = --erase(next); + + return {this, size_type(next.kv_ - _pairs)}; + } + + template + size_type erase_if(Pred pred) + { + auto old_size = size(); + for (auto it = begin(); it != end();) { + if (pred(*it)) + it = erase(it); + else + ++it; + } + return old_size - size(); + } + + static constexpr bool is_triviall_destructable() + { +#if __cplusplus >= 201402L || _MSC_VER > 1600 + return !(std::is_trivially_destructible::value && std::is_trivially_destructible::value); +#else + return !(std::is_pod::value && std::is_pod::value); +#endif + } + + static constexpr bool is_copy_trivially() + { +#if __cplusplus >= 201103L || _MSC_VER > 1600 + return (std::is_trivially_copyable::value && std::is_trivially_copyable::value); +#else + return 
(std::is_pod::value && std::is_pod::value); +#endif + } + + void clearkv() + { + if (is_triviall_destructable()) { + while (_num_filled --) + _pairs[_num_filled].~value_type(); + } + } + + /// Remove all elements, keeping full capacity. + void clear() noexcept + { + clearkv(); + + if (_num_filled > 0) + memset((char*)_index, INACTIVE, sizeof(_index[0]) * _num_buckets); + + _last = _num_filled = 0; + _etail = INACTIVE; + +#if EMH_HIGH_LOAD + _ehead = 0; +#endif + } + + void shrink_to_fit(const float min_factor = EMH_DEFAULT_LOAD_FACTOR / 4) + { + if (load_factor() < min_factor && bucket_count() > 10) //safe guard + rehash(_num_filled + 1); + } + +#if EMH_HIGH_LOAD + #define EMH_PREVET(i, n) i[n].slot + void set_empty() + { + auto prev = 0; + for (int32_t bucket = 1; bucket < _num_buckets; ++bucket) { + if (EMH_EMPTY(bucket)) { + if (prev != 0) { + EMH_PREVET(_index, bucket) = prev; + _index[_prev].next = -bucket; + } + else + _ehead = bucket; + prev = bucket; + } + } + + EMH_PREVET(_index, _ehead) = prev; + _index[_prev].next = 0-_ehead; + _ehead = 0-_index[_ehead].next; + } + + void clear_empty() + { + auto prev = EMH_PREVET(_index, _ehead); + while (prev != _ehead) { + _index[_prev].next = INACTIVE; + prev = EMH_PREVET(_index, prev); + } + _index[_ehead].next = INACTIVE; + _ehead = 0; + } + + //prev-ehead->next + size_type pop_empty(const size_type bucket) + { + const auto prev_bucket = EMH_PREVET(_index, bucket); + const int next_bucket = 0-_index[bucket].next; + + EMH_PREVET(_index, next_bucket) = prev_bucket; + _index[prev_bucket].next = -next_bucket; + + _ehead = next_bucket; + return bucket; + } + + //ehead->bucket->next + void push_empty(const int32_t bucket) + { + const int next_bucket = 0-_index[_ehead].next; + assert(next_bucket > 0); + + EMH_PREVET(_index, bucket) = _ehead; + _index[bucket].next = -next_bucket; + + EMH_PREVET(_index, next_bucket) = bucket; + _index[_ehead].next = -bucket; + // _ehead = bucket; + } +#endif + + /// Make room for this many elements + bool reserve(uint64_t num_elems, bool force) + { + (void)force; +#if EMH_HIGH_LOAD == 0 + const auto required_buckets = num_elems * _mlf >> 27; + if (EMH_LIKELY(required_buckets < _mask)) // && !force + return false; + +#elif EMH_HIGH_LOAD + const auto required_buckets = num_elems + num_elems * 1 / 9; + if (EMH_LIKELY(required_buckets < _mask)) + return false; + + else if (_num_buckets < 16 && _num_filled < _num_buckets) + return false; + + else if (_num_buckets > EMH_HIGH_LOAD) { + if (_ehead == 0) { + set_empty(); + return false; + } else if (/*_num_filled + 100 < _num_buckets && */_index[_ehead].next != 0-_ehead) { + return false; + } + } +#endif +#if EMH_STATIS + if (_num_filled > EMH_STATIS) dump_statics(); +#endif + + //assert(required_buckets < max_size()); + rehash(required_buckets + 2); + return true; + } + + static value_type* alloc_bucket(size_type num_buckets) + { +#ifdef EMH_ALLOC + auto new_pairs = aligned_alloc(32, (uint64_t)num_buckets * sizeof(value_type)); +#else + auto new_pairs = malloc((uint64_t)num_buckets * sizeof(value_type)); +#endif + return (value_type *)(new_pairs); + } + + static Index* alloc_index(size_type num_buckets) + { + auto new_index = (char*)malloc((uint64_t)(EAD + num_buckets) * sizeof(Index)); + return (Index *)(new_index); + } + + bool reserve(size_type required_buckets) noexcept + { + if (_num_filled != required_buckets) + return reserve(required_buckets, true); + + _last = 0; +#if EMH_HIGH_LOAD + _ehead = 0; +#endif + +#if EMH_SORT + std::sort(_pairs, _pairs + _num_filled, 
[this](const value_type & l, const value_type & r) { + const auto hashl = (size_type)hash_key(l.first) & _mask, hashr = (size_type)hash_key(r.first) & _mask; + return hashl < hashr; + //return l.first < r.first; + }); +#endif + + memset((char*)_index, INACTIVE, sizeof(_index[0]) * _num_buckets); + for (size_type slot = 0; slot < _num_filled; slot++) { + const auto& key = _pairs[slot].first; + const auto key_hash = hash_key(key); + const auto bucket = size_type(key_hash & _mask); + auto& next_bucket = _index[bucket].next; + if ((int)next_bucket < 0) + _index[bucket] = {1, slot | ((size_type)(key_hash) & ~_mask)}; + else { + _index[bucket].slot |= (size_type)(key_hash) & ~_mask; + next_bucket ++; + } + } + return true; + } + + void rebuild(size_type num_buckets) noexcept + { + free(_index); + auto new_pairs = (value_type*)alloc_bucket((size_type)(num_buckets * max_load_factor()) + 4); + if (is_copy_trivially()) { + if (_pairs) + memcpy((char*)new_pairs, (char*)_pairs, _num_filled * sizeof(value_type)); + } else { + for (size_type slot = 0; slot < _num_filled; slot++) { + new(new_pairs + slot) value_type(std::move(_pairs[slot])); + if (is_triviall_destructable()) + _pairs[slot].~value_type(); + } + } + free(_pairs); + _pairs = new_pairs; + _index = (Index*)alloc_index (num_buckets); + + memset((char*)_index, INACTIVE, sizeof(_index[0]) * num_buckets); + memset((char*)(_index + num_buckets), 0, sizeof(_index[0]) * EAD); + } + + void rehash(uint64_t required_buckets) + { + if (required_buckets < _num_filled) + return; + + assert(required_buckets < max_size()); + auto num_buckets = _num_filled > (1u << 16) ? (1u << 16) : 4u; + while (num_buckets < required_buckets) { num_buckets *= 2; } +#if EMH_SAVE_MEM + if (sizeof(KeyT) < sizeof(size_type) && num_buckets >= (1ul << (2 * 8))) + num_buckets = 2ul << (sizeof(KeyT) * 8); +#endif + +#if EMH_REHASH_LOG + auto last = _last; + size_type collision = 0; +#endif + +#if EMH_HIGH_LOAD + _ehead = 0; +#endif + _last = 0; + + _mask = num_buckets - 1; +#if EMH_PACK_TAIL > 1 + _last = _mask; + num_buckets += num_buckets * EMH_PACK_TAIL / 100; //add more 5-10% +#endif + _num_buckets = num_buckets; + + rebuild(num_buckets); + +#ifdef EMH_SORT + std::sort(_pairs, _pairs + _num_filled, [this](const value_type & l, const value_type & r) { + const auto hashl = hash_key(l.first), hashr = hash_key(r.first); + auto diff = int64_t((hashl & _mask) - (hashr & _mask)); + if (diff != 0) + return diff < 0; + return hashl < hashr; +// return l.first < r.first; + }); +#endif + + _etail = INACTIVE; + for (size_type slot = 0; slot < _num_filled; ++slot) { + const auto& key = _pairs[slot].first; + const auto key_hash = hash_key(key); + const auto bucket = find_unique_bucket(key_hash); + _index[bucket] = { bucket, slot | ((size_type)(key_hash) & ~_mask) }; + +#if EMH_REHASH_LOG + if (bucket != hash_main(bucket)) + collision ++; +#endif + } + +#if EMH_REHASH_LOG + if (_num_filled > EMH_REHASH_LOG) { + auto mbucket = _num_filled - collision; + char buff[255] = {0}; + sprintf(buff, " _num_filled/aver_size/K.V/pack/collision|last = %u/%.2lf/%s.%s/%zd|%.2lf%%,%.2lf%%", + _num_filled, double (_num_filled) / mbucket, typeid(KeyT).name(), typeid(ValueT).name(), sizeof(_pairs[0]), collision * 100.0 / _num_filled, last * 100.0 / _num_buckets); +#ifdef EMH_LOG + static uint32_t ihashs = 0; EMH_LOG() << "hash_nums = " << ihashs ++ << "|" <<__FUNCTION__ << "|" << buff << endl; +#else + puts(buff); +#endif + } +#endif + } + +private: + // Can we fit another element? 
+ bool check_expand_need() + { + return reserve(_num_filled, false); + } + + static void prefetch_heap_block(char* ctrl) + { + // Prefetch the heap-allocated memory region to resolve potential TLB + // misses. This is intended to overlap with execution of calculating the hash for a key. +#if __linux__ + __builtin_prefetch(static_cast(ctrl)); +#elif _WIN32 + _mm_prefetch((const char*)ctrl, _MM_HINT_T0); +#endif + } + + size_type slot_to_bucket(const size_type slot) const noexcept + { + size_type main_bucket; + return find_slot_bucket(slot, main_bucket); //TODO + } + + //very slow + void erase_slot(const size_type sbucket, const size_type main_bucket) noexcept + { + const auto slot = _index[sbucket].slot & _mask; + const auto ebucket = erase_bucket(sbucket, main_bucket); + const auto last_slot = --_num_filled; + if (EMH_LIKELY(slot != last_slot)) { + const auto last_bucket = (_etail == INACTIVE || ebucket == _etail) + ? slot_to_bucket(last_slot) : _etail; + + _pairs[slot] = std::move(_pairs[last_slot]); + _index[last_bucket].slot = slot | (_index[last_bucket].slot & ~_mask); + } + + if (is_triviall_destructable()) + _pairs[last_slot].~value_type(); + + _etail = INACTIVE; + _index[ebucket] = {INACTIVE, 0}; +#if EMH_HIGH_LOAD + if (_ehead) { + if (10 * _num_filled < 8 * _num_buckets) + clear_empty(); + else if (ebucket) + push_empty(ebucket); + } +#endif + } + + size_type erase_bucket(const size_type bucket, const size_type main_bucket) noexcept + { + const auto next_bucket = _index[bucket].next; + if (bucket == main_bucket) { + if (main_bucket != next_bucket) { + const auto nbucket = _index[next_bucket].next; + _index[main_bucket] = { + (nbucket == next_bucket) ? main_bucket : nbucket, + _index[next_bucket].slot + }; + } + return next_bucket; + } + + const auto prev_bucket = find_prev_bucket(main_bucket, bucket); + _index[prev_bucket].next = (bucket == next_bucket) ? 
prev_bucket : next_bucket; + return bucket; + } + + // Find the slot with this key, or return bucket size + size_type find_slot_bucket(const size_type slot, size_type& main_bucket) const + { + const auto key_hash = hash_key(_pairs[slot].first); + const auto bucket = main_bucket = size_type(key_hash & _mask); + if (slot == (_index[bucket].slot & _mask)) + return bucket; + + auto next_bucket = _index[bucket].next; + while (true) { + if (EMH_LIKELY(slot == (_index[next_bucket].slot & _mask))) + return next_bucket; + next_bucket = _index[next_bucket].next; + } + + return INACTIVE; + } + + // Find the slot with this key, or return bucket size + size_type find_filled_bucket(const KeyT& key, uint64_t key_hash) const noexcept + { + const auto bucket = size_type(key_hash & _mask); + auto next_bucket = _index[bucket].next; + if (EMH_UNLIKELY((int)next_bucket < 0)) + return INACTIVE; + + const auto slot = _index[bucket].slot & _mask; + //prefetch_heap_block((char*)&_pairs[slot]); + if (EMH_EQHASH(bucket, key_hash)) { + if (EMH_LIKELY(_eq(key, _pairs[slot].first))) + return bucket; + } + if (next_bucket == bucket) + return INACTIVE; + + while (true) { + if (EMH_EQHASH(next_bucket, key_hash)) { + const auto slot = _index[next_bucket].slot & _mask; + if (EMH_LIKELY(_eq(key, _pairs[slot].first))) + return next_bucket; + } + + const auto nbucket = _index[next_bucket].next; + if (nbucket == next_bucket) + return INACTIVE; + next_bucket = nbucket; + } + + return INACTIVE; + } + + // Find the slot with this key, or return bucket size + template + size_type find_filled_slot(const K& key) const noexcept + { + const auto key_hash = hash_key(key); + const auto bucket = size_type(key_hash & _mask); + auto next_bucket = _index[bucket].next; + if ((int)next_bucket < 0) + return _num_filled; + + const auto slot = _index[bucket].slot & _mask; + //prefetch_heap_block((char*)&_pairs[slot]); + if (EMH_EQHASH(bucket, key_hash)) { + if (EMH_LIKELY(_eq(key, _pairs[slot].first))) + return slot; + } + if (next_bucket == bucket) + return _num_filled; + + while (true) { + if (EMH_EQHASH(next_bucket, key_hash)) { + const auto slot = _index[next_bucket].slot & _mask; + if (EMH_LIKELY(_eq(key, _pairs[slot].first))) + return slot; + } + + const auto nbucket = _index[next_bucket].next; + if (nbucket == next_bucket) + return _num_filled; + next_bucket = nbucket; + } + + return _num_filled; + } + +#if EMH_SORT + size_type find_hash_bucket(const KeyT& key) const noexcept + { + const auto key_hash = hash_key(key); + const auto bucket = size_type(key_hash & _mask); + const auto next_bucket = _index[bucket].next; + if ((int)next_bucket < 0) + return END; + + auto slot = _index[bucket].slot & _mask; + if (_eq(key, _pairs[slot++].first)) + return slot; + else if (next_bucket == bucket) + return END; + + while (true) { + const auto& okey = _pairs[slot++].first; + if (_eq(key, okey)) + return slot; + + const auto hasho = hash_key(okey); + if ((hasho & _mask) != bucket) + break; + else if (hasho > key_hash) + break; + else if (EMH_UNLIKELY(slot >= _num_filled)) + break; + } + + return END; + } + + //only for find/can not insert + size_type find_sorted_bucket(const KeyT& key) const noexcept + { + const auto key_hash = hash_key(key); + const auto bucket = size_type(key_hash & _mask); + const auto slots = (int)(_index[bucket].next); //TODO + if (slots < 0 /**|| key < _pairs[slot].first*/) + return END; + + const auto slot = _index[bucket].slot & _mask; + auto ormask = _index[bucket].slot & ~_mask; + auto hmask = (size_type)(key_hash) & ~_mask; + 
if ((hmask | ormask) != ormask) + return END; + + if (_eq(key, _pairs[slot].first)) + return slot; + else if (slots == 1 || key < _pairs[slot].first) + return END; + +#if EMH_SORT + if (key < _pairs[slot].first || key > _pairs[slots + slot - 1].first) + return END; +#endif + + for (size_type i = 1; i < slots; ++i) { + const auto& okey = _pairs[slot + i].first; + if (_eq(key, okey)) + return slot + i; + // else if (okey > key) + // return END; + } + + return END; + } +#endif + + //kick out bucket and find empty to occpuy + //it will break the orgin link and relnik again. + //before: main_bucket-->prev_bucket --> bucket --> next_bucket + //atfer : main_bucket-->prev_bucket --> (removed)--> new_bucket--> next_bucket + size_type kickout_bucket(const size_type kmain, const size_type bucket) noexcept + { + const auto next_bucket = _index[bucket].next; + const auto new_bucket = find_empty_bucket(next_bucket, 2); + const auto prev_bucket = find_prev_bucket(kmain, bucket); + + const auto last = next_bucket == bucket ? new_bucket : next_bucket; + _index[new_bucket] = {last, _index[bucket].slot}; + + _index[prev_bucket].next = new_bucket; + _index[bucket].next = INACTIVE; + + return bucket; + } + + /* + ** inserts a new key into a hash table; first, check whether key's main + ** bucket/position is free. If not, check whether colliding node/bucket is in its main + ** position or not: if it is not, move colliding bucket to an empty place and + ** put new key in its main position; otherwise (colliding bucket is in its main + ** position), new key goes to an empty position. + */ + template + size_type find_or_allocate(const K& key, uint64_t key_hash) noexcept + { + const auto bucket = size_type(key_hash & _mask); + auto next_bucket = _index[bucket].next; + prefetch_heap_block((char*)&_pairs[bucket]); + if ((int)next_bucket < 0) { +#if EMH_HIGH_LOAD + if (next_bucket != INACTIVE) + pop_empty(bucket); +#endif + return bucket; + } + + const auto slot = _index[bucket].slot & _mask; + if (EMH_EQHASH(bucket, key_hash)) + if (EMH_LIKELY(_eq(key, _pairs[slot].first))) + return bucket; + + //check current bucket_key is in main bucket or not + const auto kmain = hash_bucket(_pairs[slot].first); + if (kmain != bucket) + return kickout_bucket(kmain, bucket); + else if (next_bucket == bucket) + return _index[next_bucket].next = find_empty_bucket(next_bucket, 1); + + uint32_t csize = 1; + //find next linked bucket and check key + while (true) { + const auto eslot = _index[next_bucket].slot & _mask; + if (EMH_EQHASH(next_bucket, key_hash)) { + if (EMH_LIKELY(_eq(key, _pairs[eslot].first))) + return next_bucket; + } + + csize += 1; + const auto nbucket = _index[next_bucket].next; + if (nbucket == next_bucket) + break; + next_bucket = nbucket; + } + + //find a empty and link it to tail + const auto new_bucket = find_empty_bucket(next_bucket, csize); + prefetch_heap_block((char*)&_pairs[new_bucket]); + return _index[next_bucket].next = new_bucket; + } + + size_type find_unique_bucket(uint64_t key_hash) noexcept + { + const auto bucket = size_type(key_hash & _mask); + auto next_bucket = _index[bucket].next; + if ((int)next_bucket < 0) { +#if EMH_HIGH_LOAD + if (next_bucket != INACTIVE) + pop_empty(bucket); +#endif + return bucket; + } + + //check current bucket_key is in main bucket or not + const auto kmain = hash_main(bucket); + if (EMH_UNLIKELY(kmain != bucket)) + return kickout_bucket(kmain, bucket); + else if (EMH_UNLIKELY(next_bucket != bucket)) + next_bucket = find_last_bucket(next_bucket); + + return 
_index[next_bucket].next = find_empty_bucket(next_bucket, 2); + } + + /*** + Different probing techniques usually provide a trade-off between memory locality and avoidance of clustering. + Since Robin Hood hashing is relatively resilient to clustering (both primary and secondary), linear probing is the most cache friendly alternativeis typically used. + + It's the core algorithm of this hash map with highly optimization/benchmark. + normaly linear probing is inefficient with high load factor, it use a new 3-way linear + probing strategy to search empty slot. from benchmark even the load factor > 0.9, it's more 2-3 timer fast than + one-way search strategy. + + 1. linear or quadratic probing a few cache line for less cache miss from input slot "bucket_from". + 2. the first search slot from member variant "_last", init with 0 + 3. the second search slot from calculated pos "(_num_filled + _last) & _mask", it's like a rand value + */ + // key is not in this mavalue. Find a place to put it. + size_type find_empty_bucket(const size_type bucket_from, uint32_t csize) noexcept + { + (void)csize; +#if EMH_HIGH_LOAD + if (_ehead) + return pop_empty(_ehead); +#endif + + auto bucket = bucket_from; + if (EMH_EMPTY(++bucket) || EMH_EMPTY(++bucket)) + return bucket; + +#ifdef EMH_QUADRATIC + constexpr size_type linear_probe_length = 2 * EMH_CACHE_LINE_SIZE / sizeof(Index);//16 + for (size_type offset = csize + 2, step = 4; offset <= linear_probe_length; ) { + bucket = (bucket_from + offset) & _mask; + if (EMH_EMPTY(bucket) || EMH_EMPTY(++bucket)) + return bucket; + offset += step; //7/8. 12. 16 + } +#else + constexpr size_type quadratic_probe_length = 6u; + for (size_type offset = 4u, step = 3u; step < quadratic_probe_length; ) { + bucket = (bucket_from + offset) & _mask; + if (EMH_EMPTY(bucket) || EMH_EMPTY(++bucket)) + return bucket; + offset += step++; + } +#endif + +#if EMH_PREFETCH + __builtin_prefetch(static_cast(_index + _last + 1), 0, EMH_PREFETCH); +#endif + + for (;;) { +#if EMH_PACK_TAIL + //find empty bucket and skip next + if (EMH_EMPTY(_last++))// || EMH_EMPTY(_last++)) + return _last++ - 1; + + if (EMH_UNLIKELY(_last >= _num_buckets)) + _last = 0; + + auto medium = (_mask / 4 + _last++) & _mask; + if (EMH_EMPTY(medium)) + return medium; +#else + _last &= _mask; + if (EMH_EMPTY(++_last))// || EMH_EMPTY(++_last)) + return _last; + + auto medium = (_num_buckets / 2 + _last) & _mask; + if (EMH_EMPTY(medium))// || EMH_EMPTY(++medium)) + return medium; +#endif + } + + return 0; + } + + size_type find_last_bucket(size_type main_bucket) const + { + auto next_bucket = _index[main_bucket].next; + if (next_bucket == main_bucket) + return main_bucket; + + while (true) { + const auto nbucket = _index[next_bucket].next; + if (nbucket == next_bucket) + return next_bucket; + next_bucket = nbucket; + } + } + + size_type find_prev_bucket(const size_type main_bucket, const size_type bucket) const + { + auto next_bucket = _index[main_bucket].next; + if (next_bucket == bucket) + return main_bucket; + + while (true) { + const auto nbucket = _index[next_bucket].next; + if (nbucket == bucket) + return next_bucket; + next_bucket = nbucket; + } + } + + size_type hash_bucket(const KeyT& key) const noexcept + { + return (size_type)hash_key(key) & _mask; + } + + size_type hash_main(const size_type bucket) const noexcept + { + const auto slot = _index[bucket].slot & _mask; + return (size_type)hash_key(_pairs[slot].first) & _mask; + } + +#if EMH_INT_HASH + static constexpr uint64_t KC = UINT64_C(11400714819323198485); + 
static uint64_t hash64(uint64_t key) + { +#if __SIZEOF_INT128__ && EMH_INT_HASH == 1 + __uint128_t r = key; r *= KC; + return (uint64_t)(r >> 64) + (uint64_t)r; +#elif EMH_INT_HASH == 2 + //MurmurHash3Mixer + uint64_t h = key; + h ^= h >> 33; + h *= 0xff51afd7ed558ccd; + h ^= h >> 33; + h *= 0xc4ceb9fe1a85ec53; + h ^= h >> 33; + return h; +#elif _WIN64 && EMH_INT_HASH == 1 + uint64_t high; + return _umul128(key, KC, &high) + high; +#elif EMH_INT_HASH == 3 + auto ror = (key >> 32) | (key << 32); + auto low = key * 0xA24BAED4963EE407ull; + auto high = ror * 0x9FB21C651E98DF25ull; + auto mix = low + high; + return mix; +#elif EMH_INT_HASH == 1 + uint64_t r = key * UINT64_C(0xca4bcaa75ec3f625); + return (r >> 32) + r; +#elif EMH_WYHASH64 + return wyhash64(key, KC); +#else + uint64_t x = key; + x = (x ^ (x >> 30)) * UINT64_C(0xbf58476d1ce4e5b9); + x = (x ^ (x >> 27)) * UINT64_C(0x94d049bb133111eb); + x = x ^ (x >> 31); + return x; +#endif + } +#endif + +#if EMH_WYHASH_HASH + //#define WYHASH_CONDOM 1 + static uint64_t wymix(uint64_t A, uint64_t B) + { +#if defined(__SIZEOF_INT128__) + __uint128_t r = A; r *= B; +#if WYHASH_CONDOM2 + A ^= (uint64_t)r; B ^= (uint64_t)(r >> 64); +#else + A = (uint64_t)r; B = (uint64_t)(r >> 64); +#endif + +#elif defined(_MSC_VER) && defined(_M_X64) +#if WYHASH_CONDOM2 + uint64_t a, b; + a = _umul128(A, B, &b); + A ^= a; B ^= b; +#else + A = _umul128(A, B, &B); +#endif +#else + uint64_t ha = A >> 32, hb = B >> 32, la = (uint32_t)A, lb = (uint32_t)B, hi, lo; + uint64_t rh = ha * hb, rm0 = ha * lb, rm1 = hb * la, rl = la * lb, t = rl + (rm0 << 32), c = t < rl; + lo = t + (rm1 << 32); c += lo < t; hi = rh + (rm0 >> 32) + (rm1 >> 32) + c; +#if WYHASH_CONDOM2 + A ^= lo; B ^= hi; +#else + A = lo; B = hi; +#endif +#endif + return A ^ B; + } + + //multiply and xor mix function, aka MUM + static inline uint64_t wyr8(const uint8_t *p) { uint64_t v; memcpy(&v, p, 8); return v; } + static inline uint64_t wyr4(const uint8_t *p) { uint32_t v; memcpy(&v, p, 4); return v; } + static inline uint64_t wyr3(const uint8_t *p, size_t k) { + return (((uint64_t)p[0]) << 16) | (((uint64_t)p[k >> 1]) << 8) | p[k - 1]; + } + + inline static const uint64_t secret[4] = { + 0x2d358dccaa6c78a5ull, 0x8bb84b93962eacc9ull, + 0x4b33a62ed433d4a3ull, 0x4d5a2da51de1aa47ull}; +public: + //wyhash main function https://github.com/wangyi-fudan/wyhash + static uint64_t wyhashstr(const char *key, const size_t len) + { + uint64_t a = 0, b = 0, seed = secret[0]; + const uint8_t *p = (const uint8_t*)key; + if (EMH_LIKELY(len <= 16)) { + if (EMH_LIKELY(len >= 4)) { + const auto half = (len >> 3) << 2; + a = (wyr4(p) << 32U) | wyr4(p + half); p += len - 4; + b = (wyr4(p) << 32U) | wyr4(p - half); + } else if (len) { + a = wyr3(p, len); + } + } else { + size_t i = len; + if (EMH_UNLIKELY(i > 48)) { + uint64_t see1 = seed, see2 = seed; + do { + seed = wymix(wyr8(p + 0) ^ secret[1], wyr8(p + 8) ^ seed); + see1 = wymix(wyr8(p + 16) ^ secret[2], wyr8(p + 24) ^ see1); + see2 = wymix(wyr8(p + 32) ^ secret[3], wyr8(p + 40) ^ see2); + p += 48; i -= 48; + } while (EMH_LIKELY(i > 48)); + seed ^= see1 ^ see2; + } + while (i > 16) { + seed = wymix(wyr8(p) ^ secret[1], wyr8(p + 8) ^ seed); + i -= 16; p += 16; + } + a = wyr8(p + i - 16); + b = wyr8(p + i - 8); + } + + return wymix(secret[1] ^ len, wymix(a ^ secret[1], b ^ seed)); + } +#endif + +private: + template::value, uint32_t>::type = 0> + inline uint64_t hash_key(const UType key) const + { +#if EMH_INT_HASH + return hash64(key); +#elif EMH_IDENTITY_HASH + return key + 
(key >> 24); +#else + return _hasher(key); +#endif + } + + template::value, uint32_t>::type = 0> + inline uint64_t hash_key(const UType& key) const + { +#if EMH_WYHASH_HASH + return wyhashstr(key.data(), key.size()); +#else + return _hasher(key); +#endif + } + + template::value && !std::is_same::value, uint32_t>::type = 0> + inline uint64_t hash_key(const UType& key) const + { + return _hasher(key); + } + +private: + Index* _index; + value_type*_pairs; + + HashT _hasher; + EqT _eq; + uint32_t _mlf; + size_type _mask; + size_type _num_buckets; + size_type _num_filled; + size_type _last; +#if EMH_HIGH_LOAD + size_type _ehead; +#endif + size_type _etail; +}; +} // namespace emhash + From 22a0eba8dbffd6d46f59c4e19cb6fa816cbbbe64 Mon Sep 17 00:00:00 2001 From: "Steinar H. Gunderson" Date: Fri, 1 Nov 2024 13:32:43 +0100 Subject: [PATCH 4/8] Fix emhash8 compilation for MinGW. --- src/third_party/emhash/README.ninja | 2 +- src/third_party/emhash/hash_table8.hpp | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/third_party/emhash/README.ninja b/src/third_party/emhash/README.ninja index 8a236c2d94..ac2c2114a6 100644 --- a/src/third_party/emhash/README.ninja +++ b/src/third_party/emhash/README.ninja @@ -4,4 +4,4 @@ URL: https://github.com/ktprime/emhash Copyright: Copyright (c) 2021-2024 Huang Yuanbing & bailuzhou AT 163.com SPDX-License-Identifier: MIT Local changes: - - None. + - Added includes for _mm_prefetch on MinGW. diff --git a/src/third_party/emhash/hash_table8.hpp b/src/third_party/emhash/hash_table8.hpp index ba121cbdfc..689185edac 100644 --- a/src/third_party/emhash/hash_table8.hpp +++ b/src/third_party/emhash/hash_table8.hpp @@ -58,6 +58,10 @@ _etail = bucket; \ _index[bucket] = {bucket, _num_filled++ | ((size_type)(key_hash) & ~_mask)} +#if _WIN32 && defined(_M_IX86) +#include +#endif + namespace emhash8 { struct DefaultPolicy { @@ -1237,7 +1241,7 @@ class HashMap // misses. This is intended to overlap with execution of calculating the hash for a key. #if __linux__ __builtin_prefetch(static_cast(ctrl)); -#elif _WIN32 +#elif _WIN32 && defined(_M_IX86) _mm_prefetch((const char*)ctrl, _MM_HINT_T0); #endif } From c3e3fb98e2987875ef0b640f168837bd301170c1 Mon Sep 17 00:00:00 2001 From: "Steinar H. Gunderson" Date: Tue, 5 Nov 2024 14:57:15 +0100 Subject: [PATCH 5/8] Fix some spelling errors in emhash8. --- src/third_party/emhash/README.ninja | 1 + src/third_party/emhash/hash_table8.hpp | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/third_party/emhash/README.ninja b/src/third_party/emhash/README.ninja index ac2c2114a6..12ead4e545 100644 --- a/src/third_party/emhash/README.ninja +++ b/src/third_party/emhash/README.ninja @@ -5,3 +5,4 @@ Copyright: Copyright (c) 2021-2024 Huang Yuanbing & bailuzhou AT 163.com SPDX-License-Identifier: MIT Local changes: - Added includes for _mm_prefetch on MinGW. + - Fixed some spelling errors to appease the linter. diff --git a/src/third_party/emhash/hash_table8.hpp b/src/third_party/emhash/hash_table8.hpp index 689185edac..36b7218381 100644 --- a/src/third_party/emhash/hash_table8.hpp +++ b/src/third_party/emhash/hash_table8.hpp @@ -1456,7 +1456,7 @@ class HashMap #endif //kick out bucket and find empty to occpuy - //it will break the orgin link and relnik again. + //it will break the origin link and relink again. 
//before: main_bucket-->prev_bucket --> bucket --> next_bucket //atfer : main_bucket-->prev_bucket --> (removed)--> new_bucket--> next_bucket size_type kickout_bucket(const size_type kmain, const size_type bucket) noexcept @@ -1556,7 +1556,7 @@ class HashMap Since Robin Hood hashing is relatively resilient to clustering (both primary and secondary), linear probing is the most cache friendly alternativeis typically used. It's the core algorithm of this hash map with highly optimization/benchmark. - normaly linear probing is inefficient with high load factor, it use a new 3-way linear + normally linear probing is inefficient with high load factor, it use a new 3-way linear probing strategy to search empty slot. from benchmark even the load factor > 0.9, it's more 2-3 timer fast than one-way search strategy. From 32e7e2385f098009b45956f74fd4ce603cd0a098 Mon Sep 17 00:00:00 2001 From: "Steinar H. Gunderson" Date: Fri, 1 Nov 2024 12:38:13 +0100 Subject: [PATCH 6/8] Switch hash to rapidhash. This is the currently fastest hash that passes SMHasher and does not require special instructions (e.g. SIMD). Like emhash8, it is liberally licensed (2-clause BSD), and we include the .h file directly. For a no-op build of Chromium (Linux, Zen 2), this reduces time spent from 4.62 to 4.22 seconds. (NOTE: This is a more difficult measurement than the previous ones, as it necessarily involves removing the entire build log and doing a clean build. However, just switching the HashMap hash takes to 4.47 seconds or so.) --- src/build_log.cc | 55 +---- src/build_log_test.cc | 12 +- src/hash_map.h | 37 +-- src/third_party/rapidhash/README.ninja | 7 + src/third_party/rapidhash/rapidhash.h | 323 +++++++++++++++++++++++++ 5 files changed, 342 insertions(+), 92 deletions(-) create mode 100644 src/third_party/rapidhash/README.ninja create mode 100755 src/third_party/rapidhash/rapidhash.h diff --git a/src/build_log.cc b/src/build_log.cc index 52c7c84f85..073d2fe81e 100644 --- a/src/build_log.cc +++ b/src/build_log.cc @@ -53,63 +53,14 @@ using namespace std; namespace { const char kFileSignature[] = "# ninja log v%d\n"; -const int kOldestSupportedVersion = 6; -const int kCurrentVersion = 6; - -// 64bit MurmurHash2, by Austin Appleby -#if defined(_MSC_VER) -#define BIG_CONSTANT(x) (x) -#else // defined(_MSC_VER) -#define BIG_CONSTANT(x) (x##LLU) -#endif // !defined(_MSC_VER) -inline -uint64_t MurmurHash64A(const void* key, size_t len) { - static const uint64_t seed = 0xDECAFBADDECAFBADull; - const uint64_t m = BIG_CONSTANT(0xc6a4a7935bd1e995); - const int r = 47; - uint64_t h = seed ^ (len * m); - const unsigned char* data = static_cast(key); - while (len >= 8) { - uint64_t k; - memcpy(&k, data, sizeof k); - k *= m; - k ^= k >> r; - k *= m; - h ^= k; - h *= m; - data += 8; - len -= 8; - } - switch (len & 7) - { - case 7: h ^= uint64_t(data[6]) << 48; - NINJA_FALLTHROUGH; - case 6: h ^= uint64_t(data[5]) << 40; - NINJA_FALLTHROUGH; - case 5: h ^= uint64_t(data[4]) << 32; - NINJA_FALLTHROUGH; - case 4: h ^= uint64_t(data[3]) << 24; - NINJA_FALLTHROUGH; - case 3: h ^= uint64_t(data[2]) << 16; - NINJA_FALLTHROUGH; - case 2: h ^= uint64_t(data[1]) << 8; - NINJA_FALLTHROUGH; - case 1: h ^= uint64_t(data[0]); - h *= m; - }; - h ^= h >> r; - h *= m; - h ^= h >> r; - return h; -} -#undef BIG_CONSTANT - +const int kOldestSupportedVersion = 7; +const int kCurrentVersion = 7; } // namespace // static uint64_t BuildLog::LogEntry::HashCommand(StringPiece command) { - return MurmurHash64A(command.str_, command.len_); + return 
rapidhash(command.str_, command.len_); } BuildLog::LogEntry::LogEntry(const string& output) diff --git a/src/build_log_test.cc b/src/build_log_test.cc index 12c2dc742c..630b1f1a92 100644 --- a/src/build_log_test.cc +++ b/src/build_log_test.cc @@ -104,7 +104,7 @@ TEST_F(BuildLogTest, FirstWriteAddsSignature) { TEST_F(BuildLogTest, DoubleEntry) { FILE* f = fopen(kTestFilename, "wb"); - fprintf(f, "# ninja log v6\n"); + fprintf(f, "# ninja log v7\n"); fprintf(f, "0\t1\t2\tout\t%" PRIx64 "\n", BuildLog::LogEntry::HashCommand("command abc")); fprintf(f, "0\t1\t2\tout\t%" PRIx64 "\n", @@ -177,7 +177,7 @@ TEST_F(BuildLogTest, ObsoleteOldVersion) { TEST_F(BuildLogTest, SpacesInOutput) { FILE* f = fopen(kTestFilename, "wb"); - fprintf(f, "# ninja log v6\n"); + fprintf(f, "# ninja log v7\n"); fprintf(f, "123\t456\t456\tout with space\t%" PRIx64 "\n", BuildLog::LogEntry::HashCommand("command")); fclose(f); @@ -200,10 +200,10 @@ TEST_F(BuildLogTest, DuplicateVersionHeader) { // build log on Windows. This shouldn't crash, and the second version header // should be ignored. FILE* f = fopen(kTestFilename, "wb"); - fprintf(f, "# ninja log v6\n"); + fprintf(f, "# ninja log v7\n"); fprintf(f, "123\t456\t456\tout\t%" PRIx64 "\n", BuildLog::LogEntry::HashCommand("command")); - fprintf(f, "# ninja log v6\n"); + fprintf(f, "# ninja log v7\n"); fprintf(f, "456\t789\t789\tout2\t%" PRIx64 "\n", BuildLog::LogEntry::HashCommand("command2")); fclose(f); @@ -252,7 +252,7 @@ struct TestDiskInterface : public DiskInterface { TEST_F(BuildLogTest, Restat) { FILE* f = fopen(kTestFilename, "wb"); - fprintf(f, "# ninja log v6\n" + fprintf(f, "# ninja log v7\n" "1\t2\t3\tout\tcommand\n"); fclose(f); std::string err; @@ -280,7 +280,7 @@ TEST_F(BuildLogTest, VeryLongInputLine) { // Ninja's build log buffer is currently 256kB. Lines longer than that are // silently ignored, but don't affect parsing of other lines. 
FILE* f = fopen(kTestFilename, "wb"); - fprintf(f, "# ninja log v6\n"); + fprintf(f, "# ninja log v7\n"); fprintf(f, "123\t456\t456\tout\tcommand start"); for (size_t i = 0; i < (512 << 10) / strlen(" more_command"); ++i) fputs(" more_command", f); diff --git a/src/hash_map.h b/src/hash_map.h index 5585b1dc51..4361c80a35 100644 --- a/src/hash_map.h +++ b/src/hash_map.h @@ -19,40 +19,9 @@ #include #include "string_piece.h" #include "util.h" -#include "third_party/emhash/hash_table8.hpp" -// MurmurHash2, by Austin Appleby -static inline -unsigned int MurmurHash2(const void* key, size_t len) { - static const unsigned int seed = 0xDECAFBAD; - const unsigned int m = 0x5bd1e995; - const int r = 24; - unsigned int h = seed ^ len; - const unsigned char* data = static_cast(key); - while (len >= 4) { - unsigned int k; - memcpy(&k, data, sizeof k); - k *= m; - k ^= k >> r; - k *= m; - h *= m; - h ^= k; - data += 4; - len -= 4; - } - switch (len) { - case 3: h ^= data[2] << 16; - NINJA_FALLTHROUGH; - case 2: h ^= data[1] << 8; - NINJA_FALLTHROUGH; - case 1: h ^= data[0]; - h *= m; - }; - h ^= h >> 13; - h *= m; - h ^= h >> 15; - return h; -} +#include "third_party/emhash/hash_table8.hpp" +#include "third_party/rapidhash/rapidhash.h" namespace std { template<> @@ -61,7 +30,7 @@ struct hash { typedef size_t result_type; size_t operator()(StringPiece key) const { - return MurmurHash2(key.str_, key.len_); + return rapidhash(key.str_, key.len_); } }; } diff --git a/src/third_party/rapidhash/README.ninja b/src/third_party/rapidhash/README.ninja new file mode 100644 index 0000000000..1d74b67c1f --- /dev/null +++ b/src/third_party/rapidhash/README.ninja @@ -0,0 +1,7 @@ +Description: Very fast, high quality, platform-independent hashing algorithm. +Version: commit 4a6b2570e868536be84800353efd92c699f37d2c +URL: https://github.com/Nicoshev/rapidhash +Copyright: Copyright (C) 2024 Nicolas De Carli, Based on 'wyhash', by Wang Yi +SPDX-License-Identifier: BSD-2-Clause +Local changes: + - Changed to UNIX line endings diff --git a/src/third_party/rapidhash/rapidhash.h b/src/third_party/rapidhash/rapidhash.h new file mode 100755 index 0000000000..463f733d85 --- /dev/null +++ b/src/third_party/rapidhash/rapidhash.h @@ -0,0 +1,323 @@ +/* + * rapidhash - Very fast, high quality, platform-independent hashing algorithm. + * Copyright (C) 2024 Nicolas De Carli + * + * Based on 'wyhash', by Wang Yi + * + * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * You can contact the author at: + * - rapidhash source repository: https://github.com/Nicoshev/rapidhash + */ + +/* + * Includes. + */ +#include +#include +#if defined(_MSC_VER) + #include + #if defined(_M_X64) && !defined(_M_ARM64EC) + #pragma intrinsic(_umul128) + #endif +#endif + +/* + * C++ macros. + * + * RAPIDHASH_INLINE can be overridden to be stronger than a hint, i.e. by adding __attribute__((always_inline)). + */ +#ifdef __cplusplus + #define RAPIDHASH_NOEXCEPT noexcept + #define RAPIDHASH_CONSTEXPR constexpr + #ifndef RAPIDHASH_INLINE + #define RAPIDHASH_INLINE inline + #endif +#else + #define RAPIDHASH_NOEXCEPT + #define RAPIDHASH_CONSTEXPR static const + #ifndef RAPIDHASH_INLINE + #define RAPIDHASH_INLINE static inline + #endif +#endif + +/* + * Protection macro, alters behaviour of rapid_mum multiplication function. + * + * RAPIDHASH_FAST: Normal behavior, max speed. + * RAPIDHASH_PROTECTED: Extra protection against entropy loss. + */ +#ifndef RAPIDHASH_PROTECTED + #define RAPIDHASH_FAST +#elif defined(RAPIDHASH_FAST) + #error "cannot define RAPIDHASH_PROTECTED and RAPIDHASH_FAST simultaneously." +#endif + +/* + * Unrolling macros, changes code definition for main hash function. + * + * RAPIDHASH_COMPACT: Legacy variant, each loop process 48 bytes. + * RAPIDHASH_UNROLLED: Unrolled variant, each loop process 96 bytes. + * + * Most modern CPUs should benefit from having RAPIDHASH_UNROLLED. + * + * These macros do not alter the output hash. + */ +#ifndef RAPIDHASH_COMPACT + #define RAPIDHASH_UNROLLED +#elif defined(RAPIDHASH_UNROLLED) + #error "cannot define RAPIDHASH_COMPACT and RAPIDHASH_UNROLLED simultaneously." +#endif + +/* + * Likely and unlikely macros. + */ +#if defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__clang__) + #define _likely_(x) __builtin_expect(x,1) + #define _unlikely_(x) __builtin_expect(x,0) +#else + #define _likely_(x) (x) + #define _unlikely_(x) (x) +#endif + +/* + * Endianness macros. + */ +#ifndef RAPIDHASH_LITTLE_ENDIAN + #if defined(_WIN32) || defined(__LITTLE_ENDIAN__) || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) + #define RAPIDHASH_LITTLE_ENDIAN + #elif defined(__BIG_ENDIAN__) || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + #define RAPIDHASH_BIG_ENDIAN + #else + #warning "could not determine endianness! Falling back to little endian." + #define RAPIDHASH_LITTLE_ENDIAN + #endif +#endif + +/* + * Default seed. + */ +#define RAPID_SEED (0xbdd89aa982704029ull) + +/* + * Default secret parameters. + */ +RAPIDHASH_CONSTEXPR uint64_t rapid_secret[3] = {0x2d358dccaa6c78a5ull, 0x8bb84b93962eacc9ull, 0x4b33a62ed433d4a3ull}; + +/* + * 64*64 -> 128bit multiply function. + * + * @param A Address of 64-bit number. + * @param B Address of 64-bit number. + * + * Calculates 128-bit C = *A * *B. + * + * When RAPIDHASH_FAST is defined: + * Overwrites A contents with C's low 64 bits. + * Overwrites B contents with C's high 64 bits. 
+ * + * When RAPIDHASH_PROTECTED is defined: + * Xors and overwrites A contents with C's low 64 bits. + * Xors and overwrites B contents with C's high 64 bits. + */ +RAPIDHASH_INLINE void rapid_mum(uint64_t *A, uint64_t *B) RAPIDHASH_NOEXCEPT { +#if defined(__SIZEOF_INT128__) + __uint128_t r=*A; r*=*B; + #ifdef RAPIDHASH_PROTECTED + *A^=(uint64_t)r; *B^=(uint64_t)(r>>64); + #else + *A=(uint64_t)r; *B=(uint64_t)(r>>64); + #endif +#elif defined(_MSC_VER) && (defined(_WIN64) || defined(_M_HYBRID_CHPE_ARM64)) + #if defined(_M_X64) + #ifdef RAPIDHASH_PROTECTED + uint64_t a, b; + a=_umul128(*A,*B,&b); + *A^=a; *B^=b; + #else + *A=_umul128(*A,*B,B); + #endif + #else + #ifdef RAPIDHASH_PROTECTED + uint64_t a, b; + b = __umulh(*A, *B); + a = *A * *B; + *A^=a; *B^=b; + #else + uint64_t c = __umulh(*A, *B); + *A = *A * *B; + *B = c; + #endif + #endif +#else + uint64_t ha=*A>>32, hb=*B>>32, la=(uint32_t)*A, lb=(uint32_t)*B, hi, lo; + uint64_t rh=ha*hb, rm0=ha*lb, rm1=hb*la, rl=la*lb, t=rl+(rm0<<32), c=t>32)+(rm1>>32)+c; + #ifdef RAPIDHASH_PROTECTED + *A^=lo; *B^=hi; + #else + *A=lo; *B=hi; + #endif +#endif +} + +/* + * Multiply and xor mix function. + * + * @param A 64-bit number. + * @param B 64-bit number. + * + * Calculates 128-bit C = A * B. + * Returns 64-bit xor between high and low 64 bits of C. + */ +RAPIDHASH_INLINE uint64_t rapid_mix(uint64_t A, uint64_t B) RAPIDHASH_NOEXCEPT { rapid_mum(&A,&B); return A^B; } + +/* + * Read functions. + */ +#ifdef RAPIDHASH_LITTLE_ENDIAN +RAPIDHASH_INLINE uint64_t rapid_read64(const uint8_t *p) RAPIDHASH_NOEXCEPT { uint64_t v; memcpy(&v, p, sizeof(uint64_t)); return v;} +RAPIDHASH_INLINE uint64_t rapid_read32(const uint8_t *p) RAPIDHASH_NOEXCEPT { uint32_t v; memcpy(&v, p, sizeof(uint32_t)); return v;} +#elif defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__clang__) +RAPIDHASH_INLINE uint64_t rapid_read64(const uint8_t *p) RAPIDHASH_NOEXCEPT { uint64_t v; memcpy(&v, p, sizeof(uint64_t)); return __builtin_bswap64(v);} +RAPIDHASH_INLINE uint64_t rapid_read32(const uint8_t *p) RAPIDHASH_NOEXCEPT { uint32_t v; memcpy(&v, p, sizeof(uint32_t)); return __builtin_bswap32(v);} +#elif defined(_MSC_VER) +RAPIDHASH_INLINE uint64_t rapid_read64(const uint8_t *p) RAPIDHASH_NOEXCEPT { uint64_t v; memcpy(&v, p, sizeof(uint64_t)); return _byteswap_uint64(v);} +RAPIDHASH_INLINE uint64_t rapid_read32(const uint8_t *p) RAPIDHASH_NOEXCEPT { uint32_t v; memcpy(&v, p, sizeof(uint32_t)); return _byteswap_ulong(v);} +#else +RAPIDHASH_INLINE uint64_t rapid_read64(const uint8_t *p) RAPIDHASH_NOEXCEPT { + uint64_t v; memcpy(&v, p, 8); + return (((v >> 56) & 0xff)| ((v >> 40) & 0xff00)| ((v >> 24) & 0xff0000)| ((v >> 8) & 0xff000000)| ((v << 8) & 0xff00000000)| ((v << 24) & 0xff0000000000)| ((v << 40) & 0xff000000000000)| ((v << 56) & 0xff00000000000000)); +} +RAPIDHASH_INLINE uint64_t rapid_read32(const uint8_t *p) RAPIDHASH_NOEXCEPT { + uint32_t v; memcpy(&v, p, 4); + return (((v >> 24) & 0xff)| ((v >> 8) & 0xff00)| ((v << 8) & 0xff0000)| ((v << 24) & 0xff000000)); +} +#endif + +/* + * Reads and combines 3 bytes of input. + * + * @param p Buffer to read from. + * @param k Length of @p, in bytes. + * + * Always reads and combines 3 bytes from memory. + * Guarantees to read each buffer position at least once. + * + * Returns a 64-bit value containing all three bytes read. + */ +RAPIDHASH_INLINE uint64_t rapid_readSmall(const uint8_t *p, size_t k) RAPIDHASH_NOEXCEPT { return (((uint64_t)p[0])<<56)|(((uint64_t)p[k>>1])<<32)|p[k-1];} + +/* + * rapidhash main function. 
+ * + * @param key Buffer to be hashed. + * @param len @key length, in bytes. + * @param seed 64-bit seed used to alter the hash result predictably. + * @param secret Triplet of 64-bit secrets used to alter hash result predictably. + * + * Returns a 64-bit hash. + */ +RAPIDHASH_INLINE uint64_t rapidhash_internal(const void *key, size_t len, uint64_t seed, const uint64_t* secret) RAPIDHASH_NOEXCEPT { + const uint8_t *p=(const uint8_t *)key; seed^=rapid_mix(seed^secret[0],secret[1])^len; uint64_t a, b; + if(_likely_(len<=16)){ + if(_likely_(len>=4)){ + const uint8_t * plast = p + len - 4; + a = (rapid_read32(p) << 32) | rapid_read32(plast); + const uint64_t delta = ((len&24)>>(len>>3)); + b = ((rapid_read32(p + delta) << 32) | rapid_read32(plast - delta)); } + else if(_likely_(len>0)){ a=rapid_readSmall(p,len); b=0;} + else a=b=0; + } + else{ + size_t i=len; + if(_unlikely_(i>48)){ + uint64_t see1=seed, see2=seed; +#ifdef RAPIDHASH_UNROLLED + while(_likely_(i>=96)){ + seed=rapid_mix(rapid_read64(p)^secret[0],rapid_read64(p+8)^seed); + see1=rapid_mix(rapid_read64(p+16)^secret[1],rapid_read64(p+24)^see1); + see2=rapid_mix(rapid_read64(p+32)^secret[2],rapid_read64(p+40)^see2); + seed=rapid_mix(rapid_read64(p+48)^secret[0],rapid_read64(p+56)^seed); + see1=rapid_mix(rapid_read64(p+64)^secret[1],rapid_read64(p+72)^see1); + see2=rapid_mix(rapid_read64(p+80)^secret[2],rapid_read64(p+88)^see2); + p+=96; i-=96; + } + if(_unlikely_(i>=48)){ + seed=rapid_mix(rapid_read64(p)^secret[0],rapid_read64(p+8)^seed); + see1=rapid_mix(rapid_read64(p+16)^secret[1],rapid_read64(p+24)^see1); + see2=rapid_mix(rapid_read64(p+32)^secret[2],rapid_read64(p+40)^see2); + p+=48; i-=48; + } +#else + do { + seed=rapid_mix(rapid_read64(p)^secret[0],rapid_read64(p+8)^seed); + see1=rapid_mix(rapid_read64(p+16)^secret[1],rapid_read64(p+24)^see1); + see2=rapid_mix(rapid_read64(p+32)^secret[2],rapid_read64(p+40)^see2); + p+=48; i-=48; + } while (_likely_(i>=48)); +#endif + seed^=see1^see2; + } + if(i>16){ + seed=rapid_mix(rapid_read64(p)^secret[2],rapid_read64(p+8)^seed^secret[1]); + if(i>32) + seed=rapid_mix(rapid_read64(p+16)^secret[2],rapid_read64(p+24)^seed); + } + a=rapid_read64(p+i-16); b=rapid_read64(p+i-8); + } + a^=secret[1]; b^=seed; rapid_mum(&a,&b); + return rapid_mix(a^secret[0]^len,b^secret[1]); +} + +/* + * rapidhash default seeded hash function. + * + * @param key Buffer to be hashed. + * @param len @key length, in bytes. + * @param seed 64-bit seed used to alter the hash result predictably. + * + * Calls rapidhash_internal using provided parameters and default secrets. + * + * Returns a 64-bit hash. + */ +RAPIDHASH_INLINE uint64_t rapidhash_withSeed(const void *key, size_t len, uint64_t seed) RAPIDHASH_NOEXCEPT { + return rapidhash_internal(key, len, seed, rapid_secret); +} + +/* + * rapidhash default hash function. + * + * @param key Buffer to be hashed. + * @param len @key length, in bytes. + * + * Calls rapidhash_withSeed using provided parameters and the default seed. + * + * Returns a 64-bit hash. + */ +RAPIDHASH_INLINE uint64_t rapidhash(const void *key, size_t len) RAPIDHASH_NOEXCEPT { + return rapidhash_withSeed(key, len, RAPID_SEED); +} From cffec3852f407094407aa39842ff54d34e088f64 Mon Sep 17 00:00:00 2001 From: "Steinar H. Gunderson" Date: Fri, 1 Nov 2024 14:00:51 +0100 Subject: [PATCH 7/8] Stop calling ftell() in a loop. ftell() must go ask the kernel for the file offset, in case someone knew the underlying file descriptor number and seeked it. 
Thus, we can save a couple hundred thousand syscalls by just caching the offset and maintaining it ourselves. This cuts another ~170ms off a no-op Chromium build. --- src/deps_log.cc | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/deps_log.cc b/src/deps_log.cc index 504799f4bf..d117d06907 100644 --- a/src/deps_log.cc +++ b/src/deps_log.cc @@ -186,15 +186,13 @@ LoadStatus DepsLog::Load(const string& path, State* state, string* err) { return LOAD_SUCCESS; } - long offset; + long offset = ftell(f); bool read_failed = false; int unique_dep_record_count = 0; int total_dep_record_count = 0; for (;;) { - offset = ftell(f); - unsigned size; - if (fread(&size, 4, 1, f) < 1) { + if (fread(&size, sizeof(size), 1, f) < 1) { if (!feof(f)) read_failed = true; break; @@ -206,6 +204,7 @@ LoadStatus DepsLog::Load(const string& path, State* state, string* err) { read_failed = true; break; } + offset += size + sizeof(size); if (is_deps) { if ((size % 4) != 0) { From c97558ac2407ad915308ee4d07bcbbad7b86145c Mon Sep 17 00:00:00 2001 From: "Steinar H. Gunderson" Date: Fri, 1 Nov 2024 15:58:35 +0100 Subject: [PATCH 8/8] Microoptimization in LoadDepsFromLog(). This cuts off another ~100 ms, most likely because the compiler doesn't have smart enough alias analysis to do the same (trivial) transformation. --- src/graph.cc | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/graph.cc b/src/graph.cc index f04ffb47c8..c1276daefb 100644 --- a/src/graph.cc +++ b/src/graph.cc @@ -740,12 +740,13 @@ bool ImplicitDepLoader::LoadDepsFromLog(Edge* edge, string* err) { return false; } - vector::iterator implicit_dep = - PreallocateSpace(edge, deps->node_count); - for (int i = 0; i < deps->node_count; ++i, ++implicit_dep) { - Node* node = deps->nodes[i]; - *implicit_dep = node; - node->AddOutEdge(edge); + Node** nodes = deps->nodes; + size_t node_count = deps->node_count; + edge->inputs_.insert(edge->inputs_.end() - edge->order_only_deps_, + nodes, nodes + node_count); + edge->implicit_deps_ += node_count; + for (size_t i = 0; i < node_count; ++i) { + nodes[i]->AddOutEdge(edge); } return true; }
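The short-key path of rapidhash_internal() in patch 6 hides its bounds reasoning in the expression ((len & 24) >> (len >> 3)). The following standalone check (an editorial sketch, not part of the patch series) spells out what that delta evaluates to:

#include <cassert>
#include <cstddef>

int main() {
  // For 4 <= len <= 16, rapidhash reads the first and last 4 bytes into `a`
  // and two more 4-byte words at offset `delta` from each end into `b`.
  // delta is 0 for len 4..7 and 4 for len 8..16, so those reads never leave
  // the buffer and together cover every byte of the key.
  for (std::size_t len = 4; len <= 16; ++len) {
    const std::size_t delta = ((len & 24) >> (len >> 3));
    assert(delta == (len < 8 ? 0u : 4u));
  }
  return 0;
}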
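Patch 7's ftell() removal follows a simple pattern: query the file position once, then account for every read ourselves. A self-contained sketch of that pattern (illustrative only; the function name and record layout are made up and are not ninja's deps log format):

#include <cstdio>

// Reads length-prefixed records from `f`, tracking the byte offset of the end
// of the last complete record without calling ftell() once per record.
long read_records(std::FILE* f) {
  long offset = std::ftell(f);        // one syscall, up front
  for (;;) {
    unsigned size;
    if (std::fread(&size, sizeof(size), 1, f) < 1)
      break;                          // EOF or read error
    char buf[1 << 16];
    if (size > sizeof(buf) || std::fread(buf, size, 1, f) < 1)
      break;                          // oversized or truncated record
    offset += size + sizeof(size);    // maintained in user space, no syscall
    // ... parse the record in buf[0..size) ...
  }
  return offset;
}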
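Patch 8 replaces a per-element copy through a preallocated iterator with a single std::vector::insert() of the whole node range, placed just before the order-only suffix of edge->inputs_. The same transformation on a plain vector (an illustrative sketch, not ninja code):

#include <cassert>
#include <cstddef>
#include <vector>

int main() {
  // Stand-ins: small ints for explicit/implicit deps, 90+ for order-only deps.
  std::vector<int> inputs = {1, 2, 3, 90, 91};
  const std::size_t order_only = 2;   // plays the role of edge->order_only_deps_
  const int deps[] = {10, 11, 12};    // nodes loaded from the deps log

  // One bulk insert just before the order-only suffix, instead of advancing
  // an iterator and assigning element by element.
  inputs.insert(inputs.end() - order_only, deps, deps + 3);

  assert((inputs == std::vector<int>{1, 2, 3, 10, 11, 12, 90, 91}));
  return 0;
}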