Skip to content

Commit

Permalink
Merge branch 'v28' into fieldwise_token_separators
Browse files Browse the repository at this point in the history
  • Loading branch information
kishorenc authored Dec 20, 2024
2 parents 6e09c52 + 49d746d commit afcfb24
Show file tree
Hide file tree
Showing 18 changed files with 3,389 additions and 1,254 deletions.
425 changes: 379 additions & 46 deletions include/collection.h

Large diffs are not rendered by default.

5 changes: 4 additions & 1 deletion include/collection_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@ struct locked_resource_view_t {
ResourceType* _resource;
};


// Singleton for managing the meta information of all collections and for housekeeping
class CollectionManager {
private:
Expand Down Expand Up @@ -198,6 +197,10 @@ class CollectionManager {
std::string& results_json_str,
uint64_t start_ts);

static Option<bool> do_union(std::map<std::string, std::string>& req_params,
std::vector<nlohmann::json>& embedded_params_vec, nlohmann::json searches,
nlohmann::json& response, uint64_t start_ts);

static bool parse_sort_by_str(std::string sort_by_str, std::vector<sort_by>& sort_fields);

// symlinks
Expand Down
4 changes: 3 additions & 1 deletion include/core_api_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,4 +41,6 @@ struct export_state_t: public req_state_t {
};

Option<bool> stateful_remove_docs(deletion_state_t* deletion_state, size_t batch_size, bool& done);
Option<bool> stateful_export_docs(export_state_t* export_state, size_t batch_size, bool& done);
Option<bool> stateful_export_docs(export_state_t* export_state, size_t batch_size, bool& done);
Option<bool> multi_search_validate_and_add_params(std::map<std::string, std::string>& req_params,
nlohmann::json& search_params, const bool& is_conversation);
22 changes: 22 additions & 0 deletions include/field.h
Original file line number Diff line number Diff line change
Expand Up @@ -522,6 +522,22 @@ struct sort_random_t {
};

struct sort_by {
/// Classifies what a sort_by clause sorts on.
/// Used to make sure that the different searches of a Union sort_by the same type of field/expression.
/// Note: `string_field` is the first enumerator (value 0), so a value-initialized member such as
/// `sort_by_type_t type{}` starts out as `string_field` until a constructor or assignment overrides it.
enum sort_by_type_t {
string_field,
int32_field,
int64_field,
float_field,
bool_field,
geopoint_field,
eval_expression,
join_expression,
text_match,
random_order,
vector_search,
insertion_order
};

enum missing_values_t {
first,
last,
Expand Down Expand Up @@ -570,6 +586,8 @@ struct sort_by {
float decay_val = 0.5f;
sort_by_params_t sort_by_param = none;

sort_by_type_t type{};

sort_by(const std::string & name, const std::string & order):
name(name), order(order), text_match_buckets(0), geopoint(0), exclude_radius(0), geo_precision(0),
missing_values(normal) {
Expand All @@ -580,13 +598,15 @@ struct sort_by {
geo_precision(0), missing_values(normal) {
name = sort_field_const::eval;
eval.scores = std::move(scores);
type = eval_expression;
}

/// Constructs a sort_by clause with explicit text-match bucket and geo parameters.
/// Each argument is copied into the member of the same name and `missing_values` is set to `normal`.
sort_by(const std::string &name, const std::string &order, uint32_t text_match_buckets, int64_t geopoint,
uint32_t exclude_radius, uint32_t geo_precision) :
name(name), order(order), text_match_buckets(text_match_buckets),
geopoint(geopoint), exclude_radius(exclude_radius), geo_precision(geo_precision),
missing_values(normal) {
// Every clause built through this overload is classified as a geopoint sort.
type = geopoint_field;
}

sort_by(const sort_by& other) {
Expand All @@ -610,6 +630,7 @@ struct sort_by {
scale = other.scale;
offset = other.offset;
decay_val = other.decay_val;
type = other.type;
}

sort_by& operator=(const sort_by& other) {
Expand All @@ -628,6 +649,7 @@ struct sort_by {
eval = other.eval;
reference_collection_name = other.reference_collection_name;
nested_join_collection_names = other.nested_join_collection_names;
type = other.type;
return *this;
}

Expand Down
129 changes: 72 additions & 57 deletions include/index.h

Large diffs are not rendered by default.

126 changes: 91 additions & 35 deletions include/topster.h
Original file line number Diff line number Diff line change
Expand Up @@ -117,33 +117,103 @@ struct KV {
delete [] query_indices;
query_indices = nullptr;
}

static bool is_greater(const KV* i, const KV* j) {
return std::tie(i->scores[0], i->scores[1], i->scores[2], i->key) >
std::tie(j->scores[0], j->scores[1], j->scores[2], j->key);
}

/// Returns true when `i` orders strictly before `j`.
/// Entries are compared lexicographically on scores[0], scores[1], scores[2] and finally key.
static bool is_smaller(const struct KV* i, const struct KV* j) {
    const auto rank_of = [](const struct KV* entry) {
        return std::tie(entry->scores[0], entry->scores[1], entry->scores[2], entry->key);
    };

    return rank_of(i) < rank_of(j);
}

/// Orders two groups by their first element only: returns true when `i[0]` orders strictly after `j[0]`
/// (lexicographic comparison on scores[0], scores[1], scores[2], key).
/// Both vectors are indexed at 0 without a size check, so callers must pass non-empty groups.
static bool is_greater_kv_group(const std::vector<KV*>& i, const std::vector<KV*>& j) {
    const KV* lhs_head = i[0];
    const KV* rhs_head = j[0];

    const auto lhs_rank = std::tie(lhs_head->scores[0], lhs_head->scores[1], lhs_head->scores[2], lhs_head->key);
    const auto rhs_rank = std::tie(rhs_head->scores[0], rhs_head->scores[1], rhs_head->scores[2], rhs_head->key);

    return lhs_rank > rhs_rank;
}

/// Key extractor for plain KV entries: the lookup key is the entry's own `key` member.
static constexpr uint64_t get_key(const KV* kv) {
    return (*kv).key;
}
};

/// A KV that additionally carries a `search_index`.
/// NOTE(review): presumably the position of the originating search within a union request — confirm with callers.
struct Union_KV : public KV {
    uint32_t search_index{};

    Union_KV() = default;

    /// Builds an entry from `kv` and tags it with `search_index`.
    /// `kv` is modified: its reference_filter_results are moved into the new entry.
    Union_KV(KV& kv, uint32_t search_index)
        : KV(kv.query_index, kv.key, kv.distinct_key, kv.match_score_index, kv.scores),
          search_index(search_index) {
        reference_filter_results = std::move(kv.reference_filter_results);
    }

    /// Move assignment: copies the search index, then delegates the rest to KV's move assignment.
    Union_KV& operator=(Union_KV&& kv) noexcept {
        if (this == &kv) {
            return *this;
        }

        search_index = kv.search_index;
        KV::operator=(std::move(kv));
        return *this;
    }

    /// Assignment from a non-const lvalue: copies the search index, then delegates to KV's lvalue assignment.
    Union_KV& operator=(Union_KV& kv) noexcept {
        if (this == &kv) {
            return *this;
        }

        search_index = kv.search_index;
        KV::operator=(kv);
        return *this;
    }

    /// Returns true when `i` orders strictly after `j`, comparing lexicographically on
    /// scores[0], scores[1], scores[2], search_index and finally key.
    static bool is_greater(const Union_KV* i, const Union_KV* j) {
        const auto rank_of = [](const Union_KV* entry) {
            return std::tie(entry->scores[0], entry->scores[1], entry->scores[2],
                            entry->search_index, entry->key);
        };

        return rank_of(i) > rank_of(j);
    }

    /// Returns true when `i` orders strictly before `j`, using the same fields as is_greater.
    static bool is_smaller(const Union_KV* i, const Union_KV* j) {
        const auto rank_of = [](const Union_KV* entry) {
            return std::tie(entry->scores[0], entry->scores[1], entry->scores[2],
                            entry->search_index, entry->key);
        };

        return rank_of(i) < rank_of(j);
    }

    /// Key extractor: an entry is identified by the pair (search_index, key).
    static constexpr std::pair<uint32_t, uint64_t> get_key(const Union_KV* union_kv) {
        return std::pair<uint32_t, uint64_t>(union_kv->search_index, union_kv->key);
    }
};

/// Hash functor for std::pair, usable as the hasher of unordered containers keyed by a pair.
///
/// The two member hashes are mixed with an order-sensitive combine step instead of a plain XOR.
/// A plain XOR is symmetric and self-cancelling: with the identity integer hashes used by the common
/// standard libraries, (1, 1) and (2, 2) both hash to 0, and (0, 1) collides with (1, 0).
struct pair_hash {
    /// @param pair the pair to hash; std::hash must be specialised for both member types.
    /// @return a combined hash of `pair.first` and `pair.second`.
    template <class T1, class T2>
    std::size_t operator() (const std::pair<T1, T2> &pair) const {
        std::size_t seed = std::hash<T1>()(pair.first);
        const std::size_t second_hash = std::hash<T2>()(pair.second);

        // Boost-style hash_combine: the shifts make the result depend on the order of the members.
        seed ^= second_hash + 0x9e3779b9 + (seed << 6) + (seed >> 2);
        return seed;
    }
};

/*
* Remembers the max-K elements seen so far using a min-heap
*/
template <typename T, typename K = uint64_t, typename H = std::hash<uint64_t>, const auto& get_key = KV::get_key,
const auto& is_greater = KV::is_greater, const auto& is_smaller = KV::is_smaller>
struct Topster {
const uint32_t MAX_SIZE;
uint32_t size;

KV *data;
KV** kvs;

std::unordered_map<uint64_t, KV*> kv_map;
T *data;
T** kvs;

spp::sparse_hash_set<uint64_t> group_doc_seq_ids;
std::unordered_map<K, T*, H> map;

spp::sparse_hash_map<uint64_t, Topster*> group_kv_map;
size_t distinct;
spp::sparse_hash_set<uint64_t> group_doc_seq_ids;
spp::sparse_hash_map<uint64_t, Topster<T, K, H, get_key, is_greater, is_smaller>*> group_kv_map;

explicit Topster(size_t capacity): Topster(capacity, 0) {
}

explicit Topster(size_t capacity, size_t distinct): MAX_SIZE(capacity), size(0), distinct(distinct) {
// we allocate data first to get a memory block whose indices are then assigned to `kvs`
// we use separate **kvs for easier pointer swaps
data = new KV[capacity];
kvs = new KV*[capacity];
data = new T[capacity];
kvs = new T*[capacity];

for(size_t i=0; i<capacity; i++) {
data[i].match_score_index = 0;
Expand All @@ -168,8 +238,8 @@ struct Topster {
group_kv_map.clear();
}

static inline void swapMe(KV** a, KV** b) {
KV *temp = *a;
static inline void swapMe(T** a, T** b) {
T *temp = *a;
*a = *b;
*b = temp;

Expand All @@ -178,7 +248,7 @@ struct Topster {
(*b)->array_index = a_index;
}

int add(KV* kv) {
int add(T* kv) {
/*LOG(INFO) << "kv_map size: " << kv_map.size() << " -- kvs[0]: " << kvs[0]->scores[kvs[0]->match_score_index];
for(auto& mkv: kv_map) {
LOG(INFO) << "kv key: " << mkv.first << " => " << mkv.second->scores[mkv.second->match_score_index];
Expand Down Expand Up @@ -210,7 +280,7 @@ struct Topster {
if(kvs_it != group_kv_map.end()) {
kvs_it->second->add(kv);
} else {
Topster* g_topster = new Topster(distinct, 0);
auto g_topster = new Topster<T, K, H, get_key, is_greater, is_smaller>(distinct, 0);
g_topster->add(kv);
group_kv_map.insert({kv->distinct_key, g_topster});
}
Expand All @@ -220,8 +290,8 @@ struct Topster {
} else { // not distinct
//LOG(INFO) << "Searching for key: " << kv->key;

const auto& found_it = kv_map.find(kv->key);
bool is_duplicate_key = (found_it != kv_map.end());
const auto& found_it = map.find(get_key(kv));
bool is_duplicate_key = (found_it != map.end());

/*
is_duplicate_key: SIFT_DOWN regardless of `size`.
Expand All @@ -232,7 +302,7 @@ struct Topster {

if(is_duplicate_key) {
// Need to check if kv is greater than existing duplicate kv.
KV* existing_kv = found_it->second;
auto existing_kv = found_it->second;
//LOG(INFO) << "existing_kv: " << existing_kv->key << " -> " << existing_kv->match_score;

bool smaller_than_existing = is_smaller(kv, existing_kv);
Expand All @@ -244,7 +314,7 @@ struct Topster {

// replace existing kv and sift down
heap_op_index = existing_kv->array_index;
kv_map.erase(kvs[heap_op_index]->key);
map.erase(get_key(kvs[heap_op_index]));
} else { // not duplicate

if(size < MAX_SIZE) {
Expand All @@ -257,12 +327,12 @@ struct Topster {
// we have to replace min heap element since array is full
SIFT_DOWN = true;
heap_op_index = 0;
kv_map.erase(kvs[heap_op_index]->key);
map.erase(get_key(kvs[heap_op_index]));
}
}

// kv will be copied into the pointer at heap_op_index
kv_map.emplace(kv->key, kvs[heap_op_index]);
map.emplace(get_key(kv), kvs[heap_op_index]);
}

// we have to replace the existing element in the heap and sift down
Expand Down Expand Up @@ -303,21 +373,6 @@ struct Topster {
return ret;
}

static bool is_greater(const struct KV* i, const struct KV* j) {
return std::tie(i->scores[0], i->scores[1], i->scores[2], i->key) >
std::tie(j->scores[0], j->scores[1], j->scores[2], j->key);
}

static bool is_smaller(const struct KV* i, const struct KV* j) {
return std::tie(i->scores[0], i->scores[1], i->scores[2], i->key) <
std::tie(j->scores[0], j->scores[1], j->scores[2], j->key);
}

static bool is_greater_kv_group(const std::vector<KV*>& i, const std::vector<KV*>& j) {
return std::tie(i[0]->scores[0], i[0]->scores[1], i[0]->scores[2], i[0]->key) >
std::tie(j[0]->scores[0], j[0]->scores[1], j[0]->scores[2], j[0]->key);
}

// topster must be sorted before iterated upon to remove dead array entries
void sort() {
if(!distinct) {
Expand All @@ -337,7 +392,8 @@ struct Topster {
return kvs[index]->distinct_key;
}

KV* getKV(uint32_t index) {
T* getKV(uint32_t index) {
return kvs[index];
}
};
};

Loading

0 comments on commit afcfb24

Please sign in to comment.