diff --git a/src/Grammar.cpp b/src/Grammar.cpp
index a8fcb953..ee5e11e3 100644
--- a/src/Grammar.cpp
+++ b/src/Grammar.cpp
@@ -1126,7 +1126,6 @@ inline void trie_indexToRule(const trie_t& trie, Grammar& grammar, uint32_t r) {
 void Grammar::indexSetToRule(uint32_t r, Set* s) {
 	if (s->type & (ST_SPECIAL | ST_TAG_UNIFY)) {
 		indexTagToRule(tag_any, r);
-		return;
 	}
 
 	trie_indexToRule(s->trie, *this, r);
diff --git a/src/GrammarApplicator.hpp b/src/GrammarApplicator.hpp
index 9039eed7..87741afd 100644
--- a/src/GrammarApplicator.hpp
+++ b/src/GrammarApplicator.hpp
@@ -137,6 +137,7 @@ class GrammarApplicator {
 	uint32Vector sections;
 	uint32IntervalVector valid_rules;
 	uint32IntervalVector trace_rules;
+	uint32IntervalVector debug_rules;
 	uint32FlatHashMap variables;
 	uint32_t verbosity_level = 0;
 	uint32_t debug_level = 0;
@@ -279,10 +280,10 @@ class GrammarApplicator {
 	scoped_stack ss_usets;
 	scoped_stack ss_u32sv;
 
-	uint32FlatHashSet index_regexp_yes;
-	uint32FlatHashSet index_regexp_no;
-	uint32FlatHashSet index_icase_yes;
-	uint32FlatHashSet index_icase_no;
+	uint64FlatHashSet index_regexp_yes;
+	uint64FlatHashSet index_regexp_no;
+	uint64FlatHashSet index_icase_yes;
+	uint64FlatHashSet index_icase_no;
 	std::vector index_readingSet_yes;
 	std::vector index_readingSet_no;
 	uint32FlatHashSet index_ruleCohort_no;
@@ -358,6 +359,40 @@ class GrammarApplicator {
 	std::deque subs_any;
 	Reading* get_sub_reading(Reading* tr, int sub_reading);
 
+	// Debug aid: writes every window (previous, current, next) to stderr,
+	// wrapped in BEGIN/END marker lines that carry rule.line and whether the
+	// rule's target and context matched.
+	void printDebugRule(const Rule& rule, bool target = true, bool cntx = true) {
+		// Function-static buffer, reused across calls and reset below.
+		static std::stringstream buf;
+
+		// NOTE(review): presumably suspends tracing for the duration of this
+		// call; swapper is defined elsewhere - confirm.
+		bool ttrace = false;
+		swapper _st(true, trace, ttrace);
+
+		// Whole context, both before and after current window
+		buf.str("");
+		buf.clear();
+
+		buf << "# ===== BEGIN RULE " << rule.line << (target ? " TARGET-MATCH" : " TARGET-FAIL") << (cntx ? " CONTEXT-MATCH" : " CONTEXT-FAIL") << " =====\n";
+
+		buf << "# PREVIOUS WINDOWS\n";
+		for (auto s : gWindow->previous) {
+			printSingleWindow(s, buf, true);
+		}
+		buf << "# CURRENT WINDOW\n";
+		printSingleWindow(gWindow->current, buf, true);
+		buf << "# NEXT WINDOWS\n";
+		for (auto s : gWindow->next) {
+			printSingleWindow(s, buf, true);
+		}
+
+		buf << "# ===== END RULE " << rule.line << " =====\n";
+
+		u_fprintf(ux_stderr, "%s", buf.str().c_str());
+	}
+
 	template
 	void addProfilingExample(T& item) {
 		auto& buf = profiler->buf;
diff --git a/src/GrammarApplicator_matchSet.cpp b/src/GrammarApplicator_matchSet.cpp
index 8c09f0ec..ef80f3d3 100644
--- a/src/GrammarApplicator_matchSet.cpp
+++ b/src/GrammarApplicator_matchSet.cpp
@@ -91,11 +91,11 @@ uint32_t GrammarApplicator::doesTagMatchRegexp(uint32_t test, const Tag& tag, bo
 	UErrorCode status = U_ZERO_ERROR;
 	int32_t gc = uregex_groupCount(tag.regexp, &status);
 	uint32_t match = 0;
-	uint32_t ih = hash_value(tag.hash, test);
-	if (!bypass_index && index_matches(index_regexp_no, ih)) {
+	auto ih = (UI64(tag.hash) << 32) | test;
+	if (!bypass_index && index_regexp_no.contains(ih)) {
 		match = 0;
 	}
-	else if (!bypass_index && gc == 0 && index_matches(index_regexp_yes, ih)) {
+	else if (!bypass_index && gc == 0 && index_regexp_yes.contains(ih)) {
 		match = test;
 	}
 	else {
@@ -130,11 +130,11 @@ uint32_t GrammarApplicator::doesTagMatchRegexp(uint32_t test, const Tag& tag, bo
 
 uint32_t GrammarApplicator::doesTagMatchIcase(uint32_t test, const Tag& tag, bool bypass_index) {
 	uint32_t match = 0;
-	uint32_t ih = hash_value(tag.hash, test);
-	if (!bypass_index && index_matches(index_icase_no, ih)) {
+	auto ih = (UI64(tag.hash) << 32) | test;
+	if (!bypass_index && index_icase_no.contains(ih)) {
 		match = 0;
 	}
-	else if (!bypass_index && index_matches(index_icase_yes, ih)) {
+	else if (!bypass_index && index_icase_yes.contains(ih)) {
 		match = test;
 	}
 	else {
@@ -157,11 +157,11 @@ uint32_t GrammarApplicator::doesRegexpMatchLine(const Reading& reading, const Ta
 	UErrorCode status = U_ZERO_ERROR;
 	int32_t gc = uregex_groupCount(tag.regexp, &status);
 	uint32_t match = 0;
-	uint32_t ih = hash_value(reading.tags_string_hash, tag.hash);
-	if (!bypass_index && index_matches(index_regexp_no, ih)) {
+	auto ih = (UI64(reading.tags_string_hash) << 32) | tag.hash;
+	if (!bypass_index && index_regexp_no.contains(ih)) {
 		match = 0;
 	}
-	else if (!bypass_index && gc == 0 && index_matches(index_regexp_yes, ih)) {
+	else if (!bypass_index && gc == 0 && index_regexp_yes.contains(ih)) {
 		match = reading.tags_string_hash;
 	}
 	else {
@@ -671,10 +671,10 @@ bool GrammarApplicator::doesSetMatchReading(const Reading& reading, const uint32
 	// Only 30% of tests get past this.
 	// ToDo: This is not good enough...while numeric tags are special, their failures can be indexed.
 	if (!bypass_index && !unif_mode) {
-		if (index_readingSet_no[set].find(reading.hash) != index_readingSet_no[set].end()) {
+		if (index_readingSet_no[set].contains(reading.hash)) {
 			return false;
 		}
-		if (index_readingSet_yes[set].find(reading.hash) != index_readingSet_yes[set].end()) {
+		if (index_readingSet_yes[set].contains(reading.hash)) {
 			return true;
 		}
 	}
diff --git a/src/GrammarApplicator_runContextualTest.cpp b/src/GrammarApplicator_runContextualTest.cpp
index 676b9ddf..ebf04d82 100644
--- a/src/GrammarApplicator_runContextualTest.cpp
+++ b/src/GrammarApplicator_runContextualTest.cpp
@@ -639,7 +639,7 @@ Cohort* GrammarApplicator::runDependencyTest(SingleWindow* sWindow, Cohort* curr
 
 	// ToDo: Now that dep_deep_seen is a composite, investigate all .clear() to see if they're needed
 	if (test->pos & POS_DEP_DEEP) {
-		if (index_matches(dep_deep_seen, std::make_pair(test->hash, current->global_number))) {
+		if (dep_deep_seen.contains(std::make_pair(test->hash, current->global_number))) {
 			return 0;
 		}
 		dep_deep_seen.insert(std::make_pair(test->hash, current->global_number));
diff --git a/src/GrammarApplicator_runRules.cpp b/src/GrammarApplicator_runRules.cpp
index fa790440..1b7c558d 100644
--- a/src/GrammarApplicator_runRules.cpp
+++ b/src/GrammarApplicator_runRules.cpp
@@ -475,7 +475,7 @@ bool GrammarApplicator::runSingleRule(SingleWindow& current, const Rule& rule, R
 			// Check if on previous runs the rule did not match this cohort, and skip if that is the case.
 			// This cache is cleared if any rule causes any state change in the window.
 			uint32_t ih = hash_value(rule.number, cohort->global_number);
-			if (index_matches(index_ruleCohort_no, ih)) {
+			if (index_ruleCohort_no.contains(ih)) {
 				continue;
 			}
 			index_ruleCohort_no.insert(ih);
@@ -640,8 +640,9 @@ bool GrammarApplicator::runSingleRule(SingleWindow& current, const Rule& rule, R
 				for (auto r = cohort->readings[i]; r; r = r->next) {
 					r->active = true;
 				}
+				rule_target = cohort;
 				// Actually check if the reading is a valid target. First check if rule target matches...
 				if (rule.target && doesSetMatchReading(*reading, rule.target, (set.type & (ST_CHILD_UNIFY | ST_SPECIAL)) != 0)) {
 					bool regex_prop = true;
 					if (orz != context_stack.back().regexgrp_ct) {
 						did_test = false;
@@ -724,6 +725,9 @@ bool GrammarApplicator::runSingleRule(SingleWindow& current, const Rule& rule, R
 								addProfilingExample(r);
 							}
 						}
+						if (!debug_rules.empty() && debug_rules.contains(rule.line)) {
+							printDebugRule(rule);
+						}
 
 						if (regex_prop && i && !regexgrps_c.empty()) {
 							for (auto z = i; z > 0; --z) {
@@ -738,6 +742,9 @@ bool GrammarApplicator::runSingleRule(SingleWindow& current, const Rule& rule, R
 					}
 					else {
 						context_stack.back().regexgrp_ct = orz;
+						if (!debug_rules.empty() && debug_rules.contains(rule.line)) {
+							printDebugRule(rule, true, false);
+						}
 					}
 					++num_iff;
 				}
@@ -747,6 +754,9 @@ bool GrammarApplicator::runSingleRule(SingleWindow& current, const Rule& rule, R
 						Profiler::Key k{ ET_RULE, rule.number + 1 };
 						++profiler->entries[k].num_fail;
 					}
+					if (!debug_rules.empty() && debug_rules.contains(rule.line)) {
+						printDebugRule(rule, false, false);
+					}
 				}
 				readings_plain.insert(std::make_pair(reading->hash_plain, reading));
 				for (auto r = cohort->readings[i]; r; r = r->next) {
diff --git a/src/flat_unordered_map.hpp b/src/flat_unordered_map.hpp
index 6f7fd22f..429ccc6a 100644
--- a/src/flat_unordered_map.hpp
+++ b/src/flat_unordered_map.hpp
@@ -235,6 +235,10 @@ class flat_unordered_map {
 		return (find(t) != end());
 	}
 
+	bool contains(T t) const {
+		return (find(t) != end());
+	}
+
 	V& operator[](const T& t) {
 		assert(t != res_empty && t != res_del && "Key cannot be res_empty or res_del!");
 
diff --git a/src/flat_unordered_set.hpp b/src/flat_unordered_set.hpp
index 47f28edb..d0e4bdfe 100644
--- a/src/flat_unordered_set.hpp
+++ b/src/flat_unordered_set.hpp
@@ -229,6 +229,10 @@ class flat_unordered_set {
 		return (find(t) != end());
 	}
 
+	bool contains(T t) const {
+		return (find(t) != end());
+	}
+
 	const_iterator begin() const {
 		if (size_ == 0) {
 			return end();
@@ -326,6 +330,7 @@ class flat_unordered_set {
 };
 
 using uint32FlatHashSet = flat_unordered_set;
+using uint64FlatHashSet = flat_unordered_set;
 }
 
 #endif
diff --git a/src/inlines.hpp b/src/inlines.hpp
index ea59d13c..20c8583b 100644
--- a/src/inlines.hpp
+++ b/src/inlines.hpp
@@ -475,11 +475,6 @@ inline bool is_cg3b(const S& s) {
 	return (s[0] == 'C' && s[1] == 'G' && s[2] == '3' && s[3] == 'B');
 }
 
-template
-inline bool index_matches(const Cont& index, const VT& entry) {
-	return (index.find(entry) != index.end());
-}
-
 inline void insert_if_exists(boost::dynamic_bitset<>& cont, const boost::dynamic_bitset<>* other) {
 	if (other && !other->empty()) {
 		cont.resize(std::max(cont.size(), other->size()));
diff --git a/src/main.cpp b/src/main.cpp
index 90bd5879..53827503 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -511,6 +511,11 @@ void GAppSetOpts(GrammarApplicator& applicator, UConverter* conv) {
 			}
 		}
 	}
+	if (options[DEBUG_RULES].doesOccur) {
+		if (!options[DEBUG_RULES].value.empty()) {
+			GAppSetOpts_ranged(options[DEBUG_RULES].value.c_str(), applicator.debug_rules, false);
+		}
+	}
 	if (options[VERBOSE].doesOccur) {
 		if (!options[VERBOSE].value.empty()) {
 			applicator.verbosity_level = std::stoul(options[VERBOSE].value);
diff --git a/src/options.hpp b/src/options.hpp
index b89bab90..5da3e562 100644
--- a/src/options.hpp
+++ b/src/options.hpp
@@ -41,6 +41,7 @@ enum OPTIONS {
 	NRULES,
 	NRULES_INV,
 	DODEBUG,
+	DEBUG_RULES,
 	VERBOSE,
 	QUIET,
 	VISLCGCOMPAT,
@@ -108,6 +109,7 @@ std::array options{
 	UOption{"nrules", 0, UOPT_REQUIRES_ARG, "a regex for which rule names to parse/run; defaults to all rules"},
 	UOption{"nrules-v", 0, UOPT_REQUIRES_ARG, "a regex for which rule names not to parse/run"},
 	UOption{"debug", 'd', UOPT_OPTIONAL_ARG, "enables debug output (very noisy)"},
+	UOption{"debug-rules", 0, UOPT_REQUIRES_ARG, "number or ranges of rules to debug"},
 	UOption{"verbose", 'v', UOPT_OPTIONAL_ARG, "increases verbosity"},
 	UOption{"quiet", 0, UOPT_NO_ARG, "squelches warnings (same as -v 0)"},
 	UOption{"vislcg-compat", '2', UOPT_NO_ARG, "enables compatibility mode for older CG-2 and vislcg grammars"},
diff --git a/src/sorted_vector.hpp b/src/sorted_vector.hpp
index 38d2f417..334e42fe 100644
--- a/src/sorted_vector.hpp
+++ b/src/sorted_vector.hpp
@@ -170,6 +170,10 @@ class sorted_vector {
 		return (find(t) != end());
 	}
 
+	bool contains(T t) const {
+		return (find(t) != end());
+	}
+
 	iterator begin() {
 		return elements.begin();
 	}
diff --git a/src/version.hpp b/src/version.hpp
index 3ffe1351..9b484f9a 100644
--- a/src/version.hpp
+++ b/src/version.hpp
@@ -27,7 +27,7 @@ constexpr auto CG3_COPYRIGHT_STRING = "Copyright (C) 2007-2024 GrammarSoft ApS.
 
 constexpr uint32_t CG3_VERSION_MAJOR = 1;
 constexpr uint32_t CG3_VERSION_MINOR = 4;
-constexpr uint32_t CG3_VERSION_PATCH = 16;
+constexpr uint32_t CG3_VERSION_PATCH = 17;
 constexpr uint32_t CG3_REVISION = 13898;
 constexpr uint32_t CG3_FEATURE_REV = 13898;
 constexpr uint32_t CG3_TOO_OLD = 10373;