Implement --debug-rules; Fix indexSetToRule() with mixed special and normal sets; Bump some indexes to 64 bit to combat collisions
GrammarSoft · Aug 12, 2024 · ca98959 · ca98959
1 parent 16e34b3
commit ca98959
Show file tree

Hide file tree

Showing 12 changed files with 83 additions and 24 deletions.
diff --git a/src/Grammar.cpp b/src/Grammar.cpp
@@ -1126,7 +1126,6 @@ inline void trie_indexToRule(const trie_t& trie, Grammar& grammar, uint32_t r) {
 void Grammar::indexSetToRule(uint32_t r, Set* s) {
 	if (s->type & (ST_SPECIAL | ST_TAG_UNIFY)) {
 		indexTagToRule(tag_any, r);
-		return;
 	}
 
 	trie_indexToRule(s->trie, *this, r);

diff --git a/src/GrammarApplicator.hpp b/src/GrammarApplicator.hpp
@@ -137,6 +137,7 @@ class GrammarApplicator {
 	uint32Vector sections;
 	uint32IntervalVector valid_rules;
 	uint32IntervalVector trace_rules;
+	uint32IntervalVector debug_rules;
 	uint32FlatHashMap variables;
 	uint32_t verbosity_level = 0;
 	uint32_t debug_level = 0;
@@ -279,10 +280,10 @@ class GrammarApplicator {
 	scoped_stack<unif_sets_t> ss_usets;
 	scoped_stack<uint32SortedVector> ss_u32sv;
 
-	uint32FlatHashSet index_regexp_yes;
-	uint32FlatHashSet index_regexp_no;
-	uint32FlatHashSet index_icase_yes;
-	uint32FlatHashSet index_icase_no;
+	uint64FlatHashSet index_regexp_yes;
+	uint64FlatHashSet index_regexp_no;
+	uint64FlatHashSet index_icase_yes;
+	uint64FlatHashSet index_icase_no;
 	std::vector<uint32FlatHashSet> index_readingSet_yes;
 	std::vector<uint32FlatHashSet> index_readingSet_no;
 	uint32FlatHashSet index_ruleCohort_no;
@@ -358,6 +359,34 @@ class GrammarApplicator {
 	std::deque<Reading> subs_any;
 	Reading* get_sub_reading(Reading* tr, int sub_reading);
 
+	void printDebugRule(const Rule& rule, bool target = true, bool cntx = true) { // Debug dump for one rule: writes all previous, current, and next windows to ux_stderr between BEGIN/END RULE markers that carry rule.line; target and cntx only select the MATCH/FAIL labels in the BEGIN marker.
+		static std::stringstream buf; // Function-local static: a single buffer shared by every call, emptied below before use
+
+		bool ttrace = false;
+		swapper<bool> _st(true, trace, ttrace); // NOTE(review): presumably swaps trace with ttrace (false) for this scope and restores it on exit - confirm against swapper<>
+
+		// Empty the reused buffer and reset its state flags, then emit the whole context: windows before, at, and after the current one
+		buf.str("");
+		buf.clear();
+
+		buf << "# ===== BEGIN RULE " << rule.line << (target ? " TARGET-MATCH" : " TARGET-FAIL") << (cntx ? " CONTEXT-MATCH" : " CONTEXT-FAIL") << " =====\n";
+
+		buf << "# PREVIOUS WINDOWS\n";
+		for (auto s : gWindow->previous) {
+			printSingleWindow(s, buf, true);
+		}
+		buf << "# CURRENT WINDOW\n";
+		printSingleWindow(gWindow->current, buf, true);
+		buf << "# NEXT WINDOWS\n";
+		for (auto s : gWindow->next) {
+			printSingleWindow(s, buf, true);
+		}
+
+		buf << "# ===== END RULE " << rule.line << " =====\n";
+
+		u_fprintf(ux_stderr, "%s", buf.str().c_str()); // The accumulated dump is written in a single call
+	}
+
 	template<typename T>
 	void addProfilingExample(T& item) {
 		auto& buf = profiler->buf;

diff --git a/src/GrammarApplicator_matchSet.cpp b/src/GrammarApplicator_matchSet.cpp
@@ -91,11 +91,11 @@ uint32_t GrammarApplicator::doesTagMatchRegexp(uint32_t test, const Tag& tag, bo
 	UErrorCode status = U_ZERO_ERROR;
 	int32_t gc = uregex_groupCount(tag.regexp, &status);
 	uint32_t match = 0;
-	uint32_t ih = hash_value(tag.hash, test);
-	if (!bypass_index && index_matches(index_regexp_no, ih)) {
+	auto ih = (UI64(tag.hash) << 32) | test;
+	if (!bypass_index && index_regexp_no.contains(ih)) {
 		match = 0;
 	}
-	else if (!bypass_index && gc == 0 && index_matches(index_regexp_yes, ih)) {
+	else if (!bypass_index && gc == 0 && index_regexp_yes.contains(ih)) {
 		match = test;
 	}
 	else {
@@ -130,11 +130,11 @@ uint32_t GrammarApplicator::doesTagMatchRegexp(uint32_t test, const Tag& tag, bo
 
 uint32_t GrammarApplicator::doesTagMatchIcase(uint32_t test, const Tag& tag, bool bypass_index) {
 	uint32_t match = 0;
-	uint32_t ih = hash_value(tag.hash, test);
-	if (!bypass_index && index_matches(index_icase_no, ih)) {
+	auto ih = (UI64(tag.hash) << 32) | test;
+	if (!bypass_index && index_icase_no.contains(ih)) {
 		match = 0;
 	}
-	else if (!bypass_index && index_matches(index_icase_yes, ih)) {
+	else if (!bypass_index && index_icase_yes.contains(ih)) {
 		match = test;
 	}
 	else {
@@ -157,11 +157,11 @@ uint32_t GrammarApplicator::doesRegexpMatchLine(const Reading& reading, const Ta
 	UErrorCode status = U_ZERO_ERROR;
 	int32_t gc = uregex_groupCount(tag.regexp, &status);
 	uint32_t match = 0;
-	uint32_t ih = hash_value(reading.tags_string_hash, tag.hash);
-	if (!bypass_index && index_matches(index_regexp_no, ih)) {
+	auto ih = (UI64(reading.tags_string_hash) << 32) | tag.hash;
+	if (!bypass_index && index_regexp_no.contains(ih)) {
 		match = 0;
 	}
-	else if (!bypass_index && gc == 0 && index_matches(index_regexp_yes, ih)) {
+	else if (!bypass_index && gc == 0 && index_regexp_yes.contains(ih)) {
 		match = reading.tags_string_hash;
 	}
 	else {
@@ -671,10 +671,10 @@ bool GrammarApplicator::doesSetMatchReading(const Reading& reading, const uint32
 	// Only 30% of tests get past this.
 	// ToDo: This is not good enough...while numeric tags are special, their failures can be indexed.
 	if (!bypass_index && !unif_mode) {
-		if (index_readingSet_no[set].find(reading.hash) != index_readingSet_no[set].end()) {
+		if (index_readingSet_no[set].contains(reading.hash)) {
 			return false;
 		}
-		if (index_readingSet_yes[set].find(reading.hash) != index_readingSet_yes[set].end()) {
+		if (index_readingSet_yes[set].contains(reading.hash)) {
 			return true;
 		}
 	}

diff --git a/src/GrammarApplicator_runContextualTest.cpp b/src/GrammarApplicator_runContextualTest.cpp
@@ -639,7 +639,7 @@ Cohort* GrammarApplicator::runDependencyTest(SingleWindow* sWindow, Cohort* curr
 
 	// ToDo: Now that dep_deep_seen is a composite, investigate all .clear() to see if they're needed
 	if (test->pos & POS_DEP_DEEP) {
-		if (index_matches(dep_deep_seen, std::make_pair(test->hash, current->global_number))) {
+		if (dep_deep_seen.contains(std::make_pair(test->hash, current->global_number))) {
 			return 0;
 		}
 		dep_deep_seen.insert(std::make_pair(test->hash, current->global_number));

diff --git a/src/GrammarApplicator_runRules.cpp b/src/GrammarApplicator_runRules.cpp
@@ -475,7 +475,7 @@ bool GrammarApplicator::runSingleRule(SingleWindow& current, const Rule& rule, R
 		// Check if on previous runs the rule did not match this cohort, and skip if that is the case.
 		// This cache is cleared if any rule causes any state change in the window.
 		uint32_t ih = hash_value(rule.number, cohort->global_number);
-		if (index_matches(index_ruleCohort_no, ih)) {
+		if (index_ruleCohort_no.contains(ih)) {
 			continue;
 		}
 		index_ruleCohort_no.insert(ih);
@@ -640,8 +640,15 @@ bool GrammarApplicator::runSingleRule(SingleWindow& current, const Rule& rule, R
 			for (auto r = cohort->readings[i]; r; r = r->next) {
 				r->active = true;
 			}
+			if (rule.line == 2746) { // NOTE(review): hard-coded grammar line number; looks like a leftover breakpoint anchor - consider removing before release
+				cohort = cohort; // Self-assignment: no visible effect
+			}
+			rule_target = cohort;
 			// Actually check if the reading is a valid target. First check if rule target matches...
 			if (rule.target && doesSetMatchReading(*reading, rule.target, (set.type & (ST_CHILD_UNIFY | ST_SPECIAL)) != 0)) {
+				if (rule.line == 2746) { // NOTE(review): same hard-coded line number, here inside the target-match branch; looks like a leftover breakpoint anchor - consider removing before release
+					cohort = cohort; // Self-assignment: no visible effect
+				}
 				bool regex_prop = true;
 				if (orz != context_stack.back().regexgrp_ct) {
 					did_test = false;
@@ -724,6 +731,9 @@ bool GrammarApplicator::runSingleRule(SingleWindow& current, const Rule& rule, R
 							addProfilingExample(r);
 						}
 					}
+					if (!debug_rules.empty() && debug_rules.contains(rule.line)) {
+						printDebugRule(rule);
+					}
 
 					if (regex_prop && i && !regexgrps_c.empty()) {
 						for (auto z = i; z > 0; --z) {
@@ -738,6 +748,9 @@ bool GrammarApplicator::runSingleRule(SingleWindow& current, const Rule& rule, R
 				}
 				else {
 					context_stack.back().regexgrp_ct = orz;
+					if (!debug_rules.empty() && debug_rules.contains(rule.line)) {
+						printDebugRule(rule, true, false);
+					}
 				}
 				++num_iff;
 			}
@@ -747,6 +760,9 @@ bool GrammarApplicator::runSingleRule(SingleWindow& current, const Rule& rule, R
 					Profiler::Key k{ ET_RULE, rule.number + 1 };
 					++profiler->entries[k].num_fail;
 				}
+				if (!debug_rules.empty() && debug_rules.contains(rule.line)) {
+					printDebugRule(rule, false, false);
+				}
 			}
 			readings_plain.insert(std::make_pair(reading->hash_plain, reading));
 			for (auto r = cohort->readings[i]; r; r = r->next) {

diff --git a/src/flat_unordered_map.hpp b/src/flat_unordered_map.hpp
@@ -235,6 +235,10 @@ class flat_unordered_map {
 		return (find(t) != end());
 	}
 
+	bool contains(T t) const { // Membership test: true when find(t) locates t, false when find(t) returns end()
+		return (find(t) != end());
+	}
+
 	V& operator[](const T& t) {
 		assert(t != res_empty && t != res_del && "Key cannot be res_empty or res_del!");
 

diff --git a/src/flat_unordered_set.hpp b/src/flat_unordered_set.hpp
@@ -229,6 +229,10 @@ class flat_unordered_set {
 		return (find(t) != end());
 	}
 
+	bool contains(T t) const { // Membership test: true when find(t) locates t, false when find(t) returns end()
+		return (find(t) != end());
+	}
+
 	const_iterator begin() const {
 		if (size_ == 0) {
 			return end();
@@ -326,6 +330,7 @@ class flat_unordered_set {
 };
 
 using uint32FlatHashSet = flat_unordered_set<uint32_t>;
+using uint64FlatHashSet = flat_unordered_set<uint64_t>;
 }
 
 #endif
diff --git a/src/inlines.hpp b/src/inlines.hpp
@@ -475,11 +475,6 @@ inline bool is_cg3b(const S& s) {
 	return (s[0] == 'C' && s[1] == 'G' && s[2] == '3' && s[3] == 'B');
 }
 
-template<typename Cont, typename VT>
-inline bool index_matches(const Cont& index, const VT& entry) {
-	return (index.find(entry) != index.end());
-}
-
 inline void insert_if_exists(boost::dynamic_bitset<>& cont, const boost::dynamic_bitset<>* other) {
 	if (other && !other->empty()) {
 		cont.resize(std::max(cont.size(), other->size()));

diff --git a/src/main.cpp b/src/main.cpp
@@ -511,6 +511,11 @@ void GAppSetOpts(GrammarApplicator& applicator, UConverter* conv) {
 			}
 		}
 	}
+	if (options[DEBUG_RULES].doesOccur) {
+		if (!options[DEBUG_RULES].value.empty()) {
+			GAppSetOpts_ranged(options[DEBUG_RULES].value.c_str(), applicator.debug_rules, false);
+		}
+	}
 	if (options[VERBOSE].doesOccur) {
 		if (!options[VERBOSE].value.empty()) {
 			applicator.verbosity_level = std::stoul(options[VERBOSE].value);

diff --git a/src/options.hpp b/src/options.hpp
@@ -41,6 +41,7 @@ enum OPTIONS {
 	NRULES,
 	NRULES_INV,
 	DODEBUG,
+	DEBUG_RULES,
 	VERBOSE,
 	QUIET,
 	VISLCGCOMPAT,
@@ -108,6 +109,7 @@ std::array<UOption,NUM_OPTIONS> options{
 	UOption{"nrules",                0, UOPT_REQUIRES_ARG, "a regex for which rule names to parse/run; defaults to all rules"},
 	UOption{"nrules-v",              0, UOPT_REQUIRES_ARG, "a regex for which rule names not to parse/run"},
 	UOption{"debug",               'd', UOPT_OPTIONAL_ARG, "enables debug output (very noisy)"},
+	UOption{"debug-rules",           0, UOPT_OPTIONAL_ARG, "number or ranges of rules to debug; defaults to all rules"},
 	UOption{"verbose",             'v', UOPT_OPTIONAL_ARG, "increases verbosity"},
 	UOption{"quiet",                 0, UOPT_NO_ARG,       "squelches warnings (same as -v 0)"},
 	UOption{"vislcg-compat",       '2', UOPT_NO_ARG,       "enables compatibility mode for older CG-2 and vislcg grammars"},

diff --git a/src/sorted_vector.hpp b/src/sorted_vector.hpp
@@ -170,6 +170,10 @@ class sorted_vector {
 		return (find(t) != end());
 	}
 
+	bool contains(T t) const { // Membership test: true when find(t) locates t, false when find(t) returns end()
+		return (find(t) != end());
+	}
+
 	iterator begin() {
 		return elements.begin();
 	}

diff --git a/src/version.hpp b/src/version.hpp
@@ -27,7 +27,7 @@ constexpr auto CG3_COPYRIGHT_STRING = "Copyright (C) 2007-2024 GrammarSoft ApS.
 
 constexpr uint32_t CG3_VERSION_MAJOR = 1;
 constexpr uint32_t CG3_VERSION_MINOR = 4;
-constexpr uint32_t CG3_VERSION_PATCH = 16;
+constexpr uint32_t CG3_VERSION_PATCH = 17;
 constexpr uint32_t CG3_REVISION = 13898;
 constexpr uint32_t CG3_FEATURE_REV = 13898;
 constexpr uint32_t CG3_TOO_OLD = 10373;