diff --git a/validate.go b/validate.go index 37e5370..0cd782a 100644 --- a/validate.go +++ b/validate.go @@ -2,10 +2,16 @@ package agents import ( _ "embed" + "encoding/hex" "encoding/json" "fmt" + "hash/maphash" "regexp" + "regexp/syntax" + "strconv" + "strings" "time" + "unicode" ) //go:embed crawler-user-agents.json @@ -80,31 +86,454 @@ var Crawlers = func() []Crawler { return crawlers }() -var regexps = func() []*regexp.Regexp { - regexps := make([]*regexp.Regexp, len(Crawlers)) +// analyzePattern expands a regular expression to the list of matching texts +// for plain search. The list is complete, i.e. iff a text matches the input +// pattern, then it contains at least one of the returned texts. If such a list +// can't be built, then the resulting list contains one element (main literal), +// it also returns built regexp object to run in this case. The main literal is +// a text that is contained in any matching text and is used to optimize search +// (pre-filter with this main literal before running a regexp). In the case such +// a main literal can't be found or the regexp is invalid, an error is returned. +func analyzePattern(pattern string) ([]string, *regexp.Regexp, error) { + re, err := syntax.Parse(pattern, syntax.Perl) + if err != nil { + return nil, nil, fmt.Errorf("re %q does not compile: %w", pattern, err) + } + re = re.Simplify() + + // Try to convert it to the list of literals. + const maxLiterals = 100 + literals, ok := literalizeRegexp(re, maxLiterals) + if ok { + return literals, nil, nil + } + + // Fallback to using a regexp, but we need some string serving as + // an indicator of its possible presence. + mainLiteral := findLongestCommonLiteral(re) + const minLiteralLen = 3 + if len(mainLiteral) < minLiteralLen { + return nil, nil, fmt.Errorf("re %q does not contain sufficiently long literal to serve an indicator. 
The longest literal is %q", pattern, mainLiteral) + } + + return []string{mainLiteral}, regexp.MustCompile(pattern), nil +} + +// literalizeRegexp expands a regexp to the list of matching sub-strings. +// Iff a text matches the regexp, it contains at least one of the returned +// texts. Argument maxLiterals regulates the maximum number of patterns to +// return. In case of an overflow or if it is impossible to build such a list +// from the regexp, false is returned. +func literalizeRegexp(re *syntax.Regexp, maxLiterals int) (literals []string, ok bool) { + switch re.Op { + case syntax.OpNoMatch: + return nil, true + + case syntax.OpEmptyMatch: + return []string{""}, true + + case syntax.OpLiteral: + return unwrapCase(re, []string{string(re.Rune)}, maxLiterals) + + case syntax.OpCharClass: + count := 0 + for i := 0; i < len(re.Rune); i += 2 { + first := re.Rune[i] + last := re.Rune[i+1] + count += int(last - first + 1) + } + + if count > maxLiterals { + return nil, false + } + + patterns := make([]string, 0, count) + for i := 0; i < len(re.Rune); i += 2 { + first := re.Rune[i] + last := re.Rune[i+1] + for r := first; r <= last; r++ { + patterns = append(patterns, string([]rune{r})) + } + } + + return unwrapCase(re, patterns, maxLiterals) + + case syntax.OpAnyCharNotNL, syntax.OpAnyChar: + // Not supported. + return nil, false + + case syntax.OpBeginLine, syntax.OpBeginText: + return []string{"^"}, true + + case syntax.OpEndLine, syntax.OpEndText: + return []string{"$"}, true + + case syntax.OpWordBoundary, syntax.OpNoWordBoundary: + // Not supported. + return nil, false + + case syntax.OpCapture: + subList, ok := literalizeRegexp(re.Sub[0], maxLiterals) + if !ok { + return nil, false + } + + return unwrapCase(re, subList, maxLiterals) + + case syntax.OpStar, syntax.OpPlus: + // Not supported. 
// combinations produces every concatenation that takes exactly one element
// from each row of matrix, in row order. The result is ordered with the first
// row varying slowest.
//
// An empty matrix yields the single empty string (the empty product). A matrix
// with one row returns that row itself, not a copy. If the number of
// combinations of any suffix of rows exceeds maxLiterals, the function returns
// false.
func combinations(matrix [][]string, maxLiterals int) ([]string, bool) {
	if len(matrix) == 0 {
		return []string{""}, true
	}

	// Fold the rows from right to left: results always holds the
	// combinations of the rows that have been processed so far.
	last := len(matrix) - 1
	results := matrix[last]
	if len(results) > maxLiterals {
		return nil, false
	}

	for i := last - 1; i >= 0; i-- {
		prefixes := matrix[i]

		size := len(prefixes) * len(results)
		if size > maxLiterals {
			return nil, false
		}

		combined := make([]string, 0, size)
		for _, prefix := range prefixes {
			for _, suffix := range results {
				combined = append(combined, prefix+suffix)
			}
		}
		results = combined
	}

	return results, true
}
// unwrapCase takes the regexp and the list of patterns expanded from it and
// further expands the list for a case-insensitive regexp. If re does not have
// the FoldCase flag, patterns is returned unchanged.
//
// Every rune of every pattern is replaced by each member of its simple
// case-folding orbit (see unicode.SimpleFold), which is the equivalence the
// regexp package uses for case-insensitive matching. Runes without case
// variants contribute a single variant, so they neither create duplicates nor
// count against the limit.
//
// Argument maxLiterals regulates the maximum number of patterns to return. In
// case of an overflow, false is returned.
func unwrapCase(re *syntax.Regexp, patterns []string, maxLiterals int) ([]string, bool) {
	if re.Flags&syntax.FoldCase == 0 {
		return patterns, true
	}

	results := []string{}
	for _, pattern := range patterns {
		// variants holds all spellings of the part of pattern processed so
		// far. An empty pattern therefore expands to itself.
		variants := []string{""}
		for _, r := range pattern {
			// Collect r together with all runes it folds to.
			orbit := []rune{r}
			for f := unicode.SimpleFold(r); f != r; f = unicode.SimpleFold(f) {
				orbit = append(orbit, f)
			}

			size := len(variants) * len(orbit)
			if size > maxLiterals {
				return nil, false
			}

			extended := make([]string, 0, size)
			for _, prefix := range variants {
				for _, f := range orbit {
					extended = append(extended, prefix+string(f))
				}
			}
			variants = extended
		}

		results = append(results, variants...)
		if len(results) > maxLiterals {
			return nil, false
		}
	}

	return results, true
}

// findLongestCommonLiteral finds the longest common literal in the regexp. It's
// such a string which is contained in any text matching the regexp. If such a
// literal can't be found, it returns an empty string.
//
// Anchors are reported as the literal characters "^" and "$". Any node with
// the FoldCase flag yields an empty string, as do alternations, optional and
// starred parts, character classes, any-char and word boundaries. Length is
// compared in bytes.
func findLongestCommonLiteral(re *syntax.Regexp) string {
	if re.Flags&syntax.FoldCase != 0 {
		return ""
	}

	switch re.Op {
	case syntax.OpLiteral:
		return string(re.Rune)

	case syntax.OpBeginLine, syntax.OpBeginText:
		return "^"

	case syntax.OpEndLine, syntax.OpEndText:
		return "$"

	case syntax.OpCapture, syntax.OpPlus:
		// The sub-expression occurs at least once in any match.
		return findLongestCommonLiteral(re.Sub[0])

	case syntax.OpRepeat:
		if re.Min >= 1 {
			return findLongestCommonLiteral(re.Sub[0])
		}

		return ""

	case syntax.OpConcat:
		// Every part is present in a match, so the best part wins.
		longest := ""
		for _, sub := range re.Sub {
			if candidate := findLongestCommonLiteral(sub); len(candidate) > len(longest) {
				longest = candidate
			}
		}

		return longest

	default:
		return ""
	}
}
} + + return longest + + case syntax.OpAlternate: + return "" + + default: + return "" + } +} + +type regexpPattern struct { + re *regexp.Regexp + index int +} + +type matcher struct { + replacer *strings.Replacer + regexps []regexpPattern +} + +var uniqueToken = hex.EncodeToString((&maphash.Hash{}).Sum(nil)) + +const ( + uniqueTokenLen = 2 * 8 + numLen = 5 + literalLabel = '-' + regexpLabel = '*' +) + +var m = func() matcher { + if len(uniqueToken) != uniqueTokenLen { + panic("len(uniqueToken) != uniqueTokenLen") + } + + regexps := []regexpPattern{} + oldnew := make([]string, 0, len(Crawlers)*2) + + // Put re-based patterns to the end to prevent AdsBot-Google from + // shadowing AdsBot-Google-Mobile. + var oldnew2 []string + for i, crawler := range Crawlers { - regexps[i] = regexp.MustCompile(crawler.Pattern) + literals, re, err := analyzePattern(crawler.Pattern) + if err != nil { + panic(err) + } + + label := literalLabel + num := i + if re != nil { + label = regexpLabel + num = len(regexps) + regexps = append(regexps, regexpPattern{ + re: re, + index: i, + }) + } + + replaceWith := fmt.Sprintf(" %s%c%0*d ", uniqueToken, label, numLen, num) + + for _, literal := range literals { + if re != nil { + oldnew2 = append(oldnew2, literal, replaceWith) + } else { + oldnew = append(oldnew, literal, replaceWith) + } + } + } + oldnew = append(oldnew, oldnew2...) + + // Allocate another array with regexps of exact size to save memory. + regexps2 := make([]regexpPattern, len(regexps)) + copy(regexps2, regexps) + + r := strings.NewReplacer(oldnew...) + r.Replace("") // To cause internal build process. + + return matcher{ + replacer: r, + regexps: regexps2, } - return regexps }() // Returns if User Agent string matches any of crawler patterns. func IsCrawler(userAgent string) bool { - for _, re := range regexps { - if re.MatchString(userAgent) { + // This code is mostly copy-paste of MatchingCrawlers, + // but with early exit logic, so it works a but faster. 
+ + text := "^" + userAgent + "$" + replaced := m.replacer.Replace(text) + if replaced == text { + return false + } + + for { + uniquePos := strings.Index(replaced, uniqueToken) + if uniquePos == -1 { + break + } + + start := uniquePos + uniqueTokenLen + 1 + if start+numLen >= len(replaced) { + panic("corrupt replaced: " + replaced) + } + + label := replaced[start-1] + switch label { + case literalLabel: return true + case regexpLabel: + // Rare case. Run regexp to confirm the match. + indexStr := replaced[start : start+numLen] + index, err := strconv.Atoi(indexStr) + if err != nil { + panic("corrupt replaced: " + replaced) + } + rp := m.regexps[index] + if rp.re.MatchString(userAgent) { + return true + } + default: + panic("corrupt replaced: " + replaced) } + + replaced = replaced[start+numLen:] } + return false } // Finds all crawlers matching the User Agent and returns the list of their indices in Crawlers. func MatchingCrawlers(userAgent string) []int { + text := "^" + userAgent + "$" + replaced := m.replacer.Replace(text) + if replaced == text { + return []int{} + } + indices := []int{} - for i, re := range regexps { - if re.MatchString(userAgent) { - indices = append(indices, i) + for { + uniquePos := strings.Index(replaced, uniqueToken) + if uniquePos == -1 { + break } + + start := uniquePos + uniqueTokenLen + 1 + if start+numLen >= len(replaced) { + panic("corrupt replaced: " + replaced) + } + indexStr := replaced[start : start+numLen] + index, err := strconv.Atoi(indexStr) + if err != nil { + panic("corrupt replaced: " + replaced) + } + + label := replaced[start-1] + switch label { + case literalLabel: + indices = append(indices, index) + case regexpLabel: + // Rare case. Run regexp to confirm the match. 
+ rp := m.regexps[index] + if rp.re.MatchString(userAgent) { + indices = append(indices, rp.index) + } + default: + panic("corrupt replaced: " + replaced) + } + + replaced = replaced[start+numLen:] } + return indices } diff --git a/validate_test.go b/validate_test.go index 6812ad2..7d70acb 100644 --- a/validate_test.go +++ b/validate_test.go @@ -4,9 +4,467 @@ import ( "encoding/json" "fmt" "net/http" + "reflect" + "regexp/syntax" + "sort" + "strings" "testing" ) +// TestAnalyzePattern tests analyzePattern function on many cases, including +// edge cases. +func TestAnalyzePattern(t *testing.T) { + cases := []struct { + input string + wantError string + wantPatterns []string + wantRe bool + shouldMatchRe []string + shouldNotMatchRe []string + }{ + { + input: "simple phrase", + wantPatterns: []string{"simple phrase"}, + }, + { + input: "^begin anchor", + wantPatterns: []string{"^begin anchor"}, + }, + { + input: "end anchor$", + wantPatterns: []string{"end anchor$"}, + }, + { + input: "^both anchors$", + wantPatterns: []string{"^both anchors$"}, + }, + { + input: "(alter|nation)", + wantPatterns: []string{"alter", "nation"}, + }, + { + input: "too many [aA][lL][tT][eE][rR][nN][aA][tT][iI][oO][nN][sS]", + wantPatterns: []string{"too many "}, + wantRe: true, + shouldMatchRe: []string{"too many ALTERNATIONs"}, + shouldNotMatchRe: []string{"too many combinations "}, + }, + { + input: "(alter|nation) concatenation (alter|nation)", + wantPatterns: []string{ + "alter concatenation alter", + "alter concatenation nation", + "nation concatenation alter", + "nation concatenation nation", + }, + }, + { + input: "clas[sS] of [c]haract[eiu]rs", + wantPatterns: []string{ + "clasS of characters", + "clasS of charactirs", + "clasS of characturs", + "class of characters", + "class of charactirs", + "class of characturs", + }, + }, + { + input: "ranges [0-3]x[a-c]", + wantPatterns: []string{ + "ranges 0xa", "ranges 0xb", "ranges 0xc", + "ranges 1xa", "ranges 1xb", "ranges 1xc", + 
"ranges 2xa", "ranges 2xb", "ranges 2xc", + "ranges 3xa", "ranges 3xb", "ranges 3xc", + }, + }, + { + input: "Quest?", + wantPatterns: []string{"Ques", "Quest"}, + }, + { + input: "Q?ue(st)?", + wantPatterns: []string{"Que", "Quest", "ue", "uest"}, + }, + { + input: "too many combinations [0-9][a-z]", + wantPatterns: []string{"too many combinations "}, + wantRe: true, + shouldMatchRe: []string{"too many combinations 0a"}, + shouldNotMatchRe: []string{"too many combinations "}, + }, + { + input: "negation in char class [^x]", + wantPatterns: []string{"negation in char class "}, + wantRe: true, + shouldMatchRe: []string{"negation in char class y"}, + shouldNotMatchRe: []string{"negation in char class x"}, + }, + { + input: "any char .", + wantPatterns: []string{"any char "}, + wantRe: true, + shouldMatchRe: []string{"any char x"}, + shouldNotMatchRe: []string{"any char_x"}, + }, + { + input: `word \boundary`, + wantPatterns: []string{"oundary"}, + wantRe: true, + shouldMatchRe: []string{"word oundary"}, + shouldNotMatchRe: []string{"word boundary"}, + }, + { + input: "asterisk*", + wantPatterns: []string{"asteris"}, + wantRe: true, + shouldMatchRe: []string{"asteris", "asterisk", "asteriskk"}, + shouldNotMatchRe: []string{"asterik"}, + }, + { + input: "plus+", + wantPatterns: []string{"plu"}, + wantRe: true, + shouldMatchRe: []string{"plus", "pluss"}, + shouldNotMatchRe: []string{"plu"}, + }, + { + input: "repeat{3,5}$", + wantPatterns: []string{"repeattt$", "repeatttt$", "repeattttt$"}, + }, + { + input: "repeat{1,120}$", + wantPatterns: []string{"repea"}, + wantRe: true, + shouldMatchRe: []string{"repeattt", "repeatttt", "repeattttt"}, + shouldNotMatchRe: []string{"repea5"}, + }, + { + input: "broken re[", + wantError: "does not compile", + }, + { + input: "n?o? ?l?o?n?g? 
?l?i?t?e?r?a?l?", + wantError: "does not contain sufficiently long literal", + }, + } + + for _, tc := range cases { + tc := tc + + t.Run(tc.input, func(t *testing.T) { + gotPatterns, re, err := analyzePattern(tc.input) + if tc.wantError != "" { + if err == nil { + t.Fatalf("expected to get an error, got success") + } + if !strings.Contains(err.Error(), tc.wantError) { + t.Fatalf("the error returned must contain text %q, got %q", tc.wantError, err.Error()) + } + + return + } + + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + sort.Strings(tc.wantPatterns) + sort.Strings(gotPatterns) + if !reflect.DeepEqual(tc.wantPatterns, gotPatterns) { + t.Fatalf("returned list of patterns (%#v) does not match the expected value (%#v)", gotPatterns, tc.wantPatterns) + } + + if !tc.wantRe { + if re != nil { + t.Fatalf("unexpectedly got a re") + } + + return + } + + if re == nil { + t.Fatalf("expected to get a re, got nil") + } + for _, text := range tc.shouldMatchRe { + if !re.MatchString(text) { + t.Fatalf("test %q must match against the re, but it doesn't", text) + } + } + for _, text := range tc.shouldNotMatchRe { + if re.MatchString(text) { + t.Fatalf("test %q must not match against the re, but it does", text) + } + } + }) + } +} + +// TestLiteralizeRegexp tests expansion of a regexp to a list of literals. 
+func TestLiteralizeRegexp(t *testing.T) { + cases := []struct { + input string + maxLiterals int + wantOutput []string + wantOverflow bool + }{ + { + input: "simple phrase", + maxLiterals: 100, + wantOutput: []string{"simple phrase"}, + }, + { + input: "cases [1-2x-z]", + maxLiterals: 100, + wantOutput: []string{"cases 1", "cases 2", "cases x", "cases y", "cases z"}, + }, + { + input: "[Ii]gnore case", + maxLiterals: 100, + wantOutput: []string{"Ignore case", "ignore case"}, + }, + { + input: "overflow [1-2x-z]", + maxLiterals: 2, + wantOverflow: true, + }, + } + + for _, tc := range cases { + tc := tc + + t.Run(tc.input, func(t *testing.T) { + re, err := syntax.Parse(tc.input, syntax.Perl) + if err != nil { + t.Fatalf("failed to parse regexp %q: %v", tc.input, err) + } + + gotPatterns, ok := literalizeRegexp(re, tc.maxLiterals) + if tc.wantOverflow { + if ok { + t.Fatalf("expected to get an overflow, got success") + } + + return + } + + if !ok { + t.Fatalf("unexpected overflow") + } + + sort.Strings(tc.wantOutput) + sort.Strings(gotPatterns) + if !reflect.DeepEqual(tc.wantOutput, gotPatterns) { + t.Fatalf("returned list of patterns (%#v) does not match the expected value (%#v)", gotPatterns, tc.wantOutput) + } + }) + } +} + +// TestCombinations tests combinations() function. 
+func TestCombinations(t *testing.T) { + cases := []struct { + name string + input [][]string + maxLiterals int + wantOutput []string + wantOverflow bool + }{ + { + name: "1x1", + input: [][]string{{"A"}, {"B"}}, + maxLiterals: 100, + wantOutput: []string{"AB"}, + }, + { + name: "0x1", + input: [][]string{{}, {"B"}}, + maxLiterals: 100, + wantOutput: []string{}, + }, + { + name: "1x2", + input: [][]string{{"A"}, {"1", "2"}}, + maxLiterals: 100, + wantOutput: []string{"A1", "A2"}, + }, + { + name: "2x2", + input: [][]string{{"A", "B"}, {"1", "2"}}, + maxLiterals: 100, + wantOutput: []string{"A1", "A2", "B1", "B2"}, + }, + { + name: "empty string as an option", + input: [][]string{{"A", ""}, {"1", "2"}}, + maxLiterals: 100, + wantOutput: []string{"A1", "A2", "1", "2"}, + }, + { + name: "overflow", + input: [][]string{{"A", "B"}, {"1", "2"}}, + maxLiterals: 3, + wantOverflow: true, + }, + } + + for _, tc := range cases { + tc := tc + + t.Run(tc.name, func(t *testing.T) { + gotPatterns, ok := combinations(tc.input, tc.maxLiterals) + if tc.wantOverflow { + if ok { + t.Fatalf("expected to get an overflow, got success") + } + + return + } + + if !ok { + t.Fatalf("unexpected overflow") + } + + sort.Strings(tc.wantOutput) + sort.Strings(gotPatterns) + if !reflect.DeepEqual(tc.wantOutput, gotPatterns) { + t.Fatalf("returned list of patterns (%#v) does not match the expected value (%#v)", gotPatterns, tc.wantOutput) + } + }) + } +} + +// TestUnwrapCase tests unwrapping literals of case-insensitive regexps. 
+func TestUnwrapCase(t *testing.T) { + cases := []struct { + name string + ignoreCase bool + inputPatterns []string + maxLiterals int + wantOutput []string + wantOverflow bool + }{ + { + name: "simple phrase", + inputPatterns: []string{"simple phrase"}, + maxLiterals: 100, + wantOutput: []string{"simple phrase"}, + }, + { + name: "ignore case", + ignoreCase: true, + inputPatterns: []string{"i"}, + maxLiterals: 100, + wantOutput: []string{"i", "I"}, + }, + { + name: "ignore case two letters", + ignoreCase: true, + inputPatterns: []string{"ic"}, + maxLiterals: 100, + wantOutput: []string{"IC", "Ic", "iC", "ic"}, + }, + { + name: "ignore case two words", + ignoreCase: true, + inputPatterns: []string{"i", "c"}, + maxLiterals: 100, + wantOutput: []string{"C", "I", "c", "i"}, + }, + { + name: "ignore case overflow", + ignoreCase: true, + inputPatterns: []string{"long text"}, + maxLiterals: 100, + wantOverflow: true, + }, + } + + for _, tc := range cases { + tc := tc + + t.Run(tc.name, func(t *testing.T) { + re := &syntax.Regexp{} + if tc.ignoreCase { + re.Flags = syntax.FoldCase + } + + gotPatterns, ok := unwrapCase(re, tc.inputPatterns, tc.maxLiterals) + if tc.wantOverflow { + if ok { + t.Fatalf("expected to get an overflow, got success") + } + + return + } + + if !ok { + t.Fatalf("unexpected overflow") + } + + sort.Strings(tc.wantOutput) + sort.Strings(gotPatterns) + if !reflect.DeepEqual(tc.wantOutput, gotPatterns) { + t.Fatalf("returned list of patterns (%#v) does not match the expected value (%#v)", gotPatterns, tc.wantOutput) + } + }) + } +} + +// TestFindLongestCommonLiteral tests finding longest literal in a regexp. 
+func TestFindLongestCommonLiteral(t *testing.T) { + cases := []struct { + input string + wantOutput string + }{ + { + input: "simple phrase", + wantOutput: "simple phrase", + }, + { + input: "simple (phrase)?", + wantOutput: "simple ", + }, + { + input: "[iI]", + wantOutput: "", + }, + { + input: "[i]b", + wantOutput: "ib", + }, + { + input: "simple (phrase)+", + wantOutput: "simple ", + }, + { + input: "a*", + wantOutput: "", + }, + { + input: "(abc)|(ab)", + wantOutput: "", + }, + } + + for _, tc := range cases { + tc := tc + + t.Run(tc.input, func(t *testing.T) { + re, err := syntax.Parse(tc.input, syntax.Perl) + if err != nil { + t.Fatalf("failed to parse regexp %q: %v", tc.input, err) + } + + gotOutput := findLongestCommonLiteral(re) + + if gotOutput != tc.wantOutput { + t.Fatalf("returned value (%q) does not match the expected value (%q)", gotOutput, tc.wantOutput) + } + }) + } +} + func contains(list []int, value int) bool { for _, elem := range list { if elem == value {