Add support for Golang, see https://pkg.go.dev/github.com/monperrus/crawler-user-agents (#348)

Co-authored-by: Martin Monperrus <[email protected]>
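With this change the repository is importable as a Go module. A minimal consumer sketch, assuming only the API introduced in this commit (the import path comes from the new go.mod; the Googlebot User Agent is an arbitrary illustrative input):

package main

import (
    "fmt"

    agents "github.com/monperrus/crawler-user-agents"
)

func main() {
    ua := "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"

    // IsCrawler reports whether the User Agent matches any known crawler pattern.
    if agents.IsCrawler(ua) {
        // MatchingCrawlers returns indices into agents.Crawlers for every match.
        for _, i := range agents.MatchingCrawlers(ua) {
            c := agents.Crawlers[i]
            fmt.Printf("matched pattern %q (%s)\n", c.Pattern, c.URL)
        }
    }
}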
Showing 6 changed files with 269 additions and 1 deletion.
@@ -23,3 +23,4 @@ jobs:
       - run: py.test -vv
       - run: python3 validate.py
       - run: php validate.php
+      - run: go test
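The new go test step assumes a Go toolchain is already present on the runner; GitHub-hosted runners ship with one preinstalled, otherwise an actions/setup-go step (not visible in this hunk) would be needed.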
@@ -0,0 +1,3 @@
+module github.com/monperrus/crawler-user-agents
+
+go 1.19
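The go 1.19 directive comfortably covers the //go:embed directive used in the new package below, which requires Go 1.16 or later.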
@@ -0,0 +1,110 @@
package agents

import (
    _ "embed"
    "encoding/json"
    "fmt"
    "regexp"
    "time"
)

//go:embed crawler-user-agents.json
var crawlersJson []byte

// Crawler contains information about one crawler.
type Crawler struct {
    // Regexp of the User Agent of the crawler.
    Pattern string `json:"pattern"`

    // Discovery date.
    AdditionDate time.Time `json:"addition_date"`

    // Official URL of the robot.
    URL string `json:"url"`

    // Examples of full User Agent strings.
    Instances []string `json:"instances"`
}

// Private type needed to convert addition_date from/to the format used in JSON.
type jsonCrawler struct {
    Pattern      string   `json:"pattern"`
    AdditionDate string   `json:"addition_date"`
    URL          string   `json:"url"`
    Instances    []string `json:"instances"`
}

const timeLayout = "2006/01/02"

func (c Crawler) MarshalJSON() ([]byte, error) {
    jc := jsonCrawler{
        Pattern:      c.Pattern,
        AdditionDate: c.AdditionDate.Format(timeLayout),
        URL:          c.URL,
        Instances:    c.Instances,
    }
    return json.Marshal(jc)
}

func (c *Crawler) UnmarshalJSON(b []byte) error {
    var jc jsonCrawler
    if err := json.Unmarshal(b, &jc); err != nil {
        return err
    }

    c.Pattern = jc.Pattern
    c.URL = jc.URL
    c.Instances = jc.Instances

    if c.Pattern == "" {
        return fmt.Errorf("empty pattern in record %s", string(b))
    }

    if jc.AdditionDate != "" {
        tim, err := time.ParseInLocation(timeLayout, jc.AdditionDate, time.UTC)
        if err != nil {
            return err
        }
        c.AdditionDate = tim
    }

    return nil
}

// Crawlers is the list of crawlers, built from the contents of crawler-user-agents.json.
var Crawlers = func() []Crawler {
    var crawlers []Crawler
    if err := json.Unmarshal(crawlersJson, &crawlers); err != nil {
        panic(err)
    }
    return crawlers
}()

var regexps = func() []*regexp.Regexp {
    regexps := make([]*regexp.Regexp, len(Crawlers))
    for i, crawler := range Crawlers {
        regexps[i] = regexp.MustCompile(crawler.Pattern)
    }
    return regexps
}()

// IsCrawler reports whether the User Agent string matches any of the crawler patterns.
func IsCrawler(userAgent string) bool {
    for _, re := range regexps {
        if re.MatchString(userAgent) {
            return true
        }
    }
    return false
}

// MatchingCrawlers finds all crawlers matching the User Agent and returns the list of their indices in Crawlers.
func MatchingCrawlers(userAgent string) []int {
    indices := []int{}
    for i, re := range regexps {
        if re.MatchString(userAgent) {
            indices = append(indices, i)
        }
    }
    return indices
}
@@ -0,0 +1,121 @@
package agents

import (
    "encoding/json"
    "fmt"
    "net/http"
    "testing"
)

func contains(list []int, value int) bool {
    for _, elem := range list {
        if elem == value {
            return true
        }
    }
    return false
}

func TestPatterns(t *testing.T) {
    // Loading all crawlers with go:embed;
    // some validation happens in UnmarshalJSON.
    allCrawlers := Crawlers

    // There are at least 10 crawlers.
    if len(allCrawlers) < 10 {
        t.Errorf("Number of crawlers must be at least 10, got %d.", len(allCrawlers))
    }

    if IsCrawler(browserUA) {
        t.Errorf("Browser UA %q was detected as a crawler.", browserUA)
    }
    if len(MatchingCrawlers(browserUA)) != 0 {
        t.Errorf("MatchingCrawlers found crawlers matching browser UA %q.", browserUA)
    }

    for i, crawler := range allCrawlers {
        t.Run(crawler.Pattern, func(t *testing.T) {
            fmt.Println(crawler.Pattern)

            for _, instance := range crawler.Instances {
                if !IsCrawler(instance) {
                    t.Errorf("Instance %q is not detected as a crawler.", instance)
                }
                hits := MatchingCrawlers(instance)
                if !contains(hits, i) {
                    t.Errorf("Crawler with index %d (pattern %q) is not in the list returned by MatchingCrawlers(%q): %v.", i, crawler.Pattern, instance, hits)
                }
            }
        })
    }
}

func TestFalseNegatives(t *testing.T) {
    const browsersURL = "https://raw.githubusercontent.com/microlinkhq/top-user-agents/master/src/index.json"
    resp, err := http.Get(browsersURL)
    if err != nil {
        t.Fatalf("Failed to fetch the list of browser User Agents from %s: %v.", browsersURL, err)
    }

    t.Cleanup(func() {
        if err := resp.Body.Close(); err != nil {
            t.Fatal(err)
        }
    })

    var browsers []string
    if err := json.NewDecoder(resp.Body).Decode(&browsers); err != nil {
        t.Fatalf("Failed to parse the list of browser User Agents: %v.", err)
    }

    for _, userAgent := range browsers {
        if IsCrawler(userAgent) {
            t.Errorf("Browser User Agent %q is recognized as a crawler.", userAgent)
        }
        indices := MatchingCrawlers(userAgent)
        if len(indices) != 0 {
            t.Errorf("Browser User Agent %q matches with crawlers %v.", userAgent, indices)
        }
    }
}

const (
    crawlerUA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36 Google (+https://developers.google.com/+/web/snippet/"
    browserUA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) obsidian/1.5.3 Chrome/114.0.5735.289 Electron/25.8.1 Safari/537.36"
)

func BenchmarkIsCrawlerPositive(b *testing.B) {
    b.SetBytes(int64(len(crawlerUA)))
    for n := 0; n < b.N; n++ {
        if !IsCrawler(crawlerUA) {
            b.Fail()
        }
    }
}

func BenchmarkMatchingCrawlersPositive(b *testing.B) {
    b.SetBytes(int64(len(crawlerUA)))
    for n := 0; n < b.N; n++ {
        if len(MatchingCrawlers(crawlerUA)) == 0 {
            b.Fail()
        }
    }
}

func BenchmarkIsCrawlerNegative(b *testing.B) {
    b.SetBytes(int64(len(browserUA)))
    for n := 0; n < b.N; n++ {
        if IsCrawler(browserUA) {
            b.Fail()
        }
    }
}

func BenchmarkMatchingCrawlersNegative(b *testing.B) {
    b.SetBytes(int64(len(browserUA)))
    for n := 0; n < b.N; n++ {
        if len(MatchingCrawlers(browserUA)) != 0 {
            b.Fail()
        }
    }
}
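Because each benchmark calls b.SetBytes with the length of the input User Agent, go test -bench=. reports throughput in MB/s alongside ns/op, which makes the positive and negative matching paths easy to compare. Note also that TestFalseNegatives needs outbound network access to fetch the microlinkhq list, so the go test step in CI exercises the full pattern set against real browser User Agents on every run.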