Commit

Add support for Golang, see https://pkg.go.dev/github.com/monperrus/crawler-user-agents (#348)

Co-authored-by: Martin Monperrus <[email protected]>
starius and monperrus authored Apr 5, 2024
1 parent 36ce640 commit 951462f
Showing 6 changed files with 269 additions and 1 deletion.
1 change: 1 addition & 0 deletions .github/workflows/ci-validation.yml
@@ -23,3 +23,4 @@ jobs:
- run: py.test -vv
- run: python3 validate.py
- run: php validate.php
- run: go test
35 changes: 34 additions & 1 deletion README.md
@@ -34,6 +34,40 @@ Each `pattern` is a regular expression. It should work out-of-the-box with your favorite regex library:
* JavaScript: `if (RegExp(entry.pattern).test(req.headers['user-agent'])) { ... }`
* PHP: add a slash before and after the pattern: `if (preg_match('/'.$entry['pattern'].'/', $_SERVER['HTTP_USER_AGENT'])): ...`
* Python: `if re.search(entry['pattern'], ua): ...`
* Go: use [this package](https://pkg.go.dev/github.com/monperrus/crawler-user-agents); it provides the global variable `Crawlers` (kept in sync with `crawler-user-agents.json`) and the functions `IsCrawler` and `MatchingCrawlers`.

Example Go program:

```go
package main

import (
"fmt"

"github.com/monperrus/crawler-user-agents"
)

func main() {
userAgent := "Mozilla/5.0 (compatible; Discordbot/2.0; +https://discordapp.com)"

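	// Check whether the User Agent matches any known crawler pattern.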
isCrawler := agents.IsCrawler(userAgent)
fmt.Println("isCrawler:", isCrawler)

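	// Find the indices in agents.Crawlers of all matching patterns.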
indices := agents.MatchingCrawlers(userAgent)
fmt.Println("crawlers' indices:", indices)
fmt.Println("crawler' URL:", agents.Crawlers[indices[0]].URL)
}
```

Output:

```
isCrawler: true
crawlers' indices: [237]
crawler's URL: https://discordapp.com
```
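
For server-side use, `IsCrawler` drops naturally into `net/http` middleware. Below is a minimal sketch; `blockCrawlers` and its responses are illustrative, not part of the package:

```go
package main

import (
	"fmt"
	"net/http"

	agents "github.com/monperrus/crawler-user-agents"
)

// blockCrawlers is an illustrative middleware that rejects any request
// whose User-Agent matches a known crawler pattern.
func blockCrawlers(next http.Handler) http.Handler {
	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if agents.IsCrawler(r.UserAgent()) {
			http.Error(w, "crawlers not allowed", http.StatusForbidden)
			return
		}
		next.ServeHTTP(w, r)
	})
}

func main() {
	hello := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		fmt.Fprintln(w, "hello")
	})
	http.Handle("/", blockCrawlers(hello))
	http.ListenAndServe(":8080", nil)
}
```

Since all patterns are compiled once at package load, the per-request cost is a single pass over the compiled regexps (see the benchmarks in `validate_test.go`).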

## Contributing

@@ -66,7 +100,6 @@ There are a few wrapper libraries that use this data to detect bots:
* [Voight-Kampff](https://github.com/biola/Voight-Kampff) (Ruby)
* [isbot](https://github.com/Hentioe/isbot) (Ruby)
* [crawlers](https://github.com/Olical/crawlers) (Clojure)
* [crawlerflagger](https://godoc.org/go.kelfa.io/kelfa/pkg/crawlerflagger) (Go)
* [isBot](https://github.com/omrilotan/isbot) (Node.JS)

Other systems for spotting robots, crawlers, and spiders that you may want to consider are:
3 changes: 3 additions & 0 deletions go.mod
@@ -0,0 +1,3 @@
module github.com/monperrus/crawler-user-agents

go 1.19
Empty file added go.sum
Empty file.
110 changes: 110 additions & 0 deletions validate.go
@@ -0,0 +1,110 @@
package agents

import (
_ "embed"
"encoding/json"
"fmt"
"regexp"
"time"
)

//go:embed crawler-user-agents.json
var crawlersJson []byte

// Crawler contains information about one crawler.
type Crawler struct {
	// Regexp matching the crawler's User Agent.
Pattern string `json:"pattern"`

// Discovery date.
AdditionDate time.Time `json:"addition_date"`

	// Official URL of the robot.
URL string `json:"url"`

// Examples of full User Agent strings.
Instances []string `json:"instances"`
}

// jsonCrawler is a private type used to convert addition_date from/to the string format used in JSON.
type jsonCrawler struct {
Pattern string `json:"pattern"`
AdditionDate string `json:"addition_date"`
URL string `json:"url"`
Instances []string `json:"instances"`
}

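// timeLayout is the addition_date format, expressed with Go's reference date (2006/01/02).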
const timeLayout = "2006/01/02"

func (c Crawler) MarshalJSON() ([]byte, error) {
jc := jsonCrawler{
Pattern: c.Pattern,
AdditionDate: c.AdditionDate.Format(timeLayout),
URL: c.URL,
Instances: c.Instances,
}
return json.Marshal(jc)
}

func (c *Crawler) UnmarshalJSON(b []byte) error {
var jc jsonCrawler
if err := json.Unmarshal(b, &jc); err != nil {
return err
}

c.Pattern = jc.Pattern
c.URL = jc.URL
c.Instances = jc.Instances

if c.Pattern == "" {
return fmt.Errorf("empty pattern in record %s", string(b))
}

if jc.AdditionDate != "" {
tim, err := time.ParseInLocation(timeLayout, jc.AdditionDate, time.UTC)
if err != nil {
return err
}
c.AdditionDate = tim
}

return nil
}

// Crawlers is the list of crawlers, built from the contents of crawler-user-agents.json.
var Crawlers = func() []Crawler {
var crawlers []Crawler
if err := json.Unmarshal(crawlersJson, &crawlers); err != nil {
panic(err)
}
return crawlers
}()

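// regexps holds one compiled regular expression per crawler pattern, built at package init.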
var regexps = func() []*regexp.Regexp {
regexps := make([]*regexp.Regexp, len(Crawlers))
for i, crawler := range Crawlers {
regexps[i] = regexp.MustCompile(crawler.Pattern)
}
return regexps
}()

// IsCrawler reports whether the User Agent string matches any crawler pattern.
func IsCrawler(userAgent string) bool {
for _, re := range regexps {
if re.MatchString(userAgent) {
return true
}
}
return false
}

// MatchingCrawlers finds all crawlers matching the User Agent and returns the list of their indices in Crawlers.
func MatchingCrawlers(userAgent string) []int {
indices := []int{}
for i, re := range regexps {
if re.MatchString(userAgent) {
indices = append(indices, i)
}
}
return indices
}
121 changes: 121 additions & 0 deletions validate_test.go
@@ -0,0 +1,121 @@
package agents

import (
"encoding/json"
"fmt"
"net/http"
"testing"
)

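// contains reports whether value is present in list.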
func contains(list []int, value int) bool {
for _, elem := range list {
if elem == value {
return true
}
}
return false
}

func TestPatterns(t *testing.T) {
	// Load all crawlers (embedded via go:embed);
	// some validation happens in UnmarshalJSON.
allCrawlers := Crawlers

// There are at least 10 crawlers.
if len(allCrawlers) < 10 {
t.Errorf("Number of crawlers must be at least 10, got %d.", len(allCrawlers))
}

if IsCrawler(browserUA) {
t.Errorf("Browser UA %q was detected as a crawler.", browserUA)
}
if len(MatchingCrawlers(browserUA)) != 0 {
t.Errorf("MatchingCrawlers found crawlers matching Browser UA %q.", browserUA)
}

for i, crawler := range allCrawlers {
t.Run(crawler.Pattern, func(t *testing.T) {
fmt.Println(crawler.Pattern)

for _, instance := range crawler.Instances {
if !IsCrawler(instance) {
t.Errorf("Instance %q is not detected as a crawler.", instance)
}
hits := MatchingCrawlers(instance)
if !contains(hits, i) {
t.Errorf("Crawler with index %d (pattern %q) is not in the list returned by MatchingCrawlers(%q): %v.", i, crawler.Pattern, instance, hits)
}
}
})
}
}

func TestFalseNegatives(t *testing.T) {
const browsersURL = "https://raw.githubusercontent.com/microlinkhq/top-user-agents/master/src/index.json"
resp, err := http.Get(browsersURL)
if err != nil {
t.Fatalf("Failed to fetch the list of browser User Agents from %s: %v.", browsersURL, err)
}

t.Cleanup(func() {
if err := resp.Body.Close(); err != nil {
t.Fatal(err)
}
})

var browsers []string
if err := json.NewDecoder(resp.Body).Decode(&browsers); err != nil {
t.Fatalf("Failed to parse the list of browser User Agents: %v.", err)
}

for _, userAgent := range browsers {
if IsCrawler(userAgent) {
t.Errorf("Browser User Agent %q is recognized as a crawler.", userAgent)
}
indices := MatchingCrawlers(userAgent)
if len(indices) != 0 {
t.Errorf("Browser User Agent %q matches with crawlers %v.", userAgent, indices)
}
}
}

const (
crawlerUA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36 Google (+https://developers.google.com/+/web/snippet/"
browserUA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) obsidian/1.5.3 Chrome/114.0.5735.289 Electron/25.8.1 Safari/537.36"
)

func BenchmarkIsCrawlerPositive(b *testing.B) {
b.SetBytes(int64(len(crawlerUA)))
for n := 0; n < b.N; n++ {
if !IsCrawler(crawlerUA) {
b.Fail()
}
}
}

func BenchmarkMatchingCrawlersPositive(b *testing.B) {
b.SetBytes(int64(len(crawlerUA)))
for n := 0; n < b.N; n++ {
if len(MatchingCrawlers(crawlerUA)) == 0 {
b.Fail()
}
}
}

func BenchmarkIsCrawlerNegative(b *testing.B) {
b.SetBytes(int64(len(browserUA)))
for n := 0; n < b.N; n++ {
if IsCrawler(browserUA) {
b.Fail()
}
}
}

func BenchmarkMatchingCrawlersNegative(b *testing.B) {
b.SetBytes(int64(len(browserUA)))
for n := 0; n < b.N; n++ {
if len(MatchingCrawlers(browserUA)) != 0 {
b.Fail()
}
}
}
