diff --git a/README.md b/README.md index 697ddc9..868c3a6 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ Quickly scrape Google Search Results. -## Example +## Example Search ```go import "fmt" @@ -50,6 +50,27 @@ func main() { } ``` +## Example RelatedSearch + +```go +import "fmt" +import "github.com/rocketlaunchr/google-search" + +func main() { + + _, topics, _ := RelatedSearch(context.Background(), "Google", opt) + fmt.Printf("%#v\n", topics) + +} +``` + +## Results: + +```go +[]string{"www.google.com search", "google account", "google mail", "google play", "google map", "google sign in", "google chrome", "google lens"} +``` + + ## :warning: Warning The implementation relies on Google's search page DOM being constant. From time to time, Google changes their DOM and thus breaks the implementation. @@ -91,7 +112,9 @@ Also note, that if you call this function too quickly, Google detects that it is ## Credits -Special thanks to [Edmund Martin](https://edmundmartin.com/scraping-google-with-golang/). +[dirkh24](https://github.com/dirkh24/google-search) for related search. + +[Edmund Martin](https://edmundmartin.com/scraping-google-with-golang/). Other useful packages diff --git a/example_test.go b/example_test.go index 8dad19f..c01ce2b 100644 --- a/example_test.go +++ b/example_test.go @@ -3,6 +3,9 @@ package googlesearch import ( "fmt" "strings" + "time" + + "golang.org/x/time/rate" ) func ExampleSearch() { @@ -59,3 +62,62 @@ func ExampleUserAgent() { // Output: Australia Wide First Aid (https://www.australiawidefirstaid.com.au/) found in the serp } + +/* +Example of how to set a Rate Limit +*/ +func ExampleRateLimit() { + + ctx := context.Background() + + RateLimit.SetLimit(rate.Every(5 * time.Second)) // Interval + RateLimit.SetBurst(1) // Requests per Interval + + err := RateLimit.Wait(ctx) + if err != nil { + fmt.Print(err.Error()) + } + + for i := 0; i < 1; i++ { + serp, err := Search(ctx, "Australia Wide First Aid") + + if err != nil { + fmt.Print(err.Error()) + } + + if len(serp) > 0 { + fmt.Println("Resaults found") + } + + } + + // Output: + // Resaults found +} + +/* +Example of how to get the Related Searches +*/ + +func ExampleRelatedSearch() { + + opt := SearchOptions{ + CountryCode: "au", + } + + _, topics, err := RelatedSearch(context.Background(), "Google", opt) + + if err != nil { + fmt.Print(err.Error()) + } + + for _, topic := range topics { + if strings.Contains(topic, "google") { + fmt.Println("Related search found") + break + } + } + + // Output: Related search found + +} diff --git a/search.go b/search.go index 2e509ef..797e1e5 100644 --- a/search.go +++ b/search.go @@ -30,8 +30,9 @@ type Result struct { Description string `json:"description"` } +var Topics []string + const stdGoogleBase = "https://www.google." -const defaultAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36" // GoogleDomains represents localized Google homepages. The 2 letter country code is based on ISO 3166-1 alpha-2. // @@ -271,12 +272,25 @@ type SearchOptions struct { // Search returns a list of search results from Google. func Search(ctx context.Context, searchTerm string, opts ...SearchOptions) ([]Result, error) { + + resaults, _, err := search(ctx, searchTerm, opts...) + return resaults, err +} + +// RelatedSearch returns a list of related searches from Google. +func RelatedSearch(ctx context.Context, searchTerm string, opts ...SearchOptions) ([]Result, []string, error) { + resaults, related, err := search(ctx, searchTerm, opts...) + return resaults, related, err +} + +// Search returns a list of search results from Google. +func search(ctx context.Context, searchTerm string, opts ...SearchOptions) ([]Result, []string, error) { if ctx == nil { ctx = context.Background() } if err := RateLimit.Wait(ctx); err != nil { - return nil, err + return nil, nil, err } c := colly.NewCollector(colly.MaxDepth(1)) @@ -335,7 +349,7 @@ func Search(ctx context.Context, searchTerm string, opts ...SearchOptions) ([]Re linkHref, _ := sel.Find("a").Attr("href") linkText := strings.TrimSpace(linkHref) - titleText := strings.TrimSpace(sel.Find("div > div > div > a > h3").Text()) + titleText := strings.TrimSpace(sel.Find("div > div > div > div > span:first-child > a > h3").Text()) descText := strings.TrimSpace(sel.Find("div > div > div > div:first-child > span:first-child").Text()) rank += 1 @@ -372,12 +386,44 @@ func Search(ctx context.Context, searchTerm string, opts ...SearchOptions) ([]Re } }) + c.OnHTML("div.AJLUJb:nth-child(1) > div:nth-child(1) > a:nth-child(1) > div:nth-child(2)", func(h *colly.HTMLElement) { + Topics = append(Topics, h.Text) + }) + + c.OnHTML("div.AJLUJb:nth-child(1) > div:nth-child(2) > a:nth-child(1) > div:nth-child(2)", func(h *colly.HTMLElement) { + Topics = append(Topics, h.Text) + }) + + c.OnHTML("div.AJLUJb:nth-child(1) > div:nth-child(3) > a:nth-child(1) > div:nth-child(2)", func(h *colly.HTMLElement) { + Topics = append(Topics, h.Text) + }) + + c.OnHTML("div.AJLUJb:nth-child(1) > div:nth-child(4) > a:nth-child(1) > div:nth-child(2)", func(h *colly.HTMLElement) { + Topics = append(Topics, h.Text) + }) + + c.OnHTML("div.AJLUJb:nth-child(2) > div:nth-child(1) > a:nth-child(1) > div:nth-child(2)", func(h *colly.HTMLElement) { + Topics = append(Topics, h.Text) + }) + + c.OnHTML("div.AJLUJb:nth-child(2) > div:nth-child(2) > a:nth-child(1) > div:nth-child(2)", func(h *colly.HTMLElement) { + Topics = append(Topics, h.Text) + }) + + c.OnHTML("div.AJLUJb:nth-child(2) > div:nth-child(3) > a:nth-child(1) > div:nth-child(2)", func(h *colly.HTMLElement) { + Topics = append(Topics, h.Text) + }) + + c.OnHTML("div.AJLUJb:nth-child(2) > div:nth-child(4) > a:nth-child(1) > div:nth-child(2)", func(h *colly.HTMLElement) { + Topics = append(Topics, h.Text) + }) + url := buildUrl(searchTerm, opts[0].CountryCode, lc, limit, opts[0].Start) if opts[0].ProxyAddr != "" { rp, err := proxy.RoundRobinProxySwitcher(opts[0].ProxyAddr) if err != nil { - return nil, err + return nil, nil, err } c.SetProxyFunc(rp) } @@ -387,17 +433,17 @@ func Search(ctx context.Context, searchTerm string, opts ...SearchOptions) ([]Re if rErr != nil { if strings.Contains(rErr.Error(), "Too Many Requests") { - return nil, ErrBlocked + return nil, nil, ErrBlocked } - return nil, rErr + return nil, nil, rErr } // Reduce results to max limit if opts[0].Limit != 0 && len(results) > opts[0].Limit { - return results[:opts[0].Limit], nil + return results[:opts[0].Limit], Topics, nil } - return results, nil + return results, Topics, nil } func getStart(uri string) int { diff --git a/search_test.go b/search_test.go index f56a030..71c99cb 100644 --- a/search_test.go +++ b/search_test.go @@ -49,3 +49,161 @@ func TestSearch(t *testing.T) { t.Errorf("google dom changed") } } + +func TestRelatedSearch(t *testing.T) { + + q := "Hello World" + + opts := SearchOptions{ + Limit: 20, + } + + returnLinks, related, err := RelatedSearch(ctx, q, opts) + if err != nil { + t.Errorf("something went wrong: %v", err) + return + } + + if len(returnLinks) == 0 { + t.Errorf("no results returned: %v", returnLinks) + } + + noURL := 0 + noTitle := 0 + noDesc := 0 + + for _, res := range returnLinks { + if res.URL == "" { + noURL++ + } + + if res.Title == "" { + noTitle++ + } + + if res.Description == "" { + noDesc++ + } + } + + if len(related) < 1 { + t.Errorf("google dom changed") + } + + if noURL == len(returnLinks) || noTitle == len(returnLinks) || noDesc == len(returnLinks) { + t.Errorf("google dom changed") + } +} + +func Test_base(t *testing.T) { + tests := []struct { + name string + url string + want string + }{ + { + name: "Full url", + url: "https://www.google.com.au/search?q=", + want: "https://www.google.com.au/search?q=", + }, + { + name: "No base", + url: "com.au/search?q=", + want: "https://www.google.com.au/search?q=", + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := base(tt.url); got != tt.want { + t.Errorf("base() = %v, want %v", got, tt.want) + } + }) + } +} + +func Test_buildUrl(t *testing.T) { + + tests := []struct { + name string + searchTerm string + countryCode string + languageCode string + limit int + start int + want string + }{ + + { + name: "With Spaces", + searchTerm: "Test Example", + countryCode: "au", + languageCode: "en", + limit: 0, + start: 0, + want: "https://www.google.com.au/search?q=Test+Example&hl=en", + }, + + { + name: "Start at 3", + searchTerm: "Test Example", + countryCode: "au", + languageCode: "en", + limit: 0, + start: 3, + want: "https://www.google.com.au/search?q=Test+Example&hl=en&start=3", + }, + + { + name: "Non countryCode", + searchTerm: "Test Example", + countryCode: "xx", + languageCode: "en", + limit: 0, + start: 0, + want: "https://www.google.com/search?q=Test+Example&hl=en", + }, + { + name: "Non countryCode Start at 3", + searchTerm: "Test Example", + countryCode: "xx", + languageCode: "en", + limit: 0, + start: 3, + want: "https://www.google.com/search?q=Test+Example&hl=en&start=3", + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := buildUrl(tt.searchTerm, tt.countryCode, tt.languageCode, tt.limit, tt.start); got != tt.want { + t.Errorf("buildUrl() = %v, want %v", got, tt.want) + } + }) + } +} + +func Test_getStart(t *testing.T) { + + tests := []struct { + name string + uri string + want int + }{ + { + name: "No start set", + uri: "https://www.google.com/search?q=Test+Example&hl=en", + want: 0, + }, + { + name: "Start at t", + uri: "https://www.google.com/search?q=Test+Example&hl=en&start=3", + want: 3, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := getStart(tt.uri); got != tt.want { + t.Errorf("getStart() = %v, want %v", got, tt.want) + } + }) + } +} diff --git a/useragent.go b/useragent.go new file mode 100644 index 0000000..d97b9d2 --- /dev/null +++ b/useragent.go @@ -0,0 +1,7 @@ +// This file is programmatically generated +// Do not manualy update +// go run useragent/main.go + +package googlesearch + +const defaultAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36" diff --git a/useragent/main.go b/useragent/main.go new file mode 100644 index 0000000..668b4cc --- /dev/null +++ b/useragent/main.go @@ -0,0 +1,61 @@ +package main + +import ( + "bytes" + "go/format" + "html/template" + "log" + "os" + "strings" + + "github.com/gocolly/colly/v2" +) + +func main() { + + var userAgent []string + collector := colly.NewCollector( + colly.AllowedDomains("whatismybrowser.com", "www.whatismybrowser.com"), + ) + + collector.OnHTML("table > tbody > tr", func(h *colly.HTMLElement) { + h.ForEach("td:nth-child(2) > ul", func(_ int, el *colly.HTMLElement) { + h.ForEach("li", func(_ int, li *colly.HTMLElement) { + userAgent = append(userAgent, strings.TrimSpace(li.ChildText("span"))) + }) + }) + }) + + collector.Visit("https://www.whatismybrowser.com/guides/the-latest-user-agent/chrome") + + if len(userAgent[0]) > 0 { + + tmp := template.Must(template.New("const").Parse(` // This file is programmatically generated + // Do not manualy update + // go run useragent/main.go + + package googlesearch + +const defaultAgent = "{{.}}" +`)) + + f, err := os.Create("useragent.go") + if err != nil { + log.Panicln(err.Error()) + } + defer f.Close() + + var tpl bytes.Buffer + + err = tmp.Execute(&tpl, userAgent[0]) + if err != nil { + log.Panicln(err.Error()) + } + + formatted, err := format.Source(tpl.Bytes()) + if err != nil { + log.Fatal(err) + } + f.Write(formatted) + } +}