Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix Link dom and updated agent #26

Open
wants to merge 11 commits into
base: master
Choose a base branch
from
27 changes: 25 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

Quickly scrape Google Search Results.

## Example
## Example Search

```go
import "fmt"
Expand Down Expand Up @@ -50,6 +50,27 @@ func main() {
}
```

## Example RelatedSearch

```go
import "context"
import "fmt"
import "github.com/rocketlaunchr/google-search"

func main() {

	opt := googlesearch.SearchOptions{CountryCode: "au"}

	_, topics, _ := googlesearch.RelatedSearch(context.Background(), "Google", opt)
	fmt.Printf("%#v\n", topics)

}
```

## Results:

```go
[]string{"www.google.com search", "google account", "google mail", "google play", "google map", "google sign in", "google chrome", "google lens"}
```


## :warning: Warning

The implementation relies on Google's search page DOM being constant. From time to time, Google changes their DOM and thus breaks the implementation.
Expand Down Expand Up @@ -91,7 +112,9 @@ Also note, that if you call this function too quickly, Google detects that it is

## Credits

Special thanks to [Edmund Martin](https://edmundmartin.com/scraping-google-with-golang/).
[dirkh24](https://github.com/dirkh24/google-search) for related search.

[Edmund Martin](https://edmundmartin.com/scraping-google-with-golang/).


Other useful packages
Expand Down
62 changes: 62 additions & 0 deletions example_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
import (
	"context"
	"fmt"
	"strings"
	"time"

	"golang.org/x/time/rate"
)

func ExampleSearch() {
Expand Down Expand Up @@ -59,3 +62,62 @@
// Output: Australia Wide First Aid (https://www.australiawidefirstaid.com.au/) found in the serp

}

/*
Example of how to set a Rate Limit
*/
func ExampleRateLimit() {

ctx := context.Background()

Check failure on line 71 in example_test.go

View workflow job for this annotation

GitHub Actions / build

undefined: context

RateLimit.SetLimit(rate.Every(5 * time.Second)) // Interval
RateLimit.SetBurst(1) // Requests per Interval

err := RateLimit.Wait(ctx)
if err != nil {
fmt.Print(err.Error())
}

for i := 0; i < 1; i++ {
serp, err := Search(ctx, "Australia Wide First Aid")

if err != nil {
fmt.Print(err.Error())
}

if len(serp) > 0 {
fmt.Println("Resaults found")
}

}

// Output:
// Resaults found
}

/*
Example of how to get the Related Searches
*/

func ExampleRelatedSearch() {

opt := SearchOptions{
CountryCode: "au",
}

_, topics, err := RelatedSearch(context.Background(), "Google", opt)

Check failure on line 108 in example_test.go

View workflow job for this annotation

GitHub Actions / build

undefined: context

if err != nil {
fmt.Print(err.Error())
}

for _, topic := range topics {
if strings.Contains(topic, "google") {
fmt.Println("Related search found")
break
}
}

// Output: Related search found

}
62 changes: 54 additions & 8 deletions search.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,9 @@ type Result struct {
Description string `json:"description"`
}

// Topics collects the related-search phrases appended by the OnHTML callbacks
// registered in search, and is the slice search returns as its second result.
// NOTE(review): this is package-level state and no reset is visible in the
// shown code, so entries may accumulate across calls — confirm.
var Topics []string

// stdGoogleBase is the URL prefix "https://www.google." (scheme plus host up
// to the trailing dot). Presumably completed with a domain suffix — confirm
// against its users.
const stdGoogleBase = "https://www.google."
// defaultAgent is a desktop Chrome 113 user-agent string. Presumably sent as
// the default User-Agent header — confirm against its users.
const defaultAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"

// GoogleDomains represents localized Google homepages. The 2 letter country code is based on ISO 3166-1 alpha-2.
//
Expand Down Expand Up @@ -271,12 +272,25 @@ type SearchOptions struct {

// Search queries Google for searchTerm and returns the list of results.
// The related-search topics produced by the underlying search call are
// discarded; use RelatedSearch to receive them as well.
func Search(ctx context.Context, searchTerm string, opts ...SearchOptions) ([]Result, error) {
	found, _, err := search(ctx, searchTerm, opts...)
	return found, err
}

// RelatedSearch queries Google for searchTerm and returns the search results
// together with the related-search topics, plus any error reported by the
// underlying search call.
func RelatedSearch(ctx context.Context, searchTerm string, opts ...SearchOptions) ([]Result, []string, error) {
	return search(ctx, searchTerm, opts...)
}

// search runs the query and returns the list of results together with the related-search topics.
func search(ctx context.Context, searchTerm string, opts ...SearchOptions) ([]Result, []string, error) {
if ctx == nil {
ctx = context.Background()
}

if err := RateLimit.Wait(ctx); err != nil {
return nil, err
return nil, nil, err
}

c := colly.NewCollector(colly.MaxDepth(1))
Expand Down Expand Up @@ -335,7 +349,7 @@ func Search(ctx context.Context, searchTerm string, opts ...SearchOptions) ([]Re

linkHref, _ := sel.Find("a").Attr("href")
linkText := strings.TrimSpace(linkHref)
titleText := strings.TrimSpace(sel.Find("div > div > div > a > h3").Text())
titleText := strings.TrimSpace(sel.Find("div > div > div > div > span:first-child > a > h3").Text())
descText := strings.TrimSpace(sel.Find("div > div > div > div:first-child > span:first-child").Text())

rank += 1
Expand Down Expand Up @@ -372,12 +386,44 @@ func Search(ctx context.Context, searchTerm string, opts ...SearchOptions) ([]Re
}
})

c.OnHTML("div.AJLUJb:nth-child(1) > div:nth-child(1) > a:nth-child(1) > div:nth-child(2)", func(h *colly.HTMLElement) {
Topics = append(Topics, h.Text)
})

c.OnHTML("div.AJLUJb:nth-child(1) > div:nth-child(2) > a:nth-child(1) > div:nth-child(2)", func(h *colly.HTMLElement) {
Topics = append(Topics, h.Text)
})

c.OnHTML("div.AJLUJb:nth-child(1) > div:nth-child(3) > a:nth-child(1) > div:nth-child(2)", func(h *colly.HTMLElement) {
Topics = append(Topics, h.Text)
})

c.OnHTML("div.AJLUJb:nth-child(1) > div:nth-child(4) > a:nth-child(1) > div:nth-child(2)", func(h *colly.HTMLElement) {
Topics = append(Topics, h.Text)
})

c.OnHTML("div.AJLUJb:nth-child(2) > div:nth-child(1) > a:nth-child(1) > div:nth-child(2)", func(h *colly.HTMLElement) {
Topics = append(Topics, h.Text)
})

c.OnHTML("div.AJLUJb:nth-child(2) > div:nth-child(2) > a:nth-child(1) > div:nth-child(2)", func(h *colly.HTMLElement) {
Topics = append(Topics, h.Text)
})

c.OnHTML("div.AJLUJb:nth-child(2) > div:nth-child(3) > a:nth-child(1) > div:nth-child(2)", func(h *colly.HTMLElement) {
Topics = append(Topics, h.Text)
})

c.OnHTML("div.AJLUJb:nth-child(2) > div:nth-child(4) > a:nth-child(1) > div:nth-child(2)", func(h *colly.HTMLElement) {
Topics = append(Topics, h.Text)
})

url := buildUrl(searchTerm, opts[0].CountryCode, lc, limit, opts[0].Start)

if opts[0].ProxyAddr != "" {
rp, err := proxy.RoundRobinProxySwitcher(opts[0].ProxyAddr)
if err != nil {
return nil, err
return nil, nil, err
}
c.SetProxyFunc(rp)
}
Expand All @@ -387,17 +433,17 @@ func Search(ctx context.Context, searchTerm string, opts ...SearchOptions) ([]Re

if rErr != nil {
if strings.Contains(rErr.Error(), "Too Many Requests") {
return nil, ErrBlocked
return nil, nil, ErrBlocked
}
return nil, rErr
return nil, nil, rErr
}

// Reduce results to max limit
if opts[0].Limit != 0 && len(results) > opts[0].Limit {
return results[:opts[0].Limit], nil
return results[:opts[0].Limit], Topics, nil
}

return results, nil
return results, Topics, nil
}

func getStart(uri string) int {
Expand Down
158 changes: 158 additions & 0 deletions search_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,161 @@ func TestSearch(t *testing.T) {
t.Errorf("google dom changed")
}
}

// TestRelatedSearch runs a live RelatedSearch query and verifies that results
// and related searches come back, and that at least one result carries a URL,
// a title and a description. A failure of the field checks indicates that the
// scraped page structure no longer matches the selectors.
func TestRelatedSearch(t *testing.T) {

	q := "Hello World"

	opts := SearchOptions{
		Limit: 20,
	}

	returnLinks, related, err := RelatedSearch(ctx, q, opts)
	if err != nil {
		t.Fatalf("something went wrong: %v", err)
	}

	// Stop here when nothing came back: the per-field counters below would
	// all equal len(returnLinks) == 0 and report a misleading second failure.
	if len(returnLinks) == 0 {
		t.Fatalf("no results returned: %v", returnLinks)
	}

	noURL := 0
	noTitle := 0
	noDesc := 0

	for _, res := range returnLinks {
		if res.URL == "" {
			noURL++
		}

		if res.Title == "" {
			noTitle++
		}

		if res.Description == "" {
			noDesc++
		}
	}

	if len(related) < 1 {
		t.Errorf("google dom changed: no related searches returned")
	}

	// A field that is empty on every single result means its selector no
	// longer matches anything.
	if noURL == len(returnLinks) || noTitle == len(returnLinks) || noDesc == len(returnLinks) {
		t.Errorf("google dom changed")
	}
}

// Test_base checks that base returns a full Google search URL both when it is
// handed an already complete URL and when it is handed only the domain suffix.
func Test_base(t *testing.T) {
	type testCase struct {
		name string
		url  string
		want string
	}

	cases := []testCase{
		{name: "Full url", url: "https://www.google.com.au/search?q=", want: "https://www.google.com.au/search?q="},
		{name: "No base", url: "com.au/search?q=", want: "https://www.google.com.au/search?q="},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := base(tc.url)
			if got == tc.want {
				return
			}
			t.Errorf("base() = %v, want %v", got, tc.want)
		})
	}
}

// Test_buildUrl checks the URL produced by buildUrl for a known and an
// unknown country code, each with and without a start offset.
func Test_buildUrl(t *testing.T) {
	type testCase struct {
		name         string
		searchTerm   string
		countryCode  string
		languageCode string
		limit        int
		start        int
		want         string
	}

	cases := []testCase{
		{
			name:         "With Spaces",
			searchTerm:   "Test Example",
			countryCode:  "au",
			languageCode: "en",
			want:         "https://www.google.com.au/search?q=Test+Example&hl=en",
		},
		{
			name:         "Start at 3",
			searchTerm:   "Test Example",
			countryCode:  "au",
			languageCode: "en",
			start:        3,
			want:         "https://www.google.com.au/search?q=Test+Example&hl=en&start=3",
		},
		{
			name:         "Non countryCode",
			searchTerm:   "Test Example",
			countryCode:  "xx",
			languageCode: "en",
			want:         "https://www.google.com/search?q=Test+Example&hl=en",
		},
		{
			name:         "Non countryCode Start at 3",
			searchTerm:   "Test Example",
			countryCode:  "xx",
			languageCode: "en",
			start:        3,
			want:         "https://www.google.com/search?q=Test+Example&hl=en&start=3",
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := buildUrl(tc.searchTerm, tc.countryCode, tc.languageCode, tc.limit, tc.start)
			if got == tc.want {
				return
			}
			t.Errorf("buildUrl() = %v, want %v", got, tc.want)
		})
	}
}

// Test_getStart checks that getStart returns the value of the "start" query
// parameter of a search URL, and 0 when the parameter is absent.
func Test_getStart(t *testing.T) {

	tests := []struct {
		name string
		uri  string
		want int
	}{
		{
			name: "No start set",
			uri:  "https://www.google.com/search?q=Test+Example&hl=en",
			want: 0,
		},
		{
			// Label matches the start=3 value in the URI and the
			// equivalent case in Test_buildUrl.
			name: "Start at 3",
			uri:  "https://www.google.com/search?q=Test+Example&hl=en&start=3",
			want: 3,
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			if got := getStart(tt.uri); got != tt.want {
				t.Errorf("getStart() = %v, want %v", got, tt.want)
			}
		})
	}
}
Loading
Loading