Skip to content

Commit

Permalink
Merge pull request #186 from adamdecaf/topn-domain-whitelists
Browse files Browse the repository at this point in the history
whitelist/gen: Support reading "top n domains" csv files
  • Loading branch information
adamdecaf authored Mar 11, 2018
2 parents f18edf9 + bc4788f commit 3fda79c
Show file tree
Hide file tree
Showing 4 changed files with 94 additions and 24 deletions.
61 changes: 54 additions & 7 deletions pkg/whitelist/gen/file.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,10 @@ package gen

import (
"bufio"
"bytes"
"compress/gzip"
"fmt"
"io"
"net/url"
"os"
"path/filepath"
Expand All @@ -25,6 +28,10 @@ import (
"github.com/adamdecaf/cert-manage/pkg/file"
)

var (
gzipHeader = []byte{0x1f, 0x8b}
)

// FromFile reads a text file and grabs urls separated by newlines
func FromFile(path string) ([]*url.URL, error) {
path, err := filepath.Abs(path)
Expand All @@ -41,15 +48,55 @@ func FromFile(path string) ([]*url.URL, error) {
return nil, err
}
var res []*url.URL
rdr := bufio.NewScanner(fd)
rdr := bufio.NewScanner(decodeIfGzipped(fd))

for rdr.Scan() {
line := strings.TrimSpace(rdr.Text())
if line != "" {
u, err := url.Parse(line)
if err == nil && u != nil {
res = append(res, u)
}
if u := findUrlInLine(rdr.Text()); u != nil {
res = append(res, u)
}
}
return res, nil
}

// decodeIfGzipped attempts to wrap the bufio.Reader if the header contains the gzip
// magic header, otherwise the original reader is returned.
func decodeIfGzipped(r io.Reader) io.Reader {
rdr := bufio.NewReader(r)
bs, err := rdr.Peek(2)
if err != nil {
// Probably not a good thing we error'd, probably io.EOF or something
return r
}
if bytes.Equal(bs, gzipHeader) {
rdr, err := gzip.NewReader(rdr)
if err != nil {
return r // don't wrap reader in gzip
}
return rdr
}
return rdr
}

// findUrlInLine attempts to find the first url embedded in a line
// of plain text.
//
// The line may come from a file containing bare URLs, or from a
// "top n domains" CSV produced by a service like alexa or cisco,
// e.g. "1,google.com" — in which case the domain (not the rank) is
// returned.
//
// Returns nil when no url can be found in the line.
func findUrlInLine(line string) *url.URL {
	// Split on , -- usually from the "top n domains" files
	for _, part := range strings.Split(line, ",") {
		part = strings.TrimSpace(part)
		if part == "" {
			continue
		}
		// Skip pure-numeric fields: url.Parse("1") succeeds (a relative
		// URL with path "1"), so without this check the rank column of a
		// "top n domains" file would be returned instead of the domain.
		if strings.Trim(part, "0123456789") == "" {
			continue
		}
		// return whatever parses as a URL
		if u, err := url.Parse(part); err == nil {
			return u
		}
	}
	return nil
}
57 changes: 40 additions & 17 deletions pkg/whitelist/gen/file_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,25 +20,48 @@ import (
)

func TestGen_fromFile(t *testing.T) {
urls, err := FromFile("../../../testdata/file-with-urls")
if err != nil {
t.Fatal(err)
}
if len(urls) != 3 {
t.Errorf("got %d urls", len(urls))
}
t.Parallel()

var ss []string
for i := range urls {
ss = append(ss, urls[i].String())
cases := []struct {
path string
count int
answer []string
}{
{
path: "../../../testdata/file-with-urls",
count: 3,
answer: []string{
"https://google.com",
"https://yahoo.com",
"https://bing.com",
},
},
{
path: "../../../testdata/alexa-top-1m.csv.gz",
count: 1e6,
},
{
path: "../../../testdata/cisco-top-1m.csv.gz",
count: 1e6,
},
}

ans := []string{
"https://google.com",
"https://yahoo.com",
"https://bing.com",
}
if !reflect.DeepEqual(ss, ans) {
t.Errorf("got %q", ss)
for i := range cases {
urls, err := FromFile(cases[i].path)
if err != nil {
t.Fatal(err)
}
if len(urls) != cases[i].count {
t.Errorf("%s got %d urls", cases[i].path, len(urls))
}
if cases[i].answer != nil {
var ss []string
for i := range urls {
ss = append(ss, urls[i].String())
}
if !reflect.DeepEqual(ss, cases[i].answer) {
t.Errorf("%s got %q", cases[i].path, ss)
}
}
}
}
Binary file added testdata/alexa-top-1m.csv.gz
Binary file not shown.
Binary file added testdata/cisco-top-1m.csv.gz
Binary file not shown.

0 comments on commit 3fda79c

Please sign in to comment.