diff --git a/README.md b/README.md index 7f96916..f310034 100644 --- a/README.md +++ b/README.md @@ -58,6 +58,16 @@ func main() { // https://www.bbc.com => https://archive.li/HjqQV ``` +### Access Tor Hidden Service + +[archive.today](https://archive.today) provides a [Tor Hidden Service](http://archivecaslytosk.onion/) for saving webpages. The program prefers to access the +Tor Hidden Service, and falls back to the regular website if the Tor Hidden Service is unavailable. + +By default, the program dials a proxy using the Tor SOCKS port at `127.0.0.1:9050`; +use `TOR_HOST` and `TOR_SOCKS_PORT` to specify a different host and port. + +If dialing the SOCKS proxy fails, it looks up the tor executable file and starts it to dial the proxy. + ## FAQ ### archive.today is unavailable? diff --git a/go.mod b/go.mod index b81c08b..3197e96 100644 --- a/go.mod +++ b/go.mod @@ -4,8 +4,12 @@ go 1.15 require ( github.com/PuerkitoBio/goquery v1.6.1 + github.com/andybalholm/cascadia v1.2.0 // indirect github.com/cretz/bine v0.1.0 github.com/stretchr/testify v1.7.0 // indirect - github.com/wabarc/helper v0.0.0-20210127120855-10af37cc2616 + github.com/wabarc/helper v0.0.0-20210407153720-1bfe98b427fe github.com/wabarc/logger v0.0.0-20210417045349-d0d82e8e99ee + golang.org/x/crypto v0.0.0-20210415154028-4f45737414dc // indirect + golang.org/x/net v0.0.0-20210415231046-e915ea6b2b7d + golang.org/x/sys v0.0.0-20210415045647-66c3f260301c // indirect ) diff --git a/go.sum b/go.sum index 17354fe..7846d94 100644 --- a/go.sum +++ b/go.sum @@ -1,29 +1,51 @@ github.com/PuerkitoBio/goquery v1.6.1 h1:FgjbQZKl5HTmcn4sKBgvx8vv63nhyhIpv7lJpFGCWpk= github.com/PuerkitoBio/goquery v1.6.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc= -github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5zzsLTo= github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y= +github.com/andybalholm/cascadia v1.2.0 h1:vuRCkM5Ozh/BfmsaTm26kbjm0mIOM3yS5Ek/F5h18aE= +github.com/andybalholm/cascadia v1.2.0/go.mod 
h1:YCyR8vOZT9aZ1CHEd8ap0gMVm2aFgxBp0T0eFw1RUQY= github.com/cretz/bine v0.1.0 h1:1/fvhLE+fk0bPzjdO5Ci+0ComYxEMuB1JhM4X5skT3g= github.com/cretz/bine v0.1.0/go.mod h1:6PF6fWAvYtwjRGkAuDEJeWNOv3a2hUouSP/yRYXmvHw= github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI= +github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= +github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE= +github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/rogpeppe/go-internal v1.5.2/go.mod h1:xXDCJY+GAPziupqXw64V24skbSoqbTEfhy4qGm1nDQc= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY= github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= -github.com/wabarc/helper v0.0.0-20210127120855-10af37cc2616 h1:wZ5HtpmZAVUq0Im5Sm92ycJrTeLJk5lB/Kvh55Rd+Ps= -github.com/wabarc/helper v0.0.0-20210127120855-10af37cc2616/go.mod h1:N9P4r7Rn46p4nkWtXV6ztN3p5ACVnp++bgfwjTqSxQ8= +github.com/wabarc/helper v0.0.0-20210407153720-1bfe98b427fe h1:V9yz2vQlSVLs51nlo0DAeETFOE57OvlYm98X1LKJA6U= +github.com/wabarc/helper v0.0.0-20210407153720-1bfe98b427fe/go.mod h1:TuTZtoiOu984UWOf7FfX58JllKMjq7FCz701kB5W88E= github.com/wabarc/logger v0.0.0-20210417045349-d0d82e8e99ee h1:MMIp++7eem2CI1jIYDoPByMwXeZAjsFo2ciBNtvhB80= github.com/wabarc/logger v0.0.0-20210417045349-d0d82e8e99ee/go.mod h1:4uYr9fnQaQoDk1ttTzLnSB3lZm3i/vrJwN8EZIB2YuI= 
-golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2 h1:VklqNMn3ovrHsnt90PveolxSbWFaJdECFbxSq0Mqo2M= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20210415154028-4f45737414dc h1:+q90ECDSAQirdykUN6sPEiBXBsp8Csjcca8Oy7bgLTA= +golang.org/x/crypto v0.0.0-20210415154028-4f45737414dc/go.mod h1:T9bdIzuCu7OtxOm1hfPfRQxPLYneinmdGuTeoZ9dtd4= golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= -golang.org/x/net v0.0.0-20200202094626-16171245cfb2 h1:CCH4IOTTfewWjGOlSp+zGcjutRKlBEZQ6wTn8ozI/nI= golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a h1:1BGLXjeY4akVXGgbC9HugT3Jv3hCI0z56oJR5vAMgBU= +golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20210415231046-e915ea6b2b7d h1:BgJvlyh+UqCUaPlscHJ+PN8GcpfrFdr7NHjd1JL0+Gs= +golang.org/x/net v0.0.0-20210415231046-e915ea6b2b7d/go.mod h1:9tjilg8BloeKEkVJvy7fQ90B1CfIiPueXVOjqfkSzI8= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210330210617-4fbd30eecc44/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210415045647-66c3f260301c h1:6L+uOeS3OQt/f4eFHXZcTxeZrGCuz+CLElgEBjbcTA4= +golang.org/x/sys v0.0.0-20210415045647-66c3f260301c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= -gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= 
+golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo= +gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c h1:dUUwHk2QECo/6vqA44rthZ8ie2QXMNeKRTHCNY2nXvo= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +mvdan.cc/xurls/v2 v2.2.0 h1:NSZPykBXJFCetGZykLAxaL6SIpvbVy/UFEniIfHAa8A= +mvdan.cc/xurls/v2 v2.2.0/go.mod h1:EV1RMtya9D6G5DMYPGD8zTQzaHet6Jh8gFlRgGRJeO8= diff --git a/is.go b/is.go index 87436cd..9ec5219 100644 --- a/is.go +++ b/is.go @@ -55,6 +55,12 @@ var ( } ) +func init() { + if os.Getenv("DEBUG") != "" { + logger.EnableDebug() + } +} + // Wayback is the handle of saving webpages to archive.is func (wbrc *Archiver) Wayback(links []string) (map[string]string, error) { collects, results := make(map[string]string), make(map[string]string) @@ -66,11 +72,16 @@ func (wbrc *Archiver) Wayback(links []string) (map[string]string, error) { collects[link] = link } - torClient, tor, err := newTorClient() + done := make(chan bool, 1) + torClient, err := newTorClient(done) if err != nil { logger.Error("%v", err) - } else { - defer tor.Close() + } + + is := &IS{ + wbrc: wbrc, + httpClient: &http.Client{Timeout: timeout, CheckRedirect: noRedirect}, + torClient: torClient, } ch := make(chan string, 
len(collects)) @@ -81,13 +92,8 @@ func (wbrc *Archiver) Wayback(links []string) (map[string]string, error) { for _, link := range collects { wg.Add(1) go func(link string) { - is := &IS{ - wbrc: wbrc, - httpClient: &http.Client{Timeout: timeout, CheckRedirect: noRedirect}, - torClient: torClient, - submitid: "", - } mu.Lock() + is.submitid = "" is.archive(link, ch) results[link] = strings.Replace(<-ch, onion, "archive.today", 1) mu.Unlock() @@ -96,6 +102,9 @@ func (wbrc *Archiver) Wayback(links []string) (map[string]string, error) { } wg.Wait() + // Close tor connection + done <- true + if len(results) == 0 { return results, fmt.Errorf("No results") } @@ -115,11 +124,16 @@ func (wbrc *Archiver) Playback(links []string) (map[string]string, error) { return results, fmt.Errorf("No found URL") } - torClient, tor, err := newTorClient() + done := make(chan bool, 1) + torClient, err := newTorClient(done) if err != nil { logger.Error("%v", err) - } else { - defer tor.Close() + } + + is := &IS{ + wbrc: wbrc, + httpClient: &http.Client{Timeout: timeout, CheckRedirect: noRedirect}, + torClient: torClient, } ch := make(chan string, len(collects)) @@ -130,13 +144,8 @@ func (wbrc *Archiver) Playback(links []string) (map[string]string, error) { for _, link := range collects { wg.Add(1) go func(link string) { - is := &IS{ - wbrc: wbrc, - httpClient: &http.Client{Timeout: timeout, CheckRedirect: noRedirect}, - torClient: torClient, - submitid: "", - } mu.Lock() + is.submitid = "" is.search(link, ch) results[link] = strings.Replace(<-ch, onion, "archive.today", 1) mu.Unlock() @@ -195,6 +204,15 @@ func (is *IS) archive(uri string, ch chan<- string) { return } + // When use anyway parameter. 
+ refresh := resp.Header.Get("Refresh") + if len(refresh) > 0 { + r := strings.Split(refresh, ";url=") + if len(r) == 2 { + ch <- r[1] + return + } + } loc := resp.Header.Get("location") if len(loc) > 2 { ch <- loc @@ -206,15 +224,6 @@ func (is *IS) archive(uri string, ch chan<- string) { ch <- final return } - // When use anyway parameter. - refresh := resp.Header.Get("refresh") - if len(refresh) > 0 { - r := strings.Split(refresh, ";url=") - if len(r) == 2 { - ch <- r[1] - return - } - } ch <- fmt.Sprintf("%s/timegate/%s", domain, uri) } diff --git a/tor.go b/tor.go index 7d2df56..813047a 100644 --- a/tor.go +++ b/tor.go @@ -4,7 +4,7 @@ import ( "context" "crypto/tls" "fmt" - // "net" + "net" "net/http" "os" "os/exec" @@ -12,43 +12,68 @@ import ( "time" "github.com/cretz/bine/tor" - // "golang.org/x/net/proxy" + "github.com/wabarc/logger" + "golang.org/x/net/proxy" ) -func newTorClient() (*http.Client, *tor.Tor, error) { - // Lookup tor executable file - if _, err := exec.LookPath("tor"); err != nil { - return nil, nil, fmt.Errorf("%w", err) - } +func newTorClient(done <-chan bool) (*http.Client, error) { + var dialer proxy.ContextDialer + if useProxy() { + // Create a socks5 dialer + pxy, err := proxy.SOCKS5("tcp", "127.0.0.1:9050", nil, proxy.Direct) + if err != nil { + return nil, fmt.Errorf("Can't connect to the proxy: %w", err) + } - // Start tor with default config - startConf := &tor.StartConf{TempDataDirBase: os.TempDir()} - t, err := tor.Start(nil, startConf) - if err != nil { - return nil, nil, fmt.Errorf("Make connection failed: %w", err) - } - // defer t.Close() + dialer = pxy.(interface { + DialContext(ctx context.Context, network, addr string) (net.Conn, error) + }) + } else { + // Lookup tor executable file + if _, err := exec.LookPath("tor"); err != nil { + return nil, fmt.Errorf("%w", err) + } - // Wait at most a minute to start network and get - dialCtx, dialCancel := context.WithTimeout(context.Background(), time.Minute) - defer dialCancel() + 
// Start tor with default config + startConf := &tor.StartConf{TempDataDirBase: os.TempDir()} + t, err := tor.Start(nil, startConf) + if err != nil { + return nil, fmt.Errorf("Make connection failed: %w", err) + } + // defer t.Close() - // Make connection - dialer, err := t.Dialer(dialCtx, nil) - if err != nil { - t.Close() - return nil, nil, fmt.Errorf("Make connection failed: %w", err) - } + // Wait at most a minute to start network and get + dialCtx, dialCancel := context.WithTimeout(context.Background(), time.Minute) + defer dialCancel() - // Create a socks5 dialer - // pxy, err := proxy.SOCKS5("tcp", "127.0.0.1:9050", nil, proxy.Direct) - // if err != nil { - // return nil, fmt.Errorf("Can't connect to the proxy: %w", err) - // } + // Make connection + dialer, err = t.Dialer(dialCtx, nil) + if err != nil { + t.Close() + return nil, fmt.Errorf("Make connection failed: %w", err) + } - // dialer := pxy.(interface { - // DialContext(ctx context.Context, network, addr string) (net.Conn, error) - // }) + go func() { + // Auto close tor client after 10 min + tick := time.NewTicker(10 * time.Minute) + for { + select { + case <-done: + logger.Debug("Closed tor client") + tick.Stop() + t.Close() + return + case <-tick.C: + logger.Debug("Closed tor client, timeout") + tick.Stop() + t.Close() + return + default: + logger.Debug("Waiting for close tor client") + } + } + }() + } return &http.Client{ Timeout: timeout, @@ -65,5 +90,29 @@ func newTorClient() (*http.Client, *tor.Tor, error) { InsecureSkipVerify: true, }, }, - }, t, nil + }, nil +} + +func useProxy() bool { + host := os.Getenv("TOR_HOST") + port := os.Getenv("TOR_SOCKS_PORT") + if host == "" { + host = "127.0.0.1" + } + if port == "" { + port = "9050" + } + + conn, err := net.DialTimeout("tcp", net.JoinHostPort(host, port), time.Second) + if err != nil { + logger.Debug("Try to connect tor proxy failed: %v", err) + return false + } + if conn != nil { + conn.Close() + logger.Debug("Connected: %v", 
net.JoinHostPort(host, port)) + return true + } + + return false }