Skip to content

Commit

Permalink
Refactor: accept single url
Browse files Browse the repository at this point in the history
  • Loading branch information
web-flow committed Jul 3, 2021
1 parent 2591836 commit 675b941
Show file tree
Hide file tree
Showing 6 changed files with 90 additions and 195 deletions.
38 changes: 30 additions & 8 deletions cmd/archive.is/is.go
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
package main

import (
"context"
"flag"
"fmt"
"net/url"
"os"
"sync"
"time"

"github.com/wabarc/archive.is"
)
Expand Down Expand Up @@ -38,17 +42,35 @@ func main() {
}

wbrc := &is.Archiver{}

if playback {
collects, _ := wbrc.Playback(args)
for orig, dest := range collects {
fmt.Println(orig, "=>", dest)
}
process(wbrc.Playback, args)
os.Exit(0)
}

saved, _ := wbrc.Wayback(args)
for orig, dest := range saved {
fmt.Println(orig, "=>", dest)
process(wbrc.Wayback, args)
}

// process runs f against every entry of args concurrently, one goroutine per
// entry, and blocks until all of them have finished.
//
// Each entry is parsed with url.Parse and, if that succeeds, handed to f
// together with a context that expires after one minute. Exactly one line of
// the form "<arg> => <result or error text>" is written to standard output
// per entry. Because the goroutines run in parallel, the order of those lines
// is unspecified.
func process(f func(context.Context, *url.URL) (string, error), args []string) {
	// resolve turns one raw argument into the text shown on the right-hand
	// side of "=>": either the value returned by f or the error message.
	resolve := func(raw string) string {
		target, err := url.Parse(raw)
		if err != nil {
			return fmt.Sprint(err)
		}

		// Each call gets its own deadline so a slow one cannot starve the rest.
		ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
		defer cancel()

		result, err := f(ctx, target)
		if err != nil {
			return fmt.Sprint(err)
		}
		return result
	}

	var pending sync.WaitGroup
	pending.Add(len(args))
	for i := range args {
		raw := args[i] // per-iteration copy captured by the goroutine below
		go func() {
			defer pending.Done()
			fmt.Println(raw, "=>", resolve(raw))
		}()
	}
	pending.Wait()
}
3 changes: 2 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,11 @@ require (
github.com/PuerkitoBio/goquery v1.6.1
github.com/andybalholm/cascadia v1.2.0 // indirect
github.com/cretz/bine v0.1.0
github.com/kr/pretty v0.1.0 // indirect
github.com/stretchr/testify v1.7.0 // indirect
github.com/wabarc/helper v0.0.0-20210407153720-1bfe98b427fe
github.com/wabarc/logger v0.0.0-20210417045349-d0d82e8e99ee
golang.org/x/crypto v0.0.0-20210415154028-4f45737414dc // indirect
golang.org/x/net v0.0.0-20210415231046-e915ea6b2b7d
golang.org/x/sys v0.0.0-20210415045647-66c3f260301c // indirect
gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 // indirect
)
7 changes: 0 additions & 7 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,9 @@ github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/rogpeppe/go-internal v1.5.2/go.mod h1:xXDCJY+GAPziupqXw64V24skbSoqbTEfhy4qGm1nDQc=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY=
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/wabarc/helper v0.0.0-20210407153720-1bfe98b427fe h1:V9yz2vQlSVLs51nlo0DAeETFOE57OvlYm98X1LKJA6U=
github.com/wabarc/helper v0.0.0-20210407153720-1bfe98b427fe/go.mod h1:TuTZtoiOu984UWOf7FfX58JllKMjq7FCz701kB5W88E=
github.com/wabarc/logger v0.0.0-20210417045349-d0d82e8e99ee h1:MMIp++7eem2CI1jIYDoPByMwXeZAjsFo2ciBNtvhB80=
github.com/wabarc/logger v0.0.0-20210417045349-d0d82e8e99ee/go.mod h1:4uYr9fnQaQoDk1ttTzLnSB3lZm3i/vrJwN8EZIB2YuI=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
Expand All @@ -41,11 +38,7 @@ golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo=
gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c h1:dUUwHk2QECo/6vqA44rthZ8ie2QXMNeKRTHCNY2nXvo=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
mvdan.cc/xurls/v2 v2.2.0 h1:NSZPykBXJFCetGZykLAxaL6SIpvbVy/UFEniIfHAa8A=
mvdan.cc/xurls/v2 v2.2.0/go.mod h1:EV1RMtya9D6G5DMYPGD8zTQzaHet6Jh8gFlRgGRJeO8=
148 changes: 41 additions & 107 deletions is.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,8 @@ import (
"os"
"strconv"
"strings"
"sync"
"time"

"github.com/PuerkitoBio/goquery"
"github.com/wabarc/helper"
"github.com/wabarc/logger"
)

Expand All @@ -42,7 +39,6 @@ var (
scheme = "http"
onion = "archiveiya74codqgiixo33q62qlrqtkgmcitqx5u2oeqnmn5bpcbiyd.onion" // archivecaslytosk.onion
cookie = ""
timeout = 120 * time.Second
domains = []string{
"archive.today",
"archive.is",
Expand All @@ -62,123 +58,67 @@ func init() {
}

// Wayback handles saving webpages to archive.is.
func (wbrc *Archiver) Wayback(links []string) (map[string]string, error) {
collects, results := make(map[string]string), make(map[string]string)
for _, link := range links {
if helper.IsURL(link) {
collects[link] = link
}
}
if len(collects) == 0 {
return results, fmt.Errorf("Not found")
}

ctx, cancel := context.WithCancel(context.Background())
defer cancel()
func (wbrc *Archiver) Wayback(ctx context.Context, in *url.URL) (dst string, err error) {
torClient, t, err := newTorClient(ctx)
defer closeTor(t)
defer closeTor(t) // nolint:errcheck
if err != nil {
logger.Error("%v", err)
}

is := &IS{
wbrc: wbrc,
httpClient: &http.Client{Timeout: timeout, CheckRedirect: noRedirect},
httpClient: &http.Client{CheckRedirect: noRedirect},
torClient: torClient,
}

ch := make(chan string, len(collects))
defer close(ch)

var mu sync.Mutex
var wg sync.WaitGroup
for _, link := range collects {
wg.Add(1)
go func(link string) {
mu.Lock()
is.submitid = ""
is.archive(link, ch)
results[link] = strings.Replace(<-ch, onion, "archive.today", 1)
mu.Unlock()
wg.Done()
}(link)
}
wg.Wait()

if len(results) == 0 {
return results, fmt.Errorf("No results")
dst, err = is.archive(ctx, in)
if err != nil {
return
}
dst = strings.Replace(dst, onion, "archive.today", 1)

return results, nil
return
}

// Playback handles searching for archived webpages on archive.is.
func (wbrc *Archiver) Playback(links []string) (map[string]string, error) {
collects, results := make(map[string]string), make(map[string]string)
for _, link := range links {
if helper.IsURL(link) {
collects[link] = link
}
}
if len(collects) == 0 {
return results, fmt.Errorf("Not found")
}

ctx, cancel := context.WithCancel(context.Background())
defer cancel()
func (wbrc *Archiver) Playback(ctx context.Context, in *url.URL) (dst string, err error) {
torClient, t, err := newTorClient(ctx)
defer closeTor(t)
defer closeTor(t) // nolint:errcheck
if err != nil {
logger.Error("%v", err)
}

is := &IS{
wbrc: wbrc,
httpClient: &http.Client{Timeout: timeout, CheckRedirect: noRedirect},
httpClient: &http.Client{CheckRedirect: noRedirect},
torClient: torClient,
}

ch := make(chan string, len(collects))
defer close(ch)

var mu sync.Mutex
var wg sync.WaitGroup
for _, link := range collects {
wg.Add(1)
go func(link string) {
mu.Lock()
is.submitid = ""
is.search(link, ch)
results[link] = strings.Replace(<-ch, onion, "archive.today", 1)
mu.Unlock()
wg.Done()
}(link)
}
wg.Wait()

if len(results) == 0 {
return results, fmt.Errorf("No results")
dst, err = is.search(ctx, in)
if err != nil {
return
}
dst = strings.Replace(dst, onion, "archive.today", 1)

return results, nil
return
}
func (is *IS) archive(uri string, ch chan<- string) {
func (is *IS) archive(ctx context.Context, u *url.URL) (string, error) {
endpoint, err := is.getValidDomain()
if err != nil {
ch <- fmt.Sprint("archive.today is unavailable.")
return
return "", fmt.Errorf("archive.today is unavailable.")
}

if is.wbrc.Anyway != "" {
anyway = is.wbrc.Anyway
}
uri := u.String()
data := url.Values{
"submitid": {is.submitid},
"anyway": {anyway},
"url": {uri},
}
domain := endpoint.String()
req, err := http.NewRequest("POST", domain+"/submit/", strings.NewReader(data.Encode()))
req, _ := http.NewRequestWithContext(ctx, http.MethodPost, domain+"/submit/", strings.NewReader(data.Encode()))
req.Header.Add("Content-Type", "application/x-www-form-urlencoded")
req.Header.Add("Content-Length", strconv.Itoa(len(data.Encode())))
req.Header.Add("User-Agent", userAgent)
Expand All @@ -188,46 +128,40 @@ func (is *IS) archive(uri string, ch chan<- string) {
req.Header.Add("Cookie", is.getCookie())
resp, err := is.httpClient.Do(req)
if err != nil {
ch <- fmt.Sprint(err)
return
return "", err
}
defer resp.Body.Close()

code := resp.StatusCode / 100
if code == 1 || code == 4 || code == 5 {
final := fmt.Sprintf("%s?url=%s", domain, uri)
ch <- final
return
return final, nil
}

_, err = io.Copy(ioutil.Discard, resp.Body)
if err != nil {
ch <- fmt.Sprint(err)
return
return "", err
}

// When the anyway parameter is used.
refresh := resp.Header.Get("Refresh")
if len(refresh) > 0 {
r := strings.Split(refresh, ";url=")
if len(r) == 2 {
ch <- r[1]
return
return r[1], nil
}
}
loc := resp.Header.Get("location")
if len(loc) > 2 {
ch <- loc
return
return loc, nil
}
// Redirect to final url if page saved.
final := resp.Request.URL.String()
if len(final) > 0 && strings.Contains(final, "/submit/") == false {
ch <- final
return
if len(final) > 0 && !strings.Contains(final, "/submit/") {
return final, nil
}

ch <- fmt.Sprintf("%s/timegate/%s", domain, uri)
return fmt.Sprintf("%s/timegate/%s", domain, uri), nil
}

func noRedirect(req *http.Request, via []*http.Request) error {
Expand All @@ -248,12 +182,12 @@ func (is *IS) getCookie() string {
}

func (is *IS) getSubmitID(url string) (string, error) {
if strings.Contains(url, "http") == false {
if !strings.Contains(url, "http") {
return "", fmt.Errorf("missing protocol scheme")
}

r := strings.NewReader("")
req, err := http.NewRequest("GET", url, r)
req, _ := http.NewRequest("GET", url, r)
req.Header.Add("Content-Type", "application/x-www-form-urlencoded")
req.Header.Add("User-Agent", userAgent)
req.Header.Add("Cookie", is.getCookie())
Expand Down Expand Up @@ -313,36 +247,36 @@ func (is *IS) getValidDomain() (*url.URL, error) {
return endpoint, nil
}

func (is *IS) search(uri string, ch chan<- string) {
func (is *IS) search(ctx context.Context, in *url.URL) (string, error) {
endpoint, err := is.getValidDomain()
if err != nil {
ch <- fmt.Sprint("archive.today is unavailable.")
return
return "", fmt.Errorf("archive.today is unavailable.")
}

uri := in.String()
domain := endpoint.String()
req, err := http.NewRequest("GET", fmt.Sprintf("%s/%s", domain, uri), nil)
req, err := http.NewRequestWithContext(ctx, http.MethodGet, fmt.Sprintf("%s/%s", domain, uri), nil)
if err != nil {
return "", err
}
req.Header.Add("User-Agent", userAgent)
req.Header.Add("Referer", domain)
req.Header.Add("Host", endpoint.Hostname())
resp, err := is.httpClient.Do(req)
if err != nil {
ch <- fmt.Sprint(err)
return
return "", err
}
defer resp.Body.Close()

doc, err := goquery.NewDocumentFromReader(resp.Body)
if err != nil {
ch <- fmt.Sprint(err)
return
return "", err
}

target, exists := doc.Find("#row0 > .TEXT-BLOCK > a").Attr("href")
if !exists {
ch <- "Not found"
return
return "", fmt.Errorf("Not found")
}

ch <- target
return target, nil
}
Loading

0 comments on commit 675b941

Please sign in to comment.