From 675b941854e5cafc8500e033ad10ccb58190838a Mon Sep 17 00:00:00 2001 From: nobody Date: Sat, 3 Jul 2021 20:50:32 +0000 Subject: [PATCH] Refactor: accept single url --- cmd/archive.is/is.go | 38 ++++++++--- go.mod | 3 +- go.sum | 7 -- is.go | 148 ++++++++++++------------------------------- is_test.go | 86 +++++-------------------- tor.go | 3 +- 6 files changed, 90 insertions(+), 195 deletions(-) diff --git a/cmd/archive.is/is.go b/cmd/archive.is/is.go index c528992..4ff9f01 100644 --- a/cmd/archive.is/is.go +++ b/cmd/archive.is/is.go @@ -1,9 +1,13 @@ package main import ( + "context" "flag" "fmt" + "net/url" "os" + "sync" + "time" "github.com/wabarc/archive.is" ) @@ -38,17 +42,35 @@ func main() { } wbrc := &is.Archiver{} - if playback { - collects, _ := wbrc.Playback(args) - for orig, dest := range collects { - fmt.Println(orig, "=>", dest) - } + process(wbrc.Playback, args) os.Exit(0) } - saved, _ := wbrc.Wayback(args) - for orig, dest := range saved { - fmt.Println(orig, "=>", dest) + process(wbrc.Wayback, args) +} + +func process(f func(context.Context, *url.URL) (string, error), args []string) { + var wg sync.WaitGroup + for _, arg := range args { + wg.Add(1) + go func(link string) { + defer wg.Done() + u, err := url.Parse(link) + if err != nil { + fmt.Println(link, "=>", fmt.Sprintf("%v", err)) + return + } + ctx, cancel := context.WithTimeout(context.Background(), time.Minute) + defer cancel() + + r, err := f(ctx, u) + if err != nil { + fmt.Println(link, "=>", fmt.Sprintf("%v", err)) + return + } + fmt.Println(link, "=>", r) + }(arg) } + wg.Wait() } diff --git a/go.mod b/go.mod index 3197e96..9189ae9 100644 --- a/go.mod +++ b/go.mod @@ -6,10 +6,11 @@ require ( github.com/PuerkitoBio/goquery v1.6.1 github.com/andybalholm/cascadia v1.2.0 // indirect github.com/cretz/bine v0.1.0 + github.com/kr/pretty v0.1.0 // indirect github.com/stretchr/testify v1.7.0 // indirect - github.com/wabarc/helper v0.0.0-20210407153720-1bfe98b427fe github.com/wabarc/logger v0.0.0-20210417045349-d0d82e8e99ee golang.org/x/crypto v0.0.0-20210415154028-4f45737414dc // indirect golang.org/x/net v0.0.0-20210415231046-e915ea6b2b7d golang.org/x/sys v0.0.0-20210415045647-66c3f260301c // indirect + gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 // indirect ) diff --git a/go.sum b/go.sum index 7846d94..a9f183e 100644 --- a/go.sum +++ b/go.sum @@ -14,12 +14,9 @@ github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/rogpeppe/go-internal v1.5.2/go.mod h1:xXDCJY+GAPziupqXw64V24skbSoqbTEfhy4qGm1nDQc= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY= github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= -github.com/wabarc/helper v0.0.0-20210407153720-1bfe98b427fe h1:V9yz2vQlSVLs51nlo0DAeETFOE57OvlYm98X1LKJA6U= -github.com/wabarc/helper v0.0.0-20210407153720-1bfe98b427fe/go.mod h1:TuTZtoiOu984UWOf7FfX58JllKMjq7FCz701kB5W88E= github.com/wabarc/logger v0.0.0-20210417045349-d0d82e8e99ee h1:MMIp++7eem2CI1jIYDoPByMwXeZAjsFo2ciBNtvhB80= github.com/wabarc/logger v0.0.0-20210417045349-d0d82e8e99ee/go.mod h1:4uYr9fnQaQoDk1ttTzLnSB3lZm3i/vrJwN8EZIB2YuI= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= @@ -41,11 +38,7 @@ golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo= gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c h1:dUUwHk2QECo/6vqA44rthZ8ie2QXMNeKRTHCNY2nXvo= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -mvdan.cc/xurls/v2 v2.2.0 h1:NSZPykBXJFCetGZykLAxaL6SIpvbVy/UFEniIfHAa8A= -mvdan.cc/xurls/v2 v2.2.0/go.mod h1:EV1RMtya9D6G5DMYPGD8zTQzaHet6Jh8gFlRgGRJeO8= diff --git a/is.go b/is.go index b318d94..c6f55fc 100644 --- a/is.go +++ b/is.go @@ -11,11 +11,8 @@ import ( "os" "strconv" "strings" - "sync" - "time" "github.com/PuerkitoBio/goquery" - "github.com/wabarc/helper" "github.com/wabarc/logger" ) @@ -42,7 +39,6 @@ var ( scheme = "http" onion = "archiveiya74codqgiixo33q62qlrqtkgmcitqx5u2oeqnmn5bpcbiyd.onion" // archivecaslytosk.onion cookie = "" - timeout = 120 * time.Second domains = []string{ "archive.today", "archive.is", @@ -62,123 +58,67 @@ func init() { } // Wayback is the handle of saving webpages to archive.is -func (wbrc *Archiver) Wayback(links []string) (map[string]string, error) { - collects, results := make(map[string]string), make(map[string]string) - for _, link := range links { - if helper.IsURL(link) { - collects[link] = link - } - } - if len(collects) == 0 { - return results, fmt.Errorf("Not found") - } - - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() +func (wbrc *Archiver) Wayback(ctx context.Context, in *url.URL) (dst string, err error) { torClient, t, err := newTorClient(ctx) - defer closeTor(t) + defer closeTor(t) // nolint:errcheck if err != nil { logger.Error("%v", err) } is := &IS{ wbrc: wbrc, - httpClient: &http.Client{Timeout: timeout, CheckRedirect: noRedirect}, + httpClient: &http.Client{CheckRedirect: noRedirect}, torClient: torClient, } - ch := make(chan string, len(collects)) - defer close(ch) - - var mu sync.Mutex - var wg sync.WaitGroup - for _, link := range collects { - wg.Add(1) - go func(link string) { - mu.Lock() - is.submitid = "" - is.archive(link, ch) - results[link] = strings.Replace(<-ch, onion, "archive.today", 1) - mu.Unlock() - wg.Done() - }(link) - } - wg.Wait() - - if len(results) == 0 { - return results, fmt.Errorf("No results") + dst, err = is.archive(ctx, in) + if err != nil { + return } + dst = strings.Replace(dst, onion, "archive.today", 1) - return results, nil + return } // Playback handle searching archived webpages from archive.is -func (wbrc *Archiver) Playback(links []string) (map[string]string, error) { - collects, results := make(map[string]string), make(map[string]string) - for _, link := range links { - if helper.IsURL(link) { - collects[link] = link - } - } - if len(collects) == 0 { - return results, fmt.Errorf("Not found") - } - - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() +func (wbrc *Archiver) Playback(ctx context.Context, in *url.URL) (dst string, err error) { torClient, t, err := newTorClient(ctx) - defer closeTor(t) + defer closeTor(t) // nolint:errcheck if err != nil { logger.Error("%v", err) } is := &IS{ wbrc: wbrc, - httpClient: &http.Client{Timeout: timeout, CheckRedirect: noRedirect}, + httpClient: &http.Client{CheckRedirect: noRedirect}, torClient: torClient, } - ch := make(chan string, len(collects)) - defer close(ch) - - var mu sync.Mutex - var wg sync.WaitGroup - for _, link := range collects { - wg.Add(1) - go func(link string) { - mu.Lock() - is.submitid = "" - is.search(link, ch) - results[link] = strings.Replace(<-ch, onion, "archive.today", 1) - mu.Unlock() - wg.Done() - }(link) - } - wg.Wait() - - if len(results) == 0 { - return results, fmt.Errorf("No results") + dst, err = is.search(ctx, in) + if err != nil { + return } + dst = strings.Replace(dst, onion, "archive.today", 1) - return results, nil + return } -func (is *IS) archive(uri string, ch chan<- string) { +func (is *IS) archive(ctx context.Context, u *url.URL) (string, error) { endpoint, err := is.getValidDomain() if err != nil { - ch <- fmt.Sprint("archive.today is unavailable.") - return + return "", fmt.Errorf("archive.today is unavailable.") } if is.wbrc.Anyway != "" { anyway = is.wbrc.Anyway } + uri := u.String() data := url.Values{ "submitid": {is.submitid}, "anyway": {anyway}, "url": {uri}, } domain := endpoint.String() - req, err := http.NewRequest("POST", domain+"/submit/", strings.NewReader(data.Encode())) + req, _ := http.NewRequestWithContext(ctx, http.MethodPost, domain+"/submit/", strings.NewReader(data.Encode())) req.Header.Add("Content-Type", "application/x-www-form-urlencoded") req.Header.Add("Content-Length", strconv.Itoa(len(data.Encode()))) req.Header.Add("User-Agent", userAgent) @@ -188,22 +128,19 @@ func (is *IS) archive(uri string, ch chan<- string) { req.Header.Add("Cookie", is.getCookie()) resp, err := is.httpClient.Do(req) if err != nil { - ch <- fmt.Sprint(err) - return + return "", err } defer resp.Body.Close() code := resp.StatusCode / 100 if code == 1 || code == 4 || code == 5 { final := fmt.Sprintf("%s?url=%s", domain, uri) - ch <- final - return + return final, nil } _, err = io.Copy(ioutil.Discard, resp.Body) if err != nil { - ch <- fmt.Sprint(err) - return + return "", err } // When use anyway parameter. @@ -211,23 +148,20 @@ func (is *IS) archive(uri string, ch chan<- string) { if len(refresh) > 0 { r := strings.Split(refresh, ";url=") if len(r) == 2 { - ch <- r[1] - return + return r[1], nil } } loc := resp.Header.Get("location") if len(loc) > 2 { - ch <- loc - return + return loc, nil } // Redirect to final url if page saved. final := resp.Request.URL.String() - if len(final) > 0 && strings.Contains(final, "/submit/") == false { - ch <- final - return + if len(final) > 0 && !strings.Contains(final, "/submit/") { + return final, nil } - ch <- fmt.Sprintf("%s/timegate/%s", domain, uri) + return fmt.Sprintf("%s/timegate/%s", domain, uri), nil } func noRedirect(req *http.Request, via []*http.Request) error { @@ -248,12 +182,12 @@ func (is *IS) getCookie() string { } func (is *IS) getSubmitID(url string) (string, error) { - if strings.Contains(url, "http") == false { + if !strings.Contains(url, "http") { return "", fmt.Errorf("missing protocol scheme") } r := strings.NewReader("") - req, err := http.NewRequest("GET", url, r) + req, _ := http.NewRequest("GET", url, r) req.Header.Add("Content-Type", "application/x-www-form-urlencoded") req.Header.Add("User-Agent", userAgent) req.Header.Add("Cookie", is.getCookie()) @@ -313,36 +247,36 @@ func (is *IS) getValidDomain() (*url.URL, error) { return endpoint, nil } -func (is *IS) search(uri string, ch chan<- string) { +func (is *IS) search(ctx context.Context, in *url.URL) (string, error) { endpoint, err := is.getValidDomain() if err != nil { - ch <- fmt.Sprint("archive.today is unavailable.") - return + return "", fmt.Errorf("archive.today is unavailable.") } + uri := in.String() domain := endpoint.String() - req, err := http.NewRequest("GET", fmt.Sprintf("%s/%s", domain, uri), nil) + req, err := http.NewRequestWithContext(ctx, http.MethodGet, fmt.Sprintf("%s/%s", domain, uri), nil) + if err != nil { + return "", err + } req.Header.Add("User-Agent", userAgent) req.Header.Add("Referer", domain) req.Header.Add("Host", endpoint.Hostname()) resp, err := is.httpClient.Do(req) if err != nil { - ch <- fmt.Sprint(err) - return + return "", err } defer resp.Body.Close() doc, err := goquery.NewDocumentFromReader(resp.Body) if err != nil { - ch <- fmt.Sprint(err) - return + return "", err } target, exists := doc.Find("#row0 > .TEXT-BLOCK > a").Attr("href") if !exists { - ch <- "Not found" - return + return "", fmt.Errorf("Not found") } - ch <- target + return target, nil } diff --git a/is_test.go b/is_test.go index 5da1ed0..45d9e9c 100644 --- a/is_test.go +++ b/is_test.go @@ -1,87 +1,33 @@ package is import ( + "context" + "net/url" "testing" ) func TestWayback(t *testing.T) { - var got map[string]string - - tests := []struct { - name string - urls []string - got int - }{ - { - name: "Without URLs", - urls: []string{}, - got: 0, - }, - { - name: "Has one invalid URL", - urls: []string{"foo bar", "https://example.com/"}, - got: 1, - }, - { - name: "URLs full matches", - urls: []string{"https://example.com/", "https://example.org/"}, - got: 2, - }, + uri := "https://example.com" + u, err := url.Parse(uri) + if err != nil { + t.Fatal(err) } - wbrc := &Archiver{} - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - got, _ = wbrc.Wayback(test.urls) - if len(got) != test.got { - t.Errorf("got = %d; want %d", len(got), test.got) - } - for orig, dest := range got { - if testing.Verbose() { - t.Log(orig, "=>", dest) - } - } - }) + _, err = wbrc.Wayback(context.Background(), u) + if err != nil { + t.Fatal(err) } } func TestPlayback(t *testing.T) { - var got map[string]string - - tests := []struct { - name string - urls []string - got int - }{ - { - name: "Without URLs", - urls: []string{}, - got: 0, - }, - { - name: "Has one invalid URL", - urls: []string{"foo bar", "https://example.com/"}, - got: 1, - }, - { - name: "URLs full matches", - urls: []string{"https://example.com/", "https://example.org/"}, - got: 2, - }, + uri := "https://example.com" + u, err := url.Parse(uri) + if err != nil { + t.Fatal(err) } - wbrc := &Archiver{} - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - got, _ = wbrc.Playback(test.urls) - if len(got) != test.got { - t.Errorf("got = %d; want %d", len(got), test.got) - } - for orig, dest := range got { - if testing.Verbose() { - t.Log(orig, "=>", dest) - } - } - }) + _, err = wbrc.Playback(context.Background(), u) + if err != nil { + t.Fatal(err) } } diff --git a/tor.go b/tor.go index 059dbb3..2036577 100644 --- a/tor.go +++ b/tor.go @@ -37,7 +37,7 @@ func newTorClient(ctx context.Context) (client *http.Client, t *tor.Tor, err err // Start tor with default config startConf := &tor.StartConf{TempDataDirBase: os.TempDir(), RetainTempDataDir: false, NoHush: false} - t, err = tor.Start(nil, startConf) + t, err = tor.Start(context.TODO(), startConf) if err != nil { return nil, t, fmt.Errorf("Make connection failed: %w", err) } @@ -59,7 +59,6 @@ func newTorClient(ctx context.Context) (client *http.Client, t *tor.Tor, err err } return &http.Client{ - Timeout: timeout, CheckRedirect: noRedirect, Transport: &http.Transport{ Proxy: http.ProxyFromEnvironment,