Skip to content

Commit

Permalink
Change the way we do CDX dedupe
Browse files Browse the repository at this point in the history
  • Loading branch information
CorentinB committed May 8, 2024
1 parent e3fee4d commit 883664a
Showing 1 changed file with 15 additions and 3 deletions.
18 changes: 15 additions & 3 deletions dedupe.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,23 @@ package warc

import (
"io"
"net"
"net/http"
"net/url"
"strings"
"time"
)

// CDXHTTPClient is the shared HTTP client used for CDX dedupe lookups.
//
// Timeout bounds the whole request (connection, redirects and reading the
// response body) to 10 seconds. It must be expressed as a time.Duration:
// a bare integer literal is interpreted as nanoseconds, which would make
// every request time out almost immediately.
//
// The transport additionally caps connection establishment and the TLS
// handshake at 5 seconds each.
var CDXHTTPClient = http.Client{
	Timeout: 10 * time.Second,
	Transport: &http.Transport{
		// DialContext replaces the deprecated Dial field and lets the
		// request's context/deadline abort an in-flight dial.
		DialContext: (&net.Dialer{
			Timeout: 5 * time.Second,
		}).DialContext,
		TLSHandshakeTimeout: 5 * time.Second,
	},
}

type DedupeOptions struct {
LocalDedupe bool
CDXDedupe bool
Expand All @@ -31,15 +43,15 @@ func (d *customDialer) checkLocalRevisit(digest string) revisitRecord {
}

func checkCDXRevisit(CDXURL string, digest string, targetURI string, cookie string) (revisitRecord, error) {
req, err := http.NewRequest("GET", CDXURL+"/web/timemap/cdx?url="+url.QueryEscape(targetURI)+"&filter=digest:"+digest+"&limit=-1", nil)
req, err := http.NewRequest("GET", CDXURL+"/web/timemap/cdx?url="+url.QueryEscape(targetURI)+"&limit=-1", nil)
if err != nil {
return revisitRecord{}, err
}

if cookie != "" {
req.Header.Add("Cookie", cookie)
}
resp, err := http.DefaultClient.Do(req)
resp, err := CDXHTTPClient.Do(req)
if err != nil {
return revisitRecord{}, err
}
Expand All @@ -52,7 +64,7 @@ func checkCDXRevisit(CDXURL string, digest string, targetURI string, cookie stri

cdxReply := strings.Fields(string(body))

if len(cdxReply) >= 7 {
if len(cdxReply) >= 7 && cdxReply[3] != "warc/revisit" && cdxReply[6] == digest {
return revisitRecord{
responseUUID: "",
targetURI: cdxReply[2],
Expand Down

0 comments on commit 883664a

Please sign in to comment.