Skip to content

Commit

Permalink
Add cookie support to CDX request (#34)
Browse files Browse the repository at this point in the history
* feat: add support for sending a cookie with the CDX request
* fix: allow both local dedupe and CDX to be used at the same time
* fix: update the revisit-record tests
  • Loading branch information
NGTmeaty authored Feb 7, 2024
1 parent fd3ee1c commit fd84c9e
Show file tree
Hide file tree
Showing 4 changed files with 19 additions and 11 deletions.
4 changes: 2 additions & 2 deletions client_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -409,7 +409,7 @@ func TestHTTPClientLocalDedupe(t *testing.T) {

for _, path := range files {
testFileSingleHashCheck(t, path, "sha1:UIRWL5DFIPQ4MX3D3GFHM2HCVU3TZ6I3", []string{"26882", "142"}, 2)
testFileRevisitVailidity(t, path)
testFileRevisitVailidity(t, path, "", "")
}
}

Expand Down Expand Up @@ -499,7 +499,7 @@ func TestHTTPClientRemoteDedupe(t *testing.T) {

for _, path := range files {
testFileSingleHashCheck(t, path, "sha1:UIRWL5DFIPQ4MX3D3GFHM2HCVU3TZ6I3", []string{"26882", "142"}, 4)
testFileRevisitVailidity(t, path)
testFileRevisitVailidity(t, path, "20220320002518", "sha1:UIRWL5DFIPQ4MX3D3GFHM2HCVU3TZ6I3")
}
}

Expand Down
13 changes: 11 additions & 2 deletions dedupe.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ type DedupeOptions struct {
LocalDedupe bool
CDXDedupe bool
CDXURL string
CDXCookie string
SizeThreshold int
}

Expand All @@ -29,8 +30,16 @@ func (d *customDialer) checkLocalRevisit(digest string) revisitRecord {
return revisitRecord{}
}

func checkCDXRevisit(CDXURL string, digest string, targetURI string) (revisitRecord, error) {
resp, err := http.Get(CDXURL + "/web/timemap/cdx?url=" + url.QueryEscape(targetURI) + "&filter=digest:" + digest + "&limit=-1")
func checkCDXRevisit(CDXURL string, digest string, targetURI string, cookie string) (revisitRecord, error) {
req, err := http.NewRequest("GET", CDXURL+"/web/timemap/cdx?url="+url.QueryEscape(targetURI)+"&filter=digest:"+digest+"&limit=-1", nil)
if err != nil {
return revisitRecord{}, err
}

if cookie != "" {
req.Header.Add("Cookie", cookie)
}
resp, err := http.DefaultClient.Do(req)
if err != nil {
return revisitRecord{}, err
}
Expand Down
8 changes: 5 additions & 3 deletions dialer.go
Original file line number Diff line number Diff line change
Expand Up @@ -299,14 +299,16 @@ func (d *customDialer) readResponse(respPipe *io.PipeReader, warcTargetURIChanne
}
resp.Body.Close()
responseRecord.Header.Set("WARC-Payload-Digest", "sha1:"+payloadDigest)

// Write revisit record if local or CDX dedupe is activated
var revisit = revisitRecord{}
if bytesCopied >= int64(d.client.dedupeOptions.SizeThreshold) {
if d.client.dedupeOptions.LocalDedupe {
revisit = d.checkLocalRevisit(payloadDigest)
} else if d.client.dedupeOptions.CDXDedupe {
revisit, _ = checkCDXRevisit(d.client.dedupeOptions.CDXURL, payloadDigest, warcTargetURI)
}

// Allow both to be checked. If local dedupe does not find anything, check CDX (if set).
if d.client.dedupeOptions.CDXDedupe && revisit.targetURI == "" {
revisit, _ = checkCDXRevisit(d.client.dedupeOptions.CDXURL, payloadDigest, warcTargetURI, d.client.dedupeOptions.CDXCookie)
}
}

Expand Down
5 changes: 1 addition & 4 deletions read_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ func testFileSingleHashCheck(t *testing.T, path string, hash string, expectedCon
return -1
}

func testFileRevisitVailidity(t *testing.T, path string) {
func testFileRevisitVailidity(t *testing.T, path string, originalTime string, originalDigest string) {
file, err := os.Open(path)
if err != nil {
t.Fatalf("failed to open %q: %v", path, err)
Expand All @@ -152,9 +152,6 @@ func testFileRevisitVailidity(t *testing.T, path string) {
t.Fatalf("warc.NewReader failed for %q: %v", path, err)
}

var originalTime string
var originalDigest string

for {
record, err := reader.ReadRecord()

Expand Down

0 comments on commit fd84c9e

Please sign in to comment.