Skip to content

Commit

Permalink
fix: ensure we are not writing revisit records to blank records. thes…
Browse files Browse the repository at this point in the history
…e can cause playback problems and are generally not needed.
  • Loading branch information
NGTmeaty authored and CorentinB committed Sep 27, 2024
1 parent 3fd3d8d commit 56091d4
Show file tree
Hide file tree
Showing 3 changed files with 95 additions and 6 deletions.
90 changes: 87 additions & 3 deletions client_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -438,12 +438,12 @@ func TestHTTPClientLocalDedupe(t *testing.T) {

for _, path := range files {
testFileSingleHashCheck(t, path, "sha1:UIRWL5DFIPQ4MX3D3GFHM2HCVU3TZ6I3", []string{"26872", "132"}, 2)
testFileRevisitVailidity(t, path, "", "")
testFileRevisitVailidity(t, path, "", "", false)
}

// verify that the local dedupe count is correct
if LocalDedupeTotal.Value() != 26872 {
t.Fatalf("remote dedupe total mismatch, expected: 26872 got: %d", LocalDedupeTotal.Value())
t.Fatalf("local dedupe total mismatch, expected: 26872 got: %d", LocalDedupeTotal.Value())
}
}

Expand Down Expand Up @@ -534,7 +534,7 @@ func TestHTTPClientRemoteDedupe(t *testing.T) {

for _, path := range files {
testFileSingleHashCheck(t, path, "sha1:UIRWL5DFIPQ4MX3D3GFHM2HCVU3TZ6I3", []string{"26872", "132"}, 4)
testFileRevisitVailidity(t, path, "20220320002518", "sha1:UIRWL5DFIPQ4MX3D3GFHM2HCVU3TZ6I3")
testFileRevisitVailidity(t, path, "20220320002518", "sha1:UIRWL5DFIPQ4MX3D3GFHM2HCVU3TZ6I3", false)
}

// verify that the remote dedupe count is correct
Expand All @@ -543,6 +543,90 @@ func TestHTTPClientRemoteDedupe(t *testing.T) {
}
}

func TestHTTPClientDedupeEmptyPayload(t *testing.T) {
defer goleak.VerifyNone(t)

var (
rotatorSettings = NewRotatorSettings()
errWg sync.WaitGroup
err error
)

// Reset counter to 0?
LocalDedupeTotal.Reset()

// init test HTTP endpoint
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
// Empty. This is intentional to mirror 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ.
fileBytes := []byte("")
w.WriteHeader(http.StatusOK)
w.Write(fileBytes)
}))
defer server.Close()

rotatorSettings.OutputDirectory, err = os.MkdirTemp("", "warc-tests-")
if err != nil {
t.Fatal(err)
}
defer os.RemoveAll(rotatorSettings.OutputDirectory)

rotatorSettings.Prefix = "DEDUP3"

// init the HTTP client responsible for recording HTTP(s) requests / responses
httpClient, err := NewWARCWritingHTTPClient(HTTPClientSettings{
RotatorSettings: rotatorSettings,
DedupeOptions: DedupeOptions{
LocalDedupe: true,
SizeThreshold: 1,
},
})
if err != nil {
t.Fatalf("Unable to init WARC writing HTTP client: %s", err)
}

errWg.Add(1)
go func() {
defer errWg.Done()
for err := range httpClient.ErrChan {
t.Errorf("Error writing to WARC: %s", err.Err.Error())
}
}()

for i := 0; i < 2; i++ {
req, err := http.NewRequest("GET", server.URL, nil)
if err != nil {
t.Fatal(err)
}

resp, err := httpClient.Do(req)
if err != nil {
t.Fatal(err)
}
defer resp.Body.Close()

io.Copy(io.Discard, resp.Body)

time.Sleep(time.Second)
}

httpClient.Close()

files, err := filepath.Glob(rotatorSettings.OutputDirectory + "/*")
if err != nil {
t.Fatal(err)
}

for _, path := range files {
testFileSingleHashCheck(t, path, "sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", []string{"94", "94"}, 2)
testFileRevisitVailidity(t, path, "", "", true)
}

// verify that the local dedupe count is correct
if LocalDedupeTotal.Value() != 0 {
t.Fatalf("local dedupe total mismatch, expected: 0 got: %d", LocalDedupeTotal.Value())
}
}

func TestHTTPClientDisallow429(t *testing.T) {
defer goleak.VerifyNone(t)

Expand Down
4 changes: 2 additions & 2 deletions dialer.go
Original file line number Diff line number Diff line change
Expand Up @@ -338,7 +338,7 @@ func (d *customDialer) writeWARCFromConnection(reqPipe, respPipe *io.PipeReader,
r.Header.Set("Content-Length", strconv.Itoa(getContentLength(r.Content)))

if d.client.dedupeOptions.LocalDedupe {
if r.Header.Get("WARC-Type") == "response" {
if r.Header.Get("WARC-Type") == "response" && r.Header.Get("WARC-Payload-Digest")[5:] != "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ" {
d.client.dedupeHashTable.Store(r.Header.Get("WARC-Payload-Digest")[5:], revisitRecord{
responseUUID: recordIDs[i],
size: getContentLength(r.Content),
Expand Down Expand Up @@ -405,7 +405,7 @@ func (d *customDialer) readResponse(respPipe *io.PipeReader, warcTargetURIChanne
}
}

if revisit.targetURI != "" {
if revisit.targetURI != "" && payloadDigest != "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ" {
responseRecord.Header.Set("WARC-Type", "revisit")
responseRecord.Header.Set("WARC-Refers-To-Target-URI", revisit.targetURI)
responseRecord.Header.Set("WARC-Refers-To-Date", revisit.date)
Expand Down
7 changes: 6 additions & 1 deletion read_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ func testFileSingleHashCheck(t *testing.T, path string, hash string, expectedCon
return -1
}

func testFileRevisitVailidity(t *testing.T, path string, originalTime string, originalDigest string) {
func testFileRevisitVailidity(t *testing.T, path string, originalTime string, originalDigest string, shouldBeEmpty bool) {
var revisitRecordsFound = false
file, err := os.Open(path)
if err != nil {
Expand All @@ -169,6 +169,11 @@ func testFileRevisitVailidity(t *testing.T, path string, originalTime string, or
if revisitRecordsFound {
return
}
if shouldBeEmpty {
t.Logf("No revisit records found. That's expected for this test.")
break
}

t.Fatalf("No revisit records found.")
break
}
Expand Down

0 comments on commit 56091d4

Please sign in to comment.