-
Notifications
You must be signed in to change notification settings - Fork 4
/
dedupe.go
64 lines (53 loc) · 1.25 KB
/
dedupe.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
package warc
import (
	"fmt"
	"io"
	"net/http"
	"net/url"
	"strings"
)
// DedupeOptions groups the settings that control payload deduplication.
//
// NOTE(review): the code that reads these fields is not visible in this part
// of the file; the per-field notes below are inferred from the field names and
// should be confirmed against the callers.
type DedupeOptions struct {
	// LocalDedupe and CDXDedupe presumably enable the local hash-table lookup
	// and the remote CDX lookup respectively.
	LocalDedupe, CDXDedupe bool
	// CDXURL and CDXCookie presumably carry the CDX server base URL and the
	// Cookie header value sent with CDX queries.
	CDXURL, CDXCookie string
	// SizeThreshold is presumably a payload-size cut-off for deduplication;
	// its unit is not visible here — TODO confirm.
	SizeThreshold int
}
// revisitRecord is the result of a deduplication lookup. The zero value is
// what checkLocalRevisit and checkCDXRevisit return when nothing matched.
//
// checkCDXRevisit fills targetURI and date from the CDX reply and leaves
// responseUUID empty.
type revisitRecord struct {
	responseUUID, targetURI, date string
}
// checkLocalRevisit looks digest up in the client's dedupe hash table and
// returns the revisitRecord stored under it. When the table holds no entry
// for digest, the zero revisitRecord is returned.
func (d *customDialer) checkLocalRevisit(digest string) revisitRecord {
	if stored, found := d.client.dedupeHashTable.Load(digest); found {
		// Entries are expected to be revisitRecord values; the unchecked
		// assertion panics on anything else.
		return stored.(revisitRecord)
	}

	return revisitRecord{}
}
// checkCDXRevisit asks the CDX server at CDXURL whether a capture of targetURI
// with the given payload digest is already known.
//
// It sends GET {CDXURL}/web/timemap/cdx?url=...&filter=digest:...&limit=-1
// through http.DefaultClient, adding cookie as the Cookie header when it is
// non-empty. Both targetURI and digest are query-escaped so that characters
// such as '+', '/', '=' or '&' cannot alter the query string.
//
// The reply is split on whitespace. When it holds at least seven fields, the
// second field becomes the record's date and the third its targetURI, while
// responseUUID stays empty. (NOTE(review): this presumably relies on the usual
// CDX column order "urlkey timestamp original ..." — confirm against the
// server in use.) A reply with fewer fields yields the zero revisitRecord and
// a nil error, meaning "no earlier capture found".
//
// An error is returned when the request cannot be built or sent, when the
// server answers with a status other than 200 OK, or when the body cannot be
// read. On error the returned record is always the zero value.
func checkCDXRevisit(CDXURL string, digest string, targetURI string, cookie string) (revisitRecord, error) {
	// Upper bound on how much of the reply is read. Only the first three
	// fields of the reply are used, so this is generous while still
	// protecting against an unexpectedly large response.
	const maxReplySize = 1 << 20

	endpoint := CDXURL + "/web/timemap/cdx?url=" + url.QueryEscape(targetURI) +
		"&filter=digest:" + url.QueryEscape(digest) + "&limit=-1"

	req, err := http.NewRequest(http.MethodGet, endpoint, nil)
	if err != nil {
		return revisitRecord{}, err
	}

	if cookie != "" {
		req.Header.Add("Cookie", cookie)
	}

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return revisitRecord{}, err
	}
	defer resp.Body.Close()

	// An error page can easily contain seven or more whitespace-separated
	// tokens; without this check it would be mistaken for a CDX line.
	if resp.StatusCode != http.StatusOK {
		return revisitRecord{}, fmt.Errorf("cdx lookup: unexpected response status %q", resp.Status)
	}

	body, err := io.ReadAll(io.LimitReader(resp.Body, maxReplySize))
	if err != nil {
		return revisitRecord{}, err
	}

	fields := strings.Fields(string(body))
	if len(fields) < 7 {
		return revisitRecord{}, nil
	}

	return revisitRecord{
		responseUUID: "",
		targetURI:    fields[2],
		date:         fields[1],
	}, nil
}