-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Add: warc extract * Add: results report printing * Oops, forgot to push utils.go * Add .gitignore for output folder * fix: improve support for spaces in the msgtype. * fix: add warc executable and warc files to ignore. * Add: warc verify * Update cmd/verify.go * small cosmetic fix * fix: we currently cannot process revisit records. this is currently outside of the scope of this tool, but could be added in the future. * feat: add gzip content decoding * fix: revisit records in verify * small cosmetic fix * fix: revisit if statement * feat: add folder structure to extract output. * fix: add support for SHA-256 Base16 verify support Base16 appears to be the most common SHA-256 encoding. As such, we will check based on that. iipc/warc-specifications#80 (comment) * Add: --host-sort * Truncate filenames too long * Cmd/extract: use filename from Content-Disposition only when it's not empty * cmd/extract: replace / in filenames * cmd/extract: handle mime parsing failure * feat: add (default) support to suffix duplicate file names with a SHA1 hash if they are different. * fix: resolve EOF read error --------- Co-authored-by: Jake L <[email protected]>
- Loading branch information
Showing
8 changed files
with
657 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,5 @@ | ||
warcs/* | ||
temp/* | ||
warcs/** | ||
temp/** | ||
output/** | ||
warc | ||
*.warc.gz |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,267 @@ | ||
package main | ||
|
||
import ( | ||
"bufio" | ||
"bytes" | ||
"compress/gzip" | ||
"io" | ||
"mime" | ||
"net/http" | ||
"net/url" | ||
"os" | ||
"path" | ||
"slices" | ||
"strconv" | ||
"strings" | ||
"time" | ||
|
||
"github.com/CorentinB/warc" | ||
"github.com/remeh/sizedwaitgroup" | ||
"github.com/sirupsen/logrus" | ||
"github.com/spf13/cobra" | ||
) | ||
|
||
func extract(cmd *cobra.Command, files []string) { | ||
threads, err := strconv.Atoi(cmd.Flags().Lookup("threads").Value.String()) | ||
if err != nil { | ||
logrus.Fatalf("failed to parse threads: %s", err.Error()) | ||
} | ||
|
||
swg := sizedwaitgroup.New(threads) | ||
|
||
for _, filepath := range files { | ||
startTime := time.Now() | ||
resultsChan := make(chan string) | ||
results := make(map[string]int) | ||
|
||
f, err := os.Open(filepath) | ||
if err != nil { | ||
logrus.Errorf("failed to open file: %s", err.Error()) | ||
return | ||
} | ||
|
||
reader, err := warc.NewReader(f) | ||
if err != nil { | ||
logrus.Errorf("warc.NewReader failed for %q: %v", filepath, err) | ||
return | ||
} | ||
|
||
go func(c chan string) { | ||
for result := range c { | ||
results[result]++ | ||
} | ||
}(resultsChan) | ||
|
||
for { | ||
record, err := reader.ReadRecord() | ||
if err != nil { | ||
if err != io.EOF { | ||
logrus.Errorf("failed to read all record content: %v", err) | ||
return | ||
} | ||
break | ||
} | ||
|
||
swg.Add() | ||
go processRecord(cmd, record, &resultsChan, &swg) | ||
} | ||
|
||
swg.Wait() | ||
close(resultsChan) | ||
|
||
printExtractReport(filepath, results, time.Since(startTime)) | ||
} | ||
} | ||
|
||
func processRecord(cmd *cobra.Command, record *warc.Record, resultsChan *chan string, swg *sizedwaitgroup.SizedWaitGroup) { | ||
defer record.Content.Close() | ||
defer swg.Done() | ||
|
||
// Only process Content-Type: application/http; msgtype=response (no reason to process requests or other records) | ||
if !strings.Contains(record.Header.Get("Content-Type"), "msgtype=response") { | ||
logrus.Debugf("skipping record with Content-Type: %s", record.Header.Get("Content-Type")) | ||
return | ||
} | ||
|
||
if record.Header.Get("WARC-Type") == "revisit" { | ||
logrus.Debugf("skipping revisit record.") | ||
return | ||
} | ||
|
||
// Read the entire record.Content into a bufio.Reader | ||
response, err := http.ReadResponse(bufio.NewReader(record.Content), nil) | ||
if err != nil { | ||
logrus.Errorf("failed to read response: %v", err) | ||
return | ||
} | ||
|
||
// If the response's Content-Type match one of the content types to extract, write the file | ||
contentTypesToExtract := strings.Split(strings.Trim(cmd.Flags().Lookup("content-type").Value.String(), "[]"), ",") | ||
|
||
if slices.ContainsFunc(contentTypesToExtract, func(s string) bool { | ||
return strings.Contains(response.Header.Get("Content-Type"), s) | ||
}) { | ||
err = writeFile(cmd, response, record) | ||
if err != nil { | ||
logrus.Errorf("failed to write file: %v", err) | ||
return | ||
} | ||
|
||
// Send the result to the results channel | ||
*resultsChan <- response.Header.Get("Content-Type") | ||
} | ||
} | ||
|
||
func writeFile(vmd *cobra.Command, resp *http.Response, record *warc.Record) error { | ||
// Find the filename either from the Content-Disposition header or the last part of the URL | ||
filename := path.Base(record.Header.Get("WARC-Target-URI")) | ||
|
||
if resp.Header.Get("Content-Disposition") != "" { | ||
_, params, err := mime.ParseMediaType(resp.Header.Get("Content-Disposition")) | ||
if err == nil { | ||
if params["filename"] != "" { | ||
filename = params["filename"] | ||
} | ||
} else { | ||
logrus.Debugf("failed to parse Content-Disposition header: %v", err) | ||
|
||
if !strings.HasSuffix(filename, ".pdf") { | ||
filename += ".pdf" | ||
} | ||
} | ||
} | ||
|
||
// Truncate the filename if it's too long (keep the extension) | ||
if len(filename) > 255 { | ||
extension := path.Ext(filename) | ||
|
||
filename = filename[:255-len(extension)] + extension | ||
} | ||
|
||
// Remove any invalid characters from the filename | ||
filename = strings.ReplaceAll(filename, "/", "_") | ||
|
||
// Check if the file already exists | ||
outputDir := vmd.Flags().Lookup("output").Value.String() | ||
|
||
// Create the output directory if it doesn't exist. | ||
if _, err := os.Stat(outputDir); os.IsNotExist(err) { | ||
err := os.MkdirAll(outputDir, 0755) | ||
if err != nil { | ||
return err | ||
} | ||
} | ||
|
||
// Check if --host-sort is enabled, if yes extract the host from the WARC-Target-URI and put the file in a subdirectory | ||
if vmd.Flags().Lookup("host-sort").Changed { | ||
URI := record.Header.Get("WARC-Target-URI") | ||
URL, err := url.Parse(URI) | ||
if err != nil { | ||
return err | ||
} | ||
|
||
err = os.MkdirAll(path.Join(outputDir, URL.Host), 0755) | ||
if err != nil { | ||
return err | ||
} | ||
|
||
outputDir = path.Join(outputDir, URL.Host) | ||
} | ||
|
||
outputPath := path.Join(outputDir, filename) | ||
if _, err := os.Stat(outputPath); err == nil { | ||
if vmd.Flags().Lookup("hash-suffix").Changed { | ||
// Read the file to check the hash. | ||
originalFile, err := os.Open(outputPath) | ||
if err != nil { | ||
return err | ||
} | ||
|
||
defer originalFile.Close() | ||
|
||
body, err := io.ReadAll(resp.Body) | ||
|
||
if err != nil { | ||
return err | ||
} | ||
|
||
var reader io.Reader | ||
|
||
if resp.Header.Get("Content-Encoding") == "gzip" { | ||
reader, err = gzip.NewReader(bytes.NewReader(body)) | ||
if err != nil { | ||
return err | ||
} | ||
} else { | ||
reader = bytes.NewReader(body) | ||
} | ||
|
||
payloadDigest := warc.GetSHA1(reader) | ||
|
||
// Reset response reader | ||
resp.Body = io.NopCloser(bytes.NewBuffer(body)) | ||
|
||
originalPayloadDigest := warc.GetSHA1(originalFile) | ||
|
||
if originalPayloadDigest != payloadDigest { | ||
if len(filename) > 247 { | ||
extension := path.Ext(filename) | ||
|
||
filename = filename[:247-len(extension)] + "[" + payloadDigest[26:] + "]" + extension | ||
} else { | ||
extension := path.Ext(filename) | ||
|
||
filename = filename[:len(filename)-len(extension)] + "[" + payloadDigest[26:] + "]" + extension | ||
} | ||
|
||
outputPath = path.Join(outputDir, filename) | ||
// Double check that the new file doesn't exist | ||
if _, err := os.Stat(outputPath); err == nil { | ||
if !vmd.Flags().Lookup("allow-overwrite").Changed { | ||
logrus.Infof("file %s already exists, skipping", filename) | ||
return nil | ||
} | ||
} | ||
} else { | ||
// Matches! | ||
logrus.Infof("file %s already exists and hash matches, skipping", filename) | ||
return nil | ||
} | ||
|
||
} else if !vmd.Flags().Lookup("allow-overwrite").Changed { | ||
logrus.Infof("file %s already exists, skipping", filename) | ||
return nil | ||
} | ||
} | ||
|
||
// Create the file | ||
file, err := os.OpenFile(outputPath, os.O_CREATE|os.O_WRONLY, 0644) | ||
if err != nil { | ||
return err | ||
} | ||
defer file.Close() | ||
|
||
// Close body when finished. | ||
defer resp.Body.Close() | ||
|
||
var reader io.ReadCloser | ||
|
||
switch resp.Header.Get("Content-Encoding") { | ||
case "gzip": | ||
reader, err = gzip.NewReader(resp.Body) | ||
if err != nil { | ||
return err | ||
} | ||
defer reader.Close() | ||
default: | ||
reader = resp.Body | ||
} | ||
|
||
// Write the response body to the file | ||
_, err = io.Copy(file, reader) | ||
if err != nil { | ||
return err | ||
} | ||
|
||
return nil | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
package main | ||
|
||
import ( | ||
"os" | ||
|
||
"github.com/spf13/cobra" | ||
) | ||
|
||
func init() { | ||
rootCmd.AddCommand(extractCmd) | ||
rootCmd.AddCommand(verifyCmd) | ||
|
||
extractCmd.Flags().IntP("threads", "t", 1, "Number of threads to use for extraction") | ||
extractCmd.Flags().StringP("output", "o", "output", "Output directory for extracted files") | ||
extractCmd.Flags().StringSliceP("content-type", "c", []string{}, "Content type that should be extracted") | ||
extractCmd.Flags().Bool("allow-overwrite", false, "Allow overwriting of existing files") | ||
extractCmd.Flags().Bool("host-sort", false, "Sort the extracted URLs by host") | ||
extractCmd.Flags().Bool("hash-suffix", false, "When duplicate file names exist, the hash will be added if a duplicate file name exists. ") | ||
|
||
verifyCmd.Flags().IntP("threads", "t", 1, "Number of threads to use for verification") | ||
verifyCmd.Flags().Bool("json", false, "Output results in JSON format") | ||
} | ||
|
||
// rootCmd represents the base command when called without any subcommands | ||
var rootCmd = &cobra.Command{ | ||
Use: "cmd", | ||
Short: "Utility to process WARC files", | ||
Long: `Utility to process WARC files`, | ||
} | ||
|
||
var extractCmd = &cobra.Command{ | ||
Use: "extract", | ||
Short: "Extracts the URLs from one or many WARC file(s)", | ||
Long: `Extracts the URLs from one or many WARC file(s)`, | ||
Args: cobra.MinimumNArgs(1), | ||
Run: extract, | ||
} | ||
|
||
var verifyCmd = &cobra.Command{ | ||
Use: "verify", | ||
Short: "Verify the validity of one or many WARC file(s)", | ||
Long: `Verify the validity of xtracts the URLs from one or many WARC file(s)`, | ||
Args: cobra.MinimumNArgs(1), | ||
Run: verify, | ||
} | ||
|
||
func main() { | ||
err := rootCmd.Execute() | ||
if err != nil { | ||
os.Exit(1) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
package main | ||
|
||
import ( | ||
"time" | ||
|
||
"github.com/sirupsen/logrus" | ||
) | ||
|
||
func printExtractReport(filePath string, results map[string]int, elapsed time.Duration) { | ||
total := 0 | ||
|
||
for _, v := range results { | ||
total += v | ||
} | ||
|
||
logrus.Infof("Processed file %s in %s", filePath, elapsed.String()) | ||
logrus.Infof("Number of files extracted: %d", total) | ||
for k, v := range results { | ||
logrus.Infof("- %s: %d\n", k, v) | ||
} | ||
} |
Oops, something went wrong.