forked from CorentinB/warc
-
Notifications
You must be signed in to change notification settings - Fork 0
/
decompression-reader.go
153 lines (122 loc) · 4.42 KB
/
decompression-reader.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
// This file is copied from https://github.com/crissyfield/troll-a/blob/main/pkg/fetch/decompression-reader.go , under Apache-2.0 License
// Author: [Crissy Field](https://github.com/crissyfield)
package warc
import (
"bufio"
"compress/bzip2"
"encoding/binary"
"fmt"
"io"
"github.com/klauspost/compress/gzip"
"github.com/klauspost/compress/zstd"
"github.com/ulikunitz/xz"
)
// Magic byte signatures compared against the first bytes of a stream to detect its compression format.
const (
magicGZip = "\x1f\x8b" // First 2 bytes of a Gzip stream (RFC 1952, section 2.3.1)
magicBZip2 = "\x42\x5a" // First 2 bytes ("BZ") of a BZip2 stream (no formal spec exists)
magicXZ = "\xfd\x37\x7a\x58\x5a\x00" // First 6 bytes of an XZ stream (https://tukaani.org/xz/xz-file-format.txt)
magicZStdFrame = "\x28\xb5\x2f\xfd" // First 4 bytes of a ZStd frame (RFC 8478, section 3.1.1)
magicZStdSkippableFrame = "\x2a\x4d\x18" // Bytes 1-3 of the ZStd skippable frame magic (RFC 8478, section 3.1.2); byte 0 is checked separately for a high nibble of 0x5
)
// NewDecompressionReader will return a new reader transparently doing decompression of GZip, BZip2, XZ, and
// ZStd.
func NewDecompressionReader(r io.Reader) (io.Reader, error) {
// Read magic bytes
br := bufio.NewReader(r)
magic, err := br.Peek(6)
if err != nil {
if err == io.EOF {
return io.NopCloser(br), nil
}
return nil, fmt.Errorf("read magic bytes: %w", err)
}
switch {
case string(magic[0:2]) == magicGZip:
// GZIP decompression
return decompressGZip(br)
case string(magic[0:2]) == magicBZip2:
// BZIP2 decompression
return decompressBzip2(br)
case string(magic[0:6]) == magicXZ:
// XZ decompression
return decompressXZ(br)
case string(magic[0:4]) == magicZStdFrame:
// ZStd decompression
return decompressZStd(br)
case (string(magic[1:4]) == magicZStdSkippableFrame) && (magic[0]&0xf0 == 0x50):
// ZStd decompression with custom dictionary
return decompressZStdCustomDict(br)
default:
// Use no decompression
return io.NopCloser(br), nil
}
}
// decompressGZip returns a reader yielding the decompressed contents of the GZip stream read from br.
// An error is returned if the GZip reader cannot be created.
func decompressGZip(br *bufio.Reader) (io.ReadCloser, error) {
	zr, err := gzip.NewReader(br)
	if err == nil {
		return zr, nil
	}

	return nil, fmt.Errorf("read GZip stream: %w", err)
}
// decompressBzip2 returns a reader yielding the decompressed contents of the BZip2 stream read from br.
// The BZip2 reader has no Close method, so it is wrapped in a no-op closer. The returned error is always
// nil; malformed input surfaces as an error from Read.
func decompressBzip2(br *bufio.Reader) (io.ReadCloser, error) {
	return io.NopCloser(bzip2.NewReader(br)), nil
}
// decompressXZ returns a reader yielding the decompressed contents of the XZ stream read from br.
// The XZ reader is wrapped in a no-op closer to satisfy io.ReadCloser. An error is returned if the XZ
// reader cannot be created.
func decompressXZ(br *bufio.Reader) (io.ReadCloser, error) {
	xr, err := xz.NewReader(br)
	if err == nil {
		return io.NopCloser(xr), nil
	}

	return nil, fmt.Errorf("read XZ stream: %w", err)
}
// decompressZStd returns a reader yielding the decompressed contents of the ZStd stream read from br.
// The decoder is created with a concurrency of 1. An error is returned if the decoder cannot be created.
func decompressZStd(br *bufio.Reader) (io.ReadCloser, error) {
	dec, err := zstd.NewReader(br, zstd.WithDecoderConcurrency(1))
	if err != nil {
		return nil, fmt.Errorf("read ZStd stream: %w", err)
	}

	// Expose the decoder through its io.ReadCloser adapter.
	rc := dec.IOReadCloser()
	return rc, nil
}
// decompressZStdCustomDict decompresses a ZStd stream that is prefixed with a custom dictionary stored in a
// ZStd skippable frame, read from br.
//
// The expected layout is an 8-byte skippable frame header (4 magic bytes followed by a little-endian uint32
// payload length), then a payload holding the ZStd-compressed dictionary, then the ZStd stream itself. The
// dictionary is decoded fully into memory and used to build the decoder for the remaining stream.
//
// An error is returned if the header cannot be read completely, is not a skippable frame header, or if the
// dictionary or the final decoder cannot be set up.
func decompressZStdCustomDict(br *bufio.Reader) (io.ReadCloser, error) {
	// Read header. io.ReadFull is required here: a single bufio.Reader.Read may return fewer than the
	// requested 8 bytes, which would leave the length field partially zero-filled.
	var header [8]byte

	if _, err := io.ReadFull(br, header[:]); err != nil {
		return nil, fmt.Errorf("read ZStd skippable frame header: %w", err)
	}

	magic, length := header[0:4], binary.LittleEndian.Uint32(header[4:8])

	if (string(magic[1:4]) != magicZStdSkippableFrame) || (magic[0]&0xf0 != 0x50) {
		return nil, fmt.Errorf("expected ZStd skippable frame header")
	}

	// Read ZStd compressed custom dictionary, never consuming more than the frame's declared payload
	lr := io.LimitReader(br, int64(length))

	dictr, err := zstd.NewReader(lr)
	if err != nil {
		return nil, fmt.Errorf("read ZStd compressed custom dictionary: %w", err)
	}

	defer dictr.Close()

	dict, err := io.ReadAll(dictr)
	if err != nil {
		return nil, fmt.Errorf("read ZStd compressed custom dictionary: %w", err)
	}

	// Discard remaining payload bytes, if any, so br is positioned at the start of the actual stream
	_, err = io.Copy(io.Discard, lr)
	if err != nil {
		return nil, fmt.Errorf("discard remaining bytes of ZStd compressed custom dictionary: %w", err)
	}

	// Open ZStd reader, with the given dictionary
	dr, err := zstd.NewReader(br, zstd.WithDecoderDicts(dict), zstd.WithDecoderConcurrency(1))
	if err != nil {
		return nil, fmt.Errorf("create ZStd reader: %w", err)
	}

	return dr.IOReadCloser(), nil
}