This repository has been archived by the owner on Nov 19, 2024. It is now read-only.
-
-
Notifications
You must be signed in to change notification settings - Fork 390
/
tar.go
253 lines (219 loc) · 7.22 KB
/
tar.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
package archiver
import (
"archive/tar"
"context"
"errors"
"fmt"
"io"
"io/fs"
"log"
"path"
"strings"
)
// init passes a zero-value Tar to RegisterFormat when the package is
// initialized.
func init() {
RegisterFormat(Tar{})
}
// Tar carries the options used by the tar format methods in this file
// (Match, Archive, ArchiveAsync, Insert and Extract).
type Tar struct {
// If true, the user and group names are blanked in every header
// written to an archive, so only the numeric user and group IDs
// are preserved.
NumericUIDGID bool
// If true, errors encountered during reading or writing
// a file within an archive will be logged and the
// operation will continue on remaining files.
// Context errors are never skipped this way.
ContinueOnError bool
}
// Extension returns the file name extension associated with tar archives,
// including the leading dot.
func (Tar) Extension() string {
	const ext = ".tar"
	return ext
}
// Match reports whether the given file name and/or stream look like a tar
// archive.
//
// ByName is set when the lower-cased filename contains the tar extension
// anywhere in it, so compound names such as "x.tar.gz" match as well.
// ByStream is set when stream is non-nil and a tar reader can read a first
// header from it without error. The returned error is always nil.
func (t Tar) Match(_ context.Context, filename string, stream io.Reader) (MatchResult, error) {
	var result MatchResult

	// name-based detection
	lowered := strings.ToLower(filename)
	result.ByName = strings.Contains(lowered, t.Extension())

	// content-based detection needs something to read from
	if stream == nil {
		return result, nil
	}
	if _, err := tar.NewReader(stream).Next(); err == nil {
		result.ByStream = true
	}

	return result, nil
}
// Archive writes files, in order, to output as a tar stream and then closes
// the tar writer, which flushes the last entry and writes the end-of-archive
// trailer.
//
// A failure to write a file aborts the operation and is returned, unless
// ContinueOnError is set and the context is still alive, in which case the
// failure is logged and the remaining files are attempted.
//
// An error from closing the tar writer is no longer discarded: if nothing
// else is being returned it is returned to the caller, or logged when
// ContinueOnError is set. Without this, a failed trailer write would yield a
// truncated archive together with a nil error.
func (t Tar) Archive(ctx context.Context, output io.Writer, files []FileInfo) (err error) {
	tw := tar.NewWriter(output)
	defer func() {
		closeErr := tw.Close()
		if closeErr == nil || err != nil {
			return // nothing to report, or an earlier error takes precedence
		}
		if t.ContinueOnError {
			log.Printf("[ERROR] closing tar writer: %v", closeErr)
			return
		}
		err = fmt.Errorf("closing tar writer: %w", closeErr)
	}()

	for _, file := range files {
		writeErr := t.writeFileToArchive(ctx, tw, file)
		if writeErr == nil {
			continue
		}
		if t.ContinueOnError && ctx.Err() == nil { // context errors should always abort
			log.Printf("[ERROR] %v", writeErr)
			continue
		}
		return writeErr
	}

	return nil
}
// ArchiveAsync writes the file of every job received on jobs to output as a
// tar stream, sending the outcome of each write on that job's Result
// channel. It returns once jobs has been closed and drained.
//
// Per-file failures are reported only through the job's Result channel.
// An error from closing the tar writer (flushing the final entry and writing
// the end-of-archive trailer) is returned when no job has reported a failure;
// if a job already failed, the close error is ignored as before, since the
// caller has been told about the failure through Result.
func (t Tar) ArchiveAsync(ctx context.Context, output io.Writer, jobs <-chan ArchiveAsyncJob) (err error) {
	tw := tar.NewWriter(output)

	jobFailed := false
	defer func() {
		if closeErr := tw.Close(); closeErr != nil && !jobFailed && err == nil {
			err = fmt.Errorf("closing tar writer: %w", closeErr)
		}
	}()

	for job := range jobs {
		jobErr := t.writeFileToArchive(ctx, tw, job.File)
		if jobErr != nil {
			jobFailed = true
		}
		job.Result <- jobErr
	}

	return nil
}
// writeFileToArchive appends a single entry for file to tw: first the
// header, then, for regular files only, the file contents.
//
// It returns the context's error without writing anything if ctx is already
// done. Other errors are wrapped with the file's NameInArchive and the step
// that failed.
func (t Tar) writeFileToArchive(ctx context.Context, tw *tar.Writer, file FileInfo) error {
	// honor context cancellation
	if ctxErr := ctx.Err(); ctxErr != nil {
		return ctxErr
	}

	header, err := tar.FileInfoHeader(file, file.LinkTarget)
	if err != nil {
		return fmt.Errorf("file %s: creating header: %w", file.NameInArchive, err)
	}

	// FileInfoHeader() only fills in the base name, so use the complete
	// path within the archive; fall back to the base name if none was given.
	entryName := file.NameInArchive
	if entryName == "" {
		entryName = file.Name()
	}
	header.Name = entryName

	// dropping the names leaves only the numeric IDs in the header
	if t.NumericUIDGID {
		header.Uname, header.Gname = "", ""
	}

	if err = tw.WriteHeader(header); err != nil {
		return fmt.Errorf("file %s: writing header: %w", file.NameInArchive, err)
	}

	// Only regular files have a body to write; directories, links and the
	// like are complete once their header is written.
	if header.Typeflag == tar.TypeReg {
		if err = openAndCopyFile(file, tw); err != nil {
			return fmt.Errorf("file %s: writing data: %w", file.NameInArchive, err)
		}
	}

	return nil
}
// Insert appends files to the existing tar archive in into. It scans the
// archive to find where the last entry ends, seeks there (overwriting any
// end-of-archive padding), writes the new entries, and closes the tar writer
// so that a fresh trailer is written.
//
// A failure to write a file aborts the operation and is returned, unless
// ContinueOnError is set and the context is still alive, in which case it is
// logged and the remaining files are attempted. An error from closing the
// tar writer is returned if nothing else is being returned, or logged when
// ContinueOnError is set.
func (t Tar) Insert(ctx context.Context, into io.ReadWriteSeeker, files []FileInfo) (err error) {
	// Tar files may end with some, none, or a lot of zero-byte padding. The spec says
	// it should end with two 512-byte trailer records consisting solely of null/0
	// bytes: https://www.gnu.org/software/tar/manual/html_node/Standard.html. However,
	// archives produced by the `tar` command do not reliably follow that, as
	// Colin Percival (author of tarsnap) confirmed:
	// - https://twitter.com/cperciva/status/1476774314623913987
	// - https://twitter.com/cperciva/status/1476776999758663680
	// So while the approach in https://stackoverflow.com/a/18330903/1048862 works
	// if you control the writer, it is not a general solution. The only reliable
	// thing to do is scan the entire archive to find the last entry, note the
	// stream position right after its header and its size, and from those
	// compute where its (block-padded) content ends; everything after that is
	// end-of-archive padding that can be overwritten.
	//
	// Another option is to scan the file for the last contiguous series of 0s,
	// without interpreting the tar format at all, and to find the nearest
	// blocksize-offset and start writing there. Problem is that you wouldn't
	// know if you just overwrote some of the last file if it ends with all 0s.
	var lastFileSize, lastStreamPos int64
	tr := tar.NewReader(into)
	for {
		hdr, nextErr := tr.Next()
		if nextErr == io.EOF {
			break
		}
		if nextErr != nil {
			return nextErr
		}
		// assumes the tar reader has consumed exactly up to the end of this
		// entry's header, so the position is where the entry's data begins
		lastStreamPos, err = into.Seek(0, io.SeekCurrent)
		if err != nil {
			return err
		}
		lastFileSize = hdr.Size
	}

	// Entry data is padded with zeros up to a multiple of the block size, and
	// the next header starts right after that padding. Round up only when the
	// data does not already end on a block boundary: unconditionally adding
	// blockSize - (offset % blockSize) would skip one whole block whenever the
	// remainder is 0 (empty archive, or a last entry that is a directory, an
	// empty file, or a file whose size is a multiple of 512), leaving a lone
	// zero block in front of the appended entries and corrupting the archive.
	const blockSize = 512 // (as of Go 1.17, this is also a hard-coded const in the archive/tar package)
	newOffset := lastStreamPos + lastFileSize
	if rem := newOffset % blockSize; rem != 0 {
		newOffset += blockSize - rem
	}

	if _, err = into.Seek(newOffset, io.SeekStart); err != nil {
		return err
	}

	tw := tar.NewWriter(into)
	defer func() {
		// Close writes the new end-of-archive trailer; do not lose its error.
		closeErr := tw.Close()
		if closeErr == nil || err != nil {
			return // nothing to report, or an earlier error takes precedence
		}
		if t.ContinueOnError {
			log.Printf("[ERROR] closing tar writer: %v", closeErr)
			return
		}
		err = fmt.Errorf("closing tar writer: %w", closeErr)
	}()

	for i, file := range files {
		if ctxErr := ctx.Err(); ctxErr != nil {
			return ctxErr // honor context cancellation
		}
		writeErr := t.writeFileToArchive(ctx, tw, file)
		if writeErr == nil {
			continue
		}
		if t.ContinueOnError && ctx.Err() == nil {
			log.Printf("[ERROR] appending file %d into archive: %s: %v", i, file.Name(), writeErr)
			continue
		}
		return fmt.Errorf("appending file %d into archive: %s: %w", i, file.Name(), writeErr)
	}

	return nil
}
// Extract reads the tar stream in sourceArchive and calls handleFile once
// for every entry, in archive order, skipping pax global headers and any
// entry that fileIsIncluded reports for the accumulated skip list.
//
// The file passed to handleFile reads directly from the underlying tar
// reader when opened, so its contents are only available during that call.
//
// handleFile may return fs.SkipDir to skip the rest of the entry's directory
// (or the directory itself, if the entry is one) or fs.SkipAll to stop
// extraction without error. Any other error from handleFile aborts the
// extraction and is returned wrapped with the entry name. Context
// cancellation is checked before each entry.
func (t Tar) Extract(ctx context.Context, sourceArchive io.Reader, handleFile FileHandler) error {
	tr := tar.NewReader(sourceArchive)

	// important to initialize to non-nil, empty value due to how fileIsIncluded works
	skipDirs := skipList{}

	for {
		if err := ctx.Err(); err != nil {
			return err // honor context cancellation
		}

		hdr, err := tr.Next()
		if err == io.EOF {
			break
		}
		if err != nil {
			// The tar reader remembers a failure to advance and returns the
			// same error from every later call to Next, so retrying would
			// loop forever. Continuing is only possible when a header was
			// returned along with the error (as with tar.ErrInsecurePath),
			// which means the reader did advance past the entry.
			if hdr != nil && t.ContinueOnError && ctx.Err() == nil {
				log.Printf("[ERROR] Advancing to next file in tar archive: %v", err)
				continue
			}
			return err
		}

		if fileIsIncluded(skipDirs, hdr.Name) {
			continue
		}
		if hdr.Typeflag == tar.TypeXGlobalHeader {
			// ignore the pax global header from git-generated tarballs
			continue
		}

		info := hdr.FileInfo()
		file := FileInfo{
			FileInfo:      info,
			Header:        hdr,
			NameInArchive: hdr.Name,
			LinkTarget:    hdr.Linkname,
			Open: func() (fs.File, error) {
				return fileInArchive{io.NopCloser(tr), info}, nil
			},
		}

		err = handleFile(ctx, file)
		if errors.Is(err, fs.SkipAll) {
			// fs.SkipAll means the walk should stop (it was nearly named
			// fs.StopWalk; see https://github.com/golang/go/issues/47209),
			// not that the remaining entries should be iterated as no-ops.
			break
		} else if errors.Is(err, fs.SkipDir) {
			// if a directory, skip this path; if a file, skip the folder path
			dirPath := hdr.Name
			if hdr.Typeflag != tar.TypeDir {
				dirPath = path.Dir(hdr.Name) + "/"
			}
			skipDirs.add(dirPath)
		} else if err != nil {
			return fmt.Errorf("handling file: %s: %w", hdr.Name, err)
		}
	}

	return nil
}
// Interface guards: compile-time assertions that *Tar satisfies each of
// the interfaces below. The values are discarded; a missing method makes
// the package fail to compile.
var (
_ Archiver = (*Tar)(nil)
_ ArchiverAsync = (*Tar)(nil)
_ Extractor = (*Tar)(nil)
_ Inserter = (*Tar)(nil)
)