From 383ead8fe7076e62ca141f58aa7eee89c0586b6e Mon Sep 17 00:00:00 2001 From: Mackenzie Grimes - NOAA Affiliate Date: Wed, 13 Nov 2024 11:15:48 -0700 Subject: [PATCH 1/5] add ContentRange to struct, --range cli arg, optional contentRange to Get() --- command/cat.go | 2 +- command/cp.go | 8 +++++++- storage/s3.go | 11 ++++++++++- 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/command/cat.go b/command/cat.go index ccb1ef25b..cf3501513 100644 --- a/command/cat.go +++ b/command/cat.go @@ -150,7 +150,7 @@ func (c Cat) processObjects(ctx context.Context, client *storage.S3, objectChan func (c Cat) processSingleObject(ctx context.Context, client *storage.S3, url *url.URL) error { buf := orderedwriter.New(os.Stdout) - _, err := client.Get(ctx, url, buf, c.concurrency, c.partSize) + _, err := client.Get(ctx, url, buf, c.concurrency, c.partSize, nil) return err } diff --git a/command/cp.go b/command/cp.go index 7ed06bfee..5cf43f0ab 100644 --- a/command/cp.go +++ b/command/cp.go @@ -252,6 +252,10 @@ func NewCopyCommandFlags() []cli.Flag { Name: "version-id", Usage: "use the specified version of an object", }, + &cli.StringFlag{ + Name: "range", + Usage: "defines range header for target object, e.g. 
--range bytes=0-100", + }, &cli.BoolFlag{ Name: "show-progress", Aliases: []string{"sp"}, @@ -320,6 +324,7 @@ type Copy struct { contentType string contentEncoding string contentDisposition string + contentRange string metadata map[string]string metadataDirective string showProgress bool @@ -398,6 +403,7 @@ func NewCopy(c *cli.Context, deleteSource bool) (*Copy, error) { contentType: c.String("content-type"), contentEncoding: c.String("content-encoding"), contentDisposition: c.String("content-disposition"), + contentRange: c.String("range"), metadata: metadata, metadataDirective: c.String("metadata-directive"), showProgress: c.Bool("show-progress"), @@ -665,7 +671,7 @@ func (c Copy) doDownload(ctx context.Context, srcurl *url.URL, dsturl *url.URL) } writer := newCountingReaderWriter(file, c.progressbar) - size, err := srcClient.Get(ctx, srcurl, writer, c.concurrency, c.partSize) + size, err := srcClient.Get(ctx, srcurl, writer, c.concurrency, c.partSize, &c.contentRange) file.Close() if err != nil { diff --git a/storage/s3.go b/storage/s3.go index 3313be66e..557e2ca12 100644 --- a/storage/s3.go +++ b/storage/s3.go @@ -543,6 +543,11 @@ func (s *S3) Copy(ctx context.Context, from, to *url.URL, metadata Metadata) err input.ContentDisposition = aws.String(contentDisposition) } + // TODO: does this even exist for CopyObject? + // if metadata.Range != "" { + // input.Range = aws.String(metadata.Range) + // } + // add retry ID to the object metadata if s.noSuchUploadRetryCount > 0 { input.Metadata[metadataKeyRetryID] = generateRetryID() @@ -600,13 +605,14 @@ func (s *S3) Presign(ctx context.Context, from *url.URL, expire time.Duration) ( // Get is a multipart download operation which downloads S3 objects into any // destination that implements io.WriterAt interface. -// Makes a single 'GetObject' call if 'concurrency' is 1 and ignores 'partSize'. +// Makes a single 'GetObject' call if 'concurrency' is 1 or contentRange is not nil, ignoring 'partSize'. 
func (s *S3) Get( ctx context.Context, from *url.URL, to io.WriterAt, concurrency int, partSize int64, + contentRange *string, // optional ) (int64, error) { if s.dryRun { return 0, nil @@ -620,6 +626,9 @@ func (s *S3) Get( if from.VersionID != "" { input.VersionId = aws.String(from.VersionID) } + if contentRange != nil { + input.Range = aws.String(*contentRange) + } return s.downloader.DownloadWithContext(ctx, to, input, func(u *s3manager.Downloader) { u.PartSize = partSize From 6a017709724569b9dd535a827cbe230f6a685eeb Mon Sep 17 00:00:00 2001 From: Mackenzie Grimes - NOAA Affiliate Date: Wed, 13 Nov 2024 11:41:09 -0700 Subject: [PATCH 2/5] cleanup code comments, add --range arg to README --- README.md | 16 ++++++++++++---- command/cp.go | 3 +++ storage/s3.go | 5 ----- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index f57a98a2f..1c9fc39c5 100644 --- a/README.md +++ b/README.md @@ -80,9 +80,9 @@ You can also install `s5cmd` from [MacPorts](https://ports.macports.org/port/s5c > conda config --add channels conda-forge > conda config --set channel_priority strict > ``` -> +> > Once the `conda-forge` channel has been enabled, `s5cmd` can be installed with `conda`: -> +> > ``` > conda install s5cmd > ``` @@ -319,7 +319,7 @@ folder hierarchy. an [open ticket](https://github.com/peak/s5cmd/issues/29) to track the issue. #### Using Exclude and Include Filters -`s5cmd` supports the `--exclude` and `--include` flags, which can be used to specify patterns for objects to be excluded or included in commands. +`s5cmd` supports the `--exclude` and `--include` flags, which can be used to specify patterns for objects to be excluded or included in commands. - The `--exclude` flag specifies objects that should be excluded from the operation. Any object that matches the pattern will be skipped. - The `--include` flag specifies objects that should be included in the operation. Only objects that match the pattern will be handled. 
@@ -540,7 +540,7 @@ The environment variable `SHELL` must be accurate for the autocompletion to func The autocompletion is tested with following versions of the shells: \ ***zsh*** 5.8.1 (x86_64-apple-darwin21.0) \ GNU ***bash***, version 5.1.16(1)-release (x86_64-apple-darwin21.1.0) \ -***PowerShell*** 7.2.6 +***PowerShell*** 7.2.6 ### Google Cloud Storage support @@ -687,6 +687,14 @@ s5cmd --numworkers 10 cp --concurrency 10 '/Users/foo/bar/*' s3://mybucket/foo/b If you have a few, large files to download, setting `--numworkers` to a very high value will not affect download speed. In this scenario setting `--concurrency` to a higher value may have a better impact on the download speed. +### range + +`range` is a `cp` command option that targets only a specific byterange in the source object to download. This parameter is used by the AWS Go SDK (setting the [Range header](https://www.rfc-editor.org/rfc/rfc9110.html#name-range) in the GET request). Passing `range` option to `cp` will override any `--concurrency` or `--part_size` arguments (1 thread will be used to download this 1 part in the byterange). + +``` +s5cmd cp --range bytes=500-999 's3://mybucket/foo/bar/file.txt' partialFile.txt +``` + ## Benchmarks Some benchmarks regarding the performance of `s5cmd` are introduced below. For more details refer to this [post](https://medium.com/@joshua_robinson/s5cmd-for-high-performance-object-storage-7071352cc09d) diff --git a/command/cp.go b/command/cp.go index 5cf43f0ab..341aa3dfb 100644 --- a/command/cp.go +++ b/command/cp.go @@ -118,6 +118,9 @@ Examples: 24. Pass arbitrary metadata to the object during upload or copy > s5cmd {{.HelpName}} --metadata "camera=Nixon D750" --metadata "imageSize=6032x4032" flowers.png s3://bucket/prefix/flowers.png + + 25. Copy only a specific byte range out of an S3 object. + > s5cmd {{.HelpName}} --range bytes=500-999 s3://bucket/prefix/object . 
` func NewSharedFlags() []cli.Flag { diff --git a/storage/s3.go b/storage/s3.go index 557e2ca12..832666a6d 100644 --- a/storage/s3.go +++ b/storage/s3.go @@ -543,11 +543,6 @@ func (s *S3) Copy(ctx context.Context, from, to *url.URL, metadata Metadata) err input.ContentDisposition = aws.String(contentDisposition) } - // TODO: does this even exist for CopyObject? - // if metadata.Range != "" { - // input.Range = aws.String(metadata.Range) - // } - // add retry ID to the object metadata if s.noSuchUploadRetryCount > 0 { input.Metadata[metadataKeyRetryID] = generateRetryID() From 710b9a9bf61983ed2f3280cf8f268906976b175c Mon Sep 17 00:00:00 2001 From: Mackenzie Grimes - NOAA Affiliate Date: Wed, 13 Nov 2024 11:42:45 -0700 Subject: [PATCH 3/5] readme words --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 1c9fc39c5..65934304b 100644 --- a/README.md +++ b/README.md @@ -689,7 +689,7 @@ If you have a few, large files to download, setting `--numworkers` to a very hig ### range -`range` is a `cp` command option that targets only a specific byterange in the source object to download. This parameter is used by the AWS Go SDK (setting the [Range header](https://www.rfc-editor.org/rfc/rfc9110.html#name-range) in the GET request). Passing `range` option to `cp` will override any `--concurrency` or `--part_size` arguments (1 thread will be used to download this 1 part in the byterange). +`range` is a `cp` command option that downloads only a specific byte range of the source object. The value is passed to the AWS Go SDK, which sets the [Range header](https://www.rfc-editor.org/rfc/rfc9110.html#name-range) in the GET request. Passing the `--range` option to `cp` will override any `--concurrency` or `--part-size` arguments (1 thread will be used to download the 1 part specified by the byte range). 
``` s5cmd cp --range bytes=500-999 's3://mybucket/foo/bar/file.txt' partialFile.txt From 47c47e65eca547c2976758d9abd521d6f01a22b0 Mon Sep 17 00:00:00 2001 From: Mackenzie Grimes - NOAA Affiliate Date: Wed, 13 Nov 2024 11:49:01 -0700 Subject: [PATCH 4/5] fix tab spacing --- command/cp.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/command/cp.go b/command/cp.go index 341aa3dfb..6d85214b0 100644 --- a/command/cp.go +++ b/command/cp.go @@ -327,7 +327,7 @@ type Copy struct { contentType string contentEncoding string contentDisposition string - contentRange string + contentRange string metadata map[string]string metadataDirective string showProgress bool From 350c4b166bd4d5ec2bbabdcd38752228b09f579c Mon Sep 17 00:00:00 2001 From: Mackenzie Grimes - NOAA Affiliate Date: Wed, 13 Nov 2024 12:01:01 -0700 Subject: [PATCH 5/5] fix cp.go formatting --- command/cp.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/command/cp.go b/command/cp.go index 6d85214b0..aeb93fa31 100644 --- a/command/cp.go +++ b/command/cp.go @@ -406,7 +406,7 @@ func NewCopy(c *cli.Context, deleteSource bool) (*Copy, error) { contentType: c.String("content-type"), contentEncoding: c.String("content-encoding"), contentDisposition: c.String("content-disposition"), - contentRange: c.String("range"), + contentRange: c.String("range"), metadata: metadata, metadataDirective: c.String("metadata-directive"), showProgress: c.Bool("show-progress"),