Skip to content

Commit

Permalink
Rewrite walk to use less RAM
Browse files Browse the repository at this point in the history
  • Loading branch information
mjkw31 committed Dec 17, 2024
1 parent 90fab5b commit 5393d83
Show file tree
Hide file tree
Showing 7 changed files with 252 additions and 214 deletions.
1 change: 0 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ require (
github.com/smartystreets/goconvey v1.7.2
github.com/spf13/cobra v1.8.1
github.com/termie/go-shutil v0.0.0-20140729215957-bcacb06fecae
github.com/wtsi-hgi/godirwalk v1.18.1
github.com/wtsi-ssg/wr v0.5.9
)

Expand Down
2 changes: 0 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -284,8 +284,6 @@ github.com/tklauser/numcpus v0.9.0 h1:lmyCHtANi8aRUgkckBgoDk1nHCux3n2cgkJLXdQGPD
github.com/tklauser/numcpus v0.9.0/go.mod h1:SN6Nq1O3VychhC1npsWostA+oW+VOQTxZrS604NSRyI=
github.com/ugorji/go/codec v1.2.12 h1:9LC83zGrHhuUA9l16C9AHXAqEV/2wBQ4nkvumAE65EE=
github.com/ugorji/go/codec v1.2.12/go.mod h1:UNopzCgEMSXjBc6AOMqYvWC1ktqTAfzJZUZgYf6w6lg=
github.com/wtsi-hgi/godirwalk v1.18.1 h1:t7eaGXYBfTtfIEGLizPCC9fzASTvZtdhKEEri8TyyJs=
github.com/wtsi-hgi/godirwalk v1.18.1/go.mod h1:rLa4FlI9kdT7o67jwFos8qgaX3K2sMC6XI4FXJ1iVyk=
github.com/wtsi-ssg/wr v0.5.9 h1:lJWNuJfVvhTpXQqxRN5RbffhvK3HMog0fFpUFznvoz8=
github.com/wtsi-ssg/wr v0.5.9/go.mod h1:njSdCX+xv1xzzw3Oy3Smid6s/IyIQEvLsKbRwaq4fC8=
github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
Expand Down
134 changes: 39 additions & 95 deletions walk/dirent.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,136 +29,80 @@ package walk
import (
"io/fs"
"os"
"sync"
"unsafe"

"github.com/wtsi-hgi/godirwalk"
)

var (
filePathPool64 = sync.Pool{New: func() any { x := make(FilePath, 0, 64); return &x }} //nolint:gochecknoglobals,mnd,nlreturn,lll
filePathPool128 = sync.Pool{New: func() any { x := make(FilePath, 0, 128); return &x }} //nolint:gochecknoglobals,mnd,nlreturn,lll
filePathPool256 = sync.Pool{New: func() any { x := make(FilePath, 0, 256); return &x }} //nolint:gochecknoglobals,mnd,nlreturn,lll
filePathPool512 = sync.Pool{New: func() any { x := make(FilePath, 0, 512); return &x }} //nolint:gochecknoglobals,mnd,nlreturn,lll
filePathPool1024 = sync.Pool{New: func() any { x := make(FilePath, 0, 1024); return &x }} //nolint:gochecknoglobals,mnd,nlreturn,lll
filePathPool2048 = sync.Pool{New: func() any { x := make(FilePath, 0, 2048); return &x }} //nolint:gochecknoglobals,mnd,nlreturn,lll
filePathPool4096 = sync.Pool{New: func() any { x := make(FilePath, 0, 4096); return &x }} //nolint:gochecknoglobals,mnd,nlreturn,lll
"strings"
)

// FilePath is a byte-slice of a path, utilising object pools to reduce memory
// allocations.
//
// It is the client's responsibility to call the Done method once it is no longer
// needed.
type FilePath []byte

func newFilePathSize(size int) *FilePath {
switch {
case size <= 64: //nolint:mnd
return filePathPool64.Get().(*FilePath) //nolint:forcetypeassert
case size <= 128: //nolint:mnd
return filePathPool128.Get().(*FilePath) //nolint:forcetypeassert
case size <= 256: //nolint:mnd
return filePathPool256.Get().(*FilePath) //nolint:forcetypeassert
case size <= 512: //nolint:mnd
return filePathPool512.Get().(*FilePath) //nolint:forcetypeassert
case size <= 1024: //nolint:mnd
return filePathPool1024.Get().(*FilePath) //nolint:forcetypeassert
case size <= 2048: //nolint:mnd
return filePathPool2048.Get().(*FilePath) //nolint:forcetypeassert
}

return filePathPool4096.Get().(*FilePath) //nolint:forcetypeassert
// FilePath is a path stored as a name plus a pointer to its parent FilePath.
type FilePath struct {
parent *FilePath
name string
depth uint16
}

// NewFilePath creates a new FilePath, setting the value to the given string.
func NewFilePath(path string) *FilePath {
c := newFilePathSize(len(path))
c.writeString(path)

return c
func NewFilePath(path string) FilePath {
return FilePath{name: path}
}

func (f *FilePath) writeString(str string) {
*f = append(*f, str...)
func (f *FilePath) appendTo(p []byte) []byte {
if f.parent != nil {
p = f.parent.appendTo(p)
}

return append(p, f.name...)
}

func (f *FilePath) writeBytes(p []byte) {
*f = append(*f, p...)
// Bytes returns the FilePath as a literal byte-slice.
func (f *FilePath) Bytes() []byte {
return f.appendTo(nil)
}

// Done deallocates the underlying byte-slice; any uses of the Bytes method are
// now invalid and may change.
func (f *FilePath) Done() { //nolint:gocyclo
*f = (*f)[:0]

switch cap(*f) {
case 64: //nolint:mnd
filePathPool64.Put(f)
case 128: //nolint:mnd
filePathPool128.Put(f)
case 256: //nolint:mnd
filePathPool256.Put(f)
case 512: //nolint:mnd
filePathPool512.Put(f)
case 1024: //nolint:mnd
filePathPool1024.Put(f)
case 2048: //nolint:mnd
filePathPool2048.Put(f)
case 4096: //nolint:mnd
filePathPool4096.Put(f)
func (f *FilePath) compare(g *FilePath) int {
if f.depth < g.depth {
return f.compareTo(g.getDepth(f.depth))
} else if f.depth > g.depth {
return f.getDepth(g.depth).compareTo(g)
}
}

func (f *FilePath) sub(d *godirwalk.Dirent) *FilePath {
name := d.Name()
size := len(*f) + len(name)
return f.compareTo(g)
}

if d.IsDir() {
size++
func (f *FilePath) getDepth(n uint16) *FilePath {
for f.depth != n {
f = f.parent
}

c := newFilePathSize(size)

c.writeBytes(*f)
c.writeString(name)
return f
}

if d.IsDir() {
c.writeString("/")
func (f *FilePath) compareTo(g *FilePath) int {
if f == g {
return 0
}

return c
}
cmp := f.parent.compareTo(g.parent)

// Bytes returns the FilePath as a literal byte-slice.
func (f *FilePath) Bytes() []byte {
return *f
}
if cmp == 0 {
return strings.Compare(f.name, g.name)
}

func (f *FilePath) string() string {
return unsafe.String(&(*f)[0], len(*f))
return cmp
}

// Dirent represents a file system directory entry (a file or a directory),
// providing information about the entry's path, type and inode.
type Dirent struct {
// Path is the complete path to the directory entry (including both
// directory and basename)
Path *FilePath
Path FilePath

// Type is the type bits of the file mode of this entry.
Type os.FileMode
Type fs.FileMode

// Inode is the file system inode number for this entry.
Inode uint64
}

// newDirentForDirectoryPath returns a Dirent for the given directory, with
// a Type for directories and no Inode.
func newDirentForDirectoryPath(dir string) Dirent {
return Dirent{Path: NewFilePath(dir), Type: fs.ModeDir}
}

// IsDir returns true if we are a directory.
func (d *Dirent) IsDir() bool {
return d.Type.IsDir()
Expand Down
7 changes: 0 additions & 7 deletions walk/dirent_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,11 +71,4 @@ func TestDirent(t *testing.T) {
So(d.IsRegular(), ShouldBeFalse)
So(d.IsSymlink(), ShouldBeTrue)
})

Convey("You can make a fake Direct for directories", t, func() {
d := newDirentForDirectoryPath("/a/dir")
So(d.IsDir(), ShouldBeTrue)
So(d.IsRegular(), ShouldBeFalse)
So(d.IsSymlink(), ShouldBeFalse)
})
}
15 changes: 11 additions & 4 deletions walk/file.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,13 @@ import (
"path/filepath"
"strconv"
"sync"
"unsafe"
)

const userOnlyPerm = 0700

const maxPathLength = 4096

// non-ascii bytes could become \xXX (4x the length at worst), the two
// speech-marks are +2 and a newline is +1.
const maxQuotedPathLength = 4096*4 + 2 + 1
Expand Down Expand Up @@ -162,12 +165,16 @@ func NewFiles(outDir string, n int) (*Files, error) {
//
// It will terminate the walk if writes to our output files fail.
func (f *Files) WritePaths() PathCallback {
var quoted [maxQuotedPathLength]byte
var (
quoted [maxQuotedPathLength]byte
tmpPath [maxPathLength]byte
)

return func(entry *Dirent) error {
defer entry.Path.Done()

return f.writePath(append(strconv.AppendQuote(quoted[:0], entry.Path.string()), '\n'))
return f.writePath(append(
strconv.AppendQuote(
quoted[:0], unsafe.String(&tmpPath[0], len(entry.Path.appendTo(tmpPath[:0]))),
), '\n'))
}
}

Expand Down
Loading

0 comments on commit 5393d83

Please sign in to comment.