diff --git a/data/builder/file.go b/data/builder/file.go
index a522784..323b147 100644
--- a/data/builder/file.go
+++ b/data/builder/file.go
@@ -13,6 +13,7 @@ import (
 	basicnode "github.com/ipld/go-ipld-prime/node/basic"
 	"github.com/multiformats/go-multicodec"
 	multihash "github.com/multiformats/go-multihash/core"
+	"github.com/multiformats/go-varint"
 
 	// raw needed for opening as bytes
 	_ "github.com/ipld/go-ipld-prime/codec/raw"
@@ -57,6 +58,102 @@ func BuildUnixFSFile(r io.Reader, chunker string, ls *ipld.LinkSystem) (ipld.Lin
 	}
 }
 
+// EstimateUnixFSFileDefaultChunking estimates the byte size of the car file
+// that would be needed to hold a UnixFS file containing data of the given
+// length, assuming chunk.DefaultBlockSize leaves and at most
+// DefaultLinksPerBlock links per intermediate node.
+//
+// Errors from the in-memory builders are discarded because the signature has
+// no error return; ls.MustStore still panics if storing a node fails.
+func EstimateUnixFSFileDefaultChunking(dataLength uint64) uint64 {
+	// carHeaderSize is the encoded size of a CARv1 header holding a single
+	// CIDv1 sha2-256 root, including its uvarint length prefix.
+	const carHeaderSize = 59
+
+	blkSize := uint64(chunk.DefaultBlockSize)
+	blocks := dataLength / blkSize
+	remainder := dataLength % blkSize
+
+	size := dataLength
+	// Only the encoded length of this CID is used, so any input bytes will do.
+	cidExample, _ := leafLinkProto.Prefix.Sum([]byte{0})
+	cidLength := uint64(len(cidExample.Bytes()))
+
+	links := []uint64{}
+	for i := uint64(0); i < blocks; i++ {
+		links = append(links, blkSize)
+	}
+	// account for the uvarint + cid length of each block of raw data.
+	size += uint64(len(links)) * (cidLength + uint64(varint.UvarintSize(cidLength+blkSize)))
+	if remainder > 0 {
+		links = append(links, remainder)
+		size += cidLength + uint64(varint.UvarintSize(cidLength+remainder))
+	}
+
+	// account for the metadata overhead nodes: each intermediate node is
+	// built and stored in memory so that its encoded size can be measured.
+	ls := cidlink.DefaultLinkSystem()
+	storage := cidlink.Memory{}
+	ls.StorageReadOpener = storage.OpenRead
+	ls.StorageWriteOpener = storage.OpenWrite
+
+	// Group the current layer into parent nodes until one root remains.
+	for len(links) > 1 {
+		nxtLnks := []uint64{}
+		for len(links) > 1 {
+			children := uint64(DefaultLinksPerBlock)
+			if len(links) < DefaultLinksPerBlock {
+				children = uint64(len(links))
+			}
+			childrenLinks := links[:children]
+			links = links[children:]
+			totalSize := uint64(0)
+			for _, l := range childrenLinks {
+				totalSize += l
+			}
+
+			node, _ := BuildUnixFS(func(b *Builder) {
+				FileSize(b, totalSize)
+				BlockSizes(b, childrenLinks)
+			})
+
+			// Pack into the dagpb node.
+			dpbb := dagpb.Type.PBNode.NewBuilder()
+			pbm, _ := dpbb.BeginMap(2)
+			pblb, _ := pbm.AssembleEntry("Links")
+			pbl, _ := pblb.BeginList(int64(len(childrenLinks)))
+			for _, c := range childrenLinks {
+				// cidExample stands in for the child's real CID.
+				pbln, _ := BuildUnixFSDirectoryEntry("", int64(c), cidlink.Link{Cid: cidExample})
+				pbl.AssembleValue().AssignNode(pbln)
+			}
+			pbl.Finish()
+			pbm.AssembleKey().AssignString("Data")
+			pbm.AssembleValue().AssignBytes(data.EncodeUnixFSData(node))
+			pbm.Finish()
+			pbn := dpbb.Build()
+			pbLnk := ls.MustStore(ipld.LinkContext{}, fileLinkProto, pbn)
+			pbRcrd, _ := ls.LoadRaw(ipld.LinkContext{}, pbLnk)
+
+			// dagpb overhead
+			intermediateNodeSize := uint64(len(pbRcrd))
+
+			size += intermediateNodeSize + cidLength + uint64(varint.UvarintSize(cidLength+intermediateNodeSize))
+			nxtLnks = append(nxtLnks, totalSize)
+		}
+		// A single leftover link moves up to the next layer unwrapped.
+		if len(links) == 1 {
+			nxtLnks = append(nxtLnks, links[0])
+		}
+		links = nxtLnks
+	}
+
+	// add the car header
+	size += carHeaderSize
+
+	return size
+}
+
 var fileLinkProto = cidlink.LinkPrototype{
 	Prefix: cid.Prefix{
 		Version:  1,
diff --git a/data/builder/file_test.go b/data/builder/file_test.go
index de3803e..db4208c 100644
--- a/data/builder/file_test.go
+++ b/data/builder/file_test.go
@@ -1,16 +1,26 @@
-package builder
+package builder_test
 
 import (
 	"bytes"
 	"context"
+	"io"
+	"math/rand"
 	"testing"
 
+	"github.com/ipfs/go-unixfsnode/data/builder"
+	"github.com/multiformats/go-multicodec"
+	multihash "github.com/multiformats/go-multihash/core"
+
 	"github.com/ipfs/go-cid"
 	u "github.com/ipfs/go-ipfs-util"
 	"github.com/ipfs/go-unixfsnode/file"
+	carv1 "github.com/ipld/go-car"
+	"github.com/ipld/go-car/v2"
 	dagpb "github.com/ipld/go-codec-dagpb"
 	"github.com/ipld/go-ipld-prime"
+	"github.com/ipld/go-ipld-prime/linking"
 	cidlink "github.com/ipld/go-ipld-prime/linking/cid"
+	selectorparse "github.com/ipld/go-ipld-prime/traversal/selector/parse"
 )
 
 func TestBuildUnixFSFile(t *testing.T) {
@@ -23,7 +33,7 @@ func TestBuildUnixFSFile(t *testing.T) {
 	ls.StorageReadOpener = storage.OpenRead
 	ls.StorageWriteOpener = storage.OpenWrite
 
-	f, _, err := BuildUnixFSFile(r, "", &ls)
+	f, _, err := builder.BuildUnixFSFile(r, "", &ls)
 	if err != nil {
 		t.Fatal(err)
 	}
@@ -43,6 +53,74 @@ func TestBuildUnixFSFile(t *testing.T) {
 	}
 }
 
+// TestEstimateUnixFSFileDefaultChunking builds files of increasing size,
+// writes each DAG out as a CARv1 and checks the estimate matches exactly.
+func TestEstimateUnixFSFileDefaultChunking(t *testing.T) {
+	for i := 100; i < 1000000000; i *= 10 {
+		b := make([]byte, i)
+		rand.Read(b)
+
+		ls := cidlink.DefaultLinkSystem()
+		storage := cidlink.Memory{}
+		ls.StorageReadOpener = storage.OpenRead
+		nPB := 0
+
+		// Count the dag-pb blocks written so a mismatch can report them.
+		ls.StorageWriteOpener = func(lc linking.LinkContext) (io.Writer, linking.BlockWriteCommitter, error) {
+			w, bwc, err := storage.OpenWrite(lc)
+			return w, func(lnk ipld.Link) error {
+				if lnk.(cidlink.Link).Cid.Prefix().Codec == uint64(multicodec.DagPb) {
+					nPB++
+				}
+				return bwc(lnk)
+			}, err
+		}
+		rt, _, err := builder.BuildUnixFSFile(bytes.NewReader(b), "", &ls)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		ob := bytes.NewBuffer(nil)
+		_, err = car.TraverseV1(context.Background(), &ls, rt.(cidlink.Link).Cid, selectorparse.CommonSelector_ExploreAllRecursively, ob)
+		if err != nil {
+			t.Fatal(err)
+		}
+		fileLen := len(ob.Bytes())
+
+		estimate := builder.EstimateUnixFSFileDefaultChunking(uint64(i))
+		if estimate != uint64(fileLen) {
+			t.Logf("%d intermediate nodes.", nPB)
+			t.Fatalf("estimate for file length %d was %d. should be %d", i, estimate, fileLen)
+		}
+	}
+}
+
+// TestS checks that a CARv1 header with a single CIDv1 sha2-256 root is 59
+// bytes, the constant EstimateUnixFSFileDefaultChunking adds for the header.
+func TestS(t *testing.T) {
+	p := cid.Prefix{
+		Version:  1,
+		Codec:    uint64(multicodec.DagPb),
+		MhType:   multihash.SHA2_256,
+		MhLength: 32,
+	}
+	rt, err := p.Sum([]byte{0})
+	if err != nil {
+		t.Fatal(err)
+	}
+	ch := carv1.CarHeader{
+		Roots:   []cid.Cid{rt},
+		Version: 1,
+	}
+	s, err := carv1.HeaderSize(&ch)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if s != 59 {
+		t.Fatalf("car header size was %d. should be 59", s)
+	}
+}
+
 func TestUnixFSFileRoundtrip(t *testing.T) {
 	buf := make([]byte, 10*1024*1024)
 	u.NewSeededRand(0xdeadbeef).Read(buf)
@@ -53,7 +131,7 @@ func TestUnixFSFileRoundtrip(t *testing.T) {
 	ls.StorageReadOpener = storage.OpenRead
 	ls.StorageWriteOpener = storage.OpenWrite
 
-	f, _, err := BuildUnixFSFile(r, "", &ls)
+	f, _, err := builder.BuildUnixFSFile(r, "", &ls)
 	if err != nil {
 		t.Fatal(err)
 	}
diff --git a/go.mod b/go.mod
index c31842f..44f92a1 100644
--- a/go.mod
+++ b/go.mod
@@ -10,11 +10,13 @@ require (
 	github.com/ipfs/go-ipld-format v0.4.0
 	github.com/ipfs/go-merkledag v0.10.0
 	github.com/ipfs/go-unixfs v0.4.4
+	github.com/ipld/go-car v0.5.0
 	github.com/ipld/go-car/v2 v2.8.0
 	github.com/ipld/go-codec-dagpb v1.6.0
 	github.com/ipld/go-ipld-prime v0.20.0
 	github.com/multiformats/go-multicodec v0.8.1
 	github.com/multiformats/go-multihash v0.2.1
+	github.com/multiformats/go-varint v0.0.7
 	github.com/spaolacci/murmur3 v1.1.0
 	github.com/stretchr/testify v1.8.2
 	google.golang.org/protobuf v1.28.1
@@ -51,7 +53,6 @@ require (
 	github.com/multiformats/go-base32 v0.1.0 // indirect
 	github.com/multiformats/go-base36 v0.2.0 // indirect
 	github.com/multiformats/go-multibase v0.1.1 // indirect
-	github.com/multiformats/go-varint v0.0.7 // indirect
 	github.com/opentracing/opentracing-go v1.2.0 // indirect
 	github.com/petar/GoLLRB v0.0.0-20210522233825-ae3b015fd3e9 // indirect
 	github.com/pmezard/go-difflib v1.0.0 // indirect
diff --git a/go.sum b/go.sum
index e193de6..8390b46 100644
--- a/go.sum
+++ b/go.sum
@@ -108,6 +108,8 @@ github.com/ipfs/go-unixfs v0.4.4 h1:D/dLBOJgny5ZLIur2vIXVQVW0EyDHdOMBDEhgHrt6rY=
 github.com/ipfs/go-unixfs v0.4.4/go.mod h1:TSG7G1UuT+l4pNj91raXAPkX0BhJi3jST1FDTfQ5QyM=
 github.com/ipfs/go-verifcid v0.0.2 h1:XPnUv0XmdH+ZIhLGKg6U2vaPaRDXb9urMyNVCE7uvTs=
 github.com/ipfs/go-verifcid v0.0.2/go.mod h1:40cD9x1y4OWnFXbLNJYRe7MpNvWlMn3LZAG5Wb4xnPU=
+github.com/ipld/go-car v0.5.0 h1:kcCEa3CvYMs0iE5BzD5sV7O2EwMiCIp3uF8tA6APQT8=
+github.com/ipld/go-car v0.5.0/go.mod h1:ppiN5GWpjOZU9PgpAZ9HbZd9ZgSpwPMr48fGRJOWmvE=
 github.com/ipld/go-car/v2 v2.8.0 h1:8tUI+VM1mAQ2Qa7ScK++lfyuZYcGQ70bZ6NpGOcJj5o=
 github.com/ipld/go-car/v2 v2.8.0/go.mod h1:a+BnAxUqgr7wcWxW/lI6ctyEQ2v9gjBChPytwFMp2f4=
 github.com/ipld/go-codec-dagpb v1.6.0 h1:9nYazfyu9B1p3NAgfVdpRco3Fs2nFC72DqVsMj6rOcc=