diff --git a/internal/storage/serde.go b/internal/storage/serde.go
index 5fe8ead242907..ae46ca8bfbb6b 100644
--- a/internal/storage/serde.go
+++ b/internal/storage/serde.go
@@ -620,6 +620,12 @@ func calculateArraySize(a arrow.Array) int {
 	offset := a.Data().Offset()
 	length := a.Len()
 
+	// Account for the validity bitmap once, sized for the logical length, and
+	// only when the array actually exposes bitmap bytes.
+	if len(a.NullBitmapBytes()) > 0 {
+		totalSize += (length + 7) / 8
+	}
+
 	for i, buf := range a.Data().Buffers() {
 		if buf == nil {
 			continue
@@ -627,8 +633,7 @@ func calculateArraySize(a arrow.Array) int {
 
 		switch i {
 		case 0:
-			// Handle bitmap buffer
-			totalSize += (length + 7) / 8
+			// Validity bitmap buffer: already counted before the loop.
 		case 1:
 			switch a.DataType().ID() {
 			case arrow.STRING, arrow.BINARY:
@@ -639,13 +644,32 @@ func calculateArraySize(a arrow.Array) int {
 			case arrow.LIST:
 				// Handle nest types like list
 				for i := 0; i < length; i++ {
-					startOffset := int(binary.LittleEndian.Uint32(buf.Bytes()[i*4:]))
-					endOffset := int(binary.LittleEndian.Uint32(buf.Bytes()[(i+1)*4:]))
-					totalSize += endOffset - startOffset
+					// Offsets are 4-byte little-endian values; index them from the array's
+					// own offset so a sliced array does not read its parent's entries.
+					startOffset := int(binary.LittleEndian.Uint32(buf.Bytes()[(offset+i)*4:]))
+					endOffset := int(binary.LittleEndian.Uint32(buf.Bytes()[(offset+i+1)*4:]))
+					// Scale the element count by the element width when it is fixed. The
+					// checked assertion keeps lists of non-fixed-width elements from
+					// panicking; they fall back to the raw offset delta.
+					// NOTE(review): confirm that fallback is the intended estimate.
+					elementSize := 1
+					if fw, ok := a.DataType().(*arrow.ListType).Elem().(arrow.FixedWidthDataType); ok {
+						elementSize = fw.Bytes()
+					}
+					totalSize += (endOffset - startOffset) * elementSize
 				}
 			default:
 				// Handle fixed-length types
-				elementSize := buf.Len() / a.Data().Len()
+				// Use the width declared by the type. The checked assertion keeps
+				// non-fixed-width types from panicking; they fall back to the average
+				// derived from the buffer length.
+				// NOTE(review): confirm that fallback is the intended estimate.
+				elementSize := 0
+				if fw, ok := a.DataType().(arrow.FixedWidthDataType); ok {
+					elementSize = fw.Bytes()
+				} else if n := a.Data().Len(); n > 0 {
+					elementSize = buf.Len() / n
+				}
 				totalSize += elementSize * length
 			}
 		}
diff --git a/internal/storage/serde_test.go b/internal/storage/serde_test.go
index a6834bc0da741..f31ffe8b49aff 100644
--- a/internal/storage/serde_test.go
+++ b/internal/storage/serde_test.go
@@ -170,3 +170,102 @@ func BenchmarkBinlogIterator(b *testing.B) {
 		assert.False(b, itr.HasNext())
 	}
 }
+
+// TestCalculateArraySize checks calculateArraySize against hand-computed sizes
+// for empty, fixed-width, string and list arrays built without nulls.
+func TestCalculateArraySize(t *testing.T) {
+	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
+	defer mem.AssertSize(t, 0)
+
+	tests := []struct {
+		name         string
+		arrayBuilder func() arrow.Array
+		expectedSize int
+	}{
+		{
+			name: "Empty array",
+			arrayBuilder: func() arrow.Array {
+				b := array.NewInt32Builder(mem)
+				defer b.Release()
+				return b.NewArray()
+			},
+			expectedSize: 0,
+		},
+		{
+			name: "Fixed-length array",
+			arrayBuilder: func() arrow.Array {
+				b := array.NewInt32Builder(mem)
+				defer b.Release()
+				b.AppendValues([]int32{1, 2, 3, 4}, nil)
+				return b.NewArray()
+			},
+			expectedSize: 17, // 4 elements * 4 bytes + 1 bitmap byte
+		},
+		{
+			name: "Variable-length string array",
+			arrayBuilder: func() arrow.Array {
+				b := array.NewStringBuilder(mem)
+				defer b.Release()
+				b.AppendValues([]string{"hello", "world"}, nil)
+				return b.NewArray()
+			},
+			expectedSize: 11, // "hello" (5 bytes) + "world" (5 bytes) + 1 bitmap byte
+		},
+		{
+			name: "Nested list array",
+			arrayBuilder: func() arrow.Array {
+				b := array.NewListBuilder(mem, arrow.PrimitiveTypes.Int32)
+				defer b.Release()
+				valueBuilder := b.ValueBuilder().(*array.Int32Builder)
+
+				b.Append(true)
+				valueBuilder.AppendValues([]int32{1, 2, 3}, nil)
+
+				b.Append(true)
+				valueBuilder.AppendValues([]int32{4, 5}, nil)
+
+				b.Append(true)
+				valueBuilder.AppendValues([]int32{}, nil)
+
+				return b.NewArray()
+			},
+			expectedSize: 21, // (3 + 2) elements * 4 bytes + 1 bitmap byte
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			arr := tt.arrayBuilder()
+			defer arr.Release()
+
+			size := calculateArraySize(arr)
+			if size != tt.expectedSize {
+				t.Errorf("Expected size %d, got %d", tt.expectedSize, size)
+			}
+		})
+	}
+}
+
+// TestCalculateArraySizeWithOffset checks that a sliced string array is sized
+// from its own offset and length rather than from the parent's buffers.
+func TestCalculateArraySizeWithOffset(t *testing.T) {
+	mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
+	defer mem.AssertSize(t, 0)
+
+	b := array.NewStringBuilder(mem)
+	defer b.Release()
+
+	b.AppendValues([]string{"zero", "one", "two", "three", "four"}, nil)
+	fullArray := b.NewArray()
+	defer fullArray.Release()
+
+	slicedArray := array.NewSlice(fullArray, 1, 4) // elements [1, 4): "one", "two", "three"
+	defer slicedArray.Release()
+
+	size := calculateArraySize(slicedArray)
+	expectedSize := len("one") + len("two") + len("three") + 1 // three strings + 1 bitmap byte
+
+	if size != expectedSize {
+		t.Errorf("Expected size %d, got %d", expectedSize, size)
+	}
+}