From 51147293a5443335a068d6703bbe93b71d15a84c Mon Sep 17 00:00:00 2001 From: janiodev Date: Sun, 7 Jan 2024 12:53:13 -0300 Subject: [PATCH] GH-39552: [Go] inclusion of option to use replacer when creating CSV strings with Go library --- go/arrow/csv/common.go | 14 ++++++++++++++ go/arrow/csv/transformer.go | 12 ++++++------ go/arrow/csv/writer.go | 24 +++++++++++++----------- go/arrow/csv/writer_test.go | 6 ++++-- 4 files changed, 37 insertions(+), 19 deletions(-) diff --git a/go/arrow/csv/common.go b/go/arrow/csv/common.go index 99dac29f4d728..31ca61f323d36 100644 --- a/go/arrow/csv/common.go +++ b/go/arrow/csv/common.go @@ -21,6 +21,7 @@ package csv import ( "errors" "fmt" + "strings" "github.com/apache/arrow/go/v15/arrow" "github.com/apache/arrow/go/v15/arrow/memory" @@ -223,6 +224,19 @@ func WithIncludeColumns(cols []string) Option { } } +// WithStringsReplacer receives a replacer to be applied to the string fields +// of the CSV. This is useful to remove unwanted characters from the string. 
+func WithStringsReplacer(replacer *strings.Replacer) Option { + return func(cfg config) { + switch cfg := cfg.(type) { + case *Writer: + cfg.stringReplacer = replacer.Replace + default: + panic(fmt.Errorf("arrow/csv: unknown config type %T", cfg)) + } + } +} + func validate(schema *arrow.Schema) { for i, f := range schema.Fields() { switch ft := f.Type.(type) { diff --git a/go/arrow/csv/transformer.go b/go/arrow/csv/transformer.go index 0f0181520b847..78b16446d4def 100644 --- a/go/arrow/csv/transformer.go +++ b/go/arrow/csv/transformer.go @@ -29,7 +29,7 @@ import ( "github.com/apache/arrow/go/v15/arrow/array" ) -func (w *Writer) transformColToStringArr(typ arrow.DataType, col arrow.Array) []string { +func (w *Writer) transformColToStringArr(typ arrow.DataType, col arrow.Array, stringsReplacer func(string)string) []string { res := make([]string, col.Len()) switch typ.(type) { case *arrow.BooleanType: @@ -144,7 +144,7 @@ func (w *Writer) transformColToStringArr(typ arrow.DataType, col arrow.Array) [] arr := col.(*array.String) for i := 0; i < arr.Len(); i++ { if arr.IsValid(i) { - res[i] = arr.Value(i) + res[i] = stringsReplacer(arr.Value(i)) } else { res[i] = w.nullValue } @@ -153,7 +153,7 @@ func (w *Writer) transformColToStringArr(typ arrow.DataType, col arrow.Array) [] arr := col.(*array.LargeString) for i := 0; i < arr.Len(); i++ { if arr.IsValid(i) { - res[i] = arr.Value(i) + res[i] = stringsReplacer(arr.Value(i)) } else { res[i] = w.nullValue } @@ -224,7 +224,7 @@ func (w *Writer) transformColToStringArr(typ arrow.DataType, col arrow.Array) [] var b bytes.Buffer b.Write([]byte{'{'}) writer := csv.NewWriter(&b) - writer.Write(w.transformColToStringArr(list.DataType(), list)) + writer.Write(w.transformColToStringArr(list.DataType(), list, stringsReplacer)) writer.Flush() b.Truncate(b.Len() - 1) b.Write([]byte{'}'}) @@ -243,7 +243,7 @@ func (w *Writer) transformColToStringArr(typ arrow.DataType, col arrow.Array) [] var b bytes.Buffer b.Write([]byte{'{'}) writer 
:= csv.NewWriter(&b) - writer.Write(w.transformColToStringArr(list.DataType(), list)) + writer.Write(w.transformColToStringArr(list.DataType(), list, stringsReplacer)) writer.Flush() b.Truncate(b.Len() - 1) b.Write([]byte{'}'}) @@ -262,7 +262,7 @@ func (w *Writer) transformColToStringArr(typ arrow.DataType, col arrow.Array) [] var b bytes.Buffer b.Write([]byte{'{'}) writer := csv.NewWriter(&b) - writer.Write(w.transformColToStringArr(list.DataType(), list)) + writer.Write(w.transformColToStringArr(list.DataType(), list, stringsReplacer)) writer.Flush() b.Truncate(b.Len() - 1) b.Write([]byte{'}'}) diff --git a/go/arrow/csv/writer.go b/go/arrow/csv/writer.go index a672008b58a07..b939b72984b0f 100644 --- a/go/arrow/csv/writer.go +++ b/go/arrow/csv/writer.go @@ -27,12 +27,13 @@ import ( // Writer wraps encoding/csv.Writer and writes arrow.Record based on a schema. type Writer struct { - boolFormatter func(bool) string - header bool - nullValue string - once sync.Once - schema *arrow.Schema - w *csv.Writer + boolFormatter func(bool) string + header bool + nullValue string + stringReplacer func(string) string + once sync.Once + schema *arrow.Schema + w *csv.Writer } // NewWriter returns a writer that writes arrow.Records to the CSV file @@ -45,10 +46,11 @@ func NewWriter(w io.Writer, schema *arrow.Schema, opts ...Option) *Writer { validate(schema) ww := &Writer{ - boolFormatter: strconv.FormatBool, // override by passing WithBoolWriter() as an option - nullValue: "NULL", // override by passing WithNullWriter() as an option - schema: schema, - w: csv.NewWriter(w), + boolFormatter: strconv.FormatBool, // override by passing WithBoolWriter() as an option + nullValue: "NULL", // override by passing WithNullWriter() as an option + stringReplacer: func(x string) string { return x }, // override by passing WithStringsReplacer() as an option + schema: schema, + w: csv.NewWriter(w), } for _, opt := range opts { opt(ww) @@ -81,7 +83,7 @@ func (w *Writer) Write(record arrow.Record) 
error { } for j, col := range record.Columns() { - rows := w.transformColToStringArr(w.schema.Field(j).Type, col) + rows := w.transformColToStringArr(w.schema.Field(j).Type, col, w.stringReplacer) for i, row := range rows { recs[i][j] = row } diff --git a/go/arrow/csv/writer_test.go b/go/arrow/csv/writer_test.go index 644cae0933f4c..b1bd3251c5622 100644 --- a/go/arrow/csv/writer_test.go +++ b/go/arrow/csv/writer_test.go @@ -23,6 +23,7 @@ import ( "fmt" "io" "log" + "strings" "testing" "github.com/apache/arrow/go/v15/arrow" @@ -250,8 +251,8 @@ func testCSVWriter(t *testing.T, data [][]string, writeHeader bool, fmtr func(bo b.Field(9).(*array.Float16Builder).AppendValues([]float16.Num{float16.New(0.0), float16.New(0.1), float16.New(0.2)}, nil) b.Field(10).(*array.Float32Builder).AppendValues([]float32{0.0, 0.1, 0.2}, nil) b.Field(11).(*array.Float64Builder).AppendValues([]float64{0.0, 0.1, 0.2}, nil) - b.Field(12).(*array.StringBuilder).AppendValues([]string{"str-0", "str-1", "str-2"}, nil) - b.Field(13).(*array.LargeStringBuilder).AppendValues([]string{"str-0", "str-1", "str-2"}, nil) + b.Field(12).(*array.StringBuilder).AppendValues([]string{"str_0", "str-1", "str-2"}, nil) + b.Field(13).(*array.LargeStringBuilder).AppendValues([]string{"str_0", "str-1", "str-2"}, nil) b.Field(14).(*array.TimestampBuilder).AppendValues(genTimestamps(arrow.Second), nil) b.Field(15).(*array.Date32Builder).AppendValues([]arrow.Date32{17304, 19304, 20304}, nil) b.Field(16).(*array.Date64Builder).AppendValues([]arrow.Date64{1840400000000, 1940400000000, 2040400000000}, nil) @@ -300,6 +301,7 @@ func testCSVWriter(t *testing.T, data [][]string, writeHeader bool, fmtr func(bo csv.WithHeader(writeHeader), csv.WithNullWriter(nullVal), csv.WithBoolWriter(fmtr), + csv.WithStringsReplacer(strings.NewReplacer("_", "-")), ) err := w.Write(rec) if err != nil {