Skip to content

Commit

Permalink
Streaming and Excel BOM handling (#19)
Browse files Browse the repository at this point in the history
* Support for streaming via Reader and Appendable
* Handle Microsoft Excel's insistence on using a byte order marker
* Cleaning up new unit tests for FetchSourceTest
* Removed commented debugging println's for FetchSource
* Cleanup formatting

---------

Co-authored-by: Sven Obser <[email protected]>
  • Loading branch information
UnknownJoe796 and Sven Obser authored Dec 13, 2024
1 parent e92334e commit bc2e62e
Show file tree
Hide file tree
Showing 6 changed files with 317 additions and 15 deletions.
71 changes: 60 additions & 11 deletions library/src/main/kotlin/kotlinx/serialization/csv/Csv.kt
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,14 @@ import kotlinx.serialization.StringFormat
import kotlinx.serialization.csv.config.CsvBuilder
import kotlinx.serialization.csv.config.CsvConfig
import kotlinx.serialization.csv.decode.CsvReader
import kotlinx.serialization.csv.decode.FetchSource
import kotlinx.serialization.csv.decode.RootCsvDecoder
import kotlinx.serialization.csv.decode.Source
import kotlinx.serialization.csv.decode.StringSource
import kotlinx.serialization.csv.encode.RootCsvEncoder
import kotlinx.serialization.modules.SerializersModule
import java.io.Reader
import java.io.StringWriter

/**
* The main entry point to work with CSV serialization.
Expand All @@ -25,7 +29,7 @@ import kotlinx.serialization.modules.SerializersModule
* The constructed instance can then be used either as a regular [SerialFormat] or as a [StringFormat].
*/
@ExperimentalSerializationApi
sealed class Csv(val config: CsvConfig) : SerialFormat, StringFormat {
sealed class Csv(val config: CsvConfig) : StringFormat {

override val serializersModule: SerializersModule
get() = config.serializersModule
Expand All @@ -36,10 +40,20 @@ sealed class Csv(val config: CsvConfig) : SerialFormat, StringFormat {
* @param serializer The serializer used to serialize the given object.
* @param value The [Serializable] object.
*/
override fun <T> encodeToString(serializer: SerializationStrategy<T>, value: T): String {
val result = StringBuilder()
RootCsvEncoder(this, result).encodeSerializableValue(serializer, value)
return result.toString()
override fun <T> encodeToString(serializer: SerializationStrategy<T>, value: T): String {
    // Collect the encoded record(s) in an in-memory writer, then return them as one string.
    val buffer = StringWriter()
    encodeTo(serializer, value, buffer)
    return buffer.toString()
}

/**
 * Serialize [value] into CSV record(s) and append them to [output].
 *
 * @param serializer The serializer used to serialize the given object.
 * @param value The [Serializable] object.
 * @param output The output where the CSV will be written.
 */
fun <T> encodeTo(serializer: SerializationStrategy<T>, value: T, output: Appendable) {
    // Drive a root encoder that writes straight into the caller-supplied output.
    RootCsvEncoder(csv = this, output = output).encodeSerializableValue(serializer, value)
}

/**
Expand All @@ -48,13 +62,48 @@ sealed class Csv(val config: CsvConfig) : SerialFormat, StringFormat {
* @param deserializer The deserializer used to parse the given CSV string.
* @param string The CSV string to parse.
*/
override fun <T> decodeFromString(deserializer: DeserializationStrategy<T>, string: String): T {
val reader = CsvReader(StringSource(string), config)
val input = RootCsvDecoder(this, reader)
val result = input.decodeSerializableValue(deserializer)
override fun <T> decodeFromString(deserializer: DeserializationStrategy<T>, string: String): T {
    // Wrap the string in a source and run it through the shared decode path.
    val source = StringSource(string)
    return source.decode(deserializer)
}

/**
 * Parse CSV read from the given [input] into a [Serializable] object.
 *
 * @param deserializer The deserializer used to parse the CSV input.
 * @param input The reader supplying the CSV characters to parse.
 */
fun <T> decodeFrom(deserializer: DeserializationStrategy<T>, input: Reader): T {
    // Wrap the reader in a character-fetching source and run it through the shared decode path.
    val source = FetchSource(input)
    return source.decode(deserializer)
}

/**
 * Serialize [value] into CSV record(s), appending them to the receiving [Appendable].
 *
 * @param serializer The serializer used to serialize the given object.
 * @param value The [Serializable] object.
 */
private fun <T> Appendable.encode(serializer: SerializationStrategy<T>, value: T) {
    // `this` is the Appendable receiver; the enclosing Csv instance is passed explicitly.
    val encoder = RootCsvEncoder(csv = this@Csv, output = this)
    encoder.encodeSerializableValue(serializer, value)
}

/**
* Parse CSV from [this] input into [Serializable] object.
*
* @param deserializer The deserializer used to parse the given CSV string.
*/
private fun <T> Source.decode(deserializer: DeserializationStrategy<T>): T {
val reader = CsvReader(
source = this,
config = config
)

require(reader.isDone) { "Reader has not consumed the whole input: $reader" }
return result
return RootCsvDecoder(
csv = this@Csv,
reader = reader,
).decodeSerializableValue(deserializer).also {
require(reader.isDone) { "Reader has not consumed the whole input: $reader" }
}
}

internal class Impl(config: CsvConfig) : Csv(config)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,11 @@ internal class CsvReader(private val source: Source, private val config: CsvConf

private var marks = arrayListOf<Int>()

init {
// Skip the byte order marker (U+FEFF) that Microsoft Excel puts at the start of its CSV
// output, should it appear.
// NOTE(review): presumably read(String) consumes the text only when it matches and leaves
// the source untouched otherwise — confirm against its implementation.
read("\uFEFF")
}

/**
* Read value in the next column.
*/
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
package kotlinx.serialization.csv.decode

import java.io.Reader

/**
 * A [Source] that pulls its characters one at a time from [getChar].
 *
 * One character is always fetched ahead of the read position, so constructing an instance
 * already invokes [getChar] once. Characters consumed while at least one mark is active are
 * kept in an in-memory queue so that [reset] can rewind and replay them. Once no mark is
 * active, the queue is dropped the next time a fresh character is consumed.
 *
 * @param getChar Supplies the next character of the input, or `null` once the input is exhausted.
 */
internal class FetchSource(
    private val getChar: () -> Char?,
) : Source {

    /**
     * Creates a source that reads from [reader] one character at a time.
     * The reader is not closed by this class.
     */
    constructor(
        reader: Reader,
    ) : this(
        getChar = {
            // Reader.read() signals end of stream with -1.
            val code = reader.read()
            if (code == -1) null else code.toChar()
        },
    )

    /** Number of characters taken from [getChar] so far, not counting the look-ahead in [next]. */
    private var nextPosition = 0

    /** Position of the character the next [read] will return. */
    override var offset: Int = 0
        private set

    /** Look-ahead character already fetched from [getChar]; `null` once the input is exhausted. */
    private var next: Char? = getChar()

    /** Characters retained for replay; the first element sits at position [queueOffset]. */
    private val queue = ArrayList<Char>(2048)

    /** Stack of marked positions; the most recent mark is last. */
    private val marks = ArrayList<Int>()

    /** Position in the input of the first character held in [queue]. */
    private var queueOffset = 0

    /**
     * Hands out the look-ahead character and fetches its successor.
     *
     * @throws IllegalStateException if the input is already exhausted.
     */
    private fun nextChar(): Char {
        val current = checkNotNull(next) { "Out of characters" }
        next = getChar()
        nextPosition++
        return current
    }

    /**
     * Returns `true` until the end of the input has itself been read.
     * Note that this is still `true` when only the end-of-input marker remains.
     */
    override fun canRead(): Boolean = offset <= nextPosition

    /**
     * Returns the character at [offset] and advances past it.
     *
     * Returns `null` at the end of the input. The first such call still advances [offset],
     * which makes [canRead] report `false` afterwards.
     */
    override fun read(): Char? = when {
        // The end of the input has already been read.
        offset > nextPosition -> null

        // Replaying characters that were buffered before a reset.
        offset < nextPosition -> {
            val replayed = queue[offset - queueOffset]
            offset++
            replayed
        }

        // At the live position with nothing left: step past the end exactly once.
        next == null -> {
            offset++
            if (marks.isEmpty()) queue.clear()
            null
        }

        // At the live position: consume a fresh character.
        else -> {
            val fresh = nextChar()
            if (marks.isEmpty()) {
                // Nobody can rewind, so nothing needs to be retained.
                queue.clear()
            } else {
                // A reset may come back here: remember the character and, for the first
                // buffered one, the position it belongs to.
                if (queue.isEmpty()) {
                    queueOffset = offset
                }
                queue.add(fresh)
            }
            offset++
            fresh
        }
    }

    /** Returns the character at [offset] without advancing, or `null` at the end of the input. */
    override fun peek(): Char? = when {
        offset > nextPosition -> null
        offset == nextPosition -> next
        else -> queue[offset - queueOffset]
    }

    /** Remembers the current [offset] so that a later [reset] can return to it. */
    override fun mark() {
        marks.add(offset)
    }

    /** Discards the most recent mark without moving. */
    override fun unmark() {
        marks.removeAt(marks.lastIndex)
    }

    /** Moves back to the most recent mark and discards that mark. */
    override fun reset() {
        offset = marks.removeAt(marks.lastIndex)
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,21 @@ class CsvReaderTest {
assertEquals("b", reader.readColumn())
}

/** A byte order marker at the start of the input must not end up in the first column. */
@Test
fun testByteOrderMarkSkip() {
    val zwnbsp = "\uFEFF"
    val csv = """
        |${zwnbsp}1,a
        |2,b
    """.trimMargin()
    val reader = CsvReader(StringSource(csv), CsvConfig.Default)

    // Columns are expected in reading order, with the marker stripped from the first one.
    for (expectedColumn in listOf("1", "a", "2", "b")) {
        assertEquals(expectedColumn, reader.readColumn())
    }
}

@Test
fun testRecordNo() {
val csv = """
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
package kotlinx.serialization.csv.decode

import java.io.StringReader
import kotlin.test.Test
import kotlin.test.assertEquals
import kotlin.test.assertFalse
import kotlin.test.assertNull
import kotlin.test.assertTrue

/**
 * Exercises [FetchSource] through reading, peeking and the mark / unmark / reset operations.
 */
class FetchSourceTest {

    /** A fresh source over empty input still reports that it can be read. */
    @Test
    fun testCanRead() {
        assertTrue(FetchSource("").canRead())
    }

    /** Reading the end of an empty input makes the source unreadable. */
    @Test
    fun testNotCanRead() {
        val source = FetchSource("").also { it.read() }
        assertFalse(source.canRead())
    }

    /** Characters come back in order, followed by `null` at the end. */
    @Test
    fun testRead() {
        val source = FetchSource("abc")
        for (expected in "abc") {
            assertEquals(expected, source.read())
        }
        assertNull(source.read())
    }

    /** Reading from empty input yields `null` straight away. */
    @Test
    fun testReadEof() {
        assertNull(FetchSource("").read())
    }

    /** Peeking shows the first character. */
    @Test
    fun testPeek() {
        assertEquals('a', FetchSource("abc").peek())
    }

    /** Peeking repeatedly does not advance the source. */
    @Test
    fun testPeekMultipleTimes() {
        val source = FetchSource("abc")
        repeat(3) {
            assertEquals('a', source.peek())
        }
    }

    /** Peeking at empty input yields `null`. */
    @Test
    fun testPeekEof() {
        assertNull(FetchSource("").peek())
    }

    /** Dropping a mark leaves the read position where it was. */
    @Test
    fun testMarkUnmark() {
        with(FetchSource("abc")) {
            assertEquals('a', read())
            mark()
            assertEquals('b', read())
            unmark()
            assertEquals('c', read())
        }
    }

    /** After the inner mark is dropped, a reset returns to the outer mark. */
    @Test
    fun testMarkMarkUnmarkReset() {
        with(FetchSource("0123456789")) {
            assertEquals('0', read())
            mark()
            assertEquals('1', read())
            mark()
            assertEquals('2', read())
            unmark()
            assertEquals('3', read())
            reset()
            assertEquals('1', read())
        }
    }

    /** A reset replays the character read since the mark. */
    @Test
    fun testMarkReset() {
        with(FetchSource("abc")) {
            assertEquals('a', read())
            mark()
            assertEquals('b', read())
            reset()
            assertEquals('b', read())
        }
    }

    /** Nested marks are unwound innermost first. */
    @Test
    fun testMarkResetMultiple() {
        with(FetchSource("abcdef")) {
            assertEquals('a', read())
            mark()
            assertEquals('b', read())
            mark()
            assertEquals('c', read())
            reset()
            assertEquals('c', read())
            reset()
            assertEquals('b', read())
        }
    }

    /** Peek and read agree with each other both before and after a reset. */
    @Test
    fun testMarkPeekRead() {
        with(FetchSource("abc")) {
            assertEquals('a', read())
            mark()
            assertEquals('b', peek())
            assertEquals('b', read())
            reset()
            assertEquals('b', peek())
            assertEquals('b', read())
            mark()
            assertEquals('c', peek())
            assertEquals('c', read())
            reset()
            assertEquals('c', peek())
            assertEquals('c', read())
        }
    }
}

/** Test-only factory: builds a [FetchSource] that reads the characters of [string]. */
@Suppress("TestFunctionName")
private fun FetchSource(string: String): FetchSource {
    val reader = StringReader(string)
    return FetchSource(reader)
}
Loading

0 comments on commit bc2e62e

Please sign in to comment.