feat(spdx-utils): Add a new SPDX expression parser implementation

Add a handwritten parser for SPDX expressions which will replace the parser generated by ANTLR in a later commit. Having a parser that is implemented in pure Kotlin will make it possible to make `spdx-utils` a multiplatform module.

Performance tests with various SPDX expressions have shown that the handwritten parser is on average seven times faster than the one generated by ANTLR.

Signed-off-by: Martin Nonnenmacher <[email protected]>
oss-review-toolkit · Feb 6, 2024 · 654b142 · 654b142
1 parent 36e703b
commit 654b142
Show file tree

Hide file tree

Showing 6 changed files with 939 additions and 0 deletions.
diff --git a/utils/spdx/src/main/kotlin/parser/Exceptions.kt b/utils/spdx/src/main/kotlin/parser/Exceptions.kt
@@ -0,0 +1,52 @@
+/*
+ * Copyright (C) 2024 The ORT Project Authors (see <https://github.com/oss-review-toolkit/ort/blob/main/NOTICE>)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ * License-Filename: LICENSE
+ */
+
+package org.ossreviewtoolkit.utils.spdx.parser
+
+import kotlin.reflect.KClass
+
+import org.ossreviewtoolkit.utils.spdx.SpdxException
+
+/**
+ * Thrown by the [SpdxExpressionLexer] when it reads a character that cannot be part of any token. [char] is the
+ * offending character and [position] is the position the lexer reported for it.
+ */
+class SpdxExpressionLexerException(
+    val char: Char,
+    val position: Int
+) : SpdxException("Unexpected character '$char' at position $position.")
+
+/**
+ * Thrown by the [SpdxExpressionParser] when the token stream does not match the grammar. [token] is the token the
+ * parser could not handle; a `null` [token] means that the input ended although more tokens were required.
+ * [expectedTokenTypes] lists the token types that would have been accepted instead and is empty if they are not
+ * available.
+ */
+class SpdxExpressionParserException(
+    val token: Token?,
+    vararg val expectedTokenTypes: KClass<out Token> = emptyArray()
+) : SpdxException(
+    buildString {
+        append("Unexpected token '$token'")
+
+        // Only mention expectations if any were provided, using singular or plural wording as appropriate.
+        when (expectedTokenTypes.size) {
+            0 -> Unit
+            1 -> append(", expected ${expectedTokenTypes[0].simpleName}")
+            else -> append(", expected one of ${expectedTokenTypes.joinToString { it.simpleName.orEmpty() }}")
+        }
+
+        append(".")
+    }
+)
diff --git a/utils/spdx/src/main/kotlin/parser/SpdxExpressionLexer.kt b/utils/spdx/src/main/kotlin/parser/SpdxExpressionLexer.kt
@@ -0,0 +1,103 @@
+/*
+ * Copyright (C) 2024 The ORT Project Authors (see <https://github.com/oss-review-toolkit/ort/blob/main/NOTICE>)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ * License-Filename: LICENSE
+ */
+
+package org.ossreviewtoolkit.utils.spdx.parser
+
+import org.ossreviewtoolkit.utils.common.nextOrNull
+
+/**
+ * A lexer that turns the characters of an SPDX expression into a sequence of [Token]s. See [SpdxExpressionParser] for
+ * details on the grammar.
+ *
+ * The positions stored in tokens and in [SpdxExpressionLexerException]s are 1-based, because the position counter is
+ * incremented whenever a character is consumed, which happens before the corresponding token is created.
+ */
+class SpdxExpressionLexer(input: Sequence<Char>) {
+    constructor(input: String) : this(input.asSequence())
+
+    private val iterator = input.iterator()
+
+    /** The lookahead character that is consumed next, or `null` if the input is exhausted. */
+    private var next = iterator.nextOrNull()
+
+    /** The number of characters consumed so far, which equals the 1-based position of the last consumed character. */
+    private var position = 0
+
+    /**
+     * Return the [Token]s of the input as a sequence that reads from the input while it is iterated. The sequence ends
+     * when the input is exhausted.
+     */
+    fun tokens(): Sequence<Token> = generateSequence { nextToken() }
+
+    /**
+     * Read the next [Token] from the input, skipping any space characters in front of it. Return `null` if the input
+     * is exhausted and throw an [SpdxExpressionLexerException] if a character is found that cannot start a token.
+     */
+    private fun nextToken(): Token? {
+        var consumed = consumeChar()
+        while (consumed == ' ') consumed = consumeChar()
+
+        val char = consumed ?: return null
+
+        return when {
+            char == '(' -> Token.OPEN(position)
+            char == ')' -> Token.CLOSE(position)
+            char == '+' -> Token.PLUS(position)
+            char == ':' -> Token.COLON(position)
+            char.isIdentifier() -> readWord(char)
+            else -> throw SpdxExpressionLexerException(char, position)
+        }
+    }
+
+    /**
+     * Read the remaining characters of the word that starts with the already consumed character [first] and turn the
+     * word into a [Token]. Operators are recognized case-insensitively, whereas the prefixes of document and license
+     * references are matched case-sensitively. Any other word becomes an identifier.
+     */
+    private fun readWord(first: Char): Token {
+        val start = position
+
+        val word = buildString {
+            append(first)
+            while (next?.isIdentifier() == true) append(consumeChar())
+        }
+
+        val keyword = word.uppercase()
+
+        return when {
+            keyword == "AND" -> Token.AND(start)
+            keyword == "OR" -> Token.OR(start)
+            keyword == "WITH" -> Token.WITH(start)
+            word.startsWith("DocumentRef-") -> Token.DOCUMENTREF(start, position, word)
+            word.startsWith("LicenseRef-") -> Token.LICENSEREF(start, position, word)
+            else -> Token.IDENTIFIER(start, position, word)
+        }
+    }
+
+    /**
+     * Return the lookahead character and advance the input by one character, or return `null` without changing any
+     * state if the input is exhausted.
+     */
+    private fun consumeChar(): Char? {
+        val consumed = next ?: return null
+
+        position++
+        next = iterator.nextOrNull()
+
+        return consumed
+    }
+
+    companion object {
+        /** The uppercase characters allowed by the SPDX specification. */
+        private val UPPERCASE = 'A'..'Z'
+        /** The lowercase characters allowed by the SPDX specification. */
+        private val LOWERCASE = 'a'..'z'
+        /** The digits allowed by the SPDX specification. */
+        private val DIGITS = '0'..'9'
+
+        /** Return true if the character is a valid identifier character allowed by the SPDX specification. */
+        private fun Char.isIdentifier() =
+            when (this) {
+                in UPPERCASE, in LOWERCASE, in DIGITS, '.', '-' -> true
+                else -> false
+            }
+    }
+}
diff --git a/utils/spdx/src/main/kotlin/parser/SpdxExpressionParser.kt b/utils/spdx/src/main/kotlin/parser/SpdxExpressionParser.kt
@@ -0,0 +1,196 @@
+/*
+ * Copyright (C) 2024 The ORT Project Authors (see <https://github.com/oss-review-toolkit/ort/blob/main/NOTICE>)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ * License-Filename: LICENSE
+ */
+
+package org.ossreviewtoolkit.utils.spdx.parser
+
+import org.ossreviewtoolkit.utils.common.nextOrNull
+import org.ossreviewtoolkit.utils.spdx.SpdxCompoundExpression
+import org.ossreviewtoolkit.utils.spdx.SpdxExpression
+import org.ossreviewtoolkit.utils.spdx.SpdxLicenseIdExpression
+import org.ossreviewtoolkit.utils.spdx.SpdxLicenseReferenceExpression
+import org.ossreviewtoolkit.utils.spdx.SpdxLicenseWithExceptionExpression
+import org.ossreviewtoolkit.utils.spdx.SpdxOperator
+
+/**
+ * A parser for SPDX expressions. It consumes a sequence of [Token]s and produces an [SpdxExpression].
+ *
+ * This parser implements the grammar defined in the
+ * [SPDX specification](https://spdx.github.io/spdx-spec/v2.2.2/SPDX-license-expressions/):
+ *
+ * ```
+ * license-expression   -> simple-expression | compound-expression
+ * compound-expression  -> simple-expression |
+ *                         simple-expression "WITH" license-exception-id |
+ *                         compound-expression "AND" compound-expression |
+ *                         compound-expression "OR" compound-expression |
+ *                         "(" compound-expression ")"
+ * simple-expression    -> license-id | license-id"+" | license-ref
+ * license-ref          -> ["DocumentRef-" idstring ":"] "LicenseRef-" idstring
+ * license-exception-id -> <short form license exception identifier in Annex A.2>
+ * license-id           -> <short form license identifier in Annex A.1>
+ * idstring             -> 1*(ALPHA / DIGIT / "-" / "." )
+ * ```
+ *
+ * For more efficient parsing, this is transformed into the following form which implements the operator precedence as
+ * part of the grammar:
+ *
+ * ```
+ * license-expression -> or-expression
+ * or-expression      -> and-expression ( "OR" and-expression ) *
+ * and-expression     -> primary ( "AND" primary ) *
+ * primary            -> "(" license-expression ")" | simple-expression
+ * simple-expression  -> ( IDENTIFIER [ "+" ] | license-ref ) [ "WITH" ( IDENTIFIER | license-ref ) ]
+ * license-ref        -> [ DOCUMENTREF ":" ] LICENSEREF
+ * ```
+ *
+ * Note that, other than the grammar of the specification, this implementation also accepts a license reference as the
+ * exception of a `WITH` expression.
+ *
+ * This allows implementing a
+ * [recursive descent parser](https://en.wikipedia.org/wiki/Recursive_descent_parser) with
+ * [Pratt parsing](https://en.wikipedia.org/wiki/Operator-precedence_parser#Pratt_parsing). The implementation is
+ * loosely based on this
+ * [example](https://journal.stuffwithstuff.com/2011/03/19/pratt-parsers-expression-parsing-made-easy/) but with many
+ * simplifications as the SPDX grammar has only one operator per level of precedence and the parser does not need to be
+ * extensible.
+ *
+ * Also, the rules for `license-id` and `license-exception-id` are changed to allow any valid `idstring` as the
+ * [strictness] decides if only the SPDX identifiers are allowed for license and exception ids and therefore these rules
+ * cannot be part of the grammar.
+ *
+ * For backward compatibility with the previously used SPDX expression parser, operators are case-insensitive. This is
+ * also planned for future SPDX versions, see https://github.com/spdx/spdx-spec/pull/876.
+ */
+class SpdxExpressionParser(
+    tokens: Sequence<Token>,
+    private val strictness: SpdxExpression.Strictness = SpdxExpression.Strictness.ALLOW_ANY
+) {
+    constructor(
+        input: String,
+        strictness: SpdxExpression.Strictness = SpdxExpression.Strictness.ALLOW_ANY
+    ) : this(SpdxExpressionLexer(input).tokens(), strictness)
+
+    private val iterator = tokens.iterator()
+
+    /** The lookahead token that is consumed next, or `null` if all tokens have been consumed. */
+    private var next = iterator.nextOrNull()
+
+    /**
+     * Parse the tokens into an [SpdxExpression]. Throw an [SpdxExpressionParserException] if the tokens do not match
+     * the grammar, which includes the case that tokens are left over after a complete expression was parsed.
+     */
+    fun parse(): SpdxExpression {
+        val result = parseOrExpression()
+        if (next != null) throw SpdxExpressionParserException(next)
+        return result
+    }
+
+    /**
+     * Parse an OR expression of the form `or-expression -> and-expression ( "OR" and-expression ) *`. Consecutive
+     * operands are combined left-associatively.
+     */
+    private fun parseOrExpression(): SpdxExpression {
+        var left = parseAndExpression()
+        while (next is Token.OR) {
+            consume<Token.OR>()
+            val right = parseAndExpression()
+            left = SpdxCompoundExpression(left, SpdxOperator.OR, right)
+        }
+        return left
+    }
+
+    /**
+     * Parse an AND expression of the form `and-expression -> primary ( "AND" primary ) *`. Consecutive operands are
+     * combined left-associatively.
+     */
+    private fun parseAndExpression(): SpdxExpression {
+        var left = parsePrimary()
+        while (next is Token.AND) {
+            consume<Token.AND>()
+            val right = parsePrimary()
+            left = SpdxCompoundExpression(left, SpdxOperator.AND, right)
+        }
+        return left
+    }
+
+    /**
+     * Parse a primary of the form `primary -> "(" license-expression ")" | simple-expression`.
+     */
+    private fun parsePrimary(): SpdxExpression {
+        if (next is Token.OPEN) {
+            consume<Token.OPEN>()
+            val expression = parseOrExpression()
+            consume<Token.CLOSE>()
+            return expression
+        }
+
+        return parseSimpleExpression()
+    }
+
+    /**
+     * Parse a simple expression of the form
+     * `simple-expression -> ( IDENTIFIER [ "+" ] | license-ref ) [ "WITH" ( IDENTIFIER | license-ref ) ]`.
+     */
+    private fun parseSimpleExpression(): SpdxExpression {
+        val left = when (next) {
+            is Token.IDENTIFIER -> {
+                val identifier = consume<Token.IDENTIFIER>()
+
+                // Besides an explicit "+", an id with the "-or-later" suffix also marks the or-later-version variant.
+                val orLaterVersion = next is Token.PLUS || identifier.value.endsWith("-or-later")
+                if (next is Token.PLUS) consume<Token.PLUS>()
+
+                SpdxLicenseIdExpression(identifier.value, orLaterVersion).apply { validate(strictness) }
+            }
+
+            is Token.DOCUMENTREF, is Token.LICENSEREF ->
+                SpdxLicenseReferenceExpression(parseLicenseReference()).apply { validate(strictness) }
+
+            else -> throw SpdxExpressionParserException(next)
+        }
+
+        if (next !is Token.WITH) return left
+
+        consume<Token.WITH>()
+        val exception = when (next) {
+            is Token.IDENTIFIER -> consume<Token.IDENTIFIER>().value
+
+            // Use the same rule as for the license so that the ":" after a document reference is consumed here, too.
+            is Token.DOCUMENTREF, is Token.LICENSEREF -> parseLicenseReference()
+
+            else -> throw SpdxExpressionParserException(
+                next,
+                Token.IDENTIFIER::class,
+                Token.LICENSEREF::class,
+                Token.DOCUMENTREF::class
+            )
+        }
+
+        return SpdxLicenseWithExceptionExpression(left, exception).apply { validate(strictness) }
+    }
+
+    /**
+     * Parse a license reference of the form `license-ref -> [ DOCUMENTREF ":" ] LICENSEREF` and return its string
+     * representation, which is the value of the license reference token, prefixed with the value of the document
+     * reference token and a colon if a document reference is present.
+     */
+    private fun parseLicenseReference(): String {
+        if (next !is Token.DOCUMENTREF) return consume<Token.LICENSEREF>().value
+
+        val documentRef = consume<Token.DOCUMENTREF>()
+        consume<Token.COLON>()
+        val licenseRef = consume<Token.LICENSEREF>()
+
+        return "${documentRef.value}:${licenseRef.value}"
+    }
+
+    /**
+     * Consume the [next] token and return it if it is of the expected type [T], otherwise throw an
+     * [SpdxExpressionParserException]. The lookahead is only advanced if the token has the expected type.
+     */
+    private inline fun <reified T : Token> consume(): T {
+        val token = next
+        if (token !is T) throw SpdxExpressionParserException(token, T::class)
+        next = iterator.nextOrNull()
+        return token
+    }
+}
diff --git a/utils/spdx/src/main/kotlin/parser/Token.kt b/utils/spdx/src/main/kotlin/parser/Token.kt
@@ -0,0 +1,36 @@
+/*
+ * Copyright (C) 2024 The ORT Project Authors (see <https://github.com/oss-review-toolkit/ort/blob/main/NOTICE>)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ * License-Filename: LICENSE
+ */
+
+package org.ossreviewtoolkit.utils.spdx.parser
+
+/**
+ * The tokens created by the [SpdxExpressionLexer] and consumed by the [SpdxExpressionParser].
+ *
+ * All positions are the ones counted by the [SpdxExpressionLexer], which starts counting at 1 for the first character
+ * of the input. Tokens for single characters and operators only carry the position where they start, whereas tokens
+ * for words carry the positions of their first and last character together with the text of the word.
+ */
+sealed class Token {
+    /** The opening parenthesis `(` at [position]. */
+    data class OPEN(val position: Int) : Token()
+    /** The closing parenthesis `)` at [position]. */
+    data class CLOSE(val position: Int) : Token()
+    /** The plus sign `+` at [position]. */
+    data class PLUS(val position: Int) : Token()
+    /** The colon `:` at [position]. */
+    data class COLON(val position: Int) : Token()
+    /** The `AND` operator in any letter case, starting at [position]. */
+    data class AND(val position: Int) : Token()
+    /** The `OR` operator in any letter case, starting at [position]. */
+    data class OR(val position: Int) : Token()
+    /** The `WITH` operator in any letter case, starting at [position]. */
+    data class WITH(val position: Int) : Token()
+    /** A word that is neither an operator nor a reference, spanning from [start] to [end] with the text [value]. */
+    data class IDENTIFIER(val start: Int, val end: Int, val value: String) : Token()
+    /** A word starting with `LicenseRef-`, spanning from [start] to [end]; [value] includes the prefix. */
+    data class LICENSEREF(val start: Int, val end: Int, val value: String) : Token()
+    /** A word starting with `DocumentRef-`, spanning from [start] to [end]; [value] includes the prefix. */
+    data class DOCUMENTREF(val start: Int, val end: Int, val value: String) : Token()
+}