Skip to content

Commit

Permalink
feat(spdx-utils): Add a new SPDX expression parser implementation
Browse files Browse the repository at this point in the history
Add a handwritten parser for SPDX expressions which will replace the
parser generated by ANTLR in a later commit. Having a parser that is
implemented in pure Kotlin will make it possible to make `spdx-utils` a
multiplatform module.

Performance tests with various SPDX expressions have shown that the
handwritten parser is on average seven times faster than the one
generated by ANTLR.

Signed-off-by: Martin Nonnenmacher <[email protected]>
  • Loading branch information
mnonnenmacher committed Feb 6, 2024
1 parent 36e703b commit 654b142
Show file tree
Hide file tree
Showing 6 changed files with 939 additions and 0 deletions.
52 changes: 52 additions & 0 deletions utils/spdx/src/main/kotlin/parser/Exceptions.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
/*
* Copyright (C) 2024 The ORT Project Authors (see <https://github.com/oss-review-toolkit/ort/blob/main/NOTICE>)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* SPDX-License-Identifier: Apache-2.0
* License-Filename: LICENSE
*/

package org.ossreviewtoolkit.utils.spdx.parser

import kotlin.reflect.KClass

import org.ossreviewtoolkit.utils.spdx.SpdxException

/**
 * Thrown by the [SpdxExpressionLexer] when it encounters a character that cannot start any token.
 *
 * @property char The offending character.
 * @property position The position of [char] in the input as counted by the lexer.
 */
class SpdxExpressionLexerException(
    val char: Char,
    val position: Int
) : SpdxException("Unexpected character '$char' at position $position.")

/**
 * Thrown by the [SpdxExpressionParser] when the token stream does not match the grammar.
 *
 * @property token The token that was not expected at this point, or `null` if the input ended although more tokens
 * were required.
 * @property expectedTokenTypes The token types that would have been accepted instead. May be empty if the parser
 * cannot name them.
 */
class SpdxExpressionParserException(
    val token: Token?,
    vararg val expectedTokenTypes: KClass<out Token> = emptyArray()
) : SpdxException(
    buildString {
        append("Unexpected token '$token'")

        // Only mention expectations if there are any, and use the singular form for exactly one.
        when (expectedTokenTypes.size) {
            0 -> Unit
            1 -> append(", expected ${expectedTokenTypes.first().simpleName}")
            else -> append(", expected one of ${expectedTokenTypes.joinToString { it.simpleName.orEmpty() }}")
        }

        append(".")
    }
)
103 changes: 103 additions & 0 deletions utils/spdx/src/main/kotlin/parser/SpdxExpressionLexer.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
/*
* Copyright (C) 2024 The ORT Project Authors (see <https://github.com/oss-review-toolkit/ort/blob/main/NOTICE>)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* SPDX-License-Identifier: Apache-2.0
* License-Filename: LICENSE
*/

package org.ossreviewtoolkit.utils.spdx.parser

import org.ossreviewtoolkit.utils.common.nextOrNull

/**
 * Turns the characters of an SPDX expression into a sequence of [Token]s. See [SpdxExpressionParser] for the grammar
 * that is built on top of these tokens.
 *
 * Positions reported in tokens and exceptions are 1-based: the character counter starts at zero and is incremented
 * each time a character is consumed.
 */
class SpdxExpressionLexer(input: Sequence<Char>) {
    constructor(input: String) : this(input.asSequence())

    private val iterator = input.iterator()

    /** The lookahead character, or `null` once the input is exhausted. */
    private var next = iterator.nextOrNull()

    /** The number of characters consumed so far, which equals the 1-based position of the last consumed one. */
    private var position = 0

    /**
     * Return the tokens of the input. The sequence is backed by the state of this lexer, so tokens are produced
     * lazily and the input is consumed while iterating.
     */
    fun tokens(): Sequence<Token> = generateSequence { nextToken() }

    /**
     * Read the next token, skipping any leading space characters. Return `null` at the end of the input and throw an
     * [SpdxExpressionLexerException] for a character that cannot start a token.
     */
    private fun nextToken(): Token? {
        var candidate = consumeChar()
        while (candidate == ' ') candidate = consumeChar()

        val char = candidate ?: return null

        return when (char) {
            '(' -> Token.OPEN(position)
            ')' -> Token.CLOSE(position)
            '+' -> Token.PLUS(position)
            ':' -> Token.COLON(position)
            else -> if (char.isIdentifier()) readWord(char) else throw SpdxExpressionLexerException(char, position)
        }
    }

    /**
     * Read the remaining characters of a word whose already consumed first character is [firstChar] and classify it
     * as an operator, a document reference, a license reference, or a plain identifier.
     */
    private fun readWord(firstChar: Char): Token {
        val start = position

        val word = buildString {
            append(firstChar)
            while (next?.isIdentifier() == true) append(consumeChar())
        }

        // Operators are matched case-insensitively, the reference prefixes are matched case-sensitively.
        val keyword = word.uppercase()

        return when {
            keyword == "AND" -> Token.AND(start)
            keyword == "OR" -> Token.OR(start)
            keyword == "WITH" -> Token.WITH(start)
            word.startsWith("DocumentRef-") -> Token.DOCUMENTREF(start, position, word)
            word.startsWith("LicenseRef-") -> Token.LICENSEREF(start, position, word)
            else -> Token.IDENTIFIER(start, position, word)
        }
    }

    /**
     * Return the lookahead character and advance to the following one, or return `null` without changing any state if
     * the input is exhausted.
     */
    private fun consumeChar(): Char? {
        val consumed = next ?: return null
        position++
        next = iterator.nextOrNull()
        return consumed
    }

    companion object {
        /** The uppercase characters allowed by the SPDX specification. */
        private val UPPERCASE = 'A'..'Z'

        /** The lowercase characters allowed by the SPDX specification. */
        private val LOWERCASE = 'a'..'z'

        /** The digits allowed by the SPDX specification. */
        private val DIGITS = '0'..'9'

        /**
         * Return true if the character may be part of an identifier, which is the case for the ASCII letters and
         * digits defined above as well as for '.' and '-'.
         */
        private fun Char.isIdentifier() =
            when (this) {
                in UPPERCASE, in LOWERCASE, in DIGITS, '.', '-' -> true
                else -> false
            }
    }
}
196 changes: 196 additions & 0 deletions utils/spdx/src/main/kotlin/parser/SpdxExpressionParser.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
/*
* Copyright (C) 2024 The ORT Project Authors (see <https://github.com/oss-review-toolkit/ort/blob/main/NOTICE>)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* SPDX-License-Identifier: Apache-2.0
* License-Filename: LICENSE
*/

package org.ossreviewtoolkit.utils.spdx.parser

import org.ossreviewtoolkit.utils.common.nextOrNull
import org.ossreviewtoolkit.utils.spdx.SpdxCompoundExpression
import org.ossreviewtoolkit.utils.spdx.SpdxExpression
import org.ossreviewtoolkit.utils.spdx.SpdxLicenseIdExpression
import org.ossreviewtoolkit.utils.spdx.SpdxLicenseReferenceExpression
import org.ossreviewtoolkit.utils.spdx.SpdxLicenseWithExceptionExpression
import org.ossreviewtoolkit.utils.spdx.SpdxOperator

/**
 * A parser for SPDX expressions. It consumes a sequence of [Token]s and produces an [SpdxExpression].
 *
 * This parser implements the grammar defined in the
 * [SPDX specification](https://spdx.github.io/spdx-spec/v2.2.2/SPDX-license-expressions/):
 *
 * ```
 * license-expression  -> simple-expression | compound-expression
 * compound-expression -> simple-expression |
 *                        simple-expression "WITH" license-exception-id |
 *                        compound-expression "AND" compound-expression |
 *                        compound-expression "OR" compound-expression |
 *                        "(" compound-expression ")"
 * simple-expression   -> license-id | license-id"+" | license-ref
 * license-ref         -> ["DocumentRef-" idstring ":"] "LicenseRef-" idstring
 * license-exception-id -> <short form license exception identifier in Annex A.2>
 * license-id          -> <short form license identifier in Annex A.1>
 * idstring            -> 1*(ALPHA / DIGIT / "-" / "." )
 * ```
 *
 * For more efficient parsing, this is transformed into the following form which implements the operator precedence as
 * part of the grammar:
 *
 * ```
 * license-expression -> or-expression
 * or-expression      -> and-expression ( "OR" and-expression ) *
 * and-expression     -> primary ( "AND" primary ) *
 * primary            -> "(" license-expression ")" | simple-expression
 * simple-expression  -> ( IDENTIFIER [ "+" ] | license-ref ) [ "WITH" ( IDENTIFIER | license-ref ) ]
 * license-ref        -> [ DOCUMENTREF ":" ] LICENSEREF
 * ```
 *
 * This allows implementing a
 * [recursive descent parser](https://en.wikipedia.org/wiki/Recursive_descent_parser) with
 * [Pratt parsing](https://en.wikipedia.org/wiki/Operator-precedence_parser#Pratt_parsing). The implementation is
 * loosely based on this
 * [example](https://journal.stuffwithstuff.com/2011/03/19/pratt-parsers-expression-parsing-made-easy/) but with many
 * simplifications as the SPDX grammar has only one operator per level of precedence and the parser does not need to be
 * extensible.
 *
 * Also, the rules for `license-id` and `license-exception-id` are changed to allow any valid `idstring` as the
 * [strictness] decides if only the SPDX identifiers are allowed for license and exception ids and therefore these rules
 * cannot be part of the grammar. In addition to plain identifiers, the parser accepts a license reference as the
 * exception after "WITH".
 *
 * For backward compatibility with the previously used SPDX expression parser, operators are case-insensitive. This is
 * also planned for future SPDX versions, see https://github.com/spdx/spdx-spec/pull/876.
 */
class SpdxExpressionParser(
    tokens: Sequence<Token>,
    private val strictness: SpdxExpression.Strictness = SpdxExpression.Strictness.ALLOW_ANY
) {
    constructor(
        input: String,
        strictness: SpdxExpression.Strictness = SpdxExpression.Strictness.ALLOW_ANY
    ) : this(SpdxExpressionLexer(input).tokens(), strictness)

    private val iterator = tokens.iterator()

    /** The lookahead token, or `null` once all tokens have been consumed. */
    private var next = iterator.nextOrNull()

    /**
     * Parse the complete token sequence into an [SpdxExpression]. Throw an [SpdxExpressionParserException] if the
     * tokens do not form a valid expression or if tokens are left over after a complete expression was read.
     * Each parsed expression is additionally validated against the configured [strictness].
     */
    fun parse(): SpdxExpression {
        val result = parseOrExpression()
        if (next != null) throw SpdxExpressionParserException(next)
        return result
    }

    /**
     * Parse an OR expression of the form `or-expression -> and-expression ( "OR" and-expression ) *`.
     */
    private fun parseOrExpression(): SpdxExpression {
        var left = parseAndExpression()
        while (next is Token.OR) {
            consume<Token.OR>()
            val right = parseAndExpression()
            left = SpdxCompoundExpression(left, SpdxOperator.OR, right)
        }
        return left
    }

    /**
     * Parse an AND expression of the form `and-expression -> primary ( "AND" primary ) *`.
     */
    private fun parseAndExpression(): SpdxExpression {
        var left = parsePrimary()
        while (next is Token.AND) {
            consume<Token.AND>()
            val right = parsePrimary()
            left = SpdxCompoundExpression(left, SpdxOperator.AND, right)
        }
        return left
    }

    /**
     * Parse a primary of the form `primary -> "(" license-expression ")" | simple-expression`.
     */
    private fun parsePrimary(): SpdxExpression {
        if (next is Token.OPEN) {
            consume<Token.OPEN>()
            val expression = parseOrExpression()
            consume<Token.CLOSE>()
            return expression
        }

        return parseSimpleExpression()
    }

    /**
     * Parse a simple expression of the form
     * `simple-expression -> ( IDENTIFIER [ "+" ] | license-ref ) [ "WITH" ( IDENTIFIER | license-ref ) ]`.
     */
    private fun parseSimpleExpression(): SpdxExpression {
        val left = when (next) {
            is Token.IDENTIFIER -> {
                val identifier = consume<Token.IDENTIFIER>()

                // An identifier is "or later" either due to an explicit "+" or due to its "-or-later" suffix.
                val orLaterVersion = next is Token.PLUS || identifier.value.endsWith("-or-later")
                if (next is Token.PLUS) consume<Token.PLUS>()

                SpdxLicenseIdExpression(identifier.value, orLaterVersion).apply { validate(strictness) }
            }

            is Token.DOCUMENTREF, is Token.LICENSEREF ->
                SpdxLicenseReferenceExpression(parseLicenseRef()).apply { validate(strictness) }

            else -> throw SpdxExpressionParserException(next)
        }

        if (next is Token.WITH) {
            consume<Token.WITH>()
            val exception = when (next) {
                is Token.IDENTIFIER -> consume<Token.IDENTIFIER>().value
                is Token.LICENSEREF, is Token.DOCUMENTREF -> parseLicenseRef()
                else -> throw SpdxExpressionParserException(
                    next,
                    Token.IDENTIFIER::class,
                    Token.LICENSEREF::class,
                    Token.DOCUMENTREF::class
                )
            }
            return SpdxLicenseWithExceptionExpression(left, exception).apply { validate(strictness) }
        }

        return left
    }

    /**
     * Parse a license reference of the form `license-ref -> [ DOCUMENTREF ":" ] LICENSEREF` and return its textual
     * form. The lexer emits the ":" as a separate token, so it has to be consumed between the document reference and
     * the license reference.
     */
    private fun parseLicenseRef(): String {
        if (next !is Token.DOCUMENTREF) return consume<Token.LICENSEREF>().value

        val documentRef = consume<Token.DOCUMENTREF>()
        consume<Token.COLON>()
        val licenseRef = consume<Token.LICENSEREF>()

        return "${documentRef.value}:${licenseRef.value}"
    }

    /**
     * Consume the [next] token and return it if it is of the expected type [T], otherwise throw an
     * [SpdxExpressionParserException].
     */
    private inline fun <reified T : Token> consume(): T {
        val token = next
        if (token !is T) throw SpdxExpressionParserException(token, T::class)
        next = iterator.nextOrNull()
        return token
    }
}
36 changes: 36 additions & 0 deletions utils/spdx/src/main/kotlin/parser/Token.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
/*
* Copyright (C) 2024 The ORT Project Authors (see <https://github.com/oss-review-toolkit/ort/blob/main/NOTICE>)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* SPDX-License-Identifier: Apache-2.0
* License-Filename: LICENSE
*/

package org.ossreviewtoolkit.utils.spdx.parser

/**
* The tokens created by the [SpdxExpressionLexer] and consumed by the [SpdxExpressionParser].
*
* All positions are the values of the lexer's character counter, which starts at zero and is incremented whenever a
* character is consumed. The first character of the input therefore has position 1.
*/
sealed class Token {
// Single-character tokens: [position] is the position of that character.
data class OPEN(val position: Int) : Token()
data class CLOSE(val position: Int) : Token()
data class PLUS(val position: Int) : Token()
data class COLON(val position: Int) : Token()
// Operator keywords, matched case-insensitively by the lexer: [position] is the position of the first character.
data class AND(val position: Int) : Token()
data class OR(val position: Int) : Token()
data class WITH(val position: Int) : Token()
// Words: [start] and [end] are the positions of the first and last character, [value] is the text as written.
// The lexer emits DOCUMENTREF for words starting with "DocumentRef-", LICENSEREF for words starting with
// "LicenseRef-", and IDENTIFIER for all other non-operator words.
data class IDENTIFIER(val start: Int, val end: Int, val value: String) : Token()
data class LICENSEREF(val start: Int, val end: Int, val value: String) : Token()
data class DOCUMENTREF(val start: Int, val end: Int, val value: String) : Token()
}
Loading

0 comments on commit 654b142

Please sign in to comment.