-
Notifications
You must be signed in to change notification settings - Fork 314
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(spdx-utils): Add a new SPDX expression parser implementation
Add a handwritten parser for SPDX expressions which will replace the parser generated by ANTLR in a later commit. Having a parser that is implemented in pure Kotlin will make it possible to make `spdx-utils` a multiplatform module. Performance tests with various SPDX expressions have shown that the handwritten parser is on average seven times faster than the one generated by ANTLR. Signed-off-by: Martin Nonnenmacher <[email protected]>
- Loading branch information
1 parent
36e703b
commit 654b142
Showing
6 changed files
with
939 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
/* | ||
* Copyright (C) 2024 The ORT Project Authors (see <https://github.com/oss-review-toolkit/ort/blob/main/NOTICE>) | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* https://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
* | ||
* SPDX-License-Identifier: Apache-2.0 | ||
* License-Filename: LICENSE | ||
*/ | ||
|
||
package org.ossreviewtoolkit.utils.spdx.parser | ||
|
||
import kotlin.reflect.KClass | ||
|
||
import org.ossreviewtoolkit.utils.spdx.SpdxException | ||
|
||
/**
 * Signals that the [SpdxExpressionLexer] read a character that cannot be part of any token. [char] is the offending
 * character and [position] is the position the lexer reported for it.
 */
class SpdxExpressionLexerException(
    val char: Char,
    val position: Int
) : SpdxException("Unexpected character '$char' at position $position.")
|
||
/**
 * Signals a failure of the [SpdxExpressionParser]. [token] is the token the parser did not expect; a `null` [token]
 * means that the input ended although further tokens were required. [expectedTokenTypes] lists the token types that
 * would have been accepted instead and is empty if they are not known.
 */
class SpdxExpressionParserException(
    val token: Token?,
    vararg val expectedTokenTypes: KClass<out Token> = emptyArray()
) : SpdxException(
    buildString {
        append("Unexpected token '$token'")

        // Only mention expectations if the thrower provided any, and pick singular or plural wording accordingly.
        when (expectedTokenTypes.size) {
            0 -> Unit
            1 -> append(", expected ${expectedTokenTypes.first().simpleName}")
            else -> append(", expected one of ${expectedTokenTypes.joinToString { it.simpleName.orEmpty() }}")
        }

        append(".")
    }
)
103 changes: 103 additions & 0 deletions
103
utils/spdx/src/main/kotlin/parser/SpdxExpressionLexer.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,103 @@ | ||
/* | ||
* Copyright (C) 2024 The ORT Project Authors (see <https://github.com/oss-review-toolkit/ort/blob/main/NOTICE>) | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* https://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
* | ||
* SPDX-License-Identifier: Apache-2.0 | ||
* License-Filename: LICENSE | ||
*/ | ||
|
||
package org.ossreviewtoolkit.utils.spdx.parser | ||
|
||
import org.ossreviewtoolkit.utils.common.nextOrNull | ||
|
||
/**
 * A lexer for SPDX expressions. It reads the characters of the input one by one and turns them into a lazily produced
 * sequence of [Token]s. For details on the grammar see [SpdxExpressionParser].
 */
class SpdxExpressionLexer(input: Sequence<Char>) {
    constructor(input: String) : this(input.asSequence())

    /** The source of the characters that have not been read yet. */
    private val iterator = input.iterator()

    /** The lookahead character, or `null` once the input is exhausted. */
    private var next = iterator.nextOrNull()

    /** The number of characters consumed so far, which equals the 1-based position of the last consumed character. */
    private var position = 0

    /**
     * Return the [Token]s of the input. The tokens are created on demand while the sequence is iterated, and an
     * [SpdxExpressionLexerException] is thrown during iteration when an unexpected character is read.
     */
    fun tokens(): Sequence<Token> = generateSequence { nextToken() }

    /**
     * Read the next [Token] from the input, skipping any leading spaces. Return `null` if the end of the input is
     * reached, and throw an [SpdxExpressionLexerException] if the next character cannot start a token.
     */
    private fun nextToken(): Token? {
        var cur = consumeChar()
        while (cur == ' ') cur = consumeChar()

        val first = cur ?: return null

        return when {
            first == '(' -> Token.OPEN(position)
            first == ')' -> Token.CLOSE(position)
            first == '+' -> Token.PLUS(position)
            first == ':' -> Token.COLON(position)
            first.isIdentifier() -> readWord(first)
            else -> throw SpdxExpressionLexerException(first, position)
        }
    }

    /**
     * Read the remaining identifier characters of a word that starts with the already consumed character [first] and
     * classify the word as an operator, a document reference, a license reference, or a plain identifier.
     */
    private fun readWord(first: Char): Token {
        val start = position

        val word = StringBuilder().append(first)
        while (true) {
            // Peek at the lookahead first so that the character terminating the word is left unconsumed.
            val c = next?.takeIf { it.isIdentifier() } ?: break
            consumeChar()
            word.append(c)
        }

        val value = word.toString()
        val end = position

        // Operators are matched case-insensitively, the reference prefixes are matched case-sensitively.
        val keyword = value.uppercase()

        return when {
            keyword == "AND" -> Token.AND(start)
            keyword == "OR" -> Token.OR(start)
            keyword == "WITH" -> Token.WITH(start)
            value.startsWith("DocumentRef-") -> Token.DOCUMENTREF(start, end, value)
            value.startsWith("LicenseRef-") -> Token.LICENSEREF(start, end, value)
            else -> Token.IDENTIFIER(start, end, value)
        }
    }

    /**
     * Return the lookahead character and advance the input by one character, or return `null` without changing any
     * state if the input is exhausted.
     */
    private fun consumeChar(): Char? {
        val consumed = next ?: return null
        position++
        next = iterator.nextOrNull()
        return consumed
    }

    companion object {
        /**
         * Return true if the character may be part of an identifier, which is the case for the ASCII letters, the
         * digits, '.', and '-'.
         */
        private fun Char.isIdentifier() =
            this in 'A'..'Z' || this in 'a'..'z' || this in '0'..'9' || this == '.' || this == '-'
    }
}
196 changes: 196 additions & 0 deletions
196
utils/spdx/src/main/kotlin/parser/SpdxExpressionParser.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,196 @@ | ||
/* | ||
* Copyright (C) 2024 The ORT Project Authors (see <https://github.com/oss-review-toolkit/ort/blob/main/NOTICE>) | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* https://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
* | ||
* SPDX-License-Identifier: Apache-2.0 | ||
* License-Filename: LICENSE | ||
*/ | ||
|
||
package org.ossreviewtoolkit.utils.spdx.parser | ||
|
||
import org.ossreviewtoolkit.utils.common.nextOrNull | ||
import org.ossreviewtoolkit.utils.spdx.SpdxCompoundExpression | ||
import org.ossreviewtoolkit.utils.spdx.SpdxExpression | ||
import org.ossreviewtoolkit.utils.spdx.SpdxLicenseIdExpression | ||
import org.ossreviewtoolkit.utils.spdx.SpdxLicenseReferenceExpression | ||
import org.ossreviewtoolkit.utils.spdx.SpdxLicenseWithExceptionExpression | ||
import org.ossreviewtoolkit.utils.spdx.SpdxOperator | ||
|
||
/**
 * A parser for SPDX expressions. It consumes a sequence of [Token]s and produces an [SpdxExpression].
 *
 * This parser implements the grammar defined in the
 * [SPDX specification](https://spdx.github.io/spdx-spec/v2.2.2/SPDX-license-expressions/):
 *
 * ```
 * license-expression  -> simple-expression | compound-expression
 * compound-expression -> simple-expression |
 *                        simple-expression "WITH" license-exception-id |
 *                        compound-expression "AND" compound-expression |
 *                        compound-expression "OR" compound-expression |
 *                        "(" compound-expression ")" )
 * simple-expression   -> license-id | license-id"+" | license-ref
 * license-ref         -> ["DocumentRef-" idstring ":"] "LicenseRef-" idstring
 * license-exception-id -> <short form license exception identifier in Annex A.2>
 * license-id          -> <short form license identifier in Annex A.1>
 * idstring            -> 1*(ALPHA / DIGIT / "-" / "." )
 * ```
 *
 * For more efficient parsing, this is transformed into the following form which implements the operator precedence as
 * part of the grammar:
 *
 * ```
 * license-expression -> or-expression
 * or-expression      -> and-expression ( "OR" and-expression ) *
 * and-expression     -> primary ( "AND" primary ) *
 * primary            -> "(" license-expression ")" | simple-expression
 * simple-expression  -> ( IDENTIFIER [ "+" ] | license-ref ) [ "WITH" ( IDENTIFIER | license-ref ) ]
 * license-ref        -> [ DOCUMENTREF ":" ] LICENSEREF
 * ```
 *
 * This allows implementing a
 * [recursive descent parser](https://en.wikipedia.org/wiki/Recursive_descent_parser) with
 * [Pratt parsing](https://en.wikipedia.org/wiki/Operator-precedence_parser#Pratt_parsing). The implementation is
 * loosely based on this
 * [example](https://journal.stuffwithstuff.com/2011/03/19/pratt-parsers-expression-parsing-made-easy/) but with many
 * simplifications as the SPDX grammar has only one operator per level of precedence and the parser does not need to be
 * extensible.
 *
 * Also, the rules for `license-id` and `license-exception-id` are changed to allow any valid `idstring` as the
 * [strictness] decides if only the SPDX identifiers are allowed for license and exception ids and therefore these rules
 * cannot be part of the grammar. Deviating from the specification, the parser also accepts a license reference in the
 * place of the exception id.
 *
 * For backward compatibility with the previously used SPDX expression parser, operators are case-insensitive. This is
 * also planned for future SPDX versions, see https://github.com/spdx/spdx-spec/pull/876.
 */
class SpdxExpressionParser(
    tokens: Sequence<Token>,
    private val strictness: SpdxExpression.Strictness = SpdxExpression.Strictness.ALLOW_ANY
) {
    constructor(
        input: String,
        strictness: SpdxExpression.Strictness = SpdxExpression.Strictness.ALLOW_ANY
    ) : this(SpdxExpressionLexer(input).tokens(), strictness)

    private val iterator = tokens.iterator()

    /** The lookahead token, or `null` once all tokens have been consumed. */
    private var next = iterator.nextOrNull()

    /**
     * Parse all tokens into a single [SpdxExpression]. Throw an [SpdxExpressionParserException] if an unexpected token
     * is found, if the tokens end prematurely, or if tokens are left over after a complete expression was parsed.
     */
    fun parse(): SpdxExpression {
        val result = parseOrExpression()
        if (next != null) throw SpdxExpressionParserException(next)
        return result
    }

    /**
     * Parse an OR expression of the form `or-expression -> and-expression ( "OR" and-expression ) *`.
     */
    private fun parseOrExpression(): SpdxExpression {
        var left = parseAndExpression()
        while (next is Token.OR) {
            consume<Token.OR>()
            val right = parseAndExpression()
            left = SpdxCompoundExpression(left, SpdxOperator.OR, right)
        }
        return left
    }

    /**
     * Parse an AND expression of the form `and-expression -> primary ( "AND" primary ) *`.
     */
    private fun parseAndExpression(): SpdxExpression {
        var left = parsePrimary()
        while (next is Token.AND) {
            consume<Token.AND>()
            val right = parsePrimary()
            left = SpdxCompoundExpression(left, SpdxOperator.AND, right)
        }
        return left
    }

    /**
     * Parse a primary of the form `primary -> "(" license-expression ")" | simple-expression`.
     */
    private fun parsePrimary(): SpdxExpression {
        if (next is Token.OPEN) {
            consume<Token.OPEN>()
            val expression = parseOrExpression()
            consume<Token.CLOSE>()
            return expression
        }

        return parseSimpleExpression()
    }

    /**
     * Parse a simple expression of the form
     * `simple-expression -> ( IDENTIFIER [ "+" ] | license-ref ) [ "WITH" ( IDENTIFIER | license-ref ) ]`.
     */
    private fun parseSimpleExpression(): SpdxExpression {
        val left = when (next) {
            is Token.IDENTIFIER -> {
                val identifier = consume<Token.IDENTIFIER>()

                // Both a trailing "+" and the "-or-later" suffix of the identifier mark an "or later version" license.
                val orLaterVersion = next is Token.PLUS || identifier.value.endsWith("-or-later")
                if (next is Token.PLUS) consume<Token.PLUS>()

                SpdxLicenseIdExpression(identifier.value, orLaterVersion).apply { validate(strictness) }
            }

            is Token.DOCUMENTREF, is Token.LICENSEREF ->
                SpdxLicenseReferenceExpression(parseLicenseReference()).apply { validate(strictness) }

            else -> throw SpdxExpressionParserException(next)
        }

        if (next is Token.WITH) {
            consume<Token.WITH>()
            val exception = when (next) {
                is Token.IDENTIFIER -> consume<Token.IDENTIFIER>().value
                is Token.DOCUMENTREF, is Token.LICENSEREF -> parseLicenseReference()
                else -> throw SpdxExpressionParserException(
                    next,
                    Token.IDENTIFIER::class,
                    Token.LICENSEREF::class,
                    Token.DOCUMENTREF::class
                )
            }
            return SpdxLicenseWithExceptionExpression(left, exception).apply { validate(strictness) }
        }

        return left
    }

    /**
     * Parse a license reference of the form `license-ref -> [ DOCUMENTREF ":" ] LICENSEREF` and return its textual
     * form, which is either `LicenseRef-...` or `DocumentRef-...:LicenseRef-...`. The colon is a token of its own, so
     * it has to be consumed between the document reference and the license reference.
     */
    private fun parseLicenseReference(): String {
        if (next !is Token.DOCUMENTREF) return consume<Token.LICENSEREF>().value

        val documentRef = consume<Token.DOCUMENTREF>()
        consume<Token.COLON>()
        val licenseRef = consume<Token.LICENSEREF>()

        return "${documentRef.value}:${licenseRef.value}"
    }

    /**
     * Consume the [next] token and return it if it is of the expected type [T], otherwise throw an
     * [SpdxExpressionParserException].
     */
    private inline fun <reified T : Token> consume(): T {
        val token = next
        if (token !is T) throw SpdxExpressionParserException(token, T::class)
        next = iterator.nextOrNull()
        return token
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
/* | ||
* Copyright (C) 2024 The ORT Project Authors (see <https://github.com/oss-review-toolkit/ort/blob/main/NOTICE>) | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* https://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
* | ||
* SPDX-License-Identifier: Apache-2.0 | ||
* License-Filename: LICENSE | ||
*/ | ||
|
||
package org.ossreviewtoolkit.utils.spdx.parser | ||
|
||
/** | ||
* The tokens created by the [SpdxExpressionLexer] and consumed by the [SpdxExpressionParser]. | ||
*/ | ||
sealed class Token { | ||
    // Positions are supplied by whoever creates the tokens. The [SpdxExpressionLexer] counts characters starting
    // at 1, and for multi-character tokens it reports the positions of the first and the last character.

    /** An opening parenthesis `(` at [position]. */
    data class OPEN(val position: Int) : Token() | ||
    /** A closing parenthesis `)` at [position]. */
    data class CLOSE(val position: Int) : Token() | ||
    /** A plus sign `+` at [position]. */
    data class PLUS(val position: Int) : Token() | ||
    /** A colon `:` at [position]. */
    data class COLON(val position: Int) : Token() | ||
    /** The `AND` operator, [position] is where the operator starts. */
    data class AND(val position: Int) : Token() | ||
    /** The `OR` operator, [position] is where the operator starts. */
    data class OR(val position: Int) : Token() | ||
    /** The `WITH` operator, [position] is where the operator starts. */
    data class WITH(val position: Int) : Token() | ||
    /** Any other identifier with the text [value], spanning from [start] to [end]. */
    data class IDENTIFIER(val start: Int, val end: Int, val value: String) : Token() | ||
    /** An identifier starting with `LicenseRef-` with the text [value], spanning from [start] to [end]. */
    data class LICENSEREF(val start: Int, val end: Int, val value: String) : Token() | ||
    /** An identifier starting with `DocumentRef-` with the text [value], spanning from [start] to [end]. */
    data class DOCUMENTREF(val start: Int, val end: Int, val value: String) : Token() | ||
} |
Oops, something went wrong.