-
Notifications
You must be signed in to change notification settings - Fork 314
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(spdx-utils): Add a new SPDX expression parser implementation
Add a handwritten parser for SPDX expressions which will replace the parser generated by ANTLR in a later commit. Having a parser that is implemented in pure Kotlin will make it possible to make `spdx-utils` a multiplatform module. Performance tests with various SPDX expressions have shown that the handwritten parser is on average seven times faster than the one generated by ANTLR. Signed-off-by: Martin Nonnenmacher <[email protected]>
- Loading branch information
1 parent
d1e9c6b
commit b440b78
Showing
6 changed files
with
914 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
/* | ||
* Copyright (C) 2024 The ORT Project Authors (see <https://github.com/oss-review-toolkit/ort/blob/main/NOTICE>) | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* https://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
* | ||
* SPDX-License-Identifier: Apache-2.0 | ||
* License-Filename: LICENSE | ||
*/ | ||
|
||
package org.ossreviewtoolkit.utils.spdx.parser | ||
|
||
import kotlin.reflect.KClass | ||
|
||
import org.ossreviewtoolkit.utils.spdx.SpdxException | ||
|
||
/**
 * Signals that the [SpdxExpressionLexer] read a character it cannot turn into a token.
 *
 * @property char The character that was rejected.
 * @property position The position reported by the lexer for [char].
 */
class SpdxExpressionLexerException(
    val char: Char?,
    val position: Int
) : SpdxException("Unexpected character '$char' at position $position.")
|
||
/**
 * Signals that the [SpdxExpressionParser] found a [token] that does not fit the grammar at the current position.
 *
 * The message always names the unexpected token. If [expectedTokenTypes] is not empty, the simple class names of the
 * token types that would have been accepted are appended.
 *
 * @property token The token that was found, or `null` if there was no token.
 * @property expectedTokenTypes The token types that would have been valid instead, if known.
 */
class SpdxExpressionParserException(
    val token: Token?,
    vararg val expectedTokenTypes: KClass<out Token> = emptyArray()
) : SpdxException(
    "Unexpected token '$token'" + when (expectedTokenTypes.size) {
        0 -> ""
        1 -> ", expected ${expectedTokenTypes[0].simpleName}"
        else -> ", expected one of ${expectedTokenTypes.joinToString { it.simpleName.orEmpty() }}"
    } + "."
)
102 changes: 102 additions & 0 deletions
102
utils/spdx/src/main/kotlin/parser/SpdxExpressionLexer.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,102 @@ | ||
/* | ||
* Copyright (C) 2024 The ORT Project Authors (see <https://github.com/oss-review-toolkit/ort/blob/main/NOTICE>) | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* https://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
* | ||
* SPDX-License-Identifier: Apache-2.0 | ||
* License-Filename: LICENSE | ||
*/ | ||
|
||
package org.ossreviewtoolkit.utils.spdx.parser | ||
|
||
import org.ossreviewtoolkit.utils.common.nextOrNull | ||
|
||
/**
 * A tokenizer for SPDX expressions: it reads the input character by character and turns it into [Token]s. The grammar
 * the tokens are meant for is documented at [SpdxExpressionParser].
 *
 * Positions attached to tokens and errors count the characters consumed so far, so the first character of the input
 * has position 1.
 */
class SpdxExpressionLexer(input: Sequence<Char>) {
    constructor(input: String) : this(input.asSequence())

    companion object {
        /** Whether this character may be part of an identifier: an ASCII letter, an ASCII digit, '.' or '-'. */
        private fun Char.isIdentifier() =
            when (this) {
                in 'A'..'Z', in 'a'..'z', in '0'..'9', '.', '-' -> true
                else -> false
            }
    }

    private val iterator = input.iterator()

    // One character of lookahead; `null` once the input is exhausted.
    private var next = iterator.nextOrNull()

    // Number of characters consumed so far.
    private var position = 0

    /**
     * Return the tokens of the input as a lazily evaluated sequence that ends when the input is exhausted. The
     * sequence reads from the state of this lexer, and an [SpdxExpressionLexerException] is thrown during iteration
     * when an unexpected character is read.
     */
    fun tokens(): Sequence<Token> =
        // `generateSequence` ends the sequence at the first `null` and does not call the lambda again afterwards.
        generateSequence { nextToken() }

    /**
     * Skip spaces and read the next token. Return `null` if the end of the input is reached before a token starts.
     */
    private fun nextToken(): Token? {
        var candidate = consumeChar()
        while (candidate == ' ') candidate = consumeChar()

        val char = candidate ?: return null

        return when {
            char == '(' -> Token.OPEN(position)
            char == ')' -> Token.CLOSE(position)
            char == '+' -> Token.PLUS(position)
            char == ':' -> Token.COLON(position)
            char.isIdentifier() -> readWord(char)
            else -> throw SpdxExpressionLexerException(char, position)
        }
    }

    /**
     * Read the remaining identifier characters of the word that starts with the already consumed character [first]
     * and classify the word as an operator keyword, a document reference, a license reference, or a plain identifier.
     */
    private fun readWord(first: Char): Token {
        val start = position
        val word = StringBuilder().append(first)

        while (true) {
            val upcoming = next
            if (upcoming == null || !upcoming.isIdentifier()) break

            consumeChar()
            word.append(upcoming)
        }

        val value = word.toString()

        return when {
            value == "AND" -> Token.AND(start)
            value == "OR" -> Token.OR(start)
            value == "WITH" -> Token.WITH(start)
            value.startsWith("DocumentRef-") -> Token.DOCUMENTREF(start, position, value)
            value.startsWith("LicenseRef-") -> Token.LICENSEREF(start, position, value)
            else -> Token.IDENTIFIER(start, position, value)
        }
    }

    /**
     * Return the lookahead character and advance to the following one, or return `null` without changing any state
     * if the input is exhausted.
     */
    private fun consumeChar(): Char? {
        val consumed = next ?: return null

        position++
        next = iterator.nextOrNull()

        return consumed
    }
}
193 changes: 193 additions & 0 deletions
193
utils/spdx/src/main/kotlin/parser/SpdxExpressionParser.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,193 @@ | ||
/* | ||
* Copyright (C) 2024 The ORT Project Authors (see <https://github.com/oss-review-toolkit/ort/blob/main/NOTICE>) | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* https://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
* | ||
* SPDX-License-Identifier: Apache-2.0 | ||
* License-Filename: LICENSE | ||
*/ | ||
|
||
package org.ossreviewtoolkit.utils.spdx.parser | ||
|
||
import org.ossreviewtoolkit.utils.common.nextOrNull | ||
import org.ossreviewtoolkit.utils.spdx.SpdxCompoundExpression | ||
import org.ossreviewtoolkit.utils.spdx.SpdxExpression | ||
import org.ossreviewtoolkit.utils.spdx.SpdxLicenseIdExpression | ||
import org.ossreviewtoolkit.utils.spdx.SpdxLicenseReferenceExpression | ||
import org.ossreviewtoolkit.utils.spdx.SpdxLicenseWithExceptionExpression | ||
import org.ossreviewtoolkit.utils.spdx.SpdxOperator | ||
|
||
/**
 * A parser for SPDX expressions. It consumes a sequence of [Token]s and produces an [SpdxExpression].
 *
 * This parser implements the grammar defined in the
 * [SPDX specification](https://spdx.github.io/spdx-spec/v2.2.2/SPDX-license-expressions/):
 *
 * ```
 * license-expression   -> simple-expression | compound-expression
 * compound-expression  -> simple-expression |
 *                         simple-expression "WITH" license-exception-id |
 *                         compound-expression "AND" compound-expression |
 *                         compound-expression "OR" compound-expression |
 *                         "(" compound-expression ")"
 * simple-expression    -> license-id | license-id"+" | license-ref
 * license-ref          -> ["DocumentRef-" idstring ":"] "LicenseRef-" idstring
 * license-exception-id -> <short form license exception identifier in Annex A.2>
 * license-id           -> <short form license identifier in Annex A.1>
 * idstring             -> 1*(ALPHA / DIGIT / "-" / "." )
 * ```
 *
 * For more efficient parsing, this is transformed into the following form which implements the operator precedence as
 * part of the grammar:
 *
 * ```
 * license-expression -> or-expression
 * or-expression      -> and-expression ( "OR" and-expression ) *
 * and-expression     -> primary ( "AND" primary ) *
 * primary            -> "(" license-expression ")" | simple-expression
 * simple-expression  -> ( IDENTIFIER [ "+" ] | license-ref ) [ "WITH" ( IDENTIFIER | license-ref ) ]
 * license-ref        -> [ DOCUMENTREF ":" ] LICENSEREF
 * ```
 *
 * This allows implementing a
 * [recursive descent parser](https://en.wikipedia.org/wiki/Recursive_descent_parser) with
 * [Pratt parsing](https://en.wikipedia.org/wiki/Operator-precedence_parser#Pratt_parsing). The implementation is
 * loosely based on this
 * [example](https://journal.stuffwithstuff.com/2011/03/19/pratt-parsers-expression-parsing-made-easy/) but with many
 * simplifications as the SPDX grammar has only one operator per level of precedence and the parser does not need to be
 * extensible.
 *
 * Also, the rules for `license-id` and `license-exception-id` are changed to allow any valid `idstring` as the
 * [strictness] decides if only the SPDX identifiers are allowed for license and exception ids and therefore these rules
 * cannot be part of the grammar. In addition to plain identifiers, a license reference is accepted after "WITH".
 *
 * The [tokens][Token] are pulled lazily from the provided sequence, with the first token being requested when the
 * parser is constructed. Any exception thrown by the sequence while producing a token is not caught by the parser.
 */
class SpdxExpressionParser(
    tokens: Sequence<Token>,
    private val strictness: SpdxExpression.Strictness = SpdxExpression.Strictness.ALLOW_ANY
) {
    constructor(
        input: String,
        strictness: SpdxExpression.Strictness = SpdxExpression.Strictness.ALLOW_ANY
    ) : this(SpdxExpressionLexer(input).tokens(), strictness)

    private val iterator = tokens.iterator()

    // One token of lookahead; `null` once all tokens are consumed.
    private var next = iterator.nextOrNull()

    /**
     * Parse all tokens into a single [SpdxExpression]. Throw an [SpdxExpressionParserException] if the tokens do not
     * match the grammar, which includes tokens that are left over after a complete expression was parsed.
     */
    fun parse(): SpdxExpression {
        val result = parseOrExpression()
        if (next != null) throw SpdxExpressionParserException(next)
        return result
    }

    /**
     * Parse an OR expression of the form `or-expression -> and-expression ( "OR" and-expression ) *`.
     */
    private fun parseOrExpression(): SpdxExpression {
        var left = parseAndExpression()
        while (next is Token.OR) {
            consume<Token.OR>()
            val right = parseAndExpression()
            left = SpdxCompoundExpression(left, SpdxOperator.OR, right)
        }
        return left
    }

    /**
     * Parse an AND expression of the form `and-expression -> primary ( "AND" primary ) *`.
     */
    private fun parseAndExpression(): SpdxExpression {
        var left = parsePrimary()
        while (next is Token.AND) {
            consume<Token.AND>()
            val right = parsePrimary()
            left = SpdxCompoundExpression(left, SpdxOperator.AND, right)
        }
        return left
    }

    /**
     * Parse a primary of the form `primary -> "(" license-expression ")" | simple-expression`.
     */
    private fun parsePrimary(): SpdxExpression {
        if (next is Token.OPEN) {
            consume<Token.OPEN>()
            val expression = parseOrExpression()
            consume<Token.CLOSE>()
            return expression
        }

        return parseSimpleExpression()
    }

    /**
     * Parse a simple expression of the form
     * `simple-expression -> ( IDENTIFIER [ "+" ] | license-ref ) [ "WITH" ( IDENTIFIER | license-ref ) ]` with
     * `license-ref -> [ DOCUMENTREF ":" ] LICENSEREF`.
     */
    private fun parseSimpleExpression(): SpdxExpression {
        val left = when (next) {
            is Token.IDENTIFIER -> {
                val identifier = consume<Token.IDENTIFIER>()

                // Both a trailing "+" and an "-or-later" suffix of the identifier mark the "or later" variant.
                val orLaterVersion = next is Token.PLUS || identifier.value.endsWith("-or-later")
                if (next is Token.PLUS) consume<Token.PLUS>()

                SpdxLicenseIdExpression(identifier.value, orLaterVersion).apply { validate(strictness) }
            }

            is Token.DOCUMENTREF ->
                SpdxLicenseReferenceExpression(consumeDocumentLicenseRef()).apply { validate(strictness) }

            is Token.LICENSEREF -> {
                val licenseRef = consume<Token.LICENSEREF>()

                SpdxLicenseReferenceExpression(licenseRef.value).apply { validate(strictness) }
            }

            else -> throw SpdxExpressionParserException(next)
        }

        if (next is Token.WITH) {
            consume<Token.WITH>()
            val exception = when (next) {
                is Token.IDENTIFIER -> consume<Token.IDENTIFIER>().value
                is Token.LICENSEREF -> consume<Token.LICENSEREF>().value
                // The ":" is a token of its own, so it has to be consumed between the two references.
                is Token.DOCUMENTREF -> consumeDocumentLicenseRef()
                else -> throw SpdxExpressionParserException(
                    next,
                    Token.IDENTIFIER::class,
                    Token.LICENSEREF::class,
                    Token.DOCUMENTREF::class
                )
            }
            return SpdxLicenseWithExceptionExpression(left, exception).apply { validate(strictness) }
        }

        return left
    }

    /**
     * Consume a document-qualified license reference of the form `DOCUMENTREF ":" LICENSEREF` and return it as a
     * single string with the two values joined by a colon. Throw an [SpdxExpressionParserException] if the upcoming
     * tokens do not have this form.
     */
    private fun consumeDocumentLicenseRef(): String {
        val documentRef = consume<Token.DOCUMENTREF>()
        consume<Token.COLON>()
        val licenseRef = consume<Token.LICENSEREF>()

        return "${documentRef.value}:${licenseRef.value}"
    }

    /**
     * Consume the [next] token and return it if it is of the expected type [T], otherwise throw an
     * [SpdxExpressionParserException].
     */
    private inline fun <reified T : Token> consume(): T {
        val token = next
        if (token !is T) throw SpdxExpressionParserException(token, T::class)
        next = iterator.nextOrNull()
        return token
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
/* | ||
* Copyright (C) 2024 The ORT Project Authors (see <https://github.com/oss-review-toolkit/ort/blob/main/NOTICE>) | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* https://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
* | ||
* SPDX-License-Identifier: Apache-2.0 | ||
* License-Filename: LICENSE | ||
*/ | ||
|
||
package org.ossreviewtoolkit.utils.spdx.parser | ||
|
||
/**
 * The lexical units that the [SpdxExpressionLexer] emits and the [SpdxExpressionParser] reads.
 *
 * Single-character tokens and operator keywords only record a `position`. Tokens that carry text record the `start`
 * and `end` of that text together with the text itself as `value`.
 */
sealed class Token {
    // Operator keywords.

    /** The "AND" operator. */
    data class AND(val position: Int) : Token()

    /** The "OR" operator. */
    data class OR(val position: Int) : Token()

    /** The "WITH" operator. */
    data class WITH(val position: Int) : Token()

    // Single-character tokens.

    /** An opening parenthesis "(". */
    data class OPEN(val position: Int) : Token()

    /** A closing parenthesis ")". */
    data class CLOSE(val position: Int) : Token()

    /** A plus sign "+". */
    data class PLUS(val position: Int) : Token()

    /** A colon ":". */
    data class COLON(val position: Int) : Token()

    // Tokens that carry text.

    /** A word that is neither an operator keyword nor a reference. */
    data class IDENTIFIER(val start: Int, val end: Int, val value: String) : Token()

    /** A word that starts with "LicenseRef-". */
    data class LICENSEREF(val start: Int, val end: Int, val value: String) : Token()

    /** A word that starts with "DocumentRef-". */
    data class DOCUMENTREF(val start: Int, val end: Int, val value: String) : Token()
}
Oops, something went wrong.