From e78cae418657d2a13e14e73ba2fcae02a958e3e7 Mon Sep 17 00:00:00 2001 From: Nicolas Nobelis Date: Mon, 12 Feb 2024 11:03:06 +0100 Subject: [PATCH 1/2] feat(RepositoryConfiguration): Add support for snippet choice Signed-off-by: Nicolas Nobelis --- .../repository-configuration-schema.json | 84 ++++++++ .../kotlin/config/RepositoryConfiguration.kt | 8 +- .../src/main/kotlin/config/SnippetChoices.kt | 38 ++++ .../main/kotlin/config/snippet/Provenance.kt | 27 +++ .../kotlin/config/snippet/SnippetChoice.kt | 68 +++++++ .../config/snippet/SnippetChoiceReason.kt | 40 ++++ .../config/RepositoryConfigurationTest.kt | 29 +++ website/docs/configuration/snippet-choice.md | 192 ++++++++++++++++++ 8 files changed, 485 insertions(+), 1 deletion(-) create mode 100644 model/src/main/kotlin/config/SnippetChoices.kt create mode 100644 model/src/main/kotlin/config/snippet/Provenance.kt create mode 100644 model/src/main/kotlin/config/snippet/SnippetChoice.kt create mode 100644 model/src/main/kotlin/config/snippet/SnippetChoiceReason.kt create mode 100644 website/docs/configuration/snippet-choice.md diff --git a/integrations/schemas/repository-configuration-schema.json b/integrations/schemas/repository-configuration-schema.json index 560312d56d665..2dbebde457989 100644 --- a/integrations/schemas/repository-configuration-schema.json +++ b/integrations/schemas/repository-configuration-schema.json @@ -142,6 +142,83 @@ } } } + }, + "snippet_choices": { + "type": "array", + "description": "A configuration to select a snippet from a package with multiple snippet findings.", + "items": { + "type": "object", + "properties": { + "provenance": { + "type": "object", + "properties": { + "url": { + "type": "string" + } + }, + "required": [ + "url" + ] + }, + "choices": { + "type": "array", + "items": { + "type": "object", + "properties": { + "given": { + "type": "object", + "properties": { + "source_location": { + "type": "object", + "properties": { + "path": { + "type": "string" + }, + "start_line": 
{ + "type": "integer" + }, + "end_line": { + "type": "integer" + } + }, + "required": [ + "path", + "start_line", + "end_line" + ] + } + } + }, + "choice": { + "type": "object", + "properties": { + "purl": { + "type": "string" + }, + "reason": { + "$ref": "#/definitions/snippetChoiceReason" + }, + "comment": { + "type": "string" + } + }, + "required": [ + "reason" + ] + } + }, + "required": [ + "given", + "choice" + ] + } + } + }, + "required": [ + "provenance", + "choices" + ] + } } }, "definitions": { @@ -210,6 +287,13 @@ "NOT_DETECTED", "REFERENCE" ] + }, + "snippetChoiceReason": { + "enum": [ + "NO_RELEVANT_FINDING", + "ORIGINAL_FINDING", + "OTHER" + ] } } } diff --git a/model/src/main/kotlin/config/RepositoryConfiguration.kt b/model/src/main/kotlin/config/RepositoryConfiguration.kt index 85b24d4ab037e..4f645afcb3ef6 100644 --- a/model/src/main/kotlin/config/RepositoryConfiguration.kt +++ b/model/src/main/kotlin/config/RepositoryConfiguration.kt @@ -68,5 +68,11 @@ data class RepositoryConfiguration( * Defines license choices within this repository. */ @JsonInclude(value = JsonInclude.Include.CUSTOM, valueFilter = LicenseChoicesFilter::class) - val licenseChoices: LicenseChoices = LicenseChoices() + val licenseChoices: LicenseChoices = LicenseChoices(), + + /** + * Defines snippet choices for projects in this repository. + */ + @JsonInclude(value = JsonInclude.Include.NON_EMPTY) + val snippetChoices: List<SnippetChoices> = emptyList() ) diff --git a/model/src/main/kotlin/config/SnippetChoices.kt b/model/src/main/kotlin/config/SnippetChoices.kt new file mode 100644 index 0000000000000..40b35230b30c2 --- /dev/null +++ b/model/src/main/kotlin/config/SnippetChoices.kt @@ -0,0 +1,38 @@ +/* + * Copyright (C) 2024 The ORT Project Authors (see ) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * SPDX-License-Identifier: Apache-2.0 + * License-Filename: LICENSE + */ + +package org.ossreviewtoolkit.model.config + +import org.ossreviewtoolkit.model.config.snippet.Provenance +import org.ossreviewtoolkit.model.config.snippet.SnippetChoice + +/** + * A collection of snippet choices for a given provenance. + */ +data class SnippetChoices( + /** + * The provenance this snippet choice applies to. + */ + val provenance: Provenance, + + /** + * The snippet choices for this package. + */ + val choices: List<SnippetChoice> +) diff --git a/model/src/main/kotlin/config/snippet/Provenance.kt b/model/src/main/kotlin/config/snippet/Provenance.kt new file mode 100644 index 0000000000000..5ea8dc89388ff --- /dev/null +++ b/model/src/main/kotlin/config/snippet/Provenance.kt @@ -0,0 +1,27 @@ +/* + * Copyright (C) 2024 The ORT Project Authors (see ) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ * + * SPDX-License-Identifier: Apache-2.0 + * License-Filename: LICENSE + */ + +package org.ossreviewtoolkit.model.config.snippet + +import org.ossreviewtoolkit.model.RepositoryProvenance + +/** + * The URL of the [RepositoryProvenance] the snippet choice applies to. + */ +data class Provenance(val url: String) diff --git a/model/src/main/kotlin/config/snippet/SnippetChoice.kt b/model/src/main/kotlin/config/snippet/SnippetChoice.kt new file mode 100644 index 0000000000000..3fe260339dce0 --- /dev/null +++ b/model/src/main/kotlin/config/snippet/SnippetChoice.kt @@ -0,0 +1,68 @@ +/* + * Copyright (C) 2024 The ORT Project Authors (see ) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * SPDX-License-Identifier: Apache-2.0 + * License-Filename: LICENSE + */ + +package org.ossreviewtoolkit.model.config.snippet + +import org.ossreviewtoolkit.model.TextLocation + +/** + * A snippet choice for a given source file. + */ +data class SnippetChoice( + /** + * The source file criteria for which the snippet choice is made. + */ + val given: Given, + + /** + * The snippet criteria to make the snippet choice. + */ + val choice: Choice +) + +/** + * A source file criteria for which the snippet choice is made. + */ +data class Given( + /** + * The source file for which the snippet choice is made. + */ + val sourceLocation: TextLocation +) + +/** + * A snippet criteria to make the snippet choice. 
+ */ +data class Choice( + /** + * The purl of the snippet chosen by this snippet choice. If [reason] is [SnippetChoiceReason.NO_RELEVANT_FINDING], + * it is null. + */ + val purl: String? = null, + + /** + * The reason why this snippet choice was made. + */ + val reason: SnippetChoiceReason, + + /** + * An optional comment describing the snippet choice. + */ + val comment: String? = null +) diff --git a/model/src/main/kotlin/config/snippet/SnippetChoiceReason.kt b/model/src/main/kotlin/config/snippet/SnippetChoiceReason.kt new file mode 100644 index 0000000000000..38a9712ae08f0 --- /dev/null +++ b/model/src/main/kotlin/config/snippet/SnippetChoiceReason.kt @@ -0,0 +1,40 @@ +/* + * Copyright (C) 2024 The ORT Project Authors (see ) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * SPDX-License-Identifier: Apache-2.0 + * License-Filename: LICENSE + */ + +package org.ossreviewtoolkit.model.config.snippet + +/** + * The reason for which the snippet choice has been made. + */ +enum class SnippetChoiceReason { + /** + * No relevant finding has been found for the corresponding source file. All snippets will be ignored. + */ + NO_RELEVANT_FINDING, + + /** + * One snippet finding is relevant for the corresponding source file. All other snippets will be ignored. + */ + ORIGINAL_FINDING, + + /** + * Other reason. 
+ */ + OTHER +} diff --git a/model/src/test/kotlin/config/RepositoryConfigurationTest.kt b/model/src/test/kotlin/config/RepositoryConfigurationTest.kt index 1465c21f5b51f..d786e434031dd 100644 --- a/model/src/test/kotlin/config/RepositoryConfigurationTest.kt +++ b/model/src/test/kotlin/config/RepositoryConfigurationTest.kt @@ -24,12 +24,15 @@ import com.fasterxml.jackson.databind.exc.ValueInstantiationException import io.kotest.assertions.throwables.shouldThrow import io.kotest.core.spec.style.WordSpec import io.kotest.matchers.collections.haveSize +import io.kotest.matchers.nulls.beNull import io.kotest.matchers.should import io.kotest.matchers.shouldBe import io.kotest.matchers.string.shouldContain import io.kotest.matchers.string.shouldNotContain import org.ossreviewtoolkit.model.Identifier +import org.ossreviewtoolkit.model.TextLocation +import org.ossreviewtoolkit.model.config.snippet.SnippetChoiceReason import org.ossreviewtoolkit.model.fromYaml import org.ossreviewtoolkit.utils.spdx.toSpdx import org.ossreviewtoolkit.utils.test.shouldNotBeNull @@ -121,6 +124,18 @@ class RepositoryConfigurationTest : WordSpec({ - given: MPL-2.0 or EPL-1.0 choice: MPL-2.0 - choice: MPL-2.0 AND MIT + snippet_choices: + - provenance: + url: "https://github.com/vdurmont/semver4j.git" + choices: + - given: + source_location: + path: "CHANGELOG.md" + start_line: 2 + end_line: 5 + choice: + reason: "NO_RELEVANT_FINDING" + comment: "Explain why this location has only false positive snippets" """.trimIndent() val repositoryConfiguration = configuration.fromYaml() @@ -197,6 +212,20 @@ class RepositoryConfigurationTest : WordSpec({ choice shouldBe "MPL-2.0 AND MIT".toSpdx() } } + + val snippetChoices = repositoryConfiguration.snippetChoices + snippetChoices should haveSize(1) + + with(snippetChoices.first()) { + provenance.url shouldBe "https://github.com/vdurmont/semver4j.git" + with(choices.first()) { + given.sourceLocation shouldBe TextLocation("CHANGELOG.md", 2, 5) + + choice.purl 
should beNull() + choice.reason shouldBe SnippetChoiceReason.NO_RELEVANT_FINDING + choice.comment shouldBe "Explain why this location has only false positive snippets" + } + } } } }) diff --git a/website/docs/configuration/snippet-choice.md b/website/docs/configuration/snippet-choice.md new file mode 100644 index 0000000000000..e8f0204592123 --- /dev/null +++ b/website/docs/configuration/snippet-choice.md @@ -0,0 +1,192 @@ +# The snippet choice feature + +## Introduction + +Snippets are short pieces of code. They might come from different public sources such as GitHub, GitLab, or +Stack Overflow. + +So-called snippet scanners like ScanOSS and FossID scrape those public sources and build a knowledge base of snippets, +classifying them by their author, version and license. Then, the snippet scanner component of such products can use this +knowledge base to find if some source file contains some of those snippets. The matching of a snippet can be: + +* *full*: the whole snippet matches the source file +* *partial*: only some parts of a snippet match the source file + +Currently, ORT supports two products offering snippet scanner capabilities: ScanOSS and FossID. While each +implementation is specific, the base functionality is the same: ORT submits a source file to scan to the snippet scanner +and receives the list of the snippets matching some parts of this source file. ORT puts those snippets in the +`SnippetFindings` property of the [ScanSummary](https://github.com/oss-review-toolkit/ort/blob/main/model/src/main/kotlin/ScanSummary.kt). + +## The problem + +As mentioned in the previous section, ORT returns the list of snippet findings (i.e. matches) for a given source file. +These snippet findings are aggregated by source code location, including the line ranges. + +ORT lacks a mechanism to control the snippet results: while some of these results are legitimate, others are false +positives.
To decide which of these snippet findings are legitimate or false positives, the snippet choice feature +allows the user to: + +* Choose a snippet as the origin of the code snippet found in the project source code, discarding all other snippets for +this source location. +* Mark all the snippets for a given source location as false positives. + +Hence, the user can control which snippets are present in the ORT snippet results. + +## Choosing a snippet + +Let's say a source file `ci.yml` has been scanned with the following findings: + +```yaml +snippets: +- source_location: + path: ".github/workflows/ci.yml" + start_line: 3 + end_line: 17 + snippets: + - score: 0.93 + location: + path: "dot_config/nvim/autoload/plugged/vim-devicons/dot_github/workflows/vint.yml" + start_line: 3 + end_line: 18 + provenance: + source_artifact: + url: "https://github.com/RS2007/dotfiles/archive/0384a21038fd2e5befb429d0ca52384172607a6d.tar.gz" + hash: + value: "" + algorithm: "" + purl: "pkg:github/RS2007/dotfiles@0384a21038fd2e5befb429d0ca52384172607a6d" + licenses: "MIT" + - score: 0.93 + location: + path: "private_dot_config/nvim/plugged/vim-devicons/dot_github/workflows/vint.yml" + start_line: 3 + end_line: 18 + provenance: + source_artifact: + url: "https://github.com/stianfro/dotfiles/archive/b371008f262377599edac1c8ea23ef53da82f832.tar.gz" + hash: + value: "" + algorithm: "" + purl: "pkg:github/stianfro/dotfiles@b371008f262377599edac1c8ea23ef53da82f832" + licenses: "Apache-2.0" +``` + +Now an operator decided that the snippet `pkg:github/RS2007/dotfiles@0384a21038fd2e5befb429d0ca52384172607a6d` is +indeed a match and should be reflected in ORT results. 
To do so, the user defines in the repository's `.ort.yml` the +following **snippet choice**: + +```yaml +snippet_choices: +- provenance: + url: "https://github.com/vdurmont/semver4j.git" + choices: + - given: + source_location: + path: ".github/workflows/ci.yml" + start_line: 3 + end_line: 17 + choice: + purl: "pkg:github/RS2007/dotfiles@0384a21038fd2e5befb429d0ca52384172607a6d" + reason: "ORIGINAL_FINDING" + comment: "Explain why this snippet choice was made" +``` + +Three properties are required to identify the recipients of the snippet choice: + +* `provenance.url` is the provenance of the repository of the source file +* `choices.given.source_location` identifies the source file receiving the snippet choice. +* `choices.choice.purl` is the Purl identifying the snippet + +There is also another mandatory property: + +* `choices.choice.reason` is an enum member of [SnippetChoiceReason](https://github.com/oss-review-toolkit/ort/blob/main/model/src/main/kotlin/config/snippet/SnippetChoiceReason.kt). + +Finally, one property is *informative* and aims at making the snippet choice configuration more maintainable: + +* `choices.choice.comment` describes why the snippet choice was made. + +The snippet choice is an iterative process: one must first run ORT to get the snippets in the scan results. Then, one or +several snippets can be chosen in the `.ort.yml` file. Then, ORT is run again to generate new scan results, taking into +account those chosen snippets. This loop can be repeated as needed. + +### What are the consequences of a snippet choice ? + +1. The license of the chosen snippet will be added to the license findings + + The consequences are *scanner specific*: + + * For FossID, these findings are usually coming from files that have been marked as identified. With the snippet +choice, pending files with a chosen snippet should therefore be marked as identified with the license of the snippet as +identification.
+ * For ScanOSS, license findings are currently coming from *full matched* snippets. With the snippet choice, also +files with a partial snippet match (and a chosen snippet) will have the license of the snippet as license finding. + +2. For a chosen snippet of a given source code location, all the other snippets will be considered as +**false positives** and be removed from the scan results. As it makes no sense to choose two snippets for the same +source location, this powerful feature allows to keep in the scan results only snippet findings that require attention. +3. The snippets that have been chosen won't be visible in the FossID snippet report anymore. +4. The FossID files with a snippet choice are not *pending* anymore since they will be marked as identified. +Consequently, they won't be counted in the special "pending files count" ORT issue created by the FossID scanner. + +## Handling false positives + +Continuing with the example from [above](snippet-choice.md#choosing-a-snippet), a problem remains: How to deal with a +source location that has *only* false positives snippets? 
The solution is to use the `NO_RELEVANT_FINDING` reason in the +`.ort.yml` file: + +```yaml +snippet_choices: +- provenance: + url: "https://github.com/vdurmont/semver4j.git" + choices: + - given: + source_location: + path: "CHANGELOG.md" + start_line: 2 + end_line: 5 + choice: + reason: "NO_RELEVANT_FINDING" + comment: "Explain why this location has only false positive snippets" +``` + +Two properties are required to mark all the snippets for a given location as false positives: + +* `provenance.url` is the provenance of the repository of the source file +* `choices.given.source_location` identifies the source file against which the snippets have been matched + +There is also another mandatory property: + +* `choices.choice.reason` is always [SnippetChoiceReason.NO_RELEVANT_FINDING](https://github.com/oss-review-toolkit/ort/blob/main/model/src/main/kotlin/config/snippet/SnippetChoiceReason.kt#L26). + +And an optional one: + +* `choices.choice.comment` describes why the snippet is a false positive. + +### What are the consequences of snippets marked as *false positives* ? + +1. The snippets that are *false positives* are removed from the scan results. +2. The snippets that are *false positives* won't be visible in the FossID snippet report. + +## Snippet choice FAQ + +Q: *If the snippets with a snippet choice or a `NO_RELEVANT_FINDING` are not in the scan results anymore, +what happens when there is a new snippet finding for this source location (e.g. after an update of the Knowledge base)?* + +A: Conceptually, the scan results that have been removed in a past run may have to be added again, with the addition of +newly detected snippets. How to achieve that will be remote scanner-specific as FossID is stateful while ScanOSS is not. + +Q: *Identically, what happens if a snippet with a snippet choice or a `NO_RELEVANT_FINDING` is not present in +the scanner Knowledge Base anymore, e.g.
after an update?* + +A: This is problematic as it means the **.ort.yml** will be filled with snippet choices or *false positives* that are +not relevant anymore, i.e. garbage data. +This is the responsibility of ORT to always query the full snippet findings and compare them with the entries in the +`.ort.yml` file. Then, an issue can be raised to notify the user that a snippet choice or a +`NO_RELEVANT_FINDING` is not required anymore. +However, for FossID, querying the full snippet findings may be skipped if the state of FossID is used. + +Q: *What happens when the user deletes a previously chosen snippet in the .ort.yml file and retriggers a scan?* + +A: All the snippets for this source location need to be added again to the scan results. Therefore, as with the previous +question, the full snippet findings need to be queried. +And here too, in the case of FossID, its state can be used for that. From a84d4845233df05697c3e6ebca6580778b4e69ed Mon Sep 17 00:00:00 2001 From: Nicolas Nobelis Date: Mon, 12 Feb 2024 11:02:39 +0100 Subject: [PATCH 2/2] feat(scanner): Expose the snippet choices to the scanner By adding the snippet choices configuration to the scanner context, the snippet scanner implementations such as ScanOSS and FossID can be extended to take into account this configuration. This will be done in a future commit. 
Signed-off-by: Nicolas Nobelis --- scanner/src/main/kotlin/ScanContext.kt | 8 +++++++- scanner/src/main/kotlin/Scanner.kt | 6 ++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/scanner/src/main/kotlin/ScanContext.kt b/scanner/src/main/kotlin/ScanContext.kt index 4f37dd963d857..4959282410718 100644 --- a/scanner/src/main/kotlin/ScanContext.kt +++ b/scanner/src/main/kotlin/ScanContext.kt @@ -25,6 +25,7 @@ import org.ossreviewtoolkit.model.Package import org.ossreviewtoolkit.model.PackageType import org.ossreviewtoolkit.model.config.Excludes import org.ossreviewtoolkit.model.config.ScannerConfiguration +import org.ossreviewtoolkit.model.config.SnippetChoices import org.ossreviewtoolkit.utils.spdx.SpdxExpression /** @@ -60,5 +61,10 @@ data class ScanContext( * The packages known to be covered in the context of this scan. For package scanners, this is the list of packages * that have the same provenance as the reference package. */ - val coveredPackages: List<Package> = emptyList() + val coveredPackages: List<Package> = emptyList(), + + /** + * The [SnippetChoices] of the project to scan. + */ + val snippetChoices: List<SnippetChoices> = emptyList() ) diff --git a/scanner/src/main/kotlin/Scanner.kt b/scanner/src/main/kotlin/Scanner.kt index 594d1543a319f..0abb797769d60 100644 --- a/scanner/src/main/kotlin/Scanner.kt +++ b/scanner/src/main/kotlin/Scanner.kt @@ -115,7 +115,8 @@ class Scanner( ortResult.labels + labels, PackageType.PROJECT, ortResult.repository.config.excludes, - scannerConfig.detectedLicenseMapping + scannerConfig.detectedLicenseMapping, + snippetChoices = ortResult.repository.config.snippetChoices ) ) @@ -127,7 +128,8 @@ class Scanner( ortResult.labels, PackageType.PACKAGE, ortResult.repository.config.excludes, - scannerConfig.detectedLicenseMapping + scannerConfig.detectedLicenseMapping, + snippetChoices = ortResult.repository.config.snippetChoices ) )