From 376bf37917854ba66317f31b5c7b4803fe9eb364 Mon Sep 17 00:00:00 2001 From: Nicolas Nobelis Date: Mon, 12 Feb 2024 11:03:06 +0100 Subject: [PATCH] feat(RepositoryConfiguration): Add support for snippet choice Signed-off-by: Nicolas Nobelis --- .../repository-configuration-schema.json | 84 ++++++++ .../kotlin/config/RepositoryConfiguration.kt | 8 +- .../src/main/kotlin/config/SnippetChoices.kt | 43 ++++ .../kotlin/config/snippet/SnippetChoice.kt | 68 +++++++ .../config/snippet/SnippetChoiceReason.kt | 40 ++++ .../config/RepositoryConfigurationTest.kt | 29 +++ website/docs/configuration/snippet-choice.md | 187 ++++++++++++++++++ 7 files changed, 458 insertions(+), 1 deletion(-) create mode 100644 model/src/main/kotlin/config/SnippetChoices.kt create mode 100644 model/src/main/kotlin/config/snippet/SnippetChoice.kt create mode 100644 model/src/main/kotlin/config/snippet/SnippetChoiceReason.kt create mode 100644 website/docs/configuration/snippet-choice.md diff --git a/integrations/schemas/repository-configuration-schema.json b/integrations/schemas/repository-configuration-schema.json index 560312d56d665..2dbebde457989 100644 --- a/integrations/schemas/repository-configuration-schema.json +++ b/integrations/schemas/repository-configuration-schema.json @@ -142,6 +142,83 @@ } } } + }, + "snippet_choices": { + "type": "array", + "description": "A configuration to select a snippet from a package with multiple snippet findings.", + "items": { + "type": "object", + "properties": { + "provenance": { + "type": "object", + "properties": { + "url": { + "type": "string" + } + }, + "required": [ + "url" + ] + }, + "choices": { + "type": "array", + "items": { + "type": "object", + "properties": { + "given": { + "type": "object", + "properties": { + "source_location": { + "type": "object", + "properties": { + "path": { + "type": "string" + }, + "start_line": { + "type": "integer" + }, + "end_line": { + "type": "integer" + } + }, + "required": [ + "path", + "start_line", + "end_line" + ] 
+ } + } + }, + "choice": { + "type": "object", + "properties": { + "purl": { + "type": "string" + }, + "reason": { + "$ref": "#/definitions/snippetChoiceReason" + }, + "comment": { + "type": "string" + } + }, + "required": [ + "reason" + ] + } + }, + "required": [ + "given", + "choice" + ] + } + } + }, + "required": [ + "provenance", + "choices" + ] + } } }, "definitions": { @@ -210,6 +287,13 @@ "NOT_DETECTED", "REFERENCE" ] + }, + "snippetChoiceReason": { + "enum": [ + "NO_RELEVANT_FINDING", + "ORIGINAL_FINDING", + "OTHER" + ] } } } diff --git a/model/src/main/kotlin/config/RepositoryConfiguration.kt b/model/src/main/kotlin/config/RepositoryConfiguration.kt index 85b24d4ab037e..4f645afcb3ef6 100644 --- a/model/src/main/kotlin/config/RepositoryConfiguration.kt +++ b/model/src/main/kotlin/config/RepositoryConfiguration.kt @@ -68,5 +68,11 @@ data class RepositoryConfiguration( * Defines license choices within this repository. */ @JsonInclude(value = JsonInclude.Include.CUSTOM, valueFilter = LicenseChoicesFilter::class) - val licenseChoices: LicenseChoices = LicenseChoices() + val licenseChoices: LicenseChoices = LicenseChoices(), + + /** + * Defines snippet choices for projects in this repository. + */ + @JsonInclude(value = JsonInclude.Include.NON_EMPTY) + val snippetChoices: List<SnippetChoices> = emptyList() ) diff --git a/model/src/main/kotlin/config/SnippetChoices.kt b/model/src/main/kotlin/config/SnippetChoices.kt new file mode 100644 index 0000000000000..f7b05e3fc63a2 --- /dev/null +++ b/model/src/main/kotlin/config/SnippetChoices.kt @@ -0,0 +1,43 @@ +/* + * Copyright (C) 2024 The ORT Project Authors (see <https://github.com/oss-review-toolkit/ort/blob/main/NOTICE>) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * SPDX-License-Identifier: Apache-2.0 + * License-Filename: LICENSE + */ + +package org.ossreviewtoolkit.model.config + +import org.ossreviewtoolkit.model.RepositoryProvenance +import org.ossreviewtoolkit.model.config.snippet.SnippetChoice + +/** + * A collection of snippet choices for a given provenance. + */ +data class SnippetChoices( + /** + * The provenance this snippet choice applies to. + */ + val provenance: Provenance, + + /** + * The snippet choices for this provenance. + */ + val choices: List<SnippetChoice> +) + +/** + * The URL of the [RepositoryProvenance] the snippet choice applies to. + */ +data class Provenance(val url: String) diff --git a/model/src/main/kotlin/config/snippet/SnippetChoice.kt b/model/src/main/kotlin/config/snippet/SnippetChoice.kt new file mode 100644 index 0000000000000..181540603fa30 --- /dev/null +++ b/model/src/main/kotlin/config/snippet/SnippetChoice.kt @@ -0,0 +1,68 @@ +/* + * Copyright (C) 2024 The ORT Project Authors (see <https://github.com/oss-review-toolkit/ort/blob/main/NOTICE>) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * License-Filename: LICENSE + */ + +package org.ossreviewtoolkit.model.config.snippet + +import org.ossreviewtoolkit.model.TextLocation + +/** + * A snippet choice for a given source file. + */ +data class SnippetChoice( + /** + * The source file criteria for which the snippet choice is made. + */ + val given: Given, + + /** + * The snippet criteria to make the snippet choice. + */ + val choice: Choice +) + +/** + * A source file criteria for which the snippet choice is made. + */ +data class Given( + /** + * The location in the source file for which the snippet choice is made. + */ + val sourceLocation: TextLocation +) + +/** + * A snippet criteria to make the snippet choice. + */ +data class Choice( + /** + * The purl of the snippet chosen by this snippet choice. If [reason] is [SnippetChoiceReason.NO_RELEVANT_FINDING], + * it is null. + */ + val purl: String? = null, + + /** + * The reason why this snippet choice was made. + */ + val reason: SnippetChoiceReason, + + /** + * An optional comment describing the snippet choice. + */ + val comment: String? = null +) diff --git a/model/src/main/kotlin/config/snippet/SnippetChoiceReason.kt b/model/src/main/kotlin/config/snippet/SnippetChoiceReason.kt new file mode 100644 index 0000000000000..38a9712ae08f0 --- /dev/null +++ b/model/src/main/kotlin/config/snippet/SnippetChoiceReason.kt @@ -0,0 +1,40 @@ +/* + * Copyright (C) 2024 The ORT Project Authors (see <https://github.com/oss-review-toolkit/ort/blob/main/NOTICE>) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + * + * SPDX-License-Identifier: Apache-2.0 + * License-Filename: LICENSE + */ + +package org.ossreviewtoolkit.model.config.snippet + +/** + * The reason why a snippet choice has been made. + */ +enum class SnippetChoiceReason { + /** + * No relevant finding has been found for the corresponding source file. All snippets will be ignored. + */ + NO_RELEVANT_FINDING, + + /** + * One snippet finding is relevant for the corresponding source file. All other snippets will be ignored. + */ + ORIGINAL_FINDING, + + /** + * Any other reason not covered by the values above. + */ + OTHER +} diff --git a/model/src/test/kotlin/config/RepositoryConfigurationTest.kt b/model/src/test/kotlin/config/RepositoryConfigurationTest.kt index 1465c21f5b51f..74ea6792e1d36 100644 --- a/model/src/test/kotlin/config/RepositoryConfigurationTest.kt +++ b/model/src/test/kotlin/config/RepositoryConfigurationTest.kt @@ -24,12 +24,15 @@ import com.fasterxml.jackson.databind.exc.ValueInstantiationException import io.kotest.assertions.throwables.shouldThrow import io.kotest.core.spec.style.WordSpec import io.kotest.matchers.collections.haveSize +import io.kotest.matchers.nulls.beNull import io.kotest.matchers.should import io.kotest.matchers.shouldBe import io.kotest.matchers.string.shouldContain import io.kotest.matchers.string.shouldNotContain import org.ossreviewtoolkit.model.Identifier +import org.ossreviewtoolkit.model.TextLocation +import org.ossreviewtoolkit.model.config.snippet.SnippetChoiceReason import org.ossreviewtoolkit.model.fromYaml import org.ossreviewtoolkit.utils.spdx.toSpdx import org.ossreviewtoolkit.utils.test.shouldNotBeNull @@ -121,6 +124,18 @@ class RepositoryConfigurationTest : WordSpec({ - given: MPL-2.0 or EPL-1.0 choice: MPL-2.0 - choice: MPL-2.0 AND MIT + snippet_choices: + - provenance: + url: "https://github.com/vdurmont/semver4j.git" + choices: + - given: + source_location: + path: 
"CHANGELOG.md" + start_line: 2 + end_line: 5 + choice: + reason: "NO_RELEVANT_FINDING" + comment: "Explain why this location has only false positives snippets" """.trimIndent() val repositoryConfiguration = configuration.fromYaml<RepositoryConfiguration>() @@ -197,6 +212,20 @@ class RepositoryConfigurationTest : WordSpec({ choice shouldBe "MPL-2.0 AND MIT".toSpdx() } } + + val snippetChoices = repositoryConfiguration.snippetChoices + snippetChoices should haveSize(1) + + with(snippetChoices.first()) { + provenance.url shouldBe "https://github.com/vdurmont/semver4j.git" + with(choices.first()) { + given.sourceLocation shouldBe TextLocation("CHANGELOG.md", 2, 5) + + choice.purl should beNull() + choice.reason shouldBe SnippetChoiceReason.NO_RELEVANT_FINDING + choice.comment shouldBe "Explain why this location has only false positives snippets" + } + } } } }) diff --git a/website/docs/configuration/snippet-choice.md b/website/docs/configuration/snippet-choice.md new file mode 100644 index 0000000000000..9ea946ec67b48 --- /dev/null +++ b/website/docs/configuration/snippet-choice.md @@ -0,0 +1,187 @@ +The snippets are short pieces of code coming from different public sources such as GitHub, GitLab, Stack Overflow, ... + +Some products such as ScanOSS and FossID scrape those public sources and build a Knowledge Base of snippets, classifying +them by their Author, Version and License. Then, the snippet scanner component of such products can use this Knowledge +Base to find if some source file contains some of those snippets. The matching of a snippet can be: + +* *full*: the whole snippet matches the source file +* *partial*: only some parts of a snippet match the source file + +Currently, ORT supports two products offering snippet scanner capabilities: ScanOSS and FossID. While each +implementation is specific, the base workings are the same: ORT submits a source file to scan to the snippet scanner and +receives the list of the snippets matching some parts of this source file. 
ORT puts those snippets in the +`SnippetFindings` property of the [ScanSummary](https://github.com/oss-review-toolkit/ort/blob/main/model/src/main/kotlin/ScanSummary.kt). + +# Snippet choice + +As mentioned in the previous section, ORT returns the list of snippets (i.e. matches) for a given source file. These +snippets are aggregated by source code location, including the line ranges. + +ORT lacks a mechanism to control the snippets results: while some of these results are legitimate, others are false +positives. The snippet choice feature aims at filling this gap by allowing the user to: + +* Choose a snippet as the origin of the code snippet found in the project source code, discarding all other snippets for +this source location. +* Mark all the snippets for a given source location as false positives. + +Hence, the user can control which snippets are present in the ORT snippet results. + +## Choosing a snippet + +Let's say a source file `ci.yml` has been scanned with the following findings: + +```yaml +snippets: + - source_location: + path: ".github/workflows/ci.yml" + start_line: 3 + end_line: 17 + snippets: + - score: 0.93 + location: + path: "dot_config/nvim/autoload/plugged/vim-devicons/dot_github/workflows/vint.yml" + start_line: 3 + end_line: 18 + provenance: + source_artifact: + url: "https://github.com/RS2007/dotfiles/archive/0384a21038fd2e5befb429d0ca52384172607a6d.tar.gz" + hash: + value: "" + algorithm: "" + purl: "pkg:github/RS2007/dotfiles@0384a21038fd2e5befb429d0ca52384172607a6d" + licenses: "MIT" + - score: 0.93 + location: + path: "private_dot_config/nvim/plugged/vim-devicons/dot_github/workflows/vint.yml" + start_line: 3 + end_line: 18 + provenance: + source_artifact: + url: "https://github.com/stianfro/dotfiles/archive/b371008f262377599edac1c8ea23ef53da82f832.tar.gz" + hash: + value: "" + algorithm: "" + purl: "pkg:github/stianfro/dotfiles@b371008f262377599edac1c8ea23ef53da82f832" + licenses: "Apache-2.0" +``` + +Now an operator has decided the 
snippet `pkg:github/RS2007/dotfiles@0384a21038fd2e5befb429d0ca52384172607a6d` is +indeed a match and should be reflected in ORT results. To do so, the user defines in the repository's `.ort.yml` the +following **snippet choice**: + +```yaml +snippet_choices: + - provenance: + url: "https://github.com/vdurmont/semver4j.git" + choices: + - given: + source_location: + path: ".github/workflows/ci.yml" + start_line: 3 + end_line: 17 + choice: + purl: "pkg:github/RS2007/dotfiles@0384a21038fd2e5befb429d0ca52384172607a6d" + reason: "ORIGINAL_FINDING" + comment: "Explain why this snippet choice was made" +``` + +Three properties are required to identify the recipients of the snippet choice: + +* `provenance.url` is the provenance of the repository of the source file +* `choices.given.source_location` identifies the source file receiving the snippet choice. +* `choices.choice.purl` is the Purl identifying the snippet + +There is also another mandatory property: + +* `choices.choice.reason` is an enum member of [SnippetChoiceReason](https://github.com/oss-review-toolkit/ort/blob/main/model/src/main/kotlin/config/snippet/SnippetChoiceReason.kt). + +Finally, one property is *informative* and aims at making the snippet choice configuration more maintainable: + +* `choices.choice.comment` describes why the snippet choice was made. + +The snippet choice is an iterative process: one must first run ORT to get the snippets in the scan results. Then, one or +several snippets can be chosen in the `.ort.yml` file. Then, ORT is run again to generate new scan results, taking into +account those chosen snippets. This loop can be repeated as needed. + +### What are the consequences of a snippet choice? + +1. The license of the chosen snippet will be added to the license findings + + The consequences are *scanner specific*: + + * For FossID, these findings are usually coming from files that have been marked as identified. 
With the snippet +choice, pending files with a chosen snippet should therefore be marked as identified with the license of the snippet as +identification. + * For ScanOSS, license findings are currently coming from *fully matched* snippets. With the snippet choice, also files +with a partial snippet match (and a chosen snippet) will have the license of the snippet as license finding. + +2. For a chosen snippet of a given source code location, all the other snippets will be considered as +**false positives** and be removed from the scan results. As it makes no sense to choose two snippets for the same source +location, this powerful feature allows to keep in the scan results only snippet findings that require attention. +3. The snippets that have been chosen won't be visible in the FossID snippet report anymore. +4. The FossID files with a snippet choice are not *pending* anymore since they will be marked as identified. +Consequently, they won't be counted in the special "pending files count" ORT issue created by the FossID scanner. + +## Handling false positives + +Continuing with the example from [above](snippet-choice.md#choosing-a-snippet), a problem remains: How to deal with a +source location that has *only* false positive snippets? 
The solution is to use the `NO_RELEVANT_FINDING` reason in +the `.ort.yml` file: + +```yaml + +snippet_choices: + - provenance: + url: "https://github.com/vdurmont/semver4j.git" + choices: + - given: + source_location: + path: "CHANGELOG.md" + start_line: 2 + end_line: 5 + choice: + reason: "NO_RELEVANT_FINDING" + comment: "Explain why this location has only false positives snippets" +``` + +Two properties are required to mark all the snippets for a given location as false positives: + +* `provenance.url` is the provenance of the repository of the source file +* `choices.given.source_location` identifies the source file against which the snippets have been matched + +There is also another mandatory property: + +* `choices.choice.reason` is always [SnippetChoiceReason.NO_RELEVANT_FINDING](https://github.com/oss-review-toolkit/ort/blob/main/model/src/main/kotlin/config/snippet/SnippetChoiceReason.kt#L26). + +And an optional one: + +* `choices.choice.comment` describes why the snippet is a false positive. + +### What are the consequences of snippets marked as *false positives*? + +1. The snippets that are *false positives* are removed from the scan results. +2. The snippets that are *false positives* won't be visible in the FossID snippet report. + +## Snippet choice FAQ + +Q: *If the snippets with a snippet choice or a `NO_RELEVANT_FINDING` are not in the scan results anymore, +what happens when there is a new snippet finding for this source location (e.g. after an update of the Knowledge base)?* + +A: Conceptually, the scan results that have been removed in a past run may have to be added again, with the addition of +newly detected snippets. How to achieve that will be remote scanner-specific as FossID is stateful while ScanOSS is not. + +Q: Identically, *what happens if a snippet with a snippet choice or a `NO_RELEVANT_FINDING` is not present in +the scanner Knowledge Base anymore, e.g. 
after an update?* + +A: This is problematic as it means the **.ort.yml** will be filled with snippet choices or *false positives* that are +not relevant anymore, i.e. garbage data. +It is the responsibility of ORT to always query the full snippet findings and compare them with the entries in the +`.ort.yml` file. Then, an issue can be raised to notify the user that a snippet choice or a +`NO_RELEVANT_FINDING` is not required anymore. +However, for FossID, querying the full snippet findings may be skipped if the state of FossID is used. + +Q: *What happens when the user deletes a previously chosen snippet in the .ort.yml file and retriggers a scan?* + +A: All the snippets for this source location need to be added again to the scan results. Therefore, as with the previous +question, the full snippet findings need to be queried. +And here too, in the case of FossID, its state can be used for that.