From 6020fc0123c9c36e42cd3ee2b02a19ba85ff888c Mon Sep 17 00:00:00 2001
From: Giovanni M Guidini <99758426+giovanni-guidini@users.noreply.github.com>
Date: Fri, 29 Sep 2023 12:38:17 -0300
Subject: [PATCH] fix: better handle errors in static analysis (#271)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This was needed already.
But ultimately inspired by a user tha had the following error:

```
File "codecov_cli/services/staticanalysis/analyzers/python/node_wrappers.py", line 41, in do_visit
    first_if_statement = first_if_statement.children[0]
IndexError: list index out of range
```

Despite my best efforts, I was only able to reproduce this error using invalid Python code
(i.e. an empty `if` statement)

Regardless of what caused this issue in the 1st place, there may be valid reasons
for someone to have malformed code in their repo (depending what the repo is 👀)
So we should be able to still upload _something_ and ignore the failures.

The tricky thing here is that without the static analysis data ATS can't work well.
And we can't be sure that the analyzer doesn't have bugs. I don't think this case in
particular is one of them, but I could be wrong.

So these changes actually are a way to surface processing errors AND ignore the files
that generated them, WHILE also surfacing errors to users. They are surfaced twice.
Ideally users would look at them and let us know if something is not right.

But maybe we should still fail the static analysis step in their CI? 🧐
---
 .../services/staticanalysis/__init__.py       | 78 ++++++++++++++-----
 .../staticanalysis/analyzers/__init__.py      |  5 +-
 .../analyzers/python/__init__.py              |  7 +-
 .../analyzers/python/node_wrappers.py         | 37 ++++++---
 .../test_node_wrappers_malformed_code.py      | 65 ++++++++++++++++
 .../test_static_analysis_service.py           | 58 +++++++++++++-
 6 files changed, 215 insertions(+), 35 deletions(-)
 create mode 100644 tests/services/static_analysis/languages/python/test_node_wrappers_malformed_code.py

diff --git a/codecov_cli/services/staticanalysis/__init__.py b/codecov_cli/services/staticanalysis/__init__.py
index 2b01b4c8..aa2ab058 100644
--- a/codecov_cli/services/staticanalysis/__init__.py
+++ b/codecov_cli/services/staticanalysis/__init__.py
@@ -36,24 +36,15 @@ async def run_analysis_entrypoint(
 ):
     ff = select_file_finder(config)
     files = list(ff.find_files(folder, pattern, folders_to_exclude))
-    logger.info(f"Running the analyzer on {len(files)} files")
-    mapped_func = partial(analyze_file, config)
-    all_data = {}
-    file_metadata = []
-    with click.progressbar(
-        length=len(files),
-        label="Analyzing files",
-    ) as bar:
-        with get_context("fork").Pool(processes=numberprocesses) as pool:
-            file_results = pool.imap_unordered(mapped_func, files)
-            for x in file_results:
-                bar.update(1, x)
-                if x is not None:
-                    res = x.asdict()["result"]
-                    all_data[x.filename] = res
-                    file_metadata.append(
-                        {"filepath": x.filename, "file_hash": res["hash"]}
-                    )
+    processing_results = await process_files(files, numberprocesses, config)
+    # Let users know if there were processing errors
+    # This is here and not in the funcition so we can add an option to ignore those (possibly)
+    # Also makes the function easier to test
+    processing_errors = processing_results["processing_errors"]
+    log_processing_errors(processing_errors)
+    # Upload results metadata to codecov to get list of files that we need to upload
+    file_metadata = processing_results["file_metadata"]
+    all_data = processing_results["all_data"]
     try:
         json_output = {"commit": commit, "filepaths": file_metadata}
         logger.debug(
@@ -150,6 +141,51 @@ async def run_analysis_entrypoint(
             extra_log_attributes=dict(time_taken=response.elapsed.total_seconds())
         ),
     )
+    log_processing_errors(processing_errors)
+
+
+def log_processing_errors(processing_errors: typing.Dict[str, str]) -> None:
+    if len(processing_errors) > 0:
+        logger.error(
+            f"{len(processing_errors)} files have processing errors and have been IGNORED."
+        )
+        for file, error in processing_errors.items():
+            logger.error(f"-> {file}: ERROR {error}")
+
+
+async def process_files(
+    files_to_analyze: typing.List[FileAnalysisRequest],
+    numberprocesses: int,
+    config: typing.Optional[typing.Dict],
+):
+    logger.info(f"Running the analyzer on {len(files_to_analyze)} files")
+    mapped_func = partial(analyze_file, config)
+    all_data = {}
+    file_metadata = []
+    errors = {}
+    with click.progressbar(
+        length=len(files_to_analyze),
+        label="Analyzing files",
+    ) as bar:
+        with get_context("fork").Pool(processes=numberprocesses) as pool:
+            file_results = pool.imap_unordered(mapped_func, files_to_analyze)
+            for result in file_results:
+                bar.update(1, result)
+                if result is not None:
+                    if result.result:
+                        all_data[result.filename] = result.result
+                        file_metadata.append(
+                            {
+                                "filepath": result.filename,
+                                "file_hash": result.result["hash"],
+                            }
+                        )
+                    elif result.error:
+                        errors[result.filename] = result.error
+    logger.info("All files have been processed")
+    return dict(
+        all_data=all_data, file_metadata=file_metadata, processing_errors=errors
+    )
 
 
 async def send_single_upload_put(client, all_data, el):
@@ -205,7 +241,9 @@ def send_finish_signal(response_json, upload_url: str, token: str):
     return response
 
 
-def analyze_file(config, filename: FileAnalysisRequest):
+def analyze_file(
+    config, filename: FileAnalysisRequest
+) -> typing.Optional[FileAnalysisResult]:
     try:
         with open(filename.actual_filepath, "rb") as file:
             actual_code = file.read()
@@ -219,7 +257,7 @@ def analyze_file(config, filename: FileAnalysisRequest):
     except AnalysisError as e:
         error_dict = {
             "filename": str(filename.result_filename),
-            "error_class": str(type(e)),
+            "error": str(e),
         }
         return FileAnalysisResult(
             filename=str(filename.result_filename), error=error_dict
diff --git a/codecov_cli/services/staticanalysis/analyzers/__init__.py b/codecov_cli/services/staticanalysis/analyzers/__init__.py
index b91d588d..077cbc5f 100644
--- a/codecov_cli/services/staticanalysis/analyzers/__init__.py
+++ b/codecov_cli/services/staticanalysis/analyzers/__init__.py
@@ -1,9 +1,12 @@
 from codecov_cli.services.staticanalysis.analyzers.general import BaseAnalyzer
 from codecov_cli.services.staticanalysis.analyzers.javascript_es6 import ES6Analyzer
 from codecov_cli.services.staticanalysis.analyzers.python import PythonAnalyzer
+from codecov_cli.services.staticanalysis.types import FileAnalysisRequest
 
 
-def get_best_analyzer(filename, actual_code) -> BaseAnalyzer:
+def get_best_analyzer(
+    filename: FileAnalysisRequest, actual_code: bytes
+) -> BaseAnalyzer:
     if filename.actual_filepath.suffix == ".py":
         return PythonAnalyzer(filename, actual_code)
     if filename.actual_filepath.suffix == ".js":
diff --git a/codecov_cli/services/staticanalysis/analyzers/python/__init__.py b/codecov_cli/services/staticanalysis/analyzers/python/__init__.py
index af3e7697..e535698b 100644
--- a/codecov_cli/services/staticanalysis/analyzers/python/__init__.py
+++ b/codecov_cli/services/staticanalysis/analyzers/python/__init__.py
@@ -7,6 +7,7 @@
 from codecov_cli.services.staticanalysis.analyzers.python.node_wrappers import (
     NodeVisitor,
 )
+from codecov_cli.services.staticanalysis.types import FileAnalysisRequest
 
 _function_query_str = """
 (function_definition
@@ -52,14 +53,16 @@ class PythonAnalyzer(BaseAnalyzer):
     ]
     wrappers = ["class_definition", "function_definition"]
 
-    def __init__(self, path, actual_code, **options):
+    def __init__(
+        self, file_analysis_request: FileAnalysisRequest, actual_code: bytes, **options
+    ):
         self.actual_code = actual_code
         self.lines = self.actual_code.split(b"\n")
         self.statements = []
         self.import_lines = set()
         self.definitions_lines = set()
         self.functions = []
-        self.path = path.result_filename
+        self.path = file_analysis_request.result_filename
         self.PY_LANGUAGE = Language(staticcodecov_languages.__file__, "python")
         self.parser = Parser()
         self.parser.set_language(self.PY_LANGUAGE)
diff --git a/codecov_cli/services/staticanalysis/analyzers/python/node_wrappers.py b/codecov_cli/services/staticanalysis/analyzers/python/node_wrappers.py
index 28f9e03f..f313bfb7 100644
--- a/codecov_cli/services/staticanalysis/analyzers/python/node_wrappers.py
+++ b/codecov_cli/services/staticanalysis/analyzers/python/node_wrappers.py
@@ -1,3 +1,8 @@
+from tree_sitter import Node
+
+from codecov_cli.services.staticanalysis.exceptions import AnalysisError
+
+
 class NodeVisitor(object):
     def __init__(self, analyzer):
         self.analyzer = analyzer
@@ -5,12 +10,12 @@ def __init__(self, analyzer):
     def start_visit(self, node):
         self.visit(node)
 
-    def visit(self, node):
+    def visit(self, node: Node):
         self.do_visit(node)
         for c in node.children:
             self.visit(c)
 
-    def _is_function_docstring(self, node):
+    def _is_function_docstring(self, node: Node):
         """Skips docstrings for funtions, such as this one.
         Pytest doesn't include them in the report, so I don't think we should either,
         at least for now.
@@ -39,7 +44,7 @@ def _is_function_docstring(self, node):
             and is_in_function_context
         )
 
-    def _get_previous_sibling_that_is_not_comment_not_func_docstring(self, node):
+    def _get_previous_sibling_that_is_not_comment_not_func_docstring(self, node: Node):
         curr = node.prev_named_sibling
         while curr is not None and (
             curr.type == "comment" or self._is_function_docstring(curr)
@@ -47,7 +52,7 @@ def _get_previous_sibling_that_is_not_comment_not_func_docstring(self, node):
             curr = curr.prev_named_sibling
         return curr
 
-    def do_visit(self, node):
+    def do_visit(self, node: Node):
         if node.is_named:
             current_line_number = node.start_point[0] + 1
             if node.type in (
@@ -84,17 +89,29 @@ def do_visit(self, node):
                     }
                 )
             if node.type in ("if_statement", "elif_clause"):
+                # Some of the children of a node have a field_name associated to them
+                # In the case of an if and elif, "consequence" is the code that is executed in that branch of code
                 first_if_statement = node.child_by_field_name("consequence")
-                if first_if_statement.type == "block":
-                    first_if_statement = first_if_statement.children[0]
+                try:
+                    if first_if_statement.type == "block":
+                        first_if_statement = first_if_statement.children[0]  # BUG
+                except IndexError:
+                    raise AnalysisError(
+                        f"if_statement consequence is empty block @ {self.analyzer.path}:{first_if_statement.start_point[0] + 1}, column {first_if_statement.start_point[1]}"
+                    )
                 self.analyzer.line_surety_ancestorship[
                     first_if_statement.start_point[0] + 1
                 ] = current_line_number
             if node.type in ("for_statement", "while_statement"):
-                first_if_statement = node.child_by_field_name("body")
-                if first_if_statement.type == "block":
-                    first_if_statement = first_if_statement.children[0]
+                first_loop_statement = node.child_by_field_name("body")
+                try:
+                    if first_loop_statement.type == "block":
+                        first_loop_statement = first_loop_statement.children[0]
+                except IndexError:
+                    raise AnalysisError(
+                        f"loop_statement body is empty block @ {self.analyzer.path}:{first_loop_statement.start_point[0] + 1}, column {first_loop_statement.start_point[1]}"
+                    )
                 self.analyzer.line_surety_ancestorship[
-                    first_if_statement.start_point[0] + 1
+                    first_loop_statement.start_point[0] + 1
                 ] = current_line_number
                 pass
diff --git a/tests/services/static_analysis/languages/python/test_node_wrappers_malformed_code.py b/tests/services/static_analysis/languages/python/test_node_wrappers_malformed_code.py
new file mode 100644
index 00000000..aa7f7fd1
--- /dev/null
+++ b/tests/services/static_analysis/languages/python/test_node_wrappers_malformed_code.py
@@ -0,0 +1,65 @@
+import pathlib
+
+import pytest
+from tree_sitter import Node
+
+from codecov_cli.services.staticanalysis.analyzers.python import PythonAnalyzer
+from codecov_cli.services.staticanalysis.analyzers.python.node_wrappers import (
+    NodeVisitor,
+)
+from codecov_cli.services.staticanalysis.exceptions import AnalysisError
+from codecov_cli.services.staticanalysis.types import FileAnalysisRequest
+
+
+class TestMalformedIfStatements(object):
+    def test_if_empty_block_raises_analysis_error(self):
+        analysis_request = FileAnalysisRequest(
+            actual_filepath=pathlib.Path("test_file"), result_filename="test_file"
+        )
+        # Code for an empty IF. NOT valid python code
+        actual_code = b'x = 10\nif x == "batata":\n\n'
+        python_analyser = PythonAnalyzer(analysis_request, actual_code=actual_code)
+        # Parse the code snippet and get the if_statement node
+        tree = python_analyser.parser.parse(actual_code)
+        root = tree.root_node
+        assert root.type == "module"
+        assert root.child_count == 2
+        if_statement_node = root.children[1]
+        assert if_statement_node.type == "if_statement"
+        # Make sure it is indeed an empty if_statement
+        if_body = if_statement_node.child_by_field_name("consequence")
+        assert if_body.type == "block"
+        assert if_body.child_count == 0
+        visitor = NodeVisitor(python_analyser)
+        with pytest.raises(AnalysisError) as exp:
+            visitor.do_visit(if_statement_node)
+        assert (
+            str(exp.value)
+            == "if_statement consequence is empty block @ test_file:2, column 17"
+        )
+
+    def test_for_empty_block_raises_analysis_error(self):
+        analysis_request = FileAnalysisRequest(
+            actual_filepath=pathlib.Path("test_file"), result_filename="test_file"
+        )
+        # Code for an empty IF. NOT valid python code
+        actual_code = b"for x in range(10):\n\n"
+        python_analyser = PythonAnalyzer(analysis_request, actual_code=actual_code)
+        # Parse the code snippet and get the if_statement node
+        tree = python_analyser.parser.parse(actual_code)
+        root = tree.root_node
+        assert root.type == "module"
+        assert root.child_count == 1
+        for_statement_node = root.children[0]
+        assert for_statement_node.type == "for_statement"
+        # Make sure it is indeed an empty if_statement
+        if_body = for_statement_node.child_by_field_name("body")
+        assert if_body.type == "block"
+        assert if_body.child_count == 0
+        visitor = NodeVisitor(python_analyser)
+        with pytest.raises(AnalysisError) as exp:
+            visitor.do_visit(for_statement_node)
+        assert (
+            str(exp.value)
+            == "loop_statement body is empty block @ test_file:1, column 19"
+        )
diff --git a/tests/services/static_analysis/test_static_analysis_service.py b/tests/services/static_analysis/test_static_analysis_service.py
index aaac3f0e..ea443700 100644
--- a/tests/services/static_analysis/test_static_analysis_service.py
+++ b/tests/services/static_analysis/test_static_analysis_service.py
@@ -7,11 +7,65 @@
 import responses
 from responses import matchers
 
-from codecov_cli.services.staticanalysis import run_analysis_entrypoint
-from codecov_cli.services.staticanalysis.types import FileAnalysisRequest
+from codecov_cli.services.staticanalysis import process_files, run_analysis_entrypoint
+from codecov_cli.services.staticanalysis.types import (
+    FileAnalysisRequest,
+    FileAnalysisResult,
+)
 
 
 class TestStaticAnalysisService:
+    @pytest.mark.asyncio
+    async def test_process_files_with_error(self, mocker):
+        files_found = list(
+            map(
+                lambda filename: FileAnalysisRequest(str(filename), Path(filename)),
+                [
+                    "correct_file.py",
+                    "error_file.py",
+                ],
+            )
+        )
+        mock_get_context = mocker.patch(
+            "codecov_cli.services.staticanalysis.get_context"
+        )
+
+        def side_effect(config, filename: FileAnalysisRequest):
+            if filename.result_filename == "correct_file.py":
+                return FileAnalysisResult(
+                    filename=filename.result_filename, result={"hash": "abc123"}
+                )
+            if filename.result_filename == "error_file.py":
+                return FileAnalysisResult(
+                    filename=filename.result_filename, error="some error @ line 12"
+                )
+            # Should not get here, so fail test
+            assert False
+
+        mock_analyze_function = mocker.patch(
+            "codecov_cli.services.staticanalysis.analyze_file"
+        )
+        mock_analyze_function.side_effect = side_effect
+
+        def imap_side_effect(mapped_func, files):
+            results = []
+            for file in files:
+                results.append(mapped_func(file))
+            return results
+
+        mock_get_context.return_value.Pool.return_value.__enter__.return_value.imap_unordered.side_effect = (
+            imap_side_effect
+        )
+
+        results = await process_files(files_found, 1, {})
+        mock_get_context.return_value.Pool.return_value.__enter__.return_value.imap_unordered.assert_called()
+        assert mock_analyze_function.call_count == 2
+        assert results == dict(
+            all_data={"correct_file.py": {"hash": "abc123"}},
+            file_metadata=[{"file_hash": "abc123", "filepath": "correct_file.py"}],
+            processing_errors={"error_file.py": "some error @ line 12"},
+        )
+
     @pytest.mark.asyncio
     async def test_static_analysis_service(self, mocker):
         mock_file_finder = mocker.patch(