fix: better handle errors in static analysis
This was needed already, but it was ultimately inspired by a user who had the following error:

```
File "codecov_cli/services/staticanalysis/analyzers/python/node_wrappers.py", line 41, in do_visit
    first_if_statement = first_if_statement.children[0]
IndexError: list index out of range
```

Despite my best efforts, I was only able to reproduce this error with invalid Python code
(i.e. an empty `if` statement).
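
Roughly, the kind of file that triggers it (this snippet is illustrative only, not the user's code):

```python
# malformed_example.py (illustrative only, not the user's code):
# an `if` with no body. This is not valid Python, but tree-sitter still
# parses it, and the "consequence" block it produces can end up with no
# children, which is what trips the `children[0]` lookup in do_visit.
if some_condition:
```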

Regardless of what caused this issue in the first place, there may be valid reasons
for someone to have malformed code in their repo (depending on what the repo is 👀),
so we should still be able to upload _something_ and ignore the failures.

The tricky thing here is that without the static analysis data, ATS can't work well.
And we can't be sure that the analyzer doesn't have bugs. I don't think this case in
particular is one of them, but I could be wrong.

So these changes are a way to ignore the files that produced processing errors WHILE
still surfacing those errors to users (they are logged twice: once right after processing
and once again after the upload). Ideally users would look at them and let us know if
something is not right.
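
With these changes, a run over such a repo keeps going and reports something like the following (filenames are placeholders; the format is the one used by `log_processing_errors` in the diff below):

```
2 files have processing errors and have been IGNORED.
-> some_module.py: ERROR <details of the AnalysisError for that file>
-> other_module.py: ERROR <details of the AnalysisError for that file>
```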

But maybe we should still fail the static analysis step in their CI? 🧐
giovanni-guidini committed Sep 29, 2023
1 parent 01986b6 commit 22b4630
Showing 5 changed files with 150 additions and 35 deletions.
78 changes: 58 additions & 20 deletions codecov_cli/services/staticanalysis/__init__.py
@@ -36,24 +36,15 @@ async def run_analysis_entrypoint(
):
ff = select_file_finder(config)
files = list(ff.find_files(folder, pattern, folders_to_exclude))
logger.info(f"Running the analyzer on {len(files)} files")
mapped_func = partial(analyze_file, config)
all_data = {}
file_metadata = []
with click.progressbar(
length=len(files),
label="Analyzing files",
) as bar:
with get_context("fork").Pool(processes=numberprocesses) as pool:
file_results = pool.imap_unordered(mapped_func, files)
for x in file_results:
bar.update(1, x)
if x is not None:
res = x.asdict()["result"]
all_data[x.filename] = res
file_metadata.append(
{"filepath": x.filename, "file_hash": res["hash"]}
)
processing_results = await process_files(files, numberprocesses, config)
# Let users know if there were processing errors
# This is here and not in the function so we can add an option to ignore those (possibly)
# Also makes the function easier to test
processing_errors = processing_results["processing_errors"]
log_processing_errors(processing_errors)
# Upload results metadata to codecov to get list of files that we need to upload
file_metadata = processing_results["file_metadata"]
all_data = processing_results["all_data"]
try:
json_output = {"commit": commit, "filepaths": file_metadata}
logger.debug(
@@ -150,6 +141,51 @@ async def run_analysis_entrypoint(
extra_log_attributes=dict(time_taken=response.elapsed.total_seconds())
),
)
log_processing_errors(processing_errors)


def log_processing_errors(processing_errors: typing.Dict[str, str]) -> None:
if len(processing_errors) > 0:
logger.error(
f"{len(processing_errors)} files have processing errors and have been IGNORED."
)
for file, error in processing_errors.items():
logger.error(f"-> {file}: ERROR {error}")


async def process_files(
files_to_analyze: typing.List[FileAnalysisRequest],
numberprocesses: int,
config: typing.Optional[typing.Dict],
):
logger.info(f"Running the analyzer on {len(files_to_analyze)} files")
mapped_func = partial(analyze_file, config)
all_data = {}
file_metadata = []
errors = {}
with click.progressbar(
length=len(files_to_analyze),
label="Analyzing files",
) as bar:
with get_context("fork").Pool(processes=numberprocesses) as pool:
file_results = pool.imap_unordered(mapped_func, files_to_analyze)
for result in file_results:
bar.update(1, result)
if result is not None:
if result.result:
all_data[result.filename] = result.result
file_metadata.append(
{
"filepath": result.filename,
"file_hash": result.result["hash"],
}
)
elif result.error:
errors[result.filename] = result.error
logger.info("All files have been processed")
return dict(
all_data=all_data, file_metadata=file_metadata, processing_errors=errors
)


async def send_single_upload_put(client, all_data, el):
@@ -205,7 +241,9 @@ def send_finish_signal(response_json, upload_url: str, token: str):
return response


def analyze_file(config, filename: FileAnalysisRequest):
def analyze_file(
config, filename: FileAnalysisRequest
) -> typing.Optional[FileAnalysisResult]:
try:
with open(filename.actual_filepath, "rb") as file:
actual_code = file.read()
@@ -219,7 +257,7 @@ def analyze_file(config, filename: FileAnalysisRequest):
except AnalysisError as e:
error_dict = {
"filename": str(filename.result_filename),
"error_class": str(type(e)),
"error": str(e),
}
return FileAnalysisResult(
filename=str(filename.result_filename), error=error_dict
5 changes: 4 additions & 1 deletion codecov_cli/services/staticanalysis/analyzers/__init__.py
@@ -1,9 +1,12 @@
from codecov_cli.services.staticanalysis.analyzers.general import BaseAnalyzer
from codecov_cli.services.staticanalysis.analyzers.javascript_es6 import ES6Analyzer
from codecov_cli.services.staticanalysis.analyzers.python import PythonAnalyzer
from codecov_cli.services.staticanalysis.types import FileAnalysisRequest


def get_best_analyzer(filename, actual_code) -> BaseAnalyzer:
def get_best_analyzer(
filename: FileAnalysisRequest, actual_code: bytes
) -> BaseAnalyzer:
if filename.actual_filepath.suffix == ".py":
return PythonAnalyzer(filename, actual_code)
if filename.actual_filepath.suffix == ".js":
codecov_cli/services/staticanalysis/analyzers/python/__init__.py
@@ -7,6 +7,7 @@
from codecov_cli.services.staticanalysis.analyzers.python.node_wrappers import (
NodeVisitor,
)
from codecov_cli.services.staticanalysis.types import FileAnalysisRequest

_function_query_str = """
(function_definition
@@ -52,14 +53,16 @@ class PythonAnalyzer(BaseAnalyzer):
]
wrappers = ["class_definition", "function_definition"]

def __init__(self, path, actual_code, **options):
def __init__(
self, file_analysis_request: FileAnalysisRequest, actual_code: bytes, **options
):
self.actual_code = actual_code
self.lines = self.actual_code.split(b"\n")
self.statements = []
self.import_lines = set()
self.definitions_lines = set()
self.functions = []
self.path = path.result_filename
self.path = file_analysis_request.result_filename
self.PY_LANGUAGE = Language(staticcodecov_languages.__file__, "python")
self.parser = Parser()
self.parser.set_language(self.PY_LANGUAGE)
codecov_cli/services/staticanalysis/analyzers/python/node_wrappers.py
@@ -1,16 +1,21 @@
from tree_sitter import Node

from codecov_cli.services.staticanalysis.exceptions import AnalysisError


class NodeVisitor(object):
def __init__(self, analyzer):
self.analyzer = analyzer

def start_visit(self, node):
self.visit(node)

def visit(self, node):
def visit(self, node: Node):
self.do_visit(node)
for c in node.children:
self.visit(c)

def _is_function_docstring(self, node):
def _is_function_docstring(self, node: Node):
"""Skips docstrings for funtions, such as this one.
Pytest doesn't include them in the report, so I don't think we should either,
at least for now.
@@ -39,15 +44,15 @@ def _is_function_docstring(self, node):
and is_in_function_context
)

def _get_previous_sibling_that_is_not_comment_not_func_docstring(self, node):
def _get_previous_sibling_that_is_not_comment_not_func_docstring(self, node: Node):
curr = node.prev_named_sibling
while curr is not None and (
curr.type == "comment" or self._is_function_docstring(curr)
):
curr = curr.prev_named_sibling
return curr

def do_visit(self, node):
def do_visit(self, node: Node):
if node.is_named:
current_line_number = node.start_point[0] + 1
if node.type in (
@@ -84,17 +89,29 @@ def do_visit(self, node):
}
)
if node.type in ("if_statement", "elif_clause"):
# Some of the children of a node have a field_name associated with them.
# For an if_statement or elif_clause, "consequence" is the code executed in that branch.
first_if_statement = node.child_by_field_name("consequence")
if first_if_statement.type == "block":
first_if_statement = first_if_statement.children[0]
try:
if first_if_statement.type == "block":
first_if_statement = first_if_statement.children[0] # BUG
except IndexError:
raise AnalysisError(
f"if_statement consequence is empty block @ {self.analyzer.path}:{first_if_statement.start_point[0] + 1}, column {first_if_statement.start_point[1]}"
)
self.analyzer.line_surety_ancestorship[
first_if_statement.start_point[0] + 1
] = current_line_number
if node.type in ("for_statement", "while_statement"):
first_if_statement = node.child_by_field_name("body")
if first_if_statement.type == "block":
first_if_statement = first_if_statement.children[0]
first_loop_statement = node.child_by_field_name("body")
try:
if first_loop_statement.type == "block":
first_loop_statement = first_loop_statement.children[0]
except IndexError:
raise AnalysisError(
f"loop_statement body is empty block @ {self.analyzer.path}:{first_loop_statement.start_point[0] + 1}, column {first_loop_statement.start_point[1]}"
)
self.analyzer.line_surety_ancestorship[
first_if_statement.start_point[0] + 1
first_loop_statement.start_point[0] + 1
] = current_line_number
pass
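
For context on the `child_by_field_name("consequence")` / `children[0]` lookup handled above, here is a small standalone sketch. The calls mirror what `PythonAnalyzer` and `NodeVisitor` do in this diff; the grammar path and the parsed input are placeholders of mine:

```python
from tree_sitter import Language, Parser

# Placeholder path: the CLI builds its own grammar bundle (staticcodecov_languages).
PY_LANGUAGE = Language("path/to/languages.so", "python")
parser = Parser()
parser.set_language(PY_LANGUAGE)

tree = parser.parse(b"if x:\n    do_something()\n")
if_node = tree.root_node.children[0]                      # type == "if_statement"

consequence = if_node.child_by_field_name("consequence")  # body of the `if` branch
print(consequence.type)                                   # "block"
print(consequence.children[0].start_point)                # (1, 4): line 2, column 4

# With the malformed "empty if" input from the commit message, that block can
# come back with no children, so children[0] raises IndexError. do_visit now
# converts that into an AnalysisError carrying the file path and position,
# and process_files records it instead of crashing the whole analysis run.
```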
58 changes: 56 additions & 2 deletions tests/services/static_analysis/test_static_analysis_service.py
@@ -7,11 +7,65 @@
import responses
from responses import matchers

from codecov_cli.services.staticanalysis import run_analysis_entrypoint
from codecov_cli.services.staticanalysis.types import FileAnalysisRequest
from codecov_cli.services.staticanalysis import process_files, run_analysis_entrypoint
from codecov_cli.services.staticanalysis.types import (
FileAnalysisRequest,
FileAnalysisResult,
)


class TestStaticAnalysisService:
@pytest.mark.asyncio
async def test_process_files_with_error(self, mocker):
files_found = list(
map(
lambda filename: FileAnalysisRequest(str(filename), Path(filename)),
[
"correct_file.py",
"error_file.py",
],
)
)
mock_get_context = mocker.patch(
"codecov_cli.services.staticanalysis.get_context"
)

def side_effect(config, filename: FileAnalysisRequest):
if filename.result_filename == "correct_file.py":
return FileAnalysisResult(
filename=filename.result_filename, result={"hash": "abc123"}
)
if filename.result_filename == "error_file.py":
return FileAnalysisResult(
filename=filename.result_filename, error="some error @ line 12"
)
# Should not get here, so fail test
assert False

mock_analyze_function = mocker.patch(
"codecov_cli.services.staticanalysis.analyze_file"
)
mock_analyze_function.side_effect = side_effect

def imap_side_effect(mapped_func, files):
results = []
for file in files:
results.append(mapped_func(file))
return results

mock_get_context.return_value.Pool.return_value.__enter__.return_value.imap_unordered.side_effect = (
imap_side_effect
)

results = await process_files(files_found, 1, {})
mock_get_context.return_value.Pool.return_value.__enter__.return_value.imap_unordered.assert_called()
assert mock_analyze_function.call_count == 2
assert results == dict(
all_data={"correct_file.py": {"hash": "abc123"}},
file_metadata=[{"file_hash": "abc123", "filepath": "correct_file.py"}],
processing_errors={"error_file.py": "some error @ line 12"},
)

@pytest.mark.asyncio
async def test_static_analysis_service(self, mocker):
mock_file_finder = mocker.patch(
