From 2559335cdfc1657c0139dd3de9ad3b03152be2ff Mon Sep 17 00:00:00 2001
From: williamfzc <178894043@qq.com>
Date: Sat, 23 Dec 2023 16:06:03 +0800
Subject: [PATCH] feat(#3): diff-driven graph in cli

---
Review notes (this section is ignored by `git am`):
- Reformatted from a whitespace-collapsed copy; blank context lines were
  re-inferred from the hunk counts.
- cli.diff: `git diff` now runs inside --repo-root instead of the process cwd.
- cli.diff: total_file_set accumulates across batches instead of being
  overwritten by the last batch.
- cli.render_dot: loguru formats with `{}` / f-strings, not `%s`.
- cli: --batch gets a help text.
- collector._process_diff_from_commit: cached staticmethod (the cache no longer
  pins Collector instances); diffs without a b_path are skipped.
- tests: CLI args are strings and the exit code is asserted.

 srctag/cli.py       | 99 ++++++++++++++++++++++++++++++++++++++++++---
 srctag/collector.py | 39 +++++++++++++-----
 tests/test_cli.py   | 16 ++++++++
 3 files changed, 140 insertions(+), 14 deletions(-)
 create mode 100644 tests/test_cli.py

diff --git a/srctag/cli.py b/srctag/cli.py
index 6cfac71..29c8b2d 100644
--- a/srctag/cli.py
+++ b/srctag/cli.py
@@ -1,12 +1,15 @@
+import subprocess
+import typing
+
 import chromadb
 import click
+import networkx
 import networkx as nx
-
 from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
 from loguru import logger
 
 from srctag.collector import Collector, FileLevelEnum
-from srctag.storage import Storage
+from srctag.storage import Storage, MetadataConstant
 from srctag.tagger import Tagger
 
 
@@ -77,23 +80,109 @@ def tag(repo_root, max_depth_limit, include_regex, tags_file, output_path, file_
 @click.option("--include-regex", default="", help="File include regex pattern")
 @click.option("--file-level", default=FileLevelEnum.FILE.value, help="Scan file level, FILE or DIR, default to FILE")
 @click.option("--output-path", default="srctag.dot", help="Output file path for DOT")
-def graph(repo_root, max_depth_limit, include_regex, file_level, output_path):
+@click.option("--issue-regex", default="", help="Issue regex")
+def graph(repo_root, max_depth_limit, include_regex, file_level, output_path, issue_regex):
+    """ create relations graph from your repo """
     collector = Collector()
     collector.config.repo_root = repo_root
     collector.config.max_depth_limit = max_depth_limit
     collector.config.include_regex = include_regex
     collector.config.file_level = file_level
+    if issue_regex:
+        collector.config.issue_regex = issue_regex
 
     ctx = collector.collect_metadata()
     relation_graph = ctx.relations
+    render_dot(relation_graph, output_path)
+
+
+@cli.command()
+@click.option("--repo-root", default=".", help="Repository root directory")
+@click.option("--max-depth-limit", default=-1, help="Maximum depth limit")
+@click.option("--diff-target", default="HEAD~1", help="diff target rev")
+@click.option("--file-level", default=FileLevelEnum.FILE.value, help="Scan file level, FILE or DIR, default to FILE")
+@click.option("--output-path", default="srctag.dot", help="Output file path for DOT")
+@click.option("--batch", default=1, help="Max rounds of issue-based file set expansion")
+@click.option("--issue-regex", default="", help="Issue regex")
+def diff(repo_root, max_depth_limit, diff_target, file_level, output_path, batch, issue_regex):
+    """ create relations graph from your repo, with diff """
+    base_file_set = get_git_diff_files(diff_target, repo_root)
+    total_file_set = set(base_file_set)
+    logger.info(f"base file set: {base_file_set}")
+
+    for i in range(batch):
+        collector = Collector()
+        collector.config.repo_root = repo_root
+        collector.config.max_depth_limit = max_depth_limit
+        collector.config.file_level = file_level
+        collector.config.include_file_list = base_file_set
+        if issue_regex:
+            collector.config.issue_regex = issue_regex
+
+        ctx = collector.collect_metadata()
+
+        # start from the scanned files, then pull in files that share an issue with them
+        file_set = set(ctx.files.keys())
+        # walk file -> issue -> file edges of the relation graph
+        for each in file_set:
+            if not ctx.relations.has_node(each):
+                logger.warning(f"node {each} not in graph")
+                continue
+
+            issues = set()
+            for each_node in ctx.relations.neighbors(each):
+                if ctx.relations.nodes[each_node]["node_type"] != MetadataConstant.KEY_ISSUE_ID:
+                    continue
+                issues.add(each_node)
+            # END issue query
+
+            for each_issue in issues:
+                related_files = ctx.relations.neighbors(each_issue)
+                file_set = file_set.union(related_files)
+
+        # END file query
+
+        logger.info(f"batch {i} end, files: {len(base_file_set)} -> {len(file_set)}")
+        new_file_set = file_set - total_file_set
+        if not new_file_set:
+            logger.info(f"file range search ready: {len(total_file_set)}")
+            break
+        # accumulate (never shrink) the scope; the next batch scans only the new files
+        total_file_set = total_file_set | file_set
+        base_file_set = new_file_set
+    # END loop batch
+
+    collector = Collector()
+    collector.config.repo_root = repo_root
+    collector.config.max_depth_limit = max_depth_limit
+    collector.config.file_level = file_level
+    collector.config.include_file_list = total_file_set
+    if issue_regex:
+        collector.config.issue_regex = issue_regex
+    ctx = collector.collect_metadata()
+    relation_graph = ctx.relations
+    render_dot(relation_graph, output_path)
+
+
+def get_git_diff_files(target: str, repo_root: str = ".") -> typing.Set[str]:
+    """Return the paths that `git diff --name-only <target>` reports when run in repo_root."""
+    result = subprocess.check_output(['git', 'diff', '--name-only', target], cwd=repo_root, text=True)
+    return set(result.splitlines())
+
 
-    node_colors = {'Type1': 'tomato', 'Type2': 'lightgreen', 'Type3': 'lightblue'}  # 使用命名颜色
+def render_dot(relation_graph: networkx.Graph, output: str):
+    node_colors = {  # DOT colour per node_type; any other type falls back to 'lightgray'
+        MetadataConstant.KEY_SOURCE: 'tomato',
+        MetadataConstant.KEY_ISSUE_ID: 'lightgreen',
+        MetadataConstant.KEY_COMMIT_SHA: 'lightblue',
+    }
     node_color_mapping = {node: node_colors.get(data.get('node_type', 'default'), 'lightgray')
                           for node, data in relation_graph.nodes(data=True)}
     for node, color in node_color_mapping.items():
         relation_graph.nodes[node]['color'] = color
 
-    nx.drawing.nx_pydot.write_dot(relation_graph, output_path)
+    nx.drawing.nx_pydot.write_dot(relation_graph, output)
+    logger.info(f"rendered graph to {output}")
 
 
 if __name__ == '__main__':
diff --git a/srctag/collector.py b/srctag/collector.py
index 587527b..d7fcaa3 100644
--- a/srctag/collector.py
+++ b/srctag/collector.py
@@ -1,3 +1,4 @@
+import functools
 import os
 import re
 import typing
@@ -72,6 +73,14 @@ def collect_metadata(self) -> RuntimeContext:
         logger.info("metadata ready")
         return ctx
 
+    @staticmethod
+    @functools.lru_cache(maxsize=None)
+    def _process_diff_from_commit(commit: Commit) -> typing.Set[str]:
+        """Return the b-side paths reported by ``commit.diff()``; results are cached per commit."""
+        # NOTE(review): GitPython's diff() defaults to other=Index, i.e. commit vs. index,
+        # not commit vs. its parent -- confirm that this is the comparison wanted here.
+        return {each.b_path for each in commit.diff() if each.b_path is not None}
+
     def _process_relations(self, ctx: RuntimeContext):
         """
         collect different relations from metadata
@@ -81,22 +90,34 @@ def _process_relations(self, ctx: RuntimeContext):
         """
         regex = re.compile(self.config.issue_regex)
 
-        for each_file in ctx.files.values():
-            for each_commit in each_file.commits:
-                issue_id_list = regex.findall(each_commit.message)
+        for each_file in tqdm(ctx.files.values()):
+            ctx.relations.add_node(each_file.name, node_type=MetadataConstant.KEY_SOURCE)
 
-                ctx.relations.add_node(each_commit.hexsha, node_type=MetadataConstant.KEY_ISSUE_ID)
-                ctx.relations.add_node(each_file.name, node_type=MetadataConstant.KEY_SOURCE)
+            # and the related files
+            for each_commit in each_file.commits:
+                related_files = self._process_diff_from_commit(each_commit)
+                ctx.relations.add_node(each_commit.hexsha, node_type=MetadataConstant.KEY_COMMIT_SHA)
                 ctx.relations.add_edge(each_commit.hexsha, each_file.name)
 
+                for each_related in related_files:
+                    # commit -> related files
+                    ctx.relations.add_node(each_related, node_type=MetadataConstant.KEY_SOURCE)
+                    ctx.relations.add_edge(each_commit.hexsha, each_related)
+                # END commit -> file
+
+                issue_id_list = regex.findall(each_commit.message)
                 for each_issue in issue_id_list:
+                    # issue -> file
                     ctx.relations.add_node(each_issue, node_type=MetadataConstant.KEY_ISSUE_ID)
                     ctx.relations.add_edge(each_issue, each_file.name)
 
-                    ctx.relations.add_edge(each_issue, each_commit.hexsha)
-                # END loop issue
-            # END loop commit
-        # END loop file
+                    for each_related in related_files:
+                        ctx.relations.add_edge(each_issue, each_related)
+                    # END issue -> related files
+                # END issue -> file
+
+            # END each file added
+        # END file added
 
     def _check_env(self) -> typing.Optional[BaseException]:
         try:
diff --git a/tests/test_cli.py b/tests/test_cli.py
new file mode 100644
index 0000000..6defab9
--- /dev/null
+++ b/tests/test_cli.py
@@ -0,0 +1,16 @@
+from click.testing import CliRunner
+import pathlib
+
+from srctag.cli import diff, graph
+
+
+def test_diff():
+    path = pathlib.Path(__file__).parent.parent.as_posix()
+    result = CliRunner().invoke(diff, ["--repo-root", path, "--batch", "2"], catch_exceptions=False)
+    assert result.exit_code == 0, result.output
+
+
+def test_graph():
+    path = pathlib.Path(__file__).parent.parent.as_posix()
+    result = CliRunner().invoke(graph, ["--repo-root", path], catch_exceptions=False)
+    assert result.exit_code == 0, result.output