Skip to content

Commit

Permalink
feat(#3): diff-driven graph in cli
Browse files Browse the repository at this point in the history
  • Loading branch information
williamfzc committed Dec 23, 2023
1 parent a21b595 commit 2559335
Show file tree
Hide file tree
Showing 3 changed files with 140 additions and 14 deletions.
99 changes: 94 additions & 5 deletions srctag/cli.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
import subprocess
import typing

import chromadb
import click
import networkx
import networkx as nx

from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
from loguru import logger

from srctag.collector import Collector, FileLevelEnum
from srctag.storage import Storage
from srctag.storage import Storage, MetadataConstant
from srctag.tagger import Tagger


Expand Down Expand Up @@ -77,23 +80,109 @@ def tag(repo_root, max_depth_limit, include_regex, tags_file, output_path, file_
@click.option("--include-regex", default="", help="File include regex pattern")
@click.option("--file-level", default=FileLevelEnum.FILE.value, help="Scan file level, FILE or DIR, default to FILE")
@click.option("--output-path", default="srctag.dot", help="Output file path for DOT")
def graph(repo_root, max_depth_limit, include_regex, file_level, output_path):
@click.option("--issue-regex", default="", help="Issue regex")
def graph(repo_root, max_depth_limit, include_regex, file_level, output_path, issue_regex):
""" create relations graph from your repo """
collector = Collector()
collector.config.repo_root = repo_root
collector.config.max_depth_limit = max_depth_limit
collector.config.include_regex = include_regex
collector.config.file_level = file_level
if issue_regex:
collector.config.issue_regex = issue_regex

ctx = collector.collect_metadata()
relation_graph = ctx.relations
render_dot(relation_graph, output_path)


@cli.command()
@click.option("--repo-root", default=".", help="Repository root directory")
@click.option("--max-depth-limit", default=-1, help="Maximum depth limit")
@click.option("--diff-target", default="HEAD~1", help="diff target rev")
@click.option("--file-level", default=FileLevelEnum.FILE.value, help="Scan file level, FILE or DIR, default to FILE")
@click.option("--output-path", default="srctag.dot", help="Output file path for DOT")
@click.option("--batch", default=1, help="")
@click.option("--issue-regex", default="", help="Issue regex")
def diff(repo_root, max_depth_limit, diff_target, file_level, output_path, batch, issue_regex):
    """ create relations graph from your repo, with diff """
    # Overview (kept out of the docstring, which click shows as help text):
    #   1. seed the scope with the files reported by `git diff <diff_target>`
    #   2. for up to `batch` rounds, collect metadata for the current seed
    #      files and pull in every file that shares an issue with them
    #   3. collect once more over the accumulated scope and render it as DOT

    # Run git inside the target repository, not wherever the CLI was started.
    base_file_set = get_git_diff_files(diff_target, cwd=repo_root)
    # Copy, so growing the accumulated scope never mutates the seed set.
    total_file_set = set(base_file_set)
    logger.info(f"base file set: {base_file_set}")

    for i in range(batch):
        ctx = _collect_for_files(repo_root, max_depth_limit, file_level, base_file_set, issue_regex)
        file_set = _expand_by_issues(ctx)

        logger.info(f"batch {i} end, files: {len(base_file_set)} -> {len(file_set)}")
        new_file_set = file_set - total_file_set
        if not new_file_set:
            logger.info(f"file range search ready: {len(total_file_set)}")
            break
        # Accumulate instead of overwrite: later rounds only collect the newly
        # found files, so assigning `file_set` here would drop earlier rounds.
        total_file_set |= file_set
        # Next round only needs to look at what this round discovered.
        base_file_set = new_file_set
    # END loop batch

    ctx = _collect_for_files(repo_root, max_depth_limit, file_level, total_file_set, issue_regex)
    render_dot(ctx.relations, output_path)


def _collect_for_files(repo_root, max_depth_limit, file_level, include_file_list, issue_regex):
    """ configure a fresh Collector restricted to `include_file_list` and return its collected context """
    collector = Collector()
    collector.config.repo_root = repo_root
    collector.config.max_depth_limit = max_depth_limit
    collector.config.file_level = file_level
    collector.config.include_file_list = include_file_list
    # An empty regex means "keep the collector's own default".
    if issue_regex:
        collector.config.issue_regex = issue_regex
    return collector.collect_metadata()


def _expand_by_issues(ctx) -> typing.Set[str]:
    """ return the collected files plus every source file that shares an issue node with one of them """
    relations = ctx.relations
    seeds = set(ctx.files.keys())
    expanded = set(seeds)

    for each in seeds:
        if not relations.has_node(each):
            logger.warning(f"node {each} not in graph")
            continue

        # file -> issues it is linked to
        issues = {
            each_node
            for each_node in relations.neighbors(each)
            if relations.nodes[each_node].get("node_type") == MetadataConstant.KEY_ISSUE_ID
        }

        # issue -> files; keep only source nodes so that any other kind of
        # neighbour never ends up in the file list
        for each_issue in issues:
            expanded.update(
                each_node
                for each_node in relations.neighbors(each_issue)
                if relations.nodes[each_node].get("node_type") == MetadataConstant.KEY_SOURCE
            )
    return expanded


def get_git_diff_files(target: str, cwd: typing.Optional[str] = None) -> typing.Set[str]:
    """
    Return the paths printed by `git diff <target> --name-only`.

    :param target: revision to diff against
    :param cwd: directory to run git in; None keeps the current working directory
    :raises subprocess.CalledProcessError: if git exits with a non-zero status
    """
    result = subprocess.check_output(['git', 'diff', target, '--name-only'], text=True, cwd=cwd)
    return set(result.splitlines())


node_colors = {'Type1': 'tomato', 'Type2': 'lightgreen', 'Type3': 'lightblue'} # 使用命名颜色
def render_dot(relation_graph: networkx.Graph, output: str):
    """
    Colour every node by its `node_type` attribute and write the graph as DOT.

    The `color` attribute is set on the nodes of `relation_graph` itself
    (the graph is modified in place) before it is written to `output`.

    :param relation_graph: graph whose nodes may carry a `node_type` attribute
    :param output: path of the DOT file to write
    """
    node_colors = {
        MetadataConstant.KEY_SOURCE: 'tomato',
        MetadataConstant.KEY_ISSUE_ID: 'lightgreen',
        MetadataConstant.KEY_COMMIT_SHA: 'lightblue',
    }
    for node, data in relation_graph.nodes(data=True):
        # nodes without a (known) node_type fall back to light gray
        color = node_colors.get(data.get('node_type', 'default'), 'lightgray')
        relation_graph.nodes[node]['color'] = color

    nx.drawing.nx_pydot.write_dot(relation_graph, output)
    # loguru formats with `{}` / str.format, so a "%s" placeholder would be
    # logged literally; use an f-string like the rest of this module.
    logger.info(f"rendered graph to {output}")


if __name__ == '__main__':
Expand Down
39 changes: 30 additions & 9 deletions srctag/collector.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import functools
import os
import re
import typing
Expand Down Expand Up @@ -72,6 +73,14 @@ def collect_metadata(self) -> RuntimeContext:
logger.info("metadata ready")
return ctx

@functools.lru_cache(maxsize=None)
def _process_diff_from_commit(self, commit: Commit) -> typing.Set[str]:
    """
    Return the set of `b_path` values of all entries in `commit.diff()`.

    :param commit: commit whose diff entries are inspected
    :return: the distinct `b_path` values found
    """
    # NOTE(review): lru_cache on a method keys on (self, commit) and the
    # unbounded cache keeps both referenced — confirm that is acceptable.
    # NOTE(review): GitPython's `diff()` without arguments appears to compare
    # against the index rather than the parent commit — confirm intended.
    return {each_diff.b_path for each_diff in commit.diff()}

def _process_relations(self, ctx: RuntimeContext):
"""
collect different relations from metadata
Expand All @@ -81,22 +90,34 @@ def _process_relations(self, ctx: RuntimeContext):
"""
regex = re.compile(self.config.issue_regex)

for each_file in ctx.files.values():
for each_commit in each_file.commits:
issue_id_list = regex.findall(each_commit.message)
for each_file in tqdm(ctx.files.values()):
ctx.relations.add_node(each_file.name, node_type=MetadataConstant.KEY_SOURCE)

ctx.relations.add_node(each_commit.hexsha, node_type=MetadataConstant.KEY_ISSUE_ID)
ctx.relations.add_node(each_file.name, node_type=MetadataConstant.KEY_SOURCE)
# and the related files
for each_commit in each_file.commits:
related_files = self._process_diff_from_commit(each_commit)
ctx.relations.add_node(each_commit.hexsha, node_type=MetadataConstant.KEY_COMMIT_SHA)
ctx.relations.add_edge(each_commit.hexsha, each_file.name)

for each_related in related_files:
# commit -> related files
ctx.relations.add_node(each_related, node_type=MetadataConstant.KEY_SOURCE)
ctx.relations.add_edge(each_commit.hexsha, each_related)
# END commit -> file

issue_id_list = regex.findall(each_commit.message)
for each_issue in issue_id_list:
# issue -> file
ctx.relations.add_node(each_issue, node_type=MetadataConstant.KEY_ISSUE_ID)
ctx.relations.add_edge(each_issue, each_file.name)
ctx.relations.add_edge(each_issue, each_commit.hexsha)

# END loop issue
# END loop commit
# END loop file
for each_related in related_files:
ctx.relations.add_edge(each_issue, each_related)
# END issue -> related files
# END issue -> file

# END each file added
# END file added

def _check_env(self) -> typing.Optional[BaseException]:
try:
Expand Down
16 changes: 16 additions & 0 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from click.testing import CliRunner
import pathlib

from srctag.cli import diff, graph


def test_diff():
    """Smoke-test the `diff` command against this repository with two batches."""
    runner = CliRunner()
    # repository root = parent of the tests directory
    path = pathlib.Path(__file__).parent.parent.as_posix()
    # CLI arguments are strings, exactly as they would arrive from a shell.
    result = runner.invoke(diff, ["--repo-root", path, "--batch", "2"], catch_exceptions=False)
    # Usage errors exit via SystemExit, which the runner records instead of
    # raising, so the exit code has to be checked explicitly.
    assert result.exit_code == 0, result.output


def test_graph():
    """Smoke-test the `graph` command against this repository."""
    runner = CliRunner()
    # repository root = parent of the tests directory
    path = pathlib.Path(__file__).parent.parent.as_posix()
    result = runner.invoke(graph, ["--repo-root", path], catch_exceptions=False)
    # Usage errors exit via SystemExit, which the runner records instead of
    # raising, so the exit code has to be checked explicitly.
    assert result.exit_code == 0, result.output

0 comments on commit 2559335

Please sign in to comment.