Skip to content

Commit

Permalink
feat(#1): graph in storage
Browse files Browse the repository at this point in the history
  • Loading branch information
williamfzc committed Dec 16, 2023
1 parent 7821d31 commit 166d68c
Show file tree
Hide file tree
Showing 2 changed files with 125 additions and 24 deletions.
58 changes: 38 additions & 20 deletions srctag/storage.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import os
import json
import re
import typing

Expand All @@ -7,9 +7,11 @@
from chromadb.api.models.Collection import Collection
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
from loguru import logger
from networkx import Graph
from pydantic import BaseModel
from pydantic_settings import BaseSettings
from tqdm import tqdm
import networkx as nx

from srctag.model import FileContext, RuntimeContext, SrcTagException

Expand All @@ -24,6 +26,8 @@ class MetadataConstant(object):
KEY_SOURCE = "source"
KEY_COMMIT_SHA = "commit_sha"
KEY_DATA_TYPE = "data_type"
KEY_ISSUE_ID = "issue_id"
KEY_TAG = "tag"

DATA_TYPE_COMMIT_MSG = "commit_msg"
DATA_TYPE_ISSUE = "issue"
Expand All @@ -40,10 +44,22 @@ class StorageConfig(BaseSettings):
# issue regex for matching issue grammar
# by default, we use GitHub standard
issue_regex: str = r"(#\d+)"
# content mapping for avoiding too much I/O
# "#11" -> "content for #11"
issue_mapping: typing.Dict[str, str] = dict()

data_types: typing.Set[str] = {MetadataConstant.DATA_TYPE_COMMIT_MSG, MetadataConstant.DATA_TYPE_ISSUE}

def load_issue_mapping_from_gh_json_file(self, gh_json_file: str):
    """Populate ``issue_mapping`` from a GitHub issue dump file.

    The file is expected to be a JSON array of issue objects, each with
    ``number`` and ``title`` keys (the shape produced by ``gh issue list
    --json number,title`` — TODO confirm against the actual exporter used).

    :param gh_json_file: path to the JSON dump file.
    :raises ValueError: if the file's top-level value is not a JSON list.
        (Previously an ``assert``, which is silently stripped under
        ``python -O``; an explicit raise keeps the validation active.)
    """
    # Be explicit about the encoding so the load does not depend on the
    # platform default locale.
    with open(gh_json_file, encoding="utf-8") as f:
        content = json.load(f)

    if not isinstance(content, list):
        raise ValueError("not a valid issue dump")

    for each in content:
        # GitHub-style short reference, e.g. 11 -> "#11"
        sharp_id = f'#{each["number"]}'
        self.issue_mapping[sharp_id] = each["title"]
    logger.info(f"load {len(content)} issues from {gh_json_file}")


class Storage(object):
def __init__(self, config: StorageConfig = None):
Expand All @@ -53,6 +69,7 @@ def __init__(self, config: StorageConfig = None):

self.chromadb: typing.Optional[API] = None
self.chromadb_collection: typing.Optional[Collection] = None
self.relation_graph: Graph = nx.Graph()

def init_chroma(self):
if self.chromadb and self.chromadb_collection:
Expand Down Expand Up @@ -85,7 +102,7 @@ def process_commit_msg(self, file: FileContext, collection: Collection):
MetadataConstant.KEY_COMMIT_SHA: str(each.hexsha),
MetadataConstant.KEY_DATA_TYPE: MetadataConstant.DATA_TYPE_COMMIT_MSG,
},
id=f"{file.name}|{each.hexsha}|{MetadataConstant.DATA_TYPE_COMMIT_MSG}"
id=f"{MetadataConstant.DATA_TYPE_COMMIT_MSG}|{file.name}|{each.hexsha}"
)
targets.append(item)

Expand All @@ -107,30 +124,31 @@ def process_issue(self, file: FileContext, collection: Collection):
targets = []
for each in file.commits:
issue_id_list = regex.findall(each.message)
issue_contents = []
for each_issue in issue_id_list:
each_issue_content = self.process_issue_id_to_title(each_issue)
for each_issue_id in issue_id_list:
each_issue_content = self.process_issue_id_to_title(each_issue_id)
if not each_issue_content:
continue
issue_contents.append(each_issue_content)
# END issue loop

if not issue_contents:
continue
item = StorageDoc(
document=os.sep.join(issue_contents),
metadata={
MetadataConstant.KEY_SOURCE: file.name,
MetadataConstant.KEY_COMMIT_SHA: str(each.hexsha),
MetadataConstant.KEY_DATA_TYPE: MetadataConstant.DATA_TYPE_ISSUE,
},
id=f"{file.name}|{each.hexsha}|{MetadataConstant.DATA_TYPE_ISSUE}"
)
targets.append(item)
item = StorageDoc(
document=each_issue_content,
metadata={
MetadataConstant.KEY_ISSUE_ID: each_issue_id,
MetadataConstant.KEY_DATA_TYPE: MetadataConstant.DATA_TYPE_ISSUE,
},
id=f"{MetadataConstant.DATA_TYPE_ISSUE}|{each_issue_id}"
)
targets.append(item)

# save to graph
self.relation_graph.add_node(each_issue_id, node_type=MetadataConstant.KEY_ISSUE_ID)
self.relation_graph.add_node(file.name, node_type=MetadataConstant.KEY_SOURCE)
self.relation_graph.add_edge(each_issue_id, file.name)

# END issue loop
# END commit loop

for each in targets:
collection.add(
collection.upsert(
documents=[each.document],
metadatas=[each.metadata],
ids=[each.id],
Expand Down
91 changes: 87 additions & 4 deletions srctag/tagger.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,19 +95,18 @@ def __init__(self, config: TaggerConfig = None):
config = TaggerConfig()
self.config = config

def tag(self, storage: Storage) -> TagResult:
storage.init_chroma()
def tag_with_commit(self, storage: Storage) -> TagResult:
doc_count = storage.chromadb_collection.count()
n_results = int(doc_count * self.config.n_percent)

logger.info(f"start tagging source files ...")

tag_results = []
relation_graph = storage.relation_graph.copy()
for each_tag in tqdm(self.config.tags):
query_result: QueryResult = storage.chromadb_collection.query(
query_texts=each_tag,
n_results=n_results,
include=["metadatas", "distances"],
where={MetadataConstant.KEY_DATA_TYPE: MetadataConstant.DATA_TYPE_COMMIT_MSG}
)

metadatas: typing.List[Metadata] = query_result["metadatas"][0]
Expand Down Expand Up @@ -137,6 +136,77 @@ def tag(self, storage: Storage) -> TagResult:
else:
# has been touched by other commits, merge
each_file_tag_result[each_tag] += each_score

# update graph
relation_graph.add_node(each_tag, node_type=MetadataConstant.KEY_TAG)
relation_graph.add_edge(each_tag, each_file_name)
# END tag_results

scores_df = pd.DataFrame.from_dict(ret, orient="index")
if self.config.optimize:
scores_df = self.optimize(scores_df)

# convert score matrix into rank (use reversed rank as score). because:
# 1. score/distance is meaningless to users
# 2. can not be evaluated both rows and cols
scores_df = scores_df.rank(axis=0, method='min')

if self.config.normalize:
scores_df = (scores_df - scores_df.min()) / (scores_df.max() - scores_df.min())

logger.info(f"tag finished")
# update relation graph in storage
storage.relation_graph = relation_graph

return TagResult(scores_df=scores_df)

def tag_with_issue(self, storage: Storage) -> TagResult:
doc_count = storage.chromadb_collection.count()
n_results = int(doc_count * self.config.n_percent)

tag_results = []
relation_graph = storage.relation_graph.copy()
for each_tag in tqdm(self.config.tags):
query_result: QueryResult = storage.chromadb_collection.query(
query_texts=each_tag,
n_results=n_results,
include=["metadatas", "distances"],
where={MetadataConstant.KEY_DATA_TYPE: MetadataConstant.DATA_TYPE_ISSUE}
)

metadatas: typing.List[Metadata] = query_result["metadatas"][0]
# https://github.com/langchain-ai/langchain/blob/master/libs/langchain/langchain/vectorstores/chroma.py
# https://stats.stackexchange.com/questions/158279/how-i-can-convert-distance-euclidean-to-similarity-score
distances: typing.List[float] = query_result["distances"][0]
normalized_scores = [
1.0 / (1.0 + x) for x in distances
]

for each_metadata, each_score in zip(metadatas, normalized_scores):
each_issue_id = each_metadata[MetadataConstant.KEY_ISSUE_ID]
tag_results.append((each_tag, each_issue_id, each_score))
# END file loop
# END tag loop

ret = dict()
for each_tag, each_issue_id, each_score in tag_results:
files = storage.relation_graph.neighbors(each_issue_id)
for each_file in files:
if each_file not in ret:
# has not been touched by other tags
# the score order is decreasing
ret[each_file] = OrderedDict()
each_file_tag_result = ret[each_file]

if each_tag not in each_file_tag_result:
each_file_tag_result[each_tag] = each_score
else:
# has been touched by other commits, merge
each_file_tag_result[each_tag] += each_score

# update graph
relation_graph.add_node(each_tag, node_type=MetadataConstant.KEY_TAG)
relation_graph.add_edge(each_tag, each_issue_id)
# END tag_results

scores_df = pd.DataFrame.from_dict(ret, orient="index")
Expand All @@ -152,8 +222,21 @@ def tag(self, storage: Storage) -> TagResult:
scores_df = (scores_df - scores_df.min()) / (scores_df.max() - scores_df.min())

logger.info(f"tag finished")
# update relation graph in storage
storage.relation_graph = relation_graph
return TagResult(scores_df=scores_df)

def tag(self, storage: Storage) -> TagResult:
    """Tag source files using the given storage.

    Chooses the strategy from the storage state: when the relation graph
    already holds nodes (issues were indexed), issue-based tagging is used;
    otherwise it falls back to commit-message-based tagging.
    """
    logger.info(f"start tagging source files ...")
    storage.init_chroma()

    # Empty graph means no issue data was collected -> commit strategy.
    if not storage.relation_graph.number_of_nodes():
        logger.info("tag with commit")
        return self.tag_with_commit(storage)

    logger.info("tag with issue")
    return self.tag_with_issue(storage)

def optimize(self, df: pd.DataFrame) -> pd.DataFrame:
scale_factor = 2.0
df = np.exp(df * scale_factor)
Expand Down

0 comments on commit 166d68c

Please sign in to comment.