From e84e3df58dcfa7ca8118affeccb25812e33b0097 Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Thu, 26 Sep 2024 10:17:54 +0100 Subject: [PATCH] Remove functionality to merge duplicates We previously disabled this functionality unconditionally by setting merge_duplicates=False, but stopped short of removing the code. This commit removes the merge_duplicates variable completely, along with any code unreachable when merge_duplicates is set to False. Signed-off-by: John Pennycook --- codebasin/finder.py | 59 +-------------------------------------------- codebasin/util.py | 11 --------- 2 files changed, 1 insertion(+), 69 deletions(-) diff --git a/codebasin/finder.py b/codebasin/finder.py index 34fb802..ad743c0 100644 --- a/codebasin/finder.py +++ b/codebasin/finder.py @@ -10,24 +10,13 @@ import os from pathlib import Path -from codebasin import file_parser, platform, preprocessor, util +from codebasin import file_parser, platform, preprocessor from codebasin.language import FileLanguage from codebasin.walkers.tree_associator import TreeAssociator log = logging.getLogger(__name__) -class FileInfo: - """ - Data class storing (path, size, sha) for a file. - """ - - def __init__(self, path, size=None, sha=None): - self.path = path - self.size = size - self.sha = sha - - class ParserState: """ Keeps track of the overall state of the parser. @@ -41,56 +30,12 @@ def __init__(self, summarize_only): self.maps = {} self.langs = {} self.summarize_only = summarize_only - self.fileinfo = collections.defaultdict(list) - self.merge_duplicates = False - - def _map_filename(self, fn): - """ - Map the real filename to an internal filename used by the parser. - Enables duplicate files to be merged. 
- """ - if not self.merge_duplicates: - return fn - - # The first time we encounter a filename, store limited info - bn = os.path.basename(fn) - if bn not in self.fileinfo: - self.fileinfo[bn] = [FileInfo(fn)] - return fn - - # If filename has been encountered, check for matching size/hash - size = os.path.getsize(fn) - sha = None - for fi in self.fileinfo[bn]: - # Fill in missing size information - if fi.size is None: - fi.size = os.path.getsize(fi.path) - - # If sizes don't match, the file is different - if fi.size != size: - continue - - # Fill in missing hash information - if sha is None: - sha = util.compute_file_hash(fn) - if fi.sha is None: - fi.sha = util.compute_file_hash(fi.path) - - # Use hash to determine if file is duplicate or not - if fi.sha != sha: - continue - return fi.path - - # If no match, this is the first time encountering this file - self.fileinfo[bn].append(FileInfo(fn, size, sha)) - return fn def insert_file(self, fn, language=None): """ Build a new tree for a source file, and create an association map for it. 
""" - fn = self._map_filename(fn) if fn not in self.trees: parser = file_parser.FileParser(fn) self.trees[fn] = parser.parse_file( @@ -113,7 +58,6 @@ def get_tree(self, fn): """ Return the SourceTree associated with a filename """ - fn = self._map_filename(fn) if fn not in self.trees: return None return self.trees[fn] @@ -122,7 +66,6 @@ def get_map(self, fn): """ Return the NodeAssociationMap associated with a filename """ - fn = self._map_filename(fn) if fn not in self.maps: return None return self.maps[fn] diff --git a/codebasin/util.py b/codebasin/util.py index 5982d84..0aebb6a 100644 --- a/codebasin/util.py +++ b/codebasin/util.py @@ -7,7 +7,6 @@ - Checking paths """ -import hashlib import json import logging import os @@ -23,16 +22,6 @@ log = logging.getLogger(__name__) -def compute_file_hash(fname): - """Return sha512 for fname""" - chunk_size = 4096 - hasher = hashlib.sha512() - with safe_open_read_nofollow(fname, "rb") as in_file: - for chunk in iter(lambda: in_file.read(chunk_size), b""): - hasher.update(chunk) - return hasher.hexdigest() - - def ensure_ext(fname, extensions): """Return true if the path passed in has specified extension""" if not isinstance(extensions, Iterable):