Skip to content

Commit

Permalink
Merge pull request #116 from Pennycook/remove-merge-duplicates
Browse files Browse the repository at this point in the history
Remove functionality to merge duplicates
  • Loading branch information
Pennycook authored Oct 9, 2024
2 parents 67391c4 + e84e3df commit 48a280d
Show file tree
Hide file tree
Showing 2 changed files with 1 addition and 69 deletions.
59 changes: 1 addition & 58 deletions codebasin/finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,24 +10,13 @@
import os
from pathlib import Path

from codebasin import file_parser, platform, preprocessor, util
from codebasin import file_parser, platform, preprocessor
from codebasin.language import FileLanguage
from codebasin.walkers.tree_associator import TreeAssociator

log = logging.getLogger(__name__)


class FileInfo:
    """
    Lightweight record of a file's identity: its path plus a size and
    sha digest that may be filled in lazily during duplicate detection.
    """

    def __init__(self, path, size=None, sha=None):
        # size/sha default to None so they are only computed when a
        # potential duplicate with the same basename is encountered.
        self.path, self.size, self.sha = path, size, sha


class ParserState:
"""
Keeps track of the overall state of the parser.
Expand All @@ -41,56 +30,12 @@ def __init__(self, summarize_only):
self.maps = {}
self.langs = {}
self.summarize_only = summarize_only
self.fileinfo = collections.defaultdict(list)
self.merge_duplicates = False

def _map_filename(self, fn):
"""
Map the real filename to an internal filename used by the parser.
Enables duplicate files to be merged.
"""
if not self.merge_duplicates:
return fn

# The first time we encounter a filename, store limited info
bn = os.path.basename(fn)
if bn not in self.fileinfo:
self.fileinfo[bn] = [FileInfo(fn)]
return fn

# If filename has been encountered, check for matching size/hash
size = os.path.getsize(fn)
sha = None
for fi in self.fileinfo[bn]:
# Fill in missing size information
if fi.size is None:
fi.size = os.path.getsize(fi.path)

# If sizes don't match, the file is different
if fi.size != size:
continue

# Fill in missing hash information
if sha is None:
sha = util.compute_file_hash(fn)
if fi.sha is None:
fi.sha = util.compute_file_hash(fi.path)

# Use hash to determine if file is duplicate or not
if fi.sha != sha:
continue
return fi.path

# If no match, this is the first time encountering this file
self.fileinfo[bn].append(FileInfo(fn, size, sha))
return fn

def insert_file(self, fn, language=None):
"""
Build a new tree for a source file, and create an association
map for it.
"""
fn = self._map_filename(fn)
if fn not in self.trees:
parser = file_parser.FileParser(fn)
self.trees[fn] = parser.parse_file(
Expand All @@ -113,7 +58,6 @@ def get_tree(self, fn):
"""
Return the SourceTree associated with a filename
"""
fn = self._map_filename(fn)
if fn not in self.trees:
return None
return self.trees[fn]
Expand All @@ -122,7 +66,6 @@ def get_map(self, fn):
"""
Return the NodeAssociationMap associated with a filename
"""
fn = self._map_filename(fn)
if fn not in self.maps:
return None
return self.maps[fn]
Expand Down
11 changes: 0 additions & 11 deletions codebasin/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
- Checking paths
"""

import hashlib
import json
import logging
import os
Expand All @@ -23,16 +22,6 @@
log = logging.getLogger(__name__)


def compute_file_hash(fname):
    """
    Return the SHA-512 hex digest of the file at fname.

    The file is read in fixed-size chunks so arbitrarily large files
    can be hashed without loading them fully into memory. The file is
    opened via safe_open_read_nofollow, which refuses to follow
    symlinks.
    """
    digest = hashlib.sha512()
    with safe_open_read_nofollow(fname, "rb") as stream:
        block = stream.read(4096)
        while block:
            digest.update(block)
            block = stream.read(4096)
    return digest.hexdigest()


def ensure_ext(fname, extensions):
"""Return true if the path passed in has specified extension"""
if not isinstance(extensions, Iterable):
Expand Down

0 comments on commit 48a280d

Please sign in to comment.