Skip to content

Commit

Permalink
Merge pull request #116 from Pennycook/remove-merge-duplicates
Browse files Browse the repository at this point in the history
Remove functionality to merge duplicates
  • Loading branch information
Pennycook authored Oct 9, 2024
2 parents 67391c4 + e84e3df commit 48a280d
Show file tree
Hide file tree
Showing 2 changed files with 1 addition and 69 deletions.
59 changes: 1 addition & 58 deletions codebasin/finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,24 +10,13 @@
import os
from pathlib import Path

from codebasin import file_parser, platform, preprocessor, util
from codebasin import file_parser, platform, preprocessor
from codebasin.language import FileLanguage
from codebasin.walkers.tree_associator import TreeAssociator

log = logging.getLogger(__name__)


class FileInfo:
    """
    Lightweight record of a file's identity: its path plus a size and
    sha digest that may be filled in lazily during duplicate detection.
    """

    def __init__(self, path, size=None, sha=None):
        # size/sha default to None so they are only computed when a
        # potential duplicate with the same basename is encountered.
        self.path, self.size, self.sha = path, size, sha


class ParserState:
"""
Keeps track of the overall state of the parser.
Expand All @@ -41,56 +30,12 @@ def __init__(self, summarize_only):
self.maps = {}
self.langs = {}
self.summarize_only = summarize_only
self.fileinfo = collections.defaultdict(list)
self.merge_duplicates = False

def _map_filename(self, fn):
"""
Map the real filename to an internal filename used by the parser.
Enables duplicate files to be merged.
"""
if not self.merge_duplicates:
return fn

# The first time we encounter a filename, store limited info
bn = os.path.basename(fn)
if bn not in self.fileinfo:
self.fileinfo[bn] = [FileInfo(fn)]
return fn

# If filename has been encountered, check for matching size/hash
size = os.path.getsize(fn)
sha = None
for fi in self.fileinfo[bn]:
# Fill in missing size information
if fi.size is None:
fi.size = os.path.getsize(fi.path)

# If sizes don't match, the file is different
if fi.size != size:
continue

# Fill in missing hash information
if sha is None:
sha = util.compute_file_hash(fn)
if fi.sha is None:
fi.sha = util.compute_file_hash(fi.path)

# Use hash to determine if file is duplicate or not
if fi.sha != sha:
continue
return fi.path

# If no match, this is the first time encountering this file
self.fileinfo[bn].append(FileInfo(fn, size, sha))
return fn

def insert_file(self, fn, language=None):
"""
Build a new tree for a source file, and create an association
map for it.
"""
fn = self._map_filename(fn)
if fn not in self.trees:
parser = file_parser.FileParser(fn)
self.trees[fn] = parser.parse_file(
Expand All @@ -113,7 +58,6 @@ def get_tree(self, fn):
"""
Return the SourceTree associated with a filename
"""
fn = self._map_filename(fn)
if fn not in self.trees:
return None
return self.trees[fn]
Expand All @@ -122,7 +66,6 @@ def get_map(self, fn):
"""
Return the NodeAssociationMap associated with a filename
"""
fn = self._map_filename(fn)
if fn not in self.maps:
return None
return self.maps[fn]
Expand Down
11 changes: 0 additions & 11 deletions codebasin/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
- Checking paths
"""

import hashlib
import json
import logging
import os
Expand All @@ -23,16 +22,6 @@
log = logging.getLogger(__name__)


def compute_file_hash(fname):
    """
    Return the SHA-512 hex digest of the file at fname.

    The file is read in fixed-size chunks so arbitrarily large files
    can be hashed without loading them fully into memory. The file is
    opened via safe_open_read_nofollow, which refuses to follow
    symlinks.
    """
    digest = hashlib.sha512()
    with safe_open_read_nofollow(fname, "rb") as stream:
        block = stream.read(4096)
        while block:
            digest.update(block)
            block = stream.read(4096)
    return digest.hexdigest()


def ensure_ext(fname, extensions):
"""Return true if the path passed in has specified extension"""
if not isinstance(extensions, Iterable):
Expand Down

0 comments on commit 48a280d

Please sign in to comment.