Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor codebase dictionary into CodeBase class #98

Merged
merged 8 commits into from
Aug 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 10 additions & 15 deletions bin/codebasin
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ import logging
import os
import sys

from codebasin import config, finder, report, util
from codebasin import CodeBase, config, finder, report, util
from codebasin.walkers.platform_mapper import PlatformMapper

version = "1.2.0"
Expand Down Expand Up @@ -107,14 +107,7 @@ def main():
# Determine the root directory based on where codebasin is run.
rootdir = os.path.realpath(os.getcwd())

# Set up a default codebase and configuration object.
codebase = {
"files": [],
"platforms": [],
"exclude_files": set(),
"exclude_patterns": args.excludes,
"rootdir": rootdir,
}
# Set up a default configuration object.
configuration = {}

# Load the analysis file if it exists.
Expand All @@ -132,8 +125,7 @@ def main():

if "codebase" in analysis_toml:
if "exclude" in analysis_toml["codebase"]:
excludes = analysis_toml["codebase"]["exclude"]
codebase["exclude_patterns"] += excludes
args.excludes += analysis_toml["codebase"]["exclude"]

for name in args.platforms:
if name not in analysis_toml["platform"].keys():
Expand All @@ -142,16 +134,20 @@ def main():
+ "does not exist in the configuration file.",
)

cmd_platforms = args.platforms.copy()
for name in analysis_toml["platform"].keys():
if args.platforms and name not in args.platforms:
if cmd_platforms and name not in cmd_platforms:
continue
if "commands" not in analysis_toml["platform"][name]:
raise ValueError(f"Missing 'commands' for platform {name}")
p = analysis_toml["platform"][name]["commands"]
db = config.load_database(p, rootdir)
codebase["platforms"].append(name)
args.platforms.append(name)
configuration.update({name: db})

# Construct a codebase object associated with the root directory.
codebase = CodeBase(rootdir, exclude_patterns=args.excludes)

# Parse the source tree, and determine source line associations.
# The trees and associations are housed in state.
state = finder.find(
Expand Down Expand Up @@ -180,8 +176,7 @@ def main():
if report_enabled("clustering"):
basename = os.path.basename(args.analysis_file)
filename = os.path.splitext(basename)[0]
platform_names = [p for p in codebase["platforms"]]
output_prefix = "-".join([filename] + platform_names)
output_prefix = "-".join([filename] + args.platforms)

clustering_output_name = output_prefix + "-dendrogram.png"
clustering = report.clustering(clustering_output_name, setmap)
Expand Down
114 changes: 114 additions & 0 deletions codebasin/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
# Copyright (C) 2019-2024 Intel Corporation
# SPDX-License-Identifier: BSD-3-Clause
import os
import shlex
import warnings
from collections.abc import Iterable
from pathlib import Path

import pathspec

import codebasin.source
import codebasin.walkers
Expand Down Expand Up @@ -123,3 +128,112 @@ def from_json(cls, instance: dict):
command=command,
output=output,
)


class CodeBase:
"""
A representation of all source files in the code base.

Attributes
----------
directories: list[str | os.PathLike[str]]
The set of source directories that make up the code base.

exclude_patterns: list[str]
A set of patterns describing source files excluded from the code base.
"""

def __init__(
self,
*directories: str | os.PathLike[str],
laserkelvin marked this conversation as resolved.
Show resolved Hide resolved
exclude_patterns: Iterable[str] = [],
):
"""
Raises
------
TypeError
If any directory in `directories` is not a path.
If `exclude_patterns` is not a list of strings.
"""
if not isinstance(exclude_patterns, list):
raise TypeError("'exclude_patterns' must be a list.")
if not all([isinstance(d, (str, os.PathLike)) for d in directories]):
raise TypeError(
"Each directory in 'directories' must be PathLike.",
)
if not all([isinstance(p, str) for p in exclude_patterns]):
raise TypeError(
"Each pattern in 'exclude_patterns' must be a string.",
)
self._directories = [Path(d).resolve() for d in directories]
self._excludes = exclude_patterns

def __repr__(self):
return (
f"CodeBase(directories={self.directories}, "
+ f"exclude_patterns={self.exclude_patterns})"
)

@property
def directories(self):
return [str(d) for d in self._directories]

@property
def exclude_patterns(self):
return self._excludes

def __contains__(self, path: os.PathLike) -> bool:
"""
Returns
-------
bool
True if `path` is a recognized source file in one of the code
base's listed directories and does not match any exclude
pattern(s).
"""
path = Path(path).resolve()

# Files that don't exist aren't part of the code base.
if not path.exists():
return False

# Directories cannot be source files.
if path.is_dir():
return False

# Files with unrecognized extensions are not source files.
if not codebasin.source.is_source_file(path):
return False

# Files outside of any directory are not in the code base.
# Store the root for evaluation of relative exclude paths later.
root = None
for directory in self.directories:
if path.is_relative_to(directory):
root = directory
break
if root is None:
return False

# Files matching an exclude pattern are not in the code base.
#
# Use GitIgnoreSpec to match git behavior in weird corner cases.
# Convert relative paths to match .gitignore subdirectory behavior.
spec = pathspec.GitIgnoreSpec.from_lines(self.exclude_patterns)
try:
relative_path = path.relative_to(root)
if spec.match_file(relative_path):
return False
except ValueError:
pass

return True

def __iter__(self):
"""
Iterate over all files in the code base by walking each directory.
"""
for directory in self.directories:
for path in Path(directory).rglob("*"):
Pennycook marked this conversation as resolved.
Show resolved Hide resolved
if self.__contains__(path):
laserkelvin marked this conversation as resolved.
Show resolved Hide resolved
yield str(path)
10 changes: 8 additions & 2 deletions codebasin/finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import collections
import logging
import os
from pathlib import Path

from codebasin import file_parser, platform, preprocessor, util
from codebasin.language import FileLanguage
Expand Down Expand Up @@ -140,13 +141,18 @@ def find(
lines to platforms.
"""

# Ensure rootdir is a string for compatibility with legacy code.
# TODO: Remove this once all other functionality is ported to Path.
if isinstance(rootdir, Path):
rootdir = str(rootdir)

# Build a tree for each unique file for all platforms.
state = ParserState(summarize_only)
for f in codebase["files"]:
for f in codebase:
state.insert_file(f)
for p in configuration:
for e in configuration[p]:
if e["file"] not in codebase["files"]:
if e["file"] not in codebase:
filename = e["file"]
if legacy_warnings:
log.warning(
Expand Down
6 changes: 1 addition & 5 deletions codebasin/walkers/exporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@

from codebasin import util
from codebasin.preprocessor import CodeNode, FileNode
from codebasin.walkers.platform_mapper import exclude
from codebasin.walkers.tree_walker import TreeWalker

log = logging.getLogger("codebasin")
Expand Down Expand Up @@ -38,10 +37,7 @@ def walk(self, state):
def _export_node(self, _filename, _node, _map):
# Do not export files that the user does not consider to be part of
# the codebase
if isinstance(_node, FileNode) and exclude(
_node.filename,
self.codebase,
):
if isinstance(_node, FileNode) and _node.filename not in self.codebase:
return

if isinstance(_node, CodeNode):
Expand Down
36 changes: 1 addition & 35 deletions codebasin/walkers/platform_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,44 +2,13 @@
# SPDX-License-Identifier: BSD-3-Clause

import logging
import os

import pathspec

from codebasin.preprocessor import CodeNode, FileNode
from codebasin.walkers.tree_mapper import TreeMapper

log = logging.getLogger("codebasin")


def exclude(filename, cb):
    """
    Determine whether a file should be excluded from the code base.

    Parameters
    ----------
    filename: str
        The path of the file to check.

    cb: dict
        The codebase dictionary. The keys read here are "exclude_files",
        "files", "rootdir" and "exclude_patterns".

    Returns
    -------
    bool
        True if `filename` is listed in "exclude_files", lies outside of
        "rootdir" without being listed in "files", or matches one of the
        "exclude_patterns". False otherwise.
    """
    # Always exclude files that were explicitly listed as excluded.
    if filename in cb["exclude_files"]:
        log.info(f"Excluding {filename}; matches 'exclude_files'.")
        return True

    # Only exclude files outside of the root directory if they weren't
    # explicitly listed as part of the codebase.
    #
    # Compare whole path components rather than string prefixes, so that a
    # sibling such as "/proj2" is not mistaken for a child of "/proj".
    path = os.path.realpath(filename)
    rootdir = cb["rootdir"]
    try:
        common = os.path.commonpath([rootdir, path])
        inside_root = common == os.path.normpath(rootdir)
    except ValueError:
        # Raised when the paths cannot be compared (e.g. different drives,
        # or a mix of absolute and relative paths); treat as outside.
        inside_root = False
    if not inside_root:
        if filename in cb["files"]:
            return False
        log.info(f"Excluding {filename}; outside of root directory.")
        return True

    # Exclude files matching an exclude pattern.
    #
    # Use GitIgnoreSpec to match git behavior in weird corner cases.
    # Convert relative paths to match .gitignore subdirectory behavior.
    spec = pathspec.GitIgnoreSpec.from_lines(cb["exclude_patterns"])
    rel = os.path.relpath(path, rootdir)
    if spec.match_file(rel):
        log.info(f"Excluding {filename}; matches exclude pattern.")
        return True

    return False


class PlatformMapper(TreeMapper):
"""
Specific TreeMapper that builds a mapping of nodes to platforms.
Expand All @@ -57,10 +26,7 @@ def _map_node(self, _node, _map):
"""
# Do not map files that the user does not consider to be part of
# the codebase
if isinstance(_node, FileNode) and exclude(
_node.filename,
self.codebase,
):
if isinstance(_node, FileNode) and _node.filename not in self.codebase:
return

if isinstance(_node, CodeNode):
Expand Down
23 changes: 20 additions & 3 deletions docs/source/analysis.rst
Original file line number Diff line number Diff line change
Expand Up @@ -36,17 +36,24 @@ The table's name is the name of the platform, and we can use any meaningful
string. The ``commands`` key tells CBI where to find the compilation database
for this platform.

.. important::

By default, ``codebasin`` searches the current working directory for source
files to include in its analysis. Since we'll be running in the ``src``
directory, we need to specify the ``commands`` paths relative to the
``src`` directory or as absolute paths.

In our example, we have two platforms that we're calling "cpu" and "gpu",
and our build directories are called ``build-cpu`` and ``build-gpu``, so
our platform definitions should look like this:

.. code-block:: toml

[platform.cpu]
commands = "build-cpu/compile_commands.json"
commands = "../build-cpu/compile_commands.json"

[platform.gpu]
commands = "build-gpu/compile_commands.json"
commands = "../build-gpu/compile_commands.json"

.. warning::
Platform names are case sensitive! The names "cpu" and "CPU" would refer to
Expand All @@ -56,7 +63,8 @@ our platform definitions should look like this:
Running ``codebasin``
#####################

Running ``codebasin`` with this analysis file gives the following output:
Running ``codebasin`` in the ``src`` directory with this analysis file gives
the following output:

.. code-block:: text
:emphasize-lines: 4,5,6,7,9
Expand Down Expand Up @@ -86,6 +94,15 @@ used only by the GPU compilation, and 17 lines of code shared by both
platforms. Plugging these numbers into the equation for code divergence gives
0.45.

.. caution::
If we had run ``codebasin`` in the parent directory, everything in the
``src``, ``build-cpu`` and ``build-gpu`` directories would have been
included in the analysis. For our sample code base, this would have
resulted in over 2000 lines of code being identified as unused! Why so
many? CMake generates multiple ``*.cpp`` files, which it uses as part of
the build process. ``codebasin`` will analyze such files unless we tell it
not to (more on that later).


Filtering Platforms
###################
Expand Down
Loading
Loading