From a7dff7e93401f8da5f2c4a868fb96175b5d2b377 Mon Sep 17 00:00:00 2001 From: Jason Sewall Date: Tue, 5 Nov 2019 10:22:39 -0800 Subject: [PATCH 01/49] Add test for complex C code with continuations --- tests/comments/__init__.py | 2 ++ tests/comments/continuation.cpp | 55 +++++++++++++++++++++++++++++++++ tests/comments/test_comments.py | 22 +++++++++++++ 3 files changed, 79 insertions(+) create mode 100644 tests/comments/__init__.py create mode 100644 tests/comments/continuation.cpp create mode 100644 tests/comments/test_comments.py diff --git a/tests/comments/__init__.py b/tests/comments/__init__.py new file mode 100644 index 0000000..93af6d4 --- /dev/null +++ b/tests/comments/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2019 Intel Corporation +# SPDX-License-Identifier: BSD-3-Clause diff --git a/tests/comments/continuation.cpp b/tests/comments/continuation.cpp new file mode 100644 index 0000000..9101835 --- /dev/null +++ b/tests/comments/continuation.cpp @@ -0,0 +1,55 @@ +// Copyright (C) 2019-2020 Intel Corporation +// SPDX-License-Identifier: BSD-3-Clause + +int i = \ + 5; // comment \ + lines \ + more + + +int x = \ + 1 /* now what \ + comment \ + // \ + */ +2/\ +* hahaha */+3; + +"long // - - - string \ +and \ +\ +\ +stuff " + + \ char w[] = +"confusing \"\ + string \" \n\ +\" \" \\ \/ /* \* */ "; \ +"long - - - string \ +and /* \" */ \ + \ + \ +stuff " + +/* big block comment +** and so on +** and so on too +*/ + +'"' + +''//what about this?\ +d' + +'/' + +"'\"'" + +int foo(); /\ +* hahahaha *\ +/ + +#warning Dangerous don't do this +#warning "This is more safe" + +/* "Strings 'r' // Fun! 
*\ +/ diff --git a/tests/comments/test_comments.py b/tests/comments/test_comments.py new file mode 100644 index 0000000..5caf55a --- /dev/null +++ b/tests/comments/test_comments.py @@ -0,0 +1,22 @@ +# Copyright (C) 2019 Intel Corporation +# SPDX-License-Identifier: BSD-3-Clause + +import unittest +import logging +import os +from codebasin import preprocessor, file_parser + +class TestExampleFile(unittest.TestCase): + """ + Test handling of comments + """ + + def test_c_comments(self): + rootdir = "./tests/comments/" + parser = file_parser.FileParser(os.path.join(rootdir, "continuation.cpp")) + + tree = parser.parse_file() + self.assertEqual(tree.root.children[0].num_lines, 25) + +if __name__ == '__main__': + unittest.main() From ed76d521942a2beedf08daa6cc0c988d5af710ff Mon Sep 17 00:00:00 2001 From: Jason Sewall Date: Thu, 7 Nov 2019 13:53:15 -0800 Subject: [PATCH 02/49] Add new c preprocessor stages This breaks fortran slightly and needs to be better commented, etc, but I wanted to push this so I could get feedback before going too far --- codebasin/c_source.py | 194 +++++++++++++++++++++++++++++++++++++++ codebasin/file_parser.py | 145 ++++++++--------------------- 2 files changed, 232 insertions(+), 107 deletions(-) create mode 100644 codebasin/c_source.py diff --git a/codebasin/c_source.py b/codebasin/c_source.py new file mode 100644 index 0000000..8aeefe9 --- /dev/null +++ b/codebasin/c_source.py @@ -0,0 +1,194 @@ +# Copyright (C) 2019 Intel Corporation +# SPDX-License-Identifier: BSD-3-Clause +""" +Contains classes and functions for stripping comments and whitespace from C/C++ files +""" + +global whitespace_dict +whitespace_dict = dict.fromkeys(' \t\n\r\x0b\x0c\x1c\x1d\x1e\x1f\x85\xa0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000') + +def is_whitespace(c): + return c in whitespace_dict + +class one_space_line(object): + def __init__(self): + self.parts = [] + self.trailing_space = False + def 
append_char(self, c): + if not is_whitespace(c): + self.parts.append(c) + self.trailing_space = False + else: + if not self.trailing_space: + self.parts.append(' ') + self.trailing_space = True + def append_space(self): + if not self.trailing_space: + self.parts.append(' ') + self.trailing_space = True + def append_nonspace(self, c): + self.parts.append(c) + self.trailing_space = False + def join(self, other): + if len(other.parts) > 0: + if other.parts[0] == ' ' and self.trailing_space: + self.parts += other.parts[1:] + else: + self.parts += other.parts[:] + self.trailing_space = other.trailing_space + def is_blank(self): + return len(self.parts) == 0 or ( len(self.parts) == 1 and self.parts[0] == ' ' ) + def flush(self): + res= ''.join(self.parts) + self.__init__() + return res + +class c_cleaner(object): + def __init__(self, outbuf): + self.state = ["NO_COMMENT"] + self.outbuf = outbuf + def logical_newline(self): + if self.state[-1] == "IN_INLINE_COMMENT": + self.state.pop() + assert self.state == ["NO_COMMENT"] + self.outbuf.append_space() + elif self.state[-1] == "FOUND_SLASH": + self.state.pop() + assert self.state == ["NO_COMMENT"] + self.outbuf.append_nonspace('/') + elif self.state[-1] == "SINGLE_QUOTATION": + # This probably should give a warning + self.state.pop() + assert self.state == ["NO_COMMENT"] + elif self.state[-1] == "DOUBLE_QUOTATION": + # This probably should give a warning + self.state.pop() + assert self.state == ["NO_COMMENT"] + elif self.state[-1] == "IN_BLOCK_COMMENT_FOUND_STAR": + self.state.pop() + assert self.state[-1] == "IN_BLOCK_COMMENT" + def process(self, line, start, end): + pos = start + while pos < end: + if self.state[-1] == "NO_COMMENT": + if line[pos] == '\\': + self.state.append("ESCAPING") + self.outbuf.append_nonspace(line[pos]) + elif line[pos] == '/': + self.state.append("FOUND_SLASH") + elif line[pos] == '"': + self.state.append("DOUBLE_QUOTATION") + self.outbuf.append_nonspace(line[pos]) + elif line[pos] == '\'': + 
self.state.append("SINGLE_QUOTATION") + self.outbuf.append_nonspace(line[pos]) + else: + self.outbuf.append_char(line[pos]) + elif self.state[-1] == "DOUBLE_QUOTATION": + if line[pos] == '\\': + self.state.append("ESCAPING") + self.outbuf.append_nonspace(line[pos]) + elif line[pos] == '"': + self.state.pop() + assert self.state == ["NO_COMMENT"] + self.outbuf.append_nonspace(line[pos]) + else: + self.outbuf.append_nonspace(line[pos]) + elif self.state[-1] == "SINGLE_QUOTATION": + if line[pos] == '\\': + self.state.append("ESCAPING") + self.outbuf.append_nonspace(line[pos]) + elif line[pos] == '/': + self.state.append("FOUND_SLASH") + elif line[pos] == '\'': + self.state.pop() + assert self.state == ["NO_COMMENT"] + self.outbuf.append_nonspace(line[pos]) + else: + self.outbuf.append_nonspace(line[pos]) + elif self.state[-1] == "FOUND_SLASH": + if line[pos] == '/': + self.state.pop() + self.state.append("IN_INLINE_COMMENT") + elif line[pos] == '*': + self.state.pop() + self.state.append("IN_BLOCK_COMMENT") + else: + self.state.pop() + self.outbuf.append_char('/') + pos -= 1 + elif self.state[-1] == "IN_BLOCK_COMMENT": + if line[pos] == '*': + self.state.append("IN_BLOCK_COMMENT_FOUND_STAR") + elif self.state[-1] == "IN_BLOCK_COMMENT_FOUND_STAR": + if line[pos] == '/': + self.state.pop() + assert self.state[-1] == "IN_BLOCK_COMMENT" + self.state.pop() + assert self.state == ["NO_COMMENT"] + self.outbuf.append_space() + elif line[pos] != '*': + self.state.pop() + assert self.state[-1] == "IN_BLOCK_COMMENT" + elif self.state[-1] == "ESCAPING": + self.outbuf.append_nonspace(line[pos]) + self.state.pop() + elif self.state[-1] == "IN_INLINE_COMMENT": + return + pos += 1 + +def c_file_source(fp): + + current_physical_line = one_space_line() + cleaner = c_cleaner(current_physical_line) + + current_logical_line = one_space_line() + + current_physical_start = 1 + total_sloc = 0 + local_sloc = 0 + + physical_line_num = 0 + for (physical_line_num, line) in enumerate(fp, 
start=1): + current_physical_line.__init__() + end = len(line) + if line[-1] == '\n': + end -= 1 + else: + if end > 0 and line[end-1] == '\\': + raise RuntimeError("file seems to end in \\ with no newline!") + + if end > 0 and line[end-1] == '\\': + continued = True + end -= 1 + else: + continued = False + cleaner.process(line, 0, end) + if not continued: + cleaner.logical_newline() + + if not current_physical_line.is_blank(): + local_sloc += 1 + + current_logical_line.join(current_physical_line) + + if not continued: + if not current_logical_line.is_blank(): + yield ((current_physical_start, physical_line_num+1), local_sloc, current_logical_line.flush()) + else: + current_logical_line.__init__() + assert local_sloc == 0 + + current_physical_start = physical_line_num + 1 + total_sloc += local_sloc + local_sloc = 0 + + total_physical_lines = physical_line_num + + if not current_logical_line.is_blank(): + yield ((current_physical_start, physical_line_num+1), local_sloc, current_logical_line.flush()) + + total_sloc += local_sloc + assert cleaner.state == ["NO_COMMENT"] + + return (total_sloc, total_physical_lines) diff --git a/codebasin/file_parser.py b/codebasin/file_parser.py index a22ddc2..180aace 100644 --- a/codebasin/file_parser.py +++ b/codebasin/file_parser.py @@ -9,6 +9,7 @@ from . import preprocessor # pylint : disable=no-name-in-module +from codebasin.c_source import c_file_source class LineGroup: """ @@ -30,26 +31,19 @@ def empty(self): return False return True - def add_line(self, line_num, is_countable=False): + def add_line(self, phys_int, sloc_count): """ Add a line to this line group. Update the extent appropriately, and if it's a countable line, add it to the line count. 
""" - if self.start_line == -1: - self.start_line = line_num - - self.end_line = line_num - - if self.start_line == -1 or line_num < self.start_line: - self.start_line = line_num - - if line_num > self.end_line: - self.end_line = line_num + if self.start_line == -1 or phys_int[0] < self.start_line: + self.start_line = phys_int[0] - if is_countable: - self.line_count += 1 + if phys_int[1]-1 > self.end_line: + self.end_line = phys_int[1]-1 + self.line_count += sloc_count def reset(self): """ Reset the countable group @@ -58,13 +52,12 @@ def reset(self): self.start_line = -1 self.end_line = -1 - def merge(self, line_group, count=False): + def merge(self, line_group): """ Merge another line group into this line group, and reset the other group. """ - if count: - self.line_count += line_group.line_count + self.line_count += line_group.line_count if self.start_line == -1: self.start_line = line_group.start_line @@ -85,7 +78,6 @@ class FileParser: def __init__(self, _filename): self._filename = _filename - self.full_line = '' split = splitext(_filename) if len(split) == 2: @@ -94,53 +86,22 @@ def __init__(self, _filename): self._file_extension = None @staticmethod - def line_info(line): - """ - Determine if the input line is a directive by checking if the - first by looking for a '#' as the first non-whitespace - character. Also determine if the last character before a new - line is a line continuation character '\'. - - Return a (directive, line_continue) tuple. - """ - - directive = False - line_continue = False - - for c in line: - if c == '#': - directive = True - break - elif c not in [' ', '\t']: - break - - if line.rstrip("\n\r")[-1:] == '\\': - line_continue = True - - return (directive, line_continue) - - def handle_directive(self, out_tree, line_num, comment_cleaner, groups): + def handle_directive(out_tree, groups, phys_int, sloc, logical_line): """ Handle inserting code and directive nodes, where appropriate. 
Update the file group, and reset the code and directive groups. """ # We will actually use this directive, if it is not empty - self.full_line = comment_cleaner.strip_comments(self.full_line) - if self.full_line.strip(): - # We need to finalize the previously started - # CodeNode (if there was one) before processing - # this DirectiveNode - if not groups['code'].empty(): - groups['code'].add_line(line_num - 1) - self.insert_code_node(out_tree, groups['code']) + # We need to finalize the previously started + # CodeNode (if there was one) before processing + # this DirectiveNode + if not groups['code'].empty(): + FileParser.insert_code_node(out_tree, groups['code']) + groups['file'].merge(groups['code']) - groups['file'].merge(groups['code']) + FileParser.insert_directive_node(out_tree, groups['directive'], logical_line) - self.insert_directive_node(out_tree, groups['directive']) - - groups['file'].merge(groups['directive']) - else: - groups['code'].merge(groups['directive']) + groups['file'].merge(groups['directive']) @staticmethod def insert_code_node(tree, line_group): @@ -151,13 +112,14 @@ def insert_code_node(tree, line_group): line_group.start_line, line_group.end_line, line_group.line_count) tree.insert(new_node) - def insert_directive_node(self, tree, line_group): + @staticmethod + def insert_directive_node(tree, line_group, logical_line): """ Build a directive node by parsing a directive line, and insert a new directive node into the tree. """ new_node = preprocessor.DirectiveParser(preprocessor.Lexer( - self.full_line, line_group.start_line).tokenize()).parse() + logical_line, line_group.start_line).tokenize()).parse() new_node.start_line = line_group.start_line new_node.end_line = line_group.end_line new_node.num_lines = line_group.line_count @@ -169,12 +131,6 @@ def parse_file(self): representing this file, and return it. 
""" - file_comment_cleaner = preprocessor.CommentCleaner(self._file_extension) - if file_comment_cleaner.filetype == 'c': - cpp_comment_cleaner = file_comment_cleaner - else: - cpp_comment_cleaner = preprocessor.CommentCleaner('.c') - out_tree = preprocessor.SourceTree(self._filename) with open(self._filename, mode='r', errors='replace') as source_file: previous_continue = False @@ -186,56 +142,31 @@ def parse_file(self): groups['file'].start_line = 1 - lines = source_file.readlines() - for (line_num, line) in enumerate(lines, 1): - # Determine if this line starts with a # (directive) - # and/or ends with a \ (line continuation) - (in_directive, continue_line) = self.line_info(line) - - # Only follow continuation for directives - if previous_continue or in_directive: - - # Add this into the directive lines, even if it - # might not be a directive we count - groups['directive'].add_line(line_num, True) + c_source = c_file_source(source_file) + try: + while True: + (phys_int, local_sloc, logical_line) = next(c_source) + in_directive = logical_line[0] == "#" or logical_line[0] == ' ' and logical_line[1] == '#' + # Only follow continuation for directives + if in_directive: + # Add this into the directive lines, even if it + # might not be a directive we count + groups['directive'].add_line(phys_int, local_sloc) - # If this line starts a new directive, flush the - # line buffer - if in_directive and not previous_continue: - self.full_line = '' + FileParser.handle_directive(out_tree, groups, phys_int, local_sloc, logical_line) - previous_continue = continue_line - - # If this line also contains a continuation - # character - if continue_line: - self.full_line += line.rstrip("\\\n\r") - # If this line ends a previously continued line + # FallBack is that this line is a simple code line. 
else: - self.full_line += line.rstrip("\n\r") - - self.handle_directive(out_tree, line_num, cpp_comment_cleaner, - groups) + groups['code'].add_line(phys_int, local_sloc) + except StopIteration as it: + total_sloc, physical_loc = it.value - # FallBack is that this line is a simple code line. - else: - previous_continue = False - - # If the line isn't empty after stripping comments, - # count it as code - if file_comment_cleaner.strip_comments(line[0:-1]).strip(): - groups['code'].add_line(line_num, True) - else: - groups['code'].add_line(line_num) - - # Insert any code lines left at the end of the file if not groups['code'].empty(): - groups['code'].add_line(len(lines)) self.insert_code_node(out_tree, groups['code']) - groups['file'].merge(groups['code']) - groups['file'].add_line(len(lines)) + + groups['file'].add_line((1, physical_loc-1), total_sloc) out_tree.root.num_lines = groups['file'].end_line out_tree.root.total_sloc = groups['file'].line_count return out_tree From 45e5f15e958072cb00d541fde4128eb8822fa397 Mon Sep 17 00:00:00 2001 From: Jason Sewall Date: Fri, 8 Nov 2019 08:42:09 -0800 Subject: [PATCH 03/49] Correct line counting test --- codebasin/file_parser.py | 7 +++---- tests/comments/test_comments.py | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/codebasin/file_parser.py b/codebasin/file_parser.py index 180aace..2da9d82 100644 --- a/codebasin/file_parser.py +++ b/codebasin/file_parser.py @@ -86,7 +86,7 @@ def __init__(self, _filename): self._file_extension = None @staticmethod - def handle_directive(out_tree, groups, phys_int, sloc, logical_line): + def handle_directive(out_tree, groups, logical_line): """ Handle inserting code and directive nodes, where appropriate. Update the file group, and reset the code and directive groups. 
@@ -153,7 +153,7 @@ def parse_file(self): # might not be a directive we count groups['directive'].add_line(phys_int, local_sloc) - FileParser.handle_directive(out_tree, groups, phys_int, local_sloc, logical_line) + FileParser.handle_directive(out_tree, groups, logical_line) # FallBack is that this line is a simple code line. else: @@ -162,11 +162,10 @@ def parse_file(self): total_sloc, physical_loc = it.value if not groups['code'].empty(): + groups['code'].add_line((groups['code'].start_line, physical_loc-1), 0) self.insert_code_node(out_tree, groups['code']) groups['file'].merge(groups['code']) - - groups['file'].add_line((1, physical_loc-1), total_sloc) out_tree.root.num_lines = groups['file'].end_line out_tree.root.total_sloc = groups['file'].line_count return out_tree diff --git a/tests/comments/test_comments.py b/tests/comments/test_comments.py index 5caf55a..c9608cd 100644 --- a/tests/comments/test_comments.py +++ b/tests/comments/test_comments.py @@ -16,7 +16,7 @@ def test_c_comments(self): parser = file_parser.FileParser(os.path.join(rootdir, "continuation.cpp")) tree = parser.parse_file() - self.assertEqual(tree.root.children[0].num_lines, 25) + self.assertEqual(tree.root.total_sloc, 25) if __name__ == '__main__': unittest.main() From 06eb6829e725f2c99c330b5c7cd1723172d31512 Mon Sep 17 00:00:00 2001 From: Jason Sewall Date: Fri, 8 Nov 2019 09:07:10 -0800 Subject: [PATCH 04/49] Add standalone sloc_translate utility --- sloc_translate.py | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100755 sloc_translate.py diff --git a/sloc_translate.py b/sloc_translate.py new file mode 100755 index 0000000..73d52b4 --- /dev/null +++ b/sloc_translate.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python3.6 +# Copyright (C) 2019-2020 Intel Corporation +# SPDX-License-Identifier: BSD-3-Clause + +from codebasin.c_source import c_file_source +import sys +import re + +def file_sloc(path, verbose=False): + with open(path, mode='r', 
errors='replace') as source_file: + walker = c_file_source(source_file) + try: + while True: + (interval, sloc, line) = next(walker) + if verbose: + print(f"{path} [{interval[0]}, {interval[1]}) ({sloc}): {line}") + except StopIteration as it: + total_sloc, physical_loc = it.value + + return (path, total_sloc, physical_loc) + +def walk_sloc(root, regexp, verbose=False): + for root, dirs, files in os.walk(root): + for f in files: + full_path = os.path.join(root, f) + if regexp.match(full_path): + try: + (filename, total_sloc, physical_loc) = file_sloc(full_path) + print(f"{filename}, {total_sloc}, {physical_loc}") + except FileNotFoundError: + pass + + +if __name__ == '__main__': + filename = sys.argv[1] + (filename, total_sloc, physical_loc) = file_sloc(filename, verbose=True) + print(f"{filename}, {total_sloc}, {physical_loc}") + +# walk_sloc(sys.argv[1], re.compile(sys.argv[2])) From b37901ae44e14f2ef153e844a301f8ee64aee4e5 Mon Sep 17 00:00:00 2001 From: Jason Sewall Date: Fri, 8 Nov 2019 09:07:28 -0800 Subject: [PATCH 05/49] Rename NO_COMMENT to TOPLEVEL --- codebasin/c_source.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/codebasin/c_source.py b/codebasin/c_source.py index 8aeefe9..a292b39 100644 --- a/codebasin/c_source.py +++ b/codebasin/c_source.py @@ -45,32 +45,32 @@ def flush(self): class c_cleaner(object): def __init__(self, outbuf): - self.state = ["NO_COMMENT"] + self.state = ["TOPLEVEL"] self.outbuf = outbuf def logical_newline(self): if self.state[-1] == "IN_INLINE_COMMENT": self.state.pop() - assert self.state == ["NO_COMMENT"] + assert self.state == ["TOPLEVEL"] self.outbuf.append_space() elif self.state[-1] == "FOUND_SLASH": self.state.pop() - assert self.state == ["NO_COMMENT"] + assert self.state == ["TOPLEVEL"] self.outbuf.append_nonspace('/') elif self.state[-1] == "SINGLE_QUOTATION": # This probably should give a warning self.state.pop() - assert self.state == ["NO_COMMENT"] + assert self.state == 
["TOPLEVEL"] elif self.state[-1] == "DOUBLE_QUOTATION": # This probably should give a warning self.state.pop() - assert self.state == ["NO_COMMENT"] + assert self.state == ["TOPLEVEL"] elif self.state[-1] == "IN_BLOCK_COMMENT_FOUND_STAR": self.state.pop() assert self.state[-1] == "IN_BLOCK_COMMENT" def process(self, line, start, end): pos = start while pos < end: - if self.state[-1] == "NO_COMMENT": + if self.state[-1] == "TOPLEVEL": if line[pos] == '\\': self.state.append("ESCAPING") self.outbuf.append_nonspace(line[pos]) @@ -90,7 +90,7 @@ def process(self, line, start, end): self.outbuf.append_nonspace(line[pos]) elif line[pos] == '"': self.state.pop() - assert self.state == ["NO_COMMENT"] + assert self.state == ["TOPLEVEL"] self.outbuf.append_nonspace(line[pos]) else: self.outbuf.append_nonspace(line[pos]) @@ -102,7 +102,7 @@ def process(self, line, start, end): self.state.append("FOUND_SLASH") elif line[pos] == '\'': self.state.pop() - assert self.state == ["NO_COMMENT"] + assert self.state == ["TOPLEVEL"] self.outbuf.append_nonspace(line[pos]) else: self.outbuf.append_nonspace(line[pos]) @@ -125,7 +125,7 @@ def process(self, line, start, end): self.state.pop() assert self.state[-1] == "IN_BLOCK_COMMENT" self.state.pop() - assert self.state == ["NO_COMMENT"] + assert self.state == ["TOPLEVEL"] self.outbuf.append_space() elif line[pos] != '*': self.state.pop() @@ -189,6 +189,6 @@ def c_file_source(fp): yield ((current_physical_start, physical_line_num+1), local_sloc, current_logical_line.flush()) total_sloc += local_sloc - assert cleaner.state == ["NO_COMMENT"] + assert cleaner.state == ["TOPLEVEL"] return (total_sloc, total_physical_lines) From 1a53e093f100de96cc262dbe8f678dca2fe5dbc9 Mon Sep 17 00:00:00 2001 From: Jason Sewall Date: Fri, 8 Nov 2019 11:37:38 -0800 Subject: [PATCH 06/49] Upgrade c_cleaner directive detection --- codebasin/c_source.py | 62 ++++++++++++++++++++++++++++------------ codebasin/file_parser.py | 5 ++-- sloc_translate.py | 4 +-- 3 files 
changed, 48 insertions(+), 23 deletions(-) diff --git a/codebasin/c_source.py b/codebasin/c_source.py index a292b39..041b3e9 100644 --- a/codebasin/c_source.py +++ b/codebasin/c_source.py @@ -36,8 +36,18 @@ def join(self, other): else: self.parts += other.parts[:] self.trailing_space = other.trailing_space - def is_blank(self): - return len(self.parts) == 0 or ( len(self.parts) == 1 and self.parts[0] == ' ' ) + def category(self): + res = "SRC_NONBLANK" + if len(self.parts) == 0: + res = "BLANK" + elif len(self.parts) == 1: + if self.parts[0] == ' ': + res = "BLANK" + elif self.parts[0] == '#': + res = "CPP_DIRECTIVE" + elif ( self.parts[0] == ' ' and self.parts[1] == '#' ) or self.parts[0] == '#': + res = "CPP_DIRECTIVE" + return res def flush(self): res= ''.join(self.parts) self.__init__() @@ -49,28 +59,43 @@ def __init__(self, outbuf): self.outbuf = outbuf def logical_newline(self): if self.state[-1] == "IN_INLINE_COMMENT": - self.state.pop() - assert self.state == ["TOPLEVEL"] + self.state = ["TOPLEVEL"] self.outbuf.append_space() elif self.state[-1] == "FOUND_SLASH": - self.state.pop() - assert self.state == ["TOPLEVEL"] + self.state = ["TOPLEVEL"] self.outbuf.append_nonspace('/') elif self.state[-1] == "SINGLE_QUOTATION": # This probably should give a warning - self.state.pop() - assert self.state == ["TOPLEVEL"] + self.state = ["TOPLEVEL"] elif self.state[-1] == "DOUBLE_QUOTATION": # This probably should give a warning - self.state.pop() - assert self.state == ["TOPLEVEL"] + self.state == ["TOPLEVEL"] elif self.state[-1] == "IN_BLOCK_COMMENT_FOUND_STAR": self.state.pop() assert self.state[-1] == "IN_BLOCK_COMMENT" + elif self.state[-1] == "CPP_DIRECTIVE": + self.state = ["TOPLEVEL"] def process(self, line, start, end): pos = start while pos < end: if self.state[-1] == "TOPLEVEL": + if line[pos] == '\\': + self.state.append("ESCAPING") + self.outbuf.append_nonspace(line[pos]) + elif line[pos] == '/': + self.state.append("FOUND_SLASH") + elif line[pos] == '"': 
+ self.state.append("DOUBLE_QUOTATION") + self.outbuf.append_nonspace(line[pos]) + elif line[pos] == '\'': + self.state.append("SINGLE_QUOTATION") + self.outbuf.append_nonspace(line[pos]) + elif line[pos] == '#' and self.outbuf.category() == "BLANK": + self.state.append("CPP_DIRECTIVE") + self.outbuf.append_nonspace(line[pos]) + else: + self.outbuf.append_char(line[pos]) + elif self.state[-1] == "CPP_DIRECTIVE": if line[pos] == '\\': self.state.append("ESCAPING") self.outbuf.append_nonspace(line[pos]) @@ -90,7 +115,6 @@ def process(self, line, start, end): self.outbuf.append_nonspace(line[pos]) elif line[pos] == '"': self.state.pop() - assert self.state == ["TOPLEVEL"] self.outbuf.append_nonspace(line[pos]) else: self.outbuf.append_nonspace(line[pos]) @@ -102,7 +126,6 @@ def process(self, line, start, end): self.state.append("FOUND_SLASH") elif line[pos] == '\'': self.state.pop() - assert self.state == ["TOPLEVEL"] self.outbuf.append_nonspace(line[pos]) else: self.outbuf.append_nonspace(line[pos]) @@ -125,7 +148,6 @@ def process(self, line, start, end): self.state.pop() assert self.state[-1] == "IN_BLOCK_COMMENT" self.state.pop() - assert self.state == ["TOPLEVEL"] self.outbuf.append_space() elif line[pos] != '*': self.state.pop() @@ -135,6 +157,8 @@ def process(self, line, start, end): self.state.pop() elif self.state[-1] == "IN_INLINE_COMMENT": return + else: + assert None pos += 1 def c_file_source(fp): @@ -167,14 +191,15 @@ def c_file_source(fp): if not continued: cleaner.logical_newline() - if not current_physical_line.is_blank(): + if not current_physical_line.category() == "BLANK": local_sloc += 1 current_logical_line.join(current_physical_line) if not continued: - if not current_logical_line.is_blank(): - yield ((current_physical_start, physical_line_num+1), local_sloc, current_logical_line.flush()) + line_cat = current_logical_line.category() + if line_cat != "BLANK": + yield ((current_physical_start, physical_line_num+1), local_sloc, 
current_logical_line.flush(), line_cat) else: current_logical_line.__init__() assert local_sloc == 0 @@ -185,8 +210,9 @@ def c_file_source(fp): total_physical_lines = physical_line_num - if not current_logical_line.is_blank(): - yield ((current_physical_start, physical_line_num+1), local_sloc, current_logical_line.flush()) + line_cat = current_logical_line.category() + if line_cat != "BLANK": + yield ((current_physical_start, physical_line_num+1), local_sloc, current_logical_line.flush(), line_cat) total_sloc += local_sloc assert cleaner.state == ["TOPLEVEL"] diff --git a/codebasin/file_parser.py b/codebasin/file_parser.py index 2da9d82..61cd085 100644 --- a/codebasin/file_parser.py +++ b/codebasin/file_parser.py @@ -145,10 +145,9 @@ def parse_file(self): c_source = c_file_source(source_file) try: while True: - (phys_int, local_sloc, logical_line) = next(c_source) - in_directive = logical_line[0] == "#" or logical_line[0] == ' ' and logical_line[1] == '#' + (phys_int, local_sloc, logical_line, line_cat) = next(c_source) # Only follow continuation for directives - if in_directive: + if line_cat == 'CPP_DIRECTIVE': # Add this into the directive lines, even if it # might not be a directive we count groups['directive'].add_line(phys_int, local_sloc) diff --git a/sloc_translate.py b/sloc_translate.py index 73d52b4..b7b63a7 100755 --- a/sloc_translate.py +++ b/sloc_translate.py @@ -11,9 +11,9 @@ def file_sloc(path, verbose=False): walker = c_file_source(source_file) try: while True: - (interval, sloc, line) = next(walker) + (interval, sloc, line, line_cat) = next(walker) if verbose: - print(f"{path} [{interval[0]}, {interval[1]}) ({sloc}): {line}") + print(f"{path} [{interval[0]}, {interval[1]}) ({sloc}): {line} {line_cat}") except StopIteration as it: total_sloc, physical_loc = it.value From b13489c8a36830834e6d0571e1b1988bc3ca3ee2 Mon Sep 17 00:00:00 2001 From: Jason Sewall Date: Fri, 8 Nov 2019 12:02:44 -0800 Subject: [PATCH 07/49] Use an iterator in c_cleaner.process 
Requires a 'put back' functionality --- codebasin/c_source.py | 99 +++++++++++++++++++++++++------------------ 1 file changed, 58 insertions(+), 41 deletions(-) diff --git a/codebasin/c_source.py b/codebasin/c_source.py index 041b3e9..e461c72 100644 --- a/codebasin/c_source.py +++ b/codebasin/c_source.py @@ -4,6 +4,8 @@ Contains classes and functions for stripping comments and whitespace from C/C++ files """ +import itertools as it + global whitespace_dict whitespace_dict = dict.fromkeys(' \t\n\r\x0b\x0c\x1c\x1d\x1e\x1f\x85\xa0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000') @@ -53,6 +55,22 @@ def flush(self): self.__init__() return res +class iter_keep1(object): + def __init__(self, iterator): + self.iterator = iter(iterator) + self.single = None + def __iter__(self): + return self + def __next__(self): + if self.single is not None: + res, self.single = self.single, None + return res + else: + return next(self.iterator) + def putback(self, item): + assert self.single is None + self.single = item + class c_cleaner(object): def __init__(self, outbuf): self.state = ["TOPLEVEL"] @@ -75,91 +93,90 @@ def logical_newline(self): assert self.state[-1] == "IN_BLOCK_COMMENT" elif self.state[-1] == "CPP_DIRECTIVE": self.state = ["TOPLEVEL"] - def process(self, line, start, end): - pos = start - while pos < end: + def process(self, lineiter): + inbuffer = iter_keep1(lineiter) + for char in inbuffer: if self.state[-1] == "TOPLEVEL": - if line[pos] == '\\': + if char == '\\': self.state.append("ESCAPING") - self.outbuf.append_nonspace(line[pos]) - elif line[pos] == '/': + self.outbuf.append_nonspace(char) + elif char == '/': self.state.append("FOUND_SLASH") - elif line[pos] == '"': + elif char == '"': self.state.append("DOUBLE_QUOTATION") - self.outbuf.append_nonspace(line[pos]) - elif line[pos] == '\'': + self.outbuf.append_nonspace(char) + elif char == '\'': self.state.append("SINGLE_QUOTATION") - 
self.outbuf.append_nonspace(line[pos]) - elif line[pos] == '#' and self.outbuf.category() == "BLANK": + self.outbuf.append_nonspace(char) + elif char == '#' and self.outbuf.category() == "BLANK": self.state.append("CPP_DIRECTIVE") - self.outbuf.append_nonspace(line[pos]) + self.outbuf.append_nonspace(char) else: - self.outbuf.append_char(line[pos]) + self.outbuf.append_char(char) elif self.state[-1] == "CPP_DIRECTIVE": - if line[pos] == '\\': + if char == '\\': self.state.append("ESCAPING") - self.outbuf.append_nonspace(line[pos]) - elif line[pos] == '/': + self.outbuf.append_nonspace(char) + elif char == '/': self.state.append("FOUND_SLASH") - elif line[pos] == '"': + elif char == '"': self.state.append("DOUBLE_QUOTATION") - self.outbuf.append_nonspace(line[pos]) - elif line[pos] == '\'': + self.outbuf.append_nonspace(char) + elif char == '\'': self.state.append("SINGLE_QUOTATION") - self.outbuf.append_nonspace(line[pos]) + self.outbuf.append_nonspace(char) else: - self.outbuf.append_char(line[pos]) + self.outbuf.append_char(char) elif self.state[-1] == "DOUBLE_QUOTATION": - if line[pos] == '\\': + if char == '\\': self.state.append("ESCAPING") - self.outbuf.append_nonspace(line[pos]) - elif line[pos] == '"': + self.outbuf.append_nonspace(char) + elif char == '"': self.state.pop() - self.outbuf.append_nonspace(line[pos]) + self.outbuf.append_nonspace(char) else: - self.outbuf.append_nonspace(line[pos]) + self.outbuf.append_nonspace(char) elif self.state[-1] == "SINGLE_QUOTATION": - if line[pos] == '\\': + if char == '\\': self.state.append("ESCAPING") - self.outbuf.append_nonspace(line[pos]) - elif line[pos] == '/': + self.outbuf.append_nonspace(char) + elif char == '/': self.state.append("FOUND_SLASH") - elif line[pos] == '\'': + elif char == '\'': self.state.pop() - self.outbuf.append_nonspace(line[pos]) + self.outbuf.append_nonspace(char) else: - self.outbuf.append_nonspace(line[pos]) + self.outbuf.append_nonspace(char) elif self.state[-1] == "FOUND_SLASH": - 
if line[pos] == '/': + if char == '/': self.state.pop() self.state.append("IN_INLINE_COMMENT") - elif line[pos] == '*': + elif char == '*': self.state.pop() self.state.append("IN_BLOCK_COMMENT") else: self.state.pop() self.outbuf.append_char('/') - pos -= 1 + inbuffer.putback(char) elif self.state[-1] == "IN_BLOCK_COMMENT": - if line[pos] == '*': + if char == '*': self.state.append("IN_BLOCK_COMMENT_FOUND_STAR") elif self.state[-1] == "IN_BLOCK_COMMENT_FOUND_STAR": - if line[pos] == '/': + if char == '/': self.state.pop() assert self.state[-1] == "IN_BLOCK_COMMENT" self.state.pop() self.outbuf.append_space() - elif line[pos] != '*': + elif char != '*': self.state.pop() assert self.state[-1] == "IN_BLOCK_COMMENT" elif self.state[-1] == "ESCAPING": - self.outbuf.append_nonspace(line[pos]) + self.outbuf.append_nonspace(char) self.state.pop() elif self.state[-1] == "IN_INLINE_COMMENT": return else: assert None - pos += 1 def c_file_source(fp): @@ -187,7 +204,7 @@ def c_file_source(fp): end -= 1 else: continued = False - cleaner.process(line, 0, end) + cleaner.process(it.islice(line, 0, end)) if not continued: cleaner.logical_newline() From 5db669f499560e5692b2efd028baa2b401c6f652 Mon Sep 17 00:00:00 2001 From: Jason Sewall Date: Mon, 11 Nov 2019 14:09:26 -0800 Subject: [PATCH 08/49] Add Fortran support --- codebasin/c_source.py | 158 +++++++++++++++++++++++++++++++++++++++++- sloc_translate.py | 14 ++-- 2 files changed, 163 insertions(+), 9 deletions(-) diff --git a/codebasin/c_source.py b/codebasin/c_source.py index e461c72..9b864cb 100644 --- a/codebasin/c_source.py +++ b/codebasin/c_source.py @@ -178,7 +178,108 @@ def process(self, lineiter): else: assert None -def c_file_source(fp): +class fortran_cleaner(object): + def __init__(self, outbuf): + self.state = ["TOPLEVEL"] + self.outbuf = outbuf + self.verify_continue = [] + def dir_check(self, inbuffer): + self.found=['!'] + for char in inbuffer: + if char == '$': + self.found.append('$') + for char in self.found: + 
self.outbuf.append_nonspace(char) + break + elif char.isalpha(): + self.found.append(char) + else: + return + def process(self, lineiter): + inbuffer = iter_keep1(lineiter) + try: + while True: + char = next(inbuffer) + if self.state[-1] == "TOPLEVEL": + if char == '\\': + self.state.append("ESCAPING") + self.outbuf.append_nonspace(char) + elif char == '!': + self.dir_check(inbuffer) + self.state = ["TOPLEVEL"] + break + elif char == '&': + self.verify_continue.append(char) + self.state.append("VERIFY_CONTINUE") + elif char == '"': + self.state.append("DOUBLE_QUOTATION") + self.outbuf.append_nonspace(char) + elif char == '\'': + self.state.append("SINGLE_QUOTATION") + self.outbuf.append_nonspace(char) + else: + self.outbuf.append_char(char) + elif self.state[-1] == 'CONTINUING_FROM_SOL': + if is_whitespace(char): + self.outbuf.append_space() + elif char == '&': + self.state.pop() + elif char == '!': + self.dir_check(inbuffer) + break + else: + self.state.pop() + inbuffer.putback(char) + # should complain if we are quoting here, but will ignore for now + elif self.state[-1] == "DOUBLE_QUOTATION": + if char == '\\': + self.state.append("ESCAPING") + self.outbuf.append_nonspace(char) + elif char == '"': + self.state.pop() + self.outbuf.append_nonspace(char) + elif char == '&': + self.verify_continue.append(char) + self.state.append("VERIFY_CONTINUE") + else: + self.outbuf.append_nonspace(char) + elif self.state[-1] == "SINGLE_QUOTATION": + if char == '\\': + self.state.append("ESCAPING") + self.outbuf.append_nonspace(char) + elif char == '\'': + self.state.pop() + self.outbuf.append_nonspace(char) + elif char == '&': + self.verify_continue.append(char) + self.state.append("VERIFY_CONTINUE") + else: + self.outbuf.append_nonspace(char) + elif self.state[-1] == "ESCAPING": + self.outbuf.append_nonspace(char) + self.state.pop() + elif self.state[-1] == "VERIFY_CONTINUE": + if char == '!' 
and self.state[-2] == "TOPLEVEL": + self.dir_check(inbuffer) + break + elif not is_whitespace(char): + for tmp in self.verify_continue: + self.outbuf.append_nonspace(tmp) + self.verify_continue = [] + self.state.pop() + inbuffer.putback(char) + else: + assert None + except StopIteration: + pass + if self.state[-1] == "CONTINUING_TO_EOL": + self.state[-1] = "CONTINUING_FROM_SOL" + elif self.state[-1] == "VERIFY_CONTINUE": + self.state[-1] = "CONTINUING_FROM_SOL" + #print(self.state) + + +def c_file_source(fp, relaxed=True): current_physical_line = one_space_line() cleaner = c_cleaner(current_physical_line) @@ -232,6 +333,59 @@ def c_file_source(fp): yield ((current_physical_start, physical_line_num+1), local_sloc, current_logical_line.flush(), line_cat) total_sloc += local_sloc - assert cleaner.state == ["TOPLEVEL"] + if not relaxed: + assert cleaner.state == ["TOPLEVEL"] + + return (total_sloc, total_physical_lines) + +def fortran_file_source(fp, relaxed=True): + + current_physical_line = one_space_line() + cleaner = fortran_cleaner(current_physical_line) + + current_logical_line = one_space_line() + + current_physical_start = None + total_sloc = 0 + local_sloc = 0 + + physical_line_num = 0 + c_walker = c_file_source(fp) + try: + while True: + ((src_physical_start, src_physical_end), src_line_sloc, src_line, _) = next(c_walker) + if current_physical_start == None: + current_physical_start = src_physical_start + current_physical_line.__init__() + import pdb +# pdb.set_trace() + cleaner.process(it.islice(src_line, 0, len(src_line))) + + if not current_physical_line.category() == "BLANK": + local_sloc += src_line_sloc + + current_logical_line.join(current_physical_line) + + if cleaner.state[-1] != "CONTINUING_FROM_SOL": + line_cat = current_logical_line.category() + if line_cat != "BLANK": + yield ((current_physical_start, src_physical_end), local_sloc, current_logical_line.flush(), line_cat) + else: + current_logical_line.__init__() + assert local_sloc == 0 + + 
current_physical_start = None + total_sloc += local_sloc + local_sloc = 0 + except StopIteration as stopit: + _, total_physical_lines = stopit.value + + line_cat = current_logical_line.category() + if line_cat != "BLANK": + yield ((current_physical_start, total_physical_lines), local_sloc, current_logical_line.flush(), line_cat) + + total_sloc += local_sloc + if not relaxed: + assert cleaner.state == ["TOPLEVEL"] return (total_sloc, total_physical_lines) diff --git a/sloc_translate.py b/sloc_translate.py index b7b63a7..43a286f 100755 --- a/sloc_translate.py +++ b/sloc_translate.py @@ -2,13 +2,14 @@ # Copyright (C) 2019-2020 Intel Corporation # SPDX-License-Identifier: BSD-3-Clause -from codebasin.c_source import c_file_source +from codebasin.c_source import c_file_source, fortran_file_source +import os import sys import re def file_sloc(path, verbose=False): with open(path, mode='r', errors='replace') as source_file: - walker = c_file_source(source_file) + walker = fortran_file_source(source_file, relaxed=False) try: while True: (interval, sloc, line, line_cat) = next(walker) @@ -30,10 +31,9 @@ def walk_sloc(root, regexp, verbose=False): except FileNotFoundError: pass - if __name__ == '__main__': - filename = sys.argv[1] - (filename, total_sloc, physical_loc) = file_sloc(filename, verbose=True) - print(f"{filename}, {total_sloc}, {physical_loc}") + # filename = sys.argv[1] + # (filename, total_sloc, physical_loc) = file_sloc(filename, verbose=True) + # print(f"{filename}, {total_sloc}, {physical_loc}") -# walk_sloc(sys.argv[1], re.compile(sys.argv[2])) + walk_sloc(sys.argv[1], re.compile(sys.argv[2])) From 06f7b766f4bfb8ff6da0750db7dff39830edac39 Mon Sep 17 00:00:00 2001 From: Jason Sewall Date: Tue, 12 Nov 2019 07:14:35 -0800 Subject: [PATCH 09/49] Add front-ends for language parsers Use guess_language/get_file_source as front-end for fortran/c parsers --- codebasin/c_source.py | 28 ++++++++++++++++++++++++++++ codebasin/file_parser.py | 10 ++++++---- 
sloc_translate.py | 20 +++++++++++++------- 3 files changed, 47 insertions(+), 11 deletions(-) diff --git a/codebasin/c_source.py b/codebasin/c_source.py index 9b864cb..0b9b4e0 100644 --- a/codebasin/c_source.py +++ b/codebasin/c_source.py @@ -5,6 +5,7 @@ """ import itertools as it +from os.path import splitext global whitespace_dict whitespace_dict = dict.fromkeys(' \t\n\r\x0b\x0c\x1c\x1d\x1e\x1f\x85\xa0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000') @@ -389,3 +390,30 @@ def fortran_file_source(fp, relaxed=True): assert cleaner.state == ["TOPLEVEL"] return (total_sloc, total_physical_lines) + + +global extension_map +extension_map = {'.f90' : "FREEFORM FORTRAN", + '.cxx' : "C FAMILY", + '.cl' : "C FAMILY", + '.cu' : "C FAMILY", + '.cpp' : "C FAMILY", + '.c' : "C FAMILY", + '.h' : "C FAMILY", + '.hpp' : "C FAMILY"} + +def guess_language(fname): + _, ext = splitext(fname) + try: + return extension_map[ext.lower()] + except KeyError: + return "Unknown" + +def get_file_source(path): + lang = guess_language(path) + if lang == "FREEFORM FORTRAN": + return fortran_file_source + elif lang == "C FAMILY": + return c_file_source + else: + return None diff --git a/codebasin/file_parser.py b/codebasin/file_parser.py index 61cd085..ea05bd1 100644 --- a/codebasin/file_parser.py +++ b/codebasin/file_parser.py @@ -9,7 +9,7 @@ from . 
import preprocessor # pylint : disable=no-name-in-module -from codebasin.c_source import c_file_source +from codebasin.c_source import get_file_source class LineGroup: """ @@ -68,7 +68,6 @@ def merge(self, line_group): self.end_line = max(self.end_line, line_group.end_line) line_group.reset() - class FileParser: """ Contains methods for parsing an entire source file and returning a @@ -132,6 +131,9 @@ def parse_file(self): """ out_tree = preprocessor.SourceTree(self._filename) + file_source = get_file_source(path) + if not file_source: + raise RuntimeError(f"{path} doesn't appear to be a language this tool can process") with open(self._filename, mode='r', errors='replace') as source_file: previous_continue = False @@ -142,10 +144,10 @@ def parse_file(self): groups['file'].start_line = 1 - c_source = c_file_source(source_file) + source = file_source(source_file) try: while True: - (phys_int, local_sloc, logical_line, line_cat) = next(c_source) + (phys_int, local_sloc, logical_line, line_cat) = next(source) # Only follow continuation for directives if line_cat == 'CPP_DIRECTIVE': # Add this into the directive lines, even if it diff --git a/sloc_translate.py b/sloc_translate.py index 43a286f..6f66c95 100755 --- a/sloc_translate.py +++ b/sloc_translate.py @@ -2,14 +2,17 @@ # Copyright (C) 2019-2020 Intel Corporation # SPDX-License-Identifier: BSD-3-Clause -from codebasin.c_source import c_file_source, fortran_file_source +from codebasin.c_source import get_file_source import os import sys import re def file_sloc(path, verbose=False): + file_source = get_file_source(path) + if not file_source: + raise RuntimeError(f"{path} doesn't appear to be a language this tool can process") with open(path, mode='r', errors='replace') as source_file: - walker = fortran_file_source(source_file, relaxed=False) + walker = file_source(source_file, relaxed=False) try: while True: (interval, sloc, line, line_cat) = next(walker) @@ -32,8 +35,11 @@ def walk_sloc(root, regexp, verbose=False): 
pass if __name__ == '__main__': - # filename = sys.argv[1] - # (filename, total_sloc, physical_loc) = file_sloc(filename, verbose=True) - # print(f"{filename}, {total_sloc}, {physical_loc}") - - walk_sloc(sys.argv[1], re.compile(sys.argv[2])) + if len(sys.argv) == 2: + filename = sys.argv[1] + (filename, total_sloc, physical_loc) = file_sloc(filename, verbose=True) + print(f"{filename}, {total_sloc}, {physical_loc}") + elif len(sys.argv) == 3: + walk_sloc(sys.argv[1], re.compile(sys.argv[2])) + else: + print("Expected either 1 argument (a single file to parse and print) or 2 (a directory root & file pattern)") From d153a6aeba05e30b001174011830d8437c0d9ed6 Mon Sep 17 00:00:00 2001 From: Jason Sewall Date: Tue, 12 Nov 2019 07:15:47 -0800 Subject: [PATCH 10/49] Have C parser only look for directives in fortran --- codebasin/c_source.py | 56 ++++++++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 25 deletions(-) diff --git a/codebasin/c_source.py b/codebasin/c_source.py index 0b9b4e0..acc597d 100644 --- a/codebasin/c_source.py +++ b/codebasin/c_source.py @@ -73,9 +73,10 @@ def putback(self, item): self.single = item class c_cleaner(object): - def __init__(self, outbuf): + def __init__(self, outbuf, directives_only=False): self.state = ["TOPLEVEL"] self.outbuf = outbuf + self.directives_only = directives_only def logical_newline(self): if self.state[-1] == "IN_INLINE_COMMENT": self.state = ["TOPLEVEL"] @@ -98,22 +99,32 @@ def process(self, lineiter): inbuffer = iter_keep1(lineiter) for char in inbuffer: if self.state[-1] == "TOPLEVEL": - if char == '\\': - self.state.append("ESCAPING") - self.outbuf.append_nonspace(char) - elif char == '/': - self.state.append("FOUND_SLASH") - elif char == '"': - self.state.append("DOUBLE_QUOTATION") - self.outbuf.append_nonspace(char) - elif char == '\'': - self.state.append("SINGLE_QUOTATION") - self.outbuf.append_nonspace(char) - elif char == '#' and self.outbuf.category() == "BLANK": - 
self.state.append("CPP_DIRECTIVE") - self.outbuf.append_nonspace(char) + if self.directives_only: + if char == '\\': + self.state.append("ESCAPING") + self.outbuf.append_nonspace(char) + elif char == '#' and self.outbuf.category() == "BLANK": + self.state.append("CPP_DIRECTIVE") + self.outbuf.append_nonspace(char) + else: + self.outbuf.append_char(char) else: - self.outbuf.append_char(char) + if char == '\\': + self.state.append("ESCAPING") + self.outbuf.append_nonspace(char) + elif char == '/': + self.state.append("FOUND_SLASH") + elif char == '"': + self.state.append("DOUBLE_QUOTATION") + self.outbuf.append_nonspace(char) + elif char == '\'': + self.state.append("SINGLE_QUOTATION") + self.outbuf.append_nonspace(char) + elif char == '#' and self.outbuf.category() == "BLANK": + self.state.append("CPP_DIRECTIVE") + self.outbuf.append_nonspace(char) + else: + self.outbuf.append_char(char) elif self.state[-1] == "CPP_DIRECTIVE": if char == '\\': self.state.append("ESCAPING") @@ -277,13 +288,11 @@ def process(self, lineiter): self.state[-1] = "CONTINUING_FROM_SOL" elif self.state[-1] == "VERIFY_CONTINUE": self.state[-1] = "CONTINUING_FROM_SOL" - #print(self.state) - -def c_file_source(fp, relaxed=True): +def c_file_source(fp, relaxed=False, directives_only=False): current_physical_line = one_space_line() - cleaner = c_cleaner(current_physical_line) + cleaner = c_cleaner(current_physical_line, directives_only) current_logical_line = one_space_line() @@ -339,7 +348,7 @@ def c_file_source(fp, relaxed=True): return (total_sloc, total_physical_lines) -def fortran_file_source(fp, relaxed=True): +def fortran_file_source(fp, relaxed=False): current_physical_line = one_space_line() cleaner = fortran_cleaner(current_physical_line) @@ -350,16 +359,13 @@ def fortran_file_source(fp, relaxed=True): total_sloc = 0 local_sloc = 0 - physical_line_num = 0 - c_walker = c_file_source(fp) + c_walker = c_file_source(fp, directives_only=True) try: while True: ((src_physical_start, 
src_physical_end), src_line_sloc, src_line, _) = next(c_walker) if current_physical_start == None: current_physical_start = src_physical_start current_physical_line.__init__() - import pdb -# pdb.set_trace() cleaner.process(it.islice(src_line, 0, len(src_line))) if not current_physical_line.category() == "BLANK": From 768347b1f4e1667575e12576701eda8819709e37 Mon Sep 17 00:00:00 2001 From: Jason Sewall Date: Tue, 12 Nov 2019 07:40:09 -0800 Subject: [PATCH 11/49] Fix spelling bug --- codebasin/file_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/codebasin/file_parser.py b/codebasin/file_parser.py index ea05bd1..c078b77 100644 --- a/codebasin/file_parser.py +++ b/codebasin/file_parser.py @@ -131,7 +131,7 @@ def parse_file(self): """ out_tree = preprocessor.SourceTree(self._filename) - file_source = get_file_source(path) + file_source = get_file_source(self._filename) if not file_source: raise RuntimeError(f"{path} doesn't appear to be a language this tool can process") with open(self._filename, mode='r', errors='replace') as source_file: From 9bc99c4be8800e452460d59240324de995c367ab Mon Sep 17 00:00:00 2001 From: Jason Sewall Date: Tue, 12 Nov 2019 07:41:38 -0800 Subject: [PATCH 12/49] Remove unused code --- codebasin/file_parser.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/codebasin/file_parser.py b/codebasin/file_parser.py index c078b77..9ef3986 100644 --- a/codebasin/file_parser.py +++ b/codebasin/file_parser.py @@ -78,12 +78,6 @@ class FileParser: def __init__(self, _filename): self._filename = _filename - split = splitext(_filename) - if len(split) == 2: - self._file_extension = split[1].lower() - else: - self._file_extension = None - @staticmethod def handle_directive(out_tree, groups, logical_line): """ From a87c862374676c315daccda30d165614abd0e2da Mon Sep 17 00:00:00 2001 From: Jason Sewall Date: Tue, 12 Nov 2019 11:53:33 -0800 Subject: [PATCH 13/49] Pass C directives directly to output This commit forces C directives 
to skip through the Fortran parser. --- codebasin/c_source.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/codebasin/c_source.py b/codebasin/c_source.py index acc597d..f832981 100644 --- a/codebasin/c_source.py +++ b/codebasin/c_source.py @@ -362,9 +362,26 @@ def fortran_file_source(fp, relaxed=False): c_walker = c_file_source(fp, directives_only=True) try: while True: - ((src_physical_start, src_physical_end), src_line_sloc, src_line, _) = next(c_walker) + ((src_physical_start, src_physical_end), src_line_sloc, src_line, c_category) = next(c_walker) + #if it's a cpp directive, flush what we have, then emit the directive and start over if current_physical_start == None: current_physical_start = src_physical_start + + if c_category == "CPP_DIRECTIVE": + line_cat = current_logical_line.category() + if line_cat != "BLANK": + yield ((current_physical_start, src_physical_end), local_sloc, current_logical_line.flush(), line_cat) + else: + current_logical_line.__init__() + assert local_sloc == 0 + + current_physical_start = None + total_sloc += local_sloc + local_sloc = 0 + yield ((src_physical_start, src_physical_end), src_line_sloc, src_line, c_category) + total_sloc += src_line_sloc + continue + current_physical_line.__init__() cleaner.process(it.islice(src_line, 0, len(src_line))) From d410c7fabc1681b026b1170461c31bffad37bb6b Mon Sep 17 00:00:00 2001 From: Jason Sewall Date: Tue, 12 Nov 2019 12:34:05 -0800 Subject: [PATCH 14/49] Add test for Fortran Also fix verify_continue behavior --- codebasin/c_source.py | 5 +++++ tests/comments/fortran.f90 | 34 +++++++++++++++++++++++++++++++++ tests/comments/test_comments.py | 16 ++++++++++++++-- 3 files changed, 53 insertions(+), 2 deletions(-) create mode 100644 tests/comments/fortran.f90 diff --git a/codebasin/c_source.py b/codebasin/c_source.py index f832981..b673a8d 100644 --- a/codebasin/c_source.py +++ b/codebasin/c_source.py @@ -202,6 +202,8 @@ def dir_check(self, inbuffer): 
self.found.append('$') for char in self.found: self.outbuf.append_nonspace(char) + for char in inbuffer: + self.outbuf.append_nonspace(char) break elif char.isalpha(): self.found.append(char) @@ -280,6 +282,8 @@ def process(self, lineiter): self.verify_continue = [] self.state.pop() inbuffer.putback(char) + elif is_whitespace(char): + self.verify_continue.append(char) else: assert None except StopIteration: @@ -287,6 +291,7 @@ def process(self, lineiter): if self.state[-1] == "CONTINUING_TO_EOL": self.state[-1] = "CONTINUING_FROM_SOL" elif self.state[-1] == "VERIFY_CONTINUE": + self.verify_continue = [] self.state[-1] = "CONTINUING_FROM_SOL" def c_file_source(fp, relaxed=False, directives_only=False): diff --git a/tests/comments/fortran.f90 b/tests/comments/fortran.f90 new file mode 100644 index 0000000..0b1006a --- /dev/null +++ b/tests/comments/fortran.f90 @@ -0,0 +1,34 @@ +! Copyright (C) 2019-2020 Intel Corporation +! SPDX-License-Identifier: BSD-3-Clause + +program foo + +#define my_fortran_macro() \ + /*wow a comment*/ \ + a = b - c /* another */ \ + + b !FOO // "neat" /* hey look a c comment*/ + + integer a,b,c + b = b & ! Comments after continuations + ! no comment! + + b + !$ A directive + + write(*,*) "Fortran! /*Has*/ !Unique parsing semantics" + !omp$ a different directive + write(*,*) "& Fortran! has complex ways of dealing with (&) //ampersands&" + !omp5% not a directives + write(*,*) "Fortran! 
\& d \n & + !Can be " + &'quite' complex& + !Mixin +&"//"& + !Mixin + &with quoted continuations" + +my_fortran_macro() + +#if !defined(GPU) /*something*/ + write(*,*) "directives" // "appending" +#endif +end program foo diff --git a/tests/comments/test_comments.py b/tests/comments/test_comments.py index c9608cd..aa97472 100644 --- a/tests/comments/test_comments.py +++ b/tests/comments/test_comments.py @@ -6,9 +6,21 @@ import os from codebasin import preprocessor, file_parser -class TestExampleFile(unittest.TestCase): +class TestExampleFortranFile(unittest.TestCase): """ - Test handling of comments + Test handling of fixed form Fortran + """ + + def test_fortran_comments(self): + rootdir = "./tests/comments/" + parser = file_parser.FileParser(os.path.join(rootdir, "fortran.f90")) + + tree = parser.parse_file() + self.assertEqual(tree.root.total_sloc, 20) + +class TestExampleCFile(unittest.TestCase): + """ + Test handling of C comments """ def test_c_comments(self): From b64cabb560d3dc4c1eb9b2741cf958c579044915 Mon Sep 17 00:00:00 2001 From: Jason Sewall Date: Tue, 12 Nov 2019 13:06:49 -0800 Subject: [PATCH 15/49] Add cuh, cc to C-like extensions --- codebasin/c_source.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/codebasin/c_source.py b/codebasin/c_source.py index b673a8d..14457af 100644 --- a/codebasin/c_source.py +++ b/codebasin/c_source.py @@ -425,6 +425,8 @@ def fortran_file_source(fp, relaxed=False): '.cxx' : "C FAMILY", '.cl' : "C FAMILY", '.cu' : "C FAMILY", + '.cuh' : "C FAMILY", + '.cc' : "C FAMILY", '.cpp' : "C FAMILY", '.c' : "C FAMILY", '.h' : "C FAMILY", From a65aad4e61b9d78383a2db029bce52cb6640b7c3 Mon Sep 17 00:00:00 2001 From: Jason Sewall Date: Tue, 12 Nov 2019 13:07:04 -0800 Subject: [PATCH 16/49] Fix spelling issue in file_parser --- codebasin/file_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/codebasin/file_parser.py b/codebasin/file_parser.py index 9ef3986..2ec5037 100644 --- a/codebasin/file_parser.py +++ 
b/codebasin/file_parser.py @@ -127,7 +127,7 @@ def parse_file(self): out_tree = preprocessor.SourceTree(self._filename) file_source = get_file_source(self._filename) if not file_source: - raise RuntimeError(f"{path} doesn't appear to be a language this tool can process") + raise RuntimeError(f"{self._filename} doesn't appear to be a language this tool can process") with open(self._filename, mode='r', errors='replace') as source_file: previous_continue = False From c6675cd68d340fc133e78b4cf56aa737da21098b Mon Sep 17 00:00:00 2001 From: Jason Sewall Date: Tue, 12 Nov 2019 13:09:43 -0800 Subject: [PATCH 17/49] Rename c_source.py to file_source.py --- codebasin/file_parser.py | 2 +- codebasin/{c_source.py => file_source.py} | 2 +- sloc_translate.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) rename codebasin/{c_source.py => file_source.py} (99%) diff --git a/codebasin/file_parser.py b/codebasin/file_parser.py index 2ec5037..bc6d7f2 100644 --- a/codebasin/file_parser.py +++ b/codebasin/file_parser.py @@ -9,7 +9,7 @@ from . 
import preprocessor # pylint : disable=no-name-in-module -from codebasin.c_source import get_file_source +from codebasin.file_source import get_file_source class LineGroup: """ diff --git a/codebasin/c_source.py b/codebasin/file_source.py similarity index 99% rename from codebasin/c_source.py rename to codebasin/file_source.py index 14457af..4727061 100644 --- a/codebasin/c_source.py +++ b/codebasin/file_source.py @@ -1,7 +1,7 @@ # Copyright (C) 2019 Intel Corporation # SPDX-License-Identifier: BSD-3-Clause """ -Contains classes and functions for stripping comments and whitespace from C/C++ files +Contains classes and functions for stripping comments and whitespace from C/C++ files as well as fixed-form Fortran """ import itertools as it diff --git a/sloc_translate.py b/sloc_translate.py index 6f66c95..b4a5ee0 100755 --- a/sloc_translate.py +++ b/sloc_translate.py @@ -2,7 +2,7 @@ # Copyright (C) 2019-2020 Intel Corporation # SPDX-License-Identifier: BSD-3-Clause -from codebasin.c_source import get_file_source +from codebasin.file_source import get_file_source import os import sys import re From 70dec183c62d59c32b02d54eb116e9ad31591f6b Mon Sep 17 00:00:00 2001 From: Jason Sewall Date: Tue, 12 Nov 2019 13:17:40 -0800 Subject: [PATCH 18/49] Move sloc_translate to etc/ --- sloc_translate.py => etc/sloc_translate.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename sloc_translate.py => etc/sloc_translate.py (100%) diff --git a/sloc_translate.py b/etc/sloc_translate.py similarity index 100% rename from sloc_translate.py rename to etc/sloc_translate.py From ea03e62e65b8d9a6cebcfa3d282bbdf248300ec3 Mon Sep 17 00:00:00 2001 From: Jason Sewall Date: Tue, 12 Nov 2019 13:29:22 -0800 Subject: [PATCH 19/49] Fix imports in etc/sloc_translate --- etc/sloc_translate.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/etc/sloc_translate.py b/etc/sloc_translate.py index b4a5ee0..c26b2c4 100755 --- a/etc/sloc_translate.py +++ b/etc/sloc_translate.py @@ 
-2,9 +2,13 @@ # Copyright (C) 2019-2020 Intel Corporation # SPDX-License-Identifier: BSD-3-Clause -from codebasin.file_source import get_file_source import os import sys + +sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) + +from codebasin.file_source import get_file_source + import re def file_sloc(path, verbose=False): From b50bdc6f96050211d96676dc164e258d78961a7c Mon Sep 17 00:00:00 2001 From: Jason Sewall Date: Tue, 12 Nov 2019 14:32:49 -0800 Subject: [PATCH 20/49] Guess CDS-DPCPP-HPCBench platform breakdown --- etc/guess_info.py | 161 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 161 insertions(+) create mode 100755 etc/guess_info.py diff --git a/etc/guess_info.py b/etc/guess_info.py new file mode 100755 index 0000000..4082f4a --- /dev/null +++ b/etc/guess_info.py @@ -0,0 +1,161 @@ +#!/usr/bin/env python3.6 + +import os +import sys + +sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) + +from sloc_translate import file_sloc +from codebasin.file_source import get_file_source +from codebasin.report import divergence + +import csv +from pathlib import Path +from collections import defaultdict +import re +import itertools as it +import yaml + +def guess_app(inpath): + path = Path(inpath) + if path.parts[0] == 'dlpbenchcuda': + app = 'dlpbench-' + path.parts[1] + elif path.parts[0] == 'dlpbenchopencl': + app = 'dlpbench-' + path.parts[1] + elif path.parts[0] == 'dlpbench': + app = 'dlpbench-' + path.parts[2] + elif path.parts[0] == 'cmedia-bench': + app = None + elif path.parts[0] == 'DNNBench': + app = f"DNNBench-{path.parts[1]}" + else: + app = path.parts[0] + return app + +def matches(path, regexp): + return regexp.search(path) != None + +class plat_guesser(object): + def __init__(self, name, pathwl, extwl): + self.name = name + self.pathwl = pathwl + self.pathbl = [] + self.extwl = extwl + self.extbl = [] + def finalize(self): + if len(self.pathwl) > 0: + all_exts = 
"|".join((f"[^a-z]+{x}|{x}[^a-z]+" for x in (z.replace("+", r"\+") for z in self.pathwl))) + self.pathwl_re = re.compile(f"{all_exts}") + else: + self.pathwl_re = re.compile(r"^\b$") + if len(self.pathbl) > 0: + all_exts = "|".join((f"[^a-z]+{x}|{x}[^a-z]+" for x in (z.replace("+", r"\+") for z in self.pathbl))) + self.pathbl_re = re.compile(f"{all_exts}") + else: + self.pathwl_re = re.compile(r"^\b$") + if len(self.extwl) > 0: + all_exts = "|".join(self.extwl) + self.extwl_re = re.compile(f"(.{all_exts})$") + else: + self.extwl_re = re.compile(r"^\b$") + if len(self.extbl) > 0: + all_exts = "|".join(self.extbl) + self.extbl_re = re.compile(f"(.{all_exts})$") + else: + self.extbl_re = re.compile(r"^\b$") + def score(self, path): + neg, pos = False, False + pos |= matches(path, self.pathwl_re) + neg |= matches(path, self.pathbl_re) + pos |= matches(path, self.extwl_re) + neg |= matches(path, self.extbl_re) + return self.name, (neg, pos) + + +guessers = [plat_guesser("cuda", + ["cuda"], + ["cu"]), + plat_guesser("opencl", + ["opencl", "ocl"], + ["cl"]), + plat_guesser("dpc++", + ["dpc++", "dpcpp", "sycl"], + []), + plat_guesser("openmp", + ["omp", "openmp"], + [])] + +all_pathwl = set() +all_extwl = set() +for g in guessers: + all_pathwl.update(set(g.pathwl)) + all_extwl.update(set(g.extwl)) + +for g in guessers: + g.pathbl = list(all_pathwl.difference(set(g.pathwl))) + g.extbl = list(all_extwl.difference(set(g.extwl))) + g.finalize() + +def guess_platform(inpath): + path = Path(inpath) + return path.parts[1] + +def categorize_file(inpath): + res = {} + path = inpath.lower() + for g in guessers: + name, cat = g.score(path) + res[name] = cat + return res + +def walk_apptree(inroot, regexp): + apps = defaultdict(list) + for root, dirs, files in os.walk(inroot): + for f in files: + full_path = os.path.join(root, f) + if regexp.match(full_path): + app = guess_app(full_path) + if app: + apps[app].append(os.path.relpath(full_path, inroot)) + return apps + +def 
app_groups(files, all_lang=frozenset(['cuda', 'opencl', 'dpc++', 'openmp'])): + platmap = defaultdict(list) + for f in files: + cats = categorize_file(f) + is_in = set() + isnt_in = set() + for k, which in cats.items(): + if which[1]: + is_in.update([k]) + if which[0]: + isnt_in.update([k]) + if len(is_in) == 0: + partial_common = all_lang.difference(isnt_in) + if len(partial_common) > 0: + for p in partial_common: + platmap[p].append(f) + else: + update=is_in.intersection(all_lang) + if len(update) > 0: + for p in update: + platmap[p].append(f) + return platmap + +def write_yaml(output, files): + platmap = app_groups(files) + base = {'codebase' : { 'files' : files, 'platforms' : list(platmap.keys()) }} + for plat_name, plat_files in platmap.items(): + base[plat_name] = plat_files + with open(output, "w") as ofp: + yaml.dump(base, ofp) + +os.chdir("/nfs/home/jsewall/CDS-DPCPP-HPCBench/") +apps = walk_apptree(".", re.compile('(.*\.)(cpp|c|hpp|h|cl|cu|cxx|cc|cuh)$')) + +for app_name, app_files in apps.items(): + + write_yaml(f"{app_name}.yaml", app_files) + print(f"{app_name}.yaml") + +print("done") From fe728a3672b532b50fb41ad0018126fa77aed9a4 Mon Sep 17 00:00:00 2001 From: Jason Sewall Date: Wed, 13 Nov 2019 13:05:56 -0800 Subject: [PATCH 21/49] Add update guess-info to skip certain apps --- etc/guess_info.py | 57 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 44 insertions(+), 13 deletions(-) diff --git a/etc/guess_info.py b/etc/guess_info.py index 4082f4a..03ced5b 100755 --- a/etc/guess_info.py +++ b/etc/guess_info.py @@ -18,15 +18,15 @@ def guess_app(inpath): path = Path(inpath) - if path.parts[0] == 'dlpbenchcuda': + if path.parts[0] == 'dlpbenchcuda' and path.parts[1] != 'utils': app = 'dlpbench-' + path.parts[1] elif path.parts[0] == 'dlpbenchopencl': app = 'dlpbench-' + path.parts[1] - elif path.parts[0] == 'dlpbench': + elif path.parts[0] == 'dlpbench' and path.parts[1] != 'common' and path.parts[1] !='deprecated_workloads' and 
path.parts[1] != 'csa': app = 'dlpbench-' + path.parts[2] - elif path.parts[0] == 'cmedia-bench': + elif path.parts[0] in ['cmedia-bench', "config", "infrastructure", "Test-Infrastructure"]: app = None - elif path.parts[0] == 'DNNBench': + elif path.parts[0] == 'DNNBench' and not path.parts[1] == 'common': app = f"DNNBench-{path.parts[1]}" else: app = path.parts[0] @@ -110,6 +110,7 @@ def categorize_file(inpath): def walk_apptree(inroot, regexp): apps = defaultdict(list) + paths = {} for root, dirs, files in os.walk(inroot): for f in files: full_path = os.path.join(root, f) @@ -134,28 +135,58 @@ def app_groups(files, all_lang=frozenset(['cuda', 'opencl', 'dpc++', 'openmp'])) partial_common = all_lang.difference(isnt_in) if len(partial_common) > 0: for p in partial_common: - platmap[p].append(f) + platmap[p].append(Path(f)) else: update=is_in.intersection(all_lang) if len(update) > 0: for p in update: - platmap[p].append(f) + platmap[p].append(Path(f)) return platmap -def write_yaml(output, files): - platmap = app_groups(files) - base = {'codebase' : { 'files' : files, 'platforms' : list(platmap.keys()) }} - for plat_name, plat_files in platmap.items(): - base[plat_name] = plat_files +def write_yaml(output, files, langs_names_map, strip_prefix=Path(".")): + + platmap = app_groups(files, frozenset(langs_names_map.values())) + all_files = set() + for plat, pfiles in platmap.items(): + all_files.update([str(f.relative_to(strip_prefix)) for f in pfiles]) + if len(all_files) == 0: + return False + base = {'codebase' : { 'files' : list(all_files) }} + plats = set() + for export_name, plat_name in langs_names_map.items(): + plat_files = [str(f.relative_to(strip_prefix)) for f in platmap[plat_name]] + if len(plat_files) > 0: + base[export_name] = {'files': plat_files} + plats.update([export_name]) + elif len(langs_names_map) < 4: #Hack + return False + base['codebase']['platforms'] = list(plats) with open(output, "w") as ofp: yaml.dump(base, ofp) + return True 
os.chdir("/nfs/home/jsewall/CDS-DPCPP-HPCBench/") apps = walk_apptree(".", re.compile('(.*\.)(cpp|c|hpp|h|cl|cu|cxx|cc|cuh)$')) +#os.chdir("/nfs/home/jsewall/CDS-DPCPP-HPCBench/configs") for app_name, app_files in apps.items(): - write_yaml(f"{app_name}.yaml", app_files) - print(f"{app_name}.yaml") + prefixed= [f"./{p}" for p in app_files] + app_path = Path(os.path.commonpath(prefixed)) + if app_path.is_file(): + app_path = app_path.parent + + outpath = app_path / "cbi-configs" + try: + os.makedirs(outpath) + except FileExistsError: + pass + for suffix, config in [("all", dict(zip(*it.repeat(['cuda', 'opencl', 'dpc++', 'openmp'],2)))), + ("dpcpp", {'dpc++-gpu' : 'dpc++', 'dpc++-cpu' : 'dpc++'}), + ("ducttape", {'gpu' : 'cuda', 'cpu' : 'openmp'})]: + outfile = outpath / f"{app_name}-{suffix}.yaml" + write = write_yaml(outfile, app_files, config, strip_prefix=app_path) + if write: + print(outfile) print("done") From 7cf8bb5ec1c7495759beaa9e725947774dfa27ed Mon Sep 17 00:00:00 2001 From: Jason Sewall Date: Wed, 13 Nov 2019 13:12:26 -0800 Subject: [PATCH 22/49] Print out root and config file from codebasin.py --- codebasin.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/codebasin.py b/codebasin.py index 17e6bd9..97f64c9 100755 --- a/codebasin.py +++ b/codebasin.py @@ -98,6 +98,8 @@ def guess_project_name(config_path): output_prefix = os.path.realpath(guess_project_name(args.config_file)) + print(f"Config file: {args.config_file}") + print(f"Root: {rootdir}") # Print summary report if report_enabled("summary"): summary = report.summary(setmap) From 7a380e9b914b4c9384565d89360a560f56f5249a Mon Sep 17 00:00:00 2001 From: Douglas Jacobsen Date: Fri, 8 Nov 2019 12:29:12 -0800 Subject: [PATCH 23/49] Add a language class This commit adds a language class that can help determine what language a file uses. 
--- codebasin/file_parser.py | 1 + codebasin/language.py | 42 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+) create mode 100644 codebasin/language.py diff --git a/codebasin/file_parser.py b/codebasin/file_parser.py index bc6d7f2..e3ab479 100644 --- a/codebasin/file_parser.py +++ b/codebasin/file_parser.py @@ -8,6 +8,7 @@ from os.path import splitext from . import preprocessor # pylint : disable=no-name-in-module +from . import language from codebasin.file_source import get_file_source diff --git a/codebasin/language.py b/codebasin/language.py new file mode 100644 index 0000000..bd8abb1 --- /dev/null +++ b/codebasin/language.py @@ -0,0 +1,42 @@ +# Copyright (C) 2019 Intel Corporation +# SPDX-License-Identifier: BSD-3-Clause +""" +Contains classes and functions related to language detection +and providing information about the language to other parts of +code base investigator +""" + +import os +import logging + +log = logging.getLogger(__name__) + + +class FileLanguage: + """ + Represents the language and modifiers for a given filename + """ + + _supported_languages = ['fortran-free', 'fortran-fixed', 'c', 'c++'] + + _language_extensions = {} + _language_extensions['fortran-free'] = ['.f90', '.F90'] + _language_extensions['fortran-fixed'] = ['.f', '.ftn', '.fpp', '.F', '.FOR', '.FTN', '.FPP'] + _language_extensions['c'] = ['.c', '.h'] + _language_extensions['c++'] = ['.c++', '.cxx', '.cpp', '.cc', + '.hpp', '.hxx', '.h++', '.hh', + '.inc', '.inl', '.tcc', '.icc', + '.ipp'] + + def __init__(self, filename): + self._filename = filename + self._extension = os.path.splitext(self._filename)[1] + self._language = 'None' + + for lang in self._supported_languages: + if self._extension in self._language_extensions[lang]: + self._language = lang + break + + def get_language(self): + return self._language From 026aaf88b80934059c75395eb4a4a89c83ea4208 Mon Sep 17 00:00:00 2001 From: Douglas Jacobsen Date: Wed, 13 Nov 2019 12:52:37 -0800 Subject: 
[PATCH 24/49] Update file types --- codebasin/language.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/codebasin/language.py b/codebasin/language.py index bd8abb1..7e6dddf 100644 --- a/codebasin/language.py +++ b/codebasin/language.py @@ -26,7 +26,7 @@ class FileLanguage: _language_extensions['c++'] = ['.c++', '.cxx', '.cpp', '.cc', '.hpp', '.hxx', '.h++', '.hh', '.inc', '.inl', '.tcc', '.icc', - '.ipp'] + '.ipp', '.cu', '.cuh', '.cl'] def __init__(self, filename): self._filename = filename From ce6338d8b8cd7fa63b0bc1afd5eb9116809f8d00 Mon Sep 17 00:00:00 2001 From: Jason Sewall Date: Wed, 13 Nov 2019 13:32:50 -0800 Subject: [PATCH 25/49] Use new language identifier --- codebasin/file_source.py | 29 +++++------------------------ 1 file changed, 5 insertions(+), 24 deletions(-) diff --git a/codebasin/file_source.py b/codebasin/file_source.py index 4727061..8c1b7af 100644 --- a/codebasin/file_source.py +++ b/codebasin/file_source.py @@ -6,6 +6,7 @@ import itertools as it from os.path import splitext +from .language import FileLanguage global whitespace_dict whitespace_dict = dict.fromkeys(' \t\n\r\x0b\x0c\x1c\x1d\x1e\x1f\x85\xa0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000') @@ -419,31 +420,11 @@ def fortran_file_source(fp, relaxed=False): return (total_sloc, total_physical_lines) - -global extension_map -extension_map = {'.f90' : "FREEFORM FORTRAN", - '.cxx' : "C FAMILY", - '.cl' : "C FAMILY", - '.cu' : "C FAMILY", - '.cuh' : "C FAMILY", - '.cc' : "C FAMILY", - '.cpp' : "C FAMILY", - '.c' : "C FAMILY", - '.h' : "C FAMILY", - '.hpp' : "C FAMILY"} - -def guess_language(fname): - _, ext = splitext(fname) - try: - return extension_map[ext.lower()] - except KeyError: - return "Unknown" - def get_file_source(path): - lang = guess_language(path) - if lang == "FREEFORM FORTRAN": + lang = FileLanguage(path) + if lang.get_language() == "fortran-free": return fortran_file_source - elif lang == "C 
FAMILY": + elif lang.get_language() in ["c", "c++"]: return c_file_source else: - return None + raise RuntimeError(f"Language {lang.get_language()} in file {path} is unsupported by code base investigator") From 029dd0e391e3cd657d760560e88730645e2a6db2 Mon Sep 17 00:00:00 2001 From: Jason Sewall Date: Wed, 20 Nov 2019 12:39:54 -0800 Subject: [PATCH 26/49] Remove object parent types --- codebasin/file_source.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/codebasin/file_source.py b/codebasin/file_source.py index 8c1b7af..7d4f9a0 100644 --- a/codebasin/file_source.py +++ b/codebasin/file_source.py @@ -14,7 +14,7 @@ def is_whitespace(c): return c in whitespace_dict -class one_space_line(object): +class one_space_line: def __init__(self): self.parts = [] self.trailing_space = False @@ -57,7 +57,7 @@ def flush(self): self.__init__() return res -class iter_keep1(object): +class iter_keep1: def __init__(self, iterator): self.iterator = iter(iterator) self.single = None @@ -73,7 +73,7 @@ def putback(self, item): assert self.single is None self.single = item -class c_cleaner(object): +class c_cleaner: def __init__(self, outbuf, directives_only=False): self.state = ["TOPLEVEL"] self.outbuf = outbuf @@ -191,7 +191,7 @@ def process(self, lineiter): else: assert None -class fortran_cleaner(object): +class fortran_cleaner: def __init__(self, outbuf): self.state = ["TOPLEVEL"] self.outbuf = outbuf From 4c6d045a324b8cca3f34bc51defe6b9a8ac73c1b Mon Sep 17 00:00:00 2001 From: Jason Sewall Date: Thu, 21 Nov 2019 13:36:53 -0800 Subject: [PATCH 27/49] Add comments and cleanup according to pylint --- codebasin/file_source.py | 123 +++++++++++++++++++++++++++++++++------ 1 file changed, 106 insertions(+), 17 deletions(-) diff --git a/codebasin/file_source.py b/codebasin/file_source.py index 7d4f9a0..59d0555 100644 --- a/codebasin/file_source.py +++ b/codebasin/file_source.py @@ -1,24 +1,40 @@ # Copyright (C) 2019 Intel Corporation # SPDX-License-Identifier: 
BSD-3-Clause """ -Contains classes and functions for stripping comments and whitespace from C/C++ files as well as fixed-form Fortran +Contains classes and functions for stripping comments and whitespace from +C/C++ files as well as fixed-form Fortran """ import itertools as it -from os.path import splitext from .language import FileLanguage -global whitespace_dict -whitespace_dict = dict.fromkeys(' \t\n\r\x0b\x0c\x1c\x1d\x1e\x1f\x85\xa0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000') +### This string was created by looking at all unicode code points +### and checking to see if they are considered whitespace +### ('\s') by the re module +whitespace_dict = dict.fromkeys(''.join([' \t\n\r\x0b\x0c\x1c\x1d\x1e', + '\x1f\x85\xa0\u1680\u2000\u2001', + '\u2002\u2003\u2004\u2005\u2006', + '\u2007\u2008\u2009\u200a\u2028', + '\u2029\u202f\u205f\u3000'])) def is_whitespace(c): + """Returns true if the character c is whitespace""" + global whitespace_dict return c in whitespace_dict class one_space_line: + """ + A container that represents a single line of code while (generally) + merging all whitespace into a single space. + """ def __init__(self): self.parts = [] self.trailing_space = False def append_char(self, c): + """ + Append a character of no particular class to the line. + Whitespace will be dropped if the line already ends in space. + """ if not is_whitespace(c): self.parts.append(c) self.trailing_space = False @@ -27,6 +43,9 @@ def append_char(self, c): self.parts.append(' ') self.trailing_space = True def append_space(self): + """ + Append whitespace to line, unless line already ends in a space. + """ if not self.trailing_space: self.parts.append(' ') self.trailing_space = True @@ -34,30 +53,46 @@ def append_nonspace(self, c): self.parts.append(c) self.trailing_space = False def join(self, other): - if len(other.parts) > 0: + """ + Append another one_space_line to this one, respecting whitespace rules. 
+ """ + if other.parts: if other.parts[0] == ' ' and self.trailing_space: self.parts += other.parts[1:] else: self.parts += other.parts[:] self.trailing_space = other.trailing_space def category(self): + """ + Report the a category for this line: + * SRC_NONBLANK if it is non-empty/non-whitespace line of code. + * BLANK if it is empty or only whitespace. + * CPP_DIRECTIVE it is is a C preprocessor directive. + """ res = "SRC_NONBLANK" - if len(self.parts) == 0: + if not self.parts: res = "BLANK" elif len(self.parts) == 1: if self.parts[0] == ' ': res = "BLANK" elif self.parts[0] == '#': res = "CPP_DIRECTIVE" - elif ( self.parts[0] == ' ' and self.parts[1] == '#' ) or self.parts[0] == '#': + elif self.parts[:2] == ' #' or self.parts[0] == '#': res = "CPP_DIRECTIVE" return res def flush(self): - res= ''.join(self.parts) + """ + Convert the characters to a string and reset the buffer. + """ + res = ''.join(self.parts) self.__init__() return res class iter_keep1: + """ + An iterator wrapper that allows a single item to be 'put back' + and picked up for the next iteration. + """ def __init__(self, iterator): self.iterator = iter(iterator) self.single = None @@ -70,15 +105,32 @@ def __next__(self): else: return next(self.iterator) def putback(self, item): + """ + Put item into the iterator such that it will be the next + yielded item. + """ assert self.single is None self.single = item class c_cleaner: + """ + Approximation of the early stages of a C preprocessor. + Joins line continuations, merges whitespace, and replaces comments + with whitespace. State is kept across physical lines and cleared with + logical_newline. + """ def __init__(self, outbuf, directives_only=False): + """ + directives_only has the cleaner only operate on directive lines. + """ self.state = ["TOPLEVEL"] self.outbuf = outbuf self.directives_only = directives_only def logical_newline(self): + """ + Reset state when a logical newline is found. + That is, when a newline without continuation. 
+ """ if self.state[-1] == "IN_INLINE_COMMENT": self.state = ["TOPLEVEL"] self.outbuf.append_space() @@ -90,13 +142,16 @@ def logical_newline(self): self.state = ["TOPLEVEL"] elif self.state[-1] == "DOUBLE_QUOTATION": # This probably should give a warning - self.state == ["TOPLEVEL"] + self.state = ["TOPLEVEL"] elif self.state[-1] == "IN_BLOCK_COMMENT_FOUND_STAR": self.state.pop() assert self.state[-1] == "IN_BLOCK_COMMENT" elif self.state[-1] == "CPP_DIRECTIVE": self.state = ["TOPLEVEL"] def process(self, lineiter): + """ + Add contents of lineiter to outbuf, stripping as directed. + """ inbuffer = iter_keep1(lineiter) for char in inbuffer: if self.state[-1] == "TOPLEVEL": @@ -192,25 +247,38 @@ def process(self, lineiter): assert None class fortran_cleaner: + """ + 'Cleans' source to remove comments and blanks while preserving + directives and handling strings and continuations properly. + Expects to have c defines already processed. + """ def __init__(self, outbuf): self.state = ["TOPLEVEL"] self.outbuf = outbuf self.verify_continue = [] def dir_check(self, inbuffer): - self.found=['!'] + """ + Inspect comment to see if it is in fact, a valid directive, + which should be preserved. + """ + found = ['!'] for char in inbuffer: if char == '$': - self.found.append('$') - for char in self.found: - self.outbuf.append_nonspace(char) - for char in inbuffer: - self.outbuf.append_nonspace(char) + found.append('$') + for c in found: + self.outbuf.append_nonspace(c) + for c in inbuffer: + self.outbuf.append_nonspace(c) break elif char.isalpha(): - self.found.append(char) + found.append(char) else: return def process(self, lineiter): + """ + Add contents of lineiter to current line, removing contents and + handling continuations. 
+ """ inbuffer = iter_keep1(lineiter) try: while True: @@ -296,6 +364,15 @@ def process(self, lineiter): self.state[-1] = "CONTINUING_FROM_SOL" def c_file_source(fp, relaxed=False, directives_only=False): + """ + Process file fp in terms of logical (sloc) and physical lines of C code. + Yield blocks of logical lines of code with physical extents. + Return total lines at exit. + Relaxed allows for inconsistent state at the end of parsing, usefule for + special composition cases. + directives_only sets up parser to only process directive lines such that + the output can be fed to another file source (i.e. Fortran). + """ current_physical_line = one_space_line() cleaner = c_cleaner(current_physical_line, directives_only) @@ -355,6 +432,14 @@ def c_file_source(fp, relaxed=False, directives_only=False): return (total_sloc, total_physical_lines) def fortran_file_source(fp, relaxed=False): + """ + Process file fp in terms of logical (sloc) and physical lines of + fixed-form Fortran code. + Yield blocks of logical lines of code with physical extents. + Return total lines at exit. + Relaxed allows for inconsistent state at the end of parsing, usefule for + special composition cases. + """ current_physical_line = one_space_line() cleaner = fortran_cleaner(current_physical_line) @@ -370,7 +455,7 @@ def fortran_file_source(fp, relaxed=False): while True: ((src_physical_start, src_physical_end), src_line_sloc, src_line, c_category) = next(c_walker) #if it's a cpp directive, flush what we have, then emit the directive and start over - if current_physical_start == None: + if current_physical_start is None: current_physical_start = src_physical_start if c_category == "CPP_DIRECTIVE": @@ -421,6 +506,10 @@ def fortran_file_source(fp, relaxed=False): return (total_sloc, total_physical_lines) def get_file_source(path): + """ + Return a C or Fortran line source for path depending on + the language we can detect, or fail. 
+ """ lang = FileLanguage(path) if lang.get_language() == "fortran-free": return fortran_file_source From 3fa388680fce3e3c3130cd53c1dcb7980fff3838 Mon Sep 17 00:00:00 2001 From: Jason Sewall Date: Fri, 22 Nov 2019 09:12:26 -0800 Subject: [PATCH 28/49] Add line info and use it in c_file_source --- codebasin/file_source.py | 84 ++++++++++++++++++++++++++++------------ 1 file changed, 59 insertions(+), 25 deletions(-) diff --git a/codebasin/file_source.py b/codebasin/file_source.py index 59d0555..ed78bdb 100644 --- a/codebasin/file_source.py +++ b/codebasin/file_source.py @@ -363,6 +363,50 @@ def process(self, lineiter): self.verify_continue = [] self.state[-1] = "CONTINUING_FROM_SOL" +class line_info: + """ + Reprsents a logical line of code. + """ + def __init__(self): + self.current_logical_line = one_space_line() + self.current_physical_start = 1 + self.current_physical_end = None + self.local_sloc = 0 + self.category = None + self.flushed_line = None + def join(self, other_line): + """ + Combine this logical line with another one. + """ + self.current_logical_line.join(other_line) + def physical_nonblank(self): + """ + Mark nonblank link in this logical like. + """ + self.local_sloc += 1 + def physical_update(self, physical_line_num): + """ + Mark end of new physical line. + """ + self.current_physical_end = physical_line_num + 1 + self.category = self.current_logical_line.category() + self.flushed_line = self.current_logical_line.flush() + def physical_reset(self): + """ + Prepare for next logical block. Return counted sloc. + """ + self.current_physical_start = self.current_physical_end + local_sloc_copy = self.local_sloc + self.local_sloc = 0 + self.flushed_line = None + return local_sloc_copy + def logical_result(self): + """ + Return tuple of contents. Eventually should just return this class. 
+ """ + return ((self.current_physical_start, self.current_physical_end), + self.local_sloc, self.flushed_line, self.category) + def c_file_source(fp, relaxed=False, directives_only=False): """ Process file fp in terms of logical (sloc) and physical lines of C code. @@ -377,11 +421,9 @@ def c_file_source(fp, relaxed=False, directives_only=False): current_physical_line = one_space_line() cleaner = c_cleaner(current_physical_line, directives_only) - current_logical_line = one_space_line() + curr_line = line_info() - current_physical_start = 1 total_sloc = 0 - local_sloc = 0 physical_line_num = 0 for (physical_line_num, line) in enumerate(fp, start=1): @@ -389,43 +431,35 @@ def c_file_source(fp, relaxed=False, directives_only=False): end = len(line) if line[-1] == '\n': end -= 1 - else: - if end > 0 and line[end-1] == '\\': - raise RuntimeError("file seems to end in \\ with no newline!") + elif end > 0 and line[end-1] == '\\': + raise RuntimeError("file seems to end in \\ with no newline!") - if end > 0 and line[end-1] == '\\': - continued = True + continued = end > 0 and line[end-1] == '\\' + if continued: end -= 1 - else: - continued = False cleaner.process(it.islice(line, 0, end)) if not continued: cleaner.logical_newline() if not current_physical_line.category() == "BLANK": - local_sloc += 1 + curr_line.physical_nonblank() - current_logical_line.join(current_physical_line) + curr_line.join(current_physical_line) if not continued: - line_cat = current_logical_line.category() - if line_cat != "BLANK": - yield ((current_physical_start, physical_line_num+1), local_sloc, current_logical_line.flush(), line_cat) - else: - current_logical_line.__init__() - assert local_sloc == 0 + curr_line.physical_update(physical_line_num+1) + if curr_line.category != "BLANK": + yield curr_line.logical_result() - current_physical_start = physical_line_num + 1 - total_sloc += local_sloc - local_sloc = 0 + total_sloc += curr_line.physical_reset() total_physical_lines = physical_line_num - 
line_cat = current_logical_line.category() - if line_cat != "BLANK": - yield ((current_physical_start, physical_line_num+1), local_sloc, current_logical_line.flush(), line_cat) + curr_line.physical_update(physical_line_num+1) + if curr_line.category != "BLANK": + yield curr_line.logical_result() - total_sloc += local_sloc + total_sloc += curr_line.physical_reset() if not relaxed: assert cleaner.state == ["TOPLEVEL"] From f942c287e69878dbdd61398959b5489373bdb1f3 Mon Sep 17 00:00:00 2001 From: Jason Sewall Date: Fri, 22 Nov 2019 13:12:21 -0800 Subject: [PATCH 29/49] Clean up file_source --- codebasin/file_parser.py | 12 ++++--- codebasin/file_source.py | 69 +++++++++++++++++++--------------------- etc/sloc_translate.py | 4 +-- 3 files changed, 42 insertions(+), 43 deletions(-) diff --git a/codebasin/file_parser.py b/codebasin/file_parser.py index e3ab479..54d2a47 100644 --- a/codebasin/file_parser.py +++ b/codebasin/file_parser.py @@ -142,18 +142,20 @@ def parse_file(self): source = file_source(source_file) try: while True: - (phys_int, local_sloc, logical_line, line_cat) = next(source) + logical_line = next(source) + phys_int = (logical_line.current_physical_start, logical_line.current_physical_end) # Only follow continuation for directives - if line_cat == 'CPP_DIRECTIVE': + if logical_line.category == 'CPP_DIRECTIVE': # Add this into the directive lines, even if it # might not be a directive we count - groups['directive'].add_line(phys_int, local_sloc) - FileParser.handle_directive(out_tree, groups, logical_line) + groups['directive'].add_line(phys_int, logical_line.local_sloc) + + FileParser.handle_directive(out_tree, groups, logical_line.flushed_line) # FallBack is that this line is a simple code line. 
else: - groups['code'].add_line(phys_int, local_sloc) + groups['code'].add_line(phys_int, logical_line.local_sloc) except StopIteration as it: total_sloc, physical_loc = it.value diff --git a/codebasin/file_source.py b/codebasin/file_source.py index ed78bdb..1ff56eb 100644 --- a/codebasin/file_source.py +++ b/codebasin/file_source.py @@ -379,16 +379,16 @@ def join(self, other_line): Combine this logical line with another one. """ self.current_logical_line.join(other_line) - def physical_nonblank(self): + def physical_nonblank(self, n): """ Mark nonblank link in this logical like. """ - self.local_sloc += 1 + self.local_sloc += n def physical_update(self, physical_line_num): """ Mark end of new physical line. """ - self.current_physical_end = physical_line_num + 1 + self.current_physical_end = physical_line_num self.category = self.current_logical_line.category() self.flushed_line = self.current_logical_line.flush() def physical_reset(self): @@ -400,6 +400,8 @@ def physical_reset(self): self.local_sloc = 0 self.flushed_line = None return local_sloc_copy + def phys_interval(self): + return (self.current_physical_start, self.current_physical_end) def logical_result(self): """ Return tuple of contents. Eventually should just return this class. 
@@ -442,14 +444,14 @@ def c_file_source(fp, relaxed=False, directives_only=False): cleaner.logical_newline() if not current_physical_line.category() == "BLANK": - curr_line.physical_nonblank() + curr_line.physical_nonblank(1) curr_line.join(current_physical_line) if not continued: curr_line.physical_update(physical_line_num+1) if curr_line.category != "BLANK": - yield curr_line.logical_result() + yield curr_line total_sloc += curr_line.physical_reset() @@ -457,7 +459,7 @@ def c_file_source(fp, relaxed=False, directives_only=False): curr_line.physical_update(physical_line_num+1) if curr_line.category != "BLANK": - yield curr_line.logical_result() + yield curr_line total_sloc += curr_line.physical_reset() if not relaxed: @@ -478,62 +480,57 @@ def fortran_file_source(fp, relaxed=False): current_physical_line = one_space_line() cleaner = fortran_cleaner(current_physical_line) - current_logical_line = one_space_line() + curr_line = line_info() current_physical_start = None total_sloc = 0 - local_sloc = 0 c_walker = c_file_source(fp, directives_only=True) try: while True: - ((src_physical_start, src_physical_end), src_line_sloc, src_line, c_category) = next(c_walker) + src_c_line = next(c_walker) + #((src_physical_start, src_physical_end), src_line_sloc, src_line, c_category) #if it's a cpp directive, flush what we have, then emit the directive and start over if current_physical_start is None: - current_physical_start = src_physical_start + current_physical_start = curr_line.current_physical_start - if c_category == "CPP_DIRECTIVE": - line_cat = current_logical_line.category() - if line_cat != "BLANK": - yield ((current_physical_start, src_physical_end), local_sloc, current_logical_line.flush(), line_cat) - else: - current_logical_line.__init__() - assert local_sloc == 0 + if src_c_line.category == "CPP_DIRECTIVE": + curr_line.physical_update(src_c_line.current_physical_end) + if curr_line.category != "BLANK": + yield curr_line current_physical_start = None - total_sloc 
+= local_sloc - local_sloc = 0 - yield ((src_physical_start, src_physical_end), src_line_sloc, src_line, c_category) - total_sloc += src_line_sloc + total_sloc += curr_line.physical_reset() + yield src_c_line + total_sloc += src_c_line.local_sloc continue current_physical_line.__init__() - cleaner.process(it.islice(src_line, 0, len(src_line))) + cleaner.process(it.islice(src_c_line.flushed_line, 0, len(src_c_line.flushed_line))) if not current_physical_line.category() == "BLANK": - local_sloc += src_line_sloc + curr_line.physical_nonblank(src_c_line.local_sloc) - current_logical_line.join(current_physical_line) + curr_line.join(current_physical_line) if cleaner.state[-1] != "CONTINUING_FROM_SOL": - line_cat = current_logical_line.category() - if line_cat != "BLANK": - yield ((current_physical_start, src_physical_end), local_sloc, current_logical_line.flush(), line_cat) - else: - current_logical_line.__init__() - assert local_sloc == 0 + curr_line.current_physical_start = current_physical_start + curr_line.physical_update(src_c_line.current_physical_end) + if curr_line.category != "BLANK": + yield curr_line current_physical_start = None - total_sloc += local_sloc - local_sloc = 0 + total_sloc += curr_line.physical_reset() + except StopIteration as stopit: _, total_physical_lines = stopit.value - line_cat = current_logical_line.category() - if line_cat != "BLANK": - yield ((current_physical_start, total_physical_lines), local_sloc, current_logical_line.flush(), line_cat) + curr_line.physical_update(total_physical_lines) + if not curr_line.category == "BLANK": + curr_line.current_physical_start = current_physical_start + yield curr_line - total_sloc += local_sloc + total_sloc += curr_line.physical_reset() if not relaxed: assert cleaner.state == ["TOPLEVEL"] diff --git a/etc/sloc_translate.py b/etc/sloc_translate.py index c26b2c4..b925f72 100755 --- a/etc/sloc_translate.py +++ b/etc/sloc_translate.py @@ -19,9 +19,9 @@ def file_sloc(path, verbose=False): walker = 
file_source(source_file, relaxed=False) try: while True: - (interval, sloc, line, line_cat) = next(walker) + logical_line = next(walker) if verbose: - print(f"{path} [{interval[0]}, {interval[1]}) ({sloc}): {line} {line_cat}") + print(f"{path} [{logical_line.current_physical_start}, {logical_line.current_physical_end}) ({logical_line.local_sloc}): {logical_line.flushed_line} {logical_line.category}") except StopIteration as it: total_sloc, physical_loc = it.value From 4a5e61cb418c74a1787d358cd32a2baa0c7d64d8 Mon Sep 17 00:00:00 2001 From: Jason Sewall Date: Fri, 22 Nov 2019 13:59:03 -0800 Subject: [PATCH 30/49] Clean up guess-info --- codebasin/file_parser.py | 14 +-- codebasin/file_source.py | 3 +- etc/guess_info.py | 192 --------------------------------------- etc/sloc_translate.py | 50 ++++++---- 4 files changed, 41 insertions(+), 218 deletions(-) delete mode 100755 etc/guess_info.py diff --git a/codebasin/file_parser.py b/codebasin/file_parser.py index 54d2a47..52f9f5d 100644 --- a/codebasin/file_parser.py +++ b/codebasin/file_parser.py @@ -5,12 +5,8 @@ and building a tree of nodes from it. """ -from os.path import splitext - -from . import preprocessor # pylint : disable=no-name-in-module -from . import language - from codebasin.file_source import get_file_source +from . 
import preprocessor # pylint : disable=no-name-in-module class LineGroup: """ @@ -128,9 +124,9 @@ def parse_file(self): out_tree = preprocessor.SourceTree(self._filename) file_source = get_file_source(self._filename) if not file_source: - raise RuntimeError(f"{self._filename} doesn't appear to be a language this tool can process") + raise RuntimeError(f"{self._filename} doesn't appear " + + "to be a language this tool can process") with open(self._filename, mode='r', errors='replace') as source_file: - previous_continue = False groups = {'code': LineGroup(), 'directive': LineGroup(), @@ -143,7 +139,7 @@ def parse_file(self): try: while True: logical_line = next(source) - phys_int = (logical_line.current_physical_start, logical_line.current_physical_end) + phys_int = logical_line.phys_interval() # Only follow continuation for directives if logical_line.category == 'CPP_DIRECTIVE': # Add this into the directive lines, even if it @@ -157,7 +153,7 @@ def parse_file(self): else: groups['code'].add_line(phys_int, logical_line.local_sloc) except StopIteration as it: - total_sloc, physical_loc = it.value + _, physical_loc = it.value if not groups['code'].empty(): groups['code'].add_line((groups['code'].start_line, physical_loc-1), 0) diff --git a/codebasin/file_source.py b/codebasin/file_source.py index 1ff56eb..310b7d9 100644 --- a/codebasin/file_source.py +++ b/codebasin/file_source.py @@ -547,4 +547,5 @@ def get_file_source(path): elif lang.get_language() in ["c", "c++"]: return c_file_source else: - raise RuntimeError(f"Language {lang.get_language()} in file {path} is unsupported by code base investigator") + raise RuntimeError(f"Language {lang.get_language()} in file " + + f"{path} is unsupported by code base investigator") diff --git a/etc/guess_info.py b/etc/guess_info.py deleted file mode 100755 index 03ced5b..0000000 --- a/etc/guess_info.py +++ /dev/null @@ -1,192 +0,0 @@ -#!/usr/bin/env python3.6 - -import os -import sys - 
-sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) - -from sloc_translate import file_sloc -from codebasin.file_source import get_file_source -from codebasin.report import divergence - -import csv -from pathlib import Path -from collections import defaultdict -import re -import itertools as it -import yaml - -def guess_app(inpath): - path = Path(inpath) - if path.parts[0] == 'dlpbenchcuda' and path.parts[1] != 'utils': - app = 'dlpbench-' + path.parts[1] - elif path.parts[0] == 'dlpbenchopencl': - app = 'dlpbench-' + path.parts[1] - elif path.parts[0] == 'dlpbench' and path.parts[1] != 'common' and path.parts[1] !='deprecated_workloads' and path.parts[1] != 'csa': - app = 'dlpbench-' + path.parts[2] - elif path.parts[0] in ['cmedia-bench', "config", "infrastructure", "Test-Infrastructure"]: - app = None - elif path.parts[0] == 'DNNBench' and not path.parts[1] == 'common': - app = f"DNNBench-{path.parts[1]}" - else: - app = path.parts[0] - return app - -def matches(path, regexp): - return regexp.search(path) != None - -class plat_guesser(object): - def __init__(self, name, pathwl, extwl): - self.name = name - self.pathwl = pathwl - self.pathbl = [] - self.extwl = extwl - self.extbl = [] - def finalize(self): - if len(self.pathwl) > 0: - all_exts = "|".join((f"[^a-z]+{x}|{x}[^a-z]+" for x in (z.replace("+", r"\+") for z in self.pathwl))) - self.pathwl_re = re.compile(f"{all_exts}") - else: - self.pathwl_re = re.compile(r"^\b$") - if len(self.pathbl) > 0: - all_exts = "|".join((f"[^a-z]+{x}|{x}[^a-z]+" for x in (z.replace("+", r"\+") for z in self.pathbl))) - self.pathbl_re = re.compile(f"{all_exts}") - else: - self.pathwl_re = re.compile(r"^\b$") - if len(self.extwl) > 0: - all_exts = "|".join(self.extwl) - self.extwl_re = re.compile(f"(.{all_exts})$") - else: - self.extwl_re = re.compile(r"^\b$") - if len(self.extbl) > 0: - all_exts = "|".join(self.extbl) - self.extbl_re = re.compile(f"(.{all_exts})$") - else: - self.extbl_re = 
re.compile(r"^\b$") - def score(self, path): - neg, pos = False, False - pos |= matches(path, self.pathwl_re) - neg |= matches(path, self.pathbl_re) - pos |= matches(path, self.extwl_re) - neg |= matches(path, self.extbl_re) - return self.name, (neg, pos) - - -guessers = [plat_guesser("cuda", - ["cuda"], - ["cu"]), - plat_guesser("opencl", - ["opencl", "ocl"], - ["cl"]), - plat_guesser("dpc++", - ["dpc++", "dpcpp", "sycl"], - []), - plat_guesser("openmp", - ["omp", "openmp"], - [])] - -all_pathwl = set() -all_extwl = set() -for g in guessers: - all_pathwl.update(set(g.pathwl)) - all_extwl.update(set(g.extwl)) - -for g in guessers: - g.pathbl = list(all_pathwl.difference(set(g.pathwl))) - g.extbl = list(all_extwl.difference(set(g.extwl))) - g.finalize() - -def guess_platform(inpath): - path = Path(inpath) - return path.parts[1] - -def categorize_file(inpath): - res = {} - path = inpath.lower() - for g in guessers: - name, cat = g.score(path) - res[name] = cat - return res - -def walk_apptree(inroot, regexp): - apps = defaultdict(list) - paths = {} - for root, dirs, files in os.walk(inroot): - for f in files: - full_path = os.path.join(root, f) - if regexp.match(full_path): - app = guess_app(full_path) - if app: - apps[app].append(os.path.relpath(full_path, inroot)) - return apps - -def app_groups(files, all_lang=frozenset(['cuda', 'opencl', 'dpc++', 'openmp'])): - platmap = defaultdict(list) - for f in files: - cats = categorize_file(f) - is_in = set() - isnt_in = set() - for k, which in cats.items(): - if which[1]: - is_in.update([k]) - if which[0]: - isnt_in.update([k]) - if len(is_in) == 0: - partial_common = all_lang.difference(isnt_in) - if len(partial_common) > 0: - for p in partial_common: - platmap[p].append(Path(f)) - else: - update=is_in.intersection(all_lang) - if len(update) > 0: - for p in update: - platmap[p].append(Path(f)) - return platmap - -def write_yaml(output, files, langs_names_map, strip_prefix=Path(".")): - - platmap = app_groups(files, 
frozenset(langs_names_map.values())) - all_files = set() - for plat, pfiles in platmap.items(): - all_files.update([str(f.relative_to(strip_prefix)) for f in pfiles]) - if len(all_files) == 0: - return False - base = {'codebase' : { 'files' : list(all_files) }} - plats = set() - for export_name, plat_name in langs_names_map.items(): - plat_files = [str(f.relative_to(strip_prefix)) for f in platmap[plat_name]] - if len(plat_files) > 0: - base[export_name] = {'files': plat_files} - plats.update([export_name]) - elif len(langs_names_map) < 4: #Hack - return False - base['codebase']['platforms'] = list(plats) - with open(output, "w") as ofp: - yaml.dump(base, ofp) - return True - -os.chdir("/nfs/home/jsewall/CDS-DPCPP-HPCBench/") -apps = walk_apptree(".", re.compile('(.*\.)(cpp|c|hpp|h|cl|cu|cxx|cc|cuh)$')) - -#os.chdir("/nfs/home/jsewall/CDS-DPCPP-HPCBench/configs") -for app_name, app_files in apps.items(): - - prefixed= [f"./{p}" for p in app_files] - app_path = Path(os.path.commonpath(prefixed)) - if app_path.is_file(): - app_path = app_path.parent - - outpath = app_path / "cbi-configs" - try: - os.makedirs(outpath) - except FileExistsError: - pass - for suffix, config in [("all", dict(zip(*it.repeat(['cuda', 'opencl', 'dpc++', 'openmp'],2)))), - ("dpcpp", {'dpc++-gpu' : 'dpc++', 'dpc++-cpu' : 'dpc++'}), - ("ducttape", {'gpu' : 'cuda', 'cpu' : 'openmp'})]: - outfile = outpath / f"{app_name}-{suffix}.yaml" - write = write_yaml(outfile, app_files, config, strip_prefix=app_path) - if write: - print(outfile) - -print("done") diff --git a/etc/sloc_translate.py b/etc/sloc_translate.py index b925f72..946b9f9 100755 --- a/etc/sloc_translate.py +++ b/etc/sloc_translate.py @@ -1,17 +1,23 @@ #!/usr/bin/env python3.6 # Copyright (C) 2019-2020 Intel Corporation # SPDX-License-Identifier: BSD-3-Clause +""" +Parse source file, reporting sloc and physical lines. +Can optionally print logical line regions and cleaned lines. 
+""" import os import sys +import re sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) from codebasin.file_source import get_file_source -import re - def file_sloc(path, verbose=False): + """ + Process file in path, reporting total_sloc/loc. Optionally print logical regions. + """ file_source = get_file_source(path) if not file_source: raise RuntimeError(f"{path} doesn't appear to be a language this tool can process") @@ -21,29 +27,41 @@ def file_sloc(path, verbose=False): while True: logical_line = next(walker) if verbose: - print(f"{path} [{logical_line.current_physical_start}, {logical_line.current_physical_end}) ({logical_line.local_sloc}): {logical_line.flushed_line} {logical_line.category}") + print(f"{path} [{logical_line.current_physical_start}," + + f" {logical_line.current_physical_end}) ({logical_line.local_sloc}):" + f" {logical_line.flushed_line} {logical_line.category}") except StopIteration as it: total_sloc, physical_loc = it.value return (path, total_sloc, physical_loc) -def walk_sloc(root, regexp, verbose=False): - for root, dirs, files in os.walk(root): - for f in files: - full_path = os.path.join(root, f) +def walk_sloc(in_root, regexp, verbose=False): + """ + Run file_sloc on each file that matches regexp under root path. + """ + for root, _, files in os.walk(in_root): + for current_file in files: + full_path = os.path.join(root, current_file) if regexp.match(full_path): try: - (filename, total_sloc, physical_loc) = file_sloc(full_path) - print(f"{filename}, {total_sloc}, {physical_loc}") + (filename, total_sloc, physical_loc) = file_sloc(full_path) + if verbose: + print(f"{filename}, {total_sloc}, {physical_loc}") except FileNotFoundError: pass -if __name__ == '__main__': - if len(sys.argv) == 2: - filename = sys.argv[1] - (filename, total_sloc, physical_loc) = file_sloc(filename, verbose=True) +def sloc_translate(args): + """ + Toplevel routine for script. 
+ """ + if len(args) == 2: + (filename, total_sloc, physical_loc) = file_sloc(args[1], verbose=True) print(f"{filename}, {total_sloc}, {physical_loc}") - elif len(sys.argv) == 3: - walk_sloc(sys.argv[1], re.compile(sys.argv[2])) + elif len(args) == 3: + walk_sloc(args[1], re.compile(args[2])) else: - print("Expected either 1 argument (a single file to parse and print) or 2 (a directory root & file pattern)") + print("Expected either 1 argument (a single file to parse" + + " and print) or 2 (a directory root & file pattern)") + +if __name__ == '__main__': + sloc_translate(sys.argv) From 9d30371a4301e3077e2d63147ec8421536793849 Mon Sep 17 00:00:00 2001 From: Jason Sewall Date: Fri, 22 Nov 2019 14:09:16 -0800 Subject: [PATCH 31/49] Work around pylint errors --- codebasin/file_parser.py | 1 + codebasin/file_source.py | 4 ++++ etc/sloc_translate.py | 2 ++ 3 files changed, 7 insertions(+) diff --git a/codebasin/file_parser.py b/codebasin/file_parser.py index 52f9f5d..f088d63 100644 --- a/codebasin/file_parser.py +++ b/codebasin/file_parser.py @@ -153,6 +153,7 @@ def parse_file(self): else: groups['code'].add_line(phys_int, logical_line.local_sloc) except StopIteration as it: + # pylint: disable=unpacking-non-sequence _, physical_loc = it.value if not groups['code'].empty(): diff --git a/codebasin/file_source.py b/codebasin/file_source.py index 310b7d9..2e677a8 100644 --- a/codebasin/file_source.py +++ b/codebasin/file_source.py @@ -19,6 +19,7 @@ def is_whitespace(c): """Returns true if the character c is whitespace""" + # pylint: disable=global-statement global whitespace_dict return c in whitespace_dict @@ -152,6 +153,7 @@ def process(self, lineiter): """ Add contents of lineiter to outbuf, stripping as directed. 
""" + # pylint: disable=too-many-branches,too-many-statements inbuffer = iter_keep1(lineiter) for char in inbuffer: if self.state[-1] == "TOPLEVEL": @@ -279,6 +281,7 @@ def process(self, lineiter): Add contents of lineiter to current line, removing contents and handling continuations. """ + # pylint: disable=too-many-branches,too-many-statements inbuffer = iter_keep1(lineiter) try: while True: @@ -523,6 +526,7 @@ def fortran_file_source(fp, relaxed=False): total_sloc += curr_line.physical_reset() except StopIteration as stopit: + # pylint: disable=unpacking-non-sequence _, total_physical_lines = stopit.value curr_line.physical_update(total_physical_lines) diff --git a/etc/sloc_translate.py b/etc/sloc_translate.py index 946b9f9..c25fbd2 100755 --- a/etc/sloc_translate.py +++ b/etc/sloc_translate.py @@ -12,6 +12,7 @@ sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) +# pylint: disable=wrong-import-position from codebasin.file_source import get_file_source def file_sloc(path, verbose=False): @@ -31,6 +32,7 @@ def file_sloc(path, verbose=False): f" {logical_line.current_physical_end}) ({logical_line.local_sloc}):" f" {logical_line.flushed_line} {logical_line.category}") except StopIteration as it: + # pylint: disable=unpacking-non-sequence total_sloc, physical_loc = it.value return (path, total_sloc, physical_loc) From 6f207cb7c2ff3496448019ccf789bc6a9c67ea74 Mon Sep 17 00:00:00 2001 From: Jason Sewall Date: Fri, 22 Nov 2019 14:11:21 -0800 Subject: [PATCH 32/49] Autopep8 --- codebasin/file_parser.py | 9 ++++-- codebasin/file_source.py | 49 +++++++++++++++++++++++++++------ etc/sloc_translate.py | 4 +++ setup.py | 2 +- tests/comments/test_comments.py | 3 ++ 5 files changed, 55 insertions(+), 12 deletions(-) diff --git a/codebasin/file_parser.py b/codebasin/file_parser.py index f088d63..152ce3d 100644 --- a/codebasin/file_parser.py +++ b/codebasin/file_parser.py @@ -8,6 +8,7 @@ from codebasin.file_source import get_file_source from . 
import preprocessor # pylint : disable=no-name-in-module + class LineGroup: """ Represents a grouping of lines. It contains the extent, and the @@ -37,10 +38,11 @@ def add_line(self, phys_int, sloc_count): if self.start_line == -1 or phys_int[0] < self.start_line: self.start_line = phys_int[0] - if phys_int[1]-1 > self.end_line: - self.end_line = phys_int[1]-1 + if phys_int[1] - 1 > self.end_line: + self.end_line = phys_int[1] - 1 self.line_count += sloc_count + def reset(self): """ Reset the countable group @@ -65,6 +67,7 @@ def merge(self, line_group): self.end_line = max(self.end_line, line_group.end_line) line_group.reset() + class FileParser: """ Contains methods for parsing an entire source file and returning a @@ -157,7 +160,7 @@ def parse_file(self): _, physical_loc = it.value if not groups['code'].empty(): - groups['code'].add_line((groups['code'].start_line, physical_loc-1), 0) + groups['code'].add_line((groups['code'].start_line, physical_loc - 1), 0) self.insert_code_node(out_tree, groups['code']) groups['file'].merge(groups['code']) diff --git a/codebasin/file_source.py b/codebasin/file_source.py index 2e677a8..4138125 100644 --- a/codebasin/file_source.py +++ b/codebasin/file_source.py @@ -8,29 +8,33 @@ import itertools as it from .language import FileLanguage -### This string was created by looking at all unicode code points -### and checking to see if they are considered whitespace -### ('\s') by the re module +# This string was created by looking at all unicode code points +# and checking to see if they are considered whitespace +# ('\s') by the re module whitespace_dict = dict.fromkeys(''.join([' \t\n\r\x0b\x0c\x1c\x1d\x1e', '\x1f\x85\xa0\u1680\u2000\u2001', '\u2002\u2003\u2004\u2005\u2006', '\u2007\u2008\u2009\u200a\u2028', '\u2029\u202f\u205f\u3000'])) + def is_whitespace(c): """Returns true if the character c is whitespace""" # pylint: disable=global-statement global whitespace_dict return c in whitespace_dict + class one_space_line: """ A 
container that represents a single line of code while (generally) merging all whitespace into a single space. """ + def __init__(self): self.parts = [] self.trailing_space = False + def append_char(self, c): """ Append a character of no particular class to the line. @@ -43,6 +47,7 @@ def append_char(self, c): if not self.trailing_space: self.parts.append(' ') self.trailing_space = True + def append_space(self): """ Append whitespace to line, unless line already ends in a space. @@ -50,9 +55,11 @@ def append_space(self): if not self.trailing_space: self.parts.append(' ') self.trailing_space = True + def append_nonspace(self, c): self.parts.append(c) self.trailing_space = False + def join(self, other): """ Append another one_space_line to this one, respecting whitespace rules. @@ -63,6 +70,7 @@ def join(self, other): else: self.parts += other.parts[:] self.trailing_space = other.trailing_space + def category(self): """ Report the a category for this line: @@ -81,6 +89,7 @@ def category(self): elif self.parts[:2] == ' #' or self.parts[0] == '#': res = "CPP_DIRECTIVE" return res + def flush(self): """ Convert the characters to a string and reset the buffer. @@ -89,22 +98,27 @@ def flush(self): self.__init__() return res + class iter_keep1: """ An iterator wrapper that allows a single item to be 'put back' and picked up for the next iteration. """ + def __init__(self, iterator): self.iterator = iter(iterator) self.single = None + def __iter__(self): return self + def __next__(self): if self.single is not None: res, self.single = self.single, None return res else: return next(self.iterator) + def putback(self, item): """ Put item into the iterator such that it will be the next @@ -113,6 +127,7 @@ def putback(self, item): assert self.single is None self.single = item + class c_cleaner: """ Approximation of the early stages of a C preprocessor. @@ -120,6 +135,7 @@ class c_cleaner: with whitespace. State is kept across physical lines and cleared with logical_newline. 
""" + def __init__(self, outbuf, directives_only=False): """ directives_only has the cleaner only operate on directive lines. @@ -127,6 +143,7 @@ def __init__(self, outbuf, directives_only=False): self.state = ["TOPLEVEL"] self.outbuf = outbuf self.directives_only = directives_only + def logical_newline(self): """ Reset state when a logical newline is found. @@ -149,6 +166,7 @@ def logical_newline(self): assert self.state[-1] == "IN_BLOCK_COMMENT" elif self.state[-1] == "CPP_DIRECTIVE": self.state = ["TOPLEVEL"] + def process(self, lineiter): """ Add contents of lineiter to outbuf, stripping as directed. @@ -248,16 +266,19 @@ def process(self, lineiter): else: assert None + class fortran_cleaner: """ 'Cleans' source to remove comments and blanks while preserving directives and handling strings and continuations properly. Expects to have c defines already processed. """ + def __init__(self, outbuf): self.state = ["TOPLEVEL"] self.outbuf = outbuf self.verify_continue = [] + def dir_check(self, inbuffer): """ Inspect comment to see if it is in fact, a valid directive, @@ -276,6 +297,7 @@ def dir_check(self, inbuffer): found.append(char) else: return + def process(self, lineiter): """ Add contents of lineiter to current line, removing contents and @@ -366,10 +388,12 @@ def process(self, lineiter): self.verify_continue = [] self.state[-1] = "CONTINUING_FROM_SOL" + class line_info: """ Reprsents a logical line of code. """ + def __init__(self): self.current_logical_line = one_space_line() self.current_physical_start = 1 @@ -377,16 +401,19 @@ def __init__(self): self.local_sloc = 0 self.category = None self.flushed_line = None + def join(self, other_line): """ Combine this logical line with another one. """ self.current_logical_line.join(other_line) + def physical_nonblank(self, n): """ Mark nonblank link in this logical like. """ self.local_sloc += n + def physical_update(self, physical_line_num): """ Mark end of new physical line. 
@@ -394,6 +421,7 @@ def physical_update(self, physical_line_num): self.current_physical_end = physical_line_num self.category = self.current_logical_line.category() self.flushed_line = self.current_logical_line.flush() + def physical_reset(self): """ Prepare for next logical block. Return counted sloc. @@ -403,8 +431,10 @@ def physical_reset(self): self.local_sloc = 0 self.flushed_line = None return local_sloc_copy + def phys_interval(self): return (self.current_physical_start, self.current_physical_end) + def logical_result(self): """ Return tuple of contents. Eventually should just return this class. @@ -412,6 +442,7 @@ def logical_result(self): return ((self.current_physical_start, self.current_physical_end), self.local_sloc, self.flushed_line, self.category) + def c_file_source(fp, relaxed=False, directives_only=False): """ Process file fp in terms of logical (sloc) and physical lines of C code. @@ -436,10 +467,10 @@ def c_file_source(fp, relaxed=False, directives_only=False): end = len(line) if line[-1] == '\n': end -= 1 - elif end > 0 and line[end-1] == '\\': + elif end > 0 and line[end - 1] == '\\': raise RuntimeError("file seems to end in \\ with no newline!") - continued = end > 0 and line[end-1] == '\\' + continued = end > 0 and line[end - 1] == '\\' if continued: end -= 1 cleaner.process(it.islice(line, 0, end)) @@ -452,7 +483,7 @@ def c_file_source(fp, relaxed=False, directives_only=False): curr_line.join(current_physical_line) if not continued: - curr_line.physical_update(physical_line_num+1) + curr_line.physical_update(physical_line_num + 1) if curr_line.category != "BLANK": yield curr_line @@ -460,7 +491,7 @@ def c_file_source(fp, relaxed=False, directives_only=False): total_physical_lines = physical_line_num - curr_line.physical_update(physical_line_num+1) + curr_line.physical_update(physical_line_num + 1) if curr_line.category != "BLANK": yield curr_line @@ -470,6 +501,7 @@ def c_file_source(fp, relaxed=False, directives_only=False): return 
(total_sloc, total_physical_lines) + def fortran_file_source(fp, relaxed=False): """ Process file fp in terms of logical (sloc) and physical lines of @@ -493,7 +525,7 @@ def fortran_file_source(fp, relaxed=False): while True: src_c_line = next(c_walker) #((src_physical_start, src_physical_end), src_line_sloc, src_line, c_category) - #if it's a cpp directive, flush what we have, then emit the directive and start over + # if it's a cpp directive, flush what we have, then emit the directive and start over if current_physical_start is None: current_physical_start = curr_line.current_physical_start @@ -540,6 +572,7 @@ def fortran_file_source(fp, relaxed=False): return (total_sloc, total_physical_lines) + def get_file_source(path): """ Return a C or Fortran line source for path depending on diff --git a/etc/sloc_translate.py b/etc/sloc_translate.py index c25fbd2..59c9076 100755 --- a/etc/sloc_translate.py +++ b/etc/sloc_translate.py @@ -15,6 +15,7 @@ # pylint: disable=wrong-import-position from codebasin.file_source import get_file_source + def file_sloc(path, verbose=False): """ Process file in path, reporting total_sloc/loc. Optionally print logical regions. @@ -37,6 +38,7 @@ def file_sloc(path, verbose=False): return (path, total_sloc, physical_loc) + def walk_sloc(in_root, regexp, verbose=False): """ Run file_sloc on each file that matches regexp under root path. @@ -52,6 +54,7 @@ def walk_sloc(in_root, regexp, verbose=False): except FileNotFoundError: pass + def sloc_translate(args): """ Toplevel routine for script. 
@@ -65,5 +68,6 @@ def sloc_translate(args): print("Expected either 1 argument (a single file to parse" + " and print) or 2 (a directory root & file pattern)") + if __name__ == '__main__': sloc_translate(sys.argv) diff --git a/setup.py b/setup.py index 7c0c758..2fdbaf1 100644 --- a/setup.py +++ b/setup.py @@ -23,4 +23,4 @@ 'matplotlib', 'pyyaml', 'scipy'] -) + ) diff --git a/tests/comments/test_comments.py b/tests/comments/test_comments.py index aa97472..ab672e6 100644 --- a/tests/comments/test_comments.py +++ b/tests/comments/test_comments.py @@ -6,6 +6,7 @@ import os from codebasin import preprocessor, file_parser + class TestExampleFortranFile(unittest.TestCase): """ Test handling of fixed form Fortran @@ -18,6 +19,7 @@ def test_fortran_comments(self): tree = parser.parse_file() self.assertEqual(tree.root.total_sloc, 20) + class TestExampleCFile(unittest.TestCase): """ Test handling of C comments @@ -30,5 +32,6 @@ def test_c_comments(self): tree = parser.parse_file() self.assertEqual(tree.root.total_sloc, 25) + if __name__ == '__main__': unittest.main() From 357719a76116bb99f6ead6f82f6a2d0bf9034f26 Mon Sep 17 00:00:00 2001 From: Jason Sewall Date: Tue, 3 Dec 2019 09:50:13 -0800 Subject: [PATCH 33/49] Toggle report title printouts --- codebasin.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/codebasin.py b/codebasin.py index 97f64c9..6579339 100755 --- a/codebasin.py +++ b/codebasin.py @@ -98,8 +98,10 @@ def guess_project_name(config_path): output_prefix = os.path.realpath(guess_project_name(args.config_file)) - print(f"Config file: {args.config_file}") - print(f"Root: {rootdir}") + if report_enabled("summary") or report_enabled("clustering"): + print(f"Config file: {args.config_file}") + print(f"Root: {rootdir}") + # Print summary report if report_enabled("summary"): summary = report.summary(setmap) From bf611fddbe49e2a6263cfbc10e64a8da76ffdff5 Mon Sep 17 00:00:00 2001 From: Jason Sewall Date: Tue, 3 Dec 2019 09:50:53 -0800 Subject: 
[PATCH 34/49] Add verbose flag to sloc_translate --- etc/sloc_translate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etc/sloc_translate.py b/etc/sloc_translate.py index 59c9076..799052b 100755 --- a/etc/sloc_translate.py +++ b/etc/sloc_translate.py @@ -63,7 +63,7 @@ def sloc_translate(args): (filename, total_sloc, physical_loc) = file_sloc(args[1], verbose=True) print(f"{filename}, {total_sloc}, {physical_loc}") elif len(args) == 3: - walk_sloc(args[1], re.compile(args[2])) + walk_sloc(args[1], re.compile(args[2]), verbose=True) else: print("Expected either 1 argument (a single file to parse" + " and print) or 2 (a directory root & file pattern)") From 197d8e39f29977940bf5b781a3347d9672227544 Mon Sep 17 00:00:00 2001 From: Jason Sewall Date: Tue, 3 Dec 2019 11:14:48 -0800 Subject: [PATCH 35/49] Fixed form -> Freeform Fortran --- tests/comments/test_comments.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/comments/test_comments.py b/tests/comments/test_comments.py index ab672e6..f63d784 100644 --- a/tests/comments/test_comments.py +++ b/tests/comments/test_comments.py @@ -9,7 +9,7 @@ class TestExampleFortranFile(unittest.TestCase): """ - Test handling of fixed form Fortran + Test handling of freeform Fortran """ def test_fortran_comments(self): From 631a3fbed63c18b96b1962064a1418f44225a777 Mon Sep 17 00:00:00 2001 From: Jason Sewall Date: Tue, 3 Dec 2019 11:16:17 -0800 Subject: [PATCH 36/49] Get rid of regexps in sloc_translate --- etc/sloc_translate.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/etc/sloc_translate.py b/etc/sloc_translate.py index 799052b..72d7ca2 100755 --- a/etc/sloc_translate.py +++ b/etc/sloc_translate.py @@ -8,7 +8,7 @@ import os import sys -import re +from pathlib import Path sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) @@ -39,14 +39,14 @@ def file_sloc(path, verbose=False): return (path, total_sloc, physical_loc) -def 
walk_sloc(in_root, regexp, verbose=False): +def walk_sloc(in_root, extensions, verbose=False): """ Run file_sloc on each file that matches regexp under root path. """ for root, _, files in os.walk(in_root): for current_file in files: full_path = os.path.join(root, current_file) - if regexp.match(full_path): + if Path(full_path).suffix in extensions: try: (filename, total_sloc, physical_loc) = file_sloc(full_path) if verbose: @@ -63,7 +63,8 @@ def sloc_translate(args): (filename, total_sloc, physical_loc) = file_sloc(args[1], verbose=True) print(f"{filename}, {total_sloc}, {physical_loc}") elif len(args) == 3: - walk_sloc(args[1], re.compile(args[2]), verbose=True) + cleaned = [f".{x}" for x in args[2].split(',')] + walk_sloc(args[1], cleaned, verbose=True) else: print("Expected either 1 argument (a single file to parse" + " and print) or 2 (a directory root & file pattern)") From 77cf05f9cd05fb2067176068e0c1581f891bd0d8 Mon Sep 17 00:00:00 2001 From: Jason Sewall Date: Wed, 4 Dec 2019 11:42:06 -0800 Subject: [PATCH 37/49] Add batchmode flag to guard extra report info --- codebasin.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/codebasin.py b/codebasin.py index 6579339..b2fa223 100755 --- a/codebasin.py +++ b/codebasin.py @@ -72,6 +72,8 @@ def guess_project_name(config_path): parser.add_argument('-R', '--report', dest='reports', metavar='REPORT', default=['all'], choices=['all', 'summary', 'clustering'], nargs='+', help='desired output reports (default: all)') + parser.add_argument('--batchmode', dest='batchmode', action='store_true', default=False, + help="Set batch mode (additional output for bulk operation.)") args = parser.parse_args() stdout_log = logging.StreamHandler(sys.stdout) @@ -98,7 +100,7 @@ def guess_project_name(config_path): output_prefix = os.path.realpath(guess_project_name(args.config_file)) - if report_enabled("summary") or report_enabled("clustering"): + if args.batchmode and (report_enabled("summary") or 
report_enabled("clustering")): print(f"Config file: {args.config_file}") print(f"Root: {rootdir}") From faf9a204ae80a6e5424ba53ca13d059452a3504a Mon Sep 17 00:00:00 2001 From: Jason Sewall Date: Thu, 19 Mar 2020 06:57:26 -0700 Subject: [PATCH 38/49] Add default configfile in rootdir detection --- codebasin.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/codebasin.py b/codebasin.py index b2fa223..d914760 100755 --- a/codebasin.py +++ b/codebasin.py @@ -9,7 +9,7 @@ optional arguments: -h, --help show this help message and exit -c FILE, --config FILE - configuration file (default: config.yaml) + configuration file (default: /config.yaml) -v, --verbose verbosity level -q, --quiet quiet level -r DIR, --rootdir DIR @@ -61,7 +61,7 @@ def guess_project_name(config_path): # Read command-line arguments parser = argparse.ArgumentParser(description="Code Base Investigator v" + str(version)) parser.add_argument('-c', '--config', dest='config_file', metavar='FILE', action='store', - default='config.yaml', help='configuration file (default: config.yaml)') + help='configuration file (default: /config.yaml)') parser.add_argument('-v', '--verbose', dest='verbose', action='count', default=0, help='increase verbosity level') parser.add_argument('-q', '--quiet', dest='quiet', @@ -83,12 +83,16 @@ def guess_project_name(config_path): max(1, logging.WARNING - 10 * (args.verbose - args.quiet))) rootdir = os.path.realpath(args.rootdir) + if args.config_file == None: + config_file = os.path.join(rootdir, "config.yaml") + else: + config_file = args.config_file # Load the configuration file into a dict - if not util.ensure_yaml(args.config_file): + if not util.ensure_yaml(config_file): logging.getLogger("codebasin").error( "Configuration file does not have YAML file extension.") sys.exit(1) - codebase, configuration = config.load(args.config_file, rootdir) + codebase, configuration = config.load(config_file, rootdir) # Parse the source tree, and
determine source line associations. # The trees and associations are housed in state. @@ -98,10 +102,10 @@ def guess_project_name(config_path): platform_mapper = walkers.PlatformMapper(codebase) setmap = platform_mapper.walk(state) - output_prefix = os.path.realpath(guess_project_name(args.config_file)) + output_prefix = os.path.realpath(guess_project_name(config_file)) if args.batchmode and (report_enabled("summary") or report_enabled("clustering")): - print(f"Config file: {args.config_file}") + print(f"Config file: {config_file}") print(f"Root: {rootdir}") # Print summary report From b724e01bddbc1374e869326055dd3b550a62e6fc Mon Sep 17 00:00:00 2001 From: Jason Sewall Date: Mon, 28 Oct 2019 08:00:14 -0700 Subject: [PATCH 39/49] Fix typo in comment --- codebasin/preprocessor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/codebasin/preprocessor.py b/codebasin/preprocessor.py index 517ffac..15f19e9 100644 --- a/codebasin/preprocessor.py +++ b/codebasin/preprocessor.py @@ -2,7 +2,7 @@ # SPDX-License-Identifier: BSD-3-Clause # pylint: disable=too-many-lines """ -Dontains classes that define: +Contains classes that define: - Nodes from the tree - Tokens from lexing a line of code - Operators to handle tokens From 750a1edcdf29d24ed3f17de87aba39dcdc880384 Mon Sep 17 00:00:00 2001 From: Jason Sewall Date: Thu, 19 Mar 2020 08:54:20 -0700 Subject: [PATCH 40/49] Move rootdir options up above config options This makes the reference to the root directory more clear.
--- codebasin.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/codebasin.py b/codebasin.py index d914760..074eade 100755 --- a/codebasin.py +++ b/codebasin.py @@ -60,15 +60,15 @@ def guess_project_name(config_path): # Read command-line arguments parser = argparse.ArgumentParser(description="Code Base Investigator v" + str(version)) + parser.add_argument('-r', '--rootdir', dest="rootdir", metavar='DIR', + default=os.getcwd(), type=str, + help="Set working root directory (default .)") parser.add_argument('-c', '--config', dest='config_file', metavar='FILE', action='store', help='configuration file (default: /config.yaml)') parser.add_argument('-v', '--verbose', dest='verbose', action='count', default=0, help='increase verbosity level') parser.add_argument('-q', '--quiet', dest='quiet', action='count', default=0, help='decrease verbosity level') - parser.add_argument('-r', '--rootdir', dest="rootdir", metavar='DIR', - default=os.getcwd(), type=str, - help="Set working root directory (default .)") parser.add_argument('-R', '--report', dest='reports', metavar='REPORT', default=['all'], choices=['all', 'summary', 'clustering'], nargs='+', help='desired output reports (default: all)') From 807c8f71fcf50ddc009d4d1ff05e11db2f855c38 Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Fri, 29 May 2020 13:51:59 -0700 Subject: [PATCH 41/49] Apply autopep8 and pylint fixes --- codebasin.py | 2 +- codebasin/file_parser.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/codebasin.py b/codebasin.py index 074eade..9e6a6eb 100755 --- a/codebasin.py +++ b/codebasin.py @@ -83,7 +83,7 @@ def guess_project_name(config_path): max(1, logging.WARNING - 10 * (args.verbose - args.quiet))) rootdir = os.path.realpath(args.rootdir) - if args.config_file == None: + if args.config_file is None: config_file = os.path.join(rootdir, "config.yaml") else: config_file = args.config_file diff --git a/codebasin/file_parser.py b/codebasin/file_parser.py index 
152ce3d..86dfb8f 100644 --- a/codebasin/file_parser.py +++ b/codebasin/file_parser.py @@ -133,8 +133,7 @@ def parse_file(self): groups = {'code': LineGroup(), 'directive': LineGroup(), - 'file': LineGroup() - } + 'file': LineGroup()} groups['file'].start_line = 1 From d0848c9d954928134a5f7f67efde0158a646b349 Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Mon, 1 Jun 2020 07:04:58 -0700 Subject: [PATCH 42/49] Bump version number to 1.05 --- codebasin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/codebasin.py b/codebasin.py index 9e6a6eb..bdb6ecf 100755 --- a/codebasin.py +++ b/codebasin.py @@ -25,7 +25,7 @@ from codebasin import config, finder, report, util, walkers -version = 1.0 +version = 1.05 def report_enabled(name): From ab4fcc0de966b790e61a422e4575359053597457 Mon Sep 17 00:00:00 2001 From: Douglas Jacobsen Date: Mon, 1 Jun 2020 07:40:43 -0700 Subject: [PATCH 43/49] Prevent import reorder in sloc_translate This commit adds `# nopep8` to the end of the sloc_translate.py get_file_source import, to prevent autopep8 from reordering it automatically. 
--- etc/sloc_translate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etc/sloc_translate.py b/etc/sloc_translate.py index 72d7ca2..f252d96 100755 --- a/etc/sloc_translate.py +++ b/etc/sloc_translate.py @@ -13,7 +13,7 @@ sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) # pylint: disable=wrong-import-position -from codebasin.file_source import get_file_source +from codebasin.file_source import get_file_source # nopep8 def file_sloc(path, verbose=False): From 0423ccf0add74b676b18539f33186d46c34c7d3c Mon Sep 17 00:00:00 2001 From: Jason Sewall Date: Fri, 19 Jun 2020 12:14:42 -0700 Subject: [PATCH 44/49] Replace asserts in file_source.py with exceptions --- codebasin/file_source.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/codebasin/file_source.py b/codebasin/file_source.py index 4138125..f625bf4 100644 --- a/codebasin/file_source.py +++ b/codebasin/file_source.py @@ -124,7 +124,8 @@ def putback(self, item): Put item into the iterator such that it will be the next yielded item. """ - assert self.single is None + if self.single is not None: + raise RuntimeError("iter_keep1 can only have one item put back at a time!") self.single = item @@ -163,7 +164,8 @@ def logical_newline(self): self.state = ["TOPLEVEL"] elif self.state[-1] == "IN_BLOCK_COMMENT_FOUND_STAR": self.state.pop() - assert self.state[-1] == "IN_BLOCK_COMMENT" + if not self.state[-1] == "IN_BLOCK_COMMENT": + raise RuntimeError("Inconsistent parser state! Looking for / to terminates a block comment but not in a block comment!") elif self.state[-1] == "CPP_DIRECTIVE": self.state = ["TOPLEVEL"] @@ -252,19 +254,21 @@ def process(self, lineiter): elif self.state[-1] == "IN_BLOCK_COMMENT_FOUND_STAR": if char == '/': self.state.pop() - assert self.state[-1] == "IN_BLOCK_COMMENT" + if not self.state[-1] == "IN_BLOCK_COMMENT": + raise RuntimeError("Inconsistent parser state! 
Looking for / to terminates a block comment but not in a block comment!") self.state.pop() self.outbuf.append_space() elif char != '*': self.state.pop() - assert self.state[-1] == "IN_BLOCK_COMMENT" + if not self.state[-1] == "IN_BLOCK_COMMENT": + raise RuntimeError("Inconsistent parser Looking for * that terminates a block comment but not in a block comment!") elif self.state[-1] == "ESCAPING": self.outbuf.append_nonspace(char) self.state.pop() elif self.state[-1] == "IN_INLINE_COMMENT": return else: - assert None + raise RuntimeError("Unknown parser state!") class fortran_cleaner: @@ -379,7 +383,7 @@ def process(self, lineiter): elif is_whitespace(char): self.verify_continue.append(char) else: - assert None + raise RuntimeError("Unknown parser state") except StopIteration: pass if self.state[-1] == "CONTINUING_TO_EOL": @@ -496,8 +500,8 @@ def c_file_source(fp, relaxed=False, directives_only=False): yield curr_line total_sloc += curr_line.physical_reset() - if not relaxed: - assert cleaner.state == ["TOPLEVEL"] + if not relaxed and not cleaner.state == ["TOPLEVEL"]: + raise RuntimeError("C file parser did not end at top level, and not in 'relaxed' mode") return (total_sloc, total_physical_lines) @@ -567,8 +571,8 @@ def fortran_file_source(fp, relaxed=False): yield curr_line total_sloc += curr_line.physical_reset() - if not relaxed: - assert cleaner.state == ["TOPLEVEL"] + if not relaxed and not cleaner.state == ["TOPLEVEL"]: + raise RuntimeError("Fortran file parser did not end at top level, and not in 'relaxed' mode") return (total_sloc, total_physical_lines) From be0d0c12d611a3699eeea0646bcd492ac0b1e44c Mon Sep 17 00:00:00 2001 From: Jason Sewall Date: Fri, 19 Jun 2020 12:17:27 -0700 Subject: [PATCH 45/49] Bump version number in setup.py to 1.05 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 2fdbaf1..6fc5d44 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ from setuptools import setup 
setup(name='codebasin', - version='1.0', + version='1.05', description='Code Base Investigator', author='John Pennycook', author_email='john.pennycook@intel.com', From 17218f2a8a7cda706e06fad31e36f379eacefd1d Mon Sep 17 00:00:00 2001 From: Jason Sewall Date: Fri, 19 Jun 2020 12:18:17 -0700 Subject: [PATCH 46/49] Bump python_requires in setup.py to 3.6 We use f-strings, which are found in 3.6+ --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 6fc5d44..1446a5c 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,7 @@ 'License :: OSI Approved :: BSD License', 'Programming Language :: Python', 'Topic :: Software Development'], - python_requires='>=3.4', + python_requires='>=3.6', install_requires=['numpy', 'matplotlib', 'pyyaml', From b5a808a4c148e4692c02f5e0eff4d5f5ea15753a Mon Sep 17 00:00:00 2001 From: Jason Sewall Date: Fri, 19 Jun 2020 12:36:37 -0700 Subject: [PATCH 47/49] Fix typos in exception strings --- codebasin/file_source.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/codebasin/file_source.py b/codebasin/file_source.py index f625bf4..e887b98 100644 --- a/codebasin/file_source.py +++ b/codebasin/file_source.py @@ -165,7 +165,8 @@ def logical_newline(self): elif self.state[-1] == "IN_BLOCK_COMMENT_FOUND_STAR": self.state.pop() if not self.state[-1] == "IN_BLOCK_COMMENT": - raise RuntimeError("Inconsistent parser state! Looking for / to terminates a block comment but not in a block comment!") + raise RuntimeError( + "Inconsistent parser state! Looking for / to terminate a block comment but not in a block comment!") elif self.state[-1] == "CPP_DIRECTIVE": self.state = ["TOPLEVEL"] @@ -255,13 +256,15 @@ def process(self, lineiter): if char == '/': self.state.pop() if not self.state[-1] == "IN_BLOCK_COMMENT": - raise RuntimeError("Inconsistent parser state! 
Looking for / to terminates a block comment but not in a block comment!") + raise RuntimeError( + "Inconsistent parser state! Looking for / to terminate a block comment but not in a block comment!") self.state.pop() self.outbuf.append_space() elif char != '*': self.state.pop() if not self.state[-1] == "IN_BLOCK_COMMENT": - raise RuntimeError("Inconsistent parser Looking for * that terminates a block comment but not in a block comment!") + raise RuntimeError( + "Inconsistent parser state! Looking for * that terminates a block comment but not in a block comment!") elif self.state[-1] == "ESCAPING": self.outbuf.append_nonspace(char) self.state.pop() @@ -572,7 +575,8 @@ def fortran_file_source(fp, relaxed=False): total_sloc += curr_line.physical_reset() if not relaxed and not cleaner.state == ["TOPLEVEL"]: - raise RuntimeError("Fortran file parser did not end at top level, and not in 'relaxed' mode") + raise RuntimeError( + "Fortran file parser did not end at top level, and not in 'relaxed' mode") return (total_sloc, total_physical_lines) From b1d1fafe381afde509e674f6e232616e005fa445 Mon Sep 17 00:00:00 2001 From: Jason Sewall Date: Fri, 10 Jul 2020 06:52:51 -0700 Subject: [PATCH 48/49] Use realpath in sloc_translate to resolve symlinks This allows us to check real file extensions --- etc/sloc_translate.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/etc/sloc_translate.py b/etc/sloc_translate.py index f252d96..86d4a99 100755 --- a/etc/sloc_translate.py +++ b/etc/sloc_translate.py @@ -43,9 +43,10 @@ def walk_sloc(in_root, extensions, verbose=False): """ Run file_sloc on each file that matches regexp under root path. 
""" + in_root = os.path.realpath(in_root) for root, _, files in os.walk(in_root): for current_file in files: - full_path = os.path.join(root, current_file) + full_path = os.path.realpath(os.path.join(root, current_file)) if Path(full_path).suffix in extensions: try: (filename, total_sloc, physical_loc) = file_sloc(full_path) @@ -60,7 +61,8 @@ def sloc_translate(args): Toplevel routine for script. """ if len(args) == 2: - (filename, total_sloc, physical_loc) = file_sloc(args[1], verbose=True) + path = os.path.realpath(args[1]) + (filename, total_sloc, physical_loc) = file_sloc(path, verbose=True) print(f"{filename}, {total_sloc}, {physical_loc}") elif len(args) == 3: cleaned = [f".{x}" for x in args[2].split(',')] From a046504bd596a13d4991e30dd08ed4e75e9a3f4b Mon Sep 17 00:00:00 2001 From: Jason Sewall Date: Fri, 10 Jul 2020 07:22:14 -0700 Subject: [PATCH 49/49] Add safe_open_read_nofollow and use it This wraps open() for reading but prevents us from following links. By using os.path.realpath to resolve links first, then checking extensions, and then using this function, we can be certain that we don't read a file with a bogus extension. --- codebasin/config.py | 4 ++-- codebasin/file_parser.py | 6 ++++-- codebasin/preprocessor.py | 2 +- codebasin/util.py | 6 ++++++ etc/sloc_translate.py | 3 ++- 5 files changed, 15 insertions(+), 6 deletions(-) diff --git a/codebasin/config.py b/codebasin/config.py index 4cb49aa..3a6516d 100644 --- a/codebasin/config.py +++ b/codebasin/config.py @@ -134,7 +134,7 @@ def load_database(dbpath, rootdir): Return a list of compilation commands, where each command is represented as a compilation database entry. """ - with open(dbpath, 'r') as fi: + with util.safe_open_read_nofollow(dbpath, 'r') as fi: db = yaml.safe_load(fi) configuration = [] @@ -244,7 +244,7 @@ def load(config_file, rootdir): Return a (codebase, platform configuration) tuple of dicts. 
""" if os.path.isfile(config_file): - with open(config_file, 'r') as f: + with util.safe_open_read_nofollow(config_file, 'r') as f: config = yaml.safe_load(f) else: raise RuntimeError("Could not open {!s}.".format(config_file)) diff --git a/codebasin/file_parser.py b/codebasin/file_parser.py index 86dfb8f..97ef72c 100644 --- a/codebasin/file_parser.py +++ b/codebasin/file_parser.py @@ -5,8 +5,10 @@ and building a tree of nodes from it. """ +import os from codebasin.file_source import get_file_source from . import preprocessor # pylint : disable=no-name-in-module +from . import util # pylint : disable=no-name-in-module class LineGroup: @@ -76,7 +78,7 @@ class FileParser: """ def __init__(self, _filename): - self._filename = _filename + self._filename = os.path.realpath(_filename) @staticmethod def handle_directive(out_tree, groups, logical_line): @@ -129,7 +131,7 @@ def parse_file(self): if not file_source: raise RuntimeError(f"{self._filename} doesn't appear " + "to be a language this tool can process") - with open(self._filename, mode='r', errors='replace') as source_file: + with util.safe_open_read_nofollow(self._filename, mode='r', errors='replace') as source_file: groups = {'code': LineGroup(), 'directive': LineGroup(), diff --git a/codebasin/preprocessor.py b/codebasin/preprocessor.py index 15f19e9..2b0088d 100644 --- a/codebasin/preprocessor.py +++ b/codebasin/preprocessor.py @@ -443,7 +443,7 @@ def __init__(self, _filename): def __compute_file_hash(self): chunk_size = 4096 hasher = hashlib.sha512() - with open(self.filename, 'rb') as in_file: + with util.safe_open_read_nofollow(self.filename, 'rb') as in_file: for chunk in iter(lambda: in_file.read(chunk_size), b""): hasher.update(chunk) diff --git a/codebasin/util.py b/codebasin/util.py index b65f6cc..67bad44 100644 --- a/codebasin/util.py +++ b/codebasin/util.py @@ -50,6 +50,12 @@ def safe_open_write_binary(fname): return os.fdopen(fpid, "wb") +def safe_open_read_nofollow(fname, *args, **kwargs): + """Open 
fname for reading, but don't follow links.""" + fpid = os.open(fname, os.O_RDONLY | os.O_NOFOLLOW) + return os.fdopen(fpid, *args, **kwargs) + + def valid_path(path): """Return true if the path passed in is valid""" valid = True diff --git a/etc/sloc_translate.py b/etc/sloc_translate.py index 86d4a99..6a28382 100755 --- a/etc/sloc_translate.py +++ b/etc/sloc_translate.py @@ -14,6 +14,7 @@ # pylint: disable=wrong-import-position from codebasin.file_source import get_file_source # nopep8 +from codebasin.util import safe_open_read_nofollow # nopep8 def file_sloc(path, verbose=False): @@ -23,7 +24,7 @@ def file_sloc(path, verbose=False): file_source = get_file_source(path) if not file_source: raise RuntimeError(f"{path} doesn't appear to be a language this tool can process") - with open(path, mode='r', errors='replace') as source_file: + with safe_open_read_nofollow(path, mode='r', errors='replace') as source_file: walker = file_source(source_file, relaxed=False) try: while True: