From a7dff7e93401f8da5f2c4a868fb96175b5d2b377 Mon Sep 17 00:00:00 2001
From: Jason Sewall <jason.sewall@intel.com>
Date: Tue, 5 Nov 2019 10:22:39 -0800
Subject: [PATCH 01/49] Add test for complex C code with continuations

---
 tests/comments/__init__.py      |  2 ++
 tests/comments/continuation.cpp | 55 +++++++++++++++++++++++++++++++++
 tests/comments/test_comments.py | 22 +++++++++++++
 3 files changed, 79 insertions(+)
 create mode 100644 tests/comments/__init__.py
 create mode 100644 tests/comments/continuation.cpp
 create mode 100644 tests/comments/test_comments.py

diff --git a/tests/comments/__init__.py b/tests/comments/__init__.py
new file mode 100644
index 0000000..93af6d4
--- /dev/null
+++ b/tests/comments/__init__.py
@@ -0,0 +1,2 @@
+# Copyright (C) 2019 Intel Corporation
+# SPDX-License-Identifier: BSD-3-Clause
diff --git a/tests/comments/continuation.cpp b/tests/comments/continuation.cpp
new file mode 100644
index 0000000..9101835
--- /dev/null
+++ b/tests/comments/continuation.cpp
@@ -0,0 +1,55 @@
+// Copyright (C) 2019-2020 Intel Corporation
+// SPDX-License-Identifier: BSD-3-Clause
+
+int i = \
+    5; // comment \
+         lines \
+         more
+
+
+int x = \
+    1 /* now what \
+        comment \
+       // \
+       */ +2/\
+* hahaha */+3;
+
+"long //   - - -   string \
+and    \
+\
+\
+stuff    "
+
+  \     char w[] =
+"confusing \"\
+   string \" \n\
+\"  \" \\ \/ /* \* */ "; \
+"long   - - -   string \
+and /* \" */           \
+   \
+   \
+stuff    "
+
+/* big block comment
+** and so on
+** and so on too
+*/
+
+'"'
+
+''//what about this?\
+d'
+
+'/'
+
+"'\"'"
+
+int foo(); /\
+* hahahaha *\
+/
+
+#warning Dangerous don't do this
+#warning "This is more safe"
+
+/* "Strings 'r' // Fun! *\
+/
diff --git a/tests/comments/test_comments.py b/tests/comments/test_comments.py
new file mode 100644
index 0000000..5caf55a
--- /dev/null
+++ b/tests/comments/test_comments.py
@@ -0,0 +1,22 @@
+# Copyright (C) 2019 Intel Corporation
+# SPDX-License-Identifier: BSD-3-Clause
+
+import unittest
+import logging
+import os
+from codebasin import preprocessor, file_parser
+
+class TestExampleFile(unittest.TestCase):
+    """
+    Test handling of comments
+    """
+
+    def test_c_comments(self):
+        rootdir = "./tests/comments/"
+        parser = file_parser.FileParser(os.path.join(rootdir, "continuation.cpp"))
+
+        tree = parser.parse_file()
+        self.assertEqual(tree.root.children[0].num_lines, 25)
+
+if __name__ == '__main__':
+    unittest.main()

From ed76d521942a2beedf08daa6cc0c988d5af710ff Mon Sep 17 00:00:00 2001
From: Jason Sewall <jason.sewall@intel.com>
Date: Thu, 7 Nov 2019 13:53:15 -0800
Subject: [PATCH 02/49] Add new c preprocessor stages

This slightly breaks Fortran support and still needs better comments and
other cleanup, but I wanted to push it to get feedback before going too far.
---
 codebasin/c_source.py    | 194 +++++++++++++++++++++++++++++++++++++++
 codebasin/file_parser.py | 145 ++++++++---------------------
 2 files changed, 232 insertions(+), 107 deletions(-)
 create mode 100644 codebasin/c_source.py

diff --git a/codebasin/c_source.py b/codebasin/c_source.py
new file mode 100644
index 0000000..8aeefe9
--- /dev/null
+++ b/codebasin/c_source.py
@@ -0,0 +1,194 @@
+# Copyright (C) 2019 Intel Corporation
+# SPDX-License-Identifier: BSD-3-Clause
+"""
+Contains classes and functions for stripping comments and whitespace from C/C++ files
+"""
+
+global whitespace_dict
+whitespace_dict = dict.fromkeys(' \t\n\r\x0b\x0c\x1c\x1d\x1e\x1f\x85\xa0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000')
+
+def is_whitespace(c):
+    return c in whitespace_dict
+
+class one_space_line(object):
+    def __init__(self):
+        self.parts = []
+        self.trailing_space = False
+    def append_char(self, c):
+        if not is_whitespace(c):
+            self.parts.append(c)
+            self.trailing_space = False
+        else:
+            if not self.trailing_space:
+                self.parts.append(' ')
+                self.trailing_space = True
+    def append_space(self):
+        if not self.trailing_space:
+            self.parts.append(' ')
+            self.trailing_space = True
+    def append_nonspace(self, c):
+        self.parts.append(c)
+        self.trailing_space = False
+    def join(self, other):
+        if len(other.parts) > 0:
+            if other.parts[0] == ' ' and self.trailing_space:
+                self.parts += other.parts[1:]
+            else:
+                self.parts += other.parts[:]
+            self.trailing_space = other.trailing_space
+    def is_blank(self):
+        return len(self.parts) == 0 or ( len(self.parts) == 1 and self.parts[0] == ' ' )
+    def flush(self):
+        res= ''.join(self.parts)
+        self.__init__()
+        return res
+
+class c_cleaner(object):
+    def __init__(self, outbuf):
+        self.state = ["NO_COMMENT"]
+        self.outbuf = outbuf
+    def logical_newline(self):
+        if self.state[-1] == "IN_INLINE_COMMENT":
+            self.state.pop()
+            assert self.state == ["NO_COMMENT"]
+            self.outbuf.append_space()
+        elif self.state[-1] == "FOUND_SLASH":
+            self.state.pop()
+            assert self.state == ["NO_COMMENT"]
+            self.outbuf.append_nonspace('/')
+        elif self.state[-1] == "SINGLE_QUOTATION":
+            # This probably should give a warning
+            self.state.pop()
+            assert self.state == ["NO_COMMENT"]
+        elif self.state[-1] == "DOUBLE_QUOTATION":
+            # This probably should give a warning
+            self.state.pop()
+            assert self.state == ["NO_COMMENT"]
+        elif self.state[-1] == "IN_BLOCK_COMMENT_FOUND_STAR":
+            self.state.pop()
+            assert self.state[-1] == "IN_BLOCK_COMMENT"
+    def process(self, line, start, end):
+        pos = start
+        while pos < end:
+            if self.state[-1] == "NO_COMMENT":
+                if line[pos] == '\\':
+                    self.state.append("ESCAPING")
+                    self.outbuf.append_nonspace(line[pos])
+                elif line[pos] == '/':
+                    self.state.append("FOUND_SLASH")
+                elif line[pos] == '"':
+                    self.state.append("DOUBLE_QUOTATION")
+                    self.outbuf.append_nonspace(line[pos])
+                elif line[pos] == '\'':
+                    self.state.append("SINGLE_QUOTATION")
+                    self.outbuf.append_nonspace(line[pos])
+                else:
+                    self.outbuf.append_char(line[pos])
+            elif self.state[-1] == "DOUBLE_QUOTATION":
+                if line[pos] == '\\':
+                    self.state.append("ESCAPING")
+                    self.outbuf.append_nonspace(line[pos])
+                elif line[pos] == '"':
+                    self.state.pop()
+                    assert self.state == ["NO_COMMENT"]
+                    self.outbuf.append_nonspace(line[pos])
+                else:
+                    self.outbuf.append_nonspace(line[pos])
+            elif self.state[-1] == "SINGLE_QUOTATION":
+                if line[pos] == '\\':
+                    self.state.append("ESCAPING")
+                    self.outbuf.append_nonspace(line[pos])
+                elif line[pos] == '/':
+                    self.state.append("FOUND_SLASH")
+                elif line[pos] == '\'':
+                    self.state.pop()
+                    assert self.state == ["NO_COMMENT"]
+                    self.outbuf.append_nonspace(line[pos])
+                else:
+                    self.outbuf.append_nonspace(line[pos])
+            elif self.state[-1] == "FOUND_SLASH":
+                if line[pos] == '/':
+                    self.state.pop()
+                    self.state.append("IN_INLINE_COMMENT")
+                elif line[pos] == '*':
+                    self.state.pop()
+                    self.state.append("IN_BLOCK_COMMENT")
+                else:
+                    self.state.pop()
+                    self.outbuf.append_char('/')
+                    pos -= 1
+            elif self.state[-1] == "IN_BLOCK_COMMENT":
+                if line[pos] == '*':
+                    self.state.append("IN_BLOCK_COMMENT_FOUND_STAR")
+            elif self.state[-1] == "IN_BLOCK_COMMENT_FOUND_STAR":
+                if line[pos] == '/':
+                    self.state.pop()
+                    assert self.state[-1] == "IN_BLOCK_COMMENT"
+                    self.state.pop()
+                    assert self.state == ["NO_COMMENT"]
+                    self.outbuf.append_space()
+                elif line[pos] != '*':
+                    self.state.pop()
+                    assert self.state[-1] == "IN_BLOCK_COMMENT"
+            elif self.state[-1] == "ESCAPING":
+                self.outbuf.append_nonspace(line[pos])
+                self.state.pop()
+            elif self.state[-1] == "IN_INLINE_COMMENT":
+                return
+            pos += 1
+
+def c_file_source(fp):
+
+    current_physical_line = one_space_line()
+    cleaner = c_cleaner(current_physical_line)
+
+    current_logical_line = one_space_line()
+
+    current_physical_start = 1
+    total_sloc = 0
+    local_sloc = 0
+
+    physical_line_num = 0
+    for (physical_line_num, line) in enumerate(fp, start=1):
+        current_physical_line.__init__()
+        end = len(line)
+        if line[-1] == '\n':
+            end -= 1
+        else:
+            if end > 0 and line[end-1] == '\\':
+                raise RuntimeError("file seems to end in \\ with no newline!")
+
+        if end > 0 and line[end-1] == '\\':
+            continued = True
+            end -= 1
+        else:
+            continued = False
+        cleaner.process(line, 0, end)
+        if not continued:
+            cleaner.logical_newline()
+
+        if not current_physical_line.is_blank():
+            local_sloc += 1
+
+        current_logical_line.join(current_physical_line)
+
+        if not continued:
+            if not current_logical_line.is_blank():
+                yield ((current_physical_start, physical_line_num+1), local_sloc, current_logical_line.flush())
+            else:
+                current_logical_line.__init__()
+                assert local_sloc == 0
+
+            current_physical_start = physical_line_num + 1
+            total_sloc += local_sloc
+            local_sloc = 0
+
+    total_physical_lines = physical_line_num
+
+    if not current_logical_line.is_blank():
+        yield ((current_physical_start, physical_line_num+1), local_sloc, current_logical_line.flush())
+
+    total_sloc += local_sloc
+    assert cleaner.state == ["NO_COMMENT"]
+
+    return (total_sloc, total_physical_lines)
diff --git a/codebasin/file_parser.py b/codebasin/file_parser.py
index a22ddc2..180aace 100644
--- a/codebasin/file_parser.py
+++ b/codebasin/file_parser.py
@@ -9,6 +9,7 @@
 
 from . import preprocessor  # pylint : disable=no-name-in-module
 
+from codebasin.c_source import c_file_source
 
 class LineGroup:
     """
@@ -30,26 +31,19 @@ def empty(self):
             return False
         return True
 
-    def add_line(self, line_num, is_countable=False):
+    def add_line(self, phys_int, sloc_count):
         """
         Add a line to this line group. Update the extent appropriately,
         and if it's a countable line, add it to the line count.
         """
 
-        if self.start_line == -1:
-            self.start_line = line_num
-
-        self.end_line = line_num
-
-        if self.start_line == -1 or line_num < self.start_line:
-            self.start_line = line_num
-
-        if line_num > self.end_line:
-            self.end_line = line_num
+        if self.start_line == -1 or phys_int[0] < self.start_line:
+            self.start_line = phys_int[0]
 
-        if is_countable:
-            self.line_count += 1
+        if phys_int[1]-1 > self.end_line:
+            self.end_line = phys_int[1]-1
 
+        self.line_count += sloc_count
     def reset(self):
         """
         Reset the countable group
@@ -58,13 +52,12 @@ def reset(self):
         self.start_line = -1
         self.end_line = -1
 
-    def merge(self, line_group, count=False):
+    def merge(self, line_group):
         """
         Merge another line group into this line group, and reset the
         other group.
         """
-        if count:
-            self.line_count += line_group.line_count
+        self.line_count += line_group.line_count
 
         if self.start_line == -1:
             self.start_line = line_group.start_line
@@ -85,7 +78,6 @@ class FileParser:
 
     def __init__(self, _filename):
         self._filename = _filename
-        self.full_line = ''
 
         split = splitext(_filename)
         if len(split) == 2:
@@ -94,53 +86,22 @@ def __init__(self, _filename):
             self._file_extension = None
 
     @staticmethod
-    def line_info(line):
-        """
-        Determine if the input line is a directive by checking if the
-        first by looking for a '#' as the first non-whitespace
-        character. Also determine if the last character before a new
-        line is a line continuation character '\'.
-
-        Return a (directive, line_continue) tuple.
-        """
-
-        directive = False
-        line_continue = False
-
-        for c in line:
-            if c == '#':
-                directive = True
-                break
-            elif c not in [' ', '\t']:
-                break
-
-        if line.rstrip("\n\r")[-1:] == '\\':
-            line_continue = True
-
-        return (directive, line_continue)
-
-    def handle_directive(self, out_tree, line_num, comment_cleaner, groups):
+    def handle_directive(out_tree, groups, phys_int, sloc, logical_line):
         """
         Handle inserting code and directive nodes, where appropriate.
         Update the file group, and reset the code and directive groups.
         """
         # We will actually use this directive, if it is not empty
-        self.full_line = comment_cleaner.strip_comments(self.full_line)
-        if self.full_line.strip():
-            # We need to finalize the previously started
-            # CodeNode (if there was one) before processing
-            # this DirectiveNode
-            if not groups['code'].empty():
-                groups['code'].add_line(line_num - 1)
-                self.insert_code_node(out_tree, groups['code'])
+        # We need to finalize the previously started
+        # CodeNode (if there was one) before processing
+        # this DirectiveNode
+        if not groups['code'].empty():
+            FileParser.insert_code_node(out_tree, groups['code'])
+            groups['file'].merge(groups['code'])
 
-                groups['file'].merge(groups['code'])
+        FileParser.insert_directive_node(out_tree, groups['directive'], logical_line)
 
-            self.insert_directive_node(out_tree, groups['directive'])
-
-            groups['file'].merge(groups['directive'])
-        else:
-            groups['code'].merge(groups['directive'])
+        groups['file'].merge(groups['directive'])
 
     @staticmethod
     def insert_code_node(tree, line_group):
@@ -151,13 +112,14 @@ def insert_code_node(tree, line_group):
             line_group.start_line, line_group.end_line, line_group.line_count)
         tree.insert(new_node)
 
-    def insert_directive_node(self, tree, line_group):
+    @staticmethod
+    def insert_directive_node(tree, line_group, logical_line):
         """
         Build a directive node by parsing a directive line, and insert a
         new directive node into the tree.
         """
         new_node = preprocessor.DirectiveParser(preprocessor.Lexer(
-            self.full_line, line_group.start_line).tokenize()).parse()
+            logical_line, line_group.start_line).tokenize()).parse()
         new_node.start_line = line_group.start_line
         new_node.end_line = line_group.end_line
         new_node.num_lines = line_group.line_count
@@ -169,12 +131,6 @@ def parse_file(self):
         representing this file, and return it.
         """
 
-        file_comment_cleaner = preprocessor.CommentCleaner(self._file_extension)
-        if file_comment_cleaner.filetype == 'c':
-            cpp_comment_cleaner = file_comment_cleaner
-        else:
-            cpp_comment_cleaner = preprocessor.CommentCleaner('.c')
-
         out_tree = preprocessor.SourceTree(self._filename)
         with open(self._filename, mode='r', errors='replace') as source_file:
             previous_continue = False
@@ -186,56 +142,31 @@ def parse_file(self):
 
             groups['file'].start_line = 1
 
-            lines = source_file.readlines()
-            for (line_num, line) in enumerate(lines, 1):
-                # Determine if this line starts with a # (directive)
-                # and/or ends with a \ (line continuation)
-                (in_directive, continue_line) = self.line_info(line)
-
-                # Only follow continuation for directives
-                if previous_continue or in_directive:
-
-                    # Add this into the directive lines, even if it
-                    # might not be a directive we count
-                    groups['directive'].add_line(line_num, True)
+            c_source = c_file_source(source_file)
+            try:
+                while True:
+                    (phys_int, local_sloc, logical_line) = next(c_source)
+                    in_directive = logical_line[0] == "#" or logical_line[0] == ' ' and logical_line[1] == '#'
+                    # Only follow continuation for directives
+                    if in_directive:
+                        # Add this into the directive lines, even if it
+                        # might not be a directive we count
+                        groups['directive'].add_line(phys_int, local_sloc)
 
-                    # If this line starts a new directive, flush the
-                    # line buffer
-                    if in_directive and not previous_continue:
-                        self.full_line = ''
+                        FileParser.handle_directive(out_tree, groups, phys_int, local_sloc, logical_line)
 
-                    previous_continue = continue_line
-
-                    # If this line also contains a continuation
-                    # character
-                    if continue_line:
-                        self.full_line += line.rstrip("\\\n\r")
-                    # If this line ends a previously continued line
+                        # FallBack is that this line is a simple code line.
                     else:
-                        self.full_line += line.rstrip("\n\r")
-
-                        self.handle_directive(out_tree, line_num, cpp_comment_cleaner,
-                                              groups)
+                        groups['code'].add_line(phys_int, local_sloc)
+            except StopIteration as it:
+                total_sloc, physical_loc = it.value
 
-                # FallBack is that this line is a simple code line.
-                else:
-                    previous_continue = False
-
-                    # If the line isn't empty after stripping comments,
-                    # count it as code
-                    if file_comment_cleaner.strip_comments(line[0:-1]).strip():
-                        groups['code'].add_line(line_num, True)
-                    else:
-                        groups['code'].add_line(line_num)
-
-            # Insert any code lines left at the end of the file
             if not groups['code'].empty():
-                groups['code'].add_line(len(lines))
                 self.insert_code_node(out_tree, groups['code'])
-
                 groups['file'].merge(groups['code'])
 
-            groups['file'].add_line(len(lines))
+
+            groups['file'].add_line((1, physical_loc-1), total_sloc)
             out_tree.root.num_lines = groups['file'].end_line
             out_tree.root.total_sloc = groups['file'].line_count
             return out_tree

From 45e5f15e958072cb00d541fde4128eb8822fa397 Mon Sep 17 00:00:00 2001
From: Jason Sewall <jason.sewall@intel.com>
Date: Fri, 8 Nov 2019 08:42:09 -0800
Subject: [PATCH 03/49] Correct line counting test

---
 codebasin/file_parser.py        | 7 +++----
 tests/comments/test_comments.py | 2 +-
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/codebasin/file_parser.py b/codebasin/file_parser.py
index 180aace..2da9d82 100644
--- a/codebasin/file_parser.py
+++ b/codebasin/file_parser.py
@@ -86,7 +86,7 @@ def __init__(self, _filename):
             self._file_extension = None
 
     @staticmethod
-    def handle_directive(out_tree, groups, phys_int, sloc, logical_line):
+    def handle_directive(out_tree, groups, logical_line):
         """
         Handle inserting code and directive nodes, where appropriate.
         Update the file group, and reset the code and directive groups.
@@ -153,7 +153,7 @@ def parse_file(self):
                         # might not be a directive we count
                         groups['directive'].add_line(phys_int, local_sloc)
 
-                        FileParser.handle_directive(out_tree, groups, phys_int, local_sloc, logical_line)
+                        FileParser.handle_directive(out_tree, groups, logical_line)
 
                         # FallBack is that this line is a simple code line.
                     else:
@@ -162,11 +162,10 @@ def parse_file(self):
                 total_sloc, physical_loc = it.value
 
             if not groups['code'].empty():
+                groups['code'].add_line((groups['code'].start_line, physical_loc-1), 0)
                 self.insert_code_node(out_tree, groups['code'])
                 groups['file'].merge(groups['code'])
 
-
-            groups['file'].add_line((1, physical_loc-1), total_sloc)
             out_tree.root.num_lines = groups['file'].end_line
             out_tree.root.total_sloc = groups['file'].line_count
             return out_tree
diff --git a/tests/comments/test_comments.py b/tests/comments/test_comments.py
index 5caf55a..c9608cd 100644
--- a/tests/comments/test_comments.py
+++ b/tests/comments/test_comments.py
@@ -16,7 +16,7 @@ def test_c_comments(self):
         parser = file_parser.FileParser(os.path.join(rootdir, "continuation.cpp"))
 
         tree = parser.parse_file()
-        self.assertEqual(tree.root.children[0].num_lines, 25)
+        self.assertEqual(tree.root.total_sloc, 25)
 
 if __name__ == '__main__':
     unittest.main()

From 06eb6829e725f2c99c330b5c7cd1723172d31512 Mon Sep 17 00:00:00 2001
From: Jason Sewall <jason.sewall@intel.com>
Date: Fri, 8 Nov 2019 09:07:10 -0800
Subject: [PATCH 04/49] Add standalone sloc_translate utility

---
 sloc_translate.py | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)
 create mode 100755 sloc_translate.py

diff --git a/sloc_translate.py b/sloc_translate.py
new file mode 100755
index 0000000..73d52b4
--- /dev/null
+++ b/sloc_translate.py
@@ -0,0 +1,39 @@
+#!/usr/bin/env python3.6
+# Copyright (C) 2019-2020 Intel Corporation
+# SPDX-License-Identifier: BSD-3-Clause
+
+from codebasin.c_source import c_file_source
+import sys
+import re
+
+def file_sloc(path, verbose=False):
+    with open(path, mode='r', errors='replace') as source_file:
+        walker = c_file_source(source_file)
+        try:
+            while True:
+                (interval, sloc, line) = next(walker)
+                if verbose:
+                    print(f"{path} [{interval[0]}, {interval[1]}) ({sloc}): {line}")
+        except StopIteration as it:
+            total_sloc, physical_loc = it.value
+
+    return (path, total_sloc, physical_loc)
+
+def walk_sloc(root, regexp, verbose=False):
+    for root, dirs, files in os.walk(root):
+        for f in files:
+            full_path = os.path.join(root, f)
+            if regexp.match(full_path):
+                try:
+                    (filename, total_sloc, physical_loc)  = file_sloc(full_path)
+                    print(f"{filename}, {total_sloc}, {physical_loc}")
+                except FileNotFoundError:
+                    pass
+
+
+if __name__ == '__main__':
+    filename = sys.argv[1]
+    (filename, total_sloc, physical_loc)  = file_sloc(filename, verbose=True)
+    print(f"{filename}, {total_sloc}, {physical_loc}")
+
+#    walk_sloc(sys.argv[1], re.compile(sys.argv[2]))

From b37901ae44e14f2ef153e844a301f8ee64aee4e5 Mon Sep 17 00:00:00 2001
From: Jason Sewall <jason.sewall@intel.com>
Date: Fri, 8 Nov 2019 09:07:28 -0800
Subject: [PATCH 05/49] Rename NO_COMMENT to TOPLEVEL

---
 codebasin/c_source.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/codebasin/c_source.py b/codebasin/c_source.py
index 8aeefe9..a292b39 100644
--- a/codebasin/c_source.py
+++ b/codebasin/c_source.py
@@ -45,32 +45,32 @@ def flush(self):
 
 class c_cleaner(object):
     def __init__(self, outbuf):
-        self.state = ["NO_COMMENT"]
+        self.state = ["TOPLEVEL"]
         self.outbuf = outbuf
     def logical_newline(self):
         if self.state[-1] == "IN_INLINE_COMMENT":
             self.state.pop()
-            assert self.state == ["NO_COMMENT"]
+            assert self.state == ["TOPLEVEL"]
             self.outbuf.append_space()
         elif self.state[-1] == "FOUND_SLASH":
             self.state.pop()
-            assert self.state == ["NO_COMMENT"]
+            assert self.state == ["TOPLEVEL"]
             self.outbuf.append_nonspace('/')
         elif self.state[-1] == "SINGLE_QUOTATION":
             # This probably should give a warning
             self.state.pop()
-            assert self.state == ["NO_COMMENT"]
+            assert self.state == ["TOPLEVEL"]
         elif self.state[-1] == "DOUBLE_QUOTATION":
             # This probably should give a warning
             self.state.pop()
-            assert self.state == ["NO_COMMENT"]
+            assert self.state == ["TOPLEVEL"]
         elif self.state[-1] == "IN_BLOCK_COMMENT_FOUND_STAR":
             self.state.pop()
             assert self.state[-1] == "IN_BLOCK_COMMENT"
     def process(self, line, start, end):
         pos = start
         while pos < end:
-            if self.state[-1] == "NO_COMMENT":
+            if self.state[-1] == "TOPLEVEL":
                 if line[pos] == '\\':
                     self.state.append("ESCAPING")
                     self.outbuf.append_nonspace(line[pos])
@@ -90,7 +90,7 @@ def process(self, line, start, end):
                     self.outbuf.append_nonspace(line[pos])
                 elif line[pos] == '"':
                     self.state.pop()
-                    assert self.state == ["NO_COMMENT"]
+                    assert self.state == ["TOPLEVEL"]
                     self.outbuf.append_nonspace(line[pos])
                 else:
                     self.outbuf.append_nonspace(line[pos])
@@ -102,7 +102,7 @@ def process(self, line, start, end):
                     self.state.append("FOUND_SLASH")
                 elif line[pos] == '\'':
                     self.state.pop()
-                    assert self.state == ["NO_COMMENT"]
+                    assert self.state == ["TOPLEVEL"]
                     self.outbuf.append_nonspace(line[pos])
                 else:
                     self.outbuf.append_nonspace(line[pos])
@@ -125,7 +125,7 @@ def process(self, line, start, end):
                     self.state.pop()
                     assert self.state[-1] == "IN_BLOCK_COMMENT"
                     self.state.pop()
-                    assert self.state == ["NO_COMMENT"]
+                    assert self.state == ["TOPLEVEL"]
                     self.outbuf.append_space()
                 elif line[pos] != '*':
                     self.state.pop()
@@ -189,6 +189,6 @@ def c_file_source(fp):
         yield ((current_physical_start, physical_line_num+1), local_sloc, current_logical_line.flush())
 
     total_sloc += local_sloc
-    assert cleaner.state == ["NO_COMMENT"]
+    assert cleaner.state == ["TOPLEVEL"]
 
     return (total_sloc, total_physical_lines)

From 1a53e093f100de96cc262dbe8f678dca2fe5dbc9 Mon Sep 17 00:00:00 2001
From: Jason Sewall <jason.sewall@intel.com>
Date: Fri, 8 Nov 2019 11:37:38 -0800
Subject: [PATCH 06/49] Upgrade c_cleaner directive detection

---
 codebasin/c_source.py    | 62 ++++++++++++++++++++++++++++------------
 codebasin/file_parser.py |  5 ++--
 sloc_translate.py        |  4 +--
 3 files changed, 48 insertions(+), 23 deletions(-)

diff --git a/codebasin/c_source.py b/codebasin/c_source.py
index a292b39..041b3e9 100644
--- a/codebasin/c_source.py
+++ b/codebasin/c_source.py
@@ -36,8 +36,18 @@ def join(self, other):
             else:
                 self.parts += other.parts[:]
             self.trailing_space = other.trailing_space
-    def is_blank(self):
-        return len(self.parts) == 0 or ( len(self.parts) == 1 and self.parts[0] == ' ' )
+    def category(self):
+        res = "SRC_NONBLANK"
+        if len(self.parts) == 0:
+            res = "BLANK"
+        elif len(self.parts) == 1:
+            if self.parts[0] == ' ':
+                res = "BLANK"
+            elif self.parts[0] == '#':
+                res = "CPP_DIRECTIVE"
+        elif ( self.parts[0] == ' ' and self.parts[1] == '#' ) or self.parts[0] == '#':
+            res = "CPP_DIRECTIVE"
+        return res
     def flush(self):
         res= ''.join(self.parts)
         self.__init__()
@@ -49,28 +59,43 @@ def __init__(self, outbuf):
         self.outbuf = outbuf
     def logical_newline(self):
         if self.state[-1] == "IN_INLINE_COMMENT":
-            self.state.pop()
-            assert self.state == ["TOPLEVEL"]
+            self.state = ["TOPLEVEL"]
             self.outbuf.append_space()
         elif self.state[-1] == "FOUND_SLASH":
-            self.state.pop()
-            assert self.state == ["TOPLEVEL"]
+            self.state = ["TOPLEVEL"]
             self.outbuf.append_nonspace('/')
         elif self.state[-1] == "SINGLE_QUOTATION":
             # This probably should give a warning
-            self.state.pop()
-            assert self.state == ["TOPLEVEL"]
+            self.state = ["TOPLEVEL"]
         elif self.state[-1] == "DOUBLE_QUOTATION":
             # This probably should give a warning
-            self.state.pop()
-            assert self.state == ["TOPLEVEL"]
+            self.state = ["TOPLEVEL"]
         elif self.state[-1] == "IN_BLOCK_COMMENT_FOUND_STAR":
             self.state.pop()
             assert self.state[-1] == "IN_BLOCK_COMMENT"
+        elif self.state[-1] == "CPP_DIRECTIVE":
+            self.state = ["TOPLEVEL"]
     def process(self, line, start, end):
         pos = start
         while pos < end:
             if self.state[-1] == "TOPLEVEL":
+                if line[pos] == '\\':
+                    self.state.append("ESCAPING")
+                    self.outbuf.append_nonspace(line[pos])
+                elif line[pos] == '/':
+                    self.state.append("FOUND_SLASH")
+                elif line[pos] == '"':
+                    self.state.append("DOUBLE_QUOTATION")
+                    self.outbuf.append_nonspace(line[pos])
+                elif line[pos] == '\'':
+                    self.state.append("SINGLE_QUOTATION")
+                    self.outbuf.append_nonspace(line[pos])
+                elif line[pos] == '#' and self.outbuf.category() == "BLANK":
+                    self.state.append("CPP_DIRECTIVE")
+                    self.outbuf.append_nonspace(line[pos])
+                else:
+                    self.outbuf.append_char(line[pos])
+            elif self.state[-1] == "CPP_DIRECTIVE":
                 if line[pos] == '\\':
                     self.state.append("ESCAPING")
                     self.outbuf.append_nonspace(line[pos])
@@ -90,7 +115,6 @@ def process(self, line, start, end):
                     self.outbuf.append_nonspace(line[pos])
                 elif line[pos] == '"':
                     self.state.pop()
-                    assert self.state == ["TOPLEVEL"]
                     self.outbuf.append_nonspace(line[pos])
                 else:
                     self.outbuf.append_nonspace(line[pos])
@@ -102,7 +126,6 @@ def process(self, line, start, end):
                     self.state.append("FOUND_SLASH")
                 elif line[pos] == '\'':
                     self.state.pop()
-                    assert self.state == ["TOPLEVEL"]
                     self.outbuf.append_nonspace(line[pos])
                 else:
                     self.outbuf.append_nonspace(line[pos])
@@ -125,7 +148,6 @@ def process(self, line, start, end):
                     self.state.pop()
                     assert self.state[-1] == "IN_BLOCK_COMMENT"
                     self.state.pop()
-                    assert self.state == ["TOPLEVEL"]
                     self.outbuf.append_space()
                 elif line[pos] != '*':
                     self.state.pop()
@@ -135,6 +157,8 @@ def process(self, line, start, end):
                 self.state.pop()
             elif self.state[-1] == "IN_INLINE_COMMENT":
                 return
+            else:
+                assert None
             pos += 1
 
 def c_file_source(fp):
@@ -167,14 +191,15 @@ def c_file_source(fp):
         if not continued:
             cleaner.logical_newline()
 
-        if not current_physical_line.is_blank():
+        if not current_physical_line.category() == "BLANK":
             local_sloc += 1
 
         current_logical_line.join(current_physical_line)
 
         if not continued:
-            if not current_logical_line.is_blank():
-                yield ((current_physical_start, physical_line_num+1), local_sloc, current_logical_line.flush())
+            line_cat = current_logical_line.category()
+            if line_cat != "BLANK":
+                yield ((current_physical_start, physical_line_num+1), local_sloc, current_logical_line.flush(), line_cat)
             else:
                 current_logical_line.__init__()
                 assert local_sloc == 0
@@ -185,8 +210,9 @@ def c_file_source(fp):
 
     total_physical_lines = physical_line_num
 
-    if not current_logical_line.is_blank():
-        yield ((current_physical_start, physical_line_num+1), local_sloc, current_logical_line.flush())
+    line_cat = current_logical_line.category()
+    if line_cat != "BLANK":
+        yield ((current_physical_start, physical_line_num+1), local_sloc, current_logical_line.flush(), line_cat)
 
     total_sloc += local_sloc
     assert cleaner.state == ["TOPLEVEL"]
diff --git a/codebasin/file_parser.py b/codebasin/file_parser.py
index 2da9d82..61cd085 100644
--- a/codebasin/file_parser.py
+++ b/codebasin/file_parser.py
@@ -145,10 +145,9 @@ def parse_file(self):
             c_source = c_file_source(source_file)
             try:
                 while True:
-                    (phys_int, local_sloc, logical_line) = next(c_source)
-                    in_directive = logical_line[0] == "#" or logical_line[0] == ' ' and logical_line[1] == '#'
+                    (phys_int, local_sloc, logical_line, line_cat) = next(c_source)
                     # Only follow continuation for directives
-                    if in_directive:
+                    if line_cat == 'CPP_DIRECTIVE':
                         # Add this into the directive lines, even if it
                         # might not be a directive we count
                         groups['directive'].add_line(phys_int, local_sloc)
diff --git a/sloc_translate.py b/sloc_translate.py
index 73d52b4..b7b63a7 100755
--- a/sloc_translate.py
+++ b/sloc_translate.py
@@ -11,9 +11,9 @@ def file_sloc(path, verbose=False):
         walker = c_file_source(source_file)
         try:
             while True:
-                (interval, sloc, line) = next(walker)
+                (interval, sloc, line, line_cat) = next(walker)
                 if verbose:
-                    print(f"{path} [{interval[0]}, {interval[1]}) ({sloc}): {line}")
+                    print(f"{path} [{interval[0]}, {interval[1]}) ({sloc}): {line} {line_cat}")
         except StopIteration as it:
             total_sloc, physical_loc = it.value
 

From b13489c8a36830834e6d0571e1b1988bc3ca3ee2 Mon Sep 17 00:00:00 2001
From: Jason Sewall <jason.sewall@intel.com>
Date: Fri, 8 Nov 2019 12:02:44 -0800
Subject: [PATCH 07/49] Use an iterator in c_cleaner.process

Requires a 'put back' functionality
---
 codebasin/c_source.py | 99 +++++++++++++++++++++++++------------------
 1 file changed, 58 insertions(+), 41 deletions(-)

diff --git a/codebasin/c_source.py b/codebasin/c_source.py
index 041b3e9..e461c72 100644
--- a/codebasin/c_source.py
+++ b/codebasin/c_source.py
@@ -4,6 +4,8 @@
 Contains classes and functions for stripping comments and whitespace from C/C++ files
 """
 
+import itertools as it
+
 global whitespace_dict
 whitespace_dict = dict.fromkeys(' \t\n\r\x0b\x0c\x1c\x1d\x1e\x1f\x85\xa0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000')
 
@@ -53,6 +55,22 @@ def flush(self):
         self.__init__()
         return res
 
+class iter_keep1(object):
+    def __init__(self, iterator):
+        self.iterator = iter(iterator)
+        self.single = None
+    def __iter__(self):
+        return self
+    def __next__(self):
+        if self.single is not None:
+            res, self.single = self.single, None
+            return res
+        else:
+            return next(self.iterator)
+    def putback(self, item):
+        assert self.single is None
+        self.single = item
+
 class c_cleaner(object):
     def __init__(self, outbuf):
         self.state = ["TOPLEVEL"]
@@ -75,91 +93,90 @@ def logical_newline(self):
             assert self.state[-1] == "IN_BLOCK_COMMENT"
         elif self.state[-1] == "CPP_DIRECTIVE":
             self.state = ["TOPLEVEL"]
-    def process(self, line, start, end):
-        pos = start
-        while pos < end:
+    def process(self, lineiter):
+        inbuffer = iter_keep1(lineiter)
+        for char in inbuffer:
             if self.state[-1] == "TOPLEVEL":
-                if line[pos] == '\\':
+                if char == '\\':
                     self.state.append("ESCAPING")
-                    self.outbuf.append_nonspace(line[pos])
-                elif line[pos] == '/':
+                    self.outbuf.append_nonspace(char)
+                elif char == '/':
                     self.state.append("FOUND_SLASH")
-                elif line[pos] == '"':
+                elif char == '"':
                     self.state.append("DOUBLE_QUOTATION")
-                    self.outbuf.append_nonspace(line[pos])
-                elif line[pos] == '\'':
+                    self.outbuf.append_nonspace(char)
+                elif char == '\'':
                     self.state.append("SINGLE_QUOTATION")
-                    self.outbuf.append_nonspace(line[pos])
-                elif line[pos] == '#' and self.outbuf.category() == "BLANK":
+                    self.outbuf.append_nonspace(char)
+                elif char == '#' and self.outbuf.category() == "BLANK":
                     self.state.append("CPP_DIRECTIVE")
-                    self.outbuf.append_nonspace(line[pos])
+                    self.outbuf.append_nonspace(char)
                 else:
-                    self.outbuf.append_char(line[pos])
+                    self.outbuf.append_char(char)
             elif self.state[-1] == "CPP_DIRECTIVE":
-                if line[pos] == '\\':
+                if char == '\\':
                     self.state.append("ESCAPING")
-                    self.outbuf.append_nonspace(line[pos])
-                elif line[pos] == '/':
+                    self.outbuf.append_nonspace(char)
+                elif char == '/':
                     self.state.append("FOUND_SLASH")
-                elif line[pos] == '"':
+                elif char == '"':
                     self.state.append("DOUBLE_QUOTATION")
-                    self.outbuf.append_nonspace(line[pos])
-                elif line[pos] == '\'':
+                    self.outbuf.append_nonspace(char)
+                elif char == '\'':
                     self.state.append("SINGLE_QUOTATION")
-                    self.outbuf.append_nonspace(line[pos])
+                    self.outbuf.append_nonspace(char)
                 else:
-                    self.outbuf.append_char(line[pos])
+                    self.outbuf.append_char(char)
             elif self.state[-1] == "DOUBLE_QUOTATION":
-                if line[pos] == '\\':
+                if char == '\\':
                     self.state.append("ESCAPING")
-                    self.outbuf.append_nonspace(line[pos])
-                elif line[pos] == '"':
+                    self.outbuf.append_nonspace(char)
+                elif char == '"':
                     self.state.pop()
-                    self.outbuf.append_nonspace(line[pos])
+                    self.outbuf.append_nonspace(char)
                 else:
-                    self.outbuf.append_nonspace(line[pos])
+                    self.outbuf.append_nonspace(char)
             elif self.state[-1] == "SINGLE_QUOTATION":
-                if line[pos] == '\\':
+                if char == '\\':
                     self.state.append("ESCAPING")
-                    self.outbuf.append_nonspace(line[pos])
-                elif line[pos] == '/':
+                    self.outbuf.append_nonspace(char)
+                elif char == '/':
                     self.state.append("FOUND_SLASH")
-                elif line[pos] == '\'':
+                elif char == '\'':
                     self.state.pop()
-                    self.outbuf.append_nonspace(line[pos])
+                    self.outbuf.append_nonspace(char)
                 else:
-                    self.outbuf.append_nonspace(line[pos])
+                    self.outbuf.append_nonspace(char)
             elif self.state[-1] == "FOUND_SLASH":
-                if line[pos] == '/':
+                if char == '/':
                     self.state.pop()
                     self.state.append("IN_INLINE_COMMENT")
-                elif line[pos] == '*':
+                elif char == '*':
                     self.state.pop()
                     self.state.append("IN_BLOCK_COMMENT")
                 else:
                     self.state.pop()
                     self.outbuf.append_char('/')
-                    pos -= 1
+                    inbuffer.putback(char)
             elif self.state[-1] == "IN_BLOCK_COMMENT":
-                if line[pos] == '*':
+                if char == '*':
                     self.state.append("IN_BLOCK_COMMENT_FOUND_STAR")
             elif self.state[-1] == "IN_BLOCK_COMMENT_FOUND_STAR":
-                if line[pos] == '/':
+                if char == '/':
                     self.state.pop()
                     assert self.state[-1] == "IN_BLOCK_COMMENT"
                     self.state.pop()
                     self.outbuf.append_space()
-                elif line[pos] != '*':
+                elif char != '*':
                     self.state.pop()
                     assert self.state[-1] == "IN_BLOCK_COMMENT"
             elif self.state[-1] == "ESCAPING":
-                self.outbuf.append_nonspace(line[pos])
+                self.outbuf.append_nonspace(char)
                 self.state.pop()
             elif self.state[-1] == "IN_INLINE_COMMENT":
                 return
             else:
                 assert None
-            pos += 1
 
 def c_file_source(fp):
 
@@ -187,7 +204,7 @@ def c_file_source(fp):
             end -= 1
         else:
             continued = False
-        cleaner.process(line, 0, end)
+        cleaner.process(it.islice(line, 0, end))
         if not continued:
             cleaner.logical_newline()
 

From 5db669f499560e5692b2efd028baa2b401c6f652 Mon Sep 17 00:00:00 2001
From: Jason Sewall <jason.sewall@intel.com>
Date: Mon, 11 Nov 2019 14:09:26 -0800
Subject: [PATCH 08/49] Add Fortran support

---
 codebasin/c_source.py | 158 +++++++++++++++++++++++++++++++++++++++++-
 sloc_translate.py     |  14 ++--
 2 files changed, 163 insertions(+), 9 deletions(-)

diff --git a/codebasin/c_source.py b/codebasin/c_source.py
index e461c72..9b864cb 100644
--- a/codebasin/c_source.py
+++ b/codebasin/c_source.py
@@ -178,7 +178,108 @@ def process(self, lineiter):
             else:
                 assert None
 
-def c_file_source(fp):
+class fortran_cleaner(object):
+    def __init__(self, outbuf):
+        self.state = ["TOPLEVEL"]
+        self.outbuf = outbuf
+        self.verify_continue = []
+    def dir_check(self, inbuffer):
+        self.found=['!']
+        for char in inbuffer:
+            if char == '$':
+                self.found.append('$')
+                for char in self.found:
+                    self.outbuf.append_nonspace(char)
+                break
+            elif char.isalpha():
+                self.found.append(char)
+            else:
+                return
+    def process(self, lineiter):
+        inbuffer = iter_keep1(lineiter)
+        try:
+            while True:
+                char = next(inbuffer)
+                if self.state[-1] == "TOPLEVEL":
+                    if char == '\\':
+                        self.state.append("ESCAPING")
+                        self.outbuf.append_nonspace(char)
+                    elif char == '!':
+                        self.dir_check(inbuffer)
+                        self.state = ["TOPLEVEL"]
+                        break
+                    elif char == '&':
+                        self.verify_continue.append(char)
+                        self.state.append("VERIFY_CONTINUE")
+                    elif char == '"':
+                        self.state.append("DOUBLE_QUOTATION")
+                        self.outbuf.append_nonspace(char)
+                    elif char == '\'':
+                        self.state.append("SINGLE_QUOTATION")
+                        self.outbuf.append_nonspace(char)
+                    else:
+                        self.outbuf.append_char(char)
+                elif self.state[-1] == 'CONTINUING_FROM_SOL':
+                    if is_whitespace(char):
+                        self.outbuf.append_space()
+                    elif char == '&':
+                        self.state.pop()
+                    elif char == '!':
+                        self.dir_check(inbuffer)
+                        break
+                    else:
+                        self.state.pop()
+                        inbuffer.putback(char)
+                        # should complain if we are quoting here, but will ignore for now
+                elif self.state[-1] == "DOUBLE_QUOTATION":
+                    if char == '\\':
+                        self.state.append("ESCAPING")
+                        self.outbuf.append_nonspace(char)
+                    elif char == '"':
+                        self.state.pop()
+                        self.outbuf.append_nonspace(char)
+                    elif char == '&':
+                        self.verify_continue.append(char)
+                        self.state.append("VERIFY_CONTINUE")
+                    else:
+                        self.outbuf.append_nonspace(char)
+                elif self.state[-1] == "SINGLE_QUOTATION":
+                    if char == '\\':
+                        self.state.append("ESCAPING")
+                        self.outbuf.append_nonspace(char)
+                    elif char == '\'':
+                        self.state.pop()
+                        self.outbuf.append_nonspace(char)
+                    elif char == '&':
+                        self.verify_continue.append(char)
+                        self.state.append("VERIFY_CONTINUE")
+                    else:
+                        self.outbuf.append_nonspace(char)
+                elif self.state[-1] == "ESCAPING":
+                    self.outbuf.append_nonspace(char)
+                    self.state.pop()
+                elif self.state[-1] == "VERIFY_CONTINUE":
+                    if char == '!' and self.state[-2] == "TOPLEVEL":
+                        self.dir_check(inbuffer)
+                        break
+                    elif not is_whitespace(char):
+                        for tmp in self.verify_continue:
+                            self.outbuf.append_nonspace(tmp)
+                        self.verify_continue = []
+                        self.state.pop()
+                        inbuffer.putback(char)
+                else:
+                    assert None
+        except StopIteration:
+            pass
+        if self.state[-1] == "CONTINUING_TO_EOL":
+            self.state[-1] = "CONTINUING_FROM_SOL"
+        elif self.state[-1] == "VERIFY_CONTINUE":
+            self.state[-1] = "CONTINUING_FROM_SOL"
+        #print(self.state)
+
+
+def c_file_source(fp, relaxed=True):
 
     current_physical_line = one_space_line()
     cleaner = c_cleaner(current_physical_line)
@@ -232,6 +333,59 @@ def c_file_source(fp):
         yield ((current_physical_start, physical_line_num+1), local_sloc, current_logical_line.flush(), line_cat)
 
     total_sloc += local_sloc
-    assert cleaner.state == ["TOPLEVEL"]
+    if not relaxed:
+        assert cleaner.state == ["TOPLEVEL"]
+
+    return (total_sloc, total_physical_lines)
+
+def fortran_file_source(fp, relaxed=True):
+
+    current_physical_line = one_space_line()
+    cleaner = fortran_cleaner(current_physical_line)
+
+    current_logical_line = one_space_line()
+
+    current_physical_start = None
+    total_sloc = 0
+    local_sloc = 0
+
+    physical_line_num = 0
+    c_walker = c_file_source(fp)
+    try:
+        while True:
+            ((src_physical_start, src_physical_end), src_line_sloc, src_line, _) = next(c_walker)
+            if current_physical_start == None:
+                current_physical_start = src_physical_start
+            current_physical_line.__init__()
+            import pdb
+#            pdb.set_trace()
+            cleaner.process(it.islice(src_line, 0, len(src_line)))
+
+            if not current_physical_line.category() == "BLANK":
+                local_sloc += src_line_sloc
+
+            current_logical_line.join(current_physical_line)
+
+            if cleaner.state[-1] != "CONTINUING_FROM_SOL":
+                line_cat = current_logical_line.category()
+                if line_cat != "BLANK":
+                    yield ((current_physical_start, src_physical_end), local_sloc, current_logical_line.flush(), line_cat)
+                else:
+                    current_logical_line.__init__()
+                    assert local_sloc == 0
+
+                current_physical_start = None
+                total_sloc += local_sloc
+                local_sloc = 0
+    except StopIteration as stopit:
+        _, total_physical_lines = stopit.value
+
+    line_cat = current_logical_line.category()
+    if line_cat != "BLANK":
+        yield ((current_physical_start, total_physical_lines), local_sloc, current_logical_line.flush(), line_cat)
+
+    total_sloc += local_sloc
+    if not relaxed:
+        assert cleaner.state == ["TOPLEVEL"]
 
     return (total_sloc, total_physical_lines)
diff --git a/sloc_translate.py b/sloc_translate.py
index b7b63a7..43a286f 100755
--- a/sloc_translate.py
+++ b/sloc_translate.py
@@ -2,13 +2,14 @@
 # Copyright (C) 2019-2020 Intel Corporation
 # SPDX-License-Identifier: BSD-3-Clause
 
-from codebasin.c_source import c_file_source
+from codebasin.c_source import c_file_source, fortran_file_source
+import os
 import sys
 import re
 
 def file_sloc(path, verbose=False):
     with open(path, mode='r', errors='replace') as source_file:
-        walker = c_file_source(source_file)
+        walker = fortran_file_source(source_file, relaxed=False)
         try:
             while True:
                 (interval, sloc, line, line_cat) = next(walker)
@@ -30,10 +31,9 @@ def walk_sloc(root, regexp, verbose=False):
                 except FileNotFoundError:
                     pass
 
-
 if __name__ == '__main__':
-    filename = sys.argv[1]
-    (filename, total_sloc, physical_loc)  = file_sloc(filename, verbose=True)
-    print(f"{filename}, {total_sloc}, {physical_loc}")
+    # filename = sys.argv[1]
+    # (filename, total_sloc, physical_loc)  = file_sloc(filename, verbose=True)
+    # print(f"{filename}, {total_sloc}, {physical_loc}")
 
-#    walk_sloc(sys.argv[1], re.compile(sys.argv[2]))
+    walk_sloc(sys.argv[1], re.compile(sys.argv[2]))

From 06f7b766f4bfb8ff6da0750db7dff39830edac39 Mon Sep 17 00:00:00 2001
From: Jason Sewall <jason.sewall@intel.com>
Date: Tue, 12 Nov 2019 07:14:35 -0800
Subject: [PATCH 09/49] Add front-ends for language parsers

Use guess_language/get_file_source as front-end for fortran/c parsers
---
 codebasin/c_source.py    | 28 ++++++++++++++++++++++++++++
 codebasin/file_parser.py | 10 ++++++----
 sloc_translate.py        | 20 +++++++++++++-------
 3 files changed, 47 insertions(+), 11 deletions(-)

diff --git a/codebasin/c_source.py b/codebasin/c_source.py
index 9b864cb..0b9b4e0 100644
--- a/codebasin/c_source.py
+++ b/codebasin/c_source.py
@@ -5,6 +5,7 @@
 """
 
 import itertools as it
+from os.path import splitext
 
 global whitespace_dict
 whitespace_dict = dict.fromkeys(' \t\n\r\x0b\x0c\x1c\x1d\x1e\x1f\x85\xa0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000')
@@ -389,3 +390,30 @@ def fortran_file_source(fp, relaxed=True):
         assert cleaner.state == ["TOPLEVEL"]
 
     return (total_sloc, total_physical_lines)
+
+
+global extension_map
+extension_map = {'.f90' : "FREEFORM FORTRAN",
+                 '.cxx' : "C FAMILY",
+                 '.cl' : "C FAMILY",
+                 '.cu' : "C FAMILY",
+                 '.cpp' : "C FAMILY",
+                 '.c' : "C FAMILY",
+                 '.h' : "C FAMILY",
+                 '.hpp' : "C FAMILY"}
+
+def guess_language(fname):
+    _, ext = splitext(fname)
+    try:
+        return extension_map[ext.lower()]
+    except KeyError:
+        return "Unknown"
+
+def get_file_source(path):
+    lang = guess_language(path)
+    if lang == "FREEFORM FORTRAN":
+        return fortran_file_source
+    elif lang == "C FAMILY":
+        return c_file_source
+    else:
+        return None
diff --git a/codebasin/file_parser.py b/codebasin/file_parser.py
index 61cd085..ea05bd1 100644
--- a/codebasin/file_parser.py
+++ b/codebasin/file_parser.py
@@ -9,7 +9,7 @@
 
 from . import preprocessor  # pylint : disable=no-name-in-module
 
-from codebasin.c_source import c_file_source
+from codebasin.c_source import get_file_source
 
 class LineGroup:
     """
@@ -68,7 +68,6 @@ def merge(self, line_group):
         self.end_line = max(self.end_line, line_group.end_line)
         line_group.reset()
 
-
 class FileParser:
     """
     Contains methods for parsing an entire source file and returning a
@@ -132,6 +131,9 @@ def parse_file(self):
         """
 
         out_tree = preprocessor.SourceTree(self._filename)
+        file_source = get_file_source(path)
+        if not file_source:
+            raise RuntimeError(f"{path} doesn't appear to be a language this tool can process")
         with open(self._filename, mode='r', errors='replace') as source_file:
             previous_continue = False
 
@@ -142,10 +144,10 @@ def parse_file(self):
 
             groups['file'].start_line = 1
 
-            c_source = c_file_source(source_file)
+            source = file_source(source_file)
             try:
                 while True:
-                    (phys_int, local_sloc, logical_line, line_cat) = next(c_source)
+                    (phys_int, local_sloc, logical_line, line_cat) = next(source)
                     # Only follow continuation for directives
                     if line_cat == 'CPP_DIRECTIVE':
                         # Add this into the directive lines, even if it
diff --git a/sloc_translate.py b/sloc_translate.py
index 43a286f..6f66c95 100755
--- a/sloc_translate.py
+++ b/sloc_translate.py
@@ -2,14 +2,17 @@
 # Copyright (C) 2019-2020 Intel Corporation
 # SPDX-License-Identifier: BSD-3-Clause
 
-from codebasin.c_source import c_file_source, fortran_file_source
+from codebasin.c_source import get_file_source
 import os
 import sys
 import re
 
 def file_sloc(path, verbose=False):
+    file_source = get_file_source(path)
+    if not file_source:
+        raise RuntimeError(f"{path} doesn't appear to be a language this tool can process")
     with open(path, mode='r', errors='replace') as source_file:
-        walker = fortran_file_source(source_file, relaxed=False)
+        walker = file_source(source_file, relaxed=False)
         try:
             while True:
                 (interval, sloc, line, line_cat) = next(walker)
@@ -32,8 +35,11 @@ def walk_sloc(root, regexp, verbose=False):
                     pass
 
 if __name__ == '__main__':
-    # filename = sys.argv[1]
-    # (filename, total_sloc, physical_loc)  = file_sloc(filename, verbose=True)
-    # print(f"{filename}, {total_sloc}, {physical_loc}")
-
-    walk_sloc(sys.argv[1], re.compile(sys.argv[2]))
+    if len(sys.argv) == 2:
+        filename = sys.argv[1]
+        (filename, total_sloc, physical_loc)  = file_sloc(filename, verbose=True)
+        print(f"{filename}, {total_sloc}, {physical_loc}")
+    elif len(sys.argv) == 3:
+        walk_sloc(sys.argv[1], re.compile(sys.argv[2]))
+    else:
+        print("Expected either 1 argument (a single file to parse and print) or 2 (a directory root & file pattern)")

From d153a6aeba05e30b001174011830d8437c0d9ed6 Mon Sep 17 00:00:00 2001
From: Jason Sewall <jason.sewall@intel.com>
Date: Tue, 12 Nov 2019 07:15:47 -0800
Subject: [PATCH 10/49] Have C parser only look for directives in fortran

---
 codebasin/c_source.py | 56 ++++++++++++++++++++++++-------------------
 1 file changed, 31 insertions(+), 25 deletions(-)

diff --git a/codebasin/c_source.py b/codebasin/c_source.py
index 0b9b4e0..acc597d 100644
--- a/codebasin/c_source.py
+++ b/codebasin/c_source.py
@@ -73,9 +73,10 @@ def putback(self, item):
         self.single = item
 
 class c_cleaner(object):
-    def __init__(self, outbuf):
+    def __init__(self, outbuf, directives_only=False):
         self.state = ["TOPLEVEL"]
         self.outbuf = outbuf
+        self.directives_only = directives_only
     def logical_newline(self):
         if self.state[-1] == "IN_INLINE_COMMENT":
             self.state = ["TOPLEVEL"]
@@ -98,22 +99,32 @@ def process(self, lineiter):
         inbuffer = iter_keep1(lineiter)
         for char in inbuffer:
             if self.state[-1] == "TOPLEVEL":
-                if char == '\\':
-                    self.state.append("ESCAPING")
-                    self.outbuf.append_nonspace(char)
-                elif char == '/':
-                    self.state.append("FOUND_SLASH")
-                elif char == '"':
-                    self.state.append("DOUBLE_QUOTATION")
-                    self.outbuf.append_nonspace(char)
-                elif char == '\'':
-                    self.state.append("SINGLE_QUOTATION")
-                    self.outbuf.append_nonspace(char)
-                elif char == '#' and self.outbuf.category() == "BLANK":
-                    self.state.append("CPP_DIRECTIVE")
-                    self.outbuf.append_nonspace(char)
+                if self.directives_only:
+                    if char == '\\':
+                        self.state.append("ESCAPING")
+                        self.outbuf.append_nonspace(char)
+                    elif char == '#' and self.outbuf.category() == "BLANK":
+                        self.state.append("CPP_DIRECTIVE")
+                        self.outbuf.append_nonspace(char)
+                    else:
+                        self.outbuf.append_char(char)
                 else:
-                    self.outbuf.append_char(char)
+                    if char == '\\':
+                        self.state.append("ESCAPING")
+                        self.outbuf.append_nonspace(char)
+                    elif char == '/':
+                        self.state.append("FOUND_SLASH")
+                    elif char == '"':
+                        self.state.append("DOUBLE_QUOTATION")
+                        self.outbuf.append_nonspace(char)
+                    elif char == '\'':
+                        self.state.append("SINGLE_QUOTATION")
+                        self.outbuf.append_nonspace(char)
+                    elif char == '#' and self.outbuf.category() == "BLANK":
+                        self.state.append("CPP_DIRECTIVE")
+                        self.outbuf.append_nonspace(char)
+                    else:
+                        self.outbuf.append_char(char)
             elif self.state[-1] == "CPP_DIRECTIVE":
                 if char == '\\':
                     self.state.append("ESCAPING")
@@ -277,13 +288,11 @@ def process(self, lineiter):
             self.state[-1] = "CONTINUING_FROM_SOL"
         elif self.state[-1] == "VERIFY_CONTINUE":
             self.state[-1] = "CONTINUING_FROM_SOL"
-        #print(self.state)
 
-
-def c_file_source(fp, relaxed=True):
+def c_file_source(fp, relaxed=False, directives_only=False):
 
     current_physical_line = one_space_line()
-    cleaner = c_cleaner(current_physical_line)
+    cleaner = c_cleaner(current_physical_line, directives_only)
 
     current_logical_line = one_space_line()
 
@@ -339,7 +348,7 @@ def c_file_source(fp, relaxed=True):
 
     return (total_sloc, total_physical_lines)
 
-def fortran_file_source(fp, relaxed=True):
+def fortran_file_source(fp, relaxed=False):
 
     current_physical_line = one_space_line()
     cleaner = fortran_cleaner(current_physical_line)
@@ -350,16 +359,13 @@ def fortran_file_source(fp, relaxed=True):
     total_sloc = 0
     local_sloc = 0
 
-    physical_line_num = 0
-    c_walker = c_file_source(fp)
+    c_walker = c_file_source(fp, directives_only=True)
     try:
         while True:
             ((src_physical_start, src_physical_end), src_line_sloc, src_line, _) = next(c_walker)
             if current_physical_start == None:
                 current_physical_start = src_physical_start
             current_physical_line.__init__()
-            import pdb
-#            pdb.set_trace()
             cleaner.process(it.islice(src_line, 0, len(src_line)))
 
             if not current_physical_line.category() == "BLANK":

From 768347b1f4e1667575e12576701eda8819709e37 Mon Sep 17 00:00:00 2001
From: Jason Sewall <jason.sewall@intel.com>
Date: Tue, 12 Nov 2019 07:40:09 -0800
Subject: [PATCH 11/49] Fix spelling bug

---
 codebasin/file_parser.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/codebasin/file_parser.py b/codebasin/file_parser.py
index ea05bd1..c078b77 100644
--- a/codebasin/file_parser.py
+++ b/codebasin/file_parser.py
@@ -131,7 +131,7 @@ def parse_file(self):
         """
 
         out_tree = preprocessor.SourceTree(self._filename)
-        file_source = get_file_source(path)
+        file_source = get_file_source(self._filename)
         if not file_source:
             raise RuntimeError(f"{path} doesn't appear to be a language this tool can process")
         with open(self._filename, mode='r', errors='replace') as source_file:

From 9bc99c4be8800e452460d59240324de995c367ab Mon Sep 17 00:00:00 2001
From: Jason Sewall <jason.sewall@intel.com>
Date: Tue, 12 Nov 2019 07:41:38 -0800
Subject: [PATCH 12/49] Remove unused code

---
 codebasin/file_parser.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/codebasin/file_parser.py b/codebasin/file_parser.py
index c078b77..9ef3986 100644
--- a/codebasin/file_parser.py
+++ b/codebasin/file_parser.py
@@ -78,12 +78,6 @@ class FileParser:
     def __init__(self, _filename):
         self._filename = _filename
 
-        split = splitext(_filename)
-        if len(split) == 2:
-            self._file_extension = split[1].lower()
-        else:
-            self._file_extension = None
-
     @staticmethod
     def handle_directive(out_tree, groups, logical_line):
         """

From a87c862374676c315daccda30d165614abd0e2da Mon Sep 17 00:00:00 2001
From: Jason Sewall <jason.sewall@intel.com>
Date: Tue, 12 Nov 2019 11:53:33 -0800
Subject: [PATCH 13/49] Pass C directives directly to output

This commit makes C preprocessor directives bypass the Fortran cleaner and pass straight to the output.
---
 codebasin/c_source.py | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/codebasin/c_source.py b/codebasin/c_source.py
index acc597d..f832981 100644
--- a/codebasin/c_source.py
+++ b/codebasin/c_source.py
@@ -362,9 +362,26 @@ def fortran_file_source(fp, relaxed=False):
     c_walker = c_file_source(fp, directives_only=True)
     try:
         while True:
-            ((src_physical_start, src_physical_end), src_line_sloc, src_line, _) = next(c_walker)
+            ((src_physical_start, src_physical_end), src_line_sloc, src_line, c_category) = next(c_walker)
+            #if it's a cpp directive, flush what we have, then emit the directive and start over
             if current_physical_start == None:
                 current_physical_start = src_physical_start
+
+            if c_category == "CPP_DIRECTIVE":
+                line_cat = current_logical_line.category()
+                if line_cat != "BLANK":
+                    yield ((current_physical_start, src_physical_end), local_sloc, current_logical_line.flush(), line_cat)
+                else:
+                    current_logical_line.__init__()
+                    assert local_sloc == 0
+
+                current_physical_start = None
+                total_sloc += local_sloc
+                local_sloc = 0
+                yield ((src_physical_start, src_physical_end), src_line_sloc, src_line, c_category)
+                total_sloc += src_line_sloc
+                continue
+
             current_physical_line.__init__()
             cleaner.process(it.islice(src_line, 0, len(src_line)))
 

From d410c7fabc1681b026b1170461c31bffad37bb6b Mon Sep 17 00:00:00 2001
From: Jason Sewall <jason.sewall@intel.com>
Date: Tue, 12 Nov 2019 12:34:05 -0800
Subject: [PATCH 14/49] Add test for Fortran

Also fix verify_continue behavior
---
 codebasin/c_source.py           |  5 +++++
 tests/comments/fortran.f90      | 34 +++++++++++++++++++++++++++++++++
 tests/comments/test_comments.py | 16 ++++++++++++++--
 3 files changed, 53 insertions(+), 2 deletions(-)
 create mode 100644 tests/comments/fortran.f90

diff --git a/codebasin/c_source.py b/codebasin/c_source.py
index f832981..b673a8d 100644
--- a/codebasin/c_source.py
+++ b/codebasin/c_source.py
@@ -202,6 +202,8 @@ def dir_check(self, inbuffer):
                 self.found.append('$')
                 for char in self.found:
                     self.outbuf.append_nonspace(char)
+                for char in inbuffer:
+                    self.outbuf.append_nonspace(char)
                 break
             elif char.isalpha():
                 self.found.append(char)
@@ -280,6 +282,8 @@ def process(self, lineiter):
                         self.verify_continue = []
                         self.state.pop()
                         inbuffer.putback(char)
+                    elif is_whitespace(char):
+                        self.verify_continue.append(char)
                 else:
                     assert None
         except StopIteration:
@@ -287,6 +291,7 @@ def process(self, lineiter):
         if self.state[-1] == "CONTINUING_TO_EOL":
             self.state[-1] = "CONTINUING_FROM_SOL"
         elif self.state[-1] == "VERIFY_CONTINUE":
+            self.verify_continue = []
             self.state[-1] = "CONTINUING_FROM_SOL"
 
 def c_file_source(fp, relaxed=False, directives_only=False):
diff --git a/tests/comments/fortran.f90 b/tests/comments/fortran.f90
new file mode 100644
index 0000000..0b1006a
--- /dev/null
+++ b/tests/comments/fortran.f90
@@ -0,0 +1,34 @@
+! Copyright (C) 2019-2020 Intel Corporation
+! SPDX-License-Identifier: BSD-3-Clause
+
+program foo
+
+#define my_fortran_macro() \
+  /*wow a comment*/ \
+  a = b - c /* another */ \
+  + b !FOO  // "neat" /* hey look a c comment*/
+
+  integer a,b,c
+  b = b  & ! Comments after continuations
+       ! no comment!
+           + b
+  !$ A directive
+
+  write(*,*) "Fortran! /*Has*/ !Unique parsing semantics"
+  !omp$ a different directive
+  write(*,*) "& Fortran! has complex ways of dealing with (&) //ampersands&"
+  !omp5% not a directives
+  write(*,*) "Fortran! \& d \n &
+                                !Can be "
+       &'quite' complex&
+       !Mixin
+&"//"&
+       !Mixin
+       &with quoted continuations"
+
+my_fortran_macro()
+
+#if !defined(GPU) /*something*/
+  write(*,*) "directives" // "appending"
+#endif
+end program foo
diff --git a/tests/comments/test_comments.py b/tests/comments/test_comments.py
index c9608cd..aa97472 100644
--- a/tests/comments/test_comments.py
+++ b/tests/comments/test_comments.py
@@ -6,9 +6,21 @@
 import os
 from codebasin import preprocessor, file_parser
 
-class TestExampleFile(unittest.TestCase):
+class TestExampleFortranFile(unittest.TestCase):
     """
-    Test handling of comments
+    Test handling of fixed form Fortran
+    """
+
+    def test_fortran_comments(self):
+        rootdir = "./tests/comments/"
+        parser = file_parser.FileParser(os.path.join(rootdir, "fortran.f90"))
+
+        tree = parser.parse_file()
+        self.assertEqual(tree.root.total_sloc, 20)
+
+class TestExampleCFile(unittest.TestCase):
+    """
+    Test handling of C comments
     """
 
     def test_c_comments(self):

From b64cabb560d3dc4c1eb9b2741cf958c579044915 Mon Sep 17 00:00:00 2001
From: Jason Sewall <jason.sewall@intel.com>
Date: Tue, 12 Nov 2019 13:06:49 -0800
Subject: [PATCH 15/49] Add cuh, cc to C-like extensions

---
 codebasin/c_source.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/codebasin/c_source.py b/codebasin/c_source.py
index b673a8d..14457af 100644
--- a/codebasin/c_source.py
+++ b/codebasin/c_source.py
@@ -425,6 +425,8 @@ def fortran_file_source(fp, relaxed=False):
                  '.cxx' : "C FAMILY",
                  '.cl' : "C FAMILY",
                  '.cu' : "C FAMILY",
+                 '.cuh' : "C FAMILY",
+                 '.cc' : "C FAMILY",
                  '.cpp' : "C FAMILY",
                  '.c' : "C FAMILY",
                  '.h' : "C FAMILY",

From a65aad4e61b9d78383a2db029bce52cb6640b7c3 Mon Sep 17 00:00:00 2001
From: Jason Sewall <jason.sewall@intel.com>
Date: Tue, 12 Nov 2019 13:07:04 -0800
Subject: [PATCH 16/49] Fix spelling issue in file_parser

---
 codebasin/file_parser.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/codebasin/file_parser.py b/codebasin/file_parser.py
index 9ef3986..2ec5037 100644
--- a/codebasin/file_parser.py
+++ b/codebasin/file_parser.py
@@ -127,7 +127,7 @@ def parse_file(self):
         out_tree = preprocessor.SourceTree(self._filename)
         file_source = get_file_source(self._filename)
         if not file_source:
-            raise RuntimeError(f"{path} doesn't appear to be a language this tool can process")
+            raise RuntimeError(f"{self._filename} doesn't appear to be a language this tool can process")
         with open(self._filename, mode='r', errors='replace') as source_file:
             previous_continue = False
 

From c6675cd68d340fc133e78b4cf56aa737da21098b Mon Sep 17 00:00:00 2001
From: Jason Sewall <jason.sewall@intel.com>
Date: Tue, 12 Nov 2019 13:09:43 -0800
Subject: [PATCH 17/49] Rename c_source.py to file_source.py

---
 codebasin/file_parser.py                  | 2 +-
 codebasin/{c_source.py => file_source.py} | 2 +-
 sloc_translate.py                         | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)
 rename codebasin/{c_source.py => file_source.py} (99%)

diff --git a/codebasin/file_parser.py b/codebasin/file_parser.py
index 2ec5037..bc6d7f2 100644
--- a/codebasin/file_parser.py
+++ b/codebasin/file_parser.py
@@ -9,7 +9,7 @@
 
 from . import preprocessor  # pylint : disable=no-name-in-module
 
-from codebasin.c_source import get_file_source
+from codebasin.file_source import get_file_source
 
 class LineGroup:
     """
diff --git a/codebasin/c_source.py b/codebasin/file_source.py
similarity index 99%
rename from codebasin/c_source.py
rename to codebasin/file_source.py
index 14457af..4727061 100644
--- a/codebasin/c_source.py
+++ b/codebasin/file_source.py
@@ -1,7 +1,7 @@
 # Copyright (C) 2019 Intel Corporation
 # SPDX-License-Identifier: BSD-3-Clause
 """
-Contains classes and functions for stripping comments and whitespace from C/C++ files
+Contains classes and functions for stripping comments and whitespace from C/C++ files as well as fixed-form Fortran
 """
 
 import itertools as it
diff --git a/sloc_translate.py b/sloc_translate.py
index 6f66c95..b4a5ee0 100755
--- a/sloc_translate.py
+++ b/sloc_translate.py
@@ -2,7 +2,7 @@
 # Copyright (C) 2019-2020 Intel Corporation
 # SPDX-License-Identifier: BSD-3-Clause
 
-from codebasin.c_source import get_file_source
+from codebasin.file_source import get_file_source
 import os
 import sys
 import re

From 70dec183c62d59c32b02d54eb116e9ad31591f6b Mon Sep 17 00:00:00 2001
From: Jason Sewall <jason.sewall@intel.com>
Date: Tue, 12 Nov 2019 13:17:40 -0800
Subject: [PATCH 18/49] Move sloc_translate to etc/

---
 sloc_translate.py => etc/sloc_translate.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename sloc_translate.py => etc/sloc_translate.py (100%)

diff --git a/sloc_translate.py b/etc/sloc_translate.py
similarity index 100%
rename from sloc_translate.py
rename to etc/sloc_translate.py

From ea03e62e65b8d9a6cebcfa3d282bbdf248300ec3 Mon Sep 17 00:00:00 2001
From: Jason Sewall <jason.sewall@intel.com>
Date: Tue, 12 Nov 2019 13:29:22 -0800
Subject: [PATCH 19/49] Fix imports in etc/sloc_translate

---
 etc/sloc_translate.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/etc/sloc_translate.py b/etc/sloc_translate.py
index b4a5ee0..c26b2c4 100755
--- a/etc/sloc_translate.py
+++ b/etc/sloc_translate.py
@@ -2,9 +2,13 @@
 # Copyright (C) 2019-2020 Intel Corporation
 # SPDX-License-Identifier: BSD-3-Clause
 
-from codebasin.file_source import get_file_source
 import os
 import sys
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
+
+from codebasin.file_source import get_file_source
+
 import re
 
 def file_sloc(path, verbose=False):

From b50bdc6f96050211d96676dc164e258d78961a7c Mon Sep 17 00:00:00 2001
From: Jason Sewall <jason.sewall@intel.com>
Date: Tue, 12 Nov 2019 14:32:49 -0800
Subject: [PATCH 20/49] Guess CDS-DPCPP-HPCBench platform breakdown

---
 etc/guess_info.py | 161 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 161 insertions(+)
 create mode 100755 etc/guess_info.py

diff --git a/etc/guess_info.py b/etc/guess_info.py
new file mode 100755
index 0000000..4082f4a
--- /dev/null
+++ b/etc/guess_info.py
@@ -0,0 +1,161 @@
+#!/usr/bin/env python3.6
+
+import os
+import sys
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
+
+from sloc_translate import file_sloc
+from codebasin.file_source import get_file_source
+from codebasin.report import divergence
+
+import csv
+from pathlib import Path
+from collections import defaultdict
+import re
+import itertools as it
+import yaml
+
+def guess_app(inpath):
+    path = Path(inpath)
+    if path.parts[0] == 'dlpbenchcuda':
+        app = 'dlpbench-'  + path.parts[1]
+    elif path.parts[0] == 'dlpbenchopencl':
+        app = 'dlpbench-'  + path.parts[1]
+    elif path.parts[0] == 'dlpbench':
+        app = 'dlpbench-'  + path.parts[2]
+    elif path.parts[0] == 'cmedia-bench':
+        app = None
+    elif path.parts[0] == 'DNNBench':
+        app = f"DNNBench-{path.parts[1]}"
+    else:
+        app = path.parts[0]
+    return app
+
+def matches(path, regexp):
+    return regexp.search(path) != None
+
+class plat_guesser(object):
+    def __init__(self, name, pathwl, extwl):
+        self.name = name
+        self.pathwl = pathwl
+        self.pathbl = []
+        self.extwl = extwl
+        self.extbl = []
+    def finalize(self):
+        if len(self.pathwl) > 0:
+            all_exts = "|".join((f"[^a-z]+{x}|{x}[^a-z]+" for x in (z.replace("+", r"\+") for z in self.pathwl)))
+            self.pathwl_re = re.compile(f"{all_exts}")
+        else:
+            self.pathwl_re = re.compile(r"^\b$")
+        if len(self.pathbl) > 0:
+            all_exts = "|".join((f"[^a-z]+{x}|{x}[^a-z]+" for x in (z.replace("+", r"\+") for z in self.pathbl)))
+            self.pathbl_re = re.compile(f"{all_exts}")
+        else:
+            self.pathwl_re = re.compile(r"^\b$")
+        if len(self.extwl) > 0:
+            all_exts = "|".join(self.extwl)
+            self.extwl_re = re.compile(f"(.{all_exts})$")
+        else:
+            self.extwl_re = re.compile(r"^\b$")
+        if len(self.extbl) > 0:
+            all_exts = "|".join(self.extbl)
+            self.extbl_re = re.compile(f"(.{all_exts})$")
+        else:
+            self.extbl_re = re.compile(r"^\b$")
+    def score(self, path):
+        neg, pos = False, False
+        pos |= matches(path, self.pathwl_re)
+        neg |= matches(path, self.pathbl_re)
+        pos |= matches(path, self.extwl_re)
+        neg |= matches(path, self.extbl_re)
+        return self.name, (neg, pos)
+
+
+guessers = [plat_guesser("cuda",
+                         ["cuda"],
+                         ["cu"]),
+            plat_guesser("opencl",
+                         ["opencl", "ocl"],
+                         ["cl"]),
+            plat_guesser("dpc++",
+                         ["dpc++", "dpcpp", "sycl"],
+                         []),
+            plat_guesser("openmp",
+                         ["omp", "openmp"],
+                         [])]
+
+all_pathwl = set()
+all_extwl = set()
+for g in guessers:
+    all_pathwl.update(set(g.pathwl))
+    all_extwl.update(set(g.extwl))
+
+for g in guessers:
+    g.pathbl = list(all_pathwl.difference(set(g.pathwl)))
+    g.extbl = list(all_extwl.difference(set(g.extwl)))
+    g.finalize()
+
+def guess_platform(inpath):
+    path = Path(inpath)
+    return path.parts[1]
+
+def categorize_file(inpath):
+    res = {}
+    path = inpath.lower()
+    for g in guessers:
+        name, cat = g.score(path)
+        res[name] = cat
+    return res
+
+def walk_apptree(inroot, regexp):
+    apps = defaultdict(list)
+    for root, dirs, files in os.walk(inroot):
+        for f in files:
+            full_path = os.path.join(root, f)
+            if regexp.match(full_path):
+                app = guess_app(full_path)
+                if app:
+                    apps[app].append(os.path.relpath(full_path, inroot))
+    return apps
+
+def app_groups(files, all_lang=frozenset(['cuda', 'opencl', 'dpc++', 'openmp'])):
+    platmap = defaultdict(list)
+    for f in files:
+        cats = categorize_file(f)
+        is_in = set()
+        isnt_in = set()
+        for k, which in cats.items():
+            if which[1]:
+                is_in.update([k])
+            if which[0]:
+                isnt_in.update([k])
+        if len(is_in) == 0:
+            partial_common = all_lang.difference(isnt_in)
+            if len(partial_common) > 0:
+                for p in partial_common:
+                    platmap[p].append(f)
+        else:
+            update=is_in.intersection(all_lang)
+            if len(update) > 0:
+                for p in update:
+                    platmap[p].append(f)
+    return platmap
+
+def write_yaml(output, files):
+    platmap = app_groups(files)
+    base = {'codebase' : { 'files' : files, 'platforms' : list(platmap.keys()) }}
+    for plat_name, plat_files in platmap.items():
+        base[plat_name] = plat_files
+    with open(output, "w") as ofp:
+        yaml.dump(base, ofp)
+
+os.chdir("/nfs/home/jsewall/CDS-DPCPP-HPCBench/")
+apps = walk_apptree(".", re.compile('(.*\.)(cpp|c|hpp|h|cl|cu|cxx|cc|cuh)$'))
+
+for app_name, app_files in apps.items():
+
+    write_yaml(f"{app_name}.yaml", app_files)
+    print(f"{app_name}.yaml")
+
+print("done")

From fe728a3672b532b50fb41ad0018126fa77aed9a4 Mon Sep 17 00:00:00 2001
From: Jason Sewall <jason.sewall@intel.com>
Date: Wed, 13 Nov 2019 13:05:56 -0800
Subject: [PATCH 21/49] Update guess_info to skip certain apps

---
 etc/guess_info.py | 57 ++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 44 insertions(+), 13 deletions(-)

diff --git a/etc/guess_info.py b/etc/guess_info.py
index 4082f4a..03ced5b 100755
--- a/etc/guess_info.py
+++ b/etc/guess_info.py
@@ -18,15 +18,15 @@
 
 def guess_app(inpath):
     path = Path(inpath)
-    if path.parts[0] == 'dlpbenchcuda':
+    if path.parts[0] == 'dlpbenchcuda' and path.parts[1] != 'utils':
         app = 'dlpbench-'  + path.parts[1]
     elif path.parts[0] == 'dlpbenchopencl':
         app = 'dlpbench-'  + path.parts[1]
-    elif path.parts[0] == 'dlpbench':
+    elif path.parts[0] == 'dlpbench' and path.parts[1] != 'common' and path.parts[1] !='deprecated_workloads' and path.parts[1] != 'csa':
         app = 'dlpbench-'  + path.parts[2]
-    elif path.parts[0] == 'cmedia-bench':
+    elif path.parts[0] in ['cmedia-bench', "config", "infrastructure", "Test-Infrastructure"]:
         app = None
-    elif path.parts[0] == 'DNNBench':
+    elif path.parts[0] == 'DNNBench' and not path.parts[1] == 'common':
         app = f"DNNBench-{path.parts[1]}"
     else:
         app = path.parts[0]
@@ -110,6 +110,7 @@ def categorize_file(inpath):
 
 def walk_apptree(inroot, regexp):
     apps = defaultdict(list)
+    paths = {}
     for root, dirs, files in os.walk(inroot):
         for f in files:
             full_path = os.path.join(root, f)
@@ -134,28 +135,58 @@ def app_groups(files, all_lang=frozenset(['cuda', 'opencl', 'dpc++', 'openmp']))
             partial_common = all_lang.difference(isnt_in)
             if len(partial_common) > 0:
                 for p in partial_common:
-                    platmap[p].append(f)
+                    platmap[p].append(Path(f))
         else:
             update=is_in.intersection(all_lang)
             if len(update) > 0:
                 for p in update:
-                    platmap[p].append(f)
+                    platmap[p].append(Path(f))
     return platmap
 
-def write_yaml(output, files):
-    platmap = app_groups(files)
-    base = {'codebase' : { 'files' : files, 'platforms' : list(platmap.keys()) }}
-    for plat_name, plat_files in platmap.items():
-        base[plat_name] = plat_files
+def write_yaml(output, files, langs_names_map, strip_prefix=Path(".")):
+
+    platmap = app_groups(files, frozenset(langs_names_map.values()))
+    all_files = set()
+    for plat, pfiles in platmap.items():
+        all_files.update([str(f.relative_to(strip_prefix)) for f in pfiles])
+    if len(all_files) == 0:
+        return False
+    base = {'codebase' : { 'files' : list(all_files) }}
+    plats = set()
+    for export_name, plat_name in langs_names_map.items():
+        plat_files = [str(f.relative_to(strip_prefix)) for f in platmap[plat_name]]
+        if len(plat_files) > 0:
+            base[export_name] = {'files': plat_files}
+            plats.update([export_name])
+        elif len(langs_names_map) < 4: #Hack
+            return False
+    base['codebase']['platforms'] = list(plats)
     with open(output, "w") as ofp:
         yaml.dump(base, ofp)
+    return True
 
 os.chdir("/nfs/home/jsewall/CDS-DPCPP-HPCBench/")
 apps = walk_apptree(".", re.compile('(.*\.)(cpp|c|hpp|h|cl|cu|cxx|cc|cuh)$'))
 
+#os.chdir("/nfs/home/jsewall/CDS-DPCPP-HPCBench/configs")
 for app_name, app_files in apps.items():
 
-    write_yaml(f"{app_name}.yaml", app_files)
-    print(f"{app_name}.yaml")
+    prefixed= [f"./{p}" for p in app_files]
+    app_path = Path(os.path.commonpath(prefixed))
+    if app_path.is_file():
+        app_path = app_path.parent
+
+    outpath = app_path / "cbi-configs"
+    try:
+        os.makedirs(outpath)
+    except FileExistsError:
+        pass
+    for suffix, config in [("all", dict(zip(*it.repeat(['cuda', 'opencl', 'dpc++', 'openmp'],2)))),
+                           ("dpcpp", {'dpc++-gpu' : 'dpc++', 'dpc++-cpu' : 'dpc++'}),
+                           ("ducttape", {'gpu' : 'cuda', 'cpu' : 'openmp'})]:
+        outfile = outpath / f"{app_name}-{suffix}.yaml"
+        write = write_yaml(outfile, app_files, config, strip_prefix=app_path)
+        if write:
+            print(outfile)
 
 print("done")

From 7cf8bb5ec1c7495759beaa9e725947774dfa27ed Mon Sep 17 00:00:00 2001
From: Jason Sewall <jason.sewall@intel.com>
Date: Wed, 13 Nov 2019 13:12:26 -0800
Subject: [PATCH 22/49] Print out root and config file from codebasin.py

---
 codebasin.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/codebasin.py b/codebasin.py
index 17e6bd9..97f64c9 100755
--- a/codebasin.py
+++ b/codebasin.py
@@ -98,6 +98,8 @@ def guess_project_name(config_path):
 
     output_prefix = os.path.realpath(guess_project_name(args.config_file))
 
+    print(f"Config file: {args.config_file}")
+    print(f"Root: {rootdir}")
     # Print summary report
     if report_enabled("summary"):
         summary = report.summary(setmap)

From 7a380e9b914b4c9384565d89360a560f56f5249a Mon Sep 17 00:00:00 2001
From: Douglas Jacobsen <douglas.w.jacobsen@intel.com>
Date: Fri, 8 Nov 2019 12:29:12 -0800
Subject: [PATCH 23/49] Add a language class

This commit adds a language class that can help determine what language
a file uses.
---
 codebasin/file_parser.py |  1 +
 codebasin/language.py    | 42 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 43 insertions(+)
 create mode 100644 codebasin/language.py

diff --git a/codebasin/file_parser.py b/codebasin/file_parser.py
index bc6d7f2..e3ab479 100644
--- a/codebasin/file_parser.py
+++ b/codebasin/file_parser.py
@@ -8,6 +8,7 @@
 from os.path import splitext
 
 from . import preprocessor  # pylint : disable=no-name-in-module
+from . import language
 
 from codebasin.file_source import get_file_source
 
diff --git a/codebasin/language.py b/codebasin/language.py
new file mode 100644
index 0000000..bd8abb1
--- /dev/null
+++ b/codebasin/language.py
@@ -0,0 +1,42 @@
+# Copyright (C) 2019 Intel Corporation
+# SPDX-License-Identifier: BSD-3-Clause
+"""
+Contains classes and functions related to language detection
+and providing information about the language to other parts of
+code base investigator
+"""
+
+import os
+import logging
+
+log = logging.getLogger(__name__)
+
+
+class FileLanguage:
+    """
+    Represents the language and modifiers for a given filename
+    """
+
+    _supported_languages = ['fortran-free', 'fortran-fixed', 'c', 'c++']
+
+    _language_extensions = {}
+    _language_extensions['fortran-free'] = ['.f90', '.F90']
+    _language_extensions['fortran-fixed'] = ['.f', '.ftn', '.fpp', '.F', '.FOR', '.FTN', '.FPP']
+    _language_extensions['c'] = ['.c', '.h']
+    _language_extensions['c++'] = ['.c++', '.cxx', '.cpp', '.cc',
+                                   '.hpp', '.hxx', '.h++', '.hh',
+                                   '.inc', '.inl', '.tcc', '.icc',
+                                   '.ipp']
+
+    def __init__(self, filename):
+        self._filename = filename
+        self._extension = os.path.splitext(self._filename)[1]
+        self._language = 'None'
+
+        for lang in self._supported_languages:
+            if self._extension in self._language_extensions[lang]:
+                self._language = lang
+                break
+
+    def get_language(self):
+        return self._language

From 026aaf88b80934059c75395eb4a4a89c83ea4208 Mon Sep 17 00:00:00 2001
From: Douglas Jacobsen <douglas.w.jacobsen@intel.com>
Date: Wed, 13 Nov 2019 12:52:37 -0800
Subject: [PATCH 24/49] Update file types

---
 codebasin/language.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/codebasin/language.py b/codebasin/language.py
index bd8abb1..7e6dddf 100644
--- a/codebasin/language.py
+++ b/codebasin/language.py
@@ -26,7 +26,7 @@ class FileLanguage:
     _language_extensions['c++'] = ['.c++', '.cxx', '.cpp', '.cc',
                                    '.hpp', '.hxx', '.h++', '.hh',
                                    '.inc', '.inl', '.tcc', '.icc',
-                                   '.ipp']
+                                   '.ipp', '.cu', '.cuh', '.cl']
 
     def __init__(self, filename):
         self._filename = filename

From ce6338d8b8cd7fa63b0bc1afd5eb9116809f8d00 Mon Sep 17 00:00:00 2001
From: Jason Sewall <jason.sewall@intel.com>
Date: Wed, 13 Nov 2019 13:32:50 -0800
Subject: [PATCH 25/49] Use new language identifier

---
 codebasin/file_source.py | 29 +++++------------------------
 1 file changed, 5 insertions(+), 24 deletions(-)

diff --git a/codebasin/file_source.py b/codebasin/file_source.py
index 4727061..8c1b7af 100644
--- a/codebasin/file_source.py
+++ b/codebasin/file_source.py
@@ -6,6 +6,7 @@
 
 import itertools as it
 from os.path import splitext
+from .language import FileLanguage
 
 global whitespace_dict
 whitespace_dict = dict.fromkeys(' \t\n\r\x0b\x0c\x1c\x1d\x1e\x1f\x85\xa0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000')
@@ -419,31 +420,11 @@ def fortran_file_source(fp, relaxed=False):
 
     return (total_sloc, total_physical_lines)
 
-
-global extension_map
-extension_map = {'.f90' : "FREEFORM FORTRAN",
-                 '.cxx' : "C FAMILY",
-                 '.cl' : "C FAMILY",
-                 '.cu' : "C FAMILY",
-                 '.cuh' : "C FAMILY",
-                 '.cc' : "C FAMILY",
-                 '.cpp' : "C FAMILY",
-                 '.c' : "C FAMILY",
-                 '.h' : "C FAMILY",
-                 '.hpp' : "C FAMILY"}
-
-def guess_language(fname):
-    _, ext = splitext(fname)
-    try:
-        return extension_map[ext.lower()]
-    except KeyError:
-        return "Unknown"
-
 def get_file_source(path):
-    lang = guess_language(path)
-    if lang == "FREEFORM FORTRAN":
+    lang = FileLanguage(path)
+    if lang.get_language() == "fortran-free":
         return fortran_file_source
-    elif lang == "C FAMILY":
+    elif lang.get_language() in ["c", "c++"]:
         return c_file_source
     else:
-        return None
+        raise RuntimeError(f"Language {lang.get_language()} in file {path} is unsupported by code base investigator")

From 029dd0e391e3cd657d760560e88730645e2a6db2 Mon Sep 17 00:00:00 2001
From: Jason Sewall <jason.sewall@intel.com>
Date: Wed, 20 Nov 2019 12:39:54 -0800
Subject: [PATCH 26/49] Remove object parent types

---
 codebasin/file_source.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/codebasin/file_source.py b/codebasin/file_source.py
index 8c1b7af..7d4f9a0 100644
--- a/codebasin/file_source.py
+++ b/codebasin/file_source.py
@@ -14,7 +14,7 @@
 def is_whitespace(c):
     return c in whitespace_dict
 
-class one_space_line(object):
+class one_space_line:
     def __init__(self):
         self.parts = []
         self.trailing_space = False
@@ -57,7 +57,7 @@ def flush(self):
         self.__init__()
         return res
 
-class iter_keep1(object):
+class iter_keep1:
     def __init__(self, iterator):
         self.iterator = iter(iterator)
         self.single = None
@@ -73,7 +73,7 @@ def putback(self, item):
         assert self.single is None
         self.single = item
 
-class c_cleaner(object):
+class c_cleaner:
     def __init__(self, outbuf, directives_only=False):
         self.state = ["TOPLEVEL"]
         self.outbuf = outbuf
@@ -191,7 +191,7 @@ def process(self, lineiter):
             else:
                 assert None
 
-class fortran_cleaner(object):
+class fortran_cleaner:
     def __init__(self, outbuf):
         self.state = ["TOPLEVEL"]
         self.outbuf = outbuf

From 4c6d045a324b8cca3f34bc51defe6b9a8ac73c1b Mon Sep 17 00:00:00 2001
From: Jason Sewall <jason.sewall@intel.com>
Date: Thu, 21 Nov 2019 13:36:53 -0800
Subject: [PATCH 27/49] Add comments and cleanup according to pylint

---
 codebasin/file_source.py | 123 +++++++++++++++++++++++++++++++++------
 1 file changed, 106 insertions(+), 17 deletions(-)

diff --git a/codebasin/file_source.py b/codebasin/file_source.py
index 7d4f9a0..59d0555 100644
--- a/codebasin/file_source.py
+++ b/codebasin/file_source.py
@@ -1,24 +1,40 @@
 # Copyright (C) 2019 Intel Corporation
 # SPDX-License-Identifier: BSD-3-Clause
 """
-Contains classes and functions for stripping comments and whitespace from C/C++ files as well as fixed-form Fortran
+Contains classes and functions for stripping comments and whitespace from
+C/C++ files as well as fixed-form Fortran
 """
 
 import itertools as it
-from os.path import splitext
 from .language import FileLanguage
 
-global whitespace_dict
-whitespace_dict = dict.fromkeys(' \t\n\r\x0b\x0c\x1c\x1d\x1e\x1f\x85\xa0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000')
+### This string was created by looking at all unicode code points
+### and checking to see if they are considered whitespace
+### ('\s') by the re module
+whitespace_dict = dict.fromkeys(''.join([' \t\n\r\x0b\x0c\x1c\x1d\x1e',
+                                         '\x1f\x85\xa0\u1680\u2000\u2001',
+                                         '\u2002\u2003\u2004\u2005\u2006',
+                                         '\u2007\u2008\u2009\u200a\u2028',
+                                         '\u2029\u202f\u205f\u3000']))
 
 def is_whitespace(c):
+    """Returns true if the character c is whitespace"""
+    global whitespace_dict
     return c in whitespace_dict
 
 class one_space_line:
+    """
+    A container that represents a single line of code while (generally)
+    merging all whitespace into a single space.
+    """
     def __init__(self):
         self.parts = []
         self.trailing_space = False
     def append_char(self, c):
+        """
+        Append a character of no particular class to the line.
+        Whitespace will be dropped if the line already ends in space.
+        """
         if not is_whitespace(c):
             self.parts.append(c)
             self.trailing_space = False
@@ -27,6 +43,9 @@ def append_char(self, c):
                 self.parts.append(' ')
                 self.trailing_space = True
     def append_space(self):
+        """
+        Append whitespace to line, unless line already ends in a space.
+        """
         if not self.trailing_space:
             self.parts.append(' ')
             self.trailing_space = True
@@ -34,30 +53,46 @@ def append_nonspace(self, c):
         self.parts.append(c)
         self.trailing_space = False
     def join(self, other):
-        if len(other.parts) > 0:
+        """
+        Append another one_space_line to this one, respecting whitespace rules.
+        """
+        if other.parts:
             if other.parts[0] == ' ' and self.trailing_space:
                 self.parts += other.parts[1:]
             else:
                 self.parts += other.parts[:]
             self.trailing_space = other.trailing_space
     def category(self):
+        """
+        Report the a category for this line:
+        * SRC_NONBLANK if it is non-empty/non-whitespace line of code.
+        * BLANK if it is empty or only whitespace.
+        * CPP_DIRECTIVE it is is a C preprocessor directive.
+        """
         res = "SRC_NONBLANK"
-        if len(self.parts) == 0:
+        if not self.parts:
             res = "BLANK"
         elif len(self.parts) == 1:
             if self.parts[0] == ' ':
                 res = "BLANK"
             elif self.parts[0] == '#':
                 res = "CPP_DIRECTIVE"
-        elif ( self.parts[0] == ' ' and self.parts[1] == '#' ) or self.parts[0] == '#':
+        elif self.parts[:2] == ' #' or self.parts[0] == '#':
             res = "CPP_DIRECTIVE"
         return res
     def flush(self):
-        res= ''.join(self.parts)
+        """
+        Convert the characters to a string and reset the buffer.
+        """
+        res = ''.join(self.parts)
         self.__init__()
         return res
 
 class iter_keep1:
+    """
+    An iterator wrapper that allows a single item to be 'put back'
+    and picked up for the next iteration.
+    """
     def __init__(self, iterator):
         self.iterator = iter(iterator)
         self.single = None
@@ -70,15 +105,32 @@ def __next__(self):
         else:
             return next(self.iterator)
     def putback(self, item):
+        """
+        Put item into the iterator such that it will be the next
+        yielded item.
+        """
         assert self.single is None
         self.single = item
 
 class c_cleaner:
+    """
+    Approximation of the early stages of a C preprocessor.
+    Joins line continuations, merges whitespace, and replaces comments
+    with whitespace. State is kept across physical lines and cleared with
+    logical_newline.
+    """
     def __init__(self, outbuf, directives_only=False):
+        """
+        directives_only has the cleaner only operate on directive lines.
+        """
         self.state = ["TOPLEVEL"]
         self.outbuf = outbuf
         self.directives_only = directives_only
     def logical_newline(self):
+        """
+        Reset state when a logical newline is found.
+        That is, when a newline without continuation.
+        """
         if self.state[-1] == "IN_INLINE_COMMENT":
             self.state = ["TOPLEVEL"]
             self.outbuf.append_space()
@@ -90,13 +142,16 @@ def logical_newline(self):
             self.state = ["TOPLEVEL"]
         elif self.state[-1] == "DOUBLE_QUOTATION":
             # This probably should give a warning
-            self.state == ["TOPLEVEL"]
+            self.state = ["TOPLEVEL"]
         elif self.state[-1] == "IN_BLOCK_COMMENT_FOUND_STAR":
             self.state.pop()
             assert self.state[-1] == "IN_BLOCK_COMMENT"
         elif self.state[-1] == "CPP_DIRECTIVE":
             self.state = ["TOPLEVEL"]
     def process(self, lineiter):
+        """
+        Add contents of lineiter to outbuf, stripping as directed.
+        """
         inbuffer = iter_keep1(lineiter)
         for char in inbuffer:
             if self.state[-1] == "TOPLEVEL":
@@ -192,25 +247,38 @@ def process(self, lineiter):
                 assert None
 
 class fortran_cleaner:
+    """
+    'Cleans' source to remove comments and blanks while preserving
+    directives and handling strings and continuations properly.
+    Expects to have c defines already processed.
+    """
     def __init__(self, outbuf):
         self.state = ["TOPLEVEL"]
         self.outbuf = outbuf
         self.verify_continue = []
     def dir_check(self, inbuffer):
-        self.found=['!']
+        """
+        Inspect comment to see if it is in fact, a valid directive,
+        which should be preserved.
+        """
+        found = ['!']
         for char in inbuffer:
             if char == '$':
-                self.found.append('$')
-                for char in self.found:
-                    self.outbuf.append_nonspace(char)
-                for char in inbuffer:
-                    self.outbuf.append_nonspace(char)
+                found.append('$')
+                for c in found:
+                    self.outbuf.append_nonspace(c)
+                for c in inbuffer:
+                    self.outbuf.append_nonspace(c)
                 break
             elif char.isalpha():
-                self.found.append(char)
+                found.append(char)
             else:
                 return
     def process(self, lineiter):
+        """
+        Add contents of lineiter to current line, removing contents and
+        handling continuations.
+        """
         inbuffer = iter_keep1(lineiter)
         try:
             while True:
@@ -296,6 +364,15 @@ def process(self, lineiter):
             self.state[-1] = "CONTINUING_FROM_SOL"
 
 def c_file_source(fp, relaxed=False, directives_only=False):
+    """
+    Process file fp in terms of logical (sloc) and physical lines of C code.
+    Yield blocks of logical lines of code with physical extents.
+    Return total lines at exit.
+    Relaxed allows for inconsistent state at the end of parsing, useful for
+    special composition cases.
+    directives_only sets up parser to only process directive lines such that
+    the output can be fed to another file source (i.e. Fortran).
+    """
 
     current_physical_line = one_space_line()
     cleaner = c_cleaner(current_physical_line, directives_only)
@@ -355,6 +432,14 @@ def c_file_source(fp, relaxed=False, directives_only=False):
     return (total_sloc, total_physical_lines)
 
 def fortran_file_source(fp, relaxed=False):
+    """
+    Process file fp in terms of logical (sloc) and physical lines of
+    fixed-form  Fortran code.
+    Yield blocks of logical lines of code with physical extents.
+    Return total lines at exit.
+    Relaxed allows for inconsistent state at the end of parsing, useful for
+    special composition cases.
+    """
 
     current_physical_line = one_space_line()
     cleaner = fortran_cleaner(current_physical_line)
@@ -370,7 +455,7 @@ def fortran_file_source(fp, relaxed=False):
         while True:
             ((src_physical_start, src_physical_end), src_line_sloc, src_line, c_category) = next(c_walker)
             #if it's a cpp directive, flush what we have, then emit the directive and start over
-            if current_physical_start == None:
+            if current_physical_start is None:
                 current_physical_start = src_physical_start
 
             if c_category == "CPP_DIRECTIVE":
@@ -421,6 +506,10 @@ def fortran_file_source(fp, relaxed=False):
     return (total_sloc, total_physical_lines)
 
 def get_file_source(path):
+    """
+    Return a C or Fortran line source for path depending on
+    the language we can detect, or fail.
+    """
     lang = FileLanguage(path)
     if lang.get_language() == "fortran-free":
         return fortran_file_source

From 3fa388680fce3e3c3130cd53c1dcb7980fff3838 Mon Sep 17 00:00:00 2001
From: Jason Sewall <jason.sewall@intel.com>
Date: Fri, 22 Nov 2019 09:12:26 -0800
Subject: [PATCH 28/49] Add line info and use it in c_file_source

---
 codebasin/file_source.py | 84 ++++++++++++++++++++++++++++------------
 1 file changed, 59 insertions(+), 25 deletions(-)

diff --git a/codebasin/file_source.py b/codebasin/file_source.py
index 59d0555..ed78bdb 100644
--- a/codebasin/file_source.py
+++ b/codebasin/file_source.py
@@ -363,6 +363,50 @@ def process(self, lineiter):
             self.verify_continue = []
             self.state[-1] = "CONTINUING_FROM_SOL"
 
+class line_info:
+    """
+    Represents a logical line of code.
+    """
+    def __init__(self):
+        self.current_logical_line = one_space_line()
+        self.current_physical_start = 1
+        self.current_physical_end = None
+        self.local_sloc = 0
+        self.category = None
+        self.flushed_line = None
+    def join(self, other_line):
+        """
+        Combine this logical line with another one.
+        """
+        self.current_logical_line.join(other_line)
+    def physical_nonblank(self):
+        """
+        Mark nonblank link in this logical like.
+        """
+        self.local_sloc += 1
+    def physical_update(self, physical_line_num):
+        """
+        Mark end of new physical line.
+        """
+        self.current_physical_end = physical_line_num + 1
+        self.category = self.current_logical_line.category()
+        self.flushed_line = self.current_logical_line.flush()
+    def physical_reset(self):
+        """
+        Prepare for next logical block. Return counted sloc.
+        """
+        self.current_physical_start = self.current_physical_end
+        local_sloc_copy = self.local_sloc
+        self.local_sloc = 0
+        self.flushed_line = None
+        return local_sloc_copy
+    def logical_result(self):
+        """
+        Return tuple of contents. Eventually should just return this class.
+        """
+        return ((self.current_physical_start, self.current_physical_end),
+                self.local_sloc, self.flushed_line, self.category)
+
 def c_file_source(fp, relaxed=False, directives_only=False):
     """
     Process file fp in terms of logical (sloc) and physical lines of C code.
@@ -377,11 +421,9 @@ def c_file_source(fp, relaxed=False, directives_only=False):
     current_physical_line = one_space_line()
     cleaner = c_cleaner(current_physical_line, directives_only)
 
-    current_logical_line = one_space_line()
+    curr_line = line_info()
 
-    current_physical_start = 1
     total_sloc = 0
-    local_sloc = 0
 
     physical_line_num = 0
     for (physical_line_num, line) in enumerate(fp, start=1):
@@ -389,43 +431,35 @@ def c_file_source(fp, relaxed=False, directives_only=False):
         end = len(line)
         if line[-1] == '\n':
             end -= 1
-        else:
-            if end > 0 and line[end-1] == '\\':
-                raise RuntimeError("file seems to end in \\ with no newline!")
+        elif end > 0 and line[end-1] == '\\':
+            raise RuntimeError("file seems to end in \\ with no newline!")
 
-        if end > 0 and line[end-1] == '\\':
-            continued = True
+        continued = end > 0 and line[end-1] == '\\'
+        if continued:
             end -= 1
-        else:
-            continued = False
         cleaner.process(it.islice(line, 0, end))
         if not continued:
             cleaner.logical_newline()
 
         if not current_physical_line.category() == "BLANK":
-            local_sloc += 1
+            curr_line.physical_nonblank()
 
-        current_logical_line.join(current_physical_line)
+        curr_line.join(current_physical_line)
 
         if not continued:
-            line_cat = current_logical_line.category()
-            if line_cat != "BLANK":
-                yield ((current_physical_start, physical_line_num+1), local_sloc, current_logical_line.flush(), line_cat)
-            else:
-                current_logical_line.__init__()
-                assert local_sloc == 0
+            curr_line.physical_update(physical_line_num+1)
+            if curr_line.category != "BLANK":
+                yield curr_line.logical_result()
 
-            current_physical_start = physical_line_num + 1
-            total_sloc += local_sloc
-            local_sloc = 0
+            total_sloc += curr_line.physical_reset()
 
     total_physical_lines = physical_line_num
 
-    line_cat = current_logical_line.category()
-    if line_cat != "BLANK":
-        yield ((current_physical_start, physical_line_num+1), local_sloc, current_logical_line.flush(), line_cat)
+    curr_line.physical_update(physical_line_num+1)
+    if curr_line.category != "BLANK":
+        yield curr_line.logical_result()
 
-    total_sloc += local_sloc
+    total_sloc += curr_line.physical_reset()
     if not relaxed:
         assert cleaner.state == ["TOPLEVEL"]
 

From f942c287e69878dbdd61398959b5489373bdb1f3 Mon Sep 17 00:00:00 2001
From: Jason Sewall <jason.sewall@intel.com>
Date: Fri, 22 Nov 2019 13:12:21 -0800
Subject: [PATCH 29/49] Clean up file_source

---
 codebasin/file_parser.py | 12 ++++---
 codebasin/file_source.py | 69 +++++++++++++++++++---------------------
 etc/sloc_translate.py    |  4 +--
 3 files changed, 42 insertions(+), 43 deletions(-)

diff --git a/codebasin/file_parser.py b/codebasin/file_parser.py
index e3ab479..54d2a47 100644
--- a/codebasin/file_parser.py
+++ b/codebasin/file_parser.py
@@ -142,18 +142,20 @@ def parse_file(self):
             source = file_source(source_file)
             try:
                 while True:
-                    (phys_int, local_sloc, logical_line, line_cat) = next(source)
+                    logical_line = next(source)
+                    phys_int = (logical_line.current_physical_start, logical_line.current_physical_end)
                     # Only follow continuation for directives
-                    if line_cat == 'CPP_DIRECTIVE':
+                    if logical_line.category == 'CPP_DIRECTIVE':
                         # Add this into the directive lines, even if it
                         # might not be a directive we count
-                        groups['directive'].add_line(phys_int, local_sloc)
 
-                        FileParser.handle_directive(out_tree, groups, logical_line)
+                        groups['directive'].add_line(phys_int, logical_line.local_sloc)
+
+                        FileParser.handle_directive(out_tree, groups, logical_line.flushed_line)
 
                         # FallBack is that this line is a simple code line.
                     else:
-                        groups['code'].add_line(phys_int, local_sloc)
+                        groups['code'].add_line(phys_int, logical_line.local_sloc)
             except StopIteration as it:
                 total_sloc, physical_loc = it.value
 
diff --git a/codebasin/file_source.py b/codebasin/file_source.py
index ed78bdb..1ff56eb 100644
--- a/codebasin/file_source.py
+++ b/codebasin/file_source.py
@@ -379,16 +379,16 @@ def join(self, other_line):
         Combine this logical line with another one.
         """
         self.current_logical_line.join(other_line)
-    def physical_nonblank(self):
+    def physical_nonblank(self, n):
         """
         Mark nonblank link in this logical like.
         """
-        self.local_sloc += 1
+        self.local_sloc += n
     def physical_update(self, physical_line_num):
         """
         Mark end of new physical line.
         """
-        self.current_physical_end = physical_line_num + 1
+        self.current_physical_end = physical_line_num
         self.category = self.current_logical_line.category()
         self.flushed_line = self.current_logical_line.flush()
     def physical_reset(self):
@@ -400,6 +400,8 @@ def physical_reset(self):
         self.local_sloc = 0
         self.flushed_line = None
         return local_sloc_copy
+    def phys_interval(self):
+        return (self.current_physical_start, self.current_physical_end)
     def logical_result(self):
         """
         Return tuple of contents. Eventually should just return this class.
@@ -442,14 +444,14 @@ def c_file_source(fp, relaxed=False, directives_only=False):
             cleaner.logical_newline()
 
         if not current_physical_line.category() == "BLANK":
-            curr_line.physical_nonblank()
+            curr_line.physical_nonblank(1)
 
         curr_line.join(current_physical_line)
 
         if not continued:
             curr_line.physical_update(physical_line_num+1)
             if curr_line.category != "BLANK":
-                yield curr_line.logical_result()
+                yield curr_line
 
             total_sloc += curr_line.physical_reset()
 
@@ -457,7 +459,7 @@ def c_file_source(fp, relaxed=False, directives_only=False):
 
     curr_line.physical_update(physical_line_num+1)
     if curr_line.category != "BLANK":
-        yield curr_line.logical_result()
+        yield curr_line
 
     total_sloc += curr_line.physical_reset()
     if not relaxed:
@@ -478,62 +480,57 @@ def fortran_file_source(fp, relaxed=False):
     current_physical_line = one_space_line()
     cleaner = fortran_cleaner(current_physical_line)
 
-    current_logical_line = one_space_line()
+    curr_line = line_info()
 
     current_physical_start = None
     total_sloc = 0
-    local_sloc = 0
 
     c_walker = c_file_source(fp, directives_only=True)
     try:
         while True:
-            ((src_physical_start, src_physical_end), src_line_sloc, src_line, c_category) = next(c_walker)
+            src_c_line = next(c_walker)
+            #((src_physical_start, src_physical_end), src_line_sloc, src_line, c_category)
             #if it's a cpp directive, flush what we have, then emit the directive and start over
             if current_physical_start is None:
-                current_physical_start = src_physical_start
+                current_physical_start = curr_line.current_physical_start
 
-            if c_category == "CPP_DIRECTIVE":
-                line_cat = current_logical_line.category()
-                if line_cat != "BLANK":
-                    yield ((current_physical_start, src_physical_end), local_sloc, current_logical_line.flush(), line_cat)
-                else:
-                    current_logical_line.__init__()
-                    assert local_sloc == 0
+            if src_c_line.category == "CPP_DIRECTIVE":
+                curr_line.physical_update(src_c_line.current_physical_end)
+                if curr_line.category != "BLANK":
+                    yield curr_line
 
                 current_physical_start = None
-                total_sloc += local_sloc
-                local_sloc = 0
-                yield ((src_physical_start, src_physical_end), src_line_sloc, src_line, c_category)
-                total_sloc += src_line_sloc
+                total_sloc += curr_line.physical_reset()
+                yield src_c_line
+                total_sloc += src_c_line.local_sloc
                 continue
 
             current_physical_line.__init__()
-            cleaner.process(it.islice(src_line, 0, len(src_line)))
+            cleaner.process(it.islice(src_c_line.flushed_line, 0, len(src_c_line.flushed_line)))
 
             if not current_physical_line.category() == "BLANK":
-                local_sloc += src_line_sloc
+                curr_line.physical_nonblank(src_c_line.local_sloc)
 
-            current_logical_line.join(current_physical_line)
+            curr_line.join(current_physical_line)
 
             if cleaner.state[-1] != "CONTINUING_FROM_SOL":
-                line_cat = current_logical_line.category()
-                if line_cat != "BLANK":
-                    yield ((current_physical_start, src_physical_end), local_sloc, current_logical_line.flush(), line_cat)
-                else:
-                    current_logical_line.__init__()
-                    assert local_sloc == 0
+                curr_line.current_physical_start = current_physical_start
+                curr_line.physical_update(src_c_line.current_physical_end)
+                if curr_line.category != "BLANK":
+                    yield curr_line
 
                 current_physical_start = None
-                total_sloc += local_sloc
-                local_sloc = 0
+                total_sloc += curr_line.physical_reset()
+
     except StopIteration as stopit:
         _, total_physical_lines = stopit.value
 
-    line_cat = current_logical_line.category()
-    if line_cat != "BLANK":
-        yield ((current_physical_start, total_physical_lines), local_sloc, current_logical_line.flush(), line_cat)
+    curr_line.physical_update(total_physical_lines)
+    if not curr_line.category == "BLANK":
+        curr_line.current_physical_start = current_physical_start
+        yield curr_line
 
-    total_sloc += local_sloc
+    total_sloc += curr_line.physical_reset()
     if not relaxed:
         assert cleaner.state == ["TOPLEVEL"]
 
diff --git a/etc/sloc_translate.py b/etc/sloc_translate.py
index c26b2c4..b925f72 100755
--- a/etc/sloc_translate.py
+++ b/etc/sloc_translate.py
@@ -19,9 +19,9 @@ def file_sloc(path, verbose=False):
         walker = file_source(source_file, relaxed=False)
         try:
             while True:
-                (interval, sloc, line, line_cat) = next(walker)
+                logical_line = next(walker)
                 if verbose:
-                    print(f"{path} [{interval[0]}, {interval[1]}) ({sloc}): {line} {line_cat}")
+                    print(f"{path} [{logical_line.current_physical_start}, {logical_line.current_physical_end}) ({logical_line.local_sloc}): {logical_line.flushed_line} {logical_line.category}")
         except StopIteration as it:
             total_sloc, physical_loc = it.value
 

From 4a5e61cb418c74a1787d358cd32a2baa0c7d64d8 Mon Sep 17 00:00:00 2001
From: Jason Sewall <jason.sewall@intel.com>
Date: Fri, 22 Nov 2019 13:59:03 -0800
Subject: [PATCH 30/49] Clean up guess-info

---
 codebasin/file_parser.py |  14 +--
 codebasin/file_source.py |   3 +-
 etc/guess_info.py        | 192 ---------------------------------------
 etc/sloc_translate.py    |  50 ++++++----
 4 files changed, 41 insertions(+), 218 deletions(-)
 delete mode 100755 etc/guess_info.py

diff --git a/codebasin/file_parser.py b/codebasin/file_parser.py
index 54d2a47..52f9f5d 100644
--- a/codebasin/file_parser.py
+++ b/codebasin/file_parser.py
@@ -5,12 +5,8 @@
 and building a tree of nodes from it.
 """
 
-from os.path import splitext
-
-from . import preprocessor  # pylint : disable=no-name-in-module
-from . import language
-
 from codebasin.file_source import get_file_source
+from . import preprocessor  # pylint : disable=no-name-in-module
 
 class LineGroup:
     """
@@ -128,9 +124,9 @@ def parse_file(self):
         out_tree = preprocessor.SourceTree(self._filename)
         file_source = get_file_source(self._filename)
         if not file_source:
-            raise RuntimeError(f"{self._filename} doesn't appear to be a language this tool can process")
+            raise RuntimeError(f"{self._filename} doesn't appear " +
+                               "to be a language this tool can process")
         with open(self._filename, mode='r', errors='replace') as source_file:
-            previous_continue = False
 
             groups = {'code': LineGroup(),
                       'directive': LineGroup(),
@@ -143,7 +139,7 @@ def parse_file(self):
             try:
                 while True:
                     logical_line = next(source)
-                    phys_int = (logical_line.current_physical_start, logical_line.current_physical_end)
+                    phys_int = logical_line.phys_interval()
                     # Only follow continuation for directives
                     if logical_line.category == 'CPP_DIRECTIVE':
                         # Add this into the directive lines, even if it
@@ -157,7 +153,7 @@ def parse_file(self):
                     else:
                         groups['code'].add_line(phys_int, logical_line.local_sloc)
             except StopIteration as it:
-                total_sloc, physical_loc = it.value
+                _, physical_loc = it.value
 
             if not groups['code'].empty():
                 groups['code'].add_line((groups['code'].start_line, physical_loc-1), 0)
diff --git a/codebasin/file_source.py b/codebasin/file_source.py
index 1ff56eb..310b7d9 100644
--- a/codebasin/file_source.py
+++ b/codebasin/file_source.py
@@ -547,4 +547,5 @@ def get_file_source(path):
     elif lang.get_language() in ["c", "c++"]:
         return c_file_source
     else:
-        raise RuntimeError(f"Language {lang.get_language()} in file {path} is unsupported by code base investigator")
+        raise RuntimeError(f"Language {lang.get_language()} in file " +
+                           f"{path} is unsupported by code base investigator")
diff --git a/etc/guess_info.py b/etc/guess_info.py
deleted file mode 100755
index 03ced5b..0000000
--- a/etc/guess_info.py
+++ /dev/null
@@ -1,192 +0,0 @@
-#!/usr/bin/env python3.6
-
-import os
-import sys
-
-sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
-
-from sloc_translate import file_sloc
-from codebasin.file_source import get_file_source
-from codebasin.report import divergence
-
-import csv
-from pathlib import Path
-from collections import defaultdict
-import re
-import itertools as it
-import yaml
-
-def guess_app(inpath):
-    path = Path(inpath)
-    if path.parts[0] == 'dlpbenchcuda' and path.parts[1] != 'utils':
-        app = 'dlpbench-'  + path.parts[1]
-    elif path.parts[0] == 'dlpbenchopencl':
-        app = 'dlpbench-'  + path.parts[1]
-    elif path.parts[0] == 'dlpbench' and path.parts[1] != 'common' and path.parts[1] !='deprecated_workloads' and path.parts[1] != 'csa':
-        app = 'dlpbench-'  + path.parts[2]
-    elif path.parts[0] in ['cmedia-bench', "config", "infrastructure", "Test-Infrastructure"]:
-        app = None
-    elif path.parts[0] == 'DNNBench' and not path.parts[1] == 'common':
-        app = f"DNNBench-{path.parts[1]}"
-    else:
-        app = path.parts[0]
-    return app
-
-def matches(path, regexp):
-    return regexp.search(path) != None
-
-class plat_guesser(object):
-    def __init__(self, name, pathwl, extwl):
-        self.name = name
-        self.pathwl = pathwl
-        self.pathbl = []
-        self.extwl = extwl
-        self.extbl = []
-    def finalize(self):
-        if len(self.pathwl) > 0:
-            all_exts = "|".join((f"[^a-z]+{x}|{x}[^a-z]+" for x in (z.replace("+", r"\+") for z in self.pathwl)))
-            self.pathwl_re = re.compile(f"{all_exts}")
-        else:
-            self.pathwl_re = re.compile(r"^\b$")
-        if len(self.pathbl) > 0:
-            all_exts = "|".join((f"[^a-z]+{x}|{x}[^a-z]+" for x in (z.replace("+", r"\+") for z in self.pathbl)))
-            self.pathbl_re = re.compile(f"{all_exts}")
-        else:
-            self.pathwl_re = re.compile(r"^\b$")
-        if len(self.extwl) > 0:
-            all_exts = "|".join(self.extwl)
-            self.extwl_re = re.compile(f"(.{all_exts})$")
-        else:
-            self.extwl_re = re.compile(r"^\b$")
-        if len(self.extbl) > 0:
-            all_exts = "|".join(self.extbl)
-            self.extbl_re = re.compile(f"(.{all_exts})$")
-        else:
-            self.extbl_re = re.compile(r"^\b$")
-    def score(self, path):
-        neg, pos = False, False
-        pos |= matches(path, self.pathwl_re)
-        neg |= matches(path, self.pathbl_re)
-        pos |= matches(path, self.extwl_re)
-        neg |= matches(path, self.extbl_re)
-        return self.name, (neg, pos)
-
-
-guessers = [plat_guesser("cuda",
-                         ["cuda"],
-                         ["cu"]),
-            plat_guesser("opencl",
-                         ["opencl", "ocl"],
-                         ["cl"]),
-            plat_guesser("dpc++",
-                         ["dpc++", "dpcpp", "sycl"],
-                         []),
-            plat_guesser("openmp",
-                         ["omp", "openmp"],
-                         [])]
-
-all_pathwl = set()
-all_extwl = set()
-for g in guessers:
-    all_pathwl.update(set(g.pathwl))
-    all_extwl.update(set(g.extwl))
-
-for g in guessers:
-    g.pathbl = list(all_pathwl.difference(set(g.pathwl)))
-    g.extbl = list(all_extwl.difference(set(g.extwl)))
-    g.finalize()
-
-def guess_platform(inpath):
-    path = Path(inpath)
-    return path.parts[1]
-
-def categorize_file(inpath):
-    res = {}
-    path = inpath.lower()
-    for g in guessers:
-        name, cat = g.score(path)
-        res[name] = cat
-    return res
-
-def walk_apptree(inroot, regexp):
-    apps = defaultdict(list)
-    paths = {}
-    for root, dirs, files in os.walk(inroot):
-        for f in files:
-            full_path = os.path.join(root, f)
-            if regexp.match(full_path):
-                app = guess_app(full_path)
-                if app:
-                    apps[app].append(os.path.relpath(full_path, inroot))
-    return apps
-
-def app_groups(files, all_lang=frozenset(['cuda', 'opencl', 'dpc++', 'openmp'])):
-    platmap = defaultdict(list)
-    for f in files:
-        cats = categorize_file(f)
-        is_in = set()
-        isnt_in = set()
-        for k, which in cats.items():
-            if which[1]:
-                is_in.update([k])
-            if which[0]:
-                isnt_in.update([k])
-        if len(is_in) == 0:
-            partial_common = all_lang.difference(isnt_in)
-            if len(partial_common) > 0:
-                for p in partial_common:
-                    platmap[p].append(Path(f))
-        else:
-            update=is_in.intersection(all_lang)
-            if len(update) > 0:
-                for p in update:
-                    platmap[p].append(Path(f))
-    return platmap
-
-def write_yaml(output, files, langs_names_map, strip_prefix=Path(".")):
-
-    platmap = app_groups(files, frozenset(langs_names_map.values()))
-    all_files = set()
-    for plat, pfiles in platmap.items():
-        all_files.update([str(f.relative_to(strip_prefix)) for f in pfiles])
-    if len(all_files) == 0:
-        return False
-    base = {'codebase' : { 'files' : list(all_files) }}
-    plats = set()
-    for export_name, plat_name in langs_names_map.items():
-        plat_files = [str(f.relative_to(strip_prefix)) for f in platmap[plat_name]]
-        if len(plat_files) > 0:
-            base[export_name] = {'files': plat_files}
-            plats.update([export_name])
-        elif len(langs_names_map) < 4: #Hack
-            return False
-    base['codebase']['platforms'] = list(plats)
-    with open(output, "w") as ofp:
-        yaml.dump(base, ofp)
-    return True
-
-os.chdir("/nfs/home/jsewall/CDS-DPCPP-HPCBench/")
-apps = walk_apptree(".", re.compile('(.*\.)(cpp|c|hpp|h|cl|cu|cxx|cc|cuh)$'))
-
-#os.chdir("/nfs/home/jsewall/CDS-DPCPP-HPCBench/configs")
-for app_name, app_files in apps.items():
-
-    prefixed= [f"./{p}" for p in app_files]
-    app_path = Path(os.path.commonpath(prefixed))
-    if app_path.is_file():
-        app_path = app_path.parent
-
-    outpath = app_path / "cbi-configs"
-    try:
-        os.makedirs(outpath)
-    except FileExistsError:
-        pass
-    for suffix, config in [("all", dict(zip(*it.repeat(['cuda', 'opencl', 'dpc++', 'openmp'],2)))),
-                           ("dpcpp", {'dpc++-gpu' : 'dpc++', 'dpc++-cpu' : 'dpc++'}),
-                           ("ducttape", {'gpu' : 'cuda', 'cpu' : 'openmp'})]:
-        outfile = outpath / f"{app_name}-{suffix}.yaml"
-        write = write_yaml(outfile, app_files, config, strip_prefix=app_path)
-        if write:
-            print(outfile)
-
-print("done")
diff --git a/etc/sloc_translate.py b/etc/sloc_translate.py
index b925f72..946b9f9 100755
--- a/etc/sloc_translate.py
+++ b/etc/sloc_translate.py
@@ -1,17 +1,23 @@
 #!/usr/bin/env python3.6
 # Copyright (C) 2019-2020 Intel Corporation
 # SPDX-License-Identifier: BSD-3-Clause
+"""
+Parse source file, reporting sloc and physical lines.
+Can optionally print logical line regions and cleaned lines.
+"""
 
 import os
 import sys
+import re
 
 sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
 
 from codebasin.file_source import get_file_source
 
-import re
-
 def file_sloc(path, verbose=False):
+    """
+    Process file in path, reporting total_sloc/loc. Optionally print logical regions.
+    """
     file_source = get_file_source(path)
     if not file_source:
         raise RuntimeError(f"{path} doesn't appear to be a language this tool can process")
@@ -21,29 +27,41 @@ def file_sloc(path, verbose=False):
             while True:
                 logical_line = next(walker)
                 if verbose:
-                    print(f"{path} [{logical_line.current_physical_start}, {logical_line.current_physical_end}) ({logical_line.local_sloc}): {logical_line.flushed_line} {logical_line.category}")
+                    print(f"{path} [{logical_line.current_physical_start}," +
+                          f" {logical_line.current_physical_end}) ({logical_line.local_sloc}):"
+                          f" {logical_line.flushed_line} {logical_line.category}")
         except StopIteration as it:
             total_sloc, physical_loc = it.value
 
     return (path, total_sloc, physical_loc)
 
-def walk_sloc(root, regexp, verbose=False):
-    for root, dirs, files in os.walk(root):
-        for f in files:
-            full_path = os.path.join(root, f)
+def walk_sloc(in_root, regexp, verbose=False):
+    """
+    Run file_sloc on each file that matches regexp under root path.
+    """
+    for root, _, files in os.walk(in_root):
+        for current_file in files:
+            full_path = os.path.join(root, current_file)
             if regexp.match(full_path):
                 try:
-                    (filename, total_sloc, physical_loc)  = file_sloc(full_path)
-                    print(f"{filename}, {total_sloc}, {physical_loc}")
+                    (filename, total_sloc, physical_loc) = file_sloc(full_path)
+                    if verbose:
+                        print(f"{filename}, {total_sloc}, {physical_loc}")
                 except FileNotFoundError:
                     pass
 
-if __name__ == '__main__':
-    if len(sys.argv) == 2:
-        filename = sys.argv[1]
-        (filename, total_sloc, physical_loc)  = file_sloc(filename, verbose=True)
+def sloc_translate(args):
+    """
+    Toplevel routine for script.
+    """
+    if len(args) == 2:
+        (filename, total_sloc, physical_loc) = file_sloc(args[1], verbose=True)
         print(f"{filename}, {total_sloc}, {physical_loc}")
-    elif len(sys.argv) == 3:
-        walk_sloc(sys.argv[1], re.compile(sys.argv[2]))
+    elif len(args) == 3:
+        walk_sloc(args[1], re.compile(args[2]))
     else:
-        print("Expected either 1 argument (a single file to parse and print) or 2 (a directory root & file pattern)")
+        print("Expected either 1 argument (a single file to parse" +
+              " and print) or 2 (a directory root & file pattern)")
+
+if __name__ == '__main__':
+    sloc_translate(sys.argv)

From 9d30371a4301e3077e2d63147ec8421536793849 Mon Sep 17 00:00:00 2001
From: Jason Sewall <jason.sewall@intel.com>
Date: Fri, 22 Nov 2019 14:09:16 -0800
Subject: [PATCH 31/49] Work around pylint errors

---
 codebasin/file_parser.py | 1 +
 codebasin/file_source.py | 4 ++++
 etc/sloc_translate.py    | 2 ++
 3 files changed, 7 insertions(+)

diff --git a/codebasin/file_parser.py b/codebasin/file_parser.py
index 52f9f5d..f088d63 100644
--- a/codebasin/file_parser.py
+++ b/codebasin/file_parser.py
@@ -153,6 +153,7 @@ def parse_file(self):
                     else:
                         groups['code'].add_line(phys_int, logical_line.local_sloc)
             except StopIteration as it:
+                # pylint: disable=unpacking-non-sequence
                 _, physical_loc = it.value
 
             if not groups['code'].empty():
diff --git a/codebasin/file_source.py b/codebasin/file_source.py
index 310b7d9..2e677a8 100644
--- a/codebasin/file_source.py
+++ b/codebasin/file_source.py
@@ -19,6 +19,7 @@
 
 def is_whitespace(c):
     """Returns true if the character c is whitespace"""
+    # pylint: disable=global-statement
     global whitespace_dict
     return c in whitespace_dict
 
@@ -152,6 +153,7 @@ def process(self, lineiter):
         """
         Add contents of lineiter to outbuf, stripping as directed.
         """
+        # pylint: disable=too-many-branches,too-many-statements
         inbuffer = iter_keep1(lineiter)
         for char in inbuffer:
             if self.state[-1] == "TOPLEVEL":
@@ -279,6 +281,7 @@ def process(self, lineiter):
         Add contents of lineiter to current line, removing contents and
         handling continuations.
         """
+        # pylint: disable=too-many-branches,too-many-statements
         inbuffer = iter_keep1(lineiter)
         try:
             while True:
@@ -523,6 +526,7 @@ def fortran_file_source(fp, relaxed=False):
                 total_sloc += curr_line.physical_reset()
 
     except StopIteration as stopit:
+        # pylint: disable=unpacking-non-sequence
         _, total_physical_lines = stopit.value
 
     curr_line.physical_update(total_physical_lines)
diff --git a/etc/sloc_translate.py b/etc/sloc_translate.py
index 946b9f9..c25fbd2 100755
--- a/etc/sloc_translate.py
+++ b/etc/sloc_translate.py
@@ -12,6 +12,7 @@
 
 sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
 
+# pylint: disable=wrong-import-position
 from codebasin.file_source import get_file_source
 
 def file_sloc(path, verbose=False):
@@ -31,6 +32,7 @@ def file_sloc(path, verbose=False):
                           f" {logical_line.current_physical_end}) ({logical_line.local_sloc}):"
                           f" {logical_line.flushed_line} {logical_line.category}")
         except StopIteration as it:
+             # pylint: disable=unpacking-non-sequence
             total_sloc, physical_loc = it.value
 
     return (path, total_sloc, physical_loc)

From 6f207cb7c2ff3496448019ccf789bc6a9c67ea74 Mon Sep 17 00:00:00 2001
From: Jason Sewall <jason.sewall@intel.com>
Date: Fri, 22 Nov 2019 14:11:21 -0800
Subject: [PATCH 32/49] Autopep8

---
 codebasin/file_parser.py        |  9 ++++--
 codebasin/file_source.py        | 49 +++++++++++++++++++++++++++------
 etc/sloc_translate.py           |  4 +++
 setup.py                        |  2 +-
 tests/comments/test_comments.py |  3 ++
 5 files changed, 55 insertions(+), 12 deletions(-)

diff --git a/codebasin/file_parser.py b/codebasin/file_parser.py
index f088d63..152ce3d 100644
--- a/codebasin/file_parser.py
+++ b/codebasin/file_parser.py
@@ -8,6 +8,7 @@
 from codebasin.file_source import get_file_source
 from . import preprocessor  # pylint : disable=no-name-in-module
 
+
 class LineGroup:
     """
     Represents a grouping of lines. It contains the extent, and the
@@ -37,10 +38,11 @@ def add_line(self, phys_int, sloc_count):
         if self.start_line == -1 or phys_int[0] < self.start_line:
             self.start_line = phys_int[0]
 
-        if phys_int[1]-1 > self.end_line:
-            self.end_line = phys_int[1]-1
+        if phys_int[1] - 1 > self.end_line:
+            self.end_line = phys_int[1] - 1
 
         self.line_count += sloc_count
+
     def reset(self):
         """
         Reset the countable group
@@ -65,6 +67,7 @@ def merge(self, line_group):
         self.end_line = max(self.end_line, line_group.end_line)
         line_group.reset()
 
+
 class FileParser:
     """
     Contains methods for parsing an entire source file and returning a
@@ -157,7 +160,7 @@ def parse_file(self):
                 _, physical_loc = it.value
 
             if not groups['code'].empty():
-                groups['code'].add_line((groups['code'].start_line, physical_loc-1), 0)
+                groups['code'].add_line((groups['code'].start_line, physical_loc - 1), 0)
                 self.insert_code_node(out_tree, groups['code'])
                 groups['file'].merge(groups['code'])
 
diff --git a/codebasin/file_source.py b/codebasin/file_source.py
index 2e677a8..4138125 100644
--- a/codebasin/file_source.py
+++ b/codebasin/file_source.py
@@ -8,29 +8,33 @@
 import itertools as it
 from .language import FileLanguage
 
-### This string was created by looking at all unicode code points
-### and checking to see if they are considered whitespace
-### ('\s') by the re module
+# This string was created by looking at all unicode code points
+# and checking to see if they are considered whitespace
+# ('\s') by the re module
 whitespace_dict = dict.fromkeys(''.join([' \t\n\r\x0b\x0c\x1c\x1d\x1e',
                                          '\x1f\x85\xa0\u1680\u2000\u2001',
                                          '\u2002\u2003\u2004\u2005\u2006',
                                          '\u2007\u2008\u2009\u200a\u2028',
                                          '\u2029\u202f\u205f\u3000']))
 
+
 def is_whitespace(c):
     """Returns true if the character c is whitespace"""
     # pylint: disable=global-statement
     global whitespace_dict
     return c in whitespace_dict
 
+
 class one_space_line:
     """
     A container that represents a single line of code while (generally)
     merging all whitespace into a single space.
     """
+
     def __init__(self):
         self.parts = []
         self.trailing_space = False
+
     def append_char(self, c):
         """
         Append a character of no particular class to the line.
@@ -43,6 +47,7 @@ def append_char(self, c):
             if not self.trailing_space:
                 self.parts.append(' ')
                 self.trailing_space = True
+
     def append_space(self):
         """
         Append whitespace to line, unless line already ends in a space.
@@ -50,9 +55,11 @@ def append_space(self):
         if not self.trailing_space:
             self.parts.append(' ')
             self.trailing_space = True
+
     def append_nonspace(self, c):
         self.parts.append(c)
         self.trailing_space = False
+
     def join(self, other):
         """
         Append another one_space_line to this one, respecting whitespace rules.
@@ -63,6 +70,7 @@ def join(self, other):
             else:
                 self.parts += other.parts[:]
             self.trailing_space = other.trailing_space
+
     def category(self):
         """
         Report the a category for this line:
@@ -81,6 +89,7 @@ def category(self):
         elif self.parts[:2] == ' #' or self.parts[0] == '#':
             res = "CPP_DIRECTIVE"
         return res
+
     def flush(self):
         """
         Convert the characters to a string and reset the buffer.
@@ -89,22 +98,27 @@ def flush(self):
         self.__init__()
         return res
 
+
 class iter_keep1:
     """
     An iterator wrapper that allows a single item to be 'put back'
     and picked up for the next iteration.
     """
+
     def __init__(self, iterator):
         self.iterator = iter(iterator)
         self.single = None
+
     def __iter__(self):
         return self
+
     def __next__(self):
         if self.single is not None:
             res, self.single = self.single, None
             return res
         else:
             return next(self.iterator)
+
     def putback(self, item):
         """
         Put item into the iterator such that it will be the next
@@ -113,6 +127,7 @@ def putback(self, item):
         assert self.single is None
         self.single = item
 
+
 class c_cleaner:
     """
     Approximation of the early stages of a C preprocessor.
@@ -120,6 +135,7 @@ class c_cleaner:
     with whitespace. State is kept across physical lines and cleared with
     logical_newline.
     """
+
     def __init__(self, outbuf, directives_only=False):
         """
         directives_only has the cleaner only operate on directive lines.
@@ -127,6 +143,7 @@ def __init__(self, outbuf, directives_only=False):
         self.state = ["TOPLEVEL"]
         self.outbuf = outbuf
         self.directives_only = directives_only
+
     def logical_newline(self):
         """
         Reset state when a logical newline is found.
@@ -149,6 +166,7 @@ def logical_newline(self):
             assert self.state[-1] == "IN_BLOCK_COMMENT"
         elif self.state[-1] == "CPP_DIRECTIVE":
             self.state = ["TOPLEVEL"]
+
     def process(self, lineiter):
         """
         Add contents of lineiter to outbuf, stripping as directed.
@@ -248,16 +266,19 @@ def process(self, lineiter):
             else:
                 assert None
 
+
 class fortran_cleaner:
     """
     'Cleans' source to remove comments and blanks while preserving
     directives and handling strings and continuations properly.
     Expects to have c defines already processed.
     """
+
     def __init__(self, outbuf):
         self.state = ["TOPLEVEL"]
         self.outbuf = outbuf
         self.verify_continue = []
+
     def dir_check(self, inbuffer):
         """
         Inspect comment to see if it is in fact, a valid directive,
@@ -276,6 +297,7 @@ def dir_check(self, inbuffer):
                 found.append(char)
             else:
                 return
+
     def process(self, lineiter):
         """
         Add contents of lineiter to current line, removing contents and
@@ -366,10 +388,12 @@ def process(self, lineiter):
             self.verify_continue = []
             self.state[-1] = "CONTINUING_FROM_SOL"
 
+
 class line_info:
     """
     Reprsents a logical line of code.
     """
+
     def __init__(self):
         self.current_logical_line = one_space_line()
         self.current_physical_start = 1
@@ -377,16 +401,19 @@ def __init__(self):
         self.local_sloc = 0
         self.category = None
         self.flushed_line = None
+
     def join(self, other_line):
         """
         Combine this logical line with another one.
         """
         self.current_logical_line.join(other_line)
+
     def physical_nonblank(self, n):
         """
         Mark nonblank link in this logical like.
         """
         self.local_sloc += n
+
     def physical_update(self, physical_line_num):
         """
         Mark end of new physical line.
@@ -394,6 +421,7 @@ def physical_update(self, physical_line_num):
         self.current_physical_end = physical_line_num
         self.category = self.current_logical_line.category()
         self.flushed_line = self.current_logical_line.flush()
+
     def physical_reset(self):
         """
         Prepare for next logical block. Return counted sloc.
@@ -403,8 +431,10 @@ def physical_reset(self):
         self.local_sloc = 0
         self.flushed_line = None
         return local_sloc_copy
+
     def phys_interval(self):
         return (self.current_physical_start, self.current_physical_end)
+
     def logical_result(self):
         """
         Return tuple of contents. Eventually should just return this class.
@@ -412,6 +442,7 @@ def logical_result(self):
         return ((self.current_physical_start, self.current_physical_end),
                 self.local_sloc, self.flushed_line, self.category)
 
+
 def c_file_source(fp, relaxed=False, directives_only=False):
     """
     Process file fp in terms of logical (sloc) and physical lines of C code.
@@ -436,10 +467,10 @@ def c_file_source(fp, relaxed=False, directives_only=False):
         end = len(line)
         if line[-1] == '\n':
             end -= 1
-        elif end > 0 and line[end-1] == '\\':
+        elif end > 0 and line[end - 1] == '\\':
             raise RuntimeError("file seems to end in \\ with no newline!")
 
-        continued = end > 0 and line[end-1] == '\\'
+        continued = end > 0 and line[end - 1] == '\\'
         if continued:
             end -= 1
         cleaner.process(it.islice(line, 0, end))
@@ -452,7 +483,7 @@ def c_file_source(fp, relaxed=False, directives_only=False):
         curr_line.join(current_physical_line)
 
         if not continued:
-            curr_line.physical_update(physical_line_num+1)
+            curr_line.physical_update(physical_line_num + 1)
             if curr_line.category != "BLANK":
                 yield curr_line
 
@@ -460,7 +491,7 @@ def c_file_source(fp, relaxed=False, directives_only=False):
 
     total_physical_lines = physical_line_num
 
-    curr_line.physical_update(physical_line_num+1)
+    curr_line.physical_update(physical_line_num + 1)
     if curr_line.category != "BLANK":
         yield curr_line
 
@@ -470,6 +501,7 @@ def c_file_source(fp, relaxed=False, directives_only=False):
 
     return (total_sloc, total_physical_lines)
 
+
 def fortran_file_source(fp, relaxed=False):
     """
     Process file fp in terms of logical (sloc) and physical lines of
@@ -493,7 +525,7 @@ def fortran_file_source(fp, relaxed=False):
         while True:
             src_c_line = next(c_walker)
             #((src_physical_start, src_physical_end), src_line_sloc, src_line, c_category)
-            #if it's a cpp directive, flush what we have, then emit the directive and start over
+            # if it's a cpp directive, flush what we have, then emit the directive and start over
             if current_physical_start is None:
                 current_physical_start = curr_line.current_physical_start
 
@@ -540,6 +572,7 @@ def fortran_file_source(fp, relaxed=False):
 
     return (total_sloc, total_physical_lines)
 
+
 def get_file_source(path):
     """
     Return a C or Fortran line source for path depending on
diff --git a/etc/sloc_translate.py b/etc/sloc_translate.py
index c25fbd2..59c9076 100755
--- a/etc/sloc_translate.py
+++ b/etc/sloc_translate.py
@@ -15,6 +15,7 @@
 # pylint: disable=wrong-import-position
 from codebasin.file_source import get_file_source
 
+
 def file_sloc(path, verbose=False):
     """
     Process file in path, reporting total_sloc/loc. Optionally print logical regions.
@@ -37,6 +38,7 @@ def file_sloc(path, verbose=False):
 
     return (path, total_sloc, physical_loc)
 
+
 def walk_sloc(in_root, regexp, verbose=False):
     """
     Run file_sloc on each file that matches regexp under root path.
@@ -52,6 +54,7 @@ def walk_sloc(in_root, regexp, verbose=False):
                 except FileNotFoundError:
                     pass
 
+
 def sloc_translate(args):
     """
     Toplevel routine for script.
@@ -65,5 +68,6 @@ def sloc_translate(args):
         print("Expected either 1 argument (a single file to parse" +
               " and print) or 2 (a directory root & file pattern)")
 
+
 if __name__ == '__main__':
     sloc_translate(sys.argv)
diff --git a/setup.py b/setup.py
index 7c0c758..2fdbaf1 100644
--- a/setup.py
+++ b/setup.py
@@ -23,4 +23,4 @@
                         'matplotlib',
                         'pyyaml',
                         'scipy']
-)
+      )
diff --git a/tests/comments/test_comments.py b/tests/comments/test_comments.py
index aa97472..ab672e6 100644
--- a/tests/comments/test_comments.py
+++ b/tests/comments/test_comments.py
@@ -6,6 +6,7 @@
 import os
 from codebasin import preprocessor, file_parser
 
+
 class TestExampleFortranFile(unittest.TestCase):
     """
     Test handling of fixed form Fortran
@@ -18,6 +19,7 @@ def test_fortran_comments(self):
         tree = parser.parse_file()
         self.assertEqual(tree.root.total_sloc, 20)
 
+
 class TestExampleCFile(unittest.TestCase):
     """
     Test handling of C comments
@@ -30,5 +32,6 @@ def test_c_comments(self):
         tree = parser.parse_file()
         self.assertEqual(tree.root.total_sloc, 25)
 
+
 if __name__ == '__main__':
     unittest.main()

From 357719a76116bb99f6ead6f82f6a2d0bf9034f26 Mon Sep 17 00:00:00 2001
From: Jason Sewall <jason.sewall@intel.com>
Date: Tue, 3 Dec 2019 09:50:13 -0800
Subject: [PATCH 33/49] Toggle report title printouts

---
 codebasin.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/codebasin.py b/codebasin.py
index 97f64c9..6579339 100755
--- a/codebasin.py
+++ b/codebasin.py
@@ -98,8 +98,10 @@ def guess_project_name(config_path):
 
     output_prefix = os.path.realpath(guess_project_name(args.config_file))
 
-    print(f"Config file: {args.config_file}")
-    print(f"Root: {rootdir}")
+    if report_enabled("summary") or report_enabled("clustering"):
+        print(f"Config file: {args.config_file}")
+        print(f"Root: {rootdir}")
+
     # Print summary report
     if report_enabled("summary"):
         summary = report.summary(setmap)

From bf611fddbe49e2a6263cfbc10e64a8da76ffdff5 Mon Sep 17 00:00:00 2001
From: Jason Sewall <jason.sewall@intel.com>
Date: Tue, 3 Dec 2019 09:50:53 -0800
Subject: [PATCH 34/49] Add verbose flag to sloc_translate

---
 etc/sloc_translate.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/etc/sloc_translate.py b/etc/sloc_translate.py
index 59c9076..799052b 100755
--- a/etc/sloc_translate.py
+++ b/etc/sloc_translate.py
@@ -63,7 +63,7 @@ def sloc_translate(args):
         (filename, total_sloc, physical_loc) = file_sloc(args[1], verbose=True)
         print(f"{filename}, {total_sloc}, {physical_loc}")
     elif len(args) == 3:
-        walk_sloc(args[1], re.compile(args[2]))
+        walk_sloc(args[1], re.compile(args[2]), verbose=True)
     else:
         print("Expected either 1 argument (a single file to parse" +
               " and print) or 2 (a directory root & file pattern)")

From 197d8e39f29977940bf5b781a3347d9672227544 Mon Sep 17 00:00:00 2001
From: Jason Sewall <jason.sewall@intel.com>
Date: Tue, 3 Dec 2019 11:14:48 -0800
Subject: [PATCH 35/49] Fixed form -> Freeform Fortran

---
 tests/comments/test_comments.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/comments/test_comments.py b/tests/comments/test_comments.py
index ab672e6..f63d784 100644
--- a/tests/comments/test_comments.py
+++ b/tests/comments/test_comments.py
@@ -9,7 +9,7 @@
 
 class TestExampleFortranFile(unittest.TestCase):
     """
-    Test handling of fixed form Fortran
+    Test handling of freeform Fortran
     """
 
     def test_fortran_comments(self):

From 631a3fbed63c18b96b1962064a1418f44225a777 Mon Sep 17 00:00:00 2001
From: Jason Sewall <jason.sewall@intel.com>
Date: Tue, 3 Dec 2019 11:16:17 -0800
Subject: [PATCH 36/49] Get rid of regexps in sloc_translate

---
 etc/sloc_translate.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/etc/sloc_translate.py b/etc/sloc_translate.py
index 799052b..72d7ca2 100755
--- a/etc/sloc_translate.py
+++ b/etc/sloc_translate.py
@@ -8,7 +8,7 @@
 
 import os
 import sys
-import re
+from pathlib import Path
 
 sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
 
@@ -39,14 +39,14 @@ def file_sloc(path, verbose=False):
     return (path, total_sloc, physical_loc)
 
 
-def walk_sloc(in_root, regexp, verbose=False):
+def walk_sloc(in_root, extensions, verbose=False):
     """
     Run file_sloc on each file that matches regexp under root path.
     """
     for root, _, files in os.walk(in_root):
         for current_file in files:
             full_path = os.path.join(root, current_file)
-            if regexp.match(full_path):
+            if Path(full_path).suffix in extensions:
                 try:
                     (filename, total_sloc, physical_loc) = file_sloc(full_path)
                     if verbose:
@@ -63,7 +63,8 @@ def sloc_translate(args):
         (filename, total_sloc, physical_loc) = file_sloc(args[1], verbose=True)
         print(f"{filename}, {total_sloc}, {physical_loc}")
     elif len(args) == 3:
-        walk_sloc(args[1], re.compile(args[2]), verbose=True)
+        cleaned = [f".{x}" for x in args[2].split(',')]
+        walk_sloc(args[1], cleaned, verbose=True)
     else:
         print("Expected either 1 argument (a single file to parse" +
               " and print) or 2 (a directory root & file pattern)")

From 77cf05f9cd05fb2067176068e0c1581f891bd0d8 Mon Sep 17 00:00:00 2001
From: Jason Sewall <jason.sewall@intel.com>
Date: Wed, 4 Dec 2019 11:42:06 -0800
Subject: [PATCH 37/49] Add batchmode flag to guard extra report info

---
 codebasin.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/codebasin.py b/codebasin.py
index 6579339..b2fa223 100755
--- a/codebasin.py
+++ b/codebasin.py
@@ -72,6 +72,8 @@ def guess_project_name(config_path):
     parser.add_argument('-R', '--report', dest='reports', metavar='REPORT', default=['all'],
                         choices=['all', 'summary', 'clustering'], nargs='+',
                         help='desired output reports (default: all)')
+    parser.add_argument('--batchmode', dest='batchmode', action='store_true', default=False,
+                        help="Set batch mode (additional output for bulk operation.)")
     args = parser.parse_args()
 
     stdout_log = logging.StreamHandler(sys.stdout)
@@ -98,7 +100,7 @@ def guess_project_name(config_path):
 
     output_prefix = os.path.realpath(guess_project_name(args.config_file))
 
-    if report_enabled("summary") or report_enabled("clustering"):
+    if args.batchmode and (report_enabled("summary") or report_enabled("clustering")):
         print(f"Config file: {args.config_file}")
         print(f"Root: {rootdir}")
 

From faf9a204ae80a6e5424ba53ca13d059452a3504a Mon Sep 17 00:00:00 2001
From: Jason Sewall <jason.sewall@intel.com>
Date: Thu, 19 Mar 2020 06:57:26 -0700
Subject: [PATCH 38/49] Add default configfile in rootdir detection

---
 codebasin.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/codebasin.py b/codebasin.py
index b2fa223..d914760 100755
--- a/codebasin.py
+++ b/codebasin.py
@@ -9,7 +9,7 @@
 optional arguments:
   -h, --help            show this help message and exit
   -c FILE, --config FILE
-                        configuration file (default: config.yaml)
+                        configuration file (default: <DIR>/config.yaml)
   -v, --verbose         verbosity level
   -q, --quiet           quiet level
   -r DIR, --rootdir DIR
@@ -61,7 +61,7 @@ def guess_project_name(config_path):
     # Read command-line arguments
     parser = argparse.ArgumentParser(description="Code Base Investigator v" + str(version))
     parser.add_argument('-c', '--config', dest='config_file', metavar='FILE', action='store',
-                        default='config.yaml', help='configuration file (default: config.yaml)')
+                        help='configuration file (default: <DIR>/config.yaml)')
     parser.add_argument('-v', '--verbose', dest='verbose',
                         action='count', default=0, help='increase verbosity level')
     parser.add_argument('-q', '--quiet', dest='quiet',
@@ -83,12 +83,16 @@ def guess_project_name(config_path):
         max(1, logging.WARNING - 10 * (args.verbose - args.quiet)))
     rootdir = os.path.realpath(args.rootdir)
 
+    if args.config_file == None:
+        config_file = os.path.join(rootdir, "config.yaml")
+    else:
+        config_file = args.config_file
     # Load the configuration file into a dict
-    if not util.ensure_yaml(args.config_file):
+    if not util.ensure_yaml(config_file):
         logging.getLogger("codebasin").error(
             "Configuration file does not have YAML file extension.")
         sys.exit(1)
-    codebase, configuration = config.load(args.config_file, rootdir)
+    codebase, configuration = config.load(config_file, rootdir)
 
     # Parse the source tree, and determine source line associations.
     # The trees and associations are housed in state.
@@ -98,10 +102,10 @@ def guess_project_name(config_path):
     platform_mapper = walkers.PlatformMapper(codebase)
     setmap = platform_mapper.walk(state)
 
-    output_prefix = os.path.realpath(guess_project_name(args.config_file))
+    output_prefix = os.path.realpath(guess_project_name(config_file))
 
     if args.batchmode and (report_enabled("summary") or report_enabled("clustering")):
-        print(f"Config file: {args.config_file}")
+        print(f"Config file: {config_file}")
         print(f"Root: {rootdir}")
 
     # Print summary report

From b724e01bddbc1374e869326055dd3b550a62e6fc Mon Sep 17 00:00:00 2001
From: Jason Sewall <jason.sewall@intel.com>
Date: Mon, 28 Oct 2019 08:00:14 -0700
Subject: [PATCH 39/49] Fix typo in comment

---
 codebasin/preprocessor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/codebasin/preprocessor.py b/codebasin/preprocessor.py
index 517ffac..15f19e9 100644
--- a/codebasin/preprocessor.py
+++ b/codebasin/preprocessor.py
@@ -2,7 +2,7 @@
 # SPDX-License-Identifier: BSD-3-Clause
 # pylint: disable=too-many-lines
 """
-Dontains classes that define:
+Contains classes that define:
 - Nodes from the tree
 - Tokens from lexing a line of code
 - Operators to handle tokens

From 750a1edcdf29d24ed3f17de87aba39dcdc880384 Mon Sep 17 00:00:00 2001
From: Jason Sewall <jason.sewall@intel.com>
Date: Thu, 19 Mar 2020 08:54:20 -0700
Subject: [PATCH 40/49] Move rootdir options up above config options

This makes the reference to <DIR> more clear.
---
 codebasin.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/codebasin.py b/codebasin.py
index d914760..074eade 100755
--- a/codebasin.py
+++ b/codebasin.py
@@ -60,15 +60,15 @@ def guess_project_name(config_path):
 
     # Read command-line arguments
     parser = argparse.ArgumentParser(description="Code Base Investigator v" + str(version))
+    parser.add_argument('-r', '--rootdir', dest="rootdir", metavar='DIR',
+                        default=os.getcwd(), type=str,
+                        help="Set working root directory (default .)")
     parser.add_argument('-c', '--config', dest='config_file', metavar='FILE', action='store',
                         help='configuration file (default: <DIR>/config.yaml)')
     parser.add_argument('-v', '--verbose', dest='verbose',
                         action='count', default=0, help='increase verbosity level')
     parser.add_argument('-q', '--quiet', dest='quiet',
                         action='count', default=0, help='decrease verbosity level')
-    parser.add_argument('-r', '--rootdir', dest="rootdir", metavar='DIR',
-                        default=os.getcwd(), type=str,
-                        help="Set working root directory (default .)")
     parser.add_argument('-R', '--report', dest='reports', metavar='REPORT', default=['all'],
                         choices=['all', 'summary', 'clustering'], nargs='+',
                         help='desired output reports (default: all)')

From 807c8f71fcf50ddc009d4d1ff05e11db2f855c38 Mon Sep 17 00:00:00 2001
From: John Pennycook <john.pennycook@intel.com>
Date: Fri, 29 May 2020 13:51:59 -0700
Subject: [PATCH 41/49] Apply autopep8 and pylint fixes

---
 codebasin.py             | 2 +-
 codebasin/file_parser.py | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/codebasin.py b/codebasin.py
index 074eade..9e6a6eb 100755
--- a/codebasin.py
+++ b/codebasin.py
@@ -83,7 +83,7 @@ def guess_project_name(config_path):
         max(1, logging.WARNING - 10 * (args.verbose - args.quiet)))
     rootdir = os.path.realpath(args.rootdir)
 
-    if args.config_file == None:
+    if args.config_file is None:
         config_file = os.path.join(rootdir, "config.yaml")
     else:
         config_file = args.config_file
diff --git a/codebasin/file_parser.py b/codebasin/file_parser.py
index 152ce3d..86dfb8f 100644
--- a/codebasin/file_parser.py
+++ b/codebasin/file_parser.py
@@ -133,8 +133,7 @@ def parse_file(self):
 
             groups = {'code': LineGroup(),
                       'directive': LineGroup(),
-                      'file': LineGroup()
-                      }
+                      'file': LineGroup()}
 
             groups['file'].start_line = 1
 

From d0848c9d954928134a5f7f67efde0158a646b349 Mon Sep 17 00:00:00 2001
From: John Pennycook <john.pennycook@intel.com>
Date: Mon, 1 Jun 2020 07:04:58 -0700
Subject: [PATCH 42/49] Bump version number to 1.05

---
 codebasin.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/codebasin.py b/codebasin.py
index 9e6a6eb..bdb6ecf 100755
--- a/codebasin.py
+++ b/codebasin.py
@@ -25,7 +25,7 @@
 
 from codebasin import config, finder, report, util, walkers
 
-version = 1.0
+version = 1.05
 
 
 def report_enabled(name):

From ab4fcc0de966b790e61a422e4575359053597457 Mon Sep 17 00:00:00 2001
From: Douglas Jacobsen <douglas.w.jacobsen@intel.com>
Date: Mon, 1 Jun 2020 07:40:43 -0700
Subject: [PATCH 43/49] Prevent import reorder in sloc_translate

This commit adds `# nopep8` to the end of the sloc_translate.py
get_file_source import, to prevent autopep8 from reordering it
automatically.
---
 etc/sloc_translate.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/etc/sloc_translate.py b/etc/sloc_translate.py
index 72d7ca2..f252d96 100755
--- a/etc/sloc_translate.py
+++ b/etc/sloc_translate.py
@@ -13,7 +13,7 @@
 sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
 
 # pylint: disable=wrong-import-position
-from codebasin.file_source import get_file_source
+from codebasin.file_source import get_file_source  # nopep8
 
 
 def file_sloc(path, verbose=False):

From 0423ccf0add74b676b18539f33186d46c34c7d3c Mon Sep 17 00:00:00 2001
From: Jason Sewall <jason.sewall@intel.com>
Date: Fri, 19 Jun 2020 12:14:42 -0700
Subject: [PATCH 44/49] Replace asserts in file_source.py with exceptions

---
 codebasin/file_source.py | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/codebasin/file_source.py b/codebasin/file_source.py
index 4138125..f625bf4 100644
--- a/codebasin/file_source.py
+++ b/codebasin/file_source.py
@@ -124,7 +124,8 @@ def putback(self, item):
         Put item into the iterator such that it will be the next
         yielded item.
         """
-        assert self.single is None
+        if self.single is not None:
+            raise RuntimeError("iter_keep1 can only have one item put back at a time!")
         self.single = item
 
 
@@ -163,7 +164,8 @@ def logical_newline(self):
             self.state = ["TOPLEVEL"]
         elif self.state[-1] == "IN_BLOCK_COMMENT_FOUND_STAR":
             self.state.pop()
-            assert self.state[-1] == "IN_BLOCK_COMMENT"
+            if not self.state[-1] == "IN_BLOCK_COMMENT":
+                raise RuntimeError("Inconsistent parser state! Looking for / to terminates a block comment but not in a block comment!")
         elif self.state[-1] == "CPP_DIRECTIVE":
             self.state = ["TOPLEVEL"]
 
@@ -252,19 +254,21 @@ def process(self, lineiter):
             elif self.state[-1] == "IN_BLOCK_COMMENT_FOUND_STAR":
                 if char == '/':
                     self.state.pop()
-                    assert self.state[-1] == "IN_BLOCK_COMMENT"
+                    if not self.state[-1] == "IN_BLOCK_COMMENT":
+                        raise RuntimeError("Inconsistent parser state! Looking for / to terminates a block comment but not in a block comment!")
                     self.state.pop()
                     self.outbuf.append_space()
                 elif char != '*':
                     self.state.pop()
-                    assert self.state[-1] == "IN_BLOCK_COMMENT"
+                    if not self.state[-1] == "IN_BLOCK_COMMENT":
+                        raise RuntimeError("Inconsistent parser Looking for * that terminates a block comment but not in a block comment!")
             elif self.state[-1] == "ESCAPING":
                 self.outbuf.append_nonspace(char)
                 self.state.pop()
             elif self.state[-1] == "IN_INLINE_COMMENT":
                 return
             else:
-                assert None
+                raise RuntimeError("Unknown parser state!")
 
 
 class fortran_cleaner:
@@ -379,7 +383,7 @@ def process(self, lineiter):
                     elif is_whitespace(char):
                         self.verify_continue.append(char)
                 else:
-                    assert None
+                    raise RuntimeError("Unknown parser state")
         except StopIteration:
             pass
         if self.state[-1] == "CONTINUING_TO_EOL":
@@ -496,8 +500,8 @@ def c_file_source(fp, relaxed=False, directives_only=False):
         yield curr_line
 
     total_sloc += curr_line.physical_reset()
-    if not relaxed:
-        assert cleaner.state == ["TOPLEVEL"]
+    if not relaxed and not cleaner.state == ["TOPLEVEL"]:
+        raise RuntimeError("C file parser did not end at top level, and not in 'relaxed' mode")
 
     return (total_sloc, total_physical_lines)
 
@@ -567,8 +571,8 @@ def fortran_file_source(fp, relaxed=False):
         yield curr_line
 
     total_sloc += curr_line.physical_reset()
-    if not relaxed:
-        assert cleaner.state == ["TOPLEVEL"]
+    if not relaxed and not cleaner.state == ["TOPLEVEL"]:
+        raise RuntimeError("Fortran file parser did not end at top level, and not in 'relaxed' mode")
 
     return (total_sloc, total_physical_lines)
 

From be0d0c12d611a3699eeea0646bcd492ac0b1e44c Mon Sep 17 00:00:00 2001
From: Jason Sewall <jason.sewall@intel.com>
Date: Fri, 19 Jun 2020 12:17:27 -0700
Subject: [PATCH 45/49] Bump version number in setup.py to 1.05

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 2fdbaf1..6fc5d44 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@
 from setuptools import setup
 
 setup(name='codebasin',
-      version='1.0',
+      version='1.05',
       description='Code Base Investigator',
       author='John Pennycook',
       author_email='john.pennycook@intel.com',

From 17218f2a8a7cda706e06fad31e36f379eacefd1d Mon Sep 17 00:00:00 2001
From: Jason Sewall <jason.sewall@intel.com>
Date: Fri, 19 Jun 2020 12:18:17 -0700
Subject: [PATCH 46/49] Bump python_requires in setup.py to 3.6

We use f-strings, which require Python 3.6 or later.
---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 6fc5d44..1446a5c 100644
--- a/setup.py
+++ b/setup.py
@@ -18,7 +18,7 @@
                    'License :: OSI Approved :: BSD License',
                    'Programming Language :: Python',
                    'Topic :: Software Development'],
-      python_requires='>=3.4',
+      python_requires='>=3.6',
       install_requires=['numpy',
                         'matplotlib',
                         'pyyaml',

From b5a808a4c148e4692c02f5e0eff4d5f5ea15753a Mon Sep 17 00:00:00 2001
From: Jason Sewall <jason.sewall@intel.com>
Date: Fri, 19 Jun 2020 12:36:37 -0700
Subject: [PATCH 47/49] Fix typos in exception strings

---
 codebasin/file_source.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/codebasin/file_source.py b/codebasin/file_source.py
index f625bf4..e887b98 100644
--- a/codebasin/file_source.py
+++ b/codebasin/file_source.py
@@ -165,7 +165,8 @@ def logical_newline(self):
         elif self.state[-1] == "IN_BLOCK_COMMENT_FOUND_STAR":
             self.state.pop()
             if not self.state[-1] == "IN_BLOCK_COMMENT":
-                raise RuntimeError("Inconsistent parser state! Looking for / to terminates a block comment but not in a block comment!")
+                raise RuntimeError(
+                    "Inconsistent parser state! Looking for / to terminate a block comment but not in a block comment!")
         elif self.state[-1] == "CPP_DIRECTIVE":
             self.state = ["TOPLEVEL"]
 
@@ -255,13 +256,15 @@ def process(self, lineiter):
                 if char == '/':
                     self.state.pop()
                     if not self.state[-1] == "IN_BLOCK_COMMENT":
-                        raise RuntimeError("Inconsistent parser state! Looking for / to terminates a block comment but not in a block comment!")
+                        raise RuntimeError(
+                            "Inconsistent parser state! Looking for / to terminate a block comment but not in a block comment!")
                     self.state.pop()
                     self.outbuf.append_space()
                 elif char != '*':
                     self.state.pop()
                     if not self.state[-1] == "IN_BLOCK_COMMENT":
-                        raise RuntimeError("Inconsistent parser Looking for * that terminates a block comment but not in a block comment!")
+                        raise RuntimeError(
+                            "Inconsistent parser state! Looking for * that terminates a block comment but not in a block comment!")
             elif self.state[-1] == "ESCAPING":
                 self.outbuf.append_nonspace(char)
                 self.state.pop()
@@ -572,7 +575,8 @@ def fortran_file_source(fp, relaxed=False):
 
     total_sloc += curr_line.physical_reset()
     if not relaxed and not cleaner.state == ["TOPLEVEL"]:
-        raise RuntimeError("Fortran file parser did not end at top level, and not in 'relaxed' mode")
+        raise RuntimeError(
+            "Fortran file parser did not end at top level, and not in 'relaxed' mode")
 
     return (total_sloc, total_physical_lines)
 

From b1d1fafe381afde509e674f6e232616e005fa445 Mon Sep 17 00:00:00 2001
From: Jason Sewall <jason.sewall@intel.com>
Date: Fri, 10 Jul 2020 06:52:51 -0700
Subject: [PATCH 48/49] Use realpath in sloc_translate to resolve symlinks

This lets us check the extension of the real file rather than that of a symlink pointing to it.
---
 etc/sloc_translate.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/etc/sloc_translate.py b/etc/sloc_translate.py
index f252d96..86d4a99 100755
--- a/etc/sloc_translate.py
+++ b/etc/sloc_translate.py
@@ -43,9 +43,10 @@ def walk_sloc(in_root, extensions, verbose=False):
     """
     Run file_sloc on each file that matches regexp under root path.
     """
+    in_root = os.path.realpath(in_root)
     for root, _, files in os.walk(in_root):
         for current_file in files:
-            full_path = os.path.join(root, current_file)
+            full_path = os.path.realpath(os.path.join(root, current_file))
             if Path(full_path).suffix in extensions:
                 try:
                     (filename, total_sloc, physical_loc) = file_sloc(full_path)
@@ -60,7 +61,8 @@ def sloc_translate(args):
     Toplevel routine for script.
     """
     if len(args) == 2:
-        (filename, total_sloc, physical_loc) = file_sloc(args[1], verbose=True)
+        path = os.path.realpath(args[1])
+        (filename, total_sloc, physical_loc) = file_sloc(path, verbose=True)
         print(f"{filename}, {total_sloc}, {physical_loc}")
     elif len(args) == 3:
         cleaned = [f".{x}" for x in args[2].split(',')]

From a046504bd596a13d4991e30dd08ed4e75e9a3f4b Mon Sep 17 00:00:00 2001
From: Jason Sewall <jason.sewall@intel.com>
Date: Fri, 10 Jul 2020 07:22:14 -0700
Subject: [PATCH 49/49] Add safe_open_read_nofollow and use it

This wraps open() for reading but prevents us from following links. By
using os.path.realpath to resolve links first, then checking
extensions, and then using this function, we can be certain that we
don't read a file with a bogus extension.
---
 codebasin/config.py       | 4 ++--
 codebasin/file_parser.py  | 6 ++++--
 codebasin/preprocessor.py | 2 +-
 codebasin/util.py         | 6 ++++++
 etc/sloc_translate.py     | 3 ++-
 5 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/codebasin/config.py b/codebasin/config.py
index 4cb49aa..3a6516d 100644
--- a/codebasin/config.py
+++ b/codebasin/config.py
@@ -134,7 +134,7 @@ def load_database(dbpath, rootdir):
     Return a list of compilation commands, where each command is
     represented as a compilation database entry.
     """
-    with open(dbpath, 'r') as fi:
+    with util.safe_open_read_nofollow(dbpath, 'r') as fi:
         db = yaml.safe_load(fi)
 
     configuration = []
@@ -244,7 +244,7 @@ def load(config_file, rootdir):
     Return a (codebase, platform configuration) tuple of dicts.
     """
     if os.path.isfile(config_file):
-        with open(config_file, 'r') as f:
+        with util.safe_open_read_nofollow(config_file, 'r') as f:
             config = yaml.safe_load(f)
     else:
         raise RuntimeError("Could not open {!s}.".format(config_file))
diff --git a/codebasin/file_parser.py b/codebasin/file_parser.py
index 86dfb8f..97ef72c 100644
--- a/codebasin/file_parser.py
+++ b/codebasin/file_parser.py
@@ -5,8 +5,10 @@
 and building a tree of nodes from it.
 """
 
+import os
 from codebasin.file_source import get_file_source
 from . import preprocessor  # pylint : disable=no-name-in-module
+from . import util  # pylint: disable=no-name-in-module
 
 
 class LineGroup:
@@ -76,7 +78,7 @@ class FileParser:
     """
 
     def __init__(self, _filename):
-        self._filename = _filename
+        self._filename = os.path.realpath(_filename)
 
     @staticmethod
     def handle_directive(out_tree, groups, logical_line):
@@ -129,7 +131,7 @@ def parse_file(self):
         if not file_source:
             raise RuntimeError(f"{self._filename} doesn't appear " +
                                "to be a language this tool can process")
-        with open(self._filename, mode='r', errors='replace') as source_file:
+        with util.safe_open_read_nofollow(self._filename, mode='r', errors='replace') as source_file:
 
             groups = {'code': LineGroup(),
                       'directive': LineGroup(),
diff --git a/codebasin/preprocessor.py b/codebasin/preprocessor.py
index 15f19e9..2b0088d 100644
--- a/codebasin/preprocessor.py
+++ b/codebasin/preprocessor.py
@@ -443,7 +443,7 @@ def __init__(self, _filename):
     def __compute_file_hash(self):
         chunk_size = 4096
         hasher = hashlib.sha512()
-        with open(self.filename, 'rb') as in_file:
+        with util.safe_open_read_nofollow(self.filename, 'rb') as in_file:
             for chunk in iter(lambda: in_file.read(chunk_size), b""):
                 hasher.update(chunk)
 
diff --git a/codebasin/util.py b/codebasin/util.py
index b65f6cc..67bad44 100644
--- a/codebasin/util.py
+++ b/codebasin/util.py
@@ -50,6 +50,12 @@ def safe_open_write_binary(fname):
     return os.fdopen(fpid, "wb")
 
 
+def safe_open_read_nofollow(fname, *args, **kwargs):
+    """Open fname read-only with O_NOFOLLOW; return os.fdopen(fd, *args, **kwargs)."""
+    fpid = os.open(fname, os.O_RDONLY | os.O_NOFOLLOW)
+    return os.fdopen(fpid, *args, **kwargs)  # NOTE(review): fpid is not closed if fdopen raises
+
+
 def valid_path(path):
     """Return true if the path passed in is valid"""
     valid = True
diff --git a/etc/sloc_translate.py b/etc/sloc_translate.py
index 86d4a99..6a28382 100755
--- a/etc/sloc_translate.py
+++ b/etc/sloc_translate.py
@@ -14,6 +14,7 @@
 
 # pylint: disable=wrong-import-position
 from codebasin.file_source import get_file_source  # nopep8
+from codebasin.util import safe_open_read_nofollow  # nopep8
 
 
 def file_sloc(path, verbose=False):
@@ -23,7 +24,7 @@ def file_sloc(path, verbose=False):
     file_source = get_file_source(path)
     if not file_source:
         raise RuntimeError(f"{path} doesn't appear to be a language this tool can process")
-    with open(path, mode='r', errors='replace') as source_file:
+    with safe_open_read_nofollow(path, mode='r', errors='replace') as source_file:
         walker = file_source(source_file, relaxed=False)
         try:
             while True: