[codeface] [PATCH 6/9] Extract parsing functionality of _getFeatureLines in separate functions to make them unit-testable.

From: Matthias Dittrich <matthi.d@xxxxxxxxxxxxxx>
To: codeface@xxxxxxxxxxxxx
Date: Fri, 17 Oct 2014 15:14:23 +0200
- parseFeatureLine function to parse a single line of cppstats output.
- Added FileDict class to encapsulate information about the features of a source code file.
- getFeatureLines function to generate a FileDict instance from all parsed lines of cppstats output.

Signed-off-by: Matthias Dittrich <matthi.d@xxxxxxxxx>
---
 codeface/VCS.py        | 217 +++++++++++++++++++++++++++++++++++--------------
 codeface/fileCommit.py |  72 +++++++++++++---
 2 files changed, 217 insertions(+), 72 deletions(-)

diff --git a/codeface/VCS.py b/codeface/VCS.py
index a62c72a..8bf7022 100644
--- a/codeface/VCS.py
+++ b/codeface/VCS.py
@@ -34,6 +34,8 @@
 # VCS-specific.
 # TODO: Unify range handling. Either a range is always a list, or always
 # represented by two parameters.
+import itertools
+import readline
 
 import commit
 import fileCommit
@@ -44,6 +46,7 @@ import ctags
 import tempfile
 import source_analysis
 import shutil
+from fileCommit import FileDict
 from progressbar import ProgressBar, Percentage, Bar, ETA
 from ctags import CTags, TagEntry
 from logging import getLogger; log = getLogger(__name__)
@@ -183,6 +186,140 @@ class VCS:
         return subsys=="__main__" or subsys in self.subsys_description.keys()
 
 
+def parseSepLine (line):
+    if not line.startswith("\"sep="):
+        raise ParseError(
+                    "expected that the csv file header starts with '\"sep=' 
but it started with '{}'"
+                    .format(line), 'CSVFile')
+    stripped = line.rstrip()
+    if not stripped.endswith("\""):
+        raise ParseError(
+                    "expected that the csv file header ends with '\"' but the 
line was '{}'"
+                    .format(line), 'CSVFile')
+    return stripped[5:-1]
+
+def parseline(sep, line):
+    """
+    Parses a line from a csv file
+    :param sep:
+    :param line:
+    :return:
+    """
+    # TODO: Handle escaping: sep is escaped with quotes, quotes are escaped 
with quotes
+    # 'test,test' will be '"test,test"' in the csv file
+    # 'test"this,"test' will be '"test""this,""test"' in the csv file
+    return [l.strip() for l in line.split(sep)]
+
+
+def enum(*sequential, **named):
+    enums = dict(zip(sequential, range(len(sequential))), **named)
+    reverse = dict((value, key) for key, value in enums.iteritems())
+    enums['reverse_mapping'] = reverse
+    return type('Enum', (), enums)
+
+
+LineType = enum(IF='#if', ELSE='#else', ELIF='#elif')
+
+
+def parseFeatureLine(sep, line):
+    """
+    parse the current line which is something like: feature_list, start_line, 
end_line
+    :param line: the line to parse
+    :return: start_line, end_line, feature_list
+    """
+    parsedline = parseline(sep, line)
+    # FILENAME,LINE_START,LINE_END,TYPE,EXPRESSION,CONSTANTS
+    try:
+        start_line = int(parsedline[1])
+        end_line = int(parsedline[2])
+        line_type = LineType.reverse_mapping[parsedline[3]]
+        feature_list = parsedline[5].split(';')
+        return start_line, end_line, line_type, feature_list
+    except ValueError:
+        raise ParseError(
+                    "could not parse feature line (most likely because we 
could not parse the start- or end-line which should be on index 2 and 3): 
\"{}\""
+                    .format(line), 'CSVFile')
+
+
+
+
+
+
+def getFeatureLines(parsed_lines, filename):
+    """
+    calculates an dictionary representing changes in the current feature set 
and a sorted helper list for calculating
+    indices of the dictionary for any line
+    :param parsed_lines: a list of tuples with (start_line, end_line, 
feature_list) elements
+    :param filename: the name or the analysed files (only used for descriptive 
error messages if the calculation fails
+    :return: line_nums, feature_lines: the first is a sorted list to be able 
to access the feature list for any line
+    (while only changing lines are in the dictionary)
+    """
+    # mapping line -> feature list, we only add changing elements
+    feature_lines = FileDict()
+    feature_lines.add_line(0, [])
+
+    # we want a format like (is_start, features) for every changing line
+    better_format = {}
+    # We assume that every line is used at most once as start_line or end_line
+
+    def check_line(line):
+        if line in better_format:
+            raise ParseError(
+                "every line index can be used at most once (problematic line 
was {0} in file {1})"
+                .format(line, filename), filename)
+
+    for start_line, end_line, line_type, feature_list in parsed_lines:
+        if start_line >= end_line:
+            raise ParseError(
+                "start_line can't be greater or equal to end_line (problematic 
line was {0} in file {1})"
+                .format(start_line, filename), filename)
+        if line_type == 'IF':
+            # ifs start on their own line, however the end_line could already 
be used by the start of an else/elif
+            check_line(start_line)
+            if end_line in better_format:
+                end_line -= 1
+            check_line(end_line)
+            better_format[start_line] = (True, feature_list)
+            better_format[end_line] = (False, feature_list)
+        else:
+            # we try to mostly ignore else and elif if the feature_list 
doesn't change
+            is_start, old_feature_list = better_format[start_line]
+            if (not is_start) and old_feature_list == feature_list:
+                # moving to the new end
+                del better_format[start_line]
+                better_format[end_line] = is_start, old_feature_list
+            elif is_start:
+                raise ParseError(
+                    "line {0} appeared twice as start line (problematic file 
was {1})"
+                    .format(start_line, filename), filename)
+            else:
+                # So we have a elif with different features,
+                # so we start more features now end add them to the ending 
later
+                del better_format[start_line]
+                better_format[start_line] = (True, feature_list)
+                better_format[end_line] = (False, old_feature_list + 
feature_list)
+
+    for line in sorted(better_format):
+        is_start, features = better_format[line]
+        # Get last infos
+        last_feature_list = feature_lines.get_line_info_raw(line)
+        # Copy last list and create new list for current line
+        new_feature_list = list(last_feature_list)
+        if is_start:
+            for r in features:
+                new_feature_list.insert(0, r)
+        else:
+            for r in reversed(features):
+                item = new_feature_list.pop(0)
+                assert(item == r)
+                ##new_feature_list.remove(r)
+            # Remove in next line (because we want to count the current #endif 
line as well).
+            line += 1
+
+        feature_lines.add_line(line, new_feature_list)
+    return feature_lines
+
+
 class gitVCS (VCS):
     def __init__(self):
         VCS.__init__(self) # Python OOP braindamage
@@ -1070,10 +1207,10 @@ class gitVCS (VCS):
 
 
     def _getFeatureLines(self, file_layout_src, file_commit):
-        '''
+        """
         similar to _getFunctionLines but computes the line numbers of each
         feature in the file.
-        '''
+        """
         '''
         - Input -
         file_name: original name of the file, used only to determine the
@@ -1093,7 +1230,7 @@ class gitVCS (VCS):
 
         # temporary file where we write transient data needed for ctags
         srcFile = tempfile.NamedTemporaryFile(suffix=fileExt)
-        featurefile = tempfile.NamedTemporaryFile()
+        featurefile = tempfile.NamedTemporaryFile(suffix=".csv")
         # generate a source code file from the file_layout_src dictionary
         # and save it to a temporary location
         for line in file_layout_src:
@@ -1101,68 +1238,28 @@ class gitVCS (VCS):
         srcFile.flush()
 
         # run cppstats analysis on the file to get the feature locations
-        cmd = "cppstats -f {0} {1}".format(featurefile.name, 
srcFile.name).split()
-        output = execute_command(cmd).splitlines()
-
-        # mapping line -> feature list, we only add changing elements
-        feature_lines = {0: []}
-        # Helper list to get the last element of feature_lines (which contains 
only lines with changes)
-        line_nums = [0]
-
-        def parse_result_line(line):
-            """
-            parse the current line which is something like: feature_list, 
start_line, end_line
-            :param line: the line to parse
-            :return: start_line, end_line, feature_list
-            """
-            start_line = 0
-            end_line = 0
-            feature_list = {}
-            return start_line, end_line, feature_list
-
-        try:
-            results_file = open(featurefile.name, 'r')
-            parsed_lines = [parse_result_line(featureLine) for featureLine in 
results_file]
-            # we want a format like (is_start, features) for every changing 
line
-            better_format = {}
-            # We assume that every line is used at most once as start_line or 
end_line
-
-            def check_line(line):
-                if line in better_format:
-                    raise ParseError(
-                        "every line index can be used at most once 
(problematic line was {0} in file {1})"
-                        .format(line, file_commit.filename))
-
-            for start_line, end_line, feature_list in parsed_lines:
-                check_line(start_line)
-                check_line(end_line)
-                better_format[start_line] = (True, feature_list)
-                better_format[end_line] = (False, feature_list)
-
-            for line in sorted(better_format):
-                is_start, features = better_format[line]
-                # Get last line
-                line_nums.append(line)
-                last_feature_list_line = bisect.bisect_right(line_nums, line)
-                last_feature_list = feature_lines[last_feature_list_line-1]
-                # Copy last list and create new list for current line
-                new_feature_list = list(last_feature_list)
-                if is_start:
-                    new_feature_list.extend(features)
-                else:
-                    for r in features:
-                        new_feature_list.remove(r)
-                feature_lines[line] = new_feature_list
-        except:
-            log.critical("was unable unable to parse feature information of 
cppstats")
-            raise
-
+        # TODO: fix hardcoded paths
+        # BUG: THIS IS VERY BAD HARDCODED CODE AND SHOULD BE FIXED,
+        # HOWEVER IT IS NOT CLEAR HOW CPPSTATS IS DISTRUBUTED NOR WHERE IT 
LIVES
+        oldPath = os.getenv("PYTHONPATH")
+        os.putenv("PYTHONPATH", "/home/drag0on/projects/cppstats/lib")
+        cmd = "/usr/bin/env python /home/drag0on/projects/cppstats/cppstats.py 
--kind featurelocations --file {0} {1}"\
+            .format(srcFile.name, featurefile.name).split()
+        output = execute_command(cmd, 
cwd="/home/drag0on/projects/cppstats").splitlines()
+        os.putenv("PYTHONPATH", oldPath)
+
+        results_file = open(featurefile.name, 'r')
+        sep = parseSepLine(next(results_file))
+        headlines = parseline(sep, next(results_file))
+        feature_lines = \
+            getFeatureLines(
+                [parseFeatureLine(sep, line) for line in results_file], 
file_commit.filename)
         # clean up temporary files
         srcFile.close()
         featurefile.close()
 
         # save result to the file commit instance
-        file_commit.setFeatureLines(line_nums, feature_lines)
+        file_commit.set_feature_infos(feature_lines)
 
     def cmtHash2CmtObj(self, cmtHash):
         '''
diff --git a/codeface/fileCommit.py b/codeface/fileCommit.py
index 956307a..2b8af8a 100644
--- a/codeface/fileCommit.py
+++ b/codeface/fileCommit.py
@@ -23,6 +23,61 @@ single file.'''
 import commit
 import bisect
 
+
+class FileDict:
+    """
+    A dictionary saving per-line information. We assume that we have 
information on any line,
+    and that we only have to save changing lines.
+    """
+    def __init__(self, line_list, line_dict):
+        """
+        :rtype : FileDict
+        """
+        self.line_list = line_list
+        self.line_dict = line_dict
+        self.lastItem = line_list[-1]
+
+    def __init__(self):
+        """
+        :rtype : FileDict
+        """
+        self.line_list = []
+        self.line_dict = {}
+        self.lastItem = -1
+
+    def __iter__(self):
+        return self.line_dict.__iter__()
+
+    def get_line_info_raw(self, line_nr):
+        """
+        Returns the info for the given line (if the line was never set, the 
info for the last set line is returned)
+        :param line_nr:
+        :return:
+        """
+        i = bisect.bisect_right(self.line_list, line_nr)
+        info_line = self.line_list[i-1]
+        return self.line_dict[info_line]
+
+    def get_line_info(self, line_nr):
+        return set(self.get_line_info_raw(line_nr))
+
+    def add_line(self, line_nr, info):
+        """
+        Add the given information to the current dictionary.
+        Note: while filling the dictionary your line_nr has to be incremented!
+        :param line_nr:
+        :param info:
+        :return:
+        """
+        if line_nr < self.lastItem:
+            raise ValueError("can only incrementally add items")
+        self.line_list.append(line_nr)
+        self.line_dict[line_nr] = info
+
+    def values(self):
+        return self.line_dict.values()
+
+
 class FileCommit:
     def __init__(self):
 
@@ -58,11 +113,7 @@ class FileCommit:
         self._src_elem_list = []
 
         # dictionary with key = line number, value = feature list
-        self.featureLists = {}
-
-        # list of function line numbers in sorted order, this is for
-        # optimizing the process of finding a feature list given a line number
-        self.featureLineNums = [0]
+        self.feature_info = FileDict()
 
     #Getter/Setters
     def getFileSnapShots(self):
@@ -91,9 +142,9 @@ class FileCommit:
     def setSrcElems(self, src_elem_list):
         self._src_elem_list.extend(src_elem_list)
 
-    def setFeatureLines(self, featureLineNums, featureLists):
-        self.featureLists.update(featureLists)
-        self.featureLineNums = featureLineNums  # 
.extend(sorted(self.featureLists.iterkeys()))
+    def set_feature_infos(self, feature_line_infos):
+        self.feature_info = feature_line_infos
+
     #Methods
     def addFileSnapShot(self, key, dict):
         self.fileSnapShots[key] = dict
@@ -128,7 +179,4 @@ class FileCommit:
         self.functionImpl[id].append(srcLine)
 
     def findFeatureList(self, lineNum):
-        # returns the identifier of a feature given a line number
-        i = bisect.bisect_right(self.featureLineNums, lineNum)
-        featureLine = self.featureLineNums[i-1]
-        return self.featureLists[featureLine]
+        return self.feature_info.get_line_info(lineNum)
-- 
1.8.5.5
References:
- [codeface] Re: Feature-aware Collaboration Analysis with Codeface
  - From: Wolfgang Mauerer
- [codeface] [PATCH 0/9] Feature-aware Collaboration Analysis
  - From: Matthias Dittrich
[codeface] [PATCH 6/9] Extract parsing functionality of _getFeatureLines in separate functions to make them unit-testable.

Other related posts: