[codeface] Re: [PATCH 6/9] Extract parsing functionality of _getFeatureLines in separate functions to make them unit-testable.

From: Matthias Dittrich <matthi.d@xxxxxxxxxxxxxx>
To: codeface@xxxxxxxxxxxxx
Date: Tue, 18 Nov 2014 15:08:13 +0100

Hi Wolfgang,

On 17.11.2014 17:07, Wolfgang Mauerer wrote:


Am 17/10/2014 15:14, schrieb Matthias Dittrich:

- parseFeatureLine function to parse a single line of cppstats output.
- Added FileDict class to encapsulate information about the features of a 
source code file.
- parseFeatureLines function to generate a FileDict instance from all lines of 
cppstats output.

Signed-off-by: Matthias Dittrich <matthi.d@xxxxxxxxx>
---
  codeface/VCS.py        | 217 +++++++++++++++++++++++++++++++++++--------------
  codeface/fileCommit.py |  72 +++++++++++++---
  2 files changed, 217 insertions(+), 72 deletions(-)

diff --git a/codeface/VCS.py b/codeface/VCS.py
index a62c72a..8bf7022 100644
--- a/codeface/VCS.py
+++ b/codeface/VCS.py
@@ -34,6 +34,8 @@
  # VCS-specific.
  # TODO: Unify range handling. Either a range is always a list, or always
  # represented by two parameters.
+import itertools
+import readline

import commit

  import fileCommit
@@ -44,6 +46,7 @@ import ctags
  import tempfile
  import source_analysis
  import shutil
+from fileCommit import FileDict
  from progressbar import ProgressBar, Percentage, Bar, ETA
  from ctags import CTags, TagEntry
  from logging import getLogger; log = getLogger(__name__)
@@ -183,6 +186,140 @@ class VCS:
          return subsys=="__main__" or subsys in self.subsys_description.keys()

+def parseSepLine (line):

+    if not line.startswith("\"sep="):
+        raise ParseError(
+                    "expected that the csv file header starts with '\"sep=' but it 
started with '{}'"
+                    .format(line), 'CSVFile')
+    stripped = line.rstrip()
+    if not stripped.endswith("\""):
+        raise ParseError(
+                    "expected that the csv file header ends with '\"' but the line 
was '{}'"
+                    .format(line), 'CSVFile')
+    return stripped[5:-1]
+
+def parseline(sep, line):
+    """
+    Parses a line from a csv file
+    :param sep:
+    :param line:
+    :return:
+    """
+    # TODO: Handle escaping: sep is escaped with quotes, quotes are escaped 
with quotes
+    # 'test,test' will be '"test,test"' in the csv file
+    # 'test"this,"test' will be '"test""this,""test"' in the csv file
+    return [l.strip() for l in line.split(sep)]

any reason for not using the csv module?

Does it support the initial header and possibly utf-8?

+
+
+def enum(*sequential, **named):
+    enums = dict(zip(sequential, range(len(sequential))), **named)
+    reverse = dict((value, key) for key, value in enums.iteritems())
+    enums['reverse_mapping'] = reverse
+    return type('Enum', (), enums)
+
+
+LineType = enum(IF='#if', ELSE='#else', ELIF='#elif')
+
+
+def parseFeatureLine(sep, line):
+    """
+    parse the current line which is something like: feature_list, start_line, 
end_line

really something "like" or is your list an exact specification? From
the code below, it appears that the output follows this format, not the
current line that is parsed -- this is confusing.

Depends on what 'sep' is, however cppstats in fact uses ',' for now. Thecode currently is generic enough to survive if they change the separatorbut not if they start switching columns.


Best regards, Matthias

+    :param line: the line to parse
+    :return: start_line, end_line, feature_list
+    """
+    parsedline = parseline(sep, line)
+    # FILENAME,LINE_START,LINE_END,TYPE,EXPRESSION,CONSTANTS
+    try:
+        start_line = int(parsedline[1])
+        end_line = int(parsedline[2])
+        line_type = LineType.reverse_mapping[parsedline[3]]
+        feature_list = parsedline[5].split(';')
+        return start_line, end_line, line_type, feature_list
+    except ValueError:
+        raise ParseError(
+                    "could not parse feature line (most likely because we could not parse the 
start- or end-line which should be on index 2 and 3): \"{}\""
+                    .format(line), 'CSVFile')
+
+
+
+
+
+
+def getFeatureLines(parsed_lines, filename):
+    """
+    calculates an dictionary representing changes in the current feature set 
and a sorted helper list for calculating
+    indices of the dictionary for any line
+    :param parsed_lines: a list of tuples with (start_line, end_line, 
feature_list) elements
+    :param filename: the name or the analysed files (only used for descriptive 
error messages if the calculation fails

or->of

+    :return: line_nums, feature_lines: the first is a sorted list to be able 
to access the feature list for any line
+    (while only changing lines are in the dictionary)

please break lines at 70-75 characters

+    """
+    # mapping line -> feature list, we only add changing elements
+    feature_lines = FileDict()
+    feature_lines.add_line(0, [])
+
+    # we want a format like (is_start, features) for every changing line
+    better_format = {}
+    # We assume that every line is used at most once as start_line or end_line
+
+    def check_line(line):
+        if line in better_format:
+            raise ParseError(
+                "every line index can be used at most once (problematic line was 
{0} in file {1})"
+                .format(line, filename), filename)
+
+    for start_line, end_line, line_type, feature_list in parsed_lines:
+        if start_line >= end_line:
+            raise ParseError(
+                "start_line can't be greater or equal to end_line (problematic line 
was {0} in file {1})"
+                .format(start_line, filename), filename)
+        if line_type == 'IF':
+            # ifs start on their own line, however the end_line could already 
be used by the start of an else/elif
+            check_line(start_line)
+            if end_line in better_format:
+                end_line -= 1
+            check_line(end_line)
+            better_format[start_line] = (True, feature_list)
+            better_format[end_line] = (False, feature_list)
+        else:
+            # we try to mostly ignore else and elif if the feature_list 
doesn't change
+            is_start, old_feature_list = better_format[start_line]
+            if (not is_start) and old_feature_list == feature_list:
+                # moving to the new end
+                del better_format[start_line]
+                better_format[end_line] = is_start, old_feature_list
+            elif is_start:
+                raise ParseError(
+                    "line {0} appeared twice as start line (problematic file was 
{1})"
+                    .format(start_line, filename), filename)
+            else:
+                # So we have a elif with different features,
+                # so we start more features now end add them to the ending 
later
+                del better_format[start_line]
+                better_format[start_line] = (True, feature_list)
+                better_format[end_line] = (False, old_feature_list + 
feature_list)
+
+    for line in sorted(better_format):
+        is_start, features = better_format[line]
+        # Get last infos
+        last_feature_list = feature_lines.get_line_info_raw(line)
+        # Copy last list and create new list for current line
+        new_feature_list = list(last_feature_list)
+        if is_start:
+            for r in features:
+                new_feature_list.insert(0, r)
+        else:
+            for r in reversed(features):
+                item = new_feature_list.pop(0)
+                assert(item == r)
+                ##new_feature_list.remove(r)
+            # Remove in next line (because we want to count the current #endif 
line as well).
+            line += 1
+
+        feature_lines.add_line(line, new_feature_list)
+    return feature_lines
+
+
  class gitVCS (VCS):
      def __init__(self):
          VCS.__init__(self) # Python OOP braindamage
@@ -1070,10 +1207,10 @@ class gitVCS (VCS):

def _getFeatureLines(self, file_layout_src, file_commit):

-        '''
+        """
          similar to _getFunctionLines but computes the line numbers of each
          feature in the file.
-        '''
+        """
          '''
          - Input -
          file_name: original name of the file, used only to determine the
@@ -1093,7 +1230,7 @@ class gitVCS (VCS):

# temporary file where we write transient data needed for ctags

          srcFile = tempfile.NamedTemporaryFile(suffix=fileExt)
-        featurefile = tempfile.NamedTemporaryFile()
+        featurefile = tempfile.NamedTemporaryFile(suffix=".csv")
          # generate a source code file from the file_layout_src dictionary
          # and save it to a temporary location
          for line in file_layout_src:
@@ -1101,68 +1238,28 @@ class gitVCS (VCS):
          srcFile.flush()

# run cppstats analysis on the file to get the feature locations

-        cmd = "cppstats -f {0} {1}".format(featurefile.name, 
srcFile.name).split()
-        output = execute_command(cmd).splitlines()
-
-        # mapping line -> feature list, we only add changing elements
-        feature_lines = {0: []}
-        # Helper list to get the last element of feature_lines (which contains 
only lines with changes)
-        line_nums = [0]
-
-        def parse_result_line(line):
-            """
-            parse the current line which is something like: feature_list, 
start_line, end_line
-            :param line: the line to parse
-            :return: start_line, end_line, feature_list
-            """
-            start_line = 0
-            end_line = 0
-            feature_list = {}
-            return start_line, end_line, feature_list
-
-        try:
-            results_file = open(featurefile.name, 'r')
-            parsed_lines = [parse_result_line(featureLine) for featureLine in 
results_file]
-            # we want a format like (is_start, features) for every changing 
line
-            better_format = {}
-            # We assume that every line is used at most once as start_line or 
end_line
-
-            def check_line(line):
-                if line in better_format:
-                    raise ParseError(
-                        "every line index can be used at most once (problematic 
line was {0} in file {1})"
-                        .format(line, file_commit.filename))
-
-            for start_line, end_line, feature_list in parsed_lines:
-                check_line(start_line)
-                check_line(end_line)
-                better_format[start_line] = (True, feature_list)
-                better_format[end_line] = (False, feature_list)
-
-            for line in sorted(better_format):
-                is_start, features = better_format[line]
-                # Get last line
-                line_nums.append(line)
-                last_feature_list_line = bisect.bisect_right(line_nums, line)
-                last_feature_list = feature_lines[last_feature_list_line-1]
-                # Copy last list and create new list for current line
-                new_feature_list = list(last_feature_list)
-                if is_start:
-                    new_feature_list.extend(features)
-                else:
-                    for r in features:
-                        new_feature_list.remove(r)
-                feature_lines[line] = new_feature_list
-        except:
-            log.critical("was unable unable to parse feature information of 
cppstats")
-            raise

so you're refactoring a bit by moving code from the git specific
VCS class to the base class. Can you please note that in the
commit description? When reading the patch, I was a bit confused
at first to see code being added, and then the same code being
removed before I realised we're talking about different classes.

-
+        # TODO: fix hardcoded paths
+        # BUG: THIS IS VERY BAD HARDCODED CODE AND SHOULD BE FIXED,

indeed ;)

+        # HOWEVER IT IS NOT CLEAR HOW CPPSTATS IS DISTRUBUTED NOR WHERE IT 
LIVES

please mention in the installation document that cppstats is required
and where the source can be obtained. Then, ensure at the beginning of
the analysis run that the tool is available, and exit as early as
possible with an error message if it is not -- this avoids any partial
results in the database, which usually leads to hard to track errors.

+        oldPath = os.getenv("PYTHONPATH")
+        os.putenv("PYTHONPATH", "/home/drag0on/projects/cppstats/lib")
+        cmd = "/usr/bin/env python /home/drag0on/projects/cppstats/cppstats.py 
--kind featurelocations --file {0} {1}"\
+            .format(srcFile.name, featurefile.name).split()
+        output = execute_command(cmd, 
cwd="/home/drag0on/projects/cppstats").splitlines()
+        os.putenv("PYTHONPATH", oldPath)
+
+        results_file = open(featurefile.name, 'r')
+        sep = parseSepLine(next(results_file))
+        headlines = parseline(sep, next(results_file))
+        feature_lines = \
+            getFeatureLines(
+                [parseFeatureLine(sep, line) for line in results_file], 
file_commit.filename)
          # clean up temporary files
          srcFile.close()
          featurefile.close()

# save result to the file commit instance

-        file_commit.setFeatureLines(line_nums, feature_lines)
+        file_commit.set_feature_infos(feature_lines)

def cmtHash2CmtObj(self, cmtHash):

          '''
diff --git a/codeface/fileCommit.py b/codeface/fileCommit.py
index 956307a..2b8af8a 100644
--- a/codeface/fileCommit.py
+++ b/codeface/fileCommit.py
@@ -23,6 +23,61 @@ single file.'''
  import commit
  import bisect

+

+class FileDict:
+    """
+    A dictionary saving per-line information. We assume that we have 
information on any line,
+    and that we only have to save changing lines.

with "changing line", you mean a line that's affected by a particular
commit? Maybe rephrase the comment a bit for clarity.

+    """
+    def __init__(self, line_list, line_dict):
+        """
+        :rtype : FileDict
+        """
+        self.line_list = line_list
+        self.line_dict = line_dict
+        self.lastItem = line_list[-1]
+
+    def __init__(self):
+        """
+        :rtype : FileDict
+        """
+        self.line_list = []
+        self.line_dict = {}
+        self.lastItem = -1
+
+    def __iter__(self):
+        return self.line_dict.__iter__()
+
+    def get_line_info_raw(self, line_nr):
+        """
+        Returns the info for the given line (if the line was never set, the 
info for the last set line is returned)

line > 75 chars

+        :param line_nr:
+        :return:
+        """
+        i = bisect.bisect_right(self.line_list, line_nr)
+        info_line = self.line_list[i-1]
+        return self.line_dict[info_line]
+
+    def get_line_info(self, line_nr):
+        return set(self.get_line_info_raw(line_nr))
+
+    def add_line(self, line_nr, info):
+        """
+        Add the given information to the current dictionary.
+        Note: while filling the dictionary your line_nr has to be incremented!

with "your line_nr", you mean the variable used to indicate
the currently parsed line in the caller's context? If yes, please
clarify the comment a bit,

+        :param line_nr:
+        :param info:
+        :return:
+        """
+        if line_nr < self.lastItem:
+            raise ValueError("can only incrementally add items")
+        self.line_list.append(line_nr)
+        self.line_dict[line_nr] = info
+
+    def values(self):
+        return self.line_dict.values()
+
+
  class FileCommit:
      def __init__(self):

@@ -58,11 +113,7 @@ class FileCommit:

          self._src_elem_list = []

# dictionary with key = line number, value = feature list

-        self.featureLists = {}
-
-        # list of function line numbers in sorted order, this is for
-        # optimizing the process of finding a feature list given a line number
-        self.featureLineNums = [0]
+        self.feature_info = FileDict()

#Getter/Setters

      def getFileSnapShots(self):
@@ -91,9 +142,9 @@ class FileCommit:
      def setSrcElems(self, src_elem_list):
          self._src_elem_list.extend(src_elem_list)

- def setFeatureLines(self, featureLineNums, featureLists):

-        self.featureLists.update(featureLists)
-        self.featureLineNums = featureLineNums  # 
.extend(sorted(self.featureLists.iterkeys()))
+    def set_feature_infos(self, feature_line_infos):
+        self.feature_info = feature_line_infos
+
      #Methods
      def addFileSnapShot(self, key, dict):
          self.fileSnapShots[key] = dict
@@ -128,7 +179,4 @@ class FileCommit:
          self.functionImpl[id].append(srcLine)

def findFeatureList(self, lineNum):

-        # returns the identifier of a feature given a line number
-        i = bisect.bisect_right(self.featureLineNums, lineNum)
-        featureLine = self.featureLineNums[i-1]
-        return self.featureLists[featureLine]
+        return self.feature_info.get_line_info(lineNum)

Follow-Ups:
- [codeface] Re: [PATCH 6/9] Extract parsing functionality of _getFeatureLines in separate functions to make them unit-testable.
  - From: Wolfgang Mauerer

References:
- [codeface] Re: [PATCH 6/9] Extract parsing functionality of _getFeatureLines in separate functions to make them unit-testable.
  - From: Wolfgang Mauerer

[codeface] Re: [PATCH 6/9] Extract parsing functionality of _getFeatureLines in separate functions to make them unit-testable.

Other related posts: