[codeface] Re: [PATCH 5/9] Add a _getFeatureLines function which calculates feature sets for all source code lines.

  • From: Matthias Dittrich <matthi.d@xxxxxxxxxxxxxx>
  • To: codeface@xxxxxxxxxxxxx
  • Date: Tue, 18 Nov 2014 14:47:27 +0100

Hi,

On 17.11.2014 16:33, Wolfgang Mauerer wrote:
Am 17/10/2014 15:14, schrieb Matthias Dittrich:
- The _getFeatureLines function calculates the feature sets in a similar way 
to how _getFunctionsLines
calculates the current functions for all source code lines.
- Added some fields to codeface/fileCommit.py to save those results.

Signed-off-by: Matthias Dittrich <matthi.d@xxxxxxxxx>
---
  codeface/VCS.py        | 96 ++++++++++++++++++++++++++++++++++++++++++++++++++
  codeface/fileCommit.py | 16 +++++++++
  2 files changed, 112 insertions(+)

diff --git a/codeface/VCS.py b/codeface/VCS.py
index 5229981..a62c72a 100644
--- a/codeface/VCS.py
+++ b/codeface/VCS.py
@@ -39,6 +39,7 @@ import commit
  import fileCommit
  import re
  import os
+import bisect
  import ctags
  import tempfile
  import source_analysis
@@ -1068,6 +1069,101 @@ class gitVCS (VCS):
           for lineNum, srcLine in enumerate(file_layout_src)]
+ def _getFeatureLines(self, file_layout_src, file_commit):
+        '''
+        similar to _getFunctionLines but computes the line numbers of each
+        feature in the file.
+        '''
+        '''
+        - Input -
+        file_name: original name of the file, used only to determine the
+                    programming language (ie. file.c is a c-language file)
if you use the file name just to determine the programming language,
would it then not make sense to store the inferred language? Or might
the filename provide some additional information in the future?
file_name is taken from the file_commit object (I will remove that part of the documentation).

+        file_layout_scr: dictionary with key=line number value = line of code
scr->src
+        file_commit: fileCommit instance where the results will be stored
+
+        - Description -
+        The file_layout is used to construct a source code file that can be
+        parsed by ctags to generate a ctags file. The ctags file is then
+        accessed to extract the function tags and line numbers to be saved in
+        the fileCommit object
+        '''
+
+        # grab the file extension to determine the language of the file
+        fileExt = os.path.splitext(file_commit.filename)[1]
+
+        # temporary file where we write transient data needed for ctags
+        srcFile = tempfile.NamedTemporaryFile(suffix=fileExt)
+        featurefile = tempfile.NamedTemporaryFile()
+        # generate a source code file from the file_layout_src dictionary
+        # and save it to a temporary location
+        for line in file_layout_src:
+            srcFile.write(line)
+        srcFile.flush()
+
+        # run cppstats analysis on the file to get the feature locations
+        cmd = "cppstats -f {0} {1}".format(featurefile.name, 
srcFile.name).split()
+        output = execute_command(cmd).splitlines()
+
+        # mapping line -> feature list, we only add changing elements
+        feature_lines = {0: []}
+        # Helper list to get the last element of feature_lines (which contains 
only lines with changes)
+        line_nums = [0]
+
+        def parse_result_line(line):
+            """
+            parse the current line which is something like: feature_list, 
start_line, end_line
+            :param line: the line to parse
+            :return: start_line, end_line, feature_list
+            """
+            start_line = 0
+            end_line = 0
+            feature_list = {}
+            return start_line, end_line, feature_list
+
+        try:
+            results_file = open(featurefile.name, 'r')
+            parsed_lines = [parse_result_line(featureLine) for featureLine in 
results_file]
+            # we want a format like (is_start, features) for every changing 
line
+            better_format = {}
so you basically annotate all code lines of interest in better_format,
right? This is not entirely well captured by the name "better_format".
Would "annotated_lines" or "feature_annotation" or something along these
lines make sense?
better_format is basically the same as parsed_lines, which is the output of cppstats, but in a more convenient format (hence the name).
I'm converting it so that the algorithm (the last for loop) stays quite simple.
annotated_lines would also work.
+            # We assume that every line is used at most once as start_line or 
end_line
+
+            def check_line(line):
+                if line in better_format:
+                    raise ParseError(
+                        "every line index can be used at most once (problematic 
line was {0} in file {1})"
+                        .format(line, file_commit.filename))
+
+            for start_line, end_line, feature_list in parsed_lines:
+                check_line(start_line)
+                check_line(end_line)
+                better_format[start_line] = (True, feature_list)
+                better_format[end_line] = (False, feature_list)
+
+            for line in sorted(better_format):
+                is_start, features = better_format[line]
+                # Get last line
+                line_nums.append(line)
+                last_feature_list_line = bisect.bisect_right(line_nums, line)
+                last_feature_list = feature_lines[last_feature_list_line-1]
+                # Copy last list and create new list for current line
+                new_feature_list = list(last_feature_list)
It's not really clear what this code does, and why. Since it seems like
one of the central parts of your contribution, could you add an example
as a comment?
I will try.
Best regards, Matthias
+                if is_start:
+                    new_feature_list.extend(features)
+                else:
+                    for r in features:
+                        new_feature_list.remove(r)
+                feature_lines[line] = new_feature_list
+        except:
+            log.critical("was unable unable to parse feature information of 
cppstats")
+            raise
+
+        # clean up temporary files
+        srcFile.close()
+        featurefile.close()
+
+        # save result to the file commit instance
+        file_commit.setFeatureLines(line_nums, feature_lines)
+
      def cmtHash2CmtObj(self, cmtHash):
          '''
          input: cmtHash
diff --git a/codeface/fileCommit.py b/codeface/fileCommit.py
index aa99e84..956307a 100644
--- a/codeface/fileCommit.py
+++ b/codeface/fileCommit.py
@@ -57,6 +57,13 @@ class FileCommit:
          # meta data
          self._src_elem_list = []
+ # dictionary with key = line number, value = feature list
+        self.featureLists = {}
+
+        # list of function line numbers in sorted order, this is for
+        # optimizing the process of finding a feature list given a line number
+        self.featureLineNums = [0]
+
      #Getter/Setters
      def getFileSnapShots(self):
          return self.fileSnapShots
@@ -84,6 +91,9 @@ class FileCommit:
      def setSrcElems(self, src_elem_list):
          self._src_elem_list.extend(src_elem_list)
+ def setFeatureLines(self, featureLineNums, featureLists):
+        self.featureLists.update(featureLists)
+        self.featureLineNums = featureLineNums  # 
.extend(sorted(self.featureLists.iterkeys()))
      #Methods
do the comments have any meaning, or are they just old leftovers? In
the latter case, please remove.


Best regards, Wolfgang
      def addFileSnapShot(self, key, dict):
          self.fileSnapShots[key] = dict
@@ -116,3 +126,9 @@ class FileCommit:
      def addFuncImplLine(self, lineNum, srcLine):
          id = self.findFuncId(lineNum)
          self.functionImpl[id].append(srcLine)
+
+    def findFeatureList(self, lineNum):
+        # returns the identifier of a feature given a line number
+        i = bisect.bisect_right(self.featureLineNums, lineNum)
+        featureLine = self.featureLineNums[i-1]
+        return self.featureLists[featureLine]



Other related posts: