[codeface] Re: [PATCH 5/9] Add a _getFeatureLines function which calculates feature sets for all source code lines.

From: Wolfgang Mauerer <wm@xxxxxxxxxxxxxxxx>
To: codeface@xxxxxxxxxxxxx
Date: Mon, 17 Nov 2014 16:33:14 +0100
Am 17/10/2014 15:14, schrieb Matthias Dittrich:
> - The _getFeatureLines function calculates the feature sets in a similar way 
> how _getFunctionsLines
> calculates the current functions for all source code lines.
> - Added some fields to codeface/fileCommit.py to save those results.
> 
> Signed-off-by: Matthias Dittrich <matthi.d@xxxxxxxxx>
> ---
>  codeface/VCS.py        | 96 
> ++++++++++++++++++++++++++++++++++++++++++++++++++
>  codeface/fileCommit.py | 16 +++++++++
>  2 files changed, 112 insertions(+)
> 
> diff --git a/codeface/VCS.py b/codeface/VCS.py
> index 5229981..a62c72a 100644
> --- a/codeface/VCS.py
> +++ b/codeface/VCS.py
> @@ -39,6 +39,7 @@ import commit
>  import fileCommit
>  import re
>  import os
> +import bisect
>  import ctags
>  import tempfile
>  import source_analysis
> @@ -1068,6 +1069,101 @@ class gitVCS (VCS):
>           for lineNum, srcLine in enumerate(file_layout_src)]
>  
>  
> +    def _getFeatureLines(self, file_layout_src, file_commit):
> +        '''
> +        similar to _getFunctionLines but computes the line numbers of each
> +        feature in the file.
> +        '''
> +        '''
> +        - Input -
> +        file_name: original name of the file, used only to determine the
> +                    programming language (ie. file.c is a c-language file)
If you use the file name just to determine the programming language,
would it not make more sense to store the inferred language instead? Or might
the filename provide some additional information in the future?

> +        file_layout_scr: dictionary with key=line number value = line of code
Typo: scr -> src
> +        file_commit: fileCommit instance where the results will be stored
> +
> +        - Description -
> +        The file_layout is used to construct a source code file that can be
> +        parsed by ctags to generate a ctags file. The ctags file is then
> +        accessed to extract the function tags and line numbers to be saved in
> +        the fileCommit object

> +        '''
> +
> +        # grab the file extension to determine the language of the file
> +        fileExt = os.path.splitext(file_commit.filename)[1]
> +
> +        # temporary file where we write transient data needed for ctags
> +        srcFile = tempfile.NamedTemporaryFile(suffix=fileExt)
> +        featurefile = tempfile.NamedTemporaryFile()
> +        # generate a source code file from the file_layout_src dictionary
> +        # and save it to a temporary location
> +        for line in file_layout_src:
> +            srcFile.write(line)
> +        srcFile.flush()
> +
> +        # run cppstats analysis on the file to get the feature locations
> +        cmd = "cppstats -f {0} {1}".format(featurefile.name, 
> srcFile.name).split()
> +        output = execute_command(cmd).splitlines()
> +
> +        # mapping line -> feature list, we only add changing elements
> +        feature_lines = {0: []}
> +        # Helper list to get the last element of feature_lines (which 
> contains only lines with changes)
> +        line_nums = [0]
> +
> +        def parse_result_line(line):
> +            """
> +            parse the current line which is something like: feature_list, 
> start_line, end_line
> +            :param line: the line to parse
> +            :return: start_line, end_line, feature_list
> +            """
> +            start_line = 0
> +            end_line = 0
> +            feature_list = {}
> +            return start_line, end_line, feature_list
> +
> +        try:
> +            results_file = open(featurefile.name, 'r')
> +            parsed_lines = [parse_result_line(featureLine) for featureLine 
> in results_file]
> +            # we want a format like (is_start, features) for every changing 
> line
> +            better_format = {}
So you basically annotate all code lines of interest in better_format,
right? This is not captured well by the name "better_format".
Would "annotated_lines" or "feature_annotation" or something along these
lines make more sense?
> +            # We assume that every line is used at most once as start_line 
> or end_line
> +
> +            def check_line(line):
> +                if line in better_format:
> +                    raise ParseError(
> +                        "every line index can be used at most once 
> (problematic line was {0} in file {1})"
> +                        .format(line, file_commit.filename))
> +
> +            for start_line, end_line, feature_list in parsed_lines:
> +                check_line(start_line)
> +                check_line(end_line)
> +                better_format[start_line] = (True, feature_list)
> +                better_format[end_line] = (False, feature_list)
> +
> +            for line in sorted(better_format):
> +                is_start, features = better_format[line]
> +                # Get last line
> +                line_nums.append(line)
> +                last_feature_list_line = bisect.bisect_right(line_nums, line)
> +                last_feature_list = feature_lines[last_feature_list_line-1]
> +                # Copy last list and create new list for current line
> +                new_feature_list = list(last_feature_list)
It's not really clear what this code does, and why. Since it seems like
one of the central parts of your contribution, could you add an example
as a comment?
> +                if is_start:
> +                    new_feature_list.extend(features)
> +                else:
> +                    for r in features:
> +                        new_feature_list.remove(r)
> +                feature_lines[line] = new_feature_list
> +        except:
> +            log.critical("was unable unable to parse feature information of 
> cppstats")
> +            raise
> +
> +        # clean up temporary files
> +        srcFile.close()
> +        featurefile.close()
> +
> +        # save result to the file commit instance
> +        file_commit.setFeatureLines(line_nums, feature_lines)
> +
>      def cmtHash2CmtObj(self, cmtHash):
>          '''
>          input: cmtHash
> diff --git a/codeface/fileCommit.py b/codeface/fileCommit.py
> index aa99e84..956307a 100644
> --- a/codeface/fileCommit.py
> +++ b/codeface/fileCommit.py
> @@ -57,6 +57,13 @@ class FileCommit:
>          # meta data
>          self._src_elem_list = []
>  
> +        # dictionary with key = line number, value = feature list
> +        self.featureLists = {}
> +
> +        # list of function line numbers in sorted order, this is for
> +        # optimizing the process of finding a feature list given a line 
> number
> +        self.featureLineNums = [0]
> +
>      #Getter/Setters
>      def getFileSnapShots(self):
>          return self.fileSnapShots
> @@ -84,6 +91,9 @@ class FileCommit:
>      def setSrcElems(self, src_elem_list):
>          self._src_elem_list.extend(src_elem_list)
>  
> +    def setFeatureLines(self, featureLineNums, featureLists):
> +        self.featureLists.update(featureLists)
> +        self.featureLineNums = featureLineNums  # 
> .extend(sorted(self.featureLists.iterkeys()))
>      #Methods
Do the comments have any meaning, or are they just old leftovers? In
the latter case, please remove them.

Best regards, Wolfgang
>      def addFileSnapShot(self, key, dict):
>          self.fileSnapShots[key] = dict
> @@ -116,3 +126,9 @@ class FileCommit:
>      def addFuncImplLine(self, lineNum, srcLine):
>          id = self.findFuncId(lineNum)
>          self.functionImpl[id].append(srcLine)
> +
> +    def findFeatureList(self, lineNum):
> +        # returns the identifier of a feature given a line number
> +        i = bisect.bisect_right(self.featureLineNums, lineNum)
> +        featureLine = self.featureLineNums[i-1]
> +        return self.featureLists[featureLine]
>
Follow-Ups:
- [codeface] Re: [PATCH 5/9] Add a _getFeatureLines function which calculates feature sets for all source code lines.
  - From: Matthias Dittrich
[codeface] Re: [PATCH 5/9] Add a _getFeatureLines function which calculates feature sets for all source code lines.

Other related posts: