Hi, On 17.11.2014 16:33, Wolfgang Mauerer wrote:
file_name is taken from the file_commit object (I will remove that part of the documentation).Am 17/10/2014 15:14, schrieb Matthias Dittrich:- The _getFeatureLines function calculates the feature sets in a similar way how _getFunctionsLines calculates the current functions for all source code lines. - Added some fields to codeface/fileCommit.py to save those results. Signed-off-by: Matthias Dittrich <matthi.d@xxxxxxxxx> --- codeface/VCS.py | 96 ++++++++++++++++++++++++++++++++++++++++++++++++++ codeface/fileCommit.py | 16 +++++++++ 2 files changed, 112 insertions(+) diff --git a/codeface/VCS.py b/codeface/VCS.py index 5229981..a62c72a 100644 --- a/codeface/VCS.py +++ b/codeface/VCS.py @@ -39,6 +39,7 @@ import commit import fileCommit import re import os +import bisect import ctags import tempfile import source_analysis @@ -1068,6 +1069,101 @@ class gitVCS (VCS): for lineNum, srcLine in enumerate(file_layout_src)]+ def _getFeatureLines(self, file_layout_src, file_commit):+ ''' + similar to _getFunctionLines but computes the line numbers of each + feature in the file. + ''' + ''' + - Input - + file_name: original name of the file, used only to determine the + programming language (ie. file.c is a c-language file)if you use the file name just to determine the programming language, would it then not make sense to store the inferred language? Or might the filename provide some additional information in the future?
better_format is basically the same as parsed_lines which is the output of cppstats, but in a better format (hence the name).+ file_layout_scr: dictionary with key=line number value = line of codescr->src+ file_commit: fileCommit instance where the results will be stored + + - Description - + The file_layout is used to construct a source code file that can be + parsed by ctags to generate a ctags file. The ctags file is then + accessed to extract the function tags and line numbers to be saved in + the fileCommit object + ''' + + # grab the file extension to determine the language of the file + fileExt = os.path.splitext(file_commit.filename)[1] + + # temporary file where we write transient data needed for ctags + srcFile = tempfile.NamedTemporaryFile(suffix=fileExt) + featurefile = tempfile.NamedTemporaryFile() + # generate a source code file from the file_layout_src dictionary + # and save it to a temporary location + for line in file_layout_src: + srcFile.write(line) + srcFile.flush() + + # run cppstats analysis on the file to get the feature locations + cmd = "cppstats -f {0} {1}".format(featurefile.name, srcFile.name).split() + output = execute_command(cmd).splitlines() + + # mapping line -> feature list, we only add changing elements + feature_lines = {0: []} + # Helper list to get the last element of feature_lines (which contains only lines with changes) + line_nums = [0] + + def parse_result_line(line): + """ + parse the current line which is something like: feature_list, start_line, end_line + :param line: the line to parse + :return: start_line, end_line, feature_list + """ + start_line = 0 + end_line = 0 + feature_list = {} + return start_line, end_line, feature_list + + try: + results_file = open(featurefile.name, 'r') + parsed_lines = [parse_result_line(featureLine) for featureLine in results_file] + # we want a format like (is_start, features) for every changing line + better_format = {}so you basically annotate all code lines of interest in 
better_format, right? This is not entirely well captured by the name "better_format". Would "annotated_lines" or "feature_annotation" or something along these lines make sense?
I'm converting parsed_lines into that form so that the algorithm (the last for loop) stays quite simple. The name annotated_lines would also work.
+ # We assume that every line is used at most once as start_line or end_line + + def check_line(line): + if line in better_format: + raise ParseError( + "every line index can be used at most once (problematic line was {0} in file {1})" + .format(line, file_commit.filename)) + + for start_line, end_line, feature_list in parsed_lines: + check_line(start_line) + check_line(end_line) + better_format[start_line] = (True, feature_list) + better_format[end_line] = (False, feature_list) + + for line in sorted(better_format): + is_start, features = better_format[line] + # Get last line + line_nums.append(line) + last_feature_list_line = bisect.bisect_right(line_nums, line) + last_feature_list = feature_lines[last_feature_list_line-1] + # Copy last list and create new list for current line + new_feature_list = list(last_feature_list)It's not really clear what this code does, and why. Since it seems like one of the central parts of your contribution, could you add an example as a comment?
I will try. Best regards, Matthias
+ if is_start: + new_feature_list.extend(features) + else: + for r in features: + new_feature_list.remove(r) + feature_lines[line] = new_feature_list + except: + log.critical("was unable unable to parse feature information of cppstats") + raise + + # clean up temporary files + srcFile.close() + featurefile.close() + + # save result to the file commit instance + file_commit.setFeatureLines(line_nums, feature_lines) + def cmtHash2CmtObj(self, cmtHash): ''' input: cmtHash diff --git a/codeface/fileCommit.py b/codeface/fileCommit.py index aa99e84..956307a 100644 --- a/codeface/fileCommit.py +++ b/codeface/fileCommit.py @@ -57,6 +57,13 @@ class FileCommit: # meta data self._src_elem_list = []+ # dictionary with key = line number, value = feature list+ self.featureLists = {} + + # list of function line numbers in sorted order, this is for + # optimizing the process of finding a feature list given a line number + self.featureLineNums = [0] + #Getter/Setters def getFileSnapShots(self): return self.fileSnapShots @@ -84,6 +91,9 @@ class FileCommit: def setSrcElems(self, src_elem_list): self._src_elem_list.extend(src_elem_list)+ def setFeatureLines(self, featureLineNums, featureLists):+ self.featureLists.update(featureLists) + self.featureLineNums = featureLineNums # .extend(sorted(self.featureLists.iterkeys())) #Methodsdo the comments have any meaning, or are they just old leftovers? In the latter case, please remove.
Best regards, Wolfgang
def addFileSnapShot(self, key, dict): self.fileSnapShots[key] = dict @@ -116,3 +126,9 @@ class FileCommit: def addFuncImplLine(self, lineNum, srcLine): id = self.findFuncId(lineNum) self.functionImpl[id].append(srcLine) + + def findFeatureList(self, lineNum): + # returns the identifier of a feature given a line number + i = bisect.bisect_right(self.featureLineNums, lineNum) + featureLine = self.featureLineNums[i-1] + return self.featureLists[featureLine]