Am 18/11/2014 15:08, schrieb Matthias Dittrich: > Hi Wolfgang, > > On 17.11.2014 17:07, Wolfgang Mauerer wrote: >> >> Am 17/10/2014 15:14, schrieb Matthias Dittrich: >>> - parseFeatureLine function to parse a single line of cppstats output. >>> - Added FileDict class to encapsulate information about the features >>> of a source code file. >>> - parseFeatureLines function to generate a FileDict instance from all >>> lines of cppstats output. >>> >>> Signed-off-by: Matthias Dittrich <matthi.d@xxxxxxxxx> >>> --- >>> codeface/VCS.py | 217 >>> +++++++++++++++++++++++++++++++++++-------------- >>> codeface/fileCommit.py | 72 +++++++++++++--- >>> 2 files changed, 217 insertions(+), 72 deletions(-) >>> >>> diff --git a/codeface/VCS.py b/codeface/VCS.py >>> index a62c72a..8bf7022 100644 >>> --- a/codeface/VCS.py >>> +++ b/codeface/VCS.py >>> @@ -34,6 +34,8 @@ >>> # VCS-specific. >>> # TODO: Unify range handling. Either a range is always a list, or >>> always >>> # represented by two parameters. >>> +import itertools >>> +import readline >>> import commit >>> import fileCommit >>> @@ -44,6 +46,7 @@ import ctags >>> import tempfile >>> import source_analysis >>> import shutil >>> +from fileCommit import FileDict >>> from progressbar import ProgressBar, Percentage, Bar, ETA >>> from ctags import CTags, TagEntry >>> from logging import getLogger; log = getLogger(__name__) >>> @@ -183,6 +186,140 @@ class VCS: >>> return subsys=="__main__" or subsys in >>> self.subsys_description.keys() >>> +def parseSepLine (line): >>> + if not line.startswith("\"sep="): >>> + raise ParseError( >>> + "expected that the csv file header starts with >>> '\"sep=' but it started with '{}'" >>> + .format(line), 'CSVFile') >>> + stripped = line.rstrip() >>> + if not stripped.endswith("\""): >>> + raise ParseError( >>> + "expected that the csv file header ends with >>> '\"' but the line was '{}'" >>> + .format(line), 'CSVFile') >>> + return stripped[5:-1] >>> + >>> +def parseline(sep, line): >>> + """ >>> + Parses a line from a csv file >>> + :param sep: >>> + :param line: >>> + :return: >>> + """ >>> + # TODO: Handle escaping: sep is escaped with quotes, quotes are >>> escaped with quotes >>> + # 'test,test' will be '"test,test"' in the csv file >>> + # 'test"this,"test' will be '"test""this,""test"' in the csv file >>> + return [l.strip() for l in line.split(sep)] >> any reason for not using the csv module? > Does it support the initial header and possibly utf-8? sure, see for instance http://chimera.labs.oreilly.com/books/1230000000393/ch06.html. >>> + >>> + >>> +def enum(*sequential, **named): >>> + enums = dict(zip(sequential, range(len(sequential))), **named) >>> + reverse = dict((value, key) for key, value in enums.iteritems()) >>> + enums['reverse_mapping'] = reverse >>> + return type('Enum', (), enums) >>> + >>> + >>> +LineType = enum(IF='#if', ELSE='#else', ELIF='#elif') >>> + >>> + >>> +def parseFeatureLine(sep, line): >>> + """ >>> + parse the current line which is something like: feature_list, >>> start_line, end_line >> really something "like" or is your list an exact specification? From >> the code below, it appears that the output follows this format, not the >> current line that is parsed -- this is confusing. > Depends on what 'sep' is, however cppstats in fact uses ',' for now. The > code currently is generic enough to survive if they change the separator > but not if they start switching columns. I was more actually referring to the content of the line: above you're talking about three elements, below you use 5. This is somewhat confusing. Best regards, Wolfgang > > Best regards, Matthias >>> + :param line: the line to parse >>> + :return: start_line, end_line, feature_list >>> + """ >>> + parsedline = parseline(sep, line) >>> + # FILENAME,LINE_START,LINE_END,TYPE,EXPRESSION,CONSTANTS >>> + try: >>> + start_line = int(parsedline[1]) >>> + end_line = int(parsedline[2]) >>> + line_type = LineType.reverse_mapping[parsedline[3]] >>> + feature_list = parsedline[5].split(';') >>> + return start_line, end_line, line_type, feature_list >>> + except ValueError: >>> + raise ParseError( >>> + "could not parse feature line (most likely >>> because we could not parse the start- or end-line which should be on >>> index 2 and 3): \"{}\"" >>> + .format(line), 'CSVFile') >>> + >>> + >>> + >>> + >>> + >>> + >>> +def getFeatureLines(parsed_lines, filename): >>> + """ >>> + calculates an dictionary representing changes in the current >>> feature set and a sorted helper list for calculating >>> + indices of the dictionary for any line >>> + :param parsed_lines: a list of tuples with (start_line, >>> end_line, feature_list) elements >>> + :param filename: the name or the analysed files (only used for >>> descriptive error messages if the calculation fails >> or->of >>> + :return: line_nums, feature_lines: the first is a sorted list to >>> be able to access the feature list for any line >>> + (while only changing lines are in the dictionary) >> please break lines at 70-75 characters >>> + """ >>> + # mapping line -> feature list, we only add changing elements >>> + feature_lines = FileDict() >>> + feature_lines.add_line(0, []) >>> + >>> + # we want a format like (is_start, features) for every changing >>> line >>> + better_format = {} >>> + # We assume that every line is used at most once as start_line >>> or end_line >>> + >>> + def check_line(line): >>> + if line in better_format: >>> + raise ParseError( >>> + "every line index can be used at most once >>> (problematic line was {0} in file {1})" >>> + .format(line, filename), filename) >>> + >>> + for start_line, end_line, line_type, feature_list in parsed_lines: >>> + if start_line >= end_line: >>> + raise ParseError( >>> + "start_line can't be greater or equal to end_line >>> (problematic line was {0} in file {1})" >>> + .format(start_line, filename), filename) >>> + if line_type == 'IF': >>> + # ifs start on their own line, however the end_line >>> could already be used by the start of an else/elif >>> + check_line(start_line) >>> + if end_line in better_format: >>> + end_line -= 1 >>> + check_line(end_line) >>> + better_format[start_line] = (True, feature_list) >>> + better_format[end_line] = (False, feature_list) >>> + else: >>> + # we try to mostly ignore else and elif if the >>> feature_list doesn't change >>> + is_start, old_feature_list = better_format[start_line] >>> + if (not is_start) and old_feature_list == feature_list: >>> + # moving to the new end >>> + del better_format[start_line] >>> + better_format[end_line] = is_start, old_feature_list >>> + elif is_start: >>> + raise ParseError( >>> + "line {0} appeared twice as start line >>> (problematic file was {1})" >>> + .format(start_line, filename), filename) >>> + else: >>> + # So we have a elif with different features, >>> + # so we start more features now end add them to the >>> ending later >>> + del better_format[start_line] >>> + better_format[start_line] = (True, feature_list) >>> + better_format[end_line] = (False, old_feature_list + >>> feature_list) >>> + >>> + for line in sorted(better_format): >>> + is_start, features = better_format[line] >>> + # Get last infos >>> + last_feature_list = feature_lines.get_line_info_raw(line) >>> + # Copy last list and create new list for current line >>> + new_feature_list = list(last_feature_list) >>> + if is_start: >>> + for r in features: >>> + new_feature_list.insert(0, r) >>> + else: >>> + for r in reversed(features): >>> + item = new_feature_list.pop(0) >>> + assert(item == r) >>> + ##new_feature_list.remove(r) >>> + # Remove in next line (because we want to count the >>> current #endif line as well). >>> + line += 1 >>> + >>> + feature_lines.add_line(line, new_feature_list) >>> + return feature_lines >>> + >>> + >>> class gitVCS (VCS): >>> def __init__(self): >>> VCS.__init__(self) # Python OOP braindamage >>> @@ -1070,10 +1207,10 @@ class gitVCS (VCS): >>> def _getFeatureLines(self, file_layout_src, file_commit): >>> - ''' >>> + """ >>> similar to _getFunctionLines but computes the line numbers >>> of each >>> feature in the file. >>> - ''' >>> + """ >>> ''' >>> - Input - >>> file_name: original name of the file, used only to >>> determine the >>> @@ -1093,7 +1230,7 @@ class gitVCS (VCS): >>> # temporary file where we write transient data needed for >>> ctags >>> srcFile = tempfile.NamedTemporaryFile(suffix=fileExt) >>> - featurefile = tempfile.NamedTemporaryFile() >>> + featurefile = tempfile.NamedTemporaryFile(suffix=".csv") >>> # generate a source code file from the file_layout_src >>> dictionary >>> # and save it to a temporary location >>> for line in file_layout_src: >>> @@ -1101,68 +1238,28 @@ class gitVCS (VCS): >>> srcFile.flush() >>> # run cppstats analysis on the file to get the feature >>> locations >>> - cmd = "cppstats -f {0} {1}".format(featurefile.name, >>> srcFile.name).split() >>> - output = execute_command(cmd).splitlines() >>> - >>> - # mapping line -> feature list, we only add changing elements >>> - feature_lines = {0: []} >>> - # Helper list to get the last element of feature_lines >>> (which contains only lines with changes) >>> - line_nums = [0] >>> - >>> - def parse_result_line(line): >>> - """ >>> - parse the current line which is something like: >>> feature_list, start_line, end_line >>> - :param line: the line to parse >>> - :return: start_line, end_line, feature_list >>> - """ >>> - start_line = 0 >>> - end_line = 0 >>> - feature_list = {} >>> - return start_line, end_line, feature_list >>> - >>> - try: >>> - results_file = open(featurefile.name, 'r') >>> - parsed_lines = [parse_result_line(featureLine) for >>> featureLine in results_file] >>> - # we want a format like (is_start, features) for every >>> changing line >>> - better_format = {} >>> - # We assume that every line is used at most once as >>> start_line or end_line >>> - >>> - def check_line(line): >>> - if line in better_format: >>> - raise ParseError( >>> - "every line index can be used at most once >>> (problematic line was {0} in file {1})" >>> - .format(line, file_commit.filename)) >>> - >>> - for start_line, end_line, feature_list in parsed_lines: >>> - check_line(start_line) >>> - check_line(end_line) >>> - better_format[start_line] = (True, feature_list) >>> - better_format[end_line] = (False, feature_list) >>> - >>> - for line in sorted(better_format): >>> - is_start, features = better_format[line] >>> - # Get last line >>> - line_nums.append(line) >>> - last_feature_list_line = >>> bisect.bisect_right(line_nums, line) >>> - last_feature_list = >>> feature_lines[last_feature_list_line-1] >>> - # Copy last list and create new list for current line >>> - new_feature_list = list(last_feature_list) >>> - if is_start: >>> - new_feature_list.extend(features) >>> - else: >>> - for r in features: >>> - new_feature_list.remove(r) >>> - feature_lines[line] = new_feature_list >>> - except: >>> - log.critical("was unable unable to parse feature >>> information of cppstats") >>> - raise >> so you're refactoring a bit by moving code from the git specific >> VCS class to the base class. Can you please note that in the >> commit description? When reading the patch, I was a bit confused >> at first to see code being added, and then the same code being >> removed before I realised we're talking about different classes. >> >>> - >>> + # TODO: fix hardcoded paths >>> + # BUG: THIS IS VERY BAD HARDCODED CODE AND SHOULD BE FIXED, >> indeed ;) >>> + # HOWEVER IT IS NOT CLEAR HOW CPPSTATS IS DISTRUBUTED NOR >>> WHERE IT LIVES >> please mention in the installation document that cppstats is required >> and where the source can be obtained. Then, ensure at the beginning of >> the analysis run that the tool is available, and exit as early as >> possible with an error message if it is not -- this avoids any partial >> results in the database, which usually leads to hard to track errors. >> >>> + oldPath = os.getenv("PYTHONPATH") >>> + os.putenv("PYTHONPATH", "/home/drag0on/projects/cppstats/lib") >>> + cmd = "/usr/bin/env python >>> /home/drag0on/projects/cppstats/cppstats.py --kind featurelocations >>> --file {0} {1}"\ >>> + .format(srcFile.name, featurefile.name).split() >>> + output = execute_command(cmd, >>> cwd="/home/drag0on/projects/cppstats").splitlines() >>> + os.putenv("PYTHONPATH", oldPath) >>> + >>> + results_file = open(featurefile.name, 'r') >>> + sep = parseSepLine(next(results_file)) >>> + headlines = parseline(sep, next(results_file)) >>> + feature_lines = \ >>> + getFeatureLines( >>> + [parseFeatureLine(sep, line) for line in >>> results_file], file_commit.filename) >>> # clean up temporary files >>> srcFile.close() >>> featurefile.close() >>> # save result to the file commit instance >>> - file_commit.setFeatureLines(line_nums, feature_lines) >>> + file_commit.set_feature_infos(feature_lines) >>> def cmtHash2CmtObj(self, cmtHash): >>> ''' >>> diff --git a/codeface/fileCommit.py b/codeface/fileCommit.py >>> index 956307a..2b8af8a 100644 >>> --- a/codeface/fileCommit.py >>> +++ b/codeface/fileCommit.py >>> @@ -23,6 +23,61 @@ single file.''' >>> import commit >>> import bisect >>> + >>> +class FileDict: >>> + """ >>> + A dictionary saving per-line information. We assume that we have >>> information on any line, >>> + and that we only have to save changing lines. >> with "changing line", you mean a line that's affected by a particular >> commit? Maybe rephrase the comment a bit for clarity. >> >>> + """ >>> + def __init__(self, line_list, line_dict): >>> + """ >>> + :rtype : FileDict >>> + """ >>> + self.line_list = line_list >>> + self.line_dict = line_dict >>> + self.lastItem = line_list[-1] >>> + >>> + def __init__(self): >>> + """ >>> + :rtype : FileDict >>> + """ >>> + self.line_list = [] >>> + self.line_dict = {} >>> + self.lastItem = -1 >>> + >>> + def __iter__(self): >>> + return self.line_dict.__iter__() >>> + >>> + def get_line_info_raw(self, line_nr): >>> + """ >>> + Returns the info for the given line (if the line was never >>> set, the info for the last set line is returned) >> line > 75 chars >>> + :param line_nr: >>> + :return: >>> + """ >>> + i = bisect.bisect_right(self.line_list, line_nr) >>> + info_line = self.line_list[i-1] >>> + return self.line_dict[info_line] >>> + >>> + def get_line_info(self, line_nr): >>> + return set(self.get_line_info_raw(line_nr)) >>> + >>> + def add_line(self, line_nr, info): >>> + """ >>> + Add the given information to the current dictionary. >>> + Note: while filling the dictionary your line_nr has to be >>> incremented! >> with "your line_nr", you mean the variable used to indicate >> the currently parsed line in the caller's context? If yes, please >> clarify the comment a bit, >>> + :param line_nr: >>> + :param info: >>> + :return: >>> + """ >>> + if line_nr < self.lastItem: >>> + raise ValueError("can only incrementally add items") >>> + self.line_list.append(line_nr) >>> + self.line_dict[line_nr] = info >>> + >>> + def values(self): >>> + return self.line_dict.values() >>> + >>> + >>> class FileCommit: >>> def __init__(self): >>> @@ -58,11 +113,7 @@ class FileCommit: >>> self._src_elem_list = [] >>> # dictionary with key = line number, value = feature list >>> - self.featureLists = {} >>> - >>> - # list of function line numbers in sorted order, this is for >>> - # optimizing the process of finding a feature list given a >>> line number >>> - self.featureLineNums = [0] >>> + self.feature_info = FileDict() >>> #Getter/Setters >>> def getFileSnapShots(self): >>> @@ -91,9 +142,9 @@ class FileCommit: >>> def setSrcElems(self, src_elem_list): >>> self._src_elem_list.extend(src_elem_list) >>> - def setFeatureLines(self, featureLineNums, featureLists): >>> - self.featureLists.update(featureLists) >>> - self.featureLineNums = featureLineNums # >>> .extend(sorted(self.featureLists.iterkeys())) >>> + def set_feature_infos(self, feature_line_infos): >>> + self.feature_info = feature_line_infos >>> + >>> #Methods >>> def addFileSnapShot(self, key, dict): >>> self.fileSnapShots[key] = dict >>> @@ -128,7 +179,4 @@ class FileCommit: >>> self.functionImpl[id].append(srcLine) >>> def findFeatureList(self, lineNum): >>> - # returns the identifier of a feature given a line number >>> - i = bisect.bisect_right(self.featureLineNums, lineNum) >>> - featureLine = self.featureLineNums[i-1] >>> - return self.featureLists[featureLine] >>> + return self.feature_info.get_line_info(lineNum) >>> > >