On Wed, Nov 19, 2014 at 9:40 PM, Matthias Dittrich <matthi.d@xxxxxxxxxxxxxx> wrote: > - The _get_feature_lines function calculates the feature sets in a similar > way how _getFunctionsLines > calculates the current functions for all source code lines. > - Added some fields to codeface/fileCommit.py to save those results. > - parse_feature_line function to parse a single line of cppstats output. > - Added FileDict class to encapsulate information about the features of a > source code file. > - parse_feature_lines function to generate a FileDict instance from all lines > of cppstats output. > - We expect cppstats to be in path and check that the executable works before > starting the analysis. > - Update documentation. > > Signed-off-by: Matthias Dittrich <matthi.d@xxxxxxxxx> > Reviewed-by: Wolfgang Mauerer <wolfgang.mauerer@xxxxxxxxxxx> > --- > README.md | 25 ++++++ > codeface/VCS.py | 230 > +++++++++++++++++++++++++++++++++++++++++++++++++ > codeface/fileCommit.py | 70 +++++++++++++++ > codeface/project.py | 24 +++++- > codeface/util.py | 22 +++++ > 5 files changed, 367 insertions(+), 4 deletions(-) > > diff --git a/README.md b/README.md > index 277a67a..eced236 100644 > --- a/README.md > +++ b/README.md > @@ -59,6 +59,31 @@ in the step "Database Setup", and modify codeface.conf > accordingly. > # Devel packages required for python packages > sudo apt-get install libyaml-dev > > +* When using the feature or feature_file analysis you need to have a working > + "cppstats" in your path. > + One way to get it is: > + > + cd ~ > + git clone https://github.com/clhunsen/cppstats.git > + > + Now create a file like ~/scripts/cppstats and add ~/scripts to your PATH. > + It should have something along the lines of: > + > + #!/bin/bash > + > + cd ~/cppstats > + PYTHONPATH="~/cppstats/lib" ~/cppstats/cppstats.py "$@" > + > + Note that the script has to be executable: > + > + chmod +x ~/scripts/cppstats > + > + and then add ~/scripts to your PATH. 
> + (maybe you have to replace ~ with the full path (/home/$user) if it > doesn't work). > + > + You can test this script by running "~/scripts/cppstats --help" and > + validate that you get a help message > + > ## Preparing the R installation > > * Run `sudo R CMD javareconf`; make sure that the tool reports success in > diff --git a/codeface/VCS.py b/codeface/VCS.py > index c1060f2..079be83 100644 > --- a/codeface/VCS.py > +++ b/codeface/VCS.py > @@ -34,15 +34,19 @@ > # VCS-specific. > # TODO: Unify range handling. Either a range is always a list, or always > # represented by two parameters. > +import itertools > +import readline > > import commit > import fileCommit > import re > import os > +import bisect > import ctags > import tempfile > import source_analysis > import shutil > +from fileCommit import FileDict > from progressbar import ProgressBar, Percentage, Bar, ETA > from ctags import CTags, TagEntry > from logging import getLogger; log = getLogger(__name__) > @@ -182,6 +186,181 @@ class VCS: > return subsys=="__main__" or subsys in self.subsys_description.keys() > > > +def parse_sep_line(line): > + if not line.startswith("\"sep="): > + raise ParseError( > + ("expected that the csv file header starts with '\"sep=' " > + "but it started with '{}'") > + .format(line), 'CSVFile') > + stripped = line.rstrip() > + if not stripped.endswith("\""): > + raise ParseError( > + ("expected that the csv file header ends with '\"' " > + "but the line was '{}'") > + .format(line), 'CSVFile') > + return stripped[5:-1] > + > + > +def parse_line(sep, line): > + """ > + Parses a line from a csv file > + :param sep: > + :param line: > + :return: > + """ > + # TODO: Handle escaping: sep is escaped with quotes, quotes are escaped > with quotes > + # 'test,test' will be '"test,test"' in the csv file > + # 'test"this,"test' will be '"test""this,""test"' in the csv file > + return [l.strip() for l in line.split(sep)] > + > + > +class LineType: > + IF = "#if" > + ELSE = "#else" > 
+ ELIF = "#elif" > + > + > +def parse_feature_line(sep, line): > + """ > + parse the current line which is something like: > + FILENAME,LINE_START,LINE_END,TYPE,EXPRESSION,CONSTANTS > + :param line: the line to parse > + :return: start_line, end_line, line_type, feature_list > + """ > + parsed_line = parse_line(sep, line) > + # FILENAME,LINE_START,LINE_END,TYPE,EXPRESSION,CONSTANTS > + try: > + start_line = int(parsed_line[1]) > + end_line = int(parsed_line[2]) > + line_type_raw = parsed_line[3] > + if line_type_raw not in (LineType.IF, LineType.ELSE, LineType.ELIF): > + raise ParseError( > + ("could not parse feature line (because we could" > + "not parse the line_type): \"{}\"") > + .format(line), 'CSVFile') > + line_type = line_type_raw > + feature_list = parsed_line[5].split(';') > + return start_line, end_line, line_type, feature_list > + except ValueError: > + raise ParseError( > + ("could not parse feature line (most likely because we " > + "could not parse the start- or end-line which should " > + "be on index 2 and 3): \"{}\"") > + .format(line), 'CSVFile') > + > + > +def get_feature_lines(parsed_lines, filename): > + """ > + calculates an dictionary representing the feature sets for any line > + of the given file. > + :param parsed_lines: a list of tuples with > + (start_line, end_line, line_type, feature_list) elements > + :param filename: the name or the analysed files > + (only used for descriptive error messages if the calculation fails) > + :return: > + feature_lines: a FileDict object to access the feature sets on any line > + """ > + # mapping line -> feature list, we only add changing elements > + feature_lines = FileDict() > + feature_lines.add_line(0, []) > + > + # we want a format like (is_start, features) for every line with an > + # #ifdef (ie. 
line that changes the feature set) > + annotated_lines = {} > + > + def check_line(line): > + if line in annotated_lines: > + raise ParseError( > + ("every line index can be used at most once " > + "(problematic line was {0} in file {1})") > + .format(line, filename), filename) > + > + # We now transform the cppstats output in another output which will > + # help to implement the algorithm below in a simple and fast way. > + # The old format is a list of > + # (start_line, end_line, line_type, feature_list) tuples for every > + # #ifdef/#else. > + # The new format is a list of (is_start, feature_set) > + # for every #ifdef(/#else)/#endif > + # We try to ignore #else wherever possible or handle > + # the #else like a nested #if. > + for start_line, end_line, line_type, feature_list in parsed_lines: > + if start_line >= end_line: > + raise ParseError( > + ("start_line can't be greater or equal to end_line " > + "(problematic line was {0} in file {1})") > + .format(start_line, filename), filename) > + > + if line_type == LineType.IF: > + # ifs start on their own line, however the end_line could > + # already be used by the start of an else/elif > + # (#else is the end of the previous #if > + # and the start of another '#if') > + check_line(start_line) > + if end_line in annotated_lines: > + # in that case we just say the #else line belongs to the > + # virtual starting '#if' > + end_line -= 1 > + # Now end_line should be unused > + check_line(end_line) > + annotated_lines[start_line] = (True, feature_list) > + annotated_lines[end_line] = (False, feature_list) > + else: > + # we try to mostly ignore else and elif if the feature_ > + # list doesn't change > + is_start, old_feature_list = annotated_lines[start_line] > + if (not is_start) and old_feature_list == feature_list: > + # We are on an ELSE, however the feature list did not > + # change so we just delete the current line and move the > + # list to the new end > + del annotated_lines[start_line] > + 
annotated_lines[end_line] = is_start, old_feature_list > + elif is_start: > + raise ParseError( > + ("line {0} appeared twice as start line " > + "(problematic file was {1})") > + .format(start_line, filename), filename) > + else: > + # So we have a elif with different features, > + # so we start more features now end add them to the ending > + # later > + # (so we handle this as if there was a new #ifdef started) > + del annotated_lines[start_line] > + annotated_lines[start_line] = (True, feature_list) > + annotated_lines[end_line] = \ > + (False, old_feature_list + feature_list) > + > + # Now that we have calculated the annotated_lines we just calculate the > + # feature sets on those lines and save them in a FileDict instance. > + # We can always access the last feature_list with the FileDict > + # (because we sorted the lines) > + for line in sorted(annotated_lines): > + is_start, features = annotated_lines[line] > + # Get last info > + last_feature_list = feature_lines.get_line_info_raw(line) > + # Copy last list and create new list for current line > + new_feature_list = list(last_feature_list) > + if is_start: > + # if the current line starts a new list of features, > + # we just need to add those to > + # the new list (note that order matters in this case). > + for r in features: > + new_feature_list.insert(0, r) > + else: > + # if the current line ends a list of features, > + # we remove them from the list > + # (reverse order as adding). > + for r in reversed(features): > + item = new_feature_list.pop(0) > + assert(item == r) > + # Remove in next line > + # (because we want to count the current #endif line as well). 
> + line += 1 > + > + feature_lines.add_line(line, new_feature_list) > + return feature_lines > + > + > class gitVCS (VCS): > def __init__(self): > VCS.__init__(self) # Python OOP braindamage > @@ -1067,6 +1246,57 @@ class gitVCS (VCS): > src_line_rmv = re.sub(rmv_char, ' ', src_line.strip()) > file_commit.addFuncImplLine(line_num, src_line_rmv) > > + @staticmethod > + def _get_feature_lines(file_layout_src, file_commit): > + """ > + similar to _getFunctionLines but computes the line numbers of each > + feature in the file. > + """ > + ''' > + - Input - > + file_layout_src: > + dictionary with 'key=line number' and 'value=line of code' > + file_commit: fileCommit instance where the results will be stored > + > + - Description - > + The file_layout is used to construct a source code file that can be > + parsed by cppstats to generate a cppstats csv file. > + The cppstats csv file is then accessed to extract the feature sets > + and line numbers to be saved in the fileCommit object > + ''' > + > + # grab the file extension to determine the language of the file > + fileExt = os.path.splitext(file_commit.filename)[1] > + > + # temporary file where we write transient data needed for ctags > + srcFile = tempfile.NamedTemporaryFile(suffix=fileExt) > + featurefile = tempfile.NamedTemporaryFile(suffix=".csv") > + # generate a source code file from the file_layout_src dictionary > + # and save it to a temporary location > + for line in file_layout_src: > + srcFile.write(line) > + srcFile.flush() > + > + # run cppstats analysis on the file to get the feature locations > + cmd = "/usr/bin/env cppstats --kind featurelocations --file {0} {1}"\ > + .format(srcFile.name, featurefile.name).split() > + output = execute_command(cmd).splitlines() > + > + results_file = open(featurefile.name, 'r') > + sep = parse_sep_line(next(results_file)) > + headlines = parse_line(sep, next(results_file)) > + feature_lines = \ > + get_feature_lines( > + [parse_feature_line(sep, line) for line in 
results_file], > + file_commit.filename) > + > + # clean up temporary files > + srcFile.close() > + featurefile.close() > + > + # save result to the file commit instance > + file_commit.set_feature_infos(feature_lines) > + > def cmtHash2CmtObj(self, cmtHash): > ''' > input: cmtHash > diff --git a/codeface/fileCommit.py b/codeface/fileCommit.py > index aa99e84..6474669 100644 > --- a/codeface/fileCommit.py > +++ b/codeface/fileCommit.py > @@ -23,6 +23,67 @@ single file.''' > import commit > import bisect > > + > +class FileDict: > + """ > + A generic dictionary for saving per-line information. > + We assume that this information is available on any line, > + and that the information only changes on some lines. > + So we only save the information on lines that change that info > + and use bisect to retrieve that information (for any line). > + """ > + def __init__(self, line_list, line_dict): > + """ > + :rtype : FileDict > + """ > + self.line_list = line_list > + self.line_dict = line_dict > + self.lastItem = line_list[-1] > + > + def __init__(self): > + """ > + :rtype : FileDict > + """ > + self.line_list = [] > + self.line_dict = {} > + self.lastItem = -1 > + > + def __iter__(self): > + return self.line_dict.__iter__() > + > + def get_line_info_raw(self, line_nr): > + """ > + Returns the info for the given line > + (if the line was never set, the info for the last set line > + is returned) > + :param line_nr: the line to retrieve the information for. > + :return: the information for the given line. > + """ > + i = bisect.bisect_right(self.line_list, line_nr) > + info_line = self.line_list[i-1] > + return self.line_dict[info_line] > + > + def get_line_info(self, line_nr): > + return set(self.get_line_info_raw(line_nr)) > + > + def add_line(self, line_nr, info): > + """ > + Add the given information to the current dictionary. 
> + Note: while filling the dictionary the line_nr argument has to > + be incremented (this is only to make sure the caller > + gets the intended behavior)! > + :param line_nr: the line number of the information > + :param info: the information for the current line > + """ > + if line_nr < self.lastItem: > + raise ValueError("can only incrementally add items") > + self.line_list.append(line_nr) > + self.line_dict[line_nr] = info > + > + def values(self): > + return self.line_dict.values() > + > + > class FileCommit: > def __init__(self): > > @@ -57,6 +118,9 @@ class FileCommit: > # meta data > self._src_elem_list = [] > > + # dictionary with key = line number, value = feature list > + self.feature_info = FileDict() > + > #Getter/Setters > def getFileSnapShots(self): > return self.fileSnapShots > @@ -84,6 +148,9 @@ class FileCommit: > def setSrcElems(self, src_elem_list): > self._src_elem_list.extend(src_elem_list) > > + def set_feature_infos(self, feature_line_infos): > + self.feature_info = feature_line_infos > + > #Methods > def addFileSnapShot(self, key, dict): > self.fileSnapShots[key] = dict > @@ -116,3 +183,6 @@ class FileCommit: > def addFuncImplLine(self, lineNum, srcLine): > id = self.findFuncId(lineNum) > self.functionImpl[id].append(srcLine) > + > + def findFeatureList(self, lineNum): > + return self.feature_info.get_line_info(int(lineNum)) > diff --git a/codeface/project.py b/codeface/project.py > index a311fb7..1d8c0b6 100644 > --- a/codeface/project.py > +++ b/codeface/project.py > @@ -23,7 +23,7 @@ from .configuration import Configuration > from .cluster.cluster import doProjectAnalysis > from .ts import dispatch_ts_analysis > from .util import (execute_command, generate_reports, layout_graph, > - check4ctags, BatchJobPool, generate_analysis_windows) > + check4ctags, check4cppstats, BatchJobPool, > generate_analysis_windows) > > def loginfo(msg): > ''' Pickleable function for multiprocessing ''' > @@ -54,7 +54,20 @@ def project_analyse(resdir, 
gitdir, codeface_conf, > project_conf, > no_report, loglevel, logfile, recreate, profile_r, > n_jobs): > pool = BatchJobPool(int(n_jobs)) > conf = Configuration.load(codeface_conf, project_conf) > - project, tagging = conf["project"], conf["tagging"] > + tagging = conf["tagging"] > + if collab_type is not "default": > + # as collab_type is ignored on some tagging values we should either > + # => throw an exception to tell the user he specified something weird > + # => set tagging to something valid > + if tagging is not "proximity": > + log.warn("tagging value is overwritten to proximity because of > --collaboration") > + tagging = "proximity" > + conf["tagging"] = tagging > + else: > + # default is function > + collab_type = "function" > + > + project = conf["project"] > repo = pathjoin(gitdir, conf["repo"], ".git") > project_resdir = pathjoin(resdir, project, tagging) > range_by_date = False > @@ -67,8 +80,11 @@ def project_analyse(resdir, gitdir, codeface_conf, > project_conf, > range_by_date = True > > # TODO: Sanity checks (ensure that git repo dir exists) > - if 'proximity' == conf["tagging"]: > - check4ctags() > + if 'proximity' == tagging: > + if collab_type is 'function': > + check4ctags() > + else: > + check4cppstats() > > project_id, dbm, all_range_ids = project_setup(conf, recreate) > > diff --git a/codeface/util.py b/codeface/util.py > index 7b8e4ab..9de4b5b 100644 > --- a/codeface/util.py > +++ b/codeface/util.py > @@ -381,6 +381,28 @@ def check4ctags(): > log.error("Ctags version '{0}' not found".format(prog_version)) > raise Exception("Incompatible ctags-exuberant version") > > + > +def check4cppstats(): > + """ > + check if the appropriate cppstats is installed on the system. > + """ > + # We can not check the version directly as there is no version switch on > cppstats > + # We just check if the first two lines of --help are OK. 
> + line_1 = "usage: cppstats.py [-h] [--kind <K> | -a] [--list [LIST] | > --file IN OUT]" > + line_2 = " [--nobak] [--stf] [-l] [-v] [--check CHECK] > [--dall]" > + cmd = "/usr/bin/env cppstats --help".split() > + res = execute_command(cmd).splitlines() The check for cppstats fails even though I have it installed. The problem seems to be that the line breaks in the --help output on my system do not match what your checks for line_1 and line_2 expect: the whole usage text arrives as a single line, so res[1] is empty. The options also differ (my output lists --norewriteifdefs where line_2 expects --stf). I added a log statement for the "res" variable; the output is below. 2014-11-25 17:30:12 [codeface.util] MainProcess DEBUG: Running command: /usr/bin/env cppstats --help 2014-11-25 17:30:13 [codeface.util] MainProcess ERROR: ['usage: cppstats.py [-h] [--kind <K> | -a] [--list [LIST] | --file IN OUT] [--nobak] [--norewriteifdefs] [-l] [-v] [--check CHECK] [--dall]', '', 'optional arguments:', ' -h, --help show this help message and exit', ' --kind <K> the preparation to be performed [default: general]', ' -a, --all perform all available kinds of preparation [default: False]', ' --list [LIST] a file that contains the list of input projects/folders [default: cppstats_input.txt]', ' --file IN OUT a source file IN that is prepared and analyzed, the analysis results are written to OUT', ' (--list is the default)', ' --nobak do not backup files during preparation [default: False]', '', 'POSSIBLE KINDS OF ANALYSES <K>:', ' general, generalvalues, discipline, featurelocations, derivative, interaction', '', "OPTIONS FOR ANALYSIS 'GENERALVALUES':", ' --norewriteifdefs rewrite nested #ifdefs and #elifs as a conjunction of inner and outer expressions [default=True]', ' (exception are #else tags, which ARE rewritten as negation of the #if branch! see also --norewriteelse of analysis GENERALVALUES)', '', "OPTIONS FOR ANALYSIS 'DISCIPLINE':", ' This analysis counts the number of the disciplined CPP usage in software projects. ', ' To this end, it checks xml representations of header and source files and returns the number of disciplined ifdefs in those. 
', '', ' -l, --log log to stdout [default=True]', ' -v, --verbose verbose output [default=False]', ' --check CHECK CHECK sets the patterns that are checked [default=1].', ' Supply sum of wanted patterns:', ' (1) check top level siblings (compilation unit) ', ' (2) check sibling (excludes check top level siblings; NOT CLASSIFIED) ', ' (4) check if-then enframement (wrapper) ', ' (8) check case enframement (conditional) ', ' (16) check else-if enframement (conditional) ', ' (32) check param/argument enframement (parameter) ', ' (64) check expression enframement (expression) ', ' (128) check else enframement (NOT CLASSIFIED) ', ' --dall check all patterns [default=True] ', ' (overrides --check)'] 2014-11-25 17:30:13 [codeface.util] MainProcess ERROR: program cppstats does not exist, or it is not working as expected (expected ' [--nobak] [--stf] [-l] [-v] [--check CHECK] [--dall]' in the second line but got '' Traceback (most recent call last): File "/home/au/.local/bin/codeface", line 9, in <module> load_entry_point('codeface==0.2.0', 'console_scripts', 'codeface')() File "/home/au/workspace/codeface/codeface/cli.py", line 197, in main return run(sys.argv) File "/home/au/workspace/codeface/codeface/cli.py", line 193, in run return args.func(args) File "/home/au/workspace/codeface/codeface/cli.py", line 112, in cmd_run args.profile_r, args.jobs, args.tagging) File "/home/au/workspace/codeface/codeface/project.py", line 88, in project_analyse check4cppstats() File "/home/au/workspace/codeface/codeface/util.py", line 411, in check4cppstats raise Exception("cppstats not found ({0})".format(error_message)) Exception: cppstats not found (expected ' [--nobak] [--stf] [-l] [-v] [--check CHECK] [--dall]' in the second line but got '') I guess you never experienced this on your system? Maybe this check is not very robust because it relies on having consistent line breaks. Perhaps we need another solution. 
--Mitchell > + if not (res[0].startswith(line_1)): > + error_message = "expected '{0}' in the first line but got > '{1}'".format(line_1, res[0]) > + log.error("program cppstats does not exist, or it is not working as > expected ({0}".format(error_message)) > + raise Exception("cppstats not found ({0})".format(error_message)) > + > + if not (res[1].startswith(line_2)): > + error_message = "expected '{0}' in the second line but got > '{1}'".format(line_2, res[1]) > + log.error("program cppstats does not exist, or it is not working as > expected ({0}".format(error_message)) > + raise Exception("cppstats not found ({0})".format(error_message)) > + > + > def generate_analysis_windows(repo, window_size_months): > ''' > Generates a list of revisions (commit hash) in increments of the > window_size > -- > 1.8.5.5 > >