Hi, I found that sff_extract ignores some of my clippings (resets the left trim point) and leaves an adapter in the output. It happens for overlapped trimpoints. This was hopefully not an issue for too many people. In the patch I also added some extra debug switch and more importantly, adjusted how overlapped trimpoints should be treated (that is, the left trim point will be set to be equal to the right trim point). At least this is what Roche does these days. Personally I would discard the respective FASTA/Q entry with a single nucleotide left after trimming during sff_extract right-away but we can live with that and leave it up to mira to throw it away as a too short read. Based on mira-4.0.2 bundle. Martin -- Martin Mokrejs, PhD. 454 / IonTorrent / Evrogen MINT / Clontech SMART adaptor/artefact removal (... too many protocols to name here) http://www.bioinformatics.cz/software/supported-protocols/
--- /usr/bin/sff_extract 2014-04-20 18:11:39.000000000 +0200 +++ sff_extract 2014-05-05 00:57:34.000000000 +0200 @@ -186,29 +186,28 @@ #we join the bases data['bases'] = ''.join(data['bases']) - #print "pre cqr: ", data['clip_qual_right'] - #print "pre car: ", data['clip_adapter_right'] - #print "pre cql: ", data['clip_qual_left'] - #print "pre cal: ", data['clip_adapter_left'] + if config['debug']: + print "" # close the newline 'Converting 'blah.sff' ...' if doing debug printing + print "%s: pre cqr: %s" % (data['name'], data['clip_qual_right']) + print "%s: pre car: %s" % (data['name'], data['clip_adapter_right']) + print "%s: pre cql: %s" % (data['name'], data['clip_qual_left']) + print "%s: pre cal: %s" % (data['name'], data['clip_adapter_left']) # bugfix: 0 values for clips in SFF mean: not computed # see http://www.ncbi.nlm.nih.gov/Traces/trace.cgi?cmd=show&f=formats&m=doc&s=formats#sff - # i.e.: right clip values must be set to length of sequences - # if that happens so that we can work with normal ranges + # i.e.: so for our convenience we rewrite *right clip 0 values* (unusued) with the length of sequence + # so that we can use the values for pythonic slicing directly if data['clip_qual_right'] == 0 : - data['clip_qual_right'] = data['number_of_bases']; + data['clip_qual_right'] = data['number_of_bases'] if data['clip_adapter_right'] == 0 : - data['clip_adapter_right'] = data['number_of_bases']; + data['clip_adapter_right'] = data['number_of_bases'] - # correct for the case the right clip is <= than the left clip - # in this case, left clip is 0 are set to 0 (right clip == 0 means - # "whole sequence") - if data['clip_qual_right'] <= data['clip_qual_left'] : - data['clip_qual_right'] = 0 - data['clip_qual_left'] = 0 - if data['clip_adapter_right'] <= data['clip_adapter_left'] : - data['clip_adapter_right'] = 0 - data['clip_adapter_left'] = 0 + # correct crossed trimpoints of the same type, e.g.when right clip is < than the left clip + # it is perfectly valid to set both trimpoint to same value - Roche does that so that 1nt is left in the sequence by design + if data['clip_qual_right'] < data['clip_qual_left'] : + data['clip_qual_left'] = data['clip_qual_right'] + if data['clip_adapter_right'] < data['clip_adapter_left'] : + data['clip_adapter_left'] = data['clip_adapter_right'] #the clipping section follows the NCBI's guidelines Trace Archive RFC #http://www.ncbi.nlm.nih.gov/Traces/trace.cgi?cmd=show&f=rfc&m=doc&s=rfc @@ -223,16 +222,23 @@ # see whether we have to override the minimum left clips if config['min_leftclip'] > 0: - if data['clip_adapter_left'] >0 and data['clip_adapter_left'] < config['min_leftclip']: + if data['clip_adapter_left'] >0: + if data['clip_adapter_left'] < config['min_leftclip']: + data['clip_adapter_left'] = config['min_leftclip'] + else: data['clip_adapter_left'] = config['min_leftclip'] - if data['clip_qual_left'] >0 and data['clip_qual_left'] < config['min_leftclip']: - data['clip_qual_left'] = config['min_leftclip'] + if data['clip_qual_left'] >0: + if data['clip_qual_left'] < config['min_leftclip']: + data['clip_qual_left'] = config['min_leftclip'] + else: + data['clip_qual_left'] = config['min_leftclip'] - #print "post cqr: ", data['clip_qual_right'] - #print "post car: ", data['clip_adapter_right'] - #print "post cql: ", data['clip_qual_left'] - #print "post cal: ", data['clip_adapter_left'] + if config['debug']: + print "%s: post cqr: %s" % (data['name'], data['clip_qual_right']) + print "%s: post car: %s" % (data['name'], data['clip_adapter_right']) + print "%s: post cql: %s" % (data['name'], data['clip_qual_left']) + print "%s: post cal: %s" % (data['name'], data['clip_adapter_left']) # for handling the -c (clip) option gently, we already clip here @@ -431,6 +437,11 @@ qual_line = ''.join([chr(q+33) for q in qual]) #seqstring = ''.join(('@', name,'\n', seq, '\n+', name,'\n', qual_line, '\n')) seqstring = ''.join(('@', name,'\n', seq, '\n+\n', qual_line, '\n')) + + if config['clip']: + # if we did hard-clipping the remaining sequence is high-qual only, so force upper case letters + return seqstring.upper() + else: return seqstring @@ -985,17 +996,22 @@ openmode = 'w' if config['append']: openmode = 'a' + sys.stderr.write("Will append to existing files\n") + else: + sys.stderr.write("Will overwrite existing files\n") - seq_fh = open(config['seq_fname'], openmode) - xml_fh = open(config['xml_fname'], openmode) + sys.stderr.flush() + + seq_fh = open(os.path.basename(config['seq_fname']), openmode) + xml_fh = open(os.path.basename(config['xml_fname']), openmode) if config['want_fastq']: qual_fh = None try: - os.remove(config['qual_fname']) + os.remove(os.path.basename(config['qual_fname'])) except : python_formattingwithoutbracesisdumb_dummy = 1 else: - qual_fh = open(config['qual_fname'], openmode) + qual_fh = open(os.path.basename(config['qual_fname']), openmode) if not config['append']: xml_fh.write('<?xml version="1.0"?>\n<trace_volume>\n') @@ -1004,7 +1020,7 @@ #we go through all input files for sff_file in sff_files: - print "Working on '" + sff_file + "':" + print "Reading '" + sff_file + "':" ssahapematches.clear() seqcheckstore = ([]) @@ -1054,7 +1070,7 @@ tmpssaha_fh = open("sffe.tmp.10634.ssaha2", 'r') read_ssaha_data(tmpssaha_fh) - print "Converting '" + sff_file + "' ... ", + print "Reading again '" + sff_file + "' ... ", sys.stdout.flush() sff_fh = open(sff_file, 'rb') #read_header(infile) @@ -1208,6 +1224,12 @@ left = 1 if right is None: right = data['number_of_bases'] + + # again, for the final trimpoints stay on the safe side and fix overlapped trimpoints + # this results in 1nt being always left in the sequence although the SFF-contained trimpoints + # meant that nothing should have been left after trimming + if right < left: + left = right return left, right def sequence_case(data): @@ -1380,7 +1402,7 @@ group = OptionGroup(parser, "Frequent options","") group.add_option('-a', '--append', action="store_true", dest='append', - help='append output to existing files', default=False) + help='append output to existing files instead of default overwriting', default=False) group.add_option("-l", "--linker_file", dest="pelinker_fname", help="FASTA file with paired-end linker sequences", metavar="FILE") group.add_option("-o", "--out_basename", dest="basename", @@ -1413,6 +1435,8 @@ help="output quality file name", metavar="FILE") group.add_option("-x", "--xml_file", dest="xml_fname", help="output ancillary xml file name", metavar="FILE") + group.add_option("-d", "--debug", action="store_true", dest="debug", + help="enable some debug output on STDOUT", default=False) parser.add_option_group(group) #default fnames