Dear John and Mira users, I may suggest to try my sff2phd converter, which has a configurable crap reads removal and can output in phd, fasta or fastq (Q+64) format. It is included in the latest version of the consed (22). It also has inbuilt paired reads splitter, and can work with custom PE linkers (give him the liker sequence in that case). Iontorrent works too. It is implemented in the pure perl (no bioperl /etc dependencies, and can work on gzipped/bzipped sff files). Version t15 of this program is attached to my email in the text file. Save it as sff2phd and make it executable. Then you can just run (if you want fastq file): sff2phd [your's sff file] -nophd -nophdball -xn -max_ep=0.5 -fastq=[yours output fastq file] (andjust maximum cumulative error probability in 10 bp window -maxep=) and -xn - exclude reads with N's (if needed). PS: Dear Basrien, if you like it - feel free to add it to the mira contrib, so other users can benefit from it. Let me know if any more help is needed. -- Best Regards, Markiyan Samborskyy, DNA Sequencing Facility, Department of Biochemistry, University of Cambridge.
#!/usr/bin/perl -w use strict; # SFF to PHD files converter, with 454 paired reads and MID support # Version 0.15 - 111229 # developed by Markiyan Samborskyy, # University of Cambridge, Department of Biochemistry, # DNA Sequencing Facility. / # Ivan Frankro National University of Lviv. # ms587@xxxxxxxxxxxxxxxxxx or v32sw .a.t. litech.lviv.ua # 18 December 2007 - 29 December 2012 my $argv_config=get_ARGV(); my $sff_in_file=$argv_config->{files_list}[0]; my %sff_head; #information about the SFF header; my $flowgram_bytes=2; my $read_spp_start=15;#seq peak position start (for phd_file) my $read_spp_step=19;#seq peak position step (for phd_file) my $hp_margin_multiplier=10;#seq homopolymer margin ep multiplier #It defines how many times the error probability shold be greater at the margins of homopolymer, than at the inside of homopolymer. #The quality is reprocessed in the homopolymer regions, cumulative error probability of homopolymer is unchanged. #This allows more acurate assembly of reads in the homopolymeric regions by phrap. my $noqproc=0;#skip quality processing my $nosplit=0;#skip paired reads splitting my $min_read_length=25; my ($phd_time, $phd_time_RT)=get_local_time_phd(); if($argv_config->{phd_time_head}){ $phd_time=$argv_config->{phd_time_head}; } if($argv_config->{phd_time_rt}){ $phd_time_RT=$argv_config->{phd_time_rt}; } if($argv_config->{phd_time_static}){ $phd_time="Fri Aug 08 08:08:08 2008"; $phd_time_RT="080808:080808"; } my $phd_ball="phd.ball"; my $sff_log="sff.log"; #default 454 flx and titanium linkers: my @linkers=("GTTGGAACCGAAAGGGTTTGAATTCAAACCCTTTCGGTTCCAAC", "TCGTATAACTTCGTATAATGTATGCTATACGAAGTTATTACG", "CGTAATAACTTCGTATAGCATACATTATACGAAGTTATACGA"); #default MID adapters (GSmids and RLMIDS): my %MID_IDs; $MID_IDs{MID1}= "acgagtgcgt"; $MID_IDs{MID2}= "acgctcgaca"; $MID_IDs{MID3}= "agacgcactc"; $MID_IDs{MID4}= "agcactgtag"; $MID_IDs{MID5}= "atcagacacg"; $MID_IDs{MID6}= "atatcgcgag"; $MID_IDs{MID7}= "cgtgtctcta"; $MID_IDs{MID8}= "ctcgcgtgtc"; $MID_IDs{MID9}= "tagtatcagc"; $MID_IDs{MID10}= "tctctatgcg"; $MID_IDs{MID11}= "tgatacgtct"; $MID_IDs{MID12}= "tactgagcta"; $MID_IDs{MID13}= "catagtagtg"; $MID_IDs{MID14}= "cgagagatac"; $MID_IDs{RL1}= "acacgacgact"; $MID_IDs{RL2}= "acacgtagtat"; $MID_IDs{RL3}= "acactactcgt"; $MID_IDs{RL4}= "acgacacgtat"; $MID_IDs{RL5}= "acgagtagact"; $MID_IDs{RL6}= "acgcgtctagt"; $MID_IDs{RL7}= "acgtacacact"; $MID_IDs{RL8}= "acgtactgtgt"; $MID_IDs{RL9}= "acgtagatcgt"; $MID_IDs{RL10}= "actacgtctct"; $MID_IDs{RL11}= "actatacgagt"; $MID_IDs{RL12}= "actcgcgtcgt"; #please use lowercase, if there are some new or weird Y-shaped adapters from the 3prime end - #please add/replace them in the @MID3_IDs. my @MID3_IDs=(#clip Y adapter from the 3-prime end, if RLMID's are used: "agtcgtggtgt", "atactaggtgt", "acgagtggtgt", "atacgtggcgt", "agtctacgcgt", "actagaggcgt", "agtgtgtgcgt", "acacagtgcgt", "acgatctgcgt", "agagacggagt", "actcgtagagt", "acgacgggagt"); my @MID_in_use;#and here goes the adaptor sequences, which should be used with our sff file. my $max_rd_ep=1;#maximal error probability before cuttoff for 10 bp. cut in the middle (5 bp) my $qual_downscale=5;#by default it will downscale quality by 5 my $use_MID=0;#switch for MID usage, set to 1 when they are used my $MID_in_file="mids";#default file to look for MID adapters my $reads_include_file="454_reads_include.fof"; my $reads_exclude_file="454_reads_exclude.fof";#exclude list has a priority over the include my $reads_max_file="454_reads_max";#exclude list has a priority over the include unless(-e $sff_in_file){ print STDERR ("SFF2PHD and fasta files converter, Ver 0.10 (101108). Usage: $0 [sff_file]\n"); print STDERR ("by default it requires folder ../phd_dir for output of the individual phd files.\n"); print STDERR ("Please run me from within edit_dir.\n"); print STDERR (" -os=[fasta out] Output sequences to fasta file\n"); print STDERR (" -oq=[fasta.qual out] Output fasta sequence quality to fastq file\n"); print STDERR (" -fastq=[fastq out] Output sequences to fastq file (CHR=Q+64)\n"); print STDERR (" -velvet=[fastq PE out] Output fastq complete PE sequences to separate fastq file (for velvet)\n"); print STDERR (" -phdball=[phd.ball] Output concatanated phd files to specific phd.ball file (for consed)\n"); print STDERR (" -max_reads=[number of reads to import from the sff file]\n"); print STDERR (" -reads_include=[include only the reads from the given list file] - $reads_include_file*\n"); print STDERR (" -reads_exclude=[exclude the reads from the given list file] - $reads_exclude_file*\n"); print STDERR (" -linker=[454/ion paired reads linker sequence]\n"); print STDERR (" Specify additional 454/ion linker sequence (like for titanium or similar)\n"); print STDERR (" Default ones for FLX/Ti LPE are:\n"); print STDERR (" ".join("\n ", @linkers)."\n"); print STDERR (" -MID=[mid sequences or MID_ID\'s, comma delimited list]\n"); print STDERR (" -MID_file=[file with mid sequences or MID ID #'s'], $MID_in_file*, file format:\n"); print STDERR (" [sff_file_name][tab][MID sequence] or [MID sequence]\n"); print STDERR (" -word_size=[minmach in 454 paired reads linker sequence detection]\n"); print STDERR (" -fastaphrap Output full fasta phrap headers, not only sequences names\n"); print STDERR (" -hmepm [#] Homopolymer margin error probability multiplier (default $hp_margin_multiplier)\n"); print STDERR (" -noqproc Don\'t process homopolymer quality data. Use original.\n"); print STDERR (" -noqclip Don\'t clip reads ends based on the quality data. Use SFF clippoints.\n"); print STDERR (" -nosplit Don\'t split 454 PE reads.\n"); print STDERR (" -max_ep Maximal cumulative read error probability. [$max_rd_ep]\n"); print STDERR (" -Q=[qual to reduce] Reduce the quality by the given number (for 454 titanium adjustement) - $qual_downscale*\n"); print STDERR (" -xn Exclude reads with N's inside clipped area (usually caused by air/gas bubles)\n"); print STDERR (" -nophd Don\'t output phd files to phd_dir\n"); print STDERR (" -nophdball Don\'t output concatanated phd files to phd.ball (for consed)\n"); print STDERR (" -chr Use CHR(Q+64) fasta quality encoding, instead of numeric encoded quality\n"); print STDERR (" Numeric one is required for original phrap versions, and uses more space.\n"); print STDERR (" -silent Don't output anything to stderr.\n"); print STDERR (" -debug Output debug info to stderr.\n\n"); print STDERR (" -lib=[lib] Specify PE shotgun library name (default: depends on the paird reads status)\n"); print STDERR (" -phd_time_static Use fixed time for phd/phd.ball files - %s\n"); print STDERR (" -phd_time_head \"[TIME]\" Set specific phd header time, examnoqclipple: \"%s\"\n"); print STDERR (" -phd_time_rt [YYMMDD:HHMMSS] Set specific read tag time, example: \"%s\"\n"); die "Required command line parrameter missing for SFF2PHD converter or file: $sff_in_file doesn't exist\n"; } my %sff_stat;#global sff statistics if(defined $argv_config->{reads_include}){ $reads_include_file=$argv_config->{reads_include}; } if(defined $argv_config->{reads_exclude}){ $reads_exclude_file=$argv_config->{reads_exclude}; } my $reads_include;#what to include - ref to hash my $reads_exclude;#what to exclude, (even from include list :-) if(-f $reads_include_file){ print STDERR "Using reads inclusion list file: $reads_include_file\n"; print STDERR "ONLY the read with names in this file will be imported.\n"; $reads_include=get_reads_list($reads_include_file); } if(-f $reads_exclude_file){ print STDERR "Using reads exclusion list file: $reads_exclude_file\n"; print STDERR "The reads with names in this file will NOT be imported.\n"; $reads_exclude=get_reads_list($reads_exclude_file); } #$reads_max_file # if(-f $reads_max_file){ # print STDERR "Using reads number limit file: $reads_max_file\n"; # print STDERR "The reads with names in this file will NOT be imported.\n"; # $reads_exclude=get_reads_list($reads_exclude_file); # } if(-f "noqclip"){ print STDERR "Not clipping/downscaling 454 by quality\n"; $argv_config->{noqclip}=1; $argv_config->{Q}=0; } #my %read_list;#used for debugging #FARIHWY01A0SDZ FARIHWY01BC6PR FARIHWY01BII3F FARIHWY01BL4OF FARIHWY01D5DU5 FARIHWY01DPHXI #$read_list{FARIHWY01A0SDZ}=1;$read_list{FARIHWY01BC6PR}=1;$read_list{FARIHWY01BII3F}=1;$read_list{FARIHWY01BL4OF}=1;$read_list{FARIHWY01D5DU5}=1;$read_list{FARIHWY01DPHXI}=1; if($argv_config->{Q}){ $qual_downscale=$argv_config->{Q}; } if($argv_config->{max_ep}){ $max_rd_ep=$argv_config->{max_ep}; } if($argv_config->{linker}){#adding linker to linkers db chomp $argv_config->{linker}; push @linkers, $argv_config->{linker}; } unless($argv_config->{word_size}){ #aka minmatch $argv_config->{word_size} = 10; } if($argv_config->{phdball}){ $phd_ball=$argv_config->{phdball}; } my $linker_db = make_seq_words(\@linkers, $argv_config->{word_size}); if($argv_config->{noqproc}){$noqproc=1}; if($argv_config->{nosplit}){$nosplit=1}; if($argv_config->{hmepm}){$hp_margin_multiplier=$argv_config->{hmepm}}; if($argv_config->{MID_file}){$MID_in_file=$argv_config->{MID_file}}; my $sff_file_name=$sff_in_file; if($sff_in_file=~m|\S+/(\S+)|){$sff_file_name=$1}; #get the file base name (w/o path&ext) $sff_file_name=~s/\.gz$//i; $sff_file_name=~s/\.bz2$//i; $sff_file_name=~s/\.Z $//i; $sff_stat{sff_file_name}=$sff_in_file; $sff_stat{sff_out_reads_total}=0; #MID's processing setup: if($argv_config->{MID}){#MID's from the argv my @MIDs_list=split(",", $argv_config->{MID}); foreach my $MID (@MIDs_list){ if($MID=~m/(\S+)/){ my $MID=$1; if($MID_IDs{$MID}){push @MID_in_use, $MID_IDs{$MID} }elsif($MID=~m/\d+/){#some odd mid ID with # print STDERR "Unknown MID_ID: $MID\n"; }else{ push @MID_in_use, $MID; } } } }elsif(-f $MID_in_file){#add MID adapters from the file (if (\S+)\s+(\S+) - $1 is sff file, $2 is MID) print STDERR "Using reads with MID adapters from file: $MID_in_file\n"; open(MID_IN, $MID_in_file) or die "Unable to open MID input file: $MID_in_file\n"; while(<MID_IN>){ my $MID_str;#string with MID's if(m/^(\S+)\s+(\S+)/){#we have filename if(m/$sff_file_name\s+(\S+)/){$MID_str=$1}# it is our's file - proc it }elsif(m/(\S+)/){$MID_str=$1} if(defined $MID_str){ my @MIDs_list=split(",", $MID_str); foreach my $MID (@MIDs_list){ if($MID=~m/(\S+)/){ my $MID=$1; if($MID_IDs{$MID}){push @MID_in_use, $MID_IDs{$MID} }elsif($MID=~m/\d+/){#some odd mid ID with # print STDERR "Unknown MID_ID: $MID\n"; }else{ push @MID_in_use, $MID; } } } } } close(MID_IN); } if(defined($MID_in_use[0])){ print STDERR "Will only import reads starting with following MID\'s:\n"; print STDERR uc(join(", ", @MID_in_use)."\n") } foreach my $MID (@MID_in_use){#make all mids lowercase $MID=lc($MID); } #lets add compressed file support (but decompress for consed editing): #make sure you have gzip/bzip2 prgrams on the commandline path on your's system if($sff_in_file=~m/\.gz$/i or $sff_in_file=~m/\.Z$/i){ open (SFF_IN, "gzip -c -d $sff_in_file |") or die "\nUnable to open gzipped input file: $sff_in_file\n"; }elsif($sff_in_file=~m/\.bz2$/i){ open (SFF_IN, "bzip2 -c -d $sff_in_file |") or die "\nUnable to open bzipped input file: $sff_in_file\n"; }else{ open (SFF_IN, $sff_in_file) or die "\nUnable to open input file: $sff_in_file\n"; } #open (SFF_IN, $sff_in_file) or die "Unable to open $sff_in_file for reading\n"; binmode SFF_IN; #open outputs unless($argv_config->{nophdball}){ open(PHD_BALL, ">>$phd_ball") or die "Unable to open phd file: $phd_ball\n"; } if($argv_config->{os}){ open(FASTA_OUT, ">>".$argv_config->{os}) or die "Unable to open fasta output file: $argv_config->{os}\n"; } if($argv_config->{oq}){ open(FASTAQUAL_OUT, ">>".$argv_config->{oq}) or die "Unable to open fasta output file: $argv_config->{oq}\n"; } if($argv_config->{fastq}){ if($argv_config->{velvet}){#we have a complete couple open(FASTQ2_OUT, ">>".$argv_config->{velvet}) or die "Unable to open fastq2 (PE velvet) output file: $argv_config->{velvet}\n"; } open(FASTQ1_OUT, ">>".$argv_config->{fastq}) or die "Unable to open fastq output file: $argv_config->{fastq}\n"; } #first we get the beggining of the SFF header. my $sff_header1_raw;#first 31 bytes of the sff header sysread(SFF_IN, $sff_header1_raw, 31) or die "Unable to read the header1 from $sff_in_file\n"; my @sff_header1 = unpack("A4NNNNNnnnC",$sff_header1_raw); #the description of the @sff_header1 fields: #0 .sff #1 sff_version ui32 (sff magic_number) #2 index_offset1 (older 32 bits in be) #3 index_offset2 ui32 #4 index_length ui32 #5 number_of_reads ui32 #6 header_length ui16 #7 key_length ui16 #8 number_of_flows_per_read ui16 #9 flowgram_format_code ui8 $sff_head{file_type}=$sff_header1[0]; $sff_head{sff_version}=$sff_header1[1]; $sff_head{index_offset}=(2**32)*$sff_header1[2]+$sff_header1[3]; $sff_head{index_length}=$sff_header1[4]; $sff_head{number_of_reads}=$sff_header1[5]; $sff_head{header_length}=$sff_header1[6]; $sff_head{key_length}=$sff_header1[7]; $sff_head{number_of_flows_per_read}=$sff_header1[8]; $sff_head{flowgram_format_code}=$sff_header1[9]; #some version checks: unless($sff_head{file_type} eq ".sff"){die "Wrong sff file type(format): $sff_head{file_type}, should be .sff\n"}; unless($sff_head{sff_version}==1){die "Wrong sff file version: $sff_head{sff_version}, currently only version 1 is supported\n"}; my $sff_header2_raw; sysread(SFF_IN, $sff_header2_raw, $sff_head{header_length} - 31) or die "Unable to read the header2 from $sff_in_file\n"; #print "SFF_header_raw: $sff_header2_raw\n"; $sff_head{flowgram_chars} = unpack("A$sff_head{number_of_flows_per_read}", $sff_header2_raw); if($argv_config->{debug}){ print STDERR "SFF header data:\n"; foreach my $sffk (keys %sff_head){ print STDERR "$sffk:\t$sff_head{$sffk}\n"; } } my $tmp_skip;#for storing skipped file parts (for reading from the stream) #now we read whole sff file body: my $ri=0;#read index position SFF_READ: #iterate untill for($ri=0; $ri<$sff_head{number_of_reads}; $ri++){ my $sff_read_header1; sysread(SFF_IN, $sff_read_header1, 16) or die "Unable to read the read_header1 from $sff_in_file\n"; my @sff_rh1=unpack("nnNnnnn",$sff_read_header1); #sff read header1 data: #0 read_header_length ui16 #1 name_length ui16 #2 number_of_bases ui32 #3 clip_qual_left ui16 #4 clip_qual_right ui16 #5 clip_adapter_left ui16 #6 clip_adapter_right ui16 my $clip_left = $sff_rh1[3]; my $clip_right = $sff_rh1[4]; #clipping adapters if present if($sff_rh1[5]>$clip_left){$clip_left=$sff_rh1[5]}; if($sff_rh1[6]>0 && $sff_rh1[6]<$clip_right){$clip_right=$sff_rh1[6]}; $clip_left--;$clip_right--;#making array indexes #and how we get the second part of the read header: my $sff_read_header2; sysread(SFF_IN, $sff_read_header2, $sff_rh1[0]-16) or die "Unable to read the read_header2 from $sff_in_file, read_idx: $ri\n"; #getting reaf name my $read_name=unpack("A$sff_rh1[1]",$sff_read_header2); #$sff_header1[8]=number_of_flows_per_read #it includes flowogram+flow_index; my $flowogram_data_length=$sff_header1[8] * $flowgram_bytes + $sff_rh1[2]; my $read_data_length = $flowogram_data_length + $sff_rh1[2] * 2; my $eight_byte_padding = ($read_data_length % 8 > 0 ? 8 - $read_data_length % 8 : 0); #so we skip througth the flowogram data: sysread(SFF_IN, $tmp_skip, $flowogram_data_length) or die "Unable to skip to the start of sequence of the read $read_name, idx: $ri, in the: $sff_in_file\n. Check your's sfffile\n.";#go to the start of the needed data # sysseek(SFF_IN, $flowogram_data_length, 1) or die "Unable to seek start of sequence of the read $read_name, idx: $ri, in the: $sff_in_file\n. Check your's sfffile\n.";#go to the start of the needed data my $read_seq_str;my $read_seqquality_str; sysread(SFF_IN, $read_seq_str, $sff_rh1[2]) or die "Unable to read sequence of the read $read_name, idx: $ri, from the: $sff_in_file\n. Check your's sfffile\n.";#read read's seq sysread(SFF_IN, $read_seqquality_str, $sff_rh1[2]) or die "Unable to read sequence quality of the read $read_name, idx: $ri, in the: $sff_in_file\n. Check your's sfffile\n.";#read read's seqquality data if($eight_byte_padding){ sysread(SFF_IN, $tmp_skip, $eight_byte_padding) or die "Unable to read to the start of sequence of the next read after $read_name, idx: $ri, in the: $sff_in_file\n. Check your's sfffile\n.";#so we are at the beggining of the next header } # sysseek(SFF_IN, $eight_byte_padding, 1) or die "Unable to seek start of sequence of the next read after $read_name, idx: $ri, in the: $sff_in_file\n. Check your's sfffile\n.";#so we are at the beggining of the next header if(defined $argv_config->{max_reads}){ if($sff_stat{sff_out_reads_total}>=$argv_config->{max_reads}){ print STDERR "Reached max_reads=$sff_stat{sff_out_reads_total} number of imported reads. Exiting.\n"; last; #exit if we had got enoughf reads } } $sff_stat{sff_in_reads_total}++; $sff_stat{sff_in_length_total}+=$sff_rh1[2]; $sff_stat{sff_in_length_Qclip}+=$clip_right-$clip_left; #check for read list inclusion/exclusion if(defined $reads_include){ unless($reads_include->{$read_name}){next} } if(defined $reads_exclude){ if($reads_exclude->{$read_name}){next} } if($clip_right-$clip_left<$min_read_length){ if($argv_config->{debug}){print STDERR "Read $read_name is completelly LQ\n"}; next; }#skipping over very short reads(<25 bp), to avoid assembly problems with phrap my $read_seq_str_lc=lc($read_seq_str); #check for MID (if defined), if present - move to the next bp of the MID my $read_seq_wok=substr($read_seq_str_lc, $clip_left, ($clip_right-$clip_left+1)); if(defined $MID_in_use[0]){#we have MID adapters my $MID_correct=0; foreach my $MID (@MID_in_use){ if($read_seq_wok=~m/^$MID/ and $MID_correct==0){#we've got MID read - add MID length to the clip_left $MID_correct=1; $clip_left+=length($MID); #check for the 3' end RL_MID adapter and clip my $MID3_clipped=0; foreach my $MID3_ID (@MID3_IDs){ if($read_seq_wok=~m/$MID3_ID$/ and $MID3_clipped==0){ $clip_right-=length($MID3_ID); $MID3_clipped=1; last; } } last; } } if($MID_correct==0){next} } if($argv_config->{nx} and $read_seq_wok=~m/n/){next}#skip reads with N's... my @read_seq=split("",$read_seq_str_lc); my @read_seqquality=unpack("C$sff_rh1[2]",$read_seqquality_str); my @read_seqpeakposition; for(my $rpi=0; $rpi<$sff_rh1[2]; $rpi++){ my $bp=$read_seq[$rpi]; unless($bp eq "a" or $bp eq "t" or $bp eq "c" or $bp eq "g" or $bp eq "n"){ print STDERR "Invalid nucleotide: $bp, pos: ".($rpi+1)." at the read $read_name. Check your's sff file.\n"; next SFF_READ; }; $read_seq[$rpi]=$bp; unless($read_seqquality[$rpi]<100){ print STDERR "Invalid sequence quality: $read_seqquality[$rpi] pos: ".($rpi+1)." at the read $read_name. Check your's sff file.\n"; next SFF_READ; } $read_seqpeakposition[$rpi]=$read_spp_step*$rpi+$read_spp_start; } #clip LQ read ends (to reduce phrap assembly problems with the titanium reads) #Analysing window my $Q_clip_end; my $cli=$clip_left; my $epc=0;#error probability cummulative # print "processing read: $read_name $clip_left $clip_right L: ".($#read_seqquality+1)."\n"; my $max_rd_ep2=$max_rd_ep; if($read_seq_str_lc=~m/a{7}|t{7}|g{7}|c{7}/){ #workarround for long homopolymers, causing gaps. $max_rd_ep2*=4; } unless($argv_config->{noqclip}){ while($cli<($clip_right-10) and $epc<$max_rd_ep2){ my @Q_win=@read_seqquality[$cli..($cli+10)]; foreach my $q (@Q_win){ $epc+=10**(-($q/10)); } # print "epc: $epc\n"; if($epc>$max_rd_ep2){ # print "Read: $read_name, Q_orig_clip: $clip_right Clipping quality at: ".($cli+5)."\n"; $clip_right=$cli+5; } $cli+=5; } } if($clip_right-$clip_left<$min_read_length){ if($argv_config->{debug}){print STDERR "Read $read_name is completelly LQ!\n"}; next; }#skipping over very short reads(<25 bp), to avoid assembly problems with phrap #seq_peak_position_max: my $read_spp_max=$read_seqpeakposition[($sff_rh1[2]-1)]+$read_spp_start; # print "Read: $read_name, seq: $read_seq_str\n"; #print "Quality: ". join(" ", @read_seqquality) ."\n"; ##########*********** HOMOPOYMER DATA PROCESSING PART ***********########### #a little bit of the quality processing in the homopoymers (to help phrap/etc properly assemble it) if(!$noqproc){ #$hp_margin_multiplier my $epc = 0; my $hp_length = 0; my $hp_begin = 0;my $rpi=0; for($rpi=0; $rpi<$sff_rh1[2]; $rpi++){ $epc+=10**((-$read_seqquality[$rpi])/10); $hp_length++; my $next_bp=0; if(defined $read_seq[($rpi+1)]){$next_bp=$read_seq[($rpi+1)]}; if($rpi<$sff_rh1[3]){$next_bp=$read_seq[($rpi+1)]}; if($read_seq[$rpi] eq "n" || !($read_seq[$rpi] eq $next_bp)){ #end of homopolymer or nothing my $epbps;#error probability per base in the hp my $nbp; for($nbp=$hp_begin; $nbp<=$rpi; $nbp++){#just going throught the homopolymer #we need to find out our error probabilities in the hp if($hp_length<3){#hp had less then 3 bp - all even ep $epbps=$epc/$hp_length; }else{#trim marginal q. if($nbp==$hp_begin || $nbp==$rpi){ #marginal homopolymer basepairs epbp $epbps=$hp_margin_multiplier*$epc/($hp_length+$hp_margin_multiplier); }else{#central homopolymer part epbp $epbps=$epc/($hp_length+$hp_margin_multiplier); # die "found hp margin. debug stop\n\n\n"; } }#found our error_prob. of current base - now we convet it to quality #unfortunatelly perl doesn't have log10..., only ln... $read_seqquality[$nbp]=int(-10* log($epbps)/log(10)); if($read_seqquality[$nbp]<0){$read_seqquality[$nbp]=0}; if($read_seqquality[$nbp]>19){$sff_stat{sff_in_length_Q20}++}; } $epc=0; $hp_length = 0; $hp_begin = $rpi+1;#get ready for the next homopolymer iteration } } }; #correcting the quality of the titanium run if($qual_downscale){ foreach my $qual (@read_seqquality){ $qual-=$qual_downscale; if($qual<0){$qual=0}; } } #print "Quality_proc: ". join(" ", @read_seqquality) ."\n"; #preparing data for splitting... my @phdseq=@read_seq[$clip_left..$clip_right]; my @phdseqquality=@read_seqquality[$clip_left..$clip_right]; my @phdseqpeakposition=@read_seqpeakposition[$clip_left..$clip_right]; my %read; $read{BEGIN_SEQUENCE}=$read_name; $read{sff_file}=$sff_file_name; $read{TRACE_ARRAY_MIN_INDEX}=0; $read{TRACE_ARRAY_MAX_INDEX}=$read_spp_max;#maximal read chromat peak position $read{CHROMAT_FILE}=join(":","sff",$read{sff_file},$read{BEGIN_SEQUENCE}); $read{CHEM}="454"; $read{DYE}="454"; $read{template}=$read{BEGIN_SEQUENCE}; $read{direction}="fwd"; $read{seq}=\@phdseq; $read{seqquality}=\@phdseqquality; $read{seqpeakposition}=\@phdseqpeakposition; #2. we need to split paired reads my $split_reads; if(!$nosplit){ $split_reads=split_paired_read_454(\%read); unless($split_reads=~m/ARRAY/){next};#contains only linker or too short (w/o any userfull info) }else{ my @read_solo; $read_solo[0]=\%read; $split_reads=\@read_solo; } #print "Orig_seq_l: ".($#read_seq+1)." Clipped_seq_l: ". ($#phdseq+1)."\n"; #3. and to write phd's file(s) foreach my $phd_rec (@{$split_reads}){ unless($argv_config->{nophd} and $argv_config->{nophdball}){ my $phd_data=formatPHDFile($phd_rec); unless($argv_config->{nophd}){ my $phd_out="../phd_dir/".$phd_rec->{BEGIN_SEQUENCE}.".phd.1"; open(PHD_OUT, ">$phd_out") or die "Unable to open phd file: $phd_out\n"; syswrite PHD_OUT, $phd_data or die "Unable to write to phd file: $phd_out\n"; close(PHD_OUT); } unless($argv_config->{nophdball}){ syswrite PHD_BALL, $phd_data or die "Unable to write to phd file: $phd_ball\n"; } } #now outputting fasta files if($argv_config->{os}){ syswrite(FASTA_OUT, formatFASTAFile($phd_rec)) or die "Unable to write to: $argv_config->{os}\n"; } if($argv_config->{oq}){ syswrite(FASTAQUAL_OUT, formatFASTAQUALFile($phd_rec)) or die "Unable to write to: $argv_config->{oq}\n"; } #and here we would output the FASTQ files. #if $argv_config->{velvet} write PE reads w both ends to separate file to the non-complete PE if($argv_config->{fastq}){ if($argv_config->{velvet} and $#{$split_reads}>0){#we have a complete sweet couple... syswrite(FASTQ2_OUT, formatFASTQFile($phd_rec)) or die "Unable to write to: $argv_config->{velvet}\n"; }else{#poor lonely shotgun/PE read or non-velvet output... syswrite(FASTQ1_OUT, formatFASTQFile($phd_rec)) or die "Unable to write to: $argv_config->{fastq}\n"; } } $sff_stat{sff_out_reads_total}++; $sff_stat{sff_out_length_total}+=$#{${$phd_rec}{seq}}+1; } #my $phd_data=formatPHDFile() } close (SFF_IN); #and close output files unless($argv_config->{nophdball}){ close(PHD_BALL); } if($argv_config->{os}){ close(FASTA_OUT); } if($argv_config->{oq}){ close(FASTAQUAL_OUT); } if($argv_config->{fastq}){ if($argv_config->{velvet}){#we have a complete couple close(FASTQ2_OUT); } close(FASTQ1_OUT); } save_sff_stat(\%sff_stat, $sff_log); sub formatPHDFile { #the task is to prepare phd file from the phd file data structure #my $phd_file_out=$_[0]; my $phd_record = $_[0]; #printHash($phd_record); #I need to have the phd sequence data structure - calculate trace array indexes? - consed uses 0 as default my @seq; my @seqquality; my @seqpeakposition; if($phd_record->{seq}=~m/ARRAY/){ @seq=@{$phd_record->{seq}}; }elsif($phd_record->{seq}=~m/\S+/){ @seq=split("",$phd_record->{seq}); }else{ printHash($phd_record); die "\nSequence data missing in phd record for $phd_record->{BEGIN_SEQUENCE}\n"; } if($phd_record->{seqquality}=~m/ARRAY/){ @seqquality=@{$phd_record->{seqquality}}; }elsif($phd_record->{seqquality}=~m/\S+/){ #die "stop.seqquality\n"; @seqquality=split(" ",$phd_record->{seqquality}); }else{printHash($phd_record); die "\nSequence quality data missing in phd record for $phd_record->{BEGIN_SEQUENCE}\n"}; if($phd_record->{seqpeakposition}=~m/ARRAY/){ @seqpeakposition=@{$phd_record->{seqpeakposition}}; }elsif($phd_record->{seqpeakposition}=~m/\S+/){ @seqquality=split(" ",$phd_record->{seqpeakposition}); }else{ #no seq peak position data - set to 0 my $i=0; while($i<$#seq+1){ $seqpeakposition[$i]=0; $i++; } } if($#seq!=$#seqquality){ my $seq_length = $#seq; my $seqquality_length = $#seqquality; die "\nRead: $phd_record->{BEGIN_SEQUENCE}\nSequence and sequence quality have different sizes - seq: $seq_length, seqquality: $seqquality_length\n"; } my $phd_file_buffer=""; # open (PHD_OUT_FILE, ">$phd_file_out") or die "Unable to open output phd file: $phd_file_out\n"; $phd_file_buffer .= "BEGIN_SEQUENCE $phd_record->{BEGIN_SEQUENCE}\n\n"; $phd_file_buffer .= "BEGIN_COMMENT\n\n"; if($phd_record->{CHROMAT_FILE}){$phd_file_buffer .= "CHROMAT_FILE: $phd_record->{CHROMAT_FILE}\n"} else{$phd_file_buffer .= "CHROMAT_FILE: none\n"}; if($phd_record->{ABI_THUMBPRINT}){$phd_file_buffer .= "ABI_THUMBPRINT: $phd_record->{ABI_THUMBPRINT}\n"} #else{$phd_file_buffer .= "ABI_THUMBPRINT: none\n"} ; if($phd_record->{PHRED_VERSION}){$phd_file_buffer .= "PHRED_VERSION: $phd_record->{PHRED_VERSION}\n"} #else{$phd_file_buffer .= "PHRED_VERSION: 0.0\n"} ; if($phd_record->{CALL_METHOD}){$phd_file_buffer .= "CALL_METHOD: $phd_record->{CALL_METHOD}\n"} else{$phd_file_buffer .= "CALL_METHOD: sff2phd\n"}; if($phd_record->{QUALITY_LEVELS}){$phd_file_buffer .= "QUALITY_LEVELS: $phd_record->{QUALITY_LEVELS}\n"} else{$phd_file_buffer .= "QUALITY_LEVELS: 99\n"}; if($phd_record->{TIME}){$phd_file_buffer .= "TIME: $phd_record->{TIME}\n" }else{$phd_file_buffer .= "TIME: $phd_time\n";} if(defined $phd_record->{TRACE_ARRAY_MIN_INDEX}){$phd_file_buffer .= "TRACE_ARRAY_MIN_INDEX: $phd_record->{TRACE_ARRAY_MIN_INDEX}\n"}; if($phd_record->{TRACE_ARRAY_MAX_INDEX}){$phd_file_buffer .= "TRACE_ARRAY_MAX_INDEX: $phd_record->{TRACE_ARRAY_MAX_INDEX}\n"}; if($phd_record->{TRIM}){$phd_file_buffer .= "TRIM: $phd_record->{TRIM}\n"}; if($phd_record->{CHEM}){$phd_file_buffer .= "CHEM: $phd_record->{CHEM}\n"}; if($phd_record->{DYE}){$phd_file_buffer .= "DYE: $phd_record->{DYE}\n"}; $phd_file_buffer .= "\nEND_COMMENT\n\n"; $phd_file_buffer .= "BEGIN_DNA\n"; my $i=0; while($i<$#seq+1){ $phd_file_buffer .= "$seq[$i] $seqquality[$i] $seqpeakposition[$i]\n"; $i++; } $phd_file_buffer .= "END_DNA\n\n"; $phd_file_buffer .= "END_SEQUENCE\n\n"; unless(defined($phd_record->{lib})){$phd_record->{lib}="454"}#454Read if(defined($argv_config->{lib}) and $argv_config->{lib}=~m/\S/){ $phd_record->{lib}=$argv_config->{lib}; } #and now we need WR tags to allow consed to recognise 454 read pairs properly... $phd_file_buffer .= "WR{\ntemplate determineReadTypes $phd_time_RT\nname: $phd_record->{template}\nlib: $phd_record->{lib}\n}\n\n"; $phd_file_buffer .= "WR{\nprimer determineReadTypes $phd_time_RT\ntype: univ $phd_record->{direction}\n}\n\n"; # close (PHD_OUT_FILE); return($phd_file_buffer); } sub save_sff_stat { #save global statistic about the sff file my $sff_stats=$_[0]; my $log_out_file=$_[1]; $sff_stats->{sff_in_length_average}=int($sff_stats->{sff_in_length_Qclip}/$sff_stats->{sff_in_reads_total}); my $sff_log_title = "sff_file\tin_reads_total\tin_reads_wo_linker\tin_reads_w_linker\tin_length_total\tin_length_Qclip\tin_length_average\tin_length_Q20\tout_reads_total\tout_reads_paired\tout_reads_single\tout_reads_empty\tout_length_total\n"; my @sff_fields = ("sff_file_name", "sff_in_reads_total", "sff_in_reads_wo_linker", "sff_in_reads_w_linker", "sff_in_length_total", "sff_in_length_Qclip", "sff_in_length_average", "sff_in_length_Q20", "sff_out_reads_total", "sff_out_reads_paired", "sff_out_reads_single", "sff_out_reads_empty", "sff_out_length_total"); my $new_file=1; if(-e $log_out_file){$new_file=0}; open (SFF_LOG, ">>$log_out_file"); if($new_file){ print SFF_LOG $sff_log_title; } my @sff_log_data; my $i=0; for($i=0; $i<=$#sff_fields; $i++){ unless(defined $sff_stats->{$sff_fields[$i]}){ $sff_stats->{$sff_fields[$i]}=0; } push @sff_log_data, $sff_stats->{$sff_fields[$i]}; } print SFF_LOG join("\t",@sff_log_data)."\n"; close (SFF_LOG) } sub make_seq_words {#for making local words database from our subject sequence #words score 1 for unicore/complement strand match my @seqs= @{$_[0]}; my $word_size=$_[1]; my %words_db; foreach my $seq (@seqs){ $seq=uc($seq);#input db sequence my $seq_rc=reverse($seq);$seq_rc=~tr/ATCG/TAGC/; my @seq_u=split("",$seq); my @seq_c=split("",$seq_rc); my $i; #making forward words list for($i=0;$i<=($#seq_u-$word_size+1); $i++){ my $word_u=lc(join("",@seq_u[$i..($i+$word_size-1)])); $words_db{$word_u}=1; # print ("DB word: $word_u\n"); } #making reverse words list for($i=0;$i<=($#seq_u-$word_size+1); $i++){ my $word_c=lc(join("",@seq_c[$i..($i+$word_size-1)])); $words_db{$word_c}=1; } } #exit(1); return(\%words_db) } sub split_paired_read_454 { my $read_in=$_[0]; my $minscore=8;#min number of words matching in the middle of the read. my $minscore_edge=3;#minimal number of words matching near the end. my $max_toend=4;#maximal number of bp's before end, to use minscore edge; my $maxgap=5;#max gap in linker match my $min_read_length=25;#minimal length of the read, should be 25 in production version #first we find sequence matches to our db my $i; my $num_matches=0; my @matches; # my $match_start; # my $match_end; # print "Splitting 454 paired reads: words to do:".(($#{${$read_in}{seq}})+1-$argv_config->{word_size})."\n"; for($i=0; $i<=($#{${$read_in}{seq}}+1-$argv_config->{word_size});$i++){ my $word_query=join("",@{${$read_in}{seq}}[$i..($i+$argv_config->{word_size}-1)]); # print STDERR "Query word: $word_query\n"; if($linker_db->{$word_query}){#we have mach to word in the db my %match; $match{begin}=$i; $match{score}=0;#increment it in the cycle while($linker_db->{$word_query}){ $match{end}=$i+$argv_config->{word_size}; $i++; $word_query=uc(join("",$read_in->{seq}[$i..($i+$argv_config->{word_size}-1)])); $match{score}++; } $match{length}=$match{end}-$match{begin}; push @matches, \%match; $num_matches++; if($argv_config->{debug}){ print STDERR "Match from $match{begin} to $match{end}, score: $match{score}\n"; } } } my @seq_orig_rtn; $seq_orig_rtn[0]=$read_in; if($num_matches==0){ # die "No matches. debug stop.\n"; #No matches. - Return original data; $sff_stat{sff_in_reads_wo_linker}++; return(\@seq_orig_rtn); }else{ #print "\nFound $num_matches match(es).\n"; #group matches $i=0; my @matches_grp; my $max_score=0; my $gi=0; while($i<=$#matches){ #for($i=0;$i<=$#matches;$i++){ my %match_grp; $match_grp{begin}=$matches[$i]->{begin}; #print "First match from $match_grp{begin} to $matches[$i]->{end}\n"; do{ $match_grp{end}=$matches[$i]->{end}; #print "Match end: $matches[$i]->{end}\t"; $match_grp{score}+=$matches[$i]->{score}; # $match_grp{length}+=$matches[$i]->{length}; $i++; #print "Working!"; }while($matches[$i] and ($matches[$i]->{begin}-$match_grp{end})<=$maxgap); #$i--; my $match_to_begin=$match_grp{begin};#distance to begin of the read my $match_to_end=length($read_in->{seq})-$match_grp{end}-1; $match_grp{toend}=$match_to_begin>$match_to_end?$match_to_begin:$match_to_end; #before pushing in we do some second level filtering: if(($match_grp{toend}<=$max_toend and $match_grp{score}>=$minscore_edge) or($match_grp{score}>=$minscore)){ $max_score=$max_score<$match_grp{score}?$match_grp{score}:$max_score; push @matches_grp, \%match_grp; #print "Match group: $match_grp{begin} to $match_grp{end}, toend: $match_grp{toend}, $match_grp{score}, max: $max_score\n"; $sff_stat{sff_in_reads_w_linker}++; }else{ $sff_stat{sff_in_reads_wo_linker}++; } } unless($matches_grp[0]){return(\@seq_orig_rtn)}; my $match_grp_selected; foreach my $match_group (@matches_grp){ if($match_group->{score}>=$max_score){ $match_grp_selected=$match_group; } } #get linker ends (splitting points) for $match_group_selected # print ("Read: $read_in->{BEGIN_SEQUENCE},\nbest match is from $match_grp_selected->{begin} to $match_grp_selected->{end}, score $match_grp_selected->{score}, to end: $match_grp_selected->{toend}\n"); my @split_reads; #lets decide whether we are going to split the read onto parts my $template=$read_in->{BEGIN_SEQUENCE}; #now we deal with forward part if(($#{${$read_in}{seq}}-$match_grp_selected->{end})>=$min_read_length){ my %fwd_read; #we output seq/qual as an array, not as a string my @seq = @{${$read_in}{seq}}[$match_grp_selected->{end}..$#{${$read_in}{seq}}]; $fwd_read{seq}=\@seq; my @seq_qual = @{${$read_in}{seqquality}}[$match_grp_selected->{end}..$#{${$read_in}{seq}}]; $fwd_read{seqquality}=\@seq_qual; my @seq_peakposition = @{${$read_in}{seqpeakposition}}[$match_grp_selected->{end}..$#{${$read_in}{seq}}]; $fwd_read{seqpeakposition}=\@seq_peakposition; $fwd_read{template}=$template; #$fwd_read{lib}="454PairedRead"; $fwd_read{direction}="fwd"; # $fwd_read{TIME}="Fri Aug 08 08:08:08 2008"; $fwd_read{BEGIN_SEQUENCE}=$read_in->{BEGIN_SEQUENCE}."_F"; $fwd_read{CHROMAT_FILE}=join(":","sff",$read_in->{sff_file},$read_in->{BEGIN_SEQUENCE}); $fwd_read{CHEM}=$read_in->{CHEM}; $fwd_read{TRACE_ARRAY_MIN_INDEX}=$read_in->{TRACE_ARRAY_MIN_INDEX}; $fwd_read{TRACE_ARRAY_MAX_INDEX}=$read_in->{TRACE_ARRAY_MAX_INDEX}; push @split_reads, \%fwd_read; # print "Forward read: $fwd_read{seq}\nQualF: $fwd_read{seqquality}\n"; } #so we deal with "rewerse part" of the 454 read if($match_grp_selected->{begin}>=$min_read_length){ my %rev_read; my @seq = reverse(@{${$read_in}{seq}}[0..($match_grp_selected->{begin}-1)]); $rev_read{seq}=\@seq; my @seq_qual = reverse(@{${$read_in}{seqquality}}[0..($match_grp_selected->{begin}-1)]); $rev_read{seqquality}=\@seq_qual; my @seq_peakposition = reverse(@{${$read_in}{seqpeakposition}}[0..($match_grp_selected->{begin}-1)]); foreach my $peakposition (@seq_peakposition){ $peakposition=$read_in->{TRACE_ARRAY_MAX_INDEX}-$peakposition; } $rev_read{seqpeakposition}=\@seq_peakposition; #$rev_read{seq}=~tr/ATCGatcg/TAGCtagc/; foreach my $bp (@seq){$bp=~tr/ATCGatcg/TAGCtagc/}; $rev_read{direction}="rev"; $rev_read{template}=$template; #$rev_read{lib}="454PairedRead"; # $rev_read{TIME}="Fri Aug 08 08:08:08 2008"; $rev_read{CHEM}=$read_in->{CHEM}; $rev_read{TRACE_ARRAY_MIN_INDEX}=$read_in->{TRACE_ARRAY_MIN_INDEX}; $rev_read{TRACE_ARRAY_MAX_INDEX}=$read_in->{TRACE_ARRAY_MAX_INDEX}; $rev_read{BEGIN_SEQUENCE}=$read_in->{BEGIN_SEQUENCE}."_R"; $rev_read{CHROMAT_FILE}=join(":","sff","-f",$read_in->{sff_file},$read_in->{BEGIN_SEQUENCE}); #chromat file defined by write phd push @split_reads, \%rev_read; # print "Rewerse read: $rev_read{seq}\nQualR: $rev_read{seqquality}\n"; } if($#split_reads==1){ $sff_stat{sff_out_reads_paired}++; $split_reads[0]->{lib}="454PairedRead"; $split_reads[1]->{lib}="454PairedRead"; }elsif($split_reads[0]){ $sff_stat{sff_out_reads_single}++; $split_reads[0]->{lib}="454UnpairedRead"; }else{ $sff_stat{sff_out_reads_empty}++; } if($split_reads[0]){ return(\@split_reads); }else{#no userfull info in reads, except linker sequence return(-2); } } } sub formatFASTAFile { #formats the sequence fasta header and data, with full phrap headers (like phd2fasta) my $sr=$_[0]; unless($sr->{TIME}){$sr->{TIME} = $phd_time}; #>%s CHROMAT_FILE: %s PHD_FILE: %s.phd.1 CHEM: 454 DYE: 454 TIME: %s TEMPLATE: %s DIRECTION: %s\n my $fasta_dat=">".$sr->{BEGIN_SEQUENCE}; if($argv_config->{fastaphrap}){ $fasta_dat.=" CHROMAT_FILE: $sr->{CHROMAT_FILE} PHD_FILE: $sr->{BEGIN_SEQUENCE}.phd.1 CHEM: 454 DYE: 454 TIME: $sr->{TIME} TEMPLATE: $sr->{template} DIRECTION: $sr->{direction}"; } $fasta_dat.="\n"; my $seqi=$#{$sr->{seq}};#max seq index my $i=0; my $rl=60;#offset do{ my $ie=($i+$rl-1)>$seqi?$seqi:($i+$rl-1); $fasta_dat.=join("",@{$sr->{seq}}[$i..$ie])."\n"; $i+=$rl; }while($i<=$seqi); return($fasta_dat); } sub formatFASTAQUALFile { #formats the sequence quality fasta header and data. my $sr=$_[0]; #>%s CHROMAT_FILE: %s PHD_FILE: %s.phd.1 CHEM: 454 DYE: 454 TIME: %s TEMPLATE: %s DIRECTION: %s\n my $fasta_dat=">".$sr->{BEGIN_SEQUENCE}."\n"; my @seqq=@{$sr->{seqquality}}; #my $sq;#seq quality in string format my $rl=60;# N of qual vallues per string my $rc=0;#bp count if($argv_config->{chr}){#using character encoding my $ni=0; for(my $i=0; $i<=$#seqq;$i++){ $fasta_dat.=chr($seqq[$i]+64); $rc++; if($rc==$rl or $i==$#seqq){ $fasta_dat.="\n"; $rc=0; } } }else{ for(my $i=0; $i<=$#seqq;$i++){ $fasta_dat.=$seqq[$i]." "; $rc++; if($rc==$rl or $i==$#seqq){ $fasta_dat.="\n"; $rc=0; } } } return($fasta_dat); } sub formatFASTQFile { #formats the sequence quality fasta header and data. my $sr=$_[0]; my $fastq_dat="@".$sr->{BEGIN_SEQUENCE}."\n"; $fastq_dat.=join("",@{$sr->{seq}})."\n"; $fastq_dat.="+\n"; foreach my $bq (@{$sr->{seqquality}}){ $fastq_dat.=chr($bq+64); } $fastq_dat.="\n"; return($fastq_dat); } sub get_reads_list{ #return a hash ref to the list of reads; my $in_file=$_[0]; unless(open(READS_IN, $in_file)){ print STDERR "Unable to read the list of reads from: $in_file\n"; return (undef); } my %reads; while(<READS_IN>){#get "template name" for 454 PE reads if(m/(\S+)_/){$reads{$1}=1} elsif(m/(\S+)/){$reads{$1}=1} } close(READS_IN); return(\%reads); } sub get_ARGV { #for getting command line arguments my %argv_config; my @files_list; foreach my $argument (@ARGV){ if($argument=~m/^-+(\S+?)=(\S+)/){ $argv_config{$1}=$2; }elsif($argument=~m/^-+(\S+)/){ $argv_config{$1}=1; }else{ push @files_list, $argument; } } $argv_config{files_list}=\@files_list; return(\%argv_config); } sub printHash { #prints hash with the data and refs to the other data (my $hashRef, my $delimiter) = @_; my %hash = %$hashRef; unless($delimiter){$delimiter = "\n"} foreach my $key (keys %hash){ if (ref($hash{$key}) eq "ARRAY"){print "$key => Array\n"; printArray($hash{$key}, $delimiter)} elsif (ref($hash{$key}) eq "HASH"){print "$key => Hash\n"; printHash($hash{$key}, $delimiter)} else {print "$key => $hash{$key}", $delimiter} } print "\n"; } sub printArray { #prints array with the data and refs to the other data (my $arrayRef, my $delimiter) = @_; unless($delimiter){$delimiter = "\n"} my $x = 0; $delimiter = " "; foreach my $element (@$arrayRef){ if (ref($element) eq "ARRAY"){print "$x: "; printArray($element, $delimiter)} elsif (ref($element) eq "HASH"){print "$x: "; printHash($element, $delimiter)} else {print "$x: $element$delimiter"} $x++; } print "\n"; } sub get_local_time_phd {#get's phd formatted time for the header and RT section my @months = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec); my @weekDays = qw(Sun Mon Tue Wed Thu Fri Sat Sun); my ($second, $minute, $hour, $dayOfMonth, $month, $yearOffset, $dayOfWeek, $dayOfYear, $daylightSavings) = localtime(); my $year = 1900 + $yearOffset; my $year_short=substr($year,2); #Fri Aug 08 08:08:08 2008 my $phd_Time = "$weekDays[$dayOfWeek] $months[$month] $dayOfMonth $hour:$minute:$second $year"; my $phd_RT_Time="$year_short$month$dayOfMonth:$hour$minute$second"; return($phd_Time,$phd_RT_Time) }