# Copyright (c) 1996, 1997, 1998 Georg Fuellen. # This module is free software; you can redistribute it and/or modify # it under the same terms as Perl itself. #BEGIN {$ENV{READSEQ_DIR} ||= "/your/path/here"; } # see the UnivAln.pm.html #BEGIN {$ENV{CLUSTAL_DIR} ||= "/your/path/here"; } # section on `INSTALLATION' use lib "blib/lib"; use Bio::UnivAln; # enable the following if you have Bio::PreSeq installed and want to use it #use Bio::PreSeq; # This test file deliberately produces some errors, the ones I get are # in file t/univaln.t2_expected_errors. This script's output on my machine is # in file t/univaln.o. require 5.002; #use diagnostics; use strict; use Carp; my $sequence_strg = 'AAAAAAAAAAAAAA'; my @character_list = ('C','C','C','C','C','C','C','C','C','C','C','C','C','C'); my $bioPreSeqObject; if (defined($Bio::PreSeq::VERSION)) { my $bioPreSeqObject = Bio::PreSeq->new(-seq=>'GGGGGGGGGGGGGG'); } my $file = "t/alnfile.fasta"; my $seqs = "abcdef\nghijkl\nmnopqr\nstuvwx\nyzabcd"; my $id = "Test"; my $desc = "Test alignment"; my %names = ("GenBank"=>"X12345","Private"=>"/data/x12345.gb"); my $names = \%names; my $rids = ['R','S','T','U','V']; my $cids = undef; my $rdescs = ['R desc','S desc','T desc','U desc','V desc']; my $cdescs = ['A desc','B desc','C desc','D desc','E desc','F desc']; my $numbering = 1; my $type = 'dna'; my $ffmt = 'fasta'; my $descffmt = 'fasta'; my $aref; my @resSlice; print "\n\n*** OBJECT CREATION AND OUTPUT ***\n"; my $aln = Bio::UnivAln->new("t/alnfile.fasta"); $aln = Bio::UnivAln->new(-file=>"t/alnfile.aa", -desc=>'Sample alignment', -type=>'amino', -ffmt=>'raw' # 1 line in file -> 1 sequence ); $aln = Bio::UnivAln->new(-seqs=>"TCCCGCGTCAACTG\nTGGTGCTTCAACCG\nACTTG--TCAACTG"); if (defined($Bio::PreSeq::VERSION)) { $aln = Bio::UnivAln->new(-seqs=>[$sequence_strg,\@character_list,$bioPreSeqObject]); $aln = Bio::UnivAln->new(-seqs=> ['ACCCGCGTCAACTG', ['A','G','G','G','G','C','T','T','C','A','A','C','C','G'], Bio::PreSeq->new(-seq=>'ACTTG--TCAACTG') ]); print "\nTesting usage of Bio::PreSeq:\n", $aln->_strs(), "\n"; } $aln = Bio::UnivAln->new($file,$seqs,$id,$desc,$names,$rids,$cids, $rdescs,$cdescs,$numbering,$type,$ffmt,$descffmt); # data provided by file is overwritten ! print "\n setting default output format to fasta"; $aln->ffmt('fasta'); print "\n aln in Default format:\n", $aln->layout(); print "\n aln in raw format:\n", $aln->out_raw(); print "\n aln in fasta format:\n", $aln->out_fasta(), "\n"; $aln = Bio::UnivAln->new("t/artif.fasta"); print "\n\n*** SLICING ***\n"; my $alnSlice = $aln->seqs(1,3,1,2); # rows 1-3, columns 1-2 print "\nseqs(1,3,1,2) = \n", $alnSlice, "\n"; my @alnSlice = $aln->seqs([ 1,2,3 ], [ 1,4 ]); # rows 1-3, columns 1+4 print "\nseqs([ 1,2,3 ], [ 1,4 ]) \n"; for $aref ( @alnSlice ) { print @$aref, "\n"; } print "\n\n*** ADVANCED SLICING ***\n"; $alnSlice = $aln->seqs([3,2,3,1], [1,3..5]); # rows 3,2,3,1, cols 1,3..5 print "\nseqs([3,2,3,1], [1,3..5]) = \n", $alnSlice, "\n"; sub has_purine { my $str = join "", @{ $_[0] }; if ($str =~ /[AaGgRr]+/) {return 1;} else {return 0;} } $alnSlice = $aln->seqs([ 1,2,3 ], \&has_purine); # rows 1-3, and from these print "\nseqs([ 1,2,3 ], \\&has_purine), only the columns for which has_purine returns 1 \n", $alnSlice, "\n"; print "\n Same aln slice in fasta format:\n", $aln->out_fasta([1..3],\&has_purine), "\n"; $alnSlice = $aln->seqs({ids=>'R T'}); print "\nseqs({ids=>\'R T\'}) = \n", $alnSlice, "\n"; $alnSlice = $aln->seqs([$aln->height()..0],[$aln->width()..1]); print "\n\n*** MAPPING ***\n"; @resSlice = $aln->map_r(\&has_purine, [ 1,2,3 ]); # 1,0,1 if row 1+3 has purine print "\nmap_r(\\&has_purine, [ 1,2,3 ]) \n", @resSlice, "\n\n"; @resSlice = $aln->map_c(\&has_purine, [ 1,4 ]); # 1,0 if column 1 has purine print "\nmap_c(\\&has_purine, [ 1,4 ]) \n", @resSlice, "\n\n"; sub id { return @_ } @alnSlice = $aln->map_r(\&id, [ 1,2,3 ]); # same as $alnSlice = $aln->seqs([ 1,2,3 ]) # since mapping the identity function to desired row/column subsets # and collecting the result is like slicing print "\nmap_r(\\&id, [ 1,2,3 ])\n"; for $aref ( @alnSlice ) { print @$aref, "\n"; } @alnSlice = $aln->seqs([ 1,2,3 ]); print "\nIt's the same as seqs([ 1,2,3 ]) :\n"; for $aref ( @alnSlice ) { print @$aref, "\n"; } sub has_pyrimi { my $str = join "", @{ $_[0] }; if ($str =~ /[CcTtYy]+/) {return 1;} else {return 0;} } @resSlice = $aln->map_c(\&has_purine, [ 1..6 ]); print "\n0/1 indicators for purine\n", @resSlice, "\n"; @resSlice = $aln->map_c(\&has_pyrimi, [ 1..6 ]); print "\n0/1 indicators for pyrimidine\n", @resSlice, "\n"; print "\n\n*** TRIGGERING ERRORS ***\n"; carp "Testing error reporting; approx. 16 errors to be expected, see file t/univaln.t2_expected_errors"; my $aln_err = Bio::UnivAln->new(-seqs=>"TCCCGCGTDD\nTGGTGCTTCAAC__\n__TTG--TCAACTG", -desc=>'seqs and other accessors will truncate because first sequence is shorter than the others', -ffmt_triggers_complaint=>"raw", -type=>'dna', -numbering=>20); print "\n", scalar($aln_err->seqs(-1,5,18,31)); print "\nOffending characters:", $aln_err->alphabet_check(), "\n"; $aln_err->_type('amino'); print "\nOffending characters:", $aln_err->alphabet_check(), "\n"; $aln_err->inplace(1); $aln_err->equalize_lengths(); $aln_err->inplace(0); print "\nEqualilized lengths:\n", $aln_err->layout("raw"),"\n";; $aln_err->inplace(1); $aln_err->equalize_lengths(20); $aln_err->inplace(0); print "\nPadded lengths to 20:\n", $aln_err->layout("raw"),"\n";; $aln_err= Bio::UnivAln->new("_undef",$names,"IG CH","desc", \%names,-10,"SNA","SNA","SNA"); print "\n\n*** TESTING ACCESSORS AND COPYING ***\n"; print " seqs()\n", scalar($aln->seqs())," seqs(2)\n", scalar($aln->seqs(2))," seqs(2,4)\n", scalar($aln->seqs(2,4))," seqs(2,4,3)\n", scalar($aln->seqs(2,4,3))," seqs(2,4,3,5)\n",scalar($aln->seqs(2,4,3,5))," _strs()\n", $aln->_strs()," seqs([2,4])\n", scalar($aln->seqs([2,4]))," seqs([],[2,4])\n",scalar($aln->seqs([],[2,4]))," _arys([2,3,4])\n"; @alnSlice = $aln->seqs([2,3,4]); for $aref ( @alnSlice ) { print @$aref, "\n"; } print " seqs([2,4],[3])\n"; @alnSlice = $aln->seqs([2,4],[3]); for $aref ( @alnSlice ) { print @$aref, "\n"; } print "\nlayout\n",$aln->layout,"\n"; my $aln_copy = $aln->copy(); print "\nlayout of duplicate:\n",$aln_copy->layout,"\n"; print "\nChecking for equality, 1 expected:\n",$aln_copy->equal_nogaps($aln),"\n"; print "\nChecking for equality, 0 expected:\n",$aln_copy->equal_nogaps($aln_err),"\n"; $aln_err = $aln->copy(); $aln_err->equalize_lengths(20); print "\nChecking for equality, 1 expected:\n",$aln_copy->equal_nogaps($aln_err),"\n"; $aln->id("bad Id"); print "set bad Id= ", $aln->id(), "\n"; $aln->id("correct_id"); print "set correct new Id= ", $aln->id(), "\n"; my $old_desc = $aln->desc("new desc for aln"); print "set old desc= $old_desc to new desc= ", $aln->desc(), "\n"; print "type= ",$aln->type(),"\n"; $aln->numbering(4); print "set new numbering= ", $aln->numbering(), "\n"; print "Row ids= ",@{$aln->row_ids()},"\n"; print "Column ids= ",@{$aln->col_ids()},"\n"; print "Row descriptions= ",@{$aln->row_descs()},"\n"; print "Column descriptions= ",@{$aln->col_descs()},"\n"; $aln->row_ids(['r','s','t','u','v']); print "New row ids= ",@{$aln->row_ids()},"\n"; $aln->col_ids(['a','b','c','d','e','f']); print "New column ids= ",@{$aln->col_ids()},"\n"; $aln->row_descs(['r desc','s desc','t desc','u desc','v desc']); print "New row descriptions= ",@{$aln->row_descs()},"\n"; $aln->col_descs(['a desc','b desc','c desc','desc desc','e desc','f desc']); print "New column descriptions= ",@{$aln->col_descs()},"\n"; print "seqs(2,4,7,9)\n",scalar($aln->seqs(2,4,7,9)),"\n"; print "seqs(2,4,2,4), columns -2,-1,0:\n",scalar($aln->seqs(2,4,2,4)),"\n"; print 'Current format of $aln:', $aln->ffmt('raw'), "\n\n"; print $aln->layout; # New default layout is 'raw' print $aln->out_fasta,"\n"; print "Modified alignment:\n",$aln->out_fasta,"\n"; print "Original copy:\n",$aln_copy->layout,"\n"; print "\nThe current alignment has ",$aln->width()," columns and ",$aln->height()," rows.\n\n"; print "\n\n*** CREATING A NEW ALIGNMENT BY SLICING AN OLD ONE ***\n"; $aln_copy = $aln->aln(2,4,7,9); print "Set up new alignment from the modified alignment, aln(2,4,7,9) is an alignment with rows 2-4 and columns 4-6 (designated 7-9 since the numbering starts with column no.4 \n"; print "Modified alignment:\n",$aln_copy->out_fasta; print "Still got the old sequence names, and column _default_names_ from the ones that start with 1 in the old alignment; accessing seqs({ids=>\'s Junk u\'},{ids=>\'c Junk d e\'}) where column c and ``Junk'' won't be found and col. d and f are printed\n"; print scalar($aln_copy->seqs({ids=>'s Junk u'},{ids=>'c Junk d f'})), "\n"; print "\nCreating empty alignment\n"; my $aln2 = Bio::UnivAln->new(); print $aln2->out_raw(); print "\n\n*** CONSENSUS, (IN)VARIABLE SITES, REVERSE, COMPLEMENT, GAP-FREE SITES ***\n"; $aln->numbering(0); print "\nSet new numbering= ", $aln->numbering(), "\n"; $aln = Bio::UnivAln->new(-seqs=>"AAATT--TCAACAG\nAATAA-TTCCACCC\nATAAC--TCACGC-"); print "\nCurrent alignment:\n",$aln->layout; my $resSlice = $aln->consensus(); print "\nConsensus of the columns, >= 75% invariability (default) \n $resSlice\n"; $resSlice = $aln->consensus(0.6); print "\nConsensus of the columns, >= 60% invariability \n $resSlice\n"; @resSlice = $aln->consensus(1, [1..10]); print "\nConsensus of the first 10 columns, consensus residue must be invariable \n", @resSlice, "\n"; $aln2 = Bio::UnivAln->new(-file=>"t/artif2.raw"); print "\nUsing longer alignment:\n",$aln2->layout; $resSlice = $aln2->consensus(); print "\nConsensus of the columns, default\n$resSlice\n"; $resSlice = $aln2->consensus(0.6); print "\nConsensus of the columns, >= 60% invariability\n$resSlice\n"; $resSlice = $aln2->consensus(0.4); print "\nConsensus of the columns, >= 40% invariability\n$resSlice\n"; $resSlice = $aln->var_sites(); print "\nUsing shorter alignment:\n",$aln->layout("raw"); print "\nVariable columns ".join(" ",@{$aln->var_inds()})." only, i.e. deleting columns that are invariable \n$resSlice\n"; $resSlice = $aln->invar_sites(); print "\nInvariable columns ".join(" ",@{$aln->invar_inds()})." only, i.e. deleting columns that are variable \n$resSlice\n"; $resSlice = $aln->var_sites(0.6, [1,3]); print "\nVariable columns only, i.e. deleting columns with >= 60% invariability; print rows 1+3 only \n$resSlice\n"; $resSlice = $aln->invar_sites(0.6, [1,3]); print "\nInvariable columns only, i.e. only columns with >= 60% invariability; print rows 1+3 only \n$resSlice\n"; @resSlice = $aln->var_sites(0.2); print "\nVariable columns only, i.e. deleting columns with >= 20% invariability \nFor the current example, an empty array should be returned \n"; for $aref ( @resSlice ) { print @$aref, "\n"; } $resSlice = $aln->gap_free_sites([1,3]); print "\nOnly columns ".join(" ",@{$aln->gap_free_inds()})." that are gap_free, print rows 1+3 only \n$resSlice\n"; $resSlice = $aln->no_allgap_sites([1,3]); print "\nOnly columns ".join(" ",@{$aln->no_allgap_inds()})." that do not have gaps exclusivly, print rows 1+3 only (row 2 has a non-gap character in position 6 !) \n$resSlice\n"; $resSlice = $aln->remove_gaps([1,3]); print "\nOriginal sequences 1 and 3, using the 'remove_gaps' function\n$resSlice\n"; $resSlice = $aln->reverse(); print "\nRows in reverse \n$resSlice\n"; $resSlice = $aln->complement(); print "\nRows in complement \n$resSlice\n"; @resSlice = $aln->revcom([1,3]); print "\nRows 1+3 in reverse complement\n"; for $aref ( @resSlice ) { print @$aref, "\n"; } $aln_copy = $aln->aln(); print "\nCopied alignment:\n",$aln_copy->out_fasta, "\n"; $aln_copy->inplace(1); $aln_copy->seqs({ids=>'2 3'},[1..6]); print "Alignment sliced inplace, {ids=>\'2 3\'},[1..6] :\n",$aln_copy->out_fasta, "\n"; $aln_copy->gap_free_sites(); print "Alignment after gap_free_cols inplace:\n",$aln_copy->out_fasta, "\n"; $aln_copy->var_sites(0.6, {ids=>'2 3'}); print "Alignment after var_sites inplace:\n",$aln_copy->out_fasta, "\n"; $aln_copy->invar_sites(); print "Alignment after invar_sites inplace, nothing left :\n",$aln_copy->out_fasta, "\n"; $aln_copy = $aln->aln(); print "\n\nCopied alignment:\n",$aln_copy->out_fasta, "\n"; $aln_copy->inplace(1); $aln_copy->revcom({ids=>'1 3'}); print "Alignment after revcom inplace; select first and last row by id:\n",$aln_copy->out_fasta, "\n"; $aln_copy->reverse(\&has_purine); print "Alignment after reverse inplace; select first and last row by function:\n",$aln_copy->out_fasta, "\n"; $aln_copy->complement([1..2]); print "Alignment after complement inplace; select first and last row by index list:\n",$aln_copy->out_fasta, "\n"; $aln_copy->inplace(0); print "\n\n*** NOW THE SAME, USING USER-SUPPLIED FUNCTIONS: CONSENSUS, (IN)VARIABLE SITES, REVERSE, COMPLEMENT, ETC ***\n"; sub consensus { my @chars = @{ $_[0] }; my %temp = (); my $threshold = 0.34; # more than 1/3 my @list = sort { $temp{$a}<=>$temp{$b} } grep ++$temp{$_} >= $threshold * ($#chars+1), @chars; # In case of a tie, it's not specified which residue is in $list[-1] return (defined($list[-1]) ? $list[-1] : 'N'); } sub maj_dominance { my $str = join "", @{ $_[0] }; my $first_res = @{ $_[0] }[0]; $str =~ s/$first_res//g; if ($str) {return 1;} else {return 0;} } sub gap_free { my $str = join "", @{ $_[0] }; if ($str =~ /-/) {return 0;} else {return 1;} } sub reverse_ { return [ reverse @{ $_[0] } ]; } sub complement { my $str = join "", @{ $_[0] }; $str =~ tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/; return [ split "", $str ]; } print "\nCurrent alignment:\n",$aln->layout; @resSlice = $aln->map_c(\&maj_dominance); print "\nDominated sites:\n", @resSlice, "\n"; @resSlice = $aln->map_c(\&gap_free); print "\nGap-free sites:\n", @resSlice, "\n"; @resSlice = $aln->map_c(\&consensus); print "\nUser-supplied consensus per column, threshold 0.34 (see above):\n", @resSlice, "\n"; # You can also parametrize your functions by using ``closures'' (See # ``Programming Perl, 2nd Ed., p.253). Basically, you set up a function # that has the parameter built into it, and pass around the reference to # that function. Here's a function that does the set-up: sub _setup_consensus_with_threshold { my $threshold = shift; return sub { my @chars = @{ $_[0] }; my %temp = (); my @list = sort { $temp{$a}<=>$temp{$b} } grep ++$temp{$_} >= $threshold * ($#chars+1), @chars; # In case of a tie, it's not specified which residue is in $list[-1] return (defined($list[-1]) ? $list[-1] : 'N'); } } # Here, you create an instance of the function, incl. the parameter. # $consensus holds a reference to the new instance. my $consensus = _setup_consensus_with_threshold(0.75); # and finally, you pass the reference to the function around, e.g. to map_c(). @resSlice = $aln->map_c(\&$consensus); # (You can do pretty cute tricks using closures, e.g. you can set a counter # to 1 in the setup function, and increment it in the ``real'' function.) print "\nUser-supplied consensus per column, threshold 0.75:\n", @resSlice, "\n"; @resSlice = $aln->map_r(\&consensus, [ 1,2,3 ]); print "\nUser-supplied consensus of rows (;-)\n", @resSlice, " Note that in case of a tie, the winning residue is selected arbitrarily\n"; @resSlice = $aln->map_c(\&consensus, \&has_purine); print "\nUser-supplied consensus of the columns that have purine\n", @resSlice, "\n"; @resSlice = $aln->map_r(\&reverse_); print "\nRows in reverse\n"; for $aref ( @resSlice ) { print @$aref, "\n"; } @resSlice = $aln->map_r(\&complement); print "\nRows in complement\n"; for $aref ( @resSlice ) { print @$aref, "\n"; } print "\n Testing parsing and writing of long files:\n"; if ($Bio::UnivAln::_CLUSTAL) { $aln = Bio::UnivAln->new("t/spears_et_al_alnfile"); } else { print "\n Note: No clustal available. Marginal differences to t/univaln.o result.\n"; $aln = Bio::UnivAln->new("t/spears_et_al.fasta"); } if (!$Bio::UnivAln::_READSEQ) { print "\n Note: No readseq available. The output file will differ from t/univaln.o\n"; carp "\n Note: No readseq available. Some more expected errors result from this."; } my $tmpfile = POSIX::tmpnam; my $tmpfile2 = POSIX::tmpnam; open (TMP,">$tmpfile") or carp ("Could not open tmpfile: $!"); print TMP $aln->layout("fasta"); close TMP; $aln = Bio::UnivAln->new("$tmpfile"); open (TMP,">$tmpfile2") or carp ("Could not open tmpfile2: $!"); print TMP $aln->layout("PAUP"); close TMP; $aln = Bio::UnivAln->new("$tmpfile2"); open (TMP,">$tmpfile2") or carp ("Could not open tmpfile2: $!"); print TMP $aln->layout("MSF"); close TMP; $aln = Bio::UnivAln->new("$tmpfile2"); open (TMP,">$tmpfile2") or carp ("Could not open tmpfile2: $!"); print TMP $aln->layout("PIR"); close TMP; $aln = Bio::UnivAln->new("$tmpfile2"); open (TMP,">$tmpfile2") or carp ("Could not open tmpfile2: $!"); print TMP $aln->layout("fasta"); close TMP; print "Differences after several parse/write iterations:\n", `diff $tmpfile $tmpfile2`; unlink $tmpfile, $tmpfile2; if ($Bio::UnivAln::_CLUSTAL) { print "\n Parsing clustal format via clustal.\n"; $aln = Bio::UnivAln->new("t/clustal_alnfile"); } else { print "\n Note: No clustal available. Marginal differences to t/univaln.o result.\n"; $aln = Bio::UnivAln->new("t/fasta_alnfile"); } print "\n aln in MSF format, via readseq:\n", $aln->layout("MSF"), "\n"; print "\n Parsing msf via readseq.\n"; $aln = Bio::UnivAln->new("t/msf_alnfile"); print "\n aln in PAUP format, via readseq:\n", $aln->layout("PAUP"), "\n"; print "\n Parsing paup via readseq.\n"; $aln = Bio::UnivAln->new("t/paup_alnfile"); print "\n aln in PIR format, via readseq; note the problem with the first 6 bases:\n", $aln->layout("PIR"), "\n"; print "\n Parsing pir via readseq.\n"; $aln = Bio::UnivAln->new("t/pir_alnfile"); print "\n aln in ASN.1 format, via readseq:\n", $aln->layout("ASN.1"), "\n"; #Readseq doesn't seem to be able to handle self-produced ASN.1 -- this doesn't work #print "\n Parsing asn.1 via readseq.\n"; #$aln = Bio::UnivAln->new("t/asn.1_alnfile"); print "\n aln in pretty format, via readseq:\n", $aln->layout("Pretty"), "\n"; __END__