#!/usr/bin/perl # Change the first line to reflect where your perl sits. # This is the Multiclustal script. It does an all inclusive script # commandline = MC.pl inputfile -deep(optional) -nuc(optional) # MC.pl version 1.0 - 4 February 2000 - reordering and commandline handling modifications by Jia Wei. # Copyright 2000Merck & Co., Inc. # All Rights Reserved - This tool may not be redistributed or modified without the knowledge of authors. # Refer to the accompanying README file on running this script. # A detailed description of this tool is described in Yuan, et al in Bioinformatics, 15, 862-863, 1999. # If you have any questions regarding this tool contact Jeffrey Yuan at jeffrey_yuan@merck.com. # MC.pl version 1.1 Modified by Dmitri Mikhailov, SGI-ChemBio to reuse the trees, March 2000. # Look for SGI_DM comments. ###### # Correction by Moriyama Lab at UNL: # In the 'reorder' subroutine, this version looks for '.ph' file instead of '.dnd'. '.ph' file # (not '.dnd' file) is the default tree output file from ClustalW when -tree option is used. # Etsuko Moriyama Lab at UNL, Nov/14/2002 ###### ###### # Due to a possible bug in the clustalw1.82, "pam" matrix cannot be recoganized # from a command line. All "clustalw" commands were changed to explicitly use # "clustalw1.81" to avoid the confusion. # # Esuko Moriyama, Nov/26/2002 ###### ###### # Modification to use Multiclustal for DNA alignment. # Now it takes one more option: # MC.pl inputfile -deep(optional) -nuc(optional) # # Esuko Moriyama, Dec/17/2002 ###### $infile = shift; if (!$infile) { die "I need an input file, usage - multiclustal.pl inputfile -deep(optional) -nuc(optional)\n"; } if ($infile =~ /_/) { die "Sorry this input file has an underscore in its name, remove the underscore and rerun.\n"; } # Modified for DNA by Moriyama $deep = $nuc = 0; while (defined($second = shift)) { if($second =~ /^(-deep|-d)/){ $deep=1; } elsif ($second =~ /^(-nuc|-n)/){ $dna=1; } } print "MULTICLUSTAL, modified by Etsuko Moriyama for DNA alignments.\n\n"; print " deep option: "; if ($deep) { print "ON\n"; } else { print "OFF\n"; } print " Scoring matrices: "; if ($dna) { print "DNA\n\n"; } else { print "protein\n\n"; } # end open (C,$infile) || die " Sorry this file does not exist! BYE!\n"; close (C); # donot need file open open (OUT,">Final_score"); @matrices = ("blosum","pam","gonnet","id"); # all the matrices # Modified for DNA by Moriyama if (! $dna) { &round1; # do all the matrices } else { &round1dna; # do transition weight for DNA } #end &score; # generate rtf of output for round 1 &reorder; # reorder the inputfile &round2; # do gap open with reordered file &score; # generate rtf of output for round 2 # Modified for DNA by Moriyama if (! $dna) { &round3; # do all the gap extensions &score; &round4; # repeat matrices } else { &round3dna; # do all the gap extensions &score; &round4dna; # repeat transition weight for DNA } #end &score; # final score close(OUT); # close Final_score ################################ sub round1 { #round 1 - do all the matrices first @alnfiles=(); # initialize filearray foreach $pairmatrix (@matrices) { # step through pairwise matrices # SGI_DM do just 1st element of 2nd loop and use the calculated tree for the rest $multimatrix=@matrices[0]; $outfile=$infile."_".$pairmatrix."_".$multimatrix."_.aln"; $treefile=$infile.".dnd"; push(@alnfiles,$outfile); # put name into file array print "clustalw1.81 -infile=$infile -pwmatrix=$pairmatrix -matrix=$multimatrix -align -newtree=$treefile -outfile=$outfile\n"; open (CLU, "clustalw1.81 -infile=$infile -pwmatrix=$pairmatrix -matrix=$multimatrix -align -newtree=$treefile -outfile=$outfile|"); while () {} # do nothing #SGI_DM use the tree from 1st step to calculate the multiple alignment for the rest foreach $multimatrix (@matrices[1..$#matrices]) { # step through multiple matrices $outfile=$infile."_".$pairmatrix."_".$multimatrix."_.aln"; push(@alnfiles,$outfile); # put name into file array print "clustalw1.81 -infile=$infile -pwmatrix=$pairmatrix -matrix=$multimatrix -usetree=$treefile -outfile=$outfile\n"; open (CLU, "clustalw1.81 -infile=$infile -pwmatrix=$pairmatrix -matrix=$multimatrix -usetree=$treefile -outfile=$outfile|"); while () {} # do nothing } } } ################################ # Written for DNA sequence alignment by E. N. Moriyama sub round1dna { #round 1 - do transition weight @alnfiles=(); # initialize file array $multimatrix= $pairmatrix = "iub"; # only one matrix for DNA for ($tsw=0;$tsw<=1;$tsw=$tsw+0.1) { # transition weight 0 to 1 step 0.1 $outfile=$infile."_".$pairmatrix."_".$multimatrix."_".$tsw."_.aln"; push(@alnfiles,$outfile); #put name into file array open (CLU, "clustalw1.81 -infile=$infile -pwdnamatrix=$pairmatrix -transweight=$tsw -dnamatrix=$multimatrix -align -outfile=$outfile|"); while () {} # do nothing print "clustalw1.81 -infile=$infile -pwdnamatrix=$pairmatrix -transweight=$tsw -dnamatrix=$multimatrix -align -outfile=$outfile\n"; } } ####################### sub round2 { # round 2 - do gap open with reorder and new matrices @alnfiles=(); if ($deep) { $step=1; } # modified for DNA by Moriyama else { if ($dna) { $step=5; } else { $step=4; } } if ($dna) { $start=30; } else { $start=20; } #for ($pgo=20;$pgo>=0;$pgo=$pgo-$step) { # pairwise gap open 20 to 0 step 4 for ($pgo=$start;$pgo>=0;$pgo=$pgo-$step) { # pairwise gap open 30 to 0 step 5 # SGI_DM do just 1st element of 2nd loop and use the calculated tree for the rest # $mgo=20; $mgo=$start; $treefile=$reorderfile.".dnd"; chomp ($pairmat); chomp ($multimat); if ($dna) { chomp ($tsweight); $outfile=$reorderfile."_".$pgo."_".$pairmat."_".$mgo."_".$multimat."_".$tsweight."_.aln"; push(@alnfiles,$outfile); #put name into file array open (CLU, "clustalw1.81 -infile=$reorderfile -transweight=$tsweight -pwdnamatrix=$pairmat -pwgapopen=$pgo -dnamatrix=$multimat -gapopen=$mgo -align -newtree=$treefile -outfile=$outfile|"); while () {} # do nothing print "clustalw1.81 -infile=$reorderfile -transweight=$tsweight -pwdnamatrix=$pairmat -dnamatrix=$multimat -pwgapopen=$pgo -gapopen=$mgo -align -newtree=$treefile -outfile=$outfile\n"; } else { $outfile=$reorderfile."_".$pgo."_".$pairmat."_".$mgo."_".$multimat."_.aln"; push(@alnfiles,$outfile); #put name into file array open (CLU, "clustalw1.81 -infile=$reorderfile -transweight=$tsweight -pwmatrix=$pairmat -pwgapopen=$pgo -matrix=$multimat -gapopen=$mgo -align -newtree=$treefile -outfile=$outfile|"); while () {} # do nothing print "clustalw1.81 -infile=$reorderfile -transweight=$tsweight -pwmatrix=$pairmat -matrix=$multimat -pwgapopen=$pgo -gapopen=$mgo -align -newtree=$treefile -outfile=$outfile\n"; } #SGI_DM use the tree from 1st step to calculate the multiple alignment for the rest # for ($mgo=20-$step;$mgo>=0;$mgo=$mgo-$step) { for ($mgo=$start-$step;$mgo>=0;$mgo=$mgo-$step) { chomp ($pairmat); chomp ($multimat); if ($dna) { chomp ($tsweight); $outfile=$reorderfile."_".$pgo."_".$pairmat."_".$mgo."_".$multimat."_".$tsweight."_.aln"; push(@alnfiles,$outfile); #put name into file array open (CLU, "clustalw1.81 -infile=$reorderfile -transweight=$tsweight -pwdnamatrix=$pairmat -pwgapopen=$pgo -dnamatrix=$multimat -gapopen=$mgo -usetree=$treefile -outfile=$outfile|"); while () {} # do nothing print "clustalw1.81 -infile=$reorderfile -transweight=$tsweight -pwdnamatrix=$pairmat -dnamatrix=$multimat -pwgapopen=$pgo -gapopen=$mgo -usetree=$treefile -outfile=$outfile\n"; } else { $outfile=$reorderfile."_".$pgo."_".$pairmat."_".$mgo."_".$multimat."_.aln"; push(@alnfiles,$outfile); #put name into file array open (CLU, "clustalw1.81 -infile=$reorderfile -transweight=$tsweight -pwmatrix=$pairmat -pwgapopen=$pgo -matrix=$multimat -gapopen=$mgo -usetree=$treefile -outfile=$outfile|"); while () {} # do nothing print "clustalw1.81 -infile=$reorderfile -transweight=$tsweight -pwmatrix=$pairmat -matrix=$multimat -pwgapopen=$pgo -gapopen=$mgo -usetree=$treefile -outfile=$outfile\n"; } } } } ####################### sub round3 { # do gap extensions @alnfiles=(); if ($deep) { $mxstep=0.005; $multiplier = 1000; } else { $mxstep=0.02; $multiplier = 100; } $mxstep=$mxstep * $multiplier; @parameters=split(/_/,$bestfile); $pgo=$parameters[1]; $pmat=$parameters[2]; $mgo=$parameters[3]; $mmat=$parameters[4]; print "$pgo $pairmat $mgo $multimat |\n"; for ($pgxcount = 5 ; $pgxcount >= 0;$pgxcount = $pgxcount - 1) { # pairwise extension penalty 0.5 to 0 step 0.1 # SGI_DM do just 1st element of 2nd loop and use the calculated tree for the rest $mgxcount = int(0.1*$multiplier); $pgx = $pgxcount / 10; $mgx = $mgxcount / $multiplier; $outfile=$reorderfile."_".$pgo."_".$pgx."_".$pmat."_".$mgo."_".$mgx."_".$mmat."_.aln"; $treefile=$reorderfile.".dnd"; push(@alnfiles,$outfile); #put name into file array open(CLU, "clustalw1.81 -infile=$reorderfile -pwgapopen=$pgo -pwgapext=$pgx -gapopen=$mgo -gapext=$mgx -pwmatrix=$pmat -matrix=$mmat -align -newtree=$treefile -outfile=$outfile|"); while (){} # do nothing print "clustalw1.81 -infile=$reorderfile -pwgapopen=$pgo -pwgapext=$pgx -gapopen=$mgo -gapext=$mgx -pwmatrix=$pmat -matrix=$mmat -align -newtree=$treefile -outfile=$outfile|"; #SGI_DM use the tree from 1st step to calculate the multiple alignment for the rest for ($mgxcount = int(0.1*$multiplier)-$mxstep;$mgxcount >= 0;$mgxcount = $mgxcount - $mxstep) { # multiple extension penalty 0.1 to 0 step 0.02 $pgx = $pgxcount / 10; $mgx = $mgxcount / $multiplier; $outfile=$reorderfile."_".$pgo."_".$pgx."_".$pmat."_".$mgo."_".$mgx."_".$mmat."_.aln"; push(@alnfiles,$outfile); #put name into file array open(CLU, "clustalw1.81 -infile=$reorderfile -pwgapopen=$pgo -pwgapext=$pgx -gapopen=$mgo -gapext=$mgx -pwmatrix=$pmat -matrix=$mmat -usetree=$treefile -outfile=$outfile|"); while (){} # do nothing print "clustalw1.81 -infile=$reorderfile -pwgapopen=$pgo -pwgapext=$pgx -gapopen=$mgo -gapext=$mgx -pwmatrix=$pmat -matrix=$mmat -usetree=$treefile -outfile=$outfile\n"; } } } ####################### # Written for DNA sequence alignment by E. N. Moriyama sub round3dna { # do gap extensions @alnfiles=(); if ($deep) { $step=1; } else { $step=2; } @parameters=split(/_/,$bestfile); $pgo=$parameters[1]; $pmat=$parameters[2]; $mgo=$parameters[3]; $mmat=$parameters[4]; $tsw=$parameters[5]; print "$pgo $pairmat $mgo $multimat $tws|\n"; for ($pgx = 10; $pgx >= 0;$pgx = $pgx - $step) { # pairwise extension penalty 10 to 0 step 2 # SGI_DM do just 1st element of 2nd loop and use the calculated tree for the rest $mgx = 10; $outfile=$reorderfile."_".$pgo."_".$pgx."_".$pmat."_".$mgo."_".$mgx."_". $mmat."_".$tsw."_.aln"; $treefile=$reorderfile.".dnd"; push(@alnfiles,$outfile); #put name into file array open(CLU, "clustalw1.81 -infile=$reorderfile -pwgapopen=$pgo -pwgapext=$pgx -gapopen=$mgo -gapext=$mgx -pwdnamatrix=$pmat -dnamatrix=$mmat -transweight=$tsw -align -newtree=$treefile -outfile=$outfile|"); while (){} # do nothing print "clustalw1.81 -infile=$reorderfile -pwgapopen=$pgo -pwgapext=$pgx -gapopen=$mgo -gapext=$mgx -pwdnamatrix=$pmat -dnamatrix=$mmat -transweight=$tsw -align -newtree=$treefile -outfile=$outfile|"; #SGI_DM use the tree from 1st step to calculate the multiple alignment for the rest for ($mgx = $mgx-$step; $mgx >= 0;$mgx = $mgx - $step) { # multiple extension penalty 10 to 0 step 2 $outfile=$reorderfile."_".$pgo."_".$pgx."_".$pmat."_".$mgo."_".$mgx. "_".$mmat."_".$tsw."_.aln"; push(@alnfiles,$outfile); #put name into file array open(CLU, "clustalw1.81 -infile=$reorderfile -pwgapopen=$pgo -pwgapext=$pgx -gapopen=$mgo -gapext=$mgx -pwdnamatrix=$pmat -dnamatrix=$mmat -transweight=$tsw -usetree=$treefile -outfile=$outfile|"); while (){} # do nothing print "clustalw1.81 -infile=$reorderfile -pwgapopen=$pgo -pwgapext=$pgx -gapopen=$mgo -gapext=$mgx -pwdnamatrix=$pmat -dnamatrix=$mmat -transweight=$tsw -usetree=$treefile -outfile=$outfile\n"; } } } ####################### sub round4 { # repeat matrices again @alnfiles=(); # reinitialize file name array @parameters=split(/_/,$bestfile); $pgo=$parameters[1]; # pairwise gapopen $pgx=$parameters[2]; # pairwise gapextension $mgo=$parameters[4]; # multiple gapopen $mgx=$parameters[5]; # multiple gapextension foreach $pairmatrix (@matrices) { # step through pairwise matrices # SGI_DM do just 1st element of 2nd loop and use the calculated tree for the rest $multimatrix=@matrices[0]; $outfile=$reorderfile."_".$pgo."_".$pgx."_".$pairmatrix."_".$mgo."_".$mgx."_".$multimatrix."_.aln"; $treefile=$reorderfile.".dnd"; push(@alnfiles,$outfile); # put name into file array print "clustalw1.81 -infile=$reorderfile -pwmatrix=$pairmatrix -matrix=$multimatrix -pwgapopen=$pgo -pwgapext=$pgx -gapopen=$mgo -gapext=$mgx -align -newtree=$treefile -outfile=$outfile\n"; open (CLU, "clustalw1.81 -infile=$reorderfile -pwmatrix=$pairmatrix -matrix=$multimatrix -pwgapopen=$pgo -pwgapext=$pgx -gapopen=$mgo -gapext=$mgx -align -newtree=$treefile -outfile=$outfile|"); while () {} # do nothing #SGI_DM use the tree from 1st step to calculate the multiple alignment for the rest foreach $multimatrix (@matrices[1..$#matrices]) { # step through multiple matrices $outfile=$reorderfile."_".$pgo."_".$pgx."_".$pairmatrix."_".$mgo."_".$mgx."_".$multimatrix."_.aln"; push(@alnfiles,$outfile); # put name into file array print "clustalw1.81 -infile=$reorderfile -pwmatrix=$pairmatrix -matrix=$multimatrix -pwgapopen=$pgo -pwgapext=$pgx -gapopen=$mgo -gapext=$mgx -usetree=$treefile -outfile=$outfile\n"; open (CLU, "clustalw1.81 -infile=$reorderfile -pwmatrix=$pairmatrix -matrix=$multimatrix -pwgapopen=$pgo -pwgapext=$pgx -gapopen=$mgo -gapext=$mgx -usetree=$treefile -outfile=$outfile|"); while () {} # do nothing } } ####################### # Written for DNA sequence alignment by E. N. Moriyama sub round4dna { # repeat transition weight again @alnfiles=(); # reinitialize file name array @parameters=split(/_/,$bestfile); $pgo=$parameters[1]; # pairwise gapopen $pgx=$parameters[2]; # pairwise gapextension $mgo=$parameters[4]; # multiple gapopen $mgx=$parameters[5]; # multiple gapextension $pairmatrix = $multimatrix = "iub"; # DNA matrix for ($tsw=0;$tsw<=1;$tsw=$tsw+0.1) { # transition weight 0 to 1 step 0.1 $outfile=$reorderfile."_".$pgo."_".$pgx."_".$pairmatrix."_".$mgo."_".$mgx. "_".$multimatrix."_".$tsw."_.aln"; push(@alnfiles,$outfile); #put name into file array print "clustalw1.81 -infile=$reorderfile -pwdnamatrix=$pairmatrix -dnamatrix=$multimatrix -pwgapopen=$pgo -pwgapext=$pgx -gapopen=$mgo -gapext=$mgx -transweight=$tsw -align -outfile=$outfile\n"; open (CLU, "clustalw1.81 -infile=$reorderfile -pwdnamatrix=$pairmatrix -dnamatrix=$multimatrix -pwgapopen=$pgo -pwgapext=$pgx -gapopen=$mgo -gapext=$mgx -transweight=$tsw -align -outfile=$outfile|"); while () {} # do nothing } } ####################### sub score { $roundnumber++; # inclement every round of multiclustal $best=-999999999999; # clear best score $bestfile=""; # generate rtf and score files foreach $filename(@alnfiles) { # get most recent aln files $outf=$filename.".rtf"; # output rtf file name print "$filename\n"; print "boxshade -in=$filename -out=$outf -dev=4 -type=2 -thr=0.2 -def\n"; open (CLU, "boxshade -in=$filename -out=$outf -dev=4 -type=2 -thr=0.2 -def|"); while () {} # do nothing &scoreboxshade; # score boxshade output } print OUT "Round $roundnumber - Best score = $best for $bestfile\n"; } ######################## sub reorder { # this subroutine reorders the inputfile @pairmultimat=split(/_/,$bestfile); # split out matrices from bestfile of round1 $pairmat=$pairmultimat[1]; $multimat=$pairmultimat[2]; # modified for DNA by Moriyama if ($dna) { $tsweight=$pairmultimat[3]; open (CLU,"clustalw1.81 -infile=$infile -tree -transweight=$tsweight -pwdnamatrix=$pairmat -dnamatrix=$multimat|"); # generate tree only } else { open (CLU,"clustalw1.81 -infile=$infile -tree -pwmatrix=$pairmat -matrix=$multimat|"); # generate tree only } # end while () {} # do nothing print "reorder!\n"; $reorderfile=$infile.".reorder"; open (R,">$reorderfile")|| die "no reorder file\n"; @dnd=qx("ls"); foreach $dndf(@dnd) { # if ($dndf=~/dnd\b/) { # find the dnd file # Changed by Moriyama Lab if ($dndf=~/ph\b/) { # find the ph file ### $opendnd=$dndf; } } # open (P,$opendnd)|| die "no dnd file\n"; open (P,$opendnd)|| die "no ph file\n"; while ($ps=

) { # read dnd file first if ($ps =~ /^\w/) { # skip paren and colon $ps =~ /^(.*)\:(.*)/; $quoted=$1; print "orig = $ps quoted=$quoted\n"; open (I,$infile); # open multiple fasta each time while ($fasta=) { if ($fasta =~/^>/ ) { # got fasta header $fasta=~/^>(\S+)/; $fastaacc=$1; if ($fastaacc eq $quoted) { # print "quoted=$quoted fasta=$fastaacc\n"; $gotfasta=1; } else { $gotfasta=0; print "NO $fastaacc $quoted\n"; } } if ($gotfasta) { print R $fasta; } } close (I); } } close (P); close (R); } ######################## sub scoreboxshade { # subroutine for scoring boxshade output open (I,$outf); # open previous boxshade output $similarity=0; $identity=0; while ($input=) { if ($input=~/cf2\s/) { # got id @id=split(/cf2\s/,$input); # look for lines that begin with cf2 - these are identities $seqid=$id[1]; chomp($seqid); @idcnt=split(//,$seqid); foreach $idchar(@idcnt) { $identity++; # count chars } } if ($input=~/cf4\s/) { # got sim @sim=split(/cf4\s/,$input); $seqsim=$sim[1]; chomp($seqsim); @simcnt=split(//,$seqsim); foreach $simchar(@simcnt) { $similarity++; } } } $total=$identity+$similarity; close (I); &scorealn; # now score aln output for islands } ##################### sub scorealn { # subroutine for scoring aln output %alignment=(); $island=0; $gap=0; print $filename; open (IN, $filename); while ($input=) { if ($input!~/^CLUSTAL/) { # ignore the first line if ($input!~/^\W/){ # ignore all blank lines @gene=split(/\s+/,$input); # get name and alignment if ($actualposition == 1) { # create array of names only once push @name,$gene[0]; # push the gene name only once } chomp($gene[1]); $alignment{$gene[0]}.=$gene[1];# to associative array, concatenate sequence for the line } else { $positioncnt++; # count groups $actualposition=$positioncnt/2; # needed to correct of double blank spaces in clustalw output } } } close (IN); while (($gene,$align) = each(%alignment)) { # cycle through the associative array @start=split(/-/,$align); foreach $dash(@start) { # start to count the islands if ($dash=~/\b\w\b/) { # singleton $island++; } if ($dash=~/\b\w\w\b/) { # doubleton $island=$island+0.5; } if ($dash=~/\b\w\w\w\b/) { # tripleton $island=$island+0.25; } if ($dash=~/\b\w\w\w\w\b/) { # quadton $island=$island+0.125; } } } # Count gap islands while (($gene,$align) = each(%alignment)) { # cycle through the associative array - again for gap islands @gapisland=split(/[^-]/,$align); # split after anything but - foreach $nongap(@gapisland) { if ($nongap=~/-{1,}/) { # if gap island exists, count as 1 #print "$nongap\n"; $gap++; } } } $finalscore=$total-($island+$gap); # calculate finalscore print OUT "Round $roundnumber $filename has $similarity similarities $identity identities $gap gap score $island islands Final score=$finalscore\n"; if ($finalscore > $best) { $bestgap=$gap; # keep track of gap penalty score $best=$finalscore; $bestfile=$filename; } else { if ($finalscore == $best) { # what happens when the scores are the same if ($gap < $bestgap) { # keep lower gap $bestgap=$gap; $bestfile=$filename; $best=$finalscore; } } } close (IN); } }