#!/usr/bin/perl # Change the first line to reflect where your perl sits. # This is the Multiclustal script. It does an all inclusive script # commandline = MC.pl inputfile -deep(optional) # MC.pl version 1.0 - 4 February 2000 - reordering and commandline handling modifications by Jia Wei. # Copyright 2000Merck & Co., Inc. # All Rights Reserved - This tool may not be redistributed or modified without the knowledge of authors. # Refer to the accompanying README file on running this script. # A detailed description of this tool is described in Yuan, et al in Bioinformatics, 15, 862-863, 1999. # If you have any questions regarding this tool contact Jeffrey Yuan at jeffrey_yuan@merck.com. # MC.pl version 1.1 Modified by Dmitri Mikhailov, SGI-ChemBio to reuse the trees, March 2000. # Look for SGI_DM comments. ###### # Correction by Moriyama Lab at UNL: # In the 'reorder' subroutine, this version looks for '.ph' file instead of '.dnd'. '.ph' file # (not '.dnd' file) is the default tree output file from ClustalW when -tree option is used. # Etsuko Moriyama Lab at UNL, Nov/14/2002 ###### $infile = shift; if (!$infile) { die "I need an input file, usage - multiclustal.pl inputfile -deep(optional)\n"; } if ($infile =~ /_/) { die "Sorry this input file has an underscore in its name, remove the underscore and rerun.\n"; } $second = shift; if($second =~ /^(-deep|-d)/){ $deep=1; } else { $deep=0; } open (C,$infile) || die " Sorry this file does not exist! BYE!\n"; close (C); # donot need file open open (OUT,">Final_score"); @matrices = ("blosum","pam","gonnet","id"); # all the matrices &round1; # do all the matrices &score; # generate rtf of output for round 1 &reorder; # reorder the inputfile &round2; # do gap open with reordered file &score; # generate rtf of output for round 2 &round3; # do all the gap extensions &score; &round4; # repeat matrices &score; # final score close(OUT); # close Final_score ################################ sub round1 { #round 1 - do all the matrices first @alnfiles=(); # initialize filearray foreach $pairmatrix (@matrices) { # step through pairwise matrices # SGI_DM do just 1st element of 2nd loop and use the calculated tree for the rest $multimatrix=@matrices[0]; $outfile=$infile."_".$pairmatrix."_".$multimatrix."_.aln"; $treefile=$infile.".dnd"; push(@alnfiles,$outfile); # put name into file array print "clustalw -infile=$infile -pwmatrix=$pairmatrix -matrix=$multimatrix -align -newtree=$treefile -outfile=$outfile\n"; open (CLU, "clustalw -infile=$infile -pwmatrix=$pairmatrix -matrix=$multimatrix -align -newtree=$treefile -outfile=$outfile|"); while () {} # do nothing #SGI_DM use the tree from 1st step to calculate the multiple alignment for the rest foreach $multimatrix (@matrices[1..$#matrices]) { # step through multiple matrices $outfile=$infile."_".$pairmatrix."_".$multimatrix."_.aln"; push(@alnfiles,$outfile); # put name into file array print "clustalw -infile=$infile -pwmatrix=$pairmatrix -matrix=$multimatrix -usetree=$treefile -outfile=$outfile\n"; open (CLU, "clustalw -infile=$infile -pwmatrix=$pairmatrix -matrix=$multimatrix -usetree=$treefile -outfile=$outfile|"); while () {} # do nothing } } } ####################### sub round2 { # round 2 - do gap open with reorder and new matrices @alnfiles=(); if ($deep) { $step=1; } else { $step=4; } for ($pgo=20;$pgo>=0;$pgo=$pgo-$step) { # pairwise gap open 20 to 0 step 4 # SGI_DM do just 1st element of 2nd loop and use the calculated tree for the rest $mgo=20; $outfile=$reorderfile."_".$pgo."_".$pairmat."_".$mgo."_".$multimat."_.aln"; $treefile=$reorderfile.".dnd"; push(@alnfiles,$outfile); #put name into file array chomp ($pairmat); chomp ($multimat); open (CLU, "clustalw -infile=$reorderfile -pwmatrix=$pairmat -pwgapopen=$pgo -matrix=$multimat -gapopen=$mgo -align -newtree=$treefile -outfile=$outfile|"); while () {} # do nothing print "clustalw -infile=$reorderfile -pwmatrix=$pairmat -matrix=$multimat -pwgapopen=$pgo -gapopen=$mgo -align -newtree=$treefile -outfile=$outfile\n"; #SGI_DM use the tree from 1st step to calculate the multiple alignment for the rest for ($mgo=20-$step;$mgo>=0;$mgo=$mgo-$step) { $outfile=$reorderfile."_".$pgo."_".$pairmat."_".$mgo."_".$multimat."_.aln"; push(@alnfiles,$outfile); #put name into file array chomp ($pairmat); chomp ($multimat); open (CLU, "clustalw -infile=$reorderfile -pwmatrix=$pairmat -pwgapopen=$pgo -matrix=$multimat -gapopen=$mgo -usetree=$treefile -outfile=$outfile|"); while () {} # do nothing print "clustalw -infile=$reorderfile -pwmatrix=$pairmat -matrix=$multimat -pwgapopen=$pgo -gapopen=$mgo -usetree=$treefile -outfile=$outfile\n"; } } } ####################### sub round3 { # do gap extensions @alnfiles=(); if ($deep) { $mxstep=0.005; $multiplier = 1000; } else { $mxstep=0.02; $multiplier = 100; } $mxstep=$mxstep * $multiplier; @parameters=split(/_/,$bestfile); $pgo=$parameters[1]; $pmat=$parameters[2]; $mgo=$parameters[3]; $mmat=$parameters[4]; print "$pgo $pairmat $mgo $multimat |\n"; for ($pgxcount = 5 ; $pgxcount >= 0;$pgxcount = $pgxcount - 1) { # pairwise extension penalty 0.5 to 0 step 0.1 # SGI_DM do just 1st element of 2nd loop and use the calculated tree for the rest $mgxcount = int(0.1*$multiplier); $pgx = $pgxcount / 10; $mgx = $mgxcount / $multiplier; $outfile=$reorderfile."_".$pgo."_".$pgx."_".$pmat."_".$mgo."_".$mgx."_".$mmat."_.aln"; $treefile=$reorderfile.".dnd"; push(@alnfiles,$outfile); #put name into file array open(CLU, "clustalw -infile=$reorderfile -pwgapopen=$pgo -pwgapext=$pgx -gapopen=$mgo -gapext=$mgx -pwmatrix=$pmat -matrix=$mmat -align -newtree=$treefile -outfile=$outfile|"); while (){} # do nothing print "clustalw -infile=$reorderfile -pwgapopen=$pgo -pwgapext=$pgx -gapopen=$mgo -gapext=$mgx -pwmatrix=$pmat -matrix=$mmat -align -newtree=$treefile -outfile=$outfile|"; #SGI_DM use the tree from 1st step to calculate the multiple alignment for the rest for ($mgxcount = int(0.1*$multiplier)-$mxstep;$mgxcount >= 0;$mgxcount = $mgxcount - $mxstep) { # multiple extension penalty 0.1 to 0 step 0.02 $pgx = $pgxcount / 10; $mgx = $mgxcount / $multiplier; $outfile=$reorderfile."_".$pgo."_".$pgx."_".$pmat."_".$mgo."_".$mgx."_".$mmat."_.aln"; push(@alnfiles,$outfile); #put name into file array open(CLU, "clustalw -infile=$reorderfile -pwgapopen=$pgo -pwgapext=$pgx -gapopen=$mgo -gapext=$mgx -pwmatrix=$pmat -matrix=$mmat -usetree=$treefile -outfile=$outfile|"); while (){} # do nothing print "clustalw -infile=$reorderfile -pwgapopen=$pgo -pwgapext=$pgx -gapopen=$mgo -gapext=$mgx -pwmatrix=$pmat -matrix=$mmat -usetree=$treefile -outfile=$outfile\n"; } } } ####################### sub round4 { # repeat matrices again @alnfiles=(); # reinitialize file name array @parameters=split(/_/,$bestfile); $pgo=$parameters[1]; # pairwise gapopen $pgx=$parameters[2]; # pairwise gapextension $mgo=$parameters[4]; # multiple gapopen $mgx=$parameters[5]; # multiple gapextension foreach $pairmatrix (@matrices) { # step through pairwise matrices # SGI_DM do just 1st element of 2nd loop and use the calculated tree for the rest $multimatrix=@matrices[0]; $outfile=$reorderfile."_".$pgo."_".$pgx."_".$pairmatrix."_".$mgo."_".$mgx."_".$multimatrix."_.aln"; $treefile=$reorderfile.".dnd"; push(@alnfiles,$outfile); # put name into file array print "clustalw -infile=$reorderfile -pwmatrix=$pairmatrix -matrix=$multimatrix -pwgapopen=$pgo -pwgapext=$pgx -gapopen=$mgo -gapext=$mgx -align -newtree=$treefile -outfile=$outfile\n"; open (CLU, "clustalw -infile=$reorderfile -pwmatrix=$pairmatrix -matrix=$multimatrix -pwgapopen=$pgo -pwgapext=$pgx -gapopen=$mgo -gapext=$mgx -align -newtree=$treefile -outfile=$outfile|"); while () {} # do nothing #SGI_DM use the tree from 1st step to calculate the multiple alignment for the rest foreach $multimatrix (@matrices[1..$#matrices]) { # step through multiple matrices $outfile=$reorderfile."_".$pgo."_".$pgx."_".$pairmatrix."_".$mgo."_".$mgx."_".$multimatrix."_.aln"; push(@alnfiles,$outfile); # put name into file array print "clustalw -infile=$reorderfile -pwmatrix=$pairmatrix -matrix=$multimatrix -pwgapopen=$pgo -pwgapext=$pgx -gapopen=$mgo -gapext=$mgx -usetree=$treefile -outfile=$outfile\n"; open (CLU, "clustalw -infile=$reorderfile -pwmatrix=$pairmatrix -matrix=$multimatrix -pwgapopen=$pgo -pwgapext=$pgx -gapopen=$mgo -gapext=$mgx -usetree=$treefile -outfile=$outfile|"); while () {} # do nothing } } ####################### sub score { $roundnumber++; # inclement every round of multiclustal $best=-999999999999; # clear best score $bestfile=""; # generate rtf and score files foreach $filename(@alnfiles) { # get most recent aln files $outf=$filename.".rtf"; # output rtf file name print "$filename\n"; print "boxshade -in=$filename -out=$outf -dev=4 -type=2 -thr=0.2 -def\n"; open (CLU, "boxshade -in=$filename -out=$outf -dev=4 -type=2 -thr=0.2 -def|"); while () {} # do nothing &scoreboxshade; # score boxshade output } print OUT "Round $roundnumber - Best score = $best for $bestfile\n"; } ######################## sub reorder { # this subroutine reorders the inputfile @pairmultimat=split(/_/,$bestfile); # split out matrices from bestfile of round1 $pairmat=$pairmultimat[1]; $multimat=$pairmultimat[2]; open (CLU,"clustalw -infile=$infile -tree -pwmatrix=$pairmat -matrix=$multimat|"); # generate tree only while () {} # do nothing print "reorder!\n"; $reorderfile=$infile.".reorder"; open (R,">$reorderfile")|| die "no reorder file\n"; @dnd=qx("ls"); foreach $dndf(@dnd) { # 'dnd' was changed to 'ph' in the next line. Nov/14/2002 Moriyama Lab at UNL if ($dndf=~/ph\b/) { # find the ph file $opendnd=$dndf; } } # 'dnd' was changed to 'ph' in the next line. Nov/14/2002 Moriyama Lab at UNL open (P,$opendnd)|| die "no ph file\n"; while ($ps=

) { # read ph file first if ($ps =~ /^\w/) { # skip paren and colon $ps =~ /^(.*)\:(.*)/; $quoted=$1; print "orig = $ps quoted=$quoted\n"; open (I,$infile); # open multiple fasta each time while ($fasta=) { if ($fasta =~/^>/ ) { # got fasta header $fasta=~/^>(\S+)/; $fastaacc=$1; if ($fastaacc eq $quoted) { # print "quoted=$quoted fasta=$fastaacc\n"; $gotfasta=1; } else { $gotfasta=0; print "NO $fastaacc $quoted\n"; } } if ($gotfasta) { print R $fasta; } } close (I); } } close (P); close (R); } ######################## sub scoreboxshade { # subroutine for scoring boxshade output open (I,$outf); # open previous boxshade output $similarity=0; $identity=0; while ($input=) { if ($input=~/cf2\s/) { # got id @id=split(/cf2\s/,$input); # look for lines that begin with cf2 - these are identities $seqid=$id[1]; chomp($seqid); @idcnt=split(//,$seqid); foreach $idchar(@idcnt) { $identity++; # count chars } } if ($input=~/cf4\s/) { # got sim @sim=split(/cf4\s/,$input); $seqsim=$sim[1]; chomp($seqsim); @simcnt=split(//,$seqsim); foreach $simchar(@simcnt) { $similarity++; } } } $total=$identity+$similarity; close (I); &scorealn; # now score aln output for islands } ##################### sub scorealn { # subroutine for scoring aln output %alignment=(); $island=0; $gap=0; print $filename; open (IN, $filename); while ($input=) { if ($input!~/^CLUSTAL/) { # ignore the first line if ($input!~/^\W/){ # ignore all blank lines @gene=split(/\s+/,$input); # get name and alignment if ($actualposition == 1) { # create array of names only once push @name,$gene[0]; # push the gene name only once } chomp($gene[1]); $alignment{$gene[0]}.=$gene[1];# to associative array, concatenate sequence for the line } else { $positioncnt++; # count groups $actualposition=$positioncnt/2; # needed to correct of double blank spaces in clustalw output } } } close (IN); while (($gene,$align) = each(%alignment)) { # cycle through the associative array @start=split(/-/,$align); foreach $dash(@start) { # start to count the islands if ($dash=~/\b\w\b/) { # singleton $island++; } if ($dash=~/\b\w\w\b/) { # doubleton $island=$island+0.5; } if ($dash=~/\b\w\w\w\b/) { # tripleton $island=$island+0.25; } if ($dash=~/\b\w\w\w\w\b/) { # quadton $island=$island+0.125; } } } # Count gap islands while (($gene,$align) = each(%alignment)) { # cycle through the associative array - again for gap islands @gapisland=split(/[^-]/,$align); # split after anything but - foreach $nongap(@gapisland) { if ($nongap=~/-{1,}/) { # if gap island exists, count as 1 #print "$nongap\n"; $gap++; } } } $finalscore=$total-($island+$gap); # calculate finalscore print OUT "Round $roundnumber $filename has $similarity similarities $identity identities $gap gap score $island islands Final score=$finalscore\n"; if ($finalscore > $best) { $bestgap=$gap; # keep track of gap penalty score $best=$finalscore; $bestfile=$filename; } else { if ($finalscore == $best) { # what happens when the scores are the same if ($gap < $bestgap) { # keep lower gap $bestgap=$gap; $bestfile=$filename; $best=$finalscore; } } } close (IN); } }