#! /usr/bin/perl

# domainseqs.pl

# The purpose of this script is to read in a domain table from HMMER and construct 
# a FASTA file containing each of the domain sequences for each of the proteins in 
# the set. The program takes as its command line input:
# dir - the directory containing the Hmmer and FASTA files
# hmmerin - the Hmmer domain table
# fasta - the FASTA file for the protein sequences
# out - the output FASTA file
# overlap - the allowed percentage of overlap for domain predictions
# evalthresh - the HMMER E-value threshold 

# Author         : Brittney (Hinds) Keel
# Date created   : 1 May 2013
# Last modified  : 1 February 2016


use strict;
use Getopt::Long;



# Start by gathering command line options.
my $dir; 
my $hmmer; 
my $help; 
my $fasta; 
my $output;
my $overlap;
my $evalthresh;


if (@ARGV > 0) { # if command line options are supplied
   my $opt = GetOptions('help!',     \$help,
                        'dir:s',     \$dir,
                        'hmmerin:s', \$hmmer,
                        'fasta:s',   \$fasta,
			'overlap:s', \$overlap,
                        'out:s',     \$output,
                        'e:s',       \$evalthresh);
} else { 
   print "Usage of this script:\n";
   print "Please supply command line options.\n";
   print "Example : ./domainseqs.pl -dir directory -hmmerin hmmerfile\n";
   print "-fasta proteinfastafile -overlap overlappercentage -e evalthresh -out outputfastafile\n";
   exit;

}


if ($help) { # if the user requests help
   print "Usage of this script:\n";
   print "Please supply command line options.\n";
   print "Example : ./domainseqs.pl -dir directory -hmmerin hmmerfile\n";
   print "-fasta proteinfastafile -overlap overlappercetage -e evalthresh -out outputfastafile\n";
   exit;
}


# Generate the hash of hashes that contains all of the protein-domain information.
my %proteins;


# Read in protein FASTA file.
my $fastafile = "<"."$dir".'/'."$fasta"; 
my $INPUT_FILE1; 
open($INPUT_FILE1, $fastafile) or die "Can't open input file : $fastafile"; 
my $id; 

while (my $line = readline($INPUT_FILE1)) {
   my $line_first_value = substr($line, 0, 1); 
   if ($line_first_value eq '>') { # if identifier line
      my @line_elems = split /\s+/, $line; 
      $id = $line_elems[0]; 
      $id =~ s/>//g;   
   } else { # if sequence line
      $proteins{$id}{'sequence'} = $line;
   }
} 

close($INPUT_FILE1) or die "Can't close file : $fastafile because $!"; 


# Read in Hmmer domain table.
my $hmmer_file = "<"."$dir".'/'."$hmmer"; 
my $INPUT_FILE2;
open($INPUT_FILE2, $hmmer_file) or die "Can't open input file : $hmmer_file";

my $first_line = readline($INPUT_FILE2); # the header line of the file (removed since its not used)
my $second_line = readline($INPUT_FILE2); # another header line
my $third_line = readline($INPUT_FILE2); # another header line

my %locations;
my %all_info;
my %prot_dom;

while (my $line = readline($INPUT_FILE2)) {
   if (!($line =~ m/^#/)) {
      my @line_vals = split /\s+/, $line;
      my $prot_id = $line_vals[3]; # the protein identifier
      my $prot_len = $line_vals[5]; # the length of the protein
      my $domain_acc = $line_vals[1]; # the domain identifier
      my $domain_start = $line_vals[19]; # the start position of the domain on the sequence
      my $domain_end = $line_vals[20]; # the end position of the domain on the sequence
      my $eval = $line_vals[12]; # the domain E-value

      $proteins{$prot_id}{'length'} = $prot_len; # save protein length to hash

      if(!($all_info{$prot_id}{$domain_acc})) { # if top domain hit doesn't exist
         $all_info{$prot_id}{$domain_acc}{'start'} = $domain_start; 
         $all_info{$prot_id}{$domain_acc}{'end'} = $domain_end;
         $all_info{$prot_id}{$domain_acc}{'evalue'} = $eval;
         push(@{$prot_dom{$prot_id}},$domain_acc);
      }
   }
} 


# Now sort the info for overlap and by E-value.
foreach my $p (keys(%all_info)) { # goes over each of the proteins
   my %inner_hash; # re-initializes the hash
   my $ref = $prot_dom{$p};
   my @doms = @$ref;

   foreach my $d (@doms) {
      $inner_hash{$d} = $all_info{$p}{$d}{'evalue'};
   }
  
   foreach my $e (sort { ($inner_hash{$a}+0) <=> ($inner_hash{$b}+0) } keys %inner_hash) { 
     if (($e+0) < $evalthresh) {
        if (!($proteins{$p}{$e})) { # if the top hit for the domain does not already exist
	   my $domain_start = $all_info{$p}{$e}{'start'}; 
           my $domain_end = $all_info{$p}{$e}{'end'};
           my $ind = 0;
  
           if(!($locations{$p})) { # if currently no domains on the protein add it
              push(@{$locations{$p}},$domain_start);
              push(@{$locations{$p}}, $domain_end);
           } else { # if some domains do exist for the protein check overlap, etc.
              my $locref = $locations{$p}; # reference to array of current domain locations
              my @loc = @$locref; # dereferences the array
              my $l = @loc; # length of the location array
              $l = $l/2; # current number of domains on the protein
	      my $seq_len = $proteins{$p}{'length'}; # length of the protein
              my $overlap_length = int(($overlap/100)*$seq_len); # acceptable length of overlap in the domains

	      foreach my $i (1..$l) { # loop over each existing domain and check for acceptable overlaps ($ind = 0 means accept)
                 my $start = $loc[($i-1)*2];
                 my $end = $loc[(($i-1)*2)+1];

	         if ((($start<= $domain_start)&&($domain_start <= $end))) {
                    if (($domain_start-$start)>=$overlap_length) {
                       $ind = 1;
		    } elsif (($end-$domain_start)>=$overlap_length) {
		       $ind = 1;
		    } 
                 } 

                 if ((($start <= $domain_end)&&($domain_end <= $end))) {
                    if (($domain_end-$start)>=$overlap_length) {
                       $ind = 1;
		    } elsif (($end-$domain_end)>=$overlap_length) {
		       $ind = 1;
		    } 
                 }
	
	         if ((($domain_start <= $start)&&($end <= $domain_end))) {
                    $ind = 1;     
                 }
			   
	         if ((($start <= $domain_start)&&($domain_end <= $end))) {
                    $ind = 1;     
                 }		   
	      }			
           }
	

           if ($ind == 0) { # if domain is acceptable
              $proteins{$p}{$e}{'start'} = $domain_start;
              $proteins{$p}{$e}{'end'} = $domain_end;

              push(@{$locations{$p}}, $domain_start);
              push(@{$locations{$p}}, $domain_end);
           }
	}
     }
   }
}

close($INPUT_FILE2) or die "Can't close file : $hmmer_file";


# Create the FASTA file of domain sequences.
my $out_file = ">"."$dir".'/'."$output"; 
my $OUTPUT_FILE; 
open($OUTPUT_FILE, $out_file) or die "Can't open output file : $out_file"; 
my @prots = keys(%proteins); 

foreach my $prot (@prots) {  
   my $subhash_ref = $proteins{$prot}; # reference to the subhash for the protein
   my %subhash = %$subhash_ref; # dereferences the hash
   my @domains = keys(%subhash); # the keys of the subhash (all of the domains plus the key 'sequence')

   foreach my $dom (@domains) {
      if ((!($dom eq 'sequence'))&&(!($dom eq 'length'))) { 
         my $s = $proteins{$prot}{'sequence'};      
         my $start = $proteins{$prot}{$dom}{'start'}; 
         my $end = $proteins{$prot}{$dom}{'end'}; 
         my $len = $end - $start + 1;     
         my $n = $start - 1; # where the substring should start being extracted (needed due to indexing differences)
         my $dom_seq = substr($s, $n, $len);     
         print $OUTPUT_FILE ">$prot\[$dom\]\n"; # print identifier line to output file
         print $OUTPUT_FILE "$dom_seq\n"; # print domain sequence to output file 
      }
    }
}

close($OUTPUT_FILE) or die "Can't close file : $out_file";
