#!/usr/bin/perl

# make_matrices.pl
# 
# The purpose of this script is to create the scoring matrix for each of the taxa in the data set. It takes as it 
# command line input:
# blrpt - the filtered tabular BLAST report (only top hit for each query present)
# seqfa - the taxa sequence Fasta file (genome or protein sequences)
# compfa - the component Fasta file (gene or domain sequences)
# outdir - the directory where the matrices are to be stored
#
# Author : Brittney (Hinds) Keel
#
# Date created :  3 May 2013
# Last modified : 1 Feb 2016

use strict;
use Getopt::Long;


# Start by gathering command line options.

my $blast;
my $fasta1;
my $fasta2;
my $m_dir;
my $help;

if (@ARGV > 0 ) { # if command line options supplied
  my $opt = GetOptions('help!',     \$help,
                       'blrpt:s',   \$blast,
	               'seqfa:s',   \$fasta1,
                       'compfa:s',  \$fasta2,
                       'outdir:s',  \$m_dir);
} else {
   print "Usage of this script:\n";
   print "Please supply command line options.\n";  
   print "Example: ./make_matrices.pl -blrpt blastreport -seqfa seqfasta -compfa componentfasta -outdir matrixdirectory\n";
   exit;
}

# If the user requests help on the command line print out an example usage of the
# script.

if ($help)  {
  print "Usage of this script:\n";
  print "Please supply command line options.\n";  
  print "Example: ./make_matrices.pl -blrpt blastreport -seqfa seqfasta -compfa componentfasta -outdir matrixdirectory\n";
  exit;
}



# Gather the taxa identifiers and the component accessions from the component sequence and taxa sequence Fasta files.
my $prot_fasta_file = "<"."$fasta1";
my $dom_fasta_file = "<"."$fasta2"; 
my $INPUT_FILE11;
my $INPUT_FILE12;
my @proteins;
my %domains;
my %prots;
my %protdom;


# First get the components.
open($INPUT_FILE12, $dom_fasta_file) or die "Can't open file : $dom_fasta_file"; 

while (my $line = readline($INPUT_FILE12)) {
  my $first_elem = substr($line, 0,1); 
  
  if ($first_elem eq ">") {
    chomp $line;
    $line =~ s/>//g;

    # The following is a string search, each different based on the identifier format (may need to be adapted if using another format).
    (my $prot, my $dom) = $line =~ m/^(\S+)\[(\S+)\]$/;
    $protdom{$prot}{$dom}=1;
    $prots{$prot}++;
    $domains{$dom}++;
  }
}

my @domains = keys(%domains);
my @prots = keys(%prots);


close($INPUT_FILE12) or die "Can't close file : $dom_fasta_file";



# Next get the taxa.
open($INPUT_FILE11, $prot_fasta_file) or die "Can't open file: $prot_fasta_file";

while (my $line = readline($INPUT_FILE11)) {
  my $first_char = substr($line, 0, 1); 

  if ($first_char eq ">") {
    chomp $line;
    $line =~ s/>//g; 

    my @els = split /\s+/, $line; my $p = $els[0];
   
    push (@proteins, $p);
  }
}
 
close($INPUT_FILE11) or die "Can't close file: $prot_fasta_file";



# Identify which taxa do not have any component info provided and store the list of their ids to a file.
my %seen; 
my @diff; 
foreach my $i (@proteins, @prots) {
  $seen{$i}++; 
}

my @use_prots;
my $OUTPUT_FILE4;
my $no_domains_file = ">"."$m_dir".'/'.'no_domain_info.txt'; 
open($OUTPUT_FILE4, $no_domains_file) or die "Can't open file: $no_domains_file";

foreach my $j (keys(%seen)) {
  if (!($seen{$j}>1)) { # if protein id was not found in both protein arrays (we don't have component info for it)
     push (@diff, $j);
  } else {
     push (@use_prots, $j);
  }  
}

print $OUTPUT_FILE4 join("\n", @diff);
close $OUTPUT_FILE4 or die "Can't close file: $no_domains_file";

my @ordered_use_prots;
foreach my $p (@proteins) {
  foreach my $s (@use_prots) {
    if ($s eq $p) {
       push(@ordered_use_prots, $s);
    }
  }
}



# Create a hash of hashes which stores the best hit for each component on each taxa in each of
# the BLAST searches.

my $blast_report = "<"."$blast";
my %hashohashes = make_hash($blast_report);

# Now use the hash of hashes to fill in the matrix.

my $OUTPUT_FILE2;
my $OUTPUT_FILE3;
my $protein_file = ">"."$m_dir".'/'.'protein_index.txt'; 
my $domain_file = ">"."$m_dir".'/'.'domains_present.txt'; 


open($OUTPUT_FILE3, $domain_file) or die "Can't open file: $domain_file";
print $OUTPUT_FILE3 join("\n", @domains);
close($OUTPUT_FILE3) or die "Can't close file: $domain_file";

open($OUTPUT_FILE2, $protein_file) or die "Can't open file: $protein_file";
my $n = 1; 
foreach my $elem (@ordered_use_prots) {
  print $OUTPUT_FILE2 "P$n: \t $elem\n";
  $n++;
}
close($OUTPUT_FILE2) or die "Can't close file: $protein_file";


my $count = 1;
foreach my $elem (@ordered_use_prots) {
  my $OUTPUT_FILE; 
  my $file = ">"."$m_dir".'/'."$count"."_mtx".".txt";
  open ($OUTPUT_FILE, $file) or die "Can't open output file: $file";

  foreach my $d (@domains) {
    if ($protdom{$elem}{$d}) {
      print $OUTPUT_FILE "\n$d\t";
      
      foreach my $key1 (@ordered_use_prots) {
        if ($key1 eq $elem) {
          print $OUTPUT_FILE "1\t";
        } else {
          my $eval = $hashohashes{$elem}{$d}{$key1};
          if (!($eval)) {
            print $OUTPUT_FILE "2870\t";
          } else {
            print $OUTPUT_FILE "$eval\t";
          }
        }
      }
    }
  }
  close ($OUTPUT_FILE) or die "Can't close output file $file because $!";
  $count++;
}


# SUBROUTINES:

#############################################################################
# make_hash
#
# DESCRIPTION: creates a hash of hashes that contains the top hit information
#              for each domain in a protein for the Blast search
#
# IN     $input: the Blast report to be read
#
# OUT    %prot_hash : hash whose keys are the proteins and whose values are the hashes
#                     of top hit information from the Blast search
#############################################################################
sub make_hash {
  my $input = shift; 

  my %prot_hash; 
  my $INPUT; 

  open($INPUT, $input) or die "Can't open file : $input"; 

  while (my $line = readline($INPUT)) {
    my @line_els = split /\t/, $line; 
		  
    my $header = $line_els[0]; # the query header

    (my $query_prot, my $d) = $header =~ m/^(\S+)\[(\S+)\]$/; 

    # Here again the string search is based on the identifier line formatting.
    my $subject_prot = $line_els[1];
    #$subject_prot =~ s/\|$//; 
	
    my $evalue = $line_els[10]; 
	
    $prot_hash{$query_prot}{$d}{$subject_prot} = $evalue;
  }

  close $INPUT or die "Can't close file: $input";

  return %prot_hash;
}

