#!/usr/bin/perl

# mtxassemble.pl
#
# The purpose of this script is to execute all steps in distance matrix creation.
#
# Author: Brittney (Hinds) Keel
# Date Created: 13 May 2014
# Last Modified : 1 Feb 2016


use strict;
use Getopt::Long;

# Locations of the HMMER and BLAST executables.
# Leave these empty if the executables are on the executable search path ($PATH).
# Otherwise give the directory that contains them, including the trailing '/',
# for example "../hmmer-3.1b2-macosx-intel/binaries/".
my $HMMDIR = "";
my $BLASTDIR = "";


# Command line option values. Each stays undefined until GetOptions fills it.
my $dir;            # -dir     : working directory (input FASTA and all output)
my $infile;         # -fasta   : FASTA file name inside $dir
my $hmm;            # -hmm     : profile HMM database file
my $overlap;        # -overlap : allowed overlap in domain predictions
my $BLevalthresh;   # -blE     : BLAST E-value threshold
my $HMMevalthresh;  # -domE    : HMMER E-value threshold
my $help;           # -help    : print usage and exit


# Take in command line options.
# GetOptions returns false (after printing its own diagnostics to STDERR) when
# an option is unknown or malformed; stop instead of silently ignoring it.
# With an empty @ARGV it simply returns true and leaves everything undefined.
GetOptions(
    'help'      => \$help,
    'dir:s'     => \$dir,
    'fasta:s'   => \$infile,
    'hmm:s'     => \$hmm,
    'overlap:s' => \$overlap,
    'blE:s'     => \$BLevalthresh,
    'domE:s'    => \$HMMevalthresh,
) or die "Invalid command line options. Run \"perl mtxassemble.pl -help\" for usage.\n";

# If the user requests help or any options are missing, print the usage text
# and stop. -dir, -fasta and -hmm must be non-empty; the three thresholds only
# have to be defined, so a value of 0 is accepted.
# Exit status is 0 when -help was given and 1 when required options are missing.
if (!$dir || !$infile || !$hmm || !defined($overlap) || !defined($BLevalthresh) || !defined($HMMevalthresh)|| $help) {
  print <<'USAGE';
Usage of this script:
perl mtxassemble.pl -dir directory -fasta protein_sequences -hmm hmmdb -overlap overlap_thresh -blE BLAST_Evalue_thresh -domE HMMER_Evalue_thresh
Command line arguments (all except "-help" are required):
-dir : directory where input FASTA file is and output files are to be stored
-fasta : FASTA file containing the protein sequences
-hmm : full path of the profile HMM database file to be searched by HMMER3 (The database needs to be preconfigured; see HMMER3 manual)
-overlap : percentage of allowed overlap in domain predictions
-blE : threshold for BLAST E-value
-domE : threshold for HMMER E-value
USAGE

  exit($help ? 0 : 1);
}


# First create the output subdirectories.
die "Directory: $dir does not exist. mtxassemble.pl stopped. \n" unless (-d $dir);

# Use Perl's builtin mkdir rather than a shell command so that directory names
# containing spaces or shell metacharacters work. Subdirectories left over from
# a previous run are reused; any other failure is fatal because every later
# step writes into these directories.
for my $subdir ('Preprocess', 'Matrices') {
    my $subdir_path = "$dir/$subdir";
    next if -d $subdir_path;
    mkdir($subdir_path)
        or die "Cannot create directory $subdir_path: $!. mtxassemble.pl stopped.\n";
}


# Reformat the Fasta file.
# Runs reformat_fasta.pl on $dir/$infile and writes
# $dir/Preprocess/protein_reformatted.fa.
die "Input file: $infile does not exist. mtxassemble.pl stopped\n" unless (-f "$dir/$infile");

my $reformat_out = "$dir/Preprocess/protein_reformatted.fa";

# List-form system() bypasses the shell, so paths with spaces are passed intact.
# A non-zero status is fatal: an output file left over from an earlier run
# would otherwise make a failed run look successful.
system('perl', 'reformat_fasta.pl', '-in', "$dir/$infile", '-out', $reformat_out) == 0
    or die "reformat_fasta.pl failed (status $?). mtxassemble.pl stopped\n";
print "Processing of the input file.\n";
if (-f $reformat_out) {
    print "protein_reformatted.fa found/created\n\n";
}
else {
    die "protein_reformatted.fa is not found in $dir/Preprocess\n";
}

# Run HMMER.
# Scans the reformatted protein FASTA against the profile HMM database given
# with -hmm, writing the full report to Preprocess/hmmer_out.txt and the
# per-domain table to Preprocess/hmmer_out_domain.txt.
if (-f $hmm) {
    print "HMM database, $hmm, found.\nRunning ${HMMDIR}hmmscan\n";
}
else {
    die "pHMM database: $hmm does not exist. mtxassemble.pl stopped\n";
}

# The command gets its own variable; the original reused (and masked) $hmm.
# It is run without a shell, so $HMMDIR must be a literal path (no '~').
# --domE 1.0 is the fixed cut-off used for this scan; the user's -domE
# threshold is passed to the later filtering commands.
my @hmmscan_cmd = (
    "${HMMDIR}hmmscan",
    '-o',          "$dir/Preprocess/hmmer_out.txt",
    '--acc',
    '--domtblout', "$dir/Preprocess/hmmer_out_domain.txt",
    '--domE',      '1.0',
    $hmm,
    "$dir/Preprocess/protein_reformatted.fa",
);
system(@hmmscan_cmd) == 0
    or die "hmmscan failed (status $?). mtxassemble.pl stopped\n";
print "HMMER search done.\n\n";

# Filter HMMER results.
# filter_hmmer.pl reads the hmmscan domain table and writes
# Preprocess/hmmer_out_domain_filtered.txt, given the -domE threshold.
my $hmmer_domtbl = "$dir/Preprocess/hmmer_out_domain.txt";
if (-f $hmmer_domtbl) {
    print "$hmmer_domtbl found.\n";
}
else {
    die "hmmscan output (hmmer_out_domain.txt) not found. mtxassemble.pl stopped\n";
}

# Commands are run in list form (no shell) and their exit status is checked so
# that the success messages below are only printed when the step succeeded.
my @filter_hmmer_cmd = (
    'perl', 'filter_hmmer.pl',
    '-in',  $hmmer_domtbl,
    '-out', "$dir/Preprocess/hmmer_out_domain_filtered.txt",
    '-e',   $HMMevalthresh,
);
system(@filter_hmmer_cmd) == 0
    or die "filter_hmmer.pl failed (status $?). mtxassemble.pl stopped\n";
print "HMMER table parsed.\n\n";

# Generate domain sequence Fasta.
# domainseqs.pl is given the Preprocess directory plus file names relative to it.
my @domainseqs_cmd = (
    'perl', 'domainseqs.pl',
    '-dir',     "$dir/Preprocess",
    '-hmmerin', 'hmmer_out_domain_filtered.txt',
    '-fasta',   'protein_reformatted.fa',
    '-out',     'domain_seqs.fa',
    '-overlap', $overlap,
    '-e',       $HMMevalthresh,
);
system(@domainseqs_cmd) == 0
    or die "domainseqs.pl failed (status $?). mtxassemble.pl stopped\n";
print "Domain sequence file created.\n\n";

# Blast
# Step 1: build a protein BLAST database from the reformatted FASTA.
# Step 2: search the domain sequences against it with blastp, writing a
#         tabular (-outfmt 6) report to Preprocess/blast_report.tab.
# Both commands are run in list form (no shell), so $BLASTDIR must be a
# literal path (no '~') and paths containing spaces are passed intact.
my $blast_db_fasta = "$dir/Preprocess/protein_reformatted.fa";
my $blast_report   = "$dir/Preprocess/blast_report.tab";

print "Running ${BLASTDIR}makeblastdb\n";
my @makeblastdb_cmd = (
    "${BLASTDIR}makeblastdb",
    '-in',     $blast_db_fasta,
    '-dbtype', 'prot',
);
system(@makeblastdb_cmd) == 0
    or die "BLAST database formatting failed (status $?). mtxassemble.pl stopped\n";
# The .phr file is one of the files makeblastdb writes next to the input FASTA.
if (-f "$blast_db_fasta.phr") {
    print "BLAST database files created.\n";
}
else {
    die "BLAST database formatting failed. mtxassemble.pl stopped\n";
}

# No trailing newline: the outcome ("done." / "not completed.") finishes the line.
print "Running blastp ";
my @blastp_cmd = (
    "${BLASTDIR}blastp",
    '-query',    "$dir/Preprocess/domain_seqs.fa",
    '-db',       $blast_db_fasta,
    '-outfmt',   '6',
    '-searchsp', '53000',
    '-out',      $blast_report,
);
system(@blastp_cmd) == 0
    or die "not completed (status $?). mtxassemble.pl stopped\n";
if (-f $blast_report) {
    print "done. blast_report.tab created.\n";
}
else {
    die "not completed. mtxassemble.pl stopped\n";
}


# Filter Blast report.
# filter_blast.pl reads Preprocess/blast_report.tab and writes
# Preprocess/blast_report.tabf.
# NOTE(review): the required -blE value ($BLevalthresh) is not passed to any
# command in this script -- confirm whether filter_blast.pl should receive it.
my $filtered_blast_report = "$dir/Preprocess/blast_report.tabf";
my @filter_blast_cmd = (
    'perl', 'filter_blast.pl',
    '-in',  "$dir/Preprocess/blast_report.tab",
    '-out', $filtered_blast_report,
);
# List form avoids the shell; a non-zero status is fatal so a stale .tabf file
# from an earlier run cannot hide a failure.
system(@filter_blast_cmd) == 0
    or die "Parsing blast_report.tab failed (status $?). mtxassemble.pl stopped.\n";
if (-f $filtered_blast_report) {
    print "blast_report.tab parsed.\n\n";
}
else {
    die "Parsing blast_report.tab failed. mtxassemble.pl stopped.\n";
}

# Make matrices.
# make_matrices.pl is given the filtered BLAST report, the reformatted protein
# FASTA and the domain sequence FASTA, and writes into $dir/Matrices.
my @make_matrices_cmd = (
    'perl', 'make_matrices.pl',
    '-blrpt',  $filtered_blast_report,
    '-seqfa',  "$dir/Preprocess/protein_reformatted.fa",
    '-compfa', "$dir/Preprocess/domain_seqs.fa",
    '-outdir', "$dir/Matrices",
);
# Only announce success when the helper actually succeeded.
system(@make_matrices_cmd) == 0
    or die "make_matrices.pl failed (status $?). mtxassemble.pl stopped\n";
print "All matrices generated!\n\n";

# Create output directory for mocassin_prot.m
# Uses Perl's builtin mkdir (no shell), so directory names with spaces work.
# Failure here is not fatal: the matrices have already been written, so the
# reason is only reported (for example "File exists" on a re-run).
my $solutions_dir = "$dir/Primary_Solutions";
if (mkdir($solutions_dir)) {
    print "Directory: $dir/Primary_Solutions successfully created.\n";
    print "Use this as the output directory for mocassin_prot.m\n";
}
else {
    warn "Directory: $solutions_dir was not created: $!\n";
}

print "mtxassemble.pl finished.\n\n";
