#!/usr/bin/perl 
# converts output from any TM prediction program 
# to a fasta sequence file
# lrf 11apr05
# updated 13may05 to use flags instead of row of inputs
#Modified by Ariela Vergara (June 19, 2015)

use strict;
use lib "$ENV{PWD}/modules";
require tm_functions;
require fileio;
require formats;
require fasta;

# Help text: printed for -h and appended to the fatal argument/format errors.
# $0 is interpolated so the message shows the name the script was invoked as.
my $usage = "\n$0 <options>
	-h usage\n
	-v verbose\n
	-t <tm prediction>, REQUIRED
	 Standard output file from software
	 or list of TM domains.\n
	-f <format>, REQUIRED one of:
	 tmhmm phdhtm hmmtop hmmtop_mult memsat 
	 phdhtm_res das toppred proftmb 
	 predtmbb-v (Viterbi) predtmbb-n (N-best)
	 predtmbb-p (Posterior decoding)
	 fulltm tmdet
	 Note that predtmbb results can be given
	 as a single input file and script will 
	 separate them.\n
	-l <label>, optional single character
	 default = H, E(proftmb) or T(tmdet)\n
	-s <protein sequence>, optional fasta format file.
	 Helps to determine the sequence length if not 
	 otherwise possible.\n
	 Output is saved as format.fa or pdbX_format.fa\n";

# variables
# num helices, start, end, length, 
my ($n, $rs, $re, $l, %seq, $name, $tmlabel, $pname, $rnum, $rchn);
my ($verbose, $infile, $format, $seqfile);

# input flags
# Flags may appear in any order; -t, -f, -s and -l each consume the next argument.
if (@ARGV < 2) { die "$usage\n"; }
while (@ARGV) {
	my $flag = shift @ARGV;
	if    ($flag eq "-h") { print "$usage\n"; exit; }
	elsif ($flag eq "-t") { $infile  = shift @ARGV; }
	elsif ($flag eq "-f") { $format  = shift @ARGV; }
	elsif ($flag eq "-s") { $seqfile = shift @ARGV; }
	elsif ($flag eq "-l") {
		$tmlabel = shift @ARGV;
		# the label is written once per TM residue, so it must be exactly one letter
		if (length($tmlabel) > 1) {
			die "Can't use $tmlabel as a symbol in a sequence\n$usage\n";
		}
		unless (defined $tmlabel and $tmlabel =~ /[a-zA-Z]/) {
			$tmlabel = "" unless defined $tmlabel;
			die "Don't recognise input $tmlabel\n$usage\n";
		}
	}
	elsif ($flag eq "-v") { $verbose = 1; print STDERR "Being verbose\n"; }
	else {
		# previously dropped without a word; report it so typos are visible
		print STDERR "Ignoring unrecognised argument $flag\n";
	}
}

# -t and -f are documented as REQUIRED: stop here with a clear message rather
# than handing an undefined file name or format to the code that follows.
if (!defined $infile or !defined $format) {
	die "Both -t <tm prediction> and -f <format> must be given\n$usage\n";
}
print STDERR "\nReading $infile as $format format\n";

# set label for output fasta file
# The format name is also left in $_ so that later top-level code which
# pattern-matches without an explicit target sees it.
$_ = $format;
unless ($tmlabel) {
	# no -l given: pick the default symbol from the prediction format
	if    ($format =~ /predtmbb/ || $format =~ /proftmb/) { $tmlabel = "E"; }
	elsif ($format =~ /tmdet/    || $format =~ /fulltm/)  { $tmlabel = "T"; }
	else                                                  { $tmlabel = "H"; }
}
print STDERR "Using $tmlabel for TM residues\n";

# read the TM input files
SWITCH: {
	# Pattern/reader pairs, tried in order against the format name. Matching is
	# by substring and the first hit wins, so hmmtop_mult has to be listed
	# before hmmtop. Each reader returns (count, ref to starts, ref to ends, length).
	my @readers = (
		[ qr/das/,         sub { tm_functions::read_das($infile, "1.7", $verbose) } ],
		[ qr/tmhmm/,       sub { tm_functions::read_tmhmm($infile, $verbose) } ],
		[ qr/hmmtop_mult/, sub { tm_functions::read_hmmtop_mult($infile, $verbose) } ],
		[ qr/hmmtop/,      sub { tm_functions::read_hmmtop($infile, $verbose) } ],
		[ qr/phdhtm/,      sub { tm_functions::read_phdhtm($infile, $verbose) } ],
		[ qr/memsat/,      sub { tm_functions::read_memsat($infile, $verbose) } ],
		[ qr/toppred/,     sub { tm_functions::read_toppred($infile, $verbose) } ],
		[ qr/proftmb/,     sub { tm_functions::read_proftmb($infile, $verbose) } ],
		[ qr/predtmbb-v/,  sub { tm_functions::read_predtmbb($infile, "Viterbi", $verbose) } ],
		[ qr/predtmbb-p/,  sub { tm_functions::read_predtmbb($infile, "Posterior", $verbose) } ],
		[ qr/predtmbb-n/,  sub { tm_functions::read_predtmbb($infile, "N-best", $verbose) } ],
	);
	for my $entry (@readers) {
		my ($pattern, $reader) = @$entry;
		next unless $format =~ $pattern;
		($n, $rs, $re, $l) = $reader->();
		last SWITCH;
	}

	# predtmbb without a decoding suffix is refused
	if ($format =~ /predtmbb/) {
		die "I prefer you to specify:\npredtmbb-v, predtmbb-p or predtmbb-n for Viterbi, posterior or N-best decoding\n";
	}

	# anything left that is not a TM list format is unknown
	unless ($format =~ /fulltm/ or $format =~ /tmdet/) {
		die "don't recognise this type of format.\n$usage";
	}

	# fulltm and tmdet are both read with readTMlist; no sequence length is
	# set here, so $l stays as it was.
	my ($pname, $rnum, $rstart, $rend, $rchn) = tm_functions::readTMlist($infile, $verbose);
	my %count_of   = %$rnum;
	my @start_hash = @$rstart;
	my @end_hash   = @$rend;
	$n = $count_of{$pname};

	# convert hashes into arrays (index 0 is left unused, segments are 1 .. $n)
	my (@begin, @finish);
	for my $seg (1 .. $n) {
		$begin[$seg]  = $start_hash[$seg]{$pname};
		$finish[$seg] = $end_hash[$seg]{$pname};
	}
	$rs = \@begin;
	$re = \@finish;

	# not set up yet
	#if (/phdhtm_res/) { ($n, $rs, $re, $l) = tm_functions::read_phdhtm_res($infile, $verbose);  last SWITCH; }
}

my @start = @$rs;
my @end   = @$re;

###############################
# add protein sequence if given
# Without -s the output record is simply named after the format.
my $seqlen;
if (!$seqfile) {
	$name = $format;
}
else {
	print STDERR "Reading $seqfile as protein sequence\n";
	my $text = fileio::readAsciiFile($seqfile);
	%seq = fasta::toFastaHash($text);
	# use whichever record keys() hands back first as the protein name
	($name) = keys %seq;
	print STDERR "Writing $name\n";

	# getting sequence length from fasta file, and checking errors
	$seqlen = length($seq{$name});
	if (!$l) {
		$l = $seqlen;
		print STDERR "Sequence length from $seqfile is $l\n";
	}
	else {
		print STDERR "Sequence length from $infile is $l\n";
		# a mismatch is only reported; the length from the prediction file is kept
		print STDERR "Sequence length from $infile ($l) != length of seq in $seqfile ($seqlen)\n"
			if $l != $seqlen;
	}
}

# last resort: take the end of the final TM segment as the sequence length
unless ($l) {
	print STDERR "Don't have protein sequence length: setting to $end[$n]\n";
	$l = $end[$n];
}


#############################################################
# create fasta format TM sequence as either "H" or "E" or "T"
# The TM string is stored in %seq under the format name, next to any protein
# sequence read earlier, and the whole hash is then formatted for output.
$seq{$format} = tm_functions::toTMfasta($n, \@start, \@end, $l, $tmlabel);
my $fasta = formats::toMFAFormat($name, \%seq);

my $outfile = "$format.fa";
print STDERR "\nCreating $outfile\n\n";

# Three-argument open keeps characters in the user-supplied format name from
# being read as part of the open mode, and a failure to create or flush the
# file is now fatal instead of silently producing no output.
open(my $out, '>', $outfile) or die "Can't open $outfile for writing: $!\n";
print {$out} $fasta          or die "Can't write to $outfile: $!\n";
close($out)                  or die "Can't close $outfile: $!\n";
