# A parsing modeller for transmembrane protein
# sequence things
# updated to add length - lrf mar05
# 
# advertised functions:
# 
# reads list with whole family of proteins
#(\@protein, \%num_tm, \@start, \@end, \@chn, \@length) = &readTMlist_fam($file_name, $verbose)
# reads single protein list
#( $protein, \%num_tm, \@start, \@end, \@chn, \@length) = &readTMlist($file_name, $verbose)
# $seq-string = &toTMfasta($num_hel, \@start, \@end, $length);
#
# in the following the final flag is the optional scalar to define verbosity
# ($num_hel, @$rstart, @$rend, $length) = &read_tmhmm($prediction_filename, "1|0");
# ($num_hel, @$rstart, @$rend, $length) = &read_phdhtm($prediction_filename, "1|0");
# ($num_hel, @$rstart, @$rend, $length) = &read_hmmtop($prediction_filename, "1|0");
# ($num_hel, @$rstart, @$rend, $length) = &read_hmmtop_mult($prediction_filename, "1|0");
# cutoff is optional: either 1.7 or 2.2
# ($num_hel, @$rstart, @$rend, $length) = &read_das($prediction_filename, $cutoff, "1|0");
# ($num_hel, @$rstart, @$rend, $length) = &read_memsat($prediction_filename, "1|0");
# ($num_hel, @$rstart, @$rend, $length) = &read_toppred($prediction_filename, "1|0");
# ($type) = &pred2type($method); # converts prediction methodology name to a TM type
# add readPHDhtm_res
# read_predtmbb($filename);
# read_proftmb
use strict; 

use lib "/razor/0/common/scripts/modules/parsing";

package   tm_functions;
require   Exporter;
my @ISA = qw (Exporter);
my @EXPORT = qw (readTMlist_fam readTMlist read_predtmbb read_proftmb read_phdhtm read_hmmtop read_hmmtop_mult read_tmhmm read_das read_memsat read_toppred toTMfasta pred2type);

#### EXPORTED ####

################
# to read the list of tm domains where whole family there
################
sub readTMlist_fam {

	my ($verbose, $file, @protein, %num_tm, @start, @end, @chn, $num_protein, @length);
	($file, $verbose) = @_;

	open (LIST, "$file") or die "Couldn't open $file";
	print STDERR "Reading $file\n";

	# read list of helices
	while (<LIST>) {

		# first line contains the header with the pdb filenames
		if (/^TM/) {
			if (/len/) { s/len/l/g; }
	
			# tidy up line
			chomp();
			s/\s+$//; # spaces at end
			s/^TM\s+//; # text at beginning
			s/^segment\s+//; # text at beginning
			s/....-e//g;
			s/....-l//g;
			s/-s//g;
		
			# save protein names
			@protein = split(/\s+/, $_); 
	
			# error check
			$num_protein = @protein;
			print STDERR "There are $num_protein proteins listed\n\n" if ($verbose); 
		}
	
		else {
			next unless /\w/; # skip lines without text
			next if (/#/); # skip lines with hashes
			my @temp = split(/\s+/, $_); 
	
			my $tm = $temp[0];
	
			# for each of the protein, store the chain ID's, starts and ends of helices
			foreach my $p (0 .. ($num_protein-1)) {
		
				# if there are values for chain, start and end then go on
				if ( ($temp[1+($p*3)]) and ($temp[2+($p*3)]) and ($temp[3+($p*3)]) ) { 

					next if ($temp[2+($p*3)] !~ /\d/); # start and end must be numbers
					next if ($temp[3+($p*3)] !~ /\d/);

					$chn[$tm]{$protein[$p]}   = $temp[1+($p*3)]; # store start and end
					$start[$tm]{$protein[$p]} = $temp[2+($p*3)]; 
					$end[$tm]{$protein[$p]}   = $temp[3+($p*3)]; 
					$length[$tm]{$protein[$p]}= $temp[($num_protein*3)+($p+1)]; # length at end

					$num_tm{$protein[$p]} ++;

					print STDERR "$protein[$p] $tm $chn[$tm]{$protein[$p]} $start[$tm]{$protein[$p]} $end[$tm]{$protein[$p]} $length[$tm]{$protein[$p]}\n" if ($verbose);
				}
			} # next pdb
		}
	} # end read list

	return (\@protein, \%num_tm, \@start, \@end, \@chn, \@length);

}

################
# to read the list of tm domains for a single pdb file
################
sub readTMlist {

	my ($verbose, $file, $pdb, %num_tm, @start, @end, @chn, @length, $tm);
	($file, $verbose) = @_;
	$tm = 0;

	open (LIST, "$file") or die "Couldn't open $file";
	print STDERR "\nReading $file\n";

	# read list of helices
	while (<LIST>) {

		# first line contains the header with the pdb filenames
		if (/^TM/) { 
			$pdb = substr($_, 3, 4); 
			print STDERR "Found $pdb\n" if ($verbose); 
		}	
		else {
			$tm++;
			next unless /\w/; # skip lines without text
			next if (/#/); # skip lines with hashes

			# store data for each TM line
			my @temp = split(/\s+/, $_);
	# don't use first column!!!! use position in file
	#		$tm              = $temp[0];
			$chn[$tm]{$pdb}  = $temp[1];
			$start[$tm]{$pdb}= $temp[2];
			$end[$tm]{$pdb}  = $temp[3];
			$length[$tm]{$pdb} = $end[$tm]{$pdb} - $start[$tm]{$pdb} + 1;			
			$num_tm{$pdb} ++;

			print STDERR "$pdb $tm $chn[$tm]{$pdb} $start[$tm]{$pdb} $end[$tm]{$pdb} $length[$tm]{$pdb}\n" if ($verbose);
		}
	} # end read list

	return ($pdb, \%num_tm, \@start, \@end, \@chn, \@length);

}
#################################
# read predtmbb prediction output
sub read_predtmbb
#################################
{
	my ($file, $do_method, $verbose) = @_;
	open (IN, "<$file") or die "cannot find PredTMBB (BAGOS) $file!\n";
	
	my $method; # stores if Viterbi, posterior, or N-best decoding method
	my (@start, @end, $num_hel, $length, %tm, @begin, @finish, %len);
	
	# store all methods to allow full input to be read if required
	while (my $line = <IN>) {
		$line =~ s/decoding//;
		if ($line =~ /(\S+)\s+method/) {
			$method = $1; # might be useful later
			if ($method eq $do_method) { print STDERR "Found $do_method\n" if ($verbose); }
		}
		if ($line =~ /^tm\s+(\d+)\s+(\d+)$/) {
			$tm{$method}++;
			$begin[$tm{$method}]{$method}  = $1;
			$finish[$tm{$method}]{$method} = $2;
		}
		# either in, out or tm; need last value for length
		if ($line =~ /^\w+\s+(\d+)\s+(\d+)$/) { 
			$len{$method} = $2;
		}
	}
	close IN;
	
	# store for the method requested
	$num_hel = $tm{$do_method};
	$length  = $len{$do_method};
	if (!$num_hel) { die "Didn't find any TM domains in $file for $do_method!!\n"; }
	for (my $i = 1; $i <= $num_hel; $i++) {
		$start[$i] = $begin[$i]{$do_method};
		$end[$i]   = $finish[$i]{$do_method};
		print STDERR "\tTM $i: $start[$i] - $end[$i]\n" if ($verbose);
	}
	return ($num_hel, \@start, \@end, $length);
}

#################################
# read proftmb prediction output
sub read_proftmb
#################################
{
	my ($file, $verbose) = @_;
	open (IN, "<$file") or die "cannot find ProfTMB $file!\n";
	
	my (@start, @end, $length, @tmpred);
	my $num_hel = 0;

	# store input
	while (<IN>) {
		chomp();
		my ($name, $score, $pred, $seq) = split();
		@tmpred = split(//, $pred);
	}
	$length = $#tmpred+1;

	# read TM part
	my $in_tm = 0;
	for (my $posn = 0; $posn <= $#tmpred; $posn++)  {
		
		# at the beginning of a TM domain
		if ( ((!$in_tm) and ($tmpred[$posn] =~ "U")) 
		  or ((!$in_tm) and ($tmpred[$posn] =~ "D")) ) {
			$num_hel++; $in_tm = 1;	
			$start[$num_hel] = $posn+1;
		}
		
		# at the end of a TM domain
		if ( (($in_tm) and ($tmpred[$posn] =~ "-")) 
		  or (($in_tm) and ($tmpred[$posn] =~ "i")) ) {
			$end[$num_hel]  = $posn;
			$in_tm = 0;
		}
	}	
	
	if (!$num_hel) { die "Didn't find any TM domains in $file!!\n"; }
	for (my $i = 1; $i <= $num_hel; $i++) {
		print STDERR "\tTM $i: $start[$i] - $end[$i]\n" if ($verbose);
	}
	return ($num_hel, \@start, \@end, $length);
}

##############
sub read_phdhtm
##############
# extracts TM regions using end definitions after parsing
{

	# send the filename
	my ($file, $verbose) = @_;
	my (@first, @last, @start, @end, $length, $line, $num_hel);
	my $startline = 30000;
	my $stopline  = 30000;

	open (PHD, $file) or die "$file not found!\n";
	while (<PHD>) {
		$line++;

        # get length of protein
		if (/SEQLENGTH/) { s/\sSEQLENGTH\s+//; chomp(); $length = $_; }

        # get details of Tm start and end
		if ( ($line > $startline) and (/^---/) ) { $stopline = $line; last; } # quit when get to dashes
		elsif (/^ MOD_NHTM/) { $startline = $line; } # start when get to MOD_NHTM
        # read TM lines
		if ( ($line > $startline) and ($line < $stopline) ){
			s/^\s+//;  # remove leading spaces
			s/-//;     # remove dashes
			my @line = split (/\s+/);
			push @first, $line[3]; 
			push @last, $line[4];  
			$num_hel++;
		}
	}

	# predictprotein doesn't have these in order, so sort them
	my @s = sort {$a <=> $b} @first; # sort in ascending numerical order
	my @e = sort {$a <=> $b} @last;  

	# bump up the TM numbers for output
	if (!$num_hel) { die "Didn't find any TM domains in $file!!\n"; }
	for (my $range = 0; $range < $num_hel; $range++) {
		my $tm = $range + 1;
		$start[$tm] = $s[$range];
		$end[$tm]   = $e[$range];

		# screen output
		print STDERR "\tTM $tm: $start[$tm] - $end[$tm]\n" if ($verbose);
	}
	return ($num_hel, \@start, \@end, $length);

}

######################
# read HMMTOP2 output
######################
sub read_hmmtop {

	my ($file, $verbose) = (@_);
	open (HMMTOP, $file) or die "$file not found!\n";
	my ($length, $num_hel, @start, @end);

	# error check
	my $test = <HMMTOP>;
	if ($test !~ /Protein:/) { die "Don't recognise as hmmtop output!\n"; }
	while (<HMMTOP>) {
		chomp();
		if (/^Length:/) {
			s/^Length: //;
			$length = $_;
		}
		# select lines beginning Transmembrane helices
		if (/^Transmembrane helices:/) {
			my $line = $_;
			$num_hel = tr/-/ /; # count number of dashes as number of helices
			$line =~ s/^Transmembrane helices: //;
			my @temp = split(/\s+/, $line); # break up with spaces

			for (my $tm = 1; $tm <= $num_hel; $tm++) {
				($start[$tm], $end[$tm]) = split(/-/, $temp[$tm-1]);
				print STDERR "\tTM $tm: $start[$tm] - $end[$tm]\n" if ($verbose);
			}
		}
	}
	if (!$num_hel) { die "Didn't find any TM domains in $file!!\n"; }
	return ($num_hel, \@start, \@end, $length);
}


######################
# read HMMTOP2 output
######################
sub read_hmmtop_mult {

	my ($file, $verbose) = (@_);
	open (HMMTOP, $file) or die "$file doesn't exist!\n";
	my ($num_hel, @start, @end, $length, @ends);

# error check
#my $test = <HMMTOP>;
#if ($test !~ /^>HP:/) { die "Don't recognise as local mulitple-alignment HMMTOP output!\n"; }
	while(<HMMTOP>) {
		chomp();
		if (/ OUT /) {
			print STDERR "\n\tTopology = N-out\n" if ($verbose);
			@ends = split(/ OUT /, $_);
			last;
		}# break up with OUT
		elsif (/ IN /) {
			print STDERR "\n\tTopology = N-in\n" if ($verbose);
			@ends = split(/ IN /, $_);
			last;
		}# break up with IN
	}

	my @temp = split(/\s+/, $ends[1]); # break up with spaces
	$num_hel = $temp[1]; # first number is number of helices
	my @junk = split(/\s+/, $ends[0]);
	$length = $junk[1]; 

	# store TM ends
	if (!$num_hel) { die "Didn't find any TM domains in $file!!\n"; }
	for (my $tm = 1; $tm <= $num_hel; $tm++) {
		$start[$tm] = $temp[$tm*2];
		$end[$tm]   = $temp[$tm*2+1];
		print STDERR "\tTM $tm: $start[$tm] - $end[$tm]\n"  if ($verbose);
	}
	return ($num_hel, \@start, \@end, $length);

}

###################
# read TMHMM output
###################
sub read_tmhmm {

	my ($file, $verbose) = (@_);
	open (TMHMM, $file) or die "$file not found!\n";
	my ($num_hel, @start, @end, $length);

	# error check
	my $test = <TMHMM>;
	if ($test !~ /TMHMM /) { die "Don't recognise as tmhmm output!\n"; }
	while (<TMHMM>) {
		# read TMhelix lines
		if (/TMhelix/) {
			s/.+TMhelix\s+//; # remove first few columns 
			$num_hel++;
			($start[$num_hel], $end[$num_hel]) = split (/\s+/);
			print STDERR "\tTM $num_hel: $start[$num_hel] - $end[$num_hel]\n" if ($verbose);
		}
		# store length of sequence
		if (/Length: /) {
			s/.+Length:\s//; 
			$length = $_;
		}
	}

	if (!$num_hel) { die "Didn't find any TM domains in $file!!\n"; }
	return ($num_hel, \@start, \@end, $length);

}

###################
# read DAS output
###################
sub read_das {

	my ($file, $cutoff, $verbose) = @_;
	open (DAS, $file) or die "$file not found!\n";
	my ($num_hel, @start, @end, $length);

	# default cut-off value = 1.7
	# alternative is 2.2
	if (!$cutoff) { $cutoff = "1\.7"; }

	my $test = <DAS>;
	# error check
	if ($test !~ /Potential transmembrane segments/) { die "Don't recognise as DAS output!\n"; }
	while (<DAS>) {
		# select lines where cutoff = 1.7
		if (/$cutoff\n/) {
			$num_hel++;
			($start[$num_hel], $end[$num_hel]) = split (/\s+/);
			print STDERR "\tTM $num_hel: $start[$num_hel] - $end[$num_hel]\n";
		}
	}
	if (!$num_hel) { die "Didn't find any TM domains in $file!!\n"; }

# the length cannot be determined from DAS output -using the length of the last helix
	print STDERR "Warning: the sequence length cannot be determined from DAS output
	- using the length of the last TM instead: may need to add gaps to end\n"; 

	$length = $end[$num_hel];
	return ($num_hel, \@start, \@end, $length);

}

#####################
# read MEMSAT2 output
#####################
sub read_memsat {

	my ($file, $verbose) = (@_);
	open (MEMSAT, $file) or die "$file not found!\n";
	my ($length, $num_hel, @start, @end);

	my $start_line;

	# error check 
	my $test = <MEMSAT>;
	if ($test !~ /MEMSAT/) { die "Don't recognise as memsat output!\n"; }
	while (<MEMSAT>) {
		if (/NO TRANSMEMBRANE HELICES/) { print STDERR "\nNo helices in MEMSAT!\n\n"; exit; }
		my $line++;
		# where does info start
		if (/================/) { $start_line = $line; }
		#if (/--- Key:/          { $stop_line = $line; }
		if ( ($start_line) and (/\d:/) ) {
			s/\(\w+\)//g; # delete text in brackets
			s/://;        # delete colon
			s/-/ /;       # replace dash
			my ($tm, $start, $end) = split (/\s+/);
			$num_hel = $tm; $start[$num_hel] = $start;
			$end[$num_hel] = $end; # put in arrays
			print STDERR "\tTM $tm: $start[$tm] - $end[$tm]\n" if $verbose;
		}
	}
	if (!$num_hel) { die "Didn't find any TM domains in $file!!\n"; }
	
	# the length cannot easily be determined from MEMSAT output -using the length of the last helix
	print STDERR "Warning: the sequence length cannot easily be determined from MEMSAT output
	- using the length of the last tm instead: may need to add gaps to end\n"; 

	$length = $end[$num_hel];
	return ($num_hel, \@start, \@end, $length);

}

#####################
# read TOPPRED output
#####################
sub read_toppred {

	my ($file, $verbose) = (@_);
	open (TOPPRED, $file) or die "$file not found!\n";
	my ($length, $num_hel, @start, @end);

	# error check 
	my @toppred = <TOPPRED>;
	if ($toppred[0] !~ /Algorithm specific/) { die "Don't recognise as toppred output!\n"; }

	# remove tail lines until last line starts with 'Total'
	while ($toppred[$#toppred] !~ /^Total/) { pop(@toppred); }
	pop(@toppred); # remove Total line
	pop(@toppred); # remove blank line

	# get the TM positions out 
	my $start_line; my $line = 0;
	foreach $_ (@toppred) {
	
		if (/Sequence : /) { 
			my @temp = split(/\s+\(/, $_); 
			$length = $temp[1];
			$length =~ s/\s+res\)\n//;
		}
		if (/^ Helix/) { $start_line = $line; next; }
		elsif ($start_line) {
			s/-/ /;       # replace dash
			s/^\s+//;     # remove leading spaces

			my ($tm, $start, $end) = split (/\s+/);
			$num_hel = $tm; 
			$start[$num_hel] = $start;
			$end[$num_hel]   = $end; # put in arrays

			print STDERR "\tTM $num_hel: $start[$num_hel] - $end[$num_hel]\n" if ($verbose);
		}
		$line++;
	}
	if (!$num_hel) { die "Didn't find any TM domains in $file!!\n"; }
	print STDERR "\tlength is $length\n" if ($verbose);
	return ($num_hel, \@start, \@end, $length);

} 

############################################
# create fasta output string with TM domains
############################################
sub toTMfasta 
{

	my ($num_hel, $rstart, $rend, $length, $label) = @_;
	my ($fasta, $res);
	my @start = @$rstart;
	my @end   = @$rend;
	warn "No length given\n" unless ($length);
	print STDERR "Looking for $num_hel TM $label domains in $length residues\n"; 

	for (my $resno = 1; $resno <= $length; $resno++) {
		for (my $tm = 1; $tm <= $num_hel; $tm++) {

			if ( (!$start[$tm]) or (!$end[$tm]) ) {
				die "Problem with start and ends of TM regions in tm $tm, res $resno\n";
			}
			# is the residue in a TM spanning region?
			if ( ($resno >= $start[$tm]) and ($resno <= $end[$tm]) ) {
				$res = $label; last;
			}
			# or outside the membrane?
			else { $res = "-"; }
		}
		$fasta .= $res;
	}
	return ($fasta);

}

#
sub pred2type
{

	$_ = $_[0];
	my $type;
		
	SWITCH: {
		if (/tmb/)    { $type = "barrel"; last SWITCH; }

		if (/hmmtop/) { $type = "helical"; last SWITCH; }
		if (/das/)    { $type = "helical"; last SWITCH; }
		if (/phdhtm/) { $type = "helical"; last SWITCH; }

		if (/dssp/)     { $type = "all"; last SWITCH; }
		if (/tmdet/)    { $type = "all"; last SWITCH; }
		if (/fulltm/)   { $type = "all"; last SWITCH; }
		if ($_ eq "ss") { $type = "all"; last SWITCH; }
		else { print STDERR "Warning: Don't recognise method $_\n"; $type = "none"; last SWITCH; }
	}
	
	return $type;

}

1;

