#
# A developmental version of format generation code.  The input
# is either sequence and sequence identifier, or sequence hash
# and primary sequence identifier (to be listed first, or as query).
# The aim is to generate reuseable modules with minimal complexity
# and high flexibility.  Therefore, only primitive Perl objs/vars
# are used where possible and no obj-orientation has been attempted.
# 
# Created on 4/1/2002.
# Author: cltang @ honig lab 
# Update: Lucy Forrest and Marc Fasnacht @ honig lab with Nest and PIR formats
# Update: Lucy Forrest @ honig lab with stp format
# Update: Lucy Forrest @ honig lab with plop homology model alignment format
# 
# Advertised functions:
#
# $string = &toFastaFormat ($seqID,$seq) 
# $string = &toMFAFormat ($primaryID,\%seqHash,\%desHash)
# $string = &toPIRFormat ($primaryID,\%seqHash,\%desHash)
# $string = &toNestFormat ($primaryID,\%seqHash,\%desHash, $chain, $start, $end)
# $string = &toPlopFormat ($primaryID,\%seqHash) 
# $string = &toMSFFormat ($primaryID,\%seqHash)
# $string = &toSwissProtFormat ($primaryID,\%seqHash,\@start,\@end)
# 
# ($rstring, $rstring, $rstring, $rstring, $rstring) = &formats::readSegmentList($list_filename, "1");
# ($seq, $ss, $name) = &ReadSTP (\@stp_file)
# ($seq, $ss, @file_contents) = &ReadSTPfile ($stp_filename)
# 

use strict;

package   formats;
require   Exporter;
my @ISA = qw (Exporter);
my @EXPORT = qw (toMFAFormat toMSFFormat toPIRformat toNestFormat checkAlignment ReadSTP ReadSTPfile readPredTMBB readSegment);

###########
sub version
########### 
{
    print STDERR "parse Version 0.1: Mar 31, 2002\n";
    return "0.1";
}

#################
sub toFastaFormat
#################
# FASTA format for a seqID and seq string
{
    my($seqID, $seqbuff) = @_;
    my $outbuff = "";

    $seqbuff =~ s/\s//g;
    $outbuff .= ">$seqID\n";
    
    while (length($seqbuff)) {
	$outbuff .= substr ($seqbuff,0,60);
	$outbuff .= "\n";
	$seqbuff = substr ($seqbuff,60);
    }
    return $outbuff;
}

#################
sub toPIRFormat
#################
# PIR format for a seqID and seq string
{
    my($seqID, $seqbuff) = @_;
    my $outbuff = "";
 
    $seqbuff =~ s/\s//g;
    $seqbuff =~ s/\$//g;
    $seqbuff =~ s/\^//g;
    $seqbuff .= "*";
    $outbuff .= ">P1;$seqID\n\n";
     
    while (length($seqbuff)) {
        $outbuff .= substr ($seqbuff,0,70);
        $outbuff .= "\n";
        $seqbuff = substr ($seqbuff,70);
    }
    return $outbuff;
}

#################
sub toNestFormat
#################
# Nest/PIR format for a seqID and seq string
{
    my($seqID,$seqbuff,$seq_type,$chain,$start,$end,$chain2) = @_;
    my $outbuff = "";

    # added error reporting, 02aug07 lrf
    if (!$seq_type) { print STDERR "No sequence type provided\n"; }
    if (!$seqID) { print STDERR "No SEQID provided\n"; }
    if (!$seqbuff) { print STDERR "No sequence provided\n"; }
    if (!$chain) { print STDERR "No chain value provided\n"; }
    if (!$start) { print STDERR "No start value provided\n"; }
    if (!$end)   { print STDERR "No end value provided\n"; }
    if (!$chain2) { if ($chain) { $chain2 = $chain; } 
                    else        { $chain2 = ""; } 
    }

    $seqbuff =~ s/\s//g;
    $seqbuff =~ s/\$//g;
    $seqbuff =~ s/\^//g;
    $seqbuff .= "*";
    $outbuff .= ">P1;$seqID\n";
    #my $pdbID = $seqID;
    #$pdbID =~ s/\s//g;
    #$pdbID =~ s/^\_*//;
    #$pdbID = substr($pdbID,0,5);
    #$outbuff .= "$seq_type:$pdbID:$start:$chain:$end:$chain\n";

	# added colons at end of line, 02aug07 lrf
    $outbuff .= "$seq_type:$seqID:$start:$chain:$end:$chain2";
    $outbuff .= "::::\n";
    while (length($seqbuff)) {
        $outbuff .= substr ($seqbuff,0,70);
        $outbuff .= "\n";
        $seqbuff = substr ($seqbuff,70);
    }
    return $outbuff;
}

#####################
sub toSwissProtFormat
#####################
# SwissProt format for a seqID and seq string
# and list of helix end points
{
    my($seqID,$seqbuff,$rstart,$rend) = @_;
    my $outbuff = "";
	my @start = @$rstart;
	my @end   = @$rend;
 
    $seqbuff =~ s/\s//g;
    $seqbuff =~ s/\$//g;
    $seqbuff =~ s/\^//g;
    $outbuff .= "ID   $seqID\n";

	# define helices
	for (my $i = 1; $i < @start; $i++) {
		$outbuff .= "FT HELIX $start[$i]{$seqID} $end[$i]{$seqID}\n";
	}

    $outbuff .= "SQ\n";
    while (length($seqbuff)) {
        $outbuff .= substr ($seqbuff,0,70);
        $outbuff .= "\n";
        $seqbuff = substr ($seqbuff,70);
    }
    $outbuff .= "//\n";
    return $outbuff;
}

###############
sub toMFAFormat
###############
# Adapted from convert_file_format.pl
# Return alignment string in multi-fasta (MFA) format
#   $desc_ref contains the title string for each key (optional)
#   $order_ref contains the ordering information (optional)
{
    my($primary_id, $hash_ref, $desc_ref, $order_ref) = @_;
    my %hash = %$hash_ref;
    my %dHash = ();
    %dHash = %$desc_ref if ($desc_ref);

    ### first fasta record ###
    my $outbuff = "";
    my $desc = $primary_id;
    $desc = $dHash{$primary_id} if ($dHash{$primary_id});
    $outbuff .= &toFastaFormat ($desc,$hash{$primary_id})
	if (length($primary_id)>0);
#print STDERR $outbuff;

    ### use ordering info or sort ###
    my $seq_id;
    my @order = sort keys %hash;
    if ($order_ref) {
	if (@$order_ref != scalar(@order)) {
	    print STDERR "formats::toMFAFormat mismatch in order array.\n";
	} else {
	    @order = @$order_ref;
	}
    }

    ### create the fasta file text ###
    foreach $seq_id (@order) {
	next if ($seq_id eq $primary_id);
	$desc = $seq_id;
	$desc = $dHash{$seq_id} if ($dHash{$seq_id});
	$outbuff .= &toFastaFormat ($desc,$hash{$seq_id});
    }

    return $outbuff;
}

###############
sub toMSFFormat
###############
# Adapted from convert_file_format.pl
# Return alignment string in msf format
{
    my($primary_id, $hash_ref) = @_;
    my %hash = %$hash_ref;

    my($i, $na);
    my $primary_seq = $hash{$primary_id};

    my $outbuff = "";
    for($i=0; $i<length($primary_seq); $i+=60){

	$outbuff .= sprintf ("%-20s\t",substr($primary_id,0,20));
	$outbuff .= sprintf ("%s\n",substr($primary_seq,$i,60));
	
	foreach $na (sort keys %hash) {
	    next if ($na eq $primary_id); 
	    $outbuff .= sprintf ("%-20s\t",substr($na,0,20));
	    $outbuff .= sprintf ("%s\n",substr($hash{$na},$i,60));
	} 

	$outbuff .= "\n";
    }       
    
    return $outbuff;
}

###############
sub toPlopFormat
###############
# Return alignment string in plop format
{
    my($primary_id, $hash_ref) = @_;
    my %hash = %$hash_ref;

    my($i, $na);
    my $primary_seq = $hash{$primary_id};

    my $outbuff = "";
    for($i=0; $i<length($primary_seq); $i+=60){

	$outbuff .= sprintf ("%-7s",substr("Targ:  ",0,7));
	$outbuff .= sprintf ("%s\n",substr($primary_seq,$i,60));
	
	foreach $na (sort keys %hash) {
	    next if ($na eq $primary_id); 
	    $outbuff .= sprintf ("%-7s",substr("Temp:  ",0,7));
	    $outbuff .= sprintf ("%s\n",substr($hash{$na},$i,60));
	} 

	$outbuff .= "\n";
    }       
    
    return $outbuff;
}

#################
sub ReadSTP
#################
# returns ss as a single string
{
	# input is reference to array containing stp file
	my($rstp_file) = @_;
	my @stp_file = @$rstp_file;
	my $resno = 0;
	my ($res, $ss, @junk, $name);

	foreach $_ (@stp_file) {
		chomp();
		if (($resno == 0) and ($_ ne "PFRMAT COMP_STR")) { warn "  Not COMP_STR format; $_\n"; return; }
		elsif (($resno == 0) and ($_ eq "PFRMAT COMP_STR")) { $resno++; next; } 

		if (/stp/) { s/REMARK//; s/\.stp//; s/\s+//g; $name = $_; next; } #print STDERR "Found name $name\n"; }

		next if ($_ =~ /^PFRMAT/);
		next if ($_ =~ /^END/);
		next if ($_ =~ /^SS/);
		next if ($_ =~ /^REMARK/);

		@junk = split();

		$res .= $junk[0];
		$ss  .= $junk[1];
	}

	return ($res, $ss, $name);

}

#################
sub ReadSTPfile
#################
# read in structure file stp type
# and return ss and rest as an array not a string
{
    my($filename) = @_;
    open (IN, "<$filename") or die "Can't find $filename\n";
    my ($header, $seq);
    my @contents;
    while (my $line = <IN>) {
       next if ($line =~ /^END/);
       if ($line =~ /[A-Z]{2}/) { $header .= $line; }
       else {
           push @contents, $line;
           my ($res, $junk) = split (/\s+/, $line);
           $seq .= $res;
       }
    }
    close IN;
    return ($seq, $header, \@contents);
                                                                                                
}
                                                                                                

################
# to read the list of segments for a single pdb file
################
sub readSegmentList {

	my ($verbose, $file, $pdb, %num_seg, @seg_names, %start, %end, %chn, %length, $seg);
	($file, $verbose) = @_;

	open (LIST, "$file") or die "Couldn't open $file";
	print STDERR "\nReading $file\n";

	# read list of helices
	while (<LIST>) {

		# first line contains the header with the pdb filenames
		if (/^seg/) { 
            s/segment //;
			$pdb = substr($_, 0, 4); 
			print STDERR "Found $pdb\n" if ($verbose); 
		}	
		else {
			next unless /\w/; # skip lines without text
			next if (/#/); # skip lines with hashes

			# store data for each seg line
			my @temp = split(/\s+/, $_);
			$seg              = $temp[0];
			$chn{$seg}{$pdb}  = $temp[1];
			$start{$seg}{$pdb}= $temp[2];
			$end{$seg}{$pdb}  = $temp[3];
			$length{$seg}{$pdb} = $end{$seg}{$pdb} - $start{$seg}{$pdb} + 1;
			$num_seg{$pdb} ++;
            push @seg_names, $seg;

			print STDERR "$pdb $seg $chn{$seg}{$pdb} $start{$seg}{$pdb} $end{$seg}{$pdb} $length{$seg}{$pdb}\n" if ($verbose);
		}
	} # end read list

	return ($pdb, \%num_seg, \@seg_names, \%start, \%end, \%chn, \%length);

}


1;
