#
# A parsing module for fasta
# (I've also stuck a method in for MSF formats)
# 
# Created on 4/11/2002.
# Author: cltang @ honig lab 
# 
# Advertised functions:
#
# $version = &version ()
# %fastaHash = &toFastaHash ($text);
# %skaHash = &toSkaHash ($text);
# \%fastaHash,\%desHash,\@order = &toFastaHash ($text);
# %hash = &removeGaps (\%hash);
# 

use strict;

package   fasta;
require   Exporter;
my @ISA = qw (Exporter);
my @EXPORT = qw (version toFastaHash toFastaHashWithDesc 
		 getSeq compareIgnoreWild removeGaps toMSFHash toSkaHash); 

#### EXPORTED ####

###########
sub version
########### 
{
    print STDERR "parse Version 0.1: Apr 11, 2002\n";
    return "0.1";
}

###############
sub toSkaHash {
###############

    my ($inbuff) = @_;
    my %rethash;
    my $seed = "";
    my $position = 0;

    my @list = split /\n/, $inbuff;
    foreach my $line (@list) {
          
       #check within first four lines after header
       if ($line =~ /..........+.........+.........+.........+...../) {
            $position = 0; 
       }
       else { $position++; }

       if ($position < 5) { 
          # e.g.  tc_sse:3BEH.pdb_A      ccc
	  if ($line =~ /^\s+(\S+)\s+(\S+)\s*$/) {
             if ($2 !~ /\d+/) { 
                print STDERR "fixing problem with: $2\n";
                $rethash {$1} .= $2;
                $seed = $1 if (length($seed)==0);
             }
          }
          # e.g.  tc_sse:3BEH.pdb_A   1  ccc
	  elsif ($line =~ /^\s+(\S+)\s+(\S+)\s+(\S+)\s*$/) {
	    $rethash {$1} .= $3;
	    $seed = $1 if (length($seed)==0);
	  } 
       }   
    }
    
    my $seed_length = length ($rethash {$seed});
    foreach my $seq_id (keys %rethash) {
	  if (length ($rethash {$seq_id}) != $seed_length) {
	    die "Malformed MSF file or non-unique seq id in file!\n";
	  }
    }

    return %rethash;    
}
###############
sub toMSFHash {
###############

    my ($inbuff) = @_;
    my %rethash;
    my $seed = "";

    my @list = split /\n/, $inbuff;
    foreach my $line (@list) {
	if ($line =~ /^(\S+)\s+(\S+)$/) {
	    $rethash {$1} .= $2;
	    $seed = $1 if (length($seed)==0);
	} 
    }
    
    my $seed_length = length ($rethash {$seed});
    foreach my $seq_id (keys %rethash) {
	if (length ($rethash {$seq_id}) != $seed_length) {
	    die "Malformed MSF file or non-unique seq id in file!\n";
	}
    }

    return %rethash;    
}

#################
sub toFastaHash {
#################

    my ($inbuff) = @_;
    my $current_id = "";
    my %rethash;

    #my @list = split />/, $inbuff;  #note: inserts an empty element into @list
    #note this cannot handle windows files; make sure to use dos2unix on fasta seq from Grasp2 - 10july06lrf
    my @list = split /\n>/, $inbuff; #note: this is the correction
    $list[0] =~ s/^>//;

    map {
	my @lines = split /\n/, $_;
	$lines[0] =~ /^(.*)$/;
	my $key = $1;
	if ($key) {
	    if ($rethash{$key}) {
		my $count = 0;
		my $new_key;
		do {
		    $new_key = "$key\_$count";
		    $count++;
		} while (length($rethash{$new_key}));
		print STDERR "fasta::toFastaHash $key is not unique.";
		print STDERR " New key is $new_key.\n";
		$key = $new_key;
	    }
	    $rethash{$key} = $lines[1];
	    for (my $i=2; $i<@lines; $i++) {
		$rethash{$key} .= $lines[$i];
	    }
	    $rethash{$key} =~ s/\s//g;
	}
    } @list;

    return %rethash;

}

#########################
sub toFastaHashWithDesc {
#########################

    my ($inbuff) = @_;
    my $current_id = "";
    my (%rethash, %deshash, @order);

    #my @list = split />/, $inbuff;  #note: inserts an empty element into @list
    my @list = split /\n>/, $inbuff; #note: this is the correction
    $list[0] =~ s/^>//;

    map {
	my @lines = split /\n/, $_;
	$lines[0] =~ /^(\S+)/;
	my $key = $1;
	if ($key) {
	    if ($rethash{$key}) {
		my $count = 0;
		my $new_key;
		do {
		    $new_key = "$key\_$count";
	  	    $count++;
		} while (length($rethash{$new_key}));
		#print STDERR "fasta::toFastaHash $key is not unique.";
		#print STDERR " New key is $new_key.\n";
		$key = $new_key;
	    }
	    $deshash{$key} = $lines[0];
	    $rethash{$key} = $lines[1];
	    push @order, $key;
	    for (my $i=2; $i<@lines; $i++) {
		$rethash{$key} .= $lines[$i];
	    }
	    $rethash{$key} =~ s/\s//g;
	}
    } @list;

    return \%rethash,\%deshash,\@order;

}

#######################
sub compareIgnoreWild {
#######################

    my ($seqa, $seqb) = @_;
    my $lena = length ($seqa);
    if (length ($seqb) != $lena) {
	return 0;
    }
    for (my $i=0; $i<$lena; $i++) {
	my $chara = substr ($seqa, $i, 1);
	my $charb = substr ($seqb, $i, 1);
	if ($chara ne "X" && $charb ne "X" && $chara ne $charb) {
	    return 0;
	}
    }

    return 1;
}

############
sub getSeq {
############

    my ($id, $fa_file) = @_;

    my $line;
    open FA, "<$fa_file";
    do {
	$line = <FA>;
    } while ($line && $line !~ /^>.*$id/);
    
    if (!$line) {
	print STDERR "[fasta::getSeq] $id not found in $fa_file\n";
	return "";
    }
    
    my $fa_seq = "";
    while (<FA>) {
	last if (/^>/);
	chomp;
	$fa_seq .= $_;
    }

    close FA;
    return $fa_seq;

}

################
sub removeGaps {
################

    my ($hashref) = @_;
    my %rethash = %$hashref;
    
    map {
	$rethash {$_} =~ s/-//g;
    } keys %rethash;
    
    return %rethash;
}
