#
# A developmental version of a blast parsing package.
# Probably some of these functions will have other implementations,
# were copied wholesale from other places, or will need to be
# updated with changing standards.  I will do my best to keep
# things organized & reusable, but make no guarantees on being
# comprehensive.  There is no object-orientation attempted.
# Finally, caveat adaptor.
# 
# Created on 3/31/2002.
# Author: cltang @ honig lab 
# 
# Advertised functions:
#
# $version = &version ()
# %queries = &toQueryHash (\$blastText)
# @rounds = &toRoundsArry ($queryText)
# %hits = &toHitsHash ($roundText) 
# 

use lib "$ENV{MOUNTDIR}/common/scripts/modules/parsing";

use strict;
use fileio;
#use DB_File; #removed Jun16,clt 
use Fcntl;

package   blast;
require   Exporter;
my @ISA = qw (Exporter);
my @EXPORT = qw (version toQueryHash toRoundsArry toHitsHash);
		 # toQueryHashDB toRoundsArryDB); 

#### EXPORTED ####

###########
sub version
########### 
{
    print STDERR "parse Version 0.1: Mar 31, 2002\n";
    print STDERR "blast Version 2.2.2 [Dec-14-2001]\n";
    return "0.1";
}

######################
sub toHitsHashFromFile
######################
# Writes a hits hash using minimal memory possible
# e.g. read directly from file.  this function is
# more memory efficient that the other alternatives,
# but less flexible.  Assumes there is one query.
{
    my ($file) = @_;
    my %outhash = ();

    my $text = &fileio::readAsciiFile("grep round $file|");
    $text =~ / round (\d+)$/;
    my $rounds = 0;
    $rounds = $1 if ($1);
    
    my $flag = 0;
    $rounds>0 or $flag = 1;
    my $lockHash = 0;
    my %idxhash;

    open FILE, "<$file";
    while (<FILE>) {
	$flag = 1 if (/Results from round $rounds/o);
	$flag = 2 if ($flag==1 && /^QUERY/);
	next unless $flag==2;
	if (my ($seqID,$seq)=/^(\S+)\s+\d*\s*([\-\w]+)/) {
	    if ($idxhash {$seqID} == 1) {
		# sometimes a seqID will be repeated
		# give this seq a unique seqID
		my $count = 0;
		my $tempID;
		do {
		    $tempID = $seqID . "_" . $count;
		    $count++;
		}
		while ($idxhash {$tempID} == 1);
		if (!$lockHash) {
		    #print STDERR "Avoiding hashtable clash @ $seqID... ($tempID)\n";
		}
		$seqID = $tempID;
	    } 
	    if (length ($outhash{$seqID}) == 0) {
		last if ($lockHash);
		$outhash {$seqID} = $seq;
		$idxhash {$seqID} = 1;
	    } else {
		$outhash {$seqID} .= $seq;
		$idxhash {$seqID} = 1;
	    }
	} else {
	    $lockHash = 1;
	    undef %idxhash;
	}
    }
    close FILE;
    return \%outhash;
}

###############
sub toQueryHash
###############
# Split a multi-query blast search into single queries
{
    my ($txtbuff) = @_;
    my %outhash = ();

    while (length($txtbuff)) {
	$txtbuff =~ /Query= (.+)\n/;
	my $queryID = $1;
	# print "doing $queryID\n";
	$txtbuff = $&.$';

        if ($txtbuff =~ /BLASTP/) {
	    $outhash {$queryID} = $`;
	    $txtbuff = $&.$';
	} else {
	    $outhash {$queryID} = $txtbuff;
	    $txtbuff = "";
	}
    }

    return %outhash;
}

# #################
# sub toQueryHashDB
# #################
# Split a multi-query blast search into single queries
# {
#     my ($txtbuff) = @_;
#     my (%outhash, @keys);
#     my $db = tie %outhash, "DB_File", undef;

#     while (length($txtbuff)) {
# 	$txtbuff =~ /Query= (.+)\n/;
# 	my $queryID = $1;
# 	$txtbuff = $&.$'; #'
	
#         if ($txtbuff =~ /BLASTP/) {
# 	    $outhash {$queryID} = $`; 
# 	} else {
# 	    $outhash {$queryID} = $txtbuff;
# 	    $txtbuff = "";
# 	}
# 	push @keys, $queryID;
#     }
    
#     return $db, \@keys;
# }

################
sub toRoundsArry
################
# Split a multi-round psi-blast query into single rounds
{
    my ($txtbuff) = @_;
    $txtbuff =~ /Results from round/;
    $txtbuff = $'; #'

    my @outarry = split /Results from round /, $txtbuff;
    return @outarry;
}

# ##################
# sub toRoundsArryDB
# ##################
# # Split a multi-round psi-blast query into single rounds
# {
#     my ($txtbuff) = @_;
#     $txtbuff =~ /Results from round/;
#     $txtbuff = $'; #'

#     my @outarry;
#     my $db = tie @outarry, "DB_File", "tmp.r";
#     @outarry = split /Results from round /, $txtbuff;
#     return $db;
# }

##############
sub toHitsHash
##############
# Generate a hash that contains seq ids as keys, seq as values
# assumes blast output in format -m 6 
{
    my ($txtbuff) = @_;
    my %outhash = ();

    $txtbuff =~ /QUERY/;
    $txtbuff = $&.$'; #'

    #my @lines = split ("\n", $txtbuff);
    my $lockHash;
    my %idxhash;

    while ($_=$txtbuff) {
	if (my ($seqID,$seq)=/^(\S+)\s+\d*\s*([\-\w]+)/) {
	    if ($idxhash {$seqID} == 1) {
		# sometimes a seqID will be repeated
		# give this seq a unique seqID
		my $count = 0;
		my $tempID;
		do {
		    $tempID = $seqID . "_" . $count;
		    $count++;
		}
		while ($idxhash {$tempID} == 1);
		if (!$lockHash) {
		    print STDERR "Avoiding hashtable clash @ $seqID... ($tempID)\n";
		}
		$seqID = $tempID;
	    } 
	    if (length ($outhash{$seqID}) == 0) {
		last if ($lockHash);
		$outhash {$seqID} = $seq;
		$idxhash {$seqID} = 1;
	    } else {
		$outhash {$seqID} .= $seq;
		$idxhash {$seqID} = 1;
	    }
	} else {
	    $lockHash = 1;
	    undef %idxhash;
	}
	$txtbuff =~ /\n/;
	$txtbuff = $';
    }
    
    return %outhash;
}

##################
sub checkAlignment
##################
# Do all aligned strings have same # of characters?
{
    my ($hash_ref) = @_;
    my %hash = %$hash_ref;
    my @keylist = keys %hash;

    my $flag = 0;
    my $i;

    for ($i=1; $i<@keylist; $i++) {
	$flag++ if (length($hash{$keylist[$i]})==length($hash{$keylist[0]}));
    }
    
    return ($flag+1==@keylist);
}

#*** PRIVATE ***#

#*********#
sub advance
#*********#
# Advance the text buffer until we see the token.
{
    my ($txtref, $token, $delim) = @_;
    my $retbuff = "";
    
    if (!$delim) { $delim = "\n"; }
    my @lines = split ($delim, $$txtref);

    $$txtref = "";
    
    my $buffref = \$retbuff;
    foreach (@lines) {
	$buffref = $txtref if (/$token/);
	$$buffref .= $_;
	$$buffref .= $delim;
    }
    
    return $retbuff;
}

#******#
sub scan
#******#
# Scan text for a simple regexp
{
    my ($txt, $regexp) = @_;
    $_ = $txt;
    my ($value) = m/$regexp/;
    return $value;
}

#***********#
sub firstLine 
#***********#
# Return the first line split by a delimiter
{
    my ($txtbuff, $delim) = @_;
    if (!$delim) { $delim = "\n"; }
    my @lines = split ($delim, $txtbuff);
    return $lines[0];
}

1;
