# A parsing module for Modeller interface
# 
# Created on November2004 
# Lucy Forrest 
# updated write_DoModel to use model name from input 31jan05

use strict;
use lib "$ENV{SCR}/modules/";

package   modeller;
require   Exporter;
require   fasta;
require   formats;
my @ISA = qw (Exporter);
my @EXPORT = qw (write_DoModel write_CalcRMS write_CalcSeqID setup_SnglTempl setup_MultTempl setup_Nest setup_Plop get_timing get_rms);

#### EXPORTED ####

################
# Send sequence name for model, reference to template name(s)
# filename (including path) of alignment
# and type of modelling required (nothing or 'ref' for refinement)
# Returns the text of the Modeller top file
#########################
sub write_DoModel {

	my ($model, $rtemplates, $alignfile, $mod_routine) = @_;
	my $text   = "";
	my $knowns = "";
	my $lev    = "";

	# template names
	my @templates = @$rtemplates; # get array from reference
	if    (@templates == 1) {        $knowns = "\'$templates[0]\'";  }
	elsif (@templates > 1)  {
		foreach my $k (@templates) { $knowns .= "\'$k\' "; }
	}
	else { print STDERR "Don't recognise templates\n"; return $text; }

	# for extra refinement
	if ($mod_routine) {
		if ($mod_routine eq "ref") {
			$lev = "
SET MD_LEVEL             = 'refine_4'
SET REPEAT_OPTIMIZATION  = 3, MAX_MOLPDF = 1E6\n";
		}
	}
	else { $lev = ""; }

	# create text output
$text = "\# Homology modelling
INCLUDE
SET OUTPUT_CONTROL       = 1 1 1 1 0
SET ALNFILE              = '$alignfile'
SET SEQUENCE             = '$model'
SET KNOWNS               = $knowns
SET ATOM_FILES_DIRECTORY = './'
SET STARTING_MODEL       = 1
SET ENDING_MODEL         = 1
$lev
CALL ROUTINE             = 'model'\n
SYSTEM COMMAND           = 'mv $model.B99990001 $model.pdb'\n";

	return $text;
}

###################
# Send reference to codes of proteins
# filename (including path) of alignment
# and type of atoms in RMSD desired.
# Plus filename of fitted pdb (2nd on 1st)
# Returns the text of the Modeller top file
# to calculate RMSD, and write out fitted structure
###################
sub write_CalcRMS {

	my ($ralign_codes, $alignfile, $atom, $outfile) = @_;
	my $text = "";
	my $s1; my $s2;

	my @align_codes = @$ralign_codes;
	if (@align_codes != 2) { return $text; }
	$s1 = $align_codes[0];
	$s2 = $align_codes[1];

	if (($atom ne "CA") and ($atom ne "all")) {
		print STDERR "Don't recognise atom type: Currently uses CA and all.\n"; 
		return $text; 
	}

	# create text input - note model2 is fitted onto model1
$text = "\# Calculate $atom RMS of $s1 to $s2
SET ALIGN_CODES           = '$s1' '$s2'
READ_ALIGNMENT FILE       = '$alignfile'
READ_MODEL  FILE          = '$s1'
READ_MODEL2 FILE          = '$s2'
PICK_ATOMS PICK_ATOMS_SET = 1, ATOM_TYPES = '$atom'
SUPERPOSE RMS_CUTOFFS     = 100\n";

	if ($outfile) {
		$text .="WRITE_MODEL2 FILE         = '$outfile'\n";
	}
	return $text;

}


###################
# Send reference to codes of two proteins
# filename (including path) of alignment
# Returns the text of the Modeller top file
# for calculating Sequence identity between pairs
###################
sub write_CalcSeqID {

	my ($ralign_codes, $alignfile) = @_;
	my $text = "";
	my $s1; my $s2;

	my @align_codes = @$ralign_codes;
	if (@align_codes != 2) { return $text; }
	$s1 = $align_codes[0];
	$s2 = $align_codes[1];

$text = "\# Calculate Sequence ID of @align_codes
READ_ALIGNMENT FILE      = '$alignfile'
SET ALIGN_CODES          = '$s1' '$s2'
ID_TABLE MATRIX_FILE     = 'id.dat'\n";

	return $text;

}

########################
# extracts CPU time 
# from Modeller Log file
# parse file name (scalar)
# returns formatted time (scalar)
########################
sub get_timing {

	my ($logfile) = $_[0];
	my $format = "NA";

	my $time = `grep "CPU time" $logfile | awk '{print \$6}'`; 
	chomp($time);
	if ($time) { 
		$format = sprintf "%.2f\n", $time;
	}	

	return $format;

}
########################
# extracts RMS values
# from Modeller Log file
# parse file name (scalar) and one pdb name
# returns formatted text (array)
########################
sub get_rms {

	#my ($logfile, $n1) = @_;
	my ($logfile, $model) = @_;
	my ($text, $gain, $seqid, $nres, $temp);
	my (@res, @raw, @rms);
	my $na = "NA";

	# sequence identity
	my $string = substr($model, 0, 6);
	#$temp  = `grep "^${n1}_o" $logfile | awk '{print \$2}'`; chomp($temp); # sequence identity
	$temp  = `grep "^$string" $logfile | awk '{print \$2}'`; chomp($temp); # sequence identity
	if ($temp) { $seqid = sprintf "%7.1f", $temp; }
	else { $seqid = sprintf "%7s", $na; }

	# RMSD
	@raw = `grep "Cutoff RMS" $logfile | awk '{print \$6}'`; # ca-m2x, aa-m2x, ca-m2t, ca-x2t
	foreach my $i (0 .. (@raw-1)) {
		$temp = $raw[$i]; chomp($temp); $text = sprintf "%8.2f", $temp;
		push @rms, $text;
	}

	# improvement in model compared to template
	if ($rms[3]) {
    	$temp = ($rms[3] - $rms[0]); # RMS(x2t) minus RMS(m2t)
		$gain = sprintf "%8.2f", $temp;
	}
	else { $gain = sprintf "%8s", $na; }

	# number of residues
	@res = `grep "Numb of residues in MODEL2" $logfile | awk '{print \$7}'`;
	if ($res[0]) { 
    	$temp = $res[0]; chomp($temp); 
		$nres = sprintf "%5d", $temp;
	}
	else { $nres = sprintf "%5s", $na; }

	return \@rms, $gain, $seqid, $nres;

}


#########################
# for a sequence and one template
# and the native (if desired)
# and an input alignment
# and a modeller routine type
# creates whole text for
# Modeller top file
#########################
sub setup_SnglTempl {

	my ($model,$n1,$n2,$in_fasta,$out_dir,$routine) = @_;
	my %pdbfile; my %type; my %chn; my %end; my %start;
	my $text = ""; my @seqs; my $atom; my $outfile;
	
	my $alnfile = "$out_dir/$model.ali";
	
	# store alignment
	my $fa = "";
	open (FA, $in_fasta); 
	while(<FA>) { $fa .= $_; } 
	close (FA);
	
	my %stored = &fasta::toFastaHash($fa);
	$stored{$model} = $stored{$n1}; # make a copy

	$pdbfile{$n1} = "$out_dir/$n1.pdb";
	$pdbfile{$n2} = "$out_dir/$n2.pdb";
	
	# Modeller format alignment input parameters 
	$type{$model} = "sequence";
	$type{$n1}    = "structureX";
	$type{$n2}    = "structureX";
	$end{$n1}     = "";	
	$end{$n2}     = "";	
	$chn{$model}  = $start{$model} = $end{$model} = "";
	($chn{$n1}, $start{$n1}) = get_chain($pdbfile{$n1});
	($chn{$n2}, $start{$n2}) = get_chain($pdbfile{$n2});

	# create Modeller alignment file
	open (ALI, ">$alnfile");			
	foreach my $i ($model, $n1, $n2) {
		print ALI &formats::toNestFormat ($i,$stored{$i},$type{$i},$chn{$i},$start{$i},$end{$i});
	}
	close (ALI);
	
	# create Modeller top file
	push @seqs, $n2;
	$text .= &write_DoModel($model,\@seqs,$alnfile,$routine);

	# model to native RMSD
	@seqs = ("$n1", "$model");
	$atom = "CA";
	$outfile = "$model.fit-x.pdb";
	$text .= &write_CalcRMS(\@seqs,$alnfile,$atom,$outfile);
	$atom = "all";
	$outfile = "";
	$text .= &write_CalcRMS(\@seqs,$alnfile,$atom,$outfile);
	
	# model to template RMSD
	@seqs = ("$n2", "$model");
	$atom = "CA";
	$outfile = "";
	$text .= &write_CalcRMS(\@seqs,$alnfile,$atom,$outfile);

	# template to native/xray RMSD
	@seqs = ("$n1", "$n2");
	$outfile = "";
	$text .= &write_CalcRMS(\@seqs,$alnfile,$atom,$outfile);
	
	# model to template Sequence identity
	@seqs = ("$n2", "$model");
	$text .= &write_CalcSeqID(\@seqs,$alnfile);

	return ($alnfile, $text);
}

#########################
# for a sequence and many templates
# and the native (if desired)
# and an input alignment
# and a modeller routine type
# creates whole text for
# Modeller top file
#########################
sub setup_MultTempl {

	my ($model,$n1,$rtemplates,$in_fasta,$out_dir,$routine) = @_;
	my @templates = @$rtemplates;
	my (%pdbfile, %type, %chn, %end, %start);
	my $text = ""; my @seqs; my $atom; my $outfile;
	
	my $alnfile = "$out_dir/$model.ali";
	
	# store alignment
	my $fa = "";
	open (FA, $in_fasta); 
	while(<FA>) { $fa .= $_; } 
	close (FA);
	
	my %stored = &fasta::toFastaHash($fa);
	$stored{$model} = $stored{$n1}; # make a copy

	$pdbfile{$n1} = "$out_dir/$n1.pdb"; #native

	# Modeller format alignment input parameters 
	foreach my $n (@templates) {
		$pdbfile{$n} = "$out_dir/$n.pdb";
		$type{$n}    = "structureX";
		$end{$n}     = "";	
		($chn{$n}, $start{$n}) = get_chain($pdbfile{$n});
	}
	$type{$model} = "sequence";
	$type{$n1}    = "structureX";
	$end{$n1}     = "";	
	$chn{$model}  = $start{$model} = $end{$model} = "";
	($chn{$n1}, $start{$n1}) = get_chain($pdbfile{$n1});

	# create Modeller alignment file
	open (ALI, ">$alnfile");			
	foreach my $i ($model, $n1, @templates) {
		print ALI &formats::toNestFormat ($i,$stored{$i},$type{$i},$chn{$i},$start{$i},$end{$i});
	}
	close (ALI);
	
	# create Modeller top file
	$text .= &write_DoModel($model,\@templates,$alnfile,$routine);

	# model to native RMSD
	@seqs = ("$n1", "$model");
	$atom = "CA";
	$outfile = "$model.fit-x.pdb";
	$text .= &write_CalcRMS(\@seqs,$alnfile,$atom,$outfile);
	$atom = "all";
	$outfile = "";
	$text .= &write_CalcRMS(\@seqs,$alnfile,$atom,$outfile);
	
	return ($alnfile, $text);
}

#########################
# for a sequence and one template
# and an input alignment
# creates Nest alignment file
#########################
sub setup_Nest {

	my ($model,$n1,$n2,$in_fasta,$out_dir,$routine) = @_;
	my %pdbfile; my %type; my %chn; my %end; my %start;
	my $text = ""; my @seqs; my $atom; my $outfile;
	
	my $alnfile = "$out_dir/$model.pir";
	
	# store alignment and remove slashes - nest can't read more than one chain anyway!
	my $fa = "";
	open (FA, $in_fasta); 
	while(<FA>) { s#\/##; $fa .= $_; } 
	close (FA);
	
	my %stored = &fasta::toFastaHash($fa);
	$stored{$model} = $stored{$n1}; # make a copy

	$pdbfile{$n2} = "$out_dir/$n2.pdb";
	
	# Modeller format alignment input parameters 
	$type{$model} = "sequence";
	$type{$n2}    = "structureX";
	$end{$n2}     = "";	
	$chn{$model}  = $start{$model} = $end{$model} = "";
	($chn{$n2}, $start{$n2}) = get_chain($pdbfile{$n2});

	# create Nest alignment file
	open (ALI, ">$alnfile");			
	foreach my $i ($model, $n2) {
		print ALI &formats::toNestFormat ($i,$stored{$i},$type{$i},$chn{$i},$start{$i},$end{$i});
	}
	close (ALI);
	
	return ($alnfile, $text);
}

#########################
# for a sequence and one template
# and an input alignment
# creates PLOP alignment file
#########################
sub setup_Plop {

	my ($model,$n1,$n2,$in_fasta,$out_dir,$plop_params,$log_file) = @_;
	my %pdbfile; my $text = ""; 
	my $alnfile = "$out_dir/$model.plop";
	
	# store alignment and remove slashes - plop can't read more than one chain anyway!
	my $fa = "";
	open (FA, $in_fasta); 
	while(<FA>) { s#\/##; $fa .= $_; } 
	close (FA);
	my %stored = &fasta::toFastaHash($fa);

	$pdbfile{$n2} = "$out_dir/$n2.pdb"; #template

    # create control file - fix CHAIN!
	$text = "
# plop control file for model $model
file datadir $plop_params
file outdir $out_dir
file logfile $log_file\n
homolog single &
  $alnfile &
  $pdbfile{$n2} &
  ssgaps yes &
  chain _ &
  conserve yes &
  x2p yes &
  g2x yes &
  cterm no &
  nterm no &
  sym none &
  tail aligned\n
write pdb $model.pdb\n";
#from cinque
#echo "load pdb $PDB/16pk.pdb " >> $Homology.con
#echo "homolog load 16pk.align 16pk.pdb &" >> $Homology.con
#echo "native 16pk.pdb &" >> $Homology.con
#echo "finalfile 16pk_homo.rmsd " >> $Homology.con
#echo "structure write 16pk_homo.pdb" >> $Homology.con
#initfile file &
#gapfile file &
#insertfile file &
#x2pfile file &
#g2xfile file &
#sidefile file &
#finalfile file
#het 

	# create Plop alignment file
	open (ALI, ">$alnfile");			
	print ALI &formats::toPlopFormat ($n1,\%stored);
    
	return ($alnfile, $text);
}

###################################################
# local subroutine 
# for extracting chain ID and first residue number
# from pdb file whose filename is parsed
# returns the chain and residue number
###################################################
sub get_chain {

	my $chain; my $res_num;
	my $pwd = `pwd`;
	open(PDB, $_[0]) or die "Couldn't open pdb file: $_[0]\nin subroutine get_chain in modeller.pm\nCwd = $pwd";
	while (<PDB>) { 
		if ($_ =~ /^ATOM/) { 
			$chain = substr($_,21,1);
			$res_num = substr($_,22,4);
			last;
		}
	}
	return ($chain,$res_num);
}

