#!/usr/bin/perl

use strict;
use Getopt::Std      qw(getopt);

my $curr_version = "1.0";

###################################### Current version is 1.0 ########################
#				       Now the script:
#				       1) Reads input FASTA file and simulate its
#					  evolution under given parameters
#				       2) Cuts random part of specified length to
#					  the output
#
######################################################################################
###################################### DEFAULT PARAMETERS ############################
#
	my $def_change  = 0.2;
	my $def_replace = 0.5;
	my $def_times	= 1;
	my $def_gener   = 1;
	my $peptide_len = 20;
	my $def_full	= "N";
#
######################################################################################
###################################### READING AND CHECKING PARAMETERS ###############
#
getopt( 'iocrtgfd' );

my $input = $Getopt::Std::opt_i;
my $output = $Getopt::Std::opt_o;
my $change = $Getopt::Std::opt_c;
my $replace = $Getopt::Std::opt_r;
my $times = $Getopt::Std::opt_t;
my $gener = $Getopt::Std::opt_g;
my $full  = $Getopt::Std::opt_f;
my $debug = $Getopt::Std::opt_d;

if ( ($output eq "")||($output eq "") )
{
	print "This script will simulate random evolution of the given protein.\n";
	print "Script version is: $curr_version, modified 13.03.2013\n";
	print "\n";
	print "Parameters to the program are as follows:\n";
	print "\t-i\t Name of the input file (FASTA format, 1 sequence);\n";
	print "\t-o\t Name of the output file;\n";
	print "\t-c\t Probability to change each residue (DEFAULT = $def_change);\n";
	print "\t-r\t Probability of the replacement in case of change (DEFAULT = $def_replace)\n";
	print "\t  \t Note that complementary probability will be provided to indel;\n";
	print "\t-t\t Number of peptides to generate (DEFAULT = $def_times);\n";
	print "\t-g\t Number of generations to mutate peptides (DEFAULT = $def_gener);\n";
	print "\t-f\t If 'Y', returns full-length results (DEFAULT = $def_full).\n";

	exit;
}

if ($change eq "")
{
	$change = $def_change;
}
if ($replace eq "")
{
	$replace = $def_replace;
}
if ($times eq "")
{
	$times = $def_times;
}
if ($gener eq "")
{
	$gener = $def_gener;
}
if ($full eq "")
{
	$full = $def_full;
}
if ($debug ne "Y")
{
	$debug = "N";
}


if (($change > 1)||($change < 0))
{
	print "FATAL ERROR:\n";
	print "Probability of amino acid change is invalid: should belong to [0; 1]\n";
	print "Current value = '$change'\n";
	print "Terminating script...\n";
	exit;
}
if (($replace > 1)||($replace < 0))
{
	print "FATAL ERROR:\n";
	print "Probability of amino acid replacement is invalid: should belong to [0; 1]\n";
	print "Current value = '$replace'\n";
	print "Terminating script...\n";
	exit;
}
#
######################################################################################
###################################### SCRIPT BODY ###################################
#
my $input_seq = "";
my $input_descr = "";

#---------------------------------------- (1) Input reading --------------------------

open (INPUT, "< $input") or die "FATAL ERROR:\nFile '$input' is missing\nTerminating script...\n";
my $str = 0;
while (my $temp = <INPUT>)
{
	chomp($temp);
	$str++;
	if ($temp =~ /^\>/)
	{
		if ($input_descr ne "")
		{
			print "FATAL ERROR:\n";
			print "More than one sequence is detected in file $input\n";
			print "Terminating script...\n";
			exit;
		}
		else
		{
			$input_descr = $temp;
		}
	}
	else
	{
		if ($temp =~ /\s/)
		{
			print "FATAL ERROR:\n";
			print "Space character is detected in string #$str\n";
			print "Terminating script...\n";
			exit;
		}
		$input_seq .= "$temp";
	}
}
close INPUT;

if (($input_descr eq "")||($input_seq eq ""))
{
	print "FATAL ERROR:\n";
	print "Check format of the input file: no FASTA data found\n";
	print "Terminating script...\n";
	exit;
}

#---------------------------------------- (2) Simulating evolution -------------------
my @seq_array = qw();
my $curr_seq = $input_seq; 
for (my $i = 0; $i < $times; $i++)
{
	for (my $j = 0; $j < $gener; $j++)
	{	
		$curr_seq = evolve_seq ($curr_seq, $change, $replace, $gener, $debug);
	}

	if ($full ne "Y")
	{
          	if ($peptide_len > (length($curr_seq) - 1))
		{
			print "FATAL ERROR:\n";
			printf "Cannot cut $peptide_len residues from resulting sequence with length %s\n", length($curr_seq);
			print "Terminating script...\n";
			exit;
		}
	
  		my $random_offset = int(rand (length($curr_seq) - $peptide_len));

		if ($debug eq "Y")
		{
			my @curr_seq_array = split (//, $curr_seq);
			$curr_seq = "";
			for (my $i = 0; $i < scalar(@curr_seq_array); $i++)
			{
				if (($i < $random_offset)||($i > $peptide_len + $random_offset - 1))
				{
					$curr_seq .= "-";
				}
				else
				{
					$curr_seq .= $curr_seq_array[$i];
				}
			}
		}
		else
		{
	        	$curr_seq = substr($curr_seq, $random_offset, $peptide_len);
		}
	}
	$seq_array[$i] = $curr_seq;
}

#---------------------------------------- (3) Output printing ------------------------

open (OUTPUT, "> $output");

my $now_time = localtime;
print OUTPUT "This is a result of script 'evolve_protein.pl', version $curr_version\n";
print OUTPUT "Output is created at: $now_time\n\n";
print OUTPUT "----------- Script parameters -----------\n";
if ($change != $def_change)
{
	print OUTPUT "<USER INPUT, option -c> Probability of residue change =\t$change\n";
}
else
{
	print OUTPUT "<    DEFAULT VALUE    > Probability of residue change =\t$change\n";
}
if ($replace != $def_replace)
{
	print OUTPUT "<USER INPUT, option -r> Probability of residue replacement =\t$replace\n";
}
else
{
	print OUTPUT "<    DEFAULT VALUE    > Probability of residue replacement =\t$replace\n";
}
printf OUTPUT "<                     > Probability of insertion =\t%.3f\n", (1 - $replace)/2;
printf OUTPUT "<                     > Probability of deletion =\t%.3f\n", (1 - $replace)/2;
if ($full eq "Y")
{
	print OUTPUT "Full-length sequenses were generated\n";
}
else
{
	print OUTPUT "Number of residues taken from resulting sequence =\t$peptide_len\n"; 
}
print OUTPUT "Number of sequences to generate =\t$times\n"; 
print OUTPUT "Number of generations for each sequence =\t$gener\n"; 
print OUTPUT "-----------------------------------------\n\n";
print OUTPUT "Input FASTA data and resulting sequences as follows:\n";
print OUTPUT "$input_descr\n";
print OUTPUT "$input_seq\n\n";
for (my $i = 0; $i < $times; $i++)
{
	if ($debug ne "Y")
	{
		print OUTPUT "\>$i|simulation_result|change=$change|replace=$replace|generations=$gener\n";
	}
	printf OUTPUT "%s\n", $seq_array[$i];
}
close OUTPUT;
printf "Script successfully terminated; output written to file '$output'\n";  
#
######################################################################################

#
# Method produces random peptide from given sequence under given parameters;
# 
# Parameters: $input_seq	- initial sequence
#	      $p_change		- probability of mutation in each position
#	      $p_replace	- in case of mutation, probability of residue replacement
#	      $peptide_length   - length of the peptide to cut
#	      $generations	- number of generations to mutate a sequence
#	      $do_not_cut	- if 'Y', cancels cutting of the peptide
#	      $debug		
#
# Returns: $new_seq		- result of one round of 'evolution'
#
#*************
sub evolve_seq
#*************
{
  	my ($initial_seq, $p_change, $p_replace, $generations, $debug) = @_;
	my @AA_array = qw(
	C
	S
	T
	P
	A
	G
  	N
	D
	E
	Q
	H
	R
	K
	M
  	I
	L
	V
	F
	W
	Y
	);
	
  	my $new_seq = "";
	my @initial_seq_array = split (//, $initial_seq);
	
	for (my $i = 0; $i < scalar(@initial_seq_array); $i++)
	{
		my $change_random = rand();
		if ($change_random < $p_change)
		{
  			my $AA_random = int(rand(20));
			my $new_AA = $AA_array[$AA_random];
			my $replace_random = rand();
			if ($replace_random < $p_replace)
			{						
				$new_seq .= $new_AA;	
			}
  			else
			{
				my $insert_random = rand();
				if ($insert_random < 0.5)
				{
			        	$new_seq .= $new_AA;
					$new_seq .= $initial_seq_array[$i];
				}	
  			}
		}
		else
		{
			$new_seq .= $initial_seq_array[$i];
		}	
	}
	
	return $new_seq;
}