#!/usr/bin/perl -w


use strict;
use Getopt::Std; 


my $DEBUG;						# Debugging, enabled via command line option
my $MAX_NUM_BIF=100;       # Max. Number of Bifurcations





###   print_matrix   ###########################################################
#
# Prints out a given matrix
#
# In:
# 		1.: sequence string
#		2.: matrix array
#
# Out:
#		/
#
#
sub print_matrix($@) {
	my ($seq, @mat) = @_;
	
	my ($seqlen) = length($seq);
	my ($nt, $i, $j, $value);
	
	
	### print sequence and indices in first row
	#
	printf "%3s%3s", " ";
	for ($i=0; $i<$seqlen; $i++) {
		printf "%3s", substr($seq, $i, 1);
	}
	printf "\n%3s%3s", " ";
	for ($i=1; $i<=$seqlen; $i++) {
		printf "%3s", $i;
	}
	print "\n";
	
	
	### print matrix
	#
	for ($i=1; $i<=$seqlen; $i++) {
	
		# print nt and index
		#
		if ($i eq 0) {
			$nt = " ";
		} else {
			$nt = substr($seq, $i-1, 1);
		}
		printf "%3s%3d", $nt, $i;
	
		# print matrix values
		#
		for ($j=1; $j<=$seqlen; $j++) {
			if ($i eq $j) {
				$value= "*";
			} else {
				$value = $mat[$i][$j];
			}
			printf "%3s", $value;
		}
		print "\n";
	}
}





###   print_basepairs   ########################################################
#
# Pretty printing of calculated basepairs
#
# In:
#		1.: Sequence String
#		2.: Basepair String
# Out:
#		/
#
#
sub print_basepairs($@) {
	my ($seq, @bp) = @_;
	
	my $seqlen = length($seq);
	
	# find a pretty format string
	#
	my ($format) = "\%3s";
	my ($i, $j) = $seqlen, 1;
	while ( $i>1 ) {
		$j++;
		$i /= 10;
	}
	$j += 2;
	substr($format, 1, 1)=$j;
	
	# print indices
	#
	print "No:  ";
	for (my $i = 1; $i<=$seqlen; $i++) {
		printf $format, $i;
	}
	print "\n";
	
	# print sequence
	#
	print "Nt:  ";
	for ($i = 1; $i<=$seqlen; $i++) {
		printf $format,  substr($seq, $i-1, 1);
	}
	print "\n";
	
	# print basepairs
	#
	print "Bp:  ";
	for ($i = 1; $i<=$seqlen; $i++) {
		printf $format, $bp[$i];
	}
	print "\n";
	
	# print dotbracket
	#
	print "DB:  ";
	for ($i = 1; $i<=$seqlen; $i++) {
		if ($bp[$i] eq 0) {
			@_=".";
		} elsif ($bp[$i]>$i) {
			@_="(";
		} else {
			@_=")";
		}
		printf $format, @_;
	}
	print "\n";
}





###   max   ####################################################################
#
#  Programming Perl Chapter 2.7
#
sub max (@) {
    my ($max) = shift(@_);
	 my $foo;
    foreach $foo (@_) {
        $max = $foo if $max < $foo;
    }
    return $max;
}





###   optimize   ###############################################################
#
# maximizes number of basepairs in matrix mms
# based upon Nussinov et al 1978 and Waterman & Smith 1978
#
# In:
#		1.: sequence length
#		2.: area matrix reference (dotplot)
#		3.: mms matrix reference
# Out:
#		/ 
#
#
sub optimize ($$$) {
	my ($seqlen, $area_ref, $mms_ref) = @_;
	
	my $dummy;

	for (my $j=1; $j<=$seqlen; $j++) {
		for (my $i=1; $i<$j-3; $i++) {

			$dummy=0;
			for (my $k=$i; $k<$j-3; $k++) {
				if ($area_ref->[$k][$j] > 0) {

					$dummy = max($dummy,
					             $mms_ref->[$i][$k-1]   +
									 $mms_ref->[$k+1][$j-1] +
									 $area_ref->[$k][$j]);
				}
			}
			$mms_ref->[$i][$j] = max($mms_ref->[$i][$j-1], $dummy);

			if ($DEBUG) {
				print "optimize:   setting mms->[$i][$j] to $mms_ref->[$i][$j-1]\n"
			}
			
		}
	}
}





###    basepair   #############################################################
#
# Decides whether two nt's can form a basepair, or not
#
# In:
#		1.: nt 1
#       2.: nt 2
# Out:
#		1, if a basepair can be formed
#       0, otherwise
#
#
sub basepair($$) {
   my ($nt1, $nt2) = @_;
	my @valid_bp = ("gc", "au", "gu");
	my $my_bp    =  "$nt1$nt2";
	
	# "n" matches all
	if ($nt1 eq "n" || $nt2 eq "n") {
		return 1;
	}
	
	foreach my $bp (@valid_bp) {
		if ($my_bp eq $bp || $my_bp eq reverse($bp)) {
			if ($DEBUG) {print "basepair:   $nt1:$nt2 is a basepair\n"};
			return 1;
		}
	}
	return 0;
}





###   create_dotplot   ############################################################
#
# Creates a dotplot
#
# In:
#		1.: sequence
#		2.: area matrix reference
# Out:
#		/
#
sub create_dotplot($$) {
	my ($seq, $mat_ref) = @_;

	my ($seqlen) = length($seq);
	my ($nt_i, $nt_j);

	$mat_ref->[0][0]="0";							# init
	for (my $i=1; $i<=$seqlen; $i++) {			# 1. dimension
		$nt_i = substr($seq, $i-1, 1);
		$mat_ref->[$i][$i]=0;
		
		# minimum loop length is 3 !
		for (my $j=$i+3; $j<=$seqlen; $j++) {	# 2. dimension 
			$nt_j = substr($seq, $j-1, 1);
			
			
			$mat_ref->[$i][$j] = basepair($nt_i, $nt_j);
			
			if ($DEBUG) {
				print "create_dotplot:   matrix[$i=$nt_i][$j=$nt_j]", 
				                        $mat_ref->[$i][$j], "\n";
			}
		}
	}
}





###   matrix_init   ############################################################
#
# Initializes a two-dimensional (quadratic) array
#
# In:
#		1.: Dimension
#		2.: Initializer String
#		
# Out:
#		initialized array
#
#
sub matrix_init($$) {
	my ($dim, $init) = @_;
	my @dummy;
	for (my $i=0; $i<=$dim; $i++) {
		for (my $j=0; $j<=$dim; $j++) {
			$dummy[$i][$j]=$init;
		}
	}
	return @dummy;
}





###   backtrack_eddy   #########################################################
#
# Taken from "Biological Sequence Analysis" Durbin, Eddy, Krogh, Mitchison
#
# In:
#		1.: sequence length
#     2.: area matrix reference (dotplot)
#     3.: mms matrix reference
#
# Out:
#		basepairs as indexed string array
#
# Note:
#		push: first i then j
#  	pop : first j then i
#
#
sub backtrack_eddy($$$) {
	my ($seqlen, $area_ref, $mms_ref) = @_;
	my (@stack);
	my ($i, $j, $k);
	
	my @basepairs;
	for ($i=0; $i<$seqlen; $i++) {
		$basepairs[$i+1] = "0";
	}
	
	
	push(@stack, 1);
	push(@stack, $seqlen);

	while ($#stack != -1) {
		
		$j=pop(@stack);
		$i=pop(@stack);
		
		if ($i>=$j) {
			next;
	
		# order of checks arbitrary
		# downleft (incl. check for bp) + left + down  = steger's version
		
		
		### down
		#
		} elsif ($mms_ref->[$i+1][$j] eq $mms_ref->[$i][$j]) { 
			push(@stack, $i+1);
			push(@stack, $j);
			
		
		### left
		#
		} elsif ($mms_ref->[$i][$j-1] eq $mms_ref->[$i][$j]) {
			push(@stack, $i);
			push(@stack, $j-1);
		
			
		### downleft
		#
		} elsif ( ($mms_ref->[$i+1][$j-1] + $area_ref->[$i][$j]) eq $mms_ref->[$i][$j]) {
			if ($DEBUG) {
				print "eddy:   basepair: $i:$j\n";
			}
			$basepairs[$i]=$j;
			$basepairs[$j]=$i;
			
			push(@stack, $i+1);
			push(@stack, $j-1);

		
		### bifurcation
		#
		} else {
			for ($k=$i+1; $k<=$j-1; $k++) {
				if ( ($mms_ref->[$i][$k] + $mms_ref->[$k+1][$j]) eq $mms_ref->[$i][$j]) {
					push(@stack, $k+1);
					push(@stack, $j);
					
					push(@stack, $i);
					push(@stack, $k);
					$k=$j; # =break
				}
			
			}
		}
	}
	return @basepairs;
}





###   backtrack_steger   #######################################################
#
# In:
#     1.: area matrix reference (dotplot)
#     2.: mms matrix reference
#     3.: start = 1
#     4.: end   = sequence length
#
# Out:
#		basepairs as indexed string array
#
# based on steger's lecture
#
#
sub backtrack_steger($$$$) {
	my ($area_ref, $mms_ref, $i1, $j1) = @_;
	
	my ($i, $j, $k);
	my (@ipos, @jpos);
	my (@basepairs);
	
	for ($i=0; $i<$j1; $i++) {
		$basepairs[$i+1] = "0";
	}
	my $pos = 1;	
	$ipos[$pos-1] = $i1;
	$jpos[$pos-1] = $j1;
	
	while ($pos > 0) {
		$i = $ipos[$pos-1];
		$j = $jpos[$pos-1];
		$pos -= 1;
		
		while ($i < $j-3) {

			if ($mms_ref->[$i][$j] == $mms_ref->[$i][$j-1]) {
				$j -= 1;
			} else {
				
				for ($k=$i; $k<$j-3; $k++) {
					if ($mms_ref->[$i][$j] == ( $mms_ref->[$i][$k-1]   +
					                            $mms_ref->[$k+1][$j-1] +
														 $area_ref->[$k][$j])     ) {
						if ($pos eq $MAX_NUM_BIF) {
							print "ERROR: (backtrack)\nToo many bifurcations!\n";
							return;
						}

						$pos++;
						$ipos[$pos-1] = $k+1;
						$jpos[$pos-1] = $j-1;
						
						if ($area_ref->[$k][$j] > 0) {
							$basepairs[$k] = $j;
							$basepairs[$j] = $k;
						}
						$j = $k-1;
					}
				}		
				
			}
		
		}
	}
	return @basepairs;
}





###   main   ###################################################################
#
#
# core
#
sub main($$) {
	my ($seq, $backtrack) = @_;
	my (@area, @mms, @basepairs);
	my $seqlen = length($seq);
	

	# initialize matrices
	#
	@area = matrix_init($seqlen, "-");
	@mms  = matrix_init($seqlen, 0);
	print "\nAREA AFTER INIT :\n";
	print_matrix($seq, @area);
	
	# fill area (dot plot)
	#
	create_dotplot($seq, \@area);
	print "\nAREA AFTER DOTPLOT :\n";
	print_matrix($seq, @area);
	
	# compute dyn.prog. matrix mms
	#
	optimize($seqlen, \@area, \@mms);
	print "\nMMS AFTER OPTIMZE :\n";
	print_matrix($seq, @mms);

	# find max base pairs via backtrack
	#
	if ($backtrack eq "steger") {
		@basepairs = backtrack_steger(\@area, \@mms, 1, $seqlen);
	} elsif ($backtrack eq "eddy") {
		@basepairs = backtrack_eddy($seqlen, \@area, \@mms);
	} else {
		print "ERROR: BACKTRACK METHOD UNKNOWN\n";
		exit
	}
	print "\nBACKTRACK RESULTS :\n";
	print_basepairs($seq, @basepairs);
}





################################################################################



my %OPTS;
getopts('dhs:b:', \%OPTS); 


### help
#
if ($OPTS{h}) { 
	print "Available Options:\n";
	print "\t[-d]              : print some debugging info\n";
	print "\t -s <sequence>    : the sequence\n";
	print "\t                    valid chars are \"acgun\"\n";
	print "\t -b <steger|eddy> : 0: Steger's Backtrack (default)\n";
	print "\t                    1: Eddy's Backtrack\n";
	print "\t -h               : print this help\n";
	exit 0;
}


### debugging
#
if ($OPTS{d}) { 
	$DEBUG=1;	
}


### backtrack
#
if (!$OPTS{b}) {
	$OPTS{b}="steger";
}



# sequence
#
if (!$OPTS{s}) { 
	print "No sequence given !\n";
	exit -1;
}

my $SEQ = lc($OPTS{s});
$SEQ =~ s/[^acgun]//;
if (length($OPTS{s}) ne length($SEQ)) {
	print "Your Sequence \"$OPTS{s}\" was converted to \"$SEQ\"\n";
}



main($SEQ, $OPTS{b});

