#!/usr/bin/perl -w


use strict;
use Getopt::Std; 


my $DEBUG;                      # Debugging, enabled via command line option
my $MAX_NUM_BIF=100;       # Max. Number of Bifurcations





###   print_matrix   ###########################################################
#
# Prints out a given matrix
#
# In:
#       1.: sequence string
#       2.: matrix array
#
# Out:
#       /
#
#
sub print_matrix($@) {
    my ($seq, @mat) = @_;
    
    my ($seqlen) = length($seq);
    my ($nt, $i, $j, $value);
    
    
    ### print sequence and indices in first row
    #
    printf "%3s%3s", " ";
    for ($i=0; $i<$seqlen; $i++) {
        printf "%3s", substr($seq, $i, 1);
    }
    printf "\n%3s%3s", " ";
    for ($i=1; $i<=$seqlen; $i++) {
        printf "%3s", $i;
    }
    print "\n";
    
    
    ### print matrix
    #
    for ($i=1; $i<=$seqlen; $i++) {
    
        # print nt and index
        #
        if ($i eq 0) {
            $nt = " ";
        } else {
            $nt = substr($seq, $i-1, 1);
        }
        printf "%3s%3d", $nt, $i;
    
        # print matrix values
        #
        for ($j=1; $j<=$seqlen; $j++) {
            if ($i eq $j) {
                $value= "*";
            } else {
                $value = $mat[$i][$j];
            }
            printf "%3s", $value;
        }
        print "\n";
    }
}





###   print_basepairs   ########################################################
#
# Pretty printing of calculated basepairs
#
# In:
#       1.: Sequence String
#       2.: Basepair String
# Out:
#       /
#
#
sub print_basepairs($@) {
    my ($seq, @bp) = @_;
    
    my $seqlen = length($seq);
    
    # find a pretty format string
    #
    my ($format) = "\%3s";
    my ($i, $j) = $seqlen, 1;
    while ( $i>1 ) {
        $j++;
        $i /= 10;
    }
    $j += 2;
    substr($format, 1, 1)=$j;
    
    # print indices
    #
    print "No:  ";
    for (my $i = 1; $i<=$seqlen; $i++) {
        printf $format, $i;
    }
    print "\n";
    
    # print sequence
    #
    print "Nt:  ";
    for ($i = 1; $i<=$seqlen; $i++) {
        printf $format,  substr($seq, $i-1, 1);
    }
    print "\n";
    
    # print basepairs
    #
    print "Bp:  ";
    for ($i = 1; $i<=$seqlen; $i++) {
        printf $format, $bp[$i];
    }
    print "\n";
    
    # print dotbracket
    #
    print "DB:  ";
    for ($i = 1; $i<=$seqlen; $i++) {
        if ($bp[$i] eq 0) {
            @_=".";
        } elsif ($bp[$i]>$i) {
            @_="(";
        } else {
            @_=")";
        }
        printf $format, @_;
    }
    print "\n";
}





###   max   ####################################################################
#
#  Programming Perl Chapter 2.7
#
sub max (@) {
    my ($max) = shift(@_);
     my $foo;
    foreach $foo (@_) {
        $max = $foo if $max < $foo;
    }
    return $max;
}





###   optimize   ###############################################################
#
# maximizes number of basepairs in matrix mms
# based upon Nussinov et al 1978 and Waterman & Smith 1978
#
# In:
#       1.: sequence length
#       2.: area matrix reference (dotplot)
#       3.: mms matrix reference
# Out:
#       / 
#
#
sub optimize ($$$) {
    my ($seqlen, $area_ref, $mms_ref) = @_;
    
    my $dummy;

    for (my $j=1; $j<=$seqlen; $j++) {
        for (my $i=1; $i<$j-3; $i++) {

            $dummy=0;
            for (my $k=$i; $k<$j-3; $k++) {
                if ($area_ref->[$k][$j] > 0) {

                    $dummy = max($dummy,
                                 $mms_ref->[$i][$k-1]   +
                                     $mms_ref->[$k+1][$j-1] +
                                     $area_ref->[$k][$j]);
                }
            }
            $mms_ref->[$i][$j] = max($mms_ref->[$i][$j-1], $dummy);

            if ($DEBUG) {
                print "optimize:   setting mms->[$i][$j] to $mms_ref->[$i][$j-1]\n"
            }
            
        }
    }
}





###    basepair   #############################################################
#
# Decides whether two nt's can form a basepair, or not
#
# In:
#       1.: nt 1
#       2.: nt 2
# Out:
#       1, if a basepair can be formed
#       0, otherwise
#
#
sub basepair($$) {
   my ($nt1, $nt2) = @_;
    my @valid_bp = ("gc", "au", "gu");
    my $my_bp    =  "$nt1$nt2";
    
    # "n" matches all
    if ($nt1 eq "n" || $nt2 eq "n") {
        return 1;
    }
    
    foreach my $bp (@valid_bp) {
        if ($my_bp eq $bp || $my_bp eq reverse($bp)) {
            if ($DEBUG) {print "basepair:   $nt1:$nt2 is a basepair\n"};
            return 1;
        }
    }
    return 0;
}





###   create_dotplot   ############################################################
#
# Creates a dotplot
#
# In:
#       1.: sequence
#       2.: area matrix reference
# Out:
#       /
#
sub create_dotplot($$) {
    my ($seq, $mat_ref) = @_;

    my ($seqlen) = length($seq);
    my ($nt_i, $nt_j);

    $mat_ref->[0][0]="0";                           # init
    for (my $i=1; $i<=$seqlen; $i++) {          # 1. dimension
        $nt_i = substr($seq, $i-1, 1);
        $mat_ref->[$i][$i]=0;
        
        # minimum loop length is 3 !
        for (my $j=$i+3; $j<=$seqlen; $j++) {   # 2. dimension 
            $nt_j = substr($seq, $j-1, 1);
            
            
            $mat_ref->[$i][$j] = basepair($nt_i, $nt_j);
            
            if ($DEBUG) {
                print "create_dotplot:   matrix[$i=$nt_i][$j=$nt_j]", 
                                        $mat_ref->[$i][$j], "\n";
            }
        }
    }
}





###   matrix_init   ############################################################
#
# Initializes a two-dimensional (quadratic) array
#
# In:
#       1.: Dimension
#       2.: Initializer String
#       
# Out:
#       initialized array
#
#
sub matrix_init($$) {
    my ($dim, $init) = @_;
    my @dummy;
    for (my $i=0; $i<=$dim; $i++) {
        for (my $j=0; $j<=$dim; $j++) {
            $dummy[$i][$j]=$init;
        }
    }
    return @dummy;
}





###   backtrack_eddy   #########################################################
#
# Taken from "Biological Sequence Analysis" Durbin, Eddy, Krogh, Mitchison
#
# In:
#       1.: sequence length
#     2.: area matrix reference (dotplot)
#     3.: mms matrix reference
#
# Out:
#       basepairs as indexed string array
#
# Note:
#       push: first i then j
#   pop : first j then i
#
#
sub backtrack_eddy($$$) {
    my ($seqlen, $area_ref, $mms_ref) = @_;
    my (@stack);
    my ($i, $j, $k);
    
    my @basepairs;
    for ($i=0; $i<$seqlen; $i++) {
        $basepairs[$i+1] = "0";
    }
    
    
    push(@stack, 1);
    push(@stack, $seqlen);

    while ($#stack != -1) {
        
        $j=pop(@stack);
        $i=pop(@stack);
        
        if ($i>=$j) {
            next;
    
        # order of checks arbitrary
        # downleft (incl. check for bp) + left + down  = steger's version
        
        
        ### down
        #
        } elsif ($mms_ref->[$i+1][$j] eq $mms_ref->[$i][$j]) { 
            push(@stack, $i+1);
            push(@stack, $j);
            
        
        ### left
        #
        } elsif ($mms_ref->[$i][$j-1] eq $mms_ref->[$i][$j]) {
            push(@stack, $i);
            push(@stack, $j-1);
        
            
        ### downleft
        #
        } elsif ( ($mms_ref->[$i+1][$j-1] + $area_ref->[$i][$j]) eq $mms_ref->[$i][$j]) {
            if ($DEBUG) {
                print "eddy:   basepair: $i:$j\n";
            }
            $basepairs[$i]=$j;
            $basepairs[$j]=$i;
            
            push(@stack, $i+1);
            push(@stack, $j-1);

        
        ### bifurcation
        #
        } else {
            for ($k=$i+1; $k<=$j-1; $k++) {
                if ( ($mms_ref->[$i][$k] + $mms_ref->[$k+1][$j]) eq $mms_ref->[$i][$j]) {
                    push(@stack, $k+1);
                    push(@stack, $j);
                    
                    push(@stack, $i);
                    push(@stack, $k);
                    $k=$j; # =break
                }
            
            }
        }
    }
    return @basepairs;
}





###   backtrack_steger   #######################################################
#
# In:
#     1.: area matrix reference (dotplot)
#     2.: mms matrix reference
#     3.: start = 1
#     4.: end   = sequence length
#
# Out:
#       basepairs as indexed string array
#
# based on steger's lecture
#
#
sub backtrack_steger($$$$) {
    my ($area_ref, $mms_ref, $i1, $j1) = @_;
    
    my ($i, $j, $k);
    my (@ipos, @jpos);
    my (@basepairs);
    
    for ($i=0; $i<$j1; $i++) {
        $basepairs[$i+1] = "0";
    }
    my $pos = 1;    
    $ipos[$pos-1] = $i1;
    $jpos[$pos-1] = $j1;
    
    while ($pos > 0) {
        $i = $ipos[$pos-1];
        $j = $jpos[$pos-1];
        $pos -= 1;
        
        while ($i < $j-3) {

            if ($mms_ref->[$i][$j] == $mms_ref->[$i][$j-1]) {
                $j -= 1;
            } else {
                
                for ($k=$i; $k<$j-3; $k++) {
                    if ($mms_ref->[$i][$j] == ( $mms_ref->[$i][$k-1]   +
                                                $mms_ref->[$k+1][$j-1] +
                                                         $area_ref->[$k][$j])     ) {
                        if ($pos eq $MAX_NUM_BIF) {
                            print "ERROR: (backtrack)\nToo many bifurcations!\n";
                            return;
                        }

                        $pos++;
                        $ipos[$pos-1] = $k+1;
                        $jpos[$pos-1] = $j-1;
                        
                        if ($area_ref->[$k][$j] > 0) {
                            $basepairs[$k] = $j;
                            $basepairs[$j] = $k;
                        }
                        $j = $k-1;
                    }
                }       
                
            }
        
        }
    }
    return @basepairs;
}





###   main   ###################################################################
#
#
# core
#
sub main($$) {
    my ($seq, $backtrack) = @_;
    my (@area, @mms, @basepairs);
    my $seqlen = length($seq);
    

    # initialize matrices
    #
    @area = matrix_init($seqlen, "-");
    @mms  = matrix_init($seqlen, 0);
    print "\nAREA AFTER INIT :\n";
    print_matrix($seq, @area);
    
    # fill area (dot plot)
    #
    create_dotplot($seq, \@area);
    print "\nAREA AFTER DOTPLOT :\n";
    print_matrix($seq, @area);
    
    # compute dyn.prog. matrix mms
    #
    optimize($seqlen, \@area, \@mms);
    print "\nMMS AFTER OPTIMZE :\n";
    print_matrix($seq, @mms);

    # find max base pairs via backtrack
    #
    if ($backtrack eq "steger") {
        @basepairs = backtrack_steger(\@area, \@mms, 1, $seqlen);
    } elsif ($backtrack eq "eddy") {
        @basepairs = backtrack_eddy($seqlen, \@area, \@mms);
    } else {
        print "ERROR: BACKTRACK METHOD UNKNOWN\n";
        exit
    }
    print "\nBACKTRACK RESULTS :\n";
    print_basepairs($seq, @basepairs);
}





################################################################################



my %OPTS;
getopts('dhs:b:', \%OPTS); 


### help
#
if ($OPTS{h}) { 
    print "Available Options:\n";
    print "\t[-d]              : print some debugging info\n";
    print "\t -s <sequence>    : the sequence\n";
    print "\t                    valid chars are \"acgun\"\n";
    print "\t -b <steger|eddy> : 0: Steger's Backtrack (default)\n";
    print "\t                    1: Eddy's Backtrack\n";
    print "\t -h               : print this help\n";
    exit 0;
}


### debugging
#
if ($OPTS{d}) { 
    $DEBUG=1;   
}


### backtrack
#
if (!$OPTS{b}) {
    $OPTS{b}="steger";
}



# sequence
#
if (!$OPTS{s}) { 
    print "No sequence given !\n";
    exit -1;
}

my $SEQ = lc($OPTS{s});
$SEQ =~ s/[^acgun]//;
if (length($OPTS{s}) ne length($SEQ)) {
    print "Your Sequence \"$OPTS{s}\" was converted to \"$SEQ\"\n";
}



main($SEQ, $OPTS{b});