#! /usr/bin/perl
# $Id: gmap_compress.pl.in,v 1.22 2010-07-21 21:51:57 twu Exp $

# Need to call gmap with -AF (A is to show alignment, and F is to flip
# genomic sequence to be first)

use warnings;

use IO::File;
compress();
exit;


sub compress {
  while (defined($line = <>)) {
    $line =~ s/\r\n/\n/;
    chop $line;
    if ($line !~ /./) {
      # Skip completely empty lines
    } elsif ($line =~ /^>(\S+)/) {
      if (defined(@gmap)) {
	parse_gmap(\@gmap,$acc,$md5,$npaths,$chimerap,$chimerapos);
      }
      $acc = $1;
      if ($line =~ /[Mm][Dd]5:(\S+)$/) {
	$md5 = $1;
      }
      if (defined($line = <>)) {
	$line =~ s/\r\n/\n/;
	($npaths) = $line =~ /\((\d+)\)/; # Paths (2):
	if ($line =~ /[Cc]himera/) {
	  $chimerap = 1;
	  if ($line =~ /[Cc]himera.*?(\d+)/) { # breakpoint at 232;
	    $chimerapos = $1;
	  } else {
	    $chimerapos = -1;
	  }
	} else {
	  $chimerap = 0;
	  $chimerapos = -1;
	}
      }
      @gmap = ();
    } else {
      push @gmap,$line;
    }
  }

  if (defined(@gmap)) {
    parse_gmap(\@gmap,$acc,$md5,$npaths,$chimerap,$chimerapos);
  }

  return;
}


sub parse_gmap {
  my ($gmap, $acc, $md5, $npaths, $chimerap, $chimerapos) = @_;
  my ($gendirection, $estdirection, $intrdirection, 
      $agreement, $sequence1, $sequence2, $agreements);
  my ($start1, $end1, $start2, $end2, $pct, $i, $line);
  my @gmaphits = ();
  my @introns = ();
  my @strain = ();
  
  $i = 0;
  while ($i <= $#{$gmap} && $ {$gmap}[$i] !~ /^Alignments/) {
    $line = $ {$gmap}[$i];
    if ($line =~ /^\s+Path (\d+):/) {
      $pathno = $1;
      ($queryrange[$pathno]) = $line =~ /query (\S+)/;
      ($type,$chrrange[$pathno],$distance) =  $line =~ /(chr|genomic)\s+(\S+)\s+\((-?\d+) .+\)/;
      undef $type;
      $chrrange[$pathno] =~ s/,//g;
      if ($distance < 0) {
	$chrstrand[$pathno] = "-";
      } else {
	$chrstrand[$pathno] = "+";
      }
      $i++;
      while ($ {$gmap}[$i] !~ /Alignments/ && $ {$gmap}[$i] !~ /Path/) {
	$line = $ {$gmap}[$i];
	if ($line =~ /Genomic/) {
	  ($genomevers[$pathno],$genomerange[$pathno]) = $line =~ /(\S+):([0-9,]+\D+[0-9,]+)/;
	  $genomerange[$pathno] =~ s/,//g;
	} elsif ($line =~ /exons/) {
	  ($nexons[$pathno]) = $line =~ /(\d+)/;
	} elsif ($line =~ /Strain: (\S+)/) {
	  # Optional
	  $strain[$pathno] = $1;
	} elsif ($line =~ /Coverage/) {
	  ($coverage[$pathno]) = $line =~ /(\d+\.\d+)/;
	  if ($line =~ /query length: (\d+) bp/) {
	    $cdnalength = $1;
	  } elsif ($line =~ /query length: (\d+) bp/) {
	    $prolength = $1;	$cdnalength = $prolength*3;
	  } else {
	    die "Could not find query length in $line.  Perhaps your GMAP version is too old.";
	  }

	} elsif ($line =~ /Percent/) {
	  ($pctidentity[$pathno]) = $line =~ /(\d+\.\d+)/;
	}
	$i++;
      }
    } else {
      $i++;
    }
  }
  
  while ($i <= $#{$gmap}) {
    $line = $ {$gmap}[$i];
    if ($line =~ /Alignment for path (\d+)/) {
      if (defined($agreement)) {
	print ">$acc $genomevers[$pathno] $pathno/$npaths $cdnalength ";
	print "$nexons[$pathno] $coverage[$pathno] $pctidentity[$pathno] ";
	print "$queryrange[$pathno] $genomerange[$pathno] $chrrange[$pathno] $chrstrand[$pathno]";
	if ($chimerap == 1) {
	  if ($chimerapos >= 0) {
	    print " chimera:$chimerapos";
	  } else {
	    print " chimera:T";
	  }
	}
	if (defined($strain[$pathno])) {
	  print " strain:$strain[$pathno]";
	}
	if (defined($md5)) {
	  print " md5:$md5";
	}
	print "\n";
	parse_agreement($acc,$agreement,$sequence1,$sequence2,\@exons,\@introns);
      }
      $pathno = $1;
      $agreement = $sequence1 = $sequence2 = "";
      @exons = ();
      @introns = ();
      $i++;
    } elsif ($line =~ /(\S+:)?(\d+)-(\d+)\s+\((\d+)-(\d+)\)\s+(\d+)%(.*)/) {
      $chromosome = $1;
      if (defined($chromosome)) {
	$chromosome =~ s/://;
      }
      push @exons,"$2 $3 $4 $5 $6";
      push @introns,"$7";
      $i++;
    } elsif ($line =~ /^\s*(\d+)[ .:]+$/) {
      if ($1 == 0 && $sequence1 ne "") {
	$agreement .= "===...===";
	$sequence1 .= "         ";
	$sequence2 .= "         ";
      }
      while ($ {$gmap}[$i+1] =~ /^aa/ || $ {$gmap}[$i+1] !~ /\d/) {
	$i = $i+1;
      }

      $line1 = $ {$gmap}[$i+1];
      $match = $ {$gmap}[$i+2];
      $line2 = $ {$gmap}[$i+3];
      
      ($intro,$seg1) = $line1 =~ /^(\s*\S+ )(.+)/;
      if (!defined($intro) || !defined($seg1)) {
	print STDERR "Error parsing $acc: $line1\n";
	exit(9);
      }
      $segb = substr($match,length($intro),length($seg1));
      ($seg2) = $line2 =~ /^\s*\S+ (.+)/;

      $agreement .= $segb;
      $sequence1 .= $seg1;
      $sequence2 .= $seg2;
      
      $i += 4;
    } else {
      $i++;
    }
  }
  
  if (defined($agreement)) {
    print ">$acc $genomevers[$pathno] $pathno/$npaths $cdnalength ";
    print "$nexons[$pathno] $coverage[$pathno] $pctidentity[$pathno] ";
    print "$queryrange[$pathno] $genomerange[$pathno] $chrrange[$pathno] $chrstrand[$pathno]";
    if ($chimerap == 1) {
      if ($chimerapos >= 0) {
	print " chimera:$chimerapos";
      } else {
	print " chimera:T";
      }
    }
    if (defined($strain[$pathno])) {
      print " strain:$strain[$pathno]";
    }
    if (defined($md5)) {
      print " md5:$md5";
    }
    print "\n";
    parse_agreement($acc,$agreement,$sequence1,$sequence2,\@exons,\@introns);
  }

  return;
}


sub print_token {
  my ($token, $lasttoken, $tokencount, $firsttokenp) = @_;

  if ($firsttokenp == 1) {
    print "$token";
    $lasttoken = $token;
    $tokencount = 1;
  } elsif ($token eq $lasttoken) {
    $tokencount++;
  } else {
    if ($tokencount > 1) {
      print "!$tokencount";
    } 
    print " $token";
    $lasttoken = $token;
    $tokencount = 1;
  }
  return ($lasttoken, $tokencount, 0);
}


# sequence1 is the concatenation of exons from the genome, as
# determined by GMAP.  sequence2 is the concatenation of exons from
# the cDNA sequence.
sub parse_agreement {
  my ($acc, $agreement, $sequence1, $sequence2, $exons, $introns) = @_;
  my ($i);
  my @agreements = ();
  my @segments1 = ();
  my @segments2 = ();
  my $firsttokenp = 1;

  @pieces = split /[>\(\)\[\]<=\#]{3}\.\.\.[>\(\)\[\]<=\#]{3}/,$agreement;
  $pos = 0;
  $segno = 0;
  foreach $agreement_piece (@pieces) {
    $length = length($agreement_piece);
    $segment1_piece = substr($sequence1,$pos,$length);
    $segment2_piece = substr($sequence2,$pos,$length);

    @sequence1 = split '',$segment1_piece;
    @agreement = split '',$agreement_piece;
    @sequence2 = split '',$segment2_piece;
    
    if (!defined($ {$exons}[$segno])) {
      print STDERR "Cannot find exon number $segno for $acc\n";
      exit(9);
    }
    print "\t" . $ {$exons}[$segno] . "\t";

    $runlength = 0;
    $lasttoken = "";
    $tokencount = 1;
    $firsttokenp = 1;

    for ($i = 0; $i <= $#agreement; $i++) {
      if ($agreement[$i] eq '-') {
        # Gaps in upper or lower sequence
        if ($sequence1[$i] eq ' ') {
	  $token = "$runlength" . "^$sequence2[$i]";
	} elsif ($sequence2[$i] eq ' ') {
	  $token = "$runlength" . "v";
	} else {
	  print STDERR "Error in parse_agreement for $acc\n";
	  print STDERR "i = $i\n";
	  print STDERR "sequence1 = $sequence1[$i]\n";
	  print STDERR "agreement = $agreement[$i]\n";
	  print STDERR "sequence2 = $sequence2[$i]\n";
	  print STDERR "$segment1_piece\n";
	  print STDERR "$agreement_piece\n";
	  print STDERR "$segment2_piece\n";
	  exit(9);
	}

	($lasttoken,$tokencount,$firsttokenp) = print_token($token,$lasttoken,$tokencount,$firsttokenp);
	$runlength = 0;
	
      } elsif ($agreement[$i] eq ':') {
	$token = "$runlength" . ":$sequence2[$i]";
	($lasttoken,$tokencount,$firsttokenp) = print_token($token,$lasttoken,$tokencount,$firsttokenp);
	$runlength = 0;

      } elsif ($agreement[$i] eq ' ') {
	# Mismatch, possibly due to unknown in either sequence
	# As of 2005-10-06, all but the last of these cases are outdated,
	# now that we have the ':' character
	if ($sequence1[$i] eq 'N' && $sequence2[$i] eq 'N') {
	  $token = "$runlength" . "?";
	} elsif ($sequence1[$i] eq 'N') {
	  $token = "$runlength" . "N$sequence2[$i]";
	} elsif ($sequence2[$i] eq 'N') {
	  $token = "$runlength" . "n";
	} else {
	  $token = "$runlength" . "x$sequence2[$i]";
	}

	($lasttoken,$tokencount,$firsttokenp) = print_token($token,$lasttoken,$tokencount,$firsttokenp);
	$runlength = 0;
      } else {
	$runlength++;
      }
    }

    $print_dinucleotide_p = 1;
    if ($ {$introns}[$segno] =~ / ->/) {
      $token = "$runlength" . ">";
    } elsif ($ {$introns}[$segno] =~ / <-/) {
      $token = "$runlength" . "<";
      $print_dinucleotide_p = -1;
    } elsif ($ {$introns}[$segno] =~ / ==/) {
      $token = "$runlength" . "=";
    } elsif ($ {$introns}[$segno] =~ / -\)/) {
      $token = "$runlength" . ")";
    } elsif ($ {$introns}[$segno] =~ / \(-/) {
      $token = "$runlength" . "(";
      $print_dinucleotide_p = -1;
    } elsif ($ {$introns}[$segno] =~ / -\]/) {
      $token = "$runlength" . "]";
    } elsif ($ {$introns}[$segno] =~ / \[-/) {
      $token = "$runlength" . "[";
      $print_dinucleotide_p = -1;
    } elsif ($ {$introns}[$segno] =~ / --/) {
      $token = "$runlength" . "-";
    } elsif ($ {$introns}[$segno] =~ / \#\#/) {
      $token = "$runlength" . "#";
      $print_dinucleotide_p = 0;
    } else {
      $token = "$runlength" . "*";
      $print_dinucleotide_p = 0;
    }

    ($lasttoken,$tokencount,$firsttokenp) = print_token($token,$lasttoken,$tokencount,$firsttokenp);
    $exonseg = $segment2_piece;
    $exonseg =~ s/\s//g;
    $exonlength = length($exonseg);
    print "\t" . $exonlength;

    if ($segno < $#{$introns}) {
      if ($ {$introns}[$segno] =~ / \#\#/) {
	($intronlength) = substr($sequence1,$pos+$length,9) =~ /(\d+)/;
      } else {
	($intronlength) = substr($sequence2,$pos+$length,9) =~ /(\d+)/;
      }
      if (defined($intronlength)) {
	print "\t" . $intronlength;
      } else {
	print "\t";
      }
      $dinucleotide1 = substr($sequence1,$pos+$length,2);
      $dinucleotide2 = substr($sequence1,$pos+$length+7,2);
      $intronends = $dinucleotide1 . "-" . $dinucleotide2;
      if ($print_dinucleotide_p == -1) {
	$intronends = revcomp($intronends);
      }
      if ($print_dinucleotide_p != 0) {
	if ($intronends eq "GT-AG") {
	  # Do nothing
	} else {
	  print "\t" . $intronends;
	}
      }
    }
    print "\n";

    $pos += $length + 9;	# 9 is length of >>>...>>>
    $segno++;
  }

  return;
}


sub revcomp {
  my ($seq) = @_;
  
  $seq =~ tr/ACGT/TGCA/;
  return join('',reverse(split '',$seq));
}
