#!/usr/bin/perl

###################################################################################################################
# Script to take Arabidopsis annotations from Phytozome annotation file and insert them as notes in gene gff files
###################################################################################################################

use warnings;
use strict;

# 1. Get file names from command line
#
#    Usage: script.pl <annotation_file> <genes.gff3>
#
#    The annotation file is read as tab-separated text with three columns:
#    mRNA name, similar Arabidopsis gene, functional description (optional).

my ($annotation_file, $gff_file) = @ARGV;
if (!defined $annotation_file || !defined $gff_file) {
    die "Usage: $0 <annotation_file> <genes.gff3>\n";
}

# 2. Create name for gene annot file to be produced
#
#    "genes.gff3" becomes "genes_annot.gff3". If the input name does not end
#    in ".gff3" the suffix is appended instead, so the output name can never
#    be identical to the input name (which would truncate the input file).

my $new_file = $gff_file;
unless ($new_file =~ s/\.gff3\z/_annot.gff3/) {
    $new_file .= '_annot.gff3';
}

# 3. Load every annotation once into a lookup table keyed by mRNA name
#
#    Only the first line seen for a given mRNA name is kept. An entry whose
#    value is undef means "listed, but no similar gene given": no note is
#    added for that mRNA.

my %note_for;

open(my $ann_fh, '<', $annotation_file)
    or die "Can't open file with annotations $annotation_file: $! \n";
while (my $entry = <$ann_fh>) {
    $entry =~ s/\r?\n\z//;    # strip the line ending (LF or CRLF)
    my ($mrna, $gene, $text) = split /\t/, $entry;
    next if !defined $mrna || $mrna eq '';
    next if exists $note_for{$mrna};
    $note_for{$mrna} = build_note($gene, $text);
}
close $ann_fh;

# 4. Read each line of gene GFF file, add a note to annotated mRNA lines,
#    and write every line (changed or not) to the new file

open(my $gff_fh, '<', $gff_file)
    or die "Can't open gff file $gff_file: $! \n";
open(my $out_fh, '>', $new_file)
    or die "Can't open output file $new_file: $! \n";

while (my $line = <$gff_fh>) {
    print {$out_fh} annotate_line($line, \%note_for);
}

close $gff_fh;
# Buffered write errors (e.g. disk full) only surface when the handle is closed.
close($out_fh) or die "Can't finish writing output file $new_file: $! \n";

print STDERR "Annotations added, new file saved to $new_file \n";

# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

# Percent-encode the characters that are escaped inside a GFF3 attribute
# value: '%' itself, ',', ';', '=', '&', and '/' (the latter is kept because
# earlier versions of this script escaped it too).
sub escape_gff_value {
    my ($value) = @_;
    $value =~ s/([%,;=&\/])/sprintf('%%%02X', ord($1))/ge;
    return $value;
}

# Build the text of the Note attribute for one annotation entry.
#   $gene - name of the similar Arabidopsis gene (may be empty/undef)
#   $text - functional description (may be empty/undef)
# Returns undef when there is no gene, otherwise the escaped note. The gene
# and the description are escaped separately so that the ", " joining them
# stays a literal comma while commas inside either part are encoded.
sub build_note {
    my ($gene, $text) = @_;
    return if !$gene;

    my $similar_gene = escape_gff_value($gene);
    if (!$text) {
        # the only data is the name of a similar Arabidopsis gene
        return "similar to $similar_gene";
    }
    # the annotation includes functional info
    return escape_gff_value($text) . ", based on similarity to $similar_gene";
}

# Return the GFF line with ";Note=<note>" inserted in front of ";longest"
# when the line is an mRNA feature whose Name attribute has a note in the
# lookup table; return the line unchanged in every other case.
#   $line     - one raw line from the GFF file (line ending included)
#   $note_for - hash reference: mRNA name => note text (or undef)
sub annotate_line {
    my ($line, $note_for) = @_;

    # Only feature lines whose type column (3rd) is exactly "mRNA" qualify;
    # comments, directives and short lines are passed through untouched.
    my @columns = split /\t/, $line;
    return $line if @columns < 9 || $columns[2] ne 'mRNA';

    # Find the Name attribute wherever it sits in the attribute column.
    my ($name) = $columns[8] =~ /(?:\A|;)Name=([^;\r\n]*)/;
    return $line if !defined $name;

    # Exact lookup: the name is never interpreted as a regular expression.
    my $note = $note_for->{$name};
    return $line if !defined $note;

    # place our note in the right spot in the original GFF string
    $line =~ s/;longest/;Note=$note;longest/;
    return $line;
}

