#######################################################################
#
#Functionality:
#extract protein symbols and unigrams from  annotated abstracts (annotation from UofTexas).
#
#Annotation:
#<p1 pair=1 > ... </p1>
#<prot> ... </prot>
#
#input: annotated abstracts
#
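#Usage:
#perl get_protein_unigram_list.pl <annotated sentence file> <protein.list> <unigram.list>
#perl get_protein_unigram_list.pl -l <file listing one annotated sentence file per line> <protein.list> <unigram.list>
#
#output:
#protein.list: every distinct protein symbol (text between <prot> and </prot>), one per line
#unigram.list: every lower-cased token containing a letter that occurs more than 3 times, one per line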
#######################################################################

use strict;

# ----------------------------------------------------------------------
# File-level state.
#
# inpair() and inprot() (defined at the bottom of the file) take no
# arguments; they read and modify @insen, $ori_index, %pair and
# %right_index directly, so those must stay file-level lexicals.
# Several of the names below (@mark, @tag, $num, $j, $k, $find_one,
# $find_two) are not referenced by the code visible in this file; they are
# kept so that nothing depending on them breaks.
# ----------------------------------------------------------------------
my ($ffilelist, @filelist, $fl, @content, @num, @mark);
my (@insen);                 # tokens of the current sentence, tags included
my ($sen, @tag, $num, $token, $key);
my (%pair, @protein, %right_index, %map_index);

my (%proteinlist, %unigramlist);

my ($ori_index, $prot_index, $start_mark, $value, $i, $j, $find_one, $find_two, $k);

my ($sen_notag, $m, $string);

my (@inline);                # tokens of the current sentence, tags removed

my ($low, $unigram);

# ----------------------------------------------------------------------
# Command line.
#
#   perl SCRIPT <annotated-file> <protein.list> <unigram.list>
#   perl SCRIPT -l <file-list>   <protein.list> <unigram.list>
#
# With -l the first file names the input files, one per line.  The output
# handles PROT and UNI are opened here and written to further down.
# ----------------------------------------------------------------------
{
    my $usage = "Usage: perl $0 <annotated-file> <protein.list> <unigram.list>\n"
              . "       perl $0 -l <file-list> <protein.list> <unigram.list>\n";
    my ($prot_path, $uni_path);

    if (defined $ARGV[0] && $ARGV[0] eq "-l") {
        die $usage if @ARGV < 4;
        ($ffilelist, $prot_path, $uni_path) = @ARGV[1 .. 3];

        # Three-argument open: the name is taken literally and can never be
        # interpreted as an open mode or a pipe.
        open(IN, '<', $ffilelist)
            or die "Cannot open file list '$ffilelist': $!\n";
        @filelist = <IN>;
        chomp @filelist;
        close(IN);

        # A blank line in the list is not a file name; skip it instead of
        # failing later on an attempt to open "".
        @filelist = grep { /\S/ } @filelist;
    }
    else {
        die $usage if @ARGV < 3;
        @filelist = ($ARGV[0]);
        ($prot_path, $uni_path) = @ARGV[1, 2];
    }

    open(PROT, '>', $prot_path)
        or die "Cannot open protein list '$prot_path' for writing: $!\n";
    open(UNI, '>', $uni_path)
        or die "Cannot open unigram list '$uni_path' for writing: $!\n";
}



# Accumulators that live across all input files.
%proteinlist = ();    # protein symbol => 1 once it has been written to PROT
%unigramlist = ();    # lower-cased token => number of occurrences

# Process every input file sentence by sentence (one sentence per line).
# For each sentence:
#   1. map every original token index to its index in the tag-free sentence,
#   2. let inpair()/inprot() record where <p1>/<p2> and <prot> spans sit,
#   3. count the unigrams of the tag-free sentence,
#   4. print every protein symbol not seen before to PROT.
foreach my $fl (@filelist) {

    # Three-argument open: the file name is never interpreted as a mode.
    open(IN, '<', $fl) or die "Cannot open '$fl': $!\n";
    @content = <IN>;
    chomp @content;
    close(IN);

    for $sen (@content) {

        # Per-sentence state; %pair and %right_index are filled by
        # inpair() and inprot().
        %pair        = ();  # pair mark => "left index,right index"
        %right_index = ();  # original index of <prot> => original index of </prot>
        %map_index   = ();  # original token index => index without annotation tags

        # split ' ' skips leading whitespace, so neither this list nor
        # @inline below can start with an empty field.  With split /\s+/ a
        # sentence that begins with a tag produced a leading empty field in
        # @inline only, shifting every protein lookup by one token.
        @insen = split ' ', $sen;

        # Build the index mapping.  Tag tokens do not advance $prot_index,
        # so they map to the index of the last real token seen so far
        # (-1 when there is none yet).
        $ori_index  = 0;
        $prot_index = -1;
        $start_mark = 0;    # reset so a broken tag cannot leak into the next sentence
        for $token (@insen) {
            if (($token eq "<p1") || ($token eq "<p2")) {
                # Opening of a "<p1 pair=N >" tag: skip tokens up to ">".
                $start_mark = 1;
            }
            elsif ($start_mark) {
                $start_mark = 0 if $token eq ">";
            }
            elsif (($token ne "<prot>") && ($token ne "</prot>")
                && ($token ne "</p1>") && ($token ne "</p2>")) {
                $prot_index++;
            }

            $map_index{$ori_index} = $prot_index;
            $ori_index++;
        }

        # Locate pair and protein spans.  inpair()/inprot() advance
        # $ori_index to the closing tag of the span they consumed.
        $ori_index = 0;
        while ($ori_index < scalar(@insen)) {
            $token = $insen[$ori_index];

            if (($token eq "<p1") || ($token eq "<p2")) {
                inpair();
            }
            elsif ($token eq "<prot>") {
                inprot();
            }

            $ori_index++;
        }

        # Translate the pair spans from original indices to tag-free ones.
        foreach $key (keys %pair) {
            my ($left, $right) = split /,/, $pair{$key};
            if (   !defined($left)
                || !defined($right)
                || !defined($map_index{$left})
                || !defined($map_index{$right})) {
                print "error: can't find index for protein\n";
            }
            else {
                $pair{$key} = $map_index{$left} . "," . $map_index{$right};
            }
        }

        # Tag-free version of the sentence.
        $sen_notag = $sen;
        $sen_notag =~ s/\<p\d+\s+pair=\d+\s+\>/ /g;
        $sen_notag =~ s/\<\/*prot\>/ /g;
        $sen_notag =~ s/\<\/p\d+\>/ /g;

        @inline = split ' ', $sen_notag;

        # Count unigrams: lower-cased tokens containing at least one letter.
        foreach my $word (@inline) {
            $low = lc $word;
            if ($low =~ /[a-zA-Z]/) {
                $unigramlist{$low}++;
            }
        }

        # Emit each protein symbol the first time it is seen.
        foreach $key (sort numerically keys %right_index) {
            my $before = $map_index{$key};
            my $last   = $map_index{ $right_index{$key} };

            if (!defined($before) || !defined($last)) {
                print "error: can't find index for protein right_index\n";
                next;
            }

            # The symbol spans the tag-free tokens after the one preceding
            # <prot>, up to and including the last one before </prot>.
            $string = join " ", grep { defined } @inline[ $before + 1 .. $last ];

            if (!defined($proteinlist{$string})) {
                print PROT "$string\n";
                $proteinlist{$string}++;
            }
        }

    }#for $sen (@content)
}

	# Write every unigram that occurred more than 3 times, one per line.
	# The keys are sorted so that the list comes out in the same order on
	# every run; the order returned by keys() is not stable between runs.
	foreach $unigram (sort keys %unigramlist) {
	    if ($unigramlist{$unigram} > 3) {
		print UNI "$unigram\n";
	    }
	}

	# Buffered write errors only surface at close, so check both handles.
	close(UNI)  or die "Cannot close unigram list: $!\n";
	close(PROT) or die "Cannot close protein list: $!\n";


# inpair()
#
# Consumes one "<p1 ... >" / "<p2 ... >" element of the current sentence.
# Takes no arguments; works on the file-level variables:
#
#   @insen      tokens of the current sentence (read only)
#   $ori_index  on entry: index of the opening "<p1" / "<p2" token
#               on return: index of the "</p1>" / "</p2>" token that ended
#               the scan, or an index past the last token when the
#               sentence has no closing tag
#   %pair       receives one entry: the key is the opening tag's tokens,
#               each followed by a space (e.g. "<p1 pair=1 > "); the value
#               is "left,right", where left is the index of the token after
#               the one ending in ">" and right is $ori_index on return
#
# Nested <prot> elements are handed to inprot(), nested <p1>/<p2> elements
# to a recursive call.
sub inpair {

    my $in_open_tag = 1;    # still reading the tokens of the opening tag
    my $pair_mark   = $insen[$ori_index] . " ";
    $ori_index++;

    # The bound on $ori_index stops the scan at the end of the sentence
    # when the closing tag is missing; without it the loop never ends.
    while (($ori_index < scalar(@insen))
        && ($insen[$ori_index] ne "</p1>")
        && ($insen[$ori_index] ne "</p2>")) {

        my $inpair_token = $insen[$ori_index];

        if ($in_open_tag) {
            $pair_mark .= $inpair_token . " ";
            if ($inpair_token =~ /\>\z/) {
                # End of the opening tag: the content starts at the next token.
                $in_open_tag = 0;
                $pair{$pair_mark} = $ori_index + 1;
            }
        }

        if ($inpair_token eq "<prot>") {
            inprot();
        }

        if (($inpair_token eq "<p1") || ($inpair_token eq "<p2")) {
            inpair();
        }

        # After a nested call $ori_index sits on that element's closing
        # tag; this step moves past it.
        $ori_index++;
    }

    # The left index is missing when the opening tag was never finished;
    # the value then starts with "," and the caller reports the error.
    my $left = defined($pair{$pair_mark}) ? $pair{$pair_mark} : "";
    $pair{$pair_mark} = $left . "," . $ori_index;

    return;
}

# inprot()
#
# Consumes one "<prot> ... </prot>" element of the current sentence.
# Takes no arguments; works on the file-level variables:
#
#   @insen        tokens of the current sentence (read only)
#   $ori_index    on entry: index of the "<prot>" token
#                 on return: index of the "</prot>" token that ended the
#                 scan, or an index past the last token when the sentence
#                 has no closing tag
#   %right_index  receives $right_index{index of <prot>} = $ori_index on return
#
# A nested <prot> is handled by a recursive call, which records its own
# entry in %right_index.
sub inprot {

    my $left_index = $ori_index;
    $ori_index++;

    # The bound on $ori_index stops the scan at the end of the sentence
    # when "</prot>" is missing; without it the loop never ends.
    while (($ori_index < scalar(@insen)) && ($insen[$ori_index] ne "</prot>")) {

        if ($insen[$ori_index] eq "<prot>") {
            inprot();
        }

        # After a nested call $ori_index sits on the inner "</prot>"; this
        # step moves past it.
        $ori_index++;
    }

    $right_index{$left_index} = $ori_index;

    return;
}


# Comparison routine for sort(): orders the two values sort() places in
# $a and $b as numbers, smallest first.
sub numerically {
    return $a <=> $b;
}



