#######################################################################
#Functionality: 
#extract the dependent word and its POS 
#from output of minipar parser
#
#
#input: output of minipar
#
#Output:
# two lists: "depend_head.list";  "depend_pos.list";
#
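#Usage: 
#perl get_depend_list.pl <annotated sentences file> <Sentence.mini>
#or, when several annotated files are listed one per line in <list file>:
#perl get_depend_list.pl -l <list file> <Sentence.mini>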
#
#######################################################################

use strict;

#----------------------------------------------------------------------
# File-scope working variables.
#
# They are shared between the top-level code and the subs at the end of
# the file: inpair() and inprot() read @insen and update $ori_index,
# %pair and %right_index.  A good part of the names below is not
# referenced by any code visible in this file; they are kept because
# removing a declaration under "use strict" would break any code that
# still relies on it.
#----------------------------------------------------------------------
my ($ffilelist, @filelist, $fl, @content, @num, @mark);
my (@insen, @inline, @sen_notag, @keywordlist, @word, @interword, @interfea); 
my (@temp, @pattern);

my ($sen, @tag, $token, $key);
my (%pair, @protein, %right_index, %map_index, %lexicon, %keyword, %inline_hash);
my (%proteinlist, %unigramlist, %context, %tokeninbetween, %context_left);
my (%verblist, %nounlist);

my ($ori_index, $prot_index, $start_mark, $value, $i, $j, $find_one, $find_two, $k);
my ($sen_notag, $ps1_left, $ps1_right, $ps2_left, $ps2_right, $form, $distance );
my ($num_between, $m, $n, $temp_left, $place, $iword2protein );

my ($form_left_index, $form_right_index, $proteino, $unigramno, $min, $max);
my ($form_left, $form_right, $count, $temp_token, $temp_index, $scale);
my ($before, $between, $after, $verb, $noun, $matchword, $maxgap, $num_pattern); 

my ($nomatch);

my ($senid, $pairid, $vectorid, $has_pair);

my (@lowcase, $sen_lowcase);

my (%context_right);

my (@block, @unit);
my ($unit_ps1_left, $unit_ps1_right, $unit_ps2_left, $unit_ps2_right, $line_index, $head_count);
my ($temp_path, $temp_unit, $pre_line);

my ($deplist, $num_sen, $head_form, $index_head, $pos, $index_pos, $depheadlist, $depposlist);
my (@blockline, @head, @temp_pos);
my (%tree, %headlist, %poslist);

my ($count_nomatch);

# Names of the two output lists (written in the current directory).
$depheadlist = "depend_head.list";
$depposlist  = "depend_pos.list";

#----------------------------------------------------------------------
# Command line.  Two forms are accepted:
#   <annotated file> <minipar file>
#   -l <list file> <minipar file>     (list file: one annotated file per line)
#----------------------------------------------------------------------
my $usage = "Usage: perl $0 <annotated sentences file> <minipar output file>\n"
          . "       perl $0 -l <list of annotated files> <minipar output file>\n";

my $minifile;
if (defined($ARGV[0]) && $ARGV[0] eq "-l") {
    die $usage unless defined($ARGV[1]) && defined($ARGV[2]);

    $ffilelist = $ARGV[1];
    open(my $list_fh, '<', $ffilelist) || die "Cannot open $ffilelist: $!\n";
    @filelist = <$list_fh>;
    close($list_fh);

    $minifile = $ARGV[2];
}
else {
    die $usage unless defined($ARGV[0]) && defined($ARGV[1]);

    @filelist = ($ARGV[0]);
    $minifile = $ARGV[1];
}

# Two-argument open silently ignored whitespace around a file name (this
# also removed the line ends of the list file).  Do the same explicitly so
# that three-argument open accepts the same input, and drop empty entries.
s/^\s+|\s+\z//g for @filelist;
@filelist = grep { length } @filelist;

#----------------------------------------------------------------------
# Read the minipar output, one record per element of @block.  A record
# ends with a line that holds only ")"; the separator is changed for
# this read only and removed from every record by chomp.
#----------------------------------------------------------------------
open(my $mini_fh, '<', $minifile) || die "Cannot open $minifile: $!\n";
{
    local $/ = "\n)\n";
    @block = <$mini_fh>;
    chomp(@block);
}
close($mini_fh);

my $nblock=scalar(@block);
print "block=$nblock,\n";

#----------------------------------------------------------------------
# Check that the annotated files hold as many lines as there are
# minipar records.  Every line is counted, blank or not.
#----------------------------------------------------------------------
$num_sen = 0;
foreach my $fl (@filelist) {
    open(my $in, '<', $fl) || die "Cannot open $fl: $!\n";
    my @lines = <$in>;
    close($in);

    $num_sen += scalar(@lines);
}

if ($num_sen != $nblock) {
    print "Number of sentences are not equal.\n";
    # Report the failure to the caller; a bare "exit" would return 0.
    exit 1;
}

# The output lists are created only after the inputs have been validated,
# so a failed run does not truncate lists left by an earlier run.  HEAD
# and POS stay bareword handles because the code that writes the lists
# further down refers to them by these names.
open(HEAD, '>', $depheadlist) || die "Cannot write $depheadlist: $!\n";
open(POS,  '>', $depposlist)  || die "Cannot write $depposlist: $!\n";

#----------------------------------------------------------------------
# Main pass.
#
# Walks the annotated sentences in step with the minipar records in
# @block.  For every pair of <prot> mentions in a sentence whose first
# and last tokens agree with the parse, the word governing each mention
# (the head of the first token of the mention whose head lies outside
# the mention) and the last field of that word's category column are
# stored as keys of %headlist and %poslist.
#
# Output on STDOUT: each parsed sentence, "error: ..." lines when an
# index cannot be mapped, and a running "nomatch=N" count for pairs
# whose boundaries disagree with the parse.
#
# Earlier versions of this loop also computed values that nothing used
# (a lower-cased copy of the sentence, a lexicon of mention strings,
# keyword flags from a list that is never filled) and printed a position
# scale to a handle OUT that is never opened.  That code had no visible
# effect, except that the scale division could abort the run on a
# sentence without plain tokens; it has been removed.
#----------------------------------------------------------------------
$senid = 0;            # index of the next minipar record to consume
$count_nomatch = 0;

foreach my $fl (@filelist) {

    # Two-argument open ignored whitespace around the name; trim it so
    # that three-argument open accepts the same entries.
    (my $path = $fl) =~ s/^\s+|\s+\z//g;
    open(my $in, '<', $path) || die "Cannot open $path: $!\n";
    @content = <$in>;
    chomp @content;
    close($in);

  SENTENCE:
    for my $annotated (@content) {

        # A blank line is skipped without consuming a minipar record.
        next SENTENCE unless $annotated =~ /\S/;

        # Every non-blank sentence consumes one record, parsed or not.
        my $parse = $block[$senid];
        $senid++;
        next SENTENCE unless defined($parse) && $parse =~ /\S/;

        # Rebuild the tree from scratch so that rows of earlier, longer
        # sentences cannot satisfy the boundary check below.
        load_parse_tree($parse);

        # State filled in by inpair()/inprot():
        #   %pair        pair mark   => "left,right" original token indices
        #   %right_index index of a "<prot>" token => index of its "</prot>"
        %pair        = ();
        %right_index = ();

        # split ' ' skips leading whitespace.  A leading empty token
        # would shift every index relative to @inline, which is built
        # from the whitespace-trimmed sentence.
        @insen = split ' ', $annotated;

        # %map_index: original token index => index of the token once
        # the annotation tags are removed.  A tag token maps to the
        # index of the last plain token before it (-1 if there is none).
        %map_index  = ();
        $ori_index  = 0;
        $prot_index = -1;
        $start_mark = 0;    # never inherit "inside <pN ... >" from the previous sentence
        for my $tok (@insen) {
            if ($tok eq "<p1" || $tok eq "<p2") {
                $start_mark = 1;
            }
            elsif ($start_mark == 1) {
                # still inside the opening pair tag; ">" ends it
                $start_mark = 0 if $tok eq ">";
            }
            elsif ($tok ne "<prot>" && $tok ne "</prot>" && $tok ne "</p1>" && $tok ne "</p2>") {
                $prot_index++;
            }
            $map_index{$ori_index} = $prot_index;
            $ori_index++;
        }

        # Locate pair regions and protein mentions.  Both subs advance
        # $ori_index to the closing tag of the region they consumed.
        $ori_index = 0;
        while ($ori_index < scalar(@insen)) {
            my $tok = $insen[$ori_index];
            if ($tok eq "<p1" || $tok eq "<p2") {
                inpair();
            }
            elsif ($tok eq "<prot>") {
                inprot();
            }
            $ori_index++;
        }

        # Translate the pair boundaries to tag-free indices.
        foreach my $mark (keys %pair) {
            my ($from, $to) = split /,/, $pair{$mark};
            if (!defined($map_index{$from}) || !defined($map_index{$to})) {
                print "error: can't find index for protein\n";
                next;
            }
            $pair{$mark} = $map_index{$from} . "," . $map_index{$to};
        }

        # Protein mentions in sentence order, as [first, last] indices
        # into @inline.  The "<prot>" tag maps to the token before the
        # mention, hence the +1 on the left side.
        my @spans;
        foreach my $open_idx (sort numerically keys %right_index) {
            my $close_idx = $right_index{$open_idx};
            if (!defined($map_index{$open_idx}) || !defined($map_index{$close_idx})) {
                print "error: can't find index for protein right_index\n";
                next;
            }
            push @spans, [ $map_index{$open_idx} + 1, $map_index{$close_idx} ];
        }

        # The sentence without annotation tags, split into tokens.
        $sen_notag = $annotated;
        $sen_notag =~ s/\<p\d+\s+pair=\d+\s+\>/ /g;
        $sen_notag =~ s/\<\/*prot\>/ /g;
        $sen_notag =~ s/\<\/p\d+\>/ /g;
        @inline = split ' ', $sen_notag;

        print "$annotated\n";

        # Every unordered pair of mentions.
        for my $first (0 .. $#spans) {
            my ($l1, $r1) = @{ $spans[$first] };

            for my $second ($first + 1 .. $#spans) {
                my ($l2, $r2) = @{ $spans[$second] };

                # The pair is used only if the first and last token of
                # both mentions are the same words in the parse.
                my $aligned =
                       parse_word_at($l1) eq $inline[$l1]
                    && parse_word_at($r1) eq $inline[$r1]
                    && parse_word_at($l2) eq $inline[$l2]
                    && parse_word_at($r2) eq $inline[$r2];

                if ($aligned) {
                    record_governor($l1, $r1);
                    record_governor($l2, $r2);
                }
                else {
                    $count_nomatch++;
                    print "nomatch=$count_nomatch\n";
                }
            }
        }
    }
}

# load_parse_tree(RECORD)
# Replaces the contents of %tree with the rows of one minipar record:
# key is the first tab-separated field of a row, value is the whole row.
sub load_parse_tree {
    my ($record) = @_;

    %tree = ();
    for my $row (split /\n+/, $record) {
        my ($node_id) = split /\t+/, $row;
        $tree{$node_id} = $row;
    }
    return;
}

# parse_word_at(INDEX)
# Returns the second tab-separated field of the row stored under key
# INDEX+1 in %tree, without a leading "(".  INDEX is an index into
# @inline (0-based) while the rows are looked up 1-based.  Returns undef
# when there is no such row.
sub parse_word_at {
    my ($index) = @_;

    my @fields = split /\t+/, $tree{ $index + 1 };
    my $word = $fields[1];
    $word =~ s/^\(//;
    return $word;
}

# record_governor(LEFT, RIGHT)
# LEFT and RIGHT are the first and last @inline index of a mention.
# Scans the mention's rows in order; for the first row whose fourth
# field contains a digit and, read as a number, lies outside the
# mention, stores the lower-cased word of the row it points to in
# %headlist and the last whitespace-separated part of that row's third
# field in %poslist.  Nothing is stored if no such row exists.
sub record_governor {
    my ($left, $right) = @_;
    my ($first_id, $last_id) = ($left + 1, $right + 1);

    for my $id ($first_id .. $last_id) {
        my @node = split /\t+/, $tree{$id};
        my $governor_id = $node[3];

        next unless $governor_id =~ /\d/;
        next unless $governor_id < $first_id || $governor_id > $last_id;

        my @governor = split /\t+/, $tree{$governor_id};
        (my $word = lc $governor[1]) =~ s/^\(//;
        my @category = split /\s+/, $governor[2];

        $headlist{$word}         = 1;
        $poslist{ $category[-1] } = 1;
        return;
    }
    return;
}


 # Write the collected head words and POS tags, one entry per line.
 # The keys are written in sorted order: plain hash order differs from
 # run to run, which made the two lists change between identical runs.
 for my $head_word (sort keys %headlist){
     print HEAD "$head_word\n";
 }

 for my $pos_tag (sort keys %poslist){
     print POS "$pos_tag\n";
 }

# Buffered write errors (e.g. a full disk) only surface at close time.
close(HEAD) || die "Cannot finish writing $depheadlist: $!\n";
close(POS)  || die "Cannot finish writing $depposlist: $!\n";

# inpair()
#
# Consumes one pair region of the current sentence, i.e. the tokens from
# an opening "<p1" / "<p2" up to the next "</p1>" or "</p2>".
#
# Reads  : @insen      tokens of the annotated sentence
#          $ori_index  must point at the opening "<p1" / "<p2" token
# Writes : $pair{MARK} = "LEFT,RIGHT", where MARK is the text of the
#          opening tag (its tokens, each followed by one space), LEFT is
#          the index after the token that ends the tag with ">" and
#          RIGHT is the index of the closing tag
#          $ori_index  is left on the closing tag
#
# "<prot>" mentions and further pair regions found inside the region are
# handled by calling inprot() / inpair() recursively.
#
# If the sentence ends before a closing tag is found, a warning is
# issued, no entry is left in %pair for this region and $ori_index is
# left past the last token.  (Without the end-of-sentence check the loop
# would never terminate on such input.)
sub inpair{

    my $in_open_tag = 1;
    my $pair_mark   = $insen[$ori_index]." ";
    $ori_index++;

    while ($ori_index <= $#insen
           && $insen[$ori_index] ne "</p1>"
           && $insen[$ori_index] ne "</p2>") {

        my $tok = $insen[$ori_index];

        if ($in_open_tag) {
            # still collecting the text of the opening tag
            $pair_mark = $pair_mark.$tok." ";
            if ($tok =~ /\>\z/) {
                $in_open_tag = 0;
                $pair{$pair_mark} = $ori_index+1;
            }
        }

        inprot() if $tok eq "<prot>";
        inpair() if $tok eq "<p1" || $tok eq "<p2";

        $ori_index++;
    }

    if ($ori_index > $#insen) {
        warn "inpair: no closing </p1> or </p2> for \"$pair_mark\"; pair ignored\n";
        delete $pair{$pair_mark};
        return;
    }

    $pair{$pair_mark} = $pair{$pair_mark}.",".$ori_index;

    return;
}

# inprot()
#
# Consumes one protein mention of the current sentence, i.e. the tokens
# from "<prot>" up to the matching "</prot>".
#
# Reads  : @insen      tokens of the annotated sentence
#          $ori_index  must point at the "<prot>" token
# Writes : $right_index{LEFT} = RIGHT, where LEFT is the index of the
#          "<prot>" token and RIGHT the index of its "</prot>"
#          $ori_index  is left on the "</prot>" token
#
# A "<prot>" found inside the mention is handled by a recursive call, so
# nested mentions each get their own entry.
#
# If the sentence ends before "</prot>" is found, a warning is issued,
# nothing is recorded for this mention and $ori_index is left past the
# last token.  (Without the end-of-sentence check the loop would never
# terminate on such input.)
sub inprot{

    my $left_index = $ori_index;
    $ori_index++;

    while ($ori_index <= $#insen && $insen[$ori_index] ne "</prot>") {

        inprot() if $insen[$ori_index] eq "<prot>";

        $ori_index++;
    }

    if ($ori_index > $#insen) {
        warn "inprot: no closing </prot> for <prot> at token $left_index; mention ignored\n";
        return;
    }

    $right_index{$left_index} = $ori_index;

    return;
}


# Comparison routine for sort: orders its operands as numbers, smallest
# first.  Used as "sort numerically LIST"; sort supplies $a and $b.
sub numerically {
    return $a <=> $b;
}



