#######################################################################
#Functionality: 
#extract phrase heads and phrase paths 
#from output of chunklink_2-2-2000_for_conll.pl (e.g. wsj_0000_1.link)
#
#
#input: e.g. wsj_0000_1.link
#
#Output:
# 9 feature lists
#
#Usage: 
#perl get_phraseheadpath_list.pl 
#Annoteted sentences
#Sentence.link
#
#######################################################################

use strict;


my ($ffilelist, @filelist, $fl, @content, @num, @mark);
my (@insen, @inline, @sen_notag, @keywordlist, @word, @interword, @interfea); 
my (@temp, @pattern);

my ($sen, @tag, $token, $key);
my (%pair, @protein, %right_index, %map_index, %lexicon, %keyword, %inline_hash);
my (%proteinlist, %unigramlist, %context, %tokeninbetween, %context_left);
my (%verblist, %nounlist);



my ($ori_index, $prot_index, $start_mark, $value, $i, $j, $find_one, $find_two, $k);
my ($sen_notag, $ps1_left, $ps1_right, $ps2_left, $ps2_right, $form, $distance );
my ($num_between, $m, $n, $temp_left, $place, $iword2protein );

my ($form_left_index, $form_right_index, $proteino, $unigramno, $min, $max);
my ($form_left, $form_right, $count, $temp_token, $temp_index, $scale);
my ($before, $between, $after, $verb, $noun, $matchword, $maxgap, $num_pattern); 

my ($nomatch);

my ($senid, $pairid, $vectorid, $has_pair);

my (@lowcase, $sen_lowcase);

my (%context_right);


my (@chunkline, @block, @unit, @chunk, @chunk_line);
my ($unit_ps1_left, $unit_ps1_right, $unit_ps2_left, $unit_ps2_right, $line_index, $head_count);
my ($temp_path, $temp_unit, $pre_line);
my (%firsthead_between, %lasthead_between, %allhead_between, %head_before, %head_after, %path, %headpath);
my ($first_between, $last_between, $all_between, $two_before, $two_after, $phrase_path, $phrase_headpath);
my ($first_head_index, $last_head_index, $other_between, $first_before,$second_before, $first_after, $second_after );

my (%firsthead_before, %secondhead_before, %firsthead_after, %secondhead_after);



 $first_between = "firsthead_between.list";
 $last_between = "lasthead_between.list";
 $other_between = "otherhead_between.list";
 $first_before = "firsthead_before.list";
 $second_before = "secondhead_before.list";
 $first_after = "firsthead_after.list";
$second_after = "secondhead_after.list";
 $phrase_path = "phrase_path.list";
 $phrase_headpath = "phrase_headpath.list";

if ($ARGV[0] eq "-l") {
	$ffilelist = $ARGV[1];
	open (IN, "$ffilelist") || die $!;
	@filelist = <IN>;
	chomp @filelist;
	close(IN);

	open (CHUNK, "$ARGV[2]") || die $!;

}
else {
	@filelist = $ARGV[0];

	open (CHUNK, "$ARGV[1]") || die $!;

}

open (FIRSTBET, ">$first_between") || die $!;
open (LASTBET, ">$last_between") || die $!;
open (OTHERBET, ">$other_between") || die $!;
open (FIRSTBEFORE, ">$first_before") || die $!;
open (SECONDBEFORE, ">$second_before") || die $!;
open (FIRSTAFTER, ">$first_after") || die $!;
open (SECONDAFTER, ">$second_after") || die $!;
open (PATH, ">$phrase_path") || die $!;
open (HEADPATH, ">$phrase_headpath") || die $!;


#read sentences from the chunk file

@chunk_line = <CHUNK>;
close CHUNK;

#form blocks 
#because some sentences don't have parse tree,
#they are blank lines in the .link file
$j=0;
$pre_line = $chunk_line[0];
for($i=0; $i<scalar(@chunk_line); $i++){
    if($chunk_line[$i] =~ /\S/){
	$block[$j] .= $chunk_line[$i];
	$pre_line = $chunk_line[$i];
    }
    else{
	if($pre_line =~ /\S/){
	    $j++;
	    $pre_line = "\n";
	}
	else{

	    $block[$j] = "\n";
	    $j++;
	    $pre_line = "\n";
	}
    }
}

#read one line at a time after this point
local $/ = "\n";

my $nblock=scalar(@block);
print "block=$nblock,\n";


$senid = 0;
$pairid = 1;

%lexicon = ();
foreach my $fl(@filelist) {

    @content=();

    open (IN, "$fl") || die $!;
    @content = <IN>;
    chomp @content;
    close IN;

    for $sen (@content){

	if($sen =~ /\S/){


	    #if this sentence has a parse tree
	    if($block[$senid] =~ /\S/){
	    
	    #get chunk info for senid
	    @chunkline = ();
	    @chunkline =  split/\n+/, $block[$senid];

	    #check if this chunked sentence is the sentence being processed
	    @unit = split/\s+/, $chunkline[0];    

	    #record sentence id
	    $senid++;
	    	    

	    if($senid != $unit[2]){
		print "sentence id does not match: $senid.\n";
		exit;
	    }



	%pair = ();
	#key: pair mark; 
        #value: "left index(original index of the token after <p1 pair=1 >),right index(original index of </p1>)" of the protein symbol


	%right_index = ();
	#key: left index of the protein symbol(original index of <prot>); 
        #value: right index(original index of </prot>)
	@insen = ();
	@insen = split /\s+/, $sen;

	#build the mapping-index
	%map_index = ();
	#key: original index of a token; value: index without annotation tags
	$ori_index = 0;
	$prot_index = -1;
	for $token (@insen){
	    if(($token eq "<p1") || ($token eq "<p2")){
		$start_mark = 1;
		$map_index{$ori_index} = $prot_index;
		$ori_index++;
		next;
	    }
	    if($start_mark ==1 ) {
		$map_index{$ori_index} = $prot_index;
		$ori_index++;
		if($token eq ">"){
		    $start_mark = 0;
		}
		next;
	    }
	    if(($token ne "<prot>") && ($token ne "</prot>") && ($token ne "</p1>") && ($token ne "</p2>")){
		$prot_index++;
	    }

	    $map_index{$ori_index} = $prot_index;
	    $ori_index++;
	}#for $token (@insen)

	#extract proteins
	$ori_index = 0;
	while ($ori_index < scalar(@insen)){

	    $token = $insen[$ori_index];

	    if(($token eq "<p1") || ($token eq "<p2")){

		inpair();
	    }

	    if($token eq "<prot>"){
		inprot();
		
	    }

	    $ori_index++;

	}#while ($ori_index < scalar(@insen))

	#map protein original index to index without annotation

	foreach $key (keys %pair){
	    $value = $pair{$key};
	    @num = ();
	    @num = split /,/, $value;
	    if((!defined($map_index{$num[0]})) || (!defined($map_index{$num[1]}))){
		print "error: can't find index for protein\n";
	    }
	    else{
		$value = $map_index{$num[0]}.",".$map_index{$num[1]};
	    }
	    $pair{$key} = $value;
	    
	}


	$i = 0;
	@protein = ();
	foreach $key (sort numerically keys %right_index){
	    if((!defined($map_index{$key})) || (!defined($map_index{$right_index{$key}}))){
		print "error: can't find index for protein right_index\n";
	    }
	    else{
		$protein[$i] = $map_index{$key}.",".$map_index{$right_index{$key}};

		$i++;
	    }
	}
	
	#generate features


	$sen_notag = $sen;
	$sen_notag =~ s/\<p\d+\s+pair=\d+\s+\>/ /g;
	$sen_notag =~ s/\<\/*prot\>/ /g;
	$sen_notag =~ s/\<\/p\d+\>/ /g;

	#remove spaces on both sides of sen_notag
	    $sen_notag =~ s/^\s+//;
	    $sen_notag =~ s/\s+\z//;

	$sen_lowcase = lc $sen_notag;
	@lowcase = ();
    	@lowcase = split /\s+/, $sen_lowcase;

	@inline = ();
	@inline = split /\s+/, $sen_notag;

	print "$sen\n";

	#generate iword features	
	#build a hash for matching
	%inline_hash = ();
	#key: token in the sentence; 
        #value: index of the token in the sentence
	$i = 0;
	foreach(@inline){
	    if(!defined($inline_hash{$_})){
		$inline_hash{$_} = $i;

	    }
	    $i++;
	}
	@interfea = ();
	for($i=0; $i<scalar(@interword); $i++){
	    if(defined($inline_hash{$interword[$i]})){
		#if a key word presents in this sentence
		$interfea[$i] = 1;

	    }
	    else{
		$interfea[$i] = 0;
	    }
	}
	

	$has_pair = 0;
	$vectorid = -1;
	for($i=0; $i<scalar(@protein); $i++){	    
	    #position of p1
	    $protein[$i] =~ /,/;
	    $ps1_left = $`+1;
	    $ps1_right = $';
            #'
	    
	    #lexical form of p1
	    $form= "";
	    for($k=$ps1_left; $k<=$ps1_right; $k++){
		$form = $form.$inline[$k]." ";
	    }
	    #remove spaces at the end of the form
	    $form =~ s/ \z//;

	    $form_left = $form;

	    #debug
	    #print OUT "lexicon{$form}\n";

	    #put in the lexicon hash
	    if(!defined($lexicon{$form})){
		$lexicon{$form}++;

		#debug
		#print OUT "P1:lexicon{$form}\n";
	    } 
	    
	    
	    for($j=$i+1; $j<scalar(@protein); $j++){

		$has_pair = 1;

		#record feature vector id (each feature vector corresponds to one pair)
		$vectorid++;

		$scale = $ps1_left/scalar(@inline);
		print OUT "$scale ";

		#position of p2
		$protein[$j] =~ /,/;
		$ps2_left = $`+1;
		$ps2_right = $';
		$scale = $ps2_left/scalar(@inline);
		print OUT "$scale ";

                #'

		@unit = split/\s+/, $chunkline[$ps1_left];
		$unit_ps1_left = $unit[6];
		@unit = split/\s+/, $chunkline[$ps1_right];
		$unit_ps1_right = $unit[6];
		@unit = split/\s+/, $chunkline[$ps2_left];
		$unit_ps2_left = $unit[6];
		@unit = split/\s+/, $chunkline[$ps2_right];
		$unit_ps2_right = $unit[6];


		#if boundaries of proteins in the chunk file match those in the annotated files
		if(($unit_ps1_left eq $inline[$ps1_left]) && ($unit_ps1_right eq $inline[$ps1_right]) && ($unit_ps2_left eq $inline[$ps2_left]) && ($unit_ps2_right eq $inline[$ps2_right])){

		    $first_head_index = -1;
		    $last_head_index = -1;

		#first phrase head in between
		for($line_index=$ps1_right+1; $line_index<$ps2_left; $line_index++){
		    @unit = split/\s+/, $chunkline[$line_index];    
			if($unit[7] ne "NOFUNC"){
			    if($unit[6] =~ /\S/){
				$firsthead_between{(lc $unit[6])}++;
				
				$first_head_index = $line_index;


				last;
			    }
			}
		    
		}

		#last phrase head in between
		for($line_index=$ps2_left-1; $line_index>$ps1_right; $line_index--){
		    @unit = split/\s+/, $chunkline[$line_index];    
	
			if($unit[7] ne "NOFUNC"){
	
			    if($unit[6] =~ /\S/){
				$lasthead_between{(lc $unit[6])}++;

				$last_head_index = $line_index;

				last;
			    }
			}
	
		}
		
		#all other phrase heads in between
		#for($line_index=$ps1_right+1; $line_index<$ps2_left; $line_index++){
		if(($first_head_index != -1) && ($last_head_index != -1)){
		    for($line_index=$first_head_index+1; $line_index<$last_head_index; $line_index++){
			@unit = split/\s+/, $chunkline[$line_index];    
			if($unit[7] ne "NOFUNC"){
			    if($unit[6] =~ /\S/){
				$allhead_between{(lc $unit[6])}++;

			    }
			}
			
		    }
		}
		else{
		    for($line_index=$ps1_right+1; $line_index<$ps2_left; $line_index++){
			@unit = split/\s+/, $chunkline[$line_index];    
			if($unit[7] ne "NOFUNC"){
			    if($unit[6] =~ /\S/){
				$allhead_between{(lc $unit[6])}++;

			    }
			}
			
		    }


		}

		$head_count = 0;
		for($line_index=$ps1_left-1; $line_index>=0; $line_index--){
		    if($head_count < 2){
			@unit = split/\s+/, $chunkline[$line_index];    
			
			if($unit[7] ne "NOFUNC"){
			    if($unit[6] =~ /\S/){
				$head_count++;
				if($head_count == 1){
				    $firsthead_before{(lc $unit[6])}++;
				}
				else{
				    $secondhead_before{(lc $unit[6])}++;
				}
				#debug
				print "twohead_before: unit[6]=$unit[6], unit[7]=$unit[7]\n";


			    }
			}		    			
		    }
		    else{
			last;
		    }
		}


		#first and second phrase heads after M2

		$head_count = 0;
		for($line_index=$ps2_right+1; $line_index<=scalar(@inline); $line_index++){
		    if($head_count < 2){
			@unit = split/\s+/, $chunkline[$line_index];    
			
			    if($unit[7] ne "NOFUNC"){
				if($unit[6] =~ /\S/){
				    $head_count++;
				    if($head_count == 1){
					$firsthead_after{(lc $unit[6])}++;
				    }
				    else{
					$secondhead_after{(lc $unit[6])}++;
				    }

				}
			    }
		    }
		    else{
			last;
		    }


		}
		
		#path of phrase labels connecting M1 and M2
		$temp_path = "";
		if($ps1_right < $ps2_left){
		    for($line_index=$ps1_left; $line_index<=$ps2_right; $line_index++){
			@unit = split/\s+/, $chunkline[$line_index];    
			if($unit[7] ne "NOFUNC"){
			    $temp_unit = $unit[4];
			    $temp_unit =~ s/^\w-//;
			    $temp_path .= $temp_unit;
			    $temp_path .= ",";
			}
			
		    }

		    if($temp_path =~ /\S/){
			$path{$temp_path}++;
			
		    }
		}
		#path of phrase labels connecting M1 and M2  augmented with head words,
		#if at most two phrases in between
		$temp_path = "";
		$head_count = 0;
		for($line_index=$ps1_right+1; $line_index<$ps2_left; $line_index++){
		    @unit = split/\s+/, $chunkline[$line_index];    
			    if($unit[7] ne "NOFUNC"){
				$temp_unit = $unit[4];
				$temp_unit =~ s/^\w-//;
				$temp_path .= $temp_unit;
				$temp_path .= ":";
				$temp_path = $temp_path.(lc $unit[6]);
				$temp_path .= ",";
				$head_count++;
				if($head_count > 2){
				    last;
				}
			    }

		}
		if(($head_count<=2) && ($temp_path =~ /\S/)){
		    $headpath{$temp_path}++;

		}

	    }#if(($unit_ps1_left eq $inline[$ps1_left]) && ($unit_ps1_right eq $inline[$ps1_right]) && ($unit_ps2_left eq $inline[$ps2_left]) && ($unit_ps2_right eq $inline[$ps2_right]))
	    else{
		
		print "word index does not match: $senid\n";
		print "unit_ps1_left=|$unit_ps1_left|, inline[$ps1_left]=|$inline[$ps1_left]|, unit_ps1_right=|$unit_ps1_right|, inline[$ps2_left]=|$inline[$ps2_left]|, unit_ps2_right=|$unit_ps2_right|, inline[$ps2_right]=|$inline[$ps2_right]|\n";
		exit;
	    }


 	    }#for($j=$i+1; $j<scalar(@protein); $j++)

 	}#for($i=0; $i<scalar(@protein); $i++)


    }#if($block[$senid] =~ /\S/)
    else{
	$senid++;
    }
    
    }#if($sen =~ /\S/)

    }#for $sen (@content)
}#foreach my $fl(@filelist)

for $key (keys %firsthead_between){
    print FIRSTBET "$key\n";
}

for $key (keys %lasthead_between){
    print LASTBET "$key\n";
}

for $key (keys %allhead_between){
    print OTHERBET "$key\n";
}

for $key (keys %firsthead_before){
    print FIRSTBEFORE "$key\n";
}
for $key (keys %secondhead_before){
    print SECONDBEFORE "$key\n";
}

for $key (keys %firsthead_after){
    print FIRSTAFTER "$key\n";
}

for $key (keys %secondhead_after){
    print SECONDAFTER "$key\n";
}

for $key (keys %path){
    print PATH "$key\n";
}

for $key (keys %headpath){
    print HEADPATH "$key\n";
}


 close(FIRSTBET);
 close(LASTBET);
 close(OTHERBET);
 close(FIRSTBEFORE);
 close(SECONDBEFORE);
 close(FIRSTAFTER);
close(SECONDAFTER);
 close(PATH);
 close(HEADPATH);

sub inpair{

    my ($inmark, $inpair_token, $pair_mark);

    $inmark = 1;
    $pair_mark = $insen[$ori_index]." ";
    $ori_index++;
    while(($insen[$ori_index] ne "</p1>") && ($insen[$ori_index] ne "</p2>")){

	$inpair_token = $insen[$ori_index];
	if($inmark ==1 ) {
	    $pair_mark = $pair_mark.$inpair_token." ";
	    if($inpair_token =~ /\>\z/){
		$inmark = 0;
		$pair{$pair_mark} = $ori_index+1;

	    }
	}
	if($inpair_token eq "<prot>"){
	    inprot();

	}

	if(($inpair_token eq "<p1") || ($inpair_token eq "<p2")){
	    inpair();
	}
	$ori_index++;
    }
    $pair{$pair_mark} = $pair{$pair_mark}.",".$ori_index;

    return;
}

sub inprot{

    my ($cur_token, $left_index);
    my ($symbol);

    $symbol = "";
    $symbol = $symbol.$insen[$ori_index]." ";

    $left_index = $ori_index;
    $ori_index++;

 
    $symbol = $symbol.$insen[$ori_index]." ";

    while($insen[$ori_index] ne "</prot>"){
	    

	if($insen[$ori_index] eq "<prot>"){
	    
	    inprot();
	}
	
	$ori_index++;

	$symbol = $symbol.$insen[$ori_index]." ";
    }
    
    $right_index{$left_index} = $ori_index;

    return;
}


sub numerically { $a <=> $b}



