#######################################################################
#
#input: 
#1. annotated sentences
#2: Keyword list. (Used only when testing keyword and pattern features).
#3: Output of the Chunklink script (http://ilk.uvt.nl/sabine/chunklink/).
#4: Output of the minipar parser.

#Output:
# 1. loose format feature file
# 2. sentence id : line of feature vector 
# 3. protein index of each feature vector (data point, pair of proteins, interaction)
#
#Usage (works together with dependency.pm and outmostphrase.pm):
#
#perl get_features.pl 
#Sentences in the Texas data set
#keyword.list 
#Sentence.link 
#Sentence.mini
#Sentence.fea 
#Sentence.map 
#Sentence.points
#
#######################################################################

use strict;
use dependency;
use outmostphrase;



my ($ffilelist, @filelist, $fl, @content, @num, @mark);
my (@insen, @inline, @sen_notag, @keywordlist, @word, @interword, @interfea); 
my (@temp, @pattern);

my ($sen, @tag, $token, $key);
my (%pair, @protein, %right_index, %map_index, %lexicon, %keyword, %inline_hash);
my (%proteinlist, %unigramlist, %context, %tokeninbetween, %context_left);
my (%verblist, %nounlist);



my ($ori_index, $prot_index, $start_mark, $value, $i, $j, $find_one, $find_two, $k);
my ($sen_notag, $ps1_left, $ps1_right, $ps2_left, $ps2_right, $form, $distance );
my ($num_between, $m, $n, $temp_left, $place, $iword2protein );

my ($form_left_index, $form_right_index, $proteino, $unigramno, $min, $max);
my ($form_left, $form_right, $count, $temp_token, $temp_index, $scale);
my ($before, $between, $after, $verb, $noun, $matchword, $maxgap, $num_pattern); 

my ($nomatch);

my ($senid, $pairid, $vectorid, $has_pair);

my (@lowcase, $sen_lowcase);

my (%context_right);

my (%firsthead_between, %lasthead_between, %otherhead_between, %head_before, %head_after, %path, %headpath);
my (%firsthead_before, %secondhead_before, %firsthead_after, %secondhead_after, %fea_index);

my (@chunkline, @block, @unit, $unit_ps2_left, );

my ($pre_line, $unit_ps1_left, $unit_ps1_right, $unit_ps2_left, $unit_ps2_right, $first_head_index, $last_head_index);
   
my ($line_index, $ti, $head_count, $temp_path, $temp_unit);

my ($num_sen, @parms, @wordpos, $mini, @miniblock);
my (%depword, %deppos);

my ($num_protein, $cover_m1, $cover_m2, $temp_ps_left, $temp_ps_right, $l);

my ($np_left, $np_right, $vp_left, $vp_right, $pp_left, $pp_right);
my ($same_np, $same_vp, $same_pp);

my (@senlabel);

#maximum number of tokens in a gap
$maxgap = 5;
$num_pattern = 23;

# Command-line handling: resolve the list of sentence files and open every
# input/output stream used by the rest of the script.
#
# Two invocation forms are accepted; the six arguments after the first are
# the same in both:
#   -l <filelist> ...  <filelist> names one sentence file per line
#   <sentences>   ...  a single sentence file
# followed by the keyword list, the chunklink output and the minipar output
# (inputs), then the feature file, the sentence-id map and the data-point
# file (outputs).
#
# Three-argument open is used so that a file name can never be interpreted
# as a mode or a pipe, and every failure reports which file could not be
# opened.  Too few arguments now stop the script with a usage message before
# any output file is truncated.
my @cli_args  = @ARGV;
my $usage_msg = "Usage: perl $0 [-l filelist | sentences] keyword.list "
    . "Sentence.link Sentence.mini Sentence.fea Sentence.map Sentence.points\n";

if (@cli_args && $cli_args[0] eq "-l") {
    # drop the switch so the remaining positions match the single-file form
    shift @cli_args;
    die $usage_msg if @cli_args < 7;

    $ffilelist = $cli_args[0];
    open(IN, '<', $ffilelist) or die "Cannot open file list '$ffilelist': $!";
    @filelist = <IN>;
    chomp @filelist;
    close(IN);
}
else {
    die $usage_msg if @cli_args < 7;
    @filelist = $cli_args[0];
}

open(KEY,   '<', $cli_args[1]) or die "Cannot open keyword list '$cli_args[1]': $!";
open(CHUNK, '<', $cli_args[2]) or die "Cannot open chunklink output '$cli_args[2]': $!";
open(MINI,  '<', $cli_args[3]) or die "Cannot open minipar output '$cli_args[3]': $!";

open(OUT,   '>', $cli_args[4]) or die "Cannot write feature file '$cli_args[4]': $!";
open(INDEX, '>', $cli_args[5]) or die "Cannot write sentence map '$cli_args[5]': $!";
open(DATA,  '>', $cli_args[6]) or die "Cannot write data-point file '$cli_args[6]': $!";



#only necessary when testing the keyword and patterns features
#"noun.list", "verb.list" are the keywords (e.g. in Plake et al. 2005)

# #get iNOUN list
# open (NOUN, "noun.list") || die $!;
# @temp = ();
# @temp = <NOUN>;
# chomp @temp;
# close(NOUN);


# foreach(@temp){
#     if(!defined($nounlist{$_})){
# 	$nounlist{$_}++;
#     }
# }

# #get iVERB list
# open (VERB, "verb.list") || die $!;
# @temp = ();
# @temp = <VERB>;
# chomp @temp;
# close(VERB);

# foreach(@temp){
#     if(!defined($verblist{$_})){
# 	$verblist{$_}++;
#     }
# }


# Feature index tables.
#
# Every *.list file below holds one entry per line.  Each distinct entry is
# numbered 1, 2, 3, ... in order of first appearance, and that number is
# stored in the corresponding hash.  The files are read from the current
# working directory, in the same order as before.

# _load_index_list($list_file)
#   Reads $list_file, strips the trailing newline of every line and returns a
#   reference to a hash mapping each distinct line to its 1-based rank of
#   first appearance.  Repeated lines keep the index of their first
#   occurrence.  Dies with the file name if the file cannot be opened.
sub _load_index_list {
    my ($list_file) = @_;

    open(my $list_fh, '<', $list_file) or die "Cannot open $list_file: $!";
    my @entries = <$list_fh>;
    close($list_fh);
    chomp @entries;

    my %index_of;
    my $next_index = 0;
    foreach my $entry (@entries) {
        next if exists $index_of{$entry};
        $index_of{$entry} = ++$next_index;
    }
    return \%index_of;
}

#get the protein feature index
%proteinlist = %{ _load_index_list("protein.list") };
#number of distinct proteins, i.e. the largest protein index
$proteino = scalar(keys %proteinlist);

#get the unigram feature index
%unigramlist = %{ _load_index_list("unigram_lowcase.list") };
#number of distinct unigrams, i.e. the largest unigram index
$unigramno = scalar(keys %unigramlist);


#only necessary when testing the keyword and patterns features

# #assign index to key words signaling an interaction
# @keywordlist = <KEY>;
# chomp @keywordlist;
# close(KEY);
# %keyword = {};

# $i = 0;
# #put feature index of key words into @interword
# foreach (@keywordlist){
    
#     if(!defined($keyword{$_})){
# 	$interword[$i] = $_;

# 	$keyword{$_}++;
# 	$i++;

#     }
# }




#get phrase head lists, phrase path lists, dependent word list and
#dependent PoS list; each row is [ list file, hash that receives the index ]
my @index_tables = (
    [ "firsthead_between.list", \%firsthead_between ],
    [ "lasthead_between.list",  \%lasthead_between  ],
    [ "otherhead_between.list", \%otherhead_between ],
    [ "firsthead_before.list",  \%firsthead_before  ],
    [ "secondhead_before.list", \%secondhead_before ],
    [ "firsthead_after.list",   \%firsthead_after   ],
    [ "secondhead_after.list",  \%secondhead_after  ],
    [ "phrase_path.list",       \%path              ],
    [ "phrase_headpath.list",   \%headpath          ],
    [ "depend_head.list",       \%depword           ],
    [ "depend_pos.list",        \%deppos            ],
);

foreach my $table (@index_tables) {
    my ($list_file, $target) = @$table;
    %$target = %{ _load_index_list($list_file) };
}


#slurp the chunklink output; the lines keep their trailing newline
@chunkline = <CHUNK>;
close CHUNK;

#group the lines into one block per sentence
#A run of non-blank lines is one sentence. The first blank line after such a
#run only closes it; every further consecutive blank line (and a blank line
#at the very top of the file) stands for a sentence without a parse tree and
#gets a block that holds just "\n".
$j = 0;
$pre_line = $chunkline[0];
$i = 0;
while ($i < scalar(@chunkline)) {
    my $line = $chunkline[$i];

    if ($line =~ /\S/) {
	#still inside the current sentence
	$block[$j] .= $line;
	$pre_line = $line;
    }
    else {
	#blank line following a blank line: sentence with no parse
	$block[$j] = "\n" unless $pre_line =~ /\S/;
	$j++;
	$pre_line = "\n";
    }

    $i++;
}

#read sentences from the minipar file
#The input record separator is switched to "\n)\n" for this single read, so
#each element of @miniblock is one whole record; chomp then removes that
#separator from the end of every record. The override is confined to the
#bare block.
{
    local $/ = "\n)\n";
    @miniblock = <MINI>;
    chomp(@miniblock);
}
close MINI;

#read one line at a time after this point
local $/ = "\n";

#count the number of sentences
#Every line of every sentence file counts as one sentence; the total must
#equal the number of minipar records read above, otherwise sentences and
#parses cannot be aligned.
$num_sen = 0;
foreach my $fl (@filelist) {

    @content = ();

    open(my $sen_fh, '<', $fl) or die "Cannot open sentence file '$fl': $!";
    @content = <$sen_fh>;
    chomp @content;
    close($sen_fh);

    $num_sen += scalar(@content);

}

if ($num_sen != scalar(@miniblock)) {
    #report on STDERR and leave with a non-zero status so that a calling
    #script can tell that no usable feature file was produced
    print STDERR "Number of sentences is not equal to minipar output.\n";
    print STDERR "($num_sen sentence lines, " . scalar(@miniblock) . " minipar records)\n";
    exit 1;
}


#running id of the current non-blank sentence; it is incremented before use,
#so the first sentence is 1, and $senid-1 is the index of its chunk block
$senid = 0;
#NOTE(review): $pairid is initialised here but not used in the part of the
#script visible to this review - confirm whether it is still needed
$pairid = 1;

#lexical forms of all protein mentions seen so far, across all sentence files
%lexicon = ();
foreach my $fl(@filelist) {

    @content=();

    open (IN, "$fl") || die $!;
    @content = <IN>;
    chomp @content;
    close IN;

    for $sen (@content){

	if($sen =~ /\S/){

	    #record sentence id
	    $senid++;
	    

	%pair = ();
	#key: pair mark; 
        #value: "left index(original index of the token after <p1 pair=1 >),right index(original index of </p1>)" of the protein symbol

	#%protein = ();
	#key: left index of the protein symbol; value: the protein symbol

	%right_index = ();
	#key: left index of the protein symbol(original index of <prot>); 
        #value: right index(original index of </prot>)
	@insen = ();
	@insen = split /\s+/, $sen;

	#build the mapping-index
	%map_index = ();
	#key: original index of a token; value: index without annotation tags
	$ori_index = 0;
	$prot_index = -1;
	for $token (@insen){
	    if(($token eq "<p1") || ($token eq "<p2")){
		$start_mark = 1;
		$map_index{$ori_index} = $prot_index;
		$ori_index++;
		next;
	    }
	    if($start_mark ==1 ) {
		$map_index{$ori_index} = $prot_index;
		$ori_index++;
		if($token eq ">"){
		    $start_mark = 0;
		}
		next;
	    }
	    if(($token ne "<prot>") && ($token ne "</prot>") && ($token ne "</p1>") && ($token ne "</p2>")){
		$prot_index++;
	    }

	    $map_index{$ori_index} = $prot_index;
	    $ori_index++;
	}#for $token (@insen)

	#extract proteins
	$ori_index = 0;
	while ($ori_index < scalar(@insen)){

	    $token = $insen[$ori_index];
	    #if($token =~ /\A\<p\d+/){
	    if(($token eq "<p1") || ($token eq "<p2")){

		inpair();
	    }

	    if($token eq "<prot>"){
		inprot();
		
	    }

	    $ori_index++;

	}#while ($ori_index < scalar(@insen))

	#map protein original index to index without annotation

	foreach $key (keys %pair){
	    $value = $pair{$key};
	    @num = ();
	    @num = split /,/, $value;
	    if((!defined($map_index{$num[0]})) || (!defined($map_index{$num[1]}))){
		print "error: can't find index for protein\n";
	    }
	    else{
		$value = $map_index{$num[0]}.",".$map_index{$num[1]};
	    }
	    $pair{$key} = $value;
	    
	}



	$i = 0;
	@protein = ();
	foreach $key (sort numerically keys %right_index){
	    if((!defined($map_index{$key})) || (!defined($map_index{$right_index{$key}}))){
		print "error: can't find index for protein right_index\n";
	    }
	    else{
		$protein[$i] = $map_index{$key}.",".$map_index{$right_index{$key}};

		$i++;
	    }
	}



        #generate labels
# 	for($i=0; $i<scalar(@protein); $i++){
# 	    $find_one = 0;
# 	    $k = 0;
# 	    @mark = ();
# 	    #collect all pairs of this protein
# 	    foreach $key (keys %pair){
# 		if($pair{$key} eq $protein[$i]){
# 		    $find_one = 1;
# 		    $mark[$k] = $key;
# 		    if($mark[$k] =~ /\<p1/){
# 			$mark[$k] =~ s/\<p1/\<p2/;
# 		    }
# 		    else{
# 			$mark[$k] =~ s/\<p2/\<p1/;
# 		    }
# 		    $k++;
# 		}
# 	    }
# 	    if($find_one == 1){
		
# 		for($j=$i; $j<scalar(@protein); $j++){
# 		    $find_two = 0;
# 		    for($k=0; $k<scalar(@mark); $k++){
# 			if($pair{$mark[$k]} eq $protein[$j]){
# 			    print OUT "1 $protein[$i]~$protein[$j]\n";
# 			    $find_two++;
# 			    last;
# 			}
# 		    }
# 		    if($find_two == 0){
# 			print OUT "0 $protein[$i]~$protein[$j]\n";
# 		    }

# 		    if($find_two == scalar(@mark)){
# 			last;
# 		    }
# 		}#for($j=$i+1; $j<scalar(@protein); $j++)
# 	    }#if($find_one == 1)
# 	    else{
# 		for($j=$i; $j<scalar(@protein); $j++){
# 		    print OUT "0 $protein[$i]~$protein[$j]\n";
# 		}
# 	    }
# 	}#for($i=0; $i<scalar(@protein); $i++)
	

	#generate features


	$sen_notag = $sen;
	$sen_notag =~ s/\<p\d+\s+pair=\d+\s+\>/ /g;
	$sen_notag =~ s/\<\/*prot\>/ /g;
	$sen_notag =~ s/\<\/p\d+\>/ /g;

	#remove spaces on both sides of sen_notag
	    $sen_notag =~ s/^\s+//;
	    $sen_notag =~ s/\s+\z//;

	$sen_lowcase = lc $sen_notag;
	@lowcase = ();
    	@lowcase = split /\s+/, $sen_lowcase;

	@inline = ();
	@inline = split /\s+/, $sen_notag;


	#find all out-most NPs, VPs, and PPs in this sentence

	#start from empty boundary lists, so that a sentence without a parse
	#tree neither reuses the phrase boundaries of the previous sentence
	#nor leaves the references undefined when the same-NP/VP/PP overlap
	#features dereference them further down
	($np_left,$np_right,$vp_left,$vp_right,$pp_left,$pp_right) = ([], [], [], [], [], []);

	#if this sentence has a parse tree
	if(defined($block[$senid-1]) && ($block[$senid-1] =~ /\S/)){

	    #get chunk info for senid
	    @chunkline = ();
	    @chunkline =  split/\n+/, $block[$senid-1];

	    #check if this chunked sentence is the sentence being processed
	    #(the third field of the first chunk line must be the sentence id)
	    @unit = split/\s+/, $chunkline[0];

	    if($senid != $unit[2]){
		print "sentence id does not match: $senid.\n";
		exit;
	    }

	    #left/right boundary lists of the NPs, VPs and PPs, as array references
	    ($np_left,$np_right,$vp_left,$vp_right,$pp_left,$pp_right) = getPhrase(\@chunkline);

	}#if($block[$senid-1] =~ /\S/)




	#build a hash for matching
	%inline_hash = ();
	#key: token in the sentence; 
        #value: index of the token in the sentence
	$i = 0;
	foreach(@inline){
	    if(!defined($inline_hash{$_})){
		$inline_hash{$_} = $i;
	    }
	    $i++;
	}


#only necessary when testing keyword and pattern features

	#generate iword features	
# 	@interfea = ();
# 	for($i=0; $i<scalar(@interword); $i++){
# 	    if(defined($inline_hash{$interword[$i]})){
# 		#if a key word presents in this sentence
# 		$interfea[$i] = 1;

# 	    }
# 	    else{
# 		$interfea[$i] = 0;
# 	    }
# 	}
	

	$has_pair = 0;
	$vectorid = -1;
	for($i=0; $i<scalar(@protein); $i++){	    
	    #position of p1
	    $protein[$i] =~ /,/;
	    $ps1_left = $`+1;
	    $ps1_right = $';
            #'
	    
	    #lexical form of p1
	    $form= "";
	    for($k=$ps1_left; $k<=$ps1_right; $k++){
		$form = $form.$inline[$k]." ";
	    }
	    #remove spaces at the end of the form
	    $form =~ s/ \z//;

	    $form_left = $form;

	    #put in the lexicon hash
	    if(!defined($lexicon{$form})){
		$lexicon{$form}++;

	    } 
	    
	    #get the unigram feature index of 3 tokens to the left of the pair
	    %context_left = ();
	    for($count=1; $count<4; $count++){
		$temp_index = $ps1_left-$count;
		if($temp_index >= 0){

		    #get lowercase of the token
		    $temp_token = $lowcase[$temp_index];


		    if(defined($unigramlist{$temp_token})){
			if(!defined($context_left{$unigramlist{$temp_token}})){
			    $context_left{$unigramlist{$temp_token}}++;
			}
		    }
		}
	    }
	    



	    for($j=$i+1; $j<scalar(@protein); $j++){

		$has_pair = 1;

		#record feature vector id (each feature vector corresponds to one pair)
		$vectorid++;

		$scale = $ps1_left/scalar(@inline);
		print OUT "$scale ";

		#position of p2
		$protein[$j] =~ /,/;
		$ps2_left = $`+1;
		$ps2_right = $';
		$scale = $ps2_left/scalar(@inline);
		print OUT "$scale ";

                #'

 		#print this data point (interaction pair)
 		#print DATA "($ps1_left,$ps1_right)~($ps2_left,$ps2_right)\n";

		#overlap features: M1 and M2 in the same NP(VP, PP)?
		$same_np = 0;
		$same_vp = 0;
		$same_pp = 0;
 		for($l=0; $l<scalar(@$np_left); $l++){
		    if(($ps1_left >= @$np_left[$l]) && ($ps1_right <= @$np_right[$l]) && ($ps2_left >= @$np_left[$l]) && ($ps2_right <= @$np_right[$l])){
			$same_np = 1;
			last;
		    }
		}
		if($same_np == 1){
		    print OUT "1 ";
		}
		else{
		    print OUT "0 ";
		}
 		for($l=0; $l<scalar(@$vp_left); $l++){
		    if(($ps1_left >= @$vp_left[$l]) && ($ps1_right <= @$vp_right[$l]) && ($ps2_left >= @$vp_left[$l]) && ($ps2_right <= @$vp_right[$l])){
			$same_vp = 1;
			last;
		    }
		}
		if($same_vp == 1){
		    print OUT "1 ";
		}
		else{
		    print OUT "0 ";
		}

 		for($l=0; $l<scalar(@$pp_left); $l++){
		    if(($ps1_left >= @$pp_left[$l]) && ($ps1_right <= @$pp_right[$l]) && ($ps2_left >= @$pp_left[$l]) && ($ps2_right <= @$pp_right[$l])){
			$same_pp = 1;
			last;
		    }
		}
		if($same_pp == 1){
		    print OUT "1 ";
		}
		else{
		    print OUT "0 ";
		}


		#overlap features: M1 part of M2?
		if(($ps1_left >= $ps2_left) && ($ps1_right <= $ps2_right)){
		    print OUT "1 ";
		}
		else{
		    print OUT "0 ";		    
		}

		#overlap features: M2 part of M1?
		if(($ps2_left >= $ps1_left) && ($ps2_right <= $ps1_right)){
		    print OUT "1 ";
		}
		else{
		    print OUT "0 ";		    
		}

		#overlap features: M1(M2) part of another mention?
		$cover_m1 = 0;
		$cover_m2 = 0;
		for($l=0; $l<scalar(@protein); $l++){
		    if(($l != $i) && ($l != $j)){
			#position of this mention
			$protein[$l] =~ /,/;
			$temp_ps_left = $`+1;
			$temp_ps_right = $';
                        #' 
			if(($ps1_left >= $temp_ps_left) && ($ps1_right <= $temp_ps_right)){
			    $cover_m1 = 1;
			}
			if(($ps2_left >= $temp_ps_left) && ($ps2_right <= $temp_ps_right)){
			    $cover_m2 = 1;
			}
			if(($cover_m1 == 1) && ($cover_m2 == 1)){
			    last;
			}
		    }#if(($l != $i) && ($l != $j))
		}#for($l=0; $l<scalar(protein); $l++)
		if($cover_m1 == 1){
		    print OUT "1 ";
		}
		else{
		    print OUT "0 ";
		}
		if($cover_m2 == 1){
		    print OUT "1 ";
		}
		else{
		    print OUT "0 ";
		}

		#get the unigram feature index of 3 tokens to the right of the pair
		%context_right = ();

		for($count=1; $count<4; $count++){
		    $temp_index = $ps2_right+$count;
		    if($temp_index < scalar(@inline)){
			#$temp_token = $inline[$temp_index];

			#get lowercase of the token
			$temp_token = $lowcase[$temp_index];


			if(defined($unigramlist{$temp_token})){	
			    if(!defined($context_right{$unigramlist{$temp_token}})){
				$context_right{$unigramlist{$temp_token}}++;
			    }
			}
		    }
		}


		#generate context_left feature: one 0/1 flag per unigram index,
		#set when that index was recorded in %context_left.
		#An empty hash simply yields all zeros, so no separate emptiness
		#test is needed; the former defined(%hash) test is a fatal error
		#on Perl 5.22 and later.
		#NOTE(review): unigram indices run from 1 to $unigramno, but this
		#loop stops at $unigramno-1, so the last unigram never gets a
		#column. Left unchanged because fixing it would alter the width
		#of the feature vector.
		for($count=1; $count<$unigramno; $count++){
		    if(defined($context_left{$count})){
			print OUT "1 ";
		    }
		    else{
			print OUT "0 ";
		    }
		}


		#generate context_right feature: same layout as context_left,
		#taken from %context_right
		for($count=1; $count<$unigramno; $count++){
		    if(defined($context_right{$count})){
			print OUT "1 ";
		    }
		    else{
			print OUT "0 ";
		    }
		}


                #get the unigram feature index of tokens in between of the pair
		%tokeninbetween = ();
		if($ps2_left > $ps1_right){
		    for($count=$ps1_right+1; $count<$ps2_left; $count++){

			#get lowercase of the token
			$temp_token = $lowcase[$count];

			#tokens missing from the unigram list have no index;
			#skip them, as the left/right context code does,
			#instead of recording an undefined key
			if(defined($unigramlist{$temp_token})){
			    if(!defined($tokeninbetween{$unigramlist{$temp_token}})){
				$tokeninbetween{$unigramlist{$temp_token}}++;
			    }
			}
		    }
		}
		#generate wordinbetween feature: one 0/1 flag per unigram index.
		#An empty %tokeninbetween yields all zeros, so the former
		#defined(%hash) test (fatal on Perl 5.22 and later) is dropped.
		#NOTE(review): as with the context features, the loop stops at
		#$unigramno-1 although indices run up to $unigramno; left
		#unchanged to keep the feature vector width.
		for($count=1; $count<$unigramno; $count++){
		    if(defined($tokeninbetween{$count})){
			print OUT "1 ";
		    }
		    else{
			print OUT "0 ";
		    }
		}

		#p2p distance in token
		$distance = abs($ps1_left-$ps2_left);
		$scale = $distance/scalar(@inline);
		print OUT "$scale ";

		#number of proteins between p1 and p2
		if($distance == 0){
		    print OUT "0 ";
		}
		else{
		    $num_between = 0;
		    for($m=$i; $m<scalar(@protein); $m++){
			$protein[$m] =~ /,/;
			$temp_left = $`+1;
			if(($ps1_left<$temp_left) && ($temp_left<$ps2_left)){
			    $num_between++;
			}
		    }
		    $scale = $num_between/scalar(@protein);
		    print OUT "$scale ";
		}

		#generate p1 and p2 lexical form feature
		if(defined($proteinlist{$form_left})){

		    $form_left_index = $proteinlist{$form_left};
		}
		else{
		    $form_left_index = 0;
		}

 		#lexical form of p2
 		$form_right = "";
 		for($k=$ps2_left; $k<=$ps2_right; $k++){
 		    $form_right = $form_right.$inline[$k]." ";
 		}
 		#remove spaces at the end of the form
 		$form_right =~ s/ \z//;

		#print out p1 and p2
		print DATA "$form_left~~~$form_right\n";
		
		if(defined($proteinlist{$form_right})){

		    $form_right_index = $proteinlist{$form_right};
		}
		else{
		    $form_right_index = 0;
		}

		#always print the protein with a smaller index first
		#so the two cases: p1-p2 and p2-p1 will be treated as
		#the same pair

		#if neither of the proteins is in the list
		if(($form_left_index == 0) && ($form_left_index == $form_right_index)){
		    for($n=1; $n<=($proteino*2); $n++){
			print OUT "0 ";
		    }

		}
		else{
		    $min = $form_left_index <= $form_right_index ? $form_left_index : $form_right_index;
		    $max = $form_left_index >= $form_right_index ? $form_left_index : $form_right_index;
		    if($min == 0){
			for($n=1; $n<=$proteino; $n++){
			    print OUT "0 ";
			}
			for($n=1; $n<=$proteino; $n++){
			    if($n == $max){
				print OUT "1 ";
			    }
			    else{
				print OUT "0 ";
			    }
			}
		    }#if($min == 0)
		    else{

			for($n=1; $n<=$proteino; $n++){
			    if($n == $min){
				print OUT "1 ";
			    }
			    else{
				print OUT "0 ";
			    }
			}
			for($n=1; $n<=$proteino; $n++){
			    if($n == $max){
				print OUT "1 ";
			    }
			    else{
				print OUT "0 ";
			    }
			}
			
		    }#if($min == 0)
		}#if($form_left_index == 0) && ($form_left_index == $form_right_index)else


# only necessary when testing keyword and pattern features

# 		#generate iword, iword place, iword2protein distance feature
# 		#for each iword
# 		for($n=0; $n<scalar(@interword); $n++){
# 		    #iword feature

# 		    print OUT "$interfea[$n] ";

# 		    #iword place: one of {0, 1(before), 2(between), 3(after)}
# 		    if($interfea[$n] == 0){
# 			print OUT "0 ";

# 			#iword2protein distance
# 			#if the iword is not in this sentence
# 			#then set this feature value to 0
# 			print OUT "0 ";
# 		    }
# 		    else{
# 			if(defined($inline_hash{$interword[$n]})){

# 			    $place = $inline_hash{$interword[$n]};
# 			    if($place <= $ps1_left){
# 				print OUT "1 ";
# 			    }
# 			    else{
# 				if($place <= $ps2_left){
# 				    print OUT "2 ";
# 				}
# 				else{
# 				    print OUT "3 ";
# 				}
# 			    }
# 			    #iword2protein distance
# 			    #the distance is set to be the actual distance + 1
# 			    #because distance=0 means the iword is not in the sentence
# 			    $iword2protein = abs($ps1_left-$place) < abs($ps2_left-$place) ? abs($ps1_left-$place)+1 : abs($ps2_left-$place)+1;
# 			    $scale = $iword2protein/scalar(@inline);
# 			    print OUT "$scale ";
				    
# 			}#if(defined($inline_hash{$interword[$n]}))
# 			else{
# 			    print "-----Error: word not found in inline_hash\n";
# 			    exit;
# 			}
# 		    }#if($interfea[$n] == 0)

# 		}#for($n=0; $n<scalar(@interword); $n++)

# 		#generate pattern features

# 		#initialize patterns
# 		@pattern = ();
# 		for($n=0; $n<$num_pattern; $n++){
# 		    $pattern[$n] = -1;
# 		}

# 		#extract all tokens before the first protein
# 		#(note: ps_left+1 is the real start of the protein)
# 		$before = "";
# 		for($n=0; $n<$ps1_left; $n++){
# 		    $before = $before.$inline[$n]." ";
# 		}
# 		#make sure there is one space on the left side and no space on the right side
# 		$before = " ".$before;
# 		$before =~ s/ \z//;
# 		#change to lower case
# 		$before = lc $before;

# 		#extract all tokens between the two proteins
# 		$between = "";
# 		for($n=$ps1_right+1; $n<$ps2_left; $n++){
# 		    $between = $between.$inline[$n]." ";
# 		}
# 		#make sure there is no space on either side
# 		#remove the space on both sides
# 		$between =~ s/\A //;
# 		$between =~ s/ \z//;
# 		#change to lower case
# 		$between = lc $between;
		
# 		#extract all tokens after the second protein
# 		$after = "";
# 		for($n=$ps2_right+1; $n<scalar(@inline); $n++){
# 		    $after = $after.$inline[$n]." ";
# 		}
# 		#make sure there is no space on left, but one space on right side
# 		$after =~ s/\A //;
# 		#change to lower case
# 		$after = lc $after;
		
#                 if($before ne ""){
# 		    foreach $verb (keys %verblist){
# 			$matchword = $verb." "."of";
# 			if($before =~ / $matchword(?:\s\S+?){0,$maxgap}\z/){
# 			    $pattern[3] = 1;
# 			    $pattern[4] = 1;
# 			    last;
# 			}
# 			else{
# 			    $pattern[3] = -1;
# 			    $pattern[4] = -1;
# 			}
# 		    }#foreach $verb (keys %verblist){

# 		    foreach $noun (keys %nounlist){
# 			$matchword = $noun." "."of";
# 			if($before =~ / $matchword(?:\s\S+?){0,$maxgap}\z/){
# 			    $pattern[5] = 1;
# 			    $pattern[6] = 1;

			    
# 			}
# 			$matchword = $noun." "."between";
# 			if($before =~ / $matchword(?:\s\S+?){0,$maxgap}\z/){
# 			    $pattern[7] = 1;
# 			}
# 		    }
# 		    $matchword = "complex formed between";
# 		    if($before =~ / $matchword(?:\s\S+?){0,$maxgap}\z/){
# 			$pattern[10] = 1;
# 		    }
# 		    if($before =~ /( complex| complexes) of(?:\s\S+?){0,$maxgap}\z/){
# 			$pattern[11] = 1;
# 		    }

# 		    if($before =~ /( complex| complexes) containing(?:\s\S+?){0,$maxgap}\z/){
# 			$pattern[13] = 1;
# 		    }

# 		}#if($before ne "")
		
#                 if($between ne ""){

# 		    foreach $verb (keys %verblist){

# 			if($between =~ /\A(?:\S+?\s){0,$maxgap}$verb(?:\s\S+?){0,$maxgap}\z/){

# 			    $pattern[1] = 1;
# 			}
# 			if($between =~ /\A(?:\S+?\s){0,$maxgap}$verb(?:\s\S+?){0,$maxgap} by(?:\s\S+?){0,$maxgap}\z/){
# 			    $pattern[2] = 1;
# 			}
# 			if($between =~ /\A(?:\S+?\s){0,$maxgap}$verb(?:\s\S+?){0,$maxgap} but not(?:\s\S+?){0,$maxgap}\z/){
# 			    $pattern[17] = 1;
# 			}
# 			if($between =~ /\A(?:\S+?\s){0,$maxgap}cannot(?:\s\S+?){0,$maxgap} $verb(?:\s\S+?){0,$maxgap}\z/){
# 			    $pattern[18] = 1;
# 			}
# 			#if($between =~ /\A(?:\S+?\s){0,$maxgap}(does )|( did )|( was )not(?:\s\S+?){0,$maxgap} $verb(?:\s\S+?){0,$maxgap}\z/){
# 			if($between =~ /\A(?:\S+?\s){0,$maxgap}(does|did|was) not(?:\s\S+?){0,$maxgap} $verb(?:\s\S+?){0,$maxgap}\z/){
# 			    $pattern[19] = 1;
# 			}

# 			if($between =~ /\A(?:\S+?\s){0,$maxgap}not(?:\s\S+?){0,$maxgap} $verb(?:\s\S+?){0,$maxgap} by(?:\s\S+?){0,$maxgap}\z/){
# 			    $pattern[20] = 1;
# 			}

# 			if($between =~ /\A(?:\S+?\s){0,$maxgap}not required for(?:\s\S+?){0,$maxgap} $verb(?:\s\S+?){0,$maxgap}\z/){
# 			    $pattern[21] = 1;
# 			}

# 			if($between =~ /\A(?:\S+?\s){0,$maxgap}failed to(?:\s\S+?){0,$maxgap} $verb(?:\s\S+?){0,$maxgap}\z/){
# 			    $pattern[22] = 1;
# 			}
# 		    }#foreach $verb (keys %verblist)
# 		    if($pattern[3] == 1){
# 			if($between !~ /\A(?:\S+?\s){0,$maxgap}by(?:\s\S+?){0,$maxgap}\z/){
# 			    $pattern[3] = -1;
# 			}
# 		    }
# 		    if($pattern[4] == 1){
# 			if($between !~ /\A(?:\S+?\s){0,$maxgap}to(?:\s\S+?){0,$maxgap}\z/){
# 			    $pattern[4] = -1;
# 			}
# 		    }
# 		    if($pattern[5] == 1){
# 			#if($between !~ /\A(?:\S+?\s){0,$maxgap}(by )|( through)(?:\s\S+?){0,$maxgap}\z/){
# 			if($between !~ /\A(?:\S+?\s){0,$maxgap}(by|through)(?:\s\S+?){0,$maxgap}\z/){
# 			    $pattern[5] = -1;
# 			}
# 		    }
# 		    if($pattern[6] == 1){
# 			#if(($between !~ /\A(?:\S+?\s){0,$maxgap}(with )|( to )|( on)(?:\s\S+?){0,$maxgap}\z/) && ($between !~ /\A(with )|(to )|(on)(?:\s\S+?){0,$maxgap}\z/)){
# 			if($between !~ /\A(?:\S+?\s){0,$maxgap}(with|to|on)(?:\s\S+?){0,$maxgap}\z/){
# 			    $pattern[6] = -1;
# 			}
# 		    }
# 		    if($between !~ /\A(?:\S+?\s){0,$maxgap}and(?:\s\S+?){0,$maxgap}\z/){
# 			if($pattern[7] == 1){
# 			    $pattern[7] = -1;
# 			}
# 			if($pattern[10] == 1){
# 			    $pattern[10] = -1;
# 			}
# 			if($pattern[11] == 1){
# 			    $pattern[11] = -1;
# 			}
# 			if($pattern[13] == 1){
# 			    $pattern[13] = -1;
# 			}
			
# 		    }
		
# 		    if(($between =~ /\A(?:\S+?\s){0,$maxgap}(?:\S+?){0,1}\z/) || $between =~ /\A\S+\z/){

# 			$pattern[8] = 1;
# 			$pattern[9] = 1;
# 			$pattern[14] = 1;
# 			$pattern[15] = 1;
# 		    }

# 		    #if($between =~ /\A(?:\S+?\s){0,$maxgap}(inhibitor )|( repressor )of(?:\s\S+?){0,$maxgap}\z/){
# 		    if($between =~ /\A(?:\S+?\s){0,$maxgap}(inhibitor|repressor) of(?:\s\S+?){0,$maxgap}\z/){
# 			$pattern[16] = 1;
# 		    }

# 		}#if($between ne ""){

# 		if($after ne ""){
# 		    if($pattern[8] == 1){
# 			#if($after !~ /\A(?:\S+?\s){0,$maxgap}(complex )|( complexes )|( dimer )|( heterodimer )|( homodimer )/){
# 			if($after !~ /\A(?:\S+?\s){0,$maxgap}(complex|complexes|dimer|heterodimer|homodimer) /){
# 			    $pattern[8] = -1;
# 			}
# 		    }
# 		    if($pattern[9] == 1){
# 			$nomatch = 1;
# 			foreach $noun (keys %nounlist){
# 			    if($after =~ /\A(?:\S+?\s){0,$maxgap}$noun /){
# 				$nomatch = 0;
# 				last;
# 				#$pattern[9] = -1;
# 			    }
# 			}
# 			if($nomatch == 1){
# 			    $pattern[9] = -1;
# 			}
# 		    }
# 		    if($pattern[14] == 1){
# 			#if($after !~ /\A(?:\S+?\s){0,$maxgap}(form )|( formed )/){
# 			if($after !~ /\A(?:\S+?\s){0,$maxgap}(form|formed) /){
# 			    $pattern[14] = -1;
# 			}
# 		    }
# 		    if($pattern[15] == 1){
# 			$nomatch = 1;
# 			foreach $verb (keys %verblist){
# 			    if($after =~ /\A(?:\S+?\s){0,$maxgap}$verb(?:\s\S+?){0,$maxgap} with each other /){
# 				$nomatch = 0;
# 				last;
# 				#$pattern[15] = -1;
# 			    }
# 			}
# 			if($nomatch == 1){
# 			    $pattern[15] = -1;
# 			}
# 		    }
			
# 		}#if($after ne "")


# 		for($n=1; $n<scalar(@pattern); $n++){
# 		    #print OUT "pattern[$n] = $pattern[$n]\n";

# 		    print OUT "$pattern[$n] ";
# 		}
		


# 		#get phrase head features


		#if this sentence has a parse tree
		if($block[$senid-1] =~ /\S/){
	    
		    #chomp $block[$senid-1];

		    #get chunk info for senid
		    @chunkline = ();
		    @chunkline =  split/\n+/, $block[$senid-1];

		    #check if this chunked sentence is the sentence being processed
		    @unit = split/\s+/, $chunkline[0];    
	    
	    	    if($senid != $unit[2]){
			print "sentence id does not match: $senid.\n";
			exit;
		    }

		    @unit = split/\s+/, $chunkline[$ps1_left];
		    $unit_ps1_left = $unit[6];
		    @unit = split/\s+/, $chunkline[$ps1_right];
		    $unit_ps1_right = $unit[6];
		    @unit = split/\s+/, $chunkline[$ps2_left];
		    $unit_ps2_left = $unit[6];
		    @unit = split/\s+/, $chunkline[$ps2_right];
		    $unit_ps2_right = $unit[6];


		    #if boundaries of proteins in the chunk file match those in the annotated files
		    if(($unit_ps1_left eq $inline[$ps1_left]) && ($unit_ps1_right eq $inline[$ps1_right]) && ($unit_ps2_left eq $inline[$ps2_left]) && ($unit_ps2_right eq $inline[$ps2_right])){

			$first_head_index = -1;
			$last_head_index = -1;

			#first phrase head in between
			for($line_index=$ps1_right+1; $line_index<$ps2_left; $line_index++){
			    @unit = split/\s+/, $chunkline[$line_index];    
			    if($unit[7] ne "NOFUNC"){
				if($unit[6] =~ /\S/){

				    if(defined($firsthead_between{(lc $unit[6])})){
					for($ti=0; $ti<scalar(keys %firsthead_between); $ti++){
					    if($ti == $firsthead_between{(lc $unit[6])}){
						print OUT "1 ";
					    }
					    else{
						print OUT "0 ";
					    }
					}
				    }
				    else{
					for($ti=0; $ti<scalar(keys %firsthead_between); $ti++){
					    print OUT "0 ";
					}

				    }
				    $first_head_index = $line_index;

				    last;
				}
			    }#if($unit[7] ne "NOFUNC")
			    
			}#for($line_index=$ps1_right+1; $line_index<$ps2_left; $line_index++)

			#if no phrase head in between, then print 0
			if($first_head_index == -1){
			    for($ti=0; $ti<scalar(keys %firsthead_between); $ti++){
				print OUT "0 ";
			    }
			}

			#last phrase head in between
			for($line_index=$ps2_left-1; $line_index>$ps1_right; $line_index--){
			    @unit = split/\s+/, $chunkline[$line_index];    
			    
			    if($unit[7] ne "NOFUNC"){
				
				if($unit[6] =~ /\S/){
		
				    if(defined($lasthead_between{(lc $unit[6])})){
					for($ti=0; $ti<scalar(keys %lasthead_between); $ti++){
					    if($ti == $lasthead_between{(lc $unit[6])}){
						print OUT "1 ";
					    }
					    else{
						print OUT "0 ";
					    }
					}
				    }
				    else{
					for($ti=0; $ti<scalar(keys %lasthead_between); $ti++){
					    print OUT "0 ";
					}

				    }

				    
				    $last_head_index = $line_index;
				    
				    last;
				}
			    }
	
			}#for($line_index=$ps2_left-1; $line_index>$ps1_right; $line_index--)

			if($last_head_index == -1){
			    for($ti=0; $ti<scalar(keys %lasthead_between); $ti++){
				print OUT "0 ";
			    }
			}
		
			#other phrase heads in between
			#for($line_index=$ps1_right+1; $line_index<$ps2_left; $line_index++){
			%fea_index = ();
			if(($first_head_index != -1) && ($last_head_index != -1)){
			    for($line_index=$first_head_index+1; $line_index<$last_head_index; $line_index++){
				@unit = split/\s+/, $chunkline[$line_index];    
				if($unit[7] ne "NOFUNC"){
				    if($unit[6] =~ /\S/){
					if(defined($otherhead_between{(lc $unit[6])})){
					    $fea_index{$otherhead_between{(lc $unit[6])}}++;
					}
					
				    }
				}
				
			    }
			    for($ti=0; $ti<scalar(keys %otherhead_between); $ti++){
				if(defined($fea_index{$ti})){
				    print OUT "1 ";
				}
				else
				{
				    print OUT "0 ";
				}
			    }
			}#if(($first_head_index != -1) && ($last_head_index != -1))
			else{
			    for($ti=0; $ti<scalar(keys %otherhead_between); $ti++){
				    print OUT "0 ";
			    }
			}



                        #first and second phrase heads before M1
			$head_count = 0;
			#if M1 is not the first token in the sentence
			if($ps1_left >= 1){
			    for($line_index=$ps1_left-1; $line_index>=0; $line_index--){
				if($head_count < 2){
				    @unit = split/\s+/, $chunkline[$line_index];    
				
				    if($unit[7] ne "NOFUNC"){
					if($unit[6] =~ /\S/){
					    $head_count++;
					    if($head_count == 1){
						if(defined($firsthead_before{(lc $unit[6])})){
					    
						    for($ti=0; $ti<scalar(keys %firsthead_before); $ti++){
							if($ti == $firsthead_before{(lc $unit[6])}){
							    print OUT "1 ";
							}
							else{
							    print OUT "0 ";
							}
						    }
						}
						else{
						    for($ti=0; $ti<scalar(keys %firsthead_before); $ti++){
							print OUT "0 ";
						    }
						    
						}
						
					    }#if($head_count == 1)
					    else{
						if(defined($secondhead_before{(lc $unit[6])})){
						    for($ti=0; $ti<scalar(keys %secondhead_before); $ti++){
							if($ti == $secondhead_before{(lc $unit[6])}){
							    print OUT "1 ";
							}
							else{
							    print OUT "0 ";
							}
						    }
						}
						else{
						    for($ti=0; $ti<scalar(keys %secondhead_before); $ti++){
							print OUT "0 ";
						    }
						    
						}
					    }

					}#if($unit[6] =~ /\S/)
				    }#if($unit[7] ne "NOFUNC")
					
				    #if no phrase head was found
				    if(($line_index == 0) && ($head_count ==0)){
					for($ti=0; $ti<scalar(keys %firsthead_before); $ti++){
					    print OUT "0 ";
					}
					for($ti=0; $ti<scalar(keys %secondhead_before); $ti++){
					    print OUT "0 ";
					}
				    }
				    
				    #if only  one phrase head was found
				    if(($line_index == 0) && ($head_count ==1)){
					for($ti=0; $ti<scalar(keys %secondhead_before); $ti++){
					    print OUT "0 ";
					}
				    }					    			
				}#if($head_count < 2)
				else{
				    last;
				}
			    }#for($line_index=$ps1_left-1; $line_index>=0; $line_index--)
			}#if($ps1_left >= 1)
			else{
			    for($ti=0; $ti<scalar(keys %firsthead_before); $ti++){
				print OUT "0 ";
			    }
			    for($ti=0; $ti<scalar(keys %secondhead_before); $ti++){
				print OUT "0 ";
			    }
			}
			    


			#first and second phrase heads after M2
			$head_count = 0;
			#if M2 is not the last token in the sentence
			if($ps2_right+1 <=scalar(@inline)){
			    for($line_index=$ps2_right+1; $line_index<=scalar(@inline); $line_index++){
				if($head_count < 2){
				    @unit = split/\s+/, $chunkline[$line_index];    
				
				    if($unit[7] ne "NOFUNC"){
					if($unit[6] =~ /\S/){
					    $head_count++;
					    if($head_count == 1){
						if(defined($firsthead_after{(lc $unit[6])})){
						    for($ti=0; $ti<scalar(keys %firsthead_after); $ti++){
							if($ti == $firsthead_after{(lc $unit[6])}){
							    print OUT "1 ";
							}
							else{
							    print OUT "0 ";
							}
						    }
						}
						else{
						    for($ti=0; $ti<scalar(keys %firsthead_after); $ti++){
							print OUT "0 ";
						    }
						    
						}

					    }#if($head_count == 1)
					    else{
						if(defined($secondhead_after{(lc $unit[6])})){
						    for($ti=0; $ti<scalar(keys %secondhead_after); $ti++){
							if($ti == $secondhead_after{(lc $unit[6])}){
							    print OUT "1 ";
							}
							else{
							    print OUT "0 ";
							}
						    }
						}
						else{
						    for($ti=0; $ti<scalar(keys %secondhead_after); $ti++){
							print OUT "0 ";
						    }
						    
						}
						
					    }#if($head_count == 1)else
						
					}#if($unit[6] =~ /\S/)
				    }#if($unit[7] ne "NOFUNC")

				    #if no phrase head was found
				    if(($line_index == scalar(@inline)) && ($head_count ==0)){
					for($ti=0; $ti<scalar(keys %firsthead_after); $ti++){
					    print OUT "0 ";
					}
					for($ti=0; $ti<scalar(keys %secondhead_after); $ti++){
					    print OUT "0 ";
					}
				    }
				    
				    #if only one phrase head was found
				    if(($line_index == scalar(@inline)) && ($head_count ==1)){
					for($ti=0; $ti<scalar(keys %secondhead_after); $ti++){
					    print OUT "0 ";
					}
				    }
				    
				    
				}#if($head_count < 2)
				else{
				    last;
				}
			    }#for($line_index=$ps2_right+1; $line_index<=scalar(@inline); $line_index++)
			}#if($ps2_right+1 <=scalar(@inline))
			else{
			    for($ti=0; $ti<scalar(keys %firsthead_after); $ti++){
				print OUT "0 ";
			    }
			    for($ti=0; $ti<scalar(keys %secondhead_after); $ti++){
				print OUT "0 ";
			    }

			}
		
			#path of phrase labels connecting M1 and M2
			$temp_path = "";
			if($ps1_right < $ps2_left){
			    for($line_index=$ps1_left; $line_index<=$ps2_right; $line_index++){
				@unit = split/\s+/, $chunkline[$line_index];    
				if($unit[7] ne "NOFUNC"){
				    $temp_unit = $unit[4];
				    $temp_unit =~ s/^\w-//;
				    $temp_path .= $temp_unit;
				    $temp_path .= ",";
				}
				
			    }
			    if($temp_path =~ /\S/){
				if(defined($path{$temp_path})){
				    for($ti=0; $ti<scalar(keys %path); $ti++){
					if($ti == $path{$temp_path}){
					    print OUT "1 ";
					}
					else{
					    print OUT "0 ";
					}
				    }
				}
				else{
				    for($ti=0; $ti<scalar(keys %path); $ti++){
					print OUT "0 ";
				    }
				    
				}
				

				
			    }#if($temp_path =~ /\S/)
			    else{
				for($ti=0; $ti<scalar(keys %path); $ti++){
				    print OUT "0 ";
				}
			    }

			}#if($ps1_right < $ps2_left)
			else{
			    for($ti=0; $ti<scalar(keys %path); $ti++){
				print OUT "0 ";
			    }
			}

			#path of phrase labels connecting M1 and M2  augmented with head words,
			#if at most two phrases in between
			$temp_path = "";
			$head_count = 0;
			for($line_index=$ps1_right+1; $line_index<$ps2_left; $line_index++){
			    @unit = split/\s+/, $chunkline[$line_index];    
			    if($unit[7] ne "NOFUNC"){
				$temp_unit = $unit[4];
				$temp_unit =~ s/^\w-//;
				$temp_path .= $temp_unit;
				$temp_path .= ":";
				$temp_path = $temp_path.(lc $unit[6]);
				$temp_path .= ",";
				$head_count++;
				if($head_count > 2){
				    last;
				}
			    }

			}
			if(($head_count<=2) && ($temp_path =~ /\S/)){
			    if(defined($headpath{$temp_path})){
				for($ti=0; $ti<scalar(keys %headpath); $ti++){
				    if($ti == $headpath{$temp_path}){
					print OUT "1 ";
				    }
				    else{
					print OUT "0 ";
				    }
				}
			    }
			    else{
				for($ti=0; $ti<scalar(keys %headpath); $ti++){
				    print OUT "0 ";
				}
				
			    }


			    
			}#if(($head_count<=2) && ($temp_path =~ /\S/))
			else{
			    for($ti=0; $ti<scalar(keys %headpath); $ti++){
				print OUT "0 ";
			    }
			    
			}
			
 		    }#if(($unit_ps1_left eq $inline[$ps1_left]) && ($unit_ps1_right eq $inline[$ps1_right]) && ($unit_ps2_left eq $inline[$ps2_left]) && ($unit_ps2_right eq $inline[$ps2_right]))
		    else{
		
			print "word index does not match: $senid\n";
			print "unit_ps1_left=|$unit_ps1_left|, inline[$ps1_left]=|$inline[$ps1_left]|, unit_ps1_right=|$unit_ps1_right|, inline[$ps2_left]=|$inline[$ps2_left]|, unit_ps2_right=|$unit_ps2_right|, inline[$ps2_right]=|$inline[$ps2_right]|\n";
			exit;
		    }


		}#if($block[$senid-1] =~ /\S/)
		else{
		    for($ti=0; $ti<scalar(keys %firsthead_between); $ti++){
			print OUT "0 ";
		    }
		    for($ti=0; $ti<scalar(keys %lasthead_between); $ti++){
			print OUT "0 ";
		    }
		    for($ti=0; $ti<scalar(keys %otherhead_between); $ti++){
			print OUT "0 ";
		    }
		    for($ti=0; $ti<scalar(keys %firsthead_before); $ti++){
			print OUT "0 ";
		    }
		    for($ti=0; $ti<scalar(keys %secondhead_before); $ti++){
			print OUT "0 ";
		    }
		    for($ti=0; $ti<scalar(keys %firsthead_after); $ti++){
			print OUT "0 ";
		    }
		    for($ti=0; $ti<scalar(keys %secondhead_after); $ti++){
			print OUT "0 ";
		    }
		    for($ti=0; $ti<scalar(keys %path); $ti++){
			print OUT "0 ";
		    }
		    for($ti=0; $ti<scalar(keys %headpath); $ti++){
			print OUT "0 ";
		    }

		    #$senid++;
		}


		#get dependent word and PoS
		$parms[0] = $ps1_left;
		$parms[1] = $ps1_right;
		$parms[2] = $ps2_left;
		$parms[3] = $ps2_right;
		@wordpos = getDependency($num_sen, $senid-1, \@inline, \@parms, \@miniblock);
		if($wordpos[0] =~ /\S/){
		    if(defined($depword{$wordpos[0]})){
			for($ti=0; $ti<scalar(keys %depword); $ti++){
			    if($ti == $depword{$wordpos[0]}){
				print OUT "1 ";
			    }
			    else{
				print OUT "0 ";
			    }
			}
		    }
		    else{
			for($ti=0; $ti<scalar(keys %depword); $ti++){
			    print OUT "0 ";
			}
		    }
		}
		else{
		    for($ti=0; $ti<scalar(keys %depword); $ti++){
			print OUT "0 ";
		    }
		}
		if($wordpos[2] =~ /\S/){
		    if(defined($depword{$wordpos[2]})){
			for($ti=0; $ti<scalar(keys %depword); $ti++){
			    if($ti == $depword{$wordpos[2]}){
				print OUT "1 ";
			    }
			    else{
				print OUT "0 ";
			    }
			}
		    }
		    else{
			for($ti=0; $ti<scalar(keys %depword); $ti++){
			    print OUT "0 ";
			}
		    }
		}
		else{
		    for($ti=0; $ti<scalar(keys %depword); $ti++){
			print OUT "0 ";
		    }
		}


		if($wordpos[1] =~ /\S/){
		    if(defined($deppos{$wordpos[1]})){
			for($ti=0; $ti<scalar(keys %deppos); $ti++){
			    if($ti == $deppos{$wordpos[1]}){
				print OUT "1 ";
			    }
			    else{
				print OUT "0 ";
			    }
			}
		    }
		    else{
			for($ti=0; $ti<scalar(keys %deppos); $ti++){
			    print OUT "0 ";
			}
		    }
		}
		else{
		    for($ti=0; $ti<scalar(keys %deppos); $ti++){
			print OUT "0 ";
		    }
		}
		if($wordpos[3] =~ /\S/){
		    if(defined($deppos{$wordpos[3]})){
			for($ti=0; $ti<scalar(keys %deppos); $ti++){
			    if($ti == $deppos{$wordpos[3]}){
				print OUT "1 ";
			    }
			    else{
				print OUT "0 ";
			    }
			}
		    }
		    else{
			for($ti=0; $ti<scalar(keys %deppos); $ti++){
			    print OUT "0 ";
			}
		    }
		}
		else{
		    for($ti=0; $ti<scalar(keys %deppos); $ti++){
			print OUT "0 ";
		    }
		}


		print OUT "\n";


	    }#for($j=$i+1; $j<scalar(@protein); $j++)

	}#for($i=0; $i<scalar(@protein); $i++)

	if($has_pair == 1){
	    #print out senid ~ vectorid
	    print INDEX ("$senid:$pairid~");
	    $pairid += $vectorid;
	    print INDEX ("$pairid\n");
	    $pairid++;
	}


	}#if($sen =~ /\S/)

    }#for $sen (@content)
}#foreach my $fl(@filelist)

# Processing is finished: close the three bareword file handles used by the
# script, in the same order as before. The result of close() is not checked.
foreach my $handle (\*OUT, \*DATA, \*INDEX){
    close($handle);
}

# inpair: consume one pair-annotation span of the token array @insen.
#
# Uses the file-level variables @insen (tokens), $ori_index (cursor) and %pair.
# On entry $ori_index is expected to sit on the opening "<p1" / "<p2" token.
# On return $ori_index sits on the closing "</p1>" / "</p2>" token; the caller
# is responsible for stepping past it.
#
# Side effects:
#   $pair{KEY} = "START,END" where
#     KEY   is the text of the opening tag: every token up to and including
#           the first one ending in ">", each followed by a single space;
#     START is the index of the token right after the opening tag;
#     END   is the index of the closing token.
#   Nested "<prot>" tokens are handed to inprot(); nested "<p1" / "<p2"
#   tokens are handled by recursing into inpair().
#
# If the closing token is missing, the scan stops at the end of @insen (instead
# of looping forever on undefined tokens), a warning is issued, and END is the
# number of tokens in @insen.
sub inpair{

    my ($in_open_tag, $token, $pair_mark, $num_tokens);

    $num_tokens = scalar(@insen);

    $in_open_tag = 1;
    $pair_mark = $insen[$ori_index]." ";
    $ori_index++;

    while($ori_index < $num_tokens){

        $token = $insen[$ori_index];
        if(($token eq "</p1>") || ($token eq "</p2>")){
            last;
        }

        #still inside the opening tag: keep collecting its tokens
        if($in_open_tag == 1){
            $pair_mark = $pair_mark.$token." ";
            #a token ending in ">" closes the opening tag
            if($token =~ /\>\z/){
                $in_open_tag = 0;
                $pair{$pair_mark} = $ori_index+1;
            }
        }

        if($token eq "<prot>"){
            #inprot() leaves $ori_index on the matching "</prot>"
            inprot();
        }

        if(($token eq "<p1") || ($token eq "<p2")){
            #the nested call leaves $ori_index on its own closing token
            inpair();
        }

        $ori_index++;
    }

    #ran off the end of the sentence: the pair tag was never closed
    if($ori_index >= $num_tokens){
        $ori_index = $num_tokens;
        warn "inpair: no closing </p1> or </p2> found for '$pair_mark'\n";
    }

    $pair{$pair_mark} = $pair{$pair_mark}.",".$ori_index;

    return;
}

# inprot: consume one "<prot>" ... "</prot>" span of the token array @insen.
#
# Uses the file-level variables @insen (tokens), $ori_index (cursor) and
# %right_index. On entry $ori_index is expected to sit on the "<prot>" token.
# On return $ori_index sits on the matching "</prot>" token; the caller is
# responsible for stepping past it.
#
# Side effects:
#   $right_index{LEFT} = RIGHT, where LEFT is the index of the "<prot>" token
#   and RIGHT the index of its matching "</prot>". Nested "<prot>" spans are
#   processed recursively and get their own %right_index entries.
#
# If "</prot>" is missing, the scan stops at the end of @insen (instead of
# looping forever on undefined tokens), a warning is issued, and RIGHT is the
# number of tokens in @insen.
sub inprot{

    my ($left_index, $num_tokens);

    $num_tokens = scalar(@insen);

    $left_index = $ori_index;
    $ori_index++;

    while(($ori_index < $num_tokens) && ($insen[$ori_index] ne "</prot>")){

        if($insen[$ori_index] eq "<prot>"){
            #the nested call leaves $ori_index on the inner "</prot>"
            inprot();
        }

        $ori_index++;
    }

    #ran off the end of the sentence: the protein tag was never closed
    if($ori_index >= $num_tokens){
        $ori_index = $num_tokens;
        warn "inprot: no closing </prot> found for <prot> at token $left_index\n";
    }

    $right_index{$left_index} = $ori_index;

    return;
}


# Comparison routine for sort(): orders the two values that sort() places in
# $a and $b by ascending numeric value.
sub numerically {
    return $a <=> $b;
}



