#######################################################################
#
#Functionality: 
#extract the phrase information
#from output of Collins' parser
#
#Usage: 
#work together with get_features.pl
#
#######################################################################

package outmostphrase;
require Exporter;
@ISA = qw(Exporter);
@EXPORT = qw(getPhrase);

use strict;



# getPhrase(\@lines)
#
# Collects the outermost NP, VP and PP spans of one sentence.
#
# Parameter:
#   $line - reference to an array of strings, one per token.  Each string
#           is split on whitespace; field 3 (0-based) is the value recorded
#           as a phrase boundary (presumably the token position -- confirm
#           against the caller) and field 10 is a "/"-separated chain whose
#           elements may be begin/end markers such as "B-NP" or "E-PP".
#           An undefined argument is treated as an empty sentence.
#
# For each phrase type only the phrase opened while no other phrase of the
# same type is open is recorded; begin markers found deeper in the chain
# while one is open are ignored.  The level of a marker is its index in
# the chain.
#
# Side effects: prints "miss XP." to the selected output handle when the
# number of recorded starts and ends differ for a type, followed by one
# "xp_left[i]=..., xp_right[i]=..." line per recorded start.
#
# NOTE(review): on an inconsistent chain the sub prints a diagnostic and
# calls exit, which ends the whole process with status 0.  Kept as is so
# existing callers see unchanged behaviour.
#
# Returns the list
#   (\@np_left, \@np_right, \@vp_left, \@vp_right, \@pp_left, \@pp_right).
sub getPhrase{
    my ($line) = @_;

    # Phrase types, in the order their checks are applied to every token.
    my @tags = qw(NP VP PP);

    # $open_level{XP}: chain level of the currently open outermost XP,
    # or -1 when no XP is open.
    my (%open_level, %left, %right);
    for my $tag (@tags) {
        $open_level{$tag} = -1;
        $left{$tag}       = [];
        $right{$tag}      = [];
    }

    my @tokens = defined $line ? @{$line} : ();

    for my $token (@tokens) {
        my @column   = defined $token ? split(/\s+/, $token) : ();
        my $position = $column[3];

        # Lines with fewer than 11 fields simply carry no markers.
        my @chain = defined $column[10] ? split(m{/}, $column[10]) : ();

        # Begin markers are handled for every type before any end marker.
        for my $tag (@tags) {
            my $marker = "B-$tag";
            for my $level (0 .. $#chain) {
                next if $chain[$level] ne $marker;

                # A begin marker at or above the level of the open phrase
                # means the open phrase was never closed.
                if ($level <= $open_level{$tag}) {
                    print "this is impossible $marker. sen:$token, $position\n";
                    exit;
                }

                # Nested inside an already open phrase of this type: skip.
                next if $open_level{$tag} != -1;

                $open_level{$tag} = $level;
                push @{ $left{$tag} }, $position;
            }
        }

        for my $tag (@tags) {
            my $marker = "E-$tag";
            for my $level (0 .. $#chain) {
                next if $chain[$level] ne $marker;

                if ($level == $open_level{$tag}) {
                    # End of the open outermost phrase.
                    push @{ $right{$tag} }, $position;
                    $open_level{$tag} = -1;
                }
                elsif ($level < $open_level{$tag}) {
                    print "this is not possible $marker. sen:$token, $position\n";
                    exit;
                }
                # End markers deeper than the open phrase close nested
                # phrases and are ignored.
            }
        }
    }

    # Report what was found; an unmatched start shows an empty right value.
    for my $tag (@tags) {
        my $name   = lc $tag;
        my $starts = $left{$tag};
        my $ends   = $right{$tag};

        if (scalar(@{$starts}) != scalar(@{$ends})) {
            print "miss $tag.\n";
        }
        for my $k (0 .. $#{$starts}) {
            print "${name}_left[$k]=$starts->[$k], ${name}_right[$k]=$ends->[$k]\n";
        }
    }

    return ($left{NP}, $right{NP}, $left{VP}, $right{VP}, $left{PP}, $right{PP});

}




