% Master's thesis entry: Jackie C. K. Cheung, University of Toronto, 2009.
% "German" is brace-protected in the title so that styles which sentence-case
% titles (plain, abbrv, alpha, ...) do not lowercase the proper noun.
% The abstract contains HTML <p> markup and `download` is a non-standard field;
% the standard styles ignore both. NOTE(review): presumably they are consumed by
% a web publication-list generator -- confirm before renaming `download` to `url`.
@mastersthesis{CheungMSC,
  author   = {Jackie C. K. Cheung},
  title    = {Parsing {German} Topological Fields with Probabilistic Context-Free Grammars},
  year     = {2009},
  school   = {Department of Computer Science, University of Toronto},
  abstract = {<p>Syntactic analysis is useful for many natural language processing applications requiring further semantic analysis.
              Recent research in statistical parsing has produced a number of high-performance parsers using probabilistic
              context-free (PCFG) models to parse English text, such as (Collins, 2003; Charniak and Johnson, 2005). Problems arise,
              however, when applying these methods to parse sentences in freer-word-order languages. Such languages as Russian,
              Warlpiri, and German feature syntactic constructions that produce discontinuous constituents, directly violating
              one of the crucial assumptions of context-free models of syntax.</p>
              <p>While PCFG technologies may thus be inadequate for full syntactic analysis of all phrasal structure in these languages,
              clausal structure can still be fruitfully parsed with these methods. In particular, we examine applying PCFG parsing to
              parse the topological field structure of German. These topological fields provide a high-level description of the major
              sections of a clause in relation to the clausal main verb and the subordinating heads and appear in strict linear sequences
              amenable to PCFG parsing. They are useful for tasks such as deep syntactic analysis, part-of-speech tagging and coreference
              resolution.</p>
              <p>In this work, we apply an unlexicalized, latent variable-based parser (Petrov et al., 2006) to topological field parsing,
              and achieve state-of-the-art parsing results on two German newspaper corpora without any language- or model-dependent adaptation.
              We perform a qualitative error analysis of the parser output, and identify constructions like ellipses and parentheticals as
              the chief sources of remaining error. This is confirmed by a further experiment in which parsing performance improves after
              restricting the training and test set to those sentences without these constructions.</p>
              <p> We also explore techniques for further improving parsing results. For example, discriminative reranking of parses made by a
              generative parser could incorporate linguistic information such as those derived by our qualitative analysis. Self-training is
              another semi-supervised technique which utilizes additional unannotated data for training.</p>},
  download = {http://ftp.cs.toronto.edu/pub/gh/Cheung-MSc-2009.pdf}
}





