~/Documents/Teaching/DGfS Herbstschule 2005/Code/Charty/PSGPars.py.html

#!/usr/bin/env python

"""
PSGPars.py
This is a storage and parser class for context free grammars
written in the format of reproduction or replacement rules.


(C) 2002, 2003 by Damir Cavar <dcavar@indiana.edu>

This code is written and distributed under the
GNU General Public License which means that its
source code is freely-distributed and available
to the general public.

See http://www.gnu.org/copyleft/gpl.html for details
on the license or the the file gpl.txt that should always be
distributed with this code.


A parser for context free Phrase Structure Grammars (PSG).
The grammars are restricted to:
a. only one non-terminal on the left side of a rule:
   N -> x y z
b. Non-terminals and terminals on the right side

Rules are read in with the following constraints:
i.   it is assumed that all elements on the left side of a rule
     are non-terminals
ii.  all elements that are not in the set of non-terminals
     (appear on the right side, but never on the left side) are
     assumed to be terminals


This code can be opimized. However, its main purpose is to help
students understand how simple chart parsing works. If there are any bugs,
please let me know: Damir Cavar <dcavar@indiana.edu>
"""

__author__  = "Damir Cavar"
__version__ = "0.1"

import string, re


class PSG:
  "A Phrase Structure Grammar parser and storage class."

  regexp_nont = re.compile(r"([^\s]+)\s+->\s+(.+)")
  regexp_t = re.compile(r"([^\s]+)")

  def __init__(self, filename):
    self.lhs = []           # list of left-hand-side symbols
    self.rhs = []           # list of right-hand-side symbols
    self.terminals    = []  # list of terminals
    self.nonterminals = []  # list of non-terminals
    self.lhshash = {}
    self.rhshash = {}
    self.tags         = []  # list of tags
    self.lex          = []  # list of words for tag
    self.load(filename)

  def load(self, filename):

    lines = []
    file = open(filename)
    if file:
      for i in file:
        str = string.lstrip(string.rstrip(i))
        if len(str) == 0:
          continue
        if str[0] == "#":
          continue
        else:
          lines.append(str)

    # extract nonterminals = lhs
    rhs = []
    for i in lines:
      result = self.regexp_nont.findall(i)
      if result != None:
        for k in result:
          if k[0] not in self.nonterminals:
            self.nonterminals.append(k[0])
          self.lhs.append(k[0])
          rhs.append(k[1])

    # extract terminals
    num = 0
    dellist = []
    for i in rhs:
      result = self.regexp_t.findall(i)
      if result != None:
        tmp = []
        for k in result:
          tmp.append(k)
          if k not in self.nonterminals:
            self.terminals.append(k)
            dellist.append(num)
        self.rhs.append(tmp)
        num += 1
    for i in dellist:
      if self.lhs[i] in self.tags:
        pos = self.tags.index(self.lhs[i])
        self.lex[pos].append(self.rhs[i][0])
      else:
        self.tags.append(self.lhs[i])
        self.lex.append([self.rhs[i][0]])
    dellist.reverse()
    for i in dellist:
      del self.lhs[i]
      del self.rhs[i]

    # make a hash table with LHS
    for i in range(len(self.lhs)):
      if self.lhshash.has_key(self.lhs[i]):
        if i not in self.lhshash[self.lhs[i]]:
          tmplist = self.lhshash[self.lhs[i]]
          tmplist.append(i)
          self.lhshash[self.lhs[i]] = tmplist
      else:
        self.lhshash[self.lhs[i]] = [i]
      if self.rhshash.has_key(self.rhs[i][0]):
        if i not in self.rhshash[self.rhs[i][0]]:
          tmplist = self.rhshash[self.rhs[i][0]]
          tmplist.append(i)
          self.rhshash[self.rhs[i][0]] = tmplist
      else:
        self.rhshash[self.rhs[i][0]] = [i]

  def getTag(self, word):
    for i in range(len(self.lex)):
      if word in self.lex[i]:
        return self.tags[i]
    return "unknown"