#!/usr/bin/env python

This is a small incremental bottom-up chart parser for context free grammars.

(C) 2005 by Damir Cavar

This code is written and distributed under the
GNU General Public License which means that its
source code is freely-distributed and available
to the general public.

See http://www.gnu.org/copyleft/gpl.html for details
on the license or the the file gpl.txt that should always be
distributed with this code.

Used data structures:

   list of edges

   list of integers and symbols
   [start, end, dotindex, LHS, RHS]
   start:    integer, start of the edge
   end:      integer, end of the edge
   dotindex: integer, position of the dot in the RHS
   LHS:      string, left-hand side symbol
   RHS:      list of strings, symbols in right-hand side

Incremental (left-to-right) bottom-up chart parser.
Select only potentially appropriate rules from grammar
   - length of RHS is less or equal to remaining words/symbols

Processing steps:
   Word by word:
      Initialise chart with word (add edge for word)
      Do until no further improvement:
         Add new rules from grammar that consume inactive edges
         Apply the fundamental rule to induce new edges

This code can be opimized. However, its main purpose is to help
students understand how simple chart parsing works. If there are any bugs,
please let me know: Damir Cavar <dcavar@indiana.edu>"""

__author__  = "Damir Cavar"
__version__ = "0.2"

import sys, PSGPars
from copy import copy

grammarfile = "PSG1.txt"              # name of the CFG file
DEBUG = 1       # set this to 0 if you do not want tracing

grammar = PSGPars.PSG(grammarfile) # initialization of the grammar
chart  = []     # storage for edges
inputlength = 0 # storage for number of tokens in input

def isActive(edge):
   """Return 1 if edge is active, else return 0."""
   if edge[2] < len(edge[4]): return 1
   return 0

def isInactive(edge):
   """Return 1 if edge is active, else return 0."""
   if edge[2] >= len(edge[4]): return 1
   return 0

def match(aedge, iedge):
   """Returns 1 if the active edge and the inactive edge match,
      otherwise 0."""
   if aedge[1] == iedge[0]:
      if aedge[4][aedge[2]] == iedge[3]: return 1
   return 0

def getParse(inputlength, chart):
   """Returns a list of all parses in bracketing notation."""
   parses = []
   for i in range(len(chart)):
      if not isActive(chart[i]):
         if chart[i][0] == 0 and chart[i][1] == inputlength: # got spanning edge
   return parses

def structToStr(edges):
   """Returns a string representation of the parse with
      labled brackets."""
   tmpstr = ""
   for i in edges:
      if chart[i][5]:
         tmpstr = tmpstr + "[" + chart[i][3] + " " + structToStr(chart[i][5]) + " ] "
         tmpstr = tmpstr + "[" + chart[i][3] + " "
         for x in chart[i][4]:
            tmpstr = " ".join([tmpstr, x])
         tmpstr = tmpstr + " ] "
   return tmpstr

def ruleInvocation(lststart):
   """Add all the rules of the grammar to the chart that
      are relavant:
      Find the rule with the LHS of edge as the leftmost RHS
      symbol and maximally the remaining length of the input."""
   global chart
   change = 0
   for i in range(lststart, len(chart)):
      if chart[i][2] >= len(chart[i][4]): # only inactive edge
         (start, end, index, lhs, rhs, consumed) = chart[i]
         if grammar.rhshash.has_key(lhs):
            for k in grammar.rhshash[lhs]:
               if len(grammar.rhs[k]) > inputlength - start:
               newedge = [ start, end, 1, grammar.lhs[k], grammar.rhs[k], [i] ]
               if newedge in chart:
               change = 1
               if DEBUG:
                  print "RI Adding edge:", newedge
   return change

def fundamentalRule():
   """The fundamental rule of chart parsing generates new edges by
      combining fitting active and inactive edges."""
   global chart
   change = 0
   for aedge in chart:
      if isActive(aedge):
         for k in range(len(chart)):
            if isInactive(chart[k]):
               if match(aedge, chart[k]):
                  newedge = aedge[:]
                  newedge[5] = aedge[5][:]
                  newedge[1] = chart[k][1]
                  newedge[2] = newedge[2] + 1
                  if newedge not in chart:
                     change = 1
                     if DEBUG:
                        print "FR Adding edge:", newedge
   return change

def parse(input):
   """Parse a list of tokens.
   global chart, inputlength
   chart = []
   inputlength = len(input)

   chartpos = 0  # remember start-position in chart
   for i in range(len(input)):
      # initialize with input token
      chart.append([ i, i + 1, 1, grammar.getTag(input[i]), [ input[i] ], [] ])
      if DEBUG:
         print "Adding edge:", chart[len(chart) - 1]
      change = 1
      while change:
         change = 0
         chartlen = len(chart)
         if ruleInvocation(chartpos):
            change = 1
         chartpos = chartlen  # set pointer to new edge in chart
         if fundamentalRule():
            change = 1
   if DEBUG:
      print "Chart:"
      for i in range(len(chart)):
         if isActive(chart[i]):
            print i, "Active:  ",
            print i, "Inactive:",
         print chart[i]
   for i in getParse(inputlength, chart):
      print "Parse:"
      print i

if __name__ == "__main__":
   if len(sys.argv) > 1:
      print "Parsing: John kissed Mary"
      parse(["John", "kissed", "Mary"])
      print "-----------------------------------------------------------"
      print "Parsing: John kissed Mary"
      parse(["the", "green", "man", "killed", "the", "blue", "man", "with", "the", "tie"])