  4  """simple parser / string tokenizer 
  5  rather than returning a list of token types etc, we simple return a list of tokens... 
  6  each tokenizing function takes a string as input and returns a list of tokens 
  7  """ 
  8   
  9   
 10   
 11   
 12   
 13   
 14   
 15   
 16   
 17   
 18   
 19   
 20   
 21   
 22   
 23   
 24   
 25   
 26   

def stringeval(text):
  """takes away repeated quotes (escapes) and returns the string represented by the text"""
  stringchar = text[0]
  if text[-1] != stringchar or stringchar not in ("'", '"'):
    raise ValueError("error parsing escaped string: %r" % text)
  return text[1:-1].replace(stringchar + stringchar, stringchar)
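
# a minimal usage sketch (not part of the original module; the helper name
# _example_stringeval is ours): stringeval strips the outer quotes and
# collapses doubled quote characters back into single ones.
def _example_stringeval():
  assert stringeval("'it''s'") == "it's"
  assert stringeval('"say ""hi"""') == 'say "hi"'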

def stringquote(text):
  """escapes quotes as necessary and returns a string representing the text"""
  if "'" in text:
    if '"' in text:
      return '"' + text.replace('"', '""') + '"'
    else:
      return '"' + text + '"'
  else:
    return "'" + text + "'"
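
# a minimal usage sketch (not part of the original module; the helper name
# _example_stringquote is ours): stringquote picks whichever quote character
# needs the least escaping, and stringeval reverses the transformation.
def _example_stringquote():
  assert stringquote("it's") == '"it\'s"'
  assert stringeval(stringquote('he said "it\'s"')) == 'he said "it\'s"'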

class ParserError(ValueError):
  """Intelligent parser error"""
  def __init__(self, parser, message, tokennum):
    """takes a message and the number of the token that caused the error"""
    tokenpos = parser.findtokenpos(tokennum)
    line, charpos = parser.getlinepos(tokenpos)
    ValueError.__init__(self, "%s at line %d, char %d (token %r)" % \
        (message, line, charpos, parser.tokens[tokennum]))
    self.parser = parser
    self.tokennum = tokennum

class SimpleParser:
  """this is a simple parser"""
  def __init__(self, defaulttokenlist=None, whitespacechars=" \t\r\n", includewhitespacetokens=0):
    if defaulttokenlist is None:
      # multi-character tokens must come before their single-character
      # components, so that e.g. '<=' is matched before '='
      self.defaulttokenlist = ['<=', '>=', '==', '!=', '+=', '-=', '*=', '/=', '<>']
      self.defaulttokenlist.extend('(),[]:=+-')
    else:
      self.defaulttokenlist = defaulttokenlist
    self.whitespacechars = whitespacechars
    self.includewhitespacetokens = includewhitespacetokens
    # the tokenizers are applied in order: find string literals first,
    # then strip whitespace, then split on the token list
    self.standardtokenizers = [self.stringtokenize, self.removewhitespace, self.separatetokens]
    self.quotechars = ('"', "'")
    self.endquotechars = {'"': '"', "'": "'"}
    self.stringescaping = 1

  def stringtokenize(self, text):
    """makes quoted strings in text into single tokens..."""
    tokens = []
    laststart = 0
    instring = 0
    endstringchar, escapechar = '', '\\'
    gotclose, gotescape = 0, 0
    for pos in range(len(text)):
      char = text[pos]
      if instring:
        if self.stringescaping and (gotescape or char == escapechar) and not gotclose:
          gotescape = not gotescape
        elif char == endstringchar:
          gotclose = not gotclose
        elif gotclose:
          # the first character after an unmatched closing quote ends the string token
          tokens.append(text[laststart:pos])
          instring, laststart, endstringchar = 0, pos, ''
      if not instring:
        if char in self.quotechars:
          if pos > laststart: tokens.append(text[laststart:pos])
          instring, laststart, endstringchar, gotclose = 1, pos, self.endquotechars[char], 0
    if laststart < len(text): tokens.append(text[laststart:])
    return tokens
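
  # illustrative note (not part of the original module): on the input
  #   "name = 'fred'"
  # stringtokenize returns ["name = ", "'fred'"] - the quoted literal
  # survives as a single token, with its quotes still attached.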

  def keeptogether(self, text):
    """checks whether a token should be kept together"""
    return self.isstringtoken(text)

  def isstringtoken(self, text):
    """checks whether a token is a string token"""
    return text[:1] in self.quotechars

  def separatetokens(self, text, tokenlist=None):
    """this separates out tokens in tokenlist from whitespace etc"""
    if self.keeptogether(text): return [text]
    if tokenlist is None:
      tokenlist = self.defaulttokenlist
    # scan the text, taking the first token from tokenlist that matches at
    # each position (multi-character tokens are listed first, so they win)
    tokens = []
    pos = 0
    laststart = 0
    lentext = len(text)
    while pos < lentext:
      foundtoken = 0
      for token in tokenlist:
        lentoken = len(token)
        if text[pos:pos+lentoken] == token:
          if laststart < pos: tokens.append(text[laststart:pos])
          tokens.append(token)
          pos += lentoken
          foundtoken, laststart = 1, pos
          break
      if not foundtoken: pos += 1
    if laststart < lentext: tokens.append(text[laststart:])
    return tokens
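
  # illustrative note (not part of the original module): with the default
  # token list, separatetokens("a<=b") gives ['a', '<=', 'b'] rather than
  # ['a<', '=', 'b'], because '<=' appears before '=' in the list.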

  def removewhitespace(self, text):
    """this removes whitespace but lets it separate things out into separate tokens"""
    if self.keeptogether(text): return [text]
    # walk the text, closing off a token each time we cross a
    # whitespace/non-whitespace boundary
    tokens = []
    inwhitespace = 0
    laststart = 0
    for pos in range(len(text)):
      char = text[pos]
      if inwhitespace:
        if char not in self.whitespacechars:
          if laststart < pos and self.includewhitespacetokens: tokens.append(text[laststart:pos])
          inwhitespace, laststart = 0, pos
      else:
        if char in self.whitespacechars:
          if laststart < pos: tokens.append(text[laststart:pos])
          inwhitespace, laststart = 1, pos
    if laststart < len(text) and (not inwhitespace or self.includewhitespacetokens):
      tokens.append(text[laststart:])
    return tokens
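
  # illustrative note (not part of the original module): with the defaults,
  # removewhitespace("  name =  fred ") gives ['name', '=', 'fred']; with
  # includewhitespacetokens=1 the whitespace runs are returned as tokens too.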

  def applytokenizer(self, inputlist, tokenizer):
    """apply a tokenizer to a set of text, flattening the result"""
    tokenizedlists = [tokenizer(text) for text in inputlist]
    joined = []
    # extend rather than append, so the result stays a flat list of tokens
    # (a plain loop, since map() is lazy in Python 3 and would do nothing)
    for tokenizedlist in tokenizedlists:
      joined.extend(tokenizedlist)
    return joined

  def applytokenizers(self, inputlist, tokenizers):
    """apply a set of tokenizers to a set of text, flattening each time"""
    for tokenizer in tokenizers:
      inputlist = self.applytokenizer(inputlist, tokenizer)
    return inputlist

  def tokenize(self, source, tokenizers=None):
    """tokenize the source string, using the standard tokenizers by default"""
    self.source = source
    if tokenizers is None:
      tokenizers = self.standardtokenizers
    self.tokens = self.applytokenizers([self.source], tokenizers)
    return self.tokens
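
  # illustrative note (not part of the original module): for example,
  #   SimpleParser().tokenize("name = 'fred'")
  # returns ['name', '=', "'fred'"]: the string literal is found first,
  # then whitespace is stripped, then the '=' is split out.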

  def findtokenpos(self, tokennum):
    """finds the position of the given token in the text"""
    currenttokenpos = 0
    searchpos = 0
    for currenttokennum in range(tokennum+1):
      currenttokenpos = self.source.find(self.tokens[currenttokennum], searchpos)
      # start the next search past this token, so that repeated identical
      # tokens don't keep matching the same occurrence
      searchpos = currenttokenpos + len(self.tokens[currenttokennum])
    return currenttokenpos

  def getlinepos(self, tokenpos):
    """finds the line and character position of the given character"""
    sourcecut = self.source[:tokenpos]
    # line and charpos are both 1-based; rfind returns -1 on the first
    # line, which keeps charpos 1-based there too
    line = sourcecut.count("\n") + 1
    charpos = tokenpos - sourcecut.rfind("\n")
    return line, charpos

  def raiseerror(self, message, tokennum):
    """raises a ParserError"""
    raise ParserError(self, message, tokennum)
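
# a minimal demonstration (not part of the original module): run this file
# directly to exercise the sketches above and to show how ParserError turns
# a token number into a line/character position.
if __name__ == "__main__":
  _example_stringeval()
  _example_stringquote()
  parser = SimpleParser()
  print(parser.tokenize("name = 'fred'"))
  parser.tokenize("a = 1\nb = $")
  try:
    parser.raiseerror("unexpected token", 5)
  except ParserError as e:
    print(e)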