1   
  2   
  3   
  4   
  5   
  6   
  7   
  8   
  9   
 10   
 11   
 12   
 13   
 14   
 15   
 16   
 17   
 18   
 19   
 20   
 21   
 22  """string processing utilities for extracting strings with various kinds of delimiters""" 
 23   
 24  import logging 
 25  import htmlentitydefs 
 26   
 28    """returns a list of locations where substr occurs in searchin 
 29    locations are not allowed to overlap""" 
 30    location = 0 
 31    locations = [] 
 32    while location != -1: 
 33      location = searchin.find(substr, location) 
 34      if location != -1: 
 35        locations.append(location) 
 36        location += len(substr) 
 37    return locations 
  38   
 40    """Extracts the parts of source that run from a startdelim to the following enddelim (delimiters included), ignoring delimiters directly preceded by an unescaped escape 
 41    returns tuple of (extracted text with delimiters, still in string at end)""" 
 42    # startinstring and allowreentry are not assigned in this body; presumably parameters - confirm against the header
 43    instring = startinstring 
 44    enteredonce = False 
 45    lenstart = len(startdelim) 
 46    lenend = len(enddelim) 
 47    startdelim_places = find_all(source, startdelim) 
 48    if startdelim == enddelim: 
 49      enddelim_places = startdelim_places[:]  # same delimiter on both sides: every occurrence is a candidate for both
 50    else: 
 51      enddelim_places = find_all(source, enddelim) 
 52    if escape is not None: 
 53      lenescape = len(escape) 
 54      escape_places = find_all(source, escape) 
 55      last_escape_pos = -1  # NOTE(review): never read in this body
 56      # an escape directly preceded by another escape toggles the state, so doubled escapes cancel out
 57      true_escape = False 
 58      true_escape_places = [] 
 59      for escape_pos in escape_places: 
 60        if escape_pos - lenescape in escape_places: 
 61          true_escape = not true_escape 
 62        else: 
 63          true_escape = True 
 64        if true_escape: 
 65          true_escape_places.append(escape_pos) 
 66      startdelim_places = [pos for pos in startdelim_places if pos - lenescape not in true_escape_places]  # drop escaped delimiters
 67      enddelim_places = [pos + lenend for pos in enddelim_places if pos - lenescape not in true_escape_places]  # end places point just past the delimiter
 68    else: 
 69      enddelim_places = [pos + lenend for pos in enddelim_places]  # end places point just past the delimiter
 70    # positions worth visiting, de-duplicated through dict keys; calling .sort() on .keys() needs keys() to return a list (Python 2)
 71    significant_places = dict.fromkeys([0] + startdelim_places + enddelim_places + [len(source)-1]).keys() 
 72    significant_places.sort() 
 73    extracted = "" 
 74    lastpos = None 
 75    for pos in significant_places: 
 76      if instring and pos in enddelim_places: 
 77        # this end place is only the far edge of the start delimiter entered at lastpos, not a real end
 78        if lastpos == pos - lenstart and lastpos in startdelim_places: 
 79          continue 
 80        extracted += source[lastpos:pos]  # lastpos is still None if no delimiter was seen yet, which slices from the start of source
 81        instring = False 
 82        lastpos = pos 
 83      if (not instring) and pos in startdelim_places and not (enteredonce and not allowreentry):  # a second entry needs allowreentry
 84        instring = True 
 85        enteredonce = True 
 86        lastpos = pos 
 87    if instring: 
 88      extracted += source[lastpos:]  # still inside at the end of source: keep the remainder
 89    return (extracted, instring) 
  90   
 92    """Calls extract on each line in turn, passing on whether the previous line ended inside a string, and returns the concatenated extracts""" 
 93    result = "" 
 94    instring = 0  # the first line is taken to start outside a string
 95    for line in lines: 
 96      (string, instring) = extract(line, startdelim, enddelim, escape, instring) 
 97      result += string 
 98      if not instring: break  # stop after the first line that ends outside a string
 99    return result 
 100   
102    "Extracts a doublequote-delimited string (quotes included) from a string, allowing for backslash-escaping" 
103    (string, instring) = extract(source, '"', '"', '\\') 
104    return string  # the still-in-string flag is discarded
 105   
109   
111    """Extracts the text found between startdelim and enddelim in source, leaving the delimiters out; returns tuple of (extracted text, still in string at end) 
112    includeescapes can also be a function that is given each escape plus the character after it and returns the replaced version""" 
113    instring = startinstring 
114    enteredonce = False 
115    lenstart = len(startdelim) 
116    lenend = len(enddelim) 
117    startdelim_places = find_all(source, startdelim) 
118    if startdelim == enddelim: 
119      enddelim_places = startdelim_places[:]  # same delimiter on both sides: every occurrence is a candidate for both
120    else: 
121      enddelim_places = find_all(source, enddelim) 
122    if escape is not None: 
123      lenescape = len(escape) 
124      escape_places = find_all(source, escape) 
125      last_escape_pos = -1  # NOTE(review): never read in this body
126      # an escape directly preceded by another escape toggles the state, so doubled escapes cancel out
127      true_escape = False 
128      true_escape_places = [] 
129      for escape_pos in escape_places: 
130        if escape_pos - lenescape in escape_places: 
131          true_escape = not true_escape 
132        else: 
133          true_escape = True 
134        if true_escape: 
135          true_escape_places.append(escape_pos) 
136      startdelim_places = [pos for pos in startdelim_places if pos - lenescape not in true_escape_places]  # drop escaped delimiters
137      enddelim_places = [pos + lenend for pos in enddelim_places if pos - lenescape not in true_escape_places]  # end places point just past the delimiter
138    else: 
139      enddelim_places = [pos + lenend for pos in enddelim_places]  # end places point just past the delimiter
140    # positions worth visiting, de-duplicated through dict keys; calling .sort() on .keys() needs keys() to return a list (Python 2)
141    significant_places = dict.fromkeys([0] + startdelim_places + enddelim_places + [len(source)-1]).keys() 
142    significant_places.sort() 
143    extracted = "" 
144    lastpos = 0 
145    callable_includeescapes = callable(includeescapes) 
146    checkescapes = callable_includeescapes or not includeescapes  # a plain true includeescapes leaves the section untouched
147    for pos in significant_places: 
148      if instring and pos in enddelim_places and lastpos != pos - lenstart:  # skip the far edge of the start delimiter just entered
149        section_start, section_end = lastpos + len(startdelim), pos - len(enddelim)  # NOTE(review): with startinstring true lastpos is still 0 here, so the first len(startdelim) characters are skipped - confirm intended
150        section = source[section_start:section_end] 
151        if escape is not None and checkescapes: 
152          escape_list = [epos - section_start for epos in true_escape_places if section_start <= epos <= section_end]  # offsets relative to section
153          new_section = "" 
154          last_epos = 0 
155          for epos in escape_list: 
156            new_section += section[last_epos:epos] 
157            if callable_includeescapes: 
158              replace_escape = includeescapes(section[epos:epos+lenescape+1]) 
159              # a non-string result is treated as a flag: true keeps escape and character, false keeps only the character
160              if not isinstance(replace_escape, basestring): 
161                if replace_escape: 
162                  replace_escape = section[epos:epos+lenescape+1] 
163                else: 
164                  replace_escape = section[epos+lenescape:epos+lenescape+1] 
165              new_section += replace_escape 
166              last_epos = epos + lenescape + 1 
167            else: 
168              last_epos = epos + lenescape  # drop the escape itself, keep what follows
169          section = new_section + section[last_epos:] 
170        extracted += section 
171        instring = False 
172        lastpos = pos 
173      if (not instring) and pos in startdelim_places and not (enteredonce and not allowreentry):  # a second entry needs allowreentry
174        instring = True 
175        enteredonce = True 
176        lastpos = pos 
177    if instring: 
178      section_start = lastpos + len(startdelim) 
179      section = source[section_start:] 
180      if escape is not None and not includeescapes:  # NOTE(review): unlike the loop above, a true (e.g. callable) includeescapes skips this branch, so escapes in an unterminated tail are left as they are
181        escape_list = [epos - section_start for epos in true_escape_places if section_start <= epos] 
182        new_section = "" 
183        last_epos = 0 
184        for epos in escape_list: 
185          new_section += section[last_epos:epos] 
186          if callable_includeescapes and includeescapes(section[epos:epos+lenescape+1]):  # only reachable for a callable that tests false
187            last_epos = epos 
188          else: 
189            last_epos = epos + lenescape 
190        section = new_section + section[last_epos:] 
191      extracted += section 
192    return (extracted, instring) 
 193   
195    "Returns the same string, with double quotes escaped with backslash; if escapeescapes is true, existing backslashes are doubled as well" 
196    if escapeescapes: 
197      return source.replace('\\', '\\\\').replace('"', '\\"')  # backslashes first, so the ones added for quotes are not doubled
198    else: 
199      return source.replace('"','\\"') 
 200   
def escapesinglequotes(source):
  """Return source with every single quote doubled (' becomes '')."""
  return source.replace("'", "''")
 204   
206    """encodes source using HTML entities e.g. © -> &copy;""" 
207    output = "" 
208    for char in source: 
209      charnum = ord(char) 
210      if charnum in htmlentitydefs.codepoint2name:  # characters that have a named entity
211        output += "&%s;" % htmlentitydefs.codepoint2name[charnum] 
212      else: 
213        output += str(char)  # everything else is copied through str()
214    return output 
 215   
217    """decodes source using HTML entities e.g. &copy; -> ©""" 
218    output = u"" 
219    inentity = False 
220    for char in source: 
221      if char == "&": 
222         inentity = True 
223         possibleentity = ""  # NOTE(review): a "&" met while already collecting discards the earlier "&" and its partial name
224         continue 
225      if inentity: 
226        if char == ";": 
227          if len(possibleentity) > 0 and possibleentity in htmlentitydefs.name2codepoint: 
228            output += unichr(htmlentitydefs.name2codepoint[possibleentity]) 
229            inentity = False 
230          else: 
231            output += "&" + possibleentity + ";"  # unknown name: put the text back unchanged
232            inentity = False 
233        elif char == " ": 
234          output += "&" + possibleentity + char  # a space ends the attempt: it was a plain ampersand
235          inentity = False 
236        else: 
237          possibleentity += char 
238      else: 
239        output += char 
240    return output  # NOTE(review): a candidate still open when source ends is not added to output
 241   
243    """encodes source in the escaped-unicode encoding used by Java .properties files""" 
244    output = "" 
245    for char in source: 
246      charnum = ord(char) 
247      if char in controlchars:  # control characters become their backslash sequence
248        output += controlchars[char] 
249      elif 0 <= charnum < 128:  # other 7-bit characters are copied
250        output += str(char) 
251      else: 
252        output += "\\u%04X" % charnum  # the rest becomes \u plus at least four uppercase hex digits
253    return output 
 254   
256    """encodes source in the escaped-unicode encoding used by Mozilla .properties files""" 
257    output = "" 
258    for char in source: 
259      charnum = ord(char)  # NOTE(review): charnum is not used in this body
260      if char in controlchars:  # only the control characters are rewritten
261        output += controlchars[char] 
262      else: 
263        output += char  # everything else, non-ASCII included, is copied as is
264    return output 
 265   
266  propertyescapes = {  # character following a backslash -> character it decodes to
267    # characters that stand for themselves after a backslash
268    "\\": "\\", "'": "'", '"': '"', 
269    # letters that stand for control characters
270    "b": "\b", "f": "\f", "t": "\t", "n": "\n", "v": "\v", "a": "\a" 
271    } 
272   
273  controlchars = { 
274    # control character -> the backslash sequence written out for it when encoding
275    "\b": "\\b", "\f": "\\f", "\t": "\\t", "\n": "\\n", "\v": "\\v" 
276    } 
277   
283   
285    """decodes source from the escaped-unicode encoding used by mozilla .properties files""" 
286    # handled after a backslash: newline (dropped), the keys of propertyescapes, u/U plus hex digits, N{name};
287    # any other escaped character is kept together with its backslash
288     
289    output = u"" 
290    s = 0  # index of the next character of source to read
291    if isinstance(source, str): 
292      source = source.decode("utf-8")  # str has a decode method on Python 2 only
293    def unichr2(i): 
294      """Returns a Unicode string of one character with ordinal 32 <= i, otherwise an escaped control character""" 
295      if 32 <= i: 
296        return unichr(i) 
297      elif unichr(i) in controlchars: 
298        # control characters listed in controlchars are returned as the character itself
299         
300        return unichr(i) 
301      else: 
302        return "\\u%04x" % i  # other control characters stay as a \u escape
 303    while s < len(source): 
304      c = source[s] 
305      if c != '\\': 
306        output += c 
307        s += 1 
308        continue 
309      s += 1 
310      if s >= len(source): 
311        # a backslash that is the very last character is kept as is
312         
313        output += c 
314        continue 
315      c = source[s] 
316      s += 1 
317      if c == '\n': pass 
318      # backslash followed by a newline: both are dropped
319      elif c in propertyescapes: output += propertyescapes[c] 
320      # \u or \U followed by up to four hex digits
321       
322      elif c in "uU": 
323        digits = 4 
324        x = 0 
325        for digit in range(digits): 
326          x <<= 4 
327          if s + digit >= len(source): 
328            digits = digit  # ran out of input: only consume what was read
329            break 
330          c = source[s+digit].lower() 
331          if c.isdigit(): 
332            x += ord(c) - ord('0') 
333          elif c in "abcdef": 
334            x += ord(c) - ord('a') + 10 
335          else: 
336            break  # NOTE(review): x was already shifted and digits is still 4, so s advances by 4 regardless - confirm intended
337        s += digits 
338        output += unichr2(x) 
339      elif c == "N": 
340        if source[s] != "{":  # NOTE(review): no bounds check; \N as the last two characters indexes past the end
341          logging.warn("Invalid named unicode escape: no { after \\N") 
342          output += "\\" + c 
343          continue 
344        s += 1 
345        e = source.find("}", s) 
346        if e == -1: 
347          logging.warn("Invalid named unicode escape: no } after \\N{") 
348          output += "\\" + c  # NOTE(review): s has already moved past the "{", which is not written to output
349          continue 
350        import unicodedata 
351        name = source[s:e] 
352        output += unicodedata.lookup(name)  # the lookup failure for an unknown name is not caught here
353        s = e + 1 
354      else: 
355        output += "\\" + c  # unrecognised escape: keep backslash and character
356    return output 
357   
359    "Returns a doublequote-delimited quoted string, escaping double quotes with backslash; a list is quoted item by item and joined with newlines" 
360    if isinstance(source, list): 
361      firstline = True 
362      for line in source: 
363        if firstline: 
364          newsource = '"' + escapequotes(line, escapeescapes) + '"'      
365          firstline = False 
366        else: 
367          newsource = newsource + '\n' + '"' + escapequotes(line, escapeescapes) + '"' 
368      return newsource  # NOTE(review): newsource is never assigned when source is an empty list
369    else: 
370      return '"' + escapequotes(source, escapeescapes) + '"' 
 371   
373    "Returns a singlequote-delimited quoted string, escaping single quotes with themselves" 
374    return "'" + escapesinglequotes(source) + "'" 
 375   
382   
384    s = string.find(substring)  # index of the first occurrence, or -1
385    if s != -1: 
386      s += len(substring)  # move to just past the match
387    return s  # still -1 when substring does not occur
 388   
390    return string.rstrip("\r\n")  # removes any trailing run of carriage returns and newlines
 391   
400   
403   
405    """encodes certain characters in the string using an encode dictionary (each key found is replaced by its value)""" 
406    encoded = unencoded 
407    for key, value in encodedict.iteritems():  # iteritems exists on Python 2 dicts only
408      if key in encoded: 
409        encoded = encoded.replace(key, value)  # applied one after another, so a later key can match text inserted by an earlier one
410    return encoded 
 411   
413    """convert numbers to utf8 codes in the values of a dictionary; d is changed in place and also returned""" 
414    for key, value in d.items(): 
415      if type(value) == int:  # exact int only; other values are left alone
416        d[key] = unichr(value).encode('utf8') 
417    return d 
 418   
def testcase():
  """Print what extract and extractwithoutquotes return for a sample string.

  The sample holds four double-quoted sections; each call is made once with
  no escape and once with '!' as the escape. Every print receives a single
  value, so the call form prints the same text as the statement form.
  """
  x = ' "this" " is " "a" " test!" '
  print(extract(x, '"', '"', None))
  print(extract(x, '"', '"', '!'))
  print(extractwithoutquotes(x, '"', '"', None))
  print(extractwithoutquotes(x, '"', '"', '!'))
  print(extractwithoutquotes(x, '"', '"', '!', includeescapes=False))
 426   
427  if __name__ == '__main__':  # only when executed as a script, not when imported
428    testcase() 
429