1   
  2   
  3  """ 
  4  csv.py - read/write/investigate CSV files 
  5  """ 
  6   
  7  import re 
  8  from _csv import Error, __version__, writer, reader, register_dialect, \ 
  9                   unregister_dialect, get_dialect, list_dialects, \ 
 10                   QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \ 
 11                   __doc__ 
 12   
 13  try: 
 14      from cStringIO import StringIO 
 15  except ImportError: 
 16      from StringIO import StringIO 
 17   
 18  __all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE", 
 19              "Error", "Dialect", "excel", "excel_tab", "reader", "writer", 
 20              "register_dialect", "get_dialect", "list_dialects", "Sniffer", 
 21              "unregister_dialect", "__version__", "DictReader", "DictWriter" ] 
 22   
 24      _name = "" 
 25      _valid = False 
 26       
 27      delimiter = None 
 28      quotechar = None 
 29      escapechar = None 
 30      doublequote = None 
 31      skipinitialspace = None 
 32      lineterminator = None 
 33      quoting = None 
 34   
 36          if self.__class__ != Dialect: 
 37              self._valid = True 
 38          errors = self._validate() 
 39          if errors != []: 
 40              raise Error, "Dialect did not validate: %s" % ", ".join(errors) 
  41   
 43          errors = [] 
 44          if not self._valid: 
 45              errors.append("can't directly instantiate Dialect class") 
 46   
 47          if self.delimiter is None: 
 48              errors.append("delimiter character not set") 
 49          elif (not isinstance(self.delimiter, str) or 
 50                len(self.delimiter) > 1): 
 51              errors.append("delimiter must be one-character string") 
 52   
 53          if self.quotechar is None: 
 54              if self.quoting != QUOTE_NONE: 
 55                  errors.append("quotechar not set") 
 56          elif (not isinstance(self.quotechar, str) or 
 57                len(self.quotechar) > 1): 
 58              errors.append("quotechar must be one-character string") 
 59   
 60          if self.lineterminator is None: 
 61              errors.append("lineterminator not set") 
 62          elif not isinstance(self.lineterminator, str): 
 63              errors.append("lineterminator must be a string") 
 64   
 65          if self.doublequote not in (True, False): 
 66              errors.append("doublequote parameter must be True or False") 
 67   
 68          if self.skipinitialspace not in (True, False): 
 69              errors.append("skipinitialspace parameter must be True or False") 
 70   
 71          if self.quoting is None: 
 72              errors.append("quoting parameter not set") 
 73   
 74          if self.quoting is QUOTE_NONE: 
 75              if (not isinstance(self.escapechar, (unicode, str)) or 
 76                  len(self.escapechar) > 1): 
 77                  errors.append("escapechar must be a one-character string or unicode object") 
 78   
 79          return errors 
   80   
 88  register_dialect("excel", excel) 
 89   
 92  register_dialect("excel-tab", excel_tab) 
 93   
 94   
 96 -    def __init__(self, f, fieldnames, restkey=None, restval=None, 
 97                   dialect="excel", *args, **kwds): 
  98          self.fieldnames = fieldnames     
 99          self.restkey = restkey           
100          self.restval = restval           
101          self.reader = reader(f, dialect, *args, **kwds) 
 102   
105   
107          row = self.reader.next() 
108           
109           
110           
111          while row == []: 
112              row = self.reader.next() 
113          d = dict(zip(self.fieldnames, row)) 
114          lf = len(self.fieldnames) 
115          lr = len(row) 
116          if lf < lr: 
117              d[self.restkey] = row[lf:] 
118          elif lf > lr: 
119              for key in self.fieldnames[lr:]: 
120                  d[key] = self.restval 
121          return d 
 125 -    def __init__(self, f, fieldnames, restval="", extrasaction="raise", 
126                   dialect="excel", *args, **kwds): 
 127          self.fieldnames = fieldnames     
128          self.restval = restval           
129          if extrasaction.lower() not in ("raise", "ignore"): 
130              raise ValueError, \ 
131                    ("extrasaction (%s) must be 'raise' or 'ignore'" % 
132                     extrasaction) 
133          self.extrasaction = extrasaction 
134          self.writer = writer(f, dialect, *args, **kwds) 
 135   
137          if self.extrasaction == "raise": 
138              for k in rowdict.keys(): 
139                  if k not in self.fieldnames: 
140                      raise ValueError, "dict contains fields not in fieldnames" 
141          return [rowdict.get(key, self.restval) for key in self.fieldnames] 
 142   
145   
159      ''' 
160      "Sniffs" the format of a CSV file (i.e. delimiter, quotechar) 
161      Returns a Dialect object. 
162      ''' 
164           
165          self.preferred = [',', '\t', ';', ' ', ':'] 
 166   
167   
168 -    def sniff(self, sample, delimiters=None): 
 185   
186          dialect.delimiter = delimiter 
187           
188          dialect.quotechar = quotechar or '"' 
189          dialect.skipinitialspace = skipinitialspace 
190   
191          return dialect 
 192   
193   
195          """ 
196          Looks for text enclosed between two identical quotes 
197          (the probable quotechar) which are preceded and followed 
198          by the same character (the probable delimiter). 
199          For example: 
200                           ,'some text', 
201          The quote with the most wins, same with the delimiter. 
202          If there is no quotechar the delimiter can't be determined 
203          this way. 
204          """ 
205   
206          matches = [] 
207          for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)',  
208                        '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',    
209                        '(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',   
210                        '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                             
211              regexp = re.compile(restr, re.S | re.M) 
212              matches = regexp.findall(data) 
213              if matches: 
214                  break 
215   
216          if not matches: 
217              return ('', None, 0)  
218   
219          quotes = {} 
220          delims = {} 
221          spaces = 0 
222          for m in matches: 
223              n = regexp.groupindex['quote'] - 1 
224              key = m[n] 
225              if key: 
226                  quotes[key] = quotes.get(key, 0) + 1 
227              try: 
228                  n = regexp.groupindex['delim'] - 1 
229                  key = m[n] 
230              except KeyError: 
231                  continue 
232              if key and (delimiters is None or key in delimiters): 
233                  delims[key] = delims.get(key, 0) + 1 
234              try: 
235                  n = regexp.groupindex['space'] - 1 
236              except KeyError: 
237                  continue 
238              if m[n]: 
239                  spaces += 1 
240   
241          quotechar = reduce(lambda a, b, quotes = quotes: 
242                             (quotes[a] > quotes[b]) and a or b, quotes.keys()) 
243   
244          if delims: 
245              delim = reduce(lambda a, b, delims = delims: 
246                             (delims[a] > delims[b]) and a or b, delims.keys()) 
247              skipinitialspace = delims[delim] == spaces 
248              if delim == '\n':  
249                  delim = '' 
250          else: 
251               
252              delim = '' 
253              skipinitialspace = 0 
254   
255          return (quotechar, delim, skipinitialspace) 
 256   
257   
259          """ 
260          The delimiter /should/ occur the same number of times on 
261          each row. However, due to malformed data, it may not. We don't want 
262          an all or nothing approach, so we allow for small variations in this 
263          number. 
264            1) build a table of the frequency of each character on every line. 
265            2) build a table of freqencies of this frequency (meta-frequency?), 
266               e.g.  'x occurred 5 times in 10 rows, 6 times in 1000 rows, 
267               7 times in 2 rows' 
268            3) use the mode of the meta-frequency to determine the /expected/ 
269               frequency for that character 
270            4) find out how often the character actually meets that goal 
271            5) the character that best meets its goal is the delimiter 
272          For performance reasons, the data is evaluated in chunks, so it can 
273          try and evaluate the smallest portion of the data possible, evaluating 
274          additional chunks as necessary. 
275          """ 
276   
277          data = filter(None, data.split('\n')) 
278   
279          ascii = [chr(c) for c in range(127)]  
280   
281           
282          chunkLength = min(10, len(data)) 
283          iteration = 0 
284          charFrequency = {} 
285          modes = {} 
286          delims = {} 
287          start, end = 0, min(chunkLength, len(data)) 
288          while start < len(data): 
289              iteration += 1 
290              for line in data[start:end]: 
291                  for char in ascii: 
292                      metaFrequency = charFrequency.get(char, {}) 
293                       
294                      freq = line.strip().count(char) 
295                       
296                      metaFrequency[freq] = metaFrequency.get(freq, 0) + 1 
297                      charFrequency[char] = metaFrequency 
298   
299              for char in charFrequency.keys(): 
300                  items = charFrequency[char].items() 
301                  if len(items) == 1 and items[0][0] == 0: 
302                      continue 
303                   
304                  if len(items) > 1: 
305                      modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b, 
306                                           items) 
307                       
308                       
309                      items.remove(modes[char]) 
310                      modes[char] = (modes[char][0], modes[char][1] 
311                                     - reduce(lambda a, b: (0, a[1] + b[1]), 
312                                              items)[1]) 
313                  else: 
314                      modes[char] = items[0] 
315   
316               
317              modeList = modes.items() 
318              total = float(chunkLength * iteration) 
319               
320              consistency = 1.0 
321               
322              threshold = 0.9 
323              while len(delims) == 0 and consistency >= threshold: 
324                  for k, v in modeList: 
325                      if v[0] > 0 and v[1] > 0: 
326                          if ((v[1]/total) >= consistency and 
327                              (delimiters is None or k in delimiters)): 
328                              delims[k] = v 
329                  consistency -= 0.01 
330   
331              if len(delims) == 1: 
332                  delim = delims.keys()[0] 
333                  skipinitialspace = (data[0].count(delim) == 
334                                      data[0].count("%c " % delim)) 
335                  return (delim, skipinitialspace) 
336   
337               
338              start = end 
339              end += chunkLength 
340   
341          if not delims: 
342              return ('', 0) 
343   
344           
345          if len(delims) > 1: 
346              for d in self.preferred: 
347                  if d in delims.keys(): 
348                      skipinitialspace = (data[0].count(d) == 
349                                          data[0].count("%c " % d)) 
350                      return (d, skipinitialspace) 
351   
352           
353          delim = delims.keys()[0] 
354          skipinitialspace = (data[0].count(delim) == 
355                              data[0].count("%c " % delim)) 
356          return (delim, skipinitialspace) 
 357   
358   
360           
361           
362           
363           
364           
365           
366           
367           
368   
369          rdr = reader(StringIO(sample), self.sniff(sample)) 
370   
371          header = rdr.next()  
372   
373          columns = len(header) 
374          columnTypes = {} 
375          for i in range(columns): columnTypes[i] = None 
376   
377          checked = 0 
378          for row in rdr: 
379               
380              if checked > 20: 
381                  break 
382              checked += 1 
383   
384              if len(row) != columns: 
385                  continue  
386   
387              for col in columnTypes.keys(): 
388   
389                  for thisType in [int, long, float, complex]: 
390                      try: 
391                          thisType(row[col]) 
392                          break 
393                      except (ValueError, OverflowError): 
394                          pass 
395                  else: 
396                       
397                      thisType = len(row[col]) 
398   
399                   
400                  if thisType == long: 
401                      thisType = int 
402   
403                  if thisType != columnTypes[col]: 
404                      if columnTypes[col] is None:  
405                          columnTypes[col] = thisType 
406                      else: 
407                           
408                           
409                          del columnTypes[col] 
410   
411           
412           
413          hasHeader = 0 
414          for col, colType in columnTypes.items(): 
415              if type(colType) == type(0):  
416                  if len(header[col]) != colType: 
417                      hasHeader += 1 
418                  else: 
419                      hasHeader -= 1 
420              else:  
421                  try: 
422                      colType(header[col]) 
423                  except (ValueError, TypeError): 
424                      hasHeader += 1 
425                  else: 
426                      hasHeader -= 1 
427   
428          return hasHeader > 0 
 429