tablib
tablib copied to clipboard
IllegalCharacterError raised when exporting xlsx
Using tablib version 0.13.0 The issue is with Control Characters and Surrogates
Steps to reproduce:
from tablib import Dataset
data=Dataset((u'\x1f',),)
data.export('xlsx')
Stack Trace
IllegalCharacterErrorTraceback (most recent call last)
<ipython-input-1-9b9694f739ff> in <module>()
1 from tablib import Dataset
2 data=Dataset((u'\x1f',),)
----> 3 data.export('xlsx')
/srv/jupyter/local/lib/python2.7/site-packages/tablib/core.pyc in export(self, format, **kwargs)
466 raise UnsupportedFormat('Format {0} cannot be exported.'.format(format))
467
--> 468 return export_set(self, **kwargs)
469
470 # -------
/srv/jupyter/local/lib/python2.7/site-packages/tablib/formats/_xlsx.pyc in export_set(dataset, freeze_panes)
41 ws.title = dataset.title if dataset.title else 'Tablib Dataset'
42
---> 43 dset_sheet(dataset, ws, freeze_panes=freeze_panes)
44
45 stream = BytesIO()
/srv/jupyter/local/lib/python2.7/site-packages/tablib/formats/_xlsx.pyc in dset_sheet(dataset, ws, freeze_panes)
145 cell.value = unicode('%s' % col, errors='ignore')
146 except TypeError:
--> 147 cell.value = unicode(col)
/srv/jupyter/local/lib/python2.7/site-packages/openpyxl/cell/cell.pyc in value(self, value)
292 def value(self, value):
293 """Set the value and infer type and display options."""
--> 294 self._bind_value(value)
295
296 @property
/srv/jupyter/local/lib/python2.7/site-packages/openpyxl/cell/cell.pyc in _bind_value(self, value)
195
196 elif isinstance(value, STRING_TYPES):
--> 197 value = self.check_string(value)
198 self.data_type = self.TYPE_STRING
199 if len(value) > 1 and value.startswith("="):
/srv/jupyter/local/lib/python2.7/site-packages/openpyxl/cell/cell.pyc in check_string(self, value)
158 value = value[:32767]
159 if next(ILLEGAL_CHARACTERS_RE.finditer(value), None):
--> 160 raise IllegalCharacterError
161 return value
162
IllegalCharacterError:
Also getting this. Openpyxl detects illegal characters with the following regex:
ILLEGAL_CHARACTERS_RE = re.compile(r'[\000-\010]|[\013-\014]|[\016-\037]')
I'm using django-import-export, which in turn uses tablib, which uses openpyxl. Still trying to figure out who should handle the data cleaning.
I fixed the issue by cleaning the data before it gets sent to tablib.
If anyone else is having this issue with django-import-export, you can clean your fields by overriding export_field on your resource.
from import_export import resources
from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE
class CleanModelResource(resources.ModelResource):
def export_field(self, field, obj):
v = super(CleanModelResource, self).export_field(field, obj)
if type(v) == str:
v = ILLEGAL_CHARACTERS_RE.sub('', v)
return v