refine-client-py
refine-client-py copied to clipboard
new_project fails for xls, xlsx and ods with OpenRefine >=2.8
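OpenRefine 2.8 introduced a new feature for selecting sheets in the importer. As a result, the format of the "sheets" import option changed from a list of sheet indices to a list of sheet descriptor objects: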
OpenRefine 2.7:
{
"sheets": [
0
]
}
OpenRefine 2.8:
{
"sheets": [
{
"name": "duplicates.xls#duplicates",
"fileNameAndSheetIndex": "duplicates.xls#0",
"rows": 11,
"selected": true
}
]
}
Calling new_project() with the new-style sheets option fails. The project is created but contains 0 rows, so constructing the RefineProject (in get_models()) raises the exception KeyError: 'keyColumnName'.
In:
from google.refine import refine
server1 = refine.Refine('http://localhost:3333')
project1 = server1.new_project(
project_file='data/cli/duplicates.xls',
project_format='binary/text/xml/xls/xlsx',
sheets=[{
'name': 'duplicates.xls#duplicates',
'fileNameAndSheetIndex': 'duplicates.xls#0',
'rows': 11,
'selected': True,
}]
)
Out:
KeyError Traceback (most recent call last)
<ipython-input-16-4ce682cb870d> in <module>()
----> 1 project1 = server1.new_project(project_file='data/cli/duplicates.xls', project_format='binary/text/xml/xls/xlsx', sheets=[{"name":"duplicates.xls#duplicates","fileNameAndSheetIndex":"duplicates.xls#0","rows":11,"selected":True}])
/home/felix/.local/lib/python2.7/site-packages/google/refine/refine.pyc in new_project(self, project_file, project_url, project_name, project_format, encoding, separator, ignore_lines, header_lines, skip_data_lines, limit, store_blank_rows, guess_cell_value_types, process_quotes, store_blank_cells_as_nulls, include_file_sources, **opts)
277 if 'project' in url_params:
278 project_id = url_params['project'][0]
--> 279 return RefineProject(self.server, project_id)
280 else:
281 raise Exception('Project not created')
/home/felix/.local/lib/python2.7/site-packages/google/refine/refine.pyc in __init__(self, server, project_id)
354 self.column_order = {} # map of column names to order in UI
355 self.rows_response_factory = None # for parsing get_rows()
--> 356 self.get_models()
357 # following filled in by get_reconciliation_services
358 self.recon_services = None
/home/felix/.local/lib/python2.7/site-packages/google/refine/refine.pyc in get_models(self)
400 self.column_order[name] = i
401 column_index[name] = column['cellIndex']
--> 402 self.key_column = column_model['keyColumnName']
403 self.has_records = response['recordModel'].get('hasRecords', False)
404 self.rows_response_factory = RowsResponseFactory(column_index)
KeyError: 'keyColumnName'