dataset-examples
dataset-examples copied to clipboard
json_to_csv_converter.py fixed for Python 3
"""Convert the Yelp Dataset Challenge dataset from json format to csv.
For more information on the Yelp Dataset Challenge please visit http://yelp.com/dataset_challenge
"""
import argparse
import collections
import collections.abc
import csv

import simplejson as json
def read_and_write_file(json_file_path, csv_file_path, column_names):
    """Read in the json dataset file and write it out to a csv file, given the column names.

    The input is read as one JSON object per line. The first csv row is the
    header (the column names); every following row holds the values of one
    JSON object, in the same column order.

    Args:
        json_file_path: Path of the line-delimited JSON file to read.
        csv_file_path: Path of the csv file to create (overwritten if present).
        column_names: Iterable of flattened column names (e.g. 'a.b').
    """
    # Freeze the column order once so the header and every row agree on it,
    # even when `column_names` is a set or a one-shot iterable.
    columns = list(column_names)
    # newline='' lets the csv module control line endings (otherwise extra
    # blank rows appear on Windows); the explicit encoding avoids depending
    # on the platform's default codec.
    with open(csv_file_path, 'w', encoding='utf-8', newline='') as fout:
        csv_file = csv.writer(fout)
        csv_file.writerow(columns)
        with open(json_file_path, encoding='utf-8') as fin:
            for line in fin:
                if not line.strip():
                    # Tolerate blank lines instead of failing in json.loads.
                    continue
                line_contents = json.loads(line)
                csv_file.writerow(get_row(line_contents, columns))
def get_superset_of_column_names_from_file(json_file_path):
    """Read in the json dataset file and return the superset of column names.

    Each line of the file is parsed as one JSON object, flattened with
    `get_column_names`, and its column names are added to the result.

    Args:
        json_file_path: Path of the line-delimited JSON file to scan.

    Returns:
        A set containing every flattened column name seen in the file.
    """
    column_names = set()
    # Explicit encoding: do not depend on the platform's default codec.
    with open(json_file_path, encoding='utf-8') as fin:
        for line in fin:
            if not line.strip():
                # Tolerate blank lines instead of failing in json.loads.
                continue
            line_contents = json.loads(line)
            # Updating a set from a dict adds the dict's keys.
            column_names.update(get_column_names(line_contents))
    return column_names
def get_column_names(line_contents, parent_key=''):
    """Return a dict mapping flattened key names to their values.

    Nested mappings are flattened by joining the keys with '.'.

    Example:
        line_contents = {
            'a': {
                'b': 2,
                'c': 3,
            },
        }
        will return: {'a.b': 2, 'a.c': 3}

    The keys of the returned dict will be the column names for the eventual
    csv file.

    Args:
        line_contents: The (possibly nested) dict to flatten.
        parent_key: Prefix prepended (with a '.') to every key; used by the
            recursive calls for nested dicts.

    Returns:
        A flat dict of {column_name: value}. A nested mapping that is empty
        contributes no column.
    """
    flattened = {}
    for k, v in line_contents.items():
        column_name = "{0}.{1}".format(parent_key, k) if parent_key else k
        # The ABCs live in collections.abc; the old `collections.MutableMapping`
        # alias was removed in Python 3.10.
        if isinstance(v, collections.abc.MutableMapping):
            flattened.update(get_column_names(v, column_name))
        else:
            flattened[column_name] = v
    return flattened
def get_nested_value(d, key):
    """Return a dictionary item given a dictionary `d` and a flattened key from `get_column_names`.

    Example:
        d = {
            'a': {
                'b': 2,
                'c': 3,
            },
        }
        key = 'a.b'
        will return: 2

    Args:
        d: The (possibly nested) dict to look in.
        key: A flattened key whose parts are separated by '.'.

    Returns:
        The value found at `key`, or None when any part of the path is
        missing or an intermediate value is not a mapping.
    """
    current = d
    for part in key.split('.'):
        # An intermediate value may be None, a string, a number, ... when
        # records disagree on whether an attribute is nested; treat that as
        # "no value" rather than raising TypeError.
        if not isinstance(current, collections.abc.Mapping):
            return None
        if part not in current:
            return None
        current = current[part]
    return current
def get_row(line_contents, column_names):
    """Return a csv compatible row given column names and a dict.

    Args:
        line_contents: The (possibly nested) dict for one record.
        column_names: Iterable of flattened column names (e.g. 'a.b').

    Returns:
        A list with one entry per column name, in the same order. A column
        whose value is missing or None is represented by an empty string.
    """
    row = []
    for column_name in column_names:
        line_value = get_nested_value(line_contents, column_name)
        # None is written as an empty cell; every other value is passed
        # through unchanged and left to the csv writer to format.
        row.append('' if line_value is None else line_value)
    return row
if __name__ == '__main__':
    # Convert a yelp dataset file from json to csv.
    # The csv file is written next to the json file, with the trailing
    # '.json' replaced by '.csv'.
    parser = argparse.ArgumentParser(
        description='Convert Yelp Dataset Challenge data from JSON format to CSV.',
    )
    parser.add_argument(
        'json_file',
        type=str,
        help='The json file to convert.',
    )
    args = parser.parse_args()
    json_file = args.json_file

    # Strip only a trailing '.json'; splitting on the first occurrence would
    # truncate paths that contain '.json' earlier (e.g. in a directory name).
    json_suffix = '.json'
    if json_file.endswith(json_suffix):
        base_name = json_file[:-len(json_suffix)]
    else:
        base_name = json_file
    csv_file = '{0}.csv'.format(base_name)

    # Sort the column names so the csv column order is the same on every run
    # (iteration order of a set of strings is not stable between runs).
    column_names = sorted(get_superset_of_column_names_from_file(json_file))
    read_and_write_file(json_file, csv_file, column_names)
Thank you! Also, it worked for me only after I changed line 26 and line 17 to this: with open(json_file_path, encoding="utf-8") as fin:
@mattdee @meenurajapandian After making the above changes I get this:
UnicodeEncodeError: 'charmap' codec can't encode character '\x9c' in position 112: character maps to <undefined>
any thoughts on how to fix this?
@mattdee @meenurajapandian After making the above changes I get this:
UnicodeEncodeError: 'charmap' codec can't encode character '\x9c' in position 112: character maps to <undefined>
any thoughts on how to fix this?
You need to include the encoding in every place where you have used the open() function. There are 3 places where it needs to be included.
Totally agree with @meenurajapandian , in this case, encoding = 'latin1' worked fine for me.
@meenurajapandian, I have Python 3.6 and am a newbie to this language. Can you please help me with how to set the JSON file path? I want to convert the Yelp dataset. Please help with the full syntax; I would be very thankful. I need fully running code.
A slightly simpler version of the JSON-to-CSV conversion script can be found here: https://github.com/CAVIND46016/Yelp-Dataset-Analysis/blob/master/json_to_csv.py I have hard-coded the paths, so it should be simple to follow...
@meenurajapandian, I have Python 3.6 and am a newbie to this language. Can you please help me with how to set the JSON file path? I want to convert the Yelp dataset. Please help with the full syntax; I would be very thankful. I need fully running code.
From the terminal, you can run the command
$ python json_to_csv_converter.py yelp_academic_dataset.json
if the file is in the current directory; otherwise, give the path to the file instead:
$ python json_to_csv_converter.py path/yelp_academic_dataset.json
This is given in the documentation of the code.