AdvancedLiterateMachinery
AdvancedLiterateMachinery copied to clipboard
postprocessing of the result
I wonder if there is any script that can help to reconstruct text blocks by concatenating the text_list entries
with respect to their polygon positions. Is there any such post-processing analysis that can create a comprehensive JSON file?
My approach is to create a CSV file:
import json
import csv
# Flatten a DocXChain-style JSON result into one CSV row per recognized text
# item, keeping the page number, the text, the four polygon corners and the
# enclosing region's category and polygon.
input_json_file = '20213684_DocXChain.json'
output_csv_file = 'compiled_text.csv'

# Load JSON data. The encoding is stated explicitly so non-ASCII text is
# decoded the same way on every platform, matching the UTF-8 CSV written below.
with open(input_json_file, 'r', encoding='utf-8') as file:
    json_data = json.load(file)

# Prepare data for CSV
csv_data = []
for page_data in json_data:
    page_number = page_data['page']
    for info in page_data['information']:
        category_name = info['category_name']
        region_poly = info['region_poly']
        for text_info in info['text_list']:
            # NOTE(review): assumes 'content' is a list of strings; if it were
            # a single string, join would insert a space between characters.
            content = " ".join(text_info['content'])
            # 'position' must hold exactly eight numbers (x1, y1, ..., x4, y4);
            # any other length raises ValueError on unpacking.
            pos_x1, pos_y1, pos_x2, pos_y2, pos_x3, pos_y3, pos_x4, pos_y4 = text_info['position']
            csv_data.append([
                page_number, content,
                pos_x1, pos_y1, pos_x2, pos_y2, pos_x3, pos_y3, pos_x4, pos_y4,
                category_name, region_poly,
            ])

# Write data to CSV
with open(output_csv_file, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Page Number', 'Content', 'PosX1', 'PosY1', 'PosX2', 'PosY2', 'PosX3', 'PosY3', 'PosX4', 'PosY4', 'Category Name', 'Region Poly'])
    writer.writerows(csv_data)

print(f"CSV file written to {output_csv_file}")
Then I sort it like this:
import pandas as pd

# Read the flattened CSV back and put the text items into reading order.
aa = pd.read_csv(output_csv_file)

# The CSV written above has no 'ColumnSide' column, so sorting by it raises
# KeyError. Derive it when absent.
# NOTE(review): this is a two-column heuristic (0 = left, 1 = right), decided
# by whether a box starts left or right of the page's horizontal midpoint.
# Confirm that it matches the actual page layout.
if 'ColumnSide' not in aa.columns:
    x_cols = ['PosX1', 'PosX2', 'PosX3', 'PosX4']
    left_edge = aa[x_cols].min(axis=1)
    right_edge = aa[x_cols].max(axis=1)
    pages = aa['Page Number']
    page_mid_x = (
        left_edge.groupby(pages).transform('min')
        + right_edge.groupby(pages).transform('max')
    ) / 2
    aa['ColumnSide'] = (left_edge >= page_mid_x).astype(int)

# Sort by page first so rows from different pages do not interleave, then by
# column, then top-to-bottom, with the x position as a tie-breaker.
sorted_aa = aa.sort_values(
    by=['Page Number', 'ColumnSide', 'PosY1', 'PosX1'],
    ascending=[True, True, True, True],
)
sorted_aa
but the resulting order is not correct.