DAVAR-Lab-OCR
DAVAR-Lab-OCR copied to clipboard
LGPMA表格处理
def _parse_span(token):
    """Return the integer span declared by a ``rowspan``/``colspan`` token.

    The value is read from the last run of digits in the token, so tokens
    such as `` colspan="5"``, `` colspan="15"`` and `` colspan="100"`` all
    yield their full value no matter how many digits they have.

    Args:
        token(str): a structure token containing a span attribute.

    Returns:
        int: the span value.

    Raises:
        ValueError: if the token contains no digits.
    """
    import re  # local import: the file's import block is not visible from here

    found = re.search(r"(\d+)\D*$", token)
    if found is None:
        raise ValueError("no span value found in tag: {!r}".format(token))
    return int(found.group(1))


def html_to_area(html_str, row_index, span_matrix):
    """Fill a span matrix describing the table structure from html tokens.

    Every cell found in ``html_str`` is written into ``span_matrix`` in place:
    all positions covered by the same cell receive the same area index
    (1, 2, 3, ... in reading order).

    Args:
        html_str(list): sequence of html structure tokens. A plain cell is the
            token ``"<td>"``; spanning cells are recognised through tokens that
            contain ``"rowspan"`` and/or ``"colspan"`` (e.g. `` colspan="2"``).
        row_index(list): start index in ``html_str`` of each table row.
            NOTE: ``len(html_str)`` is appended to this list in place so that
            the last row has an end boundary.
        span_matrix(np.array): two-dimensional matrix (num_row x num_col) that
            is modified in place; unfilled positions must hold ``-1``.

    Returns:
        int: status code of the conversion.
            0 - the html is legal and ``span_matrix`` is completely filled.
            1 - a row has more columns than the matrix, or some positions are
                still ``-1`` after all rows were processed.
            2 - a spanning cell exceeds the table boundary.
            3 - a spanning cell overlaps an already occupied position.

    Raises:
        ValueError: if a span token carries no numeric value.
    """
    num_row, num_col = span_matrix.shape[0], span_matrix.shape[1]
    area_index = 1
    row_index.append(len(html_str))  # sentinel marking the end of the last row

    for i in range(num_row):
        col_index = 0  # column where the next cell of this row starts
        skip_next = False  # True when the next token was already consumed
        html_cur_row = html_str[row_index[i]:row_index[i + 1]]
        last = len(html_cur_row) - 1

        for ind, tag in enumerate(html_cur_row):
            if skip_next:
                skip_next = False
                continue
            # Only cell tokens carry structure information.
            if tag != "<td>" and "span" not in tag:
                continue
            if col_index > num_col - 1:
                return 1  # this row has more columns than the matrix
            # Skip positions already taken by a row-spanning cell from above.
            while span_matrix[i, col_index] != -1:
                if col_index == num_col - 1:
                    return 1
                col_index += 1

            if tag == "<td>":
                row, col = 1, 1
            else:
                next_tag = html_cur_row[ind + 1] if ind != last else ""
                if "rowspan" in tag and "colspan" in next_tag:
                    row, col = _parse_span(tag), _parse_span(next_tag)
                    skip_next = True
                elif "colspan" in tag and "rowspan" in next_tag:
                    # Same cell with the attributes in the opposite order.
                    col, row = _parse_span(tag), _parse_span(next_tag)
                    skip_next = True
                elif "colspan" in tag:
                    row, col = 1, _parse_span(tag)
                elif "rowspan" in tag:
                    row, col = _parse_span(tag), 1
                else:
                    # Token contains "span" but is neither rowspan nor colspan:
                    # nothing is written, only the area counter advances.
                    area_index += 1
                    continue

            # Validate before writing so a rejected cell never alters the matrix.
            if i + row > num_row or col_index + col > num_col:
                return 2  # spanning cell exceeds the table boundary
            if (span_matrix[i:i + row, col_index:col_index + col] != -1).any():
                return 3  # overlay between cells
            span_matrix[i:i + row, col_index:col_index + col] = area_index
            col_index += col
            area_index += 1

    # Any remaining -1 means some row has fewer columns than the matrix.
    return 1 if (span_matrix == -1).any() else 0
函数中的以下代码片段:如果两个跨列单元格的列数之和正好等于总列数,就会被视为异常,这是不是有问题?另外,tag[-3:-1] 的取值也有问题:对于类似 " colspan="15"" 这样大于 10 的值,只取到了 5,没有取到 1。
# only "colspan"
elif "colspan" in tag:
col = int(tag[-3:-1]) if tag[-3:-1].isdigit() else int(tag[-2])
if col_index + col > num_col:
return 2