DAVAR-Lab-OCR icon indicating copy to clipboard operation
DAVAR-Lab-OCR copied to clipboard

LGPMA表格处理

Open cqray1990 opened this issue 3 months ago • 0 comments

def html_to_area(html_str, row_index, span_matrix): """Convert html to span matrix, a two-dimensional matrix representing table structure

Args:
    html_str(str): html representing table structure.
    row_index(list): index of each row in html.
    span_matrix(np.array): a two-dimensional matrix representing table structure.

Returns:
    np.array(num_row x num_col): span matrix
"""

num_row, num_col = span_matrix.shape[0], span_matrix.shape[1]

staus = 0  # whether the given html is illegal
area_index = 1
row_index.append(len(html_str))
for i in range(num_row):
    col_index = 0  # record column number of the current row
    spantogether = 0
    html_cur_row = html_str[row_index[i]:row_index[i + 1]]

    for ind, tag in enumerate(html_cur_row):
        if spantogether:
            spantogether = 0
            continue
        # if cur tag is not key information,continue
        if tag != "<td>" and "span" not in tag:
            continue

        if col_index > num_col - 1:
            return 1  # The column of current row exceeds the column of the first row
        # current cell is a part of row span cell
        while span_matrix[i, col_index] != -1:
            if col_index == num_col - 1:
                return 1
            else:
                col_index += 1

        # basic cell
        if tag == "<td>":
            span_matrix[i, col_index] = area_index
            col_index += 1
        # "rowspan" and "colspan" together
        elif "rowspan" in tag and (ind != len(html_cur_row) - 1 and "colspan" in html_cur_row[ind + 1]):
            row = int(tag[-3:-1]) if tag[-3:-1].isdigit() else int(tag[-2])
            col = int(html_cur_row[ind + 1][-3:-1]) \
                if html_cur_row[ind + 1][-3:-1].isdigit() else int(html_cur_row[ind + 1][-2])
            spantogether = 1  # the next span will be skipped
            if (span_matrix[i:i + row, col_index:col_index + col] != -1).any():
                return 3  # Overlay between cells
            span_matrix[i:i + row, col_index:col_index + col] = area_index
            if i + row > span_matrix.shape[0] or col_index + col > span_matrix.shape[1]:
                return 2  # Spanning cell exceeds the table boundary
            col_index += col
        # only "colspan"
        elif "colspan" in tag:
            col = int(tag[-3:-1]) if tag[-3:-1].isdigit() else int(tag[-2])
            if col_index + col > num_col:
                return 2
            if (span_matrix[i, col_index:col_index + col] != -1).any():
                return 3
            span_matrix[i, col_index:col_index + col] = area_index
            col_index += col
        # only "rowspan"
        elif "rowspan" in tag:
            row = int(tag[-3:-1]) if tag[-3:-1].isdigit() else int(tag[-2])
            if i + row > num_row:
                return 2
            if (span_matrix[i:i + row, col_index] != -1).any():
                return 3
            span_matrix[i:i + row, col_index] = area_index
            col_index += 1
        area_index += 1
if -1 in span_matrix:
    staus = 1  # The column number of some rows is smaller than the column number of the first row

return staus

函数中以下代码片段,如果正好两个跨列之和等于总列数,就被视为异常了,不是有问题?还有tag[-3:-1]的取值 如果是类似 " colspan="15"" 大于10 的值只取了一个5,1没取道

 # only "colspan"
        elif "colspan" in tag:
            col = int(tag[-3:-1]) if tag[-3:-1].isdigit() else int(tag[-2])
            if col_index + col > num_col:
                return 2

cqray1990 avatar Mar 12 '24 15:03 cqray1990