tokenizers
Adding doc_stride while preprocessing the data for Question Answering
I am trying to modify the following function for preprocessing a multilingual question answering dataset. Could someone please explain how to add doc_stride during preprocessing, since the context paragraphs are really long? I am finding it difficult to understand and implement the whole concept on my own. Thanks in advance to everyone who takes a look at this thread.
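From what I can make out of the tokenizers documentation, enabling truncation with a stride and reading the overflowing encodings should give overlapping windows over the context. This is only a rough sketch of my understanding (the vocab file, lengths and example strings are made up, and I have not verified it on my data):

from tokenizers import BertWordPieceTokenizer

# assumed setup, just for illustration: a WordPiece vocab file and made-up lengths
tokenizer = BertWordPieceTokenizer("vocab.txt", lowercase=False)
max_len, doc_stride = 384, 128

# truncate only the context (the second sequence) and keep doc_stride tokens
# of overlap between consecutive windows
tokenizer.enable_truncation(max_length=max_len, stride=doc_stride, strategy="only_second")
tokenizer.enable_padding(length=max_len)

question = "Where is the Eiffel Tower?"
context = "The Eiffel Tower is in Paris. " * 300  # stand-in for a very long paragraph

encoding = tokenizer.encode(question, context)
# as far as I understand, each overflowing encoding repeats the question
# together with the next overlapping chunk of the context
windows = [encoding] + encoding.overflowing
for w in windows:
    print(len(w.ids), w.offsets[:3])

What I cannot figure out is how to wire these windows into the label computation in my current function, which is below: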
def process_data(context, question, answer, tokenizer, max_len):
    len_ans = len(answer)
    idx0 = None
    idx1 = None
    # get the character start and end index of the answer inside the context string
    for index in (i for i, e in enumerate(context) if e == answer[0]):
        if context[index: index + len_ans] == answer:
            idx0 = index
            idx1 = index + len_ans - 1
            break
    # mark the characters of the context that belong to the answer with 1; everything else stays 0
    char_targets = [0] * len(context)
    if idx0 is not None and idx1 is not None:
        for c in range(idx0, idx1 + 1):
            char_targets[c] = 1
    # tokenization of the context string (dropping the special tokens at both ends)
    tokenized_context = tokenizer.encode(context)
    original_context_input_ids = tokenized_context.ids[1:-1]
    context_offsets = tokenized_context.offsets[1:-1]
    # collect the indices of the context tokens whose character offsets overlap the answer
    target_indexes = []
    for j, (offset1, offset2) in enumerate(context_offsets):
        if sum(char_targets[offset1:offset2]) > 0:
            target_indexes.append(j)
    # start and end token index of the answer
    targets_start = target_indexes[0]
    targets_end = target_indexes[-1]
    # tokenization of the question string (dropping the special tokens at both ends)
    tokenized_question = tokenizer.encode(question)
    original_question_input_ids = tokenized_question.ids[1:-1]
    question_offsets = tokenized_question.offsets[1:-1]
    input_ids = [101] + original_question_input_ids + [102] + original_context_input_ids + [102]  # [101] and [102] are the [CLS] and [SEP] token ids
    token_type_ids = [0] * (len(original_question_input_ids) + 2) + [1] * (len(original_context_input_ids) + 1)  # +2 is for the [CLS] and [SEP] tokens, +1 for the final [SEP] token
    mask = [1] * len(token_type_ids)
    actual_offsets = [(0, 0)] + question_offsets + [(0, 0)] + context_offsets + [(0, 0)]  # (0, 0) offsets for the [CLS] token at the start and each [SEP] token
    # shift the answer token indices past "[CLS] question [SEP]"
    targets_start += len([101] + original_question_input_ids + [102])
    targets_end += len([101] + original_question_input_ids + [102])
    # padding up to max_len
    padding_length = max_len - len(input_ids)
    if padding_length > 0:
        input_ids = input_ids + ([0] * padding_length)
        mask = mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([0] * padding_length)
        actual_offsets = actual_offsets + ([(0, 0)] * padding_length)
    return {
        'context': context,
        'question': question,
        'answer': answer,
        'ids': input_ids,
        'mask': mask,
        'token_type_ids': token_type_ids,
        'targets_start': targets_start,
        'targets_end': targets_end,
        'offsets': actual_offsets
    }
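In case it makes the question more concrete: for a single window w from the sketch above, I was imagining something along these lines for turning the character span of the answer (start index inclusive, end index exclusive, like the offsets) into token targets, with windows that do not contain the answer pointing at the [CLS] position. I am not at all sure this is right, and I also do not see how to combine it with the padding and token_type_ids handling in process_data above:

def window_targets(w, answer_start_char, answer_end_char):
    # sequence_ids should mark special tokens as None, question tokens as 0
    # and context tokens as 1 (if I understand the Encoding object correctly)
    context_token_idx = [i for i, s in enumerate(w.sequence_ids) if s == 1]
    first, last = context_token_idx[0], context_token_idx[-1]
    # the answer is not fully inside this window -> label it with the [CLS] position
    if w.offsets[first][0] > answer_start_char or w.offsets[last][1] < answer_end_char:
        return 0, 0
    # walk the offsets to find the first and last context token covering the answer span
    start_token = first
    while start_token < last and w.offsets[start_token][1] <= answer_start_char:
        start_token += 1
    end_token = last
    while end_token > first and w.offsets[end_token][0] >= answer_end_char:
        end_token -= 1
    return start_token, end_token

Any pointers on whether this is the right direction would be really appreciated.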