metadata
metadata copied to clipboard
Simple zero-shot eval function for website descriptions: perplexity (ppl)
on a website-specific test set. Contact @cccntu and Christopher.
def calc_ppl(sentence):
    """Return the perplexity the language model assigns to ``sentence``.

    The sentence is encoded with the module-level ``tokenizer`` and passed to
    the module-level ``model`` as a batch of one, with the input ids also
    supplied as ``labels``. The first element of the model output is treated
    as the loss and exponentiated.

    Args:
        sentence: Text to score.

    Returns:
        ``np.exp`` of the loss (presumably a scalar loss, so a NumPy scalar --
        confirm against the model in use).
    """
    token_ids = tokenizer.encode(sentence)
    # Wrapping in a list adds the leading batch dimension.
    input_ids = torch.tensor([token_ids])
    # Evaluation only: avoid building an autograd graph for the forward pass.
    with torch.no_grad():
        loss = model(input_ids, labels=input_ids)[0]
    # .cpu() is a no-op for CPU tensors and keeps .numpy() from raising when
    # the loss lives on another device.
    return np.exp(loss.detach().cpu().numpy())
def eval_website_desc(orig_text, orig_website_desc, website_desc_list):
    """Check that no alternative website description beats the original one.

    Every description is joined to the text as ``"<description> ||| <text>"``
    and scored with ``calc_ppl``. The original description passes when none of
    the alternatives produces a strictly lower perplexity for the same text.

    Example:
        orig_text = "George Orwell, was an English novelist, essayist, journalist and critic."
        orig_website_desc = "Wikipedia is a multilingual encyclopedia"
        website_desc_list = <list of desc from different websites>

    Args:
        orig_text: The text being scored.
        orig_website_desc: Description of the website the text came from.
        website_desc_list: Descriptions of other websites to compare against.

    Returns:
        False if any description in ``website_desc_list`` yields a lower
        perplexity than ``orig_website_desc``; True otherwise.
    """
    baseline_ppl = calc_ppl(f"{orig_website_desc} ||| {orig_text}")
    # Lazy generator: scoring stops at the first alternative that wins.
    candidate_ppls = (
        calc_ppl(f"{candidate} ||| {orig_text}")
        for candidate in website_desc_list
    )
    return not any(ppl < baseline_ppl for ppl in candidate_ppls)
Colab to test this: https://colab.research.google.com/drive/15ap5LvMuX_kIZTDpai1bCbCcWlF2FgWD#scrollTo=B812bcUG6KWW