I ran GLM-4 on the data you provided and the scores are very low. Is this caused by the model itself?
1. The code is as follows.
Testing the performance of the generate_then_read code with glm-4-9b:
from openai import OpenAI
import re
import json
from tqdm import tqdm
class ChatGLM4(object):
    def __init__(self, api_key="EMPTY", base_url="http://x.x.x.x:8000/v1/", model_name="glm-4"):
        self.api_key = api_key
        self.base_url = base_url
        self.model_name = model_name
        # Initialize the OpenAI client, with error handling
        try:
            self._client = OpenAI(api_key=self.api_key, base_url=self.base_url)
        except Exception as e:
            raise ValueError(f"Failed to initialize OpenAI client: {e}")
    def call(self, prompt, chat_history=[]):
        # Combine chat_history with the current prompt into a single request
        messages = [{"role": "system", "content": "you are a helpful assistant!"}]
        # Append the previous conversation turns
        for user_content, assistant_content in chat_history:
            messages.append({"role": "user", "content": user_content})
            messages.append({"role": "assistant", "content": assistant_content})
        # Current user input
        messages.append({"role": "user", "content": prompt})
        # logger_file.info(f"messages: {messages}")
        # Send the request
        try:
            response = self._client.chat.completions.create(
                model=self.model_name,
                messages=messages,
                # max_tokens=...,   # optionally cap the number of returned tokens
                n=1,                # return a single completion
                stop=None,          # optional stop sequences
                temperature=0,      # deterministic, low-creativity decoding
                top_p=0.1
            )
            return response
        except Exception as e:
            raise ValueError(f"Failed to get a response from OpenAI: {e}")

    def chat(self, prompt, chat_history=[]):
        response = self.call(prompt, chat_history)
        return response.choices[0].message.content
llm = ChatGLM4()
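Before looping over the whole test set, a quick single-call sanity check of the endpoint can rule out connection or decoding problems. This is only a minimal sketch; the passage and question below are made up for illustration, and it assumes the server configured above is reachable:

# Hypothetical smoke test: one hand-written passage/question pair, just to
# confirm the endpoint replies in the expected "single entity" style.
test_prompt = ("Refer to the passage below and answer the following question "
               "with just one entity.\n\nPassage: Paris is the capital of France."
               "\n\nQuestion: what is the capital of france?\n\nThe answer is")
print(llm.chat(test_prompt))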
with open("webq_test_gpt_output.json",'r') as f: datas = json.load(f)
prompt = """ Refer to the passage below and answer the following question with just one entity. \n\n Passage: {background} \n\n Question: {query} \n\n The answer is """ results = [] for data in tqdm(datas): question = data['question'] ctxs = data['ctxs'] answer = data['answers'] string = "" for idx,ctx in enumerate(ctxs): string += f"\n {idx}: {ctx['text']}" final_prompt = prompt.format(background=string,query=question) output= llm.chat(final_prompt) results.append({"question":question,"answer":answer,"output":output})
with open('results2.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=4)  # ensure_ascii=False so non-ASCII text is written as-is
The results are as follows:
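Before converting the outputs into lists, it is worth spot-checking whether the model answers with a single entity or a full sentence, because the exact-match metric used below only rewards short, exact answers. A minimal sketch over the results2.json file written above:

import json

with open("results2.json", 'r') as f:
    preds = json.load(f)

# Print a few raw outputs next to the gold answers; long, sentence-style outputs
# here usually explain a low EM score better than model quality does.
for item in preds[:5]:
    print("Q:     ", item["question"])
    print("gold:  ", item["answer"])
    print("output:", repr(item["output"]))
    print("-" * 40)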
Convert the outputs into lists:
import json

results = []
with open("results2.json", 'r') as f:
    datas = json.load(f)

for data in datas:
    question = data['question']
    answer = data['answer']
    # data['output'] is the raw model string; split it on commas in case the
    # model returned several entities
    output = [ans.strip() for ans in data['output'].split(',')]
    results.append({"question": question, "answer": answer, "output": output})

with open('results_output.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=4)  # ensure_ascii=False so non-ASCII text is written as-is
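Optionally, the raw output can be cleaned a bit more aggressively before splitting on commas, since a leading phrase such as "The answer is" or a trailing period is enough to break the exact-match comparison. This is only a sketch of one possible heuristic; the clean_output helper below is hypothetical and should be adapted to the actual outputs:

import re

def clean_output(raw: str) -> str:
    # Heuristic cleanup (hypothetical): keep only the first line, drop a leading
    # "The answer is", and strip trailing punctuation.
    text = raw.strip().split("\n")[0]
    text = re.sub(r'^\s*the answer is[:\s]*', '', text, flags=re.IGNORECASE)
    return text.strip().strip('.')

# Example: clean_output("The answer is: Paris.") -> "Paris"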
The results are as follows:
Finally, run the eval code for evaluation.
import regex
import json
import string
import unicodedata
from typing import List
import numpy as np
from collections import Counter
from rouge import Rouge
class SimpleTokenizer(object):
    ALPHA_NUM = r'[\p{L}\p{N}\p{M}]+'
    NON_WS = r'[^\p{Z}\p{C}]'

    def __init__(self):
        """
        Args:
            annotators: None or empty set (only tokenizes).
        """
        self._regexp = regex.compile(
            '(%s)|(%s)' % (self.ALPHA_NUM, self.NON_WS),
            flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE
        )

    def tokenize(self, text, uncased=False):
        matches = [m for m in self._regexp.finditer(text)]
        if uncased:
            tokens = [m.group().lower() for m in matches]
        else:
            tokens = [m.group() for m in matches]
        return tokens
def check_answer(example, tokenizer) -> List[bool]:
    """Search through all the top docs to see if they have any of the answers."""
    answers = example['answers']
    ctxs = example['ctxs']

    hits = []
    for _, doc in enumerate(ctxs):
        text = doc['text']
        if text is None:  # cannot find the document for some reason
            hits.append(False)
            continue
        hits.append(has_answer(answers, text, tokenizer))
    return hits
def has_answer(answers, text, tokenizer=SimpleTokenizer()) -> bool:
    """Check if a document contains an answer string."""
    text = _normalize(text)
    text = tokenizer.tokenize(text, uncased=True)

    for answer in answers:
        answer = _normalize(answer)
        answer = tokenizer.tokenize(answer, uncased=True)
        for i in range(0, len(text) - len(answer) + 1):
            if answer == text[i: i + len(answer)]:
                return True
    return False
def _normalize(text):
    return unicodedata.normalize('NFD', text)
def normalize_answer(s):
    def remove_articles(text):
        return regex.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))
def exact_match_score(prediction, ground_truth):
    return normalize_answer(prediction) == normalize_answer(ground_truth)

def ems(prediction, ground_truths):
    return max([exact_match_score(prediction, gt) for gt in ground_truths])

def f1_score(prediction, ground_truth):
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1

def f1(prediction, ground_truths):
    return max([f1_score(prediction, gt) for gt in ground_truths])
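# Note (illustrative, not from the original script): exact match is strict.
# normalize_answer only lowercases, removes punctuation/articles and collapses
# whitespace, so a verbose prediction still fails against a short gold answer:
#   normalize_answer("The answer is Paris.")            -> "answer is paris"
#   exact_match_score("The answer is Paris.", "Paris")  -> False
#   exact_match_score("Paris.", "Paris")                -> True
# A model that answers in full sentences therefore gets a low EM even when the
# correct entity appears somewhere in its output.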
def rougel_score(prediction, ground_truth):
    rouge = Rouge()
    # no normalization
    try:
        scores = rouge.get_scores(prediction, ground_truth, avg=True)
    except ValueError:  # "Hypothesis is empty."
        return 0.0
    return scores["rouge-l"]["f"]

def rl(prediction, ground_truths):
    return max([rougel_score(prediction, gt) for gt in ground_truths])

## file-level evaluation ...

def eval_recall(infile):
    tokenizer = SimpleTokenizer()
    lines = open(infile, 'r').readlines()[1:]

    has_answer_count = 0
    answer_lengths = []
    for line in lines:
        line = json.loads(line)
        answer = line['answer']
        output = ' || '.join(line['output'])

        if has_answer(answer, output, tokenizer):
            has_answer_count += 1

        answer_lengths.append(len(output.split()))

    recall = round(has_answer_count/len(lines), 4)
    lens = round(np.mean(answer_lengths), 4)

    return recall, lens

def eval_question_answering(infile):
    # lines = open(infile, 'r').readlines()[1:]
    with open(infile, 'r') as f:
        lines = json.load(f)

    exact_match_count = 0
    answer_lengths = []
    for line in lines:
        # line = json.loads(line)
        answer = line['answer']
        output = line['output'][0]

        if ems(output, answer):  # EM evaluation
            exact_match_count += 1

        answer_lengths.append(len(output.split()))

    em = round(exact_match_count/len(lines), 4)
    lens = round(np.mean(answer_lengths), 4)

    return em, lens

def eval_fact_checking(infile):
    tokenizer = SimpleTokenizer()
    lines = open(infile, 'r').readlines()[1:]

    exact_match_count = 0
    answer_lengths = []
    for line in lines:
        line = json.loads(line)
        answer = line['answer']
        output = line['output'][0]

        if answer == ["refutes"]:
            answer = ["refutes", "no", "false"]
        if answer == ["supports"]:
            answer = ["supports", "yes", "true"]

        if has_answer(answer, output, tokenizer):
            exact_match_count += 1

        answer_lengths.append(len(output.split()))

    em = round(exact_match_count/len(lines), 4)
    lens = round(np.mean(answer_lengths), 4)

    return em, lens

def eval_dialogue_system(infile):
    lines = open(infile, 'r').readlines()[1:]

    f1_scores = []
    rl_scores = []
    answer_lengths = []
    for line in lines:
        line = json.loads(line)
        answer = line['answer']
        output = line['output'][0]

        f1_scores.append(f1(output, answer))
        rl_scores.append(rl(output, answer))
        answer_lengths.append(len(output.split()))

    F1 = round(np.mean(f1_scores), 4)
    RL = round(np.mean(rl_scores), 4)
    lens = round(np.mean(answer_lengths), 4)

    return F1, RL, lens
emscore, length = eval_question_answering("results_output.json")
print(emscore)
print(length)

The final result is only 0.3179 (EM) with an average answer length of 2.6216. I am not sure whether this is caused by the model. The webq_test_gpt_output.json file comes from https://drive.google.com/drive/folders/1DNjTTOLKi24wohJKu1Z-v6b4izfymlLu, which you provided. I hope to get your reply.