问题描述
当我运行以下代码时
from transformers import AutoTokenizer,AutoModelForQuestionAnswering
import torch
# Load the tokenizer and the extractive question-answering model from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
model = AutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
# Context passage in which the answer span is searched.
text = r"""
As checked dis is not yet on boarded to ARB portal,hence we cannot upload the invoices in portal
"""
questions = [
"dis asked if it is possible to post the two invoice in ARB.I have not access so I wanted to check if you would be able to do it.",]
for question in questions:
    # Encode question and context together as one sequence pair.
    inputs = tokenizer.encode_plus(question, text, add_special_tokens=True, return_tensors="pt")
    input_ids = inputs["input_ids"].tolist()[0]
    text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**inputs)
    # Index by position rather than unpacking the return value: positional
    # indexing works both for a plain tuple and for a ModelOutput object,
    # whereas iterating a ModelOutput yields its keys, not the tensors.
    answer_start_scores, answer_end_scores = outputs[0], outputs[1]
    # Most likely beginning of the answer: argmax of the start scores.
    answer_start = torch.argmax(answer_start_scores)
    # Most likely end of the answer: argmax of the end scores, +1 so the
    # slice below includes the end token.
    answer_end = torch.argmax(answer_end_scores) + 1
    answer = tokenizer.convert_tokens_to_string(text_tokens[answer_start:answer_end])
    print(f"Question: {question}")
    print(f"Answer: {answer}\n")
我在这里得到的答案是:
Question: dis asked if it is possible to post the two invoice in ARB.I have not access so I wanted to check if you would be able to do it.
Answer: dis is not yet on boarded to ARB portal
如何获得此答案的分数?我想要的分数与运行问答(question-answering)管道时返回的分数类似。
我必须采用这种方法,因为使用问答管道时,以下代码会抛出 KeyError(键错误)
from transformers import pipeline
# Context passage handed to the question-answering pipeline.
context = r"""
As checked dis is not yet on boarded to ARB portal,hence we cannot upload the invoices in portal.
"""
# No model name is given, so the library chooses the model for this task.
nlp = pipeline("question-answering")
# Run the pipeline once and print whatever it returns.
prediction = nlp(
    question="dis asked if it is possible to post the two invoice in ARB?",
    context=context,
)
print(prediction)
解决方法
这是我为获得分数所做的尝试。我还没有弄清楚 feature.p_mask 是什么,
因此目前无法把非上下文(问题部分)的索引从 softmax 的计算中排除。
# ... assuming imports and question and context
import numpy as np  # used below for masking, softmax and span scoring

model_name = "deepset/roberta-base-squad2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
inputs = tokenizer(question, context, add_special_tokens=True, return_tensors='pt')
input_ids = inputs['input_ids'].tolist()[0]
outputs = model(**inputs)
# Raw start/end logits as NumPy arrays; used to compute the score.
start = outputs.start_logits.detach().numpy()
end = outputs.end_logits.detach().numpy()
# From the pipeline source code:
# Ensure padded tokens & question tokens cannot belong to the set of candidate answers.
#?? undesired_tokens = np.abs(np.array(feature.p_mask) - 1) & feature.attention_mask
# Generate mask.
# NOTE(review): only positions with attention_mask == 0 (padding) are masked
# here; question tokens remain candidates, so the resulting score may differ
# from the one the pipeline reports.
undesired_tokens = inputs['attention_mask'].numpy()
undesired_tokens_mask = undesired_tokens == 0
# Make sure masked indexes cannot contribute to the softmax: give them a very
# negative logit. Both calls need the fill value (np.where takes either one
# or three arguments).
start_ = np.where(undesired_tokens_mask, -10000.0, start)
end_ = np.where(undesired_tokens_mask, -10000.0, end)
# Normalize logits (softmax over the token axis) to get probabilities.
start_ = np.exp(start_ - np.log(np.sum(np.exp(start_), axis=-1, keepdims=True)))
end_ = np.exp(end_ - np.log(np.sum(np.exp(end_), axis=-1, keepdims=True)))
# Compute the score of each tuple(start,end) to be the real answer:
# outer[b, i, j] = P(start=i) * P(end=j)
outer = np.matmul(np.expand_dims(start_, -1), np.expand_dims(end_, 1))
# Remove candidates with end < start or end - start > max_answer_len - 1
max_answer_len = 15
candidates = np.tril(np.triu(outer), max_answer_len - 1)
# Pick the single best-scoring (start, end) pair.
scores_flat = candidates.flatten()
idx_sort = [np.argmax(scores_flat)]
# Drop the leading batch index; keep the token indexes.
start, end = np.unravel_index(idx_sort, candidates.shape)[1:]
end += 1  # make `end` exclusive so it can be used as a slice bound
score = candidates[0, start, end - 1]
start, end, score = start.item(), end.item(), score.item()
print(tokenizer.decode(input_ids[start:end]))
print(score)
查看更多source code