Problem description
This is the output I get. BERT fails to predict the tag for the ## subword token; the X below should be DRUG.
I am using the pytorch_pretrained_bert library and adapted the code from here:
https://github.com/Louis-udm/NER-BERT-CRF
Word in BERT layer | Initial word : Predicted NER-tag
-------------------------------------------------------------
holy | holy : O
shit | shit : O
that | that : O
##one | trazodone : X
actually | actually : O
knocked | knocked : B-ADR
me | me : I-ADR
the | the : I-ADR
fuck | fuck : I-ADR
out | out : I-ADR
and | and : O
took | took : O
me | me : O
for | for : O
a | a : O
ride | ride : O
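The X tag itself comes from WordPiece tokenization: an out-of-vocabulary word like trazodone is split into several sub-tokens, and in the usual BERT-NER setup every '##' continuation piece is trained with the auxiliary label X, so the model predicts X when it is shown a bare piece such as ##one. A quick way to inspect the split (a sketch, assuming the stock 'bert-base-uncased' vocabulary; the exact pieces vary by checkpoint):

from pytorch_pretrained_bert import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Out-of-vocabulary words are broken into wordpieces; every piece after
# the first is prefixed with '##'.
print(tokenizer.tokenize('trazodone'))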
Solution
The key step is in convert_single_example below: using orig_to_tok_map, only the last WordPiece of each original word is kept in input_ids, so the model is fed exactly one token per word and the predictions can be read off word by word. The code is as follows:
import numpy as np
import torch
from tqdm import tqdm_notebook


class PaddingInputExample(object):
    """Fake example so the number of input examples is a multiple of the batch size.

    When running eval/predict on the TPU, we need to pad the number of examples
    to be a multiple of the batch size, because the TPU requires a fixed batch
    size. The alternative is to drop the last batch, which is bad because it
    means the entire output data won't be generated.

    We use this class instead of `None` because treating `None` as padding
    batches could cause silent errors.
    """
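For illustration, this is how the padding class is meant to be used (a sketch; batch_size and the examples list are placeholders):

# Pad the example list to a multiple of the batch size, as described in
# the docstring above.
batch_size = 32
while len(examples) % batch_size != 0:
    examples.append(PaddingInputExample())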
def convert_text_to_examples(texts, labels):
    """Create InputExamples."""
    InputExamples = []
    for text, label in zip(texts, labels):
        InputExamples.append(
            InputExample(guid=None, words=text, labels=label)
        )
    return InputExamples
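Hypothetical usage, assuming InputExample (from the linked repo) stores the words and labels passed to it:

texts = [['trazodone', 'knocked', 'me', 'out']]
labels = [['B-DRUG', 'B-ADR', 'I-ADR', 'I-ADR']]
examples = convert_text_to_examples(texts, labels)
print(examples[0].words, examples[0].labels)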
def convert_examples_to_features(tokenizer, examples, max_seq_length=66):
    """Convert a set of `InputExample`s to a list of `InputFeatures`."""
    input_ids, input_masks, segment_ids, labels = [], [], [], []
    for example in tqdm_notebook(examples, desc="Converting examples to features"):
        input_id, input_mask, segment_id, label = convert_single_example(
            tokenizer, example, max_seq_length
        )
        input_ids.append(input_id)
        input_masks.append(input_mask)
        segment_ids.append(segment_id)
        labels.append(label)
    return (
        np.array(input_ids),
        np.array(input_masks),
        np.array(segment_ids),
        np.array(labels),
    )
def convert_single_example(tokenizer, example, max_seq_length=256):
    """Converts a single `InputExample` into a single `InputFeatures`."""
    if isinstance(example, PaddingInputExample):
        input_ids = [0] * max_seq_length
        input_mask = [0] * max_seq_length
        segment_ids = [0] * max_seq_length
        label_ids = [0] * max_seq_length
        return input_ids, input_mask, segment_ids, label_ids
    tokens_a = example.words
    if len(tokens_a) > max_seq_length - 2:
        tokens_a = tokens_a[0:(max_seq_length - 2)]
    # orig_to_tok_map is an int -> int mapping from each original word to
    # the index of its LAST wordpiece in `tokens` (with entries for [CLS]
    # and [SEP] as well), e.g.
    # bert_tokens     == ["[CLS]", "john", "johan", "##son", "house", "[SEP]"]
    # orig_to_tok_map == [0, 1, 3, 4, 5]
    orig_to_tok_map = []
    tokens = []
    segment_ids = []
    tokens.append("[CLS]")
    segment_ids.append(0)
    orig_to_tok_map.append(len(tokens) - 1)
    for token in tokens_a:
        tokens.extend(tokenizer.tokenize(token))
        orig_to_tok_map.append(len(tokens) - 1)
        segment_ids.append(0)
    tokens.append("[SEP]")
    segment_ids.append(0)
    orig_to_tok_map.append(len(tokens) - 1)
    # Gather only the mapped positions: the input fed to the model keeps
    # exactly one wordpiece per original word.
    input_ids = tokenizer.convert_tokens_to_ids([tokens[i] for i in orig_to_tok_map])
    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)
    label_ids = []
    labels = example.labels
    label_ids.append(0)  # [CLS] position
    label_ids.extend([tag2int[label] for label in labels])
    label_ids.append(0)  # [SEP] position
    # Zero-pad up to the sequence length.
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)
        label_ids.append(0)
    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length
    assert len(label_ids) == max_seq_length
    return input_ids, input_mask, segment_ids, label_ids
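For reference, tag2int and int2tag used above are the label/index maps built from the training data; a hypothetical sketch (the actual tag set and ordering depend on your corpus):

# Illustrative only; the real maps come from the training labels. The
# index of 'X' determines which id the print loop at the end skips.
tags = ['[PAD]', 'X', 'O', 'B-ADR', 'I-ADR', 'B-DRUG', 'I-DRUG']
tag2int = {t: i for i, t in enumerate(tags)}
int2tag = {i: t for t, i in tag2int.items()}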
test_example = convert_text_to_examples([sentence_ini], [['O'] * len(sentence_ini)])
(input_ids, input_masks, segment_ids, _) = convert_examples_to_features(
    tokenizer, test_example, max_seq_length
)
input_ids = input_ids[0]
input_masks = input_masks[0]
segment_ids = segment_ids[0]
input_ids = torch.tensor([input_ids])
input_masks = torch.tensor([input_masks])
segment_ids = torch.tensor([segment_ids])
model.eval()
with torch.no_grad():
    # Predict the tag logits for each position; `model` is the trained
    # token-classification model from the linked repo.
    predictions = model(input_ids, input_masks)
_, predicted = torch.max(predictions[0], -1)
print("\n{:20}| {:15}: {:15}".format("Word in BERT layer",'Initial word',"Predicted NER-tag"))
print(61*'-')
k = 0
for i,pred in enumerate(predicted):
# print(pred)
try:
if pred.item()!=1:
print("{:20}| {:15}: {:15}".format([tokens[i] for i in orig_to_tok_map][i],sentence_ini[i-1],int2tag[pred.item()]))
k+=1
except:
pass
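With this word-level alignment, every printed row should correspond to one original word (represented by its last WordPiece), so the drug name gets a single word-level prediction instead of a stray X on a '##' continuation piece.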