Problem description
This is the output I get. BERT fails to predict the tag for the ## subword token; the X below should be DRUG.
I am using the pytorch_pretrained_bert library and adapted the code from here:
https://github.com/Louis-udm/NER-BERT-CRF
Word in BERT layer | Initial word : Predicted NER-tag
-------------------------------------------------------------
holy | holy : O
shit | shit : O
that | that : O
##one | trazodone : X
actually | actually : O
knocked | knocked : B-ADR
me | me : I-ADR
the | the : I-ADR
fuck | fuck : I-ADR
out | out : I-ADR
and | and : O
took | took : O
me | me : O
for | for : O
a | a : O
ride | ride : O
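The X tag itself comes from WordPiece tokenization: an out-of-vocabulary word like trazodone is split into several sub-tokens, and in the usual BERT-NER setup every '##' continuation piece is trained with the auxiliary label X, so the model predicts X when it is shown a bare piece such as ##one. A quick way to inspect the split (a sketch, assuming the stock 'bert-base-uncased' vocabulary; the exact pieces vary by checkpoint):

from pytorch_pretrained_bert import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Out-of-vocabulary words are broken into wordpieces; every piece after
# the first is prefixed with '##'.
print(tokenizer.tokenize('trazodone'))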
Solution
The key step is in convert_single_example below: using orig_to_tok_map, only the last WordPiece of each original word is kept in input_ids, so the model is fed exactly one token per word and the predictions can be read off word by word. The code is as follows:
import numpy as np
import torch
from tqdm import tqdm_notebook


class PaddingInputExample(object):
    """Fake example so the number of input examples is a multiple of the batch size.

    When running eval/predict on the TPU, we need to pad the number of examples
    to be a multiple of the batch size, because the TPU requires a fixed batch
    size. The alternative is to drop the last batch, which is bad because it
    means the entire output data won't be generated.

    We use this class instead of `None` because treating `None` as padding
    batches could cause silent errors.
    """
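For illustration, this is how the padding class is meant to be used (a sketch; batch_size and the examples list are placeholders):

# Pad the example list to a multiple of the batch size, as described in
# the docstring above.
batch_size = 32
while len(examples) % batch_size != 0:
    examples.append(PaddingInputExample())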
def convert_text_to_examples(texts, labels):
    """Create InputExamples."""
    InputExamples = []
    for text, label in zip(texts, labels):
        InputExamples.append(
            InputExample(guid=None, words=text, labels=label)
        )
    return InputExamples
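Hypothetical usage, assuming InputExample (from the linked repo) stores the words and labels passed to it:

texts = [['trazodone', 'knocked', 'me', 'out']]
labels = [['B-DRUG', 'B-ADR', 'I-ADR', 'I-ADR']]
examples = convert_text_to_examples(texts, labels)
print(examples[0].words, examples[0].labels)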
def convert_examples_to_features(tokenizer, examples, max_seq_length=66):
    """Convert a set of `InputExample`s to a list of `InputFeatures`."""
    input_ids, input_masks, segment_ids, labels = [], [], [], []
    for example in tqdm_notebook(examples, desc="Converting examples to features"):
        input_id, input_mask, segment_id, label = convert_single_example(
            tokenizer, example, max_seq_length
        )
        input_ids.append(input_id)
        input_masks.append(input_mask)
        segment_ids.append(segment_id)
        labels.append(label)
    return (
        np.array(input_ids),
        np.array(input_masks),
        np.array(segment_ids),
        np.array(labels),
    )
def convert_single_example(tokenizer, example, max_seq_length=256):
    """Converts a single `InputExample` into a single `InputFeatures`."""
    if isinstance(example, PaddingInputExample):
        input_ids = [0] * max_seq_length
        input_mask = [0] * max_seq_length
        segment_ids = [0] * max_seq_length
        label_ids = [0] * max_seq_length
        return input_ids, input_mask, segment_ids, label_ids
    tokens_a = example.words
    if len(tokens_a) > max_seq_length - 2:
        tokens_a = tokens_a[0:(max_seq_length - 2)]
    # orig_to_tok_map is an int -> int mapping from each original word to
    # the index of its LAST wordpiece in `tokens` (with entries for [CLS]
    # and [SEP] as well), e.g.
    # bert_tokens     == ["[CLS]", "john", "johan", "##son", "house", "[SEP]"]
    # orig_to_tok_map == [0, 1, 3, 4, 5]
    orig_to_tok_map = []
    tokens = []
    segment_ids = []
    tokens.append("[CLS]")
    segment_ids.append(0)
    orig_to_tok_map.append(len(tokens) - 1)
    for token in tokens_a:
        tokens.extend(tokenizer.tokenize(token))
        orig_to_tok_map.append(len(tokens) - 1)
        segment_ids.append(0)
    tokens.append("[SEP]")
    segment_ids.append(0)
    orig_to_tok_map.append(len(tokens) - 1)
    # Gather only the mapped positions: the input fed to the model keeps
    # exactly one wordpiece per original word.
    input_ids = tokenizer.convert_tokens_to_ids([tokens[i] for i in orig_to_tok_map])
    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)
    label_ids = []
    labels = example.labels
    label_ids.append(0)  # [CLS] position
    label_ids.extend([tag2int[label] for label in labels])
    label_ids.append(0)  # [SEP] position
    # Zero-pad up to the sequence length.
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)
        label_ids.append(0)
    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length
    assert len(label_ids) == max_seq_length
    return input_ids, input_mask, segment_ids, label_ids
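For reference, tag2int and int2tag used above are the label/index maps built from the training data; a hypothetical sketch (the actual tag set and ordering depend on your corpus):

# Illustrative only; the real maps come from the training labels. The
# index of 'X' determines which id the print loop at the end skips.
tags = ['[PAD]', 'X', 'O', 'B-ADR', 'I-ADR', 'B-DRUG', 'I-DRUG']
tag2int = {t: i for i, t in enumerate(tags)}
int2tag = {i: t for t, i in tag2int.items()}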
test_example = convert_text_to_examples([sentence_ini], [['O'] * len(sentence_ini)])
(input_ids, input_masks, segment_ids, _) = convert_examples_to_features(
    tokenizer, test_example, max_seq_length
)
input_ids = input_ids[0]
input_masks = input_masks[0]
segment_ids = segment_ids[0]
input_ids = torch.tensor([input_ids])
input_masks = torch.tensor([input_masks])
segment_ids = torch.tensor([segment_ids])
model.eval()
with torch.no_grad():
    # Predict the tag logits for each position; `model` is the trained
    # token-classification model from the linked repo.
    predictions = model(input_ids, input_masks)
_, predicted = torch.max(predictions[0], -1)
print("\n{:20}| {:15}: {:15}".format("Word in BERT layer",'Initial word',"Predicted NER-tag"))
print(61*'-')
k = 0
for i,pred in enumerate(predicted):
# print(pred)
try:
if pred.item()!=1:
print("{:20}| {:15}: {:15}".format([tokens[i] for i in orig_to_tok_map][i],sentence_ini[i-1],int2tag[pred.item()]))
k+=1
except:
pass
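With this word-level alignment, every printed row should correspond to one original word (represented by its last WordPiece), so the drug name gets a single word-level prediction instead of a stray X on a '##' continuation piece.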