Inputs and outputs of Chinese pre-trained models in NLP

This note runs the same example sentence through several Chinese pre-trained checkpoints from Hugging Face transformers (BERT, RoBERTa, ALBERT, XLNet, ELECTRA, MacBERT) and prints the tokenizer inputs and the model output shapes.

BERT

from transformers import (  
  BertTokenizer,
  BertConfig,
  BertModel,
)
# bert-base-chinese: Google's original Chinese BERT checkpoint
bertTokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
bertModel = BertModel.from_pretrained('bert-base-chinese')
sen = 'Transformers提供了NLP领域大量state-of-art的 预训练语言模型结构的模型和调用框架。'
inputs = bertTokenizer(sen, return_tensors='pt')
tokens = bertTokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
print(inputs)
print(tokens)
outputs = bertModel(**inputs)
# print(len(outputs))
print(outputs[0].shape, outputs[1].shape)
{'input_ids': tensor([[  101,   100,  2990,   897,   749,   100,  7566,  1818,  1920,  7030,
         10223,   118,  8205,   118,  9143,  4638,  7564,  6378,  5298,  6427,
          6241,  3563,  1798,  5310,  3354,  4638,  3563,  1798,  1469,  6444,
          4500,  3427,  3373,   511,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
['[CLS]', '[UNK]', '提', '供', '了', '[UNK]', '领', '域', '大', '量', 'state', '-', 'of', '-', 'art', '的', '预', '训', '练', '语', '言', '模', '型', '结', '构', '的', '模', '型', '和', '调', '用', '框', '架', '。', '[SEP]']
torch.Size([1, 35, 768]) torch.Size([1, 768])
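With this tokenizer the cased English words "Transformers" and "NLP" each map to a single [UNK], while the Chinese text is split character by character. outputs[0] is the last_hidden_state of shape [1, 35, 768] (one 768-dim vector per token) and outputs[1] is the pooler_output of shape [1, 768], derived from the [CLS] position. A minimal sketch of reading the same tensors by name, assuming transformers 4.x where models return ModelOutput objects by default:

# Attribute access on the ModelOutput object; integer indexing still works as above.
last_hidden_state = outputs.last_hidden_state   # [1, 35, 768], one vector per token
pooler_output = outputs.pooler_output           # [1, 768], from the [CLS] position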

RoBERTa

from transformers import (  
  BertTokenizer,
  BertConfig,
  BertModel,
)
# hfl/chinese-roberta-wwm-ext keeps the BERT architecture, so it is loaded with BertTokenizer/BertModel
robertTokenizer = BertTokenizer.from_pretrained('hfl/chinese-roberta-wwm-ext')
robertModel = BertModel.from_pretrained('hfl/chinese-roberta-wwm-ext')
sen = 'Transformers提供了NLP领域大量state-of-art的 预训练语言模型结构的模型和调用框架。'
inputs = robertTokenizer(sen, return_tensors='pt')
tokens = robertTokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
print(inputs)
print(tokens)
outputs = robertModel(**inputs)
# print(outputs)
print(outputs[0].shape, outputs[1].shape)
{'input_ids': tensor([[  101,   162, 10477,  8118, 12725,  8755,  2990,   897,   749,   156,
         10986,  7566,  1818,  1920,  7030, 10223,   118,  8205,   118,  9143,
          4638,  7564,  6378,  5298,  6427,  6241,  3563,  1798,  5310,  3354,
          4638,  3563,  1798,  1469,  6444,  4500,  3427,  3373,   511,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
['[CLS]', 't', '##ran', '##s', '##form', '##ers', '提', '供', '了', 'n', '##lp', '领', '域', '大', '量', 'state', '-', 'of', '-', 'art', '的', '预', '训', '练', '语', '言', '模', '型', '结', '构', '的', '模', '型', '和', '调', '用', '框', '架', '。', '[SEP]']
torch.Size([1, 40, 768]) torch.Size([1, 768])
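hfl/chinese-roberta-wwm-ext keeps the BERT architecture and WordPiece vocabulary, which is why the Bert classes are used here rather than the Roberta ones. Its tokenizer also lowercases the input, so "Transformers" becomes 't', '##ran', '##s', '##form', '##ers' instead of [UNK]; the output shapes match BERT-base: [1, 40, 768] and [1, 768]. As a sketch, the Auto classes load the same checkpoint and resolve to the BERT implementation from its config (the fast tokenizer variant by default):

from transformers import AutoTokenizer, AutoModel
# The checkpoint's config declares a BERT model type, so these resolve to the
# same classes used explicitly above.
robertTokenizer = AutoTokenizer.from_pretrained('hfl/chinese-roberta-wwm-ext')
robertModel = AutoModel.from_pretrained('hfl/chinese-roberta-wwm-ext')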

ALBERT

from transformers import (  
  BertTokenizer,
  AlbertModel,
)
# clue/albert_chinese_tiny ships a BERT-style vocab.txt, so BertTokenizer is used instead of AlbertTokenizer
albertTokenizer = BertTokenizer.from_pretrained('clue/albert_chinese_tiny')
albertModel = AlbertModel.from_pretrained('clue/albert_chinese_tiny')
sen = 'Transformers提供了NLP领域大量state-of-art的 预训练语言模型结构的模型和调用框架。'
inputs = albertTokenizer(sen, return_tensors='pt')
tokens = albertTokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
print(inputs)
print(tokens)
outputs = albertModel(**inputs)
# print(len(outputs))
print(outputs[0].shape, outputs[1].shape)
{'input_ids': tensor([[  101,   162, 10477,  8118, 12725,  8755,  2990,   897,   749,   156,
         10986,  7566,  1818,  1920,  7030, 10223,   118,  8205,   118,  9143,
          4638,  7564,  6378,  5298,  6427,  6241,  3563,  1798,  5310,  3354,
          4638,  3563,  1798,  1469,  6444,  4500,  3427,  3373,   511,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
['[CLS]', 't', '##ran', '##s', '##form', '##ers', '提', '供', '了', 'n', '##lp', '领', '域', '大', '量', 'state', '-', 'of', '-', 'art', '的', '预', '训', '练', '语', '言', '模', '型', '结', '构', '的', '模', '型', '和', '调', '用', '框', '架', '。', '[SEP]']
torch.Size([1, 40, 312]) torch.Size([1, 312])
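The tiny CLUE ALBERT checkpoint has a hidden size of 312, so last_hidden_state is [1, 40, 312] and pooler_output is [1, 312]; ALBERT shrinks the model mainly through factorized embeddings and cross-layer parameter sharing, without changing the output interface. As a quick check, the sizes can be read from the loaded config:

# Inspecting the loaded configuration (values come from the checkpoint itself).
print(albertModel.config.hidden_size)       # 312
print(albertModel.config.embedding_size)
print(albertModel.config.num_hidden_layers)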

XLNet

from transformers import AutoTokenizer, AutoModel
  
xlnettokenizer = AutoTokenizer.from_pretrained("hfl/chinese-xlnet-base")
xlnetModel = AutoModel.from_pretrained('hfl/chinese-xlnet-base')
sen = 'Transformers提供了NLP领域大量state-of-art的 预训练语言模型结构的模型和调用框架。'
inputs = xlnettokenizer(sen, return_tensors='pt')
tokens = xlnettokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
print(inputs)
print(tokens)
outputs = xlnetModel(**inputs)
# print(outputs)
print(outputs[0].shape, len(outputs[1]))
{'input_ids': tensor([[   19, 13932,  9560,  4127,  3810,   603,   602,   412,  3336,  1144,
          3025,  4402,    13, 16636,    13,  7717,    20,    19,  3712,  3620,
          1723,  2280,  1301,    20,  2280,    24, 16338,  7921,    18,     4,
             3]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1]])}
['▁', 'Trans', 'form', 'ers', '提供了', 'N', 'L', 'P', '领域', '大量', 'st', 'ate', '-', 'of', '-', 'art', '的', '▁', '预', '训练', '语言', '模型', '结构', '的', '模型', '和', '调用', '框架', '。', '<sep>', '<cls>']
torch.Size([1, 31, 768]) 12
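hfl/chinese-xlnet-base uses a SentencePiece tokenizer, so pieces carry the '▁' word-boundary marker, multi-character words like '提供了' can stay together, and the special tokens <sep> and <cls> are appended at the end of the sequence (the final token_type_id of 2 marks <cls>). outputs[0] is the last_hidden_state of shape [1, 31, 768]; outputs[1] is mems, the cached per-layer hidden states used by XLNet's recurrence, one entry per layer, hence the printed 12. By name, as a sketch on transformers 4.x:

print(outputs.last_hidden_state.shape)   # torch.Size([1, 31, 768])
print(len(outputs.mems))                 # 12, one cached state per layer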

ELECTRA

from transformers import AutoTokenizer, AutoModel
  

electratokenizer = AutoTokenizer.from_pretrained("hfl/chinese-electra-180g-base-discriminator")
electraModel = AutoModel.from_pretrained("hfl/chinese-electra-180g-base-discriminator")
sen = 'Transformers提供了NLP领域大量state-of-art的 预训练语言模型结构的模型和调用框架。'
inputs = electratokenizer(sen, return_tensors='pt')
tokens = electratokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
print(inputs)
print(tokens)
outputs = electraModel(**inputs)
# print(outputs)
print(outputs[0].shape)
{'input_ids': tensor([[  101,   162, 10477,  8118, 12725,  8755,  2990,   897,   749,   156,
         10986,  7566,  1818,  1920,  7030, 10223,   118,  8205,   118,  9143,
          4638,  7564,  6378,  5298,  6427,  6241,  3563,  1798,  5310,  3354,
          4638,  3563,  1798,  1469,  6444,  4500,  3427,  3373,   511,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
['[CLS]', 't', '##ran', '##s', '##form', '##ers', '提', '供', '了', 'n', '##lp', '领', '域', '大', '量', 'state', '-', 'of', '-', 'art', '的', '预', '训', '练', '语', '言', '模', '型', '结', '构', '的', '模', '型', '和', '调', '用', '框', '架', '。', '[SEP]']
torch.Size([1, 40, 768])
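The ELECTRA discriminator has no pooling head, so ElectraModel returns only last_hidden_state ([1, 40, 768]) and there is no outputs[1] to print. A sentence-level vector has to be built from the token states; a minimal sketch using mean pooling over the attention mask (one common choice, not part of the checkpoint itself):

# Mean-pool the token vectors, ignoring padded positions (none here, but the
# mask makes the same code work for padded batches).
mask = inputs['attention_mask'].unsqueeze(-1).float()        # [1, 40, 1]
sent_vec = (outputs[0] * mask).sum(dim=1) / mask.sum(dim=1)  # [1, 768]
print(sent_vec.shape)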

MacBERT

from transformers import AutoTokenizer, AutoModel
  

mactokenizer = AutoTokenizer.from_pretrained("hfl/chinese-macbert-base")
macModel = AutoModel.from_pretrained("hfl/chinese-macbert-base")
sen = 'Transformers提供了NLP领域大量state-of-art的 预训练语言模型结构的模型和调用框架。'
inputs = mactokenizer(sen, return_tensors='pt')
tokens = mactokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
print(inputs)
print(tokens)
outputs = macModel(**inputs)
# print(outputs)
print(outputs[0].shape)
{'input_ids': tensor([[  101,   162, 10477,  8118, 12725,  8755,  2990,   897,   749,   156,
         10986,  7566,  1818,  1920,  7030, 10223,   118,  8205,   118,  9143,
          4638,  7564,  6378,  5298,  6427,  6241,  3563,  1798,  5310,  3354,
          4638,  3563,  1798,  1469,  6444,  4500,  3427,  3373,   511,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
['[CLS]', 't', '##ran', '##s', '##form', '##ers', '提', '供', '了', 'n', '##lp', '领', '域', '大', '量', 'state', '-', 'of', '-', 'art', '的', '预', '训', '练', '语', '言', '模', '型', '结', '构', '的', '模', '型', '和', '调', '用', '框', '架', '。', '[SEP]']
torch.Size([1, 40, 768])
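MacBERT keeps the BERT architecture and tokenizer, so the tokens and the [1, 40, 768] last_hidden_state match the RoBERTa-wwm example above, and a pooler_output of shape [1, 768] is also available as outputs[1]. The same tokenizer call also handles batches; a sketch with an arbitrary second sentence added for illustration:

# Batch tokenization: padding/truncation give rectangular tensors plus an
# attention_mask that marks the real tokens.
sentences = [sen, '中文预训练模型的输入是token id序列。']
batch = mactokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
batch_outputs = macModel(**batch)
print(batch_outputs[0].shape)   # [2, max_seq_len_in_batch, 768]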
