问题描述
我正在尝试生成使用自定义 Gensim Word2Vec 嵌入的文本。我正在尝试适应它而不是 glove 嵌入。
代码:
glove_path = "/content/drive/MyDrive/Dataset/Bangla AI/custom_bangla_embedding.txt"
BATCH_SIZE = 64 # number of data points to consider to train at a single point of time
LATENT_DIM = 200 # the size of the hidden state/vector
EMbedDING_DIM = 1000 # size of the word embeddings - comes into varIoUs sizes 50,100 or 200
MAX_VOCAB_SIZE = 30000 # the maximum number of words to consider
VALIDATION_SPLIT = 0.2 # % of validation dataset```
类 SequenceGenerator():
def __init__(self,input_lines,target_lines,max_seq_len=None,max_vocab_size=10000,embedding_dim=200):
self.input_lines = input_lines
self.target_lines = target_lines
self.MAX_SEQ_LEN = max_seq_len
self.MAX_VOCAB_SIZE = max_vocab_size
self.EMbedDING_DIM = embedding_dim
def initialize_embeddings(self):
# load the word embeddings
self.word2vec = {}
with open(glove_path%self.EMbedDING_DIM,'r') as file:
for line in file:
vectors = line.split()
self.word2vec[vectors[0]] = np.asarray(vectors[1:],dtype="float32")
# get the embeddings matrix
self.num_words = min(self.MAX_VOCAB_SIZE,len(self.word2idx)+1)
self.embeddings_matrix = np.zeros((self.num_words,self.EMbedDING_DIM))
for word,idx in self.word2idx.items():
if idx <= self.num_words:
word_embeddings = self.word2vec.get(word)
if word_embeddings is not None:
self.embeddings_matrix[idx] = word_embeddings
self.idx2word = {v:k for k,v in self.word2idx.items()}
def prepare_sequences(self,filters=''):
# train the tokenizer
self.tokenizer = Tokenizer(num_words=self.MAX_VOCAB_SIZE,filters='')
self.tokenizer.fit_on_texts(self.input_lines+self.target_lines)
# get the word-index mapping and initialize embeddings
self.word2idx = self.tokenizer.word_index
self.initialize_embeddings()
# tokenize the input and target lines
self.input_sequences = self.tokenizer.texts_to_sequences(self.input_lines)
self.target_sequences = self.tokenizer.texts_to_sequences(self.target_lines)
# get the max sequence len from the data
max_seq_len = max(list(map(len,self.input_lines+self.target_lines)))
if self.MAX_SEQ_LEN:
self.MAX_SEQ_LEN = min(self.MAX_SEQ_LEN,max_seq_len)
else:
self.MAX_SEQ_LEN = max_seq_len
# pad the sequences
self.input_sequences = pad_sequences(self.input_sequences,maxlen=self.MAX_SEQ_LEN,padding="post")
self.target_sequences = pad_sequences(self.target_sequences,padding="post")
print("1st input sequence: ",self.input_sequences[0])
print("1st target sequence: ",self.target_sequences[0])
def one_hot_encoding(self):
"Creates the One-hot encoding for the target sequence."
# it will be a 3 dimensional array where
# first-dim is the number of target lines
# second-dim is the size of the sequences
# third-dim is the number of words in the dataset
self.one_hot_targets = np.zeros((len(self.target_sequences),self.MAX_SEQ_LEN,self.num_words))
for seq_idx,seq in enumerate(self.target_sequences):
for word_idx,word_id in enumerate(self.target_sequences[seq_idx]):
if word_id > 0:
self.one_hot_targets[seq_idx,word_idx,word_id] = 1
def get_closest_word(self,word_vec):
"""
Find the nearest word to the provided vector. The distance between the vectors is
calculated using the cosine-distance.
Parameters:
word_vec (np.array): a vector of size EMbedDING_DIM
Returns:
Str: the closest word to the provided vector
"""
max_dist = 9999999999
closest_word = "NULL"
# iterate overall the words and find the closest one
for word,vec in self.word2vec.items():
# get the cosine distance between the words
dist = spatial.distance.cosine(word_vec,vec)
# compare the distance and keep the minimum
if dist < max_dist:
max_dist = dist
closest_word = word
return closest_word```
创建类的对象
max_vocab_size=MAX_VOCAB_SIZE,embedding_dim=EMbedDING_DIM)```
# prepare the input & target sequences
```sg_obj.prepare_sequences()```
# create the One-hot encoding on the target sequences
```sg_obj.one_hot_encoding()```
# make sure the tokenized words contains <sos> & <eos>
```assert '<sos>' in sg_obj.word2idx
assert '<eos>' in sg_obj.word2idx```
But getting the following error:
TypeError Traceback(最近一次调用)
```<ipython-input-36-3edfcb198239> in <module>()
133
134 # prepare the input & target sequences
--> 135 sg_obj.prepare_sequences()
136 # create the One-hot encoding on the target sequences
137 sg_obj.one_hot_encoding()```
```1 frames
<ipython-input-36-3edfcb198239> in initialize_embeddings(self)
29 # load the word embeddings
30 self.word2vec = {}
---> 31 with open(glove_path%self.EMbedDING_DIM,'r') as file:
32 for line in file:
33 vectors = line.split()```
```TypeError: not all arguments converted during string formatting
寻求帮助。提前致谢。
解决方法
Python 中的 %
运算符用于通过“插值”进行字符串格式化。它期望字符串(其左侧参数)具有特定格式,并且值(其右侧参数)是匹配的序列(或 dict)。见:
https://docs.python.org/3/library/stdtypes.html#old-string-formatting
您传递了这个不正确的参数,因此遇到了这个错误。您所有其他代码中的任何内容都与错误无关。例如,这足以触发完全相同的错误:
>>> glove_path = 'foo'
>>> EMBEDDING_DIM = 1000
>>> glove_path%EMBEDDING_DIM
Traceback (most recent call last):
File "<stdin>",line 1,in <module>
TypeError: not all arguments converted during string formatting
所以:不要在值不正确的变量上滥用 %
,这样你就不会得到那个错误。 (从上下文看来,违规行只是试图打开一个特定的现有文件 - 所以也许,只需使用普通的 glove_path
?)