问题描述
我正在尝试使用自定义的 Gensim Word2Vec 嵌入来生成文本,想用它替换原来的 GloVe 嵌入。
代码:
glove_path = "/content/drive/MyDrive/Dataset/Bangla AI/custom_bangla_embedding.txt"  # path to the custom Word2Vec embedding file (variable name is historical: originally pointed at a GloVe file)
BATCH_SIZE = 64 # number of examples processed per training step
LATENT_DIM = 200 # size of the RNN hidden state/vector
EMbedDING_DIM = 1000 # dimensionality of the word embeddings; assumes it matches the vector width in glove_path — TODO confirm
MAX_VOCAB_SIZE = 30000 # maximum number of words kept by the tokenizer
VALIDATION_SPLIT = 0.2 # fraction of the dataset held out for validation
class SequenceGenerator():
    """Tokenizes parallel input/target text lines, builds an embedding matrix
    from the vector file at ``glove_path``, pads both sides to a common
    length, and one-hot encodes the targets for seq2seq training.

    Call order: ``prepare_sequences()`` first (it fits the tokenizer and
    invokes ``initialize_embeddings()``), then ``one_hot_encoding()``.
    """

    def __init__(self, input_lines, target_lines, max_seq_len=None,
                 max_vocab_size=10000, embedding_dim=200):
        self.input_lines = input_lines
        self.target_lines = target_lines
        self.MAX_SEQ_LEN = max_seq_len
        self.MAX_VOCAB_SIZE = max_vocab_size
        self.EMbedDING_DIM = embedding_dim

    def initialize_embeddings(self):
        """Load word vectors from ``glove_path`` and build the embedding matrix.

        Requires ``self.word2idx`` (set by ``prepare_sequences``). Words with
        no pretrained vector keep an all-zero row.
        """
        self.word2vec = {}
        # BUG FIX: the original did `open(glove_path % self.EMbedDING_DIM)`,
        # which raises "TypeError: not all arguments converted during string
        # formatting" because glove_path contains no %-placeholder. The path
        # already names one specific embedding file, so open it as-is.
        with open(glove_path, 'r', encoding='utf-8') as file:
            for line in file:
                vectors = line.split()
                self.word2vec[vectors[0]] = np.asarray(vectors[1:], dtype="float32")
        # +1 because Keras word indices start at 1; row 0 stays the padding row.
        self.num_words = min(self.MAX_VOCAB_SIZE, len(self.word2idx) + 1)
        self.embeddings_matrix = np.zeros((self.num_words, self.EMbedDING_DIM))
        for word, idx in self.word2idx.items():
            # BUG FIX: was `idx <= self.num_words`, which lets idx == num_words
            # through and raises IndexError (valid rows are 0 .. num_words-1).
            if idx < self.num_words:
                word_embeddings = self.word2vec.get(word)
                if word_embeddings is not None:
                    self.embeddings_matrix[idx] = word_embeddings
        # reverse mapping for decoding predictions back to words
        self.idx2word = {v: k for k, v in self.word2idx.items()}

    def prepare_sequences(self, filters=''):
        """Fit the tokenizer, build the embeddings, tokenize and pad both sides."""
        # BUG FIX: the original hard-coded filters='' and ignored the `filters`
        # parameter; pass it through (default '' preserves prior behavior).
        self.tokenizer = Tokenizer(num_words=self.MAX_VOCAB_SIZE, filters=filters)
        self.tokenizer.fit_on_texts(self.input_lines + self.target_lines)
        # get the word-index mapping and initialize embeddings
        self.word2idx = self.tokenizer.word_index
        self.initialize_embeddings()
        # tokenize the input and target lines
        self.input_sequences = self.tokenizer.texts_to_sequences(self.input_lines)
        self.target_sequences = self.tokenizer.texts_to_sequences(self.target_lines)
        # BUG FIX: the original took max(len(...)) over the raw text lines —
        # a *character* count when the lines are strings — not token counts.
        # Measure the tokenized sequences instead.
        max_seq_len = max(map(len, self.input_sequences + self.target_sequences))
        if self.MAX_SEQ_LEN:
            self.MAX_SEQ_LEN = min(self.MAX_SEQ_LEN, max_seq_len)
        else:
            self.MAX_SEQ_LEN = max_seq_len
        # pad the sequences
        self.input_sequences = pad_sequences(self.input_sequences, maxlen=self.MAX_SEQ_LEN, padding="post")
        # BUG FIX: targets were padded without maxlen, so their width could
        # differ from MAX_SEQ_LEN and break one_hot_encoding's fixed shape.
        self.target_sequences = pad_sequences(self.target_sequences, maxlen=self.MAX_SEQ_LEN, padding="post")
        print("1st input sequence: ", self.input_sequences[0])
        print("1st target sequence: ", self.target_sequences[0])

    def one_hot_encoding(self):
        """Create the one-hot encoding for the target sequences.

        Produces ``self.one_hot_targets`` of shape
        (num target lines, MAX_SEQ_LEN, num_words).
        """
        self.one_hot_targets = np.zeros(
            (len(self.target_sequences), self.MAX_SEQ_LEN, self.num_words))
        for seq_idx, seq in enumerate(self.target_sequences):
            for word_idx, word_id in enumerate(seq):
                # id 0 is padding; its row stays all-zero
                if word_id > 0:
                    self.one_hot_targets[seq_idx, word_idx, word_id] = 1

    def get_closest_word(self, word_vec):
        """
        Find the nearest word to the provided vector, by cosine distance.

        Parameters:
            word_vec (np.array): a vector of size EMbedDING_DIM
        Returns:
            str: the closest word to the provided vector ("NULL" if the
                 vocabulary is empty)
        """
        # renamed from the misleading `max_dist`: this tracks the minimum
        min_dist = float("inf")
        closest_word = "NULL"
        # iterate over all the words and keep the one at minimum distance
        for word, vec in self.word2vec.items():
            dist = spatial.distance.cosine(word_vec, vec)
            if dist < min_dist:
                min_dist = dist
                closest_word = word
        return closest_word
创建类的对象:
```sg_obj = SequenceGenerator(input_lines, target_lines,
                           max_vocab_size=MAX_VOCAB_SIZE, embedding_dim=EMbedDING_DIM)```
# prepare the input & target sequences
```sg_obj.prepare_sequences()```
# create the One-hot encoding on the target sequences
```sg_obj.one_hot_encoding()```
# make sure the tokenized words contains <sos> & <eos>
```assert '<sos>' in sg_obj.word2idx
assert '<eos>' in sg_obj.word2idx```
But getting the following error:
TypeError                                 Traceback (most recent call last)
```<ipython-input-36-3edfcb198239> in <module>()
133
134 # prepare the input & target sequences
--> 135 sg_obj.prepare_sequences()
136 # create the One-hot encoding on the target sequences
137 sg_obj.one_hot_encoding()```
```1 frames
<ipython-input-36-3edfcb198239> in initialize_embeddings(self)
29 # load the word embeddings
30 self.word2vec = {}
---> 31 with open(glove_path%self.EMbedDING_DIM,'r') as file:
32 for line in file:
33 vectors = line.split()```
```TypeError: not all arguments converted during string formatting
寻求帮助。提前致谢。
解决方法
Python 中的 %
运算符用于通过“插值”进行字符串格式化。它期望字符串(其左侧参数)具有特定格式,并且值(其右侧参数)是匹配的序列(或 dict)。见:
https://docs.python.org/3/library/stdtypes.html#old-string-formatting
您传递了这个不正确的参数,因此遇到了这个错误。您所有其他代码中的任何内容都与错误无关。例如,这足以触发完全相同的错误:
>>> glove_path = 'foo'
>>> EMBEDDING_DIM = 1000
>>> glove_path%EMBEDDING_DIM
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
TypeError: not all arguments converted during string formatting
所以:不要在值不正确的变量上滥用 %
,这样你就不会得到那个错误。 (从上下文看来,违规行只是试图打开一个特定的现有文件 - 所以也许,只需使用普通的 glove_path
?)