问题描述
我是一个初学者,想对IMDB数据集进行情感分类:先用TF-IDF向量化器提取特征,再用得到的TF-IDF向量训练一个用于二元分类的神经网络。作为预处理,我已经去除了停用词。
我尝试了以下几种方法,但每次都会遇到新的错误。请帮我找出实现下面这个需求的最佳方法:我想将TF-IDF向量化与神经网络结合起来,对IMDB评论进行二元情感分类。
我编写了以下函数来创建TF-IDF向量化器(vectorizer):
def Ngram_Vectorizer(reviews_train, reviews_test):
    """Turn raw review texts into word-bigram TF-IDF feature matrices.

    The vectorizer is fitted on the training reviews only; the fitted
    vocabulary is then used to transform both the training and test sets.

    Args:
        reviews_train: Training review texts, as accepted by
            ``TfidfVectorizer.fit``.
        reviews_test: Test review texts.

    Returns:
        A ``(train_matrix, test_matrix)`` tuple holding the results of
        ``TfidfVectorizer.transform`` (SciPy sparse matrices).
    """
    # ngram_range=(2, 2) keeps word bigrams only (no unigrams).
    tfidf = TfidfVectorizer(analyzer='word', ngram_range=(2, 2))
    tfidf.fit(reviews_train)
    # The former unused `feature_names = tfidf.get_feature_names()` lookup
    # was dropped: the result was never read, and that method no longer
    # exists in scikit-learn >= 1.2.
    reviews_train = tfidf.transform(reviews_train)
    reviews_test = tfidf.transform(reviews_test)
    return reviews_train, reviews_test
获得TF-IDF向量后,我将其传递给浅层神经网络,如下所示:
def NeuralNetwork(reviews_train, labels_train, reviews_test, labels_test):
    """Train a shallow binary classifier on TF-IDF features.

    Builds a Dense(256, relu) -> Dropout(0.2) -> Dense(1, sigmoid) model,
    trains it for 5 epochs with batch size 128 while validating on the test
    split, then prints the model summary.

    Args:
        reviews_train: Training feature matrix, one row per review.
        labels_train: Binary training labels aligned with ``reviews_train``.
        reviews_test: Validation feature matrix.
        labels_test: Binary validation labels aligned with ``reviews_test``.

    Returns:
        None.
    """
    model = tf.keras.Sequential([
        # input_shape describes ONE sample (no batch axis), i.e. the number
        # of TF-IDF features, not the shape of the whole training matrix.
        tf.keras.layers.Dense(
            256,
            input_shape=(reviews_train.shape[1],),
            activation='relu',
        ),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ])
    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    # NOTE(review): the features are passed through unchanged. When they are
    # the SciPy sparse matrices returned by Ngram_Vectorizer, this is the
    # call that raises the "indices ... is out of order" error quoted below.
    model.fit(
        reviews_train,
        labels_train,  # targets were missing from the original call
        validation_data=(reviews_test, labels_test),
        batch_size=128,
        epochs=5,
    )
    model.summary()
执行上述操作时,出现以下错误:
Traceback (most recent call last):
File "neuralnetwork_tfidf_classifier.py",line 139,in <module>
main()
File "neuralnetwork_tfidf_classifier.py",line 136,in main
NeuralNetwork(reviews_train,labels_test)
File "neuralnetwork_tfidf_classifier.py",line 72,in NeuralNetwork
model.fit(reviews_train,epochs = 5)
File "C:\Users\Administrator\AppData\Local\Programs\Python\python37\lib\site-packages\tensorflow\python\keras\engine\training.py",line 108,in _method_wrapper
return method(self,*args,**kwargs)
File "C:\Users\Administrator\AppData\Local\Programs\Python\python37\lib\site-packages\tensorflow\python\keras\engine\training.py",line 1063,in fit
steps_per_execution=self._steps_per_execution)
File "C:\Users\Administrator\AppData\Local\Programs\Python\python37\lib\site-packages\tensorflow\python\keras\engine\data_adapter.py",line 1117,in __init__
model=model)
File "C:\Users\Administrator\AppData\Local\Programs\Python\python37\lib\site-packages\tensorflow\python\keras\engine\data_adapter.py",line 573,in __init__
dataset = dataset_ops.DatasetV2.from_tensor_slices(inputs)
File "C:\Users\Administrator\AppData\Local\Programs\Python\python37\lib\site-packages\tensorflow\python\data\ops\dataset_ops.py",line 682,in from_tensor_slices
return TensorSliceDataset(tensors)
File "C:\Users\Administrator\AppData\Local\Programs\Python\python37\lib\site-packages\tensorflow\python\data\ops\dataset_ops.py",line 3003,in __init__
self._tensors = structure.to_batched_tensor_list(batched_spec,element)
File "C:\Users\Administrator\AppData\Local\Programs\Python\python37\lib\site-packages\tensorflow\python\data\util\structure.py",line 352,in to_batched_tensor_list
component),element_spec,line 326,in _to_tensor_list_helper
reduce_fn,zip(nest.flatten(element_spec),nest.flatten(element)),[])
File "C:\Users\Administrator\AppData\Local\Programs\Python\python37\lib\site-packages\tensorflow\python\data\util\structure.py",line 323,in reduce_fn
return encode_fn(state,spec,component)
File "C:\Users\Administrator\AppData\Local\Programs\Python\python37\lib\site-packages\tensorflow\python\data\util\structure.py",in <lambda>
component),element)
File "C:\Users\Administrator\AppData\Local\Programs\Python\python37\lib\site-packages\tensorflow\python\framework\sparse_tensor.py",line 346,in _to_batched_tensor_list
out_type=dtypes.variant)]
File "C:\Users\Administrator\AppData\Local\Programs\Python\python37\lib\site-packages\tensorflow\python\ops\gen_sparse_ops.py",line 498,in serialize_many_sparse
_ops.raise_from_not_ok_status(e,name)
File "C:\Users\Administrator\AppData\Local\Programs\Python\python37\lib\site-packages\tensorflow\python\framework\ops.py",line 6843,in raise_from_not_ok_status
six.raise_from(core._status_to_exception(e.code,message),None)
File "<string>",line 3,in raise_from
tensorflow.python.framework.errors_impl.InvalidArgumentError: indices[1] = [0,3105402] is out of order. Many sparse ops require sorted indices.
Use `tf.sparse.reorder` to create a correctly ordered copy.
[Op:SerializeManySparse]
然后,为了解决上述错误,我按照错误信息的提示使用了 tf.sparse.reorder(reviews_train)、tf.sparse.reorder(labels_train) 等,把传给神经网络的输入修改如下:
def NeuralNetwork(reviews_train, labels_train, reviews_test, labels_test):
    """Second attempt: same shallow model, inputs wrapped in tf.sparse.reorder.

    The model definition matches the first attempt; only the arguments given
    to ``model.fit`` differ. The signature and layer list were restored from
    that first listing because this copy had lost two parameters that the
    body still uses and had its layers collapsed into one.

    Args:
        reviews_train: Training feature matrix, one row per review.
        labels_train: Binary training labels aligned with ``reviews_train``.
        reviews_test: Validation feature matrix.
        labels_test: Binary validation labels aligned with ``reviews_test``.

    Returns:
        None.
    """
    model = tf.keras.Sequential([
        # input_shape describes ONE sample (no batch axis).
        tf.keras.layers.Dense(
            256,
            input_shape=(reviews_train.shape[1],),
            activation='relu',
        ),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ])
    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    # NOTE(review): tf.sparse.reorder rejects anything that is not a
    # tf.SparseTensor; the traceback that follows shows it raising
    # "TypeError: Input must be a SparseTensor." for these inputs.
    model.fit(
        tf.sparse.reorder(reviews_train),
        tf.sparse.reorder(labels_train),
        validation_data=(
            tf.sparse.reorder(reviews_test),
            tf.sparse.reorder(labels_test),
        ),
        epochs=5,
    )
    model.summary()
Traceback (most recent call last):
File "neuralnetwork_tfidf_classifier.py",in NeuralNetwork
model.fit(tf.sparse.reorder(reviews_train),epochs = 5)
File "C:\Users\Administrator\AppData\Local\Programs\Python\python37\lib\site-packages\tensorflow\python\ops\sparse_ops.py",line 823,in sparse_reorder
sp_input = _convert_to_sparse_tensor(sp_input)
File "C:\Users\Administrator\AppData\Local\Programs\Python\python37\lib\site-packages\tensorflow\python\ops\sparse_ops.py",line 71,in _convert_to_sparse_tensor
raise TypeError("Input must be a SparseTensor.")
TypeError: Input must be a SparseTensor.
第三,我尝试在TF-IDF输出上使用 todense(),如下所示:
def Ngram_Vectorizer(reviews_train, reviews_test):
    """Third attempt: bigram TF-IDF features converted to dense matrices.

    Identical to the first version except that each transformed matrix is
    densified with ``.todense()`` before being returned.

    Args:
        reviews_train: Training review texts, as accepted by
            ``TfidfVectorizer.fit``.
        reviews_test: Test review texts.

    Returns:
        A ``(train_matrix, test_matrix)`` tuple of dense matrices.
    """
    # ngram_range=(2, 2) keeps word bigrams only (no unigrams).
    tfidf = TfidfVectorizer(analyzer='word', ngram_range=(2, 2))
    tfidf.fit(reviews_train)
    # NOTE(review): todense() allocates every zero entry as well. For the
    # (50010, 3140004) float64 matrix reported in the traceback below that
    # is about 1.14 TiB, which is what triggers the MemoryError.
    reviews_train = tfidf.transform(reviews_train).todense()
    reviews_test = tfidf.transform(reviews_test).todense()
    return reviews_train, reviews_test
但是这又引发了以下错误:
Traceback (most recent call last):
File "neuralnetwork_tfidf_classifier.py",line 141,in <module>
main()
File "neuralnetwork_tfidf_classifier.py",line 129,in main
reviews_train,reviews_test= Ngram_Vectorizer(reviews_train,reviews_test)
File "neuralnetwork_tfidf_classifier.py",line 50,in Ngram_Vectorizer
reviews_train = tfidf.transform(reviews_train).todense()
File "C:\Users\Administrator\AppData\Local\Programs\Python\python37\lib\site-packages\scipy\sparse\base.py",line 847,in todense
return asmatrix(self.toarray(order=order,out=out))
File "C:\Users\Administrator\AppData\Local\Programs\Python\python37\lib\site-packages\scipy\sparse\compressed.py",line 1025,in toarray
out = self._process_toarray_args(order,out)
File "C:\Users\Administrator\AppData\Local\Programs\Python\python37\lib\site-packages\scipy\sparse\base.py",line 1185,in _process_toarray_args
return np.zeros(self.shape,dtype=self.dtype,order=order)
MemoryError: Unable to allocate 1.14 TiB for an array with shape (50010,3140004) and data type float64
任何对正确方向的帮助或指导都将非常有帮助。谢谢大家。
解决方法
最后一个错误意味着代码本身可以运行并进行训练,但是您的电脑内存不足(即使在大型机器上也经常发生):把形状为 (50010, 3140004) 的稀疏TF-IDF矩阵转换成稠密矩阵大约需要 1.14 TiB 内存。请先尝试只用数据集的一小部分来运行,看看是否有效。