问题描述
我正在尝试对数据框（DataFrame）中某一列的值进行词干提取，并据此生成分类列，但运行下面的代码时出现如下错误：
ValueError: Length of values does not match length of index（值的长度与索引的长度不匹配）
from nltk.stem import SnowballStemmer

# The stemmer must exist before the loop below uses it; the class name is
# case-sensitive (``SnowballStemmer``, not ``snowballstemmer``).
english_stemmer = SnowballStemmer('english')

# Print the stem of each keyword to see which stems the stemmer produces
# for the words of interest.
words = ['wedding', 'property', 'house', 'university', 'education', 'car']
for word in words:
    print(english_stemmer.stem(word))
queries = data['purpose']

# Maps a stem produced by ``english_stemmer`` to the category it signals.
_STEM_TO_CATEGORY = {
    'wed': 'wedding',
    'properti': 'property',
    'car': 'car',
    'hous': 'house',
    'univers': 'education',
    'educ': 'education',
}


def purpose_category(query):
    """Return the category for a single purpose string.

    Each whitespace-separated word of ``query`` is stemmed with
    ``english_stemmer``; the category of the first word whose stem is a
    known keyword is returned.

    Args:
        query: One value of the ``purpose`` column. ``Series.apply`` calls
            this function once per row with that row's value.

    Returns:
        One of 'wedding', 'property', 'car', 'house' or 'education', or
        'real estate' when no word matches or ``query`` is not a string.
    """
    # Non-string cells have no ``split`` method; give them the fallback
    # category instead of raising.
    if not isinstance(query, str):
        return 'real estate'
    for word in query.split():
        category = _STEM_TO_CATEGORY.get(english_stemmer.stem(word))
        if category is not None:
            return category
    # Fall back only after every word has been checked, not after the first
    # non-matching word.
    return 'real estate'


# Create the column in a single assignment from the result of ``apply``.
# Creating it first with ``data['category'] = []`` raises
# "ValueError: Length of values does not match length of index" on a
# non-empty frame, because a column must have exactly one value per row.
data['category'] = queries.apply(purpose_category)
解决方法
用正则表达式解决这个问题:
# Keyword fragment -> category. The fragments are matched as plain
# substrings of the purpose text (case-sensitive), so no stemmer is needed.
dict_map = {
    'wed': 'wedding',
    'properti': 'property',
    'car': 'car',
    'hous': 'house',
    'univers': 'education',
    'educ': 'education',
}
regexp_pattern = '|'.join(dict_map)
data['category'] = (
    data['purpose']
    # ``str.extract`` requires a capture group, so the alternation is wrapped
    # in parentheses; ``expand=False`` returns a Series (not a one-column
    # DataFrame) so that ``.map(dict_map)`` performs a dictionary lookup.
    .str.extract('(' + regexp_pattern + ')', expand=False)
    .map(dict_map)
    # Rows where no fragment was found come back as NaN.
    .fillna('real estate')
)
,
如果对你有帮助的话，下面是我的解决方法。看来我们做的是同一个项目。
import pandas as pd
from nltk.stem import SnowballStemmer
# Module-level English Snowball stemmer, used by purpose_category_eng below.
english_stemmer = SnowballStemmer('english')
# Load the data set into a DataFrame; the file name is presumably a
# placeholder for the real CSV path.
credit_worthiness = pd.read_csv('some_file_here.csv')
def purpose_category_eng(queries):
    """Return the category for one purpose string.

    The text is split on single spaces, each piece is stemmed with
    ``english_stemmer``, and the category of the first piece whose stem is
    a known keyword is returned.

    Args:
        queries: The purpose text of a single row.

    Returns:
        'housing', 'car purchase', 'education', 'wedding', 'real estate' or
        'property'; 'unknown' when no piece matches.
    """
    stem_to_category = {
        'hous': 'housing',
        'car': 'car purchase',
        'educ': 'education',
        'univers': 'education',
        'wed': 'wedding',
        'estat': 'real estate',
        'properti': 'property',
    }
    for token in queries.split(' '):
        label = stem_to_category.get(english_stemmer.stem(token))
        if label is not None:
            return label
    return 'unknown'
# Categorise every value of the 'purpose' column and store the results in
# the 'purpose_category' column.
credit_worthiness['purpose_category'] = credit_worthiness['purpose'].apply(purpose_category_eng)