问题描述
我有一个数据集。在做特征工程时,我用下面的代码把分类(object 类型)列转换成了数字:
import pandas as pd
import numpy as np
from sklearn import preprocessing
# Load the training data, using the "Id" column as the row index.
df = pd.read_csv(r'train.csv',index_col='Id')
print(df.shape)
df.head()

# Numeric columns vs. every other column (the ones that get label-encoded).
colsNum = df.select_dtypes(np.number).columns
colsObj = df.columns.difference(colsNum)

# Fill numeric gaps with the floored column mean, and the remaining columns
# with the first mode of each column.
df[colsNum] = df[colsNum].fillna(df[colsNum].mean()//1)
df[colsObj] = df[colsObj].fillna(df[colsObj].mode().iloc[0])

# NOTE: one encoder instance is shared by all columns. Each fit_transform()
# call refits it, so once this loop ends it only holds the classes of the
# last column processed.
label_encoder = preprocessing.LabelEncoder()
for col in colsObj:
    df[col] = label_encoder.fit_transform(df[col])
df.head()

# Decode attempt using that same shared encoder for every column; this is
# the step that does not restore the original values.
for col in colsObj:
    df[col] = label_encoder.inverse_transform(df[col])
df.head()
但是这里的 inverse_transform() 并没有还原出原始数据集。请帮帮我!
解决方法
为了让 inverse_transform() 正确工作,需要为每一列单独保存一个 LabelEncoder(例如放进以列名为键的字典里)。
如果所有列共用同一个编码器,每次 fit 都会覆盖之前学到的类别,最后只剩下最后一列的映射:
from sklearn import preprocessing
# Read the training set with "Id" as the index, then show its shape and
# the first rows.
df = pd.read_csv(r'train.csv', index_col='Id')
print(df.shape)
print(df.head())
MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape \
Id
1 60 RL 65.0 8450 Pave NaN Reg
2 20 RL 80.0 9600 Pave NaN Reg
3 60 RL 68.0 11250 Pave NaN IR1
4 70 RL 60.0 9550 Pave NaN IR1
5 60 RL 84.0 14260 Pave NaN IR1
LandContour Utilities LotConfig ... PoolArea PoolQC Fence MiscFeature \
Id ...
1 Lvl AllPub Inside ... 0 NaN NaN NaN
2 Lvl AllPub FR2 ... 0 NaN NaN NaN
3 Lvl AllPub Inside ... 0 NaN NaN NaN
4 Lvl AllPub Corner ... 0 NaN NaN NaN
5 Lvl AllPub FR2 ... 0 NaN NaN NaN
MiscVal MoSold YrSold SaleType SaleCondition SalePrice
Id
1 0 2 2008 WD Normal 208500
2 0 5 2007 WD Normal 181500
3 0 9 2008 WD Normal 223500
4 0 2 2006 WD Abnorml 140000
5 0 12 2008 WD Normal 250000
[5 rows x 80 columns]
# Split the columns into numeric ones and everything else.
colsNum = df.select_dtypes(np.number).columns
colsObj = df.columns.difference(colsNum)

# Fill numeric gaps with the floored column mean, and the remaining columns
# with the first mode of each column.
df[colsNum] = df[colsNum].fillna(df[colsNum].mean() // 1)
df[colsObj] = df[colsObj].fillna(df[colsObj].mode().iloc[0])

from collections import defaultdict

# One LabelEncoder per column: the defaultdict creates a fresh encoder the
# first time a column name is looked up, and keeps it under that name so the
# same fitted encoder can be used later to decode that column.
di = defaultdict(preprocessing.LabelEncoder)
for col in colsObj:
    df[col] = di[col].fit_transform(df[col])
print(df.head())
MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape \
Id
1 60 3 65.0 8450 1 0 3
2 20 3 80.0 9600 1 0 3
3 60 3 68.0 11250 1 0 0
4 70 3 60.0 9550 1 0 0
5 60 3 84.0 14260 1 0 0
LandContour Utilities LotConfig ... PoolArea PoolQC Fence \
Id ...
1 3 0 4 ... 0 2 2
2 3 0 2 ... 0 2 2
3 3 0 4 ... 0 2 2
4 3 0 0 ... 0 2 2
5 3 0 2 ... 0 2 2
MiscFeature MiscVal MoSold YrSold SaleType SaleCondition SalePrice
Id
1 2 0 2 2008 8 4 208500
2 2 0 5 2007 8 4 181500
3 2 0 9 2008 8 4 223500
4 2 0 2 2006 8 0 140000
5 2 0 12 2008 8 4 250000
[5 rows x 80 columns]
# Show the column-name -> encoder mapping built above.
print(di)
defaultdict(<class 'sklearn.preprocessing._label.LabelEncoder'>,{'Alley': LabelEncoder(),'BldgType': LabelEncoder(),'BsmtCond': LabelEncoder(),'BsmtExposure': LabelEncoder(),'BsmtFinType1': LabelEncoder(),'BsmtFinType2': LabelEncoder(),'BsmtQual': LabelEncoder(),'CentralAir': LabelEncoder(),'Condition1': LabelEncoder(),'Condition2': LabelEncoder(),'Electrical': LabelEncoder(),'ExterCond': LabelEncoder(),'ExterQual': LabelEncoder(),'Exterior1st': LabelEncoder(),'Exterior2nd': LabelEncoder(),'Fence': LabelEncoder(),'FireplaceQu': LabelEncoder(),'Foundation': LabelEncoder(),'Functional': LabelEncoder(),'GarageCond': LabelEncoder(),'GarageFinish': LabelEncoder(),'GarageQual': LabelEncoder(),'GarageType': LabelEncoder(),'Heating': LabelEncoder(),'HeatingQC': LabelEncoder(),'HouseStyle': LabelEncoder(),'KitchenQual': LabelEncoder(),'LandContour': LabelEncoder(),'LandSlope': LabelEncoder(),'LotConfig': LabelEncoder(),'LotShape': LabelEncoder(),'MSZoning': LabelEncoder(),'MasVnrType': LabelEncoder(),'MiscFeature': LabelEncoder(),'Neighborhood': LabelEncoder(),'PavedDrive': LabelEncoder(),'PoolQC': LabelEncoder(),'RoofMatl': LabelEncoder(),'RoofStyle': LabelEncoder(),'SaleCondition': LabelEncoder(),'SaleType': LabelEncoder(),'Street': LabelEncoder(),'Utilities': LabelEncoder()})
# Decode each column with the encoder stored under its own name in `di`,
# i.e. the one that was fitted on that column.
for col in colsObj:
    df[col] = di[col].inverse_transform(df[col])
print(df.head())
MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape \
Id
1 60 RL 65.0 8450 Pave Grvl Reg
2 20 RL 80.0 9600 Pave Grvl Reg
3 60 RL 68.0 11250 Pave Grvl IR1
4 70 RL 60.0 9550 Pave Grvl IR1
5 60 RL 84.0 14260 Pave Grvl IR1
LandContour Utilities LotConfig ... PoolArea PoolQC Fence MiscFeature \
Id ...
1 Lvl AllPub Inside ... 0 Gd MnPrv Shed
2 Lvl AllPub FR2 ... 0 Gd MnPrv Shed
3 Lvl AllPub Inside ... 0 Gd MnPrv Shed
4 Lvl AllPub Corner ... 0 Gd MnPrv Shed
5 Lvl AllPub FR2 ... 0 Gd MnPrv Shed
MiscVal MoSold YrSold SaleType SaleCondition SalePrice
Id
1 0 2 2008 WD Normal 208500
2 0 5 2007 WD Normal 181500
3 0 9 2008 WD Normal 223500
4 0 2 2006 WD Abnorml 140000
5 0 12 2008 WD Normal 250000