如何解码pandas DataFrame中经LabelEncoder编码的列?

问题描述

我有一个数据集。在特征工程阶段,我用下面的代码把分类(object类型)列转换成了数字:

import pandas as pd 
import numpy as np
from sklearn import preprocessing
# Load the training data, using the 'Id' column as the index.
df = pd.read_csv(r'train.csv',index_col='Id')
print(df.shape)
df.head()
# Split the columns into numeric ones and all remaining (non-numeric) ones.
colsNum = df.select_dtypes(np.number).columns
colsObj = df.columns.difference(colsNum)

# Fill numeric gaps with the floored column mean, and the remaining
# columns with their most frequent value (first row of mode()).
df[colsNum] = df[colsNum].fillna(df[colsNum].mean()//1)
df[colsObj] = df[colsObj].fillna(df[colsObj].mode().iloc[0])

# NOTE: a single encoder object is shared by every column; each
# fit_transform() call re-fits it, so after the loop it only holds the
# classes of the last column processed.
label_encoder = preprocessing.LabelEncoder() 
for col in colsObj:
    df[col] = label_encoder.fit_transform(df[col])
df.head()
# Every column is decoded with that last-fitted encoder, which is why the
# original values are not restored here.
for col in colsObj:
    df[col] = label_encoder.inverse_transform(df[col])
df.head()

但是这里的inverse_transform()并没有还原出原始数据集。请帮帮我!

解决方法

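问题在于:同一个LabelEncoder对象在循环中被每一列重新fit,循环结束后它只记得最后一列的类别,因此无法还原其他列。要正确还原,需要为每一列单独保存一个LabelEncoder,例如放在以列名为键的字典(defaultdict)中(注意:还原得到的是fillna填充之后的值,原先的NaN不会恢复):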

# The snippet below uses both `pd` and `np`; import them here so the answer
# runs on its own instead of relying on the imports from the question.
import numpy as np
import pandas as pd
from sklearn import preprocessing

# Load the training data, using the 'Id' column as the index.
df = pd.read_csv(r'train.csv', index_col='Id')

# Preview the raw data before any imputation or encoding.
print(df.shape)
print(df.head())
    MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
Id                                                                    
1           60       RL         65.0     8450   Pave   NaN      Reg   
2           20       RL         80.0     9600   Pave   NaN      Reg   
3           60       RL         68.0    11250   Pave   NaN      IR1   
4           70       RL         60.0     9550   Pave   NaN      IR1   
5           60       RL         84.0    14260   Pave   NaN      IR1   

   LandContour Utilities LotConfig  ... PoolArea PoolQC Fence MiscFeature  \
Id                                  ...                                     
1          Lvl    AllPub    Inside  ...        0    NaN   NaN         NaN   
2          Lvl    AllPub       FR2  ...        0    NaN   NaN         NaN   
3          Lvl    AllPub    Inside  ...        0    NaN   NaN         NaN   
4          Lvl    AllPub    Corner  ...        0    NaN   NaN         NaN   
5          Lvl    AllPub       FR2  ...        0    NaN   NaN         NaN   

   MiscVal MoSold  YrSold  SaleType  SaleCondition  SalePrice  
Id                                                             
1        0      2    2008        WD         Normal     208500  
2        0      5    2007        WD         Normal     181500  
3        0      9    2008        WD         Normal     223500  
4        0      2    2006        WD        Abnorml     140000  
5        0     12    2008        WD         Normal     250000  

[5 rows x 80 columns]

# Numeric columns vs. all remaining (non-numeric) columns.
colsNum = df.select_dtypes(include=np.number).columns
colsObj = df.columns.difference(colsNum)

# Impute: floored column mean for numeric columns, most frequent value
# (first row of mode()) for the others.
num_fill = df[colsNum].mean() // 1
df[colsNum] = df[colsNum].fillna(num_fill)
obj_fill = df[colsObj].mode().iloc[0]
df[colsObj] = df[colsObj].fillna(obj_fill)

from collections import defaultdict

# One LabelEncoder per column: looking up a new column name creates a fresh
# encoder, and the fitted encoder stays in `di` for decoding later.
di = defaultdict(preprocessing.LabelEncoder)
for col in colsObj:
    encoder = di[col]
    df[col] = encoder.fit_transform(df[col])

print(df.head())
    MSSubClass  MSZoning  LotFrontage  LotArea  Street  Alley  LotShape  \
Id                                                                        
1           60         3         65.0     8450       1      0         3   
2           20         3         80.0     9600       1      0         3   
3           60         3         68.0    11250       1      0         0   
4           70         3         60.0     9550       1      0         0   
5           60         3         84.0    14260       1      0         0   

    LandContour  Utilities  LotConfig  ...  PoolArea  PoolQC  Fence  \
Id                                     ...                            
1             3          0          4  ...         0       2      2   
2             3          0          2  ...         0       2      2   
3             3          0          4  ...         0       2      2   
4             3          0          0  ...         0       2      2   
5             3          0          2  ...         0       2      2   

    MiscFeature  MiscVal  MoSold  YrSold  SaleType  SaleCondition  SalePrice  
Id                                                                            
1             2        0       2    2008         8              4     208500  
2             2        0       5    2007         8              4     181500  
3             2        0       9    2008         8              4     223500  
4             2        0       2    2006         8              0     140000  
5             2        0      12    2008         8              4     250000  

[5 rows x 80 columns]

# Show the dictionary of encoders: one LabelEncoder entry per encoded column.
print (di)
defaultdict(<class 'sklearn.preprocessing._label.LabelEncoder'>,{'Alley': LabelEncoder(),'BldgType': LabelEncoder(),'BsmtCond': LabelEncoder(),'BsmtExposure': LabelEncoder(),'BsmtFinType1': LabelEncoder(),'BsmtFinType2': LabelEncoder(),'BsmtQual': LabelEncoder(),'CentralAir': LabelEncoder(),'Condition1': LabelEncoder(),'Condition2': LabelEncoder(),'Electrical': LabelEncoder(),'ExterCond': LabelEncoder(),'ExterQual': LabelEncoder(),'Exterior1st': LabelEncoder(),'Exterior2nd': LabelEncoder(),'Fence': LabelEncoder(),'FireplaceQu': LabelEncoder(),'Foundation': LabelEncoder(),'Functional': LabelEncoder(),'GarageCond': LabelEncoder(),'GarageFinish': LabelEncoder(),'GarageQual': LabelEncoder(),'GarageType': LabelEncoder(),'Heating': LabelEncoder(),'HeatingQC': LabelEncoder(),'HouseStyle': LabelEncoder(),'KitchenQual': LabelEncoder(),'LandContour': LabelEncoder(),'LandSlope': LabelEncoder(),'LotConfig': LabelEncoder(),'LotShape': LabelEncoder(),'MSZoning': LabelEncoder(),'MasVnrType': LabelEncoder(),'MiscFeature': LabelEncoder(),'Neighborhood': LabelEncoder(),'PavedDrive': LabelEncoder(),'PoolQC': LabelEncoder(),'RoofMatl': LabelEncoder(),'RoofStyle': LabelEncoder(),'SaleCondition': LabelEncoder(),'SaleType': LabelEncoder(),'Street': LabelEncoder(),'Utilities': LabelEncoder()})

# Decode each column with the encoder stored under that column's name in `di`.
for col in colsObj:
    fitted = di[col]
    df[col] = fitted.inverse_transform(df[col])

print(df.head())
    MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
Id                                                                    
1           60       RL         65.0     8450   Pave  Grvl      Reg   
2           20       RL         80.0     9600   Pave  Grvl      Reg   
3           60       RL         68.0    11250   Pave  Grvl      IR1   
4           70       RL         60.0     9550   Pave  Grvl      IR1   
5           60       RL         84.0    14260   Pave  Grvl      IR1   

   LandContour Utilities LotConfig  ... PoolArea PoolQC  Fence MiscFeature  \
Id                                  ...                                      
1          Lvl    AllPub    Inside  ...        0     Gd  MnPrv        Shed   
2          Lvl    AllPub       FR2  ...        0     Gd  MnPrv        Shed   
3          Lvl    AllPub    Inside  ...        0     Gd  MnPrv        Shed   
4          Lvl    AllPub    Corner  ...        0     Gd  MnPrv        Shed   
5          Lvl    AllPub       FR2  ...        0     Gd  MnPrv        Shed   

   MiscVal MoSold  YrSold  SaleType  SaleCondition  SalePrice  
Id                                                             
1        0      2    2008        WD         Normal     208500  
2        0      5    2007        WD         Normal     181500  
3        0      9    2008        WD         Normal     223500  
4        0      2    2006        WD        Abnorml     140000  
5        0     12    2008        WD         Normal     250000