问题描述
如何在保留整行数据的前提下让下面的代码正常工作?
我只想把属于离群值的那一个单元格替换为 NaN,而不是删除整行。
import pandas as pd
# Sample frame: the goal is to replace only the 42000 cell with NaN while
# keeping Alice's row.
df = pd.DataFrame(
    data={
        'user': ['Bob', 'Jane', 'Alice'],
        'income': [1, 1, 42000],
    }
)
def remove_outliers(df, remove_outliers_above_absolute_standard_deviation_of=3):
    """Replace outlier cells in the numeric columns of ``df`` with NaN.

    A cell is an outlier when the absolute z-score of its value within its
    column exceeds the threshold. Only the offending cells are blanked; rows
    and non-numeric columns are kept. Progress and the final frame are shown
    through ``display`` (the IPython/Jupyter helper).

    Note: with the population standard deviation used by ``scipy.stats.zscore``
    no value among ``n`` rows can have ``|z|`` above ``sqrt(n - 1)``, so a
    three-row frame never triggers the default threshold of 3.

    Args:
        df: Frame to clean. It is modified in place.
        remove_outliers_above_absolute_standard_deviation_of: Largest
            absolute z-score that is still accepted.

    Returns:
        The same ``df`` object, after replacement.
    """
    from scipy import stats

    threshold = remove_outliers_above_absolute_standard_deviation_of
    for col in df.select_dtypes(include=['number']).columns:
        # Compute the z-scores once and test both tails, as the threshold's
        # name ("absolute") implies.
        is_outlier = abs(stats.zscore(df[col])) > threshold
        display("Outliers found: " + str(int(is_outlier.sum())))
        # mask() returns an upcast copy with NaN at the flagged positions, so
        # integer columns become float without an in-place dtype change.
        df[col] = df[col].mask(is_outlier)
    display("Dataframe after outlier removal: ")
    display(df)
    return df
# Apply the outlier replacement to the sample frame defined above.
remove_outliers(df)
谢谢。
解决方法
# Work on the numeric columns only; non-numeric columns stay untouched.
numerics = df.select_dtypes("number")
# Per-column mean and sample standard deviation.
means, stds = numerics.mean(), numerics.std()
# A value is treated as an outlier when it is not strictly inside
# (mean - factor * std, mean + factor * std) for its column.
factor = 3
lower, upper = means - factor * stds, means + factor * stds
# Keep the in-range values and turn everything else into NaN. `where` returns
# a new, upcast frame, so integer columns become float without an in-place
# dtype change and no explicit `np.nan` is needed.
numerics = numerics.where(numerics.gt(lower) & numerics.lt(upper))
# Put the cleaned numeric columns back; all rows are preserved.
df[numerics.columns] = numerics
,
根据您的示例,这是有效的。
import pandas as pd
import numpy as np
# Sample frame from the question; "income" holds one extreme value.
df = pd.DataFrame(
    data={
        "user": ["Bob", "Jane", "Alice"],
        "income": [1, 1, 42000],
    }
)
display(df)
>>>
user income
0 Bob 1
1 Jane 1
2 Alice 42000
>>>
# Blank out every numeric value lying more than one sample standard deviation
# above or below its column mean.
for col in df.select_dtypes("number"):
    center, spread = df[col].mean(), df[col].std()
    is_outlier = (df[col] > center + spread) | (df[col] < center - spread)
    # mask() yields an upcast copy with NaN at the flagged positions; this
    # avoids the removed `np.NaN` alias and an in-place int -> float upcast.
    df[col] = df[col].mask(is_outlier)
display(df)
>>>
user income
0 Bob 1.0
1 Jane 1.0
2 Alice NaN
虽然它与 @Mustafa Aydin 的回答没有什么不同。因此,我怀疑您的数据存在某种类型问题,或者 `zscore` 没有产生您期望的结果。
使用 `zscore` 也会产生相同的结果:
from scipy import stats

# Same replacement driven by z-scores: values whose z-score within their
# column exceeds 1 are turned into NaN.
for col in df.select_dtypes("number"):
    # mask() yields an upcast copy with NaN at the flagged positions; this
    # avoids the removed `np.NaN` alias and an in-place int -> float upcast.
    df[col] = df[col].mask(stats.zscore(df[col]) > 1)
,
这应该有效:
def remove_outliers(df, max_outlier_std):
    """Replace outlier cells in the numeric columns of ``df`` with NaN.

    A value is an outlier when it lies more than ``max_outlier_std`` sample
    standard deviations above or below its column mean. Only the offending
    cells are blanked; rows and non-numeric columns are kept. For each numeric
    column the replaced values are shown through ``display`` (the
    IPython/Jupyter helper).

    Args:
        df: Frame to clean. It is modified in place.
        max_outlier_std: Allowed distance from the column mean, expressed in
            standard deviations.

    Returns:
        The same ``df`` object, after replacement.
    """
    for column in df.select_dtypes("number"):
        center = df[column].mean()
        reach = max_outlier_std * df[column].std()
        outlier_condition = (df[column] > center + reach) | (df[column] < center - reach)
        # Capture the offending values before they are overwritten.
        outliers_snapshot = (
            df.loc[outlier_condition, column].rename(f"{column} outliers").to_frame()
        )
        # mask() returns an upcast copy with NaN at the flagged positions, so
        # integer columns become float without an in-place dtype change.
        df[column] = df[column].mask(outlier_condition)
        display(outliers_snapshot)
    return df