from sklearn import datasets
import pandas as pd
import numpy as np
# Load the scikit-learn diabetes dataset and assemble one table holding the
# feature columns followed by the regression target.
dt = datasets.load_diabetes()
data = pd.DataFrame(
    data=np.column_stack((dt['data'], dt['target'])),
    columns=dt['feature_names'] + ['target'],
).drop(columns='sex')  # 'sex' is removed before the outlier analysis
# mean +- 2sigma
# function to calculate outlier of a variable
def out1(x):
    """Flag the outliers of a variable using the mean +/- 2*sigma rule.

    A value counts as an outlier when it is greater than or equal to
    ``mean + 2*std`` or less than or equal to ``mean - 2*std``.  The
    standard deviation is the one returned by ``np.std``.

    Parameters
    ----------
    x : array-like of numbers
        The observations of one variable (e.g. a DataFrame column).

    Returns
    -------
    list of int
        One entry per observation, in order: 1 for an outlier, 0 otherwise.
    """
    mu = np.average(x)
    sigma = np.std(x)
    LL = mu - 2 * sigma  # Lower limit
    UL = mu + 2 * sigma  # Upper limit
    # Compare the whole array at once instead of looping element by element.
    values = np.asarray(x)
    flags = (values >= UL) | (values <= LL)
    # Return plain Python ints in a list, as callers sum the flags.
    return flags.astype(int).tolist()
# Count the outliers in each variable: out1 is applied to `data` to get the
# 0/1 outlier flags, and the flags are then summed and printed.
print(data.apply(out1).apply(sum))
# Function to Replace outlier with LL / UL
def out_impute(x):
    """Replace the outliers of a variable with the nearest limit.

    The limits are ``mean - 2*std`` (lower) and ``mean + 2*std`` (upper),
    with the standard deviation as returned by ``np.std``.  Values below
    the lower limit are replaced by the lower limit, values above the upper
    limit by the upper limit; all other values are left unchanged.

    Parameters
    ----------
    x : array-like of numbers
        The observations of one variable (e.g. a DataFrame column).

    Returns
    -------
    Same kind as ``x`` when it provides a ``clip`` method (pandas Series,
    NumPy array), otherwise a float NumPy array.  The input itself is not
    modified.
    """
    mu = np.average(x)
    sigma = np.std(x)
    LL = mu - 2 * sigma  # Lower limit
    UL = mu + 2 * sigma  # Upper limit
    # Series and ndarrays already provide ``clip``; anything else (such as a
    # plain list) is converted to a float array first.
    values = x if hasattr(x, "clip") else np.asarray(x, dtype=float)
    # clip returns a new object, so the caller's data is left untouched.
    xnew = values.clip(LL, UL)
    return xnew
data1 = data.apply(out_impute) # Create new data with imputed values

请问有人能帮我看看如何用下限(LL)和上限(UL)来替换离群点吗?
我将离群点定义为 >= mu + 2*sigma 或 <= mu - 2*sigma 的值。我在代码中定义了函数 `out_impute`,但在替换部分卡住了。
提前感谢!
发布于 2018-09-14 02:04:25
使用df.clip
LL = mu - 2*sigma # Lower limit
UL = mu + 2*sigma # Upper limit
df['data'].clip(LL, UL)
https://stackoverflow.com/questions/52324026
复制相似问题