我正在尝试在scikit-learn的pipeline对象中实现一个自定义管道步骤。该步骤使用VIF递归地消除特征。我参考了这里的代码:
class ReduceVIF(base.BaseEstimator, base.TransformerMixin):
    """Drop features whose variance inflation factor (VIF) exceeds ``thresh``.

    Recursively removes the highest-VIF column until every remaining column
    is below the threshold, so the transformer can sit inside a
    scikit-learn ``Pipeline`` to reduce multicollinearity.

    Parameters
    ----------
    thresh : float, default 10.0
        VIF cutoff. From looking at documentation, values between 5 and 10
        are "okay"; above 10 is too high and the column should be removed.
    """

    def __init__(self, thresh=10.0):
        self.thresh = thresh
        self.scaler = preprocessing.StandardScaler()

    def fit(self, X, y=None):
        """Learn which columns survive recursive VIF elimination.

        Bug fix: the original did ``X = self.scaler.fit(X)`` — ``fit``
        returns the *scaler itself*, not scaled data, so ``calculate_vif``
        was handed a StandardScaler (hence the AttributeError).  Use
        ``fit_transform`` and rebuild the DataFrame so column labels survive.
        """
        print("ReduceVIF fit")
        X_scaled = pd.DataFrame(
            self.scaler.fit_transform(X), columns=X.columns, index=X.index
        )
        X_reduced = ReduceVIF.calculate_vif(X_scaled, self.thresh)
        print(X_reduced)
        # Remember the surviving column labels so transform() can subset.
        self.predictors = X_reduced.columns
        return self

    def transform(self, X, y=None):
        """Scale ``X`` and keep only the columns selected during ``fit``.

        Bug fix: the scaler emits *all* original columns, so the scaled
        matrix must first be relabelled with the full column set and then
        subset to ``self.predictors`` (the original mislabelled the full
        matrix with the reduced column list, producing the
        "different columns per fold" error).
        """
        print("ReduceVIF transform")
        X_scaled = pd.DataFrame(
            self.scaler.transform(X), columns=X.columns, index=X.index
        )
        return X_scaled[self.predictors]

    @staticmethod
    def calculate_vif(X, thresh=10.0):
        """Repeatedly drop the highest-VIF column while it exceeds ``thresh``.

        Taken from https://stats.stackexchange.com/a/253620/53565 and
        modified.  The loop is capped at 16 iterations as a safety stop.
        """
        dropped = True
        count = 0
        while dropped and count <= 15:
            print(count)
            dropped = False
            vif = [
                variance_inflation_factor(X.values, X.columns.get_loc(var))
                for var in X.columns
            ]
            max_vif = max(vif)
            if max_vif > thresh:
                maxloc = vif.index(max_vif)
                print(f"Dropping {X.columns[maxloc]} with vif={max_vif}")
                X = X.drop([X.columns[maxloc]], axis=1)
                dropped = True
            count += 1
            print(X.shape)
        return X

我试着这样调用/创建一个管道
# create a feature preparation pipeline for a model
def make_finetuning_pipeline(model):
steps = list()
# standardization
#steps.append(('standardize', preprocessing.StandardScaler()))
steps.append(('remove_multicollinearity', ReduceVIF(thresh=10)))
#steps.append(("feature_selection", feature_selection.RFE(linear_model.LogisticRegression(penalty='l1', solver='liblinear'))))
# the model
steps.append(('model', model))
# create pipeline
_pipeline = pipeline.Pipeline(steps=steps)
return _pipeline但不知何故它不起作用,错误要么是每个文件夹有不同的列,要么是有一个属性错误。有没有人知道如何将VIF插入到scikit学习管道中?
以下是我放在 GitHub Gist 中的代码片段,以确保可重现性。
发布于 2021-10-02 03:46:39
回答我自己的问题:我花了半天时间调试,初步的可用版本如下所示——不是很优雅,但现在它按预期工作了。
class ReduceVIF(base.BaseEstimator, base.TransformerMixin):
def __init__(self, thresh=10):
# From looking at documentation, values between 5 and 10 are "okay".
# Above 10 is too high and so should be removed.
self.thresh = thresh
self.predictor_cols = [
"radius_mean",
"texture_mean",
"perimeter_mean",
"area_mean",
"smoothness_mean",
"compactness_mean",
"concavity_mean",
"concave points_mean",
"symmetry_mean",
"fractal_dimension_mean",
"radius_se",
"texture_se",
"perimeter_se",
"area_se",
"smoothness_se",
"compactness_se",
"concavity_se",
"concave points_se",
"symmetry_se",
"fractal_dimension_se",
"radius_worst",
"texture_worst",
"perimeter_worst",
"area_worst",
"smoothness_worst",
"compactness_worst",
"concavity_worst",
"concave points_worst",
"symmetry_worst",
"fractal_dimension_worst",
]
def reset(self):
self.predictor_cols = [
"radius_mean",
"texture_mean",
"perimeter_mean",
"area_mean",
"smoothness_mean",
"compactness_mean",
"concavity_mean",
"concave points_mean",
"symmetry_mean",
"fractal_dimension_mean",
"radius_se",
"texture_se",
"perimeter_se",
"area_se",
"smoothness_se",
"compactness_se",
"concavity_se",
"concave points_se",
"symmetry_se",
"fractal_dimension_se",
"radius_worst",
"texture_worst",
"perimeter_worst",
"area_worst",
"smoothness_worst",
"compactness_worst",
"concavity_worst",
"concave points_worst",
"symmetry_worst",
"fractal_dimension_worst",
]
def fit(self, X, y=None):
print("ReduceVIF fit")
tmp, self.predictor_cols = ReduceVIF.calculate_vif(X, self.predictor_cols, self.thresh)
col_index = [self.predictor_cols.index(col_name) for col_name in self.predictor_cols]
self.col_index = col_index
print("tmp", self.col_index)
self.reset()
return self
def transform(self, X, y=None):
print("ReduceVIF transform")
# columns = X.columns.tolist()
# print(X.shape)
return X[:, self.col_index]
@staticmethod
def calculate_vif(X, columns, thresh=10.0):
# Taken from https://stats.stackexchange.com/a/253620/53565 and modified
dropped = True
count = 0
while dropped and count <= 15:
column_index = X.shape[1]
predictor_cols = np.arange(X.shape[1])
dropped = False
print(count)
vif = []
for var in range(column_index):
# print(predictor_cols.shape)
vif.append(variance_inflation_factor(X[:, predictor_cols], var))
max_vif = max(vif)
if max_vif > thresh:
maxloc = vif.index(max_vif)
print(f"Dropping {maxloc} with vif={max_vif}")
# X = X.drop([X.columns.tolist()[maxloc]], axis=1)
X = np.delete(X, maxloc, axis=1)
columns.pop(maxloc)
dropped = True
count += 1
return X, columnshttps://stackoverflow.com/questions/69407796
复制相似问题