文章/答案/技术大牛

发布

社区首页 >问答首页 >如何将递归VIF消除合并到scikit-learn的管道中？

问如何将递归VIF消除合并到scikit-learn的管道中？
EN

Stack Overflow用户

提问于 2021-10-01 14:42:52

回答 1查看 33关注 0票数 0

我正在尝试在scikit-learn的pipeline对象中实现一个自定义管道。管道是使用VIF递归消除功能。我参考了here的代码

class ReduceVIF(base.BaseEstimator, base.TransformerMixin):
    def __init__(self, thresh=10.0):
        # From looking at documentation, values between 5 and 10 are "okay".
        # Above 10 is too high and so should be removed.
        self.thresh = thresh
        
        self.scaler = preprocessing.StandardScaler()


    def fit(self, X, y=None):
        X_copy = X.copy()
        print("ReduceVIF fit")
        if hasattr(self, 'scaler'):
            X = self.scaler.fit(X)
        X = ReduceVIF.calculate_vif(X, self.thresh)
        print(X)
        self.predictors = X.columns
        return self

    def transform(self, X, y=None):
        print("ReduceVIF transform")
        columns = self.predictors
        if hasattr(self, 'scaler'):
            X = pd.DataFrame(self.scaler.transform(X), columns=columns)
        return X
        # return ReduceVIF.calculate_vif(X, self.thresh)

    @staticmethod
    def calculate_vif(X, thresh=10.0):
        # Taken from https://stats.stackexchange.com/a/253620/53565 and modified
        dropped = True
        count = 0
        while dropped and count <= 15:
            
            print(count)
            variables = X.columns

            dropped = False
            
            
            vif = [
                variance_inflation_factor(X[variables].values, X.columns.get_loc(var))
                for var in X.columns
            ]

            max_vif = max(vif)
            if max_vif > thresh:
                maxloc = vif.index(max_vif)
                print(f"Dropping {X.columns[maxloc]} with vif={max_vif}")
                X = X.drop([X.columns.tolist()[maxloc]], axis=1)
                dropped = True
                count +=1
            print(X.shape)
            
        return X

我试着这样调用/创建一个管道

# create a feature preparation pipeline for a model
def make_finetuning_pipeline(model):
    steps = list()
    # standardization
    #steps.append(('standardize', preprocessing.StandardScaler()))
    steps.append(('remove_multicollinearity', ReduceVIF(thresh=10)))
    #steps.append(("feature_selection", feature_selection.RFE(linear_model.LogisticRegression(penalty='l1', solver='liblinear'))))
    # the model
    steps.append(('model', model))
    # create pipeline
    _pipeline = pipeline.Pipeline(steps=steps)
    return _pipeline

但不知何故它不起作用，错误要么是每个文件夹有不同的列，要么是有一个属性错误。有没有人知道如何将VIF插入到scikit学习管道中？

以下是我在github gist中的代码片段，以确保可重现性。Github Gist

python

machine-learning

scikit-learn

pipeline

回答 1

Stack Overflow用户

发布于 2021-10-02 03:46:39

回答我自己的问题:我花了半天的时间尝试调试，初步的工作版本如下所示，不是很优雅，但现在它的工作是预期的。

class ReduceVIF(base.BaseEstimator, base.TransformerMixin):
    def __init__(self, thresh=10):
        # From looking at documentation, values between 5 and 10 are "okay".
        # Above 10 is too high and so should be removed.
        self.thresh = thresh
        self.predictor_cols = [
            "radius_mean",
            "texture_mean",
            "perimeter_mean",
            "area_mean",
            "smoothness_mean",
            "compactness_mean",
            "concavity_mean",
            "concave points_mean",
            "symmetry_mean",
            "fractal_dimension_mean",
            "radius_se",
            "texture_se",
            "perimeter_se",
            "area_se",
            "smoothness_se",
            "compactness_se",
            "concavity_se",
            "concave points_se",
            "symmetry_se",
            "fractal_dimension_se",
            "radius_worst",
            "texture_worst",
            "perimeter_worst",
            "area_worst",
            "smoothness_worst",
            "compactness_worst",
            "concavity_worst",
            "concave points_worst",
            "symmetry_worst",
            "fractal_dimension_worst",
        ]

    def reset(self):

        self.predictor_cols = [
            "radius_mean",
            "texture_mean",
            "perimeter_mean",
            "area_mean",
            "smoothness_mean",
            "compactness_mean",
            "concavity_mean",
            "concave points_mean",
            "symmetry_mean",
            "fractal_dimension_mean",
            "radius_se",
            "texture_se",
            "perimeter_se",
            "area_se",
            "smoothness_se",
            "compactness_se",
            "concavity_se",
            "concave points_se",
            "symmetry_se",
            "fractal_dimension_se",
            "radius_worst",
            "texture_worst",
            "perimeter_worst",
            "area_worst",
            "smoothness_worst",
            "compactness_worst",
            "concavity_worst",
            "concave points_worst",
            "symmetry_worst",
            "fractal_dimension_worst",
        ]

    def fit(self, X, y=None):
        print("ReduceVIF fit")
        tmp, self.predictor_cols = ReduceVIF.calculate_vif(X, self.predictor_cols, self.thresh)
        col_index = [self.predictor_cols.index(col_name) for col_name in self.predictor_cols]
        self.col_index = col_index
        print("tmp", self.col_index)
        self.reset()
        return self

    def transform(self, X, y=None):
        print("ReduceVIF transform")
        # columns = X.columns.tolist()
        # print(X.shape)
        return X[:, self.col_index]

    @staticmethod
    def calculate_vif(X, columns, thresh=10.0):
        # Taken from https://stats.stackexchange.com/a/253620/53565 and modified
        dropped = True
        count = 0
        while dropped and count <= 15:
            column_index = X.shape[1]
            predictor_cols = np.arange(X.shape[1])
            dropped = False
            print(count)

            vif = []
            for var in range(column_index):
                # print(predictor_cols.shape)
                vif.append(variance_inflation_factor(X[:, predictor_cols], var))

            max_vif = max(vif)
            if max_vif > thresh:
                maxloc = vif.index(max_vif)
                print(f"Dropping {maxloc} with vif={max_vif}")
                # X = X.drop([X.columns.tolist()[maxloc]], axis=1)
                X = np.delete(X, maxloc, axis=1)
                columns.pop(maxloc)
                dropped = True
                count += 1
        return X, columns

票数 0

页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持

原文链接：

https://stackoverflow.com/questions/69407796

复制

相似问题

问如何将递归VIF消除合并到scikit-learn的管道中？
EN

回答 1

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问如何将递归VIF消除合并到scikit-learn的管道中？EN

回答 1

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问如何将递归VIF消除合并到scikit-learn的管道中？
EN