首页
学习
活动
专区
圈层
工具
发布
社区首页 >问答首页 >如何将递归VIF消除合并到scikit-learn的管道中?

如何将递归VIF消除合并到scikit-learn的管道中?
EN

Stack Overflow用户
提问于 2021-10-01 14:42:52
回答 1查看 33关注 0票数 0

我正在尝试为 scikit-learn 的 Pipeline 对象实现一个自定义步骤(transformer),该步骤利用 VIF(方差膨胀因子)递归地剔除特征。我参考了 here 的代码

代码语言:python
复制
class ReduceVIF(base.BaseEstimator, base.TransformerMixin):
    """Transformer intended to scale features and drop high-VIF columns.

    ``fit`` runs :meth:`calculate_vif` and stores the surviving column labels
    in ``self.predictors``; ``transform`` scales its input and labels the
    result with those columns.

    NOTE(review): this is the non-working version posted in the question; the
    inline NOTE(review) comments mark the likely causes of the reported errors.
    """

    def __init__(self, thresh=10.0):
        """Store the VIF threshold and create the scaler used by fit/transform."""
        # From looking at documentation, values between 5 and 10 are "okay".
        # Above 10 is too high and so should be removed.
        self.thresh = thresh
        
        self.scaler = preprocessing.StandardScaler()


    def fit(self, X, y=None):
        """Fit the scaler, run VIF elimination and record the kept columns.

        Returns ``self``. ``y`` is accepted but not used.
        """
        # NOTE(review): X_copy is never read after this line.
        X_copy = X.copy()
        print("ReduceVIF fit")
        # Always true here, because __init__ unconditionally sets self.scaler.
        if hasattr(self, 'scaler'):
            # NOTE(review): scikit-learn's ``fit`` returns the fitted estimator,
            # not transformed data, so X is presumably rebound to the scaler
            # here and the ``X.columns`` access in calculate_vif would raise
            # AttributeError — confirm. A DataFrame built from
            # ``fit_transform`` looks like what was intended.
            X = self.scaler.fit(X)
        X = ReduceVIF.calculate_vif(X, self.thresh)
        print(X)
        # Column labels that survived the elimination.
        self.predictors = X.columns
        return self

    def transform(self, X, y=None):
        """Scale ``X`` and return it as a DataFrame labelled with the kept columns."""
        print("ReduceVIF transform")
        columns = self.predictors
        if hasattr(self, 'scaler'):
            # NOTE(review): no column selection happens before scaling, so the
            # scaled array presumably still has every input column while
            # ``columns`` holds only the survivors; the DataFrame constructor
            # would then fail on the length mismatch whenever fit dropped a
            # column — confirm.
            X = pd.DataFrame(self.scaler.transform(X), columns=columns)
        return X
        # return ReduceVIF.calculate_vif(X, self.thresh)

    @staticmethod
    def calculate_vif(X, thresh=10.0):
        """Repeatedly drop the column with the largest VIF above ``thresh``.

        ``X`` must expose ``.columns``, ``.values`` and ``.drop`` (presumably a
        pandas DataFrame). The loop ends when a pass drops nothing or once
        ``count`` exceeds 15. Returns the reduced ``X``.
        """
        # Taken from https://stats.stackexchange.com/a/253620/53565 and modified
        dropped = True
        count = 0
        while dropped and count <= 15:
            
            print(count)
            variables = X.columns

            dropped = False
            
            
            # One VIF per remaining column, in column order.
            vif = [
                variance_inflation_factor(X[variables].values, X.columns.get_loc(var))
                for var in X.columns
            ]

            max_vif = max(vif)
            if max_vif > thresh:
                # Position of the worst column (first one on ties).
                maxloc = vif.index(max_vif)
                print(f"Dropping {X.columns[maxloc]} with vif={max_vif}")
                X = X.drop([X.columns.tolist()[maxloc]], axis=1)
                dropped = True
                count +=1
            print(X.shape)
            
        return X

我试着这样调用/创建一个管道

代码语言:python
复制
# create a feature preparation pipeline for a model
def make_finetuning_pipeline(model):
    """Return a Pipeline that applies ``ReduceVIF(thresh=10)`` and then ``model``."""
    # Steps that are currently switched off:
    #   ('standardize', preprocessing.StandardScaler())
    #   ("feature_selection", feature_selection.RFE(linear_model.LogisticRegression(penalty='l1', solver='liblinear')))
    stages = [
        ('remove_multicollinearity', ReduceVIF(thresh=10)),
        ('model', model),
    ]
    return pipeline.Pipeline(steps=stages)

但不知何故它不起作用:报错要么是交叉验证的每一折(fold)得到的列不一致,要么是属性错误(AttributeError)。有没有人知道如何把 VIF 特征筛选接入 scikit-learn 的管道?

以下是我在github gist中的代码片段,以确保可重现性。Github Gist

EN

回答 1

Stack Overflow用户

发布于 2021-10-02 03:46:39

回答我自己的问题:我花了半天时间调试,下面是一个初步可用的版本,不算优雅,但目前能按预期工作。

代码语言:python
复制
class ReduceVIF(base.BaseEstimator, base.TransformerMixin):
    """Drop multicollinear feature columns by iterative VIF elimination.

    ``fit`` repeatedly removes the column with the largest variance inflation
    factor (VIF) until no VIF exceeds ``thresh`` (or the drop cap of
    :meth:`calculate_vif` is reached) and stores the positions of the
    surviving columns in ``col_index``. ``transform`` selects exactly those
    positions, so train and test data of one fit always end up with the same
    columns.

    Parameters
    ----------
    thresh : float, default=10
        A column is dropped while the largest VIF is greater than this value.

    Attributes
    ----------
    predictor_cols : list of str
        Default feature names. Kept for backward compatibility; ``fit`` works
        on column positions and no longer reads or mutates this list.
    col_index : list of int
        Positions, relative to the array given to ``fit``, of the columns that
        survived. Only available after ``fit``.
    """

    def __init__(self, thresh=10):
        # From looking at documentation, values between 5 and 10 are "okay".
        # Above 10 is too high and so should be removed.
        self.thresh = thresh
        self.predictor_cols = self._default_predictor_cols()

    @staticmethod
    def _default_predictor_cols():
        """Return a fresh list of the 30 default feature names."""
        measurements = (
            "radius",
            "texture",
            "perimeter",
            "area",
            "smoothness",
            "compactness",
            "concavity",
            "concave points",
            "symmetry",
            "fractal_dimension",
        )
        # Same order as the original literal: all *_mean, then *_se, then *_worst.
        return [
            f"{name}_{suffix}"
            for suffix in ("mean", "se", "worst")
            for name in measurements
        ]

    def reset(self):
        """Restore ``predictor_cols`` to the default feature names."""
        self.predictor_cols = self._default_predictor_cols()

    def fit(self, X, y=None):
        """Determine which column positions of ``X`` survive VIF elimination.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : ignored

        Returns
        -------
        self
        """
        print("ReduceVIF fit")
        X = np.asarray(X)
        # Track columns by their ORIGINAL position. The previous version looked
        # up each surviving name inside the already-reduced name list, which
        # always produced [0, 1, ..., k-1] and made transform() return the
        # first k columns instead of the surviving ones.
        original_positions = list(range(X.shape[1]))
        _, kept_positions = ReduceVIF.calculate_vif(
            X, original_positions, self.thresh
        )
        self.col_index = kept_positions
        print("tmp", self.col_index)
        return self

    def transform(self, X, y=None):
        """Return only the columns of ``X`` selected during ``fit``."""
        print("ReduceVIF transform")
        return np.asarray(X)[:, self.col_index]

    @staticmethod
    def calculate_vif(X, columns, thresh=10.0, max_drops=16):
        """Iteratively drop the column with the largest VIF above ``thresh``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        columns : sequence
            One label per column of ``X`` (names or positions). The caller's
            sequence is not modified.
        thresh : float, default=10.0
            Elimination stops once the largest VIF is not greater than this.
        max_drops : int, default=16
            Upper bound on the number of dropped columns (16 matches the
            former hard-coded ``count <= 15`` loop guard).

        Returns
        -------
        (X_reduced, kept_columns) : (numpy.ndarray, list)

        Raises
        ------
        ValueError
            If ``columns`` does not have one entry per column of ``X``.
        """
        # Taken from https://stats.stackexchange.com/a/253620/53565 and modified
        X = np.asarray(X)
        # Work on a copy so the caller's label list is left untouched.
        columns = list(columns)
        if X.ndim != 2 or len(columns) != X.shape[1]:
            raise ValueError(
                "columns must contain exactly one label per column of X"
            )

        count = 0
        # VIF regresses one column on the others, so stop when fewer than two
        # columns remain (this also avoids max() on an empty list).
        while count < max_drops and X.shape[1] > 1:
            print(count)
            vif = [
                variance_inflation_factor(X, position)
                for position in range(X.shape[1])
            ]
            max_vif = max(vif)
            # Written as "not >" so a NaN VIF ends the loop, as it did before.
            if not max_vif > thresh:
                break
            maxloc = vif.index(max_vif)
            print(f"Dropping {maxloc} with vif={max_vif}")
            X = np.delete(X, maxloc, axis=1)
            columns.pop(maxloc)
            count += 1
        return X, columns
票数 0
EN
页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持
原文链接:

https://stackoverflow.com/questions/69407796

复制
相关文章

相似问题

领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档