# Some test data
np.random.seed(33454)
df = (
    # pd.concat is used instead of DataFrame.append, which was removed in
    # pandas 2.0; the frames are still built (and the random draws made)
    # in the same order as before.
    pd.concat(
        [
            # Bulk of the data: 20 uniformly drawn integers in [0, 100)
            pd.DataFrame({'nb': np.random.randint(0, 100, 20)}),
            # Adding some candidate outliers: 2 integers in [100, 200)
            pd.DataFrame({'nb': np.random.randint(100, 200, 2)}),
        ],
        # Resetting the index so the rows are labelled 0..21
        ignore_index=True,
    )
)

# Computing IQR
Q1 = df['nb'].quantile(0.25)
Q3 = df['nb'].quantile(0.75)
IQR = Q3 - Q1

# Filtering values between Q1 - 1.5*IQR and Q3 + 1.5*IQR (both bounds included)
filtered = df.query('(@Q1 - 1.5 * @IQR) <= nb <= (@Q3 + 1.5 * @IQR)')

然后我们可以绘制结果来检查差异。我们观察到左侧箱线图中的异常值(183处的十字)不再出现在过滤后的序列中。

# Plotting the raw and filtered series side by side to check the difference.
# Rows dropped by the filter have no match in the join, so they are simply
# missing from the 'nb_filtered' column.
side_by_side = df.join(filtered, rsuffix='_filtered')
side_by_side.boxplot()

在写下这个答案之后，我还就这个主题写了一篇文章，你可以在其中找到更多信息。

票数 61

Stack Overflow用户

发布于 2017-04-07 21:32:01

另一种使用Series.between()的方法

# Keep only the values of 'col' lying between its first and third quartiles.
# Series.between includes both endpoints by default; the boolean form
# `inclusive=True` is deprecated since pandas 1.3 and rejected by pandas 2.x,
# so the argument is simply omitted.
iqr = df['col'][df['col'].between(df['col'].quantile(.25), df['col'].quantile(.75))]

分步说明：

# Select the first quartile
q1 = df['col'].quantile(.25)

# Select the third quartile
q3 = df['col'].quantile(.75)

# Create a mask for the values in between q1 and q3. Both endpoints are
# included by default; the boolean `inclusive=True` is deprecated since
# pandas 1.3 and rejected by pandas 2.x, so the argument is omitted.
mask = df['col'].between(q1, q3)

# Filtering the initial dataframe with the mask
iqr = df.loc[mask, 'col']

票数 21

Stack Overflow用户

发布于 2016-10-31 19:44:57

下面的函数返回 df 的一个子集：其中 column 列的值落在 Q1 - whisker_width×IQR 与 Q3 + whisker_width×IQR 之间

def subset_by_iqr(df, column, whisker_width=1.5):
    """Return the rows of ``df`` whose ``column`` value is not an IQR outlier.

    A row is kept when its value lies in the closed interval
    ``[Q1 - whisker_width * IQR, Q3 + whisker_width * IQR]``, where Q1 and Q3
    are the 0.25 and 0.75 quantiles of ``df[column]`` and ``IQR = Q3 - Q1``.

    Args:
        df (pd.DataFrame): A pandas dataframe to subset; it is not modified.
        column (str): Name of the column the quartiles are computed from.
        whisker_width (float): Optional, multiple of the IQR by which the
            accepted range extends below Q1 and above Q3. Defaults to 1.5.

    Returns:
        pd.DataFrame: The rows of ``df`` whose ``column`` value lies inside
        the accepted range.
    """
    values = df[column]
    # Calculate Q1, Q3 and IQR
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    # Bounds of the accepted range, including the optional whiskers
    lower = q1 - whisker_width * iqr
    upper = q3 + whisker_width * iqr
    # Named `in_range` rather than `filter` so the builtin is not shadowed
    in_range = (values >= lower) & (values <= upper)
    return df.loc[in_range]

# Example with the whiskers set to 1.5 * IQR, as requested by the OP
df_filtered = subset_by_iqr(df, column='column_name', whisker_width=1.5)

票数 9

页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持

原文链接：

https://stackoverflow.com/questions/34782063

复制

相似问题

问如何在IQR中使用pandas过滤器
EN

回答 6

Stack Overflow用户

Stack Overflow用户

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问如何在IQR中使用pandas过滤器EN

回答 6

Stack Overflow用户

Stack Overflow用户

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问如何在IQR中使用pandas过滤器
EN