文章/答案/技术大牛

发布

问如何在R中创建此变量？
EN

Stack Overflow用户

提问于 2020-05-04 23:31:39

回答 3查看 76关注 0票数 3

考虑以下使用R的测试数据集：

testdat<-data.frame("id"=c(rep(1,5),rep(2,5),rep(3,5)),
                    "period"=rep(seq(1:5),3),
                    "treat"=c(c(0,1,1,1,0),c(0,0,1,1,1),c(0,0,1,1,1)),
                    "state"=c(rep(0,5),c(0,1,1,1,1),c(0,0,0,1,1)),
                    "int"=c(rep(0,13),1,1))
testdat
   id period treat state int
1   1      1     0     0   0
2   1      2     1     0   0
3   1      3     1     0   0
4   1      4     1     0   0
5   1      5     0     0   0
6   2      1     0     0   0
7   2      2     0     1   0
8   2      3     1     1   0
9   2      4     1     1   0
10  2      5     1     1   0
11  3      1     0     0   0
12  3      2     0     0   0
13  3      3     1     0   0
14  3      4     1     1   1
15  3      5     1     1   1

前4个变量是我所拥有的，int是我想要的变量。它类似于treat和state之间的交互，但在第8-10行中包含1，这是不需要的。从本质上讲，我只想要在treat期间state发生变化时的交互，而不是其他情况。有没有关于如何创建它的想法(特别是对于有一百万个观察值的数据集的大规模)？

编辑:澄清为什么我想要这个措施。我想运行类似于下面的回归：

lm(outcome~treat+state+I(treat*state))

但是只有当treat跨越state的变化时，我才真正对交互感兴趣。如果我运行上面的回归，I(treat*state)汇集了我感兴趣的交互的影响，当treat完全为1时，state为1。理论上，我认为这会有两个不同的影响，所以我需要分解它们。我希望这是有意义的，我很乐意提供更多的细节。

panel-data

variables

回答 3

Stack Overflow用户

发布于 2020-05-05 00:16:30

我确信这在base R中是可能的，但这里是一个整理版本：

library(dplyr)
testdat %>%
  group_by(grp = cumsum(c(FALSE, diff(treat) > 0))) %>%
  mutate(int2 = +(state > 0 & first(state) == 0 & treat > 0)) %>%
  ungroup() %>%
  select(-grp)
# # A tibble: 15 x 6
#       id period treat state   int  int2
#    <dbl>  <int> <dbl> <dbl> <dbl> <int>
#  1     1      1     0     0     0     0
#  2     1      2     1     0     0     0
#  3     1      3     1     0     0     0
#  4     1      4     1     0     0     0
#  5     1      5     0     0     0     0
#  6     2      1     0     0     0     0
#  7     2      2     0     1     0     0
#  8     2      3     1     1     0     0
#  9     2      4     1     1     0     0
# 10     2      5     1     1     0     0
# 11     3      1     0     0     0     0
# 12     3      2     0     0     0     0
# 13     3      3     1     0     0     0
# 14     3      4     1     1     1     1
# 15     3      5     1     1     1     1

分组的替代逻辑使用游程编码，实际上是相同的(建议使用https://stackoverflow.com/a/35313426)：

testdat %>%
  group_by(grp = { yy <- rle(treat); rep(seq_along(yy$lengths), yy$lengths); }) %>%
  # ...

正如答案所示，我希望dplyr有一个与data.table的rleid等价的东西。期望的逻辑是能够按列中连续的相同值进行分组，但不能在所有行中使用相同的值。

testdat %>%
  group_by(grp = { yy <- rle(treat); rep(seq_along(yy$lengths), yy$lengths); }) %>%
  mutate(int2 = +(state > 0 & first(state) == 0 & treat > 0)) %>%
  ungroup()
# # A tibble: 15 x 7
#       id period treat state   int   grp  int2
#    <dbl>  <int> <dbl> <dbl> <dbl> <int> <int>
#  1     1      1     0     0     0     1     0
#  2     1      2     1     0     0     2     0
#  3     1      3     1     0     0     2     0
#  4     1      4     1     0     0     2     0
#  5     1      5     0     0     0     3     0
#  6     2      1     0     0     0     3     0
#  7     2      2     0     1     0     3     0
#  8     2      3     1     1     0     4     0
#  9     2      4     1     1     0     4     0
# 10     2      5     1     1     0     4     0
# 11     3      1     0     0     0     5     0
# 12     3      2     0     0     0     5     0
# 13     3      3     1     0     0     6     0
# 14     3      4     1     1     1     6     1
# 15     3      5     1     1     1     6     1

但这只是一厢情愿的想法。我想我也可以

my_rleid <- function(x) { yy <- rle(x); rep(seq_along(yy$lengths), yy$lengths); }
testdat %>%
  group_by(grp = my_rleid(treat)) %>%
  # ...

票数 3

Stack Overflow用户

发布于 2020-05-05 00:31:06

这是一个使用rle和ave的基本R方法。

r <- rle(testdat$treat)
r$values <- cumsum(r$values) + seq_along(r$values)
int2 <- +(ave(testdat$state, inverse.rle(r), FUN = function(x) x != x[1]) & testdat$treat == 1)
testdat <- cbind(testdat, int2)

testdat
#   id period treat state int int2
#1   1      1     0     0   0    0
#2   1      2     1     0   0    0
#3   1      3     1     0   0    0
#4   1      4     1     0   0    0
#5   1      5     0     0   0    0
#6   2      1     0     0   0    0
#7   2      2     0     1   0    0
#8   2      3     1     1   0    0
#9   2      4     1     1   0    0
#10  2      5     1     1   0    0
#11  3      1     0     0   0    0
#12  3      2     0     0   0    0
#13  3      3     1     0   0    0
#14  3      4     1     1   1    1
#15  3      5     1     1   1    1

计时

由于问题提到性能是一个问题，真实的用例数据集有100万行，下面是我的解决方案和r2evans的解决方案的时间。

将这两个解决方案都写成函数。

library(dplyr)

f1 <- function(X){
  r <- rle(X$treat)
  r$values <- cumsum(r$values) + seq_along(r$values)
  int2 <- +(ave(X$state, inverse.rle(r), FUN = function(x) x != x[1]) & testdat$treat == 1)
  cbind(X, int2)
}

f2 <- function(X){
  X %>%
    group_by(grp = cumsum(c(FALSE, diff(treat) > 0))) %>%
    mutate(int2 = +(state > 0 & first(state) == 0 & treat > 0)) %>%
    ungroup() %>%
    select(-grp)
}

需要多少份testdat。

log2(1e6/nrow(testdat))
#[1] 16.02468

df1 <- testdat
for(i in 1:15) df1 <- rbind(df1, df1)
nrow(df1)
#[1] 491520

那就是50万，应该足够测试了。

mb <- microbenchmark::microbenchmark(
  base = f1(df1),
  dplyr = f2(df1),
  times = 10
)

rm(df1)    # tidy up
print(mb, unit = "relative", order = "median")
#Unit: relative
#  expr      min       lq     mean   median       uq      max neval
#  base 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000    10
# dplyr 1.283237 1.359772 1.331494 1.369062 1.316815 1.256968    10

基础R解决方案的速度大约快36%。

票数 1

Stack Overflow用户

发布于 2020-05-05 00:51:38

另一个使用ave的基础版本。

当状态从0变为1时，testdat$treat & c(0, diff(testdat$state))==1转到TRUE。当两者都为1时，testdat$treat & testdat$state转到1。

testdat$int2 <- +ave(testdat$treat & c(0, diff(testdat$state))==1,
  cumsum(c(0, abs(diff(testdat$treat & testdat$state)))),
  FUN=function(x) rep(x[1], length(x)))
testdat
#   id period treat state int int2
#1   1      1     0     0   0    0
#2   1      2     1     0   0    0
#3   1      3     1     0   0    0
#4   1      4     1     0   0    0
#5   1      5     0     0   0    0
#6   2      1     0     0   0    0
#7   2      2     0     1   0    0
#8   2      3     1     1   0    0
#9   2      4     1     1   0    0
#10  2      5     1     1   0    0
#11  3      1     0     0   0    0
#12  3      2     0     0   0    0
#13  3      3     1     0   0    0
#14  3      4     1     1   1    1
#15  3      5     1     1   1    1

或者使用Reduce

testdat$int2 <- Reduce(function(x,y) {if(y==-1) 0 else if(x==1 || y==1) 1 else 0},
 (testdat$treat & c(0, diff(testdat$state))==1) -c(0, diff(testdat$treat &
  testdat$state) == -1), accumulate = TRUE)

计时(从@Rui-Barradas继续)：

f3 <- function(testdat) {cbind(testdat, int2=+ave(testdat$treat &
 c(0, diff(testdat$state))==1, cumsum(c(0, abs(diff(testdat$treat &
 testdat$state)))), FUN=function(x) rep(x[1], length(x))))}
f4 <- function(testdat) {cbind(testdat, int2=Reduce(function(x,y) {
 if(y==-1) 0 else if(x==1 || y==1) 1 else 0}, (testdat$treat & c(0,
 diff(testdat$state))==1) -c(0, diff(testdat$treat & testdat$state) == -1),
 accumulate = TRUE))}

microbenchmark::microbenchmark(base = f1(df1), dplyr = f2(df1),
 GKi1 = f3(df1), GKi2 = f4(df1), times = 10)
#Unit: milliseconds
#  expr       min        lq     mean    median        uq       max neval  cld
#  base 1132.7269 1188.7439 1233.106 1226.8532 1293.9901 1364.8358    10   c 
# dplyr 1376.0856 1436.4027 1466.418 1458.7240 1509.8990 1559.7976    10    d
#  GKi1  960.5438 1006.8803 1029.105 1022.6114 1065.7427 1074.6027    10  b  
#  GKi2  588.0484  667.2482  694.415  699.0845  739.5523  786.1819    10 a

票数 0

页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持

原文链接：

https://stackoverflow.com/questions/61595883

复制

相似问题

问如何在R中创建此变量？
EN

回答 3

Stack Overflow用户

Stack Overflow用户

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问如何在R中创建此变量？EN

回答 3

Stack Overflow用户

Stack Overflow用户

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问如何在R中创建此变量？
EN