数据看上去像是
cum_ft source
125.4585 imds
125.4585 imds
125.4585 imds
125.4585 imds
125.4585 imds
125.4585 imds
123.1018 imds
125.4585 imds
125.4585 imds
125.4585 imds
0.0000 maint
0.0000 maint
0.0000 maint
126.7622 imds
126.7622 imds
126.7622 imds 目标是将maint的值设置为imd的最后一个值。
cum_ft source
123.1018 imds
125.4585 imds
125.4585 imds
125.4585 imds
125.4585 imds
125.4585 imds
125.4585 imds
125.4585 imds
125.4585 imds
125.4585 imds
125.4585 maint
125.4585 maint
125.4585 maint
126.7622 imds
126.7622 imds
126.7622 imds 我在努力,但没有成功,就像
maint_rows_to_change = which(temp_df$source=="maint")
diff_maint_row_to_change = diff(maint_rows_to_change)
imds_rows_with_data = which(temp_df$source=="imds")
diff_imds_row_to_change = diff(imds_rows_with_data)
rows_to_change_increment = which(diff_update_row > 1)此时,当有imsl数据要跳过时,diff_maint_row_to_change的数字大于一个,当连续的维护行必须调整时,值大于一个。调整是将维护行的cum_ft值设置为imsl数据的最后一个值。
我想要写的是类似下面的表达式,但我不清楚如何提出last_imds_row。在本例中,maint_rows_to_change = c(11,12,13)和last_imds_row = c(10,10,10)。
temp_df$cum_ft[maint_rows_to_change] = temp_df$cum_ft[last_imds_row]我也尝试了一个循环,取得了一些成功,但花费的时间太长了。
fun1 <- function(z) {
z$cum_ft_cor = z$cum_ft
rows_to_fix = which(z$source=="maint")
z$cum_ft_cor[rows_to_fix]=-1
for(i in rows_to_fix) {
z$cum_ft_cor[i] <- z$cum_ft_cor[i-1]
}
z
}
temp_df_2 = fun1(temp_df)发布于 2014-08-26 21:18:40
一种选择是使用Rcpp包使循环解决方案更快:
library(Rcpp)
copyDat <- cppFunction(
'void copyDat(NumericVector x, std::vector<std::string> y) {
for (int i=1; i < y.size(); ++i) {
if (y[i] == "maint") x[i] = x[i-1];
}
}')然后你就可以:
copyDat(temp_df$cum_ft, as.character(temp_df$source))
temp_df
# cum_ft source
# 1 125.4585 imds
# 2 125.4585 imds
# 3 125.4585 imds
# 4 125.4585 imds
# 5 125.4585 imds
# 6 125.4585 imds
# 7 123.1018 imds
# 8 125.4585 imds
# 9 125.4585 imds
# 10 125.4585 imds
# 11 125.4585 maint
# 12 125.4585 maint
# 13 125.4585 maint
# 14 126.7622 imds
# 15 126.7622 imds
# 16 126.7622 imds在一个有130万行的例子中,Rcpp解决方案比在注释中发布的动物园解决方案快6倍(尽管两者都非常快):
# Functions to benchmark
josilber <- function(temp_df) {
copyDat(temp_df$cum_ft, as.character(temp_df$source))
temp_df
}
library(zoo)
darenburg <- function(temp_df) {
temp_df[temp_df$source == "maint", "cum_ft"] <- NA
temp_df$cum_ft <- na.locf(temp_df$cum_ft)
temp_df
}
# Do the test
library(microbenchmark)
temp_df <- data.frame(cum_ft=rnorm(1300000),
source=rep(c(rep("imds", 10), rep("maint", 3)), 100000))
all.equal(josilber(temp_df), darenburg(temp_df))
# [1] TRUE
microbenchmark(josilber(temp_df), darenburg(temp_df))
# Unit: milliseconds
# expr min lq median uq max neval
# josilber(temp_df) 78.05012 83.80206 86.96831 92.56959 122.5809 100
# darenburg(temp_df) 464.33525 492.76668 510.65864 541.43435 703.6944 100https://stackoverflow.com/questions/25513942
复制相似问题