首页
学习
活动
专区
圈层
工具
发布
社区首页 >问答首页 >提高if-else循环的性能以更新缺失的值

提高if-else循环的性能以更新缺失的值
EN

Stack Overflow用户
提问于 2022-07-26 12:33:46
回答 2查看 69关注 0票数 0

我有一个包含数百万行的大量数据集,我希望用前面的值、下一个值或上一个和下一个可用值的平均值来填充缺失的值。它将检查前三行和下三行是否有未丢失的值。这里,缺失值由-99表示。我为此编写了ifelse循环,但我要花上几个小时才能完成这个任务。我们能否更有效、更快地完成这一任务?

代码语言:javascript
复制
rec_30 <- which(data$TEMPERATURE %in% c(-99) & data$MIN == 30)

for(q in rec_30){
    ifelse(q <= 4, aa <- data$TEMPERATURE[q],                                         
           ifelse(data$TEMPERATURE[q-1] > 0, aa <- data$TEMPERATURE[q-1],
                  ifelse(data$TEMPERATURE[q-2] > 0, aa <- data$TEMPERATURE[q-2],
                         ifelse(data$TEMPERATURE[q-3], aa <- data$TEMPERATURE[q-3], aa <- data$TEMPERATURE[q]))))
    
    ifelse(data$TEMPERATURE[q+1] > 0, bb <- data$TEMPERATURE[q+1],
           ifelse(data$TEMPERATURE[q+2] > 0, bb <- data$TEMPERATURE[q+2],
                  ifelse(data$TEMPERATURE[q+3] > 0, bb <- data$TEMPERATURE[q+3], bb <- data$TEMPERATURE[q])))
    
    ifelse(aa > 0 & bb > 0, data$TEMPERATURE[q] <- (aa + bb)/2,
           ifelse(aa > 0 & bb == -99, data$TEMPERATURE[q] <- aa,
                  ifelse(aa == -99 & bb > 0, data$TEMPERATURE[q] <- bb, data$TEMPERATURE[q] <- data$TEMPERATURE[q])))
}
EN

回答 2

Stack Overflow用户

回答已采纳

发布于 2022-08-02 13:06:33

正如@zephryl建议的那样,我使用Rcpp编写了一个函数,它比R函数快得多。下面是R循环的Rcpp函数。

代码语言:javascript
复制
library(Rcpp)

cppFunction('DataFrame mean_function(DataFrame input_df, DataFrame row_number){
  
  #include <vector>
  NumericVector parameter = input_df[8];
  NumericVector row_index = row_number[0];
  
  int leni = row_index.size();
  int lenp = parameter.size();  
        
  float aa, bb;
  for (int i=0; i<leni; i++){
    int q = row_index[i];
    if (q <= 4) {
    aa = parameter[q];
    }else if (parameter[q-1] > 0) {
    aa = parameter[q-1];
    }else if (parameter[q-2] > 0) {
    aa = parameter[q-2];
    }else if (parameter[q-3] > 0) {
    aa = parameter[q-3];
    }else {
    aa = parameter[q];
    }

    if(parameter[q+1] > 0) {
    bb = parameter[q+1];
    }else if (parameter[q+2] > 0){
    bb = parameter[q+2];
    }else if (parameter[q+3] > 0) {
    bb = parameter[q+3];
    }else {
    bb = parameter[q];
    }

    if (aa > 0 && bb > 0) {
    parameter[q] = (aa + bb)/2.0;
    }else if (aa > 0 && bb == -99) {
    parameter[q] = aa;
    }else if (aa == -99 && bb > 0) {
    parameter[q] = bb;
    }else {
    parameter[q] = parameter[q];
    }

   for (int x=0; x< lenp; x++)
   {
   if(x==q)
   parameter[x]=parameter[q];
   else
   parameter[x]=parameter[x];
   }
  }
   
DataFrame Updated_Variable = DataFrame::create(Named("up_value") = parameter);
return(Updated_Variable);

}')

mean_function(data, rec_30)

在这里,NumericVector parameter = input_df[8]从dataframe data中选择第9列,而i rec_30被转换为list的dataframe安装。

我希望这对将来的人有帮助。

票数 0
EN

Stack Overflow用户

发布于 2022-07-26 13:09:00

下面是一个利用data.table::shift()的选项

  1. 将数据设置为data.table,并将-99更改为NA

代码语言:javascript
复制
library(data.table)
setDT(data)
data[TEMPERATURE==-99, TEMPERATURE:=NA]

  1. 为前三个临时值和下一个临时值创建列,并为后续的按行求值创建id列。

代码语言:javascript
复制
cols = c(paste0("next",1:3), paste0("prior",1:3))
data[,(cols):= shift(TEMPERATURE, c(-1:-3,1:3))]
data[, id:=.I]

  1. 使用fcase by row为每一行

获得正确的next/prev组合。

代码语言:javascript
复制
data[is.na(TEMPERATURE), IMPUTED_TEMPERATURE:=fcase(
  any(next1,prior1), mean(c(next1,prior1), na.rm=T),
  any(next2,prior2), mean(c(next2,prior2), na.rm=T),
  any(next3,prior3), mean(c(next3,prior3), na.rm=T),
  default=NA), id][, (c(cols,"id")):=NULL]

以下是替代上述步骤2和3的替代方法。它使用rowMeans。请注意,我在这里三次调用shift(),每次调用+/-1行、+/-2行、+/-3行的集合(但如果要取前3行和后3行可用值的平均值,则可以进一步简化)

代码语言:javascript
复制
# Get the three sets of rowmeans
rmeans = cbind(
  rowMeans(data[, shift(TEMPERATURE, c(-1,1))], na.rm=T),
  rowMeans(data[, shift(TEMPERATURE, c(-2,2))], na.rm=T),
  rowMeans(data[, shift(TEMPERATURE, c(-3,3))], na.rm=T)
)
# Set imputed temperature to the first one that is not NA
data[, imputed:=apply(rmeans,1,\(x) x[which(!is.na(x))[1]])]

# Set imputed back to NA if it does  not need to be imputed
data[!is.na(TEMPERATURE), imputed:=NA]

输出(前10行;与任何一种方法相同)

代码语言:javascript
复制
    TEMPERATURE IMPUTED_TEMPERATURE
          <num>               <num>
 1:    21.13703                  NA
 2:          NA            23.61489
 3:    26.09275                  NA
 4:    26.23379                  NA
 5:    28.60915                  NA
 6:          NA            24.35206
 7:    20.09496                  NA
 8:    22.32551                  NA
 9:          NA            23.73401
10:    25.14251                  NA

输入:

代码语言:javascript
复制
data = structure(list(TEMPERATURE = c(21.1370341130532, -99, 26.0927473288029, 
26.2337944167666, 28.6091538355686, -99, 20.0949575635605, 22.3255050601438, 
-99, 25.1425114134327, -99, 25.4497483558953, 22.8273358359002, 
29.2343348427676, 22.9231584025547, 28.3729562815279, -99, 22.6682078000158, 
-99, -99, -99, 23.0269337072968, 21.590460028965, 20.3999591805041, 
22.1879954100586, 28.1059855245985, -99, -99, 28.3134504687041, 
-99, 24.5609148242511, -99, 23.0467220302671, 25.0730687007308, 
-99, -99, 22.0124803762883, 22.5880981865339, 29.9215041752905, 
28.0735234031454, -99, -99, -99, 26.2181919813156, 23.2977017574012, 
25.0199747295119, 26.7709452728741, -99, 22.4392882734537, 27.6545978756621, 
-99, 23.0968660186045, -99, 25.0454591214657, 21.5299895894714, 
25.0393348815851, -99, -99, 21.7464982392266, -99, -99, 20.4185727518052, 
-99, 20.137499391567, -99, -99, 23.0809475714341, 25.0854756566696, 
20.5164661933668, -99, -99, -99, 20.1462725573219, -99, 20.899613329675, 
-99, -99, 20.7005249732174, -99, -99, -99, 24.7190972114913, 
21.4261534321122, -99, -99, 28.9858048921451, 23.8949978468008, 
23.1087077967823, -99, 28.9618584956042, 21.6639378038235, 29.0042459615506, 
21.3407819508575, 21.3161413418129, 21.0528750251979, 25.1158358110115, 
23.0019905394875, -99, 23.0964743136428, 27.4211965710856)), row.names = c(NA, 
-100L), class = "data.frame")
票数 2
EN
页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持
原文链接:

https://stackoverflow.com/questions/73123459

复制
相关文章

相似问题

领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档