我有一个包含数百万行的大量数据集,我希望用前面的值、下一个值或上一个和下一个可用值的平均值来填充缺失的值。它将检查前三行和下三行是否有未丢失的值。这里,缺失值由-99表示。我为此编写了ifelse循环,但我要花上几个小时才能完成这个任务。我们能否更有效、更快地完成这一任务?
rec_30 <- which(data$TEMPERATURE %in% c(-99) & data$MIN == 30)
for(q in rec_30){
ifelse(q <= 4, aa <- data$TEMPERATURE[q],
ifelse(data$TEMPERATURE[q-1] > 0, aa <- data$TEMPERATURE[q-1],
ifelse(data$TEMPERATURE[q-2] > 0, aa <- data$TEMPERATURE[q-2],
ifelse(data$TEMPERATURE[q-3], aa <- data$TEMPERATURE[q-3], aa <- data$TEMPERATURE[q]))))
ifelse(data$TEMPERATURE[q+1] > 0, bb <- data$TEMPERATURE[q+1],
ifelse(data$TEMPERATURE[q+2] > 0, bb <- data$TEMPERATURE[q+2],
ifelse(data$TEMPERATURE[q+3] > 0, bb <- data$TEMPERATURE[q+3], bb <- data$TEMPERATURE[q])))
ifelse(aa > 0 & bb > 0, data$TEMPERATURE[q] <- (aa + bb)/2,
ifelse(aa > 0 & bb == -99, data$TEMPERATURE[q] <- aa,
ifelse(aa == -99 & bb > 0, data$TEMPERATURE[q] <- bb, data$TEMPERATURE[q] <- data$TEMPERATURE[q])))
}发布于 2022-08-02 13:06:33
正如@zephryl建议的那样,我使用Rcpp编写了一个函数,它比R函数快得多。下面是R循环的Rcpp函数。
library(Rcpp)
cppFunction('DataFrame mean_function(DataFrame input_df, DataFrame row_number){
#include <vector>
NumericVector parameter = input_df[8];
NumericVector row_index = row_number[0];
int leni = row_index.size();
int lenp = parameter.size();
float aa, bb;
for (int i=0; i<leni; i++){
int q = row_index[i];
if (q <= 4) {
aa = parameter[q];
}else if (parameter[q-1] > 0) {
aa = parameter[q-1];
}else if (parameter[q-2] > 0) {
aa = parameter[q-2];
}else if (parameter[q-3] > 0) {
aa = parameter[q-3];
}else {
aa = parameter[q];
}
if(parameter[q+1] > 0) {
bb = parameter[q+1];
}else if (parameter[q+2] > 0){
bb = parameter[q+2];
}else if (parameter[q+3] > 0) {
bb = parameter[q+3];
}else {
bb = parameter[q];
}
if (aa > 0 && bb > 0) {
parameter[q] = (aa + bb)/2.0;
}else if (aa > 0 && bb == -99) {
parameter[q] = aa;
}else if (aa == -99 && bb > 0) {
parameter[q] = bb;
}else {
parameter[q] = parameter[q];
}
for (int x=0; x< lenp; x++)
{
if(x==q)
parameter[x]=parameter[q];
else
parameter[x]=parameter[x];
}
}
DataFrame Updated_Variable = DataFrame::create(Named("up_value") = parameter);
return(Updated_Variable);
}')
mean_function(data, rec_30)在这里,NumericVector parameter = input_df[8]从dataframe data中选择第9列,而i rec_30被转换为list的dataframe安装。
我希望这对将来的人有帮助。
发布于 2022-07-26 13:09:00
下面是一个利用data.table::shift()的选项
。
library(data.table)
setDT(data)
data[TEMPERATURE==-99, TEMPERATURE:=NA]id列。cols = c(paste0("next",1:3), paste0("prior",1:3))
data[,(cols):= shift(TEMPERATURE, c(-1:-3,1:3))]
data[, id:=.I]fcase by row为每一行获得正确的next/prev组合。
data[is.na(TEMPERATURE), IMPUTED_TEMPERATURE:=fcase(
any(next1,prior1), mean(c(next1,prior1), na.rm=T),
any(next2,prior2), mean(c(next2,prior2), na.rm=T),
any(next3,prior3), mean(c(next3,prior3), na.rm=T),
default=NA), id][, (c(cols,"id")):=NULL]以下是替代上述步骤2和3的替代方法。它使用rowMeans。请注意,我在这里三次调用shift(),每次调用+/-1行、+/-2行、+/-3行的集合(但如果要取前3行和后3行可用值的平均值,则可以进一步简化)
# Get the three sets of rowmeans
rmeans = cbind(
rowMeans(data[, shift(TEMPERATURE, c(-1,1))], na.rm=T),
rowMeans(data[, shift(TEMPERATURE, c(-2,2))], na.rm=T),
rowMeans(data[, shift(TEMPERATURE, c(-3,3))], na.rm=T)
)
# Set imputed temperature to the first one that is not NA
data[, imputed:=apply(rmeans,1,\(x) x[which(!is.na(x))[1]])]
# Set imputed back to NA if it does not need to be imputed
data[!is.na(TEMPERATURE), imputed:=NA]输出(前10行;与任何一种方法相同)
TEMPERATURE IMPUTED_TEMPERATURE
<num> <num>
1: 21.13703 NA
2: NA 23.61489
3: 26.09275 NA
4: 26.23379 NA
5: 28.60915 NA
6: NA 24.35206
7: 20.09496 NA
8: 22.32551 NA
9: NA 23.73401
10: 25.14251 NA输入:
data = structure(list(TEMPERATURE = c(21.1370341130532, -99, 26.0927473288029,
26.2337944167666, 28.6091538355686, -99, 20.0949575635605, 22.3255050601438,
-99, 25.1425114134327, -99, 25.4497483558953, 22.8273358359002,
29.2343348427676, 22.9231584025547, 28.3729562815279, -99, 22.6682078000158,
-99, -99, -99, 23.0269337072968, 21.590460028965, 20.3999591805041,
22.1879954100586, 28.1059855245985, -99, -99, 28.3134504687041,
-99, 24.5609148242511, -99, 23.0467220302671, 25.0730687007308,
-99, -99, 22.0124803762883, 22.5880981865339, 29.9215041752905,
28.0735234031454, -99, -99, -99, 26.2181919813156, 23.2977017574012,
25.0199747295119, 26.7709452728741, -99, 22.4392882734537, 27.6545978756621,
-99, 23.0968660186045, -99, 25.0454591214657, 21.5299895894714,
25.0393348815851, -99, -99, 21.7464982392266, -99, -99, 20.4185727518052,
-99, 20.137499391567, -99, -99, 23.0809475714341, 25.0854756566696,
20.5164661933668, -99, -99, -99, 20.1462725573219, -99, 20.899613329675,
-99, -99, 20.7005249732174, -99, -99, -99, 24.7190972114913,
21.4261534321122, -99, -99, 28.9858048921451, 23.8949978468008,
23.1087077967823, -99, 28.9618584956042, 21.6639378038235, 29.0042459615506,
21.3407819508575, 21.3161413418129, 21.0528750251979, 25.1158358110115,
23.0019905394875, -99, 23.0964743136428, 27.4211965710856)), row.names = c(NA,
-100L), class = "data.frame")https://stackoverflow.com/questions/73123459
复制相似问题