我有一组实验室检验值,希望分析它们相对于入院日期随时间的变化趋势。每位患者的检验次数和随访时间各不相同。我的目标是确定该检验值在入院(df 中的 Date_One)后不同时间区间内的最小值,即 0-30 天、31-90 天、1-2 年、2-3 年、3-4 年等等,直到他们的最后一次随访,以帮助我识别超出基线某一阈值的异常值。由于该检验值会随着时间的推移自然变化,我希望找到这些最小值来建立新的基线。由于每位患者的随访时间不同(有些长达 20 年),我很难找到一个函数来求出这些局部最小值,而不必通过 filter 和 mutate 为我想要的每一个区间分别创建一个新列。我的 dput 输出如下,如果格式不正确,请告诉我!
structure(list(lab_date = structure(c(10006, 10007, 10008, 10009,
10010, 10011, 10012, 10013, 10014, 10015, 10016, 10018, 10019,
10020, 10021, 10022, 10023, 10024, 10025, 10026, 10099, 10225,
10242, 10361, 10575, 10729, 10785, 10849, 10856, 10857, 10858,
10859, 10872, 10975, 11071, 11151, 11179, 11197, 11198, 11199,
11201, 11202, 11203, 11204, 11206, 11207, 11208, 11210, 11226,
11228, 11229, 11230, 11254, 11256, 11257, 11258, 11270, 11281,
11282, 11282, 11309, 11310, 11338, 11339, 11372, 11373, 11401,
11499, 11536, 11564, 11582, 11597, 11598, 11625, 11660, 11663,
11664, 11665, 11666, 11667, 11668, 11695, 11696, 11697, 11698,
11699, 11700, 11701, 11723, 11729, 11730, 11731, 11732, 11733,
11734, 11735, 11736, 11737, 11765, 11828), class = "Date"), lab_value = c(1.1,
1, 1.1, 1.8, 2.3, 2.4, 1.3, 1.3, 1.2, 1.2, 1.2, 1.5, 1.3, 1.1,
1.1, 1.1, 1, 1, 1, 1, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.3, 1.2,
1.2, 1.7, 1.7, 1.7, 1.8, 1.8, 1.7, 1.8, 1.9, 1.7, 1.6, 1.7, 2.1,
2.1, 2.5, 2.6, 2.7, 2.6, 2.3, 2, 2, 1.8, 1.9, 2, 1.6, 1.8, 2,
2.1, 1.9, 1.8, 1.7, 1.8, 1.9, 1.8, 2.1, 1.9, 1.9, 1.9, 2.1, 2.1,
2, 1.9, 2.1, 2, 2, 2, 2.1, 2, 1.8, 1.8, 2, 2.2, 2.4, 2.2, 2.2,
2.1, 1.9, 2.1, 2.2, 2.4, 2.4, 2.3, 2.3, 2.5, 2.6, 3.1, 3.2, 3.4,
3.6, 3.3, 3.1, 3), ID = c(182, 182, 182, 182, 182, 182, 182,
182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182,
182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182,
182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182,
182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182,
182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182,
182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182,
182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182,
182, 182), Date_One = structure(c(10856, 10856, 10856, 10856,
10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856,
10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856,
10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856,
10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856,
10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856,
10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856,
10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856,
10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856,
10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856,
10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856,
10856, 10856, 10856, 10856, 10856, 10856), class = "Date")), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -100L), groups = structure(list(
ID = 182, .rows = structure(list(1:100), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -1L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE))发布于 2022-02-14 18:27:58
下面是使用tidyverse的一个可能的选项(但我不确定您希望输出是什么格式):
# Minimum lab_value per ID within time bins measured relative to Date_One.
library(tidyverse)
df %>%
# Group by patient and reference date.
group_by(ID, Date_One) %>%
# years = (Date_One - lab_date) in days / 365, so labs drawn before Date_One
# get positive values and labs drawn after it get negative values.
mutate(years = as.numeric(difftime(Date_One,lab_date,units = "days")/365)) %>%
# NOTE(review): this keeps only labs drawn on or before Date_One; if the goal
# is labs after Date_One, the difftime() arguments look swapped -- confirm.
filter(years >= 0) %>%
# Bin edges in years: -Inf, 30 days, 60 days, then every year from 1 to 20.
group_by(gr=cut(years, breaks= c(-Inf, c((30/365), (60/365)), seq(1, 20, by = 1))), ID) %>%
# One row per (gr, ID) holding the smallest lab_value in that bin.
summarise(lab_value = min(lab_value))输出
gr ID lab_value
<fct> <dbl> <dbl>
1 (-Inf,0.0822] 182 1.2
2 (0.164,1] 182 1.2
3 (1,2] 182 1.2
4 (2,3] 182 1
发布于 2022-02-14 18:26:24
像这样的怎么样?它让您以天数指定各分段的断点(可以很容易地改成月份或其他单位,但需要相应修改其他代码),然后对每个分段,筛选出落在该分段范围内的行,并找出其中的最小值。如果某个分段内没有检验值,它将返回 NA。这应该适用于您提供的数据;如果您想将其应用于包含多个 ID 的数据框,请告诉我,只需再加一个小循环即可。
# Rebuild the question's dput() output and pass it through data.frame().
# The literal holds 100 rows with columns lab_date (Date), lab_value, ID and
# Date_One (Date); every row has ID 182 and the same Date_One.
Data=data.frame(structure(list(lab_date = structure(c(10006, 10007, 10008, 10009,
10010, 10011, 10012, 10013, 10014, 10015, 10016, 10018, 10019,
10020, 10021, 10022, 10023, 10024, 10025, 10026, 10099, 10225,
10242, 10361, 10575, 10729, 10785, 10849, 10856, 10857, 10858,
10859, 10872, 10975, 11071, 11151, 11179, 11197, 11198, 11199,
11201, 11202, 11203, 11204, 11206, 11207, 11208, 11210, 11226,
11228, 11229, 11230, 11254, 11256, 11257, 11258, 11270, 11281,
11282, 11282, 11309, 11310, 11338, 11339, 11372, 11373, 11401,
11499, 11536, 11564, 11582, 11597, 11598, 11625, 11660, 11663,
11664, 11665, 11666, 11667, 11668, 11695, 11696, 11697, 11698,
11699, 11700, 11701, 11723, 11729, 11730, 11731, 11732, 11733,
11734, 11735, 11736, 11737, 11765, 11828), class = "Date"), lab_value = c(1.1,
1, 1.1, 1.8, 2.3, 2.4, 1.3, 1.3, 1.2, 1.2, 1.2, 1.5, 1.3, 1.1,
1.1, 1.1, 1, 1, 1, 1, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.3, 1.2,
1.2, 1.7, 1.7, 1.7, 1.8, 1.8, 1.7, 1.8, 1.9, 1.7, 1.6, 1.7, 2.1,
2.1, 2.5, 2.6, 2.7, 2.6, 2.3, 2, 2, 1.8, 1.9, 2, 1.6, 1.8, 2,
2.1, 1.9, 1.8, 1.7, 1.8, 1.9, 1.8, 2.1, 1.9, 1.9, 1.9, 2.1, 2.1,
2, 1.9, 2.1, 2, 2, 2, 2.1, 2, 1.8, 1.8, 2, 2.2, 2.4, 2.2, 2.2,
2.1, 1.9, 2.1, 2.2, 2.4, 2.4, 2.3, 2.3, 2.5, 2.6, 3.1, 3.2, 3.4,
3.6, 3.3, 3.1, 3), ID = c(182, 182, 182, 182, 182, 182, 182,
182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182,
182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182,
182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182,
182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182,
182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182,
182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182,
182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182,
182, 182), Date_One = structure(c(10856, 10856, 10856, 10856,
10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856,
10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856,
10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856,
10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856,
10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856,
10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856,
10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856,
10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856,
10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856,
10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856,
10856, 10856, 10856, 10856, 10856, 10856), class = "Date")), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -100L), groups = structure(list(
ID = 182, .rows = structure(list(1:100), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -1L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE)))
# Segment boundaries, expressed as day offsets
SegmentBreaks <- c(0, 30, 90, 365, 730)
# Find the minimum lab value inside each time segment, separately for each ID.
#
# Arguments:
#   Data          data frame with columns ID, lab_date (Date) and lab_value.
#   SegmentBreaks segment start offsets in days, counted from the earliest
#                 lab_date found in Data.
#
# Segment i covers lab dates in
#   [origin + SegmentBreaks[i], origin + SegmentBreaks[i + 1]);
# the last segment is closed by Sys.Date().
#
# Returns a data.frame with one row per unique ID: the column ID followed by
# one column per segment (MinVals.1, MinVals.2, ...) holding the smallest
# lab_value of that ID in the segment, or NA when the ID has no value there.
MinAtSegments <- function(Data, SegmentBreaks) {
  UniqueIDs <- unique(Data$ID)
  IDNumber <- length(UniqueIDs)
  SegmentNumber <- length(SegmentBreaks)

  # NOTE(review): the timeline origin is the earliest lab_date across all of
  # Data (not Date_One, and not per ID) -- confirm this is the intended
  # baseline before using the result for several patients.
  Date1 <- min(Data$lab_date)
  DateBreaks <- c(Date1 + SegmentBreaks, Sys.Date())

  # NA_real_ keeps the result numeric even when every cell stays empty.
  Output <- matrix(NA_real_, nrow = IDNumber, ncol = SegmentNumber)

  for (j in seq_len(IDNumber)) {
    # Restrict to the current ID so one patient's values never leak into
    # another patient's row.
    DataID <- Data[Data$ID == UniqueIDs[j], ]
    # Iterate over segments (columns), not over every cell of the matrix.
    for (i in seq_len(SegmentNumber)) {
      InSegment <- DataID$lab_date >= DateBreaks[i] &
        DataID$lab_date < DateBreaks[i + 1]
      LabVals <- DataID$lab_value[InSegment]
      if (length(LabVals) > 0) {
        Output[j, i] <- min(LabVals)
      }
    }
  }

  data.frame("ID" = UniqueIDs, "MinVals" = Output)
}
#Run Function
MinAtSegments(Data,SegmentBreaks)
https://stackoverflow.com/questions/71115760
复制相似问题