我有两个数据:df1系列MRI读取1×2不同的MRI读取器;df2 -被试1的类固醇使用列表。
根据df2$STATUS是否包含在df1中,我尝试执行df1$DOS到lubridate::interval(start = df2$START_DATE, end = df2$STOP_DATE)的左连接。如果该df2$STATUS没有可用的df1$DOS,我希望值为NA。
例如,最后的df应该如下所示,因为df1$DOS属于第一行lubridate::interval(start = df2$START_DATE, end = df2$STOP_DATE)

dput示例
df1 = structure(list(SUBJID = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), READER = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L), .Label = c("1", "2", "3", "4", "5"), class = "factor"),
DOS = structure(c(15469, 15497, 15530, 15579, 15635, 15691,
15747, 15805, 15901, 16010, 16101, 16202, 16293, 16422, 16547,
16678, 16841, 17035, 17217, 17331, 17372, 17497, 17588, 17652,
17686, 17714, 17770, 17826, 17882, 17938, 17994, 18050, 18079,
18113, 18176, 15469, 15497, 15530, 15579, 15635, 15691, 15747,
15805, 15901, 16010, 16101, 16202, 16293, 16422, 16547, 16678,
16841, 17035, 17217, 17331, 17372, 17497, 17588, 17652, 17686,
17714, 17770, 17826, 17882, 17938, 17994, 18050, 18079, 18113,
18176), class = "Date"), VISIT = c("1 Screening", "1 Week 04",
"1 Week 08", "1 Week 16", "Unscheduled 01", "Unscheduled 02",
"Unscheduled 03", "Unscheduled 04", "Unscheduled 05", "Unscheduled 06",
"Unscheduled 07", "Unscheduled 08", "Unscheduled 09", "Unscheduled 10",
"Unscheduled 11", "Unscheduled 12", "Unscheduled 13", "Unscheduled 14",
"Unscheduled 15", "Unscheduled", "Unscheduled 16", "Unscheduled 17",
"Unscheduled", "2 Screening", "2 Week 04", "2 Week 08", "2 Week 16",
"Unscheduled 18", "Unscheduled 19", "Unscheduled 20", "Unscheduled 21",
"Unscheduled 22", "Unscheduled 23", "Unscheduled 24", "Unscheduled 25",
"1 Screening", "1 Week 04", "1 Week 08", "1 Week 16", "Unscheduled 01",
"Unscheduled 02", "Unscheduled 03", "Unscheduled 04", "Unscheduled 05",
"Unscheduled 06", "Unscheduled 07", "Unscheduled 08", "Unscheduled 09",
"Unscheduled 10", "Unscheduled 11", "Unscheduled 12", "Unscheduled 13",
"Unscheduled 14", "Unscheduled 15", "Unscheduled", "Unscheduled 16",
"Unscheduled 17", "Unscheduled", "2 Screening", "2 Week 04",
"2 Week 08", "2 Week 16", "Unscheduled 18", "Unscheduled 19",
"Unscheduled 20", "Unscheduled 21", "Unscheduled 22", "Unscheduled 23",
"Unscheduled 24", "Unscheduled 25"), MOS_DUR = c(0, 0.903225806451613,
2, 3.58064516129032, 5.41935483870968, 7.25806451612903,
9.07142857142857, 11.0333333333333, 14.1935483870968, 17.741935483871,
20.7096774193548, 24.0967741935484, 27.0645161290323, 31.2903225806452,
35.4333333333333, 39.7096774193548, 45.0344827586207, 51.4193548387097,
57.3928571428571, 61.1666666666667, 62.5161290322581, 66.6,
69.6071428571429, 71.7333333333333, 72.8387096774194, 73.7666666666667,
75.5806451612903, 77.4193548387097, 79.258064516129, 81.0714285714286,
82.9677419354839, 84.8064516129032, 85.7666666666667, 86.8709677419355,
88.9333333333333, 0, 0.903225806451613, 2, 3.58064516129032,
5.41935483870968, 7.25806451612903, 9.07142857142857, 11.0333333333333,
14.1935483870968, 17.741935483871, 20.7096774193548, 24.0967741935484,
27.0645161290323, 31.2903225806452, 35.4333333333333, 39.7096774193548,
45.0344827586207, 51.4193548387097, 57.3928571428571, 61.1666666666667,
62.5161290322581, 66.6, 69.6071428571429, 71.7333333333333,
72.8387096774194, 73.7666666666667, 75.5806451612903, 77.4193548387097,
79.258064516129, 81.0714285714286, 82.9677419354839, 84.8064516129032,
85.7666666666667, 86.8709677419355, 88.9333333333333), DAYS_DUR = c(0,
28, 61, 110, 166, 222, 278, 336, 432, 541, 632, 733, 824,
953, 1078, 1209, 1372, 1566, 1748, 1862, 1903, 2028, 2119,
2183, 2217, 2245, 2301, 2357, 2413, 2469, 2525, 2581, 2610,
2644, 2707, 0, 28, 61, 110, 166, 222, 278, 336, 432, 541,
632, 733, 824, 953, 1078, 1209, 1372, 1566, 1748, 1862, 1903,
2028, 2119, 2183, 2217, 2245, 2301, 2357, 2413, 2469, 2525,
2581, 2610, 2644, 2707), SPD = c(1845.52, 1658.84, 1924.06,
1914.49, 2094.22, 1890.46, 1369.93, 985.96, 570.47, 348.88,
500.76, 416.6, 407.92, 409.56, 362.33, 329.35, 326.93, 267.39,
304.2, 825.13, 399.47, 438.16, 800.82, 1947.56, 2357.86,
2959.97, 3066.57, 4235.27, 3308.1, 5321.45, 2840.12, 4162.53,
5379.55, 4132.86, 6570.47, 1375.57, 1744.6, 2522.15, 1828.55,
2013.06, 1555.05, 1294.24, 1048.76, 439.1, 402, 276.74, 121.37,
142.34, 72.9, 264.92, 62.27, 72.96, 68.91, 51.56, 251.33,
86.53, 134.48, 337.51, 1209.22, 1158.13, 1165.09, 2432.43,
2141.32, 3463.67, 5276.63, 3883.76, 4004.92, 4678.29, 4942.69,
5159.52), SPD_NADIR = c(1845.52, 1658.84, 1658.84, 1658.84,
1658.84, 1658.84, 1369.93, 985.96, 570.47, 348.88, 348.88,
348.88, 348.88, 348.88, 348.88, 329.35, 326.93, 267.39, 267.39,
267.39, 267.39, 267.39, 267.39, 267.39, 267.39, 267.39, 267.39,
267.39, 267.39, 267.39, 267.39, 267.39, 267.39, 267.39, 267.39,
1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1294.24,
1048.76, 439.1, 402, 276.74, 121.37, 121.37, 72.9, 72.9,
62.27, 62.27, 62.27, 51.56, 51.56, 51.56, 51.56, 51.56, 51.56,
51.56, 51.56, 51.56, 51.56, 51.56, 51.56, 51.56, 51.56, 51.56,
51.56, 51.56), PCT_DIFF_NADIR = c(NA, -0.101153062551476,
0.15988280967423, 0.154113718019821, 0.262460514576451, 0.139627691639941,
-0.174163873550192, -0.280284394093129, -0.421406547932979,
-0.38843409819973, 0.435335932125659, 0.194106856225636,
0.169227241458381, 0.173927998165558, 0.0385519376289841,
-0.0559791332263242, -0.00734780628510708, -0.182118496314196,
0.137664086166274, 2.0858670855305, 0.493960133138861, 0.638655147911291,
1.99495119488388, 6.28359325330042, 7.81805602303751, 10.0698605033846,
10.4685291147762, 14.8392984030816, 11.3718164478851, 18.9014548038446,
9.62163880474214, 14.5672613037137, 19.1187404166199, 14.4562997868282,
23.5726092972811, NA, 0.268274242677581, 0.833530827220716,
0.329303488735579, 0.463436975217546, 0.13047682051804, -0.0591245810827511,
-0.189671158363209, -0.581315076852664, -0.0844910043270326,
-0.311592039800995, -0.561429500614295, 0.172777457361786,
-0.399357337068468, 2.63401920438957, -0.145816186556927,
0.171671752047535, 0.106632407258712, -0.171992933997109,
3.87451512800621, 0.678238944918541, 1.60822342901474, 5.54596586501164,
22.4526764934057, 21.4617920868891, 21.5967804499612, 46.1766873545384,
40.5306439100078, 66.1774631497285, 101.339604344453, 74.3250581846393,
76.6749418153607, 89.7348719937936, 94.8628782001551, 99.0682699767261
), SPD_BL = c(1845.52, 1845.52, 1845.52, 1845.52, 1845.52,
1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 1845.52,
1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 1845.52,
1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 1845.52,
1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 1845.52,
1845.52, 1845.52, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57,
1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57,
1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57,
1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57,
1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57,
1375.57, 1375.57), PCT_DIFF_BL = c(NA, -0.101153062551476,
0.042557111274871, 0.0373715809094456, 0.134758767176731,
0.024350860462092, -0.257699726906238, -0.465754909185487,
-0.690889288655772, -0.810958429060644, -0.728661840565261,
-0.77426416402965, -0.778967445489618, -0.77807880705709,
-0.803670510208505, -0.821540812345572, -0.822852095886254,
-0.855114005808661, -0.835168407820018, -0.552901079370584,
-0.783546100827951, -0.762581819758117, -0.566073518574711,
0.0552906497897611, 0.277612813732715, 0.60386774459231,
0.661629242706663, 1.29489249642377, 0.792502926004595, 1.88344206510902,
0.53892669816637, 1.25547813082492, 1.91492370713945, 1.23940136113399,
2.56022692791192, NA, 0.268274242677581, 0.833530827220716,
0.329303488735579, 0.463436975217546, 0.13047682051804, -0.0591245810827511,
-0.237581511664256, -0.680786873805041, -0.707757511431625,
-0.798817944561164, -0.911767485478747, -0.896522895963128,
-0.947003787520809, -0.807410746090711, -0.954731493126486,
-0.946960169238934, -0.949904403265555, -0.96251735644133,
-0.817290286935598, -0.93709516782134, -0.902236890888868,
-0.754639894734546, -0.120931686500869, -0.158072653518178,
-0.153012932820576, 0.768306956388988, 0.556678322440879,
1.51798890641698, 2.83595891157847, 1.82338230697093, 1.91146215750562,
2.40098286528494, 2.59319409408463, 2.75082329507041), NLFLG = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0,
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1), CLINDET_DT = structure(c(18177,
18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177,
18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177,
18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177,
18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177,
18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177,
18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177,
18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177,
18177, 18177, 18177, 18177, 18177, 18177), class = "Date")), row.names = c(NA,
-70L), groups = structure(list(SUBJID = c(1, 1), READER = structure(c(1L,
4L), .Label = c("1", "2", "3", "4", "5"), class = "factor"),
.rows = structure(list(1:35, 36:70), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -2L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
df2 = structure(list(SUBJID = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1), DRUG = structure(c("Dexamethasone", "Dexamethasone",
"Dexamethasone", "Dexamethasone", "Dexamethasone", "Dexamethasone",
"Dexamethasone", "Dexamethasone", "Dexamethasone", "Dexamethasone",
"Dexamethasone", "Dexamethasone", "Dexamethasone", "Dexamethasone",
"Dexamethasone", "Dexamethasone"), label = "Agent Name", format.sas = "$"),
ROUTE = structure(c("PO", "IVI", "IVI", "PO", "PO", "PO",
"PO", "PO", "IVI", "PO", "PO", "PO", "PO", "PO", "PO", "PO"
), label = "Route", format.sas = "$"), START_DT = structure(c(15455,
15469, 15470, 15471, 15472, 15473, 15474, 15475, 17653, 18061,
18079, 18115, 18122, 18129, 18136, 18142), class = "Date"),
STOP_DT = structure(c(15468, 15469, 15470, 15471, 15472,
15473, 15474, 15475, 17653, 18067, 18114, 18121, 18128, 18135,
18141, NA), class = "Date"), DAYS_DUR = c(13, 0, 0, 0, 0,
0, 0, 0, 0, 6, 35, 6, 6, 6, 5, NA), DEXEQ = c(0, 10, 30,
8, 8, 4, 2, 1, 4, 4, 4, 3.5, 3, 2.5, 2, 8), PHYSFLAG = c(0,
1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1), STATUS = c("STABLE",
"INCREASE", "INCREASE", "INCREASE", "INCREASE", "INCREASE",
"STABLE", "STABLE", "INCREASE", "INCREASE", "INCREASE", "INCREASE",
"INCREASE", "INCREASE", "STABLE", "INCREASE")), row.names = c(NA,
-16L), groups = structure(list(SUBJID = 1, .rows = structure(list(
1:16), ptype = integer(0), class = c("vctrs_list_of", "vctrs_vctr",
"list"))), row.names = c(NA, -1L), class = c("tbl_df", "tbl",
"data.frame"), .drop = TRUE), class = c("grouped_df", "tbl_df",
"tbl", "data.frame"))
example = structure(list(SUBJID = 1, READER = structure(1L, .Label = c("1",
"2", "3", "4", "5"), class = "factor"), DOS = structure(15469, class = "Date"),
VISIT = "1 Screening", MOS_DUR = 0, DAYS_DUR = 0, SPD = 1845.52,
SPD_NADIR = 1845.52, PCT_DIFF_NADIR = NA_real_, SPD_BL = 1845.52,
PCT_DIFF_BL = NA_real_, NLFLG = 0, CLINDET_DT = structure(18177, class = "Date"),
STATUS = "STABLE"), row.names = c(NA, -1L), class = c("tbl_df",
"tbl", "data.frame"))发布于 2021-09-22 18:41:12
以下是在foverlaps包中使用data.table的一种解决方案(以及仅用于一些清理的dplyr )。基本上,我们在这里将DOS变量作为一个区间,为开始日期和结束日期设置相同的值,然后通过重叠的间隔进行连接。
另一种解决方案是按ID进行full_join,然后使用来自lubridate的%within%,但是这不会保留没有匹配的行。
请注意,我们不能在df2中包含那些具有开始变量或结束变量的NA日期的行。因此,它们在联接之前被移除。
library(data.table)
library(dplyr)
dt1 <- as.data.table(df1 %>% mutate(start = DOS, end = DOS))
dt2 <- as.data.table(df2 %>%
select(SUBJID, start = START_DT, end =STOP_DT, STATUS) %>%
filter(!is.na(start)&!is.na(end)))
setkey(dt1, SUBJID, start, end)
setkey(dt2, SUBJID, start, end)
df3 <- foverlaps(dt1, dt2, type="within") %>%
select(-i.start, -i.end)Tidyverse解
在这里,我们应用一个完整联接和过滤器来查找所有匹配,然后使用原始df1数据集进行一个完全连接,以保持行完全不匹配。
library(lubridate)
df3 <- df1 %>%
full_join(df1 %>%
full_join(df2 %>%
select(SUBJID,START_DT, STOP_DT, STATUS), by = "SUBJID") %>%
filter(DOS %within% interval(START_DT, STOP_DT)))https://stackoverflow.com/questions/69288212
复制相似问题