文章/答案/技术大牛

发布

社区首页 >问答首页 >基于日期间隔的列的左连接子集

问基于日期间隔的列的左连接子集
EN

Stack Overflow用户

提问于 2021-09-22 16:45:45

回答 1查看 256关注 0票数 0

我有两个数据：df1系列MRI读取1×2不同的MRI读取器；df2 -被试1的类固醇使用列表。

根据df2$STATUS是否包含在df1中，我尝试执行df1$DOS到lubridate::interval(start = df2$START_DATE, end = df2$STOP_DATE)的左连接。如果该df2$STATUS没有可用的df1$DOS，我希望值为NA。

例如，最后的df应该如下所示，因为df1$DOS属于第一行lubridate::interval(start = df2$START_DATE, end = df2$STOP_DATE)

dput示例

df1 = structure(list(SUBJID = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), READER = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L, 4L), .Label = c("1", "2", "3", "4", "5"), class = "factor"), 
    DOS = structure(c(15469, 15497, 15530, 15579, 15635, 15691, 
    15747, 15805, 15901, 16010, 16101, 16202, 16293, 16422, 16547, 
    16678, 16841, 17035, 17217, 17331, 17372, 17497, 17588, 17652, 
    17686, 17714, 17770, 17826, 17882, 17938, 17994, 18050, 18079, 
    18113, 18176, 15469, 15497, 15530, 15579, 15635, 15691, 15747, 
    15805, 15901, 16010, 16101, 16202, 16293, 16422, 16547, 16678, 
    16841, 17035, 17217, 17331, 17372, 17497, 17588, 17652, 17686, 
    17714, 17770, 17826, 17882, 17938, 17994, 18050, 18079, 18113, 
    18176), class = "Date"), VISIT = c("1 Screening", "1 Week 04", 
    "1 Week 08", "1 Week 16", "Unscheduled 01", "Unscheduled 02", 
    "Unscheduled 03", "Unscheduled 04", "Unscheduled 05", "Unscheduled 06", 
    "Unscheduled 07", "Unscheduled 08", "Unscheduled 09", "Unscheduled 10", 
    "Unscheduled 11", "Unscheduled 12", "Unscheduled 13", "Unscheduled 14", 
    "Unscheduled 15", "Unscheduled", "Unscheduled 16", "Unscheduled 17", 
    "Unscheduled", "2 Screening", "2 Week 04", "2 Week 08", "2 Week 16", 
    "Unscheduled 18", "Unscheduled 19", "Unscheduled 20", "Unscheduled 21", 
    "Unscheduled 22", "Unscheduled 23", "Unscheduled 24", "Unscheduled 25", 
    "1 Screening", "1 Week 04", "1 Week 08", "1 Week 16", "Unscheduled 01", 
    "Unscheduled 02", "Unscheduled 03", "Unscheduled 04", "Unscheduled 05", 
    "Unscheduled 06", "Unscheduled 07", "Unscheduled 08", "Unscheduled 09", 
    "Unscheduled 10", "Unscheduled 11", "Unscheduled 12", "Unscheduled 13", 
    "Unscheduled 14", "Unscheduled 15", "Unscheduled", "Unscheduled 16", 
    "Unscheduled 17", "Unscheduled", "2 Screening", "2 Week 04", 
    "2 Week 08", "2 Week 16", "Unscheduled 18", "Unscheduled 19", 
    "Unscheduled 20", "Unscheduled 21", "Unscheduled 22", "Unscheduled 23", 
    "Unscheduled 24", "Unscheduled 25"), MOS_DUR = c(0, 0.903225806451613, 
    2, 3.58064516129032, 5.41935483870968, 7.25806451612903, 
    9.07142857142857, 11.0333333333333, 14.1935483870968, 17.741935483871, 
    20.7096774193548, 24.0967741935484, 27.0645161290323, 31.2903225806452, 
    35.4333333333333, 39.7096774193548, 45.0344827586207, 51.4193548387097, 
    57.3928571428571, 61.1666666666667, 62.5161290322581, 66.6, 
    69.6071428571429, 71.7333333333333, 72.8387096774194, 73.7666666666667, 
    75.5806451612903, 77.4193548387097, 79.258064516129, 81.0714285714286, 
    82.9677419354839, 84.8064516129032, 85.7666666666667, 86.8709677419355, 
    88.9333333333333, 0, 0.903225806451613, 2, 3.58064516129032, 
    5.41935483870968, 7.25806451612903, 9.07142857142857, 11.0333333333333, 
    14.1935483870968, 17.741935483871, 20.7096774193548, 24.0967741935484, 
    27.0645161290323, 31.2903225806452, 35.4333333333333, 39.7096774193548, 
    45.0344827586207, 51.4193548387097, 57.3928571428571, 61.1666666666667, 
    62.5161290322581, 66.6, 69.6071428571429, 71.7333333333333, 
    72.8387096774194, 73.7666666666667, 75.5806451612903, 77.4193548387097, 
    79.258064516129, 81.0714285714286, 82.9677419354839, 84.8064516129032, 
    85.7666666666667, 86.8709677419355, 88.9333333333333), DAYS_DUR = c(0, 
    28, 61, 110, 166, 222, 278, 336, 432, 541, 632, 733, 824, 
    953, 1078, 1209, 1372, 1566, 1748, 1862, 1903, 2028, 2119, 
    2183, 2217, 2245, 2301, 2357, 2413, 2469, 2525, 2581, 2610, 
    2644, 2707, 0, 28, 61, 110, 166, 222, 278, 336, 432, 541, 
    632, 733, 824, 953, 1078, 1209, 1372, 1566, 1748, 1862, 1903, 
    2028, 2119, 2183, 2217, 2245, 2301, 2357, 2413, 2469, 2525, 
    2581, 2610, 2644, 2707), SPD = c(1845.52, 1658.84, 1924.06, 
    1914.49, 2094.22, 1890.46, 1369.93, 985.96, 570.47, 348.88, 
    500.76, 416.6, 407.92, 409.56, 362.33, 329.35, 326.93, 267.39, 
    304.2, 825.13, 399.47, 438.16, 800.82, 1947.56, 2357.86, 
    2959.97, 3066.57, 4235.27, 3308.1, 5321.45, 2840.12, 4162.53, 
    5379.55, 4132.86, 6570.47, 1375.57, 1744.6, 2522.15, 1828.55, 
    2013.06, 1555.05, 1294.24, 1048.76, 439.1, 402, 276.74, 121.37, 
    142.34, 72.9, 264.92, 62.27, 72.96, 68.91, 51.56, 251.33, 
    86.53, 134.48, 337.51, 1209.22, 1158.13, 1165.09, 2432.43, 
    2141.32, 3463.67, 5276.63, 3883.76, 4004.92, 4678.29, 4942.69, 
    5159.52), SPD_NADIR = c(1845.52, 1658.84, 1658.84, 1658.84, 
    1658.84, 1658.84, 1369.93, 985.96, 570.47, 348.88, 348.88, 
    348.88, 348.88, 348.88, 348.88, 329.35, 326.93, 267.39, 267.39, 
    267.39, 267.39, 267.39, 267.39, 267.39, 267.39, 267.39, 267.39, 
    267.39, 267.39, 267.39, 267.39, 267.39, 267.39, 267.39, 267.39, 
    1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1294.24, 
    1048.76, 439.1, 402, 276.74, 121.37, 121.37, 72.9, 72.9, 
    62.27, 62.27, 62.27, 51.56, 51.56, 51.56, 51.56, 51.56, 51.56, 
    51.56, 51.56, 51.56, 51.56, 51.56, 51.56, 51.56, 51.56, 51.56, 
    51.56, 51.56), PCT_DIFF_NADIR = c(NA, -0.101153062551476, 
    0.15988280967423, 0.154113718019821, 0.262460514576451, 0.139627691639941, 
    -0.174163873550192, -0.280284394093129, -0.421406547932979, 
    -0.38843409819973, 0.435335932125659, 0.194106856225636, 
    0.169227241458381, 0.173927998165558, 0.0385519376289841, 
    -0.0559791332263242, -0.00734780628510708, -0.182118496314196, 
    0.137664086166274, 2.0858670855305, 0.493960133138861, 0.638655147911291, 
    1.99495119488388, 6.28359325330042, 7.81805602303751, 10.0698605033846, 
    10.4685291147762, 14.8392984030816, 11.3718164478851, 18.9014548038446, 
    9.62163880474214, 14.5672613037137, 19.1187404166199, 14.4562997868282, 
    23.5726092972811, NA, 0.268274242677581, 0.833530827220716, 
    0.329303488735579, 0.463436975217546, 0.13047682051804, -0.0591245810827511, 
    -0.189671158363209, -0.581315076852664, -0.0844910043270326, 
    -0.311592039800995, -0.561429500614295, 0.172777457361786, 
    -0.399357337068468, 2.63401920438957, -0.145816186556927, 
    0.171671752047535, 0.106632407258712, -0.171992933997109, 
    3.87451512800621, 0.678238944918541, 1.60822342901474, 5.54596586501164, 
    22.4526764934057, 21.4617920868891, 21.5967804499612, 46.1766873545384, 
    40.5306439100078, 66.1774631497285, 101.339604344453, 74.3250581846393, 
    76.6749418153607, 89.7348719937936, 94.8628782001551, 99.0682699767261
    ), SPD_BL = c(1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 
    1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 
    1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 
    1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 
    1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 
    1845.52, 1845.52, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 
    1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 
    1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 
    1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 
    1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 
    1375.57, 1375.57), PCT_DIFF_BL = c(NA, -0.101153062551476, 
    0.042557111274871, 0.0373715809094456, 0.134758767176731, 
    0.024350860462092, -0.257699726906238, -0.465754909185487, 
    -0.690889288655772, -0.810958429060644, -0.728661840565261, 
    -0.77426416402965, -0.778967445489618, -0.77807880705709, 
    -0.803670510208505, -0.821540812345572, -0.822852095886254, 
    -0.855114005808661, -0.835168407820018, -0.552901079370584, 
    -0.783546100827951, -0.762581819758117, -0.566073518574711, 
    0.0552906497897611, 0.277612813732715, 0.60386774459231, 
    0.661629242706663, 1.29489249642377, 0.792502926004595, 1.88344206510902, 
    0.53892669816637, 1.25547813082492, 1.91492370713945, 1.23940136113399, 
    2.56022692791192, NA, 0.268274242677581, 0.833530827220716, 
    0.329303488735579, 0.463436975217546, 0.13047682051804, -0.0591245810827511, 
    -0.237581511664256, -0.680786873805041, -0.707757511431625, 
    -0.798817944561164, -0.911767485478747, -0.896522895963128, 
    -0.947003787520809, -0.807410746090711, -0.954731493126486, 
    -0.946960169238934, -0.949904403265555, -0.96251735644133, 
    -0.817290286935598, -0.93709516782134, -0.902236890888868, 
    -0.754639894734546, -0.120931686500869, -0.158072653518178, 
    -0.153012932820576, 0.768306956388988, 0.556678322440879, 
    1.51798890641698, 2.83595891157847, 1.82338230697093, 1.91146215750562, 
    2.40098286528494, 2.59319409408463, 2.75082329507041), NLFLG = c(0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1), CLINDET_DT = structure(c(18177, 
    18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 
    18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 
    18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 
    18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 
    18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 
    18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 
    18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 
    18177, 18177, 18177, 18177, 18177, 18177), class = "Date")), row.names = c(NA, 
-70L), groups = structure(list(SUBJID = c(1, 1), READER = structure(c(1L, 
4L), .Label = c("1", "2", "3", "4", "5"), class = "factor"), 
    .rows = structure(list(1:35, 36:70), ptype = integer(0), class = c("vctrs_list_of", 
    "vctrs_vctr", "list"))), row.names = c(NA, -2L), class = c("tbl_df", 
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"))

df2 = structure(list(SUBJID = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1), DRUG = structure(c("Dexamethasone", "Dexamethasone", 
"Dexamethasone", "Dexamethasone", "Dexamethasone", "Dexamethasone", 
"Dexamethasone", "Dexamethasone", "Dexamethasone", "Dexamethasone", 
"Dexamethasone", "Dexamethasone", "Dexamethasone", "Dexamethasone", 
"Dexamethasone", "Dexamethasone"), label = "Agent Name", format.sas = "$"), 
    ROUTE = structure(c("PO", "IVI", "IVI", "PO", "PO", "PO", 
    "PO", "PO", "IVI", "PO", "PO", "PO", "PO", "PO", "PO", "PO"
    ), label = "Route", format.sas = "$"), START_DT = structure(c(15455, 
    15469, 15470, 15471, 15472, 15473, 15474, 15475, 17653, 18061, 
    18079, 18115, 18122, 18129, 18136, 18142), class = "Date"), 
    STOP_DT = structure(c(15468, 15469, 15470, 15471, 15472, 
    15473, 15474, 15475, 17653, 18067, 18114, 18121, 18128, 18135, 
    18141, NA), class = "Date"), DAYS_DUR = c(13, 0, 0, 0, 0, 
    0, 0, 0, 0, 6, 35, 6, 6, 6, 5, NA), DEXEQ = c(0, 10, 30, 
    8, 8, 4, 2, 1, 4, 4, 4, 3.5, 3, 2.5, 2, 8), PHYSFLAG = c(0, 
    1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1), STATUS = c("STABLE", 
    "INCREASE", "INCREASE", "INCREASE", "INCREASE", "INCREASE", 
    "STABLE", "STABLE", "INCREASE", "INCREASE", "INCREASE", "INCREASE", 
    "INCREASE", "INCREASE", "STABLE", "INCREASE")), row.names = c(NA, 
-16L), groups = structure(list(SUBJID = 1, .rows = structure(list(
    1:16), ptype = integer(0), class = c("vctrs_list_of", "vctrs_vctr", 
"list"))), row.names = c(NA, -1L), class = c("tbl_df", "tbl", 
"data.frame"), .drop = TRUE), class = c("grouped_df", "tbl_df", 
"tbl", "data.frame"))

example  = structure(list(SUBJID = 1, READER = structure(1L, .Label = c("1", 
"2", "3", "4", "5"), class = "factor"), DOS = structure(15469, class = "Date"), 
    VISIT = "1 Screening", MOS_DUR = 0, DAYS_DUR = 0, SPD = 1845.52, 
    SPD_NADIR = 1845.52, PCT_DIFF_NADIR = NA_real_, SPD_BL = 1845.52, 
    PCT_DIFF_BL = NA_real_, NLFLG = 0, CLINDET_DT = structure(18177, class = "Date"), 
    STATUS = "STABLE"), row.names = c(NA, -1L), class = c("tbl_df", 
"tbl", "data.frame"))

left-join

lubridate

回答 1

Stack Overflow用户

回答已采纳

发布于 2021-09-22 18:41:12

以下是在foverlaps包中使用data.table的一种解决方案(以及仅用于一些清理的dplyr )。基本上，我们在这里将DOS变量作为一个区间，为开始日期和结束日期设置相同的值，然后通过重叠的间隔进行连接。

另一种解决方案是按ID进行full_join，然后使用来自lubridate的%within%，但是这不会保留没有匹配的行。

请注意，我们不能在df2中包含那些具有开始变量或结束变量的NA日期的行。因此，它们在联接之前被移除。

library(data.table)
library(dplyr)

dt1 <- as.data.table(df1 %>% mutate(start = DOS, end = DOS))
dt2 <- as.data.table(df2 %>% 
                       select(SUBJID, start = START_DT, end =STOP_DT, STATUS) %>%
                       filter(!is.na(start)&!is.na(end)))
setkey(dt1, SUBJID, start, end)
setkey(dt2, SUBJID, start, end)
df3 <- foverlaps(dt1, dt2, type="within") %>% 
  select(-i.start, -i.end)

Tidyverse解

在这里，我们应用一个完整联接和过滤器来查找所有匹配，然后使用原始df1数据集进行一个完全连接，以保持行完全不匹配。

library(lubridate)

df3 <- df1 %>% 
  full_join(df1 %>%
              full_join(df2 %>% 
                          select(SUBJID,START_DT, STOP_DT, STATUS), by = "SUBJID") %>%
              filter(DOS %within% interval(START_DT, STOP_DT)))

票数 1

页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持

原文链接：

https://stackoverflow.com/questions/69288212

复制

相似问题

问基于日期间隔的列的左连接子集
EN

回答 1

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问基于日期间隔的列的左连接子集EN

回答 1

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问基于日期间隔的列的左连接子集
EN