我想对两个 data.table 做交叉连接,但不想先求出完整的交叉连接,而是在连接过程中直接应用一个范围条件。本质上,我希望有一个带过滤/范围表达式的 CJ。
有人能建议一种高性能的方法来避免完全交叉连接吗?
参见下面的测试示例,使用邪恶的完全交叉连接完成工作。
library(data.table)
# Toy inputs: dt1 holds one point D per id1, dt2 holds one closed
# interval [D1, D2] per id2.
dt1 <- data.table(id1 = seq_len(10), D = 2 * seq_len(10), key = "id1")
dt2 <- data.table(
  id2 = 21:23,
  D1 = c(5, 7, 10),
  D2 = c(9, 12, 16),
  key = "id2"
)
# Expected result, written out by hand: every (id1, id2) pair whose point
# falls inside the interval, i.e. D1 <= D & D <= D2.
dtfDesired <- data.table(
  id1 = c(3, 4, 4, 5, 6, 5, 6, 7, 8),
  id2 = rep(c(21, 22, 23), times = c(2, 3, 4)),
  D1 = rep(c(5, 7, 10), times = c(2, 3, 4)),
  D = c(6, 8, 8, 10, 12, 10, 12, 14, 16),
  D2 = rep(c(9, 12, 16), times = c(2, 3, 4))
)
setkeyv(dtfDesired, c("id1", "id2"))
# My "inefficient" programmatic attempt with full cross join.
# Baseline: filtered cross join computed via the complete cross product.
#
# dt1: data.table with columns id1 and D (the point to test).
# dt2: data.table with columns id2, D1 and D2 (the closed interval bounds).
# Returns a data.table keyed on (id1, id2) with columns id1, id2, D1, D, D2,
# holding only the rows where D1 <= D <= D2.
fullCJ <- function(dt1, dt2) {
  # Every (id1, id2) combination is materialised here, so memory grows with
  # nrow(dt1) * nrow(dt2): NOT acceptable with real data!
  allPairs <- CJ(id1 = dt1$id1, id2 = dt2$id2)
  # Attach the payload columns of both inputs to each pair.
  withPoints <- merge(allPairs, dt1, by = "id1")
  dtf <- merge(withPoints, dt2, by = "id2")
  setkeyv(dtf, c("id1", "id2"))
  # Fixed column order so results are comparable across implementations.
  setcolorder(dtf, c("id1", "id2", "D1", "D", "D2"))
  # Only now discard the pairs whose point lies outside the interval.
  dtf[D >= D1 & D <= D2]
}
# Run the full cross-join baseline on the toy data.
dtf <- fullCJ(dt1, dt2)
# Show both inputs, then the hand-built expected table.
invisible(lapply(list(dt1, dt2, dtfDesired), print))
all.equal(dtf, dtfDesired)
测试数据输出
> # Print results.
> print(dt1)
id1 D
1: 1 2
2: 2 4
3: 3 6
4: 4 8
5: 5 10
6: 6 12
7: 7 14
8: 8 16
9: 9 18
10: 10 20
> print(dt2)
id2 D1 D2
1: 21 5 9
2: 22 7 12
3: 23 10 16
> print(dtfDesired)
id1 id2 D1 D D2
1: 3 21 5 6 9
2: 4 21 5 8 9
3: 4 22 7 8 12
4: 5 22 7 10 12
5: 5 23 10 10 16
6: 6 22 7 12 12
7: 6 23 10 12 16
8: 7 23 10 14 16
9: 8 23 10 16 16
> all.equal(dtf, dtfDesired)
[1] TRUE
因此,现在的挑战是以一种能够扩展到数百万行的方式编写过滤后的交叉连接!
下面是可供选择的实现的集合,包括答案和注释中建议的实现。
# My "inefficient" programmatic attempt looping manually.
# Filtered cross join computed by looping over dt1 one row at a time.
#
# dt1: data.table with columns id1 and D (the point to test).
# dt2: data.table with columns id2, D1 and D2 (the closed interval bounds).
# Returns a data.table keyed on (id1, id2) with columns id1, id2, D1, D, D2,
# holding the pairs where D1 <= D <= D2. When nothing matches, the result is
# a zero-row table with the same columns.
manualIter <- function(dt1, dt2) {
  nRows1 <- nrow(dt1)
  # One pre-allocated slot per dt1 row. Appending with c() inside the loop
  # would re-copy the accumulated ids on every iteration.
  id1Parts <- vector("list", nRows1)
  id2Parts <- vector("list", nRows1)
  for (i1 in seq_len(nRows1)) {
    # Find matches in dt2 of this dt1 row.
    row1 <- dt1[i1, ]
    id1 <- row1$id1
    D <- row1$D
    dt2Match <- dt2[D1 <= D & D <= D2, ]
    nMatches <- nrow(dt2Match)
    if (0 < nMatches) {
      id1Parts[[i1]] <- rep(id1, nMatches)
      id2Parts[[i1]] <- dt2Match$id2
    }
  }
  # Slots left NULL (rows without matches) are dropped by unlist().
  id1Match <- unlist(id1Parts, use.names = FALSE)
  id2Match <- unlist(id2Parts, use.names = FALSE)
  if (is.null(id1Match)) {
    # No matches at all: use typed zero-length id vectors so the table below
    # still has its id1/id2 columns and the merges can run.
    id1Match <- dt1$id1[0L]
    id2Match <- dt2$id2[0L]
  }
  # Build the return data.table for the matching ids.
  dtf <- data.table(id1 = id1Match, id2 = id2Match)
  dtf <- merge(dtf, dt1, by = "id1")
  dtf <- merge(dtf, dt2, by = "id2")
  setkey(dtf, id1, id2)
  # Reorder columns for convenience & consistency.
  setcolorder(dtf, c("id1", "id2", "D1", "D", "D2"))
  dtf
}
# Filtered cross join driven by dt1: for each id1 group, look up the dt2
# intervals that contain that group's D, then merge the remaining columns.
#
# dt1: data.table with columns id1 and D.
# dt2: data.table with columns id2, D1 and D2.
# Returns a data.table keyed on (id1, id2) with columns id1, id2, D1, D, D2.
dtJoin1 <- function(dt1, dt2) {
  # Inside the inner dt2[...] call, D1/D2 are dt2 columns and D is the value
  # from the current dt1 group.
  idPairs <- dt1[, dt2[D >= D1 & D <= D2, list(id2 = id2)], by = "id1"]
  # Bring back the payload columns of both inputs for the matched pairs.
  dtf <- merge(merge(idPairs, dt1, by = "id1"), dt2, by = "id2")
  setkeyv(dtf, c("id1", "id2"))
  # Same column order as the other implementations.
  setcolorder(dtf, c("id1", "id2", "D1", "D", "D2"))
  dtf
}
# Filtered cross join driven by dt2: for each id2 group, select the dt1 rows
# whose D lies inside that group's [D1, D2] interval. All output columns are
# produced directly in the grouped query, so no merge is needed afterwards.
#
# dt1: data.table with columns id1 and D.
# dt2: data.table with columns id2, D1 and D2.
# Returns a data.table keyed on (id1, id2) with columns id1, id2, D1, D, D2.
dtJoin2 <- function(dt1, dt2) {
  dtf <- dt2[
    ,
    # id1 and D are dt1 columns; D1 and D2 come from the current dt2 group.
    dt1[D >= D1 & D <= D2, list(id1 = id1, D1 = D1, D = D, D2 = D2)],
    by = "id2"
  ]
  setkeyv(dtf, c("id1", "id2"))
  # Same column order as the other implementations.
  setcolorder(dtf, c("id1", "id2", "D1", "D", "D2"))
  dtf
}
# Install Bioconductor IRanges (see bioTreeRange below).
# Uses BiocManager, the installer for current Bioconductor releases, instead
# of sourcing the retired biocLite.R script over plain HTTP.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
BiocManager::install("IRanges")
# Solution using Bioconductor IRanges.
# Filtered cross join using the overlap search from Bioconductor IRanges.
#
# dt1: data.table with columns id1 and D.
# dt2: data.table with columns id2, D1 and D2.
# Returns a data.table keyed on (id1, id2) with columns id1, id2, D1, D, D2.
bioTreeRange <- function(dt1, dt2) {
  # library() stops with an error when IRanges is not installed; require()
  # would only return FALSE and let the calls below fail less clearly.
  library(IRanges)
  # Each dt1 point becomes a width-1 range starting at D.
  ir1 <- IRanges(dt1$D, width = 1L)
  # Each dt2 row becomes the range from D1 to D2.
  ir2 <- IRanges(dt2$D1, dt2$D2)
  # type = "within": keep hits where the point range lies inside the interval.
  olaps <- findOverlaps(ir1, ir2, type = "within")
  # Row-align the matched dt1 and dt2 rows side by side.
  dtf <- cbind(dt1[queryHits(olaps)], dt2[subjectHits(olaps)])
  setkey(dtf, id1, id2)
  # Reorder columns for convenience.
  setcolorder(dtf, c("id1", "id2", "D1", "D", "D2"))
  dtf
}
下面是一个更大的数据集的一个小基准,比我真实的基础场景小2-3个数量级。真正的场景在完全交叉连接的巨大内存分配上失败。
# Benchmark inputs: n1 points and n2 intervals of length 100 whose start
# positions are sampled from the point range. The seed fixes the sample.
set.seed(1)
n1 <- 10000
n2 <- 1000
dtbig1 <- data.table(id1 = 1:n1, D = 1:n1, key = "id1")
dtbig2 <- data.table(id2 = 1:n2, D1 = sort(sample(1:n1, n2)), key = "id2")
# Add the interval end by reference instead of copying the table via `$<-`.
dtbig2[, D2 := D1 + 100]
library("microbenchmark")
# Each expression also stores its result so the outputs can be compared
# afterwards.
mbenchmarkRes <- microbenchmark(
  fullCJRes <- fullCJ(dtbig1, dtbig2),
  manualIterRes <- manualIter(dtbig1, dtbig2),
  dtJoin1Res <- dtJoin1(dtbig1, dtbig2),
  dtJoin2Res <- dtJoin2(dtbig1, dtbig2),
  bioTreeRangeRes <- bioTreeRange(dtbig1, dtbig2),
  times = 3, unit = "s", control = list(order = "inorder", warmup = 1)
)
# Shorten names for better display. Renaming the factor levels relabels every
# timing row by its expression; recycling a length-5 vector over the rows
# would only line up because of order = "inorder".
levels(mbenchmarkRes$expr) <- c(
  "fullCJ", "manualIter", "dtJoin1", "dtJoin2", "bioTreeRangeRes"
)
# Print microbenchmark
print(mbenchmarkRes, order="median")
我在自己的机器上得到的基准测试结果如下:
> print(mbenchmarkRes, order="median")
Unit: seconds
expr min lq median uq max neval
bioTreeRangeRes 0.05833279 0.05843753 0.05854227 0.06099377 0.06344527 3
dtJoin2 1.20519664 1.21583650 1.22647637 1.23606216 1.24564796 3
fullCJ 4.00370434 4.03572702 4.06774969 4.17001658 4.27228347 3
dtJoin1 8.02416333 8.03504136 8.04591938 8.20015977 8.35440016 3
manualIter 8.69061759 8.69716448 8.70371137 8.76859060 8.83346982 3
结论
gtools和gplots。
发布于 2014-09-07 09:26:14
最近,data.table 中实现了重叠连接(overlap join)。本问题是它的一个特例:dt1 中每个区间的起点和终点相同。您可以从 GitHub 项目页面获取最新版本,然后按如下方式尝试:
# library() stops with an error if data.table is missing; require() would
# only return FALSE.
library(data.table)  ## 1.9.3+
dt1[, DD := D]       ## duplicate column D so each point is the interval [D, DD]
setkey(dt2, D1, D2)  ## key needs to be set for 2nd argument
## Overlap join of the point intervals against dt2's [D1, D2] intervals.
foverlaps(dt1, dt2, by.x = c("D", "DD"), by.y = key(dt2), nomatch = 0L)
# id2 D1 D2 id1 D DD
# 1: 21 5 9 3 6 6
# 2: 21 5 9 4 8 8
# 3: 22 7 12 4 8 8
# 4: 22 7 12 5 10 10
# 5: 23 10 16 5 10 10
# 6: 22 7 12 6 12 12
# 7: 23 10 16 6 12 12
# 8: 23 10 16 7 14 14
# 9: 23 10 16 8 16 16
下面是使用您在问题中给出的相同数据进行基准测试的结果:
# Unit: seconds
# expr min lq median uq max neval
# olaps 0.03600603 0.03971068 0.04341533 0.04857602 0.05373671 3
# bioTreeRangeRes 0.11356837 0.11673968 0.11991100 0.12499391 0.13007681 3
# dtJoin2 2.61679908 2.70327940 2.78975971 2.86864832 2.94753693 3
# fullCJ 4.45173294 4.75271285 5.05369275 5.08333291 5.11297307 3
# dtJoin1 16.51898878 17.39207632 18.26516387 18.60092303 18.93668220 3
# manualIter 29.36023340 30.13354967 30.90686594 33.55910653 36.21134712 3
其中 dt_olaps 的定义是:
# Filtered cross join using data.table's overlap join.
#
# dt1: data.table with a point column D (plus any other columns).
# dt2: data.table with interval columns D1 and D2 (plus any other columns).
# Returns the foverlaps() result: the dt2 columns followed by the dt1 columns,
# including the helper column DD, for every point matched to an interval.
dt_olaps <- function(dt1, dt2) {
  # `:=` and setkey() modify their table by reference. Work on copies so the
  # caller's dt1 does not gain a DD column and the caller's dt2 keeps its
  # original key and row order.
  dt1 <- copy(dt1)
  dt2 <- copy(dt2)
  # Duplicate D so each point can be passed as the interval [D, DD].
  dt1[, DD := D]
  # Key dt2 on its interval columns; by.y below is taken from this key.
  setkey(dt2, D1, D2)
  foverlaps(dt1, dt2, by.x = c("D", "DD"), by.y = key(dt2), nomatch = 0L)
}
发布于 2014-02-25 22:42:33
这个问题看起来非常适合使用区间树(interval tree)算法。Bioconductor 的 IRanges 包提供了一个非常好的实现。
# Installation
# Uses BiocManager, the installer for current Bioconductor releases, instead
# of sourcing the retired biocLite.R script over plain HTTP.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
BiocManager::install("IRanges")
# solution
# library() stops with an error if IRanges is missing; require() would only
# return FALSE.
library(IRanges)
# Points of dt1 as width-1 ranges, rows of dt2 as ranges from D1 to D2.
ir1 <- IRanges(dt1$D, width = 1L)
ir2 <- IRanges(dt2$D1, dt2$D2)
# Keep the hits where a point range lies within an interval.
olaps <- findOverlaps(ir1, ir2, type = "within")
# Matched dt1 rows side by side with their matching dt2 rows.
cbind(dt1[queryHits(olaps)], dt2[subjectHits(olaps)])
id1 D id2 D1 D2
1: 3 6 21 5 9
2: 4 8 21 5 9
3: 4 8 22 7 12
4: 5 10 22 7 12
5: 5 10 23 10 16
6: 6 12 22 7 12
7: 6 12 23 10 16
8: 7 14 23 10 16
9: 8 16 23 10 16
https://stackoverflow.com/questions/22026676
复制相似问题