我有一组任务,它们有开始日期和结束日期。每个任务也有一个类别。我想为每个类别指定一个特定的日期范围,并对该日期范围内的所有值求和。我可以接受以宽格式(results1)或长格式(results2)结束的结果。如果这两种方法中的任何一种都能让这件事变得更容易,我也没意见。
我试着让下面的例子具有可重复性。
require(lubridate)
require(dplyr)
require(ggplot2)
dates <- seq(from = ymd("2018-01-01"), to = ymd("2018-01-31"), by = "day") %>%
as_data_frame() %>%
rename(Date = value) %>%
arrange(Date)
tasks <- data.frame(
task = c("task 1", "task 2", "task 3", "task 4"),
category = c("cat1", "cat1", "cat2", "cat2"),
start.date = c(ymd("2018-01-01"), ymd("2018-01-15"), ymd("2018-01-18"), ymd("2018-01-25")),
end.date = c(ymd("2018-01-07"), ymd("2018-01-27"), ymd("2018-02-15"), ymd("2018-01-31")),
value = c(1,3,5,7)
)
# desired results example 1: sums in wide format
results1 <- bind_cols(
dates,
cat1 = c(1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0),
cat2 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, 5, 12, 12, 12, 12, 12, 12, 12)
)
# desired results example 2: sums in long format
results2 <- bind_cols(
bind_rows(dates, dates),
category = c("cat1", "cat1", "cat1", "cat1", "cat1", "cat1", "cat1", "cat1", "cat1", "cat1", "cat1", "cat1", "cat1", "cat1", "cat1", "cat1", "cat1", "cat1", "cat1", "cat1", "cat1", "cat1", "cat1", "cat1", "cat1", "cat1", "cat1", "cat1", "cat1", "cat1", "cat1", "cat2", "cat2", "cat2", "cat2", "cat2", "cat2", "cat2", "cat2", "cat2", "cat2", "cat2", "cat2", "cat2", "cat2", "cat2", "cat2", "cat2", "cat2", "cat2", "cat2", "cat2", "cat2", "cat2", "cat2", "cat2", "cat2", "cat2", "cat2", "cat2", "cat2", "cat2"),
value = c(1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, 5, 12, 12, 12, 12, 12, 12, 12)
)
#graph the results
ggplot(results2, aes(Date, value, color = category)) + geom_line()发布于 2018-02-02 03:19:43
DF1 = do.call(rbind, lapply(split(tasks, tasks$category), function(df1){
do.call(rbind, lapply(dates$Date, function(d){
data.frame(Date = d,
category = df1$category[1],
value = sum(df1$value[d >= df1$start.date & d <= df1$end.date]),
stringsAsFactors = FALSE)
}))
}))
head(DF1)
# Date category value
#cat1.1 2018-01-01 cat1 1
#cat1.2 2018-01-02 cat1 1
#cat1.3 2018-01-03 cat1 1
#cat1.4 2018-01-04 cat1 1
#cat1.5 2018-01-05 cat1 1
#cat1.6 2018-01-06 cat1 1
graphics.off()
ggplot(DF1, aes(Date, value, color = category)) + geom_line()

发布于 2018-02-02 03:29:13
使用sqldf的解决方案可能是:
# Data
require(lubridate)
require(dplyr)
require(ggplot2)
dates <- seq(from = ymd("2018-01-01"), to = ymd("2018-01-31"), by = "day") %>%
as_data_frame() %>%
rename(Date = value) %>%
arrange(Date)
tasks <- data.frame(
task = c("task 1", "task 2", "task 3", "task 4"),
category = c("cat1", "cat1", "cat2", "cat2"),
start_date = c(ymd("2018-01-01"), ymd("2018-01-15"), ymd("2018-01-18"), ymd("2018-01-25")),
end_date = c(ymd("2018-01-07"), ymd("2018-01-27"), ymd("2018-02-15"), ymd("2018-01-31")),
value = c(1,3,5,7)
)
library(sqldf)
# Dates with valid values
result <- sqldf("SELECT dates.Date, tasks.category, sum(tasks.value) as value
FROM dates, tasks
WHERE dates.Date <= tasks.end_date AND
dates.Date >= tasks.start_date
GROUP BY dates.Date, tasks.category")
#Dates with no values for each category is found and joined with result
final_result <- dates %>% merge(unique(result$category)) %>%
mutate(category = y) %>%
anti_join(result, by = c("Date","category")) %>%
mutate(value = 0) %>%
select(-y) %>%
union_all(result) %>%
arrange(category, Date)
final_result
# Date category value
#1 2018-01-01 cat1 1
#2 2018-01-02 cat1 1
#3 2018-01-03 cat1 1
#4 2018-01-04 cat1 1
#5 2018-01-05 cat1 1
#6 2018-01-06 cat1 1
#7 2018-01-07 cat1 1
#8 2018-01-08 cat1 0
#......so on
#57 2018-01-26 cat2 12
#58 2018-01-27 cat2 12
#59 2018-01-28 cat2 12
#60 2018-01-29 cat2 12
#61 2018-01-30 cat2 12
#62 2018-01-31 cat2 12
#plot
ggplot(final_result, aes(Date, value, color = category)) + geom_line()

发布于 2018-02-02 02:59:56
从你的结果中还不清楚你的目标是什么……但从你的描述来看:
我想为每个类别指定一个特定的日期范围,并对该日期范围内的所有值求和
然后,您只需按日期范围(即按开始日期和结束日期)和类别进行分组,并汇总这些值:
tasks %>%
group_by(category, start.date, end.date) %>%
summarise(value = sum(value))
# A tibble: 4 x 4
# Groups: category, start.date [?]
category start.date end.date value
<fct> <date> <date> <dbl>
1 cat1 2018-01-01 2018-01-07 1.00
2 cat1 2018-01-15 2018-01-27 3.00
3 cat2 2018-01-18 2018-02-15 5.00
4 cat2 2018-01-25 2018-01-31 7.00https://stackoverflow.com/questions/48569520
复制相似问题