我有这个数据。
token DD1 Type DD2 Price
AB-1 2018-01-01 10:12:15 Low 2018-01-25 10000
AB-5 2018-01-10 10:12:15 Low 2018-01-25 15000
AB-2 2018-01-05 12:25:04 High 2018-01-20 25000
AB-3 2018-01-03 17:04:25 Low 2018-01-27 50000
....
AB-8 2017-12-10 21:08:12 Low 2017-12-30 60000
AB-8 2017-12-10 21:08:12 High 2017-12-30 30000dput:
structure(list(token = structure(c(2L, 5L, 3L, 4L, 1L, 6L, 6L
), .Label = c("....", "AB-1", "AB-2", "AB-3", "AB-5", "AB-8"), class = "factor"),
DD1 = structure(c(2L, 5L, 4L, 3L, 1L, 6L, 6L), .Label = c("",
"01/01/2018 10:12:15", "03/01/2018 17:04:25", "05/01/2018 12:25:04",
"10/01/2018 10:12:15", "10/12/2017 21:08:12"), class = "factor"),
Type = structure(c(3L, 3L, 2L, 3L, 1L, 3L, 2L), .Label = c("",
"High", "Low"), class = "factor"), DD2 = structure(c(3L,
3L, 2L, 4L, 1L, 5L, 5L), .Label = c("", "20/01/2018", "25/01/2018",
"27/01/2018", "30/12/2017"), class = "factor"), Price = c(10000L,
15000L, 25000L, 50000L, NA, 60000L, 30000L)), .Names = c("token",
"DD1", "Type", "DD2", "Price"), class = "data.frame", row.names = c(NA,
-7L))在上面提到的数据中,我想要基于日期的2种子集数据框架(按降序排列的最后三个日期(从DD2) ),如果行对于特定日期不可用,而不是显示该日期的所有字段为'0')和月份(最后三个日期按降序排列,如果行对特定日期不可用,则显示该日期与所有字段为'0')。
适用于Avg低的公式(同样适用于Avg高):DD2-DD1,并取中位数作为每nrow可用。
月份百分比公式:(近期值-旧值)/(旧Vaule)
每当我运行代码时,代码应该从dataframe中选择最后三天的数据以及最后三个月的数据。
DF1:
Date nrow for Low Med Low sum of value low nrow for High Med High sum of value High
27-01-2018 1 24 50000 0 0 0
26-01-2018 0 0 0 0 0 0
25-01-2018 2 19.5 25000 0 0 0DF2
Month nrow low % sum low % nrow high % sum high %
Jan-18 3 200% 75000 25% 1 0% 25000 -17%
Dec-17 1 100% 60000 100% 1 100% 0 100%
Nov-17 0 - - - 0 - - -发布于 2018-02-21 18:36:55
虽然这个Q已经有了一个可接受的答案,但我觉得很难提供一个使用dcast()和melt()的答案。任何缺少的日期和月份都将使用CJ()完成,并按照OP的要求进行联接。
代码试图尽可能接近OP的预期结果。特别的定制是为什么代码看起来如此复杂。
如果需要的话,我愿意更详细地解释代码。
library(data.table)
setDT(DF)
# daily
DF1 <-
DF[, .(n = .N, days = median(difftime(as.Date(DD2, "%d/%m/%Y"),
as.Date(DD1, "%d/%m/%Y"), units = "day")),
sum = sum(Price)), by = .(DD2, Type)][
, Date := as.Date(DD2, "%d/%m/%Y")][
, dcast(.SD, Date ~ Type, value.var = c("n", "days", "sum"), fill = 0)][
.(Date = seq(max(Date), length.out = 3L, by = "-1 days")), on = "Date"][
, setcolorder(.SD, c(1, 3, 5, 7, 2, 4, 6))][
is.na(n_Low), (2:7) := lapply(.SD, function(x) 0), .SDcols = 2:7][]
DF1Date n\_Low days\_Low sum\_Low n\_High days\_High sum\_High 1: 2018-01-27 1 24.0 days 50000 0 0 days 0 2: 2018-01-26 0 0.0 days 0 0 0 days 0 3: 2018-01-25 2 19.5 days 25000 0 0 days 0
# monthly
DF2 <-
DF[, Month := lubridate::floor_date(as.Date(DD2, "%d/%m/%Y"), unit = "month")][
, .(n = .N, sum = sum(Price)), by = .(Month, Type)][
CJ(Month = seq(max(Month), length.out = 3L, by = "-1 months"), Type = unique(Type)),
on = .(Month, Type)][
, melt(.SD, id.vars = c("Month", "Type"))][
is.na(value), value := 0][
, Pct := {
old <- shift(value); round(100 * ifelse(old == 0, 1, (value - old) / old))
},
by = .(variable, Type)][
, dcast(.SD, Type + Month ~ variable, value.var = c("value", "Pct"))][
, setnames(.SD, c("value_n", "value_sum"), c("n", "sum"))][
, dcast(.SD, Month ~ Type, value.var = c("n", "Pct_n", "sum", "Pct_sum"))][
order(-Month), setcolorder(.SD, c(1, 3, 5, 7, 9, 2, 4, 6, 8))]
DF2Month n\_Low Pct\_n\_Low sum\_Low Pct\_sum\_Low n\_High Pct\_n\_High sum\_High Pct\_sum\_High 1: 2018-01-01 3 200 75000 25 1 0 25000 -17 2: 2017-12-01 1 100 60000 100 1 100 30000 100 3: 2017-11-01 0 NA 0 NA 0 NA 0 NA
发布于 2018-02-20 11:06:48
下面的方法有用吗?
require(tidyverse)编辑,这是一种非常复杂的方法,当然可以更优雅地解决。
dat <- structure(list(token = structure(c(2L, 5L, 3L, 4L, 1L, 6L, 6L), .Label = c("....", "AB-1", "AB-2", "AB-3", "AB-5", "AB-8"), class = "character"), DD1 = structure(c(2L, 5L, 4L, 3L, 1L, 6L, 6L), .Label = c("", "01/01/2018 10:12:15", "03/01/2018 17:04:25", "05/01/2018 12:25:04", "10/01/2018 10:12:15", "10/12/2017 21:08:12"), class = "factor"),
Type = structure(c(3L, 3L, 2L, 3L, 1L, 3L, 2L), .Label = c("", "High", "Low"), class = "character"), DD2 = structure(c(3L, 3L, 2L, 4L, 1L, 5L, 5L), .Label = c("", "20/01/2018", "25/01/2018", "27/01/2018", "30/12/2017"), class = "factor"), Price = c(10000L, 15000L, 25000L, 50000L, NA, 60000L, 30000L)), .Names = c("token", "DD1", "Type", "DD2", "Price"), class = "data.frame", row.names = c(NA, -7L))
#I have included this into the code because structure(your output) had messed up a lot with factors
dat <- dat[c(1:4,6:7),]
dat <- dat %>% mutate(DD1 = dmy_hms(DD1), DD2 = dmy(DD2), Type = as.character(Type))
dat_summary <- dat %>%
mutate(diff_days = round(as.duration(DD1%--%DD2)/ddays(1),0),
#uses lubridate to calculate the number of days between each DD2 and DD1
n = n()) %>%
group_by(DD2,Type) %>% #because your operations are performed by each Type by DD2
summarise(med = median(diff_days),# calculates the median
sum = sum(Price)) # and the sum
# A tibble: 5 x 4
# Groups: DD2 [?]
DD2 Type med sum
<date> <chr> <dbl> <int>
1 2017-12-30 2 19.0 30000
2 2017-12-30 3 19.0 60000
3 2018-01-20 2 14.0 25000
4 2018-01-25 3 19.5 25000
5 2018-01-27 3 23.0 50000 现在在价格中找到第一天的值
datematch <- dat %>% group_by(Type,month = floor_date(DD2, "month")) %>%
arrange(Type, desc(DD2)) %>%
summarise(maxDate = max(DD2)) %>%
select(Type, maxDate)现在创建用于合并的辅助数据帧。dummy_dates将包含值的最后一天和前两天,对于这两种类型(低和高),all_dates将包含.嗯,所有的约会
list1 <- split(datematch$maxDate, datematch$Type)
list_type2 <- do.call('c',lapply(list1[['2']], function(x) seq(as.Date(x)-2, as.Date(x), by="days")))
list_type3 <- do.call('c',lapply(list1[['3']], function(x) seq(as.Date(x)-2, as.Date(x), by="days")))
dd_2 <- data.frame (DD2 = list_type2, Type = as.character(rep('2', length(list_type2))), stringsAsFactors = F)
dd_3 <- data.frame (DD2 = list_type3, Type = as.character(rep('3', length(list_type3))), stringsAsFactors = F)
dummy_date = rbind(dd_2, dd_3)
seq_date <- seq(as.Date('2017-12-01'),as.Date('2018-01-31'), by = 'days')
all_dates <- data.frame (DD2 = rep(seq_date,2), Type = as.character(rep(c('2','3'),each = length(seq_date))),stringsAsFactors = F)现在,我们可以将您的数据框架与所有的日子连接起来,这样每个月的每一天都会有一行。
all_dates <- left_join(dd_date, dat_summary, by = c('DD2', 'Type')) 我们可以用dummy_date过滤这个结果,它(我们记得)只包含最后一天之前的数据所需的天数。
df1<- left_join(dummy_date, all_dates, by = c('DD2', 'Type')) %>% arrange(Type, desc(DD2))
df1
DD2 Type med sum
1 2018-01-20 2 14.0 25000
2 2018-01-19 2 NA NA
3 2018-01-18 2 NA NA
4 2017-12-30 2 19.0 30000
5 2017-12-29 2 NA NA
6 2017-12-28 2 NA NA
7 2018-01-27 3 23.0 50000
8 2018-01-26 3 NA NA
9 2018-01-25 3 19.5 25000
10 2017-12-30 3 19.0 60000
11 2017-12-29 3 NA NA
12 2017-12-28 3 NA NA 对不起,“类型”没有正确地放低和高,有问题要读取您的数据。我希望这能有所帮助。
编辑添加了一种通往DF2的方法建议
df1 %>% group_by(Type, month = floor_date(DD2, 'month')) %>%
summarise(sum = sum(sum, na.rm = T),
n = max (n1, na.rm = T)) %>%
unite(sum.n, c('sum','n')) %>%
spread(Type, sum.n) %>%
rename(low = '3', high = '2') %>%
separate(high, c('high','n_high')) %>%
separate(low, c('low','n_low')) %>%
mutate(dummy_low = as.integer(c(NA, low[1:length(low)-1])),
dummy_high = as.integer(c(NA, high[1:length(high)-1])),
low = as.integer(low),
high = as.integer(high))%>%
mutate(perc_low = 100*(low-dummy_low)/dummy_low)
# A tibble: 2 x 8
month high n_high low n_low dummy_low dummy_high perc_low
<date> <int> <chr> <int> <chr> <int> <int> <dbl>
1 2017-12-01 30000 1 60000 1 NA NA NA
2 2018-01-01 25000 1 75000 3 60000 30000 25.0这取决于您添加其余的“高”的列和计数。我相信这个解决方案并不是最优雅的,但它应该能奏效。DF2现在只有两个月,但这是因为您在示例中只提供了2个月。它应该适用于任意几个月,然后您可以过滤最后三个月。
https://stackoverflow.com/questions/48879451
复制相似问题