我试图过滤出由字母组成的行(没有特殊字符,如点、括号等),这些字符从2到字母不等。我也想数数他们的频率。
桌子
df = data.frame(stringsAsFactors = F,
col1 = c(
"[is]",
"[is]",
"[is]+[is]",
"complex",
"ta",
"ta",
"ta ta pa",
"pum",
"wam",
"wam wam","puk","[is]+ pa", "[is]+ pa", "ta","wa pa [is] pa","wak wak")
)看上去:
# col1
#
# 1 [is]
# 2 [is]
# 3 [is]+[is]
# 4 complex
# 5 ta
# 6 ta
# 7 ta
# 8 ta ta pa
# 9 pum
# 10 wam
# 11 wam wam输出应该如下所示:
col1 N
ta 3
pum 1
wam 1我使用dplyr对col1进行分组(我的大表中有更多的列),根据我的条件进行了过滤,并总结了计数。下面的代码很有魅力,给了我我想要的东西。
group_by(col1) %>%
filter(str_detect(col1, "^[[:alpha:]]{2,4}$")) %>%
summarize(N = n())不过,我不知道如何在单独的列中也包含这些计数,这些计数出现在其他行中,但被省略了。例如," ta“在"ta ta pa”一行中发生了两次。目标输出如下:
col1 N N_other
ta 3 2
pum 1 0
wam 1 2想知道如何解决这个问题。希望我的问题足够清楚,如果没有,请问。
发布于 2020-03-21 16:00:30
这一点也不漂亮,但可能会有帮助。这假设您希望从col1中提取2-4个字母单词,然后对于这些提取的单词,计算出单独一行/字符串中有多少字母和多个字母。
library(tidyverse)
df %>%
mutate(str_ex1 = str_extract(col1, "^[[:alpha:]]{2,4}$"),
str_ex2 = str_extract(col1, paste(str_ex1, collapse = "|")),
count1 = str_count(col1, paste(str_ex1, collapse = "|"))) %>%
filter(count1 > 0) %>%
group_by(str_ex1, str_ex2) %>%
summarise(count2 = sum(count1)) %>%
mutate(N_type = if_else(is.na(str_ex1), "N_other", "N")) %>%
pivot_wider(id_cols = str_ex2, names_from = N_type, values_from = count2) %>%
replace_na(list(N_other = 0)) 输出
# A tibble: 3 x 3
str_ex2 N N_other
<chr> <int> <dbl>
1 pum 1 0
2 ta 3 2
3 wam 1 2编辑(3/22/20)
OP为测试提供了自己的数据子集(如建议的50行):
structure(list(col1 = structure(c(2L, 3L, 57L, 4L, 47L, 47L,
39L, 9L, 18L, 34L, 34L, 18L, 56L, 14L, 23L, 42L, 53L, 9L, 9L,
17L, 45L, 1L, 30L, 30L, 18L, 52L, 9L, 2L, 2L, 2L, 2L, 2L, 2L,
3L, 3L, 3L, 3L, 7L, 2L, 34L, 47L, 34L, 24L, 24L, 22L, 25L, 25L,
34L, 34L, 34L), .Label = c("", "[is]", "[is] + [is]", "[is] + [is]+ [is]",
"[is] + pa", "[is] + pum tsa + [is]", "[is] pa", "[is] to go",
"c", "dza pam pa", "dza tsa", "gwa", "gwa pa", "gwam pa (and) tsa",
"gwam param pam gwam", "gwap pa", "katapum", "pa", "pa pa", "pa pa ta",
"pa pampa pa", "pak", "pam pa ta", "para", "puk", "pum pam pa",
"stn sta pa", "ta", "ta pa", "ta um pa", "tan tan pam pa", "tara",
"tok", "tsa", "tsa [close] da", "tsa pa", "tsa pa tsa", "tsa para pa",
"tsa ta", "tsak", "tsak pak", "tsan tsan tsa", "tsarara + [is]",
"um", "um pa", "um pa pa", "wa", "wa pa", "wa pa [is] pa", "wa sta um pa",
"wa wa", "wa wa pu pa pa pa", "wa wa ta pa", "wak wak", "wak wak pa pa",
"wam pa", "wam pam pa", "wam pam pa pa", "wam pam pam pa", "wap pa",
"wo", "wuk"), class = "factor"), col2 = structure(c(3L, 1L, 3L,
1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
1L, 3L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 3L, 3L, 3L, 3L, 1L,
1L, 3L, 3L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("",
"touch + yes", "yes", "yes (for himself)"), class = "factor"),
col3 = structure(c(1L, 4L, 1L, 4L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 1L, 1L, 6L, 1L, 1L, 2L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("", "yes",
"yes (hand mov)", "yes (hands mov)", "yes (head mov)", "yes (index finger)"
), class = "factor"), col4 = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), .Label = c("", "yes"), class = "factor"), col5 = structure(c(1L,
1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 4L, 1L, 2L, 4L, 2L, 4L, 4L,
2L, 1L, 4L, 1L, 1L, 4L, 2L, 3L, 2L, 1L, 2L, 2L, 2L, 1L, 2L,
2L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 2L,
1L, 2L, 1L, 2L), .Label = c("", "yes", "yes (part of it)",
"yes\n"), class = "factor"), col6 = structure(c(1L, 1L, 1L,
1L, 1L, 1L, 4L, 1L, 1L, 1L, 1L, 4L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 4L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
4L, 1L), .Label = c("", "repeats the mov", "yes", "yes (see next line)"
), class = "factor"), col7 = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L), .Label = c("K", "M"), class = "factor")), row.names = c(NA,
50L), class = "data.frame")守则经修订如下:
df %>%
mutate(col1 = as.character(col1)) %>%
select(col1) %>%
mutate(str_ex1 = str_extract(col1, "^[[:alpha:]]{2,4}$"),
str_ex2 = str_extract(col1, paste(str_ex1, collapse = "|")),
count1 = str_count(col1, paste(str_ex1, collapse = "|"))) %>%
filter(count1 > 0) %>%
group_by(str_ex1, str_ex2) %>%
summarise(count2 = sum(count1)) %>%
mutate(N_type = if_else(is.na(str_ex1), "N_other", "N")) %>%
ungroup() %>%
mutate(str_ex1 = coalesce(str_ex1, str_ex2)) %>%
pivot_wider(id_cols = str_ex1, names_from = N_type, values_from = count2) %>%
replace_na(list(N_other = 0))输出
# A tibble: 6 x 3
str_ex1 N N_other
<chr> <int> <dbl>
1 pa 3 6
2 pak 1 0
3 para 2 0
4 puk 2 0
5 tsa 7 4
6 wa 3 16还要注意,col1是一个因素,所以这是一个简单的字符向量。此外,我还添加了select(col1),只查看col1,忽略其他列。
https://stackoverflow.com/questions/60789248
复制相似问题