这是数据集。
library(data.table)
# Sample data: two ids, each carrying a Python-style list of (token, tag)
# tuples stored as one string per row.
x <- list(
  id = c("A", "B"),
  segment_stemming = c(
    "[('Brownie', 'Noun'), ('From', 'Josa'), ('Pi', 'Noun')]",
    "[('Dung-caroon-gye', 'Noun'), ('in', 'Josa'), ('innovation', 'Noun')]"
  )
)
# Compact (automatic) row names for two rows, then tag the list with the
# data.table / data.frame classes.
attr(x, "row.names") <- c(NA, -2L)
class(x) <- c("data.table", "data.frame")
x
# id segment_stemming
# 1: A [('Brownie', 'Noun'), ('From', 'Josa'), ('Pi', 'Noun')]
# 2: B [('Dung-caroon-gye', 'Noun'), ('in', 'Josa'), ('innovation', 'Noun')]
我想把每个元组拆分成单独的一行。下面是我期望的结果。
id segment_stemming
A ('Brownie', 'Noun')
A ('From', 'Josa')
A ('Pi', 'Noun')
B ('Dung-caroon-gye', 'Noun')
B ('in', 'Josa')
B ('innovation', 'Noun')
我搜索过如何在 R 中处理这种元组格式,但没有找到任何能得到上述结果的线索。
发布于 2022-03-11 11:35:33
data.table方法
下面是一种结合 data.table 和 reticulate 的方案:
library(reticulate)
library(data.table)
# Wrap every "(...)" tuple in double quotes so each cell becomes a Python
# list-of-strings literal, then let Python parse it with py_eval(); the
# resulting vector is expanded by data.table to one row per tuple.
# The quoting is done inside j instead of through `:=`, so the
# segment_stemming column of x keeps its original text and the other
# snippets that start from x still see unmodified input.
# NOTE(review): py_eval() evaluates the cell text as Python code - only use
# this on trusted data.
# Assumes each id occurs in a single row, so py_eval() receives one string
# per group - TODO confirm for real data.
setDT(x)[
  ,
  .(segment_stemming = py_eval(
    gsub("(\\(.*?\\))", '\"\\1\"', segment_stemming)
  )),
  id
]
这给
id segment_stemming
1: A ('Brownie', 'Noun')
2: A ('From', 'Josa')
3: A ('Pi', 'Noun')
4: B ('Dung-caroon-gye', 'Noun')
5: B ('in', 'Josa')
6: B ('innovation', 'Noun')
另一种方案是用 data.table 配合 strsplit 和 trimws,如下所示:
library(data.table)
# Split each cell at the ", " that sits between a closing ")" and an opening
# "(", then strip the enclosing list brackets from the ends of the pieces.
# `whitespace` is written as a character class on purpose: trimws() builds its
# patterns as "^<whitespace>+" and "<whitespace>+$", so the alternation
# "\\[|\\]" would leave one branch of each pattern unanchored and could remove
# a bracket from the middle of a tuple instead of only from the ends.
setDT(x)[
  ,
  .(segment_stemming = trimws(
    unlist(strsplit(segment_stemming, "(?<=\\)),\\s+(?=\\()", perl = TRUE)),
    whitespace = "[\\[\\]]"
  )),
  id
]
给出
id segment_stemming
1: A ('Brownie', 'Noun')
2: A ('From', 'Josa')
3: A ('Pi', 'Noun')
4: B ('Dung-caroon-gye', 'Noun')
5: B ('in', 'Josa')
6: B ('innovation', 'Noun')
Base R
一些 base R 方案应该同样有效。
# Base R version: for every id, split the text into its tuples, stack the
# per-id pieces into a long data frame, put the id column first with rev(),
# and reuse the column names of x.
with(
  x,
  setNames(
    rev(
      stack(
        tapply(
          segment_stemming,
          id,
          function(v) {
            # Split between ")" and "(" and trim the outer list brackets.
            # The character class keeps both of trimws()'s generated
            # patterns fully anchored; the alternation "\\[|\\]" would leave
            # one branch unanchored and could strip a bracket inside a tuple.
            trimws(
              unlist(strsplit(v, "(?<=\\)),\\s+(?=\\()", perl = TRUE)),
              whitespace = "[\\[\\]]"
            )
          }
        )
      )
    ),
    names(x)
  )
)
或
# Base R version using regex extraction instead of splitting.
with(x, {
  # One character vector of "(...)" matches per row, labelled by id.
  tuples <- regmatches(segment_stemming, gregexpr("\\(.*?\\)", segment_stemming))
  # stack() yields (values, ind); reverse the columns so the id comes first.
  long <- rev(stack(setNames(tuples, id)))
  # Reuse the original column names for the result.
  setNames(long, names(x))
})
发布于 2022-03-11 11:05:03
下面是一种使用separate_rows的方法
library(tidyverse)
# Drop the list brackets, then split into rows at the commas that are not
# inside a pair of parentheses (i.e. the commas between tuples).
mutate(x, segment_stemming = gsub("\\[|\\]", "", segment_stemming)) %>%
  separate_rows(segment_stemming, sep = ",\\s*(?![^()]*\\))")
# A tibble: 6 x 2
id segment_stemming
<chr> <chr>
1 A ('Brownie', 'Noun')
2 A ('From', 'Josa')
3 A ('Pi', 'Noun')
4 B ('Dung-caroon-gye', 'Noun')
5 B ('in', 'Josa')
6 B ('innovation', 'Noun')
再做一些处理可以得到更整洁的结果(unnest_wider 并非必需)。
# Same row split as before, followed by breaking every tuple into its parts.
x %>%
  mutate(segment_stemming = gsub("\\[|\\]", "", segment_stemming)) %>%
  separate_rows(segment_stemming, sep = ",\\s*(?![^()]*\\))") %>%
  mutate(
    # Remove parentheses, quotes and commas, then split on the single space
    # left between the two parts; the column becomes a list column.
    segment_stemming = str_split(
      str_remove_all(segment_stemming, "[()',]"),
      " "
    )
  ) %>%
  # Spread the list column into one column per part.
  # NOTE(review): the split pieces are unnamed; newer tidyr releases may
  # require `names_sep` here - verify against the installed version.
  unnest_wider(segment_stemming)
# A tibble: 6 x 3
id ...1 ...2
<chr> <chr> <chr>
1 A Brownie Noun
2 A From Josa
3 A Pi Noun
4 B Dung-caroon-gye Noun
5 B in Josa
6 B innovation Noun
发布于 2022-03-11 11:52:30
# For each id, pull out every "(...)" tuple and flatten the matches so that
# data.table returns one row per tuple.
x[
  ,
  list(
    segment_stemming = unlist(str_extract_all(segment_stemming, "\\(.*?\\)"))
  ),
  by = "id"
]
或者你可以使用tidyr::unnest。这样,只有一个对str_extract_all的调用
# Overwrite segment_stemming with the "(...)" matches found in each cell
# (data.table `:=` assignment, so x itself is changed), then hand x to
# unnest() to expand that column.
# NOTE(review): x is modified by this step; re-create it before re-running
# any snippet that expects the original text column.
x[, segment_stemming := str_extract_all(segment_stemming, "\\(.*?\\)")]
unnest(x, segment_stemming)https://stackoverflow.com/questions/71437352
复制相似问题