请看下面的代码,尤其是最后两行:
library(dplyr)
library(qdap)
library(tm)

# Load the training data, keeping text columns as character vectors.
comments <- read.csv(
  file = 'c:/raj/r/Toxic Comment Classification/train.csv',
  header = TRUE,
  stringsAsFactors = FALSE
)
comments %>% glimpse()

# Convert to a data frame source for VCorpus: rename the id/text columns to
# doc_id/text; the remaining columns show up as document-level metadata
# (see the meta() output below).
comment_df_source <- comments %>%
  rename(doc_id = id, text = comment_text) %>%
  tm::DataframeSource()

# Create the VCorpus.
comment_corpus <- comment_df_source %>% tm::VCorpus()

# Each "Results in" comment shows the output of the statement that follows it.

# Results in
# <<VCorpus>>
# Metadata: corpus specific: 0, document level (indexed): 6
# Content: documents: 1
comment_corpus[1]

# Results in
#   toxic severe_toxic obscene threat insult identity_hate
# 1     1            0       0      0      0             0
meta(comment_corpus[1])

# Results in FALSE
comment_corpus[1] %>% (function(x) meta(x)$toxic == 0)

# Results in TRUE
comment_corpus[1] %>% (function(x) meta(x)$toxic == 1)

# Results in
# <<VCorpus>>
# Metadata: corpus specific: 0, document level (indexed): 6
# Content: documents: 0
#
# NOTE(review): the same predicate that is TRUE on the corpus above keeps no
# document here. Presumably tm_filter() calls FUN on each individual document,
# where meta(x) returns only that document's own (local) metadata rather than
# the corpus-level indexed columns, so meta(x)$toxic is likely NULL -- confirm
# against the tm documentation for meta() and tm_filter().
tm_filter(comment_corpus[1], FUN = function(x) meta(x)$toxic == 1)
tm_filter(comment_corpus[1], FUN = function(x) meta(x)[['toxic']] == 1)

最后两行(两种写法)始终返回错误的结果。我不确定自己哪里做错了,文档我已经仔细读过了。请帮帮忙。
拉杰
发布于 2018-01-18 03:35:39
我对你的例子做了一点扩展,使语料库中包含 2 个文档。这里不需要使用 tm_filter:tm_filter 会把每个文档单独传给 FUN,更适合在文档的文本内部进行搜索;而由 DataframeSource 带入的 toxic 等列保存在语料库级别的(indexed)元数据中,在单个文档上调用 meta(x) 取不到这些列。
您可以使用 meta 函数直接对语料库进行过滤。
library(tm)
library(dplyr)

# Minimal two-document example: every column other than doc_id and text ends
# up as document-level metadata of the corpus (see the meta() output below).
# stringsAsFactors = FALSE keeps `text` a character vector on R < 4.0 as well.
df <- data.frame(
  doc_id = c(1, 2),
  text = c("abc", "def"),
  toxic = c(1, 0),
  insult = c(0, 1),
  stringsAsFactors = FALSE
)
corp <- df %>% DataframeSource() %>% VCorpus()

meta(corp)
#   toxic insult
# 1     1      0
# 2     0      1

# Subset the corpus directly with the metadata column instead of going
# through tm_filter().
toxic_corp <- corp[meta(corp, "toxic") == 1]
toxic_corp
# <<VCorpus>>
# Metadata:  corpus specific: 0, document level (indexed): 2
# Content:  documents: 1

meta(toxic_corp)
#   toxic insult
# 1     1      0

# Source: https://stackoverflow.com/questions/48268667
复制相似问题