我正在尝试替换乳腺癌数据集中 Node_caps 列的一些值:根据最常见的答案,把这些值替换为 yes 或 no。
我已经找出了最常见的答案,我想用来替换这些值,但似乎无法真正让这些值的替换起作用。以下是我尝试过的两个版本。
# Replace unknown Node_caps ("?") with "no" for rows in the "0-2" Inv_nodes
# group. The row mask must be built with the element-wise `&`: `&&` only
# accepts length-one operands (an error since R 4.3), so it cannot select rows.
traindata$Node_caps[
  traindata$Node_caps == "?" & traindata$Inv_nodes == "0-2"
] <- "no"
和
# Replace each unknown Node_caps ("?") with the most common answer for the
# row's Inv_nodes group.
for (i in seq_len(nrow(traindata))) {
  # Unknown values are stored as the string "?", not NaN, so is.nan() never
  # matches them; compare against "?" directly.
  if (traindata$Node_caps[i] == "?") {
    if (traindata$Inv_nodes[i] == "0-2") {
      traindata$Node_caps[i] <- "no"
    } else if (traindata$Inv_nodes[i] %in% c("3-5", "9-11")) {
      traindata$Node_caps[i] <- "yes"
    }
    # No trailing `else`: a bare `else` without a body is a syntax error, and
    # the remaining Inv_nodes groups contain no "?" rows.
  }
}
> dput(head(traindata))
structure(list(Class = c("no-recurrence-events", "no-recurrence-events",
"no-recurrence-events", "no-recurrence-events", "no-recurrence-events",
"no-recurrence-events"), Age = c("30-39", "40-49", "40-49", "60-69",
"40-49", "60-69"), Menopause = c("premeno", "premeno", "premeno",
"ge40", "premeno", "ge40"), Tumor_size = c("30-34", "20-24",
"20-24", "15-19", "0-4", "15-19"), Inv_nodes = c("0-2", "0-2",
"0-2", "0-2", "0-2", "0-2"), Node_caps = c("no", "no", "no",
"no", "no", "no"), Deg_malig = c(3L, 2L, 2L, 2L, 2L, 2L), Breast = c("left",
"right", "left", "right", "right", "left"), Irradiate = c("no",
"no", "no", "no", "no", "no")), row.names = c(NA, 6L), class = "data.frame")> head(traindata)
Class Age Menopause Tumor_size Inv_nodes Node_caps Deg_malig Breast Irradiate
1 no-recurrence-events 30-39 premeno 30-34 0-2 no 3 left no
2 no-recurrence-events 40-49 premeno 20-24 0-2 no 2 right no
3 no-recurrence-events 40-49 premeno 20-24 0-2 no 2 left no
4 no-recurrence-events 60-69 ge40 15-19 0-2 no 2 right no
5 no-recurrence-events 40-49 premeno 0-4 0-2 no 2 right no
6 no-recurrence-events 60-69 ge40 15-19 0-2 no 2 left no
> tail(traindata)
Class Age Menopause Tumor_size Inv_nodes Node_caps Deg_malig Breast Irradiate
281 recurrence-events 50-59 ge40 40-44 6-8 yes 3 left yes
282 recurrence-events 30-39 premeno 30-34 0-2 no 2 left no
283 recurrence-events 30-39 premeno 20-24 0-2 no 3 left yes
284 recurrence-events 60-69 ge40 20-24 0-2 no 1 right no
285 recurrence-events 40-49 ge40 30-34 3-5 no 3 left no
286 recurrence-events 50-59 ge40 30-34 3-5 no 3 left no
> traindata[c(traindata$Node_caps == '?'),]
Class Age Menopause Tumor_size Inv_nodes Node_caps Deg_malig Breast
146 no-recurrence-events 40-49 premeno 25-29 0-2 ? 2 left
164 no-recurrence-events 60-69 ge40 25-29 3-5 ? 1 right
165 no-recurrence-events 60-69 ge40 25-29 3-5 ? 1 right
184 no-recurrence-events 50-59 ge40 30-34 9-11 ? 3 left
185 no-recurrence-events 50-59 ge40 30-34 9-11 ? 3 left
234 recurrence-events 70-79 ge40 15-19 9-11 ? 1 left
264 recurrence-events 50-59 lt40 20-24 0-2 ? 1 left
265 recurrence-events 50-59 lt40 20-24 0-2 ? 1 left
Irradiate
146 yes
164 yes
165 yes
184 yes
185 yes
234 yes
264 no
265 no
我只想替换其中的 ?,而不是其他任何值。下面的表格可能会让它更清晰。
> table(traindata$Inv_nodes,traindata$Node_caps)
? no yes
0-2 3 201 9
12-14 0 1 2
15-17 0 1 5
24-26 0 0 1
3-5 2 15 19
6-8 0 3 14
9-11 3 1 6我需要一些帮助来找出我做错了什么
发布于 2021-05-08 15:21:46
已更新
很抱歉,我没有意识到我的代码中有一个错误。我对您的样本数据做了一些细微的修改,以便我们可以应用您的条件。
library(dplyr)
library(janitor)
# First we extract the most common answer for every Inv_nodes category.
# `answer_cols` is the single source of truth for the answer columns, so the
# label lookup does not rely on column positions.
answer_cols <- c("no", "yes")
tbl1 <- df %>%
  tabyl(Inv_nodes, Node_caps) %>%
  select(!`?`) %>%
  rowwise() %>%
  # which.max() returns the first maximum, so ties (including rows where both
  # counts are 0) resolve to "no". Indexing `answer_cols` directly replaces
  # names(cur_data())[... + 1], which needed a positional offset to skip
  # Inv_nodes and used the deprecated cur_data().
  mutate(most_common = answer_cols[which.max(c_across(all_of(answer_cols)))])
# A tibble: 3 x 4
# Rowwise:
Inv_nodes no yes most_common
<chr> <dbl> <dbl> <chr>
1 0-2 2 0 no
2 3-5 0 1 yes
3 9-11 0 0 no
# Then fill in every "?" with the most common answer stored in tbl1 for the
# row's Inv_nodes group; answers that are already known are kept unchanged.
df %>%
  rowwise() %>%
  mutate(
    Node_caps = ifelse(
      Node_caps != "?",
      Node_caps,
      tbl1$most_common[tbl1$Inv_nodes == Inv_nodes]
    )
  ) %>%
  select(Inv_nodes, Node_caps)
# A tibble: 6 x 2
# Rowwise:
Inv_nodes Node_caps
<chr> <chr>
1 0-2 no
2 9-11 no
3 0-2 no
4 3-5 yes
5 3-5 yes
6 0-2 no Data
structure(list(Class = c("no-recurrence-events", "no-recurrence-events",
"no-recurrence-events", "no-recurrence-events", "no-recurrence-events",
"no-recurrence-events"), Age = c("30-39", "40-49", "40-49", "60-69",
"40-49", "60-69"), Menopause = c("premeno", "premeno", "premeno",
"ge40", "premeno", "ge40"), Tumor_size = c("30-34", "20-24",
"20-24", "15-19", "0-4", "15-19"), Inv_nodes = c("0-2", "9-11",
"0-2", "3-5", "3-5", "0-2"), Node_caps = c("?", "?", "no", "?",
"yes", "no"), Deg_malig = c(3L, 2L, 2L, 2L, 2L, 2L), Breast = c("left",
"right", "left", "right", "right", "left"), Irradiate = c("no",
"no", "no", "no", "no", "no")), row.names = c(NA, 6L), class = "data.frame")https://stackoverflow.com/questions/67444550
复制相似问题