require(dplyr)
require(RRF)
require(inTrees)
## Data PreProcessing
german_credit <- read.table("http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data")
colnames(german_credit) <- c("chk_acct", "duration", "credit_his", "purpose", "amount",
"saving_acct", "present_emp", "installment_rate", "sex", "other_debtor","present_resid", "property",
"age", "other_install", "housing", "n_credits", "job", "n_people", "telephone", "foreign", "response")
german_credit$response <- german_credit$response - 1
german_credit$response <- as.factor(german_credit$response)
german_credit <- data.frame(german_credit,stringsAsFactors=TRUE)
LevelNum <- function(x) {x <- as.factor(x); levels(x) <- 1:length(unique(x)); return(x)}
MinMaxNorm <- function(x) {x <- as.numeric(x); MaxX <- max(x); MinX <- min(x) ; return((x - MinX)/(MaxX - MinX))}
german_credit <- german_credit %>%
mutate_if(is.character, LevelNum) %>%
mutate_if(is.integer, MinMaxNorm)
## Simple IID Bootstrapping with equal probability selection and replacement
NRow_GCredit <- nrow(german_credit)
PERC <- 0.7
PROBS <- rep(1, NRow_GCredit)/NRow_GCredit
TrainID <- sample(1:NRow_GCredit, PERC*NRow_GCredit, prob = PROBS, replace = TRUE)
TestID <- setdiff(1:NRow_GCredit, TrainID)
Ytrain <- german_credit %>% dplyr::select(response) %>% dplyr::slice(TrainID)
Ytest <- german_credit %>% dplyr::select(response) %>% dplyr::slice(TestID)
Xtrain <- german_credit %>% dplyr::select(!response) %>% dplyr::slice(TrainID)
Xtest <- german_credit %>% dplyr::select(!response) %>% dplyr::slice(TestID)
## Emd/ Data PreProcessing
rf <- RRF::RRF(Xtrain, Ytrain$response, ntree = 500) # build an ordinary RF
ruleExec <- inTrees::extractRules(RF2List(rf), Xtrain, digits = 4) # transform to R-executable rules
YbyRules <- sapply(ruleExec, function(x) {
ZZ <- Ytrain[eval(parse(text = x))]
length(ZZ[ZZ == Minority])/length(ZZ)
})
MinSortedRules <- sort(YbyRules, decreasing = TRUE)
MinorityRules <- MinSortedRules[MinSortedRules>0.5]这会生成这样的规则:
"X[,3] %in% c('1') & X[,6] %in% c('2','3','5') & X[,13]<=0.1786 & X[,17] %in% c('3')"即:
H 111,其他变量允许更改H 212f 213/code>
我被困在如何从这个规则产生随机合成观测。
发布于 2021-06-01 18:27:30
我解决了问题。这是非常slow...obviously,而不是多线程。这是太随机的,将固有的偏见的数据。一个简单的控件可以是应用KNN对非选定变量进行绑定,也可以是在合成数据上使用更精确的/TGT控件。
RepeatSynGen<-function(MinorityRules,X,Xtrain,Minority,NBoot,i,UniqueCrit=10){
X<-Xtrain; FullInfo<-1:nrow(Xtrain)
ZZ<-strsplit(attributes(MinorityRules[1])$names,"&")[[1]] %>% as.list
FindFirstNum<-function(ZZ){as.numeric(sub("\\D*(\\d+).*", "\\1", ZZ))}
IncII<-do.call(c,lapply(1:length(ZZ),function(U){FindFirstNum(ZZ[[U]])}))
NotInc<-(1:ncol(Xtrain))[!(1:ncol(Xtrain) %in% IncII)]
NotIncCrit<-rep("FullInfo",length(NotInc))
FullNum<-c(IncII,NotInc)
ZZPrime<-vector(mode = "list", length = ncol(Xtrain))
ZZPrime<-do.call(c, list(ZZ, NotIncCrit))
ZZPrime <- ZZPrime[order(FullNum)]
ClassDF<-sapply(Xtrain,class)
UniqueLevel<-sapply(Xtrain,function(U){length(unique(U))})
SynInstance<-lapply(1:NBoot, function(UU){
do.call(data.frame,lapply(1:ncol(Xtrain), function(UU){ ZZP<-ZZPrime[[UU]]
EvalZZP<-X[eval(parse(text=ZZP)),UU]
if(ClassDF[UU]=="factor"){sample(EvalZZP,1)
} else { if(UniqueLevel[UU]<UniqueCrit){sample(EvalZZP,1)
} else {runif(1,min(EvalZZP),max(EvalZZP))}}
})) %>% `colnames<-`(colnames(Xtrain))
})
SynCrit<-do.call(rbind,SynInstance)
return(do.call(cbind,list(SynCrit,rep(i,NBoot))) %>% `colnames<-`(c(colnames(Xtrain),"Group")))
}
IMB_Spread<-length(Majority)-length(Minority)
RepeatNSynData<-do.call(rbind,lapply(1:IMB_Spread,function(i){RepeatSynGen(MinorityRules[i],X,Xtrain,NBoot=50,i=i,UniqueCrit=10)}))https://stackoverflow.com/questions/67594252
复制相似问题