首页
学习
活动
专区
圈层
工具
发布
社区首页 >问答首页 >从规则生成合成实例

从规则生成合成实例
EN

Stack Overflow用户
提问于 2021-05-18 21:38:22
回答 1查看 36关注 0票数 0
代码语言:javascript
复制
require(dplyr)
require(RRF)
require(inTrees)

## Data PreProcessing
german_credit <- read.table("http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data")
colnames(german_credit) <- c("chk_acct", "duration", "credit_his", "purpose", "amount", 
"saving_acct", "present_emp", "installment_rate", "sex", "other_debtor","present_resid", "property", 
 "age", "other_install", "housing", "n_credits", "job", "n_people", "telephone", "foreign", "response")

german_credit$response <- german_credit$response - 1
german_credit$response <- as.factor(german_credit$response)
german_credit <- data.frame(german_credit,stringsAsFactors=TRUE)

LevelNum <- function(x) {x <- as.factor(x); levels(x) <- 1:length(unique(x)); return(x)}
MinMaxNorm <- function(x) {x <- as.numeric(x); MaxX <- max(x); MinX <- min(x) ; return((x - MinX)/(MaxX - MinX))}
german_credit <- german_credit %>% 
  mutate_if(is.character, LevelNum) %>% 
  mutate_if(is.integer, MinMaxNorm)

## Simple IID Bootstrapping with equal probability selection and replacement
NRow_GCredit <- nrow(german_credit)
PERC <- 0.7
PROBS <- rep(1, NRow_GCredit)/NRow_GCredit
TrainID <- sample(1:NRow_GCredit, PERC*NRow_GCredit, prob = PROBS, replace = TRUE)
TestID <- setdiff(1:NRow_GCredit, TrainID)
Ytrain <- german_credit %>% dplyr::select(response)  %>% dplyr::slice(TrainID)  
Ytest  <- german_credit %>% dplyr::select(response)  %>% dplyr::slice(TestID)  
Xtrain <- german_credit %>% dplyr::select(!response) %>% dplyr::slice(TrainID)
Xtest  <- german_credit %>% dplyr::select(!response) %>% dplyr::slice(TestID)

## Emd/ Data PreProcessing 
rf <- RRF::RRF(Xtrain, Ytrain$response, ntree = 500) # build an ordinary RF
ruleExec <- inTrees::extractRules(RF2List(rf), Xtrain, digits = 4) # transform to R-executable rules
YbyRules <- sapply(ruleExec, function(x) { 
  ZZ <- Ytrain[eval(parse(text = x))]
  length(ZZ[ZZ == Minority])/length(ZZ)
})
MinSortedRules <- sort(YbyRules, decreasing = TRUE) 
MinorityRules <- MinSortedRules[MinSortedRules>0.5]

这会生成这样的规则:

代码语言:javascript
复制
"X[,3] %in% c('1') & X[,6] %in% c('2','3','5') & X[,13]<=0.1786 & X[,17] %in% c('3')"

即:

  • 第三个变量是1级
  • ,第6个变量是级别2,3,5
  • ,第13个变量是<=0.1786
  • ,第17个变量是级别3

H 111,其他变量允许更改H 212f 213/code>

我被困在如何从这个规则产生随机合成观测。

EN

回答 1

Stack Overflow用户

发布于 2021-06-01 18:27:30

我解决了问题。这是非常slow...obviously,而不是多线程。这是太随机的,将固有的偏见的数据。一个简单的控件可以是应用KNN对非选定变量进行绑定,也可以是在合成数据上使用更精确的/TGT控件。

代码语言:javascript
复制
 RepeatSynGen<-function(MinorityRules,X,Xtrain,Minority,NBoot,i,UniqueCrit=10){ 

         X<-Xtrain; FullInfo<-1:nrow(Xtrain)
         ZZ<-strsplit(attributes(MinorityRules[1])$names,"&")[[1]] %>% as.list
         FindFirstNum<-function(ZZ){as.numeric(sub("\\D*(\\d+).*", "\\1", ZZ))}

         IncII<-do.call(c,lapply(1:length(ZZ),function(U){FindFirstNum(ZZ[[U]])}))
         NotInc<-(1:ncol(Xtrain))[!(1:ncol(Xtrain) %in% IncII)]
         NotIncCrit<-rep("FullInfo",length(NotInc))
         FullNum<-c(IncII,NotInc)

         ZZPrime<-vector(mode = "list", length = ncol(Xtrain))
         ZZPrime<-do.call(c, list(ZZ, NotIncCrit))
         ZZPrime <- ZZPrime[order(FullNum)]
         ClassDF<-sapply(Xtrain,class)
         UniqueLevel<-sapply(Xtrain,function(U){length(unique(U))})

         SynInstance<-lapply(1:NBoot, function(UU){

                     do.call(data.frame,lapply(1:ncol(Xtrain), function(UU){ ZZP<-ZZPrime[[UU]]
                     EvalZZP<-X[eval(parse(text=ZZP)),UU]
                                             if(ClassDF[UU]=="factor"){sample(EvalZZP,1)
                                                       } else { if(UniqueLevel[UU]<UniqueCrit){sample(EvalZZP,1)
                                                                 } else {runif(1,min(EvalZZP),max(EvalZZP))}} 
                                                                                    })) %>% `colnames<-`(colnames(Xtrain))
                                                        }) 


             SynCrit<-do.call(rbind,SynInstance)
             return(do.call(cbind,list(SynCrit,rep(i,NBoot))) %>% `colnames<-`(c(colnames(Xtrain),"Group")))

 }

IMB_Spread<-length(Majority)-length(Minority)
RepeatNSynData<-do.call(rbind,lapply(1:IMB_Spread,function(i){RepeatSynGen(MinorityRules[i],X,Xtrain,NBoot=50,i=i,UniqueCrit=10)}))
票数 0
EN
页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持
原文链接:

https://stackoverflow.com/questions/67594252

复制
相关文章

相似问题

领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档