文章/答案/技术大牛

发布

社区首页 >问答首页 >分级Dirichlet回归(Jags).过度拟合

问分级Dirichlet回归(Jags).过度拟合
EN

Stack Overflow用户

提问于 2021-04-01 07:51:08

回答 1查看 117关注 0票数 0

早上好，我需要社区的帮助，以便弄清楚我在编写这个模型时遇到的一些问题。我的目标是以 "log_GDP"（国内生产总值的对数）和 "log_h"（每1,000人住院病床数的对数）作为预测变量，对各死因的死亡比例进行建模。

y：3列，为历年观测到的各死因死亡人数所占比例。
x1："log_GDP"（国内生产总值的对数）
x2："log_h"（每1,000人住院病床数的对数）

从最后一幅图可以看出，估计结果的噪声很大。而当我只使用一个协变量（即 log_GDP）时，得到的结果是平滑的。

在这里，模型规范：

以下是模拟数据：

library(reshape2)
library(tidyverse)
library(ggplot2)
library(runjags)

# Simulated data: 15 observations of the share of deaths attributed to each of
# three causes (CIRC, NEOP, OTHER) together with the two predictors log_h and
# log_GDP.
CIRC <- c(
  0.3685287, 0.3675516, 0.3567829, 0.3517274, 0.3448940,
  0.3391031, 0.3320184, 0.3268640, 0.3227445, 0.3156360,
  0.3138515, 0.3084506, 0.3053657, 0.3061224, 0.3051044
)

NEOP <- c(
  0.3602199, 0.3567355, 0.3599409, 0.3591258, 0.3544591,
  0.3566269, 0.3510974, 0.3536156, 0.3532980, 0.3460948,
  0.3476183, 0.3475634, 0.3426035, 0.3352433, 0.3266048
)

OTHER <- c(
  0.2712514, 0.2757129, 0.2832762, 0.2891468, 0.3006468,
  0.3042701, 0.3168842, 0.3195204, 0.3239575, 0.3382691,
  0.3385302, 0.3439860, 0.3520308, 0.3586342, 0.3682908
)

log_h <- c(
  1.280934, 1.249902, 1.244155, 1.220830, 1.202972,
  1.181727, 1.163151, 1.156881, 1.144223, 1.141033,
  1.124930, 1.115142, 1.088562, 1.075002, 1.061257
)

log_GDP <- c(
  29.89597, 29.95853, 29.99016, 30.02312, 30.06973,
  30.13358, 30.19878, 30.25675, 30.30184, 30.31974,
  30.30164, 30.33854, 30.37460, 30.41585, 30.45150
)

D <- data.frame(
  CIRC = CIRC, NEOP = NEOP, OTHER = OTHER,
  log_h = log_h, log_GDP = log_GDP
)

# Response matrix. The cause columns are selected by name rather than by
# position so that reordering the columns of D cannot silently swap causes,
# and so the matrix keeps readable column names.
cause.y <- as.matrix(D[, c("CIRC", "NEOP", "OTHER")])

# Renormalise every row so the three shares sum exactly to 1.
cause.y <- cause.y / rowSums(cause.y)

# Predictor vectors handed to the model.
mat.x <- D$log_GDP
mat.x2 <- D$log_h

# Number of observations, derived from the data instead of hard-coded.
n <- nrow(D)

Jags模型

# JAGS model specification for the Dirichlet regression, kept as a single
# string (the text inside the quotes is JAGS code, not R).
#   * For each of the N.spp response categories j, the intercept m0[j] and the
#     slopes m1[j] (for mat.x) and m2[j] (for mat.x2) get dnorm(0, 1.0E-3)
#     priors. JAGS parameterises dnorm by precision, so 1.0E-3 corresponds to
#     a variance of 1000.
#   * Each observed row y[i, ] is Dirichlet distributed with concentration
#     parameters a0[i, ], which are tied to the predictors through a log link.
# Data expected by the model: y, mat.x, mat.x2, N and N.spp.
dirlichet.model = "
model {
#setup priors for each species
for(j in 1:N.spp){
m0[j] ~ dnorm(0, 1.0E-3) #intercept prior
m1[j] ~ dnorm(0, 1.0E-3) #      mat.x prior
m2[j] ~ dnorm(0, 1.0E-3)
}

#implement dirlichet
for(i in 1:N){
y[i,1:N.spp] ~ ddirch(a0[i,1:N.spp])


for(j in 1:N.spp){
log(a0[i,j]) <- m0[j] + m1[j] * mat.x[i]+ m2[j] * mat.x2[i] # m0 = intercept; m1= coeff log_GDP; m2= coeff log_h
}

}} #close model loop.
"

# Data passed to JAGS: the response matrix, both predictors, and the
# dimensions the model loops over (N rows, N.spp response categories).
jags.data <- list(
  y = cause.y,
  mat.x = mat.x,
  mat.x2 = mat.x2,
  N = nrow(cause.y),
  N.spp = ncol(cause.y)
)

# Fit the model with three chains, monitoring only the regression
# coefficients m0, m1 and m2.
jags.out <- run.jags(
  model = dirlichet.model,
  monitor = c("m0", "m1", "m2"),
  data = jags.data,
  n.chains = 3,
  adapt = 5000,
  burnin = 5000,
  sample = 10000
)

# Summary table of the monitored nodes; show its first rows.
out <- summary(jags.out)
head(out)

提取系数并估计比例

# Pull the point estimates out of column 4 of the summary table.
# NOTE(review): column 4 is presumably the posterior mean and the rows are
# presumably ordered m0[1:3], m1[1:3], m2[1:3] -- confirm against
# dimnames(out) before relying on these positions.
coeff <- out[1:9, 4]

# One (intercept, mat.x slope, mat.x2 slope) triple per cause.
coef1 <- out[c(1, 4, 7), 4] # cause 1
coef2 <- out[c(2, 5, 8), 4] # cause 2
coef3 <- out[c(3, 6, 9), 4] # cause 3

# exp(linear predictor) for each cause, one column per cause ...
pred <- do.call(
  cbind,
  lapply(
    list(coef1, coef2, coef3),
    function(b) exp(b[1] + b[2] * mat.x + b[3] * mat.x2)
  )
)

# ... rescaled so every row sums to 1 (predicted proportions).
pred <- pred / rowSums(pred)

观测值与预测值数据集

# Observed proportions alongside the two predictors.
Obs <- data.frame(
  Circ = cause.y[, 1],
  Neop = cause.y[, 2],
  Other = cause.y[, 3],
  log_GDP = mat.x,
  log_h = mat.x2
)
Obs[["model"]] <- "Obs"

# Model-based proportions on the same predictor values.
Pred <- data.frame(
  Circ = pred[, 1],
  Neop = pred[, 2],
  Other = pred[, 3],
  log_GDP = mat.x,
  log_h = mat.x2
)
Pred[["model"]] <- "Pred"

# Stack both tables and reshape to long format: one row per
# (observation, cause), with the cause name in `variable` and the
# proportion in `value`.
tot60 <- rbind(Obs, Pred)
tot <- melt(tot60, id.vars = c("log_GDP", "log_h", "model"))
tot$variable <- as.factor(tot$variable)

绘图

# Observed proportions as points and predicted proportions as a line,
# plotted against log_GDP with one free-scaled panel per cause.
obs_rows <- tot %>% filter(model == "Obs")
pred_rows <- tot %>% filter(model == "Pred")

ggplot(obs_rows, aes(log_GDP, value)) +
  geom_point() +
  geom_line(data = pred_rows) +
  facet_wrap(. ~ variable, scales = "free")

dirichlet

jags

hierarchical-bayesian

回答 1

Stack Overflow用户

发布于 2021-04-03 17:18:04

曲线不平滑的原因在于：您计算的是 Pr(y=m|X) = f(x1, x2)，即预测概率是 x1 和 x2 两个变量的函数；然而您却把 Pr(y=m|X) 仅作为单个 x 变量 log_GDP 的函数来绘图，这样得到的曲线几乎肯定不会是平滑的。log_GDP 和 log_h 这两个变量高度负相关，这正是曲线的波动没有变得更大的原因。

在我运行的模型中，log_GDP 的平均系数实际上是正的，这说明您在图中看到的结果具有很强的误导性。如果把预测值同时作为这两个变量的函数来绘图（即绘制二维曲面），您会发现结果又是平滑的。

# Evaluate the fitted proportions on a regular 25 x 25 grid spanning the
# observed ranges of both predictors, then draw the surface for one cause.
mx1 <- seq(min(mat.x), max(mat.x), length.out = 25)
mx2 <- seq(min(mat.x2), max(mat.x2), length.out = 25)

# expand.grid() varies mx1 fastest and mx2 slowest; all 625 combinations.
eg <- expand.grid(mx1 = mx1, mx2 = mx2)

# exp(linear predictor) per cause at every grid point, normalised by row.
pred <- as.matrix(cbind(exp(coef1[1] + coef1[2] * eg$mx1 + coef1[3] * eg$mx2),
                        exp(coef2[1] + coef2[2] * eg$mx1 + coef2[3] * eg$mx2),
                        exp(coef3[1] + coef3[2] * eg$mx1 + coef3[3] * eg$mx2)))
pred <- pred / rowSums(pred)

# The predictor columns must come from the grid itself. Passing the length-25
# vectors mx1/mx2 would be silently recycled by data.frame(), and the recycled
# mx2 would not follow expand.grid()'s ordering, pairing predictions with the
# wrong log_h values.
Pred <- data.frame(Circ = pred[, 1],
                   Neop = pred[, 2],
                   Other = pred[, 3],
                   log_GDP = eg$mx1,
                   log_h = eg$mx2)

lattice::wireframe(Neop ~ log_GDP + log_h, data = Pred, drape = TRUE)

还有几件事要留心。

通常在分层贝叶斯模型中，系数参数本身也服从带有超参数的分布。这会使各系数向全局均值收缩，这正是分层模型的一个特征。

我不确定您的真实数据是什么样的，但这两个自变量之间的相关性会使模型很难收敛。您可以尝试对系数使用多元正态先验分布——这可能会有所帮助。

票数 1

页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持

原文链接：

https://stackoverflow.com/questions/66900437

复制

相似问题

问分级Dirichlet回归(Jags).过度拟合
EN

回答 1

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问分级Dirichlet回归(Jags).过度拟合EN

回答 1

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问分级Dirichlet回归(Jags).过度拟合
EN