首页
学习
活动
专区
圈层
工具
发布
社区首页 >专栏 >单细胞测序—Copykat抽样以及结果映射

单细胞测序—Copykat抽样以及结果映射

原创
作者头像
sheldor没耳朵
发布2026-04-01 17:25:08
发布2026-04-01 17:25:08
1360
举报
文章被收录于专栏:单细胞测序单细胞测序

单细胞测序—Copykat抽样以及结果映射

代码语言:javascript
复制
rm(list=ls())
options(stringsAsFactors = F) 
library(Seurat)
library(ggplot2)
library(clustree)
library(cowplot)
library(data.table)
library(dplyr)
library(stringr)
library(harmony)
#devtools::install_github("navinlabcode/copykat")
library(copykat)

# sce.select <- readRDS('../result/rds/step1_sce_selected_after_umap.rds')
# phe <- sce.select@meta.data
# colnames(phe)
# table(phe$celltype)
mycolors <-c('#E64A35','#4DBBD4' ,'#01A187'  ,'#6BD66B','#3C5588'  ,'#F29F80'  ,
             '#8491B6','#91D0C1','#7F5F48','#AF9E85','#4F4FFF','#CE3D33',
             '#739B57','#EFE685','#446983','#BB6239','#5DB1DC','#7F2268','#800202','#D8D8CD'
)
############ 提取上皮细胞 ############
# set.seed(12345)
# sce.select.Epithelial <- sce.select[, sce.select$celltype %in% c( 'Epithelial cells' )]
# dim(sce.select.Epithelial)#30558 188187
# saveRDS(
#   sce.select.Epithelial,
#   "../result/rds/step2_sce_select_Epithelial.rds"
# )

# sce.select.Epithelial <- readRDS('../result/rds/step2_sce_select_Epithelial.rds')
# dim(sce.select.Epithelial)#30558 188187
# table(sce.select.Epithelial$sample)
# Adeno Samples   NEC Samples 
# 152262         35925 

############ 全部上皮细胞 ############
# sce.select.Epithelial <- NormalizeData(sce.select.Epithelial)
# sce.select.Epithelial <- FindVariableFeatures(sce.select.Epithelial)
# sce.select.Epithelial <- ScaleData(sce.select.Epithelial)
# sce.select.Epithelial <- RunPCA(sce.select.Epithelial)
# sce.select.Epithelial <- RunUMAP(sce.select.Epithelial, dims = 1:30)
# saveRDS(
#   sce.select.Epithelial,
#   "../result/rds/step2_sce_select_Epithelial_after_umap.rds"
# )
message("step1:读取数据")
sce.select.Epithelial <- readRDS('../result/rds/step2_sce_select_Epithelial_after_umap.rds') 
table(sce.select.Epithelial@meta.data$sample)
# Adeno Samples   NEC Samples 
# 152262         35925 

############ copyKAT ############
#在子集上训练(跑CopyKAT),然后将结果标签映射回全集
#copykat的作者建议每个样本单独跑,避免批次效应影响CNV推断
#这里我采取了折中的方案,把NEU跑一次,Ade跑一次,最后在合并到一起
#这种方法废弃了,实际操作中,在neu样本中,copykat找不到正常细胞,即来自neu的样本基本上都是肿瘤细胞
#重新采取的方法是neu细胞保留,ade样本抽样,然后合在一起跑copykat

# ############  copyKAT  ############
# wd <- getwd()
# message(paste("当前工作目录:",wd))
# table(sce.select.Epithelial$sample)
# # Adeno Samples   NEC Samples 
# # 152262         35925 
# set.seed(123)
# # Adeno抽样
# sce.adeno <- subset(sce.select.Epithelial, subset = sample == "Adeno Samples")
# cells.adeno <- sample(colnames(sce.adeno), 30000)
# # NEC全部保留
# sce.nec <- subset(sce.select.Epithelial, subset = sample == "NEC Samples")
# # 合并
# sce.mix <- subset(
#   sce.select.Epithelial,
#   cells = c(cells.adeno, colnames(sce.nec))
# )
# counts <- sce.mix[["RNA"]]$counts
# dim(counts)#30558 65925
# counts <- counts[rowSums(counts > 0) > 20, ]
# dim(counts)#24605 65925
# 
# message("step2:开始执行copykat")
# 
# copykat_result <- copykat(
#   rawmat = counts,
#   id.type = "S",
#   ngene.chr = 3,
#   win.size = 50,
#   KS.cut = 0.1,
#   sam.name = "GBC",
#   distance = "euclidean",
#   norm.cell.names = "",
#   output.seg = FALSE,
#   plot.genes = TRUE,
#   genome = "hg20",
#   n.cores = 12
# )
# 
# saveRDS(copykat_result, "../result/rds/step2_sce_select_Epithelial_copykat.rds")


# ############  copyKAT 结果整理 ############

# list.files(pattern = "^GBC")
# #mv GBC* ../result/step2/copykat_result/
# list.files(path= "../result/step2/copykat_result/",pattern = "^GBC")
# [1] "GBC_copykat_clustering_results.rds"           "GBC_copykat_CNA_raw_results_gene_by_cell.txt"
# [3] "GBC_copykat_CNA_results.txt"                  "GBC_copykat_heatmap.jpeg"                    
# [5] "GBC_copykat_prediction.txt"                   "GBC_copykat_with_genes_heatmap.pdf"  

copykat_result <- readRDS('../result/rds/step2_sce_select_Epithelial_copykat.rds')

pred <- copykat_result$prediction
table(pred$copykat.pred)
# aneuploid     diploid not.defined 
# 11234       50808        3883 

############  copyKAT 结果注释到单细胞中 ############
pred <- copykat_result$prediction
head(pred)
# 只保留明确标签
pred <- pred[pred$copykat.pred %in% c("aneuploid", "diploid"), ]
table(pred$copykat.pred)
# aneuploid   diploid 
# 11234     50808 

#######构建reference Seurat对象######
head(rownames(pred))
cells.use <- rownames(pred)

sce.ref <- subset(
  sce.select.Epithelial,
  cells = cells.use
)

# 添加标签
sce.ref$copykat <- pred[colnames(sce.ref), "copykat.pred"]
table(sce.ref$copykat)
# aneuploid   diploid 
# 11234     50808 
##这里数量不平衡,导致后续分析多数细胞都被注释为diploid,因此后续需要额外采取抽样处理
set.seed(123)
cells.aneu <- colnames(sce.ref)[sce.ref$copykat == "aneuploid"]
cells.dip <- colnames(sce.ref)[sce.ref$copykat == "diploid"]
cells.dip.sub <- sample(cells.dip, length(cells.aneu))
sce.ref.balanced <- subset(
  sce.ref,
  cells = c(cells.aneu, cells.dip.sub)
)
table(sce.ref.balanced$copykat)
# aneuploid   diploid 
# 11234     11234 

#######构建构建query(剩余细胞)对象######
cells.all <- colnames(sce.select.Epithelial)
cells.query <- setdiff(cells.all, cells.use)

sce.query <- subset(
  sce.select.Epithelial,
  cells = cells.query
)

dim(sce.query)
#[1]  30558 126145
#######分开标准化(reference + query象######
# reference
sce.ref.balanced <- NormalizeData(sce.ref.balanced)
sce.ref.balanced <- FindVariableFeatures(sce.ref.balanced, nfeatures = 3000)
sce.ref.balanced <- ScaleData(sce.ref.balanced)
sce.ref.balanced <- RunPCA(sce.ref.balanced, npcs = 30)

# query
sce.query <- NormalizeData(sce.query)
sce.query <- FindVariableFeatures(sce.query, nfeatures = 3000)
sce.query <- ScaleData(sce.query)
sce.query <- RunPCA(sce.query, npcs = 30)

#######Find anchors######
anchors <- FindTransferAnchors(
  reference = sce.ref.balanced,
  query = sce.query,
  dims = 1:30
)

# Performing PCA on the provided reference using 3000 features as input.
# Projecting cell embeddings
# Finding neighborhoods
# Finding anchors
# Found 30064 anchors



#######标签转移######
predictions <- TransferData(
  anchorset = anchors,
  refdata = sce.ref.balanced$copykat,
  dims = 1:30
)

head(predictions)


sce.query$copykat_pred <- predictions$predicted.id
sce.query$copykat_score <- predictions$prediction.score.max

table(sce.query$copykat_pred)
# aneuploid   diploid 
# 293    125852 
#######合并reference + query ######
# reference
sce.ref.balanced$copykat_final <- sce.ref.balanced$copykat
# query
sce.query$copykat_final <- sce.query$copykat_pred
# 合并
sce.all.epi <- merge(sce.ref.balanced, y = sce.query)
table(sce.all.epi$copykat_final)
# aneuploid   diploid 
# 11527    137086 
sce.all.epi
# An object of class Seurat 
# 30558 features across 148613 samples within 1 assay 
# Active assay: RNA (30558 features, 3000 variable features)
# 6 layers present: counts.1, counts.2, data.1, scale.data.1, data.2, scale.data.2
sce.all.epi <- JoinLayers(sce.all.epi)
sce.all.epi
# An object of class Seurat 
# 30558 features across 148613 samples within 1 assay 
# Active assay: RNA (30558 features, 3000 variable features)
# 3 layers present: scale.data, data, counts
phe <- sce.all.epi@meta.data

sce.all.epi <- NormalizeData(sce.all.epi)
sce.all.epi <- FindVariableFeatures(sce.all.epi, nfeatures = 3000)
sce.all.epi <- ScaleData(sce.all.epi)
sce.all.epi <- RunPCA(sce.all.epi, npcs = 30)
sce.all.epi <- RunHarmony(sce.all.epi, "sample_name")
sce.all.epi <- RunUMAP(sce.all.epi,reduction = "harmony",dims = 1:30)

saveRDS(sce.all.epi, "../result/rds/step2.1_sce_all_epi_after_copykat_umap.rds")

sce.all.epi <- readRDS('../result/rds/step2.1_sce_all_epi_after_copykat_umap.rds')
table(sce.all.epi$copykat_final)
# aneuploid   diploid 
# 11527    137086 
######可视化 ######
############ umap 可视化 ############
epi_copykat_umap <- DimPlot(
  sce.all.epi,
  reduction = "umap",
  cols = mycolors,
  group.by = "copykat_final",
  label = TRUE
)+ ggtitle("Epithelial cells ")
epi_copykat_umap
ggsave('../result/step2/epi_copykat_umap.pdf',plot = epi_copykat_umap,width = 6,height = 5)

mycolors1 <- c('#4DBBD4','#E64A35')
epi_sample_umap <- DimPlot(
  sce.all.epi,
  reduction = "umap",
  cols = mycolors1,
  group.by = "sample",
  label = TRUE
)+ ggtitle("Epithelial cells ")
epi_sample_umap
ggsave('../result/step2/epi_sample_umap.pdf',plot = epi_sample_umap,width = 6,height = 5)



#########copykat 热图绘制#######
CNA.test <- data.frame(copykat_result$CNAmat)
CNA.test[1:4,1:4]
pred.test <- data.frame(copykat_result$prediction)
pred.test <- pred.test[which(pred.test$copykat.pred %in% c("aneuploid","diploid")),]  ##keep defined cells

#####细胞数目太多,这里抽样绘制###########
set.seed(123)
# 每类抽样数量
n_sample <- 200

cells_use <- unlist(
  lapply(split(rownames(pred.test), pred.test$copykat.pred), function(x){
    sample(x, min(length(x), n_sample))
  })
)
# 子集数据
CNA.sub <- CNA.test[, c(colnames(CNA.test)[1:3], cells_use)]
pred.sub <- pred.test[cells_use, ]

########## 颜色设置 ##########
# 热图配色
my_palette <- colorRampPalette(rev(RColorBrewer::brewer.pal(n = 3, name = "RdBu")))(n = 999)

# 染色体颜色
chr <- as.numeric(CNA.sub$chrom) %% 2 + 1
rbPal1 <- colorRampPalette(c('black','grey'))
CHR <- rbPal1(2)[as.numeric(chr)]
chr1 <- cbind(CHR, CHR)

# 细胞注释颜色
rbPal5 <- colorRampPalette(RColorBrewer::brewer.pal(n = 8, name = "Dark2")[2:1])
com.preN <- pred.sub$copykat.pred
pred_col <- rbPal5(2)[as.numeric(factor(com.preN))]
cells <- rbind(pred_col, pred_col)
# 颜色断点
col_breaks = c(seq(-1,-0.4,length=50),
               seq(-0.4,-0.2,length=150),
               seq(-0.2,0.2,length=600),
               seq(0.2,0.4,length=150),
               seq(0.4,1,length=50))

########## 绘图 ##########
pdf("../result/step2/copykat_result/copykat_balanced_sampling.pdf", height = 15, width = 15)
heatmap.3(
  t(CNA.sub[,4:ncol(CNA.sub)]),
  dendrogram = "r",
  distfun = function(x) parallelDist::parDist(x, threads = 4, method = "euclidean"),
  hclustfun = function(x) hclust(x, method = "ward.D2"),
  ColSideColors = chr1,
  RowSideColors = cells,
  Colv = NA,
  Rowv = TRUE,
  notecol = "black",
  col = my_palette,
  breaks = col_breaks,
  key = TRUE,
  keysize = 1,
  density.info = "none",
  trace = "none",
  cexRow = 0.1,
  cexCol = 0.1,
  cex.main = 1,
  cex.lab = 0.1,
  symm = FALSE,
  symkey = FALSE,
  symbreaks = TRUE,
  cex = 1,
  margins = c(10,10),
  labRow = NA,  
  labCol = NA  
)
legend(
  "topright",
  paste("pred.", names(table(com.preN)), sep=""),
  pch = 15,
  col = RColorBrewer::brewer.pal(n = 8, name = "Dark2")[2:1],
  cex = 2,
  bty = "n"
)
dev.off()
#############小提琴图##############
### 转化cnascore数据 ###
# expr <- CNA.test[,4:ncol(CNA.test)]# 前三列是坐标数据
# CNV_score=expr-1# 转换为以0为中心分布的数据
# CNV_score=CNV_score ^ 2# 平方,防止累加的过程中cnv lose和cnv gain相互抵消,造成没有cnv事件发生的假象
# CNV_score=as.data.frame(colMeans(CNV_score))# 以细胞为单位,获得cnv score
# colnames(CNV_score)="CNV_score"# 更改列名
# CNV_score$CB=rownames(CNV_score)# 加入cell barcode列
# 
# 
# scRNA <- sce.all.epi
# scRNA <- AddMetaData(scRNA,metadata = pred.test)
# 
# scRNA$CB <- colnames(scRNA)# 统一cell barcode名称
# CNV_score=CNV_score%>%inner_join(scRNA@meta.data,by="CB")# 合并cnvscore和聚类结果
# 
# library(ggplot2)
# library(ggsignif)
# # 按照CNA变异情况分类:
# ggplot(CNV_score,aes(copykat.pred,CNV_score,fill=copykat.pred))+
#   geom_violin(alpha=0.4)+ #小提琴图需要一定的透明度
#   stat_boxplot(geom="errorbar",position = position_dodge(width = 0.1),width=0.1)+ # 加上误差棒
#   geom_boxplot(alpha=0.5,outlier.size=0, size=0.3, width=0.3)+ # 加上箱线图
#   scale_fill_manual(values = ggsci::pal_aaas()(3))+ # 填充颜色
#   theme_bw()+# 主题为空白
#   geom_signif(comparisons = list(c("aneuploid","diploid")
#   ),
#   step_increase = 0.1,map_signif_level = F)
# 

运行日志

代码语言:javascript
复制
step1:读取数据
当前工作目录: /home/data/t080601/Lixiao/TTF/ST/IN2025121601_liuziyu/code

Adeno Samples   NEC Samples 
       152262         35925 
[1] 30558 65925
[1] 24605 65925
step2:开始执行copykat
[1] "running copykat v1.1.0"
[1] "step1: read and filter data ..."
[1] "24605 genes, 65925 cells in raw data"
[1] "10051 genes past LOW.DR filtering"
[1] "step 2: annotations gene coordinates ..."
[1] "start annotation ..."
[1] "step 3: smoothing data with dlm ..."
[1] "step 4: measuring baselines ..."
number of iterations= 108 
number of iterations= 135 
number of iterations= 502 
number of iterations= 343 
number of iterations= 293 
number of iterations= 734 
[1] "low confidence in classification"
[1] "cell: 1"
number of iterations= 183 
[1] "cell: 2"
WARNING! NOT CONVERGENT! 
number of iterations= 500 
[1] "cell: 3"
number of iterations= 125 
[1] "cell: 4"
number of iterations= 347 
[1] "cell: 5"
number of iterations= 318 
[1] "cell: 6"
WARNING! NOT CONVERGENT! 
number of iterations= 500 
[1] "cell: 7"
number of iterations= 107 
[1] "cell: 8"
WARNING! NOT CONVERGENT! 
number of iterations= 500 
[1] "cell: 9"
WARNING! NOT CONVERGENT! 
number of iterations= 500 
[1] "cell: 10"
number of iterations= 95 
[1] "cell: 11"
number of iterations= 350 
[1] "cell: 12"
WARNING! NOT CONVERGENT! 
number of iterations= 500 
[1] "cell: 13"
number of iterations= 166 
[1] "cell: 14"
number of iterations= 296 
[1] "cell: 15"
number of iterations= 306 
[1] "cell: 16"
WARNING! NOT CONVERGENT! 
number of iterations= 500 
[1] "cell: 17"
number of iterations= 238 
[1] "cell: 18"
WARNING! NOT CONVERGENT! 
number of iterations= 500 
[1] "cell: 19"
WARNING! NOT CONVERGENT! 
number of iterations= 500 
[1] "cell: 20"
number of iterations= 115 
[1] "cell: 21"
number of iterations= 118 
[1] "cell: 22"
WARNING! NOT CONVERGENT! 
number of iterations= 500 
[1] "cell: 23"
number of iterations= 224 
[1] "cell: 24"
number of iterations= 121 
[1] "cell: 25"
number of iterations= 105 
[1] "cell: 26"
WARNING! NOT CONVERGENT! 
number of iterations= 500 
[1] "cell: 27"
number of iterations= 83 
[1] "cell: 28"
number of iterations= 206 
[1] "cell: 29"
WARNING! NOT CONVERGENT! 
number of iterations= 500 
[1] "cell: 30"
WARNING! NOT CONVERGENT! 
number of iterations= 500 
[1] "cell: 31"
number of iterations= 76 
[1] "cell: 32"
number of iterations= 197 
[1] "cell: 33"
number of iterations= 244 
[1] "cell: 34"
number of iterations= 315 
[1] "cell: 35"
number of iterations= 209 
[1] "cell: 36"
number of iterations= 149 
[1] "cell: 37"
WARNING! NOT CONVERGENT! 
number of iterations= 500 
[1] "cell: 38"
number of iterations= 313 
[1] "cell: 39"
number of iterations= 224 
[1] "cell: 40"
number of iterations= 249 
[1] "cell: 41"
number of iterations= 120 
[1] "cell: 42"
number of iterations= 166 
[1] "cell: 43"
number of iterations= 97 
[1] "cell: 44"
number of iterations= 371 
[1] "cell: 45"
number of iterations= 167 
[1] "cell: 46"
number of iterations= 383 
[1] "cell: 47"
number of iterations= 165 
[1] "cell: 48"
number of iterations= 384 
[1] "cell: 49"
number of iterations= 291 
[1] "cell: 50"
number of iterations= 205 
[1] "cell: 51"
number of iterations= 309 
[1] "cell: 52"
number of iterations= 117 
[1] "cell: 53"
number of iterations= 408 
[1] "cell: 54"
WARNING! NOT CONVERGENT! 
number of iterations= 500 
[1] "cell: 55"
number of iterations= 214 
[1] "cell: 56"
number of iterations= 115 
[1] "cell: 57"
number of iterations= 407 
[1] "cell: 58"
number of iterations= 108 
[1] "cell: 59"
WARNING! NOT CONVERGENT! 
number of iterations= 500 
[1] "cell: 60"
number of iterations= 135 
[1] "cell: 61"
number of iterations= 154 
[1] "cell: 62"
number of iterations= 158 
[1] "cell: 63"
number of iterations= 164 
[1] "cell: 64"
WARNING! NOT CONVERGENT! 
number of iterations= 500 
[1] "cell: 65"
number of iterations= 131 
[1] "cell: 66"
WARNING! NOT CONVERGENT! 
number of iterations= 500 
[1] "cell: 67"
number of iterations= 151 
[1] "cell: 68"
number of iterations= 414 
[1] "cell: 69"
number of iterations= 267 
[1] "cell: 70"
number of iterations= 498 
[1] "cell: 71"
number of iterations= 268 
[1] "cell: 72"
number of iterations= 189 
[1] "cell: 73"
number of iterations= 137 
[1] "cell: 74"
WARNING! NOT CONVERGENT! 
number of iterations= 500 
[1] "cell: 75"
number of iterations= 234 
[1] "cell: 76"
number of iterations= 136 
[1] "cell: 77"
WARNING! NOT CONVERGENT! 
number of iterations= 500 
[1] "cell: 78"
number of iterations= 119 
[1] "cell: 79"
number of iterations= 156 
[1] "cell: 80"
number of iterations= 147 
[1] "cell: 81"
number of iterations= 339 
[1] "cell: 82"
number of iterations= 390 
[1] "cell: 83"
WARNING! NOT CONVERGENT! 
number of iterations= 500 
[1] "cell: 84"
WARNING! NOT CONVERGENT! 
number of iterations= 500 
[1] "cell: 85"
WARNING! NOT CONVERGENT! 
number of iterations= 500 
[1] "cell: 86"
number of iterations= 379 
[1] "cell: 87"
number of iterations= 245 
[1] "cell: 88"
number of iterations= 342 
[1] "cell: 89"
number of iterations= 394 
[1] "cell: 90"
number of iterations= 466 
[1] "cell: 91"
number of iterations= 425 
[1] "cell: 92"
number of iterations= 324 
[1] "cell: 93"
number of iterations= 156 
[1] "cell: 94"
number of iterations= 345 
[1] "cell: 95"
number of iterations= 304 
[1] "cell: 96"
number of iterations= 287 
[1] "cell: 97"
WARNING! NOT CONVERGENT! 
number of iterations= 500 
[1] "cell: 98"
WARNING! NOT CONVERGENT! 
number of iterations= 500 
[1] "cell: 99"
WARNING! NOT CONVERGENT! 
number of iterations= 500 
[1] "cell: 100"
number of iterations= 303 
[1] "cell: 101"
number of iterations= 350 
[1] "cell: 102"
WARNING! NOT CONVERGENT! 
number of iterations= 500 
[1] "cell: 103"
WARNING! NOT CONVERGENT! 
number of iterations= 500 
[1] "cell: 104"
WARNING! NOT CONVERGENT! 
number of iterations= 500 
[1] "cell: 105"
number of iterations= 434 
[1] "cell: 106"
number of iterations= 350 
[1] "cell: 107"
number of iterations= 124 
[1] "cell: 108"
number of iterations= 246 
[1] "cell: 109"
WARNING! NOT CONVERGENT! 
number of iterations= 500 
[1] "cell: 110"
number of iterations= 282 
[1] "cell: 111"
number of iterations= 322 
[1] "cell: 112"
number of iterations= 143 
[1] "cell: 113"
WARNING! NOT CONVERGENT! 
number of iterations= 500 
[1] "cell: 114"
number of iterations= 102 
[1] "cell: 115"
number of iterations= 84 
[1] "step 5: segmentation..."
[1] "step 6: convert to genomic bins..."
[1] "step 7: adjust baseline ..."
[1] "step 8: final prediction ..."
[1] "step 9: saving results..."
[1] "step 10: ploting heatmap ..."
Time difference of 2.413455 days
Warning messages:
1: In asMethod(object) :
  sparse->dense coercion: allocating vector of size 12.1 GiB
2: In asMethod(object) :
  sparse->dense coercion: allocating vector of size 12.1 GiB
3: In asMethod(object) :
  sparse->dense coercion: allocating vector of size 4.6 GiB

部分结果图

映射回所有的细胞

原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。

如有侵权,请联系 cloudcommunity@tencent.com 删除。

原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。

如有侵权,请联系 cloudcommunity@tencent.com 删除。

评论
登录后参与评论
0 条评论
热度
最新
推荐阅读
目录
  • 单细胞测序—Copykat抽样以及结果映射
领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档