我有两个想要加入的数据格式,通常我做的是inner_join,它只给出数据和rest中都存在的数据,并且没有显示在输出中。
如何进行内连接和保留所有行并分配标签
第一次数据
dput(head(EF_TF_region_expression))
structure(list(ENSEMBL = structure(c(1L, 2L, 3L, 3L, 4L, 5L), .Label = c("ENSG00000006468",
"ENSG00000030419", "ENSG00000057657", "ENSG00000068305", "ENSG00000068903",
"ENSG00000082014", "ENSG00000085276", "ENSG00000091831", "ENSG00000100298",
"ENSG00000102349", "ENSG00000102554", "ENSG00000102935", "ENSG00000105991",
"ENSG00000107485", "ENSG00000109906", "ENSG00000112182", "ENSG00000118922",
"ENSG00000119508", "ENSG00000121067", "ENSG00000124203", "ENSG00000124440",
"ENSG00000124613", "ENSG00000130382", "ENSG00000136770", "ENSG00000138738",
"ENSG00000140009", "ENSG00000140968", "ENSG00000143499", "ENSG00000146285",
"ENSG00000146592", "ENSG00000147862", "ENSG00000148143", "ENSG00000150907",
"ENSG00000151090", "ENSG00000151322", "ENSG00000151623", "ENSG00000152217",
"ENSG00000159882", "ENSG00000160321", "ENSG00000162367", "ENSG00000162599",
"ENSG00000162772", "ENSG00000164105", "ENSG00000164684", "ENSG00000167034",
"ENSG00000167081", "ENSG00000171316", "ENSG00000173258", "ENSG00000174306",
"ENSG00000176399", "ENSG00000176842", "ENSG00000177508", "ENSG00000177606",
"ENSG00000177946", "ENSG00000178028", "ENSG00000178175", "ENSG00000178187",
"ENSG00000178573", "ENSG00000181690", "ENSG00000182348", "ENSG00000185155",
"ENSG00000185420", "ENSG00000185630", "ENSG00000186350", "ENSG00000186766",
"ENSG00000187098", "ENSG00000189190", "ENSG00000196498", "ENSG00000197343",
"ENSG00000197576", "ENSG00000198300", "ENSG00000213341", "ENSG00000213793",
"ENSG00000213973", "ENSG00000269067"), class = "factor"), Symbol = structure(c(13L,
20L, 45L, 45L, 30L, 51L), .Label = c("APOBEC3H", "ATF3", "BACH2",
"CENPBD1", "CHD7", "CHUK", "CREB5", "DMAP1", "DMRTA1", "DNAJC1",
"ESR1", "ESR2", "ETV1", "FOXI2", "FOXO1", "GATA3", "HIF3A", "HOXA1",
"HOXA4", "IKZF2", "IRF8", "IRX3", "IRX5", "JUN", "KLF12", "KLF5",
"KLF8", "MAF", "MECOM", "MEF2A", "MITF", "MIXL1", "MLLT1", "NCOR2",
"NFIA", "NFIB", "NKX3-1", "NPAS3", "NR3C2", "NR4A3", "PBX1",
"PBX3", "PEG3", "PLAG1", "PRDM1", "PRDM5", "RXRA", "SAP30", "SCML4",
"SETBP1", "SIRT2", "SMARCD3", "SMYD2", "SMYD3", "SPOP", "TAL1",
"THRB", "ZBTB16", "ZHX3", "ZNF208", "ZNF230", "ZNF366", "ZNF391",
"ZNF423", "ZNF454", "ZNF462", "ZNF483", "ZNF600", "ZNF655", "ZNF704",
"ZNF728", "ZNF804B", "ZNF831", "ZNF888", "ZNF99"), class = "factor"),
gene = structure(c(1L, 1L, 2L, 2L, 2L, 1L), .Label = c("gene_down",
"gene_UP"), class = "factor"), Region = structure(c(1L, 1L,
2L, 2L, 2L, 1L), .Label = c("Region_down", "Region_UP"), class = "factor"),
log2FoldChange_gene = c(-6.14406801102623, -2.6627775459419,
3.88870894117496, 3.88870894117496, 1.99677979125181, -1.97211419094463
), Peak_Region = structure(c(22L, 1L, 19L, 19L, 57L, 70L), .Label = c("Peak116875",
"Peak122965", "Peak124509", "Peak12882", "Peak129965", "Peak133291",
"Peak137285", "Peak144690", "Peak156227", "Peak158090", "Peak159944",
"Peak165589", "Peak17226", "Peak17361", "Peak174329", "Peak177429",
"Peak182129", "Peak18278", "Peak182835", "Peak183059", "Peak187290",
"Peak190384", "Peak191607", "Peak191615", "Peak191784", "Peak196275",
"Peak197134", "Peak20062", "Peak201433", "Peak204083", "Peak206496",
"Peak206939", "Peak208441", "Peak21185", "Peak215070", "Peak215745",
"Peak219884", "Peak220299", "Peak220725", "Peak221948", "Peak222959",
"Peak22385", "Peak226499", "Peak28743", "Peak30996", "Peak40815",
"Peak4292", "Peak4601", "Peak53449", "Peak5453", "Peak56079",
"Peak5702", "Peak58241", "Peak58295", "Peak62243", "Peak64400",
"Peak74210", "Peak77556", "Peak77907", "Peak77943", "Peak79841",
"Peak80519", "Peak81090", "Peak85355", "Peak91665", "Peak95258",
"Peak96987", "Peak97000", "Peak97004", "Peak97803", "Peak98350",
"Peak98598", "Peak99349", "Peak99355", "Peak99674"), class = "factor"),
log2FoldChange_Region = c(-6.34669248759273, -3.48228412439449,
2.00811698735437, 2.00811698735437, 2.43720241001405, -2.43644562364537
), Family = structure(c(2L, 2L, 1L, 2L, 2L, 1L), .Label = c("EF",
"TF"), class = "factor")), row.names = c(NA, 6L), class = "data.frame")我的第二个数据包含的信息,我想要地图,如果基因,都是与血液病有关的。
dput(head(haematopoetic_genes))
structure(list(Rank = 1:6, Symbol = structure(c(134L, 209L, 138L,
716L, 162L, 531L), .Label = c("18S_RRNA", "ABCA1", "ABCB1", "ABCG1",
"ABCG2", "ABL1", "ACAN", "ACE", "ACHE", "ACKR2", "ACP5", "ACTG1",
"ADA", "ADAMTS13", "ADIPOQ", "ADRB2", "AFDN", "AFF1", "AFP",
"AGO2", "AICDA", "AIF1", "AIRE", "AKT1", "ALB", "ALCAM", "ALDH1A1",
"ALK", "AMBP", "ANGPT1", "ANGPT2", "ANGPTL5", "ANPEP", "ANXA2",
"ANXA5", "ANXA8", "AOC3", "APOA1", "APOE", "APP", "AR", "ARG1",
"ARG2", "ARID3A", "ASXL1", "ATG5", "ATG7", "ATM", "ATP12A", "ATP4A",
"AURKA", "AXIN1", "AXIN2", "AXL", "B3GAT1", "BANF1", "BCL11A",
"BCL11B", "BCL2A1", "BCL2L1", "BCL2L11", "BCL6", "BCR", "BDNF",
"BECN1", "BGLAP", "BLNK", "BMI1", "BMP2", "BMP4", "BMP6", "BMP7",
"BRCA1", "BRCA2", "BRD4", "BTK", "C3", "CALCA", "CALR", "CASP1",
"CASP3", "CASP8", "CASP9", "CBFA2T3", "CBL", "CCL11", "CCL17",
"CCL19", "CCL2", "CCL20", "CCL21", "CCL25", "CCL27", "CCL3",
"CCL4", "CCL4L2", "CCL5", "CCL7", "CCN2", "CCNA2", "CCNB1", "CCND1",
"CCND2", "CCND3", "CCR1", "CCR2", "CCR3", "CCR5", "CCR6", "CCR7",
"CCR9", "CCRL2", "CD163", "CD164", "CD19", "CD1A", "CD1B", "CD1C",
"CD1D", "CD1E", "CD2", "CD200", "CD207", "CD22", "CD226", "CD24",
"CD244", "CD247", "CD27", "CD274", "CD276", "CD28", "CD33", "CD34",
"CD38", "CD3D", "CD3E", "CD4", "CD40", "CD40LG", "CD44", "CD47",
"CD48", "CD5", "CD52", "CD55", "CD58", "CD59", "CD6", "CD63",
"CD68", "CD69", "CD7", "CD70", "CD74", "CD79A", "CD79B", "CD80",
"CD81", "CD83", "CD86", "CD8A", "CD9", "CD93", "CD96", "CDC42",
"CDH1", "CDH17", "CDH2", "CDH5", "CDK1", "CDK2", "CDK4", "CDK6",
"CDKN1A", "CDKN1B", "CDKN1C", "CDKN2A", "CDKN2B", "CDKN2C", "CDKN3",
"CDX4", "CEACAM8", "CEBPA", "CEBPB", "CEBPE", "CFLAR", "CHEK1",
"CHEK2", "CISH", "CLEC11A", "CLEC12A", "CLEC4C", "CLEC7A", "COL1A1",
"COLEC10", "COMMD3-BMI1", "CR2", "CREB1", "CREBBP", "CRKL", "CRP",
"CRYGC", "CSF1", "CSF1R", "CSF2", "CSF2RA", "CSF2RB", "CSF3",
"CSF3R", "CTCF", "CTNNB1", "CTRL", "CTSG", "CTSK", "CX3CL1",
"CX3CR1", "CXCL1", "CXCL10", "CXCL11", "CXCL12", "CXCL13", "CXCL2",
"CXCL5", "CXCL8", "CXCL9", "CXCR1", "CXCR2", "CXCR3", "CXCR4",
"CXCR5", "CYBB", "CYCS", "CYGB", "CYP3A4", "DCLRE1C", "DDX58",
"DHFR", "DICER1", "DKC1", "DKK1", "DLK1", "DLL1", "DLL4", "DNMT1",
"DNMT3A", "DNMT3B", "DNTT", "DOCK8", "DOT1L", "DPP4", "DROSHA",
"DTX1", "EBF1", "EDN1", "EGF", "EGFR", "EGR1", "EGR2", "EIF2AK3",
"EIF4EBP1", "ELANE", "ELN", "EMCN", "ENG", "ENO2", "ENSP00000204615",
"ENSP00000217964", "ENSP00000228280", "ENSP00000245323", "ENSP00000268035",
"ENSP00000277541", "ENSP00000282030", "ENSP00000303939", "ENSP00000332369",
"ENSP00000344192", "ENSP00000349960", "ENSP00000350052", "ENSP00000352561",
"ENSP00000356946", "ENSP00000387760", "ENSP00000433642", "ENSP00000444986",
"ENSP00000452780", "ENSP00000478570", "ENSP00000479618", "ENSP00000480609",
"ENTPD1", "EOMES", "EP300", "EPB41", "EPCAM", "EPO", "EPOR",
"ERBB2", "ERVW-1", "ESAM", "ESR1", "ETS1", "ETS2", "ETV2", "ETV6",
"EVPL", "EZH2", "F2", "F3", "FABP4", "FANCA", "FANCC", "FANCD2",
"FANCG", "FAS", "FASLG", "FBXW7", "FCGR1A", "FCGR2A", "FCGR2B",
"FER", "FGF1", "FGF13", "FGF2", "FGF4", "FGF7", "FGFR1", "FLT1",
"FLT3", "FLT3LG", "FLT4", "FN1", "FOS", "FOSB", "FOXA2", "FOXN1",
"FOXO1", "FOXO3", "FOXO4", "FOXP3", "FUT4", "FYN", "GADD45A",
"GAPDH", "GAS6", "GATA1", "GATA2", "GATA3", "GATA4", "GCG", "GDNF",
"GFAP", "GFI1", "GFI1B", "GJA1", "GLI1", "GLI3", "GP1BA", "GP9",
"GPT", "GRB2", "GSTM1", "GUSB", "GYPA", "GYPC", "GZMB", "H2AC18",
"H2AC19", "H2AC20", "H2AX", "H2BC21", "H3-2", "H4-16", "H4C1",
"H4C11", "H4C12", "H4C13", "H4C14", "H4C15", "H4C2", "H4C3",
"H4C4", "H4C5", "H4C6", "H4C8", "H4C9", "HAMP", "HAVCR2", "HBB",
"HBE1", "HBEGF", "HBZ", "HDAC1", "HDAC2", "HEY1", "HEY2", "HGF",
"HHEX", "HIF1A", "HLA-A", "HLA-B", "HLA-C", "HLA-DPB1", "HLA-DQA1",
"HLA-DQB1", "HLA-DRB1", "HLA-E", "HLA-G", "HMGA2", "HMGB1", "HMOX1",
"HNF4A", "HOXA10", "HOXA3", "HOXA5", "HOXA7", "HOXA9", "HOXB3",
"HOXB4", "HOXB5", "HOXB6", "HOXB8", "HP", "HPGDS", "HPRT1", "HRAS",
"HSA-MIR-125B-5P", "HSA-MIR-126-3P", "HSA-MIR-142-3P", "HSA-MIR-146A-5P",
"HSA-MIR-150-5P", "HSA-MIR-155-5P", "HSA-MIR-17-5P", "HSA-MIR-181A-5P",
"HSA-MIR-21-5P", "HSA-MIR-221-3P", "HSA-MIR-223-3P", "HSA-MIR-29A-3P",
"HSA-MIR-34A-5P", "HSP90AA1", "HSPA4", "IBSP", "ICAM1", "ICAM4",
"ID1", "ID2", "ID3", "IDH1", "IDH2", "IDO1", "IDUA", "IFNA1",
"IFNA2", "IFNAR1", "IFNB1", "IFNG", "IGF1", "IGF2", "IGFBP2",
"IGFBP3", "IGLL1", "IGLL5", "IHH", "IKBKB", "IKBKG", "IKZF1",
"IKZF2", "IKZF3", "IL10", "IL10RA", "IL11", "IL13", "IL15", "IL17F",
"IL18", "IL1A", "IL1B", "IL1R1", "IL2", "IL22", "IL2RA", "IL2RB",
"IL2RG", "IL3", "IL33", "IL3RA", "IL4", "IL5", "IL6", "IL6R",
"IL7", "IL7R", "IL9", "INS", "IRAK1", "IRF1", "IRF2", "IRF4",
"IRF7", "IRF8", "IRS1", "ITGA2", "ITGA2B", "ITGA4", "ITGA5",
"ITGA6", "ITGAE", "ITGAL", "ITGAM", "ITGAV", "ITGAX", "ITGB1",
"ITGB2", "ITGB3", "ITIH4", "ITK", "JAG1", "JAG2", "JAK1", "JAK2",
"JAK3", "JUN", "KAT6A", "KCNN3", "KDM1A", "KDM6A", "KDR", "KIR2DL1",
"KIR2DL3", "KIR2DL4", "KIR3DL1", "KIR3DL2", "KIR3DL3", "KIT",
"KLF1", "KLF2", "KLF4", "KLRB1", "KLRC1", "KLRC2", "KLRD1", "KLRG1",
"KLRK1", "KMT2A", "KRAS", "KRT14", "KRT18", "KRT19", "KRT5",
"KRT8", "LAG3", "LAMP1", "LCK", "LCN2", "LCP2", "LDB1", "LDB2",
"LEF1", "LEP", "LEPR", "LGALS3", "LGALS4", "LGR5", "LIF", "LIG4",
"LILRB1", "LIN28A", "LIN28B", "LMNA", "LMO2", "LOX", "LRP5",
"LTA", "LY75", "LY9", "LYL1", "LYN", "LYVE1", "MAF", "MAGEA3",
"MAP2K1", "MAP2K7", "MAPK1", "MAPK14", "MAPK3", "MAPK8", "MATK",
"MBP", "MCAM", "MCL1", "MDM2", "MECOM", "MEF2C", "MEIS1", "MET",
"MGMT", "MICA", "MIR17HG", "MIXL1", "MLLT1", "MLLT10", "MLLT3",
"MME", "MMP2", "MMP3", "MMP9", "MOG", "MPL", "MPO", "MPP3", "MRC1",
"MSI2", "MTHFR", "MTOR", "MUC1", "MX1", "MYB", "MYCN", "MYD88",
"MYH11", "MYO1G", "MYOD1", "MYOM2", "NAALADL1", "NANOG", "NCAM1",
"NCF1", "NCOR1", "NCOR2", "NCR1", "NCR2", "NCR3", "NES", "NF1",
"NFATC1", "NFE2", "NFIL3", "NFKB1", "NFKBIA", "NGF", "NGFR",
"NKX2-5", "NLRP3", "NOD2", "NOG", "NOS2", "NOS3", "NOTCH2", "NOTCH3",
"NOTCH4", "NPM1", "NR3C1", "NR4A1", "NRAS", "NRP1", "NT5E", "NTRK1",
"NUMB", "NUP98", "OSM", "PARP1", "PAX5", "PAX6", "PBX1", "PCGF2",
"PCSK5", "PCSK7", "PDCD1", "PDCD1LG2", "PDGFRA", "PDGFRB", "PDPN",
"PECAM1", "PF4", "PGF", "PGK1", "PHF6", "PI3", "PIGA", "PIK3R1",
"PLAUR", "PLCG1", "PLCG2", "PLEK", "PLG", "PLK1", "PMAIP1", "POMC",
"POU5F1", "PPARA", "PPARG", "PPARGC1A", "PPBP", "PPIG", "PPP1R12C",
"PRAME", "PRDM1", "PRDM16", "PREB", "PRF1", "PRKCA", "PRKDC",
"PRL", "PROCR", "PROM1", "PRTN3", "PSMA7", "PTCRA", "PTEN", "PTGER4",
"PTGS2", "PTH", "PTHLH", "PTK2", "PTK2B", "PTN", "PTPN11", "PTPN6",
"PTPRC", "QRSL1", "RAC2", "RAD21", "RAD51", "RAF1", "RAG1", "RAG2",
"RARA", "RBFOX3", "RBPJ", "REG3A", "REL", "RELA", "RELB", "REN",
"RET", "RGMA", "RHOA", "RICTOR", "RORC", "RPS14", "RPS19", "RPS6",
"RPS6KB1", "RPTOR", "RUNX1", "RUNX1T1", "RUNX2", "RUNX3", "S1PR1",
"SAA1", "SALL4", "SATB1", "SBDS", "SDC1", "SDHC", "SELE", "SELL",
"SELP", "SELPLG", "SERPINA1", "SERPINC1", "SERPINE1", "SH2B3",
"SHC1", "SHH", "SHMT2", "SIGLEC1", "SIRPA", "SIRT1", "SIRT3",
"SIRT7", "SLAMF1", "SLC2A1", "SMAD2", "SMAD3", "SMAD4", "SMAD5",
"SMAD7", "SMARCA4", "SNAI1", "SNAI2", "SOCS1", "SOCS2", "SOCS3",
"SOD2", "SOX17", "SOX2", "SOX4", "SOX9", "SP7", "SPARC", "SPI1",
"SPIB", "SPN", "SPP1", "SRC", "SRSF2", "STAG2", "STAT1", "STAT2",
"STAT3", "STAT5A", "STAT5B", "STAT6", "STK11", "STUB1", "SUZ12",
"SYK", "TAC1", "TAL1", "TBP", "TBX21", "TCF12", "TCF3", "TCF4",
"TCF7", "TCHP", "TEK", "TERC", "TERT", "TET1", "TET2", "TFRC",
"TGFB1", "TGFB2", "TGFB3", "TGFBR1", "TGFBR2", "TH", "THBD",
"THBS1", "THRA", "THY1", "TIE1", "TIMP1", "TJP1", "TLR1", "TLR2",
"TLR3", "TLR4", "TLR5", "TLR7", "TLR8", "TLR9", "TLX1", "TNF",
"TNFRSF10A", "TNFRSF10B", "TNFRSF13C", "TNFRSF18", "TNFRSF1A",
"TNFRSF1B", "TNFRSF25", "TNFRSF4", "TNFRSF8", "TNFRSF9", "TNFSF10",
"TNFSF11", "TNFSF13B", "TNFSF4", "TNFSF9", "TOP1", "TP53", "TP53BP1",
"TRAF6", "TRIM5", "TSLP", "TTR", "TXN", "TYK2", "TYROBP", "U2AF1",
"U2AF1L4", "U2AF1L5", "UBC", "UCHL1", "UNC13D", "VAV1", "VCAM1",
"VEGFC", "VPREB1", "VTN", "VWF", "WAS", "WNT1", "WNT10B", "WNT16",
"WNT3A", "WNT4", "WNT5A", "WT1", "XBP1", "XIAP", "XRCC5", "XRCC6",
"YY1", "ZAP70", "ZBTB16", "ZBTB7B", "ZEB1", "ZFPM1", "ZRSR2"), class = "factor"),
Publication.count = c(18386L, 7830L, 6786L, 6590L, 6022L,
5610L), Fraction.of.publications.from.total.gene.publication = c(0.2923,
0.2598, 0.029, 0.1107, 0.0383, 0.1407)), row.names = c(NA,
6L), class = "data.frame")我的第一个数据的维数
dim(EF_TF_region_expression)
[1] 80 8我的第二个数据的维数
dim(haematopoetic_genes)
[1] 899 4
> 如果我做了inner_join
haem_non_haem = inner_join(EF_TF_region_expression,haematopoetic_genes)我知道这个维度
dim(haem_non_haem) 1 17 11
因此,这里我松开了其余的行,这些行没有映射到我的第二个dataframe。我想保留所有带有标签的行,比如"Non-haem"
我试过full_join
haem_non_haem1 = full_join(EF_TF_region_expression,haematopoetic_genes,by ="Symbol")我的维度是
dim(haem_non_haem1) 1 964 11
所以在这里,我从第二个数据中得到了更多的行,我不想要。
我期望的最后输出是我将保留我所有的80行,而不是17行,这些行是映射的,而不是映射的,应该放置一些标签。
ENSEMBL Symbol gene Region log2FoldChange_gene Peak_Region log2FoldChange_Region Family
1 ENSG00000030419 IKZF2 gene_down Region_down -2.662778 Peak116875 -3.482284 TF
2 ENSG00000057657 PRDM1 gene_UP Region_UP 3.888709 Peak182835 2.008117 EF
3 ENSG00000057657 PRDM1 gene_UP Region_UP 3.888709 Peak182835 2.008117 TF
4 ENSG00000085276 MECOM gene_down Region_down -8.786649 Peak144690 -5.930731 TF
5 ENSG00000091831 ESR1 gene_down Region_down -1.884222 Peak187290 -2.431486 TF
6 ENSG00000107485 GATA3 gene_down Region_down -8.905856 Peak21185 -4.434688 TF
Rank Publication.count Fraction.of.publications.from.total.gene.publication
1 895 36 0.0845
2 496 105 0.0266
3 496 105 0.0266
4 317 178 0.1803
5 193 323 0.0049
6 140 465 0.0404发布于 2021-07-30 07:38:06
看来您想要正常地加入并执行left_join
library(dplyr)
left_join(EF_TF_region_expression,haematopoetic_genes) %>%
mutate(some_new_label = ifelse(is.na(Rank),"Non-haem","Haem"))https://stackoverflow.com/questions/68586899
复制相似问题