我根据公司名称之间的精确匹配合并到数据集(dt,dt2),以便将group_id分配给第二个数据集(dt2)。现在,我想用来自同一公司(不同分支)但有不同名称的值来填充group_id的空值。
我使用以下合并代码构造了数据
dt2 <- merge(dt2,dt[,c("psn_name_PAT","group_id")],by = c("psn_name_PAT"),all.x = T,all.y = F)数据
psn_name_PAT n name_std group_id
1 CHICHIBU FUJI COMPANY 2 CHICHIBU FUJI NA
2 FUJI CERAMICS 1 FUJI CERAMICS NA
3 FUJI CHEMICAL 1 FUJI 606
4 FUJI ELECTRIC COMPANY 439 FUJI 606
5 FUJI ELECTRIC CORPORATE RESEARCH & DEVELOPMENT 5 FUJI 606
6 FUJI ELECTRIC CORPORATE RESEARCH AND DEVELOPMENT 12 FUJI 606期望输出:
psn_name_PAT n name_std group_id
1 CHICHIBU FUJI COMPANY 2 CHICHIBU FUJI 606
2 FUJI CERAMICS 1 FUJI CERAMICS 606
3 FUJI CHEMICAL 1 FUJI 606
4 FUJI ELECTRIC COMPANY 439 FUJI 606
5 FUJI ELECTRIC CORPORATE RESEARCH & DEVELOPMENT 5 FUJI 606
6 FUJI ELECTRIC CORPORATE RESEARCH AND DEVELOPMENT 12 FUJI 606我尝试使用以下代码来匹配来自另一个数据集(dt2)的“不确切的单词”匹配,但不幸的是,它带来了不同的group_id,而不是与正确的公司(即富士)的一次匹配:
dt2 <- dt2 %>%
mutate(group_id=ifelse(grepl(paste0("\\",fuz$name_std,"\\b", collapse = "|"), name_std),fuz$group_id,NA))对于如何将正确的公司group_id (即富士)分配给不同的分支机构,我有什么想法吗?
提前感谢您的帮助!
数据集
dt
structure(list(psn_name_PAT = c("FUJI CHEMICAL", "FUJI ELECTRIC COMPANY",
"FUJI ELECTRIC CORPORATE RESEARCH & DEVELOPMENT", "FUJI ELECTRIC CORPORATE RESEARCH AND DEVELOPMENT",
"FUJI ELECTRIC CORPORATION RESEARCH AND DEVELOPMENT", "FUJI ELECTRIC SYSTEMS COMPANY",
"FUJI ELECTRIC TECHNOLOGY COMPANY", "FUJI MACHINE MANUFACTURING COMPANY",
"FUJITSU", "FUJITSU GENERAL", "FUJIMI", "FUJIFILM", "FUJIFILM ELECTRONIC MATERIALS U.S.A.",
"FUJIFILM MANUFACTURING EUROPE", "FUJIKURA", "FUJI MACHINERY MFG. & ELECTRONICS COMPANY",
"FUJI XEROX COMPANY", "FUJIKIN", "FUJIKOSHI MACHINERY CORPORATION",
"HONGFUJIN PRECISION INDUSTRY (SHENZHEN) COMPANY", "TOSHIBA AMERICA RESEARCH",
"TOSHIBA CORPORATION", "TOSHIBA MACHINE COMPANY", "TOSHIBA MATERIALS COMPANY",
"TOSHIBA MEDICAL SYSTEMS CORPORATION", "TOSHIBA SOLUTIONS CORPORATION"
), name_std = c("FUJI", "FUJI", "FUJI", "FUJI", "FUJI", "FUJI",
"FUJI", "FUJI", "FUJITSU", "FUJITSU", "FUJIMI", "FUJIFILM", "FUJIFILM",
"FUJIFILM", "FUJIKURA", "FUJI MFG", "FUJI XEROX", "FUJIKIN",
"FUJIKOSHI", "HONGFUJIN", "TOSHIBA", "TOSHIBA", "TOSHIBA", "TOSHIBA",
"TOSHIBA", "TOSHIBA"), n = c(45L, 45L, 45L, 45L, 45L, 45L, 45L,
45L, 6L, 6L, 4L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), group_id = c(606L, 606L, 606L, 606L, 606L, 606L,
606L, 606L, 614L, 614L, 613L, 609L, 609L, 609L, 612L, 607L, 608L,
610L, 611L, 705L, 1631L, 1631L, 1631L, 1631L, 1631L, 1631L)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -26L))dt2
structure(list(psn_name_PAT = c("CHICHIBU FUJI COMPANY", "FUJI CERAMICS",
"FUJI CHEMICAL", "FUJI ELECTRIC COMPANY", "FUJI ELECTRIC CORPORATE RESEARCH & DEVELOPMENT",
"FUJI ELECTRIC CORPORATE RESEARCH AND DEVELOPMENT", "FUJI ELECTRIC CORPORATION RESEARCH AND DEVELOPMENT",
"FUJI ELECTRIC DEVICE TECHNOLOGY COMPANY", "FUJI ELECTRIC SYSTEMS COMPANY",
"FUJI ELECTRIC TECHNOLOGY COMPANY", "FUJI ELECTROCHEMICAL COMPANY",
"FUJI FILM MICRODEVICES COMPANY", "FUJI HEAVY IND", "FUJI MACHINE MANUFACTURING COMPANY",
"FUJI MACHINERY MFG. & ELECTRONICS COMPANY", "FUJI SEIKI MACHINE WORKS",
"FUJI XEROX COMPANY", "FUJIFILM", "FUJIFILM DIAMATIX", "FUJIFILM DIMATIX",
"FUJIFILM ELECTRONIC IMAGING", "FUJIFILM ELECTRONIC MATERIALS U.S.A.",
"FUJIFILM MANUFACTURING EUROPE", "FUJIKIN", "FUJIKOSHI KIKAI KOGYO",
"FUJIKOSHI MACHINERY CORPORATION", "FUJIKURA", "FUJIMA", "FUJIMI",
"FUJINON CORPORATION", "FUJITSU", "FUJITSU AMD SEMICONDUCTOR",
"FUJITSU AMD SEMICONDUCTOR LIMITED (FASL)", "FUJITSU AND SEMICONDUCTOR",
"FUJITSU AUTOMATION", "FUJITSU COMPONENT", "FUJITSU DISPLAY TECHNOLOGIES CORPORATION",
"FUJITSU FRONTECH", "FUJITSU GENERAL", "FUJITSU HITACHI PLASMA DISPLAY",
"FUJITSU LIMITED KABUSHIKI KAISHA TOSHIBA", "FUJITSU MEDIA DEVICES",
"FUJITSU MICROCOMPUTER SYSTEMS", "FUJITSU MICROELECTRONICS",
"FUJITSU MICROELECTRONICS LMIITED", "FUJITSU MIYAGI ELECTRONICS",
"FUJITSU QUANTUM DEVICES", "FUJITSU SEMICONDUCTOR", "FUJITSU TAKAMISAWA COMPONENT",
"FUJITSU TEN", "FUJITSU VLSI", "FUJITSU YAMANASHI ELECTRONICS",
"HONGFUJIN PRECISION INDUSTRY (SHENZHEN) COMPANY", "KYUSHU FUJITSU ELECTRONICS",
"NEC TOSHIBA SPACE SYSTEMS", "SUZUKA FUJI XEROX COMPANY", "TOSHIBA AMERICA ELECTRONIC COMPONENTS",
"TOSHIBA AMERICA RESEARCH", "TOSHIBA AUTOMATION COMPANY", "TOSHIBA AVE CORPORATION",
"TOSHIBA CERAMICS COMPANY", "TOSHIBA COMPONENTS COMPANY", "TOSHIBA CORPORATION",
"TOSHIBA KIKAI", "TOSHIBA LIGHTING & TECHNOLOGY CORPORATION",
"TOSHIBA MACHINE COMPANY", "TOSHIBA MATERIALS COMPANY", "TOSHIBA MATSUSHITA DISPLAY TECHNOLOGY COMPANY",
"TOSHIBA MEDICAL SYSTEMS CORPORATION", "TOSHIBA MICROELECTRONICS CORPORATION",
"TOSHIBA MOBILE DISPLAY COMPANY", "TOSHIBA SHIBAURA DENKI", "TOSHIBA SILICONE COMPANY",
"TOSHIBA SOLUTIONS CORPORATION", "TOSHIBA TEC CORPORATION", "TOSHIBA TECHNO CENTER"
), n = c(2L, 1L, 1L, 439L, 5L, 12L, 1L, 43L, 38L, 1L, 1L, 1L,
1L, 2L, 2L, 4L, 129L, 549L, 1L, 5L, 1L, 1L, 2L, 5L, 1L, 1L, 35L,
1L, 7L, 20L, 2430L, 7L, 1L, 2L, 1L, 2L, 4L, 2L, 1L, 1L, 1L, 36L,
1L, 126L, 1L, 1L, 56L, 312L, 1L, 3L, 25L, 1L, 6L, 13L, 1L, 1L,
14L, 1L, 1L, 1L, 19L, 1L, 5609L, 7L, 5L, 5L, 15L, 22L, 8L, 1L,
2L, 1L, 1L, 1L, 3L, 4L), name_std = c("CHICHIBU FUJI", "FUJI CERAMICS",
"FUJI", "FUJI", "FUJI", "FUJI", "FUJI", "FUJI DEVICE", "FUJI",
"FUJI", "FUJI ELECTROCHEMICAL", "FUJI MICRODEVICES", "FUJI HEAVY IND",
"FUJI", "FUJI MFG", "FUJI SEIKI WORKS", "FUJI XEROX", "FUJIFILM",
"FUJIFILM DIAMATIX", "FUJIFILM DIMATIX", "FUJIFILM IMAGING",
"FUJIFILM", "FUJIFILM", "FUJIKIN", "FUJIKOSHI KIKAI", "FUJIKOSHI",
"FUJIKURA", "FUJIMA", "FUJIMI", "FUJINON", "FUJITSU", "FUJITSU AMD SEMICONDUCTOR",
"FUJITSU AMD SEMICONDUCTOR FASL", "FUJITSU SEMICONDUCTOR", "FUJITSU AUTOMATION",
"FUJITSU COMPONENT", "FUJITSU DISPLAY", "FUJITSU FRONTECH", "FUJITSU",
"FUJITSU HITACHI PLASMA DISPLAY", "FUJITSU KABUSHIKI KAISHA TOSHIBA",
"FUJITSU DEVICES", "FUJITSU MICROCOMPUTER", "FUJITSU MICROELECTRONICS",
"FUJITSU MICROELECTRONICS LMIITED", "FUJITSU MIYAGI", "FUJITSU QUANTUM DEVICES",
"FUJITSU SEMICONDUCTOR", "FUJITSU TAKAMISAWA COMPONENT", "FUJITSU TEN",
"FUJITSU VLSI", "FUJITSU YAMANASHI", "HONGFUJIN", "KYUSHU FUJITSU",
"NEC TOSHIBA SPACE", "SUZUKA FUJI XEROX", "TOSHIBA COMPONENTS",
"TOSHIBA", "TOSHIBA AUTOMATION", "TOSHIBA AVE", "TOSHIBA CERAMICS",
"TOSHIBA COMPONENTS", "TOSHIBA", "TOSHIBA KIKAI", "TOSHIBA LIGHTING",
"TOSHIBA", "TOSHIBA", "TOSHIBA MATSUSHITA DISPLAY", "TOSHIBA",
"TOSHIBA MICROELECTRONICS", "TOSHIBA MOBILE DISPLAY", "TOSHIBA SHIBAURA DENKI",
"TOSHIBA SILICONE", "TOSHIBA", "TOSHIBA TEC", "TOSHIBA TECHNO"
), group_id = c(NA, NA, 606L, 606L, 606L, 606L, 606L, NA, 606L,
606L, NA, NA, NA, 606L, 607L, NA, 608L, 609L, NA, NA, NA, 609L,
609L, 610L, NA, 611L, 612L, NA, 613L, NA, 614L, NA, NA, NA, NA,
NA, NA, NA, 614L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, 705L, NA, NA, NA, NA, 1631L, NA, NA, NA, NA, 1631L, NA,
NA, 1631L, 1631L, NA, 1631L, NA, NA, NA, NA, 1631L, NA, NA)), row.names = c(NA,
-76L), class = "data.frame")发布于 2022-07-11 05:40:45
尽管我没有完全理解您的连接的粒度,但我认为您需要这样的代码:
您唯一需要定义的是max_dist
使用max_dist,我们可以定义连接的最大距离
请参阅:?stringdist_left_join
library(dplyr)
library(fuzzyjoin)
library(tidyr)
fuzzyjoin::stringdist_full_join(x=dt, y=dt2, max_dist = .70,
by='name_std',
method = 'jaccard',
distance_col = "dist") %>%
group_by(name_std.x) %>% # here you may want to group by psn_name_PAT.x
fill(group_id.y, .direction = "updown") %>%
select(psn_name_PAT = psn_name_PAT.y, n=n.y, group_id=group_id.y) name_std.x psn_name_PAT n group_id
<chr> <chr> <int> <int>
1 FUJI CHICHIBU FUJI COMPANY 2 606
2 FUJI FUJI CERAMICS 1 606
3 FUJI FUJI CHEMICAL 1 606
4 FUJI FUJI ELECTRIC COMPANY 439 606
5 FUJI FUJI ELECTRIC CORPORATE RESEARCH & DEVELOPMENT 5 606
6 FUJI FUJI ELECTRIC CORPORATE RESEARCH AND DEVELOPMENT 12 606
7 FUJI FUJI ELECTRIC CORPORATION RESEARCH AND DEVELOPMENT 1 606
8 FUJI FUJI ELECTRIC DEVICE TECHNOLOGY COMPANY 43 606
9 FUJI FUJI ELECTRIC SYSTEMS COMPANY 38 606
10 FUJI FUJI ELECTRIC TECHNOLOGY COMPANY 1 606
# ... with 1,182 more rowshttps://stackoverflow.com/questions/72934023
复制相似问题