基于某些条件,我想把公司名称分成母公司和Alias公司。在这方面我需要帮助。
Original_Input
DORO INC ( O/S DORO SAFETY & SECURITY)
MJB SUPPLY (DALIAN) CO., LTD.
Iris Diagnostics, a Division of Iris International Inc
GINGI-PAK a division of The Belport Co., Inc.
H/P/COSMOS SPORTS & MEDICAL GMBH
Arrayent Health LLC d/b/a/ Ambio Healthc
Arrow International, Inc. (subsidiary of Teleflex, Inc.)预期输出:
Parent_Company Alias_Name
1.DORO INC O/S DORO SAFETY & SECURITY
2.MJB SUPPLY CO., LTD. DALIAN
3.Iris Diagnostics,
4.GINGI-PAK The Belport Co., Inc.
5.H P/COSMOS SPORTS & MEDICAL GMBH
6.Arrayent Health LLC Ambio Healthc
7.Arrow International, Inc. 我的代码:
母公司分拆:
def get_parent_company(input):
keywords = ["a division of", "c/o","subsidiary of", "division of","an","dba","d/b/a","o/s","os","\/"]
regex = r"(.*?)(\b{}\b)".format("\\b|\\b".join(keywords))
matches = re.finditer(regex, input, re.IGNORECASE)
for match in matches:
return match.group(1).strip()
data["Parent_Company"] = data["Original_Input"].apply(get_parent_company).fillna('')化名公司分拆:
pat = r'\((.*?)\)'
pat1 = r'(?:(?=[^\s/]*/(?!\s|[^\s/]*/))|\$|a division of|&|/\s)\s*(.*)'
if pat:
data['Alias_Name'] = data['Original_Input'].str.extract(r'\((.*?)\)')
if pat1:
data['Alias_Name'] = data['Original_Input'].str.extract(r'(?:(?=[^\s/]*/(?!\s|[^\s/]*/))|\$|a
division of|&|/\s)\s*(.*)',expand= False).fillna('')这里有不同类型的输入。在关键字("a除法“、"c/o”、“子公司”、“除法”、"an“、"dba”、"d/b/a“、"o/s”、"os“、"/")之前的词必须作为母公司提及,在后面的词作为别名输入。
还有另一种类型的输入,例如。MJB供应(大连)有限公司。其中(大连)是化名,并保持像MJB供应有限公司一样。是父名。
我的努力:
因此,我试图使用关键字(在这里的一些朋友的帮助下)来获取父名和别名。但是不知道如何为正则表达式添加一两个条件,我可以说,我不知道如何在数据框架中拆分不同类型的输入。
谢谢你提前帮忙。
编辑
%%time
import pyodbc
import pandas as pd
import string
from string import digits
import sqlalchemy
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy.types import String
from io import StringIO
import re
#Creating engine to connect Py to SQL
engine = create_engine('mssql+pyodbc:// **:***'+server+'/'+database+'?
driver=SQL+server')
#Reading SQL table and grouping by columns
df=pd.read_sql('select * from [dbo].[TempCompanyName]',engine)
#Creating Dataframe
#data=pd.DataFrame(df)
# removing numbers,duplicated spaces,spaces before or after the string
data['Cleansed_Input'] = data["Original_Input"].str.replace("[0-9]+",
"").str.replace("[^\w ]+", "").str.replace(" +", " ").str.strip()
#Removing ASCII characters
data['Cleansed_Input'] = data['Cleansed_Input'].apply(lambda x:''.join(['' if
ord(i) < 32 or ord(i) > 126 else i for i in x]))
#Method 2 in splitting:
class CompanyAlias(object):
rules = [
re.compile(r"(?P<PREFIX>[^\(]*)(\((?
P<SUBSIDIARY>\s+subsidiary\s+.*)\))?(?P<SUFFIX>.*)", re.IGNORECASE),
re.compile(r"(?P<PREFIX>[^\(]*)(\((?P<ALIAS>[^\)]+)\))?(?P<SUFFIX>.*)",
re.IGNORECASE),
re.compile(r"(?P<PREFIX>.*),?\s+a\s+division\s+of\s+(?P<DIVISIONOF>.*)",
re.IGNORECASE),
re.compile(r"(?P<PREFIX>.*)\s+d/b/a/\s+(?P<ALIAS>.*)", re.IGNORECASE),
re.compile(r"(?P<PREFIX>[^/]+)/(?P<ALIAS>.*)", re.IGNORECASE)
]
@classmethod
def process_rules(cls, name, biz_rules=None):
if biz_rules is None:
biz_rules = cls.rules
# process all the rules for the given name
results = [r.match(name) for r in biz_rules]
# keep only results that successfully matched a rule
retval = [r.groupdict() for r in results if r]
return retval
@classmethod
def process_results(cls, results):
"""Process results applies the desired business logic
to the results obtained via regular expression processing
1. Alias rules are priority 1
2. division of rules are priority 2
rankings element 0 is a list of all alias results in order of
precedence. element 1 is a list of all division of rules in
order of precedence.
Note: company names are always composed of SUFFIX + PREFIX
"""
rankings = [[], []]
retval = ("", "", "")
for r in results:
alias = r.get("ALIAS", "")
divisionof = r.get("DIVISIONOF", None)
company = "{0}{1}".format(r.get("PREFIX",""), r.get("SUFFIX",""))
if alias:
if re.search("subsidiary", alias, re.IGNORECASE):
alias = ""
rankings[0].append((company, alias.strip(), divisionof))
if divisionof:
rankings[1].append((company, alias, divisionof))
# if there is at least 1 alias business rule applied
# then return the first result from the alias rules
# otherwise if there is at least 1 division of rule return
# that.
if len(rankings[0]) > 0:
retval = rankings[0][0]
elif len(rankings[1]) > 0:
retval = rankings[1][0]
return retval
if __name__ == "__main__":
names = df["Original_Input"]
#print("{0:40s} Alias_Name".format("Parent_Company"))
#print()
for name in names:
results = CompanyAlias.process_rules(name)
result = CompanyAlias.process_results(results)
#print("{0:40s} {1}".format(result[0], result[1]))
df['Parent_company'] = result[0]
df['Alias_Name'] = result[1]
data=pd.DataFrame(df)
#Writing to SQL
data.to_sql('TempCompanyName', con=engine, if_exists='replace',index= False)
#Splitting Company type and Extension.
engine.execute('''update A
set A.Type_input = B.Company_Type
from [TempCompanyName] A (nolock), [company_Extension]B where
A.Cleansed_Input like '%'+B.Name_Extension
update A
set A.Core_Input =replace(A.[Cleansed_Input],B.Name_Extension,'')
from [TempCompanyName] A (nolock), [company_Extension]B where
A.Cleansed_Input like '%'+B.Name_Extension''')
engine.execution_options(autocommit=True)为安全目的隐藏服务器详细信息
发布于 2020-02-05 15:12:40
注:这个答案包含了很多警告。
不幸的是你掉进了兔子洞里。名称清洗和解析本身就是一个行业。根据您所从事的业务,您可能希望调查提供此类服务的公司。我祝愿你们在这个世界上一切顺利,因为我已经多次受命于这方面的努力。
方法论
创建一系列优先业务规则,以便定期识别您所看到的案例。随着新的边缘情况的出现,简单地说,更多的规则在适当的优先级。
优先排序很重要,因为一条规则可能会产生假阳性。错误的积极规则应该比真实的积极规则更低的优先级。
为了说明这一点,您在示例中给出了H/P/Cosmos和d/b/a的名称,它们需要不同的规则来识别。但是,如果标识H/P/Cosmos的规则在作为规则之前应用,您将得到包含d/b/a的名称的假阳性。
这样够好吗?
单个来源中的数据集往往具有个性。有相似之处出现在高频率,并允许创建业务规则,使你90%的方式达到你的目标。
提示
在解析您的名字时,尽量避免信息丢失。识别前缀、后缀、别名、子公司和部门,并保留它们以供将来解决。标记可疑记录以供检查。这将使您能够识别新规则,并防止垃圾进入下一个处理阶段。
import re
class CompanyAlias(object):
rules = [
re.compile(r"(?P<PREFIX>[^\(]*)(\((?P<SUBSIDIARY>\s+subsidiary\s+.*)\))?(?P<SUFFIX>.*)", re.IGNORECASE),
re.compile(r"(?P<PREFIX>[^\(]*)(\((?P<ALIAS>[^\)]+)\))?(?P<SUFFIX>.*)", re.IGNORECASE),
re.compile(r"(?P<PREFIX>.*),?\s+a\s+division\s+of\s+(?P<DIVISIONOF>.*)", re.IGNORECASE),
re.compile(r"(?P<PREFIX>.*)\s+d/b/a/\s+(?P<ALIAS>.*)", re.IGNORECASE),
re.compile(r"(?P<PREFIX>[^/]+)/(?P<ALIAS>.*)", re.IGNORECASE)
]
@classmethod
def process_rules(cls, name, biz_rules=None):
if biz_rules is None:
biz_rules = cls.rules
# process all the rules for the given name
results = [r.match(name) for r in biz_rules]
# keep only results that successfully matched a rule
retval = [r.groupdict() for r in results if r]
return retval
@classmethod
def process_results(cls, results):
"""Process results applies the desired business logic
to the results obtained via regular expression processing
1. Alias rules are priority 1
2. division of rules are priority 2
rankings element 0 is a list of all alias results in order of
precedence. element 1 is a list of all division of rules in
order of precedence.
Note: company names are always composed of SUFFIX + PREFIX
"""
rankings = [[], []]
retval = ("", "", "")
for r in results:
alias = r.get("ALIAS", "")
divisionof = r.get("DIVISIONOF", None)
company = "{0}{1}".format(r.get("PREFIX",""), r.get("SUFFIX",""))
if alias:
if re.search("subsidiary", alias, re.IGNORECASE):
alias = ""
rankings[0].append((company, alias.strip(), divisionof))
if divisionof:
rankings[1].append((company, alias, divisionof))
# if there is at least 1 alias business rule applied
# then return the first result from the alias rules
# otherwise if there is at least 1 division of rule return
# that.
if len(rankings[0]) > 0:
retval = rankings[0][0]
elif len(rankings[1]) > 0:
retval = rankings[1][0]
return retval
if __name__ == "__main__":
names = [
"DORO INC ( O/S DORO SAFETY & SECURITY)",
"MJB SUPPLY (DALIAN) CO., LTD.",
"Iris Diagnostics, a Division of Iris International Inc",
"GINGI-PAK a division of The Belport Co., Inc.",
"H/P/COSMOS SPORTS & MEDICAL GMBH",
"Arrayent Health LLC d/b/a/ Ambio Healthc",
"Arrow International, Inc. (subsidiary of Teleflex, Inc.)",
]
print("{0:40s} Alias_Name".format("Parent_Company"))
print()
for name in names:
results = CompanyAlias.process_rules(name)
result = CompanyAlias.process_results(results)
print("{0:40s} {1}".format(result[0], result[1]))输出
Parent_Company Alias_Name
DORO INC O/S DORO SAFETY & SECURITY
MJB SUPPLY CO., LTD. DALIAN
Iris Diagnostics,
GINGI-PAK
H P/COSMOS SPORTS & MEDICAL GMBH
Arrayent Health LLC Ambio Healthc
Arrow International, Inc. https://stackoverflow.com/questions/60075805
复制相似问题