我使用以下函数删除重复,同时保持第一次出现,并且不更改顺序。
def uniqueList(row):
    """Return ``row`` with repeated words removed, keeping first occurrences.

    ``row`` is converted with ``str()`` and split on single spaces. Words are
    compared case-insensitively as whole words, so a short word such as "a"
    is kept even when an earlier word ("banana") merely contains it. The
    order of the remaining words is unchanged.

    The first token is always kept; empty tokens produced by consecutive
    spaces after it are dropped. The kept words are joined with one space.
    """
    words = str(row).split(" ")
    kept = [words[0]]
    seen = {words[0].lower()}
    for word in words[1:]:
        key = word.lower()
        # Skip empty tokens (double spaces) and words already emitted.
        if not key or key in seen:
            continue
        seen.add(key)
        kept.append(word)
    return " ".join(kept)
df["value_corrected"] = df["value_corrected"].apply(uniqueList)
""" 1 """
sentences = df["value_corrected"].to_list()
for sentence in sentences:
    # Tokens exactly as written (split on ' '); these are what gets printed.
    tokens = sentence.split(' ')
    bare_tokens = [tok.strip(',') for tok in tokens]

    # Three ways of splitting the sentence on both ' ' and '-'. Each
    # assignment overwrites the previous one, so only the last result is used.
    # method 1: re
    parts = re.split(' |-', sentence)
    # method 2: itertools
    parts = list(itertools.chain.from_iterable(tok.split('-') for tok in tokens))
    # method 3: DIY
    parts = []
    for tok in tokens:
        parts.extend(tok.split('-'))

    # strip ','
    bare_parts = [part.strip(',') for part in parts]

    # Collect the positions of every token that repeats an earlier one.
    drop_positions = set()
    for part in bare_parts:
        needle = part.lower()
        hits = [
            pos
            for pos, tok in enumerate(bare_tokens)
            if needle in tok.lower().split('-')
        ]
        # All matches after the first one are duplicates.
        if len(hits) > 1:
            drop_positions.update(hits[1:])

    # Keep the remaining tokens and join them with ' '.
    print(" ".join(tok for pos, tok in enumerate(tokens) if pos not in drop_positions))
# print(sentences)
print(sentences)
在大多数情况下这是可行的,但以下情况除外:
数据样本:
# Sample product names; each entry repeats the brand in a second spelling
# (different case, accents or apostrophe style).
data = {'Name': ["LOVABLE Lovable Period Panties Slip da Ciclo Mestruale Flusso Medio (Pacco da 2) Donna",
                 "Laessig LÄSSIG Set di Cucchiaio per bambini 4 pezzi Uni menta/mirtillo",
                 "Béaba BÉABA, Set di 6 Contenitori per la Pappa per Svezzamento Bebè in Silicone",
                 "L´Occitane L'OCCITANE - CREMA MANI NUTRIENTE AL BURRO DI KARITÈ PER PELLI SECCHE 150ML"]}
df = pd.DataFrame(data)
期望产出:
LOVABLE Period Panties Slip da Ciclo Mestruale Flusso Medio (Pacco da 2) Donna
Laessig Set di Cucchiaio per bambini 4 pezzi Uni menta/mirtillo
Béaba, Set di 6 Contenitori per la Pappa per Svezzamento Bebè in Silicone
L´Occitane - CREMA MANI NUTRIENTE AL BURRO DI KARITÈ PER PELLI SECCHE 150ML
有什么方法可以修改上面的函数来涵盖这种情况吗?
非常感谢你。
发布于 2021-08-14 00:22:39
根据您提供的字符串,可以尝试:
import pandas as pd
import re
# import unidecode
data = {'Name': ["LOVABLE Lovable Period Panties Slip da Ciclo Mestruale Flusso Medio (Pacco da 2) Donna",
"Laessig LÄSSIG Set di Cucchiaio per bambini 4 pezzi Uni menta/mirtillo",
"Béaba BÉABA, Set di 6 Contenitori per la Pappa per Svezzamento Bebè in Silicone",
"L´Occitane L'OCCITANE - CREMA MANI NUTRIENTE AL BURRO DI KARITÈ PER PELLI SECCHE 150ML"]}
df = pd.DataFrame(data)
def dedupString(s):
    '''
    Return 's' with repeated words removed, keeping each word's first occurrence.

    Processing steps:
    - the acute accent is replaced by a single quote, so both apostrophe
      styles tokenize alike
    - the string is split into word tokens (word characters and single
      quotes) and the punctuation tokens . , ! ? ; - ; characters matched by
      neither group (for example parentheses or "/") do not reach the output
    - tokens are compared through a normalised key: a token starting with an
      uppercase letter is capitalised ('THE' and 'The' both give 'The') and,
      for keys longer than 3 characters whose second character is a single
      quote, the third character is upper-cased ("L'OCCITANE" and
      "L'Occitane" both give "L'Occitane")
    - the first token seen for each key is emitted exactly as it was written,
      so mixed-case words such as "McDonald" keep their spelling

    Punctuation tokens go through the same check, so a second "," or "-" in
    the same string is removed as well.

    The kept tokens are returned joined with a single space.
    '''
    #replace acute accent (´) with a single quote (')
    s = s.replace("´", "'")
    #split the string inc. punctuation. If other punctuation goes missing from
    #the output, add it inside the second square brackets BEFORE the final '-';
    #the '-' has to stay last, otherwise it would define a character range.
    tokens = re.findall(r"[\w']+|[.,!?;-]", s)
    output = []
    seen = set()
    for token in tokens:
        #the key is only used for comparison; the token itself is never altered
        key = token
        #'THE' and 'The' share the key 'The'
        if key[0].isupper():
            key = key.capitalize()
        #capitalize() lowercases everything after the first char, so put the
        #uppercase back after an apostrophe: "L'oréal" -> "L'Oréal"
        if len(key) > 3 and key[1] == "'":
            key = key[:2] + key[2].upper() + key[3:]
        if key not in seen:
            seen.add(key)
            output.append(token)
    #return the kept tokens joined with spaces
    return ' '.join(output)
df['Name2'] = df['Name']
# df['Name2'] = df['Name2'].apply(unidecode.unidecode)
df['Name2'] = df.apply(lambda x: dedupString(x['Name2']), axis=1)
df['Name2'] = df['Name2'].str.replace(' , ', ', ', regex=False)
print(df)
输出:
Name \
0 LOVABLE Lovable Period Panties Slip da Ciclo M...
1 Laessig LÄSSIG Set di Cucchiaio per bambini 4 ...
2 Béaba BÉABA, Set di 6 Contenitori per la Pappa...
3 L´Occitane L'OCCITANE - CREMA MANI NUTRIENTE A...
Name2
0 LOVABLE Period Panties Slip da Ciclo Mestruale...
1 Laessig LÄSSIG Set di Cucchiaio per bambini 4 ...
2 Béaba, Set di 6 Contenitori per la Pappa Svezz...
3 L'Occitane - CREMA MANI NUTRIENTE AL BURRO DI ...
注:
- 由于保留的是第一次出现的单词,`LOVABLE Lovable` 变成了 `LOVABLE`。类似地,由于标点符号会移到被保留的第一个单词之后,`Béaba BÉABA,` 变成了 `Béaba,`。
- 可将 `df['Name2'] =` 更改为 `df['Name'] =`(即把结果写回 `Name` 列)。
- 像 `façade Facade` 这样的字符串是否算作重复,是需要先确定的问题。可以在删除重复项之前先把 Unicode 字符替换掉(取消注释原代码的第 3 行和第 59 行,即与 `unidecode` 相关的两行,然后再试),也可以保持原样。
- 上述代码适用于给定的字符串。如果输出中有字符丢失(随着数据集的增长,您可能需要修改正则表达式),请注意代码中的注释:
#split the strings inc. punctuation. If ticks and dashes etc. go missing from the output
#add them to the end of the second square brackets below. Example -> [.,!?;-HERE]
l = re.findall(r"[\w']+|[.,!?;-]", s)
附加信息:
如果您的预期输出是Laessig LÄSSIG变为Laessig,请尝试:
import pandas as pd
import re
import unidecode
data = {'Name': ["LOVABLE Lovable Period Panties Slip da Ciclo Mestruale Flusso Medio (Pacco da 2) Donna",
"Laessig LÄSSIG Set di Cucchiaio per bambini 4 pezzi Uni menta/mirtillo",
"Béaba BÉABA, Set di 6 Contenitori per la Pappa per Svezzamento Bebè in Silicone",
"L´Occitane L'OCCITANE - CREMA MANI NUTRIENTE AL BURRO DI KARITÈ PER PELLI SECCHE 150ML"]}
df = pd.DataFrame(data)
# Character replacements applied by toASCII. The "ö"/"Ö" entries are
# commented out, so those characters are left untouched by toASCII.
swaps = {"ä": "ae",
         #"ö":"oe",
         "ü": "ue",
         "Ä": "Ae",
         #"Ö":"Oe",
         "Ü": "Ue",
         "ß": "ss"}


def toASCII(s):
    '''
    Spell out the characters listed in 'swaps' (for example 'ä' -> 'ae').

    - a string containing none of the keys of 'swaps' is returned unchanged
    - otherwise every whitespace-separated word that is ALL CAPS is first
      capitalised, so that 'LÄSSIG' ends up as 'Laessig' and not 'LAeSSIG'
    - then each key of 'swaps' is replaced by its value

    Whitespace between the words is preserved.
    '''
    #nothing to swap: leave the string (including its ALL CAPS words) alone
    if not any(char in swaps for char in s):
        return s

    def _capitalize_if_upper(match):
        word = match.group()
        return word.capitalize() if word.isupper() else word

    #rewrite each word where it stands; replacing by value over the whole
    #string would also hit longer words containing it ('DI' inside 'DIVA')
    s = re.sub(r"\S+", _capitalize_if_upper, s)
    #replace, for example 'ä' with 'ae'
    for old, new in swaps.items():
        s = s.replace(old, new)
    return s
def dedupString(s):
    '''
    Return 's' with repeated words removed, keeping each word's first occurrence.

    Processing steps:
    - the acute accent is replaced by a single quote, so both apostrophe
      styles tokenize alike
    - the string is split into word tokens (word characters and single
      quotes) and the punctuation tokens . , ! ? ; - ; characters matched by
      neither group (for example parentheses or "/") do not reach the output
    - tokens are compared through a normalised key: a token starting with an
      uppercase letter is capitalised ('THE' and 'The' both give 'The') and,
      for keys longer than 3 characters whose second character is a single
      quote, the third character is upper-cased ("L'OCCITANE" and
      "L'Occitane" both give "L'Occitane")
    - the first token seen for each key is emitted exactly as it was written,
      so mixed-case words such as "McDonald" keep their spelling

    Punctuation tokens go through the same check, so a second "," or "-" in
    the same string is removed as well.

    The kept tokens are returned joined with a single space.
    '''
    #replace acute accent (´) with a single quote (')
    s = s.replace("´", "'")
    #split the string inc. punctuation. If other punctuation goes missing from
    #the output, add it inside the second square brackets BEFORE the final '-';
    #the '-' has to stay last, otherwise it would define a character range.
    tokens = re.findall(r"[\w']+|[.,!?;-]", s)
    output = []
    seen = set()
    for token in tokens:
        #the key is only used for comparison; the token itself is never altered
        key = token
        #'THE' and 'The' share the key 'The'
        if key[0].isupper():
            key = key.capitalize()
        #capitalize() lowercases everything after the first char, so put the
        #uppercase back after an apostrophe: "L'oréal" -> "L'Oréal"
        if len(key) > 3 and key[1] == "'":
            key = key[:2] + key[2].upper() + key[3:]
        if key not in seen:
            seen.add(key)
            output.append(token)
    #return the kept tokens joined with spaces
    return ' '.join(output)
df['Name2'] = df['Name']
df['Name2'] = df.apply(lambda x: toASCII(x['Name2']), axis=1)
df['Name2'] = df['Name2'].apply(unidecode.unidecode)
df['Name2'] = df.apply(lambda x: dedupString(x['Name2']), axis=1)
df['Name2'] = df['Name2'].str.replace(' , ', ', ', regex=False)
print(df)
输出:
Name \
0 LOVABLE Lovable Period Panties Slip da Ciclo M...
1 Laessig LÄSSIG Set di Cucchiaio per bambini 4 ...
2 Béaba BÉABA, Set di 6 Contenitori per la Pappa...
3 L´Occitane L'OCCITANE - CREMA MANI NUTRIENTE A...
Name2
0 LOVABLE Period Panties Slip da Ciclo Mestruale...
1 Laessig Set di Cucchiaio per bambini 4 pezzi U...
2 Beaba, Set di 6 Contenitori per la Pappa Svezz...
3 L'Occitane - CREMA MANI NUTRIENTE AL BURRO DI ...
显然,对于更大的数据集,您需要检查 swaps 字典是否符合您的需求。我已经把其中几项注释掉了,因为例如像 Björn 这样的单词(如果它出现在更大的数据集中),您可能并不希望它被转换。
https://stackoverflow.com/questions/68768841
复制相似问题