我正在使用Python mrjob从一个文本文件中查找10个最长的单词。我已经得到了一个结果,但是结果包含重复的单词。如何仅获取唯一的单词(即删除重复的单词)?
%%file most_chars.py
from mrjob.job import MRJob
from mrjob.step import MRStep
import re
WORD_RE = re.compile(r"[\w']+") # any whitespace or apostrophe, used to split lines below
class MostChars(MRJob):
def steps(self):
return [
MRStep(mapper=self.mapper_get_words,
reducer=self.reducer_find_longest_words)
]
def mapper_get_words(self, _, line):
for word in WORD_RE.findall(line):
yield None, (len(word), word.lower().strip())
# discard the key; it is just None
def reducer_find_longest_words(self, _, word_count_pairs):
# each item of word_count_pairs is (count, word),
# so yielding one results in key=counts, value=word
sorted_pair = sorted(word_count_pairs, reverse=True)
for pair in sorted_pair[0:10]:
yield pair
if __name__ == '__main__':
MostChars.run()实际输出:
18 "overcapitalization"
18 "overcapitalization"
18 "overcapitalization"
17 "uncomprehendingly"
17 "misunderstandings"
17 "disinterestedness"
17 "disinterestedness"
17 "disinterestedness"
17 "disinterestedness"
17 "conventionalities"预期输出:
18 "overcapitalization"
17 "uncomprehendingly"
17 "misunderstandings"
17 "disinterestedness"
17 "conventionalities"和另外5个独特的单词
发布于 2021-10-28 10:56:15
更新reducer_find_longest_words以仅获取唯一的元素。注意list(set())的用法。
def reducer_find_longest_words(self, _, word_count_pairs):
# each item of word_count_pairs is (count, word),
# so yielding one results in key=counts, value=word
unique_pairs = [list(x) for x in set(tuple(x) for x in word_count_pairs)]
sorted_pair = sorted(unique_pairs, reverse=True)
for pair in sorted_pair[0:10]:
yield pairhttps://stackoverflow.com/questions/69752739
复制相似问题