def ngram(n, k, document):
    """Print the doubled word list of every line in *document*; return ``{}``.

    This is an unfinished stub: ``n`` and ``k`` are accepted but never used,
    and the returned dict is always empty.

    Args:
        n: Unused (presumably the n-gram size -- TODO confirm).
        k: Unused.
        document: Path of the text file to read.

    Returns:
        An empty dict.
    """
    # ``with`` guarantees the handle is closed, even if iteration raises.
    with open(document, 'r') as f:
        for line in f:
            # NOTE(review): the word list is concatenated with itself; this
            # looks like a placeholder -- confirm the intended value.
            words = line.split() + line.split()
            # A parenthesised single argument prints the same on Python 2 and 3.
            print(words)
    return {}


# For example, for "I love the Python programming language", n=2 gives
# "I love", "love the", "the Python", "Python programming" and "programming language";
我想存储在一个列表中,然后比较它们中有多少是相同的。
发布于 2014-01-21 13:33:44
还不完全清楚你想要返回什么。假设有一行是这样的:
I love the Python programming language
并且你不想在行间做任何事情。
from collections import deque
def linesplitter(line, n):
    """Print every run of *n* consecutive words in *line*, one run per row.

    A line with fewer than *n* words prints nothing.

    Args:
        line: Text that is split on whitespace.
        n: Number of words in each printed window.
    """
    # Bounded deque: appending to a full window drops its oldest word.
    window = deque(maxlen=n)
    for word in line.split():
        window.append(word)
        # Stay silent until the window has filled up to n words.
        if len(window) == n:
            print(" ".join(window))
# Print the two-word windows of each line of the file; every line is handled
# on its own, so no window spans a line break.
# NOTE: `document` is not defined in this snippet; it must already hold the
# path of the input file.
with open(document) as f:  # 'r' is implied
    for line in f:
        linesplitter(line, 2)  # or any other length!

# Output:
I love
love the
the Python
Python programming
programming language
发布于 2014-01-21 13:38:50
你可以从itertools recipes中的一个改编
import itertools
def ngrams(N, k, filepath):
    """Return the word N-grams of the file at *filepath* as a list of tuples.

    Words are whitespace-separated and are read as one continuous stream
    over the whole file, so an N-gram may span a line break.

    Args:
        N: Number of words in each tuple.
        k: Unused; kept so the signature stays unchanged.
        filepath: Path of the text file to read.

    Returns:
        A list of N-tuples of consecutive words, in file order.
    """
    with open(filepath) as infile:
        words = (word for line in infile for word in line.split())
        ts = itertools.tee(words, N)
        # Advance the i-th copy by i words so that zipping the copies
        # produces sliding windows.
        for offset, t in enumerate(ts):
            for _ in range(offset):
                next(t, None)
        # Build the list while the file is still open: on Python 3 zip() is
        # lazy and would otherwise read from a closed file after returning.
        return list(zip(*ts))


# Using a test file like the following:
I love
the
python programming language
下面是输出:
In [21]: ngrams(2, '', 'blah')
Out[21]:
[('I', 'love'),
('love', 'the'),
('the', 'python'),
('python', 'programming'),
('programming', 'language')]
In [22]: ngrams(3, '', 'blah')
Out[22]:
[('I', 'love', 'the'),
('love', 'the', 'python'),
('the', 'python', 'programming'),
('python', 'programming', 'language')]
发布于 2014-01-21 13:32:31
你可以通过列表推导式来实现这一点:
>>> [s1 + " " + s2 for s1, s2 in zip(s.split(), s.split()[1:])]
['I love', 'love the', 'the Python', 'Python programming', 'programming language']
您还可以使用str.format函数:
>>> ["{} {}".format(s1, s2) for s1, s2 in zip(s.split(), s.split()[1:])]
['I love', 'love the', 'the Python', 'Python programming', 'programming language']
函数的最终版本:
from itertools import tee, islice
def ngram(n, s):
    """Return the word *n*-grams of the string *s* as a list of strings.

    Each n-gram is rendered with the ``"{} "`` template repeated *n* times,
    so every returned string ends with a trailing space.

    Args:
        n: Number of words in each n-gram.
        s: Text that is split on whitespace.

    Returns:
        A list of strings, one per window of *n* consecutive words.
    """
    words = s.split()
    # The i-th copy skips its first i words; zipping the copies yields windows.
    shifted = [islice(it, i, None) for i, it in enumerate(tee(words, n))]
    template = "{} " * n
    return [template.format(*group) for group in zip(*shifted)]


# Demo:
>>> from splitting import ngram
>>> thing = 'I love the Python programming language'
>>> ngram(2, thing)
['I love ', 'love the ', 'the Python ', 'Python programming ', 'programming language ']
>>> ngram(3, thing)
['I love the ', 'love the Python ', 'the Python programming ', 'Python programming language ']
>>> ngram(4, thing)
['I love the Python ', 'love the Python programming ', 'the Python programming language ']
>>> ngram(1, thing)
['I ', 'love ', 'the ', 'Python ', 'programming ', 'language ']
https://stackoverflow.com/questions/21249857
复制相似问题