keywords = ("banana", "apple", "orange", ...)
before = 50
after = 100
TEXT = "a big text string, i.e., a page of a book"
for k in keywords:
if k in TEXT:
#cut = portion of text starting 'before' chars before the occurrence of 'k' and ending 'after' chars after the occurrence of 'k'
#finalcut = 'cut' with its first and last WORDS trimmed, to ensure the cut does not start or end in the middle of a word
伙计们,你能帮我在上面的例子中编写cut和finalcut这两个字符串变量吗?
什么是最有效的解决方案,考虑到我正在处理的大文本,众多的网页和可能超过20个关键字的搜索?
发布于 2014-08-13 03:18:33
可以使用re.finditer在字符串中找到所有匹配项。每个匹配对象都有一个start()方法,您可以用它得到该匹配在字符串中的位置。您也不需要先检查关键字是否在字符串中,因为没有匹配时finditer只会返回一个空迭代器:
import re

keywords = ("banana", "apple", "orange")  # add further search terms here
before = 50   # characters of context kept in front of a match
after = 100   # characters kept, counted from the START of the match
TEXT = "a big text string, i.e., a page of a book"

# Trims the (possibly partial) first and last word of a cut: everything up to and
# including the first run of non-word characters is skipped, and everything from
# the last non-word character onwards is dropped.
# re.DOTALL -- not re.MULTILINE -- is what allows '.' to cross line breaks, so a
# cut that spans several lines is kept whole instead of ending at the first newline.
_EDGE_WORDS = re.compile(r"\w*?\W+(.*)\W+\w*", re.DOTALL)


def trim_edge_words(cut):
    """Return *cut* without its first and last word.

    When the pattern does not match (for example the cut contains fewer than
    two non-word characters), there is nothing to trim against and *cut* is
    returned unchanged instead of raising AttributeError on a ``None`` match.
    """
    trimmed_match = _EDGE_WORDS.match(cut)
    if trimmed_match is None:
        return cut
    return trimmed_match.group(1)


def iter_cuts(text, keywords, before, after):
    """Yield ``(cut, finalcut)`` for every occurrence of every keyword in *text*.

    *cut* starts *before* characters ahead of the match and ends *after*
    characters past the start of the match; *finalcut* is *cut* with its edge
    words trimmed.  Keywords are matched literally, so regex metacharacters in
    a keyword (``c++``, ``a.b``) are safe.  A keyword that does not occur
    simply yields nothing.
    """
    for k in keywords:
        for match in re.finditer(re.escape(k), text):
            position = match.start()
            # max() is needed because a negative start index would wrap around.
            cut = text[max(position - before, 0):position + after]
            yield cut, trim_edge_words(cut)


for cut, finalcut in iter_cuts(TEXT, keywords, before, after):
    pass  # use `cut` and `finalcut` here
发布于 2014-08-13 03:23:19
你需要调整你的算法。按照现在的写法,它的复杂度是O(n*m),其中n是关键字的数量,m是文本的长度。这种做法的可扩展性不好。
相反:
- 让keywords成为set,而不是tuple。您只关心针对keywords的成员资格测试,而set的成员资格测试是O(1)。
- 对TEXT进行分词(tokenize)。这比仅仅执行split()要复杂一些,因为您还需要处理删除标点符号/换行的操作。
- 用滑动窗口以3个令牌为一组遍历这些令牌;如果中间的令牌在keywords集中,则获取其周围的令牌并继续执行。
就这样。所以,一些伪代码:
keywords = {"banana", "apple", "orange", ...}
tokens = tokenize(TEXT)
for before, target, after in window(tokens, n=3):
if target in keywords:
#do stuff with `before` and `after`
window是您选择的滑动窗口实现,比如here,而tokenize可以是您自己基于split和strip的实现,或者如果您想要库解决方案的话,可以使用nltk.tokenize。
发布于 2014-08-13 03:48:21
import re
import string

# Letters that may be stripped from the ragged ends of a cut.  ascii_letters
# exists on Python 2 and 3; string.lowercase / string.uppercase are Python 2
# only and vary with the locale.
alphabet = string.ascii_letters

# Escape every keyword so regex metacharacters in it are matched literally,
# and build the alternation once for all three patterns.
_alternation = "|".join(re.escape(k) for k in keywords)
regex1 = re.compile("(%s)" % _alternation)    # a keyword anywhere
regex2 = re.compile("^(%s)" % _alternation)   # a keyword at the start of a cut
regex3 = re.compile("(%s)$" % _alternation)   # a keyword at the end of a cut

for match in regex1.finditer(TEXT):
    # max() keeps the start index from going negative near the start of TEXT.
    cut = TEXT[max(match.start() - before, 0):match.end() + after]
    finalcut = cut
    # Strip the leading/trailing letters (a possibly partial word) unless the
    # cut begins/ends with a keyword, which must be kept intact.
    if not regex2.search(cut):
        finalcut = finalcut.lstrip(alphabet)
    if not regex3.search(cut):
        finalcut = finalcut.rstrip(alphabet)
    # Same output as the Python 2 statement `print cut, finalcut`.
    print("%s %s" % (cut, finalcut))
# This can be improved further: a keyword can sit at the very start or very end
# of the text in only two cases, so only those two must be protected from
# stripping.
# One raw cut per keyword occurrence, in text order.
cuts = [TEXT[max(match.start() - before, 0):match.end() + after]
        for match in regex1.finditer(TEXT)]
last_index = len(cuts) - 1
finalcuts = []
for i, cut in enumerate(cuts):
    finalcut = cut
    # Only the first cut is checked for a leading keyword and only the last cut
    # for a trailing keyword; every other edge is always stripped of letters.
    # Both edges are handled for EVERY cut, so the first cut is also trimmed
    # on the right, the last cut on the left, and a trailing keyword in the
    # last cut is never stripped away.
    if i != 0 or not regex2.search(cut):
        finalcut = finalcut.lstrip(alphabet)
    if i != last_index or not regex3.search(cut):
        finalcut = finalcut.rstrip(alphabet)
    finalcuts.append(finalcut)
# Same output as the Python 2 statement `print cuts, finalcuts`.
print("%s %s" % (cuts, finalcuts))
# Source: https://stackoverflow.com/questions/25277305
复制相似问题