我试图在Python中使用difflib.SequenceMatcher来找出两个字符串的最长公共子串。
string1="""ERROR agave_util.py:64 Timed out waiting for HA alert generated CRITICAL ha_test_util.py:44 HA alert generated, Stack:File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 909, in <module> main(FLAGS, sync_state) File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 878, in main worker.run(sync_state) File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 326, in run if not self.__test_phase_wrapper(test_method): File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 502, in __test_phase_wrapper func() File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 87, in test_stargate_master_power_off self._host_power_off_test_cycle(host_of_stargate_master) File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 27, in _host_power_off_test_cycle self.ha_util.power_off_and_check_ha(host) File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 469, in power_off_and_check_ha self.wait_for_ha_alert(cutoff_usecs=latest_alert_start, **kwargs) File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 418, in wait_for_ha_alert interval=interval, File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 44, in wait_for_true CHECK(result, message) ERROR nutanix_test_runner_worker.py:595 Test failed: 1exc_type: <type 'exceptions.SystemExit'>exc_value: 1stack: File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 502, in __test_phase_wrapper func() File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 87, in test_stargate_master_power_off self._host_power_off_test_cycle(host_of_stargate_master) File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 27, in _host_power_off_test_cycle self.ha_util.power_off_and_check_ha(host) File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 469, in power_off_and_check_ha self.wait_for_ha_alert(cutoff_usecs=latest_alert_start, **kwargs) File 
"/main/.python/qa/util/agave_tools/ha_test_util.py", line 418, in wait_for_ha_alert interval=interval, File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 44, in wait_for_true CHECK(result, message) File "/main/.python/util/base/log.py", line 204, in CHECK FATAL(log_msg, **kwargs) File "/main/.python/util/base/log.py", line 185, in FATAL sys.exit(1) ERROR nutanix_test.py:696 Failed to get gflags from 10.5.132.157. ERROR nutanix_test.py:696 Failed to get gflags from 10.5.132.157. ERROR nutanix_test.py:1699 Failed to save cluster configuration"""
string2="""ERROR agave_util.py:64 Timed out waiting for VMs [u'vm_353ca5', u'vm_e02d7f'] power on CRITICAL ha_test_util.py:44 VMs [u'vm_353ca5', u'vm_e02d7f'] power on, Stack:File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 909, in <module> main(FLAGS, sync_state) File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 878, in main worker.run(sync_state) File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 326, in run if not self.__test_phase_wrapper(test_method): File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 502, in __test_phase_wrapper func() File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 67, in test_zoo_keeper_leader_power_off self._host_power_off_test_cycle(leader_host) File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 27, in _host_power_off_test_cycle self.ha_util.power_off_and_check_ha(host) File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 468, in power_off_and_check_ha self.verify_vms_not_on_host(host_vms, host) File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 617, in verify_vms_not_on_host self.wait_for_vms_power_on(vm_names, per_vm_timeout) File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 599, in wait_for_vms_power_on interval=15) File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 44, in wait_for_true CHECK(result, message) ERROR nutanix_test_runner_worker.py:595 Test failed: 1exc_type: <type 'exceptions.SystemExit'>exc_value: 1stack: File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 502, in __test_phase_wrapper func() File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 67, in test_zoo_keeper_leader_power_off self._host_power_off_test_cycle(leader_host) File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 27, in _host_power_off_test_cycle self.ha_util.power_off_and_check_ha(host) File 
"/main/.python/qa/util/agave_tools/ha_test_util.py", line 468, in power_off_and_check_ha self.verify_vms_not_on_host(host_vms, host) File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 617, in verify_vms_not_on_host self.wait_for_vms_power_on(vm_names, per_vm_timeout) File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 599, in wait_for_vms_power_on interval=15) File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 44, in wait_for_true CHECK(result, message) File "/main/.python/util/base/log.py", line 204, in CHECK FATAL(log_msg, **kwargs) File "/main/.python/util/base/log.py", line 185, in FATAL sys.exit(1) ERROR nutanix_test.py:696 Failed to get gflags from 10.5.132.156. ERROR nutanix_test.py:696 Failed to get gflags from 10.5.132.156. ERROR nutanix_test.py:1699 Failed to save cluster configuration"""
match = SequenceMatcher(None, string1, string2).find_longest_match(0, len(string1), 0, len(string2))
print match
print(string1[match.a: match.a + match.size])
string1="""ERROR agave_util.py:64 Timed out waiting for HA alert generated CRITICAL ha_test_util.py:44,"""
string2="""ERROR agave_util.py:64 Timed out waiting for VMs [u'vm_353ca5', u'vm_e02d7f'] power on CRITICAL ha_test_util.py:44"""
match = SequenceMatcher(None, string1, string2).find_longest_match(0, len(string1), 0, len(string2))
print(string1[match.a: match.a + match.size])
因此,基本上,在比较最前面定义的string1和string2时,返回的是CRITICAL ha_test_util.py:44;而当我从string1和string2中删除一些内容之后(即代码第6、7行重新定义的string1和string2),返回的是ERROR agave_util.py:64 Timed out waiting for
基本上,我的问题是:为什么在第一种情况下,SequenceMatcher没有返回正确的匹配?
发布于 2017-07-13 17:49:23
您遇到的是SequenceMatcher的自动垃圾(autojunk)启发式所带来的影响(在您的情况下是负面影响)。
自动垃圾启发式:
SequenceMatcher支持一种自动将某些序列项视为垃圾的启发式方法。该启发式会统计每个项在序列中出现的次数。如果某个项的重复出现(第一次出现之后的部分)占序列的1%以上,并且该序列至少有200项长,则该项会被标记为“常见的(popular)”,并在序列匹配时被视为垃圾。在创建SequenceMatcher时,可以通过将autojunk参数设置为False来关闭此启发式。
在SequenceMatcher构造函数中,autojunk默认为True。如果您改用autojunk=False,就会得到预期的最长匹配:
from difflib import SequenceMatcher
string1 = """ERROR agave_util.py:64 Timed out waiting for HA alert generated CRITICAL ha_test_util.py:44 HA alert generated, Stack:File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 909, in <module> main(FLAGS, sync_state) File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 878, in main worker.run(sync_state) File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 326, in run if not self.__test_phase_wrapper(test_method): File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 502, in __test_phase_wrapper func() File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 87, in test_stargate_master_power_off self._host_power_off_test_cycle(host_of_stargate_master) File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 27, in _host_power_off_test_cycle self.ha_util.power_off_and_check_ha(host) File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 469, in power_off_and_check_ha self.wait_for_ha_alert(cutoff_usecs=latest_alert_start, **kwargs) File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 418, in wait_for_ha_alert interval=interval, File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 44, in wait_for_true CHECK(result, message) ERROR nutanix_test_runner_worker.py:595 Test failed: 1exc_type: <type 'exceptions.SystemExit'>exc_value: 1stack: File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 502, in __test_phase_wrapper func() File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 87, in test_stargate_master_power_off self._host_power_off_test_cycle(host_of_stargate_master) File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 27, in _host_power_off_test_cycle self.ha_util.power_off_and_check_ha(host) File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 469, in power_off_and_check_ha self.wait_for_ha_alert(cutoff_usecs=latest_alert_start, **kwargs) File 
"/main/.python/qa/util/agave_tools/ha_test_util.py", line 418, in wait_for_ha_alert interval=interval, File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 44, in wait_for_true CHECK(result, message) File "/main/.python/util/base/log.py", line 204, in CHECK FATAL(log_msg, **kwargs) File "/main/.python/util/base/log.py", line 185, in FATAL sys.exit(1) ERROR nutanix_test.py:696 Failed to get gflags from 10.5.132.157. ERROR nutanix_test.py:696 Failed to get gflags from 10.5.132.157. ERROR nutanix_test.py:1699 Failed to save cluster configuration"""
string2 = """ERROR agave_util.py:64 Timed out waiting for VMs [u'vm_353ca5', u'vm_e02d7f'] power on CRITICAL ha_test_util.py:44 VMs [u'vm_353ca5', u'vm_e02d7f'] power on, Stack:File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 909, in <module> main(FLAGS, sync_state) File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 878, in main worker.run(sync_state) File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 326, in run if not self.__test_phase_wrapper(test_method): File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 502, in __test_phase_wrapper func() File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 67, in test_zoo_keeper_leader_power_off self._host_power_off_test_cycle(leader_host) File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 27, in _host_power_off_test_cycle self.ha_util.power_off_and_check_ha(host) File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 468, in power_off_and_check_ha self.verify_vms_not_on_host(host_vms, host) File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 617, in verify_vms_not_on_host self.wait_for_vms_power_on(vm_names, per_vm_timeout) File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 599, in wait_for_vms_power_on interval=15) File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 44, in wait_for_true CHECK(result, message) ERROR nutanix_test_runner_worker.py:595 Test failed: 1exc_type: <type 'exceptions.SystemExit'>exc_value: 1stack: File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 502, in __test_phase_wrapper func() File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 67, in test_zoo_keeper_leader_power_off self._host_power_off_test_cycle(leader_host) File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 27, in _host_power_off_test_cycle self.ha_util.power_off_and_check_ha(host) File 
"/main/.python/qa/util/agave_tools/ha_test_util.py", line 468, in power_off_and_check_ha self.verify_vms_not_on_host(host_vms, host) File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 617, in verify_vms_not_on_host self.wait_for_vms_power_on(vm_names, per_vm_timeout) File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 599, in wait_for_vms_power_on interval=15) File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 44, in wait_for_true CHECK(result, message) File "/main/.python/util/base/log.py", line 204, in CHECK FATAL(log_msg, **kwargs) File "/main/.python/util/base/log.py", line 185, in FATAL sys.exit(1) ERROR nutanix_test.py:696 Failed to get gflags from 10.5.132.156. ERROR nutanix_test.py:696 Failed to get gflags from 10.5.132.156. ERROR nutanix_test.py:1699 Failed to save cluster configuration"""
match = SequenceMatcher(None, string1, string2, autojunk=False).find_longest_match(0, len(string1), 0, len(string2))
print(match)
输出:
Match(a=110, b=156, size=534)
当然,我们也可以检查所有匹配的块,并找出其中最长的一个:
>>> max(SequenceMatcher(None, string1, string2, autojunk=False).get_matching_blocks(),
... key=lambda m: m.size)
Match(a=110, b=156, size=534)
为了用一个简单的例子说明autojunk的作用,让我们看看下面发生了什么:
>>> a = "aa:bb:cc" + ":"*200
>>> b = "aa:bb" + ":"*200
>>> SequenceMatcher(None, a, b).find_longest_match(0, len(a), 0, len(b))
Match(a=0, b=0, size=6) # : is classified as junk
>>> SequenceMatcher(None, a, b, autojunk=False).find_longest_match(0, len(a), 0, len(b))
Match(a=8, b=5, size=200) # : is NOT classified as junk
在第一种情况下(默认autojunk=True),:被视为垃圾字符(它在一个至少200项长的序列中占比超过1%),因此“在人看来是正确的”最长匹配只有6个字符(即开头的6个字符)。
在第二种情况下(使用显式autojunk=False),垃圾启发式算法是关闭的,因此最长的匹配是最后200个字符。
如果您对较短的序列(少于200个字符)重复相同的测试,就会发现autojunk的取值不会带来任何区别,因为对于这样短的序列,垃圾启发式根本不会启用(参见源码)。
https://stackoverflow.com/questions/45079797
复制相似问题