首页
学习
活动
专区
圈层
工具
发布
社区首页 >问答首页 >最小正则发动机

最小正则发动机
EN

Code Review用户
提问于 2018-10-26 15:16:52
回答 1查看 122关注 0票数 9

几个月前,我发布了一个状态机以供审查。由于反馈,我能够大大简化实现。我张贴的修订版本,连同Regex类,称为它。

我想我主要是在寻找结构方面的反馈。我的Node类和StateMachine类之间的关系有点混乱;我并不总是确定哪种方法应该属于哪个类。我认为,我的下一个标记,我的雷克萨斯沟通方式也很麻烦。

state_machine.py

代码语言:javascript
复制
class Node:
    def __init__(self, value):
        self.value = value
        self.children = set()

    def empty(self):
        return self.value == ''

    def add_child(self, other):
        self.children.add(other)

    def find_parent_of_terminal(self, terminal):
        """
        We assume that there shall only be one node leading to the terminal
        and that there is only one terminal
        """
        visited = set()
        to_explore = {self}
        while to_explore:
            current = to_explore.pop()
            visited.add(current)
            if terminal in current.children:
                # If this fails, then there is a bug in union, concat, or kleene
                assert len(current.children) == 1
                return current
            to_explore.update({node for node in current.children 
                               if node not in visited})
        return None

    def leads_to(self, value):
        """
        Return True iff argument can be reached by traversing empty nodes
        """
        return bool(self._get_node_if_reachable(value))

    def _get_node_if_reachable(self, value):
        for node in self.children:
            while node and node.empty():
                if node == value:
                    return node
                node = node._get_node_if_reachable(value)
        return None

    def __repr__(self):
        result = '{} : ['.format(self.value)
        for node in self.children:
            result += str(node.value) + ', '
        result += ']\n'

        return result


def EmptyNode():
    return Node('')


class StateMachine:

    def __init__(self):
        self.initial = EmptyNode()
        self.terminal = EmptyNode()

    def __repr__(self):
        return str(self.initial)

    @staticmethod
    def from_string(source):
        dfa = StateMachine()
        nodes = [Node(char) for char in source]

        dfa.initial.add_child(nodes[0])
        for i in range(len(source) - 1):
            nodes[i].add_child(nodes[i + 1])
        nodes[-1].add_child(dfa.terminal)
        return dfa

    @staticmethod
    def from_set(chars):
        dfa = StateMachine()
        first = EmptyNode()
        penultimate = EmptyNode()
        dfa.initial.children = {first}

        for char in chars:
            char_node = Node(char)
            first.add_child(char_node)
            char_node.add_child(penultimate)

        penultimate.add_child(dfa.terminal)
        return dfa

    def concat(self, other):
        other.initial.find_parent_of_terminal(other.terminal).children = {self.terminal}
        self.initial.find_parent_of_terminal(self.terminal).children = other.initial.children

        return self

    def union(self, other):
        self.initial.children.update(other.initial.children)

        this_last = self.initial.find_parent_of_terminal(self.terminal)
        other_last = other.initial.find_parent_of_terminal(other.terminal)
        empty_node = EmptyNode()
        empty_node.add_child(self.terminal)
        this_last.children = {empty_node}
        other_last.children = {empty_node}
        return self

    def kleene(self):
        penultimate_node = self.initial.find_parent_of_terminal(self.terminal)
        dummy = EmptyNode()
        penultimate_node.children = {dummy}
        dummy.add_child(self.terminal)
        penultimate_node.add_child(self.initial)
        self.initial.add_child(dummy)
        return self

    def _get_next_state(self, nodes, value, visited=None):
        if visited is None:
            visited = set()
        result = set()
        for node in nodes:
            visited.add(node)
            for child in node.children:
                if child.empty() and child not in visited:
                    result.update(self._get_next_state([child], value, visited))
                elif child.value == value:
                    result.add(child)
        return result

    def is_match(self, nodes):
        for node in nodes:
            if node .leads_to(self.terminal):
                return True
        return False

    def match(self, source):
        """
        Match a target string by simulating a NFA
        :param source: string to match
        :return: Matched part of string, or None if no match is found
        """
        result = ''
        last_match = None
        current = {self.initial}
        for char in source:
            next_nodes = self._get_next_state(current, char)
            if next_nodes:
                current = next_nodes
                result += char

                if self.is_match(current):
                    last_match = result
            else:
                break

        if self.is_match(current):
            last_match = result
        return last_match

regex.py

代码语言:javascript
复制
import collections
import enum
import string


from state_machine import StateMachine


class Token(enum.Enum):
    METACHAR = 0
    CHAR = 1
    ERROR = 2


class LexResult(collections.namedtuple('LexResult', ['token', 'lexeme'])):
    def __bool__(self):
        return self.token != Token.ERROR


class RegexLexer(object):
    metachars = '-|[]^().*'

    def __init__(self, pattern: str):
        self._pattern = pattern
        self._pos = 0
        self._stack = []

    def peek(self) -> LexResult:
        if self._pos >= len(self._pattern):
            return LexResult(Token.ERROR, '')
        next_char = self._pattern[self._pos]
        if next_char in self.metachars:
            token = Token.METACHAR
        else:
            token = Token.CHAR
        return LexResult(token, next_char)

    def _eat_token_type(self, token: Token) -> LexResult:
        next_match = self.peek()
        if next_match.token != token:
            return LexResult(Token.ERROR, next_match.lexeme)
        self._pos += 1
        return next_match

    def _eat_token(self, match: LexResult) -> LexResult:
        next_match = self.peek()
        if next_match == match:
            self._pos += 1
            return next_match
        return LexResult(Token.ERROR, next_match.lexeme)

    def mark(self):
        self._stack.append(self._pos)

    def clear(self):
        self._stack.pop()

    def backtrack(self):
        self._pos = self._stack.pop()

    def eat_char(self, char=''):
        if char:
            return self._eat_token(LexResult(Token.CHAR, char))
        return self._eat_token_type(Token.CHAR)

    def eat_metachar(self, metachar):
        return self._eat_token(LexResult(Token.METACHAR, metachar))


class Regex(object):
    CHARACTERS = string.printable

    def __init__(self, pattern: str):
        """
        Initialize regex by compiling provided pattern
        """
        self._lexer = RegexLexer(pattern)
        self._state_machine = self.parse()

    def match(self, text: str) -> str:
        """
        Match text according to provided pattern.
        Returns matched substring if a match was found,
        or None otherwise
        """
        assert self._state_machine
        return self._state_machine.match(text)

    def parse(self):
        nfa = self.parse_simple_re()
        if not nfa:
            return None
        while True:
            self._lexer.mark()
            if not self._lexer.eat_metachar('|'):
                self._lexer.backtrack()
                return nfa
            next_nfa = self.parse_simple_re()
            if not next_nfa:
                self._lexer.backtrack()
                return nfa
            nfa = nfa.union(next_nfa)
            self._lexer.clear()

    def parse_simple_re(self):
        """
        <simple-re> = <basic-re>+
        """
        nfa = self.parse_basic_re()
        if not nfa:
            return None

        while True:
            next_nfa = self.parse_basic_re()
            if not next_nfa:
                break
            nfa = nfa.concat(next_nfa)
        return nfa

    def parse_basic_re(self):
        """
        <elementary-re> "*" | <elementary-re> "+" | <elementary-re>
        """
        nfa = self.parse_elementary_re()
        if not nfa:
            return None
        next_match = self._lexer.peek()
        if not next_match or next_match.token != Token.METACHAR:
            return nfa
        if next_match.lexeme == '*':
            self._lexer.eat_metachar('*')
            return nfa.kleene()
        if next_match.lexeme == '+':
            self._lexer.eat_metachar('+')
            return nfa.union(nfa.kleene())
        return nfa

    def parse_elementary_re(self):
        """
        <elementary-RE> = <group> | <any> | <char> | <set>
        :return: DFA
        """
        self._lexer.mark()
        nfa = self.parse_group()
        if nfa:
            self._lexer.clear()
            return nfa

        self._lexer.backtrack()
        if self._lexer.eat_metachar('.'):
            return StateMachine.from_set({x for x in self.CHARACTERS})

        char = self._lexer.eat_char()
        if char:
            return StateMachine.from_string(char.lexeme)

        set_chars = self.parse_set()
        if not set_chars:
            return None
        return StateMachine.from_set(set_chars)

    def parse_group(self):
        """
        <group> = "(" <RE> ")"
        :return: DFA
        """
        if not self._lexer.eat_metachar('('):
            return None
        state_machine = self.parse()
        if not state_machine:
            return None
        if not self._lexer.eat_metachar(')'):
            return None
        return state_machine

    def parse_range(self) -> {str}:
        """
        <range> = <CHAR> "-" <CHAR>
        """
        first = self._lexer.eat_char()
        if not first:
            return set()

        if not self._lexer.eat_metachar('-'):
            return set()

        last = self._lexer.eat_char()
        if not last:
            return set()

        return {chr(x) for x in range(ord(first.lexeme), ord(last.lexeme) + 1)}

    def parse_set_item(self) -> {str}:
        """
        <set item> = <range> | <char>
        """
        self._lexer.mark()
        set_item = self.parse_range()
        if set_item:
            self._lexer.clear()
            return set_item
        self._lexer.backtrack()
        next_item = self._lexer.eat_char()
        return {next_item.lexeme} if next_item else set()

    def parse_set_items(self) -> {str}:
        """
        <set items> = <set item>+
        """
        items = self.parse_set_item()
        if not items:
            return set()
        next_items = self.parse_set_item()
        while next_items:
            items.update(next_items)
            next_items = self.parse_set_item()
        return items

    def parse_positive_set(self) -> {str}:
        if not self._lexer.eat_metachar('['):
            return set()
        set_items = self.parse_set_items()
        if not set_items:
            return set()
        if not self._lexer.eat_metachar(']'):
            return set()
        return set_items

    def parse_negative_set(self) -> {str}:
        if not self._lexer.eat_metachar('['):
            return set()
        if not self._lexer.eat_metachar('^'):
            return set()
        set_items = self.parse_set_items()
        if not set_items:
            return set()
        if not self._lexer.eat_metachar(']'):
            return set()
        return set(string.printable).difference(set_items)

    def parse_set(self) -> {str}:
        """
        Parse something like [a-z9] and return the set of allowed characters
        """
        self._lexer.mark()
        set_items = self.parse_positive_set()
        if set_items:
            self._lexer.clear()
            return set_items
        self._lexer.backtrack()
        return self.parse_negative_set()

最后,通过一小部分单元测试来显示使用情况:

代码语言:javascript
复制
import unittest
from state_machine import StateMachine
from regex import Regex


class TestStateMachine(unittest.TestCase):

    def test_union(self):
        state_machine = StateMachine.from_string('abc')
        state_machine = state_machine.union(StateMachine.from_string('def'))

        self.assertEqual(state_machine.match('abc'), 'abc')
        self.assertEqual(state_machine.match('def'), 'def')
        self.assertIsNone(state_machine.match('de'))

    def test_kleene(self):
        state_machine = StateMachine.from_string('abc')
        state_machine = state_machine.kleene()

        self.assertEqual(state_machine.match(''), '')
        self.assertEqual(state_machine.match('abc'), 'abc')
        self.assertEqual(state_machine.match('abcabc'), 'abcabc')
        self.assertEqual(state_machine.match('abcDabc'), 'abc')

    def test_concat(self):
        state_machine = StateMachine.from_string('ab')
        state_machine = state_machine.concat(StateMachine.from_string('cd'))

        self.assertEqual(state_machine.match('abcd'), 'abcd')
        self.assertEqual(state_machine.match('abcde'), 'abcd')
        self.assertIsNone(state_machine.match('abc'))


class TestRegex(unittest.TestCase):

    def test_identifier_regex(self):
        regex = Regex('[a-zA-Z_][a-zA-Z0-9_]*')
        self.assertEqual(regex.match('a'), 'a')
        self.assertFalse(regex.match('0'))
        self.assertTrue(regex.match('a0'))
        self.assertEqual(regex.match('a0_3bd'), 'a0_3bd')
        self.assertEqual(regex.match('abd-43'), 'abd')

    def test_parentheses(self):
        regex = Regex('d(ab)*')
        self.assertEqual(regex.match('d'), 'd')
        self.assertEqual(regex.match('dab'), 'dab')
        self.assertEqual(regex.match('daa'), 'd')
        self.assertEqual(regex.match('dabab'), 'dabab')

    def test_union(self):
        regex = Regex('(ab*d)|(AG)')
        self.assertEqual(regex.match('adG'), 'ad')
        self.assertEqual(regex.match('AGfe'), 'AG')


if __name__ == '__main__':
    unittest.main()
EN

回答 1

Code Review用户

回答已采纳

发布于 2019-03-31 23:24:04

耶!你运行flake8并跟踪PEP-8。很干净的密码。

代码语言:javascript
复制
    self.assertEqual(state_machine.match('abc'), 'abc')

嗯,这可以说是倒过来的。在许多语言中,xUnit的约定是assertEqual(expected, computed)。它会影响故障诊断输出的显示方式。

代码语言:javascript
复制
    state_machine = state_machine.union(StateMachine.from_string('def'))

选择公共API的名称union可能有点令人困惑。"Union“取自集合论,而"alternation”是正则文学倾向于用于|的术语。

代码语言:javascript
复制
    state_machine = StateMachine.from_string('abc')

类名非常清楚,很好。对于我们将要使用的局部变量,sm就足够了。您已经有一行可以验证.from_string()没有爆炸,因此考虑在一行上组合两个赋值:

代码语言:javascript
复制
    sm = StateMachine.from_string('abc').kleene()

Regex类非常简单。拍你自己的背。

lexer中的peek方法可能有点棘手,它将从关于何时消费或不消费的评论中获益。我在寻找posstack上的不变量。我喜欢assert in find_parent_of_terminal,以及它的评论。

代码语言:javascript
复制
        to_explore.update({node for node in current.children 
                           if node not in visited})

这只是set差异,是吗?children - visited

总的来说,看上去不错。把它运出去!

票数 4
EN
页面原文内容由Code Review提供。腾讯云小微IT领域专用引擎提供翻译支持
原文链接:

https://codereview.stackexchange.com/questions/206333

复制
相关文章

相似问题

领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档