文章/答案/技术大牛

发布

社区首页 >问答首页 >ragel解析器不贪婪吗？

问ragel解析器不贪婪吗？
EN

Stack Overflow用户

提问于 2022-08-23 15:04:45

回答 1查看 49关注 0票数 0

我正在尝试用Ragel和C++编写一个HTTP解析器，但发现生成的解析器并不贪婪。

第一步是解析URL，下面是根据RFC 3986翻译而来的Ragel语法：

#include <cstddef>
#include <cstdio>
#include <iostream>
#include <memory>
#include <string>
#include <string_view>

%%{

    # Ragel machine that recognises a URI following the RFC 3986 grammar and
    # records where each component starts and how long it is.
    machine http_uri_parser;

    # Embedded actions. Each mark_* action records the current scan position p
    # as the start of a URI component; the matching store_* action sets the
    # component length to the distance from that start to p. Every action also
    # prints a trace line showing the remaining input at p.
    action mark_scheme {
        std::printf("mark scheme, p:%s\n", p);
        this->scheme.data = p;
    }
    action store_scheme {
        std::printf("store scheme, p:%s\n", p);
        this->scheme.len = p - this->scheme.data;
    }

    action mark_authority {
        std::printf("mark authority, p:%s\n", p);
        this->authority.data = p;
    }
    action store_authority {
        std::printf("store authority, p:%s\n", p);
        this->authority.len = p - this->authority.data;
    }

    action mark_userinfo {
        std::printf("mark userinfo, p:%s\n", p);
        this->userinfo.data = p;
    }
    action store_userinfo {
        std::printf("store userinfo, p:%s\n", p);
        this->userinfo.len = p - this->userinfo.data;
    }

    action mark_host {
        std::printf("mark host, p:%s\n", p);
        this->host.data = p;
    }
    action store_host {
        std::printf("store host, p:%s\n", p);
        this->host.len = p - this->host.data;
    }

    action mark_port {
        std::printf("mark port, p:%s\n", p);
        this->port.data = p;
    }
    action store_port {
        std::printf("store port, p:%s\n", p);
        this->port.len = p - this->port.data;
    }

    action mark_path {
        std::printf("mark path, p:%s\n", p);
        this->path.data = p;
    }

    action store_path {
        std::printf("store path, p:%s\n", p);
        this->path.len = p - this->path.data;
    }

    action mark_query {
        std::printf("mark query, p:%s\n", p);
        this->query.data = p;
    }
    action store_query {
        std::printf("store query, p:%s\n", p);
        this->query.len = p - this->query.data;
    }

    action mark_fragment {
        std::printf("mark fragment, p:%s\n", p);
        this->fragment.data = p;
    }
    action store_fragment {
        std::printf("store fragment, p:%s\n", p);
        this->fragment.len = p - this->fragment.data;
    }

    # Flags the parse as done and uses fbreak to stop the exec loop.
    action done {
        std::printf("parser done, p:%s\n", p);
        this->_done = 1;
        fbreak;
    }


###############################################################################
# Characters
###############################################################################

    crlf          = '\r\n';
    gen_delims    = ( ':' | '/' | '?' | '#' | '[' | ']' | '@' );
    sub_delims    = ( '!' | '$' | '&' | "'" | '(' | ')'
                    | '*' | '+' | ',' | ';' | '=' );

    reserved      = ( gen_delims | sub_delims );
    unreserved    = ( alpha | digit | '-' | '.' | '_' | '~' );
    pct_encoded   = ( '%' xdigit xdigit );

###############################################################################
# Scheme
###############################################################################

    scheme        = ( alpha ( alpha | digit | '+' | '-' | '.' )* )
                    >mark_scheme %store_scheme;

###############################################################################
# Authority
###############################################################################

    dec_octet     = ( ( digit                  ) # 0-9
                    | ( ( '1'..'9' ) digit     ) # 10-99
                    | ( '1' digit{2}           ) # 100-199
                    | ( '2' ( '0'..'4' ) digit ) # 200-249
                    | ( '25' ( '0'..'5' )      ) # 250-255
                    );

    IPv4_address  = ( dec_octet '.' dec_octet '.' dec_octet '.' dec_octet );

    h16           = ( xdigit{1,4} );
                  # 16 bits of address represented in hexadecimal
    ls32          = ( ( h16 ':' h16 ) | IPv4_address );
                  # least-significant 32 bits of address

    IPv6_address  = ( (                               ( h16 ':' ){6} ls32 )
                    | (                          '::' ( h16 ':' ){5} ls32 )
                    | ( (                 h16 )? '::' ( h16 ':' ){4} ls32 )
                    | ( ( ( h16 ':' ){,1} h16 )? '::' ( h16 ':' ){3} ls32 )
                    | ( ( ( h16 ':' ){,2} h16 )? '::' ( h16 ':' ){2} ls32 )
                    | ( ( ( h16 ':' ){,3} h16 )? '::' ( h16 ':' ){1} ls32 )
                    | ( ( ( h16 ':' ){,4} h16 )? '::'                ls32 )
                    | ( ( ( h16 ':' ){,5} h16 )? '::'                h16  )
                    | ( ( ( h16 ':' ){,6} h16 )? '::'                     )
                    );
    IPv_future    = ( 'v' ( ( xdigit+ ) '.' ) ( unreserved | sub_delims | ':' )+ );
    IP_literal    = ( '[' ( IPv6_address | IPv_future  ) ']' );

    # reg_name only prints trace lines on entry and exit; the host bounds are
    # recorded by the mark_host/store_host actions attached to host below.
    reg_name      = ( ( unreserved | pct_encoded | sub_delims )* )
                  > { std::printf("mark reg_name, p:%s\n", p); }
                  % { std::printf("store reg_name, p:%s\n", p); };

    port          = ( digit* )
                    >mark_port %store_port;
    host          = ( IP_literal | IPv4_address | reg_name )
                    >mark_host %store_host;
    userinfo      = ( ( unreserved | pct_encoded | sub_delims | ':' )* )
                    >mark_userinfo %store_userinfo;
    authority     = ( ( userinfo '@' )? host ( ':' port )? )
                    >mark_authority %store_authority;

###############################################################################
# Path
###############################################################################

    pchar         = ( unreserved | pct_encoded | sub_delims | ':' | '@' );

    segment       = ( pchar* );
    segment_nz    = ( pchar+ );
                  # non-zero-length
    segment_nz_nc = ( ( unreserved | pct_encoded | sub_delims | '@' )+ );
                  # non-zero-length segment without any colon ':'

    path_abempty  = ( ( '/' segment )* )
                  >mark_path %store_path;
    path_absolute = ( '/' ( segment_nz ( '/' segment )* )? )
                  >mark_path %store_path;
    path_noscheme = ( segment_nz_nc ( '/' segment )* )
                  >mark_path %store_path;
    path_rootless = ( segment_nz ( '/' segment )* )
                  >mark_path %store_path;
    path_empty    = ( zlen )
                  >mark_path %store_path;

    path          = ( path_abempty    # begins with '/' or is empty
                    | path_absolute   # begins with '/' but not '//'
                    | path_noscheme   # begins with a non-colon segment
                    | path_rootless   # begins with a segment
                    | path_empty      # zero characters
                    );

###############################################################################
# Query
###############################################################################

    query         = ( ( pchar | '/' | '?' )* )
                    >mark_query %store_query;

###############################################################################
# Fragment
###############################################################################

    fragment      = ( ( pchar | '/' | '?' )* )
                    >mark_fragment %store_fragment;

###############################################################################
# URI
###############################################################################

    # hier_part may be path_empty (zlen), so a URI can legally end right after
    # the ':' that follows the scheme.
    hier_part     = ( ( '//' authority path_abempty )
                    | ( path_absolute               )
                    | ( path_rootless               )
                    | ( path_empty                  )
                    );

    relative_part = ( ( '//' authority path_abempty )
                    | ( path_absolute               )
                    | ( path_noscheme               )
                    | ( path_empty                  )
                    );

    absolute_URI  = ( scheme ':' hier_part ( '?' query )? );

    relative_ref  = ( relative_part ( '?' query )? ( '#' fragment )? );
    URI           = ( scheme ':' hier_part ( '?' query )? ( '#' fragment )? );

    URI_reference = ( URI | relative_ref );

###############################################################################
# main rule
###############################################################################

    # NOTE: '@' embeds done into every transition that enters a final state,
    # so done runs (and fbreak halts the scan) the first time any final state
    # is reached, not only once the whole input has been consumed.
    main         := URI @done;
}%%

%% write data;

/// Borrowed view of a byte range: `data` points at the first byte and `len`
/// holds the number of bytes. A default-constructed slice is empty
/// (`len == 0`, `data == nullptr`).
struct slice {
  size_t      len  = 0;
  const char* data = nullptr;
};

/// Holds the input URI, the Ragel machine state and the components that the
/// machine's actions extract.
///
/// The component slices store pointers taken from the scan position, which
/// execute() derives from `uri`'s buffer. They therefore borrow from `uri`
/// and must be treated as stale once `uri` is modified or the parser object
/// is copied or moved.
struct http_parser {
  // Rule of Zero: no special members are declared. Explicitly defaulting the
  // destructor would suppress the implicit move operations.

  /// Re-initialises the machine state, status flags and parsed components.
  void reset();
  /// Runs the machine over `uri` and prints the outcome.
  void execute();

  /// Current Ragel machine state, carried between reset() and execute().
  int state = 0;

  /// Text to parse; owns the bytes every slice below points into.
  std::string uri;

  /* parsed result */

  slice scheme{};
  slice authority{};
  slice userinfo{};
  slice host{};
  slice port{};
  slice path{};
  slice query{};
  slice fragment{};

  /* parse status */

  bool _eof{};
  bool _done{};
  bool _failed{};
};

void http_parser::reset() {
  int cs = 0;

  %% write init;

  this->state   = cs;
  this->_eof    = false;
  this->_done   = false;
  this->_failed = false;

  this->scheme    = slice{};
  this->authority = slice{};
  this->userinfo  = slice{};
  this->host      = slice{};
  this->port      = slice{};
  this->path      = slice{};
  this->query     = slice{};
  this->fragment  = slice{};
}

void http_parser::execute() {
  const char* p   = &this->uri.front();
  const char* pe  = &this->uri.back() + 1;
  const char* eof = pe;
  int         cs  = this->state;

  %% write exec;

  if (!this->_failed) {
    this->state = cs;
  }

  std::printf(
      "eof:%d, done:%d, failed:%d, state:%d, p:%p, pe:%p, diff:%ld, rest:%s\n",
      this->_eof, this->_done, this->_failed, this->state, p, pe, pe - p, p);

#define print_parser_component(fld)                                            \
  if (this->fld.len) {                                                         \
    std::printf(#fld ": %.*s\n", (int)this->fld.len, this->fld.data);          \
  }

  print_parser_component(scheme);
  print_parser_component(authority);
  print_parser_component(userinfo);
  print_parser_component(host);
  print_parser_component(port);
  print_parser_component(path);
  print_parser_component(query);
  print_parser_component(fragment);

#undef print_parser_component
}

这里，我将main规则设置为URI，而不是URI_reference，以便先测试绝对URL。

下面是测试代码：

int main(int argc, char** argv) {
  auto parser = std::make_unique<http_parser>();
  parser->uri =
      "https://chenjianyong.com/blog/2022/01/"
      "seastar_fpc_1.html?hello=world#preface";
  parser->reset();
  parser->execute();
  return 0;
}

运行程序并打印：

mark scheme, p:https://chenjianyong.com/blog/2022/01/seastar_fpc_1.html?hello=world#preface
store scheme, p:://chenjianyong.com/blog/2022/01/seastar_fpc_1.html?hello=world#preface
parser done, p:://chenjianyong.com/blog/2022/01/seastar_fpc_1.html?hello=world#preface
eof:0, done:1, failed:0, state:171, p:0x6000016f4006, pe:0x6000016f404c, diff:70, rest://chenjianyong.com/blog/2022/01/seastar_fpc_1.html?hello=world#preface
scheme: https

在解析完scheme（https）之后，解析器似乎就停止了，这太奇怪了！为什么它不贪婪地一直消耗到最后一个字节呢？

将main规则更改为main := (URI crlf @done);，并在测试URL末尾追加crlf，然后重新生成解析器。这一次解析器可以一直消耗到末尾，打印结果显示所有URI组件都已成功解析：

mark scheme, p:https://chenjianyong.com/blog/2022/01/seastar_fpc_1.html?hello=world#preface

store scheme, p:://chenjianyong.com/blog/2022/01/seastar_fpc_1.html?hello=world#preface

mark path, p://chenjianyong.com/blog/2022/01/seastar_fpc_1.html?hello=world#preface

mark authority, p:chenjianyong.com/blog/2022/01/seastar_fpc_1.html?hello=world#preface

mark userinfo, p:chenjianyong.com/blog/2022/01/seastar_fpc_1.html?hello=world#preface

mark host, p:chenjianyong.com/blog/2022/01/seastar_fpc_1.html?hello=world#preface

mark reg_name, p:chenjianyong.com/blog/2022/01/seastar_fpc_1.html?hello=world#preface

store reg_name, p:/blog/2022/01/seastar_fpc_1.html?hello=world#preface

store host, p:/blog/2022/01/seastar_fpc_1.html?hello=world#preface

store authority, p:/blog/2022/01/seastar_fpc_1.html?hello=world#preface

mark path, p:/blog/2022/01/seastar_fpc_1.html?hello=world#preface

store path, p:?hello=world#preface

mark query, p:hello=world#preface

store query, p:#preface

mark fragment, p:preface

store fragment, p:

parser done, p:

eof:0, done:1, failed:0, state:188, p:0x60000140c04e, pe:0x60000140c04e, diff:0, rest:
scheme: https
authority: chenjianyong.com
host: chenjianyong.com
path: /blog/2022/01/seastar_fpc_1.html
query: hello=world
fragment: preface

那么，为什么我的ragel解析器不贪婪呢？

parsing

url

ragel

http

回答 1

Stack Overflow用户

发布于 2022-08-25 16:07:25

为什么它不贪婪地消耗到最后一个字节呢？

正如您已经注意到的，您的done动作在冒号之后就被执行了（而且其中包含fbreak）。我们可以把您的有限状态机（FSM，相当大的一个！）渲染出来加以确认（ragel -o f.dot -Vp source.c++ && dot -Tpng -o f.png f.dot）：

它之所以被执行，是因为main规则中写的是@done，而根据文档，@的作用是：

将操作嵌入到将机器移动到最终状态的任何转换中。

正如您所看到的，171是您的状态机可能的最终状态之一。如果把main规则改为URI %done，情况就会有所不同：

done不会再在这里被执行，它只会在以下转换上执行：

通过最终状态离开机器的转换。

我觉得更适合你的情况。

现在您可能想知道为什么171是最终状态之一：这是因为hier_part除了其他形式之外，还可以是path_empty，也就是zlen，所以状态机可以在这个状态结束（对应的输入是“scheme:”）→ 因此它是最终状态之一 → 于是在转换到这个状态时就会执行@done。

票数 0

页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持

原文链接：

https://stackoverflow.com/questions/73461066

复制

相似问题

问ragel解析器不贪婪吗？
EN

回答 1

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问ragel解析器不贪婪吗？EN

回答 1

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问ragel解析器不贪婪吗？
EN