我试图用C++使用拉格尔编写一个HTTP解析器,但发现生成的解析器并不贪婪。
第一步是解析一个URL,下面是从RFC-3986翻译的ragel语法
#include <string>
#include <iostream>
#include <string_view>
%%{
machine http_uri_parser;
action mark_scheme {
std::printf("mark scheme, p:%s\n", p);
this->scheme.data = p;
}
action store_scheme {
std::printf("store scheme, p:%s\n", p);
this->scheme.len = p - this->scheme.data;
}
action mark_authority {
std::printf("mark authority, p:%s\n", p);
this->authority.data = p;
}
action store_authority {
std::printf("store authority, p:%s\n", p);
this->authority.len = p - this->authority.data;
}
action mark_userinfo {
std::printf("mark userinfo, p:%s\n", p);
this->userinfo.data = p;
}
action store_userinfo {
std::printf("store userinfo, p:%s\n", p);
this->userinfo.len = p - this->userinfo.data;
}
action mark_host {
std::printf("mark host, p:%s\n", p);
this->host.data = p;
}
action store_host {
std::printf("store host, p:%s\n", p);
this->host.len = p - this->host.data;
}
action mark_port {
std::printf("mark port, p:%s\n", p);
this->port.data = p;
}
action store_port {
std::printf("store port, p:%s\n", p);
this->port.len = p - this->port.data;
}
action mark_path {
std::printf("mark path, p:%s\n", p);
this->path.data = p;
}
action store_path {
std::printf("store path, p:%s\n", p);
this->path.len = p - this->path.data;
}
action mark_query {
std::printf("mark query, p:%s\n", p);
this->query.data = p;
}
action store_query {
std::printf("store query, p:%s\n", p);
this->query.len = p - this->query.data;
}
action mark_fragment {
std::printf("mark fragment, p:%s\n", p);
this->fragment.data = p;
}
action store_fragment {
std::printf("store fragment, p:%s\n", p);
this->fragment.len = p - this->fragment.data;
}
action done {
std::printf("parser done, p:%s\n", p);
this->_done = 1;
fbreak;
}
###############################################################################
# Characters
###############################################################################
crlf = '\r\n';
gen_delims = ( ':' | '/' | '?' | '#' | '[' | ']' | '@' );
sub_delims = ( '!' | '$' | '&' | "'" | '(' | ')'
| '*' | '+' | ',' | ';' | '=' );
reserved = ( gen_delims | sub_delims );
unreserved = ( alpha | digit | '-' | '.' | '_' | '~' );
pct_encoded = ( '%' xdigit xdigit );
###############################################################################
# Scheme
###############################################################################
scheme = ( alpha ( alpha | digit | '+' | '-' | '.' )* )
>mark_scheme %store_scheme;
###############################################################################
# Authority
###############################################################################
dec_octet = ( ( digit ) # 0-9
| ( ( '1'..'9' ) digit ) # 10-99
| ( '1' digit{2} ) # 100-199
| ( '2' ( '0'..'4' ) digit ) # 200-249
| ( '25' ( '0'..'5' ) ) # 250-255
);
IPv4_address = ( dec_octet '.' dec_octet '.' dec_octet '.' dec_octet );
h16 = ( xdigit{1,4} );
# 16 bits of address represented in hexadecimal
ls32 = ( ( h16 ':' h16 ) | IPv4_address );
# least-significant 32 bits of address
IPv6_address = ( ( ( h16 ':' ){6} ls32 )
| ( '::' ( h16 ':' ){5} ls32 )
| ( ( h16 )? '::' ( h16 ':' ){4} ls32 )
| ( ( ( h16 ':' ){,1} h16 )? '::' ( h16 ':' ){3} ls32 )
| ( ( ( h16 ':' ){,2} h16 )? '::' ( h16 ':' ){2} ls32 )
| ( ( ( h16 ':' ){,3} h16 )? '::' ( h16 ':' ){1} ls32 )
| ( ( ( h16 ':' ){,4} h16 )? '::' ls32 )
| ( ( ( h16 ':' ){,5} h16 )? '::' h16 )
| ( ( ( h16 ':' ){,6} h16 )? '::' )
);
IPv_future = ( 'v' ( ( xdigit+ ) '.' ) ( unreserved | sub_delims | ':' )+ );
IP_literal = ( '[' ( IPv6_address | IPv_future ) ']' );
reg_name = ( ( unreserved | pct_encoded | sub_delims )* )
> { std::printf("mark reg_name, p:%s\n", p); }
% { std::printf("store reg_name, p:%s\n", p); };
port = ( digit* )
>mark_port %store_port;
host = ( IP_literal | IPv4_address | reg_name )
>mark_host %store_host;
userinfo = ( ( unreserved | pct_encoded | sub_delims | ':' )* )
>mark_userinfo %store_userinfo;
authority = ( ( userinfo '@' )? host ( ':' port )? )
>mark_authority %store_authority;
###############################################################################
# Path
###############################################################################
pchar = ( unreserved | pct_encoded | sub_delims | ':' | '@' );
segment = ( pchar* );
segment_nz = ( pchar+ );
# non-zero-length
segment_nz_nc = ( ( unreserved | pct_encoded | sub_delims | '@' )+ );
# non-zero-length segment without any colon ':'
path_abempty = ( ( '/' segment )* )
>mark_path %store_path;
path_absolute = ( '/' ( segment_nz ( '/' segment )* )? )
>mark_path %store_path;
path_noscheme = ( segment_nz_nc ( '/' segment )* )
>mark_path %store_path;
path_rootless = ( segment_nz ( '/' segment )* )
>mark_path %store_path;
path_empty = ( zlen )
>mark_path %store_path;
path = ( path_abempty # begins with '/' or is empty
| path_absolute # begins with '/' but not '//'
| path_noscheme # begins with a non-colon segment
| path_rootless # begins with a segment
| path_empty # zero characters
);
###############################################################################
# Query
###############################################################################
query = ( ( pchar | '/' | '?' )* )
>mark_query %store_query;
###############################################################################
# Fragment
###############################################################################
fragment = ( ( pchar | '/' | '?' )* )
>mark_fragment %store_fragment;
###############################################################################
# URI
###############################################################################
hier_part = ( ( '//' authority path_abempty )
| ( path_absolute )
| ( path_rootless )
| ( path_empty )
);
relative_part = ( ( '//' authority path_abempty )
| ( path_absolute )
| ( path_noscheme )
| ( path_empty )
);
absolute_URI = ( scheme ':' hier_part ( '?' query )? );
relative_ref = ( relative_part ( '?' query )? ( '#' fragment )? );
URI = ( scheme ':' hier_part ( '?' query )? ( '#' fragment )? );
URI_reference = ( URI | relative_ref );
###############################################################################
# main rule
###############################################################################
main := URI @done;
}%%
%% write data;
struct slice {
size_t len{};
const char* data{};
};
struct http_parser {
http_parser() = default;
~http_parser() = default;
void reset();
void execute();
int state = 0;
std::string uri;
/* parsed result */
slice scheme{};
slice authority{};
slice userinfo{};
slice host{};
slice port{};
slice path{};
slice query{};
slice fragment{};
/* parse status */
bool _eof{};
bool _done{};
bool _failed{};
};
void http_parser::reset() {
int cs = 0;
%% write init;
this->state = cs;
this->_eof = false;
this->_done = false;
this->_failed = false;
this->scheme = slice{};
this->authority = slice{};
this->userinfo = slice{};
this->host = slice{};
this->port = slice{};
this->path = slice{};
this->query = slice{};
this->fragment = slice{};
}
void http_parser::execute() {
const char* p = &this->uri.front();
const char* pe = &this->uri.back() + 1;
const char* eof = pe;
int cs = this->state;
%% write exec;
if (!this->_failed) {
this->state = cs;
}
std::printf(
"eof:%d, done:%d, failed:%d, state:%d, p:%p, pe:%p, diff:%ld, rest:%s\n",
this->_eof, this->_done, this->_failed, this->state, p, pe, pe - p, p);
#define print_parser_component(fld) \
if (this->fld.len) { \
std::printf(#fld ": %.*s\n", (int)this->fld.len, this->fld.data); \
}
print_parser_component(scheme);
print_parser_component(authority);
print_parser_component(userinfo);
print_parser_component(host);
print_parser_component(port);
print_parser_component(path);
print_parser_component(query);
print_parser_component(fragment);
#undef print_parser_component
}这里,我将main规则设置为URL,而不是URI_reference,以便首先测试绝对URL。
下面是测试代码:
int main(int argc, char** argv) {
auto parser = std::make_unique<http_parser>();
parser->uri =
"https://chenjianyong.com/blog/2022/01/"
"seastar_fpc_1.html?hello=world#preface";
parser->reset();
parser->execute();
return 0;
}运行程序并打印:
mark scheme, p:https://chenjianyong.com/blog/2022/01/seastar_fpc_1.html?hello=world#preface
store scheme, p:://chenjianyong.com/blog/2022/01/seastar_fpc_1.html?hello=world#preface
parser done, p:://chenjianyong.com/blog/2022/01/seastar_fpc_1.html?hello=world#preface
eof:0, done:1, failed:0, state:171, p:0x6000016f4006, pe:0x6000016f404c, diff:70, rest://chenjianyong.com/blog/2022/01/seastar_fpc_1.html?hello=world#preface
scheme: https解析方案https://之后,解析器似乎就停止了,这太奇怪了!为什么它不贪婪地消耗到最后一个字节呢?
将主要规则更改为main := (crlf @done);后,将crlf附加到测试URL,重新生成解析器,这一次解析器可以使用到末尾,打印显示所有crlf组件都已成功解析:
mark scheme, p:https://chenjianyong.com/blog/2022/01/seastar_fpc_1.html?hello=world#preface
store scheme, p:://chenjianyong.com/blog/2022/01/seastar_fpc_1.html?hello=world#preface
mark path, p://chenjianyong.com/blog/2022/01/seastar_fpc_1.html?hello=world#preface
mark authority, p:chenjianyong.com/blog/2022/01/seastar_fpc_1.html?hello=world#preface
mark userinfo, p:chenjianyong.com/blog/2022/01/seastar_fpc_1.html?hello=world#preface
mark host, p:chenjianyong.com/blog/2022/01/seastar_fpc_1.html?hello=world#preface
mark reg_name, p:chenjianyong.com/blog/2022/01/seastar_fpc_1.html?hello=world#preface
store reg_name, p:/blog/2022/01/seastar_fpc_1.html?hello=world#preface
store host, p:/blog/2022/01/seastar_fpc_1.html?hello=world#preface
store authority, p:/blog/2022/01/seastar_fpc_1.html?hello=world#preface
mark path, p:/blog/2022/01/seastar_fpc_1.html?hello=world#preface
store path, p:?hello=world#preface
mark query, p:hello=world#preface
store query, p:#preface
mark fragment, p:preface
store fragment, p:
parser done, p:
eof:0, done:1, failed:0, state:188, p:0x60000140c04e, pe:0x60000140c04e, diff:0, rest:
scheme: https
authority: chenjianyong.com
host: chenjianyong.com
path: /blog/2022/01/seastar_fpc_1.html
query: hello=world
fragment: preface那么,为什么我的ragel解析器不贪婪呢?
发布于 2022-08-25 16:07:25
为什么它不贪婪地消耗到最后一个字节呢?
正如您已经注意到的,您的done操作是在冒号之后执行的(在冒号内有fbreak ),我们可以通过呈现您的(相当大的一个!)来确认它!密克罗尼西亚联邦(ragel -o f.dot -Vp source.c++ && dot -Tpng -o f.png f.dot):

它之所以被执行,是因为main规范表示@done,根据文档
将操作嵌入到将机器移动到最终状态的任何转换中。
正如你所看到的,171是你的机器可能的最后状态之一。如果要将main规范更改为URI %done,则会有一些不同:

done不再被执行,它只会在
通过最终状态离开机器的转换。
我觉得更适合你的情况。
现在您可能想知道为什么171是最后的状态之一,这是因为hier_part和其他东西一样可以是path_empty,也就是zlen,所以机器可以在这个状态(对于scheme:输入) ->中完成--这是在转换到这个状态时执行的最后状态之一-> @done。
https://stackoverflow.com/questions/73461066
复制相似问题