[网络协议]ragel 解析http 请求

http request

"GET http://www.example.com/index.html?id=1 HTTP/1.1\r\n"
"Proxy-Connection: keep-alive\r\n"
"Pragma: no-cache\r\n"
"Cache-Control: no-cache\r\n"
"Accept: text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01\r\n"
"User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36\r\n"
"X-Requested-With: XMLHttpRequest\r\n"
"Accept-Encoding: gzip, deflate\r\n"
"Accept-Language: zh-CN,zh;q=0.9\r\n\r\n";


#ifndef http_parser_h
#define http_parser_h

#include <stdint.h>
#include <stdio.h>
#include <sys/types.h>

/*
 * Callback for a single parsed element.
 *   data   - the parser's user pointer (http_parser.data)
 *   at     - start of the element inside the caller's buffer (not NUL-terminated)
 *   length - number of bytes in the element
 */
typedef void (*element_cb)(void *data, const char *at, size_t length);

/*
 * Callback for one header line.
 *   field/flen - header name slice
 *   value/vlen - header value slice
 * Both slices point into the caller's buffer and are not NUL-terminated.
 */
typedef void (*field_cb)(void *data, const char *field, size_t flen, const char *value, size_t vlen);

/* Parser state plus the callbacks invoked while the machine runs. */
typedef struct http_parser { 
  int cs;               /* current state of the Ragel machine */
  size_t body_start;    /* buffer offset recorded by the `done` action */
  int content_len;      /* length of an XML/JSON socket message, trailing NUL included */
  size_t nread;         /* bytes consumed by the last http_parser_execute() call */
  size_t mark;          /* buffer offset where the current element starts */
  size_t field_start;   /* buffer offset where the current header name starts */
  size_t field_len;     /* length of the current header name */
  size_t query_start;   /* buffer offset where the query string starts */
  int xml_sent;         /* set to 1 once an XML socket message was recognised */
  int json_sent;        /* set to 1 once a JSON socket message was recognised */

  void *data;           /* opaque user pointer, passed as first argument to every callback */

  int uri_relaxed;      /* non-zero: also accept "{", "}" and "^" inside URIs */
  field_cb http_field;        /* called for each complete header line */
  element_cb request_method;  /* called with the request method */
  element_cb request_uri;     /* called with the request URI */
  element_cb fragment;        /* called with the URI fragment */
  element_cb request_path;    /* called with the path part of the URI */
  element_cb query_string;    /* called with the query string */
  element_cb http_version;    /* called with the "HTTP/x.y" token */
  element_cb header_done;     /* called after the blank line ending the headers */
} http_parser;

/* Reset the machine state and bookkeeping offsets of `parser`. */
int http_parser_init(http_parser *parser);
/* Returns -1 if the parser is in the error state, 1 if it reached a final state, 0 otherwise. */
int http_parser_finish(http_parser *parser);
/* Feed data[off .. len) to the parser. */
size_t http_parser_execute(http_parser *parser, const char *data, size_t len, size_t off);
/* Non-zero when the machine is in its error state. */
int http_parser_has_error(http_parser *parser);
/* Non-zero when the machine is in a final (accepting) state. */
int http_parser_is_finished(http_parser *parser);

/* Number of bytes consumed by the last http_parser_execute() call. */
#define http_parser_nread(parser) ((parser)->nread)

#endif /* http_parser_h */

#include "http_parser.h"
#include <stdio.h>
#include <assert.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>

/*
 * Helpers used inside the Ragel actions. They rely on two names being in
 * scope at the point of use: `buffer` (start of the input) and `parser`.
 *
 * LEN(AT, FPC)  - distance from the offset stored in parser->AT to FPC
 * MARK(M, FPC)  - store the offset of FPC into parser->M
 * PTR_TO(F)     - pointer into the buffer at the offset stored in parser->F
 *
 * FPC is parenthesised so that an expression argument keeps its grouping.
 */
#define LEN(AT, FPC) ((FPC) - buffer - parser->AT)
#define MARK(M,FPC) (parser->M = (FPC) - buffer)
#define PTR_TO(F) (buffer + parser->F)

/** Machine **/

%%{

  machine http_parser;

  # Remember where the current element starts (offset into the buffer).
  action mark {MARK(mark, fpc); }

  # Header name: remember its start, then its length once it ends.
  action start_field { MARK(field_start, fpc); }
  action write_field { 
    parser->field_len = LEN(field_start, fpc);
  }

  # Header value: starts at `mark`; report name and value together when it ends.
  action start_value { MARK(mark, fpc); }

  action write_value {
    if(parser->http_field != NULL) {
      parser->http_field(parser->data, PTR_TO(field_start), parser->field_len, PTR_TO(mark), LEN(mark, fpc));
    }
  }

  action request_method { 
    if(parser->request_method != NULL) 
      parser->request_method(parser->data, PTR_TO(mark), LEN(mark, fpc));
  }

  action request_uri { 
    if(parser->request_uri != NULL)
      parser->request_uri(parser->data, PTR_TO(mark), LEN(mark, fpc));
  }

  action fragment {
    if(parser->fragment != NULL)
      parser->fragment(parser->data, PTR_TO(mark), LEN(mark, fpc));
  }

  # The query string has its own start offset so `mark` keeps pointing at the URI.
  action start_query {MARK(query_start, fpc); }
  action query_string { 
    if(parser->query_string != NULL)
      parser->query_string(parser->data, PTR_TO(query_start), LEN(query_start, fpc));
  }

  action http_version {	
    if(parser->http_version != NULL)
      parser->http_version(parser->data, PTR_TO(mark), LEN(mark, fpc));
  }

  action request_path {
    if(parser->request_path != NULL)
      parser->request_path(parser->data, PTR_TO(mark), LEN(mark,fpc));
  }

  # Runs on the last character of a complete request / socket message.
  action done {
      if(parser->xml_sent || parser->json_sent) {
        parser->body_start = PTR_TO(mark) - buffer;
        // +1 includes the \0
        parser->content_len = fpc - buffer - parser->body_start + 1;
      } else {
        parser->body_start = fpc - buffer + 1;

        if(parser->header_done != NULL) {
          // Everything after the current character is handed over unparsed.
          parser->header_done(parser->data, fpc + 1, pe - fpc - 1);
        }
      }
      // Stop here: the machine defines nothing after a complete message, so
      // scanning on into following bytes would drive it into the error state.
      fbreak;
  }

  action xml {
      parser->xml_sent = 1;
  }

  action json {
      parser->json_sent = 1;
  }

  CRLF = ( "\r\n" | "\n" ) ;

  # URI description as per RFC 3986.

  # "{", "}" and "^" are only accepted when parser->uri_relaxed is non-zero.
  more_delims   = ( "{" | "}" | "^" ) when { parser->uri_relaxed } ;
  sub_delims    = ( "!" | "$" | "&" | "'" | "(" | ")" | "*"
                  | "+" | "," | ";" | "=" | more_delims ) ;
  gen_delims    = ( ":" | "/" | "?" | "#" | "[" | "]" | "@" ) ;
  reserved      = ( gen_delims | sub_delims ) ;
  unreserved    = ( alpha | digit | "-" | "." | "_" | "~" ) ;

  pct_encoded   = ( "%" xdigit xdigit ) ;

# pchar         = ( unreserved | pct_encoded | sub_delims | ":" | "@" ) ;
# (any -- ascii) is added so that non-ASCII bytes (e.g. Chinese text) are accepted
  pchar         = ( (any -- ascii) | unreserved | pct_encoded | sub_delims | ":" | "@" ) ;

  fragment      = ( ( pchar | "/" | "?" )* ) >mark %fragment ;

  query         = ( ( pchar | "/" | "?" )* ) %query_string ;

  # non_zero_length segment without any colon ":" ) ;
  segment_nz_nc = ( ( unreserved | pct_encoded | sub_delims | "@" )+ ) ;
  segment_nz    = ( pchar+ ) ;
  segment       = ( pchar* ) ;

  path_empty    = ( pchar{0} ) ;
  path_rootless = ( segment_nz ( "/" segment )* ) ;
  path_noscheme = ( segment_nz_nc ( "/" segment )* ) ;
  path_absolute = ( "/" ( segment_nz ( "/" segment )* )? ) ;
  path_abempty  = ( ( "/" segment )* ) ;

  path          = ( path_abempty    # begins with "/" or is empty
                  | path_absolute   # begins with "/" but not "//"
                  | path_noscheme   # begins with a non-colon segment
                  | path_rootless   # begins with a segment
                  | path_empty      # zero characters
                  ) ;

  reg_name      = ( unreserved | pct_encoded | sub_delims )* ;

  dec_octet     = ( digit                 # 0-9
                  | ("1"-"9") digit         # 10-99
                  | "1" digit{2}          # 100-199
                  | "2" ("0"-"4") digit # 200-249
                  | "25" ("0"-"5")      # 250-255
                  ) ;

  IPv4address   = ( dec_octet "." dec_octet "." dec_octet "." dec_octet ) ;
  h16           = ( xdigit{1,4} ) ;
  ls32          = ( ( h16 ":" h16 ) | IPv4address ) ;

  # NOTE(review): the counts below are written in RFC 3986 ABNF style. In Ragel
  # a bare integer is a literal byte value, not a repetition count, so
  # `6( h16 ":" )` presumably does not mean "six repetitions" - confirm whether
  # `( h16 ":" ){6}` was intended before relying on IPv6 literals.
  IPv6address   = (                               6( h16 ":" ) ls32
                  |                          "::" 5( h16 ":" ) ls32
                  | (                 h16 )? "::" 4( h16 ":" ) ls32
                  | ( ( h16 ":" ){1,} h16 )? "::" 3( h16 ":" ) ls32
                  | ( ( h16 ":" ){2,} h16 )? "::" 2( h16 ":" ) ls32
                  | ( ( h16 ":" ){3,} h16 )? "::"    h16 ":"   ls32
                  | ( ( h16 ":" ){4,} h16 )? "::"              ls32
                  | ( ( h16 ":" ){5,} h16 )? "::"              h16
                  | ( ( h16 ":" ){6,} h16 )? "::"
                  ) ;

  IPvFuture     = ( "v" xdigit+ "." ( unreserved | sub_delims | ":" )+ ) ;

  IP_literal    = ( "[" ( IPv6address | IPvFuture  ) "]" ) ;

  port          = ( digit* ) ;
  host          = ( IP_literal | IPv4address | reg_name ) ;
  userinfo      = ( ( unreserved | pct_encoded | sub_delims | ":" )* ) ;
  authority     = ( ( userinfo "@" )? host ( ":" port )? ) ;

  scheme        = ( alpha ( alpha | digit | "+" | "-" | "." )* ) ;

  relative_part = ( "//" authority path_abempty
                  | path_absolute
                  | path_noscheme
                  | path_empty
                  ) ;

  hier_part     = ( "//" authority path_abempty
                  | path_absolute
                  | path_rootless
                  | path_empty
                  ) ;

  absolute_URI  = ( scheme ":" hier_part ( "?" query )? ) ;

  relative_ref  = ( (relative_part %request_path ( "?" %start_query query )?) >mark %request_uri ( "#" fragment )? ) ;
  URI           = ( scheme ":" (hier_part  %request_path ( "?" %start_query query )?) >mark %request_uri ( "#" fragment )? ) ;

  URI_reference = ( URI | relative_ref ) ;

# HTTP header parsing
  Method = ( upper | digit ){1,20} >mark %request_method;

  http_number = ( "1." ("0" | "1") ) ;
  HTTP_Version = ( "HTTP/" http_number ) >mark %http_version ;
  Request_Line = ( Method " " URI_reference " " HTTP_Version CRLF ) ;

  HTTP_CTL = (0 - 31) | 127 ;
  HTTP_separator = ( "(" | ")" | "<" | ">" | "@"
                   | "," | ";" | ":" | "\\" | "\""
                   | "/" | "[" | "]" | "?" | "="
                   | "{" | "}" | " " | "\t"
                   ) ;

  lws = CRLF? (" " | "\t")+ ;
  token = ascii -- ( HTTP_CTL | HTTP_separator ) ;
  content = ((any -- HTTP_CTL) | lws);

  field_name = ( token )+ >start_field %write_field;

  field_value = content* >start_value %write_value;

  message_header = field_name ":" lws* field_value :> CRLF;

  Request = Request_Line ( message_header )* ( CRLF );

  # Socket-style messages: "@path {json}\0" or "<tag ...>...\0".
  SocketJSONStart = ("@" relative_part);
  SocketJSONData = "{" any* "}" :>> "\0";

  SocketXMLData = ("<" [a-z0-9A-Z\-.]+) >mark %request_path ("/" | space | ">") any* ">" :>> "\0";

  SocketJSON = SocketJSONStart >mark %request_path " " SocketJSONData >mark @json;
  SocketXML = SocketXMLData @xml;

  SocketRequest = (SocketXML | SocketJSON);

main := (Request | SocketRequest) @done;

}%%


/** Data **/
%% write data;

/**
 * Put the parser into the machine's start state and clear all offsets and
 * flags that the actions maintain.
 *
 * The callback pointers, `data` and `uri_relaxed` are not touched; the caller
 * sets those itself.
 *
 * Always returns 1.
 */
int http_parser_init(http_parser *parser) {
  int cs = 0;
  %% write init;
  parser->cs = cs;
  parser->body_start = 0;
  parser->content_len = 0;
  parser->mark = 0;
  parser->nread = 0;
  parser->field_len = 0;
  parser->field_start = 0;
  parser->query_start = 0;
  parser->xml_sent = 0;
  parser->json_sent = 0;

  return 1;
}


/** exec **/
/**
 * Run the state machine over buffer[off .. len).
 *
 * parser - parser previously set up with http_parser_init()
 * buffer - start of the input; all offsets stored in `parser` are relative to it
 * len    - total number of valid bytes in `buffer`
 * off    - offset at which parsing starts (must be <= len)
 *
 * The per-call offsets (nread, mark, field_start, field_len) are reset on
 * entry. The resulting machine state is stored in parser->cs unless the parser
 * was already in the error state when the call started.
 *
 * Returns the number of bytes consumed by this call (also available through
 * parser->nread), or 0 when `len` is 0.
 */
size_t http_parser_execute(http_parser *parser, const char *buffer, size_t len, size_t off)
{
  const char *p, *pe;
  int cs;

  if(len == 0) return 0;

  parser->nread = 0;
  parser->mark = 0;
  parser->field_len = 0;
  parser->field_start = 0;
  cs = parser->cs;

  assert(off <= len && "offset past end of buffer");

  p = buffer+off;
  pe = buffer+len;

  /* off <= len was asserted above, so len - off cannot wrap. */
  assert((size_t)(pe - p) == len - off && "pointers aren't same distance");

  %% write exec;

  assert(p <= pe && "Buffer overflow after parsing.");

  if (!http_parser_has_error(parser)) {
      parser->cs = cs;
  }

  parser->nread += p - (buffer + off);

  assert(parser->nread <= len && "nread longer than length");
  assert(parser->body_start <= len && "body starts after buffer end");
  assert(parser->mark < len && "mark is after buffer end");
  assert(parser->field_len <= len && "field has length longer than whole buffer");
  assert(parser->field_start < len && "field starts after buffer end");

  return parser->nread;
}


/**
 * Summarise the parser's state.
 *
 * Returns -1 when the machine is in the error state, 1 when it has reached a
 * final state, and 0 otherwise.
 */
int http_parser_finish(http_parser *parser)
{
  if (http_parser_has_error(parser) ) {
    return -1;
  } else if (http_parser_is_finished(parser) ) {
    return 1;
  } else {
    return 0;
  }
}

/** Returns non-zero when the stored machine state is the generated error state. */
int http_parser_has_error(http_parser *parser) {
  return parser->cs == http_parser_error;
}

/** Returns non-zero when the stored machine state is at or past the first final state. */
int http_parser_is_finished(http_parser *parser) {
  return parser->cs >= http_parser_first_final;
}

ragel 编译:ragel http_parser.rl -o http_parser.cpp

#include "http_parser.h"
#include <string.h>
#include <string>
#include <iostream>
using namespace std;

// Prints the request method reported by the parser.
// `at` is a slice of the input buffer and is not NUL-terminated, so exactly
// `length` bytes are printed straight from it; no fixed-size copy is made,
// which keeps slices of any length safe.
void on_request_method(void *data, const char *at, size_t length)
{
    (void)data;
    printf("%s() %.*s\n", __func__, (int)length, at);
}

// Prints the request URI reported by the parser.
// `at` is a slice of the input buffer and is not NUL-terminated, so exactly
// `length` bytes are printed straight from it; no fixed-size copy is made,
// which keeps slices of any length safe.
void on_request_uri(void *data, const char *at, size_t length)
{
    (void)data;
    printf("%s() %.*s\n", __func__, (int)length, at);
}

// Prints the URI fragment reported by the parser.
// `at` is a slice of the input buffer and is not NUL-terminated, so exactly
// `length` bytes are printed straight from it; no fixed-size copy is made,
// which keeps slices of any length safe.
void on_request_fragment(void *data, const char *at, size_t length)
{
    (void)data;
    printf("%s() %.*s\n", __func__, (int)length, at);
}

// Prints the request path reported by the parser.
// `at` is a slice of the input buffer and is not NUL-terminated, so exactly
// `length` bytes are printed straight from it; no fixed-size copy is made,
// which keeps slices of any length safe.
void on_request_path(void *data, const char *at, size_t length)
{
    (void)data;
    printf("%s() %.*s\n", __func__, (int)length, at);
}

// Prints the query string reported by the parser.
// `at` is a slice of the input buffer and is not NUL-terminated, so exactly
// `length` bytes are printed straight from it; no fixed-size copy is made,
// which keeps slices of any length safe.
void on_request_query(void *data, const char *at, size_t length)
{
    (void)data;
    printf("%s() %.*s\n", __func__, (int)length, at);
}

// Prints the HTTP version token reported by the parser.
// `at` is a slice of the input buffer and is not NUL-terminated, so exactly
// `length` bytes are printed straight from it; no fixed-size copy is made,
// which keeps slices of any length safe.
void on_request_version(void *data, const char *at, size_t length)
{
    (void)data;
    printf("%s() %.*s\n", __func__, (int)length, at);
}

// Called once the header section is complete; prints whatever follows it.
// The parser passes the entire unparsed remainder of the buffer here, so
// `length` can be arbitrarily large (or zero). The bytes are printed straight
// from `at` without copying them into a fixed-size buffer.
void on_request_header_done(void *data, const char *at, size_t length)
{
    (void)data;
    printf("%s() %.*s\n", __func__, (int)length, at);
}

// Prints one header line as "[name: value]".
// Both slices point into the input buffer and are not NUL-terminated, so they
// are copied into std::string objects before being handed to printf.
void on_request_http_field(void *data, const char *field, size_t flen, const char *value, size_t vlen)
{
    (void)data;
    const std::string field_name(field, flen);
    const std::string field_value(value, vlen);

    printf("%s() [%s: %s]\n", __func__, field_name.c_str(), field_value.c_str());
}

int main(int argc, char **argv)
    const char *request = 
        "GET HTTP/1.1\r\n"
        "Proxy-Connection: keep-alive\r\n"
        "Pragma: no-cache\r\n"
        "Cache-Control: no-cache\r\n"
        "Accept: text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01\r\n"
        "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36\r\n"
        "X-Requested-With: XMLHttpRequest\r\n"
        "Accept-Encoding: gzip, deflate\r\n"
        "Accept-Language: zh-CN,zh;q=0.9\r\n\r\n";

    http_parser parser;
    parser.request_method = on_request_method;
    parser.request_uri = on_request_uri;
    parser.fragment = on_request_fragment;
    parser.request_path = on_request_path;
    parser.query_string = on_request_query;
    parser.http_version = on_request_version;
    parser.header_done = on_request_header_done;
    parser.http_field = on_request_http_field; = nullptr;

    http_parser_execute(&parser, request, strlen(request), 0);
    return 0;

g++ -o http_parser -std=c++11 main.cpp http_parser.cpp


