Spirit Lex:哪个令牌定义生成了此令牌

Spirit Lex: Which token definition generated this token?

本文关键字:令牌 定义 Spirit Lex      更新时间:2023-10-16

抱歉,如果这是一个新手问题,但我需要知道哪个令牌定义产生了某个令牌。当我打印令牌 ID 时,我只得到一个整数。我需要知道哪个正则表达式生成了此令牌。

编辑:

以下是我定义令牌的方式:

   /// Token table for a Spirit.Lex lexer.
   ///
   /// Declares the token definitions (whitespace, comments, literals and
   /// operators), registers them with the lexer and keeps a running line
   /// counter that is updated from the whitespace/comment semantic actions.
   ///
   /// @tparam LexerT  the concrete lexer type passed on to lex::lexer<>.
   template <typename LexerT>
   class Tokens: public lex::lexer<LexerT>
   {
      public:
         /// Builds all token definitions and attaches them to the lexer.
         /// @param input  not used by the visible part of this constructor.
         Tokens(const std::string& input):
            lineNo_(1)
         {
            using boost::spirit::lex::_start;
            using boost::spirit::lex::_end;
            using boost::spirit::lex::_pass;
            using boost::phoenix::ref;
            using boost::phoenix::construct;
            // macros
            // NOTE: every regex backslash has to be doubled inside a C++
            // string literal ("\\d" reaches the regex engine as \d).
            this->self.add_pattern
               ("EXP",     "(e|E)(\\+|-)?\\d+")
               ("SUFFIX",  "[yzafpnumkKMGTPEZY]")
               ("INTEGER", "-?\\d+")
               ("FLOAT",    "-?(((\\d+)|(\\d*\\.\\d+)|(\\d+\\.\\d*))({EXP}|{SUFFIX})?)")
               ("SYMBOL",  "[a-zA-Z_?@](\\w|\\?|@)*")
               ("STRING",  "\\\"([^\\\"]|\\\\\\\")*\\\"");
            // whitespaces and comments
            whitespaces_ = "\\s+";
            comments_    = "(;[^\\n]*\\n)|(\\/\\*[^*]*\\*+([^/*][^*]*\\*+)*\\/)";
            // literals
            integer_ = "{INTEGER}";
            float_   = "{FLOAT}";
            symbol_  = "{SYMBOL}";
            string_  = "{STRING}";
            // operators
            quote_         = "'";
            backquote_     = '`';
            // ... other tokens
            // whitespace and comment rules: add the number of newline
            // characters in the matched text to lineNo_, then flag the match
            // with pass_ignore.
            this->self += whitespaces_ [ref(lineNo_) += count(_start, _end, '\n'), _pass = lex::pass_flags::pass_ignore];
            this->self += comments_    [ref(lineNo_) += count(_start, _end, '\n'), _pass = lex::pass_flags::pass_ignore];
            // literal rules
            this->self += integer_ | float_ | string_ | symbol_;
            // this->self += ... other tokens
         }
         ~Tokens() {}
         /// @return the current line number (starts at 1).
         size_t lineNo() { return lineNo_; }

      private:
         // ignored tokens
         lex::token_def<lex::omit> whitespaces_, comments_;
         // literal tokens
         lex::token_def<int>          integer_;
         lex::token_def<std::string>  float_, symbol_, string_;
         // operator tokens
         lex::token_def<> quote_, backquote_;
         // ... other token definitions of type lex::token_def<>
         // current line number
         size_t lineNo_;
   };

谢谢,Haitham

从文档中 http://www.boost.org/doc/libs/1_49_0/libs/spirit/doc/html/spirit/lex/tutorials/lexer_quickstart2.html:

为了确保每个令牌都被分配一个id,Spirit.Lex库在内部为令牌定义分配唯一的编号,从boost::spirit::lex::min_token_id定义的常量开始

因此,您实际上可以以增量方式获取令牌 ID。但是,为了使事情更加友好/健壮,我建议创建一个帮助程序函数来确定令牌的名称,因此您可以执行以下操作:

// Print every valid token: its id relative to lex::min_token_id, the name of
// the token definition that produced it, and its value.
while (iter != end && token_is_valid(*iter))
{
    std::cout << "Token: " <<
       (iter->id() - lex::min_token_id) << ": " <<
       toklexer.nameof(iter) << " ('" << iter->value() << "')\n";
    ++iter;
}
// Reaching `end` means the loop was not stopped by an invalid token.
if (iter == end) { std::cout << "lineNo: " << toklexer.lineNo() << "\n"; }

其中,对于以下输入:

// Sample input: a symbol, a quoted string, a block comment and two numbers,
// spread over three lines (two newline characters).
const std::string str = "symbol \"string\" \n"
    "this /* is a comment */\n"
    "31415926E-7 123";

将打印:

Token: 5: symbol_ ('symbol')
Token: 4: string_ ('"string"')
Token: 5: symbol_ ('this')
Token: 3: float_ ('31415926E-7')
Token: 2: integer_ ('123')
lineNo: 3

备注

  • 我认为不可能识别到模式表达式,因为一旦词法分析器返回令牌,信息就不会公开并且不再可用
  • 我印象中见过带有调试信息的令牌(类似于 qi::rule<>::name()?),但目前找不到相关文档。如果可以复用调试名称,Tokens::nameof(It) 函数的实现将大大简化。

代码

完整可运行的演示代码(针对 Boost 1_49 至 1_57、GCC -std=c++0x 略作调整):

在 Coliru 上在线运行(Live On Coliru)

#define BOOST_RESULT_OF_USE_DECLTYPE
#define BOOST_SPIRIT_USE_PHOENIX_V3
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>
#include <boost/spirit/include/lex_lexertl.hpp>
#include <boost/phoenix/function/adapt_callable.hpp>
namespace qi  = boost::spirit::qi;
namespace lex = boost::spirit::lex;
namespace phx = boost::phoenix;
///////////////////////////////////////////////////////////////////////////
// irrelevant for question: needed this locally to make it work with my boost
// version
namespace detail {
    // Function object that forwards to std::count; meant to be adapted into a
    // lazy Phoenix function.
    struct count {
        // Nested result-type template: the call operator always yields ptrdiff_t.
        template <typename First, typename Last, typename Value>
        struct result {
            typedef ptrdiff_t type;
        };

        // Returns how many elements in [first, last) compare equal to value.
        template <typename First, typename Last, typename Value>
        typename result<First, Last, Value>::type
        operator()(First first, Last last, Value const& value) const {
            typename result<First, Last, Value>::type const occurrences =
                std::count(first, last, value);
            return occurrences;
        }
    };
}
// Adapts the detail::count function object into a Phoenix function named
// `count` taking 3 arguments, so it can be called inside semantic actions.
BOOST_PHOENIX_ADAPT_CALLABLE(count, detail::count, 3);
///////////////////////////////////////////////////////////////////////////
/// Token table for a Spirit.Lex lexer.
///
/// Declares the token definitions (whitespace, comments, literals and
/// operators), registers them with the lexer, keeps a running line counter
/// and can map a token back to the name of the definition that produced it.
///
/// @tparam LexerT  the concrete lexer type passed on to lex::lexer<>.
template <typename LexerT>
   class Tokens: public lex::lexer<LexerT>
   {
      public:
         /// Builds all token definitions and attaches them to the lexer.
         Tokens():
            lineNo_(1)
         {
            using lex::_start;
            using lex::_end;
            using lex::_pass;
            using phx::ref;
            // macros
            // NOTE: every regex backslash has to be doubled inside a C++
            // string literal ("\\d" reaches the regex engine as \d).
            this->self.add_pattern
               ("EXP",     "(e|E)(\\+|-)?\\d+")
               ("SUFFIX",  "[yzafpnumkKMGTPEZY]")
               ("INTEGER", "-?\\d+")
               ("FLOAT",    "-?(((\\d+)|(\\d*\\.\\d+)|(\\d+\\.\\d*))({EXP}|{SUFFIX})?)")
               ("SYMBOL",  "[a-zA-Z_?@](\\w|\\?|@)*")
               ("STRING",  "\\\"([^\\\"]|\\\\\\\")*\\\"");
            // whitespaces and comments
            whitespaces_ = "\\s+";
            comments_    = "(;[^\\n]*\\n)|(\\/\\*[^*]*\\*+([^/*][^*]*\\*+)*\\/)";
            // literals
            integer_ = "{INTEGER}";
            float_   = "{FLOAT}";
            symbol_  = "{SYMBOL}";
            string_  = "{STRING}";
            // operators
            quote_         = "'";
            backquote_     = '`';
            // ... other tokens
            // whitespace and comment rules
            //this->self.add(whitespaces_, 1001)
                          //(comments_,    1002);
            // Add the number of newline characters in the matched text to
            // lineNo_, then flag the match with pass_ignore.
            this->self = whitespaces_ [phx::ref(lineNo_) += count(_start, _end, '\n'), _pass = lex::pass_flags::pass_ignore]
                       | comments_    [phx::ref(lineNo_) += count(_start, _end, '\n'), _pass = lex::pass_flags::pass_ignore];
            // literal rules
            this->self += integer_ | float_ | string_ | symbol_;
            // this->self += ... other tokens
         }
         /// Maps a token to the name of the token definition that produced it.
         ///
         /// Compares the token's id against the id of each known token_def.
         /// @param it  iterator (or pointer-like object) referring to a token.
         /// @return the member name of the matching definition, or "other"
         ///         when the id matches none of the definitions listed here.
         template <typename TokIter>
         std::string nameof(TokIter it)
         {
             if (it->id() == whitespaces_.id()) return "whitespaces_";
             if (it->id() == comments_.id())    return "comments_";
             if (it->id() == integer_.id())     return "integer_";
             if (it->id() == float_.id())       return "float_";
             if (it->id() == symbol_.id())      return "symbol_";
             if (it->id() == string_.id())      return "string_";
             if (it->id() == quote_.id())       return "quote_";
             if (it->id() == backquote_.id())   return "backquote_";
             return "other";
         }
         ~Tokens() {}
         /// @return the current line number (starts at 1).
         size_t lineNo() { return lineNo_; }

      private:
         // ignored tokens
         lex::token_def</*lex::omit*/> whitespaces_, comments_;
         // literal tokens
         lex::token_def<int>          integer_;
         lex::token_def<std::string>  float_, symbol_, string_;
         // operator tokens
         lex::token_def<> quote_, backquote_;
         // ... other token definitions of type lex::token_def<>
         // current line number
         size_t lineNo_;
   };
// Demo driver: tokenizes a fixed sample input and prints, for each token, its
// id relative to lex::min_token_id, the name of its token definition and its
// value, followed by the final line number.
int main()
{
    // Three lines of input (two newline characters).
    const std::string str = "symbol \"string\" \n"
        "this /* is a comment */\n"
        "31415926E-7 123";
    typedef lex::lexertl::token<char const*> token_type;
    typedef lex::lexertl::actor_lexer<token_type> lexer_type;
    Tokens<lexer_type> toklexer;
    char const* first = str.c_str();
    char const* last = &first[str.size()];
    lexer_type::iterator_type iter = toklexer.begin(first, last);
    lexer_type::iterator_type end = toklexer.end();
    while (iter != end && token_is_valid(*iter))
    {
        std::cout << "Token: " <<
           (iter->id() - lex::min_token_id) << ": " <<
           toklexer.nameof(iter) << " ('" << iter->value() << "')\n";
        ++iter;
    }
    // Reaching `end` means the loop was not stopped by an invalid token.
    if (iter == end) { std::cout << "lineNo: " << toklexer.lineNo() << "\n"; }
    else {
        // NOTE(review): presumably `first` has been advanced by the lexer, so
        // this prints the unconsumed input - confirm against begin()'s signature.
        std::string rest(first, last);
        std::cout << "Lexical analysis failed\n" << "stopped at: \""
            << rest << "\"\n";
    }
    return 0;
}