如何使用Boost Regex标记C++

How to tokenize C++ using Boost Regex

本文关键字：标记 C++ Regex Boost 如何使用　更新时间：2023-10-16

我目前正在为一个使用boost regex的类开发标记化器（tokenizer）。我对boost不太熟悉，所以我的做法可能完全不对，但无论如何，以下是我正在使用的代码：

// Delimiter pattern: a run of one or two whitespace or punctuation characters.
// The backslash is doubled so that the regex engine (not the C++ compiler)
// receives the \s escape.
regex re("[\\s*,()=;<>+-]{1,2}");
// i walks the text BETWEEN delimiter matches (submatch index -1); j is the
// matching end-of-sequence iterator.
sregex_token_iterator i(text.begin(), text.end(), re, -1);
sregex_token_iterator j;
// begin walks the delimiter matches themselves, so punctuation can still be
// reported as tokens; end is its end-of-sequence iterator.
sregex_token_iterator begin(text.begin(), text.end(), re), end;
unsigned count = 0;
while(i != j)
{
    // Skip pieces that are exactly one space or one newline.
    if(*i != ' ' && *i != '\n')
    {
        count++;
        cout << "From i - " << count << "   " << *i << endl;
    }
    i++;
    // The delimiter sequence can run out before the between-delimiter
    // sequence does, so never dereference or advance it once it hits end.
    if(begin != end)
    {
        if(*begin != ' ' && *begin != '\n')
        {
            count++;
            cout << "Form j - " << count << "   " << *begin << endl;
        }
        begin++;
    }
}
cout << "There were " << count << " tokens found." << endl;

所以，基本上，我使用空格和符号作为分隔符，但我仍然输出这两者（因为我仍然希望符号是标记）。就像我说的，我对Boost不是很熟悉，所以我不确定我是否采取了正确的方法。

我的最终目标是分割一个有简单c++代码块的文件并将其标记化，下面是我使用的示例文件：

#define MAX 5

int main(int argc)
{
    for(int i = 0; i < MAX; i ++)
    {
        cout << "i is equal to " << i << endl; 
    }
    return 0;
}

我遇到了麻烦，因为它将换行符和空格也作为标记进行计数，而我真的需要把它们丢弃。此外，我很难处理"++"标记，我似乎找不出能够匹配"++"的正确表达式。

如有任何帮助，我们将不胜感激！

谢谢！Tim

首先，

Boost有Boost Wave，它有（我认为有几个）现成的C++源代码标记器
Boost有Spirit-Lex，这是一个可以基于正则表达式模式和一些状态支持进行标记的lexer。它允许动态lexer表和静态生成的lexer表

如果你有兴趣使用Lex，我自己做了一个简单粗糙（quick & dirty）的练手示例：它对自身的源代码进行标记化，可在Coliru上在线运行（Live on Coliru）。

注：

Lex标记器可以很好地使用Boost Spirit Qi进行解析（尽管老实说，我更喜欢直接在源迭代器上执行Spirit语法）
它公开了一个迭代器接口，尽管我的示例利用回调接口来显示令牌：
```
// Tokenizes the file "main.cpp" and prints each token in square brackets,
// followed by a success/failure line and whatever input was left unconsumed.
int main()
{
    // Character iterator over the input stream.
    typedef boost::spirit::istream_iterator It;
    // Token type: values may be exposed as int or double in addition to the
    // default iterator range; the last argument turns on lexer-state support.
    typedef lex::lexertl::token<It, boost::mpl::vector<int, double>, boost::mpl::true_ > token_type;
    tokens<lex::lexertl::actor_lexer<token_type> > lexer;
    std::ifstream ifs("main.cpp");
    // Do not let the stream skip whitespace: the lexer must see every character.
    ifs >> std::noskipws;
    It first(ifs), last;
    // process_token is invoked once per recognized token.
    bool ok = lex::tokenize(first, last, lexer, process_token());
    std::cout << "\nTokenization " << (ok?"succeeded":"failed") << "; remaining input: '" << std::string(first,last) << "'\n";
}
```
它将自身标记化后的输出如下（已省略前面部分的输出）：
[int][main][(][)][{][typedef][boost][::][spirit][::][istream_iterator][It][;][typedef][lex][::][lexertl][::][token][<][It][,][boost][::][mpl][::][vector][<][int][,][double][>][,][boost][::][mpl][::][true_][>][token_type][;][tokens][<][lex][::][lexertl][::][actor_lexer][<][token_type][>][>][lexer][;][std][::][ifstream][ifs][(]["main.cpp"][)][;][ifs][>>][std][::][noskipws][;][It][first][(][ifs][)][,][last][;][bool][ok][=][lex][::][tokenize][(][first][,][last][,][lexer][,][process_token][(][)][)][;][std][::][cout][<<]["\nTokenization "][<<][(][ok][?]["succeeded"][:]["failed"][)][<<]["; remaining input: '"][<<][std][::][string][(][first][,][last][)][<<]["'\n"][;][}]
Tokenization succeeded; remaining input: ''
实际上，您应该使用一个不同的lexer状态来解析预处理器指令（在那里行尾变得有意义，并且另外几个表达式/关键字才是有效的）。在现实中，通常有一个单独的预处理步骤在这里进行自己的词法分析。（例如，在对include文件名进行词法分析时，就能看到这样做的后果。）
lexer中标记的排序对结果至关重要
在这个示例中，&总是被匹配为binop_标记。你可能更希望匹配一个ampersand_标记，然后在解析时再判断它是二元运算符（按位与）、一元运算符（取地址）还是引用类型限定符等。C++解析起来真的很“有趣”：|
支持评论
不支持双字符组/三字符组（digraphs/trigraphs）：）
不支持杂注、行/文件指令等

总而言之，如果你想制作一个简单的语法高亮器或格式化程序，这应该是非常有用的。除此之外的任何内容都需要更多的解析/语义分析。

完整列表：

#include <boost/spirit/include/support_istream_iterator.hpp>
#include <boost/spirit/include/lex_lexertl.hpp>
#include <fstream>
#include <sstream>    
#include <boost/lexical_cast.hpp>
namespace lex = boost::spirit::lex;
// Lexer definition for a simplified subset of C++.
//
// Each member below is one token definition. The constructor assigns a
// pattern to every definition and then registers them all, in order, with the
// lexer. Whitespace and comments are matched but dropped.
//
// Every backslash meant for the regex engine is written doubled, because the
// pattern first passes through C++ string-literal escaping.
template <typename Lexer>
struct tokens : lex::lexer<Lexer>
{
    tokens() 
    {
        // Preprocessor-related tokens.
        pound_   = "#";
        define_  = "define";
        if_      = "if";
        else_    = "else";
        endif_   = "endif";
        ifdef_   = "ifdef";
        ifndef_  = "ifndef";
        defined_ = "defined";
        // Keywords. Adjacent string literals are concatenated by the compiler,
        // so every line except the last has to end with '|'; otherwise two
        // alternatives fuse into one (e.g. "catchstatic").
        // The quotes around C are escaped for the regex engine so that the
        // alternative matches the text "C" including its quotes.
        keyword_ = "for|break|continue|while|do|switch|case|default|if|else|return|goto|throw|catch|"
                   "static|volatile|auto|void|int|char|signed|unsigned|long|double|float|"
                   "delete|new|virtual|override|final|"
                   "typename|template|using|namespace|extern|\\\"C\\\"|"
                   "friend|public|private|protected|"
                   "class|struct|enum|"
                   "register|thread_local|noexcept|constexpr";
        // Punctuation.
        scope_   = "::";
        dot_     = '.';
        arrow_   = "->";
        star_    = '*';
        popen_   = '(';
        pclose_  = ')';
        bopen_   = '{';
        bclose_  = '}';
        iopen_   = '[';
        iclose_  = ']';
        colon_   = ':';
        semic_   = ';';
        comma_   = ',';
        tern_q_  = '?';
        // Operators. '+' is a regex quantifier, so "++" needs escaping.
        relop_   = "==|!=|<=|>=|<|>";
        assign_  = '=';
        incr_    = "\\+\\+";
        decr_    = "--";
        binop_   = "[-+/%&|^]|>>|<<";
        unop_    = "[-+~!]";
        // Numeric literals and identifiers.
        real_    = "[-+]?[0-9]+(e[-+]?[0-9]+)?f?";
        int_     = "[-+]?[0-9]+";
        identifier_ = "[a-zA-Z_][a-zA-Z0-9_]*";
        // Whitespace and comments (registered below as ignored).
        ws_            = "[ \\t\\r\\n]";
        line_comment_  = "\\/\\/.*?[\\r\\n]";
        block_comment_ = "\\/\\*.*?\\*\\/";
        // SCHAR: one character inside a string/char literal -- either a
        // backslash escape (\xHH or one of \\ \" \' \0 \t \b \r \n) or any
        // character that is not a quote, apostrophe, CR or LF.
        this->self.add_pattern
            ("SCHAR", "\\\\(x[0-9a-fA-F][0-9a-fA-F]|[\\\\\"'0tbrn])|[^\"'\\r\\n]")
            ;
        // A bare apostrophe is allowed inside a string literal and a bare
        // double quote inside a character literal.
        string_lit = "\\\"('|{SCHAR})*?\\\"";
        char_lit   = "'(\\\"|{SCHAR})'";
        // Registration order is significant: earlier definitions are listed
        // first (e.g. the preprocessor words and keyword_ before identifier_).
        this->self += 
              pound_ | define_ | if_ | else_ | endif_ | ifdef_ | ifndef_ | defined_
            | keyword_ | scope_ | dot_ | arrow_ | star_ | popen_ | pclose_ | bopen_ | bclose_ | iopen_ | iclose_ | colon_ | semic_ | comma_ | tern_q_
            | relop_ | assign_ | incr_ | decr_ | binop_ | unop_
            | int_ | real_ | identifier_ | string_lit | char_lit
            // ignore whitespace and comments
            | ws_           [ lex::_pass = lex::pass_flags::pass_ignore ]
            | line_comment_ [ lex::_pass = lex::pass_flags::pass_ignore ]
            | block_comment_[ lex::_pass = lex::pass_flags::pass_ignore ] 
            ;
    }
  private:
    // Tokens without a dedicated attribute type.
    lex::token_def<> pound_, define_, if_, else_, endif_, ifdef_, ifndef_, defined_;
    lex::token_def<> keyword_, scope_, dot_, arrow_, star_, popen_, pclose_, bopen_, bclose_, iopen_, iclose_, colon_, semic_, comma_, tern_q_;
    lex::token_def<> relop_, assign_, incr_, decr_, binop_, unop_;
    // Numeric literals declare int / double attribute types.
    lex::token_def<int> int_;
    lex::token_def<double> real_;
    lex::token_def<> identifier_, string_lit, char_lit;
    // Ignored tokens carry no attribute at all.
    lex::token_def<lex::omit> ws_, line_comment_, block_comment_;
};
// Visitor that turns a token's value into a printable std::string.
struct token_value : boost::static_visitor<std::string>
{
    // The token value can be a variant over any of the exposed attribute
    // types: re-dispatch on whichever alternative it currently holds.
    template <typename... Alternatives>
    std::string operator()(boost::variant<Alternatives...> const& value) const {
        return boost::apply_visitor(*this, value);
    }
    // The default value is a pair of iterators into the source sequence:
    // copy the characters they delimit.
    template <typename Iterator>
    std::string operator()(boost::iterator_range<Iterator> const& range) const {
        return std::string(range.begin(), range.end());
    }
    // Any other attribute type is rendered as attr<type-name>(value).
    // Presumably not taken unless the tokens are used in Spirit Qi rules.
    template <typename Attribute>
    std::string operator()(Attribute const& attribute) const { 
        std::string rendered("attr<");
        rendered += typeid(attribute).name();
        rendered += ">(";
        rendered += boost::lexical_cast<std::string>(attribute);
        rendered += ")";
        return rendered;
    }
};
// Callback handed to lex::tokenize: prints each token's text as "[text]".
struct process_token
{
    template <typename Token>
    bool operator()(Token const& token) const {
        // The numeric id could be printed as well: token.id() << ":"
        std::cout << '[';
        std::cout << print(token.value());
        std::cout << "]";
        // Always report success for this token.
        return true;
    }
    // Visitor used to convert the token value to text.
    token_value print;
};
#if 0
// Returns the entire contents of the file named fname as a single string.
// If the file cannot be opened or is empty, the result is an empty string.
std::string read(std::string fname)
{
    std::ostringstream contents;
    {
        std::ifstream source(fname);
        // Stream the whole file buffer into the string stream in one go.
        contents << source.rdbuf();
    }
    return contents.str();
}
#endif
// Tokenizes the file "main.cpp" and prints each token in square brackets,
// followed by a success/failure line and whatever input was left unconsumed.
int main()
{
    // Character iterator over the input stream.
    typedef boost::spirit::istream_iterator It;
    // Token type: values may be exposed as int or double in addition to the
    // default iterator range; the last argument turns on lexer-state support.
    typedef lex::lexertl::token<It, boost::mpl::vector<int, double>, boost::mpl::true_ > token_type;
    tokens<lex::lexertl::actor_lexer<token_type> > lexer;
    std::ifstream ifs("main.cpp");
    // Do not let the stream skip whitespace: the lexer must see every character.
    ifs >> std::noskipws;
    It first(ifs), last;
    // process_token is invoked once per recognized token.
    bool ok = lex::tokenize(first, last, lexer, process_token());
    std::cout << "\nTokenization " << (ok?"succeeded":"failed") << "; remaining input: '" << std::string(first,last) << "'\n";
}