让 Boost 分词器将带引号的字符串视为一个标记

Boost tokenizer to treat quoted string as one token

本文关键字：一个字符串分词｜更新时间：2023-10-16

有没有办法让 Boost 分词器拆分下面的字符串，同时不把带引号的部分拆开？

string s = "1st 2nd \"3rd with some comment\" 4th";
Expected output:
1st
2nd
3rd with some comment
4th

您可以使用分词器库中的escaped_list_separator。有关如何将其应用于您的问题的更多详细信息，请参阅此问题。

C++11 解决方案

#include <iostream>
#include <string>
#include <vector>
/// Splits `str` into tokens separated by single space characters, treating a
/// double-quoted run as part of one token (spaces inside quotes are kept).
///
/// - Quote characters themselves are never copied into a token.
/// - A closing quote always emits the pending text as a token, even when that
///   text is empty (so `""` yields an empty token).
/// - An opening quote does not flush pending text, so `ab"c d"` yields `abc d`.
/// - Only ' ' separates tokens; other whitespace is ordinary token content.
/// - Text still pending at the end of input (including an unterminated quoted
///   run) is emitted if it is non-empty.
///
/// @param str  Input line to split.
/// @return     Tokens in the order they appear in `str`.
std::vector<std::string> tokenize(const std::string& str) {
    std::vector<std::string> result;
    std::string pending;
    bool quoted = false;

    for (const char ch : str) {
        switch (ch) {
        case '"':
            // Leaving a quoted run terminates the current token unconditionally.
            if (quoted) {
                result.push_back(pending);
                pending.clear();
            }
            quoted = !quoted;
            break;

        case ' ':
            if (quoted) {
                // Spaces inside quotes belong to the token.
                pending.push_back(ch);
            } else if (!pending.empty()) {
                // Outside quotes a space ends the token; runs of spaces
                // produce no empty tokens.
                result.push_back(pending);
                pending.clear();
            }
            break;

        default:
            pending.push_back(ch);
            break;
        }
    }

    // Flush whatever is left after the last character.
    if (!pending.empty()) {
        result.push_back(pending);
    }
    return result;
}
/// Demo driver: tokenizes a sample line containing a quoted section and
/// prints each resulting token on its own line.
int main() {
    // The inner quotes must be escaped; unescaped they end the string literal
    // early and the program does not compile.
    const std::string s = "1st 2nd \"3rd with some comment\" 4th";
    const std::vector<std::string> tokens = tokenize(s);
    for (const auto& token : tokens) {
        // One token per line (a real newline, not the letter 'n').
        std::cout << token << '\n';
    }
}

试试这段代码，这样你就可以避免使用Boost.Tokenizer和Boost.Spirit库

#include <vector>
#include <string>
#include <iostream>
// Characters that delimit tokens outside of quotes: space and horizontal tab.
const char Separators[] = { ' ', '\t' };

/// Returns true when `Ch` is one of the characters listed in `Separators`.
bool Str_IsSeparator( const char Ch )
{
    for ( const char Candidate : Separators )
    {
        if ( Candidate == Ch ) { return true; }
    }
    return false;
}
/// Splits `Str` into separator-delimited tokens and stores tokens number
/// `FromToken` .. `ToToken` (1-based, inclusive) into `Components`.
///
/// A token that begins with a double quote extends to the next double quote
/// (or to the end of the string); the quotes themselves are not stored.
/// Any other token extends to the next separator as defined by
/// Str_IsSeparator(). Empty tokens (e.g. `""`) are skipped and not counted.
///
/// Storage layout: token number k is written to
/// `Components[k - FromToken + 1]`, i.e. with `FromToken == 1` the first token
/// lands in slot 1 and slot 0 is left untouched.
///
/// @param FromToken   Number of the first token to store (1-based).
/// @param ToToken     Number of the last token to store; scanning stops once
///                    this token has been stored.
/// @param Str         Line to split.
/// @param Components  Output slots. Existing elements that are not written
///                    keep their value; the vector is grown when a target
///                    slot lies beyond its current size.
void SplitLine( size_t FromToken, size_t ToToken, const std::string& Str, std::vector<std::string>& Components /*, bool ShouldTrimSpaces*/ )
{
    size_t TokenNum = 0;
    const char* CStr = Str.c_str();

    while ( *CStr )
    {
        // bypass spaces & delimiting chars
        while ( *CStr && Str_IsSeparator( *CStr ) ) { CStr++; }
        if ( !*CStr ) { return; }

        const bool InsideQuotes = ( *CStr == '"' );
        // step over the opening quote so it is not part of the token
        if ( InsideQuotes ) { CStr++; }

        // find one-past-the-end of the token
        const char* CStrj = CStr;
        if ( InsideQuotes )
        {
            while ( *CStrj && *CStrj != '"' ) { CStrj++; }
        }
        else
        {
            while ( *CStrj && !Str_IsSeparator( *CStrj ) ) { CStrj++; }
        }

        // extract token (empty tokens are neither counted nor stored)
        if ( CStr != CStrj )
        {
            TokenNum++;
            // store each token found inside the requested range
            if ( TokenNum >= FromToken )
            {
                // Same slot as TokenNum - (FromToken - 1), but computed
                // without the wrapping intermediate when FromToken == 0.
                const size_t Slot = TokenNum - FromToken + 1;
                // Writing past the end would be undefined behaviour, so grow
                // the output instead of trusting the caller's pre-sizing.
                if ( Slot >= Components.size() ) { Components.resize( Slot + 1 ); }
                Components[ Slot ].assign( CStr, CStrj );
                // if ( ShouldTrimSpaces ) { Str_TrimSpaces( &Components[ Slot ] ); }
                if ( TokenNum >= ToToken ) { return; }
            }
        }

        // Proceed to the next token: skip the terminating separator or
        // closing quote, handle EOL. This must also happen for an empty
        // quoted token; otherwise its closing quote would be re-read as an
        // opening quote and the following text swallowed into one token.
        CStr = CStrj;
        if ( *CStr ) { CStr++; }
    }
}
/// Demo driver: splits a sample line containing a quoted section into its
/// first four tokens and prints every output slot on its own line.
int main()
{
    // The inner quotes must be escaped; unescaped they end the string literal
    // early and the program does not compile.
    const std::string test = "1st 2nd \"3rd with some comment\" 4th";
    std::vector<std::string> Out;
    // Five slots: SplitLine(1, 4, ...) writes tokens 1..4 into Out[1]..Out[4];
    // Out[0] stays empty, so the first printed line is blank.
    Out.resize(5);
    SplitLine(1, 4, test, Out);
    for(size_t j = 0 ; j != Out.size() ; j++) { std::cout << Out[j] << std::endl; }
    return 0;
}

它使用一个预分配的字符串数组（下标不是从零开始的，但这很容易修改），而且非常简单。