使用boost::spirit::qi解析带类型的CSV文件

parse typed csv file with boost::spirit::qi

本文关键字：输入 CSV 文件 boost spirit qi 使用；更新时间：2023-10-16

我想解析一个带类型值的CSV文件。每一列的类型在表头(第一行)中定义，例如:

int double double int unsigned
12  1.3    23445  1   42
45  46     47     48  49

结果数据结构可能是如下的二维向量:

// One column of parsed values: a vector of int, unsigned or double.
using ColumnType = boost::variant<
  std::vector<int>,
  std::vector<unsigned>,
  std::vector<double>
>;
// The whole table, stored as a sequence of columns.
using ResultType = std::vector<ColumnType>;

工作代码:

namespace phoenix = boost::phoenix;
namespace qi = boost::spirit::qi;
namespace ascii = boost::spirit::ascii;
// One column of parsed values: a vector of int, unsigned or double.
using ColumnType = boost::variant<
  std::vector<int>,
  std::vector<unsigned>,
  std::vector<double>
>;
// The whole table, stored as a sequence of columns.
using ResultType = std::vector<ColumnType>;
// Tag identifying the declared type of a column.
enum class CSVDataType {
  Int,
  UInt,
  Double
};
/// Qi grammar for a blank-separated table whose first line names the type of
/// every column ("int", "unsigned" or "double").
///
/// Parsed cells are appended to the `result` member, one typed vector per
/// column. Blanks are skipped (ascii::blank); rows are separated by qi::eol.
template<typename Iterator>
struct TypedCSVGrammar: qi::grammar<Iterator, ResultType(), ascii::blank_type> {
  /// Symbol table mapping the header keywords onto CSVDataType tags.
  struct types_: qi::symbols<char, CSVDataType> {
    types_() {
      add
        ("int", CSVDataType::Int)
        ("unsigned", CSVDataType::UInt)
        ("double", CSVDataType::Double);
    }
  } types;
  TypedCSVGrammar() :
    TypedCSVGrammar::base_type(csv, "csv")
  {
    using ascii::string;
    using namespace qi::labels;
    // Header line: zero or more type keywords.
    header %= *(types);
    // cell(_r1): _r1 is the column index. Every alternative is guarded by an
    // eps predicate on that column's declared type, so only the numeric parser
    // matching the column is attempted; its value goes to the matching add_*.
    cell =
      (
        qi::eps(phoenix::ref(column_types)[_r1] == phoenix::val(CSVDataType::Int))
        >> qi::int_ [phoenix::bind(&TypedCSVGrammar::add_int, this, _r1, _1)]
      ) | (
        qi::eps(phoenix::ref(column_types)[_r1] == phoenix::val(CSVDataType::UInt))
        >> qi::uint_ [phoenix::bind(&TypedCSVGrammar::add_uint, this, _r1, _1)]
      ) | (
        qi::eps(phoenix::ref(column_types)[_r1] == phoenix::val(CSVDataType::Double))
        >> qi::double_ [phoenix::bind(&TypedCSVGrammar::add_double, this, _r1, _1)]
      );
    // Row: reset the column cursor, then parse one cell per declared column,
    // advancing the cursor after each cell.
    row =
      qi::eps [phoenix::ref(column) = phoenix::val(0)]
      >> qi::repeat(phoenix::size(phoenix::ref(column_types))) [
        cell(phoenix::ref(column))
        >> qi::eps [phoenix::ref(column)++]
      ];
    // NOTE(review): qi::attr(result) appears to capture a copy of `result` as it
    // is right here (still empty); callers presumably read the `result` member
    // after parsing -- confirm.
    csv =
      header [phoenix::bind(&TypedCSVGrammar::construct_columns, this, _1)]
      > qi::eol
      > row % qi::eol
      > *qi::eol
      > qi::attr(result);
    // Report expectation failures on std::cout and make the parse fail.
    qi::on_error<qi::fail>
    (
        csv
      , std::cout
            << phoenix::val("Error! Expecting ")
            << _4                               // what failed?
            << phoenix::val(" here: \"")
            << phoenix::construct<std::string>(_3, _2)   // iterators to error-pos, end
            << phoenix::val("\"")
            << std::endl
    );
  }
  /// Appends `i` to column `c`; boost::get throws if that column does not hold ints.
  void add_int(std::size_t c, int i) {
    boost::get<std::vector<int>>(result[c]).push_back(i);
  }
  /// Appends `i` to column `c`; boost::get throws if that column does not hold unsigneds.
  void add_uint(std::size_t c, unsigned i) {
    boost::get<std::vector<unsigned>>(result[c]).push_back(i);
  }
  /// Appends `i` to column `c`; boost::get throws if that column does not hold doubles.
  void add_double(std::size_t c, double i) {
    boost::get<std::vector<double>>(result[c]).push_back(i);
  }
  /// Stores the parsed header and appends one empty, correctly typed vector to
  /// `result` for every column. `result` is not cleared first, so the grammar
  /// object is presumably meant for a single parse -- confirm.
  void construct_columns(const std::vector<CSVDataType>& columns) {
    column_types = columns;
    for (const auto& c : columns) {
      switch (c) {
      case CSVDataType::Int:
        result.push_back(std::vector<int>());
        break;
      case CSVDataType::UInt:
        result.push_back(std::vector<unsigned>());
        break;
      case CSVDataType::Double:
        result.push_back(std::vector<double>());
        break;
      }
    }
  }
  std::vector<CSVDataType> column_types;  // declared type of each column, from the header
  std::size_t column = 0;                 // index of the column currently being parsed
  ResultType result;                      // parsed data, one typed vector per column
  qi::rule<Iterator, ResultType(), ascii::blank_type> csv;
  qi::rule<Iterator, std::vector<CSVDataType>(), ascii::blank_type> header;
  qi::rule<Iterator, void(std::size_t), ascii::blank_type> cell;
  qi::rule<Iterator, void(), ascii::blank_type> row;
};

有更好的解决方案吗?我想使用不止3种类型(可能超过10种类型)。这将是大量的打字。

我不明白您为什么要提出这样一个人为的目标数据结构。如果索引不匹配，似乎会导致错误。

我可以在这里推荐一个Nabialek技巧吗?

如果你把AST改成下面这样，效果会很好:

// A single cell value: an int, an unsigned or a double.
using ValueType = boost::variant<int, unsigned, double>;
// The whole table as a vector of vectors of cell values.
using ResultType = std::vector<std::vector<ValueType>>;

(这似乎是一个更可取的方法)

简而言之，将列类型转换为解析器规则向量(std::vector<dynamic>)。

Live On Coliru

#define BOOST_SPIRIT_DEBUG
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>
namespace px    = boost::phoenix;
namespace qi    = boost::spirit::qi;
namespace ascii = boost::spirit::ascii;
// A single cell value: an int, an unsigned or a double.
using ValueType = boost::variant<int, unsigned, double>;
// The whole table as a vector of vectors of cell values.
using ResultType = std::vector<std::vector<ValueType>>;
// Tag identifying the declared type of a column.
enum class CSVDataType {
    Int,
    UInt,
    Double
};
namespace boost { namespace spirit { namespace qi { // FOR DEBUG
    // Stream inserters that print a fixed placeholder for a pointer to a rule
    // and for a vector of such pointers; the argument's value is never inspected.
    template <typename... Args>
    std::ostream& operator<<(std::ostream& out, rule<Args...> const* /*rule_ptr*/) {
        out << "(lazy rule)";
        return out;
    }

    template <typename... Args>
    std::ostream& operator<<(std::ostream& out, std::vector<rule<Args...> const*> const& /*rule_ptrs*/) {
        out << "(column mappings)";
        return out;
    }
} } }
/// Qi grammar for a blank-separated table whose header line names each
/// column's type ("int", "unsigned" or "double").
///
/// The header is parsed into a vector of pointers to cell parsers; every data
/// row then invokes those parsers lazily, in column order. The synthesized
/// attribute is a vector of rows, each a vector of ValueType.
template<typename Iterator, typename Skipper = ascii::blank_type>
struct TypedCSVGrammar: qi::grammar<Iterator, ResultType(), Skipper> {
    TypedCSVGrammar() : TypedCSVGrammar::base_type(start, "csv")
    {
        using namespace qi::labels;
        // Header: zero or more type keywords, each yielding a cell-parser pointer.
        header = *types;
        // _cols (local _a) holds the per-column parsers taken from the header;
        // _current (local _b) is the column cursor, set to 0 for every row.
        // NOTE(review): repeat receives (min, max) = (0, size), so a row may
        // hold fewer cells than the header declares, including none -- that is
        // presumably where the trailing empty row in the debug trace comes
        // from; confirm whether an exact count was intended.
        csv    = qi::omit[ header [ _cols = _1 ] ] > qi::eol
               > qi::repeat(_current=0, px::size(_cols)) [ qi::lazy(*_cols[_current++]) ] % qi::eol
               > *qi::eol
               ;
        start = csv;
        BOOST_SPIRIT_DEBUG_NODES((start)(csv)(header));
        // Report expectation failures on std::cout and make the parse fail.
        qi::on_error<qi::fail> (csv, px::ref(std::cout)
                    << "Error! Expecting " << _4                                  // what failed?
                    << " here: \""         << px::construct<std::string>(_3, _2)  // iterators to error-pos, end
                    << "\"\n"
            );
    }
  private:
    using cell_parser_t = qi::rule<Iterator, ValueType(), Skipper>;
    using dynamic       = cell_parser_t const*;
    /// Symbol table mapping each header keyword to the address of the rule
    /// that parses a cell of that type; the rules live inside the table.
    struct types_: qi::symbols<char, dynamic> {
        cell_parser_t
            int_cell    = qi::int_,
            uint_cell   = qi::uint_,
            double_cell = qi::double_;
        types_() {
            this->add
                ("int",      &int_cell)
                ("unsigned", &uint_cell)
                ("double",   &double_cell);
            BOOST_SPIRIT_DEBUG_NODES((int_cell)(uint_cell)(double_cell))
        }
    } types;
    // Locals of the csv rule: the column parsers (_a) and the column cursor (_b).
    using state = qi::locals<std::vector<dynamic>, size_t>;
    qi::_a_type _cols;
    qi::_b_type _current;
    qi::rule<Iterator, ResultType(),             Skipper> start;
    qi::rule<Iterator, std::vector<dynamic>(),   Skipper> header;
    qi::rule<Iterator, ResultType(),             Skipper, state>  csv;
};
/// Parses a typed table from standard input with TypedCSVGrammar, prints the
/// cells of every row separated by tabs, and reports any input left unparsed.
int main() {
    using It = boost::spirit::istream_iterator;
    // noskipws keeps the stream from discarding whitespace before the parser sees it.
    It f(std::cin >> std::noskipws), l;
    TypedCSVGrammar<It> g;
    ResultType data;
    bool ok = qi::phrase_parse(f, l, g, ascii::blank, data);
    if (ok) {
        std::cout << "Parse success\n";
        for(auto& row: data) {
            for(auto& cell: row) std::cout << cell << "\t";
            std::cout << "\n";
        }
    }
    else
        std::cout << "Parse failed\n";
    // phrase_parse advances f, so anything between f and l was not consumed.
    if (f!=l)
        std::cout << "Remaining unparsed: '" << std::string(f,l) << "'\n";
}

对于所示的输入，它输出

Parse success
12  1.3 23445   1   42  
45  46  47  48  49

以及如下调试信息:

<start>
  <try>int double double in</try>
  <csv>
    <try>int double double in</try>
    <header>
      <try>int double double in</try>
      <success>\n12  1.3    23445  1</success>
      <attributes>[[(lazy rule), (lazy rule), (lazy rule), (lazy rule), (lazy rule)]]</attributes>
    </header>
    <int_cell>
      <try>12  1.3    23445  1 </try>
      <success>  1.3    23445  1   </success>
      <attributes>[12]</attributes>
    </int_cell>
    <double_cell>
      <try>  1.3    23445  1   </try>
      <success>    23445  1   42\n45</success>
      <attributes>[1.3]</attributes>
    </double_cell>
    <double_cell>
      <try>    23445  1   42\n45</try>
      <success>  1   42\n45  46     </success>
      <attributes>[23445]</attributes>
    </double_cell>
    <int_cell>
      <try>  1   42\n45  46     </try>
      <success>   42\n45  46     47 </success>
      <attributes>[1]</attributes>
    </int_cell>
    <uint_cell>
      <try>   42\n45  46     47 </try>
      <success>\n45  46     47     4</success>
      <attributes>[42]</attributes>
    </uint_cell>
    <int_cell>
      <try>45  46     47     48</try>
      <success>  46     47     48  </success>
      <attributes>[45]</attributes>
    </int_cell>
    <double_cell>
      <try>  46     47     48  </try>
      <success>     47     48  49\n</success>
      <attributes>[46]</attributes>
    </double_cell>
    <double_cell>
      <try>     47     48  49\n</try>
      <success>     48  49\n</success>
      <attributes>[47]</attributes>
    </double_cell>
    <int_cell>
      <try>     48  49\n</try>
      <success>  49\n</success>
      <attributes>[48]</attributes>
    </int_cell>
    <uint_cell>
      <try>  49\n</try>
      <success>\n</success>
      <attributes>[49]</attributes>
    </uint_cell>
    <int_cell>
      <try></try>
      <fail/>
    </int_cell>
    <success></success>
    <attributes>[[[12, 1.3, 23445, 1, 42], [45, 46, 47, 48, 49], []]]</attributes><locals>((column mappings) 1)</locals>
  </csv>
  <success></success>
  <attributes>[[[12, 1.3, 23445, 1, 42], [45, 46, 47, 48, 49], []]]</attributes>
</start>