使用 libxml2 解析 XML 文件时缺少元素值
Missed element values when parsing XML file using libxml2
我正在使用libxml2解析XML文件中的特定标签(例如标题(。
解析此 XML:
<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en">
<entry>
<title type="html">Swedish ISP spanked for sexist 'distracted boyfriend' advert for developer jobs1</title>
</entry>
<entry>
<title type="html">Swedish ISP spanked for sexist 'distracted boyfriend' advert for developer jobs2</title>
</entry>
<entry>
<title type="html">Swedish ISP spanked for sexist 'distracted boyfriend' advert for developer jobs3</title>
</entry>
<entry>
<title type="html">Swedish ISP spanked for sexist 'distracted boyfriend' advert for developer jobs4</title>
</entry>
<entry>
<title type="html">Swedish ISP spanked for sexist 'distracted boyfriend' advert for developer jobs5</title>
</entry>
<entry>
<title type="html">Swedish ISP spanked for sexist 'distracted boyfriend' advert for developer jobs6</title>
</entry>
<entry>
<title type="html">Swedish ISP spanked for sexist 'distracted boyfriend' advert for developer jobs7</title>
</entry>
<entry>
<title type="html">Swedish ISP spanked for sexist 'distracted boyfriend' advert for developer jobs8</title>
</entry>
<entry>
<title type="html">Swedish ISP spanked for sexist 'distracted boyfriend' advert for developer jobs9</title>
</entry>
<entry>
<title type="html">Swedish ISP spanked for sexist 'distracted boyfriend' advert for developer jobs10</title>
</entry>
</feed>
使用此C++代码
void CXMLManager::processNode(xmlTextReaderPtr reader)
{
static bool root = true;
std::string name;
name = std::string((const char *) xmlTextReaderConstName (reader));
if (name == "entry")
{
if (root)
{
m_name = m_title;
root = false;
return;
}
static bool closeEntry = true;
if (closeEntry)
{
m_feedBuffer.push_back( CFeed { m_name, m_title, m_updated, m_author, m_link } );
m_title = "";
}
closeEntry = !closeEntry;
}
else if (name == "title" && xmlTextReaderNodeType(reader) != XML_READER_TYPE_END_ELEMENT)
{
m_title = getElementContent(reader);
std::cout << "Title: " << m_title << std::endl;
}
}
std::string CXMLManager::getElementContent(xmlTextReaderPtr reader)
{
xmlNodePtr node = xmlTextReaderCurrentNode(reader);
xmlChar* text = xmlNodeGetContent(node);
return std::string((const char *) text);
}
void CXMLManager::streamFile(const char *data, size_t size)
{
xmlTextReaderPtr reader;
int ret;
/*
* Pass some special parsing options to activate DTD attribute defaulting,
* entities substitution and DTD validation
*/
reader = xmlReaderForMemory(data, size, NULL, NULL,
XML_PARSE_DTDATTR | /* default DTD attributes */
XML_PARSE_NOENT); /* substitute entities */
if (reader != NULL)
{
ret = xmlTextReaderRead(reader);
while (ret == 1)
{
processNode(reader);
ret = xmlTextReaderRead(reader);
}
}
else
{
throw CFeedreaderException("FEEDREADER: Failed to parse XML.", E_WRONG_XML);
}
}
在大多数情况下,我得到正确的结果,但一次 - 我得到空字符串(甚至认为它在XML中是正确的(:
Swedish ISP spanked for sexist 'distracted boyfriend' advert for developer jobs1
Swedish ISP spanked for sexist 'distracted boyfriend' advert for developer jobs2
Swedish ISP spanked for sexist 'distracted boyfriend' advert for developer jobs3
Swedish ISP spanked for sexist 'distracted boyfriend' advert for developer jobs4
Swedish ISP spanked for sexist 'distracted boyfriend' advert for developer jobs6
Swedish ISP spanked for sexist 'distracted boyfriend' advert for developer jobs7
Swedish ISP spanked for sexist 'distracted boyfriend' advert for developer jobs8
在解析之前,我已经多次检查XML及其正确性,所以我不知道这里可能有什么问题。使用此输入会定期丢失第 5 个字符串。
static
局部变量可能会中断您的处理。 请记住,static
局部变量在函数调用之间保持其值。 一旦streamFile()
退出,然后再次调用,您的static
变量仍将具有其先前的值,它们不会重置回其原始值。 您必须将它们更改为CXMLManager
类的成员,以便streamFile()
每次调用时都可以重置它们。
我不建议使用单个函数来尝试处理您需要解析的每个可能的节点。 我将读取分解为单独的函数,这些函数在 XML 文档的每个级别都有自己的职责,如下所示:
void CXMLManager::readFeed(xmlTextReaderPtr reader)
{
// read attributes if needed...
if (xmlTextReaderIsEmptyElement(reader))
return;
int depth = xmlTextReaderNodeDepth(reader);
int ret;
while ((ret = xmlTextReaderRead(reader)) == 1)
{
switch (xmlTextReaderNodeType(reader))
{
case XML_READER_TYPE_ELEMENT:
{
if (xmlStrEqual(xmlTextReaderConstLocalName(reader), BAD_CAST "entry"))
{
CFeed entry;
readFeedEntry(reader, entry);
m_feedBuffer.push_back(entry);
}
break;
}
case XML_READER_TYPE_END_ELEMENT:
{
if ((xmlTextReaderNodeDepth(reader) == depth)
/*&& xmlStrEqual(xmlTextReaderConstLocalName(reader), BAD_CAST "feed")*/)
{
return;
}
break;
}
}
}
if (ret == -1)
throw CFeedreaderException("FEEDREADER: Failed to read XML.", ...);
}
void CXMLManager::readFeedEntry(xmlTextReaderPtr reader, CFeed &entry)
{
// read attributes if needed...
if (xmlTextReaderIsEmptyElement(reader))
return;
int depth = xmlTextReaderNodeDepth(reader);
int ret;
while ((ret = xmlTextReaderRead(reader)) == 1)
{
switch (xmlTextReaderNodeType(reader))
{
case XML_READER_TYPE_ELEMENT:
{
const xmlChar *name = xmlTextReaderConstLocalName(reader);
if (xmlStrEqual(name, BAD_CAST "title"))
{
readText(reader, entry.m_title/*, BAD_CAST "title"*/);
std::cout << "Title: " << entry.m_title << std::endl;
}
// else other <entry> children as needed ...
break;
}
case XML_READER_TYPE_END_ELEMENT:
{
if ((xmlTextReaderNodeDepth(reader) == depth)
/*&& xmlStrEqual(xmlTextReaderConstLocalName(reader), BAD_CAST "entry")*/)
{
return;
}
break;
}
}
}
if (ret == -1)
throw CFeedreaderException("FEEDREADER: Failed to read XML.", ...);
}
void CXMLManager::readText(xmlTextReaderPtr reader, std::string &text/*, const xmlChar *tagName */)
{
text.clear();
if (xmlTextReaderIsEmptyElement(reader))
return;
int depth = xmlTextReaderNodeDepth(reader);
int ret;
while ((ret = xmlTextReaderRead(reader)) == 1)
{
switch (xmlTextReaderNodeType(reader))
{
// TODO: handle XML_READER_TYPE_ELEMENT if you need to treat
// embedded XML elements as part of the text, such as for
// formatting instructions (like <b>, <i>, etc)...
case XML_READER_TYPE_TEXT:
{
const xmlChar *value = xmlTextReaderConstValue(reader);
text += reinterpret_cast<const char*>(value);
break;
}
case XML_READER_TYPE_END_ELEMENT:
{
if ((xmlTextReaderNodeDepth(reader) == depth)
/*&& xmlStrEqual(name, tagName)*/)
{
return;
}
break;
}
}
}
if (ret == -1)
throw CFeedreaderException("FEEDREADER: Failed to read XML.", ...);
}
void CXMLManager::streamFile(const char *data, size_t size)
{
/*
* Pass some special parsing options to activate DTD attribute defaulting,
* entities substitution and DTD validation
*/
xmlTextReaderPtr reader = xmlReaderForMemory(data, size, NULL, NULL,
XML_PARSE_DTDATTR | /* default DTD attributes */
XML_PARSE_NOENT); /* substitute entities */
if (!reader)
throw CFeedreaderException("FEEDREADER: Failed to parse XML.", E_WRONG_XML);
std::unique_ptr<xmlTextReader, decltype(xmlFreeTextReader)> reader_deleter(reader, xmlFreeTextReader);
int ret;
while ((ret = xmlTextReaderRead(reader)) == 1)
{
if ((xmlTextReaderNodeType(reader) == XML_READER_TYPE_ELEMENT)
&& xmlStrEqual(xmlTextReaderConstLocalName(reader), BAD_CAST "feed"))
{
readFeed(reader);
}
}
if (ret == -1)
throw CFeedreaderException("FEEDREADER: Failed to read XML.", ...);
}
或者,我建议完全摆脱所有辅助函数,只在streamFile()
本身内部执行所有操作,在循环reader
时使用本地状态机,例如:
void CXMLManager::streamFile(const char *data, size_t size)
{
/*
* Pass some special parsing options to activate DTD attribute defaulting,
* entities substitution and DTD validation
*/
xmlTextReaderPtr reader = xmlReaderForMemory(data, size, NULL, NULL,
XML_PARSE_DTDATTR | /* default DTD attributes */
XML_PARSE_NOENT); /* substitute entities */
if (!reader)
throw CFeedreaderException("FEEDREADER: Failed to parse XML.", E_WRONG_XML);
std::unique_ptr<xmlTextReader, decltype(xmlFreeTextReader)> reader_deleter(reader, xmlFreeTextReader);
std::string name, title, updated, author, link, text;
int feedDepth = -1;
int entryDepth = -1;
int textDepth = -1;
int ret;
while ((ret = xmlTextReaderRead(reader)) == 1)
{
switch (xmlTextReaderNodeType(reader))
{
case XML_READER_TYPE_ELEMENT:
{
if (textDepth != -1)
{
// TODO: handle this case if you need to treat embedded
// XML elements as part of the text, such as for formatting
// instructions (like <b>, <i>, etc)...
break;
}
const xmlChar *name = xmlTextReaderConstLocalName(reader);
if (feedDepth == -1)
{
if (xmlStrEqual(name, BAD_CAST "feed"))
{
// read attributes if needed...
feedDepth == xmlTextReaderNodeDepth(reader);
}
}
else if (entryDepth == -1)
{
if (xmlStrEqual(name, BAD_CAST "entry"))
{
name = title = updated = author = link = text = "";
// read attributes if needed...
if (xmlTextReaderIsEmptyElement(reader))
m_feedBuffer.push_back( CFeed { name, title, updated, author, link } );
else
entryDepth == xmlTextReaderNodeDepth(reader);
}
}
else if (xmlStrEqual(name, BAD_CAST "title"))
{
text.clear();
if (!xmlTextReaderIsEmptyElement(reader))
textDepth = xmlTextReaderNodeDepth(reader);
else
textDepth = -1;
}
// else other <entry> children as needed ...
break;
}
case XML_READER_TYPE_TEXT:
{
if (textDepth != -1)
{
const xmlChar *value = xmlTextReeaderConstValue(reader);
text += reinterpret_cast<const char*>(value);
}
break;
}
case XML_READER_TYPE_END_ELEMENT:
{
const xmlChar *name = xmlTextReaderConstLocalName(reader);
if (textDepth != -1)
{
if ((xmlTextReaderNodeDepth(reader) == textDepth)
/*&& xmlStrEqual(name, BAD_CAST "title")*/)
{
textDepth = -1;
title = text;
text.clear();
std::cout << "Title: " << title << std::endl;
}
// else other <entry> children as needed ...
}
else if (entryDepth != -1)
{
if ((xmlTextReaderNodeDepth(reader) == entryDepth)
/*&& xmlStrEqual(name, BAD_CAST "entry")*/)
{
entryDepth = -1;
m_feedBuffer.push_back( CFeed { name, title, updated, author, link } );
}
}
else if (feedDepth != -1)
{
if ((xmlTextReaderNodeDepth(reader) == feedDepth)
/*&& xmlStrEqual(name, BAD_CAST "feed")*/)
{
feedDepth = -1;
}
}
break;
}
}
}
if (ret == -1)
throw CFeedreaderException("FEEDREADER: Failed to read XML.", ...);
}
相关文章:
- 如何在C++中确定文本文件中的元素是字符还是数字
- 比较文本文件的元素
- 使用 libxml2 解析 XML 文件时缺少元素值
- 比较文本文件的元素
- 从二进制文件中读取元素时引发异常(引发异常:读取访问冲突. _Pnext 0xB414D4)
- 如何在C++中将.csv文件的元素存储到二维向量中?
- 如何在 C++ 中将从文本文件中读取的元素推送和弹出到数组中,并按 Revserse 顺序输出堆栈?
- OpenCV 文件存储 - 错误:元素之间的解析错误 (icvYMLParseValue) 缺失
- 尝试读取数据文件,存储在数组中并打印所有元素,但它不起作用
- 计算TXT文件中元素的数量
- 使用提升属性树检查 xml 文件中是否存在元素
- 如何在头文件中定义结构的元素
- 可视化地将数组的所有元素输出到 C++ 中的文件中
- 在 c++ 中替换文件中特定位置的元素
- 将多种数据类型写入文件时的其他行和额外的数组元素
- 将数据从文件读取到链接列表中,并在链接列表中搜索元素
- 从字符串转换为float时,丢失文件中的第一行元素
- C++2d数组,用文件中最后一个元素值填充所有元素
- 使用 CDT 从头文件中获取属性定义的节点元素是什么
- 如何将文件的元素放入列表中