libxml2剖析(3)：使用教程

本文整理自官方使用教程http://xmlsoft.org/tutorial/index.html。

示例文檔story.xml如下：

<?xml version="1.0"?>
<story>
<storyinfo>
<author>John Fleck</author>
<datewritten>June 2, 2002</datewritten>
<keyword>example keyword</keyword>
</storyinfo>
<body>
<headline>This is the headline</headline>
<para>This is the body text.</para>
</body>
</story>

1、解析xml文檔
解析文檔時只需要文檔名和一個函數調用，再加上錯誤處理。下面代碼查找keyword節點並打印節點下的文本內容，如下：

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <libxml/xmlmemory.h>
#include <libxml/parser.h>
/*
 * Scan the children of a storyinfo element and print the text content of
 * every <keyword> child to stdout.
 *
 * doc: document that owns cur.
 * cur: the storyinfo element whose direct children are examined.
 */
void parseStory(xmlDocPtr doc, xmlNodePtr cur){
    xmlChar *key;

    for (cur = cur->xmlChildrenNode; cur != NULL; cur = cur->next) {
        if (xmlStrcmp(cur->name, (const xmlChar *)"keyword") != 0) {
            continue;
        }
        /* The text of an element is stored in its child nodes. */
        key = xmlNodeListGetString(doc, cur->xmlChildrenNode, 1);
        /* key may be NULL (e.g. an element without text); never hand NULL to %s. */
        printf("keyword: %s\n", key != NULL ? (const char *)key : "");
        if (key != NULL) {
            xmlFree(key);
        }
    }
}
/*
 * Parse the XML file docname, check that its root element is <story>, and
 * pass every <storyinfo> child of the root to parseStory().
 *
 * Problems are reported on stderr. The document tree is released on every
 * path that created it.
 */
static void parseDoc(char *docname){
    xmlDocPtr doc;
    xmlNodePtr cur;

    doc = xmlParseFile(docname);
    if(doc == NULL){
        fprintf(stderr, "Document not parsed successfully. \n");
        return;
    }

    cur = xmlDocGetRootElement(doc);
    if(cur == NULL){
        fprintf(stderr, "empty document\n");
        xmlFreeDoc(doc);
        return;
    }

    /* xmlStrcmp() returns non-zero when the names differ. */
    if(xmlStrcmp(cur->name, (const xmlChar *)"story")){
        fprintf(stderr, "document of the wrong type, root node != story\n");
        xmlFreeDoc(doc);
        return;
    }

    /* Visit the direct children of the root element. */
    for(cur = cur->xmlChildrenNode; cur != NULL; cur = cur->next){
        if(!xmlStrcmp(cur->name, (const xmlChar *)"storyinfo")){
            parseStory(doc, cur);
        }
    }

    xmlFreeDoc(doc);
}
/*
 * Entry point: expects the XML file name as the first argument.
 * Returns 0 after processing and 1 when the argument is missing, following
 * the usual convention that a non-zero exit status signals failure.
 */
int main(int argc, char **argv){
    char *docname;

    if(argc <= 1){
        printf("Usage: %s docname\n", argv[0]);
        return 1;
    }
    docname = argv[1];
    parseDoc(docname);
    return 0;
}

    解析XML文檔的基本流程如下：
   （1）定義文檔指針和節點指針。
   （2）調用xmlParseFile()解析文檔。如果不成功，註冊一個錯誤並停止。一個常見錯誤是不適當的編碼。XML標準文檔除了用默認的UTF-8或UTF-16外，還可顯式指定用其它編碼保存。如果文檔是這樣，libxml2將自動地爲你轉換到UTF-8。更多關於XML編碼信息包含在XML標準中。
   （3）調用xmlDocGetRootElement()獲取文檔根節點，若無根節點則釋放文檔樹並返回。
   （4）確認文檔是正確的類型，通過檢查根節點名稱來判斷。
   （5）檢索節點的內容，這需要遍歷文檔樹。對每個節點，遍歷其子節點都需要一個循環。先用cur = cur->xmlChildrenNode獲取第一個子節點，然後通過cur = cur->next不斷向後遍歷兄弟節點，直到cur==NULL。查找指定節點時使用xmlStrcmp()函數比較節點名，如果節點名與你指定的名稱相同，就找到了你要的節點。通常把查找某個子節點的過程封裝成函數。
   （6）獲取節點中的內容。查找到指定節點後，調用xmlNodeListGetString()獲取節點下的文本。注意在XML中，包含在節點中的文本是這個節點的子節點，因此獲取的是cur->xmlChildrenNode中的字符串。xmlNodeListGetString()會爲返回的字符串分配內存，因此記得要用xmlFree()來釋放它。
   （7）調用xmlFreeDoc()釋放文檔樹指針。
   2、使用XPath查詢信息
   在xml文檔中查詢信息是一項核心工作。Libxml2支持使用XPath表達式來查找匹配的節點集。簡而言之，XPath之於xml，好比SQL之於關係數據庫。要在一個複雜的xml文檔中查找所需的信息，XPath簡直是必不可少的工具。下面代碼查詢所有keyword元素的內容。

[cpp] view plain copy

#include <libxml/parser.h>
#include <libxml/xpath.h>
/*
 * Parse the file docname and return the resulting document tree.
 * Prints a message to stderr and returns NULL when parsing fails.
 */
xmlDocPtr getdoc(char *docname){
    xmlDocPtr parsed = xmlParseFile(docname);

    if(parsed != NULL){
        return parsed;
    }
    fprintf(stderr, "Document not parsed successfully. \n");
    return NULL;
}
/*
 * Evaluate the XPath expression xpath against doc.
 *
 * Returns the XPath result object when it holds at least one node; the caller
 * releases it with xmlXPathFreeObject(). Returns NULL (after printing a
 * message) when the context cannot be created, the evaluation fails, or the
 * node set is empty.
 */
xmlXPathObjectPtr getnodeset(xmlDocPtr doc, xmlChar *xpath){
    xmlXPathObjectPtr found;
    xmlXPathContextPtr ctx = xmlXPathNewContext(doc);

    if(!ctx){
        printf("Error in xmlXPathNewContext\n");
        return NULL;
    }

    found = xmlXPathEvalExpression(xpath, ctx);
    /* The context is only needed for the evaluation itself. */
    xmlXPathFreeContext(ctx);

    if(!found){
        printf("Error in xmlXPathEvalExpression\n");
        return NULL;
    }
    if(!xmlXPathNodeSetIsEmpty(found->nodesetval)){
        return found;
    }

    /* Nothing matched: release the empty result instead of returning it. */
    xmlXPathFreeObject(found);
    printf("No result\n");
    return NULL;
}
/*
 * Entry point: print the text of every <keyword> element found anywhere in
 * the XML file named by the first argument.
 * Returns 0 on success and 1 on a usage error or when the file cannot be parsed.
 */
int main(int argc, char ** argv){
    char *docname;
    xmlDocPtr doc;
    /* Match all keyword elements regardless of their position in the document. */
    xmlChar *xpath=(xmlChar*)"//keyword";
    xmlNodeSetPtr nodeset;
    xmlXPathObjectPtr result;
    int i;
    xmlChar *keyword;

    if(argc <= 1){
        printf("Usage: %s docname\n", argv[0]);
        return(1);
    }
    docname = argv[1];

    doc = getdoc(docname);
    if(doc == NULL){
        /* getdoc() already reported the problem; do not query a missing document. */
        xmlCleanupParser();
        return(1);
    }

    result = getnodeset(doc, xpath);
    if(result){
        nodeset = result->nodesetval;
        for(i=0; i < nodeset->nodeNr; i++){
            keyword = xmlNodeListGetString(doc, nodeset->nodeTab[i]->xmlChildrenNode, 1);
            /* keyword may be NULL for an element without text; never hand NULL to %s. */
            printf("keyword: %s\n", keyword != NULL ? (const char *)keyword : "");
            if(keyword != NULL){
                xmlFree(keyword);
            }
        }
        xmlXPathFreeObject(result);
    }

    xmlFreeDoc(doc);
    xmlCleanupParser();
    return(0);
}

    可以在story.xml中多插入幾個keyword元素，然後運行一下本程序看看效果。使用XPath查詢信息的基本流程如下：
   （1）調用xmlXPathNewContext()給文檔樹創建一個上下文指針。
   （2）調用xmlXPathEvalExpression()，傳入XPath表達式和上下文指針，返回一個xmlXPathObjectPtr結果集指針。結果集的nodesetval成員包含匹配節點的個數(nodeNr)和節點列表(nodeTab)。在使用之前要用xmlXPathNodeSetIsEmpty()檢查nodesetval節點列表是否爲空。
   （3）遍歷節點列表nodeTab，用xmlNodeListGetString()獲取每個keyword節點的內容。
   （4）用xmlXPathFreeObject()釋放查詢結果，用xmlFreeDoc()釋放文檔樹。
   更多關於Xpath的內容可以參考XPath官方規範http://www.w3.org/TR/xpath/。XPath語法的介紹，可參考w3school上的教程http://www.w3school.com.cn/xpath/index.asp，或者http://w3schools.com/xpath/default.asp。只有掌握XPath，才能掌握使用大型XML文件獲取信息的方法，否則每尋找一個節點都要從根節點找起，很耗時耗力。
   3、修改xml文檔
   這與上面的過程類似，首先遍歷文檔樹，找到要插入（或刪除）的節點處，然後插入（或刪除）相關的內容。下面代碼在storyinfo節點下插入一個keyword元素。

[cpp] view plain copy

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <libxml/xmlmemory.h>
#include <libxml/parser.h>
/*
 * Append a <keyword> child element holding the given text to cur.
 * doc is accepted for symmetry with the other helpers and is not used.
 */
void
parseStory(xmlDocPtr doc, xmlNodePtr cur, const xmlChar* keyword) {
    const xmlChar *element_name = (const xmlChar*)"keyword";

    xmlNewTextChild(cur, NULL, element_name, keyword);
}
/*
 * Parse docname, verify that the root element is <story>, and add a
 * <keyword> element with the given text to every <storyinfo> child of the root.
 *
 * Returns the modified document (the caller saves and frees it), or NULL
 * after printing a diagnostic to stderr.
 */
xmlDocPtr
parseDoc(char *docname, char *keyword) {
    xmlDocPtr doc;
    xmlNodePtr cur;

    doc = xmlParseFile(docname);
    if (doc == NULL ) {
        fprintf(stderr,"Document not parsed successfully. \n");
        return (NULL);
    }

    cur = xmlDocGetRootElement(doc);
    if (cur == NULL) {
        fprintf(stderr,"empty document\n");
        xmlFreeDoc(doc);
        return (NULL);
    }

    if (xmlStrcmp(cur->name, (const xmlChar *) "story")) {
        /* Terminate the diagnostic with a newline like the other messages. */
        fprintf(stderr,"document of the wrong type, root node != story\n");
        xmlFreeDoc(doc);
        return (NULL);
    }

    for (cur = cur->xmlChildrenNode; cur != NULL; cur = cur->next) {
        if (!xmlStrcmp(cur->name, (const xmlChar *)"storyinfo")) {
            parseStory (doc, cur, (const xmlChar*)keyword);
        }
    }
    return(doc);
}
/*
 * Entry point: insert a keyword into the XML file and write it back in place.
 * Usage: program docname keyword
 * Returns 0 on success and 1 on a usage error, a parse error, or when the
 * file cannot be written.
 */
int
main(int argc, char **argv) {
    char *docname;
    char *keyword;
    xmlDocPtr doc;
    int status = 0;

    if (argc <= 2) {
        printf("Usage: %s docname, keyword\n", argv[0]);
        return(1);
    }
    docname = argv[1];
    keyword = argv[2];

    doc = parseDoc(docname, keyword);
    if (doc == NULL) {
        /* parseDoc() already printed the reason. */
        return (1);
    }

    /* xmlSaveFormatFile() returns -1 when the file cannot be written. */
    if (xmlSaveFormatFile(docname, doc, 0) == -1) {
        fprintf(stderr, "could not save %s\n", docname);
        status = 1;
    }
    xmlFreeDoc(doc);
    return (status);
}

這裏xmlNewTextChild函數在當前節點指針上添加一個子元素。如果希望元素有名字空間，則可以在這裏加上。添加完後，就要用xmlSaveFormatFile()把修改後的文檔寫入到文件。我們這裏使用原來doc文檔指針，因此會覆蓋原來的文件。第三個參數如果設置爲1，則輸出的文檔會自動縮進。
若要刪除某個節點，可以使用以下代碼：

[cpp] view plain copy

/* Fragment meant to sit inside a node-traversal loop (note the `continue`):
 * removes the current node when it is a <keyword> element. */
if(!xmlStrcmp(cur->name, BAD_CAST "keyword")){
xmlNodePtr tempNode;
/* Save the successor first: cur is freed below and must not be read afterwards. */
tempNode = cur->next;
xmlUnlinkNode(cur);
xmlFreeNode(cur);
cur = tempNode;
/* cur already points at the saved successor, so go straight to the next iteration. */
continue;
}

注意libxml2並沒有xmlDelNode或者xmlRemoveNode之類的函數。我們需要將當前節點從文檔中斷鏈（unlink），文檔就不會再包含這個子節點。這樣做需要使用一個臨時變量來存儲斷鏈節點的後續節點，並記得要手動刪除斷鏈節點的內存。
若要給節點添加屬性，可以這樣：

[cpp] view plain copy

xmlDocPtr
parseDoc(char *docname, char *uri) {
xmlDocPtr doc;
xmlNodePtr cur;
xmlNodePtr newnode;
xmlAttrPtr newattr;
doc = xmlParseFile(docname);
if (doc == NULL ) {
fprintf(stderr,"Document not parsed successfully. \n");
return (NULL);
}
cur = xmlDocGetRootElement(doc);
if (cur == NULL) {
fprintf(stderr,"empty document\n");
xmlFreeDoc(doc);
return (NULL);
}
if (xmlStrcmp(cur->name, (const xmlChar *) "story")) {
fprintf(stderr,"document of the wrong type, root node != story");
xmlFreeDoc(doc);
return (NULL);
}
newnode = xmlNewTextChild(cur, NULL, "reference", NULL);
newattr = xmlNewProp(newnode, "uri", uri);
return(doc);
}

我們用xmlAttrPtr聲明一個屬性指針。在找到story元素後，用xmlNewTextChild()新建一個reference子元素，用xmlNewProp()給這個子元素新建一個uri屬性。文檔修改完後要用xmlSaveFormatFile()寫入到磁盤。
查詢屬性的過程類似。如下：

[cpp] view plain copy

/*
 * Print the uri attribute of every <reference> child of cur to stdout.
 * doc is not used by this function.
 */
void
getReference(xmlDocPtr doc, xmlNodePtr cur) {
    xmlChar *uri;

    (void)doc;
    for (cur = cur->xmlChildrenNode; cur != NULL; cur = cur->next) {
        if (xmlStrcmp(cur->name, (const xmlChar *)"reference") != 0) {
            continue;
        }
        uri = xmlGetProp(cur, (const xmlChar *)"uri");
        /* The attribute may be missing; never hand NULL to %s. */
        printf("uri: %s\n", uri != NULL ? (const char *)uri : "");
        if (uri != NULL) {
            xmlFree(uri);
        }
    }
}

    關鍵函數爲xmlGetProp()，用來獲取節點中的指定屬性。注意如果你使用DTD爲屬性聲明一個固定的或默認的值，則該函數也查找這些值。
   4、創建xml文檔
   有了上面的基礎，創建一個xml文檔顯得非常簡單，就是一個不斷插入節點的過程。其流程如下：
   （1）用xmlNewDoc函數創建一個文檔指針doc；
   （2）用xmlNewNode函數創建一個節點指針root_node；
   （3）用xmlDocSetRootElement將root_node設置爲doc的根結點；
   （4）用xmlAddChild()給root_node添加一系列的子節點，並設置子節點的內容和屬性；
   （5）用xmlSaveFile將xml文檔存入文件；
   （6）用xmlFreeDoc函數關閉文檔指針，並清除本文檔中所有節點動態申請的內存。
   下面代碼創建一個xml文檔：

[cpp] view plain copy

#include <stdio.h>
#include <iostream>
#include <libxml/parser.h>
#include <libxml/tree.h>
using namespace std;
int main(int argc, char* argv[]){
//定義文檔和節點指針
xmlDocPtr doc=xmlNewDoc(BAD_CAST"1.0");
xmlNodePtr root_node=xmlNewNode(NULL,BAD_CAST"root");
//設置根節點
xmlDocSetRootElement(doc,root_node);
//在根節點中直接創建節點
xmlNewTextChild(root_node, NULL, BAD_CAST"newNode1", BAD_CAST"newNode1 content");
xmlNewTextChild(root_node, NULL, BAD_CAST"newNode2", BAD_CAST"newNode2 content");
xmlNewTextChild(root_node, NULL, BAD_CAST"newNode3", BAD_CAST"newNode3 content");
//創建一個節點，設置其內容和屬性，然後加入根結點
xmlNodePtr node=xmlNewNode(NULL, BAD_CAST"node2");
xmlNodePtr content=xmlNewText(BAD_CAST"NODE CONTENT");
xmlAddChild(root_node,node);
xmlAddChild(node,content);
xmlNewProp(node,BAD_CAST"attribute",BAD_CAST"yes");
//創建一個兒子和孫子節點
node=xmlNewNode(NULL,BAD_CAST"son");
xmlAddChild(root_node,node);
xmlNodePtr grandson=xmlNewNode(NULL,BAD_CAST"grandson");
xmlAddChild(node,grandson);
xmlAddChild(grandson,xmlNewText(BAD_CAST"This is a grandson node"));
//存儲xml文檔
int nRel=xmlSaveFile("CreatedXml.xml",doc);
if(nRel!=-1){
cout<<"一個xml文檔被創建，寫入"<<nRel<<"個字節"<<endl;
}
//釋放文檔內節點動態申請的內存
xmlFreeDoc(doc);
return 1;
}

編譯並運行這個程序，將創建CreatedXml.xml文檔，內容如下：

[html] view plain copy

<root>
<newNode1>newNode1 content</newNode1>
<newNode2>newNode2 content</newNode2>
<newNode3>newNode3 content</newNode3>
<node2 attribute="yes">NODE CONTENT</node2>
<son>
<grandson>This is a grandson node</grandson>
</son>
</root>

    注意，有多種方式可以添加子節點。第一是用xmlNewTextChild直接添加一個文本子節點；第二是先創建新節點，然後用xmlAddChild將新節點加入上層節點。
   5、編碼轉換
   數據編碼兼容性問題是很多開發人員都會遇到的一大難題，特別是在使用libxml時。libxml內部使用UTF-8格式存儲和操作數據。你的應用程序數據如果使用其他格式的編碼，例如ISO-8859-1編碼，則在傳給libxml之前必須轉換成UTF-8格式。如果你的應用輸出想用非UTF-8格式的編碼，也需要進行轉換。
   Libxml2本身只支持把UTF-8, UTF-16和ISO-8859-1格式的外部數據轉換成內部使用的UTF-8格式，以及處理完後輸出成這些格式的數據。對其他的字符編碼，需要使用libiconv（當然你也可以使用其他的國際化庫，例如ICU）。當前libiconv支持150多種不同的字符編碼，libiconv的實現儘量保證支持所有我們聽過的編碼格式。在使用libxml之前，一般是通過libiconv把數據先轉換成UTF-8格式。在使用libxml處理完之後，再通過libiconv把數據輸出成你要的編碼格式。
   一個常見的錯誤是一份代碼的不同部分的數據使用不同的編碼格式。例如內部數據使用ISO-8859-1格式的應用程序，聯合使用libxml，而它的內部數據格式爲UTF-8。這樣應用程序在運行不同的代碼段時要不同地對待內部數據，這有可能導致解析數據出現錯誤。
   例子1：使用Libxml內建的編碼處理器
   下面的例子創建一個簡單的文檔，添加從命令行得到的數據到文檔根元素，並以合適的編碼格式輸出到stdout。對提供的數據我們使用ISO-8859-1編碼，處理過程爲從ISO-8859-1到UTF-8，再到ISO-8859-1。命令行上輸入的字符串從ISO-8859-1格式轉換成UTF-8格式，以供libxml使用，輸出時又重新轉換成ISO-8859-1格式。

[cpp] view plain copy

#include <string.h>
#include <libxml/parser.h>
/*
 * Convert the NUL-terminated string in, encoded as `encoding`, to the UTF-8
 * representation libxml2 works with.
 *
 * Returns a malloc()ed, NUL-terminated buffer the caller must free(), or NULL
 * (after printing a message) when no handler exists for the encoding, memory
 * runs out, or the conversion fails or stops early.
 */
unsigned char*
convert(unsigned char *in, char *encoding){
    unsigned char *out;
    unsigned char *resized;
    int ret,size,out_size,temp;
    xmlCharEncodingHandlerPtr handler;

    if (in == NULL) {
        return (NULL);
    }

    /* Look up the encoding handler first so nothing is allocated needlessly. */
    handler = xmlFindCharEncodingHandler(encoding);
    if (!handler) {
        printf("no encoding handler found for '%s'\n", encoding ? encoding : "");
        return (NULL);
    }

    size = (int)strlen((const char*)in)+1; /* input length including the terminator */
    out_size = size*2-1;                   /* room for two output bytes per input byte */
    out = (unsigned char*)malloc((size_t)out_size);
    if (out == NULL) {
        printf("no mem\n");
        return (NULL);
    }

    temp = size-1;
    /* NOTE(review): calling handler->input directly depends on the libxml2
     * version's handler layout; confirm against the version in use. */
    ret = handler->input(out, &out_size, in, &temp);
    /* Only a negative return is an error; a non-negative value may be the
     * number of bytes written and must not be treated as failure. */
    if (ret < 0 || temp != size-1) {
        if (ret < 0) {
            printf("conversion wasn't successful.\n");
        } else { /* only part of the input was consumed */
            printf("conversion wasn't successful. converted: %i octets.\n",temp);
        }
        free(out);
        return (NULL);
    }

    /* Resize to the converted length plus the terminator; keep the old
     * pointer until realloc() is known to have succeeded. */
    resized = (unsigned char*)realloc(out,(size_t)out_size+1);
    if (resized == NULL) {
        free(out);
        printf("no mem\n");
        return (NULL);
    }
    resized[out_size]=0;
    return (resized);
}
/*
 * Entry point: wrap the first argument (taken to be ISO-8859-1 text) in a
 * <root> element and print the document to stdout as ISO-8859-1.
 * Returns 0 on success and 1 on a usage or conversion error.
 */
int
main(int argc, char **argv) {
    unsigned char *content, *out;
    xmlDocPtr doc;
    xmlNodePtr rootnode;
    char *encoding = "ISO-8859-1";

    if (argc <= 1) {
        printf("Usage: %s content\n", argv[0]);
        return(1);
    }
    content = (unsigned char*)argv[1];

    /* Convert to the UTF-8 form libxml2 uses internally. */
    out = convert(content, encoding);
    if (out == NULL) {
        /* convert() already printed the reason. */
        return (1);
    }

    doc = xmlNewDoc (BAD_CAST "1.0");
    rootnode = xmlNewDocNode(doc, NULL, (const xmlChar*)"root", out);
    /* The converted buffer is not needed once the node has been built from it. */
    free(out);
    xmlDocSetRootElement(doc, rootnode);

    /* "-" writes the document to stdout, re-encoded as ISO-8859-1. */
    xmlSaveFormatFileEnc("-", doc, encoding, 1);
    xmlFreeDoc(doc);
    return (0);
}

編譯運行這個程序，假設在命令行上提供的數據"zhou"是ISO-8859-1格式（我的系統中不是），則輸出文檔爲：

[html] view plain copy

<?xml version="1.0" encoding="ISO-8859-1"?>
<root>zhou</root>

    編碼轉換的基本流程如下：
   （1）用xmlCharEncodingHandlerPtr定義一個編碼處理器指針，用xmlFindCharEncodingHandler()查找libxml2中指定的編碼處理器。libxml2內建只支持把UTF-8, UTF-16和ISO-8859-1格式的外部數據轉換成內部使用的UTF-8格式。如果要轉換其他格式的數據（如中文編碼），則要使用獨立的libiconv庫給libxml2註冊新編碼處理器。
   （2）調用編碼處理器的input()函數，把外部數據轉換成libxml2使用的格式。
   （3）進行xml處理，處理完若要保存成非UTF-8格式的文檔，使用xmlSaveFormatFileEnc()函數。若保存的編碼格式libxml2不支持，則只能用libiconv把保存的文檔轉換成需要的編碼格式。
   例子2：通過iconv庫給Libxml註冊新的編碼處理器
   下面例子先編寫GBK的編碼處理器gbk_input()和gbk_output()，前者是GBK到UTF-8輸入處理，後者是UTF-8到GBK輸出處理，這兩個處理器都要用到iconv轉換函數。然後調用xmlNewCharEncodingHandler()註冊輸入輸出處理器。對輸入輸出數據的編碼轉換由convertToUTF8From()和utf8ConvertTo()來完成，它們都是調用xmlFindCharEncodingHandler()查找已註冊的處理器，然後在處理器上調用input()或output()對數據進行編碼轉換。

[cpp] view plain copy

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <iconv.h>
#include <libxml/encoding.h>
#include <libxml/xmlwriter.h>
#include <libxml/xmlreader.h>
/*
 * Input encoding handler: GBK -> UTF-8, implemented with iconv.
 *
 * out/outlen: output buffer and its capacity; *outlen is set to the number of
 *             bytes written.
 * in/inlen:   input buffer and its length; *inlen is set to the number of
 *             bytes consumed.
 *
 * Returns the number of bytes written, -1 when the output buffer is too small
 * (the lengths still describe what was converted), or -2 when the converter
 * cannot be opened or the input is not valid GBK. An incomplete multibyte
 * sequence at the end of the input is left unconsumed and is not an error.
 */
int gbk_input(unsigned char *out, int *outlen,
              const unsigned char *in, int *inlen){
    char *outbuf = (char *) out;
    char *inbuf = (char *) in;
    iconv_t cd;
    size_t in_left, out_left, rc;
    int saved_errno;

    /* Copy the lengths into size_t variables: casting int* to size_t* would
     * read garbage on platforms where size_t is wider than int. */
    in_left = (size_t) *inlen;
    out_left = (size_t) *outlen;

    cd = iconv_open("utf-8", "gbk");
    if (cd == (iconv_t) -1) {
        *outlen = 0;
        *inlen = 0;
        return -2;
    }

    rc = iconv(cd, &inbuf, &in_left, &outbuf, &out_left);
    saved_errno = errno;   /* iconv_close() may overwrite errno */
    iconv_close(cd);       /* release the descriptor on every path */

    *outlen = (int) ((unsigned char *) outbuf - out);
    *inlen = (int) ((unsigned char *) inbuf - in);

    /* iconv() reports failure as (size_t)-1; an unsigned value is never < 0. */
    if (rc == (size_t) -1) {
        if (saved_errno == E2BIG) {
            return -1;
        }
        if (saved_errno != EINVAL) {
            return -2;
        }
        /* EINVAL: truncated trailing sequence; report what was converted. */
    }
    return *outlen;
}
/*
 * Output encoding handler: UTF-8 -> GBK, implemented with iconv.
 *
 * out/outlen: output buffer and its capacity; *outlen is set to the number of
 *             bytes written.
 * in/inlen:   input buffer and its length; *inlen is set to the number of
 *             bytes consumed.
 *
 * Returns the number of bytes written, -1 when the output buffer is too small
 * (the lengths still describe what was converted), or -2 when the converter
 * cannot be opened or the input cannot be converted. An incomplete multibyte
 * sequence at the end of the input is left unconsumed and is not an error.
 */
int gbk_output(unsigned char *out, int *outlen,
               const unsigned char *in, int *inlen){
    char *outbuf = (char *) out;
    char *inbuf = (char *) in;
    iconv_t cd;
    size_t in_left, out_left, rc;
    int saved_errno;

    /* Copy the lengths into size_t variables: casting int* to size_t* would
     * read garbage on platforms where size_t is wider than int. */
    in_left = (size_t) *inlen;
    out_left = (size_t) *outlen;

    cd = iconv_open("gbk", "utf-8");
    if (cd == (iconv_t) -1) {
        *outlen = 0;
        *inlen = 0;
        return -2;
    }

    rc = iconv(cd, &inbuf, &in_left, &outbuf, &out_left);
    saved_errno = errno;   /* iconv_close() may overwrite errno */
    iconv_close(cd);       /* release the descriptor on every path */

    *outlen = (int) ((unsigned char *) outbuf - out);
    *inlen = (int) ((unsigned char *) inbuf - in);

    /* iconv() reports failure as (size_t)-1; an unsigned value is never < 0. */
    if (rc == (size_t) -1) {
        if (saved_errno == E2BIG) {
            return -1;
        }
        if (saved_errno != EINVAL) {
            return -2;
        }
        /* EINVAL: truncated trailing sequence; report what was converted. */
    }
    return *outlen;
}
/**
* convertToUTF8From:
* 把encoding編碼的輸入數據in轉換成utf-8格式返回
* 出錯則返回NULL
*/
xmlChar *convertToUTF8From(const char *in, const char *encoding){
xmlChar *out;
int ret;
int size;
int out_size;
int temp;
xmlCharEncodingHandlerPtr handler;
if (in == 0)
return 0;
/* 查找內建的編碼處理器 */
handler = xmlFindCharEncodingHandler(encoding);
if (!handler) {
printf("convertToUTF8From: no encoding handler found for '%s'\n",
encoding ? encoding : "");
return 0;
}
size = (int)strlen(in) + 1; /* 輸入數據長度 */
out_size = size*2 - 1; /* 輸出數據長度 */
/* 存放輸出數據 */
out = (unsigned char *) xmlMalloc((size_t) out_size);
memset(out, 0, out_size);
if(out != NULL) {
temp = size - 1;
/* 對輸入數據進行編碼轉換，成功後返回0 */
ret = handler->input(out, &out_size, (const xmlChar *) in, &temp);
if(ret || temp - size + 1) { /* 轉換不成功 */
if(ret){ /* 轉換失敗 */
printf("convertToUTF8From: conversion wasn't successful.\n");
}else{ /* 只轉換了一部分數據 */
printf("convertToUTF8From: conversion wasn't successful. converted: %i octets.\n", temp);
}
xmlFree(out); /* 釋放輸出緩衝區 */
out = 0;
}else{ /* 轉換成功，在輸出末尾加上null終止符 */
out = (unsigned char *) xmlRealloc(out, out_size + 1);
out[out_size] = 0;
}
} else {
printf("convertToUTF8From: no mem\n");
}
return out;
}
/**
 * utf8ConvertTo:
 * Convert the NUL-terminated UTF-8 string in to the given encoding.
 *
 * Returns a malloc()ed, NUL-terminated buffer the caller must free(), or NULL
 * (after printing a message) when in is NULL, no handler is registered for
 * the encoding, memory runs out, or the conversion fails or stops early.
 */
char *utf8ConvertTo(xmlChar *in, const char *encoding){
    char *out;
    char *resized;
    int ret;
    int size;
    int out_size;
    int temp;
    xmlCharEncodingHandlerPtr handler;

    if (in == 0)
        return 0;

    handler = xmlFindCharEncodingHandler(encoding);
    if (!handler) {
        printf("utf8ConvertTo: no encoding handler found for '%s'\n",
               encoding ? encoding : "");
        return 0;
    }

    size = (int) strlen((char*)in) + 1;  /* input length including the terminator */
    out_size = size * 2 - 1;             /* output capacity */
    out = (char *) malloc((size_t) out_size);
    if (out == NULL) {
        printf("utf8ConvertTo: no mem\n");
        return 0;
    }
    /* Clear the buffer only after the allocation is known to have succeeded. */
    memset(out, 0, (size_t) out_size);

    temp = size - 1;
    ret = handler->output((xmlChar*)out, &out_size, (const xmlChar *) in, &temp);
    /* Only a negative return is an error; handlers may return the number of
     * bytes written, which must not be mistaken for a failure. */
    if (ret < 0 || temp != size - 1) {
        if (ret < 0) {
            printf("utf8ConvertTo: conversion wasn't successful.\n");
        } else { /* only part of the input was consumed */
            printf("utf8ConvertTo: conversion wasn't successful. converted: %i octets.\n", temp);
        }
        free(out);
        return 0;
    }

    /* Resize to the converted length plus the terminator; keep the old
     * pointer until realloc() is known to have succeeded. */
    resized = (char *) realloc(out, (size_t) out_size + 1);
    if (resized == NULL) {
        free(out);
        printf("utf8ConvertTo: no mem\n");
        return 0;
    }
    resized[out_size] = 0;
    return resized;
}
/*
 * Entry point: register GBK/GB2312 handlers, wrap the first argument (taken
 * to be GBK text) in a <root> element, and print the document to stdout
 * encoded as gb2312.
 * Returns 0 on success and 1 on a usage or conversion error.
 */
int main(int argc, char **argv){
    const char *content;
    xmlChar *out;
    xmlDocPtr doc;
    xmlNodePtr rootnode;

    if (argc <= 1) {
        printf("Usage: %s content\n", argv[0]);
        return(1);
    }
    content = (const char*)argv[1];

    /* Register the GBK handlers; gb2312 reuses the same conversion functions. */
    xmlNewCharEncodingHandler("gbk", gbk_input, gbk_output);
    xmlNewCharEncodingHandler("gb2312", gbk_input, gbk_output);

    /* Convert the GBK input to the UTF-8 form libxml2 uses internally. */
    out = convertToUTF8From(content, "gbk");
    if (out == NULL) {
        /* convertToUTF8From() already printed the reason. */
        xmlCleanupCharEncodingHandlers();
        return (1);
    }

    doc = xmlNewDoc(BAD_CAST "1.0");
    rootnode = xmlNewDocNode(doc, NULL, (const xmlChar*)"root", out);
    /* The converted buffer is not needed once the node has been built from it. */
    xmlFree(out);
    xmlDocSetRootElement(doc, rootnode);

    /* "-" writes the document to the terminal, encoded as gb2312. */
    xmlSaveFormatFileEnc("-", doc, "gb2312", 1);

    xmlFreeDoc(doc);
    xmlCleanupCharEncodingHandlers(); /* release the encoding handlers */
    return (0);
}

    這個例子在32位與64位Linux平臺下測試通過。iconv庫是Linux默認自帶的組件，因此在Linux中使用libxml非常方便。我們先建立utf-8編碼與gbk編碼的轉換接口，並將接口註冊到libxml2庫中，這樣xml庫就支持gb2312和gbk編碼了。當然，這個轉換不會自動完成，我們需要先用xmlFindCharEncodingHandler()從libxml庫中查找特定編碼的接口。libxml支持一些基本的編碼接口，如ISO-8859-1，UTF-16等編碼，但不支持gbk，所以在上述代碼中，我們定義了gbk_input與gbk_output兩個接口，這兩個接口的原型聲明是libxml庫的標準聲明，即xmlCharEncodingInputFunc和xmlCharEncodingOutputFunc。在使用完libxml庫之後，我們需要釋放libxml庫的轉換資源。
   例子3：直接使用iconv庫進行轉換
   下面例子直接使用iconv函數對輸入輸出進行編碼轉換，而不是通過註冊編碼處理器的方式。

[cpp] view plain copy

#include <stdio.h>
#include <string.h>
#include <iconv.h>
#include <libxml/parser.h>
#include <libxml/tree.h>
/*
 * Convert inlen bytes of inbuf from from_charset to to_charset with iconv.
 *
 * outbuf must hold outlen bytes. It is zero-filled first and the last byte is
 * reserved, so the result is always NUL-terminated.
 *
 * Returns 0 on success and -1 when the arguments are invalid, the charset pair
 * is not supported, the input is invalid, or the output does not fit.
 */
int encoding_convert(const char *from_charset, const char *to_charset,
                     char *inbuf, int inlen,
                     char* outbuf, int outlen){
    iconv_t cd;
    size_t in_left, out_left, rc;

    if (inbuf == NULL || outbuf == NULL || inlen < 0 || outlen <= 0)
        return -1;

    memset(outbuf, 0, (size_t) outlen);

    /* iconv_open() signals failure with (iconv_t)-1, not with 0. */
    cd = iconv_open(to_charset, from_charset);
    if (cd == (iconv_t) -1)
        return -1;

    /* Use size_t copies of the lengths: casting int* to size_t* would read
     * garbage on platforms where size_t is wider than int. */
    in_left = (size_t) inlen;
    out_left = (size_t) outlen - 1;   /* keep one byte for the terminator */

    rc = iconv(cd, &inbuf, &in_left, &outbuf, &out_left);
    iconv_close(cd);                  /* release the descriptor on every path */

    return (rc == (size_t) -1) ? -1 : 0;
}
/*
 * Convert a NUL-terminated GB2312 string to UTF-8.
 *
 * Returns a NUL-terminated buffer allocated with xmlMalloc() that the caller
 * must release with xmlFree(), or NULL when inbuf is NULL, memory runs out,
 * or the conversion fails.
 */
char *gb2312_utf8(char *inbuf){
    size_t in_len;
    int nOutLen;
    char *szOut;

    if (inbuf == NULL)
        return NULL;

    in_len = strlen(inbuf);
    /* Two output bytes per input byte plus the terminator; stays positive
     * even for an empty string. */
    nOutLen = (int)(2 * in_len + 1);
    szOut = (char*)xmlMalloc((size_t)nOutLen);
    if (szOut == NULL)
        return NULL;

    /* The target charset must be spelled "utf-8"; a misspelled name makes
     * every conversion fail. */
    if (-1 == encoding_convert("gb2312", "utf-8", inbuf, (int)in_len, szOut, nOutLen)) {
        xmlFree(szOut);
        szOut = NULL;
    }
    return szOut;
}
/*
 * Convert a NUL-terminated UTF-8 string to GB2312.
 *
 * Returns a NUL-terminated buffer allocated with xmlMalloc() that the caller
 * must release with xmlFree(), or NULL when inbuf is NULL, memory runs out,
 * or the conversion fails.
 */
char *utf8_gb2312(char *inbuf){
    size_t in_len;
    int nOutLen;
    char *szOut;

    if (inbuf == NULL)
        return NULL;

    in_len = strlen(inbuf);
    /* Two output bytes per input byte plus the terminator; stays positive
     * even for an empty string. */
    nOutLen = (int)(2 * in_len + 1);
    szOut = (char*)xmlMalloc((size_t)nOutLen);
    if (szOut == NULL)
        return NULL;

    if (-1 == encoding_convert("utf-8", "gb2312", inbuf, (int)in_len, szOut, nOutLen)) {
        xmlFree(szOut);
        szOut = NULL;
    }
    return szOut;
}
/*
 * Entry point: build a document containing Chinese text, an attribute value
 * and an element name, then save it as CreatedXml_cn.xml encoded as GB2312.
 * Returns 0 on success and 1 when a conversion or the save fails.
 *
 * NOTE(review): the Chinese literals are passed to gb2312_utf8(), so this
 * presumably requires the source file itself to be saved as GB2312 - confirm.
 */
int main(int argc, char **argv){
    xmlDocPtr doc = xmlNewDoc(BAD_CAST "1.0");
    xmlNodePtr root_node = xmlNewNode(NULL, BAD_CAST "root");
    xmlNodePtr node;
    xmlNodePtr content;
    char *szOut;
    int nRel;
    int status = 0;

    (void)argc;
    (void)argv;

    xmlDocSetRootElement(doc, root_node);

    /* Plain ASCII children need no conversion. */
    xmlNewTextChild(root_node, NULL, BAD_CAST "newNode1", BAD_CAST "newNode1 content");
    xmlNewTextChild(root_node, NULL, BAD_CAST "newNode2", BAD_CAST "newNode2 content");
    xmlNewTextChild(root_node, NULL, BAD_CAST "newNode3", BAD_CAST "newNode3 content");

    /* Chinese content is converted to UTF-8 before it is stored in the tree. */
    szOut = gb2312_utf8("節點1的內容");
    if (szOut == NULL) {
        fprintf(stderr, "conversion of the node content failed\n");
        status = 1;
    }
    xmlNewChild(root_node, NULL, BAD_CAST "node1", BAD_CAST szOut);
    xmlFree(szOut);

    /* Create a node, give it content and an attribute, and attach it to the root. */
    node = xmlNewNode(NULL, BAD_CAST "node2");
    content = xmlNewText(BAD_CAST "NODE CONTENT");
    xmlAddChild(root_node, node);
    xmlAddChild(node, content);
    szOut = gb2312_utf8("屬性值");
    if (szOut == NULL) {
        fprintf(stderr, "conversion of the attribute value failed\n");
        status = 1;
    }
    xmlNewProp(node, BAD_CAST "attribute", BAD_CAST szOut);
    xmlFree(szOut);

    /* Create an element whose name is Chinese; skip it when the name could
     * not be converted, because an element needs a name. */
    szOut = gb2312_utf8("中文節點");
    if (szOut != NULL) {
        xmlNewChild(root_node, NULL, BAD_CAST szOut, BAD_CAST "content of chinese node");
        xmlFree(szOut);
    } else {
        fprintf(stderr, "conversion of the element name failed\n");
        status = 1;
    }

    /* Save the document; -1 signals failure. */
    nRel = xmlSaveFormatFileEnc("CreatedXml_cn.xml", doc, "GB2312", 1);
    if (nRel != -1) {
        printf("一個xml文檔被創建,寫入%d個字節\n", nRel);
    } else {
        fprintf(stderr, "failed to write CreatedXml_cn.xml\n");
        status = 1;
    }

    xmlFreeDoc(doc);
    return status;
}

這個例子中，當把中文數據寫入到XML節點時，使用gb2312_utf8()直接轉換成UTF-8格式，這種直接通過iconv轉換的方式更高效。編譯並運行程序，輸出文檔如下：

[html] view plain copy

<?xml version="1.0" encoding="GB2312"?>
<root>
<newNode1>newNode1 content</newNode1>
<newNode2>newNode2 content</newNode2>
<newNode3>newNode3 content</newNode3>
<node1>節點1的內容</node1>
<node2 attribute="屬性值">NODE CONTENT</node2>
<中文節點>content of chinese node</中文節點>
</root>

    6、一個真實的例子
   內容整理自http://xmlsoft.org/example.html。
   下面是一個真實的例子。應用程序數據的內容不使用DOM樹，而是使用內部數據結構來保存。這是一個基於XML存儲結構的數據庫，它保存了與Gnome相關的任務。如下：

[html] view plain copy

<?xml version="1.0"?>
<gjob:Helping xmlns:gjob="http://www.gnome.org/some-location">
<gjob:Jobs>
<gjob:Job>
<gjob:Project ID="3"/>
<gjob:Application>GBackup</gjob:Application>
<gjob:Category>Development</gjob:Category>
<gjob:Update>
<gjob:Status>Open</gjob:Status>
<gjob:Modified>Mon, 07 Jun 1999 20:27:45 -0400 MET DST</gjob:Modified>
<gjob:Salary>USD 0.00</gjob:Salary>
</gjob:Update>
<gjob:Developers>
<gjob:Developer>
</gjob:Developer>
</gjob:Developers>
<gjob:Contact>
<gjob:Person>Nathan Clemons</gjob:Person>
<gjob:Email>nathan@windsofstorm.net</gjob:Email>
<gjob:Company>
</gjob:Company>
<gjob:Organisation>
</gjob:Organisation>
<gjob:Webpage>
</gjob:Webpage>
<gjob:Snailmail>
</gjob:Snailmail>
<gjob:Phone>
</gjob:Phone>
</gjob:Contact>
<gjob:Requirements>
The program should be released as free software, under the GPL.
</gjob:Requirements>
<gjob:Skills>
</gjob:Skills>
<gjob:Details>
A GNOME based system that will allow a superuser to configure
compressed and uncompressed files and/or file systems to be backed
up with a supported media in the system. This should be able to
perform via find commands generating a list of files that are passed
to tar, dd, cpio, cp, gzip, etc., to be directed to the tape machine
or via operations performed on the filesystem itself. Email
notification and GUI status display very important.
</gjob:Details>
</gjob:Job>
</gjob:Jobs>
</gjob:Helping>

把XML文件加載到一個內部DOM樹中只是調用幾個函數的問題，而遍歷整個樹來收集數據，並生成內部結構則更困難，也更容易出錯。
對輸入結構的定義法則是非常寬鬆的。屬性的順序無關緊要（XML規範清楚地說明了這一點），不要依賴於一個節點的子節點順序通常是一個好的主意，除非這樣做真的使事情變得更困難了。下面是解析person信息的一段代碼：

[cpp] view plain copy

/*
 * A person record.
 * Of these fields, parsePerson() fills in only name and email; the remaining
 * fields are not assigned anywhere in the code shown here.
 */
typedef struct person {
char *name;          /* text of the <Person> element */
char *email;         /* text of the <Email> element */
char *company;
char *organisation;
char *smail;
char *webPage;
char *phone;
} person, *personPtr;
/*
 * Build a person record from the children of cur.
 *
 * Only children that belong to the namespace ns are considered: <Person>
 * supplies the name and <Email> the e-mail address. Returns a zero-initialised
 * record allocated with malloc(), or NULL when memory runs out.
 */
personPtr parsePerson(xmlDocPtr doc, xmlNsPtr ns, xmlNodePtr cur) {
    personPtr rec;
    xmlNodePtr child;

    DEBUG("parsePerson\n");

    rec = (personPtr) malloc(sizeof(person));
    if (rec == NULL) {
        fprintf(stderr,"out of memory\n");
        return(NULL);
    }
    memset(rec, 0, sizeof(person));

    /* The name of the enclosing element itself is not checked. */
    for (child = cur->xmlChildrenNode; child != NULL; child = child->next) {
        if ((child->ns == ns) && (strcmp(child->name, "Person") == 0)) {
            rec->name = xmlNodeListGetString(doc, child->xmlChildrenNode, 1);
        } else if ((child->ns == ns) && (strcmp(child->name, "Email") == 0)) {
            rec->email = xmlNodeListGetString(doc, child->xmlChildrenNode, 1);
        }
    }
    return(rec);
}

    下面是要注意的一些事項：
   （1）通常一個遞歸的解析風格是更方便的：XML數據天然地遵循重複式地構造，並且是高度結構化的。
   （2）兩個參數是xmlDocPtr和xmlNsPtr類型，即指向XML文檔和應用程序保留的命名空間的指針。文檔信息非常廣泛，爲你的應用程序數據集定義一個命名空間並測試元素和屬性是否屬於這個空間是一個好的編程實踐。這隻需一個簡單的相等測試（cur->ns == ns）。
   （3）爲了查詢文本和屬性值，你可以使用函數xmlNodeListGetString()來獲取所有文本，和由DOM輸出生成的引用節點，並生成一個單一的文本字符串。
   下面是解析另外一個結構的代碼片段：

[cpp] view plain copy

#include <libxml/tree.h>
/*
 * Description of a job.
 * parseJob() fills in projectID, application, category and contact;
 * nbDevelopers and developers are not assigned anywhere in the code shown here.
 */
typedef struct job {
char *projectID;     /* value of the ID attribute of <Project> */
char *application;   /* text of the <Application> element */
char *category;      /* text of the <Category> element */
personPtr contact;   /* record built by parsePerson() from <Contact> */
int nbDevelopers;
personPtr developers[100]; /* using dynamic alloc is left as an exercise */
} job, *jobPtr;
/*
 * Build a job record from the children of cur.
 *
 * Only children that belong to the namespace ns are considered: <Project>
 * supplies the ID attribute, <Application> and <Category> supply text, and
 * <Contact> is handed to parsePerson(). Returns a zero-initialised record
 * allocated with malloc(), or NULL when memory runs out.
 */
jobPtr parseJob(xmlDocPtr doc, xmlNsPtr ns, xmlNodePtr cur) {
    jobPtr rec;
    xmlNodePtr child;

    DEBUG("parseJob\n");

    rec = (jobPtr) malloc(sizeof(job));
    if (rec == NULL) {
        fprintf(stderr,"out of memory\n");
        return(NULL);
    }
    memset(rec, 0, sizeof(job));

    /* The name of the enclosing element itself is not checked. */
    for (child = cur->xmlChildrenNode; child != NULL; child = child->next) {
        if ((child->ns == ns) && (strcmp(child->name, "Project") == 0)) {
            rec->projectID = xmlGetProp(child, "ID");
            if (rec->projectID == NULL) {
                fprintf(stderr, "Project has no ID\n");
            }
        } else if ((child->ns == ns) && (strcmp(child->name, "Application") == 0)) {
            rec->application = xmlNodeListGetString(doc, child->xmlChildrenNode, 1);
        } else if ((child->ns == ns) && (strcmp(child->name, "Category") == 0)) {
            rec->category = xmlNodeListGetString(doc, child->xmlChildrenNode, 1);
        } else if ((child->ns == ns) && (strcmp(child->name, "Contact") == 0)) {
            rec->contact = parsePerson(doc, ns, child);
        }
    }
    return(rec);
}

一旦你會使用libxml2，編寫這種類型的代碼是非常簡單的，也很無趣。最終，你可以寫一個擁有C數據結構和一組XML文檔例子或一個XML DTD的樁模塊，並生成在C數據和XML存儲之間導入和導出數據的代碼。

    7、詳細代碼示例
   對Libxml2更詳細的使用介紹，可參考官方的詳細代碼示例http://xmlsoft.org/examples/index.html。上面提供了Libxml2各個組件怎麼使用的詳細代碼示例，包括以下部分：
   xmlWriter: 測試xmlWriter的各個API，包括寫入到文件、寫入到內存緩衝區、寫入到新的文檔或子樹、字符串編碼轉換、對輸出文檔進行序列化。
   InputOutput: 演示使用xmlRegisterInputCallbacks來建立一個客戶I/O層，這被用在XInclude方法上下文中，以顯示怎樣構建動態文檔。還演示使用xmlDocDumpMemory來輸出文檔到字符緩衝區中。
   Parsing: 演示使用xmlReadMemory()讀取XML文檔，xmlFreeDoc()釋放文檔樹；使用xmlCreatePushParserCtxt()和xmlParseChunk()一塊一塊地讀取XML文檔到文檔樹中。演示爲XML文檔創建一個解析上下文，然後解析並驗證這個文檔；創建一個文檔樹，檢查並驗證結果，最後用xmlFreeDoc()釋放文檔樹。演示使用xmlReadFile()讀取XML文檔並用xmlFreeDoc()釋放它。
   Tree: 演示怎樣創建文檔和節點，並把數據dump到標準輸出或文件中。演示使用xmlDocGetRootElement()獲取根元素，然後遍歷文檔並打印各個元素名。
   XPath: 演示怎樣計算XPath表達式，並在XPath上下文註冊名稱空間，打印結果節點集。演示怎麼加載一個文檔、用XPath定位到某個子元素、修改這個元素並保存結果。這包含了加載/編輯/保存的一個完整來回。
   xmlReader: 演示使用xmlReaderForFile()解析XML文檔，並dump出節點的信息。演示在用xmlReaderForFile()解析時驗證文檔的內容，激活各種選項，諸如實體替換、DTD屬性不一致等。演示使用xmlTextReaderPreservePattern()提取XML文檔中某一部分的子文檔。演示重用xmlReader對象來解析多個XML文檔。

libxml2剖析(3)：使用教程

linux安裝cuda和cudnn

模擬手機設備：使用 Playwright 實現移動端自動化測試

Mellanox網卡開啓SR-IOV

全面系統的AI學習路徑，幫助普通人也能玩轉AI

HTML 00 Tutorial

uni-app實現上拉加載

vue3編譯優化之“靜態提升”

又是一個月-20240513

flask 如何保證返回json有序

linux服務器設置ssh免密

ffmpeg解碼的軟解及硬解（cuda和qsv）使用方法

Linux下高併發socket最大連接數所受的各種限制

epoll用法說明，ET模式下的邊緣觸發處理同時多事件

Windows服務器高併發處理IOCP（完成端口）詳細說明

epoll詳解

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結