只要 XML 原文件的標籤完整、正確，哪怕是下圖形式的內容，都可以獲取標籤的內容。
代碼總結:
from lxml import etree
import xml.etree.ElementTree as ET
class WoFullParser(object):
    """Empty placeholder class.

    The visible source defines no attributes or methods for it, and the demo
    script below does not use it.
    """


# NOTE(review): the pasted source lost its indentation; the demo is assumed to
# be a module-level entry point rather than part of the class body -- confirm.
if __name__ == '__main__':
    # Single place for the sample file name used by both methods below.
    xml_path = 'WO2018172090A1.XML'

    # Method 1: lxml + XPath.
    # Read the raw bytes inside a context manager so the file handle is
    # always closed (previously it was opened and never closed).
    with open(xml_path, mode='rb') as xml_file:
        content = xml_file.read()
    # NOTE(review): etree.HTML feeds the XML bytes to lxml's HTML parser;
    # kept unchanged because the XPath result depends on that parser.
    selector = etree.HTML(content)
    # Text nodes of the <p> element whose id attribute is "p0006".
    p_list = selector.xpath('//p[@id="p0006"]/text()')
    print(p_list)

    # Method 2: xml.etree.ElementTree, with three traversal styles.
    tree = ET.ElementTree(file=xml_path)
    root = tree.getroot()
    print(root)

    # 2.1: walk the children of the root's first child. Guarded so that a
    # root element without children no longer raises IndexError.
    if len(root):
        for child_of_root in root[0]:
            print(child_of_root.tag, child_of_root.attrib, child_of_root.text)

    # 2.2: iterate every <p> element anywhere in the tree.
    for elem in tree.iter(tag='p'):
        print(elem.tag, elem.attrib, elem.text)

    # 2.3: iterate only the elements matching the path 'description/p'.
    for ele in tree.iterfind('description/p'):
        print(ele.attrib, ele.text)
參考:
https://www.cnblogs.com/deadwood-2016/p/8116863.html