python解析xml文件增刪查找

python解析xml文件增刪查找

XML文件

在這裏插入圖片描述

判斷bndbox大小,並爲object添加子節點difficult

#!/usr/bin/env python
# -*- coding:utf-8 -*-

import os
import os.path
from xml.etree import ElementTree as ET

# Batch-process Pascal VOC style annotation files: every <object> receives a
# <difficult> child whose value depends on the size of its <bndbox>.
path = 'C:/Users/my/Desktop/xmllll/city/1'
# Same folder with a trailing slash; the files are rewritten in place.
new_path = 'C:/Users/my/Desktop/xmllll/city/1/'


def _mark_difficult(tree, min_side):
    """Set the <difficult> flag on every <object> of *tree*.

    For each <bndbox> directly below an <object>, the box width
    (xmax - xmin) and height (ymax - ymin) are compared with *min_side*:
    the flag is "0" when both are strictly larger, otherwise "1".

    An existing <difficult> child is updated in place so that running the
    script twice does not stack duplicate nodes; otherwise a new node is
    inserted at index 1 of the <object>.

    Returns the number of <object> flags that were written.
    """
    written = 0
    for obj in tree.findall('./object'):
        # findall() returns a list, so inserting into obj below is safe.
        for bndbox in obj.findall('bndbox'):
            xmin = int(bndbox.find('xmin').text)
            ymin = int(bndbox.find('ymin').text)
            xmax = int(bndbox.find('xmax').text)
            ymax = int(bndbox.find('ymax').text)
            is_large = (xmax - xmin) > min_side and (ymax - ymin) > min_side
            flag = "0" if is_large else "1"

            element = obj.find('difficult')
            if element is None:
                element = ET.Element("difficult")
                obj.insert(1, element)
            element.text = flag
            written += 1
    return written


files = os.listdir(path)  # names of all entries in the folder
my_area = int(input("請輸入area區域大小:"))
# my_area = 20
for xmlFile in files:
    xml_path = os.path.join(new_path, xmlFile)
    # Test the full path: a bare name would be resolved against the current
    # working directory and sub-folders would never be skipped.
    if os.path.isdir(xml_path):
        continue
    print(xmlFile)
    per = ET.parse(xml_path)
    # The tree is only modified in memory; write it back once per file
    # instead of once per object.
    if _mark_difficult(per, my_area):
        per.write(xml_path, encoding="utf-8", xml_declaration=True)
    print('-------')

  1. 該文件可以批處理xml文件
  2. 可以添加子節點,索引
  3. 但是要注意速度問題:速度慢主要不是因爲for循環多,而是如果在最內層循環中每插入一個節點就調用一次per.write,整個文件會被反覆重寫;每個文件處理完所有object後只寫一次即可
  4. 注意:修改的xml文件是緩存在內存當中的,因此一定要保存
  5. 添加子節點可以用append或者insert,指定位置用insert

Result

在這裏插入圖片描述

xml增刪查找

# -*- coding: utf-8 -*-
"""
Created on Mon Mar 18 17:36:45 2019
@author: psqk
"""
#!/usr/bin/env python
# -*- coding: utf8 -*-

from xml.etree import ElementTree
from xml.etree.ElementTree import Element, SubElement
from lxml import etree
import codecs

XML_EXT = '.xml'
ENCODE_METHOD = 'utf-8'

class PascalVocWriter:
    """Build a Pascal VOC style annotation document and write it to disk.

    Bounding boxes are collected with :meth:`addBndBox` and serialized by
    :meth:`save`.
    """

    def __init__(self, foldername, filename, imgSize,databaseSrc='Unknown', localImgPath=None):
        """Store the document metadata.

        Args:
            foldername: text of the <folder> element.
            filename: base name used for the <filename> element and, when
                ``save`` gets no explicit target, for the output file.
            imgSize: sequence indexed as (height, width[, depth]).
            databaseSrc: text of <source>/<database>.
            localImgPath: optional text of the <path> element.
        """
        self.foldername = foldername
        self.filename = filename
        self.databaseSrc = databaseSrc
        self.imgSize = imgSize
        self.boxlist = []
        self.localImgPath = localImgPath
        self.verified = False

    def prettify(self, elem):
        """Return *elem* as pretty-printed, tab-indented XML bytes.

        The element is serialized with the standard library, re-parsed with
        lxml for pretty printing, and every run of two spaces is then
        replaced by a tab.
        """
        rough_string = ElementTree.tostring(elem, 'utf8')
        root = etree.fromstring(rough_string)
        pretty = etree.tostring(root, pretty_print=True, encoding=ENCODE_METHOD)
        # NOTE: the replacement is applied to the whole serialized document,
        # so two consecutive spaces inside element text are converted too.
        return pretty.replace("  ".encode(), "\t".encode())

    def genXML(self):
        """Create the <annotation> root with folder, filename, source and size.

        Returns:
            The root element, or None when filename, foldername or imgSize
            is None.
        """
        if self.filename is None or \
                self.foldername is None or \
                self.imgSize is None:
            return None

        top = Element('annotation')
        if self.verified:
            top.set('verified', 'yes')

        folder = SubElement(top, 'folder')
        folder.text = self.foldername

        # The image name is the last six characters of self.filename
        # followed by '.jpg'.
        filename = SubElement(top, 'filename')
        filename.text = self.filename[-6:]+'.jpg'

        if self.localImgPath is not None:
            localImgPath = SubElement(top, 'path')
            localImgPath.text = self.localImgPath

        source = SubElement(top, 'source')
        database = SubElement(source, 'database')
        database.text = self.databaseSrc
        annotation = SubElement(source,'annotation')
        annotation.text = 'PASCAL VOC2007'

        size_part = SubElement(top, 'size')
        width = SubElement(size_part, 'width')
        height = SubElement(size_part, 'height')
        depth = SubElement(size_part, 'depth')
        width.text = str(self.imgSize[1])
        height.text = str(self.imgSize[0])
        if len(self.imgSize) == 3:
            depth.text = str(self.imgSize[2])
        else:
            # No depth supplied: written as a single channel.
            depth.text = '1'
        return top

    def addBndBox(self, xmin, ymin, xmax, ymax, name, difficult):
        """Queue one labelled bounding box for the next ``save``."""
        bndbox = {'xmin': xmin, 'ymin': ymin, 'xmax': xmax, 'ymax': ymax}
        bndbox['name'] = name
        bndbox['difficult'] = difficult
        self.boxlist.append(bndbox)

    def appendObjects(self, top):
        """Append one <object> element to *top* for every queued box.

        Each <object> holds <name>, <difficult> ("1" or "0" from the
        truthiness of the stored flag) and <bndbox> with the four
        coordinates converted with ``str``.
        """
        for each_object in self.boxlist:
            object_item = SubElement(top, 'object')
            name = SubElement(object_item, 'name')
            try:
                name.text = unicode(each_object['name'])
            except NameError:
                # Py3: NameError: name 'unicode' is not defined
                name.text = each_object['name']
            difficult = SubElement(object_item, 'difficult')
            difficult.text = str(int(bool(each_object['difficult'])))
            bndbox = SubElement(object_item, 'bndbox')
            for key in ('xmin', 'ymin', 'xmax', 'ymax'):
                coordinate = SubElement(bndbox, key)
                coordinate.text = str(each_object[key])

    def save(self, targetFile=None):
        """Serialize the annotation and write it to disk.

        Args:
            targetFile: output path; defaults to ``self.filename + XML_EXT``.

        Raises:
            ValueError: if ``genXML`` cannot build a root because filename,
                foldername or imgSize is None.
        """
        root = self.genXML()
        if root is None:
            raise ValueError(
                "foldername, filename and imgSize must all be set before saving")
        self.appendObjects(root)

        # Serialize before opening the file, so a serialization failure does
        # not leave a truncated, empty target behind.
        document = self.prettify(root).decode('utf8')

        if targetFile is None:
            destination = self.filename + XML_EXT
        else:
            destination = targetFile
        # The context manager closes the handle even when write() fails.
        with codecs.open(destination, 'w', encoding=ENCODE_METHOD) as out_file:
            out_file.write(document)


class PascalVocReader:
    """Read the labelled bounding boxes of a Pascal VOC annotation file.

    Parsing is best effort: a failure during construction is not raised but
    stored in ``parse_error`` and the shapes parsed so far are kept.
    """

    def __init__(self, filepath):
        """Parse *filepath* immediately.

        Each entry of ``shapes`` is a tuple
        ``(label, [(x1, y1), (x2, y2), (x3, y3), (x4, y4)], None, None, difficult)``.
        """
        self.shapes = []
        self.filepath = filepath
        self.verified = False
        # Exception raised by parseXML during construction, or None.
        self.parse_error = None
        try:
            self.parseXML()
        except Exception as exc:
            # Still best effort, but KeyboardInterrupt/SystemExit are no
            # longer swallowed and the cause stays available to the caller.
            self.parse_error = exc

    def getShapes(self):
        """Return the list of shapes collected so far."""
        return self.shapes

    def addShape(self, label, bndbox, difficult):
        """Append one shape built from the corner values of *bndbox*.

        The four points are listed as (xmin, ymin), (xmax, ymin),
        (xmax, ymax), (xmin, ymax), with every coordinate converted by
        ``int``.
        """
        xmin = int(bndbox.find('xmin').text)
        ymin = int(bndbox.find('ymin').text)
        xmax = int(bndbox.find('xmax').text)
        ymax = int(bndbox.find('ymax').text)
        points = [(xmin, ymin), (xmax, ymin), (xmax, ymax), (xmin, ymax)]
        self.shapes.append((label, points, None, None, difficult))

    def parseXML(self):
        """Parse ``self.filepath`` and add one shape per <object>.

        Returns:
            True once every <object> has been added.

        Raises:
            AssertionError: if the path does not end with ``XML_EXT``.
        """
        # Raised explicitly (same exception type as before) so the check
        # also runs when assertions are disabled with ``python -O``.
        if not self.filepath.endswith(XML_EXT):
            raise AssertionError("Unsupport file format")
        parser = etree.XMLParser(encoding=ENCODE_METHOD)
        xmltree = ElementTree.parse(self.filepath, parser=parser).getroot()
        self.verified = xmltree.attrib.get('verified') == 'yes'

        for object_iter in xmltree.findall('object'):
            bndbox = object_iter.find("bndbox")
            label = object_iter.find('name').text
            # <difficult> is optional; a missing element means False.
            difficult_node = object_iter.find('difficult')
            difficult = False
            if difficult_node is not None:
                difficult = bool(int(difficult_node.text))
            self.addShape(label, bndbox, difficult)
        return True
    
class PascalVocReader2:
    """Variant of the VOC reader that also prints labels and returns a box.

    Parsing is best effort: a failure during construction is not raised but
    stored in ``parse_error`` and the shapes parsed so far are kept.
    """

    def __init__(self, filepath):
        """Parse *filepath* immediately.

        Each entry of ``shapes`` is a tuple
        ``(label, [(x1, y1), (x2, y2), (x3, y3), (x4, y4)], None, None, difficult)``.
        """
        self.shapes = []
        self.filepath = filepath
        self.verified = False
        # Exception raised by parseXML during construction, or None.
        self.parse_error = None
        try:
            self.parseXML()
        except Exception as exc:
            # Still best effort, but KeyboardInterrupt/SystemExit are no
            # longer swallowed and the cause stays available to the caller.
            self.parse_error = exc

    def getShapes(self):
        """Return the list of shapes collected so far."""
        return self.shapes

    def addShape(self, label, bndbox, difficult):
        """Append one shape built from the corner values of *bndbox*.

        The four points are listed as (xmin, ymin), (xmax, ymin),
        (xmax, ymax), (xmin, ymax), with every coordinate converted by
        ``int``.
        """
        xmin = int(bndbox.find('xmin').text)
        ymin = int(bndbox.find('ymin').text)
        xmax = int(bndbox.find('xmax').text)
        ymax = int(bndbox.find('ymax').text)
        points = [(xmin, ymin), (xmax, ymin), (xmax, ymax), (xmin, ymax)]
        self.shapes.append((label, points, None, None, difficult))

    def parseXML(self):
        """Parse ``self.filepath``, print each label and add one shape per <object>.

        Returns:
            ``(True, xmin, xmax, ymin, ymax)`` where the coordinates are the
            raw text values of the last <object> in the file, or None when
            the file contains no <object>.

        Raises:
            AssertionError: if the path does not end with ``XML_EXT``.
        """
        # Raised explicitly (same exception type as before) so the check
        # also runs when assertions are disabled with ``python -O``.
        if not self.filepath.endswith(XML_EXT):
            raise AssertionError("Unsupport file format")
        parser = etree.XMLParser(encoding=ENCODE_METHOD)
        xmltree = ElementTree.parse(self.filepath, parser=parser).getroot()
        self.verified = xmltree.attrib.get('verified') == 'yes'

        # Pre-set so the return below cannot hit unbound names when the
        # document has no <object> element.
        xmin = xmax = ymin = ymax = None
        for object_iter in xmltree.findall('object'):
            bndbox = object_iter.find("bndbox")
            xmin = bndbox.find('xmin').text
            xmax = bndbox.find('xmax').text
            ymin = bndbox.find('ymin').text
            ymax = bndbox.find('ymax').text

            label = object_iter.find('name').text
            print(label)
            # <difficult> is optional; a missing element means False.
            difficult_node = object_iter.find('difficult')
            difficult = False
            if difficult_node is not None:
                difficult = bool(int(difficult_node.text))
            self.addShape(label, bndbox, difficult)
        return True,xmin,xmax,ymin,ymax



'''
    data_path = 'VOC2007'
    xmax = 100
    ymax = 235
    xmin = 25
    ymin = 12
    pw = PascalVocWriter(data_path,filename = 'haha.jpg',imgSize = (12,12,3))
    pw.genXML()
    pw.addBndBox(xmin,ymin,xmax,ymax,'car',difficult = 1)
    pw.addBndBox(xmin*2,ymin,xmax*3,ymax,'carban',difficult = 0)
    pw.save()
    print("finished")
'''
if __name__ == '__main__':
    # Re-write Annotations/000001.xml ... Annotations/022062.xml into
    # output/, recomputing the <difficult> flag of every object from the
    # size of its bounding box.
    import os  # local import: only this entry point needs it

    # save() opens the target directly, so the folder has to exist first.
    if not os.path.isdir('output'):
        os.makedirs('output')

    for i in range(1,22063):
        pr = 'Annotations/{}.xml'.format(str(i).zfill(6))

        parser = etree.XMLParser(encoding=ENCODE_METHOD)
        xmltree = ElementTree.parse(pr, parser=parser).getroot()

        foldername = xmltree.find('folder').text
        file_name = xmltree.find('filename').text
        database_name = xmltree.find('source').find('database').text

        size = xmltree.find('size')
        width = size.find('width').text
        height = size.find('height').text
        depth = size.find('depth').text
        # PascalVocWriter indexes imgSize as (height, width, depth).
        img_size = (height,width,depth)

        # Drop the last four characters of the image name (its extension).
        file_names = ('output/{}'.format(file_name[:-4]))
        pw = PascalVocWriter(foldername,filename = file_names,imgSize = img_size,databaseSrc = database_name)

        for object_iter in xmltree.findall('object'):
            bndbox = object_iter.find("bndbox")
            xmin = bndbox.find('xmin').text
            xmax = bndbox.find('xmax').text
            ymin = bndbox.find('ymin').text
            ymax = bndbox.find('ymax').text
            label = object_iter.find('name').text
            # A box is flagged difficult only when both sides are under 15.
            if int(xmax)-int(xmin)<15 and int(ymax)-int(ymin)<15:
                difficult_val = 1
                print(pr[-10:])  # last ten characters of the path, i.e. the file name
            else:
                difficult_val = 0
            pw.addBndBox(xmin,ymin,xmax,ymax,label,difficult = difficult_val)
        # save() builds the document itself; no separate genXML() call is needed.
        pw.save()
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章