Tutorial: XML Processing with Python

Extensible Markup Language (XML) is a markup language similar to HTML. It is useful for keeping track of data without using a database. XML files are also very popular as configuration files.

Now what if we want to process or edit XML Configuration files using Python?


Suppose we want to perform the following operations on XML files with a script rather than doing them manually:


To find a node based on any attribute in any hierarchy and even with the index  

To Comment Node(s)
To Delete Node(s) 
To Update Node(s)
To Insert Node(s)
To add attribute to Node(s)
Tutorial: XML Processing with Python


Here we provide a simple Python program with multiple functions to perform the above tasks. It uses a property file to describe the changes to make in the XML file.


================================================

Following is a sample XML file in which we want to make changes.

<?xml version="1.0" encoding="utf-8"?>

<!DOCTYPE hibernate-configuration PUBLIC
"-//Hibernate/Hibernate Configuration DTD 3.0//EN"
"http://www.hibernate.org/dtd/hibernate-configuration-3.0.dtd">

<hibernate-configuration>

<session-factory>
    <property name="hibernate.connection.driver_class">com.mysql.jdbc.Driver</property>
    <property name="hibernate.connection.url">jdbc:mysql://localhost:3306/discovertechno</property>
    <property name="hibernate.connection.username">root</property>
    <property name="hibernate.connection.password">password</property>
    <property name="hibernate.dialect">org.hibernate.dialect.MySQLDialect</property>
    <property name="show_sql">true</property>
    <property name="format_sql">true</property>
    <mapping resource="com/discovertechno/stock/Stock.hbm.xml" />
    <mapping resource="com/discovertechno/stock/StockDailyRecord.hbm.xml" />
</session-factory>

</hibernate-configuration>


================================================


Property File Structure:


#xmlLocation=standalone.xml

xmlLocation=hibernate.xml
#update=interfaces@index=0&interface@name=public&inet-address@index=0$@value=0.0.0.3#interfaces@index=0&interface@name=management&inet-address@index=0$@value=0.0.0.4
update=session-factory@index=0&property@name=hibernate.connection.username$@value=root#session-factory@index=0&property@name=hibernate.connection.password$@value=root@123
comment=session-factory@index=0&property@name=hibernate.connection.username#session-factory@index=0&property@name=hibernate.connection.password
insert=session-factory@index=0*<property name="hibernate.connection.username" value="root">root</property>#session-factory@index=0*<property name="hibernate.connection.password" value="root@123">root@123</property>

===============================================


#!/usr/local/bin/python2.7

from xml.dom import minidom
from xml.dom.minidom import parseString
from xml.parsers.expat import ExpatError

#Save XML File

def saveXML(xmlDoc, fileName):
    """Write the DOM document to a file, replacing any existing content.

    xmlDoc   -- object providing writexml(writer), e.g. a minidom Document
    fileName -- path of the file to create or overwrite

    Errors raised by open() or writexml() propagate to the caller.
    """
    # "with" guarantees the handle is closed even if writexml() raises
    with open(fileName, "w") as xmlFile:
        xmlDoc.writexml(xmlFile)

#To find a node based on any attribute in any hierarchy and even with the index    

def findNode(parentNode, nodePath):
    """Locate one descendant of parentNode described by nodePath.

    nodePath has the form "<tag>@<key>=<value>":
      * "<tag>@index=<n>" selects element number n (0-based) in the list
        returned by parentNode.getElementsByTagName(tag);
      * "<tag>@<attr>=<value>" selects the first element in that list
        whose attribute <attr> equals <value>.

    Returns the matching element, or None when parentNode is None, the
    path has no "@", the index is invalid, or nothing matches.
    """
    if parentNode is None:
        return None

    # Split on the first "@" only, so attribute values may contain "@"
    elementName, separator, attributeInfo = nodePath.partition("@")
    if not separator:
        print("Invalid node path (expected tag@name=value): %s" % nodePath)
        return None
    name, var = attributeInfo.partition("=")[::2]
    childElements = parentNode.getElementsByTagName(elementName)

    #To find element based on index
    if name == 'index':
        try:
            return childElements[int(var)]
        except (IndexError, ValueError):
            # ValueError: the index given in the path is not an integer
            print("Index Error: Verify Properties file for Valid XML element Hierarchy and its Index")
            return None

    #To find an element based on attribute from multiple elements
    for childElement in childElements:
        # Elements that lack the attribute are simply not a match
        if childElement.hasAttribute(name) and childElement.getAttribute(name) == var:
            return childElement
    return None
   
#Recursive procedure to find a node considering a hierarchy 
def getNode(parentNode, nodeList):
    """Walk a path of node descriptors down from parentNode.

    Each entry of nodeList is resolved with findNode() against the result
    obtained for the previous entry (the first entry is resolved against
    parentNode). Returns whatever findNode() yields for the last entry.
    """
    # The first segment is resolved explicitly, so an empty nodeList
    # still raises IndexError
    currentNode = findNode(parentNode, nodeList[0])
    for nodePath in nodeList[1:]:
        currentNode = findNode(currentNode, nodePath)
    return currentNode

def commentNodes(xmlDoc, commentSTR, fileName):
    """Comment out the elements named in commentSTR and save the document.

    commentSTR holds one or more element paths separated by "#"; each
    path is a "&"-separated hierarchy, for example
    session-factory@index=0&property@name=hibernate.connection.username

    Every located element is replaced by an XML comment holding its
    serialized markup, and the document is saved to fileName after each
    replacement. Processing stops at the first path that cannot be
    resolved.
    """
    for nodePath in commentSTR.split("#"):
        target = getNode(xmlDoc.documentElement, nodePath.split("&"))
        if target is None:
            print("Comment Operation Failed: Verify Properties file for Valid XML element Hierarchy, Index or Attribute Name")
            return
        # Put a comment carrying the element's XML in front of the element,
        # then drop the element itself.
        # (To un-comment later: parse the comment's data with
        # minidom.parseString and replaceChild the comment with the result.)
        container = target.parentNode
        commentNode = xmlDoc.createComment(target.toxml())
        container.insertBefore(commentNode, target)
        container.removeChild(target)
        saveXML(xmlDoc, fileName)
        print("Comment Operation Successfully Completed.")

def deleteNodes(xmlDoc, deleteSTR, fileName):
    """Remove the elements named in deleteSTR and save the document.

    deleteSTR holds one or more element paths separated by "#"; each
    path is a "&"-separated hierarchy, for example
    session-factory@index=0&property@name=hibernate.connection.username

    Each located element is detached from its parent and the document is
    saved to fileName after every removal. Processing stops at the first
    path that cannot be resolved.
    """
    for nodePath in deleteSTR.split("#"):
        #Find the node to delete
        ele = getNode(xmlDoc.documentElement, nodePath.split("&"))
        if ele is None:
            print("Delete Operation Failed: Verify Properties file for Valid XML element Hierarchy, Index or Attribute Name")
            return
        #Detach the node from its parent
        ele.parentNode.removeChild(ele)
        #save file
        saveXML(xmlDoc, fileName)
        print("Delete Operation Successfully Completed.")


def updateNodes(xmlDoc, updateSTR, fileName):
    """Update attribute or text values of the elements named in updateSTR.

    updateSTR holds one or more entries separated by "#". Each entry is
    "<element path>$<new value>", for example
    session-factory@index=0&property@name=hibernate.connection.username$@value=root

      * "$@<attr>=<value>" sets the existing attribute <attr> to <value>;
      * "$<label>=<value>" replaces the leading text of the element with
        <value> (the part before "=" is ignored).

    The document is saved to fileName after each successful update.
    Processing stops, with a failure message, at the first entry that
    cannot be applied.
    """
    failureMessage = "Update Operation Failed: Verify Properties file for Valid XML element Hierarchy, Index or Attribute Name"
    #If multiple elements need to be modified then they are separated with the "#" delimiter
    for elementSTR in updateSTR.split("#"):
        # "$" separates the path to the element from the replacement value
        enode, val = elementSTR.partition("$")[::2]
        val = val.rstrip()
        #Find the XML node in the DOM
        ele = getNode(xmlDoc.documentElement, enode.strip().split("&"))
        # An entry without "$<value>" leaves nothing to apply
        if ele is None or not val:
            print(failureMessage)
            return

        if val.startswith("@"):
            # Leading "@" means an attribute value is being replaced
            attributeName, attributeVal = val[1:].partition("=")[::2]
            # Only existing attributes are updated here; a missing one is
            # reported instead of raising KeyError
            if not ele.hasAttribute(attributeName):
                print(failureMessage)
                return
            ele.setAttribute(attributeName, attributeVal)
        else:
            # Otherwise the text value of the element is being replaced
            attributeVal = val.partition("=")[2]
            textNode = ele.firstChild
            if textNode is None:
                # Empty element: there is no text node yet, so create one
                ele.appendChild(xmlDoc.createTextNode(attributeVal))
            elif textNode.nodeType in (textNode.TEXT_NODE, textNode.CDATA_SECTION_NODE):
                textNode.replaceWholeText(attributeVal)
            else:
                # First child is an element/comment, not replaceable text
                print(failureMessage)
                return

        #save file
        saveXML(xmlDoc, fileName)
        print("Update Operation Successfully Completed.")

#Insert a node that may have multiple child nodes and any number of attributes; this function takes a string as input, converts it into an XML element and appends it to the parent element

def insertNodes(xmlDoc, insertSTR, fileName):
    """Append new elements, given as XML text, under the named parents.

    insertSTR holds one or more entries separated by "#". Each entry is
    "<parent path>*<xml fragment>", for example
    session-factory@index=0*<property name="hibernate.connection.username" value="root">root</property>

    The fragment is parsed with parseString() and its document element is
    appended to the located parent; the document is saved to fileName
    after each insertion. Processing stops at the first parent path that
    cannot be resolved.
    """
    for entry in insertSTR.split("#"):
        # "*" divides the parent element path from the XML text to insert
        parentPath, fragment = entry.partition("*")[::2]
        parentElement = getNode(xmlDoc.documentElement, parentPath.strip().split("&"))
        if parentElement is None:
            print("Insert Operation Failed: Verify Properties file for Valid XML element Hierarchy, Index or Attribute Name")
            return
        # Parsing the text yields the complete node with all its attributes
        # and children, so nothing has to be built up by hand
        newElement = parseString(fragment).documentElement
        parentElement.appendChild(newElement)
        saveXML(xmlDoc, fileName)
        print("Insert Operation Successfully Completed.")

def addAttribute(xmlDoc, attributeSTR, fileName):
    """Set an attribute on each element named in attributeSTR.

    attributeSTR holds one or more entries separated by "#". Each entry
    is "<element path>$<name>=<value>"; a leading "@" before <name> is
    accepted and dropped.

    The attribute is set with setAttribute() and the document is saved
    to fileName after each entry. Processing stops, with a failure
    message, at the first entry whose element cannot be found or whose
    attribute name is empty.
    """
    failureMessage = "Add Attribute Operation Failed: Verify Properties file for Valid XML element Hierarchy, Index or Attribute Name"
    #If multiple elements need an attribute then they are separated with the "#" delimiter
    for attribute in attributeSTR.split("#"):
        enode, val = attribute.partition("$")[::2]
        val = val.rstrip()
        #Find the XML node in the DOM
        ele = getNode(xmlDoc.documentElement, enode.strip().split("&"))
        attributeName, attributeVal = val.partition("=")[::2]
        # Accept the "@name=value" spelling used by update entries as well
        if attributeName.startswith("@"):
            attributeName = attributeName[1:]
        # An empty attribute name would produce malformed XML on save
        if ele is None or not attributeName:
            print(failureMessage)
            return
        ele.setAttribute(attributeName, attributeVal)
        #save file
        saveXML(xmlDoc, fileName)
        print("Add Attribute Operation Successfully Completed.")
       
#Convert Property file into Name Value Pair    
try:
    myvars = {}
    with open("xmlProp.properties") as myfile:
        for line in myfile:
            # Blank lines and "#" comment lines carry no setting
            if not line.strip() or line.lstrip().startswith("#"):
                continue
            name, var = line.partition("=")[::2]
            myvars[name.strip()] = var.rstrip()
except IOError:
    print("Properties File does not exist.")
else:
    if "xmlLocation" not in myvars:
        # Without the target file name none of the operations can run
        print("xmlLocation is not specified in Properties File.")
    else:
        xmlLocation = myvars["xmlLocation"]
        # (property key, handler, label used in the "not specified" message),
        # listed in the order the operations are applied
        operations = [
            ("update", updateNodes, "Update"),
            ("comment", commentNodes, "Comment"),
            ("insert", insertNodes, "Insert"),
            ("delete", deleteNodes, "Delete"),
            ("addattribute", addAttribute, "Add Attribute"),
        ]
        try:
            #Parse XML Document
            xmlDoc = minidom.parse(xmlLocation)
            for key, handler, label in operations:
                if key in myvars:
                    handler(xmlDoc, myvars[key], xmlLocation)
                else:
                    print("No %s operation is specified for XML File" % label)
        except ExpatError:
            print("XML file is not Well-Formed. Please check the XML Structure.")
        except IOError:
            print("XMl file does not exist.")

Labels: , ,