# xml_minidom_parse1

 

from xml.dom.minidom import parse

 

# Module-level lookup tables read by the helper functions below.

# Accepted values of the "name" attribute (read by handleData).
nameList = [
    'proc1',
    'proc2',
    'ftp',
]

# Attribute names of interest; not referenced by the code visible in this file.
nodeIDlist = [
    'name',
    'server',
    'user',
    'password',
]

# Attribute name -> list of accepted values, or False to accept any value
# (read by matchID and formatData).
matchDict = {
    'name': ['proc1', 'proc2'],
    'server': ['ftp'],
    'user': False,
    'password': False,
}

 

def getText(nodelist):
    """Join the data of the non-blank text nodes in *nodelist* with newlines.

    Nodes that are not text nodes are ignored, as are text nodes whose data
    is empty or whitespace only.  The data of the remaining text nodes is
    returned unmodified (surrounding whitespace is kept), one entry per node,
    separated by '\\n'.  An empty *nodelist* yields ''.
    """
    pieces = [
        child.data
        for child in nodelist
        # The strip() is only an emptiness test; the raw data is what we keep.
        if child.nodeType == child.TEXT_NODE and child.data.strip()
    ]
    return '\n'.join(pieces)

 

def handleData(nodelist, *args):
    """Return the first *args*-tagged element under each accepted node.

    A node is accepted when the value of its "name" attribute is listed in
    the module-level ``nameList``.  For every accepted node and every tag
    name in *args*, the first element returned by
    ``node.getElementsByTagName(tag)`` is collected; tags with no match
    contribute nothing.

    Nodes without a "name" attribute are reported on stdout and skipped.

    Returns a list of DOM elements, ordered by node and then by tag.
    """
    resList = []
    for node in nodelist:
        # Only the attribute lookup can raise KeyError, so keep the try
        # block limited to it.
        try:
            name = str(node.attributes["name"].value)
        except KeyError as e:
            # Single-argument print() produces the same output on
            # Python 2 and Python 3.
            print('Invalid node atribute: %s' % e)
            continue
        if name not in nameList:
            continue
        for arg in args:
            resList.append(node.getElementsByTagName(arg))
    # Drop empty result lists and keep only the first hit of each.
    return [item[0] for item in resList if item]

 

def nodeName(node):
    """Return the tag name of an element node, or '' for anything else.

    The name is read from the DOM API (``nodeType`` / ``tagName``) rather
    than parsed out of ``repr(node)``: the repr-based approach truncated
    qualified names such as ``ns:item`` at the colon and returned garbage
    for text or comment nodes whose data happened to contain a colon.

    Objects that do not provide the DOM attributes (e.g. ``None``) yield ''.
    """
    try:
        if node.nodeType == node.ELEMENT_NODE:
            return node.tagName
    except AttributeError:
        # Not a DOM node at all; treat it like a node without a tag name.
        pass
    return ''

 

def matchID(node):
    """Tell whether *node* has at least one attribute accepted by matchDict.

    For each attribute name in the module-level ``matchDict`` the node's
    attribute value is looked up.  The attribute is accepted when the
    ``matchDict`` entry is falsy (e.g. ``False``: any value is fine) or when
    the value is contained in the entry.

    Attributes the node does not have, and nodes without a usable
    ``attributes`` mapping, simply do not match.

    Returns True on the first accepted attribute, otherwise False.
    """
    for attr_name in matchDict:
        allowed = matchDict[attr_name]
        try:
            value = str(node.attributes[attr_name].value)
            if not allowed or value in allowed:
                return True
        except (KeyError, TypeError, AttributeError, UnicodeError):
            # Missing attribute, node without attributes, non-container
            # filter entry or un-encodable value: treat as "no match" for
            # this attribute.  Unlike a bare except, this lets
            # KeyboardInterrupt / SystemExit propagate.
            continue
    return False

 

def formatData(nodelist, *args):
    """Build a text report of matching attributes and selected child text.

    For every node in *nodelist*:

    * each attribute named in the module-level ``matchDict`` that the node
      has, and whose value is accepted (entry falsy, or value contained in
      the entry), adds a line ``'<node name> <attribute>=<value>'``;
    * if ``matchID(node)`` is true, each tag name in *args* for which the
      node has at least one descendant element adds a line
      ``'  <tag>: <text>'`` with the text of the first such element.

    Attributes missing on a node and tags without any element are skipped
    silently.

    Returns the collected lines joined with '\\n'.
    """
    resList = []
    for node in nodelist:
        node_name = nodeName(node)
        for attr_name in matchDict:
            try:
                s = str(node.attributes[attr_name].value)
            except KeyError:
                # The node does not carry this attribute.
                continue
            allowed = matchDict[attr_name]
            if not allowed or s in allowed:
                resList.append('%s %s=%s' % (node_name, attr_name, s))

        # The match test does not depend on the tag, so do it once per node.
        if matchID(node):
            for arg in args:
                found = node.getElementsByTagName(arg)
                if found:
                    resList.append(
                        '  %s: %s' % (arg, getText(found[0].childNodes)))
    return '\n'.join(resList)

 

def getDataList(nodelist, **kargs):
    """Filter nodes by attribute, print their children and list the matches.

    *kargs* maps a node name (as returned by ``nodeName``) to a dict of
    ``attribute name -> accepted values``.  An entry that is falsy (e.g.
    ``False``) accepts any value of that attribute.

    For every node in *nodelist* whose name is a key of *kargs*, each
    accepted attribute adds ``'<node name> <attribute>=<value>'`` to the
    result.  A node with at least one accepted attribute is also printed:
    a ``DOM element = <name>`` header followed by one indented line per
    child that produces non-blank output (``<child tag>: <child text>``).

    Returns the list of match strings.
    """
    resList = []
    for node in nodelist:
        node_name = nodeName(node)
        if node_name not in kargs:
            continue
        filters = kargs[node_name]

        # Track matches per node; a flag shared across the whole loop would
        # cause every node after the first match to be printed as well.
        match_id = False
        for attr_name in filters:
            try:
                s = str(node.attributes[attr_name].value)
            except KeyError:
                # The node does not carry this attribute.
                continue
            allowed = filters[attr_name]
            if not allowed or s in allowed:
                resList.append('%s %s=%s' % (node_name, attr_name, s))
                match_id = True
        if not match_id:
            continue

        # Single-argument print() gives identical output on Python 2 and 3.
        if node.nodeType == node.ELEMENT_NODE:
            print('DOM element = %s' % node.nodeName)
            lines = []
            for elem in node.childNodes:
                nm = nodeName(elem)
                prefix = nm + ': ' if nm else ''
                lines.append('  %s%s' % (prefix, getText(elem.childNodes)))
            # Whitespace-only children produce blank lines; leave them out.
            print('\n'.join([line for line in lines if line.strip()]))
        elif node.nodeType == node.TEXT_NODE:
            # getText expects a sequence of nodes, not a single node.
            print('Text Node Text = %s' % getText([node]))
    return resList

 

# Path of the XML document processed by this script (hard-coded).
fn = r'H:\TEMP\temsys\sampleXML.txt'

# Parse the file when the module is executed; dom1 is queried by the
# top-level statements further down.
dom1 = parse(fn)

 

'''

for item in dom1.getElementsByTagName("copyfile"):

    print getText(item.childNodes)

 

print

root_elements = dom1.getElementsByTagName('xml')

print root_elements

for item in handleData(root_elements, "mkdir", "copyfile"):

    print getText(item.childNodes)

 

print

process_elements = dom1.getElementsByTagName('process')

print process_elements

 

print handleData(process_elements, "mkdir", "copyfile")

 

for item in handleData(process_elements, "mkdir", "copyfile"):

    print getText(item.childNodes)

 

print

xList = [elem.attributes for elem in process_elements]

for item in xList:

    for tup in item.items():

        for s in tup:

            print str(s)

 

for item in xList:

    print item.items()

print

 

elemList = handleData(process_elements, "mkdir", "copyfile")

for elem in elemList:

    print '%s name="%s"' % (repr(elem.parentNode).split(':')[1].split()[0], \

                              str(elem.parentNode.attributes.get('name').value))

    print '    %s: %s' % (repr(elem).split(':')[1].split()[0], getText(elem.childNodes))

 

print

print formatData(process_elements, "mkdir", "copyfile")   

print

'''

 

   

'''

print

print xList[0].values()[0].hasChildNodes()

print xList[0].values()[0].firstChild

print xList[0].values()[0].lastChild

print xList[0].values()[0].childNodes

print xList[0].values()[0].lastChild.nodeName

print xList[0].values()[0].lastChild.childNodes # ()

print xList[0].values()[0].lastChild.hasChildNodes()

print xList[0].getNamedItem(str(xList[0].keys()[0]))

print dom1.lastChild

 

print

print root_elements[0]

print root_elements[0].firstChild

print root_elements[0].firstChild.hasChildNodes()

 

 

process_elements = dom1.getElementsByTagName('process')

print process_elements

 

print formatData(process_elements, "mkdir", "copyfile")   

print

 

download_elements = dom1.getElementsByTagName('download')

print download_elements

 

s = formatData(download_elements, "destination", "unzip")

print s

 

#print download_elements[0].attributes['server'].value

#print download_elements[0].attributes['user'].value

#print download_elements[0].attributes['password'].value

'''

# Select the <process> and <download> elements of the parsed document.
process_elements = dom1.getElementsByTagName('process')
download_elements = dom1.getElementsByTagName('download')

# Per element name: attribute -> accepted values (False accepts any value).
elemDict = {'process': {'name': ["proc1", "proc2"]}, 'download': {'server': ['ftp', ], 'user': False, 'password': False}}

# getDataList prints the matching elements and returns the match strings.
x = getDataList(process_elements, **elemDict)
y = getDataList(download_elements, **elemDict)

# Second approach: pull the <mkdir> contents out of the raw file text with a
# regular expression instead of the DOM.
import re

tag = '<mkdir>'
closetag = '</mkdir>'

# Non-greedy group so two <mkdir> elements on one line stay separate.
patt = re.compile(r'%s(.+?)%s' % (tag, closetag))

# Close the file deterministically instead of relying on garbage collection.
with open(fn) as xml_file:
    s = xml_file.read()

mkdirList = patt.findall(s)

# Single-argument print() gives identical output on Python 2 and 3.
print('')
print(mkdirList)

 

''' Sample output:

>>> src1,dst1

src2,dst2

src3,dst3

src4,dst4

[<DOM Element: process at 0xe81350>, <DOM Element: process at 0xe81508>, <DOM Element: process at 0xe81648>, <DOM Element: process at 0xe81800>]

[<DOM Element: mkdir at 0xe813f0>, <DOM Element: copyfile at 0xe81468>, <DOM Element: copyfile at 0xe815a8>]

directory1

src1,dst1

src2,dst2

 

name

proc1

name

proc2

name

proc3

name

proc4

>>>

'''

 

'''

>>> dom1.firstChild.hasChildNodes()

True

>>> getText([dom1.firstChild, ])

''

>>> dom1.firstChild.nodeType

1

>>> dom1.TEXT_NODE

3

>>> dom1.DOCUMENT_NODE

9

>>> dom1.ATTRIBUTE_NODE

2

>>> dom1.CDATA_SECTION_NODE

4

>>> dom1.ELEMENT_NODE

1

>>> dom1.firstChild.getElementsByTagName('process')

[<DOM Element: process at 0xf04e18>, <DOM Element: process at 0xf04c38>]

>>> process_elements[0].childNodes

[<DOM Text node "\n\t   ">, <DOM Element: mkdir at 0xf04e68>, <DOM Text node "\n  \t   ">, <DOM Element: copyfile at 0xf04d50>, <DOM Text node "\n  ">]

>>> process_elements[1].childNodes

[<DOM Text node "\n  \t ">, <DOM Element: copyfile at 0xf04260>, <DOM Text node "\n  ">]

>>>

>>> str(process_element1[0].tagName)

'process'

>>>

'''

 

 

'''

datasource = open('c:\\temp\\mydata.xml')

dom2 = parse(datasource)   # parse an open file

 

dom3 = parseString('<myxml>Some data<empty/> some more data</myxml>')

'''

 

''' Output

>>> DOM element = process

  mkdir: directory1

  mkdir: directory11

  mkdir: directory111

  copyfile: src1,dst1

DOM element = process

  copyfile: src2,dst2

DOM element = process

  mkdir: directory3

  mkdir: directory333333333

  copyfile: src3,dst3

DOM element = process

  mkdir: directory4

  copyfile: src4,dst4

DOM element = process

  download:

DOM element = download

  destination: path

  unzip: *.jpg, *.doc, *.pdf

 

['directory1', 'directory11', 'directory111', 'directory3', 'directory333333333', 'directory4']

>>>

'''

 

''' XML Data

<xml>

<process name="proc1">

<mkdir>directory1</mkdir>

<mkdir>directory11</mkdir>

<mkdir>directory111</mkdir>

<copyfile>src1,dst1</copyfile>

</process>

<process name="proc2">

<copyfile>src2,dst2</copyfile>

</process>

<process name="proc3">

<mkdir>directory3</mkdir>

<mkdir>directory333333333</mkdir>

<copyfile>src3,dst3</copyfile>

</process>

<process name="proc4">

<mkdir>directory4</mkdir>

<copyfile>src4,dst4</copyfile>

</process>

<process name='download'>

<download server='ftp' user='username' password='******'>

<destination>path</destination>

<unzip>*.jpg, *.doc, *.pdf</unzip>

</download>

</process>

<dummy id='test'>

<date>December 8, 2007</date>

</dummy>

</xml>

'''