# xml_minidom_parse1
from
xml.dom.minidom import parse
# Module-level lookup tables read by the helper functions in this file.
#
# nameList   -- values of the "name" attribute that handleData accepts.
# nodeIDlist -- attribute names of interest (no function visible in this
#               file reads it; kept for any external users).
# matchDict  -- attribute name -> list of accepted values; a false value
#               means "any value is accepted". Read by matchID and
#               formatData.
nameList = ["proc1", "proc2", 'ftp']
nodeIDlist = ['name', 'server', 'user', 'password']
matchDict = {
    'name': ["proc1", "proc2"],
    'server': ['ftp', ],
    'user': False,
    'password': False,
}
def getText(nodelist):
    """Return the text held by the non-blank text nodes in *nodelist*.

    nodelist -- an iterable of DOM nodes (for example ``element.childNodes``)
                or a single DOM node.

    Only nodes whose ``nodeType`` is ``TEXT_NODE`` contribute, and nodes
    whose data is empty or whitespace-only are skipped.  The data of the
    remaining nodes is kept exactly as stored (it is not stripped) and the
    pieces are joined with a newline.  Returns '' when nothing qualifies.
    """
    # A lone DOM node is not iterable; wrap it so callers may pass either a
    # node list or one node (getDataList hands over a single text node).
    if hasattr(nodelist, 'nodeType'):
        nodelist = [nodelist]
    return '\n'.join(
        node.data
        for node in nodelist
        if node.nodeType == node.TEXT_NODE and node.data.strip()
    )
def handleData(nodelist, *args):
    """Collect the first descendant element per tag for the accepted nodes.

    nodelist -- iterable of DOM element nodes.
    *args    -- tag names to look up beneath each accepted node.

    A node is accepted when its "name" attribute value is listed in the
    module-level ``nameList``.  For every accepted node and every tag name,
    the first element returned by ``getElementsByTagName`` is added to the
    result; tags with no match contribute nothing.  A node without a "name"
    attribute is reported on stdout and skipped.

    Returns a list of DOM elements.
    """
    found = []
    for node in nodelist:
        # Only the attribute lookup can raise KeyError, so only it is guarded.
        try:
            name = str(node.attributes["name"].value)
        except KeyError as e:
            print('Invalid node atribute: %s' % (e,))
            continue
        if name not in nameList:
            continue
        for tag in args:
            matches = node.getElementsByTagName(tag)
            if matches:
                found.append(matches[0])
    return found
def nodeName(node):
    """Return the tag name of a DOM element node, or '' for anything else.

    node -- any object; non-element DOM nodes (text, comment, document ...)
            and objects that are not DOM nodes at all yield ''.

    The name is read from the node itself rather than parsed out of
    ``repr(node)``: the repr of a text or comment node embeds its data, so
    data containing ':' used to be mistaken for a name, and a namespaced tag
    such as ``ns:tag`` was cut down to its prefix.
    """
    try:
        if node.nodeType == node.ELEMENT_NODE:
            return node.tagName
    except AttributeError:
        # Not a DOM node; keep the "never raises" contract of the original.
        pass
    return ''
def matchID(node):
    """Tell whether *node* carries at least one attribute accepted by matchDict.

    node -- a DOM node.

    For each attribute name in the module-level ``matchDict`` the node's
    attribute value is read; the node matches when the entry in ``matchDict``
    is false (any value accepted) or contains that value.  Attributes the
    node does not have are ignored, as are nodes with no attribute map.

    Returns True on the first accepted attribute, otherwise False.
    """
    for attr_name, allowed in matchDict.items():
        try:
            value = str(node.attributes[attr_name].value)
        except (KeyError, TypeError, AttributeError, UnicodeError):
            # KeyError: attribute absent.  TypeError/AttributeError: the node
            # has no attribute map.  UnicodeError: str() could not encode the
            # value (Python 2).  All mean "this attribute does not match".
            continue
        if not allowed or value in allowed:
            return True
    return False
def formatData(nodelist, *args):
    """Render matching attributes and selected child text as one string.

    nodelist -- iterable of DOM element nodes.
    *args    -- tag names whose first occurrence beneath a matching node is
                reported.

    For every node, each attribute named in the module-level ``matchDict``
    whose value is accepted (entry false, or value listed in the entry)
    produces a line ``<node name> <attribute>=<value>``.  If ``matchID``
    accepts the node, each tag in *args* that occurs beneath it adds a line
    `` <tag>: <text>`` built from the first such element's child text.
    Missing attributes and missing tags are skipped silently.

    Returns the lines joined with newlines ('' when nothing matched).
    """
    lines = []
    for node in nodelist:
        node_name = nodeName(node)
        for attr_name, allowed in matchDict.items():
            try:
                value = str(node.attributes[attr_name].value)
            except KeyError:
                # The node simply lacks this attribute.
                continue
            if not allowed or value in allowed:
                lines.append('%s %s=%s' % (node_name, attr_name, value))
        # The match test does not depend on the tag, so evaluate it once
        # per node instead of once per tag.
        if not matchID(node):
            continue
        for tag in args:
            try:
                first = node.getElementsByTagName(tag)[0]
            except IndexError:
                # No such element beneath this node.
                continue
            lines.append(' %s: %s' % (tag, getText(first.childNodes)))
    return '\n'.join(lines)
def getDataList(nodelist, **kargs):
    """Report nodes whose attributes satisfy per-tag filters.

    nodelist -- iterable of DOM nodes.
    **kargs  -- maps a tag name to a dict of ``attribute name -> accepted
                values``; a false entry accepts any value of that attribute.

    For each node whose tag name is a key of *kargs*, every filtered
    attribute that is present and accepted adds ``<tag> <attribute>=<value>``
    to the result.  A node with at least one accepted attribute is also
    printed: an element as ``DOM element = <name>`` followed by one line per
    child (`` <child tag>: <child text>``, blank lines dropped), a text node
    as ``Text Node Text = <text>``.

    Returns the list of ``<tag> <attribute>=<value>`` strings.
    """
    resList = []
    for node in nodelist:
        node_name = nodeName(node)
        # The flag is per node: without the reset, one early match made
        # every later node print even though it failed the filter.
        matched = False
        for attr_name, allowed in kargs.get(node_name, {}).items():
            try:
                value = str(node.attributes[attr_name].value)
            except KeyError:
                # The node simply lacks this attribute.
                continue
            if not allowed or value in allowed:
                resList.append('%s %s=%s' % (node_name, attr_name, value))
                matched = True
        if not matched:
            continue
        if node.nodeType == node.ELEMENT_NODE:
            print('DOM element = %s' % node.nodeName)
            lines = []
            for child in node.childNodes:
                child_name = nodeName(child)
                prefix = (child_name + ': ') if child_name else ''
                lines.append(' %s%s' % (prefix, getText(child.childNodes)))
            # Text-only children yield whitespace-only lines; drop them.
            print('\n'.join([line for line in lines if line.strip()]))
        elif node.nodeType == node.TEXT_NODE:
            # getText iterates its argument, so wrap the single node.
            print('Text Node Text = %s' % getText([node]))
    return resList
# Location of the sample XML document; the path is hard-coded.
fn = r'H:\TEMP\temsys\sampleXML.txt'
# Parsed once at module level; the statements further down reuse both
# ``fn`` and ``dom1``.
dom1 = parse(fn)
'''
for item in
dom1.getElementsByTagName("copyfile"):
print getText(item.childNodes)
print
root_elements =
dom1.getElementsByTagName('xml')
print
root_elements
for item in
handleData(root_elements, "mkdir", "copyfile"):
print getText(item.childNodes)
print
process_elements
= dom1.getElementsByTagName('process')
print
process_elements
print
handleData(process_elements, "mkdir", "copyfile")
for item in
handleData(process_elements, "mkdir", "copyfile"):
print getText(item.childNodes)
print
xList =
[elem.attributes for elem in process_elements]
for item in
xList:
for tup in item.items():
for s in tup:
print str(s)
for item in
xList:
print item.items()
print
elemList =
handleData(process_elements, "mkdir", "copyfile")
for elem in
elemList:
print '%s name="%s"' %
(repr(elem.parentNode).split(':')[1].split()[0], \
str(elem.parentNode.attributes.get('name').value))
print '
%s: %s' % (repr(elem).split(':')[1].split()[0],
getText(elem.childNodes))
print
print
formatData(process_elements, "mkdir", "copyfile")
print
'''
'''
print
print
xList[0].values()[0].hasChildNodes()
print
xList[0].values()[0].firstChild
print
xList[0].values()[0].lastChild
print
xList[0].values()[0].childNodes
print
xList[0].values()[0].lastChild.nodeName
print
xList[0].values()[0].lastChild.childNodes # ()
print
xList[0].values()[0].lastChild.hasChildNodes()
print
xList[0].getNamedItem(str(xList[0].keys()[0]))
print
dom1.lastChild
print
print
root_elements[0]
print
root_elements[0].firstChild
print
root_elements[0].firstChild.hasChildNodes()
process_elements
= dom1.getElementsByTagName('process')
print
process_elements
print
formatData(process_elements, "mkdir", "copyfile")
print
download_elements
= dom1.getElementsByTagName('download')
print
download_elements
s =
formatData(download_elements, "destination", "unzip")
print s
#print
download_elements[0].attributes['server'].value
#print
download_elements[0].attributes['user'].value
#print
download_elements[0].attributes['password'].value
'''
# Report the <process> and <download> elements of the parsed document that
# pass the attribute filters in elemDict (getDataList prints as it goes).
process_elements = dom1.getElementsByTagName('process')
download_elements = dom1.getElementsByTagName('download')
elemDict = {
    'process': {'name': ["proc1", "proc2"]},
    'download': {'server': ['ftp', ], 'user': False, 'password': False},
}
x = getDataList(process_elements, **elemDict)
y = getDataList(download_elements, **elemDict)

# Second approach: pull the <mkdir> contents straight out of the raw file
# text with a regular expression instead of going through the DOM.
import re
tag = '<mkdir>'
closetag = '</mkdir>'
# Non-greedy group so two <mkdir> elements on one line are not merged into
# a single match.
patt = re.compile(r'%s(.+?)%s' % (tag, closetag))
# Close the file deterministically instead of leaking the handle.
with open(fn) as xml_file:
    s = xml_file.read()
mkdirList = patt.findall(s)
print('')
print(mkdirList)
''' Sample
output:
>>>
src1,dst1
src2,dst2
src3,dst3
src4,dst4
[<DOM Element:
process at 0xe81350>, <DOM Element: process at 0xe81508>, <DOM
Element: process at 0xe81648>, <DOM Element: process at 0xe81800>]
[<DOM Element:
mkdir at 0xe813f0>, <DOM Element: copyfile at 0xe81468>, <DOM Element:
copyfile at 0xe815a8>]
directory1
src1,dst1
src2,dst2
name
proc1
name
proc2
name
proc3
name
proc4
>>>
'''
'''
>>>
dom1.firstChild.hasChildNodes()
True
>>>
getText([dom1.firstChild, ])
''
>>>
dom1.firstChild.nodeType
1
>>>
dom1.TEXT_NODE
3
>>>
dom1.DOCUMENT_NODE
9
>>>
dom1.ATTRIBUTE_NODE
2
>>>
dom1.CDATA_SECTION_NODE
4
>>>
dom1.ELEMENT_NODE
1
>>>
dom1.firstChild.getElementsByTagName('process')
[<DOM Element:
process at 0xf04e18>, <DOM Element: process at 0xf04c38>]
>>>
process_elements[0].childNodes
[<DOM Text
node "\n\t ">, <DOM
Element: mkdir at 0xf04e68>, <DOM Text node "\n \t
">, <DOM Element: copyfile at 0xf04d50>, <DOM Text node
"\n ">]
>>>
process_elements[1].childNodes
[<DOM Text
node "\n \t ">, <DOM
Element: copyfile at 0xf04260>, <DOM Text node "\n ">]
>>>
>>>
str(process_element1[0].tagName)
'process'
>>>
'''
'''
datasource =
open('c:\\temp\\mydata.xml')
dom2 =
parse(datasource) # parse an open file
dom3 =
parseString('<myxml>Some data<empty/> some more
data</myxml>')
'''
''' Output
>>> DOM
element = process
mkdir: directory1
mkdir: directory11
mkdir: directory111
copyfile: src1,dst1
DOM element =
process
copyfile: src2,dst2
DOM element =
process
mkdir: directory3
mkdir: directory333333333
copyfile: src3,dst3
DOM element =
process
mkdir: directory4
copyfile: src4,dst4
DOM element =
process
download:
DOM element =
download
destination: path
unzip: *.jpg, *.doc, *.pdf
['directory1',
'directory11', 'directory111', 'directory3', 'directory333333333',
'directory4']
>>>
'''
''' XML Data
<xml>
<process
name="proc1">
<mkdir>directory1</mkdir>
<mkdir>directory11</mkdir>
<mkdir>directory111</mkdir>
<copyfile>src1,dst1</copyfile>
</process>
<process
name="proc2">
<copyfile>src2,dst2</copyfile>
</process>
<process
name="proc3">
<mkdir>directory3</mkdir>
<mkdir>directory333333333</mkdir>
<copyfile>src3,dst3</copyfile>
</process>
<process
name="proc4">
<mkdir>directory4</mkdir>
<copyfile>src4,dst4</copyfile>
</process>
<process
name='download'>
<download
server='ftp' user='username' password='******'>
<destination>path</destination>
<unzip>*.jpg,
*.doc, *.pdf</unzip>
</download>
</process>
<dummy
id='test'>
<date>December
8, 2007</date>
</dummy>
</xml>
'''