Python网络编程基础笔记-使用XML解析结果重新显示其内容
1.对XML文件进行解析,并重新显示其内容
"""
将XML以文本形式重新格式化输出
1.使用Node的节点类型,判断下一步如何处理
2.对不同的节点名(tagName)进行相应的处理
"""
from xml.dom import minidom,Node
import re,textwrap
class SampleScanner:
    """Walk a parsed ``book`` document and print it as formatted plain text.

    The scanner dispatches on each node's type and tag name: the book title,
    author details and every chapter paragraph are printed to standard
    output as they are encountered.  All of the work is started from the
    constructor.

    Output is produced with single-argument ``print(...)`` calls so that the
    text is the same under Python 2 and Python 3.
    """

    # Runs of whitespace (including newlines) are collapsed to one space.
    _WHITESPACE_RUN = re.compile(r"\s+")
    # Node types that carry character data directly in ``node.data``.
    _TEXT_NODE_TYPES = (Node.TEXT_NODE, Node.CDATA_SECTION_NODE)

    def __init__(self, doc):
        """Scan ``doc`` and print every top-level ``book`` element.

        Args:
            doc: A DOM node (normally the ``Document`` returned by
                ``minidom.parse``) whose child nodes are inspected.
        """
        for child in doc.childNodes:
            # Only top-level <book> elements are processed; everything else
            # (comments, processing instructions, other tags) is ignored.
            if child.nodeType == Node.ELEMENT_NODE and child.tagName == "book":
                self.handleBook(child)

    def gettext(self, nodelist):
        """Return the whitespace-normalized text contained in ``nodelist``.

        Text and CDATA nodes contribute their own character data; any other
        node that has children is searched recursively.  Nodes without
        children (comments, empty elements, ...) contribute nothing.

        Args:
            nodelist: An iterable of DOM nodes.

        Returns:
            The concatenated text with every run of whitespace replaced by a
            single space.  Leading/trailing whitespace is collapsed to one
            space, not stripped.
        """
        pieces = []
        for node in nodelist:
            if node.nodeType in self._TEXT_NODE_TYPES:
                # Use ``data`` rather than ``wholeText``: ``wholeText`` also
                # returns the text of adjacent text/CDATA siblings, which
                # would be emitted again when those siblings are visited.
                pieces.append(node.data)
            elif node.hasChildNodes():
                pieces.append(self.gettext(node.childNodes))
        return self._WHITESPACE_RUN.sub(" ", "".join(pieces))

    def handleBook(self, node):
        """Process a ``book`` element.

        Non-element children are skipped.  A ``title`` child has its text
        printed, an ``author`` child is passed to ``handleAuthor`` and a
        ``chapter`` child is passed to ``handleChapter``.

        Args:
            node: The ``book`` element.
        """
        for child in node.childNodes:
            if child.nodeType != Node.ELEMENT_NODE:
                continue
            if child.tagName == "title":
                print("Book title is : %s" % self.gettext(child.childNodes))
            elif child.tagName == "author":
                self.handleAuthor(child)
            elif child.tagName == "chapter":
                self.handleChapter(child)

    def handleAuthor(self, node):
        """Process an ``author`` element.

        Non-element children are skipped.  A ``name`` child is passed to
        ``handleAuthorName``; the text of an ``affiliation`` child is
        printed.

        Args:
            node: The ``author`` element.
        """
        for child in node.childNodes:
            if child.nodeType != Node.ELEMENT_NODE:
                continue
            if child.tagName == "name":
                self.handleAuthorName(child)
            elif child.tagName == "affiliation":
                print("Author affiliation: %s" % self.gettext([child]))

    def handleAuthorName(self, node):
        """Print the author's name from an ``author/name`` element.

        The ``last`` and ``first`` descendants are located with
        ``getElementsByTagName`` and their text is printed, surname first.

        Args:
            node: The ``name`` element.
        """
        surname = self.gettext(node.getElementsByTagName("last"))
        givenname = self.gettext(node.getElementsByTagName("first"))
        print("Author Name:%s %s " % (surname, givenname))

    def handleChapter(self, node):
        """Process a ``chapter`` element.

        Prints a header containing the ``number`` attribute and the text of
        the chapter's ``title`` descendants, then passes every ``para``
        child element to ``handlePara``.  Non-element children are skipped.

        Args:
            node: The ``chapter`` element.
        """
        title = self.gettext(node.getElementsByTagName("title"))
        print("*** Start of Chapter %s,%s" % (node.getAttribute("number"), title))
        for child in node.childNodes:
            if child.nodeType != Node.ELEMENT_NODE:
                continue
            if child.tagName == "para":
                self.handlePara(child)

    def handlePara(self, node):
        """Print the text of a ``para`` element, re-wrapped by ``textwrap``.

        Args:
            node: The ``para`` element.
        """
        paratext = self.gettext([node])
        print(textwrap.fill(paratext))
# Parse the sample document from disk and hand it to SampleScanner, whose
# constructor walks the tree and prints the formatted contents.
source_path = "JCSample.xml"
doc = minidom.parse(source_path)
SampleScanner(doc)
2.测试使用的XML
<?xml version="1.0" encoding="UTF-8"?>
<book>
<title> Sample XML Thing </title>
<author>
<name>
<first>Benjamin</first>
<last>Smith</last>
</name>
<affiliation>Springy Widgets,Inc.</affiliation>
</author>
<chapter number = "1">
<title>First chapter</title>
<para>
I think widgets are great.you should buy lots
of them from <company>Springy widgets,Inc</company>
</para>
</chapter>
</book>