Python网络编程基础笔记-使用XML解析结果重新显示其内容

jcodeer

UID: 23408
帖子: 37
积分: 85
在线时间: 2 小时

1^# jcodeer 发表于 2007-11-05 23:35

Python网络编程基础笔记-使用XML解析结果重新显示其内容

1.对XML文件进行解析,并重新显示其内容

"""
将XML以文本形式重新格式化输出
1.使用Node的节点类型,判断下一步如何处理
2.对不同的节点名(tagName)进行相应的处理
"""
from xml.dom import minidom,Node
import re,textwrap
class SampleScanner:
    """Print the content of a parsed ``<book>`` XML document as plain text.

    Constructing the scanner performs the whole traversal: every top-level
    ``<book>`` element of ``doc`` is walked and its title, author and
    chapters are written to standard output.  Dispatch is driven by the
    DOM node type first and by the element's tag name second.

    Output is produced with single-argument ``print(...)`` calls so the
    class behaves identically under Python 2 and Python 3.
    """

    # Node types whose ``data`` attribute carries character content.
    _TEXT_NODE_TYPES = (Node.TEXT_NODE, Node.CDATA_SECTION_NODE)

    def __init__(self, doc):
        """Scan ``doc`` and print every top-level ``<book>`` element.

        Args:
            doc: a DOM node (normally the Document returned by
                ``minidom.parse``) whose ``childNodes`` are inspected.
        """
        for child in doc.childNodes:
            # Only <book> elements are processed; everything else at the
            # top level (comments, processing instructions, ...) is skipped.
            if child.nodeType == Node.ELEMENT_NODE and child.tagName == "book":
                self.handleBook(child)

    def gettext(self, nodelist):
        """Return the text contained in ``nodelist`` with whitespace collapsed.

        Text and CDATA nodes contribute their own ``data``; any other node
        that has children is searched recursively.  Every run of whitespace
        in the result is replaced by a single space.

        Args:
            nodelist: an iterable of DOM nodes.

        Returns:
            The concatenated, whitespace-normalised text.
        """
        parts = []
        for node in nodelist:
            if node.nodeType in self._TEXT_NODE_TYPES:
                # Use ``data`` rather than ``wholeText``: ``wholeText``
                # already includes adjacent text/CDATA siblings, so using it
                # while iterating over those siblings duplicates their text.
                parts.append(node.data)
            elif node.hasChildNodes():
                parts.append(self.gettext(node.childNodes))
        return re.sub(r"\s+", " ", "".join(parts))

    def handleBook(self, node):
        """Process a ``<book>`` element.

        Non-element children are ignored.  A ``<title>`` child has its text
        printed, an ``<author>`` child is passed to ``handleAuthor`` and a
        ``<chapter>`` child is passed to ``handleChapter``.
        """
        for child in node.childNodes:
            if child.nodeType != Node.ELEMENT_NODE:
                continue
            if child.tagName == "title":
                print("Book title is : %s" % self.gettext(child.childNodes))
            elif child.tagName == "author":
                self.handleAuthor(child)
            elif child.tagName == "chapter":
                self.handleChapter(child)

    def handleAuthor(self, node):
        """Process an ``<author>`` element.

        Non-element children are ignored.  A ``<name>`` child is passed to
        ``handleAuthorName``; the text of an ``<affiliation>`` child is
        printed.
        """
        for child in node.childNodes:
            if child.nodeType != Node.ELEMENT_NODE:
                continue
            if child.tagName == "name":
                self.handleAuthorName(child)
            elif child.tagName == "affiliation":
                print("Author affiliation: %s" % self.gettext([child]))

    def handleAuthorName(self, node):
        """Process an author's ``<name>`` element.

        The ``<last>`` and ``<first>`` descendants are located with
        ``getElementsByTagName`` and printed as surname followed by given
        name.
        """
        surname = self.gettext(node.getElementsByTagName("last"))
        givenname = self.gettext(node.getElementsByTagName("first"))
        print("Author Name:%s %s " % (surname, givenname))

    def handleChapter(self, node):
        """Process a ``<chapter>`` element.

        Prints a header built from the ``number`` attribute and the text of
        the ``<title>`` descendants, then passes every ``<para>`` child
        element to ``handlePara``.  Non-element children are ignored.
        """
        title = self.gettext(node.getElementsByTagName("title"))
        print("*** Start of Chapter %s,%s" % (node.getAttribute("number"), title))
        for child in node.childNodes:
            if child.nodeType != Node.ELEMENT_NODE:
                continue
            if child.tagName == "para":
                self.handlePara(child)

    def handlePara(self, node):
        """Print the text of a ``<para>`` element, wrapped by ``textwrap.fill``."""
        paratext = self.gettext([node])
        print(textwrap.fill(paratext))

# Script entry: parse the sample file (the relative path is resolved against
# the current working directory) and hand the document to SampleScanner,
# whose constructor walks it and prints the formatted result.
doc = minidom.parse("JCSample.xml")
SampleScanner(doc)
2.测试使用的XML
<?xml version="1.0" encoding="UTF-8"?>
<book>
<title> Sample XML Thing </title>
<author>
      <name>
         <first>Benjamin</first>
         <last>Smith</last>
      </name>
      <affiliation>Springy Widgets,Inc.</affiliation>
</author>

<chapter number = "1">
      <title>First chapter</title>
      <para>
         I think widgets are great.you should buy lots
         of them from <company>Springy widgets,Inc</company>
      </para>
</chapter>
</book>