Python网络编程基础笔记-处理HTML Character references

1.python处理HTML char references
# -*- coding: cp936 -*-
"""
python使用char entity
定义:
    A character entity reference is an SGML construct that references a
character of the document character set.
详见:http://www.w3.org/TR/REC-html40/charset.html#entities
两种格式:
    * Numeric character references (either decimal or hexadecimal).
    * Character entity references.
Python分别使用下面的两个函数进行处理这两种格式
    * handle_charref(name),形式如:&#ref;
    * handle_entityref(name),形式如:&ref;
"""
from HTMLParser import HTMLParser
from htmlentitydefs import *
import sys
class TitleParser(HTMLParser):
    """HTML parser that prints text data and expands character references.

    Named references (``&name;``) are resolved through ``entitydefs``;
    numeric references (``&#NNN;`` or ``&#xHH;``) are converted with
    ``chr``. Every resulting piece of text is passed to ``handle_data``.
    """

    def __init__(self):
        HTMLParser.__init__(self)

    def handle_data(self, data):
        """Print one chunk of text data."""
        # Parenthesised single-argument form behaves the same on Python 2 and 3.
        print(data)

    def handle_entityref(self, name):
        """Handle a named reference of the form ``&name;``.

        Known names are replaced by their ``entitydefs`` value; unknown
        names are passed through unchanged as ``&name;``.
        """
        if name in entitydefs:
            self.handle_data(entitydefs[name])
        else:
            self.handle_data("&" + name + ";")

    def handle_charref(self, name):
        """Handle a numeric reference of the form ``&#NNN;`` or ``&#xHH;``.

        ``name`` is the text between ``&#`` and ``;``. References that are
        not valid numbers, or whose code point is outside 1..255, are
        silently dropped.
        """
        try:
            if name[:1] in ("x", "X"):
                # Hexadecimal form: the digits follow the leading 'x'.
                charnum = int(name[1:], 16)
            else:
                charnum = int(name)
        except ValueError:
            return
        if charnum < 1 or charnum > 255:
            return
        self.handle_data(chr(charnum))

    # Backward-compatible alias: the method used to be called handle_char,
    # a name the parser never dispatches to.
    handle_char = handle_charref
# Feed the sample document to the parser; the ``with`` block guarantees the
# file is closed even if parsing raises.
with open("basictitle.html") as fd:
    tp = TitleParser()
    tp.feed(fd.read())
    # Force processing of any data still buffered by the parser.
    tp.close()
        
2.用于测试的HTML文件
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=gb2312" />
<title>无标题文档</title>
</head>
<body>&amp;&#174;
</body>
</html>