Python网络编程基础笔记-处理HTML Character references
1. python处理HTML char references
# -*- coding: cp936 -*-
"""
python使用char entity
定义:
A character entity reference is an SGML construct that references a
character of the document character set.
详见:http://www.w3.org/TR/REC-html40/charset.html#entities
两种格式:
* Numeric character references (either decimal or hexadecimal).
* Character entity references.
Python分别使用下面的两个函数进行处理这两种格式
* handle_charref(name),形式如:&#ref;
* handle_entityref(name),形式如:&ref;
"""
from HTMLParser import HTMLParser
from htmlentitydefs import *
import sys
class TitleParser(HTMLParser):
    """HTML parser that prints text with character references decoded.

    Every chunk of text is passed to handle_data(), which prints it.
    Named entity references (``&name;``) and numeric character references
    (``&#NNN;`` / ``&#xHH;``) are converted to the character they stand
    for and routed through handle_data() as well.
    """

    def __init__(self):
        HTMLParser.__init__(self)

    def handle_data(self, data):
        """Print one chunk of text."""
        # Parenthesised single argument: prints the same text whether
        # ``print`` is a statement or a function.
        print(data)

    def handle_entityref(self, name):
        """Handle a named entity reference of the form ``&name;``.

        Names found in ``entitydefs`` are replaced by their mapped value;
        unknown names are emitted unchanged as ``&name;``.
        """
        if name in entitydefs:
            self.handle_data(entitydefs[name])
        else:
            self.handle_data("&" + name + ";")

    def handle_charref(self, name):
        """Handle a numeric character reference (``&#NNN;`` or ``&#xHH;``).

        ``name`` is the text between ``&#`` and ``;``: decimal digits, or
        ``x``/``X`` followed by hexadecimal digits.  References that are
        not valid numbers, or whose value is outside 1..255, are dropped.
        """
        try:
            if name[:1] in ("x", "X"):
                # Hexadecimal form: the parser passes the leading "x" along.
                charnum = int(name[1:], 16)
            else:
                charnum = int(name)
        except ValueError:
            return
        # Only single-byte code points are converted.
        if charnum < 1 or charnum > 255:
            return
        self.handle_data(chr(charnum))

    # The hook was originally (mis)named handle_char, which HTMLParser never
    # calls; keep the old name as an alias for any existing callers.
    handle_char = handle_charref
# Parse the sample page and print its text content.
tp = TitleParser()
# The with-block guarantees the file is closed once it has been read.
with open("basictitle.html") as fd:
    tp.feed(fd.read())
# Flush any text the parser is still buffering at end of input.
tp.close()
2.用于测试的HTML文件
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=gb2312" />
<title>无标题文档</title>
</head>
<body>&amp;&#174;
</body>
</html>