python处理文本
想处理一个文本,输出特定格式化的文本 初学python不知道错误在哪里
输入文本的形式是这样的
<DOC>
<DOCNO> WS880212-0001 </DOCNO>
<FILEID>AP-NR-02-12-88 2344EST</FILEID>
<FIRST>u i AM-Vietnam-Amnesty 02-12 0398</FIRST>
<SECOND>AM-Vietnam-Amnesty,0411</SECOND>
<HEAD>Reports Former Saigon Officials Released from Re-education Camp</HEAD>
<DATELINE>BANGKOK, Thailand (AP) </DATELINE>
<TEXT>
More than..........
</TEXT>
</DOC>
希望提取其中的docno 和text 希望输出的文本格式是这样的
<DOC>
<DOCNO> 51 </DOCNO>//就是原文本中的DOCNO
Airbus Subsidies//文本中text的内容
</DOC>
程序有一个参数可以指定输出的内容为text 或者title或者其他标签的内容
以下为代码
import sys
import re
import os
from sgmllib import SGMLParser
class QueryParser(SGMLParser):
def clr(self):
self.inDOCNO, self.inDesc, self.inNarr, self.inTitle, self.inText = 0,0,0,0,0
def reset(self):
SGMLParser.reset(self)
# self.feq = None
self.clr()
def unknown_starttag(self, tag, attrs):
self.clr()
def start_top(self, attrs):
fout.write("<DOC>\n")
def end_top(self):
# try:
# self.feq.close()
# except:
# pass
fout.write("</DOC>\n")
def start_DOCNO(self, attrs): self.clr(); self.inDOCNO = 1
def start_title(self, attrs): self.clr(); self.inTitle = 1
def start_desc(self, attrs): self.clr(); self.inDesc = 1
def start_narr(self, attrs): self.clr(); self.inNarr = 1
def start_text(self, attrs): self.clr(); self.inText = 1
def handle_data(self, text):
text = text.split()[1:]
if self.inDOCNO:
fout.write("<DOCNO> %d </DOCNO>\n"% int(text[0]))
# self.feq = open(os.path.join('queries', 'query%d.txt' % int(text[0])), 'w')
if (self.inTitle and flgTitle) or (self.inDesc and flgDesc) or (self.inNarr and flgNarr) or (self.inText and flgText):
fout.write(" ".join(text))
fout.write('\n')
# self.feq.write(" ".join(text))
# self.feq.write('\n')
def process(filename):
print 'Process %s' % filename
fp = open(filename, 'r')
parser = QueryParser()
parser.feed(fp.read())
fp.close()
parser.close()
def usage():
print "Options: [-title] [-desc] [-narr] [-text]topics_in_trec_format_file"
print "Translate the query file from origional TREC format to the format which is acceptable by ParseToFile in Lemur"
# print "And output each query in separate file in the format of my trecMiner.exe"
if len(sys.argv) <= 1:
usage()
sys.exit()
# os.mkdir('queries')
fout = open('D:\LAB\PureOrder\ori_query.txt', 'w')
print 'create file success'
flgTitle, flgNarr, flgDesc = 0, 0, 0
if __name__ == '__main__':
for arg in sys.argv[1:]:
if arg == '-title': flgTitle = 1
elif arg == '-desc': flgDesc = 1
elif arg == '-narr': flgNarr = 1
elif arg == '-text': flgText = 1
for arg in sys.argv[1:]:
if arg[0] != '-':
process(arg)
fout.close()
刚看了两天python 请大神指点
输入文本的形式是这样的
<DOC>
<DOCNO> WS880212-0001 </DOCNO>
<FILEID>AP-NR-02-12-88 2344EST</FILEID>
<FIRST>u i AM-Vietnam-Amnesty 02-12 0398</FIRST>
<SECOND>AM-Vietnam-Amnesty,0411</SECOND>
<HEAD>Reports Former Saigon Officials Released from Re-education Camp</HEAD>
<DATELINE>BANGKOK, Thailand (AP) </DATELINE>
<TEXT>
More than..........
</TEXT>
</DOC>
希望提取其中的docno 和text 希望输出的文本格式是这样的
<DOC>
<DOCNO> 51 </DOCNO>//就是原文本中的DOCNO
Airbus Subsidies//文本中text的内容
</DOC>
程序有一个参数可以指定输出的内容为text 或者title或者其他标签的内容
以下为代码
import sys
import re
import os
from sgmllib import SGMLParser
class QueryParser(SGMLParser):
def clr(self):
self.inDOCNO, self.inDesc, self.inNarr, self.inTitle, self.inText = 0,0,0,0,0
def reset(self):
SGMLParser.reset(self)
# self.feq = None
self.clr()
def unknown_starttag(self, tag, attrs):
self.clr()
def start_top(self, attrs):
fout.write("<DOC>\n")
def end_top(self):
# try:
# self.feq.close()
# except:
# pass
fout.write("</DOC>\n")
def start_DOCNO(self, attrs): self.clr(); self.inDOCNO = 1
def start_title(self, attrs): self.clr(); self.inTitle = 1
def start_desc(self, attrs): self.clr(); self.inDesc = 1
def start_narr(self, attrs): self.clr(); self.inNarr = 1
def start_text(self, attrs): self.clr(); self.inText = 1
def handle_data(self, text):
text = text.split()[1:]
if self.inDOCNO:
fout.write("<DOCNO> %d </DOCNO>\n"% int(text[0]))
# self.feq = open(os.path.join('queries', 'query%d.txt' % int(text[0])), 'w')
if (self.inTitle and flgTitle) or (self.inDesc and flgDesc) or (self.inNarr and flgNarr) or (self.inText and flgText):
fout.write(" ".join(text))
fout.write('\n')
# self.feq.write(" ".join(text))
# self.feq.write('\n')
def process(filename):
print 'Process %s' % filename
fp = open(filename, 'r')
parser = QueryParser()
parser.feed(fp.read())
fp.close()
parser.close()
def usage():
print "Options: [-title] [-desc] [-narr] [-text]topics_in_trec_format_file"
print "Translate the query file from origional TREC format to the format which is acceptable by ParseToFile in Lemur"
# print "And output each query in separate file in the format of my trecMiner.exe"
if len(sys.argv) <= 1:
usage()
sys.exit()
# os.mkdir('queries')
fout = open('D:\LAB\PureOrder\ori_query.txt', 'w')
print 'create file success'
flgTitle, flgNarr, flgDesc = 0, 0, 0
if __name__ == '__main__':
for arg in sys.argv[1:]:
if arg == '-title': flgTitle = 1
elif arg == '-desc': flgDesc = 1
elif arg == '-narr': flgNarr = 1
elif arg == '-text': flgText = 1
for arg in sys.argv[1:]:
if arg[0] != '-':
process(arg)
fout.close()
刚看了两天python 请大神指点
作者: hallow1987 发布时间: 2011-05-12
上面那个东西是个xml吧...用python的xml模块看看
作者: infidel 发布时间: 2011-05-12
什么错误呢?
作者: LongBless 发布时间: 2011-05-12