python处置文本
python处理文本
想处理一个文本,输出特定格式化的文本 初学python不知道错误在哪里
输入文本的形式是这样的
<DOC>
<DOCNO> WS880212-0001 </DOCNO>
<FILEID>AP-NR-02-12-88 2344EST</FILEID>
<FIRST>u i AM-Vietnam-Amnesty 02-12 0398</FIRST>
<SECOND>AM-Vietnam-Amnesty,0411</SECOND>
<HEAD>Reports Former Saigon Officials Released from Re-education Camp</HEAD>
<DATELINE>BANGKOK, Thailand (AP) </DATELINE>
<TEXT>
More than..........
</TEXT>
</DOC>
希望提取其中的docno 和text 希望输出的文本格式是这样的
<DOC>
<DOCNO> 51 </DOCNO>//就是原文本中的DOCNO
Airbus Subsidies//文本中text的内容
</DOC>
程序有一个参数可以指定输出的内容为text 或者title或者其他标签的内容
以下为代码
import sys
import re
import os
from sgmllib import SGMLParser
class QueryParser(SGMLParser):
def clr(self):
self.inDOCNO, self.inDesc, self.inNarr, self.inTitle, self.inText = 0,0,0,0,0
def reset(self):
SGMLParser.reset(self)
# self.feq = None
self.clr()
def unknown_starttag(self, tag, attrs):
self.clr()
def start_top(self, attrs):
fout.write("<DOC>\n")
def end_top(self):
# try:
# self.feq.close()
# except:
# pass
fout.write("</DOC>\n")
def start_DOCNO(self, attrs): self.clr(); self.inDOCNO = 1
def start_title(self, attrs): self.clr(); self.inTitle = 1
def start_desc(self, attrs): self.clr(); self.inDesc = 1
def start_narr(self, attrs): self.clr(); self.inNarr = 1
def start_text(self, attrs): self.clr(); self.inText = 1
def handle_data(self, text):
text = text.split()[1:]
if self.inDOCNO:
fout.write("<DOCNO> %d </DOCNO>\n"% int(text[0]))
# self.feq = open(os.path.join('queries', 'query%d.txt' % int(text[0])), 'w')
if (self.inTitle and flgTitle) or (self.inDesc and flgDesc) or (self.inNarr and flgNarr) or (self.inText and flgText):
fout.write(" ".join(text))
fout.write('\n')
想处理一个文本,输出特定格式化的文本 初学python不知道错误在哪里
输入文本的形式是这样的
<DOC>
<DOCNO> WS880212-0001 </DOCNO>
<FILEID>AP-NR-02-12-88 2344EST</FILEID>
<FIRST>u i AM-Vietnam-Amnesty 02-12 0398</FIRST>
<SECOND>AM-Vietnam-Amnesty,0411</SECOND>
<HEAD>Reports Former Saigon Officials Released from Re-education Camp</HEAD>
<DATELINE>BANGKOK, Thailand (AP) </DATELINE>
<TEXT>
More than..........
</TEXT>
</DOC>
希望提取其中的docno 和text 希望输出的文本格式是这样的
<DOC>
<DOCNO> 51 </DOCNO>//就是原文本中的DOCNO
Airbus Subsidies//文本中text的内容
</DOC>
程序有一个参数可以指定输出的内容为text 或者title或者其他标签的内容
以下为代码
import sys
import re
import os
from sgmllib import SGMLParser
class QueryParser(SGMLParser):
def clr(self):
self.inDOCNO, self.inDesc, self.inNarr, self.inTitle, self.inText = 0,0,0,0,0
def reset(self):
SGMLParser.reset(self)
# self.feq = None
self.clr()
def unknown_starttag(self, tag, attrs):
self.clr()
def start_top(self, attrs):
fout.write("<DOC>\n")
def end_top(self):
# try:
# self.feq.close()
# except:
# pass
fout.write("</DOC>\n")
def start_DOCNO(self, attrs): self.clr(); self.inDOCNO = 1
def start_title(self, attrs): self.clr(); self.inTitle = 1
def start_desc(self, attrs): self.clr(); self.inDesc = 1
def start_narr(self, attrs): self.clr(); self.inNarr = 1
def start_text(self, attrs): self.clr(); self.inText = 1
def handle_data(self, text):
text = text.split()[1:]
if self.inDOCNO:
fout.write("<DOCNO> %d </DOCNO>\n"% int(text[0]))
# self.feq = open(os.path.join('queries', 'query%d.txt' % int(text[0])), 'w')
if (self.inTitle and flgTitle) or (self.inDesc and flgDesc) or (self.inNarr and flgNarr) or (self.inText and flgText):
fout.write(" ".join(text))
fout.write('\n')