基於htmllib.HTMLParser的html2text
- def html2text(strHtml):
- """處理html 4.01和部分xhtml 1.0轉義字符"""
- class SimpleParser(htmllib.HTMLParser):
- def anchor_end(self):
- if self.anchor:
- self.anchor = None
- def handle_p_w_picpath(self, src, alt, *args):
- #self.handle_data(alt) #可以重寫爲alt屬性
- pass #圖片替換成空
- def convert_entityref(self, name):
- name2codepoint["nbsp"] = 0x0020
- name2codepoint["apos"] = 0x0027
- if name in name2codepoint and name<256:
- return chr(name2codepoint[name])
- else:
- return
- def convert_charref(self, name):
- return unichr(int(name)).encode("gb18030") file = StringIO.StringIO()
- p = SimpleParser(formatter.AbstractFormatter(formatter.DumbWriter(file)))
- p.feed(strHtml)
- p.close()
- return file.getvalue() 基於htmllib.HTMLParser的html2text