HTMLParser爲Python的常用內建模塊,使用時經常是繼承HTMLParser並重寫其方法。
其中常用方法如下:
handle_starttag(tag, attrs)處理開始標籤,比如<input type="text" value="3">,tag即爲input,attrs爲儲存(屬性名, 屬性值)元組(tuple)的列表(list):[('type', 'text'), ('value', '3')]
handle_endtag(tag)處理結尾標籤
handle_data(data)處理標籤內容
handle_startendtag(tag, attrs)處理<img src="" />這類標籤
handle_comment(data)處理註釋內容
#例子
from html.parser import HTMLParser
class MyHTMLParser(HTMLParser):
    """Demo parser that prints every event reported by ``HTMLParser``.

    Each overridden handler writes a line to stdout naming the kind of
    markup that was encountered, which makes the order in which the
    handlers fire easy to observe.
    """

    def __init__(self):
        super().__init__()

    def handle_starttag(self, tag, attrs):
        """
        Report a start tag, like <div>, followed by its attributes.

        :param tag: name of the tag
        :param attrs: list of (name, value) tuples for the tag's attributes
        :return: None
        """
        print("Encountered a start tag:", tag)
        print(attrs)

    def handle_endtag(self, tag):
        """
        Report an end tag, like </div>.

        :param tag: name of the tag
        :return: None
        """
        print("Encountered an end tag :", tag)

    def handle_data(self, data):
        """
        Report text content found between tags.

        :param data: the text content
        :return: None
        """
        print("Encountered some data :", data)

    def handle_startendtag(self, tag, attrs):
        """
        Report a self-closing tag, like <img />.

        Only the tag name is printed; the attributes are not.

        :param tag: name of the tag
        :param attrs: list of (name, value) tuples for the tag's attributes
        :return: None
        """
        print("Encountered startendtag :", tag)

    def handle_comment(self, data):
        """
        Report an HTML comment, like <!-- comment -->.

        :param data: the text inside the comment
        :return: None
        """
        print("Encountered comment :", data)
# Run a small sample document through the demo parser so that each handler
# prints the events it receives.
sample_html = (
    '<html><head><title>Test</title></head>'
    '<body><h1>Parse me!</h1><img src = "" /><input type="text" value="3"></input>'
    '<!-- comment --></body></html>'
)
parser = MyHTMLParser()
parser.feed(sample_html)
parser.close()
#輸出
簡單實際用例
從csdn博客視圖列表中獲取所有自己博文的名字+url
http://blog.csdn.net/yeyinglingfeng?viewmode=contents
主要思路就是利用各個函數的執行順序
#查看網頁源代碼後可知所需數據在下面的html中,url在<a>標籤中的href中,標題在內容中,因爲HTMLParser在檢索html時是按照出現的先後順序觸發各個方法的,所以在觸發handle_starttag函數後,只要有內容,就會觸發handle_data函數,所以可以利用這點來綁定url和title並存入list。PS:我這裏多加的鎖定可能有點多餘。
<h1>
<span class="link_title"><a href="/yeyinglingfeng/article/details/78332515">
(1)Python筆記:抓取CSDN博文
</a>
</span>
</h1>
#從csdn博客視圖列表中獲取所有自己博文的名字+url
#http://blog.csdn.net/yeyinglingfeng?viewmode=contents
#coding:utf-8
from html.parser import HTMLParser
import urllib.request
import re
import sys
class MyHTMLParser(HTMLParser):
    """Collect article URLs and titles from a CSDN blog list page.

    An article link is an ``<a>`` tag whose only attribute is an ``href``
    that contains the article path and does not contain ``#comments``.
    The text found inside that anchor becomes the article title.

    :param article_path: substring an ``href`` must contain to count as
        an article link
    :param base_url: prefix prepended to the ``href`` to build the full URL
    """

    def __init__(self, article_path='/yeyinglingfeng/article/details/',
                 base_url='http://blog.csdn.net'):
        HTMLParser.__init__(self)
        # One dict per article found, with keys 'url' and 'articleTitle'.
        self.articleStr = []
        # 'Y' while the parser is inside a matching anchor, otherwise 'N'.
        self.check = 'N'
        # Number of article links found so far.
        self.num = 0
        self._article_path = article_path
        self._base_url = base_url
        # Text chunks seen so far inside the current matching anchor.
        self._title_parts = []

    def handle_starttag(self, tag, attrs):
        """Start a new article entry when *tag* is a matching article link.

        :param tag: name of the tag
        :param attrs: list of (name, value) tuples for the tag's attributes
        """
        if tag != 'a':
            return
        # Anchors cannot nest, so a new <a> ends any title still being read.
        self.check = 'N'
        if len(attrs) != 1:
            return
        name, href = attrs[0]
        # A bare ``<a href>`` has the value None; treat it as "no link"
        # instead of letting the substring tests below raise TypeError.
        if name != 'href' or not href:
            return
        if self._article_path not in href or '#comments' in href:
            return
        self.check = 'Y'
        self._title_parts = []
        # The title starts out empty so every entry always has both keys.
        self.articleStr.append({'url': self._base_url + href,
                                'articleTitle': ''})
        self.num += 1

    def handle_endtag(self, tag):
        """Stop collecting title text once the anchor is closed.

        :param tag: name of the tag
        """
        if tag == 'a':
            # Without this, an anchor with no text would take its title
            # from unrelated text that appears later in the page.
            self.check = 'N'

    def handle_data(self, data):
        """Add text found inside a matching anchor to the latest title.

        :param data: the text content
        """
        if self.check != 'Y':
            return
        # Accumulate chunks so a title split by nested markup stays whole.
        self._title_parts.append(data)
        title = ''.join(self._title_parts).replace('\r\n', '').strip()
        self.articleStr[-1]['articleTitle'] = title

    def handle_startendtag(self, tag, attrs):
        """Ignore self-closing tags."""
        pass

    def handle_comment(self, data):
        """Ignore HTML comments."""
        pass
def getHtmlInfo(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    The URL is printed to stdout before the request is made.

    :param url: address to open with ``urllib.request.urlopen``
    :return: the response body as a ``str``
    """
    print('url:' + url)
    # The context manager closes the response even if reading or
    # decoding fails; the original never closed it.
    with urllib.request.urlopen(url) as response:
        return str(response.read(), 'utf-8')
def saveInfo(info, path='d:\\1\\csdnList.txt'):
    """Write *info* to a UTF-8 text file, replacing any existing content.

    A failure to write is reported on stdout instead of being raised, so
    the calling script keeps running.

    :param info: text to write
    :param path: destination file; defaults to the original fixed location
    :return: None
    """
    try:
        with open(path, 'w', encoding='utf-8') as file_write:
            file_write.write(info)
    except (OSError, UnicodeError) as exc:
        # Catch only I/O and encoding failures and say what went wrong;
        # a bare ``except`` would also swallow KeyboardInterrupt and bugs.
        print(f'error: failed to write {path}: {exc}')
# Download the blog's list page, extract every article link and title,
# save them one per line, then print how many articles were found.
html = getHtmlInfo("http://blog.csdn.net/yeyinglingfeng?viewmode=contents")
parser = MyHTMLParser()
parser.feed(html)
parser.close()
# .get() keeps an entry whose anchor had no text from raising KeyError;
# join builds the report in one pass instead of repeated concatenation.
allInfo = ''.join(
    each.get('articleTitle', '') + ' url:' + each['url'] + '\n'
    for each in parser.articleStr
)
saveInfo(allInfo)
print(parser.num)
保存的txt文件