利用爬蟲把徐小明新浪博客裏的所有博文鏈接爬下來,保存到腳本所在路徑的csv文件中(python2.7代碼)
把起始博文目錄鏈接換成其他的也是完全可以的
詳細內容請關注微信公衆號:島城窩窩。
代碼如下:
#! /usr/bin/env python
#coding=utf-8
# by huangle63
'''
此代碼功能爲把徐小明新浪博客的所有博文鏈接下載保存到本地csv文件中
運行本程序,會在腳本所在路徑生成一個 xuxiaoming_blog_catalog.csv 文件
20150419 huangle63
'''
import csv
import os
import re
import sys
import urllib2

from bs4 import BeautifulSoup
# Fetch a page and return it parsed with BeautifulSoup.
def get_http_content(url):
    '''Download *url* and return the page parsed as a BeautifulSoup tree.

    A desktop browser User-Agent is sent because sina.com serves a
    different (or blocked) page to the default urllib2 agent.

    Returns None when the request fails with a urllib2.URLError
    (unreachable host, HTTP error, etc.); callers must check for None.
    '''
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent': user_agent}
    request = urllib2.Request(url, headers=headers)  # was misleadingly named "html"
    try:
        response = urllib2.urlopen(request)
        try:
            page = response.read()
        finally:
            response.close()  # the original leaked the socket; always release it
        return BeautifulSoup(page, 'html5lib')
    except urllib2.URLError:
        return None
# Each catalogue page holds n links to individual blog posts.
def spider_catalog(spider_href):
    '''Crawl one blog-catalogue page.

    Appends every post found on the page (date, title, URL) to
    xuxiaoming_blog_catalog.csv next to the script, then follows the
    pager links ("page 1, page 2, ...") and recurses into any page not
    crawled yet.

    Side effects: mutates the module-level sets ``page_hrefs`` (all URLs
    ever seen, for deduplication) and ``no_spider_hrefs`` (discovered but
    not yet crawled), and appends to the CSV file.
    '''
    no_spider_hrefs.remove(spider_href)
    url_content = get_http_content(spider_href)
    if url_content is None:  # was "== None"; identity test is the correct idiom
        print('ERROR1: Page could not be found')
        return

    # Compile once and reuse; the original rebuilt this regex (twice!) per cell.
    post_link_re = re.compile("^(http://blog.sina.com.cn/s)")

    # Collect this page's post entries: date, title, link -> CSV row.
    # os.path.join keeps the path portable; the original hard-coded a
    # backslash separator, which only works on Windows.
    csv_file = open(os.path.join(sys.path[0], 'xuxiaoming_blog_catalog.csv'), 'ab')
    try:
        writer = csv.writer(csv_file)  # create once, not once per row
        for cell in url_content.findAll('div', {'class': 'articleCell SG_j_linedot1'}):
            anchor = cell.find('a', href=post_link_re)  # look the <a> up once
            # Strip zero-width spaces / non-breaking spaces sina embeds in titles.
            link_title = anchor.get_text().replace(u'\u200b', '').replace(u'\xa0', '')
            link_href = anchor.attrs['href']
            link_date = cell.find('span', {'class': 'atc_tm SG_txtc'}).get_text()
            print(link_date + ' ' + link_title + ' ' + link_href)
            # Title is encoded to GBK so Chinese text opens readably in Excel (zh locale).
            writer.writerow((link_date, link_title.encode("gbk"), link_href))
    except AttributeError:
        # BeautifulSoup returns None for a missing tag; accessing a child of
        # that None raises AttributeError.
        print('ERROR2: BeautifulSoup get the none tag')
    finally:
        csv_file.close()

    # Follow the pager links and recurse into pages not yet seen.
    try:
        for item in url_content.find('ul', {'class': 'SG_pages'}).findAll('li'):
            page_anchor = item.find('a', href=post_link_re)
            if page_anchor is not None:
                link_page_href = page_anchor.attrs['href']
                if link_page_href not in page_hrefs:
                    page_hrefs.add(link_page_href)
                    no_spider_hrefs.add(link_page_href)
                    link_page_title = page_anchor.get_text().replace(u'\u200b', '').replace(u'\xa0', '')
                    print(link_page_title + ' ' + link_page_href)
                    spider_catalog(link_page_href)  # recurse through all pages
    except AttributeError:
        # Same None-tag situation as above (no pager <ul> on the page).
        print('ERROR2: BeautifulSoup get the none tag')
    except Exception as e:
        print('ERROR3: ', e)
# Entry point: seed the crawl with the first catalogue page, then recurse.
start_page_html = 'http://blog.sina.com.cn/s/articlelist_1300871220_0_1.html'
page_hrefs = {start_page_html}       # every catalogue-page URL ever seen (dedup)
no_spider_hrefs = {start_page_html}  # discovered but not yet crawled
spider_catalog(start_page_html)