利用爬蟲把徐小明新浪博客裏的所有博文鏈接爬下來,保存到腳本所在路徑的csv文件中(python2.7代碼)
把起始博文目錄鏈接換成其他的也是完全可以的
詳細內容請關注微信公衆號:島城窩窩。
代碼如下:
#! /usr/bin/env python
#coding=utf-8
# by huangle63
'''
此代碼功能爲把徐小明新浪博客的所有博文鏈接下載保存到本地csv文件中
運行本程序,會在腳本所在路徑生成一個 xuxiaoming_blog_catalog.csv 文件
20150419 huangle63
'''
import csv
import os
import re
import sys
import urllib2

from bs4 import BeautifulSoup
# Fetch a page and return it parsed with BeautifulSoup.
def get_http_content(url):
    '''Download *url* and return the page parsed as a BeautifulSoup tree.

    A desktop browser User-Agent is sent because sina.com serves a
    different (or blocked) page to the default urllib2 agent.

    Returns None when the request fails with a urllib2.URLError
    (unreachable host, HTTP error, etc.); callers must check for None.
    '''
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent': user_agent}
    request = urllib2.Request(url, headers=headers)  # was misleadingly named "html"
    try:
        response = urllib2.urlopen(request)
        try:
            page = response.read()
        finally:
            response.close()  # the original leaked the socket; always release it
        return BeautifulSoup(page, 'html5lib')
    except urllib2.URLError:
        return None
# Each catalogue page holds n links to individual blog posts.
def spider_catalog(spider_href):
    '''Crawl one blog-catalogue page.

    Appends every post found on the page (date, title, URL) to
    xuxiaoming_blog_catalog.csv next to the script, then follows the
    pager links ("page 1, page 2, ...") and recurses into any page not
    crawled yet.

    Side effects: mutates the module-level sets ``page_hrefs`` (all URLs
    ever seen, for deduplication) and ``no_spider_hrefs`` (discovered but
    not yet crawled), and appends to the CSV file.
    '''
    no_spider_hrefs.remove(spider_href)
    url_content = get_http_content(spider_href)
    if url_content is None:  # was "== None"; identity test is the correct idiom
        print('ERROR1: Page could not be found')
        return

    # Compile once and reuse; the original rebuilt this regex (twice!) per cell.
    post_link_re = re.compile("^(http://blog.sina.com.cn/s)")

    # Collect this page's post entries: date, title, link -> CSV row.
    # os.path.join keeps the path portable; the original hard-coded a
    # backslash separator, which only works on Windows.
    csv_file = open(os.path.join(sys.path[0], 'xuxiaoming_blog_catalog.csv'), 'ab')
    try:
        writer = csv.writer(csv_file)  # create once, not once per row
        for cell in url_content.findAll('div', {'class': 'articleCell SG_j_linedot1'}):
            anchor = cell.find('a', href=post_link_re)  # look the <a> up once
            # Strip zero-width spaces / non-breaking spaces sina embeds in titles.
            link_title = anchor.get_text().replace(u'\u200b', '').replace(u'\xa0', '')
            link_href = anchor.attrs['href']
            link_date = cell.find('span', {'class': 'atc_tm SG_txtc'}).get_text()
            print(link_date + ' ' + link_title + ' ' + link_href)
            # Title is encoded to GBK so Chinese text opens readably in Excel (zh locale).
            writer.writerow((link_date, link_title.encode("gbk"), link_href))
    except AttributeError:
        # BeautifulSoup returns None for a missing tag; accessing a child of
        # that None raises AttributeError.
        print('ERROR2: BeautifulSoup get the none tag')
    finally:
        csv_file.close()

    # Follow the pager links and recurse into pages not yet seen.
    try:
        for item in url_content.find('ul', {'class': 'SG_pages'}).findAll('li'):
            page_anchor = item.find('a', href=post_link_re)
            if page_anchor is not None:
                link_page_href = page_anchor.attrs['href']
                if link_page_href not in page_hrefs:
                    page_hrefs.add(link_page_href)
                    no_spider_hrefs.add(link_page_href)
                    link_page_title = page_anchor.get_text().replace(u'\u200b', '').replace(u'\xa0', '')
                    print(link_page_title + ' ' + link_page_href)
                    spider_catalog(link_page_href)  # recurse through all pages
    except AttributeError:
        # Same None-tag situation as above (no pager <ul> on the page).
        print('ERROR2: BeautifulSoup get the none tag')
    except Exception as e:
        print('ERROR3: ', e)
# Entry point: seed the crawl with the first catalogue page, then recurse.
start_page_html = 'http://blog.sina.com.cn/s/articlelist_1300871220_0_1.html'
page_hrefs = {start_page_html}       # every catalogue-page URL ever seen (dedup)
no_spider_hrefs = {start_page_html}  # discovered but not yet crawled
spider_catalog(start_page_html)