Python簡單抓取CSDN博文列表並寫入SQL Server數據庫

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import re #導入正則模塊
import requests #導入http客戶端庫
import lxml.html #基於libxml2這一XML解析庫的Python封裝,該模塊使用C語言編寫,解析速度比beautiful soup更快
import pymssql #導入Python的SQL Server模塊

import sys
# Python 2 only: force the implicit str<->unicode conversion codec to UTF-8 so
# the scraped unicode text can be concatenated with byte-string literals.
# `reload` is not a builtin on Python 3, so the hack is skipped there instead
# of failing with NameError at import time.
if sys.version_info[0] < 3:
    reload(sys)
    sys.setdefaultencoding('utf-8')

class SQLServer:
    """Minimal pymssql helper that opens a fresh connection for every statement.

    The connection settings are stored on the instance; each call to
    ``ExecQuery`` / ``ExecNonQuery`` connects, runs one statement and closes
    the connection again.
    """

    def __init__(self, host, user, pwd, db):
        """Remember the connection settings; no connection is opened here."""
        self.host = host
        self.user = user
        self.pwd = pwd
        self.db = db

    def __GetConnect(self):
        """Open a connection, store it on ``self.conn`` and return a cursor.

        Raises:
            NameError: if no database name is configured, or if no cursor
                could be obtained from the new connection.
        """
        if not self.db:
            # Instantiate the exception. The previous tuple form
            # `raise(NameError, msg)` lost the message on Python 2 and is a
            # TypeError on Python 3.
            raise NameError("沒有設置數據庫連接信息")
        self.conn = pymssql.connect(host=self.host, user=self.user,
                                    password=self.pwd, database=self.db)
        cur = self.conn.cursor()
        if not cur:
            # Do not leave the just-opened connection dangling.
            self.conn.close()
            raise NameError("數據庫連接失敗")
        return cur

    def ExecQuery(self, sql, params=None):
        """Run a statement that returns rows and fetch all of them.

        Args:
            sql: statement text; may contain ``%s`` placeholders when
                ``params`` is given.
            params: optional values passed to ``cursor.execute`` alongside
                ``sql``. When omitted, ``sql`` is executed exactly as given.

        Returns:
            The result of ``cursor.fetchall()``.
        """
        cur = self.__GetConnect()
        try:
            if params is None:
                cur.execute(sql)
            else:
                cur.execute(sql, params)
            return cur.fetchall()
        finally:
            # Close the connection even when execute/fetch raises.
            self.conn.close()

    def ExecNonQuery(self, sql, params=None):
        """Run a statement that returns no rows and commit it.

        Args:
            sql: statement text; may contain ``%s`` placeholders when
                ``params`` is given.
            params: optional values passed to ``cursor.execute`` alongside
                ``sql``. When omitted, ``sql`` is executed exactly as given.
        """
        cur = self.__GetConnect()
        try:
            if params is None:
                cur.execute(sql)
            else:
                cur.execute(sql, params)
            self.conn.commit()
        finally:
            # Close the connection even when execute/commit raises.
            self.conn.close()

def spider_blog(url, timeout=10):
    """Fetch a blog list page and return (ordinal, title, url) triples.

    Titles are taken from the text of every ``.link_title`` element; article
    links are taken by regex from ``<span class="link_title"><a href="...">``
    in the raw HTML. The two lists are paired by position, so the result is
    as long as the shorter of them.

    Args:
        url: address of the list page. Everything before its last path
            segment is used as the prefix for the extracted hrefs.
        timeout: seconds passed to ``requests.get`` so a stalled server
            cannot block the crawl indefinitely.

    Returns:
        A list of ``(ordinal, title, full_url)`` tuples, with ordinals
        starting at 1.
    """
    html = requests.get(url, timeout=timeout).content.decode('utf-8')
    tree = lxml.html.fromstring(html)

    # Evaluate the CSS selector once; it used to be re-run on every loop pass.
    title_nodes = tree.cssselect('.link_title')
    # Strip CRLF and all spaces from each title.
    list_index = [
        node.text_content().replace('\r\n', '').replace(' ', '')
        for node in title_nodes
    ]
    list_ordinal = list(range(1, len(list_index) + 1))

    # Pull the href of each article link out of the raw HTML.
    etl_value = re.findall(r'<span class="link_title"><a href="(.*?)">', html, re.S)
    # Drop the last path segment of the page URL to get the prefix for hrefs.
    cut_url = url.replace('/' + url.split('/')[-1], '')
    list_value = [
        (cut_url + value).replace('\r\n', '').replace(' ', '')
        for value in etl_value
    ]
    return list(zip(list_ordinal, list_index, list_value))


def _sql_string_literal(text):
    """Return *text* as a single-quoted SQL string literal with quotes doubled."""
    return "'" + text.replace("'", "''") + "'"


def main():
    """Scrape the test blog's post list, print each row and insert it into
    dbo.Blog_Message."""
    # NOTE(review): connection settings, including the password, are
    # hard-coded here; consider moving them to configuration.
    mscon = SQLServer(host=".\\binguo", user="binguo", pwd="none123", db="BlogDB")
    results = spider_blog('http://blog.csdn.net/binguo168?viewmode=contents')
    for ordinal, key, value in results:
        # Parenthesised single expression: prints the same on Python 2 and 3.
        print(str(ordinal) + '\t' + key + '\t' + value)
        # Title and URL come from a scraped page, so embedded single quotes
        # are doubled to keep them from terminating the literal.
        sql = ("INSERT INTO dbo.Blog_Message(Title,BlogURL) VALUES("
               + _sql_string_literal(key) + "," + _sql_string_literal(value) + ")")
        mscon.ExecNonQuery(sql)


if __name__ == '__main__':
    main()

Python簡單抓取CSDN博文列表並寫入SQL Server數據庫

-- Switch to the database the crawler script connects to.
USE BlogDB
GO
-- Test table that receives the scraped blog posts.
CREATE TABLE dbo.Blog_Message
(
    Ordinal INT IDENTITY(1,1),  -- identity column: seed 1, increment 1
    Title   VARCHAR(500),       -- post title
    BlogURL VARCHAR(1000)       -- post URL
)

SQL Server數據庫執行結果:

Python簡單抓取CSDN博文列表並寫入SQL Server數據庫

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章