一.安裝 MySQL:
從官網下載安裝包並完成安裝:https://dev.mysql.com/downloads/mysql/
二.安裝驅動程序:
在 Anaconda 環境下,可用以下任一命令安裝 PyMySQL 驅動:
pip3 install pymysql 或 conda install pymysql
三.連接數據庫:
# Open a connection to the local MySQL server.
import pymysql

conn = pymysql.connect(
    host='localhost',
    user='root',
    password='admin',
    # `database` is the current keyword; `db` is only kept as a deprecated alias.
    database='spider',
    # MySQL's 'utf8' holds at most 3 bytes per character, so 4-byte
    # characters (e.g. emoji) need 'utf8mb4'.
    charset='utf8mb4',
)
# Get a cursor whose fetched rows are dicts keyed by column name.
# Without the DictCursor argument the default cursor returns plain tuples.
cursor = conn.cursor(pymysql.cursors.DictCursor)
四.獲取並插入數據:
# Insert one (title, body) row into the `cnblogs` table.
# The placeholders in the SQL text are always written as %s; the values are
# handed to execute() separately instead of being formatted into the string.
insert_stmt = 'insert into cnblogs value (%s,%s)'
# The second argument of execute() is normally a list or tuple; when there is
# only one parameter it may be passed directly.
cursor.execute(insert_stmt, (title, body))
# Commit so the inserted row is actually persisted.
conn.commit()
五.具體實例:
5.1 (爬取博客園儲存於mysql數據庫)
from lxml import etree
import requests
import pandas as pd
# Crawl the cnblogs "recommended bloggers" list and store, for every post
# found, its title and a short excerpt of its body in the MySQL table `cnblogs`.
from urllib.parse import urljoin

import pymysql

REQUEST_TIMEOUT = 10     # seconds; requests waits indefinitely when no timeout is given
BODY_PREVIEW_CHARS = 10  # only this many characters of the serialized body are stored
# NOTE(review): the original listing ended with a bare `break` whose nesting
# level was lost with the indentation.  It is modelled here as "stop after the
# first blogger"; set to None to crawl every blogger in the list.
MAX_BLOGGERS = 1


def fetch_tree(page_url):
    """Download ``page_url`` and return it parsed as an lxml HTML tree."""
    html = requests.get(page_url, timeout=REQUEST_TIMEOUT).content.decode('utf-8')
    return etree.HTML(html)


conn = pymysql.connect(
    host='localhost',
    user='root',
    password='admin',
    database='spider',
    # MySQL's 'utf8' holds at most 3 bytes per character; scraped text may
    # contain 4-byte characters, which need 'utf8mb4'.
    charset='utf8mb4',
)
# Rows fetched through this cursor are dicts keyed by column name
# (the default cursor class would return tuples).
cursor = conn.cursor(pymysql.cursors.DictCursor)

# Insert statement for (title, body); pymysql placeholders are always %s.
sql = 'insert into cnblogs value (%s,%s)'

try:
    # Page that lists the recommended bloggers.
    recommed_url = 'https://www.cnblogs.com/aggsite/UserStats'
    ele = fetch_tree(recommed_url)
    elements = ele.xpath("//*[@id='blogger_list']//a/@href")
    # urljoin copes with protocol-relative as well as absolute hrefs.
    # The last two links are dropped as in the original listing
    # (presumably they are not blogger home pages -- confirm against the live page).
    url_list = [urljoin(recommed_url, href) for href in elements][:-2]

    for url in url_list[:MAX_BLOGGERS]:
        # Remember visited listing pages so a pager link that points back to
        # an earlier page cannot make the loop run forever.
        visited = set()
        while url and url not in visited:
            visited.add(url)
            print(url)
            ele2 = fetch_tree(url)
            word_urls = ele2.xpath('//*[@id="mainContent"]/div/div/div[2]/a/@href')
            for wordUrl in word_urls:
                ele3 = fetch_tree(urljoin(url, wordUrl))
                titles = ele3.xpath('//*[@id="cb_post_title_url"]/text()')
                bodies = ele3.xpath('//*[@id="cnblogs_post_body"]')
                if not titles or not bodies:
                    # Page without the expected markup: skip it instead of
                    # aborting the whole crawl with an IndexError.
                    print('skipped (title or body not found):', wordUrl)
                    continue
                title = titles[0]
                body = etree.tostring(bodies[0], encoding='utf-8').decode('utf-8')
                body = body[:BODY_PREVIEW_CHARS]
                parm = (title, body)
                # execute(sql, args): args is normally a list or tuple.
                cursor.execute(sql, parm)
                # Commit after every row so finished posts survive a later failure.
                conn.commit()
            # Both branches must select @href; otherwise the pager branch
            # yields <a> elements rather than URL strings.
            # NOTE(review): the first match is assumed to be the "next page"
            # link -- confirm against the live markup.
            next_page = ele2.xpath(
                "//*[@id='pager']/a/@href|//*[@id='nav_next_page']/a/@href"
            )
            url = urljoin(url, next_page[0]) if next_page else None
finally:
    # Always release the database resources, even if a request or insert failed.
    cursor.close()
    conn.close()
5.2 爬取菜鳥教程python100例子儲存於mysql數據庫
from lxml import etree
import requests#導入請求庫
import pymysql
# Crawl the runoob "Python 3 examples" index and store each example's title
# and description paragraph in the MySQL table `cainiao`.
from urllib.parse import urljoin

REQUEST_TIMEOUT = 10  # seconds; requests waits indefinitely when no timeout is given

conn = pymysql.connect(
    host='localhost',
    user='root',
    password='admin',
    database='spider',
    # MySQL's 'utf8' holds at most 3 bytes per character; 'utf8mb4' is full UTF-8.
    charset='utf8mb4',
)
# Rows fetched through this cursor are dicts keyed by column name
# (the default cursor class would return tuples).
cursor = conn.cursor(pymysql.cursors.DictCursor)

# Insert statement for (title, body) into the `cainiao` table;
# pymysql placeholders are always %s.
sql = 'insert into cainiao value (%s,%s)'

try:
    # Index page of the runoob Python 3 examples.
    recommed_url = 'https://www.runoob.com/python3/python3-examples.html'
    # Request the index page and decode the response bytes as UTF-8.
    res = requests.get(url=recommed_url, timeout=REQUEST_TIMEOUT).content.decode('utf-8')
    # Parse the returned HTML.
    ele = etree.HTML(res)
    # Extract the links to the individual example pages.
    elements = ele.xpath('//*[@id="content"]/ul/li/a/@href')
    # Resolve every link against the index URL; this handles relative and
    # absolute hrefs alike.
    url_list = [urljoin(recommed_url, href) for href in elements]

    for url in url_list:
        print(url)
        res2 = requests.get(url, timeout=REQUEST_TIMEOUT).content.decode('utf-8')
        ele2 = etree.HTML(res2)
        titles = ele2.xpath('//*[@id="content"]/h1/text()')
        bodies = ele2.xpath('//*[@id="content"]/p[2]/text()')
        if not titles or not bodies:
            # Page without the expected markup: skip it instead of aborting
            # the whole crawl with an IndexError.
            print('skipped (title or body not found):', url)
            continue
        title = titles[0]
        body = bodies[0]
        parm = (title, body)
        # execute(sql, args): args is normally a list or tuple.
        cursor.execute(sql, parm)
        # Commit after every row so finished pages survive a later failure.
        conn.commit()
finally:
    # Always release the database resources, even if a request or insert failed.
    cursor.close()
    conn.close()