準備工作
- 開發工具:pycharm
- python版本:python3
- 用到的類庫 re,requests,xlwt
- 可以保存在excel和數據庫中
安裝這些類庫我是藉助pip,如果不知道如何用pip請移步到
https://www.jianshu.com/p/7e59f52ea0b6
我們這裏以搜索“python”職位爲例
爬取數據保存到excel完整代碼如下,代碼裏註釋講解的很清楚了
# -*- coding:utf-8 -*-
import re # 用來做正則匹配用
import requests # 用來做網絡請求用
import xlwt # 用來創建excel文檔並寫入數據
# Search keyword for the 51job query (job title to look for).
key = 'python'
# Fetch the raw HTML of one search-result page from 51job.
def get_content(page):
    """Return the decoded HTML of result page *page* for the global
    search keyword ``key``.

    Bug fixed: the original called ``requests.get(url, headers, timeout=10)``,
    which binds ``headers`` to the positional ``params`` argument — the
    headers were never sent and got urlencoded into the query string.
    ``headers=headers`` sends them correctly.  The unused throwaway
    ``requests.session()`` (whose ``keep_alive`` flag affected nothing)
    has been removed as dead code.
    """
    headers = {
        'Host': 'search.51job.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    }
    url = ('http://search.51job.com/list/000000,000000,0000,00,9,99,'
           + key + ',2,' + str(page) + '.html')
    r = requests.get(url, headers=headers, timeout=10)
    # The site serves GBK; decode explicitly so Chinese text survives.
    r.encoding = 'gbk'
    return r.text
# Regex-based extraction of job rows from a 51job result page.
def get(html):
    """Return a list of (title, company, location, salary, date) tuples
    scraped from *html*.

    ``re.S`` lets the lazy ``.*?`` gaps span the newlines between cells.
    """
    pattern = (
        r'class="t1 ">.*? <a target="_blank" title="(.*?)"'
        r'.*? <span class="t2"><a target="_blank" title="(.*?)"'
        r'.*?<span class="t3">(.*?)</span>'
        r'.*?<span class="t4">(.*?)</span>'
        r'.*? <span class="t5">(.*?)</span>'
    )
    return re.findall(pattern, html, re.S)
def excel_write(items, index):
    """Write each 5-field job tuple in *items* as one row of the
    module-level worksheet ``ws``, starting at row *index*
    (row 0 holds the header)."""
    row = index
    for item in items:
        for col in range(5):
            ws.write(row, col, item[col])  # (row, column, cell value)
        print(row)  # progress: last row written
        row += 1
# Script entry: build the workbook, write the bold header row, then
# scrape each result page and save everything to disk.
newTable = "test.xls"  # output spreadsheet file name
wb = xlwt.Workbook(encoding='utf-8')  # workbook with utf-8 cell encoding
ws = wb.add_sheet('sheet1')  # the single sheet all rows go into
# Header captions: title / company / location / salary / date.
headData = ['職位', '公司', '地址', '薪資', '日期']
bold = xlwt.easyxf('font: bold on')
for colnum, caption in enumerate(headData):
    ws.write(0, colnum, caption, bold)  # (row 0, column, text, style)
# Scrape result pages; raise the upper bound of range() to fetch more pages.
for each in range(1, 2):
    # 50 results per page and row 0 is the header, hence this offset.
    index = (each - 1) * 50 + 1
    excel_write(get(get_content(each)), index)
wb.save(newTable)  # flush the workbook to disk
運行上面代碼控制檯輸出如下,代表我們成功保存了44條數據。
再來看下開發工具同目錄下多了一個test.xls文件
打開test.xls文件,可以看到我們成功保存了抓取到的數據到excel文件。這就爲我們後期做數據分析打好了準備
保存到mysql數據庫的完整代碼
# -*- coding:utf-8 -*-
import re # 用來做正則匹配用
import requests # 用來做網絡請求用
import xlwt # 用來創建excel文檔並寫入數據
import pymysql # 用來操作數據庫
# Search keyword for the 51job query (job title to look for).
key = 'python'
# Fetch the raw HTML of one search-result page from 51job.
def get_content(page):
    """Return the decoded HTML of result page *page* for the global
    search keyword ``key``.

    Bug fixed: the original called ``requests.get(url, headers, timeout=10)``,
    which binds ``headers`` to the positional ``params`` argument — the
    headers were never sent and got urlencoded into the query string.
    ``headers=headers`` sends them correctly.  The unused throwaway
    ``requests.session()`` (whose ``keep_alive`` flag affected nothing)
    has been removed as dead code.
    """
    headers = {
        'Host': 'search.51job.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    }
    url = ('http://search.51job.com/list/000000,000000,0000,00,9,99,'
           + key + ',2,' + str(page) + '.html')
    r = requests.get(url, headers=headers, timeout=10)
    # The site serves GBK; decode explicitly so Chinese text survives.
    r.encoding = 'gbk'
    return r.text
# Regex-based extraction of job rows from a 51job result page.
def get(html):
    """Return a list of (title, company, location, salary, date) tuples
    scraped from *html*.

    ``re.S`` lets the lazy ``.*?`` gaps span the newlines between cells.
    """
    pattern = (
        r'class="t1 ">.*? <a target="_blank" title="(.*?)"'
        r'.*? <span class="t2"><a target="_blank" title="(.*?)"'
        r'.*?<span class="t3">(.*?)</span>'
        r'.*?<span class="t4">(.*?)</span>'
        r'.*? <span class="t5">(.*?)</span>'
    )
    return re.findall(pattern, html, re.S)
# 第一種方式: =============數據存入excel表格
# def excel_write(items, index):
# # 爬取到的內容寫入excel表格
# for item in items: # 職位信息
# for i in range(0, 5):
# # print item[i]
# ws.write(index, i, item[i]) # 行,列,數據
# print(index)
# index += 1
#
#
# newTable = "test.xls" # 表格名稱
# wb = xlwt.Workbook(encoding='utf-8') # 創建excel文件,聲明編碼
# ws = wb.add_sheet('sheet1') # 創建表格
# headData = ['職位', '公司', '地址', '薪資', '日期'] # 表頭部信息
# for colnum in range(0, 5):
# ws.write(0, colnum, headData[colnum], xlwt.easyxf('font: bold on')) # 行,列
# for each in range(1, 10): # 查詢1-10頁的數據,這裏的10可以改成你想查詢多少頁
# index = (each - 1) * 50 + 1
# excel_write(get(get_content(each)), index)
# wb.save(newTable) # 數據保存到excel表格
# 第二種方式 =================數據存入數據庫
# Thin helper around pymysql for storing scraped job rows.
class MySQLCommand(object):
    """Connect to MySQL and insert job rows, skipping duplicates.

    A row is considered a duplicate when one with the same ``gongsi``
    (company) value already exists in the table.
    """

    def __init__(self):
        # Connection settings; adjust to the local MySQL setup.
        self.host = 'localhost'
        self.port = 3306          # MySQL port
        self.user = 'root'        # user name
        self.password = "qcl123"  # password
        self.db = "test"          # database (schema) name
        self.table = "shuju"      # table the rows are written to

    def connectMysql(self):
        """Open the connection and a cursor; print a message on failure."""
        try:
            self.conn = pymysql.connect(host=self.host, port=self.port, user=self.user,
                                        passwd=self.password, db=self.db, charset='utf8')
            self.cursor = self.conn.cursor()
        except pymysql.Error:  # narrowed from a bare except
            print('connect mysql error.')

    def insertData(self, my_dict):
        """Insert *my_dict* as one row unless the company already exists.

        Returns the new row id + 1 on success, 0 when the row is a
        duplicate, and None on error.

        Fixed: queries are now parameterized.  The original built SQL by
        string concatenation (injection-prone) and joined the values with
        ``'"," '``, which stored every field after the first with a stray
        leading space; the duplicate check then compared against
        ``' %s '`` with padding on both sides, so it could never match.
        """
        table = self.table  # table to operate on
        sqlExit = "SELECT gongsi FROM " + table + " WHERE gongsi = %s"
        res = self.cursor.execute(sqlExit, (my_dict['gongsi'],))
        if res:  # matched-row count > 0 means the data already exists
            print("數據已存在", res)
            return 0
        # Only reached when the row does not exist yet.
        try:
            cols = ', '.join(my_dict.keys())
            placeholders = ', '.join(['%s'] * len(my_dict))
            sql = "INSERT INTO %s (%s) VALUES (%s)" % (table, cols, placeholders)
            try:
                result = self.cursor.execute(sql, tuple(my_dict.values()))
                insert_id = self.conn.insert_id()  # id assigned to the new row
                self.conn.commit()
                if result:
                    print("插入成功", insert_id)
                    return insert_id + 1
            except pymysql.Error as e:
                # Roll back the failed insert before reporting.
                self.conn.rollback()
                if "key 'PRIMARY'" in e.args[1]:  # unique primary key clash
                    print("數據已存在,未插入數據")
                else:
                    print("插入數據失敗,原因 %d: %s" % (e.args[0], e.args[1]))
        except pymysql.Error as e:
            print("數據庫錯誤,原因%d: %s" % (e.args[0], e.args[1]))

    def getLastId(self):
        """Return the largest ``id`` in the table, or 0 when it is empty."""
        sql = "SELECT max(id) FROM " + self.table
        try:
            self.cursor.execute(sql)
            row = self.cursor.fetchone()  # single aggregate row
            if row[0]:
                return row[0]
            else:
                return 0  # empty table
        except pymysql.Error:  # narrowed from a bare except
            print(sql + ' execute failed.')

    def closeMysql(self):
        """Release the cursor and close the connection."""
        self.cursor.close()
        self.conn.close()
# Create the database helper and open its connection before scraping starts.
mysqlCommand = MySQLCommand()
mysqlCommand.connectMysql()
# Persist scraped job tuples into MySQL via the global ``mysqlCommand``.
def savedb(items, index):
    """Insert every non-empty job tuple in *items* into the database.

    *index* is accepted for symmetry with the Excel variant but the row
    id is actually derived from the table's current max id.
    """
    def pick(item, pos):
        # Field at *pos*, or "" when the tuple is shorter than expected.
        try:
            return item[pos]
        except Exception:
            return ""

    for item in items:
        # Skip empty results from the regex.
        if len(item) > 0:
            # Next id = current max id in the table + 1.
            dataCount = int(mysqlCommand.getLastId()) + 1
            # Bundle one scraped row as the dict insertData expects.
            news_dict = {
                "id": str(dataCount),
                "zhiwei": pick(item, 0),  # job title
                "gongsi": pick(item, 1),  # company
                "dizhi": pick(item, 2),   # location
                "xinzi": pick(item, 3),   # salary
                "riqi": pick(item, 4),    # posting date
            }
            try:
                # insertData skips rows whose company already exists.
                res = mysqlCommand.insertData(news_dict)
                if res:
                    dataCount = res
            except Exception as e:
                print("插入數據失敗", str(e))  # report the failed insert
# Fetch each result page and push its rows into MySQL.
# Widen the range() bound to scrape more than one page.
for page in range(1, 2):
    index = (page - 1) * 50 + 1  # row offset (50 rows per page)
    savedb(get(get_content(page)), index)
# Always close the database connection once all pages are stored.
mysqlCommand.closeMysql()
dataCount = 0
# NOTE(review): this second definition shadows the earlier MySQLCommand but
# is never instantiated anywhere visible in this file (the instance above is
# created before this point) — it targets a different schema (home/home_list,
# deduplicated by ``url``) and looks like a leftover from another project.
class MySQLCommand(object):
    """Connect to MySQL and insert rows into ``home.home_list``,
    skipping rows whose ``url`` already exists."""

    def __init__(self):
        # Connection settings; adjust to the local MySQL setup.
        self.host = 'localhost'
        self.port = 3306          # MySQL port
        self.user = 'root'        # user name
        self.password = "qcl123"  # password
        self.db = "home"          # database (schema) name
        self.table = "home_list"  # table the rows are written to

    def connectMysql(self):
        """Open the connection and a cursor; print a message on failure."""
        try:
            self.conn = pymysql.connect(host=self.host, port=self.port, user=self.user,
                                        passwd=self.password, db=self.db, charset='utf8')
            self.cursor = self.conn.cursor()
        except pymysql.Error:  # narrowed from a bare except
            print('connect mysql error.')

    def insertData(self, my_dict):
        """Insert *my_dict* as one row unless the url already exists.

        Returns the new row id + 1 on success, 0 when the row is a
        duplicate, and None on error.

        Fixed: queries are now parameterized.  The original built SQL by
        string concatenation (injection-prone) and joined the values with
        ``'"," '``, which stored every field after the first with a stray
        leading space; the duplicate check then compared against
        ``' %s '`` with padding on both sides, so it could never match.
        """
        table = self.table  # table to operate on
        sqlExit = "SELECT url FROM " + table + " WHERE url = %s"
        res = self.cursor.execute(sqlExit, (my_dict['url'],))
        if res:  # matched-row count > 0 means the data already exists
            print("數據已存在", res)
            return 0
        # Only reached when the row does not exist yet.
        try:
            cols = ', '.join(my_dict.keys())
            placeholders = ', '.join(['%s'] * len(my_dict))
            sql = "INSERT INTO %s (%s) VALUES (%s)" % (table, cols, placeholders)
            try:
                result = self.cursor.execute(sql, tuple(my_dict.values()))
                insert_id = self.conn.insert_id()  # id assigned to the new row
                self.conn.commit()
                if result:
                    print("插入成功", insert_id)
                    return insert_id + 1
            except pymysql.Error as e:
                # Roll back the failed insert before reporting.
                self.conn.rollback()
                if "key 'PRIMARY'" in e.args[1]:  # unique primary key clash
                    print("數據已存在,未插入數據")
                else:
                    print("插入數據失敗,原因 %d: %s" % (e.args[0], e.args[1]))
        except pymysql.Error as e:
            print("數據庫錯誤,原因%d: %s" % (e.args[0], e.args[1]))

    def getLastId(self):
        """Return the largest ``id`` in the table, or 0 when it is empty."""
        sql = "SELECT max(id) FROM " + self.table
        try:
            self.cursor.execute(sql)
            row = self.cursor.fetchone()  # single aggregate row
            if row[0]:
                return row[0]
            else:
                return 0  # empty table
        except pymysql.Error:  # narrowed from a bare except
            print(sql + ' execute failed.')

    def closeMysql(self):
        """Release the cursor and close the connection."""
        self.cursor.close()
        self.conn.close()
保存到數據庫的數據如下
到這裏就實現了python爬蟲功能