import requests
import logging
import time
import json
import pymysql
import os
# Logging setup
log_name = 'sb_spider_log.log'
logging.basicConfig(  # log output settings
    filename=log_name,
    filemode='a',
    level=logging.INFO,
    format='%(asctime)s %(levelname)s %(message)s',  # datefmt only applies when asctime is in the format
    datefmt='%Y-%m-%d %A %H:%M:%S')
# Connect to the MySQL database
db = pymysql.connect(      # database settings
    host='127.0.0.1',      # database address
    user='root',           # database user
    password='root',       # database password
    database='brand',      # database name
    charset='utf8')        # note: pymysql wants 'utf8', not 'utf-8'
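# A minimal sketch of the `brand`.`review` table this script writes to; the
# column names come from the INSERT in save_to_mysql() below, but the types
# here are guesses (hypothetical):
#   CREATE TABLE `brand`.`review` (
#       page_no VARCHAR(16), tm_name VARCHAR(255), ann_type_code VARCHAR(32),
#       tmname VARCHAR(255), reg_name VARCHAR(255), ann_type VARCHAR(64),
#       ann_num VARCHAR(32), reg_num VARCHAR(64), id VARCHAR(64),
#       rn VARCHAR(16), ann_date VARCHAR(32), regname VARCHAR(255)
#   ) DEFAULT CHARSET=utf8;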
# Fetch a dynamic proxy
def get_proxy():  # I use a rotating proxy service here; its private details are redacted
    manager_host = '182.88.160.111'  # dynamic proxy manager IP
    manager_port = 8123              # its port
    order = ''                       # order parameter for the proxy service (redacted)
    while True:
        url = 'http://%s:%d/get-proxy-api' % (manager_host, manager_port)
        params = {'order': order}
        res = requests.get(url, params=params)
        if res.status_code == 200 and res.text != '{}':
            proxy_config = json.loads(res.text)    # parse the returned JSON
            proxy_ip_port = proxy_config['proxy']  # assumed "ip:port" string of a usable proxy
            proxy = {'http': 'http://%s' % proxy_ip_port}  # proxy dict in the form requests expects
            break
        else:
            time.sleep(1)
            print('No proxy available yet')
    return proxy  # return the proxy dict
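# The proxy manager above is assumed (a shape inferred from the parsing code)
# to answer with JSON like {"proxy": "1.2.3.4:8080"}; the returned dict then
# routes plain-HTTP traffic through that proxy when passed via `proxies=`.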
def post_dg(url, data):  # main spider request; url and data are built in main()
    session = requests.Session()  # use a Session as a routine safeguard (keeps cookies across requests)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
        'Referer': 'http://sbgg.saic.gov.cn:9080/tmann/annInfoView/annSearch.html?annNum=1605',
        'Host': 'sbgg.saic.gov.cn:9080',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
        'Origin': 'http://sbgg.saic.gov.cn:9080'}  # build the request headers
    proxies = get_proxy()  # grab a usable proxy
    res = session.post(url, headers=headers, data=data, proxies=proxies)  # send the request through the proxy
    res.encoding = 'utf8'
    if res.status_code == 200 and res.text != '{}':  # does the page have data?
        logging.info('Page crawling succeeded')  # write to the log
        print('Page fetched successfully')
        return res.text  # return the response body on success
    else:
        print('Page has no content')
        if '出錯啦!' in res.text or 'ERROR' in res.text:  # match the site's error-page text
            print('IP blocked, page returned an error')
            logging.info('IP is blocked, Page is ERROR')
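# post_dg() is expected to return JSON text shaped roughly like
# {"rows": [<20 announcement records>, ...], ...} -- a shape inferred from
# save_to_mysql() below, which indexes item['rows'][x].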
def save_to_mysql(url, data):  # store results in MySQL
    html = post_dg(url, data)  # send the request and get the response body
    item = json.loads(html)
    for x in range(20):
        row = item['rows'][x]  # renamed from `list` to avoid shadowing the builtin
        row_str = ''
        for key in row.keys():
            row_str += '"%s",' % row[key]
        sql = ('INSERT INTO `brand`.`review` (page_no, tm_name, ann_type_code, '
               'tmname, reg_name, ann_type, ann_num, reg_num, id, rn, ann_date, '
               'regname) VALUES (%s)' % row_str[:-1])
        cur = db.cursor()
        cur.execute(sql)
        db.commit()
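# Note: interpolating values into SQL by hand breaks on embedded double quotes
# and is open to SQL injection. A safer sketch (same assumed columns, in the
# order the API returns them) lets pymysql do the quoting with placeholders:
#   cols = ('page_no, tm_name, ann_type_code, tmname, reg_name, ann_type, '
#           'ann_num, reg_num, id, rn, ann_date, regname')
#   placeholders = ', '.join(['%s'] * 12)
#   cur.execute('INSERT INTO `brand`.`review` (%s) VALUES (%s)'
#               % (cols, placeholders), tuple(row.values()))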
def main(i=1):
    url = 'http://sbgg.saic.gov.cn:9080/tmann/annInfoView/annSearchDG.html'
    try:
        while True:
            data = {  # POST form data; this run crawls announcement issue 1605
                'page': str(i),
                'rows': '20',
                'annNum': '1605',
                'totalYOrN': 'true',
            }  # no manual encoding needed: requests form-encodes the dict itself
            save_to_mysql(url, data)
            logging.info(
                'Page %s: 20 rows successfully written to the database', i)
            logging.info('``' * 30)
            i += 1
    except BaseException:
        with open('i.txt', 'w') as f:  # save the page number at the breakpoint to a file
            f.write(str(i))
        print('Crawl interrupted, most likely because the IP was blocked; '
              'switching User-Agent and IP now. You may also stop the program '
              'manually; the next run will resume from the interrupted page.')
        logging.info('Replace the agent..........')
        proxies = get_proxy()  # switch to a fresh proxy
        print('Now using proxy: ' + str(proxies))
        logging.info(
            'After the agent is replaced, the agent is ' +
            str(proxies))
        main(i)
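# Design note: main() restarts itself recursively after every interruption, so
# a long run with many failures could eventually hit Python's default
# recursion limit; a loop-based retry would avoid that.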
if __name__ == '__main__':
    # Look for i.txt in the current directory; if it exists, read the saved
    # page number and resume from where the last run was interrupted
    if 'i.txt' in os.listdir('.'):
        with open('i.txt', 'r') as f:
            i = int(f.read())
        main(i)
    else:  # otherwise start from the first page by default
        main()