import requests
import logging
import time
import json
import pymysql
import os
# Logging setup
log_name = 'sb_spider_log.log'
logging.basicConfig(  # log output settings
    filename=log_name,
    filemode='a',
    level=logging.INFO,
    format='%(asctime)s %(levelname)s %(message)s',  # datefmt only applies when asctime is in the format
    datefmt='%Y-%m-%d %A %H:%M:%S')
# Connect to the MySQL database
db = pymysql.connect(      # database settings
    host='127.0.0.1',      # database address
    user='root',           # database user
    password='root',       # database password
    database='brand',      # database name
    charset='utf8')        # note: pymysql wants 'utf8', not 'utf-8'
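# A minimal sketch of the `brand`.`review` table this script writes to; the
# column names come from the INSERT in save_to_mysql() below, but the types
# here are guesses (hypothetical):
#   CREATE TABLE `brand`.`review` (
#       page_no VARCHAR(16), tm_name VARCHAR(255), ann_type_code VARCHAR(32),
#       tmname VARCHAR(255), reg_name VARCHAR(255), ann_type VARCHAR(64),
#       ann_num VARCHAR(32), reg_num VARCHAR(64), id VARCHAR(64),
#       rn VARCHAR(16), ann_date VARCHAR(32), regname VARCHAR(255)
#   ) DEFAULT CHARSET=utf8;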
# Fetch a dynamic proxy
def get_proxy():  # I use a rotating proxy service here; its private details are redacted
    manager_host = '182.88.160.111'  # dynamic proxy manager IP
    manager_port = 8123              # its port
    order = ''                       # order parameter for the proxy service (redacted)
    while True:
        url = 'http://%s:%d/get-proxy-api' % (manager_host, manager_port)
        params = {'order': order}
        res = requests.get(url, params=params)
        if res.status_code == 200 and res.text != '{}':
            proxy_config = json.loads(res.text)    # parse the returned JSON
            proxy_ip_port = proxy_config['proxy']  # assumed "ip:port" string of a usable proxy
            proxy = {'http': 'http://%s' % proxy_ip_port}  # proxy dict in the form requests expects
            break
        else:
            time.sleep(1)
            print('No proxy available yet')
    return proxy  # return the proxy dict
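# The proxy manager above is assumed (a shape inferred from the parsing code)
# to answer with JSON like {"proxy": "1.2.3.4:8080"}; the returned dict then
# routes plain-HTTP traffic through that proxy when passed via `proxies=`.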
def post_dg(url, data):  # main spider request; url and data are built in main()
    session = requests.Session()  # use a Session as a routine safeguard (keeps cookies across requests)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
        'Referer': 'http://sbgg.saic.gov.cn:9080/tmann/annInfoView/annSearch.html?annNum=1605',
        'Host': 'sbgg.saic.gov.cn:9080',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
        'Origin': 'http://sbgg.saic.gov.cn:9080'}  # build the request headers
    proxies = get_proxy()  # grab a usable proxy
    res = session.post(url, headers=headers, data=data, proxies=proxies)  # send the request through the proxy
    res.encoding = 'utf8'
    if res.status_code == 200 and res.text != '{}':  # does the page have data?
        logging.info('Page crawling succeeded')  # write to the log
        print('Page fetched successfully')
        return res.text  # return the response body on success
    else:
        print('Page has no content')
        if '出錯啦!' in res.text or 'ERROR' in res.text:  # match the site's error-page text
            print('IP blocked, page returned an error')
            logging.info('IP is blocked, Page is ERROR')
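# post_dg() is expected to return JSON text shaped roughly like
# {"rows": [<20 announcement records>, ...], ...} -- a shape inferred from
# save_to_mysql() below, which indexes item['rows'][x].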
def save_to_mysql(url, data):  # store results in MySQL
    html = post_dg(url, data)  # send the request and get the response body
    item = json.loads(html)
    for x in range(20):
        row = item['rows'][x]  # renamed from `list` to avoid shadowing the builtin
        row_str = ''
        for key in row.keys():
            row_str += '"%s",' % row[key]
        sql = ('INSERT INTO `brand`.`review` (page_no, tm_name, ann_type_code, '
               'tmname, reg_name, ann_type, ann_num, reg_num, id, rn, ann_date, '
               'regname) VALUES (%s)' % row_str[:-1])
        cur = db.cursor()
        cur.execute(sql)
        db.commit()
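# Note: interpolating values into SQL by hand breaks on embedded double quotes
# and is open to SQL injection. A safer sketch (same assumed columns, in the
# order the API returns them) lets pymysql do the quoting with placeholders:
#   cols = ('page_no, tm_name, ann_type_code, tmname, reg_name, ann_type, '
#           'ann_num, reg_num, id, rn, ann_date, regname')
#   placeholders = ', '.join(['%s'] * 12)
#   cur.execute('INSERT INTO `brand`.`review` (%s) VALUES (%s)'
#               % (cols, placeholders), tuple(row.values()))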
def main(i=1):
    url = 'http://sbgg.saic.gov.cn:9080/tmann/annInfoView/annSearchDG.html'
    try:
        while True:
            data = {  # POST form data; this run crawls announcement issue 1605
                'page': str(i),
                'rows': '20',
                'annNum': '1605',
                'totalYOrN': 'true',
            }  # no manual encoding needed: requests form-encodes the dict itself
            save_to_mysql(url, data)
            logging.info(
                'Page %s: 20 rows successfully written to the database', i)
            logging.info('``' * 30)
            i += 1
    except BaseException:
        with open('i.txt', 'w') as f:  # save the page number at the breakpoint to a file
            f.write(str(i))
        print('Crawl interrupted, most likely because the IP was blocked; '
              'switching User-Agent and IP now. You may also stop the program '
              'manually; the next run will resume from the interrupted page.')
        logging.info('Replace the agent..........')
        proxies = get_proxy()  # switch to a fresh proxy
        print('Now using proxy: ' + str(proxies))
        logging.info(
            'After the agent is replaced, the agent is ' +
            str(proxies))
        main(i)
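# Design note: main() restarts itself recursively after every interruption, so
# a long run with many failures could eventually hit Python's default
# recursion limit; a loop-based retry would avoid that.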
if __name__ == '__main__':
    # Look for i.txt in the current directory; if it exists, read the saved
    # page number and resume from where the last run was interrupted
    if 'i.txt' in os.listdir('.'):
        with open('i.txt', 'r') as f:
            i = int(f.read())
        main(i)
    else:  # otherwise start from the first page by default
        main()