# python爬蟲:分析Ajax請求爬取今日頭條街拍圖 (Python crawler: analysing Ajax requests to scrape Toutiao street-snap images)

import requests
from requests.exceptions import RequestException
import json
from urllib.parse import urlencode
from bs4 import BeautifulSoup
import os
from hashlib import md5
import re

def get_page(url, data):
    """Request ``url`` with ``data`` as query parameters and return the body.

    Args:
        url: Address to request.
        data: Mapping of query-string parameters (may be ``None``); it is
            handed to ``requests.get`` as ``params``.

    Returns:
        The response text when the server answers with HTTP 200, otherwise
        ``None`` (non-200 status or any ``RequestException``).
    """
    try:
        response = requests.get(url, params=data)
    except RequestException as e:
        # Report the failure and return None instead of the exception object:
        # callers treat the result as text, so an exception instance would
        # only blow up later in an unrelated place.
        print(e)
        return None
    if response.status_code == 200:
        return response.text
    return None

def parse_page(html):
    """Yield the article URL of every entry in a search-result JSON payload.

    Args:
        html: JSON text of the search response. ``None``, an empty string,
            a non-string value or text that is not valid JSON yields nothing.

    Yields:
        The ``article_url`` value of each item in the list stored under the
        top-level ``data`` key. Items without a usable ``article_url`` are
        skipped.
    """
    if not html:
        return
    try:
        data = json.loads(html)
    except (TypeError, ValueError):
        # TypeError: not a str/bytes object; ValueError: malformed JSON
        # (json.JSONDecodeError is a ValueError subclass).
        return
    if not isinstance(data, dict):
        return
    # ``data`` may be missing or null in the payload; treat both as "no items".
    for item in data.get('data') or []:
        if not isinstance(item, dict):
            continue
        article_url = item.get('article_url')
        if article_url:
            yield article_url

"""
def get_page_num(html):
    data = json.loads(html)
    if data and 'data' in data.keys():
        for item in data.get('data'):
            yield item.get('gallary_image_count')

def generate_page(de_url, num):  # 此處是生成圖片網頁代碼 但並不是圖片原網址 無法下載圖片源碼
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 \
            (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36'
    }
    group_url = list()
    pages = num + 1          # 索引由1開始 +1 索引值
    try:
        response = requests.get(de_url, headers = headers)
        if response.status_code == 200:
            html = response.text
            soup = BeautifulSoup(html, 'lxml')
            title = soup.select('title')[0].get_text()
    except RequestException as e:
        return e
    for page in range(1, pages):
        url = de_url + str("#p=") + str(page)
        group_url.append(url)
    for group in group_url:download_images(group)
    return {
        'title': title,
        'Images_url': group_url
    }
"""
def get_page_detail(de_url):   # Fetch the article page source used to extract gallery image URLs
    """Follow the redirects of an article URL and return the final page text.

    The address is resolved in two steps: a HEAD request that follows
    redirects, then a GET with redirects disabled whose ``Location`` header
    (when present) names the page that is finally downloaded.

    Args:
        de_url: Article URL to resolve.

    Returns:
        The page text when the final response has status 200, otherwise
        ``None`` (non-200 status or any ``RequestException``).
    """
    from urllib.parse import urljoin

    # NOTE(review): 'Remote Address' and 'Referrer Policy' look like values
    # copied from a browser's network panel rather than real request headers;
    # they are kept unchanged so the requests sent stay the same.
    headers = {
        'User-Agent': 'Mozilla / 5.0(Windows NT 10.0;Win64;x64) AppleWebKit / 537.36(KHTML, likeGecko) Chrome / 63.0.3239.108Safari / 537.36',
        'Remote Address': '153.3.235.87:443',
        'Referrer Policy': 'no - referrer - when - downgrade'
    }
    # The page redirects to a new https address. An earlier approach rewrote
    # 'http://' to 'https://www.' by string replacement; the redirect is now
    # resolved through requests instead.
    try:
        # The context manager closes the session's connections on every path.
        with requests.session() as requ:
            redirection = requ.head(de_url, allow_redirects=True)
            # Redirects are disabled here so the target can be read from the
            # response headers.
            response = requ.get(redirection.url, allow_redirects=False, headers=headers)
            print(response.url)
            location = response.headers.get('location')
            if location:
                # Location may be relative; resolve it against the request URL.
                resp = requ.get(urljoin(response.url, location))
            else:
                # No further redirect: this response already is the page.
                resp = response
            if resp.status_code == 200:
                return resp.text
            print("請求失敗")
            return None
    except RequestException as e:
        print(e)
        return None

def parse_page_detail(html, de_url):  # Extract the image URLs of a gallery article
    """Parse an article page, download its gallery images and describe them.

    The gallery data is embedded in the page as an escaped JSON string passed
    to ``JSON.parse`` inside the ``mediaInfo``/``gallery`` section. It is
    located with a regular expression, unescaped and decoded with ``json``.

    Args:
        html: Page source of the article. ``None`` or an empty string makes
            the function return ``None``.
        de_url: URL of the article; copied into the result unchanged.

    Returns:
        A dict with the keys ``title``, ``de_url`` and ``images`` (list of
        image URLs) when gallery data with a ``sub_images`` entry is found,
        otherwise ``None``.
    """
    if not html:
        return None
    soup = BeautifulSoup(html, 'lxml')
    title_tags = soup.select('title')
    # A page without a <title> element gets an empty title instead of raising.
    title = title_tags[0].get_text() if title_tags else ''

    images_pattern = re.compile(r'mediaInfo:.*?gallery: JSON.parse.*?"(.*)".*?siblingList', re.S)
    result = images_pattern.search(html)
    if not result:
        return None

    # In the page source every double quote is written as \" and every slash
    # as \\/ , e.g. {\"url\":\"http:\\/\\/p1.pstatp.com\\/origin\\/...\"}.
    # json.loads rejects that text because keys must be wrapped in plain
    # double quotes. Dropping one backslash in front of each quote and each
    # slash leaves regular quotes plus the sequence \/ , which JSON accepts
    # as an escaped slash.
    gallery_json = result.group(1).replace('\\"', '"').replace('\\/', '/')
    try:
        data = json.loads(gallery_json)
    except ValueError as e:
        # json.JSONDecodeError is a ValueError: the captured text was not JSON.
        print(e)
        return None
    if not isinstance(data, dict) or 'sub_images' not in data:
        return None

    sub_images = data.get('sub_images') or []
    images = [
        item.get('url')
        for item in sub_images
        if isinstance(item, dict) and item.get('url')
    ]
    for image in images:
        download_images(image)
    return {
        "title": title,
        'de_url': de_url,
        'images': images
    }

def download_images(url):        # Download one image
    """Fetch ``url`` and hand the response body to ``save_images``.

    The download is best-effort: a non-200 status or a ``RequestException``
    is ignored silently.

    Args:
        url: Address of the image to download.

    Returns:
        Always ``None``.
    """
    print('正在下載...', url)
    try:
        reply = requests.get(url)
        succeeded = reply.status_code == 200
        if succeeded:
            save_images(reply.content)
    except RequestException:
        # Request problems are deliberately not reported.
        return None
    return None

def save_images(content):         # Save one image to disk
    """Write ``content`` to ``<md5 of content>.jpg`` in the working directory.

    Naming the file after the MD5 digest of its bytes means identical content
    maps to the same file, which is written only if it does not exist yet.

    Args:
        content: Raw bytes of the image.
    """
    file_name = '{0}.{1}'.format(md5(content).hexdigest(), 'jpg')
    # os.getcwd() returns the current working directory.
    file_path = os.path.join(os.getcwd(), file_name)
    if not os.path.exists(file_path):
        # The with-statement closes the file; no explicit close() is needed.
        with open(file_path, 'wb') as f:
            f.write(content)
    print('保存成功...')

def main(offset, keyword):
    """Run one search request and process every article it returns.

    Args:
        offset: Value sent as the ``offset`` query parameter.
        keyword: Value sent as the ``keyword`` query parameter.
    """
    data = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': 20,
        'cur_tab': 3,
        'from': 'gallery'
    }
    # Pass the bare endpoint plus the parameter dict: get_page forwards the
    # dict to requests, which builds the query string. Encoding the dict into
    # the URL as well would send every parameter twice.
    url = 'https://www.toutiao.com/search_content/'
    html = get_page(url, data)
    if not isinstance(html, str):
        # The request failed, so there is nothing to parse.
        print("請求失敗")
        return
    for de_url in parse_page(html):
        if not de_url:
            continue
        text = get_page_detail(de_url)
        if not text:
            # Detail page could not be fetched; move on to the next article.
            continue
        print(text)
        parse_page_detail(text, de_url)


if __name__ == '__main__':
    main(0, '街拍')
# 發表評論
# 所有評論
# 還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
# 相關文章