"""Crawl Toutiao street-snap galleries through the site's Ajax search endpoint.

The search endpoint returns JSON that lists article URLs. Each article page
embeds its gallery as an escaped JSON string inside a script; that string is
extracted, decoded, and every image it lists is downloaded into the current
working directory.
"""
import json
import os
import re
from hashlib import md5
from urllib.parse import urlencode, urljoin

import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException

# Seconds to wait for any single HTTP request; requests never times out
# unless told to, so without this a stalled server would hang the crawl.
REQUEST_TIMEOUT = 10

# Captures the escaped JSON argument of ``gallery: JSON.parse("...")`` that
# sits between ``mediaInfo:`` and ``siblingList`` in the article page source.
_GALLERY_PATTERN = re.compile(
    r'mediaInfo:.*?gallery: JSON.parse.*?"(.*)".*?siblingList', re.S)


def get_page(url, data=None):
    """Fetch ``url`` and return the response body as text.

    Args:
        url: Address to request.
        data: Optional mapping sent as query-string parameters. Leave it as
            ``None`` when ``url`` already carries its query string.

    Returns:
        The response text on HTTP 200, otherwise ``None`` (also ``None`` when
        the request itself fails).
    """
    try:
        response = requests.get(url, params=data, timeout=REQUEST_TIMEOUT)
    except RequestException as e:
        # Report and return None so callers never receive an exception
        # object where they expect page text.
        print(e)
        return None
    if response.status_code == 200:
        return response.text
    return None


def parse_page(html):
    """Yield the article URLs listed in a search-result JSON document.

    Args:
        html: JSON text returned by the search endpoint, or ``None``.

    Yields:
        Each non-empty ``article_url`` found under the top-level ``data`` key.
        Nothing is yielded for empty, non-JSON, or unexpectedly shaped input.
    """
    if not html:
        return
    try:
        data = json.loads(html)
    except ValueError:
        # Not JSON (e.g. an error page) - nothing to yield.
        return
    if not isinstance(data, dict):
        return
    for item in data.get('data') or []:
        article_url = item.get('article_url')
        # Some entries carry no article_url; skip them instead of handing
        # None to the detail fetcher.
        if article_url:
            yield article_url


# NOTE: The two helpers below were already disabled in the original script
# (wrapped in a bare string literal). generate_page only builds
# "<article url>#p=<n>" page addresses, which are not the real image URLs, so
# the images cannot be downloaded from them. Kept for reference only.
#
# def get_page_num(html):
#     data = json.loads(html)
#     if data and 'data' in data.keys():
#         for item in data.get('data'):
#             yield item.get('gallary_image_count')
#
# def generate_page(de_url, num):
#     headers = {
#         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
#                       'AppleWebKit/537.36 (KHTML, like Gecko) '
#                       'Chrome/63.0.3239.108 Safari/537.36'
#     }
#     group_url = list()
#     pages = num + 1  # page index starts at 1, so add 1 for range()
#     try:
#         response = requests.get(de_url, headers = headers)
#         if response.status_code == 200:
#             html = response.text
#             soup = BeautifulSoup(html, 'lxml')
#             title = soup.select('title')[0].get_text()
#     except RequestException as e:
#         return e
#     for page in range(1, pages):
#         url = de_url + str("#p=") + str(page)
#         group_url.append(url)
#     for group in group_url:download_images(group)
#     return {
#         'title': title,
#         'Images_url': group_url
#     }


def get_page_detail(de_url):
    """Fetch the HTML source of an article page, following its redirect.

    The article URL redirects to a new https address. The redirect target is
    read from the ``Location`` header of a non-redirecting GET and requested
    explicitly; when no ``Location`` header is present the non-redirecting
    response itself is used.

    Args:
        de_url: Article URL taken from the search results.

    Returns:
        The page text on HTTP 200, otherwise ``None`` (also ``None`` when a
        request fails).
    """
    # NOTE(review): these header values are reproduced unchanged from the
    # original script. 'Remote Address' and 'Referrer Policy' do not look
    # like real request headers and the User-Agent contains stray spaces -
    # confirm against the live site before cleaning them up.
    headers = {
        'User-Agent': 'Mozilla / 5.0(Windows NT 10.0;Win64;x64) AppleWebKit / 537.36'
                      '(KHTML, likeGecko) Chrome / 63.0.3239.108Safari / 537.36',
        'Remote Address': '153.3.235.87:443',
        'Referrer Policy': 'no - referrer - when - downgrade'
    }
    # An earlier approach rewrote 'http://' to 'https://www.' by hand; the
    # redirect-based lookup below replaced it.
    try:
        # The context manager closes the session's connections when done.
        with requests.session() as session:
            redirection = session.head(
                de_url, allow_redirects=True, timeout=REQUEST_TIMEOUT)
            # Redirects are disabled here so the Location header can be read.
            response = session.get(
                redirection.url, allow_redirects=False, headers=headers,
                timeout=REQUEST_TIMEOUT)
            print(response.url)
            location = response.headers.get('location')
            if location:
                # Location may be relative; resolve it against the request URL.
                resp = session.get(
                    urljoin(response.url, location), timeout=REQUEST_TIMEOUT)
            else:
                # No further redirect: this response already is the page.
                resp = response
            if resp.status_code == 200:
                return resp.text
            print("請求失敗")
            return None
    except RequestException as e:
        print(e)
        return None


def parse_page_detail(html, de_url):
    """Extract the gallery image URLs from an article page and download them.

    Args:
        html: Article page source, or ``None`` when the fetch failed.
        de_url: URL the page was fetched from; echoed back in the result.

    Returns:
        A dict with ``title``, ``de_url`` and ``images`` (list of image URLs)
        when a gallery with ``sub_images`` is found, otherwise ``None``.
    """
    if not html:
        return None
    soup = BeautifulSoup(html, 'lxml')
    title_tags = soup.select('title')
    title = title_tags[0].get_text() if title_tags else ''
    result = _GALLERY_PATTERN.search(html)
    if not result:
        return None
    # The captured text is the body of a JavaScript string literal, so every
    # double quote appears as \" and every slash as \\/ . Sample:
    #   {\"count\":7,\"sub_images\":[{\"url\":\"http:\\/\\/p1.pstatp.com
    #   \\/origin\\/66b20003be4611dea592\",\"width\":800,\"url_list\":[...],
    #   \"uri\":\"origin\\/66b20003be4611dea592\",\"height\":1186
    # json.loads requires keys in plain double quotes, so strip one level of
    # escaping first: \" becomes " and \\/ becomes \/ (a valid JSON escape).
    unescaped = result.group(1).replace('\\"', '"').replace('\\/', '/')
    try:
        data = json.loads(unescaped)
    except ValueError:
        # Page layout changed or the capture was truncated.
        return None
    if data and 'sub_images' in data:
        sub_images = data.get('sub_images')
        images = [item.get('url') for item in sub_images]
        for image in images:
            download_images(image)
        return {
            "title": title,
            'de_url': de_url,
            'images': images
        }
    return None


def download_images(url):
    """Download one image and save it to disk; failures are ignored.

    Args:
        url: Address of the image to fetch.

    Returns:
        Always ``None``.
    """
    print('正在下載...', url)
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        if response.status_code == 200:
            save_images(response.content)
        return None
    except RequestException:
        # Best effort: a failed image must not stop the rest of the crawl.
        return None


def save_images(content):
    """Write image bytes to ``<cwd>/<md5 of content>.jpg`` unless it exists.

    Naming the file after the MD5 of its content means identical images are
    written only once.

    Args:
        content: Raw image bytes.
    """
    # os.getcwd() is the current working directory.
    file_path = '{0}/{1}.{2}'.format(os.getcwd(), md5(content).hexdigest(), 'jpg')
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)
        print('保存成功...')


def main(offset, keyword):
    """Search galleries for ``keyword`` and download every image found.

    Args:
        offset: Result offset passed to the search endpoint.
        keyword: Search keyword.
    """
    data = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': 20,
        'cur_tab': 3,
        'from': 'gallery'
    }
    # urlencode turns the dict into a query string. The URL is therefore
    # complete, so the parameters are not passed to get_page a second time.
    url = 'https://www.toutiao.com/search_content/?' + urlencode(data)
    html = get_page(url)
    for de_url in parse_page(html):
        text = get_page_detail(de_url)
        if text is None:
            continue
        print(text)
        parse_page_detail(text, de_url)


if __name__ == '__main__':
    main(0, '街拍')
Python 爬蟲:分析 Ajax 請求爬取今日頭條街拍圖
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼?請在上方評論欄輸入並且點擊發布。