需求:手機抓包和下載圖片(圖片重命名)
1. 抓包工具準備
1.1 Fiddler:該軟件默認端口是 8888
1.2 獵豹免費WiFi:
1.3 手機設置代理服務器
在 Windows 上使用 ipconfig 命令查看獵豹免費WiFi產生的 ip
手機設置代理服務器
2.Letvlive.py
import scrapy
import json

from Letv.items import LetvItem


# The spider name can be anything; it inherits from scrapy.Spider (basic spider).
class LetvliveSpider(scrapy.Spider):
    """Page through LeTV's Huya live-channel JSON API.

    Yields one LetvItem per streamer (nickname + screenshot URL) and keeps
    requesting the next page while the API header reports status == "1".
    """

    # Spider name; must be unique within the current project.
    name = 'Letvlive'
    # Crawl scope: only these domains are followed. If commented out there is
    # no domain restriction and any site can be crawled.
    allowed_domains = ['letv.com']

    page = 1
    pre = "http://dynamic.live.app.m.letv.com/android/dynamic.php?luamod=main&mod=live&ctl=liveHuya&act=channelList&pcode=010210000&version=7.17&channelId=2168&pages="
    # FIX: the original contained "®ion=CN" — "&reg" had been swallowed as the
    # HTML entity "®"; restored to the intended "&region=CN" query parameter.
    suf = "&country=CN&provinceid=1&districtid=9&citylevel=1&location=%E5%8C%97%E4%BA%AC%E5%B8%82%7C%E6%9C%9D%E9%98%B3%E5%8C%BA&lang=chs&region=CN"
    # Links in start_urls are NOT subject to the allowed_domains restriction.
    start_urls = [pre + str(page) + suf]

    def parse(self, response):
        """Parse one JSON page of the channel list and schedule the next."""
        # The response body is JSON text; convert it into a Python dict.
        python_dict = json.loads(response.text)
        for item in python_dict["body"]["result"]:
            letvItem = LetvItem()
            letvItem["nick"] = item["nick"]         # streamer nickname
            letvItem["image"] = item["screenshot"]  # screenshot URL
            print(letvItem)
            # Hand the item over to the pipelines.
            yield letvItem

        # status == "1" signals that more pages are available.
        if python_dict.get("header").get("status") == "1":
            self.page += 1
            new_url = self.pre + str(self.page) + self.suf
            # Scrapy de-duplicates scheduled URLs: a URL already requested is
            # not requested again, so the crawl ends once no new URLs remain.
            yield scrapy.Request(new_url, callback=self.parse)
3.pipelines.py
import scrapy
from scrapy.pipelines.images import ImagesPipeline  # image downloading support
import json
import os

from Letv.settings import IMAGES_STORE
# from scrapy.utils.project import get_project_settings


class LetvImagePipeline(ImagesPipeline):
    """Download each item's screenshot and rename it to "<nick>.jpg"."""

    # IMAGES_STORE = get_project_settings().get("IMAGES_STORE")

    def get_media_requests(self, item, info):
        """Schedule the download of the item's image URL."""
        # Feed the image URL to the scrapy engine so the downloader fetches it.
        yield scrapy.Request(item["image"])

    def item_completed(self, results, item, info):
        """Called once the download finishes.

        ``results`` is a list of (success, info) tuples, one per request made
        in get_media_requests; the info dict of a success carries "path".
        """
        print("results===", results)
        paths = [x["path"] for ok, x in results if ok]
        # FIX: the original indexed [0] unconditionally and raised IndexError
        # whenever the download failed; pass the item through instead.
        if not paths:
            return item
        image = paths[0]
        print(image)
        # Rename the stored file from its hash-based name to the nickname.
        old_image_name = IMAGES_STORE + "/" + image            # e.g. ./images/<hash>.jpg
        new_image_name = IMAGES_STORE + "/" + item["nick"] + ".jpg"
        print("old_image_name==", old_image_name)
        print("new_image_name==", new_image_name)
        os.rename(old_image_name, new_image_name)
        print(image)
        item["image_path"] = new_image_name
        return item


# Default pipeline: handles the textual (JSON) output.
class LetvPipeline(object):
    """Serialize every item as one JSON line into "<spider name>.json"."""

    # Called when the spider starts.
    def open_spider(self, spider):
        # FIX: explicit utf-8 so non-ASCII nicknames written with
        # ensure_ascii=False don't crash on a non-UTF-8 OS default encoding.
        self.file = open(spider.name + ".json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        # python dict --> JSON string, one item per line.
        json_str = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(json_str)
        return item

    # Called when the spider finishes.
    def close_spider(self, spider):
        self.file.close()
4.settings.py
# Do not obey robots.txt for this project.
ROBOTSTXT_OBEY = False

# Enabled item pipelines; the lower the number, the earlier it runs.
ITEM_PIPELINES = {
    'Letv.pipelines.LetvPipeline': 301,       # save items as text (JSON lines)
    'Letv.pipelines.LetvImagePipeline': 300,  # download and rename images
}

# Destination directory for downloaded images. Must be set (and correct),
# otherwise the images pipeline downloads nothing.
IMAGES_STORE = "./images"
5.運行文件 ---start.py
# Entry point: launch the "Letvlive" spider exactly as the CLI command
# `scrapy crawl Letvlive` would.
from scrapy import cmdline

cmdline.execute(["scrapy", "crawl", "Letvlive"])