Hands-on | Python spiders: web-crawler notes

Requirements

1. Batch-scrape rental listing data from a target website (the Ganji rental listings used throughout this article).

Preparation

(1) Installing Scrapy:

1.Install Python 3.6 (download it from the official site)
2.Install pywin32: https://sourceforge.net/projects/pywin32/files/pywin32/   (pick the matching version)
3.Download lxml: https://www.lfd.uci.edu/~gohlke/pythonlibs/#lxml    (pick the matching version)
4.Double-click the pywin32 installer and click through until the install finishes
5.In the folder holding the downloads, hold Shift, right-click, and choose "Open command window here"
6.pip install lxml (Tab completes the wheel file name)
6_1.pip install Twisted-17.9.0-cp36-cp36m-win_amd64.whl //the PyPI build does not install cleanly here, so install this wheel separately
7.pip install scrapy (on a recent pip, the shortcut after this list may be all you need)
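
Note (a hedged shortcut, not part of the original steps): recent pip releases ship wheels for Scrapy's dependencies, so on many Windows machines the manual lxml/Twisted steps above are unnecessary and the following is enough:

#pip install --upgrade pip
#pip install scrapy
#scrapy version    //verify the installation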

(2) Command-line interaction: get familiar with the tool's commands (a standalone-script equivalent follows this list)

1.#scrapy shell http://gz.ganji.com/fang1/     #try crawling this URL to see whether the page comes back
2.#response         #on success this returns the response object
3.#view(response)   #open the fetched page in a browser
4.#response.xpath("//*[@id='puid-3026077037']/dl/dd[5]/div[1]/span[1]").extract()  //XPath copied from the browser: //*[@id="puid-3026077037"]/dl/dd[5]/div[1]/span[1]; note the inner double quotes must be changed to single quotes; this grabs the element
5.#response.xpath("//*[@id='puid-3026077037']/dl/dd[5]/div[1]/span[1]/text()").extract() //how to grab a single value
6.Change the locator so it matches the whole list: "//*[@id='puid-3026077037']/dl/dd[5]/div[1]/span[1]" --> "//div[@class='f-list-item ershoufang-list']/dl/dd[5]/div[1]/span[1]/text()"
7.#response.xpath("//div[@class='f-list-item ershoufang-list']/dl/dd[5]/div[1]/span[1]/text()").extract() //matches the price of every listing in the list
8.#response.xpath("//div[@class='f-list-item ershoufang-list']/dl/dd[1]/a/text()").extract()  //matches the title of every listing in the list

9.#len()    //check how many results matched
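
The same extraction can also be tested outside scrapy shell with a small standalone script. A minimal sketch (it assumes the requests package is installed, that the listing page is reachable without special headers, and that the f-list-item ershoufang-list class still exists in the markup):

import requests
from scrapy import Selector

html = requests.get("http://gz.ganji.com/fang1/").text
sel = Selector(text=html)

# same XPath expressions as in the shell session above
titles = sel.xpath("//div[@class='f-list-item ershoufang-list']/dl/dd[1]/a/text()").extract()
prices = sel.xpath("//div[@class='f-list-item ershoufang-list']/dl/dd[5]/div[1]/span[1]/text()").extract()

print(len(titles), len(prices))          # len() shows how many listings matched
for title, price in zip(titles, prices):
    print(title.strip(), price.strip())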

On to the code

(1) Create the Scrapy project

1.#scrapy startproject zufang    //command to create the project
2.Open PyCharm  //the IDE the author uses; it works well

3.Open the zufang directory (screenshot omitted)

4.Add a zufang_detail.py file under spiders/  //you can pick your own file name
5.Write the spider code in zufang_detail.py
6.Run the commands from PyCharm's built-in Terminal
7.#scrapy list //list the spiders in the project

8.#scrapy crawl zufang_detail  //start the crawl (see the export variant after this list)
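
A handy variant: scrapy can also dump the scraped items straight to a file with the -o flag, which is useful for a quick sanity check before the pipeline is wired up:

#scrapy crawl zufang_detail -o items.json   //export items to JSON (CSV and XML work the same way)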


If the code has no errors it runs and prints as expected. If something does go wrong, search the error message online; there is plenty of material and everyone's error tends to be different, so no error walkthrough is given here. Working it out yourself is how you really learn.


(2) Code walkthrough

1.Directory structure:

(screenshot omitted)

Creating the Scrapy project generates every file shown above except zufang_detail.py; for details see the tutorial: http://scrapy-chs.readthedocs.io/zh_CN/0.24/intro/tutorial.html
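
For reference, the generated layout typically looks like the sketch below (assuming the project module is named zufang_detail, to match the settings.py shown later; if you ran scrapy startproject zufang, substitute that name):

zufang_detail/
    scrapy.cfg                # deploy configuration
    zufang_detail/
        __init__.py
        items.py              # item field definitions
        pipelines.py          # pipelines (database / Excel export)
        settings.py           # project settings
        spiders/
            __init__.py
            zufang_detail.py  # the spider added by hand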

2.items.py //the item field definitions: just a class in which you declare the fields you want to collect. For rental listings, for example, you can declare the title, price, and so on (a short usage note follows the class).

# -*- coding: utf-8 -*-
import scrapy


class ZufangDetailItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()     # listing title
    money = scrapy.Field()     # price
    content = scrapy.Field()   # brief description
    address = scrapy.Field()   # address
    other = scrapy.Field()     # other information (e.g. transport)
    imgurl = scrapy.Field()    # image links
    url = scrapy.Field()       # listing URL
    phone = scrapy.Field()     # contact phone number
    filename = scrapy.Field()  # name of the output file
    id = scrapy.Field()        # random id used when naming saved images
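
Once defined, the item behaves like a dict inside the spider and pipeline; a quick illustration (the value here is made up):

item = ZufangDetailItem()
item['title'] = 'Two-bedroom flat in Tianhe'   # assigning a field not declared above would raise KeyError
print(item['title'])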

3.pipelines.py //the pipeline file: it receives the items whose fields were declared in items.py, and this is where you write them to a database or export them to an Excel file

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
from openpyxl import Workbook

class ZufangDetailPipeline(object):
    def open_spider(self, spider):
        # To save to MySQL instead, uncomment the connection code below
        # self.con = pymysql.connect(host="localhost", port=3306, user="root", passwd="123456", db="ganji", charset="utf8")
        # self.cu = self.con.cursor()

        # Save to Excel
        self.wb = Workbook()
        self.ws = self.wb.active
        self.ws.append(['Title', 'Price', 'Summary', 'Other info', 'Neighbourhood', 'Phone', 'URL'])  # header row

    def process_item(self, item, spider):
        # Save to MySQL (uncomment together with the connection code above)
        # print(spider.name, 'pipelines')
        # insert_sql = "insert into zufang (money,title,content,other,address,phone,url)  values('{}','{}','{}','{}','{}','{}','{}')"\
        #     .format(item['money'], item['title'], item['content'], item['other'], item['address'], item['phone'], item['url'])
        # try:
        #     # execute the SQL statement
        #     print(insert_sql)
        #     self.cu.execute(insert_sql)
        #     # commit the transaction
        #     self.con.commit()
        # except:
        #     # roll back on error
        #     self.con.rollback()
        # return item

        # Save to Excel
        line = [item['title'], item['money'], item['content'], item['other'], item['address'], item['phone'], item['url']]
        self.ws.append(line)  # append the item as a new row in the sheet
        self.wb.save(item['filename'] + '.xlsx')  # saved on every item; simple, but slow on large crawls
        return item

    def close_spider(self, spider):
        # close the MySQL connection if it was opened in open_spider
        if hasattr(self, 'con'):
            self.con.close()

4.settings.py //a handful of project-level settings

The key ones (a fuller sketch follows below):

BOT_NAME = 'zufang_detail' # the bot/project name, filled in automatically when the project is created
ITEM_PIPELINES = {
   'zufang_detail.pipelines.ZufangDetailPipeline': 300,# dict of the pipelines enabled for the project and their order; empty by default. The values are arbitrary, but by convention they are kept in the 0-1000 range.
}
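
For this kind of crawl a slightly fuller settings.py often helps. A sketch (the ROBOTSTXT_OBEY and DOWNLOAD_DELAY values are assumptions to tune for the target site, not part of the original project):

BOT_NAME = 'zufang_detail'

SPIDER_MODULES = ['zufang_detail.spiders']
NEWSPIDER_MODULE = 'zufang_detail.spiders'

ROBOTSTXT_OBEY = False   # assumption: fetch pages even if robots.txt disallows them
DOWNLOAD_DELAY = 3       # assumption: wait 3 s between requests to avoid being blocked

ITEM_PIPELINES = {
   'zufang_detail.pipelines.ZufangDetailPipeline': 300,
}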

5.zufang_detail.py //the spider itself, i.e. the actual business logic

import scrapy
from ..items import ZufangDetailItem
from scrapy import Selector
import urllib.request   # used to download the listing photos
import time
import os
import random
import string


class GanjiSpider(scrapy.Spider):
    name = "zufang_detail"  # spider name; if unsure, leave it the same as the project name
    start_urls = ['http://gz.ganji.com/fang1/tianhe/']  # the listing page to crawl

    def parse(self, response):  # parse the listing page

        # detail-page XPath example: //*[@id="f_detail"]/div[6]/div[2]/div[1]/div/div[2]/ul/li[1]
        url_list = response.xpath("//div[@class='f-list-item ershoufang-list']/@href").extract()
        # print(url_list)
        # return
        # yield scrapy.Request(response.urljoin(url_list[1]), callback=self.parse_detail, meta={'url': url_list[1]})
        for href in url_list:
            yield scrapy.Request(response.urljoin(href), callback=self.parse_detail, meta={'url': href})
        # Pagination: uncomment the block below to follow the next page
        # next_page_11 = response.xpath("*[@id='f_mew_list']/div[6]/div[1]/div[4]/div/div/ul/li[11]/a/@href").extract()
        # next_page_12 = response.xpath("*[@id='f_mew_list']/div[6]/div[1]/div[4]/div/div/ul/li[12]/a/@href").extract()
        # if next_page_11 is not None:
        #     next_page = next_page_11[0]
        # else:
        #     next_page = next_page_12[0]
        # if next_page is not None:
        #     next_page_new = response.urljoin(next_page)
        #     time.sleep(5)
        #     yield scrapy.Request(next_page_new, callback=self.parse)


    # Crawls the detail pages: parse() above gathers the listing links; this method follows each one into its detail page
    def parse_detail(self, response):
        item = ZufangDetailItem()
        item['url'] = response.meta['url']
        item['title'] = ''.join(response.xpath("//*[@id='f_detail']/div[5]/div[2]/div[2]/div[1]/p[1]/i/text()").extract())
        item['money'] = ''.join(response.xpath("//*[@id='f_detail']/div[5]/div[2]/div[2]/div[1]/ul[1]/li[1]/span[2]/text()").extract())

        # listing details
        content_list_htmls = response.xpath("//*[@id='f_detail']/div[5]/div[2]/div[2]/div[1]/ul[2]").extract()
        content_list = []
        for content_list_html in content_list_htmls:
            sel = Selector(text=content_list_html, type="html")
            spans_str = "|".join(sel.xpath('//li/span[2]/text()').extract())
            content_list.append(spans_str.replace("&nbsp","|"))
        item['content'] = content_list[0]
        # address information
        xiaoqu = response.xpath( "//*[@id='f_detail']/div[5]/div[2]/div[2]/div[1]/ul[3]/li[1]/span[2]/a/text()").extract()
        address1 = response.xpath("//*[@id='f_detail']/div[5]/div[2]/div[2]/div[1]/ul[3]/li[3]/span[2]/a[1]/text()").extract()
        address2 = response.xpath("//*[@id='f_detail']/div[5]/div[2]/div[2]/div[1]/ul[3]/li[3]/span[2]/a[2]/text()").extract()
        address3 = response.xpath("//*[@id='f_detail']/div[5]/div[2]/div[2]/div[1]/ul[3]/li[3]/span[2]/a[3]/text()").extract()
        if not address3:  # .extract() returns a list, so check for empty rather than None
            address3 = response.xpath("//*[@id='f_detail']/div[5]/div[2]/div[2]/div[1]/ul[3]/li[2]/span[2]/span/text()").extract()
            # print(content_list)
        item['address'] = ''.join(xiaoqu)+"|"+''.join(address1)+"-"+''.join(address2)+"-"+''.join(address3)
        # transport information
        item['other'] = ''.join(response.xpath("//*[@id='f_detail']/div[5]/div[2]/div[2]/div[1]/ul[3]/li[2]/div/span[1]/text()").extract())
        # contact phone number
        item['phone'] = ''.join(response.xpath("//*[@id='full_phone_show']/@data-phone").extract())
        # name of the output file
        item['filename'] = "Tianhe rental listings"
        item['id']= ''.join(random.sample(string.ascii_letters + string.digits, 8))

        # image URLs
        img_urls = response.xpath("//*[@id='f_detail']/div[5]/div[2]/div[1]/div/div[2]/ul/li/@data-image").extract()
        self.get_img(img_urls,item)
        yield item
        # print(item)

    def get_img(self, imgurls, item):  # download the listing photos
        path = 'D:\\lin\\csrapy\\zufang_detail\\'
        if not os.path.exists(path + item['filename']):
            os.mkdir(path + item['filename'])
        os.chdir(path + item['filename'])
        n = 1
        for img_url in imgurls:
            time.sleep(3)
            if n < 3:  # keep at most two photos per listing
                if not os.path.exists(item['id'] + '_%s.jpg' % n):
                    try:
                        urllib.request.urlretrieve(img_url, item['id'] + '_%s.jpg' % n)  # Python 3 style: save the image into the local folder
                    except:
                        return
                    print(n)
                    n += 1
            else:
                return
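
As an aside: instead of downloading photos by hand with urllib, Scrapy's built-in ImagesPipeline can handle the fetching, retrying and file naming. This is not what the code above uses; a minimal sketch (it requires Pillow, and the image_urls/images field names are the pipeline's defaults):

# settings.py (sketch)
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,
    'zufang_detail.pipelines.ZufangDetailPipeline': 300,
}
IMAGES_STORE = 'D:/lin/csrapy/zufang_detail/images'   # assumption: any writable folder

# items.py (sketch): the default ImagesPipeline expects these two fields
#     image_urls = scrapy.Field()
#     images = scrapy.Field()

# zufang_detail.py, inside parse_detail(), replace self.get_img(img_urls, item) with:
#     item['image_urls'] = img_urls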

(3) Results

1.Printout of the detail information

(screenshot omitted)

2.A folder is created in the local directory to hold the scraped data

(screenshot omitted)

The phone-number extraction seems slightly off; fixing it is left to the reader.

Closing note: this write-up is rough in places; corrections from more experienced readers are welcome.

Tool download link 1: https://download.csdn.net/download/u012728971/10476750

Tool download link 2: https://download.csdn.net/download/u012728971/10476780

(the tools are too large for one archive, so they are split into 2 parts)

Source code download link: https://download.csdn.net/download/u012728971/10476712

