Scrapy --- US TV Series TOP 100

With nothing better to do, I wanted to watch some US TV series but didn't know which ones were good, so I browsed meijutt ("US TV Series Heaven") and found a Top 100 ranking there. Since I was bored anyway, I used it to get some practice with the Scrapy framework. The process and code follow.
First we need to create a project (here from the Anaconda Prompt).
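A sketch of the commands, assuming the project name pachongday12 that the imports below rely on:

scrapy startproject pachongday12
cd pachongday12
scrapy genspider meiju meijutt.com

startproject creates the package skeleton (items.py, pipelines.py, settings.py, spiders/), and genspider drops a spider template named meiju into pachongday12/spiders.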
Now that the project is created, open it in PyCharm.
Here we decide which values we want to extract, and write them out to a .txt file.
We also enable a log file, which makes it easier to track down errors.
The end result is the information for the Top 100 series. The full code:

meiju.py
# -*- coding: utf-8 -*-
import scrapy
from pachongday12.items import Pachongday12Item

class MeijuSpider(scrapy.Spider):
    name = 'meiju'
    allowed_domains = ['meijutt.com']
    start_urls = ['https://www.meijutt.com/new100.html']


    def parse(self, response):
        # debug only: dump the raw page (the site is encoded in gb2312)
        # print(response.body.decode('gb2312'))
        obj_list = response.xpath('//ul[@class="top-list  fn-clear"]/li')
        for obj in obj_list:
            item = Pachongday12Item()

            # rank
            num = obj.xpath('.//div[@class="lasted-num fn-left"]/i/text()').extract()[0]
            item['num'] = num
            # series title
            name = obj.xpath('.//h5//text()').extract()[0]
            item['name'] = name
            # genre (renamed from "type" to avoid shadowing the builtin)
            mj_type = obj.xpath('.//span[@class="mjjq"]/text()').extract()[0]
            item['type'] = mj_type
            # TV network
            tv = obj.xpath('.//span[@class="mjtv"]/text()').extract()[0]
            item['tv'] = tv
            # last update time
            update_time = obj.xpath('.//div[@class="lasted-time new100time fn-right"]//text()').extract()[0]
            item['time'] = update_time
            # detail-page URL
            base_url = obj.xpath('.//h5/a/@href').extract()[0]
            url = 'https://www.meijutt.com' + base_url
            item['url'] = url

            # pass the partially filled item to the detail page via meta
            yield scrapy.Request(url=url, callback=self.parse_detail, meta={'data': item})

    def parse_detail(self, response):
        item = response.meta['data']
        info_list = response.xpath('//div[@class="o_r_contact"]/ul')
        for info in info_list:
            # director
            dy = info.xpath('./li[4]/span/text()').extract()[0]
            item['dy'] = dy
            # cast
            zy = info.xpath('./li[5]/span/text()').extract()[0]
            item['zy'] = zy
            yield item
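
One fragile point in the spider above: extract()[0] raises an IndexError as soon as a field is missing from a list entry. A more defensive sketch of the same parse method (my variant, not the original post's code) uses extract_first() with a default and response.urljoin() for the relative href:

    # drop-in variant of MeijuSpider.parse: same fields, but a missing node
    # yields an empty string instead of crashing the spider
    def parse(self, response):
        for obj in response.xpath('//ul[@class="top-list  fn-clear"]/li'):
            item = Pachongday12Item()
            item['num'] = obj.xpath('.//div[@class="lasted-num fn-left"]/i/text()').extract_first('')
            item['name'] = obj.xpath('.//h5//text()').extract_first('')
            item['type'] = obj.xpath('.//span[@class="mjjq"]/text()').extract_first('')
            item['tv'] = obj.xpath('.//span[@class="mjtv"]/text()').extract_first('')
            item['time'] = obj.xpath('.//div[@class="lasted-time new100time fn-right"]//text()').extract_first('')
            url = response.urljoin(obj.xpath('.//h5/a/@href').extract_first(''))
            item['url'] = url
            yield scrapy.Request(url=url, callback=self.parse_detail, meta={'data': item})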



items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class Pachongday12Item(scrapy.Item):
    # define the fields for your item here like:
    num = scrapy.Field()    # rank
    name = scrapy.Field()   # series title
    type = scrapy.Field()   # genre
    tv = scrapy.Field()     # TV network
    time = scrapy.Field()   # last update time
    url = scrapy.Field()    # detail-page URL
    dy = scrapy.Field()     # director
    zy = scrapy.Field()     # cast

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json

class Pachongday12Pipeline(object):
    def process_item(self, item, spider):
        return item

class MEIJU(object):
    def process_item(self, item, spider):
        # append one record per item (labels kept in Chinese to match the site)
        with open('Meiju.txt', 'a', encoding='utf-8') as fp:
            # alternative: json.dump(dict(item), fp, ensure_ascii=False)
            fp.write("名稱:" + item['name'] + '\n')
            fp.write("導演:" + item['dy'] + '\n')
            fp.write("主演:" + item['zy'] + '\n')
            fp.write("類型:" + item['type'] + '\n')
            fp.write("排名:" + item['num'] + '\n')
            fp.write("電視臺:" + item['tv'] + '\n')
            fp.write("路徑:" + item['url'] + '\n')
            fp.write('\n')
        return item
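
Reopening the file for every record works, but it is wasteful. A minimal sketch of the usual alternative (my addition; the class name MeijuFilePipeline is hypothetical) keeps one handle open for the whole crawl via Scrapy's standard open_spider/close_spider pipeline hooks:

import json

class MeijuFilePipeline(object):
    # hypothetical variant: one file handle for the whole crawl
    def open_spider(self, spider):
        self.fp = open('Meiju.txt', 'a', encoding='utf-8')

    def close_spider(self, spider):
        self.fp.close()

    def process_item(self, item, spider):
        # one JSON line per item; ensure_ascii=False keeps Chinese readable
        self.fp.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item

To use it, point ITEM_PIPELINES at pachongday12.pipelines.MeijuFilePipeline instead of MEIJU.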

settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for pachongday12 project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'pachongday12'

SPIDER_MODULES = ['pachongday12.spiders']
NEWSPIDER_MODULE = 'pachongday12.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'pachongday12 (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'pachongday12.middlewares.Pachongday12SpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'pachongday12.middlewares.Pachongday12DownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
   'pachongday12.pipelines.MEIJU': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

LOG_ENABLED = True     # enable logging
LOG_FILE = '美劇.log'   # write the log to a file
LOG_ENCODING = 'utf-8'
LOG_LEVEL = 'DEBUG'    # log level
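
With the settings in place, run the crawl from the project root:

scrapy crawl meiju

The formatted records land in Meiju.txt and the DEBUG log in 美劇.log. If you only want structured output, Scrapy's built-in feed exports also work without any pipeline, e.g. scrapy crawl meiju -o meiju.json.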