Scrapy Crawler in Practice + MySQL

The URL crawled this time is http://www.yy6080.cn/vodtypehtml/1.html (Google Chrome is recommended, since it makes inspecting the page source easier).

First, open a command line and create the crawler project:

scrapy startproject NewVideoMovie

Then change into the project directory:

cd NewVideoMovie

Finally, generate the spider (genspider takes a spider name and a domain; the name must match the name attribute used in the spider code later):

scrapy genspider newvideomovie www.yy6080.cn

The result after creation (screenshot omitted). Besides what Scrapy generates, the dao directory, startspider.py, and pymysql.json are added by hand; the layout used throughout this post is roughly as follows (placing startspider.py and pymysql.json at the project root assumes the crawl is launched from there, since BaseDao opens pymysql.json relative to the working directory):

NewVideoMovie/
├── scrapy.cfg
├── startspider.py
├── pymysql.json
└── NewVideoMovie/
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    ├── dao/
    │   ├── basedao.py
    │   └── taskdao.py
    └── spiders/
        └── newvideomovie.py

The two .py files under the dao directory are used to connect to the database.

With that, the basic crawler skeleton is in place. On to the next step: create the database db_newvideomovie_data and two tables in it. (Only the table structure matters here; the rows visible in the screenshots are data from a successful crawl.)
(screenshots of the database and of the two tables, which taskdao.py below calls nvm_collect_task and nvm_collect_filminfo, omitted)
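Since the post only shows the two tables as screenshots, here is a minimal setup sketch that recreates them. The table and column names come from the INSERT statements in taskdao.py further below; the column types and the id primary keys are my assumptions, and the script expects the database itself to exist already.

# createtables.py - hypothetical one-off setup script
import pymysql

DDL = [
    """CREATE TABLE IF NOT EXISTS nvm_collect_task (
        id INT AUTO_INCREMENT PRIMARY KEY,
        film_name VARCHAR(255),
        film_ranking VARCHAR(32),
        film_type VARCHAR(255),
        film_href VARCHAR(512)
    )""",
    """CREATE TABLE IF NOT EXISTS nvm_collect_filminfo (
        id INT AUTO_INCREMENT PRIMARY KEY,
        filminfo_name VARCHAR(255),
        filminfo_director VARCHAR(255),
        filminfo_scriptwriter VARCHAR(255),
        filminfo_protagonist TEXT,
        filminfo_type VARCHAR(255),
        filminfo_country VARCHAR(255),
        filminfo_language VARCHAR(255),
        filminfo_releasetime VARCHAR(64),
        filminfo_ranking VARCHAR(32),
        filminfo_content TEXT
    )""",
]

# connection settings taken from the pymysql.json file shown at the end of the post
connection = pymysql.connect(host='127.0.0.1', user='root', password='wuxiulai',
                             database='db_newvideomovie_data', charset='utf8')
try:
    with connection.cursor() as cursor:
        for statement in DDL:
            cursor.execute(statement)
    connection.commit()
finally:
    connection.close()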

items.py code:

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy

class NewvideomovieItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()

    filmName    = scrapy.Field()  # movie title
    filmRanking = scrapy.Field()  # movie rating
    filmType    = scrapy.Field()  # movie genre
    filmHref    = scrapy.Field()  # link to the movie's detail page
    nextURL     = scrapy.Field()  # link to the next page
    nextPage    = scrapy.Field()

    # fields from the second-level (detail) page
    filminfo_name         = scrapy.Field()  # movie title
    filminfo_director     = scrapy.Field()  # director
    filminfo_scriptwriter = scrapy.Field()  # scriptwriter
    filminfo_protagonist  = scrapy.Field()  # leading actors
    filminfo_type         = scrapy.Field()  # genre
    filminfo_country      = scrapy.Field()  # country of production
    filminfo_language     = scrapy.Field()  # language
    filminfo_releasetime  = scrapy.Field()  # release date
    filminfo_ranking      = scrapy.Field()  # rating
    filminfo_content      = scrapy.Field()  # plot summary
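For reference, Scrapy items behave like dicts over the declared fields; a quick sketch (the title here is just a placeholder):

from NewVideoMovie.items import NewvideomovieItem

item = NewvideomovieItem()
item['filmName'] = 'Example Movie'         # assigning a declared field works like a dict
print(item.get('filmRanking', 'not set'))  # unset fields can be read safely with .get()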

pipelines.py code:

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

from .dao.taskdao import TaskDao

class NewvideomoviePipeline(object):
    def process_item(self, item, spider):
        # persist the four list-page fields into the task table
        s = TaskDao()
        s.create((item['filmName'], item['filmRanking'], item['filmType'], item['filmHref']))
        print('Pipeline output:')
        print(item['filmName'])
        print(item['filmRanking'])
        print(item['filmType'])
        print(item['filmHref'])
        return item  # always return the item so later pipelines can see it
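Note that this pipeline only persists the four list-page fields; the detail-page fields collected later in parse_filminfo are never written to the filminfo table. A minimal sketch of a second pipeline that would do so (the class name is hypothetical; the .get() defaults cover fields the detail page did not provide):

class NewvideomovieFilminfoPipeline(object):
    def process_item(self, item, spider):
        dao = TaskDao()
        # write the detail-page fields into nvm_collect_filminfo
        dao.create_filminfo((
            item.get('filmName', 'None'),
            item.get('filminfo_director', 'None'),
            item.get('filminfo_scriptwriter', 'None'),
            item.get('filminfo_protagonist', 'None'),
            item.get('filminfo_type', 'None'),
            item.get('filminfo_country', 'None'),
            item.get('filminfo_language', 'None'),
            item.get('filminfo_releasetime', 'None'),
            item.get('filminfo_ranking', 'None'),
            item.get('filminfo_content', 'None'),
        ))
        return item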

settings.py code:

# -*- coding: utf-8 -*-

# Scrapy settings for NewVideoMovie project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'NewVideoMovie'

SPIDER_MODULES = ['NewVideoMovie.spiders']
NEWSPIDER_MODULE = 'NewVideoMovie.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'NewVideoMovie (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 2
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
    'NewVideoMovie.middlewares.NewvideomovieSpiderMiddleware': 543,
}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'NewVideoMovie.middlewares.NewvideomovieDownloaderMiddleware': 543,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'NewVideoMovie.pipelines.NewvideomoviePipeline': 300,
    #'NewVideoMovie.mysqlpipelines.NewvideomoviePipeline': 301,

}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# LOG_LEVEL = 'ERROR'
# LOG_FILE = 'jobspider.log'
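If you enable the hypothetical NewvideomovieFilminfoPipeline sketched in the pipelines section, register it with a higher number so it runs after the existing pipeline (lower numbers run first):

ITEM_PIPELINES = {
    'NewVideoMovie.pipelines.NewvideomoviePipeline': 300,
    # hypothetical second pipeline from the sketch above
    'NewVideoMovie.pipelines.NewvideomovieFilminfoPipeline': 301,
}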

Code of newvideomovie.py under the spiders directory:

# -*- coding: utf-8 -*-

import scrapy
from NewVideoMovie.items import NewvideomovieItem


class NewvideomovieSpider(scrapy.Spider):
    name = 'newvideomovie'
    # allowed_domains = ['http://www.yy6080.cn/vodtypehtml/1.html']
    start_urls = ['http://www.yy6080.cn/vodtypehtml/1.html']

    def parse(self, response):

        # one selector per movie card on the list page
        subSelectors = response.xpath('//div[@class="movie-item"]')
        movieLen = len(subSelectors)
        movieCount = 0

        # the "next page" link is page-level, so extract it once, outside the loop
        nextHrefs = response.xpath('//a[@class="pagelink_a"]/@href').extract()
        nextTexts = response.xpath('//a[@class="pagelink_a"]/text()').extract()
        nextURL = None
        if len(nextTexts) >= 2 and nextTexts[-2] == '下一頁':  # '下一頁' = "next page"
            nextURL = 'http://www.yy6080.cn' + nextHrefs[-2]

        for subSelector in subSelectors:
            movieCount += 1
            NVM = NewvideomovieItem()

            # movie title
            filmName = subSelector.xpath('div[@class="meta"]/div/a/text()')
            if filmName:
                NVM['filmName'] = filmName.extract()[0].strip()

            # movie rating (strip the trailing '分' suffix)
            filmRanking = subSelector.xpath('div[@class="meta"]/div/span/text()')
            if filmRanking:
                NVM['filmRanking'] = filmRanking.extract()[0].strip('分').split()[0]

            # movie genre
            filmType = subSelector.xpath('div[@class="meta"]/div[@class="otherinfo"]/text()')
            if filmType:
                NVM['filmType'] = filmType.extract()[0].split(':')[-1]

            # link to the movie's detail page
            filmHref = subSelector.xpath('div[@class="meta"]/div/a/@href')
            if filmHref:
                NVM['filmHref'] = 'http://www.yy6080.cn' + filmHref.extract()[0].strip()

            if nextURL:
                NVM['nextPage'] = nextURL

            # only follow movies for which all four list-page fields were found
            if filmName and filmRanking and filmType and filmHref:
                yield scrapy.Request(
                    url=NVM['filmHref'],
                    callback=self.parse_filminfo,
                    meta={'item': NVM, 'movieLen': movieLen, 'movieCount': movieCount},
                    dont_filter=True,
                )


    def parse_filminfo(self, response):

        NVM = response.meta['item']
        movieLen = response.meta['movieLen']
        movieCount = response.meta['movieCount']

        subSelectors_second = response.xpath('//tbody/tr')
        for subSelector_second in subSelectors_second:
            txt = ""
            txt2 = ""
            # leading actors: join the names into one comma-separated string
            filminfo_protagonist = subSelector_second.xpath('td[@id="casts"]/a/text()')
            if filminfo_protagonist:
                NVM['filminfo_protagonist'] = ','.join(
                    t.strip() for t in filminfo_protagonist.extract())

            # plot summary
            filminfo_content = subSelector_second.xpath('//div[@class="col-md-8"]/p[@class="summary"]/text()')
            if filminfo_content:
                NVM['filminfo_content'] = filminfo_content.extract()[0].strip()

            # txt: the row label (e.g. 導演/編劇); txt2: the row's value(s)
            column = subSelector_second.xpath('td[@class="span2"]/span/text()')
            if column:
                txt = column.extract()[0].strip()

            # values are usually links; fall back to plain text cells
            column2 = subSelector_second.xpath('td/a/text()')
            if column2:
                txt2 = column2.extract()
            else:
                column2 = subSelector_second.xpath('td/text()')
                if column2:
                    txt2 = column2.extract()

            if txt == "導演":        # director
                if txt2:
                    NVM['filminfo_director'] = ','.join(t.strip() for t in txt2)
                else:
                    NVM['filminfo_director'] = 'None'
            elif txt == '編劇':      # scriptwriter
                if txt2:
                    NVM['filminfo_scriptwriter'] = ','.join(t.strip() for t in txt2)
                else:
                    NVM['filminfo_scriptwriter'] = 'None'
            elif txt == '類型':      # genre
                NVM['filminfo_type'] = txt2[0] if txt2 else 'None'
            elif txt == '製片國家':  # country of production
                NVM['filminfo_country'] = txt2[0] if txt2 else 'None'
            elif txt == '上映時間':  # release date
                NVM['filminfo_releasetime'] = txt2[0] if txt2 else 'None'
            elif txt == '評分':      # rating
                NVM['filminfo_ranking'] = txt2[-1].split(':')[-1] if txt2 else 'None'
            elif txt == '語言':      # language
                NVM['filminfo_language'] = txt2[0] if txt2 else 'None'
        yield NVM

        # only the request carrying the last list-page index triggers pagination
        if movieLen == movieCount:
            nextPage = NVM.get('nextPage')
            if nextPage:
                print(nextPage)
                yield scrapy.Request(nextPage, self.parse, dont_filter=True)
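As a side note, on Scrapy 1.7+ the same hand-off between parse and parse_filminfo can use cb_kwargs instead of meta, which keeps the callback signature explicit; a sketch of the two spots that would change:

# in parse(): pass the item and counters as keyword arguments
yield scrapy.Request(
    url=NVM['filmHref'],
    callback=self.parse_filminfo,
    cb_kwargs={'NVM': NVM, 'movieLen': movieLen, 'movieCount': movieCount},
    dont_filter=True,
)

# parse_filminfo() then receives them as plain parameters
def parse_filminfo(self, response, NVM, movieLen, movieCount):
    ...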

Code of basedao.py under the dao directory:

# database access layer built on pymysql

import pymysql
import json
import logging

class BaseDao():

    def __init__(self, configPath='pymysql.json'):
        self.__connection = None
        self.__cursor = None
        # read the database connection settings from a JSON config file
        with open(configPath, 'r') as f:
            self.__config = json.load(f)
        print(self.__config)

    # get a database connection
    def getConnection(self):
        # reuse the existing connection object if there is one
        if self.__connection:
            return self.__connection
        # otherwise open a new connection
        try:
            self.__connection = pymysql.connect(**self.__config)
            return self.__connection
        except pymysql.MySQLError as e:
            print('Exception:', e)

    # generic method for executing a SQL statement
    def execute(self, sql, params):
        try:
            self.__cursor = self.getConnection().cursor()
            if params:
                result = self.__cursor.execute(sql, params)
            else:
                result = self.__cursor.execute(sql)
            return result
        except (pymysql.MySQLError, pymysql.DatabaseError, Exception) as e:
            print('Database exception: ' + str(e))
            self.rollback()

    def fetchall(self):
        if self.__cursor:
            return self.__cursor.fetchall()
    def commit(self):
        if self.__connection:
            self.__connection.commit()

    def rollback(self):
        if self.__connection:
            self.__connection.rollback()

    def close(self):
        if self.__cursor:
            self.__cursor.close()
        if self.__connection:
            self.__connection.close()
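A quick standalone sanity check of BaseDao (a sketch: it assumes pymysql.json sits in the current working directory and that nvm_collect_task already has rows):

from NewVideoMovie.dao.basedao import BaseDao

dao = BaseDao()
# run a read query through the generic execute() helper, then fetch the rows
dao.execute('select film_name, film_ranking from nvm_collect_task limit 5', None)
for row in dao.fetchall():
    print(row)
dao.close()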

Code of taskdao.py under the dao directory:

from .basedao import BaseDao

class TaskDao(BaseDao):

    def create(self, params):
        # insert one list-page record into the task table
        sql = 'insert into nvm_collect_task(film_name,film_ranking,film_type,film_href) ' \
              'values (%s,%s,%s,%s)'
        result = self.execute(sql, params)
        self.commit()
        self.close()
        return result

    def create_filminfo(self, params):
        # insert one detail-page record into the filminfo table
        sql = 'insert into nvm_collect_filminfo(filminfo_name,filminfo_director,' \
              'filminfo_scriptwriter,filminfo_protagonist,filminfo_type,filminfo_country,filminfo_language,' \
              'filminfo_releasetime,filminfo_ranking,filminfo_content) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
        result = self.execute(sql, params)
        self.commit()
        self.close()
        return result

Code of startspider.py, which launches the crawler:

# launcher script for the crawler
from scrapy.cmdline import execute
# start the spider; the name must match NewvideomovieSpider.name
execute(['scrapy', 'crawl', 'newvideomovie'])
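Equivalently, the spider can be started from the project root with the usual CLI command:

scrapy crawl newvideomovie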

The pymysql.json config file (the keys must match pymysql.connect() keyword arguments, since BaseDao unpacks the whole dict with **self.__config):

{"host": "127.0.0.1", "user": "root", "password": "wuxiulai", "database": "db_newvideomovie_data", "port": 3306, "charset": "utf8"}