本次爬取的網址爲:http://www.yy6080.cn/vodtypehtml/1.html(推薦使用谷歌瀏覽器,方便看源碼)
首先在啓動命令行,創建爬蟲項目:
scrapy startproject NewVideoMovie
然後:
cd NewVideoMovie
最後創建spider:
scrapy genspider spider yy6080.cn(genspider接收的是域名而不是完整網址)
創建完成後的結果:
dao文件下面的兩個py文件用來連接數據庫。
這樣基本的爬蟲框架就有了,讓我們繼續下一步:
建立數據庫db_newvideomovie_data,同時創建兩張表。(看表頭就可以了,列表裏的信息是爬取成功後的)
item.py代碼:
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class NewvideomovieItem(scrapy.Item):
    """Item carrying both list-page fields and detail-page (second-level) fields.

    One instance is filled partially in ``parse`` (list page) and completed in
    ``parse_filminfo`` (detail page) before being yielded.
    """
    # --- list page fields ---
    filmName = scrapy.Field()     # movie title
    filmRanking = scrapy.Field()  # movie rating
    filmType = scrapy.Field()     # movie genre
    filmHref = scrapy.Field()     # link to the movie detail page
    nextURL = scrapy.Field()      # link of the next list page
    nextPage = scrapy.Field()     # absolute URL of the next list page to crawl

    # --- detail (second-level) page fields ---
    # Fixed: was `filminfo_name = filmName`, which aliased the SAME Field
    # instance instead of declaring an independent field.
    filminfo_name = scrapy.Field()           # movie title (detail page)
    filminfo_director = scrapy.Field()       # director(s)
    filminfo_scriptwriter = scrapy.Field()   # scriptwriter(s)
    filminfo_protagonist = scrapy.Field()    # leading actors
    filminfo_type = scrapy.Field()           # genre (detail page)
    filminfo_country = scrapy.Field()        # production country
    filminfo_language = scrapy.Field()       # language
    filminfo_releasetime = scrapy.Field()    # release date
    filminfo_ranking = scrapy.Field()        # rating (detail page)
    filminfo_content = scrapy.Field()        # plot summary
pipelines.py代碼:
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
from .dao.taskdao import TaskDao
class NewvideomoviePipeline(object):
    """Persist each scraped list-page record into MySQL via TaskDao."""

    def process_item(self, item, spider):
        """Store the item's list-page fields and pass the item on.

        Fixed: Scrapy pipelines must return the item (or raise DropItem) so
        that lower-priority pipelines still receive it; the original returned
        None implicitly.
        """
        dao = TaskDao()
        dao.create((item['filmName'], item['filmRanking'],
                    item['filmType'], item['filmHref']))
        print('輸出管道數據')
        print(item['filmName'])
        print(item['filmRanking'])
        print(item['filmType'])
        print(item['filmHref'])
        return item
setting.py代碼:
# -*- coding: utf-8 -*-
# Scrapy settings for NewVideoMovie project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'NewVideoMovie'
SPIDER_MODULES = ['NewVideoMovie.spiders']
NEWSPIDER_MODULE = 'NewVideoMovie.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'NewVideoMovie (+http://www.yourdomain.com)'
# Obey robots.txt rules
# NOTE(review): robots.txt is deliberately ignored for this tutorial crawl.
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# Politeness delay: wait 2 seconds between requests to the same site.
DOWNLOAD_DELAY = 2
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
'NewVideoMovie.middlewares.NewvideomovieSpiderMiddleware': 543,
}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
'NewVideoMovie.middlewares.NewvideomovieDownloaderMiddleware': 543,
}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# Only the MySQL-backed pipeline is enabled (priority 300).
ITEM_PIPELINES = {
'NewVideoMovie.pipelines.NewvideomoviePipeline': 300,
#'NewVideoMovie.mysqlpipelines.NewvideomoviePipeline': 301,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# LOG_LEVEL = 'ERROR'
# LOG_FILE = 'jobspider.log'
文件夾spiders下的newvideomovie.py代碼:
# -*- coding: utf-8 -*-
import scrapy
from Mypro.Aug.day23.NewVideoMovie.NewVideoMovie.items import NewvideomovieItem
class NewvideomovieSpider(scrapy.Spider):
name = 'newvideomovie'
# allowed_domains = ['http://www.yy6080.cn/vodtypehtml/1.html']
start_urls = ['http://www.yy6080.cn/vodtypehtml/1.html']
def parse(self,response):
subSelectors = response.xpath('//div[@class="movie-item"]')
# 遍歷選擇器列表
itemList = []
movieLen = len(subSelectors)
movieCount = 0
nextPage = ""
for subSelector in subSelectors:
movieCount += 1
NVM = NewvideomovieItem()
# 電影名稱
filmName = subSelector.xpath('div[@class="meta"]/div/a/text()')
if filmName:
NVM['filmName'] = filmName.extract()[0].strip()
# 電影評分
filmRanking = subSelector.xpath('div[@class="meta"]/div/span/text()')
if filmRanking:
NVM['filmRanking'] = filmRanking.extract()[0].strip('分').split()[0]
# 電影類型
filmType = subSelector.xpath('div[@class="meta"]/div[@class="otherinfo"]/text()')
if filmType:
NVM['filmType'] = filmType.extract()[0].split(':')[-1]
## 電影鏈接
filmHref = subSelector.xpath('div[@class="meta"]/div/a/@href')
if filmHref:
NVM['filmHref'] = 'http://www.yy6080.cn'+filmHref.extract()[0].strip()
nextPage= response.xpath('//a[@class="pagelink_a"]/@href').extract()
nextText = response.xpath('//a[@class="pagelink_a"]/text()').extract()
if nextText[-2] == '下一頁':
url = 'http://www.yy6080.cn' + nextPage[-2]
NVM['nextPage'] = url
if filmName and filmRanking and filmType and filmHref:
itemList.append(subSelectors)
yield scrapy.Request(url=NVM['filmHref'], callback=self.parse_filminfo, meta={'item': NVM, 'movieLen':movieLen, 'movieCount':movieCount},dont_filter=True)
def parse_filminfo(self, response):
NVM = response.meta['item']
movieLen = response.meta['movieLen']
movieCount = response.meta['movieCount']
subSelectors_second = response.xpath('//tbody/tr')
for subSelector_second in subSelectors_second:
txt = ""
txt2 = ""
# 電影主演
filminfo_protagonist = subSelector_second.xpath('td[@id="casts"]/a/text()')
if filminfo_protagonist:
s = str(filminfo_protagonist.extract())
NVM['filminfo_protagonist'] = s[1:-1]
# 劇情介紹
filminfo_content = subSelector_second.xpath('//div[@class="col-md-8"]/p[@class="summary"]/text()')
if filminfo_content:
NVM['filminfo_content'] = filminfo_content.extract()[0].strip()
column = subSelector_second.xpath('td[@class="span2"]/span/text()')
if column:
txt = column.extract()[0].strip()
column2 = subSelector_second.xpath('td/a/text()')
if column2:
txt2 = column2.extract()
else:
column2 = subSelector_second.xpath('td/text()')
if column2:
txt2 = column2.extract()
if txt == "導演":
if txt2:
str1 = ''
for temp in txt2:
str1 =temp.strip()+','+str1
NVM['filminfo_director'] = str1
else:
NVM['filminfo_director'] = 'None'
elif txt =='編劇':
if txt2:
str1 = ''
for temp in txt2:
str1 =temp.strip()+','+str1
NVM['filminfo_scriptwriter'] = str1
else:
NVM['filminfo_scriptwriter'] = 'None'
elif txt == '類型':
if txt2:
NVM['filminfo_type']= txt2[0]
else:
NVM['filminfo_type'] = 'None'
elif txt == '製片國家':
if txt2:
NVM['filminfo_country'] = txt2[0]
else:
NVM['filminfo_country'] = 'None'
elif txt == '上映時間':
if txt2:
NVM['filminfo_releasetime'] = txt2[0]
else:
NVM['filminfo_releasetime']= 'None'
elif txt == '評分':
if txt2:
NVM['filminfo_ranking'] = txt2[-1].split(':')[-1]
else:
NVM['filminfo_ranking'] = 'None'
elif txt == '語言':
if txt2:
NVM['filminfo_language'] = txt2[0]
else:
NVM['filminfo_language'] = 'None'
yield NVM
if movieLen == movieCount:
if NVM['nextPage']:
print(NVM['nextPage'])
yield scrapy.Request(NVM['nextPage'], self.parse, dont_filter=True)
文件dao下面的basedao.py代碼:
#引入pymsql
import pymysql
import json
import logging
class BaseDao():
    """Thin pymysql wrapper: lazy connection, generic execute, commit/rollback.

    Connection parameters are read from a JSON config file
    (host/user/password/database/port/charset — passed straight to
    ``pymysql.connect``).
    """

    def __init__(self, configPath='pymysql.json'):
        self.__connection = None
        self.__cursor = None
        # Load DB connection settings from the JSON config file.
        # Fixed: use a context manager — the original `json.load(open(...))`
        # leaked the file handle.
        with open(configPath, 'r') as f:
            self.__config = json.load(f)
        print(self.__config)

    def getConnection(self):
        """Return the cached connection, creating it on first use."""
        if self.__connection:
            return self.__connection
        try:
            self.__connection = pymysql.connect(**self.__config)
            return self.__connection
        except pymysql.MySQLError as e:
            print('Exception:', e)

    def execute(self, sql, params):
        """Run *sql* with optional *params*; roll back on any DB error.

        Returns the cursor's affected-row count, or None on failure.
        """
        try:
            self.__cursor = self.getConnection().cursor()
            if params:
                result = self.__cursor.execute(sql, params)
            else:
                result = self.__cursor.execute(sql)
            return result
        except (pymysql.MySQLError, pymysql.DatabaseError, Exception) as e:
            print('出現數據庫異常:' + str(e))
            self.rollback()

    def fetchall(self):
        """Fetch all rows of the last query, if a cursor exists."""
        if self.__cursor:
            return self.__cursor.fetchall()

    def commit(self):
        if self.__connection:
            self.__connection.commit()

    def rollback(self):
        if self.__connection:
            self.__connection.rollback()

    def close(self):
        """Close cursor and connection.

        Fixed: references are cleared so that a later ``getConnection()`` /
        ``fetchall()`` does not hand back a closed object.
        """
        if self.__cursor:
            self.__cursor.close()
            self.__cursor = None
        if self.__connection:
            self.__connection.close()
            self.__connection = None
文件dao下面的taskdao.py代碼:
from .basedao import BaseDao
class TaskDao(BaseDao):
    """DAO for the two collect tables; each call does one insert, commits,
    and closes the connection."""

    def create(self, params):
        """Insert one list-page record into nvm_collect_task."""
        insert_sql = ('insert into nvm_collect_task(film_name,film_ranking,'
                      'film_type,film_href) values (%s,%s,%s,%s)')
        affected = self.execute(insert_sql, params)
        self.commit()
        self.close()
        return affected

    def create_filminfo(self, params):
        """Insert one detail-page record into nvm_collect_filminfo."""
        insert_sql = ('insert into nvm_collect_filminfo(filminfo_name,filminfo_director,'
                      'filminfo_scriptwriter,filminfo_protagonist,filminfo_type,filminfo_country,filminfo_language,'
                      'filminfo_releasetime,filminfo_ranking,filminfo_content) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
        affected = self.execute(insert_sql, params)
        self.commit()
        self.close()
        return affected
啓動爬蟲startspider.py代碼:
#腳本是爬蟲啓動腳本
from scrapy.cmdline import execute
#啓動爬蟲
execute(['scarpy','crawl','newvideomovie'])
.json文件:
{"host":"127.0.0.1","user":"root","password" :"wuxiulai", "database" :"db_newvideomovie_data","port":3306,"charset":"utf8"}