這幾天由於特殊原因,閒在家中無事可幹。恰逢老妹要在家上網課,而家裏沒有廣西廣電機頂盒,所以只能把課程視頻從網上下載下來,再放到電視上看。前段時間又學了點爬蟲,正好拿來練練手(已查閱,該網站無 robots 協議限制)。
網站鏈接:廣西空中課堂(http://zt.gxtv.cn/zt/default.html)
# -*- coding: utf-8 -*-
import scrapy
from bs4 import BeautifulSoup
import re
import datetime
from selenium import webdriver
import os
import time
class MycoursespiderSpider(scrapy.Spider):
    """Download today's lessons from the Guangxi "air classroom" site.

    Starting from the index page, the spider:

    * follows the course-table link and saves every ``.jpg`` found in the
      rendered ``#news_con`` element (rendered with Selenium/Firefox);
    * collects today's grade-5 lessons (English excluded) plus today's
      grade-4 English lesson, opens each lesson page, extracts the ``.mp4``
      URL, records ``title:url`` pairs in a text file and downloads the
      video.

    Everything is written to a directory named after today's date, located
    next to this source file.
    """

    name = 'mycoursespider'
    # Module-level registry shared by every callback. It first maps lesson
    # title -> lesson page URL (filled in ``parse``) and later also
    # page title -> video URL (filled in ``parseinside``).
    # NOTE(review): ``mydict``/``today``/``mypath`` are module globals; they
    # are kept for backward compatibility, instance attributes would be
    # cleaner.
    global mydict
    mydict = {}
    start_urls = ['http://zt.gxtv.cn/zt/default.html']
    # Path to the geckodriver binary used by ``parsecoursetable``. Override
    # on a subclass/instance when running on another machine.
    geckodriver_path = r'C:\Users\qq\AppData\Local\Programs\Python\Python38\geckodriver.exe'

    @staticmethod
    def _lesson_links(response, list_id):
        """Yield ``(title, absolute_url)`` for each link in ``ul#<list_id>``.

        Anchors lacking a ``title`` or ``href`` attribute are skipped.
        """
        for anchor in response.css('ul#%s a[target=_blank]' % list_id):
            title = anchor.xpath('@title').extract_first()
            href = anchor.xpath('@href').extract_first()
            if title and href:
                # urljoin copes with relative, root-relative and absolute
                # hrefs alike.
                yield title, response.urljoin(href)

    @staticmethod
    def _safe_filename(text):
        """Return *text* with characters that are illegal in file names replaced by ``_``."""
        return re.sub(r'[\\/:*?"<>|\r\n\t]', '_', str(text))

    def parse(self, response):
        """Handle the index page.

        Creates today's output directory, schedules the course-table page
        and schedules one request per lesson page of interest.
        """
        curr_time = datetime.datetime.now()
        global today
        today = str(curr_time.month) + '月' + str(curr_time.day) + '日'
        global mypath
        mypath = os.path.dirname(os.path.realpath(__file__)) + '/' + today
        os.makedirs(mypath, exist_ok=True)
        mypath = mypath + '/'

        # Course table (timetable) link.
        ctable = response.css('a#ctable::attr(href)').extract_first()
        if ctable:
            ctable = response.urljoin(ctable)
            yield scrapy.Request(ctable, callback=self.parsecoursetable, meta={'url': ctable})
        else:
            self.logger.warning('No course-table link found on %s', response.url)

        # NOTE(review): the title markers below are Traditional Chinese
        # literals; confirm they match the titles actually served by the site.
        english_marker = '-英語'

        # Grade 4 column: only today's English lesson is wanted (grade-5
        # pupils in the countryside follow the grade-4 English course).
        for title, url in self._lesson_links(response, 'g4'):
            if today + english_marker in title:
                mydict[title] = url

        # Grade 5 column: everything published today except English.
        for title, url in self._lesson_links(response, 'g5'):
            if today in title and english_marker not in title:
                mydict[title] = url

        # Iterate over a snapshot: ``parseinside`` inserts into ``mydict``
        # and may run while this generator is still being consumed.
        for page in list(mydict.values()):
            yield scrapy.Request(page, callback=self.parseinside)

    def parseinside(self, response):
        """Handle one lesson page.

        Extracts the video URL, rewrites the ``<month>-<day>.txt`` index
        file with the current content of ``mydict`` and schedules the video
        download. Pages without a matching video URL are ignored.
        """
        curr_time = datetime.datetime.now()
        filename = str(curr_time.month) + '-' + str(curr_time.day) + '.txt'
        # Pattern locating the video link inside the page source.
        playhost = 'https://videocdn.liangtv.cn.*mp4|http://video.cdn.liangtv.cn.*mp4'

        title = response.css('h3#title::text').extract_first()
        self.logger.debug('Lesson page %s has title %r', response.url, title)

        playlink = re.search(playhost, response.text)
        if playlink is None:
            return
        video = str(playlink.group(0))

        if not title:
            # No <h3 id="title">: fall back to the video's base name so the
            # download can still be saved under a usable file name.
            title = os.path.splitext(video.rsplit('/', 1)[-1])[0] or 'untitled'
        mydict[title] = video

        # Explicit UTF-8 so the write does not depend on the locale codec.
        # Non-breaking spaces are still normalised to plain spaces.
        with open(mypath + filename, 'w', encoding='utf-8') as f:
            for key, value in mydict.items():
                f.write(str(key).replace(u'\xa0', u' ') + ':' + str(value).replace(u'\xa0', u' '))
                f.write('\n')

        # Comment this out to skip downloading the video; ``meta`` carries
        # the title over to the next callback.
        yield scrapy.Request(video, self.parsevideo, meta={'title': title})

    def parsevideo(self, response):
        """Save the downloaded video body as ``<title>.mp4`` in today's directory."""
        self.logger.debug('Video fetched with User-Agent %s',
                          response.request.headers.get('User-Agent'))
        title = self._safe_filename(response.meta['title']) + '.mp4'
        with open(mypath + title, 'wb') as f:
            f.write(response.body)

    def parsecoursetable(self, response):
        """Render the course-table page in Firefox and schedule its images.

        Selenium is used instead of the Scrapy response body (slower, but it
        gets the job done). Every ``.jpg`` URL found inside the element with
        id ``news_con`` is requested; the image index is passed along as the
        title.
        """
        url = response.meta['url']
        browser = webdriver.Firefox(executable_path=self.geckodriver_path)
        try:
            browser.get(url)
            time.sleep(3)  # give the page time to finish rendering
            page_source = browser.page_source
        finally:
            # Always release the browser process, even if loading failed.
            browser.quit()

        container = BeautifulSoup(page_source, 'html.parser').find(id='news_con')
        self.logger.debug('Course table container: %s', container)
        piclink = re.findall('http://.*?jpg|https://.*?jpg', str(container))
        for index, link in enumerate(piclink):
            yield scrapy.Request(link, callback=self.parsepicture, meta={'title': str(index)})

    def parsepicture(self, response):
        """Save one course-table image as ``<today>_<index>.jpg`` in today's directory."""
        title = today + '_' + response.meta['title'] + '.jpg'
        with open(mypath + title, 'wb') as f:
            f.write(response.body)
放棄splash,使用selenium,簡單多了
小記:在中文 Windows 上用 open() 寫文件時,默認編碼是 gbk,無法處理 \xa0(即網頁中的不換行空格 &nbsp;),所以需要 str.replace(u'\xa0', u' ');也可以在 open() 時直接指定 encoding='utf-8' 來避免這個問題。