Scraping the Guangxi Air Classroom (廣西空中課堂) Grade 5 lesson videos daily (tools: scrapy, selenium, re, BeautifulSoup)

I've been stuck at home these past few days for reasons beyond my control, and my younger sister has to attend class at home. We don't have a Guangxi Radio & TV set-top box, so the only option was to download the lesson videos and play them on the TV. Having picked up a bit of web scraping recently, this was a good chance to practice (I checked: the site has no robots.txt restrictions).

Site link: Guangxi Air Classroom (http://zt.gxtv.cn/zt/default.html)
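
If you want to double-check the robots claim yourself, here is a minimal sketch using the standard library's urllib.robotparser (a missing or empty robots.txt yields True, i.e. no restriction):

from urllib.robotparser import RobotFileParser

rp = RobotFileParser('http://zt.gxtv.cn/robots.txt')
rp.read()
print(rp.can_fetch('*', 'http://zt.gxtv.cn/zt/default.html'))  # True means fetching is allowed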

# -*- coding: utf-8 -*-
import scrapy
from bs4 import BeautifulSoup
import re
import datetime
from selenium import webdriver
import os
import time


class MycoursespiderSpider(scrapy.Spider):
    name = 'mycoursespider'
    global mydict                          # title -> URL map shared across callbacks (module-level global, as in the original)
    mydict = {}
    start_urls = ['http://zt.gxtv.cn/zt/default.html']

    def parse(self, response):
        curr_time = datetime.datetime.now()
        global today
        today = str(curr_time.month) + '月' + str(curr_time.day) + '日'   # e.g. '3月5日', matches the titles on the site
        global mypath
        mypath = os.path.dirname(os.path.realpath(__file__)) + '/' + today
        if not os.path.exists(mypath):                                    # one folder per day
            os.mkdir(mypath)
        mypath = mypath + '/'
        domain = 'http://zt.gxtv.cn'
        ctable = response.css('a#ctable::attr(href)').extract()[0]              # link to the course timetable
        yield scrapy.Request(ctable, callback=self.parsecoursetable, meta={'url': ctable})
        g5 = response.css('ul#g5 a[target=_blank]').extract()   # Grade 5 column entries
        g4 = response.css('ul#g4 a[target=_blank]').extract()   # Grade 4 column entries
        g5 = ''.join(g5)
        g4 = ''.join(g4)
        soup = BeautifulSoup(g5, 'html.parser')
        ensoup = BeautifulSoup(g4, 'html.parser')
        for i in ensoup.find_all('a'):
            if re.search(today + '-英語', i['title']) is not None:      # is there an English lesson today? rural Grade 5 follows the Grade 4 English class
                mydict.update({i['title']: domain + i['href']})
        for i in soup.find_all('a'):                                      # today's Grade 5 updates
            if re.search(today, i['title']) is not None:
                if re.search('-英語', i['title']) is None:                # skip Grade 5 English (covered above)
                    mydict.update({i['title']: domain + i['href']})       # queue for parsing
        for key in mydict:
            page = mydict[key]
            yield scrapy.Request(page, callback=self.parseinside)

    def parseinside(self, response):
        curr_time = datetime.datetime.now()                                 # current time
        filename = str(curr_time.month) + '-' + str(curr_time.day) + '.txt'
        playhost = 'https://videocdn.liangtv.cn.*mp4|http://video.cdn.liangtv.cn.*mp4'                       # pattern for the video URL
        resp = response.text
        print(resp)
        title = response.css('h3#title::text').extract_first()
        print(title)
        playlink = re.search(playhost, resp)
        if playlink is not None:
            video = str(playlink.group(0))
            mydict[title] = video
        else:
            return
        with open(mypath + filename, 'w+') as f:                            # dump every title:url pair collected so far
            for key in mydict:
                f.write(str(key).replace(u'\xa0', u' ') + ':' + str(mydict[key]).replace(u'\xa0', u' '))
                f.write('\n')
        yield scrapy.Request(video, self.parsevideo, meta={'title': title}) # comment this line out to skip downloading; meta passes data between callbacks

    def parsevideo(self, response):                                         # save the video
        print(response.request.headers['User-Agent'])
        title = response.meta['title'] + '.mp4'
        # title = title.translate(None, r'|\\?*<\":>+[]/')                  # Py2-style strip of illegal filename chars, left disabled
        with open(mypath + title, 'wb') as f:
            f.write(response.body)

    def parsecoursetable(self, response):         # switched to selenium: a bit slower, but it gets the job done
        url = response.meta['url']
        browser = webdriver.Firefox(
            executable_path=r'C:\Users\qq\AppData\Local\Programs\Python\Python38\geckodriver.exe')
        browser.get(url)
        time.sleep(3)                             # give the JS-rendered timetable time to load
        soup = BeautifulSoup(browser.page_source, 'html.parser')
        i = soup.find(id='news_con')
        print(str(i))
        piclink = re.findall('http://.*?jpg|https://.*?jpg', str(i))        # timetable image URLs
        browser.quit()
        for idx, link in enumerate(piclink):      # findall returns a (possibly empty) list, never None
            yield scrapy.Request(link, callback=self.parsepicture, meta={'title': str(idx)})

    def parsepicture(self, response):             # save a timetable image
        title = today + '_' + response.meta['title'] + '.jpg'
        with open(mypath + title, 'wb') as f:
            f.write(response.body)
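
For reference, the spider runs with the usual scrapy crawl mycoursespider inside a Scrapy project; it can also be driven as a plain script. A minimal sketch with Scrapy's CrawlerProcess (the module name mycoursespider.py and the USER_AGENT value are assumptions, not from the original setup):

# Minimal sketch: run the spider without a full Scrapy project.
# The module name 'mycoursespider' and the USER_AGENT string are assumptions.
from scrapy.crawler import CrawlerProcess
from mycoursespider import MycoursespiderSpider

process = CrawlerProcess(settings={'USER_AGENT': 'Mozilla/5.0'})
process.crawl(MycoursespiderSpider)
process.start()  # blocks until the crawl finishes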



Gave up on Splash and switched to Selenium; much simpler.
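
If the Firefox window popping up on every run is a bother, the same fetch also works headless. A minimal sketch (the geckodriver path below is a placeholder for wherever yours lives):

# Minimal sketch: the parsecoursetable fetch, but with no visible window.
from selenium import webdriver
from selenium.webdriver.firefox.options import Options

opts = Options()
opts.add_argument('-headless')   # Firefox's headless flag
browser = webdriver.Firefox(options=opts,
                            executable_path=r'C:\path\to\geckodriver.exe')  # placeholder path
browser.get('http://zt.gxtv.cn/zt/default.html')
html = browser.page_source       # the same HTML parsecoursetable hands to BeautifulSoup
browser.quit()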

A note: when writing the file, gbk (the default locale encoding on Chinese Windows) cannot encode \xa0, the page's non-breaking-space character, hence the str.replace(u'\xa0', u' ') calls.
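
To see the failure in isolation, and an alternative that avoids the replace altogether (passing encoding='utf-8' to open()):

# gbk has no mapping for U+00A0 (the non-breaking space in the page titles).
try:
    '\xa0'.encode('gbk')
except UnicodeEncodeError as e:
    print(e)          # 'gbk' codec can't encode character '\xa0' ...

# Writing the file as UTF-8 sidesteps the issue entirely:
with open('out.txt', 'w', encoding='utf-8') as f:
    f.write('no\xa0replace\xa0needed')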
