前些天發現了英孚的英語學習視頻,今天看的時候突然想到:何不用 Python 爬蟲把這些視頻全部爬下來,這樣學習起來更方便。
找規律 → 調試 → 成功。
英孚在線網站:http://center.ef.com.cn/blog/lesson
以下附源碼。
import requests,os
def mkdirs(path):
    """Create the directory ``path`` together with any missing parents.

    Calling this for a directory that already exists is a no-op.

    :param path: directory path to create
    :return: None
    :raises FileExistsError: if ``path`` exists but is not a directory
    """
    # exist_ok=True replaces the separate os.path.exists() check, which could
    # race with another process creating the directory between the check and
    # the creation.
    os.makedirs(path, exist_ok=True)
def download(id, name, base_dir=r"d:\ef", suffix=".mp3", timeout=30):
    """Download one lesson video and store it below ``base_dir``.

    ``name`` must have the form ``<level>_<course>_<file name>``. The first
    two parts become nested sub-directories of ``base_dir``; the third part,
    with ``suffix`` appended, becomes the file name.

    :param id: id of the lesson video to download
    :param name: underscore-separated name mirroring the site's hierarchy
    :param base_dir: root directory for all downloads
    :param suffix: extension appended to the stored file name
    :param timeout: seconds to wait for the server before giving up
    :return: None
    :raises ValueError: if ``name`` has fewer than three ``_``-separated parts
    :raises requests.RequestException: on connection errors, timeouts or a
        non-success HTTP status
    """
    # Split at most twice so underscores inside the title stay in the file
    # name instead of silently truncating it.
    parts = name.split("_", 2)
    if len(parts) != 3:
        raise ValueError(
            f"name must look like '<level>_<course>_<file>', got {name!r}"
        )
    level, course, file_name = parts

    # NOTE(review): the payload comes from a ".../default.mp4" URL but is
    # stored with an ".mp3" suffix by default. The default is kept for
    # backward compatibility; pass suffix=".mp4" if that was unintended.
    url = f"https://cns.ef-cdn.com/_vids/dailylesson/lesson/{id}/default.mp4"

    # stream=True avoids holding the whole video in memory at once.
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Fail before touching the disk so an HTTP error page is never
        # written out as if it were a media file.
        response.raise_for_status()

        target_dir = os.path.join(base_dir, level, course)
        mkdirs(target_dir)
        target = os.path.join(target_dir, file_name + suffix)
        with open(target, "wb") as f:
            for chunk in response.iter_content(chunk_size=64 * 1024):
                f.write(chunk)
def getTitle(id, timeout=30):
    """Fetch the title of a lesson from the lesson-handler endpoint.

    :param id: lesson id, sent as the ``lesson_id`` query parameter
    :param timeout: seconds to wait for the server before giving up
    :return: the ``LessonNameWithPerfix`` value found under ``Lesson`` in the
        JSON response
    :raises requests.RequestException: on connection errors, timeouts or a
        non-success HTTP status
    :raises KeyError: if the response JSON lacks the expected keys
    """
    url = "https://www.englishtown.cn/community/dailylesson/lessonhandler.ashx"
    # Passing the query as ``params`` lets requests URL-encode the values and
    # also accepts a non-string id, which string concatenation would reject.
    params = {
        "operate": "getlessonbyid",
        "v": "4",
        "lesson_id": id,
        "transculturecode": "zh-cn",
    }
    response = requests.get(url, params=params, timeout=timeout)
    response.raise_for_status()
    # The key spelling "LessonNameWithPerfix" is kept exactly as the original
    # code reads it.
    return response.json()["Lesson"]["LessonNameWithPerfix"]
def getAllUrl(timeout=30):
    """Walk the course catalogue and download every lesson it lists.

    For each lesson the title is fetched with ``getTitle``, cleaned up, and
    combined into ``<level>_<course number>_<lesson id>-<title>`` (lower-cased),
    which is printed and then handed to ``download``.

    :param timeout: seconds to wait for the catalogue request
    :return: None
    :raises requests.RequestException: if the catalogue itself cannot be
        fetched; failures for an individual lesson are reported and skipped
    """
    url = "https://www.englishtown.cn/community/dailylesson/widget.ashx?op=getcourses"
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # One-pass equivalent of the former chained replace() calls: drop "?",
    # "." and tabs, and turn single and double quotes into back-ticks.
    title_cleanup = str.maketrans(
        {"?": None, ".": None, "\t": None, "'": "`", '"': "`"}
    )

    for item in response.json()["items"]:
        levelname = item["levelname"]
        # Courses are numbered from 1 within each level.
        for course_no, course in enumerate(item["courses"], start=1):
            for course_item in course["course_item"]:
                lesson_id = course_item["lesson_id"]
                try:
                    title = getTitle(lesson_id).translate(title_cleanup)
                    name = f"{levelname}_{course_no}_{lesson_id}-{title}".lower()
                    print(name)
                    download(lesson_id, name)
                except requests.RequestException as exc:
                    # One unreachable lesson should not abort the whole crawl.
                    print(f"skipped lesson {lesson_id}: {exc}")
# Script entry point: start the full crawl only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    getAllUrl()
成果如下: