def getCourseUrlList(zhtml, timeout=1):
    """Collect the lesson-page URLs of the course that are actually reachable.

    Every ``knowledgeId`` linked from the given HTML is turned into a lesson
    URL and requested once. A URL is kept only when the server answers with
    status 200 and the response body contains the course id.

    Reads the module-level names ``courseId`` and ``mHeaders``.

    Args:
        zhtml: HTML text that contains the ``courseId=...&knowledgeId=...``
            links.
        timeout: Seconds to wait for each lesson page. Defaults to 1, the
            value that was previously hard-coded.

    Returns:
        The lesson URLs that passed the check, in the order they were found.
    """
    # courseId is literal text, not a pattern, so escape it before embedding
    # it into the regular expression.
    knowledgeRule = 'courseId=' + re.escape(courseId) + '&knowledgeId=(.*?)">'
    urlList = []
    for knowledgeId in re.findall(knowledgeRule, zhtml):
        mUrl = ('https://mooc1.chaoxing.com/nodedetailcontroller/visitnodedetail'
                '?courseId=' + courseId + '&knowledgeId=' + knowledgeId)
        print(mUrl)
        try:
            response = requests.get(mUrl, headers=mHeaders, timeout=timeout)
        except requests.RequestException:
            # Best effort: one unreachable lesson must not stop the scan.
            print('訪問失敗')
            continue
        if response.status_code != 200:
            # Non-200 answers are skipped without a message, as before.
            continue
        if courseId in response.text:
            urlList.append(mUrl)
            print('訪問成功')
        else:
            print('非課程網頁')
    return urlList
得到workId並和courseId、knowledgeId組成小測驗的url。
def getZuoYeUrl(urlList, timeout=10):
    """Build the quiz URLs for the given lesson pages.

    Each lesson page is downloaded and searched for ``workid":"..."``
    entries; every work id found is combined with the module-level
    ``courseId`` into a ``selectWorkQuestion`` URL.

    Reads the module-level names ``courseId`` and ``mHeaders``.

    Args:
        urlList: Lesson-page URLs to scan.
        timeout: Seconds to wait for each page before giving up.

    Returns:
        The quiz URLs in discovery order (empty when nothing was found).

    Raises:
        requests.RequestException: If a page cannot be fetched in time.
        UnicodeDecodeError: If a page body is not valid UTF-8.
    """
    tUrlList = []
    for lessonUrl in urlList:
        # Without a timeout a stalled connection would block the crawl forever.
        reply = requests.get(lessonUrl, headers=mHeaders, timeout=timeout)
        pageText = reply.content.decode('utf-8')
        for workId in re.findall('workid":"(.*?)",', pageText):
            tUrlList.append(
                "https://mooc1.chaoxing.com/api/selectWorkQuestion?workId=" + workId
                + "&ut=null&classId=0&courseId=" + courseId)
    return tUrlList
訪問小測驗的地址並爬取標題、題幹、選項寫入Word文檔。
def _stripTags(fragment):
    """Return *fragment* with every ``<...>`` HTML tag removed."""
    return re.sub('<.*?>', '', str(fragment))


def _safeFileName(title):
    """Replace characters that Windows forbids in file names with ``_``."""
    return re.sub(r'[\\/:*?"<>|]', '_', title)


def _questionText(questionHtml):
    """Return the question-type tag(s) followed by the stem as one string."""
    typeTags = re.findall('(【.*?】)', questionHtml)
    # The stem is searched for in three layouts, most specific first.
    stems = re.findall('】<?p?>?(.*?)</p>', questionHtml)
    if not stems:
        stems = re.findall('<div class="Zy_TItle_p">(.*?)</div>', questionHtml)
    if not stems:
        stems = re.findall('】(.*?)</div>', questionHtml)
    return "".join(_stripTags(piece) for piece in typeTags + stems)


def _optionTexts(questionHtml):
    """Return the answer options of one question (label + text), in order."""
    optionRule = '<i class="fl">(.*?)</i>.*?none;"><?p?>?(.*?)<?/?p?>?</a></li>'
    texts = []
    for optionNode in etree.HTML(questionHtml).xpath('//li[@class="clearfix"]'):
        optionHtml = html.unescape(etree.tostring(optionNode).decode('utf-8'))
        for parts in re.findall(optionRule, optionHtml):
            texts.append("".join(_stripTags(part) for part in parts))
    return texts


def writeDocx(urlList, pathPrefix="D:\\", timeout=10):
    """Download each quiz page and save its questions as a Word document.

    For every URL one ``.docx`` file is written. It contains a heading for
    each ``<h3>`` title on the page, then for every ``div.TiMu`` question
    block one paragraph with the question type and stem, followed by one
    paragraph per answer option. The file is named after the last ``<h3>``
    title found on the page.

    Reads the module-level name ``mHeaders``.

    Args:
        urlList: Quiz URLs to download.
        pathPrefix: Text prepended verbatim to the file name, so it must end
            with a path separator. Defaults to ``"D:\\\\"``, the location
            that was previously hard-coded.
        timeout: Seconds to wait for each quiz page before giving up.

    Returns:
        None.

    Raises:
        requests.RequestException: If a quiz page cannot be fetched in time.
        UnicodeDecodeError: If a page body is not valid UTF-8.
    """
    for url in urlList:
        reply = requests.get(url, headers=mHeaders, timeout=timeout)
        pageText = reply.content.decode("utf-8")
        document = docx.Document()

        Title = ""
        for rawTitle in re.findall('<h3>(.*?)</h3>', pageText):
            Title = html.unescape(rawTitle)
            document.add_heading(Title)

        pageTree = etree.HTML(html.unescape(pageText))
        for questionNode in pageTree.xpath('//div[@class="TiMu"]'):
            # NOTE(review): no request is made per question, so this pause
            # looks unnecessary; kept to preserve the original pacing.
            time.sleep(0.05)
            # Serialising the node re-escapes characters, so unescape again.
            questionHtml = html.unescape(etree.tostring(questionNode).decode('utf-8'))
            # add_paragraph takes text, not a list: join type tag and stem.
            document.add_paragraph(_questionText(questionHtml))
            for optionText in _optionTexts(questionHtml):
                document.add_paragraph(optionText)

        # Titles may contain characters such as ? or : that cannot appear in
        # a Windows file name.
        document.save(pathPrefix + _safeFileName(Title) + ".docx")
        print(Title + '爬取完成')
        time.sleep(0.3)  # short pause between two quiz downloads
其餘代碼
全局變量及導庫(放在開頭)
# coding=utf-8
import html
import re
import time

import docx
import requests
from lxml import etree
# Course home page address.
# NOTE(review): ``{{courseId}}`` is presumably a slot to be replaced with the
# real course id before use — confirm against the code that consumes ``url``.
url = "https://mooc1.chaoxing.com/course/{{courseId}}.html"

# Browser-like User-Agent header passed as ``headers=`` to the requests.get
# calls in this file.
mHeaders = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/45.0.2454.85 Safari/537.36 115Browser/6.0.3'
    ),
}