最近突然想爬取學校的課程表,經過一番努力,終於做出了一個小demo。話不多說,馬上爲大家講解,先放上代碼:
import re
import requests
from fake_useragent import UserAgent
from pyquery import PyQuery as pq
from school_api.check_code import CHECK_CODE
class GDSchool(object):
    """Log in to the Zhengfang academic-affairs site and fetch the timetable page.

    The flow implemented by :meth:`main` is: open a session, read the hidden
    ASP.NET form fields from the login page, download and recognise the
    captcha, post the login form, parse the menu that the landing page
    returns, and finally request the personal timetable page.

    Attributes set by ``__init__``: ``ua``, ``headers``, ``number``,
    ``password``.  Attributes set by ``main``: ``rssions`` (the requests
    session), ``menu``, ``name``, ``nameCode``.
    """

    BASE_URL = 'http://61.142.209.20:9090'
    HOST = '61.142.209.20:9090'
    # Seconds to wait for the server before giving up instead of hanging.
    TIMEOUT = 10
    # Role selector posted as RadioButtonList1.  It is kept as raw bytes so
    # that requests percent-encodes it exactly once (-> %D1%A7%C9%FA).  A str
    # that already contains '%D1...' would have its '%' escaped again.
    STUDENT_ROLE = b'\xd1\xa7\xc9\xfa'

    def __init__(self, number=None, password=None):
        """Create the scraper.

        Args:
            number: student number; prompted for on stdin when omitted.
            password: account password; prompted for on stdin when omitted.
        """
        self.ua = UserAgent()  # source of a random browser User-Agent string
        self.headers = self._request_headers()
        # Credentials must not live in the source file; ask for them unless
        # the caller supplied them explicitly.
        self.number = input('請輸入學號:') if number is None else number
        self.password = input('請輸入密碼:') if password is None else password

    def _request_headers(self, referer=None):
        """Build the header dict shared by the login and timetable requests."""
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Host': self.HOST,
            'Accept-Encoding': 'gzip, deflate',
            # The header name is 'User-Agent'; 'UserAgent' is just an unknown
            # custom header and leaves the default requests UA in place.
            'User-Agent': self.ua.random,
        }
        if referer is not None:
            headers['referer'] = referer
        return headers

    def responseMenu(self, data):
        """Parse the menu of the page returned after login.

        Args:
            data: response object whose ``.text`` is the landing-page HTML.

        Returns:
            dict mapping each menu entry's ``.top_link`` text to a one-element
            list holding a ``{link text: href}`` dict of its sub-links.
        """
        html = pq(data.text)
        mainItems = {}
        for subItems in html('#headDiv > ul li').items():
            # Build the link map once per menu entry so an entry without
            # sub-links yields [{}] rather than reusing a previous entry's
            # list (or hitting an unbound local on the first entry).
            links = {
                anchor.text(): anchor.attr('href')
                for anchor in subItems('ul li a').items()
            }
            mainItems[subItems('.top_link').text()] = [links]
        return mainItems

    def resonseImage(self):
        """Download the captcha through the session and return the recognised code.

        Returns:
            Whatever ``CHECK_CODE.verify`` returns for the image bytes, or
            ``None`` after printing the error when anything fails.
        """
        try:
            response = self.rssions.get(
                self.BASE_URL + '/CheckCode.aspx',
                stream=True,
                timeout=self.TIMEOUT,
            )
            # To inspect a captcha by hand, write response.content to
            # '<code>.gif' here.
            return CHECK_CODE.verify(response.content)
        except Exception as e:
            print(e)
            return None

    def responseData(self):
        """Read the hidden ASP.NET fields needed by the login form.

        Returns:
            ``{'VIEWSTATE': ..., 'EVENTVALIDATION': ...}`` taken from the
            ``value`` attributes inside ``#form1``, or ``None`` after printing
            the error when the page cannot be fetched or parsed.
        """
        # Use the session created by main() so the hidden fields come from the
        # same visit as the cookies; fall back to a plain request when the
        # method is called before main().
        client = getattr(self, 'rssions', None)
        if client is None:
            client = requests
        try:
            response = client.get(self.BASE_URL, timeout=self.TIMEOUT)
            html = pq(response.text)
            return {
                'VIEWSTATE': html('#form1 #__VIEWSTATE').attr('value'),
                'EVENTVALIDATION': html('#form1 #__EVENTVALIDATION').attr('value'),
            }
        except Exception as e:
            print(e)
            return None

    def resonsePara(self):
        """Request the personal timetable page using the link found in the menu.

        Reads ``self.menu`` and ``self.rssions``.  The student number, name
        and module code are cut out of the menu link with regular expressions
        and sent back as query parameters.

        Returns:
            The value returned by the session's ``get`` call.

        Raises:
            KeyError / IndexError: when the menu has no timetable link or the
                link does not have the expected ``xh=…&xm=…&gnmkdm=…`` shape.
        """
        url = self.BASE_URL + '/' + self.menu['信息查詢'][0]['個人課表查詢']
        xh = re.findall('xh=(.*?)&xm', url, re.S)[0]
        gnmkdm = re.findall('dm=(.*?)$', url, re.S)[0]
        xm = re.findall(r'xm=(.*?)&gn', url, re.S)[0]
        data = {
            'xh': xh,
            'xm': xm,
            'gnmkdm': gnmkdm,
        }
        # Per the original author's note the referer is mandatory: without it
        # the timetable page is not served.
        referer = 'http://61.142.209.20:9090/xs_main.aspx?xh={xh}'.format(xh=xh)
        return self.rssions.get(
            url='http://61.142.209.20:9090/xskbcx.aspx?',
            params=data,
            headers=self._request_headers(referer=referer),
            timeout=self.TIMEOUT,
        )

    def main(self):
        """Run the whole flow: log in, read the menu, print the timetable page.

        Raises:
            RuntimeError: when the login form fields or the captcha could not
                be obtained, or when the menu returned after login contains
                no timetable link (typically a failed login).
        """
        # One session for every request so cookies are handled automatically.
        self.rssions = requests.session()
        self.rssions.get(url=self.BASE_URL, timeout=self.TIMEOUT)
        url_login = self.BASE_URL + '/default2.aspx'

        para = self.responseData()
        if para is None:
            raise RuntimeError('could not read the hidden login form fields')
        code = self.resonseImage()
        if code is None:
            raise RuntimeError('could not download or recognise the captcha')

        data = {
            '__VIEWSTATE': para['VIEWSTATE'],
            '__EVENTVALIDATION': para['EVENTVALIDATION'],
            'TextBox1': self.number,
            'TextBox2': self.password,
            'TextBox3': code,
            'RadioButtonList1': self.STUDENT_ROLE,
            'Button1': '',
        }
        log_response = self.rssions.post(
            url=url_login,
            headers=self.headers,
            data=data,
            timeout=self.TIMEOUT,
        )
        self.menu = self.responseMenu(log_response)
        try:
            timetable_href = self.menu['信息查詢'][0]['個人課表查詢']
        except (KeyError, IndexError) as err:
            raise RuntimeError(
                'timetable link not found in the menu; the login probably failed'
            ) from err

        self.name = re.findall(r'xm=(.*?)&gn', timetable_href, re.S)[0]
        # Name rewritten as %uXXXX escapes; not used elsewhere in this listing.
        self.nameCode = self.name.encode('unicode_escape').decode().replace('\\u', '%u')
        fisrClss = self.resonsePara()
        self.parseFClss(fisrClss)

    def parseFClss(self, data):
        """Print the raw HTML of the timetable response (no parsing is done)."""
        print(data.text)
if __name__ == '__main__':
    # Script entry point: build the scraper and run the full login/fetch flow.
    GDSchool().main()
首先來分析一下正方教務系統,這裏分析的是廣東職業技術學院的正方教務。要獲取課程信息,第一步是模擬登錄。正文:
創建一個session保持會話,這樣後面就不需要管cookies了
# Open one shared session and visit the site root once; later requests made
# through self.rssions reuse the same cookies.
self.rssions = requests.session()
self.rssions.get(url='http://61.142.209.20:9090')
通過輸入錯誤的密碼,分析asp可知登錄所需的data參數:
'__VIEWSTATE':para['VIEWSTATE'],
'__EVENTVALIDATION':para['EVENTVALIDATION'],
'TextBox1':self.number,
'TextBox2':self.password,
'TextBox3':code,
'RadioButtonList1':'%D1%A7%C9%FA',
'Button1':''
其中 '__VIEWSTATE'、'__EVENTVALIDATION' 這兩個參數可在網頁中解析出來:
def responseData(self):
    """Read the hidden ASP.NET fields needed by the login form.

    Returns:
        ``{'VIEWSTATE': ..., 'EVENTVALIDATION': ...}`` taken from the ``value``
        attributes inside ``#form1``, or ``None`` after printing the error
        when the page cannot be fetched or parsed.
    """
    # Fetch through the shared session when it exists so the hidden fields
    # belong to the same visit as the cookies; otherwise fall back to a plain
    # request, which is what this function did originally.
    client = getattr(self, 'rssions', None)
    if client is None:
        client = requests
    try:
        response = client.get('http://61.142.209.20:9090')
        html = pq(response.text)
        return {
            'VIEWSTATE': html('#form1 #__VIEWSTATE').attr('value'),
            'EVENTVALIDATION': html('#form1 #__EVENTVALIDATION').attr('value'),
        }
    except Exception as e:
        print(e)
        # Explicit so callers can see that failure is reported as None.
        return None
TextBox1、TextBox2 分別是學號與密碼,直接輸入即可;RadioButtonList1 這個參數應該是用來區分教師端還是學生端的,直接複製即可;Button1 爲空;TextBox3 爲驗證碼,通過第三方庫直接識別:
def resonseImage(self):
    """Download the captcha through the session and return the recognised code.

    Any exception is printed and swallowed, in which case the function
    returns ``None``.
    """
    captcha_url = 'http://61.142.209.20:9090/CheckCode.aspx'
    try:
        image = self.rssions.get(captcha_url, stream=True)
        # To inspect a captcha by hand, write image.content to '<code>.gif'.
        return CHECK_CODE.verify(image.content)
    except Exception as err:
        print(err)
參數都獲取完成後,通過session發起post請求就能跳轉到首頁頁面了:
後面我是遍歷了整個菜單的鏈接,其實大可不必,可以直接獲取課程的鏈接:
大家只要把我獲取菜單鏈接的函數重寫成直接獲取課表鏈接的函數即可:
重寫這個函數
def responseMenu(self, data):
    """Parse the menu of the page returned after login.

    Args:
        data: response object whose ``.text`` is the landing-page HTML.

    Returns:
        dict mapping each menu entry's ``.top_link`` text to a one-element
        list holding a ``{link text: href}`` dict of its sub-links.
    """
    html = pq(data.text)
    mainItems = {}
    for subItems in html('#headDiv > ul li').items():
        # Build the link map once per menu entry.  The earlier version created
        # the list inside the inner loop, so an entry without sub-links reused
        # the previous entry's list, or failed on an unbound local if it came
        # first.
        links = {
            anchor.text(): anchor.attr('href')
            for anchor in subItems('ul li a').items()
        }
        mainItems[subItems('.top_link').text()] = [links]
    return mainItems
拿到課程鏈接之後就可以通過session訪問課表鏈接,再將返回的response解析打印出來即可。這裏沒有解析,直接將其html打印出來了
def resonsePara(self):
    """Request the personal timetable page using the link found in the menu.

    Reads ``self.menu``, ``self.ua`` and ``self.rssions``.  The student
    number, name and module code are cut out of the menu link with regular
    expressions and sent back as query parameters.

    Returns:
        The value returned by the session's ``get`` call.

    Raises:
        KeyError / IndexError: when the menu has no timetable link or the link
            does not have the expected ``xh=…&xm=…&gnmkdm=…`` shape.
    """
    url = 'http://61.142.209.20:9090/' + self.menu['信息查詢'][0]['個人課表查詢']
    xh = re.findall('xh=(.*?)&xm', url, re.S)[0]
    gnmkdm = re.findall('dm=(.*?)$', url, re.S)[0]
    xm = re.findall(r'xm=(.*?)&gn', url, re.S)[0]
    data = {
        'xh': xh,
        'xm': xm,
        'gnmkdm': gnmkdm,
    }
    referer = 'http://61.142.209.20:9090/xs_main.aspx?xh={xh}'
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        # A real language preference instead of the placeholder value.
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Host': '61.142.209.20:9090',
        'Accept-Encoding': 'gzip, deflate',
        # Per the original author's note the referer is mandatory: without it
        # the timetable page is not served.
        'referer': referer.format(xh=xh),
        # The header name is 'User-Agent'; 'UserAgent' would be sent as an
        # unrelated custom header and the random UA would never take effect.
        'User-Agent': self.ua.random,
    }
    return self.rssions.get(
        url='http://61.142.209.20:9090/xskbcx.aspx?',
        params=data,
        headers=headers,
    )
爬取過程中要注意鏈接中的編碼,有些鏈接要轉換之後才能使用
錯誤之處還望大家多多指教