我爬取的地址是:http://authserver.bbgu.edu.cn/authserver/login?service=http%3A%2F%2Fehall.bbgu.edu.cn%2Flogin%3Fservice%3Dhttp%3A%2F%2Fehall.bbgu.edu.cn%2Fnew%2Findex.html
起初以爲只有北部灣大學在用這套登錄系統,不過應該不止北部灣大學在用。不管了,先貼出來當筆記吧!
基本步驟:
1.獲取驗證碼
def get_captcha(self):
    """Download the captcha image via the shared session and save it to disk.

    The image is requested from ``self.captcha_url`` and written to
    ``captcha.png`` in the current working directory, overwriting any
    previous file.
    """
    image_bytes = self.session.get(self.captcha_url).content
    with open('captcha.png', 'wb') as out:
        out.write(image_bytes)
def captcha_ocr(self):
    """Run OCR on ``captcha.png`` and return the recognised text.

    The image is converted to greyscale and binarised with a fixed
    threshold before being handed to tesserocr. Only the first line of the
    OCR output is kept, with spaces removed.
    """
    threshold = 110
    # Lookup table for Image.point: grey levels below the threshold map
    # to 0, all others map to 1.
    lookup = [0 if level < threshold else 1 for level in range(256)]
    binary = Image.open('captcha.png').convert('L').point(lookup, '1')
    text = tesserocr.image_to_text(binary)
    first_line = text.split('\n')[0]
    return first_line.replace(" ", "")
2.提交登錄表單
3.獲取成績token
4.獲取並下載成績單
成績最終保存爲 CSV 文件(data.csv),完整代碼如下:
import requests
import tesserocr
from PIL import Image
from pyquery import PyQuery as pq
from urllib.parse import unquote
import re
import json
import csv
class Login(object):
    """Log in to the BBGU auth server and export grade records to a CSV file.

    Workflow: fetch the login form tokens, download and OCR the captcha,
    submit the login form, follow the redirect chain of the grade service to
    obtain a URL carrying ``uid`` and ``token``, then query the grade list
    for every configured school year / semester and append the rows to
    ``save_path``.

    Attributes set in ``__init__``:
        captcha_url / login_url: auth server endpoints.
        session: ``requests.Session`` shared by all requests (keeps cookies).
        admin_flag: last login outcome -- 0: wrong username/password,
            1: invalid captcha (also the initial value), 2: any other
            response (treated as success).
        save_path: output CSV path.
        username / password: credentials, to be filled in by the user.
        startSchoolYear / endSchoolYear: school years are iterated as
            ``range(startSchoolYear, endSchoolYear)``.
        start_semester / end_semester: inclusive semester range.
    """

    # Browser User-Agent sent with every request.
    _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'

    def __init__(self):
        self.captcha_url = 'http://authserver.bbgu.edu.cn/authserver/captcha.html'
        self.login_url = 'http://authserver.bbgu.edu.cn/authserver/login?service=http%3A%2F%2Fehall.bbgu.edu.cn%2Flogin%3Fservice%3Dhttp%3A%2F%2Fehall.bbgu.edu.cn%2Fnew%2Findex.html'
        self.session = requests.Session()
        self.admin_flag = 1  # defaults to 1: captcha not (yet) accepted
        self.save_path = 'data.csv'  # output file path
        self.username = ''  # account name
        self.password = ''  # password
        self.startSchoolYear = 2017
        self.endSchoolYear = 2020
        self.start_semester = 1
        # With start semester 1 and end semester 1 only the first semester
        # is crawled.
        self.end_semester = 2

    def csv_init(self):
        """Create (or truncate) the output CSV and write the header row."""
        header = ["學年", "學期", "課程代碼", "課程性質", "課程名稱", "學分", "成績", "開課學院", "重修標記"]
        # Explicit encoding: the header is non-ASCII, so relying on the
        # platform default could fail or produce unreadable output. The BOM
        # lets spreadsheet tools detect UTF-8.
        with open(self.save_path, 'w', newline='', encoding='utf-8-sig') as file:
            csv.writer(file).writerow(header)

    def get_login(self):
        """Fetch the login page and return its hidden ``(lt, execution)`` values."""
        headers = {
            'Host': 'authserver.bbgu.edu.cn',
            'User-Agent': self._USER_AGENT,
        }
        response = self.session.get(self.login_url, headers=headers)
        html = pq(response.text)
        lt = html("form#casLoginForm>input[name='lt']").attr('value')
        execution = html("form#casLoginForm>input[name='execution']").attr('value')
        return lt, execution

    def get_captcha(self):
        """Download the captcha image with the session and save it as captcha.png."""
        response = self.session.get(self.captcha_url)
        with open('captcha.png', 'wb') as f:
            f.write(response.content)

    def captcha_ocr(self):
        """OCR ``captcha.png`` and return the first text line without spaces."""
        threshold = 110
        # Lookup table for Image.point: grey levels below the threshold map
        # to 0, all others map to 1.
        table = [0 if level < threshold else 1 for level in range(256)]
        image = Image.open('captcha.png').convert('L').point(table, '1')
        captcha = tesserocr.image_to_text(image)
        return captcha.split('\n')[0].replace(" ", "")

    def post_login(self, username, password, captcha, lt, execution):
        """Submit the login form and record the outcome in ``self.admin_flag``.

        Sets ``admin_flag`` to 0 for a wrong username/password message, 1 for
        an invalid-captcha message and 2 for anything else.
        """
        post_headers = {
            'Host': 'authserver.bbgu.edu.cn',
            'Origin': 'http://authserver.bbgu.edu.cn',
            'Referer': 'http://authserver.bbgu.edu.cn/authserver/login?service=http%3A%2F%2Fehall.bbgu.edu.cn%2Flogin%3Fservice%3Dhttp%3A%2F%2Fehall.bbgu.edu.cn%2Fnew%2Findex.html',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': self._USER_AGENT,
        }
        post_data = {
            'username': username,
            'password': password,
            'captchaResponse': captcha,
            'lt': lt,
            'dllt': 'userNamePasswordLogin',
            'execution': execution,
            '_eventId': 'submit',
            'rmShown': '1',
        }
        response = self.session.post(self.login_url, data=post_data, headers=post_headers)
        html = pq(response.text)
        # NOTE(review): the comparisons below need an exact match with the
        # message text the server renders -- confirm the literals against a
        # real failure page.
        admin_status = html('form#casLoginForm>span').text()
        if admin_status == '您提供的用戶名或者密碼有誤':
            print('您提供的用戶名或者密碼有誤')
            print('登錄失敗,請重新輸入賬號密碼')
            self.admin_flag = 0  # wrong username or password
        elif admin_status == "無效的驗證碼":
            print('無效的驗證碼')
            print('正在重新獲取驗證碼,重新登錄')
            print('..........................')
            self.admin_flag = 1  # invalid captcha
        else:
            # NOTE(review): every other response is treated as success,
            # including error messages not listed above.
            self.admin_flag = 2

    def login(self):
        """Run the whole login flow; return 1 on success, 0 otherwise.

        The captcha is re-downloaded and re-submitted up to 10 times while
        the server keeps reporting an invalid captcha.
        """
        # NOTE(review): the same lt/execution pair is reused for every retry;
        # confirm the server accepts that after a failed attempt.
        lt, execution = self.get_login()
        attempts = 0  # number of captcha attempts so far
        while self.admin_flag == 1 and attempts < 10:
            attempts += 1
            self.get_captcha()
            captcha = self.captcha_ocr()
            self.post_login(self.username, self.password, captcha, lt, execution)
        if self.admin_flag == 2:
            print('login successed')
            return 1
        return 0

    def get_score_url(self, max_retries=5):
        """Follow the grade service redirect chain and return the final URL.

        Four redirects are followed manually (``allow_redirects=False``); the
        ``Location`` header of the fourth response is returned.

        Args:
            max_retries: how many times the whole chain is attempted when a
                response lacks a ``Location`` header.

        Raises:
            RuntimeError: if no attempt produced the final redirect.
        """
        url1 = 'http://xqcxht.bbgu.edu.cn:8082/qinzhouh5/cas/studentRecord/list.html?amp_sec_version_=1&gid_=RHdsVlJDOC84UUMyQkJKTmVIWWIyNjZOeE9Nd0RlUlNSeGFva3RqL0ZrTS9iZzc4anRSaFpzSGozMDEvTVA2SUhzaTBNUVpaZjN6SGlLK29nY1N0TWc9PQ&EMAP_LANG=zh&THEME=millennium'
        headers = {
            'Host': 'xqcxht.bbgu.edu.cn:8082',
            'User-Agent': self._USER_AGENT,
        }
        headers2 = {
            'Host': 'authserver.bbgu.edu.cn',
            'User-Agent': self._USER_AGENT,
        }
        for _ in range(max_retries):
            try:
                response1 = self.session.get(url1, headers=headers, allow_redirects=False)
                url2 = unquote(response1.headers['Location'])
                response2 = self.session.get(url2, headers=headers2, allow_redirects=False)
                url3 = unquote(response2.headers['Location'])
                response3 = self.session.get(url3, headers=headers, allow_redirects=False)
                url4 = response3.headers['Location']
                response4 = self.session.get(url4, headers=headers, allow_redirects=False)
                return response4.headers['Location']
            except KeyError as e:
                # A hop did not redirect; restart the chain from the top.
                print('keyError', e.args)
        raise RuntimeError(
            'no score URL obtained after {} attempt(s)'.format(max_retries)
        )

    def get_score(self, score_url, startSchoolYear, endSchoolYear, semester):
        """Query one semester's grade list and append its rows to the CSV.

        Args:
            score_url: URL containing ``uid=...&token=...`` (as returned by
                ``get_score_url``); also sent as the Referer.
            startSchoolYear / endSchoolYear / semester: query filters passed
                through to the service.

        Raises:
            ValueError: if ``score_url`` does not contain uid and token.
        """
        result = re.search(r'uid=(.*)&token=(.*)', score_url or '')
        if result is None:
            raise ValueError('uid/token not found in score URL: {!r}'.format(score_url))
        uid, token = result.group(1), result.group(2)
        headers = {
            'Host': 'xqcxht.bbgu.edu.cn:8082',
            'Origin': 'http://xqcxht.bbgu.edu.cn:8082',
            'Proxy-Connection': 'keep-alive',
            'Referer': score_url,
            'User-Agent': self._USER_AGENT,
            'X-Requested-With': 'XMLHttpRequest',
        }
        # NOTE(review): only page 1 with 20 records is requested; a semester
        # with more records than that would be cut short -- confirm.
        post_data = {
            "pageNum": 1,
            "pageSize": 20,
            "stuNumber": uid,
            "startSchoolYear": startSchoolYear,
            "endSchoolYear": endSchoolYear,
            "semester": semester,
            "uid": uid,
            "token": token,
        }
        url = 'http://xqcxht.bbgu.edu.cn:8082/qinzhouh5/studentRecord/getStuRecordList'
        response = self.session.post(url, json=post_data, headers=headers)
        self.datas_handle(response.json())

    def datas_handle(self, datas):
        """Extract the grade records from a response dict and save each as a row.

        Missing or empty ``RetData`` / ``studentRecordPage`` / ``records``
        levels are treated as "no records".
        """
        page = ((datas or {}).get('RetData') or {}).get('studentRecordPage') or {}
        for item in page.get('records') or []:
            # format() instead of '+' so non-string year values do not raise.
            school_year = '{}-{}'.format(
                item.get('startSchoolYear'), item.get('endSchoolYear')
            )
            row = [
                school_year,
                item.get('semester'),            # semester
                item.get('lessonCode'),          # course code
                item.get('courseNature'),        # course nature
                item.get('lessonName'),          # course name
                item.get('credits'),             # credits
                item.get('results'),             # grade
                item.get('beginCollege'),        # offering college
                item.get('reconstructionSign'),  # retake flag
            ]
            self.save_to_csv(row)

    def save_to_csv(self, list):
        """Append one row to the output CSV.

        The parameter keeps its original name ``list`` (shadowing the
        builtin) so existing keyword callers continue to work.
        """
        with open(self.save_path, 'a', newline='', encoding='utf-8') as csvfile:
            csv.writer(csvfile).writerow(list)

    def get_messeges(self):
        """Fetch and save grades for every configured school year and semester."""
        score_url = self.get_score_url()
        for year in range(self.startSchoolYear, self.endSchoolYear):
            for semester in range(self.start_semester, self.end_semester + 1):
                self.get_score(score_url, year, year + 1, semester)
        print('datas saved in', self.save_path)
def main():
    """Log in and, on success, export the grades to the CSV file.

    The CSV is (re)initialised only after a successful login so that a
    failed login does not wipe a previously exported file.
    """
    login = Login()
    if not login.login():
        return
    login.csv_init()
    login.get_messeges()


if __name__ == '__main__':
    main()