模擬登錄爬取大學智慧校園的成績單

我爬取的地址是:http://authserver.bbgu.edu.cn/authserver/login?service=http%3A%2F%2Fehall.bbgu.edu.cn%2Flogin%3Fservice%3Dhttp%3A%2F%2Fehall.bbgu.edu.cn%2Fnew%2Findex.html

應該只有北部灣大學使用了吧——好吧,應該不止北部灣大學在用。不管了,先貼着當筆記吧!

基本步驟:

1.獲取驗證碼

 def get_captcha(self):
        """Download the captcha image and save it as ``captcha.png``.

        The image is requested from ``self.captcha_url`` through
        ``self.session`` and the raw response body is written to the current
        working directory in binary mode.
        """
        response=self.session.get(self.captcha_url)
        #print(response.cookies)
        with open('captcha.png','wb') as f:
            f.write(response.content)
  
    def captcha_ocr(self):
        """Run OCR on ``captcha.png`` and return the recognised text.

        The image is converted to greyscale, binarised with a fixed
        threshold, and passed to ``tesserocr.image_to_text``. Only the first
        line of the OCR output is kept, with spaces removed.
        """
        image = Image.open('captcha.png')
 
        # 'L' is the 8-bit greyscale mode.
        image = image.convert('L')
        threshold = 110
        # Lookup table for the 256 grey levels: below the threshold -> 0,
        # otherwise -> 1.
        table = []
        for i in range(256):
            if i < threshold:
                table.append(0)
            else:
                table.append(1)
 
        # Apply the lookup table and produce a 1-bit image.
        image = image.point(table, '1')
        #image.show()
        captcha = tesserocr.image_to_text(image)
        # Keep the first output line only and drop spaces inside it.
        captcha=captcha.split('\n')[0]
        captcha=captcha.replace(" ","")
        return captcha

2.提交登錄表單

3.獲取成績token

4.獲取並下載成績單

我保存的文件爲csv

import requests
import tesserocr
from PIL import Image
from pyquery import PyQuery as pq
from urllib.parse import unquote
import re
import json
import csv

class Login(object):
    """Log in to the campus portal and export the grade records to a CSV file.

    Typical use: ``csv_init()`` to create the output file, ``login()`` to
    authenticate (captcha solved by OCR), then ``get_messeges()`` to download
    every configured school year / semester and append the rows to the CSV.
    """

    # Seconds to wait for any single HTTP request before giving up, so that a
    # stalled server cannot hang the script forever.
    REQUEST_TIMEOUT = 10
    # Explicit encoding for the CSV so the Chinese header does not depend on
    # the platform's default locale; the BOM lets spreadsheet tools detect
    # UTF-8.
    CSV_ENCODING = 'utf-8-sig'
    _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'
    # Error messages shown inside the login form. Both the Traditional and the
    # Simplified Chinese spellings are accepted.
    # NOTE(review): the server presumably answers in Simplified Chinese --
    # confirm against a real response.
    _BAD_CREDENTIALS_MESSAGES = ('您提供的用戶名或者密碼有誤', '您提供的用户名或者密码有误')
    _BAD_CAPTCHA_MESSAGES = ('無效的驗證碼', '无效的验证码')

    def __init__(self):
        self.captcha_url='http://authserver.bbgu.edu.cn/authserver/captcha.html'
        self.login_url='http://authserver.bbgu.edu.cn/authserver/login?service=http%3A%2F%2Fehall.bbgu.edu.cn%2Flogin%3Fservice%3Dhttp%3A%2F%2Fehall.bbgu.edu.cn%2Fnew%2Findex.html'
        self.session=requests.Session()
        # 0: wrong username/password (or another login error),
        # 1: captcha rejected / not attempted yet, 2: logged in.
        self.admin_flag=1
        self.save_path='data.csv'  # output CSV path
        self.username=''  # account name
        self.password=''  # password
        self.startSchoolYear=2017
        self.endSchoolYear=2020
        self.start_semester=1
        # Semesters start_semester..end_semester (inclusive) are fetched;
        # start 1 and end 1 fetches the first semester only.
        self.end_semester=2

    def csv_init(self):
        """Create (or truncate) the output CSV and write the header row."""
        header=["學年","學期","課程代碼","課程性質","課程名稱","學分","成績","開課學院","重修標記"]
        with open(self.save_path,'w',newline='',encoding=self.CSV_ENCODING) as file:
            csv.writer(file).writerow(header)

    def get_login(self):
        """Fetch the login page and return its hidden ``(lt, execution)`` values."""
        headers={
            'Host': 'authserver.bbgu.edu.cn',
            'User-Agent': self._USER_AGENT
            }
        response=self.session.get(self.login_url,headers=headers,timeout=self.REQUEST_TIMEOUT)
        html=pq(response.text)
        lt=html("form#casLoginForm>input[name='lt']").attr('value')
        execution=html("form#casLoginForm>input[name='execution']").attr('value')
        return lt,execution

    def get_captcha(self):
        """Download the captcha image for this session into ``captcha.png``."""
        response=self.session.get(self.captcha_url,timeout=self.REQUEST_TIMEOUT)
        with open('captcha.png','wb') as f:
            f.write(response.content)

    def captcha_ocr(self,threshold=110):
        """Run OCR on ``captcha.png`` and return the recognised text.

        Args:
            threshold: grey level (0-255) used to binarise the image; pixels
                darker than it map to 0, all others to 1.

        Returns:
            The first line of the OCR output with spaces removed.
        """
        image = Image.open('captcha.png').convert('L')
        # Lookup table over the 256 grey levels for Image.point().
        table = [0 if level < threshold else 1 for level in range(256)]
        image = image.point(table, '1')
        captcha = tesserocr.image_to_text(image)
        return captcha.split('\n')[0].replace(" ","")

    def post_login(self,username,password,captcha,lt,execution):
        """Submit the login form and record the outcome in ``self.admin_flag``.

        ``admin_flag`` becomes 0 when the form reports bad credentials (or any
        other unrecognised error message), 1 when the captcha was rejected,
        and 2 when the response carries no login-form message.
        """
        post_headers={
            'Host': 'authserver.bbgu.edu.cn',
            'Origin': 'http://authserver.bbgu.edu.cn',
            'Referer': 'http://authserver.bbgu.edu.cn/authserver/login?service=http%3A%2F%2Fehall.bbgu.edu.cn%2Flogin%3Fservice%3Dhttp%3A%2F%2Fehall.bbgu.edu.cn%2Fnew%2Findex.html',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': self._USER_AGENT
            }
        post_data={
            'username': username,
            'password': password,
            'captchaResponse': captcha,
            'lt': lt,
            'dllt': 'userNamePasswordLogin',
            'execution':execution,
            '_eventId': 'submit',
            'rmShown': '1'
            }

        response=self.session.post(self.login_url,data=post_data,headers=post_headers,timeout=self.REQUEST_TIMEOUT)
        html=pq(response.text)
        admin_status=html('form#casLoginForm>span').text()
        if admin_status in self._BAD_CREDENTIALS_MESSAGES:
            print('您提供的用戶名或者密碼有誤')
            print('登錄失敗,請重新輸入賬號密碼')
            self.admin_flag=0
        elif admin_status in self._BAD_CAPTCHA_MESSAGES:
            print('無效的驗證碼')
            print('正在重新獲取驗證碼,重新登錄')
            print('..........................')
            self.admin_flag=1
        elif admin_status:
            # The login form came back with a message we do not recognise;
            # do not mistake it for a successful login.
            print(admin_status)
            self.admin_flag=0
        else:
            self.admin_flag=2

    def login(self,max_attempts=10):
        """Log in, retrying with a fresh captcha while the captcha is rejected.

        Args:
            max_attempts: maximum number of captcha attempts.

        Returns:
            1 when the login succeeded, 0 otherwise.
        """
        # Reset the state so that a second call after a failure retries.
        self.admin_flag=1
        for _ in range(max_attempts):
            # Re-read the form tokens on every attempt so each retry repeats
            # the exact sequence of the first one.
            # NOTE(review): login tickets are presumably single-use (CAS
            # protocol) -- confirm against the server.
            lt,execution=self.get_login()
            self.get_captcha()
            captcha=self.captcha_ocr()
            self.post_login(self.username,self.password,captcha,lt,execution)
            if self.admin_flag!=1:
                break
        if self.admin_flag==2:
            print('login successed')
            return 1
        return 0

    def _follow_score_redirects(self):
        """Follow the four-hop redirect chain manually and return the last Location.

        Raises:
            KeyError: when any hop's response has no ``Location`` header.
        """
        url1='http://xqcxht.bbgu.edu.cn:8082/qinzhouh5/cas/studentRecord/list.html?amp_sec_version_=1&gid_=RHdsVlJDOC84UUMyQkJKTmVIWWIyNjZOeE9Nd0RlUlNSeGFva3RqL0ZrTS9iZzc4anRSaFpzSGozMDEvTVA2SUhzaTBNUVpaZjN6SGlLK29nY1N0TWc9PQ&EMAP_LANG=zh&THEME=millennium'
        score_headers={
            'Host': 'xqcxht.bbgu.edu.cn:8082',
            'User-Agent': self._USER_AGENT
            }
        auth_headers={
            'Host': 'authserver.bbgu.edu.cn',
            'User-Agent': self._USER_AGENT
            }
        timeout=self.REQUEST_TIMEOUT
        # Redirects are followed by hand because the Host header has to be
        # switched between the score server and the auth server.
        response1=self.session.get(url1,headers=score_headers,allow_redirects=False,timeout=timeout)
        url2=unquote(response1.headers['Location'])
        response2=self.session.get(url2,headers=auth_headers,allow_redirects=False,timeout=timeout)
        url3=unquote(response2.headers['Location'])
        response3=self.session.get(url3,headers=score_headers,allow_redirects=False,timeout=timeout)
        url4=response3.headers['Location']
        response4=self.session.get(url4,headers=score_headers,allow_redirects=False,timeout=timeout)
        return response4.headers['Location']

    def get_score_url(self,max_retries=3):
        """Return the score-page URL that carries ``uid`` and ``token``.

        The redirect chain is retried when a hop lacks a ``Location`` header.

        Args:
            max_retries: number of additional attempts after the first one.

        Returns:
            The final redirect URL, or ``None`` when every attempt failed.
        """
        for _ in range(max_retries+1):
            try:
                return self._follow_score_redirects()
            except KeyError as e:
                print('keyError',e.args)
        return None

    def get_score(self,score_url,startSchoolYear,endSchoolYear,semester):
        """Request one semester's records and append them to the CSV.

        Args:
            score_url: URL returned by ``get_score_url``; must contain
                ``uid=...&token=...``.
            startSchoolYear: first calendar year of the school year.
            endSchoolYear: second calendar year of the school year.
            semester: semester number within the school year.

        Raises:
            ValueError: when ``score_url`` is empty or has no uid/token pair.
        """
        result=re.search('uid=(.*)&token=(.*)',score_url) if score_url else None
        if result is None:
            raise ValueError('score_url does not contain uid/token: {!r}'.format(score_url))
        uid=result.group(1)
        token=result.group(2)
        headers={
            'Host': 'xqcxht.bbgu.edu.cn:8082',
            'Origin': 'http://xqcxht.bbgu.edu.cn:8082',
            'Proxy-Connection': 'keep-alive',
            'Referer': score_url,
            'User-Agent': self._USER_AGENT,
            'X-Requested-With': 'XMLHttpRequest'
            }

        # NOTE(review): only the first page of 20 records is requested; a
        # semester with more records would presumably be truncated -- confirm
        # the paging behaviour of the endpoint.
        post_data={
            "pageNum":1,
            "pageSize":20,
            "stuNumber":uid,
            "startSchoolYear":startSchoolYear,
            "endSchoolYear":endSchoolYear,
            "semester":semester,
            "uid":uid,
            "token":token}
        url='http://xqcxht.bbgu.edu.cn:8082/qinzhouh5/studentRecord/getStuRecordList'
        response=self.session.post(url,json=post_data,headers=headers,timeout=self.REQUEST_TIMEOUT)
        datas=response.json()
        self.datas_handle(datas)

    def datas_handle(self,datas):
        """Extract the grade records from a response payload and save them.

        Records are read from ``RetData -> studentRecordPage -> records``; a
        payload in which any of these levels is missing or null yields no
        rows instead of raising.
        """
        ret_data=(datas or {}).get('RetData') or {}
        page=ret_data.get('studentRecordPage') or {}
        items=page.get('records') or []
        rows=[]
        for item in items:
            # format() instead of '+' so numeric year values do not raise.
            school_year='{}-{}'.format(item.get('startSchoolYear'),item.get('endSchoolYear'))
            rows.append([
                school_year,
                item.get('semester'),            # semester
                item.get('lessonCode'),          # course code
                item.get('courseNature'),        # course nature
                item.get('lessonName'),          # course name
                item.get('credits'),             # credits
                item.get('results'),             # grade
                item.get('beginCollege'),        # offering college
                item.get('reconstructionSign'),  # retake flag
            ])
        if rows:
            self._append_rows(rows)

    def _append_rows(self,rows):
        """Append several rows to the CSV with a single open of the file."""
        with open(self.save_path,'a+',newline='',encoding=self.CSV_ENCODING) as csvfile:
            csv.writer(csvfile).writerows(rows)

    def save_to_csv(self,list):
        """Append one row to the CSV.

        The parameter keeps its original name ``list`` for backward
        compatibility, even though it shadows the builtin inside this method.
        """
        self._append_rows([list])

    def get_messeges(self):
        """Download every configured school year / semester into the CSV."""
        score_url=self.get_score_url()
        if score_url is None:
            print('failed to obtain the score URL; nothing was downloaded')
            return
        for year in range(self.startSchoolYear,self.endSchoolYear):
            for semester in range(self.start_semester,self.end_semester+1):
                self.get_score(score_url,year,year+1,semester)
        print('datas saved in',self.save_path)

def main():
    """Log in and, on success, export the grade records to the CSV file."""
    login=Login()
    # Log in before touching the output file so that a failed login does not
    # truncate a CSV left over from an earlier successful run.
    if not login.login():
        return
    login.csv_init()
    login.get_messeges()


if __name__ == '__main__':
    main()

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章