代碼如下:
from DrawStu.DrawStu import DrawStu
import time
import io
import sys

# Re-wrap stdout so Chinese output prints correctly on consoles whose
# default encoding is narrower than the crawled text (e.g. GBK on Windows).
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')

# Build the crawler once at module level; its __init__ also ensures the
# destination database table exists.
draw = DrawStu()

if __name__ == '__main__':
    print('爬取研究生調劑信息')
    # Total number of list pages, parsed from the site's pager widget.
    size = draw.get_page_size()
    print(size)
    for x in range(size):
        # The site paginates with ?start=N, 50 entries per page.
        start = x * 50
        print(start)
        created_url = 'https://yz.chsi.com.cn/kyzx/tjxx/?start=' + str(start)
        draw.draw_base_list(created_url)
import sqlite3;
class DB(object):
    """SQLite-backed storage for crawled graduate-transfer news records.

    Opens (or creates) ``Test.db`` in the working directory and exposes
    helpers to create the ``mynews`` table and insert one record.
    """

    def __init__(self):
        """Open the database and keep a connection plus a cursor."""
        # The original code pre-assigned {} placeholders to conn/cus and then
        # immediately overwrote them; the placeholders served no purpose.
        self.conn = sqlite3.connect(r'Test.db')
        self.cus = self.conn.cursor()

    def create_table(self):
        """Create the mynews table if it does not already exist."""
        sql = " CREATE TABLE if not exists mynews (CrawlTime char,Title char,Content char,PublishTime char,Origin char)"
        self.conn.execute(sql)
        self.conn.commit()
        print('create table successfully')

    def insert_into_news(self, ops):
        """Insert one news record and commit.

        ops: dict with keys CrawlTime, Title, Content, PublishTime, Origin.
        Uses a parameterized statement, so values are safely escaped.
        """
        self.conn.execute(
            'insert into mynews(CrawlTime,Title,Content,PublishTime,Origin) values(?,?,?,?,?)',
            (ops['CrawlTime'], ops['Title'], ops['Content'],
             ops['PublishTime'], ops['Origin']))
        self.conn.commit()

    def close(self):
        """Release the cursor and the connection (new, backward-compatible)."""
        self.cus.close()
        self.conn.close()
#原文要求使用urllib3,但下面的代碼實際使用的是標準庫 urllib.request
import urllib.request;
from bs4 import BeautifulSoup;
from DB.DB import DB;
# Module-level database handle shared by every DrawStu instance/method.
db=DB();
import time;
"""爬取核心的核心模塊,功能只負責爬取研究生調劑信息"""
class DrawStu():
    """Crawler for graduate-transfer ("調劑") news on yz.chsi.com.cn.

    Fetches the paginated news list, follows each entry to its detail
    page, and stores one record per article via the module-level ``db``.
    """

    def __init__(self):
        self.baseurl = 'https://yz.chsi.com.cn/kyzx/tjxx/'
        # Ensure the destination table exists before any insert happens.
        db.create_table()

    def commonsdk(self, url):
        """Fetch *url* and return the page parsed as a BeautifulSoup doc."""
        response = urllib.request.urlopen(url)
        html = response.read()
        print(html)
        # Bug fix: name the parser explicitly. Bare BeautifulSoup(html)
        # emits a warning and uses whichever parser happens to be installed,
        # which can change the parse tree between machines.
        return BeautifulSoup(html, 'html.parser')

    def draw_base_list(self, url):
        """Crawl one list page and process every news entry on it."""
        print('url is:::', url)
        doc = self.commonsdk(url)
        lilist = doc.find('ul', {'class': 'news-list'}).findAll('li')
        for item in lilist:
            title = item.find('a').text
            pub_time = item.find('span').text
            link = 'https://yz.chsi.com.cn' + item.find('a').get('href')
            self.draw_detail_list(link, title, pub_time)

    def draw_detail_list(self, url, Title, Time):
        """Crawl one detail page and persist the assembled record.

        url: absolute detail-page URL; Title/Time come from the list page.
        """
        doc = self.commonsdk(url)
        from_info = doc.find('span', {'class': 'news-from'}).text
        content = doc.find('div', {'class': 'content-l detail'}).text
        # Crawl timestamp, recorded alongside the article's own publish time.
        ctime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
        data = {
            'CrawlTime': ctime,
            'Title': Title,
            'Content': content,
            'PublishTime': Time,
            'Origin': from_info,
        }
        print(data)
        print('插入數據庫中')
        db.insert_into_news(data)

    def get_page_size(self):
        """Return the total number of list pages, read from the pager widget."""
        pcxt = (self.commonsdk(self.baseurl)
                .find('div', {'class': 'pageC'})
                .findAll('span')[0].text)
        print(pcxt)
        # The pager text looks like "1/NN"; take the part after the slash.
        pagestr = pcxt.strip().split('/')[1]
        # NOTE(review): only the first two characters are used, so a page
        # count above 99 would be truncated — confirm against the live site.
        return int(pagestr[0:2])
F12查看網頁元素
爬取結果:
轉化成數據庫表格形式,採用database net軟件,效果如下:
新建查詢輸入:select * from mynews
其中在錄每一個學校的信息都能查詢