獲取遷木網QS世界大學排名信息

原創

2020-07-05 18:00

處理網址：http://www.qianmu.org/ranking/1528.htm


# 獲取qianmu遷木網QS世界大學排名信息
import requests
from lxml import etree
import re

def fetch(start_url, timeout=10):
    """Download a page and return its body as text.

    Args:
        start_url: URL to request with HTTP GET.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on an unresponsive
            host, which would stall the whole crawl.

    Returns:
        The response body decoded to ``str`` (``Response.text``).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    r = requests.get(start_url, timeout=timeout)
    # raise_for_status() does nothing for non-error statuses, so an explicit
    # status-code comparison beforehand is unnecessary.
    r.raise_for_status()
    return r.text

def process_detail(link, length, num):
    """Download one detail page and extract its title and infobox rows.

    Args:
        link: URL of the detail page.
        length: Total number of detail pages; used only in the progress line.
        num: Position of this page in the crawl; used only in the progress
            line.

    Returns:
        A dict holding the page title under ``'name'`` plus one entry per
        infobox row (text of the first cell -> text of the second cell), or
        ``None`` when the page has no title, no infobox, or the two cell
        columns differ in length.
    """
    # Drop tabs and line breaks up front so they do not end up inside the
    # extracted cell text.
    page = fetch(link).replace('\t', '').replace('\n', '').replace('\r', '')
    select = etree.HTML(page)

    # A page without the expected heading is treated like any other
    # malformed page: skip it instead of crashing with IndexError.
    titles = select.xpath('//*[@id="wikiContent"]/h1/text()')
    if not titles:
        return None
    data = {'name': titles[0].strip()}
    print("處理進度：[%s]-%d/%d" % (data['name'], num, length))

    tables = select.xpath('//div[@class="infobox"]')
    if not tables:
        return None
    table = tables[0]

    # td[1] / td[2] select the first and second cell of every row.
    keys = [''.join(cell.xpath('.//text()')) for cell in table.xpath('.//td[1]')]
    values = [''.join(cell.xpath('.//text()')) for cell in table.xpath('.//td[2]')]
    # Rows with a missing second cell would misalign keys and values.
    if len(keys) != len(values):
        return None

    data.update(zip(keys, values))
    return data

def process_data(data):
    """Strip footnote markers from scraped values and print the result.

    Scraped values can carry markers such as the ``*(3)`` in ``'9,771*(3)'``.
    Every such marker is removed from every value, including several
    different markers inside the same value.

    Args:
        data: Mapping of field name to scraped text, or a falsy value
            (``None`` / empty dict) when the page could not be parsed.

    Returns:
        The cleaned dict (which is also printed), or ``None`` when ``data``
        is falsy, in which case nothing is printed.
    """
    if not data:
        return None
    patt = re.compile(r'\*\(\d+\)')
    # sub() removes all markers; replacing only the first match's text would
    # leave later markers with a different number in place.
    new_data = {k: patt.sub('', v) for k, v in data.items()}
    print(new_data)
    return new_data

if __name__ == "__main__":
    # Entry page: the ranking table whose rows link to the detail pages.
    start_url = 'http://www.qianmu.org/ranking/1528.htm'
    html = etree.HTML(fetch(start_url))
    # Skip the first table row (presumably the header) and collect the link
    # in the second column. The former predicate [contains(@a,"")] was always
    # true (every string contains the empty string), so dropping it selects
    # exactly the same nodes.
    links = html.xpath('//div[@class="rankItem"]/table/tbody/tr[position()>1]/td[2]/a/@href')
    length = len(links)
    # enumerate supplies the 1-based progress counter.
    for num, link in enumerate(links, start=1):
        data = process_detail(link, length, num)
        process_data(data)

效果圖：

發表評論

所有評論

還沒有人評論，想成為第一個評論的人嗎？請在上方評論欄輸入並點擊發布。

獲取遷木網QS世界大學排名信息

Golang爬蟲代理接入的技術與實踐

requests的一些操作筆記

BeautifulSoup中has_attr和attrs使用

urllib中urlparse使用技巧以及iter_content圖片邊下邊存到硬盤使用

獲取遷木網QS世界大學排名信息

response.replace(body=response.text.replace('\xa0',''))，scrapy抓取網頁含\r \t \n \xa0時，修改response方法

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結