Web Scraping Study Notes (1): requests-bs4-re

Preparation

  1. Python IDLE
  2. Checking robots.txt: append /robots.txt to a site's domain to view its crawling rules (see the sketch after this list)
  3. Install the requests library: open cmd as administrator and run pip install requests
  4. Install the beautifulsoup4 library: open cmd as administrator and run pip install beautifulsoup4
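
Before crawling a site, its robots.txt can be fetched with requests itself. A minimal sketch (the domain below is only an example):

import requests

# print a site's crawling rules; any domain works the same way
r = requests.get("https://www.douban.com/robots.txt", timeout=10)
print(r.text)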

Examples

  1. Scraping a single image
import requests
import os

url = "https://i0.hippopx.com/photos/320/918/427/sky-clouds-sunlight-dark-thumb.jpg"
root = "D://pics//"

def getPics(url, root):
    path = root + url.split('/')[-1]    # keep the last '/'-separated segment as the file name
    try:
        # create the root directory if it does not exist
        if not os.path.exists(root):
            os.mkdir(root)
        # download only if the file is not already saved
        if not os.path.exists(path):
            r = requests.get(url)
            r.raise_for_status()
            with open(path, 'wb') as f:
                f.write(r.content)
            print("File saved")
        else:
            print("File already exists")
    except:
        print("Scrape failed")

getPics(url, root)
  2. Scraping university rankings
import requests
import bs4
from bs4 import BeautifulSoup

# fetch the page text
def getHTMLText(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except:
        return ""

# extract the ranking information
def fillUnivList(ulist, html):
    soup = BeautifulSoup(html, "html.parser")
    tbody = soup.find('tbody')
    if tbody is None:       # fetch failed or page structure changed
        return
    for tr in tbody.children:               # iterate over the direct children of 'tbody'
        if isinstance(tr, bs4.element.Tag):  # skip the whitespace strings, keep only Tag nodes
            tds = tr('td')                   # tr('td') is shorthand for tr.find_all('td')
            ulist.append([tds[0].string, tds[1].string, tds[3].string])

# print the first num entries
def printUnivList(ulist, num):
    # formatted output
    print("{:^6}\t{:<10}\t{:^6}".format("Rank", "School", "Score"))
    for i in range(num):
        u = ulist[i]
        print("{:^6}\t{:<10}\t{:^6}".format(u[0], u[1], u[2]))

def main():
    uinfo = []
    url = "http://www.zuihaodaxue.com/Greater_China_Ranking2019_0.html"
    html = getHTMLText(url)
    fillUnivList(uinfo,html)
    printUnivList(uinfo,50)

main()
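
The isinstance check above filters out the whitespace strings that .children also yields. An equivalent formulation (a sketch, not tested against the live page; the function name is my own) uses find_all, which returns only Tag nodes:

from bs4 import BeautifulSoup

def fillUnivList2(ulist, html):
    soup = BeautifulSoup(html, "html.parser")
    tbody = soup.find('tbody')
    if tbody is None:
        return
    # find_all('tr') skips the whitespace-only text nodes automatically
    for tr in tbody.find_all('tr'):
        tds = tr.find_all('td')
        if len(tds) >= 4:
            ulist.append([tds[0].string, tds[1].string, tds[3].string])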
  3. Scraping the Douban movie Top 250
import requests
import bs4
import random
from bs4 import BeautifulSoup

# output template; the title column (field 1) takes its fill character from argument 3
tplt = "{0:^10}\t{1:{3}^10}\t{2:^10}"
# a pool of User-Agent headers to rotate through
user_list = (
{'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv2.0.1) Gecko/20100101 Firefox/4.0.1"},
{'user-agent': "Mozilla/5.0 (Windows NT 6.1; rv2.0.1) Gecko/20100101 Firefox/4.0.1"},
{'user-agent': "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11"},
{'user-agent': "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11"},
{'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"}
)

# fetch the page text
def getHTMLText(url, user_agent):
    try:
        r = requests.get(url, headers=user_agent, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except:
        # return an empty string so the parser simply finds nothing
        return ""

# extract the movie information
def fillUnivList(ulist, html):
    soup = BeautifulSoup(html, "html.parser")
    ol = soup.find('ol')
    if ol is None:      # fetch failed or page structure changed
        return
    for li in ol.children:                    # iterate over the direct children of 'ol'
        if isinstance(li, bs4.element.Tag):    # keep only Tag nodes
            no = li('em')                      # the rank number
            spans = li('span')                 # the first span holds the title
            score = li('span', 'rating_num')   # spans with class 'rating_num' hold the score
            ulist.append([no[0].string, spans[0].string, score[0].string])

# print the first num entries
def printUnivList(ulist, num):
    # chr(12288) is the full-width space; using it as the fill character keeps the Chinese titles aligned
    for i in range(num):
        u = ulist[i]
        print(tplt.format(u[0], u[1], u[2], chr(12288)))

def main():
    uinfo = []
    # pick a random User-Agent to make blocking less likely
    user_agent = random.choice(user_list)
    start_url = "https://movie.douban.com/top250?start="
    page = 0

    print(tplt.format("Rank", "Title", "Score", chr(12288)))

    for i in range(10):
        # build the url of each page; Douban lists 25 movies per page
        url = start_url + str(page) + "&filter="
        html = getHTMLText(url, user_agent)
        fillUnivList(uinfo, html)
        page += 25

    printUnivList(uinfo, len(uinfo))    # print everything actually collected

main()
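
A quick illustration of the chr(12288) trick: a CJK character occupies roughly two ASCII columns, so padding with the ordinary space misaligns mixed-width rows, while the full-width space (chr(12288)) keeps the columns in step. The two rows below are just sample values:

tplt = "{0:^10}\t{1:{3}^10}\t{2:^10}"
# field 1 is padded with full-width spaces, so titles of different lengths stay aligned
print(tplt.format("1", "肖申克的救贖", "9.7", chr(12288)))
print(tplt.format("2", "霸王別姬", "9.6", chr(12288)))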
  4. Scraping Taobao products
import random
import requests
import re

my_headers = {
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.204 Safari/537.36',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'cache-control': 'max-age=0',
    'authority': 's.taobao.com',
    'cookie': 'cna=Tql9FFpD1BICAXE5sLfHz1uP; tracknick=%5Cu51AC%5Cu6696%5Cu590F%5Cu51C9an; tg=0; enc=oHD1Xeg6OXuTiSJD5wGcdhE7YcKsPUjqsvWpi7CaWnpolSL%2F8ZF0oVJN0ZMhcsUiP06eeZ2YU7N%2BKxLtqeVTbQ%3D%3D; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0%26__ll%3D-1%26_ato%3D0; miid=1269960616169477290; thw=cn; hng=CN%7Czh-CN%7CCNY%7C156; t=90fe486baa6f30adf6f5fa32d2be3a20; _m_h5_tk=4455c3bc90f54ff85d75e7e02d5df6e7_1582003278360; _m_h5_tk_enc=6a767ca75b75e9ebe425a0892b2f9eb8; mt=ci%3D-1_0; cookie2=1fd9b17f075b2651c7141bade62f8b76; _tb_token_=ed33e56d3e8ee; v=0; alitrackid=www.taobao.com; lastalitrackid=www.taobao.com; JSESSIONID=2B4CC538D7AEA9C0F49C578476A69C9B; l=cBSwiHYlq4x7AMukBOCN5uI8aO7OSIRYouPRwNVXi_5dv6L_-EbOo5mpdFp6VjWdtZTB4PLEbq99-etkNLe06Pt-g3fP.; isg=BOPj1-iGuYHGzHZiETSah_F2cieN2HcatOqyUBVAP8K5VAN2nagHasGESiTacs8S',
}

# fetch the page; the headers (including the logged-in session cookie) prevent a redirect to the login page
def getHTMLText(url, my_headers):
    try:
        r = requests.get(url, headers=my_headers, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except:
        return ""

# parse the page content
def parasPage(ilt, html):
    try:
        # match the quoted "view_price" and "raw_title" fields embedded in the page script
        plt = re.findall(r'"view_price":"[\d.]*"', html)
        tlt = re.findall(r'"raw_title":".*?"', html)
        for i in range(len(plt)):
            # eval strips the surrounding quotes from the matched fragment
            price = eval(plt[i].split(':')[1])
            title = eval(tlt[i].split(':')[1])
            ilt.append([price, title])
    except:
        print("parse error")

def printGoodsList(ilt):
    tplt = "{:4}\t{:8}\t{:16}"
    print(tplt.format("No.", "Price", "Product name"))
    count = 0
    for g in ilt:
        count += 1
        print(tplt.format(count, g[0], g[1]))

def main():
    # search keyword ('書包' means schoolbag)
    goods = '書包'
    # crawl depth (number of result pages)
    depth = 2
    start_url = 'https://s.taobao.com/search?q=' + goods
    infoList = []

    for i in range(depth):
        try:
            # Taobao lists 44 items per page, paged by the 's' parameter
            url = start_url + '&s=' + str(44*i)
            html = getHTMLText(url, my_headers)
            parasPage(infoList, html)
        except:
            continue
    printGoodsList(infoList)

main()
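
eval works in parasPage only because each captured fragment happens to be a quoted string; evaluating scraped text is risky. A hedged alternative (the function name is my own) strips the quotes directly:

import re

def parsePageSafe(ilt, html):
    plt = re.findall(r'"view_price":"[\d.]*"', html)
    tlt = re.findall(r'"raw_title":".*?"', html)
    for p, t in zip(plt, tlt):
        # split on the first ':' only, then drop the surrounding quotes
        price = p.split(':', 1)[1].strip('"')
        title = t.split(':', 1)[1].strip('"')
        ilt.append([price, title])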
  5. Scraping stock listings
import requests
import re
from bs4 import BeautifulSoup

# basic fetch framework
def getHTMLText(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except:
        return ""

# extract the stock listings from the page
def parasStockPage(html, slist):
    # build the tag tree
    soup = BeautifulSoup(html, 'html.parser')
    # every stock link is an <a> tag whose text ends with a six-digit code in parentheses
    a = soup.find_all('a')
    for i in a:
        try:
            # look for a six-digit code; skip tags without one
            isStock = re.search(r'\(\d{6}\)', i.string)
            if isStock:
                # extract the code and strip it from the name
                code = re.search(r'\d{6}', i.string)
                name = re.sub(r'\(\d{6}\)', '', i.string)
                # store the pair in the list
                slist.append([name, code.group()])
        except:
            # i.string may be None, which makes re.search raise; skip those tags
            continue

def printStockList(slist):
    # formatted string output
    tplt = "{:4}\t{:12}\t{:10}"
    count = 0
    print(tplt.format("No.", "Stock name", "Code"))

    for s in slist:
        count += 1
        print(tplt.format(count, s[0], s[1]))
    
def main():
    stock_list_url = "http://quote.eastmoney.com/stock_list.html"
    slist = []

    html = getHTMLText(stock_list_url)
    parasStockPage(html,slist)
    printStockList(slist)

main()
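
Since the same pattern runs once per <a> tag, compiling it up front with re.compile avoids re-parsing the pattern in every iteration. A sketch of the same extraction with a precompiled pattern (the names are my own):

import re
from bs4 import BeautifulSoup

CODE_RE = re.compile(r'\((\d{6})\)')

def parseStockPageCompiled(html, slist):
    soup = BeautifulSoup(html, 'html.parser')
    for a in soup.find_all('a'):
        text = a.string
        if not text:        # skip tags without a direct string
            continue
        m = CODE_RE.search(text)
        if m:
            # group(1) is the six-digit code without the parentheses
            slist.append([CODE_RE.sub('', text), m.group(1)])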
  6. Batch-scraping images
import requests
import random
import time
from bs4 import BeautifulSoup
import re
import os

#url_begin = "http://www.win4000.com/meinvtag352_1.html"
#root = "D://pics//"

# a pool of User-Agent headers to rotate through
user_list = (
{'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv2.0.1) Gecko/20100101 Firefox/4.0.1"},
{'user-agent': "Mozilla/5.0 (Windows NT 6.1; rv2.0.1) Gecko/20100101 Firefox/4.0.1"},
{'user-agent': "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11"},
{'user-agent': "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11"},
{'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"}
)


# fetch the page text
def getHTMLText(url, user_agent):
    try:
        r = requests.get(url, headers=user_agent, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except:
        return ""



# collect the gallery entry urls from a listing page
def getPicsUrl(htmlText, picsUrls):
    try:
        # re.findall returns a list of every matching gallery url
        isPicsUrls = re.findall(r'http://www.win4000.com/meinv\d{6}.html', htmlText)
        if isPicsUrls:
            picsUrls.extend(isPicsUrls)
    except:
        print("No gallery urls found")



# visit each gallery and collect the full-size image urls
def getPicUrl(picsUrls, picUrls):
    for i in range(len(picsUrls)):
        time.sleep(1)       # pause between requests
        user_agent = random.choice(user_list)
        htmlText = getHTMLText(picsUrls[i], user_agent)
        soup = BeautifulSoup(htmlText, 'html.parser')

        try:
            ul = soup.find(id='scroll')
            # the matched urls are missing their '.jpg' suffix; savePics appends it
            isPicUrls = re.findall(r'http://pic1.win4000.com/pic/\w/\w{2}/[a-z0-9]+', str(ul))
            if isPicUrls:
                picUrls.extend(isPicUrls)
        except:
            print('getPicUrl error')

# request each image url in turn and save the file
def savePics(picUrls, root):
    for i in range(len(picUrls)):
        picUrls[i] = picUrls[i] + ".jpg"
        path = root + str(i) + ".jpg"   # name the saved files by index
        time.sleep(1)
        try:
            # create the root directory if it does not exist
            if not os.path.exists(root):
                os.mkdir(root)
            # download only if the file is not already saved
            if not os.path.exists(path):
                r = requests.get(picUrls[i])
                with open(path, 'wb') as f:
                    f.write(r.content)
                print("File saved")
            else:
                print("File already exists")
        except:
            print("File error")

def main():
    root = "D://pics//"
    picsUrls = []   # gallery urls
    picUrls = []    # image urls
    # number of listing pages to crawl; the gallery index has 5 pages
    pages = 5

    # collect the gallery entry urls; page numbers start at 1
    for i in range(1, pages + 1):
        print("Crawling page " + str(i))
        time.sleep(2)
        # build the url of each listing page
        url = "http://www.win4000.com/meinvtag352_" + str(i) + ".html"
        user_agent = random.choice(user_list)

        # fetch the listing page
        htmlText = getHTMLText(url, user_agent)
        # parse it and append the gallery urls to the list
        getPicsUrl(htmlText, picsUrls)

    # enter each gallery and collect every image url
    print('Running getPicUrl')
    getPicUrl(picsUrls, picUrls)
    print('Running savePics')
    savePics(picUrls, root)

main()
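
The fixed time.sleep calls above pace the requests evenly; a randomized delay is a common variant that makes the crawl look less mechanical. A minimal sketch (the helper name and bounds are my own):

import random
import time

def politeSleep(low=1.0, high=3.0):
    # sleep for a random interval between low and high seconds
    time.sleep(random.uniform(low, high))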
