Python基礎之爬蟲：爬取小說，圖片示例

原創

岩枭

2020-02-22 17:38

一、用 Python 的 BeautifulSoup 爬取網頁中的小說

原來網頁內容：http://www.jueshitangmen.info/tian-meng-bing-can-11.html

#爬蟲

from  bs4 import  BeautifulSoup
from  urllib.request import urlopen

# Fetch the chapter page and decode it as UTF-8. The response is used as a
# context manager so the connection is released once the body has been read.
with urlopen('http://www.jueshitangmen.info/tian-meng-bing-can-11.html') as response:
    html = response.read().decode('utf-8')

soup = BeautifulSoup(html, features='lxml')

# Dump the text of every <p> tag, then of every <a> tag, to its own file.
# Files are opened in append mode, so re-running the script adds to them.
# The output names (including the "flie" spelling) are kept unchanged so
# earlier output keeps being appended to the same files.
for tag_name, out_path in (('p', 'flie.txt'), ('a', 'flie_a.txt')):
    # `with` guarantees the file is closed even if writing fails midway.
    with open(out_path, 'a', encoding='utf-8') as out_file:
        for tag in soup.find_all(tag_name):
            text = tag.get_text()
            print('\n', text)
            out_file.write('\n' + text)

執行程序效果：

二、用 Python 爬取圖片，以下兩種代碼都可以成功爬取到圖片信息：

代碼一：

import urllib.request
import urllib.parse
import re
import os

# Request headers. Per the original author's notes: Referer is required
# (otherwise the server answers 403), and User-Agent is required so the
# request looks like it comes from a browser and passes the anti-crawler check.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    "referer": "https://image.baidu.com",
}
url = "https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord={word}&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=0&word={word}&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&cg=girl&pn={pageNum}&rn=30&gsm=1e00000000001e&1490169411926="

keyword = input("請輸入搜索關鍵詞：")
# Percent-encode the keyword for use in the query string. The second
# positional argument of quote() is `safe`, not the encoding; UTF-8 is already
# the default encoding, and safe="" yields the same output the original
# call produced.
keyword = urllib.parse.quote(keyword, safe="")

# Loop-invariant setup: output directory and compiled pattern.
save_dir = "D://pictest/圖片"
os.makedirs(save_dir, exist_ok=True)
# The wanted data looks like:
# "thumbURL":"https://ss2.bdstatic.com/.../u=1593875716,602632714&fm=27&gp=0.jpg"
thumb_pattern = re.compile(r"thumbURL.*?\.jpg")

page_size = 30   # matches rn=30 in the request URL
max_offset = 3000

j = 0  # number of images saved so far; also used to name the files
# NOTE(review): `pn` is treated as the index of the first result on the page
# (the second variant of this script steps it by 30 to match rn=30). Stepping
# by 1 fetched almost the same 30 thumbnails on every request — confirm
# against the endpoint if paging looks wrong.
for n in range(0, max_offset, page_size):
    url1 = url.format(word=keyword, pageNum=str(n))
    rep = urllib.request.Request(url1, headers=header)
    try:
        with urllib.request.urlopen(rep) as response:
            html = response.read().decode("utf-8")
    except (OSError, UnicodeDecodeError):
        # A failed page is skipped on its own; there is no sticky error flag,
        # so later pages are still processed.
        print("出錯啦！")
        print("-------當前頁面數：" + str(n))
        continue

    for match in thumb_pattern.findall(html):
        # Strip the JSON key prefix to leave only the image URL.
        image_url = match.replace("thumbURL\":\"", "")
        print(image_url)
        try:
            urllib.request.urlretrieve(
                image_url, save_dir + "/pic{num}.jpg".format(num=j)
            )
        except OSError:
            # One unreachable thumbnail should not abort the whole crawl.
            print("下載失敗：" + image_url)
            continue
        j += 1
    print("總共爬取的圖片數：" + str(j))

代碼二：

import urllib.request
import urllib.parse
import re
import os

# Request headers. Per the original author's notes: Referer is required
# (otherwise the server answers 403), and User-Agent is required so the
# request looks like it comes from a browser and passes the anti-crawler check.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    "referer": "https://image.baidu.com",
}
url = "https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord={word}&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=0&word={word}&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&cg=girl&pn={pageNum}&rn=30&gsm=1e00000000001e&1490169411926="

keyword = input('請輸入搜索關鍵詞：')
# Percent-encode the keyword for use in the query string. The second
# positional argument of quote() is `safe`, not the encoding; UTF-8 is already
# the default encoding, and safe='' yields the same output the original
# call produced.
keyword = urllib.parse.quote(keyword, safe='')

# Loop-invariant setup: output directory and compiled pattern.
save_dir = 'D://pic'
os.makedirs(save_dir, exist_ok=True)
thumb_pattern = re.compile(r'thumbURL.*?\.jpg')

page_size = 30   # matches rn=30 in the request URL
max_offset = 3000

j = 0  # number of images saved so far; also used to name the files
# NOTE(review): `pn` is treated as the index of the first result on the page.
# The original incremented before the first request and therefore never asked
# for offset 0; this loop starts at 0 — confirm against the endpoint.
for n in range(0, max_offset, page_size):
    url1 = url.format(word=keyword, pageNum=str(n))
    rep = urllib.request.Request(url1, headers=header)
    try:
        with urllib.request.urlopen(rep) as response:
            html = response.read().decode('utf-8')
    except (OSError, UnicodeDecodeError):
        # A failed page is skipped on its own; there is no sticky error flag,
        # so later pages are still processed.
        print('出錯了')
        print('出錯頁數：' + str(n))
        continue

    # findall returns a list of the matched "thumbURL...jpg" fragments.
    matches = thumb_pattern.findall(html)
    # Every found URL is appended to a text log before it is downloaded.
    # The `with` block closes the log; no explicit close() is needed.
    with open('testPc.txt', 'a', encoding='utf-8') as f:
        for match in matches:
            # Strip the JSON key prefix to leave only the image URL.
            image_url = match.replace('thumbURL":"', '')
            print(image_url)
            f.write(image_url)
            f.write('\n')
            try:
                # Save the image into D://pic
                urllib.request.urlretrieve(
                    image_url, save_dir + '/pic{num}.jpg'.format(num=j)
                )
            except OSError:
                # One unreachable thumbnail should not abort the whole crawl.
                print('下載失敗：' + image_url)
                continue
            j += 1

print('總共爬取圖片數爲:' + str(j))

執行程序運行結果：