一、用 Python 的 BeautifulSoup 爬取網頁中的小說
原來網頁內容:http://www.jueshitangmen.info/tian-meng-bing-can-11.html
#爬蟲
from bs4 import BeautifulSoup
from urllib.request import urlopen
# Download the page. The context manager closes the HTTP response even if
# reading or decoding raises.
with urlopen('http://www.jueshitangmen.info/tian-meng-bing-can-11.html') as response:
    html = response.read().decode('utf-8')

soup = BeautifulSoup(html, features='lxml')

# Collect every <p> tag, echo its text to the console and append it to
# flie.txt. `with` guarantees the file is flushed and closed even if a
# write fails part-way through.
all_p = soup.find_all('p')
with open('flie.txt', 'a', encoding='utf-8') as f:
    for i in all_p:
        print('\n', i.get_text())
        f.write('\n' + i.get_text())

# Same treatment for every <a> tag, appended to flie_a.txt.
all_a = soup.find_all('a')
with open('flie_a.txt', 'a', encoding='utf-8') as f:
    for i in all_a:
        print('\n', i.get_text())
        f.write('\n' + i.get_text())
執行程序效果:
二、用 Python 爬取圖片,以下兩種代碼都可以成功爬取到圖片信息:
代碼一:
import urllib.request
import urllib.parse
import re
import os
# Request headers that make the crawler look like a normal browser.
# Referer is required (otherwise the server answers 403) and User-Agent
# disguises the script as a browser to get past anti-crawler checks.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    "referer": "https://image.baidu.com",
}
# Search endpoint template: {word} is the URL-quoted keyword, {pageNum} is
# sent as the `pn` query parameter, and `rn=30` results are requested per call.
url = "https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord={word}&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=0&word={word}&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&cg=girl&pn={pageNum}&rn=30&gsm=1e00000000001e&1490169411926="

# Number of results per request; must match `rn=` in the URL template.
page_size = 30
# Stop once `pn` reaches this value.
max_offset = 3000
# Directory the downloaded thumbnails are written to.
save_dir = "D://pictest/圖片"

keyword = input("請輸入搜索關鍵詞:")
# Percent-encode the keyword for use in the query string. The second
# positional argument of quote() is `safe`, not the encoding, so both are
# spelled out by keyword; safe="" keeps '/' percent-encoded exactly as the
# previous call quote(keyword, "utf-8") did.
keyword = urllib.parse.quote(keyword, safe="", encoding="utf-8")

# Thumbnail entries in the response have the form "thumbURL":"https://....jpg".
# Compiled once, as a raw string so `\.` is not an invalid str escape.
thumb_pattern = re.compile(r"thumbURL.*?\.jpg")

# Make sure the output directory exists before any download starts.
os.makedirs(save_dir, exist_ok=True)

# Running count of downloaded images; also used to number the files.
j = 0
# Assuming `pn` is the index of the first result (TODO confirm against the
# API), it advances by one page of results per request, starting at 0.
# Stepping by 1 would re-fetch almost the same 30 results every time.
for n in range(0, max_offset, page_size):
    url1 = url.format(word=keyword, pageNum=str(n))
    rep = urllib.request.Request(url1, headers=header)
    try:
        # Fetching and decoding are both inside the try so that a network
        # failure skips this page instead of aborting the whole crawl.
        with urllib.request.urlopen(rep) as response:
            html = response.read().decode("utf-8")
    except Exception as exc:
        # Best-effort crawl: report the failure and move on. `continue`
        # replaces the old sticky `error` flag, which was never reset and
        # therefore skipped every page after the first failure.
        print("出錯啦!", exc)
        print("-------當前頁面數:" + str(n))
        continue

    for i in thumb_pattern.findall(html):
        # Strip the JSON key prefix, leaving only the image URL.
        i = i.replace("thumbURL\":\"", "")
        print(i)
        urllib.request.urlretrieve(i, (save_dir + "/pic{num}.jpg").format(num=j))
        j += 1

print("總共爬取的圖片數:" + str(j))
代碼二:
import urllib.request
import urllib.parse
import re
import os
# Request headers that make the crawler look like a normal browser.
# Referer is required (otherwise the server answers 403) and User-Agent
# disguises the script as a browser to get past anti-crawler checks.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    "referer": "https://image.baidu.com",
}
# Search endpoint template: {word} is the URL-quoted keyword, {pageNum} is
# sent as the `pn` query parameter, and `rn=30` results are requested per call.
url = "https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord={word}&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=0&word={word}&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&cg=girl&pn={pageNum}&rn=30&gsm=1e00000000001e&1490169411926="

# Number of results per request; must match `rn=` in the URL template.
page_size = 30
# Stop once `pn` reaches this value.
max_offset = 3000
# Directory the downloaded thumbnails are written to.
save_dir = 'D://pic'

keyword = input('請輸入搜索關鍵詞:')
# Percent-encode the keyword for use in the query string. The second
# positional argument of quote() is `safe`, not the encoding, so both are
# spelled out by keyword; safe='' keeps '/' percent-encoded exactly as the
# previous call quote(keyword, 'utf-8') did.
keyword = urllib.parse.quote(keyword, safe='', encoding='utf-8')

# Thumbnail entries in the response have the form "thumbURL":"https://....jpg".
# Compiled once, as a raw string so `\.` is not an invalid str escape.
thumb_pattern = re.compile(r'thumbURL.*?\.jpg')

# Make sure the output directory exists before any download starts.
os.makedirs(save_dir, exist_ok=True)

# Running count of downloaded images; also used to number the files.
j = 0
# The URL log is opened once for the whole crawl instead of once per page;
# the `with` block closes it, so no explicit close() is needed.
with open('testPc.txt', 'a', encoding='utf-8') as f:
    # Assuming `pn` is the index of the first result (TODO confirm against
    # the API), start at 0 so the first page of results is not skipped.
    for n in range(0, max_offset, page_size):
        url1 = url.format(word=keyword, pageNum=str(n))
        rep = urllib.request.Request(url1, headers=header)
        try:
            # Fetching and decoding are both inside the try so that a
            # network failure skips this page instead of aborting the crawl.
            with urllib.request.urlopen(rep) as response:
                html = response.read().decode('utf-8')
        except Exception as exc:
            # Best-effort crawl: report the failure and move on. `continue`
            # replaces the old sticky `error` flag, which was never reset
            # and therefore skipped every page after the first failure.
            print('出錯了', exc)
            print('出錯頁數:' + str(n))
            continue

        for i in thumb_pattern.findall(html):
            # Strip the JSON key prefix, leaving only the image URL.
            i = i.replace('thumbURL":"', '')
            print(i)
            # Record the URL, then save the image under save_dir.
            f.write(i)
            f.write('\n')
            urllib.request.urlretrieve(i, (save_dir + '/pic{num}.jpg').format(num=j))
            j += 1

print('總共爬取圖片數爲:' + str(j))
執行程序運行結果: