4.8 實驗記錄(爬取 Stack Overflow 問答)

只爬取指定 class 的 div

# Minimal example: keep only the <div> elements of one CSS class and strip
# their markup.  This is a fragment: `h` (the fetched HTML text) must already
# be defined and `re` must already be imported before it runs.
from bs4 import BeautifulSoup
# Parse the fetched page with Python's built-in HTML parser.
soup=BeautifulSoup(h,'html.parser')
# Collect every <div> whose class is "post-text".
a=soup.find_all('div',class_="post-text")
# Pattern matching any single tag: '<', one or more non-'>' characters, '>'.
ar=re.compile(r'<[^>]+>',re.S)
# Remove the tags from the string form of the whole result collection.
# NOTE(review): str(a) stringifies all matches at once, so the result
# presumably still carries the list's brackets and separators -- confirm.
ad=ar.sub('',str(a))

h:爬取到的 HTML 內容
class_:要爬取的 div 的 class 屬性值
ar:用來匹配 HTML 標籤的正則表達式
ad:去除標籤後的文本

import requests
import re
from bs4 import BeautifulSoup
import time
import re

# Crawl the Stack Overflow search results for "docker cpu" and save the text
# of every post body found on each linked question page.
#
# For each search-result page the script extracts the question links, fetches
# every question page, and writes each <div class="post-text"> (with its tags
# removed) to D://stack//<file_name>[<index>].txt.  The output directory must
# already exist; the script does not create it.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0"}

# Compiled once instead of once per written file.
tag_pattern = re.compile(r'<[^>]+>', re.S)
# Captures (href, title) of every question link on a search-result page.
link_pattern = re.compile('<a href="(.*?)" data-searchsession=".*?" title="(.*?)" class="question-hyperlink">')


def _fetch(page_url):
    """Return the body text of *page_url*, or None when the request fails.

    Failures (network errors, timeouts, non-2xx responses) are reported on
    stdout and skipped so that one bad page does not abort the whole crawl.
    """
    try:
        # Without a timeout a stalled connection would block forever.
        reply = requests.get(page_url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        print("request failed:", page_url, exc)
        return None
    if not reply.ok:
        print("unexpected status", reply.status_code, "for", page_url)
        return None
    return reply.text


# Result pages are numbered from 1, so pages 1..33 cover the 33 pages that the
# original range(33) (0..32) was meant to visit.
for page in range(1, 34):
    listing = _fetch("https://stackoverflow.com/search?page=" + str(page) + "&q=docker+cpu")

    if listing is not None:
        for href, _title in link_pattern.findall(listing):
            # Second-to-last path segment of the link; for links shaped like
            # /questions/<id>/<slug> this is the question id.
            file_name = href.split('/')[-2]
            question_html = _fetch("https://stackoverflow.com" + href)

            if question_html is not None:
                soup = BeautifulSoup(question_html, 'html.parser')
                for i, post in enumerate(soup.find_all('div', class_="post-text")):
                    # The with-statement closes the file; no explicit close().
                    with open("D://stack//" + file_name + '[' + str(i) + '].txt', 'w', encoding="utf-8") as f:
                        f.write(tag_pattern.sub('', str(post)))

            # Throttle once per question request (not per written file), so
            # pages without any matching div are rate-limited as well.
            time.sleep(1)

    # Longer pause between search-result pages.
    time.sleep(10)
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章