Web Crawler

1. Install Python (note that Python 2 and Python 3 are not compatible with each other).
2. Run the terminal as administrator and install beautifulsoup4: pip install beautifulsoup4
3. Install requests: pip install requests
4. Analyze the HTML structure of the target page.
A minimal example:

import requests
from bs4 import BeautifulSoup

resp = requests.get("target URL")
soup = BeautifulSoup(resp.text, 'html.parser')
# Inspect the page and locate the relevant tags and class names
title = soup.find('ul', class_='detaila').text.strip()
content = soup.find('ul', class_='detailc').text.strip()

file_name = '{}.txt'.format(title)
with open(file_name, 'w', encoding='utf-8', newline='') as f:
    f.write(content)
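
The snippet above assumes the request succeeds and that both ul tags exist. A slightly more defensive sketch (same class names as above, with a placeholder URL) checks the status code and bails out if a tag is missing:

import requests
from bs4 import BeautifulSoup

resp = requests.get("target URL", timeout=10)
resp.raise_for_status()  # raise immediately on a 4xx/5xx response
soup = BeautifulSoup(resp.text, 'html.parser')

title_tag = soup.find('ul', class_='detaila')
content_tag = soup.find('ul', class_='detailc')
if title_tag is None or content_tag is None:
    raise SystemExit('Expected tags not found; re-check the page structure.')

with open('{}.txt'.format(title_tag.text.strip()), 'w', encoding='utf-8', newline='') as f:
    f.write(content_tag.text.strip())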

Example: multi-threaded crawling of 飛華網 (fh21.com.cn)

import re
import sys
import time
import requests
import threading
from urllib import parse
from bs4 import BeautifulSoup


ori_url = 'http://dise.fh21.com.cn/department/illnesses.html'
session = requests.session()
root_urls = []  # absolute URLs of all department listing pages
tag_urls = []  # absolute URLs of all collected disease pages
times = 16  # number of worker threads


def main():
    soup = request_get(ori_url)
    # collect the absolute URL of every department's listing page
    for root in soup.find_all('ul', class_='level2'):
        for tag in root.find_all('a', class_='link08 '):
            root_urls.append(parse.urljoin(ori_url, tag['href']))

    for url in root_urls:
        soup = request_get(url)
        if soup == 'pass':  # request_get returns the string 'pass' on a non-200 response
            #print('Skip this one url above.', file=sys.stderr)
            continue
        list_root = soup.find('div', class_='dise_list')
        for a in list_root.find_all('a', class_='link08'):
            target = a.get('href')
            tag_urls.append(target)
        # follow pagination: fetch the page after the current one and keep collecting links
        page_tab = soup.find('div', class_='pageStyle')
        if page_tab:
            next_page = page_tab.find('span', class_='current').next_sibling
            if next_page:
                soup = request_get(parse.urljoin(ori_url, next_page.get('href')))
                scrape_list_page(soup)
    #print('A total of {} urls were scraped.'.format(len(tag_urls)), file=sys.stderr)
    #print('--------    Start saving...    --------', file=sys.stderr)

    count = 0
    temp = len(tag_urls) // times  # how many urls each thread handles
    #print(temp)
    #print(type(temp))
    threads = []
    while count < times:
        t = threading.Thread(target=process_task, args=(tag_urls[(temp * count):(temp * (count + 1))],))
        threads.append(t)
        count += 1
    if (temp * count) < len(tag_urls):
        # the trailing comma matters: args must be a tuple, not a bare list
        t = threading.Thread(target=process_task, args=(tag_urls[(temp * count):],))
        threads.append(t)
    for t in threads:
        t.start()
    for t in threads:
        t.join()

    tag_urls.clear()
    root_urls.clear()
    #print('All completed.', file=sys.stderr)


def request_get(url):
    resp = session.get(url)
    #print(url)
    if resp.status_code != 200:
        #print('404', file=sys.stderr)
        return 'pass'
    return BeautifulSoup(resp.text, 'lxml')


def scrape_list_page(soup):
    # everything after the second '.dise_list_title' heading is re-parsed so its '.link08' links can be collected
    for a in BeautifulSoup(str(list(soup.select('.dise_list_title')[1].next_siblings)), 'html.parser').select('.link08'):
        target = a.get('href')
        tag_urls.append(target)
    page_tab = soup.find('div', class_='pageStyle')
    if page_tab:
        next_page = page_tab.find('span', class_='current').next_sibling
        if next_page:
            soup = request_get(parse.urljoin(ori_url, next_page.get('href')))
            scrape_list_page(soup)


def process_task(targets):
    for url in targets:
        time.sleep(1)  # throttle: one request per second per thread
        soup = request_get(url)
        if soup == 'pass':  # skip pages that did not return 200
            continue
        detail_url = parse.urljoin(url, soup.select('p[data-seq="3"] > a')[0].get('href'))
        save_txt(request_get(detail_url))


def save_txt(soup):
    # characters that are illegal in file names are replaced with '@'
    title = re.sub(r'[\\/\:\*\?"\<\>\|]', '@', (soup.find('div', class_='navigator').find_all('a', class_='link04')[2].text)).strip()
    content = soup.find('ul', class_='detailc').text.strip()
    file_name = '{}.txt'.format(title)
    with open(file_name, 'w', encoding='utf-8', newline='') as f:
        f.write(content)

if __name__ == '__main__':
    main()
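
The manual slicing of tag_urls into `times` chunks can also be delegated to the standard-library thread pool. The sketch below is an alternative, not part of the original script; it assumes it sits in the same file (it reuses request_get, save_txt, parse, time, tag_urls and times), and process_one is a hypothetical helper mirroring one iteration of process_task:

from concurrent.futures import ThreadPoolExecutor

def process_one(url):
    # handle a single url, mirroring one loop iteration of process_task
    time.sleep(1)
    soup = request_get(url)
    if soup == 'pass':
        return
    detail_url = parse.urljoin(url, soup.select('p[data-seq="3"] > a')[0].get('href'))
    save_txt(request_get(detail_url))

with ThreadPoolExecutor(max_workers=times) as pool:
    pool.map(process_one, tag_urls)  # the pool schedules the urls across worker threads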