Scraping web pages with Python: using regular expressions to make short work of messy HTML
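The whole trick is that BeautifulSoup's find() / find_all() accept a compiled regular expression wherever they accept an attribute value, so the framework-generated ids on the target site (j_id46:0:j_id92, j_id46:1:j_id92, ...) can all be matched with one pattern. Here is a minimal, self-contained sketch of just that technique (the HTML snippet is made up for illustration; the real site is masked as http://**** in the script below):

import re
from bs4 import BeautifulSoup

html = """
<table>
  <tr class="rich-table-row"><td id="j_id46:0:j_id92"> A-001 </td></tr>
  <tr class="rich-table-row even"><td id="j_id46:1:j_id92"> A-002 </td></tr>
</table>
"""
soup = BeautifulSoup(html, 'html.parser')
# One pattern matches the id in every row: \d+ absorbs the row index
for cell in soup.find_all('td', {'id': re.compile(r"j_id46:\d+:j_id92")}):
    print(cell.text.strip())      # A-001, then A-002

The full script: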

# -*- coding: utf-8 -*-
"""
Created on Fri Jun 14 17:37:44 2019

@author: User
"""

import re
from bs4 import BeautifulSoup    # the beautifulsoup4 package is imported under the name bs4
import requests
import numpy as np


header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
}

i_pg = 0    # list-page counter

# Record layout for one transaction row. The field names are pinyin:
# bianhao = id, quyu = district, riqi = deal date, nian/yue/ri = year/month/day,
# zhuangtai = deal status, mianji = area, jiage = price, yongtu = land use.
area_dtype = np.dtype([('bianhao', np.str_, 50),
                       ('quyu', np.str_, 30),
                       ('riqi', np.str_, 30),
                       ('nian', np.str_, 30),
                       ('yue', np.str_, 30),
                       ('ri', np.str_, 30),
                       ('zhuangtai', np.str_, 30),
                       ('mianji', np.str_, 30),
                       ('jiage', np.str_, 30),
                       ('yongtu', np.str_, 30)])

# Accumulator for every scraped row; the first row doubles as the CSV header.
array_area_all = np.array([('title', 'quyu',
                            'riqi', 'nian',
                            'yue', 'ri',
                            'zhuangtai', 'mianji',
                            'jiage', 'yongtu')], dtype=area_dtype)

def get_page(url):
    global i_pg, array_area_all
    i_pg += 1
    print('page:', i_pg)

    # The recursion below has no natural stopping condition, so cap the
    # crawl at 50 list pages.
    if i_pg > 50:
        return ''
        
    try:
        response = requests.get(url, timeout=30, headers=header)

        # Raise an HTTPError if the status code is not 200
        response.raise_for_status()
        # Use the detected encoding so the Chinese text decodes correctly
        response.encoding = response.apparent_encoding

        soup = BeautifulSoup(response.text, 'html.parser')
        # Every result row carries a class beginning with 'rich-table-row'
        result_li = soup.find_all(class_=re.compile("rich-table-row"))
        
        j = 0
        # Walk the listing rows on the current page
        for row_text in result_li:

            # j counts rows; a cap here (e.g. "if j > 4: break") limits each
            # list page to its first few rows, which speeds up debugging
            j = j + 1

            # Parcel id
            title_str = row_text.find('td', {'id': re.compile(r"j_id46:\d+:j_id92")})
            title = title_str.text.lstrip()

            # District
            quyu_str = row_text.find('td', {'id': re.compile(r"j_id46:\d+:j_id98")})
            quyu = quyu_str.text.lstrip()

            # Deal price, with the trailing '萬元' (10,000-yuan) unit removed
            chengjiaojia_str = row_text.find('td', {'id': re.compile(r"j_id46:\d+:j_id107")})
            chengjiaojia = chengjiaojia_str.text.lstrip().replace('萬元', '', 1)
            chengjiaojia = my_strip(chengjiaojia)
            if len(chengjiaojia) == 0:
                chengjiaojia = '0'

            # Deal date, e.g. '2019-06-14'
            chengjiao_date_str = row_text.find('td', {'id': re.compile(r"j_id46:\d+:j_id116")})
            chengjiao_date = chengjiao_date_str.text.lstrip()
            
            # Slice year / month / day out of the date string
            nian = chengjiao_date[0:4]
            yue = chengjiao_date[5:7]
            ri = chengjiao_date[8:10]

            # Deal status
            chengjiao_state_str = row_text.find('td', {'id': re.compile(r"j_id46:\d+:j_id119")})
            chengjiao_state = chengjiao_state_str.text.lstrip()

            # Follow the row's detail link; get_page_detail() passes the
            # parcel area and land use back through the globals `area` and `use_1`
            detail_href = row_text.find('a', {'id': re.compile(r"j_id46:\d+:j_id124")})
            detail_url = 'http://****' + detail_href.attrs['href']
            get_page_detail(detail_url)
            
            # Assemble one structured row and stack it onto the accumulator
            area_new = np.array([(title, quyu,
                                  chengjiao_date, nian,
                                  yue, ri,
                                  chengjiao_state,
                                  area, chengjiaojia,
                                  use_1)], dtype=area_dtype)

            array_area_all = np.vstack((array_area_all, area_new))

        # Crawl the next list page: the site pages by row offset, 20 rows per
        # page, passed in the firstResult query parameter
        result_next_page = 'http://****?firstResult=' + str((i_pg - 1) * 20) + '&priceUnit=TotalPrice&logic=and'

        # Recurse into the next page. The URL string is never empty, so the
        # only stopping condition is the 50-page cap at the top of get_page.
        get_page(result_next_page)

        return response.text
    except Exception:
        return 'request failed!'
 
 
# Remove spaces, newlines and tabs inside a string, then strip both ends
def my_strip(s):
    return str(s).replace(" ", "").replace("\n", "").replace("\t", "").strip()

# BeautifulSoup is constructed repeatedly, so wrap it in a small helper
def my_Beautifulsoup(response):
    return BeautifulSoup(str(response), 'html.parser')
 
# Crawl one detail page. Results are passed back through the globals `area`
# (parcel area) and `use_1` (land use), which get_page reads after the call.
def get_page_detail(url):
    response = requests.get(url, headers=header)

    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')

        # Parcel area: the second 'layout' span, minus the '平方米'
        # (square metres) unit
        area_all = soup.find(attrs={'id': 'j_id242'})
        area_2 = area_all.find_all('span', {'class': 'layout'})[1]
        global area
        area = my_strip(area_2.text.lstrip().replace('平方米', '', 1))
        if len(area) == 0:
            area = '0'

        # Land use: the second 'layout' span
        use_all = soup.find(attrs={'id': 'j_id267'})
        use_2 = use_all.find_all('span', {'class': 'layout'})[1]
        global use_1
        use_1 = my_strip(use_2.text.lstrip())

# =============================================================================
        
# Kick off the crawl from the first list page, then dump everything to CSV
get_page('http://****')
print(array_area_all)
np.savetxt('data\\zl.csv', array_area_all, delimiter=',', fmt="%s")
