Python BeautifulSoup爬取鏈家租房信息

# -*- coding: utf-8 -*-
"""
Created on Thu May  3 14:15:00 2018

@author: 
"""

import urllib 
import pandas as pd
from pandas import DataFrame
from bs4 import BeautifulSoup
from urllib import request

# Base URL of the paginated rental listing; the page number is appended to it.
url_l = 'https://bj.lianjia.com/zufang/pg'

# Column order of the exported sheet: house ID, title, community, layout,
# area, orientation, district, floor height, build time, building structure,
# monthly rent.
COLUMNS = ['房屋ID', '標題', '小區', '戶型', '面積', '朝向', '區域',
           '樓層高度', '建成時間', '樓層結構', '月租金']

# Destination workbook, written relative to the current working directory.
path = '鏈家租房.xlsx'


def split_build_info(text):
    """Split the last listing field into ``(build_time, structure)``.

    The field is split on the character '建'.  Only when that yields exactly
    two pieces are they returned as build time and structure; otherwise the
    build time is unknown (``None``) and the whole text is kept as the
    structure.
    """
    parts = text.split('建')
    if len(parts) == 2:
        return parts[0], parts[1]
    return None, text


def parse_house(house):
    """Turn one listing tag into a dict keyed by the names in ``COLUMNS``.

    ``house`` is a BeautifulSoup tag carrying a ``data-id`` attribute and an
    ``info-panel`` div.  Raises the usual lookup errors (``KeyError``,
    ``AttributeError``, ``IndexError``, ``ValueError``) if the markup does
    not have the expected shape, so that layout changes are not hidden.
    """
    houseinfo = house.find('div', class_='info-panel')
    regionlist = list(houseinfo.find('div', class_='where').stripped_strings)
    otherlist = list(houseinfo.find('div', class_='con').stripped_strings)
    build_time, structure = split_build_info(otherlist[-1])
    return {
        '房屋ID': house['data-id'],
        '標題': houseinfo.h2.a['title'],
        '小區': regionlist[0],
        '戶型': regionlist[1],
        '面積': regionlist[2],
        '朝向': regionlist[3],
        # The district text ends with a two-character suffix that is dropped
        # (presumably a fixed label such as "租房" -- confirm against the page).
        '區域': otherlist[0][:-2],
        '樓層高度': otherlist[2],
        '建成時間': build_time,
        '樓層結構': structure,
        '月租金': int(houseinfo.find('span', class_='num').string),
    }


def fetch_houses(page_number):
    """Download one listing page and return its listing tags.

    Only direct child *tags* of the ``house-lst`` list are returned; the
    whitespace text nodes that also appear among the children are skipped,
    because they have no attributes to read.
    """
    req = request.Request(url_l + str(page_number))
    # The context manager closes the HTTP response even if reading fails.
    with request.urlopen(req) as res:
        html = res.read()
    soup = BeautifulSoup(html, 'lxml')
    return soup.find('ul', id='house-lst').find_all(True, recursive=False)


def main(first_page=1, last_page=100, output_path=path):
    """Scrape pages ``first_page``..``last_page`` and export them to Excel.

    Rows are collected in a plain list and the DataFrame is built once at
    the end, instead of growing one Series per column element by element.
    """
    rows = []
    for page_number in range(first_page, last_page + 1):
        for house in fetch_houses(page_number):
            rows.append(parse_house(house))

    frame = DataFrame(rows, columns=COLUMNS)
    frame.to_excel(output_path)
    return frame


if __name__ == '__main__':
    main()
爬取出來的數據如下

但是對於需要登錄後才能訪問的頁面,我試著爬取了一下,還不太成功;想找個簡單的網站練練手也找不到,因為 cookie 都會被服務器 set 更改。蛋疼

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章