1、數據爬取

房天下的網站，用最便捷的requests+xpath定位爬取。
由於房地產市場有一定的飽和，新房的數據量太小，因此選擇二手房的數據

import requests
import re
from lxml import etree
import csv
import time

# Output CSV for the scraped listings. A raw string avoids the invalid "\ "
# escape sequence, and the stray space before the file name is removed.
fp = open(r'E:\fangtianxia.csv', 'wt', newline='', encoding='utf-8')
writer = csv.writer(fp)
# Header row; get_info() writes its fields in exactly this order.
writer.writerow(('city', 'name', 'loc', 'size', 'area', 'price', 'price_sum', 'dire', 'floor', 'buildtime', 'advantage'))
# Browser-like request headers sent with every request.
# The duplicated "cache-control" entry was removed and the "sec-fetch-*" names
# no longer contain spaces (a header field name must not contain whitespace).
# NOTE(review): "br" is advertised in accept-encoding; confirm a brotli decoder
# is installed, otherwise drop it so the response body stays decodable.
headers = {
    'Connection': 'close',
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "zh-CN,zh;q=0.9",
    "cache-control": "no-cache",
    "cookie": "global_cookie=cvgwqloe7oksvtftupwmtsn1o20jztnjsd5; city=sz; Integrateactivity=notincludemc; integratecover=1; SKHRecordssz=%252c%25e5%25b1%2585%25e5%25ae%25b6%25e4%25b8%2589%25e6%2588%25bf%252c%25e7%2589%25a9%25e4%25b8%259a%252c%25e4%25b8%259a%25e4%25b8%25bb%25e8%25af%259a%25e5%25bf%2583%25e5%2587%25ba%25e5%2594%25ae%257c%255e2019%252f8%252f27%2b19%253a56%253a33%257c%255e0%257c%2523%25e5%25a4%25a7%25e8%25bf%2590%25e6%2596%25b0%25e5%259f%258e%2b%25e5%258e%2585%25e5%2587%25ba%25e9%2598%25b3%25e5%258f%25b0%2b%25e7%25b2%25be%25e8%25a3%2585%25e4%25b8%2589%25e6%2588%25bf%2b%25e6%25bb%25a1%25e4%25b8%25a4%25e5%25b9%25b4%257c%255e2019%252f8%252f27%2b19%253a56%253a41%257c%255e0; __utma=147393320.1831537449.1566899575.1566905739.1566993019.4; __utmz=147393320.1566993019.4.4.utmcsr=search.fang.com|utmccn=(referral)|utmcmd=referral|utmcct=/captcha-c342d934c8/; g_sourcepage=ehlist; __utmc=147393320; logGuid=a4782b6a-96fe-4bbf-90e4-395577d22851; __utmt_t0=1; __utmt_t1=1; __utmt_t2=1; __utmb=147393320.18.10.1566993019; unique_cookie=U_klome40gpefgacg4y0p3st5ko1sjzv86iuc*6",
    "pragma": "no-cache",
    "referer": "https://sz.esf.fang.com/",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "none",
    "sec-fetch-user": "?1",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36",
}
# Listing-page URLs of every city; filled by the __main__ section.
city_list = []

def get_info(city_url):
    """Scrape one listing page and append its listings to the output CSV.

    The city code and the page number are parsed out of ``city_url``
    (e.g. ``https://sz.esf.fang.com/house/i32/``). The page is fetched with
    the module-level ``headers`` and every ``<dl dataflag="bg">`` node is
    written as one row through the module-level ``writer``.

    A listing that lacks one of the mandatory fields is skipped on its own;
    previously the first such listing silently discarded the rest of the page.

    Network errors (including the 30 s timeout) propagate to the caller.
    """
    page_no = re.search(r'house/i3(.*?)/', city_url).group(1)
    city_name = re.search(r'//(.*?)\.esf', city_url).group(1)
    print('正爬取{}第{}頁'.format(city_name, page_no))
    # verify=False below would otherwise emit an InsecureRequestWarning per call.
    requests.packages.urllib3.disable_warnings()
    # A finite timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(city_url, headers=headers, timeout=30, verify=False)
    selector = etree.HTML(response.text)
    for info in selector.xpath('//dl[@dataflag="bg"]'):
        try:
            name = info.xpath('dd/p[2]/a/@title')
            name = name[0] if name else ' '
            loc = info.xpath('dd/p[2]/span/text()')[0]
            size = info.xpath('dd/p/text()[1]')[0].strip()
            area = info.xpath('dd/p/text()[2]')[0].strip()[:-2]  # drop the 2-char unit suffix
            dire = info.xpath('dd/p/text()[4]')[0].strip()
            floor = info.xpath('dd/p/text()[3]')[0].strip()
            buildtime = info.xpath('dd/p/text()[5]')
            buildtime = buildtime[0].strip() if buildtime else '未知'
            price = info.xpath('dd[2]/span[2]/text()')[0].strip()[:-4]  # drop the 4-char unit suffix
            pricesum = info.xpath('dd[2]/span/b/text()')[0].strip()
            # string(.) concatenates the text of all nested tags.
            advantage = info.xpath('dd/p[3]')[0].xpath('string(.)').strip()
            advantage = advantage if advantage else '無'
        except IndexError:
            # A mandatory node is missing for this listing only; keep going.
            continue
        print(city_name, name, loc, size, area, dire, floor, buildtime, price, pricesum, advantage)
        writer.writerow((city_name, name, loc, size, area, price, pricesum, dire, floor, buildtime, advantage))

if __name__ == '__main__':
    # fang.com sub-domain codes of the Guangdong prefecture-level cities to crawl.
    city_name = ['sz', 'gz', 'zh', 'shaoguan', 'st', 'fs', 'zj', 'zhaoqing', 'jm', 'maoming', 'huizhou', 'meizhou',
                 'shanwei', 'heyuan', 'yangjiang', 'qingyuan', 'dg', 'zs', 'chaozhou', 'jieyang', 'yunfu']
    urls = ['https://{}.esf.fang.com'.format(city) for city in city_name]
    print(urls)

    # Step 1: read each city's page count and build its listing-page URLs.
    # Errors are handled per city, so one city without a page counter no
    # longer stops the discovery of all the cities after it.
    for url in urls:
        try:
            response = requests.get(url, headers=headers, timeout=30)
            page = re.findall('<p>共(.*?)頁</p>', response.text)[0]
        except (IndexError, requests.RequestException) as exc:
            print('skip {}: {!r}'.format(url, exc))
            continue
        print(page)
        city_urls = [url + '/house/i3' + str(i) + '/' for i in range(1, int(page) + 1)]
        print(city_urls)
        city_list.extend(city_urls)

    # Step 2: scrape every listing page. A failed page is not retried; the
    # crawler pauses 5 seconds and moves on to the next page.
    for city_ in city_list:
        try:
            get_info(city_)
        except Exception as exc:  # no bare except: Ctrl-C must still stop the crawl
            print("Connection refused by the server..")
            print(repr(exc))
            print("Let me sleep for 5 seconds")
            time.sleep(5)
            print("now let me continue...")
            continue

fp.close()

2、明確需求與目的

當今時代，房價問題一直處於風口浪尖，房價的上漲抑或下跌都牽動着整個社會的利益，即便是政府出臺各種政策方針也只能是暫時抑制樓市的漲勢，對於需要買房的人來說，除了關注這些變化和政策外，還有一個非常頭疼的問題，在哪裏買房，房價怎樣。一般的人會不停花大量精力逛鏈家、安居客等房地產網站，藉助他們展示的內容進行篩選，但因地區衆多，各個地段、房價差異的對比以及入手時機的把握，都得自己去一個個查閱與分析，非常麻煩。倘若可以通過數據的爬取，再按照自己希望的維度統計、分析與展示，會讓數據變得清晰明瞭。本項目旨在提取並展示數據，爲剛需購房者提供有用信息。

數據預覽

提出問題

1、廣東省房價的總體情況如何？
2、高端小區都有哪些？
3、廣東省小區的命名偏好
4、廣深兩地的房源分佈如何
5、廣深房價與房屋面積大小的關係如何？
6、廣深地區房源分佈的地鐵線以及房價與距地鐵線距離的關係
7、廣深地區房屋朝向
8、廣深地區建設年份集中情況
9、廣深地區熱門戶型

3、數據預處理

第一步導入相關的庫，並做相關設置

import os
os.chdir('H:\\ana\data')#切換到指定路徑
import numpy as np
import pandas as pd
from pyecharts import Map,Bar,WordCloud,Pie
import matplotlib.pyplot as plt
import re 
import seaborn as sns
from scipy import stats
plt.style.use('ggplot')
from pylab import mpl
mpl.rcParams['font.sans-serif'] = ['SimHei']  #解決seaborn中文字體顯示問題
plt.rc('figure', figsize=(10, 10))  #把plt默認的圖片size調大一點
plt.rcParams["figure.dpi"] =mpl.rcParams['axes.unicode_minus'] = False # 解決保存圖像是負號'-'顯示爲方塊的問題
%matplotlib inline

第二步加載數據集

# Load the scraped listings from the Excel workbook.
data=pd.read_excel('房產數據.xlsx')
print(data.shape)# print (number of rows, number of columns)
# data.head()  # show the first rows
# data.tail()  # show the last rows
data.sample(10)# show 10 randomly chosen rows

數據清洗

缺失值

通過info查看數據信息。
也可以通過isnull與sum結合，查看缺失值情況。

# Column dtypes and non-null counts.
data.info()

# Number of missing values per column.
data.isnull().sum()

缺失值佔總數據的10%左右時，可以直接刪去；若達到30%左右及以上，則可以採用填充的方法，視情況用均值、中位數或衆數來填充。

# Drop every row that contains a missing value, modifying `data` in place.
data.dropna(axis=0, inplace=True)
# Re-check the missing-value count per column after the drop.
data.isnull().sum()

異常值(對連續性標籤做處理）

通過describe查看數值信息。
可配合箱線圖輔助。
異常值可以刪除，視爲缺失值，或者不處理。

# Summary statistics of the numeric columns.
data.describe()

# Draw each box plot on its own figure; two consecutive calls would otherwise
# draw onto the same axes when run in one cell.
plt.figure()
sns.boxplot(data=data['price'])
plt.figure()
sns.boxplot(data=data['area'])

箱線圖包括最小值，四分之一位點q1，中位點，四分三位點q3，最大值，離羣點。
離羣點定義爲小於q1 - 1.5IQR,大於q3 + 1.5IQR(q3-q1=IQR)。
離羣點可能爲異常值，但就此看這些離羣點都是算在一個合理的範圍內的。

第一張圖爲價格的箱線圖，離羣點很多，說明廣東省存在個別市的房價差異巨大，但大多數處於較低的水平。這一現象符合認知，珠三角城市房價大於其他地級市房價。
第二張圖爲房屋面積的箱線圖，也存在不少的離羣點，面積位於100-200平米的房屋佔絕大多數，存在特別大面積的房屋有可能是集體宿舍，經瀏覽網頁發現也有可能存在商家亂填刷單的現象，屬於異常值，需要後續清洗

# Remove area / price outliers; both thresholds are subjective choices.
too_large = data['area'] > 300
data = data.drop(data.index[too_large])
too_expensive = data['price'] > 200000
data = data.drop(data.index[too_expensive])

異常值(對離散標籤做處理）

1、朝向

# List the distinct orientation values to spot malformed entries.
data['dire'].unique()

由於極其少數網站頁面沒有朝向的數據，而是把後面建成時間的數據提前，因此有些錯亂。
但無妨，我們只需做個簡單的過濾，保留有正確朝向的數據

# Keep only rows whose orientation is one of the recognised values.
valid_dire = ['東北向', '北向', '南向', '東向', '西南向', '南北向', '東南向', '西北向', '西向', '東西向']
index = data.index[~data['dire'].isin(valid_dire)]
data = data.drop(index)

2、建成時間

# List the distinct construction-year values.
data['buildtime'].unique()

發現存在幾項異常的時間，有可能是地產商預計的建成時間，我們不考慮未建成的房屋，因此做一個過濾

# Drop listings whose construction year is one of these future years.
future_years = ['2020年建','2021年建', '2022年建', '2025年建']
index = data.index[data['buildtime'].isin(future_years)]
data = data.drop(index)

3、樓層

# List the distinct floor descriptions.
data['floor'].unique()

這一項數據非常凌亂，而且存在有一些極其異常的數據，我們做一個過濾

# Remove a handful of listings with implausible total floor counts.
bad_floors = ['低層（共302層）','低層（共215層）','低層（共130層）', '低層（共220層）','低層（共142層）']
index = data.index[data['floor'].isin(bad_floors)]
data = data.drop(index)

4、房屋佈局

# List the distinct room/hall layouts.
data['size'].unique()

室和廳數量比較多的查看後發現均爲集體宿舍或大型別墅，在一個比較合理的範圍

# Drop listings that report zero rooms and zero halls.
empty_layouts = [ '0室0廳']
index = data.index[data['size'].isin(empty_layouts)]
data = data.drop(index)

最後將清洗後的數據保存，把廣州深圳的數據另外保存，下面會着重分析

# Save the cleaned data set, plus one extra file each for Shenzhen and Guangzhou.
data.to_csv('data_clean.csv',index=False)
for city_label, csv_name in (('深圳', 'shenzhen_data_clean.csv'),
                             ('廣州', 'guangzhou_data_clean.csv')):
    city_rows = data.loc[data.city == city_label, :]
    city_rows.to_csv(csv_name, index=False)

4、數據分析

問題1、廣東省房價的總體情況如何？

# Mean price and listing count per city, highest mean first.
g = data.groupby("city")
price_stats = g["price"].agg(["mean", "count"])
r = price_stats.sort_values(by="mean", ascending=False)
display(r)
r.plot(kind="bar")

很明顯發現，珠三角城市房價位居前列，粵東粵西的邊緣城市靠後，其中廣州深圳的房價顯著超出平均水平。數據量上珠三角城市也明顯佔優，說明大城市的房地產市場更加火爆。而小城市中陽江的數據量也比較大，個人認爲應該是當地海陵島的旅遊業比較火爆，帶動房產市場。

接下來繪製一個價格地圖，更加直觀的展示數據

# The map component needs full city names, so append '市' to every name.
city_sum = [city + '市' for city in r.index.tolist()]
price_avg = r['mean'].tolist()
# NOTE: the name `map` shadows the builtin; kept because it is this cell's public name.
map_style = dict(title_color="#fff", title_pos="center",
                 width=1200, height=600,
                 background_color='#404a59')
map = Map('廣東省各地級市平均房價', '單位：元/平方米', **map_style)
series_style = dict(maptype='廣東', visual_range=[7000,35000],
                    is_visualmap=True, visual_text_color='#000',
                    is_label_show=True)
map.add("", city_sum, price_avg, **series_style)

問題2、高端小區都有哪些？

我們發現就算在珠三角地區中，城市內的各個樓盤也存在很大的價格差異。
定義房價大於10萬的小區爲高端小區，作一個展示

# Collect, city by city (in the order of `r`), the listings priced above 100000.
city_sum = r.index.tolist()
upscale_community = pd.DataFrame()
for city in city_sum:
    is_upscale = (data.city == city) & (data.price > 100000)
    upscale_community = pd.concat([upscale_community, data.loc[is_upscale, :]],
                                  axis=0, ignore_index=True)

# Unique community names, all drawn with the same weight in the word cloud.
upscale_community = upscale_community.loc[:, 'name'].drop_duplicates()
name = upscale_community.tolist()
value = [1] * len(name)
wordcloud = WordCloud(width=1500, height=800)
wordcloud.add('', name, value, word_size_range=[10,20])

問題3、廣東省小區的命名偏好

import jieba
import jieba.analyse
# Load the cleaned data set; every column is read as text.
rows = pd.read_csv('data_clean.csv', header=0, encoding='utf-8', dtype=str)

segments = []
# The second column is read by position (presumably the community name, given
# the scraper's column order -- confirm against data_clean.csv). `.iloc`
# replaces `row[1]`, whose integer-as-position lookup on a label-indexed
# Series is deprecated in recent pandas.
for content in rows.iloc[:, 1]:
    # TextRank keyword extraction restricted to the listed parts of speech.
    words = jieba.analyse.textrank(content, topK=50, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v'))
    # One record per keyword occurrence; the counts are summed below.
    segments.extend({'word': word, 'count': 1} for word in words)

dfSg = pd.DataFrame(segments)

# Word-frequency table
dfWord = dfSg.groupby('word')['count'].sum()
dfWord.sort_values(ascending=False)[:30]	# show the 30 most frequent words

輸出結果如下

從上看出廣東省的小區偏好以花園、廣場、時代、國際等詞命名。同時也看出房地產商保利在廣東省佔據一定的市場份額

問題4、廣深兩地的房源分佈如何

想要把房源的分佈在地圖上展示出來，需要經緯度的數據，這裏調用百度地圖的API，把已有的地址數據轉換爲經緯度數據

import json
from urllib.request import urlopen, quote
import requests,csv
import pandas as pd 
def getlnglat(address):
    """Geocode ``address`` with the Baidu geocoder and return the parsed JSON reply.

    The reply is returned unmodified; callers read
    ``['result']['location']['lng' / 'lat']`` from it.
    """
    url = 'http://api.map.baidu.com/geocoder/v2/'
    output = 'json'
    ak = '###############'  # placeholder: put your own Baidu Maps API key here
    add = quote(address)  # percent-encode the (Chinese) address for the query string
    uri = url + '?' + 'address=' + add + '&output=' + output + '&ak=' + ak
    # The context manager closes the HTTP response, which was previously leaked.
    with urlopen(uri) as req:
        res = req.read().decode()
    return json.loads(res)

# file = open(r'H:\ana\point.json','w')  # optional JSON output file (disabled)
# Geocode every Guangzhou listing and store longitude / latitude next to it.
# A dedicated name is used instead of rebinding the global `data`, so the
# cells after this one still see the full cleaned data set.
gz_geo = pd.read_csv(r'H:\ana\data\guangzhou_data_clean.csv')  # Guangzhou as the example
for i in range(len(gz_geo)):
    address = '廣州市' + gz_geo.loc[i, 'loc']
    price = gz_geo.loc[i, 'price']
    try:
        # One request per row yields both coordinates. The original called the
        # API twice, the second time with the undefined name `b`.
        location = getlnglat(address)['result']['location']
        lng, lat = location['lng'], location['lat']
    except KeyError:
        # The reply carries no location for this address; skip only this row.
        continue
    gz_geo.loc[i, 'lng'] = lng
    gz_geo.loc[i, 'lat'] = lat
    gz_geo.loc[i, 'point'] = str(lng) + ',' + str(lat)
    # `count` carries the listing price (the original used the undefined name `c`).
    str_temp = '{"lat":' + str(lat) + ',"lng":' + str(lng) + ',"count":' + str(price) + '},'
    print(str_temp, i)  # can be pasted into the Baidu heat-map API sample

gz_geo.to_csv('gz_latlon.csv', index=False)

獲取經緯度後，在百度地圖平臺上上傳相關帶有經緯度的數據即可製作相關的熱力地圖
結果如下：
深圳

廣州

可以發現，深圳的房源分佈較爲均勻，大多集中在南山區和福田區。圖示中點越大代表價格越高，發現深圳灣周邊和福田區中心的位置房價相對高很多。
廣州房源的分佈集中在白雲區和天河區，也有一些小的集羣點分佈在廣州北站、廣州東站等交通樞紐附近，而廣州南站較爲偏僻，比較少房子分佈

問題5、廣深房價與房屋面積大小的關係如何？

def area_price_relation(city):
    """Plot price against area for one city, with a regression fit.

    Reads ``<city>_data_clean.csv`` and returns the seaborn joint grid.
    """
    listings = pd.read_csv('{}_data_clean.csv'.format(city))
    joint = sns.jointplot(x='area', y='price', data=listings,
                          kind='reg', stat_func=stats.pearsonr)
    joint.fig.set_dpi(100)
    axes = joint.ax_joint
    axes.set_xlabel('面積', fontweight='bold')
    axes.set_ylabel('價格', fontweight='bold')
    return joint

area_price_relation('shenzhen')

area_price_relation('guangzhou')

可見價格與面積之間有一定的正相關關係。深圳中面積的影響更大，說明深圳的房價受波動更大，房價的不穩定性更大。

問題6、廣深地區房源分佈的地鐵線以及房價與距地鐵線距離的關係

def get_distance(city, data=data, city_short=None):
    """Extract metro line and metro distance per listing and chart listings per line.

    For every listing of ``city`` the numbers in its ``advantage`` text are
    extracted. When exactly two numbers are found, the first is taken as the
    metro line and the second as the distance, which is stored in a new
    ``distance`` column. The result is saved as ``<city_short>_distance.csv``.

    Parameters:
        city: value of the ``city`` column to select, e.g. '深圳'.
        data: full listing table; defaults to the global table at definition time.
        city_short: file-name prefix. Defaults to 'sz' for '深圳' and 'gz' for
            '廣州', the names the later cells read ('sz_distance.csv',
            'gz_distance.csv'); any other city falls back to ``city`` itself.

    Returns:
        A pyecharts ``Bar`` with the number of listings per metro line.
    """
    from collections import Counter

    if city_short is None:
        city_short = {'深圳': 'sz', '廣州': 'gz'}.get(city, city)

    data1 = data.loc[data.city == city, :].reset_index(drop=True)
    stations = []   # metro line of every listing that has one
    distances = []  # one entry per row; None where no line/distance pair was found
    for text in data1['advantage']:
        numbers = re.findall(r'\d+', text)
        if len(numbers) == 2:
            stations.append(numbers[0])
            distances.append(numbers[1])
        else:
            distances.append(None)
    data1['distance'] = distances
    data1.to_csv('{}_distance.csv'.format(city_short), index=False)  # consumed by later cells

    line_counts = Counter(stations)
    # Sort numerically; sorting the strings would put line 10 before line 2.
    lines = sorted(line_counts, key=int)
    station_name = ['{}號線'.format(line) for line in lines]
    station_count = [line_counts[line] for line in lines]
    bar = Bar('')
    bar.add('', station_name, station_count,
            is_label_show=True, is_more_utils=True)
    return bar

get_distance('深圳')

get_distance('廣州')

def distance_price_relation(city_short):
    """Plot price against metro distance for one city, with a regression fit.

    Reads ``<city_short>_distance.csv``, ignores rows without a distance and
    returns the seaborn joint grid.
    """
    listings = pd.read_csv('{}_distance.csv'.format(city_short))
    with_distance = listings.dropna(subset=['distance'])
    joint = sns.jointplot(x='distance', y='price', data=with_distance,
                          kind='reg', stat_func=stats.pearsonr)
    joint.fig.set_dpi(100)
    axes = joint.ax_joint
    axes.set_xlabel('最近地鐵距離', fontweight='bold')
    axes.set_ylabel('價格', fontweight='bold')
    return joint

distance_price_relation('sz')

distance_price_relation('gz')

從上可見，深圳房子多分佈在1、2、3號線，廣州多分佈在2、3、5、6號線。
房價與距離地鐵站的距離有一定的負相關關係，距離越近，房價越高的趨勢大。

問題7、廣深地區房屋朝向

def dire_pie(city_short):
    """Return a pie chart of listing orientations read from ``<city_short>_distance.csv``."""
    orientation_counts = pd.read_csv('{}_distance.csv'.format(city_short)).dire.value_counts()
    labels = orientation_counts.index.tolist()
    amounts = orientation_counts.values.tolist()
    pie = Pie('朝向統計餅狀圖', title_pos='center')
    pie.add('餅圖', labels, amounts, is_label_show=True,
            legend_orient='vertical', legend_pos='left',
            is_more_utils=True)
    return pie

dire_pie('sz')

dire_pie('gz')

很明顯發現朝南的房子佔50%以上。說明很多房地產商會偏向於建造朝南的房子，以吸引顧客
朝南的房子有其優點：
1、由於我國位於北半球，大部分時間陽光從南方照射過來，而居住南面則房屋採光良好；
2、夏天時，強烈的下午陽光會偏向北方，南面的房屋可以避免下午陽光造成的高溫；
3、冬天時，陽光會偏向於南面房屋，使得房屋在寒冷季節可以保持比較溫暖。

問題8、廣深地區建設年份集中情況

def time_pie(city):
    """Return a pie chart of the five most frequent construction years of ``city``.

    Reads the global ``data`` table.
    """
    year_counts = data[data.city == city].buildtime.value_counts()
    top_years = year_counts.index.tolist()[:5]
    top_counts = year_counts.values.tolist()[:5]
    pie = Pie('建年統計餅狀圖', title_pos='center')
    pie.add('餅圖', top_years, top_counts, is_label_show=True,
            legend_orient='vertical', legend_pos='left',
            is_more_utils=True)

    return pie

time_pie('深圳')

time_pie('廣州')

從上可發現，廣深地區的房子集中在2014和2015兩年，一定程度上說明這兩年是房地產業迅猛發展的兩年。同時2018年在前列，一定程度上說明廣深地區流動人口占有較大的比重，房屋商品化，二手房的交易市場較熱。也有大量年代較遠的房子在售，說明這些老房子有一定的市場。

問題9、廣深地區熱門戶型

def size(n,data=data):
    """Return a bar chart of the five most common layouts in city ``n``."""
    layout_ranking = data[data.city == n]['size'].value_counts()
    size_count = layout_ranking.values[:5]
    size_kind = layout_ranking.index[:5]
    bar = Bar('戶型排行')
    bar.add('', size_kind, size_count, is_label_show=True, is_more_utils=True)
    return bar

size('深圳')

size('廣州')

從上得出，廣深地區熱門戶型非常一致，其中最熱門爲3室2廳

5、機器學習預測房價

採用機器學習算法綜合考慮多個因素對房價的影響，建立預測模型。
首先要將數據轉換爲可以作爲模型輸入的矩陣形式

sz_data=pd.read_csv('sz_distance.csv')
gz_data=pd.read_csv('gz_distance.csv')
def transform(data):
    """Add three 0/1 flag columns derived from the ``advantage`` text, in place.

    Each row's ``advantage`` text is tokenised with jieba; a flag is 1 when
    its keyword appears among the tokens and 0 otherwise.
    """
    # (keyword looked for among the tokens, column that receives the flag)
    keyword_columns = (
        ('滿二', 'exemption of business tax'),
        ('滿五', 'exemption of double tax'),
        ('教育', 'quality education'),
    )
    for i in range(len(data)):
        tokens = list(jieba.cut(data.loc[i, 'advantage']))
        for keyword, column in keyword_columns:
            data.loc[i, column] = 1 if keyword in tokens else 0

transform(sz_data)
transform(gz_data)

觀察數據，發現房屋優勢特徵中滿二、滿五、優質教育的字段很多，因此單獨轉換爲0和1，作爲輸入。

new_data=pd.DataFrame()
def datatrans(new_data,data,dire_sum=list(gz_data['dire'].unique())):
    """Fill ``new_data`` in place with numeric model features derived from ``data``.

    Parameters:
        new_data: (empty) DataFrame that receives the feature columns.
        data: listing table that already carries the three flag columns
            added by ``transform``.
        dire_sum: list of orientation labels; an orientation is encoded as its
            1-based position in this list. An orientation missing from the
            list raises ``ValueError``.
    """
    # Floor description keyword -> ordinal code; the first match wins.
    floor_codes = (('低層', 1), ('中層', 2), ('高層', 3))

    new_data['city'] = data['city']
    new_data['area'] = data['area']
    # buildtime holds text such as '2014年建', which astype('float') cannot
    # parse. Take the four-digit year; anything without one becomes NaN.
    build_year = data['buildtime'].astype(str).str.extract(r'(\d{4})', expand=False)
    new_data['buildtime'] = pd.to_numeric(build_year, errors='coerce')
    new_data['distance'] = data['distance']
    for i in range(len(data)):
        # The layout text contains the room count followed by the hall count.
        s = re.findall(r'\d+', data.loc[i, 'size'])
        new_data.loc[i, 'room_num'] = float(s[0])
        new_data.loc[i, 'hall_num'] = float(s[1])

        floor_text = data.loc[i, 'floor']
        for keyword, code in floor_codes:
            if keyword in floor_text:
                new_data.loc[i, 'floor'] = code
                break

        dire = data.loc[i, 'dire']
        new_data.loc[i, 'dire'] = dire_sum.index(dire) + 1

    for column in ('exemption of business tax', 'exemption of double tax', 'quality education'):
        new_data[column] = data[column]

datatrans(new_data,sz_data)
new_data1=pd.DataFrame()
datatrans(new_data1,gz_data)
new_data1=pd.concat([new_data1,new_data],axis=0,ignore_index=True)

進一步處理數據，將樓層按照低中高分別賦值1、2、3作爲輸入。
再用正則表達式將房屋佈局的數據拆分爲房間數量和客廳數量兩個特徵輸入。
將各個不同朝向的數據轉化爲1-8作爲輸入

# Target values, stacked in the same order as the feature rows
# (Guangzhou first, then Shenzhen).
gz_price = gz_data['price']
sz_price = sz_data['price']
price = pd.concat([gz_price,sz_price],axis=0,ignore_index=True)
# One-hot encode the city and drop the original text column.
new_data1 = new_data1.join(pd.get_dummies(new_data1.city))
new_data1.drop('city',axis=1,inplace=True)
# Store the target next to the features: the modelling cell reads this file
# and drops a "price" column, which previously was never written.
new_data1['price'] = price
new_data1.to_csv('new_data7.20.csv',index=False)

當前數據有11個特徵（房屋面積、建成時間、距地鐵站距離、房間數、客廳數、樓層、方向、是否滿二、是否滿五、是否優質教育、城市）和1個標記（房價）。因爲預測目標——房價是一個連續變量，因此本項目中的價格預測是一個迴歸問題。

數據預處理

data = pd.read_csv('new_data7.20.csv')
# Plain assignment instead of chained fillna(inplace=True), which acts on a
# temporary object under pandas copy-on-write.
data['distance'] = data['distance'].fillna(5000)  # no metro distance found -> 5000
data['buildtime'] = data['buildtime'].fillna(data['buildtime'].mode()[0])  # most frequent year
X = data.drop(["price"], axis=1)
y = data["price"]  # regression target; it was used below without being defined

# Hold out a random 25% of the samples as the test set.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.25)


# Standardise the features; the scaler is fitted on the training split only.
from sklearn.preprocessing import StandardScaler
ss_x = StandardScaler()
x_train = ss_x.fit_transform(x_train)
x_test = ss_x.transform(x_test)

線性迴歸

from sklearn.linear_model import LinearRegression
from sklearn.metrics import  r2_score

# Fit an ordinary least-squares model and predict the held-out set.
lr = LinearRegression()
lr.fit(x_train, y_train)
lr_y_predict = lr.predict(x_test)
# Evaluate with the R^2 score.
lr_r2 = r2_score(y_test, lr_y_predict)
print("LinearRegression模型的R方得分爲：", lr_r2)

# Compare the first 100 true and predicted values.
plt.figure(figsize=(15, 5))
curves = (
    (y_test.values[:100], "-r", "真實值"),
    (lr_y_predict[:100], "-g", "預測值"),
)
for values, line_style, label in curves:
    plt.plot(values, line_style, label=label)
plt.legend()
plt.title("線性迴歸預測結果")

KNN

from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV

# Two sub-grids: uniform weights, and distance weights with Minkowski power p.
uniform_grid = {'weights': ['uniform'], 'n_neighbors': list(range(1, 12))}
distance_grid = {
    'weights': ['distance'],
    'n_neighbors': list(range(1, 12)),
    'p': list(range(1, 6)),
}
param_grid = [uniform_grid, distance_grid]
knnrgr = KNeighborsRegressor()
grid_search = GridSearchCV(knnrgr, param_grid)
grid_search.fit(x_train, y_train)

用網格搜索法尋找最佳參數，訓練結果爲

其他的迴歸模型

# The estimators below were used without being imported.
from sklearn.ensemble import ExtraTreesRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import BayesianRidge, ElasticNet, Lasso, Ridge, SGDRegressor
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR
# NOTE(review): XGBRegressor comes from the third-party xgboost package, which
# this file never imports; make sure `from xgboost import XGBRegressor` has run.

# Each label sits next to its estimator so the two cannot drift apart.
# Fixed labels: Ridge was printed as "鄰迴歸" and KernelRidge as "L2正則線性迴歸".
named_models = [
    ("嶺迴歸", Ridge()),
    ("Lasso迴歸", Lasso(alpha=0.01, max_iter=10000)),
    ("隨機森林", RandomForestRegressor()),
    ("梯度提升樹", GradientBoostingRegressor()),
    ("支持向量機", SVR()),
    ("彈性網絡", ElasticNet(alpha=0.001, max_iter=10000)),
    ("梯度下降回歸", SGDRegressor(max_iter=1000, tol=1e-3)),
    ("貝葉斯線性迴歸", BayesianRidge()),
    ("核嶺迴歸", KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)),
    ("極端隨機森林迴歸", ExtraTreesRegressor()),
    ("Xgboost迴歸", XGBRegressor(max_depth=5, learning_rate=0.1, n_estimators=160, silent=False, objective='reg:gamma')),
]
names = [label for label, _ in named_models]
models = [estimator for _, estimator in named_models]
# Print, per model: name, R^2 on the test set, mean squared error on the test set.
for name, model in named_models:
    model.fit(x_train, y_train)
    predicted = model.predict(x_test)
    print("{}: {:.6f}, {:.4f}".format(name, model.score(x_test, y_test), mean_squared_error(y_test, predicted)))

結果如下

模型調參

class grid():
    """Run a 5-fold grid search for one estimator and print the outcome."""

    def __init__(self, model):
        self.model = model

    def grid_get(self, X, y, param_grid):
        """Search ``param_grid`` on ``(X, y)``; print the best result and the per-setting scores."""
        search = GridSearchCV(self.model, param_grid, cv=5, n_jobs=-1)
        search.fit(X, y)
        print(search.best_params_, search.best_score_)
        report_columns = ['params', 'mean_test_score', 'std_test_score']
        results = pd.DataFrame(search.cv_results_)
        print(results[report_columns])

Lasso迴歸調參

# Grid-search the Lasso regularisation strength `alpha` (max_iter fixed at 10000).
grid(Lasso()).grid_get(x_train,y_train,{'alpha': [0.0004,0.0005,0.0007,0.0006,0.0009,0.0008],'max_iter':[10000]})

嶺迴歸調參

# Grid-search the Ridge regularisation strength `alpha`.
grid(Ridge()).grid_get(x_train,y_train,{'alpha':[35,40,45,50,55,60,65,70,80,90]})

核嶺迴歸調參

# Grid-search KernelRidge with a degree-3 polynomial kernel over `alpha` and `coef0`.
param_grid={'alpha':[0.2,0.3,0.4,0.5], 'kernel':["polynomial"], 'degree':[3],'coef0':[0.8,1,1.2]}
grid(KernelRidge()).grid_get(x_train,y_train,param_grid)

彈性網絡調參

# Grid-search ElasticNet over `alpha` and `l1_ratio` (max_iter fixed at 10000).
grid(ElasticNet()).grid_get(x_train,y_train,{'alpha':[0.0005,0.0008,0.004,0.005],'l1_ratio':[0.08,0.1,0.3,0.5,0.7],'max_iter':[10000]})

模型加權平均集成

def r2(model,X,y):
    """Return the per-fold R^2 scores of ``model`` from 5-fold cross-validation."""
    fold_scores = cross_val_score(model, X, y, scoring="r2", cv=5)
    return fold_scores


class AverageWeight(BaseEstimator, RegressorMixin):
    """Regressor that predicts a fixed weighted sum of several base regressors.

    Parameters:
        mod: list of unfitted base estimators; each one is cloned before fitting.
        weight: one weight per estimator, in the same order as ``mod``.
    """

    def __init__(self,mod,weight):
        self.mod = mod
        self.weight = weight

    def fit(self,X,y):
        """Fit a fresh clone of every base estimator on ``(X, y)`` and return self."""
        self.models_ = [clone(x) for x in self.mod]
        for model in self.models_:
            model.fit(X,y)
        return self

    def predict(self,X):
        """Return the weighted sum of the base predictions as a 1-D array.

        Assumes single-target regressors. A weight list whose length differs
        from the number of models raises ``ValueError``; the former ``zip``
        silently ignored the surplus entries.
        """
        # pred has one row per model and one column per sample.
        pred = np.array([model.predict(X) for model in self.models_])
        return np.dot(np.asarray(self.weight, dtype=float), pred)
        
# Base models configured with the tuned hyper-parameters.
lasso = Lasso(alpha=0.0009, max_iter=10000)
ridge = Ridge(alpha=35)
ker = KernelRidge(alpha=0.5, kernel='polynomial', degree=3, coef0=0.8)
ela = ElasticNet(alpha=0.005, l1_ratio=0.3, max_iter=10000)
bay = BayesianRidge()

# Blend weights, assigned according to each model's R^2 score.
w1, w2, w3, w4, w5 = 0.15, 0.15, 0.4, 0.15, 0.15

base_models = [lasso, ridge, ker, ela, bay]
blend_weights = [w1, w2, w3, w4, w5]
weight_avg = AverageWeight(mod=base_models, weight=blend_weights)
r2(weight_avg, x_train, y_train)
r2(weight_avg, x_train, y_train).mean()

模型融合

class stacking(BaseEstimator, RegressorMixin, TransformerMixin):
    """Stacked regressor: base models produce out-of-fold predictions that train a meta model.

    Parameters:
        mod: list of base estimators; a fresh clone is fitted on every fold.
        meta_model: estimator fitted on the out-of-fold predictions. It is
            fitted in place, not cloned.
    """

    def __init__(self,mod,meta_model):
        self.mod = mod
        self.meta_model = meta_model
        self.kf = KFold(n_splits=5, random_state=42, shuffle=True)

    def fit(self,X,y):
        """Fit every base model on every fold, then the meta model; return self."""
        # Positional fold indices need array inputs; a pandas object would be
        # indexed by label instead.
        X = np.asarray(X)
        y = np.asarray(y)
        self.saved_model = [list() for i in self.mod]
        oof_train = np.zeros((X.shape[0], len(self.mod)))

        for i,model in enumerate(self.mod):
            for train_index, val_index in self.kf.split(X,y):
                renew_model = clone(model)
                renew_model.fit(X[train_index], y[train_index])
                self.saved_model[i].append(renew_model)
                oof_train[val_index,i] = renew_model.predict(X[val_index])

        self.meta_model.fit(oof_train,y)
        return self

    def predict(self,X):
        """Average each base model's fold predictions and feed them to the meta model."""
        # column_stack needs a real sequence, so a list replaces the generator.
        whole_test = np.column_stack([
            np.column_stack([model.predict(X) for model in single_model]).mean(axis=1)
            for single_model in self.saved_model
        ])
        return self.meta_model.predict(whole_test)

    def get_oof(self,X,y,test_X):
        """Return ``(oof, test_mean)``.

        ``oof`` holds the out-of-fold predictions on ``X`` (one column per base
        model). ``test_mean`` holds, per base model, the mean over the folds of
        its predictions on ``test_X``.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        # Use the splitter's own fold count instead of a hard-coded 5.
        n_folds = self.kf.get_n_splits()
        oof = np.zeros((X.shape[0],len(self.mod)))
        test_single = np.zeros((test_X.shape[0],n_folds))
        test_mean = np.zeros((test_X.shape[0],len(self.mod)))
        for i,model in enumerate(self.mod):
            for j, (train_index,val_index) in enumerate(self.kf.split(X,y)):
                clone_model = clone(model)
                clone_model.fit(X[train_index],y[train_index])
                oof[val_index,i] = clone_model.predict(X[val_index])
                test_single[:,j] = clone_model.predict(test_X)
            test_mean[:,i] = test_single.mean(axis=1)
        return oof, test_mean
# Impute the training features and target, then build the stacked model
# (KernelRidge is reused as the meta model).
# NOTE(review): `Imputer` is not imported anywhere in the visible source;
# recent scikit-learn releases provide `sklearn.impute.SimpleImputer` instead
# -- confirm against the installed version.
# NOTE(review): `a`, `b` and `stack_model` are not used again in the visible
# source; `stack_model` is never fitted here.
a = Imputer().fit_transform(x_train)
b = Imputer().fit_transform(y_train.values.reshape(-1,1)).ravel()
stack_model = stacking(mod=[lasso,ridge,ker,ela,bay],meta_model=ker)

6、總結

本項目收集了廣東省二手房數據，着重分析廣深地區的房價。首先採用統計分析的方法對數據進行初步分析，大致瞭解房價分佈及其影響因素；隨後調用百度地圖API，實現數據地圖可視化。最後採用機器學習方法建模預測，並比較了幾種常用迴歸模型的預測效果。
基本符合一個完整數據分析案例的要求，採用直觀的數據可視化方式展示數據，並通過數據分析爲二手房購買者提供建設性意見。但仍有很多不足的地方，如並沒有對數據進行特徵工程，沒有進行特徵的轉換和篩選，機器學習模型的調參也比較簡略，因此預測能力還有很大的提升空間。

python房價數據的爬取與分析

Python房產數據分析