Kaggle: Energy Consumption Prediction, Top 1%

This is a Kaggle competition I took part in during November and December 2019: ASHRAE - Great Energy Predictor III.

Final result: top 1%, ranked 22/3614. Just a hair short of a gold medal. Near the top of the leaderboard, gaining each place felt like alchemy; there is a real element of luck involved. Tough going.
I won't go into the competition requirements in detail here; just read the official competition page.

At the time I organized my solution into a notebook. It isn't easy to share in full, but the key code follows:

# coding: utf-8

# ## 1. Competition Overview: ASHRAE - Great Energy Predictor III

# > In real life, buildings consume a lot of energy; in summer, for example, air conditioning is needed for cooling. That is not just a financial cost but also an environmental one. To reduce energy consumption, we need to predict how much energy buildings will use. This competition combines weather data, building metadata, and hot- and chilled-water energy consumption data to predict energy usage. Below is the official description.
# 
# ### How much energy will a building consume?
#  
# * Q: How much does it cost to cool a skyscraper in the summer?
# * A: A lot! And not just in dollars, but in environmental impact.
# 

# > The data files used in this competition are described on the official site; I won't translate the descriptions here, you can look them up yourself.
# 


# # <a id='1-2'>1.2 Evaluation Metric</a> (<a href='#9'>Back</a>)

# The evaluation metric for this competition is RMSLE, Root Mean Squared Logarithmic Error.
# It can be computed with the following code:
# ```
# from sklearn.metrics import mean_squared_log_error
# loss = np.sqrt(mean_squared_log_error(y_test, predictions))
# ```
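
# The metric is defined as
#
# $$\mathrm{RMSLE} = \sqrt{\frac{1}{n}\sum_{i=1}^{n}\big(\log(p_i + 1) - \log(a_i + 1)\big)^2}$$
#
# where $p_i$ is the predicted meter reading and $a_i$ the actual one.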

# # 2. Importing Libraries

# In[2]:


import random
import datetime
import os,gc,math
import numpy as np 
import pandas as pd
import seaborn as sns
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from pandas.api.types import is_categorical_dtype
from pandas.api.types import is_datetime64_any_dtype as is_datetime
get_ipython().run_line_magic('matplotlib', 'inline')


# # <a id='3'>3. Loading and Compressing the Data</a> (<a href='#9'>Back</a>)

# In[3]:


# %%time (timed cell in the original notebook)
data_path = './ashrae-energy-prediction/'
train_df = pd.read_csv(data_path + 'train.csv')
test_df = pd.read_csv(data_path + 'test.csv')
weather_train_df = pd.read_csv(data_path + 'weather_train.csv')
weather_test_df = pd.read_csv(data_path + 'weather_test.csv')
building_meta_df = pd.read_csv(data_path + 'building_metadata.csv')
sample_submission = pd.read_csv(data_path + 'sample_submission.csv')


# In[4]:


train_df.head()


# In[5]:


train_df.columns


# In[6]:


weather_train_df.columns


# In[7]:


## Downcast numeric columns to save memory
## (optional: skip it if you have plenty of RAM, since the lower precision can slightly hurt accuracy)
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
                    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: 
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
        
    return df


# In[8]:


train_df = reduce_mem_usage(train_df)
test_df = reduce_mem_usage(test_df)
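

# A quick illustration (toy values, just for demonstration) of why downcasting
# can cost a little accuracy: float16 keeps only about 3 significant decimal
# digits, so large meter readings get rounded.

# In[ ]:


x = np.float64(1234.567)
print(np.float16(x))  # 1235.0 -- adjacent float16 values are 1.0 apart in this range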


# # <a id='4'>4. Data Analysis</a> (<a href='#9'>Back</a>)

# In[9]:


train_df.head()


# In[10]:


test_df.head()


# In[10]:


weather_train_df.head()


# In[11]:


weather_test_df.head()


# In[12]:


building_meta_df.head()


# In[13]:


train_df.dtypes


# In[14]:


weather_train_df.dtypes


# In[15]:


building_meta_df.dtypes


# In[16]:


# Distribution of the target variable (only values in [0, 500) are shown)
a = plt.hist(train_df['meter_reading'], bins=range(0, 500))


# In[17]:


train_df.groupby(["meter"])["meter_reading"].agg(['mean','std'])
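

# (For reference, the meter codes are 0 = electricity, 1 = chilledwater, 2 = steam, 3 = hotwater.)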




# In[19]:


# Correlation analysis (weather features)
num_cols = ['air_temperature','cloud_coverage','dew_temperature','precip_depth_1_hr','sea_level_pressure','wind_direction','wind_speed' ]
plt.figure(figsize=(5,5))
sns.heatmap(weather_train_df[num_cols].dropna(inplace=False).corr(), annot=True)


# In[20]:


# Correlation analysis (building features)
num_cols2 = ['square_feet','floor_count']
plt.figure(figsize=(5,5))
sns.heatmap(building_meta_df[num_cols2].dropna(inplace=False).corr(),annot=True)


# In[21]:


# Plot the distribution of primary_use
plt.figure(figsize=(15, 8))
data = building_meta_df['primary_use'].value_counts()  # Series
sns.barplot(x=data.index, y=data.values)  # newer seaborn requires keyword arguments here


# # <a id='5'>5. Feature Engineering</a> (<a href='#9'>Back</a>)

# In[22]:


def fill_weather_dataset(weather_df):
    
    # Use the gap between the earliest and latest timestamps to compute the total number of hours
    time_format = "%Y-%m-%d %H:%M:%S"
    start_date = datetime.datetime.strptime(weather_df['timestamp'].min(),time_format)
    end_date = datetime.datetime.strptime(weather_df['timestamp'].max(),time_format)
    total_hours = int(((end_date - start_date).total_seconds() + 3600) / 3600)
    hours_list = [(end_date - datetime.timedelta(hours=x)).strftime(time_format) for x in range(total_hours)]
    
    # Add rows for the hours missing at each site
    missing_hours = []
    for site_id in range(16):
        site_hours = np.array(weather_df[weather_df['site_id'] == site_id]['timestamp'])
        new_rows = pd.DataFrame(np.setdiff1d(hours_list,site_hours),columns=['timestamp'])
        new_rows['site_id'] = site_id
        weather_df = pd.concat([weather_df,new_rows])

        weather_df = weather_df.reset_index(drop=True)           

    # Add new date features: day, week, month
    weather_df["datetime"] = pd.to_datetime(weather_df["timestamp"])
    weather_df["day"] = weather_df["datetime"].dt.day
    weather_df["week"] = weather_df["datetime"].dt.week
    weather_df["month"] = weather_df["datetime"].dt.month
    
    # Re-index by (site_id, day, month) so update() can align on the group keys
    weather_df = weather_df.set_index(['site_id','day','month'])
    
    # Fill missing air temperature with the (site_id, day, month) group mean
    air_temperature_filler = pd.DataFrame(weather_df.groupby(['site_id','day','month'])['air_temperature'].mean(),columns=["air_temperature"])
    weather_df.update(air_temperature_filler,overwrite=False)

    # Fill missing cloud coverage with the (site_id, day, month) group mean,
    # forward-filling groups that are entirely missing
    cloud_coverage_filler = weather_df.groupby(['site_id','day','month'])['cloud_coverage'].mean()
    cloud_coverage_filler = pd.DataFrame(cloud_coverage_filler.ffill(),columns=["cloud_coverage"])
    weather_df.update(cloud_coverage_filler,overwrite=False)
    
    # Fill missing dew temperature with the (site_id, day, month) group mean
    dew_temperature_filler = pd.DataFrame(weather_df.groupby(['site_id','day','month'])['dew_temperature'].mean(),columns=["dew_temperature"])
    weather_df.update(dew_temperature_filler,overwrite=False)

    # Fill missing sea-level pressure with the (site_id, day, month) group mean,
    # forward-filling groups that are entirely missing
    sea_level_filler = weather_df.groupby(['site_id','day','month'])['sea_level_pressure'].mean()
    sea_level_filler = pd.DataFrame(sea_level_filler.ffill(),columns=['sea_level_pressure'])
    weather_df.update(sea_level_filler,overwrite=False)
    
    # Fill missing wind direction with the group mean
    wind_direction_filler =  pd.DataFrame(weather_df.groupby(['site_id','day','month'])['wind_direction'].mean(),columns=['wind_direction'])
    weather_df.update(wind_direction_filler,overwrite=False)
    
    # Fill missing wind speed with the group mean
    wind_speed_filler =  pd.DataFrame(weather_df.groupby(['site_id','day','month'])['wind_speed'].mean(),columns=['wind_speed'])
    weather_df.update(wind_speed_filler,overwrite=False)

    # Fill missing precipitation depth with the group mean,
    # forward-filling groups that are entirely missing
    precip_depth_filler = weather_df.groupby(['site_id','day','month'])['precip_depth_1_hr'].mean()
    precip_depth_filler = pd.DataFrame(precip_depth_filler.ffill(),columns=['precip_depth_1_hr'])
    weather_df.update(precip_depth_filler,overwrite=False)
    
    # Drop the helper columns
    weather_df = weather_df.reset_index()
    weather_df = weather_df.drop(['datetime','day','week','month'],axis=1)
        
    return weather_df
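

# The workhorse above is `DataFrame.update(..., overwrite=False)`: with the frame
# indexed by (site_id, day, month), `update` aligns the group-mean frame on that
# index and fills only the NaN cells, leaving observed values untouched.
# A toy sanity check (made-up numbers):

# In[ ]:


demo = pd.DataFrame({'site_id': [0, 0, 0], 'day': [1, 1, 1], 'month': [1, 1, 1],
                     'air_temperature': [10.0, np.nan, 14.0]}).set_index(['site_id', 'day', 'month'])
filler = pd.DataFrame(demo.groupby(['site_id', 'day', 'month'])['air_temperature'].mean())
demo.update(filler, overwrite=False)
print(demo['air_temperature'].tolist())  # [10.0, 12.0, 14.0] -- only the NaN was filled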


# In[23]:


# Add rolling lag features (window=24 means the mean air temperature over the past 24 hours)
def add_lag_feature(weather_df, window=3):
    group_df = weather_df.groupby(['site_id','building_id'])
    cols = ['air_temperature'] 
    rolled = group_df[cols].rolling(window=window, min_periods=0)
    lag_mean = rolled.mean().reset_index().astype(np.float16)
#     lag_std = rolled.std().reset_index().astype(np.float16)
    for col in cols:
        weather_df[f'{col}_mean_lag{window}'] = lag_mean[col]
#         weather_df[f'{col}_std_lag{window}'] = lag_mean[col]
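
# Quick sanity check of the rolling mean above (toy data): with window=3 and
# min_periods=0, each row gets the mean of itself and up to two previous rows
# within its (site_id, building_id) group.
_toy = pd.DataFrame({'site_id': [0, 0, 0, 0], 'building_id': [1, 1, 1, 1],
                     'air_temperature': [10.0, 12.0, 14.0, 16.0]})
add_lag_feature(_toy, window=3)
print(_toy['air_temperature_mean_lag3'].tolist())  # [10.0, 11.0, 12.0, 14.0]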

# Frequency-encode categorical columns (map each value to its relative frequency)
def encode_FE(df,cols):
    for col in cols:
        vc = df[col].value_counts(dropna=True, normalize=True).to_dict()
        vc[-1] = -1
        nm = col+'_FE'
        df[nm] = df[col].map(vc)
        df[nm] = df[nm].astype('float16')
        print(nm,', ',end='')
        
# Main feature engineering
def features_engineering(df,categorical_features):
    
    # Sort by timestamp (these methods return new frames, so assign the results back)
    df = df.sort_values("timestamp")
    df = df.reset_index(drop=True)
    
    # Add time features
    df["timestamp"] = pd.to_datetime(df["timestamp"],format="%Y-%m-%d %H:%M:%S")
    df["hour"] = df["timestamp"].dt.hour
    df["weekend"] = df["timestamp"].dt.weekday
    
    # Add a US-holiday indicator
    holidays = ["2016-01-01", "2016-01-18", "2016-02-15", "2016-05-30", "2016-07-04",
                    "2016-09-05", "2016-10-10", "2016-11-11", "2016-11-24", "2016-12-26",
                    "2017-01-02", "2017-01-16", "2017-02-20", "2017-05-29", "2017-07-04",
                    "2017-09-04", "2017-10-09", "2017-11-10", "2017-11-23", "2017-12-25",
                    "2018-01-01", "2018-01-15", "2018-02-19", "2018-05-28", "2018-07-04",
                    "2018-09-03", "2018-10-08", "2018-11-12", "2018-11-22", "2018-12-25",
                    "2019-01-01"]
    df["is_holiday"] = (df.timestamp.isin(holidays)).astype(int)
    
    # Log-transform the building area
    df['square_feet'] =  np.log1p(df['square_feet'])
    # Frequency encoding of primary_use
    encode_FE(df,['primary_use'])
    
    # Drop features we won't use
    drop = ["timestamp","sea_level_pressure", "wind_direction", "wind_speed","year_built","floor_count"]
    df = df.drop(drop, axis=1)
    gc.collect()
    
    # Label encode
    for c in categorical_features:
        le = LabelEncoder()
        df[c] = le.fit_transform(df[c])
    add_lag_feature(df,24)
    return df
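

# Quick sanity check of the frequency encoding above (toy data): each category
# is mapped to its relative frequency in the column.

# In[ ]:


_toy = pd.DataFrame({'primary_use': ['Education', 'Education', 'Office']})
encode_FE(_toy, ['primary_use'])
print(_toy['primary_use_FE'].tolist())  # approximately [0.667, 0.667, 0.333]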


# In[26]:


# Building 1099 has anomalous readings, and the site-0 electricity meters
# (building_id <= 104, meter 0) are unreliable before 2016-05-20; drop these outliers
train_df = train_df [ train_df['building_id'] != 1099 ]
train_df = train_df.query('not (building_id <= 104 & meter == 0 & timestamp <= "2016-05-20")')
weather_train_df = fill_weather_dataset(weather_train_df)
train_df = train_df.merge(building_meta_df, left_on='building_id',right_on='building_id',how='left')
train_df = train_df.merge(weather_train_df,how='left',left_on=['site_id','timestamp'],right_on=['site_id','timestamp'])
train_df = features_engineering(train_df,['primary_use','primary_use_FE'])
train_df.head(10)

# weather_train_df = reduce_mem_usage(weather_train_df)
# weather_test_df = reduce_mem_usage(weather_test_df)
train_df = reduce_mem_usage(train_df)
train_df.to_pickle("../input/train.pickle")  # note: to_pickle takes no index argument
del train_df, weather_train_df
gc.collect()

row_ids = test_df["row_id"]
test_df.drop("row_id", axis=1, inplace=True)
weather_test_df = fill_weather_dataset(weather_test_df)
test_df = test_df.merge(building_meta_df, left_on='building_id',right_on='building_id',how='left')
test_df = test_df.merge(weather_test_df,how='left',left_on=['site_id','timestamp'],right_on=['site_id','timestamp'])
test_df = features_engineering(test_df,['primary_use','primary_use_FE'])
test_df.head(10)
test_df = reduce_mem_usage(test_df)
test_df.to_pickle("../input/test.pickle")  # note: to_pickle takes no index argument
del test_df, weather_test_df
gc.collect()


# # <a id='6'>6. Training a Regression Model</a> (<a href='#9'>Back</a>)
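
# Since the metric is RMSLE, we train on log1p(meter_reading) with an RMSE
# objective; predictions must later be inverted with expm1.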

# In[ ]:


train_df = pd.read_pickle("../input/train.pickle")  # reload the features saved in section 5 (train_df was deleted above)
target = np.log1p(train_df["meter_reading"])
features = train_df.drop(['meter_reading'], axis = 1)
categorical_features = ["building_id", "site_id", "meter", "primary_use", "weekend","primary_use_FE"]
selected_features = ['building_id',
 'meter',
 'site_id',
 'primary_use',
 'square_feet',
 'primary_use_FE',
 'air_temperature',
 'cloud_coverage',
 'dew_temperature',
 'hour',
 'weekend',
 'air_temperature_mean_lag24']


# In[ ]:


params = {
    "objective": "regression",
    "boosting": "gbdt",
    "num_leaves": 1280,
    "learning_rate": 0.01,
    "feature_fraction": 0.85,
    "reg_lambda": 2,
    "metric": "rmse",
    "random_seed":10
}
kf = KFold(n_splits=3)


# In[ ]:


models = []
history = []
for train_index,test_index in kf.split(features):
    train_features = features.loc[train_index][selected_features]
    train_target = target.loc[train_index]
    
    test_features = features.loc[test_index][selected_features]
    test_target = target.loc[test_index]
    
    d_training = lgb.Dataset(train_features, label=train_target,categorical_feature=categorical_features, free_raw_data=False)
    d_test = lgb.Dataset(test_features, label=test_target,categorical_feature=categorical_features, free_raw_data=False)
    
    # (in lightgbm >= 4, replace verbose_eval/early_stopping_rounds with
    #  callbacks=[lgb.log_evaluation(25), lgb.early_stopping(50)])
    model = lgb.train(params, train_set=d_training, num_boost_round=1000,valid_sets=[d_training,d_test], verbose_eval=25, early_stopping_rounds=50)
    models.append(model)
    del train_features, train_target, test_features, test_target, d_training, d_test
    gc.collect()


# In[ ]:


cv_scores = np.mean([model.best_score['valid_1']['rmse'] for model in models])
cv_scores


# # <a id='7'>7. Plotting Feature Importance</a> (<a href='#9'>Back</a>)

# In[ ]:


for model in models:
    lgb.plot_importance(model)
    plt.show()
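

# # <a id='8'>8. Generating the Submission</a> (<a href='#9'>Back</a>)

# The original notebook stops at the feature-importance plots, so the following
# is only a minimal sketch of the remaining inference step: reload the saved
# test features, average the predictions of the three fold models, invert the
# log1p transform, and write `submission.csv`. (In practice you would predict
# in chunks to limit memory use.)

# In[ ]:


test_df = pd.read_pickle("../input/test.pickle")
predictions = np.zeros(len(test_df))
for model in models:
    predictions += np.expm1(model.predict(test_df[selected_features],
                                          num_iteration=model.best_iteration)) / len(models)

submission = pd.DataFrame({"row_id": row_ids, "meter_reading": np.clip(predictions, 0, None)})
submission.to_csv("submission.csv", index=False)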

