昨天跟隊友開源了一份線上0.85的baseline,發在我們的知乎專欄裏了。
知乎地址:
https://zhuanlan.zhihu.com/p/64715267
github地址:
https://github.com/leo6033/future-AI-challenge
下面貼一下baseline的主要代碼部分:
def parseData(df):
    """
    Preprocess the raw rental DataFrame and return it.

    - Replaces the '--' placeholder in rentType with '未知方式'.
    - Casts the nominal object columns to pandas 'category' dtype.
    - Imputes '暫無信息' in buildYear with the modal known year, then casts to int.
    - Fills missing pv/uv with the column mean, then casts to int.
    - Drops communityName and city, which are not used by the model.

    Note: mutates *df* and also returns it.
    """
    # .loc avoids chained indexed assignment (df[col][mask] = ...), which can
    # silently write to a temporary copy instead of df.
    df.loc[df['rentType'] == '--', 'rentType'] = '未知方式'

    # Cast object-typed nominal columns to category so LightGBM can treat
    # them as categorical features.
    for col in ['rentType', 'houseFloor', 'houseToward', 'houseDecoration',
                'communityName', 'region', 'plate']:
        df[col] = df[col].astype('category')

    # buildYear: replace the '暫無信息' placeholder with the most frequent
    # known year, then convert the whole column to int.
    known_years = df['buildYear'][df['buildYear'] != '暫無信息'].astype('int')
    df['buildYear'] = df['buildYear'].where(
        df['buildYear'] != '暫無信息', known_years.mode().iloc[0]
    ).astype('int')

    # pv / uv: fill NaN with the column mean, then truncate to int.
    for col in ['pv', 'uv']:
        df[col] = df[col].fillna(df[col].mean()).astype('int')

    # Drop features excluded from modeling (communityName: too many levels;
    # city: presumably constant — TODO confirm against the raw data).
    df.drop(['communityName', 'city'], axis=1, inplace=True)
    return df
def washData(df_train, df_test):
    """
    Clean the data: remove training outliers and drop the ID column.

    Rows with area > 700 or tradeMoney > 100000 are discarded from the
    training set (outlier thresholds from the original EDA).  Returns the
    cleaned (df_train, df_test) pair.
    """
    df_train = df_train[df_train['area'] <= 700]
    df_train = df_train[df_train['tradeMoney'] <= 100000]
    # Assignment instead of inplace=True: df_train is a boolean-filtered
    # copy here, and in-place mutation on it raises SettingWithCopyWarning
    # (and is not guaranteed to behave as intended).
    df_train = df_train.drop('ID', axis=1)
    df_test = df_test.drop('ID', axis=1)
    return df_train, df_test
def feature(df):
    """
    Feature engineering.

    Splits houseType (e.g. '2室1廳1衛') into three numeric columns
    室/廳/衛, extracts the trade month from tradeTime ('YYYY/M/D'), and
    drops the raw houseType/tradeTime columns.

    Returns (df, categorical_feats) where categorical_feats lists the
    column names to pass to LightGBM as categorical features.
    """
    import re

    def parseRoom(info, index):
        # Extract all digit runs so multi-digit counts (e.g. '10室2廳1衛')
        # parse correctly; the original fixed-position indexing
        # (info[index*2]) assumed every count was a single digit.
        return int(re.findall(r'\d+', info)[index])

    df.insert(3, '室', None)
    df.insert(4, '廳', None)
    df.insert(5, '衛', None)
    df['室'] = df['houseType'].apply(parseRoom, index=0)
    df['廳'] = df['houseType'].apply(parseRoom, index=1)
    df['衛'] = df['houseType'].apply(parseRoom, index=2)
    # tradeTime looks like 'YYYY/M/D'; take the month component.
    df['交易月份'] = df['tradeTime'].apply(lambda x: int(x.split('/')[1]))
    df.drop('houseType', axis=1, inplace=True)
    df.drop('tradeTime', axis=1, inplace=True)
    categorical_feats = ['rentType', 'houseFloor', 'houseToward',
                         'houseDecoration', 'region', 'plate']
    return df, categorical_feats
def getData(feature):
    """
    Load, preprocess, clean and feature-engineer the competition data.

    Parameters
    ----------
    feature : callable
        A function of one DataFrame returning
        (engineered_df, categorical_feature_names).

    Returns
    -------
    tuple
        (train_df, test_df, target_series, feature_columns,
         categorical_feats).
    """
    df_train = parseData(pd.read_csv('input/train_data.csv'))
    df_test = parseData(pd.read_csv('input/test_a.csv'))
    df_train, df_test = washData(df_train, df_test)
    df_train, cat_cols = feature(df_train)
    df_test, cat_cols = feature(df_test)
    # Pop the label before recording the remaining feature columns.
    y = df_train.pop('tradeMoney')
    return df_train, df_test, y, df_train.columns, cat_cols
lgb模型參數:
# LightGBM hyper-parameters (RMSE-optimized regression).
params = {
    'objective': 'regression',   # squared-error regression
    'metric': 'rmse',
    'boosting': 'gbdt',
    'learning_rate': 0.01,       # small LR; paired with many rounds + early stopping
    'num_leaves': 31,
    'min_data_in_leaf': 20,
    'min_child_samples': 20,
    'feature_fraction': 0.8,     # column subsampling per tree
    'bagging_fraction': 0.85,    # row subsampling
    'bagging_freq': 1,           # bag on every iteration
    'bagging_seed': 23,
    'lambda_l1': 0.2,            # L1 regularization
    'nthread': 4,
}
# 5-fold cross-validated LightGBM training: out-of-fold predictions give the
# CV score, and test predictions are averaged across folds.
# NOTE(review): relies on train/test/target/features/categorical_feats and
# the lgb/np/pd/KFold/r2_score imports defined elsewhere in the script.
folds = KFold(n_splits=5, shuffle=True, random_state=2333)
oof_lgb = np.zeros(len(train))          # out-of-fold predictions on train
predictions_lgb = np.zeros(len(test))   # fold-averaged predictions on test
feature_importance_df = pd.DataFrame()  # per-fold feature importances
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, target.values)):
    print("fold {}".format(fold_))
    trn_data = lgb.Dataset(train.iloc[trn_idx], label=target.iloc[trn_idx], categorical_feature=categorical_feats)
    val_data = lgb.Dataset(train.iloc[val_idx], label=target.iloc[val_idx], categorical_feature=categorical_feats)
    num_round = 10000
    # NOTE(review): verbose_eval / early_stopping_rounds keyword args were
    # removed in lightgbm>=4.0; on newer versions pass
    # callbacks=[lgb.log_evaluation(500), lgb.early_stopping(200)] instead.
    clf = lgb.train(params, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=500, early_stopping_rounds = 200)
    # Predict the held-out fold at the best iteration found by early stopping.
    oof_lgb[val_idx] = clf.predict(train.iloc[val_idx], num_iteration=clf.best_iteration)
    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = features
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    # Average test-set predictions over the folds.
    predictions_lgb += clf.predict(test, num_iteration=clf.best_iteration) / folds.n_splits
# R^2 of the out-of-fold predictions against the true target.
print("CV Score: {:<8.5f}".format(r2_score(target, oof_lgb)))