阿里媽媽廣告預測線上0.082+baseline,特徵部分

這裏的代碼是特徵工程的一部分,另一部分組合特徵參考的是這位大佬的帖子:

https://tianchi.aliyun.com/forum/new_articleDetail.html?spm=5176.8366600.0.0.792e311fbTYt32&raceId=231647&postsId=4591

我不方便貼出來,可以自己去看一下。

大家努力做比賽,開源也不容易,感覺有幫助的可以點個贊或者關注一下,有機會一起交流,謝謝

# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.grid_search import RandomizedSearchCV
import lightgbm as lgb
from sklearn.linear_model import LinearRegression
import time
from sklearn.metrics import log_loss

def time2cov(time_):
    """Format a Unix timestamp as a ``YYYY-MM-DD HH:MM:SS`` local-time string.

    Args:
        time_: Seconds since the epoch, as accepted by ``time.localtime``.

    Returns:
        The timestamp rendered in the machine's local timezone.
    """
    local_struct = time.localtime(time_)
    return time.strftime("%Y-%m-%d %H:%M:%S", local_struct)

def map_hour(x):
    """Bucket an hour value into a coarse day-part code.

    Args:
        x: Hour of day as a number.

    Returns:
        1 when ``7 <= x <= 12``, 2 when ``13 <= x <= 20``, otherwise 3.
    """
    if 7 <= x <= 12:
        return 1
    if 13 <= x <= 20:
        return 2
    return 3

# Locations of the space-separated round-1 train and test files.
filepath1 = r"/Users/zhuxiaoxiansheng/Desktop/IJCA/round1_ijcai_18_train_20180301.txt"
filepath2 = r"/Users/zhuxiaoxiansheng/Desktop/IJCA/round1_ijcai_18_test_a_20180301.txt"

# Read train first, then test, with the same parser settings for both.
traindata, testdata = (
    pd.read_csv(path, sep = ' ') for path in (filepath1, filepath2)
)

# The target column is also kept in its own variable.
trainlabel = traindata['is_trade']
print(traindata.shape,trainlabel.shape,testdata.shape)

# Derive hour / day / hour_seg from the raw timestamp column of each frame,
# then drop the timestamp itself.
for _frame in (traindata, testdata):
    # time2cov renders the epoch value as a local-time string; parsing that
    # string back gives datetimes whose hour/day are in local time.
    _stamps = pd.to_datetime(_frame['context_timestamp'].apply(time2cov))
    _frame['hour'] = _stamps.dt.hour
    _frame['day'] = _stamps.dt.day
    _frame['hour_seg'] = _frame['hour'].apply(map_hour)
    del _frame['context_timestamp']
print(traindata.shape,trainlabel.shape,testdata.shape,0)



#=========================
# Frequency-encode the id columns
#=========================
# Every id is replaced by the number of rows it occurs in, counted separately
# inside the train frame and inside the test frame.
#
# The earlier implementation called ``replace(list_of_ids, list_of_counts)``,
# which scans the column once per distinct id, and it fetched the counts with
# ``counts['<column name>']``.  pandas 2.x names the value_counts column
# 'count', so that lookup raises KeyError there.  Mapping through the
# value_counts Series is a single hashed lookup per row and does not depend on
# how the counts column is named.
def _count_encode(column):
    """Return ``column`` with each value replaced by its frequency in ``column``.

    Args:
        column: A pandas Series.

    Returns:
        A Series on the same index holding the occurrence count of each value.
        Missing values stay missing because ``value_counts`` does not count them.
    """
    return column.map(column.value_counts())


# (column, progress marker printed once the column is done)
for _id_column, _marker in (
    ('item_id', 4),
    ('item_brand_id', 6),
    ('shop_id', 10),
    ('item_city_id', 2),
):
    traindata[_id_column] = _count_encode(traindata[_id_column])
    testdata[_id_column] = _count_encode(testdata[_id_column])
    print(_marker)

del traindata['context_id']
del testdata['context_id']

traindata['user_id'] = _count_encode(traindata['user_id'])
testdata['user_id'] = _count_encode(testdata['user_id'])
print(traindata.shape,testdata.shape)

def _semicolon_part_count(raw):
    """Return how many ';'-separated parts ``str(raw)`` contains."""
    return str(raw).count(';') + 1


for _frame in (traindata, testdata):
    # Replace the gender code by its absolute value.
    _frame['user_gender_id'] = _frame['user_gender_id'].abs()

    # Length features: number of ';'-separated entries in each list column.
    for _target, _source in (
        ('len_item_category', 'item_category_list'),
        ('len_item_property', 'item_property_list'),
        ('len_predict_category_property', 'predict_category_property'),
    ):
        _frame[_target] = _frame[_source].map(_semicolon_part_count)

def _nth_field(raw, position):
    """Return the ``position``-th ';'-separated field of ``raw``.

    Args:
        raw: A ';'-separated string.
        position: Zero-based field index.

    Returns:
        The field text, or a single space when ``raw`` has too few fields.
    """
    fields = raw.split(";")
    return fields[position] if position < len(fields) else " "


def _nth_category(raw, position):
    """Return the text before the first ':' in the ``position``-th field of ``raw``.

    Falls back to a single space when the field does not exist.
    """
    return _nth_field(raw, position).split(":")[0]


# For each frame, expose the first eight entries of the two list columns as
# separate columns, each frequency-encoded within that frame (value -> number
# of rows carrying it; the " " placeholder is counted like any other value).
#
# The counts are applied with ``Series.map`` rather than
# ``replace(list, list)``: the latter scans the column once per distinct value
# and read the counts via ``counts['<column name>']``, a column pandas 2.x
# names 'count' instead.
for _frame in (traindata, testdata):
    for i in range(8):
        _tokens = _frame['item_property_list'].apply(_nth_field, args=(i,))
        _frame['property_%d'%(i)] = _tokens.map(_tokens.value_counts())
    for i in range(8):
        _tokens = _frame['predict_category_property'].apply(_nth_category, args=(i,))
        _frame['predict_category_%d'%(i)] = _tokens.map(_tokens.value_counts())

_CATEGORY_COLUMNS = ['item_category_list_1', 'item_category_list_2', 'item_category_list_3']


def _with_category_levels(frame):
    """Return ``frame`` plus one column per level of ``item_category_list``.

    The ';'-separated category string is split into exactly three columns
    (``item_category_list_1`` .. ``_3``).  Rows with fewer levels get missing
    values; the split is padded/truncated to three columns so the column
    names always fit, even when no row in the frame has a third level.
    """
    levels = frame['item_category_list'].str.split(';', expand=True)
    levels = levels.reindex(columns=range(len(_CATEGORY_COLUMNS)))
    levels.columns = _CATEGORY_COLUMNS
    return pd.concat([frame, levels], axis = 1)


traindata = _with_category_levels(traindata)
testdata = _with_category_levels(testdata)

# Items without a third level get the placeholder value 1 before counting.
traindata['item_category_list_3'] = traindata['item_category_list_3'].fillna(1)
testdata['item_category_list_3'] = testdata['item_category_list_3'].fillna(1)

# Levels 2 and 3 are frequency-encoded with counts taken from the TRAIN frame
# for both frames.  A test category that never occurs in train therefore has
# a train count of 0; previously such values were left behind as raw id
# strings mixed in with the integer counts.
for _level in ('item_category_list_2', 'item_category_list_3'):
    counts = traindata[_level].value_counts()
    traindata[_level] = traindata[_level].map(counts)
    testdata[_level] = testdata[_level].map(counts).fillna(0).astype('int64')

# The raw list columns and the level-1 column are no longer needed.
for _frame in (traindata, testdata):
    for _column in (
        'predict_category_property',
        'item_property_list',
        'item_category_list',
        'item_category_list_1',
    ):
        del _frame[_column]
print(traindata.shape,trainlabel.shape,testdata.shape,3)



##=========================
## Coarse buckets for several level columns
##=========================
# Each "<column>0" feature is a 0/1/2 code for low / middle / high values.
#
# The buckets are assigned in a single pass.  The earlier version ran three
# chained ``apply`` calls on the same column; the codes 2 and 1 written by the
# first two passes then satisfied the last pass's ``x <= low`` test and were
# overwritten, so shop_star_level0, user_star_level0, user_age_level0 and
# shop_review_num_level0 ended up 0 for every row.
def _three_way_bucket(value, low, high, high_is_top):
    """Map ``value`` to 0 (low), 1 (middle) or 2 (high).

    Args:
        value: The raw level.
        low: Values ``<= low`` map to 0.
        high: Values ``> high`` map to 2.
        high_is_top: When True, ``value == high`` also maps to 2; otherwise
            it belongs to the middle bucket.

    Returns:
        The bucket code, or ``value`` itself when no comparison holds
        (which is the case for NaN).
    """
    if value <= low:
        return 0
    if value > high or (high_is_top and value == high):
        return 2
    if value > low:
        return 1
    return value


def _page_bucket(page_id):
    """Bucket a page id: 2 above 4009, 1 for 4002..4009, 0 for exactly 4001.

    Any other value is returned unchanged.
    """
    if page_id > 4009:
        return 2
    if page_id > 4001:
        return 1
    if page_id == 4001:
        return 0
    return page_id


for _frame in (traindata, testdata):
    _frame['shop_star_level0'] = _frame['shop_star_level'].apply(
        _three_way_bucket, args=(5011, 5015, False))
    _frame['context_page_id0'] = _frame['context_page_id'].apply(_page_bucket)
    _frame['user_star_level0'] = _frame['user_star_level'].apply(
        _three_way_bucket, args=(3002, 3007, True))
    # Binary flag for occupation id 2005.
    _frame['user_occupation_id0'] = _frame['user_occupation_id'].apply(lambda x: 1 if x == 2005 else 0)
    _frame['user_age_level0'] = _frame['user_age_level'].apply(
        _three_way_bucket, args=(1002, 1006, True))
    _frame['shop_review_num_level0'] = _frame['shop_review_num_level'].apply(
        _three_way_bucket, args=(13, 17, False))


# For each coded column: replace the -1 sentinel with the column's median
# (computed per frame, over the column as it stands, sentinel rows included),
# then subtract the column's constant offset.
#
# shop_star_level previously took its fill value from context_page_id's
# median -- a column that had already been shifted by 4000 at that point --
# so its -1 rows were filled with a value from the wrong column and scale.
# Every column now uses its own median.
_LEVEL_OFFSETS = (
    ('user_age_level', 1000),
    ('user_occupation_id', 2000),
    ('user_star_level', 3000),
    ('context_page_id', 4000),
    ('shop_star_level', 5000),
)

for _frame in (traindata, testdata):
    for _column, _offset in _LEVEL_OFFSETS:
        _filled = _frame[_column].replace(-1, _frame[_column].median())
        _frame[_column] = _filled - _offset



# Coarse 0/1/2 buckets for the shop score columns and the item level columns.
#
# Buckets are assigned in a single pass.  The earlier version ran three
# chained ``apply`` calls per column; for the integer item_*_level columns the
# codes 2 and 1 produced by the first two passes satisfied the final
# ``x <= low`` test and were overwritten, leaving item_price_level0,
# item_sales_level0, item_collected_level0 and item_pv_level0 at 0 for every
# row.  item_collected_level0 was also computed twice; it is computed once here.
def _three_way_bucket(value, low, high, high_is_top):
    """Map ``value`` to 0 (low), 1 (middle) or 2 (high).

    Args:
        value: The raw score or level.
        low: Values ``<= low`` map to 0.
        high: Values ``> high`` map to 2.
        high_is_top: When True, ``value == high`` also maps to 2; otherwise
            it belongs to the middle bucket.

    Returns:
        The bucket code, or ``value`` itself when no comparison holds
        (which is the case for NaN).
    """
    if value <= low:
        return 0
    if value > high or (high_is_top and value == high):
        return 2
    if value > low:
        return 1
    return value


def _positive_rate_bucket(rate):
    """Bucket a positive-review rate: 2 for exactly 1, 1 above 0.98, 0 otherwise.

    A value greater than 1 (or NaN) is returned unchanged.
    """
    if rate == 1:
        return 2
    if 0.98 < rate < 1:
        return 1
    if rate <= 0.98:
        return 0
    return rate


for _frame in (traindata, testdata):
    _frame['shop_score_description0'] = _frame['shop_score_description'].apply(
        _three_way_bucket, args=(0.97, 0.984, False))
    _frame['shop_score_delivery0'] = _frame['shop_score_delivery'].apply(
        _three_way_bucket, args=(0.966, 0.979, False))
    _frame['shop_score_service0'] = _frame['shop_score_service'].apply(
        _three_way_bucket, args=(0.967, 0.979, False))
    _frame['shop_review_positive_rate0'] = _frame['shop_review_positive_rate'].apply(
        _positive_rate_bucket)
    _frame['item_price_level0'] = _frame['item_price_level'].apply(
        _three_way_bucket, args=(5, 9, True))
    _frame['item_sales_level0'] = _frame['item_sales_level'].apply(
        _three_way_bucket, args=(9, 14, True))
    _frame['item_collected_level0'] = _frame['item_collected_level'].apply(
        _three_way_bucket, args=(10, 15, True))
    _frame['item_pv_level0'] = _frame['item_pv_level'].apply(
        _three_way_bucket, args=(14, 20, True))
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章