# Small sample frame used by the binning examples below: a numeric "age"
# column to discretise and a categorical "tag" column.
df = pd.DataFrame(
    {
        "age": [1.1, 2.1, 1.3, 3.4, 5.6, 7.8, 5.2, 3.4, 7.4, 6.1, 6.9, 8.9],
        "tag": [
            'good', 'good', 'good', 'bad', 'good', 'bad',
            'bad', 'bad', 'good', 'bad', 'bad', 'bad',
        ],
    }
)
def bin_single_column_selfdef(df, colname, bins, is_distribute=False, labels=None):
    """Bin one column of ``df`` with caller-defined bins.

    Args:
        df: Container indexed by ``colname`` (typically a DataFrame).
        colname: Name of the column to discretise.
        bins: Forwarded unchanged to ``pd.cut`` as ``bins`` (for example a
            list of edges such as ``[0, 3, 6, 8, 10]``).
        is_distribute: When true, also return the number of rows per bin.
        labels: Forwarded unchanged to ``pd.cut`` as ``labels``.

    Returns:
        The binned column when ``is_distribute`` is false; otherwise a
        ``(counts, binned)`` tuple, where ``counts`` holds the per-bin
        row counts.
    """
    binned = pd.cut(df[colname], bins=bins, labels=labels)
    if not is_distribute:
        return binned
    # The top-level ``pd.value_counts`` is deprecated (pandas 2.1) and gone in
    # pandas 3.0; ``pd.Series(obj).value_counts()`` is the documented
    # replacement and yields the same counts.
    return pd.Series(binned).value_counts(), binned
'''
bins=[0,3,6,8,10]
labels=['a','b','c','d']
bin_dis,cut_bin=bin_single_column_selfdef(df,'age',bins,is_distribute=True,labels=labels)
print(bin_dis)
print(cut_bin)
'''
def bin_single_column_byrange(df, colname, bin_len, is_distribute=False, labels=None):
    """Bin one column of ``df`` by value range via ``pd.cut``.

    Args:
        df: Container indexed by ``colname`` (typically a DataFrame).
        colname: Name of the column to discretise.
        bin_len: Forwarded unchanged to ``pd.cut`` as ``bins``; an integer
            asks pandas for that many equal-width intervals.
        is_distribute: When true, also return the number of rows per bin.
        labels: Forwarded unchanged to ``pd.cut`` as ``labels``.

    Returns:
        The binned column when ``is_distribute`` is false; otherwise a
        ``(counts, binned)`` tuple, where ``counts`` holds the per-bin
        row counts.
    """
    binned = pd.cut(df[colname], bins=bin_len, labels=labels)
    if not is_distribute:
        return binned
    # The top-level ``pd.value_counts`` is deprecated (pandas 2.1) and gone in
    # pandas 3.0; ``pd.Series(obj).value_counts()`` is the documented
    # replacement and yields the same counts.
    return pd.Series(binned).value_counts(), binned
'''
x,y=bin_single_column_byrange(df,'age',3,True) # the 3 here is the number of equal-width intervals
print(x)
print(y)
'''
# Equal-frequency (等頻) binning
def bin_single_column_byfreq(df, colname, bin_len, is_distribute=False, labels=None):
    """Bin one column of ``df`` by quantiles via ``pd.qcut``.

    Args:
        df: Container indexed by ``colname`` (typically a DataFrame).
        colname: Name of the column to discretise.
        bin_len: Forwarded unchanged to ``pd.qcut`` as ``q`` (for example
            ``3`` for tertiles). Because the edges are quantile values,
            ties in the data can leave the bins with unequal row counts.
        is_distribute: When true, also return the number of rows per bin.
        labels: Forwarded unchanged to ``pd.qcut`` as ``labels``.

    Returns:
        The binned column when ``is_distribute`` is false; otherwise a
        ``(counts, binned)`` tuple, where ``counts`` holds the per-bin
        row counts.
    """
    binned = pd.qcut(df[colname], q=bin_len, labels=labels)
    if not is_distribute:
        return binned
    # The top-level ``pd.value_counts`` is deprecated (pandas 2.1) and gone in
    # pandas 3.0; ``pd.Series(obj).value_counts()`` is the documented
    # replacement and yields the same counts.
    return pd.Series(binned).value_counts(), binned
'''
x,y=bin_single_column_byfreq(df,'age',3,True) # the 3 here is the number of quantiles, i.e. edges at the [0, 1/3, 2/3, 1] quantiles, so the groups are not guaranteed to contain the same number of rows
print(x)
print(y)
'''