DataFrame和Series練習

主要練習DataFrame的常用操作。

import pandas as pd
# Build a tiny 2x4 frame and take an independent copy of it.
zhou = [
    [1, 2, 3, 4],
    [5, 6, 7, 8],
]
df = pd.DataFrame(data=zhou, columns=['x1', 'x2', 'x3', 'x4'])
df1 = df.copy(deep=True)  # deep copy: df1 has its own data, separate from df

import os
import datetime
import numpy as np
from scipy import sparse
from scipy.stats import mstats
import pandas as pd
from pandas.api.types import is_numeric_dtype
import re
import pickle
import shutil

import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import KFold

# Regex for the first signed number in a string. The optional sign is grouped
# with BOTH alternatives (decimal and plain integer) so that '-5' keeps its
# minus sign; a raw string avoids invalid-escape warnings for '\d'.
NUMBER_PATTERN = r'([-+]?(?:\d*\.\d+|\d+))'

# Extract the numeric part of each string, convert to float, round to 2 places.
s = pd.Series(['-10.0121', '+11.222', '9.087'])
s = s.str.extract(NUMBER_PATTERN, expand=False).astype(float).round(2)
# Result at this point:
# 0   -10.01
# 1    11.22
# 2     9.09
# dtype: float64

# Deal with outliers: clip the lowest and highest 1% of values.
s = pd.Series(mstats.winsorize(s, limits=[0.01, 0.01]))
    
    
# Ten labelled rows used to demonstrate counting and filtering on a Series.
tmp_s = pd.DataFrame(
    [['row1', 'y1'], ['row2', 'y1'], ['row3', 'y2'], ['row4', 'y1'], ['row5', 'y1'],
     ['row6', 'y2'], ['row7', 'y1'], ['row8', 'y2'], ['row9', 'y3'], ['row10', 'y2']],
    columns=['feature', 'label'],
)
tmp_counts = tmp_s['label'].value_counts()  # occurrences of each label, most frequent first
# y1    5
# y2    4
# y3    1

num_top = 2
# Keep the num_top most frequent labels; .iloc makes the positional slice explicit.
tmp_counts_top = tmp_counts.iloc[:num_top]
# Number of labels whose count exceeds the threshold (4).
tmp_num_unique = np.sum(tmp_counts > 4)
# Labels such as 'y1' are not purely numeric, so errors='coerce' turns them into NaN.
tmp_counts_num = pd.to_numeric(tmp_counts_top.index.to_series(), errors='coerce')
# Chained calls: pull the number out of each label, cast to float, round.
# The sign is grouped with both alternatives so signed integers keep their sign.
tmp_counts_num_extract = (
    tmp_counts_top.index.to_series()
    .str.extract(r'([-+]?(?:\d*\.\d+|\d+))', expand=False)
    .astype(float)
    .round(2)
)
# y1    1.0
# y2    2.0

import numpy as np    
# Three-row frame with a single missing value in column '0981'.
df = pd.DataFrame(
    [
        [1, 2, 3, 4, 5],
        [6, 7, 8, 9, 10],
        [11, 12, np.nan, 14, 15],
    ],
    columns=['0120', '0125', '0981', '0984', '0983'],
)
# Count the NON-null cells in each row (the results are only displayed, not stored).
df[['0120', '0125', '0981', '0984', '0983']].notnull().sum(axis=1)
# 0    5
# 1    5
# 2    4
# Count the NON-null cells in one column.
df['0981'].notnull().sum()
# The name of a column (Series) is its column label.
df['0981'].name


# Chinese keywords to look for, and a parallel list of ASCII column names
# ('disease_0', 'disease_1', ...) - one generated name per keyword.
keywords_ch = [
    '糖尿病', '高血壓', '血脂', '治療中', '肥胖',
    '血糖', '血壓高', '血脂偏高', '血壓高偏高', '冠心病',
    '脂肪肝', '不齊', '過緩', '血管彈性', '脂',
    '硬化', '舒張期雜音', '收縮期雜音', '低鹽', '低脂',
]
keywords_en = ['disease_' + str(position) for position, _ in enumerate(keywords_ch)]


# Three ways to apply a function in pandas:
#   1. DataFrame.apply()    - called once per row or per column
#   2. DataFrame.applymap() - called once per element of a DataFrame
#   3. Series.map()         - called once per element of a Series

# 1. apply()
frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'),
                     index=['Utah', 'Ohio', 'Texas', 'Oregon'])


def f(x):
    """Return the spread (max - min) of a one-dimensional Series."""
    return x.max() - x.min()


# axis=1 passes each ROW to f (one result per index label);
# axis=0, the default, would pass each column instead.
row_ranges = frame.apply(f, axis=1)


# 2. applymap()
# NOTE(review): the name `format` shadows the builtin of the same name; it is
# kept here only so existing references to it keep working.
def format(x):
    """Render a number as a string with two decimal places."""
    return '%.2f' % x


# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map,
# so use the new name when it exists and fall back to the old one otherwise.
_elementwise = frame.map if hasattr(frame, 'map') else frame.applymap
formatted_frame = _elementwise(format)  # apply the function to every cell

# 3. map()
formatted_e = frame['e'].map(format)  # apply the function to every element of one column

# Illustrative snippets (not executed: they rely on a `df` / `df_to_merge`
# that this script does not define).
#
# Cast a boolean comparison straight to 0/1 indicator columns:
#   df_to_merge['if_subhealth'] = (df['2302'] == '亞健康').astype(int)
#   df_to_merge['if_ill'] = (df['2302'] == '疾病').astype(int)
# Only three classes exist, so "healthy" is whatever is neither of the first
# two. Compare against 0 before combining: `~` on integer columns is a bitwise
# NOT (~0 == -1), and "neither" needs both flags to be 0.
#   df_to_merge['if_health'] = ((df_to_merge['if_subhealth'] == 0)
#                               & (df_to_merge['if_ill'] == 0)).astype(int)
#
# Turn "does the text contain this word" into 0/1 columns:
#   df_to_merge['if_jianman'] = df['1402'].apply(lambda x: '減慢' in str(x)).astype(int)
#   df_to_merge['if_zengkuai'] = df['1402'].apply(lambda x: '增快' in str(x)).astype(int)
#   df_to_merge['if_jiangdi'] = df['1402'].apply(lambda x: '降低' in str(x)).astype(int)
# Toy exam table: the first inner list holds 40 ids ('x1'..'x40'), the other
# two hold one free-text finding per id.
data11 = [
    ['x' + str(n) for n in range(1, 41)],
    ['檢出糖尿病II級','正常','正常','未檢查','健康','無','未見','冠心病1',
     '高血壓', '血脂', '脂肪肝陽性', '慢性胃炎陽性', '闌尾炎陽性', '甲肝', '腎結石',
     '膽囊切除', '甲肝', '冠心病', '膽結石', '甲狀腺', '腦梗塞', '膽囊炎', '腦溢血',
     '早搏', '雜音', '心動過緩', '心律不齊', '心動過速','正常','正常','未檢查','健康','無','未見',
     '正常','正常','未檢查','健康','無','未見'],
    ['檢出糖尿病II級','正常','正常','未檢查','健康','無','未見','冠心病1',
     '高血壓', '血脂', '正常','未檢查','健康','無','未見',
     '膽囊切除', '甲肝', '冠心病陽性', '膽結石陽性', '甲狀腺陽性', '腦梗塞陽性', '膽囊炎', '腦溢血',
     '早搏', '雜音', '查見心動過緩', '查見心律不齊', '心動過速','正常','正常','未檢查','健康','無','未見',
     '正常','正常','未檢查','健康','無','未見'],
]

# Transpose so that each id becomes a row with two text columns.
s = pd.DataFrame(data11).T
s.columns = ['vid', '0434', '0409']

# NOTE(review): '甲肝' appears twice in this list, so two of the generated
# columns are built from the same keyword - confirm whether that is intended.
keywords_ch = ['糖尿病', '高血壓', '血脂', '脂肪肝', '慢性胃炎', '闌尾炎', '甲肝', '腎結石',
               '膽囊切除', '甲肝', '冠心病', '膽結石', '甲狀腺', '腦梗塞', '膽囊炎', '腦溢血',
               '早搏', '雜音', '心動過緩', '心律不齊', '心動過速']
keywords_en = ['disease_' + str(position) for position, _ in enumerate(keywords_ch)]

# Turn the two raw text columns into one 1/0/NaN indicator column per keyword.
dict_out = {}
for sname in s.columns:
    if sname not in ('0434', '0409'):
        continue
    text = s[sname]
    # The negative mask does not depend on the keyword, so compute it once.
    is_negative = text.str.contains('無|未查見|健康|未見', na=False)
    for kw_ch, kw in zip(keywords_ch, keywords_en):
        # NOTE(review): the generic markers (陽性/查見/檢到/檢出) match a row
        # regardless of which keyword is being processed - confirm intent.
        is_positive = text.str.contains('{}|陽性|查見|檢到|檢出'.format(kw_ch), na=False)
        s_out = pd.Series([np.nan] * len(s))
        s_out[is_positive] = 1
        s_out[is_negative] = 0  # applied last, so a negative marker overrides a positive one
        if s_out.sum() > 2:  # skip indicators with too few positives
            # NOTE(review): keyed by keyword only, so a later text column
            # overwrites an earlier one; use sname + '_' + kw to keep both.
            dict_out[kw] = s_out
df_to_concat = pd.DataFrame(dict_out)

# Two-row frame whose first column is an id rather than a feature.
X_test_df = pd.DataFrame(
    {'vid': [1, 5], 'x1': [2, 6], 'x2': [3, 7], 'x3': [4, 8]},
)
# Dropping the id column is the simplest way to get the feature matrix; the
# alternative is to select the wanted feature columns by passing a list of names.
# drop() returns a new frame, so X_test_df itself still contains 'vid'.
X_test_df.drop(columns='vid')
#    x1  x2  x3
# 0   2   3   4
# 1   6   7   8

# The same features as a plain NumPy array.
X_test_df.drop(columns='vid').to_numpy()
# array([[2, 3, 4],
#        [6, 7, 8]], dtype=int64)

def log1p_mse(preds, train_data):
    """Mean squared error between log1p(preds) and log1p(labels).

    Args:
        preds: array-like of predicted values.
        train_data: any object exposing ``get_label()`` that returns the
            true values in the same order as ``preds``.

    Returns:
        A 3-tuple ``('error', value, False)``: a fixed name, the computed
        error, and a constant ``False`` flag.
        NOTE(review): this looks like the (name, value, is_higher_better)
        shape of a LightGBM custom eval function - confirm against the caller.
    """
    targets = train_data.get_label()
    log_gap = np.log1p(preds) - np.log1p(targets)
    return 'error', np.mean(log_gap ** 2), False


# Build four kinds of features: 1) pure text, 2) pure numeric,
# 3) text + numeric, 4) categorical. Every batch is left-joined onto the
# base table by 'vid', so the base rows and their order are preserved.
X_processed = pd.DataFrame(
    [['A0001', 1, 2, 3], ['A0002', 4, 5, 6], ['B0003', 7, 8, 9], ['B0004', 11, 12, 13]],
    columns=['vid', 't1', 't2', 't3'],
)

# Batch 1: extra numeric features.
X_to_merge1 = pd.DataFrame(
    [['A0001', 101, 102, 102], ['A0002', 104, 105, 106],
     ['B0003', 107, 108, 109], ['B0004', 111, 112, 113]],
    columns=['vid', 'f1', 'f2', 'f3'],
)
X_processed = pd.merge(X_processed, X_to_merge1, how='left', on='vid')

# Batch 2: 0/1 indicator features.
X_to_merge2 = pd.DataFrame(
    [['A0001', 1, 1, 0], ['A0002', 0, 0, 1], ['B0003', 1, 0, 0], ['B0004', 0, 1, 0]],
    columns=['vid', 's1', 's2', 's3'],
)
X_processed = pd.merge(X_processed, X_to_merge2, how='left', on='vid')

# Batch 3: a categorical feature.
X_to_merge3 = pd.DataFrame(
    [['A0001', 'f'], ['A0002', 'm'], ['B0003', 'm'], ['B0004', 'f']],
    columns=['vid', 'sex'],
)
X_processed = pd.merge(X_processed, X_to_merge3, how='left', on='vid')

# Batch 4: a free-text feature.
# NOTE(review): this rebinds X_to_merge3 (the batch-3 frame is no longer
# reachable by name afterwards); a distinct name such as X_to_merge4 would be clearer.
X_to_merge3 = pd.DataFrame(
    [['A0001', 'Shanghai'], ['A0002', 'Beijing'], ['B0003', 'New York'], ['B0004', 'Tokyo']],
    columns=['vid', 'address'],
)
X_processed = pd.merge(X_processed, X_to_merge3, how='left', on='vid')


在這裏插入圖片描述在這裏插入圖片描述

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章