數據的拆分
pandas返回的是一個特殊的Categorical對象。你可以將其看做一組表示面元名稱的字符串。實際上,它含有一個表示不同分類名稱的categories數組,以及一個爲年齡數據進行標號的codes屬性(在舊版pandas中分別稱爲levels和labels)。
跟“區間”的數學符號一樣,圓括號表示開端,而方括號則表示閉端(包括)。哪邊是閉端可以通過right=False進行修改。
也可以通過labels參數自定義各個面元的名稱。
代碼演示如下:
from pandas import Series,DataFrame
import pandas as pd
import numpy as np
from numpy import nan as NA
from matplotlib import pyplot as plt
# Ages of the people to be grouped.
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
# Edges of the age bins: (18, 25], (25, 35], (35, 60], (60, 100].
bins = [18, 25, 35, 60, 100]
# Assign every age to the bin it falls into.
cats = pd.cut(x=ages, bins=bins)
cats
#打印結果:
[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
# Count how many ages fall into each interval.
# The top-level pd.value_counts() is deprecated since pandas 2.1; the Series
# method is the supported equivalent and returns the same counts.
pd.Series(cats).value_counts()
#打印結果:
(18, 25] 5
(35, 60] 3
(25, 35] 3
(60, 100] 1
dtype: int64
# Show, for each age, the index of the interval (category) it was assigned to.
cats.codes
#打印結果:
array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)
# Human-readable names for the four age bins, listed in bin order.
group_names = ["Youth", "YouthAdult", "MiddleAged", "Senior"]
# Bin the ages again, this time labelling each bin with its name.
personType = pd.cut(x=ages, bins=bins, labels=group_names)
personType
#打印結果:
[Youth, Youth, Youth, YouthAdult, Youth, ..., YouthAdult, Senior, MiddleAged, MiddleAged, YouthAdult]
Length: 12
Categories (4, object): [Youth < YouthAdult < MiddleAged < Senior]
# Show the distribution of the labelled age groups with a simple histogram.
# NOTE(review): personType is a pandas Categorical of string labels; confirm
# that the installed matplotlib accepts it directly in hist().
plt.hist(personType)
#打印結果:(輸出爲一張直方圖,此處從略)
對比qcut和cut用法
# 100 samples drawn from the standard normal distribution.
data = np.random.randn(100)
# qcut bins by sample quantiles: with q=4 the cut points are the quartiles,
# so the 100 samples are divided evenly into four groups.
result = pd.qcut(data, q=4)
result
#打印結果:
[(-2.912, -0.647], (-0.647, 0.218], (-2.912, -0.647], (0.218, 0.744], (-0.647, 0.218], ..., (-2.912, -0.647], (0.218, 0.744], (0.218, 0.744], (0.744, 2.57], (-0.647, 0.218]]
Length: 100
Categories (4, interval[float64]): [(-2.912, -0.647] < (-0.647, 0.218] < (0.218, 0.744] < (0.744, 2.57]]
# Count the number of samples that fall into each quantile bin.
# The top-level pd.value_counts() is deprecated since pandas 2.1; the Series
# method is the supported equivalent and returns the same counts.
pd.Series(result).value_counts()
#打印結果:
(0.744, 2.57] 25
(0.218, 0.744] 25
(-0.647, 0.218] 25
(-2.912, -0.647] 25
dtype: int64
# Display the raw samples that were binned above.
data
#打印結果:
array([-0.81944393, -0.47827597, -1.09209466, 0.65593818, -0.33765443,
1.62557458, 0.3194351 , 2.57005046, 1.41407812, 0.99061237,
0.38456894, -1.89477296, -0.17753136, -1.43995942, 0.29287511,
0.49614109, -0.42212434, 0.47653863, 0.27854401, -1.2910386 ,
0.44532622, 2.15824034, 1.4430415 , -1.06147851, 1.68223286,
-1.13871536, 0.57685232, -0.32090836, 0.77133685, -0.18020109,
-1.31381356, 0.95240515, -0.78690992, -1.11867563, 1.14199622,
-0.14897499, -0.41965931, -0.03576476, 0.77007272, -0.32632594,
0.06267203, -1.44471277, 0.18464705, 0.02437242, 0.43143395,
-0.60205938, 0.67788094, -0.17743486, -1.13475686, -2.91085367,
0.62727327, -0.17854275, -0.88260309, 0.85683197, -2.49055636,
0.86906019, -0.78107309, 0.6875342 , 0.34721684, -0.10699128,
0.40549062, 1.76385094, 1.17304214, -1.19093869, -0.37709317,
-0.44977869, 1.09737126, -0.20524537, -1.11885991, -0.96239911,
0.53720485, 0.79086801, 0.61237678, 0.34413715, 0.63984259,
0.72957858, 1.11031023, -0.3926265 , 1.76136485, -1.28750533,
1.94702292, -2.43255566, 0.73515991, 1.94887473, 1.57655946,
-1.40915724, 0.09215286, 0.50873343, 0.18692116, -1.05886742,
0.2932763 , 1.75318424, 1.4303616 , -1.45832408, -0.26121468,
-2.04321697, 0.55099437, 0.24997625, 1.26154715, 0.07230284])
# Like cut, qcut also accepts explicit edges: here a list of quantiles
# (numbers between 0 and 1, endpoints included) instead of a bin count.
result = pd.qcut(data, q=[0, 0.1, 0.5, 0.9, 1])
result
#打印結果:
[(-1.293, 0.218], (-1.293, 0.218], (-1.293, 0.218], (0.218, 1.456], (-1.293, 0.218], ..., (-2.912, -1.293], (0.218, 1.456], (0.218, 1.456], (0.218, 1.456], (-1.293, 0.218]]
Length: 100
Categories (4, interval[float64]): [(-2.912, -1.293] < (-1.293, 0.218] < (0.218, 1.456] < (1.456, 2.57]]
# Count the number of samples that fall into each quantile bin.
# The top-level pd.value_counts() is deprecated since pandas 2.1; the Series
# method is the supported equivalent and returns the same counts.
pd.Series(result).value_counts()
#打印結果:
(0.218, 1.456] 40
(-1.293, 0.218] 40
(1.456, 2.57] 10
(-2.912, -1.293] 10
dtype: int64
# Case where the quantile edges do not span the whole 0..1 range: samples
# above the 0.75 quantile belong to no bin, become NaN in the result and are
# therefore missing from the counts below.
result = pd.qcut(data, [0, 0.1, 0.3, 0.75])
# The top-level pd.value_counts() is deprecated since pandas 2.1; the Series
# method is the supported equivalent and returns the same counts.
pd.Series(result).value_counts()
#打印結果:
(-0.401, 0.744] 45
(-1.293, -0.401] 20
(-2.912, -1.293] 10
dtype: int64
# Splitting a set of values with cut: start from 20 random samples produced
# by np.random.rand.
data = np.random.rand(20)
data
#打印結果:
array([0.47895253, 0.28414843, 0.36634132, 0.35300655, 0.15481243,
0.46266364, 0.34585777, 0.81500546, 0.47044936, 0.72463485,
0.33547231, 0.19937734, 0.90590714, 0.64994955, 0.39107997,
0.67691253, 0.13468209, 0.22932333, 0.19109326, 0.39903587])
# Passing an integer to cut splits the data into that many bins of equal
# width: each bin spans (max - min) / n, where n is the number of bins.
# precision sets how many decimal places the bin labels are shown with.
result = pd.cut(data, bins=4, precision=2)
result
#打印結果:
[(0.33, 0.52], (0.13, 0.33], (0.33, 0.52], (0.33, 0.52], (0.13, 0.33], ..., (0.52, 0.71], (0.13, 0.33], (0.13, 0.33], (0.13, 0.33], (0.33, 0.52]]
Length: 20
Categories (4, interval[float64]): [(0.13, 0.33] < (0.33, 0.52] < (0.52, 0.71] < (0.71, 0.91]]
# Count the number of samples that fall into each equal-width bin.
# The top-level pd.value_counts() is deprecated since pandas 2.1; the Series
# method is the supported equivalent and returns the same counts.
pd.Series(result).value_counts()
(0.33, 0.52] 9
(0.13, 0.33] 6
(0.71, 0.91] 3
(0.52, 0.71] 2
dtype: int64