數據整理
- 定義
- 在數據清洗過程中,很多時候需要將不同的數據整理在一起,方便後續的分析,這個過程也叫數據合併
- 合併方法
- 常見的合併方法有堆疊和按主鍵進行合併,堆疊又分爲橫向堆疊和縱向堆疊,按主鍵合併類似於sql裏面的關聯操作
- 橫向堆疊將兩張表或多張表在X軸方向,即橫向拼接在一起
- 縱向堆疊將兩張表或多張表在Y軸方向,即縱向拼接在一起
- 注意join 取inner或者outer,分別代表交集和並集
import xlrd
import os
import pandas as pd
import numpy as np
os.getcwd()
'D:\\Jupyter\\notebook\\Python數據清洗實戰\\數據清洗之數據表處理'
os.chdir('D:\\Jupyter\\notebook\\Python數據清洗實戰\\數據')
workbook = xlrd.open_workbook('meal_order_detail.xlsx')
sheet_name = workbook.sheet_names()
sheet_name
['meal_order_detail1', 'meal_order_detail2', 'meal_order_detail3']
order1 = pd.read_excel('meal_order_detail.xlsx', sheet_name='meal_order_detail1')
order2 = pd.read_excel('meal_order_detail.xlsx', sheet_name='meal_order_detail2')
order3 = pd.read_excel('meal_order_detail.xlsx', sheet_name='meal_order_detail3')
order = pd.concat([order1, order2, order3], axis=0, ignore_index=True)
order.tail(5)
|
detail_id |
order_id |
dishes_id |
logicprn_name |
parent_class_name |
dishes_name |
itemis_add |
counts |
amounts |
cost |
place_order_time |
discount_amt |
discount_reason |
kick_back |
add_inprice |
add_info |
bar_code |
picture_file |
emp_id |
10032 |
5683 |
672 |
610049 |
NaN |
NaN |
爆炒雙絲 |
0 |
1 |
35 |
NaN |
2016-08-31 21:53:30 |
NaN |
NaN |
NaN |
0 |
NaN |
NaN |
caipu/301003.jpg |
1089 |
10033 |
5686 |
672 |
609959 |
NaN |
NaN |
小炒羊腰\r\n\r\n\r\n |
0 |
1 |
36 |
NaN |
2016-08-31 21:54:40 |
NaN |
NaN |
NaN |
0 |
NaN |
NaN |
caipu/202005.jpg |
1089 |
10034 |
5379 |
647 |
610012 |
NaN |
NaN |
香菇鵪鶉蛋 |
0 |
1 |
39 |
NaN |
2016-08-31 21:54:44 |
NaN |
NaN |
NaN |
0 |
NaN |
NaN |
caipu/302001.jpg |
1094 |
10035 |
5380 |
647 |
610054 |
NaN |
NaN |
不加一滴油的酸奶蛋糕 |
0 |
1 |
7 |
NaN |
2016-08-31 21:55:24 |
NaN |
NaN |
NaN |
0 |
NaN |
NaN |
caipu/501003.jpg |
1094 |
10036 |
5688 |
672 |
609953 |
NaN |
NaN |
涼拌菠菜 |
0 |
1 |
27 |
NaN |
2016-08-31 21:56:54 |
NaN |
NaN |
NaN |
0 |
NaN |
NaN |
caipu/303004.jpg |
1089 |
basic = pd.DataFrame()
for i in sheet_name:
basic_i = pd.read_excel('meal_order_detail.xlsx', sheet_name=i)
basic = pd.concat([basic, basic_i], axis=0, ignore_index=True)
basic.shape
(10037, 19)
basic.tail(5)
|
detail_id |
order_id |
dishes_id |
logicprn_name |
parent_class_name |
dishes_name |
itemis_add |
counts |
amounts |
cost |
place_order_time |
discount_amt |
discount_reason |
kick_back |
add_inprice |
add_info |
bar_code |
picture_file |
emp_id |
10032 |
5683 |
672 |
610049 |
NaN |
NaN |
爆炒雙絲 |
0 |
1 |
35 |
NaN |
2016-08-31 21:53:30 |
NaN |
NaN |
NaN |
0 |
NaN |
NaN |
caipu/301003.jpg |
1089 |
10033 |
5686 |
672 |
609959 |
NaN |
NaN |
小炒羊腰\r\n\r\n\r\n |
0 |
1 |
36 |
NaN |
2016-08-31 21:54:40 |
NaN |
NaN |
NaN |
0 |
NaN |
NaN |
caipu/202005.jpg |
1089 |
10034 |
5379 |
647 |
610012 |
NaN |
NaN |
香菇鵪鶉蛋 |
0 |
1 |
39 |
NaN |
2016-08-31 21:54:44 |
NaN |
NaN |
NaN |
0 |
NaN |
NaN |
caipu/302001.jpg |
1094 |
10035 |
5380 |
647 |
610054 |
NaN |
NaN |
不加一滴油的酸奶蛋糕 |
0 |
1 |
7 |
NaN |
2016-08-31 21:55:24 |
NaN |
NaN |
NaN |
0 |
NaN |
NaN |
caipu/501003.jpg |
1094 |
10036 |
5688 |
672 |
609953 |
NaN |
NaN |
涼拌菠菜 |
0 |
1 |
27 |
NaN |
2016-08-31 21:56:54 |
NaN |
NaN |
NaN |
0 |
NaN |
NaN |
caipu/303004.jpg |
1089 |
df = pd.read_csv('baby_trade_history.csv', dtype={'user_id': str})
df1 = pd.read_csv('sam_tianchi_mum_baby.csv', dtype={'user_id': str})
df1.head(5)
|
user_id |
birthday |
gender |
0 |
2757 |
20130311 |
1 |
1 |
415971 |
20121111 |
0 |
2 |
1372572 |
20120130 |
1 |
3 |
10339332 |
20110910 |
0 |
4 |
10642245 |
20130213 |
0 |
df.head(5)
|
user_id |
auction_id |
cat_id |
cat1 |
property |
buy_mount |
day |
0 |
786295544 |
41098319944 |
50014866 |
50022520 |
21458:86755362;13023209:3593274;10984217:21985... |
2 |
20140919 |
1 |
532110457 |
17916191097 |
50011993 |
28 |
21458:11399317;1628862:3251296;21475:137325;16... |
1 |
20131011 |
2 |
249013725 |
21896936223 |
50012461 |
50014815 |
21458:30992;1628665:92012;1628665:3233938;1628... |
1 |
20131011 |
3 |
917056007 |
12515996043 |
50018831 |
50014815 |
21458:15841995;21956:3494076;27000458:59723383... |
2 |
20141023 |
4 |
444069173 |
20487688075 |
50013636 |
50008168 |
21458:30992;13658074:3323064;1628665:3233941;1... |
1 |
20141103 |
df2 = pd.merge(left=df, right=df1, how='inner', left_on='user_id', right_on='user_id')
df2.head(5)
|
user_id |
auction_id |
cat_id |
cat1 |
property |
buy_mount |
day |
birthday |
gender |
0 |
513441334 |
19909384116 |
50010557 |
50008168 |
25935:21991;1628665:29784;22019:34731;22019:20... |
1 |
20121212 |
20110105 |
1 |
1 |
377550424 |
15771663914 |
50015841 |
28 |
1628665:3233941;1628665:3233942;3914866:11580;... |
1 |
20121123 |
20110620 |
1 |
2 |
47342027 |
14066344263 |
50013636 |
50008168 |
21458:21599;13585028:3416646;1628665:3233942;1... |
1 |
20120911 |
20101008 |
1 |
3 |
119784861 |
20796936076 |
50140021 |
50008168 |
21458:120325094;22019:2026;22019:34731;22019:3... |
1 |
20121129 |
20120327 |
0 |
4 |
159129426 |
15198386301 |
50013711 |
50008168 |
21458:11580;1628665:29778;22019:3340598;22019:... |
2 |
20120808 |
20100825 |
0 |