import numpy as np
import pandas as pd
# Report the installed pandas version.
print(pd.__version__)
# show_versions() writes its report to stdout itself and returns None, so it
# must not be wrapped in print(): doing so only emits a stray "None" line.
pd.show_versions(as_json=True)
mylist = list ( 'abcedfghijklmnopqrstuvwxyz' )
myarr = np. arange( 26 )
mydict = dict ( zip ( mylist, myarr) )
s1= pd. Series( mylist)
s2= pd. Series( myarr)
s3= pd. Series( mydict)
print ( "*" * 50 )
print ( s1. head( ) )
print ( s2. head( ) )
print ( s3. head( ) )
0.25.1
{'system': {'commit': None, 'python': '3.7.4.final.0', 'python-bits': 64, 'OS': 'Windows', 'OS-release': '10', 'machine': 'AMD64', 'processor': 'Intel64 Family 6 Model 78 Stepping 3, GenuineIntel', 'byteorder': 'little', 'LC_ALL': 'None', 'LANG': 'None', 'LOCALE': 'None.None'}, 'dependencies': {'pandas': '0.25.1', 'numpy': '1.17.2', 'pytz': '2019.2', 'dateutil': '2.8.0', 'pip': '19.3.1', 'setuptools': '41.4.0', 'Cython': None, 'pytest': '4.3.0', 'hypothesis': None, 'sphinx': None, 'blosc': None, 'feather': None, 'xlsxwriter': '1.2.5', 'lxml.etree': '4.4.1', 'html5lib': None, 'pymysql': '0.9.2', 'psycopg2': None, 'jinja2': '2.10.1', 'IPython': '7.9.0', 'pandas_datareader': None, 'bs4': '4.8.0', 'bottleneck': None, 'fastparquet': None, 'gcsfs': None, 'matplotlib': '3.1.1', 'numexpr': None, 'odfpy': None, 'openpyxl': None, 'pandas_gbq': None, 'pyarrow': None, 'pytables': None, 's3fs': None, 'scipy': '1.3.1', 'sqlalchemy': '1.3.10', 'tables': None, 'xarray': None, 'xlrd': None, 'xlwt': '1.3.0'}}
None
**************************************************
0 a
1 b
2 c
3 e
4 d
dtype: object
0 0
1 1
2 2
3 3
4 4
dtype: int32
a 0
b 1
c 2
e 3
d 4
dtype: int64
mylist = list ( 'abcedfghijklmnopqrstuvwxyz' )
myarr = np. arange( 26 )
mydict = dict ( zip ( mylist, myarr) )
ser = pd. Series( mydict)
df= ser. to_frame( ) . reset_index( )
df. head( )
index
0
0
a
0
1
b
1
2
c
2
3
e
3
4
d
4
ser1 = pd. Series( list ( 'abcedfghijklmnopqrstuvwxyz' ) )
ser2 = pd. Series( np. arange( 26 ) )
df1= pd. concat( [ ser1, ser2] , axis= 1 )
df2= pd. DataFrame( { "col_1" : ser1, "col_2" : ser2} )
print ( df2. head( ) )
col_1 col_2
0 a 0
1 b 1
2 c 2
3 e 3
4 d 4
ser = pd. Series( list ( 'abcedfghijklmnopqrstuvwxyz' ) )
ser. name= "alphabets"
print ( ser. head( ) )
0 a
1 b
2 c
3 e
4 d
Name: alphabets, dtype: object
ser1 = pd. Series( [ 1 , 2 , 3 , 4 , 5 ] )
ser2 = pd. Series( [ 4 , 5 , 6 , 7 , 8 ] )
print ( ser1[ ~ ser1. isin( ser2) ] )
0 1
1 2
2 3
dtype: int64
ser1 = pd. Series( [ 1 , 2 , 3 , 4 , 5 ] )
ser2 = pd. Series( [ 4 , 5 , 6 , 7 , 8 ] )
ser_u = pd. Series( np. union1d( ser1, ser2) )
print ( ser_u)
ser_i = pd. Series( np. intersect1d( ser1, ser2) )
print ( "-" * 50 )
print ( ser_i)
print ( "-" * 50 )
print ( ser_u[ ~ ser_u. isin( ser_i) ] )
0 1
1 2
2 3
3 4
4 5
5 6
6 7
7 8
dtype: int64
--------------------------------------------------
0 4
1 5
dtype: int64
--------------------------------------------------
0 1
1 2
2 3
5 6
6 7
7 8
dtype: int64
state = np. random. RandomState( 100 )
ser = pd. Series( state. normal( 10 , 5 , 25 ) )
print ( ser)
result= np. percentile( ser, q= [ 0 , 25 , 50 , 75 , 100 ] )
print ( result)
0 1.251173
1 11.713402
2 15.765179
3 8.737820
4 14.906604
5 12.571094
6 11.105898
7 4.649783
8 9.052521
9 11.275007
10 7.709865
11 12.175817
12 7.082025
13 14.084235
14 13.363604
15 9.477944
16 7.343598
17 15.148663
18 7.809322
19 4.408409
20 18.094908
21 17.708026
22 8.740604
23 5.787821
24 10.922593
dtype: float64
[ 1.25117263 7.70986507 10.92259345 13.36360403 18.0949083 ]
ser = pd. Series( np. take( list ( 'abcdefgh' ) , np. random. randint( 8 , size= 30 ) ) )
print ( ser. value_counts( ) )
g 7
e 6
c 5
f 5
h 3
a 2
b 2
dtype: int64
# Keep a handle on the seeded generator. A bare `np.random.RandomState(100)`
# call builds a generator and throws it away, leaving np.random unseeded.
state = np.random.RandomState(100)
ser = pd.Series(state.randint(1, 5, 12))
# Labels of the two most frequent values.
top_two = ser.value_counts().index[:2]
# Convert to object dtype explicitly before writing a string into what was an
# integer Series, instead of relying on an implicit upcast.
ser = ser.astype(object)
ser[~ser.isin(top_two)] = "Other"
print(ser)
0 Other
1 1
2 1
3 Other
4 4
5 4
6 Other
7 4
8 Other
9 1
10 4
11 1
dtype: object
ser = pd. Series( np. random. randint( 1 , 10 , 35 ) )
df= pd. DataFrame( ser. values. reshape( 7 , 5 ) )
print ( df)
0 1 2 3 4
0 7 9 2 3 9
1 4 8 9 9 3
2 7 7 8 6 5
3 4 2 8 4 6
4 1 9 4 2 5
5 8 4 8 3 1
6 4 4 5 2 7
ser = pd.Series(np.random.randint(1, 10, 7))
print(ser)
# Hand np.argwhere a plain boolean ndarray rather than a Series, so NumPy does
# not try to re-wrap its 2-D result in a Series.
result = np.argwhere((ser % 3 == 0).to_numpy())
result
0 1
1 6
2 5
3 4
4 9
5 6
6 1
dtype: int32
array([[1],
[4],
[5]], dtype=int64)
ser = pd. Series( list ( 'abcdefghijklmnopqrstuvwxyz' ) )
pos = [ 0 , 4 , 8 , 14 , 20 ]
result= ser. take( pos)
result
0 a
4 e
8 i
14 o
20 u
dtype: object
ser1 = pd.Series(range(5))
ser2 = pd.Series(list('abcde'))
# Vertical stack. The result must be captured, because concatenation never
# modifies its inputs. pd.concat is used instead of Series.append.
stacked = pd.concat([ser1, ser2])
print(stacked)
# Horizontal stack: one column per input Series, aligned on the index.
df = pd.concat([ser1, ser2], axis=1)
df
0 0
1 1
2 2
3 3
4 4
dtype: int64
0
1
0
0
a
1
1
b
2
2
c
3
3
d
4
4
e
ser1 = pd.Series([10, 9, 6, 5, 3, 1, 12, 8, 13])
ser2 = pd.Series([1, 3, 10, 13])
# Approach 1: for each wanted value, take the first position where ser1
# equals it.
result = []
for wanted in ser2:
    matches = np.where(wanted == ser1)[0]
    result.append(matches.tolist()[0])
print(result)
# Approach 2: ask an Index built from ser1 for the location of each value.
lookup = pd.Index(ser1)
result = [lookup.get_loc(wanted) for wanted in ser2]
result
[5, 4, 0, 8]
[5, 4, 0, 8]
truth = pd. Series( range ( 10 ) )
pred = pd. Series( range ( 10 ) ) + np. random. random( 10 )
np. mean( ( truth- pred) ** 2 )
0.42318488444073726
ser = pd. Series( [ 'how' , 'to' , 'kick' , 'ass?' ] )
pd. Series( [ i. title( ) for i in ser] )
0 How
1 To
2 Kick
3 Ass?
dtype: object
ser = pd. Series( [ 'how' , 'to' , 'kick' , 'ass?' ] )
ser. map ( lambda x: len ( x) )
0 3
1 2
2 4
3 4
dtype: int64
ser = pd. Series( [ 1 , 3 , 6 , 10 , 15 , 21 , 27 , 35 ] )
print ( ser. tolist( ) )
print ( ser. diff( ) . tolist( ) )
[1, 3, 6, 10, 15, 21, 27, 35]
[nan, 2.0, 3.0, 4.0, 5.0, 6.0, 6.0, 8.0]
ser = pd. Series( [ '01 Jan 2010' , '02-02-2011' , '20120303' ,
'2013/04/04' , '2014-05-05' , '2015-06-06T12:20' ] )
pd. to_datetime( ser)
0 2010-01-01 00:00:00
1 2011-02-02 00:00:00
2 2012-03-03 00:00:00
3 2013-04-04 00:00:00
4 2014-05-05 00:00:00
5 2015-06-06 12:20:00
dtype: datetime64[ns]
ser = pd.Series(['01 Jan 2010', '02-02-2011', '20120303',
                 '2013/04/04', '2015-11-27', '2019-11-27T12:20'])
from dateutil.parser import parse
# Parse each string with dateutil, then make sure the result is a datetime
# Series so the .dt accessor is available.
ser_ts = pd.to_datetime(ser.map(parse))
# ISO week number: use dt.isocalendar() where it exists, and fall back to
# dt.weekofyear on older pandas releases that lack isocalendar().
if hasattr(ser_ts.dt, "isocalendar"):
    week_of_year = ser_ts.dt.isocalendar().week.tolist()
else:
    week_of_year = ser_ts.dt.weekofyear.tolist()
# dt.day_name() is used for the weekday name instead of dt.weekday_name.
day_names = ser_ts.dt.day_name().tolist()
print("這個月的第幾天: ", ser_ts.dt.day.tolist())
print("這一年的第幾周: ", week_of_year)
print("這一年的第幾天: ", ser_ts.dt.dayofyear.tolist())
print("這一天的星期幾: ", day_names)
這個月的第幾天: [1, 2, 3, 4, 27, 27]
這一年的第幾周: [53, 5, 9, 14, 48, 48]
這一年的第幾天: [1, 33, 63, 94, 331, 331]
這一天的星期幾: ['Friday', 'Wednesday', 'Saturday', 'Thursday', 'Friday', 'Wednesday']
ser = pd. Series( [ 'Jan 2010' , 'Feb 2011' , 'Mar 2012' ] )
from dateutil. parser import parse
ser. map ( lambda x: parse( '04 ' + x) )
0 2010-01-04
1 2011-02-04
2 2012-03-04
dtype: datetime64[ns]
ser = pd. Series( [ 'Apple' , 'Orange' , 'Plan' , 'Python' , 'Money' ] )
from collections import Counter
mask = ser. map ( lambda x: sum ( [ Counter( x. lower( ) ) . get( i, 0 )
for i in list ( 'aeiou' ) ] ) >= 2 )
ser[ mask]
0 Apple
1 Orange
4 Money
dtype: object
import re
emails = pd. Series( [ 'buying books at amazom.com' , '[email protected] ' ,
'[email protected] ' , '[email protected] ' ] )
pattern = '[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,4}'
emails. str . findall( pattern, flags= re. IGNORECASE)
0 []
1 [[email protected] ]
2 [[email protected] ]
3 [[email protected] ]
dtype: object
fruit = pd. Series( np. random. choice( [ 'apple' , 'banana' , 'carrot' ] , 10 ) )
weights = pd. Series( np. linspace( 1 , 10 , 10 ) )
weights. groupby( fruit) . mean( )
apple 5.428571
banana 7.500000
carrot 2.000000
dtype: float64
p = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
q = pd.Series([10, 9, 8, 7, 6, 5, 4, 3, 2, 1])
# Element-wise difference shared by both computations below.
delta = p - q
# By hand: square root of the sum of squared differences.
squared_total = sum(delta ** 2)
result = squared_total ** .5
print(result)
# Same quantity via NumPy's vector norm.
result = np.linalg.norm(delta)
result
18.16590212458495
18.16590212458495
ser = pd.Series([2, 10, 3, 4, 9, 10, 2, 7, 3])
# Sign of each step between neighbours: +1 rising, -1 falling, 0 flat.
slopes = np.sign(np.diff(ser))
# A change of -2 between consecutive slopes is a rise followed by a fall.
dd = np.diff(slopes)
print(dd)
# Shift by one because dd[k] describes the element at position k + 1.
turning = np.where(dd == -2)[0]
peak_locs = turning + 1
print(peak_locs)
[-2 2 0 0 -2 2 -2]
[1 5 7]
my_str = 'dbc deb abed gade'
ser = pd. Series( list ( 'dbc deb abed gade' ) )
freq = ser. value_counts( )
print ( freq)
least_freq = freq. dropna( ) . index[ - 1 ]
result= "" . join( ser. replace( ' ' , least_freq) )
result
d 4
3
e 3
b 3
a 2
g 1
c 1
dtype: int64
'dbccdebcabedcgade'
ser = pd. Series( np. random. randint( 1 , 10 , 10 ) ,
pd. date_range( '2000-01-01' , periods= 10 , freq= 'W-SAT' ) )
print ( ser)
2000-01-01 7
2000-01-08 7
2000-01-15 3
2000-01-22 1
2000-01-29 4
2000-02-05 8
2000-02-12 7
2000-02-19 8
2000-02-26 3
2000-03-04 4
Freq: W-SAT, dtype: int32
ser = pd. Series( [ 1 , 10 , 3 , np. nan] , index= pd. to_datetime( [
'2000-01-01' , '2000-01-03' , '2000-01-06' , '2000-01-08' ] ) )
print ( ser)
result= ser. resample( 'D' ) . ffill( )
print ( result)
2000-01-01 1.0
2000-01-03 10.0
2000-01-06 3.0
2000-01-08 NaN
dtype: float64
2000-01-01 1.0
2000-01-02 1.0
2000-01-03 10.0
2000-01-04 10.0
2000-01-05 10.0
2000-01-06 3.0
2000-01-07 3.0
2000-01-08 NaN
Freq: D, dtype: float64
df = pd. read_csv( 'https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv' , chunksize= 50 )
df2 = pd. concat( [ chunk. iloc[ 0 ] for chunk in df] , axis= 1 )
df2 = df2. transpose( )
df2
crim
zn
indus
chas
nox
rm
age
dis
rad
tax
ptratio
b
lstat
medv
0
0.00632
18.0
2.31
0.0
0.538
6.575
65.2
4.0900
1.0
296.0
15.3
396.90
4.98
24.0
50
0.08873
21.0
5.64
0.0
0.439
5.963
45.7
6.8147
4.0
243.0
16.8
395.56
13.45
19.7
100
0.14866
0.0
8.56
0.0
0.520
6.727
79.9
2.7778
5.0
384.0
20.9
394.76
9.42
27.5
150
1.65660
0.0
19.58
0.0
0.871
6.122
97.3
1.6180
5.0
403.0
14.7
372.80
14.10
21.5
200
0.01778
95.0
1.47
0.0
0.403
7.135
13.9
7.6534
3.0
402.0
17.0
384.30
4.45
32.9
250
0.14030
22.0
5.86
0.0
0.431
6.487
13.0
7.3967
7.0
330.0
19.1
396.28
5.90
24.4
300
0.04417
70.0
2.24
0.0
0.400
6.871
47.4
7.8278
5.0
358.0
14.8
390.86
6.07
24.8
350
0.06211
40.0
1.25
0.0
0.429
6.490
44.4
8.7921
1.0
335.0
19.7
396.90
5.98
22.9
400
25.04610
0.0
18.10
0.0
0.693
5.987
100.0
1.5888
24.0
666.0
20.2
396.90
26.77
5.6
450
6.71772
0.0
18.10
0.0
0.713
6.749
92.6
2.3236
24.0
666.0
20.2
0.32
17.44
13.4
500
0.22438
0.0
9.69
0.0
0.585
6.027
79.7
2.4982
6.0
391.0
19.2
396.90
14.33
16.8
df = pd. read_csv( 'https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv' ,
converters= { 'medv' : lambda x: 'High' if float ( x) > 25 else 'Low' } )
df
crim
zn
indus
chas
nox
rm
age
dis
rad
tax
ptratio
b
lstat
medv
0
0.00632
18.0
2.31
0
0.538
6.575
65.2
4.0900
1
296
15.3
396.90
4.98
Low
1
0.02731
0.0
7.07
0
0.469
6.421
78.9
4.9671
2
242
17.8
396.90
9.14
Low
2
0.02729
0.0
7.07
0
0.469
7.185
61.1
4.9671
2
242
17.8
392.83
4.03
High
3
0.03237
0.0
2.18
0
0.458
6.998
45.8
6.0622
3
222
18.7
394.63
2.94
High
4
0.06905
0.0
2.18
0
0.458
7.147
54.2
6.0622
3
222
18.7
396.90
5.33
High
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
501
0.06263
0.0
11.93
0
0.573
6.593
69.1
2.4786
1
273
21.0
391.99
9.67
Low
502
0.04527
0.0
11.93
0
0.573
6.120
76.7
2.2875
1
273
21.0
396.90
9.08
Low
503
0.06076
0.0
11.93
0
0.573
6.976
91.0
2.1675
1
273
21.0
396.90
5.64
Low
504
0.10959
0.0
11.93
0
0.573
6.794
89.3
2.3889
1
273
21.0
393.45
6.48
Low
505
0.04741
0.0
11.93
0
0.573
6.030
80.8
2.5050
1
273
21.0
396.90
7.88
Low
506 rows × 14 columns
L = pd. Series( range ( 15 ) )
def gen_strides(a, stride_len=5, window_len=5):
    """Return the full-length sliding windows of a 1-D sequence.

    Windows start at positions 0, stride_len, 2 * stride_len, ... and each
    holds ``window_len`` consecutive elements. Only windows that fit entirely
    inside ``a`` are produced.

    Parameters
    ----------
    a : array-like
        One-dimensional data (list, ndarray, pandas Series, ...).
    stride_len : int, default 5
        Distance between the starts of consecutive windows; must be >= 1.
    window_len : int, default 5
        Number of elements in every window; must be >= 1.

    Returns
    -------
    numpy.ndarray
        Array with one row per window. It is empty when ``a`` is shorter than
        ``window_len``.

    Raises
    ------
    ValueError
        If ``stride_len`` or ``window_len`` is smaller than 1.
    """
    if stride_len < 1 or window_len < 1:
        raise ValueError("stride_len and window_len must be positive integers")
    # Work on a plain ndarray so slicing is always positional.
    values = np.asarray(a)
    # Clamp at zero: a negative count would otherwise be read as "all but the
    # last k" by the slice below and produce truncated windows.
    n_strides = max(0, (values.size - window_len) // stride_len + 1)
    starts = np.arange(0, values.size, stride_len)[:n_strides]
    return np.array([values[s:s + window_len] for s in starts])
gen_strides( L, stride_len= 2 , window_len= 4 )
array([[ 0, 1, 2, 3],
[ 2, 3, 4, 5],
[ 4, 5, 6, 7],
[ 6, 7, 8, 9],
[ 8, 9, 10, 11],
[10, 11, 12, 13]], dtype=int64)
df = pd. read_csv( 'https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv' ,
usecols= [ 'crim' , 'medv' ] )
df. head( )
crim
medv
0
0.00632
24.0
1
0.02731
21.6
2
0.02729
34.7
3
0.03237
33.4
4
0.06905
36.2
df = pd. read_csv( 'https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv' )
df. describe( )
Min.Price
Price
Max.Price
MPG.city
MPG.highway
EngineSize
Horsepower
RPM
Rev.per.mile
Fuel.tank.capacity
Passengers
Length
Wheelbase
Width
Turn.circle
Rear.seat.room
Luggage.room
Weight
count
86.000000
91.000000
88.000000
84.000000
91.000000
91.000000
86.000000
90.000000
87.000000
85.000000
91.000000
89.000000
92.000000
87.000000
88.000000
89.000000
74.000000
86.000000
mean
17.118605
19.616484
21.459091
22.404762
29.065934
2.658242
144.000000
5276.666667
2355.000000
16.683529
5.076923
182.865169
103.956522
69.448276
38.954545
27.853933
13.986486
3104.593023
std
8.828290
9.724280
10.696563
5.841520
5.370293
1.045845
53.455204
605.554811
486.916616
3.375748
1.045953
14.792651
6.856317
3.778023
3.304157
3.018129
3.120824
600.129993
min
6.700000
7.400000
7.900000
15.000000
20.000000
1.000000
55.000000
3800.000000
1320.000000
9.200000
2.000000
141.000000
90.000000
60.000000
32.000000
19.000000
6.000000
1695.000000
25%
10.825000
12.350000
14.575000
18.000000
26.000000
1.800000
100.750000
4800.000000
2017.500000
14.500000
4.000000
174.000000
98.000000
67.000000
36.000000
26.000000
12.000000
2647.500000
50%
14.600000
17.700000
19.150000
21.000000
28.000000
2.300000
140.000000
5200.000000
2360.000000
16.500000
5.000000
181.000000
103.000000
69.000000
39.000000
27.500000
14.000000
3085.000000
75%
20.250000
23.500000
24.825000
25.000000
31.000000
3.250000
170.000000
5787.500000
2565.000000
19.000000
6.000000
192.000000
110.000000
72.000000
42.000000
30.000000
16.000000
3567.500000
max
45.400000
61.900000
80.000000
46.000000
50.000000
5.700000
300.000000
6500.000000
3755.000000
27.000000
8.000000
219.000000
119.000000
78.000000
45.000000
36.000000
22.000000
4105.000000
df = pd. read_csv( 'https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv' )
max_price= df. loc[ df. Price == np. max ( df. Price) , [ 'Manufacturer' , 'Model' , 'Type' ] ]
print ( max_price)
row, col = np. where( df. values == np. max ( df. Price) )
print ( row, col)
print ( df. iloc[ row[ 0 ] , col[ 0 ] ] )
print ( df. at[ row[ 0 ] , 'Price' ] )
Manufacturer Model Type
58 Mercedes-Benz 300E Midsize
[58] [4]
61.9
61.9
df = pd. read_csv( 'https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv' )
df. columns = df. columns. map ( lambda x: x. replace( '.' , '_' ) )
print ( df. columns)
Index(['Manufacturer', 'Model', 'Type', 'Min_Price', 'Price', 'Max_Price',
'MPG_city', 'MPG_highway', 'AirBags', 'DriveTrain', 'Cylinders',
'EngineSize', 'Horsepower', 'RPM', 'Rev_per_mile', 'Man_trans_avail',
'Fuel_tank_capacity', 'Passengers', 'Length', 'Wheelbase', 'Width',
'Turn_circle', 'Rear_seat_room', 'Luggage_room', 'Weight', 'Origin',
'Make'],
dtype='object')
df = pd. read_csv( 'https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv' )
df. isnull( ) . values. any ( )
True
df = pd. read_csv( 'https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv' )
count= df. apply ( lambda x: x. isnull( ) . sum ( ) )
print ( count)
print ( count. idxmax( ) )
df = pd.DataFrame(np.arange(20).reshape(-1, 5), columns=list('abcde'))
print(df)
# Selecting column 'a' with a list-like selector yields a DataFrame ...
print(type(df[['a']]))
print(type(df.loc[:, ['a']]))
print(type(df.iloc[:, [0]]))
# ... while a scalar selector yields a Series.
print(type(df.a))
print(type(df['a']))
print(type(df.loc[:, 'a']))
# Position 0 is column 'a', matching every other selector in this cell.
print(type(df.iloc[:, 0]))
a b c d e
0 0 1 2 3 4
1 5 6 7 8 9
2 10 11 12 13 14
3 15 16 17 18 19
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
df = pd. DataFrame( np. arange( 20 ) . reshape( - 1 , 5 ) , columns= list ( 'abcde' ) )
print ( df)
def switch_columns(df, col_1=None, col_2=None):
    """Return a copy of ``df`` with two columns exchanging positions.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    col_1, col_2 : column labels
        Labels of the columns to swap. If a label occurs more than once, its
        first occurrence is used.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns, the two chosen ones swapped.

    Raises
    ------
    ValueError
        If either label is not among the columns.
    """
    colnames = df.columns.tolist()
    i_1, i_2 = colnames.index(col_1), colnames.index(col_2)
    # Reorder by position rather than by label: selecting by a repeated label
    # would return every column carrying that label.
    order = list(range(len(colnames)))
    order[i_1], order[i_2] = i_2, i_1
    return df.iloc[:, order]
df1= switch_columns( df, 'a' , 'c' )
print ( df1)
print ( sorted ( df. columns, reverse= True ) )
a b c d e
0 0 1 2 3 4
1 5 6 7 8 9
2 10 11 12 13 14
3 15 16 17 18 19
c b a d e
0 2 1 0 3 4
1 7 6 5 8 9
2 12 11 10 13 14
3 17 16 15 18 19
['e', 'd', 'c', 'b', 'a']
import pandas as pd
df = pd. read_csv( 'https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv' )
pd. set_option( "display.max_columns" , 10 )
pd. set_option( 'display.max_rows' , 10 )
df
Manufacturer
Model
Type
Min.Price
Price
...
Rear.seat.room
Luggage.room
Weight
Origin
Make
0
Acura
Integra
Small
12.9
15.9
...
26.5
NaN
2705.0
non-USA
Acura Integra
1
NaN
Legend
Midsize
29.2
33.9
...
30.0
15.0
3560.0
non-USA
Acura Legend
2
Audi
90
Compact
25.9
29.1
...
28.0
14.0
3375.0
non-USA
Audi 90
3
Audi
100
Midsize
NaN
37.7
...
31.0
17.0
3405.0
non-USA
Audi 100
4
BMW
535i
Midsize
NaN
30.0
...
27.0
13.0
3640.0
non-USA
BMW 535i
...
...
...
...
...
...
...
...
...
...
...
...
88
Volkswagen
Eurovan
Van
16.6
19.7
...
34.0
NaN
3960.0
NaN
Volkswagen Eurovan
89
Volkswagen
Passat
Compact
17.6
20.0
...
31.5
14.0
2985.0
non-USA
Volkswagen Passat
90
Volkswagen
Corrado
Sporty
22.9
23.3
...
26.0
15.0
2810.0
non-USA
Volkswagen Corrado
91
Volvo
240
Compact
21.8
22.7
...
29.5
14.0
2985.0
non-USA
Volvo 240
92
NaN
850
Midsize
24.8
26.7
...
30.0
15.0
3245.0
non-USA
Volvo 850
93 rows × 27 columns
df = pd. DataFrame( np. random. random( 4 ) , columns= [ 'random' ] )
print ( df. round ( 4 ) )
out= df. style. format ( {
'random' : '{0:.2%}' . format ,
} )
out
random
0 0.8620
1 0.7903
2 0.0159
3 0.5417
<tr>
<th id="T_75726664_11f2_11ea_b9d2_cc2f7187c201level0_row0" class="row_heading level0 row0" >0</th>
<td id="T_75726664_11f2_11ea_b9d2_cc2f7187c201row0_col0" class="data row0 col0" >86.20%</td>
</tr>
<tr>
<th id="T_75726664_11f2_11ea_b9d2_cc2f7187c201level0_row1" class="row_heading level0 row1" >1</th>
<td id="T_75726664_11f2_11ea_b9d2_cc2f7187c201row1_col0" class="data row1 col0" >79.03%</td>
</tr>
<tr>
<th id="T_75726664_11f2_11ea_b9d2_cc2f7187c201level0_row2" class="row_heading level0 row2" >2</th>
<td id="T_75726664_11f2_11ea_b9d2_cc2f7187c201row2_col0" class="data row2 col0" >1.59%</td>
</tr>
<tr>
<th id="T_75726664_11f2_11ea_b9d2_cc2f7187c201level0_row3" class="row_heading level0 row3" >3</th>
<td id="T_75726664_11f2_11ea_b9d2_cc2f7187c201row3_col0" class="data row3 col0" >54.17%</td>
</tr>
</tbody></table>
df = pd. read_csv( 'https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv' )
df. iloc[ : : 20 , : ] [ [ 'Manufacturer' , 'Model' , 'Type' ] ]
Manufacturer
Model
Type
0
Acura
Integra
Small
20
Chrysler
LeBaron
Compact
40
Honda
Prelude
Sporty
60
Mercury
Cougar
Midsize
80
Subaru
Loyale
Small
import pandas as pd
df = pd. read_csv( 'https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv' ,
usecols= [ 0 , 1 , 2 , 3 , 5 ] )
df[ [ 'Manufacturer' , 'Model' , 'Type' ] ] = df[ [ 'Manufacturer' , 'Model' , 'Type' ] ] . fillna( 'missing' )
df. index = df. Manufacturer + '_' + df. Model + '_' + df. Type
print ( df. index. is_unique)
import pandas as pd
import numpy as np
df = pd.DataFrame(np.random.randint(1, 30, 30).reshape(10, -1), columns=list('abc'))
print(df['a'])
print(df['a'].argsort())
# Rank to look up: n = 1 is the largest value, n = 5 the fifth largest.
n = 5
# argsort() returns a Series, and indexing it with an integer is a label
# lookup that ignores the reversal. Reverse the underlying ndarray and index
# that positionally instead.
nth_largest_pos = df['a'].argsort().to_numpy()[::-1][n - 1]
nth_largest_pos
0 25
1 16
2 12
3 8
4 6
5 17
6 15
7 24
8 16
9 28
Name: a, dtype: int32
0 4
1 3
2 2
3 6
4 1
5 8
6 5
7 7
8 0
9 9
Name: a, dtype: int64
8
ser = pd.Series(np.random.randint(1, 100, 15))
print('ser: ', ser.tolist(), 'mean: ', round(ser.mean()))
# Hand np.argwhere a plain boolean ndarray rather than a Series, so NumPy does
# not try to re-wrap its 2-D result in a Series.
above_mean = np.argwhere((ser > ser.mean()).to_numpy())
above_mean
ser: [54, 77, 49, 74, 24, 95, 94, 14, 7, 50, 69, 65, 72, 72, 58] mean: 58.0
array([[ 1],
[ 3],
[ 5],
[ 6],
[10],
[11],
[12],
[13]], dtype=int64)
df = pd.DataFrame(np.random.randint(10, 40, 60).reshape(-1, 4))
# Total of each row.
rowsums = df.sum(axis=1)
# Positions of the rows whose total exceeds 100; only the last two are kept.
big_rows = np.where(rowsums > 100)[0]
print(big_rows[-2:])
last_two_rows = df.iloc[big_rows[-2:], :]
last_two_rows
[11 14]
0
1
2
3
11
27
32
22
32
14
21
35
37
30
ser = pd. Series( np. logspace( - 2 , 2 , 30 ) )
def cap_outliers(ser, low_perc, high_perc):
    """Return ``ser`` with values limited to the range between two quantiles.

    Values below the ``low_perc`` quantile are raised to it and values above
    the ``high_perc`` quantile are lowered to it. The two quantile values are
    also printed.

    Parameters
    ----------
    ser : pandas.Series
        Numeric data; it is left unmodified.
    low_perc, high_perc : float
        Quantile levels expressed as fractions in [0, 1] (e.g. .05 and .95).

    Returns
    -------
    pandas.Series
        A new Series holding the capped values.
    """
    low, high = ser.quantile([low_perc, high_perc])
    print(low_perc, '%ile: ', low, '|', high_perc, '%ile: ', high)
    # clip() builds a new Series, so the caller's data is not overwritten.
    return ser.clip(lower=low, upper=high)
capped_ser = cap_outliers( ser, .05 , .95 )
print ( capped_ser)
0.05 %ile: 0.016049294076965887 | 0.95 %ile: 63.876672220183934
0 0.016049
1 0.016049
2 0.018874
3 0.025929
4 0.035622
5 0.048939
6 0.067234
7 0.092367
8 0.126896
9 0.174333
10 0.239503
11 0.329034
12 0.452035
13 0.621017
14 0.853168
15 1.172102
16 1.610262
17 2.212216
18 3.039195
19 4.175319
20 5.736153
21 7.880463
22 10.826367
23 14.873521
24 20.433597
25 28.072162
26 38.566204
27 52.983169
28 63.876672
29 63.876672
dtype: float64
df = pd.DataFrame(np.random.randint(-20, 50, 100).reshape(10, -1))
# Masking with df > 0 turns non-positive entries into NaN; flatten row by row
# and drop the NaNs to keep only the positive values in their original order.
arr = df[df > 0].values.flatten()
arr_qualified = arr[~np.isnan(arr)]
print(arr_qualified)
# Side length of the largest square that the positive values can fill.
# It is computed before its first use below.
n = int(np.floor(arr_qualified.shape[0] ** .5))
# Positions ordered from the largest value to the smallest, so the first n**2
# entries select the n**2 largest values.
top_indexes = np.argsort(arr_qualified)[::-1]
print(top_indexes[:n ** 2])
# Sorting the chosen positions restores the original order of the values.
output = np.take(arr_qualified, sorted(top_indexes[:n ** 2])).reshape(n, -1)
print(output)
[ 6. 29. 48. 22. 14. 10. 49. 9. 18. 42. 31. 42. 16. 35. 45. 10. 2. 27.
48. 2. 16. 48. 22. 12. 23. 13. 34. 38. 18. 10. 12. 48. 39. 18. 49. 24.
35. 13. 16. 30. 35. 22. 44. 46. 8. 30. 1. 5. 30. 7. 15. 22. 6. 43.
47. 8. 32. 21. 46. 5. 20. 39. 9. 17.]
[46 19 16 59 47 0 52 49 44 55 62 7 5 15 29 30 23 25 37 4 50 12 38 20
63 8 33 28 60 57 22 51 3 41 24 35 17 1 45 39 48 10 56 26 40 13 36 27
32 61 11 9 53 42 14 43 58 54 2 18 21 31 34 6]
[[ 6. 29. 48. 22. 14. 10. 49. 9.]
[18. 42. 31. 42. 16. 35. 45. 10.]
[ 2. 27. 48. 2. 16. 48. 22. 12.]
[23. 13. 34. 38. 18. 10. 12. 48.]
[39. 18. 49. 24. 35. 13. 16. 30.]
[35. 22. 44. 46. 8. 30. 1. 5.]
[30. 7. 15. 22. 6. 43. 47. 8.]
[32. 21. 46. 5. 20. 39. 9. 17.]]
df= pd. DataFrame( np. arange( 25 ) . reshape( 5 , - 1 ) )
print ( df)
def swap_rows(df, i1, i2):
    """Return a copy of ``df`` in which the rows at two positions trade places.

    Only the row contents move; the index labels stay where they were.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is left unmodified.
    i1, i2 : int
        Positions of the two rows to exchange.

    Returns
    -------
    pandas.DataFrame
        New frame with the two rows swapped.

    Raises
    ------
    IndexError
        If a position is outside the frame.
    """
    order = list(range(len(df)))
    order[i1], order[i2] = order[i2], order[i1]
    # Reorder whole rows positionally, which keeps every column's dtype
    # instead of round-tripping rows through an object Series. Then put the
    # original labels back.
    swapped = df.iloc[order]
    swapped.index = df.index
    return swapped
result= swap_rows( df, 1 , 2 )
result
0 1 2 3 4
0 0 1 2 3 4
1 5 6 7 8 9
2 10 11 12 13 14
3 15 16 17 18 19
4 20 21 22 23 24
0
1
2
3
4
0
0
1
2
3
4
1
10
11
12
13
14
2
5
6
7
8
9
3
15
16
17
18
19
4
20
21
22
23
24
df = pd. DataFrame( np. arange( 25 ) . reshape( 5 , - 1 ) )
print ( df)
df. iloc[ : : - 1 , : ]
0 1 2 3 4
0 0 1 2 3 4
1 5 6 7 8 9
2 10 11 12 13 14
3 15 16 17 18 19
4 20 21 22 23 24
0
1
2
3
4
4
20
21
22
23
24
3
15
16
17
18
19
2
10
11
12
13
14
1
5
6
7
8
9
0
0
1
2
3
4
df = pd. DataFrame( np. arange( 25 ) . reshape( 5 , - 1 ) , columns= list ( 'abcde' ) )
print ( df)
result= pd. get_dummies( df[ 'a' ] )
df_onehot= pd. concat( [ result, df[ list ( 'bcde' ) ] ] , axis= 1 )
df_onehot
a b c d e
0 0 1 2 3 4
1 5 6 7 8 9
2 10 11 12 13 14
3 15 16 17 18 19
4 20 21 22 23 24
0
5
10
15
20
b
c
d
e
0
1
0
0
0
0
1
2
3
4
1
0
1
0
0
0
6
7
8
9
2
0
0
1
0
0
11
12
13
14
3
0
0
0
1
0
16
17
18
19
4
0
0
0
0
1
21
22
23
24
df = pd.DataFrame(np.random.randint(1, 100, 40).reshape(10, -1))
print(df)
# Column holding each row's maximum. idxmax states the intent directly, and
# with the default integer column labels the label equals the position.
# It is computed once and reused below.
row_argmax = df.idxmax(axis=1)
print(row_argmax)
# How many rows have their maximum in each column, most frequent first.
argmax_counts = row_argmax.value_counts()
print(argmax_counts)
print(argmax_counts.index[0])
0 1 2 3
0 10 87 19 43
1 5 83 50 80
2 19 24 10 77
3 36 15 95 78
4 8 20 89 48
5 17 17 81 46
6 88 74 52 72
7 91 53 36 61
8 25 53 22 90
9 3 93 86 63
0 1
1 1
2 3
3 2
4 2
5 2
6 0
7 0
8 3
9 1
dtype: int64
2 3
1 3
3 2
0 2
dtype: int64
2
df = pd. DataFrame( np. random. randint( 1 , 100 , 80 ) . reshape( 8 , - 1 ) ,
index= list ( 'abcdefgh' ) , columns= list ( 'pqrstuvwxy' ) )
print ( df)
print ( df. corr( ) )
abs_corrmat = np. abs ( df. corr( ) )
print ( abs_corrmat)
max_corr = abs_corrmat. apply ( lambda x: sorted ( x) [ - 2 ] )
print ( max_corr)
print ( np. round ( max_corr. tolist( ) , 2 ) )
p q r s t u v w x y
a 41 72 5 31 67 26 45 65 21 60
b 15 56 72 91 99 32 38 14 52 36
c 7 92 96 84 26 79 81 12 75 50
d 73 46 42 15 80 76 10 34 45 5
e 15 72 55 14 17 54 9 35 36 18
f 12 73 47 84 85 9 31 67 13 64
g 25 43 56 76 62 43 93 25 53 99
h 80 70 30 68 40 74 2 41 7 47
p q r s t u v \
p 1.000000 -0.348616 -0.606492 -0.406070 0.056932 0.461015 -0.543905
q -0.348616 1.000000 0.213753 0.159442 -0.554585 0.088275 0.023752
r -0.606492 0.213753 1.000000 0.501096 -0.190063 0.290160 0.440666
s -0.406070 0.159442 0.501096 1.000000 0.260183 -0.243481 0.505580
t 0.056932 -0.554585 -0.190063 0.260183 1.000000 -0.568596 -0.009954
u 0.461015 0.088275 0.290160 -0.243481 -0.568596 1.000000 -0.118254
v -0.543905 0.023752 0.440666 0.505580 -0.009954 -0.118254 1.000000
w 0.207508 0.125992 -0.797659 -0.285267 0.192809 -0.562124 -0.358931
x -0.452943 -0.029441 0.798096 0.187066 -0.122999 0.376021 0.637259
y -0.294716 -0.043568 -0.043181 0.579817 0.083632 -0.434149 0.729595
w x y
p 0.207508 -0.452943 -0.294716
q 0.125992 -0.029441 -0.043568
r -0.797659 0.798096 -0.043181
s -0.285267 0.187066 0.579817
t 0.192809 -0.122999 0.083632
u -0.562124 0.376021 -0.434149
v -0.358931 0.637259 0.729595
w 1.000000 -0.835494 0.145546
x -0.835494 1.000000 -0.030812
y 0.145546 -0.030812 1.000000
p q r s t u v \
p 1.000000 0.348616 0.606492 0.406070 0.056932 0.461015 0.543905
q 0.348616 1.000000 0.213753 0.159442 0.554585 0.088275 0.023752
r 0.606492 0.213753 1.000000 0.501096 0.190063 0.290160 0.440666
s 0.406070 0.159442 0.501096 1.000000 0.260183 0.243481 0.505580
t 0.056932 0.554585 0.190063 0.260183 1.000000 0.568596 0.009954
u 0.461015 0.088275 0.290160 0.243481 0.568596 1.000000 0.118254
v 0.543905 0.023752 0.440666 0.505580 0.009954 0.118254 1.000000
w 0.207508 0.125992 0.797659 0.285267 0.192809 0.562124 0.358931
x 0.452943 0.029441 0.798096 0.187066 0.122999 0.376021 0.637259
y 0.294716 0.043568 0.043181 0.579817 0.083632 0.434149 0.729595
w x y
p 0.207508 0.452943 0.294716
q 0.125992 0.029441 0.043568
r 0.797659 0.798096 0.043181
s 0.285267 0.187066 0.579817
t 0.192809 0.122999 0.083632
u 0.562124 0.376021 0.434149
v 0.358931 0.637259 0.729595
w 1.000000 0.835494 0.145546
x 0.835494 1.000000 0.030812
y 0.145546 0.030812 1.000000
p 0.606492
q 0.554585
r 0.798096
s 0.579817
t 0.568596
u 0.568596
v 0.729595
w 0.835494
x 0.835494
y 0.729595
dtype: float64
[0.61 0.55 0.8 0.58 0.57 0.57 0.73 0.84 0.84 0.73]
df = pd.DataFrame(np.random.randint(1, 100, 80).reshape(8, -1))
print(df)
# Smallest and largest value of every row, then their ratio.
row_min = df.min(axis=1)
row_max = df.max(axis=1)
min_by_max = row_min / row_max
min_by_max
0 1 2 3 4 5 6 7 8 9
0 85 63 99 34 13 14 64 33 58 16
1 64 45 77 68 19 45 61 2 11 15
2 78 66 76 51 51 52 20 53 35 64
3 68 85 2 81 52 66 14 28 41 34
4 37 40 99 62 57 70 37 15 14 56
5 13 88 12 51 43 1 54 18 70 67
6 55 19 79 43 19 8 52 6 15 77
7 79 93 54 68 78 61 80 33 72 92
0 0.131313
1 0.025974
2 0.256410
3 0.023529
4 0.141414
5 0.011364
6 0.075949
7 0.354839
dtype: float64
df = pd. DataFrame( np. random. randint( 1 , 100 , 80 ) . reshape( 8 , - 1 ) )
result= df. apply ( lambda x: x. sort_values( ) . unique( ) [ - 2 ] , axis= 1 )
df[ 'penultimate' ] = result
df
0
1
2
3
4
5
6
7
8
9
penultimate
0
50
12
77
25
22
97
49
40
27
18
77
1
14
52
78
3
67
5
77
17
43
53
77
2
92
53
10
39
55
34
63
89
60
41
89
3
9
89
66
50
88
4
46
19
87
75
88
4
97
95
75
50
91
60
65
3
24
59
95
5
31
38
4
81
9
1
52
71
84
57
81
6
59
7
19
33
49
40
54
60
48
4
59
7
90
21
77
44
3
50
98
23
84
30
90
df = pd. DataFrame( np. random. randint( 1 , 100 , 80 ) . reshape( 8 , - 1 ) )
print ( df)
result= df. apply ( lambda x: (
( x - x. mean( ) ) / x. std( )
) . round ( 2 ) )
result
0 1 2 3 4 5 6 7 8 9
0 73 77 53 35 9 80 96 47 35 26
1 58 72 39 80 86 57 41 98 31 90
2 45 76 22 27 5 15 78 90 87 92
3 89 84 97 78 29 70 23 95 97 90
4 55 32 83 49 99 63 22 75 44 26
5 74 42 70 49 57 26 88 77 1 5
6 56 29 42 28 75 16 21 11 38 50
7 99 26 74 74 39 50 61 3 23 3
0
1
2
3
4
5
6
7
8
9
0
0.24
0.90
-0.28
-0.79
-1.16
1.31
1.36
-0.40
-0.30
-0.57
1
-0.57
0.70
-0.83
1.24
1.03
0.39
-0.41
0.96
-0.42
1.10
2
-1.28
0.86
-1.51
-1.15
-1.28
-1.28
0.78
0.75
1.32
1.15
3
1.10
1.18
1.47
1.15
-0.59
0.91
-0.99
0.88
1.63
1.10
4
-0.74
-0.92
0.91
-0.16
1.40
0.63
-1.02
0.35
-0.02
-0.57
5
0.29
-0.52
0.40
-0.16
0.20
-0.84
1.10
0.40
-1.35
-1.11
6
-0.68
-1.04
-0.71
-1.10
0.72
-1.24
-1.05
-1.36
-0.20
0.06
7
1.64
-1.16
0.56
0.97
-0.31
0.11
0.23
-1.57
-0.67
-1.17
df = pd. DataFrame( np. random. randint( 1 , 100 , 80 ) . reshape( 8 , - 1 ) )
print ( df)
result = df. apply ( lambda x: (
1 - ( x. max ( ) - x) / ( x. max ( ) - x. min ( ) )
) . round ( 2 ) )
result
0 1 2 3 4 5 6 7 8 9
0 78 7 59 4 77 81 93 66 39 28
1 60 51 88 19 23 29 70 82 10 24
2 2 80 7 59 72 51 82 28 38 25
3 36 88 3 8 43 7 87 60 28 99
4 29 69 89 84 87 15 95 87 75 54
5 82 78 60 57 15 29 41 93 57 13
6 72 28 63 2 20 25 6 72 71 32
7 60 2 13 87 82 97 41 23 81 16
0
1
2
3
4
5
6
7
8
9
0
0.95
0.06
0.65
0.02
0.86
0.82
0.98
0.61
0.41
0.17
1
0.72
0.57
0.99
0.20
0.11
0.24
0.72
0.84
0.00
0.13
2
0.00
0.91
0.05
0.67
0.79
0.49
0.85
0.07
0.39
0.14
3
0.43
1.00
0.00
0.07
0.39
0.00
0.91
0.53
0.25
1.00
4
0.34
0.78
1.00
0.96
1.00
0.09
1.00
0.91
0.92
0.48
5
1.00
0.88
0.66
0.65
0.00
0.24
0.39
1.00
0.66
0.00
6
0.88
0.30
0.70
0.00
0.07
0.20
0.00
0.70
0.86
0.22
7
0.72
0.00
0.12
1.00
0.93
1.00
0.39
0.00
1.00
0.03
df = pd.DataFrame(np.random.randint(1, 100, 80).reshape(8, -1))
print(df)
# Every row position except the last: each one is paired with the row below.
row_pairs = list(range(df.shape[0] - 1))
print(row_pairs)
# Correlation between each row and the row that follows it.
result = [df.iloc[upper].corr(df.iloc[upper + 1]) for upper in row_pairs]
result
0 1 2 3 4 5 6 7 8 9
0 27 3 37 9 76 68 91 31 44 7
1 11 15 20 47 33 86 65 47 9 30
2 39 1 72 19 35 42 87 77 55 40
3 60 7 8 28 37 14 17 5 3 7
4 47 99 76 28 77 57 32 57 24 16
5 2 50 95 89 84 46 59 84 1 2
6 78 27 58 67 78 1 7 28 89 20
7 12 86 54 81 20 19 77 1 8 56
[0, 1, 2, 3, 4, 5, 6]
[0.5182965633327684,
0.2595376913412023,
-0.23874062518280761,
0.005261734793477499,
0.4687394611664755,
-0.06555011633952691,
-0.30907671467693215]
df = pd. DataFrame( np. random. randint( 1 , 100 , 100 ) . reshape( 10 , - 1 ) )
rows= df. shape[ 0 ]
for i in range ( rows) :
df. iat[ i, i] = 0
df. iat[ rows- i- 1 , i] = 0
df
0
1
2
3
4
5
6
7
8
9
0
0
65
92
82
10
1
51
71
32
0
1
79
0
11
99
28
68
24
8
0
83
2
34
4
0
35
11
91
83
0
41
29
3
84
72
5
0
65
76
0
25
25
64
4
98
14
2
10
0
0
2
94
40
84
5
75
8
8
27
0
0
23
62
73
95
6
23
43
38
0
36
43
0
7
65
6
7
80
96
0
82
92
79
64
0
61
67
8
29
0
96
96
76
21
94
72
0
4
9
0
26
27
65
95
19
19
1
90
0
df = pd. DataFrame( { 'col1' : [ 'apple' , 'banana' , 'orange' ] * 3 ,
'col2' : np. random. rand( 9 ) ,
'col3' : np. random. randint( 0 , 15 , 9 ) } )
print ( df)
df. groupby( df[ 'col1' ] ) . get_group( 'apple' )
col1 col2 col3
0 apple 0.703158 12
1 banana 0.535815 13
2 orange 0.177147 8
3 apple 0.159570 2
4 banana 0.411271 10
5 orange 0.279007 11
6 apple 0.576264 4
7 banana 0.578607 9
8 orange 0.242959 6
col1
col2
col3
0
apple
0.703158
12
3
apple
0.159570
2
6
apple
0.576264
4
df = pd. DataFrame( { 'fruit' : [ 'apple' , 'banana' , 'orange' ] * 4 ,
'taste' : np. random. rand( 12 ) ,
'price' : np. random. randint( 0 , 15 , 12 ) } )
banana= df[ 'taste' ] . groupby( df[ 'fruit' ] ) . get_group( 'banana' )
print ( banana)
print ( "特定結果:" , banana. sort_values( ) . iloc[ - 2 ] )
df
1 0.209485
4 0.549818
7 0.498802
10 0.006632
Name: taste, dtype: float64
特定結果: 0.4988018517868045
fruit
taste
price
0
apple
0.510446
7
1
banana
0.209485
1
2
orange
0.632166
1
3
apple
0.865764
4
4
banana
0.549818
9
5
orange
0.744718
5
6
apple
0.069171
0
7
banana
0.498802
14
8
orange
0.011808
2
9
apple
0.103222
13
10
banana
0.006632
6
11
orange
0.017787
13
df = pd. DataFrame( { 'fruit' : [ 'apple' , 'banana' , 'orange' ] * 3 ,
'rating' : np. random. rand( 9 ) ,
'price' : np. random. randint( 0 , 15 , 9 ) } )
print ( df)
out = df. groupby( df[ 'fruit' ] , as_index= False ) [ 'price' ] . mean( )
out
fruit rating price
0 apple 0.090672 2
1 banana 0.019506 0
2 orange 0.354463 5
3 apple 0.466694 14
4 banana 0.807733 8
5 orange 0.488868 4
6 apple 0.640913 8
7 banana 0.977691 8
8 orange 0.390033 0
fruit
price
0
apple
8.000000
1
banana
5.333333
2
orange
3.000000
df1 = pd. DataFrame( { 'fruit' : [ 'apple' , 'banana' , 'orange' ] * 3 ,
'weight' : [ 'high' , 'medium' , 'low' ] * 3 ,
'price' : np. random. randint( 0 , 15 , 9 ) } )
df2 = pd. DataFrame( { 'pazham' : [ 'apple' , 'orange' , 'pine' ] * 2 ,
'pounds' : [ 'high' , 'low' ] * 3 ,
'price' : np. random. randint( 0 , 15 , 6 ) } )
print ( df1)
print ( df2)
pd. merge( df1, df2, how= 'inner' ,
left_on= [ 'fruit' , 'weight' ] ,
right_on= [ 'pazham' , 'pounds' ] ,
suffixes= [ '_left' , '_right' ] )
fruit weight price
0 apple high 13
1 banana medium 5
2 orange low 14
3 apple high 2
4 banana medium 2
5 orange low 6
6 apple high 7
7 banana medium 1
8 orange low 6
pazham pounds price
0 apple high 10
1 orange low 14
2 pine high 12
3 apple low 12
4 orange high 11
5 pine low 6
fruit
weight
price_left
pazham
pounds
price_right
0
apple
high
13
apple
high
10
1
apple
high
2
apple
high
10
2
apple
high
7
apple
high
10
3
orange
low
14
orange
low
14
4
orange
low
6
orange
low
14
5
orange
low
6
orange
low
14
df = pd. DataFrame( { 'fruit1' : np. random. choice( [ 'apple' , 'orange' , 'banana' ] , 10 ) ,
'fruit2' : np. random. choice( [ 'apple' , 'orange' , 'banana' ] , 10 ) } )
print ( df)
np. where( df. fruit1 == df. fruit2) [ 0 ]
fruit1 fruit2
0 orange orange
1 orange apple
2 orange apple
3 banana banana
4 apple banana
5 orange orange
6 orange banana
7 banana orange
8 apple banana
9 orange banana
array([0, 3, 5], dtype=int64)
df = pd. DataFrame( np. random. randint( 1 , 100 , 20 ) . reshape( - 1 , 4 ) ,
columns = list ( 'abcd' ) )
print ( df)
df[ 'a_lag' ] = df[ 'a' ] . shift( 1 )
print ( df)
df[ 'b_lead' ] = df[ 'b' ] . shift( - 1 )
df
a b c d
0 31 79 72 32
1 8 18 82 25
2 98 23 41 79
3 2 87 74 76
4 16 89 12 86
a b c d a_lag
0 31 79 72 32 NaN
1 8 18 82 25 31.0
2 98 23 41 79 8.0
3 2 87 74 76 98.0
4 16 89 12 86 2.0
a
b
c
d
a_lag
b_lead
0
31
79
72
32
NaN
18.0
1
8
18
82
25
31.0
23.0
2
98
23
41
79
8.0
87.0
3
2
87
74
76
98.0
89.0
4
16
89
12
86
2.0
NaN
df = pd.DataFrame(np.random.randint(1, 10, 20).reshape(-1, 4),
                  columns=list('abcd'))
# Flatten every cell of the frame into one 1-D array.
ravel = df.values.ravel()
print(ravel)
# Count occurrences with the Series method rather than the top-level
# pd.value_counts helper.
counts = pd.Series(ravel).value_counts()
counts
[6 1 6 8 7 8 3 4 1 8 7 7 1 1 9 8 5 8 5 5]
8 5
1 4
7 3
5 3
6 2
9 1
4 1
3 1
dtype: int64
df = pd.DataFrame(["STD, City,State",
                   "33, Kolkata,West Bengal",
                   "44, Chennai,Tamil Nadu",
                   "40, Hyderabad ,Telengana",
                   "80, Bangalore,Karnataka"],
                  columns=['row'])
# Split each line on commas into separate columns, then trim the blanks left
# around the individual fields (e.g. " City", "Hyderabad ").
df_out = df.row.str.split(",", expand=True)
df_out = df_out.apply(lambda col: col.str.strip())
# Promote the first row to the header. Going through a plain list keeps that
# row's label (0) from becoming the name of the columns index.
df_out.columns = df_out.iloc[0].tolist()
# Drop the header row from the data; the remaining rows keep labels 1..4.
df_out = df_out.iloc[1:]
print(df_out)
0 STD City State
1 33 Kolkata West Bengal
2 44 Chennai Tamil Nadu
3 40 Hyderabad Telengana
4 80 Bangalore Karnataka