pandas基础(2.分析MovieLens 1M 数据)


  1. 电影
  2. 用户
  3. 用户对电影的评分



import pandas as pd


# Column names for users.dat, which is read without a header row.
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
# '::' is a multi-character separator, which only pandas' Python parsing
# engine supports. Requesting that engine explicitly avoids the ParserWarning
# emitted when pandas has to fall back from the default C engine.
users = pd.read_table('ml-1m/users.dat', sep='::', header=None,
                      names=unames, engine='python')



user_id	gender	age	occupation	zip
0	1	F	1	10	48067
1	2	M	56	16	70072
2	3	M	25	15	55117
3	4	M	45	7	02460
4	5	M	25	20	55455


# Column names for ratings.dat (no header row in the file).
rating_names = ['user_id', 'movie_id', 'rating', 'timestamp']
# The multi-character '::' separator needs the Python parsing engine; naming
# it explicitly avoids the fallback ParserWarning.
ratings = pd.read_table('ml-1m/ratings.dat', sep='::', header=None,
                        names=rating_names, engine='python')

# Column names for movies.dat (no header row in the file).
movie_names = ['movie_id', 'title', 'genres']
# The MovieLens 1M movies.dat file stores accented title characters as
# Latin-1 (ISO-8859-1), so decoding it with the default UTF-8 codec raises
# UnicodeDecodeError on Python 3.
movies = pd.read_table('ml-1m/movies.dat', sep='::', header=None,
                       names=movie_names, engine='python',
                       encoding='latin-1')

user_id	movie_id	rating	timestamp
0	1	1193	5	978300760
1	1	661	3	978302109
2	1	914	3	978301968
3	1	3408	4	978300275
4	1	2355	5	978824291

movie_id	title	genres
0	1	Toy Story (1995)	Animation|Children's|Comedy
1	2	Jumanji (1995)	Adventure|Children's|Fantasy
2	3	Grumpier Old Men (1995)	Comedy|Romance
3	4	Waiting to Exhale (1995)	Comedy|Drama
4	5	Father of the Bride Part II (1995)	Comedy

合并三个表的数据,操作类似于SQL中inner join

# Inner-join the three tables. users and ratings share only the user_id
# column, and the joined result shares only movie_id with movies, so these
# are the keys pandas would pick implicitly.
data = (
    users
    .merge(ratings, how='inner', on='user_id')
    .merge(movies, how='inner', on='movie_id')
)


user_id	gender	age	occupation	zip	movie_id	rating	timestamp	title	genres
0	1	F	1	10	48067	1193	5	978300760	One Flew Over the Cuckoo's Nest (1975)	Drama
1	2	M	56	16	70072	1193	5	978298413	One Flew Over the Cuckoo's Nest (1975)	Drama
2	12	M	25	12	32793	1193	4	978220179	One Flew Over the Cuckoo's Nest (1975)	Drama
3	15	M	25	7	22903	1193	4	978199279	One Flew Over the Cuckoo's Nest (1975)	Drama
4	17	M	50	1	95350	1193	5	978158471	One Flew Over the Cuckoo's Nest (1975)	Drama
5	18	F	18	3	95825	1193	4	978156168	One Flew Over the Cuckoo's Nest (1975)	Drama
6	19	M	1	10	48073	1193	5	982730936	One Flew Over the Cuckoo's Nest (1975)	Drama
7	24	F	25	7	10023	1193	5	978136709	One Flew Over the Cuckoo's Nest (1975)	Drama
8	28	F	25	1	14607	1193	3	978125194	One Flew Over the Cuckoo's Nest (1975)	Drama
9	33	M	45	3	55421	1193	5	978557765	One Flew Over the Cuckoo's Nest (1975)	Drama



<class 'pandas.core.frame.DataFrame'>

筛选 user_id == 1 的全部数据(相当于 SQL 中的 SELECT ... WHERE user_id = 1)

# Boolean-mask selection: keep every row whose user_id equals 1.
data.loc[data['user_id'] == 1]

	user_id	gender	age	occupation	zip	movie_id	rating	timestamp	title	genres
0	1	F	1	10	48067	1193	5	978300760	One Flew Over the Cuckoo's Nest (1975)	Drama
1725	1	F	1	10	48067	661	3	978302109	James and the Giant Peach (1996)	Animation|Children's|Musical
2250	1	F	1	10	48067	914	3	978301968	My Fair Lady (1964)	Musical|Romance
2886	1	F	1	10	48067	3408	4	978300275	Erin Brockovich (2000)	Drama
4201	1	F	1	10	48067	2355	5	978824291	Bug's Life, A (1998)	Animation|Children's|Comedy
5904	1	F	1	10	48067	1197	3	978302268	Princess Bride, The (1987)	Action|Adventure|Comedy|Romance
8222	1	F	1	10	48067	1287	5	978302039	Ben-Hur (1959)	Action|Adventure|Drama
8926	1	F	1	10	48067	2804	5	978300719	Christmas Story, A (1983)	Comedy|Drama
10278	1	F	1	10	48067	594	4	978302268	Snow White and the Seven Dwarfs (1937)	Animation|Children's|Musical
11041	1	F	1	10	48067	919	4	978301368	Wizard of Oz, The (1939)	Adventure|Children's|Drama|Musical
12759	1	F	1	10	48067	595	5	978824268	Beauty and the Beast (1991)	Animation|Children's|Musical
13819	1	F	1	10	48067	938	4	978301752	Gigi (1958)	Musical
14006	1	F	1	10	48067	2398	4	978302281	Miracle on 34th Street (1947)	Drama
14386	1	F	1	10	48067	2918	4	978302124	Ferris Bueller's Day Off (1986)	Comedy
15859	1	F	1	10	48067	1035	5	978301753	Sound of Music, The (1965)	Musical
16741	1	F	1	10	48067	2791	4	978302188	Airplane! (1980)	Comedy
18472	1	F	1	10	48067	2687	3	978824268	Tarzan (1999)	Animation|Children's
18914	1	F	1	10	48067	2018	4	978301777	Bambi (1942)	Animation|Children's
19503	1	F	1	10	48067	3105	5	978301713	Awakenings (1990)	Drama
20183	1	F	1	10	48067	2797	4	978302039	Big (1988)	Comedy|Fantasy
21674	1	F	1	10	48067	2321	3	978302205	Pleasantville (1998)	Comedy
22832	1	F	1	10	48067	720	3	978300760	Wallace & Gromit: The Best of Aardman Animatio...	Animation
23270	1	F	1	10	48067	1270	5	978300055	Back to the Future (1985)	Comedy|Sci-Fi
25853	1	F	1	10	48067	527	5	978824195	Schindler's List (1993)	Drama|War
28157	1	F	1	10	48067	2340	3	978300103	Meet Joe Black (1998)	Romance
28501	1	F	1	10	48067	48	5	978824351	Pocahontas (1995)	Animation|Children's|Musical|Romance
28883	1	F	1	10	48067	1097	4	978301953	E.T. the Extra-Terrestrial (1982)	Children's|Drama|Fantasy|Sci-Fi
31152	1	F	1	10	48067	1721	4	978300055	Titanic (1997)	Drama|Romance
32698	1	F	1	10	48067	1545	4	978824139	Ponette (1996)	Drama
32771	1	F	1	10	48067	745	3	978824268	Close Shave, A (1995)	Animation|Comedy|Thriller
33428	1	F	1	10	48067	2294	4	978824291	Antz (1998)	Animation|Children's
34073	1	F	1	10	48067	3186	4	978300019	Girl, Interrupted (1999)	Drama
34504	1	F	1	10	48067	1566	4	978824330	Hercules (1997)	Adventure|Animation|Children's|Comedy|Musical
34973	1	F	1	10	48067	588	4	978824268	Aladdin (1992)	Animation|Children's|Comedy|Musical
36324	1	F	1	10	48067	1907	4	978824330	Mulan (1998)	Animation|Children's
36814	1	F	1	10	48067	783	4	978824291	Hunchback of Notre Dame, The (1996)	Animation|Children's|Musical
37204	1	F	1	10	48067	1836	5	978300172	Last Days of Disco, The (1998)	Drama
37339	1	F	1	10	48067	1022	5	978300055	Cinderella (1950)	Animation|Children's|Musical
37916	1	F	1	10	48067	2762	4	978302091	Sixth Sense, The (1999)	Thriller
40375	1	F	1	10	48067	150	5	978301777	Apollo 13 (1995)	Drama
41626	1	F	1	10	48067	1	5	978824268	Toy Story (1995)	Animation|Children's|Comedy
43703	1	F	1	10	48067	1961	5	978301590	Rain Man (1988)	Drama
45033	1	F	1	10	48067	1962	4	978301753	Driving Miss Daisy (1989)	Drama
45685	1	F	1	10	48067	2692	4	978301570	Run Lola Run (Lola rennt) (1998)	Action|Crime|Romance
46757	1	F	1	10	48067	260	4	978300760	Star Wars: Episode IV - A New Hope (1977)	Action|Adventure|Fantasy|Sci-Fi
49748	1	F	1	10	48067	1028	5	978301777	Mary Poppins (1964)	Children's|Comedy|Musical
50759	1	F	1	10	48067	1029	5	978302205	Dumbo (1941)	Animation|Children's|Musical
51327	1	F	1	10	48067	1207	4	978300719	To Kill a Mockingbird (1962)	Drama
52255	1	F	1	10	48067	2028	5	978301619	Saving Private Ryan (1998)	Action|Drama|War
54908	1	F	1	10	48067	531	4	978302149	Secret Garden, The (1993)	Children's|Drama
55246	1	F	1	10	48067	3114	4	978302174	Toy Story 2 (1999)	Animation|Children's|Comedy
56831	1	F	1	10	48067	608	4	978301398	Fargo (1996)	Crime|Drama|Thriller
59344	1	F	1	10	48067	1246	4	978302091	Dead Poets Society (1989)	Drama

values = 'rating':要聚合的数据列(评分)
index = 'title':行索引为电影名
columns = 'gender':列索引为性别
aggfunc = 'mean':聚合方式为求平均值

# Mean rating of every title, with one column per gender value.
ratings_by_gender = data.pivot_table(
    index='title',
    columns='gender',
    values='rating',
    aggfunc='mean',
)

gender								F			M
$1,000,000 Duck (1971)				3.375000	2.761905
'Night Mother (1986)				3.388889	3.352941
'Til There Was You (1997)			2.675676	2.733333
'burbs, The (1989)					2.793478	2.962085
...And Justice for All (1979)		3.828571	3.689024
1-900 (1994)						2.000000	3.000000
10 Things I Hate About You (1999)	3.646552	3.311966
101 Dalmatians (1961)				3.791444	3.500000
101 Dalmatians (1996)				3.240000	2.911215
12 Angry Men (1957)					4.184397	4.328421


# diff = F mean minus M mean: positive where the F column is higher,
# negative where the M column is higher.
ratings_by_gender['diff'] = ratings_by_gender['F'].sub(ratings_by_gender['M'])

gender								F			M			diff
$1,000,000 Duck (1971)				3.375000	2.761905	0.613095
'Night Mother (1986)				3.388889	3.352941	0.035948
'Til There Was You (1997)			2.675676	2.733333	-0.057658
'burbs, The (1989)					2.793478	2.962085	-0.168607
...And Justice for All (1979)		3.828571	3.689024	0.139547
1-900 (1994)						2.000000	3.000000	-1.000000
10 Things I Hate About You (1999)	3.646552	3.311966	0.334586
101 Dalmatians (1961)				3.791444	3.500000	0.291444
101 Dalmatians (1996)				3.240000	2.911215	0.328785
12 Angry Men (1957)					4.184397	4.328421	-0.144024



gender								F	M			diff
Tigrero: A Film That Was Ne...		1.0	4.333333	-3.333333
Neon Bible, The (1995)				1.0	4.000000	-3.000000
Enfer, L' (1994)					1.0	3.750000	-2.750000
Stalingrad (1993)					1.0	3.593750	-2.593750
Killer: A Journal of Murder (1995)	1.0	3.428571	-2.428571
Dangerous Ground (1997)				1.0	3.333333	-2.333333
In God's Hands (1998)				1.0	3.333333	-2.333333
Rosie (1998)						1.0	3.333333	-2.333333
Flying Saucer, The (1950)			1.0	3.300000	-2.300000
Jamaica Inn (1939)					1.0	3.142857	-2.142857


# Number of rating rows per title (index: title, value: row count).
ratings_by_title = data.groupby(by='title').size()

$1,000,000 Duck (1971)                37
'Night Mother (1986)                  70
'Til There Was You (1997)             52
'burbs, The (1989)                   303
...And Justice for All (1979)        199
1-900 (1994)                           2
10 Things I Hate About You (1999)    700
101 Dalmatians (1961)                565
101 Dalmatians (1996)                364
12 Angry Men (1957)                  616
dtype: int64



American Beauty (1999)                                   3428
Star Wars: Episode IV - A New Hope (1977)                2991
Star Wars: Episode V - The Empire Strikes Back (1980)    2990
Star Wars: Episode VI - Return of the Jedi (1983)        2883
Jurassic Park (1993)                                     2672
Saving Private Ryan (1998)                               2653
Terminator 2: Judgment Day (1991)                        2649
Matrix, The (1999)                                       2590
Back to the Future (1985)                                2583
Silence of the Lambs, The (1991)                         2578
dtype: int64


# Overall mean rating per title; the result is a DataFrame whose single
# column is 'rating'.
mean_ratings = data.pivot_table(
    index='title',
    values='rating',
    aggfunc='mean',
)

$1,000,000 Duck (1971)				3.027027
'Night Mother (1986)				3.371429
'Til There Was You (1997)			2.692308
'burbs, The (1989)					2.910891
...And Justice for All (1979)		3.713568
1-900 (1994)						2.500000
10 Things I Hate About You (1999)	3.422857
101 Dalmatians (1961)				3.596460
101 Dalmatians (1996)				3.046703
12 Angry Men (1957)					4.295455


# The ten titles with the most ratings, ordered by descending rating count.
top_10_hop = ratings_by_title.sort_values(ascending=False).head(10)
# Mean rating of those ten titles. This lookup is what yields the float64
# series named 'rating' printed below; top_10_hop itself holds int64 counts.
mean_ratings['rating'][top_10_hop.index]

American Beauty (1999)                                   4.317386
Star Wars: Episode IV - A New Hope (1977)                4.453694
Star Wars: Episode V - The Empire Strikes Back (1980)    4.292977
Star Wars: Episode VI - Return of the Jedi (1983)        4.022893
Jurassic Park (1993)                                     3.763847
Saving Private Ryan (1998)                               4.337354
Terminator 2: Judgment Day (1991)                        4.058513
Matrix, The (1999)                                       4.315830
Back to the Future (1985)                                3.990321
Silence of the Lambs, The (1991)                         4.351823
Name: rating, dtype: float64


# The twenty titles with the highest mean rating.
top_20_score = mean_ratings.sort_values(by='rating', ascending=False).head(20)
# How many ratings each of those titles received. This lookup is what yields
# the int64 counts printed below, and it shows that many top-scoring titles
# were rated only a handful of times.
ratings_by_title[top_20_score.index]

Ulysses (Ulisse) (1954)                                                   1
Lured (1947)                                                              1
Follow the Bitch (1998)                                                   1
Bittersweet Motel (2000)                                                  1
Song of Freedom (1936)                                                    1
One Little Indian (1973)                                                  1
Smashing Time (1967)                                                      2
Schlafes Bruder (Brother of Sleep) (1995)                                 1
Gate of Heavenly Peace, The (1995)                                        3
Baby, The (1973)                                                          1
I Am Cuba (Soy Cuba/Ya Kuba) (1964)                                       5
Lamerica (1994)                                                           8
Apple, The (Sib) (1998)                                                   9
Sanjuro (1962)                                                           69
Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954)     628
Shawshank Redemption, The (1994)                                       2227
Godfather, The (1972)                                                  2223
Close Shave, A (1995)                                                   657
Usual Suspects, The (1995)                                             1783
Schindler's List (1993)                                                2304
dtype: int64


# Keep only the titles that received more than 1000 ratings.
hot_movies = ratings_by_title.loc[ratings_by_title.gt(1000)]

2001: A Space Odyssey (1968)    1716
Abyss, The (1989)               1715
African Queen, The (1951)       1057
Air Force One (1997)            1076
Airplane! (1980)                1731
Aladdin (1992)                  1351
Alien (1979)                    2024
Aliens (1986)                   1820
Amadeus (1984)                  1382
American Beauty (1999)          3428
dtype: int64


# Mean rating of each title in hot_movies, selected by title label.
hot_moviews_rating = mean_ratings['rating'].loc[hot_movies.index]



Shawshank Redemption, The (1994)                                               4.554558
Godfather, The (1972)                                                          4.524966
Usual Suspects, The (1995)                                                     4.517106
Schindler's List (1993)                                                        4.510417
Raiders of the Lost Ark (1981)                                                 4.477725
Rear Window (1954)                                                             4.476190
Star Wars: Episode IV - A New Hope (1977)                                      4.453694
Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963)    4.449890
Casablanca (1942)                                                              4.412822
Sixth Sense, The (1999)                                                        4.406263
Name: rating, dtype: float64


# Per-title mean rating, one column per gender value.
rating_by_gender = data.pivot_table(
    values='rating', index='title', columns='gender', aggfunc='mean',
)

# F mean minus M mean for every title.
rating_by_gender['diff'] = rating_by_gender['F'] - rating_by_gender['M']
# Drop titles with a missing value in any column, then order the rest by
# ascending diff.
rating_by_gender = rating_by_gender.dropna().sort_values('diff')
gender F M diff
Tigrero: A Film That Was Never Made (1994) 1.000000 4.333333 -3.333333
Neon Bible, The (1995) 1.000000 4.000000 -3.000000
Enfer, L' (1994) 1.000000 3.750000 -2.750000
Stalingrad (1993) 1.000000 3.593750 -2.593750
Killer: A Journal of Murder (1995) 1.000000 3.428571 -2.428571
... ... ... ...
Woman of Paris, A (1923) 5.000000 2.428571 2.571429
Babyfever (1994) 3.666667 1.000000 2.666667
Country Life (1994) 5.000000 2.000000 3.000000
James Dean Story, The (1957) 4.000000 1.000000 3.000000
Spiders, The (Die Spinnen, 1. Teil: Der Goldene See) (1919) 4.000000 1.000000 3.000000



Rocky Horror Picture Show, The (1975)          0.512885
Mary Poppins (1964)                            0.467147
Gone with the Wind (1939)                      0.440471
Full Monty, The (1997)                         0.352481
Little Mermaid, The (1989)                     0.343561
Predator (1987)                               -0.406793
Airplane! (1980)                              -0.407854
South Park: Bigger, Longer and Uncut (1999)   -0.424206
Reservoir Dogs (1992)                         -0.444642
Animal House (1978)                           -0.538286
Name: diff, Length: 207, dtype: float64
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.