# 一、足球運動員分析 (Part 1: football player analysis)
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
# Plot with the SimHei font so Chinese labels can be rendered, and disable the
# Unicode minus sign so negative tick labels still draw with that font.
mpl.rcParams["font.family"] = "SimHei"
mpl.rcParams["axes.unicode_minus"] = False

# Load the raw player table.
player = pd.read_csv(r'FullData.csv')

# Use the fully qualified option name: the bare 'max_columns' pattern also
# matches 'styler.render.max_columns' on newer pandas and raises OptionError.
pd.set_option('display.max_columns', 100)

# Bare expressions below only show output in an interactive/notebook session.
player.head()

# Keep only rows that have a club position. Take an explicit copy because
# later steps assign new values to columns of this frame; without it pandas
# emits SettingWithCopyWarning.
player = player[player['Club_Position'].notnull()].copy()

# Quick sanity checks: dtypes / null counts, summary statistics, duplicates.
player.info()
player.describe()
player.duplicated().any()
# Strip the unit suffix from each measurement column and store plain integers.
for column, unit in (('Height', 'cm'), ('Weight', 'kg')):
    player[column] = [int(value.replace(unit, '')) for value in player[column]]

# Kernel density estimate of height, weight and rating on one set of axes.
player[['Height', 'Weight', 'Rating']].plot.kde()

# Count players per preferred foot (shown in a notebook), then chart it.
foot_counts = player['Preffered_Foot'].value_counts()
foot_counts
foot_counts.plot.bar()
# Rating summary (row count, total, average) per club, restricted to clubs
# with more than 20 rows, ordered by average rating (highest first).
club_stats = player.groupby('Club')['Rating'].agg(['count', 'sum', 'mean'])
s = club_stats.loc[club_stats['count'] > 20]
s.sort_values(by='mean', ascending=False)

# Same summary per nationality; only the ten best averages are displayed.
nation_stats = player.groupby('Nationality')['Rating'].agg(['count', 'sum', 'mean'])
s = nation_stats.loc[nation_stats['count'] > 20]
s.sort_values(by='mean', ascending=False).head(10)
# Joining year = last '/'-separated field of Club_Joining.
# NOTE(review): assumes every remaining row has a parseable date there — a
# missing value becomes the string 'nan' and fails the int conversion.
year = player['Club_Joining'].map(lambda x: str(x).split('/')[-1])

# `np.int` was only an alias of the built-in int; it was deprecated in
# NumPy 1.20 and removed in 1.24, so use the built-in directly.
year = year.astype(int)

# Players who joined at least 5 years before 2017 and are not free agents.
t = player[(2017 - year >= 5) & (player['Club'] != 'Free Agents')]

# Ten clubs with the most such long-tenure players.
t['Club'].value_counts().head(10).plot(kind='bar')
# Split Birth_Date on '/' into separate columns and chart how often each
# value of the first field occurs across all players.
# NOTE(review): the first field is presumably the month — confirm the date
# format used in the CSV.
t = player['Birth_Date'].str.split(pat='/', expand=True)
first_field_counts = t[0].value_counts()
first_field_counts.plot.bar()
# Restrict to players rated 80 or above ...
t = player[player['Rating'] >= 80]

# ... and split *their* birth dates. The original split player['Birth_Date']
# here, which discarded the rating filter and re-plotted every player.
t = t['Birth_Date'].str.split('/', expand=True)

# Frequency of the first date field among the highly rated players.
# NOTE(review): the first field is presumably the month — confirm the date
# format used in the CSV.
t[0].value_counts().plot(kind='bar')
# Drop rows whose club position is 'Sub' or 'Res'.
# NOTE(review): presumably substitutes and reserves — confirm.
excluded_positions = ['Sub', 'Res']
t = player[~player['Club_Position'].isin(excluded_positions)]

# Number of players for every (Club_Kit, Club_Position) pair; chart only the
# pairs that occur more than 50 times.
x = t.groupby(['Club_Kit', 'Club_Position']).size()
x[x > 50].plot.bar()
# Height against weight for every player.
player.plot.scatter(x='Height', y='Weight')

# Pairwise correlation of the numeric columns. Select them explicitly: since
# pandas 2.0 DataFrame.corr() no longer drops non-numeric columns by default
# and raises on the string columns of this table.
player.select_dtypes(include='number').corr()
# Average GK_Positioning score for each club position, as a bar chart.
g = player.groupby('Club_Position')
gk_positioning_mean = g['GK_Positioning'].mean()
gk_positioning_mean.plot.bar()
# Labels for the four age bands (0, 20], (20, 30], (30, 40], (40, 100].
age_labels = ['小', '中', '大', '很大']

# Work on an explicit copy so that overwriting 'Age' below does not write
# into a slice of `player` (which triggers SettingWithCopyWarning).
t = player[['Age', 'Rating']].copy()
t['Age'] = pd.cut(player['Age'], bins=[0, 20, 30, 40, 100], labels=age_labels)

# Mean rating per age band. observed=False keeps every band on the axis even
# if it has no players, and states the grouping behaviour explicitly instead
# of relying on the deprecated implicit default for categorical keys.
# One tick per band: the original listed five tick positions for four bands.
t.groupby('Age', observed=False)['Rating'].mean().plot(
    kind='line', xticks=list(range(len(age_labels))), marker='o')
# 二、Facebook營銷組合分類 (Part 2: Facebook marketing-mix classification)
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
def load_data(path):
    """Read a header-less CSV and return encoded features and raw labels.

    Every column except the last is treated as a feature and the last column
    as the label. Each feature column is integer-encoded with LabelEncoder
    and the result is one-hot encoded.

    Args:
        path: Location of the CSV file (read without a header row).

    Returns:
        A ``(features, labels)`` tuple: ``features`` is a DataFrame of
        one-hot columns named ``c0``, ``c1``, ...; ``labels`` is the
        untouched last column of the file.
    """
    frame = pd.read_csv(path, header=None)
    raw_features = frame.iloc[:, :-1]
    labels = frame.iloc[:, -1]

    # The same encoder object is re-fitted on each column in turn.
    label_encoder = LabelEncoder()
    coded = raw_features.apply(label_encoder.fit_transform)

    one_hot = OneHotEncoder().fit_transform(coded).toarray()
    features = pd.DataFrame(one_hot)
    # Give the positional columns string names: c0, c1, ...
    features.columns = [f'c{position}' for position in features.columns]
    return features, labels
def train_input_fn(features, labels):
    """Build the training dataset from feature columns and labels.

    Args:
        features: Mapping-like table of feature columns (converted with
            ``dict``).
        labels: Labels aligned with ``features``.

    Returns:
        A ``tf.data.Dataset`` of ``(feature_dict, label)`` slices, shuffled
        with a 10000-element buffer and seed 0, repeated 10 times and
        grouped into batches of 50.
    """
    slices = (dict(features), labels)
    return (
        tf.data.Dataset.from_tensor_slices(slices)
        .shuffle(10000, seed=0)
        .repeat(10)
        .batch(50)
    )
def eval_input_fn(features, labels=None):
    """Build the evaluation / prediction dataset.

    Args:
        features: Mapping-like table of feature columns (converted with
            ``dict``).
        labels: Optional labels. When omitted the dataset yields features
            only.

    Returns:
        A ``tf.data.Dataset`` in batches of 100, yielding either feature
        dicts or ``(feature_dict, label)`` pairs.
    """
    feature_map = dict(features)
    inputs = feature_map if labels is None else (feature_map, labels)
    return tf.data.Dataset.from_tensor_slices(inputs).batch(100)
# Load the encoded data set and hold out 25% of it for evaluation.
x, y = load_data(r'data.csv')
train_x, test_x, train_y, test_y = train_test_split(
    x, y, test_size=0.25, random_state=0)

# One numeric feature column per one-hot input column.
my_feature_columns = [
    tf.feature_column.numeric_column(key=name) for name in train_x.keys()
]

# Fully connected classifier: two hidden layers of 512 units, 10 output
# classes, trained with SGD.
# NOTE(review): tf.estimator / tf.feature_column are deprecated in recent
# TensorFlow 2.x releases — confirm the TensorFlow version this targets.
classifier = tf.estimator.DNNClassifier(
    feature_columns=my_feature_columns,
    hidden_units=[512, 512],
    n_classes=10,
    optimizer="SGD",
)

# Input functions are wrapped in lambdas so they are invoked lazily.
classifier.train(input_fn=lambda: train_input_fn(train_x, train_y))
classifier.evaluate(input_fn=lambda: eval_input_fn(test_x, test_y))