# Stacked bar chart: for every column whose name contains 'length' or
# 'width', show the per-class mean as one segment of a single bar.
fig, ax = plt.subplots(figsize=(6, 6))
bar_width = .8
labels = [col for col in df.columns if 'length' in col or 'width' in col]
ver_y = [df[df['class'] == 'Iris-versicolor'][col].mean() for col in labels]
vir_y = [df[df['class'] == 'Iris-virginica'][col].mean() for col in labels]
set_y = [df[df['class'] == 'Iris-setosa'][col].mean() for col in labels]
x = np.arange(len(labels))

# A stacked segment has to start where the segments beneath it end.
# Versicolor is the base, setosa sits on versicolor, and virginica sits on
# versicolor + setosa.  Using set_y alone as virginica's base makes the
# segments overlap and hides most of the virginica bar.
vir_bottom = np.add(ver_y, set_y)

# align='center' is passed explicitly so bars are centred on x regardless of
# the matplotlib version's default alignment.
ax.bar(x, vir_y, bar_width, bottom=vir_bottom, color='darkgrey', align='center')
ax.bar(x, set_y, bar_width, bottom=ver_y, color='white', align='center')
ax.bar(x, ver_y, bar_width, color='black', align='center')

# Bars are centred on x, so the tick marks belong at x itself.
ax.set_xticks(x)
ax.set_xticklabels(labels, rotation=-70, fontsize=12)
ax.set_title('Mean Feature Measurement By Class', y=1.01)
# Legend entries follow the order of the ax.bar calls above.
ax.legend(['Virginica', 'Setosa', 'Versicolor'])
import seaborn as sns

# Grid of pairwise plots over the columns of df, with points coloured by the
# value in the 'class' column.
sns.pairplot(data=df, hue='class')
import statsmodels.api as sm

# Ordinary least squares fit of sepal length against sepal width, restricted
# to the first 50 rows of df.
# NOTE(review): presumably those rows are a single class in the usual iris
# file ordering -- confirm against how df is loaded.
y = df['sepal length'][:50]
x = df['sepal width'][:50]

# add_constant appends a constant column so the model includes an intercept.
X = sm.add_constant(x)
result = sm.OLS(y, X).fit()
print(result.summary())
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Train a small random forest and score it on a held-out 30% split.
clf = RandomForestClassifier(max_depth=5, n_estimators=10)

# The first four columns are the features; the fifth column is the label.
X = df.iloc[:, :4]
y = df.iloc[:, 4]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# One row per test sample: the predicted label next to the true label.
rf = pd.DataFrame(list(zip(y_pred, y_test)), columns=['predicted', 'actual'])
# 1 where the prediction matches the true label, 0 otherwise (vectorized
# comparison instead of a row-wise apply).
rf['correct'] = (rf['predicted'] == rf['actual']).astype(int)
rf  # bare expression: displays the table when run as a notebook cell

# Accuracy = fraction of test rows predicted correctly.
rf['correct'].mean()
# Importance score of each feature as reported by the fitted classifier, the
# matching column names, and the standard deviation of each feature's
# importance across the individual estimators.
f_importance = clf.feature_importances_
f_name = df.columns[:4]
per_estimator = [tree.feature_importances_ for tree in clf.estimators_]
f_std = np.std(per_estimator, axis=0)

# Order the (importance, name, std) triples from most to least important.
zz = zip(f_importance, f_name, f_std)
zzs = sorted(zz, key=lambda triple: triple[0], reverse=True)

# Split the sorted triples back into three parallel lists.
imps, label, errs = map(list, zip(*zzs))

# Bar per feature with the across-estimator spread drawn as error bars.
positions = range(len(f_importance))
plt.bar(positions, imps, color="r", yerr=errs, align="center")
plt.xticks(positions, label)
from sklearn.multiclass import OneVsOneClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

# Train a one-vs-one linear SVM and compare its predictions with the true
# labels on a held-out 30% split.
clf = OneVsOneClassifier(SVC(kernel='linear'))

# The first four columns are the features; the fifth column is the label,
# converted to a plain string array.
X = df.iloc[:, :4]
y = np.array(df.iloc[:, 4]).astype(str)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# One row per test sample: the predicted label next to the true label.
rf = pd.DataFrame(list(zip(y_pred, y_test)), columns=['predicted', 'actual'])
# 1 where the prediction matches the true label, 0 otherwise (vectorized
# comparison instead of a row-wise apply).
rf['correct'] = (rf['predicted'] == rf['actual']).astype(int)
rf  # bare expression: displays the table when run as a notebook cell