08: Ensemble learning
In this exercise we will practice the following elements of ensemble learning:
- Voting classifiers
- Comparison of hard and soft voting classifiers
- Bootstrap aggregating
- Feature importance
In [1]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
np.random.seed(23)
Voting classifier
In [2]:
# generate the training dataset
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
X, y = make_moons(n_samples=500, noise=0.3, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
In [3]:
print(X.shape)
print(y.shape)
(500, 2)
(500,)
In [4]:
# classes
np.unique(y)
Out[4]:
array([0, 1])
Hard voting classifier
In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

log_reg = LogisticRegression(random_state=42)
rnd_for = RandomForestClassifier(n_estimators=10, random_state=42)
svm_svc = SVC(random_state=42)

voting = VotingClassifier(
    estimators=[('lr', log_reg), ('rf', rnd_for), ('svc', svm_svc)],
    voting='hard'
)
voting.fit(X_train, y_train)
Out[6]:
VotingClassifier(estimators=[('lr', LogisticRegression(random_state=42)), ('rf', RandomForestClassifier(n_estimators=10, random_state=42)), ('svc', SVC(random_state=42))])
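Hard voting takes the majority of the base models' predicted labels. A minimal sketch of what voting='hard' computes for our two-class problem, assuming the three fitted base models from the cell above (the names preds and majority are ours):

preds = np.stack([m.predict(X_test) for m in (log_reg, rnd_for, svm_svc)])
# with labels 0/1 and three voters, the majority class is 1 iff at least 2 models vote for it
majority = (preds.sum(axis=0) >= 2).astype(int)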
In [7]:
from sklearn.metrics import accuracy_score

models = (log_reg, rnd_for, svm_svc, voting)
fig, axes = plt.subplots(1, len(models))
fig.set_figwidth(15)
for i, clf in enumerate(models):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    axes[i].plot(X_test[(y_pred==0),0], X_test[(y_pred==0),1], 'r.')
    axes[i].plot(X_test[(y_pred==1),0], X_test[(y_pred==1),1], 'b.')
    axes[i].set_title(clf.__class__.__name__)
    print(clf.__class__.__name__,': ', accuracy_score(y_test, y_pred))
Soft voting classifier
In [8]:
log_reg = LogisticRegression(random_state=42)
rnd_for = RandomForestClassifier(n_estimators=10, random_state=42)
# probability=True enables predict_proba, which soft voting requires
svm_svc = SVC(probability=True, random_state=42)

# voting='soft'
voting_clf = VotingClassifier(
    estimators=[('lr', log_reg), ('rf', rnd_for), ('svc', svm_svc)],
    voting='soft'
)
voting_clf.fit(X_train, y_train)
Out[8]:
VotingClassifier(estimators=[('lr', LogisticRegression(random_state=42)), ('rf', RandomForestClassifier(n_estimators=10, random_state=42)), ('svc', SVC(probability=True, random_state=42))], voting='soft')
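Soft voting instead averages the class probabilities of the base models and predicts the argmax, which is why the SVC must be created with probability=True. A rough equivalent of what it computes (variable names are ours):

# average the predicted class probabilities over the three fitted models
probas = np.mean([m.predict_proba(X_test) for m in (log_reg, rnd_for, svm_svc)], axis=0)
soft_pred = probas.argmax(axis=1)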
In [9]:
models = (log_reg, rnd_for, svm_svc, voting_clf)
fig, axes = plt.subplots(1, len(models))
fig.set_figwidth(15)
for i, clf in enumerate(models):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    axes[i].plot(X_test[(y_pred==0),0], X_test[(y_pred==0),1], 'r.')
    axes[i].plot(X_test[(y_pred==1),0], X_test[(y_pred==1),1], 'b.')
    axes[i].set_title(clf.__class__.__name__)
    print(clf.__class__.__name__,': ', accuracy_score(y_test, y_pred))
Bootstrap aggregating (Bagging)
In [10]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(random_state=42)

# bagging: each tree trained on samples drawn with replacement
bagging = BaggingClassifier(
    dt, n_estimators=500, max_samples=100, bootstrap=True, n_jobs=-1,
    random_state=42
)
# pasting: samples drawn without replacement
pasting = BaggingClassifier(
    dt, n_estimators=500, max_samples=100, bootstrap=False, n_jobs=-1,
    random_state=42
)
bagging.fit(X_train, y_train)
pasting.fit(X_train, y_train)
Out[10]:
BaggingClassifier(bootstrap=False, estimator=DecisionTreeClassifier(random_state=42), max_samples=100, n_estimators=500, n_jobs=-1, random_state=42)
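Each estimator in the bagging ensemble is trained on a bootstrap sample: max_samples=100 points drawn from the training set with replacement. One such resample could be sketched as follows (rng, idx, X_boot and y_boot are illustrative names):

rng = np.random.default_rng(42)
idx = rng.integers(0, len(X_train), size=100)  # draw 100 indices with replacement
X_boot, y_boot = X_train[idx], y_train[idx]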
In [11]:
models = (dt, bagging, pasting)
fig, axes = plt.subplots(1, len(models))
fig.set_figwidth(15)
for i, clf in enumerate(models):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    axes[i].plot(X_test[(y_pred==0),0], X_test[(y_pred==0),1], 'r.')
    axes[i].plot(X_test[(y_pred==1),0], X_test[(y_pred==1),1], 'b.')
    axes[i].set_title(clf.__class__.__name__)
    print(clf.__class__.__name__,': ', accuracy_score(y_test, y_pred))
Random forests
In [12]:
from sklearn.ensemble import RandomForestClassifier

rnd_for = RandomForestClassifier(
    n_estimators=500, max_samples=100, n_jobs=-1, max_features=1.0,
    random_state=42
)
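With max_features=1.0 every split considers all features, so this forest behaves essentially like the bagged trees above. The extra decorrelation of a typical random forest comes from considering a random feature subset at each split; a hedged variant for comparison (rnd_for_sqrt is our name):

rnd_for_sqrt = RandomForestClassifier(
    n_estimators=500, max_samples=100, n_jobs=-1, max_features='sqrt',
    random_state=42
)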
In [13]:
models = (bagging, rnd_for)
fig, axes = plt.subplots(1, len(models))
fig.set_figwidth(15)
for i, clf in enumerate(models):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    axes[i].plot(X_test[(y_pred==0),0], X_test[(y_pred==0),1], 'r.')
    axes[i].plot(X_test[(y_pred==1),0], X_test[(y_pred==1),1], 'b.')
    axes[i].set_title(clf.__class__.__name__)
    print(clf.__class__.__name__,': ', accuracy_score(y_test, y_pred))
Out-of-bag accuracy
In [14]:
bagging = BaggingClassifier(
    dt, n_estimators=500, max_samples=100, bootstrap=True, n_jobs=-1,
    oob_score=True, random_state=42
)
bagging.fit(X_train, y_train)
print(bagging.oob_score_)
0.9253333333333333
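Each tree sees only its bootstrap sample, so the remaining ("out-of-bag") training points give a free validation estimate. It should track the held-out test accuracy, which we can check (output not shown here):

from sklearn.metrics import accuracy_score
# compare the OOB estimate above with accuracy on the held-out test set
print(accuracy_score(y_test, bagging.predict(X_test)))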
Feature importance
In [15]:
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1)
In [16]:
mnist.target
Out[16]:
0        5
1        0
2        4
3        1
4        9
        ..
69995    2
69996    3
69997    4
69998    5
69999    6
Name: class, Length: 70000, dtype: category
Categories (10, object): ['0', '1', '2', '3', ..., '6', '7', '8', '9']
In [17]:
mnist.target = mnist.target.astype(np.int64)
In [18]:
mnist.data.head()
Out[18]:
| | pixel1 | pixel2 | pixel3 | pixel4 | pixel5 | pixel6 | pixel7 | pixel8 | pixel9 | pixel10 | ... | pixel775 | pixel776 | pixel777 | pixel778 | pixel779 | pixel780 | pixel781 | pixel782 | pixel783 | pixel784 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 784 columns
In [20]:
rnd_for = RandomForestClassifier(n_estimators=10, random_state=42)
rnd_for.fit(mnist["data"], mnist["target"])
Out[20]:
RandomForestClassifier(n_estimators=10, random_state=42)
In [21]:
plt.imshow(rnd_for.feature_importances_.reshape(28, 28), cmap=matplotlib.cm.gray, interpolation="nearest")
# plt.axis("off")
cbar = plt.colorbar(ticks=[rnd_for.feature_importances_.min(), rnd_for.feature_importances_.max()])
cbar.ax.set_yticklabels(['Not important', 'Important'])
plt.show()
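The plot shows each pixel's importance on the 28×28 grid; typically the central pixels dominate while the border pixels carry almost no information. To read the same information numerically, one could list the most important pixels (top10 is our name):

# indices of the ten most important pixels, in descending importance
top10 = np.argsort(rnd_for.feature_importances_)[::-1][:10]
print(top10)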
AdaBoost
In [22]:
from sklearn.ensemble import AdaBoostClassifier
In [23]:
ada = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1), algorithm='SAMME', n_estimators=100, learning_rate=0.5,
    random_state=42
)
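AdaBoost fits the depth-1 trees (decision stumps) sequentially, reweighting the training points after each round so later stumps focus on the misclassified ones. To see how test accuracy evolves with the number of boosting rounds, scikit-learn exposes staged_predict (accs is our name):

# accuracy after each boosting round
ada.fit(X_train, y_train)
accs = [accuracy_score(y_test, y_pred) for y_pred in ada.staged_predict(X_test)]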
In [24]:
models = (rnd_for, ada)
fig, axes = plt.subplots(1, len(models))
fig.set_figwidth(15)
for i, clf in enumerate(models):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    axes[i].plot(X_test[(y_pred==0),0], X_test[(y_pred==0),1], 'r.')
    axes[i].plot(X_test[(y_pred==1),0], X_test[(y_pred==1),1], 'b.')
    axes[i].set_title(clf.__class__.__name__)
    print(clf.__class__.__name__,': ', accuracy_score(y_test, y_pred))