Ensemble Learning
Contents:
1. A Simple Ensemble Implementation (1.1 Importing the data / 1.2 Splitting the data / 1.3 Logistic regression / 1.4 SVM / 1.5 Decision tree / 1.6 A manual majority-vote ensemble / 1.7 Using VotingClassifier)
2. Soft Voting
3. Ensemble Learning with Bagging (3.1 Bagging / 3.2 OOB / 3.3 n_jobs / 3.4 bootstrap_features)
4. Random Forests
5. Extra-Trees (Extremely Randomized Trees)
6. AdaBoost
7. Gradient Boosting
8. Stacking
1. A Simple Ensemble Implementation
1.1 Importing the data
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
X,y = datasets.make_moons(n_samples = 500,noise=0.3,random_state=42)
plt.scatter(X[y==0,0],X[y==0,1])
plt.scatter(X[y==1,0],X[y==1,1])
plt.show()
1.2 Splitting the data
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42)
1.3 Training a logistic regression
from sklearn.linear_model import LogisticRegression
log_clf = LogisticRegression()
log_clf.fit(X_train,y_train)
log_clf.score(X_test,y_test)
1.4 Training an SVM
from sklearn.svm import SVC
svm_clf = SVC()
svm_clf.fit(X_train,y_train)
svm_clf.score(X_test,y_test)
1.5 Training a decision tree
from sklearn.tree import DecisionTreeClassifier
dt_clf = DecisionTreeClassifier()
dt_clf.fit(X_train,y_train)
dt_clf.score(X_test,y_test)
1.6 A manual majority-vote ensemble
y_predict1 = log_clf.predict(X_test)
y_predict2 = svm_clf.predict(X_test)
y_predict3 = dt_clf.predict(X_test)
# majority vote: predict class 1 when at least two of the three models do
y_predict = np.array((y_predict1 + y_predict2 + y_predict3) >= 2,dtype='int')
y_predict[:10]
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_predict)
The ensemble's accuracy is higher than that of any of the three individual classifiers.
1.7 Using VotingClassifier
from sklearn.ensemble import VotingClassifier
voting_clf = VotingClassifier(estimators=[
('log_clf',LogisticRegression()),
('svm',SVC()),
('dt_clf',DecisionTreeClassifier())
],voting="hard")
# hard voting: majority rule over the predicted labels
voting_clf.fit(X_train,y_train)
voting_clf.score(X_test,y_test)
The accuracy score is higher still.
2. Soft Voting
Hard voting takes a majority vote over the predicted labels, so every model counts equally. Soft voting instead averages the predicted class probabilities and predicts the class with the highest mean probability, so more confident models carry more weight. This requires every base estimator to expose predict_proba; for SVC that means passing probability=True.
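To see how the two rules can disagree, here is a toy example (the numbers are illustrative): five models output the following probabilities for class 1 on a single sample.
import numpy as np

p1 = np.array([0.99, 0.49, 0.40, 0.90, 0.30])    # hypothetical P(class 1) from five models

hard_vote = int((p1 > 0.5).sum() > len(p1) / 2)  # only 2 of 5 models predict class 1 -> 0
soft_vote = int(p1.mean() > 0.5)                 # mean probability 0.616 -> 1
print(hard_vote, soft_vote)                      # 0 1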
Hard Voting Classifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
voting_clf = VotingClassifier(estimators=[
('log_clf',LogisticRegression()),
('svm',SVC()),
('dt_clf',DecisionTreeClassifier(random_state=666))
],voting="hard")
# hard voting: majority rule over the predicted labels
voting_clf.fit(X_train,y_train)
voting_clf.score(X_test,y_test)
Soft Voting Classifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
voting_clf2 = VotingClassifier(estimators=[
('log_clf',LogisticRegression()),
('svm',SVC(probability=True)),  # probability=True enables predict_proba
('dt_clf',DecisionTreeClassifier(random_state=666))
],voting="soft")
# soft voting: average the predicted class probabilities
voting_clf2.fit(X_train,y_train)
voting_clf2.score(X_test,y_test)
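With equal weights, soft voting is just the argmax of the averaged predict_proba outputs of the fitted base models; a quick sanity check against voting_clf2:
probas = np.mean([clf.predict_proba(X_test) for clf in voting_clf2.estimators_],axis=0)
manual_pred = np.argmax(probas,axis=1)
(manual_pred == voting_clf2.predict(X_test)).all()  # True: matches the built-in soft vote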
3. Ensemble Learning with Bagging
Voting can only combine a handful of different model types. Bagging instead builds many instances of a single base learner, trains each on a random bootstrap sample (drawn with replacement), and aggregates their predictions.
3.1 Bagging
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
X,y = datasets.make_moons(n_samples = 500,noise=0.3,random_state=42)
plt.scatter(X[y==0,0],X[y==0,1])
plt.scatter(X[y==1,0],X[y==1,1])
plt.show()
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42)
bagging_clf = BaggingClassifier(DecisionTreeClassifier(),n_estimators=500,  # 500 trees
max_samples=100,    # each tree trains on a bootstrap sample of 100 points
bootstrap = True)   # sample with replacement
bagging_clf.fit(X_train,y_train)
bagging_clf.score(X_test,y_test)
bagging_clf = BaggingClassifier(DecisionTreeClassifier(),n_estimators=5000,max_samples=100,
bootstrap = True)
bagging_clf.fit(X_train,y_train)
bagging_clf.score(X_test,y_test)
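For contrast, bootstrap=False draws each subset without replacement, a variant usually called pasting; a minimal sketch:
# pasting: each sub-model sees a subset drawn without replacement
pasting_clf = BaggingClassifier(DecisionTreeClassifier(),n_estimators=500,max_samples=100,
bootstrap = False)
pasting_clf.fit(X_train,y_train)
pasting_clf.score(X_test,y_test)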
3.2 OOB
When a bootstrap sample has the same size as the training set, each sub-model misses about 37% (1/e) of the training points on average. These out-of-bag (OOB) points act as a built-in validation set, so we can fit on all of the data and still read off an unbiased accuracy via oob_score_:
bagging_clf = BaggingClassifier(DecisionTreeClassifier(),n_estimators=500,max_samples=100,
bootstrap = True,oob_score=True)  # record out-of-bag accuracy
bagging_clf.fit(X,y)
bagging_clf.oob_score_
3.3 n_jobs
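Because every sub-model is trained independently, bagging parallelizes trivially; n_jobs sets the number of CPU cores to use, and -1 means all of them. A minimal timing sketch (absolute numbers depend on your machine):
import time

start = time.time()
BaggingClassifier(DecisionTreeClassifier(),n_estimators=500,max_samples=100,
bootstrap = True).fit(X,y)            # single core (default n_jobs=1)
print('n_jobs=1 :', time.time() - start)

start = time.time()
BaggingClassifier(DecisionTreeClassifier(),n_estimators=500,max_samples=100,
bootstrap = True,n_jobs=-1).fit(X,y)  # all cores
print('n_jobs=-1:', time.time() - start)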
3.4 bootstrap_features
Bagging can also randomize which features each sub-model sees, via bootstrap_features=True (with max_features controlling how many are drawn).
# Random Subspaces: max_samples=500 keeps every training point,
# so only the features are randomly sampled
random_subspace_clf = BaggingClassifier(DecisionTreeClassifier(),n_estimators=500,max_samples=500,
bootstrap = True,oob_score=True,n_jobs=-1,bootstrap_features = True)
random_subspace_clf.fit(X,y)
random_subspace_clf.oob_score_
# Random Patches: sample both the training points (max_samples=100) and the features
random_patches_clf = BaggingClassifier(DecisionTreeClassifier(),n_estimators=500,max_samples=100,
bootstrap = True,oob_score=True,n_jobs=-1,bootstrap_features = True)
random_patches_clf.fit(X,y)
random_patches_clf.oob_score_
Random Subspaces samples only the features; Random Patches samples both the training points and the features.
4. Random Forests
A random forest is bagging over decision trees in which every split considers only a random subset of the features, which further decorrelates the trees. sklearn provides it directly:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
X,y = datasets.make_moons(n_samples = 500,noise=0.3,random_state=42)
plt.figure(figsize=(16,8))
plt.scatter(X[y==0,0],X[y==0,1])
plt.scatter(X[y==1,0],X[y==1,1])
plt.show()
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(n_estimators=500,random_state=666,oob_score=True,n_jobs=-1)
rf_clf.fit(X,y)
rf_clf.oob_score_
Random forests expose the usual decision-tree hyperparameters; for example, we can cap tree size with max_leaf_nodes:
rf_clf2 = RandomForestClassifier(n_estimators=500,max_leaf_nodes=16,random_state=666,oob_score=True,n_jobs=-1)
rf_clf2.fit(X,y)
rf_clf2.oob_score_
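Conceptually, a random forest is close to a BaggingClassifier over trees that draw a random feature subset at every split. A minimal sketch of that correspondence (an approximation, not RandomForestClassifier's exact internals):
rf_like_clf = BaggingClassifier(
    DecisionTreeClassifier(max_features='sqrt'),  # random feature subset per split
    n_estimators=500,bootstrap=True,oob_score=True,n_jobs=-1,random_state=666)
rf_like_clf.fit(X,y)
rf_like_clf.oob_score_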
5. Extra-Trees (Extremely Randomized Trees)
Extra-Trees go one step further than random forests: split thresholds are drawn at random instead of searched for, which trades a little extra bias for lower variance and much faster training.
from sklearn.ensemble import ExtraTreesClassifier
et_clf = ExtraTreesClassifier(n_estimators=500,bootstrap=True,random_state=666,oob_score=True)
et_clf.fit(X,y)
et_clf.oob_score_
Ensemble learning also solves regression problems.
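Each classifier above has a regressor counterpart with an analogous API (AdaBoostRegressor and GradientBoostingRegressor exist as well). A minimal sketch on made-up 1-D data:
from sklearn.ensemble import BaggingRegressor,RandomForestRegressor,ExtraTreesRegressor

X_reg = np.random.uniform(-3,3,size=(200,1))   # illustrative data: a noisy sine curve
y_reg = np.sin(X_reg).ravel() + np.random.normal(0,0.1,200)

rf_reg = RandomForestRegressor(n_estimators=500,oob_score=True,n_jobs=-1)
rf_reg.fit(X_reg,y_reg)
rf_reg.oob_score_                              # R^2 on the out-of-bag samples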
6. AdaBoost
AdaBoost builds the ensemble sequentially: each new weak learner is trained on a re-weighted version of the data that emphasizes the points the previous learners misclassified.
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
X,y = datasets.make_moons(n_samples = 500,noise=0.3,random_state=666)
plt.figure(figsize=(16,8))
plt.scatter(X[y==0,0],X[y==0,1])
plt.scatter(X[y==1,0],X[y==1,1])
plt.show()
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=666)
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
ada_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2),n_estimators=500)
ada_clf.fit(X_train,y_train)
ada_clf.score(X_test,y_test)
7. Gradient Boosting
Gradient boosting is also sequential, but each new tree is fit to the residual errors of the ensemble built so far. In sklearn the base learner is always a regression tree, so only tree parameters such as max_depth are exposed:
from sklearn.ensemble import GradientBoostingClassifier
gb_clf = GradientBoostingClassifier(max_depth=2,n_estimators=30)
gb_clf.fit(X_train,y_train)
gb_clf.score(X_test,y_test)
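A handy property of boosting is that the ensemble's prediction exists after every stage; staged_predict lets us watch test accuracy evolve as trees are added, a common way to choose n_estimators. A minimal sketch:
from sklearn.metrics import accuracy_score

staged_acc = [accuracy_score(y_test,y_pred) for y_pred in gb_clf.staged_predict(X_test)]
plt.plot(range(1,len(staged_acc) + 1),staged_acc)
plt.xlabel('number of trees')
plt.ylabel('test accuracy')
plt.show()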
8. Stacking
Stacking replaces voting's fixed combination rule with a learned one: the base models' predictions become the input features of a second-level meta-learner. To avoid leakage, the meta-learner is trained on out-of-fold predictions of the base models.
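The original notes stop at the idea; sklearn 0.22+ ships a StackingClassifier that implements it. A minimal sketch reusing the same three base models:
from sklearn.ensemble import StackingClassifier

stacking_clf = StackingClassifier(estimators=[
    ('log_clf',LogisticRegression()),
    ('svm',SVC()),
    ('dt_clf',DecisionTreeClassifier(random_state=666))
],final_estimator=LogisticRegression(),  # the meta-learner
cv=5)                                    # base models predict out-of-fold to avoid leakage
stacking_clf.fit(X_train,y_train)
stacking_clf.score(X_test,y_test)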
OVER