Ensemble Learning
Contents

1. A Simple Ensemble Implementation
   1.1 Importing the Data
   1.2 Splitting the Data
   1.3 Training a Logistic Regression
   1.4 Training an SVM
   1.5 Training a Decision Tree
   1.6 A Simple Ensemble
   1.7 Using VotingClassifier
2. Soft Voting
3. Ensemble Learning
   3.1 Bagging
   3.2 OOB
   3.3 n_jobs
   3.4 bootstrap_features
4. Random Forest
5. Extra-Trees (Extremely Randomized Trees)
6. AdaBoost
7. Gradient Boosting
8. Stacking
 
1. A Simple Ensemble Implementation
 
1.1 Importing the Data
 
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

# A noisy two-moons dataset for binary classification
X, y = datasets.make_moons(n_samples=500, noise=0.3, random_state=42)
plt.scatter(X[y==0, 0], X[y==0, 1])
plt.scatter(X[y==1, 0], X[y==1, 1])
plt.show()
 
 
1.2 Splitting the Data
 
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
 
1.3 Training a Logistic Regression
 
from sklearn.linear_model import LogisticRegression
log_clf = LogisticRegression()
log_clf.fit(X_train, y_train)
log_clf.score(X_test, y_test)
 
 
1.4 Training an SVM
 
from sklearn.svm import SVC
svm_clf = SVC()
svm_clf.fit(X_train, y_train)
svm_clf.score(X_test, y_test)
 
 
1.5 Training a Decision Tree
 
from sklearn.tree import DecisionTreeClassifier
dt_clf = DecisionTreeClassifier()
dt_clf.fit(X_train, y_train)
dt_clf.score(X_test, y_test)
 
 
1.6 A Simple Ensemble

Combine the three classifiers with a hand-rolled majority vote: since the labels are 0/1, the sum of the three predictions is at least 2 exactly when a majority predicts class 1.
 
y_predict1 = log_clf.predict(X_test)
y_predict2 = svm_clf.predict(X_test)
y_predict3 = dt_clf.predict(X_test)

# Majority vote: the sum is >= 2 only when at least two of the three predict 1
y_predict = np.array((y_predict1 + y_predict2 + y_predict3) >= 2, dtype='int')
y_predict[:10]
 
 
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_predict)
 
The majority-vote accuracy is higher than that of the individual classifiers.
 
1.7 Using VotingClassifier
 
from sklearn.ensemble import VotingClassifier

voting_clf = VotingClassifier(estimators=[
    ('log_clf', LogisticRegression()),
    ('svm', SVC()),
    ('dt_clf', DecisionTreeClassifier())
], voting="hard")
# hard voting: majority rule over the predicted class labels
voting_clf.fit(X_train, y_train)
 
 
voting_clf.score(X_test, y_test)
 
The voting classifier's accuracy score is higher still.
 
2. Soft Voting
 
Hard voting counts each base classifier's predicted label as one vote and takes the majority. Soft voting instead averages the predicted class probabilities and picks the class with the highest mean probability, which gives more weight to confident classifiers; every base estimator must therefore be able to output probabilities (for SVC this means setting probability=True).
 
Hard voting classifier:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier

voting_clf = VotingClassifier(estimators=[
    ('log_clf', LogisticRegression()),
    ('svm', SVC()),
    ('dt_clf', DecisionTreeClassifier(random_state=666))
], voting="hard")
# hard voting: majority rule over the predicted class labels
voting_clf.fit(X_train, y_train)
voting_clf.score(X_test, y_test)
 
 
Soft voting classifier:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier

voting_clf2 = VotingClassifier(estimators=[
    ('log_clf', LogisticRegression()),
    ('svm', SVC(probability=True)),  # SVC must expose predict_proba for soft voting
    ('dt_clf', DecisionTreeClassifier(random_state=666))
], voting="soft")
# soft voting: average the predicted class probabilities across estimators
voting_clf2.fit(X_train, y_train)
voting_clf2.score(X_test, y_test)
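Under the hood, soft voting is roughly equivalent to averaging the predict_proba outputs by hand. A minimal sketch of that mechanism (my illustration, not from the original post), reusing X_train, X_test from above:

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Fit the same three base classifiers; SVC needs probability=True for predict_proba
clfs = [LogisticRegression(),
        SVC(probability=True),
        DecisionTreeClassifier(random_state=666)]
for clf in clfs:
    clf.fit(X_train, y_train)

# Average the per-class probability matrices, then take the argmax per sample
mean_proba = np.mean([clf.predict_proba(X_test) for clf in clfs], axis=0)
y_soft = np.argmax(mean_proba, axis=1)
accuracy_score(y_test, y_soft)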
 
 
3. Ensemble Learning

Voting combines a handful of heterogeneous models. To get many more voters, we can instead train many instances of the same weak learner, each on a different random subset of the training data. Sampling the subsets with replacement is called bagging (bootstrap aggregating); sampling without replacement is called pasting.
3.1 Bagging
 
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

X, y = datasets.make_moons(n_samples=500, noise=0.3, random_state=42)
plt.scatter(X[y==0, 0], X[y==0, 1])
plt.scatter(X[y==1, 0], X[y==1, 1])
plt.show()
 
 
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# 500 trees, each trained on 100 samples drawn with replacement (bootstrap=True)
bagging_clf = BaggingClassifier(DecisionTreeClassifier(),
                                n_estimators=500, max_samples=100,
                                bootstrap=True)
bagging_clf.fit(X_train, y_train)
bagging_clf.score(X_test, y_test)
 
 
# The same ensemble with 5000 trees
bagging_clf = BaggingClassifier(DecisionTreeClassifier(),
                                n_estimators=5000, max_samples=100,
                                bootstrap=True)
bagging_clf.fit(X_train, y_train)
bagging_clf.score(X_test, y_test)
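For comparison, pasting draws the subsets without replacement; only the bootstrap flag changes. A minimal sketch (my addition, following the same setup):

# Pasting: the same ensemble, but each subset is drawn without replacement
pasting_clf = BaggingClassifier(DecisionTreeClassifier(),
                                n_estimators=500, max_samples=100,
                                bootstrap=False)
pasting_clf.fit(X_train, y_train)
pasting_clf.score(X_test, y_test)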
 
 
3.2 OOB

With bootstrap sampling each tree sees only part of the training set; on average about 37% of the samples are never drawn for a given tree. These out-of-bag (OOB) samples form a built-in validation set, so we can fit on all of X, y and still get an honest score without a separate test split.

bagging_clf = BaggingClassifier(DecisionTreeClassifier(),
                                n_estimators=500, max_samples=100,
                                bootstrap=True, oob_score=True)
bagging_clf.fit(X, y)
bagging_clf.oob_score_
 
 
 
3.3 n_jobs

Each tree in a bagging ensemble is trained independently of the others, so training parallelizes trivially. The n_jobs parameter sets how many CPU cores to use; n_jobs=-1 uses all available cores.
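The original post's timing cells are not shown here; a minimal sketch of the comparison (my addition), assuming the X_train, y_train split from above:

import time
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

# Compare single-core training with all-cores training
for n_jobs in (1, -1):
    clf = BaggingClassifier(DecisionTreeClassifier(),
                            n_estimators=500, max_samples=100,
                            bootstrap=True, n_jobs=n_jobs)
    start = time.time()
    clf.fit(X_train, y_train)
    print(f"n_jobs={n_jobs}: {time.time() - start:.2f}s")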
3.4 bootstrap_features

Besides the samples, the features can also be sampled at random (bootstrap_features=True). Sampling only the features is the Random Subspaces method; sampling both samples and features is the Random Patches method.
 
# Random Subspaces: max_samples equals the dataset size, so the randomness
# comes mainly from the feature sampling
random_subspace_clf = BaggingClassifier(DecisionTreeClassifier(),
                                        n_estimators=500, max_samples=500,
                                        bootstrap=True, oob_score=True,
                                        n_jobs=-1, bootstrap_features=True)
random_subspace_clf.fit(X, y)
random_subspace_clf.oob_score_

# Random Patches: sample both the training instances and the features
random_patches_clf = BaggingClassifier(DecisionTreeClassifier(),
                                       n_estimators=500, max_samples=100,
                                       bootstrap=True, oob_score=True,
                                       n_jobs=-1, bootstrap_features=True)
random_patches_clf.fit(X, y)
random_patches_clf.oob_score_
 
 
Random sampling of the samples plus random sampling of the features gives the most diverse trees.
4. Random Forest

A random forest is essentially bagging of decision trees with one extra twist: at each node split, only a random subset of the features is considered, which further decorrelates the trees.
 
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

X, y = datasets.make_moons(n_samples=500, noise=0.3, random_state=42)
plt.figure(figsize=(16, 8))
plt.scatter(X[y==0, 0], X[y==0, 1])
plt.scatter(X[y==1, 0], X[y==1, 1])
plt.show()
 
 
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(n_estimators=500, random_state=666,
                                oob_score=True, n_jobs=-1)
rf_clf.fit(X, y)
rf_clf.oob_score_
 
 
# Limit each tree to at most 16 leaves to regularize the forest
rf_clf2 = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16,
                                 random_state=666, oob_score=True, n_jobs=-1)
rf_clf2.fit(X, y)
rf_clf2.oob_score_
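A practical bonus not covered in the original post: like any forest of trees, the fitted model exposes per-feature importance scores.

# Mean impurity reduction contributed by each of the two input features,
# normalized to sum to 1
rf_clf2.feature_importances_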
 
 
5. Extra-Trees (Extremely Randomized Trees)
 
 
At each split, Extra-Trees use a random threshold for each candidate feature instead of searching for the optimal one. This trades a little more bias for lower variance and a faster training speed.

from sklearn.ensemble import ExtraTreesClassifier

et_clf = ExtraTreesClassifier(n_estimators=500, bootstrap=True,
                              random_state=666, oob_score=True)
et_clf.fit(X, y)
et_clf.oob_score_
 
 
Ensemble learning can solve regression problems as well; each classifier above has a regressor counterpart (BaggingRegressor, RandomForestRegressor, ExtraTreesRegressor, and so on).
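As an illustration of the regressor counterparts (my sketch, not from the original post), on a toy regression dataset:

from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, ExtraTreesRegressor

Xr, yr = make_regression(n_samples=500, n_features=5, noise=10, random_state=42)
Xr_train, Xr_test, yr_train, yr_test = train_test_split(Xr, yr, random_state=42)

# score() for regressors reports R^2 rather than accuracy
for reg in (BaggingRegressor(DecisionTreeRegressor(), n_estimators=100),
            RandomForestRegressor(n_estimators=100),
            ExtraTreesRegressor(n_estimators=100)):
    reg.fit(Xr_train, yr_train)
    print(type(reg).__name__, reg.score(Xr_test, yr_test))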
6. AdaBoost

Boosting trains its estimators sequentially rather than independently. AdaBoost fits each new estimator on a reweighted training set that emphasizes the samples its predecessors misclassified, then combines all estimators by weighted vote.
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

X, y = datasets.make_moons(n_samples=500, noise=0.3, random_state=666)
plt.figure(figsize=(16, 8))
plt.scatter(X[y==0, 0], X[y==0, 1])
plt.scatter(X[y==1, 0], X[y==1, 1])
plt.show()
 
 
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)
 
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# 500 depth-2 decision trees as the weak learners
ada_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2), n_estimators=500)
ada_clf.fit(X_train, y_train)
ada_clf.score(X_test, y_test)
 
 
 
7. Gradient Boosting

Gradient boosting also trains trees sequentially, but instead of reweighting samples, each new tree is fit to the residual errors of the ensemble built so far; the final prediction is the sum of all the trees. The base learner is always a tree, so no base estimator is passed in.
from sklearn.ensemble import GradientBoostingClassifier

gb_clf = GradientBoostingClassifier(max_depth=2, n_estimators=30)
gb_clf.fit(X_train, y_train)
gb_clf.score(X_test, y_test)
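To see the residual-fitting idea concretely, here is a hand-rolled gradient-boosting sketch for regression with squared loss (my illustration, not from the original post): each new tree fits the residuals left by the sum of the previous trees.

import numpy as np
from sklearn.tree import DecisionTreeRegressor

# Toy 1-D regression target
rng = np.random.RandomState(42)
Xg = rng.uniform(-3, 3, size=(200, 1))
yg = Xg[:, 0] ** 2 + rng.normal(0, 0.5, size=200)

trees, residual = [], yg.copy()
for _ in range(3):
    tree = DecisionTreeRegressor(max_depth=2).fit(Xg, residual)
    residual = residual - tree.predict(Xg)   # what is still unexplained
    trees.append(tree)

# The ensemble predicts with the sum of all the trees
y_pred = sum(tree.predict(Xg) for tree in trees)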
 
 
 
8. Stacking

Stacking replaces voting with learning: the base estimators' predictions become the input features of a second-level model (the blender, or meta-learner) that learns how best to combine them. To avoid leakage, the blender is trained on out-of-fold predictions rather than on predictions for data the base models were fit on.
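The original post ends without stacking code; a minimal sketch using sklearn's built-in StackingClassifier (available since scikit-learn 0.22), reusing the train/test split from above:

from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

stack_clf = StackingClassifier(
    estimators=[('log_clf', LogisticRegression()),
                ('svm', SVC()),
                ('dt_clf', DecisionTreeClassifier(random_state=666))],
    final_estimator=LogisticRegression(),  # the blender that combines base predictions
    cv=5)                                  # blender is trained on out-of-fold predictions
stack_clf.fit(X_train, y_train)
stack_clf.score(X_test, y_test)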
 
OVER