决策树
一、代码实现二、什么是决策树2.1 如何去构建决策树?
三、信息熵3.1 二类信息熵绘制
四、基尼系数 五、CART:5.1 导入数据集;5.2 导入决策树算法,进行训练数据;5.3 绘制决策边界,不限制其参数;5.4 生成的决策树最大深度为2(max_depth=2);5.5 最小样本分割点(样本至少有10个才分割,min_samples_split=10);5.6 叶子节点至少为6(min_samples_leaf=6);5.7 最多四个叶子节点(max_leaf_nodes=4)
六、决策树建立回归问题八、决策树的局限性
一、代码实现
# Load the iris dataset, keeping only the last two features (petal length
# and petal width) so the data can be visualised in 2-D.
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
iris = datasets.load_iris()
X = iris.data[:,2:]
y = iris.target
plt.figure(figsize=(16,10))
# One scatter call per class label (0, 1, 2).
plt.scatter(X[y==0,0],X[y==0,1])
plt.scatter(X[y==1,0],X[y==1,1])
plt.scatter(X[y==2,0],X[y==2,1])
plt.show()
# Fit a depth-limited decision tree using entropy (information gain)
# as the split criterion.
from sklearn.tree import DecisionTreeClassifier
dt_clf = DecisionTreeClassifier(max_depth=2,criterion="entropy")
dt_clf.fit(X,y)
绘制决策边界
def plot_decision_boundary(model, axis):
    """Fill the rectangle axis = [x_min, x_max, y_min, y_max] with the
    decision regions of a fitted classifier.

    model : fitted estimator exposing .predict
    axis  : sequence of four floats [x_min, x_max, y_min, y_max]
    """
    # Dense grid over the rectangle, ~100 points per unit along each axis.
    # (The original reshaped each linspace to a column vector before
    # meshgrid; meshgrid flattens its inputs anyway, so that was a no-op.)
    X0, X1 = np.meshgrid(
        np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)),
        np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)),
    )
    # Stack the two flattened coordinates as columns: one row per grid
    # point, the sample shape the model was trained on.
    X_grid_matrix = np.c_[X0.ravel(), X1.ravel()]
    # Predict a class for every grid point, then restore the grid shape.
    y_predict = model.predict(X_grid_matrix)
    y_predict_matrix = y_predict.reshape(X0.shape)
    # Colormap with one colour per class region.
    from matplotlib.colors import ListedColormap
    my_colormap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
    # contourf fills regions and draws no lines, so the original
    # `linewidth=5` kwarg was invalid and has been removed.
    plt.contourf(X0, X1, y_predict_matrix, cmap=my_colormap)
# Draw the entropy-based tree's decision regions with the three iris
# classes overlaid on top.
plt.figure(figsize=(16,10))
plot_decision_boundary(dt_clf,axis=[0.5,7.5,0,3])
plt.scatter(X[y==0,0],X[y==0,1])
plt.scatter(X[y==1,0],X[y==1,1])
plt.scatter(X[y==2,0],X[y==2,1])
plt.show()
对上图的解释
二、什么是决策树
决策树是一种非参数学习算法;可以解决分类问题,并且天然支持多分类;也可以解决回归问题;具有非常好的可解释性。
2.1 如何去构建决策树?
每个节点在哪个维度做划分某个维度在哪个值上做划分
三、信息熵
熵在信息论中代表随机变量不确定性的度量:熵越大,数据的不确定性越高;熵越小,数据的不确定性越低。
3.1 二类信息熵绘制
def entropy(p):
    """Binary Shannon entropy (natural log) for class probability p.

    Undefined at p == 0 and p == 1 (log of zero); callers should keep p
    strictly inside (0, 1).
    """
    q = 1 - p
    return -(p * np.log(p) + q * np.log(q))
# Plot the binary entropy curve H(p) = -p*log(p) - (1-p)*log(1-p),
# which peaks at p = 0.5 (maximum uncertainty).
# FIX: use a dedicated variable instead of reusing `X` — the original
# overwrote the iris feature matrix with this 1-D array, which is what
# made the later try_split(X, y) cell crash.
p_vals = np.linspace(0.01,0.99,200)
plt.plot(p_vals,entropy(p_vals))
plt.show()
划分后使得信息熵降低
"""模拟使用信息熵进行划分"""
def split(X, y, d, value):
    """Partition the samples on feature `d` at threshold `value`.

    Returns (X_left, X_right, y_left, y_right): the left part holds the
    rows with X[:, d] <= value, the right part those with X[:, d] > value.
    """
    left_mask = X[:, d] <= value
    right_mask = X[:, d] > value
    return X[left_mask], X[right_mask], y[left_mask], y[right_mask]
from math import log
from collections import Counter
def entropy(y):
    """Empirical Shannon entropy (natural log) of the label array y.

    Computes -sum(p_c * log(p_c)) over the class frequencies p_c;
    returns 0.0 for a pure (single-class) set.
    """
    n = len(y)
    return sum(-(count / n) * log(count / n)
               for count in Counter(y).values())
def try_split(X, y):
    """Exhaustive search for the best entropy split of (X, y).

    For every feature dimension, every midpoint between consecutive
    distinct sorted values is tried; the split minimising
    entropy(y_left) + entropy(y_right) wins.

    Returns (best_entropy, best_dimension, best_value); the dimension and
    value stay -1 when no valid split exists.
    """
    best_entropy, best_d, best_v = float('inf'), -1, -1
    for d in range(X.shape[1]):
        order = np.argsort(X[:, d])
        for i in range(1, len(X)):
            lo, hi = X[order[i - 1], d], X[order[i], d]
            if lo == hi:
                continue  # equal values cannot be separated by a threshold
            v = (lo + hi) / 2
            X_l, X_r, y_l, y_r = split(X, y, d, v)
            e = entropy(y_l) + entropy(y_r)
            if e < best_entropy:
                best_entropy, best_d, best_v = e, d, v
    return best_entropy, best_d, best_v
# Greedily apply two levels of entropy-based splits to the iris data and
# print the entropy of each resulting partition (0.0 means a pure node).
# NOTE(review): in the original notebook this cell raises an error,
# because the entropy-curve cell above overwrote `X` with a 1-D
# np.linspace array; re-run the data-loading cell first so that X is the
# iris feature matrix again.
best_entropy,best_d,best_v=try_split(X,y)
print('best_entropy=',best_entropy)
print('best_d=',best_d)
print('best_v=',best_v)
# First split: left/right children of the root.
X1_l, X1_r, y1_l, y1_r = split(X, y, best_d, best_v)
print(entropy(y1_l))
print(entropy(y1_r))
# Second split: refine only the impure right child.
best_entropy2,best_d2,best_v2=try_split(X1_r,y1_r)
print('best_entropy=',best_entropy2)
print('best_d=',best_d2)
print('best_v=',best_v2)
X2_l, X2_r, y2_l, y2_r = split(X1_r, y1_r, best_d2, best_v2)
print(entropy(y2_l))
print(entropy(y2_r))
代码运行报错:原因是绘制二分类信息熵曲线时用 `np.linspace` 覆盖了变量 `X`(鸢尾花特征矩阵),`try_split` 收到的是一维数组。重新运行载入数据的单元格、或给熵曲线使用另外的变量名即可解决。
四、基尼系数
# Reload the iris data (last two features only), plot it, and fit a
# depth-2 decision tree using Gini impurity instead of entropy.
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
iris = datasets.load_iris()
X = iris.data[:,2:]
y = iris.target
plt.figure(figsize=(16,10))
# One scatter call per class label (0, 1, 2).
plt.scatter(X[y==0,0],X[y==0,1])
plt.scatter(X[y==1,0],X[y==1,1])
plt.scatter(X[y==2,0],X[y==2,1])
plt.show()
from sklearn.tree import DecisionTreeClassifier
dt_clf = DecisionTreeClassifier(max_depth=2,criterion="gini")
dt_clf.fit(X,y)
def plot_decision_boundary(model, axis):
    """Fill the rectangle axis = [x_min, x_max, y_min, y_max] with the
    decision regions of a fitted classifier.

    model : fitted estimator exposing .predict
    axis  : sequence of four floats [x_min, x_max, y_min, y_max]
    """
    # Dense grid over the rectangle, ~100 points per unit along each axis.
    # (The original reshaped each linspace to a column vector before
    # meshgrid; meshgrid flattens its inputs anyway, so that was a no-op.)
    X0, X1 = np.meshgrid(
        np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)),
        np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)),
    )
    # Stack the two flattened coordinates as columns: one row per grid
    # point, the sample shape the model was trained on.
    X_grid_matrix = np.c_[X0.ravel(), X1.ravel()]
    # Predict a class for every grid point, then restore the grid shape.
    y_predict = model.predict(X_grid_matrix)
    y_predict_matrix = y_predict.reshape(X0.shape)
    # Colormap with one colour per class region.
    from matplotlib.colors import ListedColormap
    my_colormap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
    # contourf fills regions and draws no lines, so the original
    # `linewidth=5` kwarg was invalid and has been removed.
    plt.contourf(X0, X1, y_predict_matrix, cmap=my_colormap)
# Decision regions of the Gini-based tree with the three classes overlaid.
plt.figure(figsize=(16,10))
plot_decision_boundary(dt_clf,axis=[0.5,7.5,0,3])
plt.scatter(X[y==0,0],X[y==0,1])
plt.scatter(X[y==1,0],X[y==1,1])
plt.scatter(X[y==2,0],X[y==2,1])
plt.show()
"""基尼系数"""
# Same Gini example written as one self-contained cell: load the iris
# data and fit a depth-2 tree with the 'gini' criterion.
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
iris=datasets.load_iris()
X=iris.data[:,2:]
y=iris.target
dt_clf=DecisionTreeClassifier(max_depth=2,criterion='gini')
dt_clf.fit(X,y)
def plot_decision_boundary(model, axis):
    """Fill the rectangle axis = [x_min, x_max, y_min, y_max] with the
    decision regions of a fitted classifier.

    model : fitted estimator exposing .predict
    axis  : sequence of four floats [x_min, x_max, y_min, y_max]
    """
    # Dense grid over the rectangle, ~100 points per unit per axis.
    x0, x1 = np.meshgrid(
        np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)),
        np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100))
    )
    # One row per grid point, matching the model's training sample shape.
    X_new = np.c_[x0.ravel(), x1.ravel()]
    # Predict every grid point and restore the grid shape.
    y_predict = model.predict(X_new)
    zz = y_predict.reshape(x0.shape)
    from matplotlib.colors import ListedColormap
    custom_cmap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
    # contourf fills regions and draws no lines, so the original
    # `linewidth=5` kwarg was invalid and has been removed.
    plt.contourf(x0, x1, zz, cmap=custom_cmap)
# Same boundary plot over a slightly taller axis range (y down to -1.0).
plot_decision_boundary(dt_clf,axis=[0.5,7.5,-1.0,3])
plt.scatter(X[y==0,0],X[y==0,1])
plt.scatter(X[y==1,0],X[y==1,1])
plt.scatter(X[y==2,0],X[y==2,1])
plt.show()
"""模拟使用基尼系数进行划分"""
def split(X, y, d, value):
    """Partition the samples on feature `d` at threshold `value`.

    Returns (X_left, X_right, y_left, y_right): the left part holds the
    rows with X[:, d] <= value, the right part those with X[:, d] > value.
    """
    left_mask = X[:, d] <= value
    right_mask = X[:, d] > value
    return X[left_mask], X[right_mask], y[left_mask], y[right_mask]
from math import log
from collections import Counter
def jini(y):
    """Gini impurity of the label array y: 1 - sum(p_c ** 2) over the
    empirical class frequencies p_c. Returns 0.0 for a pure set.

    (The name `jini` — a transliteration of "Gini" — is kept because the
    demo cells below call it by this name.)
    """
    n = len(y)
    return 1.0 - sum((count / n) ** 2 for count in Counter(y).values())
def try_split(X, y):
    """Exhaustive search for the best Gini split of (X, y).

    For every feature dimension, every midpoint between consecutive
    distinct sorted values is tried; the split minimising
    jini(y_left) + jini(y_right) wins.

    Returns (best_g, best_dimension, best_value); the dimension and
    value stay -1 when no valid split exists.
    """
    best_g, best_d, best_v = float('inf'), -1, -1
    for d in range(X.shape[1]):
        order = np.argsort(X[:, d])
        for i in range(1, len(X)):
            lo, hi = X[order[i - 1], d], X[order[i], d]
            if lo == hi:
                continue  # equal values cannot be separated by a threshold
            v = (lo + hi) / 2
            X_l, X_r, y_l, y_r = split(X, y, d, v)
            g = jini(y_l) + jini(y_r)
            if g < best_g:
                best_g, best_d, best_v = g, d, v
    return best_g, best_d, best_v
# Greedily apply two levels of Gini-based splits to the iris data and
# print the impurity of each resulting partition (0.0 means a pure node).
best_g,best_d,best_v=try_split(X,y)
print('best_g=',best_g)
print('best_d=',best_d)
print('best_v=',best_v)
# First split: left/right children of the root.
X1_l, X1_r, y1_l, y1_r = split(X, y, best_d, best_v)
print(jini(y1_l))
print(jini(y1_r))
# Second split: refine only the impure right child.
best_g2,best_d2,best_v2=try_split(X1_r,y1_r)
print('best_g=',best_g2)
print('best_d=',best_d2)
print('best_v=',best_v2)
X2_l, X2_r, y2_l, y2_r = split(X1_r, y1_r, best_d2, best_v2)
print(jini(y2_l))
print(jini(y2_r))
五、CART
5.1 导入数据集
# Build a noisy two-moons binary dataset (labels 0/1) for the
# pruning-parameter demos that follow.
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
X,y = datasets.make_moons(noise=0.25,random_state=666)
plt.figure(figsize=(16,10))
plt.scatter(X[y==0,0],X[y==0,1])
plt.scatter(X[y==1,0],X[y==1,1])
plt.show()
5.2 导入决策树算法,进行训练数据
# Fit a tree with no regularisation at all: it grows until every leaf is
# pure, which overfits the noisy data (shown in the next plot).
from sklearn.tree import DecisionTreeClassifier
dt_clf = DecisionTreeClassifier()
dt_clf.fit(X,y)
def plot_decision_boundary(model, axis):
    """Fill the rectangle axis = [x_min, x_max, y_min, y_max] with the
    decision regions of a fitted classifier.

    model : fitted estimator exposing .predict
    axis  : sequence of four floats [x_min, x_max, y_min, y_max]
    """
    # Dense grid over the rectangle, ~100 points per unit along each axis.
    # (The original reshaped each linspace to a column vector before
    # meshgrid; meshgrid flattens its inputs anyway, so that was a no-op.)
    X0, X1 = np.meshgrid(
        np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)),
        np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)),
    )
    # Stack the two flattened coordinates as columns: one row per grid
    # point, the sample shape the model was trained on.
    X_grid_matrix = np.c_[X0.ravel(), X1.ravel()]
    # Predict a class for every grid point, then restore the grid shape.
    y_predict = model.predict(X_grid_matrix)
    y_predict_matrix = y_predict.reshape(X0.shape)
    # Colormap with one colour per class region.
    from matplotlib.colors import ListedColormap
    my_colormap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
    # contourf fills regions and draws no lines, so the original
    # `linewidth=5` kwarg was invalid and has been removed.
    plt.contourf(X0, X1, y_predict_matrix, cmap=my_colormap)
5.3 绘制决策边界,不限制其参数
过拟合
# Decision boundary of the unconstrained tree: the jagged regions show
# it has overfit the noisy moons data.
plt.figure(figsize=(16,10))
plot_decision_boundary(dt_clf,axis=[-1.5,2.5,-1.0,1.5])
plt.scatter(X[y==0,0],X[y==0,1])
plt.scatter(X[y==1,0],X[y==1,1])
# Removed the X[y==2] scatter: make_moons labels are only 0/1, so that
# call was a copy-paste leftover from the iris cells and plotted nothing.
plt.show()
5.4 生成的决策树最大深度为2 max_depth=2
# max_depth=2: cap the tree at two levels of splits, giving a much
# smoother (less overfit) decision boundary.
dt_clf2 = DecisionTreeClassifier(max_depth=2)
dt_clf2.fit(X,y)
plt.figure(figsize=(16,10))
plot_decision_boundary(dt_clf2,axis=[-1.5,2.5,-1.0,1.5])
plt.scatter(X[y==0,0],X[y==0,1])
plt.scatter(X[y==1,0],X[y==1,1])
# Removed the X[y==2] scatter: make_moons labels are only 0/1, so that
# call was a copy-paste leftover and plotted nothing.
plt.show()
5.5 最小样本分割点【样本至少有10个才分割】 min_samples_split=10
# min_samples_split=10: a node is only split further if it still holds
# at least 10 samples — small noisy pockets are left alone.
dt_clf3 = DecisionTreeClassifier(min_samples_split=10)
dt_clf3.fit(X,y)
plt.figure(figsize=(16,10))
plot_decision_boundary(dt_clf3,axis=[-1.5,2.5,-1.0,1.5])
plt.scatter(X[y==0,0],X[y==0,1])
plt.scatter(X[y==1,0],X[y==1,1])
# Removed the X[y==2] scatter: make_moons labels are only 0/1, so that
# call was a copy-paste leftover and plotted nothing.
plt.show()
5.6 叶子节点至少为6
# min_samples_leaf=6: every leaf must contain at least 6 samples, so a
# single noisy point can no longer get its own region.
dt_clf4 = DecisionTreeClassifier(min_samples_leaf=6)
dt_clf4.fit(X,y)
plt.figure(figsize=(16,10))
plot_decision_boundary(dt_clf4,axis=[-1.5,2.5,-1.0,1.5])
plt.scatter(X[y==0,0],X[y==0,1])
plt.scatter(X[y==1,0],X[y==1,1])
# Removed the X[y==2] scatter: make_moons labels are only 0/1, so that
# call was a copy-paste leftover and plotted nothing.
plt.show()
5.7 最多四个叶子节点
# max_leaf_nodes=4: the whole tree may end in at most four leaves,
# limiting the number of decision regions.
dt_clf5 = DecisionTreeClassifier(max_leaf_nodes=4)
dt_clf5.fit(X,y)
plt.figure(figsize=(16,10))
plot_decision_boundary(dt_clf5,axis=[-1.5,2.5,-1.0,1.5])
plt.scatter(X[y==0,0],X[y==0,1])
plt.scatter(X[y==1,0],X[y==1,1])
# Removed the X[y==2] scatter: make_moons labels are only 0/1, so that
# call was a copy-paste leftover and plotted nothing.
plt.show()
六、决策树建立回归问题
# Decision-tree regression demo: split a housing dataset into train/test
# and fit an unconstrained DecisionTreeRegressor.
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
# FIX: sklearn.datasets.load_boston was removed in scikit-learn 1.2; the
# California housing dataset is the documented replacement.
# (fetch_california_housing downloads the data on first use.)
housing = datasets.fetch_california_housing()
X = housing.data
y = housing.target
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=666)
dt_reg = DecisionTreeRegressor()
dt_reg.fit(X_train,y_train)
八、决策树的局限性
对个别样本点比较敏感(少量数据变动可能产生完全不同的树);决策边界总是横平竖直(与坐标轴平行),无法表达倾斜的分界线。