Decision Tree Implementation

Loading the Dataset

Load the iris dataset.

code

from sklearn.datasets import load_iris

iris = load_iris()
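
load_iris returns a Bunch whose main fields are data, a 150×4 matrix of sepal/petal lengths and widths, and target, the 150 class labels (0–2). A quick look:

code

print(iris.data.shape)    # (150, 4)
print(iris.target.shape)  # (150,)
print(iris.target_names)  # ['setosa' 'versicolor' 'virginica']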

Splitting the Dataset

Split the dataset into a training set and a test set. We keep the default split ratio (25% held out for testing, i.e. 38 of the 150 samples) and pick an arbitrary random seed so the split is reproducible.

code

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=114)

Classifying with a Decision Tree Estimator

Split the tree nodes based on information gain (criterion='entropy'), then fit the model on the training set.

code

from sklearn.tree import DecisionTreeClassifier

estimator = DecisionTreeClassifier(criterion='entropy')
estimator.fit(x_train, y_train)
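
For intuition: the entropy criterion scores a node by the Shannon entropy of its class distribution, H = -Σ p_i log2(p_i), and information gain is the drop in entropy produced by a split. A minimal sketch (plain NumPy; the entropy helper here is ours, not part of sklearn's API):

code

import numpy as np

def entropy(labels):
    # Shannon entropy H = -sum(p * log2(p)) of the class distribution
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return -np.sum(p * np.log2(p))

# for a roughly balanced 3-class training set this is close to log2(3) ≈ 1.585
print("entropy of y_train:", entropy(y_train))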

Model Evaluation

Use the trained model to predict on the test set and inspect the predictions, then compare the predicted values against the true values element by element to see which are correct; finally, score the model to get its accuracy.

code

y_predict = estimator.predict(x_test)
print("y_predict:\n", y_predict)
print("y_predict == y_test:\n", y_predict == y_test)
print("score = ", estimator.score(x_test, y_test))

output

y_predict:
[2 2 1 1 2 2 2 1 2 0 2 0 1 1 2 1 0 1 1 0 2 2 1 1 2 1 0 2 2 1 1 1 0 1 1 0 1
1]
y_predict == y_test:
[ True True True True True True True True False True True True
True False True True True True True True True True True True
True True True True True True True True True True True True
True True]
score = 0.9473684210526315

The results show that the classic decision tree algorithm handles the classic iris dataset very well: accuracy is about 94.7%, and the model separates the three classes effectively.
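
As a sanity check, estimator.score is just the mean of the element-wise comparison printed above: 36 of the 38 test predictions are correct, and 36/38 ≈ 0.9474 matches the reported score. The same number can be computed by hand:

code

import numpy as np

# accuracy = fraction of correct predictions, equivalent to estimator.score(x_test, y_test)
print("manual accuracy:", np.mean(y_predict == y_test))  # 36/38 ≈ 0.9474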

Visualizing the Tree

This step visualizes the decision tree fitted on the training data.

code

from matplotlib import pyplot as plt
from sklearn.tree import plot_tree

plt.figure(figsize=(10, 6))
plot_tree(estimator, filled=True, feature_names=iris.feature_names, class_names=iris.target_names)
plt.show()

[Figure: the decision tree built with the information-gain (entropy) criterion]
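
If rendering an image is inconvenient, sklearn can also print the fitted tree as plain text; a minimal alternative using sklearn.tree.export_text:

code

from sklearn.tree import export_text

# one line per node, showing the split threshold at each level
print(export_text(estimator, feature_names=iris.feature_names))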

Visualizing the Classification Results

  1. Define a helper function for visualization:

def visualize_classification(x_reduced, y_pred, y):
    plt.figure(figsize=(10, 7))
    # predicted labels as filled circles, colored by class
    scatter = plt.scatter(x_reduced[:, 0], x_reduced[:, 1], c=y_pred, marker='o', edgecolor='k',
                          s=100, cmap='viridis', alpha=0.7, label='Predicted Labels')
    plt.title("Decision Tree Classification Results")
    plt.colorbar(scatter, ticks=[0, 1, 2], label='Class')
    # true labels drawn as crosses on top of the predictions
    plt.scatter(x_reduced[:, 0], x_reduced[:, 1], c=y, marker='x', s=100, label='True Labels')
    plt.legend()
    plt.show()

  2. Call it to compare the predictions against the ground truth, which is more intuitive than the plain text output (a note on the 2-D projection follows below):

visualize_classification(x_test, y_predict, y_test)

The visualization shows at a glance that for most points the predicted color matches the true label, i.e. most predictions are correct.
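
A note on the 2-D projection: the helper's parameter is named x_reduced, and the full code below imports PCA without using it, which suggests the plot was meant to show a dimensionality-reduced view. As written, it simply plots the first two raw features (sepal length and width). A sketch of the presumably intended PCA variant:

code

from sklearn.decomposition import PCA

# project the 4-D test samples onto their first two principal components
x_test_2d = PCA(n_components=2).fit_transform(x_test)
visualize_classification(x_test_2d, y_predict, y_test)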

Pruning Optimization

Let's try post-pruning as an optimization. sklearn's cost_complexity_pruning_path returns every effective value of the pruning coefficient ccp_alpha (the points at which minimal cost-complexity pruning changes the tree), so we can loop over all of them and compare the resulting test scores.

code

print("try pruning:")
path = estimator.cost_complexity_pruning_path(x_train, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities
for ccp_alpha in ccp_alphas:
    estimator_pruned = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
    estimator_pruned.fit(x_train, y_train)
    score = estimator_pruned.score(x_test, y_test)
    print("ccp_alpha:", ccp_alpha, "score:", score)

output

try pruning:
ccp_alpha: 0.0 score: 0.9473684210526315
ccp_alpha: 0.024597209840745256 score: 0.868421052631579
ccp_alpha: 0.024597209840745256 score: 0.868421052631579
ccp_alpha: 0.03426345497041028 score: 0.868421052631579
ccp_alpha: 0.0725895787024318 score: 0.868421052631579
ccp_alpha: 0.4576883769083686 score: 0.18421052631578946
ccp_alpha: 0.9607694580407837 score: 0.18421052631578946

It turns out the unpruned tree (ccp_alpha = 0.0) still classifies best on this test split.
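
A caveat: picking ccp_alpha by test-set score leaks test data into model selection. A more careful variant, sketched here under the assumption that 5-fold cross-validation on the training set is acceptable, selects the alpha first and only then touches the test set:

code

import numpy as np
from sklearn.model_selection import cross_val_score

# mean 5-fold CV accuracy on the training set for each candidate alpha
cv_scores = [cross_val_score(DecisionTreeClassifier(random_state=0, ccp_alpha=a),
                             x_train, y_train, cv=5).mean()
             for a in ccp_alphas]
best_alpha = ccp_alphas[int(np.argmax(cv_scores))]
print("best ccp_alpha by CV:", best_alpha)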

Comparing Gini Impurity and Information Gain

Finally, let's compare a decision tree split on Gini impurity (criterion='gini') with one split on information gain (criterion='entropy').

code

estimator = DecisionTreeClassifier(criterion='gini')
estimator.fit(x_train, y_train)
print("gini:")
print("score:", estimator.score(x_test, y_test))
estimator = DecisionTreeClassifier(criterion='entropy')
estimator.fit(x_train, y_train)
print("entropy:")
print("score:", estimator.score(x_test, y_test))

output

gini:
score: 0.9736842105263158
entropy:
score: 0.9473684210526315

[Figure: the decision tree built with the Gini criterion]
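
For reference, the two criteria measure node impurity differently: Gini(p) = 1 - Σ p_i² and H(p) = -Σ p_i log2(p_i). Both are 0 on a pure node and peak on a uniform class mix, so they tend to choose similar splits; a small sketch (helper names are ours) comparing them on one distribution:

code

import numpy as np

def gini(p):
    # Gini impurity: 1 - sum(p_i^2)
    return 1 - np.sum(np.square(p))

def entropy(p):
    # Shannon entropy: -sum(p_i * log2(p_i)), skipping zero-probability classes
    p = p[p > 0]
    return -np.sum(p * np.log2(p))

p = np.array([1 / 3, 1 / 3, 1 / 3])  # a perfectly mixed 3-class node
print("gini:", gini(p))        # 2/3 ≈ 0.667
print("entropy:", entropy(p))  # log2(3) ≈ 1.585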

Full Code

from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from sklearn.datasets import load_iris
from sklearn.tree import plot_tree


def visualize_classification(x_reduced, y_pred, y):
    plt.figure(figsize=(10, 7))
    # predicted labels as filled circles, colored by class
    scatter = plt.scatter(x_reduced[:, 0], x_reduced[:, 1], c=y_pred, marker='o', edgecolor='k',
                          s=100, cmap='viridis', alpha=0.7, label='Predicted Labels')
    plt.title("Decision Tree Classification Results")
    plt.colorbar(scatter, ticks=[0, 1, 2], label='Class')
    # true labels drawn as crosses on top of the predictions
    plt.scatter(x_reduced[:, 0], x_reduced[:, 1], c=y, marker='x', s=100,
                label='True Labels')
    plt.legend()
    plt.show()


def decision_tree():
    """
    Classify the iris dataset with a decision tree.
    :return:
    """
    # load the dataset
    iris = load_iris()
    print(iris.data)
    print(iris.target)

    # split into training and test sets (default 75/25 split)
    x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=114)

    # (decision trees don't compute distances, so no standardization is needed)

    # classify with a decision tree estimator, splitting on information gain
    estimator = DecisionTreeClassifier(criterion='entropy')
    estimator.fit(x_train, y_train)

    # evaluate the model
    y_predict = estimator.predict(x_test)
    print("y_predict:\n", y_predict)
    print("y_predict == y_test:\n", y_predict == y_test)
    print("score = ", estimator.score(x_test, y_test))

    # visualize the fitted tree
    plt.figure(figsize=(10, 6))
    plot_tree(estimator, filled=True, feature_names=iris.feature_names, class_names=iris.target_names)
    plt.show()

    # visualize the classification results
    visualize_classification(x_test, y_predict, y_test)

    # post-pruning: try every effective ccp_alpha and compare test scores
    print("try pruning:")
    path = estimator.cost_complexity_pruning_path(x_train, y_train)
    ccp_alphas, impurities = path.ccp_alphas, path.impurities
    for ccp_alpha in ccp_alphas:
        estimator_pruned = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
        estimator_pruned.fit(x_train, y_train)
        score = estimator_pruned.score(x_test, y_test)
        print("ccp_alpha:", ccp_alpha, "score:", score)

    # separately compare Gini impurity with information gain
    # gini
    estimator = DecisionTreeClassifier(criterion='gini')
    estimator.fit(x_train, y_train)
    print("gini:")
    # plot the Gini-based tree as well
    plt.figure(figsize=(10, 6))
    plot_tree(estimator, filled=True, feature_names=iris.feature_names, class_names=iris.target_names)
    plt.show()
    print("score:", estimator.score(x_test, y_test))
    # information gain
    estimator = DecisionTreeClassifier(criterion='entropy')
    estimator.fit(x_train, y_train)
    print("entropy:")
    print("score:", estimator.score(x_test, y_test))


if __name__ == "__main__":
    decision_tree()