from sklearn.tree import DecisionTreeClassifier from sklearn.datasets import fetch_20newsgroups from sklearn.model_selection import train_test_split from sklearn.feature_extraction.text import TfidfVectorizer from SelfTools import Tool from sklearn.datasets import load_iris from sklearn.tree import export_graphviz
随机森林原理过程:

- 训练集:特征值、目标值
- 随机:训练集随机、特征随机(如果每棵树都用完全相同的数据训练,得到的树就完全一样,集成多棵树也就没有意义)
- 假设训练集有 N 个样本、M 个特征:
  - 训练集随机:bootstrap(随机有放回抽样)。从 N 个样本的训练集中有放回地随机抽取 N 次(每次抽一个),得到一个新的随机训练集,如:[1, 2, 3, 4, 5] -> [2, 2, 3, 1, 5]
  - 特征随机:从 M 个特征中随机抽取 m 个特征,要求 M >> m
demo
TBD~
from sklearn.ensemble import RandomForestClassifier

"""
RandomForestClassifier(n_estimators=10, criterion='gini', max_depth=None,
                       max_features='auto', bootstrap=True, random_state=None,
                       min_samples_split=2, min_samples_leaf=1)
    Random forest classifier.

    NOTE: the defaults above come from older scikit-learn releases. Newer
    releases default n_estimators to 100 and use 'sqrt' in place of the
    removed 'auto' option -- check the documentation of the installed version.

    n_estimators: optional, integer
        Number of trees in the forest.
    criterion: optional, string
        Measure used to evaluate feature splits.
    max_depth: optional, integer or None
        Maximum depth of a single tree.
    max_features: string
        Maximum number of features used by a single tree.
        if 'auto', max_features = sqrt(n_features)
        if 'sqrt', same as 'auto'
        if 'log2', max_features = log2(n_features)
        if None, max_features = n_features (not recommended)
    bootstrap: optional, boolean
        Whether to use bootstrap sampling.
    min_samples_split:
        Minimum number of samples required to split a node.
    min_samples_leaf:
        Minimum number of samples required at a leaf node.

    Hyperparameters: n_estimators, max_depth, min_samples_split,
    min_samples_leaf -> tune these with grid search.
"""
def rf_demo():
    """
    Random forest demo (placeholder; the body is not implemented yet).

    :return: None
    """
    # TODO: implement the random forest demo.
    return None
# Run the random forest demo only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    rf_demo()
实例
TBD~
# http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt import pandas as pd