import jieba
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.tree import DecisionTreeClassifier, export_graphviz
调用方法来实现:
def decision():
    """Train a decision tree on the Titanic data set and export it to Graphviz.

    Steps performed:

    1. Download the Titanic passenger table with ``pd.read_csv``.
    2. Use ``pclass``, ``age`` and ``sex`` as features and ``survived`` as
       the target.
    3. Fill missing ages with the mean age.
    4. Split into train/test sets (25% test).
    5. One-hot encode the features with ``DictVectorizer``; the vectorizer is
       fitted on the training split only and merely applied to the test split.
    6. Fit a ``DecisionTreeClassifier``, print its accuracy on the test split
       and write the fitted tree to ``./tree.dot``.

    Returns:
        None. Results are printed and written to ``./tree.dot``.
    """
    # Load the data.
    taitan = pd.read_csv('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt')

    # Feature columns. Copy so the imputation below modifies an independent
    # frame instead of a slice of `taitan` (avoids SettingWithCopyWarning and
    # keeps working under pandas copy-on-write).
    x = taitan[['pclass', 'age', 'sex']].copy()
    # Target column, selected as a 1-D vector.
    y = taitan['survived']

    # Handle missing values: fill missing ages with the mean age.
    x['age'] = x['age'].fillna(x['age'].mean())

    # Split the data set into training and test sets.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

    # One-hot encode the features. Fit on the training split only; the test
    # split must reuse the fitted mapping so its columns line up with the
    # columns the tree was trained on.
    vectorizer = DictVectorizer(sparse=False)
    x_train = vectorizer.fit_transform(x_train.to_dict(orient='records'))
    print(vectorizer.feature_names_)
    x_test = vectorizer.transform(x_test.to_dict(orient='records'))

    # Instantiate and fit the decision tree.
    dec = DecisionTreeClassifier()
    dec.fit(x_train, y_train)
    print("预测准确率:", dec.score(x_test, y_test))

    # Label the exported tree from the fitted columns rather than a fixed
    # list, so the labels always match the column count and order. Known
    # columns keep their display labels; any other column keeps its own name.
    display_names = {'age': '年龄', 'sex=female': '女人', 'sex=male': '男人'}
    feature_names = [display_names.get(name, name)
                     for name in vectorizer.feature_names_]
    export_graphviz(dec, out_file='./tree.dot', feature_names=feature_names)


if __name__ == '__main__':
    decision()
如果想查看决策树的结构,可以使用可视化工具 Graphviz 将导出的 tree.dot 文件转化为图片。
在 Windows 上直接运行 exe 安装程序即可;安装完成后,需要将 Graphviz 的 bin 目录添加到 PATH 环境变量中,之后即可执行 dot 命令。
dot -Tpng tree.dot -o tree.png  # 将 dot 文件转为 png 格式的图片;需要先进入 tree.dot 所在的目录再执行