数据挖掘通讲:分类算法
1import pandas as pd
2import numpy as np
3import warnings
4warnings.filterwarnings('ignore')
1.读取数据前五行(除表头外)
# Load the raw Titanic training data into a DataFrame.
df = pd.read_csv('./titanic_trains.csv', encoding='utf8')
# Preview the first five rows, then report the (rows, columns) shape.
df.head(5)
df.shape
(891, 12)
2.缺失值处理(没有要求就遵循80%原则)
2.1.查看缺失值情况,输出缺失值比例、列名、缺失值数量
# Report, for every column, the percentage and count of missing values.
# The null mask is summed once per column and reused (the original called
# isnull().sum() twice per column).
total_rows = df.shape[0]
for x in df.columns:
    sl = df[x].isnull().sum()          # missing-value count
    zb = sl / total_rows * 100         # missing-value percentage
    print('缺失值占比为:', round(zb, 2), "%,列名:", x, ',缺失值数量:', sl)
缺失值占比为: 0.0 %,列名: PassengerId ,缺失值数量: 0 缺失值占比为: 0.0 %,列名: Survived ,缺失值数量: 0 缺失值占比为: 0.0 %,列名: Pclass ,缺失值数量: 0 缺失值占比为: 0.0 %,列名: Name ,缺失值数量: 0 缺失值占比为: 0.0 %,列名: Sex ,缺失值数量: 0 缺失值占比为: 19.87 %,列名: Age ,缺失值数量: 177 缺失值占比为: 0.0 %,列名: SibSp ,缺失值数量: 0 缺失值占比为: 0.0 %,列名: Parch ,缺失值数量: 0 缺失值占比为: 0.0 %,列名: Ticket ,缺失值数量: 0 缺失值占比为: 0.0 %,列名: Fare ,缺失值数量: 0 缺失值占比为: 77.1 %,列名: Cabin ,缺失值数量: 687 缺失值占比为: 0.22 %,列名: Embarked ,缺失值数量: 2
2.2.对缺失值进行处理
方法一:题目要求使用SimpleImputer方法进行填充处理时
from sklearn.impute import SimpleImputer
# NOTE: mode-filling a numeric column can coerce its dtype to object and
# later break model fitting.

# Mean-impute Age: define the rule, apply it, wrap the result back into
# a DataFrame for convenient downstream use.
model_si = SimpleImputer(missing_values=np.nan, strategy='mean')
df_mean = pd.DataFrame(data=model_si.fit_transform(df[['Age']]), columns=['Age'])
from sklearn.impute import SimpleImputer
# NOTE: mode-filling a numeric column can coerce its dtype to object and
# later break model fitting.

# Most-frequent imputation for the categorical columns Cabin and Embarked.
model_si = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
filled = model_si.fit_transform(df[['Cabin', 'Embarked']])
df_mode = pd.DataFrame(data=filled, columns=['Cabin', 'Embarked'])
# Overwrite the original columns with the imputed values, then confirm
# that no column still contains missing data.
df['Age'] = df_mean
df[['Cabin', 'Embarked']] = df_mode
df.isnull().sum()
PassengerId 0 Survived 0 Pclass 0 Name 0 Sex 0 Age 0 SibSp 0 Parch 0 Ticket 0 Fare 0 Cabin 0 Embarked 0 dtype: int64
方法二:没有要求方法时
# Drop any column whose missing ratio exceeds 20%: a column survives only
# if it has at least 80% non-null values.
# pandas requires `thresh` to be an integer, so truncate the product
# (891 * 0.8 = 712.8 would raise on current pandas versions).
df_drop = df.dropna(axis=1, thresh=int(df.shape[0] * 0.8))
df_drop.shape
(891, 12)
# Fill the remaining gaps: numeric Age with its mean, categorical Embarked
# with its mode. fillna is the idiomatic tool for this (the original used
# replace(np.nan, ...), which is equivalent but roundabout — and its
# comment misspelled "replace").
df_drop['Age'] = df_drop['Age'].fillna(df_drop['Age'].mean())
df_drop['Embarked'] = df_drop['Embarked'].fillna(df_drop['Embarked'].mode()[0])
df_drop.isnull().sum()
PassengerId 0 Survived 0 Pclass 0 Name 0 Sex 0 Age 0 SibSp 0 Parch 0 Ticket 0 Fare 0 Cabin 0 Embarked 0 dtype: int64
3.异常值处理
3.1、箱线图,公式:上边界:x>Q3+1.5*IQR,下边界:x<Q1-1.5*IQR
# Visualise Age with a box plot to spot outliers.
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 6))
plt.boxplot(df_drop['Age'])
plt.show()  # always call show() after plotting
list_yc = []  # indices of detected outliers
# Box-plot (IQR) outlier detection on Age.
# 1. Take the quartiles.
sfw = df_drop['Age'].describe()
Q1 = sfw['25%']
Q3 = sfw['75%']
# 2. Interquartile range.
IQR = Q3 - Q1
# 3. Upper / lower fences.
up = Q3 + 1.5 * IQR
down = Q1 - 1.5 * IQR
# 4. Scan every row (no hard-coded 891) and 5. replace each outlier.
median_age = df_drop['Age'].median()
for x in range(df_drop.shape[0]):
    if x not in list_yc:
        if df_drop['Age'][x] > up or df_drop['Age'][x] < down:
            # BUG FIX: the original did df_drop['Age'].replace(x, median),
            # which substitutes the *index number* x wherever it happens to
            # appear as an Age value — not the outlying age at row x.
            # Assign the median to the offending row instead.
            df_drop.loc[x, 'Age'] = median_age
            # (dropping the row would also work: df_drop.drop(index=x, inplace=True))
            print('异常值索引为:', x)
            list_yc.append(x)
# Re-draw the box plot to verify the outlier treatment.
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 6))
plt.boxplot(df_drop['Age'])
plt.show()  # always call show() after plotting
3.2、3σ原则,公式:|x-μ|>3σ
# Three-sigma rule: flag any Age further than three standard deviations
# from the mean, i.e. |x - mean| > 3 * std.
mean_age = df_drop['Age'].mean()
std_age = np.std(df_drop['Age'])
for x in df_drop['Age']:
    if np.abs(x - mean_age) > 3 * std_age:
        # deletion/replacement would be handled the same way as for the IQR method
        print('异常值为:', x)
异常值为: 71.0 异常值为: 70.5 异常值为: 71.0 异常值为: 80.0 异常值为: 70.0 异常值为: 70.0 异常值为: 74.0
4.特征选择
# Pearson correlation matrix of the numeric columns.
# (method= also accepts 'spearman' and 'kendall'.)
cor = df_drop.corr()
cor
PassengerId Survived Pclass Age SibSp Parch \
PassengerId 1.000000 -0.005007 -0.035144 0.033207 -0.057527 -0.001652
Survived -0.005007 1.000000 -0.338481 -0.069809 -0.035322 0.081629
Pclass -0.035144 -0.338481 1.000000 -0.331339 0.083081 0.018443
Age 0.033207 -0.069809 -0.331339 1.000000 -0.232625 -0.179191
SibSp -0.057527 -0.035322 0.083081 -0.232625 1.000000 0.414838
Parch -0.001652 0.081629 0.018443 -0.179191 0.414838 1.000000
Fare 0.012658 0.257307 -0.549500 0.091566 0.159651 0.216225
Fare
PassengerId 0.012658
Survived 0.257307
Pclass -0.549500
Age 0.091566
SibSp 0.159651
Parch 0.216225
Fare 1.000000
# Heat map of the correlation matrix.
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(8, 6))
sns.heatmap(cor)
plt.show()
# Collect attribute pairs whose correlation exceeds 0.8 (redundant data),
# keeping only one orientation of each pair.
list_ry = []
for x in cor.columns:
    for y in cor.index:
        if x != y and cor[x][y] > 0.8 and (y, x) not in list_ry:
            list_ry.append((x, y))
# The first member of every redundant pair is the column to drop; deduplicate.
list_drop = np.unique([pair[0] for pair in list_ry])
# Remove the redundant columns.
df_ry_drop = df_drop.drop(axis=1, columns=list_drop)
5.特征离散化
5.1 等宽离散化
# Equal-width discretisation: cut Age into three equally wide bins
# labelled y(oung) / m(iddle) / o(ld), then inspect the bin sizes.
df_ry_drop_copy1 = df_ry_drop.copy()
df_ry_drop_copy1['Age'] = pd.cut(x=df_ry_drop_copy1['Age'], bins=3, labels=['y', 'm', 'o'])
df_ry_drop_copy1['Age'].value_counts()
m 522 y 319 o 50 Name: Age, dtype: int64
5.2 等频离散化
# Equal-frequency discretisation: qcut Age into three bins holding
# roughly the same number of rows each.
df_ry_drop_copy2 = df_ry_drop.copy()
df_ry_drop_copy2['Age'] = pd.qcut(x=df_ry_drop_copy2['Age'], q=3, labels=['y', 'm', 'o'])
df_ry_drop_copy2['Age'].value_counts()
m 304 y 301 o 286 Name: Age, dtype: int64
6.特征编码
# Inspect column dtypes before encoding.
df_ry_drop_copy2.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 891 entries, 0 to 890 Data columns (total 12 columns):
Column Non-Null Count Dtype
0 PassengerId 891 non-null int64
1 Survived 891 non-null int64
2 Pclass 891 non-null int64
3 Name 891 non-null object
4 Sex 891 non-null object
5 Age 891 non-null category
6 SibSp 891 non-null int64
7 Parch 891 non-null int64
8 Ticket 891 non-null object
9 Fare 891 non-null float64
10 Cabin 891 non-null object
11 Embarked 891 non-null object
dtypes: category(1), float64(1), int64(5), object(5)
memory usage: 77.7+ KB
6.1 特征编码label-encoding
# Label-encode the ordered Age bins: y -> 1, m -> 2, o -> 3.
df_ry_drop_copy2['Age'] = df_ry_drop_copy2['Age'].map({'y': 1, 'm': 2, 'o': 3})
df_ry_drop_copy2['Age'].value_counts()
2 304 1 301 3 286 Name: Age, dtype: int64
6.2 独热编码
# Drop identifier-like columns that carry no predictive signal.
df_copy2_drop = df_ry_drop_copy2.drop(axis=1, columns=['PassengerId', 'Name'])
# One-hot encode; get_dummies only expands object/category columns.
# (sklearn's OneHotEncoder would instead encode every column passed in,
# regardless of dtype.)
df_oh = pd.get_dummies(df_copy2_drop)
df_oh.head()
Survived Pclass SibSp Parch Fare Sex_female Sex_male Age_1
0 0 3 1 0 7.2500 0 1 1
1 1 1 1 0 71.2833 1 0 0
2 1 3 0 0 7.9250 1 0 0
3 1 1 1 0 53.1000 1 0 0
4 0 3 0 0 8.0500 0 1 0
Age_2 Age_3 ... Cabin_F G73 Cabin_F2 Cabin_F33 Cabin_F38 Cabin_F4
0 0 0 ... 0 0 0 0 0
1 0 1 ... 0 0 0 0 0
2 1 0 ... 0 0 0 0 0
3 0 1 ... 0 0 0 0 0
4 0 1 ... 0 0 0 0 0
Cabin_G6 Cabin_T Embarked_C Embarked_Q Embarked_S
0 0 0 0 0 1
1 0 0 1 0 0
2 0 0 0 0 1
3 0 0 0 0 1
4 0 0 0 0 1
[5 rows x 841 columns]
7.降维
7.1 PCA
from sklearn.decomposition import PCA

# Project the feature matrix (everything except the target) down to
# 200 principal components.
model_pca1 = PCA(n_components=200)  # target dimensionality
feature_cols = [col for col in df_oh.columns if col != 'Survived']
df_pca1 = pd.DataFrame(model_pca1.fit_transform(df_oh[feature_cols]))
# PCA() with a float n_components would instead keep whatever number of
# components reaches that explained-variance ratio.
df_pca1.shape
(891, 200)
# Keep as many components as needed to explain 95% of the variance.
# NOTE(review): the result is a single component — presumably because the
# features are unscaled and Fare dominates the variance; consider
# standardising before PCA. TODO confirm.
model_pca2 = PCA(n_components=0.95)
df_pca2 = model_pca2.fit_transform(df_oh[[col for col in df_oh.columns if col != 'Survived']])
df_pca2 = pd.DataFrame(df_pca2)
df_pca2.shape
(891, 1)
7.2 LDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# Supervised reduction: with a binary target, LDA can keep at most
# (n_classes - 1) = 1 component.
model_lda = LDA(n_components=1)
X_feats = df_oh[[col for col in df_oh.columns if col != 'Survived']]
df_lda = pd.DataFrame(model_lda.fit_transform(X=X_feats, y=df_oh['Survived']))
df_lda.shape
(891, 1)
# Preview the one-hot encoded frame again before splitting.
df_oh.head()
Survived Pclass SibSp Parch Fare Sex_female Sex_male Age_1
0 0 3 1 0 7.2500 0 1 1
1 1 1 1 0 71.2833 1 0 0
2 1 3 0 0 7.9250 1 0 0
3 1 1 1 0 53.1000 1 0 0
4 0 3 0 0 8.0500 0 1 0
Age_2 Age_3 ... Cabin_F G73 Cabin_F2 Cabin_F33 Cabin_F38 Cabin_F4
0 0 0 ... 0 0 0 0 0
1 0 1 ... 0 0 0 0 0
2 1 0 ... 0 0 0 0 0
3 0 1 ... 0 0 0 0 0
4 0 1 ... 0 0 0 0 0
Cabin_G6 Cabin_T Embarked_C Embarked_Q Embarked_S
0 0 0 0 0 1
1 0 0 1 0 0
2 0 0 0 0 1
3 0 0 0 0 1
4 0 0 0 0 1
[5 rows x 841 columns]
8.数据拆分
# Column-wise split: X holds the features, y holds the target.
X = df_oh[[col for col in df_oh.columns if col != 'Survived']]
# equivalently: X = df_oh.drop(axis=1, columns=['Survived'])
y = df_oh['Survived']
# Row-wise split: 70% train / 30% test.
from sklearn.model_selection import train_test_split

# random_state pins the shuffle so the split — and every score computed
# from it below — is reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
9.特征缩放
9.1标准化
from sklearn.preprocessing import StandardScaler  # z-score standardisation

# Fit the scaler on the training features and transform them in one step.
model_std = StandardScaler()
df_std = model_std.fit_transform(X_train)
from sklearn.preprocessing import scale  # one-shot standardisation helper

df_scale = scale(X_train)
df_std
array([[-0.30910376, 0.49937097, 0.83151221, ..., -0.48591266, -0.29859812, 0.6125977 ], [-0.30910376, -0.46381205, -0.470076 , ..., -0.48591266, -0.29859812, 0.6125977 ], [-0.30910376, 0.49937097, 0.83151221, ..., -0.48591266, -0.29859812, 0.6125977 ], ..., [-1.49052488, -0.46381205, 2.13310042, ..., -0.48591266, -0.29859812, 0.6125977 ], [ 0.87231736, 0.49937097, -0.470076 , ..., -0.48591266, -0.29859812, 0.6125977 ], [ 0.87231736, -0.46381205, -0.470076 , ..., -0.48591266, -0.29859812, 0.6125977 ]])
9.2 最小值最大值归一化
from sklearn.preprocessing import MinMaxScaler

# Rescale each training feature into [0, 1].
model_mm = MinMaxScaler()
df_mm = model_mm.fit_transform(X_train)
10.建模
10.1 决策树建模
from sklearn.tree import DecisionTreeClassifier  # classification
# from sklearn.tree import DecisionTreeRegressor  # regression counterpart

# Build a decision tree with default hyper-parameters and fit it on the
# training split.
model_tree = DecisionTreeClassifier()
model_tree.fit(X_train, y_train)
DecisionTreeClassifier()
from sklearn.linear_model import LogisticRegressionCV # logistic regression (with built-in CV)
from sklearn.neighbors import KNeighborsClassifier # KNN / k-nearest neighbours
from sklearn.naive_bayes import GaussianNB # Gaussian naive Bayes — for continuous features; supports multiclass (the original comment claiming binary-only was wrong)
from sklearn.naive_bayes import MultinomialNB # multinomial naive Bayes — count-style features, multiclass
from sklearn.svm import SVC # support-vector classifier
from sklearn.ensemble import AdaBoostClassifier # adaptive boosting
from sklearn.ensemble import RandomForestClassifier # random forest
1from xgboost import XGBClassifier # xgboost
2# from xgboost.sklearn import XGBClassifier
1pip install xgboost
Requirement already satisfied: xgboost in f:\anaconda\lib\site-packages (1.6.1) Requirement already satisfied: numpy in f:\anaconda\lib\site-packages (from xgboost) (1.19.3) Requirement already satisfied: scipy in f:\anaconda\lib\site-packages (from xgboost) (1.7.3) Note: you may need to restart the kernel to use updated packages.
11.模型评估
from sklearn.metrics import accuracy_score # accuracy
from sklearn.metrics import recall_score # recall
from sklearn.metrics import f1_score # F1 score
from sklearn.metrics import roc_auc_score # ROC-AUC

11.1召回率
# Score the baseline tree on the held-out data: recall.
y_pre = model_tree.predict(X_test)
tree_recall_score = recall_score(y_test, y_pre)
print('决策树模型的召回率评分为', round(tree_recall_score, 2))
决策树模型的召回率评分为 0.69
11.2 准确率
# Score the baseline tree on the held-out data: accuracy.
y_pre = model_tree.predict(X_test)
tree_accuracy_score = accuracy_score(y_test, y_pre)
print('决策树模型的准确率评分为', round(tree_accuracy_score, 2))
决策树模型的准确率评分为 0.83
11.3 F1值
# Score the baseline tree on the held-out data: F1.
y_pre = model_tree.predict(X_test)
tree_f1_score = f1_score(y_test, y_pre)
print('决策树模型的F1值评分为', round(tree_f1_score, 2))
决策树模型的F1值评分为 0.74
11.4交叉验证
# 3-fold cross-validation on the training set, scored by F1.
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer

tree_cv = cross_val_score(model_tree, X_train, y_train, scoring=make_scorer(f1_score), cv=3)
tree_cv.mean()
0.7026449961970324
# Per-fold F1 scores.
tree_cv
array([0.65277778, 0.71005917, 0.74509804])
12.模型优化
12.1随机指定参数
# Hand-picked hyper-parameters as a first tuning attempt.
model_tree_yh = DecisionTreeClassifier(max_depth=18, min_samples_split=3)

# Fit on the training split.
model_tree_yh.fit(X_train, y_train)
DecisionTreeClassifier(max_depth=18, min_samples_split=3)
# Evaluate the hand-tuned tree on the test split (F1).
y_pre = model_tree_yh.predict(X_test)
tree_f1_score_yh = f1_score(y_test, y_pre)
print('决策树模型的F1值评分为', round(tree_f1_score_yh, 2))
决策树模型的F1值评分为 0.75
12.2 网格搜索
# Exhaustive grid search over depth / split-size candidates, scored by F1.
from sklearn.model_selection import GridSearchCV

pargms = {'max_depth': [10, 15, 20], 'min_samples_split': [2, 3, 4]}
model_tree_gs = GridSearchCV(estimator=model_tree_yh, param_grid=pargms, scoring=make_scorer(f1_score))
model_tree_gs.fit(X_train, y_train)
GridSearchCV(estimator=DecisionTreeClassifier(max_depth=18, min_samples_split=3), param_grid={'max_depth': [10, 15, 20], 'min_samples_split': [2, 3, 4]}, scoring=make_scorer(f1_score))
# Best cross-validated score found by the search.
model_tree_gs.best_score_
# Best fitted estimator — a ready-to-use model for prediction.
model_tree_gs.best_estimator_
# Best parameter combination (parameters only, no model).
model_tree_gs.best_params_
{'max_depth': 20, 'min_samples_split': 4}
12.3 过采样
# (An assignment may ask for SMOTE's old `ratio` argument; recent imblearn
# versions removed it, so it is ignored here.)
from imblearn.over_sampling import SMOTE  # requires: pip install imblearn

# NOTE(review): oversampling the *test* split inflates the evaluation;
# normally only the training data is resampled — confirm this is intended.
model_smote = SMOTE()
X_train_smote, y_train_smote = model_smote.fit_resample(X_train, y_train)
X_test_smote, y_test_smote = model_smote.fit_resample(X_test, y_test)
# Re-train on the balanced (oversampled) data with a depth-limited tree.
model_tree_smote = DecisionTreeClassifier(max_depth=6)
model_tree_smote.fit(X_train_smote, y_train_smote)
DecisionTreeClassifier(max_depth=6)
# Re-evaluate the SMOTE-trained model on the (resampled) test data.
y_pre_smote = model_tree_smote.predict(X_test_smote)
tree_f1_score_smote = f1_score(y_test_smote, y_pre_smote)
print('决策树模型的F1值评分为', round(tree_f1_score_smote, 2))
决策树模型的F1值评分为 0.82
13.保存模型
import joblib  # third-party; install if missing

# model_best = model_tree_gs.best_estimator_

# Persist the final model to disk for later reuse.
joblib.dump(model_tree_smote, './tree_best.pkl')
['./tree_best.pkl']