数据挖掘通讲:分类算法

# Third-party libraries used throughout the notebook.
import pandas as pd
import numpy as np

# Silence library warnings so the notebook output stays readable.
import warnings
warnings.filterwarnings('ignore')

1.读取数据前五行

# Load the Titanic training set and take a quick look:
# head() shows the first rows, shape reports (rows, columns).
df = pd.read_csv('./titanic_trains.csv', encoding='utf8')
df.head(5)
df.shape

(891, 12)

2.缺失值处理(没有要求就遵循80%原则)

2.1.查看缺失值情况,输出缺失值比例、列名、缺失值数量

# For every column, report the missing-value percentage and count.
n_rows = df.shape[0]
for col in df.columns:
    n_missing = df[col].isnull().sum()
    pct_missing = n_missing / n_rows * 100
    print('缺失值占比为:', round(pct_missing, 2), "%,列名:", col, ',缺失值数量:', n_missing)

缺失值占比为: 0.0 %,列名: PassengerId ,缺失值数量: 0 缺失值占比为: 0.0 %,列名: Survived ,缺失值数量: 0 缺失值占比为: 0.0 %,列名: Pclass ,缺失值数量: 0 缺失值占比为: 0.0 %,列名: Name ,缺失值数量: 0 缺失值占比为: 0.0 %,列名: Sex ,缺失值数量: 0 缺失值占比为: 19.87 %,列名: Age ,缺失值数量: 177 缺失值占比为: 0.0 %,列名: SibSp ,缺失值数量: 0 缺失值占比为: 0.0 %,列名: Parch ,缺失值数量: 0 缺失值占比为: 0.0 %,列名: Ticket ,缺失值数量: 0 缺失值占比为: 0.0 %,列名: Fare ,缺失值数量: 0 缺失值占比为: 77.1 %,列名: Cabin ,缺失值数量: 687 缺失值占比为: 0.22 %,列名: Embarked ,缺失值数量: 2

2.2.对缺失值进行处理

方法一:题目要求使用SimpleImputer方法进行填充处理时

# Method 1: fill missing values with sklearn's SimpleImputer
# (used when the exercise explicitly requires SimpleImputer).
# NOTE: most_frequent filling returns an object-dtype array, which can
# silently turn numeric columns into object dtype and break modelling later.
from sklearn.impute import SimpleImputer

# Numeric column: fill Age with the column mean.
imputer_mean = SimpleImputer(missing_values=np.nan, strategy='mean')  # define the rule
age_filled = imputer_mean.fit_transform(df[['Age']])                  # apply it to the data
df_mean = pd.DataFrame(data=age_filled, columns=['Age'])              # back to a DataFrame

# Categorical columns: fill Cabin/Embarked with the mode.
# Fix: use a separate imputer instead of overwriting the mean imputer above.
imputer_mode = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
cat_filled = imputer_mode.fit_transform(df[['Cabin', 'Embarked']])
df_mode = pd.DataFrame(data=cat_filled, columns=['Cabin', 'Embarked'])

# Write the imputed values back over the original columns.
# Fix: assign the Series, not the one-column DataFrame — assigning a
# DataFrame to a single column is version-dependent in pandas.
df['Age'] = df_mean['Age']
df[['Cabin', 'Embarked']] = df_mode

df.isnull().sum()  # verify that no column has missing values left

PassengerId 0 Survived 0 Pclass 0 Name 0 Sex 0 Age 0 SibSp 0 Parch 0 Ticket 0 Fare 0 Cabin 0 Embarked 0 dtype: int64

方法二:没有要求方法时

# Method 2 (no method prescribed): drop columns missing more than 20% —
# a column survives only if at least 80% of its rows are non-null.
min_non_null = df.shape[0] * 0.8
df_drop = df.dropna(axis=1, thresh=min_non_null)
df_drop.shape

(891, 12)

# Fill the remaining gaps via replace(): mean for Age, mode for Embarked.
fill_values = {
    'Age': df_drop['Age'].mean(),
    'Embarked': df_drop['Embarked'].mode()[0],
}
for col, val in fill_values.items():
    df_drop[col] = df_drop[col].replace(np.nan, val)
df_drop.isnull().sum()  # confirm the frame is now complete

PassengerId 0 Survived 0 Pclass 0 Name 0 Sex 0 Age 0 SibSp 0 Parch 0 Ticket 0 Fare 0 Cabin 0 Embarked 0 dtype: int64

3.异常值处理

3.1、箱线图,公式:上边界:x>Q3+1.5*IQR,下边界:x<Q1-1.5*IQR
# Visualise Age with a box plot to eyeball outliers.
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 6))
plt.boxplot(df_drop['Age'])
plt.show()  # always finish a plot with show()

Image

# Box-plot (IQR) outlier detection on Age.
# Upper bound: Q3 + 1.5*IQR ; lower bound: Q1 - 1.5*IQR.
list_yc = []  # row indices of detected outliers

# 1. quartiles
stats = df_drop['Age'].describe()
Q1 = stats['25%']
Q3 = stats['75%']
# 2. inter-quartile range
IQR = Q3 - Q1
# 3. upper / lower bounds
up = Q3 + 1.5 * IQR
down = Q1 - 1.5 * IQR
# 4. scan every row; 5. replace outliers with the median.
median_age = df_drop['Age'].median()
for idx in range(df_drop.shape[0]):  # generalized from the hard-coded 891
    age = df_drop['Age'][idx]
    if age > up or age < down:
        # Bug fix: the original used df_drop['Age'].replace(idx, median),
        # which replaces values *equal to the row index*, not the outlier
        # at that row. Assign the median directly at the offending index.
        df_drop.loc[idx, 'Age'] = median_age
        # df_drop.drop(axis=0, index=idx, inplace=True)  # alternative: drop the row
        print('异常值索引为:', idx)
        list_yc.append(idx)
# Re-draw the box plot to confirm the outliers have been handled.
import matplotlib.pyplot as plt

fig = plt.figure(figsize=(8, 6))
plt.boxplot(df_drop['Age'])
plt.show()  # always finish a plot with show()

Image

3.2、3σ原则,公式:|x-μ|>3σ
# 3-sigma rule: a value is an outlier when |x - mean| > 3 * std.
# 1. mean
mean_age = df_drop['Age'].mean()
# 2. standard deviation (np.std uses ddof=0, i.e. population std)
std_age = np.std(df_drop['Age'])
threshold = 3 * std_age
for age in df_drop['Age']:
    if np.abs(age - mean_age) > threshold:
        # replacement / deletion would be handled the same way as for the box plot
        print('异常值为:', age)

异常值为: 71.0 异常值为: 70.5 异常值为: 71.0 异常值为: 80.0 异常值为: 70.0 异常值为: 70.0 异常值为: 74.0

4.特征选择

# Pearson correlation matrix of the numeric columns.
# Alternatives: method='spearman' (Spearman) or method='kendall' (Kendall).
cor = df_drop.corr()
cor
         PassengerId  Survived    Pclass       Age     SibSp     Parch  \

PassengerId 1.000000 -0.005007 -0.035144 0.033207 -0.057527 -0.001652
Survived -0.005007 1.000000 -0.338481 -0.069809 -0.035322 0.081629
Pclass -0.035144 -0.338481 1.000000 -0.331339 0.083081 0.018443
Age 0.033207 -0.069809 -0.331339 1.000000 -0.232625 -0.179191
SibSp -0.057527 -0.035322 0.083081 -0.232625 1.000000 0.414838
Parch -0.001652 0.081629 0.018443 -0.179191 0.414838 1.000000
Fare 0.012658 0.257307 -0.549500 0.091566 0.159651 0.216225

             Fare  

PassengerId 0.012658
Survived 0.257307
Pclass -0.549500
Age 0.091566
SibSp 0.159651
Parch 0.216225
Fare 1.000000

# Visualise the correlation matrix as a heat map.
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(8, 6))
sns.heatmap(cor)
plt.show()

Image

# Drop one column out of every highly correlated (redundant) pair.
list_ry = []  # (column_to_drop, partner) pairs
for x in cor.columns:
    for y in cor.index:
        # Bug fix: compare |r|, not r — a strongly *negative* correlation
        # (r < -0.8) is just as redundant as a positive one.
        # .loc replaces the original chained indexing cor[x][y].
        if x != y and abs(cor.loc[y, x]) > 0.8 and (y, x) not in list_ry:
            list_ry.append((x, y))

# First member of each pair is the column to remove; de-duplicate the list.
list_drop = np.unique([pair[0] for pair in list_ry])

# Remove the redundant columns.
df_ry_drop = df_drop.drop(axis=1, columns=list_drop)

5.特征离散化

5.1 等宽离散化
# 5.1 Equal-width binning: split Age into 3 same-width intervals
# labelled y(oung) / m(iddle) / o(ld).
df_ry_drop_copy1 = df_ry_drop.copy()
df_ry_drop_copy1['Age'] = pd.cut(x=df_ry_drop_copy1['Age'], bins=3, labels=['y', 'm', 'o'])
df_ry_drop_copy1['Age'].value_counts()

m 522 y 319 o 50 Name: Age, dtype: int64

5.2 等频离散化
# 5.2 Equal-frequency binning: each Age bucket holds roughly 1/3 of the rows.
df_ry_drop_copy2 = df_ry_drop.copy()
df_ry_drop_copy2['Age'] = pd.qcut(x=df_ry_drop_copy2['Age'], q=3, labels=['y', 'm', 'o'])
df_ry_drop_copy2['Age'].value_counts()

m 304 y 301 o 286 Name: Age, dtype: int64

6.特征编码

1df_ry_drop_copy2.info()

<class 'pandas.core.frame.DataFrame'> RangeIndex: 891 entries, 0 to 890 Data columns (total 12 columns):

Column Non-Null Count Dtype


0 PassengerId 891 non-null int64
1 Survived 891 non-null int64
2 Pclass 891 non-null int64
3 Name 891 non-null object
4 Sex 891 non-null object
5 Age 891 non-null category 6 SibSp 891 non-null int64
7 Parch 891 non-null int64
8 Ticket 891 non-null object
9 Fare 891 non-null float64 10 Cabin 891 non-null object
11 Embarked 891 non-null object
dtypes: category(1), float64(1), int64(5), object(5) memory usage: 77.7+ KB

6.1 特征编码label-encoding
# 6.1 Label encoding: map the ordered age buckets onto integers.
df_ry_drop_copy2['Age'] = df_ry_drop_copy2['Age'].map({'y': 1, 'm': 2, 'o': 3})
df_ry_drop_copy2['Age'].value_counts()

2 304 1 301 3 286 Name: Age, dtype: int64

6.2 独热编码
# 6.2 One-hot encoding.
# Drop identifier-like columns that carry no predictive signal.
df_copy2_drop = df_ry_drop_copy2.drop(axis=1, columns=['PassengerId', 'Name'])
# get_dummies only encodes object-dtype columns; sklearn's OneHotEncoder,
# by contrast, would encode every column passed in regardless of dtype.
df_oh = pd.get_dummies(df_copy2_drop)
df_oh.head()

Survived Pclass SibSp Parch Fare Sex_female Sex_male Age_1
0 0 3 1 0 7.2500 0 1 1
1 1 1 1 0 71.2833 1 0 0
2 1 3 0 0 7.9250 1 0 0
3 1 1 1 0 53.1000 1 0 0
4 0 3 0 0 8.0500 0 1 0

Age_2 Age_3 ... Cabin_F G73 Cabin_F2 Cabin_F33 Cabin_F38 Cabin_F4
0 0 0 ... 0 0 0 0 0
1 0 1 ... 0 0 0 0 0
2 1 0 ... 0 0 0 0 0
3 0 1 ... 0 0 0 0 0
4 0 1 ... 0 0 0 0 0

Cabin_G6 Cabin_T Embarked_C Embarked_Q Embarked_S
0 0 0 0 0 1
1 0 0 1 0 0
2 0 0 0 0 1
3 0 0 0 0 1
4 0 0 0 0 1

[5 rows x 841 columns]

7.降维

7.1 PCA
# 7.1 PCA down to a fixed number of components.
from sklearn.decomposition import PCA

feature_cols = [col for col in df_oh.columns if col != 'Survived']
model_pca1 = PCA(n_components=200)            # target dimensionality
df_pca1 = model_pca1.fit_transform(df_oh[feature_cols])
df_pca1 = pd.DataFrame(df_pca1)               # back to a DataFrame for convenience
# PCA(n_components=<float in (0,1)>) would instead keep enough components
# to reach that explained-variance ratio.
df_pca1.shape

(891, 200)

# PCA keeping just enough components to explain 95% of the variance.
features = df_oh[[col for col in df_oh.columns if col != 'Survived']]
model_pca2 = PCA(n_components=0.95)           # explained-variance target
df_pca2 = pd.DataFrame(model_pca2.fit_transform(features))
df_pca2.shape

(891, 1)

7.2 LDA
# 7.2 LDA: supervised projection; with 2 classes at most
# (n_classes - 1) = 1 component is available.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

X_feat = df_oh[[col for col in df_oh.columns if col != 'Survived']]
model_lda = LDA(n_components=1)
df_lda = pd.DataFrame(model_lda.fit_transform(X=X_feat, y=df_oh['Survived']))
df_lda.shape

(891, 1)

1df_oh.head()

Survived Pclass SibSp Parch Fare Sex_female Sex_male Age_1
0 0 3 1 0 7.2500 0 1 1
1 1 1 1 0 71.2833 1 0 0
2 1 3 0 0 7.9250 1 0 0
3 1 1 1 0 53.1000 1 0 0
4 0 3 0 0 8.0500 0 1 0

Age_2 Age_3 ... Cabin_F G73 Cabin_F2 Cabin_F33 Cabin_F38 Cabin_F4
0 0 0 ... 0 0 0 0 0
1 0 1 ... 0 0 0 0 0
2 1 0 ... 0 0 0 0 0
3 0 1 ... 0 0 0 0 0
4 0 1 ... 0 0 0 0 0

Cabin_G6 Cabin_T Embarked_C Embarked_Q Embarked_S
0 0 0 0 0 1
1 0 0 1 0 0
2 0 0 0 0 1
3 0 0 0 0 1
4 0 0 0 0 1

[5 rows x 841 columns]

8.数据拆分

# 8. Split the data.
# Vertical split: features vs target.
X = df_oh[[col for col in df_oh.columns if col != 'Survived']]
# X = df_oh.drop(axis=1, columns=['Survived'])  # equivalent
y = df_oh['Survived']

# Horizontal split: 70% train / 30% test.
from sklearn.model_selection import train_test_split

# Improvement: fix random_state so the split — and every score computed
# from it in the cells below — is reproducible across notebook runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

9.特征缩放

9.1标准化
# 9.1 Standardisation (zero mean, unit variance), fitted on the training set.
from sklearn.preprocessing import StandardScaler

model_std = StandardScaler()                 # set up the scaler
df_std = model_std.fit_transform(X_train)    # fit and transform in one call

# scale() is the one-shot functional equivalent of StandardScaler.
from sklearn.preprocessing import scale
df_scale = scale(X_train)
df_std

array([[-0.30910376, 0.49937097, 0.83151221, ..., -0.48591266, -0.29859812, 0.6125977 ], [-0.30910376, -0.46381205, -0.470076 , ..., -0.48591266, -0.29859812, 0.6125977 ], [-0.30910376, 0.49937097, 0.83151221, ..., -0.48591266, -0.29859812, 0.6125977 ], ..., [-1.49052488, -0.46381205, 2.13310042, ..., -0.48591266, -0.29859812, 0.6125977 ], [ 0.87231736, 0.49937097, -0.470076 , ..., -0.48591266, -0.29859812, 0.6125977 ], [ 0.87231736, -0.46381205, -0.470076 , ..., -0.48591266, -0.29859812, 0.6125977 ]])

9.2 最小值最大值归一化
# 9.2 Min-max normalisation of the training features to [0, 1].
from sklearn.preprocessing import MinMaxScaler

model_mm = MinMaxScaler()                  # set up the scaler
df_mm = model_mm.fit_transform(X_train)    # apply it to the data

10.建模

10.1 决策树建模
# 10.1 Decision-tree classifier with default hyper-parameters.
from sklearn.tree import DecisionTreeClassifier   # classification
# from sklearn.tree import DecisionTreeRegressor  # regression counterpart

model_tree = DecisionTreeClassifier()   # configure the model
model_tree.fit(X_train, y_train)        # train it

DecisionTreeClassifier()

# Other classifiers that could be used for the same task.
from sklearn.linear_model import LogisticRegressionCV   # logistic regression
from sklearn.neighbors import KNeighborsClassifier      # KNN / k-nearest neighbours
# Fix: the original comments were wrong — GaussianNB is NOT binary-only;
# it handles multiclass problems and targets *continuous* features, while
# MultinomialNB targets count features (e.g. word counts).
from sklearn.naive_bayes import GaussianNB              # Gaussian naive Bayes (continuous features)
from sklearn.naive_bayes import MultinomialNB           # multinomial naive Bayes (count features)
from sklearn.svm import SVC                             # support vector machine

# Ensemble methods.
from sklearn.ensemble import AdaBoostClassifier         # adaptive boosting
from sklearn.ensemble import RandomForestClassifier     # random forest

# Fix: the cell ran the bare line `pip install xgboost`, which is IPython
# shell syntax and a SyntaxError in plain Python. Install from a shell or
# with the `%pip install xgboost` notebook magic instead.
from xgboost import XGBClassifier                       # xgboost
# from xgboost.sklearn import XGBClassifier             # equivalent import path

Requirement already satisfied: xgboost in f:\anaconda\lib\site-packages (1.6.1) Requirement already satisfied: numpy in f:\anaconda\lib\site-packages (from xgboost) (1.19.3) Requirement already satisfied: scipy in f:\anaconda\lib\site-packages (from xgboost) (1.7.3) Note: you may need to restart the kernel to use updated packages.

11.模型评估

# 11. Evaluation metrics.
from sklearn.metrics import accuracy_score   # accuracy
from sklearn.metrics import recall_score     # recall
from sklearn.metrics import f1_score         # F1 score
from sklearn.metrics import roc_auc_score    # ROC-AUC
11.1召回率
# 11.1 Recall on the held-out test set.
y_pre = model_tree.predict(X_test)                 # predict
tree_recall_score = recall_score(y_test, y_pre)    # score
print('决策树模型的召回率评分为', round(tree_recall_score, 2))

决策树模型的召回率评分为 0.69

11.2 准确率
# 11.2 Accuracy on the held-out test set.
y_pre = model_tree.predict(X_test)                     # predict
tree_accuracy_score = accuracy_score(y_test, y_pre)    # score
print('决策树模型的准确率评分为', round(tree_accuracy_score, 2))

决策树模型的准确率评分为 0.83

11.3 F1值
# 11.3 F1 score on the held-out test set.
y_pre = model_tree.predict(X_test)           # predict
tree_f1_score = f1_score(y_test, y_pre)      # score
print('决策树模型的F1值评分为', round(tree_f1_score, 2))

决策树模型的F1值评分为 0.74

11.4交叉验证
# 11.4 3-fold cross-validation on the training set, scored by F1.
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer

tree_cv = cross_val_score(model_tree, X_train, y_train,
                          scoring=make_scorer(f1_score), cv=3)
tree_cv.mean()   # average F1 across the folds

0.7026449961970324

1tree_cv

array([0.65277778, 0.71005917, 0.74509804])

12.模型优化

12.1随机指定参数
# 12.1 Hand-picked hyper-parameters.
model_tree_yh = DecisionTreeClassifier(max_depth=18, min_samples_split=3)
model_tree_yh.fit(X_train, y_train)   # train

DecisionTreeClassifier(max_depth=18, min_samples_split=3)

# F1 of the hand-tuned tree on the test set.
y_pre = model_tree_yh.predict(X_test)           # predict
tree_f1_score_yh = f1_score(y_test, y_pre)      # score
print('决策树模型的F1值评分为', round(tree_f1_score_yh, 2))

决策树模型的F1值评分为 0.75

12.2 网格搜索
# 12.2 Grid search over tree depth and split size, scored by F1.
from sklearn.model_selection import GridSearchCV

pargms = {'max_depth': [10, 15, 20], 'min_samples_split': [2, 3, 4]}
model_tree_gs = GridSearchCV(estimator=model_tree_yh, param_grid=pargms,
                             scoring=make_scorer(f1_score))
model_tree_gs.fit(X_train, y_train)

GridSearchCV(estimator=DecisionTreeClassifier(max_depth=18, min_samples_split=3), param_grid={'max_depth': [10, 15, 20], 'min_samples_split': [2, 3, 4]}, scoring=make_scorer(f1_score))

# Inspect the grid-search results.
model_tree_gs.best_score_      # best cross-validated score
model_tree_gs.best_estimator_  # refitted best model, ready to predict with
model_tree_gs.best_params_     # the winning parameter combination only

{'max_depth': 20, 'min_samples_split': 4}

12.3 过采样
# 12.3 SMOTE oversampling to balance the classes.
# (The exercise asks for ratio=0.1, but recent imblearn versions removed
# that parameter, so it is ignored here.)
from imblearn.over_sampling import SMOTE   # requires: pip install imblearn

model_smote = SMOTE()   # configure
X_train_smote, y_train_smote = model_smote.fit_resample(X_train, y_train)  # resample train
X_test_smote, y_test_smote = model_smote.fit_resample(X_test, y_test)      # resample test

# Re-train a (shallower) tree on the balanced data.
model_tree_smote = DecisionTreeClassifier(max_depth=6)
model_tree_smote.fit(X_train_smote, y_train_smote)

DecisionTreeClassifier(max_depth=6)

# Re-evaluate the model trained on the oversampled data.
y_pre_smote = model_tree_smote.predict(X_test_smote)           # predict
tree_f1_score_smote = f1_score(y_test_smote, y_pre_smote)      # score
print('决策树模型的F1值评分为', round(tree_f1_score_smote, 2))

决策树模型的F1值评分为 0.82

13.保存模型

# 13. Persist the final model to disk with joblib (requires installation).
import joblib

# model_best = model_tree_gs.best_estimator_  # grid-search winner, if preferred
joblib.dump(model_tree_smote, './tree_best.pkl')  # serialise the model

['./tree_best.pkl']