数据挖掘通讲

1import pandas as pd
2import numpy as np
3import warnings
4warnings.filterwarnings('ignore')

1.读取除表头外的前五行\读取数据前五行

# Location of the Titanic training CSV, relative to the working directory.
csv_path = './titanic_trains.csv'
df = pd.read_csv(csv_path, encoding='utf8')
# Preview the first five data rows (the header line is not counted as a row).
df.head(n=5)
# (row count, column count) of the loaded frame.
df.shape

(891, 12)

2.缺失值处理(没有要求就遵循80%原则)

2.1.查看缺失值情况,输出缺失值比例、列名、缺失值数量

# For every column print the share of missing values (in percent),
# the column name and the absolute number of missing values.
total_rows = df.shape[0]
for column in df.columns:
    missing_count = df[column].isnull().sum()
    missing_pct = missing_count / total_rows * 100
    print('缺失值占比为:', round(missing_pct, 2), "%,列名:", column, ',缺失值数量:', missing_count)

缺失值占比为: 0.0 %,列名: PassengerId ,缺失值数量: 0 缺失值占比为: 0.0 %,列名: Survived ,缺失值数量: 0 缺失值占比为: 0.0 %,列名: Pclass ,缺失值数量: 0 缺失值占比为: 0.0 %,列名: Name ,缺失值数量: 0 缺失值占比为: 0.0 %,列名: Sex ,缺失值数量: 0 缺失值占比为: 19.87 %,列名: Age ,缺失值数量: 177 缺失值占比为: 0.0 %,列名: SibSp ,缺失值数量: 0 缺失值占比为: 0.0 %,列名: Parch ,缺失值数量: 0 缺失值占比为: 0.0 %,列名: Ticket ,缺失值数量: 0 缺失值占比为: 0.0 %,列名: Fare ,缺失值数量: 0 缺失值占比为: 77.1 %,列名: Cabin ,缺失值数量: 687 缺失值占比为: 0.22 %,列名: Embarked ,缺失值数量: 2

2.2.对缺失值进行处理

方法一:题目要求使用SimpleImputer方法进行填充处理时

from sklearn.impute import SimpleImputer

# NOTE: filling a numeric column with the mode may turn its dtype into
# object, which can make later modelling fail; Age is filled with the mean.

# Imputation rule: replace NaN with the column mean.
model_si = SimpleImputer(strategy='mean', missing_values=np.nan)
# Fit on Age and fill it; fit_transform hands back a plain array.
age_filled = model_si.fit_transform(df[['Age']])
# Wrap the array in a DataFrame again so it is convenient to use later.
df_mean = pd.DataFrame(age_filled, columns=['Age'])
from sklearn.impute import SimpleImputer

# NOTE: filling a numeric column with the mode may turn its dtype into
# object, which can make later modelling fail; here the mode is applied to
# the Cabin and Embarked columns only.

# Imputation rule: replace NaN with the most frequent value of the column.
model_si = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
mode_columns = ['Cabin', 'Embarked']
# Fit on both columns and fill them; fit_transform hands back a plain array.
mode_filled = model_si.fit_transform(df[mode_columns])
# Wrap the array in a DataFrame again so it is convenient to use later.
df_mode = pd.DataFrame(mode_filled, columns=['Cabin', 'Embarked'])
# Overwrite the original columns with their imputed counterparts.
df['Age'] = df_mean['Age']
df[['Cabin', 'Embarked']] = df_mode[['Cabin', 'Embarked']]
# Number of missing values left in each column.
df.isnull().sum()

PassengerId 0 Survived 0 Pclass 0 Name 0 Sex 0 Age 0 SibSp 0 Parch 0 Ticket 0 Fare 0 Cabin 0 Embarked 0 dtype: int64

方法二:没有要求方法时

# Drop every column with more than 20% missing values: a column survives
# only if at least 80% of its rows are non-null.
min_non_null = df.shape[0] * 0.8
df_drop = df.dropna(axis='columns', thresh=min_non_null)
# (row count, column count) after the drop.
df_drop.shape

(891, 12)

# Fill the remaining gaps; Series.replace can be used for filling as well.
# Age: NaN -> column mean.
age_mean = df_drop['Age'].mean()
df_drop['Age'] = df_drop['Age'].replace(np.nan, age_mean)
# Embarked: NaN -> most frequent value (first entry of mode()).
embarked_mode = df_drop['Embarked'].mode()[0]
df_drop['Embarked'] = df_drop['Embarked'].replace(np.nan, embarked_mode)
# Number of missing values left in each column.
df_drop.isnull().sum()

PassengerId 0 Survived 0 Pclass 0 Name 0 Sex 0 Age 0 SibSp 0 Parch 0 Ticket 0 Fare 0 Cabin 0 Embarked 0 dtype: int64

3.异常值处理

3.1、箱线图,公式:上边界:x>Q3+1.5*IQR,下边界:x<Q1-1.5*IQR
# Box plot of the Age column.
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(8, 6))
ax.boxplot(df_drop['Age'])
plt.show()  # always call show() when plotting

Image

list_yc = []  # index labels of the rows already handled as outliers

# Box-plot (IQR) outlier detection on Age.
# 1. Quartiles
sfw = df_drop['Age'].describe()
Q1 = sfw['25%']
Q3 = sfw['75%']
# 2. Interquartile range
IQR = Q3 - Q1
# 3. Upper / lower fences
up = Q3 + 1.5 * IQR
down = Q1 - 1.5 * IQR
# 4. Compare every Age value with the fences. Working on the Series index
#    (instead of a hard-coded range(0, 891)) keeps this correct for any
#    number of rows and for non-default index labels.
age = df_drop['Age']
is_outlier = (age > up) | (age < down)
# Rows already recorded in list_yc are skipped, so re-running only this
# detection part without resetting list_yc does not treat a row twice.
new_outliers = [idx for idx in age.index[is_outlier] if idx not in list_yc]
# 5. Handle: overwrite the outlier ROWS with the median. The assignment is
#    addressed by index label; Series.replace(idx, ...) would instead replace
#    every Age whose *value* equals the index number and leave the outlier
#    itself untouched. The median is taken once, before any replacement.
#    (Alternative handling: drop the rows with
#    df_drop.drop(index=new_outliers) instead of replacing the values.)
age_median = age.median()
df_drop.loc[new_outliers, 'Age'] = age_median
for idx in new_outliers:
    print('异常值索引为:', idx)
list_yc.extend(new_outliers)
# Box plot of the Age column.
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(8, 6))
ax.boxplot(df_drop['Age'])
plt.show()  # always call show() when plotting

Image

3.2、3σ原则,公式:|x-μ|>3σ
# 3-sigma rule: a value is reported when |x - mean| > 3 * std.
# 1. Mean of Age
mean_age = df_drop['Age'].mean()
# 2. Standard deviation of Age
std_age = np.std(df_drop['Age'])
sigma_limit = 3 * std_age
for x in df_drop['Age']:
    deviation = abs(x - mean_age)
    if deviation > sigma_limit:
        # Deleting or replacing works the same way as in the box-plot approach.
        print('异常值为:', x)

异常值为: 71.0 异常值为: 70.5 异常值为: 71.0 异常值为: 80.0 异常值为: 70.0 异常值为: 70.0 异常值为: 74.0

4.特征选择

# Pearson correlation matrix of the numeric features.
# `method` may be 'pearson', 'spearman' or 'kendall'.
# The frame still contains text columns (Name, Sex, Ticket, ...). Since
# pandas 2.0 DataFrame.corr() no longer drops them silently and raises
# instead, so the numeric/bool columns are selected explicitly; this form
# also works on older pandas versions that lack the numeric_only argument.
numeric_features = df_drop.select_dtypes(include=['number', 'bool'])
cor = numeric_features.corr(method='pearson')
cor
         PassengerId  Survived    Pclass       Age     SibSp     Parch  \

PassengerId 1.000000 -0.005007 -0.035144 0.033207 -0.057527 -0.001652
Survived -0.005007 1.000000 -0.338481 -0.069809 -0.035322 0.081629
Pclass -0.035144 -0.338481 1.000000 -0.331339 0.083081 0.018443
Age 0.033207 -0.069809 -0.331339 1.000000 -0.232625 -0.179191
SibSp -0.057527 -0.035322 0.083081 -0.232625 1.000000 0.414838
Parch -0.001652 0.081629 0.018443 -0.179191 0.414838 1.000000
Fare 0.012658 0.257307 -0.549500 0.091566 0.159651 0.216225

             Fare  

PassengerId 0.012658
Survived 0.257307
Pclass -0.549500
Age 0.091566
SibSp 0.159651
Parch 0.216225
Fare 1.000000

# Heat map of the correlation matrix.
import matplotlib.pyplot as plt
import seaborn as sns

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cor, ax=ax)
plt.show()

Image

from itertools import combinations

# Remove redundant features: for every pair of columns whose correlation is
# stronger than the limit, the column that comes first in `cor` is dropped.
# The absolute value is tested so that a strong negative correlation counts
# as redundancy too.
CORR_LIMIT = 0.8
# combinations() visits each unordered pair exactly once, so neither the
# diagonal (x == y) nor the mirrored pair (y, x) has to be filtered out.
list_ry = [
    (x, y)
    for x, y in combinations(cor.columns, 2)
    if abs(cor.loc[y, x]) > CORR_LIMIT
]
# Columns to delete, de-duplicated.
list_drop = np.unique([col[0] for col in list_ry])
# Drop the columns.
df_ry_drop = df_drop.drop(columns=list_drop)