数据挖掘通讲
1import pandas as pd
2import numpy as np
3import warnings
4warnings.filterwarnings('ignore')
1.读取除表头外的前五行\读取数据前五行
# Load the Titanic training set from the current working directory.
df = pd.read_csv('./titanic_trains.csv', encoding='utf8')
# Print explicitly: a bare expression that is not the last statement of a
# notebook cell (or any statement of a plain script) is evaluated and then
# silently discarded, so the first five rows were never actually shown.
print(df.head(5))
# (number of rows, number of columns) of the loaded table.
print(df.shape)
(891, 12)
2.缺失值处理(没有要求就遵循80%原则)
2.1.查看缺失值情况,输出缺失值比例、列名、缺失值数量
# For every column print the percentage of missing values, the column name
# and the absolute number of missing values.
total_rows = df.shape[0]
for x in df.columns:
    sl = df[x].isnull().sum()    # number of missing entries in this column
    zb = sl / total_rows * 100   # the same count as a percentage of all rows
    print('缺失值占比为:',round(zb,2),"%,列名:",x,',缺失值数量:',sl)
缺失值占比为: 0.0 %,列名: PassengerId ,缺失值数量: 0 缺失值占比为: 0.0 %,列名: Survived ,缺失值数量: 0 缺失值占比为: 0.0 %,列名: Pclass ,缺失值数量: 0 缺失值占比为: 0.0 %,列名: Name ,缺失值数量: 0 缺失值占比为: 0.0 %,列名: Sex ,缺失值数量: 0 缺失值占比为: 19.87 %,列名: Age ,缺失值数量: 177 缺失值占比为: 0.0 %,列名: SibSp ,缺失值数量: 0 缺失值占比为: 0.0 %,列名: Parch ,缺失值数量: 0 缺失值占比为: 0.0 %,列名: Ticket ,缺失值数量: 0 缺失值占比为: 0.0 %,列名: Fare ,缺失值数量: 0 缺失值占比为: 77.1 %,列名: Cabin ,缺失值数量: 687 缺失值占比为: 0.22 %,列名: Embarked ,缺失值数量: 2
2.2.对缺失值进行处理
方法一:题目要求使用SimpleImputer方法进行填充处理时
from sklearn.impute import SimpleImputer

# NOTE: filling a numeric column with strategy='most_frequent' may turn it
# into object dtype, which can make later model fitting fail.

# Imputation rule: replace NaN with the column mean.
model_si = SimpleImputer(missing_values=np.nan, strategy='mean')
# Learn the mean of Age and apply it to the missing entries.
age_filled = model_si.fit_transform(df[['Age']])
# Wrap the result in a DataFrame for convenient use. Reuse df's own index so
# that assigning df_mean back into df aligns row for row even when df does
# not carry a default 0..n-1 index.
df_mean = pd.DataFrame(data=age_filled, columns=['Age'], index=df.index)
from sklearn.impute import SimpleImputer

# NOTE: filling a numeric column with strategy='most_frequent' may turn it
# into object dtype, which can make later model fitting fail; here the
# strategy is only applied to the Cabin and Embarked columns.

# Imputation rule: replace NaN with the most frequent value of the column.
model_si = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
# Learn the most frequent value of each column and fill the missing entries.
mode_filled = model_si.fit_transform(df[['Cabin', 'Embarked']])
# Wrap the result in a DataFrame for convenient use. Reuse df's own index so
# that assigning df_mode back into df aligns row for row even when df does
# not carry a default 0..n-1 index.
df_mode = pd.DataFrame(
    data=mode_filled,
    columns=['Cabin', 'Embarked'],
    index=df.index,
)
# Overwrite the original columns with their imputed versions, then count the
# missing values left in every column as a check.
df[['Cabin','Embarked']] = df_mode
df['Age'] = df_mean
df.isnull().sum()
PassengerId 0 Survived 0 Pclass 0 Name 0 Sex 0 Age 0 SibSp 0 Parch 0 Ticket 0 Fare 0 Cabin 0 Embarked 0 dtype: int64
方法二:没有要求方法时
# Drop every column in which more than 20% of the values are missing: a
# column is kept only if it has at least 80% of the row count as non-missing
# values.
min_non_null = df.shape[0]*0.8
df_drop = df.dropna(axis='columns', thresh=min_non_null)
df_drop.shape
(891, 12)
# Fill the remaining gaps; Series.replace can be used for this.
# Age gets its mean, Embarked gets its most frequent value.
age_mean = df_drop['Age'].mean()
embarked_mode = df_drop['Embarked'].mode()[0]
df_drop['Age'] = df_drop['Age'].replace(np.nan, age_mean)
df_drop['Embarked'] = df_drop['Embarked'].replace(np.nan, embarked_mode)
# Check: missing values left per column.
df_drop.isnull().sum()
PassengerId 0 Survived 0 Pclass 0 Name 0 Sex 0 Age 0 SibSp 0 Parch 0 Ticket 0 Fare 0 Cabin 0 Embarked 0 dtype: int64
3.异常值处理
3.1、箱线图,公式:上边界:Q3+1.5*IQR<x,下边界:x<Q1-1.5*IQR
1# 绘制箱线图
import matplotlib.pyplot as plt

# Draw a box plot of Age on an 8x6 inch figure.
fig, ax = plt.subplots(figsize=(8,6))
ax.boxplot(df_drop['Age'])
plt.show()  # always call show() when plotting
# Box-plot (IQR) outlier handling for Age: a value is an outlier when it is
# above Q3 + 1.5*IQR or below Q1 - 1.5*IQR; outliers are overwritten with the
# column median.

# 1. Quartiles
sfw = df_drop['Age'].describe()
Q1 = sfw['25%']
Q3 = sfw['75%']
# 2. Inter-quartile range
IQR = Q3-Q1
# 3. Upper / lower bounds
up = Q3+1.5*IQR
down = Q1-1.5*IQR
# 4. Compare every value against the bounds in one vectorised step; this
#    works for any number of rows instead of a hard-coded row count.
is_outlier = (df_drop['Age'] > up) | (df_drop['Age'] < down)
list_yc = df_drop.index[is_outlier].tolist()  # row labels of the outliers
for x in list_yc:
    print('异常值索引为:',x)
# 5. Handle: overwrite the outlier ROWS with the median. Selecting by the
#    boolean mask matters here: Series.replace(x, ...) would instead replace
#    every age whose VALUE equals the row label x and leave the real outlier
#    untouched.
age_median = df_drop['Age'].median()
df_drop.loc[is_outlier, 'Age'] = age_median
# Alternative handling: delete the rows instead of replacing the values,
# i.e. drop the labels collected in list_yc along axis 0.
import matplotlib.pyplot as plt

# Draw the Age box plot again on an 8x6 inch figure.
fig, ax = plt.subplots(figsize=(8,6))
ax.boxplot(df_drop['Age'])
plt.show()  # always call show() when plotting
3.2、3σ原则,公式:|x-μ|>3σ
1# 1.计算均值
# 3-sigma rule: report every Age value whose distance from the mean is more
# than three standard deviations.
# 1. Mean
mean_age = df_drop['Age'].mean()
# 2. Standard deviation
std_age = np.std(df_drop['Age'])
# 3. Distance of every value from the mean, then keep those beyond 3 sigma.
deviation = (df_drop['Age'] - mean_age).abs()
for x in df_drop['Age'][deviation > 3*std_age]:
    # Deleting or replacing works the same way as in the box-plot approach.
    print('异常值为:',x)
异常值为: 71.0 异常值为: 70.5 异常值为: 71.0 异常值为: 80.0 异常值为: 70.0 异常值为: 70.0 异常值为: 74.0
4.特征选择
1# 查看皮尔逊相关性系数
# Pearson correlation matrix of the numeric attributes.
# Correlation is only defined for numeric data, so the numeric columns are
# selected explicitly; calling corr() on a frame that still contains string
# columns (Name, Sex, Ticket, ...) raises an error on recent pandas versions.
# method can be 'pearson' (default), 'spearman' or 'kendall'.
cor = df_drop.select_dtypes(include='number').corr()
cor
PassengerId Survived Pclass Age SibSp Parch \
PassengerId 1.000000 -0.005007 -0.035144 0.033207 -0.057527 -0.001652
Survived -0.005007 1.000000 -0.338481 -0.069809 -0.035322 0.081629
Pclass -0.035144 -0.338481 1.000000 -0.331339 0.083081 0.018443
Age 0.033207 -0.069809 -0.331339 1.000000 -0.232625 -0.179191
SibSp -0.057527 -0.035322 0.083081 -0.232625 1.000000 0.414838
Parch -0.001652 0.081629 0.018443 -0.179191 0.414838 1.000000
Fare 0.012658 0.257307 -0.549500 0.091566 0.159651 0.216225
Fare
PassengerId 0.012658
Survived 0.257307
Pclass -0.549500
Age 0.091566
SibSp 0.159651
Parch 0.216225
Fare 1.000000
1# 绘制相关性热力图
import matplotlib.pyplot as plt
import seaborn as sns

# Draw the correlation matrix as a heat map on an 8x6 inch figure.
fig, ax = plt.subplots(figsize=(8,6))
sns.heatmap(cor, ax=ax)
plt.show()
from itertools import combinations

# Remove redundant attributes: for every pair of distinct attributes whose
# correlation is stronger than 0.8, drop the first attribute of the pair.
# The absolute value is used because a strong negative correlation makes an
# attribute just as redundant as a strong positive one.
# combinations() yields each unordered pair exactly once, in column order,
# so no explicit "already seen (y, x)" check is needed.
list_ry = [
    (x, y)
    for x, y in combinations(cor.columns, 2)
    if abs(cor.loc[y, x]) > 0.8
]

# Columns to delete, de-duplicated and sorted. A plain list is used so that
# the "nothing to drop" case is an empty list of labels.
list_drop = sorted({col[0] for col in list_ry})

# Drop the redundant columns.
df_ry_drop = df_drop.drop(columns=list_drop)