数据挖掘通讲:聚类算法
1注意点:
2 1.birch、SpectralClustering这两个算法非常耗资源,考试环境中的资源可能不足,运行时会内存溢出;如遇此情况,请找监考老师,询问是否可以不运行、只截图。
1.读取数据
1import pandas as pd
2import numpy as np
1# Load the raw marketing dataset; the CSV is GBK-encoded (Chinese column names).
1df = pd.read_csv('./data.csv',encoding='gbk')
2df.head()  # quick sanity check of the first rows
客户编号 套餐品牌 信用等级 是否使用4GUSIM卡 是否4G资费 网龄 当月ARPU 当月MOU 当月DOU 视频流量 \
0 10942 2 5 0 1 204 2201.08 2611 54557 22
1 13382 2 5 0 0 201 2181.71 3371 35250 15
2 4192 2 5 1 1 167 2055.60 6913 5884426 1
3 10908 2 5 1 0 171 1827.33 2157 178070 4
4 14130 2 5 0 1 216 1736.40 4218 358592 35
微信社交流量 网页浏览流量 营销是否成功
0 42 1528 0
1 24 1120 0
2 2708 0 1
3 260 15 1
4 28 0 1
1# Check for missing values per column (output below shows 0 nulls everywhere)
2df.isnull().sum()
客户编号 0 套餐品牌 0 信用等级 0 是否使用4GUSIM卡 0 是否4G资费 0 网龄 0 当月ARPU 0 当月MOU 0 当月DOU 0 视频流量 0 微信社交流量 0 网页浏览流量 0 营销是否成功 0 dtype: int64
1# Inspect dtypes and memory usage; all 13 columns are numeric (int64/float64)
2df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 20000 entries, 0 to 19999 Data columns (total 13 columns):
Column Non-Null Count Dtype
0 客户编号 20000 non-null int64
1 套餐品牌 20000 non-null int64
2 信用等级 20000 non-null int64
3 是否使用4GUSIM卡 20000 non-null int64
4 是否4G资费 20000 non-null int64
5 网龄 20000 non-null int64
6 当月ARPU 20000 non-null float64
7 当月MOU 20000 non-null int64
8 当月DOU 20000 non-null int64
9 视频流量 20000 non-null int64
10 微信社交流量 20000 non-null int64
11 网页浏览流量 20000 non-null int64
12 营销是否成功 20000 non-null int64
dtypes: float64(1), int64(12)
memory usage: 2.0 MB
2.标准化
1from sklearn.preprocessing import StandardScaler
2
3model_std = StandardScaler() # 定义规则
4df_std = model_std.fit_transform(df)# 将规则应用于数据上
5df_std = pd.DataFrame(data=df_std,columns=df.columns) # 转型
3.建模\调参\评估\绘图
1# 使用kmeans 进行建模
2
3from sklearn.cluster import KMeans
4from sklearn.metrics import silhouette_score # 轮廓系数:多个模型之间进行对比,取一个相对较好的模型
5import matplotlib.pyplot as plt
1k_list,silhouette_list,inertia_list = [],[],[]
2# Sweep k for KMeans, recording silhouette score and inertia per k, then
3# plot both curves to choose a cluster count (silhouette peak / elbow).
2for k in range(2,8):
3
4    model_kmeans = KMeans(n_clusters=k) # configure the model for this k
5    model_kmeans.fit(df_std) # fit on the standardised features
6
7    # Scoring
8    label = model_kmeans.labels_
9    sh_score =silhouette_score(df_std,label) # silhouette coefficient (higher is better)
10    l_s = model_kmeans.inertia_ # within-cluster sum of squares (SSE), NOT the Rand index
11
12    k_list.append(k)
13    silhouette_list.append(sh_score)
14    inertia_list.append(l_s)
15
16    print(k,'++++++',sh_score,'++++++',l_s)
17
18plt.xlabel("k num")
19plt.ylabel("silhouette score")
20plt.title("silhouette score")
21plt.plot(k_list,silhouette_list) # line chart: silhouette vs k
22plt.show()
23
24plt.xlabel("k num")
25plt.ylabel("inertia score")
26plt.title("inertia score")
27plt.plot(k_list,inertia_list) # line chart: inertia vs k (elbow method)
28plt.show()
2 ++++++ 0.1784261693635658 ++++++ 219227.38919629235 3 ++++++ 0.16988375182258506 ++++++ 198840.83874924868 4 ++++++ 0.17731519901598664 ++++++ 183267.69394558563 5 ++++++ 0.17658230764923027 ++++++ 171755.68803342295 6 ++++++ 0.17207627267051012 ++++++ 162250.13611522608 7 ++++++ 0.1760978371843637 ++++++ 152567.52483908107
1# Cluster with the BIRCH algorithm (memory-heavy; see the note at the top of the file)
2from sklearn.cluster import Birch
3import numpy as np
# Sweep k for Birch and record the silhouette score for each k.
# BUGFIX: the original appended and printed `l_s`, a stale leftover from the
# KMeans cell — Birch has no inertia_, so the printed value was the same
# constant for every k (visible in the original output). Dropped it.
k_list, silhouette_list = [], []
for k in range(2, 8):

    model_birch = Birch(n_clusters=k)  # configure the model for this k
    # ascontiguousarray kept from the original; presumably avoids a
    # contiguity issue inside Birch — TODO confirm it is still needed.
    model_birch.fit(np.ascontiguousarray(df_std))  # fit the model

    # Scoring
    label = model_birch.labels_
    sh_score = silhouette_score(df_std, label)  # silhouette coefficient
    k_list.append(k)
    silhouette_list.append(sh_score)

    print(k, '++++++', sh_score)

plt.xlabel("k num")
plt.ylabel("silhouette score")
plt.title("silhouette score")
plt.plot(k_list, silhouette_list)  # line chart: silhouette vs k
plt.show()
2 ++++++ 0.15104788884837692 ++++++ 151770.92076401823 3 ++++++ 0.15174224842742617 ++++++ 151770.92076401823 4 ++++++ 0.15266860543391234 ++++++ 151770.92076401823 5 ++++++ 0.15231177536674204 ++++++ 151770.92076401823 6 ++++++ 0.14624573410099415 ++++++ 151770.92076401823 7 ++++++ 0.1380633622171311 ++++++ 151770.92076401823
1from sklearn.cluster import SpectralClustering
# Sweep k for SpectralClustering and record the silhouette score per k.
# BUGFIX: the original appended and printed `l_s`, a stale leftover from the
# KMeans cell (SpectralClustering has no inertia_); it was a constant — and
# a NameError if the KMeans cell had not been run first. Dropped it.
k_list, silhouette_list = [], []
for k in range(2, 12):

    model_sc = SpectralClustering(n_clusters=k)  # configure the model for this k
    # ascontiguousarray kept from the original; presumably avoids a
    # contiguity issue in the estimator — TODO confirm it is still needed.
    model_sc.fit(np.ascontiguousarray(df_std))  # fit the model

    # Scoring
    label = model_sc.labels_
    sh_score = silhouette_score(df_std, label)  # silhouette coefficient
    k_list.append(k)
    silhouette_list.append(sh_score)

    print(k, '++++++', sh_score)

plt.xlabel("k num")
plt.ylabel("silhouette score")
plt.title("silhouette score")
plt.plot(k_list, silhouette_list)  # line chart: silhouette vs k
plt.show()
F:\Anaconda\lib\site-packages\sklearn\manifold_spectral_embedding.py:259: UserWarning: Graph is not fully connected, spectral embedding may not work as expected. warnings.warn(
4.根据前面三种算法的结果,选出最优的算法模型,利用最优的算法模型输出数据集的聚类标签,并将标签结果合并到导入的数据集中;
1# Final model: Birch with 3 clusters, chosen from the comparison above.
1model_birch = Birch(n_clusters=3)
2model_birch.fit(np.ascontiguousarray(df_std))
Birch()
1# result = /* to be filled in by the examinee */  -> the model's cluster labels
2result = model_birch.labels_
3# Attach the cluster assignments to the standardised frame.
3# NOTE(review): 'lable' is a typo for 'label', but later cells key on this
3# exact column name, so it must stay as-is unless all cells are updated.
4df_std['lable'] = result
5df_std['lable'].value_counts()
6df_std['lable'].hist()  # histogram of cluster sizes
7.查看不同聚类结果下的特征均值,并绘制不同聚类结果下的特征均值折线图,比如各流量占比指标在不同簇的中心值;
1# Dict mapping each traffic feature to the 'mean' aggregation to apply.
2dict_mean = {'视频流量':'mean','微信社交流量':'mean','网页浏览流量':'mean'}
1# Per-cluster mean of each traffic feature (note: values are standardised).
1df_mean_2 = df_std.groupby(by='lable').agg(dict_mean).reset_index()
2df_mean_3 = df_mean_2.drop(columns='lable',axis=1)
3# One line per cluster: its mean value across the three traffic features.
3for x in df_mean_3.index:
4    plt.plot(df_mean_3.columns,df_mean_3.iloc[x])
8.绘制三维散点图查看各簇的数据在三维特征下的数据分布,并描述各个簇的特点
1# Axes3D ships with matplotlib (mpl_toolkits.mplot3d); no separate pip install is needed
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 — registers the '3d' projection
# 3-D scatter of the three traffic features, coloured by cluster, to
# inspect how the clusters separate in feature space.
colors = ['r','g','cyan']
markers = ['s', 'x', 'o']
fig = plt.figure(figsize=(10,8))
# BUGFIX: Axes3D(fig) is deprecated (see the MatplotlibDeprecationWarning in
# the original output); add_subplot(projection='3d') is the supported way.
ax = fig.add_subplot(projection='3d')
# Plot each of the three clusters with its own colour and marker.
for i in range(0,3):
    df_temp = df_std[df_std['lable']==i]
    x,y,z = df_temp['视频流量'],df_temp['微信社交流量'],df_temp['网页浏览流量']
    ax.scatter(x,y,z,c=colors[i],marker=markers[i])  # 3-D scatter for cluster i
ax.view_init(elev=20,azim=30)
ax.set_xlabel('视频流量')
ax.set_ylabel('微信社交流量')
ax.set_zlabel('网页浏览流量')
plt.show()
C:\Users\Administrator\AppData\Local\Temp\ipykernel_13320\1204517065.py:6: MatplotlibDeprecationWarning: Axes3D(fig) adding itself to the figure is deprecated since 3.4. Pass the keyword argument auto_add_to_figure=False and use fig.add_axes(ax) to suppress this warning. The default value of auto_add_to_figure will change to False in mpl3.5 and True values will no longer work in 3.6. This is consistent with other Axes classes. ax = Axes3D(fig)