数据挖掘通讲：聚类算法

Apr 11, 2026 · 4 分钟阅读 · 聚类数据挖掘算法 ·

注意点：
1. Birch、SpectralClustering 这两个算法非常耗资源，考试环境的资源可能不足，运行时可能会内存溢出。遇到这种情况请找监考老师，询问是否可以不运行、只截图代码。

1.读取数据

1import pandas as pd
2import numpy as np

# Load the customer dataset; the CSV file is GBK-encoded.
df = pd.read_csv('./data.csv', encoding='gbk')
# Preview the first five rows.
df.head()

客户编号  套餐品牌  信用等级  是否使用4GUSIM卡  是否4G资费   网龄   当月ARPU  当月MOU    当月DOU  视频流量  \

0 10942 2 5 0 1 204 2201.08 2611 54557 22
1 13382 2 5 0 0 201 2181.71 3371 35250 15
2 4192 2 5 1 1 167 2055.60 6913 5884426 1
3 10908 2 5 1 0 171 1827.33 2157 178070 4
4 14130 2 5 0 1 216 1736.40 4218 358592 35

微信社交流量  网页浏览流量  营销是否成功
0 42 1528 0
1 24 1120 0
2 2708 0 1
3 260 15 1
4 28 0 1

# Count the missing values in every column.
df.isna().sum()

客户编号 0 套餐品牌 0 信用等级 0 是否使用4GUSIM卡 0 是否4G资费 0 网龄 0 当月ARPU 0 当月MOU 0 当月DOU 0 视频流量 0 微信社交流量 0 网页浏览流量 0 营销是否成功 0 dtype: int64

# Show the index range, each column's non-null count and dtype, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'> RangeIndex: 20000 entries, 0 to 19999 Data columns (total 13 columns):

Column Non-Null Count Dtype

0 客户编号 20000 non-null int64
1 套餐品牌 20000 non-null int64
2 信用等级 20000 non-null int64
3 是否使用4GUSIM卡 20000 non-null int64
4 是否4G资费 20000 non-null int64
5 网龄 20000 non-null int64
6 当月ARPU 20000 non-null float64
7 当月MOU 20000 non-null int64
8 当月DOU 20000 non-null int64
9 视频流量 20000 non-null int64
10 微信社交流量 20000 non-null int64
11 网页浏览流量 20000 non-null int64
12 营销是否成功 20000 non-null int64
dtypes: float64(1), int64(12) memory usage: 2.0 MB

2.标准化

from sklearn.preprocessing import StandardScaler

# Standardise every column of df and keep the result as a DataFrame with the
# original column names.
# NOTE(review): this also scales the customer-ID column and the
# marketing-outcome column, so both take part in the clustering below --
# confirm that this is intended.
model_std = StandardScaler()
df_std = pd.DataFrame(model_std.fit_transform(df), columns=df.columns)

3.建模\调参\评估\绘图

# Model the data with KMeans, scanning k = 2..7 and recording two scores per k.

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score  # silhouette coefficient: used to compare candidate models, higher is better
import matplotlib.pyplot as plt

k_list, silhouette_list, inertia_list = [], [], []
for k in range(2, 8):

    # random_state fixes the centroid initialisation so reruns give the same
    # scores; n_init is pinned because its default differs between
    # scikit-learn releases.
    model_kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    model_kmeans.fit(df_std)  # train the model

    # Scores
    label = model_kmeans.labels_
    sh_score = silhouette_score(df_std, label)  # silhouette coefficient
    # inertia_ is the within-cluster sum of squared distances to the nearest
    # centroid (the "elbow" metric) -- it is not the Rand index.
    l_s = model_kmeans.inertia_

    k_list.append(k)
    silhouette_list.append(sh_score)
    inertia_list.append(l_s)

    print(k,'++++++',sh_score,'++++++',l_s)

# Silhouette score against k.
plt.xlabel("k num")
plt.ylabel("silhouette score")
plt.title("silhouette score")
plt.plot(k_list, silhouette_list)  # line chart
plt.show()

# Inertia against k (look for the elbow).
plt.xlabel("k num")
plt.ylabel("inertia score")
plt.title("inertia score")
plt.plot(k_list, inertia_list)  # line chart
plt.show()

2 ++++++ 0.1784261693635658 ++++++ 219227.38919629235 3 ++++++ 0.16988375182258506 ++++++ 198840.83874924868 4 ++++++ 0.17731519901598664 ++++++ 183267.69394558563 5 ++++++ 0.17658230764923027 ++++++ 171755.68803342295 6 ++++++ 0.17207627267051012 ++++++ 162250.13611522608 7 ++++++ 0.1760978371843637 ++++++ 152567.52483908107

# Model the data with Birch, scanning k = 2..7 and recording the silhouette score.
from sklearn.cluster import Birch
import numpy as np

# The model is fitted on a C-contiguous ndarray; convert once instead of once per k.
birch_input = np.ascontiguousarray(df_std)

k_list, silhouette_list = [], []
for k in range(2, 8):

    model_birch = Birch(n_clusters=k)  # set the parameters
    model_birch.fit(birch_input)  # train the model

    # Score. Birch exposes no inertia_ attribute, so only the silhouette
    # coefficient is recorded; the earlier version appended and printed a
    # variable left over from the KMeans loop, which was the same for every k.
    label = model_birch.labels_
    sh_score = silhouette_score(birch_input, label)  # silhouette coefficient
    k_list.append(k)
    silhouette_list.append(sh_score)

    print(k,'++++++',sh_score)

plt.xlabel("k num")
plt.ylabel("silhouette score")
plt.title("silhouette score")
plt.plot(k_list, silhouette_list)  # line chart
plt.show()

2 ++++++ 0.15104788884837692 ++++++ 151770.92076401823
3 ++++++ 0.15174224842742617 ++++++ 151770.92076401823
4 ++++++ 0.15266860543391234 ++++++ 151770.92076401823
5 ++++++ 0.15231177536674204 ++++++ 151770.92076401823
6 ++++++ 0.14624573410099415 ++++++ 151770.92076401823
7 ++++++ 0.1380633622171311 ++++++ 151770.92076401823

（注：第三列并不是 Birch 的评估指标。Birch 没有 inertia_ 属性，这里打印的是前面 KMeans 循环遗留下来的变量 l_s，所以每一行的数值完全相同，应忽略。）

from sklearn.cluster import SpectralClustering

# Model the data with SpectralClustering, scanning k = 2..11 and recording the
# silhouette score. The model is fitted on a C-contiguous ndarray; convert once
# instead of once per k.
spectral_input = np.ascontiguousarray(df_std)

k_list, silhouette_list = [], []
for k in range(2, 12):

    # random_state makes the label assignment reproducible between runs.
    model_sc = SpectralClustering(n_clusters=k, random_state=42)
    model_sc.fit(spectral_input)  # train the model

    # Score. SpectralClustering exposes no inertia_ attribute, so only the
    # silhouette coefficient is recorded; the earlier version appended and
    # printed a variable left over from the KMeans loop.
    label = model_sc.labels_
    sh_score = silhouette_score(spectral_input, label)  # silhouette coefficient
    k_list.append(k)
    silhouette_list.append(sh_score)

    print(k,'++++++',sh_score)

plt.xlabel("k num")
plt.ylabel("silhouette score")
plt.title("silhouette score")
plt.plot(k_list, silhouette_list)  # line chart
plt.show()

F:\Anaconda\lib\site-packages\sklearn\manifold\_spectral_embedding.py:259: UserWarning: Graph is not fully connected, spectral embedding may not work as expected. warnings.warn(

4.根据前面三种算法的结果，选出最优的算法模型，利用最优的算法模型输出数据集的聚类标签,并将标签结果合并到导入的数据集中;

# Refit the selected model (Birch with 3 clusters) on the standardised data,
# passed as a C-contiguous ndarray.
birch_features = np.ascontiguousarray(df_std)
model_birch = Birch(n_clusters=3)
model_birch.fit(birch_features)

Birch()

# result = /* to be filled in by the candidate */  # cluster labels returned by the model
result = model_birch.labels_
# Attach the cluster labels to df_std (the standardised frame) as a new column.
df_std['lable'] = result
# Only the last expression of a notebook cell is displayed, so the cluster
# sizes have to be printed explicitly or they are silently discarded.
print(df_std['lable'].value_counts())
# Histogram of the label column.
df_std['lable'].hist()

7.查看不同聚类结果下的特征均值，并绘制不同聚类结果下的特征均值折线图，比如各流量占比指标在不同簇的中心值;

# Aggregation spec: every feature listed here is averaged per cluster.
dict_mean = dict.fromkeys(['视频流量', '微信社交流量', '网页浏览流量'], 'mean')

# Mean of each selected feature per cluster label; reset_index() turns the
# label back into an ordinary column and gives the frame a 0..n-1 index.
df_mean_2 = df_std.groupby(by='lable').agg(dict_mean).reset_index()
# Feature means only (label column removed) -- one row per cluster.
df_mean_3 = df_mean_2.drop(columns='lable')
# One line per cluster. Each line is tagged with its cluster label and a legend
# is drawn; without it the lines cannot be told apart.
for x in df_mean_3.index:
    plt.plot(
        df_mean_3.columns,
        df_mean_3.iloc[x],
        marker='o',
        label=f"cluster {df_mean_2.loc[x, 'lable']}",
    )
plt.xlabel("feature")
plt.ylabel("mean value")
plt.title("feature means per cluster")
plt.legend()
plt.show()

8.绘制三维散点图查看各簇的数据在三维特征下的数据分布，并描述各个簇的特点

# Axes3D ships with matplotlib (mpl_toolkits.mplot3d); there is no separate
# package to pip-install.

from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 -- kept so the '3d' projection is registered on older matplotlib
# Draw a 3-D scatter plot to inspect the characteristics of each cluster.
colors = ['r','g','cyan']
markers = ['s', 'x', 'o']
fig = plt.figure(figsize=(10,8))
# Axes3D(fig) adding itself to the figure is deprecated since matplotlib 3.4
# and stops working in 3.6; asking the figure for a 3-D subplot works on both
# old and new releases.
ax = fig.add_subplot(111, projection='3d')
# Plot the points of every cluster, one colour/marker pair per cluster.
for i, (color, marker) in enumerate(zip(colors, markers)):
    df_temp = df_std[df_std['lable']==i]
    x,y,z = df_temp['视频流量'],df_temp['微信社交流量'],df_temp['网页浏览流量']
    ax.scatter(x, y, z, c=color, marker=marker, label=f'cluster {i}')  # 3-D scatter
ax.view_init(elev=20,azim=30)
ax.set_xlabel('视频流量')
ax.set_ylabel('微信社交流量')
ax.set_zlabel('网页浏览流量')
ax.legend()
plt.show()

C:\Users\Administrator\AppData\Local\Temp\ipykernel_13320\1204517065.py:6: MatplotlibDeprecationWarning: Axes3D(fig) adding itself to the figure is deprecated since 3.4. Pass the keyword argument auto_add_to_figure=False and use fig.add_axes(ax) to suppress this warning. The default value of auto_add_to_figure will change to False in mpl3.5 and True values will no longer work in 3.6. This is consistent with other Axes classes. ax = Axes3D(fig)