聚类算法分析

import pandas as pd
from sklearn.datasets import make_blobs  # synthetic blob (cluster) data generator

# Generate a toy clustering data set. cluster_std is spelled out at the
# library default and random_state is pinned so that every re-run of the
# notebook yields the same points (and therefore comparable scores below).
X, y = make_blobs(
    n_samples=150,    # number of points
    n_features=4,     # number of feature columns
    centers=3,        # number of clusters
    cluster_std=1.0,  # standard deviation of each cluster
    random_state=42,  # random seed
)
data = pd.DataFrame(data=X)
data[4] = y  # append the ground-truth cluster label as column 4
data

0 1 2 3 4
0 -8.317893 0.698439 -10.349959 4.291286 0
1 -5.663915 7.060548 0.330762 4.576624 1
2 -9.276754 1.625037 -9.837227 3.494463 0
3 -4.981855 2.015353 -9.270422 4.598429 0
4 -3.461616 8.064244 4.563894 1.690651 1
... ... ... ... ... ...
145 -5.561548 6.836946 1.549655 3.830984 1
146 -6.841547 -2.699957 -8.790234 -0.604204 2
147 -5.903813 7.357668 2.510227 3.125459 1
148 -8.607808 0.461901 -9.125664 4.522260 0
149 -8.503622 2.326839 -8.911981 3.532953 0

150 rows × 5 columns

# Draw a scatter plot of two feature columns, coloured by the true label,
# to inspect how the data is distributed.
import seaborn as sn

# x and y are passed by keyword: current seaborn releases accept only
# `data` positionally, so the positional form fails there.
sn.scatterplot(x=data[2], y=data[1], hue=y)
<matplotlib.axes._subplots.AxesSubplot at 0x24649b477b8>

png

 1"""
 2    绘制三维散点图
 3"""
 4import matplotlib.pyplot as mp
 5from mpl_toolkits.mplot3d import axes3d
 6
 7
 8
 9# 2.绘制图片
10mp.figure(facecolor="lightgray",figsize=(10,7))
11ax3d = mp.gca(projection="3d")  # 创建三维坐标
12
13mp.title('3D Scatter', fontsize=20)
14ax3d.set_xlabel('age', fontsize=14)
15ax3d.set_ylabel('shouru', fontsize=14)
16ax3d.set_zlabel('price', fontsize=14)
17mp.tick_params(labelsize=10)
18
19ax3d.scatter(X[:,0], X[:,1], X[:,2])
20
21mp.show()

png

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import axes3d  # registers the "3d" projection on older Matplotlib

fig = plt.figure()
# add_subplot() replaces gca(projection=...), which current Matplotlib
# no longer accepts.
ax = fig.add_subplot(projection='3d')
# Label each axis once; calling set_xlabel three times would leave the
# y and z axes unlabeled and the x axis labeled 'z'.
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_zlabel('z')
# Features 0 and 1 on the horizontal axes, true cluster label on the z axis.
ax.scatter(X[:, 0], X[:, 1], y)
<mpl_toolkits.mplot3d.art3d.Path3DCollection at 0x24645659e10>

png

1from sklearn.cluster import AgglomerativeClustering   #  Hierarchical Clustering
2from sklearn.cluster import Birch
3from sklearn.cluster import KMeans
4from sklearn.cluster import DBSCAN   
# K-Means with 3 clusters. fit() hands back the fitted estimator, so the
# model is bound in one step and then echoed as the cell result.
model_kmeans1 = KMeans(n_clusters=3).fit(X, y)
model_kmeans1
KMeans(n_clusters=3)
# DBSCAN with the library's default hyper-parameters. fit() hands back the
# fitted estimator, which is then echoed as the cell result.
model_db = DBSCAN().fit(X)
model_db
DBSCAN()
# Birch with the library's default hyper-parameters. fit() hands back the
# fitted estimator, which is then echoed as the cell result.
model_bc = Birch().fit(X)
model_bc
Birch()
# Agglomerative (hierarchical) clustering with the library's default
# hyper-parameters. fit() hands back the fitted estimator, which is then
# echoed as the cell result.
model_ac = AgglomerativeClustering().fit(X, y)
model_ac
AgglomerativeClustering()
# Evaluation
from sklearn.metrics import adjusted_rand_score  # adjusted Rand index; 1.0 means identical partitions


# Predict cluster assignments with the 3-cluster K-Means model fitted
# above (model_kmeans1 is the only K-Means model matching the number of
# generated centers), then compare them with the true labels.
y_pre = model_kmeans1.predict(X)
adjusted_rand_score(y, y_pre)
0.5199330724449299
# K-Means with 4 clusters. fit() hands back the fitted estimator, so the
# model is bound in one step and then echoed as the cell result.
model_kmeans2 = KMeans(n_clusters=4).fit(X, y)
model_kmeans2
KMeans(n_clusters=4)
# K-Means with 5 clusters. fit() hands back the fitted estimator, so the
# model is bound in one step and then echoed as the cell result.
model_kmeans3 = KMeans(n_clusters=5).fit(X, y)
model_kmeans3
KMeans(n_clusters=5)
from sklearn.metrics import silhouette_score  # silhouette coefficient (higher is better)

# Print one silhouette score per fitted K-Means model, in the order
# k = 3, 4, 5, so the scores can be compared across cluster counts.
for fitted in (model_kmeans1, model_kmeans2, model_kmeans3):
    print(silhouette_score(X, fitted.labels_))
0.6932156075104747
0.3460550366520808
0.3641526730771607
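1# The models are evaluated with the silhouette coefficient: it lies in [-1, 1] and values closer to 1 indicate better clustering, so compare the scores (and how sharply they change) across cluster counts.
2# In the evaluation above, k=3 scores highest (about 0.69) and the score drops sharply when going from 3 to 4 clusters, so 3 clusters is the choice.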