聚类算法分析

import pandas as pd
from sklearn.datasets import make_blobs  # synthetic isotropic Gaussian blobs

# Generate 150 samples with 4 features around 3 cluster centres.
# cluster_std is spelled out (1.0 is scikit-learn's default) and random_state
# is fixed so that every run produces the same X / y.
X, y = make_blobs(
    n_samples=150,
    n_features=4,
    centers=3,
    cluster_std=1.0,
    random_state=42,
)

# Wrap the feature matrix in a DataFrame (columns 0-3) and append the
# ground-truth blob index as column 4.
data = pd.DataFrame(data=X)
data[4] = y
data
            0         1          2         3  4
0   -8.317893  0.698439 -10.349959  4.291286  0
1   -5.663915  7.060548   0.330762  4.576624  1
2   -9.276754  1.625037  -9.837227  3.494463  0
3   -4.981855  2.015353  -9.270422  4.598429  0
4   -3.461616  8.064244   4.563894  1.690651  1
..        ...       ...        ...       ... ..
145 -5.561548  6.836946   1.549655  3.830984  1
146 -6.841547 -2.699957  -8.790234 -0.604204  2
147 -5.903813  7.357668   2.510227  3.125459  1
148 -8.607808  0.461901  -9.125664  4.522260  0
149 -8.503622  2.326839  -8.911981  3.532953  0

[150 rows x 5 columns]

# Scatter plot of feature 2 against feature 1, coloured by the true blob label,
# to inspect how the data is distributed.
import seaborn as sn

# x / y are passed by keyword: seaborn >= 0.12 accepts them as keyword-only
# arguments, and older releases accept the keyword form as well.
sn.scatterplot(x=data[2], y=data[1], hue=y)

<matplotlib.axes._subplots.AxesSubplot at 0x24649b477b8>

Image

 1"""
 2    绘制三维散点图
 3"""
 # Imports for the 3D scatter plot.
import matplotlib.pyplot as mp
from mpl_toolkits.mplot3d import axes3d  # noqa: F401 - registers the "3d" projection on older Matplotlib

# Create the figure and an explicit 3D axes on it.
# pyplot.gca(projection=...) no longer accepts keyword arguments on
# Matplotlib >= 3.6, so the axes is created through Figure.add_subplot.
fig = mp.figure(facecolor="lightgray", figsize=(10, 7))
ax3d = fig.add_subplot(111, projection="3d")

# Title, axis labels and tick size.
# NOTE(review): the labels 'age' / 'shouru' / 'price' look carried over from
# another dataset; X here is plotted by column index only - confirm the naming.
mp.title('3D Scatter', fontsize=20)
ax3d.set_xlabel('age', fontsize=14)
ax3d.set_ylabel('shouru', fontsize=14)
ax3d.set_zlabel('price', fontsize=14)
mp.tick_params(labelsize=10)

# Plot the first three feature columns of X against each other.
ax3d.scatter(X[:, 0], X[:, 1], X[:, 2])

mp.show()

Image

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import axes3d  # noqa: F401 - registers the "3d" projection on older Matplotlib

# 3D view of the first two feature columns, with the label array y on the z axis.
# The axes is created through Figure.add_subplot because
# pyplot.gca(projection=...) no longer accepts keyword arguments on Matplotlib >= 3.6.
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

# Each axis gets its own label (x, y and z respectively).
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_zlabel('z')
ax.scatter(X[:, 0], X[:, 1], y)

<mpl_toolkits.mplot3d.art3d.Path3DCollection at 0x24645659e10>

Image

from sklearn.cluster import AgglomerativeClustering  # hierarchical clustering
from sklearn.cluster import Birch
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN

# K-Means with 3 clusters.
# Clustering is unsupervised: scikit-learn's fit() ignores a y argument, so
# only the feature matrix X is passed.
model_kmeans1 = KMeans(n_clusters=3)
model_kmeans1.fit(X)

KMeans(n_clusters=3)

# DBSCAN with the estimator's default parameters, fitted on X.
model_db = DBSCAN().fit(X)  # fit() returns the fitted estimator itself
model_db

DBSCAN()

# Birch with the estimator's default parameters, fitted on X.
model_bc = Birch().fit(X)  # fit() returns the fitted estimator itself
model_bc

Birch()

# Agglomerative (hierarchical) clustering with the estimator's default parameters.
# fit() is unsupervised and ignores a y argument, so only X is passed.
# NOTE(review): scikit-learn's default n_clusters is 2, while X was generated
# with 3 centres - confirm that the default is intended here.
model_ac = AgglomerativeClustering()
model_ac.fit(X)

AgglomerativeClustering()

# Evaluation
from sklearn.metrics import adjusted_rand_score  # adjusted Rand index; 1.0 means identical partitions

# Prediction: assign every sample to a cluster with the fitted 3-cluster
# K-Means model, then compare the assignment with the ground-truth labels y.
y_pre = model_kmeans1.predict(X)
adjusted_rand_score(y, y_pre)

0.5199330724449299

# K-Means with 4 clusters, for comparison against the 3-cluster model.
# fit() is unsupervised and ignores a y argument, so only X is passed.
model_kmeans2 = KMeans(n_clusters=4)
model_kmeans2.fit(X)

KMeans(n_clusters=4)

# K-Means with 5 clusters, for comparison against the 3- and 4-cluster models.
# fit() is unsupervised and ignores a y argument, so only X is passed.
model_kmeans3 = KMeans(n_clusters=5)
model_kmeans3.fit(X)

KMeans(n_clusters=5)

from sklearn.metrics import silhouette_score  # silhouette coefficient, in [-1, 1]; higher is better

# Print one silhouette score per fitted K-Means model (k = 3, 4, 5), in that order.
for fitted_model in (model_kmeans1, model_kmeans2, model_kmeans3):
    print(silhouette_score(X, fitted_model.labels_))

0.6932156075104747 0.3460550366520808 0.3641526730771607

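# The models are compared with the silhouette coefficient: the higher the score, the better separated the clusters.
# In the run above the score drops sharply between k=3 (about 0.69) and k=4 (about 0.35; k=5 is about 0.36), so 3 clusters is chosen.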