聚类算法分析
import pandas as pd
from sklearn.datasets import make_blobs  # generator for synthetic, blob-shaped cluster data

# make_blobs arguments: number of samples, number of feature columns, number
# of cluster centers, per-cluster standard deviation, random seed.
# The seed is fixed so the generated data -- and every score computed from it
# in later cells -- is the same on each run.
X, y = make_blobs(
    n_samples=150,
    n_features=4,
    centers=3,
    cluster_std=1.0,
    random_state=42,
)

# Feature columns 0-3, plus the ground-truth cluster label in column 4.
data = pd.DataFrame(data=X)
data[4] = y
data
| 0 | 1 | 2 | 3 | 4 | |
|---|---|---|---|---|---|
| 0 | -8.317893 | 0.698439 | -10.349959 | 4.291286 | 0 |
| 1 | -5.663915 | 7.060548 | 0.330762 | 4.576624 | 1 |
| 2 | -9.276754 | 1.625037 | -9.837227 | 3.494463 | 0 |
| 3 | -4.981855 | 2.015353 | -9.270422 | 4.598429 | 0 |
| 4 | -3.461616 | 8.064244 | 4.563894 | 1.690651 | 1 |
| ... | ... | ... | ... | ... | ... |
| 145 | -5.561548 | 6.836946 | 1.549655 | 3.830984 | 1 |
| 146 | -6.841547 | -2.699957 | -8.790234 | -0.604204 | 2 |
| 147 | -5.903813 | 7.357668 | 2.510227 | 3.125459 | 1 |
| 148 | -8.607808 | 0.461901 | -9.125664 | 4.522260 | 0 |
| 149 | -8.503622 | 2.326839 | -8.911981 | 3.532953 | 0 |
150 rows × 5 columns
# Scatter plot of feature 2 against feature 1, colored by the true cluster
# label, to inspect how the data is distributed.
import seaborn as sn

# x and y are passed by keyword: seaborn >= 0.12 accepts only `data` as a
# positional argument and raises TypeError for positional x/y.
sn.scatterplot(x=data[2], y=data[1], hue=y)
<matplotlib.axes._subplots.AxesSubplot at 0x24649b477b8>
# Draw a 3D scatter plot of the first three feature columns of X.
import matplotlib.pyplot as mp
from mpl_toolkits.mplot3d import axes3d  # registers the "3d" projection on older Matplotlib


# Create the figure and attach a 3D axes to it.  Figure.add_subplot is used
# because passing keyword arguments such as `projection` to gca() was
# deprecated in Matplotlib 3.4 and later removed.
fig = mp.figure(facecolor="lightgray", figsize=(10, 7))
ax3d = fig.add_subplot(111, projection="3d")

mp.title('3D Scatter', fontsize=20)
# NOTE(review): these axis labels do not describe the synthetic features
# plotted below (columns 0, 1 and 2 of X) -- confirm the intended names.
ax3d.set_xlabel('age', fontsize=14)
ax3d.set_ylabel('shouru', fontsize=14)
ax3d.set_zlabel('price', fontsize=14)
mp.tick_params(labelsize=10)

ax3d.scatter(X[:, 0], X[:, 1], X[:, 2])

mp.show()
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import axes3d  # registers the "3d" projection on older Matplotlib

# 3D view with features 0 and 1 on the x/y axes and the true cluster label
# on the z axis.  The axes is created with add_subplot because
# gca(projection=...) is no longer supported by current Matplotlib.
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
# Each axis gets its own label via its own setter.
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_zlabel('z')
ax.scatter(X[:, 0], X[:, 1], y)
<mpl_toolkits.mplot3d.art3d.Path3DCollection at 0x24645659e10>
1from sklearn.cluster import AgglomerativeClustering # Hierarchical Clustering
2from sklearn.cluster import Birch
3from sklearn.cluster import KMeans
4from sklearn.cluster import DBSCAN
# K-Means with 3 clusters.
model_kmeans1 = KMeans(n_clusters=3)
# Fit on the features; the labels are passed exactly as in the other
# clustering cells of this notebook.
model_kmeans1.fit(X, y)
KMeans(n_clusters=3)
# DBSCAN with the library's default settings, fitted on the features.
# fit() returns the estimator itself, so construction and fitting are chained.
model_db = DBSCAN().fit(X)
model_db
DBSCAN()
# Birch with the library's default settings, fitted on the features.
# fit() returns the estimator itself, so construction and fitting are chained.
model_bc = Birch().fit(X)
model_bc
Birch()
# Agglomerative (hierarchical) clustering with the library's default settings.
# fit() returns the estimator itself, so construction and fitting are chained.
model_ac = AgglomerativeClustering().fit(X, y)
model_ac
AgglomerativeClustering()
# Evaluation against the ground-truth labels.
# Adjusted Rand index: 1.0 means the clustering matches the true labels
# exactly; values close to 0 mean agreement no better than random labeling.
from sklearn.metrics import adjusted_rand_score


# Predict a cluster for every sample with the fitted 3-cluster K-Means model
# (model_kmeans1), then compare the prediction with the true labels.
y_pre = model_kmeans1.predict(X)
adjusted_rand_score(y, y_pre)
0.5199330724449299
# K-Means with 4 clusters, for comparison with the 3-cluster model.
model_kmeans2 = KMeans(n_clusters=4)
model_kmeans2.fit(X, y)
KMeans(n_clusters=4)
# K-Means with 5 clusters, for comparison with the 3- and 4-cluster models.
model_kmeans3 = KMeans(n_clusters=5)
model_kmeans3.fit(X, y)
KMeans(n_clusters=5)
# Silhouette coefficient: ranges over [-1, 1]; a higher score means the
# clusters are more compact and better separated.
from sklearn.metrics import silhouette_score

# Print one score per fitted K-Means model, in the order the models were built.
for fitted_model in (model_kmeans1, model_kmeans2, model_kmeans3):
    print(silhouette_score(X, fitted_model.labels_))
0.6932156075104747
0.3460550366520808
0.3641526730771607
1# 通过使用轮廓系数对模型进行评估时,轮廓系数取值范围为 [-1, 1],越接近 1 聚类效果越好;同时可以查看相邻聚类数之间系数的变化幅度。
2# 上述评估中聚类数为 3 时轮廓系数最高(约 0.69),聚类数从 3 增加到 4 时系数大幅下降(约 0.35),变化幅度最大,所以选择聚类数 3。