聚类算法分析
import pandas as pd
from sklearn.datasets import make_blobs  # synthetic Gaussian blobs for clustering demos

# Build a toy data set: 150 samples, 4 features, 3 blob centres.
# cluster_std is spelled out at its default (1.0) and random_state is fixed
# so that every run of the notebook produces the same points and labels.
X, y = make_blobs(
    n_samples=150,
    n_features=4,
    centers=3,
    cluster_std=1.0,
    random_state=42,
)

# Columns 0-3 hold the features; column 4 holds the ground-truth blob label.
data = pd.DataFrame(data=X)
data[4] = y
data
0 1 2 3 4
0 -8.317893 0.698439 -10.349959 4.291286 0 1 -5.663915 7.060548 0.330762 4.576624 1 2 -9.276754 1.625037 -9.837227 3.494463 0 3 -4.981855 2.015353 -9.270422 4.598429 0 4 -3.461616 8.064244 4.563894 1.690651 1 .. ... ... ... ... .. 145 -5.561548 6.836946 1.549655 3.830984 1 146 -6.841547 -2.699957 -8.790234 -0.604204 2 147 -5.903813 7.357668 2.510227 3.125459 1 148 -8.607808 0.461901 -9.125664 4.522260 0 149 -8.503622 2.326839 -8.911981 3.532953 0
[150 rows x 5 columns]
# Draw a 2-D scatter plot to inspect how the samples are distributed:
# feature 2 on the x axis, feature 1 on the y axis, coloured by true label.
import seaborn as sn

# x and y must be passed by keyword: seaborn >= 0.12 made every argument
# after `data` keyword-only, so the positional form raises TypeError there.
sn.scatterplot(x=data[2], y=data[1], hue=y)
<matplotlib.axes._subplots.AxesSubplot at 0x24649b477b8>
"""
Draw a 3-D scatter plot of the first three features of X.
"""
import matplotlib.pyplot as mp
from mpl_toolkits.mplot3d import axes3d  # noqa: F401  (registers the "3d" projection on older Matplotlib)

# Create the figure and a 3-D axes on it. Figure.add_subplot is used because
# passing keyword arguments to gca() was deprecated in Matplotlib 3.4 and
# removed in 3.6.
fig = mp.figure(facecolor="lightgray", figsize=(10, 7))
ax3d = fig.add_subplot(111, projection="3d")

ax3d.set_title('3D Scatter', fontsize=20)
# The plotted columns are synthetic make_blobs features, so label them as
# such instead of reusing names from an unrelated data set.
ax3d.set_xlabel('feature 0', fontsize=14)
ax3d.set_ylabel('feature 1', fontsize=14)
ax3d.set_zlabel('feature 2', fontsize=14)
ax3d.tick_params(labelsize=10)

ax3d.scatter(X[:, 0], X[:, 1], X[:, 2])

mp.show()
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import axes3d  # noqa: F401  (registers the "3d" projection on older Matplotlib)

# 3-D view of features 0 and 1 with the true cluster label on the z axis.
# add_subplot replaces gca(projection=...), which Matplotlib 3.6 removed.
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

# Label each axis once; calling set_xlabel three times would leave the
# x axis titled 'z' and the y/z axes unlabelled.
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_zlabel('z')
ax.scatter(X[:, 0], X[:, 1], y)
<mpl_toolkits.mplot3d.art3d.Path3DCollection at 0x24645659e10>
1from sklearn.cluster import AgglomerativeClustering # Hierarchical Clustering
2from sklearn.cluster import Birch
3from sklearn.cluster import KMeans
4from sklearn.cluster import DBSCAN
# K-means with three clusters.
# `y` is accepted by fit() only for scikit-learn API consistency; the
# clustering itself is unsupervised and does not use it.
model_kmeans1 = KMeans(n_clusters=3)
model_kmeans1.fit(X, y)
KMeans(n_clusters=3)
# DBSCAN (density-based clustering) with the library's default parameters.
model_db = DBSCAN()
model_db.fit(X=X)
DBSCAN()
# BIRCH clustering with the library's default parameters.
model_bc = Birch()
model_bc.fit(X=X)
Birch()
# Agglomerative (hierarchical) clustering with the library's default
# parameters. `y` is passed through unchanged; fit() does not use it.
model_ac = AgglomerativeClustering()
model_ac.fit(X, y)
AgglomerativeClustering()
# Evaluation against the known blob labels.
# Adjusted Rand index: 1.0 is a perfect match with the true labels, and
# values near 0 mean agreement no better than chance.
from sklearn.metrics import adjusted_rand_score

# Predict cluster assignments with the fitted 3-cluster K-means model.
# (The name `model_k` is never defined in this notebook; `model_kmeans1` is
# the K-means model fitted above.)
y_pre = model_kmeans1.predict(X)
adjusted_rand_score(y, y_pre)
0.5199330724449299
# K-means with four clusters, for comparison against the 3-cluster model.
# `y` is passed through unchanged; fit() does not use it.
model_kmeans2 = KMeans(n_clusters=4)
model_kmeans2.fit(X, y)
KMeans(n_clusters=4)
# K-means with five clusters, for comparison against the 3- and 4-cluster models.
# `y` is passed through unchanged; fit() does not use it.
model_kmeans3 = KMeans(n_clusters=5)
model_kmeans3.fit(X, y)
KMeans(n_clusters=5)
# Silhouette coefficient: ranges from -1 to 1, higher means tighter and
# better separated clusters. Compare how it changes across the fitted models.
from sklearn.metrics import silhouette_score

# Print one score per K-means model, in the order k = 3, 4, 5.
for fitted_model in (model_kmeans1, model_kmeans2, model_kmeans3):
    print(silhouette_score(X, fitted_model.labels_))
0.6932156075104747 0.3460550366520808 0.3641526730771607
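# 通过轮廓系数(silhouette coefficient)对模型进行评估:取值范围为 [-1, 1],数值越高说明聚类效果越好;同时观察不同聚类数之间的变化幅度。
# 上述评估中,聚类数为 3 时轮廓系数最高(约 0.69),聚类数增加到 4 和 5 时明显下降(约 0.35 和 0.36),变化幅度最大的位置在 3 和 4 之间,所以选择聚类数 3。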