糖尿病康复,内容丰富有趣,生活中的好帮手!
糖尿病康复 > 【机器学习】AGNES层次聚类算法

【机器学习】AGNES层次聚类算法

时间:2023-01-02 23:31:01

相关推荐

【机器学习】AGNES层次聚类算法

算法思想:

1. 初始时,将每个数据点各自视为一个簇;2. 计算所有簇两两之间的距离,得到距离列表;3. 合并距离最近的两个簇 Ci 和 Cj(将 Cj 并入 Ci 后删除 Cj),使簇个数减 1,并更新距离列表;4. 重复第 3 步,直到簇个数等于所需个数 k。

数据集来源:周志华西瓜书数据集4.0

1.读取文件:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Read the CSV using its 'number' column as the row index, then convert the
# remaining columns into a plain Python list of rows.
data = pd.read_csv("./西瓜数据集4.0.csv", index_col='number').values.tolist()

2.使用matplotlib.pyplot绘出原始数据

# Plot the raw data: first column on the x-axis, second column on the y-axis.
fig, ax = plt.subplots()
xs = [point[0] for point in data]
ys = [point[1] for point in data]
plt.scatter(xs, ys)
plt.show()

3.计算欧氏距离

def calDist(a, b):
    """Return the Euclidean distance between points ``a`` and ``b``.

    Both arguments are converted with ``np.array``; the result is the square
    root of the dot product of their difference with itself.
    """
    delta = np.array(a) - np.array(b)
    squared = np.dot(delta, delta.T)
    return np.sqrt(squared)

4.使用最小距离法计算簇之间的最小距离

def calClusterMinDist(c1, c2):
    """Return the minimum pairwise distance between two clusters.

    Args:
        c1: Iterable of points in the first cluster.
        c2: Iterable of points in the second cluster.

    Returns:
        The smallest ``calDist(vec1, vec2)`` over every ``vec1`` in ``c1`` and
        ``vec2`` in ``c2``. If either cluster is empty there is no pair to
        measure and ``float("inf")`` is returned.
    """
    # Start from infinity rather than a fixed cap (previously 1e5) so that
    # clusters farther apart than the cap still report their true distance.
    return min(
        (calDist(vec1, vec2) for vec1 in c1 for vec2 in c2),
        default=float("inf"),
    )

5.使用平均距离法计算簇之间的平均距离

def calClusterAvgDist(c1, c2):
    """Return the average pairwise distance between two clusters.

    Args:
        c1: Sequence of points in the first cluster.
        c2: Sequence of points in the second cluster.

    Returns:
        The sum of ``calDist(vec1, vec2)`` over every pair, divided by the
        number of pairs ``len(c1) * len(c2)``. Returns 0 when either cluster
        is empty, since there are no pairs to average.
    """
    num = len(c1) * len(c2)
    if num == 0:
        # No pairs: avoid dividing by zero.
        return 0
    sum_dist = sum(calDist(vec1, vec2) for vec1 in c1 for vec2 in c2)
    # Divide by the pair count; returning the raw sum would make large
    # clusters look farther apart than small ones.
    return sum_dist / num

6.获取簇之间的距离列表

def getMinDistList(data, method):
    """Build the symmetric matrix of distances between every pair of clusters.

    Args:
        data: Sequence of clusters; each cluster is a sequence of points.
        method: ``"minDist"`` to measure clusters with ``calClusterMinDist``
            or ``"avgDist"`` to measure them with ``calClusterAvgDist``.

    Returns:
        A ``len(data)`` x ``len(data)`` list of lists where entry ``[i][j]``
        (and ``[j][i]``) is the distance between clusters ``i`` and ``j``.
        Diagonal entries are 0.

    Raises:
        ValueError: If ``method`` is not one of the two supported names.
    """
    # Reject unknown methods up front; otherwise the matrix would silently
    # stay all zeros and callers could not tell that nothing was computed.
    if method not in ("minDist", "avgDist"):
        raise ValueError(
            "unknown method %r; expected 'minDist' or 'avgDist'" % (method,)
        )

    cluster_num = len(data)
    minDistList = [[0] * cluster_num for _ in range(cluster_num)]
    for i in range(cluster_num):
        # Only visit j > i and mirror the value into the lower triangle.
        for j in range(i + 1, cluster_num):
            if method == "minDist":
                dist = calClusterMinDist(data[i], data[j])
            else:
                dist = calClusterAvgDist(data[i], data[j])
            minDistList[i][j] = dist
            minDistList[j][i] = dist
    return minDistList

7.寻找最小值

def findMin(minDistList):
    """Find the closest pair of distinct clusters in a distance matrix.

    Args:
        minDistList: Square list-of-lists distance matrix.

    Returns:
        A tuple ``(min_i, min_j, minDist)`` giving the row, column and value
        of the smallest off-diagonal entry. Ties keep the first entry found
        in row-major order. If the matrix has no off-diagonal entries the
        result is ``(0, 0, float("inf"))``.
    """
    minDist = float("inf")  # no fixed cap, so arbitrarily large distances count
    min_i = 0
    min_j = 0
    for i, row in enumerate(minDistList):
        for j, dist in enumerate(row):
            # Skip the diagonal by position rather than by value: a distance
            # of 0 between two different clusters (duplicate points) is a
            # valid, and in fact the best possible, merge candidate.
            if i == j:
                continue
            if dist < minDist:
                minDist = dist
                min_i = i
                min_j = j
    return min_i, min_j, minDist

8.实现

def AGNES(data, k, method):
    """Cluster ``data`` bottom-up until at most ``k`` clusters remain.

    Every point starts as its own cluster. On each pass the distance matrix
    is rebuilt with ``getMinDistList`` and the pair reported by ``findMin``
    is merged, which reduces the cluster count by one.

    Args:
        data: Sequence of points.
        k: Target number of clusters; must be at least 1.
        method: Linkage name forwarded to ``getMinDistList``.

    Returns:
        A list of clusters, each a list of points. If ``data`` has ``k`` or
        fewer points, each point is returned in its own cluster. More than
        ``k`` clusters may be returned if no mergeable pair can be found.

    Raises:
        ValueError: If ``k`` is less than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1, got %r" % (k,))

    C = [[point] for point in data]
    while len(C) > k:
        # Rebuild the matrix only when another merge is actually needed.
        minDistList = getMinDistList(C, method)
        i, j, _ = findMin(minDistList)
        if i == j:
            # No distinct pair was found. Merging a cluster with itself and
            # then deleting it would lose data, so stop instead.
            break
        C[i].extend(C[j])  # merge cluster j into cluster i
        del C[j]  # drop the now-merged cluster
    return C

9.程序入口

# Program entry point: cluster with both linkage methods and plot the results.
if __name__ == "__main__":
    C_min = AGNES(data, 3, 'minDist')
    C_avg = AGNES(data, 3, 'avgDist')

    fig, ax = plt.subplots(nrows=2, ncols=1)
    panels = (
        (ax[0], C_min, "使用最小距离进行聚类"),
        (ax[1], C_avg, "使用平均距离进行聚类"),
    )
    for axis, clusters, title in panels:
        # Draw the first three clusters in red, green and blue respectively.
        for idx, colour in enumerate("rgb"):
            members = clusters[idx]
            axis.scatter(
                [p[0] for p in members],
                [p[1] for p in members],
                c=colour,
            )
        axis.set_title(title)

    fig.tight_layout()
    plt.show()

完整代码如下:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Read the CSV using its 'number' column as the row index, then convert the
# remaining columns into a plain Python list of rows.
data = pd.read_csv("./西瓜数据集4.0.csv", index_col='number')
data = data.values.tolist()

# Plot the raw data: first column on the x-axis, second column on the y-axis.
fig, ax = plt.subplots()
plt.scatter([i[0] for i in data], [i[1] for i in data])
plt.show()


def calDist(a, b):
    """Return the Euclidean distance between points ``a`` and ``b``."""
    a = np.array(a)
    b = np.array(b)
    dist = np.sqrt(np.dot((a - b), (a - b).T))
    return dist


def calClusterMinDist(c1, c2):
    """Return the minimum pairwise distance between two clusters.

    Returns ``float("inf")`` when either cluster is empty, because there is
    no pair of points to measure.
    """
    # Start from infinity rather than a fixed cap (previously 1e5) so that
    # clusters farther apart than the cap still report their true distance.
    return min(
        (calDist(vec1, vec2) for vec1 in c1 for vec2 in c2),
        default=float("inf"),
    )


def calClusterAvgDist(c1, c2):
    """Return the average pairwise distance between two clusters.

    Returns 0 when either cluster is empty, since there are no pairs to
    average.
    """
    num = len(c1) * len(c2)
    if num == 0:
        # No pairs: avoid dividing by zero.
        return 0
    sum_dist = sum(calDist(vec1, vec2) for vec1 in c1 for vec2 in c2)
    # Divide by the pair count; returning the raw sum would make large
    # clusters look farther apart than small ones.
    return sum_dist / num


def getMinDistList(data, method):
    """Build the symmetric matrix of distances between every pair of clusters.

    Args:
        data: Sequence of clusters; each cluster is a sequence of points.
        method: ``"minDist"`` or ``"avgDist"``.

    Returns:
        A square list of lists whose ``[i][j]`` and ``[j][i]`` entries hold
        the distance between clusters ``i`` and ``j``; the diagonal is 0.

    Raises:
        ValueError: If ``method`` is not one of the two supported names.
    """
    # Reject unknown methods up front; otherwise the matrix would silently
    # stay all zeros and callers could not tell that nothing was computed.
    if method not in ("minDist", "avgDist"):
        raise ValueError(
            "unknown method %r; expected 'minDist' or 'avgDist'" % (method,)
        )

    cluster_num = len(data)
    minDistList = [[0] * cluster_num for _ in range(cluster_num)]
    for i in range(cluster_num):
        # Only visit j > i and mirror the value into the lower triangle.
        for j in range(i + 1, cluster_num):
            if method == "minDist":
                dist = calClusterMinDist(data[i], data[j])
            else:
                dist = calClusterAvgDist(data[i], data[j])
            minDistList[i][j] = dist
            minDistList[j][i] = dist
    return minDistList


def findMin(minDistList):
    """Find the closest pair of distinct clusters in a distance matrix.

    Returns:
        A tuple ``(min_i, min_j, minDist)`` for the smallest off-diagonal
        entry; ties keep the first entry found in row-major order. If there
        are no off-diagonal entries the result is ``(0, 0, float("inf"))``.
    """
    minDist = float("inf")  # no fixed cap, so arbitrarily large distances count
    min_i = 0
    min_j = 0
    for i, row in enumerate(minDistList):
        for j, dist in enumerate(row):
            # Skip the diagonal by position rather than by value: a distance
            # of 0 between two different clusters (duplicate points) is a
            # valid merge candidate.
            if i == j:
                continue
            if dist < minDist:
                minDist = dist
                min_i = i
                min_j = j
    return min_i, min_j, minDist


def AGNES(data, k, method):
    """Cluster ``data`` bottom-up until at most ``k`` clusters remain.

    Args:
        data: Sequence of points.
        k: Target number of clusters; must be at least 1.
        method: Linkage name forwarded to ``getMinDistList``.

    Returns:
        A list of clusters, each a list of points.

    Raises:
        ValueError: If ``k`` is less than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1, got %r" % (k,))

    C = [[point] for point in data]
    while len(C) > k:
        # Rebuild the matrix only when another merge is actually needed.
        minDistList = getMinDistList(C, method)
        i, j, _ = findMin(minDistList)
        if i == j:
            # No distinct pair was found; stop rather than merge a cluster
            # with itself and then delete it.
            break
        C[i].extend(C[j])  # merge cluster j into cluster i
        del C[j]  # drop the now-merged cluster
    return C


# Program entry point: cluster with both linkage methods and plot the results.
if __name__ == "__main__":
    C_min = AGNES(data, 3, 'minDist')
    C_avg = AGNES(data, 3, 'avgDist')

    fig, ax = plt.subplots(nrows=2, ncols=1)
    panels = (
        (ax[0], C_min, "使用最小距离进行聚类"),
        (ax[1], C_avg, "使用平均距离进行聚类"),
    )
    for axis, clusters, title in panels:
        # Draw the first three clusters in red, green and blue respectively.
        for idx, colour in enumerate("rgb"):
            members = clusters[idx]
            axis.scatter(
                [p[0] for p in members],
                [p[1] for p in members],
                c=colour,
            )
        axis.set_title(title)

    fig.tight_layout()
    plt.show()

如果觉得《【机器学习】AGNES层次聚类算法》对你有帮助,请点赞、收藏,并留下你的观点哦!

本内容不代表本网观点和政治立场,如有侵犯你的权益请联系我们处理。
网友评论
网友评论仅供其表达个人看法,并不表明网站立场。