# -*- coding: utf-8 -*-
# Source: pydata-huang/层次聚类和可视化.py (patch "10.7 upload", 2022-10-07).
"""Hierarchical clustering and dendrogram plotting for Excel score sheets.

Every workbook found in the input folder is read from its "Sheet1", the
scores are multiplied by the weights stored in the last row, the rows are
clustered, a dendrogram is saved as ``<workbook name>.png`` and the members
of every cluster are printed.
"""
import os

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram, fcluster, linkage, set_link_color_palette

filePath = "file1"  # input folder
bigname = 111  # not referenced anywhere in this file; kept so importers still find it

# Branch colours for the dendrogram.  Every entry needs the leading '#':
# matplotlib rejects bare hex strings such as 'FF7F00'.
_LINK_COLORS = [
    '#0000FF', '#4A766E', '#2F4F4F', '#871F78',
    '#FF7F00', '#E47833', '#FF6666', '#FFCCFF',
]


def os_file(path):
    """Run :func:`file_reader` on every entry of the folder *path*.

    Each entry name is printed, followed by the same name with its extension
    removed; the latter is used as plot title and PNG base name.  The macOS
    metadata file ``.DS_Store`` is listed but not processed.

    Arguments:
        path {str} -- folder containing the input workbooks
    """
    for filename in os.listdir(path):
        print(filename)
        # splitext copes with any extension length (.xls, .xlsx, ...),
        # unlike slicing off a fixed five characters.
        stem = os.path.splitext(filename)[0]
        print(stem)
        if filename != ".DS_Store":
            # Build the location from the argument, not the module-level
            # default, so the function works for any folder.
            file_reader(os.path.join(path, filename), stem)


def _format_group(word, members):
    """Return the line printed for one cluster.

    The words are listed last-member-first and followed by the suffix
    "为一组", separated by single spaces.
    """
    return " ".join([str(word[i]) for i in members[::-1]] + ["为一组"])


def file_reader(path, name):
    """Cluster the rows of one workbook and print every resulting group.

    The data must be on the sheet named "Sheet1".  After pandas has consumed
    the header row, the first column holds the words, the remaining columns
    hold the scores, and the last row holds one weight per score column.

    Arguments:
        path {str} -- location of the workbook
        name {str} -- title of the dendrogram and base name of the saved PNG
    """
    table = np.array(pd.read_excel(path, sheet_name="Sheet1"))

    importance = table[-1, 1:]  # weight row without its label cell
    body = table[:-1, :]        # everything except the weight row

    # Separate the words from the numeric scores.
    word = body[:, 0]
    # The mixed text/number sheet yields an object array; convert the weighted
    # scores to float explicitly before clustering.
    data = (body[:, 1:] * importance).astype(float)

    _, indics = hierarchy_cluster(data, word, name)
    print(indics)
    for members in indics:
        print(_format_group(word, members))


def hierarchy_cluster(data, word, name, method='complete', threshold=600.0,
                      *, plot_method='ward', color_threshold=80):
    """Cluster the rows of *data* and save/show a dendrogram of them.

    Arguments:
        data [[float, ...], ...] -- one row of feature values per item; rows
            are compared with the Euclidean distance
        word -- one label per row, shown on the dendrogram leaves
        name {str} -- plot title; the figure is saved as ``name + ".png"``

    Keyword Arguments:
        method {str} -- linkage used for the returned clusters: single,
            complete, average, centroid, median or ward (default: {'complete'})
        threshold {float} -- distance at which the linkage tree is cut into
            flat clusters (default: {600.0})
        plot_method {str} -- linkage used for the drawn dendrogram
            (default: {'ward'})
        color_threshold {float} -- distance below which dendrogram branches
            are coloured per cluster (default: {80})

    Return:
        cluster_number int -- number of clusters
        cluster [[idx1, idx2,..], [idx3]] -- row indices of every cluster
    """
    data = np.array(data)

    # Flat clusters that are returned to the caller.
    Z = linkage(data, method=method, metric='euclidean')
    cluster_assignments = fcluster(Z, threshold, criterion='distance')
    num_clusters = cluster_assignments.max()
    indices = get_cluster_indices(cluster_assignments)

    # NOTE(review): the picture is drawn from a separate linkage with its own
    # threshold, so its coloured groups need not match the returned clusters.
    # Pass plot_method=method and color_threshold=threshold to align them.
    plot_linkage = linkage(data, method=plot_method)
    print(plot_linkage.shape)

    # Created only after the linkages succeeded, so a failure above does not
    # leave an orphaned figure behind.  figsize is (width, height) in inches.
    fig = plt.figure(figsize=(10, 15), dpi=300)
    # The palette must be installed before dendrogram() runs to affect it.
    set_link_color_palette(_LINK_COLORS)
    try:
        # The branch line width is taken from rcParams while the dendrogram
        # is being drawn; an empty plt.plot(linewidth=...) call has no effect.
        with plt.rc_context({'lines.linewidth': 0.5}):
            dendrogram(plot_linkage, labels=word, color_threshold=color_threshold,
                       orientation='right', leaf_font_size=8,
                       above_threshold_color='black')

        # Grid restricted to minor ticks; presumably nothing is drawn unless
        # minor ticks are enabled on the axes.
        plt.grid(True, which='minor', ls='--')
        plt.title(name, fontdict={'fontproperties': 'Times New Roman', 'size': 10})
        plt.yticks(fontproperties='Times New Roman', size=8)
        plt.xticks(fontproperties='Times New Roman', size=8)

        fig.savefig(name + ".png")
        plt.show()
    finally:
        # Restore scipy's default palette and release the figure so repeated
        # calls neither leak global state nor accumulate open figures.
        set_link_color_palette(None)
        plt.close(fig)

    return num_clusters, indices


def get_cluster_indices(cluster_assignments):
    """Map every cluster id to the indices of the rows assigned to it.

    Arguments:
        cluster_assignments -- 1-D sequence of cluster ids, numbered from 1

    Returns:
        [[idx1, idx2,..], [idx3]] -- one index array per cluster id, in
        ascending id order; an empty list for empty input
    """
    assignments = np.asarray(cluster_assignments)
    if assignments.size == 0:
        return []
    return [np.where(assignments == cluster_number)[0]
            for cluster_number in range(1, assignments.max() + 1)]


if __name__ == "__main__":
    os_file(filePath)