openbrain/pydata-huang/层次聚类和可视化.py

123 lines
4.1 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# This is a sample Python script.
# Press ⌃R to execute it or replace it with your code.
# Press Double ⇧ to search everywhere for classes, files, tool windows, actions, and settings.
import os
import pandas as pd
filePath= "file1" # Input folder scanned by os_file() when this script is run directly.
bigname=111 # NOTE(review): not referenced anywhere in the visible part of this file.
# -*- coding: utf-8 -*-
import numpy as np
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster, set_link_color_palette
from matplotlib import pyplot as plt
def os_file(path):
    """Run the clustering pipeline on every file found in a folder.

    Every directory entry except the macOS ``.DS_Store`` metadata file is
    handed to ``file_reader`` together with its name stripped of the file
    extension.

    Arguments:
        path {str} -- folder containing the input workbooks
    """
    for filename in os.listdir(path):
        print(filename)
        # splitext removes the extension whatever its length (".xls",
        # ".xlsx", ...) instead of blindly cutting the last five characters.
        stem = os.path.splitext(filename)[0]
        print(stem)
        if filename != ".DS_Store":
            # Build the location from the ``path`` argument rather than the
            # module-level default, so the files that were listed are the
            # files that get read.
            file_reader(os.path.join(path, filename), stem)
def file_reader(path,name):
    """Load one workbook, weight its scores, cluster the words and print the groups.

    The sheet "Sheet1" is read as a table in which, as indexed below, the
    first column holds the words, the other columns hold the scores and the
    last row holds one weight per score column.

    Arguments:
        path {str} -- workbook location given to pandas.read_excel
        name {str} -- label forwarded unchanged to hierarchy_cluster
    """
    # The input data must be stored in the sheet called "Sheet1".
    table = np.array(pd.read_excel(path, sheet_name="Sheet1"))

    weights = table[-1, 1:]          # last row, without the cell of the word column
    body = table[:-1]                # every row except the weight row
    word = body[:, 0]                # first column: the words
    scores = body[:, 1:] * weights   # remaining columns, scaled column-wise by the weights

    _, clusters = hierarchy_cluster(scores, word, name)
    print(clusters)
    for members in clusters:
        # Words are listed last-to-first, each followed by a space, and the
        # fixed suffix closes the line.
        prefix = "".join(word[idx] + " " for idx in reversed(members))
        print(prefix + "为一组")
def hierarchy_cluster(data,word,name, method='complete', threshold=600.0):
    '''Cluster the rows of ``data`` hierarchically and draw a dendrogram.

    Two linkages are computed on the same data: one with ``method`` and the
    Euclidean metric, which is cut at ``threshold`` to produce the returned
    clusters, and a Ward linkage, which is the tree that gets drawn.
    NOTE(review): the drawn tree therefore need not agree with the returned
    clusters -- confirm that this is intended.

    The figure is written to ``name + ".png"`` in the working directory,
    shown, and then closed.

    Arguments:
        data -- numeric observations, one row per entry of ``word``
        word -- labels of the dendrogram leaves
        name {str} -- figure title and stem of the saved image file
    Keyword Arguments:
        method {str} -- linkage method: single, complete, average, centroid, median, ward (default: {'complete'})
        threshold {float} -- distance at which the ``method`` linkage is cut (default: {600.0})
    Return:
        cluster_number int -- number of clusters
        cluster [[idx1, idx2,..], [idx3]] -- row indices belonging to each cluster
    '''
    # The caller slices this out of a mixed text/number table, which yields an
    # object array; make the numeric conversion explicit before clustering.
    data = np.asarray(data, dtype=float)
    fig = plt.figure(figsize=(10, 15), dpi=300)  # width and height in inches, resolution in dpi

    Z = linkage(data, method=method, metric='euclidean')
    cluster_assignments = fcluster(Z, threshold, criterion='distance')
    num_clusters = cluster_assignments.max()
    indices = get_cluster_indices(cluster_assignments)

    z = linkage(data, method='ward')
    print(z.shape)
    # The palette has to be installed BEFORE dendrogram() is called, otherwise
    # it only affects the plot of the next call. Every entry needs the leading
    # '#' to be a valid matplotlib hex colour.
    set_link_color_palette(['#0000FF', '#4A766E', '#2F4F4F', '#871F78',
                            '#FF7F00', '#E47833', '#FF6666', '#FFCCFF'])
    # The link lines take their width from the rc setting in force while they
    # are created; a bare plt.plot(linewidth=...) draws nothing and changes
    # no existing line.
    with plt.rc_context({'lines.linewidth': 0.5}):
        # color_threshold: links below this height are coloured per cluster;
        # orientation: direction of the tree; leaf_font_size: label size.
        dendrogram(z, labels=word, color_threshold=80, orientation='right',
                   leaf_font_size=8, above_threshold_color='black')
    plt.grid(True, which='minor', ls='--')
    plt.title(name, fontdict={'fontproperties':'Times New Roman','size': 10})  # title font and size
    plt.yticks(fontproperties='Times New Roman', size=8)  # tick label font and size
    plt.xticks(fontproperties='Times New Roman', size=8)
    fig.savefig(name + ".png")
    plt.show()
    # Close instead of clear: a cleared figure stays registered with pyplot,
    # so one large figure per processed file would pile up in memory.
    plt.close(fig)
    return num_clusters,indices
def get_cluster_indices(cluster_assignments):
    '''Group positions by the cluster label found at each position.

    Arguments:
        cluster_assignments -- array of integer cluster labels, one per item
    Returns:
        [[idx1, idx2,..], [idx3]] -- one index array per label from 1 up to
        the largest label present, in ascending label order
    '''
    highest_label = cluster_assignments.max()
    return [
        np.nonzero(cluster_assignments == label)[0]
        for label in range(1, highest_label + 1)
    ]
if __name__ == "__main__":
    # Script entry point: process every file in the default input folder.
    os_file(path=filePath)