10.7 upload

This commit is contained in:
artieyue 2022-10-07 18:33:27 +08:00
parent e87fe64f0d
commit 6cfca9f940
1 changed files with 122 additions and 0 deletions

View File

@ -0,0 +1,122 @@
# This is a sample Python script.
# Press ⌃R to execute it or replace it with your code.
# Press Double ⇧ to search everywhere for classes, files, tool windows, actions, and settings.
import os
import pandas as pd
filePath= "file1" #输入文件夹
bigname=111
# -*- coding: utf-8 -*-
import numpy as np
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster, set_link_color_palette
from matplotlib import pyplot as plt
def os_file(path): #遍历输入文件夹有多少个文件
filenames = os.listdir(path)
for filename in filenames:
print(filename)
filename1=filename[0:-5] #删掉了文件后缀
print(filename1)
if filename!=".DS_Store":
file_reader(filePath + "/" + filename,filename1)
def file_reader(path,name):
df1 = pd.read_excel(path, sheet_name="Sheet1") #输入的文件必须放在Sheet1
df1 = np.array(df1)
importance=df1[-1,:] #获取最后一行的所有列信息(权重)
importance = np.delete(importance, 0, axis=0)
df1=df1[0:-1,:] #获取除了权重行的所有数据
#print(importance)
# 把word和数组分开
word = df1[:, 0] #获取第一列所有行(词)
data = np.delete(df1, 0, axis=1) #获取除第一列的其他列信息(打分)
# print("data1")
#print(data)
data=data*importance #权重与数值相乘
#print(word)
#print("data2")
#print(data)
nums,indics=hierarchy_cluster(data,word,name)
print(indics)
for i in range(len(indics)):
group = "为一组"
for j in range(len(indics[i])):
group=word[indics[i][j]]+" "+group
print(group)
def hierarchy_cluster(data,word,name, method='complete', threshold=600.0): #complete-linkage
'''层次聚类
Arguments:
data [[0, float, ...], [float, 0, ...]] -- 文档 i 和文档 j 的距离
Keyword Arguments:
method {str} -- [linkage的方式 singlecompleteaveragecentroidmedianward] (default: {'average'})
threshold {float} -- 聚类簇之间的距离
Return:
cluster_number int -- 聚类个数
cluster [[idx1, idx2,..], [idx3]] -- 每一类下的索引
'''
data = np.array(data)
plt.figure(figsize=(10, 15), dpi=300) #代表宽和高的尺寸dpi代表分辨率
Z = linkage(data, method=method,metric='euclidean') #欧式距离公式
cluster_assignments = fcluster(Z, threshold, criterion='distance')
num_clusters = cluster_assignments.max()
indices = get_cluster_indices(cluster_assignments)
z = linkage(data, method='ward')
print(z.shape)
dendrogram(z, labels=word, color_threshold=80,orientation='right', leaf_font_size=8,above_threshold_color='black')
set_link_color_palette(['#0000FF', '#4A766E', '#2F4F4F','871F78','FF7F00','E47833','FF6666','FFCCFF']) #color_threshold是画线位置orientation调成left图换方向
#n_clusters=10 #leaf_font_size词间距
#color_threshold=25
plt.grid(True, which='minor', ls='--') #minor代表不显示网格线major代表显示
#name = "college" #设置图片标题
plt.title(name, fontdict={'fontproperties':'Times New Roman','size': 10}) #标题的字体字号
plt.yticks(fontproperties='Times New Roman', size=8) #设置y轴字体和字号大小及加粗
plt.xticks(fontproperties='Times New Roman', size=8)
plt.plot(linewidth = '0.5') #设置线粗细
f = plt.gcf()
f.savefig(name + ".png")
plt.show()
f.clear()
return num_clusters,indices
def get_cluster_indices(cluster_assignments): #层次聚类的实现函数
'''映射每一类至原数据索引
Arguments:
cluster_assignments 层次聚类后的结果
Returns:
[[idx1, idx2,..], [idx3]] -- 每一类下的索引
'''
n = cluster_assignments.max()
indices = []
for cluster_number in range(1, n + 1):
indices.append(np.where(cluster_assignments == cluster_number)[0])
return indices
if __name__ == "__main__":
os_file(filePath)