我使用k-prototyps库来混合数字和数字数据类型。根据https://github.com/nicodv/kmodes/issues/46算法计算k个原型的轮廓值,计算了分类数据的轮廓分数(基于hamming距离)和数值数据的轮廓分数(基于欧几里得距离),但开发的程序比较慢,需要10h才能计算出60000条记录的轮廓。我的笔记本电脑有12G Ram和corei 7,有什么帮助可以提高代码的速度吗?
import numpy as np
import pandas as pd
from kmodes.kprototypes import KPrototypes
# -------- import data
df = pd.read_csv(r'C:\Users\data.csv')
# ------------- Normalize the data ---------------
# print(df.columns) # To get columns name
x_df = df[['R', 'F']]
x_df_norm = x_df.apply(lambda x: (x - x.min(axis=0)) / (x.max(axis=0) - x.min(axis=0)))
x_df_norm['COType'] = df[['COType']]
def calc_euclian_dis(_s1, _s2):
# s1 = np.array((3, 5))
_eucl_dist = np.linalg.norm(_s2 - _s1) # calculate Euclidean distance, accept input an array [2 6]
return _eucl_dist
def calc_simpleMatching_dis(_s1, _s2):
_cat_dist = 0
if (_s1 != _s2):
_cat_dist = 1
return _cat_dist
k = 3
# calculate silhoutte for one cluster number
kproto = KPrototypes(n_clusters=k, init='Cao', verbose=2)
clusters_label = kproto.fit_predict(x_df_norm, categorical=[2])
_identical_cluster_labels = list(dict.fromkeys(clusters_label))
# Assign clusters lables to the Dataset
x_df_norm['Cluster_label'] = clusters_label
# ------------- calculate _silhouette_Index -------------
# 1. Calculate ai
_silhouette_Index_arr = []
for i in x_df_norm.itertuples():
_ai_cluster_label = i[-1]
# return samples of the same cluster
_samples_cluster = x_df_norm[x_df_norm['Cluster_label'] == _ai_cluster_label]
_dist_array_ai = []
_s1_nume_att = np.array((i[1], i[2]))
_s1_cat_att = i[3]
for j in _samples_cluster.itertuples():
_s2_nume_att = np.array((j[1], j[2]))
_s2_cat_att = j[3]
_euclian_dis = calc_euclian_dis(_s1_nume_att, _s2_nume_att)
_cat_dis = calc_simpleMatching_dis(_s1_cat_att, _s2_cat_att)
_dist_array_ai.append(_euclian_dis + (kproto.gamma * _cat_dis))
ai = np.average(_dist_array_ai)
# 2. Calculate bi
# 2.1. determine the samples of other clusters
_identical_cluster_labels.remove(_ai_cluster_label)
_dic_cluseter = {}
_bi_arr = []
for ii in _identical_cluster_labels:
_samples = x_df_norm[x_df_norm['Cluster_label'] == ii]
# 2.2. calculate bi
_dist_array_bi = []
for j in _samples.itertuples():
_s2_nume_att = np.array((j[1], j[2]))
_s2_cat_att = j[3]
_euclian_dis = calc_euclian_dis(_s1_nume_att, _s2_nume_att)
_cat_dis = calc_simpleMatching_dis(_s1_cat_att, _s2_cat_att)
_dist_array_bi.append(_euclian_dis + (kproto.gamma * _cat_dis))
_bi_arr.append(np.average(_dist_array_bi))
_identical_cluster_labels.append(_ai_cluster_label)
# min bi is determined as final bi variable
bi = min(_bi_arr)
# 3. calculate silhouette Index
if ai == bi:
_silhouette_i = 0
elif ai < bi:
_silhouette_i = 1 - (ai / bi)
elif ai > bi:
_silhouette_i = 1 - (bi / ai)
_silhouette_Index_arr.append(_silhouette_i)
silhouette_score = np.average(_silhouette_Index_arr)
print('_silhouette_Index = ' + str(silhouette_score))发布于 2022-10-10 16:04:40
嗨!我重新实现了你的函数,使用线性代数运算符来计算异同点,而不是使用很多for循环:它要快得多:-)
def euclidean_dissim(a, b, **_):
"""Euclidean distance dissimilarity function
b is the single point, a is the matrix of vectors"""
if np.isnan(a).any() or np.isnan(b).any():
raise ValueError("Missing values detected in numerical columns.")
return np.linalg.norm(a - b, axis=1)def matching_dissim(a, b, **_):
"""Simple matching dissimilarity function
b is the single point, a is the matrix of all other vectors,
count how many matching values so difference = 0 """
# We are subtracting to dimension since is not similarity but a dissimilarity
dimension = len(b)
return dimension - np.sum((b-a)==0,axis=1)def calc_silhouette_proto(dataset,numerical_pos, cat_pos,kproto_model):
'''------------- calculate _silhouette_Index -------------'''
# 1. Compute a(i)
silhouette_Index_arr = []
for i in dataset.itertuples():
# convert tuple to np array
i = np.array(i)
unique_cluster_labels = list(np.unique(dataset['cluster_labels']))
# We need each time to remove the considered tuple from the dataset since we don't compute distances from itself
data = dataset.copy()
ai_cluster = i[-1] # The cluster is in the last position of the tuple
# Removing the tuple from the dataset
tuple_index = dataset.index.isin([i[0]])
data = data[~tuple_index]
# Get samples of the same cluster
samples_of_cluster = data[data['cluster_labels'] == ai_cluster].loc[:,data.columns!='cluster_labels'].to_numpy()
# Compute the 2 distances among the single points and all the others
euclidian_distances = euclidean_dissim(samples_of_cluster[:,numerical_pos],i[np.array(numerical_pos)+1])
categ_distances = matching_dissim(samples_of_cluster[:,cat_pos],i[np.array(cat_pos)+1])
# Weighted average of the 2 distances
ai = np.average(euclidian_distances) + (kproto_model.gamma * np.average(categ_distances))
# 2. Calculate bi
unique_cluster_labels.remove(ai_cluster)
bi_arr = []
for ii in unique_cluster_labels:
# Get all the samples of cluster ii
samples = data[data['cluster_labels'] == ii].loc[:,data.columns!='cluster_labels'].to_numpy()
# Compute the 2 distances among the single points and all the others
euclidian_distances = np.linalg.norm(samples[:,numerical_pos] - i[np.array(numerical_pos)+1], axis=1)
categ_distances = matching_dissim(samples[:,cat_pos],i[np.array(cat_pos)+1])
distance_bi = np.average(euclidian_distances) + (kproto_model.gamma * np.average(categ_distances))
bi_arr.append(np.average(distance_bi))
# min bi is determined as final bi variable
if(len(bi_arr)==0):
bi = 0
else:
bi = min(bi_arr)
# 3. calculate silhouette Index
if ai == bi:
silhouette_i = 0
elif ai < bi:
silhouette_i = 1 - (ai / bi)
elif ai > bi:
silhouette_i = 1 - (bi / ai)
silhouette_Index_arr.append(silhouette_i)
silhouette_score = np.average(silhouette_Index_arr)
return silhouette_scorehttps://stackoverflow.com/questions/65814395
复制相似问题