我刚刚写了一段我很难理解的代码,任何帮助都是非常感谢的。问题是:为什么在稀疏矩阵上进行聚类要花费更多的时间、更多的内存,并且与在同一矩阵上以密集格式进行聚类不同?
这是代码。它只是分别对同一个矩阵的密集版本和稀疏版本执行以下操作:
在这两个基准测试之间,内存是手动垃圾收集的(以确保我们处于一个新的起点)。
#!.env/bin/python
# -*- coding: utf8 -*-
import time
import gc
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.cluster import MiniBatchKMeans
from memory_profiler import profile
@profile
def bench_dense():
    """Benchmark MiniBatchKMeans on a 100K x 500 random dense matrix.

    Prints a banner, clusters the matrix into 20 clusters with mini-batch
    k-means, and reports the wall-clock time spent in fitting.
    """
    print(">>>>> Dense Matrix Clustering")
    # Create a random dense matrix: 100K 'fake' documents x 500 dimensions.
    dense_matrix = np.random.random((100000, 500))
    s = time.time()
    km = MiniBatchKMeans(
        n_clusters=20, init='k-means++', batch_size=100, n_init=10, verbose=1)
    km.fit_predict(dense_matrix)  # cluster the points
    # Parenthesized print works identically on Python 2 and 3 for one arg.
    print("Clustered dense matrix in: %.3fs" % (time.time() - s))
@profile
def bench_sparse():
    """Benchmark MiniBatchKMeans on the CSR (sparse) form of a random matrix.

    Same data shape and estimator settings as bench_dense, but the input is
    converted to scipy CSR format before fitting, so the two benchmarks
    compare dense vs sparse code paths in scikit-learn.
    """
    print(">>>>>> Sparse Matrix Clustering")
    # Convert the dense random matrix (100K docs x 500 dims) to CSR format.
    # NOTE: a fully random matrix has no zeros, so CSR stores every entry
    # plus index arrays — it is larger than the dense array here.
    sparse_matrix = csr_matrix(np.random.random((100000, 500)))
    s = time.time()
    km = MiniBatchKMeans(
        n_clusters=20, init='k-means++', batch_size=100, n_init=10, verbose=1)
    km.fit_predict(sparse_matrix)
    # Parenthesized print works identically on Python 2 and 3 for one arg.
    print("Clustered sparse matrix in: %.3fs" % (time.time() - s))
if __name__ == '__main__':
np.random.seed(42)
bench_dense()
gc.collect()
np.random.seed(42)
bench_sparse()在运行此代码几次(以确保KMeans算法的随机性不是我发现的原因)时,我有一些惊喜:
下面是基准测试的输出:
>>>>> Dense Matrix Clustering
Init 1/10 with method: k-means++
Inertia for init 1/10: 11546.570096
[...]
Init 10/10 with method: k-means++
Inertia for init 10/10: 11554.093346
Minibatch iteration 1/100000: mean batch inertia: 42.160602, ewa inertia: 42.160602
Minibatch iteration 2/100000: mean batch inertia: 41.914472, ewa inertia: 42.160110
[...]
Minibatch iteration 977/100000: mean batch inertia: 41.750966, ewa inertia: 41.581670
Minibatch iteration 978/100000: mean batch inertia: 41.719181, ewa inertia: 41.581945
Converged (lack of improvement in inertia) at iteration 978/100000
Computing label assignment and total inertia
Clustered dense matrix in: 7.363s
Filename: experiments/dense_sparse_bench.py
Line # Mem usage Increment Line Contents
================================================
13 33.2 MiB 0.0 MiB @profile
14 def bench_dense():
15 # create a random dense matrix
16 33.2 MiB 0.0 MiB dense_matrix = np.random.random((
17 100000, # 100K 'fake' documents
18 241.2 MiB 208.0 MiB 500 # 500 dimensions
19 ))
20 241.3 MiB 0.1 MiB s = time.time()
21 241.3 MiB 0.0 MiB km = MiniBatchKMeans(
22 241.4 MiB 0.2 MiB n_clusters=20, init='k-means++', batch_size=100, n_init=10, verbose=1)
23 405.0 MiB 163.6 MiB km.fit_predict(dense_matrix) # cluster the points
24 405.0 MiB 0.0 MiB print "Clustered dense matrix in: %.3fs" % (time.time() - s)
>>>>> Sparse Matrix Clustering
Init 1/10 with method: k-means++
Inertia for init 1/10: 11618.817774
[...]
Init 10/10 with method: k-means++
Inertia for init 10/10: 11609.579624
Minibatch iteration 1/100000: mean batch inertia: 42.105951, ewa inertia: 42.105951
Minibatch iteration 2/100000: mean batch inertia: 42.375899, ewa inertia: 42.106491
[...]
Minibatch iteration 21/100000: mean batch inertia: 41.912611, ewa inertia: 42.258551
Minibatch iteration 22/100000: mean batch inertia: 41.662418, ewa inertia: 42.257358
Converged (lack of improvement in inertia) at iteration 22/100000
Computing label assignment and total inertia
Clustered sparse matrix in: 14.243s
Filename: experiments/dense_sparse_bench.py
Line # Mem usage Increment Line Contents
================================================
27 38.5 MiB 0.0 MiB @profile
28 def bench_sparse():
29 # convert the dense matrix in sparse format
30 38.5 MiB 0.0 MiB sparse_matrix = csr_matrix(np.random.random((
31 100000, # 100K 'fake' documents
32 271.0 MiB 232.5 MiB 500 # 500 dimensions
33 )))
34 271.1 MiB 0.1 MiB s = time.time()
35 271.1 MiB 0.0 MiB km = MiniBatchKMeans(
36 271.2 MiB 0.1 MiB n_clusters=20, init='k-means++', batch_size=100, n_init=10, verbose=1)
37 598.5 MiB 327.3 MiB km.fit_predict(sparse_matrix)
38 598.5 MiB 0.0 MiB print "Clustered sparse matrix in: %.3fs" % (time.time() - s)
提前感谢!
发布于 2015-07-16 13:55:17
你……发现了一个 bug。
你能提供你的平台、scikit-learn 版本等信息吗?这样我就可以向 sklearn 开发人员报告这个问题了。这是一个与 CSR(压缩稀疏行)格式有关的 bug。
我稍微修改了您的脚本(在 MiniBatchKMeans 构造函数中设置 random_state,以确保两次运行结果"相同"),然后开始排查。差异出现在第一批中心重新分配(reassignment)之后。于是我修改了 k_means_.py 以输出一些变量:在 "if n_reassigns" 分支中添加了这些 print 语句:
print "to_reassign",to_reassign
print "np.where(to_reassign)",np.where(to_reassign)
print "new_centers", new_centers
print "centers", centers[:,0]
assert False
然后,我将 verbose 改为 0,并得到了以下输出:
>>>>> Dense Matrix Clustering
b
to_reassign [False False False False False False False False False False True False
False True False False True False True False]
np.where(to_reassign) (array([10, 13, 16, 18], dtype=int64),)
new_centers [11 24 33 72]
centers [ 0.51612664 0.48724141 0.50478939 0.46328761 0.41928756 0.50768023
0.48635517 0.48744328 0.59401064 0.55509388 0.33723042 0.37875769
0.5366691 0.71604087 0.36911868 0.4626776 0.37506238 0.60670616
0.21136754 0.54321791]
>>>>>> Sparse Matrix Clustering
a
to_reassign [False False False False False False False False False False True False
False True False False True False True False]
np.where(to_reassign) (array([10, 13, 16, 18], dtype=int64),)
new_centers [11 24 33 72]
centers [ 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0.33723042 0. 0.
0.71604087 0. 0. 0.37506238 0. 0.21136754
0.        ]
下面是我修改后的你的脚本版本:
import time
import gc
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.cluster import MiniBatchKMeans as MiniBatchKMeans
#from memory_profiler import profile
#@profile
def bench_dense(a_random_matrix):
    """Cluster a dense copy of ``a_random_matrix`` with MiniBatchKMeans.

    A fixed random_state makes the run reproducible so the dense and sparse
    benchmarks can be compared point-for-point.

    :param a_random_matrix: 2-D numpy array of sample vectors.
    """
    print(">>>>> Dense Matrix Clustering")
    # Work on a copy so the caller's matrix is untouched.
    dense_matrix = a_random_matrix.copy()
    s = time.time()
    km = MiniBatchKMeans(
        n_clusters=20, init='k-means++',
        batch_size=100,
        n_init=10, verbose=0,
        random_state=37,)
    km.fit_predict(dense_matrix)  # cluster the points
    print("Clustered dense matrix in: %.3fs" % (time.time() - s))
#@profile
def bench_sparse(a_random_matrix):
    """Cluster the CSR (sparse) form of ``a_random_matrix`` with MiniBatchKMeans.

    Uses the same estimator settings and random_state as bench_dense so any
    difference in results isolates the sparse code path.

    :param a_random_matrix: 2-D numpy array of sample vectors.
    :raises AssertionError: if the CSR conversion does not match the input.
    """
    print(">>>>>> Sparse Matrix Clustering")
    # Convert the dense matrix to CSR sparse format.
    sparse_matrix = csr_matrix(a_random_matrix.copy())
    # Verify the conversion is lossless. The original check,
    # np.all((sparse_matrix == a_random_matrix).sum()), was truthy whenever
    # ANY element matched; requiring zero mismatches is the correct test.
    assert (sparse_matrix != a_random_matrix).nnz == 0
    s = time.time()
    km = MiniBatchKMeans(
        n_clusters=20, init='k-means++',
        batch_size=100,
        n_init=10, verbose=0,
        random_state=37,)
    km.fit_predict(sparse_matrix)
    print("Clustered sparse matrix in: %.3fs" % (time.time() - s))
if __name__ == '__main__':
a_random_matrix = np.random.random((
100000, # 100K 'fake' documents
500 # 500 dimensions
))
try:
np.random.seed(42)
bench_dense(a_random_matrix)
except AssertionError, e:
print e
gc.collect()
try:
np.random.seed(42)
bench_sparse(a_random_matrix)
except AssertionError, e:
print ehttps://stackoverflow.com/questions/31337217
复制相似问题