-
Notifications
You must be signed in to change notification settings - Fork 0
/
finch.py
198 lines (160 loc) · 6.43 KB
/
finch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
import time
import argparse
import numpy as np
from sklearn import metrics
import scipy.sparse as sp
import warnings
try:
from pyflann import *
pyflann_available = True
except Exception as e:
warnings.warn('pyflann not installed: {}'.format(e))
pyflann_available = False
pass
RUN_FLANN = 70000
def clust_rank(mat, initial_rank=None, distance='cosine'):
s = mat.shape[0]
if initial_rank is not None:
orig_dist = []
elif s <= RUN_FLANN:
orig_dist = metrics.pairwise.pairwise_distances(mat, mat, metric=distance)
np.fill_diagonal(orig_dist, 1000.0)
initial_rank = np.argmin(orig_dist, axis=1)
else:
if not pyflann_available:
raise MemoryError("You should use pyflann for inputs larger than {} samples.".format(RUN_FLANN))
print('Using flann to compute 1st-neighbours at this step ...')
flann = FLANN()
result, dists = flann.nn(mat, mat, num_neighbors=2, algorithm="kdtree", trees=8, checks=128)
initial_rank = result[:, 1]
orig_dist = []
print('Step flann done ...')
# The Clustering Equation
A = sp.csr_matrix((np.ones_like(initial_rank, dtype=np.float32), (np.arange(0, s), initial_rank)), shape=(s, s))
A = A + sp.eye(s, dtype=np.float32, format='csr')
A = A @ A.T
A = A.tolil()
A.setdiag(0)
return A, orig_dist
def get_clust(a, orig_dist, min_sim=None):
if min_sim is not None:
a[np.where((orig_dist * a.toarray()) > min_sim)] = 0
num_clust, u = sp.csgraph.connected_components(csgraph=a, directed=True, connection='weak', return_labels=True)
return u, num_clust
def cool_mean(M, u):
_, nf = np.unique(u, return_counts=True)
idx = np.argsort(u)
M = M[idx, :]
M = np.vstack((np.zeros((1, M.shape[1])), M))
np.cumsum(M, axis=0, out=M)
cnf = np.cumsum(nf)
nf1 = np.insert(cnf, 0, 0)
nf1 = nf1[:-1]
M = M[cnf, :] - M[nf1, :]
M = M / nf[:, None]
return M
def get_merge(c, u, data):
if len(c) != 0:
_, ig = np.unique(c, return_inverse=True)
c = u[ig]
else:
c = u
mat = cool_mean(data, c)
return c, mat
def update_adj(adj, d):
# Update adj, keep one merge at a time
idx = adj.nonzero()
v = np.argsort(d[idx])
v = v[:2]
x = [idx[0][v[0]], idx[0][v[1]]]
y = [idx[1][v[0]], idx[1][v[1]]]
a = sp.lil_matrix(adj.get_shape())
a[x, y] = 1
return a
def req_numclust(c, data, req_clust, distance):
iter_ = len(np.unique(c)) - req_clust
c_, mat = get_merge([], c, data)
for i in range(iter_):
adj, orig_dist = clust_rank(mat, initial_rank=None, distance=distance)
adj = update_adj(adj, orig_dist)
u, _ = get_clust(adj, [], min_sim=None)
c_, mat = get_merge(c_, u, data)
return c_
def FINCH(data, initial_rank=None, req_clust=None, distance='cosine', verbose=True):
""" FINCH clustering algorithm.
:param data: Input matrix with features in rows.
:param initial_rank: Nx1 first integer neighbor indices (optional).
:param req_clust: Set output number of clusters (optional). Not recommended.
:param distance: One of ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan'] Recommended 'cosine'.
:param verbose: Print verbose output.
:return:
c: NxP matrix where P is the partition. Cluster label for every partition.
num_clust: Number of clusters.
req_c: Labels of required clusters (Nx1). Only set if `req_clust` is not None.
The code implements the FINCH algorithm described in our CVPR 2019 paper
Sarfraz et al. "Efficient Parameter-free Clustering Using First Neighbor Relations", CVPR2019
https://arxiv.org/abs/1902.11266
For academic purpose only. The code or its re-implementation should not be used for commercial use.
Please contact the author below for licensing information.
Copyright
M. Saquib Sarfraz ([email protected])
Karlsruhe Institute of Technology (KIT)
"""
# Cast input data to float32
data = data.astype(np.float32)
min_sim = None
adj, orig_dist = clust_rank(data, initial_rank, distance)
initial_rank = None
group, num_clust = get_clust(adj, [], min_sim)
c, mat = get_merge([], group, data)
if verbose:
print('Partition 0: {} clusters'.format(num_clust))
if len(orig_dist) != 0:
min_sim = np.max(orig_dist * adj.toarray())
exit_clust = 2
c_ = c
k = 1
num_clust = [num_clust]
while exit_clust > 1:
adj, orig_dist = clust_rank(mat, initial_rank, distance)
u, num_clust_curr = get_clust(adj, orig_dist, min_sim)
c_, mat = get_merge(c_, u, data)
num_clust.append(num_clust_curr)
c = np.column_stack((c, c_))
exit_clust = num_clust[-2] - num_clust_curr
if num_clust_curr == 1 or exit_clust < 1:
num_clust = num_clust[:-1]
c = c[:, :-1]
break
if verbose:
print('Partition {}: {} clusters'.format(k, num_clust[k]))
k += 1
if req_clust is not None:
if req_clust not in num_clust:
ind = [i for i, v in enumerate(num_clust) if v >= req_clust]
req_c = req_numclust(c[:, ind[-1]], data, req_clust, distance)
else:
req_c = c[:, num_clust.index(req_clust)]
else:
req_c = None
return c, num_clust, req_c
def main():
parser = argparse.ArgumentParser()
parser.add_argument('--data-path', required=True, help='Specify the path to your data csv file.')
parser.add_argument('--output-path', default=None, help='Specify the folder to write back the results.')
args = parser.parse_args()
data = np.genfromtxt(args.data_path, delimiter=",").astype(np.float32)
start = time.time()
c, num_clust, req_c = FINCH(data, initial_rank=None, req_clust=None, distance='cosine', verbose=True)
print('Time Elapsed: {:2.2f} seconds'.format(time.time() - start))
# Write back
if args.output_path is not None:
print('Writing back the results on the provided path ...')
np.savetxt(args.output_path + '/c.csv', c, delimiter=',', fmt='%d')
np.savetxt(args.output_path + '/num_clust.csv', np.array(num_clust), delimiter=',', fmt='%d')
if req_c is not None:
np.savetxt(args.output_path + '/req_c.csv', req_c, delimiter=',', fmt='%d')
else:
print('Results are not written back as the --output-path was not provided')
if __name__ == '__main__':
main()