From 2d0835b84cd4d45d117ff8aff1b81517384ccf68 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Mon, 26 Feb 2024 13:12:26 +0100 Subject: [PATCH 1/6] Moved feature importance estimation methods into one file --- turftopic/centroid_distance.py | 22 ---------------------- turftopic/feature_importance.py | 33 +++++++++++++++++++++++++++++++++ turftopic/models/gmm.py | 2 +- turftopic/soft_ctf_idf.py | 15 --------------- 4 files changed, 34 insertions(+), 38 deletions(-) delete mode 100644 turftopic/centroid_distance.py create mode 100644 turftopic/feature_importance.py delete mode 100644 turftopic/soft_ctf_idf.py diff --git a/turftopic/centroid_distance.py b/turftopic/centroid_distance.py deleted file mode 100644 index c17d6ef..0000000 --- a/turftopic/centroid_distance.py +++ /dev/null @@ -1,22 +0,0 @@ -import numpy as np -from sklearn.metrics import pairwise_distances -from sklearn.preprocessing import normalize - - -def cluster_centroid_distance( - cluster_labels, embeddings, vocab_embeddings, metric="euclidean" -): - centroids = [] - unique_labels = np.unique(cluster_labels) - unique_labels = np.sort(unique_labels) - for label in unique_labels: - centroid = np.mean(embeddings[cluster_labels == label], axis=0) - centroids.append(centroid) - centroids = np.stack(centroids) - distances = pairwise_distances(centroids, vocab_embeddings, metric=metric) - similarities = -distances / np.max(distances) - # Z-score transformation - similarities = (similarities - np.mean(similarities)) / np.std( - similarities - ) - return similarities diff --git a/turftopic/feature_importance.py b/turftopic/feature_importance.py new file mode 100644 index 0000000..04ea84d --- /dev/null +++ b/turftopic/feature_importance.py @@ -0,0 +1,33 @@ +import numpy as np +import scipy.sparse as spr +from sklearn.metrics import pairwise_distances +from sklearn.preprocessing import normalize + + +def cluster_centroid_distance( + cluster_centroids: np.ndarray, + vocab_embeddings: np.ndarray, + metric="cosine", +) -> np.ndarray: + distances = pairwise_distances( + cluster_centroids, vocab_embeddings, metric=metric + ) + similarities = -distances / np.max(distances) + # Z-score transformation + similarities = (similarities - np.mean(similarities)) / np.std( + similarities + ) + return similarities + + +def soft_ctf_idf( + doc_topic_matrix: np.ndarray, doc_term_matrix: spr.csr_matrix +) -> np.ndarray: + eps = np.finfo(float).eps + term_importance = doc_topic_matrix.T @ doc_term_matrix + overall_in_topic = np.abs(term_importance).sum(axis=1) + n_docs = len(doc_topic_matrix) + tf = (term_importance.T / (overall_in_topic + eps)).T + idf = np.log(n_docs / (np.abs(term_importance).sum(axis=0) + eps)) + ctf_idf = tf * idf + return ctf_idf diff --git a/turftopic/models/gmm.py b/turftopic/models/gmm.py index 4ed5820..fe1380b 100644 --- a/turftopic/models/gmm.py +++ b/turftopic/models/gmm.py @@ -9,7 +9,7 @@ from turftopic.base import ContextualModel, Encoder from turftopic.dynamic import DynamicTopicModel, bin_timestamps -from turftopic.soft_ctf_idf import soft_ctf_idf +from turftopic.feature_importance import soft_ctf_idf from turftopic.vectorizer import default_vectorizer diff --git a/turftopic/soft_ctf_idf.py b/turftopic/soft_ctf_idf.py deleted file mode 100644 index bb54639..0000000 --- a/turftopic/soft_ctf_idf.py +++ /dev/null @@ -1,15 +0,0 @@ -import numpy as np -import scipy.sparse as spr - - -def soft_ctf_idf( - doc_topic_matrix: np.ndarray, doc_term_matrix: spr.csr_matrix -) -> np.ndarray: - eps = 
np.finfo(float).eps - term_importance = doc_topic_matrix.T @ doc_term_matrix - overall_in_topic = np.abs(term_importance).sum(axis=1) - n_docs = len(doc_topic_matrix) - tf = (term_importance.T / (overall_in_topic + eps)).T - idf = np.log(n_docs / (np.abs(term_importance).sum(axis=0) + eps)) - ctf_idf = tf * idf - return ctf_idf From 302f51ef9ef4364b68d9879f9ee775b51843e3ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Mon, 26 Feb 2024 13:12:59 +0100 Subject: [PATCH 2/6] Added Hierarchical topic reduction to clustering models --- turftopic/models/cluster.py | 131 ++++++++++++++++++++++++++++++------ 1 file changed, 109 insertions(+), 22 deletions(-) diff --git a/turftopic/models/cluster.py b/turftopic/models/cluster.py index b363428..0bf0f5f 100644 --- a/turftopic/models/cluster.py +++ b/turftopic/models/cluster.py @@ -4,14 +4,16 @@ from rich.console import Console from sentence_transformers import SentenceTransformer from sklearn.base import ClusterMixin, TransformerMixin -from sklearn.cluster import OPTICS +from sklearn.cluster import OPTICS, AgglomerativeClustering from sklearn.feature_extraction.text import CountVectorizer from sklearn.manifold import TSNE from sklearn.preprocessing import label_binarize from turftopic.base import ContextualModel, Encoder -from turftopic.centroid_distance import cluster_centroid_distance -from turftopic.soft_ctf_idf import soft_ctf_idf +from turftopic.feature_importance import ( + cluster_centroid_distance, + soft_ctf_idf, +) from turftopic.vectorizer import default_vectorizer integer_message = """ @@ -22,9 +24,28 @@ where the number of clusters is predefined. For instance: ClusteringTopicModel(clustering=KMeans(10)) + +Alternatively you can reduce the number of topics in the model by specifying +the desired reduced number on initialization. + +ClusteringTopicModel(n_reduce_to=10) """ +def calculate_topic_vectors( + cluster_labels: np.ndarray, embeddings: np.ndarray +) -> np.ndarray: + """Calculates topic centroids.""" + centroids = [] + unique_labels = np.unique(cluster_labels) + unique_labels = np.sort(unique_labels) + for label in unique_labels: + centroid = np.mean(embeddings[cluster_labels == label], axis=0) + centroids.append(centroid) + centroids = np.stack(centroids) + return centroids + + class ClusteringTopicModel(ContextualModel, ClusterMixin): """Topic models, which assume topics to be clusters of documents in semantic space. @@ -66,6 +87,10 @@ class ClusteringTopicModel(ContextualModel, ClusterMixin): 'centroid' uses distances from cluster centroid similarly to Top2Vec. 'ctfidf' uses BERTopic's c-tf-idf. + n_reduce_to: int, default None + Number of topics to reduce topics to. + The specified reduction method will be used to merge them. + By default, topics are not merged. 
""" def __init__( @@ -77,6 +102,7 @@ def __init__( dimensionality_reduction: Optional[TransformerMixin] = None, clustering: Optional[ClusterMixin] = None, feature_importance: Literal["ctfidf", "centroid"] = "ctfidf", + n_reduce_to: Optional[int] = None, ): self.encoder = encoder if isinstance(encoder, int): @@ -100,6 +126,65 @@ def __init__( else: self.dimensionality_reduction = dimensionality_reduction self.feature_importance = feature_importance + self.n_reduce_to = n_reduce_to + + def _merge_agglomerative(self, n_reduce_to: int) -> dict[int, int]: + n_topics = self.components_.shape[0] + res = {old_label: old_label for old_label in self.classes_} + if n_topics <= n_reduce_to: + return res + interesting_topic_vectors = np.stack( + [ + vec + for label, vec in zip(self.classes_, self.topic_vectors_) + if label != -1 + ] + ) + old_labels = [label for label in self.classes_ if label != -1] + new_labels = AgglomerativeClustering( + n_clusters=n_reduce_to, metric="cosine", linkage="average" + ).fit_predict(interesting_topic_vectors) + res = {} + if -1 in self.classes_: + res[-1] = -1 + for i_old, i_new in zip(old_labels, new_labels): + res[i_old] = i_new + return res + + def _estimate_parameters( + self, + cluster_labels: np.ndarray, + embeddings: np.ndarray, + doc_term_matrix: np.ndarray, + status, + console, + ): + clusters = np.unique(cluster_labels) + self.classes_ = np.sort(clusters) + self.topic_sizes_ = np.array( + [np.sum(cluster_labels == label) for label in self.classes_] + ) + status.update("Calculating topic vectors.") + self.topic_vectors_ = calculate_topic_vectors( + cluster_labels, embeddings + ) + console.log("Topic vectors calculated") + status.update("Encoding vocabulary") + self.vocab_embeddings = self.encoder_.encode(self.vectorizer.get_feature_names_out()) # type: ignore + console.log("Vocabulary encoded") + status.update("Estimating term importances") + if self.feature_importance == "ctfidf": + document_topic_matrix = label_binarize( + cluster_labels, classes=self.classes_ + ) + self.components_ = soft_ctf_idf(document_topic_matrix, doc_term_matrix) # type: ignore + else: + self.components_ = cluster_centroid_distance( + self.topic_vectors_, + self.vocab_embeddings, + metric="cosine", + ) + self.labels_ = cluster_labels def fit_predict( self, raw_documents, y=None, embeddings: Optional[np.ndarray] = None @@ -127,9 +212,8 @@ def fit_predict( embeddings = self.encoder_.encode(raw_documents) console.log("Encoding done.") status.update("Extracting terms") - doc_term_matrix = self.vectorizer.fit_transform(raw_documents) + self.doc_term_matrix = self.vectorizer.fit_transform(raw_documents) console.log("Term extraction done.") - vocab = self.vectorizer.get_feature_names_out() status.update("Reducing Dimensionality") reduced_embeddings = self.dimensionality_reduction.fit_transform( embeddings @@ -137,27 +221,30 @@ def fit_predict( console.log("Dimensionality reduction done.") status.update("Clustering documents") cluster_labels = self.clustering.fit_predict(reduced_embeddings) - clusters = np.unique(cluster_labels) + print(np.unique(cluster_labels)) console.log("Clustering done.") - self.classes_ = np.sort(clusters) - status.update("Estimating term importances") - if self.feature_importance == "ctfidf": - document_topic_matrix = label_binarize( - cluster_labels, classes=self.classes_ + self._estimate_parameters( + cluster_labels, + embeddings, + self.doc_term_matrix, + status, + console, + ) + if self.n_reduce_to is not None: + status.update("Reducing topics.") + self.mapping_ = 
self._merge_agglomerative(self.n_reduce_to) + cluster_labels = np.array( + [self.mapping_[label] for label in cluster_labels] ) - self.components_ = soft_ctf_idf(document_topic_matrix, doc_term_matrix) # type: ignore - else: - status.update("Encoding vocabulary") - vocab_embeddings = self.encoder_.encode(vocab) # type: ignore - self.components_ = cluster_centroid_distance( - cluster_labels, + self._estimate_parameters( + np.array(cluster_labels), embeddings, - vocab_embeddings, - metric="euclidean", + self.doc_term_matrix, + status, + console, ) - self.labels_ = cluster_labels - console.log("Model fitting done.") - return cluster_labels + console.log("Model fitting done.") + return self.labels_ def fit_transform( self, raw_documents, y=None, embeddings: Optional[np.ndarray] = None From c94fcc8ef680452ed2cfa4ed9a421f1d0dd30e6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Mon, 26 Feb 2024 13:29:38 +0100 Subject: [PATCH 3/6] Added c-TF-IDF proper to clustering topic models --- turftopic/feature_importance.py | 17 +++++++++++++++++ turftopic/models/cluster.py | 22 +++++++++++++--------- 2 files changed, 30 insertions(+), 9 deletions(-) diff --git a/turftopic/feature_importance.py b/turftopic/feature_importance.py index 04ea84d..6950270 100644 --- a/turftopic/feature_importance.py +++ b/turftopic/feature_importance.py @@ -31,3 +31,20 @@ def soft_ctf_idf( idf = np.log(n_docs / (np.abs(term_importance).sum(axis=0) + eps)) ctf_idf = tf * idf return ctf_idf + + +def ctf_idf( + doc_topic_matrix: np.ndarray, doc_term_matrix: spr.csr_matrix +) -> np.ndarray: + labels = np.argmax(doc_topic_matrix, axis=1) + n_topics = doc_topic_matrix.shape[1] + components = [] + overall_freq = np.ravel(np.asarray(doc_term_matrix.sum(axis=0))) + average = overall_freq.sum() / n_topics + for i_topic in range(n_topics): + freq = np.ravel( + np.asarray(doc_term_matrix[labels == i_topic].sum(axis=0)) + ) + component = freq * np.log(1 + average / overall_freq) + components.append(component) + return np.stack(components) diff --git a/turftopic/models/cluster.py b/turftopic/models/cluster.py index 0bf0f5f..9e68dbb 100644 --- a/turftopic/models/cluster.py +++ b/turftopic/models/cluster.py @@ -12,6 +12,7 @@ from turftopic.base import ContextualModel, Encoder from turftopic.feature_importance import ( cluster_centroid_distance, + ctf_idf, soft_ctf_idf, ) from turftopic.vectorizer import default_vectorizer @@ -52,7 +53,7 @@ class ClusteringTopicModel(ContextualModel, ClusterMixin): Models also include a dimensionality reduction step to aid clustering. ```python - from turftopic import KeyNMF + from turftopic import ClusteringTopicModel from sklearn.cluster import HDBSCAN import umap @@ -82,11 +83,13 @@ class ClusteringTopicModel(ContextualModel, ClusterMixin): Clustering method to use for finding topics. Defaults to OPTICS with 25 minimum cluster size. To imitate the behavior of BERTopic or Top2Vec you should use HDBSCAN. - feature_importance: 'ctfidf' or 'centroid', default 'ctfidf' + feature_importance: 'soft-c-tf-idf', 'c-tf-idf' or 'centroid', default 'soft-c-tf-idf' Method for estimating term importances. 'centroid' uses distances from cluster centroid similarly to Top2Vec. - 'ctfidf' uses BERTopic's c-tf-idf. + 'c-tf-idf' uses BERTopic's c-tf-idf. + 'soft-c-tf-idf' uses Soft c-TF-IDF from GMM, the results should + be very similar to 'c-tf-idf'. n_reduce_to: int, default None Number of topics to reduce topics to. The specified reduction method will be used to merge them. 
@@ -173,17 +176,19 @@ def _estimate_parameters( self.vocab_embeddings = self.encoder_.encode(self.vectorizer.get_feature_names_out()) # type: ignore console.log("Vocabulary encoded") status.update("Estimating term importances") - if self.feature_importance == "ctfidf": - document_topic_matrix = label_binarize( - cluster_labels, classes=self.classes_ - ) + document_topic_matrix = label_binarize( + cluster_labels, classes=self.classes_ + ) + if self.feature_importance == "soft-c-tf-idf": self.components_ = soft_ctf_idf(document_topic_matrix, doc_term_matrix) # type: ignore - else: + elif self.feature_importance == "centroid": self.components_ = cluster_centroid_distance( self.topic_vectors_, self.vocab_embeddings, metric="cosine", ) + else: + self.components_ = ctf_idf(document_topic_matrix, doc_term_matrix) self.labels_ = cluster_labels def fit_predict( @@ -221,7 +226,6 @@ def fit_predict( console.log("Dimensionality reduction done.") status.update("Clustering documents") cluster_labels = self.clustering.fit_predict(reduced_embeddings) - print(np.unique(cluster_labels)) console.log("Clustering done.") self._estimate_parameters( cluster_labels, From 0dec16e61d45b676730c0d5a988813b552762b08 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Mon, 26 Feb 2024 14:20:30 +0100 Subject: [PATCH 4/6] Added Top2Vec's smallest hierarchical reduction method --- turftopic/models/cluster.py | 96 ++++++++++++++++++++++++++----------- 1 file changed, 68 insertions(+), 28 deletions(-) diff --git a/turftopic/models/cluster.py b/turftopic/models/cluster.py index 9e68dbb..d115539 100644 --- a/turftopic/models/cluster.py +++ b/turftopic/models/cluster.py @@ -7,6 +7,7 @@ from sklearn.cluster import OPTICS, AgglomerativeClustering from sklearn.feature_extraction.text import CountVectorizer from sklearn.manifold import TSNE +from sklearn.metrics.pairwise import cosine_distances from sklearn.preprocessing import label_binarize from turftopic.base import ContextualModel, Encoder @@ -33,6 +34,35 @@ """ +def smallest_hierarchical_join( + topic_vectors: np.ndarray, + topic_sizes: np.ndarray, + classes_: np.ndarray, + n_to: int, +) -> list[tuple]: + """Iteratively joins smallest topics.""" + merge_inst = [] + topic_vectors = np.copy(topic_vectors) + topic_sizes = np.copy(topic_sizes) + classes = list(classes_) + while len(classes) > n_to: + smallest = np.argmin(topic_sizes) + dist = cosine_distances( + np.atleast_2d(topic_vectors[smallest]), topic_vectors + ) + closest = np.argsort(dist[0])[1] + merge_inst.append((classes[smallest], classes[closest])) + classes.pop(smallest) + topic_vectors[closest] = ( + (topic_vectors[smallest] * topic_sizes[smallest]) + + (topic_vectors[closest] * topic_sizes[closest]) + ) / (topic_sizes[smallest] + topic_sizes[closest]) + topic_vectors = np.delete(topic_vectors, smallest, axis=0) + topic_sizes[closest] = topic_sizes[closest] + topic_sizes[smallest] + topic_sizes = np.delete(topic_sizes, smallest, axis=0) + return merge_inst + + def calculate_topic_vectors( cluster_labels: np.ndarray, embeddings: np.ndarray ) -> np.ndarray: @@ -94,6 +124,7 @@ class ClusteringTopicModel(ContextualModel, ClusterMixin): Number of topics to reduce topics to. The specified reduction method will be used to merge them. By default, topics are not merged. 
+ reduction_method: 'agglomerative', 'smallest' """ def __init__( @@ -106,6 +137,9 @@ def __init__( clustering: Optional[ClusterMixin] = None, feature_importance: Literal["ctfidf", "centroid"] = "ctfidf", n_reduce_to: Optional[int] = None, + reduction_method: Literal[ + "agglomerative", "smallest" + ] = "agglomerative", ): self.encoder = encoder if isinstance(encoder, int): @@ -130,8 +164,9 @@ def __init__( self.dimensionality_reduction = dimensionality_reduction self.feature_importance = feature_importance self.n_reduce_to = n_reduce_to + self.reduction_method = reduction_method - def _merge_agglomerative(self, n_reduce_to: int) -> dict[int, int]: + def _merge_agglomerative(self, n_reduce_to: int) -> np.ndarray: n_topics = self.components_.shape[0] res = {old_label: old_label for old_label in self.classes_} if n_topics <= n_reduce_to: @@ -152,32 +187,34 @@ def _merge_agglomerative(self, n_reduce_to: int) -> dict[int, int]: res[-1] = -1 for i_old, i_new in zip(old_labels, new_labels): res[i_old] = i_new - return res + return np.array([res[label] for label in self.labels_]) + + def _merge_smallest(self, n_reduce_to: int): + merge_inst = smallest_hierarchical_join( + self.topic_vectors_[self.classes_ != -1], + self.topic_sizes_[self.classes_ != -1], + self.classes_[self.classes_ != -1], + n_reduce_to, + ) + labels = np.copy(self.labels_) + for from_topic, to_topic in merge_inst: + labels[labels == from_topic] = to_topic + return labels def _estimate_parameters( self, - cluster_labels: np.ndarray, embeddings: np.ndarray, doc_term_matrix: np.ndarray, - status, - console, ): - clusters = np.unique(cluster_labels) + clusters = np.unique(self.labels_) self.classes_ = np.sort(clusters) self.topic_sizes_ = np.array( - [np.sum(cluster_labels == label) for label in self.classes_] - ) - status.update("Calculating topic vectors.") - self.topic_vectors_ = calculate_topic_vectors( - cluster_labels, embeddings + [np.sum(self.labels_ == label) for label in self.classes_] ) - console.log("Topic vectors calculated") - status.update("Encoding vocabulary") + self.topic_vectors_ = calculate_topic_vectors(self.labels_, embeddings) self.vocab_embeddings = self.encoder_.encode(self.vectorizer.get_feature_names_out()) # type: ignore - console.log("Vocabulary encoded") - status.update("Estimating term importances") document_topic_matrix = label_binarize( - cluster_labels, classes=self.classes_ + self.labels_, classes=self.classes_ ) if self.feature_importance == "soft-c-tf-idf": self.components_ = soft_ctf_idf(document_topic_matrix, doc_term_matrix) # type: ignore @@ -189,7 +226,6 @@ def _estimate_parameters( ) else: self.components_ = ctf_idf(document_topic_matrix, doc_term_matrix) - self.labels_ = cluster_labels def fit_predict( self, raw_documents, y=None, embeddings: Optional[np.ndarray] = None @@ -225,28 +261,32 @@ def fit_predict( ) console.log("Dimensionality reduction done.") status.update("Clustering documents") - cluster_labels = self.clustering.fit_predict(reduced_embeddings) + self.labels_ = self.clustering.fit_predict(reduced_embeddings) console.log("Clustering done.") + status.update("Estimating parameters.") self._estimate_parameters( - cluster_labels, embeddings, self.doc_term_matrix, - status, - console, ) + console.log("Parameter estimation done.") if self.n_reduce_to is not None: - status.update("Reducing topics.") - self.mapping_ = self._merge_agglomerative(self.n_reduce_to) - cluster_labels = np.array( - [self.mapping_[label] for label in cluster_labels] + n_topics = self.classes_.shape[0] + 
status.update( + f"Reducing topics from {n_topics} to {self.n_reduce_to}" + ) + if self.reduction_method == "agglomerative": + self.labels_ = self._merge_agglomerative(self.n_reduce_to) + else: + self.labels_ = self._merge_smallest(self.n_reduce_to) + console.log( + f"Topic reduction done from {n_topics} to {self.n_reduce_to}." ) + status.update("Reestimating parameters.") self._estimate_parameters( - np.array(cluster_labels), embeddings, self.doc_term_matrix, - status, - console, ) + console.log("Reestimation done.") console.log("Model fitting done.") return self.labels_ From 04bd814375d0d28f68bd4f729778eb50f6dd3bb3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Mon, 26 Feb 2024 14:21:09 +0100 Subject: [PATCH 5/6] Corrected error in merging --- turftopic/models/cluster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/turftopic/models/cluster.py b/turftopic/models/cluster.py index d115539..23f6727 100644 --- a/turftopic/models/cluster.py +++ b/turftopic/models/cluster.py @@ -170,7 +170,7 @@ def _merge_agglomerative(self, n_reduce_to: int) -> np.ndarray: n_topics = self.components_.shape[0] res = {old_label: old_label for old_label in self.classes_} if n_topics <= n_reduce_to: - return res + return self.labels_ interesting_topic_vectors = np.stack( [ vec From 4f881643ae2122f119230dbae8f2d53a3f235159 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Mon, 26 Feb 2024 14:27:59 +0100 Subject: [PATCH 6/6] Version bump --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index f847e16..3e36b63 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ line-length=79 [tool.poetry] name = "turftopic" -version = "0.2.8" +version = "0.2.9" description = "Topic modeling with contextual representations from sentence transformers." authors = ["Márton Kardos "] license = "MIT"
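
Note for reviewers: below is a minimal usage sketch of the options this series adds (`n_reduce_to`, `reduction_method`, and the `feature_importance` values documented in PATCH 3/6). The corpus, the minimum cluster size, and the choice of "smallest" are illustrative assumptions, not part of the patches.

```python
from sklearn.cluster import HDBSCAN

from turftopic import ClusteringTopicModel

# Placeholder documents; substitute a real corpus.
corpus = ["first document ...", "second document ...", "third document ..."]

model = ClusteringTopicModel(
    clustering=HDBSCAN(min_cluster_size=15),  # BERTopic/Top2Vec-style clustering
    feature_importance="soft-c-tf-idf",       # or "c-tf-idf" / "centroid"
    n_reduce_to=10,                           # merge discovered clusters down to 10 topics
    reduction_method="smallest",              # Top2Vec-style merging; "agglomerative" is the default
)
labels = model.fit_predict(corpus)            # per-document topic labels after reduction
```

With reduction_method="smallest", the smallest topic is repeatedly merged into its nearest neighbour (by cosine distance between topic vectors, with the merged vector weighted by topic sizes) until n_reduce_to topics remain; "agglomerative" instead runs average-linkage agglomerative clustering with cosine distance on the topic vectors. The outlier label -1 is excluded from both merging strategies.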