diff --git a/pyproject.toml b/pyproject.toml
index f847e16..3e36b63 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -3,7 +3,7 @@ line-length=79
 
 [tool.poetry]
 name = "turftopic"
-version = "0.2.8"
+version = "0.2.9"
 description = "Topic modeling with contextual representations from sentence transformers."
 authors = ["Márton Kardos "]
 license = "MIT"
diff --git a/turftopic/centroid_distance.py b/turftopic/centroid_distance.py
deleted file mode 100644
index c17d6ef..0000000
--- a/turftopic/centroid_distance.py
+++ /dev/null
@@ -1,22 +0,0 @@
-import numpy as np
-from sklearn.metrics import pairwise_distances
-from sklearn.preprocessing import normalize
-
-
-def cluster_centroid_distance(
-    cluster_labels, embeddings, vocab_embeddings, metric="euclidean"
-):
-    centroids = []
-    unique_labels = np.unique(cluster_labels)
-    unique_labels = np.sort(unique_labels)
-    for label in unique_labels:
-        centroid = np.mean(embeddings[cluster_labels == label], axis=0)
-        centroids.append(centroid)
-    centroids = np.stack(centroids)
-    distances = pairwise_distances(centroids, vocab_embeddings, metric=metric)
-    similarities = -distances / np.max(distances)
-    # Z-score transformation
-    similarities = (similarities - np.mean(similarities)) / np.std(
-        similarities
-    )
-    return similarities
diff --git a/turftopic/feature_importance.py b/turftopic/feature_importance.py
new file mode 100644
index 0000000..6950270
--- /dev/null
+++ b/turftopic/feature_importance.py
@@ -0,0 +1,50 @@
+import numpy as np
+import scipy.sparse as spr
+from sklearn.metrics import pairwise_distances
+from sklearn.preprocessing import normalize
+
+
+def cluster_centroid_distance(
+    cluster_centroids: np.ndarray,
+    vocab_embeddings: np.ndarray,
+    metric="cosine",
+) -> np.ndarray:
+    distances = pairwise_distances(
+        cluster_centroids, vocab_embeddings, metric=metric
+    )
+    similarities = -distances / np.max(distances)
+    # Z-score transformation
+    similarities = (similarities - np.mean(similarities)) / np.std(
+        similarities
+    )
+    return similarities
+
+
+def soft_ctf_idf(
+    doc_topic_matrix: np.ndarray, doc_term_matrix: spr.csr_matrix
+) -> np.ndarray:
+    eps = np.finfo(float).eps
+    term_importance = doc_topic_matrix.T @ doc_term_matrix
+    overall_in_topic = np.abs(term_importance).sum(axis=1)
+    n_docs = len(doc_topic_matrix)
+    tf = (term_importance.T / (overall_in_topic + eps)).T
+    idf = np.log(n_docs / (np.abs(term_importance).sum(axis=0) + eps))
+    ctf_idf = tf * idf
+    return ctf_idf
+
+
+def ctf_idf(
+    doc_topic_matrix: np.ndarray, doc_term_matrix: spr.csr_matrix
+) -> np.ndarray:
+    labels = np.argmax(doc_topic_matrix, axis=1)
+    n_topics = doc_topic_matrix.shape[1]
+    components = []
+    overall_freq = np.ravel(np.asarray(doc_term_matrix.sum(axis=0)))
+    average = overall_freq.sum() / n_topics
+    for i_topic in range(n_topics):
+        freq = np.ravel(
+            np.asarray(doc_term_matrix[labels == i_topic].sum(axis=0))
+        )
+        component = freq * np.log(1 + average / overall_freq)
+        components.append(component)
+    return np.stack(components)
diff --git a/turftopic/models/cluster.py b/turftopic/models/cluster.py
index b363428..23f6727 100644
--- a/turftopic/models/cluster.py
+++ b/turftopic/models/cluster.py
@@ -4,14 +4,18 @@
 from rich.console import Console
 from sentence_transformers import SentenceTransformer
 from sklearn.base import ClusterMixin, TransformerMixin
-from sklearn.cluster import OPTICS
+from sklearn.cluster import OPTICS, AgglomerativeClustering
 from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.manifold import TSNE
+from sklearn.metrics.pairwise import cosine_distances
 from sklearn.preprocessing import label_binarize
 
 from turftopic.base import ContextualModel, Encoder
-from turftopic.centroid_distance import cluster_centroid_distance
-from turftopic.soft_ctf_idf import soft_ctf_idf
+from turftopic.feature_importance import (
+    cluster_centroid_distance,
+    ctf_idf,
+    soft_ctf_idf,
+)
 from turftopic.vectorizer import default_vectorizer
 
 integer_message = """
@@ -22,16 +26,64 @@
 where the number of clusters is predefined. For instance:
 
 ClusteringTopicModel(clustering=KMeans(10))
+
+Alternatively you can reduce the number of topics in the model by specifying
+the desired reduced number on initialization.
+
+ClusteringTopicModel(n_reduce_to=10)
 """
 
 
+def smallest_hierarchical_join(
+    topic_vectors: np.ndarray,
+    topic_sizes: np.ndarray,
+    classes_: np.ndarray,
+    n_to: int,
+) -> list[tuple]:
+    """Iteratively joins smallest topics."""
+    merge_inst = []
+    topic_vectors = np.copy(topic_vectors)
+    topic_sizes = np.copy(topic_sizes)
+    classes = list(classes_)
+    while len(classes) > n_to:
+        smallest = np.argmin(topic_sizes)
+        dist = cosine_distances(
+            np.atleast_2d(topic_vectors[smallest]), topic_vectors
+        )
+        closest = np.argsort(dist[0])[1]
+        merge_inst.append((classes[smallest], classes[closest]))
+        classes.pop(smallest)
+        topic_vectors[closest] = (
+            (topic_vectors[smallest] * topic_sizes[smallest])
+            + (topic_vectors[closest] * topic_sizes[closest])
+        ) / (topic_sizes[smallest] + topic_sizes[closest])
+        topic_vectors = np.delete(topic_vectors, smallest, axis=0)
+        topic_sizes[closest] = topic_sizes[closest] + topic_sizes[smallest]
+        topic_sizes = np.delete(topic_sizes, smallest, axis=0)
+    return merge_inst
+
+
+def calculate_topic_vectors(
+    cluster_labels: np.ndarray, embeddings: np.ndarray
+) -> np.ndarray:
+    """Calculates topic centroids."""
+    centroids = []
+    unique_labels = np.unique(cluster_labels)
+    unique_labels = np.sort(unique_labels)
+    for label in unique_labels:
+        centroid = np.mean(embeddings[cluster_labels == label], axis=0)
+        centroids.append(centroid)
+    centroids = np.stack(centroids)
+    return centroids
+
+
 class ClusteringTopicModel(ContextualModel, ClusterMixin):
     """Topic models, which assume topics to be clusters of documents in
     semantic space.
     Models also include a dimensionality reduction step to aid clustering.
 
     ```python
-    from turftopic import KeyNMF
+    from turftopic import ClusteringTopicModel
     from sklearn.cluster import HDBSCAN
     import umap
 
@@ -61,11 +113,18 @@ class ClusteringTopicModel(ContextualModel, ClusterMixin):
         Clustering method to use for finding topics.
         Defaults to OPTICS with 25 minimum cluster size.
         To imitate the behavior of BERTopic or Top2Vec you should use HDBSCAN.
-    feature_importance: 'ctfidf' or 'centroid', default 'ctfidf'
+    feature_importance: 'soft-c-tf-idf', 'c-tf-idf' or 'centroid', default 'soft-c-tf-idf'
         Method for estimating term importances.
         'centroid' uses distances from cluster centroid similarly
        to Top2Vec.
-        'ctfidf' uses BERTopic's c-tf-idf.
+        'c-tf-idf' uses BERTopic's c-tf-idf.
+        'soft-c-tf-idf' uses Soft c-TF-IDF, as in GMM; the results should
+        be very similar to 'c-tf-idf'.
+    n_reduce_to: int, default None
+        The number of topics to reduce the model to.
+        The specified reduction method will be used to merge them.
+        By default, topics are not merged.
+    reduction_method: 'agglomerative' or 'smallest', default 'agglomerative'
     """
 
     def __init__(
@@ -77,6 +136,10 @@
         dimensionality_reduction: Optional[TransformerMixin] = None,
         clustering: Optional[ClusterMixin] = None,
         feature_importance: Literal["ctfidf", "centroid"] = "ctfidf",
+        n_reduce_to: Optional[int] = None,
+        reduction_method: Literal[
+            "agglomerative", "smallest"
+        ] = "agglomerative",
     ):
         self.encoder = encoder
         if isinstance(encoder, int):
@@ -100,6 +163,69 @@
         else:
             self.dimensionality_reduction = dimensionality_reduction
         self.feature_importance = feature_importance
+        self.n_reduce_to = n_reduce_to
+        self.reduction_method = reduction_method
+
+    def _merge_agglomerative(self, n_reduce_to: int) -> np.ndarray:
+        n_topics = self.components_.shape[0]
+        res = {old_label: old_label for old_label in self.classes_}
+        if n_topics <= n_reduce_to:
+            return self.labels_
+        interesting_topic_vectors = np.stack(
+            [
+                vec
+                for label, vec in zip(self.classes_, self.topic_vectors_)
+                if label != -1
+            ]
+        )
+        old_labels = [label for label in self.classes_ if label != -1]
+        new_labels = AgglomerativeClustering(
+            n_clusters=n_reduce_to, metric="cosine", linkage="average"
+        ).fit_predict(interesting_topic_vectors)
+        res = {}
+        if -1 in self.classes_:
+            res[-1] = -1
+        for i_old, i_new in zip(old_labels, new_labels):
+            res[i_old] = i_new
+        return np.array([res[label] for label in self.labels_])
+
+    def _merge_smallest(self, n_reduce_to: int):
+        merge_inst = smallest_hierarchical_join(
+            self.topic_vectors_[self.classes_ != -1],
+            self.topic_sizes_[self.classes_ != -1],
+            self.classes_[self.classes_ != -1],
+            n_reduce_to,
+        )
+        labels = np.copy(self.labels_)
+        for from_topic, to_topic in merge_inst:
+            labels[labels == from_topic] = to_topic
+        return labels
+
+    def _estimate_parameters(
+        self,
+        embeddings: np.ndarray,
+        doc_term_matrix: np.ndarray,
+    ):
+        clusters = np.unique(self.labels_)
+        self.classes_ = np.sort(clusters)
+        self.topic_sizes_ = np.array(
+            [np.sum(self.labels_ == label) for label in self.classes_]
+        )
+        self.topic_vectors_ = calculate_topic_vectors(self.labels_, embeddings)
+        self.vocab_embeddings = self.encoder_.encode(self.vectorizer.get_feature_names_out())  # type: ignore
+        document_topic_matrix = label_binarize(
+            self.labels_, classes=self.classes_
+        )
+        if self.feature_importance == "soft-c-tf-idf":
+            self.components_ = soft_ctf_idf(document_topic_matrix, doc_term_matrix)  # type: ignore
+        elif self.feature_importance == "centroid":
+            self.components_ = cluster_centroid_distance(
+                self.topic_vectors_,
+                self.vocab_embeddings,
+                metric="cosine",
+            )
+        else:
+            self.components_ = ctf_idf(document_topic_matrix, doc_term_matrix)
 
     def fit_predict(
         self, raw_documents, y=None, embeddings: Optional[np.ndarray] = None
@@ -127,37 +253,42 @@
             embeddings = self.encoder_.encode(raw_documents)
             console.log("Encoding done.")
             status.update("Extracting terms")
-            doc_term_matrix = self.vectorizer.fit_transform(raw_documents)
+            self.doc_term_matrix = self.vectorizer.fit_transform(raw_documents)
             console.log("Term extraction done.")
-            vocab = self.vectorizer.get_feature_names_out()
             status.update("Reducing Dimensionality")
             reduced_embeddings = self.dimensionality_reduction.fit_transform(
                 embeddings
             )
             console.log("Dimensionality reduction done.")
             status.update("Clustering documents")
-            cluster_labels = self.clustering.fit_predict(reduced_embeddings)
-            clusters = np.unique(cluster_labels)
+            self.labels_ = self.clustering.fit_predict(reduced_embeddings)
             console.log("Clustering done.")
-            self.classes_ = np.sort(clusters)
-            status.update("Estimating term importances")
-            if self.feature_importance == "ctfidf":
-                document_topic_matrix = label_binarize(
-                    cluster_labels, classes=self.classes_
+            status.update("Estimating parameters.")
+            self._estimate_parameters(
+                embeddings,
+                self.doc_term_matrix,
+            )
+            console.log("Parameter estimation done.")
+            if self.n_reduce_to is not None:
+                n_topics = self.classes_.shape[0]
+                status.update(
+                    f"Reducing topics from {n_topics} to {self.n_reduce_to}"
+                )
+                if self.reduction_method == "agglomerative":
+                    self.labels_ = self._merge_agglomerative(self.n_reduce_to)
+                else:
+                    self.labels_ = self._merge_smallest(self.n_reduce_to)
+                console.log(
+                    f"Topic reduction done from {n_topics} to {self.n_reduce_to}."
                 )
-                self.components_ = soft_ctf_idf(document_topic_matrix, doc_term_matrix)  # type: ignore
-            else:
-                status.update("Encoding vocabulary")
-                vocab_embeddings = self.encoder_.encode(vocab)  # type: ignore
-                self.components_ = cluster_centroid_distance(
-                    cluster_labels,
+                status.update("Reestimating parameters.")
+                self._estimate_parameters(
                     embeddings,
-                    vocab_embeddings,
-                    metric="euclidean",
+                    self.doc_term_matrix,
                 )
-            self.labels_ = cluster_labels
-        console.log("Model fitting done.")
-        return cluster_labels
+                console.log("Reestimation done.")
+            console.log("Model fitting done.")
+        return self.labels_
 
     def fit_transform(
         self, raw_documents, y=None, embeddings: Optional[np.ndarray] = None
diff --git a/turftopic/models/gmm.py b/turftopic/models/gmm.py
index 4ed5820..fe1380b 100644
--- a/turftopic/models/gmm.py
+++ b/turftopic/models/gmm.py
@@ -9,7 +9,7 @@
 
 from turftopic.base import ContextualModel, Encoder
 from turftopic.dynamic import DynamicTopicModel, bin_timestamps
-from turftopic.soft_ctf_idf import soft_ctf_idf
+from turftopic.feature_importance import soft_ctf_idf
 from turftopic.vectorizer import default_vectorizer
 
 
diff --git a/turftopic/soft_ctf_idf.py b/turftopic/soft_ctf_idf.py
deleted file mode 100644
index bb54639..0000000
--- a/turftopic/soft_ctf_idf.py
+++ /dev/null
@@ -1,15 +0,0 @@
-import numpy as np
-import scipy.sparse as spr
-
-
-def soft_ctf_idf(
-    doc_topic_matrix: np.ndarray, doc_term_matrix: spr.csr_matrix
-) -> np.ndarray:
-    eps = np.finfo(float).eps
-    term_importance = doc_topic_matrix.T @ doc_term_matrix
-    overall_in_topic = np.abs(term_importance).sum(axis=1)
-    n_docs = len(doc_topic_matrix)
-    tf = (term_importance.T / (overall_in_topic + eps)).T
-    idf = np.log(n_docs / (np.abs(term_importance).sum(axis=0) + eps))
-    ctf_idf = tf * idf
-    return ctf_idf
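For reference, a minimal usage sketch of the topic-reduction options introduced in this patch, assuming the usual turftopic fitting API (`fit`, `print_topics`); the 20 Newsgroups corpus and the chosen numbers are purely illustrative:

```python
from sklearn.datasets import fetch_20newsgroups

from turftopic import ClusteringTopicModel

# Any list of strings works; 20 Newsgroups is just a convenient demo corpus.
corpus = fetch_20newsgroups(
    subset="all", remove=("headers", "footers", "quotes")
).data

# Cluster the documents first, then merge the resulting clusters down to
# 10 topics using the default agglomerative reduction method.
model = ClusteringTopicModel(
    n_reduce_to=10,
    reduction_method="agglomerative",
    feature_importance="soft-c-tf-idf",
)
model.fit(corpus)
model.print_topics()
```

Passing `reduction_method="smallest"` instead repeatedly folds the smallest topic into its closest neighbour (by cosine distance between topic centroids); in both cases term importances are re-estimated after the merge.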
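Also for reference, a small sketch of the two c-TF-IDF variants now collected in `turftopic.feature_importance`; the toy matrices below are made up. Both functions take a document-topic matrix and a sparse document-term matrix and return one row of term importances per topic (what the model stores in `components_`):

```python
import numpy as np
import scipy.sparse as spr

from turftopic.feature_importance import ctf_idf, soft_ctf_idf

# Toy corpus: 4 documents, 3 vocabulary terms, 2 topics.
doc_term_matrix = spr.csr_matrix(
    [
        [2, 0, 1],
        [1, 0, 0],
        [0, 3, 1],
        [0, 1, 2],
    ]
)
# Hard cluster assignments encoded as a one-hot document-topic matrix;
# soft_ctf_idf also accepts soft (probabilistic) assignments, e.g. from GMM.
doc_topic_matrix = np.array(
    [
        [1, 0],
        [1, 0],
        [0, 1],
        [0, 1],
    ]
)

# Both calls return an (n_topics, n_terms) array of term importances.
print(soft_ctf_idf(doc_topic_matrix, doc_term_matrix))
print(ctf_idf(doc_topic_matrix, doc_term_matrix))
```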
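Finally, a sketch of the 'smallest' strategy in isolation: `smallest_hierarchical_join` only computes merge instructions `(from_topic, to_topic)`, which `_merge_smallest` then applies to the cluster labels. The toy centroids and sizes below are made up:

```python
import numpy as np

from turftopic.models.cluster import smallest_hierarchical_join

# Three toy topic centroids; topic 1 is tiny and very close to topic 0.
topic_vectors = np.array(
    [
        [1.0, 0.0],
        [0.9, 0.1],
        [0.0, 1.0],
    ]
)
topic_sizes = np.array([100, 5, 80])
classes = np.array([0, 1, 2])

# Reduce to two topics: the single merge instruction folds topic 1
# into its closest neighbour by cosine distance, topic 0.
print(smallest_hierarchical_join(topic_vectors, topic_sizes, classes, n_to=2))
```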