Merge pull request #8 from x-tabdeveloping/merging
Implemented topic merging and original C-TF-IDF in clustering models.
x-tabdeveloping authored Mar 1, 2024
2 parents 2eddf69 + 4f88164 commit c9a19f1
Showing 6 changed files with 209 additions and 65 deletions.
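Before the diffs, a minimal sketch of how the features introduced in this commit would be used, going by the parameter names and docstrings in the diff below (the corpus is a placeholder, and the usual turftopic `fit()`/`print_topics()` interface is assumed):

```python
from sklearn.datasets import fetch_20newsgroups

from turftopic import ClusteringTopicModel

corpus = fetch_20newsgroups(subset="all").data[:500]  # placeholder corpus

# 'c-tf-idf' selects BERTopic's original weighting; after clustering,
# topics are merged down to 10 by repeatedly joining the smallest one.
model = ClusteringTopicModel(
    feature_importance="c-tf-idf",
    n_reduce_to=10,
    reduction_method="smallest",
)
model.fit(corpus)
model.print_topics()
```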
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -3,7 +3,7 @@ line-length=79

[tool.poetry]
name = "turftopic"
version = "0.2.8"
version = "0.2.9"
description = "Topic modeling with contextual representations from sentence transformers."
authors = ["Márton Kardos <[email protected]>"]
license = "MIT"
22 changes: 0 additions & 22 deletions turftopic/centroid_distance.py

This file was deleted.

50 changes: 50 additions & 0 deletions turftopic/feature_importance.py
@@ -0,0 +1,50 @@
import numpy as np
import scipy.sparse as spr
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import normalize


def cluster_centroid_distance(
cluster_centroids: np.ndarray,
vocab_embeddings: np.ndarray,
metric="cosine",
) -> np.ndarray:
distances = pairwise_distances(
cluster_centroids, vocab_embeddings, metric=metric
)
similarities = -distances / np.max(distances)
# Z-score transformation
similarities = (similarities - np.mean(similarities)) / np.std(
similarities
)
return similarities


def soft_ctf_idf(
doc_topic_matrix: np.ndarray, doc_term_matrix: spr.csr_matrix
) -> np.ndarray:
eps = np.finfo(float).eps
term_importance = doc_topic_matrix.T @ doc_term_matrix
overall_in_topic = np.abs(term_importance).sum(axis=1)
n_docs = len(doc_topic_matrix)
tf = (term_importance.T / (overall_in_topic + eps)).T
idf = np.log(n_docs / (np.abs(term_importance).sum(axis=0) + eps))
ctf_idf = tf * idf
return ctf_idf


def ctf_idf(
doc_topic_matrix: np.ndarray, doc_term_matrix: spr.csr_matrix
) -> np.ndarray:
labels = np.argmax(doc_topic_matrix, axis=1)
n_topics = doc_topic_matrix.shape[1]
components = []
overall_freq = np.ravel(np.asarray(doc_term_matrix.sum(axis=0)))
average = overall_freq.sum() / n_topics
for i_topic in range(n_topics):
freq = np.ravel(
np.asarray(doc_term_matrix[labels == i_topic].sum(axis=0))
)
component = freq * np.log(1 + average / overall_freq)
components.append(component)
return np.stack(components)
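To make the two estimators above concrete, a small sketch on made-up matrices (all numbers invented for illustration); both functions return one row of term importances per topic:

```python
import numpy as np
import scipy.sparse as spr

from turftopic.feature_importance import ctf_idf, soft_ctf_idf

# 4 documents, 3 terms, 2 topics.
doc_term = spr.csr_matrix(
    np.array(
        [
            [2, 0, 1],
            [1, 0, 0],
            [0, 3, 1],
            [0, 2, 0],
        ]
    )
)
# Soft topic proportions per document; ctf_idf() hardens them via argmax.
doc_topic = np.array(
    [
        [0.9, 0.1],
        [0.8, 0.2],
        [0.1, 0.9],
        [0.2, 0.8],
    ]
)

soft = soft_ctf_idf(doc_topic, doc_term)
hard = ctf_idf(doc_topic, doc_term)
print(soft.shape, hard.shape)  # (2, 3) (2, 3): one row per topic
```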
183 changes: 157 additions & 26 deletions turftopic/models/cluster.py
@@ -4,14 +4,18 @@
from rich.console import Console
from sentence_transformers import SentenceTransformer
from sklearn.base import ClusterMixin, TransformerMixin
-from sklearn.cluster import OPTICS
+from sklearn.cluster import OPTICS, AgglomerativeClustering
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
+from sklearn.metrics.pairwise import cosine_distances
from sklearn.preprocessing import label_binarize

from turftopic.base import ContextualModel, Encoder
-from turftopic.centroid_distance import cluster_centroid_distance
-from turftopic.soft_ctf_idf import soft_ctf_idf
+from turftopic.feature_importance import (
+    cluster_centroid_distance,
+    ctf_idf,
+    soft_ctf_idf,
+)
from turftopic.vectorizer import default_vectorizer

integer_message = """
@@ -22,16 +26,64 @@
where the number of clusters is predefined.
For instance: ClusteringTopicModel(clustering=KMeans(10))
+Alternatively you can reduce the number of topics in the model by specifying
+the desired reduced number on initialization.
+ClusteringTopicModel(n_reduce_to=10)
"""


def smallest_hierarchical_join(
topic_vectors: np.ndarray,
topic_sizes: np.ndarray,
classes_: np.ndarray,
n_to: int,
) -> list[tuple]:
"""Iteratively joins smallest topics."""
merge_inst = []
topic_vectors = np.copy(topic_vectors)
topic_sizes = np.copy(topic_sizes)
classes = list(classes_)
while len(classes) > n_to:
smallest = np.argmin(topic_sizes)
dist = cosine_distances(
np.atleast_2d(topic_vectors[smallest]), topic_vectors
)
closest = np.argsort(dist[0])[1]
merge_inst.append((classes[smallest], classes[closest]))
classes.pop(smallest)
topic_vectors[closest] = (
(topic_vectors[smallest] * topic_sizes[smallest])
+ (topic_vectors[closest] * topic_sizes[closest])
) / (topic_sizes[smallest] + topic_sizes[closest])
topic_vectors = np.delete(topic_vectors, smallest, axis=0)
topic_sizes[closest] = topic_sizes[closest] + topic_sizes[smallest]
topic_sizes = np.delete(topic_sizes, smallest, axis=0)
return merge_inst
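A toy run of this helper (values invented): the smallest topic is repeatedly merged into its nearest remaining neighbour by cosine distance, and each tuple records the (merged-from, merged-into) class labels:

```python
import numpy as np

from turftopic.models.cluster import smallest_hierarchical_join

# Four topics in a toy 2D embedding space; topic 2 is the smallest.
topic_vectors = np.array([[1.0, 0.0], [0.0, 1.0], [0.1, 0.99], [0.7, 0.7]])
topic_sizes = np.array([100, 80, 5, 40])
classes = np.array([0, 1, 2, 3])

instructions = smallest_hierarchical_join(
    topic_vectors, topic_sizes, classes, n_to=2
)
print(instructions)  # [(2, 1), (3, 1)]: 2 merges into 1, then 3 into the grown 1
```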


def calculate_topic_vectors(
cluster_labels: np.ndarray, embeddings: np.ndarray
) -> np.ndarray:
"""Calculates topic centroids."""
centroids = []
unique_labels = np.unique(cluster_labels)
unique_labels = np.sort(unique_labels)
for label in unique_labels:
centroid = np.mean(embeddings[cluster_labels == label], axis=0)
centroids.append(centroid)
centroids = np.stack(centroids)
return centroids


class ClusteringTopicModel(ContextualModel, ClusterMixin):
"""Topic models, which assume topics to be clusters of documents
in semantic space.
Models also include a dimensionality reduction step to aid clustering.
```python
-from turftopic import KeyNMF
+from turftopic import ClusteringTopicModel
from sklearn.cluster import HDBSCAN
import umap
@@ -61,11 +113,18 @@ class ClusteringTopicModel(ContextualModel, ClusterMixin):
Clustering method to use for finding topics.
Defaults to OPTICS with 25 minimum cluster size.
To imitate the behavior of BERTopic or Top2Vec you should use HDBSCAN.
-feature_importance: 'ctfidf' or 'centroid', default 'ctfidf'
+feature_importance: 'soft-c-tf-idf', 'c-tf-idf' or 'centroid', default 'soft-c-tf-idf'
    Method for estimating term importances.
    'centroid' uses distances from cluster centroid similarly
    to Top2Vec.
-    'ctfidf' uses BERTopic's c-tf-idf.
+    'c-tf-idf' uses BERTopic's c-tf-idf.
+    'soft-c-tf-idf' uses Soft c-TF-IDF from GMM; the results should
+    be very similar to 'c-tf-idf'.
+n_reduce_to: int, default None
+    Number of topics to reduce to.
+    The specified reduction method will be used to merge them.
+    By default, topics are not merged.
+reduction_method: 'agglomerative', 'smallest'
+    Method used for merging topics when n_reduce_to is given.
"""

def __init__(
@@ -77,6 +136,10 @@
dimensionality_reduction: Optional[TransformerMixin] = None,
clustering: Optional[ClusterMixin] = None,
feature_importance: Literal["ctfidf", "centroid"] = "ctfidf",
+        n_reduce_to: Optional[int] = None,
+        reduction_method: Literal[
+            "agglomerative", "smallest"
+        ] = "agglomerative",
):
self.encoder = encoder
if isinstance(encoder, int):
@@ -100,6 +163,69 @@
else:
self.dimensionality_reduction = dimensionality_reduction
self.feature_importance = feature_importance
self.n_reduce_to = n_reduce_to
self.reduction_method = reduction_method

def _merge_agglomerative(self, n_reduce_to: int) -> np.ndarray:
n_topics = self.components_.shape[0]
res = {old_label: old_label for old_label in self.classes_}
if n_topics <= n_reduce_to:
return self.labels_
interesting_topic_vectors = np.stack(
[
vec
for label, vec in zip(self.classes_, self.topic_vectors_)
if label != -1
]
)
old_labels = [label for label in self.classes_ if label != -1]
new_labels = AgglomerativeClustering(
n_clusters=n_reduce_to, metric="cosine", linkage="average"
).fit_predict(interesting_topic_vectors)
res = {}
if -1 in self.classes_:
res[-1] = -1
for i_old, i_new in zip(old_labels, new_labels):
res[i_old] = i_new
return np.array([res[label] for label in self.labels_])
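The relabeling step above, seen in isolation (toy vectors; sklearn >= 1.2 is assumed, where `AgglomerativeClustering` takes `metric` rather than the older `affinity` argument):

```python
import numpy as np
from sklearn.cluster import AgglomerativeClustering

# Two pairs of near-identical topic vectors should collapse into two topics.
topic_vectors = np.array([[1.0, 0.0], [0.99, 0.1], [0.0, 1.0], [0.1, 0.99]])
old_labels = np.array([0, 1, 2, 3])

new_labels = AgglomerativeClustering(
    n_clusters=2, metric="cosine", linkage="average"
).fit_predict(topic_vectors)
mapping = {int(o): int(n) for o, n in zip(old_labels, new_labels)}
print(mapping)  # e.g. {0: 0, 1: 0, 2: 1, 3: 1}: similar topics share a label
```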

def _merge_smallest(self, n_reduce_to: int):
merge_inst = smallest_hierarchical_join(
self.topic_vectors_[self.classes_ != -1],
self.topic_sizes_[self.classes_ != -1],
self.classes_[self.classes_ != -1],
n_reduce_to,
)
labels = np.copy(self.labels_)
for from_topic, to_topic in merge_inst:
labels[labels == from_topic] = to_topic
return labels

def _estimate_parameters(
self,
embeddings: np.ndarray,
doc_term_matrix: np.ndarray,
):
clusters = np.unique(self.labels_)
self.classes_ = np.sort(clusters)
self.topic_sizes_ = np.array(
[np.sum(self.labels_ == label) for label in self.classes_]
)
self.topic_vectors_ = calculate_topic_vectors(self.labels_, embeddings)
self.vocab_embeddings = self.encoder_.encode(self.vectorizer.get_feature_names_out()) # type: ignore
document_topic_matrix = label_binarize(
self.labels_, classes=self.classes_
)
if self.feature_importance == "soft-c-tf-idf":
self.components_ = soft_ctf_idf(document_topic_matrix, doc_term_matrix) # type: ignore
elif self.feature_importance == "centroid":
self.components_ = cluster_centroid_distance(
self.topic_vectors_,
self.vocab_embeddings,
metric="cosine",
)
else:
self.components_ = ctf_idf(document_topic_matrix, doc_term_matrix)
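Worth noting in `_estimate_parameters`: hard cluster labels are one-hot encoded with `label_binarize`, so the soft and hard c-tf-idf estimators receive the same kind of document-topic matrix. A tiny sketch with invented labels:

```python
import numpy as np
from sklearn.preprocessing import label_binarize

labels = np.array([-1, 0, 1, 1, 0])    # -1 marks outlier documents
classes = np.sort(np.unique(labels))   # [-1, 0, 1]
doc_topic = label_binarize(labels, classes=classes)
print(doc_topic)
# [[1 0 0]
#  [0 1 0]
#  [0 0 1]
#  [0 0 1]
#  [0 1 0]]
```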

def fit_predict(
self, raw_documents, y=None, embeddings: Optional[np.ndarray] = None
@@ -127,37 +253,42 @@ def fit_predict(
embeddings = self.encoder_.encode(raw_documents)
console.log("Encoding done.")
status.update("Extracting terms")
-            doc_term_matrix = self.vectorizer.fit_transform(raw_documents)
+            self.doc_term_matrix = self.vectorizer.fit_transform(raw_documents)
             console.log("Term extraction done.")
             vocab = self.vectorizer.get_feature_names_out()
             status.update("Reducing Dimensionality")
             reduced_embeddings = self.dimensionality_reduction.fit_transform(
                 embeddings
             )
             console.log("Dimensionality reduction done.")
             status.update("Clustering documents")
-            cluster_labels = self.clustering.fit_predict(reduced_embeddings)
-            clusters = np.unique(cluster_labels)
+            self.labels_ = self.clustering.fit_predict(reduced_embeddings)
             console.log("Clustering done.")
-            self.classes_ = np.sort(clusters)
-            status.update("Estimating term importances")
-            if self.feature_importance == "ctfidf":
-                document_topic_matrix = label_binarize(
-                    cluster_labels, classes=self.classes_
-                )
-                self.components_ = soft_ctf_idf(document_topic_matrix, doc_term_matrix)  # type: ignore
-            else:
-                status.update("Encoding vocabulary")
-                vocab_embeddings = self.encoder_.encode(vocab)  # type: ignore
-                self.components_ = cluster_centroid_distance(
-                    cluster_labels,
-                    vocab_embeddings,
-                    metric="euclidean",
-                )
-            self.labels_ = cluster_labels
-            console.log("Model fitting done.")
-        return cluster_labels
+            status.update("Estimating parameters.")
+            self._estimate_parameters(
+                embeddings,
+                self.doc_term_matrix,
+            )
+            console.log("Parameter estimation done.")
+            if self.n_reduce_to is not None:
+                n_topics = self.classes_.shape[0]
+                status.update(
+                    f"Reducing topics from {n_topics} to {self.n_reduce_to}"
+                )
+                if self.reduction_method == "agglomerative":
+                    self.labels_ = self._merge_agglomerative(self.n_reduce_to)
+                else:
+                    self.labels_ = self._merge_smallest(self.n_reduce_to)
+                console.log(
+                    f"Topic reduction done from {n_topics} to {self.n_reduce_to}."
+                )
+                status.update("Reestimating parameters.")
+                self._estimate_parameters(
+                    embeddings,
+                    self.doc_term_matrix,
+                )
+                console.log("Reestimation done.")
+            console.log("Model fitting done.")
+        return self.labels_

def fit_transform(
self, raw_documents, y=None, embeddings: Optional[np.ndarray] = None
2 changes: 1 addition & 1 deletion turftopic/models/gmm.py
@@ -9,7 +9,7 @@

from turftopic.base import ContextualModel, Encoder
from turftopic.dynamic import DynamicTopicModel, bin_timestamps
-from turftopic.soft_ctf_idf import soft_ctf_idf
+from turftopic.feature_importance import soft_ctf_idf
from turftopic.vectorizer import default_vectorizer


15 changes: 0 additions & 15 deletions turftopic/soft_ctf_idf.py

This file was deleted.
