Merge pull request #8 from x-tabdeveloping/merging
Implemented topic merging and original C-TF-IDF in clustering models.
x-tabdeveloping authored Mar 1, 2024
2 parents 2eddf69 + 4f88164 commit c9a19f1
Showing 6 changed files with 209 additions and 65 deletions.
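Before the diffs, a minimal sketch of how the features introduced in this commit would be used, going by the parameter names and docstrings in the diff below (the corpus is a placeholder, and the usual turftopic `fit()`/`print_topics()` interface is assumed):

```python
from sklearn.datasets import fetch_20newsgroups

from turftopic import ClusteringTopicModel

corpus = fetch_20newsgroups(subset="all").data[:500]  # placeholder corpus

# 'c-tf-idf' selects BERTopic's original weighting; after clustering,
# topics are merged down to 10 by repeatedly joining the smallest one.
model = ClusteringTopicModel(
    feature_importance="c-tf-idf",
    n_reduce_to=10,
    reduction_method="smallest",
)
model.fit(corpus)
model.print_topics()
```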
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -3,7 +3,7 @@ line-length=79

[tool.poetry]
name = "turftopic"
version = "0.2.8"
version = "0.2.9"
description = "Topic modeling with contextual representations from sentence transformers."
authors = ["Márton Kardos <[email protected]>"]
license = "MIT"
22 changes: 0 additions & 22 deletions turftopic/centroid_distance.py

This file was deleted.

50 changes: 50 additions & 0 deletions turftopic/feature_importance.py
@@ -0,0 +1,50 @@
import numpy as np
import scipy.sparse as spr
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import normalize


def cluster_centroid_distance(
cluster_centroids: np.ndarray,
vocab_embeddings: np.ndarray,
metric="cosine",
) -> np.ndarray:
distances = pairwise_distances(
cluster_centroids, vocab_embeddings, metric=metric
)
similarities = -distances / np.max(distances)
# Z-score transformation
similarities = (similarities - np.mean(similarities)) / np.std(
similarities
)
return similarities


def soft_ctf_idf(
doc_topic_matrix: np.ndarray, doc_term_matrix: spr.csr_matrix
) -> np.ndarray:
eps = np.finfo(float).eps
term_importance = doc_topic_matrix.T @ doc_term_matrix
overall_in_topic = np.abs(term_importance).sum(axis=1)
n_docs = len(doc_topic_matrix)
tf = (term_importance.T / (overall_in_topic + eps)).T
idf = np.log(n_docs / (np.abs(term_importance).sum(axis=0) + eps))
ctf_idf = tf * idf
return ctf_idf


def ctf_idf(
doc_topic_matrix: np.ndarray, doc_term_matrix: spr.csr_matrix
) -> np.ndarray:
labels = np.argmax(doc_topic_matrix, axis=1)
n_topics = doc_topic_matrix.shape[1]
components = []
overall_freq = np.ravel(np.asarray(doc_term_matrix.sum(axis=0)))
average = overall_freq.sum() / n_topics
for i_topic in range(n_topics):
freq = np.ravel(
np.asarray(doc_term_matrix[labels == i_topic].sum(axis=0))
)
component = freq * np.log(1 + average / overall_freq)
components.append(component)
return np.stack(components)
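To make the two estimators above concrete, a small sketch on made-up matrices (all numbers invented for illustration); both functions return one row of term importances per topic:

```python
import numpy as np
import scipy.sparse as spr

from turftopic.feature_importance import ctf_idf, soft_ctf_idf

# 4 documents, 3 terms, 2 topics.
doc_term = spr.csr_matrix(
    np.array(
        [
            [2, 0, 1],
            [1, 0, 0],
            [0, 3, 1],
            [0, 2, 0],
        ]
    )
)
# Soft topic proportions per document; ctf_idf() hardens them via argmax.
doc_topic = np.array(
    [
        [0.9, 0.1],
        [0.8, 0.2],
        [0.1, 0.9],
        [0.2, 0.8],
    ]
)

soft = soft_ctf_idf(doc_topic, doc_term)
hard = ctf_idf(doc_topic, doc_term)
print(soft.shape, hard.shape)  # (2, 3) (2, 3): one row per topic
```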
183 changes: 157 additions & 26 deletions turftopic/models/cluster.py
@@ -4,14 +4,18 @@
from rich.console import Console
from sentence_transformers import SentenceTransformer
from sklearn.base import ClusterMixin, TransformerMixin
-from sklearn.cluster import OPTICS
+from sklearn.cluster import OPTICS, AgglomerativeClustering
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
+from sklearn.metrics.pairwise import cosine_distances
from sklearn.preprocessing import label_binarize

from turftopic.base import ContextualModel, Encoder
-from turftopic.centroid_distance import cluster_centroid_distance
-from turftopic.soft_ctf_idf import soft_ctf_idf
+from turftopic.feature_importance import (
+    cluster_centroid_distance,
+    ctf_idf,
+    soft_ctf_idf,
+)
from turftopic.vectorizer import default_vectorizer

integer_message = """
@@ -22,16 +26,64 @@
where the number of clusters is predefined.
For instance: ClusteringTopicModel(clustering=KMeans(10))
+Alternatively you can reduce the number of topics in the model by specifying
+the desired reduced number on initialization.
+ClusteringTopicModel(n_reduce_to=10)
"""


def smallest_hierarchical_join(
topic_vectors: np.ndarray,
topic_sizes: np.ndarray,
classes_: np.ndarray,
n_to: int,
) -> list[tuple]:
"""Iteratively joins smallest topics."""
merge_inst = []
topic_vectors = np.copy(topic_vectors)
topic_sizes = np.copy(topic_sizes)
classes = list(classes_)
while len(classes) > n_to:
smallest = np.argmin(topic_sizes)
dist = cosine_distances(
np.atleast_2d(topic_vectors[smallest]), topic_vectors
)
closest = np.argsort(dist[0])[1]
merge_inst.append((classes[smallest], classes[closest]))
classes.pop(smallest)
topic_vectors[closest] = (
(topic_vectors[smallest] * topic_sizes[smallest])
+ (topic_vectors[closest] * topic_sizes[closest])
) / (topic_sizes[smallest] + topic_sizes[closest])
topic_vectors = np.delete(topic_vectors, smallest, axis=0)
topic_sizes[closest] = topic_sizes[closest] + topic_sizes[smallest]
topic_sizes = np.delete(topic_sizes, smallest, axis=0)
return merge_inst
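A toy run of this helper (values invented): the smallest topic is repeatedly merged into its nearest remaining neighbour by cosine distance, and each tuple records the (merged-from, merged-into) class labels:

```python
import numpy as np

from turftopic.models.cluster import smallest_hierarchical_join

# Four topics in a toy 2D embedding space; topic 2 is the smallest.
topic_vectors = np.array([[1.0, 0.0], [0.0, 1.0], [0.1, 0.99], [0.7, 0.7]])
topic_sizes = np.array([100, 80, 5, 40])
classes = np.array([0, 1, 2, 3])

instructions = smallest_hierarchical_join(
    topic_vectors, topic_sizes, classes, n_to=2
)
print(instructions)  # [(2, 1), (3, 1)]: 2 merges into 1, then 3 into the grown 1
```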


def calculate_topic_vectors(
cluster_labels: np.ndarray, embeddings: np.ndarray
) -> np.ndarray:
"""Calculates topic centroids."""
centroids = []
unique_labels = np.unique(cluster_labels)
unique_labels = np.sort(unique_labels)
for label in unique_labels:
centroid = np.mean(embeddings[cluster_labels == label], axis=0)
centroids.append(centroid)
centroids = np.stack(centroids)
return centroids


class ClusteringTopicModel(ContextualModel, ClusterMixin):
"""Topic models, which assume topics to be clusters of documents
in semantic space.
Models also include a dimensionality reduction step to aid clustering.
```python
-from turftopic import KeyNMF
+from turftopic import ClusteringTopicModel
from sklearn.cluster import HDBSCAN
import umap
@@ -61,11 +113,18 @@ class ClusteringTopicModel(ContextualModel, ClusterMixin):
Clustering method to use for finding topics.
Defaults to OPTICS with 25 minimum cluster size.
To imitate the behavior of BERTopic or Top2Vec you should use HDBSCAN.
-feature_importance: 'ctfidf' or 'centroid', default 'ctfidf'
+feature_importance: 'soft-c-tf-idf', 'c-tf-idf' or 'centroid', default 'soft-c-tf-idf'
    Method for estimating term importances.
    'centroid' uses distances from cluster centroid similarly
    to Top2Vec.
-    'ctfidf' uses BERTopic's c-tf-idf.
+    'c-tf-idf' uses BERTopic's c-tf-idf.
+    'soft-c-tf-idf' uses Soft c-TF-IDF from GMM; the results should
+    be very similar to 'c-tf-idf'.
+n_reduce_to: int, default None
+    Number of topics to reduce to.
+    The specified reduction method will be used to merge them.
+    By default, topics are not merged.
+reduction_method: 'agglomerative', 'smallest'
+    Method used for merging topics when n_reduce_to is given.
"""

def __init__(
@@ -77,6 +136,10 @@
dimensionality_reduction: Optional[TransformerMixin] = None,
clustering: Optional[ClusterMixin] = None,
feature_importance: Literal["ctfidf", "centroid"] = "ctfidf",
+        n_reduce_to: Optional[int] = None,
+        reduction_method: Literal[
+            "agglomerative", "smallest"
+        ] = "agglomerative",
):
self.encoder = encoder
if isinstance(encoder, int):
@@ -100,6 +163,69 @@
else:
self.dimensionality_reduction = dimensionality_reduction
self.feature_importance = feature_importance
self.n_reduce_to = n_reduce_to
self.reduction_method = reduction_method

def _merge_agglomerative(self, n_reduce_to: int) -> np.ndarray:
n_topics = self.components_.shape[0]
res = {old_label: old_label for old_label in self.classes_}
if n_topics <= n_reduce_to:
return self.labels_
interesting_topic_vectors = np.stack(
[
vec
for label, vec in zip(self.classes_, self.topic_vectors_)
if label != -1
]
)
old_labels = [label for label in self.classes_ if label != -1]
new_labels = AgglomerativeClustering(
n_clusters=n_reduce_to, metric="cosine", linkage="average"
).fit_predict(interesting_topic_vectors)
res = {}
if -1 in self.classes_:
res[-1] = -1
for i_old, i_new in zip(old_labels, new_labels):
res[i_old] = i_new
return np.array([res[label] for label in self.labels_])
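The relabeling step above, seen in isolation (toy vectors; sklearn >= 1.2 is assumed, where `AgglomerativeClustering` takes `metric` rather than the older `affinity` argument):

```python
import numpy as np
from sklearn.cluster import AgglomerativeClustering

# Two pairs of near-identical topic vectors should collapse into two topics.
topic_vectors = np.array([[1.0, 0.0], [0.99, 0.1], [0.0, 1.0], [0.1, 0.99]])
old_labels = np.array([0, 1, 2, 3])

new_labels = AgglomerativeClustering(
    n_clusters=2, metric="cosine", linkage="average"
).fit_predict(topic_vectors)
mapping = {int(o): int(n) for o, n in zip(old_labels, new_labels)}
print(mapping)  # e.g. {0: 0, 1: 0, 2: 1, 3: 1}: similar topics share a label
```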

def _merge_smallest(self, n_reduce_to: int):
merge_inst = smallest_hierarchical_join(
self.topic_vectors_[self.classes_ != -1],
self.topic_sizes_[self.classes_ != -1],
self.classes_[self.classes_ != -1],
n_reduce_to,
)
labels = np.copy(self.labels_)
for from_topic, to_topic in merge_inst:
labels[labels == from_topic] = to_topic
return labels

def _estimate_parameters(
self,
embeddings: np.ndarray,
doc_term_matrix: np.ndarray,
):
clusters = np.unique(self.labels_)
self.classes_ = np.sort(clusters)
self.topic_sizes_ = np.array(
[np.sum(self.labels_ == label) for label in self.classes_]
)
self.topic_vectors_ = calculate_topic_vectors(self.labels_, embeddings)
self.vocab_embeddings = self.encoder_.encode(self.vectorizer.get_feature_names_out()) # type: ignore
document_topic_matrix = label_binarize(
self.labels_, classes=self.classes_
)
if self.feature_importance == "soft-c-tf-idf":
self.components_ = soft_ctf_idf(document_topic_matrix, doc_term_matrix) # type: ignore
elif self.feature_importance == "centroid":
self.components_ = cluster_centroid_distance(
self.topic_vectors_,
self.vocab_embeddings,
metric="cosine",
)
else:
self.components_ = ctf_idf(document_topic_matrix, doc_term_matrix)
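Worth noting in `_estimate_parameters`: hard cluster labels are one-hot encoded with `label_binarize`, so the soft and hard c-tf-idf estimators receive the same kind of document-topic matrix. A tiny sketch with invented labels:

```python
import numpy as np
from sklearn.preprocessing import label_binarize

labels = np.array([-1, 0, 1, 1, 0])    # -1 marks outlier documents
classes = np.sort(np.unique(labels))   # [-1, 0, 1]
doc_topic = label_binarize(labels, classes=classes)
print(doc_topic)
# [[1 0 0]
#  [0 1 0]
#  [0 0 1]
#  [0 0 1]
#  [0 1 0]]
```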

def fit_predict(
self, raw_documents, y=None, embeddings: Optional[np.ndarray] = None
@@ -127,37 +253,42 @@ def fit_predict(
embeddings = self.encoder_.encode(raw_documents)
console.log("Encoding done.")
status.update("Extracting terms")
-            doc_term_matrix = self.vectorizer.fit_transform(raw_documents)
+            self.doc_term_matrix = self.vectorizer.fit_transform(raw_documents)
             console.log("Term extraction done.")
             vocab = self.vectorizer.get_feature_names_out()
             status.update("Reducing Dimensionality")
             reduced_embeddings = self.dimensionality_reduction.fit_transform(
                 embeddings
             )
             console.log("Dimensionality reduction done.")
             status.update("Clustering documents")
-            cluster_labels = self.clustering.fit_predict(reduced_embeddings)
-            clusters = np.unique(cluster_labels)
+            self.labels_ = self.clustering.fit_predict(reduced_embeddings)
             console.log("Clustering done.")
-            self.classes_ = np.sort(clusters)
-            status.update("Estimating term importances")
-            if self.feature_importance == "ctfidf":
-                document_topic_matrix = label_binarize(
-                    cluster_labels, classes=self.classes_
-                )
-                self.components_ = soft_ctf_idf(document_topic_matrix, doc_term_matrix)  # type: ignore
-            else:
-                status.update("Encoding vocabulary")
-                vocab_embeddings = self.encoder_.encode(vocab)  # type: ignore
-                self.components_ = cluster_centroid_distance(
-                    cluster_labels,
-                    vocab_embeddings,
-                    metric="euclidean",
-                )
-            self.labels_ = cluster_labels
-            console.log("Model fitting done.")
-        return cluster_labels
+            status.update("Estimating parameters.")
+            self._estimate_parameters(
+                embeddings,
+                self.doc_term_matrix,
+            )
+            console.log("Parameter estimation done.")
+            if self.n_reduce_to is not None:
+                n_topics = self.classes_.shape[0]
+                status.update(
+                    f"Reducing topics from {n_topics} to {self.n_reduce_to}"
+                )
+                if self.reduction_method == "agglomerative":
+                    self.labels_ = self._merge_agglomerative(self.n_reduce_to)
+                else:
+                    self.labels_ = self._merge_smallest(self.n_reduce_to)
+                console.log(
+                    f"Topic reduction done from {n_topics} to {self.n_reduce_to}."
+                )
+                status.update("Reestimating parameters.")
+                self._estimate_parameters(
+                    embeddings,
+                    self.doc_term_matrix,
+                )
+                console.log("Reestimation done.")
+            console.log("Model fitting done.")
+        return self.labels_

def fit_transform(
self, raw_documents, y=None, embeddings: Optional[np.ndarray] = None
2 changes: 1 addition & 1 deletion turftopic/models/gmm.py
@@ -9,7 +9,7 @@

from turftopic.base import ContextualModel, Encoder
from turftopic.dynamic import DynamicTopicModel, bin_timestamps
-from turftopic.soft_ctf_idf import soft_ctf_idf
+from turftopic.feature_importance import soft_ctf_idf
from turftopic.vectorizer import default_vectorizer


15 changes: 0 additions & 15 deletions turftopic/soft_ctf_idf.py

This file was deleted.
