diff --git a/docs/KeyNMF.md b/docs/KeyNMF.md index fc5a6d2..852891c 100644 --- a/docs/KeyNMF.md +++ b/docs/KeyNMF.md @@ -19,8 +19,10 @@ Keywords are assigned to each document based on the cosine similarity of the doc Only the top K words with positive cosine similarity to the document are kept. These keywords are then arranged into a document-term importance matrix where each column represents a keyword that was encountered in at least one document, -and each row is a document. -The entries in the matrix are the cosine similarities of the given keyword to the document in semantic space. +and each row is a document. The entries in the matrix are the cosine similarities of the given keyword to the document in semantic space. + +Keyword extraction can be performed by computing cosine similarities between document embeddings and embeddings of the entire vocabulary, +or between document embeddings and words that occur within each document. The former scenario (the 'corpus' keyword scope) allows for multilingual topics. ### 2. Topic Discovery @@ -39,7 +41,6 @@ can be explained. ### Weaknesses - - Lack of Multilingual Capabilities: KeyNMF as it is currently implemented cannot be used in a multilingual context. Changes to the model that allow this are possible, and will likely be ijmplemented in the future. - Lack of Nuance: Since only the top K keywords are considered and used for topic extraction some of the nuances, especially in long texts might get lost. We therefore recommend that you scale K with the average length of the texts you're working with. For tweets it might be worth it to scale it down to 5, while with longer documents, a larger number (let's say 50) might be advisable. - Practitioners have to choose the number of topics a priori. 
diff --git a/tests/test_integration.py b/tests/test_integration.py index c87a2c4..ee31f30 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -29,7 +29,8 @@ models = [ GMM(5, encoder=trf), SemanticSignalSeparation(5, encoder=trf), - KeyNMF(5, encoder=trf), + KeyNMF(5, encoder=trf, keyword_scope='document'), + KeyNMF(5, encoder=trf, keyword_scope='corpus'), ClusteringTopicModel( n_reduce_to=5, feature_importance="c-tf-idf", diff --git a/turftopic/models/keynmf.py b/turftopic/models/keynmf.py index 3492c98..513c2e9 100644 --- a/turftopic/models/keynmf.py +++ b/turftopic/models/keynmf.py @@ -74,6 +74,11 @@ class KeyNMF(ContextualModel): Can be used to prune or filter the vocabulary. top_n: int, default 25 Number of keywords to extract for each document. + keyword_scope: str, default 'document' + Specifies whether keyword extraction for each document + is performed on the whole vocabulary ('corpus') or only + using words that are included in the document ('document'). + Setting this to 'corpus' allows for multilingual topics. 
""" def __init__( @@ -84,7 +89,10 @@ def __init__( ] = "sentence-transformers/all-MiniLM-L6-v2", vectorizer: Optional[CountVectorizer] = None, top_n: int = 25, + keyword_scope: str = 'document', ): + if keyword_scope not in ['document', 'corpus']: + raise ValueError("keyword_scope must be 'document' or 'corpus'") self.n_components = n_components self.top_n = top_n self.encoder = encoder @@ -98,6 +106,7 @@ def __init__( self.vectorizer = vectorizer self.dict_vectorizer_ = DictVectorizer() self.nmf_ = NMF(n_components) + self.keyword_scope = keyword_scope def extract_keywords( self, @@ -114,11 +123,15 @@ def extract_keywords( for i in range(total): terms = document_term_matrix[i, :].todense() embedding = embeddings[i].reshape(1, -1) - nonzero = terms > 0 - if not np.any(nonzero): + if self.keyword_scope == 'document': + mask = terms > 0 + else: + tot_freq = document_term_matrix.sum(axis=0) + mask = tot_freq != 0 + if not np.any(mask): keywords.append(dict()) continue - important_terms = np.squeeze(np.asarray(nonzero)) + important_terms = np.squeeze(np.asarray(mask)) word_embeddings = self.vocab_embeddings[important_terms] sim = cosine_similarity(embedding, word_embeddings) sim = np.ravel(sim) @@ -272,7 +285,7 @@ def prepare_topic_data( except (NotFittedError, AttributeError): doc_topic_matrix = self.nmf_.fit_transform(dtm) self.components_ = self.nmf_.components_ - console.log("Model fiting done.") + console.log("Model fitting done.") res: TopicData = { "corpus": corpus, "document_term_matrix": dtm,