Topic Modeling — Advanced NLP with Local Models (Chapter 13)

Topic modeling discovers latent themes in document collections without explicit labels. Latent Dirichlet Allocation (LDA) and neural approaches both see active use with local models.

LDA with Gensim

Traditional LDA remains useful for interpretable topic discovery:

from gensim import corpora
from gensim.models import LdaModel
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

class LDATopicModeler:
    """Discover and infer topics with gensim's LDA implementation.

    Documents are lower-cased, tokenized with NLTK, and reduced to alphabetic
    tokens longer than three characters that are not stop words. The result
    is converted to a bag-of-words corpus and fed to ``LdaModel``.

    Note: constructing and using this class requires the NLTK stop-word
    corpus and the tokenizer data used by ``word_tokenize`` to be installed.
    """

    def __init__(self, num_topics=10):
        """Create an untrained modeler.

        Args:
            num_topics: Number of latent topics to fit.
        """
        self.num_topics = num_topics
        self.dictionary = None
        self.model = None
        self.stop_words = set(stopwords.words('english'))
        # Extra high-frequency words that carry little topical signal.
        self.stop_words.update(['would', 'could', 'also', 'one', 'two'])

    def preprocess(self, documents: list[str]) -> list[list[str]]:
        """Tokenize and filter documents.

        Args:
            documents: Raw document strings.

        Returns:
            One token list per input document, in the same order.
        """
        processed = []
        for doc in documents:
            tokens = word_tokenize(doc.lower())
            processed.append([
                t for t in tokens
                if t.isalpha() and t not in self.stop_words and len(t) > 3
            ])
        return processed

    def train(self, documents: list[str], no_below: int = 5,
              no_above: float = 0.5, passes: int = 10):
        """Fit the dictionary and the LDA model on ``documents``.

        Args:
            documents: Raw document strings to learn topics from.
            no_below: Passed to ``Dictionary.filter_extremes``; tokens found
                in fewer documents than this are dropped.
            no_above: Passed to ``Dictionary.filter_extremes``; tokens found
                in more than this fraction of documents are dropped.
            passes: Number of training passes over the corpus.

        Raises:
            ValueError: If no vocabulary survives filtering (typically a
                corpus that is too small for the chosen ``no_below``).
        """
        tokenized = self.preprocess(documents)

        dictionary = corpora.Dictionary(tokenized)
        dictionary.filter_extremes(no_below=no_below, no_above=no_above)
        if len(dictionary) == 0:
            raise ValueError(
                "No vocabulary left after filtering; provide more documents "
                "or relax no_below / no_above"
            )

        corpus = [dictionary.doc2bow(doc) for doc in tokenized]

        model = LdaModel(
            corpus=corpus,
            id2word=dictionary,
            num_topics=self.num_topics,
            passes=passes,
            alpha='auto',
            eta='auto',
            random_state=42
        )

        # Commit state only after a successful fit so a failed run never
        # leaves a dictionary that does not match the stored model.
        self.dictionary = dictionary
        self.model = model

    def get_topics(self) -> list[tuple[str, str]]:
        """Return all topics with top words.

        Returns:
            ``(label, keywords)`` pairs, where ``label`` is ``"Topic <id>"``
            and ``keywords`` is a comma-separated string of the topic's top
            ten words.

        Raises:
            ValueError: If the model has not been trained.
        """
        if self.model is None:
            raise ValueError("Model not trained")

        topics = []
        for topic_id in range(self.num_topics):
            words = self.model.show_topic(topic_id, topn=10)
            topic_str = ", ".join(word for word, _ in words)
            topics.append((f"Topic {topic_id}", topic_str))
        return topics

    def infer_topics(self, document: str, num_words: int = 5) -> list[tuple[str, float]]:
        """Get topic distribution for a new document.

        Args:
            document: Raw text of the document to classify.
            num_words: Maximum number of topics to return. The name is kept
                for backward compatibility; it limits topics, not words.

        Returns:
            ``(label, probability)`` pairs sorted by descending probability.

        Raises:
            ValueError: If the model has not been trained.
        """
        if self.model is None or self.dictionary is None:
            raise ValueError("Model not trained")

        tokens = self.preprocess([document])[0]
        bow = self.dictionary.doc2bow(tokens)

        topic_dist = self.model.get_document_topics(bow)
        return self._rank_topics(topic_dist, num_words)

    @staticmethod
    def _rank_topics(topic_dist, limit):
        """Label the ``limit`` most probable ``(topic_id, prob)`` pairs."""
        # The distribution arrives ordered by topic id, so a plain slice
        # would keep the lowest-numbered topics instead of the dominant ones.
        ranked = sorted(topic_dist, key=lambda pair: pair[1], reverse=True)
        return [(f"Topic {tid}", float(prob)) for tid, prob in ranked[:limit]]

BERTopic for Neural Topic Modeling

BERTopic embeds documents with a sentence transformer, reduces the embeddings with UMAP, and clusters them with HDBSCAN to form semantically coherent topics:

from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN

class NeuralTopicModeler:
    """Topic modeling with BERTopic on top of sentence-transformer embeddings.

    The pipeline is configured with an ``all-MiniLM-L6-v2`` embedding model,
    a UMAP reducer, and an HDBSCAN clusterer.
    """

    def __init__(self, num_topics=10, random_state=None):
        """Build the BERTopic pipeline.

        Args:
            num_topics: Passed to BERTopic as ``nr_topics``.
            random_state: Optional seed forwarded to UMAP. Leave as ``None``
                to keep UMAP's default unseeded behavior; set an integer to
                make repeated fits on the same data reproducible.
        """
        embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

        umap_model = UMAP(
            n_neighbors=15,
            n_components=5,
            min_dist=0.0,
            metric='cosine',
            random_state=random_state,
        )
        hdbscan_model = HDBSCAN(min_cluster_size=15, metric='euclidean', prediction_data=True)

        self.model = BERTopic(
            embedding_model=embedding_model,
            umap_model=umap_model,
            hdbscan_model=hdbscan_model,
            nr_topics=num_topics,
            verbose=True
        )

    def fit_transform(self, documents: list[str]) -> tuple:
        """Fit the model on ``documents`` and assign each one a topic.

        Returns:
            The ``(topics, probs)`` pair produced by ``BERTopic.fit_transform``.
        """
        topics, probs = self.model.fit_transform(documents)
        return topics, probs

    def get_topic_info(self):
        """Return BERTopic's per-topic summary.

        The value is passed through unchanged from
        ``BERTopic.get_topic_info()``, which BERTopic documents as a pandas
        DataFrame (not a dict).
        """
        return self.model.get_topic_info()

    def find_similar_topics(self, query: str, top_n: int = 5) -> list[dict]:
        """Find topics similar to a query description.

        Args:
            query: Free-text description to match against the fitted topics.
            top_n: Maximum number of topics to return.

        Returns:
            One dict per match with keys ``"topic_id"``, ``"similarity"``,
            and ``"keywords"``.
        """
        similar_topics, similarity = self.model.find_topics(query, top_n=top_n)
        return [
            {"topic_id": tid, "similarity": sim, "keywords": self._get_topic_keywords(tid)}
            for tid, sim in zip(similar_topics, similarity)
        ]

    def _get_topic_keywords(self, topic_id: int) -> list[str]:
        """Return up to ten keywords for ``topic_id``.

        Topic ``-1`` is BERTopic's outlier bucket and gets a fixed
        placeholder. An id the model does not know yields an empty list.
        """
        if topic_id == -1:
            return ["outlier", "noise"]
        topic_words = self.model.get_topic(topic_id)
        # BERTopic returns False (not a list) for an unknown topic id;
        # slicing that would raise TypeError.
        if not topic_words:
            return []
        return [word for word, _ in topic_words[:10]]