Aspect-Based Sentiment — Advanced NLP with Local Models (Chapter 12)

Aspect-based sentiment analysis (ABSA) extracts specific aspects from text and determines sentiment toward each. "The battery lasts long but the screen scratches easily" contains two aspects with opposite sentiments.

Aspect Extraction with Sequence Labeling

Token-classification models, in the style of named entity recognition (NER), extract aspect terms by tagging each token as O, B-ASPECT, or I-ASPECT:

from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline
import torch

class AspectExtractor:
    """Extract aspect terms from text with a BIO token-classification model.

    Each token is labelled ``O``, ``B-ASPECT`` or ``I-ASPECT``; a ``B-ASPECT``
    token followed by any number of ``I-ASPECT`` tokens forms one aspect,
    which is mapped back to a substring of the original text through the
    tokenizer's character offsets.
    """

    def __init__(self, model_path="./models/deberta-aspect"):
        """Load the tokenizer and token-classification model from *model_path*."""
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForTokenClassification.from_pretrained(model_path)
        # NOTE(review): hard-coded id -> tag mapping; presumably matches the
        # checkpoint's label order -- confirm against model.config.id2label.
        self.label_map = {0: "O", 1: "B-ASPECT", 2: "I-ASPECT"}
        # Built lazily by extract_aspects_ner() and reused across calls.
        self._ner_pipeline = None

    def extract_aspects(self, text: str) -> list[str]:
        """Return the aspect terms found in *text*, in order of appearance.

        Args:
            text: Raw input text. It is truncated to 256 tokens before
                classification.

        Returns:
            One stripped substring of *text* per detected aspect; an empty
            list when no token is tagged ``B-ASPECT``.

        Raises:
            KeyError: If the model predicts a label id missing from
                ``self.label_map``.
        """
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True,
                                max_length=256, return_offsets_mapping=True)
        # The offsets are not a model input, so remove them before the forward pass.
        offsets = inputs.pop("offset_mapping")[0].tolist()

        with torch.no_grad():
            logits = self.model(**inputs).logits
        label_ids = logits.argmax(dim=-1)[0].tolist()

        spans = []        # [start, end] character range of every aspect
        open_span = None  # the range still being extended, if any
        for label_id, (tok_start, tok_end) in zip(label_ids, offsets):
            if tok_start == tok_end:
                # Zero-width offsets carry no text (e.g. special tokens), so
                # they must neither open nor extend nor close a span.
                continue
            label = self.label_map[label_id]
            if label == "B-ASPECT":
                open_span = [tok_start, tok_end]
                spans.append(open_span)
            elif label == "I-ASPECT" and open_span is not None:
                open_span[1] = tok_end
            else:
                # "O", or an "I-ASPECT" with no preceding "B-ASPECT": the
                # current aspect (if any) ends here.
                open_span = None

        # Slice the original text so every aspect is returned verbatim.
        aspects = [text[start:end].strip() for start, end in spans]
        return [aspect for aspect in aspects if aspect]

    def extract_aspects_ner(self, text: str) -> list[dict]:
        """Alternative using NER pipeline for aspect extraction.

        Args:
            text: Raw input text.

        Returns:
            One dict per entity whose group is ``"ASPECT"``, with keys
            ``"text"``, ``"start"``, ``"end"`` and ``"score"``.
        """
        # Constructing a pipeline is not free; build it once and reuse it.
        ner_pipeline = getattr(self, "_ner_pipeline", None)
        if ner_pipeline is None:
            ner_pipeline = pipeline(
                "ner",
                model=self.model,
                tokenizer=self.tokenizer,
                aggregation_strategy="simple",
            )
            self._ner_pipeline = ner_pipeline

        entities = ner_pipeline(text)
        return [
            {"text": e["word"], "start": e["start"], "end": e["end"], "score": e["score"]}
            for e in entities if e["entity_group"] == "ASPECT"
        ]

Joint Aspect-Sentiment Prediction

Modern approaches predict aspects and their sentiments jointly, for example by prompting a locally hosted instruction-tuned LLM and parsing its tabular answer:

class AspectSentimentAnalyzer:
    """Prompt a local causal LM for ``Aspect | Sentiment | Evidence`` rows."""

    def __init__(self, model_path="./models/llama-2-7b-chat-hf"):
        """Load the tokenizer and causal language model from *model_path*."""
        from transformers import AutoModelForCausalLM, AutoTokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForCausalLM.from_pretrained(model_path)

    def analyze(self, text: str) -> dict:
        """Ask the model for the aspects in *text* and parse its answer.

        Args:
            text: The review to analyse.

        Returns:
            ``{"text": <generated completion>, "aspects": [...]}`` where each
            aspect is a dict with keys ``"aspect"``, ``"sentiment"`` and
            ``"evidence"``.
        """
        prompt = f"""Extract all aspects mentioned in this review with their sentiment polarity.
For each aspect, indicate: aspect name, sentiment (positive/negative/neutral), and supporting evidence.

Review: {text}

Format:
Aspect | Sentiment | Evidence"""

        # Truncation is requested explicitly instead of relying on the
        # tokenizer's handling of a bare max_length.
        # NOTE(review): the format instructions sit at the end of the prompt,
        # so a very long review could push them past the 512-token cut --
        # consider shortening the review itself.
        inputs = self.tokenizer(prompt, return_tensors="pt",
                                truncation=True, max_length=512)
        # Greedy decoding: temperature has no effect when do_sample=False,
        # so it is not passed.
        outputs = self.model.generate(
            **inputs,
            max_new_tokens=150,
            do_sample=False,
        )

        # generate() returns prompt + completion for causal LMs. Decode only
        # the new tokens so the echoed review and format header never reach
        # the parser.
        prompt_length = inputs["input_ids"].shape[1]
        response = self.tokenizer.decode(outputs[0][prompt_length:],
                                         skip_special_tokens=True)
        return self._parse_results(response)

    def _parse_results(self, response: str) -> dict:
        """Parse LLM output into structured format.

        Every line containing ``|`` is read as ``aspect | sentiment |
        evidence``. Header rows (first cell equal to "Aspect"), table
        separator rows (only ``|``, ``-``, ``:`` and spaces) and rows without
        an aspect name are skipped. Outer pipes of markdown-style rows are
        ignored, and the evidence cell keeps any ``|`` it contains.

        Args:
            response: Raw text produced by the model.

        Returns:
            ``{"text": response, "aspects": [...]}``; ``"evidence"`` is ``""``
            when a row has only two cells.
        """
        aspects = []
        for line in response.splitlines():
            row = line.strip()
            if "|" not in row:
                continue
            if set(row) <= {"|", "-", ":", " "}:
                # Table separator such as "|---|---|---|".
                continue
            # Split at most twice so pipes inside the evidence survive.
            cells = [cell.strip() for cell in row.strip("|").split("|", 2)]
            if len(cells) < 2:
                continue
            aspect = cells[0]
            # Match the header by its first cell rather than by substring, so
            # rows that merely mention "Aspect" are kept.
            if not aspect or aspect.lower() == "aspect":
                continue
            aspects.append({
                "aspect": aspect,
                "sentiment": cells[1],
                "evidence": cells[2] if len(cells) > 2 else "",
            })
        return {"text": response, "aspects": aspects}