16. Cross-Lingual NLP

Chapter 16 of 18 · 20 min

Cross-lingual NLP processes text in multiple languages, enabling knowledge transfer between languages and multilingual applications. With local models, cross-lingual capabilities depend heavily on the model's training data and architecture.

Cross-Lingual Embeddings

Models such as MUSE and LaBSE provide embeddings that are aligned across languages. The example below uses LaBSE through the sentence-transformers library:

from sentence_transformers import SentenceTransformer
import numpy as np

class CrossLingualSimilarity:
    """Cosine-similarity utilities on top of a multilingual sentence encoder.

    All three public methods compare L2-normalized embeddings, so every
    reported score is a cosine similarity regardless of whether the loaded
    model normalizes its own output.
    """

    def __init__(self, model_name="sentence-transformers/LaBSE"):
        self.model = SentenceTransformer(model_name)

    @staticmethod
    def _unit_rows(embeddings) -> np.ndarray:
        """Return *embeddings* as a 2-D array whose rows have unit L2 norm."""
        matrix = np.atleast_2d(np.asarray(embeddings))
        norms = np.linalg.norm(matrix, axis=1, keepdims=True)
        # An all-zero row has no direction; dividing by 1 keeps it zero
        # (similarity 0.0) instead of producing NaN.
        norms[norms == 0] = 1.0
        return matrix / norms

    def compute_similarity(self, text1: str, text2: str) -> float:
        """Compute cosine similarity between texts in potentially different languages."""
        unit = self._unit_rows(self.model.encode([text1, text2]))
        return float(np.dot(unit[0], unit[1]))

    def find_parallel_sentences(self, corpus1: list[str], corpus2: list[str],
                                 threshold: float = 0.8) -> list[tuple[int, int, float]]:
        """Find sentence pairs between two corpora.

        Args:
            corpus1: Sentences of the first corpus.
            corpus2: Sentences of the second corpus.
            threshold: Minimum cosine similarity (inclusive) for a pair to
                be reported.

        Returns:
            ``(index_in_corpus1, index_in_corpus2, similarity)`` tuples for
            every pair at or above ``threshold``, sorted by similarity in
            descending order. Empty if either corpus is empty.
        """
        if not corpus1 or not corpus2:
            return []

        unit1 = self._unit_rows(self.model.encode(corpus1, show_progress_bar=True))
        unit2 = self._unit_rows(self.model.encode(corpus2, show_progress_bar=True))

        # Rows are unit vectors, so the matrix product is the cosine matrix.
        similarity_matrix = unit1 @ unit2.T

        # np.nonzero yields indices in row-major order, the same order a
        # nested i/j loop would visit them, so ties keep a stable ordering.
        rows, cols = np.nonzero(similarity_matrix >= threshold)
        matches = [
            (int(i), int(j), float(similarity_matrix[i, j]))
            for i, j in zip(rows, cols)
        ]

        return sorted(matches, key=lambda match: match[2], reverse=True)

    def translate_by_retrieval(self, source_text: str,
                                target_corpus: list[str]) -> dict:
        """Find the most similar text in a target-language corpus.

        Args:
            source_text: Text to look up.
            target_corpus: Candidate texts to retrieve from.

        Returns:
            A dict with keys ``"source"`` (the input text), ``"translation"``
            (the best-matching candidate) and ``"similarity"`` (their cosine
            similarity).

        Raises:
            ValueError: If ``target_corpus`` is empty.
        """
        if not target_corpus:
            raise ValueError("target_corpus must contain at least one candidate text")

        source_unit = self._unit_rows(self.model.encode([source_text]))
        target_units = self._unit_rows(self.model.encode(target_corpus))

        # Normalizing first makes this a cosine ranking; a raw dot product
        # would favor candidates with large embedding norms.
        similarities = (source_unit @ target_units.T)[0]
        best_idx = int(similarities.argmax())

        return {
            "source": source_text,
            "translation": target_corpus[best_idx],
            "similarity": float(similarities[best_idx])
        }

XLM-RoBERTa for Cross-Lingual Classification

import torch
import torch.nn.functional as F
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification

class CrossLingualClassifier:
    """Sequence classifier built on an XLM-RoBERTa checkpoint.

    The classification head is created with one output per entry in
    ``labels``; output index ``i`` is reported under ``labels[i]``.
    """

    DEFAULT_LABELS = ("negative", "neutral", "positive")

    def __init__(self, model_name="xlm-roberta-base", labels=None):
        """Load the tokenizer and model.

        Args:
            model_name: Checkpoint name or path passed to ``from_pretrained``.
            labels: Class names in output-index order. Defaults to
                ``("negative", "neutral", "positive")``.
        """
        self.labels = list(self.DEFAULT_LABELS if labels is None else labels)
        self.tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
        self.model = XLMRobertaForSequenceClassification.from_pretrained(
            model_name, num_labels=len(self.labels)
        )

    def predict(self, text: str, language_hint: str = None) -> dict:
        """Classify text regardless of language.

        Args:
            text: Input text; tokenized with truncation to 256 tokens.
            language_hint: Currently unused; accepted for interface
                compatibility.

        Returns:
            A dict with ``"text"``, ``"predicted_label"``, ``"confidence"``
            (probability of the predicted label) and ``"probabilities"``
            (label -> probability).
        """
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True,
                                max_length=256, padding=True)

        with torch.no_grad():
            logits = self.model(**inputs).logits

        # One text in, so row 0 is the only batch entry; indexing it
        # explicitly keeps argmax a class index rather than a flat offset.
        probs = F.softmax(logits, dim=-1)[0]
        pred_idx = int(probs.argmax())

        return {
            "text": text,
            "predicted_label": self.labels[pred_idx],
            "confidence": probs[pred_idx].item(),
            "probabilities": {
                label: probs[i].item() for i, label in enumerate(self.labels)
            }
        }

    def zero_shot_classify(self, text: str, candidate_labels: list[str],
                           entailment_index: int = 2) -> dict:
        """Classify into arbitrary labels using an NLI-based approach.

        Each candidate is scored by feeding ``text`` as the premise and
        ``"This text is about {label}."`` as the hypothesis, then reading
        the probability at ``entailment_index``.

        NOTE(review): the scores are only meaningful if the loaded
        checkpoint was fine-tuned for NLI with its entailment class at
        ``entailment_index`` - confirm for the checkpoint in use.

        Args:
            text: Premise text.
            candidate_labels: Labels to score.
            entailment_index: Output index treated as "entailment".

        Returns:
            A dict with ``"text"``, ``"classification"`` (highest-scoring
            label) and ``"scores"`` (label -> entailment probability).

        Raises:
            ValueError: If ``candidate_labels`` is empty.
        """
        if not candidate_labels:
            raise ValueError("candidate_labels must contain at least one label")

        results = {}
        for label in candidate_labels:
            # Format as NLI premise-hypothesis pair
            inputs = self.tokenizer(
                text,
                f"This text is about {label}.",
                return_tensors="pt",
                truncation=True,
                max_length=256,
                padding=True
            )

            with torch.no_grad():
                logits = self.model(**inputs).logits

            probs = F.softmax(logits, dim=-1)
            results[label] = probs[0, entailment_index].item()

        return {
            "text": text,
            "classification": max(results, key=results.get),
            "scores": results
        }
EXERCISE

Build a cross-lingual document similarity search that works across language pairs. Implement hybrid retrieval combining cross-lingual embeddings with machine translation for improved accuracy.