16. Cross-Lingual NLP
Chapter 16 of 18 · 20 min
Cross-lingual NLP processes text in multiple languages, enabling knowledge transfer and multilingual applications. With local models, cross-lingual capabilities depend heavily on model training data and architecture choices.
Cross-Lingual Embeddings
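MUSE (word-level) and LaBSE (sentence-level) provide embeddings that are aligned across languages. The example below uses LaBSE through the sentence-transformers library: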
from sentence_transformers import SentenceTransformer
import numpy as np
class CrossLingualSimilarity:
    """Cosine-similarity utilities on top of a multilingual sentence encoder.

    The encoder is loaded with ``SentenceTransformer(model_name)``; every
    method compares the vectors returned by its ``encode`` method, so all
    scores are cosine similarities between those vectors.
    """

    def __init__(self, model_name="sentence-transformers/LaBSE"):
        """Load the sentence encoder identified by *model_name*."""
        self.model = SentenceTransformer(model_name)

    @staticmethod
    def _unit_rows(vectors) -> np.ndarray:
        """Return *vectors* as an array whose rows are scaled to unit L2 norm."""
        matrix = np.asarray(vectors)
        norms = np.linalg.norm(matrix, axis=1, keepdims=True)
        # An all-zero row has no direction; dividing it by 1 keeps it at zero
        # (similarity 0.0) instead of producing NaN.
        return matrix / np.where(norms == 0.0, 1.0, norms)

    def compute_similarity(self, text1: str, text2: str) -> float:
        """Return the cosine similarity between the embeddings of two texts.

        The texts may be in different languages; both are encoded in a single
        ``encode`` call.
        """
        first, second = self._unit_rows(self.model.encode([text1, text2]))
        return float(np.dot(first, second))

    def find_parallel_sentences(self, corpus1: list[str], corpus2: list[str],
                                threshold: float = 0.8) -> list[tuple[int, int, float]]:
        """Find sentence pairs across two corpora that score at least *threshold*.

        Args:
            corpus1: Sentences of the first corpus.
            corpus2: Sentences of the second corpus.
            threshold: Minimum cosine similarity (inclusive) for a pair to be
                reported.

        Returns:
            ``(index_in_corpus1, index_in_corpus2, similarity)`` tuples sorted
            by similarity, highest first. Empty if either corpus is empty.
        """
        if len(corpus1) == 0 or len(corpus2) == 0:
            return []

        embeddings1 = self._unit_rows(self.model.encode(corpus1, show_progress_bar=True))
        embeddings2 = self._unit_rows(self.model.encode(corpus2, show_progress_bar=True))

        # Rows are unit vectors, so the matrix product is the cosine matrix.
        similarity_matrix = embeddings1 @ embeddings2.T

        # argwhere yields (row, col) pairs in row-major order, the same order
        # a nested loop over both corpora would visit them.
        matches = [
            (int(i), int(j), float(similarity_matrix[i, j]))
            for i, j in np.argwhere(similarity_matrix >= threshold)
        ]
        return sorted(matches, key=lambda match: match[2], reverse=True)

    def translate_by_retrieval(self, source_text: str,
                               target_corpus: list[str]) -> dict:
        """Retrieve the entry of *target_corpus* most similar to *source_text*.

        Args:
            source_text: Text to look up.
            target_corpus: Candidate texts to choose from.

        Returns:
            A dict with keys ``"source"`` (the input text), ``"translation"``
            (the best-matching candidate) and ``"similarity"`` (its cosine
            similarity to the source).

        Raises:
            ValueError: If *target_corpus* is empty.
        """
        if len(target_corpus) == 0:
            raise ValueError("target_corpus must not be empty")

        source_embedding = self._unit_rows(self.model.encode([source_text]))
        target_embeddings = self._unit_rows(self.model.encode(target_corpus))

        # Normalizing first makes the ranking independent of vector length,
        # matching the cosine scores used by the other methods.
        similarities = (source_embedding @ target_embeddings.T)[0]
        best_idx = int(similarities.argmax())
        return {
            "source": source_text,
            "translation": target_corpus[best_idx],
            "similarity": float(similarities[best_idx]),
        }
XLM-RoBERTa for Cross-Lingual Classification
import torch
import torch.nn.functional as F
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification
class CrossLingualClassifier:
def __init__(self, model_name="xlm-roberta-base"):
self.tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
self.model = XLMRobertaForSequenceClassification.from_pretrained(
model_name, num_labels=3
)
def predict(self, text: str, language_hint: str = None) -> dict:
"""Classify text regardless of language."""
inputs = self.tokenizer(text, return_tensors="pt", truncation=True,
max_length=256, padding=True)
with torch.no_grad():
outputs = self.model(**inputs)
probs = F.softmax(outputs.logits, dim=-1)
labels = ["negative", "neutral", "positive"]
pred_idx = probs.argmax().item()
return {
"text": text,
"predicted_label": labels[pred_idx],
"confidence": probs[0, pred_idx].item(),
"probabilities": {l: probs[0, i].item() for i, l in enumerate(labels)}
}
def zero_shot_classify(self, text: str, candidate_labels: list[str]) -> dict:
"""Classify into arbitrary labels using NLI-based approach."""
results = {}
for label in candidate_labels:
# Format as NLI premise-hypothesis
inputs = self.tokenizer(
text,
f"This text is about {label}.",
return_tensors="pt",
truncation=True,
max_length=256,
padding=True
)
# Use MNLI-style classification
with torch.no_grad():
outputs = self.model(**inputs)
probs = F.softmax(outputs.logits, dim=-1)
results[label] = probs[0, 2].item() # Entailment probability
return {
"text": text,
"classification": max(results, key=results.get),
"scores": results
}
EXERCISE
Build a cross-lingual document similarity search that works across language pairs. Implement hybrid retrieval combining cross-lingual embeddings with machine translation for improved accuracy.