23. Text Processing with Regex
Beyond simple matching, regex excels at structured extraction and validation. AI pipelines frequently need to pull information from logs, parse model outputs, or validate input formats.
Real-world patterns you'll encounter:
import re
from dataclasses import dataclass
@dataclass
class ModelMetrics:
    """Training metrics reported for a single epoch."""

    epoch: int  # epoch number
    loss: float  # loss value for that epoch
    accuracy: float  # accuracy value for that epoch
# One decimal number: optional sign, digits with at most one decimal point,
# optional exponent. Stricter than [\d.]+ so every capture is a string that
# float() accepts (no "1.2.3", no bare ".", no sentence-ending period).
_METRIC_NUMBER = r'[-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?'
_METRICS_LINE = re.compile(
    rf'Epoch (\d+):\s*loss=({_METRIC_NUMBER}),\s*acc=({_METRIC_NUMBER})'
)


def parse_metrics_log(log_line: str) -> ModelMetrics | None:
    """Extract metrics from log lines like 'Epoch 3: loss=1.23, acc=0.89'.

    The pattern is searched anywhere in ``log_line``, so surrounding text
    (timestamps, trailing punctuation) is tolerated.

    Args:
        log_line: A single line of training-log text.

    Returns:
        A ``ModelMetrics`` built from the epoch, loss and accuracy values,
        or ``None`` when the line does not contain the expected pattern.
    """
    match = _METRICS_LINE.search(log_line)
    if match is None:
        return None
    epoch, loss, accuracy = match.groups()
    return ModelMetrics(
        epoch=int(epoch),
        loss=float(loss),
        accuracy=float(accuracy),
    )
# Complex example: extract API responses with varied formats
def extract_confidence(response: str) -> float | None:
"""Extract confidence scores from various LLM response formats."""
patterns = [
r'confidence[:\s]+([0-9.]+)', # "confidence: 0.87"
r'conf\s*=?\s*([0-9.]+)', # "conf = 0.87" or like "conf 0.87"
r'\(([0-9.]+)\s*(?:->|:)\s*\w+\)', # "(0.87 -> positive)"
r'\[\s*([0-9.]+)\s*,', # "[0.87, 0.12, ...]"
]
for pattern in patterns:
match = re.search(pattern, response, re.IGNORECASE)
if match:
return float(match.group(1))
return None
# Test: run both helpers on sample inputs; the expected output is shown in the trailing comments.
log = "Epoch 5: loss=0.234, acc=0.891"
metrics = parse_metrics_log(log)
print(metrics) # ModelMetrics(epoch=5, loss=0.234, accuracy=0.891)
response = "Analysis complete. confidence: 0.923"
print(extract_confidence(response)) # 0.923
In type hints, the | (pipe) operator builds a union type (Python 3.10+). ModelMetrics | None is equivalent to Optional[ModelMetrics]: the function returns either a ModelMetrics instance or None.
Local verification checkpoint
Run the smallest example from this chapter in a local workspace and record the package version, runtime, data path, and observed output. If the result depends on model size, vector count, CPU/GPU backend, or available memory, note that constraint beside the exercise so the lesson remains reproducible.
Write a function normalize_whitespace(text: str) -> str that:
- Replaces runs of multiple spaces/tabs with a single space
- Removes leading/trailing whitespace from each line
- Collapses multiple blank lines into one
Test it on a string with messy formatting.