KEY INSIGHT
A prompt framework succeeds when it makes the right choice obvious and the wrong choice impossible. Constraints codified in code outperform conventions documented in text.
### Framework Architecture
The framework consists of five interconnected modules:
```python
# framework/
# ├── __init__.py
# ├── core/
# │   ├── template.py     # Template management
# │   ├── schema.py       # Input/output validation
# │   └── render.py       # Multi-model rendering
# ├── testing/
# │   ├── harness.py      # Evaluation infrastructure
# │   ├── ab_test.py      # A/B testing integration
# │   └── optimizer.py    # Automated improvement
# ├── deployment/
# │   ├── router.py       # Model routing
# │   └── monitor.py      # Production monitoring
# └── cli.py              # Command-line interface
# core/template.py
class PromptTemplate:
    """Production-compatible prompt template.

    Couples a template string with input/output schemas and a list of
    registered models, each carrying its own format variant.
    """

    def __init__(self, name, template_str, input_schema, output_schema):
        """Create a template.

        Args:
            name: Identifier of the template.
            template_str: Raw template text; stored as ``self.template``.
            input_schema: Passed through ``schema_validator``; the result
                validates the keyword arguments given to ``render``.
            output_schema: Passed through ``schema_validator``.
        """
        self.name = name
        self.template = template_str
        # NOTE(review): schema_validator is not defined in this snippet;
        # presumably provided by core/schema.py - confirm.
        self.input_schema = schema_validator(input_schema)
        self.output_schema = schema_validator(output_schema)
        self.models = []  # Model compatibility list
        self.metadata = {}

    def register_model(self, model_name, model_config):
        """Register model-specific rendering.

        Args:
            model_name: Name under which the model is registered.
            model_config: Mapping of model settings; its optional
                ``'format_variant'`` key defaults to ``'default'``.
        """
        self.models.append({
            'name': model_name,
            'config': model_config,
            'format_variant': model_config.get('format_variant', 'default'),
        })

    def resolve_model(self, model_name=None):
        """Return the registered entry to render with.

        Args:
            model_name: Name of a registered model, or ``None`` to use the
                first registered model.

        Returns:
            The entry dict created by ``register_model``. When
            ``model_name`` is ``None`` and nothing is registered, a
            placeholder entry using the ``'default'`` format variant.

        Raises:
            KeyError: If ``model_name`` is given but was never registered.
        """
        if model_name is None:
            if self.models:
                return self.models[0]
            return {'name': None, 'config': {}, 'format_variant': 'default'}
        for entry in self.models:
            if entry['name'] == model_name:
                return entry
        raise KeyError(
            f'model {model_name!r} is not registered for template {self.name!r}'
        )

    def render(self, model_name=None, **kwargs):
        """Render for specific model or default.

        Keyword arguments are validated against the input schema before
        any rendering happens.
        """
        # Assumes validate() returns a mapping, since the result is
        # unpacked with ** below - TODO confirm against schema_validator.
        validated_input = self.input_schema.validate(kwargs)
        model = self.resolve_model(model_name)
        return render_for_model(self.template, model['format_variant'], **validated_input)
```
### Schema-Based Validation
The framework enforces input/output schemas to guarantee production compatibility:
```python
# core/schema.py
from typing import Generic, Literal, TypeVar

from pydantic import BaseModel, Field, ValidationError
T = TypeVar('T')


class PromptSchema(Generic[T]):
    """Schema wrapper that adds prompt-specific validation."""

    def __init__(self, model_cls=None):
        """Bind the class used to build validated instances.

        Args:
            model_cls: Callable invoked as ``model_cls(**data)``. When
                omitted, a ``Model`` attribute declared on the subclass
                is used instead.

        Raises:
            TypeError: If no ``model_cls`` is given and the subclass
                declares no ``Model`` attribute.
        """
        if model_cls is None:
            # Lets subclasses declare their model as a nested ``Model`` class.
            model_cls = getattr(type(self), 'Model', None)
        if model_cls is None:
            raise TypeError(
                f'{type(self).__name__} needs a model_cls argument '
                'or a nested Model class'
            )
        self.model_cls = model_cls

    def validate(self, data: dict) -> T:
        """Validate and return typed instance.

        Builds ``model_cls(**data)`` and then runs the prompt-specific
        hook on the result. Exceptions raised by either step propagate.
        """
        instance = self.model_cls(**data)
        self._validate_prompt_constraints(instance)
        return instance

    def _validate_prompt_constraints(self, instance):
        """Hook for prompt-specific validation rules.

        The base implementation does nothing; subclasses may override it
        and raise to reject an instance.
        """
class DocumentInput(PromptSchema):
    """Standard input for document processing tasks."""

    class Model(BaseModel):
        """Fields accepted for a document-processing request."""

        text: str = Field(min_length=10, max_length=50000)
        modality: Literal['legal', 'technical', 'casual'] = 'casual'
        language: str = Field(default='en', pattern=r'^[a-z]{2}$')
        priority: Literal['low', 'normal', 'high'] = 'normal'

    def __init__(self, model_cls=None):
        """Bind the nested ``Model`` unless another class is supplied."""
        super().__init__(model_cls if model_cls is not None else self.Model)


# Validation catches errors before model call
try:
    # validate() is an instance method, so the schema is instantiated first.
    validated = DocumentInput().validate({
        'text': 'Short',  # Too short
        'modality': 'legal'
    })
except ValidationError as e:
    print(e)  # Error raised before API call
```
### Testing Infrastructure
The testing module evaluates templates across models with statistical rigor:
```python
# testing/harness.py
class EvaluationHarness:
    """Evaluate prompt templates against YAML fixture cases."""

    def __init__(self, tests_dir='tests/fixtures'):
        """Remember the fixtures directory and start with an empty cache.

        Args:
            tests_dir: Directory holding one ``<prompt_name>.yaml`` file
                per template.
        """
        self.tests_dir = Path(tests_dir)
        self.results_cache = {}

    def load_test_cases(self, prompt_name):
        """Load test cases from fixtures directory.

        Returns:
            The value under the fixture's ``'cases'`` key, or an empty
            list when the file is missing, empty, or has no cases.
        """
        path = self.tests_dir / f'{prompt_name}.yaml'
        if not path.exists():
            return []
        # An empty YAML document parses to None; treat it like "no cases".
        data = yaml.safe_load(path.read_text(encoding='utf-8')) or {}
        return data.get('cases') or []

    def evaluate(self, template, model_client, n_samples=5):
        """Statistical evaluation with confidence intervals.

        Args:
            template: Template under test; its ``name`` selects the fixture.
            model_client: Passed through to ``_collect_samples``.
            n_samples: Number of samples requested per case.

        Returns:
            Dict with per-case records under ``'cases'`` and aggregate
            metrics under ``'summary'``.
        """
        cases = self.load_test_cases(template.name)
        results = {'cases': []}
        for case in cases:
            samples = self._collect_samples(template, model_client, case, n_samples)
            consensus = self._compute_consensus(samples)
            results['cases'].append({
                'input': case['input'],
                'expected': case['expected'],
                'samples': samples,
                'consensus': consensus,
                'consensus_correct': self._score(consensus, case['expected']),
            })
        results['summary'] = self._summarize(results['cases'])
        return results

    def _summarize(self, cases):
        """Compute aggregate metrics with confidence intervals.

        Args:
            cases: Records carrying a numeric ``'consensus_correct'`` score.

        Returns:
            Dict with ``n``, ``mean``, ``std``, ``p5``, ``p95``,
            ``ci95_lower`` and ``ci95_upper``. With no cases every metric
            except ``n`` is NaN, so threshold comparisons evaluate to False
            instead of raising.
        """
        scores = [c['consensus_correct'] for c in cases]
        if not scores:
            nan = float('nan')
            return {
                'n': 0,
                'mean': nan,
                'std': nan,
                'p5': nan,
                'p95': nan,
                'ci95_lower': nan,
                'ci95_upper': nan,
            }
        mean = np.mean(scores)
        std = np.std(scores)  # numpy default ddof=0 (population std)
        p5, p95 = np.percentile(scores, [5, 95])
        # Normal-approximation half width of the 95% interval (z = 1.96).
        half_width = 1.96 * std / np.sqrt(len(scores))
        return {
            'n': len(cases),
            'mean': mean,
            'std': std,
            'p5': p5,
            'p95': p95,
            'ci95_lower': mean - half_width,
            'ci95_upper': mean + half_width,
        }
```
### Deployment Router
Routing selects the optimal model per request:
```python
# deployment/router.py
class PromptRouter:
    """Route requests to optimal model based on task and model capabilities."""

    def __init__(self, model_registry):
        """Store the model registry and start with no rules or templates."""
        self.registry = model_registry
        self.routing_rules = []
        self.templates = {}  # registered templates keyed by template.name

    def register(self, template):
        """Record ``template`` under its ``name`` and return it.

        Registering another template with the same name replaces the
        earlier one.
        """
        self.templates[template.name] = template
        return template

    def add_rule(self, condition_fn, model_name, priority=0):
        """Register routing rule with condition function.

        Args:
            condition_fn: Callable ``(template, input_data) -> bool``.
            model_name: Model returned when the condition holds.
            priority: Higher values are checked first.
        """
        self.routing_rules.append({
            'condition': condition_fn,
            'model': model_name,
            'priority': priority,
        })
        # list.sort is stable, so rules with equal priority keep the
        # order in which they were added.
        self.routing_rules.sort(key=lambda r: r['priority'], reverse=True)

    def route(self, template, input_data):
        """Select optimal model for this template+input combination.

        Returns the model of the first matching rule in priority order,
        else the template's first registered model, else the fallback.
        """
        for rule in self.routing_rules:
            if rule['condition'](template, input_data):
                return rule['model']
        # Default: use template's first registered model
        if template.models:
            return template.models[0]['name']
        return self._fallback_model()

    def _fallback_model(self):
        """Return most reliable fallback model."""
        # NOTE(review): the original comment said "configured via
        # environment", but no environment lookup exists here; the
        # value is hard-coded.
        return 'gpt4o'
# Example routing rules
router = PromptRouter(model_registry)

# High-priority requests are sent to 'claude'.
router.add_rule(lambda t, i: i.get('priority') == 'high', 'claude', priority=100)

# Anything mentioning 'code' in the template name or the text goes to 'deepseek'.
router.add_rule(
    lambda t, i: 'code' in t.name or 'code' in i.get('text', ''),
    'deepseek',
    priority=80,
)

# Long inputs (over 5000 characters) to the 'summarizer' template go to 'gpt4o'.
router.add_rule(
    lambda t, i: t.name == 'summarizer' and len(i.get('text', '')) > 5000,
    'gpt4o',
    priority=50,
)
```
### Integration and Testing
Assemble the framework and run the test suite:
```python
# Full integration test
def test_framework_integration():
    """End-to-end test of framework lifecycle.

    Builds a template, registers three models, adds a routing rule,
    evaluates the template and deploys it as a canary.

    Returns:
        True when every step and assertion succeeds.
    """
    # NOTE(review): router, model_registry, monitor, DeploymentManager,
    # SummaryOutput and the *_CONFIG / SUMMARIZER_TEMPLATE constants are
    # not defined in this snippet - presumably module-level; confirm.

    # 1. Create template with schemas
    summarizer = PromptTemplate(
        name='document_summarizer',
        # The constructor parameter is named template_str, not template.
        template_str=SUMMARIZER_TEMPLATE,
        input_schema=DocumentInput,
        output_schema=SummaryOutput,
    )
    summarizer.register_model('claude', CLAUDE_CONFIG)
    summarizer.register_model('gpt4o', GPT4O_CONFIG)
    summarizer.register_model('deepseek', DEEPSEEK_CONFIG)

    # 2. Add routing rules
    router.add_rule(
        condition_fn=lambda t, i: i.get('modality') == 'technical',
        model_name='deepseek',
        priority=70,
    )

    # 3. Evaluate across models
    harness = EvaluationHarness()
    results = harness.evaluate(summarizer, model_registry)
    assert results['summary']['mean'] > 0.85, "Accuracy below threshold"
    assert results['summary']['p5'] > 0.70, "Bottom 5% below acceptable"

    # 4. Deploy
    router.register(summarizer)
    deployment = DeploymentManager(router, monitor)
    deployment.deploy('document_summarizer', canary=0.05)
    return True


# Run full test
test_framework_integration()
```