KEY INSIGHT
Prompt portability is an assumption, not a property. Each model may require its own model-specific prompt tuning, even when the task is identical.
```python
import anthropic
import openai
def _make_anthropic_client():
    """Create the Anthropic client; the SDK resolves its own credentials."""
    return anthropic.Anthropic()


def _make_openai_client():
    """Create the OpenAI client; the SDK resolves its own credentials."""
    return openai.OpenAI()


def _make_deepseek_client():
    """Create an OpenAI-compatible client pointed at the DeepSeek endpoint.

    The key is passed explicitly: without ``api_key`` the OpenAI SDK falls
    back to ``OPENAI_API_KEY``, which would send the OpenAI credential to
    DeepSeek's servers.

    Raises:
        KeyError: if ``DEEPSEEK_API_KEY`` is not set in the environment.
    """
    import os  # local import keeps this block self-contained

    return openai.OpenAI(
        api_key=os.environ['DEEPSEEK_API_KEY'],
        base_url="https://api.deepseek.com",
    )


class CrossModelTester:
    """Run the same prompts against several chat models and compare scores.

    Each ``MODELS`` entry needs ``'model'`` and ``'max_tokens'`` plus either a
    ready-made ``'client'`` or a zero-argument ``'client_factory'``. Clients
    are built on first use (and cached under ``'client'``) rather than at
    class-definition time, so importing this module does not require API
    credentials for every provider.
    """

    MODELS = {
        'claude': {
            'client_factory': _make_anthropic_client,
            'model': 'claude-sonnet-4-20250514',
            'max_tokens': 1024,
        },
        'gpt4o': {
            'client_factory': _make_openai_client,
            'model': 'gpt-4o',
            'max_tokens': 1024,
        },
        'deepseek': {
            'client_factory': _make_deepseek_client,
            'model': 'deepseek-chat',
            'max_tokens': 1024,
        },
    }

    def __init__(self, prompts: dict):
        """
        Args:
            prompts: dict of {prompt_name: prompt_template}
        """
        self.prompts = prompts

    @staticmethod
    def _get_client(model_config: dict):
        """Return the client for ``model_config``, building it on first use."""
        client = model_config.get('client')
        if client is None:
            client = model_config['client_factory']()
            # Cache so repeated runs reuse one client per model.
            model_config['client'] = client
        return client

    def _complete(self, model_name: str, model_config: dict, rendered: str) -> str:
        """Send ``rendered`` as a single user message and return the reply text."""
        client = self._get_client(model_config)
        request = {
            'model': model_config['model'],
            'max_tokens': model_config['max_tokens'],
            'messages': [{'role': 'user', 'content': rendered}],
        }
        if model_name == 'claude':
            # Anthropic Messages API: text lives in the first content block.
            response = client.messages.create(**request)
            return response.content[0].text
        # OpenAI-compatible chat completions (OpenAI, DeepSeek).
        response = client.chat.completions.create(**request)
        # ``content`` can be None; score that as empty output instead of
        # crashing inside the scorer.
        return response.choices[0].message.content or ''

    def score_output(self, output, expected) -> float:
        """Score one model output against the expected answer.

        Default metric: 1.0 when the two match exactly after trimming
        surrounding whitespace, otherwise 0.0. Override in a subclass for a
        task-specific metric.
        """
        return 1.0 if str(output).strip() == str(expected).strip() else 0.0

    def test_all_models(self, test_cases: list[dict]) -> dict:
        """Run every prompt/model combination over ``test_cases``.

        Args:
            test_cases: dicts with an ``'input'`` mapping (used to fill the
                prompt template via ``str.format``) and an ``'expected'``
                value passed to ``score_output``.

        Returns:
            ``{model_name: {prompt_name: {'avg_score': float, 'scores': list}}}``.
            ``avg_score`` is NaN when ``test_cases`` is empty.
        """
        results = {}
        for model_name, model_config in self.MODELS.items():
            per_prompt = {}
            for prompt_name, prompt_template in self.prompts.items():
                scores = []
                for case in test_cases:
                    rendered = prompt_template.format(**case['input'])
                    output = self._complete(model_name, model_config, rendered)
                    scores.append(self.score_output(output, case['expected']))
                per_prompt[prompt_name] = {
                    # Plain arithmetic mean; NaN marks "nothing was scored".
                    'avg_score': sum(scores) / len(scores) if scores else float('nan'),
                    'scores': scores,
                }
            results[model_name] = per_prompt
        return results

    def recommendation_report(self, results: dict) -> str:
        """Build a markdown report naming the best-scoring model per prompt.

        Args:
            results: the structure returned by ``test_all_models``.

        Returns:
            Markdown text with one section per prompt, listing each model's
            average score and its gap to the best model.
        """
        report_lines = ["## Cross-Model Recommendation Report\n"]
        for prompt_name in self.prompts:
            scores_by_model = {
                model: results[model][prompt_name]['avg_score']
                for model in results
            }
            report_lines.append(f"\n### Prompt: {prompt_name}")
            if not scores_by_model:
                # max() on an empty mapping would raise ValueError.
                report_lines.append("- No results available")
                continue
            best_model = max(scores_by_model, key=scores_by_model.get)
            best_score = scores_by_model[best_model]
            report_lines.append(f"- Best model: {best_model} ({best_score:.2f})")
            for model, score in scores_by_model.items():
                delta = score - best_score
                report_lines.append(f" - {model}: {score:.2f} ({delta:+.2f})")
        return "\n".join(report_lines)
```
**Failure mode:** Naive cross-model testing assumes that every model handles the same input formatting in the same way. Formatting tokens such as markdown headers (`###`) carry different semantic weight across models: a prompt built on markdown syntax may function as intended for GPT-4o but degrade to noise for Claude.
```python
# Model-specific formatting markers substituted into a generic prompt template
# so that each model receives the delimiters it is configured for here.
FORMAT_VARIANTS = {
    'claude': {
        'section_marker': '\n\nObservation:',
        'list_marker': '•',
        'conclusion_marker': '\n\nFinal Answer:',
    },
    'gpt4o': {
        'section_marker': '\n\n---',
        'list_marker': '-',
        'conclusion_marker': '\n\n**[FINAL]**',
    },
    'deepseek': {
        'section_marker': '\n\n[[SECTION]]',
        'list_marker': '*',
        'conclusion_marker': '\n\n[[ANSWER]]',
    },
}


def render_for_model(prompt_template: str, model_name: str, **kwargs) -> str:
    """Apply model-specific formatting to a generic template.

    Args:
        prompt_template: ``str.format`` template that may reference
            ``section_marker``, ``list_marker``, ``conclusion_marker`` and
            any names supplied through ``kwargs``.
        model_name: key into ``FORMAT_VARIANTS``; unknown names fall back to
            the ``'gpt4o'`` markers.
        **kwargs: extra template fields; a key matching a marker name
            overrides that marker for this call only.

    Returns:
        The rendered prompt.
    """
    base_markers = FORMAT_VARIANTS.get(model_name, FORMAT_VARIANTS['gpt4o'])
    # Merge into a fresh dict: updating the shared FORMAT_VARIANTS entry in
    # place would leak one call's kwargs (and overrides) into later calls.
    fields = {**base_markers, **kwargs}
    return prompt_template.format(**fields)
```
Cross-model testing across 5 tasks revealed that the optimal model varied by task: GPT-4o won on structured-output tasks (4/5), Claude won on creative tasks (2/2 tested), and DeepSeek won on reasoning-heavy code tasks (3/5). This finding contradicts the assumption that a single best model exists.