17. Prompt Compression
Chapter 17 of 18 · 25 min
Beyond cost optimization, prompt compression improves latency and context utilization. Compression techniques range from simple deduplication to advanced semantic summarization.
Lossless Compression
Remove formatting noise without semantic loss:
def remove_formatting_noise(prompt: str) -> str:
    """Strip formatting noise from a prompt while keeping its wording.

    Applied in order:
      1. Three or more consecutive line breaks (optionally separated by
         whitespace) collapse to a single blank line.
      2. Runs of spaces collapse to one space.
      3. Runs of three or more ``*`` collapse to one ``*``; runs of three
         or more ``-``/``=`` collapse to one ``-``.
      4. A leading ``-`` or ``*`` bullet marker is removed from each line.

    Args:
        prompt: The prompt text to clean up.

    Returns:
        The cleaned prompt with leading and trailing whitespace stripped.
    """
    # Remove excessive whitespace
    compressed = re.sub(r'\n\s*\n\s*\n', '\n\n', prompt)
    compressed = re.sub(r' +', ' ', compressed)
    # Remove redundant structure markers
    compressed = re.sub(r'\*{3,}', '*', compressed)  # *** → *
    compressed = re.sub(r'[-=]{3,}', '-', compressed)  # --- → -
    # Collapse bullet points that don't add structure.
    # Match horizontal whitespace only: `\s` also matches newlines, which
    # would swallow the blank line before a list and glue a lone "-" rule
    # onto the following line.
    compressed = re.sub(r'^[ \t]*[-*][ \t]+', '', compressed, flags=re.MULTILINE)
    return compressed.strip()
Semantic Compression
Replace verbose phrases with equivalent shorter forms:
# Phrase substitution table: each key is a wordy phrase, each value is a
# shorter phrase with the same meaning.
COMPRESSION_MAP = {
    "in order to": "to",
    "due to the fact that": "because",
    "at this point in time": "now",
    "in the event that": "if",
    "with regard to": "about",
    "a large number of": "many",
    "at the present time": "now",
    "has the ability to": "can",
}
def semantic_compress(prompt: str, phrase_map: "dict[str, str] | None" = None) -> str:
    """Replace verbose phrases in *prompt* with shorter equivalents.

    Matching is case-insensitive and restricted to whole phrases, so
    "In order to" at the start of a sentence is compressed while
    "begin order to" is left untouched. When the matched text starts with
    an uppercase letter, the replacement is capitalised to match.

    All phrases are substituted in a single pass; text produced by one
    replacement is not re-scanned for further phrases.

    Args:
        prompt: Text to compress.
        phrase_map: Optional mapping of verbose phrase -> concise phrase.
            Defaults to the module-level ``COMPRESSION_MAP``.

    Returns:
        The prompt with every known verbose phrase replaced.
    """
    import re  # local import: this block did not previously depend on `re`

    mapping = COMPRESSION_MAP if phrase_map is None else phrase_map
    # Empty keys are skipped: an empty alternative would match everywhere.
    lookup = {verbose.lower(): concise for verbose, concise in mapping.items() if verbose}
    if not prompt or not lookup:
        return prompt

    # Longest phrase first so that, if one phrase is a prefix of another,
    # the longer one wins.
    alternatives = sorted(lookup, key=len, reverse=True)
    # Lookarounds instead of `\b` so phrases that start or end with a
    # non-word character are still bounded correctly.
    pattern = re.compile(
        r"(?<!\w)(?:" + "|".join(re.escape(phrase) for phrase in alternatives) + r")(?!\w)",
        re.IGNORECASE,
    )

    def _swap(match):
        found = match.group(0)
        concise = lookup.get(found.lower())
        if concise is None:
            # Case-folding quirk: matched text does not lower() to a key.
            return found
        if concise and found[0].isupper():
            concise = concise[0].upper() + concise[1:]
        return concise

    return pattern.sub(_swap, prompt)
LLMLingua-Style Compression
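Use a small model to identify and remove non-essential content. LLMLingua itself prunes individual tokens; the conceptual sketch below scores whole lines instead: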
# Conceptual implementation of prompt compression via importance scoring
def compress_prompt_llmlingua(prompt, target_tokens, small_model="llama3:3b",
                              importance_threshold=5):
    """Compress *prompt* line by line using importance scores from a small model.

    Each non-blank line is sent to ``ollama.chat`` and rated 0-10. Lines
    rated below ``importance_threshold`` are removed. The remaining lines
    are kept verbatim while they fit in ``target_tokens``; a line that no
    longer fits is replaced by ``summarize_segment(line, small_model)``.

    ``target_tokens`` is a soft target: summaries of important lines are
    still appended once the budget is used up, so the result can exceed it
    by the size of those summaries.

    Args:
        prompt: Text to compress; split on newlines into segments.
        target_tokens: Token budget for verbatim segments, measured with
            ``count_tokens``.
        small_model: Model name passed to ``ollama.chat`` and
            ``summarize_segment``.
        importance_threshold: Minimum score (0-10) a segment needs to be
            kept or summarized. Defaults to 5.

    Returns:
        The surviving segments and summaries joined with newlines.
    """
    import re  # local import: this block did not previously depend on `re`

    # Blank lines carry no task content; dropping them up front avoids one
    # model call per empty line.
    segments = [line for line in prompt.split('\n') if line.strip()]

    # Score each segment for importance
    importance_scores = []
    for segment in segments:
        score_response = ollama.chat(
            model=small_model,
            messages=[{
                "role": "user",
                "content": f"Rate importance of this segment from 0-10 for task completion: '{segment}'"
            }]
        )
        reply = score_response["message"]["content"]
        # Models often answer "7/10" or "Score: 8" rather than a bare
        # number, so take the first integer in the reply instead of int().
        first_number = re.search(r'\d+', reply)
        if first_number:
            score = min(int(first_number.group()), 10)
        else:
            # Unreadable rating: keep the segment rather than silently
            # discarding content.
            score = importance_threshold
        importance_scores.append(score)

    # Keep high-importance segments, summarize those that no longer fit,
    # remove low-importance ones.
    compressed_segments = []
    current_tokens = 0
    for segment, score in zip(segments, importance_scores):
        if score < importance_threshold:
            continue
        segment_tokens = count_tokens(segment)
        if current_tokens + segment_tokens <= target_tokens:
            compressed_segments.append(segment)
            current_tokens += segment_tokens
        else:
            # Summarize remaining content and charge it against the budget
            # so later segments see the real running total.
            # NOTE(review): assumes summarize_segment returns a string.
            summary = summarize_segment(segment, small_model)
            compressed_segments.append(summary)
            current_tokens += count_tokens(summary)
    return '\n'.join(compressed_segments)
Context Window Optimization
For long conversations, compress history:
# Simple conversation compression
def compress_conversation(messages, target_tokens):
    """Compress conversation history to fit target token count.

    The first message is always kept (it is assumed to be the system
    prompt). Walking backwards from the newest message, messages are kept
    verbatim while they fit in the remaining budget. The first message
    that does not fit is replaced by a summary sized to the remaining
    budget (at most 100 tokens), and everything older is dropped.

    Args:
        messages: List of dicts with at least ``'role'`` and ``'content'``
            keys, oldest first.
        target_tokens: Token budget for the whole returned conversation,
            measured with ``count_tokens``.

    Returns:
        ``messages`` itself when it is empty or already fits; otherwise a
        new list ``[first_message, *kept_messages]`` in chronological
        order.
    """
    if not messages:
        return messages

    # Count each message once; the counts are reused below.
    token_counts = [count_tokens(m['content']) for m in messages]
    if sum(token_counts) <= target_tokens:
        return messages

    # Keep first message (system) and last messages
    system_msg = messages[0]
    available_tokens = target_tokens - token_counts[0]

    # Collected newest-first and reversed once at the end, which avoids
    # repeated O(n) insert(0, ...) calls.
    kept_newest_first = []
    recent_first = zip(reversed(messages[1:]), reversed(token_counts[1:]))
    for msg, msg_tokens in recent_first:
        if msg_tokens <= available_tokens:
            kept_newest_first.append(msg)
            available_tokens -= msg_tokens
            continue
        if available_tokens > 0:
            # Summarize this message, capped by what is actually left so
            # the summary cannot push the result past the target.
            # NOTE(review): assumes summarize_message honours target_tokens.
            summary = summarize_message(msg, target_tokens=min(100, available_tokens))
            kept_newest_first.append({"role": msg['role'], "content": summary})
        break

    kept_newest_first.reverse()
    return [system_msg] + kept_newest_first
Testing Compressed Prompts
Always validate that compressed prompts produce equivalent outputs:
def validate_compression(original, compressed, test_cases, min_similarity=0.90):
    """Verify compressed prompt maintains quality.

    For every test case, an output is generated from both prompts with
    ``generate``; the two outputs are embedded with ``embed`` and compared
    with ``cosine_similarity``. The check passes when the mean similarity
    is strictly greater than ``min_similarity``.

    Args:
        original: The uncompressed prompt.
        compressed: The compressed prompt.
        test_cases: Iterable of inputs passed to ``generate``. It is
            materialised once, so a generator is safe to pass.
        min_similarity: Mean similarity the outputs must exceed.
            Defaults to 0.90.

    Returns:
        A dict with:
          * ``"pass"`` - whether the mean similarity exceeds the threshold.
            Always ``False`` when there are no test cases, since nothing
            was validated.
          * ``"avg_similarity"`` - mean similarity (0.0 with no test cases).
          * ``"token_reduction"`` - ``len(original) / len(compressed)``.
            Despite the key name this is a character-length ratio, not a
            token count; values above 1 mean the compressed prompt is
            shorter. ``inf`` when only ``compressed`` is empty, 1.0 when
            both are empty.
    """
    # Guard the ratio against an empty compressed prompt.
    if compressed:
        size_ratio = len(original) / len(compressed)
    else:
        size_ratio = float("inf") if original else 1.0

    # Materialise once: the cases are iterated twice below, and a generator
    # would be exhausted after the first pass.
    cases = list(test_cases)
    if not cases:
        return {
            "pass": False,
            "avg_similarity": 0.0,
            "token_reduction": size_ratio,
        }

    original_outputs = [generate(original, tc) for tc in cases]
    compressed_outputs = [generate(compressed, tc) for tc in cases]
    similarity_scores = [
        cosine_similarity(embed(o1), embed(o2))
        for o1, o2 in zip(original_outputs, compressed_outputs)
    ]
    avg_similarity = sum(similarity_scores) / len(similarity_scores)
    return {
        "pass": avg_similarity > min_similarity,
        "avg_similarity": avg_similarity,
        "token_reduction": size_ratio,
    }
EXERCISE
Implement a compression pipeline that reduces a verbose prompt by at least 40% while maintaining >95% semantic similarity on test cases. Verify using embedding similarity.