09. Document Metadata Extraction
Metadata transforms a retrieval system from a search engine into a research tool. With metadata, you can filter by source, date, or section. You can trace answers back to original documents. You can handle multi-document queries that span related sources.
Why metadata matters
Consider two queries:
"What was the Q3 revenue?" - Without metadata, you might retrieve any Q3 data. With year metadata, you filter to 2024 Q3 specifically.
"Compare the return policies across regions" - With region metadata, you retrieve only relevant regional documents instead of mixing all policies together.
Extracting document-level metadata
from dataclasses import dataclass, asdict
from datetime import datetime
from pathlib import Path
@dataclass
class DocumentMetadata:
    """Document-level metadata record.

    The first five fields are required. ``title``, ``author`` and
    ``language`` are optional and fall back to the defaults below.
    """

    # Required fields
    source: str
    created_at: datetime
    modified_at: datetime
    file_type: str
    file_size: int

    # Optional fields
    title: str = ""
    author: str = ""
    language: str = "en"
def extract_file_metadata(filepath: str) -> DocumentMetadata:
    """Extract metadata from file system attributes.

    Args:
        filepath: Path to an existing file.

    Returns:
        A DocumentMetadata whose ``source`` is the absolute path,
        ``file_type`` the lower-cased suffix (including the dot),
        ``file_size`` the size in bytes and ``title`` the filename stem.
        Timestamps are naive datetimes in local time.

    Raises:
        OSError: If the file cannot be stat'ed (e.g. it does not exist).
    """
    path = Path(filepath)
    stat = path.stat()

    # On Unix, st_ctime is the time of the last inode/metadata change, not
    # the creation time. Use the real birth time where the platform exposes
    # it and fall back to st_ctime otherwise.
    created_ts = getattr(stat, "st_birthtime", stat.st_ctime)

    return DocumentMetadata(
        source=str(path.absolute()),
        created_at=datetime.fromtimestamp(created_ts),
        modified_at=datetime.fromtimestamp(stat.st_mtime),
        file_type=path.suffix.lower(),
        file_size=stat.st_size,
        title=path.stem,  # Filename without extension
    )
Extracting from document content
For PDFs and other formats, extract metadata from the document itself.
import fitz
def extract_pdf_metadata(filepath: str) -> dict:
    """Extract metadata from PDF document properties.

    Args:
        filepath: Path to the PDF file.

    Returns:
        A dict with the string properties ``title``, ``author``,
        ``subject``, ``keywords``, ``creator`` and ``producer`` (empty
        string when absent), plus ``page_count`` and ``encrypted``.
    """
    text_fields = ("title", "author", "subject", "keywords", "creator", "producer")

    # The context manager closes the document even if a property access
    # raises, which the explicit doc.close() call did not guarantee.
    with fitz.open(filepath) as doc:
        # doc.metadata may be None (e.g. password-protected files), and
        # individual entries may be None rather than missing.
        info = doc.metadata or {}
        metadata = {field: info.get(field) or "" for field in text_fields}
        metadata["page_count"] = len(doc)
        metadata["encrypted"] = doc.is_encrypted

    return metadata
Extracting from HTML
HTML has multiple metadata sources: title tag, meta tags, and URL structure.
def extract_html_metadata(html: str, url: str) -> dict:
    """Extract metadata from HTML document.

    Args:
        html: Raw HTML markup.
        url: URL the document was fetched from.

    Returns:
        A dict with ``url``, ``title``, ``description``, ``author``,
        ``published_date``, ``language``, ``domain`` and ``path``.
        ``keywords`` is added when a matching meta tag is present.
    """
    from urllib.parse import urlparse

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, "lxml")

    # .string is None when <title> is empty or contains nested tags.
    title_text = soup.title.string if soup.title else None

    # The lang attribute lives on the <html> element; the BeautifulSoup
    # root object itself has no attributes, so soup.get("lang") would
    # always return the default.
    html_tag = soup.find("html")
    language = html_tag.get("lang") if html_tag else None

    metadata = {
        "url": url,
        "title": str(title_text).strip() if title_text else "",
        "description": "",
        "author": "",
        "published_date": "",
        "language": language or "en",
    }

    # Open Graph and meta tags
    for tag in soup.find_all("meta"):
        name = tag.get("name") or tag.get("property")
        content = tag.get("content")
        if not (name and content):
            continue
        name = name.lower()
        if name in ("description", "author", "keywords"):
            metadata[name] = content
        elif name == "article:published_time":
            metadata["published_date"] = content

    # Extract domain for filtering
    parsed = urlparse(url)
    metadata["domain"] = parsed.netloc
    metadata["path"] = parsed.path
    return metadata
Extracting from Markdown
Markdown metadata lives in frontmatter and heading structure.
def extract_markdown_metadata(md_content: str, filepath: str) -> dict:
    """Extract metadata from Markdown document.

    Frontmatter is only recognised as a ``---`` delimited block that starts
    on the very first line, so horizontal rules later in the body are not
    mistaken for frontmatter. Headings are ATX headings (``#`` to
    ``######``) outside fenced code blocks, and each fenced block is
    counted once.

    Args:
        md_content: Markdown source text.
        filepath: Path stored under the ``source`` key.

    Returns:
        A dict with ``source``, ``title``, ``headings``, ``code_blocks``,
        ``word_count`` and ``has_code``, plus any ``key: value`` pairs
        from the frontmatter. Frontmatter values override the defaults,
        including a title taken from the first ``# `` heading.
        ``word_count`` covers the whole input with fence markers and
        ``#`` characters removed.
    """
    import re

    metadata = {
        "source": filepath,
        "title": "",
        "headings": [],
        "code_blocks": 0,
        "word_count": 0,
        "has_code": False,
    }
    lines = md_content.split("\n")

    # Frontmatter must open on line 1 and be closed by a second "---".
    frontmatter = {}
    body_start = 0
    if lines[0].strip() == "---":
        for idx in range(1, len(lines)):
            if lines[idx].strip() != "---":
                continue
            for fm_line in lines[1:idx]:
                key, sep, value = fm_line.partition(":")
                if sep:
                    frontmatter[key.strip()] = value.strip()
            body_start = idx + 1
            break

    heading_re = re.compile(r"#{1,6}(\s|$)")
    code_block_count = 0
    in_code_block = False
    for line in lines[body_start:]:
        if line.lstrip().startswith("```"):
            # Count a block when its fence opens, not again when it closes.
            if not in_code_block:
                code_block_count += 1
            in_code_block = not in_code_block
            continue
        if in_code_block:
            # "# comment" lines inside code are not headings.
            continue
        if heading_re.match(line):
            metadata["headings"].append(line.strip())
            # Extract title from first level-1 heading
            if line.startswith("# ") and not metadata["title"]:
                metadata["title"] = line[2:].strip()

    metadata["code_blocks"] = code_block_count

    # Apply frontmatter
    metadata.update(frontmatter)

    # Word count
    text_only = md_content.replace("```", "").replace("#", "")
    metadata["word_count"] = len(text_only.split())
    metadata["has_code"] = code_block_count > 0
    return metadata
Chunk-level metadata
Each chunk needs metadata that links it back to the document and its position within it.
@dataclass
class ChunkMetadata:
    """Metadata record for one chunk of a document.

    ``section_heading`` and ``page_number`` are optional and default to an
    empty string and ``0`` respectively.
    """

    # Required fields
    chunk_index: int
    total_chunks: int
    source_document: str
    start_char: int
    end_char: int

    # Optional fields
    section_heading: str = ""
    page_number: int = 0
def create_chunk_metadata(
    document_metadata: dict,
    chunk_text: str,
    chunk_index: int,
    total_chunks: int
) -> dict:
    """Create metadata for a single chunk.

    Args:
        document_metadata: Document-level metadata to carry onto the chunk.
        chunk_text: Text of the chunk.
        chunk_index: Position of the chunk within its document.
        total_chunks: Number of chunks the document was split into.

    Returns:
        The document metadata extended with chunk-specific keys. Where a
        key exists in both, the chunk-specific value wins.
    """
    # Imported locally because the surrounding file does not import them.
    import re

    import tiktoken

    # Extract section heading from chunk (look for last heading). The
    # pattern is anchored to line starts so a "#" in running text is not
    # mistaken for a heading, and it also matches a heading on the final
    # line with no trailing newline.
    headings = re.findall(
        r"^#{1,6}[ \t]+(.+?)[ \t]*\r?$", chunk_text, flags=re.MULTILINE
    )
    section = headings[-1] if headings else ""

    token_count = len(tiktoken.get_encoding("cl100k_base").encode(chunk_text))

    return {
        # Spread first so the chunk-level keys below cannot be overwritten
        # by same-named document-level keys.
        **document_metadata,
        "chunk_index": chunk_index,
        "total_chunks": total_chunks,
        "source_document": document_metadata.get("source", "unknown"),
        "source_title": document_metadata.get("title", ""),
        "section_heading": section,
        "chunk_size_chars": len(chunk_text),
        "chunk_size_tokens": token_count,
    }
Language detection
For multilingual documents, detect the language and store it in metadata.
from langdetect import detect, detect_langs
def detect_language(text: str) -> dict:
    """Detect language of text with confidence.

    Args:
        text: Text to classify.

    Returns:
        A dict with ``language`` and ``confidence`` (rounded to three
        decimals). Texts shorter than 20 non-padding characters, and texts
        for which detection fails, yield ``"unknown"`` with confidence 0.0.
    """
    unknown = {"language": "unknown", "confidence": 0.0}

    if len(text.strip()) < 20:
        return unknown

    from langdetect.lang_detect_exception import LangDetectException

    try:
        detections = detect_langs(text)
    except LangDetectException:
        # Only detection failures are treated as "unknown"; a bare except
        # would also swallow KeyboardInterrupt and programming errors.
        return unknown

    if not detections:
        return unknown

    primary = detections[0]
    return {
        "language": primary.lang,
        "confidence": round(primary.prob, 3),
    }
Storing metadata in ChromaDB
ChromaDB accepts metadata as a dictionary alongside embeddings.
import chromadb
from chromadb.config import Settings
def store_with_metadata(
    collection_name: str,
    chunks: list[dict]
):
    """Store chunks with metadata in ChromaDB.

    Args:
        collection_name: Collection to create or append to.
        chunks: Dicts with ``text``, ``metadata`` and ``embedding`` keys.

    Returns:
        The collection the chunks were added to.

    NOTE(review): Chroma is understood to restrict metadata values to
    simple scalar types, so list or datetime values may need flattening
    first. Confirm against the installed version.
    """
    # PersistentClient writes to disk; Client(Settings(persist_directory=...))
    # does not enable persistence.
    client = chromadb.PersistentClient(path="./chroma_db")
    collection = client.get_or_create_collection(collection_name)

    # collection.add rejects empty batches, so there is nothing to do.
    if not chunks:
        return collection

    # Continue numbering after the existing entries so that a second call
    # (e.g. for another document) does not reuse chunk_0..chunk_n.
    first_id = collection.count()
    ids = [f"chunk_{n}" for n in range(first_id, first_id + len(chunks))]
    documents = [c["text"] for c in chunks]
    metadatas = [c["metadata"] for c in chunks]
    embeddings = [c["embedding"] for c in chunks]

    collection.add(
        ids=ids,
        documents=documents,
        metadatas=metadatas,
        embeddings=embeddings,
    )
    return collection
Filtering by metadata
The main benefit of metadata is filtering during retrieval.
def filtered_query(
    collection,
    query_embedding: list,
    filters: dict,
    top_k: int = 5
):
    """Query with metadata filters.

    Args:
        collection: Object exposing a Chroma-style ``query`` method.
        query_embedding: Embedding vector of the query.
        filters: Mapping of metadata key to a scalar (exact match), a list
            (converted to ``{"$in": [...]}``) or an operator dict that is
            passed through unchanged.
        top_k: Number of results to request.

    Returns:
        Whatever ``collection.query`` returns.
    """
    conditions = [
        {key: {"$in": value} if isinstance(value, list) else value}
        for key, value in filters.items()
    ]

    # A Chroma where-clause may hold only one top-level key, so several
    # conditions have to be combined explicitly with $and.
    if not conditions:
        where_clause = None
    elif len(conditions) == 1:
        where_clause = conditions[0]
    else:
        where_clause = {"$and": conditions}

    return collection.query(
        query_embeddings=[query_embedding],
        n_results=top_k,
        where=where_clause,
    )
# Example filters
# Only documents from 2024
results = filtered_query(collection, query_emb, {"year": 2024})
# Only specific source files
results = filtered_query(collection, query_emb, {"source": {"$in": ["file1.pdf", "file2.pdf"]}})
Create a script that ingests three PDF files from different years with different authors. Extract and print all metadata for each document. Then chunk each document and print the chunk-level metadata for the first chunk of each document. Verify that source tracking works correctly across all chunks.