Document Metadata Extraction — RAG Systems: Part 1 (Chapter 9)

Metadata transforms a retrieval system from a search engine into a research tool. With metadata, you can filter by source, date, or section. You can trace answers back to original documents. You can handle multi-document queries that span related sources.

Why metadata matters

Consider two queries:

"What was the Q3 revenue?" - Without metadata, you might retrieve any Q3 data. With year metadata, you filter to 2024 Q3 specifically.
"Compare the return policies across regions" - With region metadata, you retrieve only relevant regional documents instead of mixing all policies together.

Extracting document-level metadata

from dataclasses import dataclass, asdict
from datetime import datetime
from pathlib import Path

@dataclass
class DocumentMetadata:
    """Document-level metadata record describing a single source file."""

    # Identifier of the originating document (e.g. a file path).
    source: str
    # Timestamp at which the document was created.
    created_at: datetime
    # Timestamp of the document's last modification.
    modified_at: datetime
    # File type indicator, such as the file extension.
    file_type: str
    # File size (presumably in bytes — confirm against the producer).
    file_size: int
    # Human-readable title; empty when unknown.
    title: str = ""
    # Author name; empty when unknown.
    author: str = ""
    # Language code of the content; defaults to "en".
    language: str = "en"

def extract_file_metadata(filepath: str) -> DocumentMetadata:
    """Extract metadata from file system attributes.

    Args:
        filepath: Path to an existing file.

    Returns:
        A DocumentMetadata holding the absolute path, creation and
        modification timestamps (naive datetimes in local time), the
        lowercased file suffix (including the leading dot), the size in
        bytes, and the file stem as the title.

    Raises:
        OSError: If the file cannot be stat'ed (for example
            FileNotFoundError when it does not exist).
    """
    path = Path(filepath)
    stat = path.stat()

    # st_ctime is the creation time only on Windows; on Unix it is the time
    # of the last inode (metadata) change. Prefer the real birth time where
    # the platform exposes it and fall back to st_ctime otherwise.
    try:
        created_ts = stat.st_birthtime
    except AttributeError:
        created_ts = stat.st_ctime

    return DocumentMetadata(
        source=str(path.absolute()),
        created_at=datetime.fromtimestamp(created_ts),
        modified_at=datetime.fromtimestamp(stat.st_mtime),
        file_type=path.suffix.lower(),
        file_size=stat.st_size,
        title=path.stem,  # Filename without extension
    )

Extracting from document content

For PDFs and other formats that embed their own properties (title, author, and so on), extract metadata from the document itself.

import fitz

def extract_pdf_metadata(filepath: str) -> dict:
    """Extract metadata from PDF document properties.

    Args:
        filepath: Path to the PDF file.

    Returns:
        A dict with the string properties "title", "author", "subject",
        "keywords", "creator" and "producer" (empty string when the PDF
        does not set them), plus "page_count" and "encrypted".
    """
    text_keys = ("title", "author", "subject", "keywords", "creator", "producer")

    doc = fitz.open(filepath)
    try:
        # The metadata mapping itself, or any individual value, may be None;
        # normalise both cases so callers always receive strings.
        props = doc.metadata or {}
        metadata = {key: props.get(key) or "" for key in text_keys}
        metadata["page_count"] = len(doc)
        metadata["encrypted"] = doc.is_encrypted
    finally:
        # Release the file handle even if reading a property fails.
        doc.close()

    return metadata

Extracting from HTML

HTML has multiple metadata sources: title tag, meta tags, and URL structure.

def extract_html_metadata(html: str, url: str) -> dict:
    """Extract metadata from HTML document.

    Args:
        html: Raw HTML markup.
        url: URL the markup was fetched from.

    Returns:
        A dict with "url", "title", "description", "author",
        "published_date", "language", "domain" and "path". A "keywords"
        entry is added when the page declares one. Missing text values are
        empty strings; "language" falls back to "en".
    """
    from urllib.parse import urlparse

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, "lxml")

    # <title>.string is None when the tag is empty or has nested markup.
    title_text = soup.title.string if soup.title else None

    # The lang attribute lives on the <html> element; the BeautifulSoup root
    # object carries no attributes of its own.
    html_tag = soup.find("html")
    lang = html_tag.get("lang") if html_tag else None

    metadata = {
        "url": url,
        "title": str(title_text) if title_text else "",
        "description": "",
        "author": "",
        "published_date": "",
        "language": lang or "en",
    }

    # Standard <meta name=...> tags and Open Graph <meta property=...> tags
    for tag in soup.find_all("meta"):
        name = tag.get("name") or tag.get("property")
        content = tag.get("content")
        if not (name and content):
            continue

        name = name.lower()
        if name in ("description", "author", "keywords"):
            metadata[name] = content
        elif name == "article:published_time":
            metadata["published_date"] = content

    # Extract domain and path for filtering
    parsed = urlparse(url)
    metadata["domain"] = parsed.netloc
    metadata["path"] = parsed.path

    return metadata

Extracting from Markdown

Markdown metadata lives in frontmatter and heading structure.

def _split_frontmatter(lines: list[str]) -> tuple[dict, int]:
    """Parse a leading ``---`` delimited frontmatter block.

    Returns the parsed ``key: value`` pairs and the index of the first body
    line. When the document does not open with a closed frontmatter block,
    returns ``({}, 0)``.
    """
    first = next((i for i, line in enumerate(lines) if line.strip()), None)
    if first is None or lines[first].strip() != "---":
        return {}, 0

    fields = {}
    for idx in range(first + 1, len(lines)):
        line = lines[idx]
        if line.strip() == "---":
            return fields, idx + 1
        if ":" in line:
            key, value = line.split(":", 1)
            fields[key.strip()] = value.strip()

    # No closing delimiter: the opening "---" was a horizontal rule.
    return {}, 0


def extract_markdown_metadata(md_content: str, filepath: str) -> dict:
    """Extract metadata from Markdown document.

    Args:
        md_content: Full Markdown text.
        filepath: Path recorded under the "source" key.

    Returns:
        A dict with "source", "title" (first level-1 heading), "headings"
        (ATX heading lines outside code fences), "code_blocks" (number of
        fenced blocks), "word_count" and "has_code". Frontmatter keys are
        merged on top, so a frontmatter "title" overrides the heading title.
    """
    metadata = {
        "source": filepath,
        "title": "",
        "headings": [],
        "code_blocks": 0,
        "word_count": 0,
        "has_code": False,
    }

    lines = md_content.split("\n")

    # Frontmatter is only recognised at the top of the document, so a "---"
    # horizontal rule further down cannot swallow the content after it.
    frontmatter, body_start = _split_frontmatter(lines)

    in_code_block = False
    for line in lines[body_start:]:
        if line.lstrip().startswith("```"):
            # Count each fenced block once, on its opening fence.
            if not in_code_block:
                metadata["code_blocks"] += 1
            in_code_block = not in_code_block
            continue

        # "#" lines inside a fence are code comments, not headings.
        if in_code_block:
            continue

        # ATX heading: one to six "#" characters followed by whitespace.
        hashes = len(line) - len(line.lstrip("#"))
        if 1 <= hashes <= 6 and line[hashes:hashes + 1] in (" ", "\t"):
            metadata["headings"].append(line.strip())
            if hashes == 1 and not metadata["title"]:
                metadata["title"] = line[hashes:].strip()

    # Apply frontmatter
    metadata.update(frontmatter)

    # Word count over the raw text with fence and heading markers removed
    text_only = md_content.replace("```", "").replace("#", "")
    metadata["word_count"] = len(text_only.split())
    metadata["has_code"] = metadata["code_blocks"] > 0

    return metadata

Chunk-level metadata

Each chunk needs metadata that links it back to the document and its position within it.

@dataclass
class ChunkMetadata:
    """Metadata linking a chunk back to its document and its position in it."""

    # Position of this chunk within the document's chunk sequence.
    chunk_index: int
    # Total number of chunks the document was split into.
    total_chunks: int
    # Identifier of the document the chunk was taken from.
    source_document: str
    # Character offset where the chunk starts (presumably into the source
    # document text — confirm against the chunker).
    start_char: int
    # Character offset where the chunk ends (inclusive/exclusive not shown
    # here — confirm against the chunker).
    end_char: int
    # Heading of the section the chunk belongs to; empty when unknown.
    section_heading: str = ""
    # Page number of the chunk; defaults to 0.
    page_number: int = 0

def create_chunk_metadata(
    document_metadata: dict,
    chunk_text: str,
    chunk_index: int,
    total_chunks: int
) -> dict:
    """Create metadata for a single chunk.

    Args:
        document_metadata: Document-level metadata; every key is copied
            into the result.
        chunk_text: Text of the chunk.
        chunk_index: Position of the chunk within the document.
        total_chunks: Number of chunks the document was split into.

    Returns:
        The document metadata extended with "chunk_index", "total_chunks",
        "source_document", "source_title", "section_heading",
        "chunk_size_chars" and "chunk_size_tokens". Chunk-level keys take
        precedence over same-named document keys.
    """
    # Imported locally so the function is self-contained.
    import re

    import tiktoken

    # Section heading: the first Markdown heading line in the chunk. The
    # pattern is anchored to the start of a line so text such as "C# code"
    # is not mistaken for a heading, and it also matches a heading on the
    # chunk's final line where no trailing newline follows.
    heading_match = re.search(
        r"^#{1,6}[ \t]+(.+?)[ \t\r]*$", chunk_text, re.MULTILINE
    )
    section = heading_match.group(1) if heading_match else ""

    encoding = tiktoken.get_encoding("cl100k_base")

    return {
        # Spread first so the chunk-specific values below cannot be
        # overwritten by a document key with the same name.
        **document_metadata,
        "chunk_index": chunk_index,
        "total_chunks": total_chunks,
        "source_document": document_metadata.get("source", "unknown"),
        "source_title": document_metadata.get("title", ""),
        "section_heading": section,
        "chunk_size_chars": len(chunk_text),
        "chunk_size_tokens": len(encoding.encode(chunk_text)),
    }

Language detection

For multilingual corpora, detect the language of each document (or chunk) and store it in metadata so that queries can filter by language.

from langdetect import detect, detect_langs

def detect_language(text: str) -> dict:
    """Detect language of text with confidence.

    Args:
        text: Text to classify.

    Returns:
        A dict with "language" (the detected language code, or "unknown")
        and "confidence" (probability rounded to three decimals, or 0.0).
        Text shorter than 20 non-whitespace-trimmed characters is reported
        as unknown without running detection.
    """
    # Very short text gives unreliable results; skip detection entirely.
    if len(text.strip()) < 20:
        return {"language": "unknown", "confidence": 0.0}

    try:
        detections = detect_langs(text)
    except Exception:
        # Detection is best effort: any detector failure maps to "unknown".
        # Unlike a bare except, this lets KeyboardInterrupt and SystemExit
        # propagate.
        return {"language": "unknown", "confidence": 0.0}

    if not detections:
        return {"language": "unknown", "confidence": 0.0}

    primary = detections[0]
    return {
        "language": primary.lang,
        "confidence": round(primary.prob, 3),
    }

Storing metadata in ChromaDB

ChromaDB accepts metadata as a dictionary alongside embeddings.

import chromadb
from chromadb.config import Settings

def store_with_metadata(
    collection_name: str,
    chunks: list[dict]
):
    """Store chunks with metadata in ChromaDB.

    Args:
        collection_name: Name of the collection to create or reuse.
        chunks: Dicts with "text", "metadata" and "embedding" keys. An
            optional "id" key supplies a caller-chosen ID; otherwise a
            unique one is generated.

    Returns:
        The ChromaDB collection the chunks were added to.
    """
    import uuid

    # PersistentClient writes to disk; passing persist_directory through
    # Settings alone leaves the client in-memory on current Chroma versions.
    client = chromadb.PersistentClient(path="./chroma_db")

    collection = client.get_or_create_collection(collection_name)

    # Adding an empty batch is rejected by Chroma, so there is nothing to do.
    if not chunks:
        return collection

    # Positional IDs ("chunk_0", "chunk_1", ...) would repeat on every call
    # and collide with chunks already stored in the collection.
    ids = [c.get("id") or f"chunk_{uuid.uuid4().hex}" for c in chunks]
    documents = [c["text"] for c in chunks]
    metadatas = [c["metadata"] for c in chunks]
    embeddings = [c["embedding"] for c in chunks]

    collection.add(
        ids=ids,
        documents=documents,
        metadatas=metadatas,
        embeddings=embeddings
    )

    return collection

Filtering by metadata

The main benefit of metadata is filtering during retrieval.

def filtered_query(
    collection,
    query_embedding: list,
    filters: dict,
    top_k: int = 5
):
    """Query with metadata filters.

    Args:
        collection: Collection object exposing a ``query`` method.
        query_embedding: Embedding vector of the query.
        filters: Mapping of metadata key to condition. A list value means
            "any of these" (``$in``); a dict value is passed through as an
            operator expression; any other value is an equality match.
            Empty or None disables filtering.
        top_k: Number of results to return.

    Returns:
        Whatever ``collection.query`` returns.
    """
    conditions = [
        {key: ({"$in": value} if isinstance(value, list) else value)}
        for key, value in (filters or {}).items()
    ]

    # A where clause may hold only one top-level expression, so several
    # conditions have to be combined explicitly with $and.
    if not conditions:
        where_clause = None
    elif len(conditions) == 1:
        where_clause = conditions[0]
    else:
        where_clause = {"$and": conditions}

    return collection.query(
        query_embeddings=[query_embedding],
        n_results=top_k,
        where=where_clause
    )

# Example filters (assumes `collection` and `query_emb` are already defined)
# Only documents from 2024
results = filtered_query(collection, query_emb, {"year": 2024})

# Only specific source files
results = filtered_query(collection, query_emb, {"source": {"$in": ["file1.pdf", "file2.pdf"]}})