03. PDF Ingestion with PyMuPDF

Chapter 3 of 22 · 25 min

PDFs are the most common document format in enterprise settings. PyMuPDF (imported under its historical module name, fitz) is one of the fastest Python libraries for PDF text extraction. It generally handles complex layouts, images, and tables better than alternatives like pdfplumber or PyPDF2.

Installation and setup

pip install pymupdf

Verify installation:

# Import the PyMuPDF module and print the version string it reports.
import fitz
print(f"PyMuPDF version: {fitz.__version__}")

Basic text extraction

import fitz

def extract_text_from_pdf(filepath: str) -> list[dict]:
    """Extract the plain text of every page in a PDF.

    Args:
        filepath: Path of the PDF file to open.

    Returns:
        One dict per page, in document order, with the keys
        ``page_number`` (1-based), ``text`` (the result of
        ``page.get_text()``) and ``source`` (``filepath`` unchanged).
    """
    # The context manager closes the document even when extracting a page
    # raises, so the underlying file handle is never leaked.
    with fitz.open(filepath) as doc:
        return [
            {
                "page_number": page_num,
                "text": page.get_text(),
                "source": filepath,
            }
            for page_num, page in enumerate(doc, start=1)
        ]

This returns a list of dictionaries, one per page. Each dictionary contains the page number, raw text, and source file path.

Handling multi-column layouts

Corporate PDFs often have two-column layouts. Basic get_text() returns text in the order it is stored in the file, which is not necessarily the natural reading order, so paragraph continuity can break in multi-column documents.

def extract_with_layout(filepath: str, zone_height: float = 50) -> list[dict]:
    """Extract page text with the text blocks re-ordered by position.

    Each page's blocks are grouped into horizontal bands ``zone_height``
    units tall (keyed on the block's y-coordinate) and emitted band by band
    from the top of the page, left to right within a band.

    Args:
        filepath: Path of the PDF file to open.
        zone_height: Height of one vertical band, in the same units as the
            block coordinates. Larger values put more blocks into the same
            band. Must be positive. Defaults to 50.

    Returns:
        One dict per page, in document order, with the keys
        ``page_number`` (1-based), ``text`` (block texts joined with
        newlines, then stripped) and ``source`` (``filepath`` unchanged).

    Raises:
        ValueError: If ``zone_height`` is zero or negative.
    """
    if zone_height <= 0:
        raise ValueError(f"zone_height must be positive, got {zone_height}")

    def band_then_column(block):
        # block[0] is the x-coordinate and block[1] the y-coordinate of the
        # block; rounding y / zone_height buckets blocks into bands.
        return (round(block[1] / zone_height), block[0])

    pages = []
    # The context manager guarantees the document is closed even if a page
    # fails part-way through extraction.
    with fitz.open(filepath) as doc:
        for page_num, page in enumerate(doc, start=1):
            # "blocks" mode yields tuples carrying position info; index 4
            # holds the block's text.
            ordered = sorted(page.get_text("blocks"), key=band_then_column)
            text = "\n".join(block[4] for block in ordered)
            pages.append({
                "page_number": page_num,
                "text": text.strip(),
                "source": filepath,
            })

    return pages

The round(b[1] / 50) groups text by vertical zone. Adjust the divisor based on your PDF's font sizes. Larger divisor = more aggressive grouping.

Extracting tables

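Tables require special handling. Plain text extraction with get_text() returns the contents of the cells but not the row and column structure. Use tabula-py or camelot for table extraction (recent PyMuPDF versions also provide Page.find_tables()), or accept that tables will appear as space-separated text.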

def extract_tables_as_text(filepath: str) -> list[dict]:
    """Collect the lines of each page that look like table rows.

    This is a plain-text heuristic, not real table detection: a line is
    kept when it contains a box-drawing bar ("│"), a pipe ("|") or two
    consecutive spaces.

    Args:
        filepath: Path of the PDF file to open.

    Returns:
        One dict for every page that has at least one matching line, with
        the keys ``page_number`` (1-based), ``table_text`` (the matching
        lines joined with newlines) and ``source`` (``filepath``
        unchanged). Pages without matching lines are omitted.
    """
    import fitz

    # Any one of these substrings marks a line as a candidate table row.
    row_markers = ("│", "|", "  ")

    tables = []
    # Use a context manager so the document is always closed; the handle
    # must not stay open after the function returns.
    with fitz.open(filepath) as doc:
        for page_num, page in enumerate(doc, start=1):
            lines = page.get_text("text").split("\n")
            table_rows = [
                line
                for line in lines
                if any(marker in line for marker in row_markers)
            ]

            if table_rows:
                tables.append({
                    "page_number": page_num,
                    "table_text": "\n".join(table_rows),
                    "source": filepath,
                })

    return tables

Error handling

PDF extraction fails in predictable ways. Handle these cases:

import fitz
from pathlib import Path

def safe_extract_pdf(filepath: str | Path) -> list[dict]:
    """Extract text with thorough error handling."""
    filepath = Path(filepath)

    if not filepath.exists():
        raise FileNotFoundError(f"PDF not found: {filepath}")

    if filepath.stat().st_size == 0:
        raise ValueError(f"PDF is empty: {filepath}")

    try:
        doc = fitz.open(filepath)
    except fitz.FileDataError:
        raise ValueError(f"Corrupted or password-protected PDF: {filepath}")

    pages = []
    for page_num, page in enumerate(doc):
        try:
            text = page.get_text()
            if text.strip():  # Skip blank pages
                pages.append({
                    "page_number": page_num + 1,
                    "text": text,
                    "source": str(filepath)
                })
        except Exception as e:
            print(f"Warning: Failed to extract page {page_num + 1}: {e}")

    doc.close()

    if not pages:
        raise ValueError(f"No extractable text in PDF: {filepath}")

    return pages

Batch processing multiple PDFs

from pathlib import Path

def ingest_pdf_directory(directory: str) -> list[dict]:
    """Ingest all PDFs from a directory."""
    pdf_dir = Path(directory)
    all_pages = []

    for pdf_path in pdf_dir.glob("*.pdf"):
        try:
            pages = safe_extract_pdf(pdf_path)
            all_pages.extend(pages)
            print(f"Extracted {len(pages)} pages from {pdf_path.name}")
        except Exception as e:
            print(f"Error processing {pdf_path.name}: {e}")

    return all_pages

Testing your extraction

Always verify extraction quality. Read a few extracted pages and check for:

  • Missing text (scanned PDFs need OCR)
  • Garbled characters (encoding issues)
  • Split paragraphs (layout detection failure)
  • Missing headers and footers (sometimes intentional)
def preview_extraction(pages: list[dict], n: int = 3):
    """Print a short preview of the leading extracted pages.

    For each of the first ``n`` entries in ``pages``, prints a header line
    with the page number and source, then at most 500 characters of the
    page text, then a blank line.

    Args:
        pages: Page dicts with ``page_number``, ``source`` and ``text`` keys.
        n: How many entries from the start of ``pages`` to show.
    """
    preview_chars = 500  # Upper bound on text shown per page
    for entry in pages[:n]:
        header = f"=== Page {entry['page_number']} from {entry['source']} ==="
        print(header)
        snippet = entry["text"][:preview_chars]
        print(snippet)
        print()
EXERCISE

Download a multi-column PDF (try a research paper from arxiv.org). Extract the text and compare basic extraction vs layout-preserved extraction. Identify at least one paragraph that gets broken by basic extraction but preserved by layout-aware extraction.