03. PDF Ingestion with PyMuPDF
PDFs are the most common document format in enterprise settings. PyMuPDF (imported under the module name fitz) is the fastest Python library for PDF text extraction. It handles complex layouts, images, and tables better than alternatives like pdfplumber or PyPDF2.
Installation and setup
pip install pymupdf
Verify installation:
import fitz
# Print the version string exposed by the imported module to confirm the install works.
print(f"PyMuPDF version: {fitz.__version__}")
Basic text extraction
import fitz
def extract_text_from_pdf(filepath: str) -> list[dict]:
    """Extract the plain text of every page in a PDF.

    Args:
        filepath: Path of the PDF file to open.

    Returns:
        One dictionary per page, in document order, with the keys
        ``page_number`` (1-based), ``text`` (the result of
        ``page.get_text()``), and ``source`` (``filepath`` as given).
    """
    # The context manager closes the document even when a page fails to
    # extract, so the file handle is never leaked.
    with fitz.open(filepath) as doc:
        return [
            {
                "page_number": page_number,
                "text": page.get_text(),
                "source": filepath,
            }
            for page_number, page in enumerate(doc, start=1)
        ]
This returns a list of dictionaries, one per page. Each dictionary contains the page number, raw text, and source file path.
Handling multi-column layouts
Corporate PDFs often have two-column layouts. Basic get_text() returns text in the order it is stored in the PDF file, which is not always the natural reading order, so paragraph continuity can break in multi-column documents.
def extract_with_layout(filepath: str, *, zone_height: float = 50) -> list[dict]:
    """Extract page text with its blocks re-ordered by position on the page.

    Each text block is assigned to a horizontal band using
    ``round(y0 / zone_height)``, where ``y0`` is the block's top edge.
    Bands are emitted top to bottom and the blocks inside one band left to
    right. Note that blocks from side-by-side columns that fall into the
    same band are therefore emitted next to each other.

    Args:
        filepath: Path of the PDF file to open.
        zone_height: Band height used for the vertical grouping (default
            50, in page coordinate units). A larger value places more
            blocks into the same band. Must be positive.

    Returns:
        One dictionary per page, in document order, with the keys
        ``page_number`` (1-based), ``text`` (the re-ordered block texts
        joined by newlines and stripped), and ``source`` (``filepath``).

    Raises:
        ValueError: If ``zone_height`` is not positive.
    """
    if zone_height <= 0:
        raise ValueError(f"zone_height must be positive, got {zone_height}")

    pages = []
    # The context manager closes the document even if extraction fails.
    with fitz.open(filepath) as doc:
        for page_number, page in enumerate(doc, start=1):
            # Each block is (x0, y0, x1, y1, text, block_no, block_type).
            blocks = page.get_text("blocks")
            # Keep text blocks only (block_type 0); an image block, when
            # present, carries a placeholder description, not page text.
            text_blocks = [block for block in blocks if block[6] == 0]
            text_blocks.sort(
                key=lambda block: (round(block[1] / zone_height), block[0])
            )
            text = "\n".join(block[4] for block in text_blocks)
            pages.append({
                "page_number": page_number,
                "text": text.strip(),
                "source": filepath,
            })
    return pages
The round(b[1] / 50) groups text by vertical zone. Adjust the divisor based on your PDF's font sizes. Larger divisor = more aggressive grouping.
Extracting tables
Tables require special handling. PyMuPDF extracts table structure but not cell boundaries. Use tabula-py or camelot for table extraction, or accept that tables will appear as space-separated text.
def extract_tables_as_text(filepath: str) -> list[dict]:
    """Collect the lines of each page that look like table rows.

    This is a text-pattern heuristic, not real table detection: a line is
    treated as a candidate row when it contains a box-drawing bar, a pipe
    character, or a run of two or more spaces (a gap between columns).

    Args:
        filepath: Path of the PDF file to open.

    Returns:
        One dictionary for every page that has at least one candidate
        row, with the keys ``page_number`` (1-based), ``table_text`` (the
        candidate rows joined by newlines), and ``source`` (``filepath``).
    """
    import fitz

    # A single space would match almost every line of ordinary prose, so
    # the spacing marker requires at least two consecutive spaces.
    column_markers = ("│", "|", "  ")

    tables = []
    # The context manager guarantees the document is closed on every path.
    with fitz.open(filepath) as doc:
        for page_number, page in enumerate(doc, start=1):
            lines = page.get_text("text").split("\n")
            table_rows = [
                line
                for line in lines
                if any(marker in line for marker in column_markers)
            ]
            if table_rows:
                tables.append({
                    "page_number": page_number,
                    "table_text": "\n".join(table_rows),
                    "source": filepath,
                })
    return tables
Error handling
PDF extraction fails in predictable ways. Handle these cases:
import fitz
from pathlib import Path
def safe_extract_pdf(filepath: str | Path) -> list[dict]:
"""Extract text with thorough error handling."""
filepath = Path(filepath)
if not filepath.exists():
raise FileNotFoundError(f"PDF not found: {filepath}")
if filepath.stat().st_size == 0:
raise ValueError(f"PDF is empty: {filepath}")
try:
doc = fitz.open(filepath)
except fitz.FileDataError:
raise ValueError(f"Corrupted or password-protected PDF: {filepath}")
pages = []
for page_num, page in enumerate(doc):
try:
text = page.get_text()
if text.strip(): # Skip blank pages
pages.append({
"page_number": page_num + 1,
"text": text,
"source": str(filepath)
})
except Exception as e:
print(f"Warning: Failed to extract page {page_num + 1}: {e}")
doc.close()
if not pages:
raise ValueError(f"No extractable text in PDF: {filepath}")
return pages
Batch processing multiple PDFs
from pathlib import Path
def ingest_pdf_directory(directory: str) -> list[dict]:
    """Ingest every ``*.pdf`` file found directly inside a directory.

    Files are processed in sorted path order so repeated runs produce the
    same page order. A file that fails to extract is reported with a
    printed error and skipped; the remaining files are still processed.

    Args:
        directory: Directory to scan (not recursive).

    Returns:
        The page dictionaries of all successfully extracted files,
        concatenated in processing order.
    """
    all_pages = []
    # glob() yields entries in arbitrary order; sort for reproducibility.
    for pdf_path in sorted(Path(directory).glob("*.pdf")):
        try:
            pages = safe_extract_pdf(pdf_path)
        except Exception as e:
            # Batch boundary: report the failure and continue with the rest.
            print(f"Error processing {pdf_path.name}: {e}")
        else:
            all_pages.extend(pages)
            print(f"Extracted {len(pages)} pages from {pdf_path.name}")
    return all_pages
Testing your extraction
Always verify extraction quality. Read a few extracted pages and check for:
- Missing text (scanned PDFs need OCR)
- Garbled characters (encoding issues)
- Split paragraphs (layout detection failure)
- Missing headers and footers (sometimes intentional)
def preview_extraction(pages: list[dict], n: int = 3):
    """Print a short preview of the first ``n`` extracted pages.

    For each page a header line with the page number and source is
    printed, followed by at most the first 500 characters of the page
    text and one blank line.
    """
    preview_length = 500
    for entry in pages[:n]:
        header = f"=== Page {entry['page_number']} from {entry['source']} ==="
        print(header)
        snippet = entry["text"][:preview_length]
        # The double newline ends the snippet line and adds the blank separator.
        print(snippet, end="\n\n")
Download a multi-column PDF (try a research paper from arxiv.org). Extract the text and compare basic extraction vs layout-preserved extraction. Identify at least one paragraph that gets broken by basic extraction but preserved by layout-aware extraction.