17. Multi-Format Support
Chapter 17 of 18 · 20 min
Production document pipelines handle PDFs, Word documents, spreadsheets, images, and more. Each format requires different extraction logic. This chapter covers building a unified processing interface.
Format Detection
Determine the file type from the file's content (MIME sniffing via the `magic` module) rather than from its extension, then select the processor:
import magic
def detect_format(filepath):
    """Return a short format label for the file at *filepath*.

    The MIME type reported by ``magic.from_file(..., mime=True)`` is looked
    up in a fixed table; MIME types missing from the table yield
    ``"unknown"``.
    """
    # All supported raster image types collapse onto a single "image" label.
    labels_by_mime = dict.fromkeys(
        ("image/jpeg", "image/png", "image/tiff"), "image"
    )
    labels_by_mime.update(
        {
            "application/pdf": "pdf",
            "application/msword": "doc",
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
            "application/vnd.ms-excel": "xls",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "xlsx",
            "text/plain": "text",
        }
    )

    sniffed = magic.from_file(filepath, mime=True)
    if sniffed in labels_by_mime:
        return labels_by_mime[sniffed]
    return "unknown"
Format-Specific Processors
Implement processors for each format:
class PDFProcessor:
    """Extract plain text and a page count from a PDF via ``pymupdf``."""

    def extract(self, path):
        """Return the text of every page joined by newlines.

        Args:
            path: Location of the PDF, passed straight to ``pymupdf.open``.

        Returns:
            ``{"content": <text>, "metadata": {"pages": <page count>}}``.
        """
        # Open the document as a context manager so its file handle is
        # released even when text extraction raises part-way through.
        with pymupdf.open(path) as doc:
            text = "\n".join(page.get_text() for page in doc)
            # Read the page count while the document is still open.
            page_count = len(doc)
        return {"content": text, "metadata": {"pages": page_count}}
class DOCXProcessor:
    """Extract paragraph text from a document opened with ``docx.Document``."""

    def extract(self, path):
        """Return the paragraph texts joined by newlines plus their count.

        The result has the shape
        ``{"content": <text>, "metadata": {"paragraphs": <count>}}``.
        """
        document = docx.Document(path)
        paragraph_texts = [paragraph.text for paragraph in document.paragraphs]
        metadata = {"paragraphs": len(paragraph_texts)}
        return {"content": "\n".join(paragraph_texts), "metadata": metadata}
class ImageProcessor:
    """Extract text from an image by running it through ``pytesseract``."""

    def extract(self, path):
        """Return the OCR text of the image at *path*.

        Args:
            path: Location of the image, passed to ``Image.open``.

        Returns:
            ``{"content": <recognized text>, "metadata": {}}``.
        """
        # Image.open keeps the underlying file open; the context manager
        # closes it as soon as OCR has finished (or failed).
        with Image.open(path) as image:
            text = pytesseract.image_to_string(image)
        return {"content": text, "metadata": {}}
class TextProcessor:
    """Read a file as UTF-8 text."""

    def extract(self, path):
        """Return the whole file as text with empty metadata.

        Bytes that are not valid UTF-8 are substituted rather than raising,
        because the file is opened with ``errors="replace"``.
        """
        extracted = {"content": None, "metadata": {}}
        with open(path, mode="r", encoding="utf-8", errors="replace") as source:
            extracted["content"] = source.read()
        return extracted
Unified Extraction Interface
Route to the correct processor:
class DocumentExtractor:
    """Route a file to the processor registered for its detected format.

    ``processors`` maps the labels returned by ``detect_format`` to processor
    instances; ``default`` handles every label without an entry.
    """

    def __init__(self):
        self.processors = {
            "pdf": PDFProcessor(),
            "docx": DOCXProcessor(),
            # NOTE(review): legacy .doc files are routed to the DOCX
            # processor; confirm the docx library can actually open them,
            # otherwise they always end up on the error path below.
            "doc": DOCXProcessor(),
            "image": ImageProcessor(),
            "text": TextProcessor(),
        }
        # Fallback for labels with no entry above (e.g. "xls", "xlsx",
        # "unknown").
        self.default = TextProcessor()

    def extract(self, path):
        """Detect the format of *path* and run the matching processor.

        Args:
            path: Location of the file to process.

        Returns:
            The processor's result dict. If the processor raises, returns
            ``{"content": "", "metadata": {"error": <message>,
            "format": <detected label>}}`` instead of propagating.
        """
        import logging

        log = logging.getLogger(__name__)

        format_type = detect_format(path)
        processor = self.processors.get(format_type)
        if processor is None:
            # The fallback reads the file as text, which is rarely meaningful
            # for binary formats; make the substitution visible in the logs.
            log.warning(
                "No processor registered for format %r (%s); using default %s",
                format_type,
                path,
                type(self.default).__name__,
            )
            processor = self.default

        try:
            return processor.extract(path)
        except Exception as e:
            # Best-effort boundary: keep returning an error record, but leave
            # a traceback in the logs so the failure can be diagnosed.
            log.exception("Extraction failed for %s (format %r)", path, format_type)
            return {"content": "", "metadata": {"error": str(e), "format": format_type}}
Handling Multiple Files in Archives
Some documents arrive as archives (ZIP, tar.gz):
import zipfile
import tarfile
from io import BytesIO
def extract_from_archive(filepath, extractor):
    """Run *extractor* over every regular file stored in an archive.

    ZIP archives (``.zip``) and tar archives (``.tar``, ``.tar.gz``, ``.tgz``,
    ``.tar.bz2``, ``.tar.xz``) are recognized by file name, case-insensitively.
    Each member is copied into a private temporary directory, handed to
    ``extractor.extract(path)``, and removed again before this function
    returns.

    Args:
        filepath: Path of the archive (``str`` or path-like).
        extractor: Object exposing ``extract(path)``.

    Returns:
        A list with one extractor result per regular-file member, in archive
        order. Empty when *filepath* has no recognized archive suffix.
    """
    import os
    import posixpath
    import shutil
    import tempfile

    archive = os.fspath(filepath)
    lowered = archive.lower()
    tar_suffixes = (".tar", ".tar.gz", ".tgz", ".tar.bz2", ".tar.xz")
    results = []

    def run_extractor(stream, member_name, workdir, index):
        # Keep only the base name so a hostile member path ("../../x")
        # cannot escape the temporary directory.
        base = posixpath.basename(member_name.replace("\\", "/"))
        if base in ("", ".", ".."):
            base = "member"
        # One sub-directory per member: members that share a base name in
        # different archive folders must not overwrite each other.
        member_dir = os.path.join(workdir, str(index))
        os.mkdir(member_dir)
        target = os.path.join(member_dir, base)
        with open(target, "wb") as out:
            shutil.copyfileobj(stream, out)
        results.append(extractor.extract(target))

    if lowered.endswith(".zip"):
        with tempfile.TemporaryDirectory() as workdir, zipfile.ZipFile(archive) as zf:
            # infolist() yields ZipInfo objects; namelist() yields plain
            # strings, which have no way to distinguish directories.
            for index, info in enumerate(zf.infolist()):
                if info.is_dir():
                    continue
                with zf.open(info) as stream:
                    run_extractor(stream, info.filename, workdir, index)
    elif lowered.endswith(tar_suffixes):
        with tempfile.TemporaryDirectory() as workdir, tarfile.open(archive) as tf:
            for index, member in enumerate(tf):
                # Skip directories, links and device entries.
                if not member.isfile():
                    continue
                stream = tf.extractfile(member)
                if stream is None:
                    continue
                with stream:
                    run_extractor(stream, member.name, workdir, index)

    return results
Converting Office Formats
LibreOffice in headless mode converts Word and Excel files to PDF for unified processing:
import subprocess
def convert_to_pdf(input_path, output_dir, timeout=60):
    """Convert a document to PDF with a headless LibreOffice process.

    Args:
        input_path: Document to convert.
        output_dir: Directory passed to LibreOffice as ``--outdir``.
        timeout: Seconds to wait for LibreOffice before giving up.

    Returns:
        ``pathlib.Path`` of the PDF: *output_dir* joined with the input's
        stem plus ``.pdf``.

    Raises:
        RuntimeError: LibreOffice exited with a non-zero status, or the
            expected PDF is missing afterwards.
        subprocess.TimeoutExpired: The conversion exceeded *timeout*.
    """
    from pathlib import Path

    result = subprocess.run(
        [
            "libreoffice", "--headless", "--convert-to", "pdf",
            "--outdir", output_dir, input_path,
        ],
        capture_output=True,
        # Decode the output so the error message below is readable text
        # instead of a bytes repr; undecodable bytes are replaced.
        text=True,
        errors="replace",
        timeout=timeout,
    )
    if result.returncode != 0:
        raise RuntimeError(f"Conversion failed: {result.stderr}")

    pdf_path = Path(output_dir) / f"{Path(input_path).stem}.pdf"
    # A zero exit status alone does not prove the PDF was written, so check
    # for the file instead of returning a path that may not exist.
    if not pdf_path.is_file():
        details = result.stderr.strip() or result.stdout.strip()
        raise RuntimeError(
            f"Conversion failed: {pdf_path} was not created ({details})"
        )
    return pdf_path
Batch Format Processing
Process mixed-format batches:
def process_directory(directory, extractor):
    """Run *extractor* over every file under *directory*, recursively.

    Args:
        directory: Root directory to walk.
        extractor: Object exposing ``extract(path)`` that returns a mapping.

    Returns:
        One record per file, in sorted path order. Each record has ``"path"``
        and ``"status"``. Successful records also carry the extractor's
        result keys. Failed records carry ``"error"``.
    """
    from pathlib import Path

    results = []
    # Sort so the output order does not depend on filesystem enumeration.
    for path in sorted(Path(directory).rglob("*")):
        if not path.is_file():
            continue
        try:
            result = extractor.extract(str(path))
            metadata = result.get("metadata") if isinstance(result, dict) else None
            if isinstance(metadata, dict) and "error" in metadata:
                # Extractors such as DocumentExtractor swallow exceptions and
                # report them inside metadata; do not count those as success.
                record = {
                    "path": str(path),
                    "status": "failed",
                    "error": str(metadata["error"]),
                    **result,
                }
            else:
                record = {"path": str(path), "status": "success", **result}
        except Exception as e:
            # Best-effort batch: one bad file must not abort the whole run.
            record = {"path": str(path), "status": "failed", "error": str(e)}
        results.append(record)
    return results
EXERCISE
Create an extensible document extractor that detects format, routes to the appropriate processor, logs unsupported formats, and handles conversion for office documents using LibreOffice.