KEY INSIGHT
Document image analysis involves understanding layout, extracting content, and interpreting structure. Multi-modal models can identify sections, understand document type, and provide semantic interpretation beyond raw text extraction.
Document analysis combines layout understanding with content extraction and semantic interpretation. Professional documents have defined structures—headers, body text, tables, footnotes—that affect how information should be extracted.
```python
def analyze_document_structure(model, processor, image_path):
    """Identify document layout and sections.

    Args:
        model: Vision-language model exposing ``generate`` and ``device``.
        processor: Matching processor exposing ``apply_chat_template``,
            ``__call__`` and ``batch_decode``.
        image_path: Location of the document image to analyze.

    Returns:
        The first decoded sequence produced by ``model.generate``.
    """
    # Close the file handle deterministically (multi-frame formats such as
    # multi-page TIFF otherwise keep it open); convert() returns an
    # independent in-memory image that outlives the ``with`` block.
    with Image.open(image_path) as source:
        image = source.convert("RGB")

    instruction = (
        "Analyze this document and identify:\n"
        "1. Document type (form, invoice, contract, report, etc.)\n"
        "2. Major sections and their boundaries\n"
        "3. Key data fields present\n"
        "4. Table structures if any\n"
        "5. Overall document structure"
    )
    conversation = [
        {
            "role": "user",
            "content": [{"type": "image"}, {"type": "text", "text": instruction}],
        }
    ]
    chat_prompt = processor.apply_chat_template(
        conversation, add_generation_prompt=True
    )
    inputs = processor(images=image, text=chat_prompt, return_tensors="pt").to(
        model.device
    )

    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=400)

    # NOTE(review): with decoder-only models the decoded text typically starts
    # with the prompt itself -- confirm whether callers expect that.
    return processor.batch_decode(output, skip_special_tokens=True)[0]
```
Extract structured information from forms:
```python
def extract_form_fields(model, processor, image_path, field_list):
    """Extract values for expected form fields.

    The model is asked to answer with one ``Field: Value`` line per requested
    field; those lines are parsed into a dictionary.

    Args:
        model: Vision-language model exposing ``generate`` and ``device``.
        processor: Matching processor exposing ``apply_chat_template``,
            ``__call__`` and ``batch_decode``.
        image_path: Location of the form image.
        field_list: Iterable of field names to look for.

    Returns:
        Dict mapping each ``key`` found in a ``key: value`` response line to
        its stripped value. Fields the model could not find are expected to
        come back as ``'N/A'`` (as instructed by the prompt).
    """
    # Close the file handle deterministically; convert() returns an
    # independent in-memory image that outlives the ``with`` block.
    with Image.open(image_path) as source:
        image = source.convert("RGB")

    field_descriptions = "\n".join(f"- {field}" for field in field_list)
    instruction = (
        "For each field below, extract the corresponding value from this document.\n"
        "If a field is not found, respond 'N/A'.\n"
        "Respond in the format: Field: Value\n"
        f"{field_descriptions}"
    )
    conversation = [
        {
            "role": "user",
            "content": [{"type": "image"}, {"type": "text", "text": instruction}],
        }
    ]
    chat_prompt = processor.apply_chat_template(
        conversation, add_generation_prompt=True
    )
    inputs = processor(images=image, text=chat_prompt, return_tensors="pt").to(
        model.device
    )

    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=300)

    # Decoder-only models return prompt + continuation. Parsing the echoed
    # prompt would turn its own "Respond in the format: Field: Value" line
    # (and any role markers) into bogus fields, so drop the prompt tokens --
    # but only when the output really starts with them.
    prompt_ids = inputs["input_ids"]
    prompt_len = prompt_ids.shape[1]
    if output.shape[1] >= prompt_len and torch.equal(
        output[:, :prompt_len], prompt_ids
    ):
        output = output[:, prompt_len:]

    result = processor.batch_decode(output, skip_special_tokens=True)[0]

    # Parse structured output: split each line on the first colon only, so
    # values that themselves contain ':' (times, URLs) stay intact.
    fields = {}
    for line in result.splitlines():
        key, separator, value = line.partition(":")
        key = key.strip()
        if not separator or not key:
            continue
        fields[key] = value.strip()
    return fields
```
Document type classification:
```python
def classify_document(model, processor, image_path):
    """Identify document type and characteristics.

    Args:
        model: Vision-language model exposing ``generate`` and ``device``.
        processor: Matching processor exposing ``apply_chat_template``,
            ``__call__`` and ``batch_decode``.
        image_path: Location of the document image to classify.

    Returns:
        The first decoded sequence produced by ``model.generate``; the prompt
        asks for a document type, a confidence level and a justification.
    """
    # Close the file handle deterministically; convert() returns an
    # independent in-memory image that outlives the ``with`` block.
    with Image.open(image_path) as source:
        image = source.convert("RGB")

    doc_types = [
        "invoice", "receipt", "contract", "form",
        "resume", "report", "letter", "handwritten_note",
        "screenshot", "presentation", "newspaper_article",
        "book_page", "label", "certificate", "unknown",
    ]
    # 'unknown' (last entry) is deliberately not offered to the model.
    type_list = ", ".join(doc_types[:-1])
    instruction = (
        f"What type of document is this image? Choose from: {type_list}.\n"
        "Provide confidence level (high/medium/low) and brief justification."
    )
    conversation = [
        {
            "role": "user",
            "content": [{"type": "image"}, {"type": "text", "text": instruction}],
        }
    ]
    chat_prompt = processor.apply_chat_template(
        conversation, add_generation_prompt=True
    )
    inputs = processor(images=image, text=chat_prompt, return_tensors="pt").to(
        model.device
    )

    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=100)

    # NOTE(review): with decoder-only models the decoded text typically starts
    # with the prompt itself -- confirm whether callers expect that.
    return processor.batch_decode(output, skip_special_tokens=True)[0]
```
Document challenges:
- **Tightly packed text**: Loss of detail in small print
- **Color-coded information**: May be missed or misunderstood
- **Complex tables**: Multi-level headers problematic
- **Signatures**: Difficult to interpret reliably