Report Generation — Data Analysis with Local AI (Chapter 16)

Automated report generation saves time and ensures consistency. Local LLMs can produce formatted reports from analysis results without external services.

Structured Report Templates

def generate_analysis_report(df, analysis_results):
    """Render the analysis results as a Markdown report string.

    Args:
        df: DataFrame whose length and index bounds are reported. The index
            minimum and maximum must expose ``.date()`` (presumably a
            datetime-like index; confirm against the caller).
        analysis_results: Mapping that provides the keys ``total``,
            ``trend_summary``, ``top_performers``, ``anomaly_count``,
            ``anomaly_summary`` and ``recommendations``.

    Returns:
        The report text. It begins with a newline and ends with a line made
        of four spaces, exactly as the template literal is laid out.
    """
    # Values are formatted in the same order they appear in the report.
    generated_at = pd.Timestamp.now().strftime('%Y-%m-%d %H:%M')
    total_records = f"{len(df):,}"
    date_range = f"{df.index.min().date()} to {df.index.max().date()}"
    metric_total = f"{analysis_results['total']:,.2f}"

    lines = [
        "",
        "# Data Analysis Report",
        "Generated: " + generated_at,
        "",
        "## Executive Summary",
        "- Total Records: " + total_records,
        "- Date Range: " + date_range,
        "- Key Metric Total: " + metric_total,
        "",
        "## Key Findings",
        "",
        "### Trend Analysis",
        f"{analysis_results['trend_summary']}",
        "",
        "### Top Performers",
        f"{analysis_results['top_performers']}",
        "",
        "### Anomalies Detected",
        f"Total anomalies: {analysis_results['anomaly_count']}",
        f"{analysis_results['anomaly_summary']}",
        "",
        "## Recommendations",
        f"{analysis_results['recommendations']}",
        "Report generated locally using Ollama",
        "    ",  # trailing indentation that precedes the closing quotes
    ]
    return "\n".join(lines)

Integrating Visualizations

from weasyprint import HTML
import base64

def image_to_base64(image_path):
    """Read the file at ``image_path`` and return its bytes as base64 text."""
    with open(image_path, 'rb') as img:
        raw_bytes = img.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode()

def generate_pdf_report(df, charts, narrative, output_path='analysis_report.pdf'):
    """Render the narrative and chart images into a single PDF file.

    Args:
        df: Accepted for interface compatibility; not used by this function.
        charts: Iterable of image file paths. Each file is embedded inline as
            a ``data:image/png`` URI, so the images are assumed to be PNGs.
        narrative: HTML fragment placed at the top of ``<body>``. It is
            inserted verbatim (not escaped), so it must already be trusted,
            well-formed HTML.
        output_path: Destination of the PDF. Defaults to
            ``'analysis_report.pdf'``, the location that was previously
            hard-coded.
    """
    # Embed every chart inline so the HTML handed to WeasyPrint is self-contained.
    chart_html = ''.join(
        f'<img src="data:image/png;base64,{image_to_base64(c)}" style="width:100%;margin:20px 0;"/>'
        for c in charts
    )

    # Doubled braces are literal CSS braces inside the f-string.
    html = f"""
    <html>
    <head><style>
        body {{ font-family: Arial, sans-serif; margin: 40px; }}
        h1 {{ color: #2c3e50; border-bottom: 2px solid #3498db; }}
        h2 {{ color: #34495e; margin-top: 30px; }}
        .metric {{ background: #ecf0f1; padding: 15px; border-radius: 5px; }}
        table {{ border-collapse: collapse; width: 100%; }}
        th, td {{ border: 1px solid #bdc3c7; padding: 10px; text-align: left; }}
    </style></head>
    <body>
        {narrative}
        {chart_html}
    </body>
    </html>"""

    HTML(string=html).write_pdf(output_path)

Automated Scheduled Reports

import schedule
import time

def daily_report_job():
    """Build today's Markdown report, save it under ``reports/``, and refresh the dashboard.

    The report is written to ``reports/report_YYYYMMDD.md``. The ``reports``
    directory is created when missing, so a first run on a clean checkout no
    longer fails with ``FileNotFoundError``.
    """
    # Function-scope import so the job does not depend on import order in the file.
    from pathlib import Path

    df = load_data()
    results = perform_analysis(df)
    report = generate_analysis_report(df, results)

    report_path = Path("reports") / f"report_{pd.Timestamp.now().strftime('%Y%m%d')}.md"
    report_path.parent.mkdir(parents=True, exist_ok=True)
    # Explicit UTF-8: the report may carry non-ASCII text that the platform's
    # default encoding cannot represent.
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write(report)

    # Generate visualizations
    create_dashboard(df, results)

    print(f"Daily report generated: {pd.Timestamp.now()}")

# Register the report job to fire once per day at 08:00.
(
    schedule.every()
    .day
    .at("08:00")
    .do(daily_report_job)
)

# Poll the scheduler forever, waking once a minute to run any job that is due.
# Nothing placed after this loop is ever reached.
while True:
    schedule.run_pending()
    time.sleep(60)

Report Versioning

import json
from pathlib import Path

def save_report_metadata(report_path, analysis_results, data_hash):
    """Write a JSON sidecar file describing how a report was produced.

    The sidecar is stored next to the report, with the report's suffix
    replaced by ``.json``.

    Args:
        report_path: Location of the report, as a ``str`` or ``Path``.
        analysis_results: Mapping that provides the keys ``record_count`` and
            ``parameters``; both values must be JSON-serializable.
        data_hash: Identifier of the input data, stored as given.

    Raises:
        KeyError: If ``analysis_results`` lacks a required key.
    """
    metadata = {
        'report_path': str(report_path),
        'generated_at': pd.Timestamp.now().isoformat(),
        'data_hash': data_hash,
        'record_count': analysis_results['record_count'],
        'parameters': analysis_results['parameters']
    }

    # Coerce to Path so plain string paths work too; str has no with_suffix().
    metadata_path = Path(report_path).with_suffix('.json')
    with open(metadata_path, 'w', encoding='utf-8') as f:
        json.dump(metadata, f, indent=2)

# Compute data hash for reproducibility
import hashlib

def compute_data_hash(df):
    """Return a 12-character hex fingerprint of ``df``.

    The per-row hashes from ``pd.util.hash_pandas_object`` are fed to SHA-256
    as raw bytes, and the hex digest is cut to its first 12 characters.
    """
    row_hashes = pd.util.hash_pandas_object(df)
    digest = hashlib.sha256(row_hashes.values.tobytes())
    full_hex = digest.hexdigest()
    return full_hex[:12]

Multi-Section Reports

def generate_full_report(df):
    """Build a multi-section Markdown report for ``df``.

    Sections, in order: overview, statistical summary, anomalies, and pairs
    of columns whose absolute correlation exceeds 0.7.

    Args:
        df: DataFrame with an ``anomaly`` column. The time span is computed
            as ``(index.max() - index.min()).days``, so the index is
            presumably datetime-like (confirm against the caller).

    Returns:
        The report lines joined with newlines.

    Raises:
        KeyError: If ``df`` has no ``anomaly`` column.
    """
    sections = []

    # Section 1: Overview
    sections.append("## Overview")
    sections.append(f"- Records: {len(df):,}")
    sections.append(f"- Time span: {(df.index.max() - df.index.min()).days} days")
    sections.append(f"- Columns: {len(df.columns)}")

    # Section 2: Statistical Summary
    sections.append("\n## Statistical Summary")
    stats = df.describe().T[['mean', 'std', 'min', 'max']]
    try:
        sections.append(stats.to_markdown())
    except ImportError:
        # to_markdown() needs the optional 'tabulate' package; fall back to a
        # plain-text table rather than failing the whole report.
        sections.append(stats.to_string())

    # Section 3: Anomalies
    sections.append("\n## Anomalies")
    sections.append(f"- Detected: {df['anomaly'].sum()} ({df['anomaly'].mean()*100:.1f}%)")

    # Section 4: Correlations
    sections.append("\n## High Correlations (|r| > 0.7)")
    # Correlate numeric/boolean columns only: recent pandas raises when
    # DataFrame.corr() meets text or other non-numeric columns.
    numeric_df = df.select_dtypes(include=['number', 'bool'])
    corr = numeric_df.corr().abs()
    for a in corr.index:
        for b in corr.columns:
            r = corr.loc[a, b]
            # `a < b` keeps one entry per unordered pair and drops the diagonal.
            if r > 0.7 and a < b:
                sections.append(f"- {a} ↔ {b}: {r:.3f}")

    return '\n'.join(sections)