HTML Ingestion — RAG Systems: Part 1 (Chapter 4)

Web pages and HTML documents require different parsing than PDFs. HTML has structural elements (headings, paragraphs, lists) that PDFs lack. Exploiting this structure produces cleaner chunks.

Installation and dependencies

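pip install beautifulsoup4 lxml requests

Beautiful Soup 4 (the beautifulsoup4 package) parses HTML. lxml is a parser backend that is faster than Python's built-in html.parser. requests is used later in this chapter to fetch pages over HTTP.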

Basic HTML parsing

from bs4 import BeautifulSoup

def extract_html_content(html: str) -> dict:
    """Extract clean text content from HTML.

    Boilerplate containers (script, style, nav, header, footer) are removed
    from the parsed tree before any text is read.

    Args:
        html: Raw HTML markup.

    Returns:
        A dict with these keys:
            "title": text of the <title> element, or "" when it is absent
                or empty.
            "headings": text of every h1-h3 element, in document order.
            "content": non-empty paragraph and list-item texts joined with
                newlines.
            "raw_text": all remaining text, one text fragment per line.
    """
    def clean_text(element) -> str:
        # get_text(strip=True) strips each text fragment and then joins the
        # fragments with no separator, so "<p>Hello <b>world</b></p>" would
        # come out as "Helloworld". Collapsing whitespace after a plain
        # get_text() keeps the word boundaries that exist in the markup.
        return " ".join(element.get_text().split())

    soup = BeautifulSoup(html, "lxml")

    # Remove non-content elements: scripts, styles and page chrome
    for tag in soup(["script", "style", "nav", "header", "footer"]):
        tag.decompose()

    # .string is None for an empty <title> or one with child tags;
    # get_text() always yields a str.
    title = clean_text(soup.title) if soup.title else ""

    # Extract headings
    headings = [clean_text(h) for h in soup.find_all(["h1", "h2", "h3"])]

    # Extract main content (paragraphs and list items)
    blocks = []
    for element in soup.find_all(["p", "li"]):
        # A <p> inside an <li>, or a nested <li>, is already part of its
        # outermost ancestor's text; emitting it again would duplicate it.
        if element.find_parent(["p", "li"]) is not None:
            continue
        text = clean_text(element)
        if text:
            blocks.append(text)

    return {
        "title": title,
        "headings": headings,
        "content": "\n".join(blocks),
        "raw_text": soup.get_text(separator="\n", strip=True),
    }

Extracting by semantic structure

HTML semantic elements (<article>, <section>, <main>) let you extract context-aware content. A chapter in a <section> has more meaning than the same text in a <div>.

def extract_semantic_html(html: str) -> list[dict]:
    """Extract content grouped into sections by heading.

    The content root is the first <main>, else the first <article>, else
    <body>. Every h1-h4 inside it starts a new section; paragraphs and
    lists that follow are collected into that section. Text that appears
    before the first heading goes into a section with an empty heading.
    Sections that end up with no content are dropped.

    Args:
        html: Raw HTML markup.

    Returns:
        A list of {"heading": str, "content": list[str]} dicts in document
        order. Each list contributes one content entry with its items
        separated by newlines. Returns [] when the document has no body.
    """
    from bs4 import BeautifulSoup

    heading_tags = ("h1", "h2", "h3", "h4")
    list_tags = ("ul", "ol")

    def clean_text(element) -> str:
        # get_text(strip=True) would join text fragments with no separator
        # ("Hello <b>world</b>" -> "Helloworld"); collapse whitespace instead.
        return " ".join(element.get_text().split())

    soup = BeautifulSoup(html, "lxml")

    # Remove noise
    for tag in soup(["script", "style", "nav", "iframe"]):
        tag.decompose()

    # Find main content area
    main = soup.find("main") or soup.find("article") or soup.body
    if main is None:
        # e.g. empty input: the parser produced no <body> to walk
        return []

    def inside_list(element) -> bool:
        # True when a <ul>/<ol> sits between this element and the content
        # root, meaning the element's text is already part of that list.
        for ancestor in element.parents:
            if ancestor is main:
                return False
            if ancestor.name in list_tags:
                return True
        return False

    def block_text(element) -> str:
        if element.name in list_tags:
            # One line per direct item so items do not run together.
            items = (
                clean_text(li)
                for li in element.find_all("li", recursive=False)
            )
            return "\n".join(item for item in items if item)
        return clean_text(element)

    sections = []
    current_section = {"heading": "", "content": []}

    for element in main.find_all([*heading_tags, "p", *list_tags]):
        if element.name in heading_tags:
            # Save previous section before starting the next one
            if current_section["content"]:
                sections.append(current_section)
            current_section = {"heading": clean_text(element), "content": []}
            continue

        # Nested lists and paragraphs inside list items are covered by the
        # outermost list; skipping them avoids duplicated text.
        if inside_list(element):
            continue

        text = block_text(element)
        if text:
            current_section["content"].append(text)

    # Don't forget last section
    if current_section["content"]:
        sections.append(current_section)

    return sections

Handling relative URLs

RAG systems need absolute URLs for source tracking. Convert relative links before storing.

from urllib.parse import urljoin

def make_absolute(url: str, base_url: str) -> str:
    """Resolve ``url`` against ``base_url`` and return the result.

    Delegates to ``urllib.parse.urljoin`` with ``base_url`` as the base.
    """
    resolved = urljoin(base=base_url, url=url)
    return resolved

Fetching HTML from URLs

import requests
from bs4 import BeautifulSoup

def fetch_html(url: str, timeout: int = 30) -> str:
    """Fetch HTML content from a URL.

    Args:
        url: Address of the page to download.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        The decoded response body.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status
            (raised by ``raise_for_status``).
        requests.RequestException: For any other failure raised by
            ``requests.get``.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; RAG-Bot/1.0)"
    }

    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()

    # When a text/* response declares no charset, requests falls back to
    # ISO-8859-1, which garbles UTF-8 pages. Use the encoding detected from
    # the body in that case.
    content_type = response.headers.get("Content-Type", "")
    if "charset" not in content_type.lower():
        response.encoding = response.apparent_encoding

    return response.text

Batch crawling with rate limiting

Crawling too fast gets you banned. Add delays between requests.

import time
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from concurrent.futures import ThreadPoolExecutor

def crawl_urls(urls: list[str], delay: float = 1.0, max_workers: int = 3) -> list[dict]:
    """Crawl multiple URLs with rate limiting.

    Request starts are spaced at least ``delay`` seconds apart across all
    worker threads, so the overall request rate does not grow with
    ``max_workers``; the workers only overlap the time spent waiting on
    slow responses.

    Args:
        urls: Pages to fetch.
        delay: Minimum number of seconds between the start of two
            consecutive requests. Values below zero are treated as zero.
        max_workers: Maximum number of requests in flight at once.

    Returns:
        One dict per successfully fetched page, in the order of ``urls``:
        the result of ``extract_html_content`` plus a "url" key holding the
        source URL. Pages that fail to download or parse are reported on
        stdout and omitted.
    """
    import threading

    min_gap = max(delay, 0.0)
    pace_lock = threading.Lock()
    next_start = time.monotonic()

    def wait_for_turn() -> None:
        # Reserve the next free start slot under the lock, then sleep
        # outside it so other workers can reserve theirs meanwhile.
        nonlocal next_start
        with pace_lock:
            now = time.monotonic()
            start_at = max(now, next_start)
            next_start = start_at + min_gap
        if start_at > now:
            time.sleep(start_at - now)

    def fetch_one(url):
        wait_for_turn()
        try:
            # fetch_html sends the crawler's User-Agent and raises on
            # HTTP error statuses.
            content = extract_html_content(fetch_html(url))
        except Exception as e:
            # Deliberately broad: one bad page must not abort the crawl.
            print(f"Error fetching {url}: {e}")
            return None
        # Failed pages are dropped, so positions no longer map back to
        # `urls`; keep the source URL with each result.
        return {**content, "url": url}

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # map() yields results in input order.
        return [
            page
            for page in executor.map(fetch_one, urls)
            if page is not None
        ]

Extracting metadata

HTML meta tags contain valuable metadata for filtering and context.

def extract_metadata(html: str) -> dict:
    """Extract Open Graph and meta tags.

    Args:
        html: Raw HTML markup.

    Returns:
        A dict mapping each <meta> tag's ``name`` (or, when it has no
        ``name``, its ``property``) to its ``content``. Tags missing
        either part are skipped, and a later tag with the same key
        overwrites an earlier one. A "favicon" key holds the ``href`` of
        the first <link rel="icon"> when that link has one.
    """
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, "lxml")
    meta = {}

    # Standard meta tags (name=...) and Open Graph tags (property=...)
    for tag in soup.find_all("meta"):
        name = tag.get("name") or tag.get("property")
        content = tag.get("content")
        if name and content:
            meta[name] = content

    # Favicon: search the tree once, and do not store a None value when
    # the <link> has no href.
    icon_link = soup.find("link", rel="icon")
    icon_href = icon_link.get("href") if icon_link is not None else None
    if icon_href:
        meta["favicon"] = icon_href

    return meta

Common failure modes

JavaScript-rendered content: BeautifulSoup cannot execute JavaScript. For SPAs, use Selenium or Playwright.
Cookie banners: These often appear in <div> elements and pollute text extraction. Remove them by targeting known banner selectors.
Infinite scroll pages: Content loads only when the user scrolls. Use browser automation for these.
Encoded content: Some sites serve compressed or encoded HTML. requests handles gzip automatically, but base64-encoded text needs decoding.