04. HTML Ingestion
Web pages and HTML documents require different parsing than PDFs. HTML has structural elements (headings, paragraphs, lists) that PDFs lack. Exploiting this structure produces cleaner chunks.
Installation and dependencies
pip install beautifulsoup4 lxml
BeautifulSoup4 parses HTML. lxml is a faster parser backend.
Basic HTML parsing
from bs4 import BeautifulSoup
def extract_html_content(html: str) -> dict:
    """Extract clean text content from HTML.

    Args:
        html: Raw HTML markup.

    Returns:
        A dict with four keys:
            "title": text of the <title> element, or "" when there is none.
            "headings": text of every h1-h3, in document order.
            "content": paragraph and list-item text, one block per line.
            "raw_text": all text left after noise removal, one fragment
                per line.
    """
    soup = BeautifulSoup(html, "lxml")

    # Remove scripts, styles and page chrome (nav, header, footer) so they
    # do not leak into any of the extracted text.
    for tag in soup(["script", "style", "nav", "header", "footer"]):
        tag.decompose()

    # `.string` is None when <title> contains child tags; get_text() always
    # yields a str, so callers never receive None here.
    title = soup.title.get_text(" ", strip=True) if soup.title else ""

    # A " " separator is required with strip=True: otherwise the stripped
    # fragments of "Hello <b>world</b>" are joined as "Helloworld".
    headings = [
        h.get_text(" ", strip=True) for h in soup.find_all(["h1", "h2", "h3"])
    ]

    # Extract main content (paragraphs and list items)
    blocks = []
    for block in soup.find_all(["p", "li"]):
        # Anything nested inside a list item is already part of the
        # outermost <li>'s text; skipping it avoids duplicated content.
        if block.find_parent("li") is not None:
            continue
        text = block.get_text(" ", strip=True)
        if text:
            blocks.append(text)

    return {
        "title": title,
        "headings": headings,
        "content": "\n".join(blocks),
        "raw_text": soup.get_text(separator="\n", strip=True),
    }
Extracting by semantic structure
HTML semantic elements (<article>, <section>, <main>) let you extract context-aware content. A chapter in a <section> has more meaning than the same text in a <div>.
def extract_semantic_html(html: str) -> list[dict]:
    """Extract content preserving semantic structure.

    The document is split into sections at every h1-h4. Each section holds
    the text of the paragraphs and lists that follow its heading.

    Args:
        html: Raw HTML markup.

    Returns:
        A list of {"heading": str, "content": list[str]} dicts in document
        order. Content that appears before the first heading is returned
        under an empty heading. Sections without any content are omitted.
        Empty input yields an empty list.
    """
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, "lxml")

    # Remove noise
    for tag in soup(["script", "style", "nav", "iframe"]):
        tag.decompose()

    # Prefer the most specific content container. Fall back to the whole
    # document because soup.body is None for empty or body-less input.
    main = soup.find("main") or soup.find("article") or soup.body or soup

    heading_tags = ("h1", "h2", "h3", "h4")
    list_tags = ("ul", "ol")

    def nested_in_list(node) -> bool:
        """Return True if `node` sits inside a <ul>/<ol> within `main`."""
        for ancestor in node.parents:
            if ancestor is main:
                return False
            if ancestor.name in list_tags:
                return True
        return False

    sections = []
    current_section = {"heading": "", "content": []}
    for element in main.find_all([*heading_tags, "p", *list_tags]):
        if element.name in heading_tags:
            # A new heading closes the previous section (if it has content).
            if current_section["content"]:
                sections.append(current_section)
            current_section = {
                "heading": element.get_text(" ", strip=True),
                "content": [],
            }
            continue

        # The enclosing list already contributed this text; taking it again
        # would duplicate nested paragraphs and sub-lists.
        if nested_in_list(element):
            continue

        # A " " separator keeps list items and inline fragments apart;
        # strip=True alone would join "<li>a</li><li>b</li>" as "ab".
        text = element.get_text(" ", strip=True)
        if text:
            current_section["content"].append(text)

    # Don't forget last section
    if current_section["content"]:
        sections.append(current_section)

    return sections
Handling relative URLs
RAG systems need absolute URLs for source tracking. Convert relative links before storing.
from urllib.parse import urljoin
def make_absolute(url: str, base_url: str) -> str:
    """Resolve a possibly relative URL against the page it was found on.

    Args:
        url: The link to resolve; may be relative or already absolute.
        base_url: URL of the document that contained the link.

    Returns:
        The result of joining `url` onto `base_url`.
    """
    absolute_url = urljoin(base_url, url)
    return absolute_url
Fetching HTML from URLs
import requests
from bs4 import BeautifulSoup
def fetch_html(url: str, timeout: int = 30) -> str:
    """Fetch HTML content from a URL.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body decoded to text.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: For connection failures and timeouts.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; RAG-Bot/1.0)"
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()

    # When the Content-Type header carries no charset, requests decodes
    # text bodies as ISO-8859-1, which garbles UTF-8 pages. Use the encoding
    # detected from the body itself in that case.
    content_type = response.headers.get("Content-Type", "")
    if "charset" not in content_type.lower():
        response.encoding = response.apparent_encoding

    return response.text
Batch crawling with rate limiting
Crawling too fast gets you banned. Add delays between requests.
import time
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from concurrent.futures import ThreadPoolExecutor
def crawl_urls(urls: list[str], delay: float = 1.0, max_workers: int = 3) -> list[dict]:
    """Crawl multiple URLs with rate limiting.

    Downloads run on a thread pool, but request *starts* are spaced at least
    `delay` seconds apart across all workers, so a larger pool never raises
    the request rate.

    Args:
        urls: Pages to fetch.
        delay: Minimum number of seconds between the start of two requests.
        max_workers: Maximum number of downloads in flight at once.

    Returns:
        The extract_html_content() result for every page that was fetched
        and parsed successfully, in the order of `urls`. Failed URLs are
        reported on stdout and left out.
    """
    import threading

    slot_lock = threading.Lock()
    next_start = time.monotonic()

    def wait_for_slot() -> None:
        """Block until this worker is allowed to start its request."""
        nonlocal next_start
        # Only the bookkeeping happens under the lock; the sleep is outside
        # so other workers can reserve their own slots meanwhile.
        with slot_lock:
            now = time.monotonic()
            start_at = max(now, next_start)
            next_start = start_at + delay
        time.sleep(start_at - now)

    def fetch_with_delay(url):
        wait_for_slot()
        try:
            # Send the same User-Agent as fetch_html so both entry points
            # identify themselves consistently.
            response = requests.get(
                url,
                headers={"User-Agent": "Mozilla/5.0 (compatible; RAG-Bot/1.0)"},
                timeout=30,
            )
            response.raise_for_status()
            return extract_html_content(response.text)
        except Exception as e:
            # Deliberate best-effort boundary: one bad page must not abort
            # the whole crawl.
            print(f"Error fetching {url}: {e}")
            return None

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # map() yields results in input order, like iterating the futures.
        pages = executor.map(fetch_with_delay, urls)
        return [page for page in pages if page is not None]
Extracting metadata
HTML meta tags contain valuable metadata for filtering and context.
def extract_metadata(html: str) -> dict:
    """Extract Open Graph and meta tags.

    Args:
        html: Raw HTML markup.

    Returns:
        A dict mapping each <meta> tag's `name` (or, failing that,
        `property`) to its `content`. Tags missing either part are skipped,
        and a later tag with the same key overwrites an earlier one. A
        "favicon" key holds the icon link's href when the page declares one.
    """
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, "lxml")
    meta = {}

    # Standard meta tags use `name`; Open Graph tags use `property`.
    for tag in soup.find_all("meta"):
        name = tag.get("name") or tag.get("property")
        content = tag.get("content")
        if name and content:
            meta[name] = content

    # Look the icon link up once, and only record it when it actually has
    # an href, so callers never see a None favicon.
    icon_link = soup.find("link", rel="icon")
    if icon_link is not None:
        href = icon_link.get("href")
        if href:
            meta["favicon"] = href

    return meta
Common failure modes
JavaScript-rendered content: BeautifulSoup cannot execute JavaScript. For SPAs, use Selenium or Playwright.
Cookie banners: These often appear in <div> elements and pollute text extraction. Remove them by targeting known banner selectors.
Infinite scroll pages: Content loads only when the user scrolls. Use browser automation for these.
Encoded content: Some sites serve compressed or encoded HTML. requests handles gzip automatically, but base64-encoded text needs decoding.
Find a documentation page (e.g., from readthedocs.io) and extract its content using the semantic parser. Count how many sections it found. Print each section's heading and the first 100 characters of its content. Verify the extraction makes sense by comparing to the original page.