05. Markdown Ingestion
Markdown is the most developer-friendly document format. It has clear structure, renders well, and is easy to parse programmatically. Most documentation tools (GitBook, Docusaurus, MkDocs) use Markdown as their source format.
Why Markdown matters for RAG
Markdown has semantic structure baked into its syntax:
- `# Heading 1` = major section
- `## Heading 2` = subsection
- `### Heading 3` = sub-subsection
- `---` = thematic break
This structure maps directly to document hierarchy. Chunking by heading boundaries produces semantically coherent chunks.
Installation
pip install markdown
Basic Markdown parsing
import markdown
from pathlib import Path
def extract_markdown(filepath: str) -> dict:
    """Parse a Markdown file into heading-delimited sections.

    A line is treated as a heading only when it starts at column 0 with
    one to six ``#`` characters followed by whitespace or end of line, and
    only outside fenced code blocks. Lines such as ``#hashtag``,
    ``#!/bin/sh`` or a ``# comment`` inside a code fence therefore stay
    part of the surrounding section's content.

    Args:
        filepath: Path of the UTF-8 encoded Markdown file to read.

    Returns:
        A dict with the keys:
            "title": heading of the first collected section ("" if none).
            "sections": list of {"heading", "level", "content"} dicts,
                where "content" holds the section's non-blank lines.
                Sections with no content lines are not collected.
            "full_text": the unmodified file content.
            "source": the file path as a string.

    Raises:
        OSError: If the file cannot be read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    path = Path(filepath)
    content = path.read_text(encoding="utf-8")

    sections = []
    current_section = {"heading": "", "level": 0, "content": []}
    # Holds the opening marker ("```" or "~~~") while inside a fenced block.
    open_fence = None

    for line in content.split("\n"):
        stripped = line.lstrip()

        if stripped.startswith(("```", "~~~")):
            marker = stripped[:3]
            if open_fence is None:
                open_fence = marker
            elif marker == open_fence and not stripped.strip(marker[0]).strip():
                # A closing fence carries no info string, only fence chars.
                open_fence = None
            # Fence lines are kept as ordinary content.
            current_section["content"].append(line)
            continue

        level = len(line) - len(line.lstrip("#"))
        is_heading = (
            open_fence is None
            and 1 <= level <= 6
            and (len(line) == level or line[level] in " \t")
        )

        if is_heading:
            # Close the previous section before starting a new one.
            if current_section["content"]:
                sections.append(current_section)
            current_section = {
                "heading": line[level:].strip(),
                "level": level,
                "content": [],
            }
        elif line.strip():
            # Blank lines are dropped; everything else is section content.
            current_section["content"].append(line)

    # The final section is not followed by a heading, so flush it here.
    if current_section["content"]:
        sections.append(current_section)

    return {
        "title": sections[0]["heading"] if sections else "",
        "sections": sections,
        "full_text": content,
        "source": str(path),
    }
Extracting code blocks
Code blocks in Markdown contain technical content that embedding models often handle poorly. Extract them separately for specialized retrieval.
import re
def extract_code_blocks(markdown: str) -> list[dict]:
    """Extract fenced (```) code blocks from Markdown text.

    Fences are matched only at the start of a line (optionally indented),
    and the language is the first token of the info string. This keeps
    opening and closing fences paired correctly for info strings such as
    ``c++``, ``objective-c`` or ``python title="x"``.

    Args:
        markdown: Markdown source text.

    Returns:
        One dict per block, in document order, with the keys "language"
        ("text" when the fence has no info string) and "code" (the block
        body with surrounding whitespace stripped). Unterminated fences
        are ignored.
    """
    fence_pattern = re.compile(
        r"^[ \t]*```[ \t]*([^\s`]*)[^`\n]*\n"  # opening fence + info string
        r"(.*?)"                               # body (lazy, spans lines)
        r"^[ \t]*```[ \t]*$",                  # closing fence on its own line
        re.DOTALL | re.MULTILINE,
    )
    return [
        {"language": language or "text", "code": code.strip()}
        for language, code in fence_pattern.findall(markdown)
    ]
Handling tables
Markdown tables use pipe-delimited syntax. Parse them into structured data for better chunking.
def parse_markdown_table(table_text: str) -> list[dict]:
    """Parse a pipe-delimited Markdown table into a list of row dicts.

    The first line is the header and the second line is assumed to be the
    ``---`` separator and is skipped. Empty cells are preserved as "" so
    that rows with blank cells stay aligned with the header instead of
    being discarded.

    Args:
        table_text: Text of a single Markdown table.

    Returns:
        One dict per data row mapping header text to cell text. Blank
        lines and rows whose cell count differs from the header are
        skipped. Returns [] when there are fewer than two lines.
    """

    def split_row(line: str) -> list[str]:
        # Drop only the optional outer pipes; inner empty cells must stay.
        row = line.strip()
        if row.startswith("|"):
            row = row[1:]
        if row.endswith("|"):
            row = row[:-1]
        return [cell.strip() for cell in row.split("|")]

    lines = table_text.strip().split("\n")
    if len(lines) < 2:
        return []

    headers = split_row(lines[0])

    rows = []
    # lines[1] is the separator row, so data starts at index 2.
    for line in lines[2:]:
        if not line.strip():
            continue
        cells = split_row(line)
        if len(cells) == len(headers):
            rows.append(dict(zip(headers, cells)))
    return rows
Preserving link context
Markdown links [text](url) contain metadata about what the link references. Extract this information.
def extract_links(markdown: str) -> list[dict]:
    """Extract inline Markdown links together with surrounding context.

    Image references (``![alt](src)``) are not links and are skipped. An
    optional quoted link title (``[text](url "Title")``) is removed so
    that "url" contains only the destination.

    Args:
        markdown: Markdown source text.

    Returns:
        One dict per link, in document order, with the keys "text",
        "url", and "context" (up to 50 characters either side of the
        link, stripped of surrounding whitespace).
    """
    # The lookbehind rejects "[" preceded by "!", i.e. image syntax.
    link_pattern = r"(?<!!)\[([^\]]+)\]\(([^\)]+)\)"
    # Destination followed by a single- or double-quoted title.
    titled_target = r'^(\S+)\s+(?:"[^"]*"|\'[^\']*\')$'

    links = []
    for match in re.finditer(link_pattern, markdown):
        text, target = match.groups()
        url = target
        with_title = re.match(titled_target, target.strip())
        if with_title:
            url = with_title.group(1)

        context_start = max(0, match.start() - 50)
        context = markdown[context_start:match.end() + 50]
        links.append({
            "text": text,
            "url": url,
            "context": context.strip(),
        })
    return links
Batch processing a documentation directory
Documentation sites often have a directory structure. Process all Markdown files while preserving the directory hierarchy in metadata.
from pathlib import Path
def ingest_markdown_docs(directory: str) -> list[dict]:
    """Ingest every ``*.md`` file found under a directory tree.

    Each file is parsed with ``extract_markdown`` and annotated with its
    location relative to ``directory``. Files are visited in sorted path
    order so the result is deterministic across filesystems. A file that
    fails to parse is reported on stdout and skipped; it does not abort
    the run.

    Args:
        directory: Root directory to search recursively.

    Returns:
        One parsed document dict per successfully processed file, each
        extended with the keys "relative_path", "directory" ("" for files
        directly under the root) and "filename".
    """
    docs_dir = Path(directory)
    documents = []

    for md_file in sorted(docs_dir.rglob("*.md")):
        try:
            # Path relative to the root identifies the doc's section.
            relative_path = md_file.relative_to(docs_dir)
            parts = relative_path.parts

            # extract_markdown reads the file itself; no separate read needed.
            parsed = extract_markdown(str(md_file))

            parsed["relative_path"] = str(relative_path)
            parsed["directory"] = str(Path(*parts[:-1])) if len(parts) > 1 else ""
            parsed["filename"] = md_file.name
            documents.append(parsed)
        except Exception as e:
            # Best-effort batch: report the failure and continue.
            print(f"Error processing {md_file}: {e}")

    return documents
Frontmatter handling
Many documentation tools (Docusaurus, MkDocs, Jekyll) use YAML frontmatter. Strip it before processing content.
def strip_frontmatter(markdown: str) -> tuple[str, dict]:
    """Split leading YAML frontmatter from Markdown content.

    Frontmatter is recognised only when the first line is exactly ``---``
    and a later line is exactly ``---``. Delimiters are matched as whole
    lines, so a ``---`` inside a value (``title: A --- B``) or a leading
    ``----`` rule is not mistaken for a delimiter.

    Args:
        markdown: Markdown source text, possibly starting with frontmatter.

    Returns:
        A ``(content, frontmatter)`` tuple. ``content`` is the text after
        the closing delimiter, stripped of surrounding whitespace.
        ``frontmatter`` maps each ``key: value`` line to stripped strings.
        When no complete frontmatter block is present, the input is
        returned unchanged with an empty dict.
    """
    lines = markdown.split("\n")
    # rstrip() tolerates trailing spaces and a "\r" from CRLF input.
    if lines[0].rstrip() != "---":
        return markdown, {}

    closing_idx = None
    for idx in range(1, len(lines)):
        if lines[idx].rstrip() == "---":
            closing_idx = idx
            break
    if closing_idx is None:
        return markdown, {}

    content = "\n".join(lines[closing_idx + 1:]).strip()

    # Parse YAML (simplified - use ruamel.yaml for complex cases)
    frontmatter = {}
    for line in lines[1:closing_idx]:
        if ":" in line:
            key, value = line.split(":", 1)
            frontmatter[key.strip()] = value.strip()

    return content, frontmatter
Converting Markdown to HTML for hybrid extraction
Sometimes you want both Markdown structure and HTML parsing capabilities. Convert Markdown to HTML, then extract as you would HTML.
import markdown
def markdown_to_html_for_extraction(md_text: str) -> str:
    """Render Markdown text to an HTML string for HTML-based extraction.

    The conversion is delegated to ``markdown.markdown`` with the
    'tables', 'fenced_code' and 'toc' extensions enabled.
    """
    enabled_extensions = ['tables', 'fenced_code', 'toc']
    return markdown.markdown(md_text, extensions=enabled_extensions)
Create a Markdown file with three sections (H1, H2, H3) and at least one code block and one table. Write a Python script that parses this file and prints each section's heading with its word count. Verify the word count excludes the heading itself.