From e6d0bcf7e35bddb549efd5100ced16e10be1e33e Mon Sep 17 00:00:00 2001 From: Rachel Lee Nabors Date: Wed, 4 Feb 2026 00:24:07 +0000 Subject: [PATCH 1/9] Add clean markdown generation for LLM-friendly page content - Generate clean markdown from rendered HTML pages during build - Update /api/markdown endpoint to serve pre-generated clean markdown - Add CopyPageOverride component to fetch clean markdown on "Copy page" - Add frontmatter (title, description) extracted from HTML meta tags - Fix linting issues with top-level regex and simplified logic Co-Authored-By: Claude Opus 4.5 --- .gitignore | 1 + app/_components/copy-page-override.tsx | 101 +++ app/_components/custom-layout.tsx | 2 + app/api/markdown/[[...slug]]/route.ts | 37 +- package.json | 5 +- pnpm-lock.yaml | 23 + scripts/generate-clean-markdown.ts | 943 +++++++++++++++++++++++++ scripts/generate-llmstxt.ts | 97 ++- scripts/pagefind.ts | 113 +-- 9 files changed, 1255 insertions(+), 67 deletions(-) create mode 100644 app/_components/copy-page-override.tsx create mode 100644 scripts/generate-clean-markdown.ts diff --git a/.gitignore b/.gitignore index c714c9723..47b247816 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ node_modules .DS_Store .env.local public/sitemap*.xml +public/_markdown/ .env _pagefind/ diff --git a/app/_components/copy-page-override.tsx b/app/_components/copy-page-override.tsx new file mode 100644 index 000000000..f89dc528a --- /dev/null +++ b/app/_components/copy-page-override.tsx @@ -0,0 +1,101 @@ +"use client"; + +import { usePathname } from "next/navigation"; +import { useCallback, useEffect } from "react"; + +const COPY_FEEDBACK_DELAY_MS = 2000; +const COPY_BUTTON_TEXT = "Copy page"; +const COPIED_TEXT = "Copied"; +const DROPDOWN_IDENTIFIER = "Markdown for LLMs"; + +/** + * This component overrides the default nextra-theme-docs "Copy page" button behavior + * to fetch clean markdown from our API instead of copying raw MDX source. + */ +export function CopyPageOverride() { + const pathname = usePathname(); + + const fetchAndCopyMarkdown = useCallback(async (): Promise => { + try { + const markdownUrl = `/api/markdown${pathname}.md`; + const response = await fetch(markdownUrl); + + if (!response.ok) { + throw new Error(`Failed to fetch markdown: ${response.status}`); + } + + const markdown = await response.text(); + await navigator.clipboard.writeText(markdown); + return true; + } catch { + return false; + } + }, [pathname]); + + useEffect(() => { + const isCopyButton = (button: HTMLButtonElement): boolean => { + const text = button.textContent || ""; + return text.includes(COPY_BUTTON_TEXT) || text.includes(COPIED_TEXT); + }; + + const updateButtonFeedback = (button: HTMLButtonElement): void => { + const textNodes = button.querySelectorAll("*"); + for (const node of textNodes) { + if (node.textContent === COPY_BUTTON_TEXT) { + node.textContent = COPIED_TEXT; + setTimeout(() => { + node.textContent = COPY_BUTTON_TEXT; + }, COPY_FEEDBACK_DELAY_MS); + return; + } + } + }; + + const handleButtonClick = async (event: MouseEvent): Promise => { + const target = event.target as HTMLElement; + const button = target.closest("button") as HTMLButtonElement | null; + + if (!(button && isCopyButton(button))) { + return; + } + + event.preventDefault(); + event.stopPropagation(); + + const success = await fetchAndCopyMarkdown(); + if (success) { + updateButtonFeedback(button); + } + }; + + const handleDropdownClick = async (event: MouseEvent): Promise => { + const target = event.target as HTMLElement; + const option = target.closest('[role="option"]'); + const optionText = option?.textContent || ""; + + const isDropdownCopyOption = + optionText.includes(COPY_BUTTON_TEXT) && + optionText.includes(DROPDOWN_IDENTIFIER); + + if (!isDropdownCopyOption) { + return; + } + + event.preventDefault(); + event.stopPropagation(); + + await fetchAndCopyMarkdown(); + document.body.click(); + }; + + document.addEventListener("click", handleButtonClick, true); + document.addEventListener("click", handleDropdownClick, true); + + return () => { + document.removeEventListener("click", handleButtonClick, true); + document.removeEventListener("click", handleDropdownClick, true); + }; + }, [fetchAndCopyMarkdown]); + + return null; +} diff --git a/app/_components/custom-layout.tsx b/app/_components/custom-layout.tsx index 3051c36f6..1e048ff46 100644 --- a/app/_components/custom-layout.tsx +++ b/app/_components/custom-layout.tsx @@ -1,4 +1,5 @@ import type React from "react"; +import { CopyPageOverride } from "@/app/_components/copy-page-override"; import { PlaceholderReplacer } from "@/app/_components/placeholder-replacer"; import { OrySessionProvider } from "@/app/_lib/ory-session-context"; @@ -7,6 +8,7 @@ const CustomLayout: React.FC<{ children: React.ReactNode }> = ({ }) => ( +
{children}
); diff --git a/app/api/markdown/[[...slug]]/route.ts b/app/api/markdown/[[...slug]]/route.ts index 61006c812..b9f3997f4 100644 --- a/app/api/markdown/[[...slug]]/route.ts +++ b/app/api/markdown/[[...slug]]/route.ts @@ -7,6 +7,9 @@ export const dynamic = "force-dynamic"; // Regex pattern for removing .md extension const MD_EXTENSION_REGEX = /\.md$/; +// Directory containing pre-generated clean markdown files +const CLEAN_MARKDOWN_DIR = join(process.cwd(), "public", "_markdown"); + export async function GET( request: NextRequest, _context: { params: Promise<{ slug?: string[] }> } @@ -17,28 +20,48 @@ export async function GET( // Remove /api/markdown prefix to get the original path const originalPath = url.pathname.replace("/api/markdown", ""); - // Remove .md extension + // Remove .md extension if present const pathWithoutMd = originalPath.replace(MD_EXTENSION_REGEX, ""); - // Map URL to file path + // Try clean markdown first (preferred) + // e.g., /en/home/quickstart -> public/_markdown/en/home/quickstart.md + const cleanMarkdownPath = join(CLEAN_MARKDOWN_DIR, `${pathWithoutMd}.md`); + + try { + await access(cleanMarkdownPath); + const content = await readFile(cleanMarkdownPath, "utf-8"); + + return new NextResponse(content, { + status: 200, + headers: { + "Content-Type": "text/plain; charset=utf-8", + "Content-Disposition": "inline", + "Cache-Control": "public, max-age=3600", // Cache for 1 hour + }, + }); + } catch { + // Clean markdown not found, fall back to raw MDX + } + + // Fallback: serve raw MDX (for backwards compatibility or if clean files not generated) // e.g., /en/home/quickstart -> app/en/home/quickstart/page.mdx - const filePath = join(process.cwd(), "app", `${pathWithoutMd}/page.mdx`); + const rawMdxPath = join(process.cwd(), "app", `${pathWithoutMd}/page.mdx`); - // Check if file exists try { - await access(filePath); + await access(rawMdxPath); } catch { return new NextResponse("Markdown file not found", { status: 404 }); } - const content = await readFile(filePath, "utf-8"); + const content = await readFile(rawMdxPath, "utf-8"); - // Return the raw markdown with proper headers + // Return the raw MDX with a warning header return new NextResponse(content, { status: 200, headers: { "Content-Type": "text/plain; charset=utf-8", "Content-Disposition": "inline", + "X-Content-Source": "raw-mdx", // Indicate this is raw MDX, not clean markdown }, }); } catch (error) { diff --git a/package.json b/package.json index 966dc5a36..9181c02dc 100644 --- a/package.json +++ b/package.json @@ -10,7 +10,8 @@ "lint": "pnpm dlx ultracite check", "format": "pnpm dlx ultracite fix", "prepare": "husky install", - "postbuild": "pnpm run custompagefind", + "postbuild": "pnpm run generate:markdown && pnpm run custompagefind", + "generate:markdown": "pnpm dlx tsx scripts/generate-clean-markdown.ts", "translate": "pnpm dlx tsx scripts/i18n-sync/index.ts && pnpm format", "sync:metas": "pnpm dlx tsx scripts/sync-metas.ts app/en", "llmstxt": "pnpm dlx tsx scripts/generate-llmstxt.ts", @@ -74,6 +75,7 @@ "@types/react": "19.2.7", "@types/react-dom": "19.2.3", "@types/react-syntax-highlighter": "15.5.13", + "@types/turndown": "^5.0.6", "@types/unist": "3.0.3", "commander": "14.0.2", "dotenv": "^17.2.3", @@ -90,6 +92,7 @@ "remark": "^15.0.1", "remark-rehype": "^11.1.2", "tailwindcss": "4.1.16", + "turndown": "^7.2.2", "typescript": "5.9.3", "ultracite": "6.1.0", "vitest": "4.0.5", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 9864f51d7..8b1458165 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -105,6 +105,9 @@ importers: '@types/react-syntax-highlighter': specifier: 15.5.13 version: 15.5.13 + '@types/turndown': + specifier: ^5.0.6 + version: 5.0.6 '@types/unist': specifier: 3.0.3 version: 3.0.3 @@ -153,6 +156,9 @@ importers: tailwindcss: specifier: 4.1.16 version: 4.1.16 + turndown: + specifier: ^7.2.2 + version: 7.2.2 typescript: specifier: 5.9.3 version: 5.9.3 @@ -655,6 +661,9 @@ packages: '@mermaid-js/parser@0.6.3': resolution: {integrity: sha512-lnjOhe7zyHjc+If7yT4zoedx2vo4sHaTmtkl1+or8BRTnCtDmcTpAjpzDSfCZrshM5bCoz0GyidzadJAH1xobA==} + '@mixmark-io/domino@2.2.0': + resolution: {integrity: sha512-Y28PR25bHXUg88kCV7nivXrP2Nj2RueZ3/l/jdx6J9f8J4nsEGcgX0Qe6lt7Pa+J79+kPiJU3LguR6O/6zrLOw==} + '@napi-rs/simple-git-android-arm-eabi@0.1.22': resolution: {integrity: sha512-JQZdnDNm8o43A5GOzwN/0Tz3CDBQtBUNqzVwEopm32uayjdjxev1Csp1JeaqF3v9djLDIvsSE39ecsN2LhCKKQ==} engines: {node: '>= 10'} @@ -2177,6 +2186,9 @@ packages: '@types/trusted-types@2.0.7': resolution: {integrity: sha512-ScaPdn1dQczgbl0QFTeTOmVHFULt394XJgOQNoyVhZ6r2vLnMLJfBPd53SB52T/3G36VI1/g2MZaX0cwDuXsfw==} + '@types/turndown@5.0.6': + resolution: {integrity: sha512-ru00MoyeeouE5BX4gRL+6m/BsDfbRayOskWqUvh7CLGW+UXxHQItqALa38kKnOiZPqJrtzJUgAC2+F0rL1S4Pg==} + '@types/unist@2.0.11': resolution: {integrity: sha512-CmBKiL6NNo/OqgmMn95Fk9Whlp2mtvIv+KNpQKN2F4SjvrEesubTRWGYSg+BnWZOnlCaSTU1sMpsBOzgbYhnsA==} @@ -4424,6 +4436,9 @@ packages: tslib@2.8.1: resolution: {integrity: sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==} + turndown@7.2.2: + resolution: {integrity: sha512-1F7db8BiExOKxjSMU2b7if62D/XOyQyZbPKq/nUwopfgnHlqXHqQ0lvfUTeUIr1lZJzOPFn43dODyMSIfvWRKQ==} + twoslash-protocol@0.3.4: resolution: {integrity: sha512-HHd7lzZNLUvjPzG/IE6js502gEzLC1x7HaO1up/f72d8G8ScWAs9Yfa97igelQRDl5h9tGcdFsRp+lNVre1EeQ==} @@ -5178,6 +5193,8 @@ snapshots: dependencies: langium: 3.3.1 + '@mixmark-io/domino@2.2.0': {} + '@napi-rs/simple-git-android-arm-eabi@0.1.22': optional: true @@ -6950,6 +6967,8 @@ snapshots: '@types/trusted-types@2.0.7': optional: true + '@types/turndown@5.0.6': {} + '@types/unist@2.0.11': {} '@types/unist@3.0.3': {} @@ -9795,6 +9814,10 @@ snapshots: tslib@2.8.1: {} + turndown@7.2.2: + dependencies: + '@mixmark-io/domino': 2.2.0 + twoslash-protocol@0.3.4: {} twoslash@0.3.4(typescript@5.9.3): diff --git a/scripts/generate-clean-markdown.ts b/scripts/generate-clean-markdown.ts new file mode 100644 index 000000000..7005f5bda --- /dev/null +++ b/scripts/generate-clean-markdown.ts @@ -0,0 +1,943 @@ +import { type ChildProcess, spawn } from "node:child_process"; +import fs from "node:fs/promises"; +import path, { dirname } from "node:path"; +import { fileURLToPath } from "node:url"; +import glob from "fast-glob"; +import pc from "picocolors"; +import TurndownService from "turndown"; + +const __filename = fileURLToPath(import.meta.url); +const __dirname = dirname(__filename); + +// Configuration constants +const SERVER_PORT = 3456; +const SERVER_URL = `http://localhost:${SERVER_PORT}`; +const OUTPUT_DIR = path.join(__dirname, "..", "public", "_markdown"); +const MAX_RETRIES = 30; +const RETRY_DELAY_MS = 1000; +const BATCH_SIZE = 10; +const SERVER_CLEANUP_DELAY_MS = 500; +const HTTP_NOT_FOUND = 404; +const MIN_INTEGRATION_LINKS = 5; +const MAX_DOTFILE_LENGTH = 20; +const MAX_CHILD_TEXT_LENGTH = 50; +const PARENT_SEARCH_DEPTH = 4; +const LABEL_SEARCH_DEPTH = 3; + +// Regex patterns at module level for performance +const FILENAME_PATTERN = + /^[\w.-]+\.(py|ts|js|tsx|jsx|json|yaml|yml|toml|env|md|html|css|sql|sh|bash|go|rs|java|rb|php|swift|kt|cs|cpp|c|h|xml|ini|cfg|conf)$/i; +const DOTFILE_PATTERN = /^\.[a-z]+$/i; +const LANGUAGE_CLASS_PATTERN = /language-(\w+)/; +const ARTICLE_PATTERN = /]*>([\s\S]*?)<\/article>/i; +const MAIN_PATTERN = /]*>([\s\S]*?)<\/main>/i; +const BODY_PATTERN = /]*>([\s\S]*?)<\/body>/i; +const PAGE_MDX_PATTERN = /\/page\.mdx$/; +const MDX_PATTERN = /\.mdx$/; + +// Validation regex patterns +const IMPORT_STATEMENT_PATTERN = /^import\s+/m; +const STEPS_COMPONENT_PATTERN = /|<\/Steps>/g; +const TABS_COMPONENT_PATTERN = /]/g; +const CALLOUT_COMPONENT_PATTERN = /]/g; +const GUIDE_OVERVIEW_PATTERN = /]/g; + +// Meta tag extraction patterns +const TITLE_PATTERN = /]*>([^<]*)<\/title>/i; +const META_DESCRIPTION_PATTERN = + / = { + terminal: "bash", + bash: "bash", + shell: "bash", + sh: "bash", + zsh: "bash", + python: "python", + py: "python", + typescript: "typescript", + ts: "typescript", + javascript: "javascript", + js: "javascript", + json: "json", + env: "bash", + yaml: "yaml", + yml: "yaml", + html: "html", + css: "css", + sql: "sql", + graphql: "graphql", + rust: "rust", + go: "go", + java: "java", + ruby: "ruby", + php: "php", + csharp: "csharp", + "c#": "csharp", + cpp: "cpp", + "c++": "cpp", + c: "c", + swift: "swift", + kotlin: "kotlin", + markdown: "markdown", + md: "markdown", + toml: "toml", + ini: "ini", + xml: "xml", + }; + return map[label.toLowerCase()] || ""; +} + +/** + * Language labels that appear as orphan text before code blocks + */ +const LANGUAGE_LABELS = new Set([ + "terminal", + "bash", + "shell", + "sh", + "zsh", + "python", + "py", + "typescript", + "ts", + "javascript", + "js", + "json", + "yaml", + "yml", + "toml", + "env", + "ini", + "xml", + "html", + "css", + "sql", + "graphql", + "rust", + "go", + "java", + "ruby", + "php", + "c#", + "csharp", + "c++", + "cpp", + "c", + "swift", + "kotlin", + "markdown", + "md", +]); + +/** + * Gets comment prefix for a language + */ +function getCommentPrefix(language: string): string { + const hashComment = ["bash", "python", "ruby", "yaml", "toml", "shell"]; + const slashComment = [ + "typescript", + "javascript", + "java", + "go", + "rust", + "swift", + "kotlin", + "csharp", + "cpp", + "c", + ]; + + if (hashComment.includes(language)) { + return "# "; + } + if (slashComment.includes(language)) { + return "// "; + } + if (language === "html" || language === "xml") { + return ""; + } + if (language === "css") { + return " */"; + } + return ""; +} + +/** + * Checks if text matches a filename pattern + */ +function isFilename(text: string): boolean { + const trimmed = text.trim(); + // Match common filename patterns like main.py, example.ts + if (FILENAME_PATTERN.test(trimmed)) { + return true; + } + // Match dotfiles like .env, .gitignore + if ( + DOTFILE_PATTERN.test(trimmed) && + trimmed.length > 1 && + trimmed.length < MAX_DOTFILE_LENGTH + ) { + return true; + } + return false; +} + +/** + * Recursively searches for filename text in a node tree + */ +function findFilenameInNode(node: Node): string | null { + // Check if this node's text content is a filename + const text = node.textContent?.trim() || ""; + if (isFilename(text)) { + return text; + } + + // Check child nodes + for (const child of Array.from(node.childNodes)) { + // Only check text nodes or elements with short text content + const childText = child.textContent?.trim() || ""; + if (childText.length < MAX_CHILD_TEXT_LENGTH && isFilename(childText)) { + return childText; + } + } + + return null; +} + +/** + * Finds filename text near a code block element + * Looks for patterns like "main.py", "example.ts", ".env" etc. + */ +function findFilename(node: Node): string | null { + // Look in parent structure for filename-like text + let parent = (node as Element).parentElement; + let depth = 0; + while (parent && depth < PARENT_SEARCH_DEPTH) { + const filename = findFilenameInNode(parent); + if (filename) { + return filename; + } + parent = parent.parentElement; + depth += 1; + } + return null; +} + +// Custom rules for better code block handling +// Nextra wraps code in:
buttons
...
+// Language labels appear in parent structure (e.g., "Terminal", "Python", "TypeScript") +turndown.addRule("fencedCodeBlock", { + filter: (node) => { + if (node.nodeName !== "PRE") { + return false; + } + // Find CODE element anywhere inside PRE (not just as first child) + const codeElement = findCodeElement(node); + return codeElement !== null; + }, + replacement: (_content, node) => { + const codeElement = findCodeElement(node); + if (!codeElement) { + return _content; + } + let code = codeElement.textContent || ""; + + // Try to extract language from various sources + let language = ""; + + // 1. Check code element class (e.g., "language-typescript") + const codeClassName = codeElement.getAttribute("class") || ""; + const langMatch = codeClassName.match(LANGUAGE_CLASS_PATTERN); + if (langMatch) { + language = langMatch[1]; + } + + // 2. Look for language label in parent structure + // Nextra code blocks have labels like "Terminal", "Python", etc. + if (!language) { + const labels = [ + "Terminal", + "Bash", + "Shell", + "Python", + "TypeScript", + "JavaScript", + "JSON", + "YAML", + "TOML", + "ENV", + "HTML", + "CSS", + "SQL", + "GraphQL", + "Rust", + "Go", + "Java", + "Ruby", + "PHP", + "C#", + "C++", + "C", + "Swift", + "Kotlin", + "Markdown", + "XML", + ]; + + // Check parent and grandparent for label text + let parent = (node as Element).parentElement; + let depth = 0; + while (parent && !language && depth < LABEL_SEARCH_DEPTH) { + const foundLabel = findElementWithText(parent, labels); + if (foundLabel) { + language = labelToLanguage(foundLabel); + break; + } + parent = parent.parentElement; + depth += 1; + } + } + + // 3. Try to find filename and add as comment + const filename = findFilename(node); + if (filename) { + const prefix = getCommentPrefix(language || "bash"); + const suffix = getCommentSuffix(language || "bash"); + code = `${prefix}${filename}${suffix}\n${code}`; + } + + return `\n\n\`\`\`${language}\n${code}\n\`\`\`\n\n`; + }, +}); + +// Remove copy buttons and other interactive elements +turndown.addRule("removeButtons", { + filter: (node) => { + if (node.nodeName === "BUTTON") { + return true; + } + if (node.nodeName === "DIV") { + const className = node.getAttribute("class"); + if (className?.includes("copy-button")) { + return true; + } + } + return false; + }, + replacement: () => "", +}); + +// Remove orphan language labels and filenames that appear before code blocks +// These are standalone paragraphs containing just "Terminal", "Python", "main.py", etc. +turndown.addRule("removeOrphanLabels", { + filter: (node) => { + if ( + node.nodeName !== "P" && + node.nodeName !== "SPAN" && + node.nodeName !== "DIV" + ) { + return false; + } + const text = node.textContent?.trim() || ""; + // Check if it's just a language label + if (LANGUAGE_LABELS.has(text.toLowerCase())) { + return true; + } + // Check if it's a filename (will be added as comment in code block) + if (isFilename(text)) { + return true; + } + return false; + }, + replacement: () => "", +}); + +// Clean up links - collapse whitespace in link text +turndown.addRule("cleanLinks", { + filter: "a", + replacement: (content, node) => { + const element = node as Element; + let href = element.getAttribute("href"); + if (!href) { + return content; + } + // Collapse multiple whitespace/newlines into single space and trim + const cleanedContent = content.replace(/\s+/g, " ").trim(); + // Skip empty links + if (!cleanedContent) { + return ""; + } + + // Add .md extension to internal links (so they point to markdown, not HTML) + // Internal links start with / but not // (protocol-relative) + // Don't add .md if it already has an extension or is an anchor-only link + if ( + href.startsWith("/") && + !href.startsWith("//") && + !href.includes(".") && + !href.startsWith("/#") + ) { + // Handle links with anchors (e.g., /page#section -> /page.md#section) + const hashIndex = href.indexOf("#"); + if (hashIndex > 0) { + href = `${href.slice(0, hashIndex)}.md${href.slice(hashIndex)}`; + } else { + href += ".md"; + } + } + + // Check if this is a standalone link (in a grid/list of links) + // by looking at the parent and sibling structure + const parent = element.parentNode; + const isInParagraph = parent?.nodeName === "P"; + const isInlineLink = isInParagraph && parent?.childNodes.length > 1; + + // For standalone links (like card grids), add newline for readability + // For inline links (in paragraphs with other text), don't add newline + if (isInlineLink) { + return `[${cleanedContent}](${href})`; + } + return `[${cleanedContent}](${href})\n`; + }, +}); + +/** + * Waits for the server to be ready + */ +async function waitForServer(url: string): Promise { + console.log(pc.blue(`⏳ Waiting for server at ${url}...`)); + + let retries = 0; + while (retries < MAX_RETRIES) { + try { + const response = await fetch(url); + if (response.ok || response.status === HTTP_NOT_FOUND) { + console.log(pc.green("✓ Server is ready")); + return; + } + } catch { + // Server not ready yet + } + await new Promise((resolve) => setTimeout(resolve, RETRY_DELAY_MS)); + retries += 1; + } + + throw new Error( + `Server at ${url} did not become ready after ${MAX_RETRIES} retries` + ); +} + +/** + * Starts the Next.js production server + */ +function startServer(): ChildProcess { + console.log(pc.blue("🚀 Starting production server...")); + + // Use npx to run next start directly with port argument + const server = spawn( + "npx", + ["next", "start", "--port", String(SERVER_PORT)], + { + cwd: path.join(__dirname, ".."), + stdio: ["ignore", "pipe", "pipe"], + detached: false, + } + ); + + // Log server output for debugging + server.stdout?.on("data", (data: Buffer) => { + const output = data.toString(); + if (output.includes("Ready") || output.includes("started")) { + console.log(pc.gray(` Server: ${output.trim()}`)); + } + }); + + server.stderr?.on("data", (data: Buffer) => { + const output = data.toString(); + // Filter out noisy warnings + if (!output.includes("ExperimentalWarning")) { + console.error(pc.yellow(` Server stderr: ${output.trim()}`)); + } + }); + + return server; +} + +/** + * Extracts frontmatter data from HTML meta tags + */ +function extractFrontmatter(html: string): { + title: string; + description: string; +} { + // Extract title + const titleMatch = html.match(TITLE_PATTERN); + let title = titleMatch?.[1]?.trim() || ""; + // Remove common suffixes like "| Arcade Docs" or " - Arcade" + title = title.replace(TITLE_SUFFIX_PATTERN, "").trim(); + + // Extract description (try both attribute orders) + let description = ""; + const descMatch = html.match(META_DESCRIPTION_PATTERN); + if (descMatch) { + description = descMatch[1].trim(); + } else { + const descAltMatch = html.match(META_DESCRIPTION_ALT_PATTERN); + if (descAltMatch) { + description = descAltMatch[1].trim(); + } + } + + return { title, description }; +} + +/** + * Formats frontmatter as YAML + */ +function formatFrontmatter(title: string, description: string): string { + if (!(title || description)) { + return ""; + } + + const lines = ["---"]; + if (title) { + // Escape quotes in YAML values + const escapedTitle = title.replace(/"/g, '\\"'); + lines.push(`title: "${escapedTitle}"`); + } + if (description) { + const escapedDesc = description.replace(/"/g, '\\"'); + lines.push(`description: "${escapedDesc}"`); + } + lines.push("---", ""); + + return lines.join("\n"); +} + +/** + * Extracts the main content from the HTML page + */ +function extractContent(html: string): string { + // Nextra wraps the main content in an
element + // We need to extract just the article content, not the nav/sidebar/footer + + // Try to find the article element + const articleMatch = html.match(ARTICLE_PATTERN); + if (articleMatch) { + return articleMatch[1]; + } + + // Fallback: try to find main content area + const mainMatch = html.match(MAIN_PATTERN); + if (mainMatch) { + return mainMatch[1]; + } + + console.warn( + pc.yellow(" ⚠ Could not find article/main element, using body") + ); + // Last resort: use body content + const bodyMatch = html.match(BODY_PATTERN); + return bodyMatch ? bodyMatch[1] : html; +} + +/** + * Cleans up the extracted HTML before conversion + */ +function cleanHtml(html: string): string { + let cleaned = html; + + // Remove script tags + cleaned = cleaned.replace(//gi, ""); + + // Remove style tags + cleaned = cleaned.replace(//gi, ""); + + // Remove SVG icons (they don't convert well) + cleaned = cleaned.replace(//gi, ""); + + // Remove navigation elements + cleaned = cleaned.replace(//gi, ""); + + // Remove footer elements + cleaned = cleaned.replace(//gi, ""); + + // Remove aside elements (typically sidebars) + cleaned = cleaned.replace(//gi, ""); + + // Remove elements with common non-content classes + cleaned = cleaned.replace( + /<[^>]*(class="[^"]*(?:sidebar|nav|toc|breadcrumb)[^"]*")[^>]*>[\s\S]*?<\/[^>]+>/gi, + "" + ); + + return cleaned; +} + +/** + * Post-processes the markdown output + */ +function cleanMarkdown(markdown: string): string { + let cleaned = markdown; + + // Remove excessive blank lines (more than 2 consecutive) + cleaned = cleaned.replace(/\n{4,}/g, "\n\n\n"); + + // Remove trailing whitespace from lines + cleaned = cleaned.replace(/[ \t]+$/gm, ""); + + // Ensure file ends with single newline + cleaned = `${cleaned.trimEnd()}\n`; + + return cleaned; +} + +/** + * Fetches and converts a single page + */ +async function processPage( + url: string, + outputPath: string +): Promise<{ success: boolean; error?: string }> { + try { + const response = await fetch(url); + + if (!response.ok) { + return { success: false, error: `HTTP ${response.status}` }; + } + + const html = await response.text(); + + // Extract frontmatter from meta tags before processing content + const { title, description } = extractFrontmatter(html); + const frontmatter = formatFrontmatter(title, description); + + const content = extractContent(html); + const cleanedHtml = cleanHtml(content); + const markdown = turndown.turndown(cleanedHtml); + const cleanedMarkdown = cleanMarkdown(markdown); + + // Combine frontmatter with markdown content + const finalMarkdown = frontmatter + cleanedMarkdown; + + // Create directory if needed + await fs.mkdir(path.dirname(outputPath), { recursive: true }); + + // Write the markdown file + await fs.writeFile(outputPath, finalMarkdown, "utf-8"); + + return { success: true }; + } catch (error) { + return { success: false, error: String(error) }; + } +} + +/** + * Discovers all MDX pages and their corresponding routes + */ +async function discoverPages(): Promise< + Array<{ route: string; language: string; outputPath: string }> +> { + const appDir = path.join(__dirname, "..", "app"); + const entries = await fs.readdir(appDir); + + // Find all language directories + const languages = await Promise.all( + entries.map(async (dir: string) => { + if (dir.startsWith("_") || dir === "api") { + return null; + } + const entryPath = path.join(appDir, dir); + const stats = await fs.stat(entryPath); + return stats.isDirectory() ? dir : null; + }) + ).then((results) => results.filter((dir): dir is string => dir !== null)); + + console.log(pc.blue(`📁 Found languages: ${languages.join(", ")}`)); + + const pages: Array<{ route: string; language: string; outputPath: string }> = + []; + + for (const language of languages) { + const searchPath = path.join(appDir, language); + const mdxFiles = glob.sync("**/*.mdx", { + cwd: searchPath, + ignore: ["**/_*.mdx"], + }); + + for (const entry of mdxFiles) { + // Convert file path to route + // e.g., "home/quickstart/page.mdx" -> "/en/home/quickstart" + const routePath = entry + .replace(PAGE_MDX_PATTERN, "") + .replace(MDX_PATTERN, ""); + const route = `/${language}/${routePath}`; + const outputPath = path.join(OUTPUT_DIR, language, `${routePath}.md`); + + pages.push({ route, language, outputPath }); + } + } + + return pages; +} + +/** + * Validates that the generated markdown files contain expected content + */ +async function validateGeneratedContent(): Promise<{ + passed: boolean; + errors: string[]; +}> { + const errors: string[] = []; + + console.log(pc.blue("\n🧪 Running validation tests...\n")); + + // Test 1: Integrations overview should contain links to integrations + const integrationsPath = path.join( + OUTPUT_DIR, + "en", + "resources", + "integrations.md" + ); + try { + const content = await fs.readFile(integrationsPath, "utf-8"); + + // Check for integration links (links to /en/resources/integrations/...) + // The turndown conversion may produce multi-line links, so we look for the URL pattern + const integrationLinkPattern = + /\]\(\/en\/resources\/integrations\/[^)]+\)/g; + const matches = content.match(integrationLinkPattern) || []; + + if (matches.length < MIN_INTEGRATION_LINKS) { + errors.push( + `Integrations page should have many integration links, found only ${matches.length}. ` + + "This suggests the component content was not properly rendered." + ); + } else { + console.log( + pc.green( + ` ✓ Integrations page contains ${matches.length} integration links` + ) + ); + } + + // Also check that raw MDX syntax is NOT present + if (content.includes(" or import statements)" + ); + } else { + console.log(pc.green(" ✓ Integrations page has no raw MDX syntax")); + } + } catch (error) { + errors.push(`Could not read integrations page: ${error}`); + } + + // Test 2: A typical page should not contain JSX/MDX syntax + const quickstartPath = path.join( + OUTPUT_DIR, + "en", + "get-started", + "quickstarts", + "mcp-server-quickstart.md" + ); + try { + const content = await fs.readFile(quickstartPath, "utf-8"); + + // Check for common MDX patterns that should NOT be present + const mdxPatterns = [ + { pattern: IMPORT_STATEMENT_PATTERN, name: "import statements" }, + { pattern: STEPS_COMPONENT_PATTERN, name: " component" }, + { pattern: TABS_COMPONENT_PATTERN, name: " component" }, + { pattern: CALLOUT_COMPONENT_PATTERN, name: " component" }, + { pattern: GUIDE_OVERVIEW_PATTERN, name: " component" }, + ]; + + for (const { pattern, name } of mdxPatterns) { + if (pattern.test(content)) { + errors.push(`Quickstart page still contains ${name}`); + } + } + + if (!mdxPatterns.some(({ pattern }) => pattern.test(content))) { + console.log(pc.green(" ✓ Quickstart page has no raw MDX syntax")); + } + + // Check that actual content is present + if (content.includes("arcade new") && content.includes("uv tool install")) { + console.log( + pc.green(" ✓ Quickstart page contains expected code examples") + ); + } else { + errors.push("Quickstart page is missing expected code examples"); + } + } catch (error) { + errors.push(`Could not read quickstart page: ${error}`); + } + + return { passed: errors.length === 0, errors }; +} + +/** + * Processes all pages in parallel batches + */ +async function processAllPages( + pages: Array<{ route: string; language: string; outputPath: string }> +): Promise<{ successCount: number; errorCount: number }> { + let successCount = 0; + let errorCount = 0; + let batchStart = 0; + + while (batchStart < pages.length) { + const batch = pages.slice(batchStart, batchStart + BATCH_SIZE); + const results = await Promise.all( + batch.map(async (page) => { + const url = `${SERVER_URL}${page.route}`; + const result = await processPage(url, page.outputPath); + return { page, result }; + }) + ); + + for (const { page, result } of results) { + if (result.success) { + successCount += 1; + console.log(pc.gray(` ✓ ${page.route}`)); + } else { + errorCount += 1; + console.log(pc.red(` ✗ ${page.route}: ${result.error}`)); + } + } + batchStart += BATCH_SIZE; + } + + return { successCount, errorCount }; +} + +/** + * Cleans up the server process + */ +async function cleanupServer(server: ChildProcess): Promise { + console.log(pc.blue("🛑 Stopping server...")); + server.kill("SIGTERM"); + + await new Promise((resolve) => setTimeout(resolve, SERVER_CLEANUP_DELAY_MS)); + + if (!server.killed) { + server.kill("SIGKILL"); + } +} + +/** + * Main execution function + */ +async function main() { + console.log(pc.bold(pc.blue("\n🔄 Generating clean markdown files...\n"))); + + let server: ChildProcess | null = null; + + try { + const pages = await discoverPages(); + console.log(pc.green(`✓ Found ${pages.length} pages to process`)); + + server = startServer(); + await waitForServer(`${SERVER_URL}/en/home`); + + console.log(pc.blue("\n📝 Converting pages to markdown...\n")); + const { successCount, errorCount } = await processAllPages(pages); + + console.log(pc.bold(pc.blue("\n📊 Results:"))); + console.log(pc.green(` ✓ Successfully converted: ${successCount}`)); + if (errorCount > 0) { + console.log(pc.red(` ✗ Errors: ${errorCount}`)); + } + console.log(pc.gray(` 📁 Output directory: ${OUTPUT_DIR}`)); + + const validation = await validateGeneratedContent(); + if (!validation.passed) { + console.log(pc.bold(pc.red("\n⚠️ Validation errors:"))); + for (const error of validation.errors) { + console.log(pc.red(` • ${error}`)); + } + console.log( + pc.yellow( + "\nNote: Some validation failures may indicate the HTML extraction needs adjustment." + ) + ); + } + + console.log(pc.bold(pc.green("\n✨ Done!\n"))); + } catch (error) { + console.error(pc.red("\n✗ Error generating markdown:"), error); + process.exit(1); + } finally { + if (server) { + await cleanupServer(server); + } + } +} + +// Run if called directly +main(); diff --git a/scripts/generate-llmstxt.ts b/scripts/generate-llmstxt.ts index 2d5790c51..54f59917d 100644 --- a/scripts/generate-llmstxt.ts +++ b/scripts/generate-llmstxt.ts @@ -27,6 +27,7 @@ type LlmsTxtMetadata = { const BASE_URL = "https://docs.arcade.dev"; const OUTPUT_PATH = path.join(process.cwd(), "public", "llms.txt"); +const CLEAN_MARKDOWN_DIR = path.join(process.cwd(), "public", "_markdown"); // Regex patterns used in path processing const APP_EN_PREFIX_REGEX = /^app\/en\//; @@ -153,11 +154,76 @@ async function extractExistingSummaries(): Promise< } /** - * Discovers all MDX pages in the documentation + * Checks if clean markdown files are available + */ +async function hasCleanMarkdown(): Promise { + const cleanEnDir = path.join(CLEAN_MARKDOWN_DIR, "en"); + try { + const files = await fs.readdir(cleanEnDir); + return files.length > 0; + } catch { + return false; + } +} + +/** + * Discovers all pages in the documentation + * Prefers clean markdown files if available, falls back to raw MDX */ async function discoverPages(): Promise { - console.log(pc.blue("📄 Discovering MDX pages...")); + const useCleanMarkdown = await hasCleanMarkdown(); + + if (useCleanMarkdown) { + console.log(pc.blue("📄 Discovering pages from clean markdown...")); + return discoverCleanMarkdownPages(); + } + + console.log(pc.blue("📄 Discovering pages from raw MDX...")); + console.log(pc.yellow(" ⚠ Clean markdown not found, using raw MDX files")); + console.log( + pc.yellow(' Run "pnpm run generate:markdown" to generate clean files') + ); + return discoverMdxPages(); +} + +/** + * Discovers pages from clean markdown files + */ +async function discoverCleanMarkdownPages(): Promise { + const cleanEnDir = path.join(CLEAN_MARKDOWN_DIR, "en"); + const mdFiles = glob.sync("**/*.md", { + cwd: cleanEnDir, + ignore: ["**/node_modules/**"], + }); + + const pages: PageMetadata[] = []; + + for (const filePath of mdFiles) { + const fullPath = path.join(cleanEnDir, filePath); + const content = await fs.readFile(fullPath, "utf-8"); + + // Convert file path to URL (with .md extension for raw markdown access) + // Clean markdown: "home/quickstart.md" -> "home/quickstart" + const relativePath = filePath.replace(MD_EXTENSION_REGEX, ""); + // Add locale prefix and .md extension for raw markdown access + const url = `${BASE_URL}/en/${relativePath}.md`; + + pages.push({ + path: `public/_markdown/en/${filePath}`, + url, + content, + }); + } + + console.log(pc.green(`✓ Found ${pages.length} pages (clean markdown)`)); + return pages; +} + +/** + * Discovers pages from raw MDX files (fallback) + */ +async function discoverMdxPages(): Promise { const mdxFiles = glob.sync("app/en/**/*.mdx", { cwd: process.cwd(), ignore: ["**/node_modules/**", "**/_*.mdx"], @@ -185,7 +251,7 @@ async function discoverPages(): Promise { }); } - console.log(pc.green(`✓ Found ${pages.length} pages`)); + console.log(pc.green(`✓ Found ${pages.length} pages (raw MDX)`)); return pages; } @@ -196,17 +262,32 @@ async function summarizePage( page: PageMetadata ): Promise<{ title: string; description: string }> { try { + // Determine file extension for title extraction + const isCleanMarkdown = page.path.includes("_markdown"); + const fileExt = isCleanMarkdown ? ".md" : ".mdx"; + // Extract title from content (first H1) const titleMatch = page.content.match(TITLE_H1_REGEX); const title = titleMatch ? titleMatch[1].trim() - : path.basename(page.path, ".mdx"); + : path.basename(page.path, fileExt); // Prepare content for summarization (remove code blocks for better summarization) - const contentForSummary = page.content - .replace(/```[\s\S]*?```/g, "[code block]") - .replace(/import\s+.*from\s+['"].*['"]/g, "") - .slice(0, MAX_CONTENT_LENGTH); + // For clean markdown, we don't need to remove imports (they're already gone) + let contentForSummary = page.content.replace( + /```[\s\S]*?```/g, + "[code block]" + ); + + // Only remove imports if using raw MDX + if (!isCleanMarkdown) { + contentForSummary = contentForSummary.replace( + /import\s+.*from\s+['"].*['"]/g, + "" + ); + } + + contentForSummary = contentForSummary.slice(0, MAX_CONTENT_LENGTH); const response = await openai.chat.completions.create({ model: "gpt-4o-mini", diff --git a/scripts/pagefind.ts b/scripts/pagefind.ts index 0b19c0e09..370f6e786 100644 --- a/scripts/pagefind.ts +++ b/scripts/pagefind.ts @@ -10,65 +10,63 @@ import remarkRehype from "remark-rehype"; const __filename = fileURLToPath(import.meta.url); const __dirname = dirname(__filename); -// Regex patterns for cleaning MDX content -const FRONTMATTER_REGEX = /^---\n[\s\S]*?\n---\n?/m; -const IMPORT_REGEX = /^import\s+.*?from\s+['"].*?['"];?\n?/gm; -const EXPORT_REGEX = /^export\s+(?:const|function|class|default|{).*?;?\n?/gm; -const JSX_SELF_CLOSING_REGEX = /<[A-Z]\w*(?:\s+[^>]*)?\/>/g; -const JSX_COMPONENT_REGEX = /<[A-Z]\w*(?:\s+[^>]*)?>[\s\S]*?<\/[A-Z]\w*>/g; -const JSX_CUSTOM_COMPONENT_REGEX = - /<[A-Z][\w.]*(?:\s+[^>]*)?>[\s\S]*?<\/[A-Z][\w.]*>/g; +// Directory containing pre-generated clean markdown files +const CLEAN_MARKDOWN_DIR = path.join(__dirname, "..", "public", "_markdown"); /** - * Converts MDX content to simple HTML by stripping MDX-specific syntax - * and converting markdown to HTML. Skips what can't be rendered. + * Converts clean markdown to HTML for Pagefind indexing. + * This function expects pre-cleaned markdown (no MDX syntax). */ -async function markdownToHtml(mdxContent: string): Promise { +async function markdownToHtml(markdownContent: string): Promise { try { - let content = mdxContent; - - // Remove frontmatter (---\n...\n---) - content = content.replace(FRONTMATTER_REGEX, ""); - - // Remove import statements - content = content.replace(IMPORT_REGEX, ""); - - // Remove export statements (but keep default exports that might be content) - content = content.replace(EXPORT_REGEX, ""); - - // Remove JSX components (both self-closing and with children) - // This regex matches and ... - content = content.replace(JSX_SELF_CLOSING_REGEX, ""); - content = content.replace(JSX_COMPONENT_REGEX, ""); - - // Remove remaining JSX-like tags that might be custom components - content = content.replace(JSX_CUSTOM_COMPONENT_REGEX, ""); - - // Convert markdown to HTML using remark/rehype (same ecosystem as Nextra) const result = await remark() .use(remarkRehype) .use(rehypeStringify) - .process(content); + .process(markdownContent); return String(result); } catch (error) { - // If markdown parsing fails, return the cleaned content as plain text - // This ensures we still index the content even if HTML conversion fails console.warn( `Warning: Failed to convert markdown to HTML, using plain text: ${error}` ); - // Return the cleaned content (without MDX syntax) as fallback - let cleaned = mdxContent; - cleaned = cleaned.replace(FRONTMATTER_REGEX, ""); - cleaned = cleaned.replace(IMPORT_REGEX, ""); - cleaned = cleaned.replace(EXPORT_REGEX, ""); - cleaned = cleaned.replace(JSX_SELF_CLOSING_REGEX, ""); - cleaned = cleaned.replace(JSX_COMPONENT_REGEX, ""); - cleaned = cleaned.replace(JSX_CUSTOM_COMPONENT_REGEX, ""); - return cleaned; + return markdownContent; } } +/** + * Checks if clean markdown files exist and returns the appropriate source directory + */ +async function getMarkdownSource(language: string): Promise<{ + dir: string; + pattern: string; + isClean: boolean; +}> { + const cleanDir = path.join(CLEAN_MARKDOWN_DIR, language); + + try { + await fs.access(cleanDir); + const files = await fs.readdir(cleanDir); + if (files.length > 0) { + return { dir: cleanDir, pattern: "**/*.md", isClean: true }; + } + } catch { + // Clean markdown directory doesn't exist + } + + // Fallback to raw MDX (with warning) + console.warn( + `⚠️ Clean markdown not found for ${language}, falling back to raw MDX` + ); + console.warn( + ` Run "pnpm run generate:markdown" first to generate clean files` + ); + return { + dir: path.join(__dirname, "..", "app", language), + pattern: "**/*.mdx", + isClean: false, + }; +} + const { index } = await createIndex(); if (!index) { throw new Error("Failed to create index"); @@ -95,15 +93,28 @@ let page_count = 0; console.log("Building search index for languages: ", languages.join(", ")); for (const language of languages) { - const searchPath = path.join(__dirname, "..", "app", language); - - console.log(`Adding directory: ${searchPath}`); + const source = await getMarkdownSource(language); + + console.log( + `Adding directory: ${source.dir} (${source.isClean ? "clean markdown" : "raw MDX"})` + ); + + for (const entry of glob.sync(source.pattern, { cwd: source.dir })) { + const filePath = path.join(source.dir, entry); + + // Build URL from file path + // Clean markdown: "home/quickstart.md" -> "/en/home/quickstart" + // Raw MDX: "home/quickstart/page.mdx" -> "/en/home/quickstart" + let urlPath: string; + if (source.isClean) { + urlPath = entry.replace(/\.md$/, ""); + } else { + urlPath = entry.split("/page.mdx")[0]; + } + const url = `/${language}/${urlPath}`; - for (const entry of glob.sync("**/*.mdx", { cwd: searchPath })) { - const filePath = path.join(searchPath, entry); - const url = `/${language}/${entry.split("/page.mdx")[0]}`; - const mdxContent = await fs.readFile(filePath, "utf-8"); - const htmlContent = await markdownToHtml(mdxContent); + const markdownContent = await fs.readFile(filePath, "utf-8"); + const htmlContent = await markdownToHtml(markdownContent); const { errors, file } = await index.addHTMLFile({ url, From 128e0f79bd93b044a24ab25b42c99fb0d10c2d39 Mon Sep 17 00:00:00 2001 From: Rachel Lee Nabors Date: Wed, 4 Feb 2026 00:42:46 +0000 Subject: [PATCH 2/9] Update next-env.d.ts after production build Co-Authored-By: Claude Opus 4.5 --- next-env.d.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/next-env.d.ts b/next-env.d.ts index c4b7818fb..9edff1c7c 100644 --- a/next-env.d.ts +++ b/next-env.d.ts @@ -1,6 +1,6 @@ /// /// -import "./.next/dev/types/routes.d.ts"; +import "./.next/types/routes.d.ts"; // NOTE: This file should not be edited // see https://nextjs.org/docs/app/api-reference/config/typescript for more information. From e33abf6b6f24bd04fc9d536f1ebb93f8886b7690 Mon Sep 17 00:00:00 2001 From: Rachel Lee Nabors Date: Thu, 5 Feb 2026 00:20:34 +0100 Subject: [PATCH 3/9] Fix hanging process and add HTML cleaning validation - Add explicit process.exit(0) to ensure script terminates after completion (event listeners on spawned server kept event loop alive) - Add validation test for HTML element removal (script, style, svg, nav, footer, aside) - Refactor validateGeneratedContent into smaller helper functions to reduce complexity - Increase MIN_INTEGRATION_LINKS threshold from 5 to 10 Co-Authored-By: Claude Opus 4.5 --- scripts/generate-clean-markdown.ts | 97 ++++++++++++++++++++++-------- 1 file changed, 73 insertions(+), 24 deletions(-) diff --git a/scripts/generate-clean-markdown.ts b/scripts/generate-clean-markdown.ts index 7005f5bda..5df031604 100644 --- a/scripts/generate-clean-markdown.ts +++ b/scripts/generate-clean-markdown.ts @@ -18,7 +18,7 @@ const RETRY_DELAY_MS = 1000; const BATCH_SIZE = 10; const SERVER_CLEANUP_DELAY_MS = 500; const HTTP_NOT_FOUND = 404; -const MIN_INTEGRATION_LINKS = 5; +const MIN_INTEGRATION_LINKS = 10; const MAX_DOTFILE_LENGTH = 20; const MAX_CHILD_TEXT_LENGTH = 50; const PARENT_SEARCH_DEPTH = 4; @@ -42,6 +42,14 @@ const TABS_COMPONENT_PATTERN = /]/g; const CALLOUT_COMPONENT_PATTERN = /]/g; const GUIDE_OVERVIEW_PATTERN = /]/g; +// HTML element patterns that should be removed during cleaning +const HTML_SCRIPT_PATTERN = /]/gi; +const HTML_STYLE_PATTERN = /]/gi; +const HTML_SVG_PATTERN = /]/gi; +const HTML_NAV_PATTERN = /]/gi; +const HTML_FOOTER_PATTERN = /]/gi; +const HTML_ASIDE_PATTERN = /]/gi; + // Meta tag extraction patterns const TITLE_PATTERN = /]*>([^<]*)<\/title>/i; const META_DESCRIPTION_PATTERN = @@ -745,17 +753,9 @@ async function discoverPages(): Promise< } /** - * Validates that the generated markdown files contain expected content + * Validates that the integrations page has proper links */ -async function validateGeneratedContent(): Promise<{ - passed: boolean; - errors: string[]; -}> { - const errors: string[] = []; - - console.log(pc.blue("\n🧪 Running validation tests...\n")); - - // Test 1: Integrations overview should contain links to integrations +async function validateIntegrationsPage(errors: string[]): Promise { const integrationsPath = path.join( OUTPUT_DIR, "en", @@ -764,9 +764,6 @@ async function validateGeneratedContent(): Promise<{ ); try { const content = await fs.readFile(integrationsPath, "utf-8"); - - // Check for integration links (links to /en/resources/integrations/...) - // The turndown conversion may produce multi-line links, so we look for the URL pattern const integrationLinkPattern = /\]\(\/en\/resources\/integrations\/[^)]+\)/g; const matches = content.match(integrationLinkPattern) || []; @@ -784,7 +781,6 @@ async function validateGeneratedContent(): Promise<{ ); } - // Also check that raw MDX syntax is NOT present if (content.includes(" or import statements)" @@ -795,8 +791,12 @@ async function validateGeneratedContent(): Promise<{ } catch (error) { errors.push(`Could not read integrations page: ${error}`); } +} - // Test 2: A typical page should not contain JSX/MDX syntax +/** + * Validates that the quickstart page has no raw MDX syntax + */ +async function validateQuickstartPage(errors: string[]): Promise { const quickstartPath = path.join( OUTPUT_DIR, "en", @@ -806,8 +806,6 @@ async function validateGeneratedContent(): Promise<{ ); try { const content = await fs.readFile(quickstartPath, "utf-8"); - - // Check for common MDX patterns that should NOT be present const mdxPatterns = [ { pattern: IMPORT_STATEMENT_PATTERN, name: "import statements" }, { pattern: STEPS_COMPONENT_PATTERN, name: " component" }, @@ -816,17 +814,17 @@ async function validateGeneratedContent(): Promise<{ { pattern: GUIDE_OVERVIEW_PATTERN, name: " component" }, ]; - for (const { pattern, name } of mdxPatterns) { - if (pattern.test(content)) { - errors.push(`Quickstart page still contains ${name}`); - } + const foundPatterns = mdxPatterns.filter(({ pattern }) => + pattern.test(content) + ); + for (const { name } of foundPatterns) { + errors.push(`Quickstart page still contains ${name}`); } - if (!mdxPatterns.some(({ pattern }) => pattern.test(content))) { + if (foundPatterns.length === 0) { console.log(pc.green(" ✓ Quickstart page has no raw MDX syntax")); } - // Check that actual content is present if (content.includes("arcade new") && content.includes("uv tool install")) { console.log( pc.green(" ✓ Quickstart page contains expected code examples") @@ -837,6 +835,53 @@ async function validateGeneratedContent(): Promise<{ } catch (error) { errors.push(`Could not read quickstart page: ${error}`); } +} + +/** + * Validates that the home page has HTML elements properly cleaned + */ +async function validateHtmlCleaning(errors: string[]): Promise { + const homePath = path.join(OUTPUT_DIR, "en", "home.md"); + try { + const content = await fs.readFile(homePath, "utf-8"); + const htmlPatterns = [ + { pattern: HTML_SCRIPT_PATTERN, name: "