diff --git a/.github/scripts/update-example-dates.js b/.github/scripts/update-example-dates.js index 0e965c004..312edbcd6 100644 --- a/.github/scripts/update-example-dates.js +++ b/.github/scripts/update-example-dates.js @@ -97,7 +97,7 @@ async function updateExampleDates() { // Read the current MDX file const mdxPath = path.join( - __dirname, + import.meta.dirname, "../../app/en/resources/examples/page.mdx" ); let content = fs.readFileSync(mdxPath, "utf8"); diff --git a/.gitignore b/.gitignore index c714c9723..47b247816 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ node_modules .DS_Store .env.local public/sitemap*.xml +public/_markdown/ .env _pagefind/ diff --git a/app/_components/copy-page-override.tsx b/app/_components/copy-page-override.tsx new file mode 100644 index 000000000..f89dc528a --- /dev/null +++ b/app/_components/copy-page-override.tsx @@ -0,0 +1,101 @@ +"use client"; + +import { usePathname } from "next/navigation"; +import { useCallback, useEffect } from "react"; + +const COPY_FEEDBACK_DELAY_MS = 2000; +const COPY_BUTTON_TEXT = "Copy page"; +const COPIED_TEXT = "Copied"; +const DROPDOWN_IDENTIFIER = "Markdown for LLMs"; + +/** + * This component overrides the default nextra-theme-docs "Copy page" button behavior + * to fetch clean markdown from our API instead of copying raw MDX source. + */ +export function CopyPageOverride() { + const pathname = usePathname(); + + const fetchAndCopyMarkdown = useCallback(async (): Promise => { + try { + const markdownUrl = `/api/markdown${pathname}.md`; + const response = await fetch(markdownUrl); + + if (!response.ok) { + throw new Error(`Failed to fetch markdown: ${response.status}`); + } + + const markdown = await response.text(); + await navigator.clipboard.writeText(markdown); + return true; + } catch { + return false; + } + }, [pathname]); + + useEffect(() => { + const isCopyButton = (button: HTMLButtonElement): boolean => { + const text = button.textContent || ""; + return text.includes(COPY_BUTTON_TEXT) || text.includes(COPIED_TEXT); + }; + + const updateButtonFeedback = (button: HTMLButtonElement): void => { + const textNodes = button.querySelectorAll("*"); + for (const node of textNodes) { + if (node.textContent === COPY_BUTTON_TEXT) { + node.textContent = COPIED_TEXT; + setTimeout(() => { + node.textContent = COPY_BUTTON_TEXT; + }, COPY_FEEDBACK_DELAY_MS); + return; + } + } + }; + + const handleButtonClick = async (event: MouseEvent): Promise => { + const target = event.target as HTMLElement; + const button = target.closest("button") as HTMLButtonElement | null; + + if (!(button && isCopyButton(button))) { + return; + } + + event.preventDefault(); + event.stopPropagation(); + + const success = await fetchAndCopyMarkdown(); + if (success) { + updateButtonFeedback(button); + } + }; + + const handleDropdownClick = async (event: MouseEvent): Promise => { + const target = event.target as HTMLElement; + const option = target.closest('[role="option"]'); + const optionText = option?.textContent || ""; + + const isDropdownCopyOption = + optionText.includes(COPY_BUTTON_TEXT) && + optionText.includes(DROPDOWN_IDENTIFIER); + + if (!isDropdownCopyOption) { + return; + } + + event.preventDefault(); + event.stopPropagation(); + + await fetchAndCopyMarkdown(); + document.body.click(); + }; + + document.addEventListener("click", handleButtonClick, true); + document.addEventListener("click", handleDropdownClick, true); + + return () => { + document.removeEventListener("click", handleButtonClick, true); + document.removeEventListener("click", handleDropdownClick, true); + }; + }, [fetchAndCopyMarkdown]); + + return null; +} diff --git a/app/_components/custom-layout.tsx b/app/_components/custom-layout.tsx index 3051c36f6..1e048ff46 100644 --- a/app/_components/custom-layout.tsx +++ b/app/_components/custom-layout.tsx @@ -1,4 +1,5 @@ import type React from "react"; +import { CopyPageOverride } from "@/app/_components/copy-page-override"; import { PlaceholderReplacer } from "@/app/_components/placeholder-replacer"; import { OrySessionProvider } from "@/app/_lib/ory-session-context"; @@ -7,6 +8,7 @@ const CustomLayout: React.FC<{ children: React.ReactNode }> = ({ }) => ( +
{children}
); diff --git a/app/api/markdown/[[...slug]]/route.ts b/app/api/markdown/[[...slug]]/route.ts index 61006c812..b9f3997f4 100644 --- a/app/api/markdown/[[...slug]]/route.ts +++ b/app/api/markdown/[[...slug]]/route.ts @@ -7,6 +7,9 @@ export const dynamic = "force-dynamic"; // Regex pattern for removing .md extension const MD_EXTENSION_REGEX = /\.md$/; +// Directory containing pre-generated clean markdown files +const CLEAN_MARKDOWN_DIR = join(process.cwd(), "public", "_markdown"); + export async function GET( request: NextRequest, _context: { params: Promise<{ slug?: string[] }> } @@ -17,28 +20,48 @@ export async function GET( // Remove /api/markdown prefix to get the original path const originalPath = url.pathname.replace("/api/markdown", ""); - // Remove .md extension + // Remove .md extension if present const pathWithoutMd = originalPath.replace(MD_EXTENSION_REGEX, ""); - // Map URL to file path + // Try clean markdown first (preferred) + // e.g., /en/home/quickstart -> public/_markdown/en/home/quickstart.md + const cleanMarkdownPath = join(CLEAN_MARKDOWN_DIR, `${pathWithoutMd}.md`); + + try { + await access(cleanMarkdownPath); + const content = await readFile(cleanMarkdownPath, "utf-8"); + + return new NextResponse(content, { + status: 200, + headers: { + "Content-Type": "text/plain; charset=utf-8", + "Content-Disposition": "inline", + "Cache-Control": "public, max-age=3600", // Cache for 1 hour + }, + }); + } catch { + // Clean markdown not found, fall back to raw MDX + } + + // Fallback: serve raw MDX (for backwards compatibility or if clean files not generated) // e.g., /en/home/quickstart -> app/en/home/quickstart/page.mdx - const filePath = join(process.cwd(), "app", `${pathWithoutMd}/page.mdx`); + const rawMdxPath = join(process.cwd(), "app", `${pathWithoutMd}/page.mdx`); - // Check if file exists try { - await access(filePath); + await access(rawMdxPath); } catch { return new NextResponse("Markdown file not found", { status: 404 }); } - const content = await readFile(filePath, "utf-8"); + const content = await readFile(rawMdxPath, "utf-8"); - // Return the raw markdown with proper headers + // Return the raw MDX with a warning header return new NextResponse(content, { status: 200, headers: { "Content-Type": "text/plain; charset=utf-8", "Content-Disposition": "inline", + "X-Content-Source": "raw-mdx", // Indicate this is raw MDX, not clean markdown }, }); } catch (error) { diff --git a/next-env.d.ts b/next-env.d.ts index c4b7818fb..9edff1c7c 100644 --- a/next-env.d.ts +++ b/next-env.d.ts @@ -1,6 +1,6 @@ /// /// -import "./.next/dev/types/routes.d.ts"; +import "./.next/types/routes.d.ts"; // NOTE: This file should not be edited // see https://nextjs.org/docs/app/api-reference/config/typescript for more information. diff --git a/package.json b/package.json index 966dc5a36..9181c02dc 100644 --- a/package.json +++ b/package.json @@ -10,7 +10,8 @@ "lint": "pnpm dlx ultracite check", "format": "pnpm dlx ultracite fix", "prepare": "husky install", - "postbuild": "pnpm run custompagefind", + "postbuild": "pnpm run generate:markdown && pnpm run custompagefind", + "generate:markdown": "pnpm dlx tsx scripts/generate-clean-markdown.ts", "translate": "pnpm dlx tsx scripts/i18n-sync/index.ts && pnpm format", "sync:metas": "pnpm dlx tsx scripts/sync-metas.ts app/en", "llmstxt": "pnpm dlx tsx scripts/generate-llmstxt.ts", @@ -74,6 +75,7 @@ "@types/react": "19.2.7", "@types/react-dom": "19.2.3", "@types/react-syntax-highlighter": "15.5.13", + "@types/turndown": "^5.0.6", "@types/unist": "3.0.3", "commander": "14.0.2", "dotenv": "^17.2.3", @@ -90,6 +92,7 @@ "remark": "^15.0.1", "remark-rehype": "^11.1.2", "tailwindcss": "4.1.16", + "turndown": "^7.2.2", "typescript": "5.9.3", "ultracite": "6.1.0", "vitest": "4.0.5", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 9864f51d7..8b1458165 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -105,6 +105,9 @@ importers: '@types/react-syntax-highlighter': specifier: 15.5.13 version: 15.5.13 + '@types/turndown': + specifier: ^5.0.6 + version: 5.0.6 '@types/unist': specifier: 3.0.3 version: 3.0.3 @@ -153,6 +156,9 @@ importers: tailwindcss: specifier: 4.1.16 version: 4.1.16 + turndown: + specifier: ^7.2.2 + version: 7.2.2 typescript: specifier: 5.9.3 version: 5.9.3 @@ -655,6 +661,9 @@ packages: '@mermaid-js/parser@0.6.3': resolution: {integrity: sha512-lnjOhe7zyHjc+If7yT4zoedx2vo4sHaTmtkl1+or8BRTnCtDmcTpAjpzDSfCZrshM5bCoz0GyidzadJAH1xobA==} + '@mixmark-io/domino@2.2.0': + resolution: {integrity: sha512-Y28PR25bHXUg88kCV7nivXrP2Nj2RueZ3/l/jdx6J9f8J4nsEGcgX0Qe6lt7Pa+J79+kPiJU3LguR6O/6zrLOw==} + '@napi-rs/simple-git-android-arm-eabi@0.1.22': resolution: {integrity: sha512-JQZdnDNm8o43A5GOzwN/0Tz3CDBQtBUNqzVwEopm32uayjdjxev1Csp1JeaqF3v9djLDIvsSE39ecsN2LhCKKQ==} engines: {node: '>= 10'} @@ -2177,6 +2186,9 @@ packages: '@types/trusted-types@2.0.7': resolution: {integrity: sha512-ScaPdn1dQczgbl0QFTeTOmVHFULt394XJgOQNoyVhZ6r2vLnMLJfBPd53SB52T/3G36VI1/g2MZaX0cwDuXsfw==} + '@types/turndown@5.0.6': + resolution: {integrity: sha512-ru00MoyeeouE5BX4gRL+6m/BsDfbRayOskWqUvh7CLGW+UXxHQItqALa38kKnOiZPqJrtzJUgAC2+F0rL1S4Pg==} + '@types/unist@2.0.11': resolution: {integrity: sha512-CmBKiL6NNo/OqgmMn95Fk9Whlp2mtvIv+KNpQKN2F4SjvrEesubTRWGYSg+BnWZOnlCaSTU1sMpsBOzgbYhnsA==} @@ -4424,6 +4436,9 @@ packages: tslib@2.8.1: resolution: {integrity: sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==} + turndown@7.2.2: + resolution: {integrity: sha512-1F7db8BiExOKxjSMU2b7if62D/XOyQyZbPKq/nUwopfgnHlqXHqQ0lvfUTeUIr1lZJzOPFn43dODyMSIfvWRKQ==} + twoslash-protocol@0.3.4: resolution: {integrity: sha512-HHd7lzZNLUvjPzG/IE6js502gEzLC1x7HaO1up/f72d8G8ScWAs9Yfa97igelQRDl5h9tGcdFsRp+lNVre1EeQ==} @@ -5178,6 +5193,8 @@ snapshots: dependencies: langium: 3.3.1 + '@mixmark-io/domino@2.2.0': {} + '@napi-rs/simple-git-android-arm-eabi@0.1.22': optional: true @@ -6950,6 +6967,8 @@ snapshots: '@types/trusted-types@2.0.7': optional: true + '@types/turndown@5.0.6': {} + '@types/unist@2.0.11': {} '@types/unist@3.0.3': {} @@ -9795,6 +9814,10 @@ snapshots: tslib@2.8.1: {} + turndown@7.2.2: + dependencies: + '@mixmark-io/domino': 2.2.0 + twoslash-protocol@0.3.4: {} twoslash@0.3.4(typescript@5.9.3): diff --git a/scripts/generate-clean-markdown.ts b/scripts/generate-clean-markdown.ts new file mode 100644 index 000000000..5df031604 --- /dev/null +++ b/scripts/generate-clean-markdown.ts @@ -0,0 +1,992 @@ +import { type ChildProcess, spawn } from "node:child_process"; +import fs from "node:fs/promises"; +import path, { dirname } from "node:path"; +import { fileURLToPath } from "node:url"; +import glob from "fast-glob"; +import pc from "picocolors"; +import TurndownService from "turndown"; + +const __filename = fileURLToPath(import.meta.url); +const __dirname = dirname(__filename); + +// Configuration constants +const SERVER_PORT = 3456; +const SERVER_URL = `http://localhost:${SERVER_PORT}`; +const OUTPUT_DIR = path.join(__dirname, "..", "public", "_markdown"); +const MAX_RETRIES = 30; +const RETRY_DELAY_MS = 1000; +const BATCH_SIZE = 10; +const SERVER_CLEANUP_DELAY_MS = 500; +const HTTP_NOT_FOUND = 404; +const MIN_INTEGRATION_LINKS = 10; +const MAX_DOTFILE_LENGTH = 20; +const MAX_CHILD_TEXT_LENGTH = 50; +const PARENT_SEARCH_DEPTH = 4; +const LABEL_SEARCH_DEPTH = 3; + +// Regex patterns at module level for performance +const FILENAME_PATTERN = + /^[\w.-]+\.(py|ts|js|tsx|jsx|json|yaml|yml|toml|env|md|html|css|sql|sh|bash|go|rs|java|rb|php|swift|kt|cs|cpp|c|h|xml|ini|cfg|conf)$/i; +const DOTFILE_PATTERN = /^\.[a-z]+$/i; +const LANGUAGE_CLASS_PATTERN = /language-(\w+)/; +const ARTICLE_PATTERN = /]*>([\s\S]*?)<\/article>/i; +const MAIN_PATTERN = /]*>([\s\S]*?)<\/main>/i; +const BODY_PATTERN = /]*>([\s\S]*?)<\/body>/i; +const PAGE_MDX_PATTERN = /\/page\.mdx$/; +const MDX_PATTERN = /\.mdx$/; + +// Validation regex patterns +const IMPORT_STATEMENT_PATTERN = /^import\s+/m; +const STEPS_COMPONENT_PATTERN = /|<\/Steps>/g; +const TABS_COMPONENT_PATTERN = /]/g; +const CALLOUT_COMPONENT_PATTERN = /]/g; +const GUIDE_OVERVIEW_PATTERN = /]/g; + +// HTML element patterns that should be removed during cleaning +const HTML_SCRIPT_PATTERN = /]/gi; +const HTML_STYLE_PATTERN = /]/gi; +const HTML_SVG_PATTERN = /]/gi; +const HTML_NAV_PATTERN = /]/gi; +const HTML_FOOTER_PATTERN = /]/gi; +const HTML_ASIDE_PATTERN = /]/gi; + +// Meta tag extraction patterns +const TITLE_PATTERN = /]*>([^<]*)<\/title>/i; +const META_DESCRIPTION_PATTERN = + / = { + terminal: "bash", + bash: "bash", + shell: "bash", + sh: "bash", + zsh: "bash", + python: "python", + py: "python", + typescript: "typescript", + ts: "typescript", + javascript: "javascript", + js: "javascript", + json: "json", + env: "bash", + yaml: "yaml", + yml: "yaml", + html: "html", + css: "css", + sql: "sql", + graphql: "graphql", + rust: "rust", + go: "go", + java: "java", + ruby: "ruby", + php: "php", + csharp: "csharp", + "c#": "csharp", + cpp: "cpp", + "c++": "cpp", + c: "c", + swift: "swift", + kotlin: "kotlin", + markdown: "markdown", + md: "markdown", + toml: "toml", + ini: "ini", + xml: "xml", + }; + return map[label.toLowerCase()] || ""; +} + +/** + * Language labels that appear as orphan text before code blocks + */ +const LANGUAGE_LABELS = new Set([ + "terminal", + "bash", + "shell", + "sh", + "zsh", + "python", + "py", + "typescript", + "ts", + "javascript", + "js", + "json", + "yaml", + "yml", + "toml", + "env", + "ini", + "xml", + "html", + "css", + "sql", + "graphql", + "rust", + "go", + "java", + "ruby", + "php", + "c#", + "csharp", + "c++", + "cpp", + "c", + "swift", + "kotlin", + "markdown", + "md", +]); + +/** + * Gets comment prefix for a language + */ +function getCommentPrefix(language: string): string { + const hashComment = ["bash", "python", "ruby", "yaml", "toml", "shell"]; + const slashComment = [ + "typescript", + "javascript", + "java", + "go", + "rust", + "swift", + "kotlin", + "csharp", + "cpp", + "c", + ]; + + if (hashComment.includes(language)) { + return "# "; + } + if (slashComment.includes(language)) { + return "// "; + } + if (language === "html" || language === "xml") { + return ""; + } + if (language === "css") { + return " */"; + } + return ""; +} + +/** + * Checks if text matches a filename pattern + */ +function isFilename(text: string): boolean { + const trimmed = text.trim(); + // Match common filename patterns like main.py, example.ts + if (FILENAME_PATTERN.test(trimmed)) { + return true; + } + // Match dotfiles like .env, .gitignore + if ( + DOTFILE_PATTERN.test(trimmed) && + trimmed.length > 1 && + trimmed.length < MAX_DOTFILE_LENGTH + ) { + return true; + } + return false; +} + +/** + * Recursively searches for filename text in a node tree + */ +function findFilenameInNode(node: Node): string | null { + // Check if this node's text content is a filename + const text = node.textContent?.trim() || ""; + if (isFilename(text)) { + return text; + } + + // Check child nodes + for (const child of Array.from(node.childNodes)) { + // Only check text nodes or elements with short text content + const childText = child.textContent?.trim() || ""; + if (childText.length < MAX_CHILD_TEXT_LENGTH && isFilename(childText)) { + return childText; + } + } + + return null; +} + +/** + * Finds filename text near a code block element + * Looks for patterns like "main.py", "example.ts", ".env" etc. + */ +function findFilename(node: Node): string | null { + // Look in parent structure for filename-like text + let parent = (node as Element).parentElement; + let depth = 0; + while (parent && depth < PARENT_SEARCH_DEPTH) { + const filename = findFilenameInNode(parent); + if (filename) { + return filename; + } + parent = parent.parentElement; + depth += 1; + } + return null; +} + +// Custom rules for better code block handling +// Nextra wraps code in:
buttons
...
+// Language labels appear in parent structure (e.g., "Terminal", "Python", "TypeScript") +turndown.addRule("fencedCodeBlock", { + filter: (node) => { + if (node.nodeName !== "PRE") { + return false; + } + // Find CODE element anywhere inside PRE (not just as first child) + const codeElement = findCodeElement(node); + return codeElement !== null; + }, + replacement: (_content, node) => { + const codeElement = findCodeElement(node); + if (!codeElement) { + return _content; + } + let code = codeElement.textContent || ""; + + // Try to extract language from various sources + let language = ""; + + // 1. Check code element class (e.g., "language-typescript") + const codeClassName = codeElement.getAttribute("class") || ""; + const langMatch = codeClassName.match(LANGUAGE_CLASS_PATTERN); + if (langMatch) { + language = langMatch[1]; + } + + // 2. Look for language label in parent structure + // Nextra code blocks have labels like "Terminal", "Python", etc. + if (!language) { + const labels = [ + "Terminal", + "Bash", + "Shell", + "Python", + "TypeScript", + "JavaScript", + "JSON", + "YAML", + "TOML", + "ENV", + "HTML", + "CSS", + "SQL", + "GraphQL", + "Rust", + "Go", + "Java", + "Ruby", + "PHP", + "C#", + "C++", + "C", + "Swift", + "Kotlin", + "Markdown", + "XML", + ]; + + // Check parent and grandparent for label text + let parent = (node as Element).parentElement; + let depth = 0; + while (parent && !language && depth < LABEL_SEARCH_DEPTH) { + const foundLabel = findElementWithText(parent, labels); + if (foundLabel) { + language = labelToLanguage(foundLabel); + break; + } + parent = parent.parentElement; + depth += 1; + } + } + + // 3. Try to find filename and add as comment + const filename = findFilename(node); + if (filename) { + const prefix = getCommentPrefix(language || "bash"); + const suffix = getCommentSuffix(language || "bash"); + code = `${prefix}${filename}${suffix}\n${code}`; + } + + return `\n\n\`\`\`${language}\n${code}\n\`\`\`\n\n`; + }, +}); + +// Remove copy buttons and other interactive elements +turndown.addRule("removeButtons", { + filter: (node) => { + if (node.nodeName === "BUTTON") { + return true; + } + if (node.nodeName === "DIV") { + const className = node.getAttribute("class"); + if (className?.includes("copy-button")) { + return true; + } + } + return false; + }, + replacement: () => "", +}); + +// Remove orphan language labels and filenames that appear before code blocks +// These are standalone paragraphs containing just "Terminal", "Python", "main.py", etc. +turndown.addRule("removeOrphanLabels", { + filter: (node) => { + if ( + node.nodeName !== "P" && + node.nodeName !== "SPAN" && + node.nodeName !== "DIV" + ) { + return false; + } + const text = node.textContent?.trim() || ""; + // Check if it's just a language label + if (LANGUAGE_LABELS.has(text.toLowerCase())) { + return true; + } + // Check if it's a filename (will be added as comment in code block) + if (isFilename(text)) { + return true; + } + return false; + }, + replacement: () => "", +}); + +// Clean up links - collapse whitespace in link text +turndown.addRule("cleanLinks", { + filter: "a", + replacement: (content, node) => { + const element = node as Element; + let href = element.getAttribute("href"); + if (!href) { + return content; + } + // Collapse multiple whitespace/newlines into single space and trim + const cleanedContent = content.replace(/\s+/g, " ").trim(); + // Skip empty links + if (!cleanedContent) { + return ""; + } + + // Add .md extension to internal links (so they point to markdown, not HTML) + // Internal links start with / but not // (protocol-relative) + // Don't add .md if it already has an extension or is an anchor-only link + if ( + href.startsWith("/") && + !href.startsWith("//") && + !href.includes(".") && + !href.startsWith("/#") + ) { + // Handle links with anchors (e.g., /page#section -> /page.md#section) + const hashIndex = href.indexOf("#"); + if (hashIndex > 0) { + href = `${href.slice(0, hashIndex)}.md${href.slice(hashIndex)}`; + } else { + href += ".md"; + } + } + + // Check if this is a standalone link (in a grid/list of links) + // by looking at the parent and sibling structure + const parent = element.parentNode; + const isInParagraph = parent?.nodeName === "P"; + const isInlineLink = isInParagraph && parent?.childNodes.length > 1; + + // For standalone links (like card grids), add newline for readability + // For inline links (in paragraphs with other text), don't add newline + if (isInlineLink) { + return `[${cleanedContent}](${href})`; + } + return `[${cleanedContent}](${href})\n`; + }, +}); + +/** + * Waits for the server to be ready + */ +async function waitForServer(url: string): Promise { + console.log(pc.blue(`โณ Waiting for server at ${url}...`)); + + let retries = 0; + while (retries < MAX_RETRIES) { + try { + const response = await fetch(url); + if (response.ok || response.status === HTTP_NOT_FOUND) { + console.log(pc.green("โœ“ Server is ready")); + return; + } + } catch { + // Server not ready yet + } + await new Promise((resolve) => setTimeout(resolve, RETRY_DELAY_MS)); + retries += 1; + } + + throw new Error( + `Server at ${url} did not become ready after ${MAX_RETRIES} retries` + ); +} + +/** + * Starts the Next.js production server + */ +function startServer(): ChildProcess { + console.log(pc.blue("๐Ÿš€ Starting production server...")); + + // Use npx to run next start directly with port argument + const server = spawn( + "npx", + ["next", "start", "--port", String(SERVER_PORT)], + { + cwd: path.join(__dirname, ".."), + stdio: ["ignore", "pipe", "pipe"], + detached: false, + } + ); + + // Log server output for debugging + server.stdout?.on("data", (data: Buffer) => { + const output = data.toString(); + if (output.includes("Ready") || output.includes("started")) { + console.log(pc.gray(` Server: ${output.trim()}`)); + } + }); + + server.stderr?.on("data", (data: Buffer) => { + const output = data.toString(); + // Filter out noisy warnings + if (!output.includes("ExperimentalWarning")) { + console.error(pc.yellow(` Server stderr: ${output.trim()}`)); + } + }); + + return server; +} + +/** + * Extracts frontmatter data from HTML meta tags + */ +function extractFrontmatter(html: string): { + title: string; + description: string; +} { + // Extract title + const titleMatch = html.match(TITLE_PATTERN); + let title = titleMatch?.[1]?.trim() || ""; + // Remove common suffixes like "| Arcade Docs" or " - Arcade" + title = title.replace(TITLE_SUFFIX_PATTERN, "").trim(); + + // Extract description (try both attribute orders) + let description = ""; + const descMatch = html.match(META_DESCRIPTION_PATTERN); + if (descMatch) { + description = descMatch[1].trim(); + } else { + const descAltMatch = html.match(META_DESCRIPTION_ALT_PATTERN); + if (descAltMatch) { + description = descAltMatch[1].trim(); + } + } + + return { title, description }; +} + +/** + * Formats frontmatter as YAML + */ +function formatFrontmatter(title: string, description: string): string { + if (!(title || description)) { + return ""; + } + + const lines = ["---"]; + if (title) { + // Escape quotes in YAML values + const escapedTitle = title.replace(/"/g, '\\"'); + lines.push(`title: "${escapedTitle}"`); + } + if (description) { + const escapedDesc = description.replace(/"/g, '\\"'); + lines.push(`description: "${escapedDesc}"`); + } + lines.push("---", ""); + + return lines.join("\n"); +} + +/** + * Extracts the main content from the HTML page + */ +function extractContent(html: string): string { + // Nextra wraps the main content in an
element + // We need to extract just the article content, not the nav/sidebar/footer + + // Try to find the article element + const articleMatch = html.match(ARTICLE_PATTERN); + if (articleMatch) { + return articleMatch[1]; + } + + // Fallback: try to find main content area + const mainMatch = html.match(MAIN_PATTERN); + if (mainMatch) { + return mainMatch[1]; + } + + console.warn( + pc.yellow(" โš  Could not find article/main element, using body") + ); + // Last resort: use body content + const bodyMatch = html.match(BODY_PATTERN); + return bodyMatch ? bodyMatch[1] : html; +} + +/** + * Cleans up the extracted HTML before conversion + */ +function cleanHtml(html: string): string { + let cleaned = html; + + // Remove script tags + cleaned = cleaned.replace(//gi, ""); + + // Remove style tags + cleaned = cleaned.replace(//gi, ""); + + // Remove SVG icons (they don't convert well) + cleaned = cleaned.replace(//gi, ""); + + // Remove navigation elements + cleaned = cleaned.replace(//gi, ""); + + // Remove footer elements + cleaned = cleaned.replace(//gi, ""); + + // Remove aside elements (typically sidebars) + cleaned = cleaned.replace(//gi, ""); + + // Remove elements with common non-content classes + cleaned = cleaned.replace( + /<[^>]*(class="[^"]*(?:sidebar|nav|toc|breadcrumb)[^"]*")[^>]*>[\s\S]*?<\/[^>]+>/gi, + "" + ); + + return cleaned; +} + +/** + * Post-processes the markdown output + */ +function cleanMarkdown(markdown: string): string { + let cleaned = markdown; + + // Remove excessive blank lines (more than 2 consecutive) + cleaned = cleaned.replace(/\n{4,}/g, "\n\n\n"); + + // Remove trailing whitespace from lines + cleaned = cleaned.replace(/[ \t]+$/gm, ""); + + // Ensure file ends with single newline + cleaned = `${cleaned.trimEnd()}\n`; + + return cleaned; +} + +/** + * Fetches and converts a single page + */ +async function processPage( + url: string, + outputPath: string +): Promise<{ success: boolean; error?: string }> { + try { + const response = await fetch(url); + + if (!response.ok) { + return { success: false, error: `HTTP ${response.status}` }; + } + + const html = await response.text(); + + // Extract frontmatter from meta tags before processing content + const { title, description } = extractFrontmatter(html); + const frontmatter = formatFrontmatter(title, description); + + const content = extractContent(html); + const cleanedHtml = cleanHtml(content); + const markdown = turndown.turndown(cleanedHtml); + const cleanedMarkdown = cleanMarkdown(markdown); + + // Combine frontmatter with markdown content + const finalMarkdown = frontmatter + cleanedMarkdown; + + // Create directory if needed + await fs.mkdir(path.dirname(outputPath), { recursive: true }); + + // Write the markdown file + await fs.writeFile(outputPath, finalMarkdown, "utf-8"); + + return { success: true }; + } catch (error) { + return { success: false, error: String(error) }; + } +} + +/** + * Discovers all MDX pages and their corresponding routes + */ +async function discoverPages(): Promise< + Array<{ route: string; language: string; outputPath: string }> +> { + const appDir = path.join(__dirname, "..", "app"); + const entries = await fs.readdir(appDir); + + // Find all language directories + const languages = await Promise.all( + entries.map(async (dir: string) => { + if (dir.startsWith("_") || dir === "api") { + return null; + } + const entryPath = path.join(appDir, dir); + const stats = await fs.stat(entryPath); + return stats.isDirectory() ? dir : null; + }) + ).then((results) => results.filter((dir): dir is string => dir !== null)); + + console.log(pc.blue(`๐Ÿ“ Found languages: ${languages.join(", ")}`)); + + const pages: Array<{ route: string; language: string; outputPath: string }> = + []; + + for (const language of languages) { + const searchPath = path.join(appDir, language); + const mdxFiles = glob.sync("**/*.mdx", { + cwd: searchPath, + ignore: ["**/_*.mdx"], + }); + + for (const entry of mdxFiles) { + // Convert file path to route + // e.g., "home/quickstart/page.mdx" -> "/en/home/quickstart" + const routePath = entry + .replace(PAGE_MDX_PATTERN, "") + .replace(MDX_PATTERN, ""); + const route = `/${language}/${routePath}`; + const outputPath = path.join(OUTPUT_DIR, language, `${routePath}.md`); + + pages.push({ route, language, outputPath }); + } + } + + return pages; +} + +/** + * Validates that the integrations page has proper links + */ +async function validateIntegrationsPage(errors: string[]): Promise { + const integrationsPath = path.join( + OUTPUT_DIR, + "en", + "resources", + "integrations.md" + ); + try { + const content = await fs.readFile(integrationsPath, "utf-8"); + const integrationLinkPattern = + /\]\(\/en\/resources\/integrations\/[^)]+\)/g; + const matches = content.match(integrationLinkPattern) || []; + + if (matches.length < MIN_INTEGRATION_LINKS) { + errors.push( + `Integrations page should have many integration links, found only ${matches.length}. ` + + "This suggests the component content was not properly rendered." + ); + } else { + console.log( + pc.green( + ` โœ“ Integrations page contains ${matches.length} integration links` + ) + ); + } + + if (content.includes(" or import statements)" + ); + } else { + console.log(pc.green(" โœ“ Integrations page has no raw MDX syntax")); + } + } catch (error) { + errors.push(`Could not read integrations page: ${error}`); + } +} + +/** + * Validates that the quickstart page has no raw MDX syntax + */ +async function validateQuickstartPage(errors: string[]): Promise { + const quickstartPath = path.join( + OUTPUT_DIR, + "en", + "get-started", + "quickstarts", + "mcp-server-quickstart.md" + ); + try { + const content = await fs.readFile(quickstartPath, "utf-8"); + const mdxPatterns = [ + { pattern: IMPORT_STATEMENT_PATTERN, name: "import statements" }, + { pattern: STEPS_COMPONENT_PATTERN, name: " component" }, + { pattern: TABS_COMPONENT_PATTERN, name: " component" }, + { pattern: CALLOUT_COMPONENT_PATTERN, name: " component" }, + { pattern: GUIDE_OVERVIEW_PATTERN, name: " component" }, + ]; + + const foundPatterns = mdxPatterns.filter(({ pattern }) => + pattern.test(content) + ); + for (const { name } of foundPatterns) { + errors.push(`Quickstart page still contains ${name}`); + } + + if (foundPatterns.length === 0) { + console.log(pc.green(" โœ“ Quickstart page has no raw MDX syntax")); + } + + if (content.includes("arcade new") && content.includes("uv tool install")) { + console.log( + pc.green(" โœ“ Quickstart page contains expected code examples") + ); + } else { + errors.push("Quickstart page is missing expected code examples"); + } + } catch (error) { + errors.push(`Could not read quickstart page: ${error}`); + } +} + +/** + * Validates that the home page has HTML elements properly cleaned + */ +async function validateHtmlCleaning(errors: string[]): Promise { + const homePath = path.join(OUTPUT_DIR, "en", "home.md"); + try { + const content = await fs.readFile(homePath, "utf-8"); + const htmlPatterns = [ + { pattern: HTML_SCRIPT_PATTERN, name: "