Skip to content

Commit 99f84e5

Browse files
committed
Add improvements to llmstxt.ts to generate links with pages of different langs
1 parent 7b717c8 commit 99f84e5

2 files changed

Lines changed: 160 additions & 40 deletions

File tree

bin/validate-llms.txt.ts

Lines changed: 41 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -25,29 +25,60 @@ function countHtmlFiles(): number {
2525
return contentFiles.length;
2626
}
2727

28-
function countLlmsLines(): number {
28+
/**
 * Summarises the contents of the llms.txt file.
 *
 * @returns `totalLines` — the number of non-blank lines (heading lines included);
 *          `uniquePages` — the number of distinct link targets once everything
 *          from the first `?` onwards has been dropped from each URL.
 */
function countLlmsPages(): { totalLines: number; uniquePages: number } {
  const nonEmptyLines = fs
    .readFileSync(LLMS_FILE, 'utf-8')
    .split('\n')
    .filter((line) => line.trim().length > 0);

  // Base URLs seen so far; the Set takes care of de-duplicating language variants.
  const pageUrls = new Set<string>();

  for (const entry of nonEmptyLines) {
    // Heading lines carry no link.
    if (entry.startsWith('#')) {
      continue;
    }

    // Entries look like: - [Title](URL): Description
    const linkMatch = /\[.*?\]\((.*?)\)/.exec(entry);
    if (linkMatch === null) {
      continue;
    }

    // Keep only the part before the query string (e.g. before ?lang=...).
    const [pageUrl] = linkMatch[1].split('?');
    pageUrls.add(pageUrl);
  }

  return { totalLines: nonEmptyLines.length, uniquePages: pageUrls.size };
}
3256

33-
function validateCounts(htmlCount: number, llmsCount: number): boolean {
34-
const difference = Math.abs(htmlCount - llmsCount);
35-
const maxAllowedDifference = Math.ceil(htmlCount * 0.8); // 80% of html count
36-
return difference <= maxAllowedDifference;
57+
/**
 * Decides whether llms.txt covers a plausible share of the built HTML files.
 *
 * Coverage is `uniquePageCount / htmlCount` and must lie between 50% and 100%
 * inclusive: the lower bound allows for pages that never make it into llms.txt,
 * the upper bound reflects that there should not be more unique pages than
 * HTML files.
 *
 * @param htmlCount - Number of HTML files found in the build output.
 * @param uniquePageCount - Number of distinct page URLs listed in llms.txt.
 * @param totalLineCount - Number of non-blank lines in llms.txt. Not used in the
 *   decision at present; kept so existing callers keep working.
 * @returns `true` when coverage is within range; `false` otherwise, including
 *   when the counts are not usable (no HTML files, negative or non-finite values).
 */
function validateCounts(htmlCount: number, uniquePageCount: number, totalLineCount: number): boolean {
  const minCoverage = 0.5; // At least 50% coverage
  const maxCoverage = 1.0; // At most 100% coverage

  // Reject unusable counts up front instead of relying on NaN/Infinity comparisons;
  // this also stops two negative counts from dividing into a "valid" ratio.
  const countsAreUsable =
    Number.isFinite(htmlCount) && Number.isFinite(uniquePageCount) && htmlCount > 0 && uniquePageCount >= 0;
  if (!countsAreUsable) {
    return false;
  }

  const coverage = uniquePageCount / htmlCount;
  return coverage >= minCoverage && coverage <= maxCoverage;
}
3867

3968
function main() {
4069
try {
4170
const htmlCount = countHtmlFiles();
42-
const llmsCount = countLlmsLines();
43-
const isValid = validateCounts(htmlCount, llmsCount);
71+
const { totalLines, uniquePages } = countLlmsPages();
72+
const isValid = validateCounts(htmlCount, uniquePages, totalLines);
4473

4574
console.log(`HTML files found: ${htmlCount}`);
46-
console.log(`Lines in llms.txt: ${llmsCount}`);
75+
console.log(`Total lines in llms.txt: ${totalLines}`);
76+
console.log(`Unique pages in llms.txt: ${uniquePages}`);
77+
console.log(`Coverage: ${Math.round((uniquePages / htmlCount) * 100)}%`);
4778

4879
if (!isValid) {
4980
console.error(
50-
`Error: The number of lines in llms.txt (${llmsCount}) is not within 80% of the number of HTML files (${htmlCount})`,
81+
`Error: The coverage of unique pages in llms.txt (${Math.round((uniquePages / htmlCount) * 100)}%) is not within the acceptable range of 50-100% of HTML files (${htmlCount})`,
5182
);
5283
process.exit(1);
5384
}

data/onPostBuild/llmstxt.ts

Lines changed: 119 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import { GatsbyNode } from 'gatsby';
22
import * as path from 'path';
33
import * as fs from 'fs';
4+
import languageInfo from '../../src/data/languages/languageInfo';
45

56
/**
67
* This script is used to create a file called llms.txt that contains a list of all the pages in the site.
@@ -11,6 +12,26 @@ const LLMS_TXT_PREAMBLE = `# https://ably.com/docs llms.txt\n`;
1112

1213
const REPORTER_PREFIX = 'onPostBuild:';
1314

15+
// Language keys for which a language-specific (?lang=<key>) entry is generated in llms.txt.
// Languages reported for a page that are not in this list are ignored.
// Typed as read-only so the shared list cannot be mutated by accident.
const VALID_LANGUAGES: readonly string[] = [
  'javascript',
  'nodejs',
  'csharp',
  'flutter',
  'java',
  'objc',
  'php',
  'python',
  'ruby',
  'swift',
  'go',
];
29+
30+
/**
 * Resolves the human-readable label for a language key.
 *
 * @param languageKey - Key to look up in `languageInfo`.
 * @returns The entry's `label`, or the key itself when there is no entry or the
 *   label is empty.
 */
const getLanguageLabel = (languageKey: string): string => {
  const info = languageInfo[languageKey as keyof typeof languageInfo];
  return info?.label || languageKey;
};
34+
1435
interface DocumentQueryResult {
1536
site: {
1637
siteMetadata: {
@@ -24,6 +45,7 @@ interface DocumentQueryResult {
2445
meta: {
2546
title: string;
2647
meta_description: string;
48+
languages?: string[];
2749
};
2850
};
2951
}[];
@@ -38,6 +60,9 @@ interface DocumentQueryResult {
3860
title?: string;
3961
meta_description?: string;
4062
};
63+
internal: {
64+
contentFilePath?: string;
65+
};
4166
}[];
4267
};
4368
}
@@ -53,6 +78,34 @@ const escapeMarkdown = (text: string) => {
5378
return text.replace(/([\\`*_{}[\]()#+!])/g, '\\$1');
5479
};
5580

81+
/**
 * Collects the language identifiers of fenced code blocks found in a file.
 *
 * The file text is scanned for every run of three backticks immediately followed
 * by word characters (for example ```javascript); the distinct identifiers are
 * returned exactly as written in the file.
 *
 * The file is read asynchronously so that callers awaiting many files at once
 * (e.g. through Promise.all) do not block on each read in turn.
 *
 * @param filePath - Path of the file to scan.
 * @returns The distinct identifiers found. An empty set is returned when the file
 *   does not exist (silently) or cannot be read (after logging the error).
 */
const extractCodeLanguages = async (filePath: string): Promise<Set<string>> => {
  const languages = new Set<string>();

  let fileContent: string;
  try {
    fileContent = await fs.promises.readFile(filePath, 'utf8');
  } catch (error: unknown) {
    const code = typeof error === 'object' && error !== null ? (error as { code?: unknown }).code : undefined;
    // A missing file (including an empty path) is expected and not worth reporting;
    // anything else is logged, and the page is treated as having no code languages.
    if (code !== 'ENOENT' && code !== 'ENOTDIR') {
      console.error(`Error extracting code element classes from ${filePath}:`, error);
    }
    return languages;
  }

  // Find all instances of code blocks with language specifiers (```language).
  // The regex is created per call because a global regex carries lastIndex state.
  const codeBlockRegex = /```(\w+)/g;
  let match: RegExpExecArray | null;
  while ((match = codeBlockRegex.exec(fileContent)) !== null) {
    const language = match[1];
    if (language) {
      languages.add(language);
    }
  }

  return languages;
};
108+
56109
export const onPostBuild: GatsbyNode['onPostBuild'] = async ({ graphql, reporter, basePath }) => {
57110
const query = `
58111
query {
@@ -62,13 +115,14 @@ export const onPostBuild: GatsbyNode['onPostBuild'] = async ({ graphql, reporter
62115
}
63116
}
64117
65-
allFileHtml(filter: { articleType: { in: ["document", "apiReference"] } }) {
118+
allFileHtml {
66119
edges {
67120
node {
68121
slug
69122
meta {
70123
title
71124
meta_description
125+
languages
72126
}
73127
}
74128
}
@@ -86,6 +140,9 @@ export const onPostBuild: GatsbyNode['onPostBuild'] = async ({ graphql, reporter
86140
title
87141
meta_description
88142
}
143+
internal {
144+
contentFilePath
145+
}
89146
}
90147
}
91148
}
@@ -109,30 +166,50 @@ export const onPostBuild: GatsbyNode['onPostBuild'] = async ({ graphql, reporter
109166
throw new Error('Site URL not found.');
110167
}
111168

112-
// Process textile-based pages (allFileHtml)
113-
const textilePages = queryRecords.allFileHtml.edges.map((edge) => edge.node);
114-
115-
// Process MDX pages (allMdx)
116-
const mdxPages = queryRecords.allMdx.nodes
117-
.filter((node) => {
118-
// Only include pages from docs directory that have the required frontmatter
119-
return (
120-
node.parent.relativeDirectory.startsWith('docs') &&
121-
node.frontmatter?.title &&
122-
node.frontmatter?.meta_description
123-
);
124-
})
125-
.map((node) => ({
126-
// Create slug from parent file info - remove 'docs/' prefix since it's already in relativeDirectory
127-
slug: (node.parent.relativeDirectory + (node.parent.name === 'index' ? '' : `/${node.parent.name}`)).replace(
128-
/^docs\//,
129-
'',
130-
),
131-
meta: {
132-
title: node.frontmatter.title!,
133-
meta_description: node.frontmatter.meta_description!,
134-
},
135-
}));
169+
// Process textile-based pages (allFileHtml) and extract languages
170+
const textilePages = queryRecords.allFileHtml.edges.map((edge) => {
171+
// Extract valid languages from the meta.languages field
172+
const metaLanguages = edge.node.meta.languages || [];
173+
const languages = metaLanguages.filter((lang) => VALID_LANGUAGES.includes(lang));
174+
175+
return {
176+
...edge.node,
177+
languages,
178+
};
179+
});
180+
181+
// Process MDX pages (allMdx) and extract languages from files
182+
const mdxPages = await Promise.all(
183+
queryRecords.allMdx.nodes
184+
.filter((node) => {
185+
// Only include pages from docs directory that have the required frontmatter
186+
return (
187+
node.parent.relativeDirectory.startsWith('docs') &&
188+
node.frontmatter?.title &&
189+
node.frontmatter?.meta_description
190+
);
191+
})
192+
.map(async (node) => {
193+
// Create slug from parent file info - remove 'docs/' prefix since it's already in relativeDirectory
194+
const slug = (
195+
node.parent.relativeDirectory + (node.parent.name === 'index' ? '' : `/${node.parent.name}`)
196+
).replace(/^docs\//, '');
197+
198+
// Extract valid languages from the file content
199+
const filePath = node.internal.contentFilePath || '';
200+
const detectedLanguages = await extractCodeLanguages(filePath);
201+
const languages = Array.from(detectedLanguages).filter((lang) => VALID_LANGUAGES.includes(lang));
202+
203+
return {
204+
slug,
205+
meta: {
206+
title: node.frontmatter.title!,
207+
meta_description: node.frontmatter.meta_description!,
208+
},
209+
languages,
210+
};
211+
}),
212+
);
136213

137214
const allPages = [...textilePages, ...mdxPages];
138215

@@ -143,15 +220,27 @@ export const onPostBuild: GatsbyNode['onPostBuild'] = async ({ graphql, reporter
143220
const serializedPages = [LLMS_TXT_PREAMBLE];
144221

145222
for (const page of allPages) {
146-
const { slug, meta } = page;
223+
const { slug, meta, languages } = page;
147224
const { title, meta_description } = meta;
148225

149226
try {
150-
const url = prefixPath({ url: `/docs/${slug}`, siteUrl, pathPrefix: basePath });
227+
const baseUrl = prefixPath({ url: `/docs/${slug}`, siteUrl, pathPrefix: basePath });
151228
const safeTitle = escapeMarkdown(title);
152-
const link = `[${safeTitle}](${url})`;
153-
const line = `- ${[link, meta_description].join(': ')}`;
154-
serializedPages.push(line);
229+
230+
// Generate base page entry (without language parameter)
231+
const baseLink = `[${safeTitle}](${baseUrl})`;
232+
const baseLine = `- ${[baseLink, meta_description].join(': ')}`;
233+
serializedPages.push(baseLine);
234+
235+
// Generate language-specific entries if the page has languages
236+
if (languages && languages.length > 0) {
237+
for (const language of languages) {
238+
const langUrl = `${baseUrl}?lang=${language}`;
239+
const langLink = `[${safeTitle} (${getLanguageLabel(language)})](${langUrl})`;
240+
const langLine = `- ${[langLink, meta_description].join(': ')}`;
241+
serializedPages.push(langLine);
242+
}
243+
}
155244
} catch (err) {
156245
reporter.panic(`${REPORTER_PREFIX} Error serializing pages`, err as Error);
157246
}

0 commit comments

Comments
 (0)