Commit b36826a

[chore] Refcache refresh + helper scripts (open-telemetry#6130)
1 parent fd0805c commit b36826a

File tree

5 files changed (+616 -1160 lines)


.htmltest.yml

+3 -15

@@ -44,23 +44,11 @@ IgnoreURLs: # list of regexs of paths or URLs to be ignored
   - ^https://pdf.sciencedirectassets.com/280203/1-s2.0-S1877050919X0006X/1-s2.0-S1877050919303576/main.pdf\?
 
   # Sites that deny access, always yielding 401, 403 Forbidden, 406, or other:
-  - ^https://(www\.)?linkedin\.com\b # 999 Request Denied
-  - ^https://(www\.)?mvnrepository\.com
-  - ^https://(www\.|eng\.)?uber\.com/(blog|flipr)/ # 406
-  - ^https://kofo.dev
-  - ^https://platform.openai.com
-  - ^https://openai.com
-  - ^https://star-history.com
+  - ^https://platform.openai.com # Really hard to trick into giving a 200 when using a script; manually verify links
+  - ^https://star-history.com # link contain ampersands in URL anchor part, which htmltest escapes, so it's not found
   - ^https://twitter.com
-  - ^https://www.chathamhouse.org
-  - ^https://www.farfetch.com
-  - ^https://www.zocdoc.com
+  - ^https://www.youtube.com/playlist\?list= # htmltest doesn't process query parameters
   - ^https://x.com
-  - ^https://maven.org
-  # OTel Google calendar - curl returns 200, but the link checker gets a 401:
-  - ^https://calendar.google.com/calendar/embed\?src=google.com_b79e3e90j7bbsa2n2p5an5lf60%40group.calendar.google.com
-  # YouTube playlists sometimes give a 404, although they give a 200 when accessed via browser:
-  - ^https://www.youtube.com/playlist\?list=
 
   # Ignore Docsy-generated GitHub links for now, until
   # https://github.com/google/docsy/issues/1432 is fixed

package.json

+3 -2

@@ -115,9 +115,9 @@
     "@cspell/dict-es-es": "^3.0.3",
     "@cspell/dict-fr-fr": "^2.2.5",
     "@cspell/dict-pt-br": "^2.3.3",
-    "ajv": "^8.17.1",
     "ajv-errors": "^3.0.0",
     "ajv-formats": "^3.0.1",
+    "ajv": "^8.17.1",
     "autoprefixer": "^10.4.20",
     "cspell": "^8.17.2",
     "gulp": "^5.0.0",
@@ -127,11 +127,12 @@
     "markdownlint": "^0.37.4",
     "postcss-cli": "^11.0.0",
     "prettier": "3.4.2",
+    "puppeteer": "^24.1.1",
     "require-dir": "^1.2.0",
-    "textlint": "^14.4.2",
     "textlint-filter-rule-allowlist": "^4.0.0",
     "textlint-filter-rule-comments": "^1.2.2",
     "textlint-rule-terminology": "^5.2.12",
+    "textlint": "^14.4.2",
     "through2": "^4.0.2",
     "yargs": "^17.7.2"
   },
+52
@@ -0,0 +1,52 @@
#!/usr/bin/env node

import fs from 'fs/promises';
import { getUrlStatus, isHttp2XX } from './get-url-status.mjs';

const CACHE_FILE = 'static/refcache.json';

async function readRefcache() {
  try {
    const data = await fs.readFile(CACHE_FILE, 'utf8');
    return JSON.parse(data);
  } catch (error) {
    console.error(`Error reading ${CACHE_FILE}:`, error.message);
    process.exit(1);
  }
}

async function writeRefcache(cache) {
  await fs.writeFile(CACHE_FILE, JSON.stringify(cache, null, 2) + '\n', 'utf8');
  console.log(`Updated ${CACHE_FILE} with fixed links.`);
}

async function retry404sAndUpdateCache() {
  const cache = await readRefcache();
  let updated = false;

  for (const [url, details] of Object.entries(cache)) {
    const { StatusCode, LastSeen } = details;
    if (isHttp2XX(StatusCode)) continue;

    process.stdout.write(`Checking: ${url} (was ${StatusCode})... `);
    const status = await getUrlStatus(url);
    console.log(`${status}.`);

    if (!isHttp2XX(status)) continue;

    cache[url] = {
      StatusCode: status,
      LastSeen: new Date().toISOString(),
    };

    updated = true;
  }

  if (updated) {
    await writeRefcache(cache);
  } else {
    console.log(`No updates needed.`);
  }
}

await retry404sAndUpdateCache();
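
The entries this script reads and rewrites in static/refcache.json are keyed by URL and carry the StatusCode and LastSeen fields that the loop above touches. A minimal sketch of one such entry, with an illustrative URL, status, and timestamp rather than values from the real cache:

{
  "https://example.com/some-page": {
    "StatusCode": 404,
    "LastSeen": "2025-01-20T12:34:56.789Z"
  }
}

Entries that are already 2xx are skipped; only entries whose retry now succeeds get a fresh StatusCode and LastSeen written back to the cache file.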

scripts/get-url-status.mjs

+91
@@ -0,0 +1,91 @@
#!/usr/bin/env node

import puppeteer from 'puppeteer';

let verbose = false;

function log(...args) {
  if (verbose) console.log(...args);
}

async function getUrlHeadless(url) {
  let browser;

  log(`Trying headless fetch of ${url}`);

  try {
    browser = await puppeteer.launch();
    const page = await browser.newPage();

    const response = await page.goto(url, {
      waitUntil: 'networkidle2',
      timeout: 9000,
    });

    if (!response) throw new Error('No response from server.');

    const status = response.status();
    log(`  Headless fetch returned HTTP status code: ${status}`);

    return status;
  } catch (error) {
    console.error(`Error: ${error.message}`);
    return null;
  } finally {
    if (browser) await browser.close();
  }
}

async function getUrlInBrowser(url) {
  let browser;

  try {
    browser = await puppeteer.launch({ headless: false });

    const page = await browser.newPage();
    const response = await page.goto(url, {
      waitUntil: 'networkidle2',
      timeout: 30000,
    });

    if (!response) throw new Error('No response from server.');

    const status = response.status();
    log(`HTTP status code: ${status}`);

    return status;
  } catch (error) {
    console.error(`Error fetching ${url}:`, error.message);
    return null;
  } finally {
    if (browser) await browser.close();
  }
}

export function isHttp2XX(status) {
  return status && status >= 200 && status < 300;
}

export async function getUrlStatus(url) {
  let status = 0; // await getUrlHeadless(url);
  if (!isHttp2XX(status)) {
    status = await getUrlInBrowser(url);
  }
  return status;
}

async function mainCLI() {
  const url = process.argv[2];
  verbose = true; // process.argv.includes('--verbose');

  if (!url) {
    console.error(`Usage: ${process.argv[1]} URL`);
    process.exit(1);
  }

  const status = await getUrlStatus(url);
  process.exit(isHttp2XX(status) ? 0 : 1);
}

// Only run if script is executed directly (CLI)
if (import.meta.url === `file://${process.argv[1]}`) await mainCLI();
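
Since getUrlStatus and isHttp2XX are exported, other scripts can reuse them; the refcache-retry script above imports them exactly this way. A minimal sketch of such a caller (the URL is illustrative):

import { getUrlStatus, isHttp2XX } from './get-url-status.mjs';

const status = await getUrlStatus('https://example.com/');
console.log(isHttp2XX(status) ? `OK (${status})` : `Failed (status: ${status})`);

Run directly, the script takes a single URL argument and exits 0 when the final status is 2xx, 1 otherwise. Note that as committed, getUrlStatus skips the headless path (the getUrlHeadless call is commented out), so every check opens a visible browser window via puppeteer.launch({ headless: false }).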
