Commit b36826a

[chore] Refcache refresh + helper scripts (open-telemetry#6130)
1 parent fd0805c commit b36826a

File tree

5 files changed (+616 -1160 lines)


.htmltest.yml

+3 -15

@@ -44,23 +44,11 @@ IgnoreURLs: # list of regexs of paths or URLs to be ignored
   - ^https://pdf.sciencedirectassets.com/280203/1-s2.0-S1877050919X0006X/1-s2.0-S1877050919303576/main.pdf\?
 
   # Sites that deny access, always yielding 401, 403 Forbidden, 406, or other:
-  - ^https://(www\.)?linkedin\.com\b # 999 Request Denied
-  - ^https://(www\.)?mvnrepository\.com
-  - ^https://(www\.|eng\.)?uber\.com/(blog|flipr)/ # 406
-  - ^https://kofo.dev
-  - ^https://platform.openai.com
-  - ^https://openai.com
-  - ^https://star-history.com
+  - ^https://platform.openai.com # Really hard to trick into giving a 200 when using a script; manually verify links
+  - ^https://star-history.com # link contain ampersands in URL anchor part, which htmltest escapes, so it's not found
   - ^https://twitter.com
-  - ^https://www.chathamhouse.org
-  - ^https://www.farfetch.com
-  - ^https://www.zocdoc.com
+  - ^https://www.youtube.com/playlist\?list= # htmltest doesn't process query parameters
   - ^https://x.com
-  - ^https://maven.org
-  # OTel Google calendar - curl returns 200, but the link checker gets a 401:
-  - ^https://calendar.google.com/calendar/embed\?src=google.com_b79e3e90j7bbsa2n2p5an5lf60%40group.calendar.google.com
-  # YouTube playlists sometimes give a 404, although they give a 200 when accessed via browser:
-  - ^https://www.youtube.com/playlist\?list=
 
   # Ignore Docsy-generated GitHub links for now, until
   # https://github.com/google/docsy/issues/1432 is fixed

package.json

+3 -2

@@ -115,9 +115,9 @@
     "@cspell/dict-es-es": "^3.0.3",
     "@cspell/dict-fr-fr": "^2.2.5",
     "@cspell/dict-pt-br": "^2.3.3",
-    "ajv": "^8.17.1",
     "ajv-errors": "^3.0.0",
     "ajv-formats": "^3.0.1",
+    "ajv": "^8.17.1",
     "autoprefixer": "^10.4.20",
     "cspell": "^8.17.2",
     "gulp": "^5.0.0",
@@ -127,11 +127,12 @@
     "markdownlint": "^0.37.4",
     "postcss-cli": "^11.0.0",
     "prettier": "3.4.2",
+    "puppeteer": "^24.1.1",
     "require-dir": "^1.2.0",
-    "textlint": "^14.4.2",
     "textlint-filter-rule-allowlist": "^4.0.0",
     "textlint-filter-rule-comments": "^1.2.2",
     "textlint-rule-terminology": "^5.2.12",
+    "textlint": "^14.4.2",
     "through2": "^4.0.2",
     "yargs": "^17.7.2"
   },
+52
@@ -0,0 +1,52 @@
#!/usr/bin/env node

import fs from 'fs/promises';
import { getUrlStatus, isHttp2XX } from './get-url-status.mjs';

const CACHE_FILE = 'static/refcache.json';

async function readRefcache() {
  try {
    const data = await fs.readFile(CACHE_FILE, 'utf8');
    return JSON.parse(data);
  } catch (error) {
    console.error(`Error reading ${CACHE_FILE}:`, error.message);
    process.exit(1);
  }
}

async function writeRefcache(cache) {
  await fs.writeFile(CACHE_FILE, JSON.stringify(cache, null, 2) + '\n', 'utf8');
  console.log(`Updated ${CACHE_FILE} with fixed links.`);
}

async function retry404sAndUpdateCache() {
  const cache = await readRefcache();
  let updated = false;

  for (const [url, details] of Object.entries(cache)) {
    const { StatusCode, LastSeen } = details;
    if (isHttp2XX(StatusCode)) continue;

    process.stdout.write(`Checking: ${url} (was ${StatusCode})... `);
    const status = await getUrlStatus(url);
    console.log(`${status}.`);

    if (!isHttp2XX(status)) continue;

    cache[url] = {
      StatusCode: status,
      LastSeen: new Date().toISOString(),
    };

    updated = true;
  }

  if (updated) {
    await writeRefcache(cache);
  } else {
    console.log(`No updates needed.`);
  }
}

await retry404sAndUpdateCache();
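
The entries this script reads and rewrites in static/refcache.json are keyed by URL and carry the StatusCode and LastSeen fields that the loop above touches. A minimal sketch of one such entry, with an illustrative URL, status, and timestamp rather than values from the real cache:

{
  "https://example.com/some-page": {
    "StatusCode": 404,
    "LastSeen": "2025-01-20T12:34:56.789Z"
  }
}

Entries that are already 2xx are skipped; only entries whose retry now succeeds get a fresh StatusCode and LastSeen written back to the cache file.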

scripts/get-url-status.mjs

+91
@@ -0,0 +1,91 @@
#!/usr/bin/env node

import puppeteer from 'puppeteer';

let verbose = false;

function log(...args) {
  if (verbose) console.log(...args);
}

async function getUrlHeadless(url) {
  let browser;

  log(`Trying headless fetch of ${url}`);

  try {
    browser = await puppeteer.launch();
    const page = await browser.newPage();

    const response = await page.goto(url, {
      waitUntil: 'networkidle2',
      timeout: 9000,
    });

    if (!response) throw new Error('No response from server.');

    const status = response.status();
    log(`  Headless fetch returned HTTP status code: ${status}`);

    return status;
  } catch (error) {
    console.error(`Error: ${error.message}`);
    return null;
  } finally {
    if (browser) await browser.close();
  }
}

async function getUrlInBrowser(url) {
  let browser;

  try {
    browser = await puppeteer.launch({ headless: false });

    const page = await browser.newPage();
    const response = await page.goto(url, {
      waitUntil: 'networkidle2',
      timeout: 30000,
    });

    if (!response) throw new Error('No response from server.');

    const status = response.status();
    log(`HTTP status code: ${status}`);

    return status;
  } catch (error) {
    console.error(`Error fetching ${url}:`, error.message);
    return null;
  } finally {
    if (browser) await browser.close();
  }
}

export function isHttp2XX(status) {
  return status && status >= 200 && status < 300;
}

export async function getUrlStatus(url) {
  let status = 0; // await getUrlHeadless(url);
  if (!isHttp2XX(status)) {
    status = await getUrlInBrowser(url);
  }
  return status;
}

async function mainCLI() {
  const url = process.argv[2];
  verbose = true; // process.argv.includes('--verbose');

  if (!url) {
    console.error(`Usage: ${process.argv[1]} URL`);
    process.exit(1);
  }

  const status = await getUrlStatus(url);
  process.exit(isHttp2XX(status) ? 0 : 1);
}

// Only run if script is executed directly (CLI)
if (import.meta.url === `file://${process.argv[1]}`) await mainCLI();
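
Since getUrlStatus and isHttp2XX are exported, other scripts can reuse them; the refcache-retry script above imports them exactly this way. A minimal sketch of such a caller (the URL is illustrative):

import { getUrlStatus, isHttp2XX } from './get-url-status.mjs';

const status = await getUrlStatus('https://example.com/');
console.log(isHttp2XX(status) ? `OK (${status})` : `Failed (status: ${status})`);

Run directly, the script takes a single URL argument and exits 0 when the final status is 2xx, 1 otherwise. Note that as committed, getUrlStatus skips the headless path (the getUrlHeadless call is commented out), so every check opens a visible browser window via puppeteer.launch({ headless: false }).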
