Files
explorer/bin/update-software-hashes.mjs
D. Rimron-Soutter 6b91fde972 fix: silently skip /denied/ and other non-hosted download prefixes
These are valid entries we've been asked not to host — no need to
log warnings for them.

claude-opus-4-6@MacFiver
2026-02-17 16:17:43 +00:00

499 lines
15 KiB
JavaScript
Executable File

#!/usr/bin/env node
// Compute MD5, CRC32 and size for the inner tape file inside each download zip.
// Populates the `software_hashes` table and exports a JSON snapshot to
// data/zxdb/software_hashes.json for reimport after DB wipes.
//
// Usage:
// node bin/update-software-hashes.mjs [flags]
//
// Flags:
// --rebuild-all Ignore state and reprocess every download
// --rebuild-missing Only process downloads not yet in software_hashes
// --start-from-id=N Start processing from download id N
// --export-only Skip processing, just export current table to JSON
// --quiet Reduce log output
// --verbose Force verbose output (default)
import dotenv from "dotenv";
import dotenvExpand from "dotenv-expand";
dotenvExpand.expand(dotenv.config());
import { z } from "zod";
import mysql from "mysql2/promise";
import fs from "fs/promises";
import path from "path";
import { createReadStream } from "fs";
import { createHash } from "crypto";
import { pipeline } from "stream/promises";
import { Transform } from "stream";
import { fileURLToPath } from "url";
// Resolve paths relative to this script: bin/<script> lives one level below
// the project root.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
const PROJECT_ROOT = path.resolve(__dirname, "..");

// ---- CLI flags ----
// All flags are simple presence switches except --start-from-id=N.
const ARGV = new Set(process.argv.slice(2));
const QUIET = ARGV.has("--quiet");
// Verbose by default; --quiet disables it unless --verbose is also given.
const VERBOSE = ARGV.has("--verbose") || !QUIET;
const REBUILD_ALL = ARGV.has("--rebuild-all");
const REBUILD_MISSING = ARGV.has("--rebuild-missing");
const EXPORT_ONLY = ARGV.has("--export-only");
// Parse --start-from-id=N; if the flag is repeated, the last one wins.
let CLI_START_FROM = 0;
const startIdMatches = process.argv
  .slice(2)
  .map((arg) => /^--start-from-id=(\d+)$/.exec(arg))
  .filter(Boolean);
if (startIdMatches.length > 0) {
  CLI_START_FROM = Number.parseInt(startIdMatches.at(-1)[1], 10);
}
// Logging helpers: info output honors the VERBOSE flag; warnings and errors
// are always emitted.
function logInfo(msg) {
  if (!VERBOSE) return;
  console.log(msg);
}
function logWarn(msg) {
  console.warn(msg);
}
function logError(msg) {
  console.error(msg);
}
// ---- Environment ----
// Validate the required environment variables up front; exit(1) with a
// readable zod error report when either is missing or malformed.
const envSchema = z.object({
// Full mysql:// connection URL for the ZXDB database.
ZXDB_URL: z.string().url().refine((s) => s.startsWith("mysql://"), {
message: "ZXDB_URL must be a valid mysql:// URL",
}),
// Root directory of the local CDN mirror (also holds the resume state file).
CDN_CACHE: z.string().min(1, "CDN_CACHE must be set to the local CDN mirror root"),
});
const parsedEnv = envSchema.safeParse(process.env);
if (!parsedEnv.success) {
logError("Invalid environment variables:\n" + JSON.stringify(parsedEnv.error.format(), null, 2));
process.exit(1);
}
const { ZXDB_URL, CDN_CACHE } = parsedEnv.data;
// Where the JSON snapshot of software_hashes is exported for reimport after
// DB wipes (lives inside the repo's data directory).
const SNAPSHOT_PATH = path.join(PROJECT_ROOT, "data", "zxdb", "software_hashes.json");
// Resume/progress state; stored under the CDN mirror rather than the repo.
const STATE_FILE = path.join(CDN_CACHE, ".update-software-hashes.state.json");
// Filetype IDs for tape images
// NOTE(review): presumably the ZXDB filetype ids for tape-image downloads —
// confirm against the ZXDB filetypes table.
const TAPE_FILETYPE_IDS = [8, 22];
// Tape file extensions in priority order (most common first)
const TAPE_EXTENSIONS = [".tap", ".tzx", ".pzx", ".csw", ".p"];
// ---- DB ----
// Shared MySQL connection pool used by every query in this script; closed
// via pool.end() on all exit paths.
const pool = mysql.createPool({
uri: ZXDB_URL,
connectionLimit: 10,
maxPreparedStatements: 256,
});
// ---- Path mapping (mirrors sync-downloads.mjs) ----
// Map a ZXDB download file_link to its location in the local CDN mirror.
// Returns null for prefixes we don't host (e.g. /denied/), which callers
// treat as a silent skip.
function toLocalPath(fileLink) {
  const MIRROR_PREFIXES = [
    ["/zxdb/sinclair", "SC"],
    ["/pub/sinclair", "WoS"],
  ];
  for (const [prefix, mirrorDir] of MIRROR_PREFIXES) {
    if (fileLink.startsWith(`${prefix}/`)) {
      return path.join(CDN_CACHE, mirrorDir, fileLink.slice(prefix.length));
    }
  }
  return null;
}
// ---- State management ----
// Read the resume state from STATE_FILE. Any failure (missing file,
// unreadable, corrupt JSON) is treated as "no prior state" → null.
async function loadState() {
  try {
    return JSON.parse(await fs.readFile(STATE_FILE, "utf8"));
  } catch {
    return null;
  }
}
// Persist state atomically: write a temp sibling first, then rename over the
// real file, so a crash can never leave a half-written state file behind.
async function saveStateAtomic(state) {
  const scratch = `${STATE_FILE}.tmp`;
  const body = JSON.stringify(state, null, 2);
  await fs.writeFile(scratch, body, "utf8");
  await fs.rename(scratch, STATE_FILE);
}
// ---- Zip extraction ----
// Extract a zip into contentsDir by shelling out to the system `unzip`
// binary (no third-party zip library needed). ZXDB zips are simple:
// no encryption, typically a single inner file.
async function extractZipContents(zipPath, contentsDir) {
// Lazy-load child_process/util; Node's module cache makes repeat calls cheap.
const { execFile } = await import("child_process");
const { promisify } = await import("util");
const execFileAsync = promisify(execFile);
await fs.mkdir(contentsDir, { recursive: true });
try {
// execFile passes argv directly (no shell), so brackets and other special
// characters in paths are safe without quoting
await execFileAsync("unzip", ["-o", "-d", contentsDir, zipPath], {
maxBuffer: 50 * 1024 * 1024,
});
} catch (err) {
// unzip returns exit code 1 for warnings (e.g. "appears to use backslashes")
// which is non-fatal — only fail on actual extraction errors.
// NOTE(review): err.code can also be a string like 'ENOENT' when unzip is
// not installed; that correctly falls through to the throw.
if (err.code !== 1) {
throw new Error(`unzip failed for ${zipPath}: ${err.message}`);
}
}
}
// ---- Find tape file inside _CONTENTS ----
// Recursively locate the best tape image under contentsDir: extensions are
// ranked by their position in TAPE_EXTENSIONS, ties broken alphabetically
// by filename. Returns { path, ext, priority, name } or null when the
// directory is unreadable or holds no tape file.
async function findTapeFile(contentsDir) {
  let entries;
  try {
    entries = await fs.readdir(contentsDir, { recursive: true, withFileTypes: true });
  } catch {
    return null;
  }
  const candidates = entries
    .filter((entry) => entry.isFile())
    .map((entry) => {
      const ext = path.extname(entry.name).toLowerCase();
      return {
        // parentPath replaces the deprecated Dirent.path on newer Node
        path: path.join(entry.parentPath ?? entry.path, entry.name),
        ext,
        priority: TAPE_EXTENSIONS.indexOf(ext),
        name: entry.name,
      };
    })
    .filter((candidate) => candidate.priority !== -1)
    .sort((a, b) => a.priority - b.priority || a.name.localeCompare(b.name));
  return candidates[0] ?? null;
}
// ---- Hash computation ----
// CRC32 lookup table (standard reflected polynomial 0xEDB88320), built once
// at module load instead of being rebuilt on every computeHashes() call.
const CRC32_TABLE = (() => {
  const table = new Uint32Array(256);
  for (let i = 0; i < 256; i++) {
    let c = i;
    for (let j = 0; j < 8; j++) {
      c = (c & 1) ? (0xEDB88320 ^ (c >>> 1)) : (c >>> 1);
    }
    table[i] = c;
  }
  return table;
})();

/**
 * Stream a file once, computing its MD5, CRC32 and byte size.
 * @param {string} filePath - Path of the file to hash.
 * @returns {Promise<{md5: string, crc32: string, sizeBytes: number}>}
 *   md5 as 32 lowercase hex chars, crc32 as 8 zero-padded lowercase hex
 *   chars, sizeBytes as the exact byte count.
 * @throws If the file cannot be read.
 */
async function computeHashes(filePath) {
  const md5 = createHash("md5");
  let crc = 0xFFFFFFFF;
  let size = 0;
  const hasher = new Transform({
    transform(chunk, encoding, callback) {
      md5.update(chunk);
      size += chunk.length;
      for (let i = 0; i < chunk.length; i++) {
        crc = CRC32_TABLE[(crc ^ chunk[i]) & 0xFF] ^ (crc >>> 8);
      }
      callback(null, chunk);
    },
  });
  const source = createReadStream(filePath);
  // Drain the transform's output so backpressure doesn't stall the pipeline;
  // only the side effects (md5/crc/size) matter.
  await pipeline(source, hasher, async function* (drain) {
    for await (const _ of drain) { /* discard */ }
  });
  return {
    md5: md5.digest("hex"),
    // Final XOR and unsigned coercion per the CRC32 spec
    crc32: ((crc ^ 0xFFFFFFFF) >>> 0).toString(16).padStart(8, "0"),
    sizeBytes: size,
  };
}
// ---- Ensure software_hashes table exists ----
// Idempotent DDL: one row per download, keyed by download_id, with secondary
// indexes on md5 and crc32 for reverse lookups. updated_at is maintained
// automatically by MySQL on insert and update.
async function ensureTable() {
await pool.query(`
CREATE TABLE IF NOT EXISTS software_hashes (
download_id INT NOT NULL PRIMARY KEY,
md5 VARCHAR(32) NOT NULL,
crc32 VARCHAR(8) NOT NULL,
size_bytes BIGINT NOT NULL,
inner_path VARCHAR(500) NOT NULL,
updated_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
INDEX idx_sh_md5 (md5),
INDEX idx_sh_crc32 (crc32)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
`);
}
// ---- JSON export ----
// Dump the software_hashes table to a pretty-printed JSON snapshot so the
// data survives DB wipes. The file is written atomically (tmp + rename).
// Returns the number of exported rows.
async function exportSnapshot() {
  const [rows] = await pool.query(
    "SELECT download_id, md5, crc32, size_bytes, inner_path, updated_at FROM software_hashes ORDER BY download_id"
  );
  // Normalize a DB row into plain JSON-friendly values (key order matters
  // for stable snapshot output).
  const toPlainRow = (r) => ({
    download_id: r.download_id,
    md5: r.md5,
    crc32: r.crc32,
    size_bytes: Number(r.size_bytes),
    inner_path: r.inner_path,
    updated_at: r.updated_at instanceof Date ? r.updated_at.toISOString() : r.updated_at,
  });
  const snapshot = {
    exportedAt: new Date().toISOString(),
    count: rows.length,
    rows: rows.map(toPlainRow),
  };
  await fs.mkdir(path.dirname(SNAPSHOT_PATH), { recursive: true });
  const tmp = SNAPSHOT_PATH + ".tmp";
  await fs.writeFile(tmp, JSON.stringify(snapshot, null, 2), "utf8");
  await fs.rename(tmp, SNAPSHOT_PATH);
  logInfo(`Exported ${rows.length} rows to ${SNAPSHOT_PATH}`);
  return rows.length;
}
// ---- Main processing loop ----
// Module-level so the SIGINT and fatal-error handlers below can persist
// progress even while main() is mid-run.
let currentState = null;
// Process every tape-image download: locate its zip in the local CDN mirror,
// extract it into a sibling "<zip>_CONTENTS" directory, hash the best inner
// tape file, and upsert the result into software_hashes. Progress is
// checkpointed to STATE_FILE so an interrupted run can resume; ends by
// exporting the JSON snapshot.
async function main() {
await ensureTable();
// --export-only: dump the current table to JSON and exit without hashing.
if (EXPORT_ONLY) {
const count = await exportSnapshot();
logInfo(`Export complete: ${count} rows.`);
await pool.end();
return;
}
// Determine start point: explicit --start-from-id wins; otherwise resume
// just past the last id recorded in the prior state (unless --rebuild-all).
const prior = await loadState();
let resumeFrom = CLI_START_FROM;
if (!REBUILD_ALL && !CLI_START_FROM && prior?.lastProcessedId) {
resumeFrom = prior.lastProcessedId + 1;
}
const startedAt = new Date().toISOString();
// In-memory run state. The local counters below are mirrored into it only at
// checkpoints and on success, so a crash between checkpoints can leave the
// saved counters slightly stale — lastProcessedId is always current though.
currentState = {
version: 1,
startedAt,
updatedAt: startedAt,
startFromId: resumeFrom,
lastProcessedId: prior?.lastProcessedId ?? -1,
processed: 0,
hashed: 0,
skipped: 0,
errors: 0,
error: undefined, // omitted by JSON.stringify until a fatal error sets it
};
// Query tape-image downloads
const placeholders = TAPE_FILETYPE_IDS.map(() => "?").join(", ");
let rows;
if (REBUILD_MISSING) {
// Only fetch downloads that don't already have a hash
[rows] = await pool.query(
`SELECT d.id, d.file_link, d.file_size FROM downloads d
LEFT JOIN software_hashes sh ON sh.download_id = d.id
WHERE d.filetype_id IN (${placeholders}) AND sh.download_id IS NULL
ORDER BY d.id ASC`,
TAPE_FILETYPE_IDS
);
} else {
[rows] = await pool.query(
`SELECT id, file_link, file_size FROM downloads
WHERE filetype_id IN (${placeholders}) AND id >= ?
ORDER BY id ASC`,
[...TAPE_FILETYPE_IDS, resumeFrom]
);
}
// Also get total count for progress display
const [totalRows] = await pool.query(
`SELECT COUNT(*) as cnt FROM downloads WHERE filetype_id IN (${placeholders})`,
TAPE_FILETYPE_IDS
);
const total = totalRows[0].cnt;
const mode = REBUILD_MISSING ? "missing only" : REBUILD_ALL ? "rebuild all" : `from id >= ${resumeFrom}`;
logInfo(`Processing ${rows.length} tape-image downloads (total in DB: ${total}, mode: ${mode})`);
let processed = 0;
let hashed = 0;
let skipped = 0;
let errors = 0;
for (const row of rows) {
const { id, file_link: fileLink } = row;
try {
const localZip = toLocalPath(fileLink);
if (!localZip) {
// /denied/ and other non-hosted prefixes — skip silently
skipped++;
processed++;
currentState.lastProcessedId = id;
// Skips are cheap, so checkpoint less often (every 500) than the
// hashing path (every 100).
if (processed % 500 === 0) {
await checkpoint();
}
continue;
}
// Check if zip exists locally
try {
await fs.access(localZip);
} catch {
// Zip not synced yet — skip silently
skipped++;
processed++;
currentState.lastProcessedId = id;
if (processed % 500 === 0) {
await checkpoint();
}
continue;
}
// Check/create _CONTENTS
const contentsDir = localZip + "_CONTENTS";
let contentsExisted = false;
try {
await fs.access(contentsDir);
contentsExisted = true;
} catch {
// Need to extract
}
if (!contentsExisted) {
try {
await extractZipContents(localZip, contentsDir);
} catch (err) {
logWarn(` [${id}] Extract failed: ${err.message}`);
errors++;
processed++;
currentState.lastProcessedId = id;
continue;
}
}
// Find tape file
const tapeFile = await findTapeFile(contentsDir);
if (!tapeFile) {
// No tape file found inside zip — unusual but not fatal
if (VERBOSE) logWarn(` [${id}] No tape file in ${contentsDir}`);
skipped++;
processed++;
currentState.lastProcessedId = id;
continue;
}
// Compute hashes
const hashes = await computeHashes(tapeFile.path);
// Relative path inside _CONTENTS for the inner_path column
const innerPath = path.relative(contentsDir, tapeFile.path);
// Upsert
await pool.query(
`INSERT INTO software_hashes (download_id, md5, crc32, size_bytes, inner_path, updated_at)
VALUES (?, ?, ?, ?, ?, NOW())
ON DUPLICATE KEY UPDATE
md5 = VALUES(md5),
crc32 = VALUES(crc32),
size_bytes = VALUES(size_bytes),
inner_path = VALUES(inner_path),
updated_at = NOW()`,
[id, hashes.md5, hashes.crc32, hashes.sizeBytes, innerPath]
);
hashed++;
processed++;
currentState.lastProcessedId = id;
currentState.hashed = hashed;
currentState.processed = processed;
currentState.skipped = skipped;
currentState.errors = errors;
currentState.updatedAt = new Date().toISOString();
if (processed % 100 === 0) {
await checkpoint();
logInfo(`... processed=${processed}/${rows.length}, hashed=${hashed}, skipped=${skipped}, errors=${errors}`);
}
} catch (err) {
// Per-row failures are logged and counted but never abort the run.
logError(` [${id}] Unexpected error: ${err.message}`);
errors++;
processed++;
currentState.lastProcessedId = id;
currentState.errors = errors;
}
}
// Final state save
currentState.processed = processed;
currentState.hashed = hashed;
currentState.skipped = skipped;
currentState.errors = errors;
currentState.updatedAt = new Date().toISOString();
await saveStateAtomic(currentState);
logInfo(`\nProcessing complete: processed=${processed}, hashed=${hashed}, skipped=${skipped}, errors=${errors}`);
// Export snapshot
logInfo("\nExporting JSON snapshot...");
await exportSnapshot();
await pool.end();
logInfo("Done.");
// Mirror the loop counters into currentState and persist it. State-write
// failures are logged but never interrupt processing.
async function checkpoint() {
currentState.processed = processed;
currentState.hashed = hashed;
currentState.skipped = skipped;
currentState.errors = errors;
currentState.updatedAt = new Date().toISOString();
try {
await saveStateAtomic(currentState);
} catch (e) {
logError(`Failed to write state: ${e?.message || e}`);
}
}
}
// ---- Graceful shutdown ----
// On Ctrl-C, persist current progress so the next run can resume, close the
// DB pool, and exit 130 (128 + SIGINT).
process.on("SIGINT", async () => {
  logWarn("\nInterrupted (SIGINT). Writing state...");
  if (currentState) {
    currentState.updatedAt = new Date().toISOString();
    try {
      await saveStateAtomic(currentState);
      logWarn(`State saved at: ${STATE_FILE}`);
    } catch (e) {
      logError(`Failed to write state on SIGINT: ${e?.message || e}`);
    }
  }
  try { await pool.end(); } catch {}
  process.exit(130);
});
// Run the pipeline; on any unhandled error, record the message and stack in
// the state file before exiting non-zero.
main().catch(async (err) => {
  logError(`Fatal error: ${err.message}\n${err.stack || "<no stack>"}`);
  if (currentState) {
    currentState.updatedAt = new Date().toISOString();
    currentState.error = { message: err.message, stack: err.stack };
    try {
      await saveStateAtomic(currentState);
    } catch (e) {
      logError(`Failed to write state on fatal: ${e?.message || e}`);
    }
  }
  try { await pool.end(); } catch {}
  process.exit(1);
});