Processes tape-image downloads (filetype_id 8, 22), extracts zip contents, finds the inner tape file (.tap/.tzx/.pzx/.csw), computes MD5/CRC32/size, and upserts into software_hashes table. Exports a JSON snapshot for DB wipe recovery. Supports --resume, --rebuild-all, --start-from-id, --export-only flags with state file persistence. claude-opus-4-6@MacFiver
480 lines
14 KiB
JavaScript
Executable File
480 lines
14 KiB
JavaScript
Executable File
#!/usr/bin/env node
|
|
|
|
// Compute MD5, CRC32 and size for the inner tape file inside each download zip.
|
|
// Populates the `software_hashes` table and exports a JSON snapshot to
|
|
// data/zxdb/software_hashes.json for reimport after DB wipes.
|
|
//
|
|
// Usage:
|
|
// node bin/update-software-hashes.mjs [flags]
|
|
//
|
|
// Flags:
|
|
// --rebuild-all Ignore state and reprocess every download
|
|
// --start-from-id=N Start processing from download id N
|
|
// --export-only Skip processing, just export current table to JSON
|
|
// --quiet Reduce log output
|
|
// --verbose Force verbose output (default)
|
|
|
|
import dotenv from "dotenv";
|
|
import dotenvExpand from "dotenv-expand";
|
|
dotenvExpand.expand(dotenv.config());
|
|
|
|
import { z } from "zod";
|
|
import mysql from "mysql2/promise";
|
|
import fs from "fs/promises";
|
|
import path from "path";
|
|
import { createReadStream } from "fs";
|
|
import { createHash } from "crypto";
|
|
import { pipeline } from "stream/promises";
|
|
import { Transform } from "stream";
|
|
import { fileURLToPath } from "url";
|
|
|
|
const __filename = fileURLToPath(import.meta.url);
|
|
const __dirname = path.dirname(__filename);
|
|
const PROJECT_ROOT = path.resolve(__dirname, "..");
|
|
|
|
// ---- CLI flags ----
const cliArgs = process.argv.slice(2);
const ARGV = new Set(cliArgs);
const QUIET = ARGV.has("--quiet");
// Verbose unless --quiet is given; an explicit --verbose overrides --quiet.
const VERBOSE = !QUIET || ARGV.has("--verbose");
const REBUILD_ALL = ARGV.has("--rebuild-all");
const EXPORT_ONLY = ARGV.has("--export-only");

// Parse --start-from-id=N (if repeated, the last occurrence wins).
let CLI_START_FROM = 0;
const START_FROM_RE = /^--start-from-id=(\d+)$/;
for (const arg of cliArgs) {
  const match = START_FROM_RE.exec(arg);
  if (match !== null) CLI_START_FROM = Number.parseInt(match[1], 10);
}

// Leveled loggers: info respects --quiet; warnings and errors always print.
function logInfo(msg) { if (VERBOSE) console.log(msg); }
function logWarn(msg) { console.warn(msg); }
function logError(msg) { console.error(msg); }
|
|
|
|
// ---- Environment ----
// Validate required env vars up front so misconfiguration fails loudly.
const envSchema = z.object({
  // Must be a well-formed URL using the mysql:// scheme.
  ZXDB_URL: z
    .string()
    .url()
    .refine((s) => s.startsWith("mysql://"), {
      message: "ZXDB_URL must be a valid mysql:// URL",
    }),
  // Root directory of the local CDN mirror.
  CDN_CACHE: z.string().min(1, "CDN_CACHE must be set to the local CDN mirror root"),
});

const parsedEnv = envSchema.safeParse(process.env);
if (!parsedEnv.success) {
  const details = JSON.stringify(parsedEnv.error.format(), null, 2);
  logError(`Invalid environment variables:\n${details}`);
  process.exit(1);
}
const { ZXDB_URL, CDN_CACHE } = parsedEnv.data;

// The JSON snapshot lives inside the repo; resume state lives next to the mirror.
const SNAPSHOT_PATH = path.join(PROJECT_ROOT, "data", "zxdb", "software_hashes.json");
const STATE_FILE = path.join(CDN_CACHE, ".update-software-hashes.state.json");
|
|
|
|
// ZXDB `downloads.filetype_id` values that denote tape images.
const TAPE_FILETYPE_IDS = [8, 22];

// Recognised tape-image extensions, ordered by preference (most common first).
const TAPE_EXTENSIONS = [".tap", ".tzx", ".pzx", ".csw"];
|
|
|
|
// ---- DB ----
// Shared connection pool used by every query in this script.
const poolConfig = {
  uri: ZXDB_URL,
  connectionLimit: 10,
  maxPreparedStatements: 256,
};
const pool = mysql.createPool(poolConfig);
|
|
|
|
// ---- Path mapping (mirrors sync-downloads.mjs) ----

/**
 * Map a ZXDB download file_link to its path inside the local CDN mirror.
 *
 * @param {string} fileLink - Link as stored in the downloads table.
 * @returns {string|null} Local filesystem path, or null for unmirrored prefixes.
 */
function toLocalPath(fileLink) {
  // [match prefix, portion to strip, mirror subdirectory]
  const mappings = [
    ["/zxdb/sinclair/", "/zxdb/sinclair", "SC"],
    ["/pub/sinclair/", "/pub/sinclair", "WoS"],
  ];
  for (const [prefix, strip, mirrorDir] of mappings) {
    if (fileLink.startsWith(prefix)) {
      return path.join(CDN_CACHE, mirrorDir, fileLink.slice(strip.length));
    }
  }
  return null;
}
|
|
|
|
// ---- State management ----

/**
 * Read the persisted resume state from STATE_FILE.
 *
 * @returns {Promise<object|null>} Parsed state, or null when the file is
 *   missing, unreadable, or not valid JSON.
 */
async function loadState() {
  let raw;
  try {
    raw = await fs.readFile(STATE_FILE, "utf8");
  } catch {
    return null;
  }
  try {
    return JSON.parse(raw);
  } catch {
    return null;
  }
}
|
|
|
|
/**
 * Persist state atomically: write to a sibling temp file, then rename over
 * the real one so readers never observe a partially written file.
 *
 * @param {object} state - JSON-serializable state object.
 */
async function saveStateAtomic(state) {
  const tmpPath = `${STATE_FILE}.tmp`;
  const body = JSON.stringify(state, null, 2);
  await fs.writeFile(tmpPath, body, "utf8");
  await fs.rename(tmpPath, STATE_FILE);
}
|
|
|
|
// ---- Zip extraction ----
|
|
|
|
// Extraction shells out to the system `unzip` binary via child_process —
// ZXDB zips are simple (no encryption, typically a single file), and the
// system tool copes well with their legacy quirks.
|
|
|
|
/**
 * Extract a zip archive into `contentsDir` using the system `unzip` binary.
 *
 * @param {string} zipPath - Absolute path of the zip to extract.
 * @param {string} contentsDir - Destination directory (created if missing).
 * @throws {Error} When unzip fails with anything other than exit code 1
 *   (including spawn failures such as ENOENT when unzip is not installed).
 */
async function extractZipContents(zipPath, contentsDir) {
  // Imported lazily per call; NOTE(review): these could be hoisted to the
  // top-level import block — confirm there is no load-order reason not to.
  const { execFile } = await import("child_process");
  const { promisify } = await import("util");
  const execFileAsync = promisify(execFile);

  await fs.mkdir(contentsDir, { recursive: true });

  try {
    // execFile passes arguments directly (no shell), so brackets/spaces in
    // paths need no quoting; -o overwrites existing files without prompting.
    await execFileAsync("unzip", ["-o", "-d", contentsDir, zipPath], {
      maxBuffer: 50 * 1024 * 1024, // unzip lists every member on stdout
    });
  } catch (err) {
    // unzip returns exit code 1 for warnings (e.g. "appears to use backslashes")
    // which is non-fatal — only fail on actual extraction errors
    if (err.code !== 1) {
      throw new Error(`unzip failed for ${zipPath}: ${err.message}`);
    }
  }
}
|
|
|
|
// ---- Find tape file inside _CONTENTS ----

/**
 * Locate the best tape image inside an extracted-contents directory.
 * Preference follows TAPE_EXTENSIONS order; ties are broken alphabetically
 * by file name.
 *
 * @param {string} contentsDir - Directory to search (recursively).
 * @returns {Promise<{path: string, ext: string, priority: number, name: string}|null>}
 */
async function findTapeFile(contentsDir) {
  let entries;
  try {
    entries = await fs.readdir(contentsDir, { recursive: true, withFileTypes: true });
  } catch {
    // Directory missing or unreadable — treat as "no tape file".
    return null;
  }

  const candidates = entries
    .filter((entry) => entry.isFile())
    .map((entry) => {
      const ext = path.extname(entry.name).toLowerCase();
      return {
        // Dirent.parentPath is the modern name; older Node exposes .path.
        path: path.join(entry.parentPath ?? entry.path, entry.name),
        ext,
        priority: TAPE_EXTENSIONS.indexOf(ext),
        name: entry.name,
      };
    })
    .filter((candidate) => candidate.priority !== -1);

  if (candidates.length === 0) return null;

  // Lowest priority index wins; equal priorities fall back to name order.
  candidates.sort((a, b) => a.priority - b.priority || a.name.localeCompare(b.name));
  return candidates[0];
}
|
|
|
|
// ---- Hash computation ----

// CRC32 (IEEE 802.3 polynomial 0xEDB88320) lookup table, built once at
// module load. The original rebuilt all 256 entries on every call.
const CRC32_TABLE = (() => {
  const table = new Uint32Array(256);
  for (let i = 0; i < 256; i++) {
    let c = i;
    for (let j = 0; j < 8; j++) {
      c = (c & 1) ? (0xEDB88320 ^ (c >>> 1)) : (c >>> 1);
    }
    table[i] = c;
  }
  return table;
})();

/**
 * Stream a file once and compute its MD5, CRC32 and byte size.
 *
 * @param {string} filePath - File to hash.
 * @returns {Promise<{md5: string, crc32: string, sizeBytes: number}>}
 *   md5 is 32 lowercase hex chars; crc32 is 8 lowercase hex chars
 *   (zero-padded); sizeBytes is the exact byte count.
 * @throws When the file cannot be opened or read.
 */
async function computeHashes(filePath) {
  const md5 = createHash("md5");
  let crc = 0xFFFFFFFF;
  let size = 0;

  // Single pass: feed every chunk to MD5, the CRC accumulator and the
  // byte counter, then pass it through unchanged.
  const hasher = new Transform({
    transform(chunk, encoding, callback) {
      md5.update(chunk);
      size += chunk.length;
      for (let i = 0; i < chunk.length; i++) {
        crc = CRC32_TABLE[(crc ^ chunk[i]) & 0xFF] ^ (crc >>> 8);
      }
      callback(null, chunk);
    },
  });

  // Drain the transform's output; pipeline propagates read/stream errors.
  await pipeline(createReadStream(filePath), hasher, async function* (source) {
    for await (const _ of source) { /* discard */ }
  });

  // Final XOR, then >>> 0 to coerce to unsigned before hex formatting.
  const crc32Final = ((crc ^ 0xFFFFFFFF) >>> 0).toString(16).padStart(8, "0");
  return {
    md5: md5.digest("hex"),
    crc32: crc32Final,
    sizeBytes: size,
  };
}
|
|
|
|
// ---- Ensure software_hashes table exists ----

/**
 * Create the software_hashes table if absent (idempotent). Keyed by
 * download_id, with secondary indexes on md5 and crc32 for reverse lookups.
 */
async function ensureTable() {
  const ddl = `
    CREATE TABLE IF NOT EXISTS software_hashes (
      download_id INT NOT NULL PRIMARY KEY,
      md5 VARCHAR(32) NOT NULL,
      crc32 VARCHAR(8) NOT NULL,
      size_bytes BIGINT NOT NULL,
      inner_path VARCHAR(500) NOT NULL,
      updated_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
      INDEX idx_sh_md5 (md5),
      INDEX idx_sh_crc32 (crc32)
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
  `;
  await pool.query(ddl);
}
|
|
|
|
// ---- JSON export ----

/**
 * Dump the entire software_hashes table to SNAPSHOT_PATH as pretty-printed
 * JSON, written atomically (temp file + rename).
 *
 * @returns {Promise<number>} Number of rows exported.
 */
async function exportSnapshot() {
  const [rows] = await pool.query(
    "SELECT download_id, md5, crc32, size_bytes, inner_path, updated_at FROM software_hashes ORDER BY download_id"
  );

  const serialized = [];
  for (const r of rows) {
    serialized.push({
      download_id: r.download_id,
      md5: r.md5,
      crc32: r.crc32,
      size_bytes: Number(r.size_bytes), // BIGINT may not arrive as a JS number
      inner_path: r.inner_path,
      updated_at: r.updated_at instanceof Date ? r.updated_at.toISOString() : r.updated_at,
    });
  }

  const snapshot = {
    exportedAt: new Date().toISOString(),
    count: serialized.length,
    rows: serialized,
  };

  // Ensure the target directory exists, then write atomically.
  await fs.mkdir(path.dirname(SNAPSHOT_PATH), { recursive: true });
  const tmpPath = `${SNAPSHOT_PATH}.tmp`;
  await fs.writeFile(tmpPath, JSON.stringify(snapshot, null, 2), "utf8");
  await fs.rename(tmpPath, SNAPSHOT_PATH);

  logInfo(`Exported ${rows.length} rows to ${SNAPSHOT_PATH}`);
  return rows.length;
}
|
|
|
|
// ---- Main processing loop ----

// Shared with the SIGINT and fatal-error handlers so they can persist
// whatever progress main() has accumulated.
let currentState = null;

/**
 * Main driver: ensure the table exists, resolve the resume point, hash each
 * tape-image download present in the local mirror, upsert the result into
 * software_hashes, persist checkpoint state, and finally export the JSON
 * snapshot.
 */
async function main() {
  await ensureTable();

  // --export-only: dump the current table contents and exit without hashing.
  if (EXPORT_ONLY) {
    const count = await exportSnapshot();
    logInfo(`Export complete: ${count} rows.`);
    await pool.end();
    return;
  }

  // Determine start point.
  // Priority: explicit --start-from-id > saved state (unless --rebuild-all,
  // which ignores the state file and starts from 0).
  const prior = await loadState();
  let resumeFrom = CLI_START_FROM;
  if (!REBUILD_ALL && !CLI_START_FROM && prior?.lastProcessedId) {
    resumeFrom = prior.lastProcessedId + 1;
  }

  const startedAt = new Date().toISOString();
  currentState = {
    version: 1,
    startedAt,
    updatedAt: startedAt,
    startFromId: resumeFrom,
    // Carry forward the prior high-water mark until we process something.
    lastProcessedId: prior?.lastProcessedId ?? -1,
    processed: 0,
    hashed: 0,
    skipped: 0,
    errors: 0,
    error: undefined,
  };

  // Query tape-image downloads from the resume point onward.
  const placeholders = TAPE_FILETYPE_IDS.map(() => "?").join(", ");
  const [rows] = await pool.query(
    `SELECT id, file_link, file_size FROM downloads
     WHERE filetype_id IN (${placeholders}) AND id >= ?
     ORDER BY id ASC`,
    [...TAPE_FILETYPE_IDS, resumeFrom]
  );

  // Also get total count for progress display.
  const [totalRows] = await pool.query(
    `SELECT COUNT(*) as cnt FROM downloads WHERE filetype_id IN (${placeholders})`,
    TAPE_FILETYPE_IDS
  );
  const total = totalRows[0].cnt;

  logInfo(`Processing ${rows.length} tape-image downloads (total in DB: ${total}, starting from id >= ${resumeFrom})`);

  // Loop counters; checkpoint() syncs them into currentState periodically.
  let processed = 0;
  let hashed = 0;
  let skipped = 0;
  let errors = 0;

  for (const row of rows) {
    const { id, file_link: fileLink } = row;

    try {
      // Map the DB file_link to a path inside the local mirror.
      const localZip = toLocalPath(fileLink);
      if (!localZip) {
        logWarn(` [${id}] Unsupported file_link prefix: ${fileLink}`);
        errors++;
        processed++;
        currentState.lastProcessedId = id;
        continue;
      }

      // Check if zip exists locally.
      try {
        await fs.access(localZip);
      } catch {
        // Zip not synced yet — skip silently.
        skipped++;
        processed++;
        currentState.lastProcessedId = id;
        // Checkpoint even through long runs of skips so a restart resumes
        // past them instead of re-scanning.
        if (processed % 500 === 0) {
          await checkpoint();
        }
        continue;
      }

      // Check/create the extracted-contents directory next to the zip.
      const contentsDir = localZip + "_CONTENTS";
      let contentsExisted = false;
      try {
        await fs.access(contentsDir);
        contentsExisted = true;
      } catch {
        // Need to extract.
      }

      if (!contentsExisted) {
        try {
          await extractZipContents(localZip, contentsDir);
        } catch (err) {
          logWarn(` [${id}] Extract failed: ${err.message}`);
          errors++;
          processed++;
          currentState.lastProcessedId = id;
          continue;
        }
      }

      // Find the best tape file inside the extracted contents.
      const tapeFile = await findTapeFile(contentsDir);
      if (!tapeFile) {
        // No tape file found inside zip — unusual but not fatal.
        if (VERBOSE) logWarn(` [${id}] No tape file in ${contentsDir}`);
        skipped++;
        processed++;
        currentState.lastProcessedId = id;
        continue;
      }

      // Compute MD5/CRC32/size in a single streaming pass.
      const hashes = await computeHashes(tapeFile.path);

      // Relative path inside _CONTENTS for the inner_path column.
      const innerPath = path.relative(contentsDir, tapeFile.path);

      // Upsert: insert new rows, refresh existing ones in place.
      await pool.query(
        `INSERT INTO software_hashes (download_id, md5, crc32, size_bytes, inner_path, updated_at)
         VALUES (?, ?, ?, ?, ?, NOW())
         ON DUPLICATE KEY UPDATE
           md5 = VALUES(md5),
           crc32 = VALUES(crc32),
           size_bytes = VALUES(size_bytes),
           inner_path = VALUES(inner_path),
           updated_at = NOW()`,
        [id, hashes.md5, hashes.crc32, hashes.sizeBytes, innerPath]
      );

      hashed++;
      processed++;
      // Sync counters into the shared state so SIGINT captures fresh values.
      currentState.lastProcessedId = id;
      currentState.hashed = hashed;
      currentState.processed = processed;
      currentState.skipped = skipped;
      currentState.errors = errors;
      currentState.updatedAt = new Date().toISOString();

      if (processed % 100 === 0) {
        await checkpoint();
        logInfo(`... processed=${processed}/${rows.length}, hashed=${hashed}, skipped=${skipped}, errors=${errors}`);
      }
    } catch (err) {
      // Catch-all so one bad download never aborts the whole run.
      logError(` [${id}] Unexpected error: ${err.message}`);
      errors++;
      processed++;
      currentState.lastProcessedId = id;
      currentState.errors = errors;
    }
  }

  // Final state save.
  currentState.processed = processed;
  currentState.hashed = hashed;
  currentState.skipped = skipped;
  currentState.errors = errors;
  currentState.updatedAt = new Date().toISOString();
  await saveStateAtomic(currentState);

  logInfo(`\nProcessing complete: processed=${processed}, hashed=${hashed}, skipped=${skipped}, errors=${errors}`);

  // Export snapshot for DB wipe recovery.
  logInfo("\nExporting JSON snapshot...");
  await exportSnapshot();

  await pool.end();
  logInfo("Done.");

  // Function declaration is hoisted, so the loop above can call it even
  // though it is defined here. Syncs loop counters into currentState and
  // persists — best-effort: a failed write must not kill the run.
  async function checkpoint() {
    currentState.processed = processed;
    currentState.hashed = hashed;
    currentState.skipped = skipped;
    currentState.errors = errors;
    currentState.updatedAt = new Date().toISOString();
    try {
      await saveStateAtomic(currentState);
    } catch (e) {
      logError(`Failed to write state: ${e?.message || e}`);
    }
  }
}
|
|
|
|
// ---- Graceful shutdown ----
// On Ctrl-C, persist whatever progress main() has accumulated so a later run
// can resume from it, close the pool, then exit with the conventional
// 128 + SIGINT(2) = 130 status.
process.on("SIGINT", async () => {
  logWarn("\nInterrupted (SIGINT). Writing state...");
  try {
    // currentState is null until main() initializes it — nothing to save then.
    if (currentState) {
      currentState.updatedAt = new Date().toISOString();
      await saveStateAtomic(currentState);
      logWarn(`State saved at: ${STATE_FILE}`);
    }
  } catch (e) {
    logError(`Failed to write state on SIGINT: ${e?.message || e}`);
  }
  // Best-effort pool shutdown — we are exiting regardless.
  try { await pool.end(); } catch {}
  process.exit(130);
});
|
|
|
|
// Run
// Top-level failure handler: record the error in the state file (so the next
// run can see what went wrong), close the pool, and exit non-zero.
main().catch(async (err) => {
  logError(`Fatal error: ${err.message}\n${err.stack || "<no stack>"}`);
  try {
    // currentState may still be null if main() failed before initializing it.
    if (currentState) {
      currentState.updatedAt = new Date().toISOString();
      currentState.error = { message: err.message, stack: err.stack };
      await saveStateAtomic(currentState);
    }
  } catch (e) {
    logError(`Failed to write state on fatal: ${e?.message || e}`);
  }
  // Best-effort pool shutdown — we are exiting regardless.
  try { await pool.end(); } catch {}
  process.exit(1);
});
|