Processes tape-image downloads (filetype_id 8, 22), extracts zip contents, finds the inner tape file (.tap/.tzx/.pzx/.csw), computes MD5/CRC32/size, and upserts into software_hashes table. Exports a JSON snapshot for DB wipe recovery. Supports --resume, --rebuild-all, --start-from-id, --export-only flags with state file persistence. claude-opus-4-6@MacFiver
480 lines
14 KiB
JavaScript
Executable File
480 lines
14 KiB
JavaScript
Executable File
#!/usr/bin/env node
|
|
|
|
// Compute MD5, CRC32 and size for the inner tape file inside each download zip.
|
|
// Populates the `software_hashes` table and exports a JSON snapshot to
|
|
// data/zxdb/software_hashes.json for reimport after DB wipes.
|
|
//
|
|
// Usage:
|
|
// node bin/update-software-hashes.mjs [flags]
|
|
//
|
|
// Flags:
|
|
// --rebuild-all Ignore state and reprocess every download
|
|
// --start-from-id=N Start processing from download id N
|
|
// --export-only Skip processing, just export current table to JSON
|
|
// --quiet Reduce log output
|
|
// --verbose Force verbose output (default)
|
|
|
|
import dotenv from "dotenv";
|
|
import dotenvExpand from "dotenv-expand";
|
|
dotenvExpand.expand(dotenv.config());
|
|
|
|
import { z } from "zod";
|
|
import mysql from "mysql2/promise";
|
|
import fs from "fs/promises";
|
|
import path from "path";
|
|
import { createReadStream } from "fs";
|
|
import { createHash } from "crypto";
|
|
import { pipeline } from "stream/promises";
|
|
import { Transform } from "stream";
|
|
import { fileURLToPath } from "url";
|
|
|
|
const __filename = fileURLToPath(import.meta.url);
|
|
const __dirname = path.dirname(__filename);
|
|
const PROJECT_ROOT = path.resolve(__dirname, "..");
|
|
|
|
// ---- CLI flags ----
const cliArgs = process.argv.slice(2);
const ARGV = new Set(cliArgs);
const QUIET = ARGV.has("--quiet");
// Verbose unless --quiet is given; an explicit --verbose overrides --quiet.
const VERBOSE = !QUIET || ARGV.has("--verbose");
const REBUILD_ALL = ARGV.has("--rebuild-all");
const EXPORT_ONLY = ARGV.has("--export-only");

// Parse --start-from-id=N (if repeated, the last occurrence wins).
let CLI_START_FROM = 0;
const START_FROM_RE = /^--start-from-id=(\d+)$/;
for (const arg of cliArgs) {
  const match = START_FROM_RE.exec(arg);
  if (match !== null) CLI_START_FROM = Number.parseInt(match[1], 10);
}

// Leveled loggers: info respects --quiet; warnings and errors always print.
function logInfo(msg) { if (VERBOSE) console.log(msg); }
function logWarn(msg) { console.warn(msg); }
function logError(msg) { console.error(msg); }
|
|
|
|
// ---- Environment ----
// Validate required env vars up front so misconfiguration fails loudly.
const envSchema = z.object({
  // Must be a well-formed URL using the mysql:// scheme.
  ZXDB_URL: z
    .string()
    .url()
    .refine((s) => s.startsWith("mysql://"), {
      message: "ZXDB_URL must be a valid mysql:// URL",
    }),
  // Root directory of the local CDN mirror.
  CDN_CACHE: z.string().min(1, "CDN_CACHE must be set to the local CDN mirror root"),
});

const parsedEnv = envSchema.safeParse(process.env);
if (!parsedEnv.success) {
  const details = JSON.stringify(parsedEnv.error.format(), null, 2);
  logError(`Invalid environment variables:\n${details}`);
  process.exit(1);
}
const { ZXDB_URL, CDN_CACHE } = parsedEnv.data;

// The JSON snapshot lives inside the repo; resume state lives next to the mirror.
const SNAPSHOT_PATH = path.join(PROJECT_ROOT, "data", "zxdb", "software_hashes.json");
const STATE_FILE = path.join(CDN_CACHE, ".update-software-hashes.state.json");
|
|
|
|
// ZXDB `downloads.filetype_id` values that denote tape images.
const TAPE_FILETYPE_IDS = [8, 22];

// Recognised tape-image extensions, ordered by preference (most common first).
const TAPE_EXTENSIONS = [".tap", ".tzx", ".pzx", ".csw"];
|
|
|
|
// ---- DB ----
// Shared connection pool used by every query in this script.
const poolConfig = {
  uri: ZXDB_URL,
  connectionLimit: 10,
  maxPreparedStatements: 256,
};
const pool = mysql.createPool(poolConfig);
|
|
|
|
// ---- Path mapping (mirrors sync-downloads.mjs) ----

/**
 * Map a ZXDB download file_link to its path inside the local CDN mirror.
 *
 * @param {string} fileLink - Link as stored in the downloads table.
 * @returns {string|null} Local filesystem path, or null for unmirrored prefixes.
 */
function toLocalPath(fileLink) {
  // [match prefix, portion to strip, mirror subdirectory]
  const mappings = [
    ["/zxdb/sinclair/", "/zxdb/sinclair", "SC"],
    ["/pub/sinclair/", "/pub/sinclair", "WoS"],
  ];
  for (const [prefix, strip, mirrorDir] of mappings) {
    if (fileLink.startsWith(prefix)) {
      return path.join(CDN_CACHE, mirrorDir, fileLink.slice(strip.length));
    }
  }
  return null;
}
|
|
|
|
// ---- State management ----

/**
 * Read the persisted resume state from STATE_FILE.
 *
 * @returns {Promise<object|null>} Parsed state, or null when the file is
 *   missing, unreadable, or not valid JSON.
 */
async function loadState() {
  let raw;
  try {
    raw = await fs.readFile(STATE_FILE, "utf8");
  } catch {
    return null;
  }
  try {
    return JSON.parse(raw);
  } catch {
    return null;
  }
}
|
|
|
|
/**
 * Persist state atomically: write to a sibling temp file, then rename over
 * the real one so readers never observe a partially written file.
 *
 * @param {object} state - JSON-serializable state object.
 */
async function saveStateAtomic(state) {
  const tmpPath = `${STATE_FILE}.tmp`;
  const body = JSON.stringify(state, null, 2);
  await fs.writeFile(tmpPath, body, "utf8");
  await fs.rename(tmpPath, STATE_FILE);
}
|
|
|
|
// ---- Zip extraction ----
|
|
|
|
// Extraction shells out to the system `unzip` binary via child_process —
// ZXDB zips are simple (no encryption, typically a single file), and the
// system tool copes well with their legacy quirks.
|
|
|
|
/**
 * Extract a zip archive into `contentsDir` using the system `unzip` binary.
 *
 * @param {string} zipPath - Absolute path of the zip to extract.
 * @param {string} contentsDir - Destination directory (created if missing).
 * @throws {Error} When unzip fails with anything other than exit code 1
 *   (including spawn failures such as ENOENT when unzip is not installed).
 */
async function extractZipContents(zipPath, contentsDir) {
  // Imported lazily per call; NOTE(review): these could be hoisted to the
  // top-level import block — confirm there is no load-order reason not to.
  const { execFile } = await import("child_process");
  const { promisify } = await import("util");
  const execFileAsync = promisify(execFile);

  await fs.mkdir(contentsDir, { recursive: true });

  try {
    // execFile passes arguments directly (no shell), so brackets/spaces in
    // paths need no quoting; -o overwrites existing files without prompting.
    await execFileAsync("unzip", ["-o", "-d", contentsDir, zipPath], {
      maxBuffer: 50 * 1024 * 1024, // unzip lists every member on stdout
    });
  } catch (err) {
    // unzip returns exit code 1 for warnings (e.g. "appears to use backslashes")
    // which is non-fatal — only fail on actual extraction errors
    if (err.code !== 1) {
      throw new Error(`unzip failed for ${zipPath}: ${err.message}`);
    }
  }
}
|
|
|
|
// ---- Find tape file inside _CONTENTS ----

/**
 * Locate the best tape image inside an extracted-contents directory.
 * Preference follows TAPE_EXTENSIONS order; ties are broken alphabetically
 * by file name.
 *
 * @param {string} contentsDir - Directory to search (recursively).
 * @returns {Promise<{path: string, ext: string, priority: number, name: string}|null>}
 */
async function findTapeFile(contentsDir) {
  let entries;
  try {
    entries = await fs.readdir(contentsDir, { recursive: true, withFileTypes: true });
  } catch {
    // Directory missing or unreadable — treat as "no tape file".
    return null;
  }

  const candidates = entries
    .filter((entry) => entry.isFile())
    .map((entry) => {
      const ext = path.extname(entry.name).toLowerCase();
      return {
        // Dirent.parentPath is the modern name; older Node exposes .path.
        path: path.join(entry.parentPath ?? entry.path, entry.name),
        ext,
        priority: TAPE_EXTENSIONS.indexOf(ext),
        name: entry.name,
      };
    })
    .filter((candidate) => candidate.priority !== -1);

  if (candidates.length === 0) return null;

  // Lowest priority index wins; equal priorities fall back to name order.
  candidates.sort((a, b) => a.priority - b.priority || a.name.localeCompare(b.name));
  return candidates[0];
}
|
|
|
|
// ---- Hash computation ----

// CRC32 (IEEE 802.3 polynomial 0xEDB88320) lookup table, built once at
// module load. The original rebuilt all 256 entries on every call.
const CRC32_TABLE = (() => {
  const table = new Uint32Array(256);
  for (let i = 0; i < 256; i++) {
    let c = i;
    for (let j = 0; j < 8; j++) {
      c = (c & 1) ? (0xEDB88320 ^ (c >>> 1)) : (c >>> 1);
    }
    table[i] = c;
  }
  return table;
})();

/**
 * Stream a file once and compute its MD5, CRC32 and byte size.
 *
 * @param {string} filePath - File to hash.
 * @returns {Promise<{md5: string, crc32: string, sizeBytes: number}>}
 *   md5 is 32 lowercase hex chars; crc32 is 8 lowercase hex chars
 *   (zero-padded); sizeBytes is the exact byte count.
 * @throws When the file cannot be opened or read.
 */
async function computeHashes(filePath) {
  const md5 = createHash("md5");
  let crc = 0xFFFFFFFF;
  let size = 0;

  // Single pass: feed every chunk to MD5, the CRC accumulator and the
  // byte counter, then pass it through unchanged.
  const hasher = new Transform({
    transform(chunk, encoding, callback) {
      md5.update(chunk);
      size += chunk.length;
      for (let i = 0; i < chunk.length; i++) {
        crc = CRC32_TABLE[(crc ^ chunk[i]) & 0xFF] ^ (crc >>> 8);
      }
      callback(null, chunk);
    },
  });

  // Drain the transform's output; pipeline propagates read/stream errors.
  await pipeline(createReadStream(filePath), hasher, async function* (source) {
    for await (const _ of source) { /* discard */ }
  });

  // Final XOR, then >>> 0 to coerce to unsigned before hex formatting.
  const crc32Final = ((crc ^ 0xFFFFFFFF) >>> 0).toString(16).padStart(8, "0");
  return {
    md5: md5.digest("hex"),
    crc32: crc32Final,
    sizeBytes: size,
  };
}
|
|
|
|
// ---- Ensure software_hashes table exists ----

/**
 * Create the software_hashes table if absent (idempotent). Keyed by
 * download_id, with secondary indexes on md5 and crc32 for reverse lookups.
 */
async function ensureTable() {
  const ddl = `
    CREATE TABLE IF NOT EXISTS software_hashes (
      download_id INT NOT NULL PRIMARY KEY,
      md5 VARCHAR(32) NOT NULL,
      crc32 VARCHAR(8) NOT NULL,
      size_bytes BIGINT NOT NULL,
      inner_path VARCHAR(500) NOT NULL,
      updated_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
      INDEX idx_sh_md5 (md5),
      INDEX idx_sh_crc32 (crc32)
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
  `;
  await pool.query(ddl);
}
|
|
|
|
// ---- JSON export ----

/**
 * Dump the entire software_hashes table to SNAPSHOT_PATH as pretty-printed
 * JSON, written atomically (temp file + rename).
 *
 * @returns {Promise<number>} Number of rows exported.
 */
async function exportSnapshot() {
  const [rows] = await pool.query(
    "SELECT download_id, md5, crc32, size_bytes, inner_path, updated_at FROM software_hashes ORDER BY download_id"
  );

  const serialized = [];
  for (const r of rows) {
    serialized.push({
      download_id: r.download_id,
      md5: r.md5,
      crc32: r.crc32,
      size_bytes: Number(r.size_bytes), // BIGINT may not arrive as a JS number
      inner_path: r.inner_path,
      updated_at: r.updated_at instanceof Date ? r.updated_at.toISOString() : r.updated_at,
    });
  }

  const snapshot = {
    exportedAt: new Date().toISOString(),
    count: serialized.length,
    rows: serialized,
  };

  // Ensure the target directory exists, then write atomically.
  await fs.mkdir(path.dirname(SNAPSHOT_PATH), { recursive: true });
  const tmpPath = `${SNAPSHOT_PATH}.tmp`;
  await fs.writeFile(tmpPath, JSON.stringify(snapshot, null, 2), "utf8");
  await fs.rename(tmpPath, SNAPSHOT_PATH);

  logInfo(`Exported ${rows.length} rows to ${SNAPSHOT_PATH}`);
  return rows.length;
}
|
|
|
|
// ---- Main processing loop ----

// Shared with the SIGINT and fatal-error handlers so they can persist
// whatever progress main() has accumulated.
let currentState = null;

/**
 * Main driver: ensure the table exists, resolve the resume point, hash each
 * tape-image download present in the local mirror, upsert the result into
 * software_hashes, persist checkpoint state, and finally export the JSON
 * snapshot.
 */
async function main() {
  await ensureTable();

  // --export-only: dump the current table contents and exit without hashing.
  if (EXPORT_ONLY) {
    const count = await exportSnapshot();
    logInfo(`Export complete: ${count} rows.`);
    await pool.end();
    return;
  }

  // Determine start point.
  // Priority: explicit --start-from-id > saved state (unless --rebuild-all,
  // which ignores the state file and starts from 0).
  const prior = await loadState();
  let resumeFrom = CLI_START_FROM;
  if (!REBUILD_ALL && !CLI_START_FROM && prior?.lastProcessedId) {
    resumeFrom = prior.lastProcessedId + 1;
  }

  const startedAt = new Date().toISOString();
  currentState = {
    version: 1,
    startedAt,
    updatedAt: startedAt,
    startFromId: resumeFrom,
    // Carry forward the prior high-water mark until we process something.
    lastProcessedId: prior?.lastProcessedId ?? -1,
    processed: 0,
    hashed: 0,
    skipped: 0,
    errors: 0,
    error: undefined,
  };

  // Query tape-image downloads from the resume point onward.
  const placeholders = TAPE_FILETYPE_IDS.map(() => "?").join(", ");
  const [rows] = await pool.query(
    `SELECT id, file_link, file_size FROM downloads
     WHERE filetype_id IN (${placeholders}) AND id >= ?
     ORDER BY id ASC`,
    [...TAPE_FILETYPE_IDS, resumeFrom]
  );

  // Also get total count for progress display.
  const [totalRows] = await pool.query(
    `SELECT COUNT(*) as cnt FROM downloads WHERE filetype_id IN (${placeholders})`,
    TAPE_FILETYPE_IDS
  );
  const total = totalRows[0].cnt;

  logInfo(`Processing ${rows.length} tape-image downloads (total in DB: ${total}, starting from id >= ${resumeFrom})`);

  // Loop counters; checkpoint() syncs them into currentState periodically.
  let processed = 0;
  let hashed = 0;
  let skipped = 0;
  let errors = 0;

  for (const row of rows) {
    const { id, file_link: fileLink } = row;

    try {
      // Map the DB file_link to a path inside the local mirror.
      const localZip = toLocalPath(fileLink);
      if (!localZip) {
        logWarn(` [${id}] Unsupported file_link prefix: ${fileLink}`);
        errors++;
        processed++;
        currentState.lastProcessedId = id;
        continue;
      }

      // Check if zip exists locally.
      try {
        await fs.access(localZip);
      } catch {
        // Zip not synced yet — skip silently.
        skipped++;
        processed++;
        currentState.lastProcessedId = id;
        // Checkpoint even through long runs of skips so a restart resumes
        // past them instead of re-scanning.
        if (processed % 500 === 0) {
          await checkpoint();
        }
        continue;
      }

      // Check/create the extracted-contents directory next to the zip.
      const contentsDir = localZip + "_CONTENTS";
      let contentsExisted = false;
      try {
        await fs.access(contentsDir);
        contentsExisted = true;
      } catch {
        // Need to extract.
      }

      if (!contentsExisted) {
        try {
          await extractZipContents(localZip, contentsDir);
        } catch (err) {
          logWarn(` [${id}] Extract failed: ${err.message}`);
          errors++;
          processed++;
          currentState.lastProcessedId = id;
          continue;
        }
      }

      // Find the best tape file inside the extracted contents.
      const tapeFile = await findTapeFile(contentsDir);
      if (!tapeFile) {
        // No tape file found inside zip — unusual but not fatal.
        if (VERBOSE) logWarn(` [${id}] No tape file in ${contentsDir}`);
        skipped++;
        processed++;
        currentState.lastProcessedId = id;
        continue;
      }

      // Compute MD5/CRC32/size in a single streaming pass.
      const hashes = await computeHashes(tapeFile.path);

      // Relative path inside _CONTENTS for the inner_path column.
      const innerPath = path.relative(contentsDir, tapeFile.path);

      // Upsert: insert new rows, refresh existing ones in place.
      await pool.query(
        `INSERT INTO software_hashes (download_id, md5, crc32, size_bytes, inner_path, updated_at)
         VALUES (?, ?, ?, ?, ?, NOW())
         ON DUPLICATE KEY UPDATE
           md5 = VALUES(md5),
           crc32 = VALUES(crc32),
           size_bytes = VALUES(size_bytes),
           inner_path = VALUES(inner_path),
           updated_at = NOW()`,
        [id, hashes.md5, hashes.crc32, hashes.sizeBytes, innerPath]
      );

      hashed++;
      processed++;
      // Sync counters into the shared state so SIGINT captures fresh values.
      currentState.lastProcessedId = id;
      currentState.hashed = hashed;
      currentState.processed = processed;
      currentState.skipped = skipped;
      currentState.errors = errors;
      currentState.updatedAt = new Date().toISOString();

      if (processed % 100 === 0) {
        await checkpoint();
        logInfo(`... processed=${processed}/${rows.length}, hashed=${hashed}, skipped=${skipped}, errors=${errors}`);
      }
    } catch (err) {
      // Catch-all so one bad download never aborts the whole run.
      logError(` [${id}] Unexpected error: ${err.message}`);
      errors++;
      processed++;
      currentState.lastProcessedId = id;
      currentState.errors = errors;
    }
  }

  // Final state save.
  currentState.processed = processed;
  currentState.hashed = hashed;
  currentState.skipped = skipped;
  currentState.errors = errors;
  currentState.updatedAt = new Date().toISOString();
  await saveStateAtomic(currentState);

  logInfo(`\nProcessing complete: processed=${processed}, hashed=${hashed}, skipped=${skipped}, errors=${errors}`);

  // Export snapshot for DB wipe recovery.
  logInfo("\nExporting JSON snapshot...");
  await exportSnapshot();

  await pool.end();
  logInfo("Done.");

  // Function declaration is hoisted, so the loop above can call it even
  // though it is defined here. Syncs loop counters into currentState and
  // persists — best-effort: a failed write must not kill the run.
  async function checkpoint() {
    currentState.processed = processed;
    currentState.hashed = hashed;
    currentState.skipped = skipped;
    currentState.errors = errors;
    currentState.updatedAt = new Date().toISOString();
    try {
      await saveStateAtomic(currentState);
    } catch (e) {
      logError(`Failed to write state: ${e?.message || e}`);
    }
  }
}
|
|
|
|
// ---- Graceful shutdown ----
// On Ctrl-C, persist whatever progress main() has accumulated so a later run
// can resume from it, close the pool, then exit with the conventional
// 128 + SIGINT(2) = 130 status.
process.on("SIGINT", async () => {
  logWarn("\nInterrupted (SIGINT). Writing state...");
  try {
    // currentState is null until main() initializes it — nothing to save then.
    if (currentState) {
      currentState.updatedAt = new Date().toISOString();
      await saveStateAtomic(currentState);
      logWarn(`State saved at: ${STATE_FILE}`);
    }
  } catch (e) {
    logError(`Failed to write state on SIGINT: ${e?.message || e}`);
  }
  // Best-effort pool shutdown — we are exiting regardless.
  try { await pool.end(); } catch {}
  process.exit(130);
});
|
|
|
|
// Run
// Top-level failure handler: record the error in the state file (so the next
// run can see what went wrong), close the pool, and exit non-zero.
main().catch(async (err) => {
  logError(`Fatal error: ${err.message}\n${err.stack || "<no stack>"}`);
  try {
    // currentState may still be null if main() failed before initializing it.
    if (currentState) {
      currentState.updatedAt = new Date().toISOString();
      currentState.error = { message: err.message, stack: err.stack };
      await saveStateAtomic(currentState);
    }
  } catch (e) {
    logError(`Failed to write state on fatal: ${e?.message || e}`);
  }
  // Best-effort pool shutdown — we are exiting regardless.
  try { await pool.end(); } catch {}
  process.exit(1);
});
|