From efcef9b5402d96afb396a6af9f69007955a24668 Mon Sep 17 00:00:00 2001
From: Cadence Ember
Date: Mon, 22 Apr 2024 18:24:29 +1200
Subject: [PATCH] Add script to remove old video descriptions

---
Review notes: line breaks and indentation in this patch were reconstructed.
The package.json context is assumed to use two-space indentation - confirm, or
apply with `git apply --ignore-whitespace`. The script below was edited during
review, so the commit id above and the blob id on its index line are stale.

 package.json              |  3 ++
 scripts/min-video-data.js | 94 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 97 insertions(+)
 create mode 100644 scripts/min-video-data.js

diff --git a/package.json b/package.json
index 7446bd3..cf54037 100644
--- a/package.json
+++ b/package.json
@@ -16,5 +16,8 @@
     "mixin-deep": "^2.0.1",
     "node-fetch": "^2.6.6",
     "pinski": "git+https://git.sr.ht/~cadence/nodejs-pinski#9653807f309aee34c8c63ce4e6ee760cccbfdf0d"
+  },
+  "devDependencies": {
+    "cli-progress": "^3.12.0"
   }
 }
diff --git a/scripts/min-video-data.js b/scripts/min-video-data.js
new file mode 100644
index 0000000..ad81806
--- /dev/null
+++ b/scripts/min-video-data.js
@@ -0,0 +1,94 @@
+// One-off maintenance script: backs up and then clears the descriptionHtml of every
+// video published before the cutoff, then vacuums the database.
+// The backup is gzipped JSON Lines: one JSON-encoded row per line.
+
+const fs = require("fs")
+const zlib = require("zlib")
+const progress = require("cli-progress")
+const {promisify} = require("util")
+const {pipeline} = require("stream")
+const pipe = promisify(pipeline)
+
+const db = require("../utils/db")
+
+// unix timestamp in seconds; rows with published < cutoff are backed up and cleared
+const cutoff = new Date("2023-01-01").getTime() / 1000
+
+/**
+ * Lazily yield the rows of a prepared statement, binding cutoff as its only parameter.
+ * raw() is presumably better-sqlite3's raw mode (rows as arrays of column values) - confirm against ../utils/db.
+ */
+function* toRows(stmt) {
+	yield* stmt.raw().iterate(cutoff);
+}
+
+(async () => {
+	const countToMin = db.prepare("select count(*) from Videos where published < ?").pluck().get(cutoff)
+	const countTotal = db.prepare("select count(*) from Videos").pluck().get()
+	console.log("want to trim", countToMin, "out of", countTotal, "videos");
+
+	// ensure that we're not trimming the entire content
+	if (Math.abs(countTotal - countToMin) <= 10) {
+		throw new Error("failsafe: not trimming everything")
+	}
+
+	// export
+	const backupName = "video-descriptions-backup.jsonl.gz"
+	// running twice would otherwise replace the backup with the already-cleared rows
+	if (fs.existsSync(backupName)) {
+		throw new Error(`failsafe: ${backupName} already exists, not overwriting it`)
+	}
+	console.log(`exporting a backup to ${backupName}...`)
+	const contents = db.prepare("select videoId, descriptionHtml from Videos where published < ? order by author asc, published asc")
+
+	await new Promise((resolve, reject) => {
+		const rowsProgress = new progress.SingleBar({fps: 3}, progress.Presets.shades_classic)
+		const gzipProgress = new progress.SingleBar({fps: 3}, progress.Presets.shades_classic)
+
+		// write rows into gzip
+		const gzip = zlib.createGzip()
+		// "wx": fail rather than overwrite if the file appears after the check above
+		const dest = fs.createWriteStream(backupName, {flags: "wx"})
+		gzip.pipe(dest)
+		rowsProgress.start(countToMin, 0)
+		for (const row of toRows(contents)) {
+			// newline-terminate each row so the output really is JSON Lines
+			gzip.write(JSON.stringify(row) + "\n")
+			rowsProgress.increment()
+		}
+		gzip.end()
+		rowsProgress.stop()
+
+		// track gzip progress
+		console.log(" compressing backup...")
+		const max = gzip.writableLength
+		gzipProgress.start(max, 0)
+		const interval = setInterval(() => {
+			gzipProgress.update(max - gzip.writableLength)
+		}, 100)
+		const settle = (err) => {
+			clearInterval(interval)
+			gzipProgress.stop()
+			if (err) {
+				reject(err)
+			} else {
+				resolve()
+			}
+		}
+		// stream failures reject, so the update below never runs after a failed backup
+		gzip.on("error", settle)
+		dest.on("error", settle)
+		dest.on("finish", () => settle())
+	})
+
+	// do it!
+	console.log("removing descriptions...")
+	db.prepare("update videos set descriptionHtml = null where published < ?").run(cutoff)
+
+	console.log("reclaiming disk space from database...")
+	db.prepare("vacuum").run()
+})().catch((err) => {
+	// report the failure and exit non-zero instead of leaving a floating rejection
+	console.error(err)
+	process.exitCode = 1
+})