Add script to remove old video descriptions

This commit is contained in:
Cadence Ember 2024-04-22 18:24:29 +12:00
parent 55696a1b54
commit efcef9b540
2 changed files with 70 additions and 0 deletions

View File

@@ -16,5 +16,8 @@
     "mixin-deep": "^2.0.1",
     "node-fetch": "^2.6.6",
     "pinski": "git+https://git.sr.ht/~cadence/nodejs-pinski#9653807f309aee34c8c63ce4e6ee760cccbfdf0d"
+  },
+  "devDependencies": {
+    "cli-progress": "^3.12.0"
   }
 }

67
scripts/min-video-data.js Normal file
View File

@@ -0,0 +1,67 @@
const fs = require("fs")
const zlib = require("zlib")
const progress = require("cli-progress")
const {promisify} = require("util")
const {pipeline} = require("stream")
const pipe = promisify(pipeline)
const db = require("../utils/db")
// Cutoff as a Unix timestamp in seconds. A date-only ISO string is parsed as
// UTC midnight; the millisecond result is scaled down to seconds.
// Rows with `published` below this value are the ones selected by the queries in this script.
const cutoff = Date.parse("2023-01-01") / 1000
/**
 * Lazily iterate the result rows of a prepared statement, binding the
 * module-level `cutoff` as its single parameter.
 *
 * Nothing is executed until the first row is requested.
 * NOTE(review): `stmt` is presumably a better-sqlite3 statement, in which case
 * `raw()` makes each row an array of column values — confirm against ../utils/db.
 *
 * @param stmt prepared statement exposing `raw()` and `iterate()`
 * @yields each row produced by `stmt.raw().iterate(cutoff)`
 */
function* toRows(stmt) {
  const rows = stmt.raw().iterate(cutoff)
  for (const row of rows) {
    yield row
  }
}
/**
 * Script entry point.
 *
 * 1. Counts the videos published before `cutoff` and aborts if that would
 *    cover (almost) every video.
 * 2. Writes a gzipped JSON Lines backup of the affected rows' videoId and
 *    descriptionHtml to the current working directory.
 * 3. Only once the backup file has been fully written, nulls out
 *    descriptionHtml for those rows and vacuums the database.
 *
 * Any failure (failsafe, database error, stream error) is printed and the
 * process exit code is set to 1; a failure before step 3 leaves the
 * database untouched.
 */
(async () => {
  const countToMin = db.prepare("select count(*) from Videos where published < ?").pluck().get(cutoff)
  const countTotal = db.prepare("select count(*) from Videos").pluck().get()
  console.log("want to trim", countToMin, "out of", countTotal, "videos")
  // ensure that we're not trimming the entire content
  if (Math.abs(countTotal - countToMin) <= 10) {
    throw new Error("failsafe: not trimming everything")
  }
  // export
  const backupName = "video-descriptions-backup.jsonl.gz"
  console.log(`exporting a backup to ${backupName}...`)
  const contents = db.prepare("select videoId, descriptionHtml from Videos where published < ? order by author asc, published asc")
  await new Promise((resolve, reject) => {
    const rowsProgress = new progress.SingleBar({fps: 3}, progress.Presets.shades_classic)
    const gzipProgress = new progress.SingleBar({fps: 3}, progress.Presets.shades_classic)
    // write rows into gzip
    const gzip = zlib.createGzip()
    const dest = fs.createWriteStream(backupName)
    gzip.pipe(dest)
    rowsProgress.start(countToMin, 0)
    try {
      for (const row of toRows(contents)) {
        // one JSON document per line, as the .jsonl extension promises
        gzip.write(JSON.stringify(row) + "\n")
        rowsProgress.increment()
      }
    } finally {
      // stop the bar even if iterating the rows throws
      rowsProgress.stop()
    }
    gzip.end()
    // track gzip progress
    console.log(" compressing backup...")
    // Every row was queued synchronously above, so the amount still queued on
    // the gzip stream at this point is used as the progress total.
    const max = gzip.writableLength
    gzipProgress.start(max, 0)
    const interval = setInterval(() => {
      gzipProgress.update(max - gzip.writableLength)
    }, 100)
    let settled = false
    const settle = error => {
      if (settled) return
      settled = true
      clearInterval(interval)
      gzipProgress.stop()
      if (error) {
        // release both streams; the backup is incomplete and must not be trusted
        gzip.destroy()
        dest.destroy()
        reject(error)
      } else {
        resolve()
      }
    }
    dest.on("finish", () => settle())
    // a failed backup must reject so that the destructive update below never runs
    dest.on("error", settle)
    gzip.on("error", settle)
  })
  // do it!
  console.log("removing descriptions...")
  db.prepare("update videos set descriptionHtml = null where published < ?").run(cutoff)
  console.log("reclaiming disk space from database...")
  db.prepare("vacuum").run()
})().catch(error => {
  console.error(error)
  process.exitCode = 1
})