From 91022aa5da82c15daedf7b3dc81e299722133a89 Mon Sep 17 00:00:00 2001 From: Cadence Ember Date: Fri, 5 Nov 2021 17:01:46 +1300 Subject: [PATCH] Support loading shortcodes of a single image --- package-lock.json | 21 +++++++ package.json | 1 + src/lib/collectors.js | 91 +++++++++++++++++++++++++++-- src/lib/structures/TimelineChild.js | 4 +- src/lib/structures/TimelineEntry.js | 22 ++++--- src/site/pug/includes/post.pug | 9 +-- src/site/pug/post.pug | 10 ++-- 7 files changed, 136 insertions(+), 22 deletions(-) diff --git a/package-lock.json b/package-lock.json index 0edd8c2..fb58e54 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1107,6 +1107,11 @@ "picomatch": "^2.0.4" } }, + "apollojs": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/apollojs/-/apollojs-1.3.0.tgz", + "integrity": "sha1-X3sAME2XQOKnvltSx8CAfVH5JV4=" + }, "append-transform": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/append-transform/-/append-transform-2.0.0.tgz", @@ -1948,6 +1953,22 @@ "integrity": "sha512-f3qQ9oQy9j2AhBe/H9VC91wLmKBCCU/gDOnKNAYG5hswO7BLKj09Hc5HYNz9cGI++xlpDCIgDaitVs03ATR84Q==", "dev": true }, + "fast-html-parser": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/fast-html-parser/-/fast-html-parser-1.0.1.tgz", + "integrity": "sha1-TsyWg7i7ea/hGlCAe3hT55JWzqI=", + "requires": { + "apollojs": "^1.3.0", + "entities": "^1.1.1" + }, + "dependencies": { + "entities": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/entities/-/entities-1.1.2.tgz", + "integrity": "sha512-f2LZMYl1Fzu7YSBKg+RoROelpOaNrcGmE9AZubeDfrCEia483oW4MI4VyFd5VNHIgQ/7qm1I0wUHK1eJnn2y2w==" + } + } + }, "fast-json-stable-stringify": { "version": "2.1.0", "resolved": "https://registry.npmjs.org/fast-json-stable-stringify/-/fast-json-stable-stringify-2.1.0.tgz", diff --git a/package.json b/package.json index 6b9554d..fc476c7 100644 --- a/package.json +++ b/package.json @@ -18,6 +18,7 @@ "dependencies": { "better-sqlite3": "^7.4.4", "cookie": "^0.4.1", + "fast-html-parser": "^1.0.1", "feed": "git+https://git.sr.ht/~cadence/nodejs-feed#3dde82f8296d7a6f5659323e497e0c684f03ab71", "get-stream": "^6.0.1", "gm": "^1.23.1", diff --git a/src/lib/collectors.js b/src/lib/collectors.js index a1de5fb..a2c941e 100644 --- a/src/lib/collectors.js +++ b/src/lib/collectors.js @@ -4,6 +4,7 @@ const switcher = require("./utils/torswitcher") const {extractSharedData} = require("./utils/body") const {TtlCache, RequestCache, UserRequestCache} = require("./cache") const RequestHistory = require("./structures/RequestHistory") +const fhp = require("fast-html-parser") const db = require("./db") require("./testimports")(constants, request, extractSharedData, UserRequestCache, RequestHistory, db) @@ -398,6 +399,7 @@ async function getOrFetchShortcode(shortcode) { const {result, fromCache} = await fetchShortcodeData(shortcode) const entry = getOrCreateShortcode(shortcode) entry.applyN3(result) + entry.fullyUpdated = true // we already called fetchShortcodeData, which fetches the greatest amount of data possible. it's no use trying to fetch that again with .update(). return {post: entry, fromCache} } } @@ -412,11 +414,92 @@ function fetchShortcodeData(shortcode) { return switcher.request("post_graphql", `https://www.instagram.com/p/${shortcode}/embed/captioned/`, async res => { if (res.status === 429) throw constants.symbols.RATE_LIMITED }).then(res => res.text()).then(text => { - const textData = text.match(/window\.__additionalDataLoaded\('extra',(.*)\);<\/script>/)[1] - let data = JSON.parse(textData) + let data = null + const match = text.match(/window\.__additionalDataLoaded\('extra',(.*)\);<\/script>/) + if (match) { + const textData = match[1] + data = JSON.parse(textData) + } if (data == null) { - // the thing doesn't exist - throw constants.symbols.NOT_FOUND + // we have to actually parse the HTML to get the data + const root = fhp.parse(text) + + // Check if post really exists + if (root.querySelector(".EmbedIsBroken")) { + throw constants.symbols.NOT_FOUND + } + + // find embed + const e_embed = root.querySelector(".Embed") + // find avatar + const e_avatar = root.querySelector(".Avatar") + const e_avatarImage = e_avatar.querySelector("img") + // find username + const e_usernameText = root.querySelector(".UsernameText") + const e_viewProfile = root.querySelector(".ViewProfileButton") + // find verified + const e_verified = root.querySelector(".VerifiedSprite") + // find media + const e_media = root.querySelector(".EmbeddedMediaImage") + // find caption + const e_caption = root.querySelector(".Caption") + // extract owner + const owner = { + id: e_embed.attributes["data-owner-id"], + is_verified: !!e_verified, + profile_pic_url: e_avatarImage.attributes.src, + username: e_viewProfile.attributes.href.replace(new RegExp(`^https:\/\/www\.instagram\.com\/(${constants.external.username_regex}).*$`, "s"), "$1") + } + // extract media type + let mediaType = e_embed.attributes["data-media-type"] + const videoData = {} + if (mediaType === "GraphVideo") { + Object.assign(videoData, { + video_url: null, + video_view_count: null + }) + } else { + mediaType = "GraphImage" + } + // extract display resources + const display_resources = e_media.attributes.srcset.split(",").map(source => { + source = source.trim() + const [url, widthString] = source.split(" ") + const width = +widthString.match(/\d+/)[0] + return { + src: url, + config_width: width, + config_height: width // best guess! + } + }) + // extract caption text + const captionText = e_caption.childNodes.slice(4, -3).map(node => { // slice removes unneeded starting and ending whitespace and user handles + if (node.tagName === "br") { + return "\n" + } else { + return node.text + } + }).join("") + return { + __typename: mediaType, + id: e_embed.attributes["data-media-id"], + display_url: e_media.attributes.src, + display_resources, + is_video: mediaType === "GraphVideo", + shortcode, + accessibility_caption: e_media.attributes.alt, + ...videoData, + owner, + edge_media_to_caption: { + edges: [ + { + node: { + text: captionText + } + } + ] + } + } } else { data = data.shortcode_media history.report("post", true) diff --git a/src/lib/structures/TimelineChild.js b/src/lib/structures/TimelineChild.js index 2be2028..b3ef40c 100644 --- a/src/lib/structures/TimelineChild.js +++ b/src/lib/structures/TimelineChild.js @@ -5,10 +5,10 @@ const {compile} = require("pug") require("../testimports")(collectors) const rssImageTemplate = compile(` -img(src=constants.website_origin+entry.getDisplayUrlP() alt=entry.getAlt() width=entry.data.dimensions.width height=entry.data.dimensions.height) +img(src=constants.website_origin+entry.getDisplayUrlP() alt=entry.getAlt() width=entry.data.dimensions && entry.data.dimensions.width height=entry.data.dimensions && entry.data.dimensions.height) `) const rssVideoTemplate = compile(` -video(src=constants.website_origin+entry.getVideoUrlP() controls preload="auto" width=entry.data.dimensions.width height=entry.data.dimensions.height) +video(src=constants.website_origin+entry.getVideoUrlP() controls preload="auto" width=entry.data.dimensions && entry.data.dimensions.width height=entry.data.dimensions && entry.data.dimensions.height) `) class TimelineChild extends TimelineBaseMethods { diff --git a/src/lib/structures/TimelineEntry.js b/src/lib/structures/TimelineEntry.js index cea1835..8e1a46e 100644 --- a/src/lib/structures/TimelineEntry.js +++ b/src/lib/structures/TimelineEntry.js @@ -19,6 +19,7 @@ each child in children class TimelineEntry extends TimelineBaseMethods { constructor() { super() + this.fullyUpdated = false /** @type {import("../types").TimelineEntryAll} some properties may not be available yet! */ // @ts-ignore this.data = {} @@ -38,12 +39,16 @@ class TimelineEntry extends TimelineBaseMethods { } async update() { - return collectors.fetchShortcodeData(this.data.shortcode).then(data => { - this.applyN3(data.result) - }).catch(error => { - console.error("TimelineEntry could not self-update; trying to continue anyway...") - console.error("E:", error) - }) + if (!this.fullyUpdated) { + return collectors.fetchShortcodeData(this.data.shortcode).then(data => { + this.applyN3(data.result) + }).catch(error => { + console.error("TimelineEntry could not self-update; trying to continue anyway...") + console.error("E:", error) + }).finally(() => { + this.fullyUpdated = true + }) + } } /** @@ -88,6 +93,7 @@ class TimelineEntry extends TimelineBaseMethods { * All mutations should act exactly once and have no effect on already mutated data. */ fixData() { + this.hasDate = !!this.data.taken_at_timestamp this.date = new Date(this.data.taken_at_timestamp*1000) } @@ -237,7 +243,7 @@ class TimelineEntry extends TimelineBaseMethods { let fromCache = true const clone = await (async () => { // Do we just already have the extended owner? - if (this.data.owner.full_name) { // this property is on extended owner and not basic owner + if (this.data.owner.profile_pic_url) { // this property is on extended owner and not basic owner const clone = proxyExtendedOwner(this.data.owner) this.ownerPfpCacheP = clone.profile_pic_url return clone @@ -246,7 +252,7 @@ class TimelineEntry extends TimelineBaseMethods { else if (collectors.userRequestCache.getByID(this.data.owner.id)) { /** @type {import("./User")} */ const user = collectors.userRequestCache.getByID(this.data.owner.id) - if (user.data.full_name !== undefined) { + if (user.data.profile_pic_url !== undefined) { this.data.owner = { id: user.data.id, username: user.data.username, diff --git a/src/site/pug/includes/post.pug b/src/site/pug/includes/post.pug index 69e3acf..d76bcbc 100644 --- a/src/site/pug/includes/post.pug +++ b/src/site/pug/includes/post.pug @@ -38,15 +38,16 @@ mixin post(post, headerWithNavigation) - let caption = post.children[0].data.accessibility_caption if caption p.description= caption - p.description - span!= ll.pug_post_timestamp({post}) + if post.hasDate + p.description + span!= ll.pug_post_timestamp({post}) section.images-gallery for entry in post.children if entry.isVideo() - video(src=entry.getVideoUrlP() controls preload="auto" width=entry.data.dimensions.width height=entry.data.dimensions.height).sized-video + video(src=entry.getVideoUrlP() controls preload="auto" width=entry.data.dimensions && entry.data.dimensions.width height=entry.data.dimensions && entry.data.dimensions.height).sized-video else - img(src=entry.getDisplayUrlP() alt=entry.getAlt() width=entry.data.dimensions.width height=entry.data.dimensions.height).sized-image + img(src=entry.getDisplayUrlP() alt=entry.getAlt() width=entry.data.dimensions && entry.data.dimensions.width height=entry.data.dimensions && entry.data.dimensions.height).sized-image if willDisplayAltInGallery - let caption = entry.data.accessibility_caption if caption diff --git a/src/site/pug/post.pug b/src/site/pug/post.pug index 2d2efed..95e648d 100644 --- a/src/site/pug/post.pug +++ b/src/site/pug/post.pug @@ -33,13 +33,15 @@ html if firstEntry.isVideo() meta(property="og:video" content=`${website_origin}${firstEntry.getVideoUrlP()}`) meta(property="og:video:type" content="video/mp4") - meta(property="og:video:width" content=firstEntry.data.dimensions.width) - meta(property="og:video:height" content=firstEntry.data.dimensions.height) + if firstEntry.data.dimensions + meta(property="og:video:width" content=firstEntry.data.dimensions.width) + meta(property="og:video:height" content=firstEntry.data.dimensions.height) meta(property="og:video:alt" content=firstEntry.getAlt()) else meta(property="og:image" content=`${website_origin}${firstEntry.getDisplayUrlP()}`) - meta(property="og:image:width" content=firstEntry.data.dimensions.width) - meta(property="og:image:height" content=firstEntry.data.dimensions.height) + if firstEntry.data.dimensions + meta(property="og:image:width" content=firstEntry.data.dimensions.width) + meta(property="og:image:height" content=firstEntry.data.dimensions.height) meta(property="og:image:type" content="image/jpeg") meta(property="og:image:alt" content=firstEntry.getAlt()) meta(property="og:site_name" content="Bibliogram")