bibliogram/src/lib/collectors.js

533 lines
21 KiB
JavaScript

const constants = require("./constants")
const {request} = require("./utils/request")
const switcher = require("./utils/torswitcher")
const {selectExtractor} = require("./utils/body")
const {TtlCache, RequestCache, UserRequestCache} = require("./cache")
const RequestHistory = require("./structures/RequestHistory")
const fhp = require("fast-html-parser")
const db = require("./db")
require("./testimports")(constants, request, selectExtractor, UserRequestCache, RequestHistory, db)
const requestCache = new RequestCache(constants.caching.resource_cache_time)
/** @type {import("./cache").UserRequestCache<import("./structures/User")|import("./structures/ReelUser")>} */
const userRequestCache = new UserRequestCache(constants.caching.resource_cache_time)
/** @type {import("./cache").TtlCache<import("./structures/TimelineEntry")>} */
const timelineEntryCache = new TtlCache(constants.caching.resource_cache_time)
const history = new RequestHistory(["user", "timeline", "igtv", "post", "reel"])
const AssistantSwitcher = require("./structures/AssistantSwitcher")
const assistantSwitcher = new AssistantSwitcher()
/**
* @param {string} username
* @param {symbol} [context]
*/
async function fetchUser(username, context) {
if (constants.external.reserved_paths.includes(username)) {
throw constants.symbols.ENDPOINT_OVERRIDDEN
}
let mode = constants.allow_user_from_reel
if (mode === "iweb") {
return fetchUserFromIWeb(username)
} else if (mode === "html") {
return fetchUserFromHTML(username)
}
throw new Error(`Your instance admin selected fetch mode ${mode}, which is now unsupported. Please ask them to use the default fetch mode by omitting that setting.`)
}
/**
* @param {string} username
* @returns {Promise<{user: import("./structures/User"), quotaUsed: number}>}
*/
function fetchUserFromHTML(username) {
const blockedCacheConfig = constants.caching.self_blocked_status.user_html
if (blockedCacheConfig) {
if (history.store.has("user")) {
const entry = history.store.get("user")
if (!entry.lastRequestSuccessful && Date.now() < entry.lastRequestAt + blockedCacheConfig.time) {
return Promise.reject(entry.kind || constants.symbols.RATE_LIMITED)
}
}
}
let quotaUsed = 0
return userRequestCache.getOrFetch("user/"+username, false, true, () => {
quotaUsed++
return switcher.request("user_html", `https://www.instagram.com/${username}/feed/`, async res => {
if (res.status === 301) throw constants.symbols.ENDPOINT_OVERRIDDEN
if (res.status === 302) throw constants.symbols.INSTAGRAM_DEMANDS_LOGIN
if (res.status === 429) throw constants.symbols.RATE_LIMITED
return res
}).then(async g => {
const res = await g.response()
if (res.status === 404) {
throw constants.symbols.NOT_FOUND
} else {
const text = await g.text()
// require down here or have to deal with require loop. require cache will take care of it anyway.
// User -> Timeline -> TimelineEntry -> collectors -/> User
const User = require("./structures/User")
const userData = selectExtractor(text)
const user = new User(userData)
history.report("user", true)
if (constants.caching.db_user_id) {
const existing = db.prepare("SELECT created, updated_version FROM Users WHERE username = ?").get(user.data.username)
db.prepare(
"REPLACE INTO Users (username, user_id, created, updated, updated_version, biography, post_count, following_count, followed_by_count, external_url, full_name, is_private, is_verified, profile_pic_url) VALUES "
+"(@username, @user_id, @created, @updated, @updated_version, @biography, @post_count, @following_count, @followed_by_count, @external_url, @full_name, @is_private, @is_verified, @profile_pic_url)"
).run({
username: user.data.username,
user_id: user.data.id,
created: existing && existing.updated_version === constants.database_version ? existing.created : Date.now(),
updated: Date.now(),
updated_version: constants.database_version,
biography: user.data.biography || null,
post_count: user.posts || 0,
following_count: user.following || 0,
followed_by_count: user.followedBy || 0,
external_url: user.data.external_url || null,
full_name: user.data.full_name || null,
is_private: +user.data.is_private,
is_verified: +user.data.is_verified,
profile_pic_url: user.data.profile_pic_url
})
}
return user
}
}).catch(error => {
if (error === constants.symbols.INSTAGRAM_DEMANDS_LOGIN || error === constants.symbols.RATE_LIMITED) {
history.report("user", false, error)
}
throw error
})
}).then(user => ({user, quotaUsed}))
}
/**
* @param {string} username
* @returns {Promise<{user: import("./structures/User"), quotaUsed: number}>}
*/
function fetchUserFromIWeb(username) {
const blockedCacheConfig = constants.caching.self_blocked_status.user_html
if (blockedCacheConfig) {
if (history.store.has("user")) {
const entry = history.store.get("user")
if (!entry.lastRequestSuccessful && Date.now() < entry.lastRequestAt + blockedCacheConfig.time) {
return Promise.reject(entry.kind || constants.symbols.RATE_LIMITED)
}
}
}
let quotaUsed = 0
return userRequestCache.getOrFetch("user/"+username, false, true, () => {
quotaUsed++
const params = new URLSearchParams({username})
return switcher.request("user_html", `https://i.instagram.com/api/v1/users/web_profile_info/?${params}`, async res => {
if (res.status === 301) throw constants.symbols.ENDPOINT_OVERRIDDEN
if (res.status === 302) throw constants.symbols.INSTAGRAM_DEMANDS_LOGIN
if (res.status === 429) throw constants.symbols.RATE_LIMITED
return res
}).then(async g => {
const res = await g.response()
const json = await g.json()
// require down here or have to deal with require loop. require cache will take care of it anyway.
// User -> Timeline -> TimelineEntry -> collectors -/> User
const User = require("./structures/User")
const user = new User(json.data.user)
history.report("user", true)
// sure, cache the user info. why not.
if (constants.caching.db_user_id) {
const existing = db.prepare("SELECT created, updated_version FROM Users WHERE username = ?").get(user.data.username)
db.prepare(
"REPLACE INTO Users (username, user_id, created, updated, updated_version, biography, post_count, following_count, followed_by_count, external_url, full_name, is_private, is_verified, profile_pic_url) VALUES "
+"(@username, @user_id, @created, @updated, @updated_version, @biography, @post_count, @following_count, @followed_by_count, @external_url, @full_name, @is_private, @is_verified, @profile_pic_url)"
).run({
username: user.data.username,
user_id: user.data.id,
created: existing && existing.updated_version === constants.database_version ? existing.created : Date.now(),
updated: Date.now(),
updated_version: constants.database_version,
biography: user.data.biography || null,
post_count: user.posts || 0,
following_count: user.following || 0,
followed_by_count: user.followedBy || 0,
external_url: user.data.external_url || null,
full_name: user.data.full_name || null,
is_private: +user.data.is_private,
is_verified: +user.data.is_verified,
profile_pic_url: user.data.profile_pic_url
})
}
return user
}).catch(error => {
if (error === constants.symbols.INSTAGRAM_DEMANDS_LOGIN || error === constants.symbols.RATE_LIMITED) {
history.report("user", false, error)
}
throw error
})
}).then(user => ({user, quotaUsed}))
}
/**
* @param {string} userID
*/
function updateProfilePictureFromReel(userID) {
const p = new URLSearchParams()
p.set("query_hash", constants.external.reel_query_hash)
p.set("variables", JSON.stringify({
user_id: userID,
include_reel: true
}))
return switcher.request("reel_graphql", `https://www.instagram.com/graphql/query/?${p.toString()}`, async res => {
if (res.status === 429) throw constants.symbols.RATE_LIMITED
return res
}).then(res => res.json()).then(root => {
const result = root.data.user
if (!result) throw constants.symbols.NOT_FOUND
const profilePicURL = result.reel.user.profile_pic_url
if (!profilePicURL) throw constants.symbols.NOT_FOUND
db.prepare("UPDATE Users SET profile_pic_url = ? WHERE user_id = ?").run(profilePicURL, userID)
for (const entry of userRequestCache.cache.values()) {
// yes, data.data is correct.
if (entry.data && entry.data.data && entry.data.data.id === userID) {
entry.data.data.profile_pic_url = profilePicURL
entry.data.computeProxyProfilePic()
break // stop checking entries from the cache since we won't find any more
}
}
return profilePicURL
}).catch(error => {
throw error
})
}
/**
* @param {string} userID
* @param {string} username
* @returns {Promise<{user: import("./structures/ReelUser")|import("./structures/User"), quotaUsed: number}>}
*/
function fetchUserFromCombined(userID, username) {
// Fetch basic user information
const p = new URLSearchParams()
p.set("query_hash", constants.external.reel_query_hash)
p.set("variables", JSON.stringify({
user_id: userID,
include_reel: true
}))
return userRequestCache.getOrFetch("user/"+username, true, false, () => {
return switcher.request("reel_graphql", `https://www.instagram.com/graphql/query/?${p.toString()}`, async res => {
if (res.status === 429) throw constants.symbols.RATE_LIMITED
return res
}).then(res => res.json()).then(root => {
const result = root.data.user
if (!result) {
// user ID doesn't exist.
db.prepare("DELETE FROM Users WHERE user_id = ?").run(userID) // deleting the entry makes sense to me; the username might be claimed by somebody else later
throw constants.symbols.NOT_FOUND // this should cascade down and show the user not found page
}
// require down here or have to deal with require loop. require cache will take care of it anyway.
// ReelUser -> Timeline -> TimelineEntry -> collectors -/> User
const ReelUser = require("./structures/ReelUser")
const user = new ReelUser(result.reel.user)
history.report("reel", true)
return user
})
}).then(async user => {
// Add first timeline page
let quotaUsed = 0
if (!user.timeline.pages[0]) {
const fetched = await fetchTimelinePage(userID, "")
if (!fetched.fromCache) quotaUsed++
user.timeline.addPage(fetched.result)
}
return {user, quotaUsed}
}).catch(error => {
if (error === constants.symbols.RATE_LIMITED) {
history.report("reel", false, error)
}
throw error
})
}
/**
* @param {string} userID
* @param {string} after
* @returns {Promise<{result: import("./types").PagedEdges<import("./types").TimelineEntryN2>, fromCache: boolean}>}
*/
function fetchTimelinePage(userID, after) {
const blockedCacheConfig = constants.caching.self_blocked_status.timeline_graphql
if (blockedCacheConfig) {
if (history.store.has("timeline")) {
const entry = history.store.get("timeline")
if (!entry.lastRequestSuccessful && Date.now() < entry.lastRequestAt + blockedCacheConfig.time) {
return Promise.reject(entry.kind || constants.symbols.RATE_LIMITED)
}
}
}
const p = new URLSearchParams()
p.set("query_hash", constants.external.timeline_query_hash)
p.set("variables", JSON.stringify({
id: userID,
first: constants.external.timeline_fetch_first,
after: after
}))
return requestCache.getOrFetchPromise(`page/${userID}/${after}`, () => {
return switcher.request("timeline_graphql", `https://www.instagram.com/graphql/query/?${p.toString()}`, async res => {
if (res.status === 302) throw constants.symbols.INSTAGRAM_BLOCK_TYPE_DECEMBER
if (res.status === 429) throw constants.symbols.RATE_LIMITED
}).then(g => g.json()).then(root => {
if (root.data.user === null) {
// user ID doesn't exist.
db.prepare("DELETE FROM Users WHERE user_id = ?").run(userID) // deleting the entry makes sense to me; the username might be claimed by somebody else later
requestCache
throw constants.symbols.NOT_FOUND // this should cascade down and show the user not found page
}
/** @type {import("./types").PagedEdges<import("./types").TimelineEntryN2>} */
const timeline = root.data.user.edge_owner_to_timeline_media
history.report("timeline", true)
return timeline
}).catch(error => {
if (error === constants.symbols.RATE_LIMITED || error === constants.symbols.INSTAGRAM_BLOCK_TYPE_DECEMBER) {
history.report("timeline", false, error)
}
throw error
})
})
}
/**
* @param {string} userID
* @param {string} after
* @returns {Promise<{result: import("./types").PagedEdges<import("./types").TimelineEntryN2>, fromCache: boolean}>}
*/
function fetchIGTVPage(userID, after) {
const p = new URLSearchParams()
p.set("query_hash", constants.external.igtv_query_hash)
p.set("variables", JSON.stringify({
id: userID,
first: constants.external.igtv_fetch_first,
after: after
}))
return requestCache.getOrFetchPromise(`igtv/${userID}/${after}`, () => {
// assuming this uses the same bucket as timeline, which may not be the case
return switcher.request("timeline_graphql", `https://www.instagram.com/graphql/query/?${p.toString()}`, async res => {
if (res.status === 302) throw constants.symbols.INSTAGRAM_BLOCK_TYPE_DECEMBER
if (res.status === 429) throw constants.symbols.RATE_LIMITED
}).then(g => g.json()).then(root => {
/** @type {import("./types").PagedEdges<import("./types").TimelineEntryN2>} */
const timeline = root.data.user.edge_felix_video_timeline
history.report("igtv", true)
return timeline
}).catch(error => {
if (error === constants.symbols.RATE_LIMITED || error === constants.symbols.INSTAGRAM_BLOCK_TYPE_DECEMBER) {
history.report("igtv", false, error)
}
throw error
})
})
}
/**
* @param {string} userID
* @param {string} username
* @returns {Promise<{result: boolean, fromCache: boolean}>}
*/
function verifyUserPair(userID, username) {
// Fetch basic user information
const p = new URLSearchParams()
p.set("query_hash", constants.external.reel_query_hash)
p.set("variables", JSON.stringify({
user_id: userID,
include_reel: true
}))
return requestCache.getOrFetchPromise("userID/"+userID, () => {
return switcher.request("reel_graphql", `https://www.instagram.com/graphql/query/?${p.toString()}`, async res => {
if (res.status === 302) throw constants.symbols.INSTAGRAM_BLOCK_TYPE_DECEMBER
if (res.status === 429) throw constants.symbols.RATE_LIMITED
return res
}).then(res => res.json()).then(root => {
let user = root.data.user
if (!user) throw constants.symbols.NOT_FOUND
user = user.reel.user
history.report("reel", true)
return user.id === userID && user.username === username
}).catch(error => {
throw error
})
})
}
/**
* @param {string} shortcode
* @returns {import("./structures/TimelineEntry")}
*/
function getOrCreateShortcode(shortcode) {
if (timelineEntryCache.has(shortcode)) {
return timelineEntryCache.get(shortcode)
} else {
// require down here or have to deal with require loop. require cache will take care of it anyway.
// TimelineEntry -> collectors -/> TimelineEntry
const TimelineEntry = require("./structures/TimelineEntry")
const result = new TimelineEntry()
timelineEntryCache.set(shortcode, result)
return result
}
}
async function getOrFetchShortcode(shortcode) {
if (timelineEntryCache.has(shortcode)) {
return {post: timelineEntryCache.get(shortcode), fromCache: true}
} else {
const {result, fromCache} = await fetchShortcodeData(shortcode)
const entry = getOrCreateShortcode(shortcode)
entry.applyN3(result)
entry.fullyUpdated = true // we already called fetchShortcodeData, which fetches the greatest amount of data possible. it's no use trying to fetch that again with .update().
return {post: entry, fromCache}
}
}
/**
* @param {string} shortcode
* @returns {Promise<{result: import("./types").TimelineEntryN3, fromCache: boolean}>}
*/
function fetchShortcodeData(shortcode) {
// embed endpoint unfortunately only returns a single image, or a single video thumbnail
return requestCache.getOrFetchPromise("shortcode/"+shortcode, () => {
return switcher.request("post_graphql", `https://www.instagram.com/p/${shortcode}/embed/captioned/`, async res => {
if (res.status === 429) throw constants.symbols.RATE_LIMITED
}).then(res => res.text()).then(text => {
let data = null
const match = text.match(/window\.__additionalDataLoaded\('extra',(.*)\);<\/script>/)
if (match) {
const textData = match[1]
data = JSON.parse(textData)
}
if (data == null) {
// we have to actually parse the HTML to get the data
const root = fhp.parse(text)
// Check if post really exists
if (root.querySelector(".EmbedIsBroken")) {
throw constants.symbols.NOT_FOUND
}
// find embed
const e_embed = root.querySelector(".Embed")
// rate limited? if so, the request to instagram took 5-10 seconds, and returned no other content after <body style="background: white">
// so in that case there will be no useful elements, and no .Embed element
if (!e_embed) {
throw constants.symbols.RATE_LIMITED
}
// find avatar
const e_avatar = root.querySelector(".Avatar")
const e_avatarImage = e_avatar.querySelector("img")
// find username
const e_usernameText = root.querySelector(".UsernameText")
const e_viewProfile = root.querySelector(".ViewProfileButton")
// find verified
const e_verified = root.querySelector(".VerifiedSprite")
// find media
const e_media = root.querySelector(".EmbeddedMediaImage")
// find caption
const e_caption = root.querySelector(".Caption")
// extract owner
const owner = {
id: e_embed.attributes["data-owner-id"],
is_verified: !!e_verified,
profile_pic_url: e_avatarImage.attributes.src,
username: e_viewProfile.attributes.href.replace(new RegExp(`^https:\/\/www\.instagram\.com\/(${constants.external.username_regex}).*$`, "s"), "$1")
}
// extract media type
let mediaType = e_embed.attributes["data-media-type"]
const videoData = {}
if (mediaType === "GraphVideo") {
Object.assign(videoData, {
video_url: null,
video_view_count: null
})
} else {
mediaType = "GraphImage"
}
// extract display resources
const display_resources = e_media.attributes.srcset.split(",").map(source => {
source = source.trim()
const [url, widthString] = source.split(" ")
const width = +widthString.match(/\d+/)[0]
return {
src: url,
config_width: width,
config_height: width // best guess!
}
})
// extract caption text
let captionText = ""
if (e_caption) {
captionText = e_caption.childNodes.slice(4, -3).map(node => { // slice removes unneeded starting and ending whitespace and user handles
if (node.tagName === "br") {
return "\n"
} else {
return node.text
}
}).join("")
}
return {
__typename: mediaType,
id: e_embed.attributes["data-media-id"],
display_url: e_media.attributes.src,
display_resources,
is_video: mediaType === "GraphVideo",
shortcode,
accessibility_caption: e_media.attributes.alt,
...videoData,
owner,
edge_media_to_caption: {
edges: [
{
node: {
text: captionText
}
}
]
}
}
} else {
data = data.shortcode_media
history.report("post", true)
if (constants.caching.db_post_n3) {
db.prepare("REPLACE INTO Posts (shortcode, id, id_as_numeric, username, json) VALUES (@shortcode, @id, @id_as_numeric, @username, @json)")
.run({shortcode: data.shortcode, id: data.id, id_as_numeric: data.id, username: data.owner.username, json: JSON.stringify(data)})
}
// if we have the owner but only a reelUser, update it. this code is gross.
if (userRequestCache.hasNotPromise("user/"+data.owner.username)) {
const user = userRequestCache.getWithoutClean("user/"+data.owner.username)
if (user.fromReel) {
user.data.full_name = data.owner.full_name
user.data.is_verified = data.owner.is_verified
}
}
return data
}
}).catch(error => {
if (error === constants.symbols.RATE_LIMITED) {
history.report("post", false, error)
}
throw error
})
})
}
module.exports.fetchUser = fetchUser
module.exports.fetchTimelinePage = fetchTimelinePage
module.exports.fetchIGTVPage = fetchIGTVPage
module.exports.getOrCreateShortcode = getOrCreateShortcode
module.exports.fetchShortcodeData = fetchShortcodeData
module.exports.requestCache = requestCache
module.exports.userRequestCache = userRequestCache
module.exports.timelineEntryCache = timelineEntryCache
module.exports.getOrFetchShortcode = getOrFetchShortcode
module.exports.updateProfilePictureFromReel = updateProfilePictureFromReel
module.exports.history = history
module.exports.assistantSwitcher = assistantSwitcher
module.exports.verifyUserPair = verifyUserPair