From ab58306ceec1e1f411ac9e4c2cf86a6cc9886196 Mon Sep 17 00:00:00 2001 From: Cadence Ember Date: Thu, 28 Jul 2022 23:31:05 +1200 Subject: [PATCH] Restore HTML request method with preload extractor --- src/lib/collectors.js | 77 ++++++++++++++++++++++++++++++++++++++-- src/lib/constants.js | 4 +-- src/lib/utils/body.js | 18 ++++++++++ src/lib/utils/request.js | 3 +- 4 files changed, 97 insertions(+), 5 deletions(-) diff --git a/src/lib/collectors.js b/src/lib/collectors.js index d6ac3be..c0e6342 100644 --- a/src/lib/collectors.js +++ b/src/lib/collectors.js @@ -1,12 +1,12 @@ const constants = require("./constants") const {request} = require("./utils/request") const switcher = require("./utils/torswitcher") -const {extractSharedData} = require("./utils/body") +const {extractPreloader} = require("./utils/body") const {TtlCache, RequestCache, UserRequestCache} = require("./cache") const RequestHistory = require("./structures/RequestHistory") const fhp = require("fast-html-parser") const db = require("./db") -require("./testimports")(constants, request, extractSharedData, UserRequestCache, RequestHistory, db) +require("./testimports")(constants, request, extractPreloader, UserRequestCache, RequestHistory, db) const requestCache = new RequestCache(constants.caching.resource_cache_time) /** @type {import("./cache").UserRequestCache} */ @@ -30,11 +30,84 @@ async function fetchUser(username, context) { let mode = constants.allow_user_from_reel if (mode === "iweb") { return fetchUserFromIWeb(username) + } else if (mode === "html") { + return fetchUserFromHTML(username) } throw new Error(`Your instance admin selected fetch mode ${mode}, which is now unsupported. 
Please use "iweb" instead (the default).`)
 }
 
+/**
+ * Fetch a profile by requesting its HTML feed page and reading the preloaded web_profile_info response.
+ * Statuses 301/302/429/404 are turned into constants.symbols errors; a recent blocked request rejects early.
+ * @param {string} username
+ * @returns {Promise<{user: import("./structures/User"), quotaUsed: number}>}
+ */
+function fetchUserFromHTML(username) {
+	const blockedCacheConfig = constants.caching.self_blocked_status.user_html
+	// The config object itself is always truthy, so the `enabled` flag has to be checked explicitly.
+	if (blockedCacheConfig && blockedCacheConfig.enabled && history.store.has("user")) {
+		const entry = history.store.get("user")
+		if (!entry.lastRequestSuccessful && Date.now() < entry.lastRequestAt + blockedCacheConfig.time) {
+			return Promise.reject(entry.kind || constants.symbols.RATE_LIMITED)
+		}
+	}
+	let quotaUsed = 0 // incremented inside the fetch callback handed to getOrFetch
+	return userRequestCache.getOrFetch("user/"+username, false, true, () => {
+		quotaUsed++
+		return switcher.request("user_html", `https://www.instagram.com/${username}/feed/`, async res => {
+			if (res.status === 301) throw constants.symbols.ENDPOINT_OVERRIDDEN
+			if (res.status === 302) throw constants.symbols.INSTAGRAM_DEMANDS_LOGIN
+			if (res.status === 429) throw constants.symbols.RATE_LIMITED
+			return res
+		}).then(async g => {
+			const res = await g.response()
+			if (res.status === 404) throw constants.symbols.NOT_FOUND
+			const text = await g.text()
+			// require down here or have to deal with require loop. require cache will take care of it anyway.
+			// User -> Timeline -> TimelineEntry -> collectors -/> User
+			const User = require("./structures/User")
+			// Entries without a `request` are skipped instead of crashing the lookup with a TypeError.
+			const profileInfoResponse = extractPreloader(text).find(x => x && x.request && x.request.url === "/api/v1/users/web_profile_info/")
+			if (!profileInfoResponse) {
+				throw new Error("No profile info in the preloader.")
+			}
+			const user = new User(JSON.parse(profileInfoResponse.result.response).data.user)
+			history.report("user", true)
+			// Store the profile row; `created` is carried over only from a row written by the current database version.
+			if (constants.caching.db_user_id) {
+				const existing = db.prepare("SELECT created, updated_version FROM Users WHERE username = ?").get(user.data.username)
+				db.prepare(
+					"REPLACE INTO Users (username, user_id, created, updated, updated_version, biography, post_count, following_count, followed_by_count, external_url, full_name, is_private, is_verified, profile_pic_url) VALUES "
+					+"(@username, @user_id, @created, @updated, @updated_version, @biography, @post_count, @following_count, @followed_by_count, @external_url, @full_name, @is_private, @is_verified, @profile_pic_url)"
+				).run({
+					username: user.data.username,
+					user_id: user.data.id,
+					created: existing && existing.updated_version === constants.database_version ? existing.created : Date.now(),
+					updated: Date.now(),
+					updated_version: constants.database_version,
+					biography: user.data.biography || null,
+					post_count: user.posts || 0,
+					following_count: user.following || 0,
+					followed_by_count: user.followedBy || 0,
+					external_url: user.data.external_url || null,
+					full_name: user.data.full_name || null,
+					is_private: +user.data.is_private,
+					is_verified: +user.data.is_verified,
+					profile_pic_url: user.data.profile_pic_url
+				})
+			}
+			return user
+		}).catch(error => {
+			// Only a login demand or a rate limit is recorded as a failed request; every error is rethrown.
+			if (error === constants.symbols.INSTAGRAM_DEMANDS_LOGIN || error === constants.symbols.RATE_LIMITED) {
+				history.report("user", false, error)
+			}
+			throw error
+		})
+	}).then(user => ({user, quotaUsed}))
+}
+
 /**
  * @param {string} username
  * @returns {Promise<{user: import("./structures/User"), quotaUsed: number}>}
diff --git a/src/lib/constants.js b/src/lib/constants.js
index 7c72cc9..5675713 100644
--- a/src/lib/constants.js
+++ b/src/lib/constants.js
@@ -41,7 +41,7 @@ let constants = {
 	// change this to `true` to serve it, which will make extensions like Privacy Badger automatically whitelist the domain.
 	does_not_track: false,
 
-	allow_user_from_reel: "iweb", // legacy. this must be "iweb" now.
+	allow_user_from_reel: "html", // "iweb" or "html", whichever one works for you
 	proxy_media: { // Whether to proxy media (images, videos, thumbnails) through Bibliogram. This is strongly recommended to protect user privacy. If proxy is turned off, some browser content blockers may break all images since they are served from Facebook domains.
image: true, video: true,
@@ -223,7 +223,7 @@ let constants = {
 		csrf_time: 60*60*1000,
 		self_blocked_status: {
 			user_html: {
-				enabled: true,
+				enabled: false, // enable this if you're using iweb method AND a high traffic instance
 				time: 60*60*1000
 			},
 		},
diff --git a/src/lib/utils/body.js b/src/lib/utils/body.js
index 8ff671c..819057c 100644
--- a/src/lib/utils/body.js
+++ b/src/lib/utils/body.js
@@ -29,6 +29,23 @@ function extractSharedData(text) {
 	return {status: constants.symbols.extractor_results.SUCCESS, value: sharedData}
 }
 
+/**
+ * Parse the `{"complete":` object that follows each PolarisQueryPreloaderCache marker in the text.
+ * @param {string} text
+ * @returns {any} array of the parsed objects (empty when no marker matches)
+ */
+function extractPreloader(text) {
+	const cursor = new Parser(text)
+	const found = []
+	for (;;) {
+		if (cursor.seek('{"require":[["PolarisQueryPreloaderCache"', {moveToMatch: true, useEnd: true}) === -1) break
+		if (cursor.seek('{"complete":', {moveToMatch: true, useEnd: false}) === -1) continue
+		// "}}" is appended before parsing; presumably it closes what the status_code split cut off
+		found.push(JSON.parse(cursor.get({split: ',"status_code":'}) + "}}"))
+	}
+	return found
+}
+
 /**
  * @param {string} text
  */
@@ -45,3 +62,4 @@ function getRestrictedAge(text) {
 }
 
 module.exports.extractSharedData = extractSharedData
+module.exports.extractPreloader = extractPreloader
diff --git a/src/lib/utils/request.js b/src/lib/utils/request.js
index 36e52b1..3f8c3cd 100644
--- a/src/lib/utils/request.js
+++ b/src/lib/utils/request.js
@@ -8,7 +8,8 @@ const userAgent = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:102.0) Gecko/20100
 
 const headers = {
 	"User-Agent": userAgent,
-	"X-IG-App-ID": 936619743392459 // needed for profile iweb to work
+	"X-IG-App-ID": "936619743392459", // needed for profile iweb to work
+	"Sec-Fetch-Mode": "navigate", // needed for profile html to work
 }
 
 const backendStatusLineMap = new Map([