From e3854a60505fc960793cac2f8c9bb4bbea05de48 Mon Sep 17 00:00:00 2001
From: Cadence Ember
Date: Thu, 4 Nov 2021 02:01:52 +1300
Subject: [PATCH] Extract fact check notices to second__clarification

---
Review notes (text between the "---" line and the first "diff" line is not
part of the commit message):
- Line breaks and indentation of this patch were reconstructed; tab
  indentation and nesting depth are assumed. If the context does not match,
  apply with `git apply --ignore-whitespace` and check the indentation of the
  added lines.
- deep_get returns a value that is present but falsy (0, "", [], False) as-is
  instead of turning it into None, and returns None instead of raising
  TypeError when the path runs through something that is not a dict or list.
- get_clarification is evaluated once per section instead of twice.
- The blob ids on the "index" lines are those of the original commit and do
  not describe the revised content.

 extractors/video.py |  8 +++++++-
 tools/extractors.py | 29 +++++++++++++++++++++++++++++
 2 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/extractors/video.py b/extractors/video.py
index c487cb5..3e0ab47 100644
--- a/extractors/video.py
+++ b/extractors/video.py
@@ -7,7 +7,7 @@ import traceback
 import yt_dlp
 import urllib.error
 from tools.converters import *
-from tools.extractors import extract_yt_initial_data, extract_yt_initial_player_response
+from tools.extractors import extract_yt_initial_data, extract_yt_initial_player_response, deep_get
 import tools.files as files
 from math import floor
 from urllib.parse import parse_qs, urlparse, urlencode
@@ -317,6 +317,12 @@ def get_more_stuff_from_file(id, result):
 							"second__remoteUrl": url
 						})
 
+				# fact check notices! aka "clarifications".
+				# for now, we just return the data as-is for the renderer to deal with (or not).
+				def get_clarification(section):
+					return deep_get(section, ["itemSectionRenderer", "contents", 0, "clarificationRenderer"])
+				result["second__clarification"] = next(filter(None, map(get_clarification, main_sections)), None)
+
 	except Exception:
 		print("messed up extracting recommendations.")
 		traceback.print_exc()
diff --git a/tools/extractors.py b/tools/extractors.py
index 1000e06..fd518cb 100644
--- a/tools/extractors.py
+++ b/tools/extractors.py
@@ -1,6 +1,7 @@
 import re
 import json
 import random
+from functools import reduce
 
 r_yt_initial_data = re.compile(r"""(?:^\s*window\["ytInitialData"\]|var ytInitialData) = (\{.+?\});(?:\s*$|)""", re.S + re.M)
 r_yt_initial_player_response = re.compile(r"""(?:^\s*window\["ytInitialPlayerResponse"\]|var ytInitialPlayerResponse) = (\{.+?\});(?:\s*$||var )""", re.S + re.M)
@@ -30,3 +31,31 @@ def extract_yt_cfg(content):
 
 def eu_consent_cookie():
 	return {"CONSENT": "YES+cb.20210509-17-p0.en+F+{}".format(random.randint(100, 999))}
+
+def is_in(o, key):
+	"""Return whether `o[key]` can be looked up.
+
+	Lists accept only in-range, non-negative int indexes. For anything else this
+	is a plain `key in o` test, except that a TypeError from that test (`o` is
+	not a container, or `key` is unhashable) counts as "not in".
+	"""
+	if isinstance(o, list):
+		# bool is a subclass of int, but True/False are not meant as indexes.
+		return isinstance(key, int) and not isinstance(key, bool) and 0 <= key < len(o)
+	try:
+		return key in o
+	except TypeError:
+		return False
+
+def deep_get(o, properties):
+	"""Follow `properties` (dict keys and list indexes) down through nested dicts and lists.
+
+	Returns the value at the end of the path, or None if any step is missing or
+	lands on something that is not a dict or list. A value that is present but
+	falsy (0, "", [], False) is returned as-is rather than turned into None.
+	"""
+	def step(node, key):
+		if isinstance(node, (dict, list)) and is_in(node, key):
+			return node[key]
+		return None
+	return reduce(step, properties, o)