From 554cd8cc3a25918ec8f2ffd0cf25791788d8dc37 Mon Sep 17 00:00:00 2001 From: Cadence Ember Date: Thu, 3 Dec 2020 17:00:06 +1300 Subject: [PATCH] Improve ytInitialData extraction --- extractors/video.py | 11 ++++++++--- tools/extractors.py | 4 +--- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/extractors/video.py b/extractors/video.py index 008b548..1c619da 100644 --- a/extractors/video.py +++ b/extractors/video.py @@ -28,6 +28,11 @@ def get_created_files(id): id = "_" + id[1:] # youtube-dl changes - to _ at the start, presumably to not accidentally trigger switches with * in shell return (f for f in os.listdir() if f.startswith("{}_".format(id))) +def clean_up_temp_files(id): + created_files = get_created_files(id) + for file in created_files: + os.unlink(file) + def format_order(format): # most significant to least significant # key, max, order, transform @@ -172,6 +177,8 @@ def extract_video(id): return result except youtube_dlc.DownloadError as e: + clean_up_temp_files(id) + if isinstance(e.exc_info[1], urllib.error.HTTPError): if e.exc_info[1].code == 429: result = { @@ -192,9 +199,7 @@ def extract_video(id): print("messed up in original transform.") finally: - created_files = get_created_files(id) - for file in created_files: - os.unlink(file) + clean_up_temp_files(id) return result def get_more_stuff_from_file(id, result): diff --git a/tools/extractors.py b/tools/extractors.py index 7236850..0b9592b 100644 --- a/tools/extractors.py +++ b/tools/extractors.py @@ -1,13 +1,11 @@ import re import json -r_yt_initial_data = re.compile(r"""(?:\s*window\["ytInitialData"\]|var ytInitialData) = (\{.+\});""") +r_yt_initial_data = re.compile(r"""(?:^\s*window\["ytInitialData"\]|var ytInitialData) = (\{.+?\});(?:\s*$|)""", re.S + re.M) def extract_yt_initial_data(content): - content = content.replace("\n", "") m_yt_initial_data = re.search(r_yt_initial_data, content) if m_yt_initial_data: - print(m_yt_initial_data.group(1)) yt_initial_data = json.loads(m_yt_initial_data.group(1)) return yt_initial_data else: