From 66b7d1bec87ad152728c8312ed3d3964f7dade16 Mon Sep 17 00:00:00 2001 From: bopol Date: Sat, 20 Nov 2021 08:40:34 +0100 Subject: [PATCH] Fix regular captions This removes all of the code that was previously used to get them from /timedtext, and instead, always uses whatever is extracted from the video page. This does unfortunately now require a whole video fetch just for the captions. But assuming captions are only requested by a frontend, this won't be a problem due to the memory cache. The captions link will be in memory because the just-requested video is in memory too. --- extractors/captions.py | 49 +----------------------------------------- 1 file changed, 1 insertion(+), 48 deletions(-) diff --git a/extractors/captions.py b/extractors/captions.py index 418ad4f..d05ec4e 100644 --- a/extractors/captions.py +++ b/extractors/captions.py @@ -6,10 +6,7 @@ from urllib.parse import urlencode import xml.etree.ElementTree as ET def extract_captions(id, **kwargs): - if "label" in kwargs and "auto-generated" in kwargs["label"]: - captions = extract_captions_from_video(id) - else: - captions = extract_captions_from_api(id) + captions = extract_captions_from_video(id) return extract_captions_from_dict(captions, **kwargs) # Return captions for the language specified, @@ -26,50 +23,6 @@ def extract_captions_from_dict(captions, *, lang=None, label=None): return re.sub(r"^([0-9:.]+ --> [0-9:.]+).*$", r"\1", r.content.decode("utf8"), flags=re.MULTILINE) return r -# List of captions directly from youtube, but no automatic -def extract_captions_from_api(id): - url = "https://video.google.com/timedtext?hl=en&type=list&v={}".format(id) - with requests.get(url) as r: - if r.status_code == 404: - return { - "error": "Video unavailable", - "identifier": "NOT_FOUND" - } - - r.raise_for_status() - - transcript = ET.fromstring(r.content.decode("utf8")) - tracks = transcript.findall("track") - - captions = [] - result = { - "captions": captions - } - - for track in tracks: - language_code = track.attrib["lang_code"] - label = track.get("name", default=language_code) - subtitle_api_url = get_subtitle_api_url(id, label, language_code) - - params = urlencode({ - "lang": language_code, - "v": id, - "fmt": "vtt", - "name": label - }) - - subtitle_url = "https://www.youtube.com/api/timedtext?" + params - - captions.append({ - "label": label if label != "" else language_code, - "languageCode": language_code, - "url": subtitle_api_url, - "second__remoteUrl": subtitle_url - }) - - return result - -# We'll fall back to this function for auto-captions. def extract_captions_from_video(id): return { "captions": extract_video(id)["captions"]