From 1d52fca3a01d0b74679306e974fa04fa7a05d983 Mon Sep 17 00:00:00 2001 From: Cadence Ember Date: Mon, 5 Apr 2021 01:23:54 +1200 Subject: [PATCH] Support auto-generated captions The caption extraction is now entirely in our own hands. --- extractors/captions.py | 21 ++++++++++++--------- extractors/video.py | 38 ++++++++++++++++++++++---------------- tools/converters.py | 2 +- 3 files changed, 35 insertions(+), 26 deletions(-) diff --git a/extractors/captions.py b/extractors/captions.py index 2df701a..2bf5287 100644 --- a/extractors/captions.py +++ b/extractors/captions.py @@ -5,7 +5,10 @@ from urllib.parse import urlencode import xml.etree.ElementTree as ET def extract_captions(id, **kwargs): - captions = extract_captions_from_api(id) + if "label" in kwargs and "auto-generated" in kwargs["label"]: + captions = extract_captions_from_video(id) + else: + captions = extract_captions_from_api(id) return extract_captions_from_dict(captions, **kwargs) # Return captions for the language specified, @@ -19,15 +22,9 @@ def extract_captions_from_dict(captions, *, lang=None, label=None): r.raise_for_status() return r -# Currently unused in favour of extract_captions_from_api. -def extract_captions_from_video(id): - return { - "captions": extract_video(id)["captions"] - } - -# no automatic captions +# List of captions directly from youtube, but no automatic def extract_captions_from_api(id): - url = "https://video.google.com/timedtext?hl=en&type=list&v=%s" % id + url = "https://video.google.com/timedtext?hl=en&type=list&v={}".format(id) with requests.get(url) as r: if r.status_code == 404: return { @@ -67,3 +64,9 @@ def extract_captions_from_api(id): }) return result + +# We'll fall back to this function for auto-captions. +def extract_captions_from_video(id): + return { + "captions": extract_video(id)["captions"] + } diff --git a/extractors/video.py b/extractors/video.py index 3d6ebc9..c6f95a5 100644 --- a/extractors/video.py +++ b/extractors/video.py @@ -10,6 +10,7 @@ from tools.converters import * from tools.extractors import extract_yt_initial_data, extract_yt_initial_player_response import tools.files as files from math import floor +from urllib.parse import parse_qs, urlparse, urlencode from cachetools import TTLCache video_cache = TTLCache(maxsize=50, ttl=300) @@ -165,22 +166,6 @@ def extract_video(id): "second__height": format["height"] }) - if info.get("requested_subtitles"): - for language_code, subtitle in info["requested_subtitles"].items(): - if language_code == "live_chat": - continue - - subtitle_url = subtitle["url"] - label = get_language_label_from_url(subtitle_url) - subtitle_api_url = get_subtitle_api_url(id, label, language_code) - result["captions"].append({ - "label": label if label != "" else language_code, - "languageCode": language_code, - "url": subtitle_api_url, - "second__subtitleUrl": subtitle_url # Direct YouTube url - }) - - result = get_more_stuff_from_file(info["id"], result) return result @@ -300,6 +285,27 @@ def get_more_stuff_from_file(id, result): f["qualityLabel"] = label f["second__order"] = format_order(f) + for track in player_response["captions"]["playerCaptionsTracklistRenderer"]["captionTracks"]: + # safely editing the track format by taking apart the url... + url = track["baseUrl"] + parts = urlparse(url) + qs = parse_qs(parts.query) + qs["format"] = ["vtt"] + qs = urlencode(qs, doseq=True) + # ...and putting it back together... + parts = parts._replace(query=qs) + url = parts.geturl() + # now make the caption object + label = combine_runs(track["name"]) + language_code = track["languageCode"] + subtitle_api_url = get_subtitle_api_url(id, label, language_code) + result["captions"].append({ + "label": label, + "languageCode": language_code, + "url": subtitle_api_url, + "second__remoteUrl": url + }) + except Exception: print("messed up extracting recommendations.") traceback.print_exc() diff --git a/tools/converters.py b/tools/converters.py index 3a8e3ae..87abcd1 100644 --- a/tools/converters.py +++ b/tools/converters.py @@ -217,7 +217,7 @@ def get_subtitle_api_url(id, label, language_code): subtitle_api_url = "/api/v1/captions/{}?".format(id) params = {} - if label: + if label and "auto-generated" in label: params["label"] = label else: params["lang"] = language_code