Support auto-generated captions

The caption extraction is now entirely in our own hands.
This commit is contained in:
Cadence Ember 2021-04-05 01:23:54 +12:00
parent aaf7d65b32
commit 1d52fca3a0
No known key found for this signature in database
GPG Key ID: BC1C2C61CF521B17
3 changed files with 35 additions and 26 deletions

View File

@ -5,7 +5,10 @@ from urllib.parse import urlencode
import xml.etree.ElementTree as ET
def extract_captions(id, **kwargs):
captions = extract_captions_from_api(id) if "label" in kwargs and "auto-generated" in kwargs["label"]:
captions = extract_captions_from_video(id)
else:
captions = extract_captions_from_api(id)
return extract_captions_from_dict(captions, **kwargs)
# Return captions for the language specified,
@ -19,15 +22,9 @@ def extract_captions_from_dict(captions, *, lang=None, label=None):
r.raise_for_status()
return r
# Currently unused in favour of extract_captions_from_api. # List of captions directly from youtube, but no automatic
def extract_captions_from_video(id):
return {
"captions": extract_video(id)["captions"]
}
# no automatic captions
def extract_captions_from_api(id):
url = "https://video.google.com/timedtext?hl=en&type=list&v=%s" % id url = "https://video.google.com/timedtext?hl=en&type=list&v={}".format(id)
with requests.get(url) as r:
if r.status_code == 404:
return {
@ -67,3 +64,9 @@ def extract_captions_from_api(id):
})
return result
# We'll fall back to this function for auto-captions.
def extract_captions_from_video(id):
return {
"captions": extract_video(id)["captions"]
}

View File

@ -10,6 +10,7 @@ from tools.converters import *
from tools.extractors import extract_yt_initial_data, extract_yt_initial_player_response
import tools.files as files
from math import floor
from urllib.parse import parse_qs, urlparse, urlencode
from cachetools import TTLCache
video_cache = TTLCache(maxsize=50, ttl=300)
@ -165,22 +166,6 @@ def extract_video(id):
"second__height": format["height"]
})
if info.get("requested_subtitles"):
for language_code, subtitle in info["requested_subtitles"].items():
if language_code == "live_chat":
continue
subtitle_url = subtitle["url"]
label = get_language_label_from_url(subtitle_url)
subtitle_api_url = get_subtitle_api_url(id, label, language_code)
result["captions"].append({
"label": label if label != "" else language_code,
"languageCode": language_code,
"url": subtitle_api_url,
"second__subtitleUrl": subtitle_url # Direct YouTube url
})
result = get_more_stuff_from_file(info["id"], result)
return result
@ -300,6 +285,27 @@ def get_more_stuff_from_file(id, result):
f["qualityLabel"] = label
f["second__order"] = format_order(f)
for track in player_response["captions"]["playerCaptionsTracklistRenderer"]["captionTracks"]:
# safely editing the track format by taking apart the url...
url = track["baseUrl"]
parts = urlparse(url)
qs = parse_qs(parts.query)
qs["format"] = ["vtt"]
qs = urlencode(qs, doseq=True)
# ...and putting it back together...
parts = parts._replace(query=qs)
url = parts.geturl()
# now make the caption object
label = combine_runs(track["name"])
language_code = track["languageCode"]
subtitle_api_url = get_subtitle_api_url(id, label, language_code)
result["captions"].append({
"label": label,
"languageCode": language_code,
"url": subtitle_api_url,
"second__remoteUrl": url
})
except Exception:
print("messed up extracting recommendations.")
traceback.print_exc()

View File

@ -217,7 +217,7 @@ def get_subtitle_api_url(id, label, language_code):
subtitle_api_url = "/api/v1/captions/{}?".format(id)
params = {}
if label: if label and "auto-generated" in label:
params["label"] = label
else:
params["lang"] = language_code