1
0
Fork 0
mirror of https://git.sr.ht/~cadence/NewLeaf synced 2026-03-07 13:01:37 +00:00

Implement captions

Automatic subtitles are not supported, because youtube_dlc does not
provide them.
This commit is contained in:
bopol 2021-01-17 23:59:14 +01:00 committed by Cadence Ember
parent 985f0c1c32
commit 6709aa30c2
No known key found for this signature in database
GPG key ID: BC1C2C61CF521B17
5 changed files with 126 additions and 2 deletions

73
extractors/captions.py Normal file
View file

@ -0,0 +1,73 @@
import requests
from extractors.video import extract_video
from tools.converters import escape_html_textcontent, get_subtitle_api_url
from urllib.parse import urlencode
import xml.etree.ElementTree as ET
def extract_captions(id, **kwargs):
    """Fetch the caption list for a video and optionally resolve one track.

    The track list is obtained from extract_captions_from_api and then
    handed, together with any keyword arguments, to
    extract_captions_from_dict, whose result is returned unchanged.
    """
    return extract_captions_from_dict(extract_captions_from_api(id), **kwargs)
def extract_captions_from_dict(captions, **kwargs):
    """Return one caption track's text, or the caption list itself.

    Parameters:
        captions: dict with a "captions" list; each entry carries
            "languageCode", "label" and "second__subtitleUrl".
        lang (keyword, optional): language code of the wanted track.
        label (keyword, optional): label of the wanted track. Only
            consulted when ``lang`` is absent or None.

    Returns:
        * ``captions`` unchanged when neither selector is supplied
          (or both are None);
        * the body of the first matching track, decoded as UTF-8;
        * None when a selector was supplied but no track matches.

    Raises:
        requests.HTTPError: if downloading the matching track fails.
    """
    lang = kwargs.get("lang")
    label = kwargs.get("label")

    # Compare only the field the caller actually selected. Comparing the
    # unused selector (None) as well would match any track whose
    # corresponding field happens to be None.
    if lang is not None:
        field, wanted = "languageCode", lang
    elif label is not None:
        field, wanted = "label", label
    else:
        return captions

    for subtitle in captions["captions"]:
        if subtitle[field] == wanted:
            with requests.get(subtitle["second__subtitleUrl"]) as r:
                r.raise_for_status()
                return r.content.decode("utf8")

    # No track matched the requested language/label.
    return None
# Currently unused in favour of extract_captions_from_api.
def extract_captions_from_video(id):
    """Return the "captions" entry of extract_video(id), wrapped in a dict.

    The result has the single key "captions".
    """
    video = extract_video(id)
    return {"captions": video["captions"]}
# no automatic captions
def extract_captions_from_api(id):
    """List a video's caption tracks via the timedtext track-list endpoint.

    Every <track> element of the returned XML becomes one entry with:
        "label": the track's "name" attribute, or the language code when
            the name is missing or empty;
        "languageCode": the track's "lang_code" attribute;
        "url": value of get_subtitle_api_url(id, label, language_code);
        "second__subtitleUrl": direct YouTube timedtext URL (WebVTT).

    Returns:
        dict of the form {"captions": [entry, ...]}.

    Raises:
        requests.HTTPError: if the track-list request fails.
        xml.etree.ElementTree.ParseError: if the response is not
            well-formed XML.
        KeyError: if a track has no "lang_code" attribute.
    """
    # Build the query with urlencode (as the per-track URL below does) so
    # that the caller-supplied id is escaped rather than pasted verbatim.
    list_url = "https://video.google.com/timedtext?" + urlencode({
        "hl": "en",
        "type": "list",
        "v": id,
    })
    with requests.get(list_url) as r:
        r.raise_for_status()
        transcript = ET.fromstring(r.content.decode("utf8"))

    captions = []
    for track in transcript.findall("track"):
        language_code = track.attrib["lang_code"]
        label = track.get("name", default=language_code)
        subtitle_api_url = get_subtitle_api_url(id, label, language_code)

        # The raw (possibly empty) name is sent as the "name" parameter;
        # only the displayed label falls back to the language code.
        params = urlencode({
            "lang": language_code,
            "v": id,
            "fmt": "vtt",
            "name": label,
        })
        subtitle_url = "https://www.youtube.com/api/timedtext?" + params

        captions.append({
            "label": label if label != "" else language_code,
            "languageCode": language_code,
            "url": subtitle_api_url,
            "second__subtitleUrl": subtitle_url,
        })

    return {"captions": captions}

View file

@ -19,7 +19,9 @@ ytdl_opts = {
"playlist_items": "1-100",
"extract_flat": "in_playlist",
"write_pages": True,
"source_address": "0.0.0.0"
"source_address": "0.0.0.0",
"writesubtitles": True,
"allsubtitles": True,
}
ytdl = youtube_dlc.YoutubeDL(ytdl_opts)
@ -171,6 +173,23 @@ def extract_video(id):
"second__width": format["width"],
"second__height": format["height"]
})
if "requested_subtitles" in info and info["requested_subtitles"]:
for language_code, subtitle in info["requested_subtitles"].items():
if language_code != "live_chat":
subtitle_url = subtitle["url"]
label = get_language_label_from_url(subtitle_url)
subtitle_api_url = get_subtitle_api_url(id, label, language_code)
result["captions"].append({
"label": label if label != "" else language_code,
"languageCode": language_code,
"url": subtitle_api_url,
"second__subtitleUrl": subtitle_url # Direct YouTube url
})
result = get_more_stuff_from_file(info["id"], result)