mirror of
https://git.sr.ht/~cadence/NewLeaf
synced 2026-03-07 13:01:37 +00:00
Implement captions
Automatic subtitles are not supported, because youtube_dlc does not provide them.
This commit is contained in:
parent
985f0c1c32
commit
6709aa30c2
5 changed files with 126 additions and 2 deletions
73
extractors/captions.py
Normal file
73
extractors/captions.py
Normal file
|
|
@ -0,0 +1,73 @@
|
|||
import requests
|
||||
from extractors.video import extract_video
|
||||
from tools.converters import escape_html_textcontent, get_subtitle_api_url
|
||||
from urllib.parse import urlencode
|
||||
import xml.etree.ElementTree as ET
|
||||
|
||||
def extract_captions(id, **kwargs):
    """Fetch the caption track list for a video and resolve it per *kwargs*.

    The track list comes from extract_captions_from_api(id); the keyword
    arguments are forwarded unchanged to extract_captions_from_dict, whose
    return value is returned as-is.
    """
    return extract_captions_from_dict(extract_captions_from_api(id), **kwargs)
|
||||
|
||||
# Given "lang" or "label": download and return the text of the first matching track.
# Given neither: hand back the captions dict untouched.
def extract_captions_from_dict(captions, **kwargs):
    """Select one caption track from *captions* and return its downloaded text.

    captions: dict whose "captions" entry is a list of track dicts with the
        keys "languageCode", "label" and "second__subtitleUrl".
    kwargs: "lang" is matched against each track's "languageCode"; otherwise
        "label" is matched against each track's "label". "lang" takes
        precedence when both are supplied.

    Returns the *captions* argument itself when neither keyword is given,
    the UTF-8 decoded response body of the first matching track's
    "second__subtitleUrl" when one matches, and None when nothing matches.
    A failed HTTP status is raised by raise_for_status().
    """
    if "lang" in kwargs:
        wanted_lang, wanted_label = kwargs["lang"], None
    elif "label" in kwargs:
        wanted_lang, wanted_label = None, kwargs["label"]
    else:
        return captions

    for track in captions["captions"]:
        is_match = (
            wanted_lang == track["languageCode"]
            or wanted_label == track["label"]
        )
        if not is_match:
            continue

        # First hit wins: fetch the subtitle document and stop searching.
        with requests.get(track["second__subtitleUrl"]) as response:
            response.raise_for_status()
            return response.content.decode("utf8")

    # No track matched the requested language or label.
    return None
|
||||
|
||||
# Not called at present; extract_captions_from_api is used instead.
def extract_captions_from_video(id):
    """Return {"captions": ...} taken from the extract_video(id) result."""
    video = extract_video(id)
    return {"captions": video["captions"]}
|
||||
|
||||
# no automatic captions
def extract_captions_from_api(id):
    """List the caption tracks of a video using the timedtext "list" endpoint.

    id: the video id; it is URL-encoded before being placed in the query.

    Returns a dict {"captions": [...]} with one entry per <track> element of
    the response, each entry holding:
        "label": the track's "name" attribute, or the language code when the
            name is missing or empty
        "languageCode": the track's "lang_code" attribute
        "url": whatever get_subtitle_api_url(id, label, language_code) returns
        "second__subtitleUrl": a www.youtube.com/api/timedtext URL requesting
            the track with fmt=vtt

    Raises whatever raise_for_status() raises on a failed HTTP status,
    xml.etree.ElementTree.ParseError when the body is not well-formed XML,
    and KeyError when a track has no "lang_code" attribute.
    """
    # Build the query with urlencode rather than string interpolation so an
    # id containing "&", "#" or spaces cannot alter the request. For ordinary
    # ids the resulting URL is identical to the interpolated one.
    list_query = urlencode({"hl": "en", "type": "list", "v": id})
    url = "https://video.google.com/timedtext?" + list_query

    with requests.get(url) as r:
        r.raise_for_status()
        transcript = ET.fromstring(r.content.decode("utf8"))

    captions = []
    for track in transcript.findall("track"):
        language_code = track.attrib["lang_code"]
        label = track.get("name", default=language_code)
        subtitle_api_url = get_subtitle_api_url(id, label, language_code)

        # The raw track name (possibly empty) is sent upstream as "name";
        # only the label stored in the result falls back to the language code.
        params = urlencode({
            "lang": language_code,
            "v": id,
            "fmt": "vtt",
            "name": label,
        })
        subtitle_url = "https://www.youtube.com/api/timedtext?" + params

        captions.append({
            "label": label if label != "" else language_code,
            "languageCode": language_code,
            "url": subtitle_api_url,
            "second__subtitleUrl": subtitle_url,
        })

    return {"captions": captions}
|
||||
|
|
@ -19,7 +19,9 @@ ytdl_opts = {
|
|||
"playlist_items": "1-100",
|
||||
"extract_flat": "in_playlist",
|
||||
"write_pages": True,
|
||||
"source_address": "0.0.0.0"
|
||||
"source_address": "0.0.0.0",
|
||||
"writesubtitles": True,
|
||||
"allsubtitles": True,
|
||||
}
|
||||
ytdl = youtube_dlc.YoutubeDL(ytdl_opts)
|
||||
|
||||
|
|
@ -171,6 +173,23 @@ def extract_video(id):
|
|||
"second__width": format["width"],
|
||||
"second__height": format["height"]
|
||||
})
|
||||
|
||||
if "requested_subtitles" in info and info["requested_subtitles"]:
|
||||
|
||||
for language_code, subtitle in info["requested_subtitles"].items():
|
||||
|
||||
if language_code != "live_chat":
|
||||
subtitle_url = subtitle["url"]
|
||||
label = get_language_label_from_url(subtitle_url)
|
||||
subtitle_api_url = get_subtitle_api_url(id, label, language_code)
|
||||
|
||||
result["captions"].append({
|
||||
"label": label if label != "" else language_code,
|
||||
"languageCode": language_code,
|
||||
"url": subtitle_api_url,
|
||||
"second__subtitleUrl": subtitle_url # Direct YouTube url
|
||||
})
|
||||
|
||||
|
||||
result = get_more_stuff_from_file(info["id"], result)
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue