Implement captions

Automatic subtitles are not supported, because youtube_dlc does not provide them.
2026-05-01 15:21:35 +00:00 · 2021-01-17 23:59:14 +01:00 · 2021-01-17 23:59:14 +01:00 · 6709aa30c2
commit 6709aa30c2
parent 985f0c1c32
5 changed files with 126 additions and 2 deletions
--- a/extractors/captions.py
+++ b/extractors/captions.py
@ -0,0 +1,73 @@
+import requests
+from extractors.video import extract_video
+from tools.converters import escape_html_textcontent, get_subtitle_api_url
+from urllib.parse import urlencode
+import xml.etree.ElementTree as ET
+
+def extract_captions(id, **kwargs):
+	captions = extract_captions_from_api(id)
+	return extract_captions_from_dict(captions, **kwargs)
+
+# Return the subtitle text of the track matching the "lang" or "label" kwarg,
+# or the whole captions dict when neither kwarg is given.
+def extract_captions_from_dict(captions, **kwargs):
+	lang = None
+	label = None
+
+	if "lang" in kwargs:
+		lang = kwargs["lang"]
+	elif "label" in kwargs:
+		label = kwargs["label"]
+	else:
+		return captions
+
+	for subtitle in captions["captions"]:
+		if lang == subtitle["languageCode"] or label == subtitle["label"]:
+			url = subtitle["second__subtitleUrl"]
+
+			with requests.get(url) as r:
+				r.raise_for_status()
+				return r.content.decode("utf8")
+
+# Currently unused in favour of extract_captions_from_api.
+def extract_captions_from_video(id):
+	return {
+		"captions": extract_video(id)["captions"]
+	}
+
+# List a video's caption tracks via the timedtext list endpoint (no automatic captions).
+def extract_captions_from_api(id):
+	url = "https://video.google.com/timedtext?hl=en&type=list&v=%s" % id
+	with requests.get(url) as r:
+		r.raise_for_status()
+
+		transcript = ET.fromstring(r.content.decode("utf8"))
+		tracks = transcript.findall("track")
+
+		captions = []
+		result = {
+			"captions": captions
+		}
+
+		for track in tracks:
+			language_code = track.attrib["lang_code"]
+			label = track.get("name", default=language_code)
+			subtitle_api_url = get_subtitle_api_url(id, label, language_code)
+
+			params = urlencode({
+				"lang": language_code,
+				"v": id,
+				"fmt": "vtt",
+				"name": label
+			})
+
+			subtitle_url = "https://www.youtube.com/api/timedtext?" + params
+
+			captions.append({
+				"label": label if label != "" else language_code,
+				"languageCode": language_code,
+				"url": subtitle_api_url,
+				"second__subtitleUrl": subtitle_url
+			})
+
+		return result
--- a/extractors/video.py
+++ b/extractors/video.py
@ -19,7 +19,9 @@ ytdl_opts = {
 	"playlist_items": "1-100",
 	"extract_flat": "in_playlist",
 	"write_pages": True,
-	"source_address": "0.0.0.0"
+	"source_address": "0.0.0.0",
+	"writesubtitles": True,
+	"allsubtitles": True,
 }
 ytdl = youtube_dlc.YoutubeDL(ytdl_opts)

@ -171,6 +173,23 @@ def extract_video(id):
 					"second__width": format["width"],
 					"second__height": format["height"]
 				})
+		
+		if "requested_subtitles" in info and info["requested_subtitles"]:
+
+			for language_code, subtitle in info["requested_subtitles"].items():
+				
+				if language_code != "live_chat":
+					subtitle_url = subtitle["url"]
+					label = get_language_label_from_url(subtitle_url)
+					subtitle_api_url = get_subtitle_api_url(id, label, language_code)
+
+					result["captions"].append({
+						"label": label if label != "" else language_code,
+						"languageCode": language_code,
+						"url": subtitle_api_url,
+						"second__subtitleUrl": subtitle_url # Direct YouTube url
+					})
+

 		result = get_more_stuff_from_file(info["id"], result)