Implement captions

Automatic subtitles are not supported, because youtube_dlc does not provide them.
2026-02-10 01:26:32 +00:00 · 2021-01-17 23:59:14 +01:00 · 2021-01-17 23:59:14 +01:00 · 6709aa30c2
commit 6709aa30c2
parent 985f0c1c32
5 changed files with 126 additions and 2 deletions
--- a/README.md
+++ b/README.md
@ -13,6 +13,7 @@ These endpoints are somewhat implemented:
 - `/api/v1/channels/{part}/{ucid}`
 - `/api/v1/search?q={search}`
 - `/api/v1/search/suggestions?q={search}`
+- `/api/v1/captions/{id}`
 - `/vi/{id}/{file}`
 - `/api/manifest/dash/id/{id}`

--- a/extractors/captions.py
+++ b/extractors/captions.py
@ -0,0 +1,73 @@
+import requests
+from extractors.video import extract_video
+from tools.converters import escape_html_textcontent, get_subtitle_api_url
+from urllib.parse import urlencode
+import xml.etree.ElementTree as ET
+
+def extract_captions(id, **kwargs):
+	# List the video's caption tracks, then let extract_captions_from_dict pick the response.
+	return extract_captions_from_dict(extract_captions_from_api(id), **kwargs)
+
+# Return the caption text for the requested language or label,
+# or the captions list itself when neither is given.
+def extract_captions_from_dict(captions, **kwargs):
+	# Without a "lang" or "label" selector the caller gets the track list itself.
+	if "lang" not in kwargs and "label" not in kwargs:
+		return captions
+
+	# "lang" wins when both selectors are given; the unused one stays None.
+	wanted_lang = kwargs["lang"] if "lang" in kwargs else None
+	wanted_label = None if "lang" in kwargs else kwargs["label"]
+
+	for track in captions["captions"]:
+		if wanted_lang != track["languageCode"] and wanted_label != track["label"]:
+			continue
+
+		# First matching track: download it from the direct URL and return its text.
+		with requests.get(track["second__subtitleUrl"]) as response:
+			response.raise_for_status()
+			return response.content.decode("utf8")
+	# No track matched: falls through and returns None.
+
+# Currently unused in favour of extract_captions_from_api.
+def extract_captions_from_video(id):
+	# Re-wrap the captions that extract_video collected under a "captions" key.
+	video = extract_video(id)
+	return {"captions": video["captions"]}
+
+# Lists the caption tracks of a video. Automatic captions are not included.
+def extract_captions_from_api(id):
+	"""Build the caption track list for video `id` from the timedtext list endpoint.
+
+	Returns {"captions": [...]}; each entry has "label", "languageCode", "url"
+	(from get_subtitle_api_url) and "second__subtitleUrl" (direct fmt=vtt URL).
+	Raises requests.HTTPError when the list request fails.
+	"""
+	# Encode the id instead of interpolating it, so a crafted id cannot add query parameters.
+	list_url = "https://video.google.com/timedtext?" + urlencode({"hl": "en", "type": "list", "v": id})
+	with requests.get(list_url) as r:
+		r.raise_for_status()
+		transcript = ET.fromstring(r.content.decode("utf8"))
+
+	captions = []
+	for track in transcript.findall("track"):
+		language_code = track.attrib["lang_code"]
+		# A track without a "name" attribute is labelled with its language code.
+		label = track.get("name", default=language_code)
+
+		params = urlencode({
+			"lang": language_code,
+			"v": id,
+			"fmt": "vtt",
+			"name": label
+		})
+
+		captions.append({
+			# An empty name also falls back to the language code for display.
+			"label": label if label != "" else language_code,
+			"languageCode": language_code,
+			"url": get_subtitle_api_url(id, label, language_code),
+			"second__subtitleUrl": "https://www.youtube.com/api/timedtext?" + params
+		})
+
+	return {"captions": captions}
--- a/extractors/video.py
+++ b/extractors/video.py
@ -19,7 +19,9 @@ ytdl_opts = {
 	"playlist_items": "1-100",
 	"extract_flat": "in_playlist",
 	"write_pages": True,
-	"source_address": "0.0.0.0"
+	"source_address": "0.0.0.0",
+	"writesubtitles": True,
+	"allsubtitles": True,
 }
 ytdl = youtube_dlc.YoutubeDL(ytdl_opts)

@ -171,6 +173,23 @@ def extract_video(id):
 					"second__width": format["width"],
 					"second__height": format["height"]
 				})
+		
+		if "requested_subtitles" in info and info["requested_subtitles"]:
+
+			for language_code, subtitle in info["requested_subtitles"].items():
+				
+				if language_code != "live_chat":
+					subtitle_url = subtitle["url"]
+					label = get_language_label_from_url(subtitle_url)
+					subtitle_api_url = get_subtitle_api_url(id, label, language_code)
+
+					result["captions"].append({
+						"label": label if label != "" else language_code,
+						"languageCode": language_code,
+						"url": subtitle_api_url,
+						"second__subtitleUrl": subtitle_url # Direct YouTube url
+					})
+

 		result = get_more_stuff_from_file(info["id"], result)

--- a/index.py
+++ b/index.py
@ -7,6 +7,7 @@ from extractors.channel import extract_channel, extract_channel_videos, extract_
 from extractors.manifest import extract_manifest
 from extractors.search import extract_search
 from extractors.suggestions import extract_search_suggestions
+from extractors.captions import extract_captions

@cherrypy.tools.register("before_finalize", priority=60)
 def custom_headers():
@ -22,7 +23,8 @@ class Second(object):
 			endpoints = [
 				["channels", 1, 2],
 				["videos", 1, 1],
-				["search", 0, 1]
+				["search", 0, 1],
+				["captions", 1, 1]
 			]
 			for e in endpoints:
 				if vpath[2] == e[0] and len(vpath) >= e[1]+3 and len(vpath) <= e[2]+3:
@ -90,6 +92,17 @@ class Second(object):
 	@cherrypy.tools.json_out()
 	def suggestions(self, *, q, **kwargs):
 		return extract_search_suggestions(q)
+	
+	@cherrypy.expose
+	def captions(self, id, **kwargs):
+		# A dict is the track list (served as JSON); anything else is passed through as WebVTT.
+		result = extract_captions(id, **kwargs)
+		if isinstance(result, dict):
+			cherrypy.response.headers["content-type"] = "application/json"
+			return bytes(json.dumps(result), "utf8")
+		cherrypy.response.headers["content-type"] = "text/vtt; charset=UTF-8"
+		return result
+

 	@cherrypy.expose
 	def vi(self, id, file):
--- a/tools/converters.py
+++ b/tools/converters.py
@ -2,6 +2,7 @@ import configuration
 import datetime
 import re
 import time
+from urllib.parse import urlparse, parse_qs, quote_plus

 def length_text_to_seconds(text):
 	s = text.split(":")
@ -205,3 +206,20 @@ def time_to_past_text(timestamp):
 			number = diff // unit_value
 			plural_unit = unit_name if number == 1 else unit_name + "s"
 			return "{} {} ago".format(number, plural_unit)
+
+def get_language_label_from_url(url_string):
+	# Pull the "name" query parameter out of a subtitle URL.
+	query = parse_qs(urlparse(url_string).query)
+	# parse_qs drops blank values by default, so an empty or missing name yields "".
+	return query.get("name", [""])[0]
+
+def get_subtitle_api_url(id, label, language_code):
+	# Build this server's captions URL for one track of video `id`.
+	# A track with an empty label is addressed by language code ("lang="),
+	# any other track by its label ("label="); the value is form-encoded.
+	base_url = "{}/api/v1/captions/{}?".format(configuration.website_origin, id)
+
+	if label == "":
+		return base_url + "lang=" + quote_plus(language_code)
+
+	return base_url + "label=" + quote_plus(label)