diff --git a/README.md b/README.md
index c8f043c..16e3274 100644
--- a/README.md
+++ b/README.md
@@ -13,6 +13,7 @@ These endpoints are somewhat implemented:
 - `/api/v1/channels/{part}/{ucid}`
 - `/api/v1/search?q={search}`
 - `/api/v1/search/suggestions?q={search}`
+- `/api/v1/captions/{id}`
 - `/vi/{id}/{file}`
 - `/api/manifest/dash/id/{id}`
 
diff --git a/extractors/captions.py b/extractors/captions.py
new file mode 100644
index 0000000..85df2be
--- /dev/null
+++ b/extractors/captions.py
@@ -0,0 +1,73 @@
+import requests
+from extractors.video import extract_video
+from tools.converters import escape_html_textcontent, get_subtitle_api_url
+from urllib.parse import urlencode
+import xml.etree.ElementTree as ET
+
+def extract_captions(id, **kwargs):
+	captions = extract_captions_from_api(id)
+	return extract_captions_from_dict(captions, **kwargs)
+
+# Return captions for the language specified,
+# The captions list otherwise
+def extract_captions_from_dict(captions, **kwargs):
+	lang = None
+	label = None
+
+	if "lang" in kwargs:
+		lang = kwargs["lang"]
+	elif "label" in kwargs:
+		label = kwargs["label"]
+	else:
+		return captions
+
+	for subtitle in captions["captions"]:
+		if lang == subtitle["languageCode"] or label == subtitle["label"]:
+			url = subtitle["second__subtitleUrl"]
+
+			with requests.get(url) as r:
+				r.raise_for_status()
+				return r.content.decode("utf8")
+
+# Currently unused in favour of extract_captions_from_api.
+def extract_captions_from_video(id):
+	return {
+		"captions": extract_video(id)["captions"]
+	}
+
+# no automatic captions
+def extract_captions_from_api(id):
+	url = "https://video.google.com/timedtext?hl=en&type=list&v=%s" % id
+	with requests.get(url) as r:
+		r.raise_for_status()
+
+		transcript = ET.fromstring(r.content.decode("utf8"))
+		tracks = transcript.findall("track")
+
+		captions = []
+		result = {
+			"captions": captions
+		}
+
+		for track in tracks:
+			language_code = track.attrib["lang_code"]
+			label = track.get("name", default=language_code)
+			subtitle_api_url = get_subtitle_api_url(id, label, language_code)
+
+			params = urlencode({
+				"lang": language_code,
+				"v": id,
+				"fmt": "vtt",
+				"name": label
+			})
+
+			subtitle_url = "https://www.youtube.com/api/timedtext?" + params
+
+			captions.append({
+				"label": label if label != "" else language_code,
+				"languageCode": language_code,
+				"url": subtitle_api_url,
+				"second__subtitleUrl": subtitle_url
+			})
+
+		return result
diff --git a/extractors/video.py b/extractors/video.py
index 22a68bd..d24d060 100644
--- a/extractors/video.py
+++ b/extractors/video.py
@@ -19,7 +19,9 @@ ytdl_opts = {
 	"playlist_items": "1-100",
 	"extract_flat": "in_playlist",
 	"write_pages": True,
-	"source_address": "0.0.0.0"
+	"source_address": "0.0.0.0",
+	"writesubtitles": True,
+	"allsubtitles": True,
 }
 
 ytdl = youtube_dlc.YoutubeDL(ytdl_opts)
@@ -171,6 +173,23 @@ def extract_video(id):
 				"second__width": format["width"],
 				"second__height": format["height"]
 			})
+
+		if "requested_subtitles" in info and info["requested_subtitles"]:
+
+			for language_code, subtitle in info["requested_subtitles"].items():
+
+				if language_code != "live_chat":
+					subtitle_url = subtitle["url"]
+					label = get_language_label_from_url(subtitle_url)
+					subtitle_api_url = get_subtitle_api_url(id, label, language_code)
+
+					result["captions"].append({
+						"label": label if label != "" else language_code,
+						"languageCode": language_code,
+						"url": subtitle_api_url,
+						"second__subtitleUrl": subtitle_url # Direct YouTube url
+					})
+
 
 		result = get_more_stuff_from_file(info["id"], result)
 
diff --git a/index.py b/index.py
index d2d34f1..d44be30 100644
--- a/index.py
+++ b/index.py
@@ -7,6 +7,7 @@ from extractors.channel import extract_channel, extract_channel_videos, extract_
 from extractors.manifest import extract_manifest
 from extractors.search import extract_search
 from extractors.suggestions import extract_search_suggestions
+from extractors.captions import extract_captions
 
 @cherrypy.tools.register("before_finalize", priority=60)
 def custom_headers():
@@ -22,7 +23,8 @@ class Second(object):
 		endpoints = [
 			["channels", 1, 2],
 			["videos", 1, 1],
-			["search", 0, 1]
+			["search", 0, 1],
+			["captions", 1, 1]
 		]
 		for e in endpoints:
 			if vpath[2] == e[0] and len(vpath) >= e[1]+3 and len(vpath) <= e[2]+3:
@@ -90,6 +92,17 @@ class Second(object):
 	@cherrypy.tools.json_out()
 	def suggestions(self, *, q, **kwargs):
 		return extract_search_suggestions(q)
+
+	@cherrypy.expose
+	def captions(self, id, **kwargs):
+		result = extract_captions(id, **kwargs)
+		if type(result) is dict:
+			cherrypy.response.headers["content-type"] = "application/json"
+			return bytes(json.dumps(result), "utf8")
+		else:
+			cherrypy.response.headers["content-type"] = "text/vtt; charset=UTF-8"
+			return result
+
 
 	@cherrypy.expose
 	def vi(self, id, file):
diff --git a/tools/converters.py b/tools/converters.py
index dc48c8c..68d34b8 100644
--- a/tools/converters.py
+++ b/tools/converters.py
@@ -2,6 +2,7 @@ import configuration
 import datetime
 import re
 import time
+from urllib.parse import urlparse, parse_qs, quote_plus
 
 def length_text_to_seconds(text):
 	s = text.split(":")
@@ -205,3 +206,20 @@ def time_to_past_text(timestamp):
 	number = diff // unit_value
 	plural_unit = unit_name if number == 1 else unit_name + "s"
 	return "{} {} ago".format(number, plural_unit)
+
+def get_language_label_from_url(url_string):
+	url = urlparse(url_string)
+	params = parse_qs(url.query)
+	label = params["name"][0] if "name" in params else "" # name may be in params with empty value
+	return label
+
+def get_subtitle_api_url(id, label, language_code):
+	subtitle_api_url = "{}/api/v1/captions/{}?".format(configuration.website_origin, id)
+
+	if label == "":
+		label = language_code
+		subtitle_api_url += "lang=" + quote_plus(language_code)
+	else:
+		subtitle_api_url += "label=" + quote_plus(label)
+
+	return subtitle_api_url
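
For reference, a minimal client-side sketch of the `/api/v1/captions/{id}` endpoint this patch adds. The base URL and video id below are assumptions for illustration (the real origin comes from configuration.website_origin); the behaviour it exercises is from the patch itself: with no lang/label parameter the endpoint returns the JSON track listing, and with either parameter it returns the selected track as WebVTT.

# Usage sketch only; assumes a local instance at http://localhost:3000
# and a video id that has caption tracks.
import requests

BASE = "http://localhost:3000"   # assumption: depends on configuration.website_origin
video_id = "jNQXAC9IVRw"         # hypothetical example id

# No lang/label: JSON listing of available tracks.
listing = requests.get("{}/api/v1/captions/{}".format(BASE, video_id))
for track in listing.json()["captions"]:
	print(track["languageCode"], track["label"])

# With lang (or label): the chosen track is returned as WebVTT text.
vtt = requests.get("{}/api/v1/captions/{}".format(BASE, video_id), params={"lang": "en"})
print(vtt.headers["content-type"])   # text/vtt; charset=UTF-8
print(vtt.text.splitlines()[0])      # first line of the WebVTT payload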