mirror of
https://git.sr.ht/~cadence/NewLeaf
synced 2024-11-14 20:17:29 +00:00
66b7d1bec8
This removes all of the code that was previously used to get them from /timedtext, and instead, always uses whatever is extracted from the video page. This does unfortunately now require a whole video fetch just for the captions. But assuming captions are only requested by a frontend, this won't be a problem due to the memory cache. The captions link will be in memory because the just-requested video is in memory too.
30 lines
1.1 KiB
Python
30 lines
1.1 KiB
Python
import re
|
|
import requests
|
|
from extractors.video import extract_video
|
|
from tools.converters import escape_html_textcontent, get_subtitle_api_url
|
|
from urllib.parse import urlencode
|
|
import xml.etree.ElementTree as ET
|
|
|
|
def extract_captions(id, **kwargs):
    """Fetch the captions list for a video and resolve it per the query kwargs.

    Keyword arguments (e.g. lang=/label=) are forwarded unchanged to
    extract_captions_from_dict, which decides whether to return the list
    or a single subtitle file.
    """
    return extract_captions_from_dict(extract_captions_from_video(id), **kwargs)
|
# Return the subtitle file text for the language/label specified,
# or the captions list when neither is given.
def extract_captions_from_dict(captions, *, lang=None, label=None):
    """Resolve a captions request against an already-extracted captions dict.

    Parameters:
        captions: dict with a "captions" list of track dicts, each carrying
            "languageCode", "label" and "second__remoteUrl" keys.
        lang: language code to select a single track (keyword-only).
        label: human-readable track label to select a single track (keyword-only).

    Returns:
        The `captions` dict unchanged when neither lang nor label is given;
        otherwise the decoded subtitle file text for the first matching track.

    Raises:
        StopIteration: if no track matches lang/label.
        requests.HTTPError: if fetching the subtitle URL fails.
    """
    if lang is None and label is None:
        return captions

    # First track matching either the language code or the label.
    url = next(caption["second__remoteUrl"] for caption in captions["captions"] if caption["languageCode"] == lang or caption["label"] == label)
    with requests.get(url) as r:
        r.raise_for_status()
        # remove extraneous " align:start position:0%" on timestamps lines on auto-generated captions
        if (lang and "auto-generated" in lang) or (label and "auto-generated" in label):
            return re.sub(r"^([0-9:.]+ --> [0-9:.]+).*$", r"\1", r.content.decode("utf8"), flags=re.MULTILINE)
        # Fix: this previously returned the Response object itself (which the
        # `with` block closes on exit) instead of the subtitle text, giving
        # callers an inconsistent type vs. the auto-generated branch above.
        return r.content.decode("utf8")
|
def extract_captions_from_video(id):
    """Extract the full video page and keep only its captions list.

    Relies on extract_video's memory cache, so a captions request made right
    after a video request does not trigger a second remote fetch.
    """
    video = extract_video(id)
    return {"captions": video["captions"]}