mirror of
https://git.sr.ht/~cadence/NewLeaf
synced 2024-12-22 13:06:59 +00:00
Fix regular captions
This removes all of the code that was previously used to fetch captions from /timedtext; instead, captions are now always taken from whatever is extracted from the video page. Unfortunately, this means a whole video fetch is now required just for the captions. However, assuming captions are only requested by a frontend, this won't be a problem thanks to the memory cache: the captions link will already be in memory because the just-requested video is in memory too.
This commit is contained in:
parent
550b633663
commit
66b7d1bec8
@ -6,10 +6,7 @@ from urllib.parse import urlencode
|
||||
import xml.etree.ElementTree as ET
|
||||
|
||||
def extract_captions(id, **kwargs):
|
||||
if "label" in kwargs and "auto-generated" in kwargs["label"]:
|
||||
captions = extract_captions_from_video(id)
|
||||
else:
|
||||
captions = extract_captions_from_api(id)
|
||||
captions = extract_captions_from_video(id)
|
||||
return extract_captions_from_dict(captions, **kwargs)
|
||||
|
||||
# Return captions for the language specified,
|
||||
@ -26,50 +23,6 @@ def extract_captions_from_dict(captions, *, lang=None, label=None):
|
||||
return re.sub(r"^([0-9:.]+ --> [0-9:.]+).*$", r"\1", r.content.decode("utf8"), flags=re.MULTILINE)
|
||||
return r
|
||||
|
||||
# List of captions directly from youtube, but no automatic
|
||||
def extract_captions_from_api(id):
|
||||
url = "https://video.google.com/timedtext?hl=en&type=list&v={}".format(id)
|
||||
with requests.get(url) as r:
|
||||
if r.status_code == 404:
|
||||
return {
|
||||
"error": "Video unavailable",
|
||||
"identifier": "NOT_FOUND"
|
||||
}
|
||||
|
||||
r.raise_for_status()
|
||||
|
||||
transcript = ET.fromstring(r.content.decode("utf8"))
|
||||
tracks = transcript.findall("track")
|
||||
|
||||
captions = []
|
||||
result = {
|
||||
"captions": captions
|
||||
}
|
||||
|
||||
for track in tracks:
|
||||
language_code = track.attrib["lang_code"]
|
||||
label = track.get("name", default=language_code)
|
||||
subtitle_api_url = get_subtitle_api_url(id, label, language_code)
|
||||
|
||||
params = urlencode({
|
||||
"lang": language_code,
|
||||
"v": id,
|
||||
"fmt": "vtt",
|
||||
"name": label
|
||||
})
|
||||
|
||||
subtitle_url = "https://www.youtube.com/api/timedtext?" + params
|
||||
|
||||
captions.append({
|
||||
"label": label if label != "" else language_code,
|
||||
"languageCode": language_code,
|
||||
"url": subtitle_api_url,
|
||||
"second__remoteUrl": subtitle_url
|
||||
})
|
||||
|
||||
return result
|
||||
|
||||
# We'll fall back to this function for auto-captions.
|
||||
def extract_captions_from_video(id):
|
||||
return {
|
||||
"captions": extract_video(id)["captions"]
|
||||
|
Loading…
Reference in New Issue
Block a user