1
0
mirror of https://git.sr.ht/~cadence/NewLeaf synced 2025-10-25 08:35:36 +00:00
NewLeaf/extractors/captions.py
2022-01-16 21:51:26 +13:00

30 lines
1.1 KiB
Python

import re
import requests
from extractors.video import extract_video
from tools.converters import escape_html_textcontent, get_subtitle_api_url
from urllib.parse import urlencode
import xml.etree.ElementTree as ET
def extract_captions(id, **kwargs):
    """Look up the captions of video ``id`` and narrow them using ``kwargs``.

    The caption listing comes from ``extract_captions_from_video(id)``; the
    keyword arguments are forwarded untouched to
    ``extract_captions_from_dict``, whose result is returned as is.
    """
    return extract_captions_from_dict(extract_captions_from_video(id), **kwargs)
# Seconds to wait on the caption server before giving up. Kept as a module
# constant rather than a parameter because extract_captions forwards arbitrary
# keyword arguments into extract_captions_from_dict.
_CAPTION_REQUEST_TIMEOUT = 30


# Return captions for the language specified,
# the captions list otherwise
def extract_captions_from_dict(captions, *, lang=None, label=None):
    """Return the caption listing, or download the track picked by lang/label.

    Args:
        captions: Mapping with a ``"captions"`` list. Each entry is read for
            its ``"languageCode"``, ``"label"`` and ``"second__remoteUrl"``
            keys.
        lang: Value to compare against each entry's ``"languageCode"``.
        label: Value to compare against each entry's ``"label"``.

    Returns:
        * ``captions`` itself, unchanged, when both ``lang`` and ``label``
          are ``None``.
        * A ``str`` holding the downloaded track with everything after the
          ``start --> end`` range removed from timestamp lines, when ``lang``
          or ``label`` contains ``"auto-generated"``.
        * Otherwise the ``requests`` response object for the track.

    Raises:
        StopIteration: No entry matches ``lang`` or ``label``. This comes
            straight from ``next`` and is intentionally left unchanged.
            NOTE(review): callers presumably rely on it -- confirm before
            replacing it with another exception type.
        requests.HTTPError: The caption server answered with an error status.
        requests.Timeout: The caption server did not respond in time.
    """
    if lang is None and label is None:
        return captions

    # First entry that matches either selector; no default is passed to
    # next(), so an unmatched request surfaces as StopIteration.
    url = next(
        caption["second__remoteUrl"]
        for caption in captions["captions"]
        if caption["languageCode"] == lang or caption["label"] == label
    )

    # Without a timeout, requests waits indefinitely on a stalled server.
    r = requests.get(url, timeout=_CAPTION_REQUEST_TIMEOUT)
    r.raise_for_status()

    # remove extraneous " align:start position:0%" on timestamps lines on auto-generated captions
    if (lang and "auto-generated" in lang) or (label and "auto-generated" in label):
        return re.sub(
            r"^([0-9:.]+ --> [0-9:.]+).*$",
            r"\1",
            r.content.decode("utf8"),
            flags=re.MULTILINE,
        )
    return r
def extract_captions_from_video(id):
    """Return ``{"captions": ...}`` built from the video data for ``id``.

    Calls ``extract_video(id)`` and wraps the value stored under its
    ``"captions"`` key in a new single-key dict.
    """
    video = extract_video(id)
    return {"captions": video["captions"]}