mirror of
				https://git.sr.ht/~cadence/NewLeaf
				synced 2025-11-04 05:25:36 +00:00 
			
		
		
		
	This removes all of the code that was previously used to get captions from /timedtext, and instead always uses whatever is extracted from the video page. This does unfortunately now require a whole video fetch just for the captions. But assuming captions are only requested by a frontend, this won't be a problem due to the memory cache. The captions link will be in memory because the just-requested video is in memory too.
		
			
				
	
	
		
			30 lines
		
	
	
		
			1.1 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			30 lines
		
	
	
		
			1.1 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
import re
 | 
						|
import requests
 | 
						|
from extractors.video import extract_video
 | 
						|
from tools.converters import escape_html_textcontent, get_subtitle_api_url
 | 
						|
from urllib.parse import urlencode
 | 
						|
import xml.etree.ElementTree as ET
 | 
						|
 | 
						|
def extract_captions(id, **kwargs):
	"""Fetch the captions of video ``id``, optionally narrowed to one track.

	The caption listing is obtained through ``extract_captions_from_video``
	and handed to ``extract_captions_from_dict`` together with any keyword
	arguments (``lang`` / ``label``), whose return value is returned as is.
	"""
	captions = extract_captions_from_video(id)
	return extract_captions_from_dict(captions, **kwargs)
 | 
						|
 | 
						|
# Return the captions for the specified language or label,
# or the whole captions list if neither is given.
 | 
						|
def extract_captions_from_dict(captions, *, lang=None, label=None):
	"""Return the caption listing, or download one caption track from it.

	With neither ``lang`` nor ``label`` given, ``captions`` is returned
	unchanged. Otherwise the first entry of ``captions["captions"]`` whose
	``languageCode`` equals ``lang`` or whose ``label`` equals ``label`` is
	selected, and its ``second__remoteUrl`` is fetched.

	Returns:
		``captions`` itself when no filter is given; the decoded and
		cleaned-up caption text (``str``) when ``lang`` or ``label``
		contains "auto-generated"; otherwise the ``requests`` response
		object of the download.

	Raises:
		StopIteration: no entry matches ``lang`` or ``label``.
		requests.HTTPError: the download answered with an error status.
	"""
	if lang is None and label is None:
		return captions

	# next() without a default is deliberate: a missing track surfaces as
	# StopIteration rather than as a silent None.
	url = next(
		caption["second__remoteUrl"]
		for caption in captions["captions"]
		if caption["languageCode"] == lang or caption["label"] == label
	)
	with requests.get(url) as r:
		r.raise_for_status()
		# remove extraneous " align:start position:0%" on timestamps lines on auto-generated captions
		if (lang and "auto-generated" in lang) or (label and "auto-generated" in label):
			return re.sub(
				r"^([0-9:.]+ --> [0-9:.]+).*$",
				r"\1",
				r.content.decode("utf8"),
				flags=re.MULTILINE,
			)
		# The response object itself is returned; the `with` block closes it
		# on the way out, after the non-streamed body has been downloaded.
		return r
 | 
						|
 | 
						|
def extract_captions_from_video(id):
	"""Return ``{"captions": ...}`` built from the extracted data of video ``id``.

	The value is whatever ``extract_video(id)`` stores under its
	``"captions"`` key.
	"""
	return {
		"captions": extract_video(id)["captions"]
	}
 |