1
0
mirror of https://git.sr.ht/~cadence/NewLeaf synced 2024-11-22 07:37:29 +00:00

Various live video extraction fixes

This commit is contained in:
Cadence Ember 2020-08-13 20:25:10 +12:00
parent 57fb71b97d
commit 968fbaf2bd
No known key found for this signature in database
GPG Key ID: 128B99B1B74A6412

View File

@ -39,6 +39,8 @@ def extract_yt_initial_data(content):
raise Exception("Could not match ytInitialData in content") raise Exception("Could not match ytInitialData in content")
def combine_runs(runs): def combine_runs(runs):
if "simpleText" in runs: # check if simpletext instead
return runs["simpleText"]
if "runs" in runs: # check if already unpacked if "runs" in runs: # check if already unpacked
runs = runs["runs"] runs = runs["runs"]
return "".join([r["text"] for r in runs]) return "".join([r["text"] for r in runs])
@ -65,7 +67,7 @@ def combine_runs_html(runs):
return result return result
def add_html_links(text): def add_html_links(text):
r_link = re.compile(r"""https?://[a-z-]+(?:\.[a-z-]+)+(?:/[^\s,<>)]*)?""") # it's okay, I guess. r_link = re.compile(r"""https?://[a-z0-9-]+(?:\.[a-z0-9-]+)+(?:/[^\s,<>)]*)?""") # it's okay, I guess.
match = r_link.search(text) match = r_link.search(text)
if match is not None: if match is not None:
link = match.group() link = match.group()
@ -133,6 +135,20 @@ def generate_video_thumbnails(id):
"height": type[3] "height": type[3]
} for type in types] } for type in types]
def generate_full_author_thumbnails(original):
    """Expand an avatar thumbnail list into a fixed set of square sizes.

    The URL of the first entry in `original` is searched for a size
    component of the form ``=s<digits>-``. When one is found, a new list is
    returned with one entry per size in (32, 48, 76, 100, 176, 512); each
    entry has the size component rewritten and "width"/"height" set to that
    size. When `original` is empty, or the URL has no such component,
    `original` is returned unchanged.
    """
    if not original:
        # No first entry to derive URLs from; indexing [0] would raise IndexError.
        return original
    r_size_part = re.compile(r"""=s[0-9]+-""")
    url = original[0]["url"]
    if r_size_part.search(url) is None:
        return original
    sizes = [32, 48, 76, 100, 176, 512]
    # Substitute each size directly rather than building a str.format
    # template from the URL: a literal "{" or "}" elsewhere in the URL would
    # make format() raise or mangle the result.
    return [{
        "url": r_size_part.sub("=s{}-".format(size), url),
        "width": size,
        "height": size
    } for size in sizes]
def normalise_url_protocol(url): def normalise_url_protocol(url):
if url.startswith("//"): if url.startswith("//"):
url = "https:" + url url = "https:" + url
@ -217,7 +233,7 @@ class Second(object):
"videoThumbnails": generate_video_thumbnails(info["id"]), "videoThumbnails": generate_video_thumbnails(info["id"]),
"storyboards": None, "storyboards": None,
"description": info["description"], "description": info["description"],
"descriptionHtml": info["description"], "descriptionHtml": add_html_links(escape_html_textcontent(info["description"])),
"published": published, "published": published,
"publishedText": None, "publishedText": None,
"keywords": None, "keywords": None,
@ -324,7 +340,8 @@ class Second(object):
"lengthSeconds": get_length_or_live_now(r), "lengthSeconds": get_length_or_live_now(r),
"second__lengthText": get_length_text_or_live_now(r), "second__lengthText": get_length_text_or_live_now(r),
"viewCountText": get_view_count_text_or_recommended(r), "viewCountText": get_view_count_text_or_recommended(r),
"viewCount": get_view_count_or_recommended(r) "viewCount": get_view_count_or_recommended(r),
"second__liveNow": is_live(r)
} for r in [get_useful_recommendation_data(r) for r in recommendations if get_useful_recommendation_data(r)]) } for r in [get_useful_recommendation_data(r) for r in recommendations if get_useful_recommendation_data(r)])
m_yt_player_config = re.search(r_yt_player_config, content) m_yt_player_config = re.search(r_yt_player_config, content)
@ -428,11 +445,14 @@ class Second(object):
if len(suffix) == 1: if len(suffix) == 1:
ucid = suffix[0] ucid = suffix[0]
else: # len(suffix) >= 2 else: # len(suffix) >= 2
if suffix[0] == "videos" or suffix[0] == "latest": if suffix[0] == "videos" or suffix[0] == "latest" or suffix[0] == "playlists":
[part, ucid] = suffix [part, ucid] = suffix
else: else:
[ucid, part] = suffix [ucid, part] = suffix
if part == "playlists":
return []
if part == "latest": if part == "latest":
# use RSS # use RSS
with requests.get("https://www.youtube.com/feeds/videos.xml?channel_id={}".format(ucid)) as r: with requests.get("https://www.youtube.com/feeds/videos.xml?channel_id={}".format(ucid)) as r:
@ -475,7 +495,8 @@ class Second(object):
else: # part == "videos" else: # part == "videos"
return self.channel_cache[ucid]["latestVideos"] return self.channel_cache[ucid]["latestVideos"]
with requests.get("https://www.youtube.com/channel/{}/videos".format(ucid)) as r: channel_type = "channel" if len(ucid) == 24 and ucid[:2] == "UC" else "user"
with requests.get("https://www.youtube.com/{}/{}/videos".format(channel_type, ucid)) as r:
r.raise_for_status() r.raise_for_status()
yt_initial_data = extract_yt_initial_data(r.content.decode("utf8")) yt_initial_data = extract_yt_initial_data(r.content.decode("utf8"))
header = yt_initial_data["header"]["c4TabbedHeaderRenderer"] header = yt_initial_data["header"]["c4TabbedHeaderRenderer"]
@ -485,7 +506,7 @@ class Second(object):
author_banners = header["banner"]["thumbnails"] author_banners = header["banner"]["thumbnails"]
for t in author_banners: for t in author_banners:
t["url"] = normalise_url_protocol(t["url"]) t["url"] = normalise_url_protocol(t["url"])
author_thumbnails = header["avatar"]["thumbnails"] author_thumbnails = generate_full_author_thumbnails(header["avatar"]["thumbnails"])
subscriber_count = combine_runs(header["subscriberCountText"]) subscriber_count = combine_runs(header["subscriberCountText"])
description = yt_initial_data["metadata"]["channelMetadataRenderer"]["description"] description = yt_initial_data["metadata"]["channelMetadataRenderer"]["description"]
allowed_regions = yt_initial_data["metadata"]["channelMetadataRenderer"]["availableCountryCodes"] allowed_regions = yt_initial_data["metadata"]["channelMetadataRenderer"]["availableCountryCodes"]
@ -497,11 +518,16 @@ class Second(object):
) )
latest_videos = [] latest_videos = []
for v in videos: for v in videos:
length_text = next(o for o in v["thumbnailOverlays"] if "thumbnailOverlayTimeStatusRenderer" in o) \ length_text = "LIVE"
["thumbnailOverlayTimeStatusRenderer"]["text"]["simpleText"] length_seconds = -1
for o in v["thumbnailOverlays"]:
if "thumbnailOverlayTimeStatusRenderer" in o:
length_text = combine_runs(o["thumbnailOverlayTimeStatusRenderer"]["text"])
if o["thumbnailOverlayTimeStatusRenderer"]["style"] != "LIVE":
length_text_to_seconds(length_text)
latest_videos.append({ latest_videos.append({
"type": "video", "type": "video",
"title": v["title"]["simpleText"], "title": combine_runs(v["title"]),
"videoId": v["videoId"], "videoId": v["videoId"],
"author": author, "author": author,
"authorId": author_id, "authorId": author_id,
@ -509,12 +535,12 @@ class Second(object):
"videoThumbnails": generate_video_thumbnails(v["videoId"]), "videoThumbnails": generate_video_thumbnails(v["videoId"]),
"description": "", "description": "",
"descriptionHtml": "", "descriptionHtml": "",
"viewCount": view_count_text_to_number(v["viewCountText"]["simpleText"]), "viewCount": view_count_text_to_number(combine_runs(v["viewCountText"])),
"second__viewCountText": v["viewCountText"]["simpleText"], "second__viewCountText": combine_runs(v["viewCountText"]),
"second__viewCountTextShort": v["shortViewCountText"]["simpleText"], "second__viewCountTextShort": combine_runs(v["shortViewCountText"]),
"published": 0, "published": 0,
"publishedText": v["publishedTimeText"]["simpleText"], "publishedText": v["publishedTimeText"]["simpleText"] if "publishedTimeText" in v else "Live now",
"lengthSeconds": length_text_to_seconds(length_text), "lengthSeconds": length_seconds,
"second__lengthText": length_text, "second__lengthText": length_text,
"liveNow": None, "liveNow": None,
"paid": None, "paid": None,
@ -581,7 +607,7 @@ class Second(object):
"viewCount": get_view_count_or_recommended(video), "viewCount": get_view_count_or_recommended(video),
"second__viewCountText": get_view_count_text_or_recommended(video), "second__viewCountText": get_view_count_text_or_recommended(video),
"published": None, "published": None,
"publishedText": video["publishedTimeText"]["simpleText"], "publishedText": video["publishedTimeText"]["simpleText"] if "publishedTimeText" in video else "Live now",
"lengthSeconds": get_length_or_live_now(video), "lengthSeconds": get_length_or_live_now(video),
"second__lengthText": get_length_text_or_live_now(video), "second__lengthText": get_length_text_or_live_now(video),
"liveNow": is_live(video), "liveNow": is_live(video),