2020-09-23 12:56:16 +00:00
|
|
|
import cherrypy
|
2020-08-13 14:20:11 +00:00
|
|
|
import dateutil.parser
|
|
|
|
import requests
|
|
|
|
import xml.etree.ElementTree as ET
|
|
|
|
from tools.converters import *
|
|
|
|
from tools.extractors import extract_yt_initial_data
|
2020-09-05 12:31:17 +00:00
|
|
|
from threading import Lock
|
2020-08-13 14:20:11 +00:00
|
|
|
from cachetools import TTLCache
|
|
|
|
|
|
|
|
# Lifetime, in seconds, of an entry in either cache below.
_CACHE_TTL_SECONDS = 300

# Full channel dicts built by extract_channel(), keyed by the requested ucid.
# Reads and writes happen while holding channel_cache_lock.
channel_cache = TTLCache(maxsize=50, ttl=_CACHE_TTL_SECONDS)
channel_cache_lock = Lock()

# Feed-derived video lists built by extract_channel_latest(), keyed by ucid.
# Reads and writes happen while holding channel_latest_cache_lock.
channel_latest_cache = TTLCache(maxsize=500, ttl=_CACHE_TTL_SECONDS)
channel_latest_cache_lock = Lock()
|
2020-08-13 14:20:11 +00:00
|
|
|
|
|
|
|
def _grid_video_to_entry(video, author, author_id, author_url):
    """Convert one ``gridVideoRenderer`` dict into a ``latestVideos`` entry.

    ``author``, ``author_id`` and ``author_url`` are copied verbatim into the
    entry; everything else is read from ``video``.
    """
    # Start from "this is a live stream" and let a time-status overlay whose
    # style is not "LIVE" override it with a real duration.
    live = True
    length_text = "LIVE"
    length_seconds = -1
    for overlay in video["thumbnailOverlays"]:
        if "thumbnailOverlayTimeStatusRenderer" in overlay:
            time_status = overlay["thumbnailOverlayTimeStatusRenderer"]
            length_text = combine_runs(time_status["text"])
            if time_status["style"] != "LIVE":
                length_seconds = length_text_to_seconds(length_text)
                live = False

    # Videos without a published text are reported as "Live now" at time 0.
    published = 0
    published_text = "Live now"
    if "publishedTimeText" in video:
        published_text = video["publishedTimeText"]["simpleText"]
        published = past_text_to_time(published_text)

    view_count_text = combine_runs(video["viewCountText"]) if "viewCountText" in video else None
    view_count_text_short = combine_runs(video["shortViewCountText"]) if "shortViewCountText" in video else None

    return {
        "type": "video",
        "title": combine_runs(video["title"]),
        "videoId": video["videoId"],
        "author": author,
        "authorId": author_id,
        "authorUrl": author_url,
        "videoThumbnails": generate_video_thumbnails(video["videoId"]),
        "description": "",
        "descriptionHtml": "",
        "viewCount": view_count_text_to_number(view_count_text),
        "second__viewCountText": view_count_text,
        "second__viewCountTextShort": view_count_text_short,
        "published": published,
        "publishedText": published_text,
        "lengthSeconds": length_seconds,
        "second__lengthText": length_text,
        "liveNow": live,
        "paid": None,
        "premium": None,
        "isUpcoming": None
    }


def extract_channel(ucid):
    """Fetch a channel's "videos" page from YouTube and return it as a dict.

    ``ucid`` is treated as a channel id when it is 24 characters long and
    starts with "UC"; anything else is requested as a legacy user name.

    The returned dict carries author information, banners, thumbnails,
    subscriber count, description, allowed regions and ``latestVideos`` (one
    entry per grid item on the Videos tab, or an empty list when the tab has
    no grid). Results are stored in ``channel_cache`` under ``ucid`` and served
    from there on later calls while the entry is still cached.

    Raises whatever ``r.raise_for_status()`` raises for an unsuccessful HTTP
    response, ``KeyError`` when an expected key is absent from the page data,
    and ``StopIteration`` when no tab titled "Videos" is present.
    """
    with channel_cache_lock:
        if ucid in channel_cache:
            return channel_cache[ucid]

    channel_type = "channel" if len(ucid) == 24 and ucid[:2] == "UC" else "user"
    # hl=en is requested explicitly; the tab lookup below matches the English
    # title "Videos".
    with requests.get("https://www.youtube.com/{}/{}/videos?hl=en".format(channel_type, ucid)) as r:
        r.raise_for_status()
        yt_initial_data = extract_yt_initial_data(r.content.decode("utf8"))

        # Some pages have no c4TabbedHeaderRenderer; fall back to an empty
        # mapping so the membership tests below are simply false.
        header = yt_initial_data["header"].get("c4TabbedHeaderRenderer", {})
        channel_metadata = yt_initial_data["metadata"]["channelMetadataRenderer"]

        if header:
            author = header["title"]
            author_id = header["channelId"]
            author_url = header["navigationEndpoint"]["commandMetadata"]["webCommandMetadata"]["url"]
        else:
            author = channel_metadata["title"]
            author_id = channel_metadata["externalId"]
            author_url = channel_metadata["channelUrl"]

        # The key tested must be the key read: the guard previously looked for
        # a misspelled "subscribeCountText" and therefore never matched.
        if "subscriberCountText" in header:
            subscriber_count = combine_runs(header["subscriberCountText"])
        else:
            subscriber_count = "Unknown subscribers"
        description = channel_metadata["description"]
        allowed_regions = channel_metadata["availableCountryCodes"]

        author_banners = []
        if "banner" in header:
            author_banners = header["banner"]["thumbnails"]
            for banner in author_banners:
                banner["url"] = normalise_url_protocol(banner["url"])

        author_thumbnails = []
        if "avatar" in header:
            author_thumbnails = generate_full_author_thumbnails(header["avatar"]["thumbnails"])
        elif "avatar" in channel_metadata:
            author_thumbnails = generate_full_author_thumbnails(channel_metadata["avatar"]["thumbnails"])

        latest_videos = []
        tabs = yt_initial_data["contents"]["twoColumnBrowseResultsRenderer"]["tabs"]
        videos_tab = next(tab["tabRenderer"] for tab in tabs if tab["tabRenderer"]["title"] == "Videos")
        tab_parts = videos_tab["content"]["sectionListRenderer"]["contents"][0]["itemSectionRenderer"]["contents"][0]

        # check that the channel actually has videos - this may be replaced
        # with messageRenderer.text.simpleText == "This channel has no videos."
        if "gridRenderer" in tab_parts:
            for item in tab_parts["gridRenderer"]["items"]:
                latest_videos.append(
                    _grid_video_to_entry(item["gridVideoRenderer"], author, author_id, author_url)
                )

        channel = {
            "author": author,
            "authorId": author_id,
            "authorUrl": author_url,
            "authorBanners": author_banners,
            "authorThumbnails": author_thumbnails,
            # Only the leading token of the text (before the first space) is
            # turned into a number.
            "subCount": uncompress_counter(subscriber_count.split(" ")[0]),
            "second__subCountText": subscriber_count,
            "totalViews": None,
            "joined": None,
            "paid": None,
            "autoGenerated": None,
            "isFamilyFriendly": None,
            "description": description,
            "descriptionHtml": add_html_links(escape_html_textcontent(description)),
            "allowedRegions": allowed_regions,
            "latestVideos": latest_videos,
            "relatedChannels": []
        }

        with channel_cache_lock:
            channel_cache[ucid] = channel

        return channel
|
|
|
|
|
|
|
|
def extract_channel_videos(ucid):
    """Return the ``latestVideos`` list of the channel identified by ``ucid``.

    The channel is obtained through ``extract_channel``. If that result
    contains an ``"error"`` key it is returned unchanged instead of a list.
    """
    channel_data = extract_channel(ucid)
    return channel_data if "error" in channel_data else channel_data["latestVideos"]
|
|
|
|
|
|
|
|
def extract_channel_latest(ucid):
    """Return the videos listed in a channel's YouTube Atom feed.

    A cached list from ``channel_latest_cache`` is returned when one exists
    for ``ucid``. Otherwise the feed is downloaded and parsed, and every entry
    that has a ``published`` element becomes one video dict; entries without
    one are skipped. The resulting list is cached and returned.

    Two situations produce an error dict (with ``error`` and ``identifier``
    keys) instead of a list, and set ``cherrypy.response.status``:

    * the feed request answers 404 -> status 404, ``NOT_FOUND``
    * entries were skipped for a missing publish date and nothing else was
      collected -> status 503, ``PUBLISHED_DATES_NOT_PROVIDED``

    Error dicts are returned before the cache is written, so they are never
    cached.
    """
    with channel_latest_cache_lock:
        if ucid in channel_latest_cache:
            return channel_latest_cache[ucid]

    feed_url = "https://www.youtube.com/feeds/videos.xml?channel_id={}".format(ucid)
    with requests.get(feed_url) as response:
        if response.status_code == 404:
            cherrypy.response.status = 404
            return {
                "error": "Channel does not exist.",
                "identifier": "NOT_FOUND"
            }

        feed = ET.fromstring(response.content)
        author_node = feed.find("{http://www.w3.org/2005/Atom}author")
        author = author_node.find("{http://www.w3.org/2005/Atom}name").text
        author_url = author_node.find("{http://www.w3.org/2005/Atom}uri").text
        channel_id = feed.find("{http://www.youtube.com/xml/schemas/2015}channelId").text

        results = []
        missing_published = False
        for entry in feed.findall("{http://www.w3.org/2005/Atom}entry"):
            video_id = entry.find("{http://www.youtube.com/xml/schemas/2015}videoId").text
            media_group = entry.find("{http://search.yahoo.com/mrss/}group")
            # An empty description element has text None; normalise it to "".
            description = media_group.find("{http://search.yahoo.com/mrss/}description").text or ""
            media_community = media_group.find("{http://search.yahoo.com/mrss/}community")
            published_node = entry.find("{http://www.w3.org/2005/Atom}published")

            if published_node is None:
                # sometimes youtube does not provide published dates, no idea why.
                missing_published = True
                continue

            published = int(dateutil.parser.isoparse(published_node.text).timestamp())
            results.append({
                "type": "video",
                "title": entry.find("{http://www.w3.org/2005/Atom}title").text,
                "videoId": video_id,
                "author": author,
                "authorId": channel_id,
                "authorUrl": author_url,
                "videoThumbnails": generate_video_thumbnails(video_id),
                "description": description,
                "descriptionHtml": add_html_links(escape_html_textcontent(description)),
                "viewCount": int(media_community.find("{http://search.yahoo.com/mrss/}statistics").attrib["views"]),
                "published": published,
                "publishedText": time_to_past_text(published),
                "lengthSeconds": None,
                "liveNow": None,
                "paid": None,
                "premium": None,
                "isUpcoming": None
            })

        # no results due to all missing published
        if missing_published and not results:
            cherrypy.response.status = 503
            return {
                "error": "YouTube did not provide published dates for any feed items. This is usually temporary - refresh in a few minutes.",
                "identifier": "PUBLISHED_DATES_NOT_PROVIDED"
            }

        with channel_latest_cache_lock:
            channel_latest_cache[ucid] = results

        return results
|