import xml.etree.ElementTree as ET

import dateutil.parser
import requests
from cachetools import TTLCache

from tools.converters import *
from tools.extractors import extract_yt_initial_data

# Cache extracted channel payloads for 5 minutes so repeated requests for the
# same channel don't re-scrape YouTube.
channel_cache = TTLCache(maxsize=50, ttl=300)


def extract_channel(ucid):
    """Scrape a channel's /videos page and return an Invidious-style channel dict.

    ucid is either a 24-character "UC..." channel ID or a legacy username; the
    URL path segment ("channel" vs "user") is chosen accordingly. The result is
    cached per ucid in channel_cache.

    Raises requests.HTTPError on a non-2xx response, and KeyError/StopIteration
    if YouTube's page structure does not match the expected layout.
    """
    if ucid in channel_cache:
        return channel_cache[ucid]

    channel_type = "channel" if len(ucid) == 24 and ucid[:2] == "UC" else "user"
    with requests.get("https://www.youtube.com/{}/{}/videos".format(channel_type, ucid)) as r:
        r.raise_for_status()
        yt_initial_data = extract_yt_initial_data(r.content.decode("utf8"))

        header = yt_initial_data["header"]["c4TabbedHeaderRenderer"]
        author = header["title"]
        author_id = header["channelId"]
        author_url = header["navigationEndpoint"]["commandMetadata"]["webCommandMetadata"]["url"]

        author_banners = []
        if "banner" in header:
            author_banners = header["banner"]["thumbnails"]
            for t in author_banners:
                # Banner URLs may be protocol-relative; normalise them.
                t["url"] = normalise_url_protocol(t["url"])

        author_thumbnails = []
        if "avatar" in header:
            author_thumbnails = generate_full_author_thumbnails(header["avatar"]["thumbnails"])

        # NOTE(review): channels that hide their subscriber count may omit
        # "subscriberCountText" entirely, which would raise KeyError here —
        # confirm against a hidden-count channel before hardening.
        subscriber_count = combine_runs(header["subscriberCountText"])

        # Hoisted: this renderer was previously traversed twice.
        channel_metadata = yt_initial_data["metadata"]["channelMetadataRenderer"]
        description = channel_metadata["description"]
        allowed_regions = channel_metadata["availableCountryCodes"]

        tabs = yt_initial_data["contents"]["twoColumnBrowseResultsRenderer"]["tabs"]
        videos_tab = next(tab["tabRenderer"] for tab in tabs if tab["tabRenderer"]["title"] == "Videos")
        videos = (
            v["gridVideoRenderer"]
            for v in videos_tab["content"]["sectionListRenderer"]["contents"][0]["itemSectionRenderer"]["contents"][0]["gridRenderer"]["items"]
        )

        latest_videos = []
        for v in videos:
            # Assume a livestream until a non-LIVE time-status overlay proves
            # otherwise (live videos have no parseable duration).
            live = True
            length_text = "LIVE"
            length_seconds = -1
            for o in v["thumbnailOverlays"]:
                if "thumbnailOverlayTimeStatusRenderer" in o:
                    length_text = combine_runs(o["thumbnailOverlayTimeStatusRenderer"]["text"])
                    if o["thumbnailOverlayTimeStatusRenderer"]["style"] != "LIVE":
                        length_seconds = length_text_to_seconds(length_text)
                        live = False

            published = 0
            published_text = "Live now"
            if "publishedTimeText" in v:
                published_text = v["publishedTimeText"]["simpleText"]
                published = past_text_to_time(published_text)

            # Hoisted: used for both "viewCount" and "second__viewCountText".
            view_count_text = combine_runs(v["viewCountText"])

            latest_videos.append({
                "type": "video",
                "title": combine_runs(v["title"]),
                "videoId": v["videoId"],
                "author": author,
                "authorId": author_id,
                "authorUrl": author_url,
                "videoThumbnails": generate_video_thumbnails(v["videoId"]),
                "description": "",
                "descriptionHtml": "",
                "viewCount": view_count_text_to_number(view_count_text),
                "second__viewCountText": view_count_text,
                "second__viewCountTextShort": combine_runs(v["shortViewCountText"]),
                "published": published,
                "publishedText": published_text,
                "lengthSeconds": length_seconds,
                "second__lengthText": length_text,
                "liveNow": live,
                "paid": None,
                "premium": None,
                "isUpcoming": None
            })

        channel = {
            "author": author,
            "authorId": author_id,
            "authorUrl": author_url,
            "authorBanners": author_banners,
            "authorThumbnails": author_thumbnails,
            # e.g. "1.23M subscribers" -> uncompress the leading "1.23M" token.
            "subCount": uncompress_counter(subscriber_count.split(" ")[0]),
            "second__subCountText": subscriber_count,
            "totalViews": None,
            "joined": None,
            "paid": None,
            "autoGenerated": None,
            "isFamilyFriendly": None,
            "description": description,
            "descriptionHtml": add_html_links(escape_html_textcontent(description)),
            "allowedRegions": allowed_regions,
            "latestVideos": latest_videos,
            "relatedChannels": []
        }

        channel_cache[ucid] = channel
        return channel


def extract_channel_videos(ucid):
    """Return only the latestVideos list for a channel, or the error dict
    unchanged if extraction produced one."""
    channel = extract_channel(ucid)
    if "error" in channel:
        return channel
    else:
        return channel["latestVideos"]


def extract_channel_latest(ucid):
    """Fetch a channel's Atom feed and return its entries as video dicts.

    Uses the lightweight feeds/videos.xml endpoint rather than scraping HTML,
    so it only covers the ~15 most recent uploads and carries no duration or
    live-status information (those fields are returned as None).

    Raises requests.HTTPError on a non-2xx response.
    """
    with requests.get("https://www.youtube.com/feeds/videos.xml?channel_id={}".format(ucid)) as r:
        r.raise_for_status()
        feed = ET.fromstring(r.content)

        author_container = feed.find("{http://www.w3.org/2005/Atom}author")
        author = author_container.find("{http://www.w3.org/2005/Atom}name").text
        author_url = author_container.find("{http://www.w3.org/2005/Atom}uri").text
        channel_id = feed.find("{http://www.youtube.com/xml/schemas/2015}channelId").text

        results = []
        for entry in feed.findall("{http://www.w3.org/2005/Atom}entry"):
            # Renamed from "id": shadowed the builtin.
            video_id = entry.find("{http://www.youtube.com/xml/schemas/2015}videoId").text
            media_group = entry.find("{http://search.yahoo.com/mrss/}group")
            description = media_group.find("{http://search.yahoo.com/mrss/}description").text
            media_community = media_group.find("{http://search.yahoo.com/mrss/}community")
            published = int(dateutil.parser.isoparse(entry.find("{http://www.w3.org/2005/Atom}published").text).timestamp())
            results.append({
                "type": "video",
                "title": entry.find("{http://www.w3.org/2005/Atom}title").text,
                "videoId": video_id,
                "author": author,
                "authorId": channel_id,
                "authorUrl": author_url,
                "videoThumbnails": generate_video_thumbnails(video_id),
                "description": description,
                "descriptionHtml": add_html_links(escape_html_textcontent(description)),
                "viewCount": int(media_community.find("{http://search.yahoo.com/mrss/}statistics").attrib["views"]),
                "published": published,
                "publishedText": time_to_past_text(published),
                "lengthSeconds": None,
                "liveNow": None,
                "paid": None,
                "premium": None,
                "isUpcoming": None
            })
        return results