Refactor everything to separate files

2026-05-25 03:32:27 +00:00 · 2020-08-14 02:20:11 +12:00 · 2020-08-14 02:20:11 +12:00 · 40759efb03
commit 40759efb03
parent a271db0632
8 changed files with 742 additions and 629 deletions
--- a/extractors/channel.py
+++ b/extractors/channel.py
@ -0,0 +1,137 @@
+import dateutil.parser
+import requests
+import xml.etree.ElementTree as ET
+from tools.converters import *
+from tools.extractors import extract_yt_initial_data
+from cachetools import TTLCache
+
+# Cache of parsed channel dicts, keyed by the ucid that extract_channel() was
+# called with. Holds at most 50 entries; ttl=300 is in seconds when cachetools'
+# default timer is used.
+channel_cache = TTLCache(maxsize=50, ttl=300)
+
+def extract_channel(ucid):
+	if ucid in channel_cache:
+		return channel_cache[ucid]
+
+	channel_type = "channel" if len(ucid) == 24 and ucid[:2] == "UC" else "user"
+	with requests.get("https://www.youtube.com/{}/{}/videos".format(channel_type, ucid)) as r:
+		r.raise_for_status()
+		yt_initial_data = extract_yt_initial_data(r.content.decode("utf8"))
+		header = yt_initial_data["header"]["c4TabbedHeaderRenderer"]
+		author = header["title"]
+		author_id = header["channelId"]
+		author_url = header["navigationEndpoint"]["commandMetadata"]["webCommandMetadata"]["url"]
+		author_banners = header["banner"]["thumbnails"]
+		for t in author_banners:
+			t["url"] = normalise_url_protocol(t["url"])
+		author_thumbnails = generate_full_author_thumbnails(header["avatar"]["thumbnails"])
+		subscriber_count = combine_runs(header["subscriberCountText"])
+		description = yt_initial_data["metadata"]["channelMetadataRenderer"]["description"]
+		allowed_regions = yt_initial_data["metadata"]["channelMetadataRenderer"]["availableCountryCodes"]
+		tabs = yt_initial_data["contents"]["twoColumnBrowseResultsRenderer"]["tabs"]
+		videos_tab = next(tab["tabRenderer"] for tab in tabs if tab["tabRenderer"]["title"] == "Videos")
+		videos = (
+			v["gridVideoRenderer"] for v in
+			videos_tab["content"]["sectionListRenderer"]["contents"][0]["itemSectionRenderer"]["contents"][0]["gridRenderer"]["items"]
+		)
+		latest_videos = []
+		for v in videos:
+			live = True
+			length_text = "LIVE"
+			length_seconds = -1
+			for o in v["thumbnailOverlays"]:
+				if "thumbnailOverlayTimeStatusRenderer" in o:
+					length_text = combine_runs(o["thumbnailOverlayTimeStatusRenderer"]["text"])
+					if o["thumbnailOverlayTimeStatusRenderer"]["style"] != "LIVE":
+						length_seconds = length_text_to_seconds(length_text)
+						live = False
+			published = 0
+			published_text = "Live now"
+			if "publishedTimeText" in v:
+				published_text = v["publishedTimeText"]["simpleText"]
+				published = past_text_to_time(published_text)
+			latest_videos.append({
+				"type": "video",
+				"title": combine_runs(v["title"]),
+				"videoId": v["videoId"],
+				"author": author,
+				"authorId": author_id,
+				"authorUrl": author_url,
+				"videoThumbnails": generate_video_thumbnails(v["videoId"]),
+				"description": "",
+				"descriptionHtml": "",
+				"viewCount": view_count_text_to_number(combine_runs(v["viewCountText"])),
+				"second__viewCountText": combine_runs(v["viewCountText"]),
+				"second__viewCountTextShort": combine_runs(v["shortViewCountText"]),
+				"published": published,
+				"publishedText": published_text,
+				"lengthSeconds": length_seconds,
+				"second__lengthText": length_text,
+				"liveNow": live,
+				"paid": None,
+				"premium": None,
+				"isUpcoming": None
+			})
+
+		channel = {
+			"author": author,
+			"authorId": author_id,
+			"authorUrl": author_url,
+			"authorBanners": author_banners,
+			"authorThumbnails": author_thumbnails,
+			"subCount": uncompress_counter(subscriber_count.split(" ")[0]),
+			"second__subCountText": subscriber_count,
+			"totalViews": None,
+			"joined": None,
+			"paid": None,
+			"autoGenerated": None,
+			"isFamilyFriendly": None,
+			"description": description,
+			"descriptionHtml": add_html_links(escape_html_textcontent(description)),
+			"allowedRegions": allowed_regions,
+			"latestVideos": latest_videos,
+			"relatedChannels": []
+		}
+
+		channel_cache[ucid] = channel
+
+		return channel
+
+def extract_channel_videos(ucid):
+	channel = extract_channel(ucid)
+	if "error" in channel:
+		return channel
+	else:
+		return channel["latestVideos"]
+
+def extract_channel_latest(ucid):
+	with requests.get("https://www.youtube.com/feeds/videos.xml?channel_id={}".format(ucid)) as r:
+		r.raise_for_status()
+		feed = ET.fromstring(r.content)
+		author_container = feed.find("{http://www.w3.org/2005/Atom}author")
+		author = author_container.find("{http://www.w3.org/2005/Atom}name").text
+		author_url = author_container.find("{http://www.w3.org/2005/Atom}uri").text
+		channel_id = feed.find("{http://www.youtube.com/xml/schemas/2015}channelId").text
+		results = []
+		for entry in feed.findall("{http://www.w3.org/2005/Atom}entry"):
+			id = entry.find("{http://www.youtube.com/xml/schemas/2015}videoId").text
+			media_group = entry.find("{http://search.yahoo.com/mrss/}group")
+			description = media_group.find("{http://search.yahoo.com/mrss/}description").text
+			media_community = media_group.find("{http://search.yahoo.com/mrss/}community")
+			results.append({
+				"type": "video",
+				"title": entry.find("{http://www.w3.org/2005/Atom}title").text,
+				"videoId": id,
+				"author": author,
+				"authorId": channel_id,
+				"authorUrl": author_url,
+				"videoThumbnails": generate_video_thumbnails(id),
+				"description": description,
+				"descriptionHtml": add_html_links(escape_html_textcontent(description)),
+				"viewCount": int(media_community.find("{http://search.yahoo.com/mrss/}statistics").attrib["views"]),
+				"published": int(dateutil.parser.isoparse(entry.find("{http://www.w3.org/2005/Atom}published").text).timestamp()),
+				"lengthSeconds": None,
+				"liveNow": None,
+				"paid": None,
+				"premium": None,
+				"isUpcoming": None
+			})
+		return results