NewLeaf/extractors/channel.py

import dateutil.parser
import requests
import xml.etree.ElementTree as ET
from tools.converters import *
from tools.extractors import extract_yt_initial_data
from cachetools import TTLCache

channel_cache = TTLCache(maxsize=50, ttl=300)

def extract_channel(ucid):
	if ucid in channel_cache:
		return channel_cache[ucid]

	channel_type = "channel" if len(ucid) == 24 and ucid[:2] == "UC" else "user"
	with requests.get("https://www.youtube.com/{}/{}/videos".format(channel_type, ucid)) as r:
		r.raise_for_status()
		yt_initial_data = extract_yt_initial_data(r.content.decode("utf8"))
		header = yt_initial_data["header"]["c4TabbedHeaderRenderer"]
		author = header["title"]
		author_id = header["channelId"]
		author_url = header["navigationEndpoint"]["commandMetadata"]["webCommandMetadata"]["url"]
		author_banners = header["banner"]["thumbnails"]
		for t in author_banners:
			t["url"] = normalise_url_protocol(t["url"])
		author_thumbnails = generate_full_author_thumbnails(header["avatar"]["thumbnails"])
		subscriber_count = combine_runs(header["subscriberCountText"])
		description = yt_initial_data["metadata"]["channelMetadataRenderer"]["description"]
		allowed_regions = yt_initial_data["metadata"]["channelMetadataRenderer"]["availableCountryCodes"]
		tabs = yt_initial_data["contents"]["twoColumnBrowseResultsRenderer"]["tabs"]
		videos_tab = next(tab["tabRenderer"] for tab in tabs if tab["tabRenderer"]["title"] == "Videos")
		videos = (
			v["gridVideoRenderer"] for v in
			videos_tab["content"]["sectionListRenderer"]["contents"][0]["itemSectionRenderer"]["contents"][0]["gridRenderer"]["items"]
		)
		latest_videos = []
		for v in videos:
			live = True
			length_text = "LIVE"
			length_seconds = -1
			for o in v["thumbnailOverlays"]:
				if "thumbnailOverlayTimeStatusRenderer" in o:
					length_text = combine_runs(o["thumbnailOverlayTimeStatusRenderer"]["text"])
					if o["thumbnailOverlayTimeStatusRenderer"]["style"] != "LIVE":
						length_seconds = length_text_to_seconds(length_text)
						live = False
			published = 0
			published_text = "Live now"
			if "publishedTimeText" in v:
				published_text = v["publishedTimeText"]["simpleText"]
				published = past_text_to_time(published_text)
			latest_videos.append({
				"type": "video",
				"title": combine_runs(v["title"]),
				"videoId": v["videoId"],
				"author": author,
				"authorId": author_id,
				"authorUrl": author_url,
				"videoThumbnails": generate_video_thumbnails(v["videoId"]),
				"description": "",
				"descriptionHtml": "",
				"viewCount": view_count_text_to_number(combine_runs(v["viewCountText"])),
				"second__viewCountText": combine_runs(v["viewCountText"]),
				"second__viewCountTextShort": combine_runs(v["shortViewCountText"]),
				"published": published,
				"publishedText": published_text,
				"lengthSeconds": length_seconds,
				"second__lengthText": length_text,
				"liveNow": live,
				"paid": None,
				"premium": None,
				"isUpcoming": None
			})

		channel = {
			"author": author,
			"authorId": author_id,
			"authorUrl": author_url,
			"authorBanners": author_banners,
			"authorThumbnails": author_thumbnails,
			"subCount": uncompress_counter(subscriber_count.split(" ")[0]),
			"second__subCountText": subscriber_count,
			"totalViews": None,
			"joined": None,
			"paid": None,
			"autoGenerated": None,
			"isFamilyFriendly": None,
			"description": description,
			"descriptionHtml": add_html_links(escape_html_textcontent(description)),
			"allowedRegions": allowed_regions,
			"latestVideos": latest_videos,
			"relatedChannels": []
		}

		channel_cache[ucid] = channel

		return channel

def extract_channel_videos(ucid):
	channel = extract_channel(ucid)
	if "error" in channel:
		return channel
	else:
		return channel["latestVideos"]

def extract_channel_latest(ucid):
	with requests.get("https://www.youtube.com/feeds/videos.xml?channel_id={}".format(ucid)) as r:
		r.raise_for_status()
		feed = ET.fromstring(r.content)
		author_container = feed.find("{http://www.w3.org/2005/Atom}author")
		author = author_container.find("{http://www.w3.org/2005/Atom}name").text
		author_url = author_container.find("{http://www.w3.org/2005/Atom}uri").text
		channel_id = feed.find("{http://www.youtube.com/xml/schemas/2015}channelId").text
		results = []
		for entry in feed.findall("{http://www.w3.org/2005/Atom}entry"):
			id = entry.find("{http://www.youtube.com/xml/schemas/2015}videoId").text
			media_group = entry.find("{http://search.yahoo.com/mrss/}group")
			description = media_group.find("{http://search.yahoo.com/mrss/}description").text
			media_community = media_group.find("{http://search.yahoo.com/mrss/}community")
			results.append({
				"type": "video",
				"title": entry.find("{http://www.w3.org/2005/Atom}title").text,
				"videoId": id,
				"author": author,
				"authorId": channel_id,
				"authorUrl": author_url,
				"videoThumbnails": generate_video_thumbnails(id),
				"description": description,
				"descriptionHtml": add_html_links(escape_html_textcontent(description)),
				"viewCount": int(media_community.find("{http://search.yahoo.com/mrss/}statistics").attrib["views"]),
				"published": int(dateutil.parser.isoparse(entry.find("{http://www.w3.org/2005/Atom}published").text).timestamp()),
				"lengthSeconds": None,
				"liveNow": None,
				"paid": None,
				"premium": None,
				"isUpcoming": None
			})
		return results
Refactor everything to separate files 2020-08-13 14:20:11 +00:00			`import dateutil.parser`
			`import requests`
			`import xml.etree.ElementTree as ET`
			`from tools.converters import *`
			`from tools.extractors import extract_yt_initial_data`
			`from cachetools import TTLCache`

			`channel_cache = TTLCache(maxsize=50, ttl=300)`

			`def extract_channel(ucid):`
			`if ucid in channel_cache:`
			`return channel_cache[ucid]`

			`channel_type = "channel" if len(ucid) == 24 and ucid[:2] == "UC" else "user"`
			`with requests.get("https://www.youtube.com/{}/{}/videos".format(channel_type, ucid)) as r:`
			`r.raise_for_status()`
			`yt_initial_data = extract_yt_initial_data(r.content.decode("utf8"))`
			`header = yt_initial_data["header"]["c4TabbedHeaderRenderer"]`
			`author = header["title"]`
			`author_id = header["channelId"]`
			`author_url = header["navigationEndpoint"]["commandMetadata"]["webCommandMetadata"]["url"]`
			`author_banners = header["banner"]["thumbnails"]`
			`for t in author_banners:`
			`t["url"] = normalise_url_protocol(t["url"])`
			`author_thumbnails = generate_full_author_thumbnails(header["avatar"]["thumbnails"])`
			`subscriber_count = combine_runs(header["subscriberCountText"])`
			`description = yt_initial_data["metadata"]["channelMetadataRenderer"]["description"]`
			`allowed_regions = yt_initial_data["metadata"]["channelMetadataRenderer"]["availableCountryCodes"]`
			`tabs = yt_initial_data["contents"]["twoColumnBrowseResultsRenderer"]["tabs"]`
			`videos_tab = next(tab["tabRenderer"] for tab in tabs if tab["tabRenderer"]["title"] == "Videos")`
			`videos = (`
			`v["gridVideoRenderer"] for v in`
			`videos_tab["content"]["sectionListRenderer"]["contents"][0]["itemSectionRenderer"]["contents"][0]["gridRenderer"]["items"]`
			`)`
			`latest_videos = []`
			`for v in videos:`
			`live = True`
			`length_text = "LIVE"`
			`length_seconds = -1`
			`for o in v["thumbnailOverlays"]:`
			`if "thumbnailOverlayTimeStatusRenderer" in o:`
			`length_text = combine_runs(o["thumbnailOverlayTimeStatusRenderer"]["text"])`
			`if o["thumbnailOverlayTimeStatusRenderer"]["style"] != "LIVE":`
			`length_seconds = length_text_to_seconds(length_text)`
			`live = False`
			`published = 0`
			`published_text = "Live now"`
			`if "publishedTimeText" in v:`
			`published_text = v["publishedTimeText"]["simpleText"]`
			`published = past_text_to_time(published_text)`
			`latest_videos.append({`
			`"type": "video",`
			`"title": combine_runs(v["title"]),`
			`"videoId": v["videoId"],`
			`"author": author,`
			`"authorId": author_id,`
			`"authorUrl": author_url,`
			`"videoThumbnails": generate_video_thumbnails(v["videoId"]),`
			`"description": "",`
			`"descriptionHtml": "",`
			`"viewCount": view_count_text_to_number(combine_runs(v["viewCountText"])),`
			`"second__viewCountText": combine_runs(v["viewCountText"]),`
			`"second__viewCountTextShort": combine_runs(v["shortViewCountText"]),`
			`"published": published,`
			`"publishedText": published_text,`
			`"lengthSeconds": length_seconds,`
			`"second__lengthText": length_text,`
			`"liveNow": live,`
			`"paid": None,`
			`"premium": None,`
			`"isUpcoming": None`
			`})`

			`channel = {`
			`"author": author,`
			`"authorId": author_id,`
			`"authorUrl": author_url,`
			`"authorBanners": author_banners,`
			`"authorThumbnails": author_thumbnails,`
			`"subCount": uncompress_counter(subscriber_count.split(" ")[0]),`
			`"second__subCountText": subscriber_count,`
			`"totalViews": None,`
			`"joined": None,`
			`"paid": None,`
			`"autoGenerated": None,`
			`"isFamilyFriendly": None,`
			`"description": description,`
			`"descriptionHtml": add_html_links(escape_html_textcontent(description)),`
			`"allowedRegions": allowed_regions,`
			`"latestVideos": latest_videos,`
			`"relatedChannels": []`
			`}`

			`channel_cache[ucid] = channel`

			`return channel`

			`def extract_channel_videos(ucid):`
			`channel = extract_channel(ucid)`
			`if "error" in channel:`
			`return channel`
			`else:`
			`return channel["latestVideos"]`

			`def extract_channel_latest(ucid):`
			`with requests.get("https://www.youtube.com/feeds/videos.xml?channel_id={}".format(ucid)) as r:`
			`r.raise_for_status()`
			`feed = ET.fromstring(r.content)`
			`author_container = feed.find("{http://www.w3.org/2005/Atom}author")`
			`author = author_container.find("{http://www.w3.org/2005/Atom}name").text`
			`author_url = author_container.find("{http://www.w3.org/2005/Atom}uri").text`
			`channel_id = feed.find("{http://www.youtube.com/xml/schemas/2015}channelId").text`
			`results = []`
			`for entry in feed.findall("{http://www.w3.org/2005/Atom}entry"):`
			`id = entry.find("{http://www.youtube.com/xml/schemas/2015}videoId").text`
			`media_group = entry.find("{http://search.yahoo.com/mrss/}group")`
			`description = media_group.find("{http://search.yahoo.com/mrss/}description").text`
			`media_community = media_group.find("{http://search.yahoo.com/mrss/}community")`
			`results.append({`
			`"type": "video",`
			`"title": entry.find("{http://www.w3.org/2005/Atom}title").text,`
			`"videoId": id,`
			`"author": author,`
			`"authorId": channel_id,`
			`"authorUrl": author_url,`
			`"videoThumbnails": generate_video_thumbnails(id),`
			`"description": description,`
			`"descriptionHtml": add_html_links(escape_html_textcontent(description)),`
			`"viewCount": int(media_community.find("{http://search.yahoo.com/mrss/}statistics").attrib["views"]),`
			`"published": int(dateutil.parser.isoparse(entry.find("{http://www.w3.org/2005/Atom}published").text).timestamp()),`
			`"lengthSeconds": None,`
			`"liveNow": None,`
			`"paid": None,`
			`"premium": None,`
			`"isUpcoming": None`
			`})`
			`return results`