import requests
import traceback
import yt_dlp
from tools.converters import *
from tools.extractors import extract_yt_initial_data, eu_consent_cookie
from cachetools import TTLCache
# Cache of successful search extractions, keyed by query string.
# Entries expire after 5 minutes so results stay reasonably fresh.
search_cache = TTLCache(maxsize=50, ttl=300)

# yt-dlp options for the fallback extractor: no console output, metadata
# only (single JSON dump, flat extraction — no per-video page fetches),
# capped at the first 100 entries of a search/playlist.
ytdl_opts = {
    "quiet": True,
    "dump_single_json": True,
    "playlist_items": "1-100",
    "extract_flat": "in_playlist"
}
# Shared yt-dlp instance used by extract_search's fallback path.
ytdl = yt_dlp.YoutubeDL(ytdl_opts)
def extract_search(q):
    """Search YouTube for `q` and return a list of Invidious-style video dicts.

    Primary path: fetch the public YouTube results page and parse the
    embedded ytInitialData JSON; a successful extraction is stored in
    `search_cache`. Fallback path (taken on ANY failure, e.g. a YouTube
    page-layout change): a flat yt-dlp search, which yields much sparser
    metadata — most fields in those result dicts are None.
    """
    try:
        # hl=en forces English UI strings (parsed below, e.g. publish
        # times); the EU consent cookie bypasses the consent interstitial.
        with requests.get("https://www.youtube.com/results", params={"q": q, "hl": "en"}, cookies=eu_consent_cookie()) as r:
            r.raise_for_status()
            content = r.content.decode("utf8")
            yt_initial_data = extract_yt_initial_data(content)
            sections = yt_initial_data["contents"]["twoColumnSearchResultsRenderer"]["primaryContents"]["sectionListRenderer"]["contents"]
            # youtube searches contain a lot of random stuff, just grab it
            # all for now, then filter to `videoRenderer` later
            itemSections = [s for s in sections if "itemSectionRenderer" in s]
            items = []
            for section in itemSections:
                items += section["itemSectionRenderer"]["contents"]

            results = []
            for item in items:
                if "videoRenderer" in item:
                    video = item["videoRenderer"]
                    # Live broadcasts have no publishedTimeText; represent
                    # them as published=0 with the text "Live now".
                    published = 0
                    published_text = "Live now"
                    if "publishedTimeText" in video:
                        published_text = video["publishedTimeText"]["simpleText"]
                        published = past_text_to_time(published_text)
                    results.append({
                        "type": "video",
                        "title": combine_runs(video["title"]),
                        "videoId": video["videoId"],
                        "author": combine_runs(video["longBylineText"]),
                        "authorId": video["longBylineText"]["runs"][0]["navigationEndpoint"]["browseEndpoint"]["browseId"],
                        "authorUrl": video["longBylineText"]["runs"][0]["navigationEndpoint"]["commandMetadata"]["webCommandMetadata"]["url"],
                        "videoThumbnails": generate_video_thumbnails(video["videoId"]),
                        "description": combine_runs(video["descriptionSnippet"]) if "descriptionSnippet" in video else "",
                        "descriptionHtml": combine_runs_html(video["descriptionSnippet"]) if "descriptionSnippet" in video else "",
                        "viewCount": get_view_count_or_recommended(video),
                        "second__viewCountText": get_view_count_text_or_recommended(video),
                        "published": published,
                        "publishedText": published_text,
                        "lengthSeconds": get_length_or_live_now(video),
                        "second__lengthText": get_length_text_or_live_now(video),
                        "liveNow": is_live(video),
                        # Not derivable from the scraped page data.
                        "paid": None,
                        "premium": None,
                        "isUpcoming": None
                    })

            search_cache[q] = results # only cache full extraction
            return results

    except Exception:
        print("messed up extracting search, using youtube-dl instead")
        traceback.print_exc()

        # Fallback: flat yt-dlp search. Entries without a title (e.g.
        # deleted/private videos) are skipped.
        info = ytdl.extract_info("ytsearchall:{}".format(q), download=False)
        return [{
            "type": "video",
            "title": video["title"],
            "videoId": video["id"],
            # Flat extraction provides no author/description/stats metadata.
            "author": None,
            "authorId": None,
            "authorUrl": None,
            "videoThumbnails": generate_video_thumbnails(video["id"]),
            "description": None,
            "descriptionHtml": None,
            "viewCount": None,
            "published": None,
            "publishedText": None,
            "lengthSeconds": None,
            "liveNow": None,
            "paid": None,
            "premium": None,
            "isUpcoming": None
        } for video in info["entries"] if "title" in video]