1
0
mirror of https://git.sr.ht/~cadence/NewLeaf synced 2024-11-14 20:17:29 +00:00
NewLeaf/extractors/search.py
Lomanic 5f47e1a71b
Fix extracting with cookie consent page in EU
Fix #27 use maintained yt-dlp lib instead of youtube-dlc

Because of the changes on YouTube's side described in the issue below, we
have to switch to a maintained library:
https://github.com/ytdl-org/youtube-dl/issues/28604
yt-dlp is not fixed yet as of today, but youtube-dl is fixed in master, and
since yt-dlp is quick to merge upstream changes into its own repo, we can
hope the issue will be fixed there soon as well.

For requests sent by us directly, we include the cookies.

Ref https://github.com/ytdl-org/youtube-dl/issues/28604
2021-04-03 15:09:58 +13:00

85 lines
3.2 KiB
Python

import requests
import traceback
import yt_dlp
from tools.converters import *
from tools.extractors import extract_yt_initial_data
from cachetools import TTLCache
# Scraped search results keyed by query string: at most 50 entries, each kept
# for ttl=300 (seconds with cachetools' default timer).
search_cache = TTLCache(ttl=300, maxsize=50)

# Options for the shared yt-dlp instance used as the search fallback.
# NOTE(review): "playlist_items" presumably caps the fallback at the first
# 100 entries - confirm against the yt-dlp option documentation.
ytdl_opts = dict(
    quiet=True,
    dump_single_json=True,
    playlist_items="1-100",
    extract_flat="in_playlist",
)
ytdl = yt_dlp.YoutubeDL(ytdl_opts)
# Seconds to wait on the YouTube results page before giving up and using the
# yt-dlp fallback; without a timeout, requests may block indefinitely.
_SEARCH_REQUEST_TIMEOUT = 10


def _is_video_section(section):
    """Return True for an item section whose first entry is not a carousel ad."""
    if "itemSectionRenderer" not in section:
        return False
    contents = section["itemSectionRenderer"]["contents"]
    return not (len(contents) >= 1 and "carouselAdRenderer" in contents[0])


def _video_renderer_to_result(video):
    """Convert one "videoRenderer" object from the results page into a result dict."""
    # Without "publishedTimeText" the entry keeps the "Live now" label and a
    # publish time of 0.
    published = 0
    published_text = "Live now"
    if "publishedTimeText" in video:
        published_text = video["publishedTimeText"]["simpleText"]
        published = past_text_to_time(published_text)

    # The first byline run carries the link to the uploader's channel.
    author_endpoint = video["longBylineText"]["runs"][0]["navigationEndpoint"]
    has_description = "descriptionSnippet" in video

    return {
        "type": "video",
        "title": combine_runs(video["title"]),
        "videoId": video["videoId"],
        "author": combine_runs(video["longBylineText"]),
        "authorId": author_endpoint["browseEndpoint"]["browseId"],
        "authorUrl": author_endpoint["commandMetadata"]["webCommandMetadata"]["url"],
        "videoThumbnails": generate_video_thumbnails(video["videoId"]),
        "description": combine_runs(video["descriptionSnippet"]) if has_description else "",
        "descriptionHtml": combine_runs_html(video["descriptionSnippet"]) if has_description else "",
        "viewCount": get_view_count_or_recommended(video),
        "second__viewCountText": get_view_count_text_or_recommended(video),
        "published": published,
        "publishedText": published_text,
        "lengthSeconds": get_length_or_live_now(video),
        "second__lengthText": get_length_text_or_live_now(video),
        "liveNow": is_live(video),
        "paid": None,
        "premium": None,
        "isUpcoming": None
    }


def _scrape_search_results(q):
    """Fetch the YouTube results page for *q* and return its videos as result dicts."""
    with requests.get(
        "https://www.youtube.com/results",
        params={"q": q, "hl": "en"},
        # Pre-accepted consent cookie so the EU consent page is not served.
        cookies={"CONSENT": "YES+cb.20210328-17-p0.en+FX+101"},
        timeout=_SEARCH_REQUEST_TIMEOUT,
    ) as r:
        r.raise_for_status()
        content = r.content.decode("utf8")
    yt_initial_data = extract_yt_initial_data(content)

    sections = yt_initial_data["contents"]["twoColumnSearchResultsRenderer"]["primaryContents"]["sectionListRenderer"]["contents"]
    # find the section with the videos, not the one with the ads
    section = next(s for s in sections if _is_video_section(s))

    return [
        _video_renderer_to_result(item["videoRenderer"])
        for item in section["itemSectionRenderer"]["contents"]
        if "videoRenderer" in item
    ]


def _ytdl_search_results(q):
    """Search for *q* through yt-dlp and return the sparse result dicts it can fill in."""
    info = ytdl.extract_info("ytsearchall:{}".format(q), download=False)
    return [{
        "type": "video",
        "title": video["title"],
        "videoId": video["id"],
        "author": None,
        "authorId": None,
        "authorUrl": None,
        "videoThumbnails": generate_video_thumbnails(video["id"]),
        "description": None,
        "descriptionHtml": None,
        "viewCount": None,
        "published": None,
        "publishedText": None,
        "lengthSeconds": None,
        "liveNow": None,
        "paid": None,
        "premium": None,
        "isUpcoming": None
    } for video in info["entries"] if "title" in video]


def extract_search(q):
    """Return video search results for the query string *q*.

    A result stored in ``search_cache`` for the same query is returned
    directly. Otherwise the YouTube results page is scraped and, on success,
    the result list is cached. If any step of the scrape raises, the traceback
    is printed and the search is retried through yt-dlp, which only supplies
    the title, video id and thumbnails (every other field is None). Fallback
    results are not cached.

    Args:
        q: The search query.

    Returns:
        A list of dicts with "type": "video", one per result. The cached list
        object itself is returned, so callers should not mutate it.

    Raises:
        Whatever the yt-dlp fallback raises if it fails as well.
    """
    # EAFP lookup: an entry can expire between a membership test and the read.
    try:
        return search_cache[q]
    except KeyError:
        pass

    try:
        results = _scrape_search_results(q)
        search_cache[q] = results  # only cache full extraction
        return results
    except Exception:
        # Deliberately broad: any failure while scraping (network error, page
        # layout change, missing key) falls back to the slower yt-dlp search.
        print("messed up extracting search, using youtube-dl instead")
        traceback.print_exc()
        return _ytdl_search_results(q)