NewLeaf/extractors/comments.py

import json
import requests
import urllib.parse
from tools.converters import *
from tools.extractors import extract_yt_initial_data, extract_yt_cfg, eu_consent_cookie

def _comment_from_thread(thread):
	"""Convert one comment thread entry into the comment dict returned by extract_comments.

	thread is one element of the "contents" list of the comment service
	response; only its top-level comment (commentThreadRenderer -> comment ->
	commentRenderer) is read. Like counts and reply counts are not extracted.

	Raises KeyError if the entry does not have the expected structure.
	"""
	renderer = thread["commentThreadRenderer"]["comment"]["commentRenderer"]
	author_endpoint = renderer["authorEndpoint"]["browseEndpoint"]
	# Text fields arrive as a list of "runs"; concatenate them once and reuse.
	content = "".join([x["text"] for x in renderer["contentText"]["runs"]])
	published_text = "".join([x["text"] for x in renderer["publishedTimeText"]["runs"]])
	return {
		"author": renderer["authorText"]["simpleText"],
		"authorThumbnails": list(renderer["authorThumbnail"]["thumbnails"]),
		"authorId": author_endpoint["browseId"],
		"authorUrl": author_endpoint["canonicalBaseUrl"],
		# The edit marker is only exposed as part of the published time text.
		"isEdited": " (edited)" in published_text,
		"content": content,
		"contentHtml": escape_html_textcontent(content),
		"publishedText": published_text,
		"commentId": renderer["commentId"],
		"authorIsChannelOwner": renderer["authorIsChannelOwner"],
	}

def extract_comments(id, **kwargs):
	"""Fetch the first page of comments for a YouTube video.

	Loads the watch page to obtain the comment continuation token, the click
	tracking params and the XSRF token, then posts to comment_service_ajax.

	Parameters:
		id: the video ID, interpolated into the watch URL.
		**kwargs: accepted but not used.

	Returns:
		{"videoId": id, "comments": [...]} on success. If no XSRF_TOKEN is found
		in ytcfg, sets cherrypy.response.status to 500 and returns a dict with
		"error" and "identifier" keys instead.

	Raises:
		requests.HTTPError: if either request returns an error status.
		KeyError / IndexError: if the page or response does not have the
			expected structure.
	"""
	# Use the session as a context manager so its connections are released.
	with requests.session() as s:
		s.headers.update({"accept-language": "en-US,en;q=0.9"})
		s.cookies.set("CONSENT", eu_consent_cookie().get("CONSENT"))
		with s.get("https://www.youtube.com/watch?v={}".format(id)) as r:
			r.raise_for_status()
			# Decode once; both extractors below parse the same page text.
			page = r.content.decode("utf8")

		yt_initial_data = extract_yt_initial_data(page)
		# NOTE(review): the comment section is assumed to be the third entry of the results list — confirm this still holds.
		item = yt_initial_data["contents"]["twoColumnWatchNextResults"]["results"]["results"]["contents"][2]["itemSectionRenderer"]
		next_continuation = item["continuations"][0]["nextContinuationData"]
		continuation = next_continuation["continuation"]
		itct = next_continuation["clickTrackingParams"]

		xsrf_token = extract_yt_cfg(page).get("XSRF_TOKEN", None)
		if not xsrf_token:
			# cherrypy is not imported at module level in this file; import it
			# here so this error path does not fail with a NameError.
			import cherrypy
			cherrypy.response.status = 500
			return {
				"error": "NewLeaf was unable to obtain XSRF_TOKEN from ytcfg.",
				"identifier": "XSRF_TOKEN_NOT_FOUND"
			}

		url = "https://www.youtube.com/comment_service_ajax?action_get_comments=1&pbj=1&ctoken={}&continuation={}&type=next&itct={}".format(continuation, continuation, urllib.parse.quote_plus(itct))
		with s.post(url, headers={"x-youtube-client-name": "1", "x-youtube-client-version": "2.20210422.04.00"}, data={"session_token": xsrf_token}) as rr:
			# Fail on an HTTP error here rather than on a confusing JSON/KeyError below.
			rr.raise_for_status()
			data = json.loads(rr.content.decode("utf8"))

		return {
			"videoId": id,
			"comments": [
				_comment_from_thread(c)
				for c in data["response"]["continuationContents"]["itemSectionContinuation"]["contents"]
			]
		}
Retrieve the first 20 comments of a video on /api/v1/comments/:videoid. Got some inspiration from https://github.com/nlitsme/youtube_tool (for the x-youtube-client-X headers). This is not a complete reimplementation of the Invidious API: continuation is not implemented (so it cannot retrieve more than the first 20 comments, or comment replies), and like and reply counts are also missing. 2021-06-27 02:06:13 +00:00			`import json`
			`import requests`
			`import urllib.parse`
			`from tools.converters import *`
			`from tools.extractors import extract_yt_initial_data, extract_yt_cfg, eu_consent_cookie`

			`def extract_comments(id, **kwargs):`
			`s = requests.session()`
			`s.headers.update({"accept-language": "en-US,en;q=0.9"})`
			`s.cookies.set("CONSENT", eu_consent_cookie().get("CONSENT"))`
			`with s.get("https://www.youtube.com/watch?v={}".format(id)) as r:`
			`r.raise_for_status()`
			`yt_initial_data = extract_yt_initial_data(r.content.decode("utf8"))`
			`item = yt_initial_data["contents"]["twoColumnWatchNextResults"]["results"]["results"]["contents"][2]["itemSectionRenderer"]`
			`continuation = item["continuations"][0]["nextContinuationData"]["continuation"]`
			`itct = item["continuations"][0]["nextContinuationData"]["clickTrackingParams"]`
			`xsrf_token = extract_yt_cfg(r.content.decode("utf8")).get("XSRF_TOKEN", None)`
			`if not xsrf_token:`
			`cherrypy.response.status = 500`
			`return {`
			`"error": "NewLeaf was unable to obtain XSRF_TOKEN from ytcfg.",`
			`"identifier": "XSRF_TOKEN_NOT_FOUND"`
			`}`
			`url = "https://www.youtube.com/comment_service_ajax?action_get_comments=1&pbj=1&ctoken={}&continuation={}&type=next&itct={}".format(continuation, continuation, urllib.parse.quote_plus(itct))`
			`with s.post(url, headers={"x-youtube-client-name": "1", "x-youtube-client-version": "2.20210422.04.00"}, data={"session_token": xsrf_token}) as rr:`
			`data = json.loads(rr.content.decode("utf8"))`
			`return {`
			`"videoId": id,`
			`"comments": [`
			`{`
			`"author": c["commentThreadRenderer"]["comment"]["commentRenderer"]["authorText"]["simpleText"],`
			`"authorThumbnails": [x for x in c["commentThreadRenderer"]["comment"]["commentRenderer"]["authorThumbnail"]["thumbnails"]],`
			`"authorId": c["commentThreadRenderer"]["comment"]["commentRenderer"]["authorEndpoint"]["browseEndpoint"]["browseId"],`
			`"authorUrl": c["commentThreadRenderer"]["comment"]["commentRenderer"]["authorEndpoint"]["browseEndpoint"]["canonicalBaseUrl"],`
			`"isEdited": " (edited)" in "".join([x["text"] for x in c["commentThreadRenderer"]["comment"]["commentRenderer"]["publishedTimeText"]["runs"]]),`
			`"content": "".join([x["text"] for x in c["commentThreadRenderer"]["comment"]["commentRenderer"]["contentText"]["runs"]]),`
			`"contentHtml": escape_html_textcontent("".join([x["text"] for x in c["commentThreadRenderer"]["comment"]["commentRenderer"]["contentText"]["runs"]])),`
			`"publishedText": "".join([x["text"] for x in c["commentThreadRenderer"]["comment"]["commentRenderer"]["publishedTimeText"]["runs"]]),`
			`# "likeCount": int(c["commentThreadRenderer"]["comment"]["commentRenderer"]["voteCount"]["simpleText"].replace(",", ""))`
			`"commentId": c["commentThreadRenderer"]["comment"]["commentRenderer"]["commentId"],`
			`"authorIsChannelOwner": c["commentThreadRenderer"]["comment"]["commentRenderer"]["authorIsChannelOwner"],`
			`# "replies": {`
			`# "replyCount": c["commentThreadRenderer"]["comment"]["commentRenderer"]["replyCount"]`
			`# }`
			`} for c in data["response"]["continuationContents"]["itemSectionContinuation"]["contents"]`
			`]`
			`}`