Retrieve the first 20 comments of a video on /api/v1/comments/:videoid

Got some inspiration from https://github.com/nlitsme/youtube_tool (for the x-youtube-client-X headers). This is not a complete reimplementation of Invidious API as continuation is not implemented (to retrieve more than the first 20 comments and comments replies), likes and replies count are also missing.
2026-05-01 15:21:35 +00:00 · 2021-06-27 04:06:13 +02:00 · 2021-06-27 04:06:13 +02:00 · 3f57d50893
commit 3f57d50893
parent 1ea86101fd
3 changed files with 62 additions and 1 deletions
--- a/extractors/comments.py
+++ b/extractors/comments.py
@ -0,0 +1,47 @@
+import json
+import requests
+import urllib.parse
+from tools.converters import *
+from tools.extractors import extract_yt_initial_data, extract_yt_cfg, eu_consent_cookie
+
+def extract_comments(id, **kwargs):
+	s = requests.session()
+	s.headers.update({"accept-language": "en-US,en;q=0.9"})
+	s.cookies.set("CONSENT", eu_consent_cookie().get("CONSENT"))
+	with s.get("https://www.youtube.com/watch?v={}".format(id)) as r:
+		r.raise_for_status()
+		yt_initial_data = extract_yt_initial_data(r.content.decode("utf8"))
+		item = yt_initial_data["contents"]["twoColumnWatchNextResults"]["results"]["results"]["contents"][2]["itemSectionRenderer"]
+		continuation = item["continuations"][0]["nextContinuationData"]["continuation"]
+		itct = item["continuations"][0]["nextContinuationData"]["clickTrackingParams"]
+		xsrf_token = extract_yt_cfg(r.content.decode("utf8")).get("XSRF_TOKEN", None)
+		if not xsrf_token:
+			cherrypy.response.status = 500
+			return {
+				"error": "NewLeaf was unable to obtain XSRF_TOKEN from ytcfg.",
+				"identifier": "XSRF_TOKEN_NOT_FOUND"
+			}
+		url = "https://www.youtube.com/comment_service_ajax?action_get_comments=1&pbj=1&ctoken={}&continuation={}&type=next&itct={}".format(continuation, continuation, urllib.parse.quote_plus(itct))
+		with s.post(url, headers={"x-youtube-client-name": "1", "x-youtube-client-version": "2.20210422.04.00"}, data={"session_token": xsrf_token}) as rr:
+			data = json.loads(rr.content.decode("utf8"))
+			return {
+				"videoId": id,
+				"comments": [
+					{
+						"author": c["commentThreadRenderer"]["comment"]["commentRenderer"]["authorText"]["simpleText"],
+						"authorThumbnails": [x for x in c["commentThreadRenderer"]["comment"]["commentRenderer"]["authorThumbnail"]["thumbnails"]],
+						"authorId": c["commentThreadRenderer"]["comment"]["commentRenderer"]["authorEndpoint"]["browseEndpoint"]["browseId"],
+						"authorUrl": c["commentThreadRenderer"]["comment"]["commentRenderer"]["authorEndpoint"]["browseEndpoint"]["canonicalBaseUrl"],
+						"isEdited": " (edited)" in "".join([x["text"] for x in c["commentThreadRenderer"]["comment"]["commentRenderer"]["publishedTimeText"]["runs"]]),
+						"content": "".join([x["text"] for x in c["commentThreadRenderer"]["comment"]["commentRenderer"]["contentText"]["runs"]]),
+						"contentHtml": escape_html_textcontent("".join([x["text"] for x in c["commentThreadRenderer"]["comment"]["commentRenderer"]["contentText"]["runs"]])),
+						"publishedText": "".join([x["text"] for x in c["commentThreadRenderer"]["comment"]["commentRenderer"]["publishedTimeText"]["runs"]]),
+						# "likeCount": int(c["commentThreadRenderer"]["comment"]["commentRenderer"]["voteCount"]["simpleText"].replace(",", ""))
+						"commentId": c["commentThreadRenderer"]["comment"]["commentRenderer"]["commentId"],
+						"authorIsChannelOwner": c["commentThreadRenderer"]["comment"]["commentRenderer"]["authorIsChannelOwner"],
+						# "replies": {
+						# 	"replyCount": c["commentThreadRenderer"]["comment"]["commentRenderer"]["replyCount"]
+						# }
+					} for c in data["response"]["continuationContents"]["itemSectionContinuation"]["contents"]
+				]
+			}