1
0
Fork 0
mirror of https://git.sr.ht/~cadence/NewLeaf synced 2026-03-07 13:01:37 +00:00

Retrieve the first 20 comments of a video on /api/v1/comments/:videoid

Got some inspiration from https://github.com/nlitsme/youtube_tool (for the x-youtube-client-X
headers).
This is not a complete reimplementation of the Invidious API: continuation is not implemented
(so only the first 20 comments can be retrieved, and no comment replies), and like and reply
counts are also missing.
This commit is contained in:
Lomanic 2021-06-27 04:06:13 +02:00 committed by Cadence Ember
parent 1ea86101fd
commit 3f57d50893
No known key found for this signature in database
GPG key ID: BC1C2C61CF521B17
3 changed files with 62 additions and 1 deletions

47
extractors/comments.py Normal file
View file

@ -0,0 +1,47 @@
import json
import requests
import urllib.parse
from tools.converters import *
from tools.extractors import extract_yt_initial_data, extract_yt_cfg, eu_consent_cookie
def _comment_from_thread(thread):
    """Convert one ``commentThreadRenderer`` item into an output comment dict."""
    renderer = thread["commentThreadRenderer"]["comment"]["commentRenderer"]
    browse_endpoint = renderer["authorEndpoint"]["browseEndpoint"]
    # Both texts arrive as lists of "runs"; join each once and reuse the result.
    content = "".join(run["text"] for run in renderer["contentText"]["runs"])
    published_text = "".join(run["text"] for run in renderer["publishedTimeText"]["runs"])
    return {
        "author": renderer["authorText"]["simpleText"],
        "authorThumbnails": list(renderer["authorThumbnail"]["thumbnails"]),
        "authorId": browse_endpoint["browseId"],
        "authorUrl": browse_endpoint["canonicalBaseUrl"],
        # The edited marker is carried as a suffix of the published-time text.
        "isEdited": " (edited)" in published_text,
        "content": content,
        "contentHtml": escape_html_textcontent(content),
        "publishedText": published_text,
        # NOTE: "likeCount" and "replies.replyCount" are not populated yet; an
        # earlier commented-out sketch read them from the renderer's
        # voteCount.simpleText and replyCount fields.
        "commentId": renderer["commentId"],
        "authorIsChannelOwner": renderer["authorIsChannelOwner"],
    }


def extract_comments(id, **kwargs):
    """Fetch the first page of comments for a YouTube video.

    Loads the watch page for ``id`` to obtain the comment section's
    continuation token, its click-tracking params and the XSRF token from
    ytcfg, then posts to ``comment_service_ajax`` and reshapes every comment
    thread in the reply into a plain dict.

    Only the first page is retrieved: continuation for further pages and for
    replies is not implemented, and like/reply counts are not populated.

    Args:
        id: Video ID substituted into the watch URL. The name shadows the
            builtin but is kept so existing keyword callers keep working.
        **kwargs: Accepted for call compatibility and ignored.

    Returns:
        ``{"videoId": id, "comments": [...]}`` on success. When no XSRF token
        can be found, sets the CherryPy response status to 500 and returns a
        dict with ``"error"`` and ``"identifier"`` keys instead.

    Raises:
        requests.HTTPError: If either the watch page request or the comments
            request returns an error status.
        KeyError, IndexError: If the page data does not have the layout this
            function indexes into.
    """
    # Use the session as a context manager so its pooled connections are
    # released on every exit path, including exceptions and the early return.
    with requests.session() as s:
        s.headers.update({"accept-language": "en-US,en;q=0.9"})
        s.cookies.set("CONSENT", eu_consent_cookie().get("CONSENT"))

        with s.get("https://www.youtube.com/watch?v={}".format(id)) as r:
            r.raise_for_status()
            # Decode once; both extractors below parse the same page text.
            watch_html = r.content.decode("utf8")

        yt_initial_data = extract_yt_initial_data(watch_html)
        # The comment section is read from the third entry of the results list.
        item = yt_initial_data["contents"]["twoColumnWatchNextResults"]["results"]["results"]["contents"][2]["itemSectionRenderer"]
        next_continuation = item["continuations"][0]["nextContinuationData"]
        continuation = next_continuation["continuation"]
        itct = next_continuation["clickTrackingParams"]

        xsrf_token = extract_yt_cfg(watch_html).get("XSRF_TOKEN", None)
        if not xsrf_token:
            # cherrypy is not among this file's visible top-level imports;
            # import it here so this error path cannot fail with NameError.
            import cherrypy
            cherrypy.response.status = 500
            return {
                "error": "NewLeaf was unable to obtain XSRF_TOKEN from ytcfg.",
                "identifier": "XSRF_TOKEN_NOT_FOUND"
            }

        url = "https://www.youtube.com/comment_service_ajax?action_get_comments=1&pbj=1&ctoken={}&continuation={}&type=next&itct={}".format(continuation, continuation, urllib.parse.quote_plus(itct))
        with s.post(url, headers={"x-youtube-client-name": "1", "x-youtube-client-version": "2.20210422.04.00"}, data={"session_token": xsrf_token}) as rr:
            # Fail with a clear HTTP error rather than a confusing JSON/KeyError
            # further down when the request is rejected.
            rr.raise_for_status()
            data = json.loads(rr.content.decode("utf8"))

    threads = data["response"]["continuationContents"]["itemSectionContinuation"]["contents"]
    return {
        "videoId": id,
        "comments": [_comment_from_thread(thread) for thread in threads]
    }