Mirror of https://git.sr.ht/~cadence/NewLeaf, synced 2024-11-24 00:27:28 +00:00
Lomanic
3f57d50893
Got some inspiration from https://github.com/nlitsme/youtube_tool (for the x-youtube-client-X headers). This is not a complete reimplementation of the Invidious API, as continuation is not implemented (needed to retrieve more than the first 20 comments and to fetch comment replies); like and reply counts are also missing.
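For context, the header trick borrowed from youtube_tool amounts to calling YouTube's internal comment_service_ajax endpoint while identifying as the regular web client. A minimal sketch using the same endpoint and header values as the file below; the token values are placeholders that the extractor normally scrapes from the watch page:

import requests
import urllib.parse

ctoken = "..."       # placeholder: nextContinuationData.continuation from the watch page
itct = "..."         # placeholder: nextContinuationData.clickTrackingParams
xsrf_token = "..."   # placeholder: XSRF_TOKEN from ytcfg

url = (
	"https://www.youtube.com/comment_service_ajax"
	"?action_get_comments=1&pbj=1&ctoken={0}&continuation={0}&type=next&itct={1}"
).format(ctoken, urllib.parse.quote_plus(itct))
response = requests.post(
	url,
	headers={
		"x-youtube-client-name": "1",                   # identify as the regular web client
		"x-youtube-client-version": "2.20210422.04.00",
	},
	data={"session_token": xsrf_token},
)
comments_json = response.json()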
48 lines
3.1 KiB
Python
import cherrypy
import json
import requests
import urllib.parse
from tools.converters import *
from tools.extractors import extract_yt_initial_data, extract_yt_cfg, eu_consent_cookie

def extract_comments(id, **kwargs):
	s = requests.session()
	s.headers.update({"accept-language": "en-US,en;q=0.9"})
	s.cookies.set("CONSENT", eu_consent_cookie().get("CONSENT"))
	# Fetch the watch page to scrape the continuation token, click tracking params and XSRF token.
	with s.get("https://www.youtube.com/watch?v={}".format(id)) as r:
		r.raise_for_status()
		yt_initial_data = extract_yt_initial_data(r.content.decode("utf8"))
		item = yt_initial_data["contents"]["twoColumnWatchNextResults"]["results"]["results"]["contents"][2]["itemSectionRenderer"]
		continuation = item["continuations"][0]["nextContinuationData"]["continuation"]
		itct = item["continuations"][0]["nextContinuationData"]["clickTrackingParams"]
		xsrf_token = extract_yt_cfg(r.content.decode("utf8")).get("XSRF_TOKEN", None)
		if not xsrf_token:
			cherrypy.response.status = 500
			return {
				"error": "NewLeaf was unable to obtain XSRF_TOKEN from ytcfg.",
				"identifier": "XSRF_TOKEN_NOT_FOUND"
			}
		# Request the first page of comments from the internal comment_service_ajax endpoint,
		# identifying as the regular web client via the x-youtube-client-* headers.
		url = "https://www.youtube.com/comment_service_ajax?action_get_comments=1&pbj=1&ctoken={}&continuation={}&type=next&itct={}".format(continuation, continuation, urllib.parse.quote_plus(itct))
		with s.post(url, headers={"x-youtube-client-name": "1", "x-youtube-client-version": "2.20210422.04.00"}, data={"session_token": xsrf_token}) as rr:
			data = json.loads(rr.content.decode("utf8"))
			# Map each commentThreadRenderer onto the Invidious comment shape.
			return {
				"videoId": id,
				"comments": [
					{
						"author": c["commentThreadRenderer"]["comment"]["commentRenderer"]["authorText"]["simpleText"],
						"authorThumbnails": [x for x in c["commentThreadRenderer"]["comment"]["commentRenderer"]["authorThumbnail"]["thumbnails"]],
						"authorId": c["commentThreadRenderer"]["comment"]["commentRenderer"]["authorEndpoint"]["browseEndpoint"]["browseId"],
						"authorUrl": c["commentThreadRenderer"]["comment"]["commentRenderer"]["authorEndpoint"]["browseEndpoint"]["canonicalBaseUrl"],
						"isEdited": " (edited)" in "".join([x["text"] for x in c["commentThreadRenderer"]["comment"]["commentRenderer"]["publishedTimeText"]["runs"]]),
						"content": "".join([x["text"] for x in c["commentThreadRenderer"]["comment"]["commentRenderer"]["contentText"]["runs"]]),
						"contentHtml": escape_html_textcontent("".join([x["text"] for x in c["commentThreadRenderer"]["comment"]["commentRenderer"]["contentText"]["runs"]])),
						"publishedText": "".join([x["text"] for x in c["commentThreadRenderer"]["comment"]["commentRenderer"]["publishedTimeText"]["runs"]]),
						# "likeCount": int(c["commentThreadRenderer"]["comment"]["commentRenderer"]["voteCount"]["simpleText"].replace(",", ""))
						"commentId": c["commentThreadRenderer"]["comment"]["commentRenderer"]["commentId"],
						"authorIsChannelOwner": c["commentThreadRenderer"]["comment"]["commentRenderer"]["authorIsChannelOwner"],
						# "replies": {
						# 	"replyCount": c["commentThreadRenderer"]["comment"]["commentRenderer"]["replyCount"]
						# }
					} for c in data["response"]["continuationContents"]["itemSectionContinuation"]["contents"]
				]
			}
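A minimal usage sketch, assuming this module is importable as extractors.comments (the exact module path inside NewLeaf is not shown on this page) and that the tools.* helpers are available. Note that the error branch touches cherrypy.response, so outside a cherrypy request context it may not behave the same way:

from extractors.comments import extract_comments  # assumed module path

result = extract_comments("dQw4w9WgXcQ")  # placeholder video id
if "error" in result:
	print(result["identifier"], result["error"])
else:
	for comment in result["comments"]:
		print(comment["author"], comment["publishedText"])
		print(comment["content"])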