mirror of
https://git.sr.ht/~cadence/NewLeaf
synced 2024-11-24 00:27:28 +00:00
48 lines
3.1 KiB
Python
48 lines
3.1 KiB
Python
|
import json
|
||
|
import requests
|
||
|
import urllib.parse
|
||
|
from tools.converters import *
|
||
|
from tools.extractors import extract_yt_initial_data, extract_yt_cfg, eu_consent_cookie
|
||
|
|
||
|
def _comment_from_thread(thread):
    """Convert one ``commentThreadRenderer`` entry into the API comment dict."""
    renderer = thread["commentThreadRenderer"]["comment"]["commentRenderer"]
    browse_endpoint = renderer["authorEndpoint"]["browseEndpoint"]

    # Both texts arrive as lists of "runs"; join each one exactly once.
    published_text = "".join(
        [run["text"] for run in renderer["publishedTimeText"]["runs"]]
    )
    content = "".join([run["text"] for run in renderer["contentText"]["runs"]])

    # NOTE: "likeCount" (from voteCount.simpleText) and "replies"/"replyCount"
    # are intentionally not emitted; they were commented out in the original.
    return {
        "author": renderer["authorText"]["simpleText"],
        "authorThumbnails": list(renderer["authorThumbnail"]["thumbnails"]),
        "authorId": browse_endpoint["browseId"],
        "authorUrl": browse_endpoint["canonicalBaseUrl"],
        # The published-time text carries an " (edited)" marker when edited.
        "isEdited": " (edited)" in published_text,
        "content": content,
        "contentHtml": escape_html_textcontent(content),
        "publishedText": published_text,
        "commentId": renderer["commentId"],
        "authorIsChannelOwner": renderer["authorIsChannelOwner"],
    }


def extract_comments(id, **kwargs):
    """Fetch the first page of comments for a YouTube video.

    Loads the watch page for ``id``, reads the comment continuation data and
    the XSRF token out of the page, then POSTs to the ``comment_service_ajax``
    endpoint and reshapes the JSON reply.

    Args:
        id: Video ID; interpolated into the watch URL and echoed back as
            ``"videoId"``. (The name shadows the builtin but is kept so
            existing callers keep working.)
        **kwargs: Accepted and ignored.

    Returns:
        On success, ``{"videoId": id, "comments": [...]}`` with one dict per
        comment thread (see ``_comment_from_thread`` for the keys).
        If no XSRF token is found, sets ``cherrypy.response.status`` to 500
        and returns a dict with ``"error"`` and ``"identifier"`` keys.

    Raises:
        requests.HTTPError: if either HTTP request returns an error status.
        KeyError, IndexError: if the page or reply data does not have the
            nested structure indexed below.
    """
    # Use the session as a context manager so its connections are released
    # on every exit path, including the early error return.
    with requests.session() as s:
        s.headers.update({"accept-language": "en-US,en;q=0.9"})
        s.cookies.set("CONSENT", eu_consent_cookie().get("CONSENT"))

        with s.get("https://www.youtube.com/watch?v={}".format(id)) as r:
            r.raise_for_status()
            # Decode once; both extractors below parse the same page text.
            page = r.content.decode("utf8")

        yt_initial_data = extract_yt_initial_data(page)
        # NOTE(review): the comment section is taken from fixed index 2 of the
        # results list; this will break if the page layout changes.
        item = yt_initial_data["contents"]["twoColumnWatchNextResults"]["results"]["results"]["contents"][2]["itemSectionRenderer"]
        next_data = item["continuations"][0]["nextContinuationData"]
        continuation = next_data["continuation"]
        itct = next_data["clickTrackingParams"]

        xsrf_token = extract_yt_cfg(page).get("XSRF_TOKEN", None)
        if not xsrf_token:
            # NOTE(review): `cherrypy` is not imported in the visible part of
            # this file; presumably it comes in through the star import from
            # tools.converters -- confirm, otherwise this is a NameError.
            cherrypy.response.status = 500
            return {
                "error": "NewLeaf was unable to obtain XSRF_TOKEN from ytcfg.",
                "identifier": "XSRF_TOKEN_NOT_FOUND"
            }

        # Only itct is URL-quoted here; the continuation token is sent as-is.
        url = "https://www.youtube.com/comment_service_ajax?action_get_comments=1&pbj=1&ctoken={}&continuation={}&type=next&itct={}".format(continuation, continuation, urllib.parse.quote_plus(itct))

        with s.post(url, headers={"x-youtube-client-name": "1", "x-youtube-client-version": "2.20210422.04.00"}, data={"session_token": xsrf_token}) as rr:
            # Fail on HTTP errors here, as for the watch page, rather than
            # trying to parse an error body as JSON.
            rr.raise_for_status()
            data = json.loads(rr.content.decode("utf8"))

    threads = data["response"]["continuationContents"]["itemSectionContinuation"]["contents"]
    return {
        "videoId": id,
        "comments": [_comment_from_thread(c) for c in threads]
    }
|