1
0
mirror of https://git.sr.ht/~cadence/NewLeaf synced 2024-11-21 23:27:29 +00:00

Retrieve the first 20 comments of a video on /api/v1/comments/:videoid

Got some inspiration from https://github.com/nlitsme/youtube_tool (for the x-youtube-client-X
headers).
This is not a complete reimplementation of the Invidious API: continuation is not implemented
(it is needed to retrieve more than the first 20 comments, and to retrieve comment replies),
and like and reply counts are also missing.
This commit is contained in:
Lomanic 2021-06-27 04:06:13 +02:00 committed by Cadence Ember
parent 1ea86101fd
commit 3f57d50893
No known key found for this signature in database
GPG Key ID: BC1C2C61CF521B17
3 changed files with 62 additions and 1 deletions

47
extractors/comments.py Normal file
View File

@ -0,0 +1,47 @@
import json
import requests
import urllib.parse
from tools.converters import *
from tools.extractors import extract_yt_initial_data, extract_yt_cfg, eu_consent_cookie
def extract_comments(id, **kwargs):
    """Fetch the first page of comments for a YouTube video.

    Loads the watch page to obtain the comment continuation token, its click
    tracking params and the XSRF token from ytcfg, then posts them to
    YouTube's ``comment_service_ajax`` endpoint and reshapes the reply.

    Args:
        id: YouTube video id, interpolated into the watch URL.
        **kwargs: Accepted for call compatibility; not used.

    Returns:
        A dict with ``videoId`` and a ``comments`` list. If ytcfg carries no
        XSRF_TOKEN, the CherryPy response status is set to 500 and a dict
        with ``error`` and ``identifier`` keys is returned instead.

    Raises:
        requests.HTTPError: if either HTTP request returns an error status.
        Exception: propagated from the page extractors when ytInitialData or
            ytcfg cannot be found in the watch page.
    """
    # Imported locally: the module's visible top-level imports do not bring
    # cherrypy into scope, so the error path below would otherwise fail with
    # NameError instead of reporting the problem.
    import cherrypy

    # The session is a context manager; this releases its connections on
    # every exit path, including the early error return.
    with requests.session() as s:
        s.headers.update({"accept-language": "en-US,en;q=0.9"})
        s.cookies.set("CONSENT", eu_consent_cookie().get("CONSENT"))

        with s.get("https://www.youtube.com/watch?v={}".format(id)) as r:
            r.raise_for_status()
            # Decode once; both extractors below parse the same text.
            page = r.content.decode("utf8")

        yt_initial_data = extract_yt_initial_data(page)
        # NOTE(review): the comments section is assumed to be the third entry
        # of the results column - confirm this holds for all watch pages.
        item = yt_initial_data["contents"]["twoColumnWatchNextResults"]["results"]["results"]["contents"][2]["itemSectionRenderer"]
        next_data = item["continuations"][0]["nextContinuationData"]
        continuation = next_data["continuation"]
        itct = next_data["clickTrackingParams"]

        xsrf_token = extract_yt_cfg(page).get("XSRF_TOKEN")
        if not xsrf_token:
            cherrypy.response.status = 500
            return {
                "error": "NewLeaf was unable to obtain XSRF_TOKEN from ytcfg.",
                "identifier": "XSRF_TOKEN_NOT_FOUND"
            }

        url = "https://www.youtube.com/comment_service_ajax?action_get_comments=1&pbj=1&ctoken={}&continuation={}&type=next&itct={}".format(continuation, continuation, urllib.parse.quote_plus(itct))
        request_headers = {
            "x-youtube-client-name": "1",
            "x-youtube-client-version": "2.20210422.04.00"
        }
        with s.post(url, headers=request_headers, data={"session_token": xsrf_token}) as rr:
            # Checked like the first request, so an HTTP error surfaces as
            # such rather than as a JSON or key error further down.
            rr.raise_for_status()
            data = json.loads(rr.content.decode("utf8"))

    threads = data["response"]["continuationContents"]["itemSectionContinuation"]["contents"]
    return {
        "videoId": id,
        "comments": [
            _comment_from_renderer(c["commentThreadRenderer"]["comment"]["commentRenderer"])
            for c in threads
        ]
    }


def _comment_from_renderer(renderer):
    """Convert one YouTube ``commentRenderer`` dict into an API comment dict."""
    content = "".join([x["text"] for x in renderer["contentText"]["runs"]])
    published_text = "".join([x["text"] for x in renderer["publishedTimeText"]["runs"]])
    browse_endpoint = renderer["authorEndpoint"]["browseEndpoint"]
    # TODO: likeCount (from renderer["voteCount"]["simpleText"]) and
    # replies.replyCount (from renderer["replyCount"]) are not extracted yet.
    return {
        "author": renderer["authorText"]["simpleText"],
        "authorThumbnails": list(renderer["authorThumbnail"]["thumbnails"]),
        "authorId": browse_endpoint["browseId"],
        "authorUrl": browse_endpoint["canonicalBaseUrl"],
        # An edited comment is marked by " (edited)" in its published-time text.
        "isEdited": " (edited)" in published_text,
        "content": content,
        "contentHtml": escape_html_textcontent(content),
        "publishedText": published_text,
        "commentId": renderer["commentId"],
        "authorIsChannelOwner": renderer["authorIsChannelOwner"],
    }

View File

@ -9,6 +9,7 @@ from extractors.manifest import extract_manifest
from extractors.search import extract_search from extractors.search import extract_search
from extractors.suggestions import extract_search_suggestions from extractors.suggestions import extract_search_suggestions
from extractors.captions import extract_captions from extractors.captions import extract_captions
from extractors.comments import extract_comments
import configuration import configuration
@cherrypy.tools.register("before_finalize", priority=60) @cherrypy.tools.register("before_finalize", priority=60)
@ -26,7 +27,8 @@ class NewLeaf(object):
["channels", 1, 2], ["channels", 1, 2],
["videos", 1, 1], ["videos", 1, 1],
["search", 0, 1], ["search", 0, 1],
["captions", 1, 1] ["captions", 1, 1],
["comments", 1, 1]
] ]
for e in endpoints: for e in endpoints:
if vpath[2] == e[0] and len(vpath) >= e[1]+3 and len(vpath) <= e[2]+3: if vpath[2] == e[0] and len(vpath) >= e[1]+3 and len(vpath) <= e[2]+3:
@ -114,6 +116,11 @@ class NewLeaf(object):
"identifier": "NO_MATCHING_CAPTIONS" "identifier": "NO_MATCHING_CAPTIONS"
}), "utf8") }), "utf8")
@cherrypy.expose
@cherrypy.tools.json_out()
def comments(self, id, **kwargs):
    """Handle the ``comments`` endpoint for the video identified by ``id``.

    Whatever ``extract_comments`` returns is handed back unchanged; the
    ``json_out`` tool applied above is what serialises it for the response.
    """
    # Extra keyword arguments are accepted but deliberately not forwarded.
    result = extract_comments(id)
    return result
@cherrypy.expose @cherrypy.expose
def vi(self, id, file): def vi(self, id, file):
with requests.get("https://i.ytimg.com/vi/{}/{}".format(id, file)) as r: with requests.get("https://i.ytimg.com/vi/{}/{}".format(id, file)) as r:

View File

@ -4,6 +4,7 @@ import random
r_yt_initial_data = re.compile(r"""(?:^\s*window\["ytInitialData"\]|var ytInitialData) = (\{.+?\});(?:\s*$|</script>)""", re.S + re.M) r_yt_initial_data = re.compile(r"""(?:^\s*window\["ytInitialData"\]|var ytInitialData) = (\{.+?\});(?:\s*$|</script>)""", re.S + re.M)
r_yt_initial_player_response = re.compile(r"""(?:^\s*window\["ytInitialPlayerResponse"\]|var ytInitialPlayerResponse) = (\{.+?\});(?:\s*$|</script>|var )""", re.S + re.M) r_yt_initial_player_response = re.compile(r"""(?:^\s*window\["ytInitialPlayerResponse"\]|var ytInitialPlayerResponse) = (\{.+?\});(?:\s*$|</script>|var )""", re.S + re.M)
# Matches a `ytcfg.set({...});` call and captures the `{...}` argument.
# No flags are passed, so `.` does not match newlines: the captured object
# must sit on a single line, and the lazy `.+?` stops at the first `}` that
# is followed by `)` and `;`.
r_yt_cfg = re.compile(r"""ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;""")
def extract_yt_initial_data(content): def extract_yt_initial_data(content):
m_yt_initial_data = re.search(r_yt_initial_data, content) m_yt_initial_data = re.search(r_yt_initial_data, content)
@ -21,5 +22,11 @@ def extract_yt_initial_player_response(content):
else: else:
raise Exception("Could not match ytInitialPlayerResponse in content") raise Exception("Could not match ytInitialPlayerResponse in content")
def extract_yt_cfg(content):
    """Return the ytcfg object found in a page, parsed from JSON.

    Searches ``content`` with the module-level ``r_yt_cfg`` pattern and
    decodes the captured group with ``json.loads``.

    Raises:
        Exception: if the pattern does not match anywhere in ``content``.
    """
    found = r_yt_cfg.search(content)
    if not found:
        raise Exception("Could not match ytcfg in content")
    return json.loads(found.group(1))
def eu_consent_cookie(): def eu_consent_cookie():
return {"CONSENT": "YES+cb.20210509-17-p0.en+F+{}".format(random.randint(100, 999))} return {"CONSENT": "YES+cb.20210509-17-p0.en+F+{}".format(random.randint(100, 999))}