mirror of
https://git.sr.ht/~cadence/NewLeaf
synced 2024-11-21 23:27:29 +00:00
Retrieve the first 20 comments of a video on /api/v1/comments/:videoid
Got some inspiration from https://github.com/nlitsme/youtube_tool (for the x-youtube-client-X headers). This is not a complete reimplementation of the Invidious API: continuation is not implemented (so neither more than the first 20 comments nor comment replies can be retrieved), and like and reply counts are also missing.
This commit is contained in:
parent
1ea86101fd
commit
3f57d50893
47
extractors/comments.py
Normal file
47
extractors/comments.py
Normal file
@ -0,0 +1,47 @@
|
|||||||
|
import json
|
||||||
|
import requests
|
||||||
|
import urllib.parse
|
||||||
|
from tools.converters import *
|
||||||
|
from tools.extractors import extract_yt_initial_data, extract_yt_cfg, eu_consent_cookie
|
||||||
|
|
||||||
|
def _comment_from_thread(thread):
    """Convert one `commentThreadRenderer` item into an API comment dict."""
    renderer = thread["commentThreadRenderer"]["comment"]["commentRenderer"]
    author_endpoint = renderer["authorEndpoint"]["browseEndpoint"]
    # Text fields arrive as a list of "runs"; join each list once and reuse it.
    content = "".join(run["text"] for run in renderer["contentText"]["runs"])
    published_text = "".join(run["text"] for run in renderer["publishedTimeText"]["runs"])
    return {
        "author": renderer["authorText"]["simpleText"],
        "authorThumbnails": list(renderer["authorThumbnail"]["thumbnails"]),
        "authorId": author_endpoint["browseId"],
        "authorUrl": author_endpoint["canonicalBaseUrl"],
        # The edited marker is carried inside the published-time text.
        "isEdited": " (edited)" in published_text,
        "content": content,
        "contentHtml": escape_html_textcontent(content),
        "publishedText": published_text,
        "commentId": renderer["commentId"],
        "authorIsChannelOwner": renderer["authorIsChannelOwner"],
        # Like count and reply count are not extracted yet.
    }


def extract_comments(id, **kwargs):
    """Fetch the first page of comments for the video `id`.

    Loads the watch page to obtain the comment continuation token, the click
    tracking params and the XSRF token, then posts to the comment_service_ajax
    endpoint and reshapes the returned comment threads.

    Args:
        id: video id, inserted into the watch page URL.
        **kwargs: accepted for interface compatibility; not used.

    Returns:
        {"videoId": id, "comments": [...]} on success. If no XSRF_TOKEN is
        found in ytcfg, sets the CherryPy response status to 500 and returns
        an {"error": ..., "identifier": ...} dict instead.

    Raises:
        requests.HTTPError: if either HTTP request returns an error status.
        KeyError, IndexError: if the page or response data does not have the
            expected structure.
        Exception: propagated from extract_yt_initial_data / extract_yt_cfg
            when they cannot find their data in the page.
    """
    # Imported locally: the module's import block does not bring cherrypy into
    # scope, yet the XSRF error path below needs it.
    import cherrypy

    # Use the session as a context manager so its connections are released.
    with requests.session() as s:
        s.headers.update({"accept-language": "en-US,en;q=0.9"})
        s.cookies.set("CONSENT", eu_consent_cookie().get("CONSENT"))

        with s.get("https://www.youtube.com/watch?v={}".format(id)) as r:
            r.raise_for_status()
            # Decode once; both extractors below read the same page text.
            page = r.content.decode("utf8")

        yt_initial_data = extract_yt_initial_data(page)
        item = yt_initial_data["contents"]["twoColumnWatchNextResults"]["results"]["results"]["contents"][2]["itemSectionRenderer"]
        next_data = item["continuations"][0]["nextContinuationData"]
        continuation = next_data["continuation"]
        itct = next_data["clickTrackingParams"]

        xsrf_token = extract_yt_cfg(page).get("XSRF_TOKEN", None)
        if not xsrf_token:
            cherrypy.response.status = 500
            return {
                "error": "NewLeaf was unable to obtain XSRF_TOKEN from ytcfg.",
                "identifier": "XSRF_TOKEN_NOT_FOUND"
            }

        url = "https://www.youtube.com/comment_service_ajax?action_get_comments=1&pbj=1&ctoken={}&continuation={}&type=next&itct={}".format(continuation, continuation, urllib.parse.quote_plus(itct))

        with s.post(url, headers={"x-youtube-client-name": "1", "x-youtube-client-version": "2.20210422.04.00"}, data={"session_token": xsrf_token}) as rr:
            # Fail with a clear HTTP error rather than a JSON/KeyError later on.
            rr.raise_for_status()
            data = json.loads(rr.content.decode("utf8"))

    threads = data["response"]["continuationContents"]["itemSectionContinuation"]["contents"]
    return {
        "videoId": id,
        "comments": [
            _comment_from_thread(thread)
            for thread in threads
            # Skip any entry that is not a comment thread instead of crashing on it.
            if "commentThreadRenderer" in thread
        ]
    }
|
9
index.py
9
index.py
@ -9,6 +9,7 @@ from extractors.manifest import extract_manifest
|
|||||||
from extractors.search import extract_search
|
from extractors.search import extract_search
|
||||||
from extractors.suggestions import extract_search_suggestions
|
from extractors.suggestions import extract_search_suggestions
|
||||||
from extractors.captions import extract_captions
|
from extractors.captions import extract_captions
|
||||||
|
from extractors.comments import extract_comments
|
||||||
import configuration
|
import configuration
|
||||||
|
|
||||||
@cherrypy.tools.register("before_finalize", priority=60)
|
@cherrypy.tools.register("before_finalize", priority=60)
|
||||||
@ -26,7 +27,8 @@ class NewLeaf(object):
|
|||||||
["channels", 1, 2],
|
["channels", 1, 2],
|
||||||
["videos", 1, 1],
|
["videos", 1, 1],
|
||||||
["search", 0, 1],
|
["search", 0, 1],
|
||||||
["captions", 1, 1]
|
["captions", 1, 1],
|
||||||
|
["comments", 1, 1]
|
||||||
]
|
]
|
||||||
for e in endpoints:
|
for e in endpoints:
|
||||||
if vpath[2] == e[0] and len(vpath) >= e[1]+3 and len(vpath) <= e[2]+3:
|
if vpath[2] == e[0] and len(vpath) >= e[1]+3 and len(vpath) <= e[2]+3:
|
||||||
@ -114,6 +116,11 @@ class NewLeaf(object):
|
|||||||
"identifier": "NO_MATCHING_CAPTIONS"
|
"identifier": "NO_MATCHING_CAPTIONS"
|
||||||
}), "utf8")
|
}), "utf8")
|
||||||
|
|
||||||
|
@cherrypy.expose
@cherrypy.tools.json_out()
def comments(self, id, **kwargs):
    """Handle a comments request by delegating to extract_comments.

    Only `id` is forwarded; any extra parameters collected in **kwargs are
    accepted but ignored. The return value passes through the json_out tool.
    """
    result = extract_comments(id)
    return result
|
||||||
|
|
||||||
@cherrypy.expose
|
@cherrypy.expose
|
||||||
def vi(self, id, file):
|
def vi(self, id, file):
|
||||||
with requests.get("https://i.ytimg.com/vi/{}/{}".format(id, file)) as r:
|
with requests.get("https://i.ytimg.com/vi/{}/{}".format(id, file)) as r:
|
||||||
|
@ -4,6 +4,7 @@ import random
|
|||||||
|
|
||||||
r_yt_initial_data = re.compile(r"""(?:^\s*window\["ytInitialData"\]|var ytInitialData) = (\{.+?\});(?:\s*$|</script>)""", re.S + re.M)
|
r_yt_initial_data = re.compile(r"""(?:^\s*window\["ytInitialData"\]|var ytInitialData) = (\{.+?\});(?:\s*$|</script>)""", re.S + re.M)
|
||||||
r_yt_initial_player_response = re.compile(r"""(?:^\s*window\["ytInitialPlayerResponse"\]|var ytInitialPlayerResponse) = (\{.+?\});(?:\s*$|</script>|var )""", re.S + re.M)
|
r_yt_initial_player_response = re.compile(r"""(?:^\s*window\["ytInitialPlayerResponse"\]|var ytInitialPlayerResponse) = (\{.+?\});(?:\s*$|</script>|var )""", re.S + re.M)
|
||||||
|
# Matches a `ytcfg.set({...});` call and captures the object literal argument as group 1.
# No re.S flag is passed, so `.` does not cross newlines: the whole call must sit on one line.
r_yt_cfg = re.compile(r"""ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;""")
|
||||||
|
|
||||||
def extract_yt_initial_data(content):
|
def extract_yt_initial_data(content):
|
||||||
m_yt_initial_data = re.search(r_yt_initial_data, content)
|
m_yt_initial_data = re.search(r_yt_initial_data, content)
|
||||||
@ -21,5 +22,11 @@ def extract_yt_initial_player_response(content):
|
|||||||
else:
|
else:
|
||||||
raise Exception("Could not match ytInitialPlayerResponse in content")
|
raise Exception("Could not match ytInitialPlayerResponse in content")
|
||||||
|
|
||||||
|
def extract_yt_cfg(content):
    """Return the ytcfg object found in `content`, parsed from JSON.

    Searches `content` with the module-level `r_yt_cfg` pattern and decodes
    the captured group with json.loads.

    Raises:
        Exception: if the pattern does not match anywhere in `content`.
    """
    match = r_yt_cfg.search(content)
    if match is None:
        raise Exception("Could not match ytcfg in content")
    return json.loads(match.group(1))
|
||||||
|
|
||||||
def eu_consent_cookie():
    """Build a single-entry dict holding a CONSENT cookie value.

    The value is a fixed prefix followed by a random integer between 100 and
    999 inclusive, so it differs between calls.
    """
    template = "YES+cb.20210509-17-p0.en+F+{}"
    suffix = random.randint(100, 999)
    return {"CONSENT": template.format(suffix)}
|
||||||
|
Loading…
Reference in New Issue
Block a user