def _find_comment_section(yt_initial_data):
    """Return the ``itemSectionRenderer`` that carries the comment continuation.

    The watch page lists several renderers under the primary results column.
    Rather than relying on the comment section sitting at a fixed index, pick
    the first ``itemSectionRenderer`` that exposes ``continuations``.

    Raises:
        LookupError: if no such section exists in ``yt_initial_data``
            (``KeyError`` for a missing outer key is a ``LookupError`` too).
    """
    entries = yt_initial_data["contents"]["twoColumnWatchNextResults"]["results"]["results"]["contents"]
    for entry in entries:
        section = entry.get("itemSectionRenderer")
        if section and section.get("continuations"):
            return section
    raise LookupError("Could not find the comment section in ytInitialData")


def _comment_to_dict(thread):
    """Convert one ``commentThreadRenderer`` entry into the returned comment shape."""
    renderer = thread["commentThreadRenderer"]["comment"]["commentRenderer"]
    browse = renderer["authorEndpoint"]["browseEndpoint"]
    # Text fields arrive as lists of "runs"; join each of them only once.
    content = "".join(run["text"] for run in renderer["contentText"]["runs"])
    published = "".join(run["text"] for run in renderer["publishedTimeText"]["runs"])
    return {
        "author": renderer["authorText"]["simpleText"],
        "authorThumbnails": list(renderer["authorThumbnail"]["thumbnails"]),
        "authorId": browse["browseId"],
        "authorUrl": browse["canonicalBaseUrl"],
        # The edited marker is embedded in the published-time text.
        "isEdited": " (edited)" in published,
        "content": content,
        "contentHtml": escape_html_textcontent(content),
        "publishedText": published,
        # TODO: "likeCount" (from voteCount.simpleText) and "replies.replyCount"
        # are not emitted yet.
        "commentId": renderer["commentId"],
        "authorIsChannelOwner": renderer["authorIsChannelOwner"],
    }


def extract_comments(id, **kwargs):
    """Fetch the first page of comments for a YouTube video.

    Loads the watch page to obtain the comment continuation token, the click
    tracking params and the XSRF token, then posts to ``comment_service_ajax``
    and reshapes the returned comment threads.

    Args:
        id: The video id. It is sent as the ``v`` query parameter.
        **kwargs: Accepted for interface compatibility; not used.

    Returns:
        ``{"videoId": id, "comments": [...]}`` on success. If no XSRF token can
        be read from ytcfg, the CherryPy response status is set to 500 and a
        dict with ``"error"`` and ``"identifier"`` keys is returned instead.

    Raises:
        requests.HTTPError: if the watch page or the comment request returns
            an error status.
        LookupError: if the expected structure is missing from the page data.
        Exception: whatever the ``extract_yt_*`` helpers raise when the page
            cannot be parsed.
    """
    # Use the session as a context manager so its connections are released.
    with requests.session() as s:
        s.headers.update({"accept-language": "en-US,en;q=0.9"})
        s.cookies.set("CONSENT", eu_consent_cookie().get("CONSENT"))

        # Passing the id through ``params`` lets requests escape it properly.
        with s.get("https://www.youtube.com/watch", params={"v": id}) as r:
            r.raise_for_status()
            page = r.content.decode("utf8")  # decode once, reuse below

        yt_initial_data = extract_yt_initial_data(page)
        next_data = _find_comment_section(yt_initial_data)["continuations"][0]["nextContinuationData"]
        continuation = next_data["continuation"]
        itct = next_data["clickTrackingParams"]

        xsrf_token = extract_yt_cfg(page).get("XSRF_TOKEN", None)
        if not xsrf_token:
            # Imported here because this module does not import cherrypy itself;
            # without it this branch would fail with NameError.
            import cherrypy
            cherrypy.response.status = 500
            return {
                "error": "NewLeaf was unable to obtain XSRF_TOKEN from ytcfg.",
                "identifier": "XSRF_TOKEN_NOT_FOUND"
            }

        url = "https://www.youtube.com/comment_service_ajax?action_get_comments=1&pbj=1&ctoken={}&continuation={}&type=next&itct={}".format(continuation, continuation, urllib.parse.quote_plus(itct))
        with s.post(url, headers={"x-youtube-client-name": "1", "x-youtube-client-version": "2.20210422.04.00"}, data={"session_token": xsrf_token}) as rr:
            # Fail loudly on an HTTP error instead of trying to parse an error page.
            rr.raise_for_status()
            data = json.loads(rr.content.decode("utf8"))

    section = data["response"]["continuationContents"]["itemSectionContinuation"]
    return {
        "videoId": id,
        "comments": [
            _comment_to_dict(c)
            # "contents" may be absent when there is nothing to list; entries
            # that are not comment threads are skipped.
            for c in section.get("contents", [])
            if "commentThreadRenderer" in c
        ]
    }
# Original single-shot pattern, kept unchanged for anything that still refers to it.
r_yt_cfg = re.compile(r"""ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;""")
# Matches up to (but not including) the opening brace of a ``ytcfg.set({...})`` call.
_r_yt_cfg_start = re.compile(r"""ytcfg\.set\s*\(\s*(?=\{)""")


def extract_yt_cfg(content):
    """Extract the configuration passed to ``ytcfg.set({...})`` calls in a page.

    Every ``ytcfg.set(`` call whose argument starts with ``{`` is located and
    the object is parsed with a real JSON decoder starting at that brace. This
    avoids cutting the object short when a string value contains ``});`` and
    also handles objects that span several lines. Calls whose argument is not
    valid JSON are skipped. Objects from multiple calls are merged in order of
    appearance, so a later call overrides keys set by an earlier one.

    Args:
        content: The page source as a string.

    Returns:
        A dict with the merged configuration values.

    Raises:
        Exception: if no ``ytcfg.set({...})`` call with a JSON object is found.
    """
    decoder = json.JSONDecoder()
    merged = {}
    found = False
    for m in _r_yt_cfg_start.finditer(content):
        try:
            # m.end() points at the "{" thanks to the lookahead in the pattern.
            obj, _ = decoder.raw_decode(content, m.end())
        except ValueError:
            # Not JSON (e.g. a JS object literal with single quotes): try the next call.
            continue
        merged.update(obj)
        found = True
    if not found:
        raise Exception("Could not match ytcfg in content")
    return merged