1
0
mirror of https://git.sr.ht/~cadence/NewLeaf synced 2024-11-22 07:37:29 +00:00

Fix extracting empty description

This commit is contained in:
Cadence Ember 2020-09-24 00:56:16 +12:00
parent e18efc9591
commit caee795b7e
No known key found for this signature in database
GPG Key ID: BC1C2C61CF521B17
2 changed files with 38 additions and 24 deletions

View File

@@ -1,3 +1,4 @@
import cherrypy
import dateutil.parser import dateutil.parser
import requests import requests
import xml.etree.ElementTree as ET import xml.etree.ElementTree as ET
@@ -125,12 +126,15 @@ def extract_channel_latest(ucid):
author_url = author_container.find("{http://www.w3.org/2005/Atom}uri").text author_url = author_container.find("{http://www.w3.org/2005/Atom}uri").text
channel_id = feed.find("{http://www.youtube.com/xml/schemas/2015}channelId").text channel_id = feed.find("{http://www.youtube.com/xml/schemas/2015}channelId").text
results = [] results = []
missing_published = False
for entry in feed.findall("{http://www.w3.org/2005/Atom}entry"): for entry in feed.findall("{http://www.w3.org/2005/Atom}entry"):
id = entry.find("{http://www.youtube.com/xml/schemas/2015}videoId").text id = entry.find("{http://www.youtube.com/xml/schemas/2015}videoId").text
media_group = entry.find("{http://search.yahoo.com/mrss/}group") media_group = entry.find("{http://search.yahoo.com/mrss/}group")
description = media_group.find("{http://search.yahoo.com/mrss/}description").text description = media_group.find("{http://search.yahoo.com/mrss/}description").text
media_community = media_group.find("{http://search.yahoo.com/mrss/}community") media_community = media_group.find("{http://search.yahoo.com/mrss/}community")
published = int(dateutil.parser.isoparse(entry.find("{http://www.w3.org/2005/Atom}published").text).timestamp()) published_entry = entry.find("{http://www.w3.org/2005/Atom}published")
if published_entry is not None: # sometimes youtube does not provide published dates, no idea why.
published = int(dateutil.parser.isoparse(published_entry.text).timestamp())
results.append({ results.append({
"type": "video", "type": "video",
"title": entry.find("{http://www.w3.org/2005/Atom}title").text, "title": entry.find("{http://www.w3.org/2005/Atom}title").text,
@@ -140,7 +144,7 @@ def extract_channel_latest(ucid):
"authorUrl": author_url, "authorUrl": author_url,
"videoThumbnails": generate_video_thumbnails(id), "videoThumbnails": generate_video_thumbnails(id),
"description": description, "description": description,
"descriptionHtml": add_html_links(escape_html_textcontent(description)), "descriptionHtml": description and add_html_links(escape_html_textcontent(description)),
"viewCount": int(media_community.find("{http://search.yahoo.com/mrss/}statistics").attrib["views"]), "viewCount": int(media_community.find("{http://search.yahoo.com/mrss/}statistics").attrib["views"]),
"published": published, "published": published,
"publishedText": time_to_past_text(published), "publishedText": time_to_past_text(published),
@@ -150,7 +154,16 @@ def extract_channel_latest(ucid):
"premium": None, "premium": None,
"isUpcoming": None "isUpcoming": None
}) })
else:
missing_published = True
if len(results) == 0 and missing_published: # no results due to all missing published
cherrypy.response.status = 503
return {
"error": "YouTube did not provide published dates for any feed items. This is usually temporary - refresh in a few minutes.",
"identifier": "PUBLISHED_DATES_NOT_PROVIDED"
}
else:
with channel_latest_cache_lock: with channel_latest_cache_lock:
channel_latest_cache[ucid] = results channel_latest_cache[ucid] = results

View File

@@ -18,7 +18,8 @@ ytdl_opts = {
"dump_single_json": True, "dump_single_json": True,
"playlist_items": "1-100", "playlist_items": "1-100",
"extract_flat": "in_playlist", "extract_flat": "in_playlist",
"write_pages": True "write_pages": True,
"source_address": "0.0.0.0"
} }
ytdl = youtube_dl.YoutubeDL(ytdl_opts) ytdl = youtube_dl.YoutubeDL(ytdl_opts)