bundle up the preexisting-entry lookups into a single query instead of doing 30 individual queries

trying to alleviate the frequent timeouts when fetching a bunch of tweets
Kyle Mahan 2016-05-22 22:54:55 -07:00
parent c8f7213b39
commit 9a927e33be

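The diff below replaces a per-entry lookup (one SELECT for each of the up-to-30 new entries) with a single batched SELECT using an IN clause, then resolves each entry from a dict. A minimal sketch of that pattern with SQLAlchemy, using the Entry, feed, and result names that appear in this diff:

    # collect the keys up front
    all_uids = [e.uid for e in result]

    # one query for the whole batch instead of one query per entry
    old_entries = {}
    if all_uids:
        for existing in (Entry.query
                         .filter(Entry.feed == feed,
                                 Entry.uid.in_(all_uids))
                         .order_by(Entry.id.desc())):
            old_entries[existing.uid] = existing

    for entry in result:
        old = old_entries.get(entry.uid)  # dict lookup, no extra round trip

The same batching is applied to reply contexts: fetch_reply_contexts() collects every in-reply-to URL and resolves them with one Entry.permalink.in_(...) query before falling back to a network fetch for anything not already stored.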

@@ -164,14 +164,23 @@ def update_feed(feed_id, content=None,
         else:
             result = []
 
+        # realize list, only look at the first 30 entries
+        result = list(itertools.islice(result, 30))
+
+        old_entries = {}
+        all_uids = [e.uid for e in result]
+        if all_uids:
+            for entry in (Entry.query
+                          .filter(Entry.feed == feed,
+                                  Entry.uid.in_(all_uids))
+                          .order_by(Entry.id.desc())):
+                old_entries[entry.uid] = entry
+
         for entry in result:
-            current_app.logger.debug('searching for entry with uid=%s', entry.uid)
-            old = Entry.query\
-                .filter(Entry.feed == feed)\
-                .filter(Entry.uid == entry.uid)\
-                .order_by(Entry.id.desc())\
-                .first()
-            current_app.logger.debug('done searcing: %s', 'found' if old else 'not found')
+            old = old_entries.get(entry.uid)
+            current_app.logger.debug(
+                'entry for uid %s: %s', entry.uid,
+                'found' if old else 'not found')
 
             # have we seen this post before
             if not old:
@@ -209,9 +218,7 @@ def update_feed(feed_id, content=None,
                 current_app.logger.debug(
                     'skipping previously seen post %s', old.permalink)
 
-        for entry, in_reply_to in reply_pairs:
-            fetch_reply_context(entry, in_reply_to, now)
-
+        fetch_reply_contexts(reply_pairs, now)
         db.session.commit()
     except:
         db.session.rollback()
@@ -344,15 +351,15 @@ def is_content_equal(e1, e2):
         return content
 
     return (
-        e1.title == e2.title
-        and normalize(e1.content) == normalize(e2.content)
-        and e1.author_name == e2.author_name
-        and e1.author_url == e2.author_url
-        and e1.author_photo == e2.author_photo
-        and e1.properties == e2.properties
-        and e1.published == e2.published
-        and e1.updated == e2.updated
-        and e1.deleted == e2.deleted
+        e1.title == e2.title and
+        normalize(e1.content) == normalize(e2.content) and
+        e1.author_name == e2.author_name and
+        e1.author_url == e2.author_url and
+        e1.author_photo == e2.author_photo and
+        e1.properties == e2.properties and
+        e1.published == e2.published and
+        e1.updated == e2.updated and
+        e1.deleted == e2.deleted
     )
@@ -411,15 +418,12 @@ def process_xml_feed_for_new_entries(feed, content, backfill, now):
             if link.type == 'audio/mpeg' or link.type == 'audio/mp3':
                 audio = AUDIO_ENCLOSURE_TMPL.format(href=link.get('href'))
                 content = (content or '') + audio
-            if (link.type == 'video/x-m4v'
-                    or link.type == 'video/x-mp4'
-                    or link.type == 'video/mp4'):
+            if (link.type == 'video/x-m4v' or link.type == 'video/x-mp4' or
+                    link.type == 'video/mp4'):
                 video = VIDEO_ENCLOSURE_TMPL.format(href=link.get('href'))
                 content = (content or '') + video
 
-        current_app.logger.debug('building entry')
-        entry = Entry(
+        yield Entry(
             published=published,
             updated=updated,
             uid=uid,
@@ -428,16 +432,12 @@ def process_xml_feed_for_new_entries(feed, content, backfill, now):
             title=p_entry.get('title'),
             content=content,
             content_cleaned=util.clean(content),
-            author_name=p_entry.get('author_detail', {}).get('name')
-            or default_author_name,
-            author_url=p_entry.get('author_detail', {}).get('href')
-            or default_author_url,
-            author_photo=default_author_photo
-            or fallback_photo(feed.origin))
-
-        current_app.logger.debug('yielding entry')
-        yield entry
+            author_name=p_entry.get('author_detail', {}).get('name') or
+            default_author_name,
+            author_url=p_entry.get('author_detail', {}).get('href') or
+            default_author_url,
+            author_photo=default_author_photo or
+            fallback_photo(feed.origin))
 
 
 def process_html_feed_for_new_entries(feed, content, backfill, now):
@@ -475,8 +475,8 @@ def process_html_feed_for_new_entries(feed, content, backfill, now):
 def hentry_to_entry(hentry, feed, backfill, now):
     def normalize_datetime(dt):
-        if (dt and hasattr(dt, 'year') and hasattr(dt, 'month')
-                and hasattr(dt, 'day')):
+        if (dt and hasattr(dt, 'year') and hasattr(dt, 'month') and
+                hasattr(dt, 'day')):
             # make sure published is in UTC and strip the timezone
             if hasattr(dt, 'tzinfo') and dt.tzinfo:
                 return dt.astimezone(datetime.timezone.utc).replace(
@@ -570,34 +570,40 @@ def hentry_to_entry(hentry, feed, backfill, now):
     return entry
 
 
-def fetch_reply_context(entry, in_reply_to, now):
-    context = Entry.query\
-        .join(Entry.feed)\
-        .filter(Entry.permalink == in_reply_to, Feed.type == 'html')\
-        .first()
-
-    if not context:
-        current_app.logger.info('fetching in-reply-to: %s', in_reply_to)
-        try:
-            proxied_reply_url = proxy_url(in_reply_to)
-            parsed = mf2util.interpret(
-                mf2py.parse(url=proxied_reply_url), in_reply_to,
-                fetch_mf2_func=lambda url: mf2py.parse(url=url))
-            if parsed:
-                context = hentry_to_entry(parsed, None, False, now)
-        except requests.exceptions.RequestException as err:
-            current_app.logger.warn(
-                '%s fetching reply context: %s for entry: %s',
-                type(err).__name__, proxied_reply_url, entry.permalink)
-
-    if context:
-        db.session.add(context)
-        entry.reply_context.append(context)
+def fetch_reply_contexts(reply_pairs, now):
+    old_contexts = {}
+    in_reply_tos = [url for _, url in reply_pairs]
+    if in_reply_tos:
+        for entry in (Entry.query
+                      .join(Entry.feed)
+                      .filter(Entry.permalink.in_(in_reply_tos),
+                              Feed.type == 'html')):
+            old_contexts[entry.permalink] = entry
+
+    for entry, in_reply_to in reply_pairs:
+        context = old_contexts.get(in_reply_to)
+        if not context:
+            current_app.logger.info('fetching in-reply-to: %s', in_reply_to)
+            try:
+                proxied_reply_url = proxy_url(in_reply_to)
+                parsed = mf2util.interpret(
+                    mf2py.parse(url=proxied_reply_url), in_reply_to,
+                    fetch_mf2_func=lambda url: mf2py.parse(url=url))
+                if parsed:
+                    context = hentry_to_entry(parsed, None, False, now)
+            except requests.exceptions.RequestException as err:
+                current_app.logger.warn(
+                    '%s fetching reply context: %s for entry: %s',
+                    type(err).__name__, proxied_reply_url, entry.permalink)
+
+        if context:
+            db.session.add(context)
+            entry.reply_context.append(context)
 
 
 def proxy_url(url):
-    if ('TWITTER_AU_KEY' in current_app.config
-            and 'TWITTER_AU_SECRET' in current_app.config):
+    if ('TWITTER_AU_KEY' in current_app.config and
+            'TWITTER_AU_SECRET' in current_app.config):
         # swap out the a-u url for twitter urls
         match = TWITTER_RE.match(url)
         if match: