better caching for fetch_mf2 across calls to entries and reply contexts

This commit is contained in:
Kyle Mahan 2016-05-26 02:42:59 +00:00
parent 79d4f84d28
commit 3a1d88616c

View file

@ -44,6 +44,17 @@ q = rq.Queue('low', connection=redis)
_app = None
class Mf2Fetcher:
def __init__(self):
self.cache = {}
def __call__(self, url):
if url in self.cache:
return self.cache[url]
p = mf2py.parse(url=url)
self.cache[url] = p
return p
@contextmanager
def flask_app():
@ -122,6 +133,7 @@ def update_feed(feed_id, content=None,
updated_entries = []
reply_pairs = []
fetch_mf2 = Mf2Fetcher()
try:
if content and is_expected_content_type(feed.type):
current_app.logger.info('using provided content. size=%d',
@ -160,7 +172,7 @@ def update_feed(feed_id, content=None,
feed, content, backfill, now)
elif feed.type == 'html':
result = process_html_feed_for_new_entries(
feed, content, backfill, now)
feed, content, backfill, now, fetch_mf2)
else:
result = []
@ -218,7 +230,7 @@ def update_feed(feed_id, content=None,
current_app.logger.debug(
'skipping previously seen post %s', old.permalink)
fetch_reply_contexts(reply_pairs, now)
fetch_reply_contexts(reply_pairs, now, fetch_mf2)
db.session.commit()
except:
db.session.rollback()
@ -440,16 +452,7 @@ def process_xml_feed_for_new_entries(feed, content, backfill, now):
fallback_photo(feed.origin))
def process_html_feed_for_new_entries(feed, content, backfill, now):
mf2_cache = {}
def fetch_mf2(url):
if url in mf2_cache:
return mf2_cache[url]
p = mf2py.parse(url=url)
mf2_cache[url] = p
return p
def process_html_feed_for_new_entries(feed, content, backfill, now, fetch_mf2_func):
# strip noscript tags before parsing, since we definitely aren't
# going to preserve js
content = re.sub('</?noscript[^>]*>', '', content, flags=re.IGNORECASE)
@ -462,7 +465,7 @@ def process_html_feed_for_new_entries(feed, content, backfill, now):
parsed = mf2util.interpret_feed(
mf2py.parse(doc, feed.feed),
source_url=feed.feed, base_href=base_href,
fetch_mf2_func=fetch_mf2)
fetch_mf2_func=fetch_mf2_func)
hfeed = parsed.get('entries', [])
for hentry in hfeed:
@ -570,7 +573,7 @@ def hentry_to_entry(hentry, feed, backfill, now):
return entry
def fetch_reply_contexts(reply_pairs, now):
def fetch_reply_contexts(reply_pairs, now, fetch_mf2_func):
old_contexts = {}
in_reply_tos = [url for _, url in reply_pairs]
if in_reply_tos:
@ -588,7 +591,7 @@ def fetch_reply_contexts(reply_pairs, now):
proxied_reply_url = proxy_url(in_reply_to)
parsed = mf2util.interpret(
mf2py.parse(url=proxied_reply_url), in_reply_to,
fetch_mf2_func=lambda url: mf2py.parse(url=url))
fetch_mf2_func=fetch_mf2_func)
if parsed:
context = hentry_to_entry(parsed, None, False, now)
except requests.exceptions.RequestException as err: