better caching for fetch_mf2 across calls to entries and reply contexts
This commit is contained in:
parent
79d4f84d28
commit
3a1d88616c
1 changed file with 18 additions and 15 deletions
|
@ -44,6 +44,17 @@ q = rq.Queue('low', connection=redis)
|
||||||
|
|
||||||
_app = None
|
_app = None
|
||||||
|
|
||||||
|
class Mf2Fetcher:
    """Callable wrapper around ``mf2py.parse(url=...)`` that memoizes by URL.

    Each instance owns its own cache, so results are shared only between
    the callers that are handed the same instance.
    """

    def __init__(self):
        # Maps the exact URL value passed to __call__ to the object that
        # mf2py.parse returned for it.
        self.cache = {}

    def __call__(self, url):
        """Return the parsed result for *url*, parsing at most once per URL.

        On a cache hit the stored object is returned as-is (including a
        stored ``None``). On a miss the result of ``mf2py.parse(url=url)``
        is stored and returned; if that call raises, nothing is cached and
        the exception propagates.
        """
        try:
            return self.cache[url]
        except KeyError:
            # Not seen yet; fall through and parse outside the handler so
            # errors raised by the parser are not mistaken for a miss.
            pass
        parsed = self.cache[url] = mf2py.parse(url=url)
        return parsed
|
||||||
|
|
||||||
|
|
||||||
@contextmanager
|
@contextmanager
|
||||||
def flask_app():
|
def flask_app():
|
||||||
|
@ -122,6 +133,7 @@ def update_feed(feed_id, content=None,
|
||||||
updated_entries = []
|
updated_entries = []
|
||||||
reply_pairs = []
|
reply_pairs = []
|
||||||
|
|
||||||
|
fetch_mf2 = Mf2Fetcher()
|
||||||
try:
|
try:
|
||||||
if content and is_expected_content_type(feed.type):
|
if content and is_expected_content_type(feed.type):
|
||||||
current_app.logger.info('using provided content. size=%d',
|
current_app.logger.info('using provided content. size=%d',
|
||||||
|
@ -160,7 +172,7 @@ def update_feed(feed_id, content=None,
|
||||||
feed, content, backfill, now)
|
feed, content, backfill, now)
|
||||||
elif feed.type == 'html':
|
elif feed.type == 'html':
|
||||||
result = process_html_feed_for_new_entries(
|
result = process_html_feed_for_new_entries(
|
||||||
feed, content, backfill, now)
|
feed, content, backfill, now, fetch_mf2)
|
||||||
else:
|
else:
|
||||||
result = []
|
result = []
|
||||||
|
|
||||||
|
@ -218,7 +230,7 @@ def update_feed(feed_id, content=None,
|
||||||
current_app.logger.debug(
|
current_app.logger.debug(
|
||||||
'skipping previously seen post %s', old.permalink)
|
'skipping previously seen post %s', old.permalink)
|
||||||
|
|
||||||
fetch_reply_contexts(reply_pairs, now)
|
fetch_reply_contexts(reply_pairs, now, fetch_mf2)
|
||||||
db.session.commit()
|
db.session.commit()
|
||||||
except:
|
except:
|
||||||
db.session.rollback()
|
db.session.rollback()
|
||||||
|
@ -440,16 +452,7 @@ def process_xml_feed_for_new_entries(feed, content, backfill, now):
|
||||||
fallback_photo(feed.origin))
|
fallback_photo(feed.origin))
|
||||||
|
|
||||||
|
|
||||||
def process_html_feed_for_new_entries(feed, content, backfill, now):
|
def process_html_feed_for_new_entries(feed, content, backfill, now, fetch_mf2_func):
|
||||||
mf2_cache = {}
|
|
||||||
|
|
||||||
def fetch_mf2(url):
|
|
||||||
if url in mf2_cache:
|
|
||||||
return mf2_cache[url]
|
|
||||||
p = mf2py.parse(url=url)
|
|
||||||
mf2_cache[url] = p
|
|
||||||
return p
|
|
||||||
|
|
||||||
# strip noscript tags before parsing, since we definitely aren't
|
# strip noscript tags before parsing, since we definitely aren't
|
||||||
# going to preserve js
|
# going to preserve js
|
||||||
content = re.sub('</?noscript[^>]*>', '', content, flags=re.IGNORECASE)
|
content = re.sub('</?noscript[^>]*>', '', content, flags=re.IGNORECASE)
|
||||||
|
@ -462,7 +465,7 @@ def process_html_feed_for_new_entries(feed, content, backfill, now):
|
||||||
parsed = mf2util.interpret_feed(
|
parsed = mf2util.interpret_feed(
|
||||||
mf2py.parse(doc, feed.feed),
|
mf2py.parse(doc, feed.feed),
|
||||||
source_url=feed.feed, base_href=base_href,
|
source_url=feed.feed, base_href=base_href,
|
||||||
fetch_mf2_func=fetch_mf2)
|
fetch_mf2_func=fetch_mf2_func)
|
||||||
hfeed = parsed.get('entries', [])
|
hfeed = parsed.get('entries', [])
|
||||||
|
|
||||||
for hentry in hfeed:
|
for hentry in hfeed:
|
||||||
|
@ -570,7 +573,7 @@ def hentry_to_entry(hentry, feed, backfill, now):
|
||||||
return entry
|
return entry
|
||||||
|
|
||||||
|
|
||||||
def fetch_reply_contexts(reply_pairs, now):
|
def fetch_reply_contexts(reply_pairs, now, fetch_mf2_func):
|
||||||
old_contexts = {}
|
old_contexts = {}
|
||||||
in_reply_tos = [url for _, url in reply_pairs]
|
in_reply_tos = [url for _, url in reply_pairs]
|
||||||
if in_reply_tos:
|
if in_reply_tos:
|
||||||
|
@ -588,7 +591,7 @@ def fetch_reply_contexts(reply_pairs, now):
|
||||||
proxied_reply_url = proxy_url(in_reply_to)
|
proxied_reply_url = proxy_url(in_reply_to)
|
||||||
parsed = mf2util.interpret(
|
parsed = mf2util.interpret(
|
||||||
mf2py.parse(url=proxied_reply_url), in_reply_to,
|
mf2py.parse(url=proxied_reply_url), in_reply_to,
|
||||||
fetch_mf2_func=lambda url: mf2py.parse(url=url))
|
fetch_mf2_func=fetch_mf2_func)
|
||||||
if parsed:
|
if parsed:
|
||||||
context = hentry_to_entry(parsed, None, False, now)
|
context = hentry_to_entry(parsed, None, False, now)
|
||||||
except requests.exceptions.RequestException as err:
|
except requests.exceptions.RequestException as err:
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue