From d008d58412d3a2fb7a406cbdc06a54b2e3af5f29 Mon Sep 17 00:00:00 2001 From: Kyle Mahan Date: Mon, 25 Apr 2016 13:58:47 -0700 Subject: [PATCH] use new authorship implementation from mf2util --- requirements.txt | 2 +- woodwind/tasks.py | 12 +++++++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 23c1397..c53d454 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,7 +18,7 @@ itsdangerous==0.24 Jinja2==2.8 MarkupSafe==0.23 mf2py==1.0.4 --e git+https://github.com/kylewm/mf2util.git@master#egg=mf2util-master +mf2util==0.4.0 psycopg2==2.6.1 pyasn1==0.1.9 pycparser==2.14 diff --git a/woodwind/tasks.py b/woodwind/tasks.py index 644e53e..1a5b540 100644 --- a/woodwind/tasks.py +++ b/woodwind/tasks.py @@ -438,6 +438,15 @@ def process_xml_feed_for_new_entries(feed, content, backfill, now): def process_html_feed_for_new_entries(feed, content, backfill, now): + mf2_cache = {} + + def fetch_mf2(url): + if url in mf2_cache: + return mf2_cache[url] + p = mf2py.parse(url=url) + mf2_cache[url] = p + return p + # strip noscript tags before parsing, since we definitely aren't # going to preserve js content = re.sub('</?noscript[^>]*>', '', content, flags=re.IGNORECASE) @@ -449,7 +458,8 @@ def process_html_feed_for_new_entries(feed, content, backfill, now): parsed = mf2util.interpret_feed( mf2py.parse(doc, feed.feed), - source_url=feed.feed, base_href=base_href) + source_url=feed.feed, base_href=base_href, + fetch_mf2_func=fetch_mf2) hfeed = parsed.get('entries', []) for hentry in hfeed: