use new authorship implementation from mf2util

2016-04-25 13:58:47 -07:00 · 2016-04-25 13:58:47 -07:00 · d008d58412
commit d008d58412
parent f6f11bf41b
2 changed files with 12 additions and 2 deletions
--- a/requirements.txt
+++ b/requirements.txt
@@ -18,7 +18,7 @@ itsdangerous==0.24
 Jinja2==2.8
 MarkupSafe==0.23
 mf2py==1.0.4
-e git+https://github.com/kylewm/mf2util.git@master#egg=mf2util-master
+mf2util==0.4.0
 psycopg2==2.6.1
 pyasn1==0.1.9
 pycparser==2.14
--- a/woodwind/tasks.py
+++ b/woodwind/tasks.py
@@ -438,6 +438,15 @@ def process_xml_feed_for_new_entries(feed, content, backfill, now):


 def process_html_feed_for_new_entries(feed, content, backfill, now):
+    mf2_cache = {}
+
+    def fetch_mf2(url):
+        if url in mf2_cache:
+            return mf2_cache[url]
+        p = mf2py.parse(url=url)
+        mf2_cache[url] = p
+        return p
+
    # strip noscript tags before parsing, since we definitely aren't
    # going to preserve js
    content = re.sub('</?noscript[^>]*>', '', content, flags=re.IGNORECASE)
@@ -449,7 +458,8 @@ def process_html_feed_for_new_entries(feed, content, backfill, now):

    parsed = mf2util.interpret_feed(
        mf2py.parse(doc, feed.feed),
-        source_url=feed.feed, base_href=base_href)
+        source_url=feed.feed, base_href=base_href,
+        fetch_mf2_func=fetch_mf2)
    hfeed = parsed.get('entries', [])

    for hentry in hfeed: