From 750030dee65f1d9ae8eafb6298e73b2a58e7b29a Mon Sep 17 00:00:00 2001 From: Kyle Mahan Date: Wed, 17 Feb 2016 13:47:44 -0800 Subject: [PATCH] support for base tags when normalizing content urls --- requirements.txt | 2 +- woodwind/tasks.py | 9 ++++++++- woodwind/util.py | 5 +++-- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/requirements.txt b/requirements.txt index 186eb43..76d6a0b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,7 +18,7 @@ itsdangerous==0.24 Jinja2==2.8 MarkupSafe==0.23 mf2py==1.0.2 -mf2util==0.2.12 +mf2util==0.3.1 psycopg2==2.6.1 pyasn1==0.1.9 pycparser==2.14 diff --git a/woodwind/tasks.py b/woodwind/tasks.py index 1602745..ec6d88a 100644 --- a/woodwind/tasks.py +++ b/woodwind/tasks.py @@ -429,8 +429,15 @@ def process_html_feed_for_new_entries(feed, content, backfill, now): # strip noscript tags before parsing, since we definitely aren't # going to preserve js content = re.sub('</?noscript[^>]*>', '', content, flags=re.IGNORECASE) + + # look for a <base> element + doc = bs4.BeautifulSoup(content, 'html5lib') + base_el = doc.find('base') + base_href = base_el.get('href') if base_el else None + parsed = mf2util.interpret_feed( - mf2py.parse(url=feed.feed, doc=content), feed.feed) + mf2py.parse(doc, feed.feed), + source_url=feed.feed, base_href=base_href) hfeed = parsed.get('entries', []) for hentry in hfeed: diff --git a/woodwind/util.py b/woodwind/util.py index dc404a8..681d3bf 100644 --- a/woodwind/util.py +++ b/woodwind/util.py @@ -10,14 +10,15 @@ redis = StrictRedis() bleach.ALLOWED_TAGS += [ 'a', 'img', 'p', 'br', 'marquee', 'blink', - 'audio', 'video', 'table', 'tbody', 'td', 'tr', 'div', 'span', + 'audio', 'video', 'source', 'table', 'tbody', 'td', 'tr', 'div', 'span', 'pre', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', ] bleach.ALLOWED_ATTRIBUTES.update({ 'img': ['src', 'alt', 'title'], 'audio': ['preload', 'controls', 'src'], - 'video': ['preload', 'controls', 'src'], + 'video': ['preload', 'controls', 'src', 'poster'], + 'source': 
['type', 'src'], 'td': ['colspan'], })