From 750030dee65f1d9ae8eafb6298e73b2a58e7b29a Mon Sep 17 00:00:00 2001
From: Kyle Mahan <kyle.mahan@gmail.com>
Date: Wed, 17 Feb 2016 13:47:44 -0800
Subject: [PATCH] Support <base> tags when normalizing content URLs

---
 requirements.txt  | 2 +-
 woodwind/tasks.py | 9 ++++++++-
 woodwind/util.py  | 5 +++--
 3 files changed, 12 insertions(+), 4 deletions(-)
diff --git a/requirements.txt b/requirements.txt
index 186eb43..76d6a0b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -18,7 +18,7 @@ itsdangerous==0.24
 Jinja2==2.8
 MarkupSafe==0.23
 mf2py==1.0.2
-mf2util==0.2.12
+mf2util==0.3.1
 psycopg2==2.6.1
 pyasn1==0.1.9
 pycparser==2.14
diff --git a/woodwind/tasks.py b/woodwind/tasks.py
index 1602745..ec6d88a 100644
--- a/woodwind/tasks.py
+++ b/woodwind/tasks.py
@@ -429,8 +429,15 @@ def process_html_feed_for_new_entries(feed, content, backfill, now):
     # strip noscript tags before parsing, since we definitely aren't
     # going to preserve js
     content = re.sub('</?noscript[^>]*>', '', content, flags=re.IGNORECASE)
+
+    # look for a <base> element so its href can be used when normalizing urls
+    doc = bs4.BeautifulSoup(content, 'html5lib')
+    base_el = doc.find('base')
+    base_href = base_el.get('href') if base_el else None
+
     parsed = mf2util.interpret_feed(
-        mf2py.parse(url=feed.feed, doc=content), feed.feed)
+        mf2py.parse(doc, feed.feed),
+        source_url=feed.feed, base_href=base_href)
     hfeed = parsed.get('entries', [])
 
     for hentry in hfeed:
diff --git a/woodwind/util.py b/woodwind/util.py
index dc404a8..681d3bf 100644
--- a/woodwind/util.py
+++ b/woodwind/util.py
@@ -10,14 +10,15 @@ redis = StrictRedis()
 
 bleach.ALLOWED_TAGS += [
     'a', 'img', 'p', 'br', 'marquee', 'blink',
-    'audio', 'video', 'table', 'tbody', 'td', 'tr', 'div', 'span',
+    'audio', 'video', 'source', 'table', 'tbody', 'td', 'tr', 'div', 'span',
     'pre', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
 ]
 
 bleach.ALLOWED_ATTRIBUTES.update({
     'img': ['src', 'alt', 'title'],
     'audio': ['preload', 'controls', 'src'],
-    'video': ['preload', 'controls', 'src'],
+    'video': ['preload', 'controls', 'src', 'poster'],
+    'source': ['type', 'src'],
     'td': ['colspan'],
 })