Add support for <base> tags when normalizing content URLs
This commit is contained in:
parent
bd85e7fd6c
commit
750030dee6
3 changed files with 12 additions and 4 deletions
|
@@ -18,7 +18,7 @@ itsdangerous==0.24
|
|||
Jinja2==2.8
|
||||
MarkupSafe==0.23
|
||||
mf2py==1.0.2
|
||||
mf2util==0.2.12
|
||||
mf2util==0.3.1
|
||||
psycopg2==2.6.1
|
||||
pyasn1==0.1.9
|
||||
pycparser==2.14
|
||||
|
|
|
@@ -429,8 +429,15 @@ def process_html_feed_for_new_entries(feed, content, backfill, now):
|
|||
# strip noscript tags before parsing, since we definitely aren't
|
||||
# going to preserve js
|
||||
content = re.sub('</?noscript[^>]*>', '', content, flags=re.IGNORECASE)
|
||||
|
||||
# look for a <base> element
|
||||
doc = bs4.BeautifulSoup(content, 'html5lib')
|
||||
base_el = doc.find('base')
|
||||
base_href = base_el.get('href') if base_el else None
|
||||
|
||||
parsed = mf2util.interpret_feed(
|
||||
mf2py.parse(url=feed.feed, doc=content), feed.feed)
|
||||
mf2py.parse(doc, feed.feed),
|
||||
source_url=feed.feed, base_href=base_href)
|
||||
hfeed = parsed.get('entries', [])
|
||||
|
||||
for hentry in hfeed:
|
||||
|
|
|
@@ -10,14 +10,15 @@ redis = StrictRedis()
|
|||
|
||||
bleach.ALLOWED_TAGS += [
|
||||
'a', 'img', 'p', 'br', 'marquee', 'blink',
|
||||
'audio', 'video', 'table', 'tbody', 'td', 'tr', 'div', 'span',
|
||||
'audio', 'video', 'source', 'table', 'tbody', 'td', 'tr', 'div', 'span',
|
||||
'pre', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
|
||||
]
|
||||
|
||||
bleach.ALLOWED_ATTRIBUTES.update({
|
||||
'img': ['src', 'alt', 'title'],
|
||||
'audio': ['preload', 'controls', 'src'],
|
||||
'video': ['preload', 'controls', 'src'],
|
||||
'video': ['preload', 'controls', 'src', 'poster'],
|
||||
'source': ['type', 'src'],
|
||||
'td': ['colspan'],
|
||||
})
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue