Cache the result of running bleach in the DB; it is too slow to do on render
This commit is contained in:
parent
a4113c9416
commit
fa4064ab31
8 changed files with 59 additions and 17 deletions
27
migrations/20150318-clean-content.py
Normal file
27
migrations/20150318-clean-content.py
Normal file
|
@ -0,0 +1,27 @@
|
|||
"""One-off migration: add ``entry.content_cleaned`` and fill it in.

Adds a ``content_cleaned`` text column to the ``entry`` table, then stores
``util.clean(entry.content)`` in it for every existing entry.
"""
from config import Config
import sqlalchemy
import sqlalchemy.exc
import sqlalchemy.orm
from woodwind.models import Entry
from woodwind import util

engine = sqlalchemy.create_engine(Config.SQLALCHEMY_DATABASE_URI)
Session = sqlalchemy.orm.sessionmaker(bind=engine)

# Best effort: a database error here is reported and skipped rather than
# aborting the run -- presumably so the script can be re-run after the column
# already exists (TODO confirm). Only database errors are tolerated;
# anything else (KeyboardInterrupt, programming mistakes) still propagates.
try:
    engine.execute('alter table entry add column content_cleaned text')
except sqlalchemy.exc.SQLAlchemyError as err:
    print('skipping alter table:', err)

# Create the session before entering the try block so that the except and
# finally clauses can never hit an unbound ``session``.
session = Session()
try:
    for entry in session.query(Entry).all():
        print('processing', entry.id)
        entry.content_cleaned = util.clean(entry.content)

    # Single commit: either every entry is updated or none are.
    session.commit()
except BaseException:
    # Roll back on any interruption (including Ctrl-C), then re-raise.
    session.rollback()
    raise
finally:
    session.close()
|
2
setup.py
2
setup.py
|
@ -26,4 +26,6 @@ setup(name='Woodwind',
|
|||
'rq',
|
||||
'uwsgi',
|
||||
'websockets',
|
||||
'pyOpenSSL',
|
||||
'pyasn1',
|
||||
])
|
||||
|
|
|
@ -1,8 +1,10 @@
|
|||
[uwsgi]
|
||||
master=true
|
||||
processes=1
|
||||
http=:4000
|
||||
socket=/tmp/woodwind.sock
|
||||
chmod-socket=666
|
||||
module=woodwind.wsgi
|
||||
import=timers
|
||||
attach-daemon=rqworker
|
||||
attach-daemon=python -m woodwind.websocket_server
|
||||
python-auto-reload=true
|
||||
|
|
|
@ -1,15 +1,18 @@
|
|||
from flask.ext.login import LoginManager
|
||||
from flask.ext.micropub import MicropubClient
|
||||
from flask.ext.sqlalchemy import SQLAlchemy
|
||||
from flask_debugtoolbar import DebugToolbarExtension
|
||||
|
||||
|
||||
db = SQLAlchemy()
|
||||
micropub = MicropubClient(client_id='http://reader.kylewm.com')
|
||||
login_mgr = LoginManager()
|
||||
login_mgr.login_view = 'views.index'
|
||||
toolbar = DebugToolbarExtension()
|
||||
|
||||
|
||||
def init_app(app):
    """Bind every module-level extension to the given Flask application.

    Calls ``init_app(app)`` on ``db``, ``micropub``, ``login_mgr`` and
    ``toolbar``, in that order.
    """
    for extension in (db, micropub, login_mgr, toolbar):
        extension.init_app(app)
|
||||
|
|
|
@ -8,14 +8,6 @@ from sqlalchemy.ext.orderinglist import ordering_list
|
|||
from sqlalchemy.ext.associationproxy import association_proxy
|
||||
|
||||
|
||||
bleach.ALLOWED_TAGS += ['a', 'img', 'p', 'br', 'marquee', 'blink',
|
||||
'audio', 'video', 'table', 'tbody', 'td', 'tr']
|
||||
bleach.ALLOWED_ATTRIBUTES.update({
|
||||
'img': ['src', 'alt', 'title'],
|
||||
'audio': ['preload', 'controls', 'src'],
|
||||
'video': ['preload', 'controls', 'src'],
|
||||
'td': ['colspan'],
|
||||
})
|
||||
|
||||
|
||||
class JsonType(db.TypeDecorator):
|
||||
|
@ -135,6 +127,7 @@ class Entry(db.Model):
|
|||
author_photo = db.Column(db.String(512))
|
||||
title = db.Column(db.String(512))
|
||||
content = db.Column(db.Text)
|
||||
content_cleaned = db.Column(db.Text)
|
||||
# other properties
|
||||
properties = db.Column(JsonType)
|
||||
# # association with the InReplyTo objects
|
||||
|
@ -153,12 +146,6 @@ class Entry(db.Model):
|
|||
super().__init__(*args, **kwargs)
|
||||
self._syndicated_copies = []
|
||||
|
||||
def content_cleaned(self):
|
||||
if self.content:
|
||||
text = self.content
|
||||
text = re.sub('<script.*?</script>', '', text, flags=re.DOTALL)
|
||||
return bleach.clean(text, strip=True)
|
||||
|
||||
def get_property(self, key, default=None):
|
||||
if self.properties is None:
|
||||
return default
|
||||
|
|
|
@ -2,6 +2,7 @@ from config import Config
|
|||
from contextlib import contextmanager
|
||||
from redis import StrictRedis
|
||||
from woodwind.models import Feed, Entry
|
||||
from woodwind import util
|
||||
import bs4
|
||||
import datetime
|
||||
import feedparser
|
||||
|
@ -290,6 +291,7 @@ def process_xml_feed_for_new_entries(session, feed, response, backfill):
|
|||
retrieved=retrieved,
|
||||
title=p_entry.get('title'),
|
||||
content=content,
|
||||
content_cleaned=util.clean(content),
|
||||
author_name=p_entry.get('author_detail', {}).get('name')
|
||||
or default_author_name,
|
||||
author_url=p_entry.get('author_detail', {}).get('href')
|
||||
|
@ -346,6 +348,7 @@ def hentry_to_entry(hentry, feed, backfill):
|
|||
updated=updated,
|
||||
title=title,
|
||||
content=content,
|
||||
content_cleaned=util.clean(content),
|
||||
author_name=hentry.get('author', {}).get('name'),
|
||||
author_photo=hentry.get('author', {}).get('photo')
|
||||
or (feed and fallback_photo(feed.origin)),
|
||||
|
|
|
@ -14,7 +14,7 @@
|
|||
{% endif %}
|
||||
{% if context.content %}
|
||||
<div>
|
||||
{{ context.content_cleaned() | add_preview }}
|
||||
{{ context.content_cleaned | add_preview }}
|
||||
</div>
|
||||
{% endif %}
|
||||
<footer>
|
||||
|
@ -42,7 +42,7 @@
|
|||
{% endif %}
|
||||
{% if entry.content %}
|
||||
<div>
|
||||
{{ entry.content_cleaned() | add_preview }}
|
||||
{{ entry.content_cleaned | add_preview }}
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
|
|
18
woodwind/util.py
Normal file
18
woodwind/util.py
Normal file
|
@ -0,0 +1,18 @@
|
|||
import bleach
|
||||
import re
|
||||
|
||||
# Extra tags permitted on top of bleach's default allow-list.
_EXTRA_TAGS = [
    'a', 'img', 'p', 'br', 'marquee', 'blink',
    'audio', 'video', 'table', 'tbody', 'td', 'tr',
]

# Extra per-tag attributes permitted on top of bleach's defaults.
_EXTRA_ATTRIBUTES = {
    'img': ['src', 'alt', 'title'],
    'audio': ['preload', 'controls', 'src'],
    'video': ['preload', 'controls', 'src'],
    'td': ['colspan'],
}

# NOTE: both statements modify bleach's module-level defaults, so they
# affect every bleach.clean() call made in this process.
bleach.ALLOWED_TAGS += _EXTRA_TAGS
bleach.ALLOWED_ATTRIBUTES.update(_EXTRA_ATTRIBUTES)
|
||||
|
||||
def clean(text):
    """Strip script tags and other possibly dangerous content.

    Every ``<script>...</script>`` element is removed together with its
    body, and the remainder is passed through ``bleach.clean`` with
    ``strip=True``.

    :param text: HTML string to sanitize, or ``None``.
    :return: the sanitized string, or ``None`` when ``text`` is ``None``.
    """
    if text is None:
        return None
    # Remove script elements before bleach runs so their bodies are dropped
    # entirely rather than left to bleach's tag stripping. Match
    # case-insensitively so ``<SCRIPT>`` is caught as well; DOTALL lets the
    # match span multiple lines.
    text = re.sub('<script.*?</script>', '', text,
                  flags=re.DOTALL | re.IGNORECASE)
    return bleach.clean(text, strip=True)
|
Loading…
Add table
Add a link
Reference in a new issue