Cache the result of running bleach in the DB; it is too slow to do on render.

This commit is contained in:
Kyle Mahan 2015-03-18 08:53:37 -07:00
parent a4113c9416
commit fa4064ab31
8 changed files with 59 additions and 17 deletions

View file

@ -0,0 +1,27 @@
from config import Config
import sqlalchemy
import sqlalchemy.orm
from woodwind.models import Entry
from woodwind import util
# One-off migration: add the ``content_cleaned`` column to the ``entry``
# table and fill it for every existing Entry with the sanitized form of
# ``content``.
engine = sqlalchemy.create_engine(Config.SQLALCHEMY_DATABASE_URI)
Session = sqlalchemy.orm.sessionmaker(bind=engine)

# Best-effort schema change: a database error here is reported and ignored
# so the script can be re-run (presumably the usual failure is that the
# column already exists -- confirm against the target database).
# Only SQLAlchemy errors are tolerated; anything else (e.g. Ctrl-C)
# propagates instead of being silently swallowed.
try:
    engine.execute('alter table entry add column content_cleaned text')
except sqlalchemy.exc.SQLAlchemyError as err:
    print('alter table skipped:', err)

# Create the session before entering the try block so the except/finally
# clauses never reference an unbound name if Session() itself fails.
session = Session()
try:
    for entry in session.query(Entry).all():
        print('processing', entry.id)
        entry.content_cleaned = util.clean(entry.content)
    # NOTE(review): a single commit after the loop is assumed here; the
    # original indentation was lost, so confirm it was not per-entry.
    session.commit()
except BaseException:
    # Undo the partial backfill, then let the original error surface.
    session.rollback()
    raise
finally:
    session.close()

View file

@ -26,4 +26,6 @@ setup(name='Woodwind',
'rq',
'uwsgi',
'websockets',
'pyOpenSSL',
'pyasn1',
])

View file

@ -1,8 +1,10 @@
[uwsgi]
master=true
processes=1
http=:4000
socket=/tmp/woodwind.sock
chmod-socket=666
module=woodwind.wsgi
import=timers
attach-daemon=rqworker
attach-daemon=python -m woodwind.websocket_server
python-auto-reload=true

View file

@ -1,15 +1,18 @@
from flask.ext.login import LoginManager
from flask.ext.micropub import MicropubClient
from flask.ext.sqlalchemy import SQLAlchemy
from flask_debugtoolbar import DebugToolbarExtension
# Module-level Flask extension instances. They are constructed here without
# an application and attached to one later through init_app(app).
db = SQLAlchemy()
micropub = MicropubClient(client_id='http://reader.kylewm.com')
login_mgr = LoginManager()
# Name of the view assigned as the login manager's login_view.
login_mgr.login_view = 'views.index'
toolbar = DebugToolbarExtension()
def init_app(app):
    """Attach each module-level extension to the given Flask application.

    The extensions are initialised in a fixed order: db, micropub,
    login_mgr, toolbar.

    :param app: the application object passed to every ``init_app`` call.
    """
    for extension in (db, micropub, login_mgr, toolbar):
        extension.init_app(app)

View file

@ -8,14 +8,6 @@ from sqlalchemy.ext.orderinglist import ordering_list
from sqlalchemy.ext.associationproxy import association_proxy
# Widen bleach's module-level whitelist of tags with the extra elements
# this application wants to keep in entry content.
# NOTE(review): this changes bleach's own defaults, so it presumably affects
# every bleach.clean() call in the process that does not pass explicit
# tags -- confirm.
bleach.ALLOWED_TAGS += ['a', 'img', 'p', 'br', 'marquee', 'blink',
'audio', 'video', 'table', 'tbody', 'td', 'tr']
# Per-tag attribute whitelist for the media and table tags added above;
# existing entries under other keys in ALLOWED_ATTRIBUTES are left untouched.
bleach.ALLOWED_ATTRIBUTES.update({
'img': ['src', 'alt', 'title'],
'audio': ['preload', 'controls', 'src'],
'video': ['preload', 'controls', 'src'],
'td': ['colspan'],
})
class JsonType(db.TypeDecorator):
@ -135,6 +127,7 @@ class Entry(db.Model):
author_photo = db.Column(db.String(512))
title = db.Column(db.String(512))
content = db.Column(db.Text)
content_cleaned = db.Column(db.Text)
# other properties
properties = db.Column(JsonType)
# # association with the InReplyTo objects
@ -153,12 +146,6 @@ class Entry(db.Model):
super().__init__(*args, **kwargs)
self._syndicated_copies = []
def content_cleaned(self):
    """Return ``self.content`` with scripts removed and the HTML sanitized.

    Whole ``<script>`` elements (tags and body) are removed first, then
    the remainder is passed through ``bleach.clean`` with ``strip=True``.

    :return: the sanitized markup, or None when ``self.content`` is
        empty or None.
    """
    if not self.content:
        return None
    # Match case-insensitively and tolerate whitespace inside the closing
    # tag, so variants such as <SCRIPT>...</SCRIPT > are removed as well.
    markup = re.sub(r'<script.*?</script\s*>', '', self.content,
                    flags=re.DOTALL | re.IGNORECASE)
    return bleach.clean(markup, strip=True)
def get_property(self, key, default=None):
if self.properties is None:
return default

View file

@ -2,6 +2,7 @@ from config import Config
from contextlib import contextmanager
from redis import StrictRedis
from woodwind.models import Feed, Entry
from woodwind import util
import bs4
import datetime
import feedparser
@ -290,6 +291,7 @@ def process_xml_feed_for_new_entries(session, feed, response, backfill):
retrieved=retrieved,
title=p_entry.get('title'),
content=content,
content_cleaned=util.clean(content),
author_name=p_entry.get('author_detail', {}).get('name')
or default_author_name,
author_url=p_entry.get('author_detail', {}).get('href')
@ -346,6 +348,7 @@ def hentry_to_entry(hentry, feed, backfill):
updated=updated,
title=title,
content=content,
content_cleaned=util.clean(content),
author_name=hentry.get('author', {}).get('name'),
author_photo=hentry.get('author', {}).get('photo')
or (feed and fallback_photo(feed.origin)),

View file

@ -14,7 +14,7 @@
{% endif %}
{% if context.content %}
<div>
{{ context.content_cleaned() | add_preview }}
{{ context.content_cleaned | add_preview }}
</div>
{% endif %}
<footer>
@ -42,7 +42,7 @@
{% endif %}
{% if entry.content %}
<div>
{{ entry.content_cleaned() | add_preview }}
{{ entry.content_cleaned | add_preview }}
</div>
{% endif %}

18
woodwind/util.py Normal file
View file

@ -0,0 +1,18 @@
import bleach
import re
# Widen bleach's module-level whitelist of tags with the extra elements
# this application wants to keep in entry content. clean() below calls
# bleach.clean() without explicit tags or attributes.
# NOTE(review): this changes bleach's own defaults, so it presumably affects
# every bleach.clean() call in the process that does not pass explicit
# tags -- confirm.
bleach.ALLOWED_TAGS += ['a', 'img', 'p', 'br', 'marquee', 'blink',
'audio', 'video', 'table', 'tbody', 'td', 'tr']
# Per-tag attribute whitelist for the media and table tags added above;
# existing entries under other keys in ALLOWED_ATTRIBUTES are left untouched.
bleach.ALLOWED_ATTRIBUTES.update({
'img': ['src', 'alt', 'title'],
'audio': ['preload', 'controls', 'src'],
'video': ['preload', 'controls', 'src'],
'td': ['colspan'],
})
def clean(text):
    """Strip script tags and other possibly dangerous content.

    Whole ``<script>`` elements (tags and body) are removed first, then
    the remainder is passed through ``bleach.clean`` with ``strip=True``.

    :param text: HTML markup to sanitize, or None.
    :return: the sanitized markup, or None when *text* is None.
    """
    if text is None:
        return None
    # Match case-insensitively and tolerate whitespace inside the closing
    # tag, so variants such as <SCRIPT>...</SCRIPT > are removed as well.
    text = re.sub(r'<script.*?</script\s*>', '', text,
                  flags=re.DOTALL | re.IGNORECASE)
    return bleach.clean(text, strip=True)