Cache the result of running bleach in the db; it is too slow to do on render

This commit is contained in:
Kyle Mahan 2015-03-18 08:53:37 -07:00
parent a4113c9416
commit fa4064ab31
8 changed files with 59 additions and 17 deletions

View file

@ -0,0 +1,27 @@
"""One-off migration: add and backfill the ``entry.content_cleaned`` column.

Adds a ``content_cleaned`` text column to the ``entry`` table, then stores
``util.clean(entry.content)`` in it for every existing entry.
"""
from config import Config
import sqlalchemy
import sqlalchemy.exc
import sqlalchemy.orm
from woodwind.models import Entry
from woodwind import util

engine = sqlalchemy.create_engine(Config.SQLALCHEMY_DATABASE_URI)
Session = sqlalchemy.orm.sessionmaker(bind=engine)

# Best-effort schema change: presumably the statement fails when the column
# already exists (e.g. the script is re-run), which is why database errors
# are tolerated here. Only database-level errors are ignored, and they are
# reported, so unrelated failures (typos, Ctrl-C) are no longer swallowed.
try:
    engine.execute('alter table entry add column content_cleaned text')
except sqlalchemy.exc.DBAPIError as err:
    print('skipping alter table:', err)

# Create the session before entering the try block: if Session() itself
# raised inside it, the except/finally clauses would hit a NameError on
# ``session`` and hide the real error.
session = Session()
try:
    for entry in session.query(Entry).all():
        print('processing', entry.id)
        entry.content_cleaned = util.clean(entry.content)
    # Single commit: the backfill is applied all-or-nothing.
    session.commit()
except BaseException:
    # Roll back on any failure (including interruption), then re-raise so
    # the script still exits with the original error.
    session.rollback()
    raise
finally:
    session.close()

View file

@ -26,4 +26,6 @@ setup(name='Woodwind',
'rq', 'rq',
'uwsgi', 'uwsgi',
'websockets', 'websockets',
'pyOpenSSL',
'pyasn1',
]) ])

View file

@ -1,8 +1,10 @@
[uwsgi] [uwsgi]
master=true master=true
processes=1 processes=1
http=:4000 socket=/tmp/woodwind.sock
chmod-socket=666
module=woodwind.wsgi module=woodwind.wsgi
import=timers import=timers
attach-daemon=rqworker attach-daemon=rqworker
attach-daemon=python -m woodwind.websocket_server attach-daemon=python -m woodwind.websocket_server
python-auto-reload=true

View file

@ -1,15 +1,18 @@
from flask.ext.login import LoginManager from flask.ext.login import LoginManager
from flask.ext.micropub import MicropubClient from flask.ext.micropub import MicropubClient
from flask.ext.sqlalchemy import SQLAlchemy from flask.ext.sqlalchemy import SQLAlchemy
from flask_debugtoolbar import DebugToolbarExtension
db = SQLAlchemy() db = SQLAlchemy()
micropub = MicropubClient(client_id='http://reader.kylewm.com') micropub = MicropubClient(client_id='http://reader.kylewm.com')
login_mgr = LoginManager() login_mgr = LoginManager()
login_mgr.login_view = 'views.index' login_mgr.login_view = 'views.index'
toolbar = DebugToolbarExtension()
def init_app(app): def init_app(app):
db.init_app(app) db.init_app(app)
micropub.init_app(app) micropub.init_app(app)
login_mgr.init_app(app) login_mgr.init_app(app)
toolbar.init_app(app)

View file

@ -8,14 +8,6 @@ from sqlalchemy.ext.orderinglist import ordering_list
from sqlalchemy.ext.associationproxy import association_proxy from sqlalchemy.ext.associationproxy import association_proxy
bleach.ALLOWED_TAGS += ['a', 'img', 'p', 'br', 'marquee', 'blink',
'audio', 'video', 'table', 'tbody', 'td', 'tr']
bleach.ALLOWED_ATTRIBUTES.update({
'img': ['src', 'alt', 'title'],
'audio': ['preload', 'controls', 'src'],
'video': ['preload', 'controls', 'src'],
'td': ['colspan'],
})
class JsonType(db.TypeDecorator): class JsonType(db.TypeDecorator):
@ -135,6 +127,7 @@ class Entry(db.Model):
author_photo = db.Column(db.String(512)) author_photo = db.Column(db.String(512))
title = db.Column(db.String(512)) title = db.Column(db.String(512))
content = db.Column(db.Text) content = db.Column(db.Text)
content_cleaned = db.Column(db.Text)
# other properties # other properties
properties = db.Column(JsonType) properties = db.Column(JsonType)
# # association with the InReplyTo objects # # association with the InReplyTo objects
@ -153,12 +146,6 @@ class Entry(db.Model):
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
self._syndicated_copies = [] self._syndicated_copies = []
def content_cleaned(self):
if self.content:
text = self.content
text = re.sub('<script.*?</script>', '', text, flags=re.DOTALL)
return bleach.clean(text, strip=True)
def get_property(self, key, default=None): def get_property(self, key, default=None):
if self.properties is None: if self.properties is None:
return default return default

View file

@ -2,6 +2,7 @@ from config import Config
from contextlib import contextmanager from contextlib import contextmanager
from redis import StrictRedis from redis import StrictRedis
from woodwind.models import Feed, Entry from woodwind.models import Feed, Entry
from woodwind import util
import bs4 import bs4
import datetime import datetime
import feedparser import feedparser
@ -290,6 +291,7 @@ def process_xml_feed_for_new_entries(session, feed, response, backfill):
retrieved=retrieved, retrieved=retrieved,
title=p_entry.get('title'), title=p_entry.get('title'),
content=content, content=content,
content_cleaned=util.clean(content),
author_name=p_entry.get('author_detail', {}).get('name') author_name=p_entry.get('author_detail', {}).get('name')
or default_author_name, or default_author_name,
author_url=p_entry.get('author_detail', {}).get('href') author_url=p_entry.get('author_detail', {}).get('href')
@ -346,6 +348,7 @@ def hentry_to_entry(hentry, feed, backfill):
updated=updated, updated=updated,
title=title, title=title,
content=content, content=content,
content_cleaned=util.clean(content),
author_name=hentry.get('author', {}).get('name'), author_name=hentry.get('author', {}).get('name'),
author_photo=hentry.get('author', {}).get('photo') author_photo=hentry.get('author', {}).get('photo')
or (feed and fallback_photo(feed.origin)), or (feed and fallback_photo(feed.origin)),

View file

@ -14,7 +14,7 @@
{% endif %} {% endif %}
{% if context.content %} {% if context.content %}
<div> <div>
{{ context.content_cleaned() | add_preview }} {{ context.content_cleaned | add_preview }}
</div> </div>
{% endif %} {% endif %}
<footer> <footer>
@ -42,7 +42,7 @@
{% endif %} {% endif %}
{% if entry.content %} {% if entry.content %}
<div> <div>
{{ entry.content_cleaned() | add_preview }} {{ entry.content_cleaned | add_preview }}
</div> </div>
{% endif %} {% endif %}

18
woodwind/util.py Normal file
View file

@ -0,0 +1,18 @@
import bleach
import re
# Extend bleach's module-level tag whitelist at import time with the extra
# tags feed content is allowed to keep.
# NOTE(review): clean() below calls bleach.clean() without explicit
# tags/attributes, so it presumably relies on these module-level values being
# bleach's defaults -- confirm against the installed bleach version.
bleach.ALLOWED_TAGS += ['a', 'img', 'p', 'br', 'marquee', 'blink',
'audio', 'video', 'table', 'tbody', 'td', 'tr']
# Add per-tag attribute allowances for the media and table tags listed above;
# update() replaces any existing entry for the same tag name.
bleach.ALLOWED_ATTRIBUTES.update({
'img': ['src', 'alt', 'title'],
'audio': ['preload', 'controls', 'src'],
'video': ['preload', 'controls', 'src'],
'td': ['colspan'],
})
# Matches a whole <script>...</script> element, body included. DOTALL lets the
# body span lines; IGNORECASE and the optional whitespace before '>' also
# catch variants such as <SCRIPT> or </script >, so the script source does
# not survive as plain text once bleach strips the surrounding tags.
_SCRIPT_RE = re.compile(r'<script.*?</script\s*>', re.DOTALL | re.IGNORECASE)


def clean(text):
    """Strip script tags and other possibly dangerous content.

    Script elements are removed together with their bodies first; the
    remainder is then passed through ``bleach.clean`` with ``strip=True``.

    :param text: HTML markup to sanitize, or None.
    :return: the sanitized markup, or None when ``text`` is None.
    """
    if text is None:
        return None
    text = _SCRIPT_RE.sub('', text)
    return bleach.clean(text, strip=True)