Cache the result of running bleach in the db; it is too slow to do on render
This commit is contained in:
parent
a4113c9416
commit
fa4064ab31
8 changed files with 59 additions and 17 deletions
27
migrations/20150318-clean-content.py
Normal file
27
migrations/20150318-clean-content.py
Normal file
|
@ -0,0 +1,27 @@
|
||||||
|
"""One-off migration: add ``entry.content_cleaned`` and backfill it.

Adds a ``content_cleaned`` text column to the ``entry`` table, then stores
``util.clean(entry.content)`` in that column for every existing Entry.
"""
from config import Config
import sqlalchemy
import sqlalchemy.exc
import sqlalchemy.orm
from woodwind.models import Entry
from woodwind import util

engine = sqlalchemy.create_engine(Config.SQLALCHEMY_DATABASE_URI)
Session = sqlalchemy.orm.sessionmaker(bind=engine)

# Best-effort schema change: the statement fails when the column is already
# there (e.g. the migration is being re-run), and that case is tolerated.
# Only database errors are ignored -- anything else (KeyboardInterrupt,
# programming mistakes) still propagates instead of being silently dropped.
# NOTE(review): which error class a driver raises for a duplicate column is
# backend-specific; both common ones are accepted here -- confirm for the
# deployed database.
try:
    engine.execute('alter table entry add column content_cleaned text')
except (sqlalchemy.exc.OperationalError,
        sqlalchemy.exc.ProgrammingError) as err:
    print('skipping alter table:', err)

# The session is created before the try block so that the rollback/close
# handlers below can never run against an unbound name if Session() fails.
session = Session()
try:
    for entry in session.query(Entry).all():
        print('processing', entry.id)
        entry.content_cleaned = util.clean(entry.content)

    # Single commit after the loop: either every row is backfilled or none.
    session.commit()
except BaseException:
    # Undo the partial backfill, then let the original error surface.
    session.rollback()
    raise
finally:
    session.close()
|
2
setup.py
2
setup.py
|
@ -26,4 +26,6 @@ setup(name='Woodwind',
|
||||||
'rq',
|
'rq',
|
||||||
'uwsgi',
|
'uwsgi',
|
||||||
'websockets',
|
'websockets',
|
||||||
|
'pyOpenSSL',
|
||||||
|
'pyasn1',
|
||||||
])
|
])
|
||||||
|
|
|
@ -1,8 +1,10 @@
|
||||||
[uwsgi]
|
[uwsgi]
|
||||||
master=true
|
master=true
|
||||||
processes=1
|
processes=1
|
||||||
http=:4000
|
socket=/tmp/woodwind.sock
|
||||||
|
chmod-socket=666
|
||||||
module=woodwind.wsgi
|
module=woodwind.wsgi
|
||||||
import=timers
|
import=timers
|
||||||
attach-daemon=rqworker
|
attach-daemon=rqworker
|
||||||
attach-daemon=python -m woodwind.websocket_server
|
attach-daemon=python -m woodwind.websocket_server
|
||||||
|
python-auto-reload=true
|
||||||
|
|
|
@ -1,15 +1,18 @@
|
||||||
from flask.ext.login import LoginManager
|
from flask.ext.login import LoginManager
|
||||||
from flask.ext.micropub import MicropubClient
|
from flask.ext.micropub import MicropubClient
|
||||||
from flask.ext.sqlalchemy import SQLAlchemy
|
from flask.ext.sqlalchemy import SQLAlchemy
|
||||||
|
from flask_debugtoolbar import DebugToolbarExtension
|
||||||
|
|
||||||
|
|
||||||
db = SQLAlchemy()
|
db = SQLAlchemy()
|
||||||
micropub = MicropubClient(client_id='http://reader.kylewm.com')
|
micropub = MicropubClient(client_id='http://reader.kylewm.com')
|
||||||
login_mgr = LoginManager()
|
login_mgr = LoginManager()
|
||||||
login_mgr.login_view = 'views.index'
|
login_mgr.login_view = 'views.index'
|
||||||
|
toolbar = DebugToolbarExtension()
|
||||||
|
|
||||||
|
|
||||||
def init_app(app):
|
def init_app(app):
|
||||||
db.init_app(app)
|
db.init_app(app)
|
||||||
micropub.init_app(app)
|
micropub.init_app(app)
|
||||||
login_mgr.init_app(app)
|
login_mgr.init_app(app)
|
||||||
|
toolbar.init_app(app)
|
||||||
|
|
|
@ -8,14 +8,6 @@ from sqlalchemy.ext.orderinglist import ordering_list
|
||||||
from sqlalchemy.ext.associationproxy import association_proxy
|
from sqlalchemy.ext.associationproxy import association_proxy
|
||||||
|
|
||||||
|
|
||||||
bleach.ALLOWED_TAGS += ['a', 'img', 'p', 'br', 'marquee', 'blink',
|
|
||||||
'audio', 'video', 'table', 'tbody', 'td', 'tr']
|
|
||||||
bleach.ALLOWED_ATTRIBUTES.update({
|
|
||||||
'img': ['src', 'alt', 'title'],
|
|
||||||
'audio': ['preload', 'controls', 'src'],
|
|
||||||
'video': ['preload', 'controls', 'src'],
|
|
||||||
'td': ['colspan'],
|
|
||||||
})
|
|
||||||
|
|
||||||
|
|
||||||
class JsonType(db.TypeDecorator):
|
class JsonType(db.TypeDecorator):
|
||||||
|
@ -135,6 +127,7 @@ class Entry(db.Model):
|
||||||
author_photo = db.Column(db.String(512))
|
author_photo = db.Column(db.String(512))
|
||||||
title = db.Column(db.String(512))
|
title = db.Column(db.String(512))
|
||||||
content = db.Column(db.Text)
|
content = db.Column(db.Text)
|
||||||
|
content_cleaned = db.Column(db.Text)
|
||||||
# other properties
|
# other properties
|
||||||
properties = db.Column(JsonType)
|
properties = db.Column(JsonType)
|
||||||
# # association with the InReplyTo objects
|
# # association with the InReplyTo objects
|
||||||
|
@ -153,12 +146,6 @@ class Entry(db.Model):
|
||||||
super().__init__(*args, **kwargs)
|
super().__init__(*args, **kwargs)
|
||||||
self._syndicated_copies = []
|
self._syndicated_copies = []
|
||||||
|
|
||||||
def content_cleaned(self):
|
|
||||||
if self.content:
|
|
||||||
text = self.content
|
|
||||||
text = re.sub('<script.*?</script>', '', text, flags=re.DOTALL)
|
|
||||||
return bleach.clean(text, strip=True)
|
|
||||||
|
|
||||||
def get_property(self, key, default=None):
|
def get_property(self, key, default=None):
|
||||||
if self.properties is None:
|
if self.properties is None:
|
||||||
return default
|
return default
|
||||||
|
|
|
@ -2,6 +2,7 @@ from config import Config
|
||||||
from contextlib import contextmanager
|
from contextlib import contextmanager
|
||||||
from redis import StrictRedis
|
from redis import StrictRedis
|
||||||
from woodwind.models import Feed, Entry
|
from woodwind.models import Feed, Entry
|
||||||
|
from woodwind import util
|
||||||
import bs4
|
import bs4
|
||||||
import datetime
|
import datetime
|
||||||
import feedparser
|
import feedparser
|
||||||
|
@ -290,6 +291,7 @@ def process_xml_feed_for_new_entries(session, feed, response, backfill):
|
||||||
retrieved=retrieved,
|
retrieved=retrieved,
|
||||||
title=p_entry.get('title'),
|
title=p_entry.get('title'),
|
||||||
content=content,
|
content=content,
|
||||||
|
content_cleaned=util.clean(content),
|
||||||
author_name=p_entry.get('author_detail', {}).get('name')
|
author_name=p_entry.get('author_detail', {}).get('name')
|
||||||
or default_author_name,
|
or default_author_name,
|
||||||
author_url=p_entry.get('author_detail', {}).get('href')
|
author_url=p_entry.get('author_detail', {}).get('href')
|
||||||
|
@ -346,6 +348,7 @@ def hentry_to_entry(hentry, feed, backfill):
|
||||||
updated=updated,
|
updated=updated,
|
||||||
title=title,
|
title=title,
|
||||||
content=content,
|
content=content,
|
||||||
|
content_cleaned=util.clean(content),
|
||||||
author_name=hentry.get('author', {}).get('name'),
|
author_name=hentry.get('author', {}).get('name'),
|
||||||
author_photo=hentry.get('author', {}).get('photo')
|
author_photo=hentry.get('author', {}).get('photo')
|
||||||
or (feed and fallback_photo(feed.origin)),
|
or (feed and fallback_photo(feed.origin)),
|
||||||
|
|
|
@ -14,7 +14,7 @@
|
||||||
{% endif %}
|
{% endif %}
|
||||||
{% if context.content %}
|
{% if context.content %}
|
||||||
<div>
|
<div>
|
||||||
{{ context.content_cleaned() | add_preview }}
|
{{ context.content_cleaned | add_preview }}
|
||||||
</div>
|
</div>
|
||||||
{% endif %}
|
{% endif %}
|
||||||
<footer>
|
<footer>
|
||||||
|
@ -42,7 +42,7 @@
|
||||||
{% endif %}
|
{% endif %}
|
||||||
{% if entry.content %}
|
{% if entry.content %}
|
||||||
<div>
|
<div>
|
||||||
{{ entry.content_cleaned() | add_preview }}
|
{{ entry.content_cleaned | add_preview }}
|
||||||
</div>
|
</div>
|
||||||
{% endif %}
|
{% endif %}
|
||||||
|
|
||||||
|
|
18
woodwind/util.py
Normal file
18
woodwind/util.py
Normal file
|
@ -0,0 +1,18 @@
|
||||||
|
import bleach
|
||||||
|
import re
|
||||||
|
|
||||||
|
# Widen bleach's module-level whitelist in place so every later call to
# bleach.clean() in this process accepts the extra markup used by feeds.
bleach.ALLOWED_TAGS.extend([
    'a', 'img', 'p', 'br', 'marquee', 'blink',
    'audio', 'video', 'table', 'tbody', 'td', 'tr',
])

# Per-tag attribute whitelist for the additional tags.
bleach.ALLOWED_ATTRIBUTES['img'] = ['src', 'alt', 'title']
bleach.ALLOWED_ATTRIBUTES['audio'] = ['preload', 'controls', 'src']
bleach.ALLOWED_ATTRIBUTES['video'] = ['preload', 'controls', 'src']
bleach.ALLOWED_ATTRIBUTES['td'] = ['colspan']
|
||||||
|
|
||||||
|
# Matches a whole <script>...</script> element including its body.
# IGNORECASE: HTML tag names are case-insensitive, so <SCRIPT> must match too.
# DOTALL: script bodies normally span several lines.
# The closing tag may carry whitespace before '>' (e.g. "</script >").
_SCRIPT_ELEMENT_RE = re.compile(r'<script.*?</script\s*>',
                                re.DOTALL | re.IGNORECASE)


def clean(text):
    """Strip script tags and other possibly dangerous content.

    Script elements are removed together with their bodies first; otherwise
    the sanitizer would drop only the tags and leave the JavaScript source
    behind as visible text. The remainder is passed to ``bleach.clean`` with
    ``strip=True``.

    :param text: HTML markup to sanitize, or None.
    :return: the sanitized markup, or None when ``text`` is None.
    """
    if text is None:
        return None

    without_scripts = _SCRIPT_ELEMENT_RE.sub('', text)
    return bleach.clean(without_scripts, strip=True)
|
Loading…
Add table
Add a link
Reference in a new issue