From 9e1e3ad03f1230dde30a267ac2217b15e2c92d6d Mon Sep 17 00:00:00 2001 From: Kyle Mahan Date: Tue, 27 Jan 2015 15:58:08 -0800 Subject: [PATCH] add celery scheduler and break app up into modules --- .gitignore | 2 + celeryconfig.py | 13 ++ config.py | 6 + app.py => woodwind.py | 0 woodwind/__init__.py | 3 + woodwind/__main__.py | 10 + woodwind/app.py | 12 + woodwind/extensions.py | 15 ++ woodwind/models.py | 82 +++++++ {static => woodwind/static}/normalize.scss | 0 {static => woodwind/static}/style.scss | 2 +- woodwind/tasks.py | 173 ++++++++++++++ {templates => woodwind/templates}/base.jinja2 | 0 {templates => woodwind/templates}/feed.jinja2 | 9 +- woodwind/templates/feeds.jinja2 | 18 ++ .../templates}/login.jinja2 | 0 .../templates}/select-feed.jinja2 | 0 .../templates}/subscribe.jinja2 | 0 woodwind/views.py | 213 ++++++++++++++++++ 19 files changed, 555 insertions(+), 3 deletions(-) create mode 100644 .gitignore create mode 100644 celeryconfig.py create mode 100644 config.py rename app.py => woodwind.py (100%) create mode 100644 woodwind/__init__.py create mode 100644 woodwind/__main__.py create mode 100644 woodwind/app.py create mode 100644 woodwind/extensions.py create mode 100644 woodwind/models.py rename {static => woodwind/static}/normalize.scss (100%) rename {static => woodwind/static}/style.scss (96%) create mode 100644 woodwind/tasks.py rename {templates => woodwind/templates}/base.jinja2 (100%) rename {templates => woodwind/templates}/feed.jinja2 (67%) create mode 100644 woodwind/templates/feeds.jinja2 rename {templates => woodwind/templates}/login.jinja2 (100%) rename {templates => woodwind/templates}/select-feed.jinja2 (100%) rename {templates => woodwind/templates}/subscribe.jinja2 (100%) create mode 100644 woodwind/views.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e0b0dde --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +*.css +*.css.map \ No newline at end of file diff --git a/celeryconfig.py b/celeryconfig.py new file 
"""Celery settings for the woodwind feed poller."""
import datetime

# Redis is used both as the message broker and as the result backend.
BROKER_URL = 'redis://'
CELERY_RESULT_BACKEND = 'redis'

# Only JSON is produced and accepted for task messages and results.
CELERY_TASK_SERIALIZER = 'json'
CELERY_RESULT_SERIALIZER = 'json'
CELERY_ACCEPT_CONTENT = ['json']

CELERYBEAT_SCHEDULE = {
    'tick-every-5-minutes': {
        # Celery derives a task's name from the module that defines it, and
        # ``tick`` lives in ``woodwind/tasks.py``; the bare name
        # ``tasks.tick`` would never match a registered task.
        'task': 'woodwind.tasks.tick',
        'schedule': datetime.timedelta(minutes=5),
    },
}
"""Database models: users, the feeds they follow, and fetched entries."""
import bleach
from .extensions import db


# Extend bleach's default whitelist so entry content keeps links, images and
# basic paragraph markup when it is cleaned for display.
bleach.ALLOWED_TAGS += ['a', 'img', 'p', 'br', 'marquee', 'blink']
bleach.ALLOWED_ATTRIBUTES.update({
    'img': ['src', 'alt', 'title']
})


class User(db.Model):
    """An account, identified by the domain its owner signs in with."""

    id = db.Column(db.Integer, primary_key=True)
    # domain the user signed in with; doubles as the Flask-Login id
    domain = db.Column(db.String(256))
    micropub_endpoint = db.Column(db.String(512))
    access_token = db.Column(db.String(512))

    # Flask-Login integration
    def is_authenticated(self):
        """Every stored user counts as authenticated."""
        return True

    def is_active(self):
        """Accounts cannot be deactivated."""
        return True

    def is_anonymous(self):
        """Stored users are never anonymous."""
        return False

    def get_id(self):
        """Return the identifier Flask-Login stores in the session."""
        return self.domain

    def __eq__(self, other):
        """Two users are equal when they have the same domain."""
        if type(other) is type(self):
            return self.domain == other.domain
        return False

    def __hash__(self):
        """Hash on the domain so the hash agrees with ``__eq__``.

        Defining ``__eq__`` alone sets ``__hash__`` to ``None`` on Python 3,
        which makes instances unusable in sets and as dict keys.
        """
        return hash(self.domain)

    def __repr__(self):
        return '<User:{}>'.format(self.domain)


class Feed(db.Model):
    """A subscription of one user to one feed document."""

    id = db.Column(db.Integer, primary_key=True)
    user_id = db.Column(db.Integer, db.ForeignKey(User.id))
    user = db.relationship(User, backref='feeds')
    # the name of this feed
    name = db.Column(db.String(256))
    # url that we subscribed to; periodically check if the feed url
    # has changed
    origin = db.Column(db.String(512))
    # url of the feed itself
    feed = db.Column(db.String(512))
    # h-feed, xml, etc.
    type = db.Column(db.String(64))
    # last time this feed returned new data
    last_updated = db.Column(db.DateTime)
    # last time we checked this feed
    last_checked = db.Column(db.DateTime)
    etag = db.Column(db.String(512))

    def __repr__(self):
        return '<Feed:{},{}>'.format(self.name, self.feed)


class Entry(db.Model):
    """A single post retrieved from a feed."""

    id = db.Column(db.Integer, primary_key=True)
    feed_id = db.Column(db.Integer, db.ForeignKey(Feed.id))
    feed = db.relationship(Feed, backref='entries')
    published = db.Column(db.DateTime)
    updated = db.Column(db.DateTime)
    # when we fetched the entry
    retrieved = db.Column(db.DateTime)
    # identifier used to recognise an entry we have already stored
    uid = db.Column(db.String(512))
    permalink = db.Column(db.String(512))
    author_name = db.Column(db.String(512))
    author_url = db.Column(db.String(512))
    author_photo = db.Column(db.String(512))
    title = db.Column(db.String(512))
    content = db.Column(db.Text)

    def content_cleaned(self):
        """Return the content with disallowed markup stripped.

        Returns ``None`` when the entry has no content.
        """
        if self.content:
            return bleach.clean(self.content, strip=True)
        return None

    def __repr__(self):
        return '<Entry:{},{}>'.format(
            self.title, (self.content or '')[:140])
"""Celery tasks that poll subscribed feeds and store their new entries."""
from woodwind.extensions import db
from woodwind.models import Feed, Entry

import celery
import requests
import celery.utils.log
import feedparser
import mf2py
import mf2util
import time
import urllib.parse
import datetime

# A feed is polled again once this much time has passed since the last check.
UPDATE_INTERVAL = datetime.timedelta(hours=1)

queue = celery.Celery('woodwind')
queue.config_from_object('celeryconfig')

logger = celery.utils.log.get_task_logger(__name__)

# Flask application built on first use by _app_context().
_app = None


def _app_context():
    """Return an application context for code that runs outside a request.

    The models are bound to a Flask-SQLAlchemy instance that is configured
    with ``init_app``, so ``Feed.query`` and ``db.session`` only work while
    an application context is active.
    """
    global _app
    import flask
    if flask.has_app_context():
        return flask.current_app.app_context()
    if _app is None:
        # Imported lazily: woodwind.app imports the views, which import this
        # module, so a top-level import would be circular.
        from woodwind.app import create_app
        _app = create_app()
    return _app.app_context()


def _existing_uids(feed, uids):
    """Return the subset of *uids* already stored as entries of *feed*."""
    uids = [uid for uid in uids if uid]
    if not uids:
        # Nothing to look up; also avoids an ``IN ()`` query.
        return set()
    rows = (db.session.query(Entry.uid)
            .filter(Entry.uid.in_(uids))
            .filter(Entry.feed == feed))
    return set(row[0] for row in rows)


def _to_datetime(parsed_time):
    """Convert a feedparser ``*_parsed`` struct_time to a naive UTC datetime.

    feedparser normalises parsed dates to UTC, so the fields are used as they
    are; ``time.mktime`` would wrongly treat them as local time. Returns
    ``None`` when the value is missing or not a valid date.
    """
    if not parsed_time:
        return None
    try:
        return datetime.datetime(*parsed_time[:6])
    except (TypeError, ValueError):
        return None


@queue.task
def tick():
    """Queue an update for every feed that has not been checked recently."""
    with _app_context():
        now = datetime.datetime.utcnow()
        logger.debug('Tick %s', now)
        for feed in Feed.query.all():
            logger.debug('Feed %s last checked %s', feed, feed.last_checked)
            if (not feed.last_checked
                    or now - feed.last_checked > UPDATE_INTERVAL):
                update_feed.delay(feed.id)


@queue.task
def update_feed(feed_id):
    """Fetch the feed with id *feed_id* and store any entries that are new."""
    with _app_context():
        feed = Feed.query.get(feed_id)
        if feed is None:
            # The feed may have been deleted after the task was queued.
            logger.warning('Cannot update feed %s: no such feed', feed_id)
            return
        logger.info('Updating %s', feed)
        for entry in process_feed_for_new_entries(feed):
            logger.debug('Got new entry: %s', entry)


def process_feed_for_new_entries(feed):
    """Poll *feed* and return the list of entries that were newly stored.

    ``feed.last_checked`` is always updated; ``feed.last_updated`` only when
    at least one new entry was found. Feeds of an unknown type yield an
    empty list. Errors raised while fetching or parsing propagate to the
    caller after the timestamps have been saved.
    """
    new_entries = []
    try:
        if feed.type == 'xml':
            # Materialise the generator so the entries are stored before the
            # timestamps below are written, and so the emptiness test is real
            # (a generator object is always truthy).
            new_entries = list(process_xml_feed_for_new_entries(feed))
        elif feed.type == 'html':
            new_entries = list(process_html_feed_for_new_entries(feed))
        return new_entries
    except Exception:
        # Leave the session usable for the commit in ``finally``.
        db.session.rollback()
        raise
    finally:
        now = datetime.datetime.utcnow()
        feed.last_checked = now
        if new_entries:
            feed.last_updated = now
        db.session.commit()


def process_xml_feed_for_new_entries(feed):
    """Parse an RSS/Atom feed and yield an Entry for each unseen item.

    Every yielded entry has already been added to the session and committed.
    """
    logger.debug('updating feed: %s', feed)

    now = datetime.datetime.utcnow()
    parsed = feedparser.parse(feed.feed)

    feed_props = parsed.get('feed', {})
    feed_author = feed_props.get('author_detail', {})
    default_author_url = feed_author.get('href')
    default_author_name = feed_author.get('name')
    default_author_photo = feed_props.get('logo')

    # ``.get`` is used throughout because items may lack any of these keys.
    seen = _existing_uids(
        feed, [e.get('id') or e.get('link') for e in parsed.entries])

    for p_entry in parsed.entries:
        permalink = p_entry.get('link')
        uid = p_entry.get('id') or permalink

        if not uid or uid in seen:
            continue
        # Guards against the same uid appearing twice in one document.
        seen.add(uid)

        updated = _to_datetime(p_entry.get('updated_parsed'))
        published = _to_datetime(p_entry.get('published_parsed'))

        title = p_entry.get('title')

        content_list = p_entry.get('content')
        if content_list:
            content = content_list[0].value
        else:
            content = p_entry.get('summary')

        # Drop titles that merely repeat the beginning of the content.
        if title and content:
            title_trimmed = title.rstrip('...').rstrip('…')
            if content.startswith(title_trimmed):
                title = None

        entry_author = p_entry.get('author_detail', {})
        entry = Entry(
            feed=feed,
            published=published,
            updated=updated,
            uid=uid,
            permalink=permalink,
            retrieved=now,
            title=title,
            content=content,
            author_name=entry_author.get('name') or default_author_name,
            author_url=entry_author.get('href') or default_author_url,
            author_photo=default_author_photo
            or fallback_photo(feed.origin))

        db.session.add(entry)
        db.session.commit()
        yield entry


def process_html_feed_for_new_entries(feed):
    """Parse an h-feed page and yield an Entry for each unseen h-entry.

    Every yielded entry has already been added to the session and committed.
    """
    logger.debug('updating feed: %s', feed)

    now = datetime.datetime.utcnow()
    parsed = mf2util.interpret_feed(
        mf2py.parse(url=feed.feed), feed.feed)
    hfeed = parsed.get('entries', [])

    seen = _existing_uids(
        feed, [e.get('uid') or e.get('url') for e in hfeed])

    for hentry in hfeed:
        permalink = hentry.get('url')
        uid = hentry.get('uid') or permalink

        if not uid or uid in seen:
            continue
        # Guards against the same uid appearing twice in one document.
        seen.add(uid)

        author = hentry.get('author') or {}
        entry = Entry(
            feed=feed,
            published=hentry.get('published'),
            updated=hentry.get('updated'),
            uid=uid,
            permalink=permalink,
            retrieved=now,
            title=hentry.get('name'),
            content=hentry.get('content'),
            author_name=author.get('name'),
            author_photo=author.get('photo')
            or fallback_photo(feed.origin),
            author_url=author.get('url'))
        db.session.add(entry)
        db.session.commit()
        logger.debug('saved entry: %s', entry.permalink)
        yield entry
def fallback_photo(url):
    """Return a favicon URL for the site that hosts *url*.

    The icon is served by Google's ``s2/favicons`` endpoint, keyed on the
    network location (host and optional port) of *url*.

    Returns ``None`` when *url* is empty or has no network location (for
    example a relative URL), so callers do not store a link that cannot
    resolve to any site.
    """
    domain = urllib.parse.urlparse(url).netloc if url else ''
    if not domain:
        return None
    # Escape the host so it cannot inject extra query parameters; the
    # characters that legitimately occur in a netloc are left alone.
    return ('http://www.google.com/s2/favicons?domain='
            + urllib.parse.quote(domain, safe=':@[]'))
- - {{ entry.author_name }} - {{ entry.feed.name }} + {% if entry.author_photo %} + + {% endif %} + {% if entry.author_name %} + {{ entry.author_name }} - + {% endif %} + {{ entry.feed.name }}
{% if entry.title %}

{{ entry.title }}

diff --git a/woodwind/templates/feeds.jinja2 b/woodwind/templates/feeds.jinja2 new file mode 100644 index 0000000..d1f0ff9 --- /dev/null +++ b/woodwind/templates/feeds.jinja2 @@ -0,0 +1,18 @@ +{% extends "base.jinja2" %} +{% block body %} + +
+ {% for feed in feeds %} + +
+ + + + +
+ + +{% endfor %} +
"""Web UI: reading entries, managing subscriptions, and signing in."""
from . import tasks
from .extensions import db, login_mgr, micropub
from .models import Feed, Entry, User
import flask.ext.login as flask_login
import bs4
import feedparser
import flask
import mf2py
import mf2util
import requests
# The views call urllib.parse.*; a bare ``import urllib`` does not load that
# submodule by itself.
import urllib.parse

ui = flask.Blueprint('ui', __name__)


@ui.route('/')
def index():
    """Show the 100 most recently published entries from the user's feeds.

    Anonymous visitors, and users without subscriptions, get an empty list.
    """
    entries = []
    if flask_login.current_user.is_authenticated():
        feed_ids = [f.id for f in flask_login.current_user.feeds]
        # Skip the query when there are no feeds rather than emit ``IN ()``.
        if feed_ids:
            entries = (Entry.query
                       .filter(Entry.feed_id.in_(feed_ids))
                       .order_by(Entry.published.desc())
                       .limit(100)
                       .all())
    return flask.render_template('feed.jinja2', entries=entries)


@ui.route('/install')
def install():
    """Recreate the database with a single seed user and log that user in.

    This drops every table, so it only runs while the application is in
    debug mode; otherwise the route answers 404.
    """
    if not flask.current_app.debug:
        flask.abort(404)

    db.drop_all()
    db.create_all()

    user = User(domain='kylewm.com',)
    db.session.add(user)
    db.session.commit()

    flask_login.login_user(user)

    return 'Success!'
def _get_feed_or_404(feed_id):
    """Return the Feed with id *feed_id*, aborting with 404 if none exists."""
    # NOTE(review): nothing here checks that the feed belongs to the current
    # user; confirm whether per-user ownership should be enforced.
    feed = Feed.query.get(feed_id) if feed_id else None
    if feed is None:
        flask.abort(404)
    return feed


@ui.route('/feeds')
def feeds():
    """List the current user's subscriptions."""
    if not flask_login.current_user.is_authenticated():
        # The anonymous user is not a mapped object and cannot be compared
        # against Feed.user in a query.
        return flask.redirect(flask.url_for('.login'))
    user_feeds = Feed.query.filter(
        Feed.user == flask_login.current_user).all()
    return flask.render_template('feeds.jinja2', feeds=user_feeds)


@ui.route('/update_feed')
def update_feed():
    """Queue a background refresh of the feed given by the ``id`` argument."""
    feed = _get_feed_or_404(flask.request.args.get('id'))
    tasks.update_feed.delay(feed.id)
    # A view has to return a response; go back to the feed list.
    flask.flash('Updating {} ({})'.format(feed.name, feed.feed))
    return flask.redirect(flask.url_for('.feeds'))


@ui.route('/delete_feed')
def delete_feed():
    """Delete the feed given by the ``id`` argument."""
    feed = _get_feed_or_404(flask.request.args.get('id'))
    # Build the message before deleting; the instance should not be read
    # once its row is gone.
    message = 'Deleted {} ({})'.format(feed.name, feed.feed)
    db.session.delete(feed)
    db.session.commit()
    flask.flash(message)
    return flask.redirect(flask.url_for('.feeds'))


@ui.route('/edit_feed', methods=['POST'])
def edit_feed():
    """Rename a feed and/or change its URL from the submitted form.

    Empty ``name`` or ``feed`` fields leave the stored value untouched.
    """
    feed = _get_feed_or_404(flask.request.form.get('id'))
    feed_name = flask.request.form.get('name')
    feed_url = flask.request.form.get('feed')

    if feed_name:
        feed.name = feed_name
    if feed_url:
        feed.feed = feed_url

    db.session.commit()
    flask.flash('Edited {} ({})'.format(feed.name, feed.feed))
    return flask.redirect(flask.url_for('.feeds'))


@ui.route('/login')
def login():
    """Start the micropub sign-in when ``me`` is given, else show the form."""
    # Development shortcut: sign in as the first stored user. It is limited
    # to debug mode so that a deployed site does not hand that account to
    # every visitor, and it tolerates an empty user table.
    if flask.current_app.debug:
        dev_user = User.query.first()
        if dev_user is not None:
            flask_login.login_user(dev_user, remember=True)

    me = flask.request.args.get('me')
    if me:
        return micropub.authorize(
            me, flask.url_for('.login_callback', _external=True),
            next_url=flask.request.args.get('next'),
            scope='write')
    return flask.render_template('login.jinja2')


@ui.route('/login-callback')
@micropub.authorized_handler
def login_callback(resp):
    """Finish sign-in: create or update the user for the verified domain."""
    if not resp.me:
        # ``format`` copes with a missing error message, unlike ``+``.
        flask.flash('Login error: {}'.format(resp.error))
        return flask.redirect(flask.url_for('.login'))

    domain = urllib.parse.urlparse(resp.me).netloc
    user = load_user(domain)
    if not user:
        user = User()
        user.domain = domain
        db.session.add(user)

    user.micropub_endpoint = resp.micropub_endpoint
    user.access_token = resp.access_token
    db.session.commit()

    flask_login.login_user(user, remember=True)
    return flask.redirect(resp.next_url or flask.url_for('.index'))


@login_mgr.user_loader
def load_user(domain):
    """Return the user stored for *domain*, or ``None`` if there is none."""
    return User.query.filter_by(domain=domain).first()
@ui.route('/subscribe', methods=['GET', 'POST'])
def subscribe():
    """Show the subscribe form (GET) or add a subscription (POST).

    The form posts ``origin`` and, once the user has picked from several
    candidates, ``feed`` in the form ``type|url``. A missing ``origin`` or a
    malformed ``feed`` value is answered with 400.
    """
    if flask.request.method != 'POST':
        return flask.render_template('subscribe.jinja2')

    origin = flask.request.form.get('origin')
    if not origin:
        flask.abort(400)

    typed_feed = flask.request.form.get('feed')
    if typed_feed:
        feed_type, sep, feed_url = typed_feed.partition('|')
        if not sep or not feed_url:
            flask.abort(400)
    else:
        candidates = find_possible_feeds(origin)
        if not candidates:
            flask.flash('No feeds found for: ' + origin)
            return flask.redirect(flask.url_for('.subscribe'))
        if len(candidates) > 1:
            return flask.render_template(
                'select-feed.jinja2', origin=origin, feeds=candidates)
        feed_url = candidates[0]['feed']
        feed_type = candidates[0]['type']

    new_feed = add_subscription(origin, feed_url, feed_type)
    if new_feed is None:
        flask.flash('Unsupported feed type: {}'.format(feed_type))
        return flask.redirect(flask.url_for('.subscribe'))
    flask.flash('Successfully subscribed to: {}'.format(new_feed.name))
    return flask.redirect(flask.url_for('.index'))


def add_subscription(origin, feed, type):
    """Store a new Feed for the current user and return it.

    :param origin: the URL the user asked to subscribe to
    :param feed: the URL of the feed document itself
    :param type: ``'html'`` (h-feed) or ``'xml'`` (RSS/Atom)
    :return: the committed Feed, or ``None`` when *type* is not supported

    The feed's name comes from the document; when the name is missing or
    longer than 140 characters, the host and path of *origin* are used.
    """
    if type == 'html':
        parsed = mf2util.interpret_feed(mf2py.parse(url=feed), feed)
        name = parsed.get('name')
    elif type == 'xml':
        # ``.get``: feeds are not required to carry a title.
        name = feedparser.parse(feed).feed.get('title')
    else:
        return None

    if not name or len(name) > 140:
        p = urllib.parse.urlparse(origin)
        name = p.netloc + p.path

    # Hand the ORM the real user object rather than Flask-Login's proxy.
    user = flask_login.current_user._get_current_object()
    new_feed = Feed(user=user, name=name,
                    origin=origin, feed=feed, type=type)

    db.session.add(new_feed)
    db.session.commit()
    return new_feed


def find_possible_feeds(origin):
    """Fetch *origin* and return the feeds that could be subscribed to.

    Each result is a dict with the keys ``origin``, ``feed`` and ``type``.
    An XML feed document yields itself; an HTML page yields each
    ``<link rel="alternate">`` that points at an XML feed, followed by the
    page itself as an ``html`` feed. Any other content type yields an empty
    list.
    """
    # scrape an origin source to find possible alternative feeds
    resp = requests.get(origin)

    feeds = []
    xml_feed_types = [
        'application/rss+xml',
        'application/atom+xml',
        'application/rdf+xml',
    ]

    # The header may be absent, and media types are case-insensitive.
    content_type = resp.headers.get('content-type', '')
    content_type = content_type.split(';', 1)[0].strip().lower()
    if content_type in xml_feed_types:
        feeds.append({
            'origin': origin,
            'feed': origin,
            'type': 'xml',
        })

    elif content_type == 'text/html':
        # if text/html, then parse and look for rel="alternate"
        soup = bs4.BeautifulSoup(resp.text)
        for link in soup.find_all('link', {'rel': 'alternate'}):
            href = link.get('href')
            if href and link.get('type') in xml_feed_types:
                feeds.append({
                    'origin': origin,
                    # hrefs are often relative to the page that was served
                    'feed': urllib.parse.urljoin(resp.url, href),
                    'type': 'xml',
                })
        feeds.append({
            'origin': origin,
            'feed': origin,
            'type': 'html',
        })

    return feeds