diff --git a/run.py b/run.py new file mode 100755 index 0000000..41f6c98 --- /dev/null +++ b/run.py @@ -0,0 +1,10 @@ +#!/usr/bin/env python + + +def main(): + from woodwind.app import create_app + app = create_app() + app.run(debug=True, port=4000) + + +main() diff --git a/woodwind/app.py b/woodwind/app.py index 73c8075..6f323e9 100644 --- a/woodwind/app.py +++ b/woodwind/app.py @@ -1,9 +1,9 @@ from . import extensions from .views import views from .api import api +from .push import push from config import Config import flask -import logging def create_app(): @@ -12,4 +12,5 @@ def create_app(): extensions.init_app(app) app.register_blueprint(views) app.register_blueprint(api) + app.register_blueprint(push) return app diff --git a/woodwind/models.py b/woodwind/models.py index 7b30a41..6d9e75a 100644 --- a/woodwind/models.py +++ b/woodwind/models.py @@ -108,6 +108,11 @@ class Feed(db.Model): last_checked = db.Column(db.DateTime) etag = db.Column(db.String(512)) + push_hub = db.Column(db.String(512)) + push_topic = db.Column(db.String(512)) + push_verified = db.Column(db.Boolean) + last_pinged = db.Column(db.DateTime) + def get_feed_code(self): return binascii.hexlify(self.feed.encode()) diff --git a/woodwind/push.py b/woodwind/push.py new file mode 100644 index 0000000..f76e7ad --- /dev/null +++ b/woodwind/push.py @@ -0,0 +1,50 @@ +from . import tasks +from .extensions import db +from .models import Feed +from flask import Blueprint, request, abort, current_app, make_response +import datetime + + +push = Blueprint('push', __name__) + + +@push.route('/_notify/', methods=['GET', 'POST']) +def notify(feed_id): + current_app.logger.debug( + 'received PuSH notification for feed id %d', feed_id) + feed = Feed.query.get(feed_id) + if not feed: + current_app.logger.debug( + 'could not find feed corresponding to %d', feed_id) + abort(404) + + current_app.logger.debug('processing PuSH notification for feed %r', feed) + if request.method == 'GET': + # verify subscribe or unsusbscribe + mode = request.args.get('hub.mode') + topic = request.args.get('hub.topic') + challenge = request.args.get('hub.challenge') + current_app.logger.debug( + 'PuSH verification. feed=%r, mode=%s, topic=%s, challenge=%s', + feed, mode, topic, challenge) + + if mode == 'subscribe' and topic == feed.push_topic: + current_app.logger.debug( + 'PuSH verify subscribe for feed=%r, topic=%s', feed, topic) + feed.push_verified = True + db.session.commit() + return challenge + elif mode == 'unsubscribe' and topic != feed.push_topic: + current_app.logger.debug( + 'PuSH verify unsubscribe for feed=%r, topic=%s', feed, topic) + return challenge + current_app.logger.debug('PuSH cannot verify %s for feed=%r, topic=%s', + mode, feed, topic) + abort(404) + + # could it be? an actual push notification!? + current_app.logger.debug('received PuSH ping for %r', feed) + feed.last_pinged = datetime.datetime.utcnow() + db.session.commit() + tasks.update_feed.delay(feed.id) + return make_response('', 204) diff --git a/woodwind/tasks.py b/woodwind/tasks.py index 247b829..f433b87 100644 --- a/woodwind/tasks.py +++ b/woodwind/tasks.py @@ -12,10 +12,12 @@ import sqlalchemy import sqlalchemy.orm import time import urllib.parse +import requests UPDATE_INTERVAL = datetime.timedelta(hours=1) -TWITTER_RE = re.compile(r'https?://(?:www\.|mobile\.)?twitter\.com/(\w+)/status(?:es)?/(\w+)') +TWITTER_RE = re.compile( + r'https?://(?:www\.|mobile\.)?twitter\.com/(\w+)/status(?:es)?/(\w+)') app = celery.Celery('woodwind') app.config_from_object('celeryconfig') @@ -57,17 +59,25 @@ def update_feed(feed_id): with session_scope() as session: feed = session.query(Feed).get(feed_id) logger.info('Updating {}'.format(feed)) - process_feed_for_new_entries(session, feed) + process_feed(session, feed) -def process_feed_for_new_entries(session, feed): +def process_feed(session, feed): now = datetime.datetime.utcnow() found_new = False try: + logger.info('fetching feed: %s', feed) + response = requests.get(feed.feed) + if response.status_code // 100 != 2: + logger.warn('bad response from %s. %r: %r', feed.feed, + response, response.text) + return + + check_push_subscription(session, feed, response) if feed.type == 'xml': - result = process_xml_feed_for_new_entries(session, feed) + result = process_xml_feed_for_new_entries(session, feed, response) elif feed.type == 'html': - result = process_html_feed_for_new_entries(session, feed) + result = process_html_feed_for_new_entries(session, feed, response) else: result = [] @@ -101,6 +111,44 @@ def process_feed_for_new_entries(session, feed): feed.last_updated = now +def check_push_subscription(session, feed, response): + def build_callback_url(): + return '{}://{}/_notify/{}'.format( + getattr(Config, 'PREFERRED_URL_SCHEME', 'http'), + Config.SERVER_NAME, + feed.id) + + def send_request(mode, hub, topic): + logger.debug( + 'sending %s request for hub=%r, topic=%r', mode, hub, topic) + r = requests.post(hub, data={ + 'hub.mode': mode, + 'hub.topic': topic, + 'hub.callback': build_callback_url(), + # TODO secret should only be used over HTTPS + # 'hub.secret': secret, + }) + logger.debug('%s response %r', mode, r) + + old_hub = feed.push_hub + old_topic = feed.push_topic + hub = response.links.get('hub', {}).get('url') + topic = response.links.get('self', {}).get('url') + + if hub != old_hub or topic != old_topic or not feed.push_verified: + feed.push_hub = hub + feed.push_topic = topic + feed.push_verified = False + session.commit() + + if old_hub and old_topic: + send_request('unsubscribe', old_hub, old_topic) + + if hub and topic: + send_request('subscribe', hub, topic) + + + def is_content_equal(e1, e2): """The criteria for determining if an entry that we've seen before has been updated. If any of these fields have changed, we'll scrub the @@ -114,12 +162,11 @@ def is_content_equal(e1, e2): and e1.properties == e2.properties) -def process_xml_feed_for_new_entries(session, feed): +def process_xml_feed_for_new_entries(session, feed, response): logger.debug('fetching xml feed: %s', feed) now = datetime.datetime.utcnow() - parsed = feedparser.parse(feed.feed) - + parsed = feedparser.parse(get_response_content(response)) feed_props = parsed.get('feed', {}) default_author_url = feed_props.get('author_detail', {}).get('href') default_author_name = feed_props.get('author_detail', {}).get('name') @@ -178,11 +225,10 @@ def process_xml_feed_for_new_entries(session, feed): yield entry -def process_html_feed_for_new_entries(session, feed): - logger.info('fetching html feed: %s', feed) - +def process_html_feed_for_new_entries(session, feed, response): + doc = get_response_content(response) parsed = mf2util.interpret_feed( - mf2py.Parser(url=feed.feed).to_dict(), feed.feed) + mf2py.Parser(url=feed.feed, doc=doc).to_dict(), feed.feed) hfeed = parsed.get('entries', []) for hentry in hfeed: @@ -271,3 +317,12 @@ def fallback_photo(url): """Use favatar to find an appropriate photo for any URL""" domain = urllib.parse.urlparse(url).netloc return 'http://www.google.com/s2/favicons?domain=' + domain + + +def get_response_content(response): + """Kartik's trick for handling responses that don't specify their + encoding. Response.text will guess badly if they don't. + """ + if 'charset' not in response.headers.get('content-type', ''): + return response.content + return response.text