add preliminary support for PubSubHubbub-enabled feeds

Kyle Mahan 2015-02-21 22:59:26 -08:00
parent 8e3a671531
commit 6f2dc85110
5 changed files with 134 additions and 13 deletions

run.py  (new executable file)

@@ -0,0 +1,10 @@
#!/usr/bin/env python
def main():
    from woodwind.app import create_app
    app = create_app()
    app.run(debug=True, port=4000)
main()

woodwind/app.py

@@ -1,9 +1,9 @@
 from . import extensions
 from .views import views
 from .api import api
+from .push import push
 from config import Config
 import flask
-import logging


 def create_app():
@@ -12,4 +12,5 @@ def create_app():
     extensions.init_app(app)
     app.register_blueprint(views)
     app.register_blueprint(api)
+    app.register_blueprint(push)
     return app
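Because the push blueprint is registered with no url_prefix, the notify endpoint it defines (see woodwind/push.py below) is served at /_notify/<feed_id> on the main app. A minimal sketch of poking it with Flask's test client, assuming a local checkout where create_app() can load its config and database; the feed id is made up and is not part of the commit:

```python
# Sketch only -- not part of the commit. Assumes woodwind's config and
# database are set up so create_app() succeeds; feed id 1 is hypothetical.
from woodwind.app import create_app

app = create_app()
with app.test_client() as client:
    # Without hub.mode/hub.topic/hub.challenge this GET cannot be verified,
    # so push.notify aborts with 404 (it also 404s if feed 1 does not exist).
    resp = client.get('/_notify/1')
    print(resp.status_code)
```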

woodwind/models.py

@@ -108,6 +108,11 @@ class Feed(db.Model):
     last_checked = db.Column(db.DateTime)
     etag = db.Column(db.String(512))
+    push_hub = db.Column(db.String(512))
+    push_topic = db.Column(db.String(512))
+    push_verified = db.Column(db.Boolean)
+    last_pinged = db.Column(db.DateTime)

     def get_feed_code(self):
         return binascii.hexlify(self.feed.encode())

woodwind/push.py  (new file)

@@ -0,0 +1,50 @@
from . import tasks
from .extensions import db
from .models import Feed
from flask import Blueprint, request, abort, current_app, make_response
import datetime

push = Blueprint('push', __name__)


@push.route('/_notify/<int:feed_id>', methods=['GET', 'POST'])
def notify(feed_id):
    current_app.logger.debug(
        'received PuSH notification for feed id %d', feed_id)
    feed = Feed.query.get(feed_id)
    if not feed:
        current_app.logger.debug(
            'could not find feed corresponding to %d', feed_id)
        abort(404)

    current_app.logger.debug('processing PuSH notification for feed %r', feed)

    if request.method == 'GET':
        # verify subscribe or unsubscribe
        mode = request.args.get('hub.mode')
        topic = request.args.get('hub.topic')
        challenge = request.args.get('hub.challenge')
        current_app.logger.debug(
            'PuSH verification. feed=%r, mode=%s, topic=%s, challenge=%s',
            feed, mode, topic, challenge)
        if mode == 'subscribe' and topic == feed.push_topic:
            current_app.logger.debug(
                'PuSH verify subscribe for feed=%r, topic=%s', feed, topic)
            feed.push_verified = True
            db.session.commit()
            return challenge
        elif mode == 'unsubscribe' and topic != feed.push_topic:
            current_app.logger.debug(
                'PuSH verify unsubscribe for feed=%r, topic=%s', feed, topic)
            return challenge
        current_app.logger.debug('PuSH cannot verify %s for feed=%r, topic=%s',
                                 mode, feed, topic)
        abort(404)

    # could it be? an actual push notification!?
    current_app.logger.debug('received PuSH ping for %r', feed)
    feed.last_pinged = datetime.datetime.utcnow()
    db.session.commit()
    tasks.update_feed.delay(feed.id)
    return make_response('', 204)
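For context, the GET branch above is the half of the protocol a hub exercises when it verifies a subscription request. A rough sketch of that verification call, with a hypothetical host, feed id, topic, and challenge (the hub.* parameter names are the ones notify() reads):

```python
# Sketch only -- simulates the verification GET a hub would send to the
# callback URL. Host, feed id, topic, and challenge are all hypothetical.
import requests

resp = requests.get('http://reader.example.com/_notify/42', params={
    'hub.mode': 'subscribe',
    'hub.topic': 'http://blog.example.com/feed.xml',
    'hub.challenge': 'abc123',
})
# If feed 42 exists and its push_topic matches, notify() marks it verified
# and echoes the challenge back in the response body.
assert resp.text == 'abc123'
```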

woodwind/tasks.py

@@ -12,10 +12,12 @@ import sqlalchemy
 import sqlalchemy.orm
 import time
 import urllib.parse
+import requests

 UPDATE_INTERVAL = datetime.timedelta(hours=1)
-TWITTER_RE = re.compile(r'https?://(?:www\.|mobile\.)?twitter\.com/(\w+)/status(?:es)?/(\w+)')
+TWITTER_RE = re.compile(
+    r'https?://(?:www\.|mobile\.)?twitter\.com/(\w+)/status(?:es)?/(\w+)')

 app = celery.Celery('woodwind')
 app.config_from_object('celeryconfig')
@@ -57,17 +59,25 @@ def update_feed(feed_id):
     with session_scope() as session:
         feed = session.query(Feed).get(feed_id)
         logger.info('Updating {}'.format(feed))
-        process_feed_for_new_entries(session, feed)
+        process_feed(session, feed)


-def process_feed_for_new_entries(session, feed):
+def process_feed(session, feed):
     now = datetime.datetime.utcnow()
     found_new = False
     try:
+        logger.info('fetching feed: %s', feed)
+        response = requests.get(feed.feed)
+        if response.status_code // 100 != 2:
+            logger.warn('bad response from %s. %r: %r', feed.feed,
+                        response, response.text)
+            return
+        check_push_subscription(session, feed, response)
         if feed.type == 'xml':
-            result = process_xml_feed_for_new_entries(session, feed)
+            result = process_xml_feed_for_new_entries(session, feed, response)
         elif feed.type == 'html':
-            result = process_html_feed_for_new_entries(session, feed)
+            result = process_html_feed_for_new_entries(session, feed, response)
         else:
             result = []
@@ -101,6 +111,44 @@ def process_feed_for_new_entries(session, feed):
         feed.last_updated = now


+def check_push_subscription(session, feed, response):
+    def build_callback_url():
+        return '{}://{}/_notify/{}'.format(
+            getattr(Config, 'PREFERRED_URL_SCHEME', 'http'),
+            Config.SERVER_NAME,
+            feed.id)
+
+    def send_request(mode, hub, topic):
+        logger.debug(
+            'sending %s request for hub=%r, topic=%r', mode, hub, topic)
+        r = requests.post(hub, data={
+            'hub.mode': mode,
+            'hub.topic': topic,
+            'hub.callback': build_callback_url(),
+            # TODO secret should only be used over HTTPS
+            # 'hub.secret': secret,
+        })
+        logger.debug('%s response %r', mode, r)
+
+    old_hub = feed.push_hub
+    old_topic = feed.push_topic
+    hub = response.links.get('hub', {}).get('url')
+    topic = response.links.get('self', {}).get('url')
+
+    if hub != old_hub or topic != old_topic or not feed.push_verified:
+        feed.push_hub = hub
+        feed.push_topic = topic
+        feed.push_verified = False
+        session.commit()
+
+        if old_hub and old_topic:
+            send_request('unsubscribe', old_hub, old_topic)
+
+        if hub and topic:
+            send_request('subscribe', hub, topic)


 def is_content_equal(e1, e2):
     """The criteria for determining if an entry that we've seen before
     has been updated. If any of these fields have changed, we'll scrub the
@@ -114,12 +162,11 @@ def is_content_equal(e1, e2):
             and e1.properties == e2.properties)


-def process_xml_feed_for_new_entries(session, feed):
+def process_xml_feed_for_new_entries(session, feed, response):
     logger.debug('fetching xml feed: %s', feed)
     now = datetime.datetime.utcnow()
-    parsed = feedparser.parse(feed.feed)
+    parsed = feedparser.parse(get_response_content(response))
     feed_props = parsed.get('feed', {})
     default_author_url = feed_props.get('author_detail', {}).get('href')
     default_author_name = feed_props.get('author_detail', {}).get('name')
@@ -178,11 +225,10 @@ def process_xml_feed_for_new_entries(session, feed):
         yield entry


-def process_html_feed_for_new_entries(session, feed):
-    logger.info('fetching html feed: %s', feed)
+def process_html_feed_for_new_entries(session, feed, response):
+    doc = get_response_content(response)
     parsed = mf2util.interpret_feed(
-        mf2py.Parser(url=feed.feed).to_dict(), feed.feed)
+        mf2py.Parser(url=feed.feed, doc=doc).to_dict(), feed.feed)
     hfeed = parsed.get('entries', [])

     for hentry in hfeed:
@@ -271,3 +317,12 @@ def fallback_photo(url):
     """Use favatar to find an appropriate photo for any URL"""
     domain = urllib.parse.urlparse(url).netloc
     return 'http://www.google.com/s2/favicons?domain=' + domain
+
+
+def get_response_content(response):
+    """Kartik's trick for handling responses that don't specify their
+    encoding. Response.text will guess badly if they don't.
+    """
+    if 'charset' not in response.headers.get('content-type', ''):
+        return response.content
+    return response.text
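As a closing note, the hub/topic discovery in check_push_subscription relies on the feed response carrying rel="hub" and rel="self" HTTP Link headers, which requests exposes through response.links. A small sketch with hypothetical URLs:

```python
# Sketch only -- shows the Link-header discovery that check_push_subscription
# depends on. The feed and hub URLs are hypothetical.
import requests

response = requests.get('http://blog.example.com/feed.xml')
# A PuSH-enabled feed typically responds with something like:
#   Link: <https://hub.example.com/>; rel="hub",
#         <http://blog.example.com/feed.xml>; rel="self"
hub = response.links.get('hub', {}).get('url')      # e.g. https://hub.example.com/
topic = response.links.get('self', {}).get('url')   # e.g. http://blog.example.com/feed.xml
print(hub, topic)
```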