woodwind/woodwind/tasks.py
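
"""Celery tasks for woodwind: periodically check each subscribed feed and
store any entries that have not been seen before.

``tick`` runs on a schedule and queues ``update_feed`` for every feed whose
last check is older than ``UPDATE_INTERVAL``; the update dispatches to an
XML parser (feedparser) or an HTML h-feed parser (mf2py/mf2util).
"""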
from woodwind.models import Feed, Entry
from config import Config
import celery
import celery.utils.log
import datetime
import feedparser
import mf2py
import mf2util
import sqlalchemy
import sqlalchemy.orm
import time
import urllib.parse

# Re-check a feed at most once per hour.
UPDATE_INTERVAL = datetime.timedelta(hours=1)

app = celery.Celery('woodwind')
app.config_from_object('celeryconfig')

logger = celery.utils.log.get_task_logger(__name__)

engine = sqlalchemy.create_engine(Config.SQLALCHEMY_DATABASE_URI)
session = sqlalchemy.orm.Session(bind=engine)
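
# ``tick`` is only useful if celery beat runs it periodically. The schedule
# lives in ``celeryconfig`` (loaded above via ``app.config_from_object``),
# which is not part of this file; as a rough sketch only, with an assumed
# task name and interval, it might look like:
#
#     CELERYBEAT_SCHEDULE = {
#         'tick': {
#             'task': 'woodwind.tasks.tick',
#             'schedule': datetime.timedelta(minutes=5),
#         },
#     }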


@app.task
def tick():
    """Periodic task: queue an update for each feed that is due for a check."""
    now = datetime.datetime.utcnow()
    logger.debug('Tick {}'.format(now))
    for feed in session.query(Feed).all():
        logger.debug('Feed {} last checked {}'.format(
            feed, feed.last_checked))
        if (not feed.last_checked
                or now - feed.last_checked > UPDATE_INTERVAL):
            update_feed.delay(feed.id)


@app.task
def update_feed(feed_id):
    """Fetch a single feed and store any entries we have not seen before."""
    feed = session.query(Feed).get(feed_id)
    logger.info('Updating {}'.format(feed))
    new_entries = process_feed_for_new_entries(feed)
    for entry in new_entries:
        logger.debug('Got new entry: {}'.format(entry))


def process_feed_for_new_entries(feed):
    """Dispatch to the XML or HTML parser and record when the feed was checked."""
    result = []
    try:
        # Materialize the generators so that, by the time the finally block
        # runs, we know whether any new entries were actually found.
        if feed.type == 'xml':
            result = list(process_xml_feed_for_new_entries(feed))
        elif feed.type == 'html':
            result = list(process_html_feed_for_new_entries(feed))
        return result
    finally:
        now = datetime.datetime.utcnow()
        feed.last_checked = now
        if result:
            feed.last_updated = now
        session.commit()


def process_xml_feed_for_new_entries(feed):
    """Parse an Atom/RSS feed with feedparser and yield entries not yet stored."""
    logger.debug('fetching xml feed: %s', feed)
    now = datetime.datetime.utcnow()
    parsed = feedparser.parse(feed.feed)

    # Feed-level author details, used when an entry has no author of its own.
    feed_props = parsed.get('feed', {})
    default_author_url = feed_props.get('author_detail', {}).get('href')
    default_author_name = feed_props.get('author_detail', {}).get('name')
    default_author_photo = feed_props.get('logo')

    # Find which of these entries are already in the database.
    all_uids = [e.get('id') or e.get('link') for e in parsed.entries]
    if all_uids:
        preexisting = set(row[0] for row in session.query(Entry.uid)
                          .filter(Entry.uid.in_(all_uids))
                          .filter(Entry.feed == feed))
    else:
        preexisting = set()

    logger.debug('found {} entries'.format(len(parsed.entries)))
    for p_entry in parsed.entries:
        logger.debug('processing entry {}'.format(p_entry))
        permalink = p_entry.get('link')
        uid = p_entry.get('id') or permalink
        if not uid or uid in preexisting:
            continue

        updated = datetime.datetime.fromtimestamp(
            time.mktime(p_entry.updated_parsed)
        ) if 'updated_parsed' in p_entry else None
        published = datetime.datetime.fromtimestamp(
            time.mktime(p_entry.published_parsed)
        ) if 'published_parsed' in p_entry else now

        title = p_entry.get('title')
        content_list = p_entry.get('content')
        if content_list:
            content = content_list[0].value
        else:
            content = p_entry.get('summary')

        # Drop the title if the content just repeats it (common for short
        # notes), ignoring a trailing ellipsis.
        if title and content:
            title_trimmed = title.rstrip('...').rstrip('…')
            if content.startswith(title_trimmed):
                title = None

        entry = Entry(
            feed=feed,
            published=published,
            updated=updated,
            uid=uid,
            permalink=permalink,
            retrieved=now,
            title=title,
            content=content,
            author_name=p_entry.get('author_detail', {}).get('name')
            or default_author_name,
            author_url=p_entry.get('author_detail', {}).get('href')
            or default_author_url,
            author_photo=default_author_photo
            or fallback_photo(feed.origin))
        session.add(entry)
        session.commit()
        yield entry


def process_html_feed_for_new_entries(feed):
    """Parse an h-feed with mf2py/mf2util and yield entries not yet stored."""
    logger.debug('fetching html feed: %s', feed)
    now = datetime.datetime.utcnow()
    parsed = mf2util.interpret_feed(
        mf2py.parse(url=feed.feed), feed.feed)
    hfeed = parsed.get('entries', [])

    # Find which of these entries are already in the database.
    all_uids = [e.get('uid') or e.get('url') for e in hfeed]
    if all_uids:
        preexisting = set(row[0] for row in session.query(Entry.uid)
                          .filter(Entry.uid.in_(all_uids))
                          .filter(Entry.feed == feed))
    else:
        preexisting = set()
    # logger.debug('preexisting urls: %r', preexisting)

    for hentry in hfeed:
        permalink = url = hentry.get('url')
        uid = hentry.get('uid') or url
        if not uid or uid in preexisting:
            continue
        # Re-fetching each permalink for fuller data is currently disabled:
        # hentry = mf2util.interpret(mf2py.parse(url=url), url)
        # permalink = hentry.get('url') or url
        # uid = hentry.get('uid') or uid
        entry = Entry(
            feed=feed,
            published=hentry.get('published') or now,
            updated=hentry.get('updated'),
            uid=uid,
            permalink=permalink,
            retrieved=now,
            title=hentry.get('name'),
            content=hentry.get('content'),
            author_name=hentry.get('author', {}).get('name'),
            author_photo=hentry.get('author', {}).get('photo')
            or fallback_photo(feed.origin),
            author_url=hentry.get('author', {}).get('url'))
        session.add(entry)
        session.commit()
        logger.debug('saved entry: %s', entry.permalink)
        yield entry


def fallback_photo(url):
    """Use the site's favicon (a 'favatar') as a fallback photo for any URL."""
    domain = urllib.parse.urlparse(url).netloc
    return 'http://www.google.com/s2/favicons?domain=' + domain
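
# For example, fallback_photo('https://example.com/2015/01/note') returns
# 'http://www.google.com/s2/favicons?domain=example.com'.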