update entries that have changed since they were first fetched
parent 27e5ebd47d
commit 19180b1829
1 changed file with 59 additions and 47 deletions
@@ -54,27 +54,57 @@ def update_feed(feed_id):
     with session_scope() as session:
         feed = session.query(Feed).get(feed_id)
         logger.info('Updating {}'.format(feed))
-        new_entries = process_feed_for_new_entries(session, feed)
-        for entry in new_entries:
-            logger.debug('Got new entry: {}'.format(entry))
-
+        process_feed_for_new_entries(session, feed)


 def process_feed_for_new_entries(session, feed):
-    result = None
+    now = datetime.datetime.utcnow()
+    found_new = False
     try:
         if feed.type == 'xml':
-            result = list(process_xml_feed_for_new_entries(session, feed))
+            result = process_xml_feed_for_new_entries(session, feed)
         elif feed.type == 'html':
-            result = list(process_html_feed_for_new_entries(session, feed))
+            result = process_html_feed_for_new_entries(session, feed)
+        else:
+            result = []
+
+        for entry in result:
+            old = session.query(Entry)\
+                .filter(Entry.feed == feed)\
+                .filter(Entry.uid == entry.uid).first()
+            # have we seen this post before
+            if not old or not is_content_equal(old, entry):
+                # set a default value for published if none is provided
+                if not entry.published:
+                    entry.published = (old.published or now) if old else now
+
+                if old:
+                    feed.entries.remove(old)
+                    session.delete(old)
+
+                feed.entries.append(entry)
+                session.commit()
+                found_new = True
+            else:
+                logger.info('skipping previously seen post {}'.format(old.permalink))
+
     finally:
-        now = datetime.datetime.utcnow()
         feed.last_checked = now
-        if result:
+        if found_new:
             feed.last_updated = now
-        session.commit()
-    return result
+
+
+def is_content_equal(e1, e2):
+    """The criteria for determining if an entry that we've seen before
+    has been updated. If any of these fields have changed, we'll scrub the
+    old entry and replace it with the updated one.
+    """
+    return (e1.title == e2.title
+            and e1.content == e2.content
+            and e1.author_name == e2.author_name
+            and e1.author_url == e2.author_url
+            and e1.author_photo == e2.author_photo)


 def process_xml_feed_for_new_entries(session, feed):
     logger.debug('fetching xml feed: %s', feed)
@@ -87,21 +117,13 @@ def process_xml_feed_for_new_entries(session, feed):
     default_author_name = feed_props.get('author_detail', {}).get('name')
     default_author_photo = feed_props.get('logo')

-    all_uids = [e.id or e.link for e in parsed.entries]
-    if all_uids:
-        preexisting = set(row[0] for row in session.query(Entry.uid)
-                          .filter(Entry.uid.in_(all_uids))
-                          .filter(Entry.feed == feed))
-    else:
-        preexisting = set()
-
     logger.debug('found {} entries'.format(len(parsed.entries)))
     for p_entry in parsed.entries:
         logger.debug('processing entry {}'.format(p_entry))
         permalink = p_entry.link
         uid = p_entry.id or permalink
-        if not uid or uid in preexisting:
+        if not uid:
             continue

         if 'updated_parsed' in p_entry:
@@ -114,7 +136,7 @@ def process_xml_feed_for_new_entries(session, feed):
             published = datetime.datetime.fromtimestamp(
                 time.mktime(p_entry.published_parsed))
         else:
-            published = updated or now
+            published = updated

         title = p_entry.get('title')

@@ -131,7 +153,6 @@ def process_xml_feed_for_new_entries(session, feed):
             title = None

         entry = Entry(
-            feed=feed,
             published=published,
             updated=updated,
             uid=uid,
@@ -146,8 +167,6 @@ def process_xml_feed_for_new_entries(session, feed):
             author_photo=default_author_photo
             or fallback_photo(feed.origin))

-        session.add(entry)
-        session.commit()
         yield entry


@@ -164,37 +183,30 @@ def process_html_feed_for_new_entries(session, feed):
         uid = hentry.get('uid') or url
         if not uid:
             continue

-        entry = session.query(Entry)\
-            .filter(Entry.feed == feed)\
-            .filter(Entry.uid == uid).first()
-
         # hentry = mf2util.interpret(mf2py.Parser(url=url).to_dict(), url)
         # permalink = hentry.get('url') or url
         # uid = hentry.get('uid') or uid

-        name = hentry.get('name')
+        title = hentry.get('name')
         content = hentry.get('content')
+        if not content:
+            content = title
+            title = None

-        if not entry:
-            entry = Entry(feed=feed, uid=uid, retrieved=now)
-            session.add(entry)
-
-        entry.published = hentry.get('published') or entry.published or now
-        entry.updated = hentry.get('updated') or entry.updated
-        entry.permalink = permalink
-        if content:
-            entry.title = name
-            entry.content = content
-        else:
-            entry.title = None
-            entry.content = name
-        entry.author_name = hentry.get('author', {}).get('name')
-        entry.author_photo = hentry.get('author', {}).get('photo') or fallback_photo(feed.origin)
-        entry.author_url = hentry.get('author', {}).get('url')
+        entry = Entry(
+            uid=uid,
+            retrieved=now,
+            permalink=permalink,
+            published=hentry.get('published'),
+            updated=hentry.get('updated'),
+            title=title,
+            content=content,
+            author_name=hentry.get('author', {}).get('name'),
+            author_photo=hentry.get('author', {}).get('photo') or fallback_photo(feed.origin),
+            author_url=hentry.get('author', {}).get('url'))

-        session.commit()
-        logger.debug('saved entry: %s', entry.permalink)
+        logger.debug('built entry: %s', entry.permalink)
         yield entry
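The diff relies on a session_scope() helper that never appears in this commit. For readers following along, here is a minimal sketch of what such a helper conventionally looks like with SQLAlchemy; the engine URL and the Session factory below are placeholder assumptions, not values taken from this repository.

# Minimal sketch of a session_scope() helper, assuming the standard
# SQLAlchemy sessionmaker pattern. The engine URL is a placeholder.
from contextlib import contextmanager

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

engine = create_engine('sqlite:///feeds.db')  # hypothetical database URL
Session = sessionmaker(bind=engine)


@contextmanager
def session_scope():
    """Provide a transactional scope around a series of operations."""
    session = Session()
    try:
        yield session
        session.commit()
    except Exception:
        session.rollback()
        raise
    finally:
        session.close()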
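To illustrate the replace-on-change behavior this commit introduces, here is a small self-contained demonstration of the comparison criteria. The namedtuple is a hypothetical stand-in for the SQLAlchemy Entry model, used only so the snippet runs on its own.

# Hypothetical stand-in for the Entry model; only the compared fields matter.
from collections import namedtuple

FakeEntry = namedtuple(
    'FakeEntry', 'title content author_name author_url author_photo')


def is_content_equal(e1, e2):
    # Same criteria as the commit: any change in these fields means the
    # old entry is scrubbed and replaced with the updated one.
    return (e1.title == e2.title
            and e1.content == e2.content
            and e1.author_name == e2.author_name
            and e1.author_url == e2.author_url
            and e1.author_photo == e2.author_photo)


old = FakeEntry('Hello World', 'first draft', 'Alice',
                'https://example.com', None)
new = FakeEntry('Hello World', 'edited draft', 'Alice',
                'https://example.com', None)

assert not is_content_equal(old, new)  # content changed -> replace old entry
assert is_content_equal(old, old)      # unchanged -> skip, keep published date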