detect 'jam' post types

This commit is contained in:
Kyle Mahan 2015-09-24 08:43:19 -07:00
parent 24909df9fa
commit 02c32428d5
3 changed files with 33 additions and 17 deletions

View file

@ -13,7 +13,7 @@ bleach==1.4.1
feedparser>=5.2.0
html5lib==0.99999
mf2py==0.2.7
mf2util==0.2.3
mf2util==0.2.6
psycopg2==2.6
pyOpenSSL==0.15.1
pyasn1==0.1.7

View file

@ -27,6 +27,8 @@ TWITTER_RE = re.compile(
r'https?://(?:www\.|mobile\.)?twitter\.com/(\w+)/status(?:es)?/(\w+)')
TAG_RE = re.compile(r'</?\w+[^>]*?>')
COMMENT_RE = re.compile(r'<!--[^>]*?-->')
JAM_RE = re.compile(
'\s*\u266b (?:https?://)?[a-z0-9._\-]+\.[a-z]{2,9}(?:/\S*)?')
AUDIO_ENCLOSURE_TMPL = '<p><audio class="u-audio" src="{href}" controls '\
'preload=none ><a href="{href}">audio</a></audio></p>'
@ -419,13 +421,7 @@ def process_xml_feed_for_new_entries(feed, content, backfill, now):
def process_html_feed_for_new_entries(feed, content, backfill, now):
# strip noscript tags before parsing, since we definitely aren't
# going to preserve js
was_bytes = isinstance(content, bytes) # ugly hack to deal with unknown encodings
if was_bytes:
content = content.decode()
content = re.sub('</?noscript[^>]*>', '', content, flags=re.IGNORECASE)
if was_bytes:
content = content.encode()
parsed = mf2util.interpret_feed(
mf2py.parse(url=feed.feed, doc=content), feed.feed)
hfeed = parsed.get('entries', [])
@ -448,6 +444,8 @@ def hentry_to_entry(hentry, feed, backfill, now):
# permalink = hentry.get('url') or url
# uid = hentry.get('uid') or uid
# TODO repost = next(iter(hentry.get('repost-of', [])), None)
title = hentry.get('name')
content = hentry.get('content')
if not content:
@ -462,6 +460,11 @@ def hentry_to_entry(hentry, feed, backfill, now):
if backfill and published:
retrieved = published
author = hentry.get('author', {})
author_name = author.get('name')
author_photo = author.get('photo')
author_url = author.get('url')
entry = Entry(
uid=uid,
retrieved=retrieved,
@ -471,16 +474,17 @@ def hentry_to_entry(hentry, feed, backfill, now):
title=title,
content=content,
content_cleaned=util.clean(content),
author_name=hentry.get('author', {}).get('name'),
author_photo=hentry.get('author', {}).get('photo')
or (feed and fallback_photo(feed.origin)),
author_url=hentry.get('author', {}).get('url'))
author_name=author_name,
author_photo=author_photo or (feed and fallback_photo(feed.origin)),
author_url=author_url)
# complex properties, convert from list of complex objects to a list of URLs
# complex properties, convert from list of complex objects to a
# list of URLs
for prop in ('in-reply-to', 'like-of', 'repost-of'):
values = hentry.get(prop)
if values:
entry.set_property(prop, [value['url'] for value in values if 'url' in value])
entry.set_property(prop, [value['url'] for value in values
if 'url' in value])
# simple properties, just transfer them over wholesale
for prop in ('syndication', 'location'):
@ -488,6 +492,11 @@ def hentry_to_entry(hentry, feed, backfill, now):
if value:
entry.set_property(prop, value)
# does it look like a jam?
plain = hentry.get('content-plain')
if plain and JAM_RE.match(plain):
entry.set_property('jam', True)
current_app.logger.debug('entry properties %s', entry.properties)
return entry
@ -540,9 +549,10 @@ def fallback_photo(url):
def get_response_content(response):
"""Kartik's trick for handling responses that don't specify their
encoding. Response.text will guess badly if they don't.
"""
# if no charset is provided in the headers, figure out the
# encoding from the content
if 'charset' not in response.headers.get('content-type', ''):
return response.content
encodings = requests.utils.get_encodings_from_content(response.text)
if encodings:
response.encoding = encodings[0]
return response.text

View file

@ -64,6 +64,8 @@ def index():
flask.abort(404)
entry_query = entry_query.filter(Subscription.id == subsc_id)
ws_topic = 'subsc:{}'.format(subsc.id)
elif 'jam' in flask.request.args:
entry_query = entry_query.filter(Entry.properties['jam'] == 'true')
else:
ws_topic = 'user:{}'.format(flask_login.current_user.id)
@ -564,6 +566,7 @@ def add_preview(content):
instagram_regex = 'https?://instagram.com/p/[\w\-]+/?'
vimeo_regex = 'https?://vimeo.com/(\d+)/?'
youtube_regex = 'https?://(?:www.)youtube.com/watch\?v=([\w\-]+)'
youtube_short_regex = 'https://youtu.be/([\w\-]+)'
m = re.search(instagram_regex, content)
if m:
@ -583,6 +586,9 @@ def add_preview(content):
).format(content, vimeo_id)
m = re.search(youtube_regex, content)
if not m:
m = re.search(youtube_short_regex, content)
if m:
youtube_id = m.group(1)
return (