detect 'jam' post types
This commit is contained in:
parent
24909df9fa
commit
02c32428d5
3 changed files with 33 additions and 17 deletions
|
@@ -13,7 +13,7 @@ bleach==1.4.1
|
||||||
feedparser>=5.2.0
|
feedparser>=5.2.0
|
||||||
html5lib==0.99999
|
html5lib==0.99999
|
||||||
mf2py==0.2.7
|
mf2py==0.2.7
|
||||||
mf2util==0.2.3
|
mf2util==0.2.6
|
||||||
psycopg2==2.6
|
psycopg2==2.6
|
||||||
pyOpenSSL==0.15.1
|
pyOpenSSL==0.15.1
|
||||||
pyasn1==0.1.7
|
pyasn1==0.1.7
|
||||||
|
|
|
@@ -27,6 +27,8 @@ TWITTER_RE = re.compile(
|
||||||
r'https?://(?:www\.|mobile\.)?twitter\.com/(\w+)/status(?:es)?/(\w+)')
|
r'https?://(?:www\.|mobile\.)?twitter\.com/(\w+)/status(?:es)?/(\w+)')
|
||||||
TAG_RE = re.compile(r'</?\w+[^>]*?>')
|
TAG_RE = re.compile(r'</?\w+[^>]*?>')
|
||||||
COMMENT_RE = re.compile(r'<!--[^>]*?-->')
|
COMMENT_RE = re.compile(r'<!--[^>]*?-->')
|
||||||
|
JAM_RE = re.compile(
|
||||||
|
'\s*\u266b (?:https?://)?[a-z0-9._\-]+\.[a-z]{2,9}(?:/\S*)?')
|
||||||
|
|
||||||
AUDIO_ENCLOSURE_TMPL = '<p><audio class="u-audio" src="{href}" controls '\
|
AUDIO_ENCLOSURE_TMPL = '<p><audio class="u-audio" src="{href}" controls '\
|
||||||
'preload=none ><a href="{href}">audio</a></audio></p>'
|
'preload=none ><a href="{href}">audio</a></audio></p>'
|
||||||
|
@@ -419,13 +421,7 @@ def process_xml_feed_for_new_entries(feed, content, backfill, now):
|
||||||
def process_html_feed_for_new_entries(feed, content, backfill, now):
|
def process_html_feed_for_new_entries(feed, content, backfill, now):
|
||||||
# strip noscript tags before parsing, since we definitely aren't
|
# strip noscript tags before parsing, since we definitely aren't
|
||||||
# going to preserve js
|
# going to preserve js
|
||||||
was_bytes = isinstance(content, bytes) # ugly hack to deal with unknown encodings
|
|
||||||
if was_bytes:
|
|
||||||
content = content.decode()
|
|
||||||
content = re.sub('</?noscript[^>]*>', '', content, flags=re.IGNORECASE)
|
content = re.sub('</?noscript[^>]*>', '', content, flags=re.IGNORECASE)
|
||||||
if was_bytes:
|
|
||||||
content = content.encode()
|
|
||||||
|
|
||||||
parsed = mf2util.interpret_feed(
|
parsed = mf2util.interpret_feed(
|
||||||
mf2py.parse(url=feed.feed, doc=content), feed.feed)
|
mf2py.parse(url=feed.feed, doc=content), feed.feed)
|
||||||
hfeed = parsed.get('entries', [])
|
hfeed = parsed.get('entries', [])
|
||||||
|
@@ -448,6 +444,8 @@ def hentry_to_entry(hentry, feed, backfill, now):
|
||||||
# permalink = hentry.get('url') or url
|
# permalink = hentry.get('url') or url
|
||||||
# uid = hentry.get('uid') or uid
|
# uid = hentry.get('uid') or uid
|
||||||
|
|
||||||
|
# TODO repost = next(iter(hentry.get('repost-of', [])), None)
|
||||||
|
|
||||||
title = hentry.get('name')
|
title = hentry.get('name')
|
||||||
content = hentry.get('content')
|
content = hentry.get('content')
|
||||||
if not content:
|
if not content:
|
||||||
|
@@ -462,6 +460,11 @@ def hentry_to_entry(hentry, feed, backfill, now):
|
||||||
if backfill and published:
|
if backfill and published:
|
||||||
retrieved = published
|
retrieved = published
|
||||||
|
|
||||||
|
author = hentry.get('author', {})
|
||||||
|
author_name = author.get('name')
|
||||||
|
author_photo = author.get('photo')
|
||||||
|
author_url = author.get('url')
|
||||||
|
|
||||||
entry = Entry(
|
entry = Entry(
|
||||||
uid=uid,
|
uid=uid,
|
||||||
retrieved=retrieved,
|
retrieved=retrieved,
|
||||||
|
@@ -471,16 +474,17 @@ def hentry_to_entry(hentry, feed, backfill, now):
|
||||||
title=title,
|
title=title,
|
||||||
content=content,
|
content=content,
|
||||||
content_cleaned=util.clean(content),
|
content_cleaned=util.clean(content),
|
||||||
author_name=hentry.get('author', {}).get('name'),
|
author_name=author_name,
|
||||||
author_photo=hentry.get('author', {}).get('photo')
|
author_photo=author_photo or (feed and fallback_photo(feed.origin)),
|
||||||
or (feed and fallback_photo(feed.origin)),
|
author_url=author_url)
|
||||||
author_url=hentry.get('author', {}).get('url'))
|
|
||||||
|
|
||||||
# complex properties, convert from list of complex objects to a list of URLs
|
# complex properties, convert from list of complex objects to a
|
||||||
|
# list of URLs
|
||||||
for prop in ('in-reply-to', 'like-of', 'repost-of'):
|
for prop in ('in-reply-to', 'like-of', 'repost-of'):
|
||||||
values = hentry.get(prop)
|
values = hentry.get(prop)
|
||||||
if values:
|
if values:
|
||||||
entry.set_property(prop, [value['url'] for value in values if 'url' in value])
|
entry.set_property(prop, [value['url'] for value in values
|
||||||
|
if 'url' in value])
|
||||||
|
|
||||||
# simple properties, just transfer them over wholesale
|
# simple properties, just transfer them over wholesale
|
||||||
for prop in ('syndication', 'location'):
|
for prop in ('syndication', 'location'):
|
||||||
|
@@ -488,6 +492,11 @@ def hentry_to_entry(hentry, feed, backfill, now):
|
||||||
if value:
|
if value:
|
||||||
entry.set_property(prop, value)
|
entry.set_property(prop, value)
|
||||||
|
|
||||||
|
# does it look like a jam?
|
||||||
|
plain = hentry.get('content-plain')
|
||||||
|
if plain and JAM_RE.match(plain):
|
||||||
|
entry.set_property('jam', True)
|
||||||
|
|
||||||
current_app.logger.debug('entry properties %s', entry.properties)
|
current_app.logger.debug('entry properties %s', entry.properties)
|
||||||
return entry
|
return entry
|
||||||
|
|
||||||
|
@@ -540,9 +549,10 @@ def fallback_photo(url):
|
||||||
|
|
||||||
|
|
||||||
def get_response_content(response):
|
def get_response_content(response):
|
||||||
"""Kartik's trick for handling responses that don't specify their
|
# if no charset is provided in the headers, figure out the
|
||||||
encoding. Response.text will guess badly if they don't.
|
# encoding from the content
|
||||||
"""
|
|
||||||
if 'charset' not in response.headers.get('content-type', ''):
|
if 'charset' not in response.headers.get('content-type', ''):
|
||||||
return response.content
|
encodings = requests.utils.get_encodings_from_content(response.text)
|
||||||
|
if encodings:
|
||||||
|
response.encoding = encodings[0]
|
||||||
return response.text
|
return response.text
|
||||||
|
|
|
@@ -64,6 +64,8 @@ def index():
|
||||||
flask.abort(404)
|
flask.abort(404)
|
||||||
entry_query = entry_query.filter(Subscription.id == subsc_id)
|
entry_query = entry_query.filter(Subscription.id == subsc_id)
|
||||||
ws_topic = 'subsc:{}'.format(subsc.id)
|
ws_topic = 'subsc:{}'.format(subsc.id)
|
||||||
|
elif 'jam' in flask.request.args:
|
||||||
|
entry_query = entry_query.filter(Entry.properties['jam'] == 'true')
|
||||||
else:
|
else:
|
||||||
ws_topic = 'user:{}'.format(flask_login.current_user.id)
|
ws_topic = 'user:{}'.format(flask_login.current_user.id)
|
||||||
|
|
||||||
|
@@ -564,6 +566,7 @@ def add_preview(content):
|
||||||
instagram_regex = 'https?://instagram.com/p/[\w\-]+/?'
|
instagram_regex = 'https?://instagram.com/p/[\w\-]+/?'
|
||||||
vimeo_regex = 'https?://vimeo.com/(\d+)/?'
|
vimeo_regex = 'https?://vimeo.com/(\d+)/?'
|
||||||
youtube_regex = 'https?://(?:www.)youtube.com/watch\?v=([\w\-]+)'
|
youtube_regex = 'https?://(?:www.)youtube.com/watch\?v=([\w\-]+)'
|
||||||
|
youtube_short_regex = 'https://youtu.be/([\w\-]+)'
|
||||||
|
|
||||||
m = re.search(instagram_regex, content)
|
m = re.search(instagram_regex, content)
|
||||||
if m:
|
if m:
|
||||||
|
@@ -583,6 +586,9 @@ def add_preview(content):
|
||||||
).format(content, vimeo_id)
|
).format(content, vimeo_id)
|
||||||
|
|
||||||
m = re.search(youtube_regex, content)
|
m = re.search(youtube_regex, content)
|
||||||
|
if not m:
|
||||||
|
m = re.search(youtube_short_regex, content)
|
||||||
|
|
||||||
if m:
|
if m:
|
||||||
youtube_id = m.group(1)
|
youtube_id = m.group(1)
|
||||||
return (
|
return (
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue