diff --git a/requirements.txt b/requirements.txt index f7b5d04..20ab200 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,7 +13,7 @@ bleach==1.4.1 feedparser>=5.2.0 html5lib==0.99999 mf2py==0.2.7 -mf2util==0.2.3 +mf2util==0.2.6 psycopg2==2.6 pyOpenSSL==0.15.1 pyasn1==0.1.7 diff --git a/woodwind/tasks.py b/woodwind/tasks.py index 2781738..cc297c1 100644 --- a/woodwind/tasks.py +++ b/woodwind/tasks.py @@ -27,6 +27,8 @@ TWITTER_RE = re.compile( r'https?://(?:www\.|mobile\.)?twitter\.com/(\w+)/status(?:es)?/(\w+)') TAG_RE = re.compile(r']*?>') COMMENT_RE = re.compile(r'') +JAM_RE = re.compile( + '\s*\u266b (?:https?://)?[a-z0-9._\-]+\.[a-z]{2,9}(?:/\S*)?') AUDIO_ENCLOSURE_TMPL = '

' @@ -419,13 +421,7 @@ def process_xml_feed_for_new_entries(feed, content, backfill, now): def process_html_feed_for_new_entries(feed, content, backfill, now): # strip noscript tags before parsing, since we definitely aren't # going to preserve js - was_bytes = isinstance(content, bytes) # ugly hack to deal with unknown encodings - if was_bytes: - content = content.decode() content = re.sub(']*>', '', content, flags=re.IGNORECASE) - if was_bytes: - content = content.encode() - parsed = mf2util.interpret_feed( mf2py.parse(url=feed.feed, doc=content), feed.feed) hfeed = parsed.get('entries', []) @@ -448,6 +444,8 @@ def hentry_to_entry(hentry, feed, backfill, now): # permalink = hentry.get('url') or url # uid = hentry.get('uid') or uid + # TODO repost = next(iter(hentry.get('repost-of', [])), None) + title = hentry.get('name') content = hentry.get('content') if not content: @@ -462,6 +460,11 @@ def hentry_to_entry(hentry, feed, backfill, now): if backfill and published: retrieved = published + author = hentry.get('author', {}) + author_name = author.get('name') + author_photo = author.get('photo') + author_url = author.get('url') + entry = Entry( uid=uid, retrieved=retrieved, @@ -471,16 +474,17 @@ def hentry_to_entry(hentry, feed, backfill, now): title=title, content=content, content_cleaned=util.clean(content), - author_name=hentry.get('author', {}).get('name'), - author_photo=hentry.get('author', {}).get('photo') - or (feed and fallback_photo(feed.origin)), - author_url=hentry.get('author', {}).get('url')) + author_name=author_name, + author_photo=author_photo or (feed and fallback_photo(feed.origin)), + author_url=author_url) - # complex properties, convert from list of complex objects to a list of URLs + # complex properties, convert from list of complex objects to a + # list of URLs for prop in ('in-reply-to', 'like-of', 'repost-of'): values = hentry.get(prop) if values: - entry.set_property(prop, [value['url'] for value in values if 'url' in value]) + entry.set_property(prop, [value['url'] for value in values + if 'url' in value]) # simple properties, just transfer them over wholesale for prop in ('syndication', 'location'): @@ -488,6 +492,11 @@ def hentry_to_entry(hentry, feed, backfill, now): if value: entry.set_property(prop, value) + # does it look like a jam? + plain = hentry.get('content-plain') + if plain and JAM_RE.match(plain): + entry.set_property('jam', True) + current_app.logger.debug('entry properties %s', entry.properties) return entry @@ -540,9 +549,10 @@ def fallback_photo(url): def get_response_content(response): - """Kartik's trick for handling responses that don't specify their - encoding. Response.text will guess badly if they don't. - """ + # if no charset is provided in the headers, figure out the + # encoding from the content if 'charset' not in response.headers.get('content-type', ''): - return response.content + encodings = requests.utils.get_encodings_from_content(response.text) + if encodings: + response.encoding = encodings[0] return response.text diff --git a/woodwind/views.py b/woodwind/views.py index 78842a5..3ef993a 100644 --- a/woodwind/views.py +++ b/woodwind/views.py @@ -64,6 +64,8 @@ def index(): flask.abort(404) entry_query = entry_query.filter(Subscription.id == subsc_id) ws_topic = 'subsc:{}'.format(subsc.id) + elif 'jam' in flask.request.args: + entry_query = entry_query.filter(Entry.properties['jam'] == 'true') else: ws_topic = 'user:{}'.format(flask_login.current_user.id) @@ -564,6 +566,7 @@ def add_preview(content): instagram_regex = 'https?://instagram.com/p/[\w\-]+/?' vimeo_regex = 'https?://vimeo.com/(\d+)/?' youtube_regex = 'https?://(?:www.)youtube.com/watch\?v=([\w\-]+)' + youtube_short_regex = 'https://youtu.be/([\w\-]+)' m = re.search(instagram_regex, content) if m: @@ -583,6 +586,9 @@ def add_preview(content): ).format(content, vimeo_id) m = re.search(youtube_regex, content) + if not m: + m = re.search(youtube_short_regex, content) + if m: youtube_id = m.group(1) return (