detect 'jam' post types

2015-09-24 08:43:19 -07:00 · 2015-09-24 08:43:19 -07:00 · 02c32428d5
commit 02c32428d5
parent 24909df9fa
3 changed files with 33 additions and 17 deletions
--- a/requirements.txt
+++ b/requirements.txt
@@ -13,7 +13,7 @@ bleach==1.4.1
 feedparser>=5.2.0
 html5lib==0.99999
 mf2py==0.2.7
-mf2util==0.2.3
+mf2util==0.2.6
 psycopg2==2.6
 pyOpenSSL==0.15.1
 pyasn1==0.1.7
--- a/woodwind/tasks.py
+++ b/woodwind/tasks.py
@@ -27,6 +27,8 @@ TWITTER_RE = re.compile(
    r'https?://(?:www\.|mobile\.)?twitter\.com/(\w+)/status(?:es)?/(\w+)')
 TAG_RE = re.compile(r'</?\w+[^>]*?>')
 COMMENT_RE = re.compile(r'<!--[^>]*?-->')
 JAM_RE = re.compile(
    '\s*\u266b (?:https?://)?[a-z0-9._\-]+\.[a-z]{2,9}(?:/\S*)?')
 AUDIO_ENCLOSURE_TMPL = '<p><audio class="u-audio" src="{href}" controls '\
                       'preload=none ><a href="{href}">audio</a></audio></p>'
@@ -419,13 +421,7 @@ def process_xml_feed_for_new_entries(feed, content, backfill, now):
 def process_html_feed_for_new_entries(feed, content, backfill, now):
    # strip noscript tags before parsing, since we definitely aren't
    # going to preserve js
    was_bytes = isinstance(content, bytes) # ugly hack to deal with unknown encodings
    if was_bytes:
        content = content.decode()
    content = re.sub('</?noscript[^>]*>', '', content, flags=re.IGNORECASE)
    if was_bytes:
        content = content.encode()
    parsed = mf2util.interpret_feed(
        mf2py.parse(url=feed.feed, doc=content), feed.feed)
    hfeed = parsed.get('entries', [])
@@ -448,6 +444,8 @@ def hentry_to_entry(hentry, feed, backfill, now):
    # permalink = hentry.get('url') or url
    # uid = hentry.get('uid') or uid
    # TODO repost = next(iter(hentry.get('repost-of', [])), None)
    title = hentry.get('name')
    content = hentry.get('content')
    if not content:
@@ -462,6 +460,11 @@ def hentry_to_entry(hentry, feed, backfill, now):
    if backfill and published:
        retrieved = published
    author = hentry.get('author', {})
    author_name = author.get('name')
    author_photo = author.get('photo')
    author_url = author.get('url')
    entry = Entry(
        uid=uid,
        retrieved=retrieved,
@@ -471,16 +474,17 @@ def hentry_to_entry(hentry, feed, backfill, now):
        title=title,
        content=content,
        content_cleaned=util.clean(content),
-        author_name=hentry.get('author', {}).get('name'),
+        author_name=author_name,
-        author_photo=hentry.get('author', {}).get('photo')
+        author_photo=author_photo or (feed and fallback_photo(feed.origin)),
-        or (feed and fallback_photo(feed.origin)),
+        author_url=author_url)
        author_url=hentry.get('author', {}).get('url'))
-    # complex properties, convert from list of complex objects to a list of URLs
+    # complex properties, convert from list of complex objects to a
    # list of URLs
    for prop in ('in-reply-to', 'like-of', 'repost-of'):
        values = hentry.get(prop)
        if values:
-            entry.set_property(prop, [value['url'] for value in values if 'url' in value])
+            entry.set_property(prop, [value['url'] for value in values
                                      if 'url' in value])
    # simple properties, just transfer them over wholesale
    for prop in ('syndication', 'location'):
@@ -488,6 +492,11 @@ def hentry_to_entry(hentry, feed, backfill, now):
        if value:
            entry.set_property(prop, value)
    # does it look like a jam?
    plain = hentry.get('content-plain')
    if plain and JAM_RE.match(plain):
        entry.set_property('jam', True)
    current_app.logger.debug('entry properties %s', entry.properties)
    return entry
@@ -540,9 +549,10 @@ def fallback_photo(url):
 def get_response_content(response):
-    """Kartik's trick for handling responses that don't specify their
+    # if no charset is provided in the headers, figure out the
-    encoding. Response.text will guess badly if they don't.
+    # encoding from the content
    """
    if 'charset' not in response.headers.get('content-type', ''):
-        return response.content
+        encodings = requests.utils.get_encodings_from_content(response.text)
        if encodings:
            response.encoding = encodings[0]
    return response.text
--- a/woodwind/views.py
+++ b/woodwind/views.py
@@ -64,6 +64,8 @@ def index():
                    flask.abort(404)
                entry_query = entry_query.filter(Subscription.id == subsc_id)
                ws_topic = 'subsc:{}'.format(subsc.id)
            elif 'jam' in flask.request.args:
                entry_query = entry_query.filter(Entry.properties['jam'] == 'true')
            else:
                ws_topic = 'user:{}'.format(flask_login.current_user.id)
@@ -564,6 +566,7 @@ def add_preview(content):
    instagram_regex = 'https?://instagram.com/p/[\w\-]+/?'
    vimeo_regex = 'https?://vimeo.com/(\d+)/?'
    youtube_regex = 'https?://(?:www.)youtube.com/watch\?v=([\w\-]+)'
    youtube_short_regex = 'https://youtu.be/([\w\-]+)'
    m = re.search(instagram_regex, content)
    if m:
@@ -583,6 +586,9 @@ def add_preview(content):
        ).format(content, vimeo_id)
    m = re.search(youtube_regex, content)
    if not m:
        m = re.search(youtube_short_regex, content)
    if m:
        youtube_id = m.group(1)
        return (