From eec2d076e429dfdbfe61dca37c993b70ebe47ee9 Mon Sep 17 00:00:00 2001
From: Jeena
Date: Wed, 11 Mar 2026 23:32:15 +0000
Subject: [PATCH] ingest: Accept RoomMessageImage events regardless of body content

WhatsApp bridge images arrive as RoomMessageImage events with an empty
body field, so the previous .jpg/.jpeg extension check silently rejected
all of them. Accept all RoomMessageImage events and fall back to
"image.jpg" as filename when body is empty. File content is still
validated via magic bytes before upload.
---
 ingest.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/ingest.py b/ingest.py
index 29585ab..fd9346c 100644
--- a/ingest.py
+++ b/ingest.py
@@ -206,7 +206,7 @@ def is_supported_file(event) -> bool:
     if isinstance(event, RoomMessageFile):
         return (event.body or "").lower().endswith(".pdf")
     if isinstance(event, RoomMessageImage):
-        return (event.body or "").lower().endswith((".jpg", ".jpeg"))
+        return True  # validate magic bytes later; body may be empty (e.g. WhatsApp bridge)
     if isinstance(event, BadEvent):
         return _bad_event_encrypted_file_info(event) is not None
     return False
@@ -219,7 +219,8 @@ def extract_event_fields(event) -> tuple[str, str, str, Optional[str]]:
         filename = content.get("body", "unknown")
         file_info = content.get("file", {})
         return event.event_id, filename, file_info["url"], json.dumps(file_info)
-    return event.event_id, event.body, event.url, None
+    filename = event.body or "image.jpg"
+    return event.event_id, filename, event.url, None
 
 
 def content_type_for(filename: str) -> str:
@@ -417,6 +418,8 @@ async def catchup_history(
     log.info("Starting historical catchup...")
     token = start_token
     total = 0
+    batches = 0
+    events_seen = 0
 
     while True:
         response = await matrix_client.room_messages(
@@ -430,6 +433,9 @@ async def catchup_history(
             log.error("room_messages error: %s", response)
             break
 
+        batches += 1
+        events_seen += len(response.chunk)
+
         for event in response.chunk:
             if is_supported_file(event):
                 total += 1
@@ -443,7 +449,10 @@ async def catchup_history(
                 break
         token = response.end
 
-    log.info("Historical catchup complete — processed %d file event(s).", total)
+    log.info(
+        "Historical catchup complete — processed %d file event(s) across %d batches (%d total events).",
+        total, batches, events_seen,
+    )
 
 
 # ---------------------------------------------------------------------------