Patch for ingest.py (unified diff). Text above the "diff --git" line is
descriptive only and is skipped by git apply / patch.
  * is_supported_file: accept every RoomMessageImage instead of requiring a
    .jpg/.jpeg body (the body may be empty).
  * extract_event_fields: fall back to "image.jpg" when event.body is empty.
  * catchup_history: count batches and events seen, and report both in the
    completion log message.

diff --git a/ingest.py b/ingest.py
index 29585ab..fd9346c 100644
--- a/ingest.py
+++ b/ingest.py
@@ -206,7 +206,7 @@ def is_supported_file(event) -> bool:
     if isinstance(event, RoomMessageFile):
         return (event.body or "").lower().endswith(".pdf")
     if isinstance(event, RoomMessageImage):
-        return (event.body or "").lower().endswith((".jpg", ".jpeg"))
+        return True  # validate magic bytes later; body may be empty (e.g. WhatsApp bridge)
     if isinstance(event, BadEvent):
         return _bad_event_encrypted_file_info(event) is not None
     return False
@@ -219,7 +219,8 @@ def extract_event_fields(event) -> tuple[str, str, str, Optional[str]]:
         filename = content.get("body", "unknown")
         file_info = content.get("file", {})
         return event.event_id, filename, file_info["url"], json.dumps(file_info)
-    return event.event_id, event.body, event.url, None
+    filename = event.body or "image.jpg"
+    return event.event_id, filename, event.url, None
 
 
 def content_type_for(filename: str) -> str:
@@ -417,6 +418,8 @@ async def catchup_history(
     log.info("Starting historical catchup...")
     token = start_token
     total = 0
+    batches = 0
+    events_seen = 0
 
     while True:
         response = await matrix_client.room_messages(
@@ -430,6 +433,9 @@ async def catchup_history(
             log.error("room_messages error: %s", response)
             break
 
+        batches += 1
+        events_seen += len(response.chunk)
+
         for event in response.chunk:
             if is_supported_file(event):
                 total += 1
@@ -443,7 +449,10 @@ async def catchup_history(
             break
         token = response.end
 
-    log.info("Historical catchup complete — processed %d file event(s).", total)
+    log.info(
+        "Historical catchup complete — processed %d file event(s) across %d batches (%d total events).",
+        total, batches, events_seen,
+    )
 
 
 # ---------------------------------------------------------------------------