From fa4662b5f379b55b05db0dc5ac8f60e9f30b1f58 Mon Sep 17 00:00:00 2001
From: Jeena <hello@jeena.net>
Date: Thu, 12 Mar 2026 00:03:18 +0000
Subject: [PATCH] ingest: Determine PDF vs JPEG from event type, not filename
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

WhatsApp bridge files may have arbitrary body text (e.g. "Test") that
does not end in .pdf, causing the filename-based magic byte check to
apply JPEG validation to PDF files and reject them.

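Pass is_pdf through extract_event_fields and process_event based on
the Matrix event type (RoomMessageFile → PDF, RoomMessageImage → JPEG,
BadEvent → inferred from msgtype), so validation and content-type no
longer depend on the filename for events handled via
extract_event_fields (initial sync, history catch-up, live events).

Known limitation: retry_loop still derives is_pdf from the stored
filename (.pdf suffix), since its rows carry no event type, so a
retried PDF whose body does not end in .pdf is still checked as JPEG.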
---
 ingest.py | 46 +++++++++++++++++++++++-----------------------
 1 file changed, 23 insertions(+), 23 deletions(-)

diff --git a/ingest.py b/ingest.py
index 9b0d0ae..6659507 100644
--- a/ingest.py
+++ b/ingest.py
@@ -214,32 +214,30 @@ def is_supported_file(event) -> bool:
     return False
 
 
-def extract_event_fields(event) -> tuple[str, str, str, Optional[str]]:
-    """Returns (event_id, filename, mxc_url, encryption_info_json_or_None)."""
+def extract_event_fields(event) -> tuple[str, str, str, Optional[str], bool]:
+    """Returns (event_id, filename, mxc_url, encryption_info_json_or_None, is_pdf)."""
     if isinstance(event, BadEvent):
         content = event.source.get("content", {})
         filename = content.get("body", "unknown")
         file_info = content.get("file", {})
-        return event.event_id, filename, file_info["url"], json.dumps(file_info)
-    if isinstance(event, RoomMessageFile):
-        filename = event.body or "document.pdf"
-    else:
-        filename = event.body or "image.jpg"
-    return event.event_id, filename, event.url, None
+        is_pdf = content.get("msgtype") == "m.file"
+        return event.event_id, filename, file_info["url"], json.dumps(file_info), is_pdf
+    is_pdf = isinstance(event, RoomMessageFile)
+    filename = event.body or ("document.pdf" if is_pdf else "image.jpg")
+    return event.event_id, filename, event.url, None, is_pdf
 
 
-def content_type_for(filename: str) -> str:
-    return "application/pdf" if filename.lower().endswith(".pdf") else "image/jpeg"
+def content_type_for(is_pdf: bool) -> str:
+    return "application/pdf" if is_pdf else "image/jpeg"
 
 
-def validate_file(path: Path, filename: str) -> None:
-    """Raise ValueError if the file doesn't look like the format its name claims."""
+def validate_file(path: Path, filename: str, is_pdf: bool) -> None:
+    """Raise ValueError if the file content doesn't match the expected format."""
     data = path.read_bytes()[:8]
-    if filename.lower().endswith(".pdf"):
+    if is_pdf:
         if not data.startswith(b"%PDF"):
             raise ValueError(f"File does not start with %PDF magic bytes: {filename}")
     else:
-        # JPEG: starts with FF D8 FF
         if not data[:3] == b"\xff\xd8\xff":
             raise ValueError(f"File does not start with JPEG magic bytes: {filename}")
 
@@ -343,6 +341,7 @@ async def process_event(
     filename: str,
     mxc_url: str,
     encryption_info: Optional[str],
+    is_pdf: bool,
     matrix_client: AsyncClient,
     db: aiosqlite.Connection,
     paperless: PaperlessClient,
@@ -377,7 +376,7 @@ async def process_event(
 
         # Validate the file looks like what it claims to be; skip unsupported formats
         try:
-            validate_file(tmp_path, filename)
+            validate_file(tmp_path, filename, is_pdf)
         except ValueError as exc:
             log.info("Skipping unsupported file %s: %s", filename, exc)
             await upsert_event(db, event_id, filename, mxc_url, "skipped", encryption_info)
@@ -393,7 +392,7 @@ async def process_event(
             return
 
         # Upload and wait for Paperless to confirm it landed
-        task_id = await paperless.upload(filename, tmp_path, content_type_for(filename))
+        task_id = await paperless.upload(filename, tmp_path, content_type_for(is_pdf))
         log.info("Uploaded %s → waiting for Paperless task %s", filename, task_id)
         doc_id = await paperless.wait_for_task(task_id)
 
@@ -464,9 +463,9 @@ async def catchup_history(
         for event in response.chunk:
             if is_supported_file(event):
                 total += 1
-                event_id, filename, mxc_url, enc_info = extract_event_fields(event)
+                event_id, filename, mxc_url, enc_info, is_pdf = extract_event_fields(event)
                 await process_event(
-                    event_id, filename, mxc_url, enc_info,
+                    event_id, filename, mxc_url, enc_info, is_pdf,
                     matrix_client, db, paperless,
                 )
 
@@ -501,7 +500,8 @@ async def retry_loop(
 
         for event_id, filename, mxc_url, enc_info in rows:
             log.info("Retrying %s (%s)", filename, event_id)
-            await process_event(event_id, filename, mxc_url, enc_info,
+            is_pdf = (filename or "").lower().endswith(".pdf")
+            await process_event(event_id, filename, mxc_url, enc_info, is_pdf,
                                 matrix_client, db, paperless)
 
 
@@ -557,8 +557,8 @@ async def main() -> None:
             # Process events from the initial sync timeline first
             for event in room_timeline.timeline.events:
                 if is_supported_file(event):
-                    event_id, filename, mxc_url, enc_info = extract_event_fields(event)
-                    await process_event(event_id, filename, mxc_url, enc_info,
+                    event_id, filename, mxc_url, enc_info, is_pdf = extract_event_fields(event)
+                    await process_event(event_id, filename, mxc_url, enc_info, is_pdf,
                                         matrix_client, db, paperless)
 
             # Then paginate backwards for older history
@@ -567,9 +567,9 @@ async def main() -> None:
 
             async def on_file(room, event):
                 if room.room_id == MATRIX_ROOM_ID and is_supported_file(event):
-                    event_id, filename, mxc_url, enc_info = extract_event_fields(event)
+                    event_id, filename, mxc_url, enc_info, is_pdf = extract_event_fields(event)
                     await process_event(
-                        event_id, filename, mxc_url, enc_info,
+                        event_id, filename, mxc_url, enc_info, is_pdf,
                         matrix_client, db, paperless,
                     )