From fa4662b5f379b55b05db0dc5ac8f60e9f30b1f58 Mon Sep 17 00:00:00 2001 From: Jeena Date: Thu, 12 Mar 2026 00:03:18 +0000 Subject: [PATCH] ingest: Determine PDF vs JPEG from event type, not filename MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit WhatsApp bridge files may have arbitrary body text (e.g. "Test") that does not end in .pdf, causing the filename-based magic byte check to apply JPEG validation to PDF files and reject them. Pass is_pdf through extract_event_fields and process_event based on the Matrix event type (RoomMessageFile → PDF, RoomMessageImage → JPEG, BadEvent → inferred from msgtype), so validation and content-type no longer depend on the filename for events taken from the initial sync, history catch-up, and live callbacks. Retries in retry_loop still infer is_pdf from the stored filename, because the rows they read carry no event type. --- ingest.py | 46 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/ingest.py b/ingest.py index 9b0d0ae..6659507 100644 --- a/ingest.py +++ b/ingest.py @@ -214,32 +214,30 @@ def is_supported_file(event) -> bool: return False -def extract_event_fields(event) -> tuple[str, str, str, Optional[str]]: - """Returns (event_id, filename, mxc_url, encryption_info_json_or_None).""" +def extract_event_fields(event) -> tuple[str, str, str, Optional[str], bool]: + """Returns (event_id, filename, mxc_url, encryption_info_json_or_None, is_pdf).""" if isinstance(event, BadEvent): content = event.source.get("content", {}) filename = content.get("body", "unknown") file_info = content.get("file", {}) - return event.event_id, filename, file_info["url"], json.dumps(file_info) - if isinstance(event, RoomMessageFile): - filename = event.body or "document.pdf" - else: - filename = event.body or "image.jpg" - return event.event_id, filename, event.url, None + is_pdf = content.get("msgtype") == "m.file" + return event.event_id, filename, file_info["url"], json.dumps(file_info), is_pdf + is_pdf = isinstance(event, RoomMessageFile) + filename = event.body or ("document.pdf" if is_pdf else "image.jpg") + return 
event.event_id, filename, event.url, None, is_pdf -def content_type_for(filename: str) -> str: - return "application/pdf" if filename.lower().endswith(".pdf") else "image/jpeg" +def content_type_for(is_pdf: bool) -> str: + return "application/pdf" if is_pdf else "image/jpeg" -def validate_file(path: Path, filename: str) -> None: - """Raise ValueError if the file doesn't look like the format its name claims.""" +def validate_file(path: Path, filename: str, is_pdf: bool) -> None: + """Raise ValueError if the file content doesn't match the expected format.""" data = path.read_bytes()[:8] - if filename.lower().endswith(".pdf"): + if is_pdf: if not data.startswith(b"%PDF"): raise ValueError(f"File does not start with %PDF magic bytes: {filename}") else: - # JPEG: starts with FF D8 FF if not data[:3] == b"\xff\xd8\xff": raise ValueError(f"File does not start with JPEG magic bytes: {filename}") @@ -343,6 +341,7 @@ async def process_event( filename: str, mxc_url: str, encryption_info: Optional[str], + is_pdf: bool, matrix_client: AsyncClient, db: aiosqlite.Connection, paperless: PaperlessClient, @@ -377,7 +376,7 @@ async def process_event( # Validate the file looks like what it claims to be; skip unsupported formats try: - validate_file(tmp_path, filename) + validate_file(tmp_path, filename, is_pdf) except ValueError as exc: log.info("Skipping unsupported file %s: %s", filename, exc) await upsert_event(db, event_id, filename, mxc_url, "skipped", encryption_info) @@ -393,7 +392,7 @@ async def process_event( return # Upload and wait for Paperless to confirm it landed - task_id = await paperless.upload(filename, tmp_path, content_type_for(filename)) + task_id = await paperless.upload(filename, tmp_path, content_type_for(is_pdf)) log.info("Uploaded %s → waiting for Paperless task %s", filename, task_id) doc_id = await paperless.wait_for_task(task_id) @@ -464,9 +463,9 @@ async def catchup_history( for event in response.chunk: if is_supported_file(event): total += 1 - event_id, 
filename, mxc_url, enc_info = extract_event_fields(event) + event_id, filename, mxc_url, enc_info, is_pdf = extract_event_fields(event) await process_event( - event_id, filename, mxc_url, enc_info, + event_id, filename, mxc_url, enc_info, is_pdf, matrix_client, db, paperless, ) @@ -501,7 +500,8 @@ async def retry_loop( for event_id, filename, mxc_url, enc_info in rows: log.info("Retrying %s (%s)", filename, event_id) - await process_event(event_id, filename, mxc_url, enc_info, + is_pdf = (filename or "").lower().endswith(".pdf") + await process_event(event_id, filename, mxc_url, enc_info, is_pdf, matrix_client, db, paperless) @@ -557,8 +557,8 @@ async def main() -> None: # Process events from the initial sync timeline first for event in room_timeline.timeline.events: if is_supported_file(event): - event_id, filename, mxc_url, enc_info = extract_event_fields(event) - await process_event(event_id, filename, mxc_url, enc_info, + event_id, filename, mxc_url, enc_info, is_pdf = extract_event_fields(event) + await process_event(event_id, filename, mxc_url, enc_info, is_pdf, matrix_client, db, paperless) # Then paginate backwards for older history @@ -567,9 +567,9 @@ async def main() -> None: async def on_file(room, event): if room.room_id == MATRIX_ROOM_ID and is_supported_file(event): - event_id, filename, mxc_url, enc_info = extract_event_fields(event) + event_id, filename, mxc_url, enc_info, is_pdf = extract_event_fields(event) await process_event( - event_id, filename, mxc_url, enc_info, + event_id, filename, mxc_url, enc_info, is_pdf, matrix_client, db, paperless, )