ingest: Use timestamp-based filenames for WhatsApp files

WhatsApp files arrive with empty or non-descriptive body fields. Rather
than falling back to generic names like "image.jpg" or "document.pdf",
generate names from the event's server timestamp (rendered in UTC):

  whatsapp_YYYY-MM-DD_HH-MM-SS.jpg
  whatsapp_YYYY-MM-DD_HH-MM-SS.pdf

If the body contains text (e.g. a caption), it is prepended:

  Test - whatsapp_2026-03-11_23-35-13.pdf

Events whose body already ends in the expected extension (".pdf" for
files, ".jpg" for images; compared case-insensitively) keep that body
unchanged as the filename.
This commit is contained in:
Jeena 2026-03-12 00:08:26 +00:00
parent fa4662b5f3
commit 9663232d84

View file

@ -214,16 +214,32 @@ def is_supported_file(event) -> bool:
return False
def _whatsapp_filename(ts_ms: int, is_pdf: bool, body: str) -> str:
"""Generate a filename from the event timestamp, optionally prefixed with the body text."""
from datetime import datetime, timezone
dt = datetime.fromtimestamp(ts_ms / 1000, tz=timezone.utc)
stamp = dt.strftime("%Y-%m-%d_%H-%M-%S")
ext = ".pdf" if is_pdf else ".jpg"
base = f"whatsapp_{stamp}{ext}"
if body:
return f"{body} - {base}"
return base
def extract_event_fields(event) -> tuple[str, str, str, Optional[str], bool]:
    """Returns (event_id, filename, mxc_url, encryption_info_json_or_None, is_pdf).

    For a ``BadEvent`` the fields are read from the raw
    ``event.source["content"]`` dict: the ``file`` entry supplies the URL and
    is also returned JSON-encoded as the fourth element, and the event counts
    as a PDF when ``msgtype`` is ``"m.file"``. For any other event the
    attributes ``event.body`` / ``event.url`` are used, the fourth element is
    ``None``, and the event counts as a PDF when it is a ``RoomMessageFile``.

    The filename is the body itself when it already ends (case-insensitively)
    in the expected extension; otherwise it is generated by
    ``_whatsapp_filename`` from ``event.server_timestamp`` and the body.

    Raises:
        KeyError: if a ``BadEvent``'s ``file`` entry has no ``"url"`` key.
    """
    if isinstance(event, BadEvent):
        content = event.source.get("content", {})
        file_info = content.get("file", {})
        # ``or ""`` (rather than a .get default) also covers a body that is
        # present but null, matching the guard used for regular events below.
        body = content.get("body") or ""
        is_pdf = content.get("msgtype") == "m.file"
        mxc_url = file_info["url"]
        enc_info = json.dumps(file_info)
    else:
        body = event.body or ""
        is_pdf = isinstance(event, RoomMessageFile)
        mxc_url = event.url
        enc_info = None

    # Shared by both branches: keep a body that already looks like a proper
    # filename, otherwise fall back to the timestamp-based name.
    ext = ".pdf" if is_pdf else ".jpg"
    if body.lower().endswith(ext):
        filename = body
    else:
        filename = _whatsapp_filename(event.server_timestamp, is_pdf, body)

    return event.event_id, filename, mxc_url, enc_info, is_pdf
@ -500,7 +516,7 @@ async def retry_loop(
for event_id, filename, mxc_url, enc_info in rows:
log.info("Retrying %s (%s)", filename, event_id)
is_pdf = (filename or "").lower().endswith(".pdf")
is_pdf = filename.lower().endswith(".pdf")
await process_event(event_id, filename, mxc_url, enc_info, is_pdf,
matrix_client, db, paperless)