ingest: Use timestamp-based filenames for WhatsApp files
WhatsApp files arrive with empty or non-descriptive body fields. Rather than falling back to generic names like "image.jpg" or "document.pdf", generate names from the event timestamp:
  whatsapp_YYYY-MM-DD_HH-MM-SS.jpg
  whatsapp_YYYY-MM-DD_HH-MM-SS.pdf
If the body contains text (e.g. a caption), it is prepended to the generated name:
  Test - whatsapp_2026-03-11_23-35-13.pdf
Files whose body already ends in the correct extension are used as-is.
This commit is contained in:
parent
fa4662b5f3
commit
9663232d84
1 changed file with 19 additions and 3 deletions
22
ingest.py
22
ingest.py
|
|
@ -214,16 +214,32 @@ def is_supported_file(event) -> bool:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def _whatsapp_filename(ts_ms: int, is_pdf: bool, body: str) -> str:
|
||||||
|
"""Generate a filename from the event timestamp, optionally prefixed with the body text."""
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
dt = datetime.fromtimestamp(ts_ms / 1000, tz=timezone.utc)
|
||||||
|
stamp = dt.strftime("%Y-%m-%d_%H-%M-%S")
|
||||||
|
ext = ".pdf" if is_pdf else ".jpg"
|
||||||
|
base = f"whatsapp_{stamp}{ext}"
|
||||||
|
if body:
|
||||||
|
return f"{body} - {base}"
|
||||||
|
return base
|
||||||
|
|
||||||
|
|
||||||
def extract_event_fields(event) -> tuple[str, str, str, Optional[str], bool]:
    """Returns (event_id, filename, mxc_url, encryption_info_json_or_None, is_pdf).

    For a BadEvent the fields are read from the raw ``event.source["content"]``
    dict: the URL comes from its "file" entry, and that entry is also returned
    serialised as JSON. For any other event the URL is ``event.url`` and the
    encryption info is None.

    The filename is the body itself when it already ends in the expected
    extension (".pdf" for files, ".jpg" otherwise, case-insensitive);
    otherwise a timestamp-based name is generated by ``_whatsapp_filename``.

    Raises:
        KeyError: for a BadEvent whose "file" entry has no "url" key.
    """
    if isinstance(event, BadEvent):
        content = event.source.get("content", {})
        file_info = content.get("file", {})
        is_pdf = content.get("msgtype") == "m.file"
        # "body" can be present but null; .get("body", "") would then return
        # None and break the .lower() call below, so normalise with `or`.
        body = content.get("body") or ""
        mxc_url = file_info["url"]
        enc_info = json.dumps(file_info)
    else:
        is_pdf = isinstance(event, RoomMessageFile)
        body = event.body or ""
        mxc_url = event.url
        enc_info = None

    # Filename selection is identical for both event shapes.
    ext = ".pdf" if is_pdf else ".jpg"
    if body.lower().endswith(ext):
        filename = body
    else:
        filename = _whatsapp_filename(event.server_timestamp, is_pdf, body)
    return event.event_id, filename, mxc_url, enc_info, is_pdf
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -500,7 +516,7 @@ async def retry_loop(
|
||||||
|
|
||||||
for event_id, filename, mxc_url, enc_info in rows:
|
for event_id, filename, mxc_url, enc_info in rows:
|
||||||
log.info("Retrying %s (%s)", filename, event_id)
|
log.info("Retrying %s (%s)", filename, event_id)
|
||||||
is_pdf = (filename or "").lower().endswith(".pdf")
|
is_pdf = filename.lower().endswith(".pdf")
|
||||||
await process_event(event_id, filename, mxc_url, enc_info, is_pdf,
|
await process_event(event_id, filename, mxc_url, enc_info, is_pdf,
|
||||||
matrix_client, db, paperless)
|
matrix_client, db, paperless)
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue