From 7ad96bae8f17b03ea8965fcf9fb0f9fa2ec21bc6 Mon Sep 17 00:00:00 2001
From: Maximilian Moser <maximilian.moser@tuwien.ac.at>
Date: Mon, 10 Feb 2025 18:18:23 +0100
Subject: [PATCH] Add background task for cleaning up dead files

* sometimes the cleanup operations for failed file uploads seem to leave
  behind some `FileInstance` entries in the database that have a null URI
* these file instances cause the periodic health checks to fail and
  trigger warning emails
---
 invenio_config_tuw/tasks.py | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/invenio_config_tuw/tasks.py b/invenio_config_tuw/tasks.py
index 1103149..36f5b1d 100644
--- a/invenio_config_tuw/tasks.py
+++ b/invenio_config_tuw/tasks.py
@@ -10,9 +10,12 @@
 from typing import Optional
 
 from celery import shared_task
+from celery.schedules import crontab
 from flask import current_app, url_for
 from invenio_access.permissions import system_identity
 from invenio_accounts.proxies import current_datastore
+from invenio_db import db
+from invenio_files_rest.models import FileInstance
 from invenio_notifications.tasks import broadcast_notification
 from invenio_rdm_records.proxies import current_rdm_records_service as records_service
 
@@ -78,3 +81,27 @@ def send_publication_notification(recid: str, user_id: Optional[str] = None):
         html_message=html_message,
     )
     broadcast_notification(notification.dumps())
+
+
+@shared_task
+def remove_dead_files():
+    """Remove dead file instances (that don't have a URI) from the database.
+
+    These entries seem to be leftovers from failed uploads that don't get cleaned
+    up properly. Object versions pointing at them are removed along with them.
+    """
+    dead_file_instances = FileInstance.query.filter(FileInstance.uri.is_(None)).all()
+    for fi in dead_file_instances:
+        # children first: loading `fi.objects` may autoflush pending deletes
+        for o in fi.objects:
+            db.session.delete(o)
+        db.session.delete(fi)
+    db.session.commit()
+
+
+CELERY_BEAT_SCHEDULE = {  # presumably merged into the app's beat config - confirm
+    "clean-dead-files": {
+        "task": "invenio_config_tuw.tasks.remove_dead_files",
+        "schedule": crontab(minute=1, hour=2),  # daily at 02:01 (Celery timezone)
+    },
+}
-- 
GitLab