From d9e78879fa9f3802a5530bb39c9956ebebf8c7b2 Mon Sep 17 00:00:00 2001
From: Maximilian Moser <maximilian.moser@tuwien.ac.at>
Date: Wed, 28 Apr 2021 14:16:42 +0200
Subject: [PATCH] Update implementation for orphan file operations

* it seems that when drafts are deleted, the references to their files
  are removed from the DB, but the files themselves are kept on disk
* thus, we now consider files to be orphaned if they are still present in
  the file system, but don't have entries in the database anymore
---
 invenio_utilities_tuw/cli/files.py | 97 +++++++++++++++++++-----------
 1 file changed, 61 insertions(+), 36 deletions(-)

diff --git a/invenio_utilities_tuw/cli/files.py b/invenio_utilities_tuw/cli/files.py
index 006b0be..95d8790 100644
--- a/invenio_utilities_tuw/cli/files.py
+++ b/invenio_utilities_tuw/cli/files.py
@@ -8,18 +8,52 @@
 
 """Management commands for files."""
 
+import os
 from collections import defaultdict
 
 import click
 from flask.cli import with_appcontext
 from invenio_db import db
-from invenio_files_rest.models import Bucket, FileInstance, ObjectVersion
+from invenio_files_rest.models import Location, ObjectVersion
 
 from ..utils import get_record_service
 from .options import option_as_user, option_pid_type, option_pid_value
 from .utils import convert_to_recid, get_identity_for_user
 
 
+def remove_file(file_path, max_rmdir_depth=3):
+    """Remove the file and directories in its path that just became empty.
+
+    At most `max_rmdir_depth` parent directories are removed. Errors from
+    removing the file itself propagate; directory cleanup is best-effort.
+    """
+    os.remove(file_path)
+    path = os.path.dirname(file_path)
+    for _ in range(max_rmdir_depth):
+        try:
+            # rmdir only succeeds on empty directories, so a separate (racy)
+            # listdir check isn't needed; any failure just ends the cleanup
+            os.rmdir(path)
+        except OSError:
+            break
+        path = os.path.dirname(path)
+
+
+def get_orphaned_files(location):
+    """Get a list of files in the given Location that aren't referenced in the DB.
+
+    A set of known URIs is used, so a URI that is referenced more than once, or is
+    missing on disk, or an object without a file, doesn't break the filtering.
+    """
+    known = set()
+    for bucket in location.buckets:
+        known.update(obj.file.uri for obj in bucket.objects if obj.file is not None)
+
+    # walk the location on disk and keep only the files invenio doesn't know about
+    on_disk = (os.path.join(p, f) for (p, _, fs) in os.walk(location.uri) for f in fs)
+    return [fp for fp in on_disk if fp not in known]
+
+
 @click.group()
 def files():
     """Utility commands for management of files."""
@@ -122,35 +156,26 @@ def hard_delete_files(user, pid, pid_type):
 
 @files.group("orphans")
 def orphans():
-    """Management commands for orphaned files (without ObjectVersions)."""
+    """Management commands for orphaned files (on disk, but unknown to the DB)."""
     pass
 
 
 @orphans.command("list")
 @with_appcontext
 def list_orphan_files():
-    """List files that aren't referenced in any records (anymore)."""
-    # TODO iterate over all records & drafts, get their buckets
-    #      and check which buckets from the db aren't listed
-    service = get_record_service()
-    record_model_cls = service.record_cls.model_cls
-    draft_model_cls = service.draft_cls.model_cls
-
-    bucket_ids = set(
-        (
-            r.bucket.id
-            for r in (record_model_cls.query.all() + draft_model_cls.query.all())
-            if r.bucket is not None
-        )
-    )
-    print(len(bucket_ids))
-    buckets = Bucket.query.filter(~Bucket.id.in_(bucket_ids)).all()
-    print(len(buckets))
-
-    for bucket in buckets:
-        for ov in bucket.objects:
-            if ov.file is not None:
-                click.secho(ov.file.uri, fg="yellow")
+    """List files from locations that aren't referenced in any DB models anymore."""
+    for loc in Location.query.all():
+        # we only know how to handle directories on the file system for now
+        if not os.path.isdir(loc.uri):
+            click.secho(
+                "warning: location '{}' is not a path: {}".format(loc.name, loc.uri),
+                fg="yellow",
+            )
+            continue
+
+        click.echo("location: {}".format(loc.name))
+        for fp in get_orphaned_files(loc):
+            click.echo("  {}".format(fp))
 
 
 @orphans.command("clean")
@@ -160,16 +185,16 @@ def list_orphan_files():
 @option_as_user
 @with_appcontext
 def clean_files(user):
-    """Remove files that do not have associated ObjectVersions (anymore)."""
-    service = get_record_service()
-    identity = get_identity_for_user(user)
-    service.require_permission(identity, "delete")
+    """Remove files from locations that aren't referenced in any DB models anymore."""
+    for loc in Location.query.all():
+        # we only know how to handle directories on the file system for now
+        if not os.path.isdir(loc.uri):
+            continue

-    for fi in (f for f in FileInstance.query.all() if not f.objects):
-        try:
-            storage = fi.storage()
-            fi.delete()
-            storage.delete()
-            click.secho(fi.uri, fg="red")
-        except:
-            click.secho("cannot delete file: %s" % fi.uri, fg="yellow")
+        for fp in get_orphaned_files(loc):
+            try:
+                remove_file(fp)
+            except OSError:  # red: not removed; carry on with the other files
+                click.secho(fp, fg="red")
+            else:
+                click.secho(fp, fg="green")
-- 
GitLab