From d9e78879fa9f3802a5530bb39c9956ebebf8c7b2 Mon Sep 17 00:00:00 2001
From: Maximilian Moser <maximilian.moser@tuwien.ac.at>
Date: Wed, 28 Apr 2021 14:16:42 +0200
Subject: [PATCH] Update implementation for orphan file operations

* it seems that during deletion of drafts, the references to those files are
  deleted from the DB, but the files are still kept on disk
* thus, we now consider files to be orphaned that are still present in the
  file system, but don't have entries in the database anymore
---
Note: the added lines of get_orphaned_files() and the clean_files() docstring
were amended by hand after export; the hunk headers and the diffstat were
adjusted accordingly, but the blob hashes in the "index" line still describe
the original export.

 invenio_utilities_tuw/cli/files.py | 106 +++++++++++++++++++----------
 1 file changed, 70 insertions(+), 36 deletions(-)

diff --git a/invenio_utilities_tuw/cli/files.py b/invenio_utilities_tuw/cli/files.py
index 006b0be..95d8790 100644
--- a/invenio_utilities_tuw/cli/files.py
+++ b/invenio_utilities_tuw/cli/files.py
@@ -8,18 +8,61 @@
 
 """Management commands for files."""
 
+import os
 from collections import defaultdict
 
 import click
 from flask.cli import with_appcontext
 from invenio_db import db
-from invenio_files_rest.models import Bucket, FileInstance, ObjectVersion
+from invenio_files_rest.models import Location, ObjectVersion
 
 from ..utils import get_record_service
 from .options import option_as_user, option_pid_type, option_pid_value
 from .utils import convert_to_recid, get_identity_for_user
 
 
+def remove_file(file_path, max_rmdir_depth=3):
+    """Remove the file and directories in its path that just became empty."""
+    os.remove(file_path)
+    path = os.path.dirname(file_path)
+    depth = 0
+
+    # only delete directories until the maximum rmdir depth is hit, or the
+    # directory contains files, or we hit a permission error
+    while depth < max_rmdir_depth and not os.listdir(path):
+        try:
+            os.rmdir(path)
+        except PermissionError:
+            break
+
+        path, _ = os.path.split(path)
+        depth += 1
+
+
+def get_orphaned_files(location):
+    """Get a list of files in the given Location that aren't referenced in the DB.
+
+    Files on disk are matched against the URIs of the files referenced by the
+    ObjectVersions in the Location's buckets.
+    """
+    # collect the URIs of all files that invenio has knowledge about
+    # (several ObjectVersions may share a file, and some may have no file at all)
+    known_files = {
+        obj.file.uri
+        for bucket in location.buckets
+        for obj in bucket.objects
+        if obj.file is not None
+    }
+
+    # see which files are on disk at the given location
+    all_files = []
+    for (p, _, files) in os.walk(location.uri):
+        all_files += [os.path.join(p, f) for f in files]
+
+    # filter out those files that invenio has knowledge about
+    return [fp for fp in all_files if fp not in known_files]
+
+
 @click.group()
 def files():
     """Utility commands for management of files."""
@@ -122,35 +165,26 @@ def hard_delete_files(user, pid, pid_type):
 
 @files.group("orphans")
 def orphans():
-    """Management commands for orphaned files (without ObjectVersions)."""
+    """Management commands for orphaned files."""
     pass
 
 
 @orphans.command("list")
 @with_appcontext
 def list_orphan_files():
-    """List files that aren't referenced in any records (anymore)."""
-    # TODO iterate over all records & drafts, get their buckets
-    # and check which buckets from the db aren't listed
-    service = get_record_service()
-    record_model_cls = service.record_cls.model_cls
-    draft_model_cls = service.draft_cls.model_cls
-
-    bucket_ids = set(
-        (
-            r.bucket.id
-            for r in (record_model_cls.query.all() + draft_model_cls.query.all())
-            if r.bucket is not None
-        )
-    )
-    print(len(bucket_ids))
-    buckets = Bucket.query.filter(~Bucket.id.in_(bucket_ids)).all()
-    print(len(buckets))
-
-    for bucket in buckets:
-        for ov in bucket.objects:
-            if ov.file is not None:
-                click.secho(ov.file.uri, fg="yellow")
+    """List files from locations that aren't referenced in any DB models anymore."""
+    for loc in Location.query.all():
+        # we only know how to handle directories on the file system for now
+        if os.path.isdir(loc.uri):
+            click.echo("location: {}".format(loc.name))
+        else:
+            click.secho(
+                "warning: location '{}' is not a path: {}".format(loc.name, loc.uri),
+                fg="yellow",
+            )
+
+        for fp in get_orphaned_files(loc):
+            click.echo(" {}".format(fp))
 
 
 @orphans.command("clean")
@@ -160,16 +194,16 @@ def list_orphan_files():
 @option_as_user
 @with_appcontext
 def clean_files(user):
-    """Remove files that do not have associated ObjectVersions (anymore)."""
-    service = get_record_service()
-    identity = get_identity_for_user(user)
-    service.require_permission(identity, "delete")
+    """Remove files from locations that aren't referenced in any DB models anymore."""
+    for loc in Location.query.all():
+        # we only know how to handle directories on the file system for now
+        if not os.path.isdir(loc.uri):
+            continue
 
-    for fi in (f for f in FileInstance.query.all() if not f.objects):
-        try:
-            storage = fi.storage()
-            fi.delete()
-            storage.delete()
-            click.secho(fi.uri, fg="red")
-        except:
-            click.secho("cannot delete file: %s" % fi.uri, fg="yellow")
+        for fp in get_orphaned_files(loc):
+            try:
+                remove_file(fp)
+                click.secho(fp, fg="green")
+
+            except PermissionError:
+                click.secho(fp, fg="red")
-- 
GitLab