diff --git a/invenio_utilities_tuw/cli/drafts.py b/invenio_utilities_tuw/cli/drafts.py index de2059500358bf829ae7ffb63e9da33a04ce6d00..6f04e059a8f00ffa6727423a7d4a7c1aad09b62e 100644 --- a/invenio_utilities_tuw/cli/drafts.py +++ b/invenio_utilities_tuw/cli/drafts.py @@ -383,6 +383,9 @@ def verify_files(pid, pid_type): click.secho(f"{name}: failed checksum verification", fg="yellow", err=True) num_errors += 1 + # persist the 'last_check_at' timestamp for each file + db.session.commit() + if num_errors > 0: click.secho( f"{num_errors} files failed the checksum verification", @@ -390,6 +393,3 @@ def verify_files(pid, pid_type): err=True, ) sys.exit(1) - - # persist the 'last_check_at' timestamp for each file - db.session.commit() diff --git a/invenio_utilities_tuw/cli/files.py b/invenio_utilities_tuw/cli/files.py index 6417a8139b67b5f2ba1852f0bec9b6050be9d163..acd956c141571e4af7c545f2f4fd58004af546da 100644 --- a/invenio_utilities_tuw/cli/files.py +++ b/invenio_utilities_tuw/cli/files.py @@ -9,13 +9,15 @@ """Management commands for files.""" import os +import sys from collections import defaultdict import click from flask.cli import with_appcontext from invenio_access.permissions import system_identity from invenio_db import db -from invenio_files_rest.models import Location, ObjectVersion +from invenio_files_rest.models import FileInstance, Location, ObjectVersion +from sqlalchemy.exc import IntegrityError from ..utils import get_record_service from .options import option_pid_type, option_pid_value_optional @@ -40,7 +42,7 @@ def remove_file(file_path, max_rmdir_depth=3): depth += 1 -def get_orphaned_files(location): +def get_zombie_files(location): """Get a list of files in the given Location that aren't referenced in the DB.""" # see which files are on disk at the given location all_files = [] @@ -55,14 +57,69 @@ def get_orphaned_files(location): # likely denotes a soft-deleted file all_files.remove(obj.file.uri) + for file_instance in FileInstance.query.all(): + if file_instance.uri in all_files: + all_files.remove(file_instance.uri) + return all_files +def get_orphan_files(): + """Get a list of FileInstances that don't have associated ObjectVersions.""" + return FileInstance.query.filter(~FileInstance.objects.any()).all() + + @click.group() def files(): """Management commands for files.""" +@files.command("verify") +@click.option( + "verify_all", + "--all/--no-orphans", + "-a/-A", + default=False, + help="Verify all files, or just ones that aren't orphaned", +) +@with_appcontext +def verify_files(verify_all): + """Verify the checksums for all files.""" + num_errors = 0 + + for file_instance in FileInstance.query.all(): + if file_instance.objects or verify_all: + # build the display name from the file's URI and its object version keys + aliases = ", ".join([f'"{o.key}"' for o in file_instance.objects]) + name = f"{file_instance.uri} (alias {aliases or '<N/A>'})" + + try: + if file_instance.verify_checksum(): + click.secho(name, fg="green") + else: + click.secho( + f"{name}: failed checksum verification", fg="yellow", err=True + ) + num_errors += 1 + + except Exception as error: + click.secho( + f"{name}: failed checksum verification: {error}", fg="red", err=True + ) + num_errors += 1 + + # persist the 'last_check_at' timestamp for each file + db.session.commit() + + if num_errors > 0: + click.secho( + f"{num_errors} files failed the checksum verification", + fg="yellow", + err=True, + ) + sys.exit(1) + + @files.group("deleted") def deleted(): """Management commands for soft-deleted files.""" @@ -105,7 +162,7 @@ def list_deleted_files(pid, pid_type): @deleted.command("clean") @click.confirmation_option( - prompt="are you sure you want to permanently remove soft-deleted files?" + prompt="Are you sure you want to permanently remove soft-deleted files?" ) @option_pid_value_optional @option_pid_type @@ -153,16 +210,70 @@ def hard_delete_files(pid, pid_type): @files.group("orphans") def orphans(): - """Management commands for unreferenced files. + """Management commands for orphaned files. - Orphaned files are those that are still present in the storage, but are not - referenced by any Location's buckets anymore. + Orphaned files are those that have a FileInstance in the database without any + associated ObjectVersions. """ @orphans.command("list") @with_appcontext def list_orphan_files(): + """List orphaned files.""" + for file_instance in get_orphan_files(): + click.echo(file_instance.uri) + + +@orphans.command("clean") +@click.confirmation_option( + prompt="Are you sure you want to permanently remove orphan files?" +) +@click.option( + "--force", + "-f", + is_flag=True, + default=False, + help="Force deletion of files, even if they are marked as not writable", +) +@with_appcontext +def clean_orphan_files(force): + """Delete orphaned files from DB and storage.""" + for orphan in get_orphan_files(): + file_loc = orphan.uri + + # if the file isn't writable, let's leave it + if not orphan.writable and not force: + click.secho(f"{file_loc}: skipped (not writable)", fg="bright_black") + continue + + try: + orphan.delete() + db.session.flush() + orphan.storage().delete() + click.secho(file_loc, fg="green") + + except IntegrityError as error: + click.secho(f"{file_loc}: cannot delete from database: {error}", fg="red") + + except Exception as error: + click.secho(f"{file_loc}: error: {error}", fg="yellow") + + db.session.commit() + + +@files.group("zombies") +def zombies(): + """Management commands for unreferenced files. + + Zombie files are those that are still present in the storage, but are not + referenced by any Location's buckets or FileInstances anymore. + """ + + +@zombies.command("list") +@with_appcontext +def list_zombie_files(): """List existing files that aren't referenced in Invenio anymore.""" for loc in Location.query.all(): # we only know how to handle directories on the file system for now @@ -174,16 +285,16 @@ def list_orphan_files(): fg="yellow", ) - for fp in get_orphaned_files(loc): + for fp in get_zombie_files(loc): click.echo(f" {fp}") -@orphans.command("clean") +@zombies.command("clean") @click.confirmation_option( - prompt="are you sure you want to permanently remove orphaned files?" + prompt="Are you sure you want to permanently remove zombie files?" ) @with_appcontext -def clean_files(): +def clean_zombie_files(): """Delete existing files that aren't referenced in Invenio anymore.""" for loc in Location.query.all(): # we only know how to handle directories on the file system for now @@ -195,7 +306,7 @@ def clean_files(): ) continue - for fp in get_orphaned_files(loc): + for fp in get_zombie_files(loc): try: remove_file(fp) click.secho(fp, fg="green") diff --git a/invenio_utilities_tuw/cli/records.py b/invenio_utilities_tuw/cli/records.py index ee6e34dc5a00a6514527736d9de7fe86e2ecf7a8..b062a6e1c5b49afcb3191dc4914fd014bf6a8295 100644 --- a/invenio_utilities_tuw/cli/records.py +++ b/invenio_utilities_tuw/cli/records.py @@ -222,6 +222,9 @@ def verify_files(pid, pid_type, user): click.secho(f"{name}: failed checksum verification", fg="yellow", err=True) num_errors += 1 + # persist the 'last_check_at' timestamp for each file + db.session.commit() + if num_errors > 0: click.secho( f"{num_errors} files failed the checksum verification", @@ -230,9 +233,6 @@ def verify_files(pid, pid_type, user): ) sys.exit(1) - # persist the 'last_check_at' timestamp for each file - db.session.commit() - @records.command("reindex") @option_pid_values