*** Wartungsfenster jeden ersten Mittwoch vormittag im Monat ***

Skip to content
Snippets Groups Projects
Commit d9e78879 authored by Moser, Maximilian
Browse files

Update implementation for orphan file operations

* it seems that during deletion of drafts, the references to those files
  are deleted from the DB, but the files are still kept on disk
* thus, we now consider files to be orphaned that are still present in
  the file system, but don't have entries in the database anymore
parent a9523262
No related branches found
No related tags found
No related merge requests found
......@@ -8,18 +8,52 @@
"""Management commands for files."""
import os
from collections import defaultdict
import click
from flask.cli import with_appcontext
from invenio_db import db
from invenio_files_rest.models import Bucket, FileInstance, ObjectVersion
from invenio_files_rest.models import Location, ObjectVersion
from ..utils import get_record_service
from .options import option_as_user, option_pid_type, option_pid_value
from .utils import convert_to_recid, get_identity_for_user
def remove_file(file_path, max_rmdir_depth=3):
    """Remove the file and directories in its path that just became empty.

    After ``file_path`` has been deleted, its parent directories are walked
    upwards and every directory that is now empty is removed as well, for at
    most ``max_rmdir_depth`` levels.

    :param file_path: Path of the file to delete.
    :param max_rmdir_depth: Maximum number of parent directories to remove.
    :raises OSError: If the file itself cannot be removed (whatever
        ``os.remove`` raises, e.g. ``PermissionError``).
    """
    os.remove(file_path)
    path = os.path.dirname(file_path)

    # the directory cleanup is best-effort: the file is already gone at this
    # point, so trouble with a parent directory (not listable, not removable,
    # removed concurrently, or an empty path for bare relative file names)
    # must not be reported to the caller as a failure to remove the file
    for _ in range(max_rmdir_depth):
        try:
            if os.listdir(path):
                # the directory still contains entries, stop climbing
                break
            os.rmdir(path)
        except OSError:
            break

        path = os.path.dirname(path)
def get_orphaned_files(location):
    """Get a list of files in the given Location that aren't referenced in the DB.

    :param location: Object with a ``uri`` (directory that is walked
        recursively) and ``buckets``; each bucket has ``objects`` whose
        ``file.uri`` is compared verbatim against the paths found on disk.
    :return: List of paths below ``location.uri`` that no object refers to,
        in the order in which ``os.walk`` reports them.
    """
    # collect the files that invenio has knowledge about; using a set means
    # that several objects pointing at the same file, or entries whose file
    # is not on disk (anymore), are simply ignored rather than causing errors
    known_uris = set()
    for bucket in location.buckets:
        for obj in bucket.objects:
            # objects without an attached file have nothing on disk
            if obj.file is not None:
                known_uris.add(obj.file.uri)

    # see which files are on disk at the given location, and keep only
    # those that are not known
    orphaned = []
    for dirpath, _, filenames in os.walk(location.uri):
        for name in filenames:
            file_path = os.path.join(dirpath, name)
            if file_path not in known_uris:
                orphaned.append(file_path)

    return orphaned
# Root click group of this module; sub-groups such as "orphans" are attached
# to it via `@files.group(...)`. The docstring below doubles as the group's
# help text in the CLI, so it is kept unchanged.
@click.group()
def files():
    """Utility commands for management of files."""
......@@ -122,35 +156,26 @@ def hard_delete_files(user, pid, pid_type):
# "Orphaned" files are files that are still present in the file system but
# don't have entries in the database anymore.
@files.group("orphans")
def orphans():
    """Management commands for orphaned files."""
    pass
@orphans.command("list")
@with_appcontext
def list_orphan_files():
    """List files from locations that aren't referenced in any DB models anymore."""
    for loc in Location.query.all():
        # we only know how to handle directories on the file system for now
        if not os.path.isdir(loc.uri):
            click.secho(
                "warning: location '{}' is not a path: {}".format(loc.name, loc.uri),
                fg="yellow",
            )
            # there is nothing on disk that could be scanned for this location
            continue

        click.echo("location: {}".format(loc.name))
        for fp in get_orphaned_files(loc):
            click.echo(" {}".format(fp))
@orphans.command("clean")
......@@ -160,16 +185,16 @@ def list_orphan_files():
@option_as_user
@with_appcontext
def clean_files(user):
    """Remove files from locations that aren't referenced in any DB models anymore."""
    # NOTE: `user` (presumably supplied by the `option_as_user` decorator) is
    # kept so the command's interface stays the same; it is not used here.
    for loc in Location.query.all():
        # we only know how to handle directories on the file system for now
        if not os.path.isdir(loc.uri):
            continue

        for fp in get_orphaned_files(loc):
            try:
                remove_file(fp)
            except OSError:
                # PermissionError, but also e.g. files that vanished in the
                # meantime; report the failure and carry on with the rest
                click.secho(fp, fg="red")
            else:
                click.secho(fp, fg="green")
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment