From 943e34256a0e6b3c7cc2362e84f5e371ed5a2911 Mon Sep 17 00:00:00 2001
From: Maximilian Moser <maximilian.moser@tuwien.ac.at>
Date: Tue, 6 Feb 2024 17:05:59 +0100
Subject: [PATCH] Refactor formatscaper

* split the code into separate reusable modules
* add config files for linters & formatters
* force the progress bar to update after every sf run
* mask foot terminal as xterm for nicer progressbar output
---
 formatscaper/.flake8          |   3 +
 formatscaper/.isort.cfg       |   2 +
 formatscaper/core/__init__.py |   7 ++
 formatscaper/core/models.py   |  55 ++++++++++++++
 formatscaper/core/utils.py    |  86 +++++++++++++++++++++
 formatscaper/formatscaper.py  | 137 ++++++++++++----------------------
 6 files changed, 201 insertions(+), 89 deletions(-)
 create mode 100644 formatscaper/.flake8
 create mode 100644 formatscaper/.isort.cfg
 create mode 100644 formatscaper/core/__init__.py
 create mode 100644 formatscaper/core/models.py
 create mode 100644 formatscaper/core/utils.py

diff --git a/formatscaper/.flake8 b/formatscaper/.flake8
new file mode 100644
index 0000000..f295e07
--- /dev/null
+++ b/formatscaper/.flake8
@@ -0,0 +1,3 @@
+[flake8]
+max-line-length = 88
+extend-ignore = E203, E704
diff --git a/formatscaper/.isort.cfg b/formatscaper/.isort.cfg
new file mode 100644
index 0000000..f238bf7
--- /dev/null
+++ b/formatscaper/.isort.cfg
@@ -0,0 +1,2 @@
+[settings]
+profile = black
diff --git a/formatscaper/core/__init__.py b/formatscaper/core/__init__.py
new file mode 100644
index 0000000..a570ab6
--- /dev/null
+++ b/formatscaper/core/__init__.py
@@ -0,0 +1,7 @@
+from .models import Format, RecordFile, Result
+
+__all__ = (
+    "Format",
+    "RecordFile",
+    "Result",
+)
diff --git a/formatscaper/core/models.py b/formatscaper/core/models.py
new file mode 100644
index 0000000..85d68d3
--- /dev/null
+++ b/formatscaper/core/models.py
@@ -0,0 +1,55 @@
+import dataclasses
+
+
+@dataclasses.dataclass
+class RecordFile:
+    """Information about a file to be investigated."""
+
+    filename: str
+    uri: str
+    record: str
+
+
+@dataclasses.dataclass
+class Format:
+    """Information about a file format."""
+
+    puid: str
+    name: str
+    mime: str
+    endangered: bool = False
+
+    def as_dict(self):
+        """Dump the data as dictionary."""
+        return {
+            "puid": self.puid,
+            "name": self.name,
+            "mime": self.mime,
+            "endangered": self.endangered,
+        }
+
+    @classmethod
+    def from_sf_dict(cls, dictionary):
+        """Parse the format from siegfried's output."""
+        return cls(
+            puid=dictionary["id"],
+            name=dictionary["format"],
+            mime=dictionary["mime"],
+        )
+
+
+@dataclasses.dataclass
+class Result:
+    """The format identification result for a given file."""
+
+    filename: str
+    record: str
+    format: Format
+
+    def as_dict(self):
+        """Dump the data as dictionary."""
+        return {
+            "filename": self.filename,
+            "record": self.record,
+            "format": self.format.as_dict(),
+        }
diff --git a/formatscaper/core/utils.py b/formatscaper/core/utils.py
new file mode 100644
index 0000000..1aa3139
--- /dev/null
+++ b/formatscaper/core/utils.py
@@ -0,0 +1,86 @@
+import pickle
+import sys
+from typing import Dict, List
+
+import yaml
+
+from .models import Format, RecordFile, Result
+
+
+def load_record_files(file_name: str) -> List[RecordFile]:
+    """Load the record files from the file with the given name.
+
+    If ``file_name`` is ``"-"``, then standard in will be read.
+    """
+    record_files = []
+    if file_name == "-":
+        record_files = yaml.safe_load(sys.stdin)
+    else:
+        with open(file_name, "r") as input_file:
+            record_files = yaml.safe_load(input_file)
+
+    return [RecordFile(**rf) for rf in record_files]
+
+
+def load_formats(file_name: str) -> Dict[str, Format]:
+    """Load the known formats from the given YAML file."""
+    formats = {}
+
+    try:
+        with open(file_name, "r") as formats_file:
+            known_formats = yaml.safe_load(formats_file)
+            for f in known_formats:
+                format = Format(**f)
+                formats[format.puid] = format
+
+    except FileNotFoundError:
+        pass
+
+    return formats
+
+
+def store_formats(formats: Dict[str, Format], file_name: str) -> None:
+    """Store the known formats to the given YAML file."""
+    try:
+        updated_formats = [f.as_dict() for f in formats.values()]
+        with open(file_name, "w") as formats_file:
+            yaml.dump(updated_formats, formats_file, sort_keys=False)
+
+    except OSError:
+        print(
+            f"ERROR: couldn't update the formats file ({file_name})",
+            file=sys.stderr,
+        )
+
+
+def store_results(results: List[Result], file_name: str, file_format: str) -> bool:
+    """Store the results in the specified file.
+
+    The ``file_name`` can contain ``"{FORMAT}"``, which will be replaced by the
+    specified ``file_format``.
+    The latter has to be either ``yaml`` or ``pickle``.
+ """ + try: + file_name = file_name.format(FORMAT=file_format) + simple_results = [res.as_dict() for res in results] + file_mode = "w" if file_format == "yaml" else "wb" + with open(file_name, file_mode) as output_file: + if file_format.lower() == "yaml": + yaml.dump(simple_results, output_file, sort_keys=False) + elif file_format.lower() == "pickle": + pickle.dump(simple_results, output_file) + else: + print( + f"WARN: unknown format for results file ({file_format})", + file=sys.stderr, + ) + return False + + return True + + except OSError: + print( + f"WARN: couldn't store the results to file ({file_name})", + file=sys.stderr, + ) + return False diff --git a/formatscaper/formatscaper.py b/formatscaper/formatscaper.py index 43d16fb..17b6d63 100755 --- a/formatscaper/formatscaper.py +++ b/formatscaper/formatscaper.py @@ -1,40 +1,33 @@ #!/bin/env python3 import argparse -import dataclasses import os -import pickle import re import subprocess import sys import threading -import progressbar +import progressbar as pb import yaml - -@dataclasses.dataclass -class Format: - puid: str - name: str - mime: str - endangered: bool - - -@dataclasses.dataclass -class Result: - filename: str - record: str - format: Format - +from core.models import Format, RecordFile, Result +from core.utils import load_formats, load_record_files, store_formats, store_results # set up the argument parser -parser = argparse.ArgumentParser() +parser = argparse.ArgumentParser( + description=( + "Tool for identifying the formats of listed files (and nested ones " + "in case of archives) associated to records uploaded in Invenio." 
+ ), +) parser.add_argument( "--formats", "--f", default="formats.yml", - help="list of known file formats and if they're endangered; this file will be updated (default: formats.yml)", # noqa + help=( + "list of known file formats and if they're endangered; " + "this file will be updated (default: formats.yml)" + ), ) parser.add_argument( "--input", @@ -46,7 +39,10 @@ parser.add_argument( "--output", "-o", default="results.{FORMAT}", - help="file in which to store the identified format for each file (default: results.{FORMAT})", # noqa + help=( + "file in which to store the identified format for each file " + "(default: results.{FORMAT})" + ), ) parser.add_argument( "--output-format", @@ -60,7 +56,10 @@ parser.add_argument( "-p", default=1, type=int, - help="number of siegfried processes to run in parallel; 0 and negative numbers will subtract from the number of CPU cores (default: 1)", # noqa + help=( + "number of siegfried processes to run in parallel; 0 and negative numbers will " + "subtract from the number of CPU cores (default: 1)" + ), ) parser.add_argument( "--sf-binary", @@ -102,30 +101,10 @@ except FileNotFoundError: # parse the list of known formats -formats = {} -try: - with open(args.formats, "r") as formats_file: - known_formats = yaml.safe_load(formats_file) - for f in known_formats: - format = Format( - puid=f["puid"], - name=f["name"], - mime=f["mime"], - endangered=f["endangered"], - ) - formats[format.puid] = format -except FileNotFoundError: - pass - +formats = load_formats(args.formats) # read the list of files to analyze -record_files = [] -if args.input == "-": - record_files = yaml.safe_load(sys.stdin) -else: - with open(args.input, "r") as input_file: - record_files = yaml.safe_load(input_file) - +record_files = load_record_files(args.input) # try to redirect the error logs from siegfried try: @@ -137,7 +116,6 @@ except OSError as e: ) sf_error_log = None - # determine the level of threads to run in parallel # negative numbers mean "this 
much less than the number of CPUs I have", # as long as the result is greater than 0 @@ -146,35 +124,40 @@ if (num_threads := args.parallel) <= 0: if num_cores is None: num_threads = 1 print( - "WARN: couldn't determine number of CPU cores, falling back to a single thread", # noqa + ( + "WARN: couldn't determine number of CPU cores, " + "falling back to a single thread" + ), file=sys.stderr, ) else: num_threads = os.cpu_count() + num_threads if num_threads <= 0: print( - f"ERROR: calculated number of threads would be less than 1: {num_threads}", # noqa + "ERROR: calculated number of threads would be less than 1:", + num_threads, file=sys.stderr, ) sys.exit(1) + +# progressbar curates its own list of ANSI terminals, and doesn't know about foot, +# so we claim to use xterm instead of foot +if os.environ.get("TERM") == "foot": + os.environ["TERM"] = "xterm" + + # set up variables required in the collection of results all_results = [] endangered_files = [] sem = threading.Semaphore(num_threads) mutex = threading.Lock() completed_tasks = 0 -progress_bar = progressbar.ProgressBar( - max_value=len(record_files), - widgets=[ - # fmt: off - progressbar.Percentage(), " (", progressbar.SimpleProgress(), ") ", progressbar.Bar(), " ", progressbar.Timer(), # noqa - # fmt: on - ], -) +pb_ws = [pb.Percentage(), " (", pb.SimpleProgress(), ") ", pb.Bar(), " ", pb.Timer()] +progress_bar = pb.ProgressBar(max_value=len(record_files), widgets=pb_ws) -def process_record_file(record_file): +def process_record_file(record_file: RecordFile) -> None: with sem: sf_output = subprocess.check_output( [ @@ -183,8 +166,8 @@ def process_record_file(record_file): "-multi", "1", "-name", - record_file["filename"], - record_file["uri"], + record_file.filename, + record_file.uri, ], stderr=sf_error_log, ) @@ -199,19 +182,14 @@ def process_record_file(record_file): if not file_info.get("errors", None) and file_info.get("matches", []): for match in file_info["matches"]: if match["ns"] == "pronom": - 
format = Format( - name=match["format"], - puid=match["id"], - mime=match["mime"], - endangered=False, - ) + format = Format.from_sf_dict(match) # the storing of results needs to be mutually exclusive with mutex: format = formats.setdefault(format.puid, format) result = Result( filename=file_info["filename"], - record=record_file["record"], + record=record_file.record, format=format, ) all_results.append(result) @@ -223,7 +201,7 @@ def process_record_file(record_file): global completed_tasks completed_tasks += 1 if not args.no_progressbar: - progress_bar.update(completed_tasks) + progress_bar.update(completed_tasks, force=True) # analyze all the files in parallel, and create the summary after all threads complete @@ -240,30 +218,11 @@ if sf_error_log is not None: sf_error_log.close() if endangered_files: - print(yaml.dump([dataclasses.asdict(f) for f in endangered_files])) + print(yaml.dump([f.as_dict() for f in endangered_files], sort_keys=False)) +# update the file with known file formats +store_formats(formats, args.formats) # store the results to files output_file_name = args.output.format(FORMAT=args.output_format) -try: - simple_results = [dataclasses.asdict(res) for res in all_results] - file_mode = "w" if args.output_format == "yaml" else "wb" - with open(output_file_name, file_mode) as output_file: - if args.output_format == "yaml": - yaml.dump(simple_results, output_file) - elif args.output_format == "pickle": - pickle.dump(simple_results, output_file) - -except OSError: - print( - f"WARN: couldn't store the results to file ({output_file_name})", - file=sys.stderr, - ) - -try: - updated_formats = [dataclasses.asdict(f) for f in formats.values()] - with open(args.formats, "w") as formats_file: - yaml.dump(updated_formats, formats_file) - -except OSError: - print(f"ERROR: couldn't update the formats file ({args.formats})", file=sys.stderr) +store_results(all_results, args.output, args.output_format) -- GitLab