From 943e34256a0e6b3c7cc2362e84f5e371ed5a2911 Mon Sep 17 00:00:00 2001
From: Maximilian Moser <maximilian.moser@tuwien.ac.at>
Date: Tue, 6 Feb 2024 17:05:59 +0100
Subject: [PATCH] Refactor formatscaper

* split the code into separate reusable modules
* add config files for linters & formatters
* force the progress bar to update after every sf run
* mask foot terminal as xterm for nicer progressbar output
---
 formatscaper/.flake8          |   3 +
 formatscaper/.isort.cfg       |   2 +
 formatscaper/core/__init__.py |   7 ++
 formatscaper/core/models.py   |  55 ++++++++++++++
 formatscaper/core/utils.py    |  86 +++++++++++++++++++++
 formatscaper/formatscaper.py  | 137 ++++++++++++----------------------
 6 files changed, 201 insertions(+), 89 deletions(-)
 create mode 100644 formatscaper/.flake8
 create mode 100644 formatscaper/.isort.cfg
 create mode 100644 formatscaper/core/__init__.py
 create mode 100644 formatscaper/core/models.py
 create mode 100644 formatscaper/core/utils.py

diff --git a/formatscaper/.flake8 b/formatscaper/.flake8
new file mode 100644
index 0000000..f295e07
--- /dev/null
+++ b/formatscaper/.flake8
@@ -0,0 +1,3 @@
+[flake8]
+max-line-length = 88
+extend-ignore = E203, E704
diff --git a/formatscaper/.isort.cfg b/formatscaper/.isort.cfg
new file mode 100644
index 0000000..f238bf7
--- /dev/null
+++ b/formatscaper/.isort.cfg
@@ -0,0 +1,2 @@
+[settings]
+profile = black
diff --git a/formatscaper/core/__init__.py b/formatscaper/core/__init__.py
new file mode 100644
index 0000000..a570ab6
--- /dev/null
+++ b/formatscaper/core/__init__.py
@@ -0,0 +1,7 @@
+from .models import Format, RecordFile, Result
+
+__all__ = (
+    "Format",
+    "RecordFile",
+    "Result",
+)
diff --git a/formatscaper/core/models.py b/formatscaper/core/models.py
new file mode 100644
index 0000000..85d68d3
--- /dev/null
+++ b/formatscaper/core/models.py
@@ -0,0 +1,55 @@
+import dataclasses
+
+
+@dataclasses.dataclass
+class RecordFile:
+    """Information about a file to be investigated."""
+
+    filename: str
+    uri: str
+    record: str
+
+
+@dataclasses.dataclass
+class Format:
+    """Information about a file format."""
+
+    puid: str
+    name: str
+    mime: str
+    endangered: bool = False
+
+    def as_dict(self):
+        """Dump the data as dictionary."""
+        return {
+            "puid": self.puid,
+            "name": self.name,
+            "mime": self.mime,
+            "endangered": self.endangered,
+        }
+
+    @classmethod
+    def from_sf_dict(cls, dictionary):
+        """Parse the format from siegfried's output."""
+        return cls(
+            puid=dictionary["id"],
+            name=dictionary["format"],
+            mime=dictionary["mime"],
+        )
+
+
+@dataclasses.dataclass
+class Result:
+    """The format identification result for a given file."""
+
+    filename: str
+    record: str
+    format: Format
+
+    def as_dict(self):
+        """Dump the data as dictionary."""
+        return {
+            "filename": self.filename,
+            "record": self.record,
+            "format": self.format.as_dict(),
+        }
diff --git a/formatscaper/core/utils.py b/formatscaper/core/utils.py
new file mode 100644
index 0000000..1aa3139
--- /dev/null
+++ b/formatscaper/core/utils.py
@@ -0,0 +1,86 @@
+import pickle
+import sys
+from typing import Dict, List
+
+import yaml
+
+from .models import Format, RecordFile, Result
+
+
+def load_record_files(file_name: str) -> List[RecordFile]:
+    """Load the record files from the file with the given name.
+
+    If ``file_name`` is ``"-"``, then standard in will be read.
+    """
+    record_files = []
+    if file_name == "-":
+        record_files = yaml.safe_load(sys.stdin)
+    else:
+        with open(file_name, "r") as input_file:
+            record_files = yaml.safe_load(input_file)
+
+    return [RecordFile(**rf) for rf in record_files]
+
+
+def load_formats(file_name: str) -> Dict[str, Format]:
+    """Load the known formats from the given YAML file."""
+    formats = {}
+
+    try:
+        with open(file_name, "r") as formats_file:
+            known_formats = yaml.safe_load(formats_file)
+            for f in known_formats:
+                format = Format(**f)
+                formats[format.puid] = format
+
+    except FileNotFoundError:
+        pass
+
+    return formats
+
+
+def store_formats(formats: Dict[str, Format], file_name: str) -> None:
+    """Store the known formats to the given YAML file."""
+    try:
+        updated_formats = [f.as_dict() for f in formats.values()]
+        with open(file_name, "w") as formats_file:
+            yaml.dump(updated_formats, formats_file, sort_keys=False)
+
+    except OSError:
+        print(
+            f"ERROR: couldn't update the formats file ({file_name})",
+            file=sys.stderr,
+        )
+
+
+def store_results(results: List[Result], file_name: str, file_format: str) -> bool:
+    """Store the results in the specified file.
+
+    The ``file_name`` can contain ``"{FORMAT}"``, which will be replaced by the
+    specified ``file_format``.
+    The latter has to be either ``yaml`` or ``pickle``.
+ """ + try: + file_name = file_name.format(FORMAT=file_format) + simple_results = [res.as_dict() for res in results] + file_mode = "w" if file_format == "yaml" else "wb" + with open(file_name, file_mode) as output_file: + if file_format.lower() == "yaml": + yaml.dump(simple_results, output_file, sort_keys=False) + elif file_format.lower() == "pickle": + pickle.dump(simple_results, output_file) + else: + print( + f"WARN: unknown format for results file ({file_format})", + file=sys.stderr, + ) + return False + + return True + + except OSError: + print( + f"WARN: couldn't store the results to file ({file_name})", + file=sys.stderr, + ) + return False diff --git a/formatscaper/formatscaper.py b/formatscaper/formatscaper.py index 43d16fb..17b6d63 100755 --- a/formatscaper/formatscaper.py +++ b/formatscaper/formatscaper.py @@ -1,40 +1,33 @@ #!/bin/env python3 import argparse -import dataclasses import os -import pickle import re import subprocess import sys import threading -import progressbar +import progressbar as pb import yaml - -@dataclasses.dataclass -class Format: - puid: str - name: str - mime: str - endangered: bool - - -@dataclasses.dataclass -class Result: - filename: str - record: str - format: Format - +from core.models import Format, RecordFile, Result +from core.utils import load_formats, load_record_files, store_formats, store_results # set up the argument parser -parser = argparse.ArgumentParser() +parser = argparse.ArgumentParser( + description=( + "Tool for identifying the formats of listed files (and nested ones " + "in case of archives) associated to records uploaded in Invenio." 
+ ), +) parser.add_argument( "--formats", "--f", default="formats.yml", - help="list of known file formats and if they're endangered; this file will be updated (default: formats.yml)", # noqa + help=( + "list of known file formats and if they're endangered; " + "this file will be updated (default: formats.yml)" + ), ) parser.add_argument( "--input", @@ -46,7 +39,10 @@ parser.add_argument( "--output", "-o", default="results.{FORMAT}", - help="file in which to store the identified format for each file (default: results.{FORMAT})", # noqa + help=( + "file in which to store the identified format for each file " + "(default: results.{FORMAT})" + ), ) parser.add_argument( "--output-format", @@ -60,7 +56,10 @@ parser.add_argument( "-p", default=1, type=int, - help="number of siegfried processes to run in parallel; 0 and negative numbers will subtract from the number of CPU cores (default: 1)", # noqa + help=( + "number of siegfried processes to run in parallel; 0 and negative numbers will " + "subtract from the number of CPU cores (default: 1)" + ), ) parser.add_argument( "--sf-binary", @@ -102,30 +101,10 @@ except FileNotFoundError: # parse the list of known formats -formats = {} -try: - with open(args.formats, "r") as formats_file: - known_formats = yaml.safe_load(formats_file) - for f in known_formats: - format = Format( - puid=f["puid"], - name=f["name"], - mime=f["mime"], - endangered=f["endangered"], - ) - formats[format.puid] = format -except FileNotFoundError: - pass - +formats = load_formats(args.formats) # read the list of files to analyze -record_files = [] -if args.input == "-": - record_files = yaml.safe_load(sys.stdin) -else: - with open(args.input, "r") as input_file: - record_files = yaml.safe_load(input_file) - +record_files = load_record_files(args.input) # try to redirect the error logs from siegfried try: @@ -137,7 +116,6 @@ except OSError as e: ) sf_error_log = None - # determine the level of threads to run in parallel # negative numbers mean "this 
much less than the number of CPUs I have", # as long as the result is greater than 0 @@ -146,35 +124,40 @@ if (num_threads := args.parallel) <= 0: if num_cores is None: num_threads = 1 print( - "WARN: couldn't determine number of CPU cores, falling back to a single thread", # noqa + ( + "WARN: couldn't determine number of CPU cores, " + "falling back to a single thread" + ), file=sys.stderr, ) else: num_threads = os.cpu_count() + num_threads if num_threads <= 0: print( - f"ERROR: calculated number of threads would be less than 1: {num_threads}", # noqa + "ERROR: calculated number of threads would be less than 1:", + num_threads, file=sys.stderr, ) sys.exit(1) + +# progressbar curates its own list of ANSI terminals, and doesn't know about foot, +# so we claim to use xterm instead of foot +if os.environ.get("TERM") == "foot": + os.environ["TERM"] = "xterm" + + # set up variables required in the collection of results all_results = [] endangered_files = [] sem = threading.Semaphore(num_threads) mutex = threading.Lock() completed_tasks = 0 -progress_bar = progressbar.ProgressBar( - max_value=len(record_files), - widgets=[ - # fmt: off - progressbar.Percentage(), " (", progressbar.SimpleProgress(), ") ", progressbar.Bar(), " ", progressbar.Timer(), # noqa - # fmt: on - ], -) +pb_ws = [pb.Percentage(), " (", pb.SimpleProgress(), ") ", pb.Bar(), " ", pb.Timer()] +progress_bar = pb.ProgressBar(max_value=len(record_files), widgets=pb_ws) -def process_record_file(record_file): +def process_record_file(record_file: RecordFile) -> None: with sem: sf_output = subprocess.check_output( [ @@ -183,8 +166,8 @@ def process_record_file(record_file): "-multi", "1", "-name", - record_file["filename"], - record_file["uri"], + record_file.filename, + record_file.uri, ], stderr=sf_error_log, ) @@ -199,19 +182,14 @@ def process_record_file(record_file): if not file_info.get("errors", None) and file_info.get("matches", []): for match in file_info["matches"]: if match["ns"] == "pronom": - 
format = Format( - name=match["format"], - puid=match["id"], - mime=match["mime"], - endangered=False, - ) + format = Format.from_sf_dict(match) # the storing of results needs to be mutually exclusive with mutex: format = formats.setdefault(format.puid, format) result = Result( filename=file_info["filename"], - record=record_file["record"], + record=record_file.record, format=format, ) all_results.append(result) @@ -223,7 +201,7 @@ def process_record_file(record_file): global completed_tasks completed_tasks += 1 if not args.no_progressbar: - progress_bar.update(completed_tasks) + progress_bar.update(completed_tasks, force=True) # analyze all the files in parallel, and create the summary after all threads complete @@ -240,30 +218,11 @@ if sf_error_log is not None: sf_error_log.close() if endangered_files: - print(yaml.dump([dataclasses.asdict(f) for f in endangered_files])) + print(yaml.dump([f.as_dict() for f in endangered_files], sort_keys=False)) +# update the file with known file formats +store_formats(formats, args.formats) # store the results to files output_file_name = args.output.format(FORMAT=args.output_format) -try: - simple_results = [dataclasses.asdict(res) for res in all_results] - file_mode = "w" if args.output_format == "yaml" else "wb" - with open(output_file_name, file_mode) as output_file: - if args.output_format == "yaml": - yaml.dump(simple_results, output_file) - elif args.output_format == "pickle": - pickle.dump(simple_results, output_file) - -except OSError: - print( - f"WARN: couldn't store the results to file ({output_file_name})", - file=sys.stderr, - ) - -try: - updated_formats = [dataclasses.asdict(f) for f in formats.values()] - with open(args.formats, "w") as formats_file: - yaml.dump(updated_formats, formats_file) - -except OSError: - print(f"ERROR: couldn't update the formats file ({args.formats})", file=sys.stderr) +store_results(all_results, args.output, args.output_format) -- GitLab