From 35ce3160eab826806f5d5af34b7c2cc62bd7097b Mon Sep 17 00:00:00 2001
From: Maximilian Moser <maximilian.moser@tuwien.ac.at>
Date: Wed, 7 Feb 2024 15:01:21 +0100
Subject: [PATCH] Improve exception handling

* If the user hits ^C, stop waiting for the remaining files and store
  the results collected so far
* Catch errors from the sf call and report them as warnings on stderr
---
 formatscaper/formatscaper.py | 95 +++++++++++++++++++-----------------
 1 file changed, 51 insertions(+), 44 deletions(-)

diff --git a/formatscaper/formatscaper.py b/formatscaper/formatscaper.py
index 17b6d63..f82533a 100755
--- a/formatscaper/formatscaper.py
+++ b/formatscaper/formatscaper.py
@@ -159,49 +159,53 @@ progress_bar = pb.ProgressBar(max_value=len(record_files), widgets=pb_ws)
 
 def process_record_file(record_file: RecordFile) -> None:
     with sem:
-        sf_output = subprocess.check_output(
-            [
-                args.sf_binary,
-                "-z",
-                "-multi",
-                "1",
-                "-name",
-                record_file.filename,
-                record_file.uri,
-            ],
-            stderr=sf_error_log,
-        )
+        try:
+            sf_output = subprocess.check_output(
+                [
+                    args.sf_binary,
+                    "-z",
+                    "-multi",
+                    "1",
+                    "-name",
+                    record_file.filename,
+                    record_file.uri,
+                ],
+                stderr=sf_error_log,
+            )
 
-        # skip the sf info part
-        file_infos = yaml.safe_load_all(sf_output)
-        next(file_infos)
-
-        # go through all the files analyzed by siegfried which can be several,
-        # if the original input file was an archive
-        for file_info in file_infos:
-            if not file_info.get("errors", None) and file_info.get("matches", []):
-                for match in file_info["matches"]:
-                    if match["ns"] == "pronom":
-                        format = Format.from_sf_dict(match)
-
-                        # the storing of results needs to be mutually exclusive
-                        with mutex:
-                            format = formats.setdefault(format.puid, format)
-                            result = Result(
-                                filename=file_info["filename"],
-                                record=record_file.record,
-                                format=format,
-                            )
-                            all_results.append(result)
-                            if formats[format.puid].endangered:
-                                endangered_files.append(result)
-
-        # when the task ends, update the progress bar
-        with mutex:
-            global completed_tasks
-            completed_tasks += 1
-            if not args.no_progressbar:
-                progress_bar.update(completed_tasks, force=True)
+            # skip the sf info part
+            file_infos = yaml.safe_load_all(sf_output)
+            next(file_infos)
+
+            # go through all the files analyzed by siegfried which can be several,
+            # if the original input file was an archive
+            for file_info in file_infos:
+                if not file_info.get("errors", None) and file_info.get("matches", []):
+                    for match in file_info["matches"]:
+                        if match["ns"] == "pronom":
+                            format = Format.from_sf_dict(match)
+
+                            # the storing of results needs to be mutually exclusive
+                            with mutex:
+                                format = formats.setdefault(format.puid, format)
+                                result = Result(
+                                    filename=file_info["filename"],
+                                    record=record_file.record,
+                                    format=format,
+                                )
+                                all_results.append(result)
+                                if formats[format.puid].endangered:
+                                    endangered_files.append(result)
+
+        except subprocess.CalledProcessError as e:
+            print("WARN: error during sf execution:", str(e), file=sys.stderr)
+        finally:
+            # count the task as done even if sf failed, so the progress bar can finish
+            with mutex:
+                global completed_tasks
+                completed_tasks += 1
+                if not args.no_progressbar:
+                    progress_bar.update(completed_tasks, force=True)
 
 
 # analyze all the files in parallel, and create the summary after all threads complete
@@ -211,8 +215,11 @@ for record_file in record_files or []:
     threads.append(thread)
     thread.start()
 
-for thread in threads:
-    thread.join()
+try:
+    for thread in threads:
+        thread.join()
+except KeyboardInterrupt:
+    pass  # stop waiting on ^C; continue with the results gathered so far
 
 if sf_error_log is not None:
     sf_error_log.close()
-- 
GitLab