From 35ce3160eab826806f5d5af34b7c2cc62bd7097b Mon Sep 17 00:00:00 2001
From: Maximilian Moser <maximilian.moser@tuwien.ac.at>
Date: Wed, 7 Feb 2024 15:01:21 +0100
Subject: [PATCH] Improve exception handling

* in case the user hits ^C, skip the remaining files and store the results
  so far
---
 formatscaper/formatscaper.py | 95 +++++++++++++++++++-----------------
 1 file changed, 51 insertions(+), 44 deletions(-)

diff --git a/formatscaper/formatscaper.py b/formatscaper/formatscaper.py
index 17b6d63..f82533a 100755
--- a/formatscaper/formatscaper.py
+++ b/formatscaper/formatscaper.py
@@ -159,49 +159,53 @@ progress_bar = pb.ProgressBar(max_value=len(record_files), widgets=pb_ws)
 
 def process_record_file(record_file: RecordFile) -> None:
     with sem:
-        sf_output = subprocess.check_output(
-            [
-                args.sf_binary,
-                "-z",
-                "-multi",
-                "1",
-                "-name",
-                record_file.filename,
-                record_file.uri,
-            ],
-            stderr=sf_error_log,
-        )
+        try:
+            sf_output = subprocess.check_output(
+                [
+                    args.sf_binary,
+                    "-z",
+                    "-multi",
+                    "1",
+                    "-name",
+                    record_file.filename,
+                    record_file.uri,
+                ],
+                stderr=sf_error_log,
+            )
 
-        # skip the sf info part
-        file_infos = yaml.safe_load_all(sf_output)
-        next(file_infos)
-
-        # go through all the files analyzed by siegfried which can be several,
-        # if the original input file was an archive
-        for file_info in file_infos:
-            if not file_info.get("errors", None) and file_info.get("matches", []):
-                for match in file_info["matches"]:
-                    if match["ns"] == "pronom":
-                        format = Format.from_sf_dict(match)
-
-                        # the storing of results needs to be mutually exclusive
-                        with mutex:
-                            format = formats.setdefault(format.puid, format)
-                            result = Result(
-                                filename=file_info["filename"],
-                                record=record_file.record,
-                                format=format,
-                            )
-                            all_results.append(result)
-                            if formats[format.puid].endangered:
-                                endangered_files.append(result)
-
-        # when the task ends, update the progress bar
-        with mutex:
-            global completed_tasks
-            completed_tasks += 1
-            if not args.no_progressbar:
-                progress_bar.update(completed_tasks, force=True)
+            # skip the sf info part
+            file_infos = yaml.safe_load_all(sf_output)
+            next(file_infos)
+
+            # go through all the files analyzed by siegfried which can be several,
+            # if the original input file was an archive
+            for file_info in file_infos:
+                if not file_info.get("errors", None) and file_info.get("matches", []):
+                    for match in file_info["matches"]:
+                        if match["ns"] == "pronom":
+                            format = Format.from_sf_dict(match)
+
+                            # the storing of results needs to be mutually exclusive
+                            with mutex:
+                                format = formats.setdefault(format.puid, format)
+                                result = Result(
+                                    filename=file_info["filename"],
+                                    record=record_file.record,
+                                    format=format,
+                                )
+                                all_results.append(result)
+                                if formats[format.puid].endangered:
+                                    endangered_files.append(result)
+
+            # when the task ends, update the progress bar
+            with mutex:
+                global completed_tasks
+                completed_tasks += 1
+                if not args.no_progressbar:
+                    progress_bar.update(completed_tasks, force=True)
+
+        except subprocess.CalledProcessError as e:
+            print("WARN: error during sf execution:", str(e), file=sys.stderr)
 
 
 # analyze all the files in parallel, and create the summary after all threads complete
@@ -211,8 +215,11 @@ for record_file in record_files or []:
     threads.append(thread)
     thread.start()
 
-for thread in threads:
-    thread.join()
+try:
+    for thread in threads:
+        thread.join()
+except KeyboardInterrupt:
+    pass
 
 if sf_error_log is not None:
     sf_error_log.close()
-- 
GitLab