From 5472dc2c4297b213c3acdfb3582b5caed17d9dcc Mon Sep 17 00:00:00 2001
From: Maximilian Moser <maximilian.moser@tuwien.ac.at>
Date: Fri, 19 Jan 2024 15:20:29 +0100
Subject: [PATCH] Add pickle as an output file format for formatscaper

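* Also make pickle the default output format, because pickle is much
  faster and its output is smaller than YAML's
* The default output file name is now "results.{FORMAT}"; the "{FORMAT}"
  placeholder in the output file name is replaced with the selected
  output format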
---
 formatscaper/formatscaper.py | 27 ++++++++++++++++++++++-----
 1 file changed, 22 insertions(+), 5 deletions(-)

diff --git a/formatscaper/formatscaper.py b/formatscaper/formatscaper.py
index 0895b1f..9db9f8e 100755
--- a/formatscaper/formatscaper.py
+++ b/formatscaper/formatscaper.py
@@ -2,6 +2,7 @@
 
 import argparse
 import dataclasses
+import pickle
 import re
 import subprocess
 import sys
@@ -43,8 +44,15 @@ parser.add_argument(
 parser.add_argument(
     "--output",
     "-o",
-    default="results.yml",
-    help="file in which to store the identified format for each file (default: results.yml)",  # noqa
+    default="results.{FORMAT}",
+    help="file in which to store the identified format for each file (default: results.{FORMAT})",  # noqa
+)
+parser.add_argument(
+    "--output-format",
+    "-F",
+    default="pickle",
+    choices=["pickle", "yaml"],
+    help="format of the results (default: pickle)",
 )
 parser.add_argument(
     "--parallel",
@@ -215,12 +223,21 @@ if endangered_files:
 
 
 # store the results to files
+output_file_name = args.output.format(FORMAT=args.output_format)
 try:
-    with open(args.output, "w") as output_file:
-        yaml.dump([dataclasses.asdict(res) for res in all_results], output_file)
+    simple_results = [dataclasses.asdict(res) for res in all_results]
+    file_mode = "w" if args.output_format == "yaml" else "wb"
+    with open(output_file_name, file_mode) as output_file:
+        if args.output_format == "yaml":
+            yaml.dump(simple_results, output_file)
+        elif args.output_format == "pickle":
+            pickle.dump(simple_results, output_file)
 
 except OSError:
-    print(f"WARN: couldn't store the results ({args.output})", file=sys.stderr)
+    print(
+        f"WARN: couldn't store the results to file ({output_file_name})",
+        file=sys.stderr,
+    )
 
 try:
     updated_formats = [dataclasses.asdict(f) for f in formats.values()]
-- 
GitLab