diff --git a/formatscaper/.gitignore b/formatscaper/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..7ea56fcdb4774aea15017e4033203998af6deefd --- /dev/null +++ b/formatscaper/.gitignore @@ -0,0 +1,3 @@ +formats.yml +results.yml +sf.log diff --git a/formatscaper/Pipfile b/formatscaper/Pipfile new file mode 100644 index 0000000000000000000000000000000000000000..af383c7724a4ebb64185206477d01dafdd4234c9 --- /dev/null +++ b/formatscaper/Pipfile @@ -0,0 +1,13 @@ +[[source]] +url = "https://pypi.org/simple" +verify_ssl = true +name = "pypi" + +[packages] +pyyaml = "*" +progressbar2 = "*" + +[dev-packages] + +[requires] +python_version = "3.9" diff --git a/formatscaper/Pipfile.lock b/formatscaper/Pipfile.lock new file mode 100644 index 0000000000000000000000000000000000000000..a8cc6aec2ebda9ee7790adc8c9bfa7a90d60ee5e --- /dev/null +++ b/formatscaper/Pipfile.lock @@ -0,0 +1,103 @@ +{ + "_meta": { + "hash": { + "sha256": "f17599e0484b4226bebde0b6b444e0c48dcac21c3258f6354ef585800fdd2ab4" + }, + "pipfile-spec": 6, + "requires": { + "python_version": "3.9" + }, + "sources": [ + { + "name": "pypi", + "url": "https://pypi.org/simple", + "verify_ssl": true + } + ] + }, + "default": { + "progressbar2": { + "hashes": [ + "sha256:036fa3bd35ae27c92e73fce4fb18aa4ba5090a1880d880cf954ecb75ccd6f3fb", + "sha256:c37e6e1b4e57ab43f95c3d0e8d90061bec140e4fed56b8343183db3aa1e19a52" + ], + "index": "pypi", + "markers": "python_version >= '3.8'", + "version": "==4.3.2" + }, + "python-utils": { + "hashes": [ + "sha256:ec3a672465efb6c673845a43afcfafaa23d2594c24324a40ec18a0c59478dc0b", + "sha256:efdf31c8154667d7dc0317547c8e6d3b506c5d4b6e360e0c89662306262fc0ab" + ], + "markers": "python_version >= '3.9'", + "version": "==3.8.1" + }, + "pyyaml": { + "hashes": [ + "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5", + "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc", + "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df", + "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741", + "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206", + "sha256:18aeb1bf9a78867dc38b259769503436b7c72f7a1f1f4c93ff9a17de54319b27", + "sha256:1d4c7e777c441b20e32f52bd377e0c409713e8bb1386e1099c2415f26e479595", + "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62", + "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98", + "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696", + "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290", + "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9", + "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d", + "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6", + "sha256:4fb147e7a67ef577a588a0e2c17b6db51dda102c71de36f8549b6816a96e1867", + "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47", + "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486", + "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6", + "sha256:596106435fa6ad000c2991a98fa58eeb8656ef2325d7e158344fb33864ed87e3", + "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007", + "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938", + "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0", + 
"sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c", + "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735", + "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d", + "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28", + "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4", + "sha256:9046c58c4395dff28dd494285c82ba00b546adfc7ef001486fbf0324bc174fba", + "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8", + "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5", + "sha256:afd7e57eddb1a54f0f1a974bc4391af8bcce0b444685d936840f125cf046d5bd", + "sha256:b1275ad35a5d18c62a7220633c913e1b42d44b46ee12554e5fd39c70a243d6a3", + "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0", + "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515", + "sha256:baa90d3f661d43131ca170712d903e6295d1f7a0f595074f151c0aed377c9b9c", + "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c", + "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924", + "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34", + "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43", + "sha256:c8098ddcc2a85b61647b2590f825f3db38891662cfc2fc776415143f599bb859", + "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673", + "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54", + "sha256:d858aa552c999bc8a8d57426ed01e40bef403cd8ccdd0fc5f6f04a00414cac2a", + "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b", + "sha256:f003ed9ad21d6a4713f0a9b5a7a0a79e08dd0f221aff4525a2be4c346ee60aab", + "sha256:f22ac1c3cac4dbc50079e965eba2c1058622631e526bd9afd45fedd49ba781fa", + "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c", + "sha256:fca0e3a251908a499833aa292323f32437106001d436eca0e6e7833256674585", + "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d", + "sha256:fd66fc5d0da6d9815ba2cebeb4205f95818ff4b79c3ebe268e75d961704af52f" + ], + "index": "pypi", + "markers": "python_version >= '3.6'", + "version": "==6.0.1" + }, + "typing-extensions": { + "hashes": [ + "sha256:23478f88c37f27d76ac8aee6c905017a143b0b1b886c3c9f66bc2fd94f9f5783", + "sha256:af72aea155e91adfc61c3ae9e0e342dbc0cba726d6cba4b6c72c1f34e47291cd" + ], + "markers": "python_version >= '3.8'", + "version": "==4.9.0" + } + }, + "develop": {} +} diff --git a/formatscaper/README.md b/formatscaper/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d5843151f93923a8ab6f369e6dcd5565fe4c0df0 --- /dev/null +++ b/formatscaper/README.md @@ -0,0 +1,197 @@ +# Formatscaper + +Formatscaper is a tool for generating an overview of the file format landscape composed of the uploads in our research data repository. + +The aim here is to assist us with the task of digital preservation; specifically with the task of identifying uploaded files which are in danger of becoming "extinct". + + +## Dependencies + +* Python 3 +* [Siegfried](https://github.com/richardlehane/siegfried#install) + + +## Usage + +Formatscaper is designed to build up the required context over its lifetime of use, so no particular setup is required. +To get started, simply feed it a list of files to analyze in the [expected format](#input) and let it run. 
+
+The most relevant file is [`formats.yml`](#formats-file), which is intended to receive limited manual tweaks over time (basically just setting the `endangered` flag for outdated formats).
+Every time formatscaper is run, it will update this file with formats that haven't been listed before.
+This means that the knowledge base about file formats gets extended over time.
+
+By default, formatscaper will create a summary of the endangered files it encountered and print it to standard output.
+A more comprehensive summary of all encountered formats will be stored in a [results file](#results).
+
+
+Example call, with a custom path for the `sf` binary:
+```sh
+pipenv run ./formatscaper.py --sf-binary "${GOPATH}/bin/sf" -i record-files.yaml
+```
+
+
+## Rationale
+
+### Building file format landscapes
+
+Tools for creating an overview of file format landscapes already exist, like [`c3po`](https://peshkira.github.io/c3po/) and [`fitsinn`](https://github.com/datascience/fitsinn).
+However, they tend to come with more bells and whistles than we actually require, or are not quite ready for production use yet.
+
+
+### Using a single source of truth
+
+Often, file format identification is based on the output of several utilities.
+For instance, [FITS](https://projects.iq.harvard.edu/fits/home) wraps a number of individual tools and combines their output into a unified XML structure.
+Unfortunately, their results are often in disagreement with each other, which necessitates a de-conflicting strategy.
+
+Instead of following this multi-tool approach, we've decided to rely on a single source of truth only - namely a tool called [Siegfried](https://github.com/richardlehane/siegfried).
+It seems to be competitive in its file format identification capabilities (under some definition of "competitive").
+Also, it is being used as the source of truth by other software solutions in the space of digital preservation, such as [Archivematica](https://github.com/artefactual/archivematica) and [RODA](https://github.com/keeps/roda).
+That's certainly good enough for us.
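+
+For reference, formatscaper only consumes a small subset of Siegfried's YAML report; per analyzed file, this boils down to roughly the following fields (abridged sketch, with illustrative values taken from the examples below):
+```yml
+---
+filename: "/mnt/data/de/ad/be/ef/data"
+errors: ""
+matches:
+  - ns: "pronom"
+    id: "x-fmt/263"
+    format: "ZIP Format"
+    mime: "application/zip"
+```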
+
+
+### Detecting endangered formats
+
+It seems to be generally agreed upon that a centrally managed "list of endangered file formats" would be a desirable thing to have.
+There have been attempts at creating centralized registries for file formats; for example [PRONOM](https://www.nationalarchives.gov.uk/PRONOM/), GDFR and UDFR, and the ["Just Solve the File Format Problem" wiki](http://fileformats.archiveteam.org/).
+Out of these, PRONOM is the most promising candidate.
+It even offers a field for the "risk" per format - however, this field does not seem to be populated for any of the registered formats.
+Unfortunately, this lack of available information leaves us little choice but to do it ourselves.
+
+There is no way for us to know all the formats that exist in the world - but luckily, this is also not required!
+We only have to know about the formats that are part of our format landscape, which is a much more manageable task.
+Thus, we use a "local" list of file formats which is extended every time a new format is encountered.
+We manually review this list periodically and annotate formats with a hint about their endangerment status.
+
+
+### Storing the information outside of Invenio
+
+Invenio provides fields for storing information about the format of each file.
+However, we often receive archives such as ZIP files, which of course contain a number of other files.
+Storing information about the formats of these nested files is not a standard use case, and thus there is no obvious or generally agreed-upon way to do it (yet).
+
+Because we try not to extend the semantics of available constructs with non-standard custom meaning, we instead decided to keep this information external.
+
+
+## Example files
+
+### Input
+
+The input for formatscaper needs to be a list of objects (in YAML format) describing the context of each file to investigate.
+This includes the URI of the file, its original file name (which gets discarded by Invenio), and the record which the file is a part of.
+```yml
+- filename: hosts
+  uri: /etc/hosts
+  record: 1234-abcd
+
+- filename: FS Table
+  uri: /etc/fstab
+  record: abcd-1234
+
+- filename: researchdata.zip
+  uri: /mnt/data/de/ad/be/ef/data
+  record: abcd-1234
+```
+
+
+### Formats file
+
+The formats file (e.g. `formats.yml`) contains information about previously encountered file formats and their endangerment status.
+Some context (like a human-readable name and MIME type) per format is also provided here, primarily to make it more understandable for operators.
+```yml
+- endangered: false
+  mime: text/plain
+  name: Plain Text File
+  puid: x-fmt/111
+- endangered: false
+  mime: application/zip
+  name: ZIP Format
+  puid: x-fmt/263
+- endangered: true
+  mime: null
+  name: Adobe Illustrator CC 2020 Artwork
+  puid: fmt/1864
+- endangered: false
+  mime: application/postscript
+  name: Encapsulated PostScript File Format
+  puid: fmt/124
+- endangered: false
+  mime: application/pdf
+  name: Acrobat PDF 1.4 - Portable Document Format
+  puid: fmt/18
+- endangered: false
+  mime: image/svg+xml
+  name: Scalable Vector Graphics
+  puid: fmt/92
+- endangered: true
+  mime: null
+  name: null
+  puid: UNKNOWN
+```
+
+The "primary key" for file formats is the PRONOM Persistent Unique Identifier (PUID).
+
+Note that formatscaper bases its detection of format endangerment purely on this file.
+Whenever a new (previously unlisted) format is encountered, it will be added to this list.
+The file is rewritten every time formatscaper is run, so extra information like comments will be discarded.
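+
+Flagging a format as endangered only requires flipping its `endangered` value; this can be done in any editor, or with the same `yq` tool used for [filtering results](#filtering-results) below - something like this should work:
+```sh
+yq -i '(.[] | select(.puid == "<PUID>")).endangered = true' formats.yml
+```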
+
+
+### Results
+
+The results file (e.g. `results.yml`) contains information about each investigated file and its identified format, along with a note about its endangerment status.
+```yml
+- filename: /etc/hosts
+  format:
+    endangered: false
+    mime: text/plain
+    name: Plain Text File
+    puid: x-fmt/111
+  record: 1234-abcd
+- filename: /etc/environment
+  format:
+    endangered: false
+    mime: text/plain
+    name: Plain Text File
+    puid: x-fmt/111
+  record: 1234-abcd
+- filename: /mnt/data/de/ad/be/ef/data
+  format:
+    endangered: false
+    mime: application/zip
+    name: ZIP Format
+    puid: x-fmt/263
+  record: abcd-1234
+- filename: /mnt/data/de/ad/be/ef/data#README.txt
+  format:
+    endangered: false
+    mime: text/plain
+    name: Plain Text File
+    puid: x-fmt/111
+  record: abcd-1234
+- filename: /mnt/data/de/ad/be/ef/data#results.csv
+  format:
+    endangered: false
+    mime: text/csv
+    name: Comma Separated Values
+    puid: x-fmt/18
+  record: abcd-1234
+```
+
+Note that the contents of the ZIP archive are inspected as well, with `#` as the delimiter between the archive's filename and the contained file's name.
+
+
+## Filtering results
+
+To filter results in the shell, you can use [`yq`](https://github.com/mikefarah/yq) (a command-line YAML processor with [`jq`](https://github.com/jqlang/jq)-like syntax).
+
+Show only the endangered files:
+```sh
+yq 'map(select(.format.endangered == true))' results.yml
+```
+
+Show only the results for a given record:
+```sh
+yq 'map(select(.record == "<RECORD-ID>"))' results.yml
+```
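+
+The same pattern can be combined with other `yq` operators; for example, to count the endangered files:
+```sh
+yq 'map(select(.format.endangered == true)) | length' results.yml
+```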
diff --git a/formatscaper/formatscaper.py b/formatscaper/formatscaper.py
new file mode 100755
index 0000000000000000000000000000000000000000..c492e1166217dcb9c3728534f2222f31f4430b51
--- /dev/null
+++ b/formatscaper/formatscaper.py
@@ -0,0 +1,198 @@
+#!/usr/bin/env python3
+
+import argparse
+import dataclasses
+import re
+import subprocess
+import sys
+
+import progressbar
+import yaml
+
+
+@dataclasses.dataclass
+class Format:
+    puid: str
+    name: str
+    mime: str
+    endangered: bool
+
+
+@dataclasses.dataclass
+class Result:
+    filename: str
+    record: str
+    format: Format
+
+
+# set up the argument parser
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    "--formats",
+    "-f",
+    default="formats.yml",
+    help="list of known file formats and whether they're endangered; this file will be updated (default: formats.yml)",  # noqa
+)
+parser.add_argument(
+    "--input",
+    "-i",
+    default="-",
+    help="input file listing the files of each record to check (default: stdin)",
+)
+parser.add_argument(
+    "--output",
+    "-o",
+    default="results.yml",
+    help="file in which to store the identified format for each file (default: results.yml)",  # noqa
+)
+parser.add_argument(
+    "--sf-binary",
+    default="sf",
+    help="name of the siegfried binary to call (default: sf)",
+)
+parser.add_argument(
+    "--sf-parallel",
+    default=1,
+    type=int,
+    help="number of parallel processes used by sf (default: 1)",
+)
+parser.add_argument(
+    "--sf-error-log",
+    default="sf.log",
+    help="file in which to store sf error logs (default: sf.log)",
+)
+parser.add_argument(
+    "--no-progressbar",
+    "-P",
+    default=False,
+    action="store_true",
+    help="disable the progress bar",
+)
+args = parser.parse_args()
+
+
+# check the siegfried binary
+try:
+    sf_output = subprocess.check_output([args.sf_binary, "-v"], text=True)
+    m = re.match(r"siegfried ((\d+\.?)+)", sf_output)
+    if m and m.group(1):
+        ver_nums = [int(num) for num in m.group(1).split(".")]
+        # warn if the siegfried version is older than 1.10
+        if ver_nums[:2] < [1, 10]:
+            print(f"WARN: siegfried version too old ({m.group(1)})", file=sys.stderr)
+    else:
+        print("ERROR: siegfried version could not be determined", file=sys.stderr)
+        sys.exit(1)
+except FileNotFoundError:
+    print(
+        f"ERROR: siegfried binary could not be found ({args.sf_binary})",
+        file=sys.stderr,
+    )
+    sys.exit(1)
+
+
+# parse the list of known formats
+formats = {}
+try:
+    with open(args.formats, "r") as formats_file:
+        known_formats = yaml.safe_load(formats_file)
+        for f in known_formats:
+            format = Format(
+                puid=f["puid"],
+                name=f["name"],
+                mime=f["mime"],
+                endangered=f["endangered"],
+            )
+            formats[format.puid] = format
+except FileNotFoundError:
+    pass
+
+
+# read the list of files to analyze
+record_files = []
+if args.input == "-":
+    record_files = yaml.safe_load(sys.stdin)
+else:
+    with open(args.input, "r") as input_file:
+        record_files = yaml.safe_load(input_file)
+
+
+# try to redirect the error logs from siegfried
+try:
+    sf_error_log = open(args.sf_error_log, "w")
+except OSError as e:
+    print(
+        f"WARN: couldn't open sf log file, printing to stderr instead ({e})",
+        file=sys.stderr,
+    )
+    sf_error_log = None
+
+
+# analyze each file listed in the record files
+all_results = []
+endangered_files = []
+if not args.no_progressbar:
+    record_files = progressbar.progressbar(record_files or [])
+
+for record_file in record_files or []:
+    sf_output = subprocess.check_output(
+        [
+            args.sf_binary,
+            "-z",
+            "-multi",
+            str(args.sf_parallel),
+            "-name",
+            record_file["filename"],
+            record_file["uri"],
+        ],
+        stderr=sf_error_log,
+    )
+
+    # skip the sf info part
+    file_infos = yaml.safe_load_all(sf_output)
+    next(file_infos)
+
+    # go through all the files analyzed by siegfried, which can be several
+    # if the original input file was an archive
+    for file_info in file_infos:
+        if not file_info.get("errors", None) and file_info.get("matches", []):
+            for match in file_info["matches"]:
+                if match["ns"] == "pronom":
+                    format = Format(
+                        name=match["format"],
+                        puid=match["id"],
+                        mime=match["mime"],
+                        endangered=False,
+                    )
+                    format = formats.setdefault(format.puid, format)
+                    result = Result(
+                        filename=file_info["filename"],
+                        record=record_file["record"],
+                        format=format,
+                    )
+                    all_results.append(result)
+                    if formats[format.puid].endangered:
+                        endangered_files.append(result)
+
+if sf_error_log is not None:
+    sf_error_log.close()
+
+if endangered_files:
+    print(yaml.dump([dataclasses.asdict(f) for f in endangered_files]))
+
+
+# store the results to files
+try:
+    with open(args.output, "w") as output_file:
+        yaml.dump([dataclasses.asdict(res) for res in all_results], output_file)
+
+except OSError:
+    print(f"WARN: couldn't store the results ({args.output})", file=sys.stderr)
+
+try:
+    updated_formats = [dataclasses.asdict(f) for f in formats.values()]
+    with open(args.formats, "w") as formats_file:
+        yaml.dump(updated_formats, formats_file)
+
+except OSError:
+    print(f"ERROR: couldn't update the formats file ({args.formats})", file=sys.stderr)