diff --git a/formatscaper/.gitignore b/formatscaper/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..7ea56fcdb4774aea15017e4033203998af6deefd --- /dev/null +++ b/formatscaper/.gitignore @@ -0,0 +1,3 @@ +formats.yml +results.yml +sf.log diff --git a/formatscaper/Pipfile b/formatscaper/Pipfile new file mode 100644 index 0000000000000000000000000000000000000000..af383c7724a4ebb64185206477d01dafdd4234c9 --- /dev/null +++ b/formatscaper/Pipfile @@ -0,0 +1,13 @@ +[[source]] +url = "https://pypi.org/simple" +verify_ssl = true +name = "pypi" + +[packages] +pyyaml = "*" +progressbar2 = "*" + +[dev-packages] + +[requires] +python_version = "3.9" diff --git a/formatscaper/Pipfile.lock b/formatscaper/Pipfile.lock new file mode 100644 index 0000000000000000000000000000000000000000..a8cc6aec2ebda9ee7790adc8c9bfa7a90d60ee5e --- /dev/null +++ b/formatscaper/Pipfile.lock @@ -0,0 +1,103 @@ +{ + "_meta": { + "hash": { + "sha256": "f17599e0484b4226bebde0b6b444e0c48dcac21c3258f6354ef585800fdd2ab4" + }, + "pipfile-spec": 6, + "requires": { + "python_version": "3.9" + }, + "sources": [ + { + "name": "pypi", + "url": "https://pypi.org/simple", + "verify_ssl": true + } + ] + }, + "default": { + "progressbar2": { + "hashes": [ + "sha256:036fa3bd35ae27c92e73fce4fb18aa4ba5090a1880d880cf954ecb75ccd6f3fb", + "sha256:c37e6e1b4e57ab43f95c3d0e8d90061bec140e4fed56b8343183db3aa1e19a52" + ], + "index": "pypi", + "markers": "python_version >= '3.8'", + "version": "==4.3.2" + }, + "python-utils": { + "hashes": [ + "sha256:ec3a672465efb6c673845a43afcfafaa23d2594c24324a40ec18a0c59478dc0b", + "sha256:efdf31c8154667d7dc0317547c8e6d3b506c5d4b6e360e0c89662306262fc0ab" + ], + "markers": "python_version >= '3.9'", + "version": "==3.8.1" + }, + "pyyaml": { + "hashes": [ + "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5", + "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc", + "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df", + "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741", + "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206", + "sha256:18aeb1bf9a78867dc38b259769503436b7c72f7a1f1f4c93ff9a17de54319b27", + "sha256:1d4c7e777c441b20e32f52bd377e0c409713e8bb1386e1099c2415f26e479595", + "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62", + "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98", + "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696", + "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290", + "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9", + "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d", + "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6", + "sha256:4fb147e7a67ef577a588a0e2c17b6db51dda102c71de36f8549b6816a96e1867", + "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47", + "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486", + "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6", + "sha256:596106435fa6ad000c2991a98fa58eeb8656ef2325d7e158344fb33864ed87e3", + "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007", + "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938", + "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0", + 
"sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c", + "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735", + "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d", + "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28", + "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4", + "sha256:9046c58c4395dff28dd494285c82ba00b546adfc7ef001486fbf0324bc174fba", + "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8", + "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5", + "sha256:afd7e57eddb1a54f0f1a974bc4391af8bcce0b444685d936840f125cf046d5bd", + "sha256:b1275ad35a5d18c62a7220633c913e1b42d44b46ee12554e5fd39c70a243d6a3", + "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0", + "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515", + "sha256:baa90d3f661d43131ca170712d903e6295d1f7a0f595074f151c0aed377c9b9c", + "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c", + "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924", + "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34", + "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43", + "sha256:c8098ddcc2a85b61647b2590f825f3db38891662cfc2fc776415143f599bb859", + "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673", + "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54", + "sha256:d858aa552c999bc8a8d57426ed01e40bef403cd8ccdd0fc5f6f04a00414cac2a", + "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b", + "sha256:f003ed9ad21d6a4713f0a9b5a7a0a79e08dd0f221aff4525a2be4c346ee60aab", + "sha256:f22ac1c3cac4dbc50079e965eba2c1058622631e526bd9afd45fedd49ba781fa", + "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c", + "sha256:fca0e3a251908a499833aa292323f32437106001d436eca0e6e7833256674585", + "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d", + "sha256:fd66fc5d0da6d9815ba2cebeb4205f95818ff4b79c3ebe268e75d961704af52f" + ], + "index": "pypi", + "markers": "python_version >= '3.6'", + "version": "==6.0.1" + }, + "typing-extensions": { + "hashes": [ + "sha256:23478f88c37f27d76ac8aee6c905017a143b0b1b886c3c9f66bc2fd94f9f5783", + "sha256:af72aea155e91adfc61c3ae9e0e342dbc0cba726d6cba4b6c72c1f34e47291cd" + ], + "markers": "python_version >= '3.8'", + "version": "==4.9.0" + } + }, + "develop": {} +} diff --git a/formatscaper/README.md b/formatscaper/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d5843151f93923a8ab6f369e6dcd5565fe4c0df0 --- /dev/null +++ b/formatscaper/README.md @@ -0,0 +1,197 @@ +# Formatscaper + +Formatscaper is a tool for generating an overview of the file format landscape composed of the uploads in our research data repository. + +The aim here is to assist us with the task of digital preservation; specifically with the task of identifying uploaded files which are in danger of becoming "extinct". + + +## Dependencies + +* Python 3 +* [Siegfried](https://github.com/richardlehane/siegfried#install) + + +## Usage + +Formatscaper is designed to build up the required context over its lifetime of use, so no particular setup is required. +To get started, simply feed it a list of files to analyze in the [expected format](#input) and let it run. 
+
+The most relevant file is [`formats.yml`](#formats-file), which is intended to receive limited manual tweaks over time (basically just setting the `endangered` flag for outdated formats).
+Every time formatscaper is run, it will update this file with formats that haven't been listed before.
+This means that the knowledge base about file formats gets extended over time.
+
+By default, formatscaper will create a summary of the endangered files it encountered and print it to standard output.
+A more comprehensive summary of all encountered formats will be stored in a [results file](#results).
+
+
+Example call, with a custom path for the `sf` binary:
+```sh
+pipenv run ./formatscaper.py --sf-binary "${GOPATH}/bin/sf" -i record-files.yaml
+```
+
+
+## Rationale
+
+### Building file format landscapes
+
+Tools for creating an overview of file format landscapes already exist, like [`c3po`](https://peshkira.github.io/c3po/) and [`fitsinn`](https://github.com/datascience/fitsinn).
+However, they tend to come with more bells and whistles than we actually require, or are not quite ready for production use yet.
+
+
+### Using a single source of truth
+
+Often, file format identification is based on the output of several utilities.
+For instance, [FITS](https://projects.iq.harvard.edu/fits/home) wraps a number of individual tools and combines their output into a unified XML structure.
+Unfortunately, their results are often in disagreement with each other, which necessitates a de-conflicting strategy.
+
+Instead of following this multi-tool approach, we've decided to rely on a single source of truth only - namely a tool called [Siegfried](https://github.com/richardlehane/siegfried).
+It seems to be competitive in its file format identification capabilities (under some definition of "competitive").
+Also, it is being used as the source of truth by other software solutions in the space of digital preservation, such as [Archivematica](https://github.com/artefactual/archivematica) and [RODA](https://github.com/keeps/roda).
+That's certainly good enough for us.
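+
+For reference, formatscaper only consumes a small subset of Siegfried's YAML report; per analyzed file, this boils down to roughly the following fields (abridged sketch, with illustrative values taken from the examples below):
+```yml
+---
+filename: "/mnt/data/de/ad/be/ef/data"
+errors: ""
+matches:
+  - ns: "pronom"
+    id: "x-fmt/263"
+    format: "ZIP Format"
+    mime: "application/zip"
+```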
+
+
+### Detecting endangered formats
+
+It seems to be generally agreed upon that a centrally managed "list of endangered file formats" would be a desirable thing to have.
+There have been attempts at creating centralized registries for file formats; for example [PRONOM](https://www.nationalarchives.gov.uk/PRONOM/), GDFR and UDFR, and the ["Just Solve the File Format Problem" wiki](http://fileformats.archiveteam.org/).
+Out of these, PRONOM is the most promising candidate.
+It even offers a field for the "risk" per format - however, this field does not seem to be populated for any of the registered formats.
+Unfortunately, this lack of available information leaves us little choice but to do it ourselves.
+
+There is no way for us to know all the formats that exist in the world - but luckily, this is also not required!
+We only have to know about the formats that are part of our format landscape, which is a much more manageable task.
+Thus, we use a "local" list of file formats which is extended every time a new format is encountered.
+We manually review this list periodically and annotate formats with a hint about their endangerment status.
+
+
+### Storing the information outside of Invenio
+
+Invenio provides fields for storing information about the format of each file.
+However, we often receive archives such as ZIP files, which of course contain a number of other files.
+Storing information about the formats of these nested files is not a standard use case, and thus there is no obvious or generally agreed-upon way to do it (yet).
+
+Because we try not to extend the semantics of available constructs with non-standard custom meaning, we instead decided to keep this information external.
+
+
+## Example files
+
+### Input
+
+The input for formatscaper needs to be a list of objects (in YAML format) describing the context of each file to investigate.
+This includes the URI of the file, its original file name (which gets discarded by Invenio), and the record which the file is a part of.
+```yml
+- filename: hosts
+  uri: /etc/hosts
+  record: 1234-abcd
+
+- filename: FS Table
+  uri: /etc/fstab
+  record: abcd-1234
+
+- filename: researchdata.zip
+  uri: /mnt/data/de/ad/be/ef/data
+  record: abcd-1234
+```
+
+
+### Formats file
+
+The formats file (e.g. `formats.yml`) contains information about previously encountered file formats and their endangerment status.
+Some context (like a human-readable name and MIME type) per format is also provided here, primarily to make it more understandable for operators.
+```yml
+- endangered: false
+  mime: text/plain
+  name: Plain Text File
+  puid: x-fmt/111
+- endangered: false
+  mime: application/zip
+  name: ZIP Format
+  puid: x-fmt/263
+- endangered: true
+  mime: null
+  name: Adobe Illustrator CC 2020 Artwork
+  puid: fmt/1864
+- endangered: false
+  mime: application/postscript
+  name: Encapsulated PostScript File Format
+  puid: fmt/124
+- endangered: false
+  mime: application/pdf
+  name: Acrobat PDF 1.4 - Portable Document Format
+  puid: fmt/18
+- endangered: false
+  mime: image/svg+xml
+  name: Scalable Vector Graphics
+  puid: fmt/92
+- endangered: true
+  mime: null
+  name: null
+  puid: UNKNOWN
+```
+
+The "primary key" for file formats is the PRONOM Persistent Unique Identifier (PUID).
+
+Note that formatscaper bases its detection of format endangerment purely on this file.
+Whenever a new (previously unlisted) format is encountered, it will be added to this list.
+The file is rewritten every time formatscaper is run, so extra information like comments will be discarded.
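+
+Flagging a format as endangered only requires flipping its `endangered` value; this can be done in any editor, or with the same `yq` tool used for [filtering results](#filtering-results) below - something like this should work:
+```sh
+yq -i '(.[] | select(.puid == "<PUID>")).endangered = true' formats.yml
+```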
+
+
+### Results
+
+The results file (e.g. `results.yml`) contains information about each investigated file and its identified format, along with a note about its endangerment status.
+```yml
+- filename: /etc/hosts
+  format:
+    endangered: false
+    mime: text/plain
+    name: Plain Text File
+    puid: x-fmt/111
+  record: 1234-abcd
+- filename: /etc/environment
+  format:
+    endangered: false
+    mime: text/plain
+    name: Plain Text File
+    puid: x-fmt/111
+  record: 1234-abcd
+- filename: /mnt/data/de/ad/be/ef/data
+  format:
+    endangered: false
+    mime: application/zip
+    name: ZIP Format
+    puid: x-fmt/263
+  record: abcd-1234
+- filename: /mnt/data/de/ad/be/ef/data#README.txt
+  format:
+    endangered: false
+    mime: text/plain
+    name: Plain Text File
+    puid: x-fmt/111
+  record: abcd-1234
+- filename: /mnt/data/de/ad/be/ef/data#results.csv
+  format:
+    endangered: false
+    mime: text/csv
+    name: Comma Separated Values
+    puid: x-fmt/18
+  record: abcd-1234
+```
+
+Note that the contents of the ZIP archive are inspected as well, with `#` as the delimiter between the archive's filename and the contained file's name.
+
+
+## Filtering results
+
+To filter results in the shell, you can use [`yq`](https://github.com/mikefarah/yq) (a command-line YAML processor with [`jq`](https://github.com/jqlang/jq)-like syntax).
+
+Show only the endangered files:
+```sh
+yq 'map(select(.format.endangered == true))' results.yml
+```
+
+Show only the results for a given record:
+```sh
+yq 'map(select(.record == "<RECORD-ID>"))' results.yml
+```
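+
+The same pattern can be combined with other `yq` operators; for example, to count the endangered files:
+```sh
+yq 'map(select(.format.endangered == true)) | length' results.yml
+```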
diff --git a/formatscaper/formatscaper.py b/formatscaper/formatscaper.py
new file mode 100755
index 0000000000000000000000000000000000000000..c492e1166217dcb9c3728534f2222f31f4430b51
--- /dev/null
+++ b/formatscaper/formatscaper.py
@@ -0,0 +1,198 @@
+#!/usr/bin/env python3
+
+import argparse
+import dataclasses
+import re
+import subprocess
+import sys
+
+import progressbar
+import yaml
+
+
+@dataclasses.dataclass
+class Format:
+    puid: str
+    name: str
+    mime: str
+    endangered: bool
+
+
+@dataclasses.dataclass
+class Result:
+    filename: str
+    record: str
+    format: Format
+
+
+# set up the argument parser
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    "--formats",
+    "-f",
+    default="formats.yml",
+    help="list of known file formats and whether they're endangered; this file will be updated (default: formats.yml)",  # noqa
+)
+parser.add_argument(
+    "--input",
+    "-i",
+    default="-",
+    help="input file listing the files of each record to check (default: stdin)",
+)
+parser.add_argument(
+    "--output",
+    "-o",
+    default="results.yml",
+    help="file in which to store the identified format for each file (default: results.yml)",  # noqa
+)
+parser.add_argument(
+    "--sf-binary",
+    default="sf",
+    help="name of the siegfried binary to call (default: sf)",
+)
+parser.add_argument(
+    "--sf-parallel",
+    default=1,
+    type=int,
+    help="number of parallel processes used by sf (default: 1)",
+)
+parser.add_argument(
+    "--sf-error-log",
+    default="sf.log",
+    help="file in which to store sf error logs (default: sf.log)",
+)
+parser.add_argument(
+    "--no-progressbar",
+    "-P",
+    default=False,
+    action="store_true",
+    help="disable the progress bar",
+)
+args = parser.parse_args()
+
+
+# check the siegfried binary
+try:
+    sf_output = subprocess.check_output([args.sf_binary, "-v"], text=True)
+    m = re.match(r"siegfried ((\d+\.?)+)", sf_output)
+    if m and m.group(1):
+        ver_nums = [int(num) for num in m.group(1).split(".")]
+        # warn if the siegfried version is older than 1.10
+        if ver_nums[:2] < [1, 10]:
+            print(f"WARN: siegfried version too old ({m.group(1)})", file=sys.stderr)
+    else:
+        print("ERROR: siegfried version could not be determined", file=sys.stderr)
+        sys.exit(1)
+except FileNotFoundError:
+    print(
+        f"ERROR: siegfried binary could not be found ({args.sf_binary})",
+        file=sys.stderr,
+    )
+    sys.exit(1)
+
+
+# parse the list of known formats
+formats = {}
+try:
+    with open(args.formats, "r") as formats_file:
+        known_formats = yaml.safe_load(formats_file)
+        for f in known_formats:
+            format = Format(
+                puid=f["puid"],
+                name=f["name"],
+                mime=f["mime"],
+                endangered=f["endangered"],
+            )
+            formats[format.puid] = format
+except FileNotFoundError:
+    pass
+
+
+# read the list of files to analyze
+record_files = []
+if args.input == "-":
+    record_files = yaml.safe_load(sys.stdin)
+else:
+    with open(args.input, "r") as input_file:
+        record_files = yaml.safe_load(input_file)
+
+
+# try to redirect the error logs from siegfried
+try:
+    sf_error_log = open(args.sf_error_log, "w")
+except OSError as e:
+    print(
+        f"WARN: couldn't open sf log file, printing to stderr instead ({e})",
+        file=sys.stderr,
+    )
+    sf_error_log = None
+
+
+# analyze each file listed in the record files
+all_results = []
+endangered_files = []
+if not args.no_progressbar:
+    record_files = progressbar.progressbar(record_files or [])
+
+for record_file in record_files or []:
+    sf_output = subprocess.check_output(
+        [
+            args.sf_binary,
+            "-z",
+            "-multi",
+            str(args.sf_parallel),
+            "-name",
+            record_file["filename"],
+            record_file["uri"],
+        ],
+        stderr=sf_error_log,
+    )
+
+    # skip the sf info part
+    file_infos = yaml.safe_load_all(sf_output)
+    next(file_infos)
+
+    # go through all the files analyzed by siegfried, which can be several
+    # if the original input file was an archive
+    for file_info in file_infos:
+        if not file_info.get("errors", None) and file_info.get("matches", []):
+            for match in file_info["matches"]:
+                if match["ns"] == "pronom":
+                    format = Format(
+                        name=match["format"],
+                        puid=match["id"],
+                        mime=match["mime"],
+                        endangered=False,
+                    )
+                    format = formats.setdefault(format.puid, format)
+                    result = Result(
+                        filename=file_info["filename"],
+                        record=record_file["record"],
+                        format=format,
+                    )
+                    all_results.append(result)
+                    if formats[format.puid].endangered:
+                        endangered_files.append(result)
+
+if sf_error_log is not None:
+    sf_error_log.close()
+
+if endangered_files:
+    print(yaml.dump([dataclasses.asdict(f) for f in endangered_files]))
+
+
+# store the results to files
+try:
+    with open(args.output, "w") as output_file:
+        yaml.dump([dataclasses.asdict(res) for res in all_results], output_file)
+
+except OSError:
+    print(f"WARN: couldn't store the results ({args.output})", file=sys.stderr)
+
+try:
+    updated_formats = [dataclasses.asdict(f) for f in formats.values()]
+    with open(args.formats, "w") as formats_file:
+        yaml.dump(updated_formats, formats_file)
+
+except OSError:
+    print(f"ERROR: couldn't update the formats file ({args.formats})", file=sys.stderr)