diff --git a/README.md b/README.md index 3dc495b372210d894ea55db204f6d038436dde66..af8f20724cb8364bd430706742c7800bb6d6d89b 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,3 @@ health Utilities for checking the general health of InvenioRDM instances. -formatscaper ------------- - -A utility to assist with painting a picture of the file format landscape of uploaded files in InvenioRDM. diff --git a/formatscaper/.flake8 b/formatscaper/.flake8 deleted file mode 100644 index f295e07b9133e51496e168c3c3139849d7b56c53..0000000000000000000000000000000000000000 --- a/formatscaper/.flake8 +++ /dev/null @@ -1,3 +0,0 @@ -[flake8] -max-line-length = 88 -extend-ignore = E203, E704 diff --git a/formatscaper/.gitignore b/formatscaper/.gitignore deleted file mode 100644 index fa465718be487fb74f15f7ab26113f7421c2b477..0000000000000000000000000000000000000000 --- a/formatscaper/.gitignore +++ /dev/null @@ -1,10 +0,0 @@ -# results and other personalized (input) files -*.yml -*.yaml -*.pickle - -# logs -sf.log - -# environment variables -.env diff --git a/formatscaper/.isort.cfg b/formatscaper/.isort.cfg deleted file mode 100644 index f238bf7ea137e4e654965767884532c84df96067..0000000000000000000000000000000000000000 --- a/formatscaper/.isort.cfg +++ /dev/null @@ -1,2 +0,0 @@ -[settings] -profile = black diff --git a/formatscaper/Pipfile b/formatscaper/Pipfile deleted file mode 100644 index 8e8597265c42a43179c10319f964a38724a348b7..0000000000000000000000000000000000000000 --- a/formatscaper/Pipfile +++ /dev/null @@ -1,14 +0,0 @@ -[[source]] -url = "https://pypi.org/simple" -verify_ssl = true -name = "pypi" - -[packages] -pyyaml = "*" -progressbar2 = "*" -urwid = "*" - -[dev-packages] - -[requires] -python_version = "3.12" diff --git a/formatscaper/Pipfile.lock b/formatscaper/Pipfile.lock deleted file mode 100644 index bcea4e7c124824c77269b64522765d969777e779..0000000000000000000000000000000000000000 --- a/formatscaper/Pipfile.lock +++ /dev/null @@ -1,120 +0,0 @@ -{ - "_meta": { - 
"hash": { - "sha256": "850ae0c1010821737e12fb4fdf893fb56a6372c198a5d0667968f38679d57c3b" - }, - "pipfile-spec": 6, - "requires": { - "python_version": "3.12" - }, - "sources": [ - { - "name": "pypi", - "url": "https://pypi.org/simple", - "verify_ssl": true - } - ] - }, - "default": { - "progressbar2": { - "hashes": [ - "sha256:714995e7725fb13e10a720eccf8ce8ff775d9f26247798c71f71447a15cfbe10", - "sha256:97d323ba03ad3d017a4d047fd0b2d3e733c5a360c07f87d269f96641c3de729f" - ], - "index": "pypi", - "markers": "python_version >= '3.8'", - "version": "==4.4.1" - }, - "python-utils": { - "hashes": [ - "sha256:ad0ccdbd6f856d015cace07f74828b9840b5c4072d9e868a7f6a14fd195555a8", - "sha256:c5d161e4ca58ce3f8c540f035e018850b261a41e7cb98f6ccf8e1deb7174a1f1" - ], - "markers": "python_version >= '3.9'", - "version": "==3.8.2" - }, - "pyyaml": { - "hashes": [ - "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5", - "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc", - "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df", - "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741", - "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206", - "sha256:18aeb1bf9a78867dc38b259769503436b7c72f7a1f1f4c93ff9a17de54319b27", - "sha256:1d4c7e777c441b20e32f52bd377e0c409713e8bb1386e1099c2415f26e479595", - "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62", - "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98", - "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696", - "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290", - "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9", - "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d", - "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6", - 
"sha256:4fb147e7a67ef577a588a0e2c17b6db51dda102c71de36f8549b6816a96e1867", - "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47", - "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486", - "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6", - "sha256:596106435fa6ad000c2991a98fa58eeb8656ef2325d7e158344fb33864ed87e3", - "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007", - "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938", - "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0", - "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c", - "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735", - "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d", - "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28", - "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4", - "sha256:9046c58c4395dff28dd494285c82ba00b546adfc7ef001486fbf0324bc174fba", - "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8", - "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef", - "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5", - "sha256:afd7e57eddb1a54f0f1a974bc4391af8bcce0b444685d936840f125cf046d5bd", - "sha256:b1275ad35a5d18c62a7220633c913e1b42d44b46ee12554e5fd39c70a243d6a3", - "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0", - "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515", - "sha256:baa90d3f661d43131ca170712d903e6295d1f7a0f595074f151c0aed377c9b9c", - "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c", - "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924", - "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34", - "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43", 
- "sha256:c8098ddcc2a85b61647b2590f825f3db38891662cfc2fc776415143f599bb859", - "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673", - "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54", - "sha256:d858aa552c999bc8a8d57426ed01e40bef403cd8ccdd0fc5f6f04a00414cac2a", - "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b", - "sha256:f003ed9ad21d6a4713f0a9b5a7a0a79e08dd0f221aff4525a2be4c346ee60aab", - "sha256:f22ac1c3cac4dbc50079e965eba2c1058622631e526bd9afd45fedd49ba781fa", - "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c", - "sha256:fca0e3a251908a499833aa292323f32437106001d436eca0e6e7833256674585", - "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d", - "sha256:fd66fc5d0da6d9815ba2cebeb4205f95818ff4b79c3ebe268e75d961704af52f" - ], - "index": "pypi", - "markers": "python_version >= '3.6'", - "version": "==6.0.1" - }, - "typing-extensions": { - "hashes": [ - "sha256:69b1a937c3a517342112fb4c6df7e72fc39a38e7891a5730ed4985b5214b5475", - "sha256:b0abd7c89e8fb96f98db18d86106ff1d90ab692004eb746cf6eda2682f91b3cb" - ], - "markers": "python_version >= '3.8'", - "version": "==4.10.0" - }, - "urwid": { - "hashes": [ - "sha256:597fa2d19ac788e4607d2a48aca32f257342201cb55e5f6a00a8fcd24e62a5ab", - "sha256:80b922d2051db6abe598b7e1b0b31d8d04fcc56d35bb1ec40b3c128fa0bd23ab" - ], - "index": "pypi", - "markers": "python_version >= '3.8'", - "version": "==2.6.7" - }, - "wcwidth": { - "hashes": [ - "sha256:3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859", - "sha256:72ea0c06399eb286d978fdedb6923a9eb47e1c486ce63e9b4e64fc18303972b5" - ], - "version": "==0.2.13" - } - }, - "develop": {} -} diff --git a/formatscaper/README.md b/formatscaper/README.md deleted file mode 100644 index 396c0f791fa2f64085037204db4aafddcf3f0d0c..0000000000000000000000000000000000000000 --- a/formatscaper/README.md +++ /dev/null @@ -1,264 +0,0 @@ -# Formatscaper - -Formatscaper is a tool for 
generating an overview of the file format landscape composed of the uploads in our research data repository. - -The aim here is to assist us with the task of digital preservation; specifically with the task of identifying uploaded files which are in danger of becoming "extinct". - - -## Dependencies - -* Python 3 -* [Siegfried](https://github.com/richardlehane/siegfried#install) - - -## Usage - -Formatscaper is designed to build up the required context over its lifetime of use, so no particular setup is required. -To get started, simply feed it a list of files to analyze in the [expected format](#input) and let it run. - -The most relevant file is [`formats.yaml`](#formats-file), which is intended to receive limited manual tweaks over time (basically just setting the `endangered` flag for outdated formats). -Every time formatscaper is run, it will update this file with formats that haven't been listed before. -This means that the knowledge base about file formats gets extended over time. - -Per default, formatscaper will create a summary of endangered files it encountered and print it to standard out. -A more comprehensive summary for all encountered formats will be stored in a [results file](#results). - -Since file format detection is effectively still based on heuristics, no identification procedure is infallible - sometimes, even the best guess is wrong. -For such cases, we added a mechanism to override the result detected by siegfried on a per-file basis. -More information for this can be found [further down](#known-results). - - -Example call, with a custom path for the `sf` binary: -```sh -pipenv run ./formatscaper.py --sf-binary "${GOPATH}/bin/sf" -i record-files.yaml -``` - - - -## Rationale - -### Building file format landscapes - -Tools for creating an overview of file format landscapes already exist, like [`c3po`](https://peshkira.github.io/c3po/) and [`fitsinn`](https://github.com/datascience/fitsinn). 
-However, they tend to come with more bells and whistles than we actually require, or are not quite ready for production use yet. - - -### Using a single source of truth - -Often, the file format identification is based on the output of several utilities. -For instance, [FITS](https://projects.iq.harvard.edu/fits/home) wraps a number of individual tools and combines their output into a unified XML structure. -Unfortunately, their results are often in disagreement with each other, which necessitates a de-conflicting strategy. - -Instead of following this multi-tool approach, we've decided to rely on a single source of truth only - namely a tool called [Siegfried](https://github.com/richardlehane/siegfried). -It seems to be competitive in its file format identification capabilities (under some definition of "competitive"). -Also, it is being used as the source of truth by other software solutions in the space of digital preservation such as [Archivematica](https://github.com/artefactual/archivematica) and [RODA](https://github.com/keeps/roda). -That's certainly good enough for us. - - -### Detecting endangered formats - -It seems to be generally agreed upon that a centrally managed "list of endangered file formats" would be a desirable thing to have. -There have been attempts at creating centralized registries for file formats; for example [PRONOM](https://www.nationalarchives.gov.uk/PRONOM/), GDFR and UDFR, and the ["Just Solve the File Format Problem" wiki](http://fileformats.archiveteam.org/). -Out of these, PRONOM is the most promising candidate. -It even offers a field for the "risk" per format - however, this field does not seem to be populated for any of the registered formats. -Unfortunately, this distinct lack of availability leaves us little choice but to do it ourselves. - -There is no way for us to know all the formats that exist in the world - but luckily, this is also not required!
-We only have to know about the formats that are part of our format landscape, which is a much more manageable task. -Thus, we use a "local" list of file formats which is extended every time a new format is encountered. -We manually review this list periodically and annotate formats with a hint about their endangerment status. - - -### Storing the information outside of Invenio - -Invenio provides fields for storing information about the format for each file. -However, we often receive archives such as ZIP files which of course contain a series of other files. -Storing information about the formats of these nested files is not a standard use case and thus there is no obvious or generally agreed-upon way to do it (yet). - -Because we try to not extend the semantics of available constructs with non-standard custom meaning, we instead decided to keep this information external. - - -## Example files - -### Input - -The input for formatscaper needs to be a list of objects (in YAML format) describing the context of each file to investigate. -This includes the URI of the file, its original file name (which gets discarded by Invenio), and the record which the file is a part of. -```yaml -- filename: hosts - uri: /etc/hosts - record: 1234-abcd - -- filename: FS Table - uri: /etc/fstab - record: abcd-1234 - -- filename: researchdata.zip - uri: /mnt/data/de/ad/be/ef/data - record: abcd-1234 -``` - - -### Formats file - -The formats file (e.g. `formats.yaml`) contains information about previously encountered file formats and their endangerment status. -Some context (like a human-readable name and MIME type) per format is also provided here, primarily to make it more understandable for operators. 
-```yaml -- endangered: false - mime: text/plain - name: Plain Text File - puid: x-fmt/111 -- endangered: false - mime: application/zip - name: ZIP Format - puid: x-fmt/263 -- endangered: true - mime: null - name: Adobe Illustrator CC 2020 Artwork - puid: fmt/1864 -- endangered: false - mime: application/postscript - name: Encapsulated PostScript File Format - puid: fmt/124 -- endangered: false - mime: application/pdf - name: Acrobat PDF 1.4 - Portable Document Format - puid: fmt/18 -- endangered: false - mime: image/svg+xml - name: Scalable Vector Graphics - puid: fmt/92 -- endangered: true - mime: null - name: null - puid: UNKNOWN -``` - -The "primary key" for file formats is the PRONOM Persistent Unique Identifier (PUID). - -Note that formatscaper bases its detection of format endangerment purely on this file. -Whenever a new (previously unlisted) format is encountered, it will be added to this list. -The file is rewritten every time formatscaper is run, so extra information like comments will be discarded. - - -### Results - -The results file (e.g. `results.yaml`) contains information about each investigated file and their identified formats, along with notes about their endangerment status.
-```yaml -- filename: /etc/hosts - format: - endangered: false - mime: text/plain - name: Plain Text File - puid: x-fmt/111 - record: 1234-abcd -- filename: /etc/environment - format: - endangered: false - mime: text/plain - name: Plain Text File - puid: x-fmt/111 - record: 1234-abcd -- filename: /mnt/data/de/ad/be/ef/data - format: - endangered: false - mime: application/zip - name: ZIP Format - puid: x-fmt/263 - record: abcd-1234 -- filename: /mnt/data/de/ad/be/ef/data#README.txt - format: - endangered: false - mime: text/plain - name: Plain Text File - puid: x-fmt/111 - record: abcd-1234 -- filename: /mnt/data/de/ad/be/ef/data#results.csv - format: - endangered: false - mime: text/csv - name: Comma Separated Values - puid: x-fmt/18 - record: abcd-1234 -``` - -Note that the contents of the ZIP archive are inspected as well, with `#` as the delimiter between the archive's filename and the contained file's name. - - -### Known results - -The "known results" file can be used to override the detected file format information from siegfried per file (by filename). -The structure of this file is very similar to that of the usual results file described above, with a few minor differences. - -Each entry can specify whether or not the format is `safe`, which will be taken into consideration when reporting "endangered" files. -Further, it can optionally provide information about the actual file `format`. -If present, this information will override the format information as reported by siegfried. 
- -Example: -```yaml - -- filename: /mnt/data/de/ad/be/ef/data#something.idk - format: - puid: fmt/729 - name: SQLite Database File Format - mime: application/x-sqlite3 - endangered: false - -- filename: /mnt/data/de/ad/be/ef/data#anotherthing.bin - format: - puid: fmt/899 - name: Windows Portable Executable - mime: application/vnd.microsoft.portable-executable - endangered: true - safe: true - -- filename: /mnt/data/de/ad/be/ef/data#program.exe - safe: true -``` - -This example will override the file format detected by siegfried with the supplied values for the first two files. -For the second and third files, it will also mark the files as explicitly "safe", which will prevent formatscaper from reporting them as "endangered", even if their format is otherwise known to be endangered. - - -## Generating an input file from Invenio - -The required information is relatively straight-forward to generate using `invenio shell`: -```python -import yaml -from invenio_rdm_records.proxies import current_rdm_records_service as svc - -# get all (published) records in the system -rc = svc.record_cls -recs = [rc(rm.data, model=rm) for rm in rc.model_cls.query.all()] - -# get the expected structure from the records -record_files = [ - {"record": r["id"], "filename": fn, "uri": entry.file.file.uri} - for r in recs - for fn, entry in r.files.entries.items() - if r.files.entries -] - -# serialize the information as YAML file -with open("record-files.yaml", "w") as f: - yaml.dump(record_files, f) -``` - -The above script simply lists all files associated with any published record, but does not consider any unpublished drafts. -Such changes are very straight-forward to implement though. - - -## Filtering results - -To filter results in the shell, you can use the command [`yq`](https://github.com/mikefarah/yq) (which is a [`jq`](https://github.com/jqlang/jq) wrapper for YAML documents).
- - -Show only the endangered files: -```sh -yq 'map(select(.format.endangered == true))' results.yaml -``` - -Filtering results per record: -```sh -yq 'map(select(.record == "<RECORD-ID>"))' results.yaml -``` diff --git a/formatscaper/core/__init__.py b/formatscaper/core/__init__.py deleted file mode 100644 index a570ab66d80fd5b029a67d59800c789131665422..0000000000000000000000000000000000000000 --- a/formatscaper/core/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -from .models import Format, RecordFile, Result - -__all__ = ( - Format, - RecordFile, - Result, -) diff --git a/formatscaper/core/models.py b/formatscaper/core/models.py deleted file mode 100644 index 15749ff1bd69d14a55f4307422258e79ba76116b..0000000000000000000000000000000000000000 --- a/formatscaper/core/models.py +++ /dev/null @@ -1,62 +0,0 @@ -import dataclasses -from typing import List - - -@dataclasses.dataclass -class RecordFile: - """Information about a file to be investigated.""" - - filename: str - uri: str - record: str - - -@dataclasses.dataclass -class Format: - """Information about a file format.""" - - puid: str - name: str - mime: str - endangered: bool = False - results: List["Result"] = dataclasses.field(default_factory=list) - - def as_dict(self): - """Dump the data as dictionary.""" - return { - "puid": self.puid, - "name": self.name, - "mime": self.mime, - "endangered": self.endangered, - } - - @classmethod - def from_sf_dict(cls, dictionary): - """Parse the format from siegfried's output.""" - return cls( - puid=dictionary["id"], - name=dictionary["format"], - mime=dictionary["mime"], - ) - - -@dataclasses.dataclass -class Result: - """The format identification result for a given file.""" - - filename: str - format: Format - record: str = None - safe: bool = False - - def as_dict(self): - """Dump the data as dictionary.""" - result = { - "filename": self.filename, - "record": self.record, - "format": self.format.as_dict(), - } - if self.safe: - result["safe"] = self.safe - - return 
result diff --git a/formatscaper/core/utils.py b/formatscaper/core/utils.py deleted file mode 100644 index 7ff9c8dfb55ac9867a079589c3284a8aa5abe103..0000000000000000000000000000000000000000 --- a/formatscaper/core/utils.py +++ /dev/null @@ -1,150 +0,0 @@ -import pickle -import re -import sys -from typing import Dict, List, Optional - -import yaml - -from .models import Format, RecordFile, Result - - -def load_record_files(file_name: str) -> List[None]: - """Load the record files from the file with the given name. - - If ``file_name`` is ``"-"``, then standard in will be read. - """ - record_files = [] - if file_name == "-": - record_files = yaml.safe_load(sys.stdin) - else: - with open(file_name, "r") as input_file: - record_files = yaml.safe_load(input_file) - - return [RecordFile(**rf) for rf in record_files] - - -def load_formats(file_name: str) -> Dict[str, Format]: - """Load the known formats from the given YAML file.""" - formats = {} - - try: - with open(file_name, "r") as formats_file: - known_formats = yaml.safe_load(formats_file) - for f in known_formats: - format = Format(**f) - formats[format.puid] = format - - except FileNotFoundError: - pass - - return formats - - -def store_formats(formats: Dict[str, Format] | List[Format], file_name: str) -> bool: - """Store the known formats to the given YAML file.""" - try: - if isinstance(formats, dict): - formats = formats.values() - - updated_formats = [f.as_dict() for f in formats] - with open(file_name, "w") as formats_file: - yaml.dump(updated_formats, formats_file, sort_keys=False) - - except OSError: - print( - f"ERROR: couldn't update the formats file ({file_name})", - file=sys.stderr, - ) - - -def store_results(results: List[Result], file_name: str, file_format: str) -> bool: - """Store the results in the specified file. - - The ``file_name`` can contain ``"{FORMAT}``, which will be replaced by the - specified ``file_format``. - The latter has to be either ``yaml`` or ``pickle``. 
- """ - try: - file_name = file_name.format(FORMAT=file_format) - simple_results = [res.as_dict() for res in results] - file_mode = "w" if file_format == "yaml" else "wb" - with open(file_name, file_mode) as output_file: - if file_format.lower() == "yaml": - yaml.dump(simple_results, output_file, sort_keys=False) - elif file_format.lower() == "pickle": - pickle.dump(simple_results, output_file) - else: - print( - f"WARN: unknown format for results file ({file_format})", - file=sys.stderr, - ) - return False - - return True - - except OSError: - print( - f"WARN: couldn't store the results to file ({file_name})", - file=sys.stderr, - ) - return False - - -def load_results( - file_name: str, - file_format: Optional[str] = None, - strict: bool = True, - formats: Optional[List[Format]] = None, -) -> Optional[List[Result]]: - """Load the results from the given file. - - In case the ``file_format`` isn't specified, auto-detection is attempted. - If ``strict`` is set, then the result loading will fail if it cannot parse - the format for a result. - Optionally, a list of already known ``formats`` can be supplied to avoid - creating duplicate ``Format`` instances. - Newly encountered formats will be appended to the supplied list. 
- """ - if file_format is None: - if re.search(r"\.ya?ml$", file_name, re.IGNORECASE): - file_format = "yaml" - elif re.search(r"\.pickle$", file_name, re.IGNORECASE): - file_format = "pickle" - - if file_format not in {"pickle", "yaml"}: - print(f"WARN: invalid file format ({file_format})", file=sys.stderr) - return None - - raw_results = [] - if file_format == "pickle": - with open(file_name, "rb") as results_file: - raw_results = pickle.load(results_file) - elif file_format == "yaml": - with open(file_name, "r") as results_file: - raw_results = yaml.safe_load(results_file) - - # note: we deduplicate formats so that manipulation of one entry updates all entries - results = [] - formats = formats or [] - known_formats = {format.puid: format for format in formats} - - for res in raw_results: - format = None - try: - format = known_formats.get(res["format"]["puid"], None) - if format is None: - format = Format(**res["format"]) - known_formats[format.puid] = format - formats.append(format) - except (TypeError, KeyError) as e: - # TypeError: the result doesn't have all required parts for Format() - # KeyError: either the result doesn't have a format or it lacks the PUID - if strict: - raise e - - res.pop("format", None) - result = Result(**res, format=format) - format.results.append(result) - results.append(result) - - return results diff --git a/formatscaper/formatscaper.py b/formatscaper/formatscaper.py deleted file mode 100755 index 3b0f99cfd0fd83717fa8811ee33bde1f5f32153b..0000000000000000000000000000000000000000 --- a/formatscaper/formatscaper.py +++ /dev/null @@ -1,286 +0,0 @@ -#!/bin/env python3 - -import argparse -import os -import re -import subprocess -import sys -import threading - -import progressbar as pb -import yaml - -from core.models import Format, RecordFile, Result -from core.utils import ( - load_formats, - load_record_files, - load_results, - store_formats, - store_results, -) - -# set up the argument parser -parser = argparse.ArgumentParser( - 
description=( - "Tool for identifying the formats of listed files (and nested ones " - "in case of archives) associated to records uploaded in Invenio." - ), -) -parser.add_argument( - "--formats", - "--f", - default="formats.yaml", - help=( - "list of known file formats and if they're endangered; " - "this file will be updated (default: formats.yaml)" - ), -) -parser.add_argument( - "--input", - "-i", - default="-", - help="input file for files of per record to check (default: stdin)", -) -parser.add_argument( - "--output", - "-o", - default="results.{FORMAT}", - help=( - "file in which to store the identified format for each file " - "(default: results.{FORMAT})" - ), -) -parser.add_argument( - "--output-format", - "-F", - default="pickle", - choices=["pickle", "yaml"], - help="format of the results (default: pickle)", -) -parser.add_argument( - "--known-results", - "-k", - default=None, - help="file with known results for overriding the identification results", -) -parser.add_argument( - "--parallel", - "-p", - default=1, - type=int, - help=( - "number of siegfried processes to run in parallel; 0 and negative numbers will " - "subtract from the number of CPU cores (default: 1)" - ), -) -parser.add_argument( - "--sf-binary", - default="sf", - help="name of the siegfried binary to call (default: sf)", -) -parser.add_argument( - "--sf-error-log", - default="sf.log", - help="file in which to store sf error logs (default: sf.log)", -) -parser.add_argument( - "--no-progressbar", - "-B", - default=False, - action="store_true", - help="disable the progress bar", -) -parser.add_argument( - "--tempdir", - "-t", - default=None, - help="set directory for storing temporary files", -) -args = parser.parse_args() - - -# check the siegfried binary -try: - sf_output = subprocess.check_output([args.sf_binary, "-v"], text=True) - m = re.match(r"siegfried ((\d+\.?)+)", sf_output) - if m and m.group(1): - ver_nums = [int(num) for num in m.group(1).split(".")] - if not (ver_nums[0] 
>= 1 and ver_nums[1] >= 10): - print(f"WARN: siegfried version too old ({m.group(1)})", file=sys.stderr) - else: - print("ERROR: siegfried version could not be determined", file=sys.stderr) - sys.exit(1) -except FileNotFoundError: - print( - f"ERROR: siegfried binary could not be found ({args.sf_binary})", - file=sys.stderr, - ) - sys.exit(1) - - -# parse the list of known formats -formats = load_formats(args.formats) - -# read the list of files to analyze -record_files = load_record_files(args.input) - -# read list of known file results -known_results = {} -if args.known_results is not None: - known_results = { - res.filename: res for res in load_results(args.known_results, strict=False) - } - -# try to redirect the error logs from siegfried -try: - sf_error_log = open(args.sf_error_log, "w") -except OSError as e: - print( - f"WARN: couldn't open sf log file, printing to stderr instead ({e})", - file=sys.stderr, - ) - sf_error_log = None - -# determine the level of threads to run in parallel -# negative numbers mean "this much less than the number of CPUs I have", -# as long as the result is greater than 0 -if (num_threads := args.parallel) <= 0: - num_cores = os.cpu_count() - if num_cores is None: - num_threads = 1 - print( - ( - "WARN: couldn't determine number of CPU cores, " - "falling back to a single thread" - ), - file=sys.stderr, - ) - else: - num_threads = os.cpu_count() + num_threads - if num_threads <= 0: - print( - "ERROR: calculated number of threads would be less than 1:", - num_threads, - file=sys.stderr, - ) - sys.exit(1) - - -# progressbar curates its own list of ANSI terminals, and doesn't know about foot, -# so we claim to use xterm instead of foot -if os.environ.get("TERM") == "foot": - os.environ["TERM"] = "xterm" - -# set the directory for storing temporary files -if args.tempdir: - if os.path.exists(args.tempdir): - os.environ["TMPDIR"] = os.environ["TMP"] = args.tempdir - else: - print( - "WARN: ignoring tempdir as it does not exist:", - 
args.tempdir, - file=sys.stderr, - ) - - -# set up variables required in the collection of results -all_results = [] -endangered_files = [] -sem = threading.Semaphore(num_threads) -mutex = threading.Lock() -completed_tasks = 0 -pb_ws = [pb.Percentage(), " (", pb.SimpleProgress(), ") ", pb.Bar(), " ", pb.Timer()] -progress_bar = pb.ProgressBar(max_value=len(record_files), widgets=pb_ws) - - -def process_record_file(record_file: RecordFile) -> None: - with sem: - try: - sf_output = subprocess.check_output( - [ - args.sf_binary, - "-z", - "-multi", - "1", - "-name", - record_file.filename, - record_file.uri, - ], - stderr=sf_error_log, - ) - - # skip the sf info part - file_infos = yaml.safe_load_all(sf_output) - next(file_infos) - - # go through all the files analyzed by siegfried which can be several, - # if the original input file was an archive - for file_info in file_infos: - if not file_info.get("errors", None) and file_info.get("matches", []): - for match in file_info["matches"]: - if match["ns"] == "pronom": - format = Format.from_sf_dict(match) - - # replace first occurrence of the URI with filename - filename = file_info["filename"].replace( - record_file.uri, record_file.filename, 1 - ) - - # the storing of results needs to be mutually exclusive - with mutex: - format = formats.setdefault(format.puid, format) - result = Result( - filename=filename, - record=record_file.record, - format=format, - ) - - # let's check if we claim to know better than siegfried - if result.filename in known_results: - known_res = known_results[result.filename] - result.safe = known_res.safe - if known_res.format is not None: - result.format = formats.get( - known_res.format.puid, known_res.format - ) - - all_results.append(result) - if formats[format.puid].endangered and not result.safe: - endangered_files.append(result) - - # when the task ends, update the progress bar - with mutex: - global completed_tasks - completed_tasks += 1 - if not args.no_progressbar: - 
progress_bar.update(completed_tasks, force=True) - - except (subprocess.CalledProcessError, ValueError) as e: - print("WARN: error during sf execution:", str(e), file=sys.stderr) - - -# analyze all the files in parallel, and create the summary after all threads complete -threads = [] -for record_file in record_files or []: - thread = threading.Thread(target=process_record_file, args=[record_file]) - threads.append(thread) - thread.start() - -try: - for thread in threads: - thread.join() -except KeyboardInterrupt: - pass - -if sf_error_log is not None: - sf_error_log.close() - -if endangered_files: - print(yaml.dump([f.as_dict() for f in endangered_files], sort_keys=False)) - -# update the file with known file formats -store_formats(formats, args.formats) - -# store the results to files -output_file_name = args.output.format(FORMAT=args.output_format) -store_results(all_results, args.output, args.output_format) diff --git a/formatscaper/resultman.py b/formatscaper/resultman.py deleted file mode 100755 index 378c42082ed20c4fc0ceb26003f9e7b60ed79081..0000000000000000000000000000000000000000 --- a/formatscaper/resultman.py +++ /dev/null @@ -1,307 +0,0 @@ -#!/bin/env python3 - -import argparse -import enum -import math -from typing import Optional - -import urwid as uw -from urwid.command_map import Command - -from core.utils import Format, load_formats, load_results, store_formats - - -# helper functions -def fallback_key_handler(key: str) -> None: - """Handle keys that haven't been handled by any widgets.""" - if key == "Q": - raise uw.ExitMainLoop() - - elif key in ["q", "h", "esc"]: - # ideally these keys would be handled by the right side when it's focused - # as this action only makes sense then - columns.set_focus(left) - - elif key in ["S", "E"]: - # formats_list is a ScrollBar wrapping a ListBox for the buttons - selected = formats_list.original_widget.focus - - if selected is not None: - # each SimpleButton has a format and is wrapped in an AttrMap - format = 
selected.original_widget.format - new_value = key == "E" - - if format.endangered != new_value: - global formats_modified - - selected.set_attr_map({None: "endangered" if new_value else "safe"}) - format.endangered = new_value - formats_modified = True - - -class FormatFilter(enum.Enum): - """Filter settings for file formats.""" - - ALL = "all" - SAFE = "safe" - ENDANGERED = "endangered" - - -class SText(uw.Text): - """A selectable Text widget.""" - - _selectable = True - - def keypress(self, size, key): - """Don't do anything on keypress.""" - return key - - -class SimpleButton(uw.Button): - """Button widget with simpler decoration.""" - - button_left = uw.Text("*") - button_right = uw.Text(" ") - format: Optional[Format] = None - - -class CachingListBox(uw.ListBox): - """ListBox with caching for sizes.""" - - def __init__(self, body): - """Constructor.""" - super().__init__(body) - self._cached_sizes = {} - - self._body._modified - - def rows_max(self, size, focus): - """Scrollable protocol for sized iterable and not wrapped around contents.""" - if size in self._cached_sizes: - return self._cached_sizes[size] - - result = super().rows_max(size, focus) - self._cached_sizes[size] = result - return result - - -# parsing CLI arguments -parser = argparse.ArgumentParser( - description="TUI tool for managing file format information" -) -parser.add_argument( - "--formats", - "-f", - default="formats.yaml", - help="formats file", -) -parser.add_argument( - "--results", - "-r", - default="results.pickle", - help="results file", -) -parser.add_argument( - "--filter", - default=FormatFilter.ALL.value, - choices=[f.value for f in FormatFilter], - help="filter for formats based on their risk", -) -parser.add_argument( - "--invenio-domain", - default="researchdata.tuwien.ac.at", - help="domain name of the repository for building links", -) -args = parser.parse_args() -current_filter = FormatFilter(args.filter) - - -# loading formats & results -formats, formats_modified, 
all_results, format_record_files = [], False, [], {} -try: - formats = list(load_formats(args.formats).values()) -except Exception as e: - print(e) - -try: - # load the results and prepare them in the shape we need them later - # (Format PUID => Record => Files) - all_results = load_results(args.results, formats=formats) or [] - format_record_files = { - format.puid: { - record: [result for result in format.results if result.record == record] - for record in {result.record for result in format.results} - } - for format in formats - } -except Exception as e: - all_results, format_record_files = [], {} - print(e) - -# color palette & settings -palette = [ - ("border", "light gray,bold", "dark gray"), - ("bold", "bold", ""), - ("darkbg", "", "black"), - ("reversed", "bold,standout", ""), - ("endangered", "white", "dark red"), - ("safe", "", "black"), -] - -uw.command_map["j"] = Command.DOWN -uw.command_map["k"] = Command.UP -uw.command_map["ctrl d"] = Command.PAGE_DOWN -uw.command_map["ctrl u"] = Command.PAGE_UP - -PRONOM_BASE_URL = "https://www.nationalarchives.gov.uk/PRONOM/%(puid)s" -REPOSITORY_BASE_URL = f"https://{args.invenio_domain}/records/%(recid)s" - - -# event handlers -def handle_select_format(format: Format, button: uw.Button): - """Set up the format details side (right) based on the selected format.""" - relevant_results = { - rec: sorted(ress, key=lambda r: r.filename) - for rec, ress in format_record_files[format.puid].items() - if ress - } - - content = [] - i, num_files = 0, 0 - for rec, results in relevant_results.items(): - i += 1 - repository_url = REPOSITORY_BASE_URL % {"recid": rec} - content.append(uw.Filler(uw.Text([f" {i}) ", ("bold", repository_url)]))) - - for res in results: - content.append( - uw.AttrMap( - SText([" * ", res.filename], wrap="any"), - None, - focus_map="reversed", - ) - ) - content.append(uw.Divider()) - num_files += len(results) - - bottom = uw.SolidFill(".") - if relevant_results: - _files = "file" if num_files == 1 else 
"files" - _records = "record" if len(relevant_results) == 1 else "records" - bottom = uw.ScrollBar( - CachingListBox( - uw.SimpleFocusListWalker( - [ - uw.Divider(), - uw.AttrMap( - uw.Text( - f" {num_files:,} {_files} in " - f"{len(relevant_results):,} {_records}:" - ), - None, - focus_map="reversed", - ), - uw.Divider(), - *content, - ] - ) - ) - ) - - pronom_url = PRONOM_BASE_URL % {"puid": format.puid} - format_header = uw.Pile( - [ - uw.Filler(uw.Divider()), - uw.Filler(uw.Text([" Name: ", format.name])), - uw.Filler(uw.Text([" MIME: ", format.mime or "-"])), - uw.Filler(uw.Text([" PRONOM: ", pronom_url])), - uw.Filler(uw.Divider()), - ] - ) - - format_details = uw.Pile( - [ - ("pack", format_header), - ("pack", uw.AttrMap(uw.Filler(uw.Divider()), "border")), - bottom, - ] - ) - - right.contents.pop() - right.contents.append((format_details, ("weight", 1))) - - # if the details side has actual content to display, focus it - # columns: contains the format list (left) and the format details (right) - # right: has the format info header, divider, and files list - if relevant_results: - columns.set_focus(right) - right.set_focus(format_details) - - -# defining the basic layout -def create_format_buttons(filter: FormatFilter): - format_buttons = [] - filtered_formats = [f for f in formats if f.name] - - num_files_per_format = { - fmt.puid: sum([len(rec) for rec in format_record_files[fmt.puid].values()]) - for fmt in filtered_formats - } - max_num_files = math.ceil(math.log10(max(num_files_per_format.values()))) - - for format in sorted(filtered_formats, key=lambda f: f.name): - if filter == FormatFilter.ENDANGERED and not format.endangered: - continue - elif filter == FormatFilter.SAFE and format.endangered: - continue - - attribute = "endangered" if format.endangered else "safe" - prefix = str(num_files_per_format[format.puid]).rjust(max_num_files) - button = SimpleButton(f"[{prefix}] {format.name}") - button.format = format - uw.connect_signal(button, "click", 
handle_select_format, user_args=[format]) - format_buttons.append(uw.AttrMap(button, attribute, focus_map="reversed")) - - return format_buttons - - -formats_list = uw.ScrollBar( - CachingListBox(uw.SimpleFocusListWalker(create_format_buttons(current_filter))) -) -formats_list._command_map["l"] = "activate" -formats_label = uw.Filler(uw.AttrMap(uw.Text("FORMATS", align="center"), "border")) -left = uw.Pile([("pack", formats_label), formats_list]) - -help_text = f""" -Formatscaper results manager help - -+---------------------------------------------------------------+ -| Up/Down/j/k/^d/^u: navigate up and down | -| Enter/Space/l: select current format | -| Esc/h/q: go back to formats list | -| S/E: mark focussed format as safe/endangered | -| Q/^c: quit with/without saving changes | -+---------------------------------------------------------------+ - -Loaded {len(all_results):,} results for {len(format_record_files):,} formats -""" -details = uw.AttrMap(uw.Filler(uw.Text(help_text, align="center")), "darkbg") -details_label = uw.Filler(uw.AttrMap(uw.Text("DETAILS", align="center"), "border")) -right = uw.Pile([("pack", details_label), details]) - -div = uw.AttrMap(uw.SolidFill(" "), "border") -columns = uw.Columns([("weight", 1, left), (1, div), ("weight", 2, right)]) - -filter_info = uw.Text(f"Showing: {current_filter.value} formats") -status_line = uw.Filler(uw.AttrMap(filter_info, "border")) -top = uw.Pile([uw.AttrMap(columns, "darkbg"), ("pack", status_line)]) - -loop = uw.MainLoop(top, palette, unhandled_input=fallback_key_handler) - -try: - loop.run() - if formats_modified: - store_formats(formats, args.formats) - -except KeyboardInterrupt: - loop.stop()