diff --git a/.gitignore b/.gitignore index ee4cd3150d09b22a777003bd2b42ccf12dc7ce09..882fc836440d9a46bfc3ae45826f6292d584d393 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,10 @@ *.pickle *.sqlite +# don't ignore some YAML files though! +!.gitlab-ci.yml +!tests/**/*.yml + # logs sf.log @@ -16,3 +20,5 @@ formatscaper.egg-info Pipfile Pipfile.lock **/__pycache__ +.coverage +dist diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000000000000000000000000000000000000..561f358890ec22b489422f81a9a4371f6345a374 --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,28 @@ +# vim: ts=2 + +stages: + - testing + - release + +run-tests: + stage: testing + script: + - pip install --upgrade pip pipenv + - pipenv --rm || true + - pipenv run pip install -e '.[tests]' + - pipenv run pytest + coverage: /TOTAL.*? (100(?:\.0+)?\%|[1-9]?\d(?:\.\d+)?\%)$/ + +pypi-release: + stage: release + needs: + - run-tests + rules: + - if: '$CI_COMMIT_TAG =~ /^v\d+/' + script: + - pip3 install --upgrade pip build twine check-manifest + - rm -f dist/* + - python3 -m check_manifest + - python3 -m build + - python3 -m twine check dist/* + - TWINE_USERNAME=${PYPI_USER} TWINE_PASSWORD=${PYPI_PASSWORD} python3 -m twine upload --skip-existing --non-interactive dist/* diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..db6e3c0d6517dc175c894b787acabde00d420500 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (C) 2024 TU Wien. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000000000000000000000000000000000000..88db298a9d5338431d8ed6b418bf55a5217260ab --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,8 @@ +exclude .gitlab-ci.yml +exclude *.sqlite +exclude *.log +exclude *.yml + +include MANIFEST.in +include LICENSE +recursive-include tests *.py *.yml diff --git a/README.md b/README.md index 396c0f791fa2f64085037204db4aafddcf3f0d0c..69e1f88ff20c3304dcddf2ff9f6dfcd0890b874d 100644 --- a/README.md +++ b/README.md @@ -11,26 +11,37 @@ The aim here is to assist us with the task of digital preservation; specifically * [Siegfried](https://github.com/richardlehane/siegfried#install) -## Usage +## General info Formatscaper is designed to build up the required context over its lifetime of use, so no particular setup is required. To get started, simply feed it a list of files to analyze in the [expected format](#input) and let it run. 
-The most relevant file is [`formats.yaml`](#formats-file), which is intended to receive limited manual tweaks over time (basically just setting the `endangered` flag for outdated formats). -Every time formatscaper is run, it will update this file with formats that haven't been listed before. -This means that the knowledge base about file formats gets extended over time. +With every run, the database will be extended with information about file formats that haven't been encountered before. +This effectively builds up a knowledge base of the file formats that are actually present in the system (i.e. the file format landscape). -Per default, formatscaper will create a summary of endangered files it encountered and print it to standard out. -A more comprehensive summary for all encountered formats will be stored in a [results file](#results). +*Note* that whenever a new format is encountered, formatscaper will not give out a rating about the *risk of obsolescence* for this format. +This task is instead left to the operator, e.g. via the `resultman` command. -Since file format detection is effectively still based on heuristics, no identification procedure is infallible - sometimes, even the best guess is wrong. -For such cases, we added a mechanism to override the result detected by siegfried on a per-file basis. -More information for this can be found [further down](#known-results). +However, the **risk** of datasets becoming unusable in the future isn't purely based on the formats in question. +Some of the files in a dataset may be stored in a very outdated or problematic format, but the overall dataset can still be reused just fine without them. +An example would be temporary files that have been added to the dataset by accident. +Thus, each scanned file also has an *impact* assessment attached to it, which again needs to be provided by the operator. 
+This *impact* assessment can then be used together with the *risk of obsolescence* for its format to calculate the overall *risk assessment*. +*Note*: Since file format detection is effectively still based on heuristics, no identification procedure is infallible - sometimes, even the best guess is wrong. +For such cases, we added a mechanism to override the result detected by siegfried on a per-file basis in the database. -Example call, with a custom path for the `sf` binary: + +### Example usage + +Example `formatscaper` call, with a custom path for the `sf` binary: ```sh -pipenv run ./formatscaper.py --sf-binary "${GOPATH}/bin/sf" -i record-files.yaml +$ formatscaper --sf-binary "${GOPATH}/bin/sf" -i record-files.yaml +``` + +After formatscaper has collected the results in the database, they can be viewed e.g. via `resultman`: +```sh +$ resultman ``` @@ -101,125 +112,94 @@ This includes the URI of the file, its original file name (which gets discarded ### Formats file -The formats file (e.g. `formats.yaml`) contains information about previously encountered file formats and their endangerment status. +Information about the encountered file formats can be exported in YAML format (e.g. as `formats.yml`). +This primarily comprises a unique identifier (PUID) and the *risk of obsolescence* (i.e. the *probability of the format dying out*). Some context (like a human-readable name and MIME type) per format is also provided here, primarily to make it more understandable for operators. 
```yaml -- endangered: false +- risk: 1 mime: text/plain name: Plain Text File puid: x-fmt/111 -- endangered: false +- risk: 1 mime: application/zip name: ZIP Format puid: x-fmt/263 -- endangered: true +- risk: 2 mime: null name: Adobe Illustrator CC 2020 Artwork puid: fmt/1864 -- endangered: false +- risk: 3 mime: application/postscript name: Encapsulated PostScript File Format puid: fmt/124 -- endangered: false +- risk: 2 mime: application/pdf name: Acrobat PDF 1.4 - Portable Document Format puid: fmt/18 -- endangered: false +- risk: 1 mime: image/svg+xml name: Scalable Vector Graphics puid: fmt/92 -- endangered: true +- risk: 5 mime: null name: null puid: UNKNOWN ``` -The "primary key" for file formats are the PRONOM Persistent Unique Identifier (PUID). - -Note that formatscaper bases its detection of format endangerment purely on this file. -Whenever a new (previously unlisted) format is encountered, it will be added to this list. -The file is rewritten every time formatscaper is run, so extra information like comments will be discarded. +The PRONOM Persistent Unique Identifier (PUID) can be used to uniquely identify formats. +It can also be used to construct a URL (of the shape `https://www.nationalarchives.gov.uk/PRONOM/${puid}`) pointing to additional information about the format. ### Results -The results file (e.g. `results.yaml`) contains information about each investigated file and their identified formats, along with a notes about their endangerment status. +The file format identification results for each file (along with their risk assessment) can also be exported into a YAML file (e.g. `results.yml`). +The resulting file contains information about each investigated file and their identified formats, along with notes about their risk assessment. 
```yaml - filename: /etc/hosts format: - endangered: false + risk: 1 mime: text/plain name: Plain Text File puid: x-fmt/111 record: 1234-abcd + impact: 2 - filename: /etc/environment format: - endangered: false + risk: 1 mime: text/plain name: Plain Text File puid: x-fmt/111 record: 1234-abcd + impact: 3 - filename: /mnt/data/de/ad/be/ef/data format: - endangered: false + risk: 1 mime: application/zip name: ZIP Format puid: x-fmt/263 record: abcd-1234 + impact: 4 - filename: /mnt/data/de/ad/be/ef/data#README.txt format: - endangered: false + risk: 1 mime: text/plain name: Plain Text File puid: x-fmt/111 record: abcd-1234 + impact: 3 - filename: /mnt/data/de/ad/be/ef/data#results.csv format: - endangered: false + risk: 1 mime: text/csv name: Comma Separated Values puid: x-fmt/18 record: abcd-1234 + impact: 5 ``` Note that the contents of the ZIP archive are inspected as well, with `#` as the delimiter between the archive's filename and the contained file's name. -### Known results - -The "known results" file can be used to override the detected file format information from siegfried per file (by filename). -The structure of this file is very similar to that of the usual results file described above, with a few minor differences. - -Each entry can specify whether or not the format is `safe`, which will be taken into consideration when reporting "endangered" files. -Further, it can optionally provide information about the actual file `format`. -If present, this information will override the format information as reported by siegfried. 
- -Example: -```yaml - -- filename: /mnt/data/de/ad/be/ef/data#something.idk - format: - puid: fmt/729 - name: SQLite Database File Format - mime: application/x-sqlite3 - endangered: false - -- filename: /mnt/data/de/ad/be/ef/data#anotherthing.bin - format: - puid: fmt/899 - name: Windows Portable Executable - mime: application/vnd.microsoft.portable-executable - endangered: true - safe: true - -- filename: /mnt/data/de/ad/be/ef/data#program.exe - safe: true -``` - -This example will override the file format detected by siegfried with the supplied values for the first two files. -For the second and third file, it will also mark the files as explicitly "safe" which will prevent formatscaper to report them as "endangered", even if their format is otherwise known to be. - - ## Generating an input file from Invenio The required information is relatively straight-forward to generate using `invenio shell`: @@ -246,19 +226,3 @@ with open("record-files.yaml", "w") as f: The above script simply lists all files associated with any published record, but does not consider any unpublished drafts. Such changes are very straight-forward to implement though. - - -## Filtering results - -To filter results in the shell, you can use the command [`yq`](https://github.com/mikefarah/yq) (which is a [`jq`](https://github.com/jqlang/jq) wrapper for YAML documents). 
- - -Show only the endangered files: -```sh -yq 'map(select(.format.endangered == true))' results.yaml -``` - -Filtering results per record: -```sh -yq 'map(select(.record == "<RECORD-ID>"))' results.yaml -``` diff --git a/formatscaper/__init__.py b/formatscaper/__init__.py index a570ab66d80fd5b029a67d59800c789131665422..c7d687d5e32421a46677d5d4828cb4ea23708bee 100644 --- a/formatscaper/__init__.py +++ b/formatscaper/__init__.py @@ -1,3 +1,5 @@ +"""Formatscaper is a tool for generating an overview of the file format landscape.""" + from .models import Format, RecordFile, Result __all__ = ( diff --git a/formatscaper/cli/__init__.py b/formatscaper/cli/__init__.py index b6e972b73a66c41f20c2a9d5e1d92cdc000fce7c..4a813220db57b054e6ff5e1d6db242b5222587c5 100644 --- a/formatscaper/cli/__init__.py +++ b/formatscaper/cli/__init__.py @@ -1,3 +1,5 @@ +"""CLI utilities for formatscaper.""" + from .formatscaper import run_formatscaper_cli from .resultman import run_resultman_cli diff --git a/formatscaper/cli/formatscaper.py b/formatscaper/cli/formatscaper.py index 16fbb43a3abfe611bf19d588008fdbf0520a2b91..36d6129d6636f6a8bb393be56434a7e765809e0a 100755 --- a/formatscaper/cli/formatscaper.py +++ b/formatscaper/cli/formatscaper.py @@ -1,10 +1,14 @@ #!/bin/env python3 +"""The file format identification command for formatscaper.""" + import argparse import os import re +import shutil import subprocess import sys +import tempfile import threading import progressbar as pb @@ -18,7 +22,6 @@ completed_tasks = 0 def parse_cli_args(): """Run the formatscaper command.""" - # set up the argument parser parser = argparse.ArgumentParser( description=( @@ -79,7 +82,7 @@ def scape_formats(config): m = re.match(r"siegfried ((\d+\.?)+)", sf_output) if m and m.group(1): ver_nums = [int(num) for num in m.group(1).split(".")] - if not (ver_nums[0] >= 1 and ver_nums[1] >= 10): + if not (ver_nums[0] >= 1 and ver_nums[1] >= 11): print( f"WARN: siegfried version too old ({m.group(1)})", file=sys.stderr 
) @@ -93,19 +96,9 @@ def scape_formats(config): ) sys.exit(1) - # parse the list of known formats - formats = {f.puid: f for f in session.query(Format).all()} - # read the list of files to analyze record_files = load_record_files(config.input) - # read list of known file results - known_results = { - res.filename: res - # TODO not only marked as safe, but also generally touched-up results - for res in session.query(Result).filter(Result.safe == True) # noqa: E714 - } - # try to redirect the error logs from siegfried try: sf_error_log = open(config.sf_error_log, "w") @@ -157,7 +150,6 @@ def scape_formats(config): ) # set up variables required in the collection of results - endangered_files = [] sem = threading.Semaphore(num_threads) mutex = threading.Lock() pb_ws = [ @@ -170,26 +162,54 @@ def scape_formats(config): pb.Timer(), ] progress_bar = pb.ProgressBar(max_value=len(record_files), widgets=pb_ws) + base_dir = tempfile.mkdtemp() def process_record_file(record_file: RecordFile) -> None: with sem: + # link the files under investigation into a scoped directory + file_dir = os.path.join(base_dir, record_file.record) + file_path = os.path.join(file_dir, record_file.filename) + try: - sf_output = subprocess.check_output( - [ - config.sf_binary, - "-z", - "-multi", - "1", - "-name", - record_file.filename, - record_file.uri, - ], - stderr=sf_error_log, + # if we already have an overridden result for the record file + # in question, we skip it + overridden_result = ( + session.query(Result) + .filter( + Result.record == record_file.record, + Result.filename == record_file.filename, + Result.overridden.is_(True), + ) + .one_or_none() ) - # skip the sf info part - file_infos = yaml.safe_load_all(sf_output) - next(file_infos) + if overridden_result is not None: + file_infos = [] + + else: + # create a symlink to the file with a proper name to help siegfried + # with file format identification as the file name plays a role + # (this will be deleted afterwards) + 
os.makedirs(file_dir, exist_ok=True) + os.symlink(record_file.uri, file_path) + + sf_output = subprocess.check_output( + [ + config.sf_binary, + "-sym", + "-z", + "-multi", + "1", + "-name", + record_file.filename, + file_path, + ], + stderr=sf_error_log, + ) + + # skip the sf info part + file_infos = yaml.safe_load_all(sf_output) + next(file_infos) # go through all the files analyzed by siegfried which can be several, # if the original input file was an archive @@ -197,16 +217,24 @@ def scape_formats(config): if not file_info.get("errors") and file_info.get("matches", []): for match in file_info["matches"]: if match["ns"] == "pronom": - format = Format.from_sf_dict(match) - - # replace first occurrence of the URI with filename - filename = file_info["filename"].replace( - record_file.uri, record_file.filename, 1 - ) - # the storing of results needs to be mutually exclusive + # evaluate result in mutex to avoid race conditions with mutex: - format = formats.setdefault(format.puid, format) + # retrieve or add the format + format = ( + session.query(Format) + .filter(Format.puid == match["id"]) + .one_or_none() + ) + if format is None: + format = Format.from_sf_dict(match) + session.add(format) + + # replace first occurrence of the URI with filename + filename = file_info["filename"].replace( + (file_dir + os.path.sep), "", 1 + ) + result = Result( filename=filename, record=record_file.record, @@ -214,18 +242,7 @@ def scape_formats(config): ) # check if we claim to know better than siegfried - if result.filename in known_results: - known_res = known_results[result.filename] - result.safe = known_res.safe - if known_res.format is not None: - result.format = formats.get( - known_res.format.puid, known_res.format - ) - session.add(result) - format_endangered = formats[format.puid].endangered - if format_endangered and not result.safe: - endangered_files.append(result) # when the task ends, update the progress bar with mutex: @@ -237,6 +254,13 @@ def 
scape_formats(config): except (subprocess.CalledProcessError, ValueError) as e: print("WARN: error during sf execution:", str(e), file=sys.stderr) + finally: + try: + # in any case, remove the symlink to the file we generated + os.remove(file_path) + except FileNotFoundError: + pass + # analyze all files in parallel, and create the summary after all threads complete threads = [] for record_file in record_files or []: @@ -250,12 +274,11 @@ def scape_formats(config): except KeyboardInterrupt: pass + # clean up + shutil.rmtree(base_dir) if sf_error_log is not None: sf_error_log.close() - if endangered_files: - print(yaml.dump([f.as_dict() for f in endangered_files], sort_keys=False)) - # write new results to disk session.commit() diff --git a/formatscaper/cli/resultman.py b/formatscaper/cli/resultman.py index b6b611e19ab1a96883ff88743851d63945bdacea..f79d17357b6738f4199460a87bea11cc8034939d 100755 --- a/formatscaper/cli/resultman.py +++ b/formatscaper/cli/resultman.py @@ -1,7 +1,8 @@ #!/bin/env python3 +"""Textual user interface for managing the results.""" + import argparse -import enum import math from collections import defaultdict from typing import Optional @@ -12,14 +13,6 @@ from urwid.command_map import Command from ..models import Format, Result, create_db_session -class FormatFilter(enum.Enum): - """Filter settings for file formats.""" - - ALL = "all" - SAFE = "safe" - ENDANGERED = "endangered" - - class SText(uw.Text): """A selectable Text widget.""" @@ -44,12 +37,6 @@ def parse_cli_args(): parser = argparse.ArgumentParser( description="TUI tool for managing file format information" ) - parser.add_argument( - "--filter", - default=FormatFilter.ALL.value, - choices=[f.value for f in FormatFilter], - help="filter for formats based on their risk", - ) parser.add_argument( "--invenio-domain", default="researchdata.tuwien.ac.at", @@ -77,20 +64,6 @@ def run_resultman(config): # as this action only makes sense then columns.set_focus(left) - elif key in ["S", "E"]: 
- # formats_list is a ScrollBar wrapping a ListBox for the buttons - selected = formats_list.original_widget.focus - - if selected is not None: - # each SimpleButton has a format and is wrapped in an AttrMap - format = selected.original_widget.format - new_value = key == "E" - - format.endangered = new_value - selected.set_attr_map({None: "endangered" if new_value else "safe"}) - - current_filter = FormatFilter(config.filter) - # TODO make configurable session = create_db_session("sqlite:///db.sqlite") @@ -101,15 +74,18 @@ def run_resultman(config): except Exception as e: print(e) - # color palette & settings + # color palette & settings: (name, fg, bg) palette = [ ("border", "light gray,bold", "dark gray"), ("bold", "bold", ""), ("darkbg", "", "black"), ("reversed", "bold,standout", ""), - ("endangered", "white", "dark red"), - ("safe", "", "black"), - ("safe-result", "light green", "black"), + # styling for formats based on risk + ("vlr", "black", "dark green"), + ("lr", "light green", "black"), + ("mr", "black", "yellow"), + ("hr", "white", "dark red"), + ("vhr", "black,bold", "dark red"), ] uw.command_map["j"] = Command.DOWN @@ -141,10 +117,20 @@ def run_resultman(config): content.append(uw.Filler(uw.Text([f" {i}) ", ("bold", repository_url)]))) for res in results: + try: + if res.risk == 0: + attribute = None + else: + # min: 1×1=1, max: 5×5=25 + risk = int(res.risk / 5) + attribute = ["vlr", "lr", "mr", "hr", "vhr"][risk] + except IndexError: + attribute = "vhr" + content.append( uw.AttrMap( SText([" * ", res.filename], wrap="any"), - "safe-result" if res.safe else None, + attribute, focus_map="reversed", ) ) @@ -205,18 +191,17 @@ def run_resultman(config): right.set_focus(format_details) # defining the basic layout - def create_format_buttons(filter: FormatFilter): + def create_format_buttons(): format_buttons = [] max_len_results = max([len(f.results) for f in formats]) if formats else 0 max_num_files = math.ceil(math.log10(max_len_results) if 
max_len_results else 0) for format in sorted(formats, key=lambda f: f.name or ""): - if filter == FormatFilter.ENDANGERED and not format.endangered: - continue - elif filter == FormatFilter.SAFE and format.endangered: - continue + try: + attribute = [None, "vlr", "lr", "mr", "hr", "vhr"][format.risk] + except IndexError: + attribute = "vhr" - attribute = "endangered" if format.endangered else "safe" prefix = str(len(format.results)).rjust(max_num_files) button = SimpleButton(f"[{prefix}] {format.name or '[UNKNOWN]'}") button.format = format @@ -225,7 +210,7 @@ def run_resultman(config): return format_buttons - content = create_format_buttons(current_filter) or [uw.Text("No formats available")] + content = create_format_buttons() or [uw.Text("No formats available")] formats_list = uw.ScrollBar(uw.ListBox(uw.SimpleFocusListWalker(content))) formats_list._command_map["l"] = "activate" formats_label = uw.Filler(uw.AttrMap(uw.Text("FORMATS", align="center"), "border")) @@ -239,7 +224,6 @@ def run_resultman(config): | Up/Down/j/k/^d/^u: navigate up and down | | Enter/Space/l: select current format | | Esc/h/q: go back to formats list | - | S/E: mark focussed format as safe/endangered | | Q/^c: quit with/without saving changes | +---------------------------------------------------------------+ @@ -252,8 +236,8 @@ def run_resultman(config): div = uw.AttrMap(uw.SolidFill(" "), "border") columns = uw.Columns([("weight", 1, left), (1, div), ("weight", 2, right)]) - filter_info = uw.Text(f"Showing: {current_filter.value} formats") - status_line = uw.Filler(uw.AttrMap(filter_info, "border")) + info = uw.Text(f"Showing {len(formats)} formats") + status_line = uw.Filler(uw.AttrMap(info, "border")) top = uw.Pile([uw.AttrMap(columns, "darkbg"), ("pack", status_line)]) loop = uw.MainLoop(top, palette, unhandled_input=fallback_key_handler) diff --git a/formatscaper/models.py b/formatscaper/models.py index 
3037e067a0a8367fa25971ee94ba265dfab3352a..5e9bd3da0dc33d068b08ea215d4f50c82e3087fb 100644 --- a/formatscaper/models.py +++ b/formatscaper/models.py @@ -1,7 +1,9 @@ +"""Data models for formatscaper.""" + import dataclasses from typing import List, Optional -from sqlalchemy import ForeignKey, create_engine +from sqlalchemy import ForeignKey, UniqueConstraint, create_engine from sqlalchemy.orm import DeclarativeBase, Mapped, Session, mapped_column, relationship @@ -28,12 +30,18 @@ class Format(ModelBase): puid: Mapped[Optional[str]] = mapped_column(unique=True) name: Mapped[Optional[str]] = mapped_column() mime: Mapped[Optional[str]] = mapped_column() - endangered: Mapped[bool] = mapped_column(default=False) + + # actually the probability of becoming obsolete, but that's quite a mouthful + risk: Mapped[int] = mapped_column(default=0) results: Mapped[List["Result"]] = relationship( back_populates="format", cascade="all, delete-orphan", ) + comments: Mapped[List["FormatComment"]] = relationship( + back_populates="format", + cascade="all, delete-orphan", + ) def as_dict(self): """Dump the data as dictionary.""" @@ -41,7 +49,7 @@ class Format(ModelBase): "puid": self.puid, "name": self.name, "mime": self.mime, - "endangered": self.endangered, + "risk": self.risk, } @classmethod @@ -55,7 +63,24 @@ class Format(ModelBase): def __repr__(self): """Return repr(self) without the list of results.""" - return f"Format(puid='{self.puid}', name='{self.name}', mime='{self.mime}', endangered={self.endangered})" # noqa + return f"Format(puid='{self.puid}', name='{self.name}', mime='{self.mime}', risk={self.risk})" # noqa + + +@dataclasses.dataclass +class FormatComment(ModelBase): + """Comment about a file format.""" + + __tablename__ = "format_comment" + + id: Mapped[int] = mapped_column(primary_key=True) + comment: Mapped[str] = mapped_column() + + format_id: Mapped[Optional[int]] = mapped_column(ForeignKey("format.id")) + format: Mapped[Format] = 
relationship(back_populates="comments") + + def __repr__(self): + """Return repr(self).""" + return f"FormatComment(comment='{self.comment}')" @dataclasses.dataclass @@ -67,20 +92,30 @@ class Result(ModelBase): id: Mapped[int] = mapped_column(primary_key=True) filename: Mapped[str] = mapped_column() record: Mapped[Optional[str]] = mapped_column(default=None) - safe: Mapped[bool] = mapped_column(default=False) + impact: Mapped[int] = mapped_column(default=0) + + # flag indicating that the result was overridden manually + overridden: Mapped[bool] = mapped_column(default=False) format_id: Mapped[Optional[int]] = mapped_column(ForeignKey("format.id")) format: Mapped[Format] = relationship(back_populates="results") + # filenames are unique per record + __table_args__ = (UniqueConstraint("record", "filename"),) + + @property + def risk(self): + """Calculate the risk assessment for the file.""" + return self.format.risk * self.impact + def as_dict(self): """Dump the data as dictionary.""" result = { "filename": self.filename, "record": self.record, "format": self.format.as_dict(), + "impact": self.impact, } - if self.safe: - result["safe"] = self.safe return result diff --git a/formatscaper/utils.py b/formatscaper/utils.py index 6bb960956f69d161f3639eeeec7894ec6800deff..f5c5cd65e8af2cfea0272fde120b5d61e144ec75 100644 --- a/formatscaper/utils.py +++ b/formatscaper/utils.py @@ -1,3 +1,5 @@ +"""Utility functions for handling formats and results.""" + import pickle import re import sys diff --git a/pyproject.toml b/pyproject.toml index 4e1e034b6454cacdb30683e821c73376ccfc76fd..d16a6f726ee5387770c269c7292058228ebad461 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,6 +33,12 @@ dev = [ "flake8 >= 7.0", "flake8-pyproject >= 1.2.3", ] +tests = [ + "pytest >= 8.3", + "pytest-black >= 0.3", + "pytest-cov >= 5.0", + "pytest-isort >= 4.0", +] [project.scripts] formatscaper = "formatscaper.cli:run_formatscaper_cli" @@ -48,3 +54,11 @@ extend-ignore = ["E203", "E704"] 
[tool.isort] profile = "black" + +[tool.pytest.ini_options] +addopts = '--black --isort --doctest-glob="*.rst" --doctest-modules --cov=formatscaper --cov-report=term-missing' + +[tool.coverage.run] +omit = [ + "formatscaper/cli/resultman.py" +] diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000000000000000000000000000000000000..d8d5840434a6e4fbaa58268c852d84a31e7a5a72 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1 @@ +"""Pytest configuration.""" diff --git a/tests/data/test_formats.yml b/tests/data/test_formats.yml new file mode 100644 index 0000000000000000000000000000000000000000..9b193376570ca2736879ee59b89f961f3fe702dd --- /dev/null +++ b/tests/data/test_formats.yml @@ -0,0 +1,92 @@ +- puid: x-fmt/111 + name: Plain Text File + mime: text/plain + risk: 1 +- puid: x-fmt/263 + name: ZIP Format + mime: application/zip + risk: 2 +- puid: UNKNOWN + name: null + mime: null + risk: 5 +- puid: fmt/818 + name: YAML + mime: null + risk: 1 +- puid: fmt/938 + name: Python Source Code File + mime: null + risk: 3 +- puid: fmt/1149 + name: Markdown + mime: text/markdown + risk: 1 +- puid: fmt/43 + name: JPEG File Interchange Format + mime: image/jpeg + risk: 1 +- puid: fmt/615 + name: Gimp Image File Format + mime: null + risk: 3 +- puid: x-fmt/390 + name: Exchangeable Image File Format (Compressed) + mime: image/jpeg + risk: 1 +- puid: fmt/12 + name: Portable Network Graphics + mime: image/png + risk: 1 +- puid: fmt/1639 + name: Adobe InDesign Document + mime: null + risk: 2 +- puid: fmt/13 + name: Portable Network Graphics + mime: image/png + risk: 2 +- puid: fmt/11 + name: Portable Network Graphics + mime: image/png + risk: 2 +- puid: fmt/276 + name: Acrobat PDF 1.7 - Portable Document Format + mime: application/pdf + risk: 1 +- puid: fmt/215 + name: Microsoft Powerpoint for Windows + mime: application/vnd.openxmlformats-officedocument.presentationml.presentation + risk: 2 +- puid: fmt/412 + name: Microsoft Word for Windows + mime: 
application/vnd.openxmlformats-officedocument.wordprocessingml.document + risk: 2 +- puid: fmt/199 + name: MPEG-4 Media File + mime: application/mp4 + risk: 2 +- puid: x-fmt/391 + name: Exchangeable Image File Format (Compressed) + mime: image/jpeg + risk: 1 +- puid: fmt/471 + name: Hypertext Markup Language + mime: text/html + risk: 1 +- puid: fmt/92 + name: Scalable Vector Graphics + mime: image/svg+xml + risk: 1 +- puid: fmt/4 + name: Graphics Interchange Format + mime: image/gif + risk: 1 +- puid: x-fmt/224 + name: Cascading Style Sheet + mime: text/css + risk: 1 +- puid: fmt/20 + name: Acrobat PDF 1.6 - Portable Document Format + mime: application/pdf + risk: 1 diff --git a/tests/data/test_record_files.yml b/tests/data/test_record_files.yml new file mode 100644 index 0000000000000000000000000000000000000000..bb6081af500526fcfb0c0c3151a559e563ee34fd --- /dev/null +++ b/tests/data/test_record_files.yml @@ -0,0 +1,19 @@ +- filename: hosts + uri: /etc/hosts + record: 1234-abcd + +- filename: environment + uri: /etc/environment + record: 1234-abcd + +- filename: FS Table + uri: /etc/fstab + record: 1234-abcd + +- filename: original_name.pdf + uri: /home/mmoser/Documents/renamed.pdf + record: different-record + +- filename: invenio-upload.zip + uri: /mnt/data/uploaded_data/12/34/56/data + record: qwer-5678 diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..4d14575e732d4093a455df6d20388a6f46be7fac --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,50 @@ +"""Testing the utility functions.""" + +import os +import tempfile + +import yaml + +from formatscaper.models import Format, RecordFile +from formatscaper.utils import load_formats, load_record_files, store_formats + + +def test_load_record_files_from_yaml(): + """Test loading of record files.""" + record_files = load_record_files("tests/data/test_record_files.yml") + assert len(record_files) > 1 + + for record_file in record_files: + assert 
isinstance(record_file, RecordFile) + + +def test_load_formats(): + """Test loading of formats.""" + formats = load_formats("tests/data/test_formats.yml") + assert len(formats) > 1 + + for format in formats.values(): + assert isinstance(format, Format) + assert 0 <= format.risk <= 5 + + +def test_store_formats(): + """Test storing of formats.""" + formats = [ + Format(puid="x-fmt/111", name="Plain Text File", mime="text/plain", risk=1), + Format(puid="fmt/615", name="Gimp Image File Format", mime=None, risk=3), + Format(puid="UNKNOWN", name=None, mime=None, risk=5), + ] + + try: + filename = tempfile.mktemp(suffix=".yml") + store_formats(formats, filename) + with open(filename, "r") as f: + result = yaml.safe_load(f) + + assert len(result) == len(formats) + for format in formats: + assert format.as_dict() in result + + finally: + os.remove(filename)