Center for Research Data Management
Formatscaper

Repository

# Run formatscaper inside the pipenv environment.
#   --sf-binary  path to the `sf` executable under ${GOPATH}/bin
#                (presumably the Siegfried CLI -- confirm)
#   -i           the list of record files to process; record-files.yaml is the
#                file name written by the Python snippet later in this document
pipenv run ./formatscaper.py --sf-binary "${GOPATH}/bin/sf" -i record-files.yaml
# Example input list: one entry per file, each with the keys `filename`, `uri`
# and `record`. These are the same three keys that the Python snippet later in
# this document writes to record-files.yaml.
- filename: hosts
  uri: /etc/hosts
  record: 1234-abcd

# `filename` is a free-form label here and need not match the last path
# segment of `uri`.
- filename: FS Table
  uri: /etc/fstab
  record: abcd-1234

# Several entries may share the same `record` value.
- filename: researchdata.zip
  uri: /mnt/data/de/ad/be/ef/data
  record: abcd-1234
# List of format entries. Every entry has the keys `endangered` (boolean),
# `mime`, `name` and `puid`.
# NOTE(review): presumably this is the formats file kept by formatscaper and
# `puid` is the PRONOM identifier -- confirm against the tool's documentation.
- endangered: false
  mime: text/plain
  name: Plain Text File
  puid: x-fmt/111
- endangered: false
  mime: application/zip
  name: ZIP Format
  puid: x-fmt/263
# `mime` may be null when no MIME type is recorded for the format.
- endangered: true
  mime: null
  name: Adobe Illustrator CC 2020 Artwork
  puid: fmt/1864
- endangered: false
  mime: application/postscript
  name: Encapsulated PostScript File Format
  puid: fmt/124
- endangered: false
  mime: application/pdf
  name: Acrobat PDF 1.4 - Portable Document Format
  puid: fmt/18
- endangered: false
  mime: image/svg+xml
  name: Scalable Vector Graphics
  puid: fmt/92
# Entry with neither name nor MIME type and the literal puid `UNKNOWN`;
# it is flagged as endangered.
- endangered: true
  mime: null
  name: null
  puid: UNKNOWN
# Example results: one entry per examined file, with the file's `filename`,
# the owning `record`, and a nested `format` mapping (`endangered`, `mime`,
# `name`, `puid`). The yq queries at the end of this document read
# results.yaml and filter on these same `.format.endangered` and `.record`
# fields.
- filename: /etc/hosts
  format:
    endangered: false
    mime: text/plain
    name: Plain Text File
    puid: x-fmt/111
  record: 1234-abcd
- filename: /etc/environment
  format:
    endangered: false
    mime: text/plain
    name: Plain Text File
    puid: x-fmt/111
  record: 1234-abcd
- filename: /mnt/data/de/ad/be/ef/data
  format:
    endangered: false
    mime: application/zip
    name: ZIP Format
    puid: x-fmt/263
  record: abcd-1234
# The `#` below is not preceded by whitespace, so it is part of the value and
# does not start a YAML comment.
# NOTE(review): `<path>#<name>` presumably denotes a member of the ZIP archive
# listed above -- confirm.
- filename: /mnt/data/de/ad/be/ef/data#README.txt
  format:
    endangered: false
    mime: text/plain
    name: Plain Text File
    puid: x-fmt/111
  record: abcd-1234
- filename: /mnt/data/de/ad/be/ef/data#results.csv
  format:
    endangered: false
    mime: text/csv
    name: Comma Separated Values
    puid: x-fmt/18
  record: abcd-1234

# Further example entries keyed by `filename`; unlike the results above they
# carry no `record` key, and some add a `safe` flag.
# NOTE(review): the meaning of `safe` and the file these entries belong to are
# not explained in this part of the document -- confirm before relying on them.

# Entry with a `format` mapping only.
- filename: /mnt/data/de/ad/be/ef/data#something.idk
  format:
    puid: fmt/729
    name: SQLite Database File Format
    mime: application/x-sqlite3
    endangered: false

# Entry whose format is flagged `endangered: true` while the entry itself is
# marked `safe: true`.
- filename: /mnt/data/de/ad/be/ef/data#anotherthing.bin
  format:
    puid: fmt/899
    name: Windows Portable Executable
    mime: application/vnd.microsoft.portable-executable
    endangered: true
  safe: true

# Entry with no `format` mapping at all, only the `safe` flag.
- filename: /mnt/data/de/ad/be/ef/data#program.exe
  safe: true
import yaml
from invenio_rdm_records.proxies import current_rdm_records_service as svc

# Load every row of the record model and wrap each one in the service's record
# API class (the original note describes these as the published records).
rc = svc.record_cls
recs = [rc(rm.data, model=rm) for rm in rc.model_cls.query.all()]

# Build one {"record", "filename", "uri"} entry per file of every record.
# Records without files contribute nothing, because iterating an empty
# `entries` mapping yields no items. The former trailing
# `if r.files.entries` clause was evaluated only while an entry was already
# being iterated, so it was always true and has been dropped.
record_files = [
    {"record": r["id"], "filename": fn, "uri": entry.file.file.uri}
    for r in recs
    for fn, entry in r.files.entries.items()
]

# Serialize the list as YAML; this file is what formatscaper takes as `-i`.
with open("record-files.yaml", "w") as f:
    yaml.dump(record_files, f)
# Keep only the result entries whose format is flagged as endangered.
yq 'map(select(.format.endangered == true))' results.yaml
# Keep only the result entries of a single record; replace <RECORD-ID> with
# the record's identifier.
yq 'map(select(.record == "<RECORD-ID>"))' results.yaml