From 08f9eb1eb18264e50dbea3fdaefad240aa846c9e Mon Sep 17 00:00:00 2001
From: Maximilian Moser <maximilian.moser@tuwien.ac.at>
Date: Wed, 26 Jul 2023 16:16:27 +0200
Subject: [PATCH] Add task for updating the names vocabulary with TISS data

* for now, we're limiting the updates to employees who have their ORCID
  identifier linked on their TISS profile, because that allows us to
  create perfect matches
---
 invenio_config_tuw/tasks.py         | 136 ++++++++++++++++++++++++++++
 invenio_config_tuw/tiss/__init__.py |  17 ++++
 invenio_config_tuw/tiss/models.py   |  86 ++++++++++++++++++
 invenio_config_tuw/tiss/utils.py    |  54 +++++++++++
 4 files changed, 293 insertions(+)
 create mode 100644 invenio_config_tuw/tasks.py
 create mode 100644 invenio_config_tuw/tiss/__init__.py
 create mode 100644 invenio_config_tuw/tiss/models.py
 create mode 100644 invenio_config_tuw/tiss/utils.py

diff --git a/invenio_config_tuw/tasks.py b/invenio_config_tuw/tasks.py
new file mode 100644
index 0000000..69e07f3
--- /dev/null
+++ b/invenio_config_tuw/tasks.py
@@ -0,0 +1,136 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2023 TU Wien.
+#
+# Invenio-Config-TUW is free software; you can redistribute it and/or modify
+# it under the terms of the MIT License; see LICENSE file for more details.
+
+"""Celery tasks running in the background."""
+
+import copy
+from typing import List, Optional
+
+import requests
+from celery import shared_task
+from flask import current_app
+from invenio_access.permissions import system_identity
+from invenio_db import db
+from invenio_records_resources.services.uow import UnitOfWork
+from invenio_vocabularies.contrib.names.api import Name
+
+from .tiss import Employee, fetch_tiss_data
+
+
+def get_tuw_ror_aliases():
+    """Fetch the names of TU Wien known to ROR, with a static list as fallback."""
+    try:
+        url = "https://api.ror.org/organizations/04d836q62"
+        response = requests.get(url, timeout=30)
+        if response.status_code == 200:  # a Response never equals the int 200
+            ror = response.json()
+            return [ror["name"], *ror["acronyms"], *ror["aliases"]]
+
+    except Exception as e:  # best effort: log and use the static list below
+        current_app.logger.warning(
+            f"Error while fetching TU Wien information from ROR: {e}"
+        )
+
+    return [
+        "TU Wien",
+        "TUW",
+        "Technische Universität Wien",
+        "Vienna University of Technology",
+    ]
+
+
+def find_orcid_match(employee: Employee, names: List[Name]) -> Optional[Name]:
+    """Return the first entry in ``names`` listing the employee's ORCID, if any."""
+    if not employee.orcid:
+        # without an ORCID there is nothing to match against
+        return None
+
+    wanted = {"scheme": "orcid", "identifier": employee.orcid}
+    hits = (entry for entry in names if wanted in entry.get("identifiers", []))
+
+    # iteration order decides which entry wins if several carry this ORCID
+    # (None signals that no entry matched)
+    return next(hits, None)
+
+
+def update_name_data(
+    name: dict, employee: Employee, tuw_aliases: Optional[List[str]] = None
+) -> dict:
+    """Return an updated copy of the name entry based on the employee's TISS data.
+
+    The given ``name`` is not modified; missing list fields count as empty.
+    """
+    tuw_aliases = tuw_aliases or ["TU Wien"]
+    name = copy.deepcopy(name)
+    name["given_name"] = employee.first_name
+    name["family_name"] = employee.last_name
+
+    # normalize & deduplicate affiliations, and make sure that TU Wien is one of them
+    # NOTE: sorting is done to remove indeterminism and prevent unnecessary updates
+    old_affs = name.get("affiliations") or []
+    affiliations = {aff["name"] for aff in old_affs if aff["name"] not in tuw_aliases}
+    affiliations.add("TU Wien")
+    name["affiliations"] = [{"name": aff} for aff in sorted(affiliations)]
+
+    # similar to above, add the ORCID mentioned in TISS and deduplicate
+    old_ids = name.get("identifiers") or []
+    identifiers = {(id_["scheme"], id_["identifier"]) for id_ in old_ids}
+    if employee.orcid:
+        identifiers.add(("orcid", employee.orcid))
+
+    name["identifiers"] = sorted(
+        [{"scheme": scheme, "identifier": id_} for scheme, id_ in identifiers],
+        key=lambda id_: f'{id_["scheme"]}:{id_["identifier"]}',
+    )
+    return name
+
+
+@shared_task(ignore_result=True)
+def sync_names_from_tiss():
+    """Look up TU Wien employees via TISS and update the names vocabulary.
+
+    Only non-pseudoperson employees with an ORCID are synced; returns the counts.
+    """
+    counters = {"created": 0, "updated": 0}
+    aliases = get_tuw_ror_aliases()
+    names_service = current_app.extensions["invenio-vocabularies"].names_service
+
+    # snapshot of all non-deleted name records with data, used for ORCID matching
+    record_cls = names_service.record_cls
+    known_names = [
+        record_cls.get_record(row.id)
+        for row in record_cls.model_cls.query.all()
+        if not row.is_deleted and row.data
+    ]
+
+    _, staff = fetch_tiss_data()
+    candidates = [emp for emp in staff if not emp.pseudoperson and emp.orcid]
+
+    with UnitOfWork(db.session) as uow:
+        for employee in candidates:
+            found = find_orcid_match(employee, known_names)
+            if not found:
+                # without an ORCID match, the employee becomes a new entry
+                entry = employee.to_name_entry()
+                names_service.create(identity=system_identity, data=entry, uow=uow)
+                counters["created"] += 1
+                continue
+
+            # otherwise, bring the matched entry in line with the TISS data
+            current = names_service.read(identity=system_identity, id_=found["id"])
+            updated = update_name_data(current.data, employee, aliases)
+
+            # only write the entry back if it actually differs somehow
+            if current.data != updated:
+                names_service.update(
+                    identity=system_identity, id_=current.id, data=updated, uow=uow
+                )
+                counters["updated"] += 1
+
+        uow.commit()
+
+    return counters
diff --git a/invenio_config_tuw/tiss/__init__.py b/invenio_config_tuw/tiss/__init__.py
new file mode 100644
index 0000000..2bcd3a5
--- /dev/null
+++ b/invenio_config_tuw/tiss/__init__.py
@@ -0,0 +1,17 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2023 TU Wien.
+#
+# Invenio-Config-TUW is free software; you can redistribute it and/or modify
+# it under the terms of the MIT License; see LICENSE file for more details.
+
+"""Utilities for integrating InvenioRDM with TISS."""
+
+from .models import Employee, OrgUnit
+from .utils import fetch_tiss_data
+
+__all__ = (  # names re-exported from the submodules imported above
+    "Employee",
+    "OrgUnit",
+    "fetch_tiss_data",
+)
diff --git a/invenio_config_tuw/tiss/models.py b/invenio_config_tuw/tiss/models.py
new file mode 100644
index 0000000..0efad7b
--- /dev/null
+++ b/invenio_config_tuw/tiss/models.py
@@ -0,0 +1,86 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2023 TU Wien.
+#
+# Invenio-Config-TUW is free software; you can redistribute it and/or modify
+# it under the terms of the MIT License; see LICENSE file for more details.
+
+"""Data classes for representing information from TISS."""
+
+
+from dataclasses import dataclass
+from typing import List, Optional
+
+
+@dataclass
+class OrgUnit:
+    """An organizational unit at TU Wien, as described by TISS."""
+
+    tiss_id: int
+    code: str
+    name_en: str
+    name_de: str
+    employees: List["Employee"]
+
+    @classmethod
+    def from_dict(cls, data: dict) -> "OrgUnit":
+        """Parse the unit from a dict; names and employees are optional in it."""
+        return cls(
+            tiss_id=data["tiss_id"],
+            code=data["code"],
+            name_de=data.get("name_de", ""),
+            name_en=data.get("name_en", ""),
+            employees=[Employee.from_dict(emp) for emp in data.get("employees") or []],
+        )
+
+    def __hash__(self):
+        """Hash by the TISS ID only, which allows collecting units in sets."""
+        return hash(self.tiss_id)
+
+
+@dataclass
+class Employee:
+    """A person employed at TU Wien, as described by TISS."""
+
+    tiss_id: int
+    orcid: Optional[str]
+    first_name: str
+    last_name: str
+    pseudoperson: bool
+    titles_pre: str
+    titles_post: str
+
+    @property
+    def full_name(self):
+        """Build the "family name, given name" form of the employee's name."""
+        return "{}, {}".format(self.last_name, self.first_name)
+
+    @classmethod
+    def from_dict(cls, data: dict) -> "Employee":
+        """Build an employee from the given dictionary."""
+        # optional keys fall back to neutral defaults; the others must be present
+        kwargs = {
+            "tiss_id": data["tiss_id"],
+            "orcid": data.get("orcid"),
+            "first_name": data["first_name"],
+            "last_name": data["last_name"],
+            "pseudoperson": data.get("pseudoperson", False),
+            "titles_pre": data.get("preceding_titles", ""),
+            "titles_post": data.get("postpositioned_titles", ""),
+        }
+        return cls(**kwargs)
+
+    def to_name_entry(self):
+        """Express the employee as data for a name entry affiliated with TU Wien."""
+        # the ORCID is the only identifier available here, and it may be absent
+        ids = [{"scheme": "orcid", "identifier": self.orcid}] if self.orcid else []
+        return {
+            "given_name": self.first_name,
+            "family_name": self.last_name,
+            "identifiers": ids,
+            "affiliations": [{"name": "TU Wien"}],
+        }
+
+    def __hash__(self):
+        """Hash by the TISS ID only."""
+        return hash(self.tiss_id)
diff --git a/invenio_config_tuw/tiss/utils.py b/invenio_config_tuw/tiss/utils.py
new file mode 100644
index 0000000..a788832
--- /dev/null
+++ b/invenio_config_tuw/tiss/utils.py
@@ -0,0 +1,54 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2023 TU Wien.
+#
+# Invenio-Config-TUW is free software; you can redistribute it and/or modify
+# it under the terms of the MIT License; see LICENSE file for more details.
+
+"""Functions for fetching information from TISS."""
+
+from typing import Optional, Set, Tuple
+
+import requests
+
+from .models import Employee, OrgUnit
+
+
+def _get_org_unit_dict(code: str) -> dict:
+    """Fetch the org unit with the given code (persons included) from TISS."""
+    # NOTE: some org units don't seem to have an OID
+    #       (e.g. "E366t1 - Institutsbibliothek"),
+    #       it seems to be safer to go through the 'code'
+    response = requests.get(
+        f"https://tiss.tuwien.ac.at/api/orgunit/v22/code/{code}?persons=true",
+        timeout=60,  # don't let a stalled connection block the caller forever
+    )
+    # raise instead of assert: assertions are stripped when running with -O
+    response.raise_for_status()
+    return response.json()
+
+
+def _fetch_tiss_data(
+    org_unit: dict,
+    org_units: Optional[Set[OrgUnit]] = None,
+    employees: Optional[Set[Employee]] = None,
+) -> Tuple[Set[OrgUnit], Set[Employee]]:
+    """Parse the org unit and, recursively, its children into the given sets."""
+    if org_units is None:
+        org_units = set()
+    if employees is None:
+        employees = set()
+
+    parsed = OrgUnit.from_dict(org_unit)
+    org_units.add(parsed)
+    employees.update(parsed.employees)
+
+    # each child is looked up by its code and then processed recursively
+    for ref in org_unit.get("children", org_unit.get("child_orgs_refs", [])):
+        _fetch_tiss_data(_get_org_unit_dict(ref["code"]), org_units, employees)
+    return org_units, employees
+
+
+def fetch_tiss_data() -> Tuple[Set[OrgUnit], Set[Employee]]:
+    """Fetch all org units and their employees from TISS, starting at ``E000``."""
+    return _fetch_tiss_data(org_unit=_get_org_unit_dict(code="E000"))
-- 
GitLab