From cef30fed47374c94cf978f15c4bf69fc113edcfb Mon Sep 17 00:00:00 2001 From: Maximilian Moser <maximilian.moser@tuwien.ac.at> Date: Thu, 12 Dec 2024 12:11:34 +0100 Subject: [PATCH 1/2] Fix logic for syncing names from TISS * account for the fact that several TISS profiles can have the smae ORCID listed * also, the `names_svc.update()` method seems broken currently --- invenio_config_tuw/tasks.py | 121 ++++++++++++++++++++++-------- invenio_config_tuw/tiss/models.py | 1 + 2 files changed, 89 insertions(+), 33 deletions(-) diff --git a/invenio_config_tuw/tasks.py b/invenio_config_tuw/tasks.py index 894cba6..b520b9c 100644 --- a/invenio_config_tuw/tasks.py +++ b/invenio_config_tuw/tasks.py @@ -8,6 +8,8 @@ """Celery tasks running in the background.""" import copy +from collections import defaultdict +from difflib import SequenceMatcher from typing import List, Optional import requests @@ -18,13 +20,13 @@ from invenio_accounts.proxies import current_datastore from invenio_db import db from invenio_mail.tasks import send_email from invenio_rdm_records.proxies import current_rdm_records_service as records_service -from invenio_records_resources.services.uow import UnitOfWork +from invenio_records_resources.services.uow import RecordIndexOp, UnitOfWork from invenio_vocabularies.contrib.names.api import Name from .tiss import Employee, fetch_tiss_data -def get_tuw_ror_aliases(): +def get_tuw_ror_aliases() -> List[str]: """Fetch the aliases of TU Wien known to ROR.""" try: response = requests.get("https://api.ror.org/organizations/04d836q62") @@ -46,15 +48,13 @@ def get_tuw_ror_aliases(): ] -def find_orcid_match(employee: Employee, names: List[Name]) -> Optional[Name]: - """Find the name entry with the same ORCID as the given employee.""" - if not employee.orcid: +def find_orcid_match(orcid: str, names: List[Name]) -> Optional[Name]: + """Find the name entry with the given ORCID.""" + if not orcid: return None for name in names: - if {"scheme": "orcid", "identifier": employee.orcid} in name.get( - "identifiers", [] - ): + if {"scheme": "orcid", "identifier": orcid} in name.get("identifiers", []): return name return None @@ -68,6 +68,8 @@ def update_name_data( name = copy.deepcopy(name) name["given_name"] = employee.first_name name["family_name"] = employee.last_name + if "name" in name: + name["name"] = f"{employee.last_name}, {employee.first_name}" # normalize & deduplicate affilations, and make sure that TU Wien is one of them # NOTE: sorting is done to remove indeterminism and prevent unnecessary updates @@ -92,10 +94,23 @@ def update_name_data( return name +def _calc_name_distance( + employee: Optional[Employee], name_voc: Optional[Name] +) -> float: + """Calculate the distance between the employee name and the vocabulary entry.""" + if employee is None or name_voc is None: + return 0 + + fn, ln = name_voc.get("given_name", ""), name_voc.get("family_name", "") + fn_dist = SequenceMatcher(a=fn, b=employee.first_name).ratio() + ln_dist = SequenceMatcher(a=ln, b=employee.last_name).ratio() + return fn_dist + ln_dist + + @shared_task(ignore_result=True) -def sync_names_from_tiss(): +def sync_names_from_tiss() -> dict: """Look up TU Wien employees via TISS and update the names vocabulary.""" - results = {"created": 0, "updated": 0} + results = {"created": 0, "updated": 0, "failed": 0} tuw_ror_aliases = get_tuw_ror_aliases() svc = current_app.extensions["invenio-vocabularies"].names_service @@ -104,35 +119,75 @@ def sync_names_from_tiss(): for model in svc.record_cls.model_cls.query.all() if not model.is_deleted and model.data ] - _, employees = fetch_tiss_data() - employees_with_orcid = [e for e in employees if not e.pseudoperson and e.orcid] + + # it can happen that 2+ TISS profiles have the same ORCID listed + orcid_employees = defaultdict(list) + for employee in employees: + if not employee.pseudoperson and employee.orcid: + orcid_employees[employee.orcid].append(employee) with UnitOfWork(db.session) as uow: - for employee in employees_with_orcid: - matching_name = find_orcid_match(employee, all_names) - - if matching_name: - # if we found a match via ORCID, we update it according to the TISS data - name = svc.read(identity=system_identity, id_=matching_name["id"]) - new_name_data = update_name_data(name.data, employee, tuw_ror_aliases) - - # only update the entry if it actually differs somehow - if name.data != new_name_data: - svc.update( - identity=system_identity, - id_=name.id, - data=new_name_data, - uow=uow, + for orcid, employees in orcid_employees.items(): + matching_name = find_orcid_match(orcid, all_names) + if len(employees) == 1: + (employee,) = employees + elif len(employees) > 1: + # if we several TISS profiles with the same ORCID, we use the one + # with the closest match in name in our names vocabulary + employee = sorted( + employees, + key=lambda e: _calc_name_distance(employee, matching_name), + )[-1] + else: + continue + + try: + if matching_name: + name_voc_id = matching_name.pid.pid_value + old_name = matching_name.get( + "name", + f"{matching_name.get('family_name')}, {matching_name.get('given_name')}", ) - results["updated"] += 1 + new_name = f"{employee.last_name}, {employee.first_name}" - else: - # if we couldn't find a match via ORCID, that's a new entry - svc.create( - identity=system_identity, data=employee.to_name_entry(), uow=uow + # if we found a match via ORCID, we update it according to the TISS data + name = svc.read(identity=system_identity, id_=name_voc_id) + new_name_data = update_name_data( + name.data, employee, tuw_ror_aliases + ) + + # only update the entry if it actually differs somehow + if name.data != new_name_data: + # note: it seems like `svc.update()` is currently broken because + # some fields that get added to the record (e.g. "pid") isn't + # expected in the JSONSchema used for validation + # we avoid this by popping "$schema" here + # app-rdm v12, vocabularies v3.4.2 + name_rec = name._record + name_rec.update(new_name_data) + name_rec.pop("$schema") + name_rec.commit() + uow.register(RecordIndexOp(name_rec)) + results["updated"] += 1 + + current_app.logger.info( + f"TISS sync: updated name '{name_voc_id}' from " + f"'{old_name}' to '{new_name}'" + ) + + else: + # if we couldn't find a match via ORCID, that's a new entry + svc.create( + identity=system_identity, data=employee.to_name_entry(), uow=uow + ) + results["created"] += 1 + + except Exception as e: + results["failed"] += 1 + current_app.logger.warn( + f"TISS sync: failed for '{employee}', with error: {e}" ) - results["created"] += 1 uow.commit() diff --git a/invenio_config_tuw/tiss/models.py b/invenio_config_tuw/tiss/models.py index 0efad7b..bb8168f 100644 --- a/invenio_config_tuw/tiss/models.py +++ b/invenio_config_tuw/tiss/models.py @@ -75,6 +75,7 @@ class Employee: ids.append({"scheme": "orcid", "identifier": self.orcid}) return { + "id": self.orcid or str(self.tiss_id), "given_name": self.first_name, "family_name": self.last_name, "identifiers": ids, -- GitLab From 45d42502ffa3ec95b79178bf5fecc196151da8c7 Mon Sep 17 00:00:00 2001 From: Maximilian Moser <maximilian.moser@tuwien.ac.at> Date: Thu, 12 Dec 2024 23:53:09 +0100 Subject: [PATCH 2/2] Bump version to v2024.4.5 --- CHANGES.rst | 3 ++- invenio_config_tuw/__init__.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 54b4c88..b6ac56e 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -8,7 +8,7 @@ Changes ======= -Version 2024.4 (released 2024-11-26, updated 2024-12-05) +Version 2024.4 (released 2024-11-26, updated 2024-12-12) - Pin `Flask-Menu`` dependency - Add `Invenio-DAMAP` to the dependencies @@ -16,6 +16,7 @@ Version 2024.4 (released 2024-11-26, updated 2024-12-05) - Cast TISS ID into string for the Invenio-DAMAP integration - Send out notification emails when records get published - Rework curation menu item registration and unpin `Flask-Menu` +- Fix TISS name vocabulary synchronization Version 2024.3 (released 2024-10-01, updated 2024-11-13) diff --git a/invenio_config_tuw/__init__.py b/invenio_config_tuw/__init__.py index 3882381..8b0fddf 100644 --- a/invenio_config_tuw/__init__.py +++ b/invenio_config_tuw/__init__.py @@ -9,6 +9,6 @@ from .ext import InvenioConfigTUW -__version__ = "2024.4.4" +__version__ = "2024.4.5" __all__ = ("__version__", "InvenioConfigTUW") -- GitLab