From 8e3de6a220b9f48107e1f0af8193fd37102f9ae3 Mon Sep 17 00:00:00 2001 From: Paul-Christian Volkmer Date: Sun, 6 Apr 2025 14:42:09 +0200 Subject: [PATCH] feat: add pseudonymization for patient IDs (#107) --- .../etl/processor/pseudonym/extensions.kt | 101 ++++- .../etl/processor/pseudonym/ExtensionsTest.kt | 383 +++++++++++------- 2 files changed, 328 insertions(+), 156 deletions(-) diff --git a/src/main/kotlin/dev/dnpm/etl/processor/pseudonym/extensions.kt b/src/main/kotlin/dev/dnpm/etl/processor/pseudonym/extensions.kt index bf645f6..111494b 100644 --- a/src/main/kotlin/dev/dnpm/etl/processor/pseudonym/extensions.kt +++ b/src/main/kotlin/dev/dnpm/etl/processor/pseudonym/extensions.kt @@ -21,12 +21,12 @@ package dev.dnpm.etl.processor.pseudonym import de.ukw.ccc.bwhc.dto.MtbFile import dev.dnpm.etl.processor.PatientId +import dev.pcvolkmer.mv64e.mtb.Mtb import org.apache.commons.codec.digest.DigestUtils /** Replaces patient ID with generated patient pseudonym * * @param pseudonymizeService The pseudonymizeService to be used - * * @return The MTB file containing patient pseudonymes */ infix fun MtbFile.pseudonymizeWith(pseudonymizeService: PseudonymizeService) { @@ -49,7 +49,11 @@ infix fun MtbFile.pseudonymizeWith(pseudonymizeService: PseudonymizeService) { } this.lastGuidelineTherapies?.forEach { it.patient = patientPseudonym } this.molecularPathologyFindings?.forEach { it.patient = patientPseudonym } - this.molecularTherapies?.forEach { molecularTherapy -> molecularTherapy.history.forEach { it.patient = patientPseudonym } } + this.molecularTherapies?.forEach { molecularTherapy -> + molecularTherapy.history.forEach { + it.patient = patientPseudonym + } + } this.ngsReports?.forEach { it.patient = patientPseudonym } this.previousGuidelineTherapies?.forEach { it.patient = patientPseudonym } this.rebiopsyRequests?.forEach { it.patient = patientPseudonym } @@ -63,7 +67,6 @@ infix fun MtbFile.pseudonymizeWith(pseudonymizeService: PseudonymizeService) { * Creates new hash of content IDs with given prefix except for patient IDs * * @param pseudonymizeService The pseudonymizeService to be used - * * @return The MTB file containing rehashed content IDs */ infix fun MtbFile.anonymizeContentWith(pseudonymizeService: PseudonymizeService) { @@ -120,8 +123,8 @@ infix fun MtbFile.anonymizeContentWith(pseudonymizeService: PseudonymizeService) id = id?.let { anonymize(it) } } } - this.geneticCounsellingRequests?.onEach { geneticCounsellingRequest -> - geneticCounsellingRequest?.apply { + this.geneticCounsellingRequests?.onEach { geneticCounsellingRequest -> + geneticCounsellingRequest?.apply { id = id?.let { anonymize(it) } } } @@ -223,4 +226,90 @@ infix fun MtbFile.anonymizeContentWith(pseudonymizeService: PseudonymizeService) id = id?.let { anonymize(it) } } } -} \ No newline at end of file +} + +/** Replaces patient ID with generated patient pseudonym + * + * @since 0.11.0 + * + * @param pseudonymizeService The pseudonymizeService to be used + * @return The MTB file containing patient pseudonymes + */ +infix fun Mtb.pseudonymizeWith(pseudonymizeService: PseudonymizeService) { + val patientPseudonym = pseudonymizeService.patientPseudonym(PatientId(this.patient.id)).value + + this.episodesOfCare?.forEach { it.patient.id = patientPseudonym } + this.carePlans?.forEach { + it.patient.id = patientPseudonym + it.rebiopsyRequests?.forEach { it.patient.id = patientPseudonym } + it.histologyReevaluationRequests?.forEach { it.patient.id = patientPseudonym } + it.medicationRecommendations.forEach { it.patient.id = patientPseudonym } + it.studyEnrollmentRecommendations?.forEach { it.patient.id = patientPseudonym } + it.procedureRecommendations?.forEach { it.patient.id = patientPseudonym } + it.geneticCounselingRecommendation.patient.id = patientPseudonym + } + this.diagnoses?.forEach { it.patient.id = patientPseudonym } + this.guidelineTherapies?.forEach { it.patient.id = patientPseudonym } + this.guidelineProcedures?.forEach { it.patient.id = patientPseudonym } + this.patient.id = patientPseudonym + this.claims?.forEach { it.patient.id = patientPseudonym } + this.claimResponses?.forEach { it.patient.id = patientPseudonym } + this.diagnoses?.forEach { it.patient.id = patientPseudonym } + this.histologyReports?.forEach { + it.patient.id = patientPseudonym + it.results.tumorMorphology?.patient?.id = patientPseudonym + it.results.tumorCellContent?.patient?.id = patientPseudonym + } + this.ngsReports?.forEach { + it.patient.id = patientPseudonym + it.results.simpleVariants?.forEach { it.patient.id = patientPseudonym } + it.results.copyNumberVariants?.forEach { it.patient.id = patientPseudonym } + it.results.dnaFusions?.forEach { it.patient.id = patientPseudonym } + it.results.rnaFusions?.forEach { it.patient.id = patientPseudonym } + it.results.tumorCellContent?.patient?.id = patientPseudonym + it.results.brcaness?.patient?.id = patientPseudonym + it.results.tmb?.patient?.id = patientPseudonym + it.results.hrdScore?.patient?.id = patientPseudonym + } + this.ihcReports?.forEach { + it.patient.id = patientPseudonym + it.results.msiMmr?.forEach { it.patient.id = patientPseudonym } + it.results.proteinExpression?.forEach { it.patient.id = patientPseudonym } + } + this.responses?.forEach { it.patient.id = patientPseudonym } + this.specimens?.forEach { it.patient.id = patientPseudonym } + this.priorDiagnosticReports?.forEach { it.patient.id = patientPseudonym } + this.performanceStatus.forEach { it.patient.id = patientPseudonym } + this.systemicTherapies.forEach { + it.history?.forEach { + it.patient.id = patientPseudonym + } + } +} + +/** + * Creates new hash of content IDs with given prefix except for patient IDs + * + * @since 0.11.0 + * + * @param pseudonymizeService The pseudonymizeService to be used + * @return The MTB file containing rehashed content IDs + */ +infix fun Mtb.anonymizeContentWith(pseudonymizeService: PseudonymizeService) { + val prefix = pseudonymizeService.prefix() + + fun anonymize(id: String): String { + val hash = DigestUtils.sha256Hex("$prefix-$id").substring(0, 41).lowercase() + return "$prefix$hash" + } + + this.episodesOfCare?.forEach { + it?.apply { + id = id?.let { + anonymize(it) + } + } + } + + // TODO all other properties +} diff --git a/src/test/kotlin/dev/dnpm/etl/processor/pseudonym/ExtensionsTest.kt b/src/test/kotlin/dev/dnpm/etl/processor/pseudonym/ExtensionsTest.kt index 0acf7db..d0ccb2b 100644 --- a/src/test/kotlin/dev/dnpm/etl/processor/pseudonym/ExtensionsTest.kt +++ b/src/test/kotlin/dev/dnpm/etl/processor/pseudonym/ExtensionsTest.kt @@ -1,7 +1,7 @@ /* * This file is part of ETL-Processor * - * Copyright (c) 2023 Comprehensive Cancer Center Mainfranken, Datenintegrationszentrum Philipps-Universität Marburg and Contributors + * Copyright (c) 2025 Comprehensive Cancer Center Mainfranken, Datenintegrationszentrum Philipps-Universität Marburg and Contributors * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published @@ -21,7 +21,12 @@ package dev.dnpm.etl.processor.pseudonym import com.fasterxml.jackson.databind.ObjectMapper import de.ukw.ccc.bwhc.dto.* +import dev.pcvolkmer.mv64e.mtb.MTBEpisodeOfCare +import dev.pcvolkmer.mv64e.mtb.Mtb +import dev.pcvolkmer.mv64e.mtb.PeriodDate +import dev.pcvolkmer.mv64e.mtb.Reference import org.assertj.core.api.Assertions.assertThat +import org.junit.jupiter.api.Nested import org.junit.jupiter.api.Test import org.junit.jupiter.api.assertThrows import org.junit.jupiter.api.extension.ExtendWith @@ -32,167 +37,245 @@ import org.mockito.kotlin.doAnswer import org.mockito.kotlin.whenever import org.springframework.core.io.ClassPathResource -const val FAKE_MTB_FILE_PATH = "fake_MTBFile.json" -const val CLEAN_PATIENT_ID = "5dad2f0b-49c6-47d8-a952-7b9e9e0f7549" - @ExtendWith(MockitoExtension::class) class ExtensionsTest { - private fun fakeMtbFile(): MtbFile { - val mtbFile = ClassPathResource(FAKE_MTB_FILE_PATH).inputStream - return ObjectMapper().readValue(mtbFile, MtbFile::class.java) - } + @Nested + inner class UsingBwhcDatamodel { - private fun MtbFile.serialized(): String { - return ObjectMapper().writeValueAsString(this) - } + val FAKE_MTB_FILE_PATH = "fake_MTBFile.json" + val CLEAN_PATIENT_ID = "5dad2f0b-49c6-47d8-a952-7b9e9e0f7549" - @Test - fun shouldNotContainCleanPatientId(@Mock pseudonymizeService: PseudonymizeService) { - doAnswer { - it.arguments[0] - "PSEUDO-ID" - }.whenever(pseudonymizeService).patientPseudonym(anyValueClass()) - - val mtbFile = fakeMtbFile() - - mtbFile.pseudonymizeWith(pseudonymizeService) - - assertThat(mtbFile.patient.id).isEqualTo("PSEUDO-ID") - assertThat(mtbFile.serialized()).doesNotContain(CLEAN_PATIENT_ID) - } - - @Test - fun shouldNotContainAnyUuidAfterRehashingOfIds(@Mock pseudonymizeService: PseudonymizeService) { - doAnswer { - it.arguments[0] - "PSEUDO-ID" - }.whenever(pseudonymizeService).patientPseudonym(anyValueClass()) - - doAnswer { - "TESTDOMAIN" - }.whenever(pseudonymizeService).prefix() - - val mtbFile = fakeMtbFile() - - mtbFile.pseudonymizeWith(pseudonymizeService) - mtbFile.anonymizeContentWith(pseudonymizeService) - - val pattern = "\"[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}\"".toRegex().toPattern() - val matcher = pattern.matcher(mtbFile.serialized()) - - assertThrows { - matcher.find() - matcher.group() - }.also { - assertThat(it.message).isEqualTo("No match found") + private fun fakeMtbFile(): MtbFile { + val mtbFile = ClassPathResource(FAKE_MTB_FILE_PATH).inputStream + return ObjectMapper().readValue(mtbFile, MtbFile::class.java) } + private fun MtbFile.serialized(): String { + return ObjectMapper().writeValueAsString(this) + } + + @Test + fun shouldNotContainCleanPatientId(@Mock pseudonymizeService: PseudonymizeService) { + doAnswer { + it.arguments[0] + "PSEUDO-ID" + }.whenever(pseudonymizeService).patientPseudonym(anyValueClass()) + + val mtbFile = fakeMtbFile() + + mtbFile.pseudonymizeWith(pseudonymizeService) + + assertThat(mtbFile.patient.id).isEqualTo("PSEUDO-ID") + assertThat(mtbFile.serialized()).doesNotContain(CLEAN_PATIENT_ID) + } + + @Test + fun shouldNotContainAnyUuidAfterRehashingOfIds(@Mock pseudonymizeService: PseudonymizeService) { + doAnswer { + it.arguments[0] + "PSEUDO-ID" + }.whenever(pseudonymizeService).patientPseudonym(anyValueClass()) + + doAnswer { + "TESTDOMAIN" + }.whenever(pseudonymizeService).prefix() + + val mtbFile = fakeMtbFile() + + mtbFile.pseudonymizeWith(pseudonymizeService) + mtbFile.anonymizeContentWith(pseudonymizeService) + + val pattern = "\"[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}\"".toRegex().toPattern() + val matcher = pattern.matcher(mtbFile.serialized()) + + assertThrows { + matcher.find() + matcher.group() + }.also { + assertThat(it.message).isEqualTo("No match found") + } + + } + + @Test + fun shouldRehashIdsWithPrefix(@Mock pseudonymizeService: PseudonymizeService) { + doAnswer { + it.arguments[0] + "PSEUDO-ID" + }.whenever(pseudonymizeService).patientPseudonym(anyValueClass()) + + doAnswer { + "TESTDOMAIN" + }.whenever(pseudonymizeService).prefix() + + val mtbFile = MtbFile.builder() + .withPatient( + Patient.builder() + .withId("1") + .withBirthDate("2000-08-08") + .withGender(Patient.Gender.MALE) + .build() + ) + .withConsent( + Consent.builder() + .withId("1") + .withStatus(Consent.Status.ACTIVE) + .withPatient("123") + .build() + ) + .withEpisode( + Episode.builder() + .withId("1") + .withPatient("1") + .withPeriod(PeriodStart("2023-08-08")) + .build() + ) + .build() + + mtbFile.pseudonymizeWith(pseudonymizeService) + mtbFile.anonymizeContentWith(pseudonymizeService) + + + assertThat(mtbFile.episode.id) + // TESTDOMAIN + .isEqualTo("TESTDOMAIN44e20a53bbbf9f3ae39626d05df7014dcd77d6098") + } + + @Test + fun shouldNotThrowExceptionOnNullValues(@Mock pseudonymizeService: PseudonymizeService) { + doAnswer { + it.arguments[0] + "PSEUDO-ID" + }.whenever(pseudonymizeService).patientPseudonym(anyValueClass()) + + doAnswer { + "TESTDOMAIN" + }.whenever(pseudonymizeService).prefix() + + val mtbFile = MtbFile.builder() + .withPatient( + Patient.builder() + .withId("1") + .withBirthDate("2000-08-08") + .withGender(Patient.Gender.MALE) + .build() + ) + .withConsent( + Consent.builder() + .withId("1") + .withStatus(Consent.Status.ACTIVE) + .withPatient("123") + .build() + ) + .withEpisode( + Episode.builder() + .withId("1") + .withPatient("1") + .withPeriod(PeriodStart("2023-08-08")) + .build() + ) + .withClaims(null) + .withDiagnoses(null) + .withCarePlans(null) + .withClaimResponses(null) + .withEcogStatus(null) + .withFamilyMemberDiagnoses(null) + .withGeneticCounsellingRequests(null) + .withHistologyReevaluationRequests(null) + .withHistologyReports(null) + .withLastGuidelineTherapies(null) + .withMolecularPathologyFindings(null) + .withMolecularTherapies(null) + .withNgsReports(null) + .withPreviousGuidelineTherapies(null) + .withRebiopsyRequests(null) + .withRecommendations(null) + .withResponses(null) + .withStudyInclusionRequests(null) + .withSpecimens(null) + .build() + + mtbFile.pseudonymizeWith(pseudonymizeService) + mtbFile.anonymizeContentWith(pseudonymizeService) + + assertThat(mtbFile.episode.id).isNotNull() + } } - @Test - fun shouldRehashIdsWithPrefix(@Mock pseudonymizeService: PseudonymizeService) { - doAnswer { - it.arguments[0] - "PSEUDO-ID" - }.whenever(pseudonymizeService).patientPseudonym(anyValueClass()) + @Nested + inner class UsingDnpmV2Datamodel { - doAnswer { - "TESTDOMAIN" - }.whenever(pseudonymizeService).prefix() + val FAKE_MTB_FILE_PATH = "mv64e-mtb-fake-patient.json" + val CLEAN_PATIENT_ID = "63f8fd7b-8127-4f3c-8843-aa9199e21c29" - val mtbFile = MtbFile.builder() - .withPatient( - Patient.builder() - .withId("1") - .withBirthDate("2000-08-08") - .withGender(Patient.Gender.MALE) - .build() - ) - .withConsent( - Consent.builder() - .withId("1") - .withStatus(Consent.Status.ACTIVE) - .withPatient("123") - .build() - ) - .withEpisode( - Episode.builder() - .withId("1") - .withPatient("1") - .withPeriod(PeriodStart("2023-08-08")) - .build() - ) - .build() + private fun fakeMtbFile(): Mtb { + val mtbFile = ClassPathResource(FAKE_MTB_FILE_PATH).inputStream + return ObjectMapper().readValue(mtbFile, Mtb::class.java) + } - mtbFile.pseudonymizeWith(pseudonymizeService) - mtbFile.anonymizeContentWith(pseudonymizeService) + private fun Mtb.serialized(): String { + return ObjectMapper().writeValueAsString(this) + } + @Test + fun shouldNotContainCleanPatientId(@Mock pseudonymizeService: PseudonymizeService) { + doAnswer { + it.arguments[0] + "PSEUDO-ID" + }.whenever(pseudonymizeService).patientPseudonym(anyValueClass()) - assertThat(mtbFile.episode.id) - // TESTDOMAIN - .isEqualTo("TESTDOMAIN44e20a53bbbf9f3ae39626d05df7014dcd77d6098") + val mtbFile = fakeMtbFile() + + mtbFile.pseudonymizeWith(pseudonymizeService) + + assertThat(mtbFile.patient.id).isEqualTo("PSEUDO-ID") + assertThat(mtbFile.serialized()).doesNotContain(CLEAN_PATIENT_ID) + } + + @Test + fun shouldNotThrowExceptionOnNullValues(@Mock pseudonymizeService: PseudonymizeService) { + doAnswer { + it.arguments[0] + "PSEUDO-ID" + }.whenever(pseudonymizeService).patientPseudonym(anyValueClass()) + + doAnswer { + "TESTDOMAIN" + }.whenever(pseudonymizeService).prefix() + + val mtbFile = Mtb.builder() + .withPatient( + dev.pcvolkmer.mv64e.mtb.Patient.builder() + .withId("1") + .withBirthDate("2000-08-08") + .withGender(null) + .build() + ) + .withEpisodesOfCare( + listOf( + MTBEpisodeOfCare.builder() + .withId("1") + .withPatient(Reference("1")) + .withPeriod(PeriodDate.builder().withStart("2023-08-08").build()) + .build() + ) + ) + .withClaims(null) + .withDiagnoses(null) + .withCarePlans(null) + .withClaimResponses(null) + .withHistologyReports(null) + .withNgsReports(null) + .withResponses(null) + .withSpecimens(null) + .build() + + mtbFile.pseudonymizeWith(pseudonymizeService) + mtbFile.anonymizeContentWith(pseudonymizeService) + + assertThat(mtbFile.episodesOfCare).hasSize(1) + assertThat(mtbFile.episodesOfCare.map { it.id }).isNotNull + } } - - @Test - fun shouldNotThrowExceptionOnNullValues(@Mock pseudonymizeService: PseudonymizeService) { - doAnswer { - it.arguments[0] - "PSEUDO-ID" - }.whenever(pseudonymizeService).patientPseudonym(anyValueClass()) - - doAnswer { - "TESTDOMAIN" - }.whenever(pseudonymizeService).prefix() - - val mtbFile = MtbFile.builder() - .withPatient( - Patient.builder() - .withId("1") - .withBirthDate("2000-08-08") - .withGender(Patient.Gender.MALE) - .build() - ) - .withConsent( - Consent.builder() - .withId("1") - .withStatus(Consent.Status.ACTIVE) - .withPatient("123") - .build() - ) - .withEpisode( - Episode.builder() - .withId("1") - .withPatient("1") - .withPeriod(PeriodStart("2023-08-08")) - .build() - ) - .withClaims(null) - .withDiagnoses(null) - .withCarePlans(null) - .withClaimResponses(null) - .withEcogStatus(null) - .withFamilyMemberDiagnoses(null) - .withGeneticCounsellingRequests(null) - .withHistologyReevaluationRequests(null) - .withHistologyReports(null) - .withLastGuidelineTherapies(null) - .withMolecularPathologyFindings(null) - .withMolecularTherapies(null) - .withNgsReports(null) - .withPreviousGuidelineTherapies(null) - .withRebiopsyRequests(null) - .withRecommendations(null) - .withResponses(null) - .withStudyInclusionRequests(null) - .withSpecimens(null) - .build() - - mtbFile.pseudonymizeWith(pseudonymizeService) - mtbFile.anonymizeContentWith(pseudonymizeService) - - - assertThat(mtbFile.episode.id).isNotNull() - } - -} \ No newline at end of file +}