# makeAnonymizedDataset.py (4.0 KB)
  1. import os
  2. import json
  3. import re
  4. import subprocess
  5. import nibabel
  6. import shutil
  7. import sys
  8. import numpy
  9. shome=os.path.expanduser('~nixUser')
  10. sys.path.insert(1,shome+'/software/src/labkeyInterface')
  11. import labkeyInterface
  12. import labkeyDatabaseBrowser
  13. fhome=os.path.expanduser('~')
  14. fconfig=os.path.join(fhome,'.labkey','network.json')
  15. net=labkeyInterface.labkeyInterface()
  16. net.init(fconfig)
  17. db=labkeyDatabaseBrowser.labkeyDB(net)
  18. hi=0
  19. project='iPNUMMretro/Study'
  20. #project='Orthanc/Database'
  21. labkeyBase='/data/labkey'
  22. #tempBase=os.path.join(fhome,'temp')
  23. #all images from database
  24. imageDataset='Imaging1'
  25. clinicalDataset='ClinicalData'
  26. anonymousClinicalDataset='AnonymousClinicalData'
  27. anonymousImagingDataset='AnonymousImaging'
  28. ds=db.selectRows(project,'study',clinicalDataset,[])
  29. fieldMatches={
  30. 'SequenceNum':'SequenceNum',
  31. 'datumRojstva':'birthDate',
  32. 'DatumDiagnozeMetaM':'mmDiagnosisDate',
  33. 'StarostObZacetkuIT':'itStartAge',
  34. 'Spol':'sex',
  35. 'Origo':'origo',
  36. 'OrigoCode':'origoCode',
  37. 'MStadij':'mStage',
  38. 'MStadijCode':'mStageCode',
  39. 'MetastatskeLokalizacije':'metastaticLocalization',
  40. 'SteviloMetaLokalizacija':'metastaticLocalizationCount',
  41. 'Genetika':'mutations',
  42. 'DolocitevMut':'mutationDetermination',
  43. 'PSObUvedbiIT':'psAtITIntroduction',
  44. 'PridruzeneKronicneBolezni':'assocatedCronicDiseases',
  45. 'PricetekIT':'itStart',
  46. 'ZakljucekIT':'itEnd',
  47. 'VzrokPrenehanje':'itEndCause',
  48. 'StAplikacij':'applicationCount',
  49. 'NajboljsiOdgovor':'bora',
  50. 'DatumPD_PFS':'DatumPD_PFS',
  51. 'VitalnoStanje':'vitalState',
  52. 'DatumSmrti':'deathDate',
  53. 'KozniIzpuscaj':'skinRash',
  54. 'Vitiligo':'vitiligo',
  55. 'Puritus':'puritus',
  56. 'Hipotiroza':'hypotirosis',
  57. 'Osteoartritis':'osteoartritis',
  58. 'Diareja':'diarrhea',
  59. 'Pnevmonitis':'pneumonitis',
  60. 'Hepatitis':'hepatitis',
  61. 'Fatigue':'fatigue',
  62. 'DrugaTox':'otherTox',
  63. 'LDH':'ldh',
  64. 'S100':'s100',
  65. 'KSSistemsko':'sistemicKS',
  66. 'KSvzrokNiPrejemal':'ksNotAdministeredCause',
  67. 'PETopazovanNU':'petMonitoredAE',
  68. 'PETObelezilNU':'petDetectedAE',
  69. 'PETPredSimZnaki':'earlyPETAEDetection',
  70. 'UkrepanjeZaradiPET':'petRelatedAction',
  71. 'DatumIzhodiscnegaPETPredIT':'initialPETDate',
  72. 'noPETCT':'petCTCount',
  73. 'DatumPrvegaPETZNU':'firstAEPETCTDate',
  74. 'DatumPrvegaSimOzLab':'aeIdentificationDate',
  75. 'DodatnePreiskavePolegPET':'aeAdditionalExams',
  76. 'IzidNU':'aeOutcome'
  77. }
  78. #randomize patientIDs
  79. patientList=[row['PatientId'] for row in ds['rows']]
  80. patientList=list(set(patientList))
  81. patientCodes={}
  82. perm=numpy.random.permutation(len(patientList))
  83. for i in numpy.arange(len(perm)):
  84. code='A{:03d}'.format(perm[i])
  85. patientCodes[patientList[i]]=code
  86. #anonymize clinical data
  87. for row in ds['rows']:
  88. outRow={}
  89. for f in fieldMatches:
  90. outRow[fieldMatches[f]]=row[f]
  91. #mask patientId
  92. outRow['PatientId']=patientCodes[row['PatientId']]
  93. db.modifyRows('insert',project,'study',anonymousClinicalDataset,[outRow])
  94. #anonymize image data
  95. ds=db.selectRows(project,'study',imageDataset,[])
  96. fields=['SequenceNum','studyDate']
  97. #for links
  98. projectAnonymousBase=os.path.join(labkeyBase,'files',project,'@files/anonymous')
  99. for row in ds['rows']:
  100. outRow={}
  101. for f in fields:
  102. outRow[f]=row[f]
  103. outRow['PatientId']=patientCodes[row['PatientId']]
  104. #copy links
  105. for f in ['CT','PETWB']:
  106. #idealy we should use series uuid from dicom, this is a cludge
  107. anonSeriesId='{}_{}_{:03.0f}'.\
  108. format(f,outRow['PatientId'],row['SequenceNum'])
  109. origFile=os.path.join(projectAnonymousBase,row[f]+'.zip')
  110. modFile=os.path.join(projectAnonymousBase,anonSeriesId+'.zip')
  111. subprocess.run(['cp','-d',origFile,modFile])
  112. outRow[f+'_UUID']=anonSeriesId
  113. outRow[f]='[DICOM]'
  114. db.modifyRows('insert',project,'study',anonymousImagingDataset,[outRow])
  115. print('Done')