importXML.py 6.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223
  1. import os
  2. import sys
  3. import importlib
  4. import json
  5. import lxml.etree
  6. #need labkey interface's importXLS
  7. def getImporter(setup):
  8. sys.path.append(setup['paths']['nixWrapper'])
  9. import nixWrapper
  10. nixWrapper.loadLibrary('labkeyInterface')
  11. import importXLSX
  12. importlib.reload(importXLSX)
  13. return importXLSX
  14. def getFileBrowser(db):
  15. import nixWrapper
  16. nixWrapper.loadLibrary('labkeyInterface')
  17. import labkeyFileBrowser
  18. return labkeyFileBrowser.labkeyFileBrowser(db.net)
  19. def countElements(xmlRoot,xPath):
  20. xElUp=xmlRoot
  21. if xPath.find('/')>-1:
  22. xPathUp=xPath[:xPath.rfind('/')]
  23. xElUp=xmlRoot.find(xPathUp)
  24. n=len(xElUp)
  25. print('Counting {} in {}[{}]'.format(xPath,xElUp.tag,n))
  26. i=1
  27. while xmlRoot.find(f'{xPath}[{i}]') is not None:
  28. i=i+1
  29. return i-1
  30. def mergePaths(corePath,subPath):
  31. if len(corePath)>0:
  32. if len(subPath)>0:
  33. return '/'.join([corePath,subPath])
  34. return corePath
  35. return subPath
  36. def getVal(xmlRoot,aliasVal,xPath=''):
  37. q=aliasVal.split(':')
  38. xName=q[0]
  39. elementPath=mergePaths(xPath,xName)
  40. attributeName=q[1]
  41. eList=[elementPath]
  42. if xPath[-1]!=']':
  43. n=countElements(xmlRoot,elementPath)
  44. eList=[f'{elementPath}[{i+1}]' for i in range(n)]
  45. print(eList)
  46. try:
  47. return [xmlRoot.find(e).get(attributeName) for e in eList]
  48. except AttributeError:
  49. return None
  50. def updateTxt(txt,replacePatterns):
  51. for x in replacePatterns:
  52. txt=txt.replace(x,replacePatterns[x])
  53. return txt
  54. def updateAliases(aliasValues,aliasReplace):
  55. return {a:updateTxt(aliasValues[a],aliasReplace) for a in aliasValues}
  56. def parseJSON(x):
  57. print(f'Decoding [{x}]')
  58. try:
  59. return json.loads(x)
  60. except TypeError:
  61. pass
  62. return {}
  63. def readSetup(importXLSX,pars):
  64. db=importXLSX.getDB(pars)
  65. ds=db.selectRows(pars['project'],'lists','importSetup',[])
  66. setupRows=ds['rows']
  67. for r in setupRows:
  68. r['aliasReplace']=parseJSON(r['aliasReplace'])
  69. r['presetValues']=parseJSON(r['presetValues'])
  70. return setupRows
  71. def getID(root):
  72. return root.find('Patient/PatientID').get('val')
  73. def getXMLRoot(xmlFile):
  74. return lxml.etree.ElementTree(file=xmlFile).getroot()
  75. def matchingLengths(dictArray):
  76. #print(dictArray)
  77. n={x:len(dictArray[x]) for x in dictArray}
  78. a=list(n.values())
  79. return all(x==a[0] for x in a)
  80. def importXML(importXLSX,pars,xmlRoot,dryRun=True):
  81. #def importData(pars,filename,getId=getId,modify=modify,convertLookup=convertLookup,dryRun=True,debug=True):
  82. #master routine that imports data based on pars,
  83. #applies user supplied functions modify, convertLookup and get Id and
  84. #updates relevant database
  85. #some useful fields from pars (d is for default value)
  86. # - seqNumOffset specify visit/sequenceNum offset (number, d:0 will result in 1)
  87. # - XPath - xml path to the element to take data from (helpful if multiple elements are present in xml and identical data is sought)
  88. # - additionalKeyColumn - name of the variable/column used for separating data entries (on top of ParticipantId and SequenceNum, helpful in the same cases as XPath)
  89. # - presetVariables - set some of the row variables to this values (same cases as XPath)
  90. # - project - labkey project
  91. # - schema - labkey schema (list/study, d: study)
  92. # - query - labkey query
  93. #set this is as sequenceNum for entries, or initial seqNum if more than a single entry is in the dataset
  94. seqNumOffset=pars.get('seqNumOffset',0)
  95. xPath=pars.get('XPath','')
  96. keyColumn=pars.get('additionalKeyColumn')
  97. presetValues=pars.get('presetValues',{})
  98. aliasReplace=pars.get('aliasReplace',{})
  99. allowMultiple=pars.get('allowMultiple',"False")
  100. #convert to boolean
  101. allowMultiple=allowMultiple=="True" or allowMultiple=="true"
  102. fields=importXLSX.getFields(pars)
  103. lookupVars=importXLSX.getVariables(fields,fieldType='LOOKUP')
  104. dateVars=importXLSX.getVariables(fields,fieldType='DATE')
  105. doubleVars=importXLSX.getVariables(fields,fieldType='DOUBLE')
  106. #convert dates to list
  107. dateVars=list(dateVars.keys())
  108. print(f'dateVars: {dateVars}')
  109. lookupMap={f:importXLSX.getLookupMap(pars,fields,f) for f in lookupVars}
  110. alias=importXLSX.invertMap(importXLSX.getAlias(fields))
  111. alias=updateAliases(alias,aliasReplace)
  112. print(f'aliases: {alias}')
  113. row={}
  114. #patient id can be either set in pars (takes precedence) or from xml record
  115. pid=pars.get('id',getID(xmlRoot))
  116. row={'ParticipantId':pid,'SequenceNum':seqNumOffset+1}
  117. row.update(presetValues)
  118. rows=[]
  119. vals={}
  120. for f in fields:
  121. try:
  122. x=getVal(xmlRoot,alias[f],xPath)
  123. if not x:
  124. continue
  125. if not allowMultiple:
  126. x=x[0:1]
  127. n=len(x)
  128. print('{}[{}]: {}'.format(f,n,x))
  129. vals[f]=x
  130. except KeyError:
  131. print(f'Alias for field {f} not found')
  132. continue
  133. if not matchingLengths(vals):
  134. print('Mismatch lenghts of variables {}'.format({v:len(vals[v]) for v in vals}))
  135. return
  136. n=[len(vals[v]) for v in vals][0]
  137. for i in range(n):
  138. r={x:row[x] for x in row}
  139. r.update({v:vals[v][i] for v in vals})
  140. if n>1:
  141. r[keyColumn]='{}_{}'.format(r[keyColumn],i+1)
  142. rows.append(r)
  143. #print(rows)
  144. importXLSX.loadSafely(pars,rows,keyColumn,dryRun)
  145. def main(parameterFile):
  146. with open(parameterFile) as f:
  147. pars=json.load(f)
  148. print(pars)
  149. fhome=os.path.expanduser('~')
  150. with open(os.path.join(fhome,".labkey","setup.json")) as f:
  151. setup=json.load(f)
  152. importXLSX=getImporter(setup)
  153. #needs server
  154. db=importXLSX.getDB(pars)
  155. db.net.getCSRF()
  156. keyFilter={'variable':'Key','value':'{key}'.format(**pars),'oper':'eq'}
  157. #needs project
  158. ds=db.selectRows(pars['project'],'lists','importXML',[keyFilter])
  159. r=ds['rows'][0]
  160. xmlFile=os.path.join(fhome,'temp','DCIS','data.xml')
  161. url=db.net.connectionConfig['host']+r['_labkeyurl_fileUpload']
  162. fb=getFileBrowser(db)
  163. fb.readFileToFile(url,xmlFile)
  164. xmlRoot=getXMLRoot(xmlFile)
  165. #needs project
  166. setupRows=readSetup(importXLSX,pars)
  167. #setupRows=setupRows[17:18]
  168. #let labkey know that we started a processing run
  169. #do not field fileUpload
  170. del r['fileUpload']
  171. r['status']=3
  172. if r['ID']==None:
  173. r['ID']=getID(xmlRoot)
  174. db.modifyRows('update',pars['project'],'lists','importXML',[r])
  175. for s in setupRows:
  176. #needs project and schema for queries
  177. importXML(importXLSX,pars|s,xmlRoot,dryRun=False)
  178. #make calling importXML updates
  179. #we are done now, set status to OK(1)
  180. r['status']=1
  181. db.modifyRows('update',pars['project'],'lists','importXML',[r])
  182. if __name__ == "__main__" :
  183. main(sys.argv[1])