importXML.py 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208
  1. import os
  2. import sys
  3. import importlib
  4. import json
  5. import lxml.etree
  6. #need labkey interface's importXLS
  7. def getImporter(setup):
  8. sys.path.append(setup['paths']['nixWrapper'])
  9. import nixWrapper
  10. nixWrapper.loadLibrary('labkeyInterface')
  11. import importXLSX
  12. importlib.reload(importXLSX)
  13. return importXLSX
  14. def getFileBrowser(db):
  15. import nixWrapper
  16. nixWrapper.loadLibrary('labkeyInterface')
  17. import labkeyFileBrowser
  18. return labkeyFileBrowser.labkeyFileBrowser(db.net)
  19. def countElements(xmlRoot,xPath):
  20. xElUp=xmlRoot
  21. if xPath.find('/')>-1:
  22. xPathUp=xPath[:xPath.rfind('/')]
  23. xElUp=xmlRoot.find(xPathUp)
  24. n=len(xElUp)
  25. print('Counting {} in {}[{}]'.format(xPath,xElUp.tag,n))
  26. i=1
  27. while xmlRoot.find(f'{xPath}[{i}]') is not None:
  28. i=i+1
  29. return i-1
  30. def getVal(xmlRoot,aliasVal,xPath=''):
  31. q=aliasVal.split(':')
  32. xName=q[0]
  33. if len(xPath)>0:
  34. if len(xName)>0:
  35. elementPath='/'.join([xPath,xName])
  36. else:
  37. elementPath=xPath
  38. else:
  39. elementPath=xName
  40. attributeName=q[1]
  41. #find number of elements with this path
  42. n=countElements(xmlRoot,elementPath)
  43. try:
  44. return [xmlRoot.find(f'{elementPath}[{i+1}]').get(attributeName) for i in range(n)]
  45. except AttributeError:
  46. return [None]
  47. def updateTxt(txt,replacePatterns):
  48. for x in replacePatterns:
  49. txt=txt.replace(x,replacePatterns[x])
  50. return txt
  51. def updateAliases(aliasValues,aliasReplace):
  52. return {a:updateTxt(aliasValues[a],aliasReplace) for a in aliasValues}
  53. def parseJSON(x):
  54. print(f'Decoding [{x}]')
  55. try:
  56. return json.loads(x)
  57. except TypeError:
  58. pass
  59. return {}
  60. def readSetup(importXLSX,pars):
  61. db=importXLSX.getDB(pars)
  62. ds=db.selectRows(pars['project'],'lists','importSetup',[])
  63. setupRows=ds['rows']
  64. for r in setupRows:
  65. r['aliasReplace']=parseJSON(r['aliasReplace'])
  66. r['presetValues']=parseJSON(r['presetValues'])
  67. return setupRows
  68. def getID(root):
  69. return root.find('Patient/PatientID').get('val')
  70. def getXMLRoot(xmlFile):
  71. return lxml.etree.ElementTree(file=xmlFile).getroot()
  72. def matchingLengths(dictArray):
  73. #print(dictArray)
  74. n={x:len(dictArray[x]) for x in dictArray}
  75. a=list(n.values())
  76. return all(x==a[0] for x in a)
  77. def importXML(importXLSX,pars,xmlRoot,dryRun=True):
  78. #def importData(pars,filename,getId=getId,modify=modify,convertLookup=convertLookup,dryRun=True,debug=True):
  79. #master routine that imports data based on pars,
  80. #applies user supplied functions modify, convertLookup and get Id and
  81. #updates relevant database
  82. #some useful fields from pars (d is for default value)
  83. # - seqNumOffset specify visit/sequenceNum offset (number, d:0 will result in 1)
  84. # - XPath - xml path to the element to take data from (helpful if multiple elements are present in xml and identical data is sought)
  85. # - additionalKeyColumn - name of the variable/column used for separating data entries (on top of ParticipantId and SequenceNum, helpful in the same cases as XPath)
  86. # - presetVariables - set some of the row variables to this values (same cases as XPath)
  87. # - project - labkey project
  88. # - schema - labkey schema (list/study, d: study)
  89. # - query - labkey query
  90. #set this is as sequenceNum for entries, or initial seqNum if more than a single entry is in the dataset
  91. seqNumOffset=pars.get('seqNumOffset',0)
  92. xPath=pars.get('XPath','')
  93. keyColumn=pars.get('additionalKeyColumn')
  94. presetValues=pars.get('presetValues',{})
  95. aliasReplace=pars.get('aliasReplace',{})
  96. allowMultiple=pars.get('allowMultiple',"False")
  97. #convert to boolean
  98. allowMultiple=allowMultiple=="True" or allowMultiple=="true"
  99. fields=importXLSX.getFields(pars)
  100. lookupVars=importXLSX.getVariables(fields,fieldType='LOOKUP')
  101. dateVars=importXLSX.getVariables(fields,fieldType='DATE')
  102. doubleVars=importXLSX.getVariables(fields,fieldType='DOUBLE')
  103. #convert dates to list
  104. dateVars=list(dateVars.keys())
  105. print(f'dateVars: {dateVars}')
  106. lookupMap={f:importXLSX.getLookupMap(pars,fields,f) for f in lookupVars}
  107. alias=importXLSX.invertMap(importXLSX.getAlias(fields))
  108. alias=updateAliases(alias,aliasReplace)
  109. print(f'aliases: {alias}')
  110. row={}
  111. #patient id can be either set in pars (takes precedence) or from xml record
  112. pid=pars.get('id',getID(xmlRoot))
  113. row={'ParticipantId':pid,'SequenceNum':seqNumOffset+1}
  114. row.update(presetValues)
  115. rows=[]
  116. vals={}
  117. for f in fields:
  118. try:
  119. vals[f]=getVal(xmlRoot,alias[f],xPath)
  120. if not allowMultiple:
  121. vals[f]=vals[f][0:1]
  122. except KeyError:
  123. print(f'Alias for field {f} not found')
  124. continue
  125. if not matchingLengths(vals):
  126. print('Mismatch lenghts of variables {}'.format({v:len(vals[v]) for v in vals}))
  127. return
  128. n=[len(vals[v]) for v in vals][0]
  129. for i in range(n):
  130. r={x:row[x] for x in row}
  131. r.update({v:vals[v][i] for v in vals})
  132. if n>1:
  133. r[keyColumn]='{}_{}'.format(r[keyColumn],i+1)
  134. rows.append(r)
  135. #print(rows)
  136. importXLSX.loadSafely(pars,rows,keyColumn,dryRun)
  137. def main(parameterFile):
  138. with open(parameterFile) as f:
  139. pars=json.load(f)
  140. print(pars)
  141. fhome=os.path.expanduser('~')
  142. with open(os.path.join(fhome,".labkey","setup.json")) as f:
  143. setup=json.load(f)
  144. importXLSX=getImporter(setup)
  145. #needs server
  146. db=importXLSX.getDB(pars)
  147. db.net.getCSRF()
  148. keyFilter={'variable':'Key','value':'{key}'.format(**pars),'oper':'eq'}
  149. #needs project
  150. ds=db.selectRows(pars['project'],'lists','importXML',[keyFilter])
  151. r=ds['rows'][0]
  152. xmlFile=os.path.join(fhome,'temp','DCIS','data.xml')
  153. url=db.net.connectionConfig['host']+r['_labkeyurl_fileUpload']
  154. fb=getFileBrowser(db)
  155. fb.readFileToFile(url,xmlFile)
  156. xmlRoot=getXMLRoot(xmlFile)
  157. #needs project
  158. setupRows=readSetup(importXLSX,pars)
  159. #setupRows=setupRows[17:18]
  160. for s in setupRows:
  161. #needs project and schema for queries
  162. importXML(importXLSX,pars|s,xmlRoot,dryRun=False)
  163. #make calling importXML updates
  164. r['status']=1
  165. r['ID']=getID(xmlRoot)
  166. del r['fileUpload']
  167. db.modifyRows('update',pars['project'],'lists','importXML',[r])
  168. if __name__ == "__main__" :
  169. main(sys.argv[1])