# importXLSX.py
import datetime
import importlib
import json
import math
import os
import re
import sys

import chardet
import pandas

# same directory!
import labkeyInterface
import labkeyDatabaseBrowser
import labkeyFileBrowser
  14. def connectDB(server):
  15. net=labkeyInterface.labkeyInterface()
  16. qfile='{}.json'.format(server)
  17. fconfig=os.path.join(os.path.expanduser('~'),'.labkey',qfile)
  18. net.init(fconfig)
  19. net.getCSRF()
  20. return labkeyDatabaseBrowser.labkeyDB(net)
  21. def getDB(pars):
  22. try:
  23. return pars['db']
  24. except KeyError:
  25. pass
  26. server=pars.get('server','onko-nix')
  27. db=connectDB(server)
  28. pars['db']=db
  29. return db
  30. def getFields(pars):
  31. project=pars.get('project','DCIS/Study')
  32. schema=pars.get('schema','demographics')
  33. query=pars.get('query','demographics')
  34. db=getDB(pars)
  35. #data on query structure are both in queryDesign and selectRows/metadata content
  36. dsgn=db.getQueryDesign(project,schema,query)
  37. dsgnFields={f['name']:f for f in dsgn['fields']}
  38. ds=db.selectRows(project,schema,query,[])
  39. mdFields={x['name']:x for x in ds['metaData']['fields']}
  40. #in principle, any property from mdFields could get copied
  41. #to content reported from dsgnFields
  42. #try/except to robustify against missing components
  43. copyFields=['lookup']
  44. for f in dsgnFields:
  45. try:
  46. dsgnFields[f].update({x:mdFields[f][x] for x in copyFields})
  47. except KeyError:
  48. pass
  49. return dsgnFields
  50. def getAlias(fields):
  51. fieldMap={}
  52. for f in fields.values():
  53. aliasList=getAliasList(f['importAliases'])
  54. fieldMap.update({x:f['name'] for x in aliasList})
  55. return fieldMap
  56. def invertMap(qmap):
  57. return {qmap[x]:x for x in qmap}
  58. def getVariables(fields,fieldType='LOOKUP',fieldName=None):
  59. #get list of variables of particular type to help manage import
  60. #if type is LOOKUP also return data on lookup query
  61. if fieldName:
  62. return {fieldName:fields[fieldName]['lookupQuery']}
  63. if fieldType=='LOOKUP':
  64. return {f['name']:f['lookupQuery'] for f in fields.values() \
  65. if f['lookupQuery']}
  66. if fieldType=='DATE':
  67. return {f['name']:fieldType for f in fields.values() \
  68. if f['rangeURI'].find('dateTime')>-1}
  69. if fieldType=='DOUBLE':
  70. return {f['name']:fieldType for f in fields.values() \
  71. if f['rangeURI'].find('double')>-1}
  72. return {}
  73. def getLookupMap(pars,fields,fieldName):
  74. #get possible values of categorical variables/factors from labkey
  75. try:
  76. lookup=fields[fieldName]['lookup']
  77. except KeyError:
  78. print(fields[fieldName])
  79. raise KeyError(f'Could not find lookup for {fieldName}')
  80. schema=lookup['schemaName']
  81. query=lookup['queryName']
  82. key=lookup['keyColumn']
  83. val=lookup['displayColumn']
  84. project=pars['project']
  85. db=getDB(pars)
  86. ds=db.selectRows(project,schema,query,[])
  87. cMap={r[val]:r[key] for r in ds['rows']}
  88. def parseLookup(lookup,qv):
  89. #parse/convert lookup
  90. #is it key?
  91. try:
  92. return lookup[qv]
  93. except KeyError:
  94. pass
  95. if pandas.isna(qv):
  96. return qv
  97. try:
  98. qv=qv.item()
  99. qv=str(qv)
  100. return lookup[qv]
  101. except AttributeError:
  102. pass
  103. qv=qv.replace('Č','C')
  104. return lookup[qv]
  105. def asKey(qv):
  106. if not qv:
  107. return qv
  108. try:
  109. return int(qv)
  110. except (TypeError,ValueError):
  111. print(f'Failed to parse {qv} as key')
  112. return None
  113. def parseDate(qv):
  114. if not qv:
  115. return qv
  116. #from xls format to native python format
  117. fmts=['datetime','pandas','%d.%m.%y','%Y-%m-%d %H:%M:%S.%f',\
  118. '%d/%m/%Y','%Y-%m-%d %H:%M:%S']
  119. for fmt in fmts:
  120. try:
  121. if fmt=='pandas':
  122. #print(f'Trying {qv} as pandas.Timestamp')
  123. date=pandas.Timestamp.to_pydatetime(qv)
  124. elif fmt=='datetime':
  125. #print(f'Trying {qv} as datetime.datetime')
  126. if not isinstance(qv,datetime.datetime):
  127. raise TypeError('Not a datetime object')
  128. date=qv
  129. else:
  130. #print(f'Trying {qv} with {fmt}')
  131. date=datetime.datetime.strptime(qv,fmt)
  132. break
  133. except TypeError:
  134. #print('Failed (type): {}'.format(type(qv)))
  135. continue
  136. except ValueError:
  137. #print('Failed (value)')
  138. continue
  139. #sometimes parsing fails
  140. try:
  141. return date.isoformat()
  142. except UnboundLocalError:
  143. print (f'Failed to parsed {qv} as date')
  144. return None
  145. def parseDouble(qv):
  146. try:
  147. return float(qv)
  148. except ValueError:
  149. return None
  150. #m
  151. def setMissingLookupValues(filename,xlsFieldName,project,lookup,\
  152. labkeyFieldName=None,dryRun=True):
  153. #list all possible values for a field
  154. #perhaps needs to be updated
  155. df=pandas.read_excel(filename)
  156. vars=df.columns
  157. vals=set([df.at[r,xlsFieldName] for r in df.index if not pandas.isna(df.at[r,xlsFieldName])])
  158. try:
  159. vals={v.item() for v in vals}
  160. except AttributeError:
  161. pass
  162. print(vals)
  163. if not labkeyFieldName:
  164. labkeyFieldName=xlsFieldName
  165. db=connectDB('onko-nix')
  166. ds=db.selectRows(project,'lists',lookup,[])
  167. #only record values from labkey (should be unique anyhow)
  168. setVals=set([r[labkeyFieldName] for r in ds['rows']])
  169. off=len(list(setVals))
  170. missing=sorted(list(vals-setVals))
  171. #print('Missing {}'.format(missing))
  172. n=len(missing)
  173. entries=[{'Key':'{}'.format(i+1+off),columnName:missing[i]} \
  174. for i in range(n)]
  175. print(entries)
  176. if dryRun:
  177. return
  178. db.modifyRows('insert',project,'lists',lookup,entries)
  179. def getAliasList(x):
  180. #split aliases by comma, taking into account quotation marks,
  181. #where commas are ignored
  182. if not x:
  183. return []
  184. #sophisticated spliting that ignores commas in double (and single!) quotes
  185. ar=re.split(''',(?=(?:[^'"]|'[^']*'|"[^"]*")*$)''', x)
  186. #remove the quotes afterwards
  187. ar=[s.replace('"','') for s in ar]
  188. ar=[s.strip() for s in ar]
  189. return ar
  190. def printErr(resp):
  191. #print error from server response
  192. try:
  193. print(resp['exception'])
  194. except KeyError:
  195. pass
  196. def findAlias(v,alias):
  197. #find matchng alias for field v from XLS
  198. try:
  199. return alias[v.strip()]
  200. except KeyError:
  201. pass
  202. #remove .N qualifiers, attach to the end by pandas.read_excel
  203. v=re.sub(r'(.*)\.[0-9]*$',r'\1',v.strip())
  204. try:
  205. return alias[v]
  206. except KeyError:
  207. pass
  208. return None
  209. def getSequenceNum(pMap,id):
  210. #updates pMap and return seqnum for this entry
  211. try:
  212. n=len(pMap[id])
  213. except KeyError:
  214. n=0
  215. seqNum=n+1
  216. if n==0:
  217. pMap[id]=[seqNum]
  218. else:
  219. pMap[id].append(seqNum)
  220. return seqNum
  221. def entryInList(r,entries):
  222. #is entry in list
  223. candidates=[x for x in entries if entriesMatch(x,r)]
  224. return len(candidates)>0
  225. def entriesMatch(x,r):
  226. #do a pair of entries match? Should we update data from one over the other?
  227. matchFields=['ParticipantId','SequenceNum']
  228. for f in matchFields:
  229. if x[f]!=r[f]:
  230. return False
  231. return True
  232. def validate(qv):
  233. #rough checks on value read from XLSX
  234. #NAN
  235. try:
  236. if math.isnan(qv):
  237. return None
  238. except TypeError:
  239. pass
  240. #NAD
  241. if pandas.isnull(qv):
  242. return None
  243. return qv
  244. #modify,getId
def importData(pars,filename,getId,modify,convertLookup,dryRun=True):
    """Master routine: import an XLSX file into a LabKey dataset.

    Applies the user-supplied callbacks and updates the target query:
    - getId(df, r): returns the ParticipantId for row r
    - modify(qv, v): per-cell hook applied to every validated value
    - convertLookup(v): True when column v holds display values that
      must be mapped to lookup keys (otherwise values are cast to keys)

    Useful keys of pars (d is for default value):
    - skiprows: removes irrelevant rows (number, d: 0)
    - usecols: EXCEL-like notation for columns taken (d: None for all)
    - sheet_name: sheet to import (number, d: 0 for first sheet)
    - seqNumOffset: visit/sequenceNum offset (number, d: 0 -> first is 1)
    - project: labkey project
    - schema: labkey schema (list/study, d: demographics)
    - query: labkey query (d: demographics)

    Rows already on the server (same ParticipantId and SequenceNum) are
    updated in place; new rows are inserted. With dryRun=True nothing is
    written, only the planned entries are printed.
    """
    skiprows=pars.get('skiprows',0)
    usecols=pars.get('usecols',None)
    sheet_name=pars.get('sheet_name',0)
    # initial sequenceNum offset; each repeated id increments from here
    seqNumOffset=pars.get('seqNumOffset',0)
    fields=getFields(pars)
    lookupVars=getVariables(fields,fieldType='LOOKUP')
    dateVars=getVariables(fields,fieldType='DATE')
    doubleVars=getVariables(fields,fieldType='DOUBLE')
    # NOTE(review): duplicate of the usecols assignment above — harmless
    usecols=pars.get('usecols',None)
    # only the names are needed for dates
    dateVars=list(dateVars.keys())
    print(f'dateVars: {dateVars}')
    lookupMap={f:getLookupMap(pars,fields,f) for f in lookupVars}
    alias=getAlias(fields)
    print(f'aliases: {alias}')
    df=pandas.read_excel(filename,sheet_name=sheet_name,skiprows=skiprows,\
        usecols=usecols)
    vars=df.columns
    print(vars)
    pMap={}
    print('Index: {}'.format(len(df.index)))
    # NOTE(review): import is limited to the first 10 rows — looks like a
    # leftover debug setting; switch to the commented line for a full run
    idx=df.index[0:10] #for debug
    # idx=df.index #for all
    entries=[]
    for r in idx:
        id=getId(df,r)
        entry={}
        entry['ParticipantId']=id
        for v in vars:
            qv=validate(df.at[r,v])
            qv=modify(qv,v)
            f=findAlias(v,alias)
            if not f:
                # column has no matching labkey field; skip it
                continue
            if f in lookupMap:
                if convertLookup(v):
                    # display value -> lookup key
                    qv=parseLookup(lookupMap[f],qv)
                else:
                    # value already is (or casts to) the key
                    qv=asKey(qv)
            if f in dateVars:
                qv=parseDate(qv)
            if f in doubleVars:
                qv=parseDouble(qv)
            # unbox numpy scalars so JSON serialization works
            try:
                numpyType=qv.dtype
                qv=qv.item()
            except AttributeError:
                pass
            entry[f]=qv
            #print('{}:{}/{}'.format(f,qv,type(qv)))
        seqNum=getSequenceNum(pMap,id)
        entry['SequenceNum']=seqNum+seqNumOffset
        entries.append(entry)
    #for p in pMap:
    #   print('{}: {}'.format(p,len(pMap[p])))
    print(entries)
    # split entries into updates (row already on server) and inserts
    db=getDB(pars)
    project=pars.get('project','DCIS/Study')
    schema=pars.get('schema','demographics')
    query=pars.get('query','demographics')
    updateRows=[]
    insertRows=[]
    for entry in entries:
        v='ParticipantId'
        idFilter={'variable':v,'value':entry[v],'oper':'eq'}
        v='SequenceNum'
        seqFilter={'variable':v,'value':'{}'.format(entry[v]),'oper':'eq'}
        ds=db.selectRows(project,schema,query,[idFilter,seqFilter])
        if len(ds['rows'])>0:
            # existing row: merge new values over the stored ones
            r=ds['rows'][0]
            r.update(entry)
            updateRows.append(r)
        else:
            insertRows.append(entry)
    n=len(updateRows)
    print(f'Updating {n} entries')
    if n and not dryRun:
        printErr(db.modifyRows('update',project,schema,query,updateRows))
    n=len(insertRows)
    print(f'Inserting {n} entries')
    if n and not dryRun:
        printErr(db.modifyRows('insert',project,schema,query,insertRows))