# importXLSX.py — import XLSX spreadsheet data into LabKey datasets

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507
  1. import os
  2. import pandas
  3. import sys
  4. import importlib
  5. import re
  6. import datetime
  7. import chardet
  8. import json
  9. import math
  10. #same directory!
  11. import labkeyInterface
  12. import labkeyDatabaseBrowser
  13. import labkeyFileBrowser
  14. def connectDB(server):
  15. net=labkeyInterface.labkeyInterface()
  16. qfile='{}.json'.format(server)
  17. fconfig=os.path.join(os.path.expanduser('~'),'.labkey',qfile)
  18. net.init(fconfig)
  19. net.getCSRF()
  20. return labkeyDatabaseBrowser.labkeyDB(net)
  21. def getDB(pars):
  22. try:
  23. return pars['db']
  24. except KeyError:
  25. pass
  26. server=pars.get('server','onko-nix')
  27. db=connectDB(server)
  28. pars['db']=db
  29. return db
  30. def getFields(pars):
  31. project=pars.get('project','DCIS/Study')
  32. schema=pars.get('schema','demographics')
  33. query=pars.get('query','demographics')
  34. db=getDB(pars)
  35. #data on query structure are both in queryDesign and selectRows/metadata content
  36. dsgn=db.getQueryDesign(project,schema,query)
  37. dsgnFields={f['name']:f for f in dsgn['fields']}
  38. ds=db.selectRows(project,schema,query,[])
  39. mdFields={x['name']:x for x in ds['metaData']['fields']}
  40. #in principle, any property from mdFields could get copied
  41. #to content reported from dsgnFields
  42. #try/except to robustify against missing components
  43. copyFields=['lookup','type']
  44. for f in dsgnFields:
  45. try:
  46. dsgnFields[f].update({x:mdFields[f][x] for x in copyFields})
  47. except KeyError:
  48. # print('{}: {}'.format(p,len(pMap[p])))
  49. pass
  50. return dsgnFields
  51. def getAlias(fields):
  52. fieldMap={}
  53. for f in fields.values():
  54. aliasList=getAliasList(f['importAliases'])
  55. fieldMap.update({x:f['name'] for x in aliasList})
  56. return fieldMap
  57. def invertMap(qmap):
  58. return {qmap[x]:x for x in qmap}
  59. def getVariables(fields,fieldType='LOOKUP',fieldName=None):
  60. #get list of variables of particular type to help manage import
  61. #if type is LOOKUP also return data on lookup query
  62. if fieldName:
  63. return {fieldName:fields[fieldName]['lookupQuery']}
  64. if fieldType=='LOOKUP':
  65. return {f['name']:f['lookupQuery'] for f in fields.values() \
  66. if f['lookupQuery']}
  67. if fieldType=='DATE':
  68. return {f['name']:fieldType for f in fields.values() \
  69. if f['rangeURI'].find('dateTime')>-1}
  70. if fieldType=='DOUBLE':
  71. return {f['name']:fieldType for f in fields.values() \
  72. if f['rangeURI'].find('double')>-1}
  73. return {}
  74. def getLookupMap(pars,fields,fieldName):
  75. #get possible values of categorical variables/factors from labkey
  76. try:
  77. lookup=fields[fieldName]['lookup']
  78. except KeyError:
  79. print(fields[fieldName])
  80. raise KeyError(f'Could not find lookup for {fieldName}')
  81. schema=lookup['schemaName']
  82. query=lookup['queryName']
  83. key=lookup['keyColumn']
  84. val=lookup['displayColumn']
  85. project=lookup.get('containerPath',pars['project'])
  86. db=getDB(pars)
  87. ds=db.selectRows(project,schema,query,[])
  88. cMap={r[val]:r[key] for r in ds['rows']}
  89. return cMap
  90. def parseLookup(lookup,qv):
  91. #parse/convert lookup
  92. #is it key?
  93. try:
  94. return lookup[qv]
  95. except KeyError:
  96. pass
  97. if pandas.isna(qv):
  98. return qv
  99. try:
  100. qv=qv.item()
  101. qv=str(qv)
  102. return lookup[qv]
  103. except AttributeError:
  104. pass
  105. qv=qv.replace('Č','C')
  106. return lookup[qv]
  107. def asKey(qv,field):
  108. if not qv:
  109. return qv
  110. if not field['type']=='int':
  111. return qv
  112. try:
  113. return int(qv)
  114. except (TypeError,ValueError):
  115. print(f'Failed to parse {qv} as key')
  116. return None
  117. def parseDate(qv):
  118. if not qv:
  119. return qv
  120. #from xls format to native python format
  121. fmts=['datetime','pandas','%d.%m.%y','%Y-%m-%d %H:%M:%S.%f',\
  122. '%d/%m/%Y','%Y-%m-%d %H:%M:%S']
  123. usedFmt=None
  124. for fmt in fmts:
  125. usedFmt=fmt
  126. try:
  127. if fmt=='pandas':
  128. #print(f'Trying {qv} as pandas.Timestamp')
  129. date=pandas.Timestamp.to_pydatetime(qv)
  130. break
  131. elif fmt=='datetime':
  132. #print(f'Trying {qv} as datetime.datetime')
  133. if not isinstance(qv,datetime.datetime):
  134. raise TypeError('Not a datetime object')
  135. date=qv
  136. break
  137. else:
  138. #print(f'Trying {qv} with {fmt}')
  139. date=datetime.datetime.strptime(qv,fmt)
  140. break
  141. except TypeError:
  142. #print('Failed (type): {}'.format(type(qv)))
  143. continue
  144. except ValueError:
  145. #print('Failed (value)')
  146. continue
  147. except AttributeError:
  148. #print('Failed (attribute)')
  149. continue
  150. #sometimes parsing fails
  151. try:
  152. return date.isoformat()
  153. except UnboundLocalError:
  154. print (f'Failed to parse {qv} as date, used {usedFmt}')
  155. return None
  156. def parseDouble(qv):
  157. try:
  158. return float(qv)
  159. except (ValueError,TypeError):
  160. return None
  161. #m
  162. def setMissingLookupValues(filename,xlsFieldName,project,lookup,\
  163. labkeyFieldName=None,dryRun=True):
  164. #list all possible values for a field
  165. #perhaps needs to be updated
  166. df=pandas.read_excel(filename)
  167. vars=df.columns
  168. vals=set([df.at[r,xlsFieldName] for r in df.index if not pandas.isna(df.at[r,xlsFieldName])])
  169. try:
  170. vals={v.item() for v in vals}
  171. except AttributeError:
  172. pass
  173. print(vals)
  174. if not labkeyFieldName:
  175. labkeyFieldName=xlsFieldName
  176. db=connectDB('onko-nix')
  177. ds=db.selectRows(project,'lists',lookup,[])
  178. #only record values from labkey (should be unique anyhow)
  179. setVals=set([r[labkeyFieldName] for r in ds['rows']])
  180. off=len(list(setVals))
  181. missing=sorted(list(vals-setVals))
  182. #print('Missing {}'.format(missing))
  183. n=len(missing)
  184. entries=[{'Key':'{}'.format(i+1+off),columnName:missing[i]} \
  185. for i in range(n)]
  186. print(entries)
  187. if dryRun:
  188. return
  189. db.modifyRows('insert',project,'lists',lookup,entries)
  190. def getAliasList(x):
  191. #split aliases by comma, taking into account quotation marks,
  192. #where commas are ignored
  193. if not x:
  194. return []
  195. #sophisticated spliting that ignores commas in double (and single!) quotes
  196. ar=re.split(''',(?=(?:[^'"]|'[^']*'|"[^"]*")*$)''', x)
  197. #remove the quotes afterwards
  198. ar=[s.replace('"','') for s in ar]
  199. ar=[s.strip() for s in ar]
  200. return ar
  201. def printErr(resp):
  202. #print error from server response
  203. try:
  204. print(resp['exception'])
  205. except KeyError:
  206. pass
  207. def findAlias(v,alias):
  208. #find matchng alias for field v from XLS
  209. try:
  210. return alias[v.strip()]
  211. except KeyError:
  212. pass
  213. #remove .N qualifiers, attach to the end by pandas.read_excel
  214. v=re.sub(r'(.*)\.[0-9]*$',r'\1',v.strip())
  215. try:
  216. return alias[v]
  217. except KeyError:
  218. pass
  219. return None
  220. def getSequenceNum(pMap,id):
  221. #updates pMap and return seqnum for this entry
  222. try:
  223. n=len(pMap[id])
  224. except KeyError:
  225. n=0
  226. seqNum=n+1
  227. if n==0:
  228. pMap[id]=[seqNum]
  229. else:
  230. pMap[id].append(seqNum)
  231. return seqNum
  232. def entryInList(r,entries):
  233. #is entry in list
  234. candidates=[x for x in entries if entriesMatch(x,r)]
  235. return len(candidates)>0
  236. def entriesMatch(x,r):
  237. #do a pair of entries match? Should we update data from one over the other?
  238. matchFields=['ParticipantId','SequenceNum']
  239. for f in matchFields:
  240. if x[f]!=r[f]:
  241. return False
  242. return True
  243. def validate(qv):
  244. #rough checks on value read from XLSX
  245. #NAN
  246. try:
  247. if math.isnan(qv):
  248. return None
  249. except TypeError:
  250. pass
  251. #NAD
  252. if pandas.isnull(qv):
  253. return None
  254. return qv
  255. #defaults for modify, getId and convertLookup
  256. def modify(qv,columnName):
  257. return qv
  258. def getId(df,r,pars):
  259. try:
  260. rawId=str(df.at[r,'ID'])
  261. except KeyError:
  262. msg='Getting id from field ID field. '
  263. msg+=' Overload getId function with getId=getIdFnc in importData'
  264. print(msg)
  265. raise KeyError
  266. return rawId.replace(' ','')
  267. def convertLookup(xlsColumnName):
  268. return True
  269. def rowValid(df,r,pars):
  270. return True
  271. def updateAndValidateEntry(e,pars):
  272. return True
  273. def importData(pars,filename,getId=getId,modify=modify,\
  274. convertLookup=convertLookup,dryRun=True,debug=True,
  275. rowValid=rowValid,updateAndValidateEntry=updateAndValidateEntry):
  276. #master routine that imports data based on pars,
  277. #applies user supplied functions modify, convertLookup and get Id and
  278. #updates relevant database
  279. #rowValid signals a valid row, argument is current row from xlsx
  280. #updateAndValidateEntry can add fields that can be computed from other fields in entry and allows to reject entries not matching a target pattern
  281. #some useful fields from pars (d is for default value)
  282. # - skiprows removes irelevant rows (number, d: 0)
  283. # - usecols is EXCEL like notations for cols taken (identifer, d: None for all)
  284. # - sheet_name selects sheet for import (number, d:0 for first sheet)
  285. # - seqNumOffset specify visit/sequenceNum offset (number, d:0 will result in 1)
  286. # - project - labkey project
  287. # - schema - labkey schema (list/study, d: study)
  288. # - query - labkey query
  289. # - additionalVars - values that are constant for the whole dataset, labeled by a valid alias for a labkey field
  290. #- idField - designated unique field of the labkey query
  291. #- updateAlias - a function to modify aliases based on parameters, e.g. for datasets that take values based on a qualifying field
  292. skiprows=pars.get('skiprows',0)
  293. usecols=pars.get('usecols',None)
  294. sheet_name=pars.get('sheet_name',0)
  295. #set this is as sequenceNum for entries, or initial seqNum if more than a single entry is in the dataset
  296. seqNumOffset=pars.get('seqNumOffset',0)
  297. additionalVars=pars.get('additionalVars',{})
  298. idField=pars.get('idField','ParticipantId')
  299. updateAlias=pars.get('updateAlias',lambda x,y: x)
  300. fields=getFields(pars)
  301. lookupVars=getVariables(fields,fieldType='LOOKUP')
  302. dateVars=getVariables(fields,fieldType='DATE')
  303. doubleVars=getVariables(fields,fieldType='DOUBLE')
  304. usecols=pars.get('usecols',None)
  305. #convert dates to list
  306. dateVars=list(dateVars.keys())
  307. print(f'dateVars: {dateVars}')
  308. lookupMap={f:getLookupMap(pars,fields,f) for f in lookupVars}
  309. if debug:
  310. print(lookupMap)
  311. alias=getAlias(fields)
  312. alias=updateAlias(alias,pars)
  313. print(f'aliases: {alias}')
  314. df=pandas.read_excel(filename,sheet_name=sheet_name,skiprows=skiprows,\
  315. usecols=usecols)
  316. vars=df.columns
  317. print(vars)
  318. pMap={}
  319. print('Index: {}'.format(len(df.index)))
  320. idx=df.index #for all
  321. if debug:
  322. idx=df.index[0:10] #for debug
  323. entries=[]
  324. for r in idx:
  325. if not rowValid(df,r,pars):
  326. continue
  327. id=getId(df,r,pars)
  328. entry={}
  329. entry[idField]=id
  330. for v in vars:
  331. qv=validate(df.at[r,v])
  332. qv=modify(qv,v)
  333. f=findAlias(v,alias)
  334. if not f:
  335. continue
  336. if f in lookupMap:
  337. if convertLookup(v):
  338. qv=parseLookup(lookupMap[f],qv)
  339. else:
  340. qv=asKey(qv,fields[f])
  341. if f in dateVars:
  342. qv=parseDate(qv)
  343. if f in doubleVars:
  344. qv=parseDouble(qv)
  345. try:
  346. numpyType=qv.dtype
  347. qv=qv.item()
  348. except AttributeError:
  349. pass
  350. entry[f]=qv
  351. #print('{}:{}/{}'.format(f,qv,type(qv)))
  352. for v in additionalVars:
  353. f=findAlias(v,alias)
  354. qv=additionalVars[v]
  355. if f in lookupMap:
  356. qv=parseLookup(lookupMap[f],qv)
  357. entry[f]=qv
  358. seqNum=getSequenceNum(pMap,id)
  359. entry['SequenceNum']=seqNum+seqNumOffset
  360. #set date also? Use pars to find dateVariable
  361. try:
  362. dateVariable=pars['dateVariable']
  363. entry['Date']=entry[dateVariable]
  364. except KeyError:
  365. pass
  366. if not updateAndValidateEntry(entry,pars):
  367. continue
  368. entries.append(entry)
  369. print(entries)
  370. #delete previous incarnations
  371. loadSafely(pars,entries,dryRun=dryRun)
  372. def loadSafely(pars,entries,dryRun=True):
  373. #allow additional keys in keyColumn
  374. db=getDB(pars)
  375. project=pars.get('project','DCIS/Study')
  376. schema=pars.get('schema','demographics')
  377. query=pars.get('query','demographics')
  378. idField=pars.get('idField','ParticipantId')
  379. #add columns from labkey that will be checked to find a match
  380. matchColumns=pars.get('matchColumns',[])
  381. #whether to insert new entries
  382. matchOnly=pars.get('matchOnly',False)
  383. #print rows to be inserted, debugging feature
  384. printRows=pars.get('printRows',False)
  385. updateRows=[]
  386. insertRows=[]
  387. selVal=[idField,'SequenceNum']
  388. selVal.extend(matchColumns)
  389. print(f'Matching by {selVal}')
  390. for entry in entries:
  391. qFilter=[{'variable':v,'value':'{}'.format(entry[v]),'oper':'eq'} for v in selVal]
  392. ds=db.selectRows(project,schema,query,qFilter)
  393. if len(ds['rows'])>0:
  394. r=ds['rows'][0]
  395. r.update(entry)
  396. updateRows.append(r)
  397. else:
  398. insertRows.append(entry)
  399. #update rows
  400. n=len(updateRows)
  401. print(f'Updating {n} entries')
  402. if printRows:
  403. print(updateRows)
  404. if n and not dryRun:
  405. printErr(db.modifyRows('update',project,schema,query,updateRows))
  406. #insert rows
  407. n=len(insertRows)
  408. print(f'Inserting {n} entries')
  409. if matchOnly:
  410. print('Inserting supressed by matchOnly flag')
  411. return
  412. if printRows:
  413. print(insertRows)
  414. if n and not dryRun:
  415. printErr(db.modifyRows('insert',project,schema,query,insertRows))