@@ -0,0 +1,406 @@
+import os
+import re
+import datetime
+import math
+
+import pandas
+
+#same directory!
+import labkeyInterface
+import labkeyDatabaseBrowser
+import labkeyFileBrowser
+
+def connectDB(server):
+#build a connection from the credential file ~/.labkey/SERVER.json
+    net=labkeyInterface.labkeyInterface()
+    qfile='{}.json'.format(server)
+    fconfig=os.path.join(os.path.expanduser('~'),'.labkey',qfile)
+    net.init(fconfig)
+    net.getCSRF()
+    return labkeyDatabaseBrowser.labkeyDB(net)
+
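+#hypothetical usage sketch, assuming credentials in ~/.labkey/onko-nix.json:
+# db=connectDB('onko-nix')
+# ds=db.selectRows('DCIS/Study','demographics','demographics',[])
+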
+def getDB(pars):
+#return the database handle cached in pars, connecting on first use
+    try:
+        return pars['db']
+    except KeyError:
+        pass
+    server=pars.get('server','onko-nix')
+    db=connectDB(server)
+    pars['db']=db
+    return db
+
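+#sketch: repeated calls reuse the connection cached in pars
+# pars={'server':'onko-nix'}
+# db=getDB(pars)     #connects and stores the handle in pars['db']
+# db is getDB(pars)  #True, the cached handle is returned
+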
+def getFields(pars):
+#merge field descriptions from queryDesign and selectRows metadata
+    project=pars.get('project','DCIS/Study')
+    schema=pars.get('schema','demographics')
+    query=pars.get('query','demographics')
+    db=getDB(pars)
+
+#data on query structure are in both queryDesign and selectRows/metaData content
+    dsgn=db.getQueryDesign(project,schema,query)
+    dsgnFields={f['name']:f for f in dsgn['fields']}
+
+    ds=db.selectRows(project,schema,query,[])
+    mdFields={x['name']:x for x in ds['metaData']['fields']}
+
+#in principle, any property from mdFields could get copied
+#to the matching entry in dsgnFields
+#try/except robustifies against missing components
+    copyFields=['lookup']
+    for f in dsgnFields:
+        try:
+            dsgnFields[f].update({x:mdFields[f][x] for x in copyFields})
+        except KeyError:
+            pass
+    return dsgnFields
+
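+#sketch of the returned merged description (field names are illustrative):
+# fields=getFields({'project':'DCIS/Study'})
+# fields['height']['rangeURI']   #e.g. 'http://www.w3.org/2001/XMLSchema#double'
+# fields['diagnosis']['lookup']  #schemaName/queryName/keyColumn/displayColumn
+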
+def getAlias(fields):
+#build a map from import alias to canonical field name
+    fieldMap={}
+    for f in fields.values():
+        aliasList=getAliasList(f['importAliases'])
+        fieldMap.update({x:f['name'] for x in aliasList})
+    return fieldMap
+
+def invertMap(qmap):
+    return {qmap[x]:x for x in qmap}
+
+def getVariables(fields,fieldType='LOOKUP',fieldName=None):
+#get list of variables of a particular type to help manage import
+#if type is LOOKUP the values also name the lookup query
+    if fieldName:
+        return {fieldName:fields[fieldName]['lookupQuery']}
+    if fieldType=='LOOKUP':
+        return {f['name']:f['lookupQuery'] for f in fields.values()
+            if f['lookupQuery']}
+    if fieldType=='DATE':
+        return {f['name']:fieldType for f in fields.values()
+            if f['rangeURI'].find('dateTime')>-1}
+    if fieldType=='DOUBLE':
+        return {f['name']:fieldType for f in fields.values()
+            if f['rangeURI'].find('double')>-1}
+    return {}
+
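+#sketch (field names are illustrative):
+# getVariables(fields,fieldType='LOOKUP')  #{'diagnosis':'diagnosisList',...}
+# getVariables(fields,fieldType='DATE')    #{'birthDate':'DATE',...}
+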
+def getLookupMap(pars,fields,fieldName):
+#get possible values of categorical variables/factors from labkey
+    try:
+        lookup=fields[fieldName]['lookup']
+    except KeyError:
+        print(fields[fieldName])
+        raise KeyError(f'Could not find lookup for {fieldName}')
+    schema=lookup['schemaName']
+    query=lookup['queryName']
+    key=lookup['keyColumn']
+    val=lookup['displayColumn']
+    project=pars['project']
+    db=getDB(pars)
+    ds=db.selectRows(project,schema,query,[])
+#map display values to their keys
+    cMap={r[val]:r[key] for r in ds['rows']}
+    return cMap
+
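+#sketch, assuming a lookup list with key and display columns:
+# cMap=getLookupMap(pars,fields,'diagnosis')  #'diagnosis' is illustrative
+# cMap  #e.g. {'DCIS':1,'IDC':2}
+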
+def parseLookup(lookup,qv):
+#convert a display value to its lookup key
+#maybe it is already a valid key?
+    try:
+        return lookup[qv]
+    except KeyError:
+        pass
+
+    if pandas.isna(qv):
+        return qv
+
+#numpy scalars: convert to a plain python type, then to string
+    try:
+        qv=qv.item()
+        qv=str(qv)
+        return lookup[qv]
+    except AttributeError:
+        pass
+
+#normalize caron (Č) before the final attempt; an unmatched value raises KeyError
+    qv=qv.replace('Č','C')
+    return lookup[qv]
+
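+#sketch with a map as returned by getLookupMap:
+# parseLookup({'DCIS':1,'IDC':2},'DCIS')         #1
+# parseLookup({'DCIS':1,'IDC':2},float('nan'))   #nan is passed through
+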
+def asKey(qv):
+#coerce a value to an integer lookup key
+    if not qv:
+        return qv
+    try:
+        return int(qv)
+    except (TypeError,ValueError):
+        print(f'Failed to parse {qv} as key')
+        return None
+
+def parseDate(qv):
+#convert from xls formats to an ISO date string
+    if not qv:
+        return qv
+
+#'datetime' and 'pandas' handle native objects, the rest are string formats
+    fmts=['datetime','pandas','%d.%m.%y','%Y-%m-%d %H:%M:%S.%f',
+        '%d/%m/%Y','%Y-%m-%d %H:%M:%S']
+
+    for fmt in fmts:
+        try:
+            if fmt=='pandas':
+                date=pandas.Timestamp.to_pydatetime(qv)
+            elif fmt=='datetime':
+                if not isinstance(qv,datetime.datetime):
+                    raise TypeError('Not a datetime object')
+                date=qv
+            else:
+                date=datetime.datetime.strptime(qv,fmt)
+            break
+        except (TypeError,ValueError):
+            continue
+
+#all formats may fail, leaving date unset
+    try:
+        return date.isoformat()
+    except UnboundLocalError:
+        print(f'Failed to parse {qv} as date')
+        return None
+
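+#sketch of accepted inputs:
+# parseDate('31.12.21')                     #'2021-12-31T00:00:00'
+# parseDate(datetime.datetime(2021,12,31))  #'2021-12-31T00:00:00'
+# parseDate('not a date')                   #prints a warning, returns None
+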
+def parseDouble(qv):
+    try:
+        return float(qv)
+    except (TypeError,ValueError):
+        return None
+
+def setMissingLookupValues(filename,xlsFieldName,project,lookup,
+        labkeyFieldName=None,dryRun=True):
+#gather all values of a field from the XLS file and insert those
+#missing from the lookup list on the server
+    df=pandas.read_excel(filename)
+
+    vals=set(df.at[r,xlsFieldName] for r in df.index
+        if not pandas.isna(df.at[r,xlsFieldName]))
+#convert numpy scalars to plain python types
+    try:
+        vals={v.item() for v in vals}
+    except AttributeError:
+        pass
+    print(vals)
+
+    if not labkeyFieldName:
+        labkeyFieldName=xlsFieldName
+    db=connectDB('onko-nix')
+    ds=db.selectRows(project,'lists',lookup,[])
+#only record values from labkey (should be unique anyhow)
+    setVals=set(r[labkeyFieldName] for r in ds['rows'])
+    off=len(setVals)
+    missing=sorted(vals-setVals)
+    n=len(missing)
+    entries=[{'Key':'{}'.format(i+1+off),labkeyFieldName:missing[i]}
+        for i in range(n)]
+    print(entries)
+    if dryRun:
+        return
+    db.modifyRows('insert',project,'lists',lookup,entries)
+
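+#hypothetical dry run first, then the actual insert (names are illustrative):
+# setMissingLookupValues('data.xlsx','diagnosis','DCIS/Study','diagnosisList')
+# setMissingLookupValues('data.xlsx','diagnosis','DCIS/Study','diagnosisList',
+#         dryRun=False)
+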
+def getAliasList(x):
+#split aliases by comma, taking into account quotation marks,
+#where commas are ignored
+    if not x:
+        return []
+#sophisticated splitting that ignores commas in double (and single!) quotes
+    ar=re.split(''',(?=(?:[^'"]|'[^']*'|"[^"]*")*$)''', x)
+#remove the quotes afterwards
+    ar=[s.replace('"','') for s in ar]
+    ar=[s.strip() for s in ar]
+    return ar
+
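+#sketch:
+# getAliasList('height, "weight, kg", age')  #['height','weight, kg','age']
+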
+def printErr(resp):
+#print error from server response
+    try:
+        print(resp['exception'])
+    except KeyError:
+        pass
+
+def findAlias(v,alias):
+#find matching alias for field v from XLS
+    try:
+        return alias[v.strip()]
+    except KeyError:
+        pass
+#remove .N qualifiers attached to duplicated columns by pandas.read_excel
+    v=re.sub(r'(.*)\.[0-9]*$',r'\1',v.strip())
+    try:
+        return alias[v]
+    except KeyError:
+        pass
+
+    return None
+
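+#sketch: pandas.read_excel renames duplicated headers to 'date.1','date.2',...
+# findAlias('date.1',{'date':'visitDate'})  #'visitDate' (illustrative names)
+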
+def getSequenceNum(pMap,id):
+#update pMap and return the sequence number for this entry
+    try:
+        n=len(pMap[id])
+    except KeyError:
+        n=0
+    seqNum=n+1
+    if n==0:
+        pMap[id]=[seqNum]
+    else:
+        pMap[id].append(seqNum)
+    return seqNum
+
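+#sketch: sequence numbers count repeated entries per participant
+# pMap={}
+# getSequenceNum(pMap,'P001')  #1
+# getSequenceNum(pMap,'P001')  #2
+# getSequenceNum(pMap,'P002')  #1
+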
+def entryInList(r,entries):
+#is entry already in list?
+    candidates=[x for x in entries if entriesMatch(x,r)]
+    return len(candidates)>0
+
+def entriesMatch(x,r):
+#do a pair of entries match? If so, data from one should update the other
+    matchFields=['ParticipantId','SequenceNum']
+    for f in matchFields:
+        if x[f]!=r[f]:
+            return False
+    return True
+
+def validate(qv):
+#rough checks on value read from XLSX
+
+#NaN
+    try:
+        if math.isnan(qv):
+            return None
+    except TypeError:
+        pass
+
+#NaT and other pandas nulls
+    if pandas.isnull(qv):
+        return None
+    return qv
+
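+#sketch:
+# validate(float('nan'))  #None
+# validate(pandas.NaT)    #None
+# validate('text')        #'text'
+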
+def importData(pars,filename,getId,modify,convertLookup,dryRun=True):
+#master routine that imports data based on pars,
+#applies the user supplied functions modify, convertLookup and getId, and
+#updates the relevant database
+
+#some useful fields from pars (d is the default value):
+# - skiprows removes irrelevant rows (number, d: 0)
+# - usecols is Excel-like notation for columns to import (identifier, d: None for all)
+# - sheet_name selects the sheet to import (number, d: 0 for the first sheet)
+# - seqNumOffset specifies the visit/sequenceNum offset (number, d: 0 results in 1)
+# - project - labkey project
+# - schema - labkey schema (list/study, d: demographics)
+# - query - labkey query (d: demographics)
+
+    skiprows=pars.get('skiprows',0)
+    usecols=pars.get('usecols',None)
+    sheet_name=pars.get('sheet_name',0)
+#set as sequenceNum for single entries, or as the initial seqNum if a
+#participant has several entries in the dataset
+    seqNumOffset=pars.get('seqNumOffset',0)
+    fields=getFields(pars)
+    lookupVars=getVariables(fields,fieldType='LOOKUP')
+    dateVars=getVariables(fields,fieldType='DATE')
+    doubleVars=getVariables(fields,fieldType='DOUBLE')
+#only the date variable names are needed
+    dateVars=list(dateVars.keys())
+    print(f'dateVars: {dateVars}')
+    lookupMap={f:getLookupMap(pars,fields,f) for f in lookupVars}
+    alias=getAlias(fields)
+    print(f'aliases: {alias}')
+
+    df=pandas.read_excel(filename,sheet_name=sheet_name,skiprows=skiprows,
+        usecols=usecols)
+    vars=df.columns
+    print(vars)
+
+    pMap={}
+    print('Index: {}'.format(len(df.index)))
+    idx=df.index[0:10] #for debug
+# idx=df.index #for all
+
+    entries=[]
+    for r in idx:
+        id=getId(df,r)
+        entry={}
+        entry['ParticipantId']=id
+
+        for v in vars:
+            qv=validate(df.at[r,v])
+            qv=modify(qv,v)
+            f=findAlias(v,alias)
+            if not f:
+                continue
+            if f in lookupMap:
+                if convertLookup(v):
+                    qv=parseLookup(lookupMap[f],qv)
+                else:
+                    qv=asKey(qv)
+
+            if f in dateVars:
+                qv=parseDate(qv)
+
+            if f in doubleVars:
+                qv=parseDouble(qv)
+#convert numpy scalars to plain python types
+            try:
+                qv=qv.item()
+            except AttributeError:
+                pass
+
+            entry[f]=qv
+
+        seqNum=getSequenceNum(pMap,id)
+        entry['SequenceNum']=seqNum+seqNumOffset
+        entries.append(entry)
+
+    print(entries)
+#update previous incarnations of each entry, insert the rest
+    db=getDB(pars)
+    project=pars.get('project','DCIS/Study')
+    schema=pars.get('schema','demographics')
+    query=pars.get('query','demographics')
+
+    updateRows=[]
+    insertRows=[]
+    for entry in entries:
+        v='ParticipantId'
+        idFilter={'variable':v,'value':entry[v],'oper':'eq'}
+
+        v='SequenceNum'
+        seqFilter={'variable':v,'value':'{}'.format(entry[v]),'oper':'eq'}
+        ds=db.selectRows(project,schema,query,[idFilter,seqFilter])
+        if len(ds['rows'])>0:
+            r=ds['rows'][0]
+            r.update(entry)
+            updateRows.append(r)
+        else:
+            insertRows.append(entry)
+
+    n=len(updateRows)
+    print(f'Updating {n} entries')
+    if n and not dryRun:
+        printErr(db.modifyRows('update',project,schema,query,updateRows))
+    n=len(insertRows)
+    print(f'Inserting {n} entries')
+    if n and not dryRun:
+        printErr(db.modifyRows('insert',project,schema,query,insertRows))
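+
+#hypothetical end-to-end usage sketch; getId, modify and convertLookup are
+#user supplied, and file/column names are illustrative:
+# def getId(df,r):
+#     return str(df.at[r,'ID'])
+# def modify(qv,v):
+#     return qv    #no per-column adjustment
+# def convertLookup(v):
+#     return True  #XLS stores display values, not keys
+# pars={'project':'DCIS/Study'}
+# importData(pars,'data.xlsx',getId,modify,convertLookup,dryRun=True)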