- import os
- import pandas
- import sys
- import importlib
- import re
- import datetime
- import chardet
- import json
- import math
- #same directory!
- import labkeyInterface
- import labkeyDatabaseBrowser
- import labkeyFileBrowser
def connectDB(server):
    #open a CSRF-authenticated connection to the named labkey server,
    #reading its configuration from ~/.labkey/<server>.json
    cfgName='{}.json'.format(server)
    cfgPath=os.path.join(os.path.expanduser('~'),'.labkey',cfgName)
    net=labkeyInterface.labkeyInterface()
    net.init(cfgPath)
    net.getCSRF()
    return labkeyDatabaseBrowser.labkeyDB(net)
def getDB(pars):
    #return the database handle cached in pars, creating and caching a
    #fresh connection on first use (server name defaults to onko-nix)
    if 'db' in pars:
        return pars['db']
    server=pars.get('server','onko-nix')
    pars['db']=connectDB(server)
    return pars['db']
def getFields(pars):
    #describe the fields of a labkey query, combining the query design with
    #selectRows metadata (structural data is split between the two sources)
    project=pars.get('project','DCIS/Study')
    schema=pars.get('schema','demographics')
    query=pars.get('query','demographics')
    db=getDB(pars)

    design=db.getQueryDesign(project,schema,query)
    designFields={f['name']:f for f in design['fields']}
    meta=db.selectRows(project,schema,query,[])
    metaFields={m['name']:m for m in meta['metaData']['fields']}

    #in principle any property from the metadata could get copied over to
    #the design description; try/except robustifies against missing parts
    copyFields=['lookup']
    for name in designFields:
        try:
            designFields[name].update(\
                {p:metaFields[name][p] for p in copyFields})
        except KeyError:
            pass
    return designFields
def getAlias(fields):
    #map every declared import alias to the canonical labkey field name
    aliasMap={}
    for fld in fields.values():
        for a in getAliasList(fld['importAliases']):
            aliasMap[a]=fld['name']
    return aliasMap
def invertMap(qmap):
    #swap keys and values (assumes values are unique and hashable)
    return {v:k for k,v in qmap.items()}
def getVariables(fields,fieldType='LOOKUP',fieldName=None):
    #list variables of a particular type to help manage import;
    #for LOOKUP the values are the names of the lookup queries,
    #for DATE/DOUBLE the value is the type tag itself
    if fieldName:
        return {fieldName:fields[fieldName]['lookupQuery']}
    if fieldType=='LOOKUP':
        out={}
        for fld in fields.values():
            if fld['lookupQuery']:
                out[fld['name']]=fld['lookupQuery']
        return out
    if fieldType=='DATE':
        return {fld['name']:'DATE' for fld in fields.values()\
            if 'dateTime' in fld['rangeURI']}
    if fieldType=='DOUBLE':
        return {fld['name']:'DOUBLE' for fld in fields.values()\
            if 'double' in fld['rangeURI']}
    return {}
def getLookupMap(pars,fields,fieldName):
    #get possible values of a categorical variable/factor from labkey;
    #returns a map from display value to lookup key
    #raises KeyError when the field carries no lookup description
    try:
        lookup=fields[fieldName]['lookup']
    except KeyError:
        print(fields[fieldName])
        raise KeyError(f'Could not find lookup for {fieldName}')
    schema=lookup['schemaName']
    query=lookup['queryName']
    key=lookup['keyColumn']
    val=lookup['displayColumn']
    project=pars['project']
    db=getDB(pars)
    ds=db.selectRows(project,schema,query,[])
    cMap={r[val]:r[key] for r in ds['rows']}
    #bug fix: cMap was built but never returned, so callers received None
    return cMap
-
def parseLookup(lookup,qv):
    #translate a raw XLS value to its lookup key, trying several coercions
    #direct hit?
    try:
        return lookup[qv]
    except KeyError:
        pass
    #missing values pass through unchanged
    if pandas.isna(qv):
        return qv
    #numpy scalar -> python scalar -> string representation
    try:
        qv=str(qv.item())
        return lookup[qv]
    except AttributeError:
        pass
    #last resort: normalize the caron and retry (KeyError propagates)
    return lookup[qv.replace('Č','C')]
def asKey(qv):
    #coerce a value to an integer lookup key;
    #falsy values are returned unchanged, unparseable ones become None
    if qv:
        try:
            return int(qv)
        except (TypeError,ValueError):
            print(f'Failed to parse {qv} as key')
            return None
    return qv
def parseDate(qv):
    #convert a date from one of several known representations (native
    #datetime, pandas Timestamp, xls string formats) to an ISO string
    #returns falsy input unchanged and None when every format fails
    if not qv:
        return qv
    #candidate interpretations, tried in order
    fmts=['datetime','pandas','%d.%m.%y','%Y-%m-%d %H:%M:%S.%f',\
            '%d/%m/%Y','%Y-%m-%d %H:%M:%S']
    for fmt in fmts:
        try:
            if fmt=='pandas':
                date=pandas.Timestamp.to_pydatetime(qv)
            elif fmt=='datetime':
                if not isinstance(qv,datetime.datetime):
                    raise TypeError('Not a datetime object')
                date=qv
            else:
                date=datetime.datetime.strptime(qv,fmt)
            break
        except (TypeError,ValueError):
            #wrong representation, try the next one
            continue
    #when every format failed, date was never bound
    try:
        return date.isoformat()
    except UnboundLocalError:
        #bug fix: message read 'Failed to parsed'
        print(f'Failed to parse {qv} as date')
        return None
def parseDouble(qv):
    #best-effort float conversion; unparseable values map to None
    try:
        result=float(qv)
    except (TypeError,ValueError):
        result=None
    return result
-
- #m
def setMissingLookupValues(filename,xlsFieldName,project,lookup,\
        labkeyFieldName=None,dryRun=True):
    #compare values of a column in an XLS file against the entries of a
    #lookup list on labkey and insert the missing ones
    #(with dryRun set, the candidate entries are only printed)
    #labkeyFieldName defaults to xlsFieldName when not given
    df=pandas.read_excel(filename)

    #all non-missing values for the field
    vals=set(df.at[r,xlsFieldName] for r in df.index \
        if not pandas.isna(df.at[r,xlsFieldName]))
    #numpy scalars -> python scalars where applicable
    try:
        vals={v.item() for v in vals}
    except AttributeError:
        pass
    print(vals)

    if not labkeyFieldName:
        labkeyFieldName=xlsFieldName
    db=connectDB('onko-nix')
    ds=db.selectRows(project,'lists',lookup,[])
    #only record values from labkey (should be unique anyhow)
    setVals=set(r[labkeyFieldName] for r in ds['rows'])
    off=len(setVals)
    missing=sorted(vals-setVals)
    n=len(missing)
    #bug fix: entries referenced an undefined name (columnName);
    #the lookup column is labkeyFieldName
    entries=[{'Key':'{}'.format(i+1+off),labkeyFieldName:missing[i]} \
        for i in range(n)]
    print(entries)
    if dryRun:
        return
    db.modifyRows('insert',project,'lists',lookup,entries)
def getAliasList(x):
    #split a comma-separated alias string into a list, keeping commas
    #that appear inside single or double quotes intact
    if not x:
        return []
    #lookahead makes the split skip commas enclosed in quotes
    parts=re.split(''',(?=(?:[^'"]|'[^']*'|"[^"]*")*$)''', x)
    #drop the double quotes and surrounding whitespace afterwards
    return [p.replace('"','').strip() for p in parts]
def printErr(resp):
    #print the exception carried in a server response, if present
    if 'exception' in resp:
        print(resp['exception'])
-
def findAlias(v,alias):
    #find the labkey field matching XLS column v; None when nothing matches
    key=v.strip()
    if key in alias:
        return alias[key]
    #pandas.read_excel disambiguates duplicate columns as name.N; drop it
    key=re.sub(r'(.*)\.[0-9]*$',r'\1',key)
    return alias.get(key)
def getSequenceNum(pMap,id):
    #record another visit for this participant in pMap and return its
    #1-based sequence number
    seqList=pMap.setdefault(id,[])
    seqNum=len(seqList)+1
    seqList.append(seqNum)
    return seqNum
def entryInList(r,entries):
    #True when an entry matching r is already present in entries
    return any(entriesMatch(x,r) for x in entries)
def entriesMatch(x,r):
    #two entries match (and one may update the other) when they refer to
    #the same participant and visit
    return all(x[f]==r[f] for f in ('ParticipantId','SequenceNum'))
def validate(qv):
    #rough sanity check on a value read from XLSX:
    #map the various missing-value markers to None
    #float NaN (math.isnan chokes on non-numeric input)
    try:
        isNan=math.isnan(qv)
    except TypeError:
        isNan=False
    if isNan:
        return None
    #pandas missing markers (None, NaT, ...)
    if pandas.isnull(qv):
        return None
    return qv
- #defaults for modify, getId and convertLookup
def modify(qv,columnName):
    #default modifier for importData: pass the value through unchanged
    return qv
def getId(df,r):
    #default participant id extractor: ID column with spaces removed;
    #raises KeyError (with a hint) when the column is absent
    try:
        rawId=df.at[r,'ID']
    except KeyError:
        msg='Getting id from field ID field. '
        msg+=' Overload getId function with getId=getIdFnc in importData'
        print(msg)
        raise KeyError
    return str(rawId).replace(' ','')
def convertLookup(xlsColumnName):
    #default for importData: always convert display values to lookup keys
    return True
def importData(pars,filename,getId=getId,modify=modify,\
        convertLookup=convertLookup,dryRun=True,debug=True):
    #master routine that imports data based on pars,
    #applies user supplied functions modify, convertLookup and getId and
    #updates the relevant database query
    #some useful fields from pars (d is for default value):
    # - skiprows removes irelevant rows (number, d: 0)
    # - usecols is EXCEL like notations for cols taken (identifer, d: None for all)
    # - sheet_name selects sheet for import (number, d:0 for first sheet)
    # - seqNumOffset specify visit/sequenceNum offset (number, d:0 will result in 1)
    # - project - labkey project
    # - schema - labkey schema (list/study, d: study)
    # - query - labkey query
    skiprows=pars.get('skiprows',0)
    #bug fix: usecols was fetched twice; read it once here
    usecols=pars.get('usecols',None)
    sheet_name=pars.get('sheet_name',0)
    #sequenceNum for entries, or initial seqNum if more than a single
    #entry per participant is in the dataset
    seqNumOffset=pars.get('seqNumOffset',0)

    #query structure and variable groups derived from it
    fields=getFields(pars)
    lookupVars=getVariables(fields,fieldType='LOOKUP')
    dateVars=list(getVariables(fields,fieldType='DATE').keys())
    doubleVars=getVariables(fields,fieldType='DOUBLE')
    print(f'dateVars: {dateVars}')
    lookupMap={f:getLookupMap(pars,fields,f) for f in lookupVars}
    alias=getAlias(fields)
    print(f'aliases: {alias}')

    df=pandas.read_excel(filename,sheet_name=sheet_name,skiprows=skiprows,\
            usecols=usecols)
    vars=df.columns
    print(vars)
    pMap={}
    print('Index: {}'.format(len(df.index)))
    idx=df.index #for all
    if debug:
        idx=df.index[0:10] #for debug
    entries=[]
    for r in idx:
        id=getId(df,r)
        entry={}
        entry['ParticipantId']=id
        for v in vars:
            qv=validate(df.at[r,v])
            qv=modify(qv,v)
            f=findAlias(v,alias)
            if not f:
                #column has no matching labkey field
                continue
            if f in lookupMap:
                if convertLookup(v):
                    #convert display value to lookup key
                    qv=parseLookup(lookupMap[f],qv)
                else:
                    #value is already the key
                    qv=asKey(qv)
            if f in dateVars:
                qv=parseDate(qv)
            if f in doubleVars:
                qv=parseDouble(qv)
            #numpy scalars (detected via the dtype attribute) -> python
            #scalars; plain python values are left untouched
            if hasattr(qv,'dtype'):
                qv=qv.item()
            entry[f]=qv
        seqNum=getSequenceNum(pMap,id)
        entry['SequenceNum']=seqNum+seqNumOffset
        entries.append(entry)
    print(entries)

    #split entries: rows whose (ParticipantId,SequenceNum) already exists
    #on the server get updated, the rest are inserted
    db=getDB(pars)
    project=pars.get('project','DCIS/Study')
    schema=pars.get('schema','demographics')
    query=pars.get('query','demographics')

    updateRows=[]
    insertRows=[]
    for entry in entries:
        v='ParticipantId'
        idFilter={'variable':v,'value':entry[v],'oper':'eq'}
        v='SequenceNum'
        seqFilter={'variable':v,'value':'{}'.format(entry[v]),'oper':'eq'}
        ds=db.selectRows(project,schema,query,[idFilter,seqFilter])
        if len(ds['rows'])>0:
            #merge new values over the existing server row
            r=ds['rows'][0]
            r.update(entry)
            updateRows.append(r)
        else:
            insertRows.append(entry)

    n=len(updateRows)
    print(f'Updating {n} entries')
    if n and not dryRun:
        printErr(db.modifyRows('update',project,schema,query,updateRows))
    n=len(insertRows)
    print(f'Inserting {n} entries')
    if n and not dryRun:
        printErr(db.modifyRows('insert',project,schema,query,insertRows))