Selaa lähdekoodia

Adding importXLSX

Andrej 4 kuukautta sitten
vanhempi
commit
d2da55a7b1
1 muutettua tiedostoa jossa 406 lisäystä ja 0 poistoa
  1. 406 0
      importXLSX.py

+ 406 - 0
importXLSX.py

@@ -0,0 +1,406 @@
+import os
+import pandas 
+import sys
+import importlib
+import re
+import datetime
+import chardet
+import json
+import math
+
+#same directory!
+import labkeyInterface
+import labkeyDatabaseBrowser
+import labkeyFileBrowser
+
def connectDB(server):
   """Open a labkey connection configured by ~/.labkey/<server>.json."""
   configFile=os.path.join(os.path.expanduser('~'),'.labkey',f'{server}.json')
   net=labkeyInterface.labkeyInterface()
   net.init(configFile)
   # fetch the CSRF token required for write operations
   net.getCSRF()
   return labkeyDatabaseBrowser.labkeyDB(net)
+
def getDB(pars):
   """Return the DB handle cached in pars, connecting on first use.

   The freshly created handle is stored back into pars['db'] so later
   calls reuse the same connection.
   """
   if 'db' in pars:
      return pars['db']
   db=connectDB(pars.get('server','onko-nix'))
   pars['db']=db
   return db
+
def getFields(pars):
   """Return query field metadata keyed by field name.

   Data on query structure are found both in the query design and in the
   selectRows metadata; selected metadata properties (currently 'lookup')
   are copied onto the design entries.
   """
   db=getDB(pars)
   project=pars.get('project','DCIS/Study')
   schema=pars.get('schema','demographics')
   query=pars.get('query','demographics')

   dsgn=db.getQueryDesign(project,schema,query)
   dsgnFields={f['name']:f for f in dsgn['fields']}

   ds=db.selectRows(project,schema,query,[])
   mdFields={x['name']:x for x in ds['metaData']['fields']}

   # in principle, any property from mdFields could get copied to the
   # content reported from dsgnFields; missing fields/properties on
   # either side are tolerated
   copyFields=['lookup']
   for name,fld in dsgnFields.items():
      try:
         fld.update({p:mdFields[name][p] for p in copyFields})
      except KeyError:
         pass
   return dsgnFields
+
def getAlias(fields):
   """Map every declared import alias to its canonical field name."""
   fieldMap={}
   for fld in fields.values():
      for alias in getAliasList(fld['importAliases']):
         fieldMap[alias]=fld['name']
   return fieldMap
+
def invertMap(qmap):
   """Swap keys and values (values must be hashable and unique)."""
   return {v:k for k,v in qmap.items()}
+
def getVariables(fields,fieldType='LOOKUP',fieldName=None):
   """Select variables of a particular type to help manage import.

   For LOOKUP the mapped value is the lookup query name; for DATE and
   DOUBLE it is the fieldType tag itself. A concrete fieldName overrides
   the type filter and returns just that field's lookup query.
   """
   if fieldName:
      return {fieldName:fields[fieldName]['lookupQuery']}
   if fieldType=='LOOKUP':
      return {f['name']:f['lookupQuery']
         for f in fields.values() if f['lookupQuery']}
   if fieldType=='DATE':
      return {f['name']:fieldType
         for f in fields.values() if 'dateTime' in f['rangeURI']}
   if fieldType=='DOUBLE':
      return {f['name']:fieldType
         for f in fields.values() if 'double' in f['rangeURI']}
   return {}
+
+
def getLookupMap(pars,fields,fieldName):
   """Return {displayValue: key} for a categorical (lookup) field.

   Reads the lookup target (schema/query/key/display columns) from the
   field metadata and fetches all its rows from the server.

   Raises KeyError when the field carries no lookup metadata.
   """
   try:
      lookup=fields[fieldName]['lookup']
   except KeyError:
      print(fields[fieldName])
      raise KeyError(f'Could not find lookup for {fieldName}')
   schema=lookup['schemaName']
   query=lookup['queryName']
   key=lookup['keyColumn']
   val=lookup['displayColumn']
   project=pars['project']
   db=getDB(pars)
   ds=db.selectRows(project,schema,query,[])
   # BUG FIX: the mapping was built but never returned, so every caller
   # received None
   return {r[val]:r[key] for r in ds['rows']}
+    
def parseLookup(lookup,qv):
   """Convert a raw spreadsheet value to its lookup key.

   Tries the value directly, then (for numpy scalars) its native string
   form; NaN passes through unchanged. As a last resort the value is
   retried with 'Č' mapped to 'C'.
   """
   try:
      return lookup[qv]
   except KeyError:
      pass

   # missing values are propagated as-is
   if pandas.isna(qv):
      return qv

   try:
      native=str(qv.item())
      return lookup[native]
   except AttributeError:
      # plain python value without .item(); fall through
      pass

   return lookup[qv.replace('Č','C')]
+
def asKey(qv):
   """Coerce a value to an integer lookup key.

   Falsy inputs (None, '', 0) are returned unchanged; unparseable values
   report an error and yield None.
   """
   if qv:
      try:
         return int(qv)
      except (TypeError,ValueError):
         print(f'Failed to parse {qv} as key')
         return None
   return qv
+
def parseDate(qv):
   """Convert an XLS cell to an ISO-format date string.

   Accepts datetime.datetime objects, pandas Timestamps and a handful of
   string formats; falsy inputs pass through unchanged and unparseable
   values yield None.
   """
   if not qv:
      return qv

   # from xls format to native python format; 'datetime' and 'pandas'
   # are pseudo-formats handled specially below
   fmts=['datetime','pandas','%d.%m.%y','%Y-%m-%d %H:%M:%S.%f',\
      '%d/%m/%Y','%Y-%m-%d %H:%M:%S']

   for fmt in fmts:
      try:
         if fmt=='pandas':
            date=pandas.Timestamp.to_pydatetime(qv)
            break
         elif fmt=='datetime':
            if not isinstance(qv,datetime.datetime):
               raise TypeError('Not a datetime object')
            # BUG FIX: assignment was unreachable (placed after raise
            # inside the if), so real datetime inputs were never parsed
            date=qv
            break
         else:
            date=datetime.datetime.strptime(qv,fmt)
            break
      except (TypeError,ValueError):
         # wrong type or wrong string format: try the next candidate
         continue

   # sometimes parsing fails and date is never bound
   try:
      return date.isoformat()
   except UnboundLocalError:
      print(f'Failed to parse {qv} as date')
      return None
+
def parseDouble(qv):
   """Parse a value as float; return None when it cannot be parsed."""
   try:
      return float(qv)
   except (TypeError,ValueError):
      # TypeError covers non-numeric inputs such as None, which the
      # original uncaught version crashed on
      return None
+    
+    #m
def setMissingLookupValues(filename,xlsFieldName,project,lookup,\
   labkeyFieldName=None,dryRun=True):
   """Insert values found in an XLSX column but missing from a lookup list.

   filename        - spreadsheet to scan
   xlsFieldName    - column name in the spreadsheet
   project         - labkey project
   lookup          - name of the lookup list to update
   labkeyFieldName - display column of the lookup list (defaults to
                     xlsFieldName)
   dryRun          - when True (default), only print what would be inserted
   """
   df=pandas.read_excel(filename)

   # distinct non-null spreadsheet values; unwrap numpy scalars when present
   vals=set(df.at[r,xlsFieldName] for r in df.index \
      if not pandas.isna(df.at[r,xlsFieldName]))
   try:
      vals={v.item() for v in vals}
   except AttributeError:
      pass
   print(vals)

   if not labkeyFieldName:
      labkeyFieldName=xlsFieldName
   db=connectDB('onko-nix')
   ds=db.selectRows(project,'lists',lookup,[])
   # only record values from labkey (should be unique anyhow)
   setVals=set(r[labkeyFieldName] for r in ds['rows'])
   off=len(setVals)
   missing=sorted(vals-setVals)
   # keys continue after the existing entries
   # BUG FIX: 'columnName' was an undefined name (NameError); the lookup
   # display column is labkeyFieldName
   entries=[{'Key':'{}'.format(i+1+off),labkeyFieldName:m} \
      for i,m in enumerate(missing)]
   print(entries)
   if dryRun:
      return
   db.modifyRows('insert',project,'lists',lookup,entries)
+
def getAliasList(x):
   """Split a comma-separated alias string, honouring quoted segments.

   Commas inside single or double quotes do not split; double quotes are
   stripped from the results and each alias is whitespace-trimmed.
   """
   if not x:
      return []
   # split only on commas followed by an even number of quote characters
   parts=re.split(''',(?=(?:[^'"]|'[^']*'|"[^"]*")*$)''', x)
   return [p.replace('"','').strip() for p in parts]
+
def printErr(resp):
   """Print the server-reported exception from a response, if any."""
   if 'exception' in resp:
      print(resp['exception'])
+     
def findAlias(v,alias):
   """Return the canonical field for XLS column v, or None if unmatched.

   Tries the trimmed name directly, then with the trailing '.N'
   qualifier (attached to duplicates by pandas.read_excel) removed.
   """
   v=v.strip()
   if v in alias:
      return alias[v]
   base=re.sub(r'(.*)\.[0-9]*$',r'\1',v)
   return alias.get(base)
+
def getSequenceNum(pMap,id):
   """Record and return the next sequence number for participant id.

   pMap maps id to the list of sequence numbers assigned so far; the new
   number is appended and returned.
   """
   seqList=pMap.setdefault(id,[])
   seqNum=len(seqList)+1
   seqList.append(seqNum)
   return seqNum
+
+
def entryInList(r,entries):
   """True when some entry in the list matches r on the key fields."""
   return any(entriesMatch(x,r) for x in entries)
+
def entriesMatch(x,r):
   """Do a pair of entries refer to the same participant and visit?

   Matching entries are candidates for updating one from the other.
   """
   keyFields=['ParticipantId','SequenceNum']
   return all(x[f]==r[f] for f in keyFields)
+
def validate(qv):
   """Rough checks on a value read from XLSX: NaN/null-like become None."""
   isNan=False
   try:
      isNan=math.isnan(qv)
   except TypeError:
      # non-numeric values (strings, dates, ...) cannot be NaN-checked here
      pass
   # pandas.isnull additionally catches None, NaT and friends
   if isNan or pandas.isnull(qv):
      return None
   return qv
+
+#modify,getId
+
def importData(pars,filename,getId,modify,convertLookup,dryRun=True):
   """Master routine importing an XLSX file into a labkey dataset.

   Applies the user-supplied callables and updates the relevant database:
     getId(df,r)      - participant id for row r of the dataframe
     modify(qv,v)     - per-cell hook applied to the raw value of column v
     convertLookup(v) - True when column v holds display values that must
                        be translated to lookup keys (else coerced to key)

   Useful fields from pars (d is for default value):
    - skiprows removes irelevant rows (number, d: 0)
    - usecols is EXCEL like notations for cols taken (identifer, d: None for all)
    - sheet_name selects sheet for import (number, d:0 for first sheet)
    - seqNumOffset specify visit/sequenceNum offset (number, d:0 will result in 1)
    - nrows limits number of imported rows (number, d:10; None for all)
    - project - labkey project
    - schema - labkey schema (list/study, d: study)
    - query - labkey query

   With dryRun=True (default) nothing is written to the server.
   """
   skiprows=pars.get('skiprows',0)
   # FIX: usecols was read from pars twice; read it once
   usecols=pars.get('usecols',None)
   sheet_name=pars.get('sheet_name',0)
   # set this as sequenceNum for entries, or initial seqNum if more than
   # a single entry is in the dataset
   seqNumOffset=pars.get('seqNumOffset',0)

   fields=getFields(pars)
   lookupVars=getVariables(fields,fieldType='LOOKUP')
   # dates are only membership-tested below, so keep just the names
   dateVars=list(getVariables(fields,fieldType='DATE').keys())
   doubleVars=getVariables(fields,fieldType='DOUBLE')
   print(f'dateVars: {dateVars}')
   lookupMap={f:getLookupMap(pars,fields,f) for f in lookupVars}
   alias=getAlias(fields)
   print(f'aliases: {alias}')

   df=pandas.read_excel(filename,sheet_name=sheet_name,skiprows=skiprows,\
      usecols=usecols)
   vars=df.columns
   print(vars)

   pMap={}
   print('Index: {}'.format(len(df.index)))
   # NOTE: the default imports only the first 10 rows (debug heritage);
   # set pars['nrows']=None to import the whole file
   idx=df.index[0:pars.get('nrows',10)]

   entries=[]
   for r in idx:
      id=getId(df,r)
      entry={'ParticipantId':id}

      for v in vars:
         qv=validate(df.at[r,v])
         qv=modify(qv,v)
         f=findAlias(v,alias)
         if not f:
            continue
         if f in lookupMap:
            # either translate the display value or coerce it to a key
            if convertLookup(v):
               qv=parseLookup(lookupMap[f],qv)
            else:
               qv=asKey(qv)
         if f in dateVars:
            qv=parseDate(qv)
         if f in doubleVars:
            qv=parseDouble(qv)
         # unwrap numpy scalars to native python types for JSON transport
         try:
            qv=qv.item()
         except AttributeError:
            pass
         entry[f]=qv

      entry['SequenceNum']=getSequenceNum(pMap,id)+seqNumOffset
      entries.append(entry)

   print(entries)

   # split entries into updates of previous incarnations (matching
   # ParticipantId/SequenceNum on the server) and fresh inserts
   db=getDB(pars)
   project=pars.get('project','DCIS/Study')
   schema=pars.get('schema','demographics')
   query=pars.get('query','demographics')

   updateRows=[]
   insertRows=[]
   for entry in entries:
      idFilter={'variable':'ParticipantId',\
         'value':entry['ParticipantId'],'oper':'eq'}
      seqFilter={'variable':'SequenceNum',\
         'value':'{}'.format(entry['SequenceNum']),'oper':'eq'}
      ds=db.selectRows(project,schema,query,[idFilter,seqFilter])
      if ds['rows']:
         row=ds['rows'][0]
         row.update(entry)
         updateRows.append(row)
      else:
         insertRows.append(entry)

   n=len(updateRows)
   print(f'Updating {n} entries')
   if n and not dryRun:
      printErr(db.modifyRows('update',project,schema,query,updateRows))
   n=len(insertRows)
   print(f'Inserting {n} entries')
   if n and not dryRun:
      printErr(db.modifyRows('insert',project,schema,query,insertRows))
+    
+
+