{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "3ef7e184-24ef-463d-9bcf-a2ab28e1f3cf", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "loadLibrary\n", "remoteSourcesURL https://git0.fmf.uni-lj.si/studen/nixSuite/raw/master/remoteResources/resources.json\n", "{'labkeyInterface': {'url': 'https://git0.fmf.uni-lj.si/studen/labkeyInterface/archive/master.zip', 'branch': 'master', 'modules': []}, 'irAEMM': {'url': 'https://git0.fmf.uni-lj.si/studen/iraemm/archive/master.zip', 'branch': 'master', 'modules': ['iraemmBrowser']}, 'SlicerLabkeyExtension': {'url': 'https://git0.fmf.uni-lj.si/studen/SlicerLabkeyExtension/archive/SlicerExtensionIndex.zip', 'branch': 'SlicerExtensionIndex', 'modules': ['labkeyBrowser']}, 'limfomiPET': {'url': 'https://git0.fmf.uni-lj.si/studen/limfomiPET/archive/master.zip', 'branch': 'master', 'modules': ['imageBrowser', 'segmentationBrowser']}, 'parseConfig': {'url': 'https://git0.fmf.uni-lj.si/studen/parseConfig/archive/master.zip', 'branch': 'master', 'modules': []}, 'orthancInterface': {'url': 'https://git0.fmf.uni-lj.si/studen/orthancInterface/archive/master.zip', 'branch': 'master', 'modules': []}}\n", "{'url': 'https://git0.fmf.uni-lj.si/studen/labkeyInterface/archive/master.zip', 'branch': 'master', 'modules': []}\n", "File C:\\Users\\studen\\temp\\labkeyInterface.zip: True\n" ] } ], "source": [ "import sys\n", "import os\n", "import chardet\n", "import json\n", "import re\n", "import openpyxl\n", "import pandas\n", "\n", "#you should get nixSuite via git clone https://git0.fmf.uni-lj.si/studen/nixSuite.git\n", "#if you don't put it to $HOME/software/src/, you should update the path\n", "nixSuite=os.path.join(os.path.expanduser('~'),'software','src','nixSuite')\n", "sys.path.append(os.path.join(nixSuite,'wrapper'))\n", "import nixWrapper\n", "nixWrapper.loadLibrary('labkeyInterface')\n", "import labkeyInterface\n", "import labkeyDatabaseBrowser\n", "import labkeyFileBrowser\n", "\n", "def connectDB(server):\n", " #check connectivity. This checks the configuration in $HOME/.labkey/network.json, \n", " #where paths to certificates are stored\n", " net=labkeyInterface.labkeyInterface()\n", " fconfig=os.path.join(os.path.expanduser('~'),'.labkey','{}.json'.format(server))\n", " net.init(fconfig)\n", " #this reports the certificate used\n", " try:\n", " print('Using: {}'.format(net.connectionConfig['SSL']['user']))\n", " except KeyError:\n", " pass\n", " #This gets a deafult CSRF code; It should report user name plus a long string of random hex numbers\n", " net.getCSRF()\n", " db=labkeyDatabaseBrowser.labkeyDB(net)\n", " fb=labkeyFileBrowser.labkeyFileBrowser(net)\n", " return db\n", " \n", "def readXLSX(file):\n", " wb=openpyxl.load_workbook(filename=file)\n", " ws=wb.worksheets[0]\n", " df=pandas.DataFrame(ws.values)\n", " df=useFirstLineAsVarNames(df)\n", " return df\n", " \n", "def useFirstLineAsVarNames(df):\n", " columnNames = df.iloc[0] \n", " df = df[1:] \n", " df.columns = columnNames\n", " return df\n", "\n", "def renameColumns(df,nameMap):\n", " return df.rename(columns=nameMap)\n", "\n", "def formatDemographics(row):\n", " vMap={'birthDate':'birthDate','MIB1':'MIB1','stage':'stage','performanceStatusWHO':'performanceStatusWHO'}\n", " return {vMap[v]:row[v] for v in vMap}\n", " \n", "def modifyRowsWithCheck(db,project,schema,query,entry,testFields):\n", " mode='insert'\n", " filters=[{'variable':x,'value':'{}'.format(entry[x]),'oper':'eq'} for x in testFields]\n", " ds=db.selectRows(project,schema,query,filters)\n", " finalEntry={}\n", " if len(ds['rows'])==1:\n", " finalEntry=ds['rows'][0]\n", " mode='update'\n", " for q in entry:\n", " finalEntry[q]=entry[q];\n", " print(db.modifyRows(mode,project,schema,query,[finalEntry])) " ] }, { "cell_type": "code", "execution_count": 3, "id": "3cc1bbd2-d8cf-4c5f-a69b-d0772d0bcf2e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "User: andrej studen CSRF: d644aaab64bf058cff784f6c63582087\n", "User: andrej studen CSRF: 60cdf6b330e5d58e9d88a8ff01f72841\n", "User: andrej studen CSRF: 97b65fef3c1c1ec1ac4a3235cc991e32\n", " + 1\n", " - 0\n", " + 1\n", " - 0\n", " + 1\n", " - 0\n", " + 1\n", " + 1\n", " + 1\n", " - 0\n", " + 1\n", " + 1\n", " - 0\n", " - 0\n", " - 0\n", " - 0\n", " + 1\n", " + 1\n", " - 0\n", " - 0\n", " + 1\n", " - 0\n", " - 0\n", " - 0\n", " - 0\n", " + 1\n", " - 0\n", " - 0\n", " - 0\n", " + 1\n", " - 0\n", " - 0\n", " + 1\n", " - 0\n", " + 1\n", " - 0\n", " - 0\n", " - 0\n", " - 0\n", " - 0\n", " - 0\n", " - 0\n", " - 0\n", " - 0\n", " - 0\n", " - 0\n", " - 0\n", " - 0\n", " - 0\n", " - 0\n", " - 0\n", " - 0\n", " + 1\n", " + 1\n", " - 0\n", " - 0\n", " - 0\n", " - 0\n", " + 1\n", " + 1\n", " + 1\n", " - 0\n", " + 1\n", " - 0\n", " + 1\n", " - 0\n", " - 0\n", " + 1\n", " + 1\n", " - 0\n", " - 0\n", "None supplied\n", " - 0\n", " - 0\n", " - 0\n", " - 0\n", " - 0\n", " + 1\n", " - 0\n", " - 0\n", " - 0\n", " + 1\n", "None supplied\n", " - 0\n", " - 0\n", " + 1\n", " - 0\n", " + 1\n", " - 0\n", " + 1\n", " - 0\n", " - 0\n", " - 0\n", " + 1\n", " - 0\n", " - 0\n", " + 1\n", " - 0\n", " - 0\n", " + 1\n", " - 0\n", " - 0\n", " - 0\n", " - 0\n", " + 1\n", " - 0\n", " + 1\n", " - 0\n", " - 0\n", " - 0\n", " - 0\n", " - 0\n", " + 1\n", " - 0\n", " + 1\n", " - 0\n", " + 1\n", " - 0\n", " + 1\n", " + 1\n", " - 0\n", " + 1\n", " - 0\n", " - 0\n", " - 0\n", " - 0\n", " + 1\n", " - 0\n", " + 1\n", " - 0\n", " - 0\n", " + 1\n", " - 0\n", " - 0\n", " + 1\n", " - 0\n", " - 0\n", " - 0\n", " - 0\n", " - 0\n", " + 1\n", " - 0\n", " + 1\n", " + 1\n", " + 1\n", " + 1\n", " - 0\n", " - 0\n", " + 1\n", " - 0\n", " - 0\n", " - 0\n", " - 0\n", " + 1\n", " - 0\n", " - 0\n", " - 0\n", " - 0\n", " - 0\n", " + 1\n", " + 1\n", " - 0\n", " - 0\n", " - 0\n", " + 1\n", " - 0\n", " + 1\n", " - 0\n", " - 0\n", " + 1\n", " - 0\n", " + 1\n", " - 0\n", " - 0\n", " + 1\n", " + 1\n", " - 0\n", " - 0\n", " - 0\n", " - 0\n", " + 1\n", " + 1\n", " + 1\n", " - 0\n", " + 1\n", " - 0\n", "II.A II.A\n", "IV.B.E. IV.B.E\n", "IV.B.E. IV.B.E\n", "IV.B.E. IV.B.E\n", "IV.A.E. IV.A.E\n", "III.A. III.A\n", "Iv.A IV.A\n", "IV.B. IV.B\n", "IV IV\n", "IV.B. IV.B\n", "IV.A.X. IV.A.X\n", "IV.B.E IV.B.E\n", "IV.B.S.E. IV.B.S.E\n", "IV.B.E.S. IV.B.E.S\n", "IV IV\n", "IV.A.E IV.A.E\n", "IV.B.E IV.B.E\n", "IV.A.E. IV.A.E\n", "IV.A.E. IV.A.E\n", "IV.B.X.E. IV.B.X.E\n", "IV.B.S.E. IV.B.S.E\n", "IV.A.X.E. IV.A.X.E\n", "IV.A.E. IV.A.E\n", "IV.A.E. IV.A.E\n", "IV.B.E. IV.B.E\n", "IV.B. IV.B\n", "IV.A. IV.A\n", "IV.A.E. IV.A.E\n", "IV.A IV.A\n", "II.A II.A\n", "III.A.S. III.A.S\n", "II.A.e. II.A.E\n", "IV.A.X IV.A.X\n", "III.B.E III.B.E\n", "IV IV\n", "IV.B IV.B\n", "IV IV\n", "IV.X.B. IV.X.B\n", "II.A.E. II.A.E\n", "II.B.E. II.B.E\n", "II II\n", "II.B II.B\n", "II.A II.A\n", "IV.B.E IV.B.E\n", "II.A.E. II.A.E\n", "IV.A.E. IV.A.E\n", "IV.A. IV.A\n", "IV.A.E. IV.A.E\n", "IV.B.E. IV.B.E\n", "IV.A.S. IV.A.S\n", "II.A. II.A\n", "IV.A.E.X. IV.A.E.X\n", "II.A.E. II.A.E\n", "IV.B.X.E. IV.B.X.E\n", "III.A. III.A\n", "II.A.X.E. II.A.X.E\n", "IV.A.E. IV.A.E\n", "II.A. II.A\n", "II.A. II.A\n", "IV IV\n", "II.A. II.A\n", "II.A. II.A\n", "IV.B.S. IV.B.S\n", "II.B. II.B\n", "IV.A.X.E.S IV.A.X.E.S\n", "III.A. III.A\n", "III.B.X. III.B.X\n", "III.A.X. III.A.X\n", "II.A. II.A\n", "IV.B.E. IV.B.E\n", "IV.A.E. IV.A.E\n", "IV.S. IV.S\n", "II.A.E. II.A.E\n", "IV.B.E. IV.B.E\n", "III.A. III.A\n", "II.A. II.A\n", "IV.A. IV.A\n", "IV.B.E. IV.B.E\n", "II.A. II.A\n", "IV.A.E. IV.A.E\n", "III.A.E. III.A.E\n", "IV.B.E. IV.B.E\n", "II.B.E. II.B.E\n", "IV.B.E. IV.B.E\n", "IV.B. IV.B\n", "II.A.E. II.A.E\n", "IV.B.E. IV.B.E\n", "II.A.X. II.A.X\n", "IV.A.S.E. IV.A.S.E\n", "IV.B.X IV.B.X\n", "II.A. II.A\n", "IV. IV\n", "IV. IV\n", "IV.A. IV.A\n", "II.A.E. II.A.E\n", "IV.B. IV.B\n", "II.A.E.X. II.A.E.X\n", "II.A. II.A\n", "II.A. II.A\n", "II.A.E. II.A.E\n", "IV.B.E. IV.B.E\n", "IV.A IV.A\n", "IV.B.E. IV.B.E\n", "IV.B.E. IV.B.E\n", "IV.B.X. IV.B.X\n", "II.A II.A\n", "IV.A. IV.A\n", "III.A.S.E. III.A.S.E\n", "IV.A.X. IV.A.X\n", "II.B. II.B\n", "IV.B.E. IV.B.E\n", "IV.B. IV.B\n", "IV.A.E. IV.A.E\n", "III.A.S. III.A.S\n", "IV.A.E. IV.A.E\n", "IV.B. IV.B\n", "IV.A. IV.A\n", "IV.A.E.S. IV.A.E.S\n", "IV.B.X. IV.B.X\n", "IV.B.X. IV.B.X\n", "IV.A.E.S. IV.A.E.S\n", "IV.B.E. IV.B.E\n", "III.A. III.A\n", "IV.A.E. IV.A.E\n", "II.A.S. II.A.S\n", "IV.B.E. IV.B.E\n", "IV.A.S. IV.A.S\n", "III.B.X. III.B.X\n", "IV.B:E. IV.B.E\n", "II II\n", "IV.A. IV.A\n", "IV.A IV.A\n", "II.A. II.A\n", "IV.A.E. IV.A.E\n", "II.A.E. II.A.E\n", "IV.A. IV.A\n", "IV.A. IV.A\n", "II.A.E. II.A.E\n", "IV.A. IV.A\n", "II.B.X.E. II.B.X.E\n", "III.A.E. III.A.E\n", "IV.A. IV.A\n", "IV.B.E. IV.B.E\n", "IV.A.E. IV.A.E\n", "IV.A.E. IV.A.E\n", "IV.B.E. IV.B.E\n", "III.B.X.S. III.B.X.S\n", "IV.A.E. IV.A.E\n", "IV.A.E. IV.A.E\n", "IV.B. IV.B\n", "III.B.S. III.B.S\n", "II.A. II.A\n", "IV.B.E.S. IV.B.E.S\n", "III.A. III.A\n", "IV.B.E. IV.B.E\n", "III.A.S.X. III.A.S.X\n", "III.A.S.X. III.A.S.X\n", "IV.B. IV.B\n", "III III\n", "IV.A.E. IV.A.E\n", "IV.B.S.X.E. IV.B.S.X.E\n", "IV.A.E. IV.A.E\n", "III.A. III.A\n", "II.A.E. II.A.E\n", "III.A. III.A\n", "II.B.X. II.B.X\n", "IV.A.E IV.A.E\n", "II.A.E. II.A.E\n", "II.E.B. II.E.B\n", "II.B.E. II.B.E\n", "IV.B. IV.B\n", "IV.B:X. IV.B.X\n", "III.B.E. III.B.E\n", "IV.B.E. IV.B.E\n", "IV.B. IV.B\n", "IV.A.X. IV.A.X\n", "III.B.E. III.B.E\n", "IV.B.S. IV.B.S\n", "II.A.E. II.A.E\n", "ii.a.x. II.A.X\n", "iv.b.x. IV.B.X\n", "II.A. II.A\n", "III.A.X. III.A.X\n", "II.X.B.E II.X.B.E\n", "IV.A. IV.A\n", "IV.A.E. IV.A.E\n" ] }, { "data": { "text/plain": [ "['II',\n", " 'II.A',\n", " 'II.A.E',\n", " 'II.A.E.X',\n", " 'II.A.S',\n", " 'II.A.X',\n", " 'II.A.X.E',\n", " 'II.B',\n", " 'II.B.E',\n", " 'II.B.X',\n", " 'II.B.X.E',\n", " 'II.E.B',\n", " 'II.X.B.E',\n", " 'III',\n", " 'III.A',\n", " 'III.A.E',\n", " 'III.A.S',\n", " 'III.A.S.E',\n", " 'III.A.S.X',\n", " 'III.A.X',\n", " 'III.B.E',\n", " 'III.B.S',\n", " 'III.B.X',\n", " 'III.B.X.S',\n", " 'IV',\n", " 'IV.A',\n", " 'IV.A.E',\n", " 'IV.A.E.S',\n", " 'IV.A.E.X',\n", " 'IV.A.S',\n", " 'IV.A.S.E',\n", " 'IV.A.X',\n", " 'IV.A.X.E',\n", " 'IV.A.X.E.S',\n", " 'IV.B',\n", " 'IV.B.E',\n", " 'IV.B.E.S',\n", " 'IV.B.S',\n", " 'IV.B.S.E',\n", " 'IV.B.S.X.E',\n", " 'IV.B.X',\n", " 'IV.B.X.E',\n", " 'IV.S',\n", " 'IV.X.B']" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#update enum and provide parsers\n", "setup={}\n", "setup['server']='onko-nix'\n", "setup['project']='limfomiPET/Study2023'\n", "\n", "\n", "def getNameMap():\n", " nameMap={}\n", " #nameMap['vpisna št.']='ParticipantId'\n", " nameMap['OI št.']='ParticipantId'\n", " nameMap['datum diagnoze']='diagnosisDate'\n", " nameMap['spol M Z']='sex'\n", " nameMap['rojstni datum']='birthDate'\n", " nameMap['MIB-1 (%)']='MIB1'\n", " nameMap['stadij']='stage'\n", " nameMap['performance status (WHO)']='performanceStatusWHO'\n", " nameMap['vrsta KT']='chemotherapyType'\n", " nameMap['število ciklov KT']='chemotherapyCycles'\n", " nameMap['datum prvega cikla KT']='firstCTCycleDate'\n", " nameMap['datum konca KT']='endCTDate'\n", " nameMap['RT po KT +/-']='radiotherapyAfterChemotherapy'\n", " nameMap['odgovor na celotno zdravljenje KT +/- RT']='treatmentResponse'\n", " nameMap['datum potrditve recidiva/ progresa']='progressRegressionConfirmationDate'\n", " nameMap['datum evalvacije']='treatmentEvaluationDate'\n", " nameMap['stanje evaluacije brez/z bol']='treatmentEvaluationStatus'\n", " nameMap['stanje evaluacije bolezni brez 0/z 1']='treatmentEvaluationStatusNumeric'\n", " nameMap['stanje živ 1/mrtev 0']='survivalStatus'\n", " nameMap['datum smrti']='deathDate'\n", " return nameMap\n", " \n", "\n", "def parseXMLS(file):\n", " df=readXLSX(file)\n", " nameMap=getNameMap()\n", "\n", " #format new dataset based on the data above\n", " df=renameColumns(df,nameMap)\n", " #print(df.iloc[4,:])\n", " return df\n", "\n", "def parseStage(stage):\n", " mapStage={'Iv.A':'IV.A','II.A.e':'II.A.E','I.V.A.X':'IV.A.X','I IV.A.E':'IV.A.E','ni':None,'IV.B:E':'IV.B.E','IV.B:X':'IV.B.X','ii.a.x':'II.A.X','iv.b.x':'IV.B.X'}\n", " if not stage:\n", " return stage\n", " if stage[-1]=='.':\n", " stage=stage[:-1]\n", " if stage in mapStage:\n", " stage=mapStage[stage]\n", " return stage\n", "\n", "def parseType(t):\n", " mapType={'R-CHOP':'RCHOP','RCHOp':'RCHOP','rchop':'RCHOP','chop':'CHOP','80% RCOEP':'RCOEP 80%','rchop 50%':'RCHOP 50%','rchop + mtx':'RCHOP+MTX',\n", " 'mini RCHOP':'RCHOP MINI','MINI RCHOP':'RCHOP MINI','Repoch':'REPOCH','R-mini CHOEP':'R-MINI CHOEP'}\n", " if not t:\n", " return t\n", " if t in mapType:\n", " t=mapType[t]\n", " return t\n", "\n", "def parseRadiotherapy(x):\n", " if x==None: \n", " return None\n", " if x.find('+')>-1:\n", " return 1\n", " return 0\n", "\n", "def parseResponse(x):\n", " if x==None:\n", " return None\n", " if x.find('+')>-1:\n", " return 1\n", " if x.find('relaps')>-1:\n", " return 0\n", " if x.find('CR')>-1:\n", " return 1\n", " return 0\n", "\n", "def parseParticipant(x):\n", " #remove all spaces\n", " return x.replace(' ','')\n", " #removes a single initial space\n", " pattern=r'^ ([^ ]*) $'\n", " m=re.match(pattern,x)\n", " if not m:\n", " return x\n", " return m[1]\n", "\n", "def parseStatus(x):\n", " if x==None:\n", " return None\n", " if x.find('brez')>-1:\n", " return 1\n", " return 0\n", "\n", "def testEnum(df,field,parse):\n", " values=set()\n", " df=df.reset_index()\n", " for index, r in df.iterrows():\n", " s=r[field]\n", " if not s:\n", " print('None supplied')\n", " continue\n", " s1=parse(s)\n", " if s1==None:\n", " continue\n", " values.add(s1)\n", " print('{} {}'.format(s,s1))\n", "\n", " return sorted(values)\n", "\n", "\n", "def getStageEnum(df):\n", " stages=set()\n", " df=df.reset_index()\n", " for index, r in df.iterrows():\n", " s=r['stage']\n", " if not s:\n", " continue\n", " s1=parseStage(s)\n", " if not s1:\n", " continue\n", " stages.add(s1)\n", " #print('{} {}'.format(s,s1))\n", "\n", " return sorted(stages)\n", "\n", "def getCTTypeEnum(df):\n", " types=set()\n", " df=df.reset_index()\n", " for index, r in df.iterrows():\n", " s=r['chemotherapyType']\n", " if not s:\n", " continue\n", " s1=parseType(s)\n", " if not s1:\n", " continue\n", " types.add(s1)\n", " #print('{} {}'.format(s,s1))\n", "\n", " return sorted(types)\n", " \n", "def setRows(db,df,query,getEnum,field):\n", " newList=getEnum(df)\n", " db=connectDB('onko-nix')\n", " project='limfomiPET/Study'\n", " schema='lists'\n", " #query='enumStage'\n", " \n", " ds=db.selectRows(project,schema,query,[])\n", " valSet=set([r[field] for r in ds['rows']])\n", " newSet=set(newList)\n", " \n", "\n", " newRows=newSet.difference(valSet)\n", " m=len(newRows)\n", " if m==0:\n", " return\n", " \n", " db.modifyRows('delete',project,schema,query,ds['rows'])\n", " n=len(newList)\n", " rows=[{'Key':i,field:s} for i,s in zip(range(n),newList)]\n", " print(db.modifyRows('insert',project,schema,query,rows))\n", " \n", " \n", "def setStageRows(db,df):\n", " setRows(db,df,'enumStage',getStageEnum,'stage')\n", " \n", "def setCTTypeRows(db,df):\n", " setRows(db,df,'enumCTType',getCTTypeEnum,'ctType')\n", " \n", " \n", "file=os.path.join('Z:','studija_limfom_23_5.xlsx')\n", "df=parseXMLS(file)\n", "df.chemotherapyType.unique()\n", "db=connectDB(setup['server'])\n", "setStageRows(db,df)\n", "setCTTypeRows(db,df)\n", "testEnum(df,'radiotherapyAfterChemotherapy',parseRadiotherapy)\n", "testEnum(df,'stage',parseStage)\n" ] }, { "cell_type": "code", "execution_count": 12, "id": "2b17aa16-c0bf-4b23-ad3d-c569aa42c70b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "User: andrej studen CSRF: 5288f5c7dfec8489f8cb44fe6978b295\n", "User: andrej studen CSRF: a5e5420ee22c6b317873b420f8ec7ca2\n", "User: andrej studen CSRF: b06982b7522d2d10db97c1b469461ba7\n", "User: andrej studen CSRF: 8512bf9325cc3968e06023a139c00cbb\n", "User: andrej studen CSRF: c1a6f5b40241125f0c72bd116834ada4\n", "User: andrej studen CSRF: 0f90630e8f5dc2e7c688327a65d328fb\n", "User: andrej studen CSRF: f22c3491f2a567ce456e1a5f3bc19db4\n", "User: andrej studen CSRF: 7ce762e872dee9ace6975aa23c552d4d\n", "User: andrej studen CSRF: 69fca10bad715f56ebda06201f204321\n", "User: andrej studen CSRF: 191de91c32d266a8e5b2fad0274ea22e\n", "User: andrej studen CSRF: 8921046580e54b3fa48136826f79bc33\n", "User: andrej studen CSRF: 105be1034fd52cfaae51a24277418b7e\n", "User: andrej studen CSRF: 402eb86e8d9c809ec81882eb9976f66c\n", "User: andrej studen CSRF: 9cdb88e74380e257a7be118cad0e768f\n", "User: andrej studen CSRF: 4f15ce0fa6e1858b46c94aba4fa933b4\n", "User: andrej studen CSRF: 25a68faca230843c7a34cf4a68d1500e\n", "User: andrej studen CSRF: cc297931288e644213403e09397cdf88\n", "User: andrej studen CSRF: e203f43591841075e8b6868f1845214e\n", "User: andrej studen CSRF: 948873267a136ddee160c0a0f7918f5c\n", "User: andrej studen CSRF: d0c67dcc99526666df4f41f3572b69ea\n", "User: andrej studen CSRF: 83dae14e9aea5c252f76687dd63e9941\n", "User: andrej studen CSRF: 232164a76d57276b975490495236e6e5\n", "User: andrej studen CSRF: c3a34e7b952f084874d6eea6cf9ea13b\n", "User: andrej studen CSRF: 5b9abc04b76df217d065fdbfc33106ad\n", "User: andrej studen CSRF: aedda793810eef663cec3615c03b1a86\n", "User: andrej studen CSRF: 51b4df1b983d91cfb8b284bdf1c33849\n", "User: andrej studen CSRF: e967122348caed0c0d051a7cbdddb956\n", "User: andrej studen CSRF: 4636e9c5fb39767f71a8c15bc16b50c1\n", "User: andrej studen CSRF: e73e0c910d3c6f47937d7d6c53d43673\n", "User: andrej studen CSRF: d19a10e65ccf468ef84c44ef3fb61aac\n", "User: andrej studen CSRF: 14cd317f789b8aa76ffb4b19c8322d8e\n", "User: andrej studen CSRF: 1b19f1a93660c9b88e5b9741ab341772\n", "User: andrej studen CSRF: 10ac95921d45c92d40f2ab3240e11de3\n", "User: andrej studen CSRF: 78cb79d3f90d3a47413061c38a07eee4\n", "User: andrej studen CSRF: 40526ec7298d8df9d98ec2d1397d4724\n", "User: andrej studen CSRF: f418a146bc332d05f8c4526422287c7c\n", "User: andrej studen CSRF: 0f9d3adf60db6064909ff2eef4064004\n", "User: andrej studen CSRF: 8168ab244380d205561e1e86177ca2e0\n", "User: andrej studen CSRF: 98c5fd69789cb6422b400cdf2e18a940\n", "User: andrej studen CSRF: 960a29c6c41b6f2230f0c620cea331c9\n", "User: andrej studen CSRF: 02b510dca2e2cec3bda656f175f522b2\n", "User: andrej studen CSRF: 6145dcc9d60164c9fbe027fadb2389f4\n", "User: andrej studen CSRF: 352cbfe5b4705aeaef6539c8fe5b4707\n", "User: andrej studen CSRF: d9b5abdbd376cb584a5d0872d21799c9\n", "User: andrej studen CSRF: 213128fb0614085ded30759fb249f0af\n", "User: andrej studen CSRF: 1903d6bd34dd82e808e1749561f79765\n", "User: andrej studen CSRF: a54080abe2b20351464224d23217c000\n", "User: andrej studen CSRF: 52611eef60a35fc71df1df11efa30bdc\n", "User: andrej studen CSRF: 19174ca27890ba5750899876b627a198\n", "User: andrej studen CSRF: f2b6d5c83576053fef214b6f3cfe4da6\n", "Inserted 4, updated 45, failed 0 of 49\n" ] } ], "source": [ "#load by datasets\n", "setup={}\n", "setup['server']='onko-nix'\n", "setup['project']='limfomiPET/Study2023'\n", "\n", "def initStatus(rows=None):\n", " status={'insert':0,'update':0,'failed':0,'total':0} \n", " if rows:\n", " status['total']=len(rows)\n", " return status\n", "\n", "def addStatus(s1,s2):\n", " for x in s1:\n", " s1[x]+=s2[x]\n", " return s1\n", "\n", "def printStatus(s):\n", " print('Inserted {}, updated {}, failed {} of {}'.format(s['insert'],s['update'],s['failed'],s['total']))\n", " \n", "def checkLoad(db,project,schema,query,rows,filterFields):\n", " count=initStatus(rows)\n", " for r in rows:\n", " filters=[{'variable':x,'value':'{}'.format(r[x]),'oper':'eq'} for x in filterFields]\n", " ds=db.selectRows(project,schema,query,filters)\n", " entry={}\n", " mode='insert'\n", " #print('Rows {}'.format(len(ds['rows'])))\n", " if len(ds['rows'])==1:\n", " \n", " entry=ds['rows'][0]\n", " mode='update'\n", " for x in r:\n", " entry[x]=r[x]\n", " respData=db.modifyRows(mode,project,schema,query,[entry])\n", " encoding=chardet.detect(respData)[\"encoding\"]\n", " status=json.loads(respData.decode(encoding))\n", " try:\n", " print(status['exception'])\n", " count['failed']+=1\n", " except KeyError:\n", " count[mode]+=1\n", " continue\n", " #print('Inserted {}, updated {}, failed {}'.format(count['insert'],count['update'],count['failed']))\n", " return count\n", " \n", "def checkField(x):\n", " try:\n", " if x.find('ni')>-1:\n", " return None\n", " if x.find('NI')>-1:\n", " return None\n", " if x.find('?')>-1:\n", " return None\n", " if x.find('-')>-1:\n", " return None\n", " except AttributeError:\n", " return x\n", " return x\n", " \n", "def checkDate(x):\n", " try:\n", " return x.isoformat()\n", " except AttributeError:\n", " return None\n", " return None\n", "\n", "\n", "def loadCancer(df,setup):\n", " db=connectDB(setup['server'])\n", " project=setup['project']\n", " schema='lists'\n", " ds=db.selectRows(project,schema,'enumCTType',[])\n", " ctType={r['ctType']:r['Key'] for r in ds['rows']}\n", " ds=db.selectRows(project,schema,'enumStage',[])\n", " stageType={r['stage']:r['Key'] for r in ds['rows']}\n", " \n", " schema='study'\n", " query='cancerData'\n", " df=df.reset_index()\n", " fields=['ParticipantId','diagnosisDate','MIB1','IPI','LDH','performanceStatusWHO']\n", " \n", " status=initStatus()\n", " for index, r in df.iterrows():\n", " x={f:r[f] for f in fields}\n", " x['ParticipantId']=parseParticipant(r['ParticipantId'])\n", " x['SequenceNum']=0\n", " try:\n", " x['stage']=stageType[parseStage(r['stage'])]\n", " except KeyError:\n", " x['stage']=None\n", " x['diagnosisDate']=checkDate(x['diagnosisDate'])\n", " for v in ['performanceStatusWHO','MIB1','IPI','LDH']:\n", " x[v]=checkField(x[v])\n", " \n", " count=checkLoad(db,project,schema,query,[x],['ParticipantId','SequenceNum'])\n", " status=addStatus(status,count)\n", " \n", " printStatus(status)\n", " \n", " \n", "def loadTreatment(df,setup):\n", " db=connectDB(setup['server'])\n", " project=setup['project']\n", " schema='lists'\n", " ds=db.selectRows(project,schema,'enumCTType',[])\n", " ctType={r['ctType']:r['Key'] for r in ds['rows']}\n", " schema='study'\n", " query='cancerTreatment'\n", " df=df.reset_index()\n", " fields=['ParticipantId','chemotherapyCycles']\n", " status=initStatus()\n", " for index, r in df.iterrows():\n", " x={f:r[f] for f in fields}\n", " x['ParticipantId']=parseParticipant(r['ParticipantId'])\n", " x['SequenceNum']=0\n", " try:\n", " x['chemotherapyType']=ctType[parseType(r['chemotherapyType'])]\n", " except KeyError:\n", " x['chemotherapyType']=None\n", " x['firstCTCycleDate']=checkDate(r['firstCTCycleDate'])\n", " x['endCTDate']=checkDate(r['endCTDate'])\n", " x['chemotherapyCycles']=checkField(x['chemotherapyCycles'])\n", " x['radiotherapy']=parseRadiotherapy(r['radiotherapyAfterChemotherapy'])\n", " count=checkLoad(db,project,schema,query,[x],['ParticipantId','SequenceNum'])\n", " status=addStatus(status,count)\n", " printStatus(status)\n", "\n", "def loadEndOfTreatment(df,setup):\n", " db=connectDB(setup['server'])\n", " project=setup['project']\n", " schema='lists'\n", " ds=db.selectRows(project,schema,'enumEventType',[])\n", " eType={r['eventType']:r['Key'] for r in ds['rows']}\n", " ds=db.selectRows(project,schema,'enumEventOutcome',[])\n", " eOutcome={r['eventOutcome']:r['Key'] for r in ds['rows']}\n", " schema='study'\n", " query='events'\n", " eTypeId=eType['END TREATMENT']\n", " df=df.reset_index()\n", " fields=['ParticipantId']\n", " status=initStatus()\n", " for index, r in df.iterrows():\n", " x={f:r[f] for f in fields}\n", " x['ParticipantId']=parseParticipant(r['ParticipantId'])\n", " if r['treatmentResponse']==None:\n", " continue\n", " x['SequenceNum']=0\n", " x['eventDate']=checkDate(r['endCTDate'])\n", " x['eventOutcome']=eOutcome['RECURRENCE']\n", " if parseResponse(r['treatmentResponse'])==1:\n", " x['eventOutcome']=eOutcome['REMISSION']\n", " x['eventType']=eTypeId\n", " count=checkLoad(db,project,schema,query,[x],['ParticipantId','SequenceNum'])\n", " status=addStatus(status,count)\n", " printStatus(status)\n", " \n", "def loadEvaluation(df,setup):\n", " db=connectDB(setup['server'])\n", " project=setup['project']\n", " schema='lists'\n", " ds=db.selectRows(project,schema,'enumEventType',[])\n", " eType={r['eventType']:r['Key'] for r in ds['rows']}\n", " ds=db.selectRows(project,schema,'enumEventOutcome',[])\n", " eOutcome={r['eventOutcome']:r['Key'] for r in ds['rows']}\n", " schema='study'\n", " query='events'\n", " eTypeId=eType['EVALUATION']\n", " df=df.reset_index()\n", " fields=['ParticipantId']\n", " status=initStatus()\n", " for index, r in df.iterrows():\n", " x={f:r[f] for f in fields}\n", " x['ParticipantId']=parseParticipant(r['ParticipantId'])\n", " if r['treatmentEvaluationStatus']==None:\n", " continue\n", " x['SequenceNum']=1\n", " x['eventDate']=checkDate(r['treatmentEvaluationDate'])\n", " x['eventOutcome']=eOutcome['RECURRENCE']\n", " if parseStatus(r['treatmentEvaluationStatus'])==1:\n", " x['eventOutcome']=eOutcome['REMISSION']\n", " x['eventType']=eTypeId\n", " count=checkLoad(db,project,schema,query,[x],['ParticipantId','SequenceNum'])\n", " status=addStatus(status,count)\n", " printStatus(status)\n", " \n", "def loadDeath(df,setup):\n", " db=connectDB(setup['server'])\n", " project=setup['project']\n", " schema='lists'\n", " ds=db.selectRows(project,schema,'enumEventType',[])\n", " eType={r['eventType']:r['Key'] for r in ds['rows']}\n", " ds=db.selectRows(project,schema,'enumEventOutcome',[])\n", " eOutcome={r['eventOutcome']:r['Key'] for r in ds['rows']}\n", " schema='study'\n", " query='events'\n", " eTypeId=eType['DEATH']\n", " df=df.reset_index()\n", " fields=['ParticipantId']\n", " status=initStatus()\n", " for index, r in df.iterrows():\n", " x={f:r[f] for f in fields}\n", " x['ParticipantId']=parseParticipant(r['ParticipantId'])\n", " if r['survivalStatus']==None:\n", " continue\n", " if r['deathDate']==None:\n", " continue\n", " x['SequenceNum']=2\n", " x['eventDate']=checkDate(r['deathDate'])\n", " if r['survivalStatus']==1:\n", " continue\n", " x['eventOutcome']=eOutcome['DEATH']\n", " x['eventType']=eTypeId\n", " count=checkLoad(db,project,schema,query,[x],['ParticipantId','SequenceNum'])\n", " status=addStatus(status,count)\n", " printStatus(status)\n", " \n", "\n", "#loadCancer(df,setup)\n", "#loadTreatment(df,setup)\n", "#loadEndOfTreatment(df,setup)\n", "#loadEvaluation(df,setup)\n", "#loadDeath(df,setup)" ] }, { "cell_type": "code", "execution_count": 49, "id": "88a5201f-7967-4be1-9634-da942d7504f8", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "61 7281/16\n", "Name: ParticipantId, dtype: object\n", "62 7489/16\n", "Name: ParticipantId, dtype: object\n" ] } ], "source": [ "#find entries in labkey that are not in excel\n", "def findRedundantLabkeyEntries(df,setup,query):\n", " xId=df['ParticipantId'].tolist()\n", " xId=[parseParticipant(x) for x in xId]\n", " db=connectDB(setup['server'])\n", " project=setup['project']\n", " schema='study'\n", " #query='cancerData'\n", " ds=db.selectRows(project,schema,query,[])\n", " yId=[r['ParticipantId'] for r in ds['rows']]\n", " xs=set(xId)\n", " ys=set(yId)\n", " print(ys-xs)\n", "\n", "#findRedundantLabkeyEntries(df,setup,'cancerTreatment')\n", "#find entries with missing data\n", "print(df.loc[df['treatmentEvaluationStatus'].isnull(),\"ParticipantId\"])\n", "print(df.loc[df['treatmentResponse'].isnull(),\"ParticipantId\"])\n" ] }, { "cell_type": "code", "execution_count": null, "id": "21933e6e-2ca8-4d34-bb47-fd0bfb296236", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.0" } }, "nbformat": 4, "nbformat_minor": 5 }