/*
%% Author: Rodrigo de Barros Vimieiro
% Date: March, 2019
% rodrigo.vimieiro@gmail.com
% =========================================================================
%{
% -------------------------------------------------------------------------
%                 backprojectionDDb_mex_CUDA(proj,param)
% -------------------------------------------------------------------------
% DESCRIPTION:
% This function reconstructs the 3D volume from projections, based on
% the Distance-Driven principle. It works by calculating the overlap,
% along the X and Y axes, of the volume and the detector boundaries.
% The geometry is for DBT with half cone-beam. All parameters are set
% in "ParameterSettings" code.
%
% INPUT:
%
% - proj = 2D projections for each angle
% - param = parameters of the whole geometry
% - nProj = number of the projection to be backprojected
%
% OUTPUT:
%
% - data3d = reconstructed volume.
%
% Reference:
% - Branchless Distance Driven Projection and Backprojection,
% Samit Basu and Bruno De Man (2006)
% - GPU Acceleration of Branchless Distance Driven Projection and
% Backprojection, Liu et al (2016)
% - GPU-Based Branchless Distance-Driven Projection and Backprojection,
% Liu et al (2017)
% - A GPU Implementation of Distance-Driven Computed Tomography,
% Ryan D. Wagner (2017)
% ---------------------------------------------------------------------
% Copyright (C) <2019> <Rodrigo de Barros Vimieiro>
%
% This program is free software: you can redistribute it and/or modify
% it under the terms of the GNU General Public License as published by
% the Free Software Foundation, either version 3 of the License, or
% (at your option) any later version.
%
% This program is distributed in the hope that it will be useful,
% but WITHOUT ANY WARRANTY; without even the implied warranty of
% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
% GNU General Public License for more details.
%
% You should have received a copy of the GNU General Public License
% along with this program. If not, see <http://www.gnu.org/licenses/>.
%}
% =========================================================================
%% 3-D Distance Driven Back-projection Code
*/
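/*
% Illustrative MATLAB-side call (a sketch only; the exact gateway signature
% is defined in backprojectionDDb_mex_cuda.h, not in this file):
%
%   data3d = backprojectionDDb_mex_CUDA(proj, param, nProj);
%
% where proj holds the stack of 2D projections, param comes from the
% "ParameterSettings" code, and nProj selects which projection to
% backproject (-1 backprojects all of them, per the host loop below).
*/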
#include "backprojectionDDb_mex_cuda.h"

/****************************************************************************
*                               CUDA Kernels                                *
****************************************************************************/
__global__ void pad_projections_kernel(double* d_img, const int nDetXMap, const int nDetYMap, const int nElem, unsigned int np) {

    const int threadGId = blockIdx.x * blockDim.x + threadIdx.x;

    // Make sure we don't try to access memory outside
    // by having any threads mapped there return early
    if (threadGId >= nElem)
        return;

    d_img[(np*nDetYMap*nDetXMap) + (threadGId*nDetYMap)] = 0;

    return;
}
__global__ void map_boudaries_kernel(double* d_pBound, const int nElem, const double valueLeftBound, const double sizeElem, const double offset) {

    const int threadGId = blockIdx.x * blockDim.x + threadIdx.x;

    // Make sure we don't try to access memory outside
    // by having any threads mapped there return early
    if (threadGId >= nElem)
        return;

    d_pBound[threadGId] = (threadGId - valueLeftBound) * sizeElem + offset;

    return;
}
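/*
Worked example of the boundary mapping above (values illustrative only):
with nElem = 5, valueLeftBound = 2.0, sizeElem = 1.0 and offset = 0.0 the
kernel writes {-2.0, -1.0, 0.0, 1.0, 2.0}, i.e., nElem boundary positions
centered on the origin and spaced by sizeElem. The host code below calls it
with, e.g., valueLeftBound = nDetY/2.0 and sizeElem = dv for the detector
boundaries, and with a negative sizeElem (-du, -dx) when the axis runs in
the opposite direction.
*/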
__global__ void rot_detector_kernel(double* d_pRdetY, double* d_pRdetZ, double* d_pYcoord, double* d_pZcoord, const double yOffset, const double zOffset,
    const double phi, const int nElem) {

    const int threadGId = blockIdx.x * blockDim.x + threadIdx.x;

    // Make sure we don't try to access memory outside
    // by having any threads mapped there return early
    if (threadGId >= nElem)
        return;

    // The rotation angle phi is in radians.
    d_pRdetY[threadGId] = ((d_pYcoord[threadGId] - yOffset)* cos(phi) - (d_pZcoord[threadGId] - zOffset)* sin(phi)) + yOffset;
    d_pRdetZ[threadGId] = ((d_pYcoord[threadGId] - yOffset)* sin(phi) + (d_pZcoord[threadGId] - zOffset)* cos(phi)) + zOffset;

    return;
}
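/*
The kernel above is a plain 2D rotation of each detector boundary point
about the iso-center (yOffset, zOffset):

    [ rY ]   [ cos(phi)  -sin(phi) ] [ y - yOffset ]   [ yOffset ]
    [ rZ ] = [ sin(phi)   cos(phi) ] [ z - zOffset ] + [ zOffset ]

which matches the tube rotation computed on the host (rtubeY/rtubeZ), with
theta in place of phi.
*/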
__global__ void mapDet2Slice_kernel(double* const pXmapp, double* const pYmapp, double tubeX, double tubeY, double tubeZ, double* const pXcoord,
    double* const pYcoord, double* const pZcoord, double* const pZSlicecoord, const int nDetXMap, const int nDetYMap, const unsigned int nz) {

    /*
    Note: MATLAB uses column-major linear indexing, i.e., elements are stored in memory column by column.
    So threads in X are launched in the column direction (DBT Y coord), and threads in Y are launched in the row direction (DBT X coord).
    */

    const int2 thread_2D_pos = make_int2(blockIdx.x * blockDim.x + threadIdx.x,
        blockIdx.y * blockDim.y + threadIdx.y);

    const int thread_1D_pos = thread_2D_pos.y * nDetYMap + thread_2D_pos.x;

    // Make sure we don't try to access memory outside the detector
    // by having any threads mapped there return early
    if (thread_2D_pos.x >= nDetYMap || thread_2D_pos.y >= nDetXMap)
        return;

    pXmapp[thread_1D_pos] = ((pXcoord[thread_2D_pos.y] - tubeX)*(pZSlicecoord[nz] - pZcoord[thread_2D_pos.x]) - (pXcoord[thread_2D_pos.y] * tubeZ) + (pXcoord[thread_2D_pos.y] * pZcoord[thread_2D_pos.x])) / (-tubeZ + pZcoord[thread_2D_pos.x]);

    if (thread_2D_pos.y == 0)
        pYmapp[thread_2D_pos.x] = ((pYcoord[thread_2D_pos.x] - tubeY)*(pZSlicecoord[nz] - pZcoord[thread_2D_pos.x]) - (pYcoord[thread_2D_pos.x] * tubeZ) + (pYcoord[thread_2D_pos.x] * pZcoord[thread_2D_pos.x])) / (-tubeZ + pZcoord[thread_2D_pos.x]);

    return;
}
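/*
Geometry behind the mapping above: each detector boundary (xDet, zDet) is
projected along the ray from the source (tubeX, tubeZ) onto the slice plane
z = zSlice. Expanding the kernel's expression shows it is algebraically
equivalent to the usual similar-triangles intersection, sketched below as a
reference helper (not compiled or called anywhere):

    //__device__ double mapToSlice(double coordDet, double zDet,
    //                             double coordTube, double zTube, double zSlice) {
    //    // Intersection of the ray (tube -> detector boundary) with z = zSlice
    //    return coordTube + (coordDet - coordTube) * (zSlice - zTube) / (zDet - zTube);
    //}

and likewise for the Y mapping with tubeY/yDet.
*/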
__global__ void img_integration_kernel(double* d_img, const int nPixX, const int nPixY, bool direction, unsigned int offsetX, unsigned int offsetY, unsigned int nSlices) {

    /*
    Integration of 2D slices over the whole volume
    (S.1. Integration - Liu et al (2017))

    ** Performs an inclusive scan **
    */

    const int3 memory_2D_pos = make_int3(blockIdx.x * blockDim.x + threadIdx.x + offsetX,
        blockIdx.y * blockDim.y + threadIdx.y + offsetY,
        blockIdx.z * blockDim.z + threadIdx.z);

    const int2 thread_2D_pos = make_int2(blockIdx.x * blockDim.x + threadIdx.x,
        blockIdx.y * blockDim.y + threadIdx.y);

    // Make sure we don't try to access memory outside the detector
    // by having any threads mapped there return early
    if (memory_2D_pos.x >= nPixY || memory_2D_pos.y >= nPixX || memory_2D_pos.z >= nSlices)
        return;

    if (direction == integrateXcoord) {

        for (int s = 1; s <= blockDim.y; s *= 2) {

            int spot = thread_2D_pos.y - s;

            double val = 0;
            if (spot >= 0) {
                val = d_img[(memory_2D_pos.z*nPixY*nPixX) + (offsetY + spot) * nPixY + memory_2D_pos.x];
            }
            __syncthreads();

            if (spot >= 0) {
                d_img[(memory_2D_pos.z*nPixY*nPixX) + (memory_2D_pos.y * nPixY) + memory_2D_pos.x] += val;
            }
            __syncthreads();
        }
    }
    else
    {
        for (int s = 1; s <= blockDim.x; s *= 2) {

            int spot = thread_2D_pos.x - s;

            double val = 0;
            if (spot >= 0) {
                val = d_img[(memory_2D_pos.z*nPixY*nPixX) + memory_2D_pos.y * nPixY + spot + offsetX];
            }
            __syncthreads();

            if (spot >= 0) {
                d_img[(memory_2D_pos.z*nPixY*nPixX) + (memory_2D_pos.y * nPixY) + memory_2D_pos.x] += val;
            }
            __syncthreads();
        }
    }
    return;
}
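/*
The scan above is the classic naive (Hillis-Steele) inclusive prefix sum:
on each pass every element adds the value s positions behind it, with s
doubling each pass. For one row [a, b, c, d]:

    s = 1: [a, a+b, b+c, c+d]
    s = 2: [a, a+b, a+b+c, a+b+c+d]

Running it along X and then along Y turns each padded projection into a 2D
integral image (summed-area table), which is what the branchless
distance-driven interpolation samples.
*/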
//__global__ void bilinear_interp_kernel(double* d_interpData, cudaTextureObject_t texObj, double* const d_pDetmX, double* const d_pDetmY, const int nDetXMap, const int nDetYMap, const double2 normFactor, const double2 offset)
//{
//
//    const int2 thread_2D_pos = make_int2(blockIdx.x * blockDim.x + threadIdx.x,
//        blockIdx.y * blockDim.y + threadIdx.y);
//
//    // Make sure we don't try to access memory outside the detector
//    // by having any threads mapped there return early
//    if (thread_2D_pos.x >= nDetYMap || thread_2D_pos.y >= nDetXMap)
//        return;
//
//    // Normalize coordinates to 0-1 (coords on texture memory)
//    double xQuery = d_pDetmX[thread_2D_pos.y * nDetYMap + thread_2D_pos.x] / normFactor.x + offset.x + (-.5f * (1 / normFactor.x));
//    double yQuery = d_pDetmY[thread_2D_pos.x] / normFactor.y + offset.y + (.5f * (1 / normFactor.y));
//
//    // Read from texture and write to global memory
//    d_interpData[thread_2D_pos.y * nDetYMap + thread_2D_pos.x] = tex2D<double>(texObj, xQuery, yQuery);
//
//    return;
//}
__global__ void bilinear_interpolation_kernel_GPU(double* d_sliceI, double* d_pProj, double* d_pObjX, double* d_pObjY, double* d_pDetmX, double* d_pDetmY, const int nPixXMap, const int nPixYMap,
    const int nDetXMap, const int nDetYMap, const int nDetX, const int nDetY, const unsigned int np) {

    const int2 thread_2D_pos = make_int2(blockIdx.x * blockDim.x + threadIdx.x,
        blockIdx.y * blockDim.y + threadIdx.y);

    // Make sure we don't try to access memory outside the detector
    // by having any threads mapped there return early
    if (thread_2D_pos.x >= nPixYMap || thread_2D_pos.y >= nPixXMap)
        return;

    /*
    S.2. Interpolation - Liu et al (2017)

    Reference:
    - https://en.wikipedia.org/wiki/Bilinear_interpolation
    - https://stackoverflow.com/questions/21128731/bilinear-interpolation-in-c-c-and-cuda

    Note: *** We are using the Unit Square equation ***

    alpha   = X - X1
    1-alpha = X2 - X

    beta   = Y - Y1
    1-beta = Y2 - Y

    ----> Xcoord (thread_2D_pos.y)
    |      0___________
    v      |_d00_|_d10_|
    Ycoord |_d01_|_d11_|
    (thread_2D_pos.x)

    Matlab code:
    objX = nDetX - (objX ./ detmX(1, end - 1));
    objY = (objY ./ detmX(1, end - 1)) - (detmY(1) / detmX(1, end - 1));
    */

    // Adjust the mapped coordinates to cover the range (0-nDetX).*duMap
    // Divide by pixelSize to get a unitary pixel size
    const double xNormData = nDetX - d_pObjX[thread_2D_pos.y] / d_pDetmX[0];
    const signed int xData = floor(xNormData);
    const double alpha = xNormData - xData;

    // Adjust the mapped coordinates to cover the range (0-nDetY).*dyMap
    // Divide by pixelSize to get a unitary pixel size
    const double yNormData = (d_pObjY[thread_2D_pos.x] / d_pDetmX[0]) - (d_pDetmY[0] / d_pDetmX[0]);
    const signed int yData = floor(yNormData);
    const double beta = yNormData - yData;

    double d00, d01, d10, d11;
    if (((xNormData) >= 0) && ((xNormData) <= nDetX) && ((yNormData) >= 0) && ((yNormData) <= nDetY)) d00 = d_pProj[(np*nDetYMap*nDetXMap) + (xData*nDetYMap + yData)]; else d00 = 0.0;
    if (((xData + 1) > 0) && ((xData + 1) <= nDetX) && ((yNormData) >= 0) && ((yNormData) <= nDetY)) d10 = d_pProj[(np*nDetYMap*nDetXMap) + ((xData + 1)*nDetYMap + yData)]; else d10 = 0.0;
    if (((xNormData) >= 0) && ((xNormData) <= nDetX) && ((yData + 1) > 0) && ((yData + 1) <= nDetY)) d01 = d_pProj[(np*nDetYMap*nDetXMap) + (xData*nDetYMap + yData + 1)]; else d01 = 0.0;
    if (((xData + 1) > 0) && ((xData + 1) <= nDetX) && ((yData + 1) > 0) && ((yData + 1) <= nDetY)) d11 = d_pProj[(np*nDetYMap*nDetXMap) + ((xData + 1)*nDetYMap + yData + 1)]; else d11 = 0.0;

    double result_temp1 = alpha * d10 + (-d00 * alpha + d00);
    double result_temp2 = alpha * d11 + (-d01 * alpha + d01);

    d_sliceI[thread_2D_pos.y * nPixYMap + thread_2D_pos.x] = beta * result_temp2 + (-result_temp1 * beta + result_temp1);
}
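/*
Unit-square bilinear interpolation, written out: the kernel computes

    f = (1-alpha)(1-beta)*d00 + alpha*(1-beta)*d10 + (1-alpha)*beta*d01 + alpha*beta*d11

For example, with d00 = 0, d10 = 1, d01 = 2, d11 = 3 and alpha = beta = 0.5
the result is 1.5, the average of the four neighbors. Samples falling
outside [0, nDetX] x [0, nDetY] contribute zero, which reproduces the border
handling of the (commented-out) texture version above.
*/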
__global__ void differentiation_kernel(double* d_pVolume, double* d_sliceI, double tubeX, double rtubeY, double rtubeZ,
    double* const d_pObjX, double* const d_pObjY, double* const d_pObjZ, const int nPixX, const int nPixY, const int nPixXMap,
    const int nPixYMap, const double du, const double dv, const double dx, const double dy, const double dz, const unsigned int nz) {

    const int2 thread_2D_pos = make_int2(blockIdx.x * blockDim.x + threadIdx.x,
        blockIdx.y * blockDim.y + threadIdx.y);

    const int thread_1D_pos = (nPixX*nPixY*nz) + (thread_2D_pos.y * nPixY) + thread_2D_pos.x;

    // Make sure we don't try to access memory outside the detector
    // by having any threads mapped there return early
    if (thread_2D_pos.x >= nPixY || thread_2D_pos.y >= nPixX)
        return;

    /*
    S.3. Differentiation - Eq. 24 - Liu et al (2017)

    Detector integral projection
    ___________
    |_A_|_B_|___|
    |_C_|_D_|___|
    |___|___|___|

    (thread_2D_pos.x, thread_2D_pos.y)
    ________________
    |_A_|__B__|_____|
    |_C_|(0,0)|(0,1)|
    |___|(1,0)|(1,1)|

    Threads are launched from D up to nPixX (thread_2D_pos.y) and nPixY (thread_2D_pos.x),
    i.e., they are running on the detector image. Thread (0,0) is on D.

    Coordinates on the integral projection:

    A = thread_2D_pos.y * nPixYMap + thread_2D_pos.x
    B = ((thread_2D_pos.y+1) * nPixYMap) + thread_2D_pos.x
    C = thread_2D_pos.y * nPixYMap + thread_2D_pos.x + 1
    D = ((thread_2D_pos.y+1) * nPixYMap) + thread_2D_pos.x + 1
    */

    unsigned int coordA = thread_2D_pos.y * nPixYMap + thread_2D_pos.x;
    unsigned int coordB = ((thread_2D_pos.y + 1) * nPixYMap) + thread_2D_pos.x;
    unsigned int coordC = coordA + 1;
    unsigned int coordD = coordB + 1;

    // X-ray angle in the X coord
    double gamma = atan((d_pObjX[thread_2D_pos.y] + (dx / 2.0) - tubeX) / (rtubeZ - d_pObjZ[nz]));

    // X-ray angle in the Y coord
    double alpha = atan((d_pObjY[thread_2D_pos.x] + (dy / 2.0) - rtubeY) / (rtubeZ - d_pObjZ[nz]));

    double dA, dB, dC, dD;

    dA = d_sliceI[coordA];
    dB = d_sliceI[coordB];
    dC = d_sliceI[coordC];
    dD = d_sliceI[coordD];

    // Treat border of interpolated integral detector
    if (dC == 0 && dD == 0) {
        dC = dA;
        dD = dB;
    }

    // S.3. Differentiation - Eq. 24 - Liu et al (2017)
    d_pVolume[thread_1D_pos] += ((dD - dC - dB + dA)*(du*dv*dz / (cos(alpha)*cos(gamma)*dx*dy)));

    return;
}
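/*
Why (dD - dC - dB + dA) recovers the cell value: for a 2D integral image I,
the sum of the underlying image over the rectangle with corners A, B, C, D
is the mixed finite difference I(D) - I(C) - I(B) + I(A), the standard
summed-area-table lookup. The du*dv*dz / (cos(alpha)*cos(gamma)*dx*dy)
factor is the geometric weighting of Eq. 24 in Liu et al (2017), accounting
for the detector and voxel areas and the oblique ray direction.
*/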
__global__ void division_kernel(double* d_img, const int nPixX, const int nPixY, unsigned int nSlices, unsigned int nProj) {

    const int3 thread_3D_pos = make_int3(blockIdx.x * blockDim.x + threadIdx.x,
        blockIdx.y * blockDim.y + threadIdx.y,
        blockIdx.z * blockDim.z + threadIdx.z);

    const int thread_1D_pos = (nPixX*nPixY*thread_3D_pos.z) + (thread_3D_pos.y * nPixY) + thread_3D_pos.x;

    // Make sure we don't try to access memory outside the volume
    // by having any threads mapped there return early
    if (thread_3D_pos.x >= nPixY || thread_3D_pos.y >= nPixX || thread_3D_pos.z >= nSlices)
        return;

    d_img[thread_1D_pos] /= (double) nProj;
}
/****************************************************************************
* function: backprojectionDDb() - CUDA Branchless Distance-Driven backprojection *
****************************************************************************/
void backprojectionDDb(double* const h_pVolume,
    double* const h_pProj,
    double* const h_pTubeAngle,
    double* const h_pDetAngle,
    const double idXProj,
    const unsigned int nProj,
    const unsigned int nPixX,
    const unsigned int nPixY,
    const unsigned int nSlices,
    const unsigned int nDetX,
    const unsigned int nDetY,
    const double dx,
    const double dy,
    const double dz,
    const double du,
    const double dv,
    const double DSD,
    const double DDR,
    const double DAG)
{
    // Unique GPU
    int devID = 0;

    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, devID);
    cudaCheckErrors("cudaGetDeviceProperties");

    const unsigned int maxThreadsPerBlock = deviceProp.maxThreadsPerBlock;

    mexPrintf("GPU Device %d: \"%s\" with compute capability %d.%d has %d Multi-Processors and %zu bytes of global memory\n\n", devID,
        deviceProp.name, deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount, deviceProp.totalGlobalMem);

    // Create timer variable and start it
    StopWatchInterface *timer = NULL;
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);

    //cudaStream_t stream1;
    //cudaStreamCreate(&stream1);

    dim3 threadsPerBlock(1, 1, 1);
    dim3 blockSize(1, 1, 1);

    // Number of mapped detector and pixel boundaries
    const int nDetXMap = nDetX + 1;
    const int nDetYMap = nDetY + 1;
    const int nPixXMap = nPixX + 1;
    const int nPixYMap = nPixY + 1;

    // Convention: h_ variables live on host
    // Convention: d_ variables live on device (GPU global memory)
    double* d_pProj;
    double* d_sliceI;
    double* d_pVolume;
    double* d_pTubeAngle;
    double* d_pDetAngle;

    // Allocate global memory on the device, place result in "d_----"
    cudaMalloc((void **)&d_pProj, nDetXMap*nDetYMap*nProj * sizeof(double));
    cudaMalloc((void **)&d_sliceI, nPixXMap*nPixYMap * sizeof(double));
    cudaMalloc((void **)&d_pVolume, nPixX*nPixY*nSlices * sizeof(double));
    cudaMalloc((void **)&d_pTubeAngle, nProj * sizeof(double));
    cudaMalloc((void **)&d_pDetAngle, nProj * sizeof(double));
    cudaCheckErrors("cudaMalloc Initial");

    // Copy data from host memory "h_----" to device memory "d_----"
    cudaMemcpy((void *)d_pTubeAngle, (void *)h_pTubeAngle, nProj * sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy((void *)d_pDetAngle, (void *)h_pDetAngle, nProj * sizeof(double), cudaMemcpyHostToDevice);
    cudaCheckErrors("cudaMemcpy Angles");

    /*
    Copy projection data from host memory "h_----" to device memory "d_----",
    padding with zeros for the image integration
    */

    // Initialize first column and row with zeros
    double* h_pProj_tmp;
    double* d_pProj_tmp;

    threadsPerBlock.x = maxThreadsPerBlock;
    blockSize.x = (nDetXMap / maxThreadsPerBlock) + 1;

    for (unsigned int np = 0; np < nProj; np++) {

        // Pad on X coord direction (first row of each padded projection)
        pad_projections_kernel <<<blockSize, threadsPerBlock>>> (d_pProj, nDetXMap, nDetYMap, nDetXMap, np);

        // Pad on Y coord direction (remaining nDetY entries of the first column)
        d_pProj_tmp = d_pProj + (nDetXMap*nDetYMap*np) + 1;
        cudaMemset(d_pProj_tmp, 0, nDetY * sizeof(double));
        cudaCheckErrors("cudaMemset Padding Projections");
    }

    // Copy projection data from host memory
    for (unsigned int np = 0; np < nProj; np++)
        for (unsigned int c = 0; c < nDetX; c++) {

            h_pProj_tmp = h_pProj + (c *nDetY) + (nDetX*nDetY*np);
            d_pProj_tmp = d_pProj + (((c + 1) *nDetYMap) + 1) + (nDetXMap*nDetYMap*np);

            cudaMemcpy((void *)d_pProj_tmp, (void *)h_pProj_tmp, nDetY * sizeof(double), cudaMemcpyHostToDevice);
            cudaCheckErrors("cudaMemcpy Projections");
        }
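    /*
    Layout of one padded projection after the copy above (column-major,
    nDetYMap = nDetY+1 rows by nDetXMap = nDetX+1 columns):

        0   0    0   ...  0
        0  p11  p12  ... p1n
        0  p21  p22  ... p2n
        ...

    The extra zero row and column give the inclusive scans a proper origin,
    so the resulting integral image I satisfies I(0, :) = I(:, 0) = 0.
    */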
    // Pointers for projection coordinates
    double* d_pDetX;
    double* d_pDetY;
    double* d_pDetZ;
    double* d_pObjX;
    double* d_pObjY;
    double* d_pObjZ;

    // Allocate global memory on the device for projection coordinates
    cudaMalloc((void **)&d_pDetX, nDetXMap * sizeof(double));
    cudaMalloc((void **)&d_pDetY, nDetYMap * sizeof(double));
    cudaMalloc((void **)&d_pDetZ, nDetYMap * sizeof(double));
    cudaMalloc((void **)&d_pObjX, nPixXMap * sizeof(double));
    cudaMalloc((void **)&d_pObjY, nPixYMap * sizeof(double));
    cudaMalloc((void **)&d_pObjZ, nSlices * sizeof(double));
    cudaCheckErrors("cudaMalloc Coordinates");

    // Pointers for mapped coordinates
    double* d_pDetmY;
    double* d_pDetmX;

    // Allocate global memory on the device for mapped coordinates
    cudaMalloc((void **)&d_pDetmY, nDetYMap * sizeof(double));
    cudaMalloc((void **)&d_pDetmX, nDetYMap * nDetXMap * sizeof(double));
    cudaCheckErrors("cudaMalloc Map-Coordinates");

    // Pointers for rotated detector coords
    double* d_pRdetY;
    double* d_pRdetZ;

    // Allocate global memory on the device for rotated detector coords
    cudaMalloc((void **)&d_pRdetY, nDetYMap * sizeof(double));
    cudaMalloc((void **)&d_pRdetZ, nDetYMap * sizeof(double));
    cudaCheckErrors("cudaMalloc Rot-Coordinates");

    // Generate detector and object boundaries
    threadsPerBlock.x = maxThreadsPerBlock;

    blockSize.x = (nDetX / maxThreadsPerBlock) + 1;
    map_boudaries_kernel <<<blockSize, threadsPerBlock>>> (d_pDetX, nDetXMap, (double)nDetX, -du, 0.0);

    blockSize.x = (nDetY / maxThreadsPerBlock) + 1;
    map_boudaries_kernel <<<blockSize, threadsPerBlock>>> (d_pDetY, nDetYMap, nDetY / 2.0, dv, 0.0);

    blockSize.x = (nPixX / maxThreadsPerBlock) + 1;
    map_boudaries_kernel <<<blockSize, threadsPerBlock>>> (d_pObjX, nPixXMap, (double)nPixX, -dx, 0.0);

    blockSize.x = (nPixY / maxThreadsPerBlock) + 1;
    map_boudaries_kernel <<<blockSize, threadsPerBlock>>> (d_pObjY, nPixYMap, nPixY / 2.0, dy, 0.0);

    blockSize.x = (nSlices / maxThreadsPerBlock) + 1;
    map_boudaries_kernel <<<blockSize, threadsPerBlock>>> (d_pObjZ, nSlices, 0.0, dz, DAG + (dz / 2.0));

    //mexPrintf("Map Boundaries -- Threads:%d Blocks:%d \n", threads, blocks);

    // Initialize variables with 0
    cudaMemset(d_pDetZ, 0, nDetYMap * sizeof(double));
    cudaMemset(d_pVolume, 0, nPixX * nPixY * nSlices * sizeof(double));
    cudaCheckErrors("cudaMemset Zeros");

    //cudaDeviceSynchronize();

    // X-ray tube initial position
    double tubeX = 0;
    double tubeY = 0;
    double tubeZ = DSD;

    // Iso-center position
    double isoY = 0;
    double isoZ = DDR;

    // Integration of the 2D projections over the whole projection set
    // (S.1. Integration - Liu et al (2017))

    // Naive integration on the X coord
    threadsPerBlock.x = 10;
    threadsPerBlock.y = 10;
    threadsPerBlock.z = 10;

    blockSize.x = (unsigned int)ceil(((double)nDetYMap / (threadsPerBlock.x - 1)));
    blockSize.y = 1;
    blockSize.z = (unsigned int)ceil((double)nProj / threadsPerBlock.z);

    //mexPrintf("Divisao: %f \n", (double)nDetY / (threadsPerBlock.x - 1));
    //mexPrintf("Divisao ceil: %d \n", (unsigned int)ceil((double)nDetY / (threadsPerBlock.x - 1)));

    for (int k = 0; k < ceil((double)nDetXMap / (threadsPerBlock.x - 1)); k++) {
        img_integration_kernel <<<blockSize, threadsPerBlock>>> (d_pProj, nDetXMap, nDetYMap, integrateXcoord, 0, k * 9, nProj);
        //mexPrintf("integration kernel -- threadsX:%d blocksX:%d threadsY:%d blocksY:%d \n", threadsPerBlock.x, blockSize.x, threadsPerBlock.y, blockSize.y);
    }

    // Naive integration on the Y coord
    threadsPerBlock.x = 10;
    threadsPerBlock.y = 10;
    threadsPerBlock.z = 10;

    blockSize.x = 1;
    blockSize.y = (unsigned int)ceil((double)nDetXMap / (threadsPerBlock.y - 1));
    blockSize.z = (unsigned int)ceil((double)nProj / threadsPerBlock.z);

    for (int k = 0; k < ceil((double)nDetYMap / (threadsPerBlock.y - 1)); k++) {
        img_integration_kernel <<<blockSize, threadsPerBlock>>> (d_pProj, nDetXMap, nDetYMap, integrateYcoord, k * 9, 0, nProj);
        //mexPrintf("integration kernel -- threadsX:%d blocksX:%d threadsY:%d blocksY:%d \n", threadsPerBlock.x, blockSize.x, threadsPerBlock.y, blockSize.y);
    }
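    /*
    Block-by-block scan stitching: each launch scans a 10-element tile along
    the integration direction, and the loop advances the offset by 9
    (blockDim - 1), so consecutive tiles overlap by one element. The
    overlapping element already holds the running sum of everything before
    it, so the next tile's inclusive scan continues the global prefix sum
    without a separate carry pass.
    */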
    // Pointer to the second-to-last column of d_pDetmX (MATLAB's detmX(1, end-1)),
    // used as the pixel-size normalizer in the interpolation kernel
    double* d_pDetmX_tmp = d_pDetmX + (nDetYMap * (nDetXMap - 2));

    // Test whether we will loop over all projections or run a single one
    unsigned int projIni, projEnd, nProj2Run;
    if (idXProj == -1) {
        projIni = 0;
        projEnd = nProj;
        nProj2Run = nProj;
    }
    else {
        nProj2Run = 1;
        projIni = (unsigned int) idXProj;
        projEnd = (unsigned int) idXProj + 1;
    }
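    /*
    Usage of idXProj (passed in from the MEX gateway as a double):
    idXProj == -1 backprojects every projection and normalizes by nProj;
    any value in [0, nProj-1] backprojects that single projection, with
    nProj2Run = 1 so the final division leaves its contribution unscaled.
    */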
    // For each projection
    for (unsigned int p = projIni; p < projEnd; p++) {

        // Get the specific tube angle for the projection
        double theta = h_pTubeAngle[p] * M_PI / 180.0;

        // Get the specific detector angle for the projection
        double phi = h_pDetAngle[p] * M_PI / 180.0;

        //mexPrintf("Tube angle:%f Det angle:%f\n", theta, phi);

        // Tube rotation
        double rtubeY = ((tubeY - isoY)*cos(theta) - (tubeZ - isoZ)*sin(theta)) + isoY;
        double rtubeZ = ((tubeY - isoY)*sin(theta) + (tubeZ - isoZ)*cos(theta)) + isoZ;

        //mexPrintf("R tube Y:%f R tube Z:%f\n", rtubeY, rtubeZ);

        // Detector rotation
        threadsPerBlock.x = maxThreadsPerBlock;
        threadsPerBlock.y = 1;
        threadsPerBlock.z = 1;

        blockSize.x = (nDetYMap / maxThreadsPerBlock) + 1;
        blockSize.y = 1;
        blockSize.z = 1;

        rot_detector_kernel <<<blockSize, threadsPerBlock>>> (d_pRdetY, d_pRdetZ, d_pDetY, d_pDetZ, isoY, isoZ, phi, nDetYMap);

        //cudaDeviceSynchronize();
        //mexPrintf("Detector rotation -- Threads:%d Blocks:%d \n", threads, blocks);

        // For each slice
        for (unsigned int nz = 0; nz < nSlices; nz++) {

            /*
            Map detector onto XY plane (inside the projection loop in case the detector rotates)

            *** Note: MATLAB uses column-major linear indexing, i.e., elements are stored in memory column by column.
            So threads in X are launched in the column direction (DBT Y coord), and threads in Y are launched in the row direction (DBT X coord).
            */
            threadsPerBlock.x = 32;
            threadsPerBlock.y = 32;
            threadsPerBlock.z = 1;

            blockSize.x = (nDetYMap / threadsPerBlock.x) + 1;
            blockSize.y = (nDetXMap / threadsPerBlock.y) + 1;
            blockSize.z = 1;

            mapDet2Slice_kernel <<<blockSize, threadsPerBlock>>> (d_pDetmX, d_pDetmY, tubeX, rtubeY, rtubeZ, d_pDetX, d_pRdetY, d_pRdetZ, d_pObjZ, nDetXMap, nDetYMap, nz);

            //cudaDeviceSynchronize();
            //mexPrintf("Map detector onto XY plane -- ThreadsX:%d ThreadsY:%d BlocksX:%d BlocksY:%d\n", threadsPerBlock.x, threadsPerBlock.y, blockSizeX, blockSizeY);

            /*
            Create texture memory (kept for reference; the texture path is disabled)
            */
            /*
            // Allocate CUDA array in device memory
            cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);

            cudaArray* d_imageArray;
            cudaMallocArray(&d_imageArray, &channelDesc, nPixXMap, nPixYMap);

            // Copy to texture memory
            checkCudaErrors(cudaMemcpyToArray(d_imageArray, 0, 0, d_pSlice, nPixXMap* nPixYMap * sizeof(double), cudaMemcpyDeviceToDevice));

            // Specify texture
            struct cudaResourceDesc resDesc;
            memset(&resDesc, 0, sizeof(resDesc));
            resDesc.resType = cudaResourceTypeArray;
            resDesc.res.array.array = d_imageArray;

            // Specify texture object parameters
            struct cudaTextureDesc texDesc;
            memset(&texDesc, 0, sizeof(texDesc));
            texDesc.addressMode[0] = cudaAddressModeBorder;
            texDesc.addressMode[1] = cudaAddressModeBorder;
            texDesc.filterMode = cudaFilterModeLinear;
            texDesc.readMode = cudaReadModeElementType;
            texDesc.normalizedCoords = true;

            // Create texture object
            cudaTextureObject_t texObj = 0;
            cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);

            // Allocate result of transformation in device memory
            double* d_sliceI;
            cudaMalloc(&d_sliceI, nDetXMap*nDetYMap * sizeof(double));

            // Invoke kernel
            blockSize.x = (nDetYMap / threadsPerBlock.x) + 1;
            blockSize.y = (nDetXMap / threadsPerBlock.y) + 1;

            const double2 normFactor = make_double2(nPixX*-dx, nPixY*dy);
            const double2 offset = make_double2(1, 0.5);

            bilinear_interp_kernel <<<blockSize, threadsPerBlock>>> (d_sliceI, texObj, d_pDetmX, d_pDetmY, nDetXMap, nDetYMap, normFactor, offset);

            //mexPrintf("Texture interpolation -- Threads:%d Blocks:%d \n", threads, blocks);

            // Destroy texture object
            cudaDestroyTextureObject(texObj);
            */

            /*
            S.2. Interpolation - Liu et al (2017)
            */
            blockSize.x = (nPixYMap / threadsPerBlock.x) + 1;
            blockSize.y = (nPixXMap / threadsPerBlock.y) + 1;

            bilinear_interpolation_kernel_GPU <<<blockSize, threadsPerBlock>>> (d_sliceI, d_pProj, d_pObjX, d_pObjY, d_pDetmX_tmp, d_pDetmY, nPixXMap, nPixYMap, nDetXMap, nDetYMap, nDetX, nDetY, p);

            /*
            S.3. Differentiation - Eq. 24 - Liu et al (2017)
            */
            blockSize.x = (nPixY / threadsPerBlock.x) + 1;
            blockSize.y = (nPixX / threadsPerBlock.y) + 1;

            differentiation_kernel <<<blockSize, threadsPerBlock>>> (d_pVolume, d_sliceI, tubeX, rtubeY, rtubeZ, d_pObjX, d_pObjY, d_pObjZ, nPixX, nPixY, nPixXMap, nPixYMap, du, dv, dx, dy, dz, nz);

        } // Loop end slices

    } // Loop end projections
    // Normalize the volume by dividing by the number of backprojected projections
    threadsPerBlock.x = 10;
    threadsPerBlock.y = 10;
    threadsPerBlock.z = 10;

    blockSize.x = (nPixY / threadsPerBlock.x) + 1;
    blockSize.y = (nPixX / threadsPerBlock.y) + 1;
    blockSize.z = (nSlices / threadsPerBlock.z) + 1;

    division_kernel <<<blockSize, threadsPerBlock>>> (d_pVolume, nPixX, nPixY, nSlices, nProj2Run);

    // Debug snippets (kept for reference):

    //double* const h_var1 = (double*)mxMalloc(1 * sizeof(double));
    //cudaMemcpy((void *)h_var1, (void *)d_pDetmX_tmp, 1 * sizeof(double), cudaMemcpyDeviceToHost);
    //mexPrintf("P: %d Nz: %d Var:%f \n", p, nz, h_var1[0]);

    // d_pDetmX / d_pDetmY
    //double* const h_pPixmX = (double*)mxMalloc(nDetXMap*nDetYMap * sizeof(double));
    //double* const h_pPixmY = (double*)mxMalloc(nDetYMap * sizeof(double));
    //cudaMemcpy((void *)h_pPixmX, (void *)d_pDetmX, nDetXMap*nDetYMap * sizeof(double), cudaMemcpyDeviceToHost);
    //cudaMemcpy((void *)h_pPixmY, (void *)d_pDetmY, nDetYMap * sizeof(double), cudaMemcpyDeviceToHost);
    //for (int u = 0; u < nDetXMap; u++)
    //    for (int v = 0; v < nDetYMap; v++) {
    //        h_pProj[(0 * nDetYMap*nDetXMap) + (u*nDetYMap) + v] = h_pPixmX[(u*nDetYMap) + v];
    //        h_pProj[(1 * nDetYMap*nDetXMap) + (u*nDetYMap) + v] = h_pPixmY[v];
    //    }

    // d_sliceI
    //double* const h_var = (double*)mxMalloc(nPixXMap*nPixYMap * sizeof(double));
    //cudaMemcpy((void *)h_var, (void *)d_sliceI, nPixXMap*nPixYMap * sizeof(double), cudaMemcpyDeviceToHost);
    //for (int u = 0; u < nPixXMap; u++)
    //    for (int v = 0; v < nPixYMap; v++) {
    //        h_pVolume[(0 * nPixYMap*nPixXMap) + (u*nPixYMap) + v] = h_var[(u*nPixYMap) + v];
    //    }

    // d_pProj
    //cudaMemcpy((void *)h_pVolume, (void *)d_pProj, nProj*nDetXMap*nDetYMap * sizeof(double), cudaMemcpyDeviceToHost);

    // Copy the reconstructed volume back to the host
    cudaMemcpy((void *)h_pVolume, (void *)d_pVolume, nSlices* nPixX * nPixY * sizeof(double), cudaMemcpyDeviceToHost);
    cudaCheckErrors("cudaMemcpy Final");

    cudaFree(d_pProj);
    cudaFree(d_sliceI);
    cudaFree(d_pVolume);
    cudaFree(d_pTubeAngle);
    cudaFree(d_pDetAngle);
    cudaFree(d_pDetX);
    cudaFree(d_pDetY);
    cudaFree(d_pDetZ);
    cudaFree(d_pObjX);
    cudaFree(d_pObjY);
    cudaFree(d_pObjZ);
    cudaFree(d_pDetmY);
    cudaFree(d_pDetmX);
    cudaFree(d_pRdetY);
    cudaFree(d_pRdetZ);
    cudaCheckErrors("cudaFree Final");

    sdkStopTimer(&timer);
    mexPrintf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
    sdkDeleteTimer(&timer);

    cudaDeviceReset();

    return;
}