
Modifying minTV to minP when calculating the p-norm of the gradient magnitude images

Open zhanglin-1993 opened this issue 6 years ago • 14 comments

Hello: I'm trying to implement a p-norm version of ASD-POCS, with the denoising step in CUDA. The MATLAB code works well, but the CUDA code keeps failing in different ways. Sometimes NaN voxels appear, which you said might be an eps problem, but setting eps=0.0001 changed nothing; sometimes it complains that the input parameter is not single; and sometimes it tells me that minP is not an available function, even though I compiled it and the mexw64 file is in the proper path. Could you please tell me where the problem is? The only change from minTV.mexw64 to minP.mexw64 is an additional parameter p. Here are the related files.

  1. OS-ASD-POCS: f=minimizeP(f0,dtvg,ng,p);

  2. minimizeP.m:

function img = minimizeP(img,varargin)
if nargin==1
    dtvg=1; ng=30; p=1;
else
    if nargin==4
        dtvg=varargin{1};
        ng=varargin{2};
        p=varargin{3};
    else
        error('Wrong amount of inputs');
    end
end
img=permute(img,[3 2 1]);
img=minP(img,dtvg,ng,p);
img=permute(img,[3 2 1]);
end
  3. minP.cpp:
#include "tmwtypes.h"
#include "mex.h"
#include <math.h>
#include <cmath>
#include "matrix.h"
#include "POCS_P.hpp"
#include <string.h>
void mexFunction(int nlhs , mxArray *plhs[],
int nrhs, mxArray const *prhs[])
{
///////// First check if the number of inputs is right.
    int maxIter;
    float alpha;
    float p;
    if (nrhs==1){
        maxIter=100;
        alpha=15.0f;
        p=1.0f;
    }
    if (nrhs==2){
        mexErrMsgIdAndTxt("err", "Only 1 POCS hyperparameter given");
    }
    if (nrhs==3){
        mexErrMsgIdAndTxt("err", "Only 2 POCS hyperparameters given");
    }
    if (nrhs>4){
        mexErrMsgIdAndTxt("err", "Too many input arguments");
    }
    if (nrhs==4){
        size_t mrows = mxGetM(prhs[1]);
        size_t ncols = mxGetN(prhs[1]);
        if (mrows!=1 || ncols !=1)
                mexErrMsgIdAndTxt("err", "POCS parameters should be 1x1");
        mrows = mxGetM(prhs[2]);
        ncols = mxGetN(prhs[2]);
        if (mrows!=1 || ncols !=1)
                mexErrMsgIdAndTxt("err", "POCS parameters should be 1x1");
        mrows = mxGetM(prhs[3]);
        ncols = mxGetN(prhs[3]);
        if (mrows!=1 || ncols !=1)
                mexErrMsgIdAndTxt("err", "POCS parameters should be 1x1");
        alpha= (float)(mxGetScalar(prhs[1]));
        maxIter=(int)floor(mxGetScalar(prhs[2])+0.5);
        p=(float)(mxGetScalar(prhs[3]));
    }

// First input should be x from (Ax=b), or the image.
mxArray const * const image = prhs[0];
mwSize const numDims = mxGetNumberOfDimensions(image);

// Image should be dim 3
if (numDims!=3){
    mexErrMsgIdAndTxt("err", "Image is not 3D");
}
// Now that input is ok, parse it to C data types.
float const * const imgaux = static_cast<float const *>(mxGetData(image));
const mwSize *size_img= mxGetDimensions(image); //get size of image

float *  img = (float*)malloc(size_img[0] *size_img[1] *size_img[2]* sizeof(float));

for(int i=0;i<size_img[0]*size_img[1]*size_img[2];i++)
    img[i]=(float)imgaux[i];
  

// Allocate output image
float *  imgout = (float*)malloc(size_img[0] *size_img[1] *size_img[2]* sizeof(float));
// call C function with the CUDA denoising

const long imageSize[3]={size_img[0] ,size_img[1],size_img[2] };
pocs_p(img,imgout, alpha, imageSize, maxIter, p); 

// prepare outputs
plhs[0] = mxCreateNumericArray(3,size_img, mxSINGLE_CLASS, mxREAL);
float *mxImgout =(float*) mxGetData(plhs[0]); // mxGetData, not mxGetPr: the output is single, not double

memcpy(mxImgout,imgout,size_img[0] *size_img[1] *size_img[2]*sizeof(float));
//free memory
free(img);
free(imgout);
}
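One of the symptoms described above ("the input parameter couldn't be the single one") is consistent with this MEX function blindly casting prhs[0] to float: if a double array is passed from MATLAB, the reinterpreted bytes become garbage. A minimal guard, as a sketch (the error identifier is illustrative), placed right after the image is parsed:

// Sketch: reject non-single images explicitly instead of reading garbage.
if (!mxIsSingle(prhs[0])) {
    mexErrMsgIdAndTxt("minP:invalidInput",
                      "Image must be a single-precision (single) array");
}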

  4. POCS_P.hpp:
#ifndef POCS_P_HPP
#define POCS_P_HPP
#include "mex.h"
#include "tmwtypes.h"
void pocs_p(const float* img,float* dst,float alpha,const long* image_size, int maxIter,float p);
#endif
  5. pocs_p.cu:
#define MAXTHREADS 1024
#include "POCS_P.hpp"

#define cudaCheckErrors(msg) \
do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
                mexPrintf("ERROR in: %s \n",msg);\
                mexErrMsgIdAndTxt("err",cudaGetErrorString(__err));\
        } \
} while (0)
    
// CUDA kernels
//https://stackoverflow.com/questions/21332040/simple-cuda-kernel-optimization/21340927#21340927
    __global__ void divideArrayScalar(float* vec,float scalar,const size_t n)
    {
        unsigned long long i = (blockIdx.x * blockDim.x) + threadIdx.x;
        for(; i<n; i+=gridDim.x*blockDim.x) {
            vec[i]/=scalar;
        }
    }
    __global__ void multiplyArrayScalar(float* vec,float scalar,const size_t n)
    {
        unsigned long long i = (blockIdx.x * blockDim.x) + threadIdx.x;
        for(; i<n; i+=gridDim.x*blockDim.x) {
            vec[i]*=scalar;
        }
    }
    __global__ void substractArrays(float* vec,float* vec2,const size_t n)
    {
        unsigned long long i = (blockIdx.x * blockDim.x) + threadIdx.x;
        for(; i<n; i+=gridDim.x*blockDim.x) {
            vec[i]-=vec2[i];
        }
    }
    
    __device__ __inline__
            void gradient(const float* u, float* grad,
            long z, long y, long x,
            long depth, long rows, long cols)
    {
        unsigned long size2d = rows*cols;
        unsigned long long idx = z * size2d + y * cols + x;
        
        float uidx = u[idx];
        
        if ( z - 1 >= 0 && z<depth) {
            grad[0] = (uidx-u[(z-1)*size2d + y*cols + x]) ;
        }
        
        if ( y - 1 >= 0 && y<rows){
            grad[1] = (uidx-u[z*size2d + (y-1)*cols + x]) ;
        }
        
        if ( x - 1 >= 0 && x<cols) {
            grad[2] = (uidx-u[z*size2d + y*cols + (x-1)]);
        }
    }
    
    __global__ void gradientTV(const float* f, float* dftv,
            long depth, long rows, long cols, float p){
        unsigned long x = threadIdx.x + blockIdx.x * blockDim.x;
        unsigned long y = threadIdx.y + blockIdx.y * blockDim.y;
        unsigned long z = threadIdx.z + blockIdx.z * blockDim.z;
        unsigned long long idx = z * rows * cols + y * cols + x;
        if ( x >= cols || y >= rows || z >= depth )
            return;
        
        float df[3] ={0,0,0};
        float dfi[3]={0,0,0}; // dfi== \partial f_{i+1,j,k}
        float dfj[3]={0,0,0};
        float dfk[3]={0,0,0};
        gradient(f,df  ,z  ,y  ,x  , depth,rows,cols);
        gradient(f,dfi ,z  ,y  ,x+1, depth,rows,cols);
        gradient(f,dfj ,z  ,y+1,x  , depth,rows,cols);
        gradient(f,dfk ,z+1,y  ,x  , depth,rows,cols);
        float eps=0.0001; // avoid division by zero
        dftv[idx]=(df[0]+df[1]+df[2])*((pow(df[0] *df[0] +df[1] *df[1] +df[2] *df[2],(p-2)/2))+eps)
        -dfi[2]*((pow(dfi[0]*dfi[0]+dfi[1]*dfi[1]+dfi[2]*dfi[2],(p-2)/2))+eps)  
        -dfj[1]*((pow(dfj[0]*dfj[0]+dfj[1]*dfj[1]+dfj[2]*dfj[2],(p-2)/2))+eps)
        -dfk[0]*((pow(dfk[0]*dfk[0]+dfk[1]*dfk[1]+dfk[2]*dfk[2],(p-2)/2))+eps);
        
    }
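    // A note on the NaNs discussed in this thread: for p<2 the exponent
    // (p-2)/2 is negative, so wherever the gradient magnitude is exactly
    // zero (flat regions, which are common), pow(0,(p-2)/2) evaluates to
    // Inf, and adding eps *after* the pow does not help; eps has to go
    // inside it. Also note that gradient() reads u[idx] before its bounds
    // checks, so the calls with x+1, y+1, z+1 can read one element past
    // the volume. A hedged sketch of a safer weight (an assumption about
    // the intended regularization, not the published formula):
    __device__ __inline__ float pnormWeight(const float* d, float p, float eps)
    {
        float mag2 = d[0]*d[0] + d[1]*d[1] + d[2]*d[2];
        return powf(mag2 + eps, (p - 2.0f)*0.5f); // finite even when mag2==0
    }
    // gradientTV would then compute:
    //   dftv[idx] = (df[0]+df[1]+df[2]) * pnormWeight(df ,p,eps)
    //             - dfi[2]              * pnormWeight(dfi,p,eps)
    //             - dfj[1]              * pnormWeight(dfj,p,eps)
    //             - dfk[0]              * pnormWeight(dfk,p,eps);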
    
    __device__ void warpReduce(volatile float *sdata, size_t tid) {
        sdata[tid] += sdata[tid + 32];
        sdata[tid] += sdata[tid + 16];
        sdata[tid] += sdata[tid + 8];
        sdata[tid] += sdata[tid + 4];
        sdata[tid] += sdata[tid + 2];
        sdata[tid] += sdata[tid + 1];
    }
    
    __global__ void  reduceNorm2(float *g_idata, float *g_odata, size_t n){
        extern __shared__ volatile float sdata[];
        //http://stackoverflow.com/a/35133396/1485872
        size_t tid = threadIdx.x;
        size_t i = blockIdx.x*blockDim.x + tid;
        size_t gridSize = blockDim.x*gridDim.x;
        float mySum = 0;
        float value=0;
        while (i < n) {
            value=g_idata[i]; //avoid reading twice
            mySum += value*value;
            i += gridSize;
        }
        sdata[tid] = mySum;
        __syncthreads();
        
        if (tid < 512)
            sdata[tid] += sdata[tid + 512];
        __syncthreads();
        if (tid < 256)
            sdata[tid] += sdata[tid + 256];
        __syncthreads();
        
        if (tid < 128)
            sdata[tid] += sdata[tid + 128];
        __syncthreads();
        
        if (tid <  64)
            sdata[tid] += sdata[tid + 64];
        __syncthreads();
        
        
#if (__CUDA_ARCH__ >= 300)
        if ( tid < 32 )
        {
            mySum = sdata[tid] + sdata[tid + 32];
            for (int offset = warpSize/2; offset > 0; offset /= 2) {
                mySum += __shfl_down(mySum, offset);
            }
        }
#else
        if (tid < 32) {
            warpReduce(sdata, tid);
            mySum = sdata[0];
        }
#endif
        if (tid == 0) g_odata[blockIdx.x] = mySum;
    }
    __global__ void  reduceSum(float *g_idata, float *g_odata, size_t n){
        extern __shared__ volatile float sdata[];
        //http://stackoverflow.com/a/35133396/1485872
        size_t tid = threadIdx.x;
        size_t i = blockIdx.x*blockDim.x + tid;
        size_t gridSize = blockDim.x*gridDim.x;
        float mySum = 0;
       // float value=0;
        while (i < n) {
            mySum += g_idata[i];
            i += gridSize;
        }
        sdata[tid] = mySum;
        __syncthreads();
        
        if (tid < 512)
            sdata[tid] += sdata[tid + 512];
        __syncthreads();
        if (tid < 256)
            sdata[tid] += sdata[tid + 256];
        __syncthreads();
        
        if (tid < 128)
            sdata[tid] += sdata[tid + 128];
        __syncthreads();
        
        if (tid <  64)
            sdata[tid] += sdata[tid + 64];
        __syncthreads();
        
        
#if (__CUDA_ARCH__ >= 300)
        if ( tid < 32 )
        {
            mySum = sdata[tid] + sdata[tid + 32];
            for (int offset = warpSize/2; offset > 0; offset /= 2) {
                mySum += __shfl_down(mySum, offset);
            }
        }
#else
        if (tid < 32) {
            warpReduce(sdata, tid);
            mySum = sdata[0];
        }
#endif
        if (tid == 0) g_odata[blockIdx.x] = mySum;
    }
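    // Portability note, since CUDA 9.0 is mentioned later in this thread:
    // __shfl_down was deprecated in CUDA 9 in favor of the *_sync variants,
    // which take an explicit mask of participating lanes. A sketch of the
    // warp-level tail of the two reductions above for CUDA 9+:
    //
    //     if (tid < 32) {
    //         mySum = sdata[tid] + sdata[tid + 32];
    //         for (int offset = warpSize/2; offset > 0; offset /= 2)
    //             mySum += __shfl_down_sync(0xFFFFFFFF, mySum, offset);
    //     }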
    
    
    
    
// main function
 void pocs_p(const float* img,float* dst,float alpha,const long* image_size, int maxIter,float p){

        size_t total_pixels = image_size[0] * image_size[1]  * image_size[2] ;
        size_t mem_size = sizeof(float) * total_pixels;
        
        float *d_image, *d_dimgTV,*d_norm2aux,*d_norm2;
        // memory for image
        cudaMalloc(&d_image, mem_size);
        cudaCheckErrors("Malloc Image error");
        cudaMemcpy(d_image, img, mem_size, cudaMemcpyHostToDevice);
        cudaCheckErrors("Memory Malloc and Memset: SRC");
        // memory for df
        cudaMalloc(&d_dimgTV, mem_size);
        cudaCheckErrors("Memory Malloc and Memset: TV");
        
        cudaMalloc(&d_norm2, mem_size);
        cudaCheckErrors("Memory Malloc and Memset: NORM");
        
        // memory for the L2-norm auxiliary buffer
        cudaMalloc(&d_norm2aux, sizeof(float)*(total_pixels + MAXTHREADS - 1) / MAXTHREADS);
        cudaCheckErrors("Memory Malloc and Memset: NORMAux");
        
        // For the gradient
        dim3 blockGrad(10, 10, 10);
        dim3 gridGrad((image_size[0]+blockGrad.x-1)/blockGrad.x, (image_size[1]+blockGrad.y-1)/blockGrad.y, (image_size[2]+blockGrad.z-1)/blockGrad.z);
        
        // For the reduction
        float sumnorm2;
        
        for(unsigned int i=0;i<maxIter;i++){

            // Compute the gradient of the TV norm
            gradientTV<<<gridGrad, blockGrad>>>(d_image,d_dimgTV,image_size[2], image_size[1],image_size[0],p);
            cudaCheckErrors("Gradient");
             
            cudaMemcpy(d_norm2, d_dimgTV, mem_size, cudaMemcpyDeviceToDevice);
            cudaCheckErrors("Copy from gradient call error");
            // Compute the L2 norm of the gradient. For that, a reduction is used.
            //REDUCE
            size_t dimblockRed = MAXTHREADS;
            size_t dimgridRed = (total_pixels + MAXTHREADS - 1) / MAXTHREADS;
            reduceNorm2 << <dimgridRed, dimblockRed, MAXTHREADS*sizeof(float) >> >(d_norm2, d_norm2aux, total_pixels);
            cudaCheckErrors("reduce1");
            if (dimgridRed > 1) {
                reduceSum << <1, dimblockRed, MAXTHREADS*sizeof(float) >> >(d_norm2aux, d_norm2, dimgridRed);
                cudaCheckErrors("reduce2");
                cudaMemcpy(&sumnorm2, d_norm2, sizeof(float), cudaMemcpyDeviceToHost);
                cudaCheckErrors("cudaMemcpy");
                
            }
            else {
                cudaMemcpy(&sumnorm2, d_norm2aux, sizeof(float), cudaMemcpyDeviceToHost);
                cudaCheckErrors("cudaMemcpy");
            }
            //mexPrintf("%f ",sqrt(sumnorm2));
            //NORMALIZE
            //in a Tesla, maximum blocks =15 SM * 4 blocks/SM
            divideArrayScalar  <<<60,MAXTHREADS>>>(d_dimgTV,sqrt(sumnorm2),total_pixels);
            cudaCheckErrors("Division error");
            //MULTIPLY HYPERPARAMETER
            multiplyArrayScalar<<<60,MAXTHREADS>>>(d_dimgTV,alpha,   total_pixels);
            cudaCheckErrors("Multiplication error");
            //SUBTRACT GRADIENT
            substractArrays    <<<60,MAXTHREADS>>>(d_image,d_dimgTV, total_pixels);
            cudaCheckErrors("Subtraction error");
            sumnorm2=0;
        }
        
        cudaCheckErrors("TV minimization");
        
        cudaMemcpy(dst, d_image, mem_size, cudaMemcpyDeviceToHost);
        cudaCheckErrors("Copy result back");
        
        cudaFree(d_image);
        cudaFree(d_norm2aux);
        cudaFree(d_dimgTV);
        cudaFree(d_norm2);

        cudaCheckErrors("Memory free");
        
    }
    

zhanglin-1993 avatar Nov 20 '17 07:11 zhanglin-1993

Here is the related code, including the MATLAB code (gradientpnorm.m) and the other 4 CUDA files: minp.zip

zhanglin-1993 avatar Nov 20 '17 08:11 zhanglin-1993

Hi @ilanzhang . I will try to have a look at this, but it is very likely I will not be able to until January.

If you want to debug it to find where the NaNs appear, the best way would be to print some values of the image after each step in the CUDA code, and try to see exactly which line produces the NaNs. That will help pinpoint the source of the errors.
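For what it's worth, a small device-side probe along those lines might look like the sketch below (countNaN and the launch parameters are illustrative, not TIGRE code). Calling it after each kernel in the pocs_p loop narrows down which step first produces NaNs; checking sumnorm2 on the host is also worthwhile, since dividing by sqrt(sumnorm2)==0 would flood the whole image with NaNs too:

// Sketch: count NaN entries of a device array with an atomic counter.
__global__ void countNaN(const float* vec, size_t n, unsigned int* nanCount)
{
    size_t i = blockIdx.x * (size_t)blockDim.x + threadIdx.x;
    for (; i < n; i += (size_t)gridDim.x * blockDim.x) {
        if (isnan(vec[i]))
            atomicAdd(nanCount, 1u);
    }
}
// Usage (after any kernel): zero a device counter with cudaMemset, launch
// countNaN<<<60,MAXTHREADS>>>(d_dimgTV, total_pixels, d_count), copy the
// counter back with cudaMemcpy, and mexPrintf it.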

AnderBiguri avatar Nov 20 '17 13:11 AnderBiguri

Thanks for your reply. I'll try it once I finish my current work.

zhanglin-1993 avatar Dec 13 '17 11:12 zhanglin-1993

@ilanzhang any updates on this?

AnderBiguri avatar Mar 21 '18 15:03 AnderBiguri

@AnderBiguri I have been using the MATLAB version; the CUDA version hasn't been fixed, and I don't know where the bug is.

zhanglin-1993 avatar Mar 30 '18 00:03 zhanglin-1993

@ilanzhang Hmm, it may be helpful if you show me the maths, if you still need help. The TV-norm minimization is, in the end, defined by the derivative of the discrete TV norm. Do you have a generic expression for the derivative of the discrete p-norm?
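For reference, the quantity the gradientTV kernel above computes appears to be (up to the overall factor of $p$, which the normalization step absorbs, and the eps term) the derivative of the discrete functional

$$R_p(f)=\sum_{i,j,k}\left(\|\nabla f\|_{i,j,k}^2\right)^{p/2},\qquad
\frac{\partial R_p}{\partial f_{i,j,k}}\propto
\left(d^x+d^y+d^z\right)_{i,j,k}\,\|\nabla f\|_{i,j,k}^{\,p-2}
- d^x_{i+1,j,k}\,\|\nabla f\|_{i+1,j,k}^{\,p-2}
- d^y_{i,j+1,k}\,\|\nabla f\|_{i,j+1,k}^{\,p-2}
- d^z_{i,j,k+1}\,\|\nabla f\|_{i,j,k+1}^{\,p-2},$$

with backward differences $d^x_{i,j,k}=f_{i,j,k}-f_{i-1,j,k}$ (and likewise for $y$ and $z$); at $p=1$ this reduces to the usual TV gradient.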

AnderBiguri avatar Apr 02 '18 09:04 AnderBiguri

@AnderBiguri Here is the file. Thanks for your help! J20170702_p-norm calculation - to AnderBiguri.docx

zhanglin-1993 avatar Apr 23 '18 07:04 zhanglin-1993

Hi @ilanzhang,

Thanks for this! Interesting approach. You mention that you have submitted a paper based on this? If that is the case, the best thing would be to wait until it is hopefully accepted; then we can have a look at how to include it in TIGRE, or properly make the CUDA work, if that is OK with you!

AnderBiguri avatar May 08 '18 09:05 AnderBiguri

@AnderBiguri Very glad to report that the paper has been accepted. If you have time, could you make the above code work in CUDA?

zhanglin-1993 avatar Mar 16 '19 02:03 zhanglin-1993

@AnderBiguri Another thing that confuses me: I now use MATLAB 2018b, VS2015 and CUDA 9.0 on Windows 7 Ultimate. When compiling, it first says that VS2015 is configured for the compilation of C, but then it shows "Error using mex": it cannot find a supported compiler. Do you know what's wrong here?

zhanglin-1993 avatar Mar 16 '19 03:03 zhanglin-1993

@ilanzhang Congratulations! Could you send me the code by email, along with a small reconstruction demo using it? That would help me greatly; if you share it, I plan to modify it to run on multiple GPUs.

I'd say the most likely cause of your error is that you did not install the C++ compiler when installing VS2015; it's not set up by default. Here you have detailed installation and debugging help: https://github.com/CERN/TIGRE/blob/frontispiece/Frontispiece/MATLAB_installation.md

AnderBiguri avatar Mar 16 '19 12:03 AnderBiguri

@ilanzhang I know this is super late, but just letting you know that I have not forgotten! Unfortunately, implementing this in CUDA is too much for me to take on right now, so I am not sure when I will be able to do it.

AnderBiguri avatar Apr 14 '20 09:04 AnderBiguri

@AnderBiguri That's so nice of you. I'm sure this part will be useful not only for me, but for others too!

zhanglin-1993 avatar Jul 01 '20 01:07 zhanglin-1993

@AnderBiguri There is a small suggestion here: when you add this part some day, it would be good if the regularization could be combined freely. For example, one could apply the L1 norm of the gradient, the L2 norm of the gradient, the L0.8 norm of the gradient, or even choose the (L1-L2) norm of the gradient. There have been some new regularization theories in recent years, and TIGRE's extensibility would be better if it were designed this way; a sketch of what that could look like is below.
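As a hedged sketch of that idea (hypothetical names, not TIGRE API): the only p-dependent part of the gradientTV kernel above is the per-voxel weight applied to each difference term, so factoring it out into a selectable device function would let L1, L2 and Lp penalties share the same machinery (composite penalties such as L1-L2 could then combine two such weights):

// Hypothetical pluggable regularizer weight: maps the squared gradient
// magnitude mag2 to the multiplier used in the gradient kernel.
enum RegType { REG_TV_L1, REG_L2, REG_LP };

__device__ __inline__ float regWeight(RegType t, float mag2, float p, float eps)
{
    switch (t) {
        case REG_TV_L1: return rsqrtf(mag2 + eps);                 // |grad f|^(-1): classic TV
        case REG_L2:    return 1.0f;                               // |grad f|^0: quadratic smoothing
        case REG_LP:    return powf(mag2 + eps, (p - 2.0f)*0.5f);  // |grad f|^(p-2)
    }
    return 1.0f;
}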

zhanglin-1993 avatar Jul 01 '20 01:07 zhanglin-1993