Error occurred after calling culaDcooGmresIlu0 many times

General CULA Sparse support and troubleshooting. Use this forum if you are having a general problem or have encountered a bug.

Error occurred after calling culaDcooGmresIlu0 many times

Postby dpshsw » Sun Apr 07, 2013 8:40 pm

Hey, I get the CULA error "insufficient memory to complete this operation" after continuously calling the culaDcooGmresIlu0 function 7300 times in the Matlab environment to calculate x=A\B. (for testing purposes)
A is a fixed 810*810 matrix of double precision.
Every time after I run the culaDcooGmresIlu0 function I call culaSparseShutdown() and free() the space malloc-ed in CPU. But it seems the program fails to immediately release the space used in GPU after calling the culaDcooGmresIlu0 function and shut down culaSparse.
I'm looking for a solution and I will be really grateful to your help!
My code is pasted below. Thanks!

Code: Select all
#include <algorithm>

#include "mex.h"
#include "cula_sparse.h"
#include "cusparse.h"
#include "culamex.hpp"

//#include <cuda_runtime_api.h>

using std::max;
using std::min;


template <typename T>
void mexCulaLd(int nlhs,           /* number of expected outputs */
              mxArray* plhs[],        /* output pointer array */
              int nrhs,               /* number of inputs */
              const mxArray* prhs[],  /* input pointer array */
              mxClassID id,
              mxComplexity complexity)
{
    double tolerance = 2e-5;
   int maxIter = 500;
   
   if (nrhs == 3)
   {
      tolerance=mxGetScalar( prhs[2] );
   }
   if (nrhs == 4)
   {
      tolerance=mxGetScalar( prhs[2] );
      maxIter=mxGetScalar( prhs[3] );
   }
   
    int i,j,k;
    int idx;
    int N;
   
   double* Y = 0;

    int nnz;
    int* rowInd = 0;
    int* colInd = 0;
    double* yn = 0;
    double* I = 0;
    double* U = 0;

    culaStatus status;
    culaIterativeConfig config;
    culaIterativeResult result;

    culaCgOptions cgOptions;
   culaBicgOptions bicgOptions;
   culaBicgstabOptions bicgstabOptions;
   culaBicgstablOptions bicgstablOptions;
   culaGmresOptions gmresOptions;
   culaMinresOptions minresOptions;

    culaEmptyOptions emptyOptions;
   culaJacobiOptions jacobiOptions;
   culaBlockjacobiOptions blockjacobiOptions;
   culaIlu0Options ilu0Options;

    status = culaSparseInitialize();
    checkStatus(status, "culaSparseInitialize");

   N = (int) mxGetN(prhs[0]);
   
   I = (double*)malloc(N*sizeof(double));//dense
   U = (double*)malloc(N*sizeof(double));//dense   
   
   //get dense I
    memcpy( I, mxGetPr( prhs[1] ), N * sizeof(double) );
   
   if(mxIsSparse(prhs[0]))
   {
      nnz = mxGetNzmax(prhs[0]);
      yn = (double*)malloc(nnz*sizeof(double));//sparse
      rowInd = (int*)malloc(nnz*sizeof(int));//sparse
      colInd = (int*)malloc(nnz*sizeof(int));//sparse
      memcpy( yn, mxGetPr( prhs[0] ), nnz * sizeof(double) );
         
      mwSize nrow;
      mwIndex *ir, *jc;
      mwIndex x, y;
      
      ir = mxGetIr(prhs[0]);
      jc = mxGetJc(prhs[0]);
      k=0;
      for( y=0; y<N; y++ ) {
          nrow = jc[y+1] - jc[y];
          for( x=0; x<nrow; x++ ) {
        rowInd[k]=(*ir++);
        colInd[k]=y;
        k++;
          }
      }      
   }
   else
   {
      //get dense Y and nnz
      nnz = 0;
      Y = (double*)malloc(N*N*sizeof(double));//dense
      memcpy( Y, mxGetPr( prhs[0] ), N * N * sizeof(double) );
      
      for(i=0;i<N;i++){
         for(j=0;j<N;j++){
            if(Y[i+j*N]>0.000001||Y[i+j*N]<-0.000001)
               nnz++;
         }
      }   
      yn = (double*)malloc(nnz*sizeof(double));//sparse
      colInd = (int*)malloc(nnz*sizeof(int));//sparse
      rowInd = (int*)malloc(nnz*sizeof(int));//sparse
   
      //get sparse Y
      k = 0;
      for(i=0;i<N;i++){
         for(j=0;j<N;j++){
            if(Y[i+j*N]>0.000001||Y[i+j*N]<-0.000001){
               yn[k] = Y[i+j*N];
               rowInd[k] = i;
               colInd[k] = j;
               k++;
            }
         }
      }   
   }

    // Set configuration options
    culaIterativeConfigInit(&config);
    config.tolerance = (const double)tolerance;
    config.maxIterations = (const double)maxIter;
   config.useInitialResultVector = 0;
    config.indexing = 0;
   config.debug = 0;

    culaCgOptionsInit(&cgOptions);
   culaBicgOptionsInit(&bicgOptions);
   culaBicgstabOptionsInit(&bicgstabOptions);
   culaBicgstablOptionsInit(&bicgstablOptions);
   culaGmresOptionsInit(&gmresOptions);
   culaMinresOptionsInit(&minresOptions);

    culaEmptyOptionsInit(&emptyOptions);
   culaJacobiOptionsInit(&jacobiOptions);
   culaIlu0OptionsInit(&ilu0Options);
   culaBlockjacobiOptionsInit(&blockjacobiOptions);
   
   ilu0Options.reordering=culaAmdReordering;

   //status = culaDcooCg(&config, &cgOptions, &emptyOptions, N, nnz, yn, (const int *)colInd,(const int *)rowInd, U, I, &result);
   //status = culaDcooBicg(&config, &bicgOptions, &emptyOptions, N, nnz, yn, colInd, rowInd, U, I, &result);
   //status = culaDcooBicgstab(&config, &bicgstabOptions, &emptyOptions, N, nnz, yn, colInd, rowInd, U, I, &result);
   //status = culaDcooBicgstabl(&config, &bicgstablOptions, &emptyOptions, N, nnz, yn, colInd, rowInd, U, I, &result);
   //status = culaDcooGmres(&config, &gmresOptions, &emptyOptions, N, nnz, yn, colInd, rowInd, U, I, &result);
   status= culaDcooGmresIlu0(&config, &gmresOptions, &ilu0Options, N, nnz, yn, colInd, rowInd, U, I, &result);
   //status = culaDcooMinres(&config, &minresOptions, &emptyOptions, N, nnz, yn, colInd, rowInd, U, I, &result);
   
   //status = culaDcooGmresIlu0(&config, &gmresOptions, &ilu0Options, N, nnz, yn, colInd, rowInd, U, I, &result);
   
    checkSparseResult(status,&result);

    culaSparseShutdown();
   
   plhs[0]=mxCreateNumericMatrix(N, 1, id, (mxComplexity)0);
   memcpy( mxGetPr( plhs[0] ), U , N * sizeof(double) );
   

    free(rowInd);
    free(colInd);
    free(yn);
    free(I);
    free(U);
   free(Y);
}

// MATLAB Gateway Function
void mexFunction(int nlhs,              /* number of expected outputs */
                 mxArray* plhs[],       /* output pointer array */
                 int nrhs,              /* number of inputs */
                 const mxArray* prhs[]  /* input pointer array */ )
{
    // We only support a full SVD in this example
    if(nrhs != 2 && nrhs != 3 && nrhs != 4)
        mexErrMsgTxt("culald: Must have 2 or 3 or 4 input argument [A,B,tolerance,maxIter]");
    if(nlhs != 1)
        mexErrMsgTxt("culald: Must have 1 output arguments [X]");

    // Get precision (single or double)
    mxClassID classID = mxGetClassID(prhs[0]);

    // Get complexity (real or complex)
    mxComplexity complexity = mxIsComplex(prhs[0]) ? mxCOMPLEX : mxREAL;

    // Switch based on data type
    if (classID == mxDOUBLE_CLASS && complexity == mxREAL )
        mexCulaLd<double>(nlhs, plhs, nrhs, prhs, classID, complexity);
    else
        mexErrMsgTxt("culasvd: Unknown or unsupported data type");
}
dpshsw
 
Posts: 2
Joined: Fri Mar 29, 2013 5:29 pm

Re: Error occurred after calling culaDcooGmresIlu0 many times

Postby john » Mon Apr 08, 2013 6:54 am

Hello,
Do you have a non-Matlab reproducer code for us? It helps us separate out CULA from any potential interference by Mex/Matlab.

Thanks!
john
Administrator
 
Posts: 587
Joined: Thu Jul 23, 2009 2:31 pm

Re: Error occurred after calling culaDcooGmresIlu0 many times

Postby dpshsw » Thu Apr 11, 2013 3:59 am

Hey, I've posted the code which could be run in Visual Studio below.
After the main function calls the function test_gpuSparse() less than 10000 times, the program would break down.

Code: Select all
#define NO_MKL
//#define NO_CUDA

#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <iomanip>
#include <fstream>
using namespace std;
#include <string.h>
#include <time.h>

// Timers

#ifdef _WIN32
#include <windows.h>
#include <psapi.h>
#endif

#include <cula_sparse.h>
#include <cusparse.h>

#ifndef NO_CUDA
#   include <cuda_runtime_api.h>
#   ifdef _MSC_VER
#       pragma comment(lib, "cudart.lib")
#   endif
#endif

// operational parameters
const double tolerance = 1e-5; // many matrices will not converge to e-9, but we have selected one that does
const int maxIter = 500;
const int defaultMatrixSize = (1<<23);


double getHighResolutionTime(void);
void cgMklRci(int n, double* an, int* colInd, int* rowPtr, double* x, double* b);
void checkRuntimeStatus(culaStatus status);
void checkSparseResult(culaStatus status, culaIterativeResult* result);
int calculateN();
int test_gpuSparse();

// Read the current Windows high-resolution performance counter into t.
void my_clock(LARGE_INTEGER &t)
{
    QueryPerformanceCounter(&t);
}

// Elapsed time between t_start and t_end in microseconds, where tf is the
// counter frequency from QueryPerformanceFrequency.
double cal_timeCost(LARGE_INTEGER &t_start,LARGE_INTEGER &t_end,LARGE_INTEGER &tf)
{
   // BUG FIX: compute the tick delta first and convert in floating point.
   // The original evaluated 1000000*QuadPart in 64-bit *integer* arithmetic,
   // which overflows once the raw counter exceeds ~9.2e12 ticks (well within
   // a machine's uptime on a 10 MHz counter), and its integer division also
   // truncated sub-microsecond precision.
   double ticks = (double)(t_end.QuadPart - t_start.QuadPart);
   return ticks * 1000000.0 / (double)tf.QuadPart;
}

int main(int argc, char** argv)
{
   int i = 0;
   int retValue = EXIT_SUCCESS;
   while (retValue == EXIT_SUCCESS && i <10000)
      {
         retValue = test_gpuSparse();
         i++;
      }
   return i;
}

int test_gpuSparse()
{
   cudaSetDevice(1);
    int retVal = EXIT_SUCCESS;

    int i,j,k;
    int idx;
    int N;
   ifstream infile;
   ofstream outfile;
   char name_str[100];

   //timer
   LARGE_INTEGER t_start, t_end, tf;
   QueryPerformanceFrequency(&tf);
   double solve_time=0;

    // storage
   double* Y = 0;

    int nnz;
    int* rowInd = 0;
    int* colInd = 0;
    double* yn = 0;
    double* I = 0;
    double* U = 0;

   size_t matrixSizeBytes = 0;
    size_t bytes;

    // timers
    double matrixStart, matrixStop;

    // cula-specific items
    culaStatus status;
    culaIterativeConfig config;
    culaIterativeResult result;

    culaCgOptions cgOptions;
   culaBicgOptions bicgOptions;
   culaBicgstabOptions bicgstabOptions;
   culaBicgstablOptions bicgstablOptions;
   culaGmresOptions gmresOptions;
   culaMinresOptions minresOptions;

    culaEmptyOptions emptyOptions;
   culaJacobiOptions jacobiOptions;
   culaBlockjacobiOptions blockjacobiOptions;
   culaIlu0Options ilu0Options;

    // enable sparse solvers and select a GPU device
    status = culaSparseInitialize();
    checkRuntimeStatus(status);

    matrixStart = getHighResolutionTime();
    printf("--------------------------------------------------------------------\n");
    printf("Matrix Setup \n");
    printf("--------------------------------------------------------------------\n");

   //==========================================================
   N = 810;
   Y = (double*)malloc(N*N*sizeof(double));//dense
   I = (double*)malloc(N*sizeof(double));//dense
   U = (double*)malloc(N*sizeof(double));//dense

   //get dense Y and nnz
   nnz = 0;
   sprintf(name_str,".\\case\\810Y.txt");
   infile.open(name_str,ios::in);
   for(i=0;i<N;i++){
      for(j=0;j<N;j++){
         infile>>Y[i+j*N];
         if(Y[i+j*N]>0.000001||Y[i+j*N]<-0.000001)
            nnz++;
      }
   }
   infile.close();

   yn = (double*)malloc(nnz*sizeof(double));//sparse
   colInd = (int*)malloc(nnz*sizeof(double));//sparse
   rowInd = (int*)malloc(nnz*sizeof(double));//sparse
   I = (double*)malloc(N*sizeof(double));//dense
   U = (double*)malloc(N*sizeof(double));//dense

   
   //get dense I
   sprintf(name_str,".\\case\\810I.txt");
   infile.open(name_str,ios::in);
   for(i=0;i<N;i++){
      infile>>I[i];
   }
   infile.close();

   //get sparse Y
   k = 0;
   for(i=0;i<N;i++){
      for(j=0;j<N;j++){
         if(Y[i+j*N]>0.000001||Y[i+j*N]<-0.000001){
            yn[k] = Y[i+j*N];
            rowInd[k] = i;
            colInd[k] = j;
            k++;
         }
      }
   }
   

    matrixStop = getHighResolutionTime();
    printf("Size:        %lu bytes\n", (unsigned long) matrixSizeBytes );
    printf("Total Time:  %-.4gs\n\n", matrixStop - matrixStart );

    printf("--------------------------------------------------------------------\n");
    printf("Running CULA ... \n");
    printf("--------------------------------------------------------------------\n");

    // Set configuration options
    culaIterativeConfigInit(&config);
    config.tolerance = tolerance;
    config.maxIterations = maxIter;
   config.useInitialResultVector = 0;
    config.indexing = 0;
   config.debug = 0;
   //remember residual Vector
   double* reVe = (double*)malloc(2*maxIter*sizeof(double));
   config.residualVector = reVe;
   result.residual.byIteration = reVe;
   

   gmresOptions.restart=40;

    culaCgOptionsInit(&cgOptions);
   culaBicgOptionsInit(&bicgOptions);
   culaBicgstabOptionsInit(&bicgstabOptions);
   culaBicgstablOptionsInit(&bicgstablOptions);
   culaGmresOptionsInit(&gmresOptions);
   culaMinresOptionsInit(&minresOptions);

    culaEmptyOptionsInit(&emptyOptions);
   culaJacobiOptionsInit(&jacobiOptions);
   culaIlu0OptionsInit(&ilu0Options);
   culaBlockjacobiOptionsInit(&blockjacobiOptions);

   //choose to reorder matrice for ilu0 preconditioner
   ilu0Options.reordering=culaAmdReordering;

   my_clock(t_start);
    //status = culaDcooCg(&config, &cgOptions, &emptyOptions, N, nnz, yn, colInd, rowInd, U, I, &result);
   //status = culaDcooBicg(&config, &bicgOptions, &emptyOptions, N, nnz, yn, colInd, rowInd, U, I, &result);
   //status = culaDcooBicgstab(&config, &bicgstabOptions, &emptyOptions, N, nnz, yn, colInd, rowInd, U, I, &result);
   //status = culaDcooBicgstabl(&config, &bicgstablOptions, &emptyOptions, N, nnz, yn, colInd, rowInd, U, I, &result);
   //status = culaDcooGmres(&config, &gmresOptions, &emptyOptions, N, nnz, yn, colInd, rowInd, U, I, &result);
   status= culaDcooGmresIlu0(&config, &gmresOptions, &ilu0Options, N, nnz, yn, colInd, rowInd, U, I, &result);
   //status = culaDcooMinres(&config, &minresOptions, &emptyOptions, N, nnz, yn, colInd, rowInd, U, I, &result);
   
   //status = culaDcooGmresIlu0(&config, &gmresOptions, &ilu0Options, N, nnz, yn, colInd, rowInd, U, I, &result);
   
   my_clock(t_end);
   solve_time+=cal_timeCost(t_start,t_end,tf);
    // it is inadequate to check only the "status"
    checkSparseResult(status, &result);

    culaSparseShutdown();

   cout<<"nnz:"<<nnz<<endl;
   cout<<"solve time:"<<solve_time/1000.<<"ms"<<endl<<endl;

   sprintf(name_str,".\\result\\%dU.txt",N); 
   outfile.open(name_str,ios::out);
   for(i=0;i<N;i++){
      outfile.setf(ios::scientific);
      outfile.precision(9);
      outfile<<U[i]<<"\t";
      outfile<<endl;
   }
   outfile.close();
   sprintf(name_str,".\\result\\reVe.txt"); 
   outfile.open(name_str,ios::out);
   for(i=0;i<maxIter;i++){
      outfile.setf(ios::scientific);
      outfile.precision(9);
      outfile<<reVe[i]<<"\t";
      outfile<<endl;
   }
   outfile.close();

#ifdef NO_MKL
    printf("Note: Comparison to Intel MKL is disabled.\n\n");
#endif

exiting:
    free(rowInd);
    free(colInd);
    free(yn);
    free(I);
    free(U);
   free(Y);
   free(reVe);
    return retVal;

outOfMemory:
    retVal = EXIT_FAILURE;
    printf("\nOut of memory\n"); goto exiting;
}

// try to select a matrix that uses most of the GPU's memory
#ifndef NO_CUDA
int calculateN()
{
    struct cudaDeviceProp prop;
    int device;
    int n;

    cudaError_t err;
    size_t memSize;

    err = cudaGetDevice(&device);
    if( err ) goto exitCudaError;
    err = cudaGetDeviceProperties(&prop, device);
    if( err ) goto exitCudaError;

    memSize = prop.totalGlobalMem;

    n = (int) ( memSize / (4*4 + 8*8)); // device storage needed: 4*n of 4-byte quantities, 8*n of 8-byte quantities
    n = (int)(n*0.50); // twiddle factor, no allocator is 100% optimal
    printf("Automatically selecting N=%d based on GPU memory for device %d\n", n, device);

    return n;

exitCudaError:
    printf("A CUDA error (%d) was encountered when querying for GPU memory size.\nPlease define NO_CUDA and try again.\n", err);
    exit(EXIT_FAILURE);
}
#endif

void checkRuntimeStatus(culaStatus status)
{
    // Print CULA's error description; terminate on fatal statuses.
    // Data errors are deliberately survivable because the iterative result
    // structure still contains useful diagnostics for them.
    if( !status ) // success
        return;

    char message[256];
    culaGetErrorInfoString(status, culaGetErrorInfo(), message, sizeof(message));
    printf("%s\n", message);

    if( status == culaDataError )
        return; // For data errors, we can learn more by continuing

    culaSparseShutdown();
    // Insufficient compute capability is treated as a benign exit.
    exit(status == culaInsufficientComputeCapability ? EXIT_SUCCESS : EXIT_FAILURE);
}

void checkSparseResult(culaStatus status, culaIterativeResult* result)
{
    // Fatal statuses terminate inside checkRuntimeStatus; for culaNoError and
    // culaDataError we fall through and print the solver's own summary, since
    // those two codes leave meaningful information in the result structure.
    checkRuntimeStatus(status);

    if( result == NULL ) // no struct to parse
        return;

    char summary[256];
    culaIterativeResultString(result, summary, sizeof(summary));
    printf("%s\n", summary);
}

// Cross platform high resolution timer
#ifdef _WIN32
double getHighResolutionTime(void)
{
    double freq;
    double seconds;
    LARGE_INTEGER end_time;
    LARGE_INTEGER performance_frequency_hz;

    QueryPerformanceCounter(&end_time);
    QueryPerformanceFrequency(&performance_frequency_hz);

    seconds = (double) end_time.QuadPart;
    freq = (double) performance_frequency_hz.QuadPart;
    seconds /= freq;

    return seconds;
}
#endif
dpshsw
 
Posts: 2
Joined: Fri Mar 29, 2013 5:29 pm

Re: Error occurred after calling culaDcooGmresIlu0 many times

Postby john » Thu Apr 11, 2013 5:55 am

Hello, there are two problems -
1) We cannot run the test unless you specify the data files, or find a way to construct the matrix data without the need for files.
2) Your test leaks memory - at minimum it appears to be the I and U arrays at each pass.
john
Administrator
 
Posts: 587
Joined: Thu Jul 23, 2009 2:31 pm


Return to CULA Sparse Support

Who is online

Users browsing this forum: No registered users and 1 guest

cron