Welcome to OStack Knowledge Sharing Community for programmer and developer-Open, Learning and Share
Welcome To Ask or Share your Answers For Others

Categories

0 votes
134 views
in Technique[技术] by (71.8m points)

cusolver - multiple SVDs of a matrix using CUDA

I am trying to do SVD using the code given below, but I do not get correct results for d_S, when num_Matrices > 128, lets say 256, I get correct results when num_Matrices = 128*128, I am referring to a code based on an answer to a question asked on stack overflow, link is Parallel implementation for multiple SVDs using CUDA.

My code is given below which gives singular values for the first 26 matrices as all 0s.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>

#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cusolverDn.h>

#include "Utilities.cuh"
#include "TimingGPU.cuh"

#define FULLSVD
#define PRINTRESULTS

#define M_SQR(x)         ((x)*(x))

__global__ void initSIGPU(cuComplex *SI, float *SDiag, int N, int M, int len, float epsilon);
void initSI(cuComplex *SI, float *SDiag, int N, int M, int len, float epsilon, int threadsPerBlock);
__global__ void initCZeros(cuComplex *in, int len);
void checkArray(cuComplex *ptr, int rows, int cols, const char *name);
void printArray(cuComplex *ptr, int rows, int cols, char mode, const char *name);
cuComplex complexMul(cuComplex a, cuComplex b);
double verifyDecomposition(cuComplex *U, float *S, cuComplex *V, cuComplex *A, int M, int N);
void GPU_Multi(cuComplex **M, cuComplex **N, cuComplex **P,
               size_t pr, size_t pc, size_t mc,
               size_t num_mat, cuComplex alpha, cuComplex beta);

/********/
/* MAIN */
/********/
int main() {

    const int           M = 5;
    const int           N = 3;
    const int           K = 2;
    int                 lda = M;
    const int         numMatrices = 128*3;
    //const int           numMatrices = 256;
    cublasHandle_t cublasHandle = NULL;

    /* residual and executed_sweeps are not supported on gesvdjBatched */
    //double residual = 0;
    int executed_sweeps = 0;

    TimingGPU timerGPU;
    cudaEvent_t start, stop;
    cusolverStatus_t status = CUSOLVER_STATUS_SUCCESS;
    cudaError_t cudaStat1 = cudaSuccess;

    cublasCreate(&cublasHandle);
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    int major = -1, minor = -1, patch = -1;
    cusolverGetProperty(MAJOR_VERSION, &major);
    cusolverGetProperty(MINOR_VERSION, &minor);
    cusolverGetProperty(PATCH_LEVEL, &patch);
    printf("CUSOLVER Version (Major,Minor,PatchLevel): %d.%d.%d
",
           major, minor, patch);

    // --- Setting the host matrix
    cuComplex *h_A = (cuComplex *)malloc(lda * N * numMatrices * sizeof(cuComplex));
    for (unsigned int k = 0; k < numMatrices; k++)
        for (unsigned int i = 0; i < M; i++) {
            for (unsigned int j = 0; j < N; j++) {
                h_A[k * M * N + j * M + i] = make_float2((rand() * 1.0 + k*0.1)/ RAND_MAX, (rand() * 1.0 + k*0.1) / RAND_MAX); //make_float2((1. / (k + 1)) * (i + j * j) * (i + j), (1. / (k + 1)) * (i + j * j) * (i + j));
                //printf("[%d, %d] %f
", i, j, h_A[j*M + i]);
                //printf("%f %f ", h_A[j * M + i].x, h_A[j * M + i].y);
            }
            //printf("
");
        }

    printArray(h_A, M, N, 'T', "h_A1");
    printArray(h_A + M * N, M, N, 'T', "h_A2");

    cuComplex *d_b = NULL;
    checkCudaErrors(cudaMalloc((void **)&d_b, sizeof(cuComplex) * M * K * numMatrices));

    cuComplex *h_b = (cuComplex *)malloc(sizeof(cuComplex) * M * K * numMatrices);

    for (int k = 0; k < numMatrices; k++)
    {
        for (int i = 0; i < M; i++)
        {
            for (int j = 0; j < K; j++)
            {
                //h_b[i] = make_float2((i + 1) * 1.0, (i + 1) * 1.0);

                if ((i == 0) || (i == 1) || (i == 2) || (i == 3) || (i == 4))
                    h_b[k * M * K + i * K + j] = make_float2(1.0, 0.0);
                else
                    h_b[k * M * K + i * K + j] = make_float2(2.0, 0.0);
            }
        }
    }

    gpuErrchk(cudaMemcpy(d_b, h_b, sizeof(cuComplex) * M * K * numMatrices, cudaMemcpyHostToDevice));

    printArray(h_b, M, K, 'N', "h_b1");
    printArray(h_b + M * K, M, K, 'N', "h_b2");

    // --- Setting the device matrix and moving the host matrix to the device
    cuComplex *d_A;         gpuErrchk(cudaMalloc(&d_A, M * N * numMatrices * sizeof(cuComplex)));
    gpuErrchk(cudaMemcpy(d_A, h_A, M * N * numMatrices * sizeof(cuComplex), cudaMemcpyHostToDevice));

    // --- host side SVD results space
    float *h_S = (float *)malloc(N * numMatrices * sizeof(float));
    cuComplex *h_SI = (cuComplex *)malloc(N * M * numMatrices * sizeof(cuComplex));
    cuComplex *h_x = (cuComplex *)malloc(N * numMatrices * sizeof(cuComplex));

    cuComplex *h_U = NULL;
    cuComplex *h_V = NULL;
#ifdef FULLSVD
    h_U = (cuComplex *)malloc(M * M * numMatrices * sizeof(cuComplex));
    h_V = (cuComplex *)malloc(N * N * numMatrices * sizeof(cuComplex));
#endif

    // --- device side SVD workspace and matrices
    int work_size = 0;

    int *devInfo;       gpuErrchk(cudaMalloc(&devInfo, sizeof(int)));
    float *d_S;         gpuErrchk(cudaMalloc(&d_S, N * numMatrices * sizeof(float)));
    cuComplex *d_SI;    gpuErrchk(cudaMalloc(&d_SI, N * M * numMatrices * sizeof(cuComplex)));

    cuComplex *d_x;     gpuErrchk(cudaMalloc(&d_x, N * numMatrices * sizeof(cuComplex)));
    cuComplex *d_U = NULL;
    cuComplex *d_V = NULL;
#ifdef FULLSVD
    gpuErrchk(cudaMalloc(&d_U, M * M * numMatrices * sizeof(cuComplex)));
    gpuErrchk(cudaMalloc(&d_V, N * N * numMatrices * sizeof(cuComplex)));
#endif

    cuComplex *d_work = NULL; /* device workspace for gesvdj */
    int devInfo_h = 0; /* host copy of error devInfo_h */

    // --- Parameters configuration of Jacobi-based SVD
    const double            tol = 1.e-7;
    const int               maxSweeps = 100;
    cusolverEigMode_t jobz;                                   // --- CUSOLVER_EIG_MODE_VECTOR - Compute eigenvectors; CUSOLVER_EIG_MODE_NOVECTOR - Compute singular values only
#ifdef FULLSVD
    jobz = CUSOLVER_EIG_MODE_VECTOR;
#else
    jobz = CUSOLVER_EIG_MODE_NOVECTOR;
#endif

    const int               econ = 0;                            // --- econ = 1 for economy size 

    // --- Numerical result parameters of gesvdj 
    double                  residual = 0;
    int                     executedSweeps = 0;

    // --- CUDA solver initialization
    cusolverDnHandle_t solver_handle = NULL;
    cusolveSafeCall(cusolverDnCreate(&solver_handle));

    // --- Configuration of gesvdj
    gesvdjInfo_t gesvdj_params = NULL;
    cusolveSafeCall(cusolverDnCreateGesvdjInfo(&gesvdj_params));

    // --- Set the computation tolerance, since the default tolerance is machine precision
    cusolveSafeCall(cusolverDnXgesvdjSetTolerance(gesvdj_params, tol));

    // --- Set the maximum number of sweeps, since the default value of max. sweeps is 100
    cusolveSafeCall(cusolverDnXgesvdjSetMaxSweeps(gesvdj_params, maxSweeps));

    // --- Query the SVD workspace 
    cusolveSafeCall(cusolverDnCgesvdjBatched_bufferSize(
        solver_handle,
        jobz,                                       // --- Compute the singular vectors or not
        M,                                          // --- Number of rows of A, 0 <= M
        N,                                          // --- Number of columns of A, 0 <= N 
        d_A,                                        // --- M x N
        lda,                                        // --- Leading dimension of A
        d_S,                                        // --- Square matrix of size min(M, N) x min(M, N)
        d_U,                                        // --- M x M if econ = 0, M x min(M, N) if econ = 1
        lda,                                        // --- Leading dimension of U, ldu >= max(1, M)
        d_V,                                        // --- N x N if econ = 0, N x min(M,N) if econ = 1
        lda,                                        // --- Leading dimension of V, ldv >= max(1, N)
        &work_size,
        gesvdj_params,
        numMatrices));

    gpuErrchk(cudaMalloc(&d_work, sizeof(cuComplex) * work_size));

    // --- Compute SVD
    timerGPU.StartCounter();
    cusolveSafeCall(cusolverDnCgesvdjBatched(
        solver_handle,
        jobz,                                       // --- Compute the singular vectors or not
        M,                                          // --- Number of rows of A, 0 <= M
        N,                                          // --- Number of columns of A, 0 <= N 
        d_A,                                        // --- M x N
        lda,                                        // --- Leading dimension of A
        d_S,                                        // --- Square matrix of size min(M, N) x min(M, N)
        d_U,                                        // --- M x M if econ = 0, M x min(M, N) if econ = 1
        lda,                                        // --- Leading dimension of U, ldu >= max(1, M)
        d_V,                                        // --- N x N if econ = 0, N x min(M, N) if econ = 1
        N,                                          // --- Leading dimension of V, ldv >= max(1, N)
        d_work,
        work_size,
        devInfo,
        gesvdj_params,
        numMatrices));

    cudaStat1 = cudaDeviceSynchronize();
    assert(CUSOLVER_STATUS_SUCCESS == status);
    assert(cudaSuccess == cudaStat1);
    printf("Calculation of the singular values only: %f ms

", timerGPU.GetCounter());

    /*
    * The folowing two functions do not support batched version.
    * The error CUSOLVER_STATUS_NOT_SUPPORTED is returned.
    */
    status = cusolverDnXgesvdjGetSweeps(
        solver_handle,
        gesvdj_params,
        &executed_sweeps);
    assert(CUSOLVER_STATUS_NOT_SUPPORTED == status);
    status = cusolverDnXgesvdjGetResidual(
        solver_handle,
        gesvdj_params,
        &residual);
    assert(CUSOLVER_STATUS_NOT_SUPPORTED == status);

    gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_S, d_S, sizeof(float) * N * numMatrices, cudaMemcpyDeviceToHost));
#ifdef FULLSVD
    gpuErrchk(cudaMemcpy(h_U, d_U, sizeof(cuComplex) * lda * M * numMatrices, cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_V, d_V, sizeof(cuComplex) * N * N * numMatrices, cudaMemcpyDeviceToHost));
#endif
}
question from:https://stackoverflow.com/questions/65880586/multiple-svds-of-a-matrix-using-cuda

与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…
Welcome To Ask or Share your Answers For Others

1 Answer

0 votes
by (71.8m points)
Waitting for answers

与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…
Welcome to OStack Knowledge Sharing Community for programmer and developer-Open, Learning and Share
Click Here to Ask a Question

2.1m questions

2.1m answers

60 comments

57.0k users

...