用户
 找回密码
 立即注册
发表于 2021-7-1 12:38:08
64180
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include<iostream>
#include<fstream>
#include <cusolverSp.h>
#include <cuda_runtime_api.h>
using namespace std;

#define imin( x, y ) ((x)<(y))? (x) : (y)

int main()
{
        cusolverSpHandle_t cusolverH = NULL;

        //GPU 批量处理 QR
        csrqrInfo_t info = NULL;
        cusparseMatDescr_t descrA = NULL;

    cusparseStatus_t cusparse_status = CUSPARSE_STATUS_SUCCESS;
    cusolverStatus_t cusolver_status = CUSOLVER_STATUS_SUCCESS;
    cudaError_t cudaStat1 = cudaSuccess;
    cudaError_t cudaStat2 = cudaSuccess;
    cudaError_t cudaStat3 = cudaSuccess;
    cudaError_t cudaStat4 = cudaSuccess;
    cudaError_t cudaStat5 = cudaSuccess;

    //int n = 4; int nnzA = 7;
    int n = 1000000; int nnzA = 2999998;
    //int n = 5; int nnzA = 9;

    int* h_csrRowPtrA = new int[n + 1];
    int* h_csrColIndA = new int[nnzA];
    double* h_csrValA = new double[nnzA];
    double* h_b = new double[n]; // batchSize * m
    double* h_x = new double[n]; // batchSize * m

    size_t size_qr = 0;
    size_t size_internal = 0;
    void* buffer_qr = NULL; // QR 分解的工作空间

    /*
    //矩阵A的 CSR 格式
    h_csrRowPtrA[0] = 1; h_csrRowPtrA[1] = 2; h_csrRowPtrA[2] = 3; h_csrRowPtrA[3] = 4; h_csrRowPtrA[4] = 8;
    h_csrColIndA[0] = 1; h_csrColIndA[1] = 2; h_csrColIndA[2] = 3; h_csrColIndA[3] = 1; h_csrColIndA[4] = 2; h_csrColIndA[5] = 3; h_csrColIndA[6] = 4;
    h_csrValA[0] = 1.0 ; h_csrValA[1] = 2.0 ; h_csrValA[2] = 3.0 ; h_csrValA[3] = 0.1 ; h_csrValA[4] = 0.1 ; h_csrValA[5] = 0.1 ; h_csrValA[6] = 4.0 ;

    //矩阵B的值
    h_b[0] = 1.0; h_b[1] = 1.0; h_b[2] = 1.0; h_b[3] = 1.0;
    */

    //矩阵A的 CSR 格式(1000000阶)

    int* csrRowPtr = new int[n + 1];
    ifstream ofs;
    ofs.open("csrRowPtr.txt", ios::in);
    if (!ofs)
    {
        cerr << "open ofs Fail" << endl;
        exit(1);
    }
    for (int i = 0; i < n + 1; ++i)
    {
        ofs >> csrRowPtr[i];
        h_csrRowPtrA[i] = csrRowPtr[i];
        //cout << h_csrRowPtrA[i] << endl;
    }

    //cout << endl;
    int* csrColInd = new int[nnzA];
    ifstream ofs1;
    ofs1.open("csrColInd.txt", ios::in);
    if (!ofs1)
    {
        cerr << "open ofs1 Fail" << endl;
        exit(1);
    }
    for (int i = 0; i < nnzA; ++i)
    {
        ofs1 >> csrColInd[i];
        h_csrColIndA[i] = csrColInd[i];
        //cout << h_csrColIndA[i] << endl;
    }

    //cout << endl;
    double* csrVal = new double[nnzA];
    ifstream ofs2;
    ofs2.open("csrVal.txt", ios::in);
    if (!ofs2)
    {
        cerr << "open ofs2 Fail" << endl;
        exit(1);
    }
    for (int i = 0; i < nnzA; ++i)
    {
        ofs2 >> csrVal[i];
        h_csrValA[i] = csrVal[i];
        //cout << h_csrValA[i] << endl;
    }

    //矩阵B的 CSR 格式(1000000阶)
    //cout << endl;
    double* B = new double[n]; // batchSize * m
    ifstream ofs3;
    ofs3.open("B.txt", ios::in);
    if (!ofs3)
    {
        cerr << "open ofs3 Fail" << endl;
        exit(1);
    }
    for (int i = 0; i < n; ++i)
    {
        ofs3 >> B[i];
        h_b[i] = B[i];
        //cout << h_b[i] << endl;
    }
    ofs.close();
    ofs1.close();
    ofs2.close();
    ofs3.close();


    const int batchSize = 30;

    double* csrValABatch = (double*)malloc(sizeof(double) * nnzA * batchSize);
    double* bBatch = (double*)malloc(sizeof(double) * n * batchSize);
    double* xBatch = (double*)malloc(sizeof(double) * n * batchSize);
    assert(NULL != csrValABatch);
    assert(NULL != bBatch);
    assert(NULL != xBatch);

    //在 host 上准备 Aj 和 Bj
    for (int colidx = 0; colidx < nnzA; colidx++)
    {
        double Areg = h_csrValA[colidx];
        for (int batchId = 0; batchId < batchSize; batchId++)
        {
            double eps = ((double)((rand() % 100) + 1)) * 1.e-4;
            csrValABatch[batchId * nnzA + colidx] = Areg + eps;
        }
    }
    for (int j = 0; j < n; j++)
    {
        double breg = h_b[j];
        for (int batchId = 0; batchId < batchSize; batchId++)
        {
            double eps = ((double)((rand() % 100) + 1)) * 1.e-4;
            bBatch[batchId * n + j] = breg + eps;
        }
    }

    // 创造句柄以及矩阵描述符
    cusolver_status = cusolverSpCreate(&cusolverH);
    assert(cusolver_status == CUSOLVER_STATUS_SUCCESS);
    cusparse_status = cusparseCreateMatDescr(&descrA);
    assert(cusparse_status == CUSPARSE_STATUS_SUCCESS);
    cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ONE); // base-1
    cusolver_status = cusolverSpCreateCsrqrInfo(&info);
    assert(cusolver_status == CUSOLVER_STATUS_SUCCESS);

    // 复制资料进 GPU
    int* d_csrRowPtrA = NULL;
    int* d_csrColIndA = NULL;
    double* d_csrValA = NULL;
    double* d_b = NULL; // batchSize * m
    double* d_x = NULL;
    cudaStat1 = cudaMalloc((void**)&d_csrValA, sizeof(double) * nnzA * batchSize);
    cudaStat2 = cudaMalloc((void**)&d_csrColIndA, sizeof(int) * nnzA);
    cudaStat3 = cudaMalloc((void**)&d_csrRowPtrA, sizeof(int) * (n + 1));
    cudaStat4 = cudaMalloc((void**)&d_b, sizeof(double) * n * batchSize);
    cudaStat5 = cudaMalloc((void**)&d_x, sizeof(double) * n * batchSize);
    assert(cudaStat1 == cudaSuccess);
    assert(cudaStat2 == cudaSuccess);
    assert(cudaStat3 == cudaSuccess);
    assert(cudaStat4 == cudaSuccess);
    assert(cudaStat5 == cudaSuccess);

    cudaStat1 = cudaMemcpy(d_csrValA, csrValABatch, sizeof(double) * nnzA * batchSize, cudaMemcpyHostToDevice);
    cudaStat2 = cudaMemcpy(d_csrColIndA, h_csrColIndA, sizeof(int) * nnzA,cudaMemcpyHostToDevice);
    cudaStat3 = cudaMemcpy(d_csrRowPtrA, h_csrRowPtrA, sizeof(int) * (n + 1),cudaMemcpyHostToDevice);
    cudaStat4 = cudaMemcpy(d_b, bBatch, sizeof(double) * n * batchSize,cudaMemcpyHostToDevice);
    assert(cudaStat1 == cudaSuccess);
    assert(cudaStat2 == cudaSuccess);
    assert(cudaStat3 == cudaSuccess);
    assert(cudaStat4 == cudaSuccess);

    // 分析
    cusolver_status = cusolverSpXcsrqrAnalysisBatched(cusolverH, n, n, nnzA, descrA, d_csrRowPtrA, d_csrColIndA, info);
    assert(cusolver_status == CUSOLVER_STATUS_SUCCESS);

    // 准备工作空间
    cusolver_status = cusolverSpDcsrqrBufferInfoBatched(cusolverH, n, n, nnzA, descrA, d_csrValA, d_csrRowPtrA, d_csrColIndA, batchSize,
    info, &size_internal, &size_qr);
    assert(cusolver_status == CUSOLVER_STATUS_SUCCESS);
    printf("numerical factorization needs internal data %lld bytes\n", (long long)size_internal);
    printf("numerical factorization needs working space %lld bytes\n", (long long)size_qr);

    cudaStat1 = cudaMalloc((void**)&buffer_qr, size_qr);
    assert(cudaStat1 == cudaSuccess);

    // 数值分解
    cusolver_status = cusolverSpDcsrqrsvBatched( cusolverH, n, n, nnzA, descrA, d_csrValA, d_csrRowPtrA, d_csrColIndA, d_b, d_x, batchSize, info, buffer_qr);
    assert(cusolver_status == CUSOLVER_STATUS_SUCCESS);

    // 检查残差
    // xBatch = [x0, x1, x2, ...]
    cudaStat1 = cudaMemcpy(xBatch, d_x, sizeof(double) * n * batchSize, cudaMemcpyDeviceToHost);
    assert(cudaStat1 == cudaSuccess);
    const int baseA = (CUSPARSE_INDEX_BASE_ONE == cusparseGetMatIndexBase(descrA)) ? 1 : 0;
    for (int batchId = 0; batchId < batchSize; batchId++)
    {
        // measure |bj - Aj*xj|
        double* csrValAj = csrValABatch + batchId * nnzA;
        double* xj = xBatch + batchId * n;
        double* bj = bBatch + batchId * n;
        // sup| bj - Aj*xj|
        double sup_res = 0;
        for (int row = 0; row < n; row++)
        {
            const int start = h_csrRowPtrA[row] - baseA;
            const int end = h_csrRowPtrA[row + 1] - baseA;
            double Ax = 0.0; // Aj(row,*xj
            for (int colidx = start; colidx < end; colidx++)
            {
                const int col = h_csrColIndA[colidx] - baseA;
                const double Areg = csrValAj[colidx];
                const double xreg = xj[col];
                Ax = Ax + Areg * xreg;
            }
            double r = bj[row] - Ax;
            sup_res = (sup_res > fabs(r)) ? sup_res : fabs(r);
        }
        printf("batchId %d: sup|bj - Aj*xj| = %E \n", batchId, sup_res);
    }
    for (int batchId = 0; batchId < batchSize; batchId++)
    {
        double* xj = xBatch + batchId * n;
        for (int row = 0; row < n; row++)
        {
            printf("x%d[%d] = %E\n", batchId, row, xj[row]);
        }
        printf("\n");
    }

    delete[] csrRowPtr;
    delete[] csrColInd;
    delete[] csrVal;
    delete[] B;
    delete[] h_csrValA;
    delete[] h_csrRowPtrA;
    delete[] h_csrColIndA;
    delete[] h_b;
    delete[] h_x;
    if (csrValABatch)free(csrValABatch);
    if (bBatch)free(bBatch);
    if (xBatch)free(xBatch);
    if(d_csrValA)cudaFree(d_csrValA);
    if (d_csrRowPtrA)cudaFree(d_csrRowPtrA);
    if (d_csrColIndA)cudaFree(d_csrColIndA);
    if (d_b)cudaFree(d_b);
    if (d_x)cudaFree(d_x);
    cusolverSpDestroy(cusolverH);

    return 0;
}

本帖子中包含更多资源

您需要 登录 才可以下载或查看,没有帐号?立即注册

x
使用道具 举报 回复
发新帖
您需要登录后才可以回帖 登录 | 立即注册