CUDA Reduction: Implementation and Optimization

 
Category: C_C++

Preface

https://leetgpu.com/challenges/reduction
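
The LeetGPU "Reduction" challenge asks you to sum an array of floats on the GPU: solve receives device pointers input and output and must leave the total in output[0]. The notes below walk through a baseline two-stage implementation and compare it against CPU variants.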

Reduction implementation (basic version)

#include <cuda_runtime.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <algorithm>

constexpr int BLOCK_SIZE = 256;

#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err = call;                                           \
        if (err != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err));                             \
            exit(1);                                                      \
        }                                                                 \
    } while (0)

// Grid-stride accumulate, then a shared-memory tree reduction inside the block
// (the tree loop assumes blockDim.x is a power of two).
__global__ void block_reduce(const float* input, float* partial, int N) {
    __shared__ float cache[BLOCK_SIZE];
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    float local = 0;
    for (int i = tid; i < N; i += gridDim.x * blockDim.x) local += input[i];
    cache[threadIdx.x] = local;
    __syncthreads();

    for (int i = blockDim.x >> 1; i > 0; i >>= 1) {
        if (threadIdx.x < i) {
            cache[threadIdx.x] += cache[threadIdx.x + i];
        }
        __syncthreads();
    }
    if (threadIdx.x == 0) partial[blockIdx.x] = cache[0];
}

// input, output are device pointers
extern "C" void solve(const float* input, float* output, int N) {
    int blocks = std::min(65535, (N + BLOCK_SIZE - 1) / BLOCK_SIZE);
    float *d_partial, *d_scratch;
    CUDA_CHECK(cudaMalloc((void**)&d_partial, blocks * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&d_scratch, blocks * sizeof(float)));
    block_reduce<<<blocks, BLOCK_SIZE>>>(input, d_partial, N);
    CUDA_CHECK(cudaDeviceSynchronize());
#if 0 // CPU second-stage reduction on the host
    float* h_partial = (float*)malloc(blocks * sizeof(float));
    CUDA_CHECK(cudaMemcpy(h_partial, d_partial, blocks * sizeof(float),
                          cudaMemcpyDeviceToHost));
    float sum = 0;
    for (int i = 0; i < blocks; ++i) sum += h_partial[i];
    // output is a device pointer, so it cannot be written as output[0] here
    CUDA_CHECK(cudaMemcpy(output, &sum, sizeof(float), cudaMemcpyHostToDevice));
    free(h_partial);
#else // GPU second-stage reduction
    // Ping-pong between the two buffers: reducing d_partial into itself races,
    // since one block may overwrite a slot another block has not read yet.
    while (blocks > 1) {
        int next = (blocks + BLOCK_SIZE - 1) / BLOCK_SIZE;
        block_reduce<<<next, BLOCK_SIZE>>>(d_partial, d_scratch, blocks);
        float* tmp = d_partial; d_partial = d_scratch; d_scratch = tmp;
        blocks = next;
    }
    CUDA_CHECK(
        cudaMemcpy(output, d_partial, sizeof(float), cudaMemcpyDeviceToDevice));
#endif
    CUDA_CHECK(cudaFree(d_partial));
    CUDA_CHECK(cudaFree(d_scratch));
}

// Host-side pairwise reduction: fold the upper half onto the lower half,
// halving the active length each pass (the CPU mirror of the GPU kernel).
void cpu_reduction(const float* input, float* output, int N) {
    float* partial = (float*)malloc(N * sizeof(float));
    memcpy(partial, input, N * sizeof(float));
    int len = N;
    while (len > 1) {
        int half = len / 2;
        for (int i = 0; i < half; ++i) partial[i] += partial[i + half];
        if (len & 1) partial[0] += partial[len - 1]; // fold the odd leftover
        len = half;
    }
    output[0] = partial[0];
    free(partial);
}

int main(int argc, char** argv) {
    if (argc != 2) {
        fprintf(stderr, "Usage: %s N\n", argv[0]);
        return 1;
    }
    int N = atoi(argv[1]);
    srand(42); // fixed seed for reproducible input
    float *h_input, *h_output;
    float *d_input, *d_output;

    h_input = (float*)malloc(N * sizeof(float));
    h_output = (float*)malloc(sizeof(float));
    h_output[0] = 0.0f; // d_output is seeded from this buffer below
    double cpu_result = 0;
    float cpu_reduction_result = 0;
    for (int i = 0; i < N; ++i) {
        h_input[i] = (float)rand() / (RAND_MAX + 1.0) * 3;
        cpu_result += h_input[i];
    }


    CUDA_CHECK(cudaMalloc((void**)&d_input, N * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&d_output, sizeof(float)));
    CUDA_CHECK(cudaMemcpy(d_input, h_input, N * sizeof(float),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(
        cudaMemcpy(d_output, h_output, sizeof(float), cudaMemcpyHostToDevice));
    solve(d_input, d_output, N);

    CUDA_CHECK(
        cudaMemcpy(h_output, d_output, sizeof(float), cudaMemcpyDeviceToHost));
    printf("cpu(double) output:%.5f\n", cpu_result);
    printf("cpu(reduction) output:%.5f\n", cpu_reduction_result);
    printf("gpu(reduction) output:%.5f\n", h_output[0]);
    printf("diff : %.5f\n", fabs(h_output[0] - cpu_result));
    CUDA_CHECK(cudaFree(d_input));
    CUDA_CHECK(cudaFree(d_output));
    free(h_input);
    free(h_output);
    return 0;
}
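
make.sh is not shown in this post; a one-line nvcc stand-in (the flags here are an assumption, not the author's script) is enough to build and run:

nvcc -O2 reduction.cu -o a.out && ./a.out 100000000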

Performance test (CPU vs GPU)

#include <cuda_runtime.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <algorithm>
#include <chrono>

constexpr int BLOCK_SIZE = 256;

#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err = call;                                           \
        if (err != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err));                             \
            exit(1);                                                      \
        }                                                                 \
    } while (0)

class TimeCost {
public:
    TimeCost() : _start_time(std::chrono::high_resolution_clock::now()) {}

    void begin_time_point() {
        _start_time = std::chrono::high_resolution_clock::now();
    }

    double get_time_cost(bool is_reset = false) {
        auto end_time = std::chrono::high_resolution_clock::now();
        auto duration = std::chrono::duration_cast<std::chrono::microseconds>(
            end_time - _start_time);
        double cost = static_cast<double>(duration.count()) / 1e3;

        if (is_reset) {
            _start_time = std::chrono::high_resolution_clock::now();
        }

        return cost;
    }

private:
    std::chrono::high_resolution_clock::time_point _start_time;
};

class TimeCostCuda {
public:
    TimeCostCuda() {
        CUDA_CHECK(cudaEventCreate(&_start));
        CUDA_CHECK(cudaEventCreate(&_stop));
        CUDA_CHECK(cudaEventRecord(_start, 0));
    }

    ~TimeCostCuda() {
        CUDA_CHECK(cudaEventDestroy(_start));
        CUDA_CHECK(cudaEventDestroy(_stop));
    }

    double get_time_cost(bool is_reset = false) {
        CUDA_CHECK(cudaEventRecord(_stop, 0));
        CUDA_CHECK(cudaEventSynchronize(_stop));
        float elapsedTime;
        CUDA_CHECK(cudaEventElapsedTime(&elapsedTime, _start, _stop));


        if (is_reset) {
            CUDA_CHECK(cudaEventRecord(_start, 0));
        }
        return elapsedTime;
    }


private:
    cudaEvent_t _start, _stop;
};
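
// Note: TimeCost measures wall-clock time on the host, while TimeCostCuda uses
// CUDA events stamped on the GPU timeline; cudaEventSynchronize on the stop
// event is the only host-side wait needed before reading the elapsed time.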

// Grid-stride accumulate, then a shared-memory tree reduction inside the block
// (the tree loop assumes blockDim.x is a power of two).
__global__ void block_reduce(const float* input, float* partial, int N) {
    __shared__ float cache[BLOCK_SIZE];
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    float local = 0;
    for (int i = tid; i < N; i += gridDim.x * blockDim.x) local += input[i];
    cache[threadIdx.x] = local;
    __syncthreads();

    for (int i = blockDim.x >> 1; i > 0; i >>= 1) {
        if (threadIdx.x < i) {
            cache[threadIdx.x] += cache[threadIdx.x + i];
        }
        __syncthreads();
    }
    if (threadIdx.x == 0) partial[blockIdx.x] = cache[0];
}

// input, output are device pointers
extern "C" void solve(const float* input, float* output, int N) {


    int blocks = min(65535, (N + BLOCK_SIZE - 1) / BLOCK_SIZE);
    float* d_partial;
    CUDA_CHECK(cudaMalloc((void**)&d_partial, blocks * sizeof(float)));
    block_reduce<<<blocks, BLOCK_SIZE>>>(input, d_partial, N);
    CUDA_CHECK(cudaDeviceSynchronize());
#if 0 // CPU second-stage reduction on the host
    float* h_partial = (float*)malloc(blocks * sizeof(float));
    CUDA_CHECK(cudaMemcpy(h_partial, d_partial, blocks * sizeof(float),
                          cudaMemcpyDeviceToHost));
    float sum = 0;
    for (int i = 0; i < blocks; ++i) sum += h_partial[i];
    // output is a device pointer, so it cannot be written as output[0] here
    CUDA_CHECK(cudaMemcpy(output, &sum, sizeof(float), cudaMemcpyHostToDevice));
    free(h_partial);
#else // GPU second-stage reduction
    // Ping-pong between the two buffers: reducing d_partial into itself races,
    // since one block may overwrite a slot another block has not read yet.
    while (blocks > 1) {
        int next = (blocks + BLOCK_SIZE - 1) / BLOCK_SIZE;
        block_reduce<<<next, BLOCK_SIZE>>>(d_partial, d_scratch, blocks);
        float* tmp = d_partial; d_partial = d_scratch; d_scratch = tmp;
        blocks = next;
    }
    CUDA_CHECK(
        cudaMemcpy(output, d_partial, sizeof(float), cudaMemcpyDeviceToDevice));
#endif
    CUDA_CHECK(cudaFree(d_partial));
    CUDA_CHECK(cudaFree(d_scratch));
}

// Pairwise reduction that assumes N is a power of two (kept for reference;
// main() uses the general cpu_reduction below).
void cpu_reduction_2_N(const float* input, float* output, int N) {
    N >>= 1; // fold the upper half of the input onto the lower half first
    size_t size = N * sizeof(float);
    float* partial = (float*)malloc(size);
    memcpy(partial, input, size);
    for (int i = 0; i < N; ++i) {
        partial[i] += input[i + N];
    }
    for (int i = N >> 1; i >= 1; i >>= 1) {
        for (int j = 0; j < i; ++j) {
            partial[j] += partial[i + j];
        }
    }
    output[0] = partial[0];
    free(partial);
}

// General pairwise reduction: fold the upper half onto the lower half and
// halve the active length each pass; an odd leftover is folded into slot 0.
void cpu_reduction(const float* input, float* output, int N) {
    float* partial = (float*)malloc(N * sizeof(float));
    memcpy(partial, input, N * sizeof(float));

    int len = N;
    while (len > 1) {
        int half = len / 2;
        for (int i = 0; i < half; ++i) {
            partial[i] += partial[i + half];
        }
        if (len & 1) {
            partial[0] += partial[len - 1];
        }
        len = half;
    }

    output[0] = partial[0];
    free(partial);
}

int main(int argc, char** argv) {
    // N may be any length here; cpu_reduction_2_N (unused) assumes a power of two
    if (argc != 2) {
        fprintf(stderr, "Usage: %s N", argv[0]);
        return 1;
    }
    int N = atoi(argv[1]);
    srand(42); // fixed seed for reproducible input
    float *h_input, *h_output, *cpu_reduction_output;
    float *d_input, *d_output;

    h_input = (float*)malloc(N * sizeof(float));
    h_output = (float*)malloc(sizeof(float));
    h_output[0] = 0.0f; // d_output is seeded from this buffer below
    cpu_reduction_output = (float*)malloc(sizeof(float));
    double cpu_result_double = 0;
    float cpu_result_float = 0;
    for (int i = 0; i < N; ++i) {
        h_input[i] = (float)rand() / (RAND_MAX + 1.0) * 3;
    }


    CUDA_CHECK(cudaMalloc((void**)&d_input, N * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&d_output, sizeof(float)));
    CUDA_CHECK(cudaMemcpy(d_input, h_input, N * sizeof(float),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(
        cudaMemcpy(d_output, h_output, sizeof(float), cudaMemcpyHostToDevice));
    TimeCostCuda tc_gpu;
    // call cuda kernel
    solve(d_input, d_output, N);

    CUDA_CHECK(
        cudaMemcpy(h_output, d_output, sizeof(float), cudaMemcpyDeviceToHost));
    printf("gpu(reduction) timecost:%.5fms output:%.5f\n",
           tc_gpu.get_time_cost(), h_output[0]);

    TimeCost tc;
    cpu_reduction(h_input, cpu_reduction_output, N);

    printf("cpu(reduction) timecost:%.5fms output:%.5f\n",
           tc.get_time_cost(true), cpu_reduction_output[0]);

    for (int i = 0; i < N; ++i) {
        cpu_result_double += h_input[i];
    }
    printf("cpu(double) timecost:%.5fms output:%.5f\n", tc.get_time_cost(true),
           cpu_result_double);
    for (int i = 0; i < N; ++i) {
        cpu_result_float += h_input[i];
    }
    printf("cpu(float) timecost:%.5fms output:%.5f\n", tc.get_time_cost(),
           cpu_result_float);
    printf("diff between gpu and cpu(double): %.5f\n",
           fabs(h_output[0] - cpu_result_double));
    CUDA_CHECK(cudaFree(d_input));
    CUDA_CHECK(cudaFree(d_output));
    free(h_input);
    free(h_output);
    free(cpu_reduction_output);
    return 0;
}

Even without any tuning, the GPU is already fast

$ ./make.sh  reduction.cu && ./a.out 100000000

gpu(reduction) timecost:4.64150ms output:149976224.00000
cpu(reduction) timecost:650.92900ms output:149976240.00000
cpu(double) timecost:340.58000ms output:149976233.62818
cpu(float) timecost:385.87800ms output:67108864.00000
diff between gpu and cpu(double): 9.62818
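
Two details in the output are worth calling out. The naive float loop returns exactly 67108864 = 2^26: once the running sum reaches 2^26, the spacing between representable floats is 8, so adding values drawn from [0, 3) rounds away to nothing and the sum stops growing. The pairwise cpu_reduction keeps its addends at similar magnitudes, which is why its float result stays within a few units of the double reference.

The title also promises optimization. One standard refinement, shown below as an unbenchmarked sketch (block_reduce_shfl is a hypothetical name, drop-in compatible with the launch code above), is to finish each block with warp shuffles: the last rounds run in registers, the kernel uses one shared slot per warp instead of a full BLOCK_SIZE cache, and only a single __syncthreads() remains.

__global__ void block_reduce_shfl(const float* input, float* partial, int N) {
    __shared__ float warp_sums[BLOCK_SIZE / 32]; // one partial sum per warp
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    float local = 0;
    for (int i = tid; i < N; i += gridDim.x * blockDim.x) local += input[i];

    // reduce within each warp entirely in registers
    for (int offset = 16; offset > 0; offset >>= 1)
        local += __shfl_down_sync(0xffffffff, local, offset);

    int lane = threadIdx.x & 31, warp = threadIdx.x >> 5;
    if (lane == 0) warp_sums[warp] = local; // lane 0 holds each warp's total
    __syncthreads();

    // the first warp reduces the per-warp totals
    if (warp == 0) {
        local = (lane < BLOCK_SIZE / 32) ? warp_sums[lane] : 0.0f;
        for (int offset = 16; offset > 0; offset >>= 1)
            local += __shfl_down_sync(0xffffffff, local, offset);
        if (lane == 0) partial[blockIdx.x] = local;
    }
}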