验证中...
Languages: C/C++
Categories: 常用工具包
Latest update 2019-11-09 17:43
RadixSort.cu
Raw Copy
#include <iostream>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <device_functions.h>
#include <cstdlib>
#include <ctime>
// Number of keys sorted by the demo; also the capacity of the kernel's
// shared-memory sort buffers.
#define NUM_ELEM 32
// Number of interleaved lists (one per thread) used by the sort; also the
// capacity of the per-list cursor table used by the merge step.
#define MAX_NUM_LISTS 32
// 32-bit unsigned key type. `unsigned long` is 64 bits wide on LP64 platforms
// (e.g. 64-bit Linux), which contradicts the name and the 32-pass radix sort;
// `unsigned int` is 32 bits wide on every platform CUDA supports.
typedef unsigned int u32;
/**
 * Sorts, in place and in ascending order, the interleaved list owned by the
 * calling thread using a least-significant-bit-first binary radix sort.
 *
 * The buffer is treated as `numList` interleaved lists: list `tid` is made of
 * the elements at indices tid, tid + numList, tid + 2 * numList, ... that lie
 * below `numElements`. A thread only ever touches the slots of its own list,
 * so no synchronisation is needed between the per-bit passes.
 *
 * @param sortTMP     Buffer holding the keys; each list is sorted in place.
 * @param numList     Number of interleaved lists (the stride between two
 *                    consecutive elements of the same list).
 * @param numElements Total number of keys in `sortTMP`.
 * @param tid         Index of the list handled by the calling thread.
 * @param sortTMP1    Scratch buffer, at least as large as `sortTMP`, used to
 *                    park the keys whose current bit is 1.
 *
 * Must be called by every thread of the block: it ends with a block-wide
 * barrier so the sorted lists are visible to all threads afterwards.
 */
__device__ void radixSort(u32 * const sortTMP, const u32 numList, const u32 numElements, const u32 tid, u32 *const sortTMP1)
{
    // Threads without a list of their own stay idle: letting them run would
    // make them alias (and race on) another thread's list. The same test also
    // keeps numList == 0 from spinning forever in the strided loops below.
    if (tid < numList)
    {
        // One pass per key bit, derived from the key type so that every bit
        // of the key takes part in the sort whatever width u32 has.
        const u32 numBits = static_cast<u32>(sizeof(u32) * 8);
        for (u32 bit = 0; bit < numBits; ++bit)
        {
            // Shift an unsigned one: `1 << 31` on a signed int overflows.
            const u32 bitMask = static_cast<u32>(1) << bit;
            u32 baseCNT0 = 0;   // next free slot offset of the 0-bucket (kept in sortTMP)
            u32 baseCNT1 = 0;   // next free slot offset of the 1-bucket (kept in sortTMP1)

            // Stable partition of this thread's list on the current bit.
            // Writing the 0-bucket back into sortTMP is safe because
            // baseCNT0 never runs ahead of the read offset i.
            for (u32 i = 0; i + tid < numElements; i += numList)
            {
                const u32 elem = sortTMP[i + tid];
                if ((elem & bitMask) != 0)
                {
                    sortTMP1[baseCNT1 + tid] = elem;
                    baseCNT1 += numList;
                }
                else
                {
                    sortTMP[baseCNT0 + tid] = elem;
                    baseCNT0 += numList;
                }
            }

            // Append the 1-bucket right after the 0-bucket.
            for (u32 i = 0; i < baseCNT1; i += numList)
            {
                sortTMP[baseCNT0 + i + tid] = sortTMP1[i + tid];
            }
        }
    }

    // Outside the branch above so that every thread of the block reaches it.
    __syncthreads();
}
/**
 * Copies `numElements` keys from `data` into `sortTMP`, each thread copying
 * the elements of its own interleaved list (indices tid, tid + numList, ...).
 *
 * @param data        Source keys.
 * @param sortTMP     Destination buffer with room for `numElements` keys.
 * @param numList     Number of interleaved lists (stride between the elements
 *                    copied by one thread).
 * @param numElements Total number of keys to copy.
 * @param tid         Index of the list handled by the calling thread.
 *
 * Must be called by every thread of the block: it ends with a block-wide
 * barrier so the copied keys are visible to all threads afterwards.
 */
__device__ void copyDataToShared(const u32 * const data, u32 * const sortTMP, const u32 numList, const u32 numElements, const u32 tid)
{
    // Threads without a list of their own have nothing to copy; this test
    // also keeps numList == 0 from looping forever.
    if (tid < numList)
    {
        // Bounding on i + tid (not just i) prevents reading and writing past
        // the end when numElements is not a multiple of numList.
        for (u32 i = 0; i + tid < numElements; i += numList)
        {
            sortTMP[i + tid] = data[i + tid];
        }
    }

    // Outside the branch above so that every thread of the block reaches it.
    __syncthreads();
}
/**
 * Merges `numList` interleaved lists from `srcArray` into `destArray`.
 *
 * List `l` consists of the elements srcArray[l], srcArray[l + numList], ...
 * and is expected to hold numElements / numList keys. For every output slot
 * the smallest key among the current heads of all lists is selected, so the
 * output is ascending when each list is already sorted ascending. The merge
 * itself is performed serially by thread 0.
 *
 * @param srcArray    Interleaved source lists.
 * @param destArray   Receives `numElements` merged keys.
 * @param numList     Number of lists; must be in 1..MAX_NUM_LISTS, otherwise
 *                    nothing is merged and `destArray` is left untouched.
 * @param numElements Total number of keys; expected to be a multiple of
 *                    `numList`.
 * @param tid         Index of the calling thread.
 *
 * Must be called by every thread of the block (it contains a block-wide
 * barrier). There is no barrier after the merge.
 */
__device__ void mergeArray(const u32* const srcArray, u32* const destArray, const u32 numList, const u32 numElements, const u32 tid)
{
    // Per-list cursor: how many keys of each list have been consumed so far.
    __shared__ u32 listIndex[MAX_NUM_LISTS];

    // Only threads that map onto a table slot may write it; without this
    // test a launch with more than MAX_NUM_LISTS threads writes out of bounds.
    if (tid < MAX_NUM_LISTS)
    {
        listIndex[tid] = 0;
    }
    __syncthreads();

    // numList == 0 would divide by zero and numList > MAX_NUM_LISTS would
    // overrun listIndex, so both are rejected.
    if (tid == 0 && numList != 0 && numList <= MAX_NUM_LISTS)
    {
        const u32 numElementsPerList = (numElements / numList);
        // Largest representable key, whatever width u32 has.
        const u32 maxKey = ~static_cast<u32>(0);

        for (u32 i = 0; i < numElements; ++i)
        {
            u32 minVal = maxKey;
            u32 minIdx = 0;
            for (u32 list = 0; list < numList; ++list)
            {
                // Skip lists that are already fully consumed.
                if (listIndex[list] < numElementsPerList)
                {
                    const u32 srcIDX = list + (listIndex[list] * numList);
                    const u32 data = srcArray[srcIDX];
                    // `<=` rather than `<` so that a head equal to the
                    // sentinel value is still picked up.
                    if (data <= minVal)
                    {
                        minVal = data;
                        minIdx = list;
                    }
                }
            }
            listIndex[minIdx]++;
            destArray[i] = minVal;
        }
    }
}
/**
 * Sorts `numElements` keys of `data` in ascending order, in place.
 *
 * The keys are staged in shared memory as `numList` interleaved lists, each
 * list is radix-sorted by one thread, and thread 0 merges the sorted lists
 * back into `data`.
 *
 * Launch layout: one block, with blockDim.x >= numList (thread t owns list t).
 * The staging buffers live in per-block shared memory, so only block 0 does
 * any work; additional blocks return immediately.
 * Shared memory: two static buffers of NUM_ELEM keys, plus the cursor table
 * declared inside mergeArray.
 *
 * @param data        Device pointer to the keys; overwritten with the sorted
 *                    result.
 * @param numList     Number of interleaved lists, in 1..MAX_NUM_LISTS.
 * @param numElements Number of keys, at most NUM_ELEM and expected to be a
 *                    multiple of `numList`.
 *
 * If the configuration does not fit the fixed shared-memory capacities the
 * kernel returns without touching `data`.
 */
__global__ void gpuSortArrayArray(u32 * const data, const u32 numList, const u32 numElements)
{
    // All conditions are uniform across a block, so either every thread of
    // the block returns here or none does; no barrier can be left waiting.
    if (blockIdx.x != 0 || numList == 0 || numList > MAX_NUM_LISTS || numElements > NUM_ELEM)
    {
        return;
    }

    // Block-local index: it addresses per-block shared memory, so the
    // grid-global index must not be used here.
    const u32 tid = threadIdx.x;

    __shared__ u32 sortTMP[NUM_ELEM];
    __shared__ u32 sortTMP1[NUM_ELEM];

    copyDataToShared(data, sortTMP, numList, numElements, tid);
    radixSort(sortTMP, numList, numElements, tid, sortTMP1);
    mergeArray(sortTMP, data, numList, numElements, tid);
}
// Returns true when `status` is cudaSuccess; otherwise reports the failed
// step on stderr and returns false.
static bool cudaStepSucceeded(const cudaError_t status, const char * const what)
{
    if (status == cudaSuccess)
    {
        return true;
    }
    std::cerr << "CUDA error in " << what << ": " << cudaGetErrorString(status) << std::endl;
    return false;
}

// Prints `count` values, ten per row, and always finishes the last row with a
// newline so that whatever is printed next starts on a fresh line.
static void printValuesInRows(const u32 * const values, const int count)
{
    for (int i = 0; i < count; ++i)
    {
        std::cout << values[i] << " ";
        if ((i + 1) % 10 == 0)
            std::cout << std::endl;
    }
    if (count % 10 != 0)
        std::cout << std::endl;
}

/**
 * Demo driver: fills an array with NUM_ELEM random keys in [0, 4096), sorts
 * it on the GPU with gpuSortArrayArray and prints the keys before and after
 * sorting together with the elapsed time.
 *
 * Returns 0 on success and EXIT_FAILURE when any CUDA call fails.
 */
int main(void)
{
    u32 dataInput[NUM_ELEM];
    u32 * data = NULL;
    const size_t numBytes = sizeof(u32) * NUM_ELEM;

    srand(static_cast<unsigned int>(time(0)));
    for (int i = 0; i < NUM_ELEM; ++i)
    {
        dataInput[i] = rand() % 4096;
    }

    std::cout << "排序前:" << std::endl;
    printValuesInRows(dataInput, NUM_ELEM);

    const clock_t start = clock();

    bool ok = cudaStepSucceeded(cudaMalloc((void**)&data, numBytes), "cudaMalloc");
    ok = ok && cudaStepSucceeded(cudaMemcpy(data, dataInput, numBytes, cudaMemcpyHostToDevice),
                                 "cudaMemcpy (host to device)");
    if (ok)
    {
        gpuSortArrayArray<<<1, MAX_NUM_LISTS>>>(data, MAX_NUM_LISTS, NUM_ELEM);
        // A kernel launch returns nothing; configuration errors are only
        // visible through cudaGetLastError().
        ok = cudaStepSucceeded(cudaGetLastError(), "gpuSortArrayArray launch");
    }
    // The blocking copy also waits for the kernel, so errors raised while the
    // kernel was executing are reported by this call.
    ok = ok && cudaStepSucceeded(cudaMemcpy(dataInput, data, numBytes, cudaMemcpyDeviceToHost),
                                 "cudaMemcpy (device to host)");

    const clock_t end = clock();

    // Released on every path; cudaFree accepts a null pointer.
    cudaFree(data);

    if (!ok)
    {
        return EXIT_FAILURE;
    }

    std::cout << "排序后:" << std::endl;
    printValuesInRows(dataInput, NUM_ELEM);
    std::cout << "共耗时:" << (double)(end - start)/CLOCKS_PER_SEC << std::endl;
    return 0;
}
radix.png

Comment list( 0 )

Sign in to post a comment

Help Search

191139_cd20d5fd_5186603 191143_ebef6f8d_5186603