OpenCL 中的暴力破解(从 CUDA 移植而来)不起作用

bruteforce in OpenCL (port from CUDA) isn't working

本文关键字:不起作用 破解 来自 CUDA OpenCL      更新时间:2023-10-16

*代码和问题中的另一个更新*

我刚开始学习OpenCL大约1周,正在尝试移植一个CUDA程序,它对MD5哈希进行暴力破解,以还原出原始字符串。我使用了两个文件:kernel.cl和main.cpp。

//this is kernel.cl
{...*defining some md5 variables*...}
/*
 * Adds incrementBy to the counter stored in ourBrute[0..bruteLength-1].
 *
 * The counter is kept one digit per byte, least-significant digit first, in
 * base charSetLen. A carry out of the last digit is dropped, so the counter
 * wraps around instead of growing.
 *
 * ourBrute     digits of the counter, updated in place
 * charSetLen   numeric base of the counter; 0 leaves the counter untouched
 *              (the original divided by zero in that case)
 * bruteLength  number of digits in ourBrute
 * incrementBy  amount to add
 */
void IncrementBruteGPU(unsigned char* ourBrute, unsigned int charSetLen, unsigned int bruteLength, unsigned int incrementBy){
    if (charSetLen == 0) {
        return;
    }
    unsigned int carry = incrementBy;
    for (unsigned int pos = 0; carry > 0 && pos < bruteLength; pos++) {
        /* Stay in unsigned arithmetic: the sum must not pass through a signed int. */
        unsigned int sum = carry + ourBrute[pos];
        ourBrute[pos] = (unsigned char)(sum % charSetLen);
        carry = sum / charSetLen;
    }
}
// NOTE(review): the body is elided in this post. Judging from the call site in
// crack(), this presumably computes the MD5 of the first `length` bytes of `data`
// and stores the four 32-bit result words in *a1, *b1, *c1 and *d1 -- confirm
// against the full kernel.cl.
void md5_vfy(unsigned char* data, unsigned int length, unsigned int *a1, unsigned int *b1, unsigned int *c1, unsigned int *d1){
{...*some md5 hashing function*...}}
/*
 * Brute-force kernel: each work-item tests 200 candidate strings against the
 * target MD5 words v1..v4.
 *
 * numThreads   stride between consecutive candidates of one work-item; must
 *              equal the global work size used for the launch
 * charSetLen   number of entries in cudaCharSet
 * bruteLength  candidate length in characters (at most 14, the buffer size)
 * v1..v4       the four 32-bit words of the target hash
 * cudaBrute    14 starting digits (indices into cudaCharSet) for this launch
 * cudaCharSet  characters the digits map to
 * correctPass  14-byte output; receives the matching string, NUL-terminated
 *              when it is shorter than 14 characters
 *
 * The host must advance cudaBrute by numThreads * 200 between launches to
 * match the 200 rounds performed here.
 */
__kernel void crack(unsigned int numThreads, unsigned int charSetLen,
unsigned int bruteLength, unsigned int v1,
unsigned int v2, unsigned int v3, unsigned int v4,
__constant unsigned char *cudaBrute, 
__constant unsigned char *cudaCharSet,
__global unsigned char *correctPass){
    /* word[] and ourBrute[] hold 14 entries; longer candidates would overflow them. */
    if (bruteLength > 14) {
        return;
    }

    unsigned int idx = get_global_id(0);
    unsigned char word[14];
    unsigned char ourBrute[14];
    for (int i = 0; i < 14; i++) {
        ourBrute[i] = cudaBrute[i];
        /* Zero the candidate so the unused tail copied to correctPass is defined. */
        word[i] = 0;
    }

    /* Offset this work-item's starting candidate by its global index. */
    IncrementBruteGPU(ourBrute, charSetLen, bruteLength, idx);

    for (int round = 0; round < 200; round++) {
        /* Translate the digit counter into characters. */
        for (unsigned int i = 0; i < bruteLength; i++) {
            word[i] = cudaCharSet[ourBrute[i]];
        }

        /* Hash the candidate and compare it with the target words. */
        unsigned int c1 = 0, c2 = 0, c3 = 0, c4 = 0;
        md5_vfy(word, bruteLength, &c1, &c2, &c3, &c4);
        if (c1 == v1 && c2 == v2 && c3 == v3 && c4 == v4) {
            for (int j = 0; j < 14; j++) {
                correctPass[j] = word[j];
            }
            /* Terminate only when the terminator still fits in the 14-byte buffer. */
            if (bruteLength < 14) {
                correctPass[bruteLength] = 0;
            }
        }

        /* Step to this work-item's next candidate. */
        IncrementBruteGPU(ourBrute, charSetLen, bruteLength, numThreads);
    }
}

这是主要的:

//just the main, not the entire main.cpp
int main( int argc, char** argv){
int digit=1;
int charSetLen = 0;
char hash[32];
char *strhash[32];
printf("Insert Hash: ");
scanf("%s", strhash);
system("cls");
int numThreads = BLOCKS * THREADS_PER_BLOCK;
unsigned char currentBrute[14];
unsigned char cpuCorrectPass[14];
ZeroFill(currentBrute, 14);
ZeroFill(cpuCorrectPass, 14);
charSetLen = 65;
unsigned char charSet[65];
memcpy(charSet, " abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789@_", charSetLen);
memcpy(hash, strhash, 32);
//break hash into 4 processes of MD5
unsigned int v1, v2, v3, v4;
md5_to_ints(hash,&v1,&v2,&v3,&v4);
//openCL starts here
cl_platform_id cpPlatform;        // OpenCL platform
cl_device_id device_id;           // device ID
cl_context context;               // context
cl_command_queue queue;           // command queue
cl_program program;               // program
cl_kernel kernel;                 // kernel
cl_int err;
cl_mem correctPass;
cl_mem cudaCharSet;
cl_mem cudaBrute;
size_t globalSize, localSize;
size_t bytes = 14*sizeof(char);
//5 work-groups
localSize = 10;
globalSize = 50;
// Bind to platform
err = clGetPlatformIDs(1, &cpPlatform, NULL);
if(err < 0) {
perror("Couldn't identify a platform");
exit(1);
} 
// Get ID for the device
err = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
if(err == CL_DEVICE_NOT_FOUND) {
err = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
}
if(err < 0) {
perror("Couldn't access any devices");
exit(1);   
}
// Create a context  
context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
if(err < 0) {
perror("Couldn't create a context");
exit(1);   
}
// Create a command queue 
queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err);
if(err < 0) {
perror("Couldn't create a command queue");
exit(1);   
}
// Build the program executable 
program = build_program(context, device_id, PROGRAM_FILE);
// Create the compute kernel in the program we wish to run
kernel = clCreateKernel(program, KERNEL_FUNC, &err);
if(err < 0) {
perror("Couldn't create a kernel");
exit(1);
}
// Create the input and output arrays in device memory for our calculation
cudaBrute = clCreateBuffer(context, CL_MEM_READ_ONLY, 14, NULL, NULL);
cudaCharSet = clCreateBuffer(context, CL_MEM_READ_ONLY, 95, NULL, NULL);
correctPass = clCreateBuffer(context, CL_MEM_READ_WRITE, 14, NULL, NULL);
// Write our data set into the input array in device memory
err = clEnqueueWriteBuffer(queue, correctPass, CL_TRUE, 0,
bytes, cpuCorrectPass, 0, NULL, NULL);
err = clEnqueueWriteBuffer(queue, cudaCharSet, CL_TRUE, 0,
bytes, charSet, 0, NULL, NULL);
// Set the arguments to our compute kernel
err  = clSetKernelArg(kernel, 0, sizeof(unsigned int), &numThreads);
err  |= clSetKernelArg(kernel, 1, sizeof(unsigned int), &charSetLen);
err  |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &digit);
err  |= clSetKernelArg(kernel, 3, sizeof(unsigned int), &v1);
err  |= clSetKernelArg(kernel, 4, sizeof(unsigned int), &v2);
err  |= clSetKernelArg(kernel, 5, sizeof(unsigned int), &v3);
err  |= clSetKernelArg(kernel, 6, sizeof(unsigned int), &v4);
err  |= clSetKernelArg(kernel, 7, sizeof(cl_mem), &cudaBrute);
err  |= clSetKernelArg(kernel, 8, sizeof(cl_mem), &cudaCharSet);
err  |= clSetKernelArg(kernel, 9, sizeof(cl_mem), &correctPass);
bool finished = false;
int ct = 0;
while(true){
do{
err = clEnqueueWriteBuffer(queue, cudaBrute, CL_TRUE, 0,
bytes, currentBrute, 0, NULL, NULL);
// Execute the kernel over the entire range of the data set  
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize,
0, NULL, NULL);
// Wait for the command queue to get serviced before reading back results
clFinish(queue);
// Read the results from the device
clEnqueueReadBuffer(queue, correctPass, CL_TRUE, 0, bytes, cpuCorrectPass, 0, NULL, NULL );
if(cpuCorrectPass[0] != 0)
{       
printf("MD5 Cracked---->t");
int k = 0;
while(cpuCorrectPass[k] != 0)
{
printf("%c", cpuCorrectPass[k]);
k++;
}
printf("nn");
return 0;
}
finished = BruteIncrement(currentBrute, charSetLen, digit, numThreads * 200);
if(ct % OUTPUT_INTERVAL == 0)
{
printf("STATUS: ");
int k = 0;
for(k = 0; k < digit; k++)
printf("%c",charSet[currentBrute[k]]);
printf("n");
}
ct++;
} while(!finished);
digit=digit+1;
}   
// release OpenCL resources
clReleaseMemObject(correctPass);
clReleaseMemObject(cudaCharSet);
clReleaseMemObject(cudaBrute);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(queue);
clReleaseContext(context);
return 0;}

这个程序的问题是它永远找不到正确的字符串。比较暴力哈希和输入哈希的想法似乎不起作用。我得到了CUDA版本的完美工作。

请告诉我是什么原因导致它无法正确运行。我怀疑是内核根本没有运行,或者是我对OpenCL中内存/缓冲区的读写缺乏了解,又或者是其他常见原因导致了这种情况。

*如果你想看所有的文件,请问我。因为我觉得如果我把它们放在这里会太长。谢谢你,很抱歉格式不好。

您的内核正在读写在OpenCL内核源代码中以程序作用域定义的常量数组(cudaBrute、cudaCharSet、correctPass)。这些数组没有被初始化,主机也永远无法从内核获得输出。要把输入数据从主机传给内核并取回结果,需要使用内核参数,而不是程序作用域变量。

您的内核定义应该是这样的:

// Suggested kernel signature: the three buffers (cudaBrute, cudaCharSet,
// correctPass) arrive as __global pointer arguments, so the host supplies them
// through clSetKernelArg. The body is left as a placeholder.
__kernel void crack(unsigned int numThreads, unsigned int charSetLen,
unsigned int bruteLength, unsigned int v1,
unsigned int v2, unsigned int v3, unsigned int v4,
__global uchar *cudaBrute, 
__global uchar *cudaCharSet,
__global uchar *correctPass)
{
...
(do stuff with the arguments)
...
}

要设置主机代码中的参数,您可以执行以下操作:

// Bind all ten kernel parameters by index. Each status code is OR-ed into err,
// so err stays zero only when every call succeeded.

// Scalars: work-item count, character-set size, candidate length.
err = clSetKernelArg(kernel, 0, sizeof(int), &numThreads);
err = err | clSetKernelArg(kernel, 1, sizeof(int), &charSetLen);
err = err | clSetKernelArg(kernel, 2, sizeof(int), &digit);

// The four 32-bit words of the target hash.
err = err | clSetKernelArg(kernel, 3, sizeof(unsigned int), &v1);
err = err | clSetKernelArg(kernel, 4, sizeof(unsigned int), &v2);
err = err | clSetKernelArg(kernel, 5, sizeof(unsigned int), &v3);
err = err | clSetKernelArg(kernel, 6, sizeof(unsigned int), &v4);

// Device buffers created with clCreateBuffer.
err = err | clSetKernelArg(kernel, 7, sizeof(cl_mem), &cudaBrute);
err = err | clSetKernelArg(kernel, 8, sizeof(cl_mem), &cudaCharSet);
err = err | clSetKernelArg(kernel, 9, sizeof(cl_mem), &correctPass);

请注意第二个参数,它是内核定义中的参数索引,以及我们现在如何在使用clCreateBuffer创建的缓冲区中传递最后三个参数。


(编辑:在进一步调试后发现了更多问题)

您在主机上更新了digit的值。为了在每次内核调用时把更新后的值传给设备,需要重新设置该内核参数。只需把下面这一行移到clEnqueueNDRangeKernel调用之前即可:

// Re-bind kernel argument 2 so the next launch uses the current value of digit.
err  |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &digit);

将数据写入cudaCharSet缓冲区时,需要确保写入的数据量正确。您的代码当前使用bytes(即14),但实际上应该是charSetLen(即65):

// Copy charSetLen bytes of charSet into the cudaCharSet buffer, starting at offset 0.
err = clEnqueueWriteBuffer(queue, cudaCharSet, CL_TRUE, 0,
charSetLen, charSet, 0, NULL, NULL);