%qtconsole --colors=linux from __future__ import division import numpy as np import pyopencl as cl import pyopencl.array import pyopencl.tools %load_ext pyopencl.ipython_ext ctx = cl.create_some_context() queue = cl.CommandQueue(ctx) device=ctx.devices[0] print device.max_mem_alloc_size print device.max_work_group_size %%cl_kernel -o "-cl-fast-relaxed-math" __kernel void get_id(__global int *c, const int x) { int gid = get_global_id(0); c[gid] = gid+x; } c = cl.array.empty(queue, 100, dtype=np.int32) get_id(queue, (100,), None, c.data, np.int32(33)) c.get() %%cl_kernel __kernel void ComputeHashes(__global uint4 * hashes, __constant char * charset, __constant char * base, const uint charsetLength, const uint prefixLength, const uint plainTextLength) { uint X[16]; uint id = get_global_id(0); int counter = id; int oc, a = 0, carry = 0; X[0] = 0; X[1] = 0; X[2] = 0; X[3] = 0; X[4] = 0; X[5] = 0; X[6] = 0; X[7] = 0; X[8] = 0; X[9] = 0; X[10] = 0; X[11] = 0; X[12] = 0; X[13] = 0; X[14] = 0; X[15] = 0; for (int i = 0; i < prefixLength; ++i) { X[i >> 2] |= base[i] << ((i & 3) << 3); } for (int i = prefixLength; i < plainTextLength; ++i) { oc = counter / charsetLength; a = base[i] + carry + counter - oc * charsetLength; if (a >= charsetLength) { a -= charsetLength; carry = 1; } else carry = 0; X[i >> 2] |= charset[a] << ((i & 3) << 3); counter = oc; } X[plainTextLength >> 2] |= ((uint)(0x00000080) << ((plainTextLength & 3) << 3)); uint A, B, C, D; #define S(x,n) ((x << n) | ((x & 0xFFFFFFFF) >> (32 - n))) #define P(a,b,c,d,k,s,t) \ { \ a += F(b,c,d) + X[k] + t; a = S(a,s) + b; \ } \ #define P0(a,b,c,d,k,s,t) \ { \ a += F(b,c,d) + t; a = S(a,s) + b; \ } \ #define P14(a,b,c,d,k,s,t) \ { \ a += F(b,c,d) + (plainTextLength << 3) + t; a = S(a,s) + b; \ } \ A = 0x67452301; B = 0xefcdab89; C = 0x98badcfe; D = 0x10325476; #define F(x,y,z) (z ^ (x & (y ^ z))) P( A, B, C, D, 0, 7, 0xD76AA478 ); P( D, A, B, C, 1, 12, 0xE8C7B756 ); P( C, D, A, B, 2, 17, 0x242070DB ); P( B, C, D, A, 3, 22, 0xC1BDCEEE ); P( A, B, C, D, 4, 7, 0xF57C0FAF ); P( D, A, B, C, 5, 12, 0x4787C62A ); P( C, D, A, B, 6, 17, 0xA8304613 ); P( B, C, D, A, 7, 22, 0xFD469501 ); P( A, B, C, D, 8, 7, 0x698098D8 ); P( D, A, B, C, 9, 12, 0x8B44F7AF ); P( C, D, A, B, 10, 17, 0xFFFF5BB1 ); P( B, C, D, A, 11, 22, 0x895CD7BE ); P( A, B, C, D, 12, 7, 0x6B901122 ); P( D, A, B, C, 13, 12, 0xFD987193 ); P14( C, D, A, B, 14, 17, 0xA679438E ); P( B, C, D, A, 15, 22, 0x49B40821 ); #undef F #define F(x,y,z) (y ^ (z & (x ^ y))) P( A, B, C, D, 1, 5, 0xF61E2562 ); P( D, A, B, C, 6, 9, 0xC040B340 ); P( C, D, A, B, 11, 14, 0x265E5A51 ); P( B, C, D, A, 0, 20, 0xE9B6C7AA ); P( A, B, C, D, 5, 5, 0xD62F105D ); P( D, A, B, C, 10, 9, 0x02441453 ); P( C, D, A, B, 15, 14, 0xD8A1E681 ); P( B, C, D, A, 4, 20, 0xE7D3FBC8 ); P( A, B, C, D, 9, 5, 0x21E1CDE6 ); P14( D, A, B, C, 14, 9, 0xC33707D6 ); P( C, D, A, B, 3, 14, 0xF4D50D87 ); P( B, C, D, A, 8, 20, 0x455A14ED ); P( A, B, C, D, 13, 5, 0xA9E3E905 ); P( D, A, B, C, 2, 9, 0xFCEFA3F8 ); P( C, D, A, B, 7, 14, 0x676F02D9 ); P( B, C, D, A, 12, 20, 0x8D2A4C8A ); #undef F #define F(x,y,z) (x ^ y ^ z) P( A, B, C, D, 5, 4, 0xFFFA3942 ); P( D, A, B, C, 8, 11, 0x8771F681 ); P( C, D, A, B, 11, 16, 0x6D9D6122 ); P14( B, C, D, A, 14, 23, 0xFDE5380C ); P( A, B, C, D, 1, 4, 0xA4BEEA44 ); P( D, A, B, C, 4, 11, 0x4BDECFA9 ); P( C, D, A, B, 7, 16, 0xF6BB4B60 ); P( B, C, D, A, 10, 23, 0xBEBFBC70 ); P( A, B, C, D, 13, 4, 0x289B7EC6 ); P( D, A, B, C, 0, 11, 0xEAA127FA ); P( C, D, A, B, 3, 16, 0xD4EF3085 ); P( B, C, D, A, 6, 23, 0x04881D05 ); P( A, B, C, D, 9, 4, 0xD9D4D039 ); P( D, A, B, C, 12, 11, 0xE6DB99E5 ); P( C, D, A, B, 15, 16, 0x1FA27CF8 ); P( B, C, D, A, 2, 23, 0xC4AC5665 ); #undef F #define F(x,y,z) (y ^ (x | ~z)) P( A, B, C, D, 0, 6, 0xF4292244 ); P( D, A, B, C, 7, 10, 0x432AFF97 ); P14( C, D, A, B, 14, 15, 0xAB9423A7 ); P( B, C, D, A, 5, 21, 0xFC93A039 ); P( A, B, C, D, 12, 6, 0x655B59C3 ); P( D, A, B, C, 3, 10, 0x8F0CCC92 ); P( C, D, A, B, 10, 15, 0xFFEFF47D ); P( B, C, D, A, 1, 21, 0x85845DD1 ); P( A, B, C, D, 8, 6, 0x6FA87E4F ); P( D, A, B, C, 15, 10, 0xFE2CE6E0 ); P( C, D, A, B, 6, 15, 0xA3014314 ); P( B, C, D, A, 13, 21, 0x4E0811A1 ); P( A, B, C, D, 4, 6, 0xF7537E82 ); P( D, A, B, C, 11, 10, 0xBD3AF235 ); P( C, D, A, B, 2, 15, 0x2AD7D2BB ); P( B, C, D, A, 9, 21, 0xEB86D391 ); #undef F hashes[id].x = A + 0x67452301; hashes[id].y = B + 0xefcdab89; hashes[id].z = C + 0x98badcfe; hashes[id].w = D + 0x10325476; } from time import time t0=time() N=100000 hashes = cl.array.empty(queue, N, dtype=cl.array.vec.uint4) charset_str = "abcefghijklmnopqrstuvwxyz0123456789" charset_host = np.array(map(ord, charset_str), dtype=np.uint8) charset = cl.array.to_device(queue, charset_host) prefixLength = np.uint32(17) base = cl.array.to_device(queue, np.array([ord('x')]*prefixLength+[0, 0, 0, 0], dtype=np.uint8)) charsetLength = np.uint32(charset.shape[0]) plainTextLength = np.uint32(base.shape[0]) ComputeHashes(queue, (N,), None, hashes.data, charset.data, base.data, charsetLength, prefixLength, plainTextLength).wait() rtn = hashes.get() print time()-t0 from itertools import product, islice from hashlib import md5 def hd(n): return "".join("%02x"%((x>>i)&0xff) for x in n for i in range(0,32,8)) a= map(hd, rtn) t0 = time() h = (('x'*prefixLength+"".join(reversed(s))) for s in islice(product(*[charset_str ]*4), len(a))) b = map(lambda x:md5(x).digest(), h) print time()-t0 !set PYOPENCL_COMPILER_OUTPUT=1 %%cl_kernel /* * This software is Copyright (c) 2012 Myrice * and it is hereby released to the general public under the following terms: * Redistribution and use in source and binary forms, with or without modification, are permitted. */ #ifdef cl_khr_byte_addressable_store #pragma OPENCL EXTENSION cl_khr_byte_addressable_store : disable #endif #define uint8_t unsigned char #define uint32_t unsigned int #define uint64_t unsigned long #define SALT_SIZE 0 #define BINARY_SIZE 8 #define FULL_BINARY_SIZE 64 #define PLAINTEXT_LENGTH 20 #define CIPHERTEXT_LENGTH 128 #define KEYS_PER_CRYPT (1024*512) #define ITERATIONS 1 #define MIN_KEYS_PER_CRYPT (KEYS_PER_CRYPT) #define MAX_KEYS_PER_CRYPT (ITERATIONS*KEYS_PER_CRYPT) /// Warning: This version of SWAP64(n) is slow and avoid bugs on AMD GPUs(7970) #define SWAP64(n) as_ulong(as_uchar8(n).s76543210) /*#define SWAP64(n) \ (((n) << 56) \ | (((n) & 0xff00) << 40) \ | (((n) & 0xff0000) << 24) \ | (((n) & 0xff000000) << 8) \ | (((n) >> 8) & 0xff000000) \ | (((n) >> 24) & 0xff0000) \ | (((n) >> 40) & 0xff00) \ | ((n) >> 56)) */ #define rol(x,n) ((x << n) | (x >> (64-n))) #define ror(x,n) ((x >> n) | (x << (64-n))) #define Ch(x,y,z) ((x & y) ^ ( (~x) & z)) #define Maj(x,y,z) ((x & y) ^ (x & z) ^ (y & z)) #define Sigma0(x) ((ror(x,28)) ^ (ror(x,34)) ^ (ror(x,39))) #define Sigma1(x) ((ror(x,14)) ^ (ror(x,18)) ^ (ror(x,41))) #define sigma0(x) ((ror(x,1)) ^ (ror(x,8)) ^(x>>7)) #define sigma1(x) ((ror(x,19)) ^ (ror(x,61)) ^(x>>6)) typedef struct { // notice memory align problem uint64_t H[8]; uint32_t buffer[32]; //1024 bits uint32_t buflen; } sha512_ctx; typedef struct { uint8_t length; char v[PLAINTEXT_LENGTH+1]; } sha512_key; /* Macros for reading/writing chars from int32's */ #define PUTCHAR(buf, index, val) (buf)[(index)>>2] = ((buf)[(index)>>2] & ~(0xffU << (((index) & 3) << 3))) + ((val) << (((index) & 3) << 3)) __constant uint64_t k[] = { 0x428a2f98d728ae22UL, 0x7137449123ef65cdUL, 0xb5c0fbcfec4d3b2fUL, 0xe9b5dba58189dbbcUL, 0x3956c25bf348b538UL, 0x59f111f1b605d019UL, 0x923f82a4af194f9bUL, 0xab1c5ed5da6d8118UL, 0xd807aa98a3030242UL, 0x12835b0145706fbeUL, 0x243185be4ee4b28cUL, 0x550c7dc3d5ffb4e2UL, 0x72be5d74f27b896fUL, 0x80deb1fe3b1696b1UL, 0x9bdc06a725c71235UL, 0xc19bf174cf692694UL, 0xe49b69c19ef14ad2UL, 0xefbe4786384f25e3UL, 0x0fc19dc68b8cd5b5UL, 0x240ca1cc77ac9c65UL, 0x2de92c6f592b0275UL, 0x4a7484aa6ea6e483UL, 0x5cb0a9dcbd41fbd4UL, 0x76f988da831153b5UL, 0x983e5152ee66dfabUL, 0xa831c66d2db43210UL, 0xb00327c898fb213fUL, 0xbf597fc7beef0ee4UL, 0xc6e00bf33da88fc2UL, 0xd5a79147930aa725UL, 0x06ca6351e003826fUL, 0x142929670a0e6e70UL, 0x27b70a8546d22ffcUL, 0x2e1b21385c26c926UL, 0x4d2c6dfc5ac42aedUL, 0x53380d139d95b3dfUL, 0x650a73548baf63deUL, 0x766a0abb3c77b2a8UL, 0x81c2c92e47edaee6UL, 0x92722c851482353bUL, 0xa2bfe8a14cf10364UL, 0xa81a664bbc423001UL, 0xc24b8b70d0f89791UL, 0xc76c51a30654be30UL, 0xd192e819d6ef5218UL, 0xd69906245565a910UL, 0xf40e35855771202aUL, 0x106aa07032bbd1b8UL, 0x19a4c116b8d2d0c8UL, 0x1e376c085141ab53UL, 0x2748774cdf8eeb99UL, 0x34b0bcb5e19b48a8UL, 0x391c0cb3c5c95a63UL, 0x4ed8aa4ae3418acbUL, 0x5b9cca4f7763e373UL, 0x682e6ff3d6b2b8a3UL, 0x748f82ee5defb2fcUL, 0x78a5636f43172f60UL, 0x84c87814a1f0ab72UL, 0x8cc702081a6439ecUL, 0x90befffa23631e28UL, 0xa4506cebde82bde9UL, 0xbef9a3f7b2c67915UL, 0xc67178f2e372532bUL, 0xca273eceea26619cUL, 0xd186b8c721c0c207UL, 0xeada7dd6cde0eb1eUL, 0xf57d4f7fee6ed178UL, 0x06f067aa72176fbaUL, 0x0a637dc5a2c898a6UL, 0x113f9804bef90daeUL, 0x1b710b35131c471bUL, 0x28db77f523047d84UL, 0x32caab7b40c72493UL, 0x3c9ebe0a15c9bebcUL, 0x431d67c49c100d4cUL, 0x4cc5d4becb3e42b6UL, 0x597f299cfc657e2aUL, 0x5fcb6fab3ad6faecUL, 0x6c44198c4a475817UL, }; inline void sha512(__global const char* password, uint8_t pass_len, __global uint64_t* hash, uint32_t offset) { __private sha512_ctx ctx; uint32_t* b32 = ctx.buffer; //set password to buffer for (uint32_t i = 0; i < pass_len; i++) { PUTCHAR(b32,i,password[i]); } ctx.buflen = pass_len; //append 1 to ctx buffer uint32_t length = ctx.buflen; PUTCHAR(b32, length, 0x80); while((++length & 3) != 0) { PUTCHAR(b32, length, 0); } uint32_t* buffer32 = b32+(length>>2); for(uint32_t i = length; i < 128; i+=4) {// append 0 to 128 *buffer32++=0; } //append length to buffer uint64_t *buffer64 = (uint64_t *)ctx.buffer; buffer64[15] = SWAP64((uint64_t) ctx.buflen * 8); // sha512 main int i; uint64_t a = 0x6a09e667f3bcc908UL; uint64_t b = 0xbb67ae8584caa73bUL; uint64_t c = 0x3c6ef372fe94f82bUL; uint64_t d = 0xa54ff53a5f1d36f1UL; uint64_t e = 0x510e527fade682d1UL; uint64_t f = 0x9b05688c2b3e6c1fUL; uint64_t g = 0x1f83d9abfb41bd6bUL; uint64_t h = 0x5be0cd19137e2179UL; __private uint64_t w[16]; uint64_t *data = (uint64_t *) ctx.buffer; #pragma unroll 16 for (i = 0; i < 16; i++) w[i] = SWAP64(data[i]); uint64_t t1, t2; #pragma unroll 16 for (i = 0; i < 16; i++) { t1 = k[i] + w[i] + h + Sigma1(e) + Ch(e, f, g); t2 = Maj(a, b, c) + Sigma0(a); h = g; g = f; f = e; e = d + t1; d = c; c = b; b = a; a = t1 + t2; } for (i = 16; i < 80; i++) { w[i & 15] =sigma1(w[(i - 2) & 15]) + sigma0(w[(i - 15) & 15]) + w[(i -16) & 15] + w[(i - 7) & 15]; t1 = k[i] + w[i & 15] + h + Sigma1(e) + Ch(e, f, g); t2 = Maj(a, b, c) + Sigma0(a); h = g; g = f; f = e; e = d + t1; d = c; c = b; b = a; a = t1 + t2; } hash[offset] = SWAP64(a); } __kernel void kernel_sha512( __global const sha512_key *password, __global uint64_t *hash) { uint32_t idx = get_global_id(0); for(uint32_t it = 0; it < ITERATIONS; ++it) { uint32_t offset = idx+it*KEYS_PER_CRYPT; sha512(password[offset].v, password[offset].length, hash, offset); } } __kernel void kernel_cmp( __constant uint64_t* binary, __global uint64_t *hash, __global uint32_t* result) { uint32_t idx = get_global_id(0); if(idx == 0) *result = 0; for(uint32_t it = 0; it < ITERATIONS; ++it) { uint32_t offset = idx+it*KEYS_PER_CRYPT; if (*binary == hash[offset]) *result = 1; } } """ typedef struct { uint8_t length; char v[PLAINTEXT_LENGTH+1]; } sha512_key; """ sha512_dtype = np.dtype([("length", np.uint8), ("v", 'u1', 21)]) pyopencl.tools.register_dtype(sha512_dtype, 'sha512_key') from time import time t0=time() keys = ["", "bbb", "ccc"] N=len(keys) keys_host = np.array([(len(k), map(ord,k)+[0]*(21-len(k)) ) for k in keys], dtype=sha512_dtype) keys_cl = cl.array.to_device(queue, keys_host) hashes = cl.array.zeros(queue, N*8, dtype=cl.array.vec.ulong8) print hashes.get() rtn = kernel_sha512(queue, (N,), None, keys_cl.data, hashes.data) hex(hashes.get()[0][0]) from hashlib import sha512 sha512('aaa').hexdigest() hex(hashes.get()[0][0])