Use the my_md5 function to calculate the MD5 hash value of an input file.
my_md5.cxx implements a simple deterministic scalar user-defined function (UDF) that calculates the MD5 hash value of an input file (a LONG BINARY argument). You can use the sample my_md5.cxx to process up to 4GB of input data.
The my_md5 function uses the get_piece() API to stream data in pieces. The streaming approach allows the UDF to allocate chunks of memory that are filled in each time the get_piece() API is called. By allocating pieces or chunks, the UDF does not have to allocate enough storage to hold the entire data of the column value. For example, without streaming, a UDF that wants to process a column value with a size of 400MB needs to allocate a memory block of 400MB in order to hold the value. However, by using the get_piece() API, the UDF is able to process the data value in blocks of 4MB.
#include "extfnapiv3.h"
#include <stdlib.h>
#include <string.h>
#include <math.h>

// A simple deterministic scalar UDF that calculates
// the MD5 hash value of an input file (a LOB binary argument)
//
//   CREATE FUNCTION my_md5(IN arg1 LONG BINARY)
//   RETURNS VARCHAR(32)
//   DETERMINISTIC
//   IGNORE NULL VALUES
//   EXTERNAL NAME 'my_md5@libudfex'

// MD5 F, G, H and I functions.
// Every macro argument is parenthesized (including under '~') so that the
// macros stay correct when they are handed compound expressions.
#define F_md5(X,Y,Z) (((X) & (Y)) | ((~(X)) & (Z)))
#define G_md5(X,Y,Z) (((X) & (Z)) | ((Y) & (~(Z))))
#define H_md5(X,Y,Z) ((X) ^ (Y) ^ (Z))
#define I_md5(X,Y,Z) ((Y) ^ ((X) | (~(Z))))

// S constants for rotations (Srn = n-th shift amount of round r)
#define S11 7
#define S12 12
#define S13 17
#define S14 22
#define S21 5
#define S22 9
#define S23 14
#define S24 20
#define S31 4
#define S32 11
#define S33 16
#define S34 23
#define S41 6
#define S42 10
#define S43 15
#define S44 21

/* ROTATE_LEFT rotates x left n bits (x is treated as a 32-bit quantity). */
#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n))))

// Size in bytes of one MD5 partition (512 bits).
static const a_sql_uint32 MD5_PARTITION_LEN = 64;

// Number of input bytes streamed and digested per iteration (1MB).
// Must stay a multiple of MD5_PARTITION_LEN.
static const a_sql_uint32 MD5_STREAM_BLOCK_LEN = 1024 * 1024;

// Round 1 step: a = b + ((a + F(b,c,d) + x + t) <<< s)
void FF_md5(a_sql_uint32 & a,
            a_sql_uint32 b,
            a_sql_uint32 c,
            a_sql_uint32 d,
            a_sql_uint32 x,
            a_sql_uint32 s,
            a_sql_uint32 t )
{
    a_sql_uint32 temp = (a + F_md5(b,c,d)) + x + t;
    a = b + ROTATE_LEFT(temp,s);
}

// Round 2 step: a = b + ((a + G(b,c,d) + x + t) <<< s)
void GG_md5(a_sql_uint32 & a,
            a_sql_uint32 b,
            a_sql_uint32 c,
            a_sql_uint32 d,
            a_sql_uint32 x,
            a_sql_uint32 s,
            a_sql_uint32 t )
{
    a_sql_uint32 temp = (a + G_md5(b,c,d)) + x + t;
    a = b + ROTATE_LEFT(temp,s);
}

// Round 3 step: a = b + ((a + H(b,c,d) + x + t) <<< s)
void HH_md5(a_sql_uint32 & a,
            a_sql_uint32 b,
            a_sql_uint32 c,
            a_sql_uint32 d,
            a_sql_uint32 x,
            a_sql_uint32 s,
            a_sql_uint32 t )
{
    a_sql_uint32 temp = (a + H_md5(b,c,d)) + x + t;
    a = b + ROTATE_LEFT(temp,s);
}

// Round 4 step: a = b + ((a + I(b,c,d) + x + t) <<< s)
void II_md5(a_sql_uint32 & a,
            a_sql_uint32 b,
            a_sql_uint32 c,
            a_sql_uint32 d,
            a_sql_uint32 x,
            a_sql_uint32 s,
            a_sql_uint32 t )
{
    a_sql_uint32 temp = (a + I_md5(b,c,d)) + x + t;
    a = b + ROTATE_LEFT(temp,s);
}

// Appends the message length to the end of the buffer.
//
//   buffer - padded message buffer
//   argLen - length of the whole message in bytes
//   bufLen - length of the padded buffer in bytes; the last 8 bytes of the
//            buffer receive the message length in bits, least significant
//            byte first
void appendMsgLength( char * buffer,
                      a_sql_uint64 argLen,
                      a_sql_uint64 bufLen )
{
    a_sql_uint64 bitLen = argLen*8; //since argLen is in bytes...
    char * tail = buffer + (bufLen - 8);

    for( a_sql_uint32 k=0;k<8;k++ ) {
        tail[k] = (char)(bitLen & 0xFF);
        bitLen = (bitLen >> 8);
    }
}

// Runs the four MD5 rounds over one 16-word partition x, updating the
// digest words a, b, c and d in place. The caller is responsible for adding
// the pre-round values of a, b, c and d back in afterwards.
void digestInput( a_sql_uint32 * x,
                  a_sql_uint32 & a,
                  a_sql_uint32 & b,
                  a_sql_uint32 & c,
                  a_sql_uint32 & d )
{
    //Round 1
    FF_md5(a,b,c,d,x[0],S11,0xd76aa478);
    FF_md5(d,a,b,c,x[1],S12,0xe8c7b756);
    FF_md5(c,d,a,b,x[2],S13,0x242070db);
    FF_md5(b,c,d,a,x[3],S14,0xc1bdceee);
    FF_md5(a,b,c,d,x[4],S11,0xf57c0faf);
    FF_md5(d,a,b,c,x[5],S12,0x4787c62a);
    FF_md5(c,d,a,b,x[6],S13,0xa8304613);
    FF_md5(b,c,d,a,x[7],S14,0xfd469501);
    FF_md5(a,b,c,d,x[8],S11,0x698098d8);
    FF_md5(d,a,b,c,x[9],S12,0x8b44f7af);
    FF_md5(c,d,a,b,x[10],S13,0xffff5bb1);
    FF_md5(b,c,d,a,x[11],S14,0x895cd7be);
    FF_md5(a,b,c,d,x[12],S11,0x6b901122);
    FF_md5(d,a,b,c,x[13],S12,0xfd987193);
    FF_md5(c,d,a,b,x[14],S13,0xa679438e);
    FF_md5(b,c,d,a,x[15],S14,0x49b40821);

    //Round 2
    GG_md5(a,b,c,d,x[1],S21,0xf61e2562);
    GG_md5(d,a,b,c,x[6],S22,0xc040b340);
    GG_md5(c,d,a,b,x[11],S23,0x265e5a51);
    GG_md5(b,c,d,a,x[0],S24,0xe9b6c7aa);
    GG_md5(a,b,c,d,x[5],S21,0xd62f105d);
    GG_md5(d,a,b,c,x[10],S22,0x2441453);
    GG_md5(c,d,a,b,x[15],S23,0xd8a1e681);
    GG_md5(b,c,d,a,x[4],S24,0xe7d3fbc8);
    GG_md5(a,b,c,d,x[9],S21,0x21e1cde6);
    GG_md5(d,a,b,c,x[14],S22,0xc33707d6);
    GG_md5(c,d,a,b,x[3],S23,0xf4d50d87);
    GG_md5(b,c,d,a,x[8],S24,0x455a14ed);
    GG_md5(a,b,c,d,x[13],S21,0xa9e3e905);
    GG_md5(d,a,b,c,x[2],S22,0xfcefa3f8);
    GG_md5(c,d,a,b,x[7],S23,0x676f02d9);
    GG_md5(b,c,d,a,x[12],S24,0x8d2a4c8a);

    //Round 3
    HH_md5(a,b,c,d,x[5],S31,0xfffa3942);
    HH_md5(d,a,b,c,x[8],S32,0x8771f681);
    HH_md5(c,d,a,b,x[11],S33,0x6d9d6122);
    HH_md5(b,c,d,a,x[14],S34,0xfde5380c);
    HH_md5(a,b,c,d,x[1],S31,0xa4beea44);
    HH_md5(d,a,b,c,x[4],S32,0x4bdecfa9);
    HH_md5(c,d,a,b,x[7],S33,0xf6bb4b60);
    HH_md5(b,c,d,a,x[10],S34,0xbebfbc70);
    HH_md5(a,b,c,d,x[13],S31,0x289b7ec6);
    HH_md5(d,a,b,c,x[0],S32,0xeaa127fa);
    HH_md5(c,d,a,b,x[3],S33,0xd4ef3085);
    HH_md5(b,c,d,a,x[6],S34,0x4881d05);
    HH_md5(a,b,c,d,x[9],S31,0xd9d4d039);
    HH_md5(d,a,b,c,x[12],S32,0xe6db99e5);
    HH_md5(c,d,a,b,x[15],S33,0x1fa27cf8);
    HH_md5(b,c,d,a,x[2],S34,0xc4ac5665);

    //Round 4
    II_md5(a,b,c,d,x[0],S41,0xf4292244);
    II_md5(d,a,b,c,x[7],S42,0x432aff97);
    II_md5(c,d,a,b,x[14],S43,0xab9423a7);
    II_md5(b,c,d,a,x[5],S44,0xfc93a039);
    II_md5(a,b,c,d,x[12],S41,0x655b59c3);
    II_md5(d,a,b,c,x[3],S42,0x8f0ccc92);
    II_md5(c,d,a,b,x[10],S43,0xffeff47d);
    II_md5(b,c,d,a,x[1],S44,0x85845dd1);
    II_md5(a,b,c,d,x[8],S41,0x6fa87e4f);
    II_md5(d,a,b,c,x[15],S42,0xfe2ce6e0);
    II_md5(c,d,a,b,x[6],S43,0xa3014314);
    II_md5(b,c,d,a,x[13],S44,0x4e0811a1);
    II_md5(a,b,c,d,x[4],S41,0xf7537e82);
    II_md5(d,a,b,c,x[11],S42,0xbd3af235);
    II_md5(c,d,a,b,x[2],S43,0x2ad7d2bb);
    II_md5(b,c,d,a,x[9],S44,0xeb86d391);
}

// Packs 64 input bytes into 16 32-bit words, least significant byte first.
//
//   p     - receives the 16 words
//   input - at least 64 readable bytes
void setPartition( a_sql_uint32 * p, char * input )
{
    // Read through unsigned char so bytes >= 0x80 are never sign-extended.
    const unsigned char * bytes = (const unsigned char *)input;

    for( int k=0;k<16;k++ ) {
        const unsigned char * w = bytes + 4*k;
        p[k] = ((a_sql_uint32)w[0])
             | (((a_sql_uint32)w[1]) << 8)
             | (((a_sql_uint32)w[2]) << 16)
             | (((a_sql_uint32)w[3]) << 24);
    }
}

// Helper method to convert the integers to a char buffer
// of hex characters.
//
// Writes exactly 32 lowercase hex characters to res_buff (no terminator):
// A, B, C and D in that order, each emitted least significant byte first,
// each byte as high nibble then low nibble.
void setResult( char * res_buff,
                a_sql_uint32 A,
                a_sql_uint32 B,
                a_sql_uint32 C,
                a_sql_uint32 D )
{
    static const char alpha[16] = { '0','1','2','3','4','5','6','7',
                                    '8','9','a','b','c','d','e','f' };
    const a_sql_uint32 words[4] = { A, B, C, D };
    char * out = res_buff;

    for( int w=0;w<4;w++ ) {
        for( int byteIdx=0;byteIdx<4;byteIdx++ ) {
            a_sql_uint32 byteVal = (words[w] >> (8*byteIdx)) & 0xFF;
            *out++ = alpha[(int)((byteVal >> 4) & 0xF)];
            *out++ = alpha[(int)(byteVal & 0xF)];
        }
    }
}

// Digests len bytes of buffer (len must be a multiple of 64), one 512-bit
// partition at a time, folding the result into A, B, C and D.
static void digestBuffer( char * buffer,
                          a_sql_uint32 len,
                          a_sql_uint32 & A,
                          a_sql_uint32 & B,
                          a_sql_uint32 & C,
                          a_sql_uint32 & D )
{
    a_sql_uint32 partition[16];

    for( a_sql_uint32 off=0;off<len;off+=MD5_PARTITION_LEN ) {
        // Keep copies of the digest variables
        const a_sql_uint32 AA = A;
        const a_sql_uint32 BB = B;
        const a_sql_uint32 CC = C;
        const a_sql_uint32 DD = D;

        setPartition(partition,buffer+off);
        digestInput(partition,A,B,C,D);

        //increment A, B, C and D by their original values
        A = A + AA;
        B = B + BB;
        C = C + CC;
        D = D + DD;
    }
}

// Copies exactly 'want' bytes of argument 1 into buffer.
//
// On entry 'arg' holds the piece that starts at stream position
// 'stream_offset'. After every copy the next piece is requested at the
// position just past the bytes consumed so far, so on return 'arg' holds the
// piece that follows the copied bytes.
//
// Returns false, without filling the buffer completely, if the current piece
// has no data (NULL data pointer or zero length) before 'want' bytes were
// copied. Without this check the loop could never make progress.
static bool fetchBytes( a_v3_extfn_scalar_context * cntxt,
                        void * arg_handle,
                        an_extfn_value & arg,
                        char * buffer,
                        a_sql_uint32 want,
                        a_sql_uint64 stream_offset )
{
    a_sql_uint32 fetched = 0;

    while( fetched < want ) {
        if( arg.data == NULL || arg.piece_len == 0 ) {
            return false;
        }

        // Never copy more than the room left in the buffer, even if the
        // piece handed to us is larger.
        a_sql_uint32 room = want - fetched;
        a_sql_uint32 take = room;
        if( arg.piece_len <= room ) {
            take = arg.piece_len;
        }

        memcpy( buffer + fetched, (char*)arg.data, take );
        fetched += take;

        (void) cntxt->get_piece( arg_handle, 1, &arg, stream_offset + fetched );
    }
    return true;
}

#if defined __cplusplus
extern "C" {
#endif

// Evaluation callback for my_md5.
//
// Streams argument 1 in blocks of MD5_STREAM_BLOCK_LEN bytes, digests each
// block, pads and digests the final block, and sets the result to the 32
// hex characters of the MD5 digest.
//
// No result is set when the argument's data pointer is NULL or when the
// stream runs out of data before the announced total length was read.
//
// NOTE(review): the documentation accompanying this sample states a limit of
// 4GB of input data; larger values are not guarded against here.
static void my_md5_evaluate(a_v3_extfn_scalar_context *cntxt,
                            void *arg_handle)
{
    an_extfn_value arg;
    an_extfn_value outval;

    // Get first argument
    (void) cntxt->get_value( arg_handle, 1, &arg );
    if( arg.data == NULL ) {
        return;
    }

    // MD5 Algorithm Initialization
    // Init A, B, C and D digest variables
    a_sql_uint32 A = 0x67452301;
    a_sql_uint32 B = 0xEFCDAB89;
    a_sql_uint32 C = 0x98BADCFE;
    a_sql_uint32 D = 0x10325476;

    a_sql_uint64 total_len = 0;                     // Length digested so far
    a_sql_uint64 remain_len = arg.len.total_len;    // Length still to be streamed

    // One buffer serves every block. The padded final block holds at most
    // MD5_STREAM_BLOCK_LEN bytes of data and is rounded up by at most one
    // extra partition, so MD5_STREAM_BLOCK_LEN + 64 bytes always suffice.
    char * buffer = new char[MD5_STREAM_BLOCK_LEN + MD5_PARTITION_LEN];

    // Stream in full blocks until the remaining size is smaller than or
    // equal to one block, digesting each block as it arrives.
    while( remain_len > MD5_STREAM_BLOCK_LEN ) {
        if( !fetchBytes( cntxt, arg_handle, arg, buffer,
                         MD5_STREAM_BLOCK_LEN, total_len ) ) {
            delete[] buffer;
            return;
        }
        digestBuffer( buffer, MD5_STREAM_BLOCK_LEN, A, B, C, D );

        total_len += MD5_STREAM_BLOCK_LEN;
        remain_len -= MD5_STREAM_BLOCK_LEN;
    }

    // Last block: round its size up to the next multiple of 64 bytes.
    a_sql_uint32 tail_len = (a_sql_uint32)remain_len;
    a_sql_uint32 padded_len = tail_len
                            + (MD5_PARTITION_LEN - (tail_len % MD5_PARTITION_LEN));
    if( (padded_len - tail_len) < 9 ) {
        // Not enough room for the 0x80 marker byte (a 1 bit and 7 0's)
        // plus the 64-bit message length: add one more partition.
        padded_len += MD5_PARTITION_LEN;
    }

    if( !fetchBytes( cntxt, arg_handle, arg, buffer, tail_len, total_len ) ) {
        delete[] buffer;
        return;
    }

    //append a 1 bit with 7 0's after the data
    buffer[tail_len] = (char)(0x80);

    // Pad with 0s and leave 8 bytes at the end for the message length
    memset( buffer + tail_len + 1, 0, padded_len - tail_len - 9 );

    // Append the length of the entire input as a 64-bit integer
    appendMsgLength( buffer, total_len + tail_len, padded_len );

    digestBuffer( buffer, padded_len, A, B, C, D );
    delete[] buffer;

    // Set the result value: 32 hex characters, delivered in a single piece
    // (append flag 0 starts a new value).
    char res_buff[32];
    setResult( res_buff,A,B,C,D );

    outval.type = DT_VARCHAR;
    outval.piece_len = 32;
    outval.data = res_buff;
    cntxt->set_value( arg_handle, &outval, 0 );
}

static a_v3_extfn_scalar my_md5_descriptor = {
    0,
    0,
    &my_md5_evaluate,
    0,      // Reserved - initialize to NULL
    0,      // Reserved - initialize to NULL
    0,      // Reserved - initialize to NULL
    0,      // Reserved - initialize to NULL
    0,      // Reserved - initialize to NULL
    NULL    // _for_server_internal_use
};

// Entry point named in the CREATE FUNCTION statement; returns the descriptor
// for the my_md5 scalar UDF.
a_v3_extfn_scalar *my_md5()
{
    return &my_md5_descriptor;
}

#if defined __cplusplus
}
#endif