Use the my_md5 function to calculate the MD5 hash value of an input file.
my_md5.cxx is a simple deterministic scalar user-defined function that calculates the MD5 hash value of an input file (a LONG BINARY argument). You can use the sample my_md5.cxx to process up to 4GB of input data.
The my_md5 function uses the get_piece() API to stream data in pieces. The streaming approach allows the UDF to allocate chunks of memory that are filled in each time the get_piece() API is called. By allocating pieces or chunks, the UDF does not have to allocate enough storage to hold the entire data of the column value. For example, if a UDF wants to process a column value with a size of 400MB, then without streaming it needs to allocate a memory block of 400MB in order to hold the value. However, by using the get_piece() API, the UDF is able to process the data value in blocks of 4MB.
#include "extfnapiv3.h"
#include <stdlib.h>
#include <string.h>
#include <math.h>
// A simple deterministic scalar UDF that calculates
// the MD5 hash value of an input file (a LOB binary argument)
//
// CREATE FUNCTION my_md5(IN arg1 LONG BINARY)
// RETURNS VARCHAR(32)
// DETERMINISTIC
// IGNORE NULL VALUES
// EXTERNAL NAME 'my_md5@libudfex'
// MD5 auxiliary functions F, G, H and I. Each takes three 32-bit words and
// produces one 32-bit word, combining the inputs bit by bit.
// Every use of a macro argument is parenthesized (including the operand of
// '~'), so an expression argument such as (x | y) is negated as a whole
// instead of having '~' bind only to its first operand.
#define F_md5(X,Y,Z) (((X) & (Y)) | ((~(X)) & (Z)))
#define G_md5(X,Y,Z) (((X) & (Z)) | ((Y) & (~(Z))))
#define H_md5(X,Y,Z) ((X) ^ (Y) ^ (Z))
#define I_md5(X,Y,Z) ((Y) ^ ((X) | (~(Z))))
// Left-rotation amounts (in bits) passed as the 's' argument of the round
// helpers in digestInput. The first digit of each name is the round it is
// used in, the second digit its position in that round's repeating
// four-step cycle.
// Round 1 (FF_md5)
#define S11 7
#define S12 12
#define S13 17
#define S14 22
// Round 2 (GG_md5)
#define S21 5
#define S22 9
#define S23 14
#define S24 20
// Round 3 (HH_md5)
#define S31 4
#define S32 11
#define S33 16
#define S34 23
// Round 4 (II_md5)
#define S41 6
#define S42 10
#define S43 15
#define S44 21
/* ROTATE_LEFT rotates the 32-bit unsigned value x left by n bits.
 * Both shift counts are reduced modulo 32, so n == 0 (or any multiple of 32)
 * yields x unchanged instead of shifting by the full word width, which is
 * undefined behavior. For n in 1..31 the result is the plain rotate. */
#define ROTATE_LEFT(x, n) (((x) << ((n) & 31)) | ((x) >> ((32 - (n)) & 31)))
// Round 1 step: adds F_md5(b,c,d), the message word x and the constant t
// to a, rotates that sum left by s bits, adds b, and stores the result
// back into a.
void FF_md5(a_sql_uint32 & a, a_sql_uint32 b, a_sql_uint32 c, a_sql_uint32 d, a_sql_uint32 x,
a_sql_uint32 s,a_sql_uint32 t ) {
    a_sql_uint32 sum = a;
    sum += F_md5(b,c,d);
    sum += x;
    sum += t;
    a = ROTATE_LEFT(sum,s) + b;
}
// Round 2 step: adds G_md5(b,c,d), the message word x and the constant t
// to a, rotates that sum left by s bits, adds b, and stores the result
// back into a.
void GG_md5(a_sql_uint32 & a, a_sql_uint32 b, a_sql_uint32 c, a_sql_uint32 d, a_sql_uint32 x,
a_sql_uint32 s,a_sql_uint32 t ) {
    a_sql_uint32 sum = a;
    sum += G_md5(b,c,d);
    sum += x;
    sum += t;
    a = ROTATE_LEFT(sum,s) + b;
}
// Round 3 step: adds H_md5(b,c,d), the message word x and the constant t
// to a, rotates that sum left by s bits, adds b, and stores the result
// back into a.
void HH_md5(a_sql_uint32 & a, a_sql_uint32 b, a_sql_uint32 c, a_sql_uint32 d, a_sql_uint32 x,
a_sql_uint32 s,a_sql_uint32 t ) {
    a_sql_uint32 sum = a;
    sum += H_md5(b,c,d);
    sum += x;
    sum += t;
    a = ROTATE_LEFT(sum,s) + b;
}
// Round 4 step: adds I_md5(b,c,d), the message word x and the constant t
// to a, rotates that sum left by s bits, adds b, and stores the result
// back into a.
void II_md5(a_sql_uint32 & a, a_sql_uint32 b, a_sql_uint32 c, a_sql_uint32 d, a_sql_uint32 x,
a_sql_uint32 s,a_sql_uint32 t ) {
    a_sql_uint32 sum = a;
    sum += I_md5(b,c,d);
    sum += x;
    sum += t;
    a = ROTATE_LEFT(sum,s) + b;
}
//Appends the message length to the end of the buffer
void appendMsgLength( char * buffer, a_sql_uint64 argLen, a_sql_uint64 bufLen ) {
a_sql_uint32 i;
a_sql_uint64 bitLen = argLen*8; //since argLen is in bytes...
for( i=8;i>=1;i-- ) {
buffer[bufLen-i] = ((char)(bitLen & 0xFF));
bitLen = (bitLen >> 8);
}
}
// Applies the 64 MD5 steps (four rounds of 16) for one 64-byte block.
//   x          - the block as 16 32-bit words (x[0]..x[15] are read)
//   a, b, c, d - digest words, updated in place
// Each step updates one of the four words; the argument order rotates
// (a,b,c,d) -> (d,a,b,c) -> (c,d,a,b) -> (b,c,d,a) so the updated word
// cycles a, d, c, b. Each call passes one message word, one S rotation
// amount and one per-step additive constant.
// This function does not add the incoming values of a, b, c, d back into
// the result; the caller does that after each block.
void digestInput( a_sql_uint32 * x, a_sql_uint32 & a, a_sql_uint32 & b,
a_sql_uint32 & c, a_sql_uint32 & d ) {
//Round 1: FF_md5 steps, message words taken in order x[0]..x[15]
FF_md5(a,b,c,d,x[0],S11,0xd76aa478);
FF_md5(d,a,b,c,x[1],S12,0xe8c7b756);
FF_md5(c,d,a,b,x[2],S13,0x242070db);
FF_md5(b,c,d,a,x[3],S14,0xc1bdceee);
FF_md5(a,b,c,d,x[4],S11,0xf57c0faf);
FF_md5(d,a,b,c,x[5],S12,0x4787c62a);
FF_md5(c,d,a,b,x[6],S13,0xa8304613);
FF_md5(b,c,d,a,x[7],S14,0xfd469501);
FF_md5(a,b,c,d,x[8],S11,0x698098d8);
FF_md5(d,a,b,c,x[9],S12,0x8b44f7af);
FF_md5(c,d,a,b,x[10],S13,0xffff5bb1);
FF_md5(b,c,d,a,x[11],S14,0x895cd7be);
FF_md5(a,b,c,d,x[12],S11,0x6b901122);
FF_md5(d,a,b,c,x[13],S12,0xfd987193);
FF_md5(c,d,a,b,x[14],S13,0xa679438e);
FF_md5(b,c,d,a,x[15],S14,0x49b40821);
//Round 2: GG_md5 steps, message word index starts at 1 and advances by 5 (mod 16)
GG_md5(a,b,c,d,x[1],S21,0xf61e2562);
GG_md5(d,a,b,c,x[6],S22,0xc040b340);
GG_md5(c,d,a,b,x[11],S23,0x265e5a51);
GG_md5(b,c,d,a,x[0],S24,0xe9b6c7aa);
GG_md5(a,b,c,d,x[5],S21,0xd62f105d);
GG_md5(d,a,b,c,x[10],S22,0x2441453);
GG_md5(c,d,a,b,x[15],S23,0xd8a1e681);
GG_md5(b,c,d,a,x[4],S24,0xe7d3fbc8);
GG_md5(a,b,c,d,x[9],S21,0x21e1cde6);
GG_md5(d,a,b,c,x[14],S22,0xc33707d6);
GG_md5(c,d,a,b,x[3],S23,0xf4d50d87);
GG_md5(b,c,d,a,x[8],S24,0x455a14ed);
GG_md5(a,b,c,d,x[13],S21,0xa9e3e905);
GG_md5(d,a,b,c,x[2],S22,0xfcefa3f8);
GG_md5(c,d,a,b,x[7],S23,0x676f02d9);
GG_md5(b,c,d,a,x[12],S24,0x8d2a4c8a);
//Round 3: HH_md5 steps, message word index starts at 5 and advances by 3 (mod 16)
HH_md5(a,b,c,d,x[5],S31,0xfffa3942);
HH_md5(d,a,b,c,x[8],S32,0x8771f681);
HH_md5(c,d,a,b,x[11],S33,0x6d9d6122);
HH_md5(b,c,d,a,x[14],S34,0xfde5380c);
HH_md5(a,b,c,d,x[1],S31,0xa4beea44);
HH_md5(d,a,b,c,x[4],S32,0x4bdecfa9);
HH_md5(c,d,a,b,x[7],S33,0xf6bb4b60);
HH_md5(b,c,d,a,x[10],S34,0xbebfbc70);
HH_md5(a,b,c,d,x[13],S31,0x289b7ec6);
HH_md5(d,a,b,c,x[0],S32,0xeaa127fa);
HH_md5(c,d,a,b,x[3],S33,0xd4ef3085);
HH_md5(b,c,d,a,x[6],S34,0x4881d05);
HH_md5(a,b,c,d,x[9],S31,0xd9d4d039);
HH_md5(d,a,b,c,x[12],S32,0xe6db99e5);
HH_md5(c,d,a,b,x[15],S33,0x1fa27cf8);
HH_md5(b,c,d,a,x[2],S34,0xc4ac5665);
//Round 4: II_md5 steps, message word index starts at 0 and advances by 7 (mod 16)
II_md5(a,b,c,d,x[0],S41,0xf4292244);
II_md5(d,a,b,c,x[7],S42,0x432aff97);
II_md5(c,d,a,b,x[14],S43,0xab9423a7);
II_md5(b,c,d,a,x[5],S44,0xfc93a039);
II_md5(a,b,c,d,x[12],S41,0x655b59c3);
II_md5(d,a,b,c,x[3],S42,0x8f0ccc92);
II_md5(c,d,a,b,x[10],S43,0xffeff47d);
II_md5(b,c,d,a,x[1],S44,0x85845dd1);
II_md5(a,b,c,d,x[8],S41,0x6fa87e4f);
II_md5(d,a,b,c,x[15],S42,0xfe2ce6e0);
II_md5(c,d,a,b,x[6],S43,0xa3014314);
II_md5(b,c,d,a,x[13],S44,0x4e0811a1);
II_md5(a,b,c,d,x[4],S41,0xf7537e82);
II_md5(d,a,b,c,x[11],S42,0xbd3af235);
II_md5(c,d,a,b,x[2],S43,0x2ad7d2bb);
II_md5(b,c,d,a,x[9],S44,0xeb86d391);
}
// Packs 64 input bytes into 16 32-bit words. Within each group of four
// bytes the first byte becomes the least significant byte of the word.
//   p     - receives the 16 words (p[0]..p[15] are written)
//   input - 64 bytes to pack
// Each byte is masked with 0xFF after conversion so that a negative char
// value contributes only its low 8 bits.
void setPartition( a_sql_uint32 * p, char * input ) {
    for( int word = 0; word < 16; word++ ) {
        const char * src = input + 4 * word;
        a_sql_uint32 b0 = ((a_sql_uint32)src[0]) & 0xFF;
        a_sql_uint32 b1 = ((a_sql_uint32)src[1]) & 0xFF;
        a_sql_uint32 b2 = ((a_sql_uint32)src[2]) & 0xFF;
        a_sql_uint32 b3 = ((a_sql_uint32)src[3]) & 0xFF;
        p[word] = b0 | (b1 << 8) | (b2 << 16) | (b3 << 24);
    }
}
//Helper method to convert the integers to a char buffer
//of hex characters
void setResult( char * res_buff, a_sql_uint32 A, a_sql_uint32 B,
a_sql_uint32 C, a_sql_uint32 D ) {
char alpha[16] = { '0','1','2',
'3','4','5',
'6','7','8',
'9','a','b',
'c','d','e',
'f' };
//Put char representation for A into the buffer
res_buff[6] = alpha[(int)((A >> 28) & 0xF)];
res_buff[7] = alpha[(int)((A >> 24) & 0xF)];
res_buff[4] = alpha[(int)((A >> 20) & 0xF)];
res_buff[5] = alpha[(int)((A >> 16) & 0xF)];
res_buff[2] = alpha[(int)((A >> 12) & 0xF)];
res_buff[3] = alpha[(int)((A >> 8) & 0xF)];
res_buff[0] = alpha[(int)((A >> 4) & 0xF)];
res_buff[1] = alpha[(int)((A) & 0xF)];
//Put char representation for B into the buffer
res_buff[14] = alpha[(int)((B >> 28) & 0xF)];
res_buff[15] = alpha[(int)((B >> 24) & 0xF)];
res_buff[12] = alpha[(int)((B >> 20) & 0xF)];
res_buff[13] = alpha[(int)((B >> 16) & 0xF)];
res_buff[10] = alpha[(int)((B >> 12) & 0xF)];
res_buff[11] = alpha[(int)((B >> 8) & 0xF)];
res_buff[8] = alpha[(int)((B >> 4) & 0xF)];
res_buff[9] = alpha[(int)((B) & 0xF)];
//Put char representation for C into the buffer
res_buff[22] = alpha[(int)((C >> 28) & 0xF)];
res_buff[23] = alpha[(int)((C >> 24) & 0xF)];
res_buff[20] = alpha[(int)((C >> 20) & 0xF)];
res_buff[21] = alpha[(int)((C >> 16) & 0xF)];
res_buff[18] = alpha[(int)((C >> 12) & 0xF)];
res_buff[19] = alpha[(int)((C >> 8) & 0xF)];
res_buff[16] = alpha[(int)((C >> 4) & 0xF)];
res_buff[17] = alpha[(int)((C) & 0xF)];
//Put char representation for D into the buffer
res_buff[30] = alpha[(int)((D >> 28) & 0xF)];
res_buff[31] = alpha[(int)((D >> 24) & 0xF)];
res_buff[28] = alpha[(int)((D >> 20) & 0xF)];
res_buff[29] = alpha[(int)((D >> 16) & 0xF)];
res_buff[26] = alpha[(int)((D >> 12) & 0xF)];
res_buff[27] = alpha[(int)((D >> 8) & 0xF)];
res_buff[24] = alpha[(int)((D >> 4) & 0xF)];
res_buff[25] = alpha[(int)((D) & 0xF)];
}
#if defined __cplusplus
extern "C" {
#endif
static void my_md5_evaluate(a_v3_extfn_scalar_context *cntxt,
void *arg_handle)
{
an_extfn_value arg;
an_extfn_value outval;
a_sql_int64 total_len;
// Get first argument
a_sql_uint32 fetchedLength = 0;
(void) cntxt->get_value( arg_handle, 1, &arg );
if (arg.data == NULL)
{
return;
}
// MD5 Algorithm Initialization
// Init A, B, C and D digest variables
a_sql_uint32 A = 0x67452301;
a_sql_uint32 B = 0xEFCDAB89;
a_sql_uint32 C = 0x98BADCFE;
a_sql_uint32 D = 0x10325476;
// Make copies of digest variables
a_sql_uint32 AA;
a_sql_uint32 BB;
a_sql_uint32 CC;
a_sql_uint32 DD;
// Try to stream with every 1MB and calculate MD5 for each block
a_sql_int32 block_len = 1024*1024;
total_len = 0; // Total length calculated
char* buffer = new char[block_len]; // Buffer to store streaming data
a_sql_int64 remain_len = arg.len.total_len - total_len; // Length that need to be streamed and calculated
a_sql_int32 bufLen = block_len; // Size of data that need to be streamed in
a_sql_uint32 * partition = new a_sql_uint32[16];
a_sql_uint64 i;
// Stream in data buffer of size 1MB until remain size is smaller or equal to 1MB
// Calculate MD5 for each block
while (remain_len > block_len) {
// Starting fetch data
fetchedLength = 0;
while (fetchedLength < bufLen) {
if (arg.piece_len <= (bufLen - fetchedLength)) {
// When buffer has enough space for next piece
memcpy(buffer + fetchedLength, (char*)arg.data, arg.piece_len);
fetchedLength += arg.piece_len;
} else {
// When only part of next piece fits in the buffer
i = bufLen - fetchedLength;
memcpy(buffer + fetchedLength, (char*)arg.data, i);
fetchedLength += i;
}
(void) cntxt->get_piece(arg_handle, 1, &arg, total_len + fetchedLength);
}
// Buffer is now filled, try to calculate
// Start the algorithm with partitions of 512 bits (16 32-bit words)
for( i=0;i<block_len;i+=64) {
AA = A;
BB = B;
CC = C;
DD = D;
setPartition(partition,buffer+i);
digestInput(partition,A,B,C,D);
//increment A, B, C and D by their original values
A = A + AA;
B = B + BB;
C = C + CC;
D = D + DD;
}
// increase total_len decrease remain_len
total_len += fetchedLength;
remain_len -= fetchedLength;
}
// Stream and calculate MD5 for the last block
// Calculate size of the last block
// Round up so size is dividable by 64 bytes
block_len = (64 - (remain_len % 64)) + remain_len;
if ((block_len - remain_len) < 9) {
// If the remaining room is not enough for appending 1 byte
// of a 1 bit and 7 0's, and a 64 bit long message
block_len += 64;
}
// Since the last block probably has a different size than 1MB,
// delete the old buffer and allocate a new one to store the data
delete[] buffer;
buffer = new char[block_len];
bufLen = remain_len;
// Starting fetch data
fetchedLength = 0;
while (fetchedLength < bufLen) {
memcpy(buffer + fetchedLength, (char*)arg.data, arg.piece_len);
fetchedLength += arg.piece_len;
(void) cntxt->get_piece(arg_handle, 1, &arg, total_len + fetchedLength);
}
//append a 1 bit with 7 0's after
buffer[bufLen] = (char)(0x80);
// Pad with 0s and leave 8 bytes at the end for the message length
for( i = bufLen+1;i<(block_len-8);i++ ) {
buffer[i] = 0;
}
// Append message length as a 64-bit integer
// Message append at the end of buffer, indicating the length of entire file
appendMsgLength( buffer, (bufLen+total_len) , block_len);
// Buffer is now filled, try to calculate
// Start the algorithm with partitions of 512 bits (16 32-bit words)
for( i=0;i<block_len;i+=64) {
AA = A;
BB = B;
CC = C;
DD = D;
setPartition(partition,buffer+i);
digestInput(partition,A,B,C,D);
//increment A, B, C and D by their original values
A = A + AA;
B = B + BB;
C = C + CC;
D = D + DD;
}
// Set the result value
char * res_buff = new char[32];
setResult( res_buff,A,B,C,D );
a_sql_int32 idx;
for( idx=0;idx<32;idx++ ) {
outval.type = DT_VARCHAR;
outval.piece_len = 1;
outval.data = &(res_buff[idx]);
cntxt->set_value( arg_handle, &outval, idx );
}
delete[] buffer;
delete[] partition;
delete[] res_buff;
}
// Descriptor for the my_md5 UDF; its address is returned by my_md5().
// Only the third member is populated (with the evaluate callback); every
// other member is zero/NULL.
// NOTE(review): the first two members are presumably optional start/finish
// callbacks -- confirm against extfnapiv3.h.
static a_v3_extfn_scalar my_md5_descriptor = {
0,
0,
&my_md5_evaluate, // Called to compute the function result
0, // Reserved - initialize to NULL
0, // Reserved - initialize to NULL
0, // Reserved - initialize to NULL
0, // Reserved - initialize to NULL
0, // Reserved - initialize to NULL
NULL // _for_server_internal_use
};
// Entry point named in the EXTERNAL NAME clause ('my_md5@libudfex').
// Returns the address of the static descriptor for this UDF.
a_v3_extfn_scalar *my_md5()
{
    a_v3_extfn_scalar *descriptor = &my_md5_descriptor;
    return descriptor;
}
#if defined __cplusplus
}
#endif