UDF Example: my_md5 Definition

Use the my_md5 function to calculate the MD5 hash value of an input file.

my_md5 definition

my_md5.cxx is a simple deterministic scalar user-defined function that calculates the MD5 hash value of an input file (a LONG BINARY argument). You can use the sample my_md5.cxx to process up to 4GB of input data.

Note: Large object data support requires a separately licensed Sybase IQ option.

The my_md5 function uses the get_piece() API to stream data in pieces. The streaming approach allows the UDF to allocate chunks of memory that are filled in each time the get_piece() API is called. By allocating pieces or chunks, the UDF does not have to allocate enough storage to hold the entire data of the column value. For example, if a UDF wants to process a column value with a size of 400MB without streaming, then it needs to allocate a memory block of 400MB in order to hold the value. However, by using the get_piece() API, the UDF is able to process the data value in much smaller blocks, for example blocks of 4MB (the sample below streams the data in 1MB blocks).

#include "extfnapiv3.h"
#include <stdlib.h>
#include <string.h>
#include <math.h>


//  A simple deterministic scalar UDF that calculates
//  the MD5 hash value of an input file (a LOB binary argument)
//
//	CREATE FUNCTION my_md5(IN arg1 LONG BINARY) 
//			RETURNS VARCHAR(32)
//			DETERMINISTIC
//                      IGNORE NULL VALUES
//			EXTERNAL NAME 'my_md5@libudfex'


//MD5 F, G, H and I auxiliary functions: bitwise mixes of three words.
//Every use of an argument is parenthesized, including under the ~
//operator, so expression arguments such as F_md5(a|b, c, d) expand
//correctly. F_md5 expands X twice and G_md5 expands Z twice, so do not
//pass arguments with side effects.
#define F_md5(X,Y,Z) (((X) & (Y)) | ((~(X)) & (Z)))
#define G_md5(X,Y,Z) (((X) & (Z)) | ((Y) & (~(Z))))
#define H_md5(X,Y,Z) ((X) ^ (Y) ^ (Z))
#define I_md5(X,Y,Z) ((Y) ^ ((X) | (~(Z))))

//Per-step left-rotation amounts. Sxy is the y-th of the four shift
//values that repeat through round x (x = 1..4).
enum {
    S11 = 7,  S12 = 12, S13 = 17, S14 = 22,   // round 1 (FF_md5)
    S21 = 5,  S22 = 9,  S23 = 14, S24 = 20,   // round 2 (GG_md5)
    S31 = 4,  S32 = 11, S33 = 16, S34 = 23,   // round 3 (HH_md5)
    S41 = 6,  S42 = 10, S43 = 15, S44 = 21    // round 4 (II_md5)
};

/* ROTATE_LEFT rotates the 32-bit unsigned value x left by n bits.
 * Both shift counts are reduced modulo 32, so n == 0 (or 32) yields x
 * unchanged instead of shifting by the full width of the type, which is
 * undefined behaviour. For n in 1..31 the result is the same as a plain
 * (x << n) | (x >> (32 - n)). x is expanded twice: no side effects. */
#define ROTATE_LEFT(x, n) (((x) << ((n) & 31)) | ((x) >> ((32-(n)) & 31)))

// One round-1 step: mixes b, c, d with F_md5, adds the message word x
// and the step constant t to a, rotates the sum left by s bits, adds b,
// and stores the result back into a (passed by reference).
void FF_md5(a_sql_uint32 & a, a_sql_uint32 b, a_sql_uint32 c, a_sql_uint32 d, a_sql_uint32 x,
            a_sql_uint32 s,a_sql_uint32 t ) {
    const a_sql_uint32 mixed = F_md5(b,c,d);
    const a_sql_uint32 sum = a + mixed + x + t;
    a = ROTATE_LEFT(sum,s) + b;
}

// One round-2 step: mixes b, c, d with G_md5, adds the message word x
// and the step constant t to a, rotates the sum left by s bits, adds b,
// and stores the result back into a (passed by reference).
void GG_md5(a_sql_uint32 & a, a_sql_uint32 b, a_sql_uint32 c, a_sql_uint32 d, a_sql_uint32 x,
            a_sql_uint32 s,a_sql_uint32 t ) {
    const a_sql_uint32 mixed = G_md5(b,c,d);
    const a_sql_uint32 sum = a + mixed + x + t;
    a = ROTATE_LEFT(sum,s) + b;
}

// One round-3 step: mixes b, c, d with H_md5, adds the message word x
// and the step constant t to a, rotates the sum left by s bits, adds b,
// and stores the result back into a (passed by reference).
void HH_md5(a_sql_uint32 & a, a_sql_uint32 b, a_sql_uint32 c, a_sql_uint32 d, a_sql_uint32 x,
            a_sql_uint32 s,a_sql_uint32 t ) {
    const a_sql_uint32 mixed = H_md5(b,c,d);
    const a_sql_uint32 sum = a + mixed + x + t;
    a = ROTATE_LEFT(sum,s) + b;
}

// One round-4 step: mixes b, c, d with I_md5, adds the message word x
// and the step constant t to a, rotates the sum left by s bits, adds b,
// and stores the result back into a (passed by reference).
void II_md5(a_sql_uint32 & a, a_sql_uint32 b, a_sql_uint32 c, a_sql_uint32 d, a_sql_uint32 x,
            a_sql_uint32 s,a_sql_uint32 t ) {
    const a_sql_uint32 mixed = I_md5(b,c,d);
    const a_sql_uint32 sum = a + mixed + x + t;
    a = ROTATE_LEFT(sum,s) + b;
}

//Appends the message length to the end of the buffer
void appendMsgLength( char * buffer, a_sql_uint64 argLen, a_sql_uint64 bufLen ) {
    a_sql_uint32 i;
    a_sql_uint64 bitLen = argLen*8; //since argLen is in bytes...
    for( i=8;i>=1;i-- ) {
        buffer[bufLen-i] = ((char)(bitLen & 0xFF));
        bitLen = (bitLen >> 8);
    }
}

// Runs the 64 MD5 steps (four rounds of sixteen) over one partition.
//   x          - the 16 message words of the partition (x[0]..x[15] are read)
//   a, b, c, d - digest state, passed by reference and updated in place
// Each step rewrites one of the four state words; the role of the words
// rotates (a, d, c, b) from one step to the next. The hexadecimal
// literals are the per-step additive constants and match the table in
// RFC 1321. This function does not add the state values from before the
// partition back in; the caller does that after it returns.
void digestInput( a_sql_uint32 * x, a_sql_uint32 & a, a_sql_uint32 & b,
                 a_sql_uint32 & c, a_sql_uint32 & d ) {
    
    //Round 1: FF_md5 steps, message words taken in order x[0]..x[15]
    FF_md5(a,b,c,d,x[0],S11,0xd76aa478);
    FF_md5(d,a,b,c,x[1],S12,0xe8c7b756);
    FF_md5(c,d,a,b,x[2],S13,0x242070db);
    FF_md5(b,c,d,a,x[3],S14,0xc1bdceee);

    FF_md5(a,b,c,d,x[4],S11,0xf57c0faf);
    FF_md5(d,a,b,c,x[5],S12,0x4787c62a);
    FF_md5(c,d,a,b,x[6],S13,0xa8304613);
    FF_md5(b,c,d,a,x[7],S14,0xfd469501);

    FF_md5(a,b,c,d,x[8],S11,0x698098d8);
    FF_md5(d,a,b,c,x[9],S12,0x8b44f7af);
    FF_md5(c,d,a,b,x[10],S13,0xffff5bb1);
    FF_md5(b,c,d,a,x[11],S14,0x895cd7be);

    FF_md5(a,b,c,d,x[12],S11,0x6b901122);
    FF_md5(d,a,b,c,x[13],S12,0xfd987193);
    FF_md5(c,d,a,b,x[14],S13,0xa679438e);
    FF_md5(b,c,d,a,x[15],S14,0x49b40821);

    //Round 2: GG_md5 steps, word index starts at 1 and advances by 5 (mod 16)
    GG_md5(a,b,c,d,x[1],S21,0xf61e2562);
    GG_md5(d,a,b,c,x[6],S22,0xc040b340);
    GG_md5(c,d,a,b,x[11],S23,0x265e5a51);
    GG_md5(b,c,d,a,x[0],S24,0xe9b6c7aa);

    GG_md5(a,b,c,d,x[5],S21,0xd62f105d);
    GG_md5(d,a,b,c,x[10],S22,0x2441453);
    GG_md5(c,d,a,b,x[15],S23,0xd8a1e681);
    GG_md5(b,c,d,a,x[4],S24,0xe7d3fbc8);

    GG_md5(a,b,c,d,x[9],S21,0x21e1cde6);
    GG_md5(d,a,b,c,x[14],S22,0xc33707d6);
    GG_md5(c,d,a,b,x[3],S23,0xf4d50d87);
    GG_md5(b,c,d,a,x[8],S24,0x455a14ed);

    GG_md5(a,b,c,d,x[13],S21,0xa9e3e905);
    GG_md5(d,a,b,c,x[2],S22,0xfcefa3f8);
    GG_md5(c,d,a,b,x[7],S23,0x676f02d9);
    GG_md5(b,c,d,a,x[12],S24,0x8d2a4c8a);

    //Round 3: HH_md5 steps, word index starts at 5 and advances by 3 (mod 16)
    HH_md5(a,b,c,d,x[5],S31,0xfffa3942);
    HH_md5(d,a,b,c,x[8],S32,0x8771f681);
    HH_md5(c,d,a,b,x[11],S33,0x6d9d6122);
    HH_md5(b,c,d,a,x[14],S34,0xfde5380c);

    HH_md5(a,b,c,d,x[1],S31,0xa4beea44);
    HH_md5(d,a,b,c,x[4],S32,0x4bdecfa9);
    HH_md5(c,d,a,b,x[7],S33,0xf6bb4b60);
    HH_md5(b,c,d,a,x[10],S34,0xbebfbc70);

    HH_md5(a,b,c,d,x[13],S31,0x289b7ec6);
    HH_md5(d,a,b,c,x[0],S32,0xeaa127fa);
    HH_md5(c,d,a,b,x[3],S33,0xd4ef3085);
    HH_md5(b,c,d,a,x[6],S34,0x4881d05);

    HH_md5(a,b,c,d,x[9],S31,0xd9d4d039);
    HH_md5(d,a,b,c,x[12],S32,0xe6db99e5);
    HH_md5(c,d,a,b,x[15],S33,0x1fa27cf8);
    HH_md5(b,c,d,a,x[2],S34,0xc4ac5665);

    //Round 4: II_md5 steps, word index starts at 0 and advances by 7 (mod 16)
    II_md5(a,b,c,d,x[0],S41,0xf4292244);
    II_md5(d,a,b,c,x[7],S42,0x432aff97);
    II_md5(c,d,a,b,x[14],S43,0xab9423a7);
    II_md5(b,c,d,a,x[5],S44,0xfc93a039);

    II_md5(a,b,c,d,x[12],S41,0x655b59c3);
    II_md5(d,a,b,c,x[3],S42,0x8f0ccc92);
    II_md5(c,d,a,b,x[10],S43,0xffeff47d);
    II_md5(b,c,d,a,x[1],S44,0x85845dd1);

    II_md5(a,b,c,d,x[8],S41,0x6fa87e4f);
    II_md5(d,a,b,c,x[15],S42,0xfe2ce6e0);
    II_md5(c,d,a,b,x[6],S43,0xa3014314);
    II_md5(b,c,d,a,x[13],S44,0x4e0811a1);

    II_md5(a,b,c,d,x[4],S41,0xf7537e82);
    II_md5(d,a,b,c,x[11],S42,0xbd3af235);
    II_md5(c,d,a,b,x[2],S43,0x2ad7d2bb);
    II_md5(b,c,d,a,x[9],S44,0xeb86d391);
}

// Decodes 64 input bytes into 16 32-bit words.
//   p     - receives the 16 words (p[0]..p[15] are written)
//   input - 64 bytes to decode; each group of four bytes becomes one
//           word with the first byte in the least significant position
// Every byte is masked with 0xFF after widening so that a signed char
// cannot leak sign-extension bits into the word.
void setPartition( a_sql_uint32 * p, char * input ) {
    for( int word = 0; word < 16; word++ ) {
        const char * src = input + 4*word;
        const a_sql_uint32 b0 = ((a_sql_uint32)src[0]) & 0xFF;
        const a_sql_uint32 b1 = ((a_sql_uint32)src[1]) & 0xFF;
        const a_sql_uint32 b2 = ((a_sql_uint32)src[2]) & 0xFF;
        const a_sql_uint32 b3 = ((a_sql_uint32)src[3]) & 0xFF;
        p[word] = b0 | (b1 << 8) | (b2 << 16) | (b3 << 24);
    }
}

//Helper method to convert the integers to a char buffer
//of hex characters
void setResult( char * res_buff, a_sql_uint32 A, a_sql_uint32 B,
               a_sql_uint32 C, a_sql_uint32 D ) {
    
    char alpha[16] = { '0','1','2',
                        '3','4','5',
                        '6','7','8',
                        '9','a','b',
                        'c','d','e',
                        'f' };

    //Put char representation for A into the buffer
    res_buff[6] = alpha[(int)((A >> 28) & 0xF)];
    res_buff[7] = alpha[(int)((A >> 24) & 0xF)];

    res_buff[4] = alpha[(int)((A >> 20) & 0xF)];
    res_buff[5] = alpha[(int)((A >> 16) & 0xF)];

    res_buff[2] = alpha[(int)((A >> 12) & 0xF)];
    res_buff[3] = alpha[(int)((A >> 8) & 0xF)];

    res_buff[0] = alpha[(int)((A >> 4) & 0xF)];
    res_buff[1] = alpha[(int)((A) & 0xF)];

    //Put char representation for B into the buffer
    res_buff[14] = alpha[(int)((B >> 28) & 0xF)];
    res_buff[15] = alpha[(int)((B >> 24) & 0xF)];

    res_buff[12] = alpha[(int)((B >> 20) & 0xF)];
    res_buff[13] = alpha[(int)((B >> 16) & 0xF)];

    res_buff[10] = alpha[(int)((B >> 12) & 0xF)];
    res_buff[11] = alpha[(int)((B >> 8) & 0xF)];

    res_buff[8] = alpha[(int)((B >> 4) & 0xF)];
    res_buff[9] = alpha[(int)((B) & 0xF)];

    //Put char representation for C into the buffer
    res_buff[22] = alpha[(int)((C >> 28) & 0xF)];
    res_buff[23] = alpha[(int)((C >> 24) & 0xF)];

    res_buff[20] = alpha[(int)((C >> 20) & 0xF)];
    res_buff[21] = alpha[(int)((C >> 16) & 0xF)];

    res_buff[18] = alpha[(int)((C >> 12) & 0xF)];
    res_buff[19] = alpha[(int)((C >> 8) & 0xF)];

    res_buff[16] = alpha[(int)((C >> 4) & 0xF)];
    res_buff[17] = alpha[(int)((C) & 0xF)];

    //Put char representation for D into the buffer
    res_buff[30] = alpha[(int)((D >> 28) & 0xF)];
    res_buff[31] = alpha[(int)((D >> 24) & 0xF)];

    res_buff[28] = alpha[(int)((D >> 20) & 0xF)];
    res_buff[29] = alpha[(int)((D >> 16) & 0xF)];

    res_buff[26] = alpha[(int)((D >> 12) & 0xF)];
    res_buff[27] = alpha[(int)((D >> 8) & 0xF)];

    res_buff[24] = alpha[(int)((D >> 4) & 0xF)];
    res_buff[25] = alpha[(int)((D) & 0xF)];
}

#if defined __cplusplus
extern "C" {
#endif

static void my_md5_evaluate(a_v3_extfn_scalar_context *cntxt, 
                      void *arg_handle)
{
    an_extfn_value  arg;
    an_extfn_value  outval;

    a_sql_int64 total_len;

    //  Get first argument
    a_sql_uint32 fetchedLength = 0;
    (void) cntxt->get_value( arg_handle, 1, &arg );
    if (arg.data == NULL)
    {
        return;
    }

    // MD5 Algorithm Initialization

    // Init A, B, C and D digest variables
    a_sql_uint32 A = 0x67452301;
    a_sql_uint32 B = 0xEFCDAB89;
    a_sql_uint32 C = 0x98BADCFE;
    a_sql_uint32 D = 0x10325476;

    // Make copies of digest variables
    a_sql_uint32 AA;
    a_sql_uint32 BB;
    a_sql_uint32 CC;
    a_sql_uint32 DD;

    
    // Try to stream with every 1MB and calculate MD5 for each block
    a_sql_int32 block_len = 1024*1024;
    total_len = 0; 						// Total length calculated
    char* buffer = new char[block_len]; 			// Buffer to store streaming data
    a_sql_int64 remain_len = arg.len.total_len - total_len; 	// Length that need to be streamed and calculated
    a_sql_int32 bufLen = block_len; 				// Size of data that need to be streamed in
    a_sql_uint32 * partition = new a_sql_uint32[16];
    a_sql_uint64 i;

    // Stream in data buffer of size 1MB until remain size is smaller or equal to 1MB
    // Calculate MD5 for each block
    while (remain_len > block_len) {
	// Starting fetch data
	fetchedLength = 0;
	while (fetchedLength < bufLen) {
	    if (arg.piece_len <= (bufLen - fetchedLength)) {
		// When buffer has enough space for next piece
		memcpy(buffer + fetchedLength, (char*)arg.data, arg.piece_len);
		fetchedLength += arg.piece_len;
	    } else {
		// When only part of next piece fits in the buffer
		i = bufLen - fetchedLength;
		memcpy(buffer + fetchedLength, (char*)arg.data, i);
		fetchedLength += i;
	    }
	    (void) cntxt->get_piece(arg_handle, 1, &arg, total_len + fetchedLength);
	}
	// Buffer is now filled, try to calculate
	// Start the algorithm with partitions of 512 bits (16 32-bit words)
	for( i=0;i<block_len;i+=64) {
	    AA = A;
	    BB = B;
	    CC = C;
	    DD = D;
	    
	    setPartition(partition,buffer+i);
	    digestInput(partition,A,B,C,D);
	    
	    //increment A, B, C and D by their original values
	    A = A + AA;
	    B = B + BB;
	    C = C + CC;
	    D = D + DD;
	}
	
	// increase total_len decrease remain_len
	total_len += fetchedLength;
	remain_len -= fetchedLength;
    }

    // Stream and calculate MD5 for the last block
    // Calculate size of the last block
    // Round up so size is dividable by 64 bytes
    block_len = (64 - (remain_len % 64)) + remain_len;
    if ((block_len - remain_len) < 9) {
	// If the remaining room is not enough for appending 1 byte
	// of a 1 bit and 7 0's, and a 64 bit long message
	block_len += 64; 
    }

    // Since the last block probably has a different size than 1MB,
    // delete the old buffer and allocate a new one to store the data
    delete[] buffer;
    buffer = new char[block_len];
    bufLen = remain_len;

    // Starting fetch data
    fetchedLength = 0;
    while (fetchedLength < bufLen) {
	memcpy(buffer + fetchedLength, (char*)arg.data, arg.piece_len);
	fetchedLength += arg.piece_len;
	(void) cntxt->get_piece(arg_handle, 1, &arg, total_len + fetchedLength);
    }

    //append a 1 bit with 7 0's after
    buffer[bufLen] = (char)(0x80);
    // Pad with 0s and leave 8 bytes at the end for the message length
    for( i = bufLen+1;i<(block_len-8);i++ ) {
	buffer[i] = 0;
    }
    // Append message length as a 64-bit integer
    // Message append at the end of buffer, indicating the length of entire file
    appendMsgLength( buffer, (bufLen+total_len) , block_len);

    // Buffer is now filled, try to calculate
    // Start the algorithm with partitions of 512 bits (16 32-bit words)
    for( i=0;i<block_len;i+=64) {
	AA = A;
	BB = B;
	CC = C;
	DD = D;
	
	setPartition(partition,buffer+i);
	digestInput(partition,A,B,C,D);
	
	//increment A, B, C and D by their original values
	A = A + AA;
	B = B + BB;
	C = C + CC;
	D = D + DD;
    }
  
    // Set the result value
    char * res_buff = new char[32];

    setResult( res_buff,A,B,C,D );

    a_sql_int32 idx;
    for( idx=0;idx<32;idx++ ) {
	outval.type = DT_VARCHAR;
	outval.piece_len = 1;
	outval.data = &(res_buff[idx]);

	cntxt->set_value( arg_handle, &outval, idx );
    }
    
    delete[] buffer;
    delete[] partition;
    delete[] res_buff;
}


// Descriptor for the my_md5 UDF; my_md5() returns its address.
// Only the third member is used: it points at the evaluation callback.
static a_v3_extfn_scalar my_md5_descriptor = { 
    0,          // NOTE(review): presumably an optional start callback, unused here - confirm against extfnapiv3.h
    0,          // NOTE(review): presumably an optional finish callback, unused here - confirm against extfnapiv3.h
    &my_md5_evaluate,
    0,          // Reserved - initialize to NULL
    0,          // Reserved - initialize to NULL
    0,          // Reserved - initialize to NULL
    0,          // Reserved - initialize to NULL
    0,          // Reserved - initialize to NULL
        NULL                    // _for_server_internal_use
};


// Exported entry point (the 'my_md5' in EXTERNAL NAME 'my_md5@libudfex').
// Returns the address of the statically allocated my_md5 descriptor.
a_v3_extfn_scalar *my_md5()
{
    a_v3_extfn_scalar *descriptor = &my_md5_descriptor;
    return descriptor;
}

#if defined __cplusplus
}
#endif