Re: md5(large_object_id)

Karsten Hilbert <Karsten.Hilbert@xxxxxxx> · Wed, 7 Oct 2015 17:34:39 +0200

For the record - I have also devised another solution to the
underlying problem (md5(bytea) cannot process large amounts
of input), chunked md5():

	-- lo_chunked_md5(large object OID, chunk size in bytes) -> text
	--
	-- Computes a "chunked" MD5 over a large object: the object is read in
	-- consecutive pieces of at most chunk_size bytes, the hex MD5 of each
	-- piece is concatenated in order, and the MD5 of that concatenation is
	-- returned. This sidesteps md5(bytea) being unable to process very
	-- large inputs in one go.
	--
	-- The last piece may be shorter than chunk_size. No digest is added for
	-- an empty trailing piece, so an empty object yields md5 of the empty
	-- string and a size that is an exact multiple of chunk_size yields
	-- exactly size / chunk_size partial digests (this matches a client that
	-- reads the file chunk by chunk until end-of-file).
	--
	-- Returns NULL if either argument is NULL (STRICT).
	-- Raises an exception if chunk_size is not positive.
	create or replace function lo_chunked_md5(oid, int)
		returns text
		language 'plpgsql'
		stable strict
		as '
	DECLARE
		lo_id alias for $1;
		chunk_size alias for $2;
		_lo_fd integer;
		-- bigint: large objects can exceed the 2 GB range of integer
		_lo_size bigint;
		_offset bigint;
		_md5_concat text;
		INV_READ  constant integer := x''40000''::integer;
		SEEK_END  constant integer := 2;
	BEGIN
		-- a non-positive chunk size would never advance the offset
		IF chunk_size <= 0 THEN
			RAISE EXCEPTION ''lo_chunked_md5: chunk_size must be > 0, got %'', chunk_size;
		END IF;

		-- lo_open() is expected to raise an error if lo_id does not exist,
		-- which doubles as the existence check
		_lo_fd := lo_open(lo_id, INV_READ);
		-- get size: seeking to the end returns the resulting position,
		-- use the 64 bit variant so sizes of 2 GB and more do not overflow
		_lo_size := lo_lseek64(_lo_fd, 0, SEEK_END);
		PERFORM lo_close(_lo_fd);

		-- loop over chunks; lo_get() returns fewer bytes than requested
		-- when the end of the object is reached, which covers the remainder
		_md5_concat := '''';
		_offset := 0;
		WHILE _offset < _lo_size LOOP
			_md5_concat := _md5_concat || md5(lo_get(lo_id, _offset, chunk_size));
			_offset := _offset + chunk_size;
		END LOOP;

		return md5(_md5_concat);
	END;';

This can easily be mirrored by a client-side function, say,

	def file2chunked_md5(filename=None, chunk_size=500*_MB):
		"""Return the "chunked" MD5 hex digest of a file.

		The file is read in binary mode in pieces of at most
		*chunk_size* bytes. The hex MD5 digest of each piece is
		concatenated in read order, and the hex MD5 digest of that
		concatenation is returned. Reading stops at the first empty
		read, so no digest is added for an empty trailing piece; an
		empty file yields the MD5 of the empty string.

		Args:
			filename: path of the file to hash.
			chunk_size: maximum number of bytes hashed per piece.

		Returns:
			The hex digest (string) of the concatenated per-chunk
			hex digests.

		Raises:
			Whatever open()/read() raise for an unreadable file.
		"""
		_log.debug('chunked_md5(%s, %s)', filename, chunk_size)
		chunk_digests = []
		# "with" guarantees the file is closed even if a read fails
		with open(filename, 'rb') as f:
			# iter() with a sentinel stops at the first empty read (EOF)
			for data in iter(lambda: f.read(chunk_size), b''):
				chunk_digests.append(hashlib.md5(data).hexdigest())
		md5_concat = ''.join(chunk_digests)

		# hex digests are pure ASCII; hashing needs bytes rather than text
		hex_digest = hashlib.md5(md5_concat.encode('ascii')).hexdigest()

		_log.debug('md5(%s): %s', md5_concat, hex_digest)

		return hex_digest

in Python.

Still, I'd welcome a native, streaming md5(loid) which is
bound to be more optimized by design.

Karsten
-- 
GPG key ID E4071346 @ eu.pool.sks-keyservers.net
E167 67FD A291 2BEA 73BD  4537 78B9 A9F9 E407 1346

-- 
Sent via pgsql-general mailing list (pgsql-general@xxxxxxxxxxxxxx)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-general