/**
  @file
  @brief Returns a unique hash for each file in a directory
  @details Hashes each file in each directory, and then hashes the hashes to
  create a hash for each directory also.

  This makes use of the new `hashing_file()` and `hashing` functions, available
  since 9.4m6. Interestingly, those functions can be used in pure macro, eg:

      %put %sysfunc(hashing_file(md5,/path/to/file.blob,0));

  Actual usage:

      %let fpath=/some/directory;

      %mp_hashdirectory(&fpath,outds=myhash,maxdepth=2)

      data _null_;
        set work.myhash;
        put (_all_)(=);
      run;

  Whilst files are hashed in their entirety, the logic for creating a folder
  hash is as follows:

  @li Sort the files by filename (case sensitive, uppercase then lower)
  @li Take the first 100 hashes, concatenate and hash
  @li Concatenate this hash with another 100 hashes and hash again
  @li Continue until the end of the folder.  This is the folder hash
  @li If a folder contains other folders, start from the bottom of the tree -
    the folder hashes cascade upwards so you know immediately if there is a
    change in a sub/sub directory
  @li If a subfolder has no content (empty) then it is ignored. No hash created.
  @li If the file is empty, it is also ignored / no hash created.
  @li If the target directory (&inloc) is empty, &outds will also be empty

  SAS Macros

  @li mp_dirlist.sas

  Related Files

  @li mp_hashdataset.sas
  @li mp_hashdirectory.test.sas
  @li mp_md5.sas

  @param [in] inloc Full path of the directory to be hashed (unquoted)
  @param [in] iftrue= (1=1) A condition under which the macro should be executed
  @param [in] maxdepth= (0) Set to a positive integer to indicate the level of
    subdirectory scan recursion - eg 3, to go `./3/levels/deep`.  For unlimited
    recursion, set to MAX.
  @param [in] method= (MD5) the hashing method to use.  Available options:
    @li MD5
    @li SHA1
    @li SHA256
    @li SHA384
    @li SHA512
    @li CRC32
  @param [out] outds= (work.mp_hashdirectory) The output dataset.  Contains:
    @li directory - the parent folder
    @li file_hash - the hash output.  The column is 32 characters wide, widened
      to 40/64/96/128 for SHA1/SHA256/SHA384/SHA512 so the digest is not
      truncated.
    @li hash_duration - how long the hash took (first hash always takes longer)
    @li file_path - /full/path/to/each/file.ext
    @li file_or_folder - contains either "file" or "folder"
    @li level - the depth of the directory (top level is 0)

  @version 9.4m6
  @author Allan Bowe
**/

%macro mp_hashdirectory(inloc,
  outds=work.mp_hashdirectory,
  method=MD5,
  maxdepth=0,
  iftrue=%str(1=1)
)/*/STORE SOURCE*/;

%local curlevel tempds maxlevel hashlen;

%if not(%eval(%unquote(&iftrue))) %then %return;

/**
  * Width of the hash columns.  32 is retained for MD5 (and CRC32) so that
  * existing output is unchanged; the SHA family produce longer hex digests
  * which would otherwise be silently truncated.
  */
%let hashlen=32;
%if %upcase(&method)=SHA1 %then %let hashlen=40;
%else %if %upcase(&method)=SHA256 %then %let hashlen=64;
%else %if %upcase(&method)=SHA384 %then %let hashlen=96;
%else %if %upcase(&method)=SHA512 %then %let hashlen=128;

/* get the directory listing */
%mp_dirlist(path=&inloc, outds=&outds, maxdepth=&maxdepth, showparent=YES)

/* create the file hashes (folder rows are left blank for now) */
data &outds;
  set &outds (rename=(filepath=file_path));
  length FILE_HASH $&hashlen HASH_DURATION 8;
  /* temporary variables (fname, val, fid, is_empty, rc, ts) are excluded */
  keep directory file_hash hash_duration file_path file_or_folder level;

  ts=datetime();
  if file_or_folder='file' then do;
    /* if file is empty, hashing_file will break - so ignore / delete */
    length fname val $8;
    /* fname is blank here, so filename() assigns a generated fileref */
    rc=filename(fname,file_path);
    fid=fopen(fname);
    if fid > 0 then do;
      rc=fread(fid);
      /* fget returns 0 only if data could be read from the first record */
      is_empty=fget(fid,val);
    end;
    rc=fclose(fid);
    rc=filename(fname);
    /* is_empty is also missing (ne 0) when the file could not be opened */
    if is_empty ne 0 then delete;
    else file_hash=hashing_file("&method",cats(file_path),0);
  end;
  hash_duration=datetime()-ts;
run;

proc sort data=&outds ;
  by descending level directory file_path;
run;

/* the deepest level is on the first row, following the descending sort */
%let maxlevel=0;
data _null_;
  set &outds;
  call symputx('maxlevel',level,'l');
  stop;
run;

/* now hash the hashes to populate folder hashes, starting from the bottom */
%do curlevel=&maxlevel %to 0 %by -1;
  data work._data_ (keep=directory file_hash);
    set &outds;
    where level=&curlevel;
    by descending level directory file_path;
    length str $32767 tmp_hash $&hashlen;
    retain str tmp_hash ;
    /* reset vars when starting a new directory */
    if first.directory then do;
      str='';
      tmp_hash='';
      i=0;
    end;
    /* hash each chunk of 100 file hashes, chained with the previous result */
    i+1;
    str=cats(str,file_hash);
    if mod(i,100)=0 or last.directory then do;
      tmp_hash=hashing("&method",cats(tmp_hash,str));
      str='';
    end;
    /* output the hash at directory level */
    if last.directory then do;
      file_hash=tmp_hash;
      output;
    end;
    if last.level then stop;
  run;
  /* capture the system-generated name of the dataset just created */
  %let tempds=&syslast;
  /* join the hash back into the main table */
  proc sql undo_policy=none;
  create table &outds as
    select a.directory
      ,coalesce(b.file_hash,a.file_hash) as file_hash
      ,a.hash_duration
      ,a.file_path
      ,a.file_or_folder
      ,a.level
    from &outds a
    left join &tempds b
    on a.file_path=b.directory
    order by level desc, directory, file_path;
  drop table &tempds;
  /* end the procedure so it is not left running after the macro returns */
  quit;
%end;

%mend mp_hashdirectory;