diff --git a/all.sas b/all.sas index d017a7f..f95256e 100644 --- a/all.sas +++ b/all.sas @@ -4534,6 +4534,9 @@ run; @param [in] maxdepth= (0) Set to a positive integer to indicate the level of subdirectory scan recursion - eg 3, to go `./3/levels/deep`. For unlimited recursion, set to MAX. + @param [in] showparent= (NO) By default, the initial parent directory is not + part of the results. Set to YES to include it. For this record only, + directory=filepath. @param [out] outds= (work.mp_dirlist) The output dataset to create @param [out] getattrs= (NO) If getattrs=YES then the doptname / foptname functions are used to scan all properties - any characters that are not @@ -4570,6 +4573,7 @@ run; , fref=0 , outds=work.mp_dirlist , getattrs=NO + , showparent=NO , maxdepth=0 , level=0 /* The level of recursion to perform. For internal use only. */ )/*/STORE SOURCE*/; @@ -4652,6 +4656,15 @@ data &out_ds(compress=no output; end; rc = dclose(did); + %if &showparent=YES and &level=0 %then %do; + filepath=directory; + file_or_folder='folder'; + ext=''; + filename=scan(directory,-1,'/\'); + msg=''; + level=&level; + output; + %end; stop; run; @@ -4739,6 +4752,9 @@ run; data _null_; set &out_ds; where file_or_folder='folder'; + %if &showparent=YES and &level=0 %then %do; + if filepath ne directory; + %end; length code $10000; code=cats('%nrstr(%mp_dirlist(path=',filepath,",outds=&outds" ,",getattrs=&getattrs,level=%eval(&level+1),maxdepth=&maxdepth))"); @@ -5754,7 +5770,7 @@ data _null_; run; %if %upcase(&showlog)=YES %then %do; - options ps=max; + options ps=max lrecl=max; data _null_; infile &outref; input; @@ -5762,7 +5778,8 @@ run; run; %end; -%mend mp_ds2md;/** +%mend mp_ds2md; +/** @file @brief Create a smaller version of a dataset, without data loss @details This macro will scan the input dataset and create a new one, that @@ -8553,7 +8570,7 @@ run; put hashkey=; run; - ![sas md5 hash dataset log results](https://i.imgur.com/MqF98vk.png) + ![sas md5 hash dataset log 
results](https://i.4gl.io/1/KorUKoyE05.png/raw)

SAS Macros

@li mf_getattrn.sas @@ -8563,11 +8580,12 @@ run;

Related Files

@li mp_hashdataset.test.sas + @li mp_hashdirectory.sas @param [in] libds dataset to hash @param [in] salt= Provide a salt (could be, for instance, the dataset name) - @param [in] iftrue= A condition under which the macro should be executed. - @param [out] outds= (work.mf_hashdataset) The output dataset to create. This + @param [in] iftrue= (1=1) A condition under which the macro should be executed + @param [out] outds= (work._data_) The output dataset to create. This will contain one column (hashkey) with one observation (a $hex32. representation of the input hash) |hashkey:$32.|
@@ -8630,6 +8648,169 @@ run;
   run;
 %end;
 %mend mp_hashdataset;
+/**
+  @file
+  @brief Returns a unique hash for each file in a directory
+  @details Hashes each file in each directory, and then hashes the hashes to
+  create a hash for each directory also.
+
+  This makes use of the new `hashing_file()` and `hashing` functions, available
+  since 9.4m6. Interestingly, these can even be used in pure macro, eg:
+
+      %put %sysfunc(hashing_file(md5,/path/to/file.blob,0));
+
+  An example of this logic being applied in JavaScript is available in the
+  @sasjs/utils library.
+
+  Usage:
+
+      %let fpath=/some/directory;
+
+      %mp_hashdirectory(&fpath,outds=myhash,maxdepth=2)
+
+      data _null_;
+        set work.myhash;
+        put (_all_)(=);
+      run;
+
+  Whilst files are hashed in their entirety, the logic for creating a folder
+  hash is as follows:
+
+  @li Sort the files by filename (case sensitive, uppercase then lower)
+  @li Take the first 100 hashes, concatenate and hash
+  @li Concatenate this hash with another 100 hashes and hash again
+  @li Continue until the end of the folder. This is the folder hash
+  @li If a folder contains other folders, start from the bottom of the tree -
+    the folder hashes cascade upwards so you know immediately if there is a
+    change in a sub/sub directory
+  @li If the folder has no content (empty) then it is ignored. No hash created.
+
+  <h4> SAS Macros </h4>
+  @li mp_dirlist.sas
+
+  <h4> Related Files </h4>
+  @li mp_hashdataset.sas
+  @li mp_hashdirectory.test.sas
+  @li mp_md5.sas
+
+  @param [in] inloc Full path of the directory to be hashed (unquoted)
+  @param [in] iftrue= (1=1) A condition under which the macro should be executed
+  @param [in] maxdepth= (0) Set to a positive integer to indicate the level of
+    subdirectory scan recursion - eg 3, to go `./3/levels/deep`. For unlimited
+    recursion, set to MAX.
+  @param [in] method= (MD5) the hashing method to use. Available options:
+    @li MD5
+    @li SHA1
+    @li SHA256
+    @li SHA384
+    @li SHA512
+    @li CRC32
+  @param [out] outds= (work.mp_hashdirectory) The output dataset. Contains:
+    @li directory - the parent folder
+    @li file_hash - the hash output (hex, width depends on the method)
+    @li hash_duration - how long the hash took (first hash always takes longer)
+    @li file_path - /full/path/to/each/file.ext
+    @li file_or_folder - contains either "file" or "folder"
+    @li level - the depth of the directory (top level is 0)
+
+  @version 9.4m6
+  @author Allan Bowe
+**/
+
+%macro mp_hashdirectory(inloc,
+  outds=work.mp_hashdirectory,
+  method=MD5,
+  maxdepth=0,
+  iftrue=%str(1=1)
+)/*/STORE SOURCE*/;
+
+%local curlevel tempds maxlevel hashlen;
+
+%if not(%eval(%unquote(&iftrue))) %then %return;
+
+/* size the hash variables to the hex digest of the chosen method - a fixed
+  $32 would silently truncate the SHA1/SHA256/SHA384/SHA512 outputs */
+%let hashlen=32;
+%if %upcase(&method)=SHA1 %then %let hashlen=40;
+%else %if %upcase(&method)=SHA256 %then %let hashlen=64;
+%else %if %upcase(&method)=SHA384 %then %let hashlen=96;
+%else %if %upcase(&method)=SHA512 %then %let hashlen=128;
+
+/* get the directory listing */
+%mp_dirlist(path=&inloc, outds=&outds, maxdepth=&maxdepth, showparent=YES)
+
+/* create the hashes */
+data &outds;
+  set &outds (rename=(filepath=file_path));
+  length FILE_HASH $&hashlen HASH_DURATION 8;
+  keep directory file_hash hash_duration file_path file_or_folder level;
+
+  ts=datetime();
+  if file_or_folder='file' then do;
+    file_hash=hashing_file("&method",cats(file_path),0);
+  end;
+  hash_duration=datetime()-ts;
+run;
+
+proc sort data=&outds ;
+  by descending level directory file_path;
+run;
+
+/* default of -1 means the folder loop is skipped if the listing is empty */
+%let maxlevel=-1;
+data _null_;
+  set &outds;
+  call symputx('maxlevel',level,'l');
+  stop;
+run;
+
+/* now hash the hashes to populate folder hashes, starting from the bottom */
+%do curlevel=&maxlevel %to 0 %by -1;
+  data work._data_ (keep=directory file_hash);
+    set &outds;
+    where level=&curlevel;
+    by descending level directory file_path;
+    length str $32767 tmp_hash $&hashlen;
+    retain str tmp_hash ;
+    /* reset vars when starting a new directory */
+    if first.directory then do;
+      str='';
+      tmp_hash='';
+      i=0;
+    end;
+    /* hash each chunk of 100 file hashes */
+    i+1;
+    str=cats(str,file_hash);
+    if mod(i,100)=0 or last.directory then do;
+      tmp_hash=hashing("&method",cats(tmp_hash,str));
+      str='';
+    end;
+    /* output the hash at directory level */
+    if last.directory then do;
+      file_hash=tmp_hash;
+      output;
+    end;
+    if last.level then stop;
+  run;
+  %let tempds=&syslast;
+  /* join the hash back into the main table */
+  proc sql undo_policy=none;
+  create table &outds as
+    select a.directory
+      ,coalesce(b.file_hash,a.file_hash) as file_hash
+      ,a.hash_duration
+      ,a.file_path
+      ,a.file_or_folder
+      ,a.level
+    from &outds a
+    left join &tempds b
+    on a.file_path=b.directory
+    order by level desc, directory, file_path;
+  drop table &tempds;
+  quit;
+%end;
+
+%mend mp_hashdirectory;
 /**
   @file
   @brief Performs a wrapped \%include
diff --git a/base/mp_dirlist.sas b/base/mp_dirlist.sas index 4807b00..2af46d2 100644 --- a/base/mp_dirlist.sas +++ b/base/mp_dirlist.sas @@ -27,6 +27,9 @@ @param [in] maxdepth= (0) Set to a positive integer to indicate the level of subdirectory scan recursion - eg 3, to go `./3/levels/deep`. For unlimited recursion, set to MAX. + @param [in] showparent= (NO) By default, the initial parent directory is not + part of the results. Set to YES to include it. For this record only, + directory=filepath. @param [out] outds= (work.mp_dirlist) The output dataset to create @param [out] getattrs= (NO) If getattrs=YES then the doptname / foptname functions are used to scan all properties - any characters that are not @@ -63,6 +66,7 @@ , fref=0 , outds=work.mp_dirlist , getattrs=NO + , showparent=NO , maxdepth=0 , level=0 /* The level of recursion to perform. For internal use only. 
*/ )/*/STORE SOURCE*/; @@ -145,6 +149,15 @@ data &out_ds(compress=no output; end; rc = dclose(did); + %if &showparent=YES and &level=0 %then %do; + filepath=directory; + file_or_folder='folder'; + ext=''; + filename=scan(directory,-1,'/\'); + msg=''; + level=&level; + output; + %end; stop; run; @@ -232,6 +245,9 @@ run; data _null_; set &out_ds; where file_or_folder='folder'; + %if &showparent=YES and &level=0 %then %do; + if filepath ne directory; + %end; length code $10000; code=cats('%nrstr(%mp_dirlist(path=',filepath,",outds=&outds" ,",getattrs=&getattrs,level=%eval(&level+1),maxdepth=&maxdepth))"); diff --git a/base/mp_ds2md.sas b/base/mp_ds2md.sas index 543c5cc..7febabe 100644 --- a/base/mp_ds2md.sas +++ b/base/mp_ds2md.sas @@ -92,7 +92,7 @@ data _null_; run; %if %upcase(&showlog)=YES %then %do; - options ps=max; + options ps=max lrecl=max; data _null_; infile &outref; input; @@ -100,4 +100,4 @@ run; run; %end; -%mend mp_ds2md; \ No newline at end of file +%mend mp_ds2md; diff --git a/base/mp_hashdataset.sas b/base/mp_hashdataset.sas index f469e7e..19e8f44 100644 --- a/base/mp_hashdataset.sas +++ b/base/mp_hashdataset.sas @@ -11,7 +11,7 @@ put hashkey=; run; - ![sas md5 hash dataset log results](https://i.imgur.com/MqF98vk.png) + ![sas md5 hash dataset log results](https://i.4gl.io/1/KorUKoyE05.png/raw)

SAS Macros

@li mf_getattrn.sas @@ -21,11 +21,12 @@

Related Files

@li mp_hashdataset.test.sas + @li mp_hashdirectory.sas @param [in] libds dataset to hash @param [in] salt= Provide a salt (could be, for instance, the dataset name) - @param [in] iftrue= A condition under which the macro should be executed. - @param [out] outds= (work.mf_hashdataset) The output dataset to create. This + @param [in] iftrue= (1=1) A condition under which the macro should be executed + @param [out] outds= (work._data_) The output dataset to create. This will contain one column (hashkey) with one observation (a $hex32. representation of the input hash) |hashkey:$32.|
diff --git a/base/mp_hashdirectory.sas b/base/mp_hashdirectory.sas
new file mode 100644
index 0000000..c523841
--- /dev/null
+++ b/base/mp_hashdirectory.sas
@@ -0,0 +1,163 @@
+/**
+  @file
+  @brief Returns a unique hash for each file in a directory
+  @details Hashes each file in each directory, and then hashes the hashes to
+  create a hash for each directory also.
+
+  This makes use of the new `hashing_file()` and `hashing` functions, available
+  since 9.4m6. Interestingly, these can even be used in pure macro, eg:
+
+      %put %sysfunc(hashing_file(md5,/path/to/file.blob,0));
+
+  An example of this logic being applied in JavaScript is available in the
+  @sasjs/utils library.
+
+  Usage:
+
+      %let fpath=/some/directory;
+
+      %mp_hashdirectory(&fpath,outds=myhash,maxdepth=2)
+
+      data _null_;
+        set work.myhash;
+        put (_all_)(=);
+      run;
+
+  Whilst files are hashed in their entirety, the logic for creating a folder
+  hash is as follows:
+
+  @li Sort the files by filename (case sensitive, uppercase then lower)
+  @li Take the first 100 hashes, concatenate and hash
+  @li Concatenate this hash with another 100 hashes and hash again
+  @li Continue until the end of the folder. This is the folder hash
+  @li If a folder contains other folders, start from the bottom of the tree -
+    the folder hashes cascade upwards so you know immediately if there is a
+    change in a sub/sub directory
+  @li If the folder has no content (empty) then it is ignored. No hash created.
+
+  <h4> SAS Macros </h4>
+  @li mp_dirlist.sas
+
+  <h4> Related Files </h4>
+  @li mp_hashdataset.sas
+  @li mp_hashdirectory.test.sas
+  @li mp_md5.sas
+
+  @param [in] inloc Full path of the directory to be hashed (unquoted)
+  @param [in] iftrue= (1=1) A condition under which the macro should be executed
+  @param [in] maxdepth= (0) Set to a positive integer to indicate the level of
+    subdirectory scan recursion - eg 3, to go `./3/levels/deep`. For unlimited
+    recursion, set to MAX.
+  @param [in] method= (MD5) the hashing method to use. Available options:
+    @li MD5
+    @li SHA1
+    @li SHA256
+    @li SHA384
+    @li SHA512
+    @li CRC32
+  @param [out] outds= (work.mp_hashdirectory) The output dataset. Contains:
+    @li directory - the parent folder
+    @li file_hash - the hash output (hex, width depends on the method)
+    @li hash_duration - how long the hash took (first hash always takes longer)
+    @li file_path - /full/path/to/each/file.ext
+    @li file_or_folder - contains either "file" or "folder"
+    @li level - the depth of the directory (top level is 0)
+
+  @version 9.4m6
+  @author Allan Bowe
+**/
+
+%macro mp_hashdirectory(inloc,
+  outds=work.mp_hashdirectory,
+  method=MD5,
+  maxdepth=0,
+  iftrue=%str(1=1)
+)/*/STORE SOURCE*/;
+
+%local curlevel tempds maxlevel hashlen;
+
+%if not(%eval(%unquote(&iftrue))) %then %return;
+
+/* size the hash variables to the hex digest of the chosen method - a fixed
+  $32 would silently truncate the SHA1/SHA256/SHA384/SHA512 outputs */
+%let hashlen=32;
+%if %upcase(&method)=SHA1 %then %let hashlen=40;
+%else %if %upcase(&method)=SHA256 %then %let hashlen=64;
+%else %if %upcase(&method)=SHA384 %then %let hashlen=96;
+%else %if %upcase(&method)=SHA512 %then %let hashlen=128;
+
+/* get the directory listing */
+%mp_dirlist(path=&inloc, outds=&outds, maxdepth=&maxdepth, showparent=YES)
+
+/* create the hashes */
+data &outds;
+  set &outds (rename=(filepath=file_path));
+  length FILE_HASH $&hashlen HASH_DURATION 8;
+  keep directory file_hash hash_duration file_path file_or_folder level;
+
+  ts=datetime();
+  if file_or_folder='file' then do;
+    file_hash=hashing_file("&method",cats(file_path),0);
+  end;
+  hash_duration=datetime()-ts;
+run;
+
+proc sort data=&outds ;
+  by descending level directory file_path;
+run;
+
+/* default of -1 means the folder loop is skipped if the listing is empty */
+%let maxlevel=-1;
+data _null_;
+  set &outds;
+  call symputx('maxlevel',level,'l');
+  stop;
+run;
+
+/* now hash the hashes to populate folder hashes, starting from the bottom */
+%do curlevel=&maxlevel %to 0 %by -1;
+  data work._data_ (keep=directory file_hash);
+    set &outds;
+    where level=&curlevel;
+    by descending level directory file_path;
+    length str $32767 tmp_hash $&hashlen;
+    retain str tmp_hash ;
+    /* reset vars when starting a new directory */
+    if first.directory then do;
+      str='';
+      tmp_hash='';
+      i=0;
+    end;
+    /* hash each chunk of 100 file hashes */
+    i+1;
+    str=cats(str,file_hash);
+    if mod(i,100)=0 or last.directory then do;
+      tmp_hash=hashing("&method",cats(tmp_hash,str));
+      str='';
+    end;
+    /* output the hash at directory level */
+    if last.directory then do;
+      file_hash=tmp_hash;
+      output;
+    end;
+    if last.level then stop;
+  run;
+  %let tempds=&syslast;
+  /* join the hash back into the main table */
+  proc sql undo_policy=none;
+  create table &outds as
+    select a.directory
+      ,coalesce(b.file_hash,a.file_hash) as file_hash
+      ,a.hash_duration
+      ,a.file_path
+      ,a.file_or_folder
+      ,a.level
+    from &outds a
+    left join &tempds b
+    on a.file_path=b.directory
+    order by level desc, directory, file_path;
+  drop table &tempds;
+  quit;
+%end;
+
+%mend mp_hashdirectory;
diff --git a/tests/base/mp_hashdirectory.test.sas b/tests/base/mp_hashdirectory.test.sas new file mode 100644 index 0000000..08a0457 --- /dev/null +++ b/tests/base/mp_hashdirectory.test.sas @@ -0,0 +1,133 @@ +/** + @file + @brief Testing mp_hashdirectory.sas macro + + +

SAS Macros

+  @li mf_mkdir.sas
+  @li mf_nobs.sas
+  @li mp_assert.sas
+  @li mp_assertscope.sas
+  @li mp_hashdirectory.sas
+
+**/
+
+/* set up a directory to hash */
+%let fpath=%sysfunc(pathname(work))/testdir;
+
+%mf_mkdir(&fpath)
+%mf_mkdir(&fpath/sub1)
+%mf_mkdir(&fpath/sub2)
+%mf_mkdir(&fpath/sub1/subsub)
+
+/* note - the path in the file means the hash is different in each run */
+%macro makefile(path,name);
+  data _null_;
+    file "&path/&name" termstr=lf;
+    put "This file is located at:";
+    put "&path";
+    put "and it is called:";
+    put "&name";
+  run;
+%mend makefile;
+
+%macro spawner(path);
+  %local x; /* keep the loop counter out of the calling scope */
+  %do x=1 %to 5;
+    %makefile(&path,file&x..txt)
+  %end;
+%mend spawner;
+
+%spawner(&fpath)
+%spawner(&fpath/sub1)
+%spawner(&fpath/sub1/subsub)
+
+
+%mp_assertscope(SNAPSHOT)
+%mp_hashdirectory(&fpath,outds=work.hashes,maxdepth=MAX)
+%mp_assertscope(COMPARE)
+
+%mp_assert(
+  iftrue=(&syscc=0),
+  desc=No errors,
+  outds=work.test_results
+)
+
+%mp_assert(
+  iftrue=(%mf_nobs(work.hashes)=19),
+  desc=record created for each entry,
+  outds=work.test_results
+)
+
+proc sql;
+select count(*) into: misscheck
+  from work.hashes
+  where file_hash is missing;
+
+%mp_assert(
+  iftrue=(&misscheck=1),
+  desc=Only one missing hash - the empty directory,
+  outds=work.test_results
+)
+
+data _null_;
+  set work.hashes;
+  if directory=file_path then call symputx('tophash',file_hash);
+run;
+
+%mp_assert(
+  iftrue=(%length(&tophash)=32),
+  desc=ensure valid top level hash created,
+  outds=work.test_results
+)
+
+/* now change a file and re-hash */
+data _null_;
+  file "&fpath/sub1/subsub/file1.txt" termstr=lf;
+  put "This file has changed!";
+run;
+
+%mp_hashdirectory(&fpath,outds=work.hashes2,maxdepth=MAX)
+
+data _null_;
+  set work.hashes2;
+  if directory=file_path then call symputx('tophash2',file_hash);
+run;
+
+%mp_assert(
+  iftrue=(&tophash ne &tophash2),
+  desc=ensure the changing of the hash results in a new value,
+  outds=work.test_results
+)
+
+/* now change it back and see if it matches */
+data _null_;
+  file "&fpath/sub1/subsub/file1.txt" termstr=lf;
+  put "This file is located at:";
+  put "&fpath/sub1/subsub";
+  put "and it is called:";
+  put "file1.txt";
+run;
+
+%mp_hashdirectory(&fpath,outds=work.hashes3,maxdepth=MAX)
+
+data _null_;
+  set work.hashes3;
+  if directory=file_path then call symputx('tophash3',file_hash);
+run;
+
+%mp_assert(
+  iftrue=(&tophash=&tophash3),
+  desc=ensure the same files result in the same hash,
+  outds=work.test_results
+)
+
+/* dump contents for debugging */
+data _null_;
+  set work.hashes;
+  put file_hash file_path;
+run;
+data _null_;
+  set work.hashes2;
+  put file_hash file_path;
+run;