diff --git a/all.sas b/all.sas
index d017a7f..f95256e 100644
--- a/all.sas
+++ b/all.sas
@@ -4534,6 +4534,9 @@ run;
@param [in] maxdepth= (0) Set to a positive integer to indicate the level of
subdirectory scan recursion - eg 3, to go `./3/levels/deep`. For unlimited
recursion, set to MAX.
+ @param [in] showparent= (NO) By default, the initial parent directory is not
+ part of the results. Set to YES to include it. For this record only,
+ directory=filepath.
@param [out] outds= (work.mp_dirlist) The output dataset to create
@param [out] getattrs= (NO) If getattrs=YES then the doptname / foptname
functions are used to scan all properties - any characters that are not
@@ -4570,6 +4573,7 @@ run;
, fref=0
, outds=work.mp_dirlist
, getattrs=NO
+ , showparent=NO
, maxdepth=0
, level=0 /* The level of recursion to perform. For internal use only. */
)/*/STORE SOURCE*/;
@@ -4652,6 +4656,15 @@ data &out_ds(compress=no
output;
end;
rc = dclose(did);
+ %if &showparent=YES and &level=0 %then %do;
+ filepath=directory;
+ file_or_folder='folder';
+ ext='';
+ filename=scan(directory,-1,'/\');
+ msg='';
+ level=&level;
+ output;
+ %end;
stop;
run;
@@ -4739,6 +4752,9 @@ run;
data _null_;
set &out_ds;
where file_or_folder='folder';
+ %if &showparent=YES and &level=0 %then %do;
+ if filepath ne directory;
+ %end;
length code $10000;
code=cats('%nrstr(%mp_dirlist(path=',filepath,",outds=&outds"
,",getattrs=&getattrs,level=%eval(&level+1),maxdepth=&maxdepth))");
@@ -5754,7 +5770,7 @@ data _null_;
run;
%if %upcase(&showlog)=YES %then %do;
- options ps=max;
+ options ps=max lrecl=max;
data _null_;
infile &outref;
input;
@@ -5762,7 +5778,8 @@ run;
run;
%end;
-%mend mp_ds2md;/**
+%mend mp_ds2md;
+/**
@file
@brief Create a smaller version of a dataset, without data loss
@details This macro will scan the input dataset and create a new one, that
@@ -8553,7 +8570,7 @@ run;
put hashkey=;
run;
- 
+ 
SAS Macros
@li mf_getattrn.sas
@@ -8563,11 +8580,12 @@ run;
Related Files
@li mp_hashdataset.test.sas
+ @li mp_hashdirectory.sas
@param [in] libds dataset to hash
@param [in] salt= Provide a salt (could be, for instance, the dataset name)
- @param [in] iftrue= A condition under which the macro should be executed.
- @param [out] outds= (work.mf_hashdataset) The output dataset to create. This
+ @param [in] iftrue= (1=1) A condition under which the macro should be executed
+ @param [out] outds= (work._data_) The output dataset to create. This
will contain one column (hashkey) with one observation (a $hex32.
representation of the input hash)
|hashkey:$32.|
@@ -8630,6 +8648,158 @@ run;
run;
%end;
%mend mp_hashdataset;
+/**
+ @file
+ @brief Returns a unique hash for each file in a directory
+ @details Hashes each file in each directory, and then hashes the hashes to
+ create a hash for each directory also.
+
+ This makes use of the new `hashing_file()` and `hashing` functions, available
+ since 9.4m6. Interestingly, these can even be used in pure macro, eg:
+
+ %put %sysfunc(hashing_file(md5,/path/to/file.blob,0));
+
+ An example of this logic being applied in JavaScript is available in the
+ @sasjs/utils library.
+
+ Usage:
+
+ %let fpath=/some/directory;
+
+ %mp_hashdirectory(&fpath,outds=myhash,maxdepth=2)
+
+ data _null_;
+ set work.myhash;
+ put (_all_)(=);
+ run;
+
+ Whilst files are hashed in their entirety, the logic for creating a folder
+ hash is as follows:
+
+ @li Sort the files by filename (case sensitive, uppercase then lower)
+ @li Take the first 100 hashes, concatenate and hash
+ @li Concatenate this hash with another 100 hashes and hash again
+ @li Continue until the end of the folder. This is the folder hash
+ @li If a folder contains other folders, start from the bottom of the tree -
+ the folder hashes cascade upwards so you know immediately if there is a
+ change in a sub/sub directory
+ @li If the folder has no content (empty) then it is ignored. No hash created.
+
+ SAS Macros
+ @li mp_dirlist.sas
+
+ Related Files
+ @li mp_hashdataset.sas
+ @li mp_hashdirectory.test.sas
+ @li mp_md5.sas
+
+ @param [in] inloc Full path of the directory to be hashed (unquoted)
+ @param [in] iftrue= (1=1) A condition under which the macro should be executed
+ @param [in] maxdepth= (0) Set to a positive integer to indicate the level of
+ subdirectory scan recursion - eg 3, to go `./3/levels/deep`. For unlimited
+ recursion, set to MAX.
+ @param [in] method= (MD5) the hashing method to use. Available options:
+ @li MD5
+ @li SHA1
+ @li SHA256
+ @li SHA384
+ @li SHA512
+ @li CRC32
+ @param [out] outds= (work.mp_hashdirectory) The output dataset. Contains:
+ @li directory - the parent folder
+ @li file_hash - the hash output
+ @li hash_duration - how long the hash took (first hash always takes longer)
+ @li file_path - /full/path/to/each/file.ext
+ @li file_or_folder - contains either "file" or "folder"
+ @li level - the depth of the directory (top level is 0)
+
+ @version 9.4m6
+ @author Allan Bowe
+**/
+
+%macro mp_hashdirectory(inloc,
+  outds=work.mp_hashdirectory,
+  method=MD5,
+  maxdepth=0,
+  iftrue=%str(1=1)
+)/*/STORE SOURCE*/;
+
+%local curlevel tempds maxlevel;
+%let maxlevel=-1; /* an empty listing then skips the folder-hash loop */
+%if not(%eval(%unquote(&iftrue))) %then %return;
+
+/* get the directory listing (the top level folder is included as a row) */
+%mp_dirlist(path=&inloc, outds=&outds, maxdepth=&maxdepth, showparent=YES)
+
+/* create the file hashes - $128 is wide enough for SHA512 (128 hex chars) */
+data &outds;
+  set &outds (rename=(filepath=file_path));
+  length FILE_HASH $128 HASH_DURATION 8;
+  keep directory file_hash hash_duration file_path file_or_folder level;
+  ts=datetime();
+  if file_or_folder='file' then do;
+    file_hash=hashing_file("&method",cats(file_path),0);
+  end;
+  hash_duration=datetime()-ts;
+run;
+
+proc sort data=&outds ;
+  by descending level directory file_path;
+run;
+
+data _null_;
+  set &outds;
+  call symputx('maxlevel',level,'l'); /* first row is the deepest level */
+  stop;
+run;
+
+/* now hash the hashes to populate folder hashes, starting from the bottom */
+%do curlevel=&maxlevel %to 0 %by -1;
+  data work._data_ (keep=directory file_hash);
+    set &outds;
+    where level=&curlevel;
+    by descending level directory file_path;
+    length str $32767 tmp_hash $128;
+    retain str tmp_hash ;
+    /* reset vars when starting a new directory */
+    if first.directory then do;
+      str='';
+      tmp_hash='';
+      i=0;
+    end;
+    /* hash each chunk of 100 file hashes, chained with the previous result */
+    i+1;
+    str=cats(str,file_hash);
+    if mod(i,100)=0 or last.directory then do;
+      tmp_hash=hashing("&method",cats(tmp_hash,str));
+      str='';
+    end;
+    /* output the hash at directory level */
+    if last.directory then do;
+      file_hash=tmp_hash;
+      output;
+    end;
+    if last.level then stop;
+  run;
+  %let tempds=&syslast;
+  /* join the hash back into the main table */
+  proc sql undo_policy=none;
+  create table &outds as
+    select a.directory
+      ,coalesce(b.file_hash,a.file_hash) as file_hash
+      ,a.hash_duration
+      ,a.file_path
+      ,a.file_or_folder
+      ,a.level
+    from &outds a
+    left join &tempds b
+    on a.file_path=b.directory
+    order by level desc, directory, file_path;
+  drop table &tempds;
+  quit;
+%end;
+
+%mend mp_hashdirectory;
/**
@file
@brief Performs a wrapped \%include
diff --git a/base/mp_dirlist.sas b/base/mp_dirlist.sas
index 4807b00..2af46d2 100644
--- a/base/mp_dirlist.sas
+++ b/base/mp_dirlist.sas
@@ -27,6 +27,9 @@
@param [in] maxdepth= (0) Set to a positive integer to indicate the level of
subdirectory scan recursion - eg 3, to go `./3/levels/deep`. For unlimited
recursion, set to MAX.
+ @param [in] showparent= (NO) By default, the initial parent directory is not
+ part of the results. Set to YES to include it. For this record only,
+ directory=filepath.
@param [out] outds= (work.mp_dirlist) The output dataset to create
@param [out] getattrs= (NO) If getattrs=YES then the doptname / foptname
functions are used to scan all properties - any characters that are not
@@ -63,6 +66,7 @@
, fref=0
, outds=work.mp_dirlist
, getattrs=NO
+ , showparent=NO
, maxdepth=0
, level=0 /* The level of recursion to perform. For internal use only. */
)/*/STORE SOURCE*/;
@@ -145,6 +149,15 @@ data &out_ds(compress=no
output;
end;
rc = dclose(did);
+ %if &showparent=YES and &level=0 %then %do;
+ filepath=directory;
+ file_or_folder='folder';
+ ext='';
+ filename=scan(directory,-1,'/\');
+ msg='';
+ level=&level;
+ output;
+ %end;
stop;
run;
@@ -232,6 +245,9 @@ run;
data _null_;
set &out_ds;
where file_or_folder='folder';
+ %if &showparent=YES and &level=0 %then %do;
+ if filepath ne directory;
+ %end;
length code $10000;
code=cats('%nrstr(%mp_dirlist(path=',filepath,",outds=&outds"
,",getattrs=&getattrs,level=%eval(&level+1),maxdepth=&maxdepth))");
diff --git a/base/mp_ds2md.sas b/base/mp_ds2md.sas
index 543c5cc..7febabe 100644
--- a/base/mp_ds2md.sas
+++ b/base/mp_ds2md.sas
@@ -92,7 +92,7 @@ data _null_;
run;
%if %upcase(&showlog)=YES %then %do;
- options ps=max;
+ options ps=max lrecl=max;
data _null_;
infile &outref;
input;
@@ -100,4 +100,4 @@ run;
run;
%end;
-%mend mp_ds2md;
\ No newline at end of file
+%mend mp_ds2md;
diff --git a/base/mp_hashdataset.sas b/base/mp_hashdataset.sas
index f469e7e..19e8f44 100644
--- a/base/mp_hashdataset.sas
+++ b/base/mp_hashdataset.sas
@@ -11,7 +11,7 @@
put hashkey=;
run;
- 
+ 
SAS Macros
@li mf_getattrn.sas
@@ -21,11 +21,12 @@
Related Files
@li mp_hashdataset.test.sas
+ @li mp_hashdirectory.sas
@param [in] libds dataset to hash
@param [in] salt= Provide a salt (could be, for instance, the dataset name)
- @param [in] iftrue= A condition under which the macro should be executed.
- @param [out] outds= (work.mf_hashdataset) The output dataset to create. This
+ @param [in] iftrue= (1=1) A condition under which the macro should be executed
+ @param [out] outds= (work._data_) The output dataset to create. This
will contain one column (hashkey) with one observation (a $hex32.
representation of the input hash)
|hashkey:$32.|
diff --git a/base/mp_hashdirectory.sas b/base/mp_hashdirectory.sas
new file mode 100644
index 0000000..c523841
--- /dev/null
+++ b/base/mp_hashdirectory.sas
@@ -0,0 +1,152 @@
+/**
+ @file
+ @brief Returns a unique hash for each file in a directory
+ @details Hashes each file in each directory, and then hashes the hashes to
+ create a hash for each directory also.
+
+ This makes use of the new `hashing_file()` and `hashing` functions, available
+ since 9.4m6. Interestingly, these can even be used in pure macro, eg:
+
+ %put %sysfunc(hashing_file(md5,/path/to/file.blob,0));
+
+ An example of this logic being applied in JavaScript is available in the
+ @sasjs/utils library.
+
+ Usage:
+
+ %let fpath=/some/directory;
+
+ %mp_hashdirectory(&fpath,outds=myhash,maxdepth=2)
+
+ data _null_;
+ set work.myhash;
+ put (_all_)(=);
+ run;
+
+ Whilst files are hashed in their entirety, the logic for creating a folder
+ hash is as follows:
+
+ @li Sort the files by filename (case sensitive, uppercase then lower)
+ @li Take the first 100 hashes, concatenate and hash
+ @li Concatenate this hash with another 100 hashes and hash again
+ @li Continue until the end of the folder. This is the folder hash
+ @li If a folder contains other folders, start from the bottom of the tree -
+ the folder hashes cascade upwards so you know immediately if there is a
+ change in a sub/sub directory
+ @li If the folder has no content (empty) then it is ignored. No hash created.
+
+ SAS Macros
+ @li mp_dirlist.sas
+
+ Related Files
+ @li mp_hashdataset.sas
+ @li mp_hashdirectory.test.sas
+ @li mp_md5.sas
+
+ @param [in] inloc Full path of the directory to be hashed (unquoted)
+ @param [in] iftrue= (1=1) A condition under which the macro should be executed
+ @param [in] maxdepth= (0) Set to a positive integer to indicate the level of
+ subdirectory scan recursion - eg 3, to go `./3/levels/deep`. For unlimited
+ recursion, set to MAX.
+ @param [in] method= (MD5) the hashing method to use. Available options:
+ @li MD5
+ @li SHA1
+ @li SHA256
+ @li SHA384
+ @li SHA512
+ @li CRC32
+ @param [out] outds= (work.mp_hashdirectory) The output dataset. Contains:
+ @li directory - the parent folder
+ @li file_hash - the hash output
+ @li hash_duration - how long the hash took (first hash always takes longer)
+ @li file_path - /full/path/to/each/file.ext
+ @li file_or_folder - contains either "file" or "folder"
+ @li level - the depth of the directory (top level is 0)
+
+ @version 9.4m6
+ @author Allan Bowe
+**/
+
+%macro mp_hashdirectory(inloc,
+  outds=work.mp_hashdirectory,
+  method=MD5,
+  maxdepth=0,
+  iftrue=%str(1=1)
+)/*/STORE SOURCE*/;
+
+%local curlevel tempds maxlevel;
+%let maxlevel=-1; /* an empty listing then skips the folder-hash loop */
+%if not(%eval(%unquote(&iftrue))) %then %return;
+
+/* get the directory listing (the top level folder is included as a row) */
+%mp_dirlist(path=&inloc, outds=&outds, maxdepth=&maxdepth, showparent=YES)
+
+/* create the file hashes - $128 is wide enough for SHA512 (128 hex chars) */
+data &outds;
+  set &outds (rename=(filepath=file_path));
+  length FILE_HASH $128 HASH_DURATION 8;
+  keep directory file_hash hash_duration file_path file_or_folder level;
+  ts=datetime();
+  if file_or_folder='file' then do;
+    file_hash=hashing_file("&method",cats(file_path),0);
+  end;
+  hash_duration=datetime()-ts;
+run;
+
+proc sort data=&outds ;
+  by descending level directory file_path;
+run;
+
+data _null_;
+  set &outds;
+  call symputx('maxlevel',level,'l'); /* first row is the deepest level */
+  stop;
+run;
+
+/* now hash the hashes to populate folder hashes, starting from the bottom */
+%do curlevel=&maxlevel %to 0 %by -1;
+  data work._data_ (keep=directory file_hash);
+    set &outds;
+    where level=&curlevel;
+    by descending level directory file_path;
+    length str $32767 tmp_hash $128;
+    retain str tmp_hash ;
+    /* reset vars when starting a new directory */
+    if first.directory then do;
+      str='';
+      tmp_hash='';
+      i=0;
+    end;
+    /* hash each chunk of 100 file hashes, chained with the previous result */
+    i+1;
+    str=cats(str,file_hash);
+    if mod(i,100)=0 or last.directory then do;
+      tmp_hash=hashing("&method",cats(tmp_hash,str));
+      str='';
+    end;
+    /* output the hash at directory level */
+    if last.directory then do;
+      file_hash=tmp_hash;
+      output;
+    end;
+    if last.level then stop;
+  run;
+  %let tempds=&syslast;
+  /* join the hash back into the main table */
+  proc sql undo_policy=none;
+  create table &outds as
+    select a.directory
+      ,coalesce(b.file_hash,a.file_hash) as file_hash
+      ,a.hash_duration
+      ,a.file_path
+      ,a.file_or_folder
+      ,a.level
+    from &outds a
+    left join &tempds b
+    on a.file_path=b.directory
+    order by level desc, directory, file_path;
+  drop table &tempds;
+  quit;
+%end;
+
+%mend mp_hashdirectory;
diff --git a/tests/base/mp_hashdirectory.test.sas b/tests/base/mp_hashdirectory.test.sas
new file mode 100644
index 0000000..08a0457
--- /dev/null
+++ b/tests/base/mp_hashdirectory.test.sas
@@ -0,0 +1,133 @@
+/**
+ @file
+ @brief Testing mp_hashdirectory.sas macro
+
+
+ SAS Macros
+ @li mf_mkdir.sas
+ @li mf_nobs.sas
+ @li mp_assert.sas
+ @li mp_assertscope.sas
+ @li mp_hashdirectory.sas
+
+**/
+
+/* set up a directory tree to hash, located under the WORK library path */
+%let fpath=%sysfunc(pathname(work))/testdir;
+
+%mf_mkdir(&fpath)
+%mf_mkdir(&fpath/sub1)
+%mf_mkdir(&fpath/sub2) /* no files are written to this folder below */
+%mf_mkdir(&fpath/sub1/subsub)
+
+/* note - the path in the file means the hash is different in each run */
+%macro makefile(path,name);
+  /* writes a four line text file - the content embeds the path and name */
+  /* the / pointer control starts a new output line before each string */
+  data _null_;
+    file "&path/&name" termstr=lf;
+    put "This file is located at:" / "&path"
+      / "and it is called:" / "&name";
+  run;
+%mend makefile;
+
+%macro spawner(path);
+  /* creates file1.txt to file5.txt in the given directory */
+  %local x; /* so that an X in an outer scope is never overwritten */
+  %do x=1 %to 5; %makefile(&path,file&x..txt) %end;
+%mend spawner;
+
+%spawner(&fpath)
+%spawner(&fpath/sub1)
+%spawner(&fpath/sub1/subsub)
+
+/* sub2 stays empty so it should be the only entry without a hash */
+%mp_assertscope(SNAPSHOT)
+%mp_hashdirectory(&fpath,outds=work.hashes,maxdepth=MAX)
+%mp_assertscope(COMPARE)
+
+%mp_assert(
+  iftrue=(&syscc=0),
+  desc=No errors,
+  outds=work.test_results
+)
+/* 19 rows = 1 top level folder + 3 subfolders + 15 files */
+%mp_assert(
+  iftrue=(%mf_nobs(work.hashes)=19),
+  desc=record created for each entry,
+  outds=work.test_results
+)
+
+proc sql;
+select count(*) into: misscheck
+  from work.hashes
+  where file_hash is missing;
+quit;
+
+%mp_assert(
+  iftrue=(&misscheck=1),
+  desc=Only one missing hash - the empty directory,
+  outds=work.test_results
+)
+
+data _null_;
+  set work.hashes;
+  if directory=file_path then call symputx('tophash',file_hash);
+run;
+
+%mp_assert(
+  iftrue=(%length(&tophash)=32),
+  desc=ensure valid top level hash created,
+  outds=work.test_results
+)
+
+/* now change a file and re-hash */
+data _null_;
+  file "&fpath/sub1/subsub/file1.txt" termstr=lf;
+  put "This file has changed!";
+run;
+
+%mp_hashdirectory(&fpath,outds=work.hashes2,maxdepth=MAX)
+
+data _null_;
+  set work.hashes2;
+  if directory=file_path then call symputx('tophash2',file_hash);
+run;
+
+%mp_assert(
+  iftrue=(&tophash ne &tophash2),
+  desc=ensure the changing of the hash results in a new value,
+  outds=work.test_results
+)
+
+/* now change it back and see if it matches */
+data _null_;
+  file "&fpath/sub1/subsub/file1.txt" termstr=lf;
+  put "This file is located at:";
+  put "&fpath/sub1/subsub";
+  put "and it is called:";
+  put "file1.txt";
+run;
+
+%mp_hashdirectory(&fpath,outds=work.hashes3,maxdepth=MAX)
+
+data _null_;
+  set work.hashes3;
+  if directory=file_path then call symputx('tophash3',file_hash);
+run;
+
+%mp_assert(
+  iftrue=(&tophash=&tophash3),
+  desc=ensure the same files result in the same hash,
+  outds=work.test_results
+)
+
+/* dump contents for debugging */
+data _null_;
+  set work.hashes;
+  put file_hash file_path;
+run;
+data _null_;
+  set work.hashes2;
+  put file_hash file_path;
+run;