1
0
mirror of https://github.com/sasjs/core.git synced 2025-12-11 06:24:35 +00:00

feat: adding mcf_length to mp_getmaxvarlengths

BREAKING CHANGE: mp_getmaxvarlengths now returns 0 for non-special missings, and will use numeric length (as opposed to cast-to-character length) by default
This commit is contained in:
munja
2022-01-23 23:26:10 +01:00
parent f7fac50108
commit 142b46570d
7 changed files with 196 additions and 53 deletions

View File

@@ -167,7 +167,7 @@ SAS code can contain one of two types of dependency - SAS Macros, and SAS Includ
@li someprogram.sas FREFTWO
```
The CLI can then extract all the dependencies and insert as precode (SAS Macros) or in a temp engine fileref (SAS Includes) when creating SAS Jobs and Services.
The CLI can then extract all the dependencies and insert as precode (SAS Macros) or in a temp engine fileref (SAS Includes) when creating SAS Jobs and Services (and Tests).
When contributing to this library, it is therefore important to ensure that all dependencies are listed in the header in this format.
@@ -183,6 +183,7 @@ When contributing to this library, it is therefore important to ensure that all
- Mandatory parameters should be positional, all optional parameters should be keyword (var=) style.
- All dataset references must be 2 level (eg `work.blah`, not `blah`). This is to avoid contention when options [DATASTMTCHK](https://support.sas.com/documentation/cdl/en/lrdict/64316/HTML/default/viewer.htm#a000279064.htm)=ALLKEYWORDS is in effect.
- Avoid naming collisions! All macro variables should be local scope. Use system generated work tables where possible - eg `data ; set sashelp.class; run; data &output; set &syslast; run;`
- Where global macro variables are absolutely necessary, they should make use of `&sasjs_prefix` - see mp_init.sas
- The use of `quit;` for `proc sql` is optional unless you are looking to benefit from the timing statistics.
- Use [sasjs lint](https://github.com/sasjs/lint)!
@@ -192,9 +193,9 @@ When contributing to this library, it is therefore important to ensure that all
## Breaking Changes
We are currently on major release v3. The following changes are planned when the next major (breaking) release becomes necessary:
We are currently on major release v4. The following changes are planned when the next major (breaking) release becomes necessary:
* Remove `dbg` parameter from mp_jsonout.sas (implement mdebug instead)
* (None as yet)
## Star Gazing

View File

@@ -1,28 +1,46 @@
/**
@file mp_getmaxvarlengths.sas
@file
@brief Scans a dataset to find the max length of the variable values
@details
This macro will scan a base dataset and produce an output dataset with two
columns:
- NAME Name of the base dataset column
- MAXLEN Maximum length of the data contained therein.
- MAXLEN Maximum length of the data contained therein.
Character fields may be allocated very large widths (eg 32000) of which the
maximum value is likely to be much narrower. This macro was designed to
enable a HTML table to be appropriately sized however this could be used as
part of a data audit to ensure we aren't over-sizing our tables in relation to
the data therein.
Character fields are often allocated very large widths (eg 32000) of which the
maximum value is likely to be much narrower. Identifying such cases can be
helpful in the following scenarios:
@li Enabling a HTML table to be appropriately sized (`num2char=YES`)
@li Reducing the size of a dataset to save on storage (mp_ds2squeeze.sas)
@li Identifying columns containing nothing but missing values (`MAXLEN=0` in
the output table)
If the entire column is made up of (non-special) missing values then a value
of 0 is returned.
Numeric fields are converted using the relevant format to determine the width.
Usage:
%mp_getmaxvarlengths(sashelp.class,outds=work.myds)
@param libds Two part dataset (or view) reference.
@param outds= The output dataset to create
@param [in] libds Two part dataset (or view) reference.
@param [in] num2char= (NO) When set to NO, numeric fields are sized according
to the number of bytes used (or set to zero in the case of non-special
missings). When YES, the numeric field is converted to character (using the
format, if available), and that is sized instead, using `lengthn()`.
@param [out] outds= The output dataset to create, eg:
|NAME:$8.|MAXLEN:best.|
|---|---|
|`Name `|`7 `|
|`Sex `|`1 `|
|`Age `|`3 `|
|`Height `|`8 `|
|`Weight `|`3 `|
<h4> SAS Macros </h4>
@li mcf_length.sas
@li mf_getuniquename.sas
@li mf_getvarlist.sas
@li mf_getvartype.sas
@li mf_getvarformat.sas
@@ -30,20 +48,32 @@
@version 9.2
@author Allan Bowe
<h4> Related Macros </h4>
@li mp_ds2squeeze.sas
@li mp_getmaxvarlengths.test.sas
**/
%macro mp_getmaxvarlengths(
libds /* libref.dataset to analyse */
,outds=work.mp_getmaxvarlengths /* name of output dataset to create */
libds
,num2char=NO
,outds=work.mp_getmaxvarlengths
)/*/STORE SOURCE*/;
%local vars x var fmt;
%local vars prefix x var fmt;
%let vars=%mf_getvarlist(libds=&libds);
%let prefix=%substr(%mf_getuniquename(),1,25);
%let num2char=%upcase(&num2char);
%if &num2char=NO %then %do;
/* compile length function for numeric fields */
%mcf_length(wrap=YES, insert_cmplib=YES)
%end;
proc sql;
create table &outds (rename=(
%do x=1 %to %sysfunc(countw(&vars,%str( )));
________&x=%scan(&vars,&x)
&prefix.&x=%scan(&vars,&x)
%end;
))
as select
@@ -51,18 +81,21 @@ create table &outds (rename=(
%let var=%scan(&vars,&x);
%if &x>1 %then ,;
%if %mf_getvartype(&libds,&var)=C %then %do;
max(length(&var)) as ________&x
max(lengthn(&var)) as &prefix.&x
%end;
%else %do;
%else %if &num2char=YES %then %do;
%let fmt=%mf_getvarformat(&libds,&var);
%put fmt=&fmt;
%if %str(&fmt)=%str() %then %do;
max(length(cats(&var))) as ________&x
max(lengthn(cats(&var))) as &prefix.&x
%end;
%else %do;
max(length(put(&var,&fmt))) as ________&x
max(lengthn(put(&var,&fmt))) as &prefix.&x
%end;
%end;
%else %do;
max(mcf_length(&var)) as &prefix.&x
%end;
%end;
from &libds;

View File

@@ -33,37 +33,39 @@
%macro mp_init(prefix=SASJS
)/*/STORE SOURCE*/;
/*
  One-time session initialisation.
  prefix= (SASJS) text prepended to the global macro variables created here
    (<prefix>_INIT_NUM, <prefix>_INIT_DTTM, <prefix>WORK).
  Once the run-once guard has passed, the macro stores the initialisation
  datetime and the WORK path in global macro variables and then applies a
  set of strict system options.
*/
%global
&prefix._INIT_NUM /* initialisation time as numeric */
&prefix._INIT_DTTM /* initialisation time in E8601DT26.6 format */
&prefix.WORK /* avoid typing %sysfunc(pathname(work)) every time */
;
%if %eval(&&&prefix._INIT_NUM>0) %then %return; /* only run once */
%global
SASJS_PREFIX /* the ONLY hard-coded global macro variable in SASjs */
&prefix._INIT_NUM /* initialisation time as numeric */
&prefix._INIT_DTTM /* initialisation time in E8601DT26.6 format */
&prefix.WORK /* avoid typing %sysfunc(pathname(work)) every time */
;
/*
  Run-once guard. The closing parenthesis must follow the macro variable:
  %length(&sasjs_prefix>0) measures the text "<value>>0", which is never
  empty, so the macro would always exit here without initialising anything.
*/
%if %length(&sasjs_prefix)>0 %then %return; /* only run once */
%let sasjs_prefix=&prefix;
data _null_;
dttm=datetime();
call symputx("&prefix._init_num",dttm,'g');
call symputx("&prefix._init_dttm",put(dttm,E8601DT26.6),'g');
call symputx("&prefix.work",pathname('WORK'),'g');
run;
/* capture the initialisation time and WORK location as global macro vars */
data _null_;
dttm=datetime();
call symputx("&sasjs_prefix._init_num",dttm,'g');
call symputx("&sasjs_prefix._init_dttm",put(dttm,E8601DT26.6),'g');
call symputx("&sasjs_prefix.work",pathname('WORK'),'g');
run;
options
noautocorrect /* disallow misspelled procedure names */
compress=CHAR /* default is none so ensure we have something! */
datastmtchk=ALLKEYWORDS /* protection from overwriting input datasets */
dsoptions=note2err /* undocumented - convert bad NOTEs to ERRs */
%str(err)orcheck=STRICT /* catch errs in libname/filename statements */
fmterr /* ensure err when a format cannot be found */
mergenoby=%str(ERR)OR /* throw err when a merge has no BY variables */
missing=. /* changing this can cause hard to detect errs */
noquotelenmax /* avoid warnings for long strings */
noreplace /* avoid overwriting permanent datasets */
ps=max /* reduce log size slightly */
ls=max /* reduce log even more and avoid word truncation */
validmemname=COMPATIBLE /* avoid special characters etc in table names */
validvarname=V7 /* avoid special characters etc in variable names */
varinitchk=%str(ERR)OR /* avoid data mistakes from variable name typos */
varlenchk=%str(ERR)OR /* fail hard if truncation (data loss) can result */
;
options
noautocorrect /* disallow misspelled procedure names */
compress=CHAR /* default is none so ensure we have something! */
datastmtchk=ALLKEYWORDS /* protection from overwriting input datasets */
dsoptions=note2err /* undocumented - convert bad NOTEs to ERRs */
%str(err)orcheck=STRICT /* catch errs in libname/filename statements */
fmterr /* ensure err when a format cannot be found */
mergenoby=%str(ERR)OR /* throw err when a merge has no BY variables */
missing=. /* changing this can cause hard to detect errs */
noquotelenmax /* avoid warnings for long strings */
noreplace /* avoid overwriting permanent datasets */
ps=max /* reduce log size slightly */
ls=max /* reduce log even more and avoid word truncation */
validmemname=COMPATIBLE /* avoid special characters etc in table names */
validvarname=V7 /* avoid special characters etc in variable names */
varinitchk=%str(ERR)OR /* avoid data mistakes from variable name typos */
varlenchk=%str(ERR)OR /* fail hard if truncation (data loss) can result */
;
%mend mp_init;

View File

@@ -39,6 +39,9 @@
@param [out] pkg= (utils) The output package in which to create the function.
Uses a 3 part format: libref.catalog.package
<h4> SAS Macros </h4>
@li mf_existfunction.sas
<h4> Related Macros </h4>
@li mcf_length.test.sas
@@ -51,13 +54,15 @@
,pkg=UTILS
)/*/STORE SOURCE*/;
%if %mf_existfunction(mcf_length)=1 %then %return;
%if &wrap=YES %then %do;
proc fcmp outlib=&lib..&cat..&pkg;
%end;
function mcf_length(var);
if missing(var) then len=0;
else if trunc(var,3)=var then len=3;
if var=. then len=0;
else if missing(var) or trunc(var,3)=var then len=3;
else if trunc(var,4)=var then len=4;
else if trunc(var,5)=var then len=5;
else if trunc(var,6)=var then len=6;

View File

@@ -39,6 +39,9 @@
@param [out] pkg= (utils) The output package in which to create the function.
Uses a 3 part format: libref.catalog.package
<h4> SAS Macros </h4>
@li mf_existfunction.sas
**/
%macro mcf_string2file(wrap=NO
@@ -48,6 +51,8 @@
,pkg=UTILS
)/*/STORE SOURCE*/;
%if %mf_existfunction(mcf_string2file)=1 %then %return;
%if &wrap=YES %then %do;
proc fcmp outlib=&lib..&cat..&pkg;
%end;

View File

@@ -12,6 +12,7 @@
data test;
/* store each mcf_length() result in a macro variable for the assertions */
call symputx('null',mcf_length(.));
/* special missing: asserted further down to return 3 (regular missing -> 0) */
call symputx('special',mcf_length(._));
call symputx('three',mcf_length(1));
call symputx('four',mcf_length(10000000));
call symputx('five',mcf_length(12345678));
@@ -24,6 +25,10 @@ run;
iftrue=(%str(&null)=%str(0)),
desc=Check if NULL returns 0
)
%mp_assert(
iftrue=(%str(&special)=%str(3)),
desc=Check if special missing ._ returns 3
)
%mp_assert(
iftrue=(%str(&three)=%str(3)),
desc=Check for length 3
@@ -47,4 +52,16 @@ run;
%mp_assert(
iftrue=(%str(&eight)=%str(8)),
desc=Check for length 8
)
%mp_assert(
iftrue=(&syscc=0),
desc=Check syscc=0 before re-initialisation
)
/* test 2 - compile again test for warnings */
%mcf_length(wrap=YES, insert_cmplib=YES)
%mp_assert(
iftrue=(&syscc=0),
desc=Check syscc=0 after re-initialisation
)

View File

@@ -0,0 +1,80 @@
/**
@file
@brief Testing mp_getmaxvarlengths macro
<h4> SAS Macros </h4>
@li mp_getmaxvarlengths.sas
@li mp_assert.sas
@li mp_assertdsobs.sas
@li mp_assertscope.sas
**/
/* Test 1: default call against sashelp.class, including a scope check */
%mp_assertscope(SNAPSHOT)
%mp_getmaxvarlengths(sashelp.class,outds=work.myds)
%mp_assertscope(COMPARE,desc=checking scope leakage on mp_getmaxvarlengths)

%mp_assert(iftrue=(&syscc=0),desc=No errs)
%mp_assertdsobs(work.myds,desc=Has 5 records,test=EQUALS 5)

/* keep only the rows whose MAXLEN differs from the expected value */
data work.errs;
  set work.myds;
  if (name='Name' and maxlen ne 7)
    or (name='Sex' and maxlen ne 1)
    or (name='Age' and maxlen ne 3)
    or (name='Height' and maxlen ne 8)
    or (name='Weight' and maxlen ne 3);
run;

/* write any unexpected rows to the log to aid debugging */
data _null_;
  set work.errs;
  putlog (_all_)(=);
run;

%mp_assertdsobs(work.errs,desc=Err table has 0 records,test=EQUALS 0)

/* Test 2: shortened numerics plus special and regular missing values */
data work.test2;
  length a 3 b 5;
  a=1/3;
  b=1/3;
  c=1/3;
  d=._;
  e=.;
  /* two identical observations */
  do _n_=1 to 2;
    output;
  end;
run;

%mp_getmaxvarlengths(work.test2,outds=work.myds2)

%mp_assert(iftrue=(&syscc=0),desc=No errs in second test (with nulls))
%mp_assertdsobs(work.myds2,desc=Has 5 records,test=EQUALS 5)

/* keep only the rows whose MAXLEN differs from the expected value */
data work.errs2;
  set work.myds2;
  if (name='a' and maxlen ne 3)
    or (name='b' and maxlen ne 5)
    or (name='c' and maxlen ne 8)
    or (name='d' and maxlen ne 3)
    or (name='e' and maxlen ne 0);
run;

data _null_;
  set work.errs2;
  putlog (_all_)(=);
run;

%mp_assertdsobs(work.errs2,desc=Err table has 0 records,test=EQUALS 0)