/*-------------------------------------------------------------------* * Name: BOXGLM.SAS * * Title: Power transformations by Box-Cox method with * * graphic display of maximum likelihood solution, t-values* * for model effects, and influence ofobservations * * on choice of power. * * Doc: http://www.math.yorku.ca/SCS/sasmac/boxglm.html * *-------------------------------------------------------------------* * Author: Michael Friendly * * Created: 23 Oct 1991 10:20:14 * * Revised: 26 Feb 1998 16:20:10 * * Version: 1.2 * * -fixed bug not allowing boxinfl to be called internally * * -fixed bug when model contains interactions * *-------------------------------------------------------------------*/ /*------------------------------------------------------------------*/ /* PURPOSE: Computes the maximum likelihood power transformation for*/ /* a GLM with one or more predictors. */ /* INPUT: */ /* data= the name of the data set holding the response and */ /* predictor variables. (Default: most recently created) */ /* resp= name of the response variable for analysis. */ /* model= the independent variables in the regression, i.e., */ /* the terms on the right side of the = sign in the MODEL */ /* statement for PROC GLM. */ /* OUTPUT: */ /* out= the name of the data set to hold the transformed data. */ /* Contains all original variables, with the transformed */ /* response replacing the original variable. */ /* outplot=the name of the data set containing _RMSE_, and t- */ /* values for each effect in the model, with one observa- */ /* tion for each power value tried. */ /* Source: */ /* This program incorporates portions of program code from the */ /* macro ADXTRANS in the ADX system for experimental design */ /* distributed as part of the SAS/QC product. */ /* Those programs bear the following copyright notice: */ /* Copyright (c) 1987 by SAS Institute Inc. Cary NC 27511 USA */ /*------------------------------------------------------------------*/ %macro boxglm( resp=, /* name of response variable */ model=, /* independent variables in regression */ class=, id=, /* ID variable for observations */ data=_last_, /* input dataset */ out=_data_, /* output dataset with transformed resp */ outplot=_plot_, /* output dataset for plot of powers */ pplot=RMSE EFFECT INFL, /* printer plots: RMSE, EFFECT, INFL */ gplot=RMSE EFFECT INFL, /* graphic plots: RMSE, EFFECT, INFL */ lopower=-2, /* low value for power */ hipower=2, /* high value for power */ npower=21, /* number of power values in interval */ conf=.95); /* confidence coefficient of CI on power*/ %if (%length(&model) = 0) %then %do; %put ERROR: BOXGLM: MODEL= must be specified; %goto exit; %end; %if (%length(&resp) = 0) %then %do; %put ERROR: BOXGLM: RESP= must be specified; %goto exit; %end; %let nmod = %numwords(&model,%str( )); options nonotes; /* / Get the number of non-missing observations; quick exit if none. /-------------------------------------------------------------------*/ data _nomiss_; set &data; if &resp ^= .; proc contents noprint data=_nomiss_ out=_NOBS_; data _null_; set _NOBS_; call symput('NOBS',left(put(nobs,best20.))); run; %if (&nobs <= 0) %then %goto done; /* / Check for valid data (positive). /-------------------------------------------------------------------*/ %let FLAG = 0; data _null_; set _nomiss_; if (&resp <= 0) then do; put "ERROR: Non-positive value of dependent variable &resp" "(obs=" _n_ ") " &resp ; call symput('FLAG','1'); end; run; %if (&FLAG) %then %do; %put BOXGLM: Cannot compute Box-Cox transformation; %goto done; %end; %put BOXGLM: Computing transforms of &resp ...; /* / Transpose the data set; transform all values for each power into / variables NEW1, NEW2, ...; and then transpose back again. Values / are centered and scaled so as to be approximately in the same scale / as originally and so that the transformed value of the geometric / mean is the geometric mean. /-------------------------------------------------------------------*/ data _tmp_; set _nomiss_; keep &resp; proc transpose data=_tmp_ out=_tmp_; var &resp; data _tmp_; set _tmp_; array col{&NOBS}; array new{&NOBS}; keep _name_ new1-new&NOBS; gmean = 0; do i = 1 to &NOBS; gmean = gmean + log(col{i}); end; gmean = exp(gmean / &NOBS); inc = (&hipower-&lopower)/(&npower-1); do i = 0 to &npower-1; lambda = &lopower + (i * inc); if (abs(lambda) < (inc/2)) then do z1 = log(gmean); do j = 1 to &NOBS; new{j} = gmean + (log(col{j})-z1)*gmean; end; end; else do; z1 = gmean**lambda; z2 = lambda*(gmean**(lambda-1)); do j = 1 to &NOBS; new{j} = gmean + ((col{j}**lambda)-z1)/z2; end; end; _name_ = "_tf" || left(put(i+1,best20.)); output; end; proc transpose data=_tmp_ out=_tmp_; var new1-new&NOBS; data _tmp_; merge _nomiss_ _tmp_; drop _NAME_; run; %put BOXGLM: Regressing on transforms for &resp ...; /* / Perform the regression on all transformed variates at once. /-------------------------------------------------------------------*/ options notes; proc glm data=_tmp_ outstat=_reg_ noprint; %if %length(&class)>0 %then %do; class &class; %end; model _tf1-_tf&npower = &model / ss3; run; data _reg_ ; set _reg_; by _source_ notsorted; drop i; if first._source_ then i=0; i+1; _lambda_= &lopower+((i-1)*((&hipower-&lopower)/(&npower-1))); proc sort; by _lambda_; * proc print data=_reg_; * var _name_ _source_ _type_ ss df f prob; %put BOXGLM: Extracting GLM summary information ...; /* / Extract error sums of squares, F and PROB values, calculate _RMSE_ / If no degrees of freedom left for error, exit. /-------------------------------------------------------------------*/ %let FLAG = 0; data &outplot; set _reg_; by _lambda_; keep _lambda_ _sse_ _rmse_ _edf_ _like_ f1-f&nmod p1-p&nmod; retain _lambda_ _sse_ _rmse_ _edf_ _like_ f1-f&nmod p1-p&nmod; retain i j 0; array gf{&nmod} f1-f&nmod; array gp{&nmod} p1-p&nmod; label _lambda_ = 'Box-Cox Power (lambda)' _like_ = 'Log Likelihood'; if _source_='ERROR' then do; _sse_ = ss; if df>0 then do; _rmse_= sqrt(ss/df); _like_ = -&NOBS*log(_rmse_); end; else do; call symput('FLAG','1'); end; _edf_ = df; j =0; end; else do; j+1; gf{j} = f; gp{j} = prob;; end; if last._lambda_ then output; proc print data=&outplot; run; %let nmodp1 = %eval(&nmod+1); %if (&FLAG) %then %do; %put BOXGLM: No degrees of freedom left to estimate error.; %put %str( )Transformation cannot be estimated.; %goto done; %end; /* / Compute the optimal transform, the one with minimum RMSE, and an / approximate confidence interval. The approximate .95 confidence / interval is based on the fact that / 2{ L(lambda-hat) - L(lambda) } <= cinv(.95,1) = 3.84 /-------------------------------------------------------------------*/ %put BOXGLM: Computing optimal transformation and confidence interval; proc transpose data=&outplot out=_reg_; var _rmse_ _like_; data _null_; set _reg_; array col{&npower}; array rmse{&npower}; retain imin rmse1-rmse&npower; if (trim(_name_) = '_RMSE_') then do; rmse{1} = col{1}; minrmse = rmse{1}; imin = 1; do i = 2 to &npower; rmse{i} = col{i}; if (rmse{i} < minrmse) then do; minrmse = rmse{i}; imin = i; end; end; call symput('_imin_',left(put(imin,best20.))); lambda = &lopower+((imin-1)*(&hipower-&lopower)/(&npower-1)); call symput('elambda',left(put(lambda,best20.))); end; else if (trim(_name_) = '_LIKE_') then do; call symput('maxlike',left(put(col{imin},best20.))); end; run; %put _imin_= &_imin_ elambda = &elambda maxlike= &maxlike; data &outplot; set &outplot end=eof; drop _lolam_ _hilam_ _hirmse_; retain _lolam_ 10 _hilam_ -10 _hirmse_ 0; label conf = "&conf Confidence Interval"; if (_like_ < &maxlike - (cinv(&conf,1)/2) ) /* was 1.92 */ then conf = " "; else do; conf = "*"; _lolam_ = min(_lolam_,_lambda_); _hilam_ = max(_hilam_,_lambda_); _hirmse_= max(_hirmse_,_rmse_); end; if eof then do; call symput('lolam',left(put(_lolam_,best20.))); call symput('hilam',left(put(_hilam_,best20.))); call symput('hirms',left(put(_hirmse_,best20.))); end; run; options notes; proc print data=&outplot label; var _lambda_ _like_ _rmse_ conf; run; %put BOXGLM: Estimated Power Transformation, Lambda = &elambda.; /* / Plot likelihood and t-values for effects as functions of the / power. /-------------------------------------------------------------------*/ data &out; set _tmp_; drop _tf1-_tf&npower; &resp = _tf&_imin_; label &resp = "Transformed &resp (lambda=&elambda)"; %let pplot = %upcase(&pplot); %if &pplot ^= NONE %then %do; %let sym = 1 2 3 4 5 6 7 8 9; %let sym = &sym A B C D E F G H I J K L M N O P Q R S T U V W X Y Z; %let sym = &sym a b c d e f g h i j k l m n o p q r s t u v w x y z; proc plot data=&outplot; %if %index(&pplot,RMSE)>0 %then %do; title 'RMSE for Box-Cox Power Transform'; plot _rmse_ * _lambda_ = 'L'; label _rmse_ = "Root Mean Squared Error for &resp"; run; %end; %if %index(&pplot,EFFECT)>0 %then %do; plot %do i = 1 %to &nmod; f&i * _lambda_ = "%scan(&sym,&i)" %end; / overlay; title 'F-values for Model Effects'; run; %end; %end; %let gplot = %upcase(&gplot); %if &gplot ^= NONE %then %do; %let sym = dot star square circle triangle $ + hash x; %if %index(&gplot,EFFECT)>0 %then %do; data _labels_; set &outplot end=eof; length function $8 text $16; if eof then do; xsys='2'; ysys='2'; size=1.1; function='LABEL'; position='6'; x = _lambda_; %do i = 1 %to &nmod; y = f&i ; text = " %scan(&model,&i) "; output; %end; end; %end; proc gplot data=&outplot; %do i = 1 %to &nmod; symbol&i i=spline v=%scan(&sym,&i) c=black ; %end; axis1 label=(h=1.3 a=90 f=duplex) value=(h=1.2); axis2 label=(h=1.3 f=duplex 'Box-Cox Power (' f=cgreek 'l' f=duplex ')' ) value=(h=1.2) offset=(3); axis3 label=(h=1.5 a=90 f=duplex 'F-value') value=(h=1.3); axis4 label=(h=1.3 f=duplex 'Box-Cox Power (' f=cgreek 'l' f=duplex ')' ) value=(h=1.2) offset=(2,9); %if %index(&gplot,RMSE)>0 %then %do; plot _rmse_ * _lambda_ = 1 / href=&lolam &hilam lhref=20 chref=red vref=&hirms lvref=33 cvref=red vaxis=axis1 haxis=axis2 hminor=1 des="BoxCox plot of RMSE * Lambda for &resp"; title h=1.5 "Box-Cox Power Transform for &resp"; label _rmse_ = "Root Mean Squared Error"; run; %end; %if %index(&gplot,EFFECT)>0 %then %do; plot %do i = 1 %to &nmod; f&i * _lambda_ = &i %end; / overlay anno=_labels_ href=&lolam &hilam lhref=20 chref=red /* vref=&tval -&tval lvref=33 cvref=red */ vaxis=axis3 haxis=axis4 hminor=1 des="BoxCox Effect plot for &resp"; title h=1.5 "F-values for Model Effects on &resp"; run; %end; %end; /* if &gplot ^= NONE */ %if &pplot ^= NONE or &gplot ^= NONE %then %do; title ; %end; %done: options nonotes; proc datasets nolist; delete _NOBS_ _tmp_ _nomiss_ _reg_; %if %index(&gplot,EFFECT)>0 %then %do; delete _labels_; %end; quit; %if %index(&gplot,INFL)>0 or %index(&pplot,INFL)>0 %then %do; %boxinfl(data=&data, resp=&resp, model=&model, id=&id, class=&class, gplot=&gplot, pplot=&pplot); %end; options notes; %exit: %mend; /*-------------------------------------------------------------------*/ /* Copyright (c) 1987 by SAS Institute Inc. Cary NC 27511 USA */ /* */ /* NAME: NUM(ber of )WORDS */ /* PURPOSE: Returns the number of words in a given list, with an */ /* optional specification of word delimiters. */ /*-------------------------------------------------------------------*/ %macro numwords(lst,wordchar); %let i = 1; %if (%length(&wordchar)) %then %do; %let v = %scan(&lst,&i,&wordchar); %do %while (%length(&v) > 0); %let i = %eval(&i + 1); %let v = %scan(&lst,&i,&wordchar); %end; %end; %else %do; %let v = %scan(&lst,&i); %do %while (%length(&v) > 0); %let i = %eval(&i + 1); %let v = %scan(&lst,&i); %end; %end; %eval(&i - 1) %mend;