**************************************************************************************;
* find_fix_extendedascii_characters.sas *;
* *;
* This module identifies, documents and fixes problems in Toolkit data sets *;
* related to the use of extended-ascii characters in column values. Most often, we *;
* have seen this problem when reading Microsoft files such as Excel spreadsheets *;
* that contain"curly" quotes and double quotes, etc. *;
**************************************************************************************;
** Reminders: **:
* (1) Any problems detected in a specific Controlled Terminology data set *;
* may require identical updates in the Current folder. *;
* (2) Any problem detected in the GlobalLibrary ALSO requires correction *;
* down the equivalent !sasroot....standards/ folder hierarchy. *;
* (3) It is recommended that any problems detected ALSO be corrected in *;
* any source input file (e.g. the source xls file), given the data set *;
* may subsequently be recreated incorrectly from that source file. *;
* (4) For reference, the following source was used: *;
* http://www.danshort.com/ASCIImap/ *;
* http://www.asciitable.com/ *;
%cst_setStandardProperties(_cstStandard=CST-FRAMEWORK,_cstSubType=initialize);
%macro _cstFindCharacters(_cstDSName=,_cstColumnName=,_cstOutputDS=work._cstProblems);
%let _cstDSList=;
%let _cstDSListCnt=1;
%let _cstVarList=;
%let _cstVarListCnt=0;
%* Test a specific column *;
%if %length(&_cstColumnName)>0 %then
%do;
%put ***** Starting test for: &_cstDSName..&_cstColumnName;
data work._csttemp (keep=_cstDS _cstColumn _cstRecCnt _cstValue _cstNote);
set &_cstDSName;
attrib _cstDS format=$32.
_cstColumn format=$32.
_cstRecCnt format=8.
_cstValue format=8.
_cstNote format=$80.;
retain _cstDS _cstColumn;
if _n_=1 then do;
_cstDS="&_cstDSName";
_cstColumn="&_cstColumnName";
end;
do i=0 to 31,128 to 255;
xx=indexc(&_cstColumnName,byte(i));
if find(&_cstColumnName,byte(i)) then do;
_cstValue=i;
_cstRecCnt=_n_;
_cstNote=catx(' ','Invalid character for record',put(_n_,8.),'@ column',put(xx,8.));
output;
end;
end;
run;
%if %sysfunc(exist(&_cstOutputDS)) %then
%do;
proc append base=&_cstOutputDS data=work._csttemp;
run;
%end;
%else
%do;
data &_cstOutputDS;
set work._csttemp;
run;
%end;
%end;
%* Test all (character) columns in a data set *;
%else
%do;
%let _cstDomainOnly=%scan(&_cstDSName,2,.);
%* Also allow and process specification of entire libraries using _all_ *;
%if %sysfunc(index(%str(&_cstDomainOnly),%str(_all_)))>0 %then
%do;
proc sql noprint;
select catx('.',strip(libname),strip(memname)) into :_cstDSList separated by ' '
from dictionary.tables
where libname=upcase(scan("&_cstDSName",1,'.'));
select count(*) into :_cstDSListCnt
from dictionary.tables
where libname=upcase(scan("&_cstDSName",1,'.'));
quit;
%end;
%put _cstDSList=&_cstDSList;
%do _cnt_=1 %to &_cstDSListCnt;
%if %length(&_cstDSList)>0 %then
%let _cstDSName=%scan(&_cstDSList,&_cnt_,' ');
%put >>>>>;
%put >>>>> Starting test for: &_cstDSName;
%put >>>>>;
proc sql noprint;
select cats("'",name,"'") into :_cstVarList separated by ' '
from dictionary.columns
where libname=upcase(scan("&_cstDSName",1,'.')) and
memname=upcase(scan("&_cstDSName",2,'.')) and
type='char';
select name into :_cstVarList2 separated by ' '
from dictionary.columns
where libname=upcase(scan("&_cstDSName",1,'.')) and
memname=upcase(scan("&_cstDSName",2,'.')) and
type='char';
select count(*) into :_cstVarListCnt
from dictionary.columns
where libname=upcase(scan("&_cstDSName",1,'.')) and
memname=upcase(scan("&_cstDSName",2,'.')) and
type='char';
quit;
%put _cstVarList=&_cstVarList;
%put _cstVarList2=&_cstVarList2;
data work._csttemp (keep=_cstDS _cstColumn _cstRecCnt _cstValue _cstNote);
set &_cstDSName;
attrib _cstDS format=$32.
_cstColumn format=$32.
_cstRecCnt format=8.
_cstValue format=8.
_cstNote format=$80.;
retain _cstDS _cstColumn;
if _n_=1 then do;
_cstDS="&_cstDSName";
end;
array var{&_cstVarListCnt} $32 _temporary_ (&_cstVarList);
array var2{&_cstVarListCnt} &_cstVarList2;
do i=1 to dim(var);
_cstColumn=var[i];
do j=0 to 31,128 to 255;
xx=indexc(var2[i],byte(j));
if find(var2[i],byte(j)) then do;
_cstValue=j;
_cstRecCnt=_n_;
_cstNote=catx(' ','Invalid character for record',put(_n_,8.),'@ column',put(xx,8.));
output;
end;
end;
end;
run;
%if %sysfunc(exist(&_cstOutputDS)) %then
%do;
proc append base=&_cstOutputDS data=work._csttemp;
run;
%end;
%else
%do;
data &_cstOutputDS;
set work._csttemp;
run;
%end;
%end;
%end;
%mend;
* This is how we do the actual remapping of undesired bytes to acceptable bytes *;
* The methodology in the value statement below is old=new *;
proc format library=work.formats;
value btype
9=32
10=32
19=45
20=45
24=39
25=39
28=34
29=34
145=39
146=39
147=34
148=34
150=45
160=32
other=?;
run;
%macro _cstFixCharacters(_cstInputDS=work._cstProblems,_cstDSName=);
%* Allow specification of a single data set to-be-fixed *;
%if %length(&_cstDSName)>0 %then
%do;
proc sort data=&_cstInputDS (where=(_cstDS="&_cstDSName")) out=work._FixSetup;
by _cstDS _cstRecCnt _cstColumn;
run;
%end;
%else %do;
proc sort data=&_cstInputDS out=work._FixSetup;
by _cstDS _cstRecCnt _cstColumn;
run;
%end;
filename _cstCode CATALOG "work._cstCode.fixit.source";
data _null_;
set work._FixSetup end=last;
by _cstDS _cstRecCnt _cstColumn;
attrib tempvar format=$200.;
file _cstCode;
if first._cstDS then
do;
* Note we write to a work file because we do not want to directly overwrite the existing file *;
* Instead, we should confirm the update prior to a copy to the production location. *;
tempvar=cats('data work.',scan(_cstDS,2,'.'),';');
put @1 tempvar;
put @3 'set' @7 _cstDS ';';
end;
if first._cstRecCnt then
do;
tempvar=catx(' ','if _n_=',_cstRecCnt,'then do;');
put @5 tempvar;
end;
tempvar=cats(_cstColumn,'=tranwrd(',_cstColumn,',byte(',_cstValue,'),byte(',put(_cstValue,btype.),'));');
put @7 tempvar;
if last._cstRecCnt then
put @5 'end;';
if last._cstDS then
put @1 'run;';
run;
/* Example fixes:
f1=tranwrd(f1,byte(10),' ');
* change extended ascii single/double quote to normal ascii quotes*;
if find(&_cstColumnName,byte(i)) then f1=tranwrd(f1,byte(145),byte(39));
if find(f1,byte(146)) then f1=tranwrd(f1,byte(146),byte(39));
if find(f1,byte(147)) then f1=tranwrd(f1,byte(147),byte(39));
if find(f1,byte(148)) then f1=tranwrd(f1,byte(148),byte(39));
* change extended ascii dashes/hypens to normal ascii dashes/hypens*;
if find(f1,byte(150)) then f1=tranwrd(f1,byte(150),byte(45));
if find(f1,byte(151)) then f1=tranwrd(f1,byte(151),byte(45)||byte(45));
* change extended ascii character 160 to a space , ascii 32*;
if find(f1,byte(160)) then f1=tranwrd(f1,byte(160),byte(32));
*/
%mend;
These sample files and code examples are provided by SAS Institute
Inc. "as is" without warranty of any kind, either express or implied, including
but not limited to the implied warranties of merchantability and fitness for a
particular purpose. Recipients acknowledge and agree that SAS Institute shall
not be liable for any damages whatsoever arising out of their use of this material.
In addition, SAS Institute will provide no support for the materials contained herein.