| Perl regular expressions
|
Perl Regular Expressions are supported beginning with SAS®9.
This example parses records where the format of the record can change within the file. This type of file is difficult to parse with an INPUT statement alone since the number of values in a record and their order can vary.
To read this file, a regexp is built each time the record format changes. When the record format changes, there is a line in the file that begins with #Fields:. The #Fields: line contains information about the order and number of values to be read from the records that come on subsequent lines. The information on the #Fields: line is used to create a regexp to parse the lines that are about to come.
Matching parentheses are used within the regexp to setup submatches. Submatches within the record correspond to values within the record. The submatches are extracted with PRXPOSN and assigned into the correct variable for that value. The values are then written to an output data set.
This data used by this example is ocean buoy data from the National Data Buoy Center.
filename temp temp;
/* Create the data file from the buoys */
/* Not all of the buoys output the same amount of data, so a */
/* Fields: line is output to indicate that columns that are to */
/* follow. */
/* The set of field names that are possible are: */
/* MM DD HH EST WDIR WSPD GST WVHT DPD APD MWD PRES ATMP WTMP DEWP VIS PTDY TIDE */
/* A description of what these field names mean is below. */
data _null_;
file temp;
input;
len = length(_infile_);
put _infile_ $VARYING. len;
datalines;
#Buoy: 42036 - W. TAMPA 106NM West Northwest of Tampa, FL
#Fields: MM DD HH EST WDIR WSPD GST WVHT DPD APD MWD PRES ATMP WTMP DEWP PTDY
11 15 8 am N 19.4 23.3 6.2 6 4.8 NNE 30.01 67.3 73.4 60.1 +0.01
11 15 7 am N 19.4 25.3 5.9 6 4.9 NNE 30.00 67.5 73.4 60.3 +0.00
11 15 6 am NNE 19.4 23.3 5.2 6 4.6 NNE 30.01 67.6 73.4 60.3 -0.00
11 15 5 am N 15.5 19.4 6.2 6 4.8 NNE 30.00 67.8 73.4 60.1 -0.02
11 15 4 am NNE 17.5 23.3 5.9 6 4.8 NNE 30.00 67.8 73.4 59.9 -0.02
#Buoy: FPSN7 - Frying Pan Shoals, NC
#Fields: MM DD HH EST WDIR WSPD GST WVHT DPD APD PRES ATMP PTDY
11 15 8 am NE 26 28 8.5 10 6.2 30.06 71.8 +0.00
11 15 7 am ENE 25 28 8.5 9 6.1 30.07 71.8 -0.00
11 15 6 am ENE 24 27 9.2 10 6.1 30.06 71.4 -0.02
11 15 5 am NE 25 28 8.9 9 6.1 30.06 71.4 -0.03
11 15 4 am ENE 25 28 8.5 9 6.0 30.07 71.4 -0.04
#Buoy: 41002 - S HATTERAS - 250 NM East of Charleston, SC
#Fields: MM DD HH EST WDIR WSPD GST WVHT DPD APD PRES ATMP WTMP PTDY
11 15 8 am E 27.2 33.0 11.2 9 6.1 30.01 69.3 77.2 +0.01
11 15 7 am E 29.1 35.0 10.8 9 6.2 30.00 69.1 77.2 -0.01
11 15 6 am E 27.2 31.1 11.2 9 6.4 29.99 68.4 77.0 -0.04
11 15 5 am E 25.3 31.1 9.5 9 5.9 30.00 70.2 77.0 -0.06
11 15 4 am E 25.3 31.1 10.8 9 6.4 30.01 70.5 77.0 -0.03
#Buoy: FWYF1 - Fowey Rocks, FL
#Fields: MM DD HH EST WDIR WSPD GST PRES ATMP WTMP PTDY
11 15 8 am W 14 16 29.92 67.6 77.7 +0.03
11 15 7 am W 13 15 29.91 67.5 77.7 +0.02
11 15 6 am W 12 13 29.91 67.1 77.5 +0.02
11 15 5 am WNW 15 16 29.89 67.6 77.2 +0.00
11 15 4 am WNW 14 16 29.89 68.0 76.5 -0.02
;
/* Read the buoy data and create a SAS data set */
/* While reading the data, try to detect if a row has bogus values. */
/* The idea behind this one is to read a line, then determine if it */
/* is a line that contains the field definitions for the lines to */
/* come or if it is a line that contains data that needs to be parsed */
/* and validated. */
/* If a line is a field definition line, the we'll read a field name, */
/* assign the column number it is in to the corresponding i_ var, then */
/* concatenate a regexp to read the field to the existing regexp. */
/* If a line is data, we'll pass the line to prxmatch, then use prxposn */
/* to pull out the parsed fields. If a line doesn't match with prxmatch */
/* and error is output along with the line. */
data buoydata;
length mm dd hh est wdir wspd gst wvht dpd apd
mwd pres atmp wtmp dewp vis ptdy tide
$ 8;
length i_mm i_dd i_hh i_est i_wdir i_wspd i_gst i_wvht i_dpd i_apd
i_mwd i_pres i_atmp i_wtmp i_dewp i_vis i_ptdy i_tide
8;
array vars[*] _CHARACTER_;
array ivars[*] _NUMERIC_;
retain ivars;
length buoy $ 80 re 8;
retain buoy re;
keep mm dd hh est wdir wspd gst wvht dpd apd
mwd pres atmp wtmp dewp vis ptdy tide
buoy;
length field_name $ 8 regexp $ 1024;
infile temp;
input;
if _infile_ =: "#Buoy:" then
buoy = substr(_infile_,7);
else if _infile_ =: "#Fields:" then do;
/* Init the regexp and column number array */
regexp = "/^";
do i = 1 to dim(ivars);
ivars[i] = .;
end;
/* Iterate over the field names, build the regexp, and assign */
/* the column number for the fields that will be read. */
i = 2;
field_name = scan(_infile_, i, ' ');
/* &prxint: match one or two digits, then a space */
/* &prxnum: match zero or one + or -, one or more digits, */
/* zero or one decimal point, zero or more digits, */
/* then a space. */
/* &prxcompass: match one of the compass directions. */
%let prxint = "(\d{1,2})\s";
%let prxnum = "([+-]*\d+\.?\d*)\s";
%let prxcompass = "(N|NNE|NE|ENE|E|ESE|SE|SSE|S|SSW|SW|WSW|W|WNW|NW|NNW)\s";
do while (^missing(field_name));
select(field_name);
when("MM") do;
i_mm = i-1;
regexp = trim(regexp) || &prxint;
end;
when("DD") do;
i_dd = i-1;
regexp = trim(regexp) || &prxint;
end;
when("HH") do;
i_hh = i-1;
regexp = trim(regexp) || &prxint;
end;
when("EST") do;
i_est = i-1;
regexp = trim(regexp) || "(am|pm)\s";
end;
when("WDIR") do;
i_wdir = i-1;
regexp = trim(regexp) || &prxcompass;
end;
when("WSPD") do;
i_wspd = i-1;
regexp = trim(regexp) || &prxnum;
end;
when("GST") do;
i_gst = i-1;
regexp = trim(regexp) || &prxnum;
end;
when("WVHT") do;
i_wvht = i-1;
regexp = trim(regexp) || &prxnum;
end;
when("DPD") do;
i_dpd = i-1;
regexp = trim(regexp) || &prxnum;
end;
when("APD") do;
i_apd = i-1;
regexp = trim(regexp) || &prxnum;
end;
when("MWD") do;
i_mwd = i-1;
regexp = trim(regexp) || &prxcompass;
end;
when("PRES") do;
i_pres = i-1;
regexp = trim(regexp) || &prxnum;
end;
when("ATMP") do;
i_atmp = i-1;
regexp = trim(regexp) || &prxnum;
end;
when("WTMP") do;
i_wtmp = i-1;
regexp = trim(regexp) || &prxnum;
end;
when("DEWP") do;
i_dewp = i-1;
regexp = trim(regexp) || &prxnum;
end;
when("VIS") do;
i_vis = i-1;
regexp = trim(regexp) || &prxnum;
end;
when("PTDY") do;
i_ptdy = i-1;
regexp = trim(regexp) || &prxnum;
end;
when("TIDE") do;
i_tide = i-1;
regexp = trim(regexp) || &prxnum;
end;
otherwise
putlog 'ERROR: Unknown field name ' field_name;
end; /* select(field_name) */
/* Get the next field name until we're done */
i+1;
field_name = scan(_infile_, i, ' ');
end; /* while(^missing(field_name)) */
/* Remove last \s and put $/ on end */
substr(regexp,length(regexp)-1) = "$/";
/* If re isn't missing, then let's free the prior regexp */
if ^missing(re) then
call prxfree(re);
re = prxparse(regexp);
if missing(re) then
putlog 'ERROR: Invalid regexp ' regexp;
end; /* _infile_ =: "#Fields:" */
else do;
/* Does current line match the regexp we built? */
if prxmatch(re, _infile_) then do;
/* If so, pull out the fields, assign into vars, and output */
do i = 1 to dim(ivars);
if ^missing(ivars[i]) then do;
call prxposn(re, ivars[i], pos, len);
vars[i] = substr(_infile_, pos, len);
end;
end;
output;
end;
else do;
putlog 'ERROR: Invalid row in buoy input.';
putlog 'ERROR- ' _infile_;
end;
end;
run;
filename temp clear;
proc print data=buoydata; run;
/*** http://www.ndbc.noaa.gov/measdes.shtml
ATMP Air temperature (Celsius). For sensor heights
on buoys, see Hull Descriptions. For sensor
heights at C-MAN stations, see C-MAN Sensor Locations
WTMP Sea surface temperature (Celsius). For sensor
depth, see Hull Description.
DEWP Dewpoint temperature taken at the same height
as the air temperature measurement.
PRES Sea level pressure (hPa). For C-MAN sites and
Great Lakes buoys, the recorded pressure is
reduced to sea level using the method described
in NWS Technical Procedures Bulletin 291
(11/14/80).
WSPD Wind speed (m/s) averaged over an eight-minute
period for buoys and a two-minute period for land stations.
Reported Hourly. See Wind Averaging Methods.
WDIR Wind direction (the direction the wind is coming from in degrees
clockwise from N) during the same period used for WSPD.
See Wind Averaging Methods
GST Peak 5 or 8 second gust speed (m/s) measured
during the eight-minute or two-minute period.
The 5 or 8 second period can be determined by
payload, See the Sensor Reporting, Sampling, and Accuracy section.
WVHT Significant wave height (meters) is calculated as
the average of the highest one-third of all of the wave
heights during the 20-minute sampling period.
See the Wave Measurements section.
APD Average wave period (seconds) of all waves during
the 20-minute period. See the Wave Measurements
section.
DPD Dominant wave period (seconds) is the period
with the maximum wave energy. See the Wave Measurements
section.
MWD Mean wave direction (degrees) corresponding to energy of
the dominant period (DOMPD). See the Wave Measurements
section.
VIS Station visibility (statute miles)
PTDY Pressure Tendency is the direction (plus or minus) and
the amount of pressure change (hPa)for a three hour period
ending at the time of observation.
TIDE The periodic rising and falling of the earth's oceans.
Tide is measured in feet.
***/