/* Start a SAS LASR Analytic Server instance on port 10011, using every   */
/* node of the grid.                                                       */

/* GRIDHOST names the grid host that PROC LASR connects to. */
options set=GRIDHOST="grid001.example.com";

/* A TKGrid_REP installation location enables reading data in parallel    */
/* with SAS/ACCESS engines from distributed databases and Hadoop clusters. */
options set=GRIDINSTALLLOC="/opt/TKGrid_REP";

proc lasr create port=10011;
   performance nodes=all;
run;
1 | Using a TKGrid_REP installation location enables reading data in parallel with SAS/ACCESS engines from distributed databases and Hadoop clusters. |
/* Assign a libref that points at the in-memory server listening on       */
/* grid001.example.com:10011 (SASIOLA engine).                             */
libname lasrlib sasiola
   host="grid001.example.com"
   port=10011;

/* Copy the Webscore table into server memory through the libref.         */
/* NOTE(review): somelib is not assigned in this snippet - it is assumed  */
/* to be defined earlier in the session.                                   */
data lasrlib.webscore;
   set somelib.webscore;
run;
/* Describe a delimited file that already exists in HDFS and load it into */
/* the in-memory server.                                                   */

/* Location of the Hadoop JAR files used by the Hadoop engine. */
options set=HADOOP_JAR_FILES_PATH="/opt/hadoopjars";

/* HDFS_METADIR= enables working with XML-based table definitions called  */
/* SASHDMD descriptors; HDFS_DATADIR= points at the directory that holds  */
/* the data files.                                                         */
libname hdplib hadoop
   server="grid001.example.com"
   config="/home/sasdemo/config.xml"
   hdfs_metadir="/user/sasdemo/meta"
   hdfs_datadir="/user/sasdemo/data";

/* Create the SASHDMD descriptor for the existing pipe-delimited file.    */
proc hdmd name=hdplib.webdata
          file_format=delimited
          encoding=utf8
          sep="|"
          data_file="web-data.txt";
   column id    int;
   column links varchar(256);
   column var1  double;
   column var2  double;
   column var3  double;
   column var4  double;
   column var5  double;
   column var6  double;
   column var7  double;
   column var8  double;
run;

/* Load the table into the server. The port must be the one the server    */
/* was created on (10011) - the same port the lasrlib libref uses -       */
/* otherwise the table is not visible to the later lasrlib.webdata        */
/* references.                                                             */
/* NOTE(review): later steps read this table as lasrlib.webdata; confirm  */
/* that the server tag assigned at load time matches the tag used by the  */
/* lasrlib libref.                                                         */
proc lasr add data=hdplib.webdata port=10011;
   performance host="grid001.example.com";
run;
1 | The LIBNAME statement with the Hadoop engine uses the HDFS_METADIR= option. This option enables working with XML-based table definitions called SASHDMD descriptors. |
2 | The HDMD procedure is used to create the SASHDMD descriptors from existing data in HDFS, which in this case is a delimited file. |
/* Seed for the random sampling key, so that the split is repeatable. */
%let seed = 12345;

/* PROC IMSTAT is interactive: it stays open across the RUN statements    */
/* below and is expected to be ended by a later QUIT statement.           */
proc imstat;
   /* Make the in-memory Webdata table the active table, then report on   */
   /* the table, its columns, and the distribution of goalVar.            */
   table lasrlib.webdata;
   tableinfo;
   columninfo;
   frequency goalVar;
run;

   /* Add a permanent column, sampkey, holding a uniform random number.   */
   compute sampkey "sampkey = ranuni(&seed.)";
run;

   /* Reopen the table so that the newly created sampkey column is        */
   /* accessible.                                                          */
   table lasrlib.webdata;

   /* Mark for deletion the rows where the goal was not met and           */
   /* sampkey >= 0.3 (about 70% of those rows). The 0.3 threshold matches */
   /* the training/validation split used in the modelling step. PURGE is  */
   /* not specified, so the rows are only disregarded by later analyses,  */
   /* not physically removed.                                              */
   deleterows;
   where sampkey ge 0.3 and goalVar = 0;
run;
1 | The TABLE statement specifies the Webdata table that was loaded to memory as the active table. The following three statements, TABLEINFO, COLUMNINFO, and FREQUENCY provide information about the table and a variable that is named Goalvar. |
2 | The COMPUTE statement creates a column that is named Sampkey. The column is permanent and is added to the table. |
3 | The TABLE statement is used again to reopen the table so that SAS can access the newly created column, Sampkey. The DELETEROWS statement is subject to the WHERE clause, so it marks for deletion only the rows that satisfy both conditions: approximately 70% of the rows where the goal was not met. Because the PURGE option is not used, the rows are not physically removed from the table. Instead, they are disregarded in subsequent analyses that use the table. |
/* ---- 1. Train: grow a random forest on the training partition -------- */
/* (these statements run inside an already open PROC IMSTAT session)      */
table lasrlib.webdata;
where sampkey ge 0.3;                       /* training partition */
randomwoods goalVar /
   input    = ( browser var1-var8 )
   nominal  = ( browser )
   nbins=100 maxlevel=10 maxbranches=2      /* settings for each tree */
   greedy gain leafsize=50
   ntree=100 seed=1314 m=5                  /* settings for the forest */
   treeinfo bootstrap=0.3
   temptable
;
run;

/* ---- 2. Keep the model: promote the temporary result to table RF ----- */
table lasrlib.&_templast_;
promote RF;
run;

/* ---- 3. Validate: apply the stored forest to the hold-out partition -- */
table lasrlib.webdata;
where sampkey lt 0.3;                       /* validation partition */
randomwoods /
   lasrtree = lasrlib.RF
   nominal  = ( browser )
   temptable
   assess
   vars     = ( userid goalVar )
;
run;

/* ---- 4. Assess: evaluate predictions for event level '1' ------------- */
table lasrlib.&_templast_;
where strip(_RF_Level_) eq '1';
assess _RF_P_ /
   y     = goalVar
   event = '1'
   nbins = 10
   step  = 0.001
;
run;

/* ---- 5. Score: apply the stored forest to the Webscore table --------- */
table lasrlib.webscore;
/* NOTE(review): goalVar is set to the constant 2 here - presumably a     */
/* placeholder because the scoring table has no observed target; confirm. */
compute goalVar "goalVar = 2";
randomwoods /
   lasrtree = lasrlib.RF
   nominal  = ( browser )
   temptable
   assess
   vars     = ( userid )
;
run;

/* ---- 6. Keep the scores: promote the temporary result, end IMSTAT ---- */
table lasrlib.&_templast_;
promote scoreresult;
quit;