Accurately selecting a random percentage sample

This is something which has bugged me for ages, there doesn’t appear to be a way to generate a completely accurate % sample.
For example if I have 10,000 rows and I want a 10% sample, I want to see 1,000 rows in my sample, not 1,043 or 958
There may well already be an example of doing this somewhere but it hasn’t turned up in any of my searches

Results From Testing

These are the results from testing my code vs the generally accepted way of doing things, I think the results speak for themselves but I am happy to be corrected.

Results from a fixed 2m row dataset:
fixed_dataset_results

Results from a randomised number of rows dataset:

large_dataset_results

Results from a smaller randomised number of rows dataset:

small_dataset_results

A quick sanity check to make sure the same rows aren’t being selected each time:

times_chosen

The Standard Practice:

The generally accepted rule (as it seems to appear again and again) is to use the following code which can be run straight into a table if you want to sample individual rows, or you can creating a mapping table or left join if you want to sample a dimension (I’ve included a simple example of this)

The code used is something like this:

if((rand()<=0.10+now()*0) = 0, 'Standard Group', 'Trial Group') where 0.10 is a 10% sample. Now this appears to be accurate +/- 1% (it becomes less accurate as the population size decreases) which is fine in most cases, but it should be possible to select an exact percentage. How you use this code: For sampling all rows:

//random row sampler  
  
  Transactions:  
  load rowno() as ID,//this is our unique row identifier  
  if((rand()<=0.10+now()*0) = 0, 'Standard Group', 'Trial Group') as trial_flag,  
  '1' as TransCounter,  
  TransLineID,  
  TransID,  
  Pick(Ceil(5*Rand1),'A','B','C','D','E') as Department,  
  money(round(RAND()*25000,0.01)) AS Value;  
  Load  
  Rand() as Rand1,  
  floor(rand()*100) as TransLineID,  
  floor(rand()*100000) as TransID  
  Autogenerate 10000*Rand()  
  While Rand()<=0.5 or IterNo()=1;  
 

For sampling a dimension:

//random dimension sampler  
  
  Transactions:  
  load rowno() as ID,//this is our unique row identifier  
  '1' as TransCounter,  
  TransLineID,  
  TransID,  
  Pick(Ceil(5*Rand1),'A','B','C','D','E') as Department,  
  money(round(RAND()*25000,0.01)) AS Value;  
  Load  
  Rand() as Rand1,  
  floor(rand()*100) as TransLineID,  
  floor(rand()*100000) as TransID  
  Autogenerate 10000*Rand()  
  While Rand()<=0.5 or IterNo()=1;  
    
  NoConcatenate  
    
  tmp:  
  load distinct TransID  
  Resident Transactions;  
    
  NoConcatenate  
    
  tmp2:  
  load TransID, if((rand()<=0.10+now()*0) = 0, 'Standard Group', 'Trial Group') as trial_flag  
  resident tmp;  
    
  drop table tmp;  
    
    
  left join (Transactions)  
  load TransID, trial_flag  
  resident tmp2;  
  
  
  drop table tmp2;  

My Solution:

My solution is slightly more long winded and does involve some resident loads and joins, I am sure someone might be able to make it more efficient! However on my basic sample dataset of 2m rows it still processed in less than 10 seconds.
However I have also added variables in for the setup to make life easier.

//set up our variables  
//you will need to alter these here to suit your requirements  
  
  
  let v_sample_size = 0.05; // 5%  
  let v_sample_type = 'column';  //this can be column or table, leaving it blank will turn the sampling off  
    
  let v_sample_table = 'Transactions'; //set this to the table your column is in for column sampling, or the table you want to sample if you're table sampling  
  let v_sample_column = 'TransID'; //set this to your ID column if using a table or 'column' to sample a column  
  
  
  let v_sort_order = if(rand() >=0.5,'desc','asc'); //this just adds another element of randomisation to the mix  
    
  let v_sample_column_name ='Trial Flag'; //this is the column name for flagging if the item is a sample or not  
  let v_sample_column_true='Trial Group'; //flag if it is in the sample  
  let v_sample_column_false='Standard Group'; //flag if it is not in the sample  
  
  
//lets load some dummy data note you MUST have a unique row identifier for table based samples  
//**********************************************************************************************************  
// INSERT YOUR CODE HERE  
//**********************************************************************************************************  
  
  
//just create a QVD if it doesn't exist to give us the same data to re-test on  
  
  
  Transactions:  
  load rowno() as ID,//this is our unique row identifier  
  '1' as TransCounter,  
  TransLineID,  
  TransID,  
  Pick(Ceil(5*Rand1),'A','B','C','D','E') as Department,  
  money(round(RAND()*25000,0.01)) AS Value;  
  Load  
  Rand() as Rand1,  
  floor(rand()*100) as TransLineID,  
  floor(rand()*100000) as TransID  
  Autogenerate 10000*Rand()  
  While Rand()<=0.5 or IterNo()=1;;  
  
  
//**********************************************************************************************************  
//**********************************************************************************************************  
  
  
//this is the sampling work here  
  
  //column sampling  
    
if v_sample_type='column' THEN  
  
  
  //load the distinct column value  
  testtable:  
  load distinct $(v_sample_column)  
  Resident $(v_sample_table);  
    
  //now apply our random number to the test table  
  left join (testtable)  
  LOAD $(v_sample_column),num(rand(),'0.00000000')as xx_randomnumber  
  resident testtable;  
  
    
  //set up the sample size   
  let v_load_number = round(NoOfRows('testtable')*$(v_sample_size));  
  
  NoConcatenate  
  
  //load the final sample table  
  sampleoutputtable:  
  first $(v_load_number) load $(v_sample_column), '$(v_sample_column_true)' as xx_flag  
  Resident testtable  
  order by xx_randomnumber $(v_sort_order);  
  
  drop table testtable;  
  
  //now join back to the main table  
  left join ($(v_sample_table))  
  load  $(v_sample_column), xx_flag  
  resident sampleoutputtable;  
  
  //join table to itself to get rid of the null values  
  join ($(v_sample_table))  
  load $(v_sample_column), if(len(trim(xx_flag))>0,xx_flag,'$(v_sample_column_false)') as  [$(v_sample_column_name)]  
  resident $(v_sample_table);  
  
  
  drop table sampleoutputtable;  
  drop field xx_flag;  
  
  
end if  
  
  
  
  // table sampling  
  
if v_sample_type='table' THEN  
  
  // this time we can simply load our table with the ID  
  testtable:  
  LOAD $(v_sample_column),num(rand(),'0.00000000')as xx_randomnumber  
  Resident $(v_sample_table);  
  
  
  //set up the sample size   
  let v_load_number = round(NoOfRows('testtable')*$(v_sample_size));  
  
  NoConcatenate  
  
  //load the final sample table  
  sampleoutputtable:  
  first $(v_load_number) load $(v_sample_column), '$(v_sample_column_true)' as xx_flag  
  Resident testtable  
  order by xx_randomnumber $(v_sort_order);  
  
  drop table testtable;  
  
  //now join back to the main table  
  left join ($(v_sample_table))  
  load  $(v_sample_column), xx_flag  
  resident sampleoutputtable;  
  
  //join table to itself to get rid of the null values  
  join ($(v_sample_table))  
  load $(v_sample_column), if(len(trim(xx_flag))>0,xx_flag,'$(v_sample_column_false)') as  [$(v_sample_column_name)]  
  resident $(v_sample_table);  
  
  
  drop table sampleoutputtable;  
  drop field xx_flag;  
  
  
  
  
end if  

qvw files are here:

basic_random_sample

proper_random_sample

Leave a Reply