This is something which has bugged me for ages, there doesn’t appear to be a way to generate a completely accurate % sample.

For example if I have 10,000 rows and I want a 10% sample, I want to see 1,000 rows in my sample, not 1,043 or 958

There may well already be an example of doing this somewhere but it hasn’t turned up in any of my searches

Results From Testing

These are the results from testing my code vs the generally accepted way of doing things, I think the results speak for themselves but I am happy to be corrected.

Results from a fixed 2m row dataset:

Results from a randomised number of rows dataset:

Results from a smaller randomised number of rows dataset:

A quick sanity check to make sure the same rows aren’t being selected each time:

**The Standard Practice:**

The generally accepted rule (as it seems to appear again and again) is to use the following code which can be run straight into a table if you want to sample individual rows, or you can creating a mapping table or left join if you want to sample a dimension (I’ve included a simple example of this)

The code used is something like this:

if((rand()<=0.10+now()*0) = 0, 'Standard Group', 'Trial Group') where 0.10 is a 10% sample. Now this appears to be accurate +/- 1% (it becomes less accurate as the population size decreases) which is fine in most cases, but it should be possible to select an exact percentage. How you use this code: For sampling all rows:

```
//random row sampler
Transactions:
load rowno() as ID,//this is our unique row identifier
if((rand()<=0.10+now()*0) = 0, 'Standard Group', 'Trial Group') as trial_flag,
'1' as TransCounter,
TransLineID,
TransID,
Pick(Ceil(5*Rand1),'A','B','C','D','E') as Department,
money(round(RAND()*25000,0.01)) AS Value;
Load
Rand() as Rand1,
floor(rand()*100) as TransLineID,
floor(rand()*100000) as TransID
Autogenerate 10000*Rand()
While Rand()<=0.5 or IterNo()=1;
```

For sampling a dimension:

```
//random dimension sampler
Transactions:
load rowno() as ID,//this is our unique row identifier
'1' as TransCounter,
TransLineID,
TransID,
Pick(Ceil(5*Rand1),'A','B','C','D','E') as Department,
money(round(RAND()*25000,0.01)) AS Value;
Load
Rand() as Rand1,
floor(rand()*100) as TransLineID,
floor(rand()*100000) as TransID
Autogenerate 10000*Rand()
While Rand()<=0.5 or IterNo()=1;
NoConcatenate
tmp:
load distinct TransID
Resident Transactions;
NoConcatenate
tmp2:
load TransID, if((rand()<=0.10+now()*0) = 0, 'Standard Group', 'Trial Group') as trial_flag
resident tmp;
drop table tmp;
left join (Transactions)
load TransID, trial_flag
resident tmp2;
drop table tmp2;
```

My Solution:

My solution is slightly more long winded and does involve some resident loads and joins, I am sure someone might be able to make it more efficient! However on my basic sample dataset of 2m rows it still processed in less than 10 seconds.

However I have also added variables in for the setup to make life easier.

```
//set up our variables
//you will need to alter these here to suit your requirements
let v_sample_size = 0.05; // 5%
let v_sample_type = 'column'; //this can be column or table, leaving it blank will turn the sampling off
let v_sample_table = 'Transactions'; //set this to the table your column is in for column sampling, or the table you want to sample if you're table sampling
let v_sample_column = 'TransID'; //set this to your ID column if using a table or 'column' to sample a column
let v_sort_order = if(rand() >=0.5,'desc','asc'); //this just adds another element of randomisation to the mix
let v_sample_column_name ='Trial Flag'; //this is the column name for flagging if the item is a sample or not
let v_sample_column_true='Trial Group'; //flag if it is in the sample
let v_sample_column_false='Standard Group'; //flag if it is not in the sample
//lets load some dummy data note you MUST have a unique row identifier for table based samples
//**********************************************************************************************************
// INSERT YOUR CODE HERE
//**********************************************************************************************************
//just create a QVD if it doesn't exist to give us the same data to re-test on
Transactions:
load rowno() as ID,//this is our unique row identifier
'1' as TransCounter,
TransLineID,
TransID,
Pick(Ceil(5*Rand1),'A','B','C','D','E') as Department,
money(round(RAND()*25000,0.01)) AS Value;
Load
Rand() as Rand1,
floor(rand()*100) as TransLineID,
floor(rand()*100000) as TransID
Autogenerate 10000*Rand()
While Rand()<=0.5 or IterNo()=1;;
//**********************************************************************************************************
//**********************************************************************************************************
//this is the sampling work here
//column sampling
if v_sample_type='column' THEN
//load the distinct column value
testtable:
load distinct $(v_sample_column)
Resident $(v_sample_table);
//now apply our random number to the test table
left join (testtable)
LOAD $(v_sample_column),num(rand(),'0.00000000')as xx_randomnumber
resident testtable;
//set up the sample size
let v_load_number = round(NoOfRows('testtable')*$(v_sample_size));
NoConcatenate
//load the final sample table
sampleoutputtable:
first $(v_load_number) load $(v_sample_column), '$(v_sample_column_true)' as xx_flag
Resident testtable
order by xx_randomnumber $(v_sort_order);
drop table testtable;
//now join back to the main table
left join ($(v_sample_table))
load $(v_sample_column), xx_flag
resident sampleoutputtable;
//join table to itself to get rid of the null values
join ($(v_sample_table))
load $(v_sample_column), if(len(trim(xx_flag))>0,xx_flag,'$(v_sample_column_false)') as [$(v_sample_column_name)]
resident $(v_sample_table);
drop table sampleoutputtable;
drop field xx_flag;
end if
// table sampling
if v_sample_type='table' THEN
// this time we can simply load our table with the ID
testtable:
LOAD $(v_sample_column),num(rand(),'0.00000000')as xx_randomnumber
Resident $(v_sample_table);
//set up the sample size
let v_load_number = round(NoOfRows('testtable')*$(v_sample_size));
NoConcatenate
//load the final sample table
sampleoutputtable:
first $(v_load_number) load $(v_sample_column), '$(v_sample_column_true)' as xx_flag
Resident testtable
order by xx_randomnumber $(v_sort_order);
drop table testtable;
//now join back to the main table
left join ($(v_sample_table))
load $(v_sample_column), xx_flag
resident sampleoutputtable;
//join table to itself to get rid of the null values
join ($(v_sample_table))
load $(v_sample_column), if(len(trim(xx_flag))>0,xx_flag,'$(v_sample_column_false)') as [$(v_sample_column_name)]
resident $(v_sample_table);
drop table sampleoutputtable;
drop field xx_flag;
end if
```

qvw files are here: