Bug: function MathSample() does not generate correct results

 
The function MathSample() (defined in <Math\Stat\Math.mqh>) does not generate correct random samples.


This a fix for the include file <Math\Stat\Math.mqh>:

bool MathSample(const int &array[],const int count,int &result[])
  {
   int size=ArraySize(array);
//--- check array size
   if(size==0)
      return false;
//--- prepare target array and calculate values
   if(ArraySize(result)<count)
      if(ArrayResize(result,count)!=count)
         return(false);
   for(int i=0; i<count; i++)
     {
      ///////int ind=(int)(count-1)*MathRand()/32767;
      int ind=(int)(size*MathRand()/32768.0);
      result[i]=array[ind];
     }
//---
   return true;
  }


bool MathSample(const int &array[],const int count,const bool replace,int &result[])
  {
//--- unique values not needed
   ///////if(!replace)
   if(replace)
      return MathSample(array,count,result);
   int size=ArraySize(array);
   if(size==0 || count>size)
      return(false);
//--- prepare target array
   if(ArraySize(result)<count)
      if(ArrayResize(result,count)!=count)
         return(false);
//--- unique values needed, prepare indices
   int indices[];
/*
   MathSequenceByCount(0,count-1,count,indices);
   int swaps=size/2;
   int max_v=count-1;
   for(int i=0; i<swaps; i++)
     {
      //--- select random indices and swap them
      int ind1=(int)max_v*MathRand()/32767;
      int ind2=(int)max_v*MathRand()/32767;
      int t=indices[ind1];
      indices[ind1]=indices[ind2];
      indices[ind2]=t;
     }
*/
   MathSequenceByCount(0,size-1,size,indices);
//--- Fisher-Yates algorithm
//--- https://bost.ocks.org/mike/shuffle/compare.html
   for(int i=size-1; i > 0; --i)
     {
      //--- pick a random index j = [0, i]
      int j = (int)((i+1)*MathRand()/32768.0);
      int t=indices[i];
      indices[i]=indices[j];
      indices[j]=t;
     }
//--- select data according to indices
   for(int i=0; i<count; i++)
      result[i]=array[indices[i]];
//---
   return true;
  }

According to the online documentation:

The replace=true argument allows performing random sampling of the elements with replacement back to the original sequence.


This is a script file to test the original and fixed include files.

//+------------------------------------------------------------------+
//|                                              MathSample_test.mq5 |
//|                        Copyright 2018, MetaQuotes Software Corp. |
//|                                             https://www.mql5.com |
//+------------------------------------------------------------------+
#property copyright "Copyright 2018, MetaQuotes Software Corp."
#property link      "https://www.mql5.com"
#property version   "1.00"
#property script_show_inputs

//#include <Math\Stat\Math.mqh>
#include "Math_fix.mqh"

input int  population=9;
input int  samplesize=4;     // sample size
input bool replacement=true; // replacement (repetition is allowed)
input int  num_draws=10;     // number of draws
//+------------------------------------------------------------------+
//|                                                                  |
//+------------------------------------------------------------------+
void OnStart()
  {
   PrintFormat("MathSample(population=%i,samplesize=%i,replacement=%s)",population,samplesize,replacement?"true":"false");

//--- prepare the population array
   int array[];
   MathSequenceByCount(1,population,population,array);

//--- initialize the generator of random numbers
   MathSrand(GetTickCount()^(uint)ChartID());

//--- draw 10 simple random samples
   int result[];
   for(int i=0; i<num_draws; i++)
      if(MathSample(array,samplesize,replacement,result))
         ArrayPrint(result);
  }
//+------------------------------------------------------------------+

//sample output using the original header <Math\Stat\Math.mqh>
/*
MathSample_demo (CHFJPY,M1)   MathSample(population=9,samplesize=4,replacement=true)
MathSample_demo (CHFJPY,M1)   3 2 1 4
MathSample_demo (CHFJPY,M1)   3 2 1 4
MathSample_demo (CHFJPY,M1)   1 2 3 4
MathSample_demo (CHFJPY,M1)   2 3 1 4
MathSample_demo (CHFJPY,M1)   3 1 2 4
MathSample_demo (CHFJPY,M1)   2 1 3 4
MathSample_demo (CHFJPY,M1)   1 3 2 4
MathSample_demo (CHFJPY,M1)   3 2 1 4
MathSample_demo (CHFJPY,M1)   1 2 3 4
MathSample_demo (CHFJPY,M1)   3 2 1 4
MathSample_demo (CHFJPY,M1)   MathSample(population=9,samplesize=4,replacement=false)
MathSample_demo (CHFJPY,M1)   2 3 1 2
MathSample_demo (CHFJPY,M1)   1 2 3 2
MathSample_demo (CHFJPY,M1)   1 2 3 3
MathSample_demo (CHFJPY,M1)   3 2 1 3
MathSample_demo (CHFJPY,M1)   1 3 1 1
MathSample_demo (CHFJPY,M1)   2 3 1 1
MathSample_demo (CHFJPY,M1)   2 3 3 3
MathSample_demo (CHFJPY,M1)   2 3 1 3
MathSample_demo (CHFJPY,M1)   3 3 1 1
MathSample_demo (CHFJPY,M1)   3 2 1 3
*/

//sample output using the fixed header "Math_fix.mqh""
/*
MathSample_demo (CHFJPY,M1)   MathSample(population=9,samplesize=4,replacement=true)
MathSample_demo (CHFJPY,M1)   6 5 1 7
MathSample_demo (CHFJPY,M1)   4 2 1 6
MathSample_demo (CHFJPY,M1)   3 3 7 1
MathSample_demo (CHFJPY,M1)   9 5 4 2
MathSample_demo (CHFJPY,M1)   2 7 3 5
MathSample_demo (CHFJPY,M1)   5 3 5 4
MathSample_demo (CHFJPY,M1)   3 7 3 8
MathSample_demo (CHFJPY,M1)   9 9 4 9
MathSample_demo (CHFJPY,M1)   7 1 2 5
MathSample_demo (CHFJPY,M1)   5 7 6 5
MathSample_demo (CHFJPY,M1)   MathSample(population=9,samplesize=4,replacement=false)
MathSample_demo (CHFJPY,M1)   6 4 8 1
MathSample_demo (CHFJPY,M1)   5 1 6 9
MathSample_demo (CHFJPY,M1)   4 9 2 6
MathSample_demo (CHFJPY,M1)   1 8 4 6
MathSample_demo (CHFJPY,M1)   4 8 9 7
MathSample_demo (CHFJPY,M1)   3 8 4 7
MathSample_demo (CHFJPY,M1)   2 3 4 7
MathSample_demo (CHFJPY,M1)   1 7 3 4
MathSample_demo (CHFJPY,M1)   9 4 7 8
MathSample_demo (CHFJPY,M1)   1 9 2 5
*/
Files:
Math.mqh  423 kb
Math_fix.mqh  425 kb
 

Thank you for your message

You are right, calculations of source index are incorrect

 

1) MathSample w/o replace


IMHO correct and faster way is
int ind=int((ulong)size*MathRand()/32768);
multiplication overload is not possible and division will be replaced with shifting at runtime
 

2) MathSample with replace

Yes, Fisher-Yates algorithm gives much better shuffle

But using it for indices... why not!


we can little bit change it because we need only first count elements from indices array
 

Thank you for getting attention to this bug.

I need to clarify the difference between sampling schemes.

Sampling With Replacement

You replace back the first item you draw to the list before you draw a second. So, if you had a list of names to choose two people from (sample of 2), you could choose the same person twice. If the list had unique items, then our sample would contain duplicates.

//sample output using the fixed header "Math_fix.mqh""
/*
MathSample_demo (CHFJPY,M1)   MathSample(population=9,samplesize=4,replacement=true)
MathSample_demo (CHFJPY,M1)   6 5 1 7
MathSample_demo (CHFJPY,M1)   4 2 1 6
MathSample_demo (CHFJPY,M1)   3 3 7 1
MathSample_demo (CHFJPY,M1)   9 5 4 2
MathSample_demo (CHFJPY,M1)   2 7 3 5
MathSample_demo (CHFJPY,M1)   5 3 5 4
MathSample_demo (CHFJPY,M1)   3 7 3 8
MathSample_demo (CHFJPY,M1)   9 9 4 9
MathSample_demo (CHFJPY,M1)   7 1 2 5
MathSample_demo (CHFJPY,M1)   5 7 6 5
*/

Sampling Without Replacement

You don’t replace back the first item you draw before you draw a second. So, if you had a list of names to choose two people from, you couldn’t choose the same person twice. If the list has unique items then our sample is unique also (no duplicates).

Reference:

https://stats.libretexts.org/Courses/Diablo_Valley_College/Math_142%3A_Elementary_Statistics/Math_142%3A_Course_Material/05%3A_Chapter_5/Ch_3.4_Sampling_with_without_replacement

Ch 3.4 Sampling With/Without Replacement
Ch 3.4 Sampling With/Without Replacement
  • stats.libretexts.org
Sampling with replacement – selected subjects are put back into the population before another subject are sampled. Subject can possibly be selected more than once. Sampling without replacement – Selected subjects will not be in the “pool” for selection.  All selected subjects are unique. This is the default assumption for statistical sampling...
 
Ilyas #:

2) MathSample with replace

Yes, Fisher-Yates algorithm gives much better shuffle

But using it for indices... why not!


we can little bit change it because we need only first count elements from indices array

For sampling without replacement, Fisher-Yates algorithm shuffles the whole array (population) via indices because the array is const, then we only draw the first count elements on the top (sample) without replacement back into array.

Edit:
To ensure truely random sampling without selection bias (which affects calculation of probabilities), Fisher-Yates is the best. 
 
amrali #:

For sampling without replacement, Fisher-Yates algorithm shuffles the whole array (population) via indices because the array is const, then we only draw the first count elements on the top (sample) without replacement back into array.

Edit:
To ensure truely random sampling without selection bias (which affects calculation of probabilities), Fisher-Yates is the best. 

Thank You

 
amrali #:

For sampling without replacement, Fisher-Yates algorithm shuffles the whole array (population) via indices because the array is const, then we only draw the first count elements on the top (sample) without replacement back into array.

Edit:
To ensure truely random sampling without selection bias (which affects calculation of probabilities), Fisher-Yates is the best. 

That is what I mean "little change"

bool MathSample(const int &array[],const int count,const bool replace,int &result[])
  {
//--- unique values not needed
   if(!replace)
      return MathSample(array,count,result);
   int size=ArraySize(array);
   if(size==0 || count>size)
      return(false);
//--- prepare target array
   if(ArraySize(result)<count)
      if(ArrayResize(result,count)!=count)
         return(false);
//--- unique values needed, prepare indices
   int indices[];

   if(!MathSequenceByCount(0,size-1,size,indices))
      return(false);

   for(int i=0; i<count; i++)
     {
      //--- select random indices and swap them
      int j=i+int((ulong)(size-i)*MathRand()/32768);

      if(j!=i)
        {
         int t=indices[i];
         indices[i]=indices[j];
         indices[j]=t;
        }
     }
//--- select data according to indices
   for(int i=0; i<count; i++)
      result[i]=array[indices[i]];
//---
   return true;
  }

I suspect this shuffling is good enough and will be work faster because it does not shuffle whole indices array, only first count elements which will be used in the next loop

 
amrali #:

Thank you for getting attention to this bug.

I need to clarify the difference between sampling schemes.

Sampling With Replacement

You replace back the first item you draw to the list before you draw a second. So, if you had a list of names to choose two people from (sample of 2), you could choose the same person twice. If the list had unique items, then our sample would contain duplicates.

Sampling Without Replacement

You don’t replace back the first item you draw before you draw a second. So, if you had a list of names to choose two people from, you couldn’t choose the same person twice. If the list has unique items then our sample is unique also (no duplicates).

Reference:

https://stats.libretexts.org/Courses/Diablo_Valley_College/Math_142%3A_Elementary_Statistics/Math_142%3A_Course_Material/05%3A_Chapter_5/Ch_3.4_Sampling_with_without_replacement

we will think what to do, there might be some codes use existing opposite meaning of replace

might be it better to move replace parameter at the end of parameters lists (it will be helpful to revise existing codes, because compiling errors will be occurred) and rename it to replacement


have you read header of MathSample function ?

//+------------------------------------------------------------------+
//| MathSample                                                       |
//+------------------------------------------------------------------+
//| The function takes a sample of the specified size from the       |
//| elements of the array[] with replacement.                        |
//|                                                                  |
//| Arguments:                                                       |
//| array[]     : Array with double values                           |
//| count       : Number of elements to choose                       |
//| replace     : If true, only unique values are placed to result[] |   !!!
//| result[]    : Output array                                       |
//|                                                                  |
//| Return value: true if successful, otherwise false.               |
//+------------------------------------------------------------------+
 

We made decision to change behavior

New code

//+------------------------------------------------------------------+
//| MathSample                                                       |
//+------------------------------------------------------------------+
//| The function takes a sample of the specified size from the       |
//| elements of the array[] with replacement.                        |
//|                                                                  |
//| Arguments:                                                       |
//| array[]     : Array with double values                           |
//| count       : Number of elements to choose                       |
//| replace     : If true, Sampling with Replacement will be used    |
//| result[]    : Output array                                       |
//|                                                                  |
//| Return value: true if successful, otherwise false.               |
//+------------------------------------------------------------------+
bool MathSample(const double &array[],const int count,const bool replace,double &result[])
  {
//--- Sampling with Replacement
   if(replace)
      return MathSample(array,count,result);

   ...
 

Yes, I like this idea. It is in fact more efficient to use a partial-shuffle Fisher algorithm here for samples without replacement.

Visual verification:

https://bost.ocks.org/mike/shuffle/compare.html

In Choose an algorithm: custom

Enter JavaScript code:

// sampling without replacement
function partial_shuffle(array, count = 20) {
    var size = array.length,
        i, j;
    for (i = 0; i < count; i++) {
        j = i + Math.floor(Math.random() * (size - i));  // i <= j <= size-1
        //if (j != i) {
            t = array[j];
            array[j] = array[i];
            array[i] = t;
        //}
    }
}

Click refresh button few times to check randomness in each selected sample (count = 20), on the left upper part of graph.

Reason: