// name of this module - handed to the IfTrueThrowTypeA() calls below as the source of an exception
static const char* szModule    = "data.cpp";
// NOTE(review): not referenced in the visible part of this file - presumably the default data file extension
static const char* szExtension = "dat";

//------------------------------------------------------------------------------
//    module data.cpp                                                         //
//                                                                            //
//    Class TData encapsulates data matrix with 'nTup' rows and 'nVar'        //
//    columns. NOTE: The first column is regarded as output! Use              //
//    MoveToFirst() to meet this condition!                                   //
//    See below or http://www.newty.de/pnc2/sdocu.html for more information.  //
//                                                                            //
//    copyright (c) 1998-2003 by Lars Haendel                                 //
//    home: www.newty.de                                                      //
//                                                                            //
//    This program is free software and can be used under the terms of the    //
//    GNU licence. See header file for further information and disclaimer.    //
//                                                                            //
//------------------------------------------------------------------------------
//                                                                            //
//    CREATE: Create empty object and call Load() or let a bunch of           //
//    different learn and test data objects be generated for you from a       //
//    basis data object by the function GenerateLearnAndTestData().           //
//                                                                            //
//    USE: Call CalculateWeights() to calculate feature weights using         //
//    mutual information criterion. Access maxima, minima, weights etc.       //
//    by the corresponding functions, sort and randomize the order of         //
//    the data tuples and (most important for the use with TPnc etc.)         //
//    call SetOutputColumn() to move the specified column to the first        //
//    and right-shift all preceding columns. Use                              //
//    OriginalColumnInFile() or ActualColumnInData() to get the               //
//    original or actual Id of a variable.                                    //
//                                                                            //
//    NOTE: The output column is moved to first, the term 'with output'       //
//    means, that the index, used to access the columns, is counted with      //
//    output.                                                                 //
//                                                                            //
//    File I/O: Load and save routines                                        //
//------------------------------------------------------------------------------



//----------------------------------------------------------------------------------------------------------------------
#include <stdlib>                // due to   randomize()
#include <stdio>                 //          sprintf() ...
#include <iomanip>               //          setw()
#include <math>                  //          sqrt() ...

#include "defines.h"             //
#include "fileUtil.h"            //          readExpNoEndl() ...
#include "stdlist.h"             //          list template
#include "exception.h"           //          IfTrueThrowTypeA()
#include "ddata.h"               //          TDataData (will also include data.h)


#define MAX_FOR_DEF_SYMBOLIC  (int) 9        // integer columns with fewer distinct values than this default to symbolic
#define MAX_INTEGERS          (int) 1024     // occurring values are only counted if the integer range is below this


// #define DB_WRITE_SPLITPOINTS // debug mode: write splitpoints to file (default: OFF)


//----------------------------------------------------------------------------------------------------------------------
// Hardcoded status texts, one per state handled in TData::SetState() plus a fallback for unknown states.
// note: constant strings are used to allow thread safe calls to StatusText()
static const char* szStateReady      = "Ready";
static const char* szStateScanFormat = "Scanning file format";
static const char* szStateLoading    = "Loading";
static const char* szStateCalcMeans  = "Calculating column wise means";
static const char* szStateCalcDev    = "Calculating column wise deviations";
static const char* szStateSaving     = "Saving";
static const char* szStateDetectSymb = "Detecting symbolic columns";
static const char* szStateWeights    = "Calculating feature weights";
static const char* szStateUnknown    = "Unknown";



//----------------------------------------------------------------------------------------------------------------------
// Return the original column number (referring to the loaded data file) of the variable with Id 'varId'.
// 'outcol' is the index of the column that has been moved to first; Id 0 therefore maps to 'outcol+1'.
const int TData::OriginalColumnInFile(const int& varId, const int& outcol)
{
   // the first variable is the one taken from column 'outcol'
   if(varId==0)
      return outcol+1;

   // Ids above 'outcol' are returned incremented by one, all others unchanged
   const int shift = (varId>outcol) ? 1 : 0;
   return varId+shift;
}


//----------------------------------------------------------------------------------------------------------------------
// Return the actual variable Id of the 'col'-th column ('col' refers to the originally loaded data file).
// 'outcol' is the index of the column that has been moved to first and thus has Id 0.
const int TData::ActualColumnInData(const int& col, const int& outcol)
{
   // the output column itself is always the first variable
   if(col==outcol)
      return 0;

   // columns below 'outcol' are returned incremented by one, all others unchanged
   const int shift = (col<outcol) ? 1 : 0;
   return col+shift;
}



//----------------------------------------------------------------------------------------------------------------------
// variable type conversion routines

// convert variable type (enum) to its one-character code: 'c' continuous, 'n' symbolic (nominal), '-' anything else
char TData::VarTypeToChar(const int type)
{
   if(type==cont)
      return 'c';          // continuous

   if(type==symb)
      return 'n';          // symbolic (nominal)

   return '-';             // any other value
}

// convert one-character code to variable type: 'c' -> cont, 'n' -> symb, every other character -> none
TData::TVarType TData::CharToVarType(const char c)
{
   if(c=='c')
      return cont;         // continuous

   if(c=='n')
      return symb;         // symbolic (nominal)

   return none;            // unknown code
}



//----------------------------------------------------------------------------------------------------------------------
// constructor - creates an empty, unloaded instance (no data, no statistics, all flags cleared)
TData::TData()
{
   // counters and dimensions
   ref          = 1;                         // reference counter
   _nTup        = 0;                         // # tuples (rows)
   _nVar        = 0;                         // # variables (columns)
   outcol       = 0;
   w_nIntervals = -1;

   // no filenames yet
   szLoadFileName[0] = '\0';
   szSaveFileName[0] = '\0';

   // no data matrix yet
   _data = NULL;
   data  = NULL;

   // no per-column information yet
   _nSymbolsFound  = NULL;
   _nIntegerMaxMin = NULL;
   w_Type          = NULL;
   varType         = NULL;

   // no statistics yet
   dev      = NULL;
   invRange = NULL;
   range    = NULL;
   max      = NULL;
   min      = NULL;
   mean     = NULL;
   weight   = NULL;

   // re-set flags
   f_Randomized = false;
   f_Sorted     = false;
   f_Loaded     = false;
   f_Locked     = false;
   f_IsClone    = false;
   f_HasClone   = false;

   SetState(ready);                          // initialize state
}


//----------------------------------------------------------------------------------------------------------------------
// constructor for clones I - used for splitting data into two parts
// The clone takes '__nTup' rows of '_parent' (limited to the parent's # rows): the first rows if 'f_FromTop' is set,
// the last rows otherwise. Only the row pointers are copied; the per-column arrays _nIntegerMaxMin, _nSymbolsFound and
// varType are taken over from the parent as pointers. 'szName' (if given) replaces the parent's filename.
TData::TData(const TData* _parent, const int& __nTup, const bool& f_FromTop, const char* szName)
{
   // link to parent and set up reference counting
   parent = _parent;                                     // store pointer to parent
   ref = 1;                                              // own reference counter
   parent->ref++;                                        // parent gets one more reference

   // flags: a clone counts as loaded
   f_IsClone    = true;
   f_Loaded     = true;
   f_HasClone   = false;
   f_Locked     = false;
   f_Sorted     = false;
   f_Randomized = false;

   // statistics and weights are not available yet
   weight   = NULL;
   mean     = NULL;
   min      = NULL;
   max      = NULL;
   range    = NULL;
   invRange = NULL;
   dev      = NULL;
   w_Type   = NULL;
   w_nIntervals = -1;
   SetState(ready);                                      // initialize state

   // take over column information from parent
   _nVar           = parent->_nVar;
   unknown         = parent->unknown;
   _nIntegerMaxMin = parent->_nIntegerMaxMin;
   _nSymbolsFound  = parent->_nSymbolsFound;
   varType         = parent->varType;
   outcol          = parent->outcol;

   // filename: specified one or the parent's
   if(szName)
      strcpy(szLoadFileName, szName);
   else
      strcpy(szLoadFileName, parent->szLoadFileName);

   // # tuples: not more than the parent has
   _nTup = __nTup;
   if(_nTup>parent->_nTup)
      _nTup = parent->_nTup;

   // first parent row to take: 0 for the top part, else so that the last '_nTup' rows are taken
   const int first = f_FromTop ? 0 : parent->_nTup-_nTup;

   // copy row pointers - 'data' is the working order, '_data' preserves the original order
   data  = new float*[_nTup];
   _data = new float*[_nTup];
   for(int k=0;k<_nTup;k++)
   {
      float* const row = parent->data[first+k];
      data[k]  = row;
      _data[k] = row;
   }

   // note: The call to CalculateStatistics() has been removed as there may be data objects with only one data tuple!
   //       Function must be called separately by user now.
}


//----------------------------------------------------------------------------------------------------------------------
// constructor for clones II - used for splitting data for n-fold cross-validation
// If 'f_Inside' is set the clone takes the parent's rows 'begin'..'end' (inclusive), otherwise all rows except these.
// Only the row pointers are copied; the per-column arrays _nIntegerMaxMin, _nSymbolsFound and varType are taken over
// from the parent as pointers. 'szName' (if given) replaces the parent's filename.
TData::TData(const TData* _parent, const int& begin, const int& end, const bool& f_Inside, const char* szName)
{
   // link to parent and set up reference counting
   parent = _parent;                                     // store pointer to parent
   ref = 1;                                              // own reference counter
   parent->ref++;                                        // parent gets one more reference

   // flags: a clone counts as loaded
   f_IsClone    = true;
   f_Loaded     = true;
   f_HasClone   = false;
   f_Locked     = false;
   f_Sorted     = false;
   f_Randomized = false;

   // statistics and weights are not available yet
   weight   = NULL;
   mean     = NULL;
   min      = NULL;
   max      = NULL;
   range    = NULL;
   invRange = NULL;
   dev      = NULL;
   w_Type   = NULL;
   w_nIntervals = -1;
   SetState(ready);                                      // initialize state

   // take over column information from parent
   _nVar           = parent->_nVar;
   unknown         = parent->unknown;
   _nIntegerMaxMin = parent->_nIntegerMaxMin;
   _nSymbolsFound  = parent->_nSymbolsFound;
   varType         = parent->varType;
   outcol          = parent->outcol;

   // filename: specified one or the parent's
   if(szName)
      strcpy(szLoadFileName, szName);
   else
      strcpy(szLoadFileName, parent->szLoadFileName);

   // # tuples: size of the range or of its complement, but not more than the parent has
   const int nInRange = end-begin+1;
   _nTup = f_Inside ? nInRange : parent->nTup()-nInRange;
   if(_nTup>parent->_nTup)
      _nTup = parent->_nTup;

   data  = new float*[_nTup];                            // row pointers (working order)
   _data = new float*[_nTup];                            // row pointers (original order)

   // copy row pointers
   if(f_Inside)
   {
      for(int k=0;k<_nTup;k++)
         data[k] = parent->data[begin+k];
   }
   else
   {
      // rows in front of the range keep their position ...
      for(int k=0;k<begin;k++)
         data[k] = parent->data[k];

      // ... rows behind the range are appended starting at position 'begin'
      int dst = begin;
      for(int src=end+1;src<parent->nTup();src++, dst++)
         data[dst] = parent->data[src];
   }

   // store row pointers in original order
   for(int k=0;k<_nTup;k++)
      _data[k] = data[k];

   // note: statistics are not calculated here (see clone constructor I)
}


//----------------------------------------------------------------------------------------------------------------------
// use to 'delete' instance: gives up one reference and destroys the instance when no reference is left
void TData::Release() const
{
   --ref;                  // one reference less

   if(ref==0)              // nobody left ...
      delete this;         // ... so delete instance
}


//----------------------------------------------------------------------------------------------------------------------
// private(!) destructor - instances are destroyed via Release()
// A clone only gives up its reference to the parent; a non-clone releases the rows and the per-column arrays
// _nIntegerMaxMin, _nSymbolsFound and varType. Row pointer arrays, statistics and w_Type are released in both cases.
TData::~TData()
{
   if(!f_IsClone)
   {
      // release per-column information
      delete[] _nIntegerMaxMin;
      delete[] _nSymbolsFound;
      delete[] varType;

      // release tuples/rows of the data matrix
      if(data!=NULL)
      {
         for(int row=0;row<_nTup;row++)
            delete[] data[row];
      }
   }
   else
      parent->Release();                     // give up reference to parent

   // row pointer arrays
   delete[] data;
   delete[] _data;

   // copy of variable types used to calculate feature weights
   delete[] w_Type;

   // statistics
   delete[] weight;
   delete[] mean;
   delete[] min;
   delete[] max;
   delete[] range;
   delete[] invRange;
   delete[] dev;
}


//----------------------------------------------------------------------------------------------------------------------
// Make compatible with given data object, i.e. set the same output column and copy the symbolic variable types.
// The data may have the same # variables as 'ref' or exactly one less; in the latter case it is regarded as data
// without output and variable j of 'ref' corresponds to variable j-1 of this object.
// Throws (type U) if the # variables does not match or if a variable that is symbolic in 'ref' is not integer here.
// Returns true if the data is with output.
// note: parameter 'ref' hides the reference counter member of the same name inside this function
bool TData::MakeCompatible(const TData*const& ref)
{
   const bool f_NoOutput = (_nVar==ref->nVar()-1);

   // a) check: allow same # variables or one variable less (data without output)
   IfTrueThrowTypeU(_nVar!=ref->nVar() && !f_NoOutput, "Number of variables does not match");


   // b) set output column in data file if data is with output
   if(!f_NoOutput)
      SetOutputColumn(ref->OutputColumn());


   // c) set variable types  -  note: own index is shifted by 1 if flag f_NoOutput is true.
   //    The loop runs over the variables of 'ref' so that the last input variable is also processed
   //    when this object has no output column.
   const int offset = f_NoOutput ? 1 : 0;
   for(int j=offset;j<ref->nVar();j++)                            // for each (input) variable of 'ref'
   {
      if(!ref->IsSymbolic(j))                                     // only symbolic variables need treatment
         continue;

      if(nIntegerMaxMin(j-offset)!=0)                             // check and if it's ok ...
         SetVarType(j-offset, TData::symb);                       // ... then set
      else
         ThrowTypeU("Variable types don't match!");               // ... else throw exception
   }


   // d) return true if data is with output
   return !f_NoOutput;
}

// Make compatible with given data description object, i.e. set the same output column and copy the symbolic
// variable types. The data may have the same # variables as 'ref' or exactly one less; in the latter case it is
// regarded as data without output and variable j of 'ref' corresponds to variable j-1 of this object.
// Throws (type U) if the # variables does not match or if a variable that is symbolic in 'ref' is not integer here.
// Returns true if the data is with output.
// note: parameter 'ref' hides the reference counter member of the same name inside this function
bool TData::MakeCompatible(const TDataData*const& ref)
{
   const bool f_NoOutput = (_nVar==ref->nVar()-1);

   // a) check: allow same # variables or one variable less (data without output)
   IfTrueThrowTypeU(_nVar!=ref->nVar() && !f_NoOutput, "Number of variables does not match");


   // b) set output column in data file if data is with output
   if(!f_NoOutput)
      SetOutputColumn(ref->OutputColumn());


   // c) set variable types  -  note: own index is shifted by 1 if flag f_NoOutput is true.
   //    The loop runs over the variables of 'ref' so that the last input variable is also processed
   //    when this object has no output column.
   const int offset = f_NoOutput ? 1 : 0;
   for(int j=offset;j<ref->nVar();j++)                            // for each (input) variable of 'ref'
   {
      if(!ref->IsSymbolic(j))                                     // only symbolic variables need treatment
         continue;

      if(nIntegerMaxMin(j-offset)!=0)                             // check and if it's ok ...
         SetVarType(j-offset, TData::symb);                       // ... then set
      else
         ThrowTypeU("Variable types don't match!");               // ... else throw exception
   }


   // d) return true if data is with output
   return !f_NoOutput;
}


//----------------------------------------------------------------------------------------------------------------------
// set type of variable throwing an exception if nominal or ordinal types are tried to set for a non-symbolic variable
void TData::SetVarType(const int& j, const TVarType& type)
{
   char szText[STS];
   sprintf(szText, "Variable %d(%d) has non-integer values! Cannot set symbolic (nominal) type for it!", j+1
            , OriginalColumnInFile(j));
   IfTrueThrowTypeA( type==symb && !IsInteger(j), szText, "TData::SetVarType", szModule);

   varType[j]=type;                                                        // ok, set type
}


//----------------------------------------------------------------------------------------------------------------------
// set output column. This column is moved to first and all preecedding columns are shifted to the right. If another
// column has alread been set as output column this is un-done. Statistics, variable types and # symbols are moved too.
void TData::SetOutputColumn(const int& colId)
{
   // a) check
   IfTrueThrowTypeA(f_Locked, "Function called on locked instance! Call TData::Unlock() before!", "TData::SetOutputColumn", szModule);
   IfTrueThrowTypeA(!f_Loaded, "Function called on empty instance! Call TData::Load() before!", "TData::SetOutputColumn", szModule);
   IfTrueThrowTypeA(colId<0 || colId>=_nVar, "Index exceeds matrix dimensions!", "TData::SetOutputColumn", szModule);
   IfTrueThrowTypeA(f_IsClone, "Function called on cloned instance!", "TData::SetOutputColumn", szModule);
   IfTrueThrowTypeA(f_HasClone, "Function called for instance which was used to make clones!", "TData::SetOutputColumn", szModule);


    // b) return if column is already set as output column, i.e. if it's already moved to first
   if(colId==outcol)
      return;

   // reset flags
   f_Sorted = false;


   // c) if another column has already been moved to first: rewind this, i.e. move it back
   if(outcol!=0)
   {
      //c1)  data matrix: move first column back to the 'outcol'-th column
      for(int i=0;i<_nTup;i++)
      {
         float y0=data[i][0];                                  // save first column

         for(int j=0;j<outcol;j++)                             // left shift all following columns up to target column
            data[i][j]=data[i][j+1];

         data[i][outcol]=y0;                                   // restore
      }


      // c2) variable types
      const TVarType varType0 = varType[0];                    // save first column
      for(int j=0;j<outcol;j++)                                // left shift all following columns up to target column
         varType[j]=varType[j+1];
      varType[outcol] = varType0;                              // restore


      // c3) statistics
      const float min0      = min[0];                          // save first column
      const float max0      = max[0];
      const float mean0     = mean[0];
      const float dev0      = dev[0];
      const float range0    = range[0];
      const float invRange0 = invRange[0];
      for(int j=0;j<outcol;j++)                                // left shift all following columns up to target column
      {
         min[j]      = min[j+1];
         max[j]      = max[j+1];
         mean[j]     = mean[j+1];
         dev[j]      = dev[j+1];
         range[j]    = range[j+1];
         invRange[j] = invRange[j+1];
      }
      min[outcol]       = min0;                                // restore
      max[outcol]       = max0;
      mean[outcol]      = mean0;
      dev[outcol]       = dev0;
      range[outcol]     = range0;
      invRange[outcol]  = invRange0;

      // c4) # symbols
      int nSymbols0 = _nIntegerMaxMin[0];                      // save first column
      for(int j=0;j<outcol;j++)                                // left shift all following columns up to target column
         _nIntegerMaxMin[j]=_nIntegerMaxMin[j+1];
      _nIntegerMaxMin[outcol] = nSymbols0;                     // restore


      nSymbols0 = _nSymbolsFound[0];                           // save first column
      for(int j=0;j<outcol;j++)                                // left shift all following columns up to target column
         _nSymbolsFound[j]=_nSymbolsFound[j+1];
      _nSymbolsFound[outcol] = nSymbols0;                      // restore
   }
   outcol = colId;   // store new output column



   //d)  data matrix: move 'colId'-th column to first
   for(int i=0;i<_nTup;i++)
   {
      float y=data[i][colId];                                  // save 'colId'-th column

      for(int j=colId;j>0;j--)                                 // right shift all preceeding columns
         data[i][j]=data[i][j-1];

      data[i][0]=y;                                            // restore
   }


   // e) variable types
   const TVarType typeOfY = varType[colId];                    // save
   for(int j=colId;j>0;j--)                                    // right shift all preceeding columns
      varType[j]=varType[j-1];
   varType[0] = typeOfY;                                       // restore


   // e) statistics
   const float min0      = min[colId];                         // save orignal values of 'colId'-th column
   const float max0      = max[colId];
   const float mean0     = mean[colId];
   const float dev0      = dev[colId];
   const float range0    = range[colId];
   const float invRange0 = invRange[colId];
   for(int j=colId;j>0;j--)                                    // right shift all preceeding columns
   {
      min[j]       = min[j-1];
      max[j]       = max[j-1];
      mean[j]      = mean[j-1];
      dev[j]       = dev[j-1];
      range[j]     = range[j-1];
      invRange[j]  = invRange[j-1];
   }
   min[0]                     = min0;                          // restore
   max[0]                     = max0;
   mean[0]                    = mean0;
   dev[0]                     = dev0;
   range[0]                   = range0;
   invRange[0]                = invRange0;


   // f) # symbols
   int nSymbols0 = _nIntegerMaxMin[colId];                     // save
   for(int j=colId;j>0;j--)                                    // right shift all preceeding columns
      _nIntegerMaxMin[j]=_nIntegerMaxMin[j-1];
   _nIntegerMaxMin[0] = nSymbols0;                             // restore


   nSymbols0 = _nSymbolsFound[colId];                          // save
   for(int j=colId;j>0;j--)                                    // right shift all preceeding columns
      _nSymbolsFound[j]=_nSymbolsFound[j-1];
   _nSymbolsFound[0] = nSymbols0;                              // restore
}


//----------------------------------------------------------------------------------------------------------------------
// Detect for each column if it is symbolic, i.e. contains only integer values.
// Allocates and fills _nIntegerMaxMin (integer range max-min+1, zero for non-integer columns), _nSymbolsFound
// (# of values which really occur, only counted for ranges below MAX_INTEGERS) and varType (default type).
// Throws (type A) if called on an empty instance. Uses min[] and max[], i.e. these must have been calculated before.
void TData::DetectSymbolicColumns()
{
   // a) check
   IfTrueThrowTypeA(!f_Loaded, "Function called on empty instance! Call TData::Load() before!"
                     , "TData::DetectSymbolicColumns", szModule);


   // b) allocate memory
   _nIntegerMaxMin = new int[_nVar];         // integer range per variable (zero if variable is continuous)
   varType         = new TVarType[_nVar];    // type of each variable
   _nSymbolsFound  = new int[_nVar];         // # occurring symbols per variable


   // c) process the columns one by one
   for(int col=0;col<_nVar;col++)
   {
      // c1) column is integer unless any tuple contains a non-integer value
      bool f_AllInteger = true;
      for(int row=0;row<_nTup && f_AllInteger;row++)
         if(floor(data[row][col])!=data[row][col])
            f_AllInteger = false;

      // c2) integer range calculated as maximal minus minimal value plus one
      if(f_AllInteger)
         _nIntegerMaxMin[col] = max[col]-min[col]+1;
      else
         _nIntegerMaxMin[col] = 0;

      // c3) count the values which really occur - only for integer columns with a not too large range
      int nDistinct = 0;
      const int nSpan = _nIntegerMaxMin[col];
      if(nSpan>0 && nSpan<MAX_INTEGERS)
      {
         bool* f_Seen = new bool[nSpan];                             // one flag per possible value
         for(int s=0;s<nSpan;s++)
            f_Seen[s] = false;

         for(int row=0;row<_nTup;row++)                              // mark the value of each tuple
            f_Seen[(int) (data[row][col]-min[col])] = true;

         for(int s=0;s<nSpan;s++)                                    // count marked values
            if(f_Seen[s])
               nDistinct++;

         delete[] f_Seen;                                            // release
      }

      // c4) default type: symbolic (nominal) for integer columns with not too many different symbols
      varType[col] = (nDistinct>0 && nDistinct<MAX_FOR_DEF_SYMBOLIC) ? symb : cont;

      // c5) store # symbols found
      _nSymbolsFound[col] = nDistinct;
   }
}


//----------------------------------------------------------------------------------------------------------------------
// set state: stores the new state and lets the status text pointer refer to the corresponding hardcoded text
void TData::SetState(TStateEnum _state) const
{
   // a) store new state
   state = _state;

   // b) look up status text - unknown states get a default text
   const char* szText = szStateUnknown;
   switch(_state)
   {
      case ready  : szText = szStateReady;       break;
      case scan   : szText = szStateScanFormat;  break;
      case read   : szText = szStateLoading;     break;
      case calcI  : szText = szStateCalcMeans;   break;
      case calcII : szText = szStateCalcDev;     break;
      case save   : szText = szStateSaving;      break;
      case detect : szText = szStateDetectSymb;  break;
      case weights: szText = szStateWeights;     break;
      default     :                              break;
   }

   // c) set status text (pointer)
   szStatusText = szText;
}


//----------------------------------------------------------------------------------------------------------------------
// Load data matrix from file  -  # columns and # rows are detected automatically.
//    szFilename  name of the data file; also stored as load and save filename
//    f_Stop      optional pointer to a stop flag which is checked after each row (user break)
//    ddata       optional TDataData object used to check if loaded data has the correct # variables (equal or one
//                less); without it at least two columns are required
// Throws type A if the instance has already been loaded, type U if the file cannot be opened, the # variables does
// not match, a read error occurs or the stop flag is set. Rows read so far are released whenever an exception leaves
// the read loop.
void TData::Load(const char* szFilename, const bool* f_Stop, const TDataData*const& ddata/*=NULL*/)
{
   ifstream file(szFilename, ios::in);       // try to open file for reading

   // a) check
   IfTrueThrowTypeA(f_Loaded,"Function cannot be called twice in the lifetime of an instance!","TData::Load",szModule);
   IfTrueThrowTypeU(!file, "Read Error: Unable to open data file '%s'!", szFilename);


   // b) initialize
   f_Loaded    = true;
   load_data_progress=0;
   int line = 1;

   strcpy(szLoadFileName, szFilename);       // store filename
   strcpy(szSaveFileName, szFilename);
   streampos begin = file.tellg();           // preserve actual stream position (begin of file)


   // c) detect # columns, scanning the first non-empty/non-commented row
   SetState(TData::scan);                    // set state
   try
   {
      line += skipwsEx(file);                // remove whitespaces and comments

      while(skipws(file)==0)                 // proceed until linefeed
      {
         Read(file, (float) 1.0);            // read value
         _nVar++;                            // count columns
      }
   }
   catch(int errNo)     // exception handling
   {
      char szText[STS];
      sprintf(szText, "Detecting number of columns - line %d:   %s", line, ::GetLastError(errNo));
      ThrowTypeU(szText);
   }

   // if given: check # variables in data file if it matches (i.e. is equal or one less) the TDataData object
   // note: braces keep the 'else' bound to 'if(ddata)' independent of how the check expands
   if(ddata)
   {
      IfTrueThrowTypeU(_nVar!=ddata->nVar()&&!(_nVar==ddata->nVar()-1)
                        , "Number of variables does not match!  File loading aborting!");
   }
   else     // else: learn data is loaded, min. variable count is 2
   {
      IfTrueThrowTypeU(_nVar < 2, "There must be a minimum of one input and one output variable\nin data file!  File loading aborting! ");
   }


   // determine file size for progress indication
   file.seekg(0L, ios::end);                 // position to end
   filesize = file.tellg();
   file.seekg(begin);                        // restore stream position


   // d) read data
   SetState(TData::read);
   int j = 0;                                // column counter (initialized as it is reported in case of an error)
   line = 1;
   TStdList<float*> list;                    // temporary list to read rows
   list.SetName("TData::Load() list");
   try
   {
      line += skipwsEx(file);                // remove whitespaces and comments

      // note: peek() returns an int - it must not be cast to char before comparing with EOF, otherwise a 0xFF byte
      //       is mistaken for the end of file (signed char) or the end of file is never detected (unsigned char)
      while(file.peek() != EOF)              // while next character in file is not eof (end of file)
      {
         // process control
         load_data_progress = 100*file.tellg()/filesize;


         float*& row = list.Ins();           // insert new list element and get reference to it
         row = new float[_nVar];             // allocate memory

         // read all columns
         for(j=0; j<_nVar;j++)
            row[j] = ReadExpNoEndl(file, (float) 0);    // read value

         // remove whitespaces and comments throwing an exception if no endl occurs
         line += skipwsExEndl(file);

         if(f_Stop)                                               // if given
            if(*f_Stop)                                           // check stop flag
               ThrowTypeU("User break! File loading aborted!");   // and throw exception if set
      }

   }
   catch (int errNo)    // exception handling: read error
   {
      const int nRowsRead = list.Size();                          // get row number before the list is cleaned up
      DeleteEntries(&list);                                       // release memory in list

      char szText[STS];                                           // compose error text
      sprintf(szText,"Line %d  -  Row %d  -  Column %d  -  %s", line, nRowsRead, j+1, ::GetLastError(errNo));
      ThrowTypeU(szText);
   }
   catch (...)          // any other exception (e.g. user break): release rows read so far and pass it on
   {
      DeleteEntries(&list);
      throw;
   }


   // e) allocate memory for data matrix and copy rows from list to array (list is local variable and will be released)
   _nTup = list.Size();                // get # tuples (rows)
   data = new float*[_nTup];           // allocate
   list.Reset();                       // reset list to top
   for(int i=0;i<_nTup;i++)
   {
      list.Next();                     // position forward
      data[i] = list.Get();            // copy
   }


   // f) preserve pointers to rows in original order
   _data = new float*[_nTup];          // allocate
   for(int i=0;i<_nTup;i++)
      _data[i] = data[i];              // copy


   file.close();

   // g) calculate column wise statistics and detect symbolic columns
   CalculateStatitics();

   SetState(TData::ready);
}


//----------------------------------------------------------------------------------------------------------------------
// write data matrix to file
//
// Output layout (every header line is prefixed with 'ComChar'):
//    1. header: name of the originally loaded file, # rows, # columns and (1-based) output column
//    2. table with one line of statistics per column
//    3. all tuples in their current order, one tuple per line
//
//    szFilename  name of the file to write, it is opened with 'ios::out'
//    f_Stop      optional stop flag (may be NULL). It is checked after each written tuple. If it is set, a note is
//                appended to the file and writing ends; no error is raised in this case.
//
// Errors are raised through IfTrueThrowTypeA (empty instance) and IfTrueThrowTypeU (file cannot be opened).
void TData::Save(const char* szFilename, const bool* f_Stop /* = NULL */) const
{
   // checks  -  note: test for an empty instance before(!) the file is opened. Opening a file for output creates
   //                  or truncates it, which must not happen if there is nothing to write.
   IfTrueThrowTypeA(!f_Loaded, "Function called on empty instance! Call TData::Load() before!","TData::Save", szModule);

   ofstream file(szFilename, ios::out);         // try to open file for output
   IfTrueThrowTypeU(!file, "Write Error: Unable to open data file '%s'", szFilename);

   // write header
   file << ComChar << " Original File  = " << szLoadFileName << endl;
   file << ComChar << "  #Rows         = " << _nTup << endl;
   file << ComChar << "  #Columns      = " << _nVar << endl;
   file << ComChar << "  Output Column = " << (outcol+1) << endl;


   // set state (original note: and, if applicable, pass the stop flag on to the dialog)
   SetState(save);


   // write format description
   file << ComChar << endl << ComChar << " Statistics on Columns:" << endl;
   file << ComChar << " Column      Minimum      Maximum         Mean    Deviation       Weight  #Symbols and Type";
   file << endl;

   for(int j=0;j<_nVar;j++)   // for all variables
   {
      file << ComChar << setw(7) << OriginalColumnInFile(j) << " ";                 // original column in data file
      file << setw(W_FLOAT) << min[j]  << " " << setw(W_FLOAT) << max[j] << " ";    // minimum and maximum
      file << setw(W_FLOAT) << mean[j] << " " << setw(W_FLOAT) << dev[j] << " ";    // mean and deviation
      file << setw(W_FLOAT) << weight[j] << " ";                                    // weight
      file << setw(W_SYMBOL) << _nIntegerMaxMin[j] << "/";                          // # integer values min. up to max.
      file << setw(W_SYMBOL) << _nSymbolsFound[j] << " ";                           // # symbols
      file << setw(W_SYMBOL) << VarTypeToChar(varType[j]) << endl;                  // variable type
   }
   file << endl;


   // set column widths: symbolic columns are written with W_SYMBOL, all others with W_FLOAT characters
   int* w = new int[_nVar];
   for(int j=0;j<_nVar;j++)
      if(IsSymbolic(j))
         w[j] = W_SYMBOL;
      else
         w[j] = W_FLOAT;


   // write all tuples
   for(int i=0;i<_nTup;i++)
   {
      for(int j=0;j<_nVar;j++)                        // over all columns
         file << setw(w[j]) << data[i][j] << " ";
      file << endl;                                   // linefeed


      // check stop flag  -  note: the tuples written so far stay in the file, followed by a comment line
      if(f_Stop)
         if(*f_Stop)
         {
            file << endl << ComChar << " Error: Userbreak! Process terminated!" << endl;
            break;
         }
   }

   // release memory, close file and set state
   delete[] w;
   file.close();
   SetState(ready);
}


//----------------------------------------------------------------------------------------------------------------------
// calculate statistics on columns
//
// Allocates the column wise arrays 'mean', 'max', 'min', 'dev', 'range', 'invRange' and 'weight' (one entry per
// column) and fills them from the current data matrix:
//    mean      arithmetic mean
//    max/min   largest/smallest value
//    range     max-min
//    invRange  1/range, or 1 if the range is zero
//    dev       standard deviation with divisor (nTup-1)
//    weight    initialized with 1
// Symbolic columns are detected afterwards unless the instance is a clone. An error is raised through
// IfTrueThrowTypeA/IfTrueThrowTypeU if the instance is empty or holds less than 2 tuples.
// note: the arrays are allocated unconditionally, arrays of a previous call are not released here.
void TData::CalculateStatitics()
{
   // a) refuse to work on an empty instance or on less than two tuples
   IfTrueThrowTypeA(!f_Loaded, "Function called on empty instance! Call TData::Load() before!"
                     , "TData::CalculateStatistics", szModule);

   IfTrueThrowTypeU(_nTup<2, "Unable to calculate statistics! Data file must contain at least 2 data tuples!");


   // b) one array entry per column for each statistic and for the feature weights
   mean     = new float[_nVar];
   max      = new float[_nVar];
   min      = new float[_nVar];
   dev      = new float[_nVar];
   range    = new float[_nVar];
   invRange = new float[_nVar];
   weight   = new float[_nVar];


   // c) first pass: mean, extrema and (inverse) range of each column
   SetState(calcI);

   for(int col=0; col<_nVar; ++col)
   {
      const float first = data[0][col];   // extrema start at the value of the first tuple

      dev [col] = 0;                      // sum of squares is accumulated in d)
      mean[col] = 0;
      max [col] = first;
      min [col] = first;

      for(int row=0; row<_nTup; ++row)
      {
         const float value = data[row][col];

         mean[col] += value;

         // note: a value cannot exceed the maximum and fall below the minimum at the same time
         if(value > max[col])
            max[col] = value;
         else if(value < min[col])
            min[col] = value;
      }

      mean    [col] = mean[col]/_nTup;
      range   [col] = max[col]-min[col];
      invRange[col] = (range[col]!=0) ? 1/range[col] : 1;   // constant columns get an inverse range of 1
   }


   // d) second pass: standard deviation, needs the means from c)
   SetState(calcII);
   for(int col=0; col<_nVar; ++col)
      for(int row=0; row<_nTup; ++row)
         dev[col] += (mean[col]-data[row][col])*(mean[col]-data[row][col]);

   for(int col=0; col<_nVar; ++col)
   {
      if(_nTup<=1)
         dev[col] = 0;
      else
         dev[col] = sqrt(fabs(dev[col]/(_nTup-1)));   // fabs() kept from the original to cope with rounding effects
   }


   // e) symbolic columns are only detected for instances which are not clones
   SetState(detect);
   if(!f_IsClone)
      DetectSymbolicColumns();


   // f) all weights start with 1
   for(float* w=weight; w<weight+_nVar; ++w)
      *w = 1;

   SetState(ready);
}


//----------------------------------------------------------------------------------------------------------------------
// randomize order of tuples
//
// Rebuilds the row pointers in 'data' from the rows in original order ('_data') using the index array returned by
// RandomId(_nTup, seed). The instance must be loaded and unlocked, otherwise an error is raised through
// IfTrueThrowTypeA.
void TData::Randomize(const unsigned int seed/*=1*/)
{
   // a) instance must be unlocked and loaded
   IfTrueThrowTypeA(f_Locked, "Function called on locked instance! Call TData::Unlock() before!", "TData::Randomize", szModule);
   IfTrueThrowTypeA(!f_Loaded, "Function called on empty instance! Call TData::Load() before!", "TData::Randomize", szModule);

   // b) a randomized order is by definition not sorted anymore
   f_Sorted     = false;
   f_Randomized = true;

   // c) fetch the random id's  -  note: the array is released below with delete[]
   int* /*cr*/ order = RandomId(_nTup, seed);

   // d) the row at position 'pos' becomes the original row 'order[pos]'
   int pos = 0;
   while(pos<_nTup)
   {
      data[pos] = _data[order[pos]];
      ++pos;
   }

   delete[] order;
}


//----------------------------------------------------------------------------------------------------------------------
// restore original order of tuples
//
// Copies the row pointers preserved in '_data' back to 'data' and resets the randomized and sorted flags. The
// instance must be loaded and unlocked, otherwise an error is raised through IfTrueThrowTypeA.
void TData::Reorder()
{
   // a) check
   IfTrueThrowTypeA(f_Locked, "Function called on locked instance! Call TData::Unlock() before!", "TData::Reorder", szModule);
   IfTrueThrowTypeA(!f_Loaded, "Function called on empty instance! Call TData::Load() before!", "TData::Reorder", szModule);


   // note: there was a problem if the data is sorted and then the output column is changed: the sort flag is reset
   //       then but the tuples are not in their original order! Thus the flags are not checked anymore and the
   //       rows are always copied back.
   // if(f_Randomized || f_Sorted)  // check if rows are disordered
   //    return;

   f_Randomized = false;               // b) set flags
   f_Sorted     = false;


   // c) restore original order unconditionally
   for(int i=0;i<_nTup;i++)            // copy back
      data[i] = _data[i];
}


//----------------------------------------------------------------------------------------------------------------------
// sort tuples regarding the 'colId'-th column  -  note: at the moment 'colId' must be 0!
// in case of equality the tuples remain like they were!
void TData::Sort(const int& colId)
{
   if(f_Sorted)
      return;


   // a) check
   IfTrueThrowTypeA(f_Locked, "Function called on locked instance! Call TData::Unlock() before!", "TData::Sort", szModule);
   IfTrueThrowTypeA(!f_Loaded, "Function called on empty instance! Call TData::Load() before!", "TData::Sort", szModule);
   IfTrueThrowTypeA(colId!=0, "At the moment function can only be called with 'colId=0'!", "TData::Sort", szModule);


   // b) set flags
   f_Randomized = false;
   f_Sorted     = true;


   // c) get id's sorted regarding column 0  -  note: this way ensures that Sort() will always result in the same order
   TIdVec* vec = new TIdVec[_nTup];
   for(int i=0;i<_nTup;i++)
   {
      vec[i].a = _data[i][0];
      vec[i].b = i;
      vec[i].id= i;
   }
   qsort((void*) vec, _nTup, sizeof(vec[0]), IdVecCmpDes);  // sort in decreasing(!) order


   // d) sort
   for(int i=0;i<_nTup;i++)
      data[_nTup-1-i]=_data[vec[i].id];                     // note: tuples were sorted in decreasing order

   delete[] vec;  // release
}


//----------------------------------------------------------------------------------------------------------------------
// equal frequency binning
//
// Calculates the left borders of 'nIntervals' intervals for the 'j'-th column. The values of the column are
// sorted in ascending order, the first border is the smallest value and the k-th border lies in the middle
// between the two sorted values around position k*nTup/nIntervals.
//
//    j           column to discretize
//    nIntervals  # of intervals, expected to be >= 1
//
// returns a new array with 'nIntervals' entries  -  note: the caller has to release it with delete[]
// note: expects at least one tuple, 'x[0]' is read unconditionally
float* /*cr*/ TData::EqualFrequencyBinning(const int& j, const int& nIntervals)
{
   float* splits=new float[nIntervals];

   // copy values of j-th variable to vector
   float* x=new float[_nTup];    // allocate
   for(int i=0;i<_nTup;i++)
      x[i]=data[i][j];           // copy


   // sort
   qsort((void*) x, _nTup, sizeof(x[0]), FloatCmpAsc);

   splits[0] = x[0];                   // set left border to minimum
   const int last = _nTup-1;           // largest valid index of 'x'
   for(int k=1;k<nIntervals;k++)
   {
      int hi = k*_nTup/((float) nIntervals);   // first value right of the k-th split (truncated)
      int lo = hi-1;                           // last value left of the k-th split

      // note: with nTup>=nIntervals both indices are always valid. With less tuples than intervals 'hi' may
      //       become 0 and 'lo' would address the element before the array, thus clamp both to the valid range.
      //       Such a split then collapses onto the nearest data point.
      if(lo<0)    lo = 0;
      if(hi>last) hi = last;
      if(lo>last) lo = last;

      splits[k]=(x[hi]+x[lo])/2;       // set split/left border in the middle between two points
   }
   delete[] x; // release


   #ifdef DB_WRITE_SPLITPOINTS
   file << "Variable " << j << " " << splits[0] << " ";
   for(int k=1;k<nIntervals;k++)
      file << splits[k] << " ";
   file << endl;
   #endif

   return splits;
}

// convert splits/left borders of discretization intervals to midpoints
// note: 'splits' is a float array containing at position 'k' the left border of the
//       'k+1'-th interval. It is converted in place.
//
//    splits      array with 'nIntervals' left borders
//    nIntervals  # of intervals, must be >= 1 because the last entry is always written
//    max         right border of the last interval
void SplitsToMidPoints(float*& splits, const int& nIntervals, const float& max)
{
   const int last = nIntervals-1;

   // all but the last bin: move each border half way towards the (still unchanged) border of the next bin
   int k = 0;
   while(k<last)
   {
      splits[k] += (splits[k+1]-splits[k])/2.0;
      ++k;
   }

   // last bin: its right border is 'max'
   float& lastBin = splits[last];
   lastBin += (max-lastBin)/2.0;
}

// quantization for equal frequency binning
//
//    value   value to quantize
//    splits  left borders of the intervals, 'splits[0]' is never read
//    nInt    # of intervals
//
// returns the id of the first interval whose right neighbour's border lies above 'value', or 'nInt-1' if no
// border from 'splits[1]' on lies above 'value'
int Quant(const float value, const float* splits, const int nInt)
{
   const int last = nInt-1;

   int bin = 0;
   while(bin<last)
   {
      if(value<splits[bin+1])    // 'value' lies left of the next border -> it belongs to the current bin
         return bin;
      ++bin;
   }
   return last;
}


//----------------------------------------------------------------------------------------------------------------------
// calculate feature weights using mutual information criterium
bool TData::CalculateWeights(const int& nIntervals, const bool& f_Classification, const bool& f_EqualWidthBinning)
{
   // 1. check
   IfTrueThrowTypeA(!f_Loaded, "Function called on empty instance! Call TData::Load() before!", "TData::CalculateWeights", szModule);
   IfTrueThrowTypeA(nIntervals<2, "Function called with illegal value for 'nIntervals'!", "TData::CalculateWeights", szModule);
   IfTrueThrowTypeA(_nIntegerMaxMin[0]==0 && f_Classification,
         "Current output is continuous! Classification flag must not be set!", "TData::CalculateWeights", szModule);

   // ToDo: what happens if a variable is symbolic but there is only one symbol ??

   bool f_EqualTypes = (w_Type!=NULL);                                // check if input (!) variable types have changed
   if(w_Type)
   {
      for(int j=1;j<_nVar;j++)
         if(w_Type[j]!=varType[j])
            f_EqualTypes=false;
   }
   else
      w_Type = new TVarType[_nVar];                                        // allocate


   // return if weights already calculated
   if(w_f_Classification == f_Classification && f_EqualTypes && nIntervals == w_nIntervals && outcol == w_Outcol
    && w_f_EqualWidthBinning == f_EqualWidthBinning)
      return false;


   SetState(TData::weights);     // set state

   w_nIntervals = nIntervals;                                              // store calculation parameters
   w_Outcol = outcol;
   w_f_Classification = f_Classification;
   w_f_EqualWidthBinning = f_EqualWidthBinning;
   memcpy(w_Type, varType, sizeof(varType[0])*_nVar);                      // variable types


   // 2.
   weight[0] = 1;                                                          // note: output doesn't have weight

   // set # output intervals
   int nOpI = nIntervals;
   if(f_Classification)
      nOpI = _nIntegerMaxMin[0];

   float** mat    = new float*[nOpI];                                      // new probability matrix (row pointers)
   float* pClass  = new float[nOpI];                                       // class probabilities


   // 3. calculate weight for each input(!) variable
   for(int j=1;j<_nVar;j++)
   {
      weight[j]=0;
      int nIpI = nIntervals;                       // a) set # input intervals for j-th variable

      const float* /*cr*/  splits = NULL;
      if(IsSymbolic(j))
         nIpI = _nIntegerMaxMin[j];
      else
         if(!f_EqualWidthBinning)
            splits = EqualFrequencyBinning(j, nIpI);  // calculate split points for continuous variables



      if(nIpI==1)
         continue;  // if there's only one interval -> weight is zero -> continue with next variable


      float* pFeature = new float[nIpI];           // b) new and reset feature probabilities
      for(int t=0;t<nIpI;t++)
         pFeature[t]=0;


      // c) new and reset probability matrix
      for(int s=0;s<nOpI;s++)
      {
         mat[s] = new float[nIpI];                 // new row

         for(int t=0;t<nIpI;t++)                   // reset
            mat[s][t]=0;
      }


      // d) apportion data tuples, i.e. count occurences
      for(int i=0;i<_nTup;i++)
         if(!f_EqualWidthBinning)
            if(IsSymbolic(j))
               mat[Quant(data[i][0], max[0], min[0], nOpI, f_Classification)][Quant(data[i][j], max[j], min[j], nIpI, true)]++;
            else
               mat[Quant(data[i][0], max[0], min[0], nOpI, f_Classification)][Quant(data[i][j], splits, nIpI)]++;
         else
            mat[Quant(data[i][0], max[0], min[0], nOpI, f_Classification)][Quant(data[i][j], max[j], min[j], nIpI, IsSymbolic(j))]++;



      // e) turn occurences in probabilities and calculate class and feature probabilities
      for(int s=0;s<nOpI;s++)
      {
         pClass[s]=0;                              // ini
         for(int t=0;t<nIpI;t++)
         {
            mat[s][t]   /= _nTup;
            pClass[s]   += mat[s][t];              // calculate s-th class    pobability: sum up row
            pFeature[t] += mat[s][t];              // calculate t-th feature  pobability: sum up column
         }
      }


      // f) sum up mutual information and set weight
      for(int s=0;s<nOpI;s++)
         for(int t=0;t<nIpI;t++)
            if(mat[s][t]!=0)
               weight[j]+=mat[s][t]*log( mat[s][t]/pClass[s]/pFeature[t] );


      // g) release
      delete[] pFeature;
      for(int s=0;s<nOpI;s++)
         delete[] mat[s];

      delete[] splits;
   }


   // 4. release
   delete[] mat;
   delete[] pClass;


   // 5. standardize weights to have mean of '1' and round them to defined precision

   // a) calculate sum of weights
   float sum=0;
   for(int j=1;j<_nVar;j++)
      sum+=weight[j];
   sum/=_nVar-1;


   // b) check the sum of weights differs from zero.    note: All zero weights may happen if the output's entropy is
   // already zero because all output values are the same. Then, of course, the transinformations will also be zero.
   if(sum==0)
      for(int j=1;j<_nVar;j++)
         weight[j] = 1;             // set dummy value   note: learning however should be useless :-)
   else
   {
      // c) standardize and round to defined precision
      int fac = pow(10, WEIGHT_PRECISION);
      for(int j=1;j<_nVar;j++)
         weight[j] = floor(weight[j]/sum * fac + 0.5) / fac;
   }


   SetState(TData::ready);    // set state

   return true;               // new feature weights were calculated
}


//----------------------------------------------------------------------------------------------------------------------
// used for validation purposes only - overwrite output values
// note: the output (column 0) of the tuple at position i (0-based, current order) is set to i+1
void TData::Hack()
{
   int pos = 0;
   while(pos<_nTup)
   {
      float* row = data[pos];
      ++pos;
      row[0] = pos;     // 'pos' was already advanced, i.e. this is the 1-based position
   }
}


//----------------------------------------------------------------------------------------------------------------------
// generate learn/test data and save it if basename is specified
void TData::GenerateLearnAndTestData(const unsigned int& seed, TData**& data_L, TData**& data_T, const int& N_R,
                                     const bool& f_CV, const int& N_L, const int& N_T, const char*const& szDataBn,
                                     const char*const& szDir, const int& N_Bins, const bool& f_Regression,
                                     const bool& f_EqualWidthBinning, const int& optId/*=-1*/, const int& nChar2/*=0*/)
{
   // a) new fields
   data_L = new TData*[N_R];
   data_T = new TData*[N_R];


   // b) initialize random seeds for randomization of data tuples
   srand(seed);                                    // set random number generator to specified state
   unsigned int* seeds = new unsigned int[N_R];    // store random seeds
   for(int i=0;i<N_R;i++)
      seeds[i] = abs(random(INT_MAX));


   // c) prepare

   bool f_WasSorted = IsSorted();   // remember if data was sorted/locked to restore at the end
   bool f_WasLocked = IsLocked();

   Unlock();                        // unlock data to allow randomization
   if(f_CV)
      Randomize(seeds[0]);          // initial randomization of data tuples, necessary for random n-fold cross validation



   // d) generate learn and test data
   int nChar = log(N_R)/log(10)+1;
   for(int i=0;i<N_R;i++)
   {
      // generate (file) names
      char name_L[STS], name_T[STS];
      if(optId!=-1)
      {
         // note: add current optimization id
         sprintf(name_L, "%s%s_CV%0*d_Run%0*d_L.%s", szDir, szDataBn, nChar2, optId+1, nChar, i+1, szExtension);
         sprintf(name_T, "%s%s_CV%0*d_Run%0*d_T.%s", szDir, szDataBn, nChar2, optId+1, nChar, i+1, szExtension);
      }
      else
      {
         sprintf(name_L, "%s%s_CV%0*d_L.%s", szDir, szDataBn, nChar, i+1, szExtension);
         sprintf(name_T, "%s%s_CV%0*d_T.%s", szDir, szDataBn, nChar, i+1, szExtension);
      }


      // generate data
      if(!f_CV)                                                            // n-fold repetition
      {
         Randomize(seeds[i]);                                              // randomize order of data tuples

         data_L[i] = new TData(this, N_L, true, name_L);                   // take first <N_L> tuples as learn data
         data_T[i] = new TData(this, N_T, false, name_T);                  // take last <N_T> tuples as test data
      }
      else                                                                 // n-fold cross-validation
      {
         data_L[i] = new TData(this, i*N_T, (i+1)*N_T-1, false, name_L);   //
         data_T[i] = new TData(this, i*N_T, (i+1)*N_T-1, true, name_T);    //
      }

      data_L[i]->CalculateStatitics();         // statistic for learn data
      data_L[i]->Sort(0);                                                        // sort
      data_L[i]->CalculateWeights(N_Bins, !f_Regression, f_EqualWidthBinning);   // calculate weights
      data_L[i]->Lock();                                                         // lock
   }

   // release seeds
   delete[] seeds;

   // re-sort data and lock it again if necessary
   if(f_WasSorted) Sort(0);
   if(f_WasLocked) Lock();


   // e) save generated data if basename is specified
   if(szDataBn[0]=='\0')
      return;

   for(int i=0;i<N_R;i++)
   {
      data_L[i]->Save(data_L[i]->LoadFileName());     // save data
      data_T[i]->Save(data_T[i]->LoadFileName());
   }

   #ifdef VALIDATION_2
   data_T[0]->Hack();   // overwrite output
   #endif
}