//------------------------------------------------------------------------------
//    module data.h                                                           //
//                                                                            //
//    Class TData encapsulates data matrix with 'nTup' rows and 'nVar'        //
//    columns. NOTE: The first column is regarded as output! Use              //
//    MoveToFirst() to meet this condition!                                   //
//    See source or http://www.newty.de/pnc2/sdocu.html for more information. //
//                                                                            //
//    copyright (c) 1999-2003 by Lars Haendel                                 //
//    home: www.newty.de                                                      //
//                                                                            //
//    This program is free software; you can redistribute it and/or modify    //
//    it under the terms of the GNU General Public License as published by    //
//    the Free Software Foundation as version 2 of the License.               //
//                                                                            //
//    This program is distributed in the hope that it will be useful,         //
//    but WITHOUT ANY WARRANTY; without even the implied warranty of          //
//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the           //
//    GNU General Public License for more details.                            //
//                                                                            //
//    You should have received a copy of the GNU General Public License       //
//    along with this program; if not, write to the Free Software             //
//    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.               //
//                                                                            //
//------------------------------------------------------------------------------


#ifndef _DATA_H
#define _DATA_H

#include <fstream>               // due to:  ifstream ...
#include <math>                  //          fabs()

#include "stdlist.h"
#include "util.h"                //          Quant()



// formatting constants for stream output and rounding of feature weights
#define W_FLOAT               ((int) 12)     // field width for stream output of floats
#define W_SYMBOL              ((int) 3)      // field width for stream output of symbolic variables
#define WEIGHT_PRECISION      ((int) 2)      // precision to round calculated feature weights



class TDataData;     // forward declaration: TData only takes pointers to TDataData and declares it a friend


//----------------------------------------------------------------------------------------------------------------------
// encapsulates data matrix with 'nTup' rows/tuples and 'nVar' columns/variables
// Reference-counted container for a data matrix of nTup() rows and nVar() columns.
//
// Lifetime: the destructor is not public, so instances cannot be deleted directly from outside.
// GetObject() increments the reference counter; Release() is the counterpart to be called instead
// of 'delete' (presumably it decrements the counter -- see the implementation file).
//
// Accessor note: all accessors that hand out pointers (Row(), Mean(), GetDataPointer(), ...) return
// the pointer BY VALUE. Returning 'const float*const&' from a 'float*' member requires a qualification
// conversion, which (before C++20 / CWG 2352) binds the reference to a temporary that is destroyed at
// the end of the return statement, i.e. the caller would receive a dangling reference.
class TData
{
public:

      //----------------------------------------------------------------------------------------------------------------
      // misc.

      // variable type definition and conversion routines
      enum TVarType {cont, symb, none};                           // continuous, symbolic or no type
      static char VarTypeToChar(const int type);                  // convert variable type to character
      static TVarType CharToVarType(const char c);                // convert character to variable type

      // note: You may use TData::OutputColumn() to get current output column
      // convert actual variable Id to original column number (referring to the loaded data file)
      static const int OriginalColumnInFile(const int& actId, const int& outcol);

      // return actual variable Id of column 'oriId' (which refers to the originally loaded data file)
      static const int ActualColumnInData(const int& oriId, const int& outcol);



   // constructor/destructor
   TData();
   void Release() const;      // note: destructor is not public, use Release() instead

   // increment the reference counter and return this instance
   TData* GetObject() { ref++; return this; }


   // split data in newly allocated learn and test data objects using the clone constructors
   void GenerateLearnAndTestData(const unsigned int& seed, TData**& data_L, TData**& data_T, const int& N_R,
   const bool& f_CV, const int& N_L, const int& N_T, const char*const& szDataBn, const char*const& szDir,
   const int& N_Int_Input, const bool& f_Regression, const bool& f_EqualWidthBinning, const int& optId=-1,
   const int& nChar2=0);


   const char* StatusText() const { return szStatusText; }              // return pointer to current status text
   const int& LoadDataProgress() const { return load_data_progress; }   // return load progress counter

   const char* LoadFileName() const  { return szLoadFileName; }         // return stored load file name
   const char* SaveFileName() const  { return szSaveFileName; }         // return stored save file name


   // same as the static versions, but using the output column of this instance
   const int OriginalColumnInFile(const int& actId) const { return OriginalColumnInFile(actId, outcol); }
   const int ActualColumnInData(const int& oriId)   const { return ActualColumnInData(oriId, outcol); }


   // lock and unlock (only sets/queries the flag 'f_Locked')
   inline void Lock()            { f_Locked=true;  }
   inline void Unlock()          { f_Locked=false; }
   inline bool IsLocked() const  { return f_Locked; }

   inline const int& OutputColumn() const { return outcol; }   // output column in originally loaded data file

   // query type of variable/column 'j' (no range check on 'j')
   inline const bool IsContinous(const int& j) const { return varType[j]==cont; }
   inline const bool IsSymbolic (const int& j) const { return varType[j]==symb; }
   const bool IsInteger(const int& j) const { return (_nIntegerMaxMin[j]!=0); }

   inline const int& nIntegerMaxMin(const int& j) const { return _nIntegerMaxMin[j]; }
   inline const int& nSymbolsFound(const int& j) const { return _nSymbolsFound[j]; }


   inline const TVarType& GetVarType  (const int& j) const { return varType[j]; }
   void                   SetVarType  (const int& j, const TVarType& type);

   inline const bool IsDisordered()             const { return (!f_Sorted && !f_Randomized); }  // neither sorted nor randomized
   inline const bool IsSorted()                 const { return f_Sorted; }


   // make compatible with given data object, i.e. set output column and variable types
   bool MakeCompatible(const TData*const& ref);
   bool MakeCompatible(const TDataData*const& ref);

   // load and save data from file and calculate statistics/weights
   void Load(const char* filename, const bool* stop=NULL, const TDataData*const& ddata=NULL);
   void Save(const char* filename, const bool* stop=NULL) const;
   bool CalculateWeights(const int& nIntervals, const bool& f_Classification, const bool& f_EqualWidthBinning);


   // randomize/reorder/sort/normalize tuples
   void Randomize(const unsigned int seed=1);                  // randomize order of tuples
   void Reorder();                                             // restore original order
   void Hack();                                                // overwrite output with random values
   void Sort(const int& j);                                    // sort tuples

   void SetOutputColumn(const int& j);                         // set output column which is moved to first then



   // get #tuples, #columns
   inline const int& nVar() const { return _nVar; }            // # variables/columns
   inline const int& nTup() const { return _nTup; }            // # tuples/rows

   // get pointer to tuple/row, single element or the whole matrix (no range checks)
   // note: pointers are returned by value; a 'const float*const&' bound to a 'float*' member would refer
   //       to a temporary and dangle after the return
   inline const float*   Row       (const int& row)                 const { return data[row];       }
   inline const float*   operator()(const int& row)                 const { return data[row];       }
   inline const float&   operator()(const int& row, const int& col) const { return data[row][col];  }
   inline const float**  GetDataPointer()                           const { return (const float**) data; }


   // functions to get statistics and weights (pointers to column wise arrays, returned by value, see note above)
   inline const float* Mean()                   const { return mean;    }
   inline const float* Min()                    const { return min;     }
   inline const float* Max()                    const { return max;     }
   inline const float* Range()                  const { return range;   }
   inline const float* InvRange()               const { return invRange;}
   inline const float* Dev()                    const { return dev;     }
   inline const float* Weights()                const { return weight;  }

protected:

   // clone constructors either for repeated splitting or cross-validation
   TData(const TData* parent, const int& __nTup, const bool& f_FromTop, const char* szName=NULL);
   TData(const TData* parent, const int& begin, const int& end, const bool& f_Inside, const char* szName=NULL);

   mutable enum TStateEnum {ready, scan, read, calcI, calcII, save, detect, weights} state;  // state-enum
   mutable const char* szStatusText;                                                // pointer to status text
   void SetState(TStateEnum _state) const;                                          // set state


   mutable int progress;               // progress counter
   mutable int filesize;
   mutable int load_data_progress;     // value returned by LoadDataProgress()
   char szLoadFileName[STS];           // value returned by LoadFileName()
   mutable char szSaveFileName[STS];   // value returned by SaveFileName()

   ~TData();                           // non-public(!) destructor, call Release() instead

private:

   friend class TDataData;

   mutable int ref;              // reference counter

   bool f_IsClone;               // instance was cloned from another
   bool f_HasClone;              // other instance(s) have been cloned from this instance (forbids Norm())
   const TData* parent;          // if instance is a clone: points to parent

   // calculation parameter for feature weights (store to prevent unnecessary re-calculations)
   int w_nIntervals;             // # intervals used
   int w_Outcol;                 // output column used
   bool w_f_Classification;      // output was considered as symbolic
   bool w_f_EqualWidthBinning;   // equal width binning was used
   TVarType* w_Type;             // variable types used


   void CalculateStatitics();
   void DetectSymbolicColumns();
   float* EqualFrequencyBinning(const int& j, const int& nIntervals);

   float**           data;                // data matrix
   mutable float**   _data;               // pointer to rows in original order
   int               _nTup;               // # rows/tuples
   int               _nVar;               // # columns/variables
   float             unknown;             // value which indicates missing/unknown value
   int               outcol;              // output column in originally loaded data file
   int*              _nIntegerMaxMin;     // # symbols for each variable calculated as maximal minus minimal symbol
                                          // value, '0' indicates that variable is continuous
   int*              _nSymbolsFound;      // real # symbols for each variable, '0' indicates that variable is continuous
   TVarType*         varType;             // type of variable (continuous or symbolic (nominal))


   // flags
   mutable bool   f_Locked;
   bool           f_Loaded, f_Sorted, f_Randomized;


   // -> column wise calculated statistics and feature weights
   float* mean;
   float* min;
   float* max;
   float* range;
   float* invRange;                    // note: 1/range
   float* dev;
   float* weight;                      // variable weights normalized to have a mean of '1'
};
#endif