//------------------------------------------------------------------------------
// module data.h //
// //
// Class TData encapsulates data matrix with 'nTup' rows and 'nVar' //
// columns. NOTE: The first column is regarded as output! Use //
// MoveToFirst() to meet this condition! //
// See source or http://www.newty.de/pnc2/sdocu.html for more information. //
// //
// copyright (c) 1999-2003 by Lars Haendel //
// home: www.newty.de //
// //
// This program is free software; you can redistribute it and/or modify //
// it under the terms of the GNU General Public License as published by //
// the Free Software Foundation as version 2 of the License. //
// //
// This program is distributed in the hope that it will be useful, //
// but WITHOUT ANY WARRANTY; without even the implied warranty of //
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //
// GNU General Public License for more details. //
// //
// You should have received a copy of the GNU General Public License //
// along with this program; if not, write to the Free Software //
// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. //
// //
//------------------------------------------------------------------------------
#ifndef _DATA_H
#define _DATA_H
#include <fstream> // due to: ifstream ...
#include <math> // fabs()
#include "stdlist.h"
#include "util.h" // Quant()
// Formatting constants. Each macro expands textually to a cast expression of type int.
#define W_FLOAT (int) 12 // field width for stream output of float values
#define W_SYMBOL (int) 3 // field width for stream output of symbolic variables
#define WEIGHT_PRECISION (int) 2 // precision to round calculated feature weights
class TDataData; // forward declaration: TData declares it a friend and accepts pointers to it
//----------------------------------------------------------------------------------------------------------------------
// encapsulates data matrix with 'nTup' rows/tuples and 'nVar' columns/variables
class TData
{
    // Data matrix of '_nTup' rows (tuples) and '_nVar' columns (variables) together with per-column
    // statistics and feature weights. Instances carry a reference counter ('ref'); the destructor is
    // not public, so clients obtain a reference with GetObject() and give it back with Release().
    //
    // NOTE(review): the class holds raw pointers but the implicitly generated copy constructor and
    // copy assignment are still available - presumably instances are never copied; confirm.
public:
    //----------------------------------------------------------------------------------------------------------------
    // misc.

    // variable type definition and conversion routines
    enum TVarType {cont, symb, none};                       // continuous, symbolic (nominal) or no type
    static char VarTypeToChar(const int type);              // convert variable type to character
    static TVarType CharToVarType(const char c);            // convert character to variable type

    // note: You may use TData::OutputColumn() to get current output column
    // convert actual variable Id to original column number (referring to the loaded data file)
    static const int OriginalColumnInFile(const int& actId, const int& outcol);
    // return actual variable Id of column 'oriId' (which refers to the originally loaded data file)
    static const int ActualColumnInData(const int& oriId, const int& outcol);

    // constructor/destructor
    TData();
    void Release() const;                                   // note: destructor is not public, use Release() instead
    TData* GetObject() { ref++; return this; }              // increment reference counter and return this instance

    // split data in newly allocated learn and test data objects using the clone constructors
    void GenerateLearnAndTestData(const unsigned int& seed, TData**& data_L, TData**& data_T, const int& N_R,
        const bool& f_CV, const int& N_L, const int& N_T, const char*const& szDataBn, const char*const& szDir,
        const int& N_Int_Input, const bool& f_Regression, const bool& f_EqualWidthBinning, const int& optId=-1,
        const int& nChar2=0);

    const char* StatusText() const { return szStatusText; }             // return pointer to current status text
    const int& LoadDataProgress() const { return load_data_progress; }
    const char* LoadFileName() const { return szLoadFileName; }
    const char* SaveFileName() const { return szSaveFileName; }

    // convenience overloads of the static conversion routines which use this instance's output column
    const int OriginalColumnInFile(const int& actId) const { return OriginalColumnInFile(actId, outcol); }
    const int ActualColumnInData(const int& oriId) const { return ActualColumnInData(oriId, outcol); }

    // lock and unlock
    void Lock() { f_Locked=true; }
    void Unlock() { f_Locked=false; }
    bool IsLocked() const { return f_Locked; }

    // output column and per-variable type information ('j' is a column index and is not range checked)
    const int& OutputColumn() const { return outcol; }
    const bool IsContinous(const int& j) const { return varType[j]==cont; }
    const bool IsSymbolic (const int& j) const { return varType[j]==symb; }
    const bool IsInteger(const int& j) const { return (_nIntegerMaxMin[j]!=0); }
    const int& nIntegerMaxMin(const int& j) const { return _nIntegerMaxMin[j]; }
    const int& nSymbolsFound(const int& j) const { return _nSymbolsFound[j]; }
    const TVarType& GetVarType (const int& j) const { return varType[j]; }
    void SetVarType (const int& j, const TVarType& type);

    // tuple order: disordered means neither sorted nor randomized
    const bool IsDisordered() const { return (!f_Sorted && !f_Randomized); }
    const bool IsSorted() const { return f_Sorted; }

    // make compatible with given data object, i.e. set output column and variable types
    bool MakeCompatible(const TData*const& ref);
    bool MakeCompatible(const TDataData*const& ref);

    // load and save data from file and calculate statistics/weights
    void Load(const char* filename, const bool* stop=NULL, const TDataData*const& ddata=NULL);
    void Save(const char* filename, const bool* stop=NULL) const;
    bool CalculateWeights(const int& nIntervals, const bool& f_Classification, const bool& f_EqualWidthBinning);

    // randomize/reorder/sort/normalize tuples
    void Randomize(const unsigned int seed=1);              // randomize order of tuples
    void Reorder();                                         // restore original order
    void Hack();                                            // overwrite output with random values
    void Sort(const int& j);                                // sort tuples
    void SetOutputColumn(const int& j);                     // set output column which is moved to first then

    // get #tuples, #columns
    const int& nVar() const { return _nVar; }               // # variables/columns
    const int& nTup() const { return _nTup; }               // # tuples/rows

    // get pointer to tuple/row
    // Pointers are returned by value. Returning 'const float*const&' for a 'float*' member makes the
    // reference bind to a converted temporary on compilers that do not bind it directly, which hands
    // the caller a dangling reference.
    const float* Row (const int& row) const { return data[row]; }
    const float* operator()(const int& row) const { return data[row]; }
    const float& operator()(const int& row, const int& col) const { return data[row][col]; }
    // read-only view of the row pointer table; the cast only adds const to the pointed-to floats
    const float** GetDataPointer() const { return const_cast<const float**>(data); }

    // functions to get statistics and weights (one value per column)
    const float* Mean() const { return mean; }
    const float* Min() const { return min; }
    const float* Max() const { return max; }
    const float* Range() const { return range; }
    const float* InvRange() const { return invRange; }
    const float* Dev() const { return dev; }
    const float* Weights() const { return weight; }

protected:
    // clone constructors either for repeated splitting or cross-validation
    TData(const TData* parent, const int& nTupClone, const bool& f_FromTop, const char* szName=NULL);
    TData(const TData* parent, const int& begin, const int& end, const bool& f_Inside, const char* szName=NULL);

    enum TStateEnum {ready, scan, read, calcI, calcII, save, detect, weights};   // state-enum
    mutable TStateEnum state;                               // current state
    mutable const char* szStatusText;                       // pointer to status text
    void SetState(TStateEnum _state) const;                 // set state

    mutable int progress;                                   // progress counter
    mutable int filesize;
    mutable int load_data_progress;

    char szLoadFileName[STS];
    mutable char szSaveFileName[STS];

    ~TData();                                               // non-public(!) destructor, call Release() instead

private:
    friend class TDataData;

    mutable int ref;                                        // reference counter
    bool f_IsClone;                                         // instance was cloned from another
    bool f_HasClone;                                        // other instance(s) have been cloned from this instance (forbids Norm())
    const TData* parent;                                    // if instance is a clone: points to parent

    // calculation parameter for feature weights (store to prevent unnecessary re-calculations)
    int w_nIntervals;                                       // # intervals used
    int w_Outcol;                                           // output column used
    bool w_f_Classification;                                // output was considered as symbolic
    bool w_f_EqualWidthBinning;                             // equal width binning was used
    TVarType* w_Type;                                       // variable types used

    void CalculateStatitics();
    void DetectSymbolicColumns();
    float* EqualFrequencyBinning(const int& j, const int& nIntervals);

    float** data;                                           // data matrix
    mutable float** _data;                                  // pointer to rows in original order
    int _nTup;                                              // # rows/tuples
    int _nVar;                                              // # columns/variables
    float unknown;                                          // value which indicates missing/unknown value
    int outcol;                                             // output column in originally loaded data file
    int* _nIntegerMaxMin;                                   // # symbols for each variable calculated as maximal minus minimal symbol
                                                            // value, '0' indicates that variable is continuous
    int* _nSymbolsFound;                                    // real # symbols for each variable, '0' indicates that variable is continuous
    TVarType* varType;                                      // type of variable (continuous or symbolic (nominal))

    // flags
    mutable bool f_Locked;
    bool f_Loaded;
    bool f_Sorted;
    bool f_Randomized;

    // -> column wise calculated statistics and feature weights
    float* mean;
    float* min;
    float* max;
    float* range;
    float* invRange;                                        // note: 1/range
    float* dev;
    float* weight;                                          // variable weights normalized to have a mean of '1'
};
#endif