// module name, handed to the IfTrueThrowTypeA() checks in this file as the originating module
static const char* szModule = "data.cpp";
// file extension "dat" - NOTE(review): not referenced in the visible part of this file, confirm where it is used
static const char* szExtension = "dat";
//------------------------------------------------------------------------------
// module data.cpp //
// //
// Class TData encapsulates data matrix with 'nTup' rows and 'nVar' //
// columns. NOTE: The first column is regarded as output! Use //
// MoveToFirst() to meet this condition! //
// See below or http://www.newty.de/pnc2/sdocu.html for more information. //
// //
// copyright (c) 1998-2003 by Lars Haendel //
// home: www.newty.de //
// //
// This program is free software and can be used under the terms of the //
// GNU licence. See header file for further information and disclaimer. //
// //
//------------------------------------------------------------------------------
// //
// CREATE: Create empty object and call Load() or let a bunch of //
// different learn and test data objects be generated for you from a //
// basis data object by the function GenerateLearnAndTestData(). //
// //
// USE: Call CalculateWeights() to calculate feature weights using //
// mutual information criterion. Access maxima, minima, weights etc. //
// by the corresponding functions, sort and randomize the order of //
// the data tuples and (most important for the use with TPnc etc.) //
// call SetOutputColumn() to move the specified column to the first //
// and right-shift all preceding columns. Use //
// OriginalColumnInFile() or ActualColumnInData() to get the //
// original or actual Id of a variable. //
// //
// NOTE: The output column is moved to first, the term 'with output' //
// means, that the index, used to access the columns, is counted with //
// output. //
// //
// File I/O: Load and save routines //
//------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------------------------------------
#include <stdlib> // due to randomize()
#include <stdio> // sprintf() ...
#include <iomanip> // setw()
#include <math> // sqrt() ...
#include "defines.h" //
#include "fileUtil.h" // readExpNoEndl() ...
#include "stdlist.h" // list template
#include "exception.h" // IfTrueThrowTypeA()
#include "ddata.h" // TDataData (will also include data.h)
#define MAX_FOR_DEF_SYMBOLIC (int) 9 // integer columns with fewer occurring symbols than this are initialized as symbolic by default
#define MAX_INTEGERS (int) 1024 // the really occurring symbols of an integer column are only counted if its value range is below this limit
// #define DB_WRITE_SPLITPOINTS // debug mode: write split points to file (default: OFF)
//----------------------------------------------------------------------------------------------------------------------
// hardcoded status texts - SetState() lets the status text pointer refer to one of these constants
// note: used to allow thread safe calls to StatusText()
static const char* szStateReady = "Ready";
static const char* szStateScanFormat = "Scanning file format";
static const char* szStateLoading = "Loading";
static const char* szStateCalcMeans = "Calculating column wise means";
static const char* szStateCalcDev = "Calculating column wise deviations";
static const char* szStateSaving = "Saving";
static const char* szStateDetectSymb = "Detecting symbolic columns";
static const char* szStateWeights = "Calculating feature weights";
static const char* szStateUnknown = "Unknown";                          // used for any state without an own text
//----------------------------------------------------------------------------------------------------------------------
// Map variable Id 'varId' (counted with output, output has Id 0) back to its column number in the loaded
// data file. 'outcol' is the index of the file column that has been moved to the front as output column.
// note: the output yields 'outcol+1', i.e. the returned column number is apparently counted from one
const int TData::OriginalColumnInFile(const int& varId, const int& outcol)
{
    // the output variable itself
    if(varId==0)
        return outcol+1;

    // variables behind the output column need one more, as the result is bigger by one than the Id there
    const int shift = (varId>outcol) ? 1 : 0;
    return varId+shift;
}
//----------------------------------------------------------------------------------------------------------------------
// Map column 'col' of the originally loaded data file to the actual variable Id inside the data matrix.
// 'outcol' is the file column which is currently used as output (and therefore has Id 0).
const int TData::ActualColumnInData(const int& col, const int& outcol)
{
    // the output column always sits at the front
    if(col==outcol)
        return 0;

    // columns in front of the output column were shifted one position to the right
    const int shift = (col<outcol) ? 1 : 0;
    return col+shift;
}
//----------------------------------------------------------------------------------------------------------------------
// variable type conversion routines
// translate a variable type (enum value) into its one-character code: 'c' continuous, 'n' symbolic (nominal),
// '-' for anything else
char TData::VarTypeToChar(const int type)
{
    if(type==cont)
        return 'c';                                     // continuous
    if(type==symb)
        return 'n';                                     // symbolic (nominal)
    return '-';                                         // no/unknown type
}
// translate a one-character code back into the variable type: 'c' -> cont, 'n' -> symb, anything else -> none
TData::TVarType TData::CharToVarType(const char c)
{
    if(c=='c')
        return cont;                                    // continuous
    if(c=='n')
        return symb;                                    // symbolic (nominal)
    return none;                                        // unknown code
}
//----------------------------------------------------------------------------------------------------------------------
// default constructor - creates an empty instance, data has to be loaded with Load() afterwards
TData::TData()
{
    // counters and dimensions
    ref    = 1;                                         // reference counter, decremented by Release()
    _nVar  = 0;                                         // # variables, i.e. columns
    _nTup  = 0;                                         // # tuples, i.e. rows
    outcol = 0;                                         // first column is output by default

    // nothing calculated so far: CalculateWeights() only accepts interval counts >= 2, thus -1 never matches
    w_nIntervals = -1;

    // flags
    f_Loaded     = false;
    f_Locked     = false;
    f_Sorted     = false;
    f_Randomized = false;
    f_IsClone    = false;
    f_HasClone   = false;

    // 'delete' filenames
    szSaveFileName[0] = '\0';
    szLoadFileName[0] = '\0';

    // no memory owned yet
    _data = NULL;                                       // row pointers in original order
    data  = _data;                                      // row pointers in actual order
    _nSymbolsFound  = NULL;
    _nIntegerMaxMin = _nSymbolsFound;
    w_Type  = NULL;
    varType = w_Type;
    dev      = NULL;                                    // column wise statistics and weights
    invRange = NULL;
    range    = NULL;
    max      = NULL;
    min      = NULL;
    mean     = NULL;
    weight   = NULL;

    SetState(ready);                                    // initialize state
}
//----------------------------------------------------------------------------------------------------------------------
// constructor for clones I - used for splitting data into two parts
// The clone takes '__nTup' rows (at most as many as the parent has) either from the top of the parent
// ('f_FromTop' is true) or from its end. Only the row pointers are copied, the rows themselves as well as the
// variable types and symbol counts are shared with the parent. 'szName' optionally replaces the parent's filename.
// note: statistics are NOT calculated here as there may be data objects with only one data tuple! The function
//       CalculateStatitics() must be called separately by the user.
TData::TData(const TData* _parent, const int& __nTup, const bool& f_FromTop, const char* szName)
{
    // link to parent and hold a reference on it
    parent = _parent;
    ref = 1;                                            // own reference counter
    parent->ref++;                                      // parent's reference counter

    // flags of a fresh clone
    f_IsClone    = true;
    f_Loaded     = true;
    f_HasClone   = false;
    f_Locked     = false;
    f_Sorted     = false;
    f_Randomized = false;

    // no own statistics or weights yet
    w_nIntervals = -1;
    w_Type   = NULL;
    dev      = NULL;
    invRange = NULL;
    range    = NULL;
    max      = NULL;
    min      = NULL;
    mean     = NULL;
    weight   = NULL;
    SetState(ready);                                    // initialize state

    // values and pointers taken over from the parent
    _nVar           = parent->_nVar;
    unknown         = parent->unknown;
    outcol          = parent->outcol;
    varType         = parent->varType;
    _nIntegerMaxMin = parent->_nIntegerMaxMin;
    _nSymbolsFound  = parent->_nSymbolsFound;

    // filename: the specified one or, if none is given, the parent's
    strcpy(szLoadFileName, szName ? szName : parent->szLoadFileName);

    // # tuples: never more than the parent has
    _nTup = (__nTup>parent->_nTup) ? parent->_nTup : __nTup;

    // index of the first parent row to take (non zero if the last '_nTup' rows are taken)
    const int first = f_FromTop ? 0 : parent->_nTup-_nTup;

    // new row pointer arrays: actual and original order
    data  = new float*[_nTup];
    _data = new float*[_nTup];
    for(int row=0;row<_nTup;row++)
    {
        float* const tuple = parent->data[first+row];   // shared row
        data[row]  = tuple;
        _data[row] = tuple;
    }
}
//----------------------------------------------------------------------------------------------------------------------
// constructor for clones II - used for splitting data for n-fold cross-validation
// If 'f_Inside' is true the clone consists of the parent's rows 'begin' up to 'end' (both included), else it
// consists of all rows outside this range. Only the row pointers are copied, the rows themselves as well as the
// variable types and symbol counts are shared with the parent. 'szName' optionally replaces the parent's filename.
// note: statistics are not calculated here, CalculateStatitics() must be called separately
TData::TData(const TData* _parent, const int& begin, const int& end, const bool& f_Inside, const char* szName)
{
    // link to parent and hold a reference on it
    parent = _parent;
    ref = 1;                                            // own reference counter
    parent->ref++;                                      // parent's reference counter

    // flags of a fresh clone
    f_IsClone    = true;
    f_Loaded     = true;
    f_HasClone   = false;
    f_Locked     = false;
    f_Sorted     = false;
    f_Randomized = false;

    // no own statistics or weights yet
    w_nIntervals = -1;
    w_Type   = NULL;
    dev      = NULL;
    invRange = NULL;
    range    = NULL;
    max      = NULL;
    min      = NULL;
    mean     = NULL;
    weight   = NULL;
    SetState(ready);                                    // initialize state

    // # tuples: size of the range or of its complement
    _nTup = end-begin+1;
    if(!f_Inside)
        _nTup = parent->nTup()-_nTup;

    // values and pointers taken over from the parent
    _nVar           = parent->_nVar;
    unknown         = parent->unknown;
    outcol          = parent->outcol;
    varType         = parent->varType;
    _nIntegerMaxMin = parent->_nIntegerMaxMin;
    _nSymbolsFound  = parent->_nSymbolsFound;

    // filename: the specified one or, if none is given, the parent's
    strcpy(szLoadFileName, szName ? szName : parent->szLoadFileName);

    // never more tuples than the parent has
    if(_nTup>parent->_nTup)
        _nTup = parent->_nTup;

    // new row pointer arrays: actual and original order
    data  = new float*[_nTup];
    _data = new float*[_nTup];

    // copy row pointers
    if(f_Inside)
    {
        for(int row=0;row<_nTup;row++)                  // rows 'begin' ... 'end'
            data[row] = parent->data[begin+row];
    }
    else
    {
        for(int row=0;row<begin;row++)                  // rows in front of the range
            data[row] = parent->data[row];
        int dst = begin;                                // rows behind the range are appended directly
        for(int src=end+1;src<parent->nTup();src++, dst++)
            data[dst] = parent->data[src];
    }

    // store row pointers in original order
    for(int row=0;row<_nTup;row++)
        _data[row] = data[row];
}
//----------------------------------------------------------------------------------------------------------------------
// use to 'delete' instance: drops one reference and destroys the object as soon as no reference is left
void TData::Release() const
{
    --ref;
    if(ref!=0)
        return;                                         // still referenced
    delete this;                                        // last reference gone
}
//----------------------------------------------------------------------------------------------------------------------
// private(!) destructor - instances are destroyed via Release()
// A clone only drops its reference on the parent, as rows, variable types and symbol counts belong to the parent.
// A non-clone releases these itself. Row pointer arrays, statistics and weights are always owned by the instance.
TData::~TData()
{
    if(!f_IsClone)
    {
        // release data matrix and per-variable information
        delete[] _nIntegerMaxMin;
        delete[] _nSymbolsFound;
        delete[] varType;
        if(data)
        {
            float** row = data;
            for(int left=_nTup;left>0;left--, row++)    // tuples/rows
                delete[] *row;
        }
    }
    else
        parent->Release();

    // row pointers (actual and original order)
    delete[] data;
    delete[] _data;

    // copy of variable types used to calculate feature weights
    delete[] w_Type;

    // statistics and weights
    delete[] weight;
    delete[] mean;
    delete[] min;
    delete[] max;
    delete[] range;
    delete[] invRange;
    delete[] dev;
}
//----------------------------------------------------------------------------------------------------------------------
// make compatible with given data object, i.e. take over the output column and the symbolic variable types
//
// 'ref' is the reference (master) object. This object must either have the same number of variables or exactly
// one variable less. In the latter case it is regarded as data without output and its variable 'j-1'
// corresponds to variable 'j' of the reference.
//
// returns: true if this data is with output, false if it is without output
// throws:  type U exception if the number of variables does not match or if a variable that is symbolic in the
//          reference contains non-integer values here
//
// NOTE(review): variables which are not symbolic in the reference keep the type they currently have - confirm
//               that this is intended.
bool TData::MakeCompatible(const TData*const& ref)
{
    const int  nRefVar    = ref->nVar();
    const bool f_NoOutput = (_nVar==nRefVar-1);
    const int  offset     = f_NoOutput ? 1 : 0;          // index offset between reference and this object

    // a) check: allow same # variables or one variable less (data without output)
    IfTrueThrowTypeU(_nVar!=ref->nVar() && !f_NoOutput, "Number of variables does not match");

    // b) set output column in data file if data is with output
    if(!f_NoOutput)
        SetOutputColumn(ref->OutputColumn());

    // c) set variable types
    // note: iterate over the variables of the REFERENCE. Using the own variable count as bound would skip the
    //       last input variable whenever this object has no output column.
    for(int j=offset;j<nRefVar;j++)
    {
        if(!ref->IsSymbolic(j))                          // only symbolic variables of the reference matter
            continue;

        if(nIntegerMaxMin(j-offset)!=0)                  // integer valued here too ...
        {
            SetVarType(j-offset, TData::symb);           // ... then set
        }
        else
        {
            ThrowTypeU("Variable types don't match!");   // ... else throw exception
        }
    }

    // d) return true if data is with output
    return !f_NoOutput;
}
// make compatible with given data description object, i.e. take over the output column and the symbolic
// variable types
//
// 'ref' is the reference (master) object. This object must either have the same number of variables or exactly
// one variable less. In the latter case it is regarded as data without output and its variable 'j-1'
// corresponds to variable 'j' of the reference.
//
// returns: true if this data is with output, false if it is without output
// throws:  type U exception if the number of variables does not match or if a variable that is symbolic in the
//          reference contains non-integer values here
//
// NOTE(review): variables which are not symbolic in the reference keep the type they currently have - confirm
//               that this is intended.
bool TData::MakeCompatible(const TDataData*const& ref)
{
    const int  nRefVar    = ref->nVar();
    const bool f_NoOutput = (_nVar==nRefVar-1);
    const int  offset     = f_NoOutput ? 1 : 0;          // index offset between reference and this object

    // a) check: allow same # variables or one variable less (data without output)
    IfTrueThrowTypeU(_nVar!=ref->nVar() && !f_NoOutput, "Number of variables does not match");

    // b) set output column in data file if data is with output
    if(!f_NoOutput)
        SetOutputColumn(ref->OutputColumn());

    // c) set variable types
    // note: iterate over the variables of the REFERENCE. Using the own variable count as bound would skip the
    //       last input variable whenever this object has no output column.
    for(int j=offset;j<nRefVar;j++)
    {
        if(!ref->IsSymbolic(j))                          // only symbolic variables of the reference matter
            continue;

        if(nIntegerMaxMin(j-offset)!=0)                  // integer valued here too ...
        {
            SetVarType(j-offset, TData::symb);           // ... then set
        }
        else
        {
            ThrowTypeU("Variable types don't match!");   // ... else throw exception
        }
    }

    // d) return true if data is with output
    return !f_NoOutput;
}
//----------------------------------------------------------------------------------------------------------------------
// set type of variable throwing an exception if nominal or ordinal types are tried to set for a non-symbolic variable
void TData::SetVarType(const int& j, const TVarType& type)
{
char szText[STS];
sprintf(szText, "Variable %d(%d) has non-integer values! Cannot set symbolic (nominal) type for it!", j+1
, OriginalColumnInFile(j));
IfTrueThrowTypeA( type==symb && !IsInteger(j), szText, "TData::SetVarType", szModule);
varType[j]=type; // ok, set type
}
//----------------------------------------------------------------------------------------------------------------------
// set output column. This column is moved to first and all preecedding columns are shifted to the right. If another
// column has alread been set as output column this is un-done. Statistics, variable types and # symbols are moved too.
void TData::SetOutputColumn(const int& colId)
{
// a) check
IfTrueThrowTypeA(f_Locked, "Function called on locked instance! Call TData::Unlock() before!", "TData::SetOutputColumn", szModule);
IfTrueThrowTypeA(!f_Loaded, "Function called on empty instance! Call TData::Load() before!", "TData::SetOutputColumn", szModule);
IfTrueThrowTypeA(colId<0 || colId>=_nVar, "Index exceeds matrix dimensions!", "TData::SetOutputColumn", szModule);
IfTrueThrowTypeA(f_IsClone, "Function called on cloned instance!", "TData::SetOutputColumn", szModule);
IfTrueThrowTypeA(f_HasClone, "Function called for instance which was used to make clones!", "TData::SetOutputColumn", szModule);
// b) return if column is already set as output column, i.e. if it's already moved to first
if(colId==outcol)
return;
// reset flags
f_Sorted = false;
// c) if another column has already been moved to first: rewind this, i.e. move it back
if(outcol!=0)
{
//c1) data matrix: move first column back to the 'outcol'-th column
for(int i=0;i<_nTup;i++)
{
float y0=data[i][0]; // save first column
for(int j=0;j<outcol;j++) // left shift all following columns up to target column
data[i][j]=data[i][j+1];
data[i][outcol]=y0; // restore
}
// c2) variable types
const TVarType varType0 = varType[0]; // save first column
for(int j=0;j<outcol;j++) // left shift all following columns up to target column
varType[j]=varType[j+1];
varType[outcol] = varType0; // restore
// c3) statistics
const float min0 = min[0]; // save first column
const float max0 = max[0];
const float mean0 = mean[0];
const float dev0 = dev[0];
const float range0 = range[0];
const float invRange0 = invRange[0];
for(int j=0;j<outcol;j++) // left shift all following columns up to target column
{
min[j] = min[j+1];
max[j] = max[j+1];
mean[j] = mean[j+1];
dev[j] = dev[j+1];
range[j] = range[j+1];
invRange[j] = invRange[j+1];
}
min[outcol] = min0; // restore
max[outcol] = max0;
mean[outcol] = mean0;
dev[outcol] = dev0;
range[outcol] = range0;
invRange[outcol] = invRange0;
// c4) # symbols
int nSymbols0 = _nIntegerMaxMin[0]; // save first column
for(int j=0;j<outcol;j++) // left shift all following columns up to target column
_nIntegerMaxMin[j]=_nIntegerMaxMin[j+1];
_nIntegerMaxMin[outcol] = nSymbols0; // restore
nSymbols0 = _nSymbolsFound[0]; // save first column
for(int j=0;j<outcol;j++) // left shift all following columns up to target column
_nSymbolsFound[j]=_nSymbolsFound[j+1];
_nSymbolsFound[outcol] = nSymbols0; // restore
}
outcol = colId; // store new output column
//d) data matrix: move 'colId'-th column to first
for(int i=0;i<_nTup;i++)
{
float y=data[i][colId]; // save 'colId'-th column
for(int j=colId;j>0;j--) // right shift all preceeding columns
data[i][j]=data[i][j-1];
data[i][0]=y; // restore
}
// e) variable types
const TVarType typeOfY = varType[colId]; // save
for(int j=colId;j>0;j--) // right shift all preceeding columns
varType[j]=varType[j-1];
varType[0] = typeOfY; // restore
// e) statistics
const float min0 = min[colId]; // save orignal values of 'colId'-th column
const float max0 = max[colId];
const float mean0 = mean[colId];
const float dev0 = dev[colId];
const float range0 = range[colId];
const float invRange0 = invRange[colId];
for(int j=colId;j>0;j--) // right shift all preceeding columns
{
min[j] = min[j-1];
max[j] = max[j-1];
mean[j] = mean[j-1];
dev[j] = dev[j-1];
range[j] = range[j-1];
invRange[j] = invRange[j-1];
}
min[0] = min0; // restore
max[0] = max0;
mean[0] = mean0;
dev[0] = dev0;
range[0] = range0;
invRange[0] = invRange0;
// f) # symbols
int nSymbols0 = _nIntegerMaxMin[colId]; // save
for(int j=colId;j>0;j--) // right shift all preceeding columns
_nIntegerMaxMin[j]=_nIntegerMaxMin[j-1];
_nIntegerMaxMin[0] = nSymbols0; // restore
nSymbols0 = _nSymbolsFound[colId]; // save
for(int j=colId;j>0;j--) // right shift all preceeding columns
_nSymbolsFound[j]=_nSymbolsFound[j-1];
_nSymbolsFound[0] = nSymbols0; // restore
}
//----------------------------------------------------------------------------------------------------------------------
// detect which columns are integer valued and set the default variable types
// For each column the function stores
//   - the integer range 'max-min+1' (zero if the column contains any non-integer value),
//   - the # of really occurring symbols (only counted if the integer range is below MAX_INTEGERS, else zero),
//   - the default type: symbolic (nominal) if between 1 and MAX_FOR_DEF_SYMBOLIC-1 symbols occur, else continuous.
// note: the column wise minima and maxima are read here
void TData::DetectSymbolicColumns()
{
    // check
    IfTrueThrowTypeA(!f_Loaded, "Function called on empty instance! Call TData::Load() before!"
                   , "TData::DetectSymbolicColumns", szModule);

    // allocate memory
    _nIntegerMaxMin = new int[_nVar];                   // integer range per variable (zero if non-integer)
    varType         = new TVarType[_nVar];              // type of each variable
    _nSymbolsFound  = new int[_nVar];                   // # really occurring symbols per variable

    // all columns are independent of each other, thus process them one by one
    for(int col=0;col<_nVar;col++)
    {
        // a) integer test: stop at the first tuple with a non-integer value
        bool f_AllInteger = true;
        for(int row=0;row<_nTup && f_AllInteger;row++)
            if(floor(data[row][col])!=data[row][col])
                f_AllInteger = false;

        // b) integer range calculated as maximal minus minimal value
        if(f_AllInteger)
            _nIntegerMaxMin[col]=max[col]-min[col]+1;
        else
            _nIntegerMaxMin[col] = 0;

        // c) count really occurring symbols if the column is integer valued and its range is not too big
        const int nRange = _nIntegerMaxMin[col];
        int nOccurring = 0;
        if(nRange>0 && nRange<MAX_INTEGERS)
        {
            bool* f_Seen = new bool[nRange];            // one flag per possible symbol
            for(int s=0;s<nRange;s++)
                f_Seen[s] = false;

            for(int row=0;row<_nTup;row++)
            {
                const int symbol = (int) (data[row][col]-min[col]);
                if(!f_Seen[symbol])                     // count each symbol once
                {
                    f_Seen[symbol] = true;
                    nOccurring++;
                }
            }
            delete[] f_Seen;                            // release
        }
        _nSymbolsFound[col] = nOccurring;

        // d) default type: symbolic (nominal) for integer columns with not too many different symbols
        const bool f_FewSymbols = (nOccurring>0 && nOccurring<MAX_FOR_DEF_SYMBOLIC);
        varType[col] = f_FewSymbols ? symb : cont;
    }
}
//----------------------------------------------------------------------------------------------------------------------
// set state: stores the new state and lets the status text point to the matching hardcoded text
void TData::SetState(TStateEnum _state) const
{
    state = _state;                                     // a) store new state (obsolete ?)

    const char* szText = szStateUnknown;                // b) look up status text, default for unknown states
    switch(_state)
    {
        case ready  : szText = szStateReady;      break;
        case scan   : szText = szStateScanFormat; break;
        case read   : szText = szStateLoading;    break;
        case calcI  : szText = szStateCalcMeans;  break;
        case calcII : szText = szStateCalcDev;    break;
        case save   : szText = szStateSaving;     break;
        case detect : szText = szStateDetectSymb; break;
        case weights: szText = szStateWeights;    break;
        default     : break;
    }
    szStatusText = szText;                              // c) set status text (pointer)
}
//----------------------------------------------------------------------------------------------------------------------
// load data matrix from file - # columns and # rows are detected automatically
//
// szFilename  name of the data file, it is also stored as load and save filename
// f_Stop      optional pointer to a stop flag, loading is aborted with an exception as soon as it is set
// ddata       optional, used to check if the loaded data has the correct # variables (equal or one less)
//
// The # columns is taken from the first non-empty/non-commented row, then all rows are read into a temporary
// list, copied into the data matrix and finally the column statistics are calculated.
// throws a type A exception if called twice and type U exceptions on read errors, wrong # variables or user break
// note: 'f_Loaded' is set before reading starts and stays set if loading fails
void TData::Load(const char* szFilename, const bool* f_Stop, const TDataData*const& ddata/*=NULL*/)
{
    ifstream file(szFilename, ios::in);                 // try to open file for reading

    // a) check
    IfTrueThrowTypeA(f_Loaded,"Function cannot be called twice in the lifetime of an instance!","TData::Load",szModule);
    IfTrueThrowTypeU(!file, "Read Error: Unable to open data file '%s'!", szFilename);

    // b) initialize
    f_Loaded = true;
    load_data_progress=0;
    int line = 1;
    strcpy(szLoadFileName, szFilename);                 // store filename
    strcpy(szSaveFileName, szFilename);
    streampos begin = file.tellg();                     // preserve actual stream position (begin of file)

    // c) detect # columns, scanning the first non-empty/non-commented row
    SetState(TData::scan);                              // set state
    try
    {
        line += skipwsEx(file);                         // remove whitespaces and comments
        while(skipws(file)==0)                          // proceed until linefeed
        {
            Read(file, (float) 1.0);                    // read value
            _nVar++;                                    // count columns
        }
    }
    catch(int errNo)                                    // exception handling
    {
        char szText[STS];
        sprintf(szText, "Detecting number of columns - line %d: %s", line, ::GetLastError(errNo));
        ThrowTypeU(szText);
    }

    // if given: check # variables in data file if it matches (i.e. is equal or one less) the TDataData object
    if(ddata)
    {
        IfTrueThrowTypeU(_nVar!=ddata->nVar()&&!(_nVar==ddata->nVar()-1)
                       , "Number of variables does not match! File loading aborting!");
    }
    else                                                // else: learn data is loaded, min. variable count is 2
    {
        IfTrueThrowTypeU(_nVar < 2, "There must be a minimum of one input and one output variable\nin data file!"
                                    "File loading aborting! ");
    }

    // determine file size for progress indication
    file.seekg(0L, ios::end);                           // position to end
    filesize = file.tellg();
    file.seekg(begin);                                  // restore stream position

    // d) read data
    SetState(TData::read);
    int j = 0;                                          // column counter, initialized as it is used in the
                                                        // error text even if the very first read step fails
    line = 1;
    TStdList<float*> list;                              // temporary list to read rows
    list.SetName("TData::Load() list");
    try
    {
        line += skipwsEx(file);                         // remove whitespaces and comments

        // note: peek() must be compared as int. Casting it to char first never equals EOF if char is unsigned
        //       and mistakes the byte 0xFF for the end of file if char is signed.
        while(file.peek() != EOF)                       // while next character in file is not eof (end of file)
        {
            // process control
            load_data_progress = 100*file.tellg()/filesize;

            float*& row = list.Ins();                   // insert new list element and get reference to it
            row = new float[_nVar];                     // allocate memory

            // read all columns
            for(j=0; j<_nVar;j++)
                row[j] = ReadExpNoEndl(file, (float) 0);   // read value

            // remove whitespaces and comments throwing an exception if no endl occurs
            line += skipwsExEndl(file);

            if(f_Stop)                                  // if given
                if(*f_Stop)                             // check stop flag
                {
                    DeleteEntries(&list);               // release the rows read so far, nobody else owns them
                    ThrowTypeU("User break! File loading aborted!");   // and throw exception if set
                }
        }
    }
    catch (int errNo)                                   // exception handling
    {
        DeleteEntries(&list);                           // release memory in list
        char szText[STS];                               // compose error text
        sprintf(szText,"Line %d - Row %d - Column %d - %s", line, list.Size(), j+1, ::GetLastError(errNo));
        ThrowTypeU(szText);
    }

    // e) allocate memory for data matrix and copy rows from list to array (list is local variable and will be released)
    _nTup = list.Size();                                // get # tuples (rows)
    data = new float*[_nTup];                           // allocate
    list.Reset();                                       // reset list to top
    for(int i=0;i<_nTup;i++)
    {
        list.Next();                                    // position forward
        data[i] = list.Get();                           // copy
    }

    // f) preserve pointers to rows in original order
    _data = new float*[_nTup];                          // allocate
    for(int i=0;i<_nTup;i++)
        _data[i] = data[i];                             // copy
    file.close();

    // g) calculate column wise statistics and detect symbolic columns
    CalculateStatitics();
    SetState(TData::ready);
}
//----------------------------------------------------------------------------------------------------------------------
// write data matrix to file
//
// szFilename  name of the output file
// f_Stop      optional pointer to a stop flag, writing of the tuples ends as soon as it is set
//
// The file starts with a commented header (original file, dimensions, output column and one line of statistics
// per column) followed by one line per tuple.
// throws a type A exception if the instance is empty or has no statistics and a type U exception if the file
// cannot be opened
// note: all preconditions are checked BEFORE the file is opened, as opening it for output would already
//       overwrite an existing file
void TData::Save(const char* szFilename, const bool* f_Stop /* = NULL */) const
{
    // checks
    IfTrueThrowTypeA(!f_Loaded, "Function called on empty instance! Call TData::Load() before!","TData::Save", szModule);
    // statistics are written to the header: clones do not have them until CalculateStatitics() has been called
    IfTrueThrowTypeA(!mean, "Statistics not calculated! Call TData::CalculateStatitics() before!","TData::Save", szModule);

    ofstream file(szFilename, ios::out);                // try to open file for output
    IfTrueThrowTypeU(!file, "Write Error: Unable to open data file '%s'", szFilename);

    // write header
    file << ComChar << " Original File = " << szLoadFileName << endl;
    file << ComChar << " #Rows = " << _nTup << endl;
    file << ComChar << " #Columns = " << _nVar << endl;
    file << ComChar << " Output Column = " << (outcol+1) << endl;

    SetState(save);                                     // set state

    // write format description
    file << ComChar << endl << ComChar << " Statistics on Columns:" << endl;
    file << ComChar << " Column Minimum Maximum Mean Deviation Weight #Symbols and Type";
    file << endl;
    for(int j=0;j<_nVar;j++)                            // for all variables
    {
        file << ComChar << setw(7) << OriginalColumnInFile(j) << " ";                 // original column in data file
        file << setw(W_FLOAT) << min[j] << " " << setw(W_FLOAT) << max[j] << " ";     // minimum and maximum
        file << setw(W_FLOAT) << mean[j] << " " << setw(W_FLOAT) << dev[j] << " ";    // mean and deviation
        file << setw(W_FLOAT) << weight[j] << " ";                                    // weight
        file << setw(W_SYMBOL) << _nIntegerMaxMin[j] << "/";                          // # integer values min. up to max.
        file << setw(W_SYMBOL) << _nSymbolsFound[j] << " ";                           // # symbols
        file << setw(W_SYMBOL) << VarTypeToChar(varType[j]) << endl;                  // variable type
    }
    file << endl;

    // set column widths
    int* w = new int[_nVar];
    for(int j=0;j<_nVar;j++)
        if(IsSymbolic(j))
            w[j] = W_SYMBOL;
        else
            w[j] = W_FLOAT;

    // write all tuples
    for(int i=0;i<_nTup;i++)
    {
        for(int j=0;j<_nVar;j++)                        // over all columns
            file << setw(w[j]) << data[i][j] << " ";
        file << endl;                                   // linefeed

        // check stop flag
        if(f_Stop)
            if(*f_Stop)
            {
                file << endl << ComChar << " Error: Userbreak! Process terminated!" << endl;
                break;
            }
    }

    // release memory, close file and set state
    delete[] w;
    file.close();
    SetState(ready);
}
//----------------------------------------------------------------------------------------------------------------------
// calculate statistics on columns
// For each column mean, minimum, maximum, range, inverse of range (1 if the range is zero) and standard
// deviation (normalized with '_nTup-1') are calculated. All weights are reset to 1 and, if the instance is not a
// clone, the symbolic columns are detected afterwards.
// throws a type A exception if the instance is empty and a type U exception if there are less than 2 tuples
// note: the function may be called repeatedly, arrays of a previous call are released first
void TData::CalculateStatitics()
{
    // a) checks
    IfTrueThrowTypeA(!f_Loaded, "Function called on empty instance! Call TData::Load() before!"
                   , "TData::CalculateStatistics", szModule);
    IfTrueThrowTypeU(_nTup<2, "Unable to calculate statistics! Data file must contain at least 2 data tuples!");

    // b) release arrays of a previous call (pointers are initialized with NULL, thus this is always safe) ...
    delete[] mean;      mean = 0;
    delete[] max;       max = 0;
    delete[] min;       min = 0;
    delete[] dev;       dev = 0;
    delete[] range;     range = 0;
    delete[] invRange;  invRange = 0;
    delete[] weight;    weight = 0;

    // ... and allocate memory
    mean = new float[_nVar];                            // column wise calculated statistics
    max = new float[_nVar];
    min = new float[_nVar];
    dev = new float[_nVar];
    range = new float[_nVar];
    invRange = new float[_nVar];
    weight = new float[_nVar];                          // feature weights

    // c) calculate mean, maximum and minimum for each column
    SetState(calcI);                                    // set state
    for(int j=0;j<_nVar;j++)                            // over all columns
    {
        dev [j] = 0;                                    // ini for d)
        mean[j] = 0;                                    // ini
        max [j] = data[0][j];
        min [j] = data[0][j];
        for(int i=0;i<_nTup;i++)                        // over all tuples
        {
            mean[j]+= data[i][j];                       // sum up
            if(data[i][j] > max[j])                     // search maximum and minimum
                max[j] = data[i][j];
            else
                if(data[i][j] < min[j])
                    min[j] = data[i][j];
        }
        mean [j] = mean[j]/_nTup;                       // calculate mean
        range [j] = max[j]-min[j];                      //     "     range
        if(range[j]!=0)                                 //     "     inverse of range
            invRange[j] = 1/range[j];
        else
            invRange[j] = 1;                            // constant column: avoid division by zero
    }

    // d) calculate standard deviation
    SetState(calcII);
    for(int j=0;j<_nVar;j++)                            // over all columns
        for(int i=0;i<_nTup;i++)                        // over all tuples
            dev[j] += (mean[j]-data[i][j])*(mean[j]-data[i][j]);   // sum up squared differences
    for(int j=0;j<_nVar;j++)
        dev[j] = sqrt(fabs(dev[j]/(_nTup-1)));          // '_nTup-1' is at least 1 due to the check in a)
                                                        // ? use fabs() ? to cope with rounding effects ?

    // e) detect symbolic columns
    SetState(detect);
    if(!f_IsClone)                                      // only call if instance is not a clone
        DetectSymbolicColumns();

    // f) initialize weights
    for(int j=0;j<_nVar;j++)                            // over all columns
        weight[j]=1;
    SetState(ready);
}
//----------------------------------------------------------------------------------------------------------------------
// bring the tuples into a random order. The order is derived from the original order of the rows using the
// id's RandomId() delivers for 'seed'.
// throws a type A exception if the instance is locked or empty
void TData::Randomize(const unsigned int seed/*=1*/)
{
    // a) check
    IfTrueThrowTypeA(f_Locked, "Function called on locked instance! Call TData::Unlock() before!", "TData::Randomize", szModule);
    IfTrueThrowTypeA(!f_Loaded, "Function called on empty instance! Call TData::Load() before!", "TData::Randomize", szModule);

    // b) set flags
    f_Sorted     = false;
    f_Randomized = true;

    // c) get random id's and pick the rows accordingly
    int* /*cr*/ order = RandomId(_nTup, seed);
    int row = 0;
    while(row<_nTup)
    {
        data[row] = _data[order[row]];
        row++;
    }
    delete[] order;                                     // release id's
}
//----------------------------------------------------------------------------------------------------------------------
// restore original order of tuples, i.e. copy back the row pointers which were preserved in original order
// throws a type A exception if the instance is locked or empty
void TData::Reorder()
{
    // a) check
    IfTrueThrowTypeA(f_Locked, "Function called on locked instance! Call TData::Unlock() before!", "TData::Reorder", szModule);
    IfTrueThrowTypeA(!f_Loaded, "Function called on empty instance! Call TData::Load() before!", "TData::Reorder", szModule);

    // note: there was a problem if data is sorted and then the output column is changed: the sort flag is reset
    //       then but the tuples are not in their original order! Thus the flags are not checked anymore, the
    //       rows are always copied back.

    // b) set flags
    f_Randomized = false;
    f_Sorted = false;

    // c) reorder rows
    for(int i=0;i<_nTup;i++)                            // copy back
        data[i] = _data[i];
}
//----------------------------------------------------------------------------------------------------------------------
// sort tuples regarding the 'colId'-th column - note: at the moment 'colId' must be 0!
// in case of equality the tuples remain like they were!
void TData::Sort(const int& colId)
{
if(f_Sorted)
return;
// a) check
IfTrueThrowTypeA(f_Locked, "Function called on locked instance! Call TData::Unlock() before!", "TData::Sort", szModule);
IfTrueThrowTypeA(!f_Loaded, "Function called on empty instance! Call TData::Load() before!", "TData::Sort", szModule);
IfTrueThrowTypeA(colId!=0, "At the moment function can only be called with 'colId=0'!", "TData::Sort", szModule);
// b) set flags
f_Randomized = false;
f_Sorted = true;
// c) get id's sorted regarding column 0 - note: this way ensures that Sort() will always result in the same order
TIdVec* vec = new TIdVec[_nTup];
for(int i=0;i<_nTup;i++)
{
vec[i].a = _data[i][0];
vec[i].b = i;
vec[i].id= i;
}
qsort((void*) vec, _nTup, sizeof(vec[0]), IdVecCmpDes); // sort in decreasing(!) order
// d) sort
for(int i=0;i<_nTup;i++)
data[_nTup-1-i]=_data[vec[i].id]; // note: tuples were sorted in decreasing order
delete[] vec; // release
}
//----------------------------------------------------------------------------------------------------------------------
// equal frequency binning
// Calculates the split points (left borders) of 'nIntervals' intervals for the 'j'-th variable so that each
// interval gets about the same number of tuples.
//
// returns: new array of 'nIntervals' floats, the caller has to release it with delete[]. Position 0 holds the
//          minimum of the variable, position 'k' the middle between the two sorted values around the 'k'-th border.
//
// note: if there are fewer tuples than intervals a border can fall in front of the first value, the minimum is
//       used as split point then. If there is no tuple at all, all split points are zero.
float* /*cr*/ TData::EqualFrequencyBinning(const int& j, const int& nIntervals)
{
float* splits=new float[nIntervals];

// nothing to sort without tuples
if(_nTup<1)
{
   for(int k=0;k<nIntervals;k++)
      splits[k]=0;
   return splits;
}

// copy values of j-th variable to vector
float* x=new float[_nTup];                              // allocate
for(int i=0;i<_nTup;i++)
   x[i]=data[i][j];                                     // copy

// sort
qsort((void*) x, _nTup, sizeof(x[0]), FloatCmpAsc);

splits[0] = x[0];                                       // set left border to minimum
for(int k=1;k<nIntervals;k++)
{
   int id=k*_nTup/((float) nIntervals);                 // index of first value of the k-th interval
   if(id>_nTup-1)                                       // keep index inside the vector
      id=_nTup-1;
   if(id<1)
      splits[k]=x[0];                                   // no value in front of the border: use minimum
   else
      splits[k]=(x[id]+x[id-1])/2;                      // set split/left border in the middle between two points
}
delete[] x;                                             // release
#ifdef DB_WRITE_SPLITPOINTS
file << "Variable " << j << " " << splits[0] << " ";
for(int k=1;k<nIntervals;k++)
file << splits[k] << " ";
file << endl;
#endif
return splits;
}
// convert splits/left borders of discretization intervals to midpoints (in place)
//
// splits      float array containing at position 'k' the left border of the 'k+1'-th interval
// nIntervals  # intervals, i.e. size of 'splits'
// max         right border of the last interval
//
// note: nothing is done if there is no interval or no array, the last bin would be written in front of the
//       array otherwise
void SplitsToMidPoints(float*& splits, const int& nIntervals, const float& max)
{
    if(splits==0 || nIntervals<1)
        return;

    // set midpoints as half of the range of left borders of current and next bin
    // note: position 'k+1' still holds the left border when position 'k' is converted
    for(int k=0;k<nIntervals-1;k++)
        splits[k] = splits[k] + (splits[k+1]-splits[k])/2.0;

    // last bin: its right border is 'max'
    splits[nIntervals-1] = splits[nIntervals-1] + (max-splits[nIntervals-1])/2.0;
}
// quantization for equal frequency binning
// returns the index of the interval 'value' belongs to: the index in front of the first left border (starting
// with the second one) that is bigger than 'value', or the last interval 'nInt-1' if there is no such border
int Quant(const float value, const float* splits, const int nInt)
{
    int next = 1;                                       // border of the next interval to test
    while(next<nInt && !(value<splits[next]))
        ++next;

    if(next<nInt)
        return next-1;                                  // value is left of border 'next'
    return nInt-1;                                      // value is not left of any border
}
//----------------------------------------------------------------------------------------------------------------------
// calculate feature weights using mutual information criterion
//
// For each input variable the mutual information between the discretized variable and the discretized output
// (column 0) is calculated and stored in 'weight[]'. Afterwards the weights are standardized to have a mean of '1'
// and rounded to WEIGHT_PRECISION decimals.
//
//    nIntervals            # discretization intervals for continuous variables, must be >= 2
//    f_Classification      if set, the # output intervals is taken from '_nIntegerMaxMin[0]' instead of
//                          'nIntervals'. Must not be set if the output is continuous
//    f_EqualWidthBinning   if set, continuous inputs are quantized using their min/max values, else using the
//                          split points calculated by EqualFrequencyBinning()
//
// returns: false if nothing was done because weights were already calculated with identical parameters, output
//          column and input variable types; true if new feature weights were calculated.
// throws:  via IfTrueThrowTypeA() if the instance is empty, if 'nIntervals' is smaller than 2 or if the
//          classification flag is set while the output is continuous.
bool TData::CalculateWeights(const int& nIntervals, const bool& f_Classification, const bool& f_EqualWidthBinning)
{
    // 1. check
    IfTrueThrowTypeA(!f_Loaded, "Function called on empty instance! Call TData::Load() before!", "TData::CalculateWeights", szModule);
    IfTrueThrowTypeA(nIntervals<2, "Function called with illegal value for 'nIntervals'!", "TData::CalculateWeights", szModule);
    IfTrueThrowTypeA(_nIntegerMaxMin[0]==0 && f_Classification,
                     "Current output is continuous! Classification flag must not be set!", "TData::CalculateWeights", szModule);

    // ToDo: what happens if a variable is symbolic but there is only one symbol ??

    bool f_EqualTypes = (w_Type!=NULL);         // check if input (!) variable types have changed
    if(w_Type)
    {
        for(int j=1;j<_nVar;j++)
            if(w_Type[j]!=varType[j])
                f_EqualTypes=false;
    }
    else
        w_Type = new TVarType[_nVar];           // allocate, note: 'f_EqualTypes' is false in this case

    // return if weights already calculated
    if(w_f_Classification == f_Classification && f_EqualTypes && nIntervals == w_nIntervals && outcol == w_Outcol
       && w_f_EqualWidthBinning == f_EqualWidthBinning)
        return false;

    SetState(TData::weights);                   // set state

    w_nIntervals          = nIntervals;         // store calculation parameters
    w_Outcol              = outcol;
    w_f_Classification    = f_Classification;
    w_f_EqualWidthBinning = f_EqualWidthBinning;
    memcpy(w_Type, varType, sizeof(varType[0])*_nVar);    // variable types

    // 2.
    weight[0] = 1;                              // note: output doesn't have weight

    // set # output intervals
    int nOpI = nIntervals;
    if(f_Classification)
        nOpI = _nIntegerMaxMin[0];

    float** mat    = new float*[nOpI];          // new probability matrix (row pointers)
    float*  pClass = new float[nOpI];           // class probabilities

    // 3. calculate weight for each input(!) variable
    for(int j=1;j<_nVar;j++)
    {
        weight[j]=0;

        // a) set # input intervals for j-th variable
        int nIpI = nIntervals;
        const float* /*cr*/ splits = NULL;      // only set for continuous variables with equal frequency binning
        if(IsSymbolic(j))
            nIpI = _nIntegerMaxMin[j];
        else
            if(!f_EqualWidthBinning)
                splits = EqualFrequencyBinning(j, nIpI);    // calculate split points for continuous variables

        if(nIpI==1)
            continue;   // if there's only one interval -> weight is zero -> continue with next variable
                        // note: 'splits' is always NULL here as 'nIntervals' is at least 2

        // b) new and reset feature probabilities
        float* pFeature = new float[nIpI];
        for(int t=0;t<nIpI;t++)
            pFeature[t]=0;

        // c) new and reset probability matrix
        for(int s=0;s<nOpI;s++)
        {
            mat[s] = new float[nIpI];           // new row
            for(int t=0;t<nIpI;t++)             // reset
                mat[s][t]=0;
        }

        // d) apportion data tuples, i.e. count occurences
        for(int i=0;i<_nTup;i++)
        {
            const int s = Quant(data[i][0], max[0], min[0], nOpI, f_Classification);    // output interval

            int t;                                                                      // input interval
            if(splits)          // continuous variable with equal frequency binning: use split points
                t = Quant(data[i][j], splits, nIpI);
            else                // symbolic variable or equal width binning: use max/min
                t = Quant(data[i][j], max[j], min[j], nIpI, IsSymbolic(j));

            mat[s][t]++;
        }

        // e) turn occurences in probabilities and calculate class and feature probabilities
        for(int s=0;s<nOpI;s++)
        {
            pClass[s]=0;                        // ini
            for(int t=0;t<nIpI;t++)
            {
                mat[s][t] /= _nTup;
                pClass[s]   += mat[s][t];       // calculate s-th class probability: sum up row
                pFeature[t] += mat[s][t];       // calculate t-th feature probability: sum up column
            }
        }

        // f) sum up mutual information and set weight
        //    note: empty cells are skipped as log(0) is not defined, their contribution is zero
        for(int s=0;s<nOpI;s++)
            for(int t=0;t<nIpI;t++)
                if(mat[s][t]!=0)
                    weight[j]+=mat[s][t]*log( mat[s][t]/pClass[s]/pFeature[t] );

        // g) release
        delete[] pFeature;
        for(int s=0;s<nOpI;s++)
            delete[] mat[s];
        delete[] splits;                        // note: may be NULL
    }

    // 4. release
    delete[] mat;
    delete[] pClass;

    // 5. standardize weights to have mean of '1' and round them to defined precision
    // a) calculate mean of weights
    float sum=0;
    for(int j=1;j<_nVar;j++)
        sum+=weight[j];
    sum/=_nVar-1;

    // b) check the sum of weights differs from zero. note: All zero weights may happen if the output's entropy is
    //    already zero because all output values are the same. Then, of course, the transinformations will also be zero.
    if(sum==0)
    {
        for(int j=1;j<_nVar;j++)
            weight[j] = 1;                      // set dummy value   note: learning however should be useless :-)
    }
    else
    {
        // c) standardize and round to defined precision
        //    note: pow() returns a floating point value which may be slightly smaller than the exact power of ten.
        //    Thus round to nearest instead of truncating when converting the factor to integer.
        const int fac = (int) floor(pow(10.0, WEIGHT_PRECISION) + 0.5);
        for(int j=1;j<_nVar;j++)
            weight[j] = floor(weight[j]/sum * fac + 0.5) / fac;
    }

    SetState(TData::ready);                     // set state
    return true;                                // new feature weights were calculated
}
//----------------------------------------------------------------------------------------------------------------------
// used for validation purposes only - overwrite output values
// note: the output (column 0) of each tuple is replaced by the 1-based position of the tuple, i.e. 1, 2, ..., nTup
void TData::Hack()
{
    int row = 0;
    while(row<_nTup)
    {
        const int position = row+1;             // 1-based position of the tuple
        data[row][0] = position;
        ++row;
    }
}
//----------------------------------------------------------------------------------------------------------------------
// generate learn/test data and save it if basename is specified
void TData::GenerateLearnAndTestData(const unsigned int& seed, TData**& data_L, TData**& data_T, const int& N_R,
const bool& f_CV, const int& N_L, const int& N_T, const char*const& szDataBn,
const char*const& szDir, const int& N_Bins, const bool& f_Regression,
const bool& f_EqualWidthBinning, const int& optId/*=-1*/, const int& nChar2/*=0*/)
{
// a) new fields
data_L = new TData*[N_R];
data_T = new TData*[N_R];
// b) initialize random seeds for randomization of data tuples
srand(seed); // set random number generator to specified state
unsigned int* seeds = new unsigned int[N_R]; // store random seeds
for(int i=0;i<N_R;i++)
seeds[i] = abs(random(INT_MAX));
// c) prepare
bool f_WasSorted = IsSorted(); // remember if data was sorted/locked to restore at the end
bool f_WasLocked = IsLocked();
Unlock(); // unlock data to allow randomization
if(f_CV)
Randomize(seeds[0]); // initial randomization of data tuples, necessary for random n-fold cross validation
// d) generate learn and test data
int nChar = log(N_R)/log(10)+1;
for(int i=0;i<N_R;i++)
{
// generate (file) names
char name_L[STS], name_T[STS];
if(optId!=-1)
{
// note: add current optimization id
sprintf(name_L, "%s%s_CV%0*d_Run%0*d_L.%s", szDir, szDataBn, nChar2, optId+1, nChar, i+1, szExtension);
sprintf(name_T, "%s%s_CV%0*d_Run%0*d_T.%s", szDir, szDataBn, nChar2, optId+1, nChar, i+1, szExtension);
}
else
{
sprintf(name_L, "%s%s_CV%0*d_L.%s", szDir, szDataBn, nChar, i+1, szExtension);
sprintf(name_T, "%s%s_CV%0*d_T.%s", szDir, szDataBn, nChar, i+1, szExtension);
}
// generate data
if(!f_CV) // n-fold repetition
{
Randomize(seeds[i]); // randomize order of data tuples
data_L[i] = new TData(this, N_L, true, name_L); // take first <N_L> tuples as learn data
data_T[i] = new TData(this, N_T, false, name_T); // take last <N_T> tuples as test data
}
else // n-fold cross-validation
{
data_L[i] = new TData(this, i*N_T, (i+1)*N_T-1, false, name_L); //
data_T[i] = new TData(this, i*N_T, (i+1)*N_T-1, true, name_T); //
}
data_L[i]->CalculateStatitics(); // statistic for learn data
data_L[i]->Sort(0); // sort
data_L[i]->CalculateWeights(N_Bins, !f_Regression, f_EqualWidthBinning); // calculate weights
data_L[i]->Lock(); // lock
}
// release seeds
delete[] seeds;
// re-sort data and lock it again if necessary
if(f_WasSorted) Sort(0);
if(f_WasLocked) Lock();
// e) save generated data if basename is specified
if(szDataBn[0]=='\0')
return;
for(int i=0;i<N_R;i++)
{
data_L[i]->Save(data_L[i]->LoadFileName()); // save data
data_T[i]->Save(data_T[i]->LoadFileName());
}
#ifdef VALIDATION_2
data_T[0]->Hack(); // overwrite output
#endif
}