static const char* szModule = "ddata.cpp";
//------------------------------------------------------------------------------
// module ddata.cpp //
// //
// Class TDataData encapsulates some necessary information from the learn //
// data file like # variables, minima, maxima, feature, weights ... //
// Counted object: use Release() to delete! //
// See below or http://www.newty.de/pnc2/sdocu.html for more information. //
// //
// copyright (c) 1998-2003 by Lars Haendel //
// home: www.newty.de //
// //
// This program is free software and can be used under the terms of the //
// GNU licence. See header file for further information and disclaimer. //
// //
//------------------------------------------------------------------------------
// //
// CREATE: Pass a TData object which is used to initialize/copy //
// //
// USE: You get maxima, minima, weights etc. just like from a TData //
// object. //
// //
// File I/O: Load and save routines //
//------------------------------------------------------------------------------
#include <iomanip> // due to: setw()
#include "fileUtil.h" // readExpNoEndl() ...
#include "ddata.h" // TDataData (will also include data.h)
//----------------------------------------------------------------------------------------------------------------------
// Dummy constructor: creates an empty instance that is meant to be filled by Load().
// Every array pointer that the destructor passes to delete[] is set to NULL here, so destroying
// an instance is well defined even if Load() is never called or throws before Allocate() ran.
TDataData::TDataData()
{
   ref = 1;                         // counted object, initialize reference counter
   f_Loaded = false;                // nothing loaded yet, Load() may still be called once

   // set all array pointers to NULL (delete[] on a NULL pointer is a no-op)
   min = weight = NULL;
   normFac = NULL;                  // was left uninitialized although the destructor deletes it
   _nIntegerMaxMin = NULL;
#ifndef RELEASE                     // obsolete in release versions
   _nSymbolsFound = NULL;
#endif
   type = NULL;

   // scalar members: start from defined values
   _nVar = _nInp = outcol = 0;
   _nTup = 0;                       // was left uninitialized although Save() writes it
   meanY = 0;
   strcpy(szFilename, "unknown");
}
//----------------------------------------------------------------------------------------------------------------------
// Normal constructor: initialize from the given TData object.
//   data       - learn data object; # variables, # tuples, output column, file name, output mean,
//                minima, variable types and weights are copied from it
//   normType   - selects how the normalization factors are calculated:
//                ByFourSigma, ByRange or ByAverageDistance
//   overlapFac - used as normalization factor of nominal variables (ByFourSigma and ByRange only)
// Note: 'data' is dereferenced without a NULL check.
TDataData::TDataData(const TData*const& data, const TNormType& normType, const float& overlapFac)
{
   // a) initialize/copy
   ref      = 1;                                   // counted object, initialize reference counter
   f_Loaded = true;                                // a constructed instance must not be Load()ed again
   Allocate(data->nVar());                         // allocate memory (also stores _nVar)
   _nInp  = data->nVar()-1;
   _nTup  = data->nTup();
   outcol = data->outcol;
   // NOTE(review): unbounded copy - assumes szFilename is large enough for any load file name; confirm
   strcpy(szFilename, data->LoadFileName());
   meanY  = data->Mean()[0];                       // output mean

   // b) copy minima, variable types, weights etc. from the given data object
   memcpy(min,             data->Min(),           sizeof(float)*_nVar);
   memcpy(type,            data->varType,         sizeof(TData::TVarType)*_nVar);
   memcpy(_nIntegerMaxMin, data->_nIntegerMaxMin, sizeof(int)*_nVar);
   memcpy(weight,          data->Weights(),       sizeof(float)*_nVar);
#ifndef RELEASE                                    // obsolete in release versions
   memcpy(_nSymbolsFound,  data->_nSymbolsFound,  sizeof(int)*_nVar);
#endif

   // c) calculate normalization factors
   if(normType==ByFourSigma)
   {
      // c1) normalize continuous variables by four times their standard deviation (four sigma)
      for(int j=0;j<_nVar;j++)
      {
         if(NOMINAL_VARIABLES && IsSymbolic(j))    // nominal: use overlap factor/weight
            normFac[j] = overlapFac;
         else if(data->Dev()[j]!=0)                // continuous: divide by four sigma
            normFac[j] = 1/(4*data->Dev()[j]);
         else                                      // prevent division by zero; use default value 1
            normFac[j] = 1;
      }
   }
   else if(normType==ByRange)
   {
      // c2) normalize continuous variables by their range
      for(int j=0;j<_nVar;j++)
      {
         if(NOMINAL_VARIABLES && IsSymbolic(j))    // nominal: use overlap factor/weight
            normFac[j] = overlapFac;
         else if(data->Range()[j]!=0)              // continuous: divide by range
            normFac[j] = 1/data->Range()[j];
         else                                      // prevent division by zero; use default value 1
            normFac[j] = 1;
      }
   }
   else if(normType==ByAverageDistance)
   {
      // c3) normalize variables (regardless of type) by the average distance of the learn tuples
      //     to each other
      const int nTup = data->nTup();               // abbreviation

      // Number of distinct tuple pairs. Calculated in floating point because the int product
      // nTup*(nTup-1) overflows for large tuple counts.
      const double nPairs = 0.5*static_cast<double>(nTup)*static_cast<double>(nTup-1);

      for(int j=0;j<_nVar;j++)
      {
         const bool isNominal = NOMINAL_VARIABLES && IsSymbolic(j);   // invariant for this variable

         // Accumulate in double: a float counter stops increasing at 2^24, which is reached
         // already for a few thousand tuples.
         double sum = 0;
         for(int i=0;i<nTup;i++)                   // sum up distance for all combinations
            for(int u=i+1;u<nTup;u++)              // u==i would only add a distance of zero
               if(isNominal)
               {
                  if(data->Row(i)[j]!=data->Row(u)[j])
                     sum += 1;                     // distance is one if not matching
               }
               else
                  sum += fabs(data->Row(i)[j]-data->Row(u)[j]);

         // average distance; with fewer than two tuples there is no pair (avoid 0/0 = NaN)
         const double avg = (nPairs>0) ? sum/nPairs : 0;
         if(avg!=0)
            normFac[j] = static_cast<float>(1/avg);   // use inverse to normalize
         else
            normFac[j] = 1;                           // default value 1
      }
   }
   else
   {
      // Unhandled normalization type: fall back to the neutral factor 1 instead of leaving
      // normFac uninitialized.
      for(int j=0;j<_nVar;j++)
         normFac[j] = 1;
   }
}
//----------------------------------------------------------------------------------------------------------------------
// Destructor. Note: private! Instances are destroyed through Release().
// Frees the per-variable arrays held by the instance.
TDataData::~TDataData()
{
#ifndef RELEASE                     // obsolete in release versions
   delete[] _nSymbolsFound;
#endif
   delete[] _nIntegerMaxMin;
   delete[] type;
   delete[] normFac;
   delete[] weight;
   delete[] min;
}
//----------------------------------------------------------------------------------------------------------------------
// Used to 'delete' an instance: drops one reference and destroys the object as soon as the
// reference counter reaches zero.
void TDataData::Release() const
{
   --ref;                           // give up one reference
   if(ref!=0)
      return;                       // still referenced, keep the instance alive

   delete this;                     // last reference gone
}
//----------------------------------------------------------------------------------------------------------------------
// Stores the number of variables and allocates one array element per variable for every
// per-variable array. Used in the constructor and in Load().
void TDataData::Allocate(const int& __nVar)
{
   _nVar = __nVar;                                 // remember # variables

   // float valued arrays
   min     = new float[_nVar];
   weight  = new float[_nVar];
   normFac = new float[_nVar];

   // integer valued arrays
   _nIntegerMaxMin = new int[_nVar];
#ifndef RELEASE                                    // obsolete in release versions
   _nSymbolsFound  = new int[_nVar];
#endif

   // variable types
   type = new TData::TVarType[_nVar];
}
//----------------------------------------------------------------------------------------------------------------------
// Save to file (ofstream): writes the section [Misc] followed by the section [DataData], which
// holds one left-justified table row per variable.
void TDataData::Save(ofstream& file) const
{
   // section [Misc]: # variables, # tuples, output column (referring to the original data file,
   // written 1-based) and the file name of the original data
   file << "[Misc]" << endl
        << "nVar = " << _nVar << endl
        << "nTup = " << _nTup << endl
        << "Output = " << (outcol+1) << endl
        << "Filename = " << szFilename << endl << endl << endl;

   // section [DataData]: name and two comment lines describing the content
   file << "[DataData]" << endl
        << ComChar << " Statistics and variable types obtained from the learn data file." << endl;
   file << ComChar << " Format: Column | Minimum | Weight | Normalization Factor | # Symbols | \
type (n=nominal, o=ordinal, c=continuous)" << endl << endl;

   // one row for each variable
   for(int col=0; col<_nVar; col++)
   {
      // left justified output for the row
      file << setiosflags(ios::left) << resetiosflags(ios::right);

      file << setw(6)        << OriginalColumnInFile(col) << " "      // column
           << setw(W_FLOAT)  << min[col]                  << " "      // minimum
           << setw(W_FLOAT)  << weight[col]               << " "      // weight
           << setw(W_FLOAT)  << normFac[col]              << " "      // normalization factor
           << setw(W_SYMBOL) << _nIntegerMaxMin[col]      << " ";     // # integer values max to min
#ifndef RELEASE                                                       // obsolete in release versions
      file << setw(W_SYMBOL) << _nSymbolsFound[col]       << " ";     // # symbols
#endif
      file << setw(W_SYMBOL) << TData::VarTypeToChar(type[col]) << endl   // variable type
           << resetiosflags(ios::left) << setiosflags(ios::right);        // back to right justified
   }

   file << endl << endl;                                              // linefeed
}
//----------------------------------------------------------------------------------------------------------------------
// Load from file (ifstream): reads the sections [Misc] and [DataData].
//   file - input stream to read from
//   line - line counter; incremented by the values the search/skip helpers return
// May be called only once in the lifetime of an instance (a second call throws via
// IfTrueThrowTypeA). Integer error codes thrown while reading are caught here, translated into
// an error text and thrown again with ThrowTypeU().
void TDataData::Load(ifstream& file, int& line)
{
   // a) checks
   IfTrueThrowTypeA(f_Loaded, "Function cannot be called twice in the lifetime of an instance!", "TDataData::Load"
                    , szModule);
   f_Loaded = true;

   // b) read # variables and data data
   char c = '\0';                                  // variable type character read last
   int j=-1;                                       // variable counter; -1 while still in section [Misc]
   try
   {
      // b1) read section [Misc]
      line += SearchKey(file, "[Misc]");                                // position to section
      _nVar = ReadKeyValue(file, "nVar", -1, SEARCH_LINES);            // read # variables
      _nTup = ReadKeyValue(file, "nTup", -1, SEARCH_LINES);            // read # tuples
      outcol = ReadKeyValue(file, "Output", -1, SEARCH_LINES)-1;       // read output column (ref. to original data file)
      ReadKeyString(file, "Filename", szFilename, STS, SEARCH_LINES);  // filename of original data
      if(szFilename[0]=='\0')                                          // if no filename was found
         strcpy(szFilename, "unknown");                                // set it to be unknown

      if(_nVar<=0) throw 1;                                            // ... and check them
      if(_nTup<=0) throw 10;
      if(outcol<0 || outcol >=_nVar) throw 2;

      Allocate(_nVar);                             // allocate memory for now known # variables
      // Keep _nInp consistent with the constructor taking a TData object, which sets it to
      // # variables - 1. NOTE(review): previously _nInp kept its constructor value after Load().
      _nInp = _nVar-1;

      // b2) read section [DataData]
      line += SearchKey(file, "[DataData]");                           // position to section
      for(j=0;j<_nVar;j++)                                             // read statistics for each variable
      {
         ReadExpNoEndl(file, (float) 0);                               // remove column number
         min[j] = ReadExpNoEndl(file, (float) 0);                      // read minimum
         weight[j] = ReadExpNoEndl(file, (float) 0);                   // read weight
         normFac[j] = ReadExpNoEndl(file, (float) 0);                  // read normalization factor
         _nIntegerMaxMin[j] = ReadExpNoEndl(file, (float) 0);          // read # integer values from min. up to max.
#ifndef RELEASE                                                        // obsolete in release versions
         _nSymbolsFound[j] = ReadExpNoEndl(file, (float) 0);           // read # symbols
#endif
         skipwsNoEndl(file);                                           // position forward
         c = (char) file.get();                                        // read, check and set variable type
         type[j]=TData::CharToVarType(c);
         if(type[j]==TData::none)
            throw 3;                                                   // error: unknown variable type
         line += skipwsExEndl(file);                                   // remove linefeed
      }
   }
   catch(int errNo)                                // exception handling: compose error text and throw again
   {
      // Texts name the section [Misc], which is the section actually read above.
      char szText[STS];
      switch(errNo)
      {
         case 1  : strcpy(szText, "Error in section [Misc]: Key 'nVar' missing or value not positive!"); break;
         case 10 : strcpy(szText, "Error in section [Misc]: Key 'nTup' missing or value not positive!"); break;
         case 2  : sprintf(szText, "Error in section [Misc]: Key 'Output' missing or value not e[1..%d]!", _nVar); break;
         case 3  : sprintf(szText, "Error in line %d in section [DataData] while reading variable %d: Unknown variable type '%c'!", line, j+1, c); break;
         case KeyNotFound : sprintf(szText,"%s",::GetLastError(errNo)); break;
         default : if(j<0)                         // error occurred before the variable loop started
                      sprintf(szText,"Error in section [Misc]: %s", ::GetLastError(errNo));
                   else
                      sprintf(szText,"Error in line %d in section [DataData] while reading variable %d: %s", line, j+1, ::GetLastError(errNo));
      }
      ThrowTypeU(szText);                          // throw exception
   }
}