static const char* szModule = "cluster.cpp";
//------------------------------------------------------------------------------
// module cluster.cpp //
// //
// Model learned with the PNC2 cluster algorithm. Used in combination //
// with TPrediction. //
// See below or http://www.newty.de/pnc2/sdocu.html for more information. //
// //
// copyright (c) 2000-2003 by Lars Haendel //
// home: www.newty.de //
// //
// This program is free software and can be used under the terms of the //
// GNU licence. See header file for further information and disclaimer. //
// //
//------------------------------------------------------------------------------
// //
// CREATE: a) Create an empty instance and call Load() //
// b) Call TPnc::ToTCluster() to convert from TPnc //
// //
// NOTE: The parameters (min. cuboid mass, usage of feature weights //
// and used metric) are initialized by a call to function Prepare(). //
// Either the parameters originally used for learning or alternate //
// ones that you pass to Prepare() are used . //
// //
// NOTE: There are two flags 'f_HasPruningInformation' (f_PI) and //
// 'f_HasOriginalCuboids' (f_OC). Here's their meaning: //
// //
// f_PI f_OC | Meaning //
// -------------------------------------------------------------------- //
// true true -> original cuboid bounds/masks are stored //
// and additional pruning information is avalaible //
// true false -> illegal //
// false true -> original cuboid bounds/masks are stored and no //
// pruning information is available //
// false false -> pruned cuboids are stored, i.e. original bounds/ //
// masks are replaced by minima/maxima/unity mask //
// //
// File I/O: Load and save routines //
//------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------------------------------------
#include <stdlib> // due to qsort()
#include <stdio> // sprintf()
#include <iomanip> // setw()
#include <math> // sqrt()
#include <values> // MAXFLOAT
#include "cluster.h"
#include "fileutil.h" // file io utilities
#include "defines.h"
#include "exception.h" // IfTrueThrowTypeA()
#ifndef RELEASE
#include "statistics.h" // CalcUpper() for confidence intervals
#endif
//----------------------------------------------------------------------------------------------------------------------
// Memory in-efficiency warning: Each cuboid is stored by his bounds/masks for each(!) input and by two arrays
// indicating the active, i.e. none-pruned, inputs. All this storage is always used, no matter if pruning information
// or original cuboid bounds/masks are available. If there is no pruning information, then the arrays with the active
// input Ids just contain the Id of each input. And if the original cuboid bounds/masks are not avalaible, then the
// the cuboid is represented for the pruned inputs just by +-MAXFLOAT (continuous input) or by a mask with all bits set
// (symbolic input). Maybe one should change this in future ... ?
//----------------------------------------------------------------------------------------------------------------------
// Encapsulated two function pointers that are used to improve performance of distance calculations in the input space.
// The metric used is d = sum (w_j * |d_j|^p )^1/p with 'w_j' being the j-th variable's weight and 'd_j' being the j-th
// variable component wise distance. For performance reasons there are two function pointers 'DFunc1' and 'DFunc2'
// which point to the functions which a) raise 'd_j' to the power of 'p' and b) which raise 'sum(w_j * |d_j|^p)' to the
// power of '1/p'.
// note: the absolute values (function fabs()) are used to cope with rounding effects (small negative values)
void Dummy (float& , const float& ) {}; // do nothing
void Square (float& d_j, const float& ) { d_j *= d_j; }; // square value
void Root (float& d_j, const float& ) { d_j = sqrt(fabs(d_j)); }; // square root of value
void Power (float& d_j, const float& p) { d_j = pow(fabs(d_j), p); }; // raise value to the power of 'p'
// initialize function pointers
void TDFunc::Initialize(const float& _p)
{
p = _p; // copy
inv_p = 1.0/p;
// set function pointers
if(p==1) { DFunc1 = Dummy; DFunc2 = Dummy; } else // city block distance
if(p==2) { DFunc1 = Square; DFunc2 = Root; } else // euclidian distance
if(p==0.5) { DFunc1 = Root; DFunc2 = Square; } else
{ DFunc1 = Power; DFunc2 = Power; };
};
//----------------------------------------------------------------------------------------------------------------------
// dummy constructor used if Load() will be called to initialize/fill object
TCluster::TCluster()
{
l = r = NULL; // set all pointer to NULL
m = neg = q = NULL;
activeIdL = activeIdR = NULL;
mapper = nActiveL = nActiveR = NULL;
ddata = NULL;
// initialize
_nCub = _nCubRed = avrVarPerCub = avrHitrate = sizeRatio = 0;
p_min = -1; // to ensure, that Prepare() gets executed
f_Loaded = false; // reset flag
#ifndef RELEASE // obsolete in release version
_d_ref = 0;
#endif
}
//----------------------------------------------------------------------------------------------------------------------
// normal constructor used if instance is filled by friend TPnc::ToTCluster()
TCluster::TCluster(const int& __nTup, const TDataData*const& _ddata, const TParameter& _para)
{
_nCub = __nTup; // # clusters ignoring cardinality
ddata = _ddata; // pointer to data data (extrema, weights, etc.)
para = _para; // parameter used to learn
dfunc.Initialize(para.Metric); // set metric
f_Loaded = true;
_nCubRed = _nCub; // assume 'k=0', i.e. all cuboids are used
avrVarPerCub = avrHitrate = sizeRatio = 0; // initialize
p_min = -1; // to ensure, that Prepare() gets executed
#ifndef RELEASE // obsolete in release version
_d_ref = 0;
#endif
Allocate(); // allocate memory for specified # clusters in ddata->nVar() dimensional space (output+input)
}
//----------------------------------------------------------------------------------------------------------------------
// destructor
TCluster::~TCluster()
{
// a) # positive and negative examples, hitrate, mapper ...
delete[] m;
delete[] neg;
delete[] q;
delete[] mapper;
delete[] nActiveL;
delete[] nActiveR;
// b) left and right bounds, active Id's ...
DeleteMatrix(l, _nCub);
DeleteMatrix(r, _nCub);
DeleteMatrix(activeIdL, _nCub);
DeleteMatrix(activeIdR, _nCub);
// c) data data (extrema, weights etc.) object
if(ddata)
ddata->Release();
}
//----------------------------------------------------------------------------------------------------------------------
// allocate memory, used in constructor and load routine
void TCluster::Allocate()
{
// allocate memory
l = MakeMatrix((float) 0.0, _nCub, ddata->nVar()); // left and
r = MakeMatrix((float) 0.0, _nCub, ddata->nVar()); // right bounds
m = new float[_nCub]; // mass
neg = new float[_nCub]; // # covered negative examples
q = new float[_nCub]; // hitrate
mapper = new int[_nCub]; // new and initialize mapper
nActiveL = new int[_nCub]; // new # active (i.e. none pruned) left bounds
nActiveR = new int[_nCub]; // new # active (i.e. none pruned) right bounds
for(int i=0;i<_nCub;i++) // ini
{
mapper[i] = i;
nActiveLowerBounds(i)=nActiveUpperBounds(i)=0;
}
activeIdL = MakeMatrix((int) 1, _nCub, ddata->nVar()); // id's of active left bounds of each cuboid
activeIdR = MakeMatrix((int) 1, _nCub, ddata->nVar()); // id's of active right bounds of each cuboid
}
//----------------------------------------------------------------------------------------------------------------------
// save model data to file (ofstream), i.e. save data data, cuboid data and pruning information
// note: If a parameter object is passed this will overrule the class' own parameter object
// note: pruning information is not written if it is not avalaible or if compact flag is set
// note: pruned variable bounds/masks are replaced by '-' if compact and prune flag are set or if no original cuboid
// bounds/masks are avalaible.
void TCluster::Save(ofstream& file, const bool& f_Compact/*=false*/, const TParameter* _para/*=NULL*/)
{
if(!_para) // use own parameters if nothing else is given
_para = ¶
if(!f_Compact) // if everything should be saved ...
{
TParameter tmp = *_para; // copy parameters
tmp.p_min = 0; // set cardinality to zero and ...
Prepare(&tmp); // ... call Prepare() to ensure that all cuboids are saved irrespective of their mass
}
else
Prepare(_para);
// a) write data data (extrema, weights etc.) and parameters
ddata->Save(file);
_para->Save(file);
// b) write # cuboids
const bool f_WritePruningInfo = f_HasPruningInformation && !f_Compact;
const bool f_WritePrunedCubs = (f_Compact && _para->Prune) || !f_HasOriginalCuboids;
file << endl << endl << "[Basic]" << endl; // section name
file << "nCuboid = " << _nCubRed << endl; // # cuboids
file << "PruningInformation = " << FlagToString(f_WritePruningInfo) << endl;
file << "OriginalCuboids = " << FlagToString(!f_WritePrunedCubs) << endl;
#ifndef RELEASE // obsolete in release version
file << "d_ref = " << _d_ref << endl;
#endif
file << endl << endl;
// c1) write header cuboid data
file << "[Cuboid]" << endl; // section name
file << ComChar << " Each row contains the data of one cuboid.";
if(!f_Compact)
file << " Hint: Use the export feature to get a more compact representation!";
file << endl << ComChar << " Format: ";
#ifdef HITRATE
file << "Hitrate | ";
#endif
file << "# positive examples | # negative examples | output | left and right bound or mask of covered";
file << " symbols for each variable" << endl << endl;
// c2) write cuboid data
char szText[STS];
for(int t=0;t<nCuboidsRed();t++) // over all clusters (mapped)
{
#ifdef HITRATE
file << setw(6) << WritePercentage(Hitrate(t), 6, 1) << " "; // hitrate
#endif
file << setw(6) << Mass(t) << " " << setw(6) << Neg(t) << " "; // # positive and negative examples
file << setw(10) << Output(t) << " "; // output
// compact mode
if(f_WritePrunedCubs)
{
int cL=0, cU=0; // counter to index active lower and upper bound arrays (pruning information)
for(int j=1;j<ddata->nVar();j++) // over all input variables
if(NOMINAL_VARIABLES && ddata->IsSymbolic(j)) // nominal:
if(ActiveLowerBound(t, cL)==j) // if variable is active, i.e. not pruned ...
{
if(cL<nActiveLowerBounds(t)-1)
cL++; // increment counter
WriteBits(file, Mask(t,j), ddata->nIntegerMaxMin(j)); // write bits
file << " "; // write space
}
else
file << setw(ddata->nIntegerMaxMin(j)) << '-' << " "; // write '-' indicating pruned variable
else // continuous or ordinal:
{
// left bound
if(ActiveLowerBound(t, cL)==j) // if left bound is active, i.e. not pruned
{
if(cL<nActiveLowerBounds(t)-1)
cL++;
sprintf(szText, "%9g ->", Lower(t,j)); // write left bound
file << szText;
}
else
file << " - ->"; // else write '-'
// right bound
if(ActiveUpperBound(t, cU)==j) // if right bound is active, i.e. not pruned
{
if(cU<nActiveUpperBounds(t)-1)
cU++;
sprintf(szText, " %-9g ", Upper(t,j)); // write right bound
file << szText;
}
else
file << " - "; // else write '-'
}
}
else // non-compact mode
for(int j=1;j<ddata->nVar();j++) // over all input variables
if(NOMINAL_VARIABLES && ddata->IsSymbolic(j))
{
WriteBits(file, Mask(t,j), ddata->nIntegerMaxMin(j)); // symbolic (nominal): write bits
file << " ";
}
else
{
sprintf(szText, "%9g -> %-9g ", Lower(t,j), Upper(t,j)); // continuous or ordinal: write left and right
file << szText; // bounds
}
file << endl;
}
file << endl << endl;
// d1) write header pruning information
if(!f_WritePruningInfo) // do not proceed if there is no (useful) pruning information or if 'compact' flag is set
return;
file << "[Prune]" << endl; // section name
file << ComChar << " Each row contains the none pruned variable Id's of each cuboid." << endl;
file << ComChar << " Format: none pruned variables regarding lower bounds / none pruned variables regarding";
file << " upper bounds" << endl << endl;
// d2) write pruning information
for(int t=0;t<nCuboidsRed();t++) // for each cuboid (mapped)
{
for(int g=0;g<nActiveLowerBounds(t);g++) // write all active lower bounds
file << (ActiveLowerBound(t,g)+1) << " ";
file << " / "; // write seperator
for(int g=0;g<nActiveUpperBounds(t);g++)
file << (ActiveUpperBound(t,g)+1) << " ";
file << endl; // linefeed
}
}
//----------------------------------------------------------------------------------------------------------------------
// load model, i.e. data data, cuboid data and pruning information
void TCluster::Load(ifstream& file, int& line)
{
// a) check
IfTrueThrowTypeA(f_Loaded, "Function cannot be called twice in the lifetime of an instance!", "TCluster::Load()"
, szModule);
f_Loaded = true; // set flag to prevent another call of Load()
// b) load data data and parameters
TDataData* _ddata = new TDataData(); // note: allocate and load with local and none const pointer ...
_ddata->Load(file, line); // ... to get rid of a compiler warning
ddata = _ddata; // write back
para.Load(file, line);
// c) read cuboid data and pruning information
char c[3];
int t=-1, j, varId;
bool f_ReadingSectionPrune=false;
try // try to read
{
// c1) read section [Basic]
line += SearchKey(file, "[Basic]"); // position to section
_nCub = ReadKeyValue(file, "nCuboid", -1, SEARCH_LINES); // read # cuboids
int tmp1 = ReadKeyBool (file, "PruningInformation", -1, SEARCH_LINES); // flags
int tmp2 = ReadKeyBool (file, "OriginalCuboids", -1, SEARCH_LINES);
#ifndef RELEASE // obsolete in release version
_d_ref = ReadKeyValue(file, "d_ref", -1.0, SEARCH_LINES); // read average cuboid diameter
#endif
if(_nCub<=0) throw 1; // ... and check
if(tmp1<0) throw 11;
if(tmp2<0) throw 12;
if(tmp1==1 && tmp2==0) throw 13;
#ifndef RELEASE // obsolete in release version
if(_d_ref<0) throw 14;
#endif
f_HasPruningInformation = tmp1; // set
f_HasOriginalCuboids = tmp2;
Allocate(); // allocate memory
// c2) read section [Cuboid]
line += SearchKey(file, "[Cuboid]"); // position to section
for(t=0;t<_nCub;t++) // for each cuboid
{
Mass(t) = ReadExpNoEndl(file, (float) 0); // read # positive examples
Neg(t) = ReadExpNoEndl(file, (float) 0); // read # negative examples
Output(t) = ReadExpNoEndl(file, (float) 0); // read output value
if(f_HasOriginalCuboids) // cuboid data is unpruned, i.e. without '-'
for(j=1;j<ddata->nVar();j++) // read cuboid's bounds
{
if(NOMINAL_VARIABLES && ddata->IsSymbolic(j)) // symbolic (nominal) variable:
{
skipwsNoEndl(file); // remove whitespaces
ReadBits(file, Mask(t,j), ddata->nIntegerMaxMin(j)); // read mask
}
else // continuous or ordinal variable:
{
Lower(t,j)=ReadExpNoEndl(file, (float) 0); // read lower bound
skipwsNoEndl(file); // remove '->'
file.getline(c, 3); // read two characters
if(strcmp(c, "->")!=0) // and compare to '->'
throw 2;
Upper(t,j)=ReadExpNoEndl(file, (float) 0); // read upper bound
}
}
else // cuboid data is pruned, i.e. contains '-'
// indicating pruned bounds/masks
for(j=1;j<ddata->nVar();j++) // read cuboid's bounds
{
if(NOMINAL_VARIABLES && ddata->IsSymbolic(j)) // symbolic (nominal) variable:
{
skipwsNoEndl(file); // remove whitespaces
if(file.peek()!='-') // if no '-' is encountered ...
{
ReadBits(file, Mask(t,j), ddata->nIntegerMaxMin(j)); // ... read mask
ActiveLowerBound(t, nActiveLowerBounds(t)) = j; // add variable to pruning information
nActiveLowerBounds(t)++; // increment
}
else
{
file.get(); // remove the '-'
SetAllBits(Mask(t,j)); // set mask to cover all
}
}
else // continuous or ordinal variable:
{
skipwsNoEndl(file); // remove whitespaces
if(file.peek()!='-') // if no '-' is encountered ...
{
Lower(t,j)=ReadExpNoEndl(file, (float) 0); // read lower bound
ActiveLowerBound(t, nActiveLowerBounds(t)) = j; // add variable to pruning information
nActiveLowerBounds(t)++; // increment
}
else
{
file.get(); // remove the '-'
Lower(t,j) = -MAXFLOAT; // set left bound minus infinity
}
skipwsNoEndl(file); // remove '->'
file.getline(c, 3); // read two characters
if(strcmp(c, "->")!=0) // and compare to '->'
throw 2;
skipwsNoEndl(file); // remove whitespaces
if(file.peek()!='-') // if no '-' is encountered ...
{
Upper(t,j)=ReadExpNoEndl(file, (float) 0); // read upper bound
ActiveUpperBound(t, nActiveUpperBounds(t)) = j; // add variable to pruning information
nActiveUpperBounds(t)++; // increment
}
else
{
file.get(); // remove the '-'
Upper(t,j) = MAXFLOAT; // set left bound to infinity
}
}
}
line += skipwsExEndl(file); // remove linefeed
}
// c3) read section [Prune] (pruning information)
if(f_HasPruningInformation)
{
f_ReadingSectionPrune=true;
line += SearchKey(file, "[Prune]"); // position to section
for(t=0;t<_nCub;t++) // for each cuboid
{
// c3.1) read active Id's of left bounds
while(file.peek()!='/') // peek character and compare to separator '/'
{
if(nActiveLowerBounds(t)>=ddata->nVar()) // check if too many id's are given
throw 4;
varId = ReadExpNoEndl(file, 0)-1; // read variable Id
if(varId<1 || varId >=ddata->nVar()) // check it - note: output column must not be
throw 6; // specified
ActiveLowerBound(t,nActiveLowerBounds(t)++)=varId; // all checks passed? Then set Id
skipwsExNoEndl(file); // remove whitespaces
}
file.get(); // remove the seperator '/'
// c3.2) read active Id's of right bounds
int lplus=skipws(file);
while(lplus==0) // continue until linebreak
// (PoD: Maybe a problem in Unix/Linux??)
{
if(nActiveUpperBounds(t)>=ddata->nVar()) // check if too many id's are given
throw 5;
varId = ReadExpNoEndl(file, 0)-1; // read variable Id
if(varId<1 || varId >= ddata->nVar()) // check it - note: output column must not be
// specified
throw 7;
if(NOMINAL_VARIABLES && ddata->IsSymbolic(varId)) // check: do not specify upper bound for symbolic (nominal) variables
throw 8;
ActiveUpperBound(t,nActiveUpperBounds(t)++) = varId; // all checks passed? Then set Id
lplus=skipws(file);
}
line += lplus; // track line number
}
}
}
catch(int errNo) // exception handling
{
char szText[STS];
switch(errNo) // compose error text
{
case 1 : strcpy (szText, "Error in section [Basic]: Value of 'nCuboid' missing, negative or zero!"); break;
case 11 : strcpy (szText, "Error in section [Basic]: Flag 'PruningInformation' not found!"); break;
case 12 : strcpy (szText, "Error in section [Basic]: Flag 'OriginalCuboids' not found!"); break;
case 13 : strcpy (szText, "Error in section [Basic]: Flag 'PruningInformation' cannot be set if flag 'OriginalCuboids' is unset!"); break;
case 14 : strcpy (szText, "Error in section [Basic]: Reference distance 'd_ref' not found or value negative! Please coorect."); break;
case 2 : sprintf(szText, "Error in line %d in section [Cuboid]: Wanted '->' but found %s while reading cuboid %d, bound %d!", line, c, t+1, j); break;
case 4 : sprintf(szText, "Error in line %d in section [Prune]: Found to many variable Id's for lower bounds of cuboid %d!", line, t+1); break;
case 5 : sprintf(szText, "Error in line %d in section [Prune]: Found to many variable Id's for upper bounds of cuboid %d!", line, t+1); break;
case 6 : sprintf(szText, "Error in line %d in section [Prune]: Variable Id '%d' for lower bounds of cuboid %d is not e[2..%d]!", line, varId+1, t+1, ddata->nVar()); break;
case 7 : sprintf(szText, "Error in line %d in section [Prune]: Variable Id '%d' for upper bounds of cuboid %d is not e[2..%d]!", line, varId+1, t+1, ddata->nVar()); break;
case 8 : sprintf(szText, "Error in line %d in section [Prune]: Variable %d is symbolic (nominal). Thus it cannot be given as variable Id for the upper bounds of cuboid %d!", line, varId+1, t+1); break;
case KeyNotFound : sprintf(szText,"%s",::GetLastError(errNo)); break;
default : if(t<0)
sprintf(szText,"Error in section [Basic]: %s", ::GetLastError(errNo));
else
if(f_ReadingSectionPrune)
sprintf(szText,"Error in line %d in section [Prune] while reading cuboid %d: %s!", line, t+1, ::GetLastError(errNo));
else
sprintf(szText,"Error on line %d in section [Cuboid] while reading cuboid %d, bound %d: %s!", line, t+1, min(j,ddata->nVar()-1), ::GetLastError(errNo));
}
ThrowTypeU(szText); // throw error
}
// misc.
CalculateHitRate(); // caclulate cuboid hitrates
dfunc.Initialize(para.Metric); // initialize metric
}
//----------------------------------------------------------------------------------------------------------------------
// calculate hitrate of each cuboid/cluster
void TCluster::CalculateHitRate()
{
#ifdef LAPCOR_HITRATE
int nClasses = para.N_Int; // ini for regression tasks
if(!para.f_Regression)
// adjust for classification tasks, e.g. if a two class problem is encoded with '7' and '15' as output values)
nClasses = ddata->nSymbolsFound(0);
#endif
#ifdef CONF_HITRATE
float u_Gamma = CalcUGamma(ALPHA); // used for confidence intervals
#endif
// calculate hitrate of each cuboid/cluster
for(int i=0;i<_nCub;i++)
#ifdef NORMAL_HITRATE
q[i]=m[i]/(m[i]+neg[i]);
#endif
#ifdef LAPCOR_HITRATE
q[i]=(m[i]+1)/(m[i]+neg[i]+nClasses);
#endif
#ifdef CONF_HITRATE
q[i]=1-CalcUpper(m[i]+neg[i], neg[i], u_Gamma);
#endif
}
//----------------------------------------------------------------------------------------------------------------------
// prepare to be used to make predictions: initialize mapper to all cuboids with a mass bigger than TParameter::k and
// calculate average hitrate, cuboid size etc.
// note: If a parameter object is passed this will overrule the class' own parameter object
void TCluster::Prepare(const TParameter* _para/*=NULL*/)
{
if(!_para) // use own parameters if nothing else is given
_para = ¶
if(p_min==_para->p_min && f_UseWeights==_para->Weights && metric==_para->Metric) // return if nothing has changed
return;
// else:
// store parameters
dfunc.Initialize(_para->Metric); // set metric fucntion
metric = _para->Metric;
f_UseWeights = _para->Weights;
p_min = _para->p_min;
// ini/reset
avrVarPerCub = 0; // counts # non-prunde variables
avrHitrate = 0; // sums up hitrate
avrMass = 0; // sums up masses
_nCubRed = 0; // counts # clusters that exceed min. cuboid mass
// over all clusters - note: variable types and cluster data are with(!) output
for(int i=0;i<_nCub;i++)
if(m[i]>_para->p_min) // use only cuboids that exceed the min. cuboid mass
{
mapper[_nCubRed] = i; // a) set mapper and increment counter
avrHitrate += q[i]; // b) sum up hitrate and mass
avrMass += m[i];
// c) count active bounds
for(int g=0;g<nActiveL[i];g++) // count active lower bounds
if(NOMINAL_VARIABLES && ddata->IsSymbolic(activeIdL[i][g]))
avrVarPerCub += 2; // symbolic bounds count for 2
else
avrVarPerCub++; // increment
avrVarPerCub += nActiveR[i]; // add # upper bounds (note: can only be non-symbolic)
_nCubRed++; // increment counter
}
// turn sums in mean values
if(_nCubRed==0)
avrVarPerCub = avrHitrate = avrMass = sizeRatio = 0;
else
{
// calculate average values per cuboid
avrVarPerCub /= 2*_nCubRed; // # active bounds
avrHitrate /= _nCubRed; // hitrate
avrMass /= _nCubRed; // mass
sizeRatio = _nCubRed/((float) ddata->nTup()); // size relative to original # tuples in learn data
}
}
//----------------------------------------------------------------------------------------------------------------------
// calculate average diameter, i.e. distance from lower left to upper right bound
#ifndef RELEASE // obsolete in release version
void TCluster::CalcAvrDiameter()
{
_d_ref = 0; // ini
// over all cuboids
float* d = new float[ddata->nVar()]; // componentwise range of cuboid
for(int i=0;i<_nCub;i++)
{
for(int j=1;j<ddata->nVar();j++) // over all input(!) variables
if(NOMINAL_VARIABLES && ddata->IsSymbolic(j))
{
// --------------------
// symbolic (nominal):
int cSet=0;
for(int g=0;g<ddata->nIntegerMaxMin(j);g++)
if(IsBitSet(MaskDirect(i,j), g)) // count how many symbols are covered
cSet++;
cSet = min(cSet, ddata->nSymbolsFound(j)); // bound, because SetAllBits() could have been called
d[j] = ((float) cSet)/ddata->nSymbolsFound(j); // note: divide by # of really occuring symbols
d[j] *= ddata->NormFac()[j]; // multiply by normalization factor
// raise component wise distance to the power of metric parameter 'p'
dfunc.DFunc1(d[j], dfunc.p);
if(f_UseWeights)
d[j] *= ddata->Weights()[j]; // scale by variable's weight
}
else
{
// --------------------
// continuous or ordinal:
d[j] = ddata->NormFac()[j] * (r[i][j]-l[i][j]);
dfunc.DFunc1(d[j], dfunc.p); // raise distance to the power of metric parameter 'p'
if(f_UseWeights)
d[j] *= ddata->Weights()[j]; // scale by variable's weight if it's specified
}
// 2nd step
float d_sum=0;
for(int j=1;j<ddata->nVar();j++) // over all input(!) variables
d_sum += d[j]; // sum up component wise distances
dfunc.DFunc2(d_sum, dfunc.inv_p); // raise sum to the power of '1/p' ...
_d_ref += d_sum;
}
delete[] d; // release
_d_ref /= _nCub;; // average
}
#endif
//----------------------------------------------------------------------------------------------------------------------
// return # average active bounds
const float TCluster::AvrVarPerCub(const bool& f_Pruned) const{
if(f_Pruned) return avrVarPerCub;
else return ddata->nVar()-1;};
//----------------------------------------------------------------------------------------------------------------------
// calculate distance of j-th variable to left bound
// note: 'j' is given with(!) output and 'cubId' is already mapped
float TCluster::d_Lower(const int& j, const float x_j, const int& cubId)
{
float d_j=0; // ini
// note: symbolic flags, weights, 'l' (also Mask()) and 'r' are with(!) output
if(NOMINAL_VARIABLES && ddata->IsSymbolic(j))
{
// ----------------
// symbolic (nominal):
const int bit = x_j-ddata->Min()[j];
// note on low and highcheck: test data may have a different minimum, i.e. unseen symbol ...
if(bit<0 || bit>MaxBits || !IsBitSet(MaskDirect(cubId, j), bit)) // (PoD)
{
d_j = ddata->NormFac()[j] * 1.0; // not covered
// note: code placed here(!) because it should not be executed if distance is zero
dfunc.DFunc1(d_j, dfunc.p); // raise component wise distance ('d_j') to the power of metric parameter 'p'
if(f_UseWeights)
d_j *= ddata->Weights()[j]; // scale by variable's weight if it's specified
}
}
else
// --------------------
// continuous or ordinal:
if(x_j<l[cubId][j]) // if value id left of left bound ....
{
d_j = ddata->NormFac()[j]*(l[cubId][j]-x_j); // use distance value to left bound normalized by variable's range
// note: code placed here(!) because it should not be executed if distance is zero
dfunc.DFunc1(d_j, dfunc.p); // raise component wise distance ('d_j') to the power of metric parameter 'p'
if(f_UseWeights)
d_j *= ddata->Weights()[j]; // scale by variable's weight if it's specified
}
return d_j;
}
//----------------------------------------------------------------------------------------------------------------------
// calculate distance of j-th variable to right bound
// note: 'j' is given with(!) output and 'cubId' is already mapped
float TCluster::d_Upper(const int& j, const float x_j, const int& cubId)
{
float d_j=0; // ini
// --------------------
// continuous or ordinal:
if(x_j>r[cubId][j])
{ // if value is right of right bound ....
d_j = ddata->NormFac()[j]*(x_j-r[cubId][j]); // use distance value to right bound normalized by variable's range
dfunc.DFunc1(d_j, dfunc.p); // raise component wise distance ('d_j') to the power of metric parameter 'p'
if(f_UseWeights)
d_j *= ddata->Weights()[j]; // scale by variable's weight if it's specified
}
return d_j;
}
//----------------------------------------------------------------------------------------------------------------------
// distance cuboid to tuple using pruned cuboids
float TCluster::PrunedDistance(const float*const& /* input data */ x, const int& _cubId)
{
float d=0; // ini
int cubId = mapper[_cubId]; // get real cuboid id
// over all active left bounds
for(int h=0;h<nActiveL[cubId];h++)
{
int j = activeIdL[cubId][h]; // variable id with(!) output
d += d_Lower(j, x[j-1], cubId); // sum up
}
// over all active right bounds
for(int h=0;h<nActiveR[cubId];h++)
{
int j = activeIdR[cubId][h]; // variable id with(!) output
d += d_Upper(j, x[j-1], cubId); // sum up
}
dfunc.DFunc2(d, dfunc.inv_p); // raise sum of component wise distances ('d') to the power of '1/p'
return d; // return distance
}
//----------------------------------------------------------------------------------------------------------------------
// distance cuboid to tuple
float TCluster::Distance(const float*const& /* input data */ x, const int& _cubId)
{
float d=0; // ini
int cubId = mapper[_cubId]; // get real cuboid id
// over all left bounds
for(int j=1;j<ddata->nVar();j++) // variable id with(!) output
d += d_Lower(j, x[j-1], cubId); // sum up
// over all right bounds
for(int j=1;j<ddata->nVar();j++) // variable id with(!) output
if(!(NOMINAL_VARIABLES && ddata->IsSymbolic(j))) // note: symbolic flags are with(!) output
d += d_Upper(j, x[j-1], cubId); // sum up
dfunc.DFunc2(d, dfunc.inv_p); // raise sum of component wise distances ('d') to the power of '1/p'
return d; // return distance
}
//----------------------------------------------------------------------------------------------------------------------
int BarId(const int& mass)
{
return min(N_BARS-1, max(0, mass-1));
}
void TCluster::CalculateHistogram()
{
for(int h=0;h<N_BARS;h++) // reset histogram values
hist[h]=0;
for(int i=0;i<_nCub;i++)
hist[BarId(m[i])]++;
}