static const char* szModule = "pnc.cpp";
//------------------------------------------------------------------------------
// module pnc.cpp //
// //
// Contains the class TPnc and all other necessary sub-classes for //
// the PNC2 cluster algorithm. //
// See below or http://www.newty.de/pnc2/sdocu.html for more information. //
// //
// copyright (c) 1999-2003 by Lars Haendel //
// home: www.newty.de //
// //
// This program is free software and can be used under the terms of the //
// GNU licence. See header file for further information and disclaimer. //
// //
//------------------------------------------------------------------------------
// //
// CREATE: Pass learn data (TData) and parameters (TParameter) //
// //
// USE: Learn by calling the function Iterate() as long as it returns //
// 'false'. There will be 2 steps for learning. You may get //
// information about current progress or state by the functions //
// GetProgress() and GetStatusText(). After learned, you've got to //
// convert to TCluster using the function ToTCluster() to be able to //
// use the learned model with a prediction object (TPrediction). //
// If specified in parameter object (TParameter::f_Prune) or by //
// calling argument 'f_PruneAnyway' the function ToTCluster() calls //
// function PostPrune() and makes a context sensitive feature //
// selection for each cuboid and stores this as additional(!) //
// information. Also you may specify that no cuboids are converted //
// whose mass is less than a threshold 'K' by the calling argument //
// 'K'. Note: The parameter TParameter::p_min is ignored here to allow //
// you to change it and try different settings later. //
// //
// File I/O: main class and sub-classes have debug save routines //
//------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------------------------------------
// Additional notes:
//
// 1. TInterval::Del(opI) means merge the 'opI'-th interval with the 'opI-1'-th one
// 2. TCuboidsLists (list with TCuboid lists) must be kept consistent with TInterval
// 3. TPnc::nInt = # intervals used to discretize output (regular intervals)
// TIntervals::nInt() = # non-empty regular intervals
// TIntervals::Size() = # output intervals if # tuples per interval is bounded to 'N_G_max'
//
// 4. very important variables:
// int opI - first pass : actually processed output interval, gets incremented
// 2nd pass (cascade-mode) : actually processed output interval determined by TIntervals::NextInterval()
//----------------------------------------------------------------------------------------------------------------------
#include <limits> // due to MAXFLOAT
#include <iomanip> // setw()
#include <stdio> // sprintf()
#include <stdlib> // min() ...
#include <math> // sqrt()
#include <values> // MAXFLOAT
#include "pnc.h"
#include "defines.h" // definitions like DEBUG ...
#include "fileutil.h" // RemoveExt()
#include "exception.h" // IfTrueThrowTypeA()
#ifndef RELEASE
#include "statistics.h" // CalcUGamma (confidence intervals)
#endif
//#define PROFILE // insert profiling code. results are written to 'profile.txt'
#ifdef PROFILE
#include <time>
clock_t t_FindMin, t_FindMinEx, t_NegTest, t_PostPrune, t_Convert2TCluster, t_Reset, t_Learn;
clock_t t_TCuboids, t_Distance, t_TIntervals;
long nFindMin, nFindMinEx, nNegTest, nHitTest, nTCuboids, nDistance, nConvert2TCluster;
ofstream logfile("profile.txt", ios::app);
#endif
//-------------------------------------------------------------------------------------------------------------------
//
// class TCuboid
//
//----------------------------------------------------------------------------------------------------------------------
// allocate memory
void TCuboid::Initialize(const int& nInp)
{
delete[] l; // delete old memory
delete[] r;
delete tuples;
l = new float[nInp]; // ... and allocate new one
r = new float[nInp];
tuples = new TTupleIdList; // tuple id list
tuples->SetName("TTupleIdList");
}
//-------------------------------------------------------------------------------------------------------------------
//
// class TCuboids - TCuboid list
//
//----------------------------------------------------------------------------------------------------------------------
// constructor - create empty cuboid list, used when two cuboid lists are merged (2nd pass cascade mode)
TCuboids::TCuboids(const TDFunc& _dfunc, const TDataData*const& _ddata, const bool& _f_UseWeights)
{
dfunc = _dfunc;
ddata = _ddata;
f_UseWeights = _f_UseWeights; // flag: use feature weigths in distance function
list = new TCuboidList;
list->SetName("TCuboidList(M)");
}
//----------------------------------------------------------------------------------------------------------------------
// constructor - create trivial cuboids from (part of the) (learn) data
TCuboids::TCuboids(const TIntervals*const& intervals, const TData*const& _data, const int& opI, const TDFunc& _dfunc
, const TDataData*const& _ddata, const bool& _f_UseWeights)
{
#ifdef PROFILE
clock_t time=clock();
nTCuboids++;
#endif
// 1.
dfunc = _dfunc;
ddata = _ddata;
f_UseWeights = _f_UseWeights; // flag: use feature weigths in distance function
list = new TCuboidList;
list->SetName("TCuboidList");
// 2. abbrevations
const float y_quant = intervals->Y_Quant(opI); // quantized output value
const float y_snap = intervals->Y_Snap(opI); // snapped, i.e. quantized and de-quantized, output value
const int nTup = intervals->Size(opI); // # tuples to create tivial cuboids from
const float** data = intervals->Get(opI); // pointer to first tuple (of interval)
const int id = intervals->Id(opI); // id of first tuple (of interval)
// 3. create trivial cuboids
for(int i=0;i<nTup;i++)
{
TCuboid& cuboid = list->Ins(); // a) insert new list entry (cuboid)
cuboid.Initialize(ddata->nInp()); // b) initialize it
// c) set cuboid's bounds and/or masks
// note: symbolic flags, minima/maxima and data tuples themself are with(!) output
for(int j=0;j<ddata->nInp();j++)
if(NOMINAL_VARIABLES && _data->IsSymbolic(j+1)) // symbolic (nominal):
{
ClearAllBits(cuboid.Mask(j)); // reset mask
SetBit(cuboid.Mask(j), (int) (data[i][j+1]-_data->Min()[j+1])); // set current symbol's flag (PoD)
}
else // continuous or ordinal:
cuboid.Min(j) = cuboid.Max(j) = data[i][j+1]; // set left and right bounds
// d)
cuboid.y_snap = y_snap; // snapped, i.e. quantized and de-quantized, output value
cuboid.y_quant = y_quant; // quantized output value
cuboid.m = 1; // mass
cuboid.neg = 0; // # covered negative examples
int tupleId = id+i; // tuple id
cuboid.tuples->Ins(tupleId); // initialize tuple id list
}
#ifdef PROFILE // profiling code
t_TCuboids +=clock()-time;
#endif
}
//----------------------------------------------------------------------------------------------------------------------
// debug function - save cuboid list
void TCuboids::Save(ofstream& file) const
{
// 1. write header
file << ComChar << " TCuboids" << endl;
file << ComChar << " # Inputs = " << ddata->nInp() << endl;
file << ComChar << " # Cuboids = " << list->Size() << endl;
file << ComChar << " output (snapped and quantized) | left and right bounds for all input variables | mass |";
file << " # negative examples" << endl;
// 2. write cuboid data
list->Reset(); // position to top of list
for(int i=0;i<list->Size();i++) // over all cuboids
{
// a) get reference to i-th cuboid
list->Next();
const TCuboid& cuboid = list->Get();
// b) write output value (snapped and quantized)
file << cuboid.y_snap << " " << cuboid.y_quant << " ";
// c) write cuboid's bounds and/or masks
for(int j=0;j<ddata->nInp();j++)
if(NOMINAL_VARIABLES && ddata->IsSymbolic(j+1))
WriteBits(file, cuboid.Mask(j)); // symbolic (nominal): write bits
else
file << " " << cuboid.Min(j) << " " << cuboid.Max(j); // continuous or ordinal: write left and right bound
// d) mass, covered negative examples
file << " " << cuboid.m << " " << cuboid.neg << endl;
}
file << endl;
}
//----------------------------------------------------------------------------------------------------------------------
// distance of i-th cuboid in list to reference cuboid <ref>
float TCuboids::Distance(const int& i, const TCuboid*const& ref) const
{
#ifdef PROFILE
clock_t time=clock();
nDistance++;
#endif
// ini
float d = 0;
// get i-th cuboid from list
list->Pos(i);
const TCuboid& cuboid = list->Get();
// calculate distance of cuboids to each other
// note: weight, variable types and IvdmDistance() are with(!) output
for(int j=0;j<ddata->nInp();j++) // over all inputs
{
float d_j=0; // distance of j-th variable
//--------------------------------------------
// overlap-metric
if(NOMINAL_VARIABLES && ddata->IsSymbolic(j+1))
{
// symbolic (nominal):
if((ref->Mask(j) & cuboid.Mask(j)) == 0) // no bits overlapping ?
d_j = ddata->NormFac()[j+1] * 1.0;
}
else
{
// continuous or ordinal:
const float l1 = ref->Min(j), r1 = ref->Max(j), l2 = cuboid.Min(j), r2 = cuboid.Max(j); // abbrevation
// determine situation
d_j = 0; // ini
if(r1<l2)
d_j = ddata->NormFac()[j+1]*(l2-r1); // right bound 1 lies left of left bound 2
else
if(l1>r2)
d_j = ddata->NormFac()[j+1]*(l1-r2); // left bound 1 lies right of rigth bound 2
}
dfunc.DFunc1(d_j, dfunc.p); // raise component wise distance ('d_j') to the power of metric parameter 'p'
if(f_UseWeights)
d_j *= ddata->Weights()[j+1]; // scale by variable's weight if it's specified
d += d_j;
}
#ifdef PROFILE // profiling code
t_Distance+=clock()-time;
#endif
dfunc.DFunc2(d, dfunc.inv_p); // raise sum of component wise distances ('d') to the power of '1/p'
return d;
}
//----------------------------------------------------------------------------------------------------------------------
// return generalization of two cuboids
TCuboid* /* cr */ Generalize(const TCuboid& a, const TCuboid& b, const TData*const& data)
{
// a) allocate memory
TCuboid* cuboid = new TCuboid;
cuboid->Initialize(data->nVar()-1);
// b) generalize
for(int j=0;j<data->nVar()-1;j++)
if(NOMINAL_VARIABLES && data->IsSymbolic(j+1)) // '+1' because variable types are with(!) output
// symbolic (nominal):
cuboid->Mask(j) = a.Mask(j) | b.Mask(j); // generalize by OR operation
else
{
// continuous or ordinal:
cuboid->Min(j) = min(b.Min(j), a.Min(j)); // minimum of left bounds
cuboid->Max(j) = max(b.Max(j), a.Max(j)); // maximum of right bounds
}
// c) sum up masses
cuboid->m = a.m + b.m;
// d) copy
cuboid->y_snap = a.y_snap;
cuboid->y_quant = a.y_quant;
// e) store id's of merged tuples
for(int i=0;i<a.m;i++) cuboid->tuples->Ins(a.tuples->Get(i));
for(int i=0;i<b.m;i++) cuboid->tuples->Ins(b.tuples->Get(i));
return cuboid;
}
//----------------------------------------------------------------------------------------------------------------------
// delete (i,j)-th cuboid and overwrite max(i,j)-th cuboid with <cuboid>
void TCuboids::Merge(const int& i, const int& j, TCuboid& _cuboid)
{
// a) delete cuboid with smaller index
Del(min(i, j));
// b) get reference to cuboid with bigger index
TCuboid& cuboid = Get(max(i, j)-1); // note: '-1' cause a preceeding cuboid in the list was deleted in a)
// c) delete 'old' memory - necessary cause cuboid will be overwritten
cuboid.Release();
// d) set (copy/overwrite) new cuboid
cuboid = _cuboid;
}
//-------------------------------------------------------------------------------------------------------------------
//
// class TCuboidsEx
//
//----------------------------------------------------------------------------------------------------------------------
// constructor
TCuboidsEx::TCuboidsEx(TCuboids*const& _a, TCuboids*const& _b, const TDFunc& dfunc, const TDataData*const& ddata
, const bool& f_UseWeigths)
{
is_extended=true;
// copy
a = _a;
b = _b;
// new cuboid list for merged cuboids
m = new TCuboids(dfunc, ddata, f_UseWeigths);
}
//----------------------------------------------------------------------------------------------------------------------
// destructor
TCuboidsEx::~TCuboidsEx()
{
if(is_extended)
delete a;
delete b;
delete m;
}
//----------------------------------------------------------------------------------------------------------------------
// merge two cuboids
void TCuboidsEx::Merge(TCuboids*const& cub1, int id1, TCuboids*const& cub2, const int& id2, TCuboid& cuboid)
{
cub2->Del(id2); // delete 2nd cuboid from corresponding list
if(cub1 == m) // if first cuboid is from list 'm' ...
{
if(cub2 == m) // if 2nd cuboid is also from list 'm' ...
id1--; // decrement its id cause one preceeding cuboid was deleted
TCuboid& entry = cub1->Get(id1); // get first cuboid
entry.Release(); // delete memory
entry = cuboid; // set (copy/overwrite) new cuboid
}
else // else if first cuboid isn't from cuboid list 'm' ...
{
cub1->Del(id1); // ... delete it from corresponding list
m->App(cuboid); // append new cuboid to cuboid list 'm'
}
}
//-------------------------------------------------------------------------------------------------------------------
//
// class TCuboidsLists - encapsulates list over cuboid lists
//
//----------------------------------------------------------------------------------------------------------------------
// constructor
TCuboidsLists::TCuboidsLists()
{
list = new TCuboidsList; // create list
list->SetName("TCuboidsList"); // set list name
mapper = NULL;
}
//----------------------------------------------------------------------------------------------------------------------
// destructor
TCuboidsLists::~TCuboidsLists()
{
DeleteEntries(list); // delete list entries ...
delete list; // ... and list
delete[] mapper;
}
//----------------------------------------------------------------------------------------------------------------------
// return mapper, call MakeMapper() before if flag <consistent> is unset
const TCuboid**const& TCuboidsLists::GetMapper(bool& consistent)
{
if(!consistent) // call MakeMapper() before if flag is unset
MakeMapper();
consistent=true;
return (const TCuboid**const) mapper;
}
//----------------------------------------------------------------------------------------------------------------------
// create mapper to all cuboids
void TCuboidsLists::MakeMapper()
{
delete[] mapper; // release old mapper
// a) count exemplars
_nCuboids=0;
for(int i=0;i<Size();i++) // over all cuboid lists
_nCuboids +=Size(i); // sum up
// b) create mapper
mapper = new TCuboid*[_nCuboids];
int c=0; // counter
for(int i=0;i<Size();i++) // over all cuboid lists
{
const TCuboids* cuboids = Get(i); // get i-th cuboid list
for(int t=0;t<Size(i);t++) // over all cuboids in cuboid list
mapper[c++] = &cuboids->Get(t);
}
IfTrueThrowTypeB(c!=_nCuboids, MSG_IPI, "TCuboidsLists::MakeMapper", szModule);
}
//-------------------------------------------------------------------------------------------------------------------
//
// class TIntervals and utility functions - provides output interval wise access to (learn-) data tuples
//
// step through the data tuples up from position <tup+1> and count how many tuples have the same quantized output
// value. Check if this number exceeds the maximum group size and thus adjust the N_G_Max value currently used.
float DetermineGroupSize(const int& y_quant, const int& tup, const int& nTup, const float**const& dat
, const float& max, const float& min, const TParameter*const& para)
{
// determine # tuples with same quantized output value
int i;
for(i=tup+1;i<nTup;i++)
if(y_quant!=Quant(dat[i][0], max, min, para->N_Int, !para->f_Regression))
break;
float nTupSameOutput = i-tup; // # tuples with same quantized output value
int nGroups = ceil(nTupSameOutput/para->N_G_Max); // # groups necessary
return nTupSameOutput/nGroups;
}
//----------------------------------------------------------------------------------------------------------------------
// constructor
TIntervals::TIntervals(const TData*const& _data, const TParameter*const& para)
{
#ifdef PROFILE
clock_t time=clock();
#endif
data = _data; // copy
float max = data->Max()[0];
float min = data->Min()[0];
const float** dat = data->GetDataPointer(); // direct pointer to data matrix
// <order> used to access variables in order of decreasing weight, used to optimize HitTest()
order = new TSorter(data->nVar()-1); // create
for(int j=0;j<data->nVar()-1;j++)
order->GetA(j) = order->GetB(j) = data->Weights()[j+1]; // note: weights are with(!) output - note: use weights
// irrespective of flag TParameter::f_UseWeights
order->SortDes(); // sort descending
// create interval list
list = new TIntervalList; // new list
list->SetName("TIntervalList");
// ini
// quantize first output value (is the lowest one cause tuples are sorted)
float y_quant = Quant(dat[0][0], max, min, para->N_Int, para->f_Regression);
const float** pt2opI = &dat[0]; // pointer to first tuple of interval
int id = 0; // id of " " " "
_nInt=1; // reset (ini with '1' and not with '0'!)
int k = 1; // # tuples in interval/group
float loss = 1.5; // ini: '1.0' to increase first interval by one tuple and '0.5' to cope with rounding effects
float N_G_Max = DetermineGroupSize(y_quant, 0, data->nTup(), dat, max, min, para); // max. interval/group size for
// first quantized output value
// iterate tuples and close old interval if #tuples exceeds <N_G_Max> or (quantized) output value changes
for(int i=1;i<data->nTup();i++)
{
int _y_quant = Quant(dat[i][0], max, min, para->N_Int, !para->f_Regression); // quantize i-th output value
// check conditions to close old interval: other quantized output value or to many tuples in interval
if(_y_quant != y_quant || k >= floor(N_G_Max) + floor(fabs(loss)))
{
#ifdef DEBUG_LOG_ON
DebugLogFile << "k=" << k << " ";
#endif
// sum up loss: intervals can only have integer # tuples in them but calculated size is a float value
if(loss>=1)
loss -= 1;
loss += N_G_Max-floor(N_G_Max); // sum up loss
// if output value has changed ...
if(_y_quant != y_quant)
{
N_G_Max = DetermineGroupSize(_y_quant, i, data->nTup(), dat, max, min, para); // determine new group size
loss = 1.5; // ini: see note above
_nInt++; // increment number of none-empty intervals
}
// close old interval
TInterval& interval = list->Ins(); // insert new interval in list
interval.opI = pt2opI;
interval.id = id;
interval.nTup = k;
interval.y_quant = y_quant;
interval.y_snap = DeQuant(y_quant, max, min, para->N_Int, !para->f_Regression);
// store values for new interval
y_quant = _y_quant; // store new (quantized) output value
pt2opI = &dat[i]; // -> first tuple of new interval
id = i;
k = 1;
}
else
k++; // increment interval tuple counter
}
// close last interval
TInterval& interval = list->Ins(); // insert new interval in list
interval.opI = pt2opI;
interval.id = id;
interval.nTup = k;
interval.y_quant = y_quant;
interval.y_snap = DeQuant(y_quant, max, min, para->N_Int, !para->f_Regression);
// new and fill check-vectors, i.e. arrays with the id's of tuples with a different class which may be neg. examples
checkVec = new int*[para->N_Int];
len_checkVec = new int [para->N_Int];
for(int i=0;i<para->N_Int;i++) // create
CreateCheckVec(i, para);
nCheckVec = para->N_Int; // store for destructor
id4Bg2 = 0; // ini for function NextInterval()
#ifdef DEBUG_LOG_ON
DebugLogFile << "k=" << k << " ";
DebugLogFile << "Intervalls=" << list->Size() << " ";
#endif
#ifdef PROFILE
t_TIntervals+=clock()-time;
#endif
}
//----------------------------------------------------------------------------------------------------------------------
// destructor
TIntervals::~TIntervals()
{
delete list;
// celete checkVecs
for(int i=0;i<nCheckVec;i++)
delete[] checkVec[i];
delete[] checkVec;
delete[] len_checkVec;
delete order;
}
//----------------------------------------------------------------------------------------------------------------------
// debug function
void TIntervals::Save(ofstream& file) const
{
file << ComChar << " TIntervals" << endl;
file << ComChar << " # tuples = " << data->nTup() << endl;
file << ComChar << " # intervals= " << list->Size() << endl;
file << ComChar << " interval id | quantized interval value | # tuples in it | snapped interval value" << endl;
// over all intervals
for(int i=0;i<list->Size();i++)
file << i << " " << Y_Quant(i) << " " << Size(i) << " " << Y_Snap(i) << endl;
file << endl;
}
//----------------------------------------------------------------------------------------------------------------------
// return size (# tuples) of regular output interval with (quantized) output value <value>
const int TIntervals::SizeReg(const int& value) const
{
int len = 0;
// iterate list and sum up
for(int i=0;i<list->Size();i++)
if(Y_Quant(i)==value) // compare
len += Size(i);
return len;
}
//----------------------------------------------------------------------------------------------------------------------
// returns true if tuple is inside cuboid - note: tuple is given with(!) output
bool HitTest(const TCuboid& cub, const float*const& tuple, const TData*const& data, const float& w_COD
, const float*const& weights, const TSorter*const& order)
{
#ifdef PROFILE
nHitTest++;
#endif
float out=0; // counts variables outside
float w = w_COD; // why??
// count # variables outside note: weights, variable types and (data) minima are with(!) output
for(int v=0;v<order->Size();v++)
{
const int& j=order->GetId(v); // performance boost: process variables in order of decreasing weight
if(NOMINAL_VARIABLES && data->IsSymbolic(j+1))
{
// symbolic (nominal): outside if flag is unset for symbol (PoD)
if(!IsBitSet(cub.Mask(j), (int) tuple[j+1]-data->Min()[j+1] ))
{
out += weights[j+1]; // sum up outside weights
if(out > w) // and check if treshold is exceeded
return false; // not inside
}
}
else
// continuous or ordinal
if( tuple[j+1] < cub.Min(j) || tuple[j+1] > cub.Max(j) )
{
out += weights[j+1]; // sum up outside weights
if(out > w) // and check if treshold is exceeded
return false; // not inside
}
}
// else ... tuple is inside
return true;
}
//----------------------------------------------------------------------------------------------------------------------
// delete interval from list - note: this means merge(!) the opI-th interval with the 'opI-1'-th one
void TIntervals::Del(const int& opI)
{
list->Pos(opI); // position
int len = list->Get().nTup; //
list->Del(); // delete
list->Pos(opI-1); // position to preceeding interval
list->Get().nTup = len+list->Get().nTup; // add length
}
//----------------------------------------------------------------------------------------------------------------------
// count # covered negative examples and decide upon merge
bool TIntervals::NegTest(TCuboid& cuboid, const int& y_quant, const int& maxNeg, const float& w_COD
, const float*const& weight) const
{
#ifdef PROFILE
clock_t time=clock();
nNegTest++;
#endif
float neg = 0; // ini
// check all tuples of <checkVec>
for(int i=0;i<len_checkVec[y_quant];i++)
if(HitTest(cuboid, data->Row(checkVec[y_quant][i]), data, w_COD, weight, order))
if(++neg > maxNeg)
return false; // deny merge
cuboid.neg = neg; // store # covered negative examples
#ifdef PROFILE
t_NegTest+=clock()-time;
#endif
return true; // allow merge
}
//----------------------------------------------------------------------------------------------------------------------
// create check-vectors and randomize their order:
// store indices of all (learn-)tuples whose (quantized) output value differs more than <DiffMax> from <y>
void TIntervals::CreateCheckVec(const int& y_quant, const TParameter*const& para)
{
int nId = 0; // ini
int* vec = new int[data->nTup()];
// over all intervals in list
int c =0; // counter
for(int i=0;i<list->Size();i++)
// if (quantized) output value differs more than <DiffMax> ...
#ifndef RELEASE
if(abs(Y_Quant(i)-y_quant)>para->DifMax)
#else
if(abs(Y_Quant(i)-y_quant)>0)
#endif
for(int t=0;t<Size(i);t++) // ... store all indices of interval
vec[nId++] = c++;
else
c += Size(i); // sum up skipped
// set randomized tuple id's of <vec> as check-vector
checkVec[y_quant] = Randomize(vec, nId);
len_checkVec[y_quant] = nId;
}
//----------------------------------------------------------------------------------------------------------------------
// get pointer to first tuple of i-th interval
const float**const& TIntervals::Get(const int& i) const
{
// note: '-1- is invalid but does not cause TStdList::Pos() to throw an exception. Thus do it here
IfTrueThrowTypeB(i==-1, "TIntervals::Get(const int i) called with 'i=-1' which is illegal!");
list->Pos(i); // position
return list->Get().opI; // return
}
//----------------------------------------------------------------------------------------------------------------------
// return index to an interval which is followed by another one with the same quantized output value. Function used
// in cascade mode (2nd pass) to find the intervals which need to be merged. Beware of danger! Do not change unless
// you know what you do!! ;-)
int TIntervals::NextInterval()
{
// ini: get (quantized) output value of current interval
int y = Y_Quant(id4Bg2++);
// a) search through list
int i;
for(i=0;i<list->Size();i++)
{
if(id4Bg2 >= list->Size()) // check for flashover
id4Bg2 = 0;
const float _y = Y_Quant(id4Bg2++);
if(y==_y) // compare (quantized) output values
break;
else
y =_y; // store new (quantized) output value
}
// b) if none found ... return '-1'
if(i==list->Size())
return -1;
// c) calculate return value
int opI = id4Bg2 - 2;
// d) position forward to (quantized) next output value as start position for the next call to this function
// note: maximally check as many times as there are list items
for(i=0;i<list->Size();i++)
{
if(id4Bg2 >= list->Size()) // check for flashover
id4Bg2 = 0;
if(y!=Y_Quant(id4Bg2)) // stop if new (quantized) output value is found
break;
id4Bg2++;
}
if(id4Bg2!= 0)
id4Bg2--; // position back, as (later) a preceeding element of the interval list will be deleted
return opI;
}
//-------------------------------------------------------------------------------------------------------------------
//
// class TAdjMat - adjazenz matrix (lower triangular matrix) which stores distances from all cuboids of a cuboid
// list to eachother
//
//----------------------------------------------------------------------------------------------------------------------
// constructor - note_: data structure is list over list to emulate lower triangular matrix
TAdjMat::TAdjMat(const TCuboids*const& _cuboids)
{
cuboids = _cuboids;
// new matrix (row pointer list)
mat = new TFloatsList;
mat->SetName("TAdjMatRowList");
// over all rows - 'i=1' cause matrix is lower triangular without diagonal
for(int i=1;i<cuboids->Size();i++)
{
TFloatList* row = new TFloatList; // new row (distances list) ...
row->SetName("TAdjMatDistancesList");
mat->Ins(row); // ... and insert it
// set i-th cuboid as reference cuboid ...
TCuboid* ref = &cuboids->Get(i);
// ... and calculate the distances to all other cuboids - note: mind that the matrix is lower triangular
for(int j=0;j<i;j++)
row->Ins()=cuboids->Distance(j, ref);
}
}
//----------------------------------------------------------------------------------------------------------------------
// destructor
TAdjMat::~TAdjMat()
{
// delete matrix
DeleteEntries(mat);
delete mat;
}
//----------------------------------------------------------------------------------------------------------------------
// append row
void TAdjMat::App(const TCuboid*const& ref)
{
if(cuboids->Size() < 2) // to not proceed if there are less than two cuboids in cuboid list
return;
mat->Pos(Size()-1); // position to the end of the row pointer list
TFloatList* row = new TFloatList; // new row ...
row->SetName("TAdjMatDistancesList(appended)");
mat->Ins(row); // ... and insert it
// calculate distances to all other cuboids - note: mind that the matrix is triangular
for(int j=0;j<Size();j++)
row->Ins()=cuboids->Distance(j, ref);
}
//----------------------------------------------------------------------------------------------------------------------
// debug function
void TAdjMat::Save(ofstream& file) const
{
file << ComChar << " TAdjMat" << endl;
file << ComChar << " Size = " << mat->Size() << endl;
file << ComChar << " # Cuboids = " << cuboids->Size() << endl;
mat->Reset(); // position to first row
for(int i=0;i<mat->Size();i++) // over all rows
{
mat->Next();
TFloatList* row = mat->Get(); // get row (list with distances)
row->Reset();
for(int j=0;j<row->Size();j++) // over complete row
{
row->Next();
file << row->Get() << " ";
}
file << endl;
}
file << endl;
}
//----------------------------------------------------------------------------------------------------------------------
// delete row/column in adjazency matrix
void TAdjMat::Del(const int& rowId)
{
// valid values are e[0..Size()] (i.e. there are Size()+1 valid values!)
IfTrueThrowTypeB( rowId<0 || rowId>mat->Size(), "Function called with illegal value for 'rowId'!", "TAdjMat::Del"
, szModule);
// second case: delete last row which means delete no(!) column
IfTrueThrowTypeB( rowId==mat->Size(), MSG_FNI, "TAdjMat::Del", szModule);
// first case: rowId==0 , i.e. delete first column and no(!) row
if(rowId==0)
{
// reset to first row
mat->Reset();
mat->Next();
// delete first column
delete mat->Get(); // delete row
mat->Del(); // delete from row pointer list
if(mat->Size()==0) // return if it's the last row
return;
// else
TFloatList* row = mat->Get(); // get row
row->Reset(); // reset list with distances
row->Next(); //
row->Del(); // delete first entry (first column)
for(int i=1;i<mat->Size();i++) // over all rows - 'i=1' cause first row already worked over
{
mat->Next();
TFloatList* row = mat->Get(); // get row
row->Reset(); // reset list with distances
row->Next(); //
row->Del(); // delete first entry (first column)
}
return;
}
// else: delete row and column
mat->Pos(rowId-1); // '-1' cause rows specified by <rowId> are enumerated starting with 1
delete mat->Get(); // delete row
mat->Del(); // delete row from row pointer list
TFloatList* row = mat->Get(); // get row
row->Pos(rowId); // position
row->Del(); // ... and delete
for(int i=rowId;i<mat->Size();i++) // over all rows - 'i=rowId' cause columns need to be deleted just up from there
{
mat->Next(); //
TFloatList* row = mat->Get(); // get row
row->Pos(rowId); // position
row->Del(); // ... and delete
}
}
//----------------------------------------------------------------------------------------------------------------------
// re-calculate distances - caution: reference cuboid must be correctly initialized!
void TAdjMat::Recalc(const int& rowId, const TCuboid*const& ref)
{
// valid values are e[0..Size] - (i.e. there are Size()+1 valid values)
IfTrueThrowTypeB(rowId<0 || rowId>mat->Size(), "Function called with illegal value for 'rowId'!", "TAdjMat::Recalc"
, szModule);
// 1. re-calculate row -> ...
if(rowId != 0)
{
mat->Pos(rowId-1); // '-1'
TFloatList* row = mat->Get(); // get row
row->Reset();
// re-calculate row
for(int i=0;i<row->Size();i++) // over all distances
{
row->Next();
row->Get() = cuboids->Distance(i, ref); // re-calculate
}
}
else
mat->Reset();
// 2. re-calculate column
for(int i=rowId;i<mat->Size();i++)
{
mat->Next();
TFloatList* row = mat->Get(); // get row
row->Pos(rowId); // position to column
row->Get() = cuboids->Distance(i+1, ref); // re-calculate
}
}
//----------------------------------------------------------------------------------------------------------------------
// disconnect nodes 'i' and 'j'
void TAdjMat::Disconnect(const int& i, const int& j)
{
// note: valid values are: i e[1..Size()], j e[0..Size()-1] with i<=j
IfTrueThrowTypeB(i<=0||i>mat->Size(),"Function called with illegal value for 'i'!", "TAdjMat::Disconnect", szModule);
IfTrueThrowTypeB(j<0||j>mat->Size()-1,"Function called with illegal value for 'j'", "TAdjMat::Disconnect", szModule);
IfTrueThrowTypeB(i<=j, "Function called with 'i<=j' which is illegal!", "TAdjMat::Disconnect", szModule);
mat->Pos(i-1);
TFloatList* row = mat->Get(); // get row
row->Pos(j); // position in row and ...
row->Get() = MAXFLOAT; // ... disconncet by setting maximal possible value
}
//----------------------------------------------------------------------------------------------------------------------
// search smallest distance in adjazenz matrix
//
// notes: rows are indicated by 'k' and enumerated from '1' to 'Size()-1'
// columns are indicated by 'l' and enumerated from '0' to 'Size()-2'
// This allows to use 'k' and 'l' to index inside TCuboids-objects
// note: k>l
float TAdjMat::FindMin(int& k, int& l) const
{
#ifdef PROFILE
clock_t time=clock();
nFindMin++;
#endif
float min = MAXFLOAT; // ini
if(mat->Size() == 0) // return if matrix is empty
return min;
// 1. reset row pointer list
mat->Reset();
// 2. iterate all rows - 'i=1' cause matrix is triangular without diagonal
for(int i=1;i<cuboids->Size();i++) // over all rows
{
mat->Next(); //
TFloatList*& row = mat->Get(); // get row
row->Reset(); // ... reset it
// iterate over all distances in row - note: mind that matrix is triangular without diagonal
for(int j=0;j<i;j++)
{
row->Next(); //
// if new minimum is found -> ...
if(row->Get() < min)
{
min = row->Get(); // store new minimum
k = i; // store row and column
l = j;
if(min==0) // performance boost: stop if distance of zero is found
{
#ifdef PROFILE
t_FindMin+=clock()-time; // profiling code
#endif
return min;
}
}
}
}
#ifdef PROFILE
t_FindMin+=clock()-time;
#endif
return min;
}
//-------------------------------------------------------------------------------------------------------------------
//
// class TAdjArr - adjazenz-array to represent all combinations between two cuboids lists (rectangular matrix)
//
//----------------------------------------------------------------------------------------------------------------------
// constructor
TAdjMatEx::TAdjArr::TAdjArr(TCuboids*const& _a, TCuboids*const& _b)
{
mat = new TFloatsList; // new row pointer list
mat->SetName("TAdjArrRowList");
a = _a;
b = _b;
m = NULL;
for(int i=0;i<b->Size();i++) // new b->Size() rows and calculate distances
{
TFloatList*& row = mat->Ins(); // new row pointer list entry ...
row = new TFloatList; // and new distances list
row->SetName("TAdjArrDistancesList");
// set reference cuboid
TCuboid* ref = &b->Get(i);
for(int j=0;j<a->Size();j++) // over all columns
row->Ins()=a->Distance(j, ref); // calculate distance to reference cuboid
}
_nCols = a->Size(); // store # columns
}
//----------------------------------------------------------------------------------------------------------------------
// constructor
TAdjMatEx::TAdjArr::TAdjArr(TCuboids*const& _a, TCuboids*const& _b, TCuboids*const& _m)
{
mat = new TFloatsList; // new row list
mat->SetName("TAdjArrRowList"); // and set its name
// copy
a = _a;
b = _b;
m = _m;
_nCols = a->Size()+b->Size(); // store # columns
}
//----------------------------------------------------------------------------------------------------------------------
// destructor
TAdjMatEx::TAdjArr::~TAdjArr()
{
DeleteEntries(mat);
delete mat;
}
//----------------------------------------------------------------------------------------------------------------------
// recalculate distances if cuboid was changed/merged
void TAdjMatEx::TAdjArr::Recalc(const int& _row, const TCuboid*const& ref)
{
// check
IfTrueThrowTypeB(!m, "Fucntion called for instance constructed with only 2 adjacency objects!", "TAdjArr::Recalc", szModule);
IfTrueThrowTypeB(_row<0 || _row>=mat->Size(), "Function called with illegal values!", "TAdjArr::Recalc", szModule);
mat->Pos(_row); // position to row
TFloatList* row = mat->Get(); // get row
if(_nCols == 0) // return if there are no columns anymore
return;
row->Reset();
for(int j=0;j<a->Size();j++) // all columns (<a>) - note: matrix has size m->Size() x (a->Size()+b->Size())
{
row->Next();
row->Get() = a->Distance(j, ref); // calculate distance to reference cuboid
}
for(int j=0;j<b->Size();j++) // all columns (<b>)
{
row->Next();
row->Get() = b->Distance(j, ref); // calculate distance to reference cuboid
}
}
//----------------------------------------------------------------------------------------------------------------------
// delete row
void TAdjMatEx::TAdjArr::DelRow(const int& rowId)
{
IfTrueThrowTypeB(rowId<0||rowId>=nRows(),"Function called with illegal value of 'rowId'!","TAdjArr::DelRow",szModule);
mat->Pos(rowId); // position
delete mat->Get(); // delete distances list
mat->Del(); // delete from row pointer list
}
//----------------------------------------------------------------------------------------------------------------------
// delete column
void TAdjMatEx::TAdjArr::DelCol(const int& colId)
{
IfTrueThrowTypeB(colId<0||colId>=_nCols,"Function called with illegal value of 'colId'!","TAdjArr::DelCol",szModule);
for(int i=0;i<nRows();i++) // delete colId-th distance in all distances lists, i.e. in all rows
{
mat->Pos(i); // position to i-th row
TFloatList* row = mat->Get();
row->Pos(colId); // position to colId-th distance
row->Del(); // delete list entry
}
_nCols--; // decrement column counter
}
//----------------------------------------------------------------------------------------------------------------------
// append row
void TAdjMatEx::TAdjArr::App()
{
IfTrueThrowTypeB(!m,"Function called for instance constructed with only 2 adjacency objects!","TAdjArr::App",szModule);
mat->Pos(nRows()-1); // position to the end of row pointer list
TFloatList*& row = mat->Ins(); // insert new row
row = new TFloatList; // ... and new distances list
row->SetName("TAdjArrDistancesList");
TCuboid* ref = &m->Get(nRows()-1); // set reference cuboid
for(int j=0;j<a->Size();j++) // all columns due to <a>
row->Ins()=a->Distance(j, ref); // calculate distance
for(int j=0;j<b->Size();j++) // all columns du to <b>
row->Ins()=b->Distance(j, ref); // calculate distance
}
//----------------------------------------------------------------------------------------------------------------------
// disconnect node
void TAdjMatEx::TAdjArr::Disconnect(const int& rowId, const int& colId)
{
IfTrueThrowTypeB(rowId<0 || rowId>=mat->Size() || colId<0 || colId>=_nCols, "Function called with illegal values!",
"TAdjArr::Disconnect", szModule);
mat->Pos(rowId); // position to rowId-th row
TFloatList* row = mat->Get();
row->Pos(colId); // position to colId-th column
row->Get() = MAXFLOAT; // disconnect
}
//----------------------------------------------------------------------------------------------------------------------
// debug function
void TAdjMatEx::TAdjArr::Save(ofstream& file) const
{
// write header
file << ComChar << " TAdjArr" << endl;
file << ComChar << " Size = " << mat->Size() << " x " << _nCols << endl;
// write matrix
if(mat->Size() > 0 && _nCols >0) // if matrix (list) non-empty
{
mat->Reset();
// over all rows
for(int i=0;i<nRows();i++)
{
mat->Next(); //
TFloatList*& row = mat->Get(); // get row
row->Reset();
for(int j=0;j<_nCols;j++) // iterate all distances in row
{
row->Next();
file << row->Get() << " ";
}
}
}
file << endl;
}
//----------------------------------------------------------------------------------------------------------------------
// find minimum distance
float TAdjMatEx::TAdjArr::FindMin(int& id1, TCuboids*& cub2, int& id2)
{
#ifdef PROFILE
clock_t time=clock();
nFindMinEx++;
#endif
float min = MAXFLOAT; // ini with biggest possible value
if(mat->Size() > 0 && _nCols >0) // if matrix (list) non-empty
{
mat->Reset();
// search for smallest distance
for(int i=0;i<nRows();i++)
{
mat->Next(); //
TFloatList*& row = mat->Get(); // get row
row->Reset();
for(int j=0;j<_nCols;j++) // iterate all distances in row
{
row->Next();
if(row->Get() < min) // if new minimum found ...
{
min = row->Get(); // ... store it
id1 = i;
id2 = j;
}
}
}
// set cuboid lists: change 'cub2' und 'id2' to 'b', if 'id2' identifies a column in the part of 'b'
cub2 = a; // ini
if(m && id2 >= a->Size())
{
cub2 = b;
id2 = id2-a->Size();
}
}
#ifdef PROFILE
t_FindMinEx+=clock()-time;
#endif
return min;
}
//-------------------------------------------------------------------------------------------------------------------
//
// class TAdjMatEx
//
//----------------------------------------------------------------------------------------------------------------------
// constructor
TAdjMatEx::TAdjMatEx(TCuboidsEx*const& cuboidsEx)
{
a = cuboidsEx->a;
b = cuboidsEx->b;
m = cuboidsEx->m;
matA_B = new TAdjArr(a, b);
matM_AB = new TAdjArr(a, b, m);
matM_M = new TAdjMat(m);
}
// destructor
TAdjMatEx::~TAdjMatEx()
{
delete matA_B;
delete matM_AB;
delete matM_M;
}
//----------------------------------------------------------------------------------------------------------------------
// find smallest distance, search all three adjazenz objects
float TAdjMatEx::FindMin(TCuboids*& cub1, int& id1, TCuboids*& cub2, int& id2)
{
// temporary variables to store results of each call of FindMin()
TCuboids *cubInMatA_B, *cubInMatM_AB;
int id1_A_B, id2_A_B, id1_M_AB, id2_M_AB, id1_M_M, id2_M_M;
// call FindMin()
float min_A_B = matA_B ->FindMin(id1_A_B, cubInMatA_B, id2_A_B);
float min_M_AB = matM_AB ->FindMin(id1_M_AB, cubInMatM_AB, id2_M_AB);
float min_M_M = matM_M ->FindMin(id1_M_M , id2_M_M);
// find smallest value
if(min_M_M < min_A_B && min_M_M < min_M_AB)
{
// matrix 'M' has smallest distance
cub1 = m;
cub2 = m;
id1 = id1_M_M;
id2 = id2_M_M;
return min_M_M;
}
// else
if(min_A_B < min_M_AB)
{
// matrix 'A_B' has smallest distance
cub1 = b;
cub2 = a;
id1 = id1_A_B;
id2 = id2_A_B;
return min_A_B;
}
// else
// matrix 'M_AB' has smallest distance
cub1 = m;
cub2 = cubInMatM_AB;
id1 = id1_M_AB;
id2 = id2_M_AB;
return min_M_AB;
}
//----------------------------------------------------------------------------------------------------------------------
// disconnect node
void TAdjMatEx::Disconnect(const TCuboids*const& cubs1, const int& id1, const TCuboids*const& cubs2, const int& id2)
{
if(cubs1!=m) matA_B->Disconnect(id1, id2); // delete in matrix 'A_B'
else if(cubs2 == a)matM_AB->Disconnect(id1,id2); // delete in matrix 'M_AB' in part A
else if(cubs2 == b) matM_AB->Disconnect(id1,id2+a->Size());// delete in matrix 'M_AB' in part B
else if(cubs2 == m) matM_M->Disconnect(id1, id2); // delete in matrix 'M_M'
}
//----------------------------------------------------------------------------------------------------------------------
// reflect merge of two cuboids in adjazenz matrices
void TAdjMatEx::Merge(const TCuboids*const& cubs1, const int& id1, const TCuboids*const& cubs2, const int& id2
, const TCuboid*const& ref)
{
// case a) : merge in matrix 'A_B'
if(cubs1 != m)
{
matA_B ->DelRow(id1); // delete row
matA_B ->DelCol(id2); // ...and columns
matM_AB->DelCol(id2);
matM_AB->DelCol(a->Size()+id1); // '+1-1=+-0' cause preceeding column was deleted and(!) cuboid list 'A'
// has an cuboid fewer
matM_AB->App(); // append new row - note: will be calculated automatically
matM_M ->App(ref);
return;
}
// case c): merge in matrix 'M_M'
if(cubs2 == m) // delete in matrix 'M_AB' and(!) 'M_M'
{
matM_AB->DelRow(id2); // delete two rows (first delete)
matM_AB->Recalc(id1-1, ref); // '-1' cause preceeding row(?) was deleted
matM_M ->Del(id2); // 2nd delete (see above)
if(matM_M->Size() >0)
matM_M->Recalc(id1-1, ref); // ... recalculate
return;
}
// case b): merge in matrix 'M_AB'
if(cubs2 == a)
{
matA_B ->DelCol(id2);
matM_AB->DelCol(id2);
}
else
{
matA_B ->DelRow(id2);
matM_AB->DelCol(id2+a->Size());
}
// do recalculations
matM_AB->Recalc(id1, ref);
if(matM_M->Size()>0) // only if matrix ins't empty
matM_M->Recalc(id1, ref);
}
//-------------------------------------------------------------------------------------------------------------------
//
// class TPnc
//
//----------------------------------------------------------------------------------------------------------------------
// constructor
TPnc::TPnc(TData*const& _data, const TParameter& _para)
{
#ifdef DEBUG_LOG_ON
DebugLogFile << "TPnc object intialized ... ";
#endif
// check if data is sorted
IfTrueThrowTypeA(!_data->IsSorted(), "Data is not sorted! Call Sort(0) before!", "TPnc::TPnc", szModule);
data = _data; // copy learn data and parameter
para = _para;
f_Finished = false; // set flags
f_Pruned = false;
f_Next = true;
f_Cascade = false;
state = s_Pass1; // set state
n_ges = data->nTup(); // initialize total number of (trivial) cuboids
opI = 0; // reset
progress = 0;
data->Lock(); // lock data
dfunc.Initialize(para.Metric); // set metric
TDataData::TNormType normType = TDataData::ByRange; // normalization type for continuous features
if(!para.f_NormalizeByRange)
normType = TDataData::ByFourSigma;
ddata = new TDataData(data, normType, para.OverlapFac);
intervals = new TIntervals(data, ¶); // interval wise access to sorted data
cuboidsLists = new TCuboidsLists; // list with cuboid lists
cuboids = NULL; // initialize with NULL pointer
adjMat = NULL;
cuboidsEx = NULL;
adjMatEx = NULL;
#ifndef RELEASE
u_Gamma = CalcUGamma(ALPHA); // used for conf. intervals for confident merge test
#else
u_Gamma = 1.0;
#endif
// profiling code
#ifdef PROFILE
t_FindMin = t_FindMinEx = t_NegTest = t_PostPrune = t_Convert2TCluster = t_Distance = t_TCuboids = t_TIntervals= 0;
nNegTest = nFindMin = nFindMinEx = nHitTest = nDistance = nTCuboids = nConvert2TCluster =0;
t_Reset = clock();
#endif
}
//----------------------------------------------------------------------------------------------------------------------
// destructor
TPnc::~TPnc()
{
#ifdef DEBUG_LOG_ON
DebugLogFile << "... deleted" << endl;
#endif
#ifdef PROFILE // profiling code
logfile << endl << endl << "T_Learn ="; // learning
WriteTime(t_Learn, logfile);
logfile << endl << "T_PostPrune ="; // post pruning
WriteTime(t_PostPrune, logfile);
logfile << endl << "FindMin " << setw(8) << nFindMin << " calls ";
WriteTime(t_FindMin, logfile);
logfile << endl << "FindMinEx " << setw(8) << nFindMinEx << " calls ";
WriteTime(t_FindMinEx, logfile);
logfile << endl << "NegTest " << setw(8) << nNegTest << " calls ";
WriteTime(t_NegTest, logfile);
logfile << endl << "HitTest " << setw(8) << nHitTest << " calls ";
logfile << endl << "Distance " << setw(8) << nDistance << " calls ";
WriteTime(t_Distance, logfile);
logfile << endl << "Convert2TCluster " << setw(8) << nConvert2TCluster << " calls ";
WriteTime(t_Convert2TCluster, logfile);
#endif
delete cuboids;
delete cuboidsEx;
delete adjMat;
delete adjMatEx;
delete intervals;
delete cuboidsLists;
data->Unlock();
ddata->Release();
}
//----------------------------------------------------------------------------------------------------------------------
// debug function: write model to file (not tested!)
void TPnc::Save(ofstream& file) const
{
// a) write header
file << ComChar << " Cuboid data (debug function)" << endl;
// b) get mapper to all cuboids
const TCuboid**const& mapper=cuboidsLists->GetMapper(consistent_mapper);
// c) write control parameters
file << ComChar << " OpI = " << cuboidsLists->Size() << endl; // # cuboid lists
file << ComChar << " Cascade = " << FlagToString(f_Cascade) << endl; // cascade flag
for(int i=0;i<cuboidsLists->Size();i++) // # cuboids in each cuboid list
file << ComChar << " N_Cuboids" << (i+1) << " = " << cuboidsLists->Size(i) << endl;
file << ComChar << " N_C = " << cuboidsLists->nCuboids() << endl << endl; // total # cuboids all together
// d) write comment lines with variable id and range
char szText[STS]; // variable numbers (comment line one)
sprintf(szText, "%c VarId%3d ", ComChar, 1);
file << szText;
for(int j=1;j<data->nVar();j++)
{
if(NOMINAL_VARIABLES && data->IsSymbolic(j))
sprintf(szText, "%*d ", data->nIntegerMaxMin(j), j+1); // note: use different width for symbolic columns
else
sprintf(szText, "%10d ", j+1);
file << szText;
}
file << endl << ComChar << " Range "; // input minima/maxima (comment line two)
for(int j=1;j<data->nVar();j++)
{
if(NOMINAL_VARIABLES && data->IsSymbolic(j))
sprintf(szText, "%*d ", data->nIntegerMaxMin(j), data->nIntegerMaxMin(j)); // symbolic: write # symbols
else
sprintf(szText, "%9g-%-9g", data->Min()[j], data->Max()[j]); // continuous: write minma/maxima
file << szText << " ";
}
file << endl;
// d) write cuboid data
for(int i=0;i<cuboidsLists->nCuboids();i++)
{
sprintf(szText, "%10g ", mapper[i]->y_snap); // snapped output value
file << szText;
// note: variable 'j' is without(!) output
for(int j=0;j<data->nVar()-1;j++) // for each input variable
{
if(NOMINAL_VARIABLES && data->IsSymbolic(j+1))
{
WriteBits(file, mapper[i]->Mask(j), data->nIntegerMaxMin(j+1)); // symbolic (nominal): write bits
file << " ";
}
else
{
sprintf(szText, "%9g-%-9g", mapper[i]->Min(j), mapper[i]->Max(j)); // cont. or ordinal: write left and
// right bound
file << szText;
}
file << " ";
}
// mass and # covered negative examples
file << setw(12) << mapper[i]->m << " " << setw(12) << mapper[i]->neg << endl;
}
}
//----------------------------------------------------------------------------------------------------------------------
// write mass, #negative examples and tuple id's (id's of all tuples merged in the cluster) for each cluster
void TPnc::SaveClusterTupleIds(ofstream& file) const
{
// a) get mapper to all cuboids
const TCuboid**const& mapper=cuboidsLists->GetMapper(consistent_mapper);
// b) write header
file << ComChar << " Each cuboid's tuple Id's" << endl;
file << ComChar << " Format: # positive examples | # negative examples | Id's of tuples merged in cuboid" <<endl;
// c) write tuple id's for each cluster
for(int i=0;i<cuboidsLists->nCuboids();i++)
{
file << setw(10) << mapper[i]->m << " " << setw(10) << mapper[i]->neg << " "; // mass and # negative examples
for(int k=0;k<mapper[i]->m;k++)
file << " " << (mapper[i]->tuples->Get(k)+1);
file << endl;
}
}
#ifndef RELEASE
//----------------------------------------------------------------------------------------------------------------------
// return maximal # negative examples of generalized cuboid which does not lead to a hirate worse than the mean hitrate
// (reduced by confidence intervals) of the two single cuboids.
int MaxNegs(const TCuboid& a, const TCuboid& b, const float& u_Gamma, const float& r_min)
{
float p=a.m+b.m; // mass of generalized cuboid, also the number of positive examples
float r_a = 1-CalcUpper(a.m+a.neg, a.neg, u_Gamma); // hitrate of each single cuboid decreased by confidence interval
float r_b = 1-CalcUpper(b.m+b.neg, b.neg, u_Gamma);
float r = max(r_min, (a.m*r_a+b.m*r_b)/p); // average hitrate bounded to 'r_min'
return (p-r*p)/r; // r=p/(p+n) <=> n=(p-rp)/r
}
#endif
//----------------------------------------------------------------------------------------------------------------------
// return maximal # negative examples of generalized cuboid which does not lead to a hirate worse than the mean hitrate
// (determined by laplace correction) of the two single cuboids.
int MaxNegsLapCor(const TCuboid& a, const TCuboid& b, const int& nClasses)
{
float p=a.m+b.m; // mass of generalized cuboid, also the number of positive examples
float r_a = (a.m+1.0) / (a.m+a.neg+nClasses); // hitrate of each single cuboid
float r_b = (b.m+1.0) / (b.m+b.neg+nClasses);
// float r = max(r_min, (a.m*r_a+b.m*r_b)/p); // average hitrate bounded to 'r_min'
float r = (a.m*r_a+b.m*r_b)/p; // average hitrate
return (p*(1-r) + 1 - r*nClasses) / r;
}
//----------------------------------------------------------------------------------------------------------------------
// do one iteration step - return false if finished
bool TPnc::Iterate()
{
// if 2nd pass (cascade mode) ...
if(f_Cascade)
{
consistent_mapper = false; // reset flag
// 1. if flag is set: stepover to next output interval (outer loop)
if(f_Next)
{
// a) get id of next output interval
opI = intervals->NextInterval();
// b) if there are no more intervals to process -> end process
if(opI == -1)
{
#ifdef PROFILE // profiling code
t_Learn = clock()-t_Reset;
#endif
#ifdef DEBUG_LOG_ON
DebugLogFile << " Finished! "; // learning is finished
#endif
f_Finished = true;
return false;
}
#ifdef DEBUG_LOG_ON
DebugLogFile << "|"; // start merging of two groups/intervalls
#endif
// progress indication
f_Next = false; // reset flag
nChecked = 0; // reset # checked nodes
nToCheck = 1+cuboidsLists->Get(opI)->Size()*cuboidsLists->Get(opI+1)->Size();
baseProgress = 1.0-(intervals->Size()-para.N_Int)/((float) nCascadeInt); // progress
// c) create 'extended' cuboid list using the op-th and the 'opI-1'-th cuboid lists
cuboidsEx = new TCuboidsEx(cuboidsLists->Get(opI), cuboidsLists->Get(opI+1), dfunc, ddata, para.Weights);
// delete the two cuboid lists just taken to form TCuboidsEx object
cuboidsLists->Del(opI);
cuboidsLists->Del(opI);
// d) create 'extended' adjazenz matrix
adjMatEx = new TAdjMatEx(cuboidsEx);
}
// actualize progress
progress = baseProgress + incProgress*min((float) 1.0, nChecked/((float)nToCheck));
// inner loop: proceed as long as there are un-tested combinations of cuboids
// 2. find the two closest cuboids and try to merge them
TCuboids *cubs1, *cubs2;
int id1, id2;
if(adjMatEx->FindMin(cubs1, id1, cubs2, id2)!= MAXFLOAT ) // a) find the two closest cuboids ...
{
const TCuboid& a = cubs1->Get(id1); // b) ... get references to them ...
const TCuboid& b = cubs2->Get(id2); //
TCuboid* conj = Generalize(a, b, data); // ... generalize them
// determine maximal # negative examples
#ifdef SINGLE_MERGE_TEST
int maxNegs = (a.m+b.m)*para.Eta; // single merge test
#endif
#ifdef DOUBLE_MERGE_TEST
int maxNegs = min(a.m*para.Eta+b.neg, b.m*para.Eta+a.neg); // double merge test
#endif
#ifdef CONFIDENT_MERGE_TEST
int maxNegs = MaxNegs(a, b, u_Gamma, 1.0/(1.0+para.Eta)); // confident merge test
#endif
#ifdef LAPCOR_MERGE_TEST
int maxNegs = MaxNegsLapCor(a, b, intervals->nInt()); // laplace corrected merge test
#endif
// c) make test and merge if generalized cuboid passes test
if(intervals->NegTest(*conj, intervals->Y_Quant(opI), maxNegs, para.w_COD, ddata->Weights()))
{
// test passed
// delete min(k,l)-th cuboid and overwrite max(k,l)-th one with generalized cuboid <conj>
cuboidsEx->Merge(cubs1, id1, cubs2, id2, *conj);
// reflect merge in adjazenz matrix
adjMatEx->Merge(cubs1, id1, cubs2, id2, conj);
conj->Assimilate();
n_ges--; // decrement
}
else
adjMatEx->Disconnect(cubs1, id1, cubs2, id2); // test failed: disconnect node(k, l)
delete conj; // d) delete generalized cuboid - note: if accepted and kept cuboid was copied and 'assimilated'
nChecked++; // increment # checked nodes
}
else
{
// 3. end actual step of outer loop: replace the two just processed cuboid lists with the resulting cuboid list
#ifdef DEBUG_LOG_ON
DebugLogFile << "ok ";
#endif
// set flag
f_Next = true;
// delete interval from interval list
intervals->Del(opI+1);
// ins. cub. in list of all cub. lists - note: 'opI-1'cause insertion takes place after(!) specified position
cuboidsLists->Ins(opI-1, cuboidsEx->ToCuboids());
// release 'extended' cuboid list and 'extended' adjazenz matrix
delete adjMatEx; adjMatEx = NULL;
delete cuboidsEx; cuboidsEx = NULL;
}
return true;
}
// else: not cascade mode
// 1. if flag is set: stepover to next output interval (outer loop)
if(f_Next)
{
#ifdef DEBUG_LOG_ON
DebugLogFile << ".";
#endif
// a) create cuboids from part of the (learn) data tuples
cuboids = new TCuboids(intervals, data, opI, dfunc, ddata, para.Weights);
// b) create adjazenz matrix
adjMat = new TAdjMat(cuboids);
f_Next = false; // reset flag
nChecked = 0; // reset # checked nodes
incProgress = 1.0/intervals->Size();// progress in percent if one output interval/group is processed in first pass
nToCheck = 1+cuboids->Size()*(cuboids->Size()-1)*0.5;
baseProgress = opI*incProgress; // progress
}
// actualize progress
int nStillToCheck = 1+cuboids->Size()*(cuboids->Size()-1)*0.5;
progress = baseProgress+incProgress*(1-sqrt(nStillToCheck/((float) nToCheck)));
// inner loop: proceed as long as there are un-tested combinations of cuboids
// 2. find the two closest cuboids and try to merge them
int k, l;
if(adjMat->FindMin(k, l)!=MAXFLOAT) // a) find the two closest cuboid ...
{
const TCuboid& a = cuboids->Get(k); // b) ... get references to them
const TCuboid& b = cuboids->Get(l);
TCuboid* conj = Generalize(a, b, data); // ... generalize them ...
// determine maximal # negative examples
#ifdef SINGLE_MERGE_TEST
int maxNegs = (a.m+b.m)*para.Eta; // single merge test
#endif
#ifdef DOUBLE_MERGE_TEST
int maxNegs = min(a.m*para.Eta+b.neg, b.m*para.Eta+a.neg); // double merge test
#endif
#ifdef CONFIDENT_MERGE_TEST
int maxNegs = MaxNegs(a, b, u_Gamma, 1.0/(1.0+para.Eta)); // confident merge test
#endif
#ifdef LAPCOR_MERGE_TEST
int maxNegs = MaxNegsLapCor(a, b, intervals->nInt()); // laplace corrected merge test
#endif
// c) make test and merge if generalized cuboid passes test
if(intervals->NegTest(*conj, intervals->Y_Quant(opI), maxNegs, para.w_COD, ddata->Weights()))
{
// test passed
// delete (k,l)-th cuboid and overwrite max(k,l)-th cuboid in cuboid list with the generalized cuboid <conj>
cuboids->Merge(k, l, *conj);
// reflect merge in adjazenz matrix - note: only done if there is more than one cuboid left
adjMat->Del(min(k, l));
if(adjMat->Size() > 0)
adjMat->Recalc(max(k, l)-1, conj); // '-1' cause preceeding row/column was deleted
n_ges--; // decrement
conj->Assimilate();
}
else
adjMat->Disconnect(k, l); // disconnect node(k, l)
delete conj; // see note above
nChecked++; // increment # checked nodes
}
else
{
// 3. end actual step of outer loop: insert actually processed cuboid list in 'list of all cuboid lists'
#ifdef DEBUG_LOG_ON
DebugLogFile << "ok ";
#endif
// set flags
consistent_mapper = false;
f_Next = true;
// append actual cuboid list
cuboidsLists->App(cuboids);
// release adjazenz matrix and defer reponsibility for cuboid list by setting it to NULL
delete adjMat; adjMat = NULL;
cuboids= NULL; // note: ownership was passed to <cuboidsList>
opI++;
// if last interval was processed -> proceed in cascade-mode
if(opI == intervals->Size())
{
f_Cascade = true;
nCascadeInt = intervals->Size()-intervals->nInt(); // # intervals which need to be merged in 2nd pass
// progress in percent if one output interval/group is processed in 2nd pass (cascade mode)
if(nCascadeInt>0)
incProgress = 1.0/nCascadeInt;
state = s_Pass2;
}
}
return true;
}
// id's of tuples merged in i-th cuboid
TTupleIdList* TPnc::Tuples(const int& i) const{ return cuboidsLists->GetMapper(consistent_mapper)[i]->tuples;};
//----------------------------------------------------------------------------------------------------------------------
// write status text to static variable (char field) and return pointer to it
const char* TPnc::GetStatusText()
{
static char szStatus[STS];
switch(state)
{
case s_Converting : sprintf(szStatus, "Converting ..."); break;
case s_Pass1 : sprintf(szStatus, "First pass ..."); break;
case s_Pass2 : sprintf(szStatus, "Second pass ..."); break;
case s_Pruning : sprintf(szStatus, "Pruning ..."); break;
}
return szStatus;
}
//----------------------------------------------------------------------------------------------------------------------
// get progress - note: very simple heuristics used! ToDo: improve in future
const float TPnc::GetProgress(const bool& f_Prune/*=true*/) const
{
float base=0.0;
float step;
if(f_Prune) // if pruning needs to be done
{
step = 0.5; // 0.5=50% for first pass
if(state==s_Pass2)
{
base = 0.5;
step = 0.2; // 0.2=20% for 2nd pass
}
else
if(state==s_Pruning)
{
base = 0.7;
step = 0.3; // 0.3=30% for pruning
}
}
else // else: only learning needs to be done
{
step = 0.7; // 0.7=70% for first pass
if(state==s_Pass2)
{
base = 0.7;
step = 0.3; // 0.3=30% for 2nd pass
}
}
// note: the variable 'progress' indicates progress of the currently processed step (first pass, 2nd pass, pruning)
return base + progress*step; // calculate progress;
}
//----------------------------------------------------------------------------------------------------------------------
// re-calculate cuboid's bounds by taking an average of the two most outstanding values
// notes: - use ORIGINAL bounds if bound was pruned or is symbolic
// - do not call for symbolic (nominal) variables
// - variable Id 'j' is counted with(!) output
void CalcPosAndRadians(const int& j, float& lower, float& upper, const TCuboid*const& cub, const TData*const& data
, const float& fac)
{
const int nTup = cub->tuples->Size(); // # tuples merged in cluster
float l=-1,r=-1; // ini
// use original bounds if bound was pruned
if(cub->Min(j-1)== -MAXFLOAT) l=cub->Min(j-1);
if(cub->Max(j-1)== MAXFLOAT) r=cub->Max(j-1);
// if left and/or right bound were not pruned
if(r==-1 || l==-1)
{
float* x=new float[nTup]; // allocate
for(int t=0;t<nTup;t++) // copy j-th variable of all tuples
x[t]=data->Row(cub->tuples->Get(t))[j]; // note: data is with output
qsort(x, nTup, sizeof(x[0]), FloatCmpDes); // sort descending
// re-calculate if bound was not reduced
if(r==-1)
r=(x[0]+x[1]*fac)/(1+fac); // mean of first and 2nd maximum
if(l==-1)
l=(x[nTup-1]+x[nTup-2]*fac)/(1+fac); // mean of first and 2nd minima
delete[] x; // release
}
lower = l; // copy
upper = r;
}
//----------------------------------------------------------------------------------------------------------------------
// calculate real output value of cuboid
float CalcCuboidOutput(const TCuboid*const& cub, const TData*const& data)
{
const int nTup = cub->tuples->Size(); // # tuples merged in cluster
float y=0; // ini
// calculate mean output value of all tuples merged in cuboid
for(int t=0;t<nTup;t++)
y += data->Row(cub->tuples->Get(t))[0]; // sum up
y /= nTup; // make mean
return y;
}
//----------------------------------------------------------------------------------------------------------------------
// convert cuboids to TCluster to use it for TPrediction or derivates - note: skip cuboids with a mass less
// than ppara->k+1
TCluster* /*cr*/ TPnc::ToTCluster(const bool& f_PruneAnyWay, const bool*const& f_Stop/*=NULL*/, const int& k /*=0*/)
{
#ifdef DEBUG_LOG_ON
DebugLogFile << "Converting ... ";
#endif
if(!cuboidsLists) // security check
return NULL;
#ifdef PROFILE
clock_t time=clock();
nConvert2TCluster++;
#endif
state = s_Converting;
// 1. get mapper to all cuboids
const TCuboid**const& mapper = cuboidsLists->GetMapper(consistent_mapper);
// 2. new clusters
int nCls = nCluster(k); // # cuboids with mass > k
if(nCls<=0) // catch trivial case ...
nCls=1;
TCluster* cls = new TCluster(nCls, ddata->GetObject(), para);
cls->f_HasOriginalCuboids = true; // model is initialized with un-pruned bounds here
// 3. copy cuboid data:
int count=0;
for(int i=0;i<cuboidsLists->nCuboids();i++)
{
// a) skip if # tuples per cuboid is to small
if(mapper[i]->m<=k && (count!=0 || i!= cuboidsLists->nCuboids()-1)) // note: ensure at least one cluster
continue;
// b) output
#ifdef RE_CALC_Y_C
if(para.f_Regression)
// re-calculate cuboid's output value as mean over the output of all merged tuples
cls->Output(count) = CalcCuboidOutput(mapper[i], data);
else
#endif
cls->Output(count) = mapper[i]->y_snap;
// c) input positions and radians
for(int j=1;j<data->nVar();j++) // note: '-1' cause there is no output in TCuboid::l ....
if(NOMINAL_VARIABLES && data->IsSymbolic(j))
cls->Mask(count, j) = mapper[i]->Mask(j-1); // symbolic (nominal): copy
else
// continuous or ordinal:
#ifndef RELEASE // obsolete in release versions
if(para.Noise>0 && mapper[i]->tuples->Size()>1) // recalculate left and right bound
CalcPosAndRadians(j, cls->Lower(count,j), cls->Upper(count,j), mapper[i], data, para.Noise);
else
#endif
{
cls->Lower(count,j) = mapper[i]->Min(j-1); // copy left and ..
cls->Upper(count,j) = mapper[i]->Max(j-1); // ... right bound
}
// d) mass and # covered negative examples
cls->Mass(count) = mapper[i]->m;
cls->NegMass(count) = mapper[i]->neg;
count++;
}
#ifndef RELEASE // obsolete in release version
cls->CalcAvrDiameter(); // calculate average diameter
#endif
// 4. post prune if necessary/specified
// set flag which indicates that 'useful' pruning information is avalaible
cls->f_HasPruningInformation = (para.Prune || f_PruneAnyWay);
if(para.Prune || f_PruneAnyWay)
{
#ifdef DEBUG_LOG_ON
DebugLogFile << "Pruning ... ";
#endif
PostPrune(f_Stop);
}
// 5. set active id's, i.e. id's of non pruned variables note: id's are indexed with(!) output
state = s_Converting;
count=0;
for(int i=0;i<cuboidsLists->nCuboids();i++) // for all cuboids
{
// a) skip if # tuples per cuboid is to small
if(mapper[i]->m<=k && (count!=0 || i!= cuboidsLists->nCuboids()-1)) // note: ensure at least one cluster
continue;
// b) count # active, i.e. none pruned, variables and store their id's for performance reasons
int nActiveL=0, nActiveR=0; // ini
for(int j=1;j<data->nVar();j++)
if(NOMINAL_VARIABLES && data->IsSymbolic(j))
{
if((mapper[i]->Mask(j-1) ^~0)!=0) // not all bits are set (not pruned) ...
cls->ActiveLowerBound(count, nActiveL++) = j; // store id
}
else
{
if(mapper[i]->Min(j-1)>=data->Min()[j]) // if lower bound is not(!) pruned ...
cls->ActiveLowerBound(count, nActiveL++) = j; // store id
if(mapper[i]->Max(j-1)<=data->Max()[j]) // if upper bound is not(!) pruned ...
cls->ActiveUpperBound(count, nActiveR++) = j; // store id
}
cls->nActiveLowerBounds(count) = nActiveL; // copy
cls->nActiveUpperBounds(count) = nActiveR;
count++;
}
#ifdef PROFILE
t_Convert2TCluster +=clock()-time;
#endif
cls->CalculateHitRate(); // calculate hitrate for each cuboid
#ifdef DEBUG_LOG_ON
DebugLogFile << "Done! ";
#endif
return cls;
}
#ifdef FORWARD_PRUNING
//----------------------------------------------------------------------------------------------------------------------
// post pruning of cuboids; alternate forward selection pruning
struct TTId
{
int id;
float mass;
};
void TPnc::PostPrune(const bool*const& f_Stop)
{
// 1. do some checks and get mapper to all cuboids
if(!cuboidsLists) return; // check: nothing can be pruned if there are no cuboids
if(f_Pruned) return; // check: return if instance is already pruned
f_Pruned=true;
const TCuboid**const& mapper = cuboidsLists->GetMapper(consistent_mapper); // get mapper to all cuboids
state = s_Pruning; // set state
progress = 0;
// 2. for each cuboid: try to generalize left and/or right bounds
const int nCub = cuboidsLists->nCuboids(); // abbrevation
for(int i=0;i<nCub;i++)
{
if(f_Stop)
if(*f_Stop) // check stop flag and abort if it is set
return;
// a) progress and abbrevation
progress = i/((float)nCub); // current progress
TCuboid* cub = const_cast<TCuboid*>(mapper[i]);
const int nInp = data->nVar()-1; // # input variables
// b) make array with id's of tuples not(!) inside the cuboid (negative tuples)
bool* ins = new bool[data->nTup()]; // allocate
for(int i=0;i<data->nTup();i++) // initialize/set
ins[i]=true;
for(int t=0;t<cub->tuples->Size();t++) // reset if tuples is inside cuboid
ins[cub->tuples->Get(t)] = false;
int nNegId = data->nTup()-cub->tuples->Size(); // # tuples outside (# negative examples)
TTId* negId = new TTId[nNegId]; // allocate
int c=0; // copy id's of negative tuples
for(int i=0;i<data->nTup();i++)
if(ins[i])
{
negId[c].id = i;
negId[c++].mass = para.w_COD;
}
delete[] ins; // release
// c) allocate new cuboid (left and right) bounds and initialize them as completely generalized
float* l0 = new float[nInp]; // allocate
float* r0 = new float[nInp];
for(int j=0;j<nInp;j++) // ini (completely generalize)
{
l0[j] = data->Min()[j+1]-data->Range()[j+1]; // set left bound beyond minimum
r0[j] = data->Max()[j+1]+data->Range()[j+1]; // set right bound beyond maximum
}
TTId* negId0 = new TTId[nNegId]; // allocate
TTId* negId00 = new TTId[nNegId]; // allocate
int nNegId0;
while(nNegId>0)
{
// d) find most discriminative bound
bool left;
int varId;
float best=0; // # negative examples discriminated by best bound so far
// two runs, one for left and one for right bound
// first run for left bound
for(int j=0;j<nInp;j++) // left
{
if(l0[j]>=data->Min()[j+1]) // continue if left bound is already accepted
continue;
float weight = ddata->Weights()[j+1]; // variable weight
float count=0;
int nNegId00=0;
for(int t=0;t<nNegId;t++) // for all negative examples
{
negId00[nNegId00]=negId[t]; // copy
if(data->Row(negId[t].id)[j+1]<cub->Min(j)) // if example is discriminated by lower (left) bound
{
count+=weight; // ... increment counter
if(negId[t].mass>weight) //
nNegId00++; // keep
else
negId00[nNegId00].mass -= weight; // decrease mass
}
else
nNegId00++; // keep
}
// check if new maximum is found
if(count>best)
{
left = true; // set new best values
varId = j;
best = count;
TTId* tmp=negId0; // swap pointer
negId0=negId00;
negId00=tmp;
nNegId0 = nNegId00;
}
}
// 2nd run for right bound
for(int j=0;j<nInp;j++)
{
if(r0[j]<=data->Max()[j+1]) // continue if right bound is already accepted
continue;
float weight = ddata->Weights()[j+1]; // variable weight
float count=0;
int nNegId00=0;
for(int t=0;t<nNegId;t++) // for all negative examples
{
negId00[nNegId00]=negId[t]; // copy
if(data->Row(negId[t].id)[j+1]>cub->Max(j)) // if example is discriminated by upper (right) bound
{
count += weight; // ... increment counter
if(negId[t].mass>weight) //
nNegId00++; // keep
else
negId00[nNegId00].mass -= weight; // decrease mass
}
else
nNegId00++; // keep
}
// check if new maximum is found
if(count>best)
{
left = false; // set new best values
varId = j;
best = count;
TTId* tmp=negId0; // swap pointer
negId0=negId00;
negId00=tmp;
nNegId0 = nNegId00;
}
}
if(best==0)
break;
// re-generalize most discriminative bound
if(left)
l0[varId] = cub->Min(varId); // copy from cuboid
else
r0[varId] = cub->Max(varId); // copy from cuboid
// "copy" new id's of negative examples
TTId* tmp=negId; // swap pointer
negId=negId0;
negId0=tmp;
nNegId = nNegId0; // # negative tuples
}
cub->Replace(l0, r0);
delete[] negId00;
delete[] negId0;
delete[] negId; // release
}
}
#else
//----------------------------------------------------------------------------------------------------------------------
// post pruning of cuboids: component wise try to generalize left and/or right bounds or mask if this does not affect
// the cuboid's separation power, i.e. if no new negative examples are covered
void TPnc::PostPrune(const bool*const& f_Stop)
{
#ifdef PROFILE
clock_t time=clock();
#endif
// 1. do some checks and get mapper to all cuboids
if(!cuboidsLists) return; // check: nothing can be pruned if there are no cuboids
if(f_Pruned) return; // check: return if instance is already pruned
f_Pruned=true;
const TCuboid**const& mapper = cuboidsLists->GetMapper(consistent_mapper); // get mapper to all cuboids
state = s_Pruning; // set state
progress = 0;
// 2. for each cuboid: try to generalize left and/or right bounds
const int nCub = cuboidsLists->nCuboids(); // abbrevation
for(int i=0;i<nCub;i++)
{
if(f_Stop)
if(*f_Stop) // check stop flag and abort if it is set
return;
// a) progress and abbrevation
progress = i/((float)nCub); // current progress
TCuboid* cub = const_cast<TCuboid*>(mapper[i]);
// b) get variable id's sorted regarding cuboid's range and variable's weight
// note: symbolic flags, weights, range and symbol count are with(!) output
TIdVec* id = new TIdVec[data->nVar()-1];
for(int j=0;j<data->nVar()-1;j++) // over all input variables
{
if(NOMINAL_VARIABLES && data->IsSymbolic(j+1))
{ // symbolic (nominal): percentage of symbols covered
int cSet=0;
for(int g=0;g<data->nIntegerMaxMin(j+1);g++)
if(IsBitSet(cub->Mask(j), g)) // count how many symbols are covered
cSet++;
id[j].a = ((float) cSet)/data->nSymbolsFound(j+1);
}
else
// continuous or ordinal: normalized radian
id[j].a = (cub->Max(j)-cub->Min(j))*ddata->Weights()[j+1]*ddata->NormFac()[j+1];
id[j].b = ddata->Weights()[j+1]; // 2nd criterion: weight
id[j].id = j; // set corresponding index
}
qsort(id, data->nVar()-1, sizeof(id[0]), IdVecCmpDes); // sort descending
// c) try to generalize cuboid's variables in the order determined above in b)
for(int j=0;j<data->nVar()-1;j++)
{
const int varId = id[j].id; // variable to generalize
if(NOMINAL_VARIABLES && data->IsSymbolic(varId+1)) // variable types are with(!) output
// symbolic (nominal):
{
int mask = cub->Mask(varId); // preserve original mask
SetAllBits(cub->Mask(varId)); // manipulate cuboid to cover all symbols
// test if any new(!) negative example is covered due to generalization
for(int i=0;i<intervals->SizeOfCheckVec(cub->y_quant);i++)
// if negative example is inside generalized cuboid ...
if(HitTest(*cub, data->Row( intervals->CheckVec(cub->y_quant)[i]), data, para.w_COD, ddata->Weights()
, intervals->Order()))
{
// restore original mask
cub->Mask(varId)=mask;
// ... but also inside the original cuboid
if(HitTest(*cub, data->Row( intervals->CheckVec(cub->y_quant)[i]), data, para.w_COD, ddata->Weights()
, intervals->Order()))
// ... then keep generalization
SetAllBits(cub->Mask(varId)); // cover all symbols
else
break; // else the original mask is still valid
}
}
else
// continuous or ordinal:
{
// 1) left bound
float l = cub->Min(varId); // preserve original left bound
cub->Min(varId) = -MAXFLOAT; // manipulate left bound
// test if any new(!) negative example is covered due to (left) generalization
for(int i=0;i<intervals->SizeOfCheckVec(cub->y_quant);i++)
// if negative example is inside generalized cuboid ...
if(HitTest(*cub, data->Row( intervals->CheckVec(cub->y_quant)[i]), data, para.w_COD, ddata->Weights()
, intervals->Order()))
{
// restore original left bound
cub->Min(varId)=l;
// ... but also inside the original cuboid (note: it's not the real(!) orig. cub. but it will do so)
if(HitTest(*cub, data->Row( intervals->CheckVec(cub->y_quant)[i]), data, para.w_COD, ddata->Weights()
, intervals->Order()))
cub->Min(varId) = -MAXFLOAT; // ... then keep generalization by resetting left boudn again
else
break; // else the original bound is still valid, i.e. left bound was not generalized
}
// 2) right bound
float r = cub->Max(varId); // preserve original right bound
cub->Max(varId) = MAXFLOAT; // manipulate right bound
// test if any new(!) negative example is covered due to (right) generalization
for(int i=0;i<intervals->SizeOfCheckVec(cub->y_quant);i++)
// if negative example is inside generalized cuboid ...
if(HitTest(*cub, data->Row( intervals->CheckVec(cub->y_quant)[i]), data, para.w_COD, ddata->Weights()
, intervals->Order()))
{
// restore original right bound
cub->Max(varId)=r;
// ... but also inside the original cuboid ... (note: it's not the original cuboid but it will do so)
if(HitTest(*cub, data->Row( intervals->CheckVec(cub->y_quant)[i]), data, para.w_COD, ddata->Weights()
, intervals->Order()))
cub->Max(varId) = MAXFLOAT; // ... then keep generalization by resetting right bound again
else
break; // else the original right bound is still valid, i.e. right bound was not generalized
}
}
}
delete[] id; // release
}
#ifdef PROFILE
t_PostPrune=clock()-time;
#endif
}
#endif // endif FORWARD_PRUNING
//----------------------------------------------------------------------------------------------------------------------
// # cuboids in all cuboid lists - if <k> is specified count # cuboids with more than <k> tuples
int TPnc::nCluster(const int& k) const
{
const TCuboid** mapper = cuboidsLists->GetMapper(consistent_mapper); // get mapper
if(k==0)
return cuboidsLists->nCuboids();
else // if minimum # tuples per cuboid is given -> iterate and count
{
int n=0;
for(int i=0;i<cuboidsLists->nCuboids();i++) // for all cuboids
if(mapper[i]->m>k)
n++;
return n;
}
}