//------------------------------------------------------------------------------
// module pnc.h //
// //
// Contains the class TPnc and all other necessary sub-classes for //
// the PNC2 cluster algorithm. //
// See source or http://www.newty.de/pnc2/sdocu.html for more information. //
// //
// copyright (c) 1999-2003 by Lars Haendel //
// home: www.newty.de //
// //
// This program is free software; you can redistribute it and/or modify //
// it under the terms of the GNU General Public License as published by //
// the Free Software Foundation as version 2 of the License. //
// //
// This program is distributed in the hope that it will be useful, //
// but WITHOUT ANY WARRANTY; without even the implied warranty of //
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //
// GNU General Public License for more details. //
// //
// You should have received a copy of the GNU General Public License //
// along with this program; if not, write to the Free Software //
// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. //
// //
//------------------------------------------------------------------------------
#ifndef _PNC_H
#define _PNC_H
//----------------------------------------------------------------------------------------------------------------------
#define PNC_EXTENSION "pnc"
#define MAX_INP_SYMBOLS (int) 32 // max. # symbols (range of integer variable) for symbolic/nominal input variable
#define MAX_OUT_SYMBOLS (int) 1024 // max. # symbols (range of integer variable) for symbolic/nominal output variable
#include "data.h" // due to TData
#include "stdList.h" // TStdList - class template with simple but efficient single linked list
#include "util.h" // TSorter
#include "prediction.h" // TCluster
//----------------------------------------------------------------------------------------------------------------------
// cuboid defined by lower left and upper right bounds in the input space
typedef TStdList<int> TTupleIdList;
class TCuboid
{
public:
// constructor/destructor
TCuboid() { l=r= NULL; tuples=NULL; };
~TCuboid() { delete[] l; delete[] r; delete tuples; };
void Initialize(const int& nInp); // allocate memory
void Assimilate() { l=r=NULL; tuples=NULL; }; // defer reponsibility for allocated memory
void Release() { delete[] l; delete[] r; delete tuples; }; // release memory
// access functions for lower left and upper right bound or mask for symbolic variables
// note: symbolic variables are encoded as a bit mask which is stored in the float variable
// 'l[j]'. The bit mask utilities need integer variables and thus you find the casts below
inline const float& Min (const int& j) const { return l[j]; };
inline const float& Max (const int& j) const { return r[j]; };
inline const int& Mask(const int& j) const { return *((int*) &(l[j])); }; // Yeah ;-)
inline float& Min (const int& j) { return l[j]; };
inline float& Max (const int& j) { return r[j]; };
inline int& Mask(const int& j) { return *((int*) &(l[j])); };
void Replace(float* _l, float* _r) { delete[] l; delete[] r; l=_l; r=_r;};
TTupleIdList* tuples; // list with id's of merged tuples, can be used to index in TData
int neg; // # covered negative examples
int m; // mass
float y_snap; // snapped output value/class (snapped=quantized and de-quantize, i.e. value is in the
// middle of correspondig output interval)
int y_quant; // quantized output value/class
private:
float* l; // lower left bound (or bit mask)
float* r; // upper right bound
};
//----------------------------------------------------------------------------------------------------------------------
// element of output interval list
typedef struct
{
const float** opI; // -> first tuple of interval
int y_quant; // quantized output value
float y_snap; // (snapped=quantized and dequantized) output value
int nTup; // # tuples in interval
int id; // id of first tuple, can be used to index in TData
} TInterval;
//----------------------------------------------------------------------------------------------------------------------
// provides output interval wise access to (learn) data tuples
//
typedef TStdList<TInterval> TIntervalList;
class TIntervals
{
public:
// constructor/destructor
TIntervals(const TData*const& _data, const TParameter*const& para);
~TIntervals();
int NextInterval(); // return id of next output interval
void Del(const int& opI); // delete interval from list - note: this means merge(!) the opI-th interval
// with the 'opI-1'-th one
// test if cuboid passes merge test
bool NegTest(TCuboid& cuboid, const int& exOpI, int const& maxNeg, const float& w_min, const float*const& weights) const;
// # output intervals
inline const int& Size() const { return list->Size(); };
// # output intervals (regular)
inline const int& nInt() const { return _nInt; };
// # tuples in i-th interval
inline const int& Size(const int& i) const { list->Pos(i); return list->Get().nTup; };
// quantized output value of i-th interval
inline const int& Y_Quant(const int& i) const { list->Pos(i); return list->Get().y_quant; };
// snapped output value of i-th interval
inline const float& Y_Snap(const int& i) const { list->Pos(i); return list->Get().y_snap; };
// id of first tuples of i-th interval
inline const int& Id(const int& i) const { list->Pos(i); return list->Get().id; };
// -> first tuple of i-th interval
inline const float**const& Get(const int& i) const;
// # tuples in regular interval with output value <value>
const int SizeReg(const int& value) const;
// get # tuples in i-th check vector
inline const int& SizeOfCheckVec (const int& i) const { return len_checkVec[i]; };
// get i-th check vector
inline const int*& CheckVec (const int& i) const { return checkVec[i]; };
inline const TSorter*const& Order() const { return order; };
void Save(ofstream& file) const; // debug function
private:
TIntervalList* list; // -> list
int _nInt; // # non-empty regular intervals
// i-th check-vector keeps the id's of the potentially negative examples for the i-th regular(!) output interval
int nCheckVec; // # checkVecs - used in destructor
int** checkVec; // -> check-vectors
int* len_checkVec; // # id's in check-vectors
// create check-vector: only tuples whose (quantized) output value differs more than <diffMax> from <y_quant> are regarded
void CreateCheckVec(const int& y_quant, const TParameter*const& para);
const TData* data; // learn data
TSorter* order; // variable id's in order of decreasing weight, used to optimize HitTest()
int id4Bg2; // counter for NextInterval()
};
//----------------------------------------------------------------------------------------------------------------------
// encapsulates list with cuboids
//
typedef TStdList<TCuboid> TCuboidList;
class TCuboids
{
public:
// constructor/destructor
// create empty cuboid list
TCuboids(const TDFunc& _dfunc, const TDataData*const& _ddata, const bool& _f_UseWeights);
// create trivial cuboids from (part of the) (learn) data
TCuboids(const TIntervals*const& intervals, const TData*const& _data, const int& opI, const TDFunc& _dfunc,
const TDataData*const& _ddata, const bool& _f_UseWeights);
~TCuboids() { delete list; };
void Del(const int& i) { list->Pos(i); list->Del(); }; // delete i-th cuboid from list
TCuboid& Get(const int& i) const { list->Pos(i); return list->Get(); }; // return reference to i-th cuboid in list
// distance of i-th cuboid in list to reference cuboid <ref>
float Distance(const int& i, const TCuboid*const& ref) const;
// append cuboid to list
void App(TCuboid& cub) { list->Pos(Size()-1); list->Ins(cub); };
// append cuboids of list 'a' and 'b'
void App(TCuboids& a, TCuboids& b) { list->Cat(a.list); list->Cat(b.list); };
// delete min(i,j)-th cuboid and overwrite max(i,j)-th cuboid with <cuboid>
void Merge(const int& i, const int& j, TCuboid& cuboid);
inline const int& Size() const { return list->Size(); }; // return # cuboids in list
inline const int& nInp() const { return ddata->nInp(); }; // return # input variables
void Save(ofstream& file) const; // debug function
private:
TCuboidList* list; // -> list with cuboids
TDFunc dfunc;
const TDataData* ddata; // data data like weights, # variables etc.
bool f_UseWeights;
};
//----------------------------------------------------------------------------------------------------------------------
// 'extended' cuboid list, i.e. encapsulates three cuboid lists for the cascade-mode
class TCuboidsEx
{
public:
// constructor/destructor
TCuboidsEx(TCuboids*const& _a, TCuboids*const& _b, const TDFunc& dfunc, const TDataData*const& ddata
, const bool& f_UseWeigths);
~TCuboidsEx();
TCuboids *a, *b, *m; // cuboid lists: cuboids of 'a' and 'b' are merged and moved to 'm'
// merge
void Merge(TCuboids*const& cub1, int id1, TCuboids*const& cub2, const int& id2, TCuboid& cuboid);
inline const int Size() const { return a->Size()+b->Size()+m->Size(); }; // # cuboids in all three lists
// append cuboids of 'b' and 'm' to 'a' and return 'a' - note: once called 'reduces' object's behaviour
TCuboids*& ToCuboids() { a->App(*b, *m); is_extended=false; return a; };
private:
bool is_extended; // flag indicates that ToCuboids() has not been called yet
};
//----------------------------------------------------------------------------------------------------------------------
// encapsulates list over cuboid lists
//
typedef TStdList<TCuboids*> TCuboidsList;
class TCuboidsLists
{
public:
// constructor/destructor
TCuboidsLists();
~TCuboidsLists();
TCuboids*& Get (const int& i) { list->Pos(i); return list->Get(); }; // -> i-th cuboid list
inline const int& Size(const int& i) { return Get(i)->Size(); }; // # cuboids in i-th cuboid list
void Del (const int& i) { list->Pos(i); list->Del(); }; // delete i-th cuboid list
// insert/append cuboid list at position <i> or at the end of the cuboids(!) list
void Ins(const int& i, TCuboids*& cuboids) { list->Pos(i); list->Ins(cuboids); };
void App(TCuboids*& cuboids) { Ins(list->Size()-1, cuboids); };
inline const int& Size() const { return list->Size(); }; // # cuboids lists
// # cuboids in all lists - beware: only consistent if MakeMapper() called after last change of lists
inline const int& nCuboids() const { return _nCuboids; };
void MakeMapper(); // create mapper to all cuboids in all lists
const TCuboid**const& GetMapper(bool& consistent); // return mapper
private:
TCuboidsList* list; // -> list
TCuboid** mapper; // -> mapper
int _nCuboids; // # cuboids in all cuboids lists
};
//----------------------------------------------------------------------------------------------------------------------
// adjazenz matrix which stores distances from all cuboids to eachother (lower triangular matrix)
typedef TStdList<float> TFloatList;
typedef TStdList<TFloatList*> TFloatsList;
class TAdjMat
{
public:
// constructor/destructor
TAdjMat(const TCuboids*const& _cuboids);
~TAdjMat();
float FindMin(int& i, int& j) const; // find pair with minimum distance
inline const int& Size() const { return mat->Size(); };
// function to edit matrix
void App(const TCuboid*const& ref); // append row
void Del(const int& rowId); // delete row/column
void Disconnect(const int& k, const int& l); // disconnect node
void Recalc(const int& rowId, const TCuboid*const& ref); // re-calculate row/column
void Save(ofstream& file) const; // debug function
private:
TFloatsList* mat; // -> adjazenz matrix
const TCuboids* cuboids; // -> cuboid list
};
//----------------------------------------------------------------------------------------------------------------------
// encapsulates three adjazenz matrices: 1. cuboids 'b' versus 'a'
// 2. cuboids 'm' versus 'a'+'b' and
// 3. cuboids 'm' with each other
class TAdjMatEx
{
//-------------------------------------------------------------------------------------------------------------------
// adjazenz-array to represent all combinations between two cuboid lists (rectangular matrix)
class TAdjArr
{
public:
// constructor/destructor
TAdjArr(TCuboids*const& _a, TCuboids*const& _b);
TAdjArr(TCuboids*const& _a, TCuboids*const& _b, TCuboids*const& _m);
~TAdjArr();
// fucntions to edit matrix
void App(); // append row
void Recalc(const int& row, const TCuboid*const& ref); // re-calculate row
void DelRow(const int& row); // delete row
void DelCol(const int& col); // delete column
void Disconnect(const int& row, const int& col); // disconnect node
float FindMin(int& id1, TCuboids*& cub2, int& id2); // find pair with minimum distance
// # rows/columns
inline const int& nRows() const { return mat->Size(); };
inline const int& nCols() const { return _nCols; };
void Save(ofstream& file) const; // debug function
private:
int _nCols; // # columns
TFloatsList* mat; // -> adjazenz matrix
TCuboids *a, *b, *m; // -> cuboids lists - note: m maybe empty
};
public:
// constructor/destructor
TAdjMatEx(TCuboidsEx*const& cubEx);
~TAdjMatEx();
// functions to modify adjazenz objects
float FindMin (TCuboids*& cub1, int& id1, TCuboids*& cub2, int& id2);
void Disconnect(const TCuboids*const& cub1, const int& id1, const TCuboids*const& cub2, const int& id2);
void Merge (const TCuboids*const& cub1, const int& id1, const TCuboids*const& cub2, const int& id2
, const TCuboid*const& ref);
void Save(ofstream& file) const { matM_M->Save(file); matA_B->Save(file); matM_AB->Save(file); }; // debug function
private:
TCuboids *a, *b, *m; // -> 3 cuboids lists
TAdjArr *matA_B, *matM_AB; // -> adjazenz rectangular matrices (1. and 2.)
TAdjMat *matM_M; // -> adjazenz matrix (3.)
};
//----------------------------------------------------------------------------------------------------------------------
// class TPnc
typedef TStdList<int> TTupleIdList;
class TPnc
{
public:
// constructor/destructor
TPnc(TData*const& _data, const TParameter& _para);
~TPnc();
// save model data to file (debug function)
void Save(ofstream& file) const;
// convert cuboids to TCluster
TCluster* /*cr*/ ToTCluster(const bool& prune_always, const bool*const& f_Stop=NULL, const int& k=0);
const TParameter* GetParameters() const { return ¶ };
// state and progress
enum TPncState { s_Converting, s_Pass1, s_Pass2, s_Pruning } state;
const char* GetStatusText(); // get status text
const float GetProgress(const bool& f_Prune=true) const; // get progress in percent
const float GetSizeRatio() const { return n_ges/((float) data->nTup()+1); };
const bool& IsFinished() const { return f_Finished; };
bool Iterate(); // do one iteration/clustering step, return true if finished
TTupleIdList* Tuples(const int& i) const;
void SaveClusterTupleIds(ofstream& file) const; // write tuple id's for each cluster
static inline const char* Extension() { return PNC_EXTENSION; }; // extensions for datafiles
// static inline const char* ClassName() { return szModuleVersion; };
private:
// post pruning of cuboids: component wise try to generalize left and/or right bounds if this does not affect the
// cuboid's separation power, i.e. if no new negative examples are covered
void PostPrune(const bool*const& f_Stop);
int nCluster(const int& k=0) const; // # cuboids in all cuboid lists
TIntervals* intervals; // used to access (learn-)data output interval wise
TCuboids* cuboids; // -> cuboids (actually processed)
TCuboidsLists* cuboidsLists; // -> list with cuboids lists, i.e. list with lists of cuboids
TAdjMat* adjMat; // -> adjazenz matrix
TAdjMatEx* adjMatEx; // -> 'extended' adjazenz matrix (cascade-mode)
TCuboidsEx* cuboidsEx; // -> 'extended' cuboid lists (cascade-mode)
float u_Gamma; // used for calculation of confidence intervals
int opI; // actual processed output interval
// progress indication
float baseProgress, incProgress; // progress in percent if one output interval/group is processed in first pass
int nChecked, nToCheck, nCascadeInt;
bool f_Cascade; // flag: working in cascade mode
bool f_Next; // flag: process next interval
int n_ges; // actual # cuboids + unprocessed tuples
bool f_Finished;
bool f_Pruned; // flag indicates that PostPrune() was called
mutable bool consistent_mapper;
float progress;
TData* data; // -> (learn) data matrix
TParameter para; // -> parameter object
TDFunc dfunc; // metric
TDataData* ddata; // data data like minima, maxima, weights etc.
};
#endif