//    module pnc.h                                                            //
//                                                                            //
//    Contains the class TPnc and all other necessary sub-classes for         //
//    the PNC2 cluster algorithm.                                             //
//    See source or http://www.newty.de/pnc2/sdocu.html for more information. //
//                                                                            //
//    copyright (c) 1999-2003 by Lars Haendel                                 //
//    home: www.newty.de                                                      //
//                                                                            //
//    This program is free software; you can redistribute it and/or modify    //
//    it under the terms of the GNU General Public License as published by    //
//    the Free Software Foundation as version 2 of the License.               //
//                                                                            //
//    This program is distributed in the hope that it will be useful,         //
//    but WITHOUT ANY WARRANTY; without even the implied warranty of          //
//    GNU General Public License for more details.                            //
//                                                                            //
//    You should have received a copy of the GNU General Public License       //
//    along with this program; if not, write to the Free Software             //
//    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.               //
//                                                                            //

#ifndef _PNC_H
#define _PNC_H

#define PNC_EXTENSION         "pnc"
#define MAX_INP_SYMBOLS (int) 32    // max. # symbols (range of integer variable) for symbolic/nominal input variable
#define MAX_OUT_SYMBOLS (int) 1024  // max. # symbols (range of integer variable) for symbolic/nominal output variable

#include "data.h"          // due to   TData
#include "stdList.h"       //          TStdList - class template with simple but efficient single linked list
#include "util.h"          //          TSorter
#include "prediction.h"    //          TCluster

//    cuboid defined by lower left and upper right bounds in the input space

   typedef TStdList<int>   TTupleIdList;

class TCuboid

   // constructor/destructor
   TCuboid()   { l=r= NULL;  tuples=NULL; };
   ~TCuboid()  { delete[] l;  delete[] r;  delete tuples;  };

   void Initialize(const int& nInp);                                 // allocate memory
   void Assimilate() { l=r=NULL; tuples=NULL; };                     // defer reponsibility for allocated memory
   void Release()    { delete[] l;  delete[] r;  delete tuples; };   // release memory

   // access functions for lower left and upper right bound or mask for symbolic variables
   // note: symbolic variables are encoded as a bit mask which is stored in the float variable
   //       'l[j]'. The bit mask utilities need integer variables and thus you find the casts below
   inline const float&  Min (const int& j) const { return l[j]; };
   inline const float&  Max (const int& j) const { return r[j]; };
   inline const int&    Mask(const int& j) const { return *((int*) &(l[j])); };  // Yeah ;-)

   inline float&        Min (const int& j)       { return l[j]; };
   inline float&        Max (const int& j)       { return r[j]; };
   inline int&          Mask(const int& j)       { return *((int*) &(l[j])); };

   void Replace(float* _l, float* _r) { delete[] l; delete[] r; l=_l; r=_r;};

   TTupleIdList* tuples;   // list with id's of merged tuples, can be used to index in TData

   int      neg;           // # covered negative examples
   int      m;             // mass
   float    y_snap;        // snapped output value/class (snapped=quantized and de-quantize, i.e. value is in the
                           // middle of correspondig output interval)
   int      y_quant;       // quantized output value/class


   float*   l;             // lower left bound (or bit mask)
   float*   r;             // upper right bound

//    element of output interval list
typedef struct
   const float**  opI;        // -> first tuple of interval
   int            y_quant;    // quantized output value
   float          y_snap;     // (snapped=quantized and dequantized) output value
   int            nTup;       // # tuples in interval
   int            id;         // id of first tuple, can be used to index in TData
} TInterval;

//    provides output interval wise access to (learn) data tuples

   typedef TStdList<TInterval>   TIntervalList;

class TIntervals

   // constructor/destructor
   TIntervals(const TData*const& _data, const TParameter*const& para);

   int  NextInterval();                // return id of next output interval
   void Del(const int& opI);           // delete interval from list - note: this means merge(!) the opI-th interval
                                       // with the 'opI-1'-th one

   // test if cuboid passes merge test
   bool NegTest(TCuboid& cuboid, const int& exOpI, int const& maxNeg, const float& w_min, const float*const& weights) const;

   // # output intervals
   inline const int& Size() const { return list->Size(); };

   // # output intervals (regular)
   inline const int& nInt() const { return _nInt; };

   // # tuples in i-th interval
   inline const int& Size(const int& i) const { list->Pos(i); return list->Get().nTup; };

   // quantized output value of i-th interval
   inline const int& Y_Quant(const int& i) const { list->Pos(i); return list->Get().y_quant; };

   // snapped output value of i-th interval
   inline const float& Y_Snap(const int& i) const { list->Pos(i); return list->Get().y_snap; };

   // id of first tuples of i-th interval
   inline const int& Id(const int& i) const { list->Pos(i); return list->Get().id; };

   // -> first tuple of i-th interval
   inline const float**const& Get(const int& i) const;

   // # tuples in regular interval with output value <value>
   const int SizeReg(const int& value) const;

    // get # tuples in i-th check vector
   inline const int& SizeOfCheckVec (const int& i) const { return len_checkVec[i]; };

   // get i-th check vector
   inline const int*& CheckVec         (const int& i) const { return checkVec[i]; };
   inline const TSorter*const& Order() const { return order; };
   void Save(ofstream& file) const;                                                    // debug function


   TIntervalList* list;          // -> list

   int _nInt;  // # non-empty regular intervals

   // i-th check-vector keeps the id's of the potentially negative examples for the i-th regular(!) output interval
   int   nCheckVec;              // # checkVecs - used in destructor
   int** checkVec;               // -> check-vectors
   int*  len_checkVec;           // # id's in check-vectors

   // create check-vector: only tuples whose (quantized) output value differs more than <diffMax> from <y_quant> are regarded
   void CreateCheckVec(const int& y_quant, const TParameter*const& para);

   const TData* data;            // learn data
   TSorter*  order;              // variable id's in order of decreasing weight, used to optimize HitTest()
   int id4Bg2;                   // counter for NextInterval()

//    encapsulates list with cuboids
   typedef TStdList<TCuboid> TCuboidList;

class TCuboids

   // constructor/destructor

      // create empty cuboid list
      TCuboids(const TDFunc& _dfunc, const TDataData*const& _ddata, const bool& _f_UseWeights);

      // create trivial cuboids from (part of the) (learn) data
      TCuboids(const TIntervals*const& intervals, const TData*const& _data, const int& opI, const TDFunc& _dfunc,
               const TDataData*const& _ddata, const bool& _f_UseWeights);
      ~TCuboids() { delete list; };

   void     Del(const int& i)       { list->Pos(i); list->Del();        };    // delete i-th cuboid from list
   TCuboid& Get(const int& i) const { list->Pos(i); return list->Get(); };    // return reference to i-th cuboid in list

   // distance of i-th cuboid in list to reference cuboid <ref>
   float Distance(const int& i, const TCuboid*const& ref) const;

   // append cuboid to list
   void App(TCuboid& cub)              { list->Pos(Size()-1);  list->Ins(cub);      };

   // append cuboids of list 'a' and 'b'
   void App(TCuboids& a, TCuboids& b)  { list->Cat(a.list);    list->Cat(b.list);   };

   // delete min(i,j)-th cuboid and overwrite max(i,j)-th cuboid with <cuboid>
   void Merge(const int& i, const int& j, TCuboid& cuboid);

   inline const int& Size() const { return list->Size();    };    // return # cuboids in list
   inline const int& nInp() const { return ddata->nInp();   };    // return # input variables

   void Save(ofstream& file) const;    // debug function

   TCuboidList* list;                  // -> list with cuboids
   TDFunc dfunc;
   const TDataData* ddata;             // data data like weights, # variables etc.
   bool f_UseWeights;

//    'extended' cuboid list, i.e. encapsulates three cuboid lists for the cascade-mode
class TCuboidsEx

   // constructor/destructor
   TCuboidsEx(TCuboids*const& _a, TCuboids*const& _b, const TDFunc& dfunc, const TDataData*const& ddata
               , const bool& f_UseWeigths);

   TCuboids *a, *b, *m;    // cuboid lists: cuboids of 'a' and 'b' are merged and moved to 'm'

   // merge
   void Merge(TCuboids*const& cub1, int id1, TCuboids*const& cub2, const int& id2, TCuboid& cuboid);

   inline const int Size() const { return a->Size()+b->Size()+m->Size(); };   // # cuboids in all three lists

   // append cuboids of 'b' and 'm' to 'a' and return 'a' - note: once called 'reduces' object's behaviour
   TCuboids*& ToCuboids() { a->App(*b, *m); is_extended=false; return a; };

   bool is_extended;    // flag indicates that ToCuboids() has not been called yet

//    encapsulates list over cuboid lists

   typedef TStdList<TCuboids*> TCuboidsList;

class TCuboidsLists

   // constructor/destructor

   TCuboids*&        Get (const int& i) { list->Pos(i);  return list->Get();  };    // -> i-th cuboid list
   inline const int& Size(const int& i) { return Get(i)->Size();              };    // # cuboids in i-th cuboid list
   void              Del (const int& i) { list->Pos(i);  list->Del();         };    // delete i-th cuboid list

   // insert/append cuboid list at position <i> or at the end of the cuboids(!) list
   void Ins(const int& i, TCuboids*& cuboids)      { list->Pos(i); list->Ins(cuboids); };
   void App(TCuboids*& cuboids)                    { Ins(list->Size()-1, cuboids);     };

   inline const int& Size() const { return list->Size(); };    // # cuboids lists

   // # cuboids in all lists - beware: only consistent if MakeMapper() called after last change of lists
   inline const int& nCuboids() const { return _nCuboids;   };

   void                    MakeMapper();                                   // create mapper to all cuboids in all lists
   const TCuboid**const&   GetMapper(bool& consistent);                    // return mapper

   TCuboidsList*  list;          // -> list
   TCuboid**      mapper;        // -> mapper
   int            _nCuboids;     // # cuboids in all cuboids lists

//    adjazenz matrix which stores distances from all cuboids to eachother (lower triangular matrix)

   typedef TStdList<float>       TFloatList;
   typedef TStdList<TFloatList*> TFloatsList;

class TAdjMat

   // constructor/destructor
   TAdjMat(const TCuboids*const& _cuboids);

   float FindMin(int& i, int& j) const;                        // find pair with minimum distance
   inline const int& Size() const { return mat->Size(); };

   // function to edit matrix
   void App(const TCuboid*const& ref);                            // append row
   void Del(const int& rowId);                                    // delete row/column
   void Disconnect(const int& k, const int& l);                   // disconnect node
   void Recalc(const int& rowId, const TCuboid*const& ref);       // re-calculate row/column

   void Save(ofstream& file) const;                               // debug function


   TFloatsList*      mat;                    // -> adjazenz matrix
   const TCuboids*   cuboids;                // -> cuboid list

//    encapsulates three adjazenz matrices:  1. cuboids 'b' versus 'a'
//                                           2. cuboids 'm' versus 'a'+'b' and
//                                           3. cuboids 'm' with each other
class TAdjMatEx

   // adjazenz-array to represent all combinations between two cuboid lists (rectangular matrix)
   class TAdjArr

      // constructor/destructor
      TAdjArr(TCuboids*const& _a, TCuboids*const& _b);
      TAdjArr(TCuboids*const& _a, TCuboids*const& _b, TCuboids*const& _m);

      // fucntions to edit matrix
      void App();                                                          // append row
      void Recalc(const int& row, const TCuboid*const& ref);               // re-calculate row
      void DelRow(const int& row);                                         // delete row
      void DelCol(const int& col);                                         // delete column
      void Disconnect(const int& row, const int& col);                     // disconnect node

      float FindMin(int& id1, TCuboids*& cub2, int& id2);               // find pair with minimum distance

      // # rows/columns
      inline const int& nRows() const { return mat->Size(); };
      inline const int& nCols() const { return _nCols;         };

      void Save(ofstream& file) const;                // debug function


      int               _nCols;                       // # columns

      TFloatsList* mat;                               // -> adjazenz matrix
      TCuboids    *a, *b, *m;                         // -> cuboids lists - note: m maybe empty


   // constructor/destructor
   TAdjMatEx(TCuboidsEx*const& cubEx);

   // functions to modify adjazenz objects
   float FindMin   (TCuboids*& cub1, int& id1, TCuboids*& cub2, int& id2);
   void  Disconnect(const TCuboids*const& cub1, const int& id1, const TCuboids*const& cub2, const int& id2);
   void  Merge     (const TCuboids*const& cub1, const int& id1, const TCuboids*const& cub2, const int& id2
                     , const TCuboid*const& ref);

   void Save(ofstream& file) const { matM_M->Save(file); matA_B->Save(file); matM_AB->Save(file); };  // debug function

   TCuboids    *a, *b, *m;             // -> 3 cuboids lists
   TAdjArr     *matA_B, *matM_AB;      // -> adjazenz rectangular matrices (1. and 2.)
   TAdjMat     *matM_M;                // -> adjazenz matrix (3.)

//    class TPnc

   typedef TStdList<int>   TTupleIdList;

class TPnc

   // constructor/destructor
   TPnc(TData*const& _data, const TParameter& _para);

   // save model data to file (debug function)
   void Save(ofstream& file) const;

   // convert cuboids to TCluster
   TCluster* /*cr*/ ToTCluster(const bool& prune_always, const bool*const& f_Stop=NULL, const int& k=0);
   const TParameter* GetParameters() const { return &para; };

   // state and progress
   enum TPncState { s_Converting, s_Pass1, s_Pass2, s_Pruning } state;
   const char* GetStatusText();                                                        // get status text
   const float GetProgress(const bool& f_Prune=true)  const;                           // get progress in percent
   const float GetSizeRatio() const { return n_ges/((float) data->nTup()+1); };
   const bool& IsFinished()   const { return f_Finished; };

   bool Iterate();                     // do one iteration/clustering step, return true if finished

   TTupleIdList* Tuples(const int& i) const;
   void  SaveClusterTupleIds(ofstream& file) const;                        // write tuple id's for each cluster

   static inline const char* Extension() { return PNC_EXTENSION;   };      // extensions for datafiles
//   static inline const char* ClassName() { return szModuleVersion; };


   // post pruning of cuboids: component wise try to generalize left and/or right bounds if this does not affect the
   // cuboid's separation power, i.e. if no new negative examples are covered
   void PostPrune(const bool*const& f_Stop);

   int nCluster(const int& k=0) const;    // # cuboids in all cuboid lists

   TIntervals*    intervals;              // used to access (learn-)data output interval wise
   TCuboids*      cuboids;                // -> cuboids (actually processed)
   TCuboidsLists* cuboidsLists;           // -> list with cuboids lists, i.e. list with lists of cuboids

   TAdjMat*       adjMat;                 // -> adjazenz matrix
   TAdjMatEx*     adjMatEx;               // -> 'extended' adjazenz matrix (cascade-mode)
   TCuboidsEx*    cuboidsEx;              // -> 'extended' cuboid lists (cascade-mode)

   float u_Gamma;          // used for calculation of confidence intervals

   int   opI;              // actual processed output interval

   // progress indication
   float baseProgress, incProgress;       // progress in percent if one output interval/group is processed in first pass
   int   nChecked, nToCheck, nCascadeInt;

   bool  f_Cascade;        // flag: working in cascade mode
   bool  f_Next;           // flag: process next interval
   int   n_ges;            // actual # cuboids + unprocessed tuples

   bool f_Finished;
   bool f_Pruned;          // flag indicates that PostPrune() was called

   mutable bool consistent_mapper;

   float progress;

   TData*      data;             // -> (learn) data matrix
   TParameter  para;             // -> parameter object
   TDFunc      dfunc;            // metric
   TDataData*  ddata;            // data data like minima, maxima, weights etc.