//    module project.h                                                        //
//                                                                            //
//    Encapsulates project settings. Abstract base class! GUI and batch       //
//    interface each derive their own project classes.                        //
//    See source or http://www.newty.de/pnc2/sdocu.html for more information. //
//                                                                            //
//    copyright (c) 2001-2003 by Lars Haendel                                 //
//    home: www.newty.de                                                      //
//                                                                            //
//    This program is free software; you can redistribute it and/or modify    //
//    it under the terms of the GNU General Public License as published by    //
//    the Free Software Foundation as version 2 of the License.               //
//                                                                            //
//    This program is distributed in the hope that it will be useful,         //
//    but WITHOUT ANY WARRANTY; without even the implied warranty of          //
//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the            //
//    GNU General Public License for more details.                            //
//                                                                            //
//    You should have received a copy of the GNU General Public License       //
//    along with this program; if not, write to the Free Software             //
//    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.               //
//                                                                            //

#ifndef PROJECT_H
#define PROJECT_H

#include <fstream>               // due to   ifstream ...

#include "stdlist.h"             //          simple single linked list
#include "ParaSet.h"             //          TParaSetList
#include "data.h"                //          TData
#include "task.h"                //          TTask
#include "SetResult.h"           //          nCriterion etc.
#include "fileutil.h"            //          GetPrefixedPath()
#include "pnc.h"                 //          MAX_INP_SYMBOLS, i.e. maximal # symbols for input(!) variables allowing
                                 //          it to be symbolic

#define MAX_N_VAR                (int)  1023          // maximum number of variables in data file

// test type names (string forms of enum TTestType, see StringToTestType()/TestTypeToString())
#define SZ_TYPE_REP              "Repetition"
#define SZ_TYPE_CV               "Cross-Validation"
#define SZ_TYPE_LOOCV            "Loocv"
#define SZ_TYPE_SPECIAL          "Special"

// section '[Basic]'
#define DEF_OUTCOL               (int)   0            // default output column in data matrix
#define DEF_RANDOMIZE            (bool)  true         // default for randomization flag (see f_Randomize)

#define SZ_DIRECTORY             "OutputDirectory"    // output directory
#define SZ_SIMULATION            "Simulation"         // project file key - purpose not visible in this header

// section '[Tuning]'
#define DEF_TUNE_TYPE            (TTestType) Rep      // note: must(!) be 'Rep' !! Otherwise ini of N_R_Tune
                                                      // in TProjectG::Initialize() could be wrong
#define DEF_SPLIT                (int)   80           // data splitting; can be used instead of key 'N_L' and 'N_T'
#define MAX_N_L_TUNE             (int) 1000           // max. # tuning learn data tuples
#define MIN_SPLIT                (int)   1            // lower bound for data splitting (in percent)
#define MAX_SPLIT                (int)   90           // upper bound for data splitting (in percent)

#define DEF_MAX_SIZE             (int)   10           // default max. model size (in percent)
#define MIN_MAX_SIZE             (int)   1            // lower bound for max. model size
#define MAX_MAX_SIZE             (int)   100          // upper bound for max. model size
#define DEF_SKIPPING             (bool)  true         // default: skip parameter sets that produce big models
#define DEF_MIN_COMPRESSION      (int)   50            // default min. compression (in percent)
#define MIN_MIN_COMPRESSION      (int)   10           // lower bound for min. compression
#define MAX_MIN_COMPRESSION      (int)   100          // upper bound for min. compression

#define SZ_MAX_SIZE              "MaxSize"            // project file keys for the tuning values above
#define SZ_MIN_COMPRESSION       "MinCompression"
#define SZ_SKIPPING              "Skipping"
#define DEF_SKIP_TUNING          (bool) false         // default for 'skip tuning' flag - presumably; verify in Load()

#define MAX_N_R_TUNE             (int)   1000          // hard coded maximal value, use Max_N_R_Tune to get value
                                                       // depending on tuning type and # data tuples
#define DEF_N_R_TUNE             (int)   10            // default # tuning repetitions/cross-validations
// utility functions

class TProject;   // forward declaration

// test type definition (enum) and conversion routines
enum TTestType{Rep, Cv, Loocv, Special};

// convert test type (enum) to character string (see SZ_TYPE_* above)
char* TestTypeToString(const int type);

// convert test type string to test type
TTestType StringToTestType(const char* szType);

// copy parameters from TParaSet (and project settings) to TParameter
TParameter ToTParameter(TParaSet& para, const TProject* prj);

// decide if model is too big (i.e. violates the project's tuning size constraint)
bool ModelToBig(TCluster* model, const TData*const& data, const TParameter& para, const TProject*const& prj);

// model size as # cuboids multiplied with average variables per cuboid
float ModelSize(const TCluster*const& cls, const TData*const& data, const bool& f_Prune);

// encapsulates settings for usage of PNC2 algorithm
// NOTE(review): no opening brace / access specifier is visible after the class head in
// this chunk - presumably lost in extraction; verify against the original file.
class TProject

   // constructor/destructor
   // destructor is virtual: TProject is an abstract base class (Peek_N_L() is pure
   // virtual below) - without a virtual destructor, deleting a derived GUI/batch
   // project through a TProject* would be undefined behavior
   virtual ~TProject() { tasks.Clear(); };

   // a) load/save/ini/reset and data association

   // load and save settings from/to file
   void Load(ifstream& file, const char*const& _szProjectFilePath);
   void Save(ofstream& file, const bool& f_WriteTuningAnyway=false);

   // check project parameters against constraints and given data file, set output column and variable types,
   // set dependent variables and associate data  -  WARNING: caller has to release returned TParaSetList !!
   TParaSetList* /*cr*/ Synchronize(TData*const& _data1, const bool& f_CheckTuningAnyway=false);

   // b) data variable types and classification flag

   // check if actual output column has (not too many and) integer values and thus the problem can be considered
   // as a classification problem
   bool CouldBeClassification(const int& oriId);

   // check if specified input(!) variable could be symbolic; note: the output type is ignored! Types are only used
   // for input variables!
   bool CouldBeSymbolic(const int& varId);

   // set output column in data object (output column will be moved to first)
   void WriteThroughOutCol(const bool& f_TestDataIsWithOutput);

   // return # classes of given problem if it is/would be considered as classification problem; note: will be zero if
   // output is continuous
   int nClasses();

   // c) dependent variables
   int nParaSets() const;                                                     // # parameter sets for tuning
   int nParaSetsLearn() const;                                                // # parameter sets that need learning

   // d) section [Basic]
   // get data file names with full path, will be empty if not specified
   const char* GetData1FileName() const { return GetPrefixedPath(szData1, szProjectFilePath); };
   const char* GetData2FileName() const { return GetPrefixedPath(szData2, szProjectFilePath); };   // test data
   const int& GetOutCol() const { return outcol; };                           // get output column

   void SetOutCol(int _outcol);                                               // set output column
   bool& Randomize () { return f_Randomize;  };                               // randomization flag
   int & Regression() { return f_Regression; };                               // regression flag (stored as int)

   const bool& Randomize () const { return f_Randomize;  };                   // const versions
   const int & Regression() const { return f_Regression; };

   const float& GetOverlapFac() const { return overlapFac; };                 // overlap factor
   const int& Get_N_G_Max() const { return N_G_Max; };                        // max. # tuples per adjacency matrix
   const int& Get_N_Bins() const { return N_Bins; };                          // # bins used to discretize cont. features

   void Set_N_G_Max(const int& _N_G_Max);
   void Set_N_Bins(const int& _N_Bins);
   void SetOverlapFac(const float& _fac);

   bool& NormalizeByRange() { return f_NormalizeByRange; };                   // normalization of continuous features
   bool& EqualWidthBinning() { return f_EqualWidthBinning; };                 // binning to discretize cont. features

   const bool& NormalizeByRange() const { return f_NormalizeByRange; };       // const versions
   const bool& EqualWidthBinning() const { return f_EqualWidthBinning; };

   // h) section [Tuning]

   // get and set project's tuning settings
   const bool       DoTuning()         const { return f_Tune; };        // tune parameters
   const int&       Get_N_R_Tune()     const { return N_R_Tune; };      // # repetitions/cross-validations for tuning
   const int&       GetMaxSize()       const { return maxSize; };       // maximal model size (tuning constraint)
   float            MaxModelSize()     const;                           // return maximal model size
   const int&       GetMinCompression()const { return minCompression; };// min. compression rate (in percent)
   const TTestType& GetTuneType()      const { return tuneType; };      // get tuning type (Repetition, Cross-Val.)
   const int&       Get_N_L_Tune()     const { return N_L_Tune; };      // # learn tuples
   const int&       Get_N_T_Tune()     const { return N_T_Tune; };      // # test tuples
   const int&       GetDataSplitting() const { return split; };         // data splitting; alternate way to specify #
                                                                        // learn and test tuples
   void SetMaxSize(int maxSize);
   void SetMinCompression(int minCmpr);
   void SetTuning(const bool& enable) { f_Tune = enable; };
   void SetDataSplitting(const int& _split);

   bool& Skipping() { return f_Skipping; };                             // skip parameter sets producing big models
   const bool& Skipping() const { return f_Skipping; };                 // const version

   int Min_N_R_Tune();                 // minimum tuning repetition/cross-validation
   int Max_N_R_Tune();                 // maximum tuning repetition/cross-validation
   void IniTuneCounts();               // initialize tune counts (N_L_Tune, N_T_Tune, N_R_Tune)
   virtual int Peek_N_L()=0;           // peek learn data file tuples count (pure virtual; implemented by the
                                       // derived GUI/batch project classes)

   // i) section [Task...]
   // note: a task contains strings with ';' separated values for the algorithm's parameters. All possible combinations
   // of parameters within the task are considered as possible parameter sets for tuning
   void AddTask(const TTask*const& task);                                     // add a task
   void DeleteAllTasks() { tasks.Clear(); tasks.Reset(); };                   // delete all tasks
   void DeleteTask(const int& index) { tasks.Pos(index); tasks.Del(); };      // delete specified task

   const int& nTasks() const { return tasks.Size(); };                        // get # tasks
   const TTask& GetTask(const int& index) const { return tasks.Get(index); }; // get specified task

   // wrapper to call ToParaSetList; returns list with parameter sets according to actually defined tasks
   TParaSetList* /*cr*/ ToParaSetList(){ return ::ToParaSetList(&tasks, data1, f_Regression); };

   // initialize parameters with default values or just reset them
   void Reset();   // note: extra qualification 'TProject::' removed - ill-formed inside the class definition

   // section [Basic]
   int  N_G_Max, N_Bins;       // 'global' parameters (max. # tuples per adjacency matrix and # bins used to discretize
                               // continuous feature for feature weight calculation (mutual information criterion))
   char szData1[STS], szData2[STS];          // data filenames (STS: buffer size constant defined elsewhere)
   int  outcol;                              // output column in data file
   char szVarTypes[MAX_N_VAR];               // variable types
   int f_Regression;                         // flag: regression task
   bool f_Randomize;                         // call randomize(), i.e. initialize random number generator
   float overlapFac;                         // overlap factor (see GetOverlapFac()/SetOverlapFac())
   bool f_NormalizeByRange;                  // normalize cont. features by range
   bool f_EqualWidthBinning;                 // equal width binning to discretize cont. features for weight calculation

   // section [Tuning]
   char szTuneType[STS];                     // tuning type as character string (see SZ_TYPE_* macros)
   int f_Tune;                               // flag: tune parameters
   int N_R_Tune;                             // # repetitions/cross-validations
   TTestType tuneType;                       // tuning type (Repetition, CV or Loocv)
   int maxSize;                              // maximum size (in percent of learn data size) of learned models
   int minCompression;                       // min. compression rate '# cuboids / # tuples' in percent
   int N_L_Tune, N_T_Tune;                   // # learn/test tuples
   int split;                                // data splitting (in %), used in GUI instead of N_L_Tune and N_T_Tune

   bool f_Skipping;                          // skip parameter sets that produce big models

   // misc.
   TTaskList tasks;                          // task list
   TData *data1, *data2;                     // associated data objects
   bool f_Checked;                           // project file was checked using Synchronize()
   char szProjectFilePath[STS];              // directory of project file