//------------------------------------------------------------------------------
// module project.h //
// //
// Encapsulates project settings. Abstract base class! GUI and batch //
// interface each derive their own project classes. //
// See source or http://www.newty.de/pnc2/sdocu.html for more information. //
// //
// copyright (c) 2001-2003 by Lars Haendel //
// home: www.newty.de //
// //
// This program is free software; you can redistribute it and/or modify //
// it under the terms of the GNU General Public License as published by //
// the Free Software Foundation as version 2 of the License. //
// //
// This program is distributed in the hope that it will be useful, //
// but WITHOUT ANY WARRANTY; without even the implied warranty of //
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //
// GNU General Public License for more details. //
// //
// You should have received a copy of the GNU General Public License //
// along with this program; if not, write to the Free Software //
// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. //
// //
//------------------------------------------------------------------------------
#ifndef PROJECT_H
#define PROJECT_H
#include <fstream> // due to ifstream ...
#include "stdlist.h" // simple single linked list
#include "ParaSet.h" // TParaSetList
#include "data.h" // TData
#include "task.h" // TTask
#include "SetResult.h" // nCriterion etc.
#include "fileutil.h" // PrefixPath()
#include "pnc.h" // MAX_INP_SYMBOLS, i.e. maximal # symbols for input(!) variables allowing
// it to be symbolic
//----------------------------------------------------------------------------------------------------------------------
#define MAX_N_VAR ((int) 1023) // upper bound for the number of variables in a data file

// names of the test types
#define SZ_TYPE_REP "Repetition"
#define SZ_TYPE_CV "Cross-Validation"
#define SZ_TYPE_LOOCV "Loocv"
#define SZ_TYPE_SPECIAL "Special"

// section '[Basic]': defaults and key names
#define DEF_OUTCOL ((int) 0) // output column in the data matrix used by default
#define DEF_RANDOMIZE ((bool) true)
#define SZ_DIRECTORY "OutputDirectory" // key: output directory
#define SZ_SIMULATION "Simulation"

// section '[Tuning]': defaults, limits and key names
#define DEF_TUNE_TYPE ((TTestType) Rep) // note: has to stay 'Rep'! Otherwise the initialization of N_R_Tune
// in TProjectG::Initialize() could be wrong
#define DEF_SPLIT ((int) 80) // data splitting; may be given instead of the keys 'N_L' and 'N_T'
#define MAX_N_L_TUNE ((int) 1000) // max. # learn data tuples for tuning
#define MIN_SPLIT ((int) 1)
#define MAX_SPLIT ((int) 90)
#define DEF_MAX_SIZE ((int) 10) // max. model size, in percent
#define MIN_MAX_SIZE ((int) 1) // lower limit for the max. model size
#define MAX_MAX_SIZE ((int) 100) // upper limit for the max. model size
#define DEF_SKIPPING ((bool) true)
#define DEF_MIN_COMPRESSION ((int) 50) // min. compression, in percent
#define MIN_MIN_COMPRESSION ((int) 10)
#define MAX_MIN_COMPRESSION ((int) 100)
#define SZ_MAX_SIZE "MaxSize"
#define SZ_MIN_COMPRESSION "MinCompression"
#define SZ_SKIPPING "Skipping"
#define DEF_SKIP_TUNING ((bool) false)
#define MAX_N_R_TUNE ((int) 1000) // hard coded upper limit; call Max_N_R_Tune to obtain the value that
// depends on tuning type and # data tuples
#define DEF_N_R_TUNE ((int) 10)
//----------------------------------------------------------------------------------------------------------------------
// utility functions
class TProject; // forward declaration, the class itself is defined further down

// kind of test used for tuning; the enumerators can be converted from/to strings with the routines below
enum TTestType{Rep, Cv, Loocv, Special};

// test type (enum value) -> character string
char* TestTypeToString(const int type);

// test type character string -> test type (enum value)
TTestType StringToTestType(const char* szType);

// build a TParameter from the given TParaSet (and the project's settings)
TParameter ToTParameter(TParaSet& para, const TProject* prj);

// decide whether the model is too big
bool ModelToBig(TCluster* model, const TData*const& data, const TParameter& para, const TProject*const& prj);

// model size, i.e. # cuboids multiplied with the average # variables per cuboid
float ModelSize(const TCluster*const& cls, const TData*const& data, const bool& f_Prune);
//----------------------------------------------------------------------------------------------------------------------
// Encapsulates the settings for usage of the PNC2 algorithm. Abstract base class: Peek_N_L() is pure virtual and has
// to be implemented by the derived project classes.
class TProject
{
public:
   // constructor/destructor
   TProject();
   // virtual, because the class is a polymorphic base: deleting a derived project through a TProject* must also run
   // the derived destructor. Clears the task list.
   virtual ~TProject() { tasks.Clear(); }

   //-------------------------------------------------------------------------------------------------------------------
   // a) load/save/ini/reset and data association

   // load and save settings from/to file
   void Load(std::ifstream& file, const char*const& _szProjectFilePath);
   void Save(std::ofstream& file, const bool& f_WriteTuningAnyway=false);

   // check project parameters against constraints and given data file, set output column and variable types,
   // set dependent variables and associate data - WARNING: caller has to release returned TParaSetList !!
   TParaSetList* /*cr*/ Synchronize(TData*const& _data1, const bool& f_CheckTuningAnyway=false);

   //-------------------------------------------------------------------------------------------------------------------
   // b) data variable types and classification flag

   // check if actual output column has (not too many and) integer values and thus problem can be considered as
   // classification problem
   bool CouldBeClassification(const int& oriId);

   // check if specified input(!) variable could be symbolic; note: the output type is ignored! Types are only used
   // for input variables!
   bool CouldBeSymbolic(const int& varId);

   // set output column in data object (output column will be moved to first)
   void WriteThroughOutCol(const bool& f_TestDataIsWithOutput);

   // return # classes of given problem if it is/would be considered as classification problem; note: will be zero if
   // output is continuous
   int nClasses();

   //-------------------------------------------------------------------------------------------------------------------
   // c) dependent variables
   int nParaSets() const;                                   // # parameter sets for tuning
   int nParaSetsLearn() const;                              // # parameter sets that need learning

   //-------------------------------------------------------------------------------------------------------------------
   // d) section [Basic]

   // get data file names with full path, will be empty if not specified
   const char* GetData1FileName() const { return GetPrefixedPath(szData1, szProjectFilePath); }
   const char* GetData2FileName() const { return GetPrefixedPath(szData2, szProjectFilePath); }  // test data

   const int& GetOutCol() const { return outcol; }          // get output column
   void SetOutCol(int _outcol);                             // set output column

   // read/write access to the flags through the returned references
   bool& Randomize () { return f_Randomize; }               // randomization flag
   int & Regression() { return f_Regression; }              // regression flag
   const bool& Randomize () const { return f_Randomize; }   // const versions
   const int & Regression() const { return f_Regression; }

   // 'global' parameters: overlap factor, max. # tuples per adjacency matrix and # bins
   const float& GetOverlapFac() const { return overlapFac; }
   const int& Get_N_G_Max() const { return N_G_Max; }
   const int& Get_N_Bins() const { return N_Bins; }
   void Set_N_G_Max(const int& _N_G_Max);
   void Set_N_Bins(const int& _N_Bins);
   void SetOverlapFac(const float& _fac);

   bool& NormalizeByRange() { return f_NormalizeByRange; }               // normalization of continuous features
   bool& EqualWidthBinning() { return f_EqualWidthBinning; }             // binning to discretize cont. features
   const bool& NormalizeByRange() const { return f_NormalizeByRange; }   // const versions
   const bool& EqualWidthBinning() const { return f_EqualWidthBinning; }

   //-------------------------------------------------------------------------------------------------------------------
   // h) section [Tuning]

   // get and set project's tuning settings
   const bool DoTuning() const { return f_Tune; }               // tune parameters
   const int& Get_N_R_Tune() const { return N_R_Tune; }         // # repetitions/cross-validations for tuning
   const int& GetMaxSize() const { return maxSize; }            // maximal model size (tuning constraint)
   float MaxModelSize() const;                                  // return maximal model size
   const int& GetMinCompression()const { return minCompression; }
   const TTestType& GetTuneType() const { return tuneType; }    // get tuning type (Repetition, Cross-Val.)
   const int& Get_N_L_Tune() const { return N_L_Tune; }         // # learn tuples
   const int& Get_N_T_Tune() const { return N_T_Tune; }         // # test tuples
   const int& GetDataSplitting() const { return split; }        // data splitting; alternate way to specify #
                                                                // learn and test tuples
   void SetMaxSize(int maxSize);
   void SetMinCompression(int minCmpr);
   void SetTuning(const bool& enable) { f_Tune = enable; }
   void SetDataSplitting(const int& _split);
   bool& Skipping() { return f_Skipping; }
   const bool& Skipping() const { return f_Skipping; }

   int Min_N_R_Tune();        // minimum tuning repetition/cross-validation
   int Max_N_R_Tune();        // maximum tuning repetition/cross-validation
   void IniTuneCounts();      // initialize tune counts (N_L_Tune, N_R_Tune and presumably N_T_Tune; the original
                              // comment listed N_R_Tune twice)
   virtual int Peek_N_L()=0;  // peek learn data file tuples count; to be implemented by the derived class

   //-------------------------------------------------------------------------------------------------------------------
   // i) section [Task...]
   // note: a task contains strings with ';' separated values for the algorithm's parameters. All possible combinations
   // of parameters within the task are considered as possible parameter sets for tuning
   void AddTask(const TTask*const& task);                                    // add a task
   void DeleteAllTasks() { tasks.Clear(); tasks.Reset(); }                   // delete all tasks
   void DeleteTask(const int& index) { tasks.Pos(index); tasks.Del(); }      // delete specified task

   // get # tasks
   // NOTE(review): returns a reference to whatever tasks.Size() yields; if Size() returns by value this reference
   // dangles - confirm against the list implementation in stdlist.h
   const int& nTasks() const { return tasks.Size(); }
   const TTask& GetTask(const int& index) const { return tasks.Get(index); } // get specified task

   // wrapper to call the global ToParaSetList; returns list with parameter sets according to actually defined tasks
   TParaSetList* /*cr*/ ToParaSetList(){ return ::ToParaSetList(&tasks, data1, f_Regression); }

protected:
   // initialize parameters with default values or just reset them
   // (declared without the 'TProject::' qualifier: a qualified name is ill-formed for an in-class declaration)
   void Reset();

   //-------------------------------------------------------------------------------------------------------------------
   // section [Basic]
   int N_G_Max, N_Bins;             // 'global' parameters (max. # tuples per adjacency matrix and # bins used to
                                    // discretize continuous feature for feature weight calculation (mutual
                                    // information criterion))
   char szData1[STS], szData2[STS]; // data filenames
   int outcol;                      // output column in data file
   char szVarTypes[MAX_N_VAR];      // variable types
   int f_Regression;                // flag: regression task
   bool f_Randomize;                // call randomize(), i.e. initialize random number generator
   float overlapFac;
   bool f_NormalizeByRange;         // normalize cont. features by range
   bool f_EqualWidthBinning;        // equal width binning to discretize cont. features for weight calculation

   //-------------------------------------------------------------------------------------------------------------------
   // section [Tuning]
   char szTuneType[STS];
   int f_Tune;                      // flag: tune parameters
   int N_R_Tune;                    // # repetitions/cross-validations
   TTestType tuneType;              // tuning type (Repetition, CV or Loocv)
   int maxSize;                     // maximum size (in percent of learn data size) of learned models
   int minCompression;              // min. compression rate '# cuboids / # tuples' in percent
   int N_L_Tune, N_T_Tune;          // # learn/test tuples
   int split;                       // data splitting (in %), used in GUI instead of N_L_Tune and N_T_Tune
   bool f_Skipping;                 // skip parameter sets that produce big models

   //-------------------------------------------------------------------------------------------------------------------
   // misc.
   TTaskList tasks;                 // task list
   TData *data1, *data2;            // associated data objects
   bool f_Checked;                  // project file was checked using Synchronize()
   char szProjectFilePath[STS];     // directory of project file
};
#endif