static const char* szModule = "project.cpp";
//------------------------------------------------------------------------------
// module project.cpp //
// //
// Encapsulates project settings. Abstract base class! GUI and batch //
// interface each derive their own project classes. //
// See below or http://www.newty.de/pnc2/sdocu.html for more information. //
// //
// copyright (c) 2001-2003 by Lars Haendel //
// home: www.newty.de //
// //
// This program is free software and can be used under the terms of the //
// GNU licence. See header file for further information and disclaimer. //
// //
//------------------------------------------------------------------------------
// //
// CREATE: Create an empty instance and call Load(). //
// //
// NOTE: The tasks for optimization (i.e. different parameter sets //
// that are tried during the parameter tuning) are kept in a list //
// of TTask objects. TTask can be seen as a struct with load/save //
// functionality and stores the different parameters as strings as //
// it is written in section [TaskX] in the project file. Call //
// ToParaSetList() to convert all the tasks from the list to a list //
// of TParaSet objects. //
// //
// NOTE: All paths in TProject are relative to the project file. //
// Path names are scanned for correct path delimiter; each wrong //
// delimiter is corrected automatically. //
// //
// NOTE: CommentChar() is just returning a reference to global //
// ComChar. Setting is not saved within the project! //
// //
// File I/O: Load and Save routines //
//------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------------------------------------
// note on filename: Filenames can be given either complete with their drive and path or relative to the directory of
// the project file. Filenames are stored as they were read. Path is added (if necessary) when you
// access a filename using the appropriate functions.
//----------------------------------------------------------------------------------------------------------------------
#include <stdio> // due to: sprintf()
#include <iomanip> // setw()
#include "project.h"
#include "defines.h" // definitions like VALIDATION1
#include "para.h" // parameter default values
#include "fileutil.h" // file io utilities
#include "exception.h" // IfTrueThrowTypeU()
//----------------------------------------------------------------------------------------------------------------------
// Assemble a TParameter from one parameter set and the project-wide settings.
//    para : parameter set supplying the per-set values
//    prj  : project supplying the global values (group size, bins, overlap factor and mode flags)
// Returns the filled TParameter by value.
TParameter ToTParameter(TParaSet& para, const TProject* prj)
{
   TParameter result;

   // values that are global to the project
   result.f_Regression        = prj->Regression();
   result.f_EqualWidthBinning = prj->EqualWidthBinning();
   result.f_NormalizeByRange  = prj->NormalizeByRange();
   result.OverlapFac          = prj->GetOverlapFac();
   result.N_Bins              = prj->Get_N_Bins();
   result.N_G_Max             = prj->Get_N_G_Max();

   // values taken from the parameter set
   result.Metric   = para.Metric;
   result.Weights  = para.Weights;
   result.Prune    = para.Prune;
   result.p_min    = para.p_min;
   result.Sigma    = para.Sigma;
   result.W_Kernel = para.W_Kernel;
   result.Eta      = para.Eta;
   result.w_COD    = para.w_COD;
   result.N_Int    = para.N_Int;

#ifndef RELEASE                                 // obsolete in release versions
   result.Noise        = para.Noise;
   result.DifMax       = para.DifMax;
   result.W_Kernel_Min = para.W_Kernel_Min;
#endif

   return result;
}
//----------------------------------------------------------------------------------------------------------------------
// Model size in units - the result is meant to be compared against TProject::MaxModelSize().
//    cls     : model whose reduced cuboid count and average variable count per cuboid are queried
//    data    : data object supplying the number of variables
//    f_Prune : forwarded unchanged to TCluster::AvrVarPerCub()
// Returns nCuboidsRed * AvrVarPerCub / (nVar - 1).
// NOTE(review): presumably the '-1' excludes the output variable - confirm. A data object with exactly one
//               variable makes the divisor zero; nothing here guards against that.
float ModelSize(const TCluster*const& cls, const TData*const& data, const bool& f_Prune)
{
return cls->nCuboidsRed()*cls->AvrVarPerCub(f_Prune)/(data->nVar()-1);
}
//----------------------------------------------------------------------------------------------------------------------
// Decide if a model is too big with respect to the tuning constraints of the project.
//    model : model to judge; note that it is prepared (TCluster::Prepare()) as a side effect
//    data  : data object the model is related to (supplies # variables and # tuples)
//    para  : parameters passed to Prepare(); para.Prune is forwarded to ModelSize()
//    prj   : project supplying the limits MaxModelSize() and GetMinCompression()
// Returns true if the model size exceeds the maximum size OR if the ratio '# cuboids / # tuples' exceeds the
// minimum compression (which is given in percent).
bool ModelToBig(TCluster* model, const TData*const& data, const TParameter& para, const TProject*const& prj)
{
   // prepare model, i.e. determine average bounds ...
   // (the address-of expression had been mangled into a pilcrow character by an HTML entity decoder)
   model->Prepare(&para);

   const bool f_ToBig   = ModelSize(model, data, para.Prune) > prj->MaxModelSize();                      // size too big
   const bool f_LowCmpr = model->nCuboids()/((float)data->nTup()) > prj->GetMinCompression()/100.0;      // compression too low
   return (f_ToBig || f_LowCmpr);
}
//----------------------------------------------------------------------------------------------------------------------
// test type conversion routines
// convert test type (enum) to character string
char* TestTypeToString(const int type)
{
switch(type)
{
case Rep : return SZ_TYPE_REP; // repetition
case Cv : return SZ_TYPE_CV; // cross-validation
case Loocv : return SZ_TYPE_LOOCV; ; // leave one out cross-validation
case Special : return SZ_TYPE_SPECIAL; // special mode: use all tuples to tune parameters once
default : return "Error: Tuning type unknown!";
}
}
// Map a test type string to the test type. The comparison is exact (strcmp); an empty or unknown string yields
// the default tuning type DEF_TUNE_TYPE.
TTestType StringToTestType(const char* szType)
{
   if(!strcmp(szType, SZ_TYPE_REP))     return Rep;
   if(!strcmp(szType, SZ_TYPE_CV))      return Cv;
   if(!strcmp(szType, SZ_TYPE_LOOCV))   return Loocv;
   if(!strcmp(szType, SZ_TYPE_SPECIAL)) return Special;

   return DEF_TUNE_TYPE;                                 // nothing matched: fall back to the default
}
//----------------------------------------------------------------------------------------------------------------------
// Maximal model size in units: half the number of tuning learn tuples, scaled by 'maxSize' which is given in
// percent. Note: use it together with function ModelSize().
float TProject::MaxModelSize() const
{
   const double halfLearnTuples = 0.5*N_L_Tune;
   return halfLearnTuples*maxSize/100.0;
}
//----------------------------------------------------------------------------------------------------------------------
// Constructor: creates an empty project. Names the task list and puts all settings into their reset state.
TProject::TProject()
{
   tasks.SetName("TProject task list");                  // the list name is there for debug reasons
   this->Reset();
}
//----------------------------------------------------------------------------------------------------------------------
// initialize parameters with default values or just rest them
void TProject::Reset()
{
// a) clear task list, this causes section '[TaskX]' to be somehow non-existing
DeleteAllTasks();
// b) section 'Basic'
N_Bins = DEF_N_BINS;
N_G_Max = DEF_N_G_MAX;
szData1[0] = szData2[0] = '\0';
outcol = DEF_OUTCOL;
// note: do not initialize f_Regression as this will be done based on the data
f_Randomize = DEF_RANDOMIZE;
overlapFac = DEF_OVERLAP_FAC;
f_EqualWidthBinning = DEF_EQUAL_WIDTH_BINNING;
f_NormalizeByRange = DEF_NORMALIZE_BY_RANGE;
// c) section 'Tuning'
szTuneType[0] = '\0';
f_Tune = !DEF_SKIP_TUNING; // flag: tune parameters
N_R_Tune = DEF_N_R_TUNE;
tuneType = DEF_TUNE_TYPE;
maxSize = DEF_MAX_SIZE;
minCompression = DEF_MIN_COMPRESSION;
N_L_Tune = N_T_Tune = split = 0; // reset
f_Skipping = DEF_SKIPPING;
// d) misc.
data1 = data2 = NULL;
f_Checked = false;
szProjectFilePath[0] = '\0';
}
//----------------------------------------------------------------------------------------------------------------------
// Load the project settings from an already opened project file.
//    file               : stream of the project file; its position at entry is remembered and every section
//                         search starts from there. The position is restored before a successful return.
//    _szProjectFilePath : path of the project file; it is copied to szProjectFilePath
// Reads the sections '[Basic]', '[Tuning]' and all consecutively numbered sections '[Task1]', '[Task2]', ...
// The values are only read here, they are checked later in Synchronize().
// Errors are signalled inside as int codes and leave this function through ThrowTypeU() with a composed text.
void TProject::Load(ifstream& file, const char*const& _szProjectFilePath)
{
DeleteAllTasks(); // clear task list
// NOTE(review): strcpy() does not check the size of szProjectFilePath - confirm that callers never pass a
//               path longer than the buffer
strcpy(szProjectFilePath, _szProjectFilePath);
// read all sections of project file
char szSection[STS]; // name of section currently checked, also used for the error text below
try
{
streampos curPos = file.tellg(); // preserve actual stream position
//----------------------------------------------------------------------------------------------------------------
// a) read section '[Basic]'
strcpy(szSection, "[Basic]");
SearchKey(file, szSection); // position to section
if(!ReadKeyString(file, "Data1", szData1, STS)) throw 100; // data filename (absolutely necessary!)
CorrectPathDelimiter(szData1);
ReadKeyString(file, "Data2", szData2, STS); // 2nd data filename (test data)
CorrectPathDelimiter(szData2);
ReadKeyString(file, "VarTypes", szVarTypes, MAX_N_VAR); // variable types
outcol = ReadKeyValue(file, "Output", DEF_OUTCOL)-1; // output column, one-based in the file but kept zero-based
// no default is passed for 'Regression'; Synchronize() reports the key as missing if the value is negative
f_Regression = ReadKeyBool (file, "Regression");
f_Randomize = ReadKeyBool (file, "Randomize", DEF_RANDOMIZE); // flag: initialize random number generator
N_G_Max = ReadKeyValue(file, SZ_N_G_MAX, DEF_N_G_MAX); // PNC parameter
N_Bins = ReadKeyValue(file, SZ_N_BINS, DEF_N_BINS); // # bins used to discr. continuous variables
overlapFac = ReadKeyValue(file, SZ_OVERLAP_FAC, DEF_OVERLAP_FAC);
f_NormalizeByRange = ReadKeyBool (file, SZ_NORMALIZE_BY_RANGE, DEF_NORMALIZE_BY_RANGE);
f_EqualWidthBinning= ReadKeyBool (file, SZ_EQUAL_WIDTH_BINNING, DEF_EQUAL_WIDTH_BINNING);
//----------------------------------------------------------------------------------------------------------------
// b) read section '[Tuning]'
file.seekg(curPos); // restore stream position
strcpy(szSection, "[Tuning]");
SearchKey(file, szSection); // position to section
// no default is passed for 'Tune' either; Synchronize() checks for a negative value
f_Tune = ReadKeyBool(file, "Tune");
ReadKeyString(file, "Type", szTuneType, STS); // tuning type
tuneType = StringToTestType(szTuneType); // an empty/unknown string yields the default tuning type
N_R_Tune = ReadKeyValue(file, "N_R", 0); // # repetitions
N_L_Tune = ReadKeyValue(file, "N_L", 0); // # learn tuples
N_T_Tune = ReadKeyValue(file, "N_T", 0); // # test tuples
split = ReadKeyValue(file, "DataSplitting", 0); // data splitting note: used in GUI instead
// of 'N_L' and 'N_T'
maxSize = ReadKeyValue(file, SZ_MAX_SIZE, DEF_MAX_SIZE); // maximum size of learned model (in percent)
minCompression =
ReadKeyValue(file, SZ_MIN_COMPRESSION, DEF_MIN_COMPRESSION); // min. compression rate
f_Skipping = ReadKeyBool(file, SZ_SKIPPING, DEF_SKIPPING); // skip active flag
//----------------------------------------------------------------------------------------------------------------
// c) read section '[Task...]'
try
{
while(true) // note: loop is left by exception (... although this is bad style ;-)
{
// c1) compose section name from the # tasks read so far and position to section
sprintf(szSection, "[Task%d]", tasks.Size()+1);
file.seekg(curPos); // restore stream position
SearchKey(file, szSection); // position to section, presumably throws 'KeyNotFound' if there is no such section
// c2) read parameter strings
TTask task;
task.Load(file); // read task strings from file
// c3) add task to task list
tasks.Ins() = task;
}
}
catch(int errNo)
{
// suppress error 'key not found', the # tasks is checked for the batch interface in TProjectB::Synchronize()
if(errNo!=KeyNotFound)
throw errNo; // else propagate exception to the outer handler below
}
file.seekg(curPos); // restore stream position (only reached if nothing went wrong)
}
catch(int errNo) // exception handling: translate the error code to a text
{
// NOTE(review): the text is composed with sprintf() into a buffer of STS characters without length check
char szText[STS];
// compose error text
if(errNo==100)
sprintf(szText, "Section '[Basic]': (Learn) data filename missing! Please specify key 'Data1'!");
else
if(strcmp(szSection, GetLastKey())==0) // the key searched last was the section name itself
sprintf(szText, "Section '%s' not found!", szSection);
else
sprintf(szText, "Section '%s' reading key '%s': %s", szSection, GetLastKey(), GetLastError(errNo));
ThrowTypeU(szText); // 'propagate' exception
}
}
//----------------------------------------------------------------------------------------------------------------------
// Write the project settings to a file.
//    file                : output stream, written left justified; right justified output is restored at the end
//    f_WriteTuningAnyway : write the keys of section '[Tuning]' even if tuning is disabled
// Sections are written in the order '[Basic]', '[Task1]' ... '[TaskN]', '[Tuning]'. If the project has not been
// checked by Synchronize() a comment line stating this is written first.
void TProject::Save(ofstream& file, const bool& f_WriteTuningAnyway/*=false*/)
{
   if(!f_Checked)
   {
      file << ComChar << " This is an unchecked copy of the project file! It's just what I loaded!" << endl << endl;
   }

   file << setiosflags(ios::left) << resetiosflags(ios::right);              // set left justified output

   //----------------------------------------------------------------------------------------------------------------
   // a) section [Basic]
   file << "[Basic]" << endl;

   file << setw(WNAME) << "Data1" << " = " << szData1 << endl;
   if(szData2[0]!='\0')
   {
      file << setw(WNAME) << "Data2" << " = " << szData2 << endl;
   }

   // variable types: prefer the types of the associated data object, else the string that was read in, else '?'
   file << setw(WNAME) << "VarTypes" << " = ";
   if(data1)
   {
      for(int col=0; col<data1->nVar(); col++)
      {
         file << TData::VarTypeToChar(data1->GetVarType(data1->ActualColumnInData(col)));
      }
   }
   else if(szVarTypes[0]!='\0')
   {
      file << szVarTypes;
   }
   else
   {
      file << "?";
   }
   file << endl;

   file << setw(WNAME) << "Output"               << " = " << (outcol+1)                         << endl;
   file << setw(WNAME) << "Regression"           << " = " << FlagToString(f_Regression)         << endl;
   file << setw(WNAME) << "Randomize"            << " = " << FlagToString(f_Randomize)          << endl;
   file << setw(WNAME) << SZ_N_G_MAX             << " = " << N_G_Max                            << endl;
   file << setw(WNAME) << SZ_N_BINS              << " = " << N_Bins                             << endl;
   file << setw(WNAME) << SZ_OVERLAP_FAC         << " = " << overlapFac                         << endl;
   file << setw(WNAME) << SZ_NORMALIZE_BY_RANGE  << " = " << FlagToString(f_NormalizeByRange)   << endl;
   file << setw(WNAME) << SZ_EQUAL_WIDTH_BINNING << " = " << FlagToString(f_EqualWidthBinning)  << endl;

   //----------------------------------------------------------------------------------------------------------------
   // b) sections [Task...], numbered from one
   for(int taskNo=0; taskNo<tasks.Size(); taskNo++)
   {
      file << endl << endl;
      file << "[Task" << (taskNo+1) << "]" << endl;
      tasks.Get(taskNo).Save(file);
   }

   // necessary as TTask::Save() resets to right justified output
   file << setiosflags(ios::left) << resetiosflags(ios::right);

   //----------------------------------------------------------------------------------------------------------------
   // c) section [Tuning]
   file << endl << endl;
   file << "[Tuning]" << endl;
   file << setw(WNAME) << "Tune" << " = " << FlagToString(f_Tune) << endl;

   if(f_Tune || f_WriteTuningAnyway)
   {
      file << setw(WNAME) << "Type" << " = " << TestTypeToString(tuneType) << endl;

      if(tuneType!=Loocv)
      {
         file << setw(WNAME) << "N_R" << " = " << N_R_Tune << endl;
      }

      if(tuneType==Rep)
      {
         // note: either split or N_L and N_T are given; split 'writes through' to N_L and N_T but not vice
         //       versa. Thus prefer split!
         if(split!=0)
         {
            file << setw(WNAME) << "DataSplitting" << " = " << split << endl;
         }
         else
         {
            file << setw(WNAME) << "N_L" << " = " << N_L_Tune << endl;
            file << setw(WNAME) << "N_T" << " = " << N_T_Tune << endl;
         }
      }

      file << setw(WNAME) << SZ_MAX_SIZE        << " = " << maxSize                  << endl;
      file << setw(WNAME) << SZ_MIN_COMPRESSION << " = " << minCompression           << endl;
      file << setw(WNAME) << SZ_SKIPPING        << " = " << FlagToString(f_Skipping) << endl;
   }

   file << resetiosflags(ios::left) << setiosflags(ios::right);              // restore right justified output
}
//----------------------------------------------------------------------------------------------------------------------
// Check the project parameters against their constraints and against the given data object, set output column
// and variable types in the data object and set the dependent variables (tuning counts).
//    _data1              : learn data object; it gets associated with the project and is modified (variable
//                          types and output column are set)
//    f_CheckTuningAnyway : check the settings of section '[Tuning]' even if tuning is disabled (used for GUI)
// Returns the parameter set list generated from the task list - WARNING: caller has to release the returned
// TParaSetList !!
// Each violated constraint is signalled inside as an int code, translated to a text and handed to ThrowTypeU().
TParaSetList* /*cr*/ TProject::Synchronize(TData*const& _data1, const bool& f_CheckTuningAnyway/*=false*/)
{
   data1 = _data1;                                       // associate data
   f_Checked = true;
   int varId=0;                                          // declared out here as error texts 121/122 refer to it
   TParaSetList* sets = NULL;

   // check
   try
   {
      //-------------------------------------------------------------------------------------------------------------
      // a) section '[Basic]'
      if(outcol<0 || outcol >= data1->nVar())                           throw 102;  // output column
      if(N_G_Max < MIN_N_G_MAX )                                        throw 103;  // PNC (global) parameter
      if(N_Bins < MIN_N_BINS || N_Bins > MAX_N_BINS)                    throw 104;  // # bins
      if(overlapFac < MIN_OVERLAP_FAC || overlapFac > MAX_OVERLAP_FAC)  throw 105;  // overlap factor
      if(data1->nTup()<2)                                               throw 110;  // # data tuples
      if(data1->nVar()>MAX_N_VAR)                                       throw 111;  // too many variables
      if(f_Regression < 0)                                              throw 112;  // key 'Regression' not found

      // manipulate data and evaluate variable types string, i.e. set variable types
      if(szVarTypes[0]!='\0')                            // if variable types are specified ...
      {
         for(varId=0;varId<data1->nVar();varId++)        // set type for each variable
         {
            if(szVarTypes[varId]=='\0')
               throw 120;                                // error: unexpected end of string

            // convert char to enum, legal are 'c' and 'n' ...
            const TData::TVarType type = TData::CharToVarType(szVarTypes[varId]);
            if(type==TData::none)
               throw 121;                                // ... and check this conversion

            // NOTE(review): IsInteger() is asked with the actual column id but nIntegerMaxMin() with varId -
            //               confirm that this mix is intended
            const int actId = data1->ActualColumnInData(varId);
            if(type==TData::symb && (!data1->IsInteger(actId)
                                     || data1->nIntegerMaxMin(varId)>MAX_INP_SYMBOLS))
               throw 122;                                // throw error if type cannot be set

            data1->SetVarType(actId, type);              // set variable type
         }
         if(szVarTypes[varId]!='\0')
            throw 123;  // all variable types have been set, now check if string is terminated, else throw error
      }
      else                                               // use defaults but ...
      {
         for(int j=0;j<data1->nVar();j++)
            // ... ensure that symbolic variables have at most a range of 'MAX_INP_SYMBOLS'
            if(data1->nIntegerMaxMin(j)>MAX_INP_SYMBOLS)
               data1->SetVarType(j, TData::cont);
      }

      data1->SetOutputColumn(outcol);                    // set output column

      // (hack!): originally this check was NOT done for GUI. But why? Now it's done ...
      // presumably variable 0 is the output now, as SetOutputColumn() has just been called - confirm
      if(!f_Regression && !data1->IsSymbolic(0))
         throw 107;                                      // continuous output cannot be a classification task

      //-------------------------------------------------------------------------------------------------------------
      // b) section '[Tuning]'
      if(f_Tune < 0)
         throw 500;                                      // key 'Tune' not found

      // check settings if tuning is enabled or explicitly specified (used for GUI)
      if(f_Tune || f_CheckTuningAnyway)
      {
         if(szTuneType[0]=='\0')                            throw 503;  // key 'Type' missing
         if(tuneType==Special)                              throw 505;  // special not allowed for tuning
         if((tuneType==Cv || tuneType==Rep) && N_R_Tune==0)
            throw 506;                                                  // CV and repetition need N_R ...
         if(tuneType==Loocv && N_R_Tune!=0)
            throw 507;                                                  // ... but Loocv doesn't

         if(split!=0)                                    // if split is specified
         {
            // ToDo: check if N_T or N_L is given and throw exception if it is, as it is ignored
            if(tuneType!=Rep)                               throw 509;  // only allowed for type repetition
         }
         else
         {
            if(tuneType==Rep && (N_L_Tune==0 || N_T_Tune==0)) throw 510;  // repetition needs # learn and test tuples
            if(tuneType!=Rep && (N_L_Tune!=0 || N_T_Tune!=0)) throw 511;  // ... all other types don't
         }

         if(maxSize<MIN_MAX_SIZE || maxSize>MAX_MAX_SIZE)                              throw 521;  // maximum model size
         if(minCompression<MIN_MIN_COMPRESSION || minCompression>MAX_MIN_COMPRESSION)  throw 522;  // min. compr. rate

         if(tuneType!=Loocv)                             // check repetition/cross-validation count
         {
            if(N_R_Tune<Min_N_R_Tune())                     throw 523;  // minimum
            if(N_R_Tune>Max_N_R_Tune())                     throw 524;  // maximum
         }

         // now we've checked and can initialize (if necessary) N_L_Tune, N_T_Tune and N_R_Tune with respect to
         // the tuning type
         IniTuneCounts();

#ifndef VALIDATION_1
         if(tuneType==Rep && N_T_Tune+N_L_Tune>Peek_N_L())
            throw 513;                                   // check for overlapping of learn and test data
#endif
         if(tuneType==Cv && N_R_Tune>Peek_N_L())
            throw 514;                                   // # cross-validations cannot be greater than # data tuples
      }

      //-------------------------------------------------------------------------------------------------------------
      // c) section '[Task...]'
      sets = ::ToParaSetList(&tasks, data1, f_Regression);
   }
   catch(int errNo)                                      // exception handling
   {
      delete sets;                                       // release

      // NOTE(review): the texts are composed with sprintf() into a buffer of STS characters without length
      //               check - confirm that STS is large enough for the longest message
      char szText[STS];                                  // compose error text
      switch(errNo)
      {
         // section '[Basic]'
         case 102 : sprintf(szText, "Section '[Basic]': Output column not given or value not e[1..%d]! Please specify or correct key 'Output'!", data1->nVar()); break;
         case 103 : sprintf(szText, "Section '[Basic]': Parameter '%s' must be e[%d..oo]! Please correct!", SZ_N_G_MAX, MIN_N_G_MAX); break;
         case 104 : sprintf(szText, "Section '[Basic]': Parameter '%s' must be e[%d..%d]! Please correct!", SZ_N_BINS, MIN_N_BINS, MAX_N_BINS); break;
         // the overlap factor is fractional: print its bounds as floating point, the explicit casts keep the
         // arguments matching the format no matter how the bounds are defined
         case 105 : sprintf(szText, "Section '[Basic]': Parameter '%s' must be e[%g..%g]! Please correct!", SZ_OVERLAP_FAC, (double)(MIN_OVERLAP_FAC), (double)(MAX_OVERLAP_FAC)); break;
         case 107 : sprintf(szText, "Section '[Basic]': Flag 'Regression' must be set if output is continuous!"); break;
         case 110 : sprintf(szText, "Insufficient number of tuples in data file '%s'!", szData1); break;
         case 111 : sprintf(szText, "Too many variables in data file '%s'!", szData1); break;
         case 112 : sprintf(szText, "Section '[Basic]': Key 'Regression' not found! Please specify key 'Regression'!"); break;
         case 120 : sprintf(szText, "Section '[Basic]': Unexpected end of variable type string! Please specify type for every variable or remove key 'VarTypes'!"); break;
         case 121 : sprintf(szText, "Section '[Basic]': Character %d of variable type string is illegal! Please correct to either 'c' or 'n'!", varId+1); break;
         case 122 : sprintf(szText, "Section '[Basic]': Cannot set variable %d to be symbolic (nominal)! Please correct variable type string 'VarTypes'!", varId+1); break;
         case 123 : sprintf(szText, "Section '[Basic]': Variable type string must have exactly %d characters! Please correct key 'VarTypes'!", data1->nVar()); break;

         // section '[Tuning]'
         // the key is named 'Tune' in the project file (see Load() and Save()), the text used to say 'Tuning'
         case 500 : sprintf(szText, "Section '[Tuning]': Flag 'Tune' not found! Please specify key 'Tune'!"); break;
         case 503 : sprintf(szText, "Section '[Tuning]': Tuning type missing! Please specify key 'Type'!"); break;
         case 505 : sprintf(szText, "Section '[Tuning]': Type 'Special' not allowed as tuning type! Please correct key 'Type'!"); break;
         case 506 : sprintf(szText, "Section '[Tuning]': Number of repetitions/cross-validations not given or out of range! Please specify key 'N_R'!"); break;
         case 507 : sprintf(szText, "Section '[Tuning]': Number of repetitions/cross-validations must not be given if tuning type is 'Loocv'! Please remove key 'N_R'!"); break;
         case 509 : sprintf(szText, "Section '[Tuning]': Data splitting must not be given for types 'Cross-Validation' or 'Loocv'! Please remove key 'DataSplitting'!"); break;
         case 510 : sprintf(szText, "Section '[Tuning]': Number of learn and test tuples not given or out of range! Please specify keys 'N_L' and 'N_T' or key 'DataSplitting'!"); break;
         case 511 : sprintf(szText, "Section '[Tuning]': Number of learn and test tuples must not be given for types 'Cross-Validation' or 'Loocv'! Please remove keys 'N_L' and 'N_T'!"); break;
         case 513 : sprintf(szText, "Section '[Tuning]': Overlapping of learn and test data! 'N_L+N_T<=%d' must yield!", Peek_N_L()); break;
         case 514 : sprintf(szText, "Section '[Tuning]': Too many cross-validations! 'N_R<=%d' must yield!", Peek_N_L()); break;
         case 521 : sprintf(szText, "Section '[Tuning]': Maximum model size must be e[%d..%d]! Please correct key 'MaxSize'!", MIN_MAX_SIZE, MAX_MAX_SIZE); break;
         case 522 : sprintf(szText, "Section '[Tuning]': Minimum compression rate must be e[%d..%d]! Please correct key 'MinCompression'!", MIN_MIN_COMPRESSION, MAX_MIN_COMPRESSION); break;
         case 523 : sprintf(szText, "Section '[Tuning]': N_R must be >= %d! Please correct!", Min_N_R_Tune()); break;
         case 524 : sprintf(szText, "Section '[Tuning]': N_R must be <= %d! Please correct!", Max_N_R_Tune()); break;

         // any other code (e.g. one raised while generating the parameter set list) used to leave the text
         // uninitialized
         default  : sprintf(szText, "Unexpected error (code %d) while checking the project settings!", errNo); break;
      }
      ThrowTypeU(szText);                                // 'propagate' exception
   }

   // return parameter set list generated from the task list. note: not really good style but this list was
   // generated while checking, so why not use it?
   return sets;
}
//----------------------------------------------------------------------------------------------------------------------
// Set the output column. The value is bound to the legal range [0..nVar-1] first. Note: buffered, nothing is
// changed in the associated data object until you call WriteThroughOutCol(). If the new output column cannot
// be a classification output the regression flag is set.
void TProject::SetOutCol(int _outcol)
{
   IfTrueThrowTypeA(!data1, "No learn data object associated!", "TProject::OutCol", szModule);

   int column = _outcol;
   if(column < 0)                                        // lower bound
      column = 0;
   if(column >= data1->nVar())                           // upper bound
      column = data1->nVar()-1;
   outcol = column;

   if(CouldBeClassification(outcol))
      return;                                            // flag may stay as it is
   f_Regression = true;                                  // output cannot be symbolic: force regression
}
//----------------------------------------------------------------------------------------------------------------------
// Set the maximum model size, bound to [MIN_MAX_SIZE..MAX_MAX_SIZE]; it is used as constraint while tuning
// parameters.
void TProject::SetMaxSize(int size)
{
   int bounded = size;
   if(bounded<MIN_MAX_SIZE)
      bounded = MIN_MAX_SIZE;
   if(bounded>MAX_MAX_SIZE)
      bounded = MAX_MAX_SIZE;

   maxSize = bounded;
}
//----------------------------------------------------------------------------------------------------------------------
// Set the minimum compression, bound to [MIN_MIN_COMPRESSION..MAX_MIN_COMPRESSION]; it is used as constraint
// while tuning parameters.
//    size : requested minimum compression; values outside the legal range are bound to the nearest limit
void TProject::SetMinCompression(int size)
{
   if(size<MIN_MIN_COMPRESSION)                          // check bounds
      size = MIN_MIN_COMPRESSION;
   if(size>MAX_MIN_COMPRESSION)
      size = MAX_MIN_COMPRESSION;                        // was MIN_MIN_COMPRESSION: too large values dropped to the minimum
   minCompression = size;                                // set
}
//----------------------------------------------------------------------------------------------------------------------
// Set the maximum group size, bound to [MIN_N_G_MAX..MAX_N_G_MAX].
void TProject::Set_N_G_Max(const int& _N_G_Max)
{
   if(_N_G_Max>=MIN_N_G_MAX && _N_G_Max<=MAX_N_G_MAX)
   {
      N_G_Max = _N_G_Max;                                // legal value: take it as it is
   }
   else if(_N_G_Max<MIN_N_G_MAX)
   {
      N_G_Max = MIN_N_G_MAX;
   }
   else
   {
      N_G_Max = MAX_N_G_MAX;
   }
}
//----------------------------------------------------------------------------------------------------------------------
// Set the # bins used to discretize continuous variables, bound to [MIN_N_BINS..MAX_N_BINS].
//    _N_Bins : requested # bins; values outside the legal range are bound to the nearest limit
void TProject::Set_N_Bins(const int& _N_Bins)
{
   // check constraints; the lower bound branch used to write to N_G_Max, which left N_Bins unchanged and
   // overwrote the maximum group size instead
   if(_N_Bins<MIN_N_BINS)        N_Bins = MIN_N_BINS;
   else if(_N_Bins>MAX_N_BINS)   N_Bins = MAX_N_BINS;
   else                          N_Bins = _N_Bins;
}
//----------------------------------------------------------------------------------------------------------------------
// Set the overlap factor. The value is rounded to PRC_OVERLAP_FAC decimal places first and then bound to
// [MIN_OVERLAP_FAC..MAX_OVERLAP_FAC].
//    __fac : requested overlap factor
void TProject::SetOverlapFac(const float& __fac)
{
   // 10^PRC_OVERLAP_FAC computed with integers: converting the floating point result of pow() to int truncates
   // and can be one too small if pow() returns a value slightly below the exact power
   int r = 1;
   for(int digit=0; digit<PRC_OVERLAP_FAC; digit++)
      r *= 10;

   float _fac = floor(__fac*r + 0.5)/r;                  // round to precision

   if(_fac<MIN_OVERLAP_FAC)      overlapFac = MIN_OVERLAP_FAC;   // check constraints
   else if(_fac>MAX_OVERLAP_FAC) overlapFac = MAX_OVERLAP_FAC;
   else                          overlapFac = _fac;
}
//----------------------------------------------------------------------------------------------------------------------
// total # parameter sets for tuning
int TProject::nParaSets() const
{
int nSets =0; // ini
for(int i=0;i<tasks.Size();i++) // sum up over all tasks
nSets += tasks.Get(i).nSets; // get i-th task from list
return nSets;
}
//----------------------------------------------------------------------------------------------------------------------
// # parameter sets for tuning that need learning
int TProject::nParaSetsLearn() const
{
int nSets =0; // ini
for(int i=0;i<tasks.Size();i++) // sum up over all tasks
nSets += tasks.Get(i).nSetsLearn; // get i-th task from list
return nSets;
}
//----------------------------------------------------------------------------------------------------------------------
// add task to task list, actualize parameter set count
void TProject::AddTask(const TTask*const& task)
{
IfTrueThrowTypeA(!data1, "No learn data object associated!", "TProject::AddTask", szModule); // check
TTaskList list; // create temporary task list ...
list.Ins() = *task; // ... and insert new task as the only list entry
delete ::ToParaSetList(&list, data1, f_Regression); // then parse the task list to determine # parameter sets
// note: immediately delete return value
tasks.Pos(tasks.Size()-1); // position to last element
tasks.Ins() = list.Get(0); // add to task list
}
//----------------------------------------------------------------------------------------------------------------------
// Check if the given column has integer values (and not too many of them) and thus the problem can be considered
// as classification problem. True if nIntegerMaxMin() of the column is positive and below MAX_OUT_SYMBOLS.
bool TProject::CouldBeClassification(const int& oriId)
{
   IfTrueThrowTypeA(!data1, "No learn data object associated!", "TProject::CouldBeClassification", szModule); // check

   const int nInt = data1->nIntegerMaxMin(data1->ActualColumnInData(oriId));
   if(nInt<=0)
      return false;
   return nInt<MAX_OUT_SYMBOLS;
}
//----------------------------------------------------------------------------------------------------------------------
// Check if the specified input(!) variable could be symbolic, i.e. if its nIntegerMaxMin() is positive and below
// MAX_INP_SYMBOLS. Note: the output type is ignored! Types are only used for input variables!
bool TProject::CouldBeSymbolic(const int& varId)
{
   IfTrueThrowTypeA(!data1, "No learn data object associated!", "TProject::CouldBeSymbolic", szModule); // check

   if(!(data1->nIntegerMaxMin(varId)<MAX_INP_SYMBOLS))
      return false;                                      // too many different values
   return data1->nIntegerMaxMin(varId)>0;
}
//----------------------------------------------------------------------------------------------------------------------
// Write the buffered output column through to the learn data object and - if there is a test data object and
// the caller states that it comes with an output - to the test data object as well. (The original note says
// that the output column will be moved to first.)
void TProject::WriteThroughOutCol(const bool& f_TestDataIsWithOutput)
{
   IfTrueThrowTypeA(!data1, "No learn data object associated!", "TProject::WriteThroughOutCol", szModule); // check

   data1->SetOutputColumn(outcol);                       // learn data

   if(!data2 || !f_TestDataIsWithOutput)
      return;                                            // no test data to adjust
   data2->SetOutputColumn(outcol);                       // test data
}
//----------------------------------------------------------------------------------------------------------------------
// # classes of the given problem if it is/would be considered as classification problem, i.e. nIntegerMaxMin()
// of the output column. Note: will be zero if output is continuous.
int TProject::nClasses()
{
   IfTrueThrowTypeA(!data1, "No learn data object associated!", "TProject::nClasses", szModule); // check

   const int actId = data1->ActualColumnInData(outcol);
   return data1->nIntegerMaxMin(actId);
}
//----------------------------------------------------------------------------------------------------------------------
// Set the data splitting, bound to [MIN_SPLIT..MAX_SPLIT], and derive the tuning tuple counts from it: 'split'
// percent of Peek_N_L() become learn tuples, the remaining ones test tuples.
void TProject::SetDataSplitting(const int& _split)
{
   IfTrueThrowTypeA(!data1, "No learn data object associated!", "TProject::SetDataSplitting", szModule); // check

   if(_split<MIN_SPLIT)                                  // lower bound
      split = MIN_SPLIT;
   else
      split = _split;
   if(split>MAX_SPLIT)                                   // upper bound
      split = MAX_SPLIT;

   N_L_Tune = split*Peek_N_L()/100.0;                    // # learn tuples
   N_T_Tune = Peek_N_L()-N_L_Tune;                       // # test tuples
}
//----------------------------------------------------------------------------------------------------------------------
// Minimum tuning repetition/cross-validation count: 2 for cross-validation, 1 for all other tuning types.
int TProject::Min_N_R_Tune()
{
   return (tuneType==Cv) ? 2 : 1;
}
//----------------------------------------------------------------------------------------------------------------------
// Maximum tuning repetition/cross-validation count: half of Peek_N_L() for cross-validation, MAX_N_R_TUNE for
// all other tuning types.
int TProject::Max_N_R_Tune()
{
   if(tuneType==Cv)
      return Peek_N_L()/2;
   return MAX_N_R_TUNE;
}
//----------------------------------------------------------------------------------------------------------------------
// Initialize the tuning learn and test data tuple counts (N_L_Tune and N_T_Tune) and the tuning
// repetition/cross-validation count (N_R_Tune) with respect to the tuning type. Note: for type 'Repetition'
// this is only done if a data splitting was given.
void TProject::IniTuneCounts()
{
   switch(tuneType)
   {
      case Rep :                                         // repetition
         if(split!=0)
            SetDataSplitting(split);
         break;

      case Cv :                                          // N-fold cross-validation
         N_T_Tune = Peek_N_L()/N_R_Tune;
         N_L_Tune = Peek_N_L()-N_T_Tune;
         break;

      case Loocv :                                       // leave-one-out cross-validation
         N_L_Tune = Peek_N_L()-1;
         N_T_Tune = 1;
         N_R_Tune = Peek_N_L();
         break;

      default :                                          // nothing to initialize for other types
         break;
   }
}