static const char* szModule = "predcition.cpp";
//------------------------------------------------------------------------------
// module prediction.cpp //
// //
// The class TPrediction predicts the unknown output value given the //
// input based on the model learned with the PNC cluster/learn algorithm. //
// The class TCluster encapsulates such a model. //
// See below or http://www.newty.de/pnc2/sdocu.html for more information. //
// //
// copyright (c) 2000-2003 by Lars Haendel //
// home: www.newty.de //
// //
// This program is free software and can be used under the terms of the //
// GNU licence. See header file for further information and disclaimer. //
// //
//------------------------------------------------------------------------------
// //
// CREATE: Pass learned model (TCluster) and (if you like) alternate //
// parameters (TParameter). //
// //
// USE: Predict one single input(!) data tuple (float*) or predict //
// several tuples (TData*) by calling Predict(). In the latter case //
// you can pass an output file, a loss object and you can specify if //
// the data is passed with or without the output values. Of course //
// you cannot get meaningful loss results if the real output values //
// are not present but you may access the predictions made from the //
// loss object via the function Y_p() . //
// //
// File I/O: some output in function Predict() //
//------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------------------------------------
#include <stdlib> // due to qsort()
#include <stdio> // sprintf()
#include <iomanip> // setw()
#include <math> // sqrt()
#include <values> // MAXFLOAT
#include "prediction.h"
#include "fileutil.h" // file I/O utilities
#include "defines.h"
#include "exception.h" // IfTrueThrowTypeA()
//----------------------------------------------------------------------------------------------------------------------
//#define PREDICTION_DIAGNOSTIC // for each prediction: write distance of tuple to nearest cuboid etc. to logfile
#ifdef PREDICTION_DIAGNOSTIC
ofstream prdDiagFile("C:\\prddiag.dat");
#endif
//---------------------------------------------------------------------------------------
// utility function: 1/exp(x) - note: approximation is used for better performance
#ifdef DEBUG
float expn(float x)
{
IfTrueThrowTypeB(x<0, "Function called with negative argument!", "expn", szModule);
return 1.0/(1.0 + x + x*x*0.5 + x*x*x*0.1666667);
}
#else
inline float expn(float x){ return 1.0/(1.0 + x + x*x*0.5 + x*x*x*0.1666667); }
#endif
//----------------------------------------------------------------------------------------------------------------------
// constructor
TPrediction::TPrediction(TCluster*const& _cls, const TParameter*const& _para/*=NULL*/)
{
// copy
cls = _cls;
if(_para)
para = _para; // use parameter object if given
else
para = cls->GetParameters(); // else: take parameters from TCluster object
beta = 1.0/(para->Sigma*(cls->nVar()-1)); // kernel width for regression
d_ins = new float[cls->nCuboids()]; // cuboid 'inside' distances/probability
d_out = new float[cls->nCuboids()]; // cuboid 'outside' distances/probability
cls->Prepare(para); // prepare mapper to use only those cuboids with a mass bigger than 'para.K'
}
//----------------------------------------------------------------------------------------------------------------------
// predict unknown output value for given input
float TPrediction::Predict(const float*const x, float& nIns)
{
nIns=0; // initialize
float r_value=0;
if(cls->nCuboidsRed()>0) // if there are any cuboids -> predict using ...
if(para->f_Regression)
r_value = StandardPrediction(x, nIns); // ... standard prediction for regression tasks
else
r_value = WeightedCountPrediction(x, nIns); // ... and "weighted count" for classification tasks ...
return r_value;
}
//----------------------------------------------------------------------------------------------------------------------
// standard inside-outside nearest cuboid prediction
float TPrediction::StandardPrediction(const float*const& x, float& nIns)
{
// 1. reset all probabilities
for(int t=0;t<cls->nCuboidsRed();t++)
d_ins[t] = d_out[t] = 0;
// 2. calculate probabilities
for(int t=cls->nCuboidsRed();t--;) // for each cuboid
{
// a) calculate distance to t'th cuboid
float d;
if(para->Prune) // use either pruned or none-pruned model
d = cls->PrunedDistance(x, t);
else
d = cls->Distance(x, t);
// b) if tuple is inside cuboid -> ...
if(d==0)
{
// set t'th cuboid 'inside'-probability
#ifdef HITRATE
d_ins[t] = cls->Hitrate(t);
#else
d_ins[t] = 1.0;
#endif
nIns++; // increment 'inside' counter
}
else // ... else if tuple is 'outside'
{
#ifdef GAUSS_MSF
d *= d; // square distance
#endif
// set t'th cuboid 'outside' probability
#ifdef HITRATE
d_out[t] = expn(beta*d)*cls->Hitrate(t); // modify probability by hitrate
#else
d_out[t] = expn(beta*d);
#endif
}
}
// 3. evaluate probabilities
double y =0.0; // ini
if(!EvMean(d_ins, y)) // minimum risk evaluation (COG = center of gravity)
EvMean(d_out, y);
return y;
}
//----------------------------------------------------------------------------------------------------------------------
// calculate mean of all probabilities (minimum risk evaluation, COG)
bool TPrediction::EvMean(const float*const& mu, double& y)
{
double sum=0.0; // ini
// calculate mean of all probabilities
for(int t=0;t<cls->nCuboidsRed();t++)
{
y += mu[t]*cls->Output(t);
sum += mu[t];
}
if(sum==0) // no probabilities have been greater than zero -> do not predict and return false
return false;
y /= sum; // ... else
return true;
}
//----------------------------------------------------------------------------------------------------------------------
// predict value with maximal probability (maximum likelihood evaluation)
bool TPrediction::EvMax(const float*const& mu, double& y)
{
int maxId=0; // note: ini with invalid value should also be possible as there is at least one cuboid
float max=-MAXFLOAT;
// estimate maximum of all suggestions
for(int t=0;t<cls->nCuboidsRed();t++) // for all cuboids
if(mu[t] > max) // if probability is greater -> new maximum is found
{
max = mu[t];
maxId=t;
}
if(max==0) // no probabilities have been greater than zero -> do not predict and return false
return false;
y=cls->Output(maxId); // predict value with maximal probability
return true;
}
//----------------------------------------------------------------------------------------------------------------------
// weighted count prediction for given tuple
float TPrediction::WeightedCountPrediction(const float*const& x, float& nIns)
{
// 1. calculate distance of tuple to each cuboid
for(int t=0;t<cls->nCuboidsRed();t++)
if(para->Prune) // distance to t-th cuboid
d_out[t] = cls->PrunedDistance(x, t);
else
d_out[t] = cls->Distance(x, t);
// 2. evaluate distances and predict
return EvDistances(d_out, nIns);
}
//----------------------------------------------------------------------------------------------------------------------
// evaluate distances from each cuboid to tuple to make a "weighted count" prediction
float TPrediction::EvDistances(const float*const& d, float& nIns)
{
// a) sort distances
TSorter dis(cls->nCuboidsRed());
for(int t=0;t<cls->nCuboidsRed();t++)
{
dis.GetA(t) = d[t]; // 1th criterion distance
dis.GetB(t) = 0; // 2nd criterion not used
}
dis.SortAsc();
// b) determine neighbourhood
float d_ref=dis.GetA(0); // set distance of nearest cuboid as 'reference' distance
#ifndef RELEASE
float max=d_ref*para->W_Kernel+cls->d_ref()*para->W_Kernel_Min; // calculate width of 'neighborhood'
#else
float max=d_ref*para->W_Kernel;
#endif
float w_inv=0; // inverse of width
if((max-d_ref)!=0)
w_inv=1.0/(max-d_ref);
int k=1; // get # cuboids in neighborhood
for(;k<cls->nCuboidsRed();k++)
if(dis.GetA(k)>max)
break;
if(d_ref==0.0) // if smallest distance is zero (inside) then set # cuboids inside
nIns=k;
// c) for each cuboid in the neighbourhood: sum up class proposals
int nClasses=cls->GetDataData()->nIntegerMaxMin(0); // # classes output variable
TSorter count(nClasses);
for(int i=0;i<nClasses;i++) // reset/ini
count.GetA(i) = count.GetB(i)= 0; // 1th criterion and 2nd criterion
// save: reference distance, maximal distance, k and if it's an inside prediction
#ifdef PREDICTION_DIAGNOSTIC
prdDiagFile << d_ref << " " << dis.GetA(nClasses-1) << " " << k << " " << (d_ref==0.0) << endl;
#endif
int minY = cls->GetDataData()->Min()[0];
for(int t=0;t<k;t++) // sum up
{
const int id = cls->Output(dis.GetId(t)) - minY; // output class mapped to id (PoD!)
#ifdef HITRATE
// first criterion: distance weighted count multiplied with hitrate
count.GetA(id) += (1.0-(dis.GetA(t)-d_ref)*w_inv) * cls->Hitrate(dis.GetId(t));
#else
count.GetA(id) += (1.0-(dis.GetA(t)-d_ref)*w_inv) * 1; // first criterion: distance weighted count
#endif
#ifdef MASS_TIE_BREAKING
// 2nd criterion (tie breaking policy): sum up mass
count.GetB(id) += cls->Mass(dis.GetId(t));
#endif
}
#ifndef MASS_TIE_BREAKING
// tie breaking policy: predict most frequent class
// count.GetB(<most frequent class>) = 2; // ToDo
#endif
// d) predict class with highest proposal count
count.SortDes(); // sort descending
return count.GetId(0)+minY; // ... take first (this is the best) and re-map to class value
}
//----------------------------------------------------------------------------------------------------------------------
// predict output value for all tuples in test data object 'data_T', flag f_no_output indicates if the first column in
// data object contains the real output values
void TPrediction::Predict(const TData*const& data_T, TLossFunction*const& loss, ofstream* file/*=NULL*/
, const bool& f_IsWithOutput/*=true*/, const bool* f_Stop/*=NULL*/)
{
// pre-checks
IfTrueThrowTypeA(data_T->nTup()==0, "Function called with empty data object!", "TPrediction::Test", szModule);
IfTrueThrowTypeA(!file&&!loss, "Function called without loss or file stream object!", "TPrediction::Test", szModule);
// a) write header if file exists
if(file)
{
*file << ComChar << " Learn data = " << cls->GetDataData()->FileName() << endl;
*file << ComChar << " Test data = " << data_T->LoadFileName() << endl;
*file << ComChar << " Cuboids = " << cls->nCuboids() << " / " << cls->nCuboidsRed() << " (reduced)" << endl;
*file << endl;
para->Save(*file, true);
}
// b) predict all tuples
progress = 0;
if(f_IsWithOutput)
// a) real output values are given (column 0 in data object) and used to calculate loss
for(int i=0;i<data_T->nTup();i++)
{
float nIns; // # cuboids with distance of zero
float res = Predict(&(data_T->Row(i)[1]), nIns); // predict output value
loss->EvPrediction(res, data_T->Row(i)[0], nIns); // pass predicted and real value to loss function
if(f_Stop) // check stop flag
if(*f_Stop)
break;
progress++;
}
else
// b) no real ouput values are known
for(int i=0;i<data_T->nTup();i++)
{
float nIns; // # cuboids with distance of zero
float res = Predict(data_T->Row(i), nIns); // predict output value
loss->EvPrediction(res, 0, nIns); // pass predicted and dummy real value to loss function
if(f_Stop) // check stop flag
if(*f_Stop)
break;
progress++;
}
// c) write results
if(file)
{
loss->WriteResults(*file, !para->f_Regression, f_IsWithOutput); // write results (loss etc.)
*file << endl << endl; // linefeed
if(f_Stop)
if(*f_Stop) // write error message if stop flag is set
*file << "User break! Simulation terminated!" << endl;
}
}