/*************************************************************************/
/*									 */
/*	Calculate information, information gain, and print dists	 */
/*	--------------------------------------------------------	 */
/*									 */
/*************************************************************************/


#include "buildex.h"


/*************************************************************************/
/*								                                    	 */
/*  Determine the worth of a particular split according to the	    	 */
/*  operative criterion						                        	 */
/*								                                    	 */
/*	    Parameters:					                             		 */
/*		SplitInfo:	potential info of the split	                    	 */
/*		SplitGain:	gain in info of the split		                     */
/*		MinGain:	gain above which the Gain Ratio		                 */
/*				may be used				                                 */
/*									                                     */
/*  If the Gain criterion is being used, the information gain of	     */
/*  the split is returned, but if the Gain Ratio criterion is		     */
/*  being used, the ratio of the information gain of the split to	     */
/*  its potential information is returned.				                 */
/*									                                     */
/*************************************************************************/


float Worth(ThisInfo, ThisGain, MinGain)
/*    -----  */
    float ThisInfo, ThisGain, MinGain;
{
    if ( GAINRATIO )
    {
	if ( ThisGain >= MinGain - Epsilon && ThisInfo > Epsilon )
	{
	    return ThisGain / ThisInfo;
	}
	else
	{
	    return -Epsilon;
	}
    }
    else
    {
	return ( ThisInfo > 0 && ThisGain > -Epsilon ? ThisGain : -Epsilon );
    }
}



/*************************************************************************/
/*									 */
/*  Zero the frequency tables Freq[][] and ValFreq[] up to MaxVal	 */
/*									 */
/*************************************************************************/


    ResetFreq(MaxVal)
/*  ---------  */
    DiscrValue MaxVal;
{
    DiscrValue v;
    ClassNo c;

    ForEach(v, 0, MaxVal)
    { 
	ForEach(c, 0, MaxClass)
	{
	    Freq[v][c] = 0;
	}
	ValFreq[v] = 0;
    } 
}



/*************************************************************************/
/*									                                     */
/*  Given tables Freq[][] and ValFreq[], compute the information gain.	 */
/*									                                     */
/*	    Parameters:						                             	 */
/*		BaseInfo:	average information for all items with	             */
/*				known values of the test attribute	                     */
/*		UnknownRate:	fraction of items with unknown ditto             */
/*		MaxVal:		number of forks			                         	 */
/*		TotalItems:	number of items with known values of	             */
/*				test att				                                 */
/*								                                       	 */
/*  where Freq[x][y] contains the no. of cases with value x for a    	 */
/*  particular attribute that are members of class y,			         */
/*  and ValFreq[x] contains the no. of cases with value x for a		     */
/*  particular attribute					                        	 */
/*								                                    	 */
/*************************************************************************/


float ComputeGain(BaseInfo, UnknFrac, MaxVal, TotalItems)
/*    -----------  */
    float BaseInfo, UnknFrac;
    DiscrValue MaxVal;
    ItemCount TotalItems;
{
    DiscrValue v;
    float ThisInfo=0.0, ThisGain, TotalInfo();
    short ReasonableSubsets=0;

    /*  Check whether all values are unknown or the same  */

    if ( ! TotalItems ) return -Epsilon;

    /*  There must be at least two subsets with MINOBJS items  */

    ForEach(v, 1, MaxVal)
    {
	if ( ValFreq[v] >= MINOBJS ) ReasonableSubsets++;
    }
    if ( ReasonableSubsets < 2 ) return -Epsilon;

    /*  Compute total info after split, by summing the
	info of each of the subsets formed by the test  */

    ForEach(v, 1, MaxVal)
    {
	ThisInfo += TotalInfo(Freq[v], 0, MaxClass);
    }

    /*  Set the gain in information for all items, adjusted for unknowns  */

    ThisGain = (1 - UnknFrac) * (BaseInfo - ThisInfo / TotalItems);

    Verbosity(5)
        printf("ComputeThisGain: items %.1f info %.3f base %.3f unkn %.3f result %.3f\n",
    		TotalItems + ValFreq[0], ThisInfo, BaseInfo, UnknFrac, ThisGain);

    return ThisGain;
}




/*************************************************************************/
/*									                                     */
/*  Compute the total information in V[ MinVal..MaxVal ]	        	 */
/*								                                     	 */
/*************************************************************************/


float TotalInfo(V, MinVal, MaxVal)
/*    ---------  */
    ItemCount V[];
    DiscrValue MinVal, MaxVal;
{
    DiscrValue v;
    float Sum=0.0;
    ItemCount N, TotalItems=0;

    ForEach(v, MinVal, MaxVal)
    {
	N = V[v];

	Sum += N * Log(N);
	TotalItems += N;
    }

    return TotalItems * Log(TotalItems) - Sum;
}

