-- See: L. Allison. Types and classes of machine learning and data mining.
--      26th Australasian Computer Science Conference (ACSC) pp.207-215,
--      Adelaide, February 2003
--      L. Allison. Models for machine learning and data mining in
--      functional programming.      doi:10.1017/S0956796804005301
--      J. Functional Programming, 15(1), pp.15-32, Jan. 2005
-- Author: Lloyd ALLISON           lloyd at bruce cs monash edu au
--         http://www.csse.monash.edu.au/~lloyd/tildeFP/II/200309/
-- This program is free software; you can redistribute it and/or modify it
-- under the terms of the GNU General Public License (GPL) as published by
-- the Free Software Foundation; either version 2 of the License, or (at
-- your option) any later version.  This program is distributed in the hope
-- that it will be useful, but without any warranty, without even the implied
-- warranty of merchantability or fitness for a particular purpose.  See the
-- GNU General Public License for more details.  You should have received a
-- copy of the GNU General Public License with this program; if not, write to:
-- Free Software Foundation, Inc., Boston, MA 02111, USA.

module SM_Utilities (module SM_Utilities) where                   -- export all


latticeK1 = 1/12                                                   -- lattice
latticeK2 = 5/(36 * sqrt(3))                                       -- constants
-- see J.H.Conway and N.J.A.Sloane. Sphere Packings, Lattices and Groups.
--     Springer 1998 3rd edn.


data Throw = H | T  deriving  (Enum, Bounded, Eq, Ord, Show)  --  Mr. Bernoulli

instance Splits Throw where splits = splitsBE                   -- Splits Throw

-- ----------------------------------------------------------------------------


assert  p x     = assert1 p x "assertion failed"
assert1 p x why = if p x then x else error why


log2 = log 2.0


-- logPlusBase b (-logBase b p1) (-logBase b p2) = -logBase b (p1+p2)
logPlusBase b nlPr1 nlPr2 =   -- NB. this is "OK" but there are faster ways.
  let (bigger, smaller) =
        if nlPr1 >= nlPr2 then (nlPr1, nlPr2) else (nlPr2, nlPr1)
      diff = bigger - smaller     -- is non-negative
      eps  = b ** (-diff)         -- between 0 and 1
      nits = log b
  in if diff * nits > 30 then smaller else smaller - (logBase b (1+eps))
  -- the cutoff of 30 nits is somewhat(!) arbitrary.

-- and base e ...
logPlus nlPr1 nlPr2 =   -- NB. this is "OK" but there are faster ways.
  let (bigger, smaller) =
        if nlPr1 >= nlPr2 then (nlPr1, nlPr2) else (nlPr2, nlPr1)
      diff = bigger - smaller     -- is non-negative
      eps  = exp (-diff)          -- between 0 and 1
  in if diff > 30 then smaller else smaller - (log (1+eps))


normalise xs =   -- scale xs to make sum to 1.0
  let (ans, finalTotal) = n xs 0
      n  []    total    = ([], total)
      n (x:xs) total    =
        let (rest, sum) = n xs (total+x)
        in ((x/finalTotal):rest, sum)
  in ans


stirling       n =                                  -- Stirling's approximation
  let ans = (n + 0.5)*(log n) - n + (log(2*pi)) / 2         -- ~ logBase e (n!)
  in max ans 0

stirlingBase b n = (stirling n) / (log b)   -- and to base b

-- ----------------------------------------------------------------------------

prng seed = (seed * (1 + 4*37*109) + 9999) `mod` (32 * 1024)
  -- Pseudo Random Number Generator
  -- ([0 .. 32K-1],  only a 32K cycle,  really should use a better one!)
  -- Linear Congruential Pseudo Random Number Generator,
  -- see:  D. E. Knuth, The Art of Computer Programming, Vol. 2.

-- ----------------------------------------------L.Allison--CSSE--Monash--.au--
-- The class Splits and associated type and functions are primarily to do
-- with partitioning data, as in classification-trees do, although they might
-- have more uses, e.g. in clustering/ unsupervised-classification.

class Splits t where
  splits :: [t] -> [Splitter t]  -- i.e. can find ways of partitioning t-lists;
  -- these ways should be proposed in order of decreasing prior-probability.

data Splitter t = Splitter Int   (t -> Int) (() -> String)
                  -- i.e. arity partition_fn description

instance Show (Splitter t) where
  show (Splitter _ _ description) = description()

aritySplitter (Splitter n _ _)   = n

applySplitter (Splitter _ f _) d = f d  -- apply it to one datum

                       -- partition a data-set, ds, into n parts according to s
split s ds = partition (aritySplitter s) (map (applySplitter s) ds) ds

-- The following is not allowed :-(
-- instance (Bounded t, Enum t) => Splits t where splits = splitsBE
-- because  ... The instance type must be of the form  (T a b c) ...

                                                    -- Splits discrete_type ...
instance Splits Bool   where splits = splitsBE              -- e.g. Splits Bool

                                                   -- Splits continous type ...
instance Splits Float  where splits = splitsOrd           -- e.g. Splits  Float

instance Splits Double where splits = splitsOrd           -- e.g. Splits Double


splitsBE  []    = []        -- calculate the Splitter's for a Bounded Enum type
splitsBE (d:ds) =
  let { lwb = minBound `asTypeOf` d;  lwbn = fromEnum lwb;
        upb = maxBound `asTypeOf` d;  upbn = fromEnum upb;
        arity = upbn - lwbn + 1  }
  in if all ((==) d) ds then []     -- all the same, no variety in data set
     else [Splitter arity (\u -> (fromEnum (u `asTypeOf` d) - lwbn))
          (\()->"="++(show lwb)++(if arity > 2 then ".." else "|")++(show upb))]
  -- If the type is also Ord(ered, e.g. Bad|Poor|...|VG) and if
  -- arity is quite big, then it might be better to use  splitsOrd lwb..upb .

splitsOrd ds =   -- calculate Splitter's for an Ord(ered, inc' continuous) type
  let splitPoints [] = []  -- lines like the next make it all worthwhile
      splitPoints ds = (medianEtc . tail . unique . msort) ds       -- !
  in map (\cut -> Splitter 2 (\y -> if y < cut then 0 else 1)
                             (\() -> "<|>=" ++ show cut)) (splitPoints ds)

                                                           -- Splits tuples ...
instance (Splits t1, Splits t2) => Splits (t1, t2) where    -- e.g. Splits Pair
  splits xys =
    let (xs, ys) = unzip xys
    in interleave (splitsAttr 0 fst xs)
                  (splitsAttr 1 snd ys)
  -- Note, we get hierarchical dataspaces done for free.

instance (Splits t1, Splits t2, Splits t3) => Splits (t1,t2,t3) where -- Triple
  splits xyzs =
    let (xs, ys, zs) = unzip3 xyzs
        sx = splitsAttr 0 (\(x,_,_) -> x) xs
        sy = splitsAttr 1 (\(_,y,_) -> y) ys
        sz = splitsAttr 2 (\(_,_,z) -> z) zs
    in interleave3 sx sy sz

instance (Splits t1, Splits t2, Splits t3, Splits t4) =>
 Splits (t1,t2,t3,t4) where                                             -- Quad
  splits wxyzs =
    let (ws, xs, ys, zs) = unzip4 wxyzs
        sw = splitsAttr 0 (\(w,_,_,_) -> w) ws
        sx = splitsAttr 1 (\(_,x,_,_) -> x) xs
        sy = splitsAttr 2 (\(_,_,y,_) -> y) ys
        sz = splitsAttr 3 (\(_,_,_,z) -> z) zs
    in interleave4 sw sx sy sz

splitsAttr n sel xs =                                     -- selected attribute
  -- get splits for nth attribute of multivariate dataset
  -- NB. sel must select the nth attribute !
  let prefix = "@" ++ (show n)
  in map (\(Splitter n f d)->Splitter n (f.sel) (\()->prefix++d())) (splits xs)

-- ------------------------------------- make tuples instances of class Enum --
                          -- NB. tuples of Bounded types are already in Bounded

instance (Enum t1,Bounded t1, Enum t2,Bounded t2) => Enum (t1,t2) where -- Pair
  fromEnum (v1,v2) =   -- NB. (minBound, minBound') -> 0
    let min1 = fromEnum (minBound `asTypeOf` v1)
        min2 = fromEnum (minBound `asTypeOf` v2)
        max2 = fromEnum (maxBound `asTypeOf` v2)
        width = max2 - min2 + 1
    in (fromEnum v1 - min1)*width + (fromEnum v2 - min2)

  toEnum =   -- had a big battle to remove 'ambiguous type' errors in this
    let (mnB1, mnB2) = minBound `asTypeOf` (te 0)  -- !
        (mxB1, mxB2) = maxBound `asTypeOf` (mnB1, mnB2)
        (min1, min2) = (fromEnum mnB1, fromEnum mnB2)
        (max1, max2) = (fromEnum mxB1, fromEnum mxB2)
        width = max2 - min2 + 1
        te n = (toEnum(min1 + n `div` width), toEnum(min2 + n `mod` width))
    in te

instance (Enum t1,Bounded t1, Enum t2,Bounded t2, Enum t3,Bounded t3) =>
  Enum (t1,t2,t3) where                                               -- triple
    fromEnum (v1,v2,v3) = fromEnum (v1,(v2,v3))
    toEnum n = let (v1,(v2,v3)) = toEnum n in (v1,v2,v3)

instance (Enum t1,Bounded t1, Enum t2,Bounded t2,                       -- quad
  Enum t3,Bounded t3, Enum t4,Bounded t4) => Enum (t1,t2,t3,t4) where
    fromEnum (v1,v2,v3,v4) = fromEnum (v1,(v2,v3,v4))
    toEnum n = let (v1,(v2,v3,v4)) = toEnum n in (v1,v2,v3,v4)

instance (Enum t1,Bounded t1, Enum t2,Bounded t2, Enum t3,Bounded t3,   -- five
  Enum t4,Bounded t4, Enum t5,Bounded t5) => Enum (t1,t2,t3,t4,t5) where
    fromEnum (v1,v2,v3,v4,v5) = fromEnum (v1,(v2,v3,v4,v5))
    toEnum n = let (v1,(v2,v3,v4,v5)) = toEnum n in (v1,v2,v3,v4,v5)

-- ------------------------------9/2002--9/2003--L.Allison--CSSE--Monash--.au--
                                  -- some ordinary but useful List functions...

unzip4 = foldr (\(w,x,y,z) ~(ws,xs,ys,zs) -> (w:ws, x:xs, y:ys, z:zs))
               ([], [], [], [])   -- standard prelude has up to unzip3


-- given a list of heads & a list of tails (lists), put each head on its tail.
prepend heads tails = zipWith (:) heads tails


addToList n x lsts =                                   -- add x to the nth List
  let add n []         = add n [[]]      -- prem' end of Lists, so lengthen!
      add 0 (lst:lsts) = (x:lst) : lsts  -- add x to this lst
      add n (lst:lsts) = lst:(add (n-1) lsts)
  in add n lsts

partition n indxs elts =      -- partition elts into n parts according to indxs
  let p [] [] ans = ans
      p (i:is) (x:xs) ans = p is xs (addToList i x ans) -- add x to i-th part
  in p indxs elts (replicate n [])


interleave [] ys = ys
interleave xs [] = xs                   -- interleave the elements of two lists
interleave (x:xs) (y:ys) = x : y : (interleave xs ys)

interleave3 [] ys zs = interleave ys zs
interleave3 xs [] zs = interleave xs zs
interleave3 xs ys [] = interleave xs ys
interleave3 (x:xs) (y:ys) (z:zs) = x : y : z : (interleave3 xs ys zs)

interleave4 ws xs ys zs = interleave (interleave ws ys) (interleave xs zs)


unique  []    = []   -- POST: no duplicates in output
unique (x:xs) =      -- PRE:  any duplicates in input are adjacent
  let u _  []    = []
      u z (x:xs) = if z == x then u z xs else x : (u x xs)
  in x : (u x xs)


merge [] bs = bs
merge as [] = as                                   -- merge two sorted(!) lists
merge (aas@(a:as)) (bbs@(b:bs)) =
   if a <= b then a : (merge as bbs) else b : (merge aas bs)

msort []     = []
msort [x]    = [x]                                                -- merge sort
msort inList =  -- an (unstable) merge-sort, cos it is simple!
 let (as, bs) = splt inList [] []
     splt  []    as bs = (as, bs)
     splt (x:xs) as bs = splt xs bs (x:as)
 in merge (msort as) (msort bs)


-- NB. This code for medianEtc is an improvement on the 2002 test code both
-- in complexity and also in the order in which its results are produced.
medianEtc [] = []                  -- returns [median, quartiles, octiles, ...]
medianEtc s =  -- PRE: s is sorted
  let add _     0  _  0  _  ss = ss                      -- n1 = n2 = 0
      add _     n1 s1 0  _  ss = ((n1,s1):ss)            -- n1 > 0, n2 = 0
      add _     0  _  n2 s2 ss = ((n2,s2):ss)            -- n1 = 0, n2 > 0
      add True  n1 s1 n2 s2 ss = (n1,s1) : (n2,s2) : ss  -- n1 > 0, n2 > 0
      add False n1 s1 n2 s2 ss = (n2,s2) : (n1,s1) : ss  -- switch _2, _1

      select fwd [] []    = []
      select fwd [] level = select (not fwd) level []    -- down a level
      -- param 3 of select is an accum'ing buffer of next level of xyz-iles
      select fwd ((n,s):l1) l2 = -- NB. n == |s| (usable part),  n >= 1
        let n1 = n `div` 2       -- 1->0, 2->1, 3->1 ... i.e. |small| (proper)
            n2 = n - n1          -- 1->1, 2->1, 3->2 ... i.e. |median:big|
            small = s            -- or at any rate small's 1st n1 elements
            (median:big) = drop n1 s
        in median : (select fwd l1 (add fwd n1 small (n2-1) big l2))
      -- note the breadth-first traversal

  in select True [(length s, s)] []
  -- Guaranteed "OK" behaviour, alternatively might try Hoare's Find, one day.
  -- Neighbouring values in the result are as similar as possible.

-- ------------------------------9/2002--9/2003--L.Allison--CSSE--Monash--.au--

test01 = print "-- test01 --"
 >> print( "prng 17 ...         = " ++ show( take 8 (iterate prng 17) ) )
 >> (let sb1010 = stirlingBase 10 10
     in print("stirlingBase 10 10  = "++show sb1010
           ++ ", 10**_="++show (10**sb1010) ))
 >> print( "normalise [1,1,2,4] = " ++ show( normalise [1,1,2,4] ) )
 >> print( "logPlus       7 7   = " ++ show( logPlus 7 7 ) )
 >> print( "logPlusBase 2 7 7   = " ++ show( logPlusBase 2 7 7 ) )
 >> print( "partition 2 ... ... = "
       ++ show( partition 2 [0,1,3,0,1] ["fst1","snd1","fth","fst2","snd2"] ) )
 >> print( "msort [3,1,6,5,...] = " ++ show( msort [3,1,5,3,3,2,4] ) )
 >> print( "medianEtc ...       = "
       ++ show( medianEtc [1,2,3,4,5,6,7, 8, 9,10,11,12,13,14,15] ) )
 >> print( "splits[(H,(True,..) = "
       ++ show( splits [(H,(True,4.0)),(H,(False,1.0 :: Double)),
                        (T,(True,3.0)),(H,(True, 2.0))]))

-- ----------------------------------------------------------------------------