-- See: L. Allison. Types and classes of machine learning and data mining. -- 26th Australasian Computer Science Conference (ACSC) pp.207-215, -- Adelaide, February 2003 -- L. Allison. Models for machine learning and data mining in -- functional programming. doi:10.1017/S0956796804005301 -- J. Functional Programming, 15(1), pp.15-32, Jan. 2005 -- Author: Lloyd ALLISON lloyd at bruce cs monash edu au -- http://www.csse.monash.edu.au/~lloyd/tildeFP/II/200309/ -- This program is free software; you can redistribute it and/or modify it -- under the terms of the GNU General Public License (GPL) as published by -- the Free Software Foundation; either version 2 of the License, or (at -- your option) any later version. This program is distributed in the hope -- that it will be useful, but without any warranty, without even the implied -- warranty of merchantability or fitness for a particular purpose. See the -- GNU General Public License for more details. You should have received a -- copy of the GNU General Public License with this program; if not, write to: -- Free Software Foundation, Inc., Boston, MA 02111, USA. module SM_Utilities (module SM_Utilities) where -- export all latticeK1 = 1/12 -- lattice latticeK2 = 5/(36 * sqrt(3)) -- constants -- see J.H.Conway and N.J.A.Sloane. Sphere Packings, Lattices and Groups. -- Springer 1998 3rd edn. data Throw = H | T deriving (Enum, Bounded, Eq, Ord, Show) -- Mr. Bernoulli instance Splits Throw where splits = splitsBE -- Splits Throw -- ---------------------------------------------------------------------------- assert p x = assert1 p x "assertion failed" assert1 p x why = if p x then x else error why log2 = log 2.0 -- logPlusBase b (-logBase b p1) (-logBase b p2) = -logBase b (p1+p2) logPlusBase b nlPr1 nlPr2 = -- NB. this is "OK" but there are faster ways. let (bigger, smaller) = if nlPr1 >= nlPr2 then (nlPr1, nlPr2) else (nlPr2, nlPr1) diff = bigger - smaller -- is non-negative eps = b ** (-diff) -- between 0 and 1 nits = log b in if diff * nits > 30 then smaller else smaller - (logBase b (1+eps)) -- the cutoff of 30 nits is somewhat(!) arbitrary. -- and base e ... logPlus nlPr1 nlPr2 = -- NB. this is "OK" but there are faster ways. let (bigger, smaller) = if nlPr1 >= nlPr2 then (nlPr1, nlPr2) else (nlPr2, nlPr1) diff = bigger - smaller -- is non-negative eps = exp (-diff) -- between 0 and 1 in if diff > 30 then smaller else smaller - (log (1+eps)) normalise xs = -- scale xs to make sum to 1.0 let (ans, finalTotal) = n xs 0 n [] total = ([], total) n (x:xs) total = let (rest, sum) = n xs (total+x) in ((x/finalTotal):rest, sum) in ans stirling n = -- Stirling's approximation let ans = (n + 0.5)*(log n) - n + (log(2*pi)) / 2 -- ~ logBase e (n!) in max ans 0 stirlingBase b n = (stirling n) / (log b) -- and to base b -- ---------------------------------------------------------------------------- prng seed = (seed * (1 + 4*37*109) + 9999) `mod` (32 * 1024) -- Pseudo Random Number Generator -- ([0 .. 32K-1], only a 32K cycle, really should use a better one!) -- Linear Congruential Pseudo Random Number Generator, -- see: D. E. Knuth, The Art of Computer Programming, Vol. 2. -- ----------------------------------------------L.Allison--CSSE--Monash--.au-- -- The class Splits and associated type and functions are primarily to do -- with partitioning data, as in classification-trees do, although they might -- have more uses, e.g. in clustering/ unsupervised-classification. class Splits t where splits :: [t] -> [Splitter t] -- i.e. can find ways of partitioning t-lists; -- these ways should be proposed in order of decreasing prior-probability. data Splitter t = Splitter Int (t -> Int) (() -> String) -- i.e. arity partition_fn description instance Show (Splitter t) where show (Splitter _ _ description) = description() aritySplitter (Splitter n _ _) = n applySplitter (Splitter _ f _) d = f d -- apply it to one datum -- partition a data-set, ds, into n parts according to s split s ds = partition (aritySplitter s) (map (applySplitter s) ds) ds -- The following is not allowed :-( -- instance (Bounded t, Enum t) => Splits t where splits = splitsBE -- because ... The instance type must be of the form (T a b c) ... -- Splits discrete_type ... instance Splits Bool where splits = splitsBE -- e.g. Splits Bool -- Splits continous type ... instance Splits Float where splits = splitsOrd -- e.g. Splits Float instance Splits Double where splits = splitsOrd -- e.g. Splits Double splitsBE [] = [] -- calculate the Splitter's for a Bounded Enum type splitsBE (d:ds) = let { lwb = minBound `asTypeOf` d; lwbn = fromEnum lwb; upb = maxBound `asTypeOf` d; upbn = fromEnum upb; arity = upbn - lwbn + 1 } in if all ((==) d) ds then [] -- all the same, no variety in data set else [Splitter arity (\u -> (fromEnum (u `asTypeOf` d) - lwbn)) (\()->"="++(show lwb)++(if arity > 2 then ".." else "|")++(show upb))] -- If the type is also Ord(ered, e.g. Bad|Poor|...|VG) and if -- arity is quite big, then it might be better to use splitsOrd lwb..upb . splitsOrd ds = -- calculate Splitter's for an Ord(ered, inc' continuous) type let splitPoints [] = [] -- lines like the next make it all worthwhile splitPoints ds = (medianEtc . tail . unique . msort) ds -- ! in map (\cut -> Splitter 2 (\y -> if y < cut then 0 else 1) (\() -> "<|>=" ++ show cut)) (splitPoints ds) -- Splits tuples ... instance (Splits t1, Splits t2) => Splits (t1, t2) where -- e.g. Splits Pair splits xys = let (xs, ys) = unzip xys in interleave (splitsAttr 0 fst xs) (splitsAttr 1 snd ys) -- Note, we get hierarchical dataspaces done for free. instance (Splits t1, Splits t2, Splits t3) => Splits (t1,t2,t3) where -- Triple splits xyzs = let (xs, ys, zs) = unzip3 xyzs sx = splitsAttr 0 (\(x,_,_) -> x) xs sy = splitsAttr 1 (\(_,y,_) -> y) ys sz = splitsAttr 2 (\(_,_,z) -> z) zs in interleave3 sx sy sz instance (Splits t1, Splits t2, Splits t3, Splits t4) => Splits (t1,t2,t3,t4) where -- Quad splits wxyzs = let (ws, xs, ys, zs) = unzip4 wxyzs sw = splitsAttr 0 (\(w,_,_,_) -> w) ws sx = splitsAttr 1 (\(_,x,_,_) -> x) xs sy = splitsAttr 2 (\(_,_,y,_) -> y) ys sz = splitsAttr 3 (\(_,_,_,z) -> z) zs in interleave4 sw sx sy sz splitsAttr n sel xs = -- selected attribute -- get splits for nth attribute of multivariate dataset -- NB. sel must select the nth attribute ! let prefix = "@" ++ (show n) in map (\(Splitter n f d)->Splitter n (f.sel) (\()->prefix++d())) (splits xs) -- ------------------------------------- make tuples instances of class Enum -- -- NB. tuples of Bounded types are already in Bounded instance (Enum t1,Bounded t1, Enum t2,Bounded t2) => Enum (t1,t2) where -- Pair fromEnum (v1,v2) = -- NB. (minBound, minBound') -> 0 let min1 = fromEnum (minBound `asTypeOf` v1) min2 = fromEnum (minBound `asTypeOf` v2) max2 = fromEnum (maxBound `asTypeOf` v2) width = max2 - min2 + 1 in (fromEnum v1 - min1)*width + (fromEnum v2 - min2) toEnum = -- had a big battle to remove 'ambiguous type' errors in this let (mnB1, mnB2) = minBound `asTypeOf` (te 0) -- ! (mxB1, mxB2) = maxBound `asTypeOf` (mnB1, mnB2) (min1, min2) = (fromEnum mnB1, fromEnum mnB2) (max1, max2) = (fromEnum mxB1, fromEnum mxB2) width = max2 - min2 + 1 te n = (toEnum(min1 + n `div` width), toEnum(min2 + n `mod` width)) in te instance (Enum t1,Bounded t1, Enum t2,Bounded t2, Enum t3,Bounded t3) => Enum (t1,t2,t3) where -- triple fromEnum (v1,v2,v3) = fromEnum (v1,(v2,v3)) toEnum n = let (v1,(v2,v3)) = toEnum n in (v1,v2,v3) instance (Enum t1,Bounded t1, Enum t2,Bounded t2, -- quad Enum t3,Bounded t3, Enum t4,Bounded t4) => Enum (t1,t2,t3,t4) where fromEnum (v1,v2,v3,v4) = fromEnum (v1,(v2,v3,v4)) toEnum n = let (v1,(v2,v3,v4)) = toEnum n in (v1,v2,v3,v4) instance (Enum t1,Bounded t1, Enum t2,Bounded t2, Enum t3,Bounded t3, -- five Enum t4,Bounded t4, Enum t5,Bounded t5) => Enum (t1,t2,t3,t4,t5) where fromEnum (v1,v2,v3,v4,v5) = fromEnum (v1,(v2,v3,v4,v5)) toEnum n = let (v1,(v2,v3,v4,v5)) = toEnum n in (v1,v2,v3,v4,v5) -- ------------------------------9/2002--9/2003--L.Allison--CSSE--Monash--.au-- -- some ordinary but useful List functions... unzip4 = foldr (\(w,x,y,z) ~(ws,xs,ys,zs) -> (w:ws, x:xs, y:ys, z:zs)) ([], [], [], []) -- standard prelude has up to unzip3 -- given a list of heads & a list of tails (lists), put each head on its tail. prepend heads tails = zipWith (:) heads tails addToList n x lsts = -- add x to the nth List let add n [] = add n [[]] -- prem' end of Lists, so lengthen! add 0 (lst:lsts) = (x:lst) : lsts -- add x to this lst add n (lst:lsts) = lst:(add (n-1) lsts) in add n lsts partition n indxs elts = -- partition elts into n parts according to indxs let p [] [] ans = ans p (i:is) (x:xs) ans = p is xs (addToList i x ans) -- add x to i-th part in p indxs elts (replicate n []) interleave [] ys = ys interleave xs [] = xs -- interleave the elements of two lists interleave (x:xs) (y:ys) = x : y : (interleave xs ys) interleave3 [] ys zs = interleave ys zs interleave3 xs [] zs = interleave xs zs interleave3 xs ys [] = interleave xs ys interleave3 (x:xs) (y:ys) (z:zs) = x : y : z : (interleave3 xs ys zs) interleave4 ws xs ys zs = interleave (interleave ws ys) (interleave xs zs) unique [] = [] -- POST: no duplicates in output unique (x:xs) = -- PRE: any duplicates in input are adjacent let u _ [] = [] u z (x:xs) = if z == x then u z xs else x : (u x xs) in x : (u x xs) merge [] bs = bs merge as [] = as -- merge two sorted(!) lists merge (aas@(a:as)) (bbs@(b:bs)) = if a <= b then a : (merge as bbs) else b : (merge aas bs) msort [] = [] msort [x] = [x] -- merge sort msort inList = -- an (unstable) merge-sort, cos it is simple! let (as, bs) = splt inList [] [] splt [] as bs = (as, bs) splt (x:xs) as bs = splt xs bs (x:as) in merge (msort as) (msort bs) -- NB. This code for medianEtc is an improvement on the 2002 test code both -- in complexity and also in the order in which its results are produced. medianEtc [] = [] -- returns [median, quartiles, octiles, ...] medianEtc s = -- PRE: s is sorted let add _ 0 _ 0 _ ss = ss -- n1 = n2 = 0 add _ n1 s1 0 _ ss = ((n1,s1):ss) -- n1 > 0, n2 = 0 add _ 0 _ n2 s2 ss = ((n2,s2):ss) -- n1 = 0, n2 > 0 add True n1 s1 n2 s2 ss = (n1,s1) : (n2,s2) : ss -- n1 > 0, n2 > 0 add False n1 s1 n2 s2 ss = (n2,s2) : (n1,s1) : ss -- switch _2, _1 select fwd [] [] = [] select fwd [] level = select (not fwd) level [] -- down a level -- param 3 of select is an accum'ing buffer of next level of xyz-iles select fwd ((n,s):l1) l2 = -- NB. n == |s| (usable part), n >= 1 let n1 = n `div` 2 -- 1->0, 2->1, 3->1 ... i.e. |small| (proper) n2 = n - n1 -- 1->1, 2->1, 3->2 ... i.e. |median:big| small = s -- or at any rate small's 1st n1 elements (median:big) = drop n1 s in median : (select fwd l1 (add fwd n1 small (n2-1) big l2)) -- note the breadth-first traversal in select True [(length s, s)] [] -- Guaranteed "OK" behaviour, alternatively might try Hoare's Find, one day. -- Neighbouring values in the result are as similar as possible. -- ------------------------------9/2002--9/2003--L.Allison--CSSE--Monash--.au-- test01 = print "-- test01 --" >> print( "prng 17 ... = " ++ show( take 8 (iterate prng 17) ) ) >> (let sb1010 = stirlingBase 10 10 in print("stirlingBase 10 10 = "++show sb1010 ++ ", 10**_="++show (10**sb1010) )) >> print( "normalise [1,1,2,4] = " ++ show( normalise [1,1,2,4] ) ) >> print( "logPlus 7 7 = " ++ show( logPlus 7 7 ) ) >> print( "logPlusBase 2 7 7 = " ++ show( logPlusBase 2 7 7 ) ) >> print( "partition 2 ... ... = " ++ show( partition 2 [0,1,3,0,1] ["fst1","snd1","fth","fst2","snd2"] ) ) >> print( "msort [3,1,6,5,...] = " ++ show( msort [3,1,5,3,3,2,4] ) ) >> print( "medianEtc ... = " ++ show( medianEtc [1,2,3,4,5,6,7, 8, 9,10,11,12,13,14,15] ) ) >> print( "splits[(H,(True,..) = " ++ show( splits [(H,(True,4.0)),(H,(False,1.0 :: Double)), (T,(True,3.0)),(H,(True, 2.0))])) -- ----------------------------------------------------------------------------