module Data.Clustering.Hierarchical
    (-- * Dendrogram data type
     Dendrogram(..)
    ,Distance
    ,elements
    ,cutAt
     -- * Linkage data type
    ,Linkage(..)
     -- * Clustering function
    ,dendrogram
    ) where

import Data.Clustering.Hierarchical.Internal.Types (Dendrogram(..), Linkage(..), Distance)
import qualified Data.Clustering.Hierarchical.Internal.DistanceMatrix as DM
import qualified Data.Clustering.Hierarchical.Internal.Optimal as O

-- | List of elements in a dendrogram.
elements :: Dendrogram a -> [a]
elements :: forall a. Dendrogram a -> [a]
elements = [a] -> Dendrogram a -> [a]
forall {a}. [a] -> Dendrogram a -> [a]
go []
    where
      go :: [a] -> Dendrogram a -> [a]
go [a]
acc (Leaf a
x)       = a
x a -> [a] -> [a]
forall a. a -> [a] -> [a]
: [a]
acc
      go [a]
acc (Branch Distance
_ Dendrogram a
l Dendrogram a
r) = [a] -> Dendrogram a -> [a]
go ([a] -> Dendrogram a -> [a]
go [a]
acc Dendrogram a
r) Dendrogram a
l

-- | @dendro \`cutAt\` threshold@ cuts the dendrogram @dendro@ at
-- all branches which have distances strictly greater than
-- @threshold@.
--
-- For example, suppose we have
--
-- @
-- dendro = Branch 0.8
--            (Branch 0.5
--              (Branch 0.2
--                (Leaf \'A\')
--                (Leaf \'B\'))
--              (Leaf \'C\'))
--            (Leaf \'D\')
-- @
--
-- Then:
--
-- @
-- dendro \`cutAt\` 0.9 == dendro \`cutAt\` 0.8 == [dendro] -- no changes
-- dendro \`cutAt\` 0.7 == dendro \`cutAt\` 0.5 == [Branch 0.5 (Branch 0.2 (Leaf \'A\') (Leaf \'B\')) (Leaf \'C\'), Leaf \'D\']
-- dendro \`cutAt\` 0.4 == dendro \`cutAt\` 0.2 == [Branch 0.2 (Leaf \'A\') (Leaf \'B\'), Leaf \'C\', Leaf \'D\']
-- dendro \`cutAt\` 0.1 == [Leaf \'A\', Leaf \'B\', Leaf \'C\', Leaf \'D\'] -- no branches at all
-- @
cutAt :: Dendrogram a -> Distance -> [Dendrogram a]
cutAt :: forall a. Dendrogram a -> Distance -> [Dendrogram a]
cutAt Dendrogram a
dendro Distance
threshold = [Dendrogram a] -> Dendrogram a -> [Dendrogram a]
forall {a}. [Dendrogram a] -> Dendrogram a -> [Dendrogram a]
go [] Dendrogram a
dendro
    where
      go :: [Dendrogram a] -> Dendrogram a -> [Dendrogram a]
go [Dendrogram a]
acc x :: Dendrogram a
x@(Leaf a
_)                        = Dendrogram a
x Dendrogram a -> [Dendrogram a] -> [Dendrogram a]
forall a. a -> [a] -> [a]
: [Dendrogram a]
acc
      go [Dendrogram a]
acc x :: Dendrogram a
x@(Branch Distance
d Dendrogram a
l Dendrogram a
r) | Distance
d Distance -> Distance -> Bool
forall a. Ord a => a -> a -> Bool
<= Distance
threshold = Dendrogram a
x Dendrogram a -> [Dendrogram a] -> [Dendrogram a]
forall a. a -> [a] -> [a]
: [Dendrogram a]
acc
                              | Bool
otherwise      = [Dendrogram a] -> Dendrogram a -> [Dendrogram a]
go ([Dendrogram a] -> Dendrogram a -> [Dendrogram a]
go [Dendrogram a]
acc Dendrogram a
r) Dendrogram a
l  -- cut!


-- | Calculates a complete, rooted dendrogram for a list of items
-- and a linkage type.  The following are the time and space
-- complexities for each linkage:
--
-- ['SingleLinkage'] /O(n^2)/ time and /O(n)/ space, using the
--   SLINK algorithm.  This algorithm is optimal in both space
--   and time and gives the same answer as the naive algorithm
--   using a distance matrix.
--
-- ['CompleteLinkage'] /O(n^3)/ time and /O(n^2)/ space, using
--   the naive algorithm with a distance matrix.  Use 'CLINK' if
--   you need more performance.
--
-- [Complete linkage with 'CLINK'] /O(n^2)/ time and /O(n)/
--   space, using the CLINK algorithm.  Note that this algorithm
--   doesn't always give the same answer as the naive algorithm
--   using a distance matrix, but it's much faster.
--
-- ['UPGMA'] /O(n^3)/ time and /O(n^2)/ space, using the naive
--   algorithm with a distance matrix.
--
-- ['FakeAverageLinkage'] /O(n^3)/ time and /O(n^2)/ space, using
-- the naive algorithm with a distance matrix.
dendrogram :: Linkage              -- ^ Linkage type to be used.
           -> [a]                  -- ^ Items to be clustered.
           -> (a -> a -> Distance) -- ^ Distance function between items.
           -> Dendrogram a         -- ^ Complete dendrogram.
dendrogram :: forall a. Linkage -> [a] -> (a -> a -> Distance) -> Dendrogram a
dendrogram Linkage
SingleLinkage      = [a] -> (a -> a -> Distance) -> Dendrogram a
forall a. [a] -> (a -> a -> Distance) -> Dendrogram a
O.singleLinkage
dendrogram Linkage
CompleteLinkage    = [a] -> (a -> a -> Distance) -> Dendrogram a
forall a. [a] -> (a -> a -> Distance) -> Dendrogram a
DM.completeLinkage
dendrogram Linkage
CLINK              = [a] -> (a -> a -> Distance) -> Dendrogram a
forall a. [a] -> (a -> a -> Distance) -> Dendrogram a
O.completeLinkage
dendrogram Linkage
UPGMA              = [a] -> (a -> a -> Distance) -> Dendrogram a
forall a. [a] -> (a -> a -> Distance) -> Dendrogram a
DM.upgma
dendrogram Linkage
FakeAverageLinkage = [a] -> (a -> a -> Distance) -> Dendrogram a
forall a. [a] -> (a -> a -> Distance) -> Dendrogram a
DM.fakeAverageLinkage