feat(naivebayes): implement NaiveBayes algorithm

feat(example): a document classifier using NaiveBayes over reuters data
This commit is contained in:
Mahdi Dibaiee 2016-07-29 16:16:44 +04:30
parent 493a20eb0a
commit 26eb4531fa
4 changed files with 125 additions and 19 deletions

View File

@ -0,0 +1,29 @@
module Main
where
import Sibe
import Sibe.NaiveBayes
import Text.Printf
import Data.List
import Data.Maybe
import Debug.Trace
main = do
dataset <- readFile "examples/naivebayes-doc-classifier/data-reuters"
test <- readFile "examples/naivebayes-doc-classifier/data-reuters-test"
classes <- map (filter (/= ' ')) . lines <$> readFile "examples/naivebayes-doc-classifier/data-classes"
let intClasses = [0..length classes - 1]
documents = createDocuments classes dataset
testDocuments = createDocuments classes test
nb = initialize documents
let testResults (Document text c) =
let r = determine text nb intClasses documents
in trace (classes !! c ++ " ~ " ++ classes !! r) c == r
let results = map testResults testDocuments
putStr "Accuracy: "
putStr . show . round $ (genericLength (filter (==True) results) / genericLength results) * 100
putStrLn "%"

View File

@ -15,11 +15,13 @@ cabal-version: >=1.10
library
hs-source-dirs: src
exposed-modules: Sibe
exposed-modules: Sibe, Sibe.NaiveBayes
build-depends: base >= 4.7 && < 5
, hmatrix
, random
, deepseq
, containers
, split
default-language: Haskell2010
executable sibe-exe
@ -40,6 +42,16 @@ executable example-xor
, hmatrix
default-language: Haskell2010
executable example-naivebayes-doc-classifier
hs-source-dirs: examples
main-is: naivebayes-doc-classifier.hs
ghc-options: -threaded -rtsopts -with-rtsopts=-N
build-depends: base
, sibe
, hmatrix
, containers
default-language: Haskell2010
test-suite sibe-test
type: exitcode-stdio-1.0
hs-source-dirs: test

View File

@ -5,7 +5,7 @@
module Sibe
(Network(..),
Layer,
Layer(..),
Input,
Output,
Activation,
@ -35,13 +35,13 @@ module Sibe
type Output = Vector Double
type Activation = (Vector Double -> Vector Double, Vector Double -> Vector Double)
data Layer = L { biases :: !(Vector Double)
data Layer = Layer { biases :: !(Vector Double)
, nodes :: !(Matrix Double)
, activation :: Activation
}
instance Show Layer where
show (L biases nodes _) = "(" ++ show biases ++ "," ++ show nodes ++ ")"
show (Layer biases nodes _) = "(" ++ show biases ++ "," ++ show nodes ++ ")"
data Network = O Layer
| Layer :- Network
@ -52,8 +52,8 @@ module Sibe
saveNetwork network file =
writeFile file ((show . reverse) (gen network []))
where
gen (O (L biases nodes _)) list = (biases, nodes) : list
gen (L biases nodes _ :- n) list = gen n $ (biases, nodes) : list
gen (O (Layer biases nodes _)) list = (biases, nodes) : list
gen (Layer biases nodes _ :- n) list = gen n $ (biases, nodes) : list
loadNetwork :: [Activation] -> String -> IO Network
loadNetwork activations file = do
@ -65,21 +65,21 @@ module Sibe
return network
where
gen [(biases, nodes)] [a] = O (L biases nodes a)
gen ((biases, nodes):hs) (a:as) = L biases nodes a :- gen hs as
gen [(biases, nodes)] [a] = O (Layer biases nodes a)
gen ((biases, nodes):hs) (a:as) = Layer biases nodes a :- gen hs as
runLayer :: Input -> Layer -> Output
runLayer input (L !biases !weights _) = input <# weights + biases
runLayer input (Layer !biases !weights _) = input <# weights + biases
forward :: Input -> Network -> Output
forward input (O l@(L _ _ (fn, _))) = fn $ runLayer input l
forward input (l@(L _ _ (fn, _)) :- n) = forward ((fst . activation $ l) $ runLayer input l) n
forward input (O l@(Layer _ _ (fn, _))) = fn $ runLayer input l
forward input (l@(Layer _ _ (fn, _)) :- n) = forward ((fst . activation $ l) $ runLayer input l) n
randomLayer :: Seed -> (Int, Int) -> Activation -> Layer
randomLayer seed (wr, wc) =
let weights = uniformSample seed wr $ replicate wc (-1, 1)
biases = randomVector seed Uniform wc * 2 - 1
in L biases weights
in Layer biases weights
randomNetwork :: Seed -> Int -> [(Int, Activation)] -> (Int, Activation) -> Network
randomNetwork seed input [] (output, a) =
@ -110,7 +110,7 @@ module Sibe
train input network target alpha = fst $ run input network
where
run :: Input -> Network -> (Network, Vector Double)
run input (O l@(L biases weights (fn, fn'))) =
run input (O l@(Layer biases weights (fn, fn'))) =
let y = runLayer input l
o = fn y
delta = o - target
@ -119,13 +119,13 @@ module Sibe
biases' = biases - scale alpha de
weights' = weights - scale alpha (input `outer` de) -- small inputs learn slowly
layer = L biases' weights' (fn, fn') -- updated layer
layer = Layer biases' weights' (fn, fn') -- updated layer
pass = weights #> de
-- pass = weights #> de
in (O layer, pass)
run input (l@(L biases weights (fn, fn')) :- n) =
run input (l@(Layer biases weights (fn, fn')) :- n) =
let y = runLayer input l
o = fn y
(n', delta) = run o n
@ -134,7 +134,7 @@ module Sibe
biases' = biases - scale alpha de
weights' = weights - scale alpha (input `outer` de)
layer = L biases' weights' (fn, fn')
layer = Layer biases' weights' (fn, fn')
pass = weights #> de
-- pass = weights #> de

65
src/Sibe/NaiveBayes.hs Normal file
View File

@ -0,0 +1,65 @@
module Sibe.NaiveBayes
(Document(..),
NB(..),
createDocuments,
initialize,
calculate,
determine
)
where
import Data.List
import Debug.Trace
import qualified Data.Set as Set
import Data.List.Split
import Data.Maybe
type Class = Int
data Document = Document { text :: String
, c :: Class
} deriving (Eq, Show, Read)
data NB = NB { vocabulary :: Double
, megadoc :: String
}
initialize :: [Document] -> NB
initialize documents =
let megadoc = concatMap (\(Document text _) -> text ++ " ") documents
vocabulary = genericLength ((ordNub . words) megadoc)
in NB vocabulary megadoc
determine :: String -> NB -> [Class] -> [Document] -> Class
determine text nb classes documents =
let scores = zip [0..] (map (\cls -> calculate text nb cls documents) classes)
m = maximumBy (\(i0, c0) (i1, c1) -> c0 `compare` c1) scores
in fst m
calculate :: String -> NB -> Class -> [Document] -> Double
calculate text (NB vocabulary megadoc) cls documents =
let docs = filter (\(Document text c) -> c == cls) documents
texts = map (\(Document text _) -> text ++ " ") docs
classText = concat texts
classWords = words classText
c = genericLength classWords
pc = genericLength docs / genericLength documents
in pc * product (map (cword classWords c) (words text))
where
cword classWords c word =
let wc = genericLength (filter (==word) classWords)
in (wc + 1) / (c + vocabulary)
createDocuments classes content =
let splitted = splitOn (replicate 10 '-' ++ "\n") content
pairs = map (\a -> ((head . lines) a, (concat . tail . lines) a)) splitted
documents = map (\(topic, text) -> Document text (fromJust $ elemIndex topic classes)) pairs
in documents
l :: (Show a) => a -> a
l a = trace (show a) a
ordNub :: (Ord a) => [a] -> [a]
ordNub = go Set.empty
where
go _ [] = []
go s (x:xs) = if x `Set.member` s then go s xs
else x : go (Set.insert x s) xs