fix(naivebayes): fix the algorithm to make it actually work

feat(cleanDocuments): preprocess documents, use stemming and stopword elimination for better accuracy
2016-08-05 23:54:36 +04:30
parent 3cf0625794
commit ea1f05f001
10 changed files with 254 additions and 54 deletions
--- a/examples/naivebayes-doc-classifier.hs
+++ b/examples/naivebayes-doc-classifier.hs
@@ -1,12 +1,13 @@
 module Main
  where
-    import Sibe
+    -- import Sibe
    import Sibe.NaiveBayes
    import Text.Printf
    import Data.List
    import Data.Maybe
    import Debug.Trace
    import Data.List.Split
    import Control.Arrow ((&&&))
    main = do
      dataset <- readFile "examples/doc-classifier-data/data-reuters"
@@ -15,18 +16,33 @@ module Main
      classes <- map (filter (/= ' ')) . lines <$> readFile "examples/doc-classifier-data/data-classes"
      let intClasses = [0..length classes - 1]
-          documents = createDocuments classes dataset
+      -- let intClasses = [0, 1]
-          testDocuments = createDocuments classes test
+          documents = cleanDocuments $ createDocuments classes dataset
-          devTestDocuments = take 20 testDocuments
+          -- documents = [Document "Chinese Beijing Chinese" 0,
-          nb = initialize documents
+          --              Document "Chinese Chinese Shanghai" 0,
          --              Document "Chinese Macao" 0,
          --              Document "Japan Tokyo Chinese" 1]
          -- testDocuments = [Document "Chinese Chinese Chinese Japan Tokyo" 0]
          testDocuments = cleanDocuments $ createDocuments classes test
          devTestDocuments = take 30 testDocuments
          -- devTestDocuments = [Document "Chinese Chinese Chinese Tokyo Japan" 0]
          nb = train documents intClasses
-          results = map (\(Document text c) -> (c, determine text nb intClasses documents)) testDocuments
+          results = map (\(Document text c) -> (c, run text nb)) testDocuments
-          -- results = map (\(Document text c) -> (c, determine text nb intClasses documents)) devTestDocuments
+          -- results = map (\(Document text c) -> (c, run text nb)) devTestDocuments
      -- print (text $ head documents)
      let showResults (c, r) = putStrLn (classes !! c ++ " ~ " ++ classes !! r)
      mapM_ showResults results
      putStrLn $ "Recall: " ++ show (recall results)
      putStrLn $ "Precision: " ++ show (precision results)
-      putStrLn $ "F Measure: " ++ show (fmeasure (precision results) (recall results))
+      putStrLn $ "F Measure: " ++ show (fmeasure results)
      putStrLn $ "Accuracy: " ++ show (accuracy results)
    createDocuments classes content =
      let splitted = splitOn (replicate 10 '-' ++ "\n") content
          pairs = map ((head . lines) &&& (unwords . tail . lines)) splitted
          documents = map (\(topic, text) -> Document text (fromJust $ elemIndex topic classes)) pairs
      in documents
--- a/examples/naivebayes-sentiment-analysis.hs
+++ b/examples/naivebayes-sentiment-analysis.hs
@@ -0,0 +1,54 @@
 {-# LANGUAGE BangPatterns #-}
 module Main
  where
    -- import Sibe
    import Sibe.NaiveBayes
    import Text.Printf
    import Data.List
    import Data.Maybe
    import Debug.Trace
    import Data.List.Split
    import System.Directory
    import Control.DeepSeq
    import System.IO
    main = do
      putStr "Reading documents... "
      neg_documents <- createDocuments "examples/sentiment-analysis-data/train/neg/"
      pos_documents <- createDocuments "examples/sentiment-analysis-data/train/pos/"
      test_neg_documents  <- createDocuments "examples/sentiment-analysis-data/test/neg/"
      test_pos_documents <- createDocuments "examples/sentiment-analysis-data/test/pos/"
      putStrLn "done"
      let classes = [0..9] -- rating, from 0 to 9 (1 to 10)
          documents = neg_documents ++ pos_documents
          nb = train documents classes
          testDocuments = neg_documents ++ pos_documents
          results = map (\(Document text c) -> (c, run text nb)) testDocuments
          -- results = map (\(Document text c) -> (c, determine text nb intClasses documents)) devTestDocuments
      print results
      -- let showResults (c, r) = putStrLn (show (classes !! c) ++ " ~ " ++ show (classes !! r))
      -- mapM_ showResults results
      --
      -- putStrLn $ "Recall: " ++ show (recall results)
      -- putStrLn $ "Precision: " ++ show (precision results)
      -- putStrLn $ "F Measure: " ++ show (fmeasure results)
      -- putStrLn $ "Accuracy: " ++ show (accuracy results)
    createDocuments :: FilePath -> IO [Document]
    createDocuments path = do
      files <- drop 2 <$> getDirectoryContents path
      let ratings = map (subtract 1 . read . take 1 . last . splitOn "_") files :: [Int]
      contents <- mapM (forceReadFile . (path ++)) files
      return $ zipWith Document contents ratings
    forceReadFile :: FilePath -> IO String
    forceReadFile file = do
      handle <- openFile file ReadMode
      content <- hGetContents handle
      content `deepseq` hClose handle
      return content
--- a/examples/sentiment-analysis-data
+++ b/examples/sentiment-analysis-data
@@ -0,0 +1 @@
 ../../sibe-repos/sentiment-analysis-data
--- a/examples/xor.hs
+++ b/examples/xor.hs
@@ -7,7 +7,7 @@ module Main where
  main = do
    let learning_rate = 0.5
        (iterations, epochs) = (2, 1000)
-        a = (logistic, logistic')
+        a = (sigmoid, sigmoid')
        rnetwork = randomNetwork 0 2 [(8, a)] (1, a) -- two inputs, 8 nodes in a single hidden layer, 1 output
        inputs = [vector [0, 1], vector [1, 0], vector [1, 1], vector [0, 0]]
--- a/profiling/run
+++ b/profiling/run
@@ -0,0 +1,6 @@
 #!/bin/bash
 PROG==geniconvert
 VIEW==open
 stack build --profile
--- a/profiling/setup
+++ b/profiling/setup
@@ -0,0 +1,6 @@
 #!/bin/bash
 chmod u+x profiling/setup
 chmod u+x profiling/run
 chmod u+x profiling/compare
 chmod u+x profiling/save
--- a/sibe.cabal
+++ b/sibe.cabal
@@ -22,6 +22,10 @@ library
                     , deepseq
                     , containers
                     , split
                     , regex-base
                     , regex-pcre
                     , text
                     , stemmer
  default-language:    Haskell2010
 executable sibe-exe
@@ -53,6 +57,19 @@ executable example-naivebayes-doc-classifier
                     , split
  default-language:    Haskell2010
 executable example-naivebayes-sentiment-analysis
  hs-source-dirs:      examples
  main-is:             naivebayes-sentiment-analysis.hs
  ghc-options:         -threaded -rtsopts -with-rtsopts=-N
  build-depends:       base
                     , sibe
                     , hmatrix
                     , containers
                     , split
                     , directory
                     , deepseq
  default-language:    Haskell2010
 test-suite sibe-test
  type:                exitcode-stdio-1.0
  hs-source-dirs:      test
--- a/src/Sibe.hs
+++ b/src/Sibe.hs
@@ -17,8 +17,10 @@ module Sibe
     train,
     session,
     shuffle,
-     logistic,
+     sigmoid,
-     logistic',
+     sigmoid',
     relu,
     relu',
     crossEntropy,
     genSeed,
     replaceVector
@@ -88,11 +90,17 @@ module Sibe
        randomLayer seed (input, h) a :-
        randomNetwork (seed + 1) h hs output
-      logistic :: Vector Double -> Vector Double
+      sigmoid :: Vector Double -> Vector Double
-      logistic x = 1 / (1 + exp (-x))
+      sigmoid x = 1 / max (1 + exp (-x)) 1e-10
-      logistic' :: Vector Double -> Vector Double
+      sigmoid' :: Vector Double -> Vector Double
-      logistic' x = logistic x * (1 - logistic x)
+      sigmoid' x = sigmoid x * (1 - sigmoid x)
      relu :: Vector Double -> Vector Double
      relu x = log (max (1 + exp x) 1e-10)
      relu' :: Vector Double -> Vector Double
      relu' = sigmoid
      crossEntropy :: Output -> Output -> Double
      crossEntropy output target =
@@ -100,7 +108,7 @@ module Sibe
            n = fromIntegral (length pairs)
        in (-1 / n) * sum (map f pairs)
        where
-          f (a, y) = y * log a + (1 - y) * log (1 - a)
+          f (a, y) = y * log (max 1e-10 a) + (1 - y) * log (max (1 - a) 1e-10)
      train :: Input
            -> Network
@@ -114,8 +122,8 @@ module Sibe
            let y = runLayer input l
                o = fn y
                delta = o - target
-                -- de = delta * fn' y -- quadratic cost
+                de = delta * fn' y
-                de = delta -- cross entropy cost
+                -- de = delta -- cross entropy cost
                biases'  = biases  - scale alpha de
                weights' = weights - scale alpha (input `outer` de) -- small inputs learn slowly
@@ -174,3 +182,6 @@ module Sibe
          rrow index (x:xs)
            | index == index = value:xs
            | otherwise = x : rrow (index + 1) xs
      clip :: Double -> (Double, Double) -> Double
      clip x (l, u) = min u (max l x)
--- a/src/Sibe/NaiveBayes.hs
+++ b/src/Sibe/NaiveBayes.hs
@@ -1,15 +1,17 @@
 module Sibe.NaiveBayes
  (Document(..),
   NB(..),
-   createDocuments,
+   train,
-   initialize,
+   run,
   calculate,
   determine,
   ordNub,
   accuracy,
   precision,
   recall,
   fmeasure,
   mean,
   stdev,
   cleanText,
   cleanDocuments,
  )
  where
    import Data.List
@@ -18,47 +20,126 @@ module Sibe.NaiveBayes
    import Data.List.Split
    import Data.Maybe
    import Control.Arrow ((&&&))
-    type Class = Int
+    import Text.Regex.PCRE
    import Data.Char (isSpace)
    import NLP.Stemmer
    type Class = Int;
    data Document = Document { text :: String
                             , c    :: Class
                             } deriving (Eq, Show, Read)
-    data NB = NB { vocabulary :: Double
+    data NB = NB { documents  :: [Document]
                 , classes    :: [(Class, Double)]
                 , vocabulary :: Int
                 , megadoc    :: String
-                 }
+                 , cd         :: [(Class, [Document])]
                 , cw         :: [(Class, [(String, Int)])]
                 } deriving (Eq, Show, Read)
-    initialize :: [Document] -> NB
+    train :: [Document] -> [Class] -> NB
-    initialize documents =
+    train documents classes =
-      let megadoc = concatMap (\(Document text _) -> text ++ " ") documents
+      let megadoc = concatDocs documents
          vocabulary = genericLength ((ordNub . words) megadoc)
-      in NB vocabulary megadoc
+          -- (class, prior probability)
          cls = zip classes (map classPrior classes)
-    determine :: String -> NB -> [Class] -> [Document] -> Class
+          -- (class, [document])
-    determine text nb classes documents =
+          cd = zip classes (map classDocs classes)
      let scores = zip [0..] (map (\cls -> calculate text nb cls documents) classes)
          m = maximumBy (\(i0, c0) (i1, c1) -> c0 `compare` c1) scores
      in fst m
-    calculate :: String -> NB -> Class -> [Document] -> Double
+          -- (class, [(word, count)])
-    calculate text (NB vocabulary megadoc) cls documents =
+          cw = zip classes $ l (map classWordsCounts classes)
-      let docs = filter (\(Document text c) -> c == cls) documents
+
-          texts = map (\(Document text _) -> text ++ " ") docs
+      in NB { documents  = documents
-          classText = concat texts
+            , classes    = cls
-          classWords = words classText
+            , vocabulary = vocabulary
-          c = genericLength classWords
+            , megadoc    = megadoc
-          pc = genericLength docs / genericLength documents
+            , cd         = cd
-      in pc * product (map (cword classWords c) (words text))
+            , cw         = cw
            }
      where
-        cword classWords c word =
+        concatDocs = concatMap (\(Document text _) -> text ++ " ")
          let wc = genericLength (filter (==word) classWords)
          in (wc + 1) / (c + vocabulary)
-    createDocuments classes content =
+        classDocs x = filter ((==x) . c) documents
-      let splitted = splitOn (replicate 10 '-' ++ "\n") content
+        classMegadoc x = concatMap (\(Document text _) -> text ++ " ") (classDocs x)
-          pairs = map ((head . lines) &&& (concat . tail . lines)) splitted
+        classWords x = words (classMegadoc x)
-          documents = map (\(topic, text) -> Document text (fromJust $ elemIndex topic classes)) pairs
+        classNGram n = ngram n . classMegadoc
-      in documents
+        classVocabulary x = ordNub (classWords x)
        classPrior x = genericLength (classDocs x) / genericLength documents
        countWordInDoc d w = genericLength (filter (==w) d)
        classWordsCounts x =
          let voc = classVocabulary x
          in zip voc $ map (countWordInDoc (classWords x)) voc
    ngram :: Int -> String -> [String]
    ngram n text =
      let ws = words text
      in map (\(i, w) -> unwords $ w:((take (n - 1) . drop (i+1)) ws)) (zip [0..] ws)
    run :: String -> NB -> Class
    run text (NB documents classes vocabulary megadoc cd cw) =
      let scores = map (score . fst) classes
      in argmax scores
      where
        score c =
          let prior = snd (classes !! c)
          in prior * product (map (prob c) (words text))
        prob c w =
          let tctM = find ((==w) . fst) (snd (cw !! c))
              tct  = (snd . fromJust) tctM
              cvoc = (genericLength . snd) (cw !! c)
              voc  = vocabulary
          in
            if isJust tctM then
              realToFrac (tct + 1) / realToFrac (cvoc + voc)
            else
              1 / realToFrac (cvoc + voc)
    argmax :: (Ord a) => [a] -> Int
    argmax x = fst $ maximumBy (\(_, a) (_, b) -> a `compare` b) (zip [0..] x)
    mean :: [Double] -> Double
    mean x = sum x / genericLength x
    stdev :: [Double] -> Double
    stdev x =
      let avg = mean x
          variance = sum (map ((^2) . subtract avg) x) / (genericLength x - 1)
      in sqrt variance
    cleanText :: String -> String
    cleanText string =
      let puncs = filter (`notElem` ['!', '"', '#', '$', '%', '(', ')', '.', '?']) (trim string)
          spacify = foldl (\acc x -> replace x ' ' acc) puncs [',', '/', '-', '\n', '\r']
          stemmed = unwords $ map (stem Porter) (words spacify)
      in stemmed
      where
        trim = f . f
          where
            f = reverse . dropWhile isSpace
        replace needle replacement =
          map (\c -> if c == needle then replacement else c)
    cleanDocuments :: [Document] -> [Document]
    cleanDocuments documents =
      let cleaned = map (\(Document text c) -> Document (cleanText text) c) documents
          wc = wordCounts (concatDocs cleaned)
          wlist = sortBy (\(_, a) (_, b) -> b `compare` a) wc
          stopwords = l $ map fst (take 30 wlist)
          wstopwords = map (\(Document text c) -> Document (removeWords stopwords text) c) cleaned
      in wstopwords
      where
        vocabulary x = ordNub (words x)
        countWordInDoc d w = genericLength (filter (==w) d)
        wordCounts x =
          let voc = vocabulary x
          in zip voc $ map (countWordInDoc (words x)) voc
        removeWords list text =
          unwords $ filter (`notElem` list) (words text)
        concatDocs = concatMap (\(Document text _) -> text ++ " ")
    l :: (Show a) => a -> a
    l a = trace (show a) a
@@ -100,5 +181,8 @@ module Sibe.NaiveBayes
              then 0
              else t / y
-    fmeasure :: Double -> Double -> Double
+    fmeasure :: [(Int, Int)] -> Double
-    fmeasure r p = (2 * p * r) / (p + r)
+    fmeasure results =
      let r = recall results
          p = precision results
      in (2 * p * r) / (p + r)
--- a/stack.yaml
+++ b/stack.yaml
@@ -41,7 +41,10 @@ packages:
    commit: 42a88fbcb6bd1d2c4dc18fae5e962bd34fb316a1
  subdirs:
    - packages/base
- '.'
+- .
 - http://hackage.haskell.org/package/containers-0.5.7.1/containers-0.5.7.1.tar.gz
 - http://hackage.haskell.org/package/text-1.2.2.1/text-1.2.2.1.tar.gz
 - http://hackage.haskell.org/package/stemmer-0.5.2/stemmer-0.5.2.tar.gz
 # Dependency packages to be pulled from upstream that are not in the resolver
 # (e.g., acme-missiles-0.3)
@@ -70,3 +73,5 @@ extra-package-dbs: []
 #
 # Allow a newer minor version of GHC than the snapshot specifies
 # compiler-check: newer-minor
 system-ghc: false