From ea1f05f00181d0e6265bc6e5e25075b38b3b1ba3 Mon Sep 17 00:00:00 2001
From: Mahdi Dibaiee <mdibaiee@aol.com>
Date: Fri, 5 Aug 2016 23:54:36 +0430
Subject: [PATCH] fix(naivebayes): fix the algorithm to make it actually work
 feat(cleanDocuments): preprocess documents, use stemming and stopword
 elimination for better accuracy

---
 examples/naivebayes-doc-classifier.hs     |  32 +++--
 examples/naivebayes-sentiment-analysis.hs |  54 ++++++++
 examples/sentiment-analysis-data          |   1 +
 examples/xor.hs                           |   2 +-
 profiling/run                             |   6 +
 profiling/setup                           |   6 +
 sibe.cabal                                |  17 +++
 src/Sibe.hs                               |  29 ++--
 src/Sibe/NaiveBayes.hs                    | 154 +++++++++++++++++-----
 stack.yaml                                |   7 +-
 10 files changed, 254 insertions(+), 54 deletions(-)
 create mode 100644 examples/naivebayes-sentiment-analysis.hs
 create mode 120000 examples/sentiment-analysis-data
 create mode 100644 profiling/run
 create mode 100644 profiling/setup

diff --git a/examples/naivebayes-doc-classifier.hs b/examples/naivebayes-doc-classifier.hs
index 82becbf..f842e8c 100644
--- a/examples/naivebayes-doc-classifier.hs
+++ b/examples/naivebayes-doc-classifier.hs
@@ -1,12 +1,13 @@
 module Main
   where
-    import Sibe
+    -- import Sibe
     import Sibe.NaiveBayes
     import Text.Printf
     import Data.List
     import Data.Maybe
     import Debug.Trace
     import Data.List.Split
+    import Control.Arrow ((&&&))
 
     main = do
       dataset <- readFile "examples/doc-classifier-data/data-reuters"
@@ -15,18 +16,33 @@ module Main
       classes <- map (filter (/= ' ')) . lines <$> readFile "examples/doc-classifier-data/data-classes"
 
       let intClasses = [0..length classes - 1]
-          documents = createDocuments classes dataset
-          testDocuments = createDocuments classes test
-          devTestDocuments = take 20 testDocuments
-          nb = initialize documents
+      -- let intClasses = [0, 1]
+          documents = cleanDocuments $ createDocuments classes dataset
+          -- documents = [Document "Chinese Beijing Chinese" 0,
+          --              Document "Chinese Chinese Shanghai" 0,
+          --              Document "Chinese Macao" 0,
+          --              Document "Japan Tokyo Chinese" 1]
+          -- testDocuments = [Document "Chinese Chinese Chinese Japan Tokyo" 0]
+          testDocuments = cleanDocuments $ createDocuments classes test
+          devTestDocuments = take 30 testDocuments
+          -- devTestDocuments = [Document "Chinese Chinese Chinese Tokyo Japan" 0]
+          nb = train documents intClasses
 
-          results = map (\(Document text c) -> (c, determine text nb intClasses documents)) testDocuments
-          -- results = map (\(Document text c) -> (c, determine text nb intClasses documents)) devTestDocuments
+          results = map (\(Document text c) -> (c, run text nb)) testDocuments
+          -- results = map (\(Document text c) -> (c, run text nb)) devTestDocuments
+
+      -- print (text $ head documents)
 
       let showResults (c, r) = putStrLn (classes !! c ++ " ~ " ++ classes !! r)
       mapM_ showResults results
 
       putStrLn $ "Recall: " ++ show (recall results)
       putStrLn $ "Precision: " ++ show (precision results)
-      putStrLn $ "F Measure: " ++ show (fmeasure (precision results) (recall results))
+      putStrLn $ "F Measure: " ++ show (fmeasure results)
       putStrLn $ "Accuracy: " ++ show (accuracy results)
+
+    createDocuments classes content =
+      let splitted = splitOn (replicate 10 '-' ++ "\n") content
+          pairs = map ((head . lines) &&& (unwords . tail . lines)) splitted
+          documents = map (\(topic, text) -> Document text (fromJust $ elemIndex topic classes)) pairs
+      in documents
diff --git a/examples/naivebayes-sentiment-analysis.hs b/examples/naivebayes-sentiment-analysis.hs
new file mode 100644
index 0000000..b97f83d
--- /dev/null
+++ b/examples/naivebayes-sentiment-analysis.hs
@@ -0,0 +1,54 @@
+{-# LANGUAGE BangPatterns #-}
+module Main
+  where
+    -- import Sibe
+    import Sibe.NaiveBayes
+    import Text.Printf
+    import Data.List
+    import Data.Maybe
+    import Debug.Trace
+    import Data.List.Split
+    import System.Directory
+    import Control.DeepSeq
+    import System.IO
+
+    main = do
+      putStr "Reading documents... "
+      neg_documents <- createDocuments "examples/sentiment-analysis-data/train/neg/"
+      pos_documents <- createDocuments "examples/sentiment-analysis-data/train/pos/"
+
+      test_neg_documents  <- createDocuments "examples/sentiment-analysis-data/test/neg/"
+      test_pos_documents <- createDocuments "examples/sentiment-analysis-data/test/pos/"
+      putStrLn "done"
+
+      let classes = [0..9] -- rating, from 0 to 9 (1 to 10)
+          documents = neg_documents ++ pos_documents
+          nb = train documents classes
+
+          testDocuments = neg_documents ++ pos_documents
+
+          results = map (\(Document text c) -> (c, run text nb)) testDocuments
+          -- results = map (\(Document text c) -> (c, determine text nb intClasses documents)) devTestDocuments
+      print results
+
+      -- let showResults (c, r) = putStrLn (show (classes !! c) ++ " ~ " ++ show (classes !! r))
+      -- mapM_ showResults results
+      --
+      -- putStrLn $ "Recall: " ++ show (recall results)
+      -- putStrLn $ "Precision: " ++ show (precision results)
+      -- putStrLn $ "F Measure: " ++ show (fmeasure results)
+      -- putStrLn $ "Accuracy: " ++ show (accuracy results)
+
+    createDocuments :: FilePath -> IO [Document]
+    createDocuments path = do
+      files <- drop 2 <$> getDirectoryContents path
+      let ratings = map (subtract 1 . read . take 1 . last . splitOn "_") files :: [Int]
+      contents <- mapM (forceReadFile . (path ++)) files
+      return $ zipWith Document contents ratings
+
+    forceReadFile :: FilePath -> IO String
+    forceReadFile file = do
+      handle <- openFile file ReadMode
+      content <- hGetContents handle
+      content `deepseq` hClose handle
+      return content
diff --git a/examples/sentiment-analysis-data b/examples/sentiment-analysis-data
new file mode 120000
index 0000000..dbd75b3
--- /dev/null
+++ b/examples/sentiment-analysis-data
@@ -0,0 +1 @@
+../../sibe-repos/sentiment-analysis-data
\ No newline at end of file
diff --git a/examples/xor.hs b/examples/xor.hs
index f58acba..5ed8d27 100644
--- a/examples/xor.hs
+++ b/examples/xor.hs
@@ -7,7 +7,7 @@ module Main where
   main = do
     let learning_rate = 0.5
         (iterations, epochs) = (2, 1000)
-        a = (logistic, logistic')
+        a = (sigmoid, sigmoid')
         rnetwork = randomNetwork 0 2 [(8, a)] (1, a) -- two inputs, 8 nodes in a single hidden layer, 1 output
 
         inputs = [vector [0, 1], vector [1, 0], vector [1, 1], vector [0, 0]]
diff --git a/profiling/run b/profiling/run
new file mode 100644
index 0000000..b6c2ad8
--- /dev/null
+++ b/profiling/run
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+PROG==geniconvert
+VIEW==open
+
+stack build --profile
diff --git a/profiling/setup b/profiling/setup
new file mode 100644
index 0000000..feb97cd
--- /dev/null
+++ b/profiling/setup
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+chmod u+x profiling/setup
+chmod u+x profiling/run
+chmod u+x profiling/compare
+chmod u+x profiling/save
diff --git a/sibe.cabal b/sibe.cabal
index 4a74e99..d401501 100644
--- a/sibe.cabal
+++ b/sibe.cabal
@@ -22,6 +22,10 @@ library
                      , deepseq
                      , containers
                      , split
+                     , regex-base
+                     , regex-pcre
+                     , text
+                     , stemmer
   default-language:    Haskell2010
 
 executable sibe-exe
@@ -53,6 +57,19 @@ executable example-naivebayes-doc-classifier
                      , split
   default-language:    Haskell2010
 
+executable example-naivebayes-sentiment-analysis
+  hs-source-dirs:      examples
+  main-is:             naivebayes-sentiment-analysis.hs
+  ghc-options:         -threaded -rtsopts -with-rtsopts=-N
+  build-depends:       base
+                     , sibe
+                     , hmatrix
+                     , containers
+                     , split
+                     , directory
+                     , deepseq
+  default-language:    Haskell2010
+
 test-suite sibe-test
   type:                exitcode-stdio-1.0
   hs-source-dirs:      test
diff --git a/src/Sibe.hs b/src/Sibe.hs
index 5f78556..79d4f08 100644
--- a/src/Sibe.hs
+++ b/src/Sibe.hs
@@ -17,8 +17,10 @@ module Sibe
      train,
      session,
      shuffle,
-     logistic,
-     logistic',
+     sigmoid,
+     sigmoid',
+     relu,
+     relu',
      crossEntropy,
      genSeed,
      replaceVector
@@ -88,11 +90,17 @@ module Sibe
         randomLayer seed (input, h) a :-
         randomNetwork (seed + 1) h hs output
 
-      logistic :: Vector Double -> Vector Double
-      logistic x = 1 / (1 + exp (-x))
+      sigmoid :: Vector Double -> Vector Double
+      sigmoid x = 1 / max (1 + exp (-x)) 1e-10
 
-      logistic' :: Vector Double -> Vector Double
-      logistic' x = logistic x * (1 - logistic x)
+      sigmoid' :: Vector Double -> Vector Double
+      sigmoid' x = sigmoid x * (1 - sigmoid x)
+
+      relu :: Vector Double -> Vector Double
+      relu x = log (max (1 + exp x) 1e-10)
+
+      relu' :: Vector Double -> Vector Double
+      relu' = sigmoid
 
       crossEntropy :: Output -> Output -> Double
       crossEntropy output target =
@@ -100,7 +108,7 @@ module Sibe
             n = fromIntegral (length pairs)
         in (-1 / n) * sum (map f pairs)
         where
-          f (a, y) = y * log a + (1 - y) * log (1 - a)
+          f (a, y) = y * log (max 1e-10 a) + (1 - y) * log (max (1 - a) 1e-10)
 
       train :: Input
             -> Network
@@ -114,8 +122,8 @@ module Sibe
             let y = runLayer input l
                 o = fn y
                 delta = o - target
-                -- de = delta * fn' y -- quadratic cost
-                de = delta -- cross entropy cost
+                de = delta * fn' y
+                -- de = delta -- cross entropy cost
 
                 biases'  = biases  - scale alpha de
                 weights' = weights - scale alpha (input `outer` de) -- small inputs learn slowly
@@ -174,3 +182,6 @@ module Sibe
           rrow index (x:xs)
             | index == index = value:xs
             | otherwise = x : rrow (index + 1) xs
+
+      clip :: Double -> (Double, Double) -> Double
+      clip x (l, u) = min u (max l x)
diff --git a/src/Sibe/NaiveBayes.hs b/src/Sibe/NaiveBayes.hs
index 7a7cd05..1521a61 100644
--- a/src/Sibe/NaiveBayes.hs
+++ b/src/Sibe/NaiveBayes.hs
@@ -1,15 +1,17 @@
 module Sibe.NaiveBayes
   (Document(..),
    NB(..),
-   createDocuments,
-   initialize,
-   calculate,
-   determine,
+   train,
+   run,
    ordNub,
    accuracy,
    precision,
    recall,
    fmeasure,
+   mean,
+   stdev,
+   cleanText,
+   cleanDocuments,
   )
   where
     import Data.List
@@ -18,47 +20,126 @@ module Sibe.NaiveBayes
     import Data.List.Split
     import Data.Maybe
     import Control.Arrow ((&&&))
-    type Class = Int
+    import Text.Regex.PCRE
+    import Data.Char (isSpace)
+    import NLP.Stemmer
+
+    type Class = Int;
 
     data Document = Document { text :: String
                              , c    :: Class
                              } deriving (Eq, Show, Read)
 
-    data NB = NB { vocabulary :: Double
+    data NB = NB { documents  :: [Document]
+                 , classes    :: [(Class, Double)]
+                 , vocabulary :: Int
                  , megadoc    :: String
-                 }
+                 , cd         :: [(Class, [Document])]
+                 , cw         :: [(Class, [(String, Int)])]
+                 } deriving (Eq, Show, Read)
 
-    initialize :: [Document] -> NB
-    initialize documents =
-      let megadoc = concatMap (\(Document text _) -> text ++ " ") documents
+    train :: [Document] -> [Class] -> NB
+    train documents classes =
+      let megadoc = concatDocs documents
           vocabulary = genericLength ((ordNub . words) megadoc)
-      in NB vocabulary megadoc
+          -- (class, prior probability)
+          cls = zip classes (map classPrior classes)
 
-    determine :: String -> NB -> [Class] -> [Document] -> Class
-    determine text nb classes documents =
-      let scores = zip [0..] (map (\cls -> calculate text nb cls documents) classes)
-          m = maximumBy (\(i0, c0) (i1, c1) -> c0 `compare` c1) scores
-      in fst m
+          -- (class, [document])
+          cd = zip classes (map classDocs classes)
 
-    calculate :: String -> NB -> Class -> [Document] -> Double
-    calculate text (NB vocabulary megadoc) cls documents =
-      let docs = filter (\(Document text c) -> c == cls) documents
-          texts = map (\(Document text _) -> text ++ " ") docs
-          classText = concat texts
-          classWords = words classText
-          c = genericLength classWords
-          pc = genericLength docs / genericLength documents
-      in pc * product (map (cword classWords c) (words text))
+          -- (class, [(word, count)])
+          cw = zip classes $ l (map classWordsCounts classes)
+
+      in NB { documents  = documents
+            , classes    = cls
+            , vocabulary = vocabulary
+            , megadoc    = megadoc
+            , cd         = cd
+            , cw         = cw
+            }
       where
-        cword classWords c word =
-          let wc = genericLength (filter (==word) classWords)
-          in (wc + 1) / (c + vocabulary)
+        concatDocs = concatMap (\(Document text _) -> text ++ " ")
 
-    createDocuments classes content =
-      let splitted = splitOn (replicate 10 '-' ++ "\n") content
-          pairs = map ((head . lines) &&& (concat . tail . lines)) splitted
-          documents = map (\(topic, text) -> Document text (fromJust $ elemIndex topic classes)) pairs
-      in documents
+        classDocs x = filter ((==x) . c) documents
+        classMegadoc x = concatMap (\(Document text _) -> text ++ " ") (classDocs x)
+        classWords x = words (classMegadoc x)
+        classNGram n = ngram n . classMegadoc
+        classVocabulary x = ordNub (classWords x)
+        classPrior x = genericLength (classDocs x) / genericLength documents
+        countWordInDoc d w = genericLength (filter (==w) d)
+        classWordsCounts x =
+          let voc = classVocabulary x
+          in zip voc $ map (countWordInDoc (classWords x)) voc
+
+    ngram :: Int -> String -> [String]
+    ngram n text =
+      let ws = words text
+      in map (\(i, w) -> unwords $ w:((take (n - 1) . drop (i+1)) ws)) (zip [0..] ws)
+
+    run :: String -> NB -> Class
+    run text (NB documents classes vocabulary megadoc cd cw) =
+      let scores = map (score . fst) classes
+      in argmax scores
+      where
+        score c =
+          let prior = snd (classes !! c)
+          in prior * product (map (prob c) (words text))
+        prob c w =
+          let tctM = find ((==w) . fst) (snd (cw !! c))
+              tct  = (snd . fromJust) tctM
+              cvoc = (genericLength . snd) (cw !! c)
+              voc  = vocabulary
+          in
+            if isJust tctM then
+              realToFrac (tct + 1) / realToFrac (cvoc + voc)
+            else
+              1 / realToFrac (cvoc + voc)
+
+    argmax :: (Ord a) => [a] -> Int
+    argmax x = fst $ maximumBy (\(_, a) (_, b) -> a `compare` b) (zip [0..] x)
+
+    mean :: [Double] -> Double
+    mean x = sum x / genericLength x
+
+    stdev :: [Double] -> Double
+    stdev x =
+      let avg = mean x
+          variance = sum (map ((^2) . subtract avg) x) / (genericLength x - 1)
+      in sqrt variance
+
+    cleanText :: String -> String
+    cleanText string =
+      let puncs = filter (`notElem` ['!', '"', '#', '$', '%', '(', ')', '.', '?']) (trim string)
+          spacify = foldl (\acc x -> replace x ' ' acc) puncs [',', '/', '-', '\n', '\r']
+          stemmed = unwords $ map (stem Porter) (words spacify)
+      in stemmed
+      where
+        trim = f . f
+          where
+            f = reverse . dropWhile isSpace
+        replace needle replacement =
+          map (\c -> if c == needle then replacement else c)
+
+    cleanDocuments :: [Document] -> [Document]
+    cleanDocuments documents =
+      let cleaned = map (\(Document text c) -> Document (cleanText text) c) documents
+          wc = wordCounts (concatDocs cleaned)
+          wlist = sortBy (\(_, a) (_, b) -> b `compare` a) wc
+          stopwords = l $ map fst (take 30 wlist)
+          wstopwords = map (\(Document text c) -> Document (removeWords stopwords text) c) cleaned
+      in wstopwords
+      where
+        vocabulary x = ordNub (words x)
+        countWordInDoc d w = genericLength (filter (==w) d)
+        wordCounts x =
+          let voc = vocabulary x
+          in zip voc $ map (countWordInDoc (words x)) voc
+
+        removeWords list text =
+          unwords $ filter (`notElem` list) (words text)
+
+        concatDocs = concatMap (\(Document text _) -> text ++ " ")
 
     l :: (Show a) => a -> a
     l a = trace (show a) a
@@ -100,5 +181,8 @@ module Sibe.NaiveBayes
               then 0
               else t / y
 
-    fmeasure :: Double -> Double -> Double
-    fmeasure r p = (2 * p * r) / (p + r)
+    fmeasure :: [(Int, Int)] -> Double
+    fmeasure results =
+      let r = recall results
+          p = precision results
+      in (2 * p * r) / (p + r)
diff --git a/stack.yaml b/stack.yaml
index 305964d..d02fe3e 100644
--- a/stack.yaml
+++ b/stack.yaml
@@ -41,7 +41,10 @@ packages:
     commit: 42a88fbcb6bd1d2c4dc18fae5e962bd34fb316a1
   subdirs:
     - packages/base
-- '.'
+- .
+- http://hackage.haskell.org/package/containers-0.5.7.1/containers-0.5.7.1.tar.gz
+- http://hackage.haskell.org/package/text-1.2.2.1/text-1.2.2.1.tar.gz
+- http://hackage.haskell.org/package/stemmer-0.5.2/stemmer-0.5.2.tar.gz
 
 # Dependency packages to be pulled from upstream that are not in the resolver
 # (e.g., acme-missiles-0.3)
@@ -70,3 +73,5 @@ extra-package-dbs: []
 #
 # Allow a newer minor version of GHC than the snapshot specifies
 # compiler-check: newer-minor
+
+system-ghc: false