fix(naivebayes): fix the algorithm to make it actually work

feat(cleanDocuments): preprocess documents, use stemming and stopword elimination for better accuracy
2016-08-05 23:54:36 +04:30
parent 3cf0625794
commit ea1f05f001
10 changed files with 254 additions and 54 deletions
@@ -1,12 +1,13 @@
 module Main
  where
-    import Sibe
+    -- import Sibe
    import Sibe.NaiveBayes
    import Text.Printf
    import Data.List
    import Data.Maybe
    import Debug.Trace
    import Data.List.Split
+    import Control.Arrow ((&&&))

    main = do
      dataset <- readFile "examples/doc-classifier-data/data-reuters"
@@ -15,18 +16,33 @@ module Main
      classes <- map (filter (/= ' ')) . lines <$> readFile "examples/doc-classifier-data/data-classes"

      let intClasses = [0..length classes - 1]
-          documents = createDocuments classes dataset
-          testDocuments = createDocuments classes test
-          devTestDocuments = take 20 testDocuments
-          nb = initialize documents
+      -- let intClasses = [0, 1]
+          documents = cleanDocuments $ createDocuments classes dataset
+          -- documents = [Document "Chinese Beijing Chinese" 0,
+          --              Document "Chinese Chinese Shanghai" 0,
+          --              Document "Chinese Macao" 0,
+          --              Document "Japan Tokyo Chinese" 1]
+          -- testDocuments = [Document "Chinese Chinese Chinese Japan Tokyo" 0]
+          testDocuments = cleanDocuments $ createDocuments classes test
+          devTestDocuments = take 30 testDocuments
+          -- devTestDocuments = [Document "Chinese Chinese Chinese Tokyo Japan" 0]
+          nb = train documents intClasses

-          results = map (\(Document text c) -> (c, determine text nb intClasses documents)) testDocuments
-          -- results = map (\(Document text c) -> (c, determine text nb intClasses documents)) devTestDocuments
+          results = map (\(Document text c) -> (c, run text nb)) testDocuments
+          -- results = map (\(Document text c) -> (c, run text nb)) devTestDocuments
+
+      -- print (text $ head documents)

      let showResults (c, r) = putStrLn (classes !! c ++ " ~ " ++ classes !! r)
      mapM_ showResults results

      putStrLn $ "Recall: " ++ show (recall results)
      putStrLn $ "Precision: " ++ show (precision results)
-      putStrLn $ "F Measure: " ++ show (fmeasure (precision results) (recall results))
+      putStrLn $ "F Measure: " ++ show (fmeasure results)
      putStrLn $ "Accuracy: " ++ show (accuracy results)
+
+    -- | Split a raw dataset into labelled documents.
+    --
+    -- @content@ is a sequence of records separated by a line of ten dashes.
+    -- The first line of a record is its topic name; the remaining lines,
+    -- joined with single spaces, form the document text. The topic is
+    -- translated to its position in @classes@.
+    --
+    -- A record with no lines at all (the empty chunk left by a leading or
+    -- trailing separator) is skipped. A topic that does not occur in
+    -- @classes@ raises an error naming the offending topic.
+    createDocuments :: [String] -> String -> [Document]
+    createDocuments classes content =
+      mapMaybe (toDocument . lines) (splitOn separator content)
+      where
+        separator = replicate 10 '-' ++ "\n"
+
+        toDocument [] = Nothing
+        toDocument (topic : body) =
+          Just (Document (unwords body) (classIndex topic))
+
+        classIndex topic =
+          fromMaybe (error ("createDocuments: unknown topic " ++ show topic))
+                    (elemIndex topic classes)
@@ -0,0 +1,54 @@
+{-# LANGUAGE BangPatterns #-}
+module Main
+  where
+    -- import Sibe
+    import Sibe.NaiveBayes
+    import Text.Printf
+    import Data.List
+    import Data.Maybe
+    import Debug.Trace
+    import Data.List.Split
+    import System.Directory
+    import Control.DeepSeq
+    import System.IO
+
+    -- | Train a naive Bayes rating classifier on the training reviews and
+    -- print the (actual, predicted) class pair for every test review.
+    main :: IO ()
+    main = do
+      putStr "Reading documents... "
+      -- putStr writes no newline, so flush explicitly to get the message
+      -- out before the reads below start
+      hFlush stdout
+      neg_documents <- createDocuments "examples/sentiment-analysis-data/train/neg/"
+      pos_documents <- createDocuments "examples/sentiment-analysis-data/train/pos/"
+
+      test_neg_documents  <- createDocuments "examples/sentiment-analysis-data/test/neg/"
+      test_pos_documents <- createDocuments "examples/sentiment-analysis-data/test/pos/"
+      putStrLn "done"
+
+      let classes = [0..9] -- rating, from 0 to 9 (1 to 10)
+          documents = neg_documents ++ pos_documents
+          nb = train documents classes
+
+          -- evaluate on the separately loaded test reviews, not on the
+          -- documents the model was trained on
+          testDocuments = test_neg_documents ++ test_pos_documents
+
+          results = map (\(Document text c) -> (c, run text nb)) testDocuments
+          -- results = map (\(Document text c) -> (c, determine text nb intClasses documents)) devTestDocuments
+      print results
+
+      -- let showResults (c, r) = putStrLn (show (classes !! c) ++ " ~ " ++ show (classes !! r))
+      -- mapM_ showResults results
+      --
+      -- putStrLn $ "Recall: " ++ show (recall results)
+      -- putStrLn $ "Precision: " ++ show (precision results)
+      -- putStrLn $ "F Measure: " ++ show (fmeasure results)
+      -- putStrLn $ "Accuracy: " ++ show (accuracy results)
+
+    -- | Load every entry of the directory @path@ as a 'Document'.
+    --
+    -- The class of a document comes from its file name: the run of digits
+    -- right after the last underscore is read as a 1-based rating and
+    -- shifted down by one (a name such as @12_10.txt@ yields class 9).
+    -- File names are appended to @path@ directly, so @path@ has to end
+    -- with a path separator.
+    createDocuments :: FilePath -> IO [Document]
+    createDocuments path = do
+      -- Remove "." and ".." by name; their position in the listing is not
+      -- something to rely on.
+      files <- filter (`notElem` [".", ".."]) <$> getDirectoryContents path
+      -- Take the whole digit run so a two-digit rating (10) is not cut
+      -- down to its first digit.
+      let ratings = map (subtract 1 . read . takeWhile (`elem` ['0'..'9']) . last . splitOn "_") files :: [Int]
+      contents <- mapM (forceReadFile . (path ++)) files
+      return $ zipWith Document contents ratings
+
+    -- | Read a whole file into memory and close its handle before returning.
+    forceReadFile :: FilePath -> IO String
+    forceReadFile file = do
+      h <- openFile file ReadMode
+      -- hGetContents reads lazily; evaluate the string completely while the
+      -- handle is still open
+      body <- hGetContents h >>= (return $!!)
+      hClose h
+      return body
@@ -0,0 +1 @@
+../../sibe-repos/sentiment-analysis-data
@@ -7,7 +7,7 @@ module Main where
  main = do
    let learning_rate = 0.5
        (iterations, epochs) = (2, 1000)
-        a = (logistic, logistic')
+        a = (sigmoid, sigmoid')
        rnetwork = randomNetwork 0 2 [(8, a)] (1, a) -- two inputs, 8 nodes in a single hidden layer, 1 output

        inputs = [vector [0, 1], vector [1, 0], vector [1, 1], vector [0, 0]]
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+# Build the project with profiling enabled.
+
+# Tool names for the profiling workflow. Neither variable is referenced
+# again in this script. A doubled '=' would store the value with a literal
+# leading '=' under bash, so a single '=' is used.
+PROG=geniconvert
+VIEW=open
+
+stack build --profile
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+# Give the owning user execute permission on each profiling helper script.
+for script in setup run compare save; do
+  chmod u+x "profiling/$script"
+done
@@ -22,6 +22,10 @@ library
                     , deepseq
                     , containers
                     , split
+                     , regex-base
+                     , regex-pcre
+                     , text
+                     , stemmer
  default-language:    Haskell2010

 executable sibe-exe
@@ -53,6 +57,19 @@ executable example-naivebayes-doc-classifier
                     , split
  default-language:    Haskell2010

+executable example-naivebayes-sentiment-analysis
+  hs-source-dirs:      examples
+  main-is:             naivebayes-sentiment-analysis.hs
+  ghc-options:         -threaded -rtsopts -with-rtsopts=-N
+  build-depends:       base
+                     , sibe
+                     , hmatrix
+                     , containers
+                     , split
+                     , directory
+                     , deepseq
+  default-language:    Haskell2010
+
 test-suite sibe-test
  type:                exitcode-stdio-1.0
  hs-source-dirs:      test
@@ -17,8 +17,10 @@ module Sibe
     train,
     session,
     shuffle,
-     logistic,
-     logistic',
+     sigmoid,
+     sigmoid',
+     relu,
+     relu',
     crossEntropy,
     genSeed,
     replaceVector
@@ -88,11 +90,17 @@ module Sibe
        randomLayer seed (input, h) a :-
        randomNetwork (seed + 1) h hs output

-      logistic :: Vector Double -> Vector Double
-      logistic x = 1 / (1 + exp (-x))
+      -- | Logistic sigmoid of a vector: @1 / (1 + exp (-x))@, with the
+      -- denominator passed through @max _ 1e-10@ (presumably applied
+      -- element-wise by the 'Vector' numeric instances — TODO confirm).
+      --
+      -- NOTE(review): @1 + exp (-x)@ is never below 1, so the @1e-10@ floor
+      -- looks like it can never take effect. It is also worth confirming
+      -- that 'max' on 'Vector' clamps per element rather than comparing the
+      -- two vectors as wholes.
+      sigmoid :: Vector Double -> Vector Double
+      sigmoid x = 1 / max (1 + exp (-x)) 1e-10

-      logistic' :: Vector Double -> Vector Double
-      logistic' x = logistic x * (1 - logistic x)
+      -- | Derivative of 'sigmoid' at @x@, computed as @s * (1 - s)@ with
+      -- @s = sigmoid x@.
+      sigmoid' :: Vector Double -> Vector Double
+      sigmoid' x = s * (1 - s)
+        where
+          s = sigmoid x
+
+      -- | Activation exported under the name @relu@. As written it computes
+      -- @log (1 + exp x)@, i.e. the smooth softplus curve, not @max 0 x@.
+      --
+      -- NOTE(review): @1 + exp x@ is never below 1, so the @1e-10@ floor
+      -- looks like it can never take effect; whether 'max' on 'Vector' is
+      -- element-wise should be confirmed as well.
+      relu :: Vector Double -> Vector Double
+      relu x = log (max (1 + exp x) 1e-10)
+
+      -- | Derivative paired with 'relu'. It is 'sigmoid', which is the
+      -- derivative of @log (1 + exp x)@.
+      relu' :: Vector Double -> Vector Double
+      relu' = sigmoid

      crossEntropy :: Output -> Output -> Double
      crossEntropy output target =
@@ -100,7 +108,7 @@ module Sibe
            n = fromIntegral (length pairs)
        in (-1 / n) * sum (map f pairs)
        where
-          f (a, y) = y * log a + (1 - y) * log (1 - a)
+          f (a, y) = y * log (max 1e-10 a) + (1 - y) * log (max (1 - a) 1e-10)

      train :: Input
            -> Network
@@ -114,8 +122,8 @@ module Sibe
            let y = runLayer input l
                o = fn y
                delta = o - target
-                -- de = delta * fn' y -- quadratic cost
-                de = delta -- cross entropy cost
+                de = delta * fn' y
+                -- de = delta -- cross entropy cost

                biases'  = biases  - scale alpha de
                weights' = weights - scale alpha (input `outer` de) -- small inputs learn slowly
@@ -174,3 +182,6 @@ module Sibe
          rrow index (x:xs)
            | index == index = value:xs
            | otherwise = x : rrow (index + 1) xs
+
+      -- | Clamp a value into a range given as a @(lower, upper)@ pair: the
+      -- lower bound is applied first, then the upper bound.
+      clip :: Double -> (Double, Double) -> Double
+      clip x (lower, upper) = (min upper . max lower) x
@@ -1,15 +1,17 @@
 module Sibe.NaiveBayes
  (Document(..),
   NB(..),
-   createDocuments,
-   initialize,
-   calculate,
-   determine,
+   train,
+   run,
   ordNub,
   accuracy,
   precision,
   recall,
   fmeasure,
+   mean,
+   stdev,
+   cleanText,
+   cleanDocuments,
  )
  where
    import Data.List
@@ -18,47 +20,126 @@ module Sibe.NaiveBayes
    import Data.List.Split
    import Data.Maybe
    import Control.Arrow ((&&&))
-    type Class = Int
+    import Text.Regex.PCRE
+    import Data.Char (isSpace)
+    import NLP.Stemmer
+
+    type Class = Int;

    data Document = Document { text :: String
                             , c    :: Class
                             } deriving (Eq, Show, Read)

-    data NB = NB { vocabulary :: Double
+    data NB = NB { documents  :: [Document]
+                 , classes    :: [(Class, Double)]
+                 , vocabulary :: Int
                 , megadoc    :: String
-                 }
+                 , cd         :: [(Class, [Document])]
+                 , cw         :: [(Class, [(String, Int)])]
+                 } deriving (Eq, Show, Read)

-    initialize :: [Document] -> NB
-    initialize documents =
-      let megadoc = concatMap (\(Document text _) -> text ++ " ") documents
+    -- | Build a naive Bayes model ('NB') from training documents and the
+    -- list of classes.
+    --
+    -- The resulting record holds:
+    --
+    -- * @documents@: the training documents as given;
+    -- * @classes@: every class paired with its prior, the fraction of
+    --   training documents carrying that class;
+    -- * @vocabulary@: the number of distinct words over all documents;
+    -- * @megadoc@: all document texts concatenated, each followed by a space;
+    -- * @cd@: every class paired with the documents of that class;
+    -- * @cw@: every class paired with a (word, count) list over the text
+    --   of that class.
+    --
+    -- The word-count table goes through 'l', which traces the value it is
+    -- given (via @trace (show a) a@) before returning it. @classNGram@ is
+    -- defined in the @where@ clause but not used by this function.
+    train :: [Document] -> [Class] -> NB
+    train documents classes =
+      let megadoc = concatDocs documents
          vocabulary = genericLength ((ordNub . words) megadoc)
-      in NB vocabulary megadoc
+          -- (class, prior probability)
+          cls = zip classes (map classPrior classes)

-    determine :: String -> NB -> [Class] -> [Document] -> Class
-    determine text nb classes documents =
-      let scores = zip [0..] (map (\cls -> calculate text nb cls documents) classes)
-          m = maximumBy (\(i0, c0) (i1, c1) -> c0 `compare` c1) scores
-      in fst m
+          -- (class, [document])
+          cd = zip classes (map classDocs classes)

-    calculate :: String -> NB -> Class -> [Document] -> Double
-    calculate text (NB vocabulary megadoc) cls documents =
-      let docs = filter (\(Document text c) -> c == cls) documents
-          texts = map (\(Document text _) -> text ++ " ") docs
-          classText = concat texts
-          classWords = words classText
-          c = genericLength classWords
-          pc = genericLength docs / genericLength documents
-      in pc * product (map (cword classWords c) (words text))
+          -- (class, [(word, count)])
+          cw = zip classes $ l (map classWordsCounts classes)
+
+      in NB { documents  = documents
+            , classes    = cls
+            , vocabulary = vocabulary
+            , megadoc    = megadoc
+            , cd         = cd
+            , cw         = cw
+            }
      where
-        cword classWords c word =
-          let wc = genericLength (filter (==word) classWords)
-          in (wc + 1) / (c + vocabulary)
+        concatDocs = concatMap (\(Document text _) -> text ++ " ")

-    createDocuments classes content =
-      let splitted = splitOn (replicate 10 '-' ++ "\n") content
-          pairs = map ((head . lines) &&& (concat . tail . lines)) splitted
-          documents = map (\(topic, text) -> Document text (fromJust $ elemIndex topic classes)) pairs
-      in documents
+        classDocs x = filter ((==x) . c) documents
+        classMegadoc x = concatMap (\(Document text _) -> text ++ " ") (classDocs x)
+        classWords x = words (classMegadoc x)
+        classNGram n = ngram n . classMegadoc
+        classVocabulary x = ordNub (classWords x)
+        classPrior x = genericLength (classDocs x) / genericLength documents
+        countWordInDoc d w = genericLength (filter (==w) d)
+        classWordsCounts x =
+          let voc = classVocabulary x
+          in zip voc $ map (countWordInDoc (classWords x)) voc
+
+    -- | Word n-grams of a text. There is one entry per word, made of that
+    -- word followed by up to @n - 1@ of the words after it, joined with
+    -- spaces; entries near the end of the text are shorter.
+    ngram :: Int -> String -> [String]
+    ngram n text =
+      [ unwords (w : take (n - 1) rest) | (w : rest) <- tails (words text) ]
+
+    -- | Classify a text with a trained model and return the best class.
+    --
+    -- Every class is scored with the multinomial naive Bayes rule using
+    -- add-one smoothing:
+    --
+    -- > log prior(c) + sum over words w of log ((count(w, c) + 1) / (tokens(c) + |V|))
+    --
+    -- where @count(w, c)@ is how often @w@ occurs in the training text of
+    -- class @c@ (0 if never), @tokens(c)@ is the total number of word
+    -- occurrences in that class and @|V|@ is the model's vocabulary size.
+    --
+    -- Scores are summed in log space so that long texts do not underflow
+    -- to 0. Classes are looked up by label rather than by list position,
+    -- and the label of the winning class is returned; for classes numbered
+    -- @0..n-1@ in order this is the same value as the position. When
+    -- several classes tie, the last one in the model's class list wins.
+    run :: String -> NB -> Class
+    run text (NB _ classes vocabulary _ _ cw) =
+      fst $ maximumBy (\(_, a) (_, b) -> a `compare` b) (map (fst &&& score) classes)
+      where
+        voc = fromIntegral vocabulary :: Double
+
+        score (cls, prior) =
+          let counts = fromMaybe [] (lookup cls cw)
+              -- total word occurrences in the class, not the number of
+              -- distinct words
+              total = fromIntegral (sum (map snd counts)) :: Double
+              logProb w =
+                let tct = fromIntegral (fromMaybe 0 (lookup w counts)) :: Double
+                in log ((tct + 1) / (total + voc))
+          in log prior + sum (map logProb (words text))
+
+    -- | Zero-based position of the greatest element of a list. On ties the
+    -- choice is whatever 'maximumBy' makes over the indexed list.
+    argmax :: (Ord a) => [a] -> Int
+    argmax xs = fst (maximumBy bySecond indexed)
+      where
+        indexed = zip [0..] xs
+        bySecond p q = compare (snd p) (snd q)
+
+    -- | Arithmetic mean: the sum of the values divided by how many there are.
+    mean :: [Double] -> Double
+    mean xs = total / count
+      where
+        total = sum xs
+        count = fromIntegral (length xs)
+
+    -- | Standard deviation with an @n - 1@ denominator: the square root of
+    -- the summed squared distances from the 'mean', divided by one less
+    -- than the number of values.
+    stdev :: [Double] -> Double
+    stdev xs = sqrt (squaredDistances / (count - 1))
+      where
+        center = mean xs
+        count = genericLength xs
+        squaredDistances = sum [ (v - center) ^ 2 | v <- xs ]
+
+    -- | Normalise a text before counting words.
+    --
+    -- Steps, in order: strip leading and trailing whitespace; delete the
+    -- characters @! " # $ % ( ) . ?@; turn commas, slashes, dashes and line
+    -- breaks into spaces; stem every word with the Porter stemmer and join
+    -- the stems with single spaces.
+    cleanText :: String -> String
+    cleanText string = unwords (map (stem Porter) (words spaced))
+      where
+        stripped = (reverse . dropWhile isSpace . reverse . dropWhile isSpace) string
+        kept = filter (`notElem` "!\"#$%().?") stripped
+        spaced = map (\ch -> if ch `elem` ",/-\n\r" then ' ' else ch) kept
+
+    -- | Clean every document with 'cleanText', then drop the 30 most
+    -- frequent words of the cleaned corpus from every document, treating
+    -- them as stopwords. The stopword list is passed through 'l', which
+    -- traces it.
+    cleanDocuments :: [Document] -> [Document]
+    cleanDocuments documents = map (overText (removeWords stopwords)) cleaned
+      where
+        -- apply a function to the text of a document, keeping its class
+        overText f (Document body cls) = Document (f body) cls
+
+        cleaned = map (overText cleanText) documents
+
+        -- all words of the cleaned corpus, in document order
+        corpusWords = words (concatMap ((++ " ") . text) cleaned)
+
+        -- (word, occurrences) for every distinct word, in first-seen order
+        frequencies =
+          [ (w, genericLength (filter (== w) corpusWords)) | w <- ordNub corpusWords ]
+
+        -- most frequent first
+        ranked = sortBy (\(_, a) (_, b) -> b `compare` a) frequencies
+
+        stopwords = l (map fst (take 30 ranked))
+
+        removeWords list body = unwords (filter (`notElem` list) (words body))

    l :: (Show a) => a -> a
    l a = trace (show a) a
@@ -100,5 +181,8 @@ module Sibe.NaiveBayes
              then 0
              else t / y

-    fmeasure :: Double -> Double -> Double
-    fmeasure r p = (2 * p * r) / (p + r)
+    -- | F-measure of a list of (actual, predicted) class pairs: the harmonic
+    -- mean @2 * p * r / (p + r)@ of 'precision' @p@ and 'recall' @r@.
+    --
+    -- Yields 0 when precision and recall are both 0, instead of the NaN
+    -- that dividing 0 by 0 would produce.
+    fmeasure :: [(Int, Int)] -> Double
+    fmeasure results
+      | p + r == 0 = 0
+      | otherwise  = (2 * p * r) / (p + r)
+      where
+        r = recall results
+        p = precision results
@@ -41,7 +41,10 @@ packages:
    commit: 42a88fbcb6bd1d2c4dc18fae5e962bd34fb316a1
  subdirs:
    - packages/base
- '.'
+- .
+- http://hackage.haskell.org/package/containers-0.5.7.1/containers-0.5.7.1.tar.gz
+- http://hackage.haskell.org/package/text-1.2.2.1/text-1.2.2.1.tar.gz
+- http://hackage.haskell.org/package/stemmer-0.5.2/stemmer-0.5.2.tar.gz

 # Dependency packages to be pulled from upstream that are not in the resolver
 # (e.g., acme-missiles-0.3)
@@ -70,3 +73,5 @@ extra-package-dbs: []
 #
 # Allow a newer minor version of GHC than the snapshot specifies
 # compiler-check: newer-minor
+
+system-ghc: false