fix(naivebayes): fix the algorithm to make it actually work

feat(cleanDocuments): preprocess documents, use stemming and stopword elimination for better accuracy
2016-08-05 23:54:36 +04:30
parent 3cf0625794
commit ea1f05f001
10 changed files with 254 additions and 54 deletions
--- a/src/Sibe/NaiveBayes.hs
+++ b/src/Sibe/NaiveBayes.hs
@@ -1,15 +1,17 @@
 module Sibe.NaiveBayes
  (Document(..),
   NB(..),
-   createDocuments,
-   initialize,
-   calculate,
-   determine,
+   train,
+   run,
   ordNub,
   accuracy,
   precision,
   recall,
   fmeasure,
+   mean,
+   stdev,
+   cleanText,
+   cleanDocuments,
  )
  where
    import Data.List
@@ -18,47 +20,126 @@ module Sibe.NaiveBayes
    import Data.List.Split
    import Data.Maybe
    import Control.Arrow ((&&&))
-    type Class = Int
+    import Text.Regex.PCRE
+    import Data.Char (isSpace)
+    import NLP.Stemmer
+
+    type Class = Int;

    data Document = Document { text :: String
                             , c    :: Class
                             } deriving (Eq, Show, Read)

-    data NB = NB { vocabulary :: Double
+    data NB = NB { documents  :: [Document]
+                 , classes    :: [(Class, Double)]
+                 , vocabulary :: Int
                 , megadoc    :: String
-                 }
+                 , cd         :: [(Class, [Document])]
+                 , cw         :: [(Class, [(String, Int)])]
+                 } deriving (Eq, Show, Read)

-    initialize :: [Document] -> NB
-    initialize documents =
-      let megadoc = concatMap (\(Document text _) -> text ++ " ") documents
+    train :: [Document] -> [Class] -> NB
+    train documents classes =
+      let megadoc = concatDocs documents
          vocabulary = genericLength ((ordNub . words) megadoc)
-      in NB vocabulary megadoc
+          -- (class, prior probability)
+          cls = zip classes (map classPrior classes)

-    determine :: String -> NB -> [Class] -> [Document] -> Class
-    determine text nb classes documents =
-      let scores = zip [0..] (map (\cls -> calculate text nb cls documents) classes)
-          m = maximumBy (\(i0, c0) (i1, c1) -> c0 `compare` c1) scores
-      in fst m
+          -- (class, [document])
+          cd = zip classes (map classDocs classes)

-    calculate :: String -> NB -> Class -> [Document] -> Double
-    calculate text (NB vocabulary megadoc) cls documents =
-      let docs = filter (\(Document text c) -> c == cls) documents
-          texts = map (\(Document text _) -> text ++ " ") docs
-          classText = concat texts
-          classWords = words classText
-          c = genericLength classWords
-          pc = genericLength docs / genericLength documents
-      in pc * product (map (cword classWords c) (words text))
+          -- (class, [(word, count)])
+          cw = zip classes $ l (map classWordsCounts classes)
+
+      in NB { documents  = documents
+            , classes    = cls
+            , vocabulary = vocabulary
+            , megadoc    = megadoc
+            , cd         = cd
+            , cw         = cw
+            }
      where
-        cword classWords c word =
-          let wc = genericLength (filter (==word) classWords)
-          in (wc + 1) / (c + vocabulary)
+        concatDocs = concatMap (\(Document text _) -> text ++ " ")

-    createDocuments classes content =
-      let splitted = splitOn (replicate 10 '-' ++ "\n") content
-          pairs = map ((head . lines) &&& (concat . tail . lines)) splitted
-          documents = map (\(topic, text) -> Document text (fromJust $ elemIndex topic classes)) pairs
-      in documents
+        classDocs x = filter ((==x) . c) documents
+        classMegadoc x = concatMap (\(Document text _) -> text ++ " ") (classDocs x)
+        classWords x = words (classMegadoc x)
+        classNGram n = ngram n . classMegadoc
+        classVocabulary x = ordNub (classWords x)
+        classPrior x = genericLength (classDocs x) / genericLength documents
+        countWordInDoc d w = genericLength (filter (==w) d)
+        classWordsCounts x =
+          let voc = classVocabulary x
+          in zip voc $ map (countWordInDoc (classWords x)) voc
+
+    ngram :: Int -> String -> [String]
+    ngram n text =
+      let ws = words text
+      in map (\(i, w) -> unwords $ w:((take (n - 1) . drop (i+1)) ws)) (zip [0..] ws)
+
+    run :: String -> NB -> Class
+    run text (NB documents classes vocabulary megadoc cd cw) =
+      let scores = map (score . fst) classes
+      in argmax scores
+      where
+        score c =
+          let prior = snd (classes !! c)
+          in prior * product (map (prob c) (words text))
+        prob c w =
+          let tctM = find ((==w) . fst) (snd (cw !! c))
+              tct  = (snd . fromJust) tctM
+              cvoc = (genericLength . snd) (cw !! c)
+              voc  = vocabulary
+          in
+            if isJust tctM then
+              realToFrac (tct + 1) / realToFrac (cvoc + voc)
+            else
+              1 / realToFrac (cvoc + voc)
+
+    argmax :: (Ord a) => [a] -> Int
+    argmax x = fst $ maximumBy (\(_, a) (_, b) -> a `compare` b) (zip [0..] x)
+
+    mean :: [Double] -> Double
+    mean x = sum x / genericLength x
+
+    stdev :: [Double] -> Double
+    stdev x =
+      let avg = mean x
+          variance = sum (map ((^2) . subtract avg) x) / (genericLength x - 1)
+      in sqrt variance
+
+    cleanText :: String -> String
+    cleanText string =
+      let puncs = filter (`notElem` ['!', '"', '#', '$', '%', '(', ')', '.', '?']) (trim string)
+          spacify = foldl (\acc x -> replace x ' ' acc) puncs [',', '/', '-', '\n', '\r']
+          stemmed = unwords $ map (stem Porter) (words spacify)
+      in stemmed
+      where
+        trim = f . f
+          where
+            f = reverse . dropWhile isSpace
+        replace needle replacement =
+          map (\c -> if c == needle then replacement else c)
+
+    cleanDocuments :: [Document] -> [Document]
+    cleanDocuments documents =
+      let cleaned = map (\(Document text c) -> Document (cleanText text) c) documents
+          wc = wordCounts (concatDocs cleaned)
+          wlist = sortBy (\(_, a) (_, b) -> b `compare` a) wc
+          stopwords = l $ map fst (take 30 wlist)
+          wstopwords = map (\(Document text c) -> Document (removeWords stopwords text) c) cleaned
+      in wstopwords
+      where
+        vocabulary x = ordNub (words x)
+        countWordInDoc d w = genericLength (filter (==w) d)
+        wordCounts x =
+          let voc = vocabulary x
+          in zip voc $ map (countWordInDoc (words x)) voc
+
+        removeWords list text =
+          unwords $ filter (`notElem` list) (words text)
+
+        concatDocs = concatMap (\(Document text _) -> text ++ " ")

    l :: (Show a) => a -> a
    l a = trace (show a) a
@@ -100,5 +181,8 @@ module Sibe.NaiveBayes
              then 0
              else t / y

-    fmeasure :: Double -> Double -> Double
-    fmeasure r p = (2 * p * r) / (p + r)
+    fmeasure :: [(Int, Int)] -> Double
+    fmeasure results =
+      let r = recall results
+          p = precision results
+      in (2 * p * r) / (p + r)