feat(stopwords): removeWords and removeStopwords functions as pre-processors

feat(confidence, WIP): calculate confidence of each classification
2016-08-08 10:02:26 +04:30
parent ea1f05f001
commit 099c25e166
3 changed files with 742 additions and 52 deletions
--- a/src/Sibe/NaiveBayes.hs
+++ b/src/Sibe/NaiveBayes.hs
@ -12,6 +12,9 @@ module Sibe.NaiveBayes
   stdev,
   cleanText,
   cleanDocuments,
+   ngram,
+   removeWords,
+   removeStopwords,
  )
  where
    import Data.List
@ -21,7 +24,7 @@ module Sibe.NaiveBayes
    import Data.Maybe
    import Control.Arrow ((&&&))
    import Text.Regex.PCRE
-    import Data.Char (isSpace)
+    import Data.Char (isSpace, isNumber)
    import NLP.Stemmer

    type Class = Int;
@ -36,6 +39,7 @@ module Sibe.NaiveBayes
                 , megadoc    :: String
                 , cd         :: [(Class, [Document])]
                 , cw         :: [(Class, [(String, Int)])]
+                 , cgram      :: [(Class, [(String, Int)])]
                 } deriving (Eq, Show, Read)

    train :: [Document] -> [Class] -> NB
@ -49,7 +53,9 @@ module Sibe.NaiveBayes
          cd = zip classes (map classDocs classes)

          -- (class, [(word, count)])
-          cw = zip classes $ l (map classWordsCounts classes)
+          cw = zip classes $ map classWordsCounts classes
+
+          cgram = zip classes $ map classNGramCounts classes

      in NB { documents  = documents
            , classes    = cls
@ -57,44 +63,60 @@ module Sibe.NaiveBayes
            , megadoc    = megadoc
            , cd         = cd
            , cw         = cw
+            , cgram      = cgram
            }
      where
        concatDocs = concatMap (\(Document text _) -> text ++ " ")

        classDocs x = filter ((==x) . c) documents
-        classMegadoc x = concatMap (\(Document text _) -> text ++ " ") (classDocs x)
-        classWords x = words (classMegadoc x)
-        classNGram n = ngram n . classMegadoc
-        classVocabulary x = ordNub (classWords x)
+        classMegadoc = concatMap (\(Document text _) -> text ++ " ") . classDocs
+        classWords = words . classMegadoc
+        classNGram = concatMap (\(Document text _) -> text ++ " ") . ngram 2 . classDocs
+        classNGramWords = words . classNGram
+        classVocabulary = ordNub . classWords
        classPrior x = genericLength (classDocs x) / genericLength documents
        countWordInDoc d w = genericLength (filter (==w) d)
-        classWordsCounts x =
-          let voc = classVocabulary x
-          in zip voc $ map (countWordInDoc (classWords x)) voc
+        wordsCount ws voc =
+          zip voc $ map (countWordInDoc ws) voc
+        classWordsCounts x = wordsCount (classWords x) (classVocabulary x)
+        classNGramCounts x = wordsCount (classNGramWords x) (ordNub $ classNGramWords x)

-    ngram :: Int -> String -> [String]
-    ngram n text =
-      let ws = words text
-      in map (\(i, w) -> unwords $ w:((take (n - 1) . drop (i+1)) ws)) (zip [0..] ws)
+    ngram :: Int -> [Document] -> [Document]
+    ngram n documents =
+      map (\(Document text c) -> Document (helper text) c) documents
+      where
+        helper text =
+          let ws = words text
+              pairs = zip [0..] ws
+              grams = map (\(i, w) -> concat . intersperse "_" $ w:((take (n - 1) . drop (i+1)) ws)) pairs
+          in unwords ("<b>":grams)

-    run :: String -> NB -> Class
-    run text (NB documents classes vocabulary megadoc cd cw) =
+    run :: String -> NB -> (Class, Double)
+    run txt (NB documents classes vocabulary megadoc cd cw cgram) =
      let scores = map (score . fst) classes
-      in argmax scores
+          index = argmax scores
+          m = maximum scores
+          confidence = m / sum scores
+      in (index, 0)
      where
        score c =
          let prior = snd (classes !! c)
-          in prior * product (map (prob c) (words text))
+
+          -- below is the formula according to Multinominal Naive Bayes, but it seems
+          -- using a uniform prior probability seems to work better
+          -- in prior * product (map (prob c) (words txt))
+
+          in product (map (prob c) (words txt))
+
        prob c w =
-          let tctM = find ((==w) . fst) (snd (cw !! c))
-              tct  = (snd . fromJust) tctM
-              cvoc = (genericLength . snd) (cw !! c)
+          let tctM = find ((== w) . fst) (snd (cw !! c))
+              tct  = if isJust tctM then (snd . fromJust) tctM else 0
+              cvoc = sum $ map snd (snd (cw !! c))
              voc  = vocabulary
-          in
-            if isJust tctM then
-              realToFrac (tct + 1) / realToFrac (cvoc + voc)
-            else
-              1 / realToFrac (cvoc + voc)
+              gram = find ((==w) . last . splitOn "_" . fst) (snd (cgram !! c))
+              pg   = if isJust gram then (snd . fromJust) gram else 0
+          -- in realToFrac (tct * pg + 1) / realToFrac (cvoc + voc) -- uncomment to enable ngrams
+          in realToFrac (tct + 1) / realToFrac (cvoc + voc)

    argmax :: (Ord a) => [a] -> Int
    argmax x = fst $ maximumBy (\(_, a) (_, b) -> a `compare` b) (zip [0..] x)
@ -113,7 +135,8 @@ module Sibe.NaiveBayes
      let puncs = filter (`notElem` ['!', '"', '#', '$', '%', '(', ')', '.', '?']) (trim string)
          spacify = foldl (\acc x -> replace x ' ' acc) puncs [',', '/', '-', '\n', '\r']
          stemmed = unwords $ map (stem Porter) (words spacify)
-      in stemmed
+          nonumber = filter (not . isNumber) stemmed
+      in (unwords . words) nonumber
      where
        trim = f . f
          where
@ -124,11 +147,21 @@ module Sibe.NaiveBayes
    cleanDocuments :: [Document] -> [Document]
    cleanDocuments documents =
      let cleaned = map (\(Document text c) -> Document (cleanText text) c) documents
-          wc = wordCounts (concatDocs cleaned)
+      in cleaned
+
+    removeWords :: [String] -> [Document] -> [Document]
+    removeWords ws documents =
+      map (\(Document text c) -> Document (rm ws text) c) documents
+      where
+          rm list text =
+            unwords $ filter (`notElem` list) (words text)
+
+    removeStopwords :: Int -> [Document] -> [Document]
+    removeStopwords i documents =
+      let wc = wordCounts (concatDocs documents)
          wlist = sortBy (\(_, a) (_, b) -> b `compare` a) wc
-          stopwords = l $ map fst (take 30 wlist)
-          wstopwords = map (\(Document text c) -> Document (removeWords stopwords text) c) cleaned
-      in wstopwords
+          stopwords = map fst (take i wlist)
+      in removeWords stopwords documents
      where
        vocabulary x = ordNub (words x)
        countWordInDoc d w = genericLength (filter (==w) d)
@ -136,9 +169,6 @@ module Sibe.NaiveBayes
          let voc = vocabulary x
          in zip voc $ map (countWordInDoc (words x)) voc

-        removeWords list text =
-          unwords $ filter (`notElem` list) (words text)
-
        concatDocs = concatMap (\(Document text _) -> text ++ " ")

    l :: (Show a) => a -> a
@ -151,37 +181,38 @@ module Sibe.NaiveBayes
        go s (x:xs) = if x `Set.member` s then go s xs
                                          else x : go (Set.insert x s) xs

-    accuracy :: [(Int, Int)] -> Double
+    accuracy :: [(Int, (Int, Double))] -> Double
    accuracy results =
-      let correct = filter (uncurry (==)) results
+      let pairs = map (\(a, b) -> (a, fst b)) results
+          correct = filter (uncurry (==)) pairs
      in genericLength correct / genericLength results

-    recall :: [(Int, Int)] -> Double
+    recall :: [(Int, (Int, Double))] -> Double
    recall results =
      let classes = ordNub (map fst results)
          s = sum (map rec classes) / genericLength classes
      in s
      where
        rec a =
-          let t = genericLength $ filter (\(c, r) -> c == r && c == a) results
-              y = genericLength $ filter (\(c, r) -> c == a) results
+          let t = genericLength $ filter (\(c, (r, _)) -> c == r && c == a) results
+              y = genericLength $ filter (\(c, (r, _)) -> c == a) results
          in t / y

-    precision :: [(Int, Int)] -> Double
+    precision :: [(Int, (Int, Double))] -> Double
    precision results =
      let classes = ordNub (map fst results)
          s = sum (map prec classes) / genericLength classes
      in s
      where
        prec a =
-          let t = genericLength $ filter (\(c, r) -> c == r && c == a) results
-              y = genericLength $ filter (\(c, r) -> r == a) results
+          let t = genericLength $ filter (\(c, (r, _)) -> c == r && c == a) results
+              y = genericLength $ filter (\(c, (r, _)) -> r == a) results
          in
            if y == 0
              then 0
              else t / y

-    fmeasure :: [(Int, Int)] -> Double
+    fmeasure :: [(Int, (Int, Double))] -> Double
    fmeasure results =
      let r = recall results
          p = precision results