feat(stopwords): removeWords and removeStopwords functions as pre-processors

feat(confidence, WIP): calculate confidence of each classification
2016-08-08 10:02:26 +04:30
parent ea1f05f001
commit 099c25e166
3 changed files with 742 additions and 52 deletions
--- a/examples/naivebayes-doc-classifier.hs
+++ b/examples/naivebayes-doc-classifier.hs
@@ -14,26 +14,20 @@ module Main
      test <- readFile "examples/doc-classifier-data/data-reuters-test"
      classes <- map (filter (/= ' ')) . lines <$> readFile "examples/doc-classifier-data/data-classes"
      sws <- lines <$> readFile "examples/stopwords"
      let intClasses = [0..length classes - 1]
-      -- let intClasses = [0, 1]
+          documents = cleanDocuments $ removeWords sws $ createDocuments classes dataset
          documents = cleanDocuments $ createDocuments classes dataset
          -- documents = [Document "Chinese Beijing Chinese" 0,
          --              Document "Chinese Chinese Shanghai" 0,
          --              Document "Chinese Macao" 0,
          --              Document "Japan Tokyo Chinese" 1]
          -- testDocuments = [Document "Chinese Chinese Chinese Japan Tokyo" 0]
          testDocuments = cleanDocuments $ createDocuments classes test
          devTestDocuments = take 30 testDocuments
          -- devTestDocuments = [Document "Chinese Chinese Chinese Tokyo Japan" 0]
          nb = train documents intClasses
          results = map (\(Document text c) -> (c, run text nb)) testDocuments
          -- results = map (\(Document text c) -> (c, run text nb)) devTestDocuments
-      -- print (text $ head documents)
+      print (text $ head documents)
-      let showResults (c, r) = putStrLn (classes !! c ++ " ~ " ++ classes !! r)
+      let showResults (c, (r, confidence)) = putStrLn (classes !! c ++ " ~ " ++ classes !! r)
      mapM_ showResults results
      putStrLn $ "Recall: " ++ show (recall results)
--- a/examples/stopwords
+++ b/examples/stopwords
@@ -0,0 +1,665 @@
 a
 able
 about
 above
 abst
 accordance
 according
 accordingly
 across
 act
 actually
 added
 adj
 affected
 affecting
 affects
 after
 afterwards
 again
 against
 ah
 all
 almost
 alone
 along
 already
 also
 although
 always
 am
 among
 amongst
 an
 and
 announce
 another
 any
 anybody
 anyhow
 anymore
 anyone
 anything
 anyway
 anyways
 anywhere
 apparently
 approximately
 are
 aren
 arent
 arise
 around
 as
 aside
 ask
 asking
 at
 auth
 available
 away
 awfully
 b
 back
 be
 became
 because
 become
 becomes
 becoming
 been
 before
 beforehand
 begin
 beginning
 beginnings
 begins
 behind
 being
 believe
 below
 beside
 besides
 between
 beyond
 biol
 both
 brief
 briefly
 but
 by
 c
 ca
 came
 can
 cannot
 can't
 cause
 causes
 certain
 certainly
 co
 com
 come
 comes
 contain
 containing
 contains
 could
 couldnt
 d
 date
 did
 didn't
 different
 do
 does
 doesn't
 doing
 done
 don't
 down
 downwards
 due
 during
 e
 each
 ed
 edu
 effect
 eg
 eight
 eighty
 either
 else
 elsewhere
 end
 ending
 enough
 especially
 et
 et-al
 etc
 even
 ever
 every
 everybody
 everyone
 everything
 everywhere
 ex
 except
 f
 far
 few
 ff
 fifth
 first
 five
 fix
 followed
 following
 follows
 for
 former
 formerly
 forth
 found
 four
 from
 further
 furthermore
 g
 gave
 get
 gets
 getting
 give
 given
 gives
 giving
 go
 goes
 gone
 got
 gotten
 h
 had
 happens
 hardly
 has
 hasn't
 have
 haven't
 having
 he
 hed
 hence
 her
 here
 hereafter
 hereby
 herein
 heres
 hereupon
 hers
 herself
 hes
 hi
 hid
 him
 himself
 his
 hither
 home
 how
 howbeit
 however
 hundred
 i
 id
 ie
 if
 i'll
 im
 immediate
 immediately
 importance
 important
 in
 inc
 indeed
 index
 information
 instead
 into
 invention
 inward
 is
 isn't
 it
 itd
 it'll
 its
 itself
 i've
 j
 just
 k
 keep   keeps
 kept
 kg
 km
 know
 known
 knows
 l
 largely
 last
 lately
 later
 latter
 latterly
 least
 less
 lest
 let
 lets
 like
 liked
 likely
 line
 little
 'll
 look
 looking
 looks
 ltd
 m
 made
 mainly
 make
 makes
 many
 may
 maybe
 me
 mean
 means
 meantime
 meanwhile
 merely
 mg
 might
 million
 miss
 ml
 more
 moreover
 most
 mostly
 mr
 mrs
 much
 mug
 must
 my
 myself
 n
 na
 name
 namely
 nay
 nd
 near
 nearly
 necessarily
 necessary
 need
 needs
 neither
 never
 nevertheless
 new
 next
 nine
 ninety
 no
 nobody
 non
 none
 nonetheless
 noone
 nor
 normally
 nos
 not
 noted
 nothing
 now
 nowhere
 o
 obtain
 obtained
 obviously
 of
 off
 often
 oh
 ok
 okay
 old
 omitted
 on
 once
 one
 ones
 only
 onto
 or
 ord
 other
 others
 otherwise
 ought
 our
 ours
 ourselves
 out
 outside
 over
 overall
 owing
 own
 p
 page
 pages
 part
 particular
 particularly
 past
 per
 perhaps
 placed
 please
 plus
 poorly
 possible
 possibly
 potentially
 pp
 predominantly
 present
 previously
 primarily
 probably
 promptly
 proud
 provides
 put
 q
 que
 quickly
 quite
 qv
 r
 ran
 rather
 rd
 re
 readily
 really
 recent
 recently
 ref
 refs
 regarding
 regardless
 regards
 related
 relatively
 research
 respectively
 resulted
 resulting
 results
 right
 run
 s
 said
 same
 saw
 say
 saying
 says
 sec
 section
 see
 seeing
 seem
 seemed
 seeming
 seems
 seen
 self
 selves
 sent
 seven
 several
 shall
 she
 shed
 she'll
 shes
 should
 shouldn't
 show
 showed
 shown
 showns
 shows
 significant
 significantly
 similar
 similarly
 since
 six
 slightly
 so
 some
 somebody
 somehow
 someone
 somethan
 something
 sometime
 sometimes
 somewhat
 somewhere
 soon
 sorry
 specifically
 specified
 specify
 specifying
 still
 stop
 strongly
 sub
 substantially
 successfully
 such
 sufficiently
 suggest
 sup
 sure   t
 take
 taken
 taking
 tell
 tends
 th
 than
 thank
 thanks
 thanx
 that
 that'll
 thats
 that've
 the
 their
 theirs
 them
 themselves
 then
 thence
 there
 thereafter
 thereby
 thered
 therefore
 therein
 there'll
 thereof
 therere
 theres
 thereto
 thereupon
 there've
 these
 they
 theyd
 they'll
 theyre
 they've
 think
 this
 those
 thou
 though
 thoughh
 thousand
 throug
 through
 throughout
 thru
 thus
 til
 tip
 to
 together
 too
 took
 toward
 towards
 tried
 tries
 truly
 try
 trying
 ts
 twice
 two
 u
 un
 under
 unfortunately
 unless
 unlike
 unlikely
 until
 unto
 up
 upon
 ups
 us
 use
 used
 useful
 usefully
 usefulness
 uses
 using
 usually
 v
 value
 various
 've
 very
 via
 viz
 vol
 vols
 vs
 w
 want
 wants
 was
 wasnt
 way
 we
 wed
 welcome
 we'll
 went
 were
 werent
 we've
 what
 whatever
 what'll
 whats
 when
 whence
 whenever
 where
 whereafter
 whereas
 whereby
 wherein
 wheres
 whereupon
 wherever
 whether
 which
 while
 whim
 whither
 who
 whod
 whoever
 whole
 who'll
 whom
 whomever
 whos
 whose
 why
 widely
 willing
 wish
 with
 within
 without
 wont
 words
 world
 would
 wouldnt
 www
 x
 y
 yes
 yet
 you
 youd
 you'll
 your
 youre
 yours
 yourself
 yourselves
 you've
 z
 zero
--- a/src/Sibe/NaiveBayes.hs
+++ b/src/Sibe/NaiveBayes.hs
@@ -12,6 +12,9 @@ module Sibe.NaiveBayes
   stdev,
   cleanText,
   cleanDocuments,
   ngram,
   removeWords,
   removeStopwords,
  )
  where
    import Data.List
@@ -21,7 +24,7 @@ module Sibe.NaiveBayes
    import Data.Maybe
    import Control.Arrow ((&&&))
    import Text.Regex.PCRE
-    import Data.Char (isSpace)
+    import Data.Char (isSpace, isNumber)
    import NLP.Stemmer
    type Class = Int;
@@ -36,6 +39,7 @@ module Sibe.NaiveBayes
                 , megadoc    :: String
                 , cd         :: [(Class, [Document])]
                 , cw         :: [(Class, [(String, Int)])]
                 , cgram      :: [(Class, [(String, Int)])]
                 } deriving (Eq, Show, Read)
    train :: [Document] -> [Class] -> NB
@@ -49,7 +53,9 @@ module Sibe.NaiveBayes
          cd = zip classes (map classDocs classes)
          -- (class, [(word, count)])
-          cw = zip classes $ l (map classWordsCounts classes)
+          cw = zip classes $ map classWordsCounts classes
          cgram = zip classes $ map classNGramCounts classes
      in NB { documents  = documents
            , classes    = cls
@@ -57,44 +63,60 @@ module Sibe.NaiveBayes
            , megadoc    = megadoc
            , cd         = cd
            , cw         = cw
            , cgram      = cgram
            }
      where
        concatDocs = concatMap (\(Document text _) -> text ++ " ")
        classDocs x = filter ((==x) . c) documents
-        classMegadoc x = concatMap (\(Document text _) -> text ++ " ") (classDocs x)
+        classMegadoc = concatMap (\(Document text _) -> text ++ " ") . classDocs
-        classWords x = words (classMegadoc x)
+        classWords = words . classMegadoc
-        classNGram n = ngram n . classMegadoc
+        classNGram = concatMap (\(Document text _) -> text ++ " ") . ngram 2 . classDocs
-        classVocabulary x = ordNub (classWords x)
+        classNGramWords = words . classNGram
        classVocabulary = ordNub . classWords
        classPrior x = genericLength (classDocs x) / genericLength documents
        countWordInDoc d w = genericLength (filter (==w) d)
-        classWordsCounts x =
+        wordsCount ws voc =
-          let voc = classVocabulary x
+          zip voc $ map (countWordInDoc ws) voc
-          in zip voc $ map (countWordInDoc (classWords x)) voc
+        classWordsCounts x = wordsCount (classWords x) (classVocabulary x)
        classNGramCounts x = wordsCount (classNGramWords x) (ordNub $ classNGramWords x)
-    ngram :: Int -> String -> [String]
+    ngram :: Int -> [Document] -> [Document]
-    ngram n text =
+    ngram n documents =
      map (\(Document text c) -> Document (helper text) c) documents
      where
        helper text =
          let ws = words text
-      in map (\(i, w) -> unwords $ w:((take (n - 1) . drop (i+1)) ws)) (zip [0..] ws)
+              pairs = zip [0..] ws
              grams = map (\(i, w) -> concat . intersperse "_" $ w:((take (n - 1) . drop (i+1)) ws)) pairs
          in unwords ("<b>":grams)
-    run :: String -> NB -> Class
+    run :: String -> NB -> (Class, Double)
-    run text (NB documents classes vocabulary megadoc cd cw) =
+    run txt (NB documents classes vocabulary megadoc cd cw cgram) =
      let scores = map (score . fst) classes
-      in argmax scores
+          index = argmax scores
          m = maximum scores
          confidence = m / sum scores
      in (index, 0)
      where
        score c =
          let prior = snd (classes !! c)
-          in prior * product (map (prob c) (words text))
+
          -- below is the formula according to Multinominal Naive Bayes, but it seems
          -- using a uniform prior probability seems to work better
          -- in prior * product (map (prob c) (words txt))
          in product (map (prob c) (words txt))
        prob c w =
-          let tctM = find ((==w) . fst) (snd (cw !! c))
+          let tctM = find ((== w) . fst) (snd (cw !! c))
-              tct  = (snd . fromJust) tctM
+              tct  = if isJust tctM then (snd . fromJust) tctM else 0
-              cvoc = (genericLength . snd) (cw !! c)
+              cvoc = sum $ map snd (snd (cw !! c))
              voc  = vocabulary
-          in
+              gram = find ((==w) . last . splitOn "_" . fst) (snd (cgram !! c))
-            if isJust tctM then
+              pg   = if isJust gram then (snd . fromJust) gram else 0
-              realToFrac (tct + 1) / realToFrac (cvoc + voc)
+          -- in realToFrac (tct * pg + 1) / realToFrac (cvoc + voc) -- uncomment to enable ngrams
-            else
+          in realToFrac (tct + 1) / realToFrac (cvoc + voc)
              1 / realToFrac (cvoc + voc)
    argmax :: (Ord a) => [a] -> Int
    argmax x = fst $ maximumBy (\(_, a) (_, b) -> a `compare` b) (zip [0..] x)
@@ -113,7 +135,8 @@ module Sibe.NaiveBayes
      let puncs = filter (`notElem` ['!', '"', '#', '$', '%', '(', ')', '.', '?']) (trim string)
          spacify = foldl (\acc x -> replace x ' ' acc) puncs [',', '/', '-', '\n', '\r']
          stemmed = unwords $ map (stem Porter) (words spacify)
-      in stemmed
+          nonumber = filter (not . isNumber) stemmed
      in (unwords . words) nonumber
      where
        trim = f . f
          where
@@ -124,11 +147,21 @@ module Sibe.NaiveBayes
    cleanDocuments :: [Document] -> [Document]
    cleanDocuments documents =
      let cleaned = map (\(Document text c) -> Document (cleanText text) c) documents
-          wc = wordCounts (concatDocs cleaned)
+      in cleaned
    removeWords :: [String] -> [Document] -> [Document]
    removeWords ws documents =
      map (\(Document text c) -> Document (rm ws text) c) documents
      where
          rm list text =
            unwords $ filter (`notElem` list) (words text)
    removeStopwords :: Int -> [Document] -> [Document]
    removeStopwords i documents =
      let wc = wordCounts (concatDocs documents)
          wlist = sortBy (\(_, a) (_, b) -> b `compare` a) wc
-          stopwords = l $ map fst (take 30 wlist)
+          stopwords = map fst (take i wlist)
-          wstopwords = map (\(Document text c) -> Document (removeWords stopwords text) c) cleaned
+      in removeWords stopwords documents
      in wstopwords
      where
        vocabulary x = ordNub (words x)
        countWordInDoc d w = genericLength (filter (==w) d)
@@ -136,9 +169,6 @@ module Sibe.NaiveBayes
          let voc = vocabulary x
          in zip voc $ map (countWordInDoc (words x)) voc
        removeWords list text =
          unwords $ filter (`notElem` list) (words text)
        concatDocs = concatMap (\(Document text _) -> text ++ " ")
    l :: (Show a) => a -> a
@@ -151,37 +181,38 @@ module Sibe.NaiveBayes
        go s (x:xs) = if x `Set.member` s then go s xs
                                          else x : go (Set.insert x s) xs
-    accuracy :: [(Int, Int)] -> Double
+    accuracy :: [(Int, (Int, Double))] -> Double
    accuracy results =
-      let correct = filter (uncurry (==)) results
+      let pairs = map (\(a, b) -> (a, fst b)) results
          correct = filter (uncurry (==)) pairs
      in genericLength correct / genericLength results
-    recall :: [(Int, Int)] -> Double
+    recall :: [(Int, (Int, Double))] -> Double
    recall results =
      let classes = ordNub (map fst results)
          s = sum (map rec classes) / genericLength classes
      in s
      where
        rec a =
-          let t = genericLength $ filter (\(c, r) -> c == r && c == a) results
+          let t = genericLength $ filter (\(c, (r, _)) -> c == r && c == a) results
-              y = genericLength $ filter (\(c, r) -> c == a) results
+              y = genericLength $ filter (\(c, (r, _)) -> c == a) results
          in t / y
-    precision :: [(Int, Int)] -> Double
+    precision :: [(Int, (Int, Double))] -> Double
    precision results =
      let classes = ordNub (map fst results)
          s = sum (map prec classes) / genericLength classes
      in s
      where
        prec a =
-          let t = genericLength $ filter (\(c, r) -> c == r && c == a) results
+          let t = genericLength $ filter (\(c, (r, _)) -> c == r && c == a) results
-              y = genericLength $ filter (\(c, r) -> r == a) results
+              y = genericLength $ filter (\(c, (r, _)) -> r == a) results
          in
            if y == 0
              then 0
              else t / y
-    fmeasure :: [(Int, Int)] -> Double
+    fmeasure :: [(Int, (Int, Double))] -> Double
    fmeasure results =
      let r = recall results
          p = precision results