feat(stopwords): removeWords and removeStopwords functions as pre-processors
feat(confidence, WIP): calculate confidence of each classification
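A minimal sketch of how the two new pre-processors compose, taken from the usage in Main below (`sws` is the word list read from the new examples/stopwords file):

-- strip an explicit word list, then normalize and stem:
documents = cleanDocuments $ removeWords sws $ createDocuments classes dataset
-- or drop the i most frequent words across the corpus instead:
-- documents = removeStopwords 30 (createDocuments classes dataset)

run now returns a (Class, Double) pair, where the Double is meant to be the confidence of the classification (the winning score divided by the sum of all scores); as of this commit it is still a placeholder 0.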
This commit is contained in:
parent ea1f05f001 / commit 099c25e166
@@ -14,26 +14,20 @@ module Main
test <- readFile "examples/doc-classifier-data/data-reuters-test"

classes <- map (filter (/= ' ')) . lines <$> readFile "examples/doc-classifier-data/data-classes"
sws <- lines <$> readFile "examples/stopwords"

let intClasses = [0..length classes - 1]
-- let intClasses = [0, 1]
documents = cleanDocuments $ createDocuments classes dataset
-- documents = [Document "Chinese Beijing Chinese" 0,
--              Document "Chinese Chinese Shanghai" 0,
--              Document "Chinese Macao" 0,
--              Document "Japan Tokyo Chinese" 1]
-- testDocuments = [Document "Chinese Chinese Chinese Japan Tokyo" 0]
documents = cleanDocuments $ removeWords sws $ createDocuments classes dataset
testDocuments = cleanDocuments $ createDocuments classes test
devTestDocuments = take 30 testDocuments
-- devTestDocuments = [Document "Chinese Chinese Chinese Tokyo Japan" 0]
nb = train documents intClasses

results = map (\(Document text c) -> (c, run text nb)) testDocuments
-- results = map (\(Document text c) -> (c, run text nb)) devTestDocuments

-- print (text $ head documents)
print (text $ head documents)

let showResults (c, r) = putStrLn (classes !! c ++ " ~ " ++ classes !! r)
let showResults (c, (r, confidence)) = putStrLn (classes !! c ++ " ~ " ++ classes !! r)
mapM_ showResults results

putStrLn $ "Recall: " ++ show (recall results)
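Note that the new showResults binds confidence but does not print it yet. A minimal sketch of how it could surface the value once the WIP confidence calculation is finished (the output format here is an assumption, not part of this commit):

let showResults (c, (r, confidence)) =
      putStrLn (classes !! c ++ " ~ " ++ classes !! r ++ " (" ++ show confidence ++ ")")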
examples/stopwords (new file, 665 lines)
@@ -0,0 +1,665 @@
a
able
about
above
abst
accordance
according
accordingly
across
act
actually
added
adj
affected
affecting
affects
after
afterwards
again
against
ah
all
almost
alone
along
already
also
although
always
am
among
amongst
an
and
announce
another
any
anybody
anyhow
anymore
anyone
anything
anyway
anyways
anywhere
apparently
approximately
are
aren
arent
arise
around
as
aside
ask
asking
at
auth
available
away
awfully
b
back
be
became
because
become
becomes
becoming
been
before
beforehand
begin
beginning
beginnings
begins
behind
being
believe
below
beside
besides
between
beyond
biol
both
brief
briefly
but
by
c
ca
came
can
cannot
can't
cause
causes
certain
certainly
co
com
come
comes
contain
containing
contains
could
couldnt
d
date
did
didn't
different
do
does
doesn't
doing
done
don't
down
downwards
due
during
e
each
ed
edu
effect
eg
eight
eighty
either
else
elsewhere
end
ending
enough
especially
et
et-al
etc
even
ever
every
everybody
everyone
everything
everywhere
ex
except
f
far
few
ff
fifth
first
five
fix
followed
following
follows
for
former
formerly
forth
found
four
from
further
furthermore
g
gave
get
gets
getting
give
given
gives
giving
go
goes
gone
got
gotten
h
had
happens
hardly
has
hasn't
have
haven't
having
he
hed
hence
her
here
hereafter
hereby
herein
heres
hereupon
hers
herself
hes
hi
hid
him
himself
his
hither
home
how
howbeit
however
hundred
i
id
ie
if
i'll
im
immediate
immediately
importance
important
in
inc
indeed
index
information
instead
into
invention
inward
is
isn't
it
itd
it'll
its
itself
i've
j
just
k
keep
keeps
kept
kg
km
know
known
knows
l
largely
last
lately
later
latter
latterly
least
less
lest
let
lets
like
liked
likely
line
little
'll
look
looking
looks
ltd
m
made
mainly
make
makes
many
may
maybe
me
mean
means
meantime
meanwhile
merely
mg
might
million
miss
ml
more
moreover
most
mostly
mr
mrs
much
mug
must
my
myself
n
na
name
namely
nay
nd
near
nearly
necessarily
necessary
need
needs
neither
never
nevertheless
new
next
nine
ninety
no
nobody
non
none
nonetheless
noone
nor
normally
nos
not
noted
nothing
now
nowhere
o
obtain
obtained
obviously
of
off
often
oh
ok
okay
old
omitted
on
once
one
ones
only
onto
or
ord
other
others
otherwise
ought
our
ours
ourselves
out
outside
over
overall
owing
own
p
page
pages
part
particular
particularly
past
per
perhaps
placed
please
plus
poorly
possible
possibly
potentially
pp
predominantly
present
previously
primarily
probably
promptly
proud
provides
put
q
que
quickly
quite
qv
r
ran
rather
rd
re
readily
really
recent
recently
ref
refs
regarding
regardless
regards
related
relatively
research
respectively
resulted
resulting
results
right
run
s
said
same
saw
say
saying
says
sec
section
see
seeing
seem
seemed
seeming
seems
seen
self
selves
sent
seven
several
shall
she
shed
she'll
shes
should
shouldn't
show
showed
shown
showns
shows
significant
significantly
similar
similarly
since
six
slightly
so
some
somebody
somehow
someone
somethan
something
sometime
sometimes
somewhat
somewhere
soon
sorry
specifically
specified
specify
specifying
still
stop
strongly
sub
substantially
successfully
such
sufficiently
suggest
sup
sure
t
take
taken
taking
tell
tends
th
than
thank
thanks
thanx
that
that'll
thats
that've
the
their
theirs
them
themselves
then
thence
there
thereafter
thereby
thered
therefore
therein
there'll
thereof
therere
theres
thereto
thereupon
there've
these
they
theyd
they'll
theyre
they've
think
this
those
thou
though
thoughh
thousand
throug
through
throughout
thru
thus
til
tip
to
together
too
took
toward
towards
tried
tries
truly
try
trying
ts
twice
two
u
un
under
unfortunately
unless
unlike
unlikely
until
unto
up
upon
ups
us
use
used
useful
usefully
usefulness
uses
using
usually
v
value
various
've
very
via
viz
vol
vols
vs
w
want
wants
was
wasnt
way
we
wed
welcome
we'll
went
were
werent
we've
what
whatever
what'll
whats
when
whence
whenever
where
whereafter
whereas
whereby
wherein
wheres
whereupon
wherever
whether
which
while
whim
whither
who
whod
whoever
whole
who'll
whom
whomever
whos
whose
why
widely
willing
wish
with
within
without
wont
words
world
would
wouldnt
www
x
y
yes
yet
you
youd
you'll
your
youre
yours
yourself
yourselves
you've
z
zero
@@ -12,6 +12,9 @@ module Sibe.NaiveBayes
stdev,
cleanText,
cleanDocuments,
ngram,
removeWords,
removeStopwords,
)
where
import Data.List
@@ -21,7 +24,7 @@ module Sibe.NaiveBayes
import Data.Maybe
import Control.Arrow ((&&&))
import Text.Regex.PCRE
import Data.Char (isSpace)
import Data.Char (isSpace, isNumber)
import NLP.Stemmer

type Class = Int;
@@ -36,6 +39,7 @@ module Sibe.NaiveBayes
, megadoc :: String
, cd :: [(Class, [Document])]
, cw :: [(Class, [(String, Int)])]
, cgram :: [(Class, [(String, Int)])]
} deriving (Eq, Show, Read)

train :: [Document] -> [Class] -> NB
@@ -49,7 +53,9 @@ module Sibe.NaiveBayes
cd = zip classes (map classDocs classes)

-- (class, [(word, count)])
cw = zip classes $ l (map classWordsCounts classes)
cw = zip classes $ map classWordsCounts classes

cgram = zip classes $ map classNGramCounts classes

in NB { documents = documents
, classes = cls
@@ -57,44 +63,60 @@ module Sibe.NaiveBayes
, megadoc = megadoc
, cd = cd
, cw = cw
, cgram = cgram
}
where
concatDocs = concatMap (\(Document text _) -> text ++ " ")

classDocs x = filter ((==x) . c) documents
classMegadoc x = concatMap (\(Document text _) -> text ++ " ") (classDocs x)
classWords x = words (classMegadoc x)
classNGram n = ngram n . classMegadoc
classVocabulary x = ordNub (classWords x)
classMegadoc = concatMap (\(Document text _) -> text ++ " ") . classDocs
classWords = words . classMegadoc
classNGram = concatMap (\(Document text _) -> text ++ " ") . ngram 2 . classDocs
classNGramWords = words . classNGram
classVocabulary = ordNub . classWords
classPrior x = genericLength (classDocs x) / genericLength documents
countWordInDoc d w = genericLength (filter (==w) d)
classWordsCounts x =
let voc = classVocabulary x
in zip voc $ map (countWordInDoc (classWords x)) voc
wordsCount ws voc =
zip voc $ map (countWordInDoc ws) voc
classWordsCounts x = wordsCount (classWords x) (classVocabulary x)
classNGramCounts x = wordsCount (classNGramWords x) (ordNub $ classNGramWords x)

ngram :: Int -> String -> [String]
ngram n text =
ngram :: Int -> [Document] -> [Document]
ngram n documents =
map (\(Document text c) -> Document (helper text) c) documents
where
helper text =
let ws = words text
in map (\(i, w) -> unwords $ w:((take (n - 1) . drop (i+1)) ws)) (zip [0..] ws)
pairs = zip [0..] ws
grams = map (\(i, w) -> concat . intersperse "_" $ w:((take (n - 1) . drop (i+1)) ws)) pairs
in unwords ("<b>":grams)
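For reference, a quick trace of what the rewritten ngram produces for n = 2 (input text made up for illustration; "<b>" marks the beginning of a document):

-- ngram 2 [Document "new york times" 0]
--   == [Document "<b> new_york york_times times" 0]
-- the final word stays unpaired because `take (n - 1) . drop (i + 1)` runs past the end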
run :: String -> NB -> Class
run text (NB documents classes vocabulary megadoc cd cw) =
run :: String -> NB -> (Class, Double)
run txt (NB documents classes vocabulary megadoc cd cw cgram) =
let scores = map (score . fst) classes
in argmax scores
index = argmax scores
m = maximum scores
confidence = m / sum scores
in (index, 0)
where
score c =
let prior = snd (classes !! c)
in prior * product (map (prob c) (words text))

-- below is the formula according to Multinomial Naive Bayes, but
-- using a uniform prior probability seems to work better
-- in prior * product (map (prob c) (words txt))

in product (map (prob c) (words txt))

prob c w =
let tctM = find ((==w) . fst) (snd (cw !! c))
tct = (snd . fromJust) tctM
cvoc = (genericLength . snd) (cw !! c)
let tctM = find ((== w) . fst) (snd (cw !! c))
tct = if isJust tctM then (snd . fromJust) tctM else 0
cvoc = sum $ map snd (snd (cw !! c))
voc = vocabulary
in
if isJust tctM then
realToFrac (tct + 1) / realToFrac (cvoc + voc)
else
1 / realToFrac (cvoc + voc)
gram = find ((==w) . last . splitOn "_" . fst) (snd (cgram !! c))
pg = if isJust gram then (snd . fromJust) gram else 0
-- in realToFrac (tct * pg + 1) / realToFrac (cvoc + voc) -- uncomment to enable ngrams
in realToFrac (tct + 1) / realToFrac (cvoc + voc)

argmax :: (Ord a) => [a] -> Int
argmax x = fst $ maximumBy (\(_, a) (_, b) -> a `compare` b) (zip [0..] x)
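The bindings above compute confidence as the winning score's share of the total (m / sum scores), but the WIP code still returns (index, 0). A sketch of the presumably intended return once finished:

-- in (index, confidence)

Since each score is a product of many small per-word probabilities, raw scores underflow on long documents; summing log-probabilities and normalizing with exp would be a more stable basis for this confidence value.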
@@ -113,7 +135,8 @@ module Sibe.NaiveBayes
let puncs = filter (`notElem` ['!', '"', '#', '$', '%', '(', ')', '.', '?']) (trim string)
spacify = foldl (\acc x -> replace x ' ' acc) puncs [',', '/', '-', '\n', '\r']
stemmed = unwords $ map (stem Porter) (words spacify)
in stemmed
nonumber = filter (not . isNumber) stemmed
in (unwords . words) nonumber
where
trim = f . f
where
@@ -124,11 +147,21 @@ module Sibe.NaiveBayes
cleanDocuments :: [Document] -> [Document]
cleanDocuments documents =
let cleaned = map (\(Document text c) -> Document (cleanText text) c) documents
wc = wordCounts (concatDocs cleaned)
in cleaned

removeWords :: [String] -> [Document] -> [Document]
removeWords ws documents =
map (\(Document text c) -> Document (rm ws text) c) documents
where
rm list text =
unwords $ filter (`notElem` list) (words text)

removeStopwords :: Int -> [Document] -> [Document]
removeStopwords i documents =
let wc = wordCounts (concatDocs documents)
wlist = sortBy (\(_, a) (_, b) -> b `compare` a) wc
stopwords = l $ map fst (take 30 wlist)
wstopwords = map (\(Document text c) -> Document (removeWords stopwords text) c) cleaned
in wstopwords
stopwords = map fst (take i wlist)
in removeWords stopwords documents
where
vocabulary x = ordNub (words x)
countWordInDoc d w = genericLength (filter (==w) d)
@@ -136,9 +169,6 @@ module Sibe.NaiveBayes
let voc = vocabulary x
in zip voc $ map (countWordInDoc (words x)) voc

removeWords list text =
unwords $ filter (`notElem` list) (words text)

concatDocs = concatMap (\(Document text _) -> text ++ " ")
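A small usage sketch of the reworked removeStopwords, which counts words across the whole corpus, takes the i most frequent ones as stopwords, and delegates to removeWords (corpus made up for illustration):

-- removeStopwords 1 [Document "the cat" 0, Document "the dog" 1]
--   == [Document "cat" 0, Document "dog" 1]   -- "the" occurs most often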
l :: (Show a) => a -> a
@@ -151,37 +181,38 @@ module Sibe.NaiveBayes
go s (x:xs) = if x `Set.member` s then go s xs
else x : go (Set.insert x s) xs

accuracy :: [(Int, Int)] -> Double
accuracy :: [(Int, (Int, Double))] -> Double
accuracy results =
let correct = filter (uncurry (==)) results
let pairs = map (\(a, b) -> (a, fst b)) results
correct = filter (uncurry (==)) pairs
in genericLength correct / genericLength results

recall :: [(Int, Int)] -> Double
recall :: [(Int, (Int, Double))] -> Double
recall results =
let classes = ordNub (map fst results)
s = sum (map rec classes) / genericLength classes
in s
where
rec a =
let t = genericLength $ filter (\(c, r) -> c == r && c == a) results
y = genericLength $ filter (\(c, r) -> c == a) results
let t = genericLength $ filter (\(c, (r, _)) -> c == r && c == a) results
y = genericLength $ filter (\(c, (r, _)) -> c == a) results
in t / y

precision :: [(Int, Int)] -> Double
precision :: [(Int, (Int, Double))] -> Double
precision results =
let classes = ordNub (map fst results)
s = sum (map prec classes) / genericLength classes
in s
where
prec a =
let t = genericLength $ filter (\(c, r) -> c == r && c == a) results
y = genericLength $ filter (\(c, r) -> r == a) results
let t = genericLength $ filter (\(c, (r, _)) -> c == r && c == a) results
y = genericLength $ filter (\(c, (r, _)) -> r == a) results
in
if y == 0
then 0
else t / y

fmeasure :: [(Int, Int)] -> Double
fmeasure :: [(Int, (Int, Double))] -> Double
fmeasure results =
let r = recall results
p = precision results