feat(stopwords): removeWords and removeStopwords functions as pre-processors

feat(confidence, WIP): calculate confidence of each classification
Mahdi Dibaiee 2016-08-08 10:02:26 +04:30
parent ea1f05f001
commit 099c25e166
3 changed files with 742 additions and 52 deletions


@@ -14,26 +14,20 @@ module Main
test <- readFile "examples/doc-classifier-data/data-reuters-test"
classes <- map (filter (/= ' ')) . lines <$> readFile "examples/doc-classifier-data/data-classes"
sws <- lines <$> readFile "examples/stopwords"
let intClasses = [0..length classes - 1]
-- let intClasses = [0, 1]
documents = cleanDocuments $ createDocuments classes dataset
-- documents = [Document "Chinese Beijing Chinese" 0,
-- Document "Chinese Chinese Shanghai" 0,
-- Document "Chinese Macao" 0,
-- Document "Japan Tokyo Chinese" 1]
-- testDocuments = [Document "Chinese Chinese Chinese Japan Tokyo" 0]
documents = cleanDocuments $ removeWords sws $ createDocuments classes dataset
testDocuments = cleanDocuments $ createDocuments classes test
devTestDocuments = take 30 testDocuments
-- devTestDocuments = [Document "Chinese Chinese Chinese Tokyo Japan" 0]
nb = train documents intClasses
results = map (\(Document text c) -> (c, run text nb)) testDocuments
-- results = map (\(Document text c) -> (c, run text nb)) devTestDocuments
-- print (text $ head documents)
print (text $ head documents)
let showResults (c, r) = putStrLn (classes !! c ++ " ~ " ++ classes !! r)
let showResults (c, (r, confidence)) = putStrLn (classes !! c ++ " ~ " ++ classes !! r)
mapM_ showResults results
putStrLn $ "Recall: " ++ show (recall results)
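For context, the pipeline above now strips stop words before cleaning. A minimal sketch of that composition, assuming createDocuments :: [String] -> String -> [Document] as used in Main:

    -- sketch only: stop-word removal runs before cleanDocuments
    preprocess :: [String] -> [String] -> String -> [Document]
    preprocess sws classes raw =
      cleanDocuments (removeWords sws (createDocuments classes raw))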

examples/stopwords (new file, 665 lines)

@@ -0,0 +1,665 @@
a
able
about
above
abst
accordance
according
accordingly
across
act
actually
added
adj
affected
affecting
affects
after
afterwards
again
against
ah
all
almost
alone
along
already
also
although
always
am
among
amongst
an
and
announce
another
any
anybody
anyhow
anymore
anyone
anything
anyway
anyways
anywhere
apparently
approximately
are
aren
arent
arise
around
as
aside
ask
asking
at
auth
available
away
awfully
b
back
be
became
because
become
becomes
becoming
been
before
beforehand
begin
beginning
beginnings
begins
behind
being
believe
below
beside
besides
between
beyond
biol
both
brief
briefly
but
by
c
ca
came
can
cannot
can't
cause
causes
certain
certainly
co
com
come
comes
contain
containing
contains
could
couldnt
d
date
did
didn't
different
do
does
doesn't
doing
done
don't
down
downwards
due
during
e
each
ed
edu
effect
eg
eight
eighty
either
else
elsewhere
end
ending
enough
especially
et
et-al
etc
even
ever
every
everybody
everyone
everything
everywhere
ex
except
f
far
few
ff
fifth
first
five
fix
followed
following
follows
for
former
formerly
forth
found
four
from
further
furthermore
g
gave
get
gets
getting
give
given
gives
giving
go
goes
gone
got
gotten
h
had
happens
hardly
has
hasn't
have
haven't
having
he
hed
hence
her
here
hereafter
hereby
herein
heres
hereupon
hers
herself
hes
hi
hid
him
himself
his
hither
home
how
howbeit
however
hundred
i
id
ie
if
i'll
im
immediate
immediately
importance
important
in
inc
indeed
index
information
instead
into
invention
inward
is
isn't
it
itd
it'll
its
itself
i've
j
just
k
keep keeps
kept
kg
km
know
known
knows
l
largely
last
lately
later
latter
latterly
least
less
lest
let
lets
like
liked
likely
line
little
'll
look
looking
looks
ltd
m
made
mainly
make
makes
many
may
maybe
me
mean
means
meantime
meanwhile
merely
mg
might
million
miss
ml
more
moreover
most
mostly
mr
mrs
much
mug
must
my
myself
n
na
name
namely
nay
nd
near
nearly
necessarily
necessary
need
needs
neither
never
nevertheless
new
next
nine
ninety
no
nobody
non
none
nonetheless
noone
nor
normally
nos
not
noted
nothing
now
nowhere
o
obtain
obtained
obviously
of
off
often
oh
ok
okay
old
omitted
on
once
one
ones
only
onto
or
ord
other
others
otherwise
ought
our
ours
ourselves
out
outside
over
overall
owing
own
p
page
pages
part
particular
particularly
past
per
perhaps
placed
please
plus
poorly
possible
possibly
potentially
pp
predominantly
present
previously
primarily
probably
promptly
proud
provides
put
q
que
quickly
quite
qv
r
ran
rather
rd
re
readily
really
recent
recently
ref
refs
regarding
regardless
regards
related
relatively
research
respectively
resulted
resulting
results
right
run
s
said
same
saw
say
saying
says
sec
section
see
seeing
seem
seemed
seeming
seems
seen
self
selves
sent
seven
several
shall
she
shed
she'll
shes
should
shouldn't
show
showed
shown
showns
shows
significant
significantly
similar
similarly
since
six
slightly
so
some
somebody
somehow
someone
somethan
something
sometime
sometimes
somewhat
somewhere
soon
sorry
specifically
specified
specify
specifying
still
stop
strongly
sub
substantially
successfully
such
sufficiently
suggest
sup
sure t
take
taken
taking
tell
tends
th
than
thank
thanks
thanx
that
that'll
thats
that've
the
their
theirs
them
themselves
then
thence
there
thereafter
thereby
thered
therefore
therein
there'll
thereof
therere
theres
thereto
thereupon
there've
these
they
theyd
they'll
theyre
they've
think
this
those
thou
though
thoughh
thousand
throug
through
throughout
thru
thus
til
tip
to
together
too
took
toward
towards
tried
tries
truly
try
trying
ts
twice
two
u
un
under
unfortunately
unless
unlike
unlikely
until
unto
up
upon
ups
us
use
used
useful
usefully
usefulness
uses
using
usually
v
value
various
've
very
via
viz
vol
vols
vs
w
want
wants
was
wasnt
way
we
wed
welcome
we'll
went
were
werent
we've
what
whatever
what'll
whats
when
whence
whenever
where
whereafter
whereas
whereby
wherein
wheres
whereupon
wherever
whether
which
while
whim
whither
who
whod
whoever
whole
who'll
whom
whomever
whos
whose
why
widely
willing
wish
with
within
without
wont
words
world
would
wouldnt
www
x
y
yes
yet
you
youd
you'll
your
youre
yours
yourself
yourselves
you've
z
zero


@@ -12,6 +12,9 @@ module Sibe.NaiveBayes
stdev,
cleanText,
cleanDocuments,
ngram,
removeWords,
removeStopwords,
)
where
import Data.List
@@ -21,7 +24,7 @@ module Sibe.NaiveBayes
import Data.Maybe
import Control.Arrow ((&&&))
import Text.Regex.PCRE
import Data.Char (isSpace)
import Data.Char (isSpace, isNumber)
import NLP.Stemmer
type Class = Int;
@@ -36,6 +39,7 @@ module Sibe.NaiveBayes
, megadoc :: String
, cd :: [(Class, [Document])]
, cw :: [(Class, [(String, Int)])]
, cgram :: [(Class, [(String, Int)])]
} deriving (Eq, Show, Read)
train :: [Document] -> [Class] -> NB
@@ -49,7 +53,9 @@ module Sibe.NaiveBayes
cd = zip classes (map classDocs classes)
-- (class, [(word, count)])
cw = zip classes $ l (map classWordsCounts classes)
cw = zip classes $ map classWordsCounts classes
cgram = zip classes $ map classNGramCounts classes
in NB { documents = documents
, classes = cls
@@ -57,44 +63,60 @@ module Sibe.NaiveBayes
, megadoc = megadoc
, cd = cd
, cw = cw
, cgram = cgram
}
where
concatDocs = concatMap (\(Document text _) -> text ++ " ")
classDocs x = filter ((==x) . c) documents
classMegadoc x = concatMap (\(Document text _) -> text ++ " ") (classDocs x)
classWords x = words (classMegadoc x)
classNGram n = ngram n . classMegadoc
classVocabulary x = ordNub (classWords x)
classMegadoc = concatMap (\(Document text _) -> text ++ " ") . classDocs
classWords = words . classMegadoc
classNGram = concatMap (\(Document text _) -> text ++ " ") . ngram 2 . classDocs
classNGramWords = words . classNGram
classVocabulary = ordNub . classWords
classPrior x = genericLength (classDocs x) / genericLength documents
countWordInDoc d w = genericLength (filter (==w) d)
classWordsCounts x =
let voc = classVocabulary x
in zip voc $ map (countWordInDoc (classWords x)) voc
wordsCount ws voc =
zip voc $ map (countWordInDoc ws) voc
classWordsCounts x = wordsCount (classWords x) (classVocabulary x)
classNGramCounts x = wordsCount (classNGramWords x) (ordNub $ classNGramWords x)
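The per-class counting is now factored through wordsCount; a quick illustration of what it returns:

    -- wordsCount ["a","b","a"] ["a","b"]  ==  [("a",2),("b",1)]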
ngram :: Int -> String -> [String]
ngram n text =
ngram :: Int -> [Document] -> [Document]
ngram n documents =
map (\(Document text c) -> Document (helper text) c) documents
where
helper text =
let ws = words text
in map (\(i, w) -> unwords $ w:((take (n - 1) . drop (i+1)) ws)) (zip [0..] ws)
pairs = zip [0..] ws
grams = map (\(i, w) -> concat . intersperse "_" $ w:((take (n - 1) . drop (i+1)) ws)) pairs
in unwords ("<b>":grams)
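ngram now rewrites whole Documents, joining each word to its n-1 successors with underscores and prepending a "<b>" start marker; for example:

    -- ngram 2 [Document "new york times" 0]
    --   == [Document "<b> new_york york_times times" 0]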
run :: String -> NB -> Class
run text (NB documents classes vocabulary megadoc cd cw) =
run :: String -> NB -> (Class, Double)
run txt (NB documents classes vocabulary megadoc cd cw cgram) =
let scores = map (score . fst) classes
in argmax scores
index = argmax scores
m = maximum scores
confidence = m / sum scores
in (index, confidence) -- return the computed confidence instead of discarding it
where
score c =
let prior = snd (classes !! c)
in prior * product (map (prob c) (words text))
-- below is the formula according to Multinomial Naive Bayes, but
-- using a uniform prior probability seems to work better
-- in prior * product (map (prob c) (words txt))
in product (map (prob c) (words txt))
prob c w =
let tctM = find ((==w) . fst) (snd (cw !! c))
tct = (snd . fromJust) tctM
cvoc = (genericLength . snd) (cw !! c)
let tctM = find ((== w) . fst) (snd (cw !! c))
tct = if isJust tctM then (snd . fromJust) tctM else 0
cvoc = sum $ map snd (snd (cw !! c))
voc = vocabulary
in
if isJust tctM then
realToFrac (tct + 1) / realToFrac (cvoc + voc)
else
1 / realToFrac (cvoc + voc)
gram = find ((==w) . last . splitOn "_" . fst) (snd (cgram !! c))
pg = if isJust gram then (snd . fromJust) gram else 0
-- in realToFrac (tct * pg + 1) / realToFrac (cvoc + voc) -- uncomment to enable ngrams
in realToFrac (tct + 1) / realToFrac (cvoc + voc)
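prob is add-one (Laplace) smoothing: with tct the count of w in class c, cvoc the total word count of the class, and voc the global vocabulary size,

    P(w | c) = (tct + 1) / (cvoc + voc)

so a word unseen in the class contributes a small non-zero factor instead of zeroing the product.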
argmax :: (Ord a) => [a] -> Int
argmax x = fst $ maximumBy (\(_, a) (_, b) -> a `compare` b) (zip [0..] x)
@@ -113,7 +135,8 @@ module Sibe.NaiveBayes
let puncs = filter (`notElem` ['!', '"', '#', '$', '%', '(', ')', '.', '?']) (trim string)
spacify = foldl (\acc x -> replace x ' ' acc) puncs [',', '/', '-', '\n', '\r']
stemmed = unwords $ map (stem Porter) (words spacify)
in stemmed
nonumber = filter (not . isNumber) stemmed
in (unwords . words) nonumber
where
trim = f . f
where
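The added steps drop digits and collapse repeated whitespace after stemming; roughly, assuming the Porter stemmer maps cats -> cat and running -> run:

    -- cleanText "3 cats, running" ~> "cat run"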
@@ -124,11 +147,21 @@ module Sibe.NaiveBayes
cleanDocuments :: [Document] -> [Document]
cleanDocuments documents =
let cleaned = map (\(Document text c) -> Document (cleanText text) c) documents
wc = wordCounts (concatDocs cleaned)
in cleaned
removeWords :: [String] -> [Document] -> [Document]
removeWords ws documents =
map (\(Document text c) -> Document (rm ws text) c) documents
where
rm list text =
unwords $ filter (`notElem` list) (words text)
removeStopwords :: Int -> [Document] -> [Document]
removeStopwords i documents =
let wc = wordCounts (concatDocs documents)
wlist = sortBy (\(_, a) (_, b) -> b `compare` a) wc
stopwords = l $ map fst (take 30 wlist)
wstopwords = map (\(Document text c) -> Document (removeWords stopwords text) c) cleaned
in wstopwords
stopwords = map fst (take i wlist)
in removeWords stopwords documents
where
vocabulary x = ordNub (words x)
countWordInDoc d w = genericLength (filter (==w) d)
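A short usage sketch of the two new pre-processors:

    -- strip an explicit word list:
    docs'  = removeWords ["the", "a", "an"] docs
    -- or strip the i most frequent corpus words, here the top 30:
    docs'' = removeStopwords 30 docs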
@@ -136,9 +169,6 @@ module Sibe.NaiveBayes
let voc = vocabulary x
in zip voc $ map (countWordInDoc (words x)) voc
removeWords list text =
unwords $ filter (`notElem` list) (words text)
concatDocs = concatMap (\(Document text _) -> text ++ " ")
l :: (Show a) => a -> a
@@ -151,37 +181,38 @@ module Sibe.NaiveBayes
go s (x:xs) = if x `Set.member` s then go s xs
else x : go (Set.insert x s) xs
accuracy :: [(Int, Int)] -> Double
accuracy :: [(Int, (Int, Double))] -> Double
accuracy results =
let correct = filter (uncurry (==)) results
let pairs = map (\(a, b) -> (a, fst b)) results
correct = filter (uncurry (==)) pairs
in genericLength correct / genericLength results
recall :: [(Int, Int)] -> Double
recall :: [(Int, (Int, Double))] -> Double
recall results =
let classes = ordNub (map fst results)
s = sum (map rec classes) / genericLength classes
in s
where
rec a =
let t = genericLength $ filter (\(c, r) -> c == r && c == a) results
y = genericLength $ filter (\(c, r) -> c == a) results
let t = genericLength $ filter (\(c, (r, _)) -> c == r && c == a) results
y = genericLength $ filter (\(c, (r, _)) -> c == a) results
in t / y
precision :: [(Int, Int)] -> Double
precision :: [(Int, (Int, Double))] -> Double
precision results =
let classes = ordNub (map fst results)
s = sum (map prec classes) / genericLength classes
in s
where
prec a =
let t = genericLength $ filter (\(c, r) -> c == r && c == a) results
y = genericLength $ filter (\(c, r) -> r == a) results
let t = genericLength $ filter (\(c, (r, _)) -> c == r && c == a) results
y = genericLength $ filter (\(c, (r, _)) -> r == a) results
in
if y == 0
then 0
else t / y
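Since each result now carries the confidence as (actual, (predicted, confidence)), the metrics compare only the class components; e.g.:

    -- accuracy [(0, (0, 0.9)), (1, (0, 0.6))]  ==  0.5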
fmeasure :: [(Int, Int)] -> Double
fmeasure :: [(Int, (Int, Double))] -> Double
fmeasure results =
let r = recall results
p = precision results