feat(stopwords): removeWords and removeStopwords functions as pre-processors
feat(confidence, WIP): calculate confidence of each classification
This commit is contained in:
parent
ea1f05f001
commit
099c25e166
@ -14,26 +14,20 @@ module Main
|
|||||||
test <- readFile "examples/doc-classifier-data/data-reuters-test"
|
test <- readFile "examples/doc-classifier-data/data-reuters-test"
|
||||||
|
|
||||||
classes <- map (filter (/= ' ')) . lines <$> readFile "examples/doc-classifier-data/data-classes"
|
classes <- map (filter (/= ' ')) . lines <$> readFile "examples/doc-classifier-data/data-classes"
|
||||||
|
sws <- lines <$> readFile "examples/stopwords"
|
||||||
|
|
||||||
let intClasses = [0..length classes - 1]
|
let intClasses = [0..length classes - 1]
|
||||||
-- let intClasses = [0, 1]
|
documents = cleanDocuments $ removeWords sws $ createDocuments classes dataset
|
||||||
documents = cleanDocuments $ createDocuments classes dataset
|
|
||||||
-- documents = [Document "Chinese Beijing Chinese" 0,
|
|
||||||
-- Document "Chinese Chinese Shanghai" 0,
|
|
||||||
-- Document "Chinese Macao" 0,
|
|
||||||
-- Document "Japan Tokyo Chinese" 1]
|
|
||||||
-- testDocuments = [Document "Chinese Chinese Chinese Japan Tokyo" 0]
|
|
||||||
testDocuments = cleanDocuments $ createDocuments classes test
|
testDocuments = cleanDocuments $ createDocuments classes test
|
||||||
devTestDocuments = take 30 testDocuments
|
devTestDocuments = take 30 testDocuments
|
||||||
-- devTestDocuments = [Document "Chinese Chinese Chinese Tokyo Japan" 0]
|
|
||||||
nb = train documents intClasses
|
nb = train documents intClasses
|
||||||
|
|
||||||
results = map (\(Document text c) -> (c, run text nb)) testDocuments
|
results = map (\(Document text c) -> (c, run text nb)) testDocuments
|
||||||
-- results = map (\(Document text c) -> (c, run text nb)) devTestDocuments
|
-- results = map (\(Document text c) -> (c, run text nb)) devTestDocuments
|
||||||
|
|
||||||
-- print (text $ head documents)
|
print (text $ head documents)
|
||||||
|
|
||||||
let showResults (c, r) = putStrLn (classes !! c ++ " ~ " ++ classes !! r)
|
let showResults (c, (r, confidence)) = putStrLn (classes !! c ++ " ~ " ++ classes !! r)
|
||||||
mapM_ showResults results
|
mapM_ showResults results
|
||||||
|
|
||||||
putStrLn $ "Recall: " ++ show (recall results)
|
putStrLn $ "Recall: " ++ show (recall results)
|
||||||
|
665
examples/stopwords
Normal file
665
examples/stopwords
Normal file
@ -0,0 +1,665 @@
|
|||||||
|
a
|
||||||
|
able
|
||||||
|
about
|
||||||
|
above
|
||||||
|
abst
|
||||||
|
accordance
|
||||||
|
according
|
||||||
|
accordingly
|
||||||
|
across
|
||||||
|
act
|
||||||
|
actually
|
||||||
|
added
|
||||||
|
adj
|
||||||
|
affected
|
||||||
|
affecting
|
||||||
|
affects
|
||||||
|
after
|
||||||
|
afterwards
|
||||||
|
again
|
||||||
|
against
|
||||||
|
ah
|
||||||
|
all
|
||||||
|
almost
|
||||||
|
alone
|
||||||
|
along
|
||||||
|
already
|
||||||
|
also
|
||||||
|
although
|
||||||
|
always
|
||||||
|
am
|
||||||
|
among
|
||||||
|
amongst
|
||||||
|
an
|
||||||
|
and
|
||||||
|
announce
|
||||||
|
another
|
||||||
|
any
|
||||||
|
anybody
|
||||||
|
anyhow
|
||||||
|
anymore
|
||||||
|
anyone
|
||||||
|
anything
|
||||||
|
anyway
|
||||||
|
anyways
|
||||||
|
anywhere
|
||||||
|
apparently
|
||||||
|
approximately
|
||||||
|
are
|
||||||
|
aren
|
||||||
|
arent
|
||||||
|
arise
|
||||||
|
around
|
||||||
|
as
|
||||||
|
aside
|
||||||
|
ask
|
||||||
|
asking
|
||||||
|
at
|
||||||
|
auth
|
||||||
|
available
|
||||||
|
away
|
||||||
|
awfully
|
||||||
|
b
|
||||||
|
back
|
||||||
|
be
|
||||||
|
became
|
||||||
|
because
|
||||||
|
become
|
||||||
|
becomes
|
||||||
|
becoming
|
||||||
|
been
|
||||||
|
before
|
||||||
|
beforehand
|
||||||
|
begin
|
||||||
|
beginning
|
||||||
|
beginnings
|
||||||
|
begins
|
||||||
|
behind
|
||||||
|
being
|
||||||
|
believe
|
||||||
|
below
|
||||||
|
beside
|
||||||
|
besides
|
||||||
|
between
|
||||||
|
beyond
|
||||||
|
biol
|
||||||
|
both
|
||||||
|
brief
|
||||||
|
briefly
|
||||||
|
but
|
||||||
|
by
|
||||||
|
c
|
||||||
|
ca
|
||||||
|
came
|
||||||
|
can
|
||||||
|
cannot
|
||||||
|
can't
|
||||||
|
cause
|
||||||
|
causes
|
||||||
|
certain
|
||||||
|
certainly
|
||||||
|
co
|
||||||
|
com
|
||||||
|
come
|
||||||
|
comes
|
||||||
|
contain
|
||||||
|
containing
|
||||||
|
contains
|
||||||
|
could
|
||||||
|
couldnt
|
||||||
|
d
|
||||||
|
date
|
||||||
|
did
|
||||||
|
didn't
|
||||||
|
different
|
||||||
|
do
|
||||||
|
does
|
||||||
|
doesn't
|
||||||
|
doing
|
||||||
|
done
|
||||||
|
don't
|
||||||
|
down
|
||||||
|
downwards
|
||||||
|
due
|
||||||
|
during
|
||||||
|
e
|
||||||
|
each
|
||||||
|
ed
|
||||||
|
edu
|
||||||
|
effect
|
||||||
|
eg
|
||||||
|
eight
|
||||||
|
eighty
|
||||||
|
either
|
||||||
|
else
|
||||||
|
elsewhere
|
||||||
|
end
|
||||||
|
ending
|
||||||
|
enough
|
||||||
|
especially
|
||||||
|
et
|
||||||
|
et-al
|
||||||
|
etc
|
||||||
|
even
|
||||||
|
ever
|
||||||
|
every
|
||||||
|
everybody
|
||||||
|
everyone
|
||||||
|
everything
|
||||||
|
everywhere
|
||||||
|
ex
|
||||||
|
except
|
||||||
|
f
|
||||||
|
far
|
||||||
|
few
|
||||||
|
ff
|
||||||
|
fifth
|
||||||
|
first
|
||||||
|
five
|
||||||
|
fix
|
||||||
|
followed
|
||||||
|
following
|
||||||
|
follows
|
||||||
|
for
|
||||||
|
former
|
||||||
|
formerly
|
||||||
|
forth
|
||||||
|
found
|
||||||
|
four
|
||||||
|
from
|
||||||
|
further
|
||||||
|
furthermore
|
||||||
|
g
|
||||||
|
gave
|
||||||
|
get
|
||||||
|
gets
|
||||||
|
getting
|
||||||
|
give
|
||||||
|
given
|
||||||
|
gives
|
||||||
|
giving
|
||||||
|
go
|
||||||
|
goes
|
||||||
|
gone
|
||||||
|
got
|
||||||
|
gotten
|
||||||
|
h
|
||||||
|
had
|
||||||
|
happens
|
||||||
|
hardly
|
||||||
|
has
|
||||||
|
hasn't
|
||||||
|
have
|
||||||
|
haven't
|
||||||
|
having
|
||||||
|
he
|
||||||
|
hed
|
||||||
|
hence
|
||||||
|
her
|
||||||
|
here
|
||||||
|
hereafter
|
||||||
|
hereby
|
||||||
|
herein
|
||||||
|
heres
|
||||||
|
hereupon
|
||||||
|
hers
|
||||||
|
herself
|
||||||
|
hes
|
||||||
|
hi
|
||||||
|
hid
|
||||||
|
him
|
||||||
|
himself
|
||||||
|
his
|
||||||
|
hither
|
||||||
|
home
|
||||||
|
how
|
||||||
|
howbeit
|
||||||
|
however
|
||||||
|
hundred
|
||||||
|
i
|
||||||
|
id
|
||||||
|
ie
|
||||||
|
if
|
||||||
|
i'll
|
||||||
|
im
|
||||||
|
immediate
|
||||||
|
immediately
|
||||||
|
importance
|
||||||
|
important
|
||||||
|
in
|
||||||
|
inc
|
||||||
|
indeed
|
||||||
|
index
|
||||||
|
information
|
||||||
|
instead
|
||||||
|
into
|
||||||
|
invention
|
||||||
|
inward
|
||||||
|
is
|
||||||
|
isn't
|
||||||
|
it
|
||||||
|
itd
|
||||||
|
it'll
|
||||||
|
its
|
||||||
|
itself
|
||||||
|
i've
|
||||||
|
j
|
||||||
|
just
|
||||||
|
k
|
||||||
|
keep keeps
|
||||||
|
kept
|
||||||
|
kg
|
||||||
|
km
|
||||||
|
know
|
||||||
|
known
|
||||||
|
knows
|
||||||
|
l
|
||||||
|
largely
|
||||||
|
last
|
||||||
|
lately
|
||||||
|
later
|
||||||
|
latter
|
||||||
|
latterly
|
||||||
|
least
|
||||||
|
less
|
||||||
|
lest
|
||||||
|
let
|
||||||
|
lets
|
||||||
|
like
|
||||||
|
liked
|
||||||
|
likely
|
||||||
|
line
|
||||||
|
little
|
||||||
|
'll
|
||||||
|
look
|
||||||
|
looking
|
||||||
|
looks
|
||||||
|
ltd
|
||||||
|
m
|
||||||
|
made
|
||||||
|
mainly
|
||||||
|
make
|
||||||
|
makes
|
||||||
|
many
|
||||||
|
may
|
||||||
|
maybe
|
||||||
|
me
|
||||||
|
mean
|
||||||
|
means
|
||||||
|
meantime
|
||||||
|
meanwhile
|
||||||
|
merely
|
||||||
|
mg
|
||||||
|
might
|
||||||
|
million
|
||||||
|
miss
|
||||||
|
ml
|
||||||
|
more
|
||||||
|
moreover
|
||||||
|
most
|
||||||
|
mostly
|
||||||
|
mr
|
||||||
|
mrs
|
||||||
|
much
|
||||||
|
mug
|
||||||
|
must
|
||||||
|
my
|
||||||
|
myself
|
||||||
|
n
|
||||||
|
na
|
||||||
|
name
|
||||||
|
namely
|
||||||
|
nay
|
||||||
|
nd
|
||||||
|
near
|
||||||
|
nearly
|
||||||
|
necessarily
|
||||||
|
necessary
|
||||||
|
need
|
||||||
|
needs
|
||||||
|
neither
|
||||||
|
never
|
||||||
|
nevertheless
|
||||||
|
new
|
||||||
|
next
|
||||||
|
nine
|
||||||
|
ninety
|
||||||
|
no
|
||||||
|
nobody
|
||||||
|
non
|
||||||
|
none
|
||||||
|
nonetheless
|
||||||
|
noone
|
||||||
|
nor
|
||||||
|
normally
|
||||||
|
nos
|
||||||
|
not
|
||||||
|
noted
|
||||||
|
nothing
|
||||||
|
now
|
||||||
|
nowhere
|
||||||
|
o
|
||||||
|
obtain
|
||||||
|
obtained
|
||||||
|
obviously
|
||||||
|
of
|
||||||
|
off
|
||||||
|
often
|
||||||
|
oh
|
||||||
|
ok
|
||||||
|
okay
|
||||||
|
old
|
||||||
|
omitted
|
||||||
|
on
|
||||||
|
once
|
||||||
|
one
|
||||||
|
ones
|
||||||
|
only
|
||||||
|
onto
|
||||||
|
or
|
||||||
|
ord
|
||||||
|
other
|
||||||
|
others
|
||||||
|
otherwise
|
||||||
|
ought
|
||||||
|
our
|
||||||
|
ours
|
||||||
|
ourselves
|
||||||
|
out
|
||||||
|
outside
|
||||||
|
over
|
||||||
|
overall
|
||||||
|
owing
|
||||||
|
own
|
||||||
|
p
|
||||||
|
page
|
||||||
|
pages
|
||||||
|
part
|
||||||
|
particular
|
||||||
|
particularly
|
||||||
|
past
|
||||||
|
per
|
||||||
|
perhaps
|
||||||
|
placed
|
||||||
|
please
|
||||||
|
plus
|
||||||
|
poorly
|
||||||
|
possible
|
||||||
|
possibly
|
||||||
|
potentially
|
||||||
|
pp
|
||||||
|
predominantly
|
||||||
|
present
|
||||||
|
previously
|
||||||
|
primarily
|
||||||
|
probably
|
||||||
|
promptly
|
||||||
|
proud
|
||||||
|
provides
|
||||||
|
put
|
||||||
|
q
|
||||||
|
que
|
||||||
|
quickly
|
||||||
|
quite
|
||||||
|
qv
|
||||||
|
r
|
||||||
|
ran
|
||||||
|
rather
|
||||||
|
rd
|
||||||
|
re
|
||||||
|
readily
|
||||||
|
really
|
||||||
|
recent
|
||||||
|
recently
|
||||||
|
ref
|
||||||
|
refs
|
||||||
|
regarding
|
||||||
|
regardless
|
||||||
|
regards
|
||||||
|
related
|
||||||
|
relatively
|
||||||
|
research
|
||||||
|
respectively
|
||||||
|
resulted
|
||||||
|
resulting
|
||||||
|
results
|
||||||
|
right
|
||||||
|
run
|
||||||
|
s
|
||||||
|
said
|
||||||
|
same
|
||||||
|
saw
|
||||||
|
say
|
||||||
|
saying
|
||||||
|
says
|
||||||
|
sec
|
||||||
|
section
|
||||||
|
see
|
||||||
|
seeing
|
||||||
|
seem
|
||||||
|
seemed
|
||||||
|
seeming
|
||||||
|
seems
|
||||||
|
seen
|
||||||
|
self
|
||||||
|
selves
|
||||||
|
sent
|
||||||
|
seven
|
||||||
|
several
|
||||||
|
shall
|
||||||
|
she
|
||||||
|
shed
|
||||||
|
she'll
|
||||||
|
shes
|
||||||
|
should
|
||||||
|
shouldn't
|
||||||
|
show
|
||||||
|
showed
|
||||||
|
shown
|
||||||
|
showns
|
||||||
|
shows
|
||||||
|
significant
|
||||||
|
significantly
|
||||||
|
similar
|
||||||
|
similarly
|
||||||
|
since
|
||||||
|
six
|
||||||
|
slightly
|
||||||
|
so
|
||||||
|
some
|
||||||
|
somebody
|
||||||
|
somehow
|
||||||
|
someone
|
||||||
|
somethan
|
||||||
|
something
|
||||||
|
sometime
|
||||||
|
sometimes
|
||||||
|
somewhat
|
||||||
|
somewhere
|
||||||
|
soon
|
||||||
|
sorry
|
||||||
|
specifically
|
||||||
|
specified
|
||||||
|
specify
|
||||||
|
specifying
|
||||||
|
still
|
||||||
|
stop
|
||||||
|
strongly
|
||||||
|
sub
|
||||||
|
substantially
|
||||||
|
successfully
|
||||||
|
such
|
||||||
|
sufficiently
|
||||||
|
suggest
|
||||||
|
sup
|
||||||
|
sure t
|
||||||
|
take
|
||||||
|
taken
|
||||||
|
taking
|
||||||
|
tell
|
||||||
|
tends
|
||||||
|
th
|
||||||
|
than
|
||||||
|
thank
|
||||||
|
thanks
|
||||||
|
thanx
|
||||||
|
that
|
||||||
|
that'll
|
||||||
|
thats
|
||||||
|
that've
|
||||||
|
the
|
||||||
|
their
|
||||||
|
theirs
|
||||||
|
them
|
||||||
|
themselves
|
||||||
|
then
|
||||||
|
thence
|
||||||
|
there
|
||||||
|
thereafter
|
||||||
|
thereby
|
||||||
|
thered
|
||||||
|
therefore
|
||||||
|
therein
|
||||||
|
there'll
|
||||||
|
thereof
|
||||||
|
therere
|
||||||
|
theres
|
||||||
|
thereto
|
||||||
|
thereupon
|
||||||
|
there've
|
||||||
|
these
|
||||||
|
they
|
||||||
|
theyd
|
||||||
|
they'll
|
||||||
|
theyre
|
||||||
|
they've
|
||||||
|
think
|
||||||
|
this
|
||||||
|
those
|
||||||
|
thou
|
||||||
|
though
|
||||||
|
thoughh
|
||||||
|
thousand
|
||||||
|
throug
|
||||||
|
through
|
||||||
|
throughout
|
||||||
|
thru
|
||||||
|
thus
|
||||||
|
til
|
||||||
|
tip
|
||||||
|
to
|
||||||
|
together
|
||||||
|
too
|
||||||
|
took
|
||||||
|
toward
|
||||||
|
towards
|
||||||
|
tried
|
||||||
|
tries
|
||||||
|
truly
|
||||||
|
try
|
||||||
|
trying
|
||||||
|
ts
|
||||||
|
twice
|
||||||
|
two
|
||||||
|
u
|
||||||
|
un
|
||||||
|
under
|
||||||
|
unfortunately
|
||||||
|
unless
|
||||||
|
unlike
|
||||||
|
unlikely
|
||||||
|
until
|
||||||
|
unto
|
||||||
|
up
|
||||||
|
upon
|
||||||
|
ups
|
||||||
|
us
|
||||||
|
use
|
||||||
|
used
|
||||||
|
useful
|
||||||
|
usefully
|
||||||
|
usefulness
|
||||||
|
uses
|
||||||
|
using
|
||||||
|
usually
|
||||||
|
v
|
||||||
|
value
|
||||||
|
various
|
||||||
|
've
|
||||||
|
very
|
||||||
|
via
|
||||||
|
viz
|
||||||
|
vol
|
||||||
|
vols
|
||||||
|
vs
|
||||||
|
w
|
||||||
|
want
|
||||||
|
wants
|
||||||
|
was
|
||||||
|
wasnt
|
||||||
|
way
|
||||||
|
we
|
||||||
|
wed
|
||||||
|
welcome
|
||||||
|
we'll
|
||||||
|
went
|
||||||
|
were
|
||||||
|
werent
|
||||||
|
we've
|
||||||
|
what
|
||||||
|
whatever
|
||||||
|
what'll
|
||||||
|
whats
|
||||||
|
when
|
||||||
|
whence
|
||||||
|
whenever
|
||||||
|
where
|
||||||
|
whereafter
|
||||||
|
whereas
|
||||||
|
whereby
|
||||||
|
wherein
|
||||||
|
wheres
|
||||||
|
whereupon
|
||||||
|
wherever
|
||||||
|
whether
|
||||||
|
which
|
||||||
|
while
|
||||||
|
whim
|
||||||
|
whither
|
||||||
|
who
|
||||||
|
whod
|
||||||
|
whoever
|
||||||
|
whole
|
||||||
|
who'll
|
||||||
|
whom
|
||||||
|
whomever
|
||||||
|
whos
|
||||||
|
whose
|
||||||
|
why
|
||||||
|
widely
|
||||||
|
willing
|
||||||
|
wish
|
||||||
|
with
|
||||||
|
within
|
||||||
|
without
|
||||||
|
wont
|
||||||
|
words
|
||||||
|
world
|
||||||
|
would
|
||||||
|
wouldnt
|
||||||
|
www
|
||||||
|
x
|
||||||
|
y
|
||||||
|
yes
|
||||||
|
yet
|
||||||
|
you
|
||||||
|
youd
|
||||||
|
you'll
|
||||||
|
your
|
||||||
|
youre
|
||||||
|
yours
|
||||||
|
yourself
|
||||||
|
yourselves
|
||||||
|
you've
|
||||||
|
z
|
||||||
|
zero
|
@ -12,6 +12,9 @@ module Sibe.NaiveBayes
|
|||||||
stdev,
|
stdev,
|
||||||
cleanText,
|
cleanText,
|
||||||
cleanDocuments,
|
cleanDocuments,
|
||||||
|
ngram,
|
||||||
|
removeWords,
|
||||||
|
removeStopwords,
|
||||||
)
|
)
|
||||||
where
|
where
|
||||||
import Data.List
|
import Data.List
|
||||||
@ -21,7 +24,7 @@ module Sibe.NaiveBayes
|
|||||||
import Data.Maybe
|
import Data.Maybe
|
||||||
import Control.Arrow ((&&&))
|
import Control.Arrow ((&&&))
|
||||||
import Text.Regex.PCRE
|
import Text.Regex.PCRE
|
||||||
import Data.Char (isSpace)
|
import Data.Char (isSpace, isNumber)
|
||||||
import NLP.Stemmer
|
import NLP.Stemmer
|
||||||
|
|
||||||
type Class = Int;
|
type Class = Int;
|
||||||
@ -36,6 +39,7 @@ module Sibe.NaiveBayes
|
|||||||
, megadoc :: String
|
, megadoc :: String
|
||||||
, cd :: [(Class, [Document])]
|
, cd :: [(Class, [Document])]
|
||||||
, cw :: [(Class, [(String, Int)])]
|
, cw :: [(Class, [(String, Int)])]
|
||||||
|
, cgram :: [(Class, [(String, Int)])]
|
||||||
} deriving (Eq, Show, Read)
|
} deriving (Eq, Show, Read)
|
||||||
|
|
||||||
train :: [Document] -> [Class] -> NB
|
train :: [Document] -> [Class] -> NB
|
||||||
@ -49,7 +53,9 @@ module Sibe.NaiveBayes
|
|||||||
cd = zip classes (map classDocs classes)
|
cd = zip classes (map classDocs classes)
|
||||||
|
|
||||||
-- (class, [(word, count)])
|
-- (class, [(word, count)])
|
||||||
cw = zip classes $ l (map classWordsCounts classes)
|
cw = zip classes $ map classWordsCounts classes
|
||||||
|
|
||||||
|
cgram = zip classes $ map classNGramCounts classes
|
||||||
|
|
||||||
in NB { documents = documents
|
in NB { documents = documents
|
||||||
, classes = cls
|
, classes = cls
|
||||||
@ -57,44 +63,60 @@ module Sibe.NaiveBayes
|
|||||||
, megadoc = megadoc
|
, megadoc = megadoc
|
||||||
, cd = cd
|
, cd = cd
|
||||||
, cw = cw
|
, cw = cw
|
||||||
|
, cgram = cgram
|
||||||
}
|
}
|
||||||
where
|
where
|
||||||
concatDocs = concatMap (\(Document text _) -> text ++ " ")
|
concatDocs = concatMap (\(Document text _) -> text ++ " ")
|
||||||
|
|
||||||
classDocs x = filter ((==x) . c) documents
|
classDocs x = filter ((==x) . c) documents
|
||||||
classMegadoc x = concatMap (\(Document text _) -> text ++ " ") (classDocs x)
|
classMegadoc = concatMap (\(Document text _) -> text ++ " ") . classDocs
|
||||||
classWords x = words (classMegadoc x)
|
classWords = words . classMegadoc
|
||||||
classNGram n = ngram n . classMegadoc
|
classNGram = concatMap (\(Document text _) -> text ++ " ") . ngram 2 . classDocs
|
||||||
classVocabulary x = ordNub (classWords x)
|
classNGramWords = words . classNGram
|
||||||
|
classVocabulary = ordNub . classWords
|
||||||
classPrior x = genericLength (classDocs x) / genericLength documents
|
classPrior x = genericLength (classDocs x) / genericLength documents
|
||||||
countWordInDoc d w = genericLength (filter (==w) d)
|
countWordInDoc d w = genericLength (filter (==w) d)
|
||||||
classWordsCounts x =
|
wordsCount ws voc =
|
||||||
let voc = classVocabulary x
|
zip voc $ map (countWordInDoc ws) voc
|
||||||
in zip voc $ map (countWordInDoc (classWords x)) voc
|
classWordsCounts x = wordsCount (classWords x) (classVocabulary x)
|
||||||
|
classNGramCounts x = wordsCount (classNGramWords x) (ordNub $ classNGramWords x)
|
||||||
|
|
||||||
ngram :: Int -> String -> [String]
|
ngram :: Int -> [Document] -> [Document]
|
||||||
ngram n text =
|
ngram n documents =
|
||||||
let ws = words text
|
map (\(Document text c) -> Document (helper text) c) documents
|
||||||
in map (\(i, w) -> unwords $ w:((take (n - 1) . drop (i+1)) ws)) (zip [0..] ws)
|
where
|
||||||
|
helper text =
|
||||||
|
let ws = words text
|
||||||
|
pairs = zip [0..] ws
|
||||||
|
grams = map (\(i, w) -> concat . intersperse "_" $ w:((take (n - 1) . drop (i+1)) ws)) pairs
|
||||||
|
in unwords ("<b>":grams)
|
||||||
|
|
||||||
run :: String -> NB -> Class
|
run :: String -> NB -> (Class, Double)
|
||||||
run text (NB documents classes vocabulary megadoc cd cw) =
|
run txt (NB documents classes vocabulary megadoc cd cw cgram) =
|
||||||
let scores = map (score . fst) classes
|
let scores = map (score . fst) classes
|
||||||
in argmax scores
|
index = argmax scores
|
||||||
|
m = maximum scores
|
||||||
|
confidence = m / sum scores
|
||||||
|
in (index, 0)
|
||||||
where
|
where
|
||||||
score c =
|
score c =
|
||||||
let prior = snd (classes !! c)
|
let prior = snd (classes !! c)
|
||||||
in prior * product (map (prob c) (words text))
|
|
||||||
|
-- below is the formula according to Multinominal Naive Bayes, but it seems
|
||||||
|
-- using a uniform prior probability seems to work better
|
||||||
|
-- in prior * product (map (prob c) (words txt))
|
||||||
|
|
||||||
|
in product (map (prob c) (words txt))
|
||||||
|
|
||||||
prob c w =
|
prob c w =
|
||||||
let tctM = find ((==w) . fst) (snd (cw !! c))
|
let tctM = find ((== w) . fst) (snd (cw !! c))
|
||||||
tct = (snd . fromJust) tctM
|
tct = if isJust tctM then (snd . fromJust) tctM else 0
|
||||||
cvoc = (genericLength . snd) (cw !! c)
|
cvoc = sum $ map snd (snd (cw !! c))
|
||||||
voc = vocabulary
|
voc = vocabulary
|
||||||
in
|
gram = find ((==w) . last . splitOn "_" . fst) (snd (cgram !! c))
|
||||||
if isJust tctM then
|
pg = if isJust gram then (snd . fromJust) gram else 0
|
||||||
realToFrac (tct + 1) / realToFrac (cvoc + voc)
|
-- in realToFrac (tct * pg + 1) / realToFrac (cvoc + voc) -- uncomment to enable ngrams
|
||||||
else
|
in realToFrac (tct + 1) / realToFrac (cvoc + voc)
|
||||||
1 / realToFrac (cvoc + voc)
|
|
||||||
|
|
||||||
argmax :: (Ord a) => [a] -> Int
|
argmax :: (Ord a) => [a] -> Int
|
||||||
argmax x = fst $ maximumBy (\(_, a) (_, b) -> a `compare` b) (zip [0..] x)
|
argmax x = fst $ maximumBy (\(_, a) (_, b) -> a `compare` b) (zip [0..] x)
|
||||||
@ -113,7 +135,8 @@ module Sibe.NaiveBayes
|
|||||||
let puncs = filter (`notElem` ['!', '"', '#', '$', '%', '(', ')', '.', '?']) (trim string)
|
let puncs = filter (`notElem` ['!', '"', '#', '$', '%', '(', ')', '.', '?']) (trim string)
|
||||||
spacify = foldl (\acc x -> replace x ' ' acc) puncs [',', '/', '-', '\n', '\r']
|
spacify = foldl (\acc x -> replace x ' ' acc) puncs [',', '/', '-', '\n', '\r']
|
||||||
stemmed = unwords $ map (stem Porter) (words spacify)
|
stemmed = unwords $ map (stem Porter) (words spacify)
|
||||||
in stemmed
|
nonumber = filter (not . isNumber) stemmed
|
||||||
|
in (unwords . words) nonumber
|
||||||
where
|
where
|
||||||
trim = f . f
|
trim = f . f
|
||||||
where
|
where
|
||||||
@ -124,11 +147,21 @@ module Sibe.NaiveBayes
|
|||||||
cleanDocuments :: [Document] -> [Document]
|
cleanDocuments :: [Document] -> [Document]
|
||||||
cleanDocuments documents =
|
cleanDocuments documents =
|
||||||
let cleaned = map (\(Document text c) -> Document (cleanText text) c) documents
|
let cleaned = map (\(Document text c) -> Document (cleanText text) c) documents
|
||||||
wc = wordCounts (concatDocs cleaned)
|
in cleaned
|
||||||
|
|
||||||
|
removeWords :: [String] -> [Document] -> [Document]
|
||||||
|
removeWords ws documents =
|
||||||
|
map (\(Document text c) -> Document (rm ws text) c) documents
|
||||||
|
where
|
||||||
|
rm list text =
|
||||||
|
unwords $ filter (`notElem` list) (words text)
|
||||||
|
|
||||||
|
removeStopwords :: Int -> [Document] -> [Document]
|
||||||
|
removeStopwords i documents =
|
||||||
|
let wc = wordCounts (concatDocs documents)
|
||||||
wlist = sortBy (\(_, a) (_, b) -> b `compare` a) wc
|
wlist = sortBy (\(_, a) (_, b) -> b `compare` a) wc
|
||||||
stopwords = l $ map fst (take 30 wlist)
|
stopwords = map fst (take i wlist)
|
||||||
wstopwords = map (\(Document text c) -> Document (removeWords stopwords text) c) cleaned
|
in removeWords stopwords documents
|
||||||
in wstopwords
|
|
||||||
where
|
where
|
||||||
vocabulary x = ordNub (words x)
|
vocabulary x = ordNub (words x)
|
||||||
countWordInDoc d w = genericLength (filter (==w) d)
|
countWordInDoc d w = genericLength (filter (==w) d)
|
||||||
@ -136,9 +169,6 @@ module Sibe.NaiveBayes
|
|||||||
let voc = vocabulary x
|
let voc = vocabulary x
|
||||||
in zip voc $ map (countWordInDoc (words x)) voc
|
in zip voc $ map (countWordInDoc (words x)) voc
|
||||||
|
|
||||||
removeWords list text =
|
|
||||||
unwords $ filter (`notElem` list) (words text)
|
|
||||||
|
|
||||||
concatDocs = concatMap (\(Document text _) -> text ++ " ")
|
concatDocs = concatMap (\(Document text _) -> text ++ " ")
|
||||||
|
|
||||||
l :: (Show a) => a -> a
|
l :: (Show a) => a -> a
|
||||||
@ -151,37 +181,38 @@ module Sibe.NaiveBayes
|
|||||||
go s (x:xs) = if x `Set.member` s then go s xs
|
go s (x:xs) = if x `Set.member` s then go s xs
|
||||||
else x : go (Set.insert x s) xs
|
else x : go (Set.insert x s) xs
|
||||||
|
|
||||||
accuracy :: [(Int, Int)] -> Double
|
accuracy :: [(Int, (Int, Double))] -> Double
|
||||||
accuracy results =
|
accuracy results =
|
||||||
let correct = filter (uncurry (==)) results
|
let pairs = map (\(a, b) -> (a, fst b)) results
|
||||||
|
correct = filter (uncurry (==)) pairs
|
||||||
in genericLength correct / genericLength results
|
in genericLength correct / genericLength results
|
||||||
|
|
||||||
recall :: [(Int, Int)] -> Double
|
recall :: [(Int, (Int, Double))] -> Double
|
||||||
recall results =
|
recall results =
|
||||||
let classes = ordNub (map fst results)
|
let classes = ordNub (map fst results)
|
||||||
s = sum (map rec classes) / genericLength classes
|
s = sum (map rec classes) / genericLength classes
|
||||||
in s
|
in s
|
||||||
where
|
where
|
||||||
rec a =
|
rec a =
|
||||||
let t = genericLength $ filter (\(c, r) -> c == r && c == a) results
|
let t = genericLength $ filter (\(c, (r, _)) -> c == r && c == a) results
|
||||||
y = genericLength $ filter (\(c, r) -> c == a) results
|
y = genericLength $ filter (\(c, (r, _)) -> c == a) results
|
||||||
in t / y
|
in t / y
|
||||||
|
|
||||||
precision :: [(Int, Int)] -> Double
|
precision :: [(Int, (Int, Double))] -> Double
|
||||||
precision results =
|
precision results =
|
||||||
let classes = ordNub (map fst results)
|
let classes = ordNub (map fst results)
|
||||||
s = sum (map prec classes) / genericLength classes
|
s = sum (map prec classes) / genericLength classes
|
||||||
in s
|
in s
|
||||||
where
|
where
|
||||||
prec a =
|
prec a =
|
||||||
let t = genericLength $ filter (\(c, r) -> c == r && c == a) results
|
let t = genericLength $ filter (\(c, (r, _)) -> c == r && c == a) results
|
||||||
y = genericLength $ filter (\(c, r) -> r == a) results
|
y = genericLength $ filter (\(c, (r, _)) -> r == a) results
|
||||||
in
|
in
|
||||||
if y == 0
|
if y == 0
|
||||||
then 0
|
then 0
|
||||||
else t / y
|
else t / y
|
||||||
|
|
||||||
fmeasure :: [(Int, Int)] -> Double
|
fmeasure :: [(Int, (Int, Double))] -> Double
|
||||||
fmeasure results =
|
fmeasure results =
|
||||||
let r = recall results
|
let r = recall results
|
||||||
p = precision results
|
p = precision results
|
||||||
|
Loading…
Reference in New Issue
Block a user