feat(stopwords): removeWords and removeStopwords functions as pre-processors
feat(confidence, WIP): calculate confidence of each classification
This commit is contained in:
		@@ -14,26 +14,20 @@ module Main
 | 
			
		||||
      test <- readFile "examples/doc-classifier-data/data-reuters-test"
 | 
			
		||||
 | 
			
		||||
      classes <- map (filter (/= ' ')) . lines <$> readFile "examples/doc-classifier-data/data-classes"
 | 
			
		||||
      sws <- lines <$> readFile "examples/stopwords"
 | 
			
		||||
 | 
			
		||||
      let intClasses = [0..length classes - 1]
 | 
			
		||||
      -- let intClasses = [0, 1]
 | 
			
		||||
          documents = cleanDocuments $ createDocuments classes dataset
 | 
			
		||||
          -- documents = [Document "Chinese Beijing Chinese" 0,
 | 
			
		||||
          --              Document "Chinese Chinese Shanghai" 0,
 | 
			
		||||
          --              Document "Chinese Macao" 0,
 | 
			
		||||
          --              Document "Japan Tokyo Chinese" 1]
 | 
			
		||||
          -- testDocuments = [Document "Chinese Chinese Chinese Japan Tokyo" 0]
 | 
			
		||||
          documents = cleanDocuments $ removeWords sws $ createDocuments classes dataset
 | 
			
		||||
          testDocuments = cleanDocuments $ createDocuments classes test
 | 
			
		||||
          devTestDocuments = take 30 testDocuments
 | 
			
		||||
          -- devTestDocuments = [Document "Chinese Chinese Chinese Tokyo Japan" 0]
 | 
			
		||||
          nb = train documents intClasses
 | 
			
		||||
 | 
			
		||||
          results = map (\(Document text c) -> (c, run text nb)) testDocuments
 | 
			
		||||
          -- results = map (\(Document text c) -> (c, run text nb)) devTestDocuments
 | 
			
		||||
 | 
			
		||||
      -- print (text $ head documents)
 | 
			
		||||
      print (text $ head documents)
 | 
			
		||||
 | 
			
		||||
      let showResults (c, r) = putStrLn (classes !! c ++ " ~ " ++ classes !! r)
 | 
			
		||||
      let showResults (c, (r, confidence)) = putStrLn (classes !! c ++ " ~ " ++ classes !! r)
 | 
			
		||||
      mapM_ showResults results
 | 
			
		||||
 | 
			
		||||
      putStrLn $ "Recall: " ++ show (recall results)
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										665
									
								
								examples/stopwords
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										665
									
								
								examples/stopwords
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,665 @@
 | 
			
		||||
a
 | 
			
		||||
able
 | 
			
		||||
about
 | 
			
		||||
above
 | 
			
		||||
abst
 | 
			
		||||
accordance
 | 
			
		||||
according
 | 
			
		||||
accordingly
 | 
			
		||||
across
 | 
			
		||||
act
 | 
			
		||||
actually
 | 
			
		||||
added
 | 
			
		||||
adj
 | 
			
		||||
affected
 | 
			
		||||
affecting
 | 
			
		||||
affects
 | 
			
		||||
after
 | 
			
		||||
afterwards
 | 
			
		||||
again
 | 
			
		||||
against
 | 
			
		||||
ah
 | 
			
		||||
all
 | 
			
		||||
almost
 | 
			
		||||
alone
 | 
			
		||||
along
 | 
			
		||||
already
 | 
			
		||||
also
 | 
			
		||||
although
 | 
			
		||||
always
 | 
			
		||||
am
 | 
			
		||||
among
 | 
			
		||||
amongst
 | 
			
		||||
an
 | 
			
		||||
and
 | 
			
		||||
announce
 | 
			
		||||
another
 | 
			
		||||
any
 | 
			
		||||
anybody
 | 
			
		||||
anyhow
 | 
			
		||||
anymore
 | 
			
		||||
anyone
 | 
			
		||||
anything
 | 
			
		||||
anyway
 | 
			
		||||
anyways
 | 
			
		||||
anywhere
 | 
			
		||||
apparently
 | 
			
		||||
approximately
 | 
			
		||||
are
 | 
			
		||||
aren
 | 
			
		||||
arent
 | 
			
		||||
arise
 | 
			
		||||
around
 | 
			
		||||
as
 | 
			
		||||
aside
 | 
			
		||||
ask
 | 
			
		||||
asking
 | 
			
		||||
at
 | 
			
		||||
auth
 | 
			
		||||
available
 | 
			
		||||
away
 | 
			
		||||
awfully
 | 
			
		||||
b
 | 
			
		||||
back
 | 
			
		||||
be
 | 
			
		||||
became
 | 
			
		||||
because
 | 
			
		||||
become
 | 
			
		||||
becomes
 | 
			
		||||
becoming
 | 
			
		||||
been
 | 
			
		||||
before
 | 
			
		||||
beforehand
 | 
			
		||||
begin
 | 
			
		||||
beginning
 | 
			
		||||
beginnings
 | 
			
		||||
begins
 | 
			
		||||
behind
 | 
			
		||||
being
 | 
			
		||||
believe
 | 
			
		||||
below
 | 
			
		||||
beside
 | 
			
		||||
besides
 | 
			
		||||
between
 | 
			
		||||
beyond
 | 
			
		||||
biol
 | 
			
		||||
both
 | 
			
		||||
brief
 | 
			
		||||
briefly
 | 
			
		||||
but
 | 
			
		||||
by
 | 
			
		||||
c
 | 
			
		||||
ca
 | 
			
		||||
came
 | 
			
		||||
can
 | 
			
		||||
cannot
 | 
			
		||||
can't
 | 
			
		||||
cause
 | 
			
		||||
causes
 | 
			
		||||
certain
 | 
			
		||||
certainly
 | 
			
		||||
co
 | 
			
		||||
com
 | 
			
		||||
come
 | 
			
		||||
comes
 | 
			
		||||
contain
 | 
			
		||||
containing
 | 
			
		||||
contains
 | 
			
		||||
could
 | 
			
		||||
couldnt
 | 
			
		||||
d
 | 
			
		||||
date
 | 
			
		||||
did
 | 
			
		||||
didn't
 | 
			
		||||
different
 | 
			
		||||
do
 | 
			
		||||
does
 | 
			
		||||
doesn't
 | 
			
		||||
doing
 | 
			
		||||
done
 | 
			
		||||
don't
 | 
			
		||||
down
 | 
			
		||||
downwards
 | 
			
		||||
due
 | 
			
		||||
during
 | 
			
		||||
e
 | 
			
		||||
each
 | 
			
		||||
ed
 | 
			
		||||
edu
 | 
			
		||||
effect
 | 
			
		||||
eg
 | 
			
		||||
eight
 | 
			
		||||
eighty
 | 
			
		||||
either
 | 
			
		||||
else
 | 
			
		||||
elsewhere
 | 
			
		||||
end
 | 
			
		||||
ending
 | 
			
		||||
enough
 | 
			
		||||
especially
 | 
			
		||||
et
 | 
			
		||||
et-al
 | 
			
		||||
etc
 | 
			
		||||
even
 | 
			
		||||
ever
 | 
			
		||||
every
 | 
			
		||||
everybody
 | 
			
		||||
everyone
 | 
			
		||||
everything
 | 
			
		||||
everywhere
 | 
			
		||||
ex
 | 
			
		||||
except
 | 
			
		||||
f
 | 
			
		||||
far
 | 
			
		||||
few
 | 
			
		||||
ff
 | 
			
		||||
fifth
 | 
			
		||||
first
 | 
			
		||||
five
 | 
			
		||||
fix
 | 
			
		||||
followed
 | 
			
		||||
following
 | 
			
		||||
follows
 | 
			
		||||
for
 | 
			
		||||
former
 | 
			
		||||
formerly
 | 
			
		||||
forth
 | 
			
		||||
found
 | 
			
		||||
four
 | 
			
		||||
from
 | 
			
		||||
further
 | 
			
		||||
furthermore
 | 
			
		||||
g
 | 
			
		||||
gave
 | 
			
		||||
get
 | 
			
		||||
gets
 | 
			
		||||
getting
 | 
			
		||||
give
 | 
			
		||||
given
 | 
			
		||||
gives
 | 
			
		||||
giving
 | 
			
		||||
go
 | 
			
		||||
goes
 | 
			
		||||
gone
 | 
			
		||||
got
 | 
			
		||||
gotten
 | 
			
		||||
h
 | 
			
		||||
had
 | 
			
		||||
happens
 | 
			
		||||
hardly
 | 
			
		||||
has
 | 
			
		||||
hasn't
 | 
			
		||||
have
 | 
			
		||||
haven't
 | 
			
		||||
having
 | 
			
		||||
he
 | 
			
		||||
hed
 | 
			
		||||
hence
 | 
			
		||||
her
 | 
			
		||||
here
 | 
			
		||||
hereafter
 | 
			
		||||
hereby
 | 
			
		||||
herein
 | 
			
		||||
heres
 | 
			
		||||
hereupon
 | 
			
		||||
hers
 | 
			
		||||
herself
 | 
			
		||||
hes
 | 
			
		||||
hi
 | 
			
		||||
hid
 | 
			
		||||
him
 | 
			
		||||
himself
 | 
			
		||||
his
 | 
			
		||||
hither
 | 
			
		||||
home
 | 
			
		||||
how
 | 
			
		||||
howbeit
 | 
			
		||||
however
 | 
			
		||||
hundred
 | 
			
		||||
i
 | 
			
		||||
id
 | 
			
		||||
ie
 | 
			
		||||
if
 | 
			
		||||
i'll
 | 
			
		||||
im
 | 
			
		||||
immediate
 | 
			
		||||
immediately
 | 
			
		||||
importance
 | 
			
		||||
important
 | 
			
		||||
in
 | 
			
		||||
inc
 | 
			
		||||
indeed
 | 
			
		||||
index
 | 
			
		||||
information
 | 
			
		||||
instead
 | 
			
		||||
into
 | 
			
		||||
invention
 | 
			
		||||
inward
 | 
			
		||||
is
 | 
			
		||||
isn't
 | 
			
		||||
it
 | 
			
		||||
itd
 | 
			
		||||
it'll
 | 
			
		||||
its
 | 
			
		||||
itself
 | 
			
		||||
i've
 | 
			
		||||
j
 | 
			
		||||
just
 | 
			
		||||
k
 | 
			
		||||
keep   keeps
 | 
			
		||||
kept
 | 
			
		||||
kg
 | 
			
		||||
km
 | 
			
		||||
know
 | 
			
		||||
known
 | 
			
		||||
knows
 | 
			
		||||
l
 | 
			
		||||
largely
 | 
			
		||||
last
 | 
			
		||||
lately
 | 
			
		||||
later
 | 
			
		||||
latter
 | 
			
		||||
latterly
 | 
			
		||||
least
 | 
			
		||||
less
 | 
			
		||||
lest
 | 
			
		||||
let
 | 
			
		||||
lets
 | 
			
		||||
like
 | 
			
		||||
liked
 | 
			
		||||
likely
 | 
			
		||||
line
 | 
			
		||||
little
 | 
			
		||||
'll
 | 
			
		||||
look
 | 
			
		||||
looking
 | 
			
		||||
looks
 | 
			
		||||
ltd
 | 
			
		||||
m
 | 
			
		||||
made
 | 
			
		||||
mainly
 | 
			
		||||
make
 | 
			
		||||
makes
 | 
			
		||||
many
 | 
			
		||||
may
 | 
			
		||||
maybe
 | 
			
		||||
me
 | 
			
		||||
mean
 | 
			
		||||
means
 | 
			
		||||
meantime
 | 
			
		||||
meanwhile
 | 
			
		||||
merely
 | 
			
		||||
mg
 | 
			
		||||
might
 | 
			
		||||
million
 | 
			
		||||
miss
 | 
			
		||||
ml
 | 
			
		||||
more
 | 
			
		||||
moreover
 | 
			
		||||
most
 | 
			
		||||
mostly
 | 
			
		||||
mr
 | 
			
		||||
mrs
 | 
			
		||||
much
 | 
			
		||||
mug
 | 
			
		||||
must
 | 
			
		||||
my
 | 
			
		||||
myself
 | 
			
		||||
n
 | 
			
		||||
na
 | 
			
		||||
name
 | 
			
		||||
namely
 | 
			
		||||
nay
 | 
			
		||||
nd
 | 
			
		||||
near
 | 
			
		||||
nearly
 | 
			
		||||
necessarily
 | 
			
		||||
necessary
 | 
			
		||||
need
 | 
			
		||||
needs
 | 
			
		||||
neither
 | 
			
		||||
never
 | 
			
		||||
nevertheless
 | 
			
		||||
new
 | 
			
		||||
next
 | 
			
		||||
nine
 | 
			
		||||
ninety
 | 
			
		||||
no
 | 
			
		||||
nobody
 | 
			
		||||
non
 | 
			
		||||
none
 | 
			
		||||
nonetheless
 | 
			
		||||
noone
 | 
			
		||||
nor
 | 
			
		||||
normally
 | 
			
		||||
nos
 | 
			
		||||
not
 | 
			
		||||
noted
 | 
			
		||||
nothing
 | 
			
		||||
now
 | 
			
		||||
nowhere
 | 
			
		||||
o
 | 
			
		||||
obtain
 | 
			
		||||
obtained
 | 
			
		||||
obviously
 | 
			
		||||
of
 | 
			
		||||
off
 | 
			
		||||
often
 | 
			
		||||
oh
 | 
			
		||||
ok
 | 
			
		||||
okay
 | 
			
		||||
old
 | 
			
		||||
omitted
 | 
			
		||||
on
 | 
			
		||||
once
 | 
			
		||||
one
 | 
			
		||||
ones
 | 
			
		||||
only
 | 
			
		||||
onto
 | 
			
		||||
or
 | 
			
		||||
ord
 | 
			
		||||
other
 | 
			
		||||
others
 | 
			
		||||
otherwise
 | 
			
		||||
ought
 | 
			
		||||
our
 | 
			
		||||
ours
 | 
			
		||||
ourselves
 | 
			
		||||
out
 | 
			
		||||
outside
 | 
			
		||||
over
 | 
			
		||||
overall
 | 
			
		||||
owing
 | 
			
		||||
own
 | 
			
		||||
p
 | 
			
		||||
page
 | 
			
		||||
pages
 | 
			
		||||
part
 | 
			
		||||
particular
 | 
			
		||||
particularly
 | 
			
		||||
past
 | 
			
		||||
per
 | 
			
		||||
perhaps
 | 
			
		||||
placed
 | 
			
		||||
please
 | 
			
		||||
plus
 | 
			
		||||
poorly
 | 
			
		||||
possible
 | 
			
		||||
possibly
 | 
			
		||||
potentially
 | 
			
		||||
pp
 | 
			
		||||
predominantly
 | 
			
		||||
present
 | 
			
		||||
previously
 | 
			
		||||
primarily
 | 
			
		||||
probably
 | 
			
		||||
promptly
 | 
			
		||||
proud
 | 
			
		||||
provides
 | 
			
		||||
put
 | 
			
		||||
q
 | 
			
		||||
que
 | 
			
		||||
quickly
 | 
			
		||||
quite
 | 
			
		||||
qv
 | 
			
		||||
r
 | 
			
		||||
ran
 | 
			
		||||
rather
 | 
			
		||||
rd
 | 
			
		||||
re
 | 
			
		||||
readily
 | 
			
		||||
really
 | 
			
		||||
recent
 | 
			
		||||
recently
 | 
			
		||||
ref
 | 
			
		||||
refs
 | 
			
		||||
regarding
 | 
			
		||||
regardless
 | 
			
		||||
regards
 | 
			
		||||
related
 | 
			
		||||
relatively
 | 
			
		||||
research
 | 
			
		||||
respectively
 | 
			
		||||
resulted
 | 
			
		||||
resulting
 | 
			
		||||
results
 | 
			
		||||
right
 | 
			
		||||
run
 | 
			
		||||
s
 | 
			
		||||
said
 | 
			
		||||
same
 | 
			
		||||
saw
 | 
			
		||||
say
 | 
			
		||||
saying
 | 
			
		||||
says
 | 
			
		||||
sec
 | 
			
		||||
section
 | 
			
		||||
see
 | 
			
		||||
seeing
 | 
			
		||||
seem
 | 
			
		||||
seemed
 | 
			
		||||
seeming
 | 
			
		||||
seems
 | 
			
		||||
seen
 | 
			
		||||
self
 | 
			
		||||
selves
 | 
			
		||||
sent
 | 
			
		||||
seven
 | 
			
		||||
several
 | 
			
		||||
shall
 | 
			
		||||
she
 | 
			
		||||
shed
 | 
			
		||||
she'll
 | 
			
		||||
shes
 | 
			
		||||
should
 | 
			
		||||
shouldn't
 | 
			
		||||
show
 | 
			
		||||
showed
 | 
			
		||||
shown
 | 
			
		||||
showns
 | 
			
		||||
shows
 | 
			
		||||
significant
 | 
			
		||||
significantly
 | 
			
		||||
similar
 | 
			
		||||
similarly
 | 
			
		||||
since
 | 
			
		||||
six
 | 
			
		||||
slightly
 | 
			
		||||
so
 | 
			
		||||
some
 | 
			
		||||
somebody
 | 
			
		||||
somehow
 | 
			
		||||
someone
 | 
			
		||||
somethan
 | 
			
		||||
something
 | 
			
		||||
sometime
 | 
			
		||||
sometimes
 | 
			
		||||
somewhat
 | 
			
		||||
somewhere
 | 
			
		||||
soon
 | 
			
		||||
sorry
 | 
			
		||||
specifically
 | 
			
		||||
specified
 | 
			
		||||
specify
 | 
			
		||||
specifying
 | 
			
		||||
still
 | 
			
		||||
stop
 | 
			
		||||
strongly
 | 
			
		||||
sub
 | 
			
		||||
substantially
 | 
			
		||||
successfully
 | 
			
		||||
such
 | 
			
		||||
sufficiently
 | 
			
		||||
suggest
 | 
			
		||||
sup
 | 
			
		||||
sure   t
 | 
			
		||||
take
 | 
			
		||||
taken
 | 
			
		||||
taking
 | 
			
		||||
tell
 | 
			
		||||
tends
 | 
			
		||||
th
 | 
			
		||||
than
 | 
			
		||||
thank
 | 
			
		||||
thanks
 | 
			
		||||
thanx
 | 
			
		||||
that
 | 
			
		||||
that'll
 | 
			
		||||
thats
 | 
			
		||||
that've
 | 
			
		||||
the
 | 
			
		||||
their
 | 
			
		||||
theirs
 | 
			
		||||
them
 | 
			
		||||
themselves
 | 
			
		||||
then
 | 
			
		||||
thence
 | 
			
		||||
there
 | 
			
		||||
thereafter
 | 
			
		||||
thereby
 | 
			
		||||
thered
 | 
			
		||||
therefore
 | 
			
		||||
therein
 | 
			
		||||
there'll
 | 
			
		||||
thereof
 | 
			
		||||
therere
 | 
			
		||||
theres
 | 
			
		||||
thereto
 | 
			
		||||
thereupon
 | 
			
		||||
there've
 | 
			
		||||
these
 | 
			
		||||
they
 | 
			
		||||
theyd
 | 
			
		||||
they'll
 | 
			
		||||
theyre
 | 
			
		||||
they've
 | 
			
		||||
think
 | 
			
		||||
this
 | 
			
		||||
those
 | 
			
		||||
thou
 | 
			
		||||
though
 | 
			
		||||
thoughh
 | 
			
		||||
thousand
 | 
			
		||||
throug
 | 
			
		||||
through
 | 
			
		||||
throughout
 | 
			
		||||
thru
 | 
			
		||||
thus
 | 
			
		||||
til
 | 
			
		||||
tip
 | 
			
		||||
to
 | 
			
		||||
together
 | 
			
		||||
too
 | 
			
		||||
took
 | 
			
		||||
toward
 | 
			
		||||
towards
 | 
			
		||||
tried
 | 
			
		||||
tries
 | 
			
		||||
truly
 | 
			
		||||
try
 | 
			
		||||
trying
 | 
			
		||||
ts
 | 
			
		||||
twice
 | 
			
		||||
two
 | 
			
		||||
u
 | 
			
		||||
un
 | 
			
		||||
under
 | 
			
		||||
unfortunately
 | 
			
		||||
unless
 | 
			
		||||
unlike
 | 
			
		||||
unlikely
 | 
			
		||||
until
 | 
			
		||||
unto
 | 
			
		||||
up
 | 
			
		||||
upon
 | 
			
		||||
ups
 | 
			
		||||
us
 | 
			
		||||
use
 | 
			
		||||
used
 | 
			
		||||
useful
 | 
			
		||||
usefully
 | 
			
		||||
usefulness
 | 
			
		||||
uses
 | 
			
		||||
using
 | 
			
		||||
usually
 | 
			
		||||
v
 | 
			
		||||
value
 | 
			
		||||
various
 | 
			
		||||
've
 | 
			
		||||
very
 | 
			
		||||
via
 | 
			
		||||
viz
 | 
			
		||||
vol
 | 
			
		||||
vols
 | 
			
		||||
vs
 | 
			
		||||
w
 | 
			
		||||
want
 | 
			
		||||
wants
 | 
			
		||||
was
 | 
			
		||||
wasnt
 | 
			
		||||
way
 | 
			
		||||
we
 | 
			
		||||
wed
 | 
			
		||||
welcome
 | 
			
		||||
we'll
 | 
			
		||||
went
 | 
			
		||||
were
 | 
			
		||||
werent
 | 
			
		||||
we've
 | 
			
		||||
what
 | 
			
		||||
whatever
 | 
			
		||||
what'll
 | 
			
		||||
whats
 | 
			
		||||
when
 | 
			
		||||
whence
 | 
			
		||||
whenever
 | 
			
		||||
where
 | 
			
		||||
whereafter
 | 
			
		||||
whereas
 | 
			
		||||
whereby
 | 
			
		||||
wherein
 | 
			
		||||
wheres
 | 
			
		||||
whereupon
 | 
			
		||||
wherever
 | 
			
		||||
whether
 | 
			
		||||
which
 | 
			
		||||
while
 | 
			
		||||
whim
 | 
			
		||||
whither
 | 
			
		||||
who
 | 
			
		||||
whod
 | 
			
		||||
whoever
 | 
			
		||||
whole
 | 
			
		||||
who'll
 | 
			
		||||
whom
 | 
			
		||||
whomever
 | 
			
		||||
whos
 | 
			
		||||
whose
 | 
			
		||||
why
 | 
			
		||||
widely
 | 
			
		||||
willing
 | 
			
		||||
wish
 | 
			
		||||
with
 | 
			
		||||
within
 | 
			
		||||
without
 | 
			
		||||
wont
 | 
			
		||||
words
 | 
			
		||||
world
 | 
			
		||||
would
 | 
			
		||||
wouldnt
 | 
			
		||||
www
 | 
			
		||||
x
 | 
			
		||||
y
 | 
			
		||||
yes
 | 
			
		||||
yet
 | 
			
		||||
you
 | 
			
		||||
youd
 | 
			
		||||
you'll
 | 
			
		||||
your
 | 
			
		||||
youre
 | 
			
		||||
yours
 | 
			
		||||
yourself
 | 
			
		||||
yourselves
 | 
			
		||||
you've
 | 
			
		||||
z
 | 
			
		||||
zero
 | 
			
		||||
@@ -12,6 +12,9 @@ module Sibe.NaiveBayes
 | 
			
		||||
   stdev,
 | 
			
		||||
   cleanText,
 | 
			
		||||
   cleanDocuments,
 | 
			
		||||
   ngram,
 | 
			
		||||
   removeWords,
 | 
			
		||||
   removeStopwords,
 | 
			
		||||
  )
 | 
			
		||||
  where
 | 
			
		||||
    import Data.List
 | 
			
		||||
@@ -21,7 +24,7 @@ module Sibe.NaiveBayes
 | 
			
		||||
    import Data.Maybe
 | 
			
		||||
    import Control.Arrow ((&&&))
 | 
			
		||||
    import Text.Regex.PCRE
 | 
			
		||||
    import Data.Char (isSpace)
 | 
			
		||||
    import Data.Char (isSpace, isNumber)
 | 
			
		||||
    import NLP.Stemmer
 | 
			
		||||
 | 
			
		||||
    type Class = Int;
 | 
			
		||||
@@ -36,6 +39,7 @@ module Sibe.NaiveBayes
 | 
			
		||||
                 , megadoc    :: String
 | 
			
		||||
                 , cd         :: [(Class, [Document])]
 | 
			
		||||
                 , cw         :: [(Class, [(String, Int)])]
 | 
			
		||||
                 , cgram      :: [(Class, [(String, Int)])]
 | 
			
		||||
                 } deriving (Eq, Show, Read)
 | 
			
		||||
 | 
			
		||||
    train :: [Document] -> [Class] -> NB
 | 
			
		||||
@@ -49,7 +53,9 @@ module Sibe.NaiveBayes
 | 
			
		||||
          cd = zip classes (map classDocs classes)
 | 
			
		||||
 | 
			
		||||
          -- (class, [(word, count)])
 | 
			
		||||
          cw = zip classes $ l (map classWordsCounts classes)
 | 
			
		||||
          cw = zip classes $ map classWordsCounts classes
 | 
			
		||||
 | 
			
		||||
          cgram = zip classes $ map classNGramCounts classes
 | 
			
		||||
 | 
			
		||||
      in NB { documents  = documents
 | 
			
		||||
            , classes    = cls
 | 
			
		||||
@@ -57,44 +63,60 @@ module Sibe.NaiveBayes
 | 
			
		||||
            , megadoc    = megadoc
 | 
			
		||||
            , cd         = cd
 | 
			
		||||
            , cw         = cw
 | 
			
		||||
            , cgram      = cgram
 | 
			
		||||
            }
 | 
			
		||||
      where
 | 
			
		||||
        concatDocs = concatMap (\(Document text _) -> text ++ " ")
 | 
			
		||||
 | 
			
		||||
        classDocs x = filter ((==x) . c) documents
 | 
			
		||||
        classMegadoc x = concatMap (\(Document text _) -> text ++ " ") (classDocs x)
 | 
			
		||||
        classWords x = words (classMegadoc x)
 | 
			
		||||
        classNGram n = ngram n . classMegadoc
 | 
			
		||||
        classVocabulary x = ordNub (classWords x)
 | 
			
		||||
        classMegadoc = concatMap (\(Document text _) -> text ++ " ") . classDocs
 | 
			
		||||
        classWords = words . classMegadoc
 | 
			
		||||
        classNGram = concatMap (\(Document text _) -> text ++ " ") . ngram 2 . classDocs
 | 
			
		||||
        classNGramWords = words . classNGram
 | 
			
		||||
        classVocabulary = ordNub . classWords
 | 
			
		||||
        classPrior x = genericLength (classDocs x) / genericLength documents
 | 
			
		||||
        countWordInDoc d w = genericLength (filter (==w) d)
 | 
			
		||||
        classWordsCounts x =
 | 
			
		||||
          let voc = classVocabulary x
 | 
			
		||||
          in zip voc $ map (countWordInDoc (classWords x)) voc
 | 
			
		||||
        wordsCount ws voc =
 | 
			
		||||
          zip voc $ map (countWordInDoc ws) voc
 | 
			
		||||
        classWordsCounts x = wordsCount (classWords x) (classVocabulary x)
 | 
			
		||||
        classNGramCounts x = wordsCount (classNGramWords x) (ordNub $ classNGramWords x)
 | 
			
		||||
 | 
			
		||||
    ngram :: Int -> String -> [String]
 | 
			
		||||
    ngram n text =
 | 
			
		||||
      let ws = words text
 | 
			
		||||
      in map (\(i, w) -> unwords $ w:((take (n - 1) . drop (i+1)) ws)) (zip [0..] ws)
 | 
			
		||||
    ngram :: Int -> [Document] -> [Document]
 | 
			
		||||
    ngram n documents =
 | 
			
		||||
      map (\(Document text c) -> Document (helper text) c) documents
 | 
			
		||||
      where
 | 
			
		||||
        helper text =
 | 
			
		||||
          let ws = words text
 | 
			
		||||
              pairs = zip [0..] ws
 | 
			
		||||
              grams = map (\(i, w) -> concat . intersperse "_" $ w:((take (n - 1) . drop (i+1)) ws)) pairs
 | 
			
		||||
          in unwords ("<b>":grams)
 | 
			
		||||
 | 
			
		||||
    run :: String -> NB -> Class
 | 
			
		||||
    run text (NB documents classes vocabulary megadoc cd cw) =
 | 
			
		||||
    run :: String -> NB -> (Class, Double)
 | 
			
		||||
    run txt (NB documents classes vocabulary megadoc cd cw cgram) =
 | 
			
		||||
      let scores = map (score . fst) classes
 | 
			
		||||
      in argmax scores
 | 
			
		||||
          index = argmax scores
 | 
			
		||||
          m = maximum scores
 | 
			
		||||
          confidence = m / sum scores
 | 
			
		||||
      in (index, 0)
 | 
			
		||||
      where
 | 
			
		||||
        score c =
 | 
			
		||||
          let prior = snd (classes !! c)
 | 
			
		||||
          in prior * product (map (prob c) (words text))
 | 
			
		||||
 | 
			
		||||
          -- below is the formula according to Multinominal Naive Bayes, but it seems
 | 
			
		||||
          -- using a uniform prior probability seems to work better
 | 
			
		||||
          -- in prior * product (map (prob c) (words txt))
 | 
			
		||||
 | 
			
		||||
          in product (map (prob c) (words txt))
 | 
			
		||||
 | 
			
		||||
        prob c w =
 | 
			
		||||
          let tctM = find ((==w) . fst) (snd (cw !! c))
 | 
			
		||||
              tct  = (snd . fromJust) tctM
 | 
			
		||||
              cvoc = (genericLength . snd) (cw !! c)
 | 
			
		||||
          let tctM = find ((== w) . fst) (snd (cw !! c))
 | 
			
		||||
              tct  = if isJust tctM then (snd . fromJust) tctM else 0
 | 
			
		||||
              cvoc = sum $ map snd (snd (cw !! c))
 | 
			
		||||
              voc  = vocabulary
 | 
			
		||||
          in
 | 
			
		||||
            if isJust tctM then
 | 
			
		||||
              realToFrac (tct + 1) / realToFrac (cvoc + voc)
 | 
			
		||||
            else
 | 
			
		||||
              1 / realToFrac (cvoc + voc)
 | 
			
		||||
              gram = find ((==w) . last . splitOn "_" . fst) (snd (cgram !! c))
 | 
			
		||||
              pg   = if isJust gram then (snd . fromJust) gram else 0
 | 
			
		||||
          -- in realToFrac (tct * pg + 1) / realToFrac (cvoc + voc) -- uncomment to enable ngrams
 | 
			
		||||
          in realToFrac (tct + 1) / realToFrac (cvoc + voc)
 | 
			
		||||
 | 
			
		||||
    argmax :: (Ord a) => [a] -> Int
 | 
			
		||||
    argmax x = fst $ maximumBy (\(_, a) (_, b) -> a `compare` b) (zip [0..] x)
 | 
			
		||||
@@ -113,7 +135,8 @@ module Sibe.NaiveBayes
 | 
			
		||||
      let puncs = filter (`notElem` ['!', '"', '#', '$', '%', '(', ')', '.', '?']) (trim string)
 | 
			
		||||
          spacify = foldl (\acc x -> replace x ' ' acc) puncs [',', '/', '-', '\n', '\r']
 | 
			
		||||
          stemmed = unwords $ map (stem Porter) (words spacify)
 | 
			
		||||
      in stemmed
 | 
			
		||||
          nonumber = filter (not . isNumber) stemmed
 | 
			
		||||
      in (unwords . words) nonumber
 | 
			
		||||
      where
 | 
			
		||||
        trim = f . f
 | 
			
		||||
          where
 | 
			
		||||
@@ -124,11 +147,21 @@ module Sibe.NaiveBayes
 | 
			
		||||
    cleanDocuments :: [Document] -> [Document]
 | 
			
		||||
    cleanDocuments documents =
 | 
			
		||||
      let cleaned = map (\(Document text c) -> Document (cleanText text) c) documents
 | 
			
		||||
          wc = wordCounts (concatDocs cleaned)
 | 
			
		||||
      in cleaned
 | 
			
		||||
 | 
			
		||||
    removeWords :: [String] -> [Document] -> [Document]
 | 
			
		||||
    removeWords ws documents =
 | 
			
		||||
      map (\(Document text c) -> Document (rm ws text) c) documents
 | 
			
		||||
      where
 | 
			
		||||
          rm list text =
 | 
			
		||||
            unwords $ filter (`notElem` list) (words text)
 | 
			
		||||
 | 
			
		||||
    removeStopwords :: Int -> [Document] -> [Document]
 | 
			
		||||
    removeStopwords i documents =
 | 
			
		||||
      let wc = wordCounts (concatDocs documents)
 | 
			
		||||
          wlist = sortBy (\(_, a) (_, b) -> b `compare` a) wc
 | 
			
		||||
          stopwords = l $ map fst (take 30 wlist)
 | 
			
		||||
          wstopwords = map (\(Document text c) -> Document (removeWords stopwords text) c) cleaned
 | 
			
		||||
      in wstopwords
 | 
			
		||||
          stopwords = map fst (take i wlist)
 | 
			
		||||
      in removeWords stopwords documents
 | 
			
		||||
      where
 | 
			
		||||
        vocabulary x = ordNub (words x)
 | 
			
		||||
        countWordInDoc d w = genericLength (filter (==w) d)
 | 
			
		||||
@@ -136,9 +169,6 @@ module Sibe.NaiveBayes
 | 
			
		||||
          let voc = vocabulary x
 | 
			
		||||
          in zip voc $ map (countWordInDoc (words x)) voc
 | 
			
		||||
 | 
			
		||||
        removeWords list text =
 | 
			
		||||
          unwords $ filter (`notElem` list) (words text)
 | 
			
		||||
 | 
			
		||||
        concatDocs = concatMap (\(Document text _) -> text ++ " ")
 | 
			
		||||
 | 
			
		||||
    l :: (Show a) => a -> a
 | 
			
		||||
@@ -151,37 +181,38 @@ module Sibe.NaiveBayes
 | 
			
		||||
        go s (x:xs) = if x `Set.member` s then go s xs
 | 
			
		||||
                                          else x : go (Set.insert x s) xs
 | 
			
		||||
 | 
			
		||||
    accuracy :: [(Int, Int)] -> Double
 | 
			
		||||
    accuracy :: [(Int, (Int, Double))] -> Double
 | 
			
		||||
    accuracy results =
 | 
			
		||||
      let correct = filter (uncurry (==)) results
 | 
			
		||||
      let pairs = map (\(a, b) -> (a, fst b)) results
 | 
			
		||||
          correct = filter (uncurry (==)) pairs
 | 
			
		||||
      in genericLength correct / genericLength results
 | 
			
		||||
 | 
			
		||||
    recall :: [(Int, Int)] -> Double
 | 
			
		||||
    recall :: [(Int, (Int, Double))] -> Double
 | 
			
		||||
    recall results =
 | 
			
		||||
      let classes = ordNub (map fst results)
 | 
			
		||||
          s = sum (map rec classes) / genericLength classes
 | 
			
		||||
      in s
 | 
			
		||||
      where
 | 
			
		||||
        rec a =
 | 
			
		||||
          let t = genericLength $ filter (\(c, r) -> c == r && c == a) results
 | 
			
		||||
              y = genericLength $ filter (\(c, r) -> c == a) results
 | 
			
		||||
          let t = genericLength $ filter (\(c, (r, _)) -> c == r && c == a) results
 | 
			
		||||
              y = genericLength $ filter (\(c, (r, _)) -> c == a) results
 | 
			
		||||
          in t / y
 | 
			
		||||
 | 
			
		||||
    precision :: [(Int, Int)] -> Double
 | 
			
		||||
    precision :: [(Int, (Int, Double))] -> Double
 | 
			
		||||
    precision results =
 | 
			
		||||
      let classes = ordNub (map fst results)
 | 
			
		||||
          s = sum (map prec classes) / genericLength classes
 | 
			
		||||
      in s
 | 
			
		||||
      where
 | 
			
		||||
        prec a =
 | 
			
		||||
          let t = genericLength $ filter (\(c, r) -> c == r && c == a) results
 | 
			
		||||
              y = genericLength $ filter (\(c, r) -> r == a) results
 | 
			
		||||
          let t = genericLength $ filter (\(c, (r, _)) -> c == r && c == a) results
 | 
			
		||||
              y = genericLength $ filter (\(c, (r, _)) -> r == a) results
 | 
			
		||||
          in
 | 
			
		||||
            if y == 0
 | 
			
		||||
              then 0
 | 
			
		||||
              else t / y
 | 
			
		||||
 | 
			
		||||
    fmeasure :: [(Int, Int)] -> Double
 | 
			
		||||
    fmeasure :: [(Int, (Int, Double))] -> Double
 | 
			
		||||
    fmeasure results =
 | 
			
		||||
      let r = recall results
 | 
			
		||||
          p = precision results
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user