fix(naivebayes): fix the algorithm to make it actually work
feat(cleanDocuments): preprocess documents, use stemming and stopword elimination for better accuracy
This commit is contained in:
parent
3cf0625794
commit
ea1f05f001
@ -1,12 +1,13 @@
|
||||
module Main
|
||||
where
|
||||
import Sibe
|
||||
-- import Sibe
|
||||
import Sibe.NaiveBayes
|
||||
import Text.Printf
|
||||
import Data.List
|
||||
import Data.Maybe
|
||||
import Debug.Trace
|
||||
import Data.List.Split
|
||||
import Control.Arrow ((&&&))
|
||||
|
||||
main = do
|
||||
dataset <- readFile "examples/doc-classifier-data/data-reuters"
|
||||
@ -15,18 +16,33 @@ module Main
|
||||
classes <- map (filter (/= ' ')) . lines <$> readFile "examples/doc-classifier-data/data-classes"
|
||||
|
||||
let intClasses = [0..length classes - 1]
|
||||
documents = createDocuments classes dataset
|
||||
testDocuments = createDocuments classes test
|
||||
devTestDocuments = take 20 testDocuments
|
||||
nb = initialize documents
|
||||
-- let intClasses = [0, 1]
|
||||
documents = cleanDocuments $ createDocuments classes dataset
|
||||
-- documents = [Document "Chinese Beijing Chinese" 0,
|
||||
-- Document "Chinese Chinese Shanghai" 0,
|
||||
-- Document "Chinese Macao" 0,
|
||||
-- Document "Japan Tokyo Chinese" 1]
|
||||
-- testDocuments = [Document "Chinese Chinese Chinese Japan Tokyo" 0]
|
||||
testDocuments = cleanDocuments $ createDocuments classes test
|
||||
devTestDocuments = take 30 testDocuments
|
||||
-- devTestDocuments = [Document "Chinese Chinese Chinese Tokyo Japan" 0]
|
||||
nb = train documents intClasses
|
||||
|
||||
results = map (\(Document text c) -> (c, determine text nb intClasses documents)) testDocuments
|
||||
-- results = map (\(Document text c) -> (c, determine text nb intClasses documents)) devTestDocuments
|
||||
results = map (\(Document text c) -> (c, run text nb)) testDocuments
|
||||
-- results = map (\(Document text c) -> (c, run text nb)) devTestDocuments
|
||||
|
||||
-- print (text $ head documents)
|
||||
|
||||
let showResults (c, r) = putStrLn (classes !! c ++ " ~ " ++ classes !! r)
|
||||
mapM_ showResults results
|
||||
|
||||
putStrLn $ "Recall: " ++ show (recall results)
|
||||
putStrLn $ "Precision: " ++ show (precision results)
|
||||
putStrLn $ "F Measure: " ++ show (fmeasure (precision results) (recall results))
|
||||
putStrLn $ "F Measure: " ++ show (fmeasure results)
|
||||
putStrLn $ "Accuracy: " ++ show (accuracy results)
|
||||
|
||||
createDocuments classes content =
|
||||
let splitted = splitOn (replicate 10 '-' ++ "\n") content
|
||||
pairs = map ((head . lines) &&& (unwords . tail . lines)) splitted
|
||||
documents = map (\(topic, text) -> Document text (fromJust $ elemIndex topic classes)) pairs
|
||||
in documents
|
||||
|
54
examples/naivebayes-sentiment-analysis.hs
Normal file
54
examples/naivebayes-sentiment-analysis.hs
Normal file
@ -0,0 +1,54 @@
|
||||
{-# LANGUAGE BangPatterns #-}
|
||||
module Main
|
||||
where
|
||||
-- import Sibe
|
||||
import Sibe.NaiveBayes
|
||||
import Text.Printf
|
||||
import Data.List
|
||||
import Data.Maybe
|
||||
import Debug.Trace
|
||||
import Data.List.Split
|
||||
import System.Directory
|
||||
import Control.DeepSeq
|
||||
import System.IO
|
||||
|
||||
main = do
|
||||
putStr "Reading documents... "
|
||||
neg_documents <- createDocuments "examples/sentiment-analysis-data/train/neg/"
|
||||
pos_documents <- createDocuments "examples/sentiment-analysis-data/train/pos/"
|
||||
|
||||
test_neg_documents <- createDocuments "examples/sentiment-analysis-data/test/neg/"
|
||||
test_pos_documents <- createDocuments "examples/sentiment-analysis-data/test/pos/"
|
||||
putStrLn "done"
|
||||
|
||||
let classes = [0..9] -- rating, from 0 to 9 (1 to 10)
|
||||
documents = neg_documents ++ pos_documents
|
||||
nb = train documents classes
|
||||
|
||||
testDocuments = neg_documents ++ pos_documents
|
||||
|
||||
results = map (\(Document text c) -> (c, run text nb)) testDocuments
|
||||
-- results = map (\(Document text c) -> (c, determine text nb intClasses documents)) devTestDocuments
|
||||
print results
|
||||
|
||||
-- let showResults (c, r) = putStrLn (show (classes !! c) ++ " ~ " ++ show (classes !! r))
|
||||
-- mapM_ showResults results
|
||||
--
|
||||
-- putStrLn $ "Recall: " ++ show (recall results)
|
||||
-- putStrLn $ "Precision: " ++ show (precision results)
|
||||
-- putStrLn $ "F Measure: " ++ show (fmeasure results)
|
||||
-- putStrLn $ "Accuracy: " ++ show (accuracy results)
|
||||
|
||||
createDocuments :: FilePath -> IO [Document]
|
||||
createDocuments path = do
|
||||
files <- drop 2 <$> getDirectoryContents path
|
||||
let ratings = map (subtract 1 . read . take 1 . last . splitOn "_") files :: [Int]
|
||||
contents <- mapM (forceReadFile . (path ++)) files
|
||||
return $ zipWith Document contents ratings
|
||||
|
||||
forceReadFile :: FilePath -> IO String
|
||||
forceReadFile file = do
|
||||
handle <- openFile file ReadMode
|
||||
content <- hGetContents handle
|
||||
content `deepseq` hClose handle
|
||||
return content
|
1
examples/sentiment-analysis-data
Symbolic link
1
examples/sentiment-analysis-data
Symbolic link
@ -0,0 +1 @@
|
||||
../../sibe-repos/sentiment-analysis-data
|
@ -7,7 +7,7 @@ module Main where
|
||||
main = do
|
||||
let learning_rate = 0.5
|
||||
(iterations, epochs) = (2, 1000)
|
||||
a = (logistic, logistic')
|
||||
a = (sigmoid, sigmoid')
|
||||
rnetwork = randomNetwork 0 2 [(8, a)] (1, a) -- two inputs, 8 nodes in a single hidden layer, 1 output
|
||||
|
||||
inputs = [vector [0, 1], vector [1, 0], vector [1, 1], vector [0, 0]]
|
||||
|
6
profiling/run
Normal file
6
profiling/run
Normal file
@ -0,0 +1,6 @@
|
||||
#!/bin/bash
|
||||
|
||||
PROG==geniconvert
|
||||
VIEW==open
|
||||
|
||||
stack build --profile
|
6
profiling/setup
Normal file
6
profiling/setup
Normal file
@ -0,0 +1,6 @@
|
||||
#!/bin/bash
|
||||
|
||||
chmod u+x profiling/setup
|
||||
chmod u+x profiling/run
|
||||
chmod u+x profiling/compare
|
||||
chmod u+x profiling/save
|
17
sibe.cabal
17
sibe.cabal
@ -22,6 +22,10 @@ library
|
||||
, deepseq
|
||||
, containers
|
||||
, split
|
||||
, regex-base
|
||||
, regex-pcre
|
||||
, text
|
||||
, stemmer
|
||||
default-language: Haskell2010
|
||||
|
||||
executable sibe-exe
|
||||
@ -53,6 +57,19 @@ executable example-naivebayes-doc-classifier
|
||||
, split
|
||||
default-language: Haskell2010
|
||||
|
||||
executable example-naivebayes-sentiment-analysis
|
||||
hs-source-dirs: examples
|
||||
main-is: naivebayes-sentiment-analysis.hs
|
||||
ghc-options: -threaded -rtsopts -with-rtsopts=-N
|
||||
build-depends: base
|
||||
, sibe
|
||||
, hmatrix
|
||||
, containers
|
||||
, split
|
||||
, directory
|
||||
, deepseq
|
||||
default-language: Haskell2010
|
||||
|
||||
test-suite sibe-test
|
||||
type: exitcode-stdio-1.0
|
||||
hs-source-dirs: test
|
||||
|
29
src/Sibe.hs
29
src/Sibe.hs
@ -17,8 +17,10 @@ module Sibe
|
||||
train,
|
||||
session,
|
||||
shuffle,
|
||||
logistic,
|
||||
logistic',
|
||||
sigmoid,
|
||||
sigmoid',
|
||||
relu,
|
||||
relu',
|
||||
crossEntropy,
|
||||
genSeed,
|
||||
replaceVector
|
||||
@ -88,11 +90,17 @@ module Sibe
|
||||
randomLayer seed (input, h) a :-
|
||||
randomNetwork (seed + 1) h hs output
|
||||
|
||||
logistic :: Vector Double -> Vector Double
|
||||
logistic x = 1 / (1 + exp (-x))
|
||||
sigmoid :: Vector Double -> Vector Double
|
||||
sigmoid x = 1 / max (1 + exp (-x)) 1e-10
|
||||
|
||||
logistic' :: Vector Double -> Vector Double
|
||||
logistic' x = logistic x * (1 - logistic x)
|
||||
sigmoid' :: Vector Double -> Vector Double
|
||||
sigmoid' x = sigmoid x * (1 - sigmoid x)
|
||||
|
||||
relu :: Vector Double -> Vector Double
|
||||
relu x = log (max (1 + exp x) 1e-10)
|
||||
|
||||
relu' :: Vector Double -> Vector Double
|
||||
relu' = sigmoid
|
||||
|
||||
crossEntropy :: Output -> Output -> Double
|
||||
crossEntropy output target =
|
||||
@ -100,7 +108,7 @@ module Sibe
|
||||
n = fromIntegral (length pairs)
|
||||
in (-1 / n) * sum (map f pairs)
|
||||
where
|
||||
f (a, y) = y * log a + (1 - y) * log (1 - a)
|
||||
f (a, y) = y * log (max 1e-10 a) + (1 - y) * log (max (1 - a) 1e-10)
|
||||
|
||||
train :: Input
|
||||
-> Network
|
||||
@ -114,8 +122,8 @@ module Sibe
|
||||
let y = runLayer input l
|
||||
o = fn y
|
||||
delta = o - target
|
||||
-- de = delta * fn' y -- quadratic cost
|
||||
de = delta -- cross entropy cost
|
||||
de = delta * fn' y
|
||||
-- de = delta -- cross entropy cost
|
||||
|
||||
biases' = biases - scale alpha de
|
||||
weights' = weights - scale alpha (input `outer` de) -- small inputs learn slowly
|
||||
@ -174,3 +182,6 @@ module Sibe
|
||||
rrow index (x:xs)
|
||||
| index == index = value:xs
|
||||
| otherwise = x : rrow (index + 1) xs
|
||||
|
||||
clip :: Double -> (Double, Double) -> Double
|
||||
clip x (l, u) = min u (max l x)
|
||||
|
@ -1,15 +1,17 @@
|
||||
module Sibe.NaiveBayes
|
||||
(Document(..),
|
||||
NB(..),
|
||||
createDocuments,
|
||||
initialize,
|
||||
calculate,
|
||||
determine,
|
||||
train,
|
||||
run,
|
||||
ordNub,
|
||||
accuracy,
|
||||
precision,
|
||||
recall,
|
||||
fmeasure,
|
||||
mean,
|
||||
stdev,
|
||||
cleanText,
|
||||
cleanDocuments,
|
||||
)
|
||||
where
|
||||
import Data.List
|
||||
@ -18,47 +20,126 @@ module Sibe.NaiveBayes
|
||||
import Data.List.Split
|
||||
import Data.Maybe
|
||||
import Control.Arrow ((&&&))
|
||||
type Class = Int
|
||||
import Text.Regex.PCRE
|
||||
import Data.Char (isSpace)
|
||||
import NLP.Stemmer
|
||||
|
||||
type Class = Int;
|
||||
|
||||
data Document = Document { text :: String
|
||||
, c :: Class
|
||||
} deriving (Eq, Show, Read)
|
||||
|
||||
data NB = NB { vocabulary :: Double
|
||||
data NB = NB { documents :: [Document]
|
||||
, classes :: [(Class, Double)]
|
||||
, vocabulary :: Int
|
||||
, megadoc :: String
|
||||
}
|
||||
, cd :: [(Class, [Document])]
|
||||
, cw :: [(Class, [(String, Int)])]
|
||||
} deriving (Eq, Show, Read)
|
||||
|
||||
initialize :: [Document] -> NB
|
||||
initialize documents =
|
||||
let megadoc = concatMap (\(Document text _) -> text ++ " ") documents
|
||||
train :: [Document] -> [Class] -> NB
|
||||
train documents classes =
|
||||
let megadoc = concatDocs documents
|
||||
vocabulary = genericLength ((ordNub . words) megadoc)
|
||||
in NB vocabulary megadoc
|
||||
-- (class, prior probability)
|
||||
cls = zip classes (map classPrior classes)
|
||||
|
||||
determine :: String -> NB -> [Class] -> [Document] -> Class
|
||||
determine text nb classes documents =
|
||||
let scores = zip [0..] (map (\cls -> calculate text nb cls documents) classes)
|
||||
m = maximumBy (\(i0, c0) (i1, c1) -> c0 `compare` c1) scores
|
||||
in fst m
|
||||
-- (class, [document])
|
||||
cd = zip classes (map classDocs classes)
|
||||
|
||||
calculate :: String -> NB -> Class -> [Document] -> Double
|
||||
calculate text (NB vocabulary megadoc) cls documents =
|
||||
let docs = filter (\(Document text c) -> c == cls) documents
|
||||
texts = map (\(Document text _) -> text ++ " ") docs
|
||||
classText = concat texts
|
||||
classWords = words classText
|
||||
c = genericLength classWords
|
||||
pc = genericLength docs / genericLength documents
|
||||
in pc * product (map (cword classWords c) (words text))
|
||||
-- (class, [(word, count)])
|
||||
cw = zip classes $ l (map classWordsCounts classes)
|
||||
|
||||
in NB { documents = documents
|
||||
, classes = cls
|
||||
, vocabulary = vocabulary
|
||||
, megadoc = megadoc
|
||||
, cd = cd
|
||||
, cw = cw
|
||||
}
|
||||
where
|
||||
cword classWords c word =
|
||||
let wc = genericLength (filter (==word) classWords)
|
||||
in (wc + 1) / (c + vocabulary)
|
||||
concatDocs = concatMap (\(Document text _) -> text ++ " ")
|
||||
|
||||
createDocuments classes content =
|
||||
let splitted = splitOn (replicate 10 '-' ++ "\n") content
|
||||
pairs = map ((head . lines) &&& (concat . tail . lines)) splitted
|
||||
documents = map (\(topic, text) -> Document text (fromJust $ elemIndex topic classes)) pairs
|
||||
in documents
|
||||
classDocs x = filter ((==x) . c) documents
|
||||
classMegadoc x = concatMap (\(Document text _) -> text ++ " ") (classDocs x)
|
||||
classWords x = words (classMegadoc x)
|
||||
classNGram n = ngram n . classMegadoc
|
||||
classVocabulary x = ordNub (classWords x)
|
||||
classPrior x = genericLength (classDocs x) / genericLength documents
|
||||
countWordInDoc d w = genericLength (filter (==w) d)
|
||||
classWordsCounts x =
|
||||
let voc = classVocabulary x
|
||||
in zip voc $ map (countWordInDoc (classWords x)) voc
|
||||
|
||||
ngram :: Int -> String -> [String]
|
||||
ngram n text =
|
||||
let ws = words text
|
||||
in map (\(i, w) -> unwords $ w:((take (n - 1) . drop (i+1)) ws)) (zip [0..] ws)
|
||||
|
||||
run :: String -> NB -> Class
|
||||
run text (NB documents classes vocabulary megadoc cd cw) =
|
||||
let scores = map (score . fst) classes
|
||||
in argmax scores
|
||||
where
|
||||
score c =
|
||||
let prior = snd (classes !! c)
|
||||
in prior * product (map (prob c) (words text))
|
||||
prob c w =
|
||||
let tctM = find ((==w) . fst) (snd (cw !! c))
|
||||
tct = (snd . fromJust) tctM
|
||||
cvoc = (genericLength . snd) (cw !! c)
|
||||
voc = vocabulary
|
||||
in
|
||||
if isJust tctM then
|
||||
realToFrac (tct + 1) / realToFrac (cvoc + voc)
|
||||
else
|
||||
1 / realToFrac (cvoc + voc)
|
||||
|
||||
argmax :: (Ord a) => [a] -> Int
|
||||
argmax x = fst $ maximumBy (\(_, a) (_, b) -> a `compare` b) (zip [0..] x)
|
||||
|
||||
mean :: [Double] -> Double
|
||||
mean x = sum x / genericLength x
|
||||
|
||||
stdev :: [Double] -> Double
|
||||
stdev x =
|
||||
let avg = mean x
|
||||
variance = sum (map ((^2) . subtract avg) x) / (genericLength x - 1)
|
||||
in sqrt variance
|
||||
|
||||
cleanText :: String -> String
|
||||
cleanText string =
|
||||
let puncs = filter (`notElem` ['!', '"', '#', '$', '%', '(', ')', '.', '?']) (trim string)
|
||||
spacify = foldl (\acc x -> replace x ' ' acc) puncs [',', '/', '-', '\n', '\r']
|
||||
stemmed = unwords $ map (stem Porter) (words spacify)
|
||||
in stemmed
|
||||
where
|
||||
trim = f . f
|
||||
where
|
||||
f = reverse . dropWhile isSpace
|
||||
replace needle replacement =
|
||||
map (\c -> if c == needle then replacement else c)
|
||||
|
||||
cleanDocuments :: [Document] -> [Document]
|
||||
cleanDocuments documents =
|
||||
let cleaned = map (\(Document text c) -> Document (cleanText text) c) documents
|
||||
wc = wordCounts (concatDocs cleaned)
|
||||
wlist = sortBy (\(_, a) (_, b) -> b `compare` a) wc
|
||||
stopwords = l $ map fst (take 30 wlist)
|
||||
wstopwords = map (\(Document text c) -> Document (removeWords stopwords text) c) cleaned
|
||||
in wstopwords
|
||||
where
|
||||
vocabulary x = ordNub (words x)
|
||||
countWordInDoc d w = genericLength (filter (==w) d)
|
||||
wordCounts x =
|
||||
let voc = vocabulary x
|
||||
in zip voc $ map (countWordInDoc (words x)) voc
|
||||
|
||||
removeWords list text =
|
||||
unwords $ filter (`notElem` list) (words text)
|
||||
|
||||
concatDocs = concatMap (\(Document text _) -> text ++ " ")
|
||||
|
||||
l :: (Show a) => a -> a
|
||||
l a = trace (show a) a
|
||||
@ -100,5 +181,8 @@ module Sibe.NaiveBayes
|
||||
then 0
|
||||
else t / y
|
||||
|
||||
fmeasure :: Double -> Double -> Double
|
||||
fmeasure r p = (2 * p * r) / (p + r)
|
||||
fmeasure :: [(Int, Int)] -> Double
|
||||
fmeasure results =
|
||||
let r = recall results
|
||||
p = precision results
|
||||
in (2 * p * r) / (p + r)
|
||||
|
@ -41,7 +41,10 @@ packages:
|
||||
commit: 42a88fbcb6bd1d2c4dc18fae5e962bd34fb316a1
|
||||
subdirs:
|
||||
- packages/base
|
||||
- '.'
|
||||
- .
|
||||
- http://hackage.haskell.org/package/containers-0.5.7.1/containers-0.5.7.1.tar.gz
|
||||
- http://hackage.haskell.org/package/text-1.2.2.1/text-1.2.2.1.tar.gz
|
||||
- http://hackage.haskell.org/package/stemmer-0.5.2/stemmer-0.5.2.tar.gz
|
||||
|
||||
# Dependency packages to be pulled from upstream that are not in the resolver
|
||||
# (e.g., acme-missiles-0.3)
|
||||
@ -70,3 +73,5 @@ extra-package-dbs: []
|
||||
#
|
||||
# Allow a newer minor version of GHC than the snapshot specifies
|
||||
# compiler-check: newer-minor
|
||||
|
||||
system-ghc: false
|
||||
|
Loading…
Reference in New Issue
Block a user