fix(naivebayes): fix the algorithm to make it actually work
feat(cleanDocuments): preprocess documents, use stemming and stopword elimination for better accuracy
This commit is contained in:
@ -1,12 +1,13 @@
|
||||
module Main
|
||||
where
|
||||
import Sibe
|
||||
-- import Sibe
|
||||
import Sibe.NaiveBayes
|
||||
import Text.Printf
|
||||
import Data.List
|
||||
import Data.Maybe
|
||||
import Debug.Trace
|
||||
import Data.List.Split
|
||||
import Control.Arrow ((&&&))
|
||||
|
||||
main = do
|
||||
dataset <- readFile "examples/doc-classifier-data/data-reuters"
|
||||
@ -15,18 +16,33 @@ module Main
|
||||
classes <- map (filter (/= ' ')) . lines <$> readFile "examples/doc-classifier-data/data-classes"
|
||||
|
||||
let intClasses = [0..length classes - 1]
|
||||
documents = createDocuments classes dataset
|
||||
testDocuments = createDocuments classes test
|
||||
devTestDocuments = take 20 testDocuments
|
||||
nb = initialize documents
|
||||
-- let intClasses = [0, 1]
|
||||
documents = cleanDocuments $ createDocuments classes dataset
|
||||
-- documents = [Document "Chinese Beijing Chinese" 0,
|
||||
-- Document "Chinese Chinese Shanghai" 0,
|
||||
-- Document "Chinese Macao" 0,
|
||||
-- Document "Japan Tokyo Chinese" 1]
|
||||
-- testDocuments = [Document "Chinese Chinese Chinese Japan Tokyo" 0]
|
||||
testDocuments = cleanDocuments $ createDocuments classes test
|
||||
devTestDocuments = take 30 testDocuments
|
||||
-- devTestDocuments = [Document "Chinese Chinese Chinese Tokyo Japan" 0]
|
||||
nb = train documents intClasses
|
||||
|
||||
results = map (\(Document text c) -> (c, determine text nb intClasses documents)) testDocuments
|
||||
-- results = map (\(Document text c) -> (c, determine text nb intClasses documents)) devTestDocuments
|
||||
results = map (\(Document text c) -> (c, run text nb)) testDocuments
|
||||
-- results = map (\(Document text c) -> (c, run text nb)) devTestDocuments
|
||||
|
||||
-- print (text $ head documents)
|
||||
|
||||
let showResults (c, r) = putStrLn (classes !! c ++ " ~ " ++ classes !! r)
|
||||
mapM_ showResults results
|
||||
|
||||
putStrLn $ "Recall: " ++ show (recall results)
|
||||
putStrLn $ "Precision: " ++ show (precision results)
|
||||
putStrLn $ "F Measure: " ++ show (fmeasure (precision results) (recall results))
|
||||
putStrLn $ "F Measure: " ++ show (fmeasure results)
|
||||
putStrLn $ "Accuracy: " ++ show (accuracy results)
|
||||
|
||||
createDocuments classes content =
|
||||
let splitted = splitOn (replicate 10 '-' ++ "\n") content
|
||||
pairs = map ((head . lines) &&& (unwords . tail . lines)) splitted
|
||||
documents = map (\(topic, text) -> Document text (fromJust $ elemIndex topic classes)) pairs
|
||||
in documents
|
||||
|
54
examples/naivebayes-sentiment-analysis.hs
Normal file
54
examples/naivebayes-sentiment-analysis.hs
Normal file
@ -0,0 +1,54 @@
|
||||
{-# LANGUAGE BangPatterns #-}
|
||||
module Main
|
||||
where
|
||||
-- import Sibe
|
||||
import Sibe.NaiveBayes
|
||||
import Text.Printf
|
||||
import Data.List
|
||||
import Data.Maybe
|
||||
import Debug.Trace
|
||||
import Data.List.Split
|
||||
import System.Directory
|
||||
import Control.DeepSeq
|
||||
import System.IO
|
||||
|
||||
main = do
|
||||
putStr "Reading documents... "
|
||||
neg_documents <- createDocuments "examples/sentiment-analysis-data/train/neg/"
|
||||
pos_documents <- createDocuments "examples/sentiment-analysis-data/train/pos/"
|
||||
|
||||
test_neg_documents <- createDocuments "examples/sentiment-analysis-data/test/neg/"
|
||||
test_pos_documents <- createDocuments "examples/sentiment-analysis-data/test/pos/"
|
||||
putStrLn "done"
|
||||
|
||||
let classes = [0..9] -- rating, from 0 to 9 (1 to 10)
|
||||
documents = neg_documents ++ pos_documents
|
||||
nb = train documents classes
|
||||
|
||||
testDocuments = neg_documents ++ pos_documents
|
||||
|
||||
results = map (\(Document text c) -> (c, run text nb)) testDocuments
|
||||
-- results = map (\(Document text c) -> (c, determine text nb intClasses documents)) devTestDocuments
|
||||
print results
|
||||
|
||||
-- let showResults (c, r) = putStrLn (show (classes !! c) ++ " ~ " ++ show (classes !! r))
|
||||
-- mapM_ showResults results
|
||||
--
|
||||
-- putStrLn $ "Recall: " ++ show (recall results)
|
||||
-- putStrLn $ "Precision: " ++ show (precision results)
|
||||
-- putStrLn $ "F Measure: " ++ show (fmeasure results)
|
||||
-- putStrLn $ "Accuracy: " ++ show (accuracy results)
|
||||
|
||||
createDocuments :: FilePath -> IO [Document]
|
||||
createDocuments path = do
|
||||
files <- drop 2 <$> getDirectoryContents path
|
||||
let ratings = map (subtract 1 . read . take 1 . last . splitOn "_") files :: [Int]
|
||||
contents <- mapM (forceReadFile . (path ++)) files
|
||||
return $ zipWith Document contents ratings
|
||||
|
||||
forceReadFile :: FilePath -> IO String
|
||||
forceReadFile file = do
|
||||
handle <- openFile file ReadMode
|
||||
content <- hGetContents handle
|
||||
content `deepseq` hClose handle
|
||||
return content
|
1
examples/sentiment-analysis-data
Symbolic link
1
examples/sentiment-analysis-data
Symbolic link
@ -0,0 +1 @@
|
||||
../../sibe-repos/sentiment-analysis-data
|
@ -7,7 +7,7 @@ module Main where
|
||||
main = do
|
||||
let learning_rate = 0.5
|
||||
(iterations, epochs) = (2, 1000)
|
||||
a = (logistic, logistic')
|
||||
a = (sigmoid, sigmoid')
|
||||
rnetwork = randomNetwork 0 2 [(8, a)] (1, a) -- two inputs, 8 nodes in a single hidden layer, 1 output
|
||||
|
||||
inputs = [vector [0, 1], vector [1, 0], vector [1, 1], vector [0, 0]]
|
||||
|
Reference in New Issue
Block a user