feat(notmnist): notmnist example using SGD + learning rate decay
parent ace0a18653
commit b26347e19f
3	.gitmodules	vendored

@@ -1,3 +1,6 @@
 [submodule "examples/doc-classifier-data"]
 	path = examples/doc-classifier-data
 	url = git@github.com:mdibaiee/doc-classifier-data
+[submodule "examples/notMNIST"]
+	path = examples/notMNIST
+	url = git@github.com:mdibaiee/notMNIST
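The notMNIST images are pulled in as a git submodule rather than committed directly, so a plain clone will have an empty examples/notMNIST directory. A minimal sketch of fetching the data in an existing clone (standard git usage, not part of this diff):

    git submodule update --init examples/notMNIST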
49	examples/424encoder.hs	Normal file

@@ -0,0 +1,49 @@
+module Main where
+
+import Sibe
+import Numeric.LinearAlgebra
+import Data.List
+import Debug.Trace
+import Data.Default.Class
+
+main = do
+  let alpha = 0.5
+      epochs = 1000
+      a = (sigmoid, sigmoid')
+      rnetwork = randomNetwork 0 (-0.1, 0.1) 4 [(2, a)] (4, a)
+
+      inputs = [vector [1, 0, 0, 0],
+                vector [0, 1, 0, 0],
+                vector [0, 0, 1, 0],
+                vector [0, 0, 0, 1]]
+
+      labels = [vector [1, 0, 0, 0],
+                vector [0, 1, 0, 0],
+                vector [0, 0, 1, 0],
+                vector [0, 0, 0, 1]]
+
+      session = def { network      = rnetwork
+                    , learningRate = 0.5
+                    , epochs       = 1000
+                    , training     = zip inputs labels
+                    , test         = zip inputs labels
+                    } :: Session
+
+  let initialCost = crossEntropy session
+
+  newsession <- run gd session
+
+  let results = map (`forward` newsession) inputs
+      rounded = map (map round . toList) results
+
+      cost = crossEntropy newsession
+
+  putStrLn "parameters: "
+  putStrLn $ "- inputs: " ++ show inputs
+  putStrLn $ "- labels: " ++ show labels
+  putStrLn $ "- learning rate: " ++ show alpha
+  putStrLn $ "- epochs: " ++ show epochs
+  putStrLn $ "- initial cost (cross-entropy): " ++ show initialCost
+  putStrLn "results: "
+  putStrLn $ "- actual result: " ++ show results
+  putStrLn $ "- rounded result: " ++ show rounded
+  putStrLn $ "- cost (cross-entropy): " ++ show cost
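The 4-2-4 encoder is a classic sanity check: four one-hot inputs must be squeezed through a two-unit hidden layer and reconstructed at the output, which only works if the hidden layer learns a compact (roughly binary) code for the four patterns. After training, `rounded` should reproduce the identity mapping, i.e. `[[1,0,0,0],[0,1,0,0],[0,0,1,0],[0,0,0,1]]`.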
examples/naivebayes-doc-classifier.hs

@@ -1,6 +1,7 @@
 module Main
   where
 -- import Sibe
+import Sibe.NLP
 import Sibe.NaiveBayes
 import Text.Printf
 import Data.List
@@ -28,14 +29,14 @@ module Main
       documents = cleanDocuments . removeWords sws $ createDocuments classes dataset
       testDocuments = cleanDocuments $ createDocuments classes test
 
-      nb = train documents intClasses
+      nb = initialize documents intClasses
 
       -- top-ten
       topClasses = take 10 . reverse $ sortBy (compare `on` (length . snd)) (cd nb)
       filtered = map (\(c, ds) -> (c, take 100 ds)) topClasses
      filteredClasses = map fst filtered
       ttDocs = concatMap snd filtered
-      ttNB = train ttDocs filteredClasses
+      ttNB = initialize ttDocs filteredClasses
 
       ttTestDocuments = filter ((`elem` filteredClasses) . c) . cleanDocuments $ createDocuments classes test
 
examples/naivebayes-sentiment-analysis.hs (deleted)

@@ -1,54 +0,0 @@
-{-# LANGUAGE BangPatterns #-}
-module Main
-  where
--- import Sibe
-import Sibe.NaiveBayes
-import Text.Printf
-import Data.List
-import Data.Maybe
-import Debug.Trace
-import Data.List.Split
-import System.Directory
-import Control.DeepSeq
-import System.IO
-
-main = do
-  putStr "Reading documents... "
-  neg_documents <- createDocuments "examples/sentiment-analysis-data/train/neg/"
-  pos_documents <- createDocuments "examples/sentiment-analysis-data/train/pos/"
-
-  test_neg_documents <- createDocuments "examples/sentiment-analysis-data/test/neg/"
-  test_pos_documents <- createDocuments "examples/sentiment-analysis-data/test/pos/"
-  putStrLn "done"
-
-  let classes = [0..9] -- rating, from 0 to 9 (1 to 10)
-      documents = neg_documents ++ pos_documents
-      nb = train documents classes
-
-      testDocuments = neg_documents ++ pos_documents
-
-      results = map (\(Document text c) -> (c, run text nb)) testDocuments
-      -- results = map (\(Document text c) -> (c, determine text nb intClasses documents)) devTestDocuments
-  print results
-
-  -- let showResults (c, r) = putStrLn (show (classes !! c) ++ " ~ " ++ show (classes !! r))
-  -- mapM_ showResults results
-  --
-  -- putStrLn $ "Recall: " ++ show (recall results)
-  -- putStrLn $ "Precision: " ++ show (precision results)
-  -- putStrLn $ "F Measure: " ++ show (fmeasure results)
-  -- putStrLn $ "Accuracy: " ++ show (accuracy results)
-
-createDocuments :: FilePath -> IO [Document]
-createDocuments path = do
-  files <- drop 2 <$> getDirectoryContents path
-  let ratings = map (subtract 1 . read . take 1 . last . splitOn "_") files :: [Int]
-  contents <- mapM (forceReadFile . (path ++)) files
-  return $ zipWith Document contents ratings
-
-forceReadFile :: FilePath -> IO String
-forceReadFile file = do
-  handle <- openFile file ReadMode
-  content <- hGetContents handle
-  content `deepseq` hClose handle
-  return content
1	examples/notMNIST	Submodule

@@ -0,0 +1 @@
+Subproject commit 0dbdfd43ffb8e90a3657ed040fd1fb3d25654b51
111	examples/notmnist.hs	Normal file

@@ -0,0 +1,111 @@
+{-# LANGUAGE RecordWildCards #-}
+{-# LANGUAGE FlexibleContexts #-}
+{-# LANGUAGE ScopedTypeVariables #-}
+
+module Main where
+
+import Sibe
+import Numeric.LinearAlgebra
+import Data.List
+import Debug.Trace
+import System.IO
+import System.Directory
+import Codec.Picture
+import Codec.Picture.Types
+import qualified Data.Vector.Storable as V
+import Data.Either
+import System.Random
+import System.Random.Shuffle
+import Data.Default.Class
+
+import qualified Graphics.Rendering.Chart.Easy as Chart
+import Graphics.Rendering.Chart.Backend.Cairo
+
+main = do
+  setStdGen (mkStdGen 100)
+
+  let a = (sigmoid, sigmoid')
+      o = (softmax, one)
+      rnetwork = randomNetwork 0 (-1, 1) (28*28) [(100, a)] (10, a)
+
+  (inputs, labels) <- dataset
+
+  let trp = length inputs * 70 `div` 100
+      tep = length inputs * 30 `div` 100
+
+      -- training data
+      trinputs = take trp inputs
+      trlabels = take trp labels
+
+      -- test data
+      teinputs = take tep . drop trp $ inputs
+      telabels = take tep . drop trp $ labels
+
+  let session = def { learningRate = 0.5
+                    , batchSize = 32
+                    , epochs = 35
+                    , network = rnetwork
+                    , training = zip trinputs trlabels
+                    , test = zip teinputs telabels
+                    } :: Session
+
+  let initialCost = crossEntropy session
+
+  newsession <- run (sgd . learningRateDecay (1.1, 5e-2)) session
+
+  let el = map (\(e, l, _) -> (e, l)) (chart newsession)
+      ea = map (\(e, _, a) -> (e, a)) (chart newsession)
+  toFile Chart.def "notmnist.png" $ do
+    Chart.layoutlr_title Chart..= "loss over time"
+    Chart.plotLeft (Chart.line "loss" [el])
+    Chart.plotRight (Chart.line "learningRate" [ea])
+
+  let cost = crossEntropy newsession
+
+  putStrLn "parameters: "
+  putStrLn $ "- batch size: " ++ show (batchSize session)
+  putStrLn $ "- learning rate: " ++ show (learningRate session)
+  putStrLn $ "- epochs: " ++ show (epochs session)
+  putStrLn $ "- initial cost (cross-entropy): " ++ show initialCost
+  putStrLn "results: "
+  putStrLn $ "- accuracy: " ++ show (accuracy newsession)
+  putStrLn $ "- cost (cross-entropy): " ++ show cost
+
+dataset :: IO ([Vector Double], [Vector Double])
+dataset = do
+  let dir = "examples/notMNIST/"
+
+  groups <- filter ((/= '.') . head) <$> listDirectory dir
+
+  inputFiles <- mapM (listDirectory . (dir ++)) groups
+
+  let n = 512 {-- minimum (map length inputFiles) --}
+      numbers = map (`div` n) [0..n * length groups - 1]
+      inputFilesFull = map (\(i, g) -> map ((dir ++ i ++ "/") ++) g) (zip groups inputFiles)
+
+  inputImages <- mapM (mapM readImage . take n) inputFilesFull
+
+  let names = map (take n) inputFilesFull
+
+  let (l, r) = partitionEithers $ concat inputImages
+      inputs = map (fromPixels . convertRGB8) r
+      labels = map (\i -> V.replicate i 0 `V.snoc` 1 V.++ V.replicate (9 - i) 0) numbers
+
+      pairs = zip inputs labels
+
+  shuffled <- shuffleM pairs
+  return (map fst shuffled, map snd shuffled)
+
+  where
+    fromPixels :: Image PixelRGB8 -> Vector Double
+    fromPixels img@Image { .. } =
+      let pairs = [(x, y) | x <- [0..imageWidth - 1], y <- [0..imageHeight - 1]]
+      in V.fromList $ map iter pairs
+      where
+        iter (x, y) =
+          let (PixelRGB8 r g b) = convertPixel $ pixelAt img x y
+          in
+            if r == 0 && g == 0 && b == 0 then 0 else 1
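Two details worth noting in this example. The `labels` expression in `dataset` builds a one-hot encoding: a backticked `V.snoc` binds tighter than `V.++`, so for class index 2 (of 10)

    V.replicate 2 0 `V.snoc` 1 V.++ V.replicate 7 0   -- fromList [0,0,1,0,0,0,0,0,0,0]

And the trainer passed to `run` is the composition `sgd . learningRateDecay (1.1, 5e-2)`, so each epoch first divides the learning rate by 1.1 (never dropping below 0.05) and then performs one SGD pass over the mini-batches.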
0	examples/recurrent-doc-classifier.hs	Normal file (empty)

examples/sentiment-analysis-data (deleted; an XSym symlink to the local dataset)

@@ -1,5 +0,0 @@
-XSym
-0040
-3666c4cacaf995ebd11ef25aab70de99
-../../sibe-repos/sentiment-analysis-data
-
35	examples/sin.hs	Normal file

@@ -0,0 +1,35 @@
+module Main where
+
+import Sibe
+import Numeric.LinearAlgebra
+import Data.List
+import Debug.Trace
+
+main = do
+  let alpha = 0.5
+      epochs = 1000
+      a = (sigmoid, sigmoid')
+      lo = (sigmoid, (\_ -> 1)) -- cross entropy
+      -- a = (relu, relu')
+      rnetwork = randomNetwork 0 (-1, 1) 1 [(50, a)] (1, lo)
+
+      inputs = map (\a -> vector [a]) (reverse [0, 30, 45, 60, 90])
+      labels = map (\deg -> vector $ [sin $ deg * pi/180]) (reverse [0, 30, 45, 60, 90])
+
+      initial_cost = zipWith crossEntropy (map (`forward` rnetwork) inputs) labels
+
+  network <- run session inputs rnetwork labels alpha epochs
+
+  let results = map (`forward` network) inputs
+      rounded = map (map round . toList) results
+
+      cost = zipWith crossEntropy (map (`forward` network) inputs) labels
+
+  putStrLn "parameters: "
+  putStrLn $ "- inputs: " ++ show inputs
+  putStrLn $ "- labels: " ++ show labels
+  putStrLn $ "- learning rate: " ++ show alpha
+  putStrLn $ "- epochs: " ++ show epochs
+  {-putStrLn $ "- initial cost (cross-entropy): " ++ show initial_cost-}
+  putStrLn "results: "
+  putStrLn $ "- actual result: " ++ show results
+  {-putStrLn $ "- cost (cross-entropy): " ++ show cost-}
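Note that sin.hs still calls the pre-Session API (`run session inputs rnetwork labels alpha epochs` and a two-argument `crossEntropy`), which no longer matches the signatures introduced in this commit; consistently, the `example-sin` executable is left commented out in sibe.cabal below. The choice of sigmoid output suits the task here, since sin of 0, 30, 45, 60 and 90 degrees all lie in [0, 1].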
examples/xor.hs

@@ -3,30 +3,37 @@ module Main where
 import Numeric.LinearAlgebra
 import Data.List
 import Debug.Trace
+import Data.Default.Class
+
 main = do
-  let learning_rate = 0.5
-      (iterations, epochs) = (2, 1000)
-      a = (sigmoid, sigmoid')
-      rnetwork = randomNetwork 0 2 [(8, a)] (1, a) -- two inputs, 8 nodes in a single hidden layer, 1 output
+  let a = (sigmoid, sigmoid')
+      rnetwork = randomNetwork 0 (-1, 1) 2 [(2, a)] (1, a) -- two inputs, 2 nodes in a single hidden layer, 1 output
 
       inputs = [vector [0, 1], vector [1, 0], vector [1, 1], vector [0, 0]]
       labels = [vector [1], vector [1], vector [0], vector [0]]
 
-      initial_cost = zipWith crossEntropy (map (`forward` rnetwork) inputs) labels
-
-      network = session inputs rnetwork labels learning_rate (iterations, epochs)
-      results = map (`forward` network) inputs
+      session = def { network = rnetwork
+                    , learningRate = 0.5
+                    , epochs = 1000
+                    , training = zip inputs labels
+                    , test = zip inputs labels
+                    } :: Session
+
+      initialCost = crossEntropy session
+
+  newsession <- run gd session
+
+  let results = map (`forward` newsession) inputs
       rounded = map (map round . toList) results
 
-      cost = zipWith crossEntropy (map (`forward` network) inputs) labels
+      cost = crossEntropy newsession
 
   putStrLn "parameters: "
   putStrLn $ "- inputs: " ++ show inputs
   putStrLn $ "- labels: " ++ show labels
-  putStrLn $ "- learning rate: " ++ show learning_rate
-  putStrLn $ "- iterations/epochs: " ++ show (iterations, epochs)
-  putStrLn $ "- initial cost (cross-entropy): " ++ show initial_cost
+  putStrLn $ "- learning rate: " ++ show (learningRate session)
+  putStrLn $ "- epochs: " ++ show (epochs session)
+  putStrLn $ "- initial cost (cross-entropy): " ++ show initialCost
 
   putStrLn "results: "
   putStrLn $ "- actual result: " ++ show results
   putStrLn $ "- rounded result: " ++ show rounded
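Once trained on XOR, the rounded outputs should match the labels, i.e. `[[1],[1],[0],[0]]` for inputs `[0,1]`, `[1,0]`, `[1,1]`, `[0,0]`.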
BIN	notmnist-0.png	Normal file (binary not shown; 33 KiB)
BIN	notmnist-1.png	Normal file (binary not shown; 26 KiB)
BIN	notmnist-2.png	Normal file (binary not shown; 25 KiB)
BIN	notmnist.png	Normal file (binary not shown; 24 KiB)
73	sibe.cabal

@@ -15,7 +15,7 @@ cabal-version: >=1.10
 library
   hs-source-dirs:      src
-  exposed-modules:     Sibe, Sibe.NaiveBayes
+  exposed-modules:     Sibe, Sibe.NaiveBayes, Sibe.NLP
   build-depends:       base >= 4.7 && < 5
                      , hmatrix
                      , random
@@ -26,16 +26,21 @@ library
                      , regex-pcre
                      , text
                      , stemmer
+                     , vector
+                     , random-shuffle
+                     , data-default-class
+                     , Chart
+                     , Chart-cairo
   default-language:    Haskell2010
 
-executable sibe-exe
-  hs-source-dirs:      app
-  main-is:             Main.hs
-  ghc-options:         -threaded -rtsopts -with-rtsopts=-N
-  build-depends:       base
-                     , sibe
-                     , hmatrix
-  default-language:    Haskell2010
+--executable sibe-exe
+  --hs-source-dirs:      app
+  --main-is:             Main.hs
+  --ghc-options:         -threaded -rtsopts -with-rtsopts=-N
+  --build-depends:       base
+                     --, sibe
+                     --, hmatrix
+  --default-language:    Haskell2010
 
 executable example-xor
   hs-source-dirs:      examples
@@ -44,6 +49,43 @@ executable example-xor
   build-depends:       base
                      , sibe
                      , hmatrix
+                     , data-default-class
+  default-language:    Haskell2010
+
+--executable example-sin
+  --hs-source-dirs:      examples
+  --main-is:             sin.hs
+  --ghc-options:         -threaded -rtsopts -with-rtsopts=-N
+  --build-depends:       base
+                       --, sibe
+                       --, hmatrix
+  --default-language:    Haskell2010
+
+executable example-424
+  hs-source-dirs:      examples
+  main-is:             424encoder.hs
+  ghc-options:         -threaded -rtsopts -with-rtsopts=-N
+  build-depends:       base
+                     , sibe
+                     , hmatrix
+                     , data-default-class
+  default-language:    Haskell2010
+
+executable example-notmnist
+  hs-source-dirs:      examples
+  main-is:             notmnist.hs
+  ghc-options:         -threaded -rtsopts -with-rtsopts=-N
+  build-depends:       base
+                     , sibe
+                     , hmatrix
+                     , directory >= 1.2.5.0
+                     , JuicyPixels == 3.2.7.2
+                     , vector == 0.11.0.0
+                     , random
+                     , random-shuffle
+                     , data-default-class
+                     , Chart
+                     , Chart-cairo
   default-language:    Haskell2010
 
 executable example-naivebayes-doc-classifier
@@ -57,19 +99,6 @@ executable example-naivebayes-doc-classifier
                      , split
   default-language:    Haskell2010
 
-executable example-naivebayes-sentiment-analysis
-  hs-source-dirs:      examples
-  main-is:             naivebayes-sentiment-analysis.hs
-  ghc-options:         -threaded -rtsopts -with-rtsopts=-N
-  build-depends:       base
-                     , sibe
-                     , hmatrix
-                     , containers
-                     , split
-                     , directory
-                     , deepseq
-  default-language:    Haskell2010
-
 test-suite sibe-test
   type:                exitcode-stdio-1.0
   hs-source-dirs:      test
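With these stanzas in place, the new examples build as ordinary stack targets; e.g. (standard stack usage, assuming the submodule data is checked out):

    stack build
    stack exec example-notmnist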
258	src/Sibe.hs

@@ -15,22 +15,37 @@ module Sibe
   saveNetwork,
   loadNetwork,
   train,
-  session,
-  shuffle,
+  gd,
+  sgd,
+  run,
   sigmoid,
   sigmoid',
+  softmax,
+  softmax',
+  one,
   relu,
   relu',
   crossEntropy,
   genSeed,
-  replaceVector
+  replaceVector,
+  Session(..),
+  accuracy,
+  learningRateDecay
 ) where
 import Numeric.LinearAlgebra
 import System.Random
+import System.Random.Shuffle
 import Debug.Trace
-import Data.List (foldl', sortBy)
+import Data.List (foldl', sortBy, genericLength, permutations)
 import System.IO
 import Control.DeepSeq
+import Control.Monad
+import qualified Data.Vector.Storable as V
+import Data.Default.Class
+import System.Exit
+
+import qualified Graphics.Rendering.Chart.Easy as Chart
+import Graphics.Rendering.Chart.Backend.Cairo
+
 type LearningRate = Double
 type Input = Vector Double
@@ -48,8 +63,33 @@ module Sibe
 data Network = O Layer
              | Layer :- Network
              deriving (Show)
 
 infixr 5 :-
 
+data Session = Session { network :: Network
+                       , training :: [(Vector Double, Vector Double)]
+                       , test :: [(Vector Double, Vector Double)]
+                       , learningRate :: Double
+                       , epochs :: Int
+                       , epoch :: Int
+                       , batchSize :: Int
+                       , chart :: [(Int, Double, Double)]
+                       , momentum :: Double
+                       }
+
+emptyNetwork = randomNetwork 0 (0, 0) 0 [] (0, (id, id))
+instance Default Session where
+  def = Session { network = seq (die "You have not specified a network parameter") emptyNetwork
+                , training = seq (die "You have not specified training data") []
+                , test = seq (die "You have not specified test data") []
+                , learningRate = 0.5
+                , epochs = 35
+                , epoch = 0
+                , batchSize = 0
+                , chart = []
+                , momentum = 0
+                }
+
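For orientation, a minimal sketch of how this record is meant to be used, with hypothetical `net` and `pairs` standing in for a real network and dataset:

    -- a minimal sketch of the new Session API; `net` and
    -- `pairs :: [(Vector Double, Vector Double)]` are placeholders
    demo = do
      let sess = def { network = net
                     , training = pairs
                     , test = pairs
                     } :: Session
      trained <- run gd sess
      print (accuracy trained, crossEntropy trained)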
 saveNetwork :: Network -> String -> IO ()
 saveNetwork network file =
   writeFile file ((show . reverse) (gen network []))
@@ -73,22 +113,24 @@ module Sibe
 runLayer :: Input -> Layer -> Output
 runLayer input (Layer !biases !weights _) = input <# weights + biases
 
-forward :: Input -> Network -> Output
-forward input (O l@(Layer _ _ (fn, _))) = fn $ runLayer input l
-forward input (l@(Layer _ _ (fn, _)) :- n) = forward ((fst . activation $ l) $ runLayer input l) n
+forward :: Input -> Session -> Output
+forward input session = compute input (network session)
+  where
+    compute input (O l@(Layer _ _ (fn, _))) = fn $ runLayer input l
+    compute input (l@(Layer _ _ (fn, _)) :- n) = compute ((fst . activation $ l) $ runLayer input l) n
 
-randomLayer :: Seed -> (Int, Int) -> Activation -> Layer
-randomLayer seed (wr, wc) =
-  let weights = uniformSample seed wr $ replicate wc (-1, 1)
-      biases = randomVector seed Uniform wc * 2 - 1
+randomLayer :: Seed -> (Int, Int) -> (Double, Double) -> Activation -> Layer
+randomLayer seed (wr, wc) (l, u) =
+  let weights = uniformSample seed wr $ replicate wc (l, u)
+      biases = randomVector seed Uniform wc * realToFrac u - realToFrac l
   in Layer biases weights
 
-randomNetwork :: Seed -> Int -> [(Int, Activation)] -> (Int, Activation) -> Network
-randomNetwork seed input [] (output, a) =
-  O $ randomLayer seed (input, output) a
-randomNetwork seed input ((h, a):hs) output =
-  randomLayer seed (input, h) a :-
-  randomNetwork (seed + 1) h hs output
+randomNetwork :: Seed -> (Double, Double) -> Int -> [(Int, Activation)] -> (Int, Activation) -> Network
+randomNetwork seed bound input [] (output, a) =
+  O $ randomLayer seed (input, output) bound a
+randomNetwork seed bound input ((h, a):hs) output =
+  randomLayer seed (input, h) bound a :-
+  randomNetwork (seed + 1) bound h hs output
 
 sigmoid :: Vector Double -> Vector Double
 sigmoid x = 1 / max (1 + exp (-x)) 1e-10
@@ -96,18 +138,37 @@ module Sibe
 sigmoid' :: Vector Double -> Vector Double
 sigmoid' x = sigmoid x * (1 - sigmoid x)
 
+softmax :: Vector Double -> Vector Double
+softmax x = cmap (\a -> exp a / s) x
+  where
+    s = V.sum $ exp x
+
+one :: a -> Double
+one x = 1
+
+softmax' :: Vector Double -> Vector Double
+softmax' x = softmax x * (1 - softmax x)
+
 relu :: Vector Double -> Vector Double
-relu x = log (max (1 + exp x) 1e-10)
+relu = cmap (max 0.1)
 
 relu' :: Vector Double -> Vector Double
-relu' = sigmoid
+relu' = cmap dev
+  where dev x
+          | x < 0 = 0
+          | otherwise = 1
 
-crossEntropy :: Output -> Output -> Double
-crossEntropy output target =
-  let pairs = zip (toList output) (toList target)
-      n = fromIntegral (length pairs)
-  in (-1 / n) * sum (map f pairs)
+crossEntropy :: Session -> Double
+crossEntropy session =
+  let inputs = map fst (test session)
+      labels = map (toList . snd) (test session)
+      outputs = map (toList . (`forward` session)) inputs
+      pairs = zip outputs labels
+      n = genericLength pairs
+  in sum (map set pairs) / n
   where
+    set (os, ls) = (-1 / genericLength os) * sum (zipWith (curry f) os ls)
     f (a, y) = y * log (max 1e-10 a) + (1 - y) * log (max (1 - a) 1e-10)
 
 train :: Input
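Written out, the new `crossEntropy` evaluates the network on every test pair and averages the per-sample binary cross-entropy, `-(1/m) * sum_j (y_j * log a_j + (1 - y_j) * log (1 - a_j))`, with the log arguments clamped at 1e-10 to avoid `log 0`.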
@@ -138,35 +199,137 @@ module Sibe
           o = fn y
           (n', delta) = run o n
 
-          de = delta * fn' y -- quadratic cost
+          de = delta * fn' y
 
-          biases' = biases - scale alpha de
-          weights' = weights - scale alpha (input `outer` de)
+          biases' = biases - cmap (*alpha) de
+          weights' = weights - cmap (*alpha) (input `outer` de)
           layer = Layer biases' weights' (fn, fn')
 
           pass = weights #> de
           -- pass = weights #> de
       in (layer :- n', pass)
 
-session :: [Input] -> Network -> [Output] -> Double -> (Int, Int) -> Network
-session inputs network labels alpha (iterations, epochs) =
-  let n = length inputs
-      indexes = shuffle n (map (`mod` n) [0..n * epochs])
-  in foldl' iter network indexes
-  where
-    iter net i =
-      let n = length inputs
-          index = i `mod` n
-          input = inputs !! index
-          label = labels !! index
-      in foldl' (\net _ -> train input net label alpha) net [0..iterations]
-
-shuffle :: Seed -> [a] -> [a]
-shuffle seed list =
-  let ords = map ord $ take (length list) (randomRs (0, 1) (mkStdGen seed) :: [Int])
-  in map snd $ sortBy (\x y -> fst x) (zip ords list)
-  where ord x | x == 0 = LT
-              | x == 1 = GT
+{-trainMomentum :: Input
+  -> Network
+  -> Output -- target
+  -> Double -- learning rate
+  -> (Double, Double) -- momentum
+  -> Network -- network's output
+trainMomentum input network target alpha (m, v) = fst $ run input network
+  where
+    run :: Input -> Network -> (Network, Vector Double)
+    run input (O l@(Layer biases weights (fn, fn'))) =
+      let y = runLayer input l
+          o = fn y
+          delta = o - target
+          de = delta * fn' y
+          v =
+          -- de = delta -- cross entropy cost
+
+          biases' = biases - scale alpha de
+          weights' = weights - scale alpha (input `outer` de) -- small inputs learn slowly
+          layer = Layer biases' weights' (fn, fn') -- updated layer
+
+          pass = weights #> de
+          -- pass = weights #> de
+      in (O layer, pass)
+    run input (l@(Layer biases weights (fn, fn')) :- n) =
+      let y = runLayer input l
+          o = fn y
+          (n', delta) = run o n
+
+          de = delta * fn' y
+
+          biases' = biases - cmap (*alpha) de
+          weights' = weights - cmap (*alpha) (input `outer` de)
+          layer = Layer biases' weights' (fn, fn')
+
+          pass = weights #> de
+          -- pass = weights #> de
+      in (layer :- n', pass)-}
+
+gd :: Session -> IO Session
+gd session = do
+  seed <- newStdGen
+
+  let pairs = training session
+      alpha = learningRate session
+      net = network session
+
+  let n = length pairs
+
+  shuffled <- shuffleM pairs
+
+  let newnet = foldl' (\n (input, label) -> train input n label alpha) net pairs
+
+  return session { network = newnet
+                 , epoch = epoch session + 1
+                 }
+
+sgd :: Session -> IO Session
+sgd session = do
+  seed <- newStdGen
+
+  let pairs = training session
+      bsize = batchSize session
+      alpha = learningRate session
+      net = network session
+
+  let n = length pairs
+      iterations = n `div` bsize - 1
+
+  shuffled <- shuffleM pairs
+
+  let iter net i =
+        let n = length pairs
+            batch = take bsize . drop (i * bsize) $ shuffled
+            batchInputs = map fst batch
+            batchLabels = map snd batch
+            batchPair = zip batchInputs batchLabels
+        in foldl' (\n (input, label) -> train input n label alpha) net batchPair
+
+  let newnet = foldl' iter net [0..iterations]
+      cost = crossEntropy (session { network = newnet })
+
+  let el = map (\(e, l, _) -> (e, l)) (chart session)
+      ea = map (\(e, _, a) -> (e, a)) (chart session)
+
+  putStrLn $ (show $ epoch session) ++ " => " ++ (show cost) ++ " @ " ++ (show $ learningRate session)
+
+  toFile Chart.def "sgd.png" $ do
+    Chart.layoutlr_title Chart..= "loss over time"
+    Chart.plotLeft (Chart.line "loss" [el])
+    Chart.plotRight (Chart.line "learningRate" [ea])
+
+  return session { network = newnet
+                 , epoch = epoch session + 1
+                 , chart = (epoch session, cost, learningRate session):chart session
+                 }
+
+accuracy :: Session -> Double
+accuracy session =
+  let inputs = map fst (test session)
+      labels = map snd (test session)
+
+      results = map (`forward` session) inputs
+      rounded = map (map round . toList) results
+
+      equals = zipWith (==) rounded (map (map round . toList) labels)
+  in genericLength (filter (== True) equals) / genericLength inputs
+
+learningRateDecay :: (Double, Double) -> Session -> Session
+learningRateDecay (step, m) session =
+  session { learningRate = max m $ learningRate session / step }
+
+run :: (Session -> IO Session)
+    -> Session -> IO Session
+run fn session = foldM (\s i -> fn s) session [0..epochs session]
+
+factorial :: Int -> Int
+factorial 0 = 1
+factorial x = x * factorial (x - 1)
+
 genSeed :: IO Seed
 genSeed = do
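To make `learningRateDecay` concrete: with `(step, m) = (1.1, 5e-2)` as in the notMNIST example and an initial rate of 0.5, the rate after k epochs is `max 0.05 (0.5 / 1.1^k)`, roughly 0.19 after 10 epochs and hitting the 0.05 floor around epoch 24, since `run` re-applies the supplied step function once per element of `[0..epochs session]`.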
@@ -176,12 +339,7 @@ module Sibe
 replaceVector :: Vector Double -> Int -> Double -> Vector Double
 replaceVector vec index value =
   let list = toList vec
-  in fromList $ rrow index list
-  where
-    rrow index [] = []
-    rrow index (x:xs)
-      | index == index = value:xs
-      | otherwise = x : rrow (index + 1) xs
+  in fromList $ take index list ++ value : drop (index + 1) list
 
 clip :: Double -> (Double, Double) -> Double
 clip x (l, u) = min u (max l x)
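This rewrite also fixes the old `rrow`, whose `index == index` guard was always true and so always replaced the first element. The new version is a plain splice; e.g. `replaceVector (vector [1, 2, 3]) 1 9 == vector [1, 9, 3]`.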
0	src/Sibe/LogisticRegression.hs	Normal file (empty)

129	src/Sibe/NLP.hs	Normal file
@@ -0,0 +1,129 @@
+module Sibe.NLP
+  (Class,
+   Document(..),
+   ordNub,
+   accuracy,
+   recall,
+   precision,
+   fmeasure,
+   cleanText,
+   cleanDocuments,
+   removeWords,
+   removeStopwords,
+   ngram,
+   ngramText,
+  )
+  where
+
+import Data.List
+import Debug.Trace
+import qualified Data.Set as Set
+import Data.List.Split
+import Data.Maybe
+import Control.Arrow ((&&&))
+import Text.Regex.PCRE
+import Data.Char (isSpace, isNumber, toLower)
+import NLP.Stemmer
+
+type Class = Int;
+
+data Document = Document { text :: String
+                         , c :: Class
+                         } deriving (Eq, Show, Read)
+
+cleanText :: String -> String
+cleanText string =
+  let puncs = filter (`notElem` ['!', '"', '#', '$', '%', '(', ')', '.', '?']) (trim string)
+      spacify = foldl (\acc x -> replace x ' ' acc) puncs [',', '/', '-', '\n', '\r']
+      stemmed = unwords $ map (stem Porter) (words spacify)
+      nonumber = filter (not . isNumber) stemmed
+      lower = map toLower nonumber
+  in (unwords . words) lower -- remove unnecessary spaces
+  where
+    trim = f . f
+      where
+        f = reverse . dropWhile isSpace
+    replace needle replacement =
+      map (\c -> if c == needle then replacement else c)
+
+cleanDocuments :: [Document] -> [Document]
+cleanDocuments documents =
+  let cleaned = map (\(Document text c) -> Document (cleanText text) c) documents
+  in cleaned
+
+removeWords :: [String] -> [Document] -> [Document]
+removeWords ws documents =
+  map (\(Document text c) -> Document (rm ws text) c) documents
+  where
+    rm list text =
+      unwords $ filter (`notElem` list) (words text)
+
+removeStopwords :: Int -> [Document] -> [Document]
+removeStopwords i documents =
+  let wc = wordCounts (concatDocs documents)
+      wlist = sortBy (\(_, a) (_, b) -> b `compare` a) wc
+      stopwords = map fst (take i wlist)
+  in removeWords stopwords documents
+  where
+    vocabulary x = ordNub (words x)
+    countWordInDoc d w = genericLength (filter (==w) d)
+    wordCounts x =
+      let voc = vocabulary x
+      in zip voc $ map (countWordInDoc (words x)) voc
+
+    concatDocs = concatMap (\(Document text _) -> text ++ " ")
+
+ordNub :: (Ord a) => [a] -> [a]
+ordNub = go Set.empty
+  where
+    go _ [] = []
+    go s (x:xs) = if x `Set.member` s then go s xs
+                  else x : go (Set.insert x s) xs
+
+accuracy :: [(Int, (Int, Double))] -> Double
+accuracy results =
+  let pairs = map (\(a, b) -> (a, fst b)) results
+      correct = filter (uncurry (==)) pairs
+  in genericLength correct / genericLength results
+
+recall :: [(Int, (Int, Double))] -> Double
+recall results =
+  let classes = ordNub (map fst results)
+      s = sum (map rec classes) / genericLength classes
+  in s
+  where
+    rec a =
+      let t = genericLength $ filter (\(c, (r, _)) -> c == r && c == a) results
+          y = genericLength $ filter (\(c, (r, _)) -> c == a) results
+      in t / y
+
+precision :: [(Int, (Int, Double))] -> Double
+precision results =
+  let classes = ordNub (map fst results)
+      s = sum (map prec classes) / genericLength classes
+  in s
+  where
+    prec a =
+      let t = genericLength $ filter (\(c, (r, _)) -> c == r && c == a) results
+          y = genericLength $ filter (\(c, (r, _)) -> r == a) results
+      in
+        if y == 0
+        then 0
+        else t / y
+
+fmeasure :: [(Int, (Int, Double))] -> Double
+fmeasure results =
+  let r = recall results
+      p = precision results
+  in (2 * p * r) / (p + r)
+
+ngram :: Int -> [Document] -> [Document]
+ngram n documents =
+  map (\(Document text c) -> Document (ngramText n text) c) documents
+
+ngramText :: Int -> String -> String
+ngramText n text =
+  let ws = words text
+      pairs = zip [0..] ws
+      grams = map (\(i, w) -> concat . intersperse "_" $ w:((take (n - 1) . drop (i+1)) ws)) pairs
+  in unwords ("<b>_":grams)
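A quick worked example of `ngramText` for intuition: `ngramText 2 "the quick fox"` produces `"<b>_ the_quick quick_fox fox"`; each word is glued to the following n - 1 words with underscores, and a `<b>_` boundary marker is prepended.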
src/Sibe/NaiveBayes.hs

@@ -1,7 +1,7 @@
 module Sibe.NaiveBayes
   (Document(..),
    NB(..),
-   train,
+   initialize,
    run,
    session,
    ordNub,
@@ -19,21 +19,13 @@ module Sibe.NaiveBayes
    removeStopwords,
   )
   where
+import Sibe.NLP
 import Data.List
 import Debug.Trace
 import qualified Data.Set as Set
 import Data.List.Split
 import Data.Maybe
 import Control.Arrow ((&&&))
-import Text.Regex.PCRE
-import Data.Char (isSpace, isNumber, toLower)
-import NLP.Stemmer
-
-type Class = Int;
-
-data Document = Document { text :: String
-                         , c :: Class
-                         } deriving (Eq, Show, Read)
 
 data NB = NB { documents :: [Document]
              , classes :: [(Class, Double)]
@@ -44,8 +36,8 @@ module Sibe.NaiveBayes
              , cgram :: [(Class, [(String, Int)])]
              } deriving (Eq, Show, Read)
 
-train :: [Document] -> [Class] -> NB
-train documents classes =
+initialize :: [Document] -> [Class] -> NB
+initialize documents classes =
   let megadoc = concatDocs documents
       vocabulary = genericLength ((ordNub . words) megadoc)
       -- (class, prior probability)
@@ -83,17 +75,6 @@ module Sibe.NaiveBayes
     classWordsCounts x = wordsCount (classWords x) (classVocabulary x)
     classNGramCounts x = wordsCount (classNGramWords x) (ordNub $ classNGramWords x)
 
-ngram :: Int -> [Document] -> [Document]
-ngram n documents =
-  map (\(Document text c) -> Document (ngramText n text) c) documents
-
-ngramText :: Int -> String -> String
-ngramText n text =
-  let ws = words text
-      pairs = zip [0..] ws
-      grams = map (\(i, w) -> concat . intersperse "_" $ w:((take (n - 1) . drop (i+1)) ws)) pairs
-  in unwords ("<b>_":grams)
-
 session :: [Document] -> NB -> [(Class, (Class, Double))]
 session docs nb =
   let results = map (\(Document text c) -> (c, run text nb)) docs
@@ -143,91 +124,5 @@ module Sibe.NaiveBayes
       variance = sum (map ((^2) . subtract avg) x) / (genericLength x - 1)
   in sqrt variance
 
-cleanText :: String -> String
-cleanText string =
-  let puncs = filter (`notElem` ['!', '"', '#', '$', '%', '(', ')', '.', '?']) (trim string)
-      spacify = foldl (\acc x -> replace x ' ' acc) puncs [',', '/', '-', '\n', '\r']
-      stemmed = unwords $ map (stem Porter) (words spacify)
-      nonumber = filter (not . isNumber) stemmed
-      lower = map toLower nonumber
-  in (unwords . words) lower -- remove unnecessary spaces
-  where
-    trim = f . f
-      where
-        f = reverse . dropWhile isSpace
-    replace needle replacement =
-      map (\c -> if c == needle then replacement else c)
-
-cleanDocuments :: [Document] -> [Document]
-cleanDocuments documents =
-  let cleaned = map (\(Document text c) -> Document (cleanText text) c) documents
-  in cleaned
-
-removeWords :: [String] -> [Document] -> [Document]
-removeWords ws documents =
-  map (\(Document text c) -> Document (rm ws text) c) documents
-  where
-    rm list text =
-      unwords $ filter (`notElem` list) (words text)
-
-removeStopwords :: Int -> [Document] -> [Document]
-removeStopwords i documents =
-  let wc = wordCounts (concatDocs documents)
-      wlist = sortBy (\(_, a) (_, b) -> b `compare` a) wc
-      stopwords = map fst (take i wlist)
-  in removeWords stopwords documents
-  where
-    vocabulary x = ordNub (words x)
-    countWordInDoc d w = genericLength (filter (==w) d)
-    wordCounts x =
-      let voc = vocabulary x
-      in zip voc $ map (countWordInDoc (words x)) voc
-
-    concatDocs = concatMap (\(Document text _) -> text ++ " ")
-
 l :: (Show a) => a -> a
 l a = trace (show a) a
 
-ordNub :: (Ord a) => [a] -> [a]
-ordNub = go Set.empty
-  where
-    go _ [] = []
-    go s (x:xs) = if x `Set.member` s then go s xs
-                  else x : go (Set.insert x s) xs
-
-accuracy :: [(Int, (Int, Double))] -> Double
-accuracy results =
-  let pairs = map (\(a, b) -> (a, fst b)) results
-      correct = filter (uncurry (==)) pairs
-  in genericLength correct / genericLength results
-
-recall :: [(Int, (Int, Double))] -> Double
-recall results =
-  let classes = ordNub (map fst results)
-      s = sum (map rec classes) / genericLength classes
-  in s
-  where
-    rec a =
-      let t = genericLength $ filter (\(c, (r, _)) -> c == r && c == a) results
-          y = genericLength $ filter (\(c, (r, _)) -> c == a) results
-      in t / y
-
-precision :: [(Int, (Int, Double))] -> Double
-precision results =
-  let classes = ordNub (map fst results)
-      s = sum (map prec classes) / genericLength classes
-  in s
-  where
-    prec a =
-      let t = genericLength $ filter (\(c, (r, _)) -> c == r && c == a) results
-          y = genericLength $ filter (\(c, (r, _)) -> r == a) results
-      in
-        if y == 0
-        then 0
-        else t / y
-
-fmeasure :: [(Int, (Int, Double))] -> Double
-fmeasure results =
-  let r = recall results
-      p = precision results
-  in (2 * p * r) / (p + r)
74	stack.yaml

@@ -1,40 +1,5 @@
-# This file was automatically generated by 'stack init'
-#
-# Some commonly used options have been documented as comments in this file.
-# For advanced use and comprehensive documentation of the format, please see:
-# http://docs.haskellstack.org/en/stable/yaml_configuration/
-
-# Resolver to choose a 'specific' stackage snapshot or a compiler version.
-# A snapshot resolver dictates the compiler version and the set of packages
-# to be used for project dependencies. For example:
-#
-# resolver: lts-3.5
-# resolver: nightly-2015-09-21
-# resolver: ghc-7.10.2
-# resolver: ghcjs-0.1.0_ghc-7.10.2
-# resolver:
-#  name: custom-snapshot
-#  location: "./custom-snapshot.yaml"
 resolver: lts-6.7
 
-# User packages to be built.
-# Various formats can be used as shown in the example below.
-#
-# packages:
-# - some-directory
-# - https://example.com/foo/bar/baz-0.0.2.tar.gz
-# - location:
-#    git: https://github.com/commercialhaskell/stack.git
-#    commit: e7b331f14bcffb8367cd58fbfc8b40ec7642100a
-# - location: https://github.com/commercialhaskell/stack/commit/e7b331f14bcffb8367cd58fbfc8b40ec7642100a
-#   extra-dep: true
-#  subdirs:
-#  - auto-update
-#  - wai
-#
-# A package marked 'extra-dep: true' will only be built if demanded by a
-# non-dependency (i.e. a user package), and its test suites and benchmarks
-# will not be run. This is useful for tweaking upstream packages.
 packages:
 - location:
     git: git@github.com:albertoruiz/hmatrix.git
@@ -42,36 +7,11 @@ packages:
   subdirs:
   - packages/base
 - .
-- http://hackage.haskell.org/package/containers-0.5.7.1/containers-0.5.7.1.tar.gz
-- http://hackage.haskell.org/package/text-1.2.2.1/text-1.2.2.1.tar.gz
-- http://hackage.haskell.org/package/stemmer-0.5.2/stemmer-0.5.2.tar.gz
 
-# Dependency packages to be pulled from upstream that are not in the resolver
-# (e.g., acme-missiles-0.3)
-extra-deps: []
+extra-deps:
+- directory-1.2.7.0
+- text-1.2.2.1
+- stemmer-0.5.2
+- containers-0.5.7.1
+- Chart-1.8
+- Chart-cairo-1.8
 
-# Override default flag values for local packages and extra-deps
-flags: {}
-
-# Extra package databases containing global packages
-extra-package-dbs: []
-
-# Control whether we use the GHC we find on the path
-# system-ghc: true
-#
-# Require a specific version of stack, using version ranges
-# require-stack-version: -any # Default
-# require-stack-version: ">=1.1"
-#
-# Override the architecture used by stack, especially useful on Windows
-# arch: i386
-# arch: x86_64
-#
-# Extra directories used by stack for building
-# extra-include-dirs: [/path/to/dir]
-# extra-lib-dirs: [/path/to/dir]
-#
-# Allow a newer minor version of GHC than the snapshot specifies
-# compiler-check: newer-minor
-
-system-ghc: false
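The dependency pins move from Hackage tarball URLs under `packages:` to ordinary `extra-deps`, and Chart/Chart-cairo 1.8 are pinned explicitly, presumably because those versions are not part of the lts-6.7 snapshot.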