feat(Numeric): move all modules to Numeric
src/Numeric/Sibe.hs (new file, 362 lines)
@@ -0,0 +1,362 @@
{-# LANGUAGE GADTs #-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE TypeOperators #-}

module Numeric.Sibe
  (Network(..),
   Layer(..),
   Input,
   Output,
   Activation,
   forward,
   forward',
   runLayer,
   runLayer',
   randomLayer,
   randomNetwork,
   buildNetwork,
   saveNetwork,
   loadNetwork,
   train,
   gd,
   sgd,
   run,
   sigmoid,
   sigmoid',
   softmax,
   softmax',
   relu,
   relu',
   crossEntropy,
   crossEntropy',
   genSeed,
   replaceVector,
   Session(..),
   accuracy,
   learningRateDecay,
   ignoreBiases,
   one
  ) where

import Numeric.LinearAlgebra
import System.Random
import System.Random.Shuffle
import Debug.Trace
import Data.List (foldl', sortBy, genericLength, permutations)
import System.IO
import Control.DeepSeq
import Control.Monad
import qualified Data.Vector.Storable as V
import Data.Default.Class
import System.Exit

import qualified Graphics.Rendering.Chart.Easy as Chart
import Graphics.Rendering.Chart.Backend.Cairo

type LearningRate = Double
type Input = Vector Double
type Output = Vector Double
type Activation = (Vector Double -> Vector Double, Vector Double -> Vector Double)

data Layer = Layer { biases :: !(Vector Double)
                   , nodes :: !(Matrix Double)
                   , activation :: Activation
                   }

instance Show Layer where
  show (Layer biases nodes _) = "(" ++ show biases ++ "," ++ show nodes ++ ")"

data Network = O Layer
             | Layer :- Network

instance Show Network where
  show (Layer biases nodes _ :- n) =
    (show . length $ toLists nodes) ++ "x" ++ (show . length . head . toLists $ nodes) ++ " " ++ (show . length . toList $ biases) ++ " :- " ++ show n
  show (O (Layer biases nodes _)) =
    (show . length $ toLists nodes) ++ "x" ++ (show . length . head . toLists $ nodes) ++ " " ++ (show . length . toList $ biases)

infixr 5 :-

data Session = Session { network :: Network
                       , training :: [(Vector Double, Vector Double)]
                       , test :: [(Vector Double, Vector Double)]
                       , learningRate :: Double
                       , epochs :: Int
                       , epoch :: Int
                       , batchSize :: Int
                       , chart :: [(Int, Double, Double)]
                       , drawChart :: Bool
                       , chartName :: String
                       , momentum :: Double
                       , debug :: Bool
                       } deriving (Show)

emptyNetwork = randomNetwork 0 (0, 0) 0 [] (0, (id, id))

instance Default Session where
  -- mandatory fields default to 'error' so that using 'def' without setting
  -- them fails loudly; forcing an IO action such as 'die' with 'seq' only
  -- evaluates the action to WHNF and never runs it
  def = Session { network = error "You have not specified a network parameter"
                , training = error "You have not specified training data"
                , test = error "You have not specified test data"
                , learningRate = 0.5
                , epochs = 35
                , epoch = 0
                , batchSize = 0
                , chart = []
                , drawChart = False
                , chartName = "chart.png"
                , momentum = 0
                , debug = False
                }
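
-- A minimal usage sketch (hypothetical XOR data, not part of this module):
-- build a Session from 'def', overriding the mandatory fields.
--
--   let xor = [ (vector [0, 0], vector [0])
--             , (vector [0, 1], vector [1])
--             , (vector [1, 0], vector [1])
--             , (vector [1, 1], vector [0])
--             ]
--       session = def { network = randomNetwork 0 (-1, 1) 2 [(8, (sigmoid, sigmoid'))] (1, (sigmoid, sigmoid'))
--                     , training = xor
--                     , test = xor
--                     , learningRate = 0.5
--                     , epochs = 1000
--                     }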

saveNetwork :: Network -> String -> IO ()
saveNetwork network file =
  writeFile file ((show . reverse) (gen network []))
  where
    gen (O (Layer biases nodes _)) list = (biases, nodes) : list
    gen (Layer biases nodes _ :- n) list = gen n $ (biases, nodes) : list

loadNetwork :: [Activation] -> String -> IO Network
loadNetwork activations file = do
  handle <- openFile file ReadMode
  content <- hGetContents handle
  let list = read content :: [(Vector Double, Matrix Double)]
      network = gen list activations
  content `deepseq` hClose handle
  return network

  where
    gen [(biases, nodes)] [a] = O (Layer biases nodes a)
    gen ((biases, nodes):hs) (a:as) = Layer biases nodes a :- gen hs as
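
-- Round-trip sketch (assumes a trained two-layer 'net' with sigmoid
-- activations; the file path is illustrative):
--
--   saveNetwork net "net.txt"
--   net' <- loadNetwork [(sigmoid, sigmoid'), (sigmoid, sigmoid')] "net.txt"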

runLayer :: Input -> Layer -> Output
runLayer input (Layer !biases !weights _) = input <# weights + biases

-- like runLayer, but without the bias term
runLayer' :: Input -> Layer -> Output
runLayer' input (Layer !biases !weights _) = input <# weights

forward :: Input -> Session -> Output
forward input session = compute input (network session)
  where
    compute input (O l@(Layer _ _ (fn, _))) = fn $ runLayer input l
    compute input (l@(Layer _ _ (fn, _)) :- n) = compute ((fst . activation $ l) $ runLayer input l) n

-- like forward, but layers are run without their bias terms
forward' :: Input -> Session -> Output
forward' input session = compute input (network session)
  where
    compute input (O l@(Layer _ _ (fn, _))) = fn $ runLayer' input l
    compute input (l@(Layer _ _ (fn, _)) :- n) = compute ((fst . activation $ l) $ runLayer' input l) n

randomLayer :: Seed -> (Int, Int) -> (Double, Double) -> Activation -> Layer
randomLayer seed (wr, wc) (l, u) =
  let weights = uniformSample seed wr $ replicate wc (l, u)
      -- scale the uniform [0, 1] samples into the (l, u) range; note the
      -- parentheses: vec * (u - l) + l, not (vec * u) - l
      biases = randomVector seed Uniform wc * realToFrac (u - l) + realToFrac l
  in Layer biases weights

randomNetwork :: Seed -> (Double, Double) -> Int -> [(Int, Activation)] -> (Int, Activation) -> Network
randomNetwork seed bound input [] (output, a) =
  O $ randomLayer seed (input, output) bound a
randomNetwork seed bound input ((h, a):hs) output =
  randomLayer seed (input, h) bound a :-
  randomNetwork (seed + 1) bound h hs output

buildNetwork :: Seed -> (Double, Double) -> Int -> [(Int, Int, Activation)] -> (Int, Int, Activation) -> Network
buildNetwork seed bound input [] (outputRows, outputColumns, a) =
  O $ randomLayer seed (input, outputColumns) bound a
buildNetwork seed bound input ((rows, columns, a):hs) output =
  randomLayer seed (input, columns) bound a :-
  buildNetwork (seed + 1) bound columns hs output
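
-- Sketch: a 2-8-1 network with weights and biases drawn from (-1, 1) and
-- sigmoid activations throughout (layer sizes are illustrative):
--
--   net :: Network
--   net = randomNetwork 0 (-1, 1) 2 [(8, (sigmoid, sigmoid'))] (1, (sigmoid, sigmoid'))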

sigmoid :: Vector Double -> Vector Double
-- the denominator is clamped elementwise via cmap; applying 'max' to whole
-- vectors would compare them lexicographically instead
sigmoid = cmap (\a -> 1 / max (1 + exp (-a)) 1e-10)

sigmoid' :: Vector Double -> Vector Double
sigmoid' x = sigmoid x * (1 - sigmoid x)

softmax :: Vector Double -> Vector Double
softmax x = cmap (\a -> exp a / s) x
  where
    s = V.sum $ exp x

softmax' :: Vector Double -> Vector Double
softmax' = cmap (\a -> sig a * (1 - sig a))
  where
    sig x = 1 / max (1 + exp (-x)) 1e-10
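
-- The softmax above exponentiates its input directly, which can overflow
-- for large values. A numerically stable variant (a sketch, not used
-- elsewhere in this module) subtracts the maximum first, which leaves the
-- result mathematically unchanged:
--
--   stableSoftmax :: Vector Double -> Vector Double
--   stableSoftmax x = cmap (\a -> exp (a - m) / s) x
--     where
--       m = V.maximum x
--       s = V.sum $ cmap (\a -> exp (a - m)) x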

-- used for negative sampling
{-sampledSoftmax :: Vector Double -> Vector Double-}
{-sampledSoftmax x = cmap (\a -> exp a / s) x-}
  {-where-}
    {-s = V.sum . exp $ x-}

relu :: Vector Double -> Vector Double
-- rectifier: max 0, matching the 0/1 derivative in relu' below
relu = cmap (max 0)

relu' :: Vector Double -> Vector Double
relu' = cmap dev
  where dev x
          | x < 0 = 0
          | otherwise = 1

crossEntropy :: Session -> Double
crossEntropy session =
  let inputs = map fst (test session)
      labels = map (toList . snd) (test session)
      outputs = map (toList . (`forward` session)) inputs
      pairs = zip outputs labels
      n = genericLength pairs
  in sum (map set pairs) / n
  where
    set (os, ls) = (-1 / genericLength os) * sum (zipWith f os ls)
    f a y = y * log (max 1e-10 a)

crossEntropy' :: Vector Double -> Vector Double
-- constant derivative term; hmatrix broadcasts the singleton vector
-- produced by 'fromIntegral' here, so this behaves as 1/n elementwise
crossEntropy' x = 1 / fromIntegral (V.length x)

one :: Vector Double -> Vector Double
one v = vector $ replicate (V.length v) 1

train :: Input
      -> Network
      -> Output -- target
      -> Double -- learning rate
      -> Network -- network's output
train input network target alpha = fst $ run input network
  where
    run :: Input -> Network -> (Network, Vector Double)
    run input (O l@(Layer biases weights (fn, fn'))) =
      let y = runLayer input l
          o = fn y
          delta = o - target
          de = delta * fn' y

          biases' = biases - scale alpha de
          weights' = weights - scale alpha (input `outer` de) -- small inputs learn slowly
          layer = Layer biases' weights' (fn, fn') -- updated layer

          pass = weights #> de

      in (O layer, pass)
    run input (l@(Layer biases weights (fn, fn')) :- n) =
      let y = runLayer input l
          o = fn y
          (n', delta) = run o n

          de = delta * fn' y

          biases' = biases - scale alpha de
          weights' = weights - scale alpha (input `outer` de)
          layer = Layer biases' weights' (fn, fn')

          pass = weights #> de
      in (layer :- n', pass)
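
-- One gradient step on a single example (sketch; 'net' and the vectors are
-- hypothetical):
--
--   net' = train (vector [0, 1]) net (vector [1]) 0.5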

gd :: Session -> IO Session
gd session = do
  seed <- newStdGen

  let pairs = training session
      alpha = learningRate session
      net = network session

  let n = length pairs

  shuffled <- shuffleM pairs

  -- fold over the shuffled pairs, not the original order
  let newnet = foldl' (\n (input, label) -> train input n label alpha) net shuffled
      cost = crossEntropy (session { network = newnet })

  let el = map (\(e, l, _) -> (e, l)) (chart session)
      ea = map (\(e, _, a) -> (e, a)) (chart session)

  when (drawChart session) $ do
    toFile Chart.def (chartName session) $ do
      Chart.layoutlr_title Chart..= "loss over time"
      Chart.plotLeft (Chart.line "loss" [el])
      Chart.plotRight (Chart.line "learningRate" [ea])

  return session { network = newnet
                 , epoch = epoch session + 1
                 , chart = (epoch session, cost, learningRate session):chart session
                 }

sgd :: Session -> IO Session
sgd session = do
  seed <- newStdGen

  let pairs = training session
      bsize = batchSize session
      alpha = learningRate session
      net = network session

  let n = length pairs
      iterations = n `div` bsize - 1

  shuffled <- shuffleM pairs

  let iter net i =
        let batch = take bsize . drop (i * bsize) $ shuffled
            batchInputs = map fst batch
            batchLabels = map snd batch
            batchPair = zip batchInputs batchLabels
        in foldl' (\n (input, label) -> train input n label alpha) net batchPair

  let newnet = foldl' iter net [0..iterations]
      cost = crossEntropy (session { network = newnet })

  let el = map (\(e, l, _) -> (e, l)) (chart session)
      ea = map (\(e, _, a) -> (e, a)) (chart session)

  when (drawChart session) $ do
    toFile Chart.def (chartName session) $ do
      Chart.layoutlr_title Chart..= "loss over time"
      Chart.plotLeft (Chart.line "loss" [el])
      Chart.plotRight (Chart.line "learningRate" [ea])

  return session { network = newnet
                 , epoch = epoch session + 1
                 , chart = (epoch session, cost, learningRate session):chart session
                 }


accuracy :: Session -> Double
accuracy session =
  let inputs = map fst (test session)
      labels = map snd (test session)

      results = map (`forward` session) inputs
      rounded = map (map round . toList) results

      equals = zipWith (==) rounded (map (map round . toList) labels)
  in genericLength (filter (== True) equals) / genericLength inputs

learningRateDecay :: (Double, Double) -> Session -> Session
learningRateDecay (step, m) session =
  session { learningRate = max m $ learningRate session / step }

ignoreBiases :: Session -> Session
ignoreBiases session =
  session { network = rmbias (network session) }
  where
    rmbias (O (Layer biases nodes a)) = O $ Layer (biases * 0) nodes a
    rmbias ((Layer biases nodes a) :- n) = Layer (biases * 0) nodes a :- rmbias n

run :: (Session -> IO Session)
    -> Session -> IO Session
-- run exactly 'epochs session' iterations ([0..n] would run n + 1)
run fn session = foldM (\s i -> fn s) session [1..epochs session]
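
-- Sketch of a full training loop with learning-rate decay (the session and
-- decay parameters are hypothetical):
--
--   trained <- run (gd . learningRateDecay (1.01, 0.05)) session
--   print (accuracy trained)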

factorial :: Int -> Int
factorial 0 = 1
factorial x = x * factorial (x - 1)

genSeed :: IO Seed
genSeed = do
  (seed, _) <- random <$> newStdGen :: IO (Int, StdGen)
  return seed

-- replace the element at 'index', e.g.
-- replaceVector (vector [0, 0, 0]) 1 9 == vector [0, 9, 0]
replaceVector :: Vector Double -> Int -> Double -> Vector Double
replaceVector vec index value =
  let list = toList vec
  in fromList $ take index list ++ value : drop (index + 1) list

clip :: Double -> (Double, Double) -> Double
clip x (l, u) = min u (max l x)

src/Numeric/Sibe/NLP.hs (new file, 122 lines)
@@ -0,0 +1,122 @@
module Numeric.Sibe.NLP
  (Class,
   Document(..),
   accuracy,
   recall,
   precision,
   fmeasure,
   cleanText,
   cleanDocuments,
   removeWords,
   removeStopwords,
   ngram,
   ngramText,
  )
  where

import Numeric.Sibe.Utils
import Data.List
import Debug.Trace
import Data.List.Split
import Data.Maybe
import Control.Arrow ((&&&))
import Text.Regex.PCRE
import Data.Char (isSpace, isNumber, toLower)
import NLP.Stemmer
import qualified Data.Set as Set

type Class = Int

data Document = Document { text :: String
                         , c :: Class
                         } deriving (Eq, Show, Read)


cleanText :: String -> String
cleanText string =
  let puncs = filter (`notElem` ['!', '"', '#', '$', '%', '(', ')', '.', '?']) (trim string)
      spacify = foldl (\acc x -> replace x ' ' acc) puncs [',', '/', '-', '\n', '\r']
      stemmed = unwords $ map (stem Porter) (words spacify)
      nonumber = filter (not . isNumber) stemmed
      lower = map toLower nonumber
  in (unwords . words) lower -- remove unnecessary spaces
  where
    trim = f . f
      where
        f = reverse . dropWhile isSpace
    replace needle replacement =
      map (\c -> if c == needle then replacement else c)

cleanDocuments :: [Document] -> [Document]
cleanDocuments documents =
  let cleaned = map (\(Document text c) -> Document (cleanText text) c) documents
  in cleaned

removeWords :: [String] -> [Document] -> [Document]
removeWords ws documents =
  map (\(Document text c) -> Document (rm ws text) c) documents
  where
    rm list text =
      unwords $ filter (`notElem` list) (words text)

removeStopwords :: Int -> [Document] -> [Document]
removeStopwords i documents =
  let wc = wordCounts (concatDocs documents)
      wlist = sortBy (\(_, a) (_, b) -> b `compare` a) wc
      stopwords = map fst (take i wlist)
  in removeWords stopwords documents
  where
    vocabulary x = ordNub (words x)
    countWordInDoc d w = genericLength (filter (==w) d)
    wordCounts x =
      let voc = vocabulary x
      in zip voc $ map (countWordInDoc (words x)) voc

    concatDocs = concatMap (\(Document text _) -> text ++ " ")

accuracy :: [(Int, (Int, Double))] -> Double
accuracy results =
  let pairs = map (\(a, b) -> (a, fst b)) results
      correct = filter (uncurry (==)) pairs
  in genericLength correct / genericLength results

recall :: [(Int, (Int, Double))] -> Double
recall results =
  let classes = ordNub (map fst results)
      s = sum (map rec classes) / genericLength classes
  in s
  where
    rec a =
      let t = genericLength $ filter (\(c, (r, _)) -> c == r && c == a) results
          y = genericLength $ filter (\(c, (r, _)) -> c == a) results
      in t / y

precision :: [(Int, (Int, Double))] -> Double
precision results =
  let classes = ordNub (map fst results)
      s = sum (map prec classes) / genericLength classes
  in s
  where
    prec a =
      let t = genericLength $ filter (\(c, (r, _)) -> c == r && c == a) results
          y = genericLength $ filter (\(c, (r, _)) -> r == a) results
      in
        if y == 0
          then 0
          else t / y

fmeasure :: [(Int, (Int, Double))] -> Double
fmeasure results =
  let r = recall results
      p = precision results
  in (2 * p * r) / (p + r)

ngram :: Int -> [Document] -> [Document]
ngram n documents =
  map (\(Document text c) -> Document (ngramText n text) c) documents

ngramText :: Int -> String -> String
ngramText n text =
  let ws = words text
      pairs = zip [0..] ws
      grams = map (\(i, w) -> intercalate "_" $ w:((take (n - 1) . drop (i+1)) ws)) pairs
  in unwords ("<b>_":grams)
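
-- Sketch: bigrams of a short sentence (the output is shown as a comment):
--
--   ngramText 2 "the quick brown fox"
--   -- "<b>_ the_quick quick_brown brown_fox fox"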

src/Numeric/Sibe/NaiveBayes.hs (new file, 128 lines)
@@ -0,0 +1,128 @@
module Numeric.Sibe.NaiveBayes
  (Document(..),
   NB(..),
   initialize,
   run,
   session,
   accuracy,
   precision,
   recall,
   fmeasure,
   mean,
   stdev,
   cleanText,
   cleanDocuments,
   ngram,
   ngramText,
   removeWords,
   removeStopwords,
  )
  where

import Numeric.Sibe.Utils
import Numeric.Sibe.NLP
import Data.List
import Debug.Trace
import qualified Data.Set as Set
import Data.List.Split
import Data.Maybe
import Control.Arrow ((&&&))

data NB = NB { documents :: [Document]
             , classes :: [(Class, Double)]
             , vocabulary :: Int
             , megadoc :: String
             , cd :: [(Class, [Document])]
             , cw :: [(Class, [(String, Int)])]
             , cgram :: [(Class, [(String, Int)])]
             } deriving (Eq, Show, Read)

initialize :: [Document] -> [Class] -> NB
initialize documents classes =
  let megadoc = concatDocs documents
      vocabulary = genericLength ((ordNub . words) megadoc)
      -- (class, prior probability)
      cls = zip classes (map classPrior classes)

      -- (class, [document])
      cd = zip classes (map classDocs classes)

      -- (class, [(word, count)])
      cw = zip classes $ map classWordsCounts classes

      cgram = zip classes $ map classNGramCounts classes

  in NB { documents = documents
        , classes = cls
        , vocabulary = vocabulary
        , megadoc = megadoc
        , cd = cd
        , cw = cw
        , cgram = cgram
        }
  where
    concatDocs = concatMap (\(Document text _) -> text ++ " ")

    classDocs x = filter ((==x) . c) documents
    classMegadoc = concatMap (\(Document text _) -> text ++ " ") . classDocs
    classWords = words . classMegadoc
    classNGram = concatMap (\(Document text _) -> text ++ " ") . ngram 2 . classDocs
    classNGramWords = words . classNGram
    classVocabulary = ordNub . classWords
    classPrior x = genericLength (classDocs x) / genericLength documents
    countWordInDoc d w = genericLength (filter (==w) d)
    wordsCount ws voc =
      zip voc $ map (countWordInDoc ws) voc
    classWordsCounts x = wordsCount (classWords x) (classVocabulary x)
    classNGramCounts x = wordsCount (classNGramWords x) (ordNub $ classNGramWords x)

session :: [Document] -> NB -> [(Class, (Class, Double))]
session docs nb =
  let results = map (\(Document text c) -> (c, run text nb)) docs
  in results

run :: String -> NB -> (Class, Double)
run txt (NB documents classes vocabulary megadoc cd cw cgram) =
  let scores = map (score . fst) classes
      index = argmax scores
      m = maximum scores
  in (fst (classes !! index), m)
  where
    score c =
      let prior = snd (fromJust $ find ((==c) . fst) classes)

          -- Multinomial Naive Bayes would multiply by the prior:
          --   prior * product (map (prob c) (words txt))
          -- but on imbalanced training sets a down-weighted prior,
          -- alpha = 1 - prior * ALPHA (with ALPHA = 1 here), works better:
          -- it helps rare classes get higher scores.

          alpha = 1 - prior

      in alpha * product (map (prob c) (words txt))

    prob c w =
      let fcw = fromJust $ find ((==c) . fst) cw
          fcg = fromJust $ find ((==c) . fst) cgram
          tctM = find ((== w) . fst) (snd fcw)
          tct = if isJust tctM then (snd . fromJust) tctM else 0
          cvoc = sum $ map snd (snd fcw)
          voc = vocabulary
          gram = find ((==w) . last . splitOn "_" . fst) (snd fcg)
          pg = if isJust gram then (snd . fromJust) gram else 0
      -- in realToFrac (tct * pg + 1) / realToFrac (cvoc + voc) -- uncomment to enable ngrams
      in realToFrac (tct + 1) / realToFrac (cvoc + voc)
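
-- End-to-end sketch (hypothetical two-class corpus):
--
--   let docs = cleanDocuments [Document "free money now" 1, Document "meeting at noon" 0]
--       nb = initialize docs [0, 1]
--       results = session docs nb
--   print (accuracy results, fmeasure results)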

argmax :: (Ord a) => [a] -> Int
argmax x = fst $ maximumBy (\(_, a) (_, b) -> a `compare` b) (zip [0..] x)

mean :: [Double] -> Double
mean x = sum x / genericLength x

stdev :: [Double] -> Double
stdev x =
  let avg = mean x
      variance = sum (map ((^2) . subtract avg) x) / (genericLength x - 1)
  in sqrt variance

l :: (Show a) => a -> a
l a = trace (show a) a

src/Numeric/Sibe/Utils.hs (new file, 41 lines)
@@ -0,0 +1,41 @@
module Numeric.Sibe.Utils
  ( similarity
  , ordNub
  , onehot
  , average
  , pca
  ) where

import qualified Data.Vector.Storable as V
import qualified Data.Set as Set
import Numeric.LinearAlgebra

similarity :: Vector Double -> Vector Double -> Double
similarity a b = (V.sum $ a * b) / (magnitude a * magnitude b)
  where
    magnitude :: Vector Double -> Double
    magnitude v = sqrt $ V.sum (cmap (^2) v)

onehot :: Int -> Int -> Vector Double
onehot len i = vector $ replicate i 0 ++ [1] ++ replicate (len - i - 1) 0
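
-- Quick checks (exact for these inputs):
--
--   onehot 4 1                                  -- vector [0, 1, 0, 0]
--   similarity (vector [1, 0]) (vector [0, 1])  -- 0.0 (orthogonal)
--   similarity (vector [1, 2]) (vector [2, 4])  -- 1.0 (parallel)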

ordNub :: (Ord a) => [a] -> [a]
ordNub = go Set.empty
  where
    go _ [] = []
    go s (x:xs) = if x `Set.member` s then go s xs
                                      else x : go (Set.insert x s) xs

-- normalizes a vector so that its elements sum to one
average :: Vector Double -> Vector Double
average v = cmap (/ (V.sum v)) v

-- project onto the first d principal components; rows are data points and
-- each row is centered by its own mean
pca :: Matrix Double -> Int -> Matrix Double
pca m d =
  let rs = toRows m
      means = map (\v -> V.sum v / fromIntegral (V.length v)) rs
      meanReduced = map (\(a, b) -> V.map (+ (negate b)) a) $ zip rs means
      mat = fromRows meanReduced

      (u, s, v) = svd mat
      diagS = diagRect 0 s (rows mat) (cols mat)

  in u ?? (All, Take d) <> diagS ?? (Take d, Take d)

src/Numeric/Sibe/Word2Vec.hs (new file, 129 lines)
@@ -0,0 +1,129 @@
{-# LANGUAGE ScopedTypeVariables #-}

module Numeric.Sibe.Word2Vec
  ( word2vec
  , Word2Vec (..)
  , W2VMethod (..)
  ) where

import Numeric.Sibe
import Numeric.Sibe.Utils
import Debug.Trace
import Data.Char
import Data.Maybe
import Data.List
import Numeric.LinearAlgebra as H hiding (find)
import qualified Data.Vector.Storable as V
import Data.Default.Class
import Data.Function (on)
import Control.Monad
import System.Random

import Graphics.Rendering.Chart as Chart
import Graphics.Rendering.Chart.Backend.Cairo
import Control.Lens

data W2VMethod = SkipGram | CBOW

data Word2Vec = Word2Vec { docs :: [String]
                         , window :: Int
                         , dimensions :: Int
                         , method :: W2VMethod
                         , w2vChartName :: String
                         , w2vDrawChart :: Bool
                         }

instance Default Word2Vec where
  -- note: 'dimensions' and 'method' have no defaults and must be set by the caller
  def = Word2Vec { docs = []
                 , window = 2
                 , w2vChartName = "w2v.png"
                 , w2vDrawChart = False
                 }

-- the signature below is inferred from the body; both result lists pair
-- each vocabulary word with a vector
word2vec :: Word2Vec -> Session -> IO ([(String, Vector Double)], [(String, Vector Double)])
word2vec w2v session = do
  seed <- newStdGen

  let s = session { training = trainingData
                  , network = randomNetwork 0 (-1, 1) v [(dimensions w2v, (id, one))] (v, (softmax, crossEntropy'))
                  }

  when (debug s) $ do
    putStr "vocabulary size: "
    print v

    putStr "trainingData length: "
    print . length $ trainingData

  -- biases are not used in skipgram/cbow
  newses <- run (sgd . ignoreBiases) s

  -- export the hidden layer
  let (hidden@(Layer biases nodes _) :- _) = network newses

  -- run words through the hidden layer alone to get the word vector
  let computedVocVec = map (\(w, v) -> (w, runLayer' v hidden)) vocvec

  when (w2vDrawChart w2v) $ do
    let m = fromRows . map snd $ computedVocVec
        twoDimensions = pca m 2
        textData = zipWith (\s l -> (V.head l, V.last l, s)) (map fst computedVocVec) (toRows twoDimensions)

        chart = toRenderable layout
          where
            textP = plot_annotation_values .~ textData
                  $ def
            layout = layout_title .~ "word vectors"
                   $ layout_plots .~ [toPlot textP]
                   $ def

    renderableToFile def (w2vChartName w2v) chart
    return ()

  return (computedVocVec, vocvec)

  where
    -- clean documents
    ds = map cleanText (docs w2v)

    -- words of each document
    wd = map (words . (++ " ") . (map toLower)) ds

    -- all words together, used to generate the vocabulary
    ws = words (concatMap ((++ " ") . map toLower) ds)
    vocabulary = ordNub ws
    v = length vocabulary

    -- generate one-hot vectors for each word of vocabulary
    vocvec = zip vocabulary $ map (onehot v) [0..v - 1]

    -- training data: generate input and output pairs for each word and the words in its window
    trainingData = concatMap (\wds -> concatMap (iter wds) $ zip [0..] wds) wd
      where
        iter wds (i, w) =
          let v = snd . fromJust . find ((==w) . fst) $ vocvec
              -- up to 'window' words before the target, without wrapping past
              -- the start (a plain 'drop (i - window)' would let the target
              -- itself slip into its own context when i < window)
              before = drop (max 0 (i - window w2v)) . take i $ wds
              after = take (window w2v) . drop (i + 1) $ wds
              ns
                | i == 0 = after
                | i == length wds - 1 = before
                | otherwise = before ++ after
              vectorized = map (\w -> snd . fromJust $ find ((== w) . fst) vocvec) ns
              new = foldl1 (+) vectorized
          in
            if length wds <= 1
              then []
              else
                case method w2v of
                  SkipGram -> [(v, average new)]
                  CBOW -> [(average new, v)]
                  _ -> error "unsupported word2vec method"
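
-- Sketch: train 10-dimensional skip-gram vectors on a toy corpus (the data
-- and hyperparameters are illustrative; 'test = []' because no held-out set
-- is used here):
--
--   (wordvecs, _) <- word2vec
--     def { docs = ["the cat sat on the mat", "the dog sat on the log"]
--         , dimensions = 10
--         , method = SkipGram
--         }
--     def { learningRate = 0.1, epochs = 100, batchSize = 16, test = [] }
--   print (lookup "cat" wordvecs)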

cleanText :: String -> String
cleanText string =
  let puncs = filter (`notElem` ['!', '"', '#', '$', '%', '(', ')', '.', '?', '\'']) (trim string)
      spacify = foldl (\acc x -> replace x ' ' acc) puncs [',', '/', '-', '\n', '\r']
      nonumber = filter (not . isNumber) spacify
      lower = map toLower nonumber
  in (unwords . words) lower -- remove unnecessary spaces
  where
    trim = f . f
      where
        f = reverse . dropWhile isSpace
    replace needle replacement =
      map (\c -> if c == needle then replacement else c)