relu: run notmnist using relu activation and draw the chart
[wip] word2vec: work in progress implementation of word2vec
This commit is contained in:
@ -1,7 +1,6 @@
|
||||
module Sibe.NLP
|
||||
(Class,
|
||||
Document(..),
|
||||
ordNub,
|
||||
accuracy,
|
||||
recall,
|
||||
precision,
|
||||
@ -14,15 +13,16 @@ module Sibe.NLP
|
||||
ngramText,
|
||||
)
|
||||
where
|
||||
import Sibe.Utils
|
||||
import Data.List
|
||||
import Debug.Trace
|
||||
import qualified Data.Set as Set
|
||||
import Data.List.Split
|
||||
import Data.Maybe
|
||||
import Control.Arrow ((&&&))
|
||||
import Text.Regex.PCRE
|
||||
import Data.Char (isSpace, isNumber, toLower)
|
||||
import NLP.Stemmer
|
||||
import qualified Data.Set as Set
|
||||
|
||||
type Class = Int;
|
||||
|
||||
@ -73,13 +73,6 @@ module Sibe.NLP
|
||||
|
||||
concatDocs = concatMap (\(Document text _) -> text ++ " ")
|
||||
|
||||
ordNub :: (Ord a) => [a] -> [a]
|
||||
ordNub = go Set.empty
|
||||
where
|
||||
go _ [] = []
|
||||
go s (x:xs) = if x `Set.member` s then go s xs
|
||||
else x : go (Set.insert x s) xs
|
||||
|
||||
accuracy :: [(Int, (Int, Double))] -> Double
|
||||
accuracy results =
|
||||
let pairs = map (\(a, b) -> (a, fst b)) results
|
||||
|
@ -4,7 +4,6 @@ module Sibe.NaiveBayes
|
||||
initialize,
|
||||
run,
|
||||
session,
|
||||
ordNub,
|
||||
accuracy,
|
||||
precision,
|
||||
recall,
|
||||
@ -19,6 +18,7 @@ module Sibe.NaiveBayes
|
||||
removeStopwords,
|
||||
)
|
||||
where
|
||||
import Sibe.Utils
|
||||
import Sibe.NLP
|
||||
import Data.List
|
||||
import Debug.Trace
|
||||
|
24
src/Sibe/Utils.hs
Normal file
24
src/Sibe/Utils.hs
Normal file
@ -0,0 +1,24 @@
|
||||
module Sibe.Utils
|
||||
(similarity,
|
||||
ordNub,
|
||||
onehot
|
||||
) where
|
||||
import qualified Data.Vector.Storable as V
|
||||
import qualified Data.Set as Set
|
||||
import Numeric.LinearAlgebra
|
||||
|
||||
similarity :: Vector Double -> Vector Double -> Double
|
||||
similarity a b = (V.sum $ a * b) / (magnitude a * magnitude b)
|
||||
where
|
||||
magnitude :: Vector Double -> Double
|
||||
magnitude v = sqrt $ V.sum (cmap (^2) v)
|
||||
|
||||
onehot :: Int -> Int -> Vector Double
|
||||
onehot len i = vector $ replicate i 0 ++ [1] ++ replicate (len - i - 1) 0
|
||||
|
||||
ordNub :: (Ord a) => [a] -> [a]
|
||||
ordNub = go Set.empty
|
||||
where
|
||||
go _ [] = []
|
||||
go s (x:xs) = if x `Set.member` s then go s xs
|
||||
else x : go (Set.insert x s) xs
|
103
src/Sibe/Word2Vec.hs
Normal file
103
src/Sibe/Word2Vec.hs
Normal file
@ -0,0 +1,103 @@
|
||||
module Sibe.Word2Vec
|
||||
(word2vec,
|
||||
Word2Vec (..)
|
||||
) where
|
||||
import Sibe
|
||||
import Sibe.NLP
|
||||
import Sibe.Utils
|
||||
import Debug.Trace
|
||||
import Data.Char
|
||||
import Data.Maybe
|
||||
import Data.List
|
||||
import Numeric.LinearAlgebra hiding (find)
|
||||
import qualified Data.Vector.Storable as V
|
||||
import Data.Default.Class
|
||||
import Data.Function (on)
|
||||
|
||||
data Word2Vec = Word2Vec { docs :: [String]
|
||||
, window :: Int
|
||||
}
|
||||
instance Default Word2Vec where
|
||||
def = Word2Vec { docs = []
|
||||
, window = 2
|
||||
}
|
||||
|
||||
word2vec w2v session = do
|
||||
return trainingData
|
||||
let s = session { training = trainingData
|
||||
, network = buildNetwork 0 (-1, 1) v [(v, 25, (id, one))] (20, v, (softmax, crossEntropy'))
|
||||
}
|
||||
print trainingData
|
||||
newses <- run (gd . learningRateDecay (1.1, 0.1) . ignoreBiases) s
|
||||
|
||||
let (hidden@(Layer biases nodes _) :- _) = network newses
|
||||
{-let computedVocVec = map (\(w, v) -> (w, forward v newses)) vocvec-}
|
||||
let computedVocVec = map (\(w, v) -> (w, (fromRows [v]) <> nodes)) vocvec
|
||||
print computedVocVec
|
||||
|
||||
{-mapM_ (\(w, v) -> do
|
||||
putStr $ w ++ ": "
|
||||
let similarities = map (similarity v . snd) computedVocVec
|
||||
let sorted = sortBy (compare `on` similarity v . snd) computedVocVec
|
||||
{-print $ zip (map fst sorted) similarities-}
|
||||
print . take 2 . drop 1 . reverse $ map fst sorted
|
||||
) computedVocVec-}
|
||||
|
||||
return newses
|
||||
where
|
||||
ws = words (concatMap ((++ " <start> ") . map toLower) (docs w2v))
|
||||
vocabulary = ordNub ws
|
||||
v = length vocabulary
|
||||
|
||||
cooccurence = foldl' iter [] (zip [0..] ws)
|
||||
where
|
||||
iter acc (i, w) =
|
||||
let a = findIndex ((== w) . fst) acc
|
||||
before = take (window w2v) . drop (i - window w2v) $ ws
|
||||
after = take (window w2v) . drop (i + 1) $ ws
|
||||
ns = if i == 0 then after else before ++ after
|
||||
in
|
||||
if isJust a then
|
||||
let idx = fromJust a
|
||||
new = foldl (\acc n -> add acc n) (snd $ acc !! idx) ns
|
||||
in take idx acc ++ [(w, new)] ++ drop (idx + 1) acc
|
||||
else
|
||||
acc ++ [(w, map (\n -> (n, 1)) ns)]
|
||||
|
||||
add [] n = [(n, 1)]
|
||||
add ((hw, hc):hs) n
|
||||
| n == hw = (hw, hc + 1):hs
|
||||
| otherwise = (hw, hc):add hs n
|
||||
|
||||
vocvec = zip vocabulary $ map (onehot v) [0..v - 1]
|
||||
{-trainingData = map iter cooccurence
|
||||
where
|
||||
iter (w, targets) =
|
||||
let ts = map (\(w, c) -> c * (snd . fromJust $ find ((== w) . fst) vocvec)) targets
|
||||
folded = foldl (+) (vector $ replicate v 0) ts
|
||||
input = snd . fromJust $ find ((== w) . fst) vocvec
|
||||
in (input, folded)-}
|
||||
trainingData = map iter $ zip [window w2v..length vocvec - window w2v] vocvec
|
||||
where
|
||||
iter (i, (w, v)) =
|
||||
let before = take (window w2v) . drop (i - window w2v) $ vocvec
|
||||
after = take (window w2v) . drop (i + 1) $ vocvec
|
||||
ns = map snd $ before ++ after
|
||||
new = foldl1 (+) ns
|
||||
in (v, new)
|
||||
|
||||
add [] n = [(n, 1)]
|
||||
add ((hw, hc):hs) n
|
||||
| n == hw = (hw, hc + 1):hs
|
||||
| otherwise = (hw, hc):add hs n
|
||||
|
||||
wordfrequency = foldl' iter [] ws
|
||||
where
|
||||
iter acc w =
|
||||
let i = findIndex ((== w) . fst) acc
|
||||
in
|
||||
if isJust i then
|
||||
let idx = fromJust i
|
||||
in take idx acc ++ [(w, snd (acc !! idx) + 1)] ++ drop (idx + 1) acc
|
||||
else
|
||||
acc ++ [(w, 1)]
|
44
src/Sibe/Word2Vec.hs.backup
Normal file
44
src/Sibe/Word2Vec.hs.backup
Normal file
@ -0,0 +1,44 @@
|
||||
module Sibe.Word2Vec
|
||||
(word2vec,
|
||||
mapTuple
|
||||
) where
|
||||
import Sibe
|
||||
import Sibe.NLP
|
||||
import Debug.Trace
|
||||
import Data.Char
|
||||
import Data.Maybe
|
||||
import Data.List
|
||||
import Numeric.LinearAlgebra hiding (find)
|
||||
import qualified Data.Vector.Storable as V
|
||||
|
||||
word2vec docs session = do
|
||||
let cooccurence = concat $ map co docs
|
||||
a = (sigmoid, sigmoid')
|
||||
o = (softmax, crossEntropy')
|
||||
window = 2
|
||||
s = session { training = cooccurence
|
||||
, test = cooccurence
|
||||
, network = buildNetwork 0 (-1, 1) n [(n, 300, (id, id))] (300, n, (softmax, crossEntropy'))
|
||||
}
|
||||
print $ network s
|
||||
newses <- run gd s
|
||||
return (newses, cooccurence, vocabulary, vocvec)
|
||||
where
|
||||
n = length vocabulary
|
||||
vocabulary = ordNub . words . map toLower . concatMap (++ " ") $ docs
|
||||
vocvec = zip vocabulary $ map tovec [0..n]
|
||||
tovec i = replicate i 0 ++ [1] ++ replicate (n - i - 1) 0
|
||||
co d =
|
||||
let p = pairs d
|
||||
in map (\(a, [b, c]) -> (f a, V.concat [f b, f c])) p
|
||||
where
|
||||
f w = vector . snd . fromJust $ find ((== w) . fst) vocvec
|
||||
pairs d = concatMap iter [0..length ws]
|
||||
where
|
||||
ws = words $ map toLower d
|
||||
iter i
|
||||
| i > 0 && i < length ws - 1 = [(ws !! i, [ws !! (i - 1), ws !! (i + 1)])]
|
||||
| otherwise = []
|
||||
|
||||
mapTuple :: (a -> b) -> (a, a) -> (b, b)
|
||||
mapTuple f (a, b) = (f a, f b)
|
Reference in New Issue
Block a user