diff --git a/.gitmodules b/.gitmodules
index 17f4304..4cf4c67 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -4,3 +4,6 @@
[submodule "examples/notMNIST"]
path = examples/notMNIST
url = git@github.com:mdibaiee/notMNIST
+[submodule "examples/blogs-corpus"]
+ path = examples/blogs-corpus
+ url = git@github.com:mdibaiee/blogs-corpus
diff --git a/examples/blogs-corpus b/examples/blogs-corpus
new file mode 160000
index 0000000..f242d6e
--- /dev/null
+++ b/examples/blogs-corpus
@@ -0,0 +1 @@
+Subproject commit f242d6e602cacf41cd662bfca57501638bcf7c2a
diff --git a/examples/word2vec.hs b/examples/word2vec.hs
index 921fa88..1669c41 100644
--- a/examples/word2vec.hs
+++ b/examples/word2vec.hs
@@ -12,27 +12,45 @@ module Main where
import Data.Function (on)
import Numeric.LinearAlgebra
import System.IO
+ import System.Directory
import Data.List.Split
+ import Control.Exception (evaluate)
+ import Debug.Trace
+ import Data.Char
+ import System.Random
+
+ rf :: FilePath -> IO String
+ rf p = do
+ hs <- openFile p ReadMode
+ hSetEncoding hs latin1
+ content <- evaluate =<< hGetContents hs
+ length content `seq` hClose hs
+ return content
main = do
+ setStdGen (mkStdGen 100)
sws <- lines <$> readFile "examples/stopwords"
- {-ds <- do
- content <- readFile "examples/doc-classifier-data/data-reuters"
- let splitted = splitOn (replicate 10 '-' ++ "\n") content
- d = concatMap (tail . lines) (take 100 splitted)
- return $ removeWords sws d-}
- --let ds = ["I like deep learning", "I like NLP", "I enjoy flying"]
+
+ -- real data, takes a lot of time to train
+ {-ds <- do-}
+ {-files <- filter ((/= "xml") . take 3 . reverse) <$> listDirectory "examples/blogs-corpus/"-}
+ {-contents <- mapM (rf . ("examples/blogs-corpus/" ++)) files-}
+
+ {-let texts = map (unwords . splitOn " ") contents-}
+ {-let tags = ["", "", "", "", "", "", " "]-}
+ {-return $ map cleanText $ removeWords (sws ++ tags) texts-}
+
let ds = ["the king loves the queen", "the queen loves the king",
"the dwarf hates the king", "the queen hates the dwarf",
"the dwarf poisons the king", "the dwarf poisons the queen"]
- let session = def { learningRate = 5e-2
+ let session = def { learningRate = 1e-1
, batchSize = 1
- , epochs = 100
+ , epochs = 200
, debug = True
} :: Session
w2v = def { docs = ds
- , dimensions = 50
+ , dimensions = 25
, method = SkipGram
, window = 2
} :: Word2Vec
@@ -49,10 +67,25 @@ module Main where
return ()
+ cleanText :: String -> String
+ cleanText string =
+ let notag = unwords $ filter ((/= "") . take 6) (words string)
+ ws = unwords $ filter (`notElem` ["urlLink"]) (words notag)
+ spacify = foldl (\acc x -> replace x ' ' acc) (trim ws) [',', '/', '-', '\n', '\r', '?', '.', '(', ')', '%', '$', '"', ';', ':', '!', '\'']
+ nonumber = filter (not . isNumber) spacify
+ lower = map toLower nonumber
+ in unwords . words $ lower
+ where
+ trim = f . f
+ where
+ f = reverse . dropWhile isSpace
+ replace needle replacement =
+ map (\c -> if c == needle then replacement else c)
+
removeWords :: [String] -> [String] -> [String]
removeWords ws documents =
- map (rm ws) documents
+ map rm documents
where
- rm list text =
- unwords $ filter (`notElem` list) (words text)
+ rm text =
+ unwords $ filter (`notElem` ws) (words text)
diff --git a/sibe.cabal b/sibe.cabal
index de76920..5bde54d 100644
--- a/sibe.cabal
+++ b/sibe.cabal
@@ -16,6 +16,7 @@ cabal-version: >=1.10
library
hs-source-dirs: src
exposed-modules: Sibe, Sibe.NaiveBayes, Sibe.NLP, Sibe.Word2Vec, Sibe.Utils
+ ghc-options: -threaded -rtsopts -with-rtsopts=-N
build-depends: base >= 4.7 && < 5
, hmatrix
, random
@@ -53,6 +54,8 @@ executable example-word2vec
, data-default-class
, split
, vector
+ , directory
+ , random
default-language: Haskell2010
executable example-424
diff --git a/src/Sibe.hs b/src/Sibe.hs
index eb97ca5..7460aab 100644
--- a/src/Sibe.hs
+++ b/src/Sibe.hs
@@ -26,6 +26,7 @@ module Sibe
sigmoid',
softmax,
softmax',
+ sampledSoftmax,
relu,
relu',
crossEntropy,
@@ -181,6 +182,12 @@ module Sibe
where
sig x = 1 / max (1 + exp (-x)) 1e-10
+ -- used for negative sampling
+ sampledSoftmax :: Int -> Vector Double -> Vector Double
+ sampledSoftmax n x = cmap (\a -> exp a / s) x
+ where
+ s = V.sum . exp $ V.take n x
+
relu :: Vector Double -> Vector Double
relu = cmap (max 0.1)
diff --git a/src/Sibe/Word2Vec.hs b/src/Sibe/Word2Vec.hs
index 94c3ea7..b18f8ae 100644
--- a/src/Sibe/Word2Vec.hs
+++ b/src/Sibe/Word2Vec.hs
@@ -14,6 +14,7 @@ module Sibe.Word2Vec
import Data.Default.Class
import Data.Function (on)
import Control.Monad
+ import System.Random
data W2VMethod = SkipGram | CBOW
data Word2Vec = Word2Vec { docs :: [String]
@@ -27,8 +28,9 @@ module Sibe.Word2Vec
}
word2vec w2v session = do
+ seed <- newStdGen
let s = session { training = trainingData
- , network = randomNetwork 0 (-1, 1) v [(dimensions w2v, (id, one))] (v, (softmax, one))
+ , network = randomNetwork 0 (-1, 1) v [(dimensions w2v, (id, one))] (v, (softmax, crossEntropy'))
}
when (debug s) $ do