diff --git a/.gitmodules b/.gitmodules index 17f4304..4cf4c67 100644 --- a/.gitmodules +++ b/.gitmodules @@ -4,3 +4,6 @@ [submodule "examples/notMNIST"] path = examples/notMNIST url = git@github.com:mdibaiee/notMNIST +[submodule "examples/blogs-corpus"] + path = examples/blogs-corpus + url = git@github.com:mdibaiee/blogs-corpus diff --git a/examples/blogs-corpus b/examples/blogs-corpus new file mode 160000 index 0000000..f242d6e --- /dev/null +++ b/examples/blogs-corpus @@ -0,0 +1 @@ +Subproject commit f242d6e602cacf41cd662bfca57501638bcf7c2a diff --git a/examples/word2vec.hs b/examples/word2vec.hs index 921fa88..1669c41 100644 --- a/examples/word2vec.hs +++ b/examples/word2vec.hs @@ -12,27 +12,45 @@ module Main where import Data.Function (on) import Numeric.LinearAlgebra import System.IO + import System.Directory import Data.List.Split + import Control.Exception (evaluate) + import Debug.Trace + import Data.Char + import System.Random + + rf :: FilePath -> IO String + rf p = do + hs <- openFile p ReadMode + hSetEncoding hs latin1 + content <- evaluate =<< hGetContents hs + length content `seq` hClose hs + return content main = do + setStdGen (mkStdGen 100) sws <- lines <$> readFile "examples/stopwords" - {-ds <- do - content <- readFile "examples/doc-classifier-data/data-reuters" - let splitted = splitOn (replicate 10 '-' ++ "\n") content - d = concatMap (tail . lines) (take 100 splitted) - return $ removeWords sws d-} - --let ds = ["I like deep learning", "I like NLP", "I enjoy flying"] + + -- real data, takes a lot of time to train + {-ds <- do-} + {-files <- filter ((/= "xml") . take 3 . reverse) <$> listDirectory "examples/blogs-corpus/"-} + {-contents <- mapM (rf . ("examples/blogs-corpus/" ++)) files-} + + {-let texts = map (unwords . splitOn " ") contents-} + {-let tags = ["", "", "", "", "", "", " "]-} + {-return $ map cleanText $ removeWords (sws ++ tags) texts-} + let ds = ["the king loves the queen", "the queen loves the king", "the dwarf hates the king", "the queen hates the dwarf", "the dwarf poisons the king", "the dwarf poisons the queen"] - let session = def { learningRate = 5e-2 + let session = def { learningRate = 1e-1 , batchSize = 1 - , epochs = 100 + , epochs = 200 , debug = True } :: Session w2v = def { docs = ds - , dimensions = 50 + , dimensions = 25 , method = SkipGram , window = 2 } :: Word2Vec @@ -49,10 +67,25 @@ module Main where return () + cleanText :: String -> String + cleanText string = + let notag = unwords $ filter ((/= "") . take 6) (words string) + ws = unwords $ filter (`notElem` ["urlLink"]) (words notag) + spacify = foldl (\acc x -> replace x ' ' acc) (trim ws) [',', '/', '-', '\n', '\r', '?', '.', '(', ')', '%', '$', '"', ';', ':', '!', '\''] + nonumber = filter (not . isNumber) spacify + lower = map toLower nonumber + in unwords . words $ lower + where + trim = f . f + where + f = reverse . dropWhile isSpace + replace needle replacement = + map (\c -> if c == needle then replacement else c) + removeWords :: [String] -> [String] -> [String] removeWords ws documents = - map (rm ws) documents + map rm documents where - rm list text = - unwords $ filter (`notElem` list) (words text) + rm text = + unwords $ filter (`notElem` ws) (words text) diff --git a/sibe.cabal b/sibe.cabal index de76920..5bde54d 100644 --- a/sibe.cabal +++ b/sibe.cabal @@ -16,6 +16,7 @@ cabal-version: >=1.10 library hs-source-dirs: src exposed-modules: Sibe, Sibe.NaiveBayes, Sibe.NLP, Sibe.Word2Vec, Sibe.Utils + ghc-options: -threaded -rtsopts -with-rtsopts=-N build-depends: base >= 4.7 && < 5 , hmatrix , random @@ -53,6 +54,8 @@ executable example-word2vec , data-default-class , split , vector + , directory + , random default-language: Haskell2010 executable example-424 diff --git a/src/Sibe.hs b/src/Sibe.hs index eb97ca5..7460aab 100644 --- a/src/Sibe.hs +++ b/src/Sibe.hs @@ -26,6 +26,7 @@ module Sibe sigmoid', softmax, softmax', + sampledSoftmax, relu, relu', crossEntropy, @@ -181,6 +182,12 @@ module Sibe where sig x = 1 / max (1 + exp (-x)) 1e-10 + -- used for negative sampling + sampledSoftmax :: Int -> Vector Double -> Vector Double + sampledSoftmax n x = cmap (\a -> exp a / s) x + where + s = V.sum . exp $ V.take n x + relu :: Vector Double -> Vector Double relu = cmap (max 0.1) diff --git a/src/Sibe/Word2Vec.hs b/src/Sibe/Word2Vec.hs index 94c3ea7..b18f8ae 100644 --- a/src/Sibe/Word2Vec.hs +++ b/src/Sibe/Word2Vec.hs @@ -14,6 +14,7 @@ module Sibe.Word2Vec import Data.Default.Class import Data.Function (on) import Control.Monad + import System.Random data W2VMethod = SkipGram | CBOW data Word2Vec = Word2Vec { docs :: [String] @@ -27,8 +28,9 @@ module Sibe.Word2Vec } word2vec w2v session = do + seed <- newStdGen let s = session { training = trainingData - , network = randomNetwork 0 (-1, 1) v [(dimensions w2v, (id, one))] (v, (softmax, one)) + , network = randomNetwork 0 (-1, 1) v [(dimensions w2v, (id, one))] (v, (softmax, crossEntropy')) } when (debug s) $ do