feat(blogs-corpus): new corpus for word2vec

2016-09-19 16:00:45 +04:30
parent f16cc26798
commit d9d24f69a6
6 changed files with 62 additions and 13 deletions
--- a/.gitmodules
+++ b/.gitmodules
@@ -4,3 +4,6 @@
 [submodule "examples/notMNIST"]
 	path = examples/notMNIST
 	url = git@github.com:mdibaiee/notMNIST
 [submodule "examples/blogs-corpus"]
 	path = examples/blogs-corpus
 	url = git@github.com:mdibaiee/blogs-corpus
--- a/examples/blogs-corpus
+++ b/examples/blogs-corpus
--- a/examples/word2vec.hs
+++ b/examples/word2vec.hs
@@ -12,27 +12,45 @@ module Main where
  import Data.Function (on)
  import Numeric.LinearAlgebra
  import System.IO
  import System.Directory
  import Data.List.Split
  import Control.Exception (evaluate)
  import Debug.Trace
  import Data.Char
  import System.Random
  rf :: FilePath -> IO String
  rf p = do
    hs <- openFile p ReadMode
    hSetEncoding hs latin1
    content <- evaluate =<< hGetContents hs
    length content `seq` hClose hs
    return content
  main = do
    setStdGen (mkStdGen 100)
    sws <- lines <$> readFile "examples/stopwords"
-    {-ds <- do
+
-        content <- readFile "examples/doc-classifier-data/data-reuters"
+    -- real data, takes a lot of time to train
-        let splitted = splitOn (replicate 10 '-' ++ "\n") content
+    {-ds <- do-}
-            d = concatMap (tail . lines) (take 100 splitted)
+        {-files <- filter ((/= "xml") . take 3 . reverse) <$> listDirectory "examples/blogs-corpus/"-}
-        return $ removeWords sws d-}
+        {-contents <- mapM (rf . ("examples/blogs-corpus/" ++)) files-}
-    --let ds = ["I like deep learning", "I like NLP", "I enjoy flying"]
+
        {-let texts = map (unwords . splitOn "&nbsp;") contents-}
        {-let tags = ["<Blog>", "</Blog>", "<date>", "</date>", "<post>", "</post>", "&nbsp;"]-}
        {-return $ map cleanText $ removeWords (sws ++ tags) texts-}
    let ds = ["the king loves the queen", "the queen loves the king",
              "the dwarf hates the king", "the queen hates the dwarf",
              "the dwarf poisons the king", "the dwarf poisons the queen"]
-    let session = def { learningRate = 5e-2
+    let session = def { learningRate = 1e-1
                      , batchSize = 1
-                      , epochs = 100
+                      , epochs = 200
                      , debug = True
                      } :: Session
        w2v = def { docs = ds
-                  , dimensions = 50
+                  , dimensions = 25
                  , method = SkipGram
                  , window = 2
                  } :: Word2Vec
@@ -49,10 +67,25 @@ module Main where
    return ()
  cleanText :: String -> String
  cleanText string = 
    let notag = unwords $ filter ((/= "<date>") . take 6) (words string)
        ws = unwords $ filter (`notElem` ["urlLink"]) (words notag)
        spacify = foldl (\acc x -> replace x ' ' acc) (trim ws) [',', '/', '-', '\n', '\r', '?', '.', '(', ')', '%', '$', '"', ';', ':', '!', '\'']
        nonumber = filter (not . isNumber) spacify
        lower = map toLower nonumber
    in unwords . words $ lower
    where
      trim = f . f
        where
          f = reverse . dropWhile isSpace
      replace needle replacement =
        map (\c -> if c == needle then replacement else c)
  removeWords :: [String] -> [String] -> [String]
  removeWords ws documents =
-    map (rm ws) documents
+    map rm documents
    where
-        rm list text =
+        rm text = 
-          unwords $ filter (`notElem` list) (words text)
+          unwords $ filter (`notElem` ws) (words text)
--- a/sibe.cabal
+++ b/sibe.cabal
@@ -16,6 +16,7 @@ cabal-version:       >=1.10
 library
  hs-source-dirs:      src
  exposed-modules:     Sibe, Sibe.NaiveBayes, Sibe.NLP, Sibe.Word2Vec, Sibe.Utils
  ghc-options:         -threaded -rtsopts -with-rtsopts=-N
  build-depends:       base >= 4.7 && < 5
                     , hmatrix
                     , random
@@ -53,6 +54,8 @@ executable example-word2vec
                     , data-default-class
                     , split
                     , vector
                     , directory
                     , random
  default-language:    Haskell2010
 executable example-424
--- a/src/Sibe.hs
+++ b/src/Sibe.hs
@@ -26,6 +26,7 @@ module Sibe
     sigmoid',
     softmax,
     softmax',
     sampledSoftmax,
     relu,
     relu',
     crossEntropy,
@@ -181,6 +182,12 @@ module Sibe
        where
          sig x = 1 / max (1 + exp (-x)) 1e-10
      -- used for negative sampling
      sampledSoftmax :: Int -> Vector Double -> Vector Double
      sampledSoftmax n x = cmap (\a -> exp a / s) x
        where
          s = V.sum . exp $ V.take n x
      relu :: Vector Double -> Vector Double
      relu = cmap (max 0.1)
--- a/src/Sibe/Word2Vec.hs
+++ b/src/Sibe/Word2Vec.hs
@@ -14,6 +14,7 @@ module Sibe.Word2Vec
    import Data.Default.Class
    import Data.Function (on)
    import Control.Monad
    import System.Random
    data W2VMethod = SkipGram | CBOW
    data Word2Vec = Word2Vec { docs :: [String]
@@ -27,8 +28,9 @@ module Sibe.Word2Vec
                     }
    word2vec w2v session = do
      seed <- newStdGen
      let s = session { training = trainingData
-                      , network = randomNetwork 0 (-1, 1) v [(dimensions w2v, (id, one))] (v, (softmax, one))
+                      , network = randomNetwork 0 (-1, 1) v [(dimensions w2v, (id, one))] (v, (softmax, crossEntropy'))
                      }
      when (debug s) $ do