feat(blogs-corpus): new corpus for word2vec

This commit is contained in:
Mahdi Dibaiee 2016-09-19 16:00:45 +04:30
parent f16cc26798
commit d9d24f69a6
6 changed files with 62 additions and 13 deletions

3
.gitmodules vendored
View File

@ -4,3 +4,6 @@
[submodule "examples/notMNIST"] [submodule "examples/notMNIST"]
path = examples/notMNIST path = examples/notMNIST
url = git@github.com:mdibaiee/notMNIST url = git@github.com:mdibaiee/notMNIST
[submodule "examples/blogs-corpus"]
path = examples/blogs-corpus
url = git@github.com:mdibaiee/blogs-corpus

1
examples/blogs-corpus Submodule

@ -0,0 +1 @@
Subproject commit f242d6e602cacf41cd662bfca57501638bcf7c2a

View File

@ -12,27 +12,45 @@ module Main where
import Data.Function (on) import Data.Function (on)
import Numeric.LinearAlgebra import Numeric.LinearAlgebra
import System.IO import System.IO
import System.Directory
import Data.List.Split import Data.List.Split
import Control.Exception (evaluate)
import Debug.Trace
import Data.Char
import System.Random
rf :: FilePath -> IO String
rf p = do
hs <- openFile p ReadMode
hSetEncoding hs latin1
content <- evaluate =<< hGetContents hs
length content `seq` hClose hs
return content
main = do main = do
setStdGen (mkStdGen 100)
sws <- lines <$> readFile "examples/stopwords" sws <- lines <$> readFile "examples/stopwords"
{-ds <- do
content <- readFile "examples/doc-classifier-data/data-reuters" -- real data, takes a lot of time to train
let splitted = splitOn (replicate 10 '-' ++ "\n") content {-ds <- do-}
d = concatMap (tail . lines) (take 100 splitted) {-files <- filter ((/= "xml") . take 3 . reverse) <$> listDirectory "examples/blogs-corpus/"-}
return $ removeWords sws d-} {-contents <- mapM (rf . ("examples/blogs-corpus/" ++)) files-}
--let ds = ["I like deep learning", "I like NLP", "I enjoy flying"]
{-let texts = map (unwords . splitOn "&nbsp;") contents-}
{-let tags = ["<Blog>", "</Blog>", "<date>", "</date>", "<post>", "</post>", "&nbsp;"]-}
{-return $ map cleanText $ removeWords (sws ++ tags) texts-}
let ds = ["the king loves the queen", "the queen loves the king", let ds = ["the king loves the queen", "the queen loves the king",
"the dwarf hates the king", "the queen hates the dwarf", "the dwarf hates the king", "the queen hates the dwarf",
"the dwarf poisons the king", "the dwarf poisons the queen"] "the dwarf poisons the king", "the dwarf poisons the queen"]
let session = def { learningRate = 5e-2 let session = def { learningRate = 1e-1
, batchSize = 1 , batchSize = 1
, epochs = 100 , epochs = 200
, debug = True , debug = True
} :: Session } :: Session
w2v = def { docs = ds w2v = def { docs = ds
, dimensions = 50 , dimensions = 25
, method = SkipGram , method = SkipGram
, window = 2 , window = 2
} :: Word2Vec } :: Word2Vec
@ -49,10 +67,25 @@ module Main where
return () return ()
cleanText :: String -> String
cleanText string =
let notag = unwords $ filter ((/= "<date>") . take 6) (words string)
ws = unwords $ filter (`notElem` ["urlLink"]) (words notag)
spacify = foldl (\acc x -> replace x ' ' acc) (trim ws) [',', '/', '-', '\n', '\r', '?', '.', '(', ')', '%', '$', '"', ';', ':', '!', '\'']
nonumber = filter (not . isNumber) spacify
lower = map toLower nonumber
in unwords . words $ lower
where
trim = f . f
where
f = reverse . dropWhile isSpace
replace needle replacement =
map (\c -> if c == needle then replacement else c)
removeWords :: [String] -> [String] -> [String] removeWords :: [String] -> [String] -> [String]
removeWords ws documents = removeWords ws documents =
map (rm ws) documents map rm documents
where where
rm list text = rm text =
unwords $ filter (`notElem` list) (words text) unwords $ filter (`notElem` ws) (words text)

View File

@ -16,6 +16,7 @@ cabal-version: >=1.10
library library
hs-source-dirs: src hs-source-dirs: src
exposed-modules: Sibe, Sibe.NaiveBayes, Sibe.NLP, Sibe.Word2Vec, Sibe.Utils exposed-modules: Sibe, Sibe.NaiveBayes, Sibe.NLP, Sibe.Word2Vec, Sibe.Utils
ghc-options: -threaded -rtsopts -with-rtsopts=-N
build-depends: base >= 4.7 && < 5 build-depends: base >= 4.7 && < 5
, hmatrix , hmatrix
, random , random
@ -53,6 +54,8 @@ executable example-word2vec
, data-default-class , data-default-class
, split , split
, vector , vector
, directory
, random
default-language: Haskell2010 default-language: Haskell2010
executable example-424 executable example-424

View File

@ -26,6 +26,7 @@ module Sibe
sigmoid', sigmoid',
softmax, softmax,
softmax', softmax',
sampledSoftmax,
relu, relu,
relu', relu',
crossEntropy, crossEntropy,
@ -181,6 +182,12 @@ module Sibe
where where
sig x = 1 / max (1 + exp (-x)) 1e-10 sig x = 1 / max (1 + exp (-x)) 1e-10
-- used for negative sampling
sampledSoftmax :: Int -> Vector Double -> Vector Double
sampledSoftmax n x = cmap (\a -> exp a / s) x
where
s = V.sum . exp $ V.take n x
relu :: Vector Double -> Vector Double relu :: Vector Double -> Vector Double
relu = cmap (max 0.1) relu = cmap (max 0.1)

View File

@ -14,6 +14,7 @@ module Sibe.Word2Vec
import Data.Default.Class import Data.Default.Class
import Data.Function (on) import Data.Function (on)
import Control.Monad import Control.Monad
import System.Random
data W2VMethod = SkipGram | CBOW data W2VMethod = SkipGram | CBOW
data Word2Vec = Word2Vec { docs :: [String] data Word2Vec = Word2Vec { docs :: [String]
@ -27,8 +28,9 @@ module Sibe.Word2Vec
} }
word2vec w2v session = do word2vec w2v session = do
seed <- newStdGen
let s = session { training = trainingData let s = session { training = trainingData
, network = randomNetwork 0 (-1, 1) v [(dimensions w2v, (id, one))] (v, (softmax, one)) , network = randomNetwork 0 (-1, 1) v [(dimensions w2v, (id, one))] (v, (softmax, crossEntropy'))
} }
when (debug s) $ do when (debug s) $ do