feat(blogs-corpus): new corpus for word2vec
This commit is contained in:
parent
f16cc26798
commit
d9d24f69a6
3
.gitmodules
vendored
3
.gitmodules
vendored
@ -4,3 +4,6 @@
|
|||||||
[submodule "examples/notMNIST"]
|
[submodule "examples/notMNIST"]
|
||||||
path = examples/notMNIST
|
path = examples/notMNIST
|
||||||
url = git@github.com:mdibaiee/notMNIST
|
url = git@github.com:mdibaiee/notMNIST
|
||||||
|
[submodule "examples/blogs-corpus"]
|
||||||
|
path = examples/blogs-corpus
|
||||||
|
url = git@github.com:mdibaiee/blogs-corpus
|
||||||
|
1
examples/blogs-corpus
Submodule
1
examples/blogs-corpus
Submodule
@ -0,0 +1 @@
|
|||||||
|
Subproject commit f242d6e602cacf41cd662bfca57501638bcf7c2a
|
@ -12,27 +12,45 @@ module Main where
|
|||||||
import Data.Function (on)
|
import Data.Function (on)
|
||||||
import Numeric.LinearAlgebra
|
import Numeric.LinearAlgebra
|
||||||
import System.IO
|
import System.IO
|
||||||
|
import System.Directory
|
||||||
import Data.List.Split
|
import Data.List.Split
|
||||||
|
import Control.Exception (evaluate)
|
||||||
|
import Debug.Trace
|
||||||
|
import Data.Char
|
||||||
|
import System.Random
|
||||||
|
|
||||||
|
rf :: FilePath -> IO String
|
||||||
|
rf p = do
|
||||||
|
hs <- openFile p ReadMode
|
||||||
|
hSetEncoding hs latin1
|
||||||
|
content <- evaluate =<< hGetContents hs
|
||||||
|
length content `seq` hClose hs
|
||||||
|
return content
|
||||||
|
|
||||||
main = do
|
main = do
|
||||||
|
setStdGen (mkStdGen 100)
|
||||||
sws <- lines <$> readFile "examples/stopwords"
|
sws <- lines <$> readFile "examples/stopwords"
|
||||||
{-ds <- do
|
|
||||||
content <- readFile "examples/doc-classifier-data/data-reuters"
|
-- real data, takes a lot of time to train
|
||||||
let splitted = splitOn (replicate 10 '-' ++ "\n") content
|
{-ds <- do-}
|
||||||
d = concatMap (tail . lines) (take 100 splitted)
|
{-files <- filter ((/= "xml") . take 3 . reverse) <$> listDirectory "examples/blogs-corpus/"-}
|
||||||
return $ removeWords sws d-}
|
{-contents <- mapM (rf . ("examples/blogs-corpus/" ++)) files-}
|
||||||
--let ds = ["I like deep learning", "I like NLP", "I enjoy flying"]
|
|
||||||
|
{-let texts = map (unwords . splitOn " ") contents-}
|
||||||
|
{-let tags = ["<Blog>", "</Blog>", "<date>", "</date>", "<post>", "</post>", " "]-}
|
||||||
|
{-return $ map cleanText $ removeWords (sws ++ tags) texts-}
|
||||||
|
|
||||||
let ds = ["the king loves the queen", "the queen loves the king",
|
let ds = ["the king loves the queen", "the queen loves the king",
|
||||||
"the dwarf hates the king", "the queen hates the dwarf",
|
"the dwarf hates the king", "the queen hates the dwarf",
|
||||||
"the dwarf poisons the king", "the dwarf poisons the queen"]
|
"the dwarf poisons the king", "the dwarf poisons the queen"]
|
||||||
|
|
||||||
let session = def { learningRate = 5e-2
|
let session = def { learningRate = 1e-1
|
||||||
, batchSize = 1
|
, batchSize = 1
|
||||||
, epochs = 100
|
, epochs = 200
|
||||||
, debug = True
|
, debug = True
|
||||||
} :: Session
|
} :: Session
|
||||||
w2v = def { docs = ds
|
w2v = def { docs = ds
|
||||||
, dimensions = 50
|
, dimensions = 25
|
||||||
, method = SkipGram
|
, method = SkipGram
|
||||||
, window = 2
|
, window = 2
|
||||||
} :: Word2Vec
|
} :: Word2Vec
|
||||||
@ -49,10 +67,25 @@ module Main where
|
|||||||
|
|
||||||
return ()
|
return ()
|
||||||
|
|
||||||
|
cleanText :: String -> String
|
||||||
|
cleanText string =
|
||||||
|
let notag = unwords $ filter ((/= "<date>") . take 6) (words string)
|
||||||
|
ws = unwords $ filter (`notElem` ["urlLink"]) (words notag)
|
||||||
|
spacify = foldl (\acc x -> replace x ' ' acc) (trim ws) [',', '/', '-', '\n', '\r', '?', '.', '(', ')', '%', '$', '"', ';', ':', '!', '\'']
|
||||||
|
nonumber = filter (not . isNumber) spacify
|
||||||
|
lower = map toLower nonumber
|
||||||
|
in unwords . words $ lower
|
||||||
|
where
|
||||||
|
trim = f . f
|
||||||
|
where
|
||||||
|
f = reverse . dropWhile isSpace
|
||||||
|
replace needle replacement =
|
||||||
|
map (\c -> if c == needle then replacement else c)
|
||||||
|
|
||||||
removeWords :: [String] -> [String] -> [String]
|
removeWords :: [String] -> [String] -> [String]
|
||||||
removeWords ws documents =
|
removeWords ws documents =
|
||||||
map (rm ws) documents
|
map rm documents
|
||||||
where
|
where
|
||||||
rm list text =
|
rm text =
|
||||||
unwords $ filter (`notElem` list) (words text)
|
unwords $ filter (`notElem` ws) (words text)
|
||||||
|
|
||||||
|
@ -16,6 +16,7 @@ cabal-version: >=1.10
|
|||||||
library
|
library
|
||||||
hs-source-dirs: src
|
hs-source-dirs: src
|
||||||
exposed-modules: Sibe, Sibe.NaiveBayes, Sibe.NLP, Sibe.Word2Vec, Sibe.Utils
|
exposed-modules: Sibe, Sibe.NaiveBayes, Sibe.NLP, Sibe.Word2Vec, Sibe.Utils
|
||||||
|
ghc-options: -threaded -rtsopts -with-rtsopts=-N
|
||||||
build-depends: base >= 4.7 && < 5
|
build-depends: base >= 4.7 && < 5
|
||||||
, hmatrix
|
, hmatrix
|
||||||
, random
|
, random
|
||||||
@ -53,6 +54,8 @@ executable example-word2vec
|
|||||||
, data-default-class
|
, data-default-class
|
||||||
, split
|
, split
|
||||||
, vector
|
, vector
|
||||||
|
, directory
|
||||||
|
, random
|
||||||
default-language: Haskell2010
|
default-language: Haskell2010
|
||||||
|
|
||||||
executable example-424
|
executable example-424
|
||||||
|
@ -26,6 +26,7 @@ module Sibe
|
|||||||
sigmoid',
|
sigmoid',
|
||||||
softmax,
|
softmax,
|
||||||
softmax',
|
softmax',
|
||||||
|
sampledSoftmax,
|
||||||
relu,
|
relu,
|
||||||
relu',
|
relu',
|
||||||
crossEntropy,
|
crossEntropy,
|
||||||
@ -181,6 +182,12 @@ module Sibe
|
|||||||
where
|
where
|
||||||
sig x = 1 / max (1 + exp (-x)) 1e-10
|
sig x = 1 / max (1 + exp (-x)) 1e-10
|
||||||
|
|
||||||
|
-- used for negative sampling
|
||||||
|
sampledSoftmax :: Int -> Vector Double -> Vector Double
|
||||||
|
sampledSoftmax n x = cmap (\a -> exp a / s) x
|
||||||
|
where
|
||||||
|
s = V.sum . exp $ V.take n x
|
||||||
|
|
||||||
relu :: Vector Double -> Vector Double
|
relu :: Vector Double -> Vector Double
|
||||||
relu = cmap (max 0.1)
|
relu = cmap (max 0.1)
|
||||||
|
|
||||||
|
@ -14,6 +14,7 @@ module Sibe.Word2Vec
|
|||||||
import Data.Default.Class
|
import Data.Default.Class
|
||||||
import Data.Function (on)
|
import Data.Function (on)
|
||||||
import Control.Monad
|
import Control.Monad
|
||||||
|
import System.Random
|
||||||
|
|
||||||
data W2VMethod = SkipGram | CBOW
|
data W2VMethod = SkipGram | CBOW
|
||||||
data Word2Vec = Word2Vec { docs :: [String]
|
data Word2Vec = Word2Vec { docs :: [String]
|
||||||
@ -27,8 +28,9 @@ module Sibe.Word2Vec
|
|||||||
}
|
}
|
||||||
|
|
||||||
word2vec w2v session = do
|
word2vec w2v session = do
|
||||||
|
seed <- newStdGen
|
||||||
let s = session { training = trainingData
|
let s = session { training = trainingData
|
||||||
, network = randomNetwork 0 (-1, 1) v [(dimensions w2v, (id, one))] (v, (softmax, one))
|
, network = randomNetwork 0 (-1, 1) v [(dimensions w2v, (id, one))] (v, (softmax, crossEntropy'))
|
||||||
}
|
}
|
||||||
|
|
||||||
when (debug s) $ do
|
when (debug s) $ do
|
||||||
|
Loading…
Reference in New Issue
Block a user