feat(blogs-corpus): new corpus for word2vec
parent f16cc26798
commit d9d24f69a6
.gitmodules (vendored): 3 additions
@@ -4,3 +4,6 @@
 [submodule "examples/notMNIST"]
   path = examples/notMNIST
   url = git@github.com:mdibaiee/notMNIST
+[submodule "examples/blogs-corpus"]
+  path = examples/blogs-corpus
+  url = git@github.com:mdibaiee/blogs-corpus
examples/blogs-corpus (new submodule): 1 addition
@@ -0,0 +1 @@
+Subproject commit f242d6e602cacf41cd662bfca57501638bcf7c2a
@@ -12,27 +12,45 @@ module Main where
   import Data.Function (on)
   import Numeric.LinearAlgebra
+  import System.IO
+  import System.Directory
   import Data.List.Split
+  import Control.Exception (evaluate)
   import Debug.Trace
+  import Data.Char
+  import System.Random
+
+  rf :: FilePath -> IO String
+  rf p = do
+    hs <- openFile p ReadMode
+    hSetEncoding hs latin1
+    content <- evaluate =<< hGetContents hs
+    length content `seq` hClose hs
+    return content

   main = do
+    setStdGen (mkStdGen 100)
     sws <- lines <$> readFile "examples/stopwords"
     {-ds <- do
       content <- readFile "examples/doc-classifier-data/data-reuters"
       let splitted = splitOn (replicate 10 '-' ++ "\n") content
           d = concatMap (tail . lines) (take 100 splitted)
       return $ removeWords sws d-}
     --let ds = ["I like deep learning", "I like NLP", "I enjoy flying"]

+    -- real data, takes a lot of time to train
+    {-ds <- do-}
+      {-files <- filter ((/= "xml") . take 3 . reverse) <$> listDirectory "examples/blogs-corpus/"-}
+      {-contents <- mapM (rf . ("examples/blogs-corpus/" ++)) files-}
+
+      {-let texts = map (unwords . splitOn " ") contents-}
+      {-let tags = ["<Blog>", "</Blog>", "<date>", "</date>", "<post>", "</post>", " "]-}
+      {-return $ map cleanText $ removeWords (sws ++ tags) texts-}
+
     let ds = ["the king loves the queen", "the queen loves the king",
               "the dwarf hates the king", "the queen hates the dwarf",
               "the dwarf poisons the king", "the dwarf poisons the queen"]

-    let session = def { learningRate = 5e-2
+    let session = def { learningRate = 1e-1
                       , batchSize = 1
-                      , epochs = 100
+                      , epochs = 200
                       , debug = True
                       } :: Session
         w2v = def { docs = ds
-                  , dimensions = 50
+                  , dimensions = 25
                   , method = SkipGram
                   , window = 2
                   } :: Word2Vec
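For orientation, method = SkipGram with window = 2 means each word is paired with the neighbours up to two positions to its left and right, and dimensions = 25 is the width of the hidden (embedding) layer. A small standalone sketch, not part of this commit and using only the Prelude, of the (centre, context) pairs that window yields for the first toy sentence:

-- Standalone sketch (not from this commit): enumerate the (centre, context)
-- pairs a skip-gram model with the given window size trains on.
module Main where

skipGramPairs :: Int -> String -> [(String, String)]
skipGramPairs win sentence =
  [ (ws !! i, ws !! j)                                  -- (centre word, context word)
  | i <- [0 .. n - 1]
  , j <- [max 0 (i - win) .. min (n - 1) (i + win)]
  , j /= i
  ]
  where
    ws = words sentence
    n  = length ws

main :: IO ()
main = mapM_ print (skipGramPairs 2 "the king loves the queen")
-- ("the","king"), ("the","loves"), ("king","the"), ("king","loves"), ("king","the"), ...

Under this counting each five-word sentence contributes 14 pairs, so the six toy sentences stay cheap to train even at 200 epochs; the library's own pair construction may differ in detail.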
@@ -49,10 +67,25 @@ module Main where

     return ()

+  cleanText :: String -> String
+  cleanText string =
+    let notag = unwords $ filter ((/= "<date>") . take 6) (words string)
+        ws = unwords $ filter (`notElem` ["urlLink"]) (words notag)
+        spacify = foldl (\acc x -> replace x ' ' acc) (trim ws) [',', '/', '-', '\n', '\r', '?', '.', '(', ')', '%', '$', '"', ';', ':', '!', '\'']
+        nonumber = filter (not . isNumber) spacify
+        lower = map toLower nonumber
+    in unwords . words $ lower
+    where
+      trim = f . f
+        where
+          f = reverse . dropWhile isSpace
+      replace needle replacement =
+        map (\c -> if c == needle then replacement else c)
+
   removeWords :: [String] -> [String] -> [String]
   removeWords ws documents =
-    map (rm ws) documents
+    map rm documents
     where
-      rm list text =
-        unwords $ filter (`notElem` list) (words text)
+      rm text =
+        unwords $ filter (`notElem` ws) (words text)

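As a rough usage illustration of the new helpers (a sketch only, assuming cleanText and removeWords above are in scope, e.g. loaded into GHCi together with their imports; the input string is made up): the corpus pipeline first strips stop words and corpus tags with removeWords, then cleanText drops urlLink markers, punctuation and digits and lowercases what is left.

-- Usage sketch (hypothetical input; assumes cleanText and removeWords from the
-- example above are in scope). Mirrors: map cleanText $ removeWords (sws ++ tags) texts
demo :: [String]
demo =
  map cleanText $
    removeWords (["i", "my", "a"] ++ tags)
                ["<post> urlLink My 1st POST, about NLP! </post>"]
  where
    tags = ["<Blog>", "</Blog>", "<date>", "</date>", "<post>", "</post>", " "]
-- demo == ["my st post about nlp"]

Because removeWords runs before cleanText lowercases the text, a capitalised stop word such as "My" passes through the filter, as the expected value shows.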
@@ -16,6 +16,7 @@ cabal-version: >=1.10
 library
   hs-source-dirs: src
   exposed-modules: Sibe, Sibe.NaiveBayes, Sibe.NLP, Sibe.Word2Vec, Sibe.Utils
   ghc-options: -threaded -rtsopts -with-rtsopts=-N
   build-depends: base >= 4.7 && < 5
                , hmatrix
+               , random

@@ -53,6 +54,8 @@ executable example-word2vec
                , data-default-class
                , split
                , vector
+               , directory
+               , random
   default-language: Haskell2010

 executable example-424
@@ -26,6 +26,7 @@ module Sibe
     sigmoid',
     softmax,
     softmax',
+    sampledSoftmax,
     relu,
     relu',
     crossEntropy,

@@ -181,6 +182,12 @@ module Sibe
     where
       sig x = 1 / max (1 + exp (-x)) 1e-10

+  -- used for negative sampling
+  sampledSoftmax :: Int -> Vector Double -> Vector Double
+  sampledSoftmax n x = cmap (\a -> exp a / s) x
+    where
+      s = V.sum . exp $ V.take n x
+
   relu :: Vector Double -> Vector Double
   relu = cmap (max 0.1)

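sampledSoftmax exponentiates every component but normalises only by the sum over the first n components, the cheap partial normaliser used for negative sampling. A standalone sketch, not from this commit, that mirrors the definition above and shows the effect numerically:

-- Standalone sketch mirroring sampledSoftmax above (not part of this commit).
import Numeric.LinearAlgebra
import qualified Data.Vector.Storable as V

sampledSoftmaxSketch :: Int -> Vector Double -> Vector Double
sampledSoftmaxSketch n x = cmap (\a -> exp a / s) x
  where
    -- normaliser over the first n logits only (presumably the sampled words)
    s = V.sum . exp $ V.take n x

main :: IO ()
main = do
  let logits = vector [1, 2, 3, 4]
  print (sampledSoftmaxSketch 2 logits)
  -- the first n = 2 outputs form a proper distribution (sum ~ 1.0) ...
  print (V.sum (V.take 2 (sampledSoftmaxSketch 2 logits)))
  -- ... while outputs past n are exp a / s and can exceed 1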
@@ -14,6 +14,7 @@ module Sibe.Word2Vec
   import Data.Default.Class
   import Data.Function (on)
   import Control.Monad
+  import System.Random

   data W2VMethod = SkipGram | CBOW
   data Word2Vec = Word2Vec { docs :: [String]

@@ -27,8 +28,9 @@ module Sibe.Word2Vec
                            }

   word2vec w2v session = do
+    seed <- newStdGen
     let s = session { training = trainingData
-                    , network = randomNetwork 0 (-1, 1) v [(dimensions w2v, (id, one))] (v, (softmax, one))
+                    , network = randomNetwork 0 (-1, 1) v [(dimensions w2v, (id, one))] (v, (softmax, crossEntropy'))
                     }

     when (debug s) $ do
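For context on the changed network line (a reading of the call as it is used above, not code from this commit): randomNetwork appears to take a seed, a weight range, the input size, a list of hidden layers as (size, (activation, derivative)) pairs, and the output layer, so word2vec builds a vocabulary -> dimensions w2v -> vocabulary network; the commit swaps the term paired with the softmax output from one to crossEntropy'. A hedged sketch, assuming the Sibe names used above are in scope (exampleNetwork is a hypothetical helper):

-- Hedged sketch (exampleNetwork is hypothetical, not part of Sibe or this commit;
-- it restates the call above with the shapes spelled out).
-- v    : vocabulary size (one-hot input, softmax output over the vocabulary)
-- dims : dimensions w2v, the width of the id projection layer whose learned
--        weights serve as the word vectors
exampleNetwork v dims =
  randomNetwork 0 (-1, 1) v [(dims, (id, one))] (v, (softmax, crossEntropy'))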