perf(word2vec): better word2vec
This commit is contained in:
parent
313e120f25
commit
f16cc26798
@ -16,17 +16,17 @@ module Main where
|
||||
|
||||
main = do
|
||||
sws <- lines <$> readFile "examples/stopwords"
|
||||
{-ds <- do-}
|
||||
{-content <- readFile "examples/doc-classifier-data/data-reuters"-}
|
||||
{-let splitted = splitOn (replicate 10 '-' ++ "\n") content-}
|
||||
{-d = concatMap (tail . lines) (take 100 splitted)-}
|
||||
{-return $ removeWords sws d-}
|
||||
{-ds <- do
|
||||
content <- readFile "examples/doc-classifier-data/data-reuters"
|
||||
let splitted = splitOn (replicate 10 '-' ++ "\n") content
|
||||
d = concatMap (tail . lines) (take 100 splitted)
|
||||
return $ removeWords sws d-}
|
||||
--let ds = ["I like deep learning", "I like NLP", "I enjoy flying"]
|
||||
let ds = ["the king loves the queen", "the queen loves the king",
|
||||
"the dwarf hates the king", "the queen hates the dwarf",
|
||||
"the dwarf poisons the king", "the dwarf poisons the queen"]
|
||||
|
||||
let session = def { learningRate = 0.1
|
||||
let session = def { learningRate = 5e-2
|
||||
, batchSize = 1
|
||||
, epochs = 100
|
||||
, debug = True
|
||||
|
@ -1,7 +1,8 @@
|
||||
module Sibe.Utils
|
||||
(similarity,
|
||||
ordNub,
|
||||
onehot
|
||||
( similarity
|
||||
, ordNub
|
||||
, onehot
|
||||
, average
|
||||
) where
|
||||
import qualified Data.Vector.Storable as V
|
||||
import qualified Data.Set as Set
|
||||
@ -22,3 +23,6 @@ module Sibe.Utils
|
||||
go _ [] = []
|
||||
go s (x:xs) = if x `Set.member` s then go s xs
|
||||
else x : go (Set.insert x s) xs
|
||||
|
||||
average :: Vector Double -> Vector Double
|
||||
average v = cmap (/ (V.sum v)) v
|
||||
|
@ -75,11 +75,14 @@ module Sibe.Word2Vec
|
||||
| i == length vocvec - 1 = before
|
||||
| otherwise = before ++ after
|
||||
vectorized = map (\w -> snd . fromJust $ find ((== w) . fst) vocvec) ns
|
||||
new = cmap (max 1) $ foldl1 (+) vectorized
|
||||
new = foldl1 (+) vectorized
|
||||
in
|
||||
if length wds <= 1
|
||||
then []
|
||||
else
|
||||
case method w2v of
|
||||
SkipGram -> zip (repeat v) vectorized
|
||||
CBOW -> zip vectorized (repeat v)
|
||||
SkipGram -> [(v, average new)]
|
||||
CBOW -> [(average new, v)]
|
||||
_ -> error "unsupported word2vec method"
|
||||
|
||||
cleanText :: String -> String
|
||||
|
Loading…
Reference in New Issue
Block a user