89 lines
3.0 KiB
Haskell
89 lines
3.0 KiB
Haskell
{-# LANGUAGE RecordWildCards #-}
|
|
{-# LANGUAGE FlexibleContexts #-}
|
|
{-# LANGUAGE ScopedTypeVariables #-}
|
|
|
|
module Main where
|
|
import Numeric.Sibe
|
|
import Numeric.Sibe.Word2Vec
|
|
import Numeric.Sibe.Utils
|
|
import Data.Default.Class
|
|
import qualified Data.Vector.Storable as V
|
|
import Data.List (sortBy)
|
|
import Data.Function (on)
|
|
import Numeric.LinearAlgebra
|
|
import System.IO
|
|
import System.Directory
|
|
import Data.List.Split
|
|
import Control.Exception (evaluate)
|
|
import Debug.Trace
|
|
import Data.Char
|
|
import System.Random
|
|
|
|
rf :: FilePath -> IO String
|
|
rf p = do
|
|
hs <- openFile p ReadMode
|
|
hSetEncoding hs latin1
|
|
content <- evaluate =<< hGetContents hs
|
|
length content `seq` hClose hs
|
|
return content
|
|
|
|
main = do
|
|
setStdGen (mkStdGen 100)
|
|
sws <- lines <$> readFile "examples/stopwords"
|
|
|
|
-- real data, takes a lot of time to train
|
|
{-ds <- do-}
|
|
{-files <- filter ((/= "xml") . take 1 . reverse) <$> listDirectory "examples/blogs-corpus/"-}
|
|
{-contents <- mapM (rf . ("examples/blogs-corpus/" ++)) files-}
|
|
|
|
{-let texts = map (unwords . splitOn " ") contents-}
|
|
{-let tags = ["<Blog>", "</Blog>", "<date>", "</date>", "<post>", "</post>", " "]-}
|
|
{-return $ map cleanText $ removeWords (sws ++ tags) texts-}
|
|
|
|
let ds = ["the king loves the queen", "the queen loves the king",
|
|
"the dwarf hates the king", "the queen hates the dwarf",
|
|
"the dwarf poisons the king", "the dwarf poisons the queen",
|
|
"the man loves the woman", "the woman loves the man",
|
|
"the thief hates the man", "the woman hates the thief",
|
|
"the thief robs the man", "the thief robs the woman"]
|
|
|
|
let session = def { learningRate = 0.1
|
|
, batchSize = 1
|
|
, epochs = 10000
|
|
, debug = True
|
|
} :: Session
|
|
w2v = def { docs = ds
|
|
, dimensions = 30
|
|
, method = SkipGram
|
|
, window = 2
|
|
, w2vDrawChart = True
|
|
, w2vChartName = "w2v.png"
|
|
} :: Word2Vec
|
|
|
|
(computed, vocvec) <- word2vec w2v session
|
|
|
|
return ()
|
|
|
|
cleanText :: String -> String
|
|
cleanText string =
|
|
let notag = unwords $ filter ((/= "<date>") . take 6) (words string)
|
|
ws = unwords $ filter (`notElem` ["urlLink"]) (words notag)
|
|
spacify = foldl (\acc x -> replace x ' ' acc) (trim ws) [',', '/', '-', '\n', '\r', '?', '.', '(', ')', '%', '$', '"', ';', ':', '!', '\'']
|
|
nonumber = filter (not . isNumber) spacify
|
|
lower = map toLower nonumber
|
|
in unwords . words $ lower
|
|
where
|
|
trim = f . f
|
|
where
|
|
f = reverse . dropWhile isSpace
|
|
replace needle replacement =
|
|
map (\c -> if c == needle then replacement else c)
|
|
|
|
removeWords :: [String] -> [String] -> [String]
|
|
removeWords ws documents =
|
|
map rm documents
|
|
where
|
|
rm text =
|
|
unwords $ filter (`notElem` ws) (words text)
|
|
|