diff --git a/examples/sin.hs b/examples/sin.hs deleted file mode 100644 index 8dc2616..0000000 --- a/examples/sin.hs +++ /dev/null @@ -1,35 +0,0 @@ -module Main where - import Sibe - import Numeric.LinearAlgebra - import Data.List - import Debug.Trace - - main = do - let alpha = 0.5 - epochs = 1000 - a = (sigmoid, sigmoid') - lo = (sigmoid, (\_ -> 1)) -- cross entropy - -- a = (relu, relu') - rnetwork = randomNetwork 0 (-1, 1) 1 [(50, a)] (1, lo) - - inputs = map (\a -> vector [a]) (reverse [0, 30, 45, 60, 90]) - labels = map (\deg -> vector $ [sin $ deg * pi/180]) (reverse [0, 30, 45, 60, 90]) - - initial_cost = zipWith crossEntropy (map (`forward` rnetwork) inputs) labels - - network <- run session inputs rnetwork labels alpha epochs - - let results = map (`forward` network) inputs - rounded = map (map round . toList) results - - cost = zipWith crossEntropy (map (`forward` network) inputs) labels - - putStrLn "parameters: " - putStrLn $ "- inputs: " ++ show inputs - putStrLn $ "- labels: " ++ show labels - putStrLn $ "- learning rate: " ++ show alpha - putStrLn $ "- epochs: " ++ show epochs - {-putStrLn $ "- initial cost (cross-entropy): " ++ show initial_cost-} - putStrLn "results: " - putStrLn $ "- actual result: " ++ show results - {-putStrLn $ "- cost (cross-entropy): " ++ show cost-} diff --git a/examples/word2vec.hs b/examples/word2vec.hs index 99c92b5..f0b45e5 100644 --- a/examples/word2vec.hs +++ b/examples/word2vec.hs @@ -15,20 +15,43 @@ module Main where import Data.List.Split main = do - {-ds <- do - content <- readFile "examples/doc-classifier-data/data-reuters" - let splitted = splitOn (replicate 10 '-' ++ "\n") content - d = concatMap (tail . 
lines) (take 100 splitted) - return d-} - let ds = ["I like deep learning", "I like NLP", "I enjoy flying"] + sws <- lines <$> readFile "examples/stopwords" + {-ds <- do-} + {-content <- readFile "examples/doc-classifier-data/data-reuters"-} + {-let splitted = splitOn (replicate 10 '-' ++ "\n") content-} + {-d = concatMap (tail . lines) (take 100 splitted)-} + {-return $ removeWords sws d-} + --let ds = ["I like deep learning", "I like NLP", "I enjoy flying"] + let ds = ["the king loves the queen", "the queen loves the king", + "the dwarf hates the king", "the queen hates the dwarf", + "the dwarf poisons the king", "the dwarf poisons the queen"] - let session = def { learningRate = 0.8 - , batchSize = 10 - , epochs = 1000 + let session = def { learningRate = 0.1 + , batchSize = 16 + , epochs = 100 } :: Session - w2v = def { docs = ds }:: Word2Vec + w2v = def { docs = ds + , dimensions = 50 + , method = SkipGram + , window = 3 + } :: Word2Vec - r <- word2vec w2v session - {-print r-} + (computed, vocvec) <- word2vec w2v session + + mapM_ (\(w, v) -> do + putStr $ w ++ ": " + let similarities = map (similarity v . snd) computed + let sorted = sortBy (compare `on` similarity v . snd) computed + print . take 2 . drop 1 . 
reverse $ map fst sorted + ) computed + return () + + removeWords :: [String] -> [String] -> [String] + removeWords ws documents = + map (rm ws) documents + where + rm list text = + unwords $ filter (`notElem` list) (words text) + diff --git a/log b/log new file mode 100644 index 0000000..89da90f --- /dev/null +++ b/log @@ -0,0 +1,1012 @@ +economist: ["nar","competitive"] +crumbling: ["accompanied","plan"] +accompanied: ["crumbling","rumor"] +dramatic: ["prepared","account"] +reversal: ["account","dramatic"] +account: ["reversal","dramatic"] +brazils: ["sufficient","surpluses"] +surpluses: ["sufficient","brazils"] +sufficient: ["brazils","surpluses"] +needed: ["billion","switched"] +debt: ["earners","crisis"] +target: ["averaging","monthly"] +repeat: ["monthly","averaging"] +monthly: ["repeat","averaging"] +averaging: ["target","monthly"] +exports: ["diverted","rises"] +diverted: ["exclusive","exports"] +increased: ["avoid","surplus"] +avoid: ["increased","shortages"] +domestic: ["avoid","gross"] +surplus: ["avoid","satisfactory"] +plunged: ["averaged","october"] +october: ["plunged","averaged"] +averaged: ["plunged","october"] +zealands: ["official","another"] +official: ["another","zealands"] +fell: ["reserves","nz"] +nz: ["fell","billion"] +statistical: ["commerice","period"] +bulletin: ["optimism","optimistic"] +panama: ["canal","commission"] +canal: ["panama","backlog"] +government: ["caused","policies"] +agency: ["daily","operations"] +daily: ["agency","operations"] +backlog: ["harris","enter"] +ships: ["load","enter"] +waiting: ["load","enter"] +enter: ["waiting","load"] +days: ["cut","over"] +due:: ["helped","tuesday"] +transit:: ["evening","postponed"] +backlog:: ["end","day"] +super: ["tankers","vessels"] +tankers: ["super","regular"] +regular: ["tankers","declared"] +vessels: ["tankers","petrochemical"] +end:: ["hrs","south"] +hrs: ["srw","end:"] +<america: ["federally","guaranteed"] +first: ["handler","determining"] +federally: ["guaranteed","<america"] 
+guaranteed: ["federally","<america"] +mortgage: ["two>","special"] +fund: ["two>","mortgage"] +two>: ["making","mortgage"] +exchangeable: ["unit","includes"] +income: ["treatment","cts"] +am: ["reporting","international"] +reporting: ["am","bdm"] +balance: ["remain","contribution"] +remain: ["balance","contribution"] +orders: ["graphics","harris"] +harris: ["graphics","backlog"] +graphics: ["harris","orders"] +corresponding: ["cars","periods"] +annualized: ["running","basis"] +running: ["annualized","basis"] +division: ["vicon","cultivators"] +introductions: ["enable","benefit"] +dollar: ["half","second"] +research: ["engineering","associates"] +development: ["engineering","expenditures"] +engineering: ["expenditures","development"] +expenditures: ["engineering","development"] +allocated: ["sellers","reasonable"] +revenues: ["properties","pretax"] +guarantees: ["credit","guarantee"] +earmarked: ["switched","edible"] +edible: ["beans","earmarked"] +beans: ["edible","creates"] +honduras: ["beans","edible"] +switched: ["earmarked","white"] +action: ["reduces","coverage"] +reduces: ["action","coverage"] +coverage: ["action","reduces"] +creates: ["beans","edible"] +all: ["star","guarantee"] +guarantee: ["credit","all"] +shipped: ["september","registered"] +gulf: ["technologies","applied"] +barge: ["freight","firmed"] +freight: ["barge","firmed"] +firmed: ["barge","freight"] +steady: ["vessel","increasing"] +vessel: ["steady","increasing"] +loadings: ["mill","vessel"] +increasing: ["steady","vessel"] +demand: ["dealers","grows"] +barges: ["no","supply"] +supply: ["excess","dealers"] +dealers: ["supply","ships"] +no: ["barges","st"] +traded: ["no","yesterdays"] +st: ["louis","no"] +louis: ["merchants","st"] +merchants: ["louis","st"] +exchange: ["filing","pan"] +session: ["readdress","eight"] +yesterday: ["elevating","chief"] +quotes: ["delivered","included"] +illinois: ["ex","river"] +river: ["illinois","ex"] +joliet: ["chicago","delivery"] +tariff: ["ex","chicago"] 
+ex: ["chicago","illinois"] +quoted: ["philippine","figures"] +percentage: ["multiple","constitute"] +points: ["percentage","quoted"] +next: ["owensboro","cairo"] +mississippi: ["nov","lower"] +owensboro: ["next","yesterdays"] +on: ["station","joliet"] +station: ["on","illinois"] +comparison: ["offered","tariff"] +yesterdays: ["owensboro","traded"] +memphis: ["nov","cairo"] +cairo: ["memphis","next"] +nov: ["mississippi","memphis"] +applied: ["technologies","gulf"] +technologies: ["applied","gulf"] +pipeline: ["subsidiaries","engaged"] +federal: ["owed","atico"] +taxes: ["owed","federal"] +owed: ["taxes","federal"] +transaction: ["<union","owed"] +offset: ["carryovers","carryforwards"] +carryovers: ["offset","carryforwards"] +latest: ["diamond","mths"] +virginia: ["power","territory"] +territory: ["dominion","virginia"] +dominion: ["territory","resources"] +resources: ["dominion","territory"] +<d>: ["improvement","<woodco"] +cofab: ["specialized","manufacture"] +<gulfex: ["acquired","<woodco"] +based: ["fabricator","alberta"] +fabricator: ["custom","houston"] +custom: ["fabricator","process"] +pressure: ["custom","fabricator"] +process: ["fabricator","custom"] +energy: ["petrochemical","universitys"] +petrochemical: ["marine","pulp"] +manufacture: ["companies","specialized"] +specialized: ["cofab","manufacture"] +cooling: ["lubricating","gas"] +lubricating: ["cooling","systems"] +oil: ["regain","manager"] +gas: ["properties","bcf"] +utility: ["pulp","paper"] +pulp: ["paper","utility"] +paper: ["pulp","utility"] +marine: ["pulp","petrochemical"] +amounts: ["qtr","yr"] +atico: ["peninsula","savings"] +peninsula: ["atico","savings"] +savings: ["atico","peninsula"] +pan: ["america","recorded"] +america: ["pan","banks"] +banks: ["purusant","ncnb"] +ncnb: ["purusant","banks"] +purusant: ["ncnb","banks"] +merger: ["wholly","purusant"] +modify: ["exporters","proposal"] +presenting: ["importers","delegates"] +importers: ["presenting","delegates"] +discussed: 
["tonight","change"] +tonight: ["informally","change"] +informally: ["tonight","formally"] +producers: ["formal","member"] +talks: ["sides","formal"] +formal: ["producers","talks"] +eight: ["member","session"] +member: ["eight","splinter"] +splinter: ["member","eight"] +affect: ["distribution","commercial"] +proposed: ["out","shortfall"] +out: ["proposed","shortfall"] +declarations: ["nar","delegates"] +primary: ["diluted","pesos"] +pesos: ["primary","diluted"] +diluted: ["primary","pesos"] +philippine: ["distance","quoted"] +distance: ["telephone","philippine"] +telephone: ["distance","philippine"] +declaration: ["fluctuate","day"] +funds: ["commencement","period"] +commencement: ["funds","period"] +novebmer: ["extended","december"] +bdm: ["full","reporting"] +secondary: ["canron","recorded"] +offering: ["canron","actively"] +canron: ["offering","secondary"] +affiliated: ["miami","group"] +investment: ["questioned","purposes"] +firms: ["lowered","led"] +lowered: ["firms","cyclops"] +stake: ["ceding","disclosed"] +cyclops: ["lowered","wrather"] +filing: ["securities","commission"] +securities: ["filing","eckenfelder"] +led: ["mutual","firms"] +mutual: ["led","robeson"] +feb: ["jan","cyclops"] +agricultural: ["stabilization","consultant"] +stabilization: ["agricultural","consultant"] +conservation: ["land","ascs"] +ascs: ["established","values"] +established: ["ascs","values"] +values: ["ascs","established"] +redemption: ["ccc","commodity"] +bushel: ["priced","dollars"] +priced: ["bushel","dollars"] +yellow: ["only","grade"] +grade: ["sweet","yellow"] +only: ["cwt","yellow"] +hrw: ["srw","durum"] +srw: ["hrw","hrs"] +sww: ["srw","hrs"] +durum: ["hrw","srw"] +ill: ["track","editor"] +track: ["ill","equipment"] +toledo: ["out","share"] +peoria: ["issue","mlotok"] +denver: ["analysis","effective"] +evansville: ["paper","pulp"] +cincinnati: ["seeking","they"] +minneapolis: ["specialized","policy"] +baltimore: ["management","ohio"] +norf: ["phil","house"] +phil: 
["norf","surprise"] +kansas: ["city","realistic"] +city: ["kansas","cambridge"] +amarillo: ["dividend","fluctuate"] +lubbock: ["texas","locations"] +lou: ["tex","increasing"] +portland: ["load","bancorp"] +seattle: ["higher","mln"] +stockton: ["eckenfelder","bache"] +la: ["meant","unusually"] +duluth: ["<emr>","problems"] +brly: ["rye","sorg"] +rye: ["brly","soyb"] +soyb: ["rye","brly"] +sorg: ["brly","soyb"] +evnsvlle: ["optimistic","emergency"] +cinci: ["fetched","complained"] +mpls: ["increased","noting"] +balt: ["nor","stake"] +nor: ["balt","volume"] +kc: ["united","stake"] +lo: ["merchants","louis"] +amarlo: ["faces","excess"] +lubbck: ["years","valuable"] +port: ["crop","planting"] +prevailing: ["adjusted","announced"] +location: ["mkt","strict"] +strict: ["middling","low"] +low: ["strict","combining"] +middling: ["strict","inch"] +inch: ["slm","strict"] +upland: ["cotton","determined"] +cotton: ["upland","handler"] +midnight: ["stockholders","expires"] +locations: ["lubbock","qualities"] +texas: ["lubbock","sour"] +qualities: ["locations","lubbock"] +determining: ["first","handler"] +handler: ["first","determining"] +certificate: ["preparing","handler"] +determined: ["upland","up"] +follows: ["allocations","upland"] +northern: ["european","upland"] +european: ["northern","small"] +adjustments: ["explain","closing"] +mkt: ["location","spot"] +slm: ["inch","determining"] +sum: ["closing","explain"] +began: ["ends","december"] +allocations: ["follows","quota"] +raw: ["totaled","short"] +barbados: ["belize","malawi"] +belize: ["barbados","fiji"] +bolivia: ["fiji","mozambique"] +colombia: ["half","special"] +congo: ["barbados","fiji"] +costa: ["rica","paraguay"] +rica: ["costa","guyana"] +ivory: ["coast","guyana"] +coast: ["ivory","guyana"] +dom: ["rep","factual"] +rep: ["dom","cera"] +ecuador: ["haiti","mozambique"] +el: ["salvador","zimbabwe"] +salvador: ["el","jamaica"] +fiji: ["bolivia","philippines"] +gabon: ["mozambique","philippines"] +guatemala: 
["belize","haiti"] +guyana: ["malawi","madagascar"] +haiti: ["ecuador","fiji"] +india: ["jamaica","bolivia"] +jamaica: ["belize","india"] +madagascar: ["guyana","malawi"] +malawi: ["barbados","guyana"] +mauritius: ["guyana","fiji"] +mexico: ["considerations","delay"] +mozambique: ["gabon","bolivia"] +papua: ["guinea","nil"] +guinea: ["papua","nil"] +paraguay: ["bolivia","peru"] +peru: ["paraguay","mozambique"] +philippines: ["fiji","gabon"] +stchristopher: ["there","factual"] +nevis: ["india","jamaica"] +swaziland: ["barbados","congo"] +taiwan: ["barbados","belize"] +thailand: ["mozambique","bolivia"] +trinidad: ["tobago","haiti"] +tobago: ["trinidad","taiwan"] +zimbabwe: ["bolivia","india"] +loading: ["there","doubts"] +load: ["waiting","ships"] +express: ["closed","comment"] +remained: ["silent","highly"] +silent: ["remained","believes"] +rumors: ["comment","variation"] +spinoff: ["speculated","some"] +shearson: ["elevating","partially"] +lehman: ["salomon","brothers"] +brothers: ["salomon","lehman"] +move: ["unhappy","considering"] +unhappy: ["move","considering"] +lift: ["closed","believes"] +rumor: ["accompanied","lift"] +partially: ["public","calculated"] +public: ["partially","calculated"] +command: ["components","partially"] +value: ["boosting","variation"] +boosting: ["value","variation"] +talk: ["administrations","formally"] +financial: ["stability","suffield"] +services: ["boost","split"] +boost: ["services","split"] +closed: ["express","lift"] +heavy: ["volume","cera"] +volume: ["heavy","nor"] +comment: ["rumors","highly"] +activity: ["slowdown","slight"] +comments: ["seek","brothers"] +tuesday: ["helped","fuel"] +helped: ["tuesday","fuel"] +fuel: ["helped","tuesday"] +changes: ["fundamental","management"] +at: ["toy","meeting"] +undervalued: ["fully","reflect"] +fully: ["undervalued","reflect"] +reflect: ["undervalued","fully"] +elevating: ["chief","yesterday"] +chief: ["elevating","yesterday"] +officer: ["jeffery","lane"] +jeffery: ["officer","lane"] 
+lane: ["officer","jeffery"] +position: ["preserve","required"] +vacant: ["created","president"] +created: ["vacant","brian"] +positions: ["chairmen","divisions"] +chairmen: ["divisions","positions"] +divisions: ["chairmen","positions"] +speculated: ["partial","spinoff"] +partial: ["speculated","believes"] +sense: ["contrary","variation"] +contrary: ["variation","sense"] +variation: ["contrary","sense"] +some: ["several","spinoff"] +however: ["disagreed","up"] +disagreed: ["however","speculated"] +center: ["environment","profit"] +contributing: ["believes","express"] +highly: ["believes","i"] +sell: ["reason","alberta"] +perrin: ["lipper","analytical"] +lipper: ["perrin","analytical"] +analytical: ["lipper","perrin"] +questioned: ["better","travel"] +better: ["questioned","travel"] +profitable: ["fetched","bache"] +several: ["complained","some"] +reason: ["sell","strong"] +asset: ["organisation","economic"] +considered: ["spinning","option"] +option: ["spinning","considered"] +spinning: ["option","considered"] +suggests: ["budget","selling"] +larry: ["prudential","eckenfelder"] +eckenfelder: ["prudential","bache"] +prudential: ["eckenfelder","bache"] +bache: ["eckenfelder","prudential"] +believes: ["partial","highly"] +past: ["adapting","speed"] +fetched: ["profitable","shearson"] +big: ["shearsons","premium"] +premium: ["big","shearsons"] +place: ["premium","shearsons"] +shearsons: ["big","premium"] +book: ["premium","big"] +worth: ["bilion","exhibits"] +bilion: ["worth","barrels"] +capitalization: ["larry","prudential"] +plans: ["globally","expand"] +expand: ["globally","plans"] +globally: ["expand","plans"] +they: ["thought","enormous"] +enormous: ["internal","takes"] +internal: ["enormous","takes"] +takes: ["enormous","internal"] +you: ["valuations","realistic"] +realistic: ["you","reflect"] +valuations: ["you","fully"] +enhance: ["kinds","endeavors"] +ability: ["kinds","endeavors"] +kinds: ["endeavors","enhance"] +endeavors: ["kinds","enhance"] +road: 
["ef","hutton"] +ef: ["hutton","road"] +hutton: ["ef","road"] +michael: ["ef","hutton"] +lewis: ["that","preclude"] +theyve: ["investing","outlined"] +outlined: ["fact","theyre"] +fact: ["theyre","outlined"] +theyre: ["fact","outlined"] +investing: ["theyve","fact"] +heavily: ["weigh","fact"] +arena: ["that","preclude"] +that: ["preclude","arena"] +preclude: ["that","arena"] +divestitures: ["way","nar"] +way: ["divestitures","paul"] +reduced: ["louisiana","marathon"] +exposure: ["highly","true"] +brokerage: ["components","command"] +travel: ["better","reflect"] +true: ["water","find"] +water: ["true","mark"] +mark: ["lesser","water"] +lesser: ["mark","water"] +components: ["brokerage","command"] +multiple: ["constitute","percentage"] +constitute: ["multiple","percentage"] +contributed: ["after","nars"] +after: ["contributed","lewis"] +liberty: ["star","all"] +star: ["all","liberty"] +payout: ["beverage","special"] +fluctuate: ["declaration","fund"] +feed: ["prices","produce"] +chinese: ["reduce","hog"] +reduce: ["chinese","herd"] +hog: ["reduce","numbers"] +herd: ["reduce","hog"] +pork: ["reduce","herd"] +production: ["world","excesses"] +world: ["account","production"] +numbers: ["start","hog"] +start: ["numbers","hog"] +head: ["deposits","loans"] +projections: ["profitability","assets"] +projected: ["handel","morton"] +fall: ["sharp","projected"] +ivaco: ["steel","coleco"] +steel: ["marshall","ivaco"] +canadian: ["non","key"] +result: ["ceding","noncash"] +minimal: ["rising","noting"] +subsequent: ["substantial","quarters"] +quarters: ["properties","subsequent"] +substantial: ["subsequent","quarters"] +reach: ["levels","humidity"] +continuing: ["high","restored"] +carloadings: ["totaled","grain"] +cars: ["arrivals","corresponding"] +railroads: ["association","ago"] +mill: ["totalled","loadings"] +totalled: ["mill","delivery"] +industrial: ["brierley","minister"] +pacific: ["equity","industrial"] +hong: ["kong","raised"] +kong: ["<hong","hong"] +raised: 
["questionmark","hong"] +wrather: ["disclosed","cyclops"] +principally: ["brierley","equity"] +brierley: ["principally","industrial"] +investments: ["thomas","publicly"] +publicly: ["zealand","held"] +zealand: ["publicly","guinea"] +bought: ["robeson","ca"] +when: ["disclosed","wrather"] +disclosed: ["when","wrather"] +purposes: ["investment","month"] +dls: ["vs","profit"] +vs: ["dls","profit"] +diamond: ["shamrock","companies"] +shamrock: ["diamond","<chem>"] +cut: ["days","contract"] +crude: ["postings","changed"] +barrel: ["copany","opening"] +reduction: ["brings","falling"] +brings: ["marathons","posted"] +posted: ["brings","marathons"] +intermediate: ["sour","texas"] +copany: ["barrel","intermediate"] +weak: ["citing","supply"] +citing: ["weak","markets"] +markets: ["citing","optimism"] +liebert: ["merger","purusant"] +wholly: ["merger","owned"] +emerson: ["receive","wholly"] +electric: ["<emr>","<hong"] +<emr>: ["electric","<hong"] +under: ["shareholder","merger"] +receive: ["emerson","shares"] +coleco: ["carryforwards","ivaco"] +profitability: ["return","enable"] +prepared: ["statement","dramatic"] +swing: ["steep","lane"] +steep: ["swing","divisions"] +cabbage: ["patch","kids"] +patch: ["cabbage","kids"] +kids: ["patch","cabbage"] +changed: ["calender","postings"] +single: ["question","changed"] +diversified: ["organization","coffee"] +enable: ["introductions","return"] +toy: ["fair","at"] +fair: ["toy","president"] +vice: ["senior","president"] +morton: ["handel","projected"] +handel: ["morton","projected"] +reasonable: ["cake","standing"] +subsidiaries: ["engaged","pipeline"] +engaged: ["subsidiaries","pipeline"] +post: ["subject","boost"] +closing: ["adjustments","explain"] +explain: ["adjustments","closing"] +miami: ["affiliated","group"] +fundamental: ["changes","led"] +robeson: ["bought","wrather"] +jan: ["postings","feb"] +study: ["preparing","specific"] +robesons: ["afterwards","recommend"] +afterwards: ["robesons","recommend"] +carl: 
["singer","elected"] +singer: ["carl","elected"] +elected: ["carl","singer"] +shortly: ["accounting","government"] +accounting: ["shortly","office"] +office: ["representatives","budget"] +gao: ["expanded","specific"] +cost: ["certificates","less"] +outlays: ["administration","industry"] +administration: ["consideration","outlays"] +industry: ["outlays","sectors"] +sources: ["consideration","administration"] +analysis: ["spriggs","petroleum"] +budget: ["office","suggests"] +reuters: ["interviewed","confident"] +preparing: ["specific","certificate"] +specific: ["preparing","handler"] +request: ["clear","moscow"] +sen: ["jesse","charles"] +jesse: ["helms","sen"] +helms: ["jesse","sen"] +r: ["grassley","charles"] +nc: ["sen","helms"] +senate: ["committee","yemen"] +committee: ["senate","agriculture"] +focus: ["negative","cost"] +released: ["scheduled","mid"] +source: ["less","if"] +depends: ["if","encourage"] +programs: ["impact","matrix"] +usda: ["urged","eep"] +if: ["encourage","determines"] +determines: ["encourage","if"] +encourage: ["if","determines"] +entries: ["forfeitures","loan"] +forfeitures: ["entries","loan"] +up: ["however","determined"] +caused: ["stockpile","government"] +stockpile: ["caused","allowed"] +decrease: ["brings","marathons"] +less: ["decrease","source"] +expanded: ["gao","slowed"] +negative: ["questionmark","focus"] +fire: ["negative","superior+"] +weigh: ["heavily","increase"] +consultant: ["agricultural","expanded"] +omb: ["expansion","agricultural"] +expansion: ["omb","michel"] +remains: ["firmly","committed"] +firmly: ["remains","committed"] +committed: ["remains","firmly"] +writedowns: ["noncash","unrealized"] +unrealized: ["writedowns","earners"] +losses: ["earners","writedowns"] +ten: ["div","qtly"] +opec: ["optimistic","pact"] +forced: ["meet","advantage"] +meet: ["advantage","forced"] +readdress: ["session","june"] +agreement: ["slide","reiterate"] +slide: ["halt","organization"] +movement: ["easy","higher"] +easy: 
["movement","constitute"] +thought: ["sort","emergency"] +emergency: ["thought","optimistic"] +sort: ["thought","internal"] +problems: ["optimistic","daniel"] +daniel: ["yergin","director"] +yergin: ["director","daniel"] +director: ["yergin","daniel"] +cambridge: ["yergin","daniel"] +associates: ["cera","research"] +cera: ["associates","research"] +problem: ["address","opecs"] +faces: ["excess","excellent"] +excess: ["supply","faces"] +opecs: ["problem","hold"] +issue: ["mlotok","addressed"] +addressed: ["issue","paul"] +paul: ["addressed","mlotok"] +mlotok: ["issue","paul"] +salomon: ["brothers","lehman"] +optimism: ["repeated","markets"] +keep: ["output","reiterate"] +pessimistic: ["address","outlook"] +address: ["pessimistic","optimistic"] +wishes: ["regain","initiative"] +regain: ["wishes","initiative"] +uncertain: ["doubtful","analysts"] +bpd: ["closer","clearly"] +learn: ["buyers","then"] +deemed: ["fixed","quotas"] +differentials: ["fixed","quotas"] +regional: ["minister","manager"] +manager: ["spoke","regional"] +spoke: ["manager","major"] +condition: ["excellent","satisfactory"] +teach: ["lesson","added"] +lesson: ["teach","added"] +added: ["lesson","teach"] +david: ["t","mideast"] +t: ["david","editor"] +mizrahi: ["slackens","t"] +editor: ["t","mideast"] +mideast: ["david","editor"] +optimistic: ["problems","principal"] +principal: ["amount","debenures"] +advantage: ["meet","forced"] +winter: ["clearly","excess"] +slackens: ["mizrahi","t"] +reiterate: ["keep","agreement"] +output: ["keep","reiterate"] +critical: ["months","extended"] +hold: ["pact","ability"] +pact: ["hold","weeks"] +then: ["learn","dillard"] +dillard: ["buyers","then"] +spriggs: ["petroleum","analysis"] +petroleum: ["marathon","spriggs"] +bijan: ["moussavar","rahmani"] +moussavar: ["bijan","rahmani"] +rahmani: ["moussavar","bijan"] +harvard: ["rahmani","universitys"] +universitys: ["bijan","harvard"] +environment: ["policy","center"] +policy: ["environment","center"] +rising: 
["minimal","because"] +prompted: ["excesses","herd"] +excesses: ["prompted","keep"] +clearly: ["closer","winter"] +closer: ["bpd","clearly"] +characterized: ["cheating","thought"] +cheating: ["characterized","thought"] +interview: ["telephone","distance"] +cenergy: ["for","fourth"] +properties: ["quarters","gas"] +following: ["receive","offers"] +calender: ["changed","end"] +noncash: ["writedowns","result"] +barrels: ["bilion","loans"] +cubic: ["feet","bcf"] +feet: ["cubic","bcf"] +bcf: ["feet","gas"] +disappear: ["produce","recover"] +produce: ["disappear","recover"] +recover: ["produce","disappear"] +torchmark: ["regularly","robeson"] +subordinated: ["convertible","debentures"] +debentures: ["redeem","portion"] +as: ["consequence","equipment"] +amount: ["debenures","principal"] +debenures: ["amount","principal"] +redeem: ["globally","plans"] +senior: ["drexel","burnham"] +drexel: ["senior","burnham"] +burnham: ["drexel","another"] +lambert: ["father","lehman"] +father: ["lambert","angeles"] +epsilon: ["acquired","data"] +thomas: ["ca","professor"] +drexels: ["los","angeles"] +los: ["angeles","drexels"] +angeles: ["los","university"] +retired: ["los","drexels"] +university: ["angeles","los"] +professor: ["ca","thomas"] +ca: ["professor","thomas"] +seek: ["control","ceding"] +nova: ["minister","non"] +owns: ["relatives","nine"] +husky: ["relatives","ceding"] +<union: ["transaction","faith"] +faith: ["<union","union"] +holding: ["promexpo","ltd>"] +ltd>: ["<hong","whampoa"] +minister: ["regional","industrial"] +michel: ["cote","faiths"] +cote: ["michel","ruled"] +ruled: ["cote","michel"] +union: ["faith","faiths"] +faiths: ["michel","cote"] +ceding: ["result","husky"] +non: ["ruling","key"] +ruling: ["non","key"] +key: ["non","ruling"] +completing: ["deal","surpluses"] +deal: ["completing","speed"] +equally: ["<hutchison","whampoa"] +<hutchison: ["equally","whampoa"] +whampoa: ["<hutchison","equally"] +<hong: ["ltd>","electric"] +private: ["consumption","slow"] 
+relatives: ["li","nine"] +li: ["relatives","husky"] +ka: ["shing","hutchison"] +shing: ["hutchison","ka"] +hutchison: ["shing","ka"] +imperial: ["commerice","statistical"] +commerice: ["imperial","statistical"] +excludes: ["extraordinary","mths"] +power: ["virginia","dominion"] +suffield: ["shamrock","regulatory"] +coastal: ["me","bancorp"] +bancorp: ["<csbk>","coastal"] +<csbk>: ["bancorp","coastal"] +me: ["coastal","bancorp"] +superintendent: ["maines","banking"] +maines: ["superintendent","banking"] +banking: ["sector","tourism"] +<camco: ["inc>","improvements"] +santa: ["realty","anita"] +anita: ["realty","enterprises"] +realty: ["anita","enterprises"] +enterprises: ["anita","realty"] +marshall: ["drummond","steel"] +drummond: ["mccall","marshall"] +mccall: ["drummond","marshall"] +switzerlands: ["given","combining"] +combining: ["switzerlands","low"] +unemployment: ["leaving","workforce"] +stability: ["external","large"] +large: ["representing","given"] +external: ["stability","given"] +excellent: ["satisfactory","faces"] +satisfactory: ["excellent","surplus"] +organisation: ["economic","cooperation"] +economic: ["organisation","cooperation"] +cooperation: ["organisation","economic"] +oecd: ["balances","picked"] +reflected: ["success","tight"] +success: ["reflected","tight"] +tight: ["reflected","success"] +monetary: ["policies","allowed"] +policies: ["monetary","government"] +picked: ["oecd","<sedio"] +signs: ["pick","slowdown"] +slowdown: ["slight","signs"] +slight: ["slowdown","activity"] +pick: ["signs","slight"] +concern: ["grows","domestic"] +forecast: ["gross","decline"] +gross: ["domestic","decline"] +small: ["consumer","gdpwhich"] +rise: ["consumer","small"] +consumer: ["small","rise"] +sharp: ["fall","years"] +job: ["creation","modest"] +creation: ["job","absorb"] +absorb: ["creation","job"] +modest: ["job","creation"] +workforce: ["leaving","unemployment"] +leaving: ["workforce","unemployment"] +lowest: ["unchanged","industrial"] +nation: 
["area","oecd"] +area: ["nation","oecd"] +assuming: ["francs","swiss"] +swiss: ["assuming","term"] +francs: ["assuming","swiss"] +half: ["dollar","second"] +rises: ["inflation","exports"] +faster: ["grows","vigorous"] +last: ["reflecting","buoyant"] +reflecting: ["last","buoyant"] +buoyant: ["reflecting","last"] +consumption: ["appeared","real"] +meant: ["reflecting","slow"] +contribution: ["gdp","crisis"] +gdp: ["contribution","crisis"] +shrink: ["weakness","relative"] +given: ["switzerlands","external"] +grows: ["concern","domestic"] +gdpwhich: ["contribute","way"] +contribute: ["gdpwhich","way"] +balances: ["oecd","international"] +real: ["unusually","slow"] +appeared: ["consumption","real"] +unusually: ["real","slow"] +relative: ["weakness","noted"] +weakness: ["relative","noted"] +noted: ["relative","weakness"] +slow: ["unusually","real"] +outstrip: ["languished","inflation"] +plant: ["machinery","allow"] +machinery: ["consequence","equipment"] +bright: ["capacity","restored"] +capacity: ["bright","decembers"] +scope: ["rationalisation","modernisation"] +rationalisation: ["scope","modernisation"] +modernisation: ["scope","rationalisation"] +sectors: ["sector","industry"] +consequence: ["machinery","equipment"] +equipment: ["consequence","manufactures"] +decelerate: ["rationalisation","land"] +vigorous: ["enormous","takes"] +questionmark: ["raised","negative"] +tourism: ["sector","banking"] +sector: ["tourism","banking"] +earners: ["losses","debt"] +term: ["appreciation","franc"] +appreciation: ["term","franc"] +franc: ["term","appreciation"] +accelerating: ["deregulation","earners"] +deregulation: ["accelerating","banking"] +lead: ["loss","carryforwards"] +particularly: ["recent","developments"] +recent: ["particularly","developments"] +developments: ["recent","particularly"] +question: ["subsidy","single"] +flexibility: ["adapting","past"] +adapting: ["flexibility","speed"] +speed: ["adapting","past"] +required: ["to","preserve"] +to: ["preserve","required"] 
+preserve: ["competitive","to"] +competitive: ["preserve","to"] +actively: ["considering","offering"] +subsidized: ["reagan","offering"] +soviet: ["counterparts","talks"] +eep: ["request","administrations"] +fired: ["examples","companies"] +aide: ["lyng","richard"] +secretary: ["richard","agriculture"] +richard: ["lyng","secretary"] +lyng: ["richard","aide"] +formally: ["informally","tonight"] +most: ["confident","interviewed"] +interviewed: ["confident","most"] +confident: ["most","interviewed"] +soviets: ["hint","live"] +counterparts: ["talks","formal"] +reagan: ["subsidized","decide"] +decide: ["favor","reagan"] +moscow: ["request","clear"] +sailing: ["i","reintroducing"] +interagency: ["administrations","review"] +an: ["consummated","representing"] +consummated: ["enhancement","overnight"] +overnight: ["consummated","export"] +favor: ["decide","extended"] +china: ["subsidy","crops"] +subsidy: ["china","question"] +representing: ["large","an"] +deliberations: ["nudged","members"] +nudged: ["deliberations","members"] +members: ["nudged","congress"] +congress: ["complained","members"] +number: ["urged","congress"] +urged: ["number","usda"] +lyngs: ["exemption","visit"] +visit: ["lyngs","exemption"] +capitol: ["hill","press"] +hill: ["press","capitol"] +house: ["disarray","hill"] +press: ["hill","capitol"] +administrations: ["interagency","talk"] +comprised: ["subcabinet","asked"] +subcabinet: ["comprised","asked"] +asked: ["subcabinet","comprised"] +clear: ["request","jesse"] +representatives: ["office","trade"] +sides: ["talks","formal"] +consultations: ["calm","described"] +described: ["consultations","calm"] +calm: ["consultations","described"] +basic: ["factual","economics"] +factual: ["economics","basic"] +economics: ["factual","basic"] +another: ["informal","not"] +informal: ["another","not"] +suggestion: ["hint","live"] +hint: ["suggestion","live"] +live: ["hint","suggestion"] +pledge: ["form","buy"] +granted: ["provide","undisclosed"] +consideration: 
["administration","sources"] +groups: ["follow","demands"] +delayed: ["stemming","disarray"] +disarray: ["delayed","stemming"] +stemming: ["delayed","disarray"] +iran: ["arms","affair"] +arms: ["affair","iran"] +affair: ["arms","iran"] +<versatile: ["corp>","uruguay"] +corp>: ["<versatile","explain"] +alberta: ["principle","versatile"] +versatile: ["principle","alberta"] +noble: ["cultivators","vicon"] +cultivators: ["noble","ontario"] +vicon: ["noble","cultivators"] +ontario: ["cultivators","vicon"] +undisclosed: ["granted","shareholder"] +manufactures: ["tillage","spraying"] +tillage: ["manufactures","spraying"] +spraying: ["manufactures","tillage"] +groupe: ["promexpo","videotron"] +videotron: ["groupe","ltd"] +promexpo: ["groupe","videotron"] +specializes: ["exhibits","three"] +exhibits: ["specializes","worth"] +texaco: ["canada","postings"] +benchmark: ["hills","edmonton"] +edmonton: ["swann","hills"] +swann: ["edmonton","hills"] +hills: ["benchmark","edmonton"] +sweet: ["louisiana","grade"] +bbl: ["sour","intermediate"] +postings: ["jan","texaco"] +erodible: ["cropland","signing"] +cropland: ["erodible","signing"] +enrolled: ["land","erodible"] +charles: ["grassley","sen"] +grassley: ["charles","sen"] +iowa: ["grassley","charles"] +indicated: ["consider","extend"] +consider: ["indicated","exemption"] +signing: ["erodible","cropland"] +doubtful: ["richard","lyngs"] +retroactive: ["bonus","accepted"] +land: ["conservation","enrolled"] +how: ["referring","tide"] +tide: ["how","referring"] +referring: ["how","tide"] +demands: ["follow","groups"] +follow: ["demands","groups"] +marathon: ["petroleum","co"] +grades: ["contract","pay"] +marathons: ["brings","posted"] +sour: ["bbl","texas"] +louisiana: ["sweet","grade"] +grown: ["failed","inadvertantly"] +failed: ["inadvertantly","certify"] +certify: ["inadvertantly","failed"] +allowed: ["policies","monetary"] +planting: ["eligible","crop"] +eligible: ["planting","farmer"] +restrict: ["plantings","crops"] +plantings: 
["restrict","crops"] +crops: ["restrict","plantings"] +acreage: ["base","valuable"] +complained: ["congress","several"] +inadvertantly: ["failed","certify"] +lose: ["brly","corn"] +allow: ["historic","oats"] +historic: ["allow","oats"] +extend: ["exemption","lyngs"] +exemption: ["extend","lyngs"] +circuit: ["maker","ionic"] +ionic: ["circuit","maker"] +maker: ["boards","circuit"] +boards: ["maker","circuit"] +pretax: ["profits","concentrated"] diff --git a/sibe.cabal b/sibe.cabal index b6b4218..de76920 100644 --- a/sibe.cabal +++ b/sibe.cabal @@ -33,15 +33,6 @@ library , Chart-cairo default-language: Haskell2010 ---executable sibe-exe - --hs-source-dirs: app - --main-is: Main.hs - --ghc-options: -threaded -rtsopts -with-rtsopts=-N - --build-depends: base - --, sibe - --, hmatrix - --default-language: Haskell2010 - executable example-xor hs-source-dirs: examples main-is: xor.hs @@ -64,15 +55,6 @@ executable example-word2vec , vector default-language: Haskell2010 ---executable example-sin - --hs-source-dirs: examples - --main-is: sin.hs - --ghc-options: -threaded -rtsopts -with-rtsopts=-N - --build-depends: base - --, sibe - --, hmatrix - --default-language: Haskell2010 - executable example-424 hs-source-dirs: examples main-is: 424encoder.hs diff --git a/src/Sibe.hs b/src/Sibe.hs index 3f18a7f..ba70ed2 100644 --- a/src/Sibe.hs +++ b/src/Sibe.hs @@ -10,7 +10,9 @@ module Sibe Output, Activation, forward, + forward', runLayer, + runLayer', randomLayer, randomNetwork, buildNetwork, @@ -84,7 +86,6 @@ module Sibe , batchSize :: Int , chart :: [(Int, Double, Double)] , momentum :: Double - , biases :: Bool } deriving (Show) emptyNetwork = randomNetwork 0 (0, 0) 0 [] (0, (id, id)) @@ -98,7 +99,6 @@ module Sibe , batchSize = 0 , chart = [] , momentum = 0 - , biases = True } saveNetwork :: Network -> String -> IO () @@ -133,6 +133,12 @@ module Sibe compute input (O l@(Layer _ _ (fn, _))) = fn $ runLayer input l compute input (l@(Layer _ _ (fn, _)) :- n) = compute ((fst . 
activation $ l) $ runLayer input l) n + forward' :: Input -> Session -> Output + forward' input session = compute input (network session) + where + compute input (O l@(Layer _ _ (fn, _))) = fn $ runLayer' input l + compute input (l@(Layer _ _ (fn, _)) :- n) = compute ((fst . activation $ l) $ runLayer' input l) n + randomLayer :: Seed -> (Int, Int) -> (Double, Double) -> Activation -> Layer randomLayer seed (wr, wc) (l, u) = let weights = uniformSample seed wr $ replicate wc (l, u) @@ -209,14 +215,12 @@ module Sibe o = fn y delta = o - target de = delta * fn' y - -- de = delta / fromIntegral (V.length o) -- cross entropy cost biases' = biases - scale alpha de weights' = weights - scale alpha (input `outer` de) -- small inputs learn slowly layer = Layer biases' weights' (fn, fn') -- updated layer pass = weights #> de - -- pass = weights #> de in (O layer, pass) run input (l@(Layer biases weights (fn, fn')) :- n) = @@ -226,12 +230,11 @@ module Sibe de = delta * fn' y - biases' = biases - cmap (*alpha) de - weights' = weights - cmap (*alpha) (input `outer` de) + biases' = biases - scale alpha de + weights' = weights - scale alpha (input `outer` de) layer = Layer biases' weights' (fn, fn') pass = weights #> de - -- pass = weights #> de in (layer :- n', pass) gd :: Session -> IO Session @@ -280,8 +283,6 @@ module Sibe let el = map (\(e, l, _) -> (e, l)) (chart session) ea = map (\(e, _, a) -> (e, a)) (chart session) - putStrLn $ (show $ epoch session) ++ " => " ++ (show cost) ++ " @ " ++ (show $ learningRate session) - toFile Chart.def "sgd.png" $ do Chart.layoutlr_title Chart..= "loss over time" Chart.plotLeft (Chart.line "loss" [el]) @@ -312,8 +313,8 @@ module Sibe ignoreBiases session = session { network = rmbias (network session) } where - rmbias (O (Layer nodes biases a)) = O $ Layer nodes (biases * 0) a - rmbias ((Layer nodes biases a) :- n) = Layer nodes (biases * 0) a :- rmbias n + rmbias (O (Layer biases nodes a)) = O $ Layer (biases * 0) nodes a + rmbias 
((Layer biases nodes a) :- n) = Layer (biases * 0) nodes a :- rmbias n run :: (Session -> IO Session) -> Session -> IO Session diff --git a/src/Sibe/Word2Vec.hs b/src/Sibe/Word2Vec.hs index f1576d1..53b80d2 100644 --- a/src/Sibe/Word2Vec.hs +++ b/src/Sibe/Word2Vec.hs @@ -1,9 +1,9 @@ module Sibe.Word2Vec - (word2vec, - Word2Vec (..) + ( word2vec + , Word2Vec (..) + , W2VMethod (..) ) where import Sibe - import Sibe.NLP import Sibe.Utils import Debug.Trace import Data.Char @@ -14,8 +14,11 @@ module Sibe.Word2Vec import Data.Default.Class import Data.Function (on) + data W2VMethod = SkipGram | CBOW data Word2Vec = Word2Vec { docs :: [String] , window :: Int + , dimensions :: Int + , method :: W2VMethod } instance Default Word2Vec where def = Word2Vec { docs = [] @@ -23,83 +26,70 @@ module Sibe.Word2Vec } word2vec w2v session = do - return trainingData let s = session { training = trainingData - , network = buildNetwork 0 (-1, 1) v [(v, 25, (id, one))] (20, v, (softmax, crossEntropy')) - , biases = False + , network = randomNetwork 0 (-1, 1) v [(dimensions w2v, (id, one))] (v, (softmax, one)) } - print trainingData - newses <- run (gd . learningRateDecay (1.1, 0.1)) s + putStr "vocabulary size: " + print v + + putStr "trainingData length: " + print . length $ trainingData + + -- biases are not used in skipgram/cbow + newses <- run (sgd . ignoreBiases) s + + + -- export the hidden layer let (hidden@(Layer biases nodes _) :- _) = network newses - {-let computedVocVec = map (\(w, v) -> (w, forward v newses)) vocvec-} - print biases - let computedVocVec = map (\(w, v) -> (w, v <# nodes)) vocvec - {-print computedVocVec-} + -- run words through the hidden layer alone to get the word vector + let computedVocVec = map (\(w, v) -> (w, runLayer' v hidden)) vocvec - {-mapM_ (\(w, v) -> do - putStr $ w ++ ": " - let similarities = map (similarity v . snd) computedVocVec - let sorted = sortBy (compare `on` similarity v . 
snd) computedVocVec - {-print $ zip (map fst sorted) similarities-} - print . take 2 . drop 1 . reverse $ map fst sorted - ) computedVocVec-} - - return newses + return (computedVocVec, vocvec) where - ws = words (concatMap ((++ " ") . map toLower) (docs w2v)) + -- clean documents + ds = map cleanText (docs w2v) + + -- words of each document + wd = map (words . (++ " ") . (map toLower)) ds + + -- all words together, used to generate the vocabulary + ws = words (concatMap ((++ " ") . map toLower) ds) vocabulary = ordNub ws v = length vocabulary - cooccurence = foldl' iter [] (zip [0..] ws) - where - iter acc (i, w) = - let a = findIndex ((== w) . fst) acc - before = take (window w2v) . drop (i - window w2v) $ ws - after = take (window w2v) . drop (i + 1) $ ws - ns = if i == 0 then after else before ++ after - in - if isJust a then - let idx = fromJust a - new = foldl (\acc n -> add acc n) (snd $ acc !! idx) ns - in take idx acc ++ [(w, new)] ++ drop (idx + 1) acc - else - acc ++ [(w, map (\n -> (n, 1)) ns)] - - add [] n = [(n, 1)] - add ((hw, hc):hs) n - | n == hw = (hw, hc + 1):hs - | otherwise = (hw, hc):add hs n - + -- generate one-hot vectors for each word of vocabulary vocvec = zip vocabulary $ map (onehot v) [0..v - 1] - {-trainingData = map iter cooccurence - where - iter (w, targets) = - let ts = map (\(w, c) -> c * (snd . fromJust $ find ((== w) . fst) vocvec)) targets - folded = foldl (+) (vector $ replicate v 0) ts - input = snd . fromJust $ find ((== w) . fst) vocvec - in (input, folded)-} - trainingData = map iter $ zip [window w2v..length vocvec - window w2v] vocvec - where - iter (i, (w, v)) = - let before = take (window w2v) . drop (i - window w2v) $ vocvec - after = take (window w2v) . 
drop (i + 1) $ vocvec - ns = map snd $ before ++ after - new = foldl1 (+) ns - in (v, new) - add [] n = [(n, 1)] - add ((hw, hc):hs) n - | n == hw = (hw, hc + 1):hs - | otherwise = (hw, hc):add hs n - - wordfrequency = foldl' iter [] ws + -- training data: generate input and output pairs for each word and the words in its window + trainingData = concatMap (\wds -> concatMap (iter wds) $ zip [0..] wds) wd where - iter acc w = - let i = findIndex ((== w) . fst) acc + iter wds (i, w) = + let v = snd . fromJust . find ((==w) . fst) $ vocvec + before = take (window w2v) . drop (i - window w2v) $ wds + after = take (window w2v) . drop (i + 1) $ wds + ns + | i == 0 = after + | i == length vocvec - 1 = before + | otherwise = before ++ after + vectorized = map (\w -> snd . fromJust $ find ((== w) . fst) vocvec) ns + new = foldl1 (+) vectorized in - if isJust i then - let idx = fromJust i - in take idx acc ++ [(w, snd (acc !! idx) + 1)] ++ drop (idx + 1) acc - else - acc ++ [(w, 1)] + case method w2v of + SkipGram -> zip (repeat v) vectorized + CBOW -> zip vectorized (repeat v) + _ -> error "unsupported word2vec method" + + cleanText :: String -> String + cleanText string = + let puncs = filter (`notElem` ['!', '"', '#', '$', '%', '(', ')', '.', '?', '\'']) (trim string) + spacify = foldl (\acc x -> replace x ' ' acc) puncs [',', '/', '-', '\n', '\r'] + nonumber = filter (not . isNumber) spacify + lower = map toLower nonumber + in (unwords . words) lower -- remove unnecessary spaces + where + trim = f . f + where + f = reverse . dropWhile isSpace + replace needle replacement = + map (\c -> if c == needle then replacement else c)