2017-04-17 10:17:44 +00:00
|
|
|
from urllib.parse import urlparse, urlunparse, urljoin
|
2017-04-20 03:20:04 +00:00
|
|
|
import os
|
|
|
|
from csv import DictWriter
|
2017-04-17 10:17:44 +00:00
|
|
|
|
|
|
|
def to_absolute(url, host):
    """Resolve a link found on a page into an absolute, parsed URL.

    Args:
        url: The link as written in the page (absolute, protocol-relative,
            root-relative or relative).
        host: Absolute URL string used as the base for resolution.

    Returns:
        * ``host`` itself, unchanged, when ``url`` is exactly ``'/'``
          (note: this is the raw ``host`` argument, not a parse result);
        * ``None`` when ``url`` carries a scheme other than http/https;
        * otherwise the result of ``urlparse`` on the absolute URL.
    """
    # The bare root link short-circuits to the base as given.
    if url == '/':
        return host

    # Protocol-relative link ("//cdn.example.com/x"): borrow the base scheme.
    if url[:2] == '//':
        base_scheme = urlparse(host).scheme
        url = base_scheme + ':' + url

    parsed = urlparse(url)

    # Anything that is neither web traffic nor scheme-less (mailto:,
    # javascript:, tel:, ...) is rejected.
    if parsed.scheme not in ('http', 'https', ''):
        return None

    # A URL that already names its network location is absolute as-is.
    if parsed.netloc:
        return parsed

    # Otherwise resolve the path against the base URL.
    return urlparse(urljoin(host, url))
|
|
|
|
|
2017-04-20 05:35:07 +00:00
|
|
|
def write_results(main, data, first=False):
    """Write scraped rows to ``results/<main.netloc>/texts.csv``.

    Args:
        main: Parsed URL of the site being processed; only its ``netloc``
            attribute is read, to pick the output directory.
        data: Iterable of dicts keyed by ``page``, ``tag``, ``text``,
            ``link`` and ``image``, one per CSV row.
        first: When true, the file is truncated and a header row is written
            before the data; otherwise rows are appended to the existing
            file.

    Raises:
        ValueError: Propagated from ``DictWriter`` if a row contains a key
            outside the declared field names.
        OSError: If the directory or file cannot be created or written.
    """
    out_dir = os.path.join('results', main.netloc)
    # Make sure the per-site directory exists so the open() below cannot
    # fail merely because this is the first write for the site.
    os.makedirs(out_dir, exist_ok=True)

    mode = 'w' if first else 'a'
    # newline='' is required by the csv module so that it controls line
    # endings itself (otherwise some platforms emit "\r\r\n").  An explicit
    # UTF-8 encoding keeps non-ASCII page text from failing or varying with
    # the platform's locale.
    with open(os.path.join(out_dir, 'texts.csv'), mode,
              newline='', encoding='utf-8') as f:
        w = DictWriter(f, fieldnames=['page', 'tag', 'text', 'link', 'image'])
        if first:
            w.writeheader()
        w.writerows(data)
|
2017-04-17 10:17:44 +00:00
|
|
|
|