web-scraper/helpers.py

from urllib.parse import urlparse, urlunparse, urljoin
import os
from csv import DictWriter

def to_absolute(url, host):
    """Resolve a link found on a page into an absolute http(s) URL.

    Args:
        url: The raw link: absolute, protocol-relative (``//host/path``)
            or relative. ``None`` is tolerated.
        host: Base URL string used to supply the scheme of
            protocol-relative links and to resolve relative ones.

    Returns:
        * ``host`` itself, unchanged and unparsed, when ``url`` is exactly
          ``'/'``;
        * ``None`` when ``url`` is ``None`` or carries a scheme other than
          http/https (e.g. ``mailto:``, ``javascript:``);
        * otherwise the ``urlparse`` result for the absolute URL.
    """
    # A missing link cannot be sliced or parsed; report it the same way as
    # any other unusable link instead of raising TypeError.
    if url is None:
        return None

    if url == '/':
        return host

    # Protocol-relative link: borrow the scheme from the base URL.
    if url[0:2] == '//':
        url = urlparse(host).scheme + ':' + url

    parsed = urlparse(url)
    # An empty scheme means a relative link, which is resolved below.
    if parsed.scheme not in ('http', 'https', ''):
        return None

    # No network location -> relative link; resolve it against the base.
    if not parsed.netloc:
        parsed = urlparse(urljoin(host, url))

    return parsed
        
def write_results(main, data):
    """Write the scraped rows to ``results/<main.netloc>/texts.csv``.

    Args:
        main: Parsed URL of the scraped site; only its ``netloc`` is used,
            as the name of the output sub-directory.
        data: Iterable of dicts keyed by ``page``, ``tag``, ``text``,
            ``link`` and ``image`` (one dict per CSV row).

    The file is overwritten on every call and always starts with a header
    row.
    """
    out_dir = os.path.join('results', main.netloc)
    # Make sure the target directory exists so open() cannot fail on a
    # first run for this site.
    os.makedirs(out_dir, exist_ok=True)

    # newline='' is required by the csv module (otherwise rows end in
    # '\r\r\n' on Windows); an explicit UTF-8 encoding keeps non-ASCII
    # scraped text from failing under a non-UTF-8 locale.
    with open(os.path.join(out_dir, 'texts.csv'), 'w',
              newline='', encoding='utf-8') as f:
        w = DictWriter(f, fieldnames=['page', 'tag', 'text', 'link', 'image'])

        w.writeheader()
        w.writerows(data)
initial commit 2017-04-17 10:17:44 +00:00			`from urllib.parse import urlparse, urlunparse, urljoin`
fix: write CSV progressively 2017-04-20 03:20:04 +00:00			`import os`
			`from csv import DictWriter`
initial commit 2017-04-17 10:17:44 +00:00
			`def to_absolute(url, host):`
			`if url == '/': return host`

			`if url[0:2] == '//':`
			`url = urlparse(host).scheme + ':' + url`

			`p = urlparse(url)`
			`if not (p.scheme in ['http', 'https', '']): return None`

			`if not p.netloc:`
			`p = urlparse(urljoin(host, url))`

			`return p`

fix: write CSV progressively 2017-04-20 03:20:04 +00:00			`def write_results(main, data):`
			`with open(os.path.join('results', main.netloc, 'texts.csv'), 'w') as f:`
			`w = DictWriter(f, fieldnames=['page', 'tag', 'text', 'link', 'image'])`

			`w.writeheader()`
			`w.writerows(data)`
initial commit 2017-04-17 10:17:44 +00:00