initial commit

2017-04-17 14:47:44 +04:30
commit 695b666699
5 changed files with 212 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,94 @@
 results
 #### joe made this: http://goel.io/joe
 #### python ####
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
 *$py.class
 # C extensions
 *.so
 # Distribution / packaging
 .Python
 env/
 build/
 develop-eggs/
 dist/
 downloads/
 eggs/
 .eggs/
 lib/
 lib64/
 parts/
 sdist/
 var/
 *.egg-info/
 .installed.cfg
 *.egg
 # PyInstaller
 #  Usually these files are written by a python script from a template
 #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 *.manifest
 *.spec
 # Installer logs
 pip-log.txt
 pip-delete-this-directory.txt
 # Unit test / coverage reports
 htmlcov/
 .tox/
 .coverage
 .coverage.*
 .cache
 nosetests.xml
 coverage.xml
 *,cover
 .hypothesis/
 # Translations
 *.mo
 *.pot
 # Django stuff:
 *.log
 local_settings.py
 # Flask stuff:
 instance/
 .webassets-cache
 # Scrapy stuff:
 .scrapy
 # Sphinx documentation
 docs/_build/
 # PyBuilder
 target/
 # IPython Notebook
 .ipynb_checkpoints
 # pyenv
 .python-version
 # celery beat schedule file
 celerybeat-schedule
 # dotenv
 .env
 # virtualenv
 .venv/
 venv/
 ENV/
 # Spyder project settings
 .spyderproject
 # Rope project settings
 .ropeproject
--- a/helpers.py
+++ b/helpers.py
@@ -0,0 +1,17 @@
 from urllib.parse import urlparse, urlunparse, urljoin
 def to_absolute(url, host):
     """Resolve a link found on a page into an absolute, parsed URL.
     Args:
         url: the raw link text; may be absolute, relative, or
             protocol-relative (starting with ``//``).
         host: absolute URL of the site. It is the base that relative links
             are joined to and supplies the scheme for protocol-relative ones.
     Returns:
         A ``urllib.parse.ParseResult`` for the absolute URL, or ``None`` when
         the link uses a scheme other than http/https (for example
         ``mailto:`` or ``javascript:``).
     """
     # A bare '/' stands for the site itself. Parse it so that every non-None
     # result has the same type; callers read .netloc, .path and .geturl().
     if url == '/':
         return urlparse(host)
     # Protocol-relative link: borrow the scheme of the host.
     if url.startswith('//'):
         url = urlparse(host).scheme + ':' + url
     parsed = urlparse(url)
     if parsed.scheme not in ('http', 'https', ''):
         return None
     # No network location means the link is relative: join it to host.
     if not parsed.netloc:
         parsed = urlparse(urljoin(host, url))
     return parsed
--- a/index.py
+++ b/index.py
@@ -0,0 +1,93 @@
 from bs4 import BeautifulSoup
 import requests
 import sys
 import os
 import shutil
 import argparse
 from urllib.parse import urlparse, urljoin
 from csv import DictWriter
 from helpers import to_absolute
 # HTML tag names that are collected from every scraped page.
 tags = [
     'p',
     'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
     'ul', 'li',
     'span', 'a', 'img',
 ]
 # Command-line interface: a sites file plus optional crawl settings.
 parser = argparse.ArgumentParser(description='Web scraper')
 parser.add_argument(
     'file',
     help='path to file containing target websites, one line for each',
 )
 parser.add_argument(
     '--depth',
     type=int,
     help='how deep should the scraper follow links',
 )
 parser.add_argument(
     '--no-image',
     action='store_true',
     help='do not download images',
 )
 args = parser.parse_args()
 # The sites file is looked up relative to the directory of this script.
 path = os.path.join(os.path.dirname(__file__), args.file)
 with open(path) as f:
     # One target per line, with the line break removed.
     sites = [line.replace('\n', '') for line in f]
 # Crawl every target site and write its results under results/<netloc>/.
 for host in sites:
     # A blank line in the sites file is not a target.
     if not host.strip():
         continue
     data = []  # one dict per recorded element; becomes the rows of texts.csv
     visited = set()  # URLs already fetched for this site
     main = urlparse(host)
     base_dir = os.path.join('results', main.netloc)
     images_dir = os.path.join(base_dir, 'images')
     # makedirs also creates 'results' and base_dir when they are missing.
     os.makedirs(images_dir, exist_ok=True)
     def _download_image(src):
         """Save the image referenced by src into images_dir, once per file name."""
         p = to_absolute(src, host)
         # None means a non-http(s) source such as a data: URI.
         if not p:
             return
         filename = os.path.basename(p.path)
         if not filename:
             return
         filepath = os.path.join(images_dir, filename)
         if os.path.exists(filepath):
             return
         response = requests.get(p.geturl(), stream=True)
         try:
             with open(filepath, 'wb') as out_file:
                 shutil.copyfileobj(response.raw, out_file)
         finally:
             # Release the streamed connection even if writing fails.
             response.close()
     def _scrape_page(url):
         """Fetch one page, record its elements and return its same-site links."""
         address = url.geturl()
         if address in visited:
             return []
         html = requests.get(address).text
         visited.add(address)
         soup = BeautifulSoup(html, 'html.parser')
         links = []
         for el in soup.find_all(tags):
             href = el.get('href')
             is_image = el.name == 'img'
             # Skip elements that carry no link, no direct text and no image.
             if not href and not el.string and not is_image:
                 continue
             # Read the src attribute; el.src would look for a child <src> tag.
             src = el.get('src') if is_image else None
             if src and src != '/' and not args.no_image:
                 _download_image(src)
             data.append({
                 'page': url.path,
                 'tag': el.name,
                 'text': el.string,
                 'link': href,
                 'image': src,
             })
             if href and href != '/':
                 # NOTE: links are resolved against the site's start URL (host).
                 target = to_absolute(href, host)
                 # Follow only links that stay on the site being scraped.
                 if target and target.netloc == main.netloc:
                     links.append(target)
         return links
     def scrape(url, depth=0):
         """Crawl the site level by level, starting from url at the given depth.
         Pages deeper than args.depth are not fetched; when --depth is not
         given the crawl continues until no unvisited same-site link is left.
         """
         frontier = [url]
         while frontier:
             if args.depth is not None and depth > args.depth:
                 return
             # Collect the next level separately so that the list being
             # iterated is never modified and each page gets its true depth.
             next_frontier = []
             for page in frontier:
                 next_frontier.extend(_scrape_page(page))
             frontier = next_frontier
             depth += 1
     scrape(main)
     # newline='' lets the csv module control line endings; utf-8 so that
     # any scraped text can be written regardless of the locale.
     csv_path = os.path.join(base_dir, 'texts.csv')
     with open(csv_path, 'w', newline='', encoding='utf-8') as f:
         w = DictWriter(f, fieldnames=['page', 'tag', 'text', 'link', 'image'])
         w.writeheader()
         w.writerows(data)
--- a/6
+++ b/6
@@ -0,0 +1,6 @@
 appdirs==1.4.3
 beautifulsoup4==4.5.3
 packaging==16.8
 pyparsing==2.2.0
 requests==2.13.0
 six==1.10.0
--- a/2
+++ b/2
@@ -0,0 +1,2 @@
 https://www.theguardian.com/international
 https://theread.me
		`@@ -0,0 +1,2 @@`
							`https://www.theguardian.com/international`
							`https://theread.me`