From 695b66669962d42df3991c0b58f81e282e19d78b Mon Sep 17 00:00:00 2001
From: Mahdi Dibaiee
Date: Mon, 17 Apr 2017 14:47:44 +0430
Subject: [PATCH] initial commit

---
 .gitignore    | 94 +++++++++++++++++++++++++++++++++++++++++++++++++++
 helpers.py    | 17 ++++++++++
 index.py      | 93 ++++++++++++++++++++++++++++++++++++++++++++++++++
 requirements  |  6 ++++
 test_websites |  2 ++
 5 files changed, 212 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 helpers.py
 create mode 100644 index.py
 create mode 100644 requirements
 create mode 100644 test_websites

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..87779ea
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,94 @@
+results
+#### joe made this: http://goel.io/joe
+#### python ####
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*,cover
+.hypothesis/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# IPython Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# dotenv
+.env
+
+# virtualenv
+.venv/
+venv/
+ENV/
+
+# Spyder project settings
+.spyderproject
+
+# Rope project settings
+.ropeproject
+
diff --git a/helpers.py b/helpers.py
new file mode 100644
index 0000000..deb21a3
--- /dev/null
+++ b/helpers.py
@@ -0,0 +1,17 @@
+from urllib.parse import urlparse, urljoin
+
+
+def to_absolute(url, host):
+    """Resolve url against host and return a ParseResult, or None for non-http(s) schemes."""
+    if url == '/':
+        return urlparse(host)
+
+    # protocol-relative URLs (//host/path) inherit the scheme of the current host
+    if url[0:2] == '//':
+        url = urlparse(host).scheme + ':' + url
+
+    p = urlparse(url)
+    if p.scheme not in ['http', 'https', '']:
+        return None
+
+    # relative paths get joined onto the host
+    if not p.netloc:
+        p = urlparse(urljoin(host, url))
+
+    return p
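A quick sketch of how to_absolute is meant to behave (the example.com URLs below are illustrative placeholders, not values from the patch):

    from helpers import to_absolute

    host = 'https://example.com'
    to_absolute('/about', host).geturl()                       # 'https://example.com/about'
    to_absolute('//cdn.example.com/logo.png', host).geturl()   # 'https://cdn.example.com/logo.png'
    to_absolute('mailto:someone@example.com', host)            # None: non-http(s) schemes are rejected
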
diff --git a/index.py b/index.py
new file mode 100644
index 0000000..d732c70
--- /dev/null
+++ b/index.py
@@ -0,0 +1,93 @@
+from bs4 import BeautifulSoup
+import requests
+import os
+import shutil
+import argparse
+from urllib.parse import urlparse
+from csv import DictWriter
+from helpers import to_absolute
+
+# elements whose text, links and images get recorded
+tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'li', 'span', 'a', 'img']
+
+parser = argparse.ArgumentParser(description='Web scraper')
+parser.add_argument('file', help='path to file containing target websites, one per line')
+parser.add_argument('--depth', type=int, help='how deep the scraper should follow links')
+parser.add_argument('--no-image', help='do not download images', action='store_true')
+
+args = parser.parse_args()
+
+path = os.path.join(os.path.dirname(__file__), args.file)
+with open(path) as f:
+    sites = [line.strip() for line in f if line.strip()]
+
+for host in sites:
+    data = []
+    visited = []
+    queue = []
+
+    main = urlparse(host)
+    base_dir = os.path.join('results', main.netloc)
+    images_dir = os.path.join(base_dir, 'images')
+
+    # makedirs also creates the top-level results/ directory on a fresh checkout
+    os.makedirs(images_dir, exist_ok=True)
+
+    def scrape(url, depth=0):
+        if args.depth is not None and depth > args.depth:
+            return
+
+        t = url.geturl()
+        if t in visited:
+            return
+
+        html = requests.get(t).text
+        visited.append(t)
+
+        soup = BeautifulSoup(html, 'html.parser')
+        elements = soup.find_all(tags)
+
+        for el in elements:
+            href = el.get('href')
+
+            # skip elements that have no link, no text and are not images
+            if not href and not el.string and el.name != 'img':
+                continue
+
+            record = {
+                'page': url.path,
+                'tag': el.name,
+                'text': el.string,
+                'link': href,
+                'image': el.get('src') if el.name == 'img' else None
+            }
+
+            if not args.no_image and el.name == 'img' and el.get('src'):
+                p = to_absolute(el.get('src'), host)
+                if p:
+                    filepath = os.path.join(images_dir, os.path.basename(p.path))
+                    if not os.path.exists(filepath):
+                        # stream the image straight to disk instead of buffering it in memory
+                        response = requests.get(p.geturl(), stream=True)
+                        with open(filepath, 'wb') as out_file:
+                            shutil.copyfileobj(response.raw, out_file)
+                        del response
+
+            data.append(record)
+
+            # only queue links that stay on the same domain
+            if href and href != '/':
+                p = to_absolute(href, host)
+                if p and p.netloc == main.netloc:
+                    queue.insert(0, p)
+
+        # drain the queue with pop() so the list is not mutated while being iterated
+        while queue:
+            link = queue.pop(0)
+            scrape(link, depth=depth + 1)
+
+    scrape(main)
+
+    # newline='' keeps the csv module from writing extra blank lines on Windows
+    with open(os.path.join(base_dir, 'texts.csv'), 'w', newline='') as f:
+        w = DictWriter(f, fieldnames=['page', 'tag', 'text', 'link', 'image'])
+        w.writeheader()
+        w.writerows(data)
+
diff --git a/requirements b/requirements
new file mode 100644
index 0000000..5a4219f
--- /dev/null
+++ b/requirements
@@ -0,0 +1,6 @@
+appdirs==1.4.3
+beautifulsoup4==4.5.3
+packaging==16.8
+pyparsing==2.2.0
+requests==2.13.0
+six==1.10.0
diff --git a/test_websites b/test_websites
new file mode 100644
index 0000000..34d2f4f
--- /dev/null
+++ b/test_websites
@@ -0,0 +1,2 @@
+https://www.theguardian.com/international
+https://theread.me
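
Usage sketch (assuming Python 3, since the code relies on urllib.parse; the positional file argument and the flags come from the argparse setup in index.py):

    pip install -r requirements
    python3 index.py test_websites --depth 1

Each target site then produces results/<domain>/texts.csv with one row per scraped element, and downloaded images land in results/<domain>/images/ unless --no-image is passed.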