From 695b66669962d42df3991c0b58f81e282e19d78b Mon Sep 17 00:00:00 2001
From: Mahdi Dibaiee
Date: Mon, 17 Apr 2017 14:47:44 +0430
Subject: [PATCH] initial commit

---
 .gitignore    | 94 +++++++++++++++++++++++++++++++++++++++++++++++++++
 helpers.py    | 17 ++++++++++
 index.py      | 93 ++++++++++++++++++++++++++++++++++++++++++++++++++
 requirements  |  6 ++++
 test_websites |  2 ++
 5 files changed, 212 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 helpers.py
 create mode 100644 index.py
 create mode 100644 requirements
 create mode 100644 test_websites

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..87779ea
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,94 @@
+results
+#### joe made this: http://goel.io/joe
+#### python ####
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*,cover
+.hypothesis/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# IPython Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# dotenv
+.env
+
+# virtualenv
+.venv/
+venv/
+ENV/
+
+# Spyder project settings
+.spyderproject
+
+# Rope project settings
+.ropeproject
+
diff --git a/helpers.py b/helpers.py
new file mode 100644
index 0000000..deb21a3
--- /dev/null
+++ b/helpers.py
@@ -0,0 +1,17 @@
+from urllib.parse import urlparse, urljoin
+
+
+def to_absolute(url, host):
+    """Resolve url against host and return a ParseResult, or None for non-http(s) schemes."""
+    if url == '/':
+        return urlparse(host)
+
+    # protocol-relative URLs (//host/path) inherit the scheme of the current host
+    if url[0:2] == '//':
+        url = urlparse(host).scheme + ':' + url
+
+    p = urlparse(url)
+    if p.scheme not in ['http', 'https', '']:
+        return None
+
+    # relative paths get joined onto the host
+    if not p.netloc:
+        p = urlparse(urljoin(host, url))
+
+    return p
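A quick sketch of how to_absolute is meant to behave (the example.com URLs below are illustrative placeholders, not values from the patch):

    from helpers import to_absolute

    host = 'https://example.com'
    to_absolute('/about', host).geturl()                       # 'https://example.com/about'
    to_absolute('//cdn.example.com/logo.png', host).geturl()   # 'https://cdn.example.com/logo.png'
    to_absolute('mailto:someone@example.com', host)            # None: non-http(s) schemes are rejected
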
diff --git a/index.py b/index.py
new file mode 100644
index 0000000..d732c70
--- /dev/null
+++ b/index.py
@@ -0,0 +1,93 @@
+from bs4 import BeautifulSoup
+import requests
+import os
+import shutil
+import argparse
+from urllib.parse import urlparse
+from csv import DictWriter
+from helpers import to_absolute
+
+# elements whose text, links and images get recorded
+tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'li', 'span', 'a', 'img']
+
+parser = argparse.ArgumentParser(description='Web scraper')
+parser.add_argument('file', help='path to file containing target websites, one per line')
+parser.add_argument('--depth', type=int, help='how deep the scraper should follow links')
+parser.add_argument('--no-image', help='do not download images', action='store_true')
+
+args = parser.parse_args()
+
+path = os.path.join(os.path.dirname(__file__), args.file)
+with open(path) as f:
+    sites = [line.strip() for line in f if line.strip()]
+
+for host in sites:
+    data = []
+    visited = []
+    queue = []
+
+    main = urlparse(host)
+    base_dir = os.path.join('results', main.netloc)
+    images_dir = os.path.join(base_dir, 'images')
+
+    # makedirs also creates the top-level results/ directory on a fresh checkout
+    os.makedirs(images_dir, exist_ok=True)
+
+    def scrape(url, depth=0):
+        if args.depth is not None and depth > args.depth:
+            return
+
+        t = url.geturl()
+        if t in visited:
+            return
+
+        html = requests.get(t).text
+        visited.append(t)
+
+        soup = BeautifulSoup(html, 'html.parser')
+        elements = soup.find_all(tags)
+
+        for el in elements:
+            href = el.get('href')
+
+            # skip elements that have no link, no text and are not images
+            if not href and not el.string and el.name != 'img':
+                continue
+
+            record = {
+                'page': url.path,
+                'tag': el.name,
+                'text': el.string,
+                'link': href,
+                'image': el.get('src') if el.name == 'img' else None
+            }
+
+            if not args.no_image and el.name == 'img' and el.get('src'):
+                p = to_absolute(el.get('src'), host)
+                if p:
+                    filepath = os.path.join(images_dir, os.path.basename(p.path))
+                    if not os.path.exists(filepath):
+                        # stream the image straight to disk instead of buffering it in memory
+                        response = requests.get(p.geturl(), stream=True)
+                        with open(filepath, 'wb') as out_file:
+                            shutil.copyfileobj(response.raw, out_file)
+                        del response
+
+            data.append(record)
+
+            # only queue links that stay on the same domain
+            if href and href != '/':
+                p = to_absolute(href, host)
+                if p and p.netloc == main.netloc:
+                    queue.insert(0, p)
+
+        # drain the queue with pop() so the list is not mutated while being iterated
+        while queue:
+            link = queue.pop(0)
+            scrape(link, depth=depth + 1)
+
+    scrape(main)
+
+    # newline='' keeps the csv module from writing extra blank lines on Windows
+    with open(os.path.join(base_dir, 'texts.csv'), 'w', newline='') as f:
+        w = DictWriter(f, fieldnames=['page', 'tag', 'text', 'link', 'image'])
+        w.writeheader()
+        w.writerows(data)
+
diff --git a/requirements b/requirements
new file mode 100644
index 0000000..5a4219f
--- /dev/null
+++ b/requirements
@@ -0,0 +1,6 @@
+appdirs==1.4.3
+beautifulsoup4==4.5.3
+packaging==16.8
+pyparsing==2.2.0
+requests==2.13.0
+six==1.10.0
diff --git a/test_websites b/test_websites
new file mode 100644
index 0000000..34d2f4f
--- /dev/null
+++ b/test_websites
@@ -0,0 +1,2 @@
+https://www.theguardian.com/international
+https://theread.me
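
Usage sketch (assuming Python 3, since the code relies on urllib.parse; the positional file argument and the flags come from the argparse setup in index.py):

    pip install -r requirements
    python3 index.py test_websites --depth 1

Each target site then produces results/<domain>/texts.csv with one row per scraped element, and downloaded images land in results/<domain>/images/ unless --no-image is passed.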