initial commit

Mahdi Dibaiee 2017-04-17 14:47:44 +04:30
commit 695b666699
5 changed files with 212 additions and 0 deletions

94
.gitignore vendored Normal file

@@ -0,0 +1,94 @@
results
#### joe made this: http://goel.io/joe
#### python ####
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
.hypothesis/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# IPython Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# dotenv
.env
# virtualenv
.venv/
venv/
ENV/
# Spyder project settings
.spyderproject
# Rope project settings
.ropeproject

17
helpers.py Normal file

@@ -0,0 +1,17 @@
from urllib.parse import urlparse, urlunparse, urljoin


def to_absolute(url, host):
    """Resolve url against host, returning a ParseResult for http(s) or
    scheme-less URLs and None for anything else (mailto:, javascript:, ...)."""
    if url == '/': return urlparse(host)
    if url[0:2] == '//':
        url = urlparse(host).scheme + ':' + url
    p = urlparse(url)
    if p.scheme not in ['http', 'https', '']: return None
    if not p.netloc:
        p = urlparse(urljoin(host, url))
    return p
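A quick, informal sanity check of how to_absolute resolves references; the host and URLs below are only illustrative, not taken from the repository:

from helpers import to_absolute

host = 'https://example.com'
to_absolute('/about', host).geturl()                    # 'https://example.com/about'
to_absolute('//cdn.example.com/a.png', host).geturl()   # 'https://cdn.example.com/a.png'
to_absolute('mailto:hi@example.com', host)              # None: only http(s) and relative URLs pass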

93
index.py Normal file

@@ -0,0 +1,93 @@
from bs4 import BeautifulSoup
import requests
import sys
import os
import shutil
import argparse
from urllib.parse import urlparse, urljoin
from csv import DictWriter

from helpers import to_absolute

# elements whose text, links and image sources get recorded
tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'li', 'span', 'a', 'img']

parser = argparse.ArgumentParser(description='Web scraper')
parser.add_argument('file', help='path to file containing target websites, one line for each')
parser.add_argument('--depth', type=int, help='how deep should the scraper follow links')
parser.add_argument('--no-image', help='do not download images', action='store_true')
args = parser.parse_args()

path = os.path.join(os.path.dirname(__file__), args.file)
with open(path) as f:
    sites = [a.replace('\n', '') for a in f.readlines()]

for host in sites:
    data = []
    visited = []
    queue = []

    main = urlparse(host)
    base_dir = os.path.join('results', main.netloc)
    images_dir = os.path.join(base_dir, 'images')

    # makedirs also creates the top-level results directory on a fresh clone
    if not os.path.isdir(base_dir):
        os.makedirs(base_dir)
    if not os.path.isdir(images_dir):
        os.makedirs(images_dir)

    def scrape(url, depth=0):
        if args.depth is not None and depth > args.depth: return

        t = url.geturl()
        if t in visited: return

        html = requests.get(t).text
        visited.append(t)

        soup = BeautifulSoup(html, 'html.parser')
        elements = soup.find_all(tags)

        for el in elements:
            href = el.get('href')
            if not href and not el.string and not el.name == 'img': continue

            record = {
                'page': url.path,
                'tag': el.name,
                'text': el.string,
                'link': href,
                'image': el.get('src') if el.name == 'img' else None
            }

            if not args.no_image and el.name == 'img' and el.get('src'):
                p = to_absolute(el.get('src'), host)
                if p:
                    filepath = os.path.join(images_dir, os.path.basename(p.path))
                    if not os.path.exists(filepath):
                        response = requests.get(p.geturl(), stream=True)
                        with open(filepath, 'wb') as out_file:
                            shutil.copyfileobj(response.raw, out_file)
                        del response

            data.append(record)

            # follow same-host links only
            if href and href != '/':
                p = to_absolute(href, host)
                if p and p.netloc == main.netloc:
                    queue.insert(0, p)

        # drain the queue with pop() so it can safely grow while being crawled
        while queue:
            scrape(queue.pop(0), depth=depth + 1)

    scrape(main)

    with open(os.path.join('results', main.netloc, 'texts.csv'), 'w') as f:
        w = DictWriter(f, fieldnames=['page', 'tag', 'text', 'link', 'image'])
        w.writeheader()
        w.writerows(data)
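A minimal way to try the scraper, assuming the packages from the requirements file are installed; the depth value here is only an example:

python index.py test_websites --depth 1 --no-image

For each host listed in test_websites this fetches the start page plus same-domain links one level deep, skips image downloads, and writes results/<hostname>/texts.csv with the columns page, tag, text, link and image.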

6
requirements Normal file

@@ -0,0 +1,6 @@
appdirs==1.4.3
beautifulsoup4==4.5.3
packaging==16.8
pyparsing==2.2.0
requests==2.13.0
six==1.10.0
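Because the pin file is named requirements rather than requirements.txt, it has to be passed to pip explicitly; a typical setup would be:

pip install -r requirements

This pulls in beautifulsoup4 and requests, the only third-party packages index.py imports directly.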

2
test_websites Normal file

@@ -0,0 +1,2 @@
https://www.theguardian.com/international
https://theread.me