initial commit
commit 695b666699
.gitignore (vendored, new file, 94 lines)
@@ -0,0 +1,94 @@
results
#### joe made this: http://goel.io/joe
#### python ####
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# IPython Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
.venv/
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject
helpers.py (new file, 17 lines)
@@ -0,0 +1,17 @@
from urllib.parse import urlparse, urlunparse, urljoin


def to_absolute(url, host):
    """Resolve a possibly relative URL against the site host.

    Returns a ParseResult for http/https/relative URLs, None for other
    schemes, and the host string itself when the URL is just '/'.
    """
    if url == '/': return host

    if url[0:2] == '//':
        url = urlparse(host).scheme + ':' + url

    p = urlparse(url)
    if not (p.scheme in ['http', 'https', '']): return None

    if not p.netloc:
        p = urlparse(urljoin(host, url))

    return p
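For orientation, a quick check of to_absolute from a Python shell; the host and link values below are hypothetical, not taken from the committed files:

from helpers import to_absolute

host = 'https://example.org'
print(to_absolute('/about', host).geturl())                       # https://example.org/about
print(to_absolute('//cdn.example.org/logo.png', host).geturl())   # https://cdn.example.org/logo.png
print(to_absolute('mailto:hi@example.org', host))                 # None (non-http scheme)
print(to_absolute('/', host))                                     # the host string itself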
index.py (new file, 93 lines)
@@ -0,0 +1,93 @@
from bs4 import BeautifulSoup
import requests
import sys
import os
import shutil
import argparse
from urllib.parse import urlparse, urljoin
from csv import DictWriter
from helpers import to_absolute

# elements whose text, links and images get collected
tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'li', 'span', 'a', 'img']

parser = argparse.ArgumentParser(description='Web scraper')
parser.add_argument('file', help='path to file containing target websites, one line for each')
parser.add_argument('--depth', type=int, help='how deep should the scraper follow links')
parser.add_argument('--no-image', help='do not download images', action='store_true')

args = parser.parse_args()

path = os.path.join(os.path.dirname(__file__), args.file)
with open(path) as f:
    sites = [a.replace('\n', '') for a in f.readlines()]

for host in sites:
    data = []      # one record per scraped element
    visited = []   # URLs already fetched
    queue = []     # same-site links still to follow

    main = urlparse(host.replace('\n', ''))
    base_dir = os.path.join('results', main.netloc)
    images_dir = os.path.join(base_dir, 'images')

    if not os.path.isdir(base_dir):
        os.mkdir(base_dir)
    if not os.path.isdir(images_dir):
        os.mkdir(images_dir)

    def scrape(url, depth=0):
        if args.depth is not None and depth > args.depth: return

        t = url.geturl()

        if t in visited: return

        html = requests.get(t).text
        visited.append(t)

        soup = BeautifulSoup(html, 'html.parser')
        elements = soup.find_all(tags)

        for el in elements:
            href = el.get('href')

            if not href and not el.string and not el.name == 'img': continue

            record = {
                'page': url.path,
                'tag': el.name,
                'text': el.string,
                'link': href,
                'image': el.get('src') if el.name == 'img' else None
            }

            # download images unless --no-image was passed
            if not args.no_image and el.name == 'img' and el.get('src'):
                p = to_absolute(el.get('src'), host)
                filepath = os.path.join(images_dir, os.path.basename(p.path))

                if not os.path.exists(filepath):
                    response = requests.get(p.geturl(), stream=True)
                    with open(filepath, 'wb') as out_file:
                        shutil.copyfileobj(response.raw, out_file)
                    del response

            data.append(record)

            # follow internal links only
            if href and href != '/':
                p = to_absolute(href, host)
                if p and p.netloc == main.netloc:
                    queue.insert(0, p)

        for link in queue:
            queue.remove(link)
            scrape(link, depth=depth + 1)

    scrape(main)

    with open(os.path.join('results', main.netloc, 'texts.csv'), 'w') as f:
        w = DictWriter(f, fieldnames=['page', 'tag', 'text', 'link', 'image'])

        w.writeheader()
        w.writerows(data)
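With the files above in place, a typical run would be python index.py test_websites, optionally with --depth N to limit recursion and --no-image to skip downloads. Note that the script calls os.mkdir rather than os.makedirs, so the top-level results directory must already exist; output then lands in results/<host>/texts.csv and results/<host>/images/.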
requirements (new file, 6 lines)
@@ -0,0 +1,6 @@
appdirs==1.4.3
beautifulsoup4==4.5.3
packaging==16.8
pyparsing==2.2.0
requests==2.13.0
six==1.10.0
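The pinned dependencies install with pip install -r requirements (the file carries no .txt extension); beautifulsoup4 provides the bs4 import used by index.py.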
test_websites (new file, 2 lines)
@@ -0,0 +1,2 @@
https://www.theguardian.com/international
https://theread.me