web-scraper/index.py

from bs4 import BeautifulSoup
import requests
import os
import shutil
import argparse
from urllib.parse import urlparse
from helpers import to_absolute, write_results
import time
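
# helpers.py is not shown in this view. Judging from how its two functions are
# used below, a minimal sketch could look like this (hypothetical, not the
# actual implementation):
#
#   def to_absolute(link, host):
#       """Resolve a possibly relative link against the host URL; return a
#       ParseResult, or None for links that cannot be followed."""
#       resolved = urlparse(urljoin(host, link))
#       return resolved if resolved.scheme in ('http', 'https') else None
#
#   def write_results(main, data):
#       """Dump the records scraped for one host under results/<host>/."""
#       with open(os.path.join('results', main.netloc, 'data.json'), 'w') as f:
#           json.dump(data, f, indent=2, default=str)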

# Tags whose contents (text, links, image sources) get recorded.
tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'li', 'span', 'a', 'img']

parser = argparse.ArgumentParser(description='Web scraper')
parser.add_argument('file', help='path to a file listing target websites, one per line')
parser.add_argument('--depth', type=int, help='how many levels of links to follow')
parser.add_argument('--no-image', help='do not download images', action='store_true')
parser.add_argument('--delay', type=float, help='delay between requests in seconds, so the scraper is not mistaken for an attack')
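
# Example invocation (the sites file name and flag values are illustrative):
#   python index.py sites.txt --depth 2 --delay 0.5
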
args = parser.parse_args()
path = os.path.join(os.path.dirname(__file__), args.file)
with open(path) as f:
    sites = [line.rstrip('\n') for line in f]

for host in sites:
    data = []     # records scraped from this host
    visited = []  # URLs already fetched
    queue = []    # same-host URLs discovered but not yet fetched
    main = urlparse(host)
    base_dir = os.path.join('results', main.netloc)
    images_dir = os.path.join(base_dir, 'images')
    os.makedirs(base_dir, exist_ok=True)
    os.makedirs(images_dir, exist_ok=True)

    def scrape(url, depth=0):
        if args.depth is not None and depth > args.depth:
            return
        t = url.geturl()
        if t in visited:
            return
        print(t)
        visited.append(t)
        # Catch network errors so one bad page does not abort the whole run.
        try:
            html = requests.get(t).text
        except requests.RequestException as e:
            print('failed to fetch {}: {}'.format(t, e))
            return
        soup = BeautifulSoup(html, 'html.parser')
        elements = soup.find_all(tags)
        for el in elements:
            href = el.get('href')
            # Skip elements that carry no text, link, or image to record.
            if not href and not el.string and el.name != 'img':
                continue
            record = {
                'page': url.path,
                'tag': el.name,
                'text': el.string,
                'link': href,
                # Attributes must be read with .get(); el.src would look up
                # a child tag named 'src' and always come back None.
                'image': el.get('src') if el.name == 'img' else None,
            }
            if not args.no_image and el.name == 'img' and el.get('src'):
                p = to_absolute(el.get('src'), host)
                filepath = os.path.join(images_dir, os.path.basename(p.path))
                if not os.path.exists(filepath):
                    # Stream the image straight to disk rather than
                    # buffering the whole file in memory.
                    with requests.get(p.geturl(), stream=True) as response:
                        with open(filepath, 'wb') as out_file:
                            shutil.copyfileobj(response.raw, out_file)
            data.append(record)
            if href and href != '/':
                p = to_absolute(href, host)
                # Only queue links that stay on the same host.
                if p and p.netloc == main.netloc:
                    queue.insert(0, p)

        # Persist everything collected for this host so far.
        write_results(main, data)

        # Drain the queue with pop(); removing items from a list while
        # iterating over it mutates it mid-loop and skips entries.
        while queue:
            link = queue.pop(0)
            if args.delay is not None:
                time.sleep(args.delay)
            scrape(link, depth=depth + 1)

    scrape(main)
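
# Output layout: records for each host are written by write_results (see
# helpers) under results/<host>/, and downloaded images land in
# results/<host>/images/.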