From b0273c0cc32058c77a86443980293192200e6d68 Mon Sep 17 00:00:00 2001
From: Mahdi Dibaiee <mdibaiee@aol.com>
Date: Wed, 19 Apr 2017 22:14:54 +0430
Subject: [PATCH] fix: log the page being scraped; feat: add --delay option

---
 index.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/index.py b/index.py
index bbc3fcf..07c5b59 100644
--- a/index.py
+++ b/index.py
@@ -7,6 +7,7 @@ import argparse
 from urllib.parse import urlparse, urljoin
 from csv import DictWriter
 from helpers import to_absolute
+import time
 
 tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'li', 'span', 'a', 'img']
 
@@ -14,6 +15,7 @@ parser = argparse.ArgumentParser(description='Web scraper')
 parser.add_argument('file', help='path to file containing target websites, one line for each')
 parser.add_argument('--depth', type=int, help='how deep should the scraper follow links')
 parser.add_argument('--no-image', help='do not download images', action='store_true')
+parser.add_argument('--delay', help='delay between requests in seconds, use to avoid being treated as an attacker', type=float)
 
 args = parser.parse_args()
 
@@ -42,6 +44,8 @@ for host in sites:
 
         if t in visited: return
 
+        print(t)
+
         html = requests.get(t).text
         visited.append(t)
 
@@ -81,6 +85,8 @@ for host in sites:
 
         for link in queue:
             queue.remove(link)
+            if args.delay is not None:
+                time.sleep(args.delay)
             scrape(link, depth=depth + 1)
                         
     scrape(main)