From f67502e68ce76d1cc2bad23f30101c47413da7ac Mon Sep 17 00:00:00 2001 From: Mahdi Dibaiee Date: Thu, 20 Apr 2017 10:05:07 +0430 Subject: [PATCH] fix: optimize memory usage --- helpers.py | 8 +++++--- index.py | 4 ++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/helpers.py b/helpers.py index c2bfe9d..14927b1 100644 --- a/helpers.py +++ b/helpers.py @@ -16,10 +16,12 @@ def to_absolute(url, host): return p -def write_results(main, data): - with open(os.path.join('results', main.netloc, 'texts.csv'), 'w') as f: +def write_results(main, data, first=False): + with open(os.path.join('results', main.netloc, 'texts.csv'), 'w' if first else 'a') as f: w = DictWriter(f, fieldnames=['page', 'tag', 'text', 'link', 'image']) - w.writeheader() + if first: + w.writeheader() + w.writerows(data) diff --git a/index.py b/index.py index cfa9ada..788b4de 100644 --- a/index.py +++ b/index.py @@ -23,7 +23,6 @@ with open(path) as f: sites = [a.replace('\n', '') for a in f.readlines()] for host in sites: - data = [] visited = [] queue = [] @@ -37,6 +36,7 @@ for host in sites: os.makedirs(images_dir) def scrape(url, depth=0): + data = [] if args.depth is not None and depth > args.depth: return t = url.geturl() @@ -83,7 +83,7 @@ for host in sites: queue.insert(0, p) - write_results(main, data) + write_results(main, data, first=depth == 0) for link in queue: queue.remove(link)