fix: optimize memory usage

Mahdi Dibaiee 2017-04-20 10:05:07 +04:30
parent a2564fb602
commit f67502e68c
2 changed files with 7 additions and 5 deletions

View File

@@ -16,10 +16,12 @@ def to_absolute(url, host):
     return p

-def write_results(main, data):
-    with open(os.path.join('results', main.netloc, 'texts.csv'), 'w') as f:
+def write_results(main, data, first=False):
+    with open(os.path.join('results', main.netloc, 'texts.csv'), 'w' if first else 'a') as f:
         w = DictWriter(f, fieldnames=['page', 'tag', 'text', 'link', 'image'])
-        w.writeheader()
+        if first:
+            w.writeheader()
         w.writerows(data)
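Viewed on its own, the updated helper switches from rewrite-everything to an append pattern: only the first call truncates the CSV and emits the header, and every later call appends rows. A minimal self-contained sketch of that pattern, assuming the os/csv imports the file already uses; the urlparse call in the usage comment is illustrative only:

import os
from csv import DictWriter
from urllib.parse import urlparse

def write_results(main, data, first=False):
    # 'w' truncates and starts a fresh file for the first batch;
    # 'a' appends later batches, so earlier rows stay on disk
    # instead of in a growing in-memory list.
    with open(os.path.join('results', main.netloc, 'texts.csv'),
              'w' if first else 'a') as f:
        w = DictWriter(f, fieldnames=['page', 'tag', 'text', 'link', 'image'])
        if first:
            w.writeheader()  # header exactly once, at the top of the file
        w.writerows(data)

# Illustrative usage (assumes results/example.com/ exists):
# write_results(urlparse('https://example.com'), rows1, first=True)
# write_results(urlparse('https://example.com'), rows2)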

View File

@@ -23,7 +23,6 @@ with open(path) as f:
     sites = [a.replace('\n', '') for a in f.readlines()]

 for host in sites:
-    data = []
     visited = []
     queue = []
@@ -37,6 +36,7 @@ for host in sites:
         os.makedirs(images_dir)

     def scrape(url, depth=0):
+        data = []
         if args.depth is not None and depth > args.depth: return

         t = url.geturl()
@@ -83,7 +83,7 @@ for host in sites:
            queue.insert(0, p)

-        write_results(main, data)
+        write_results(main, data, first=depth == 0)

     for link in queue:
         queue.remove(link)
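The two sides of the change meet here: data used to be one per-host list that grew for the entire crawl and was written once at the end; now each scrape() call buffers only its own page's rows and flushes them immediately, with first=depth == 0 so that only the entry page truncates the file and writes the header. A hedged sketch of the resulting call pattern; fetching and queue handling are elided, and main is the host's parsed URL from the enclosing loop:

def scrape(url, depth=0):
    data = []  # per-call buffer: memory bounded by one page, not a whole site
    # ... fetch url, append this page's row dicts to data, queue new links ...
    # depth == 0 only for the host's entry page, so only the first write
    # truncates texts.csv and emits the header; every later page appends.
    write_results(main, data, first=depth == 0)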