fix: optimize memory usage
This commit is contained in:
parent
a2564fb602
commit
f67502e68c
@ -16,10 +16,12 @@ def to_absolute(url, host):
|
||||
|
||||
return p
|
||||
|
||||
def write_results(main, data):
|
||||
with open(os.path.join('results', main.netloc, 'texts.csv'), 'w') as f:
|
||||
def write_results(main, data, first=False):
|
||||
with open(os.path.join('results', main.netloc, 'texts.csv'), 'w' if first else 'a') as f:
|
||||
w = DictWriter(f, fieldnames=['page', 'tag', 'text', 'link', 'image'])
|
||||
|
||||
w.writeheader()
|
||||
if first:
|
||||
w.writeheader()
|
||||
|
||||
w.writerows(data)
|
||||
|
||||
|
4
index.py
4
index.py
@ -23,7 +23,6 @@ with open(path) as f:
|
||||
sites = [a.replace('\n', '') for a in f.readlines()]
|
||||
|
||||
for host in sites:
|
||||
data = []
|
||||
visited = []
|
||||
queue = []
|
||||
|
||||
@ -37,6 +36,7 @@ for host in sites:
|
||||
os.makedirs(images_dir)
|
||||
|
||||
def scrape(url, depth=0):
|
||||
data = []
|
||||
if args.depth is not None and depth > args.depth: return
|
||||
|
||||
t = url.geturl()
|
||||
@ -83,7 +83,7 @@ for host in sites:
|
||||
queue.insert(0, p)
|
||||
|
||||
|
||||
write_results(main, data)
|
||||
write_results(main, data, first=depth == 0)
|
||||
|
||||
for link in queue:
|
||||
queue.remove(link)
|
||||
|
Loading…
Reference in New Issue
Block a user