fix: optimize memory usage
This commit is contained in:
parent
a2564fb602
commit
f67502e68c
@ -16,10 +16,12 @@ def to_absolute(url, host):
|
|||||||
|
|
||||||
return p
|
return p
|
||||||
|
|
||||||
def write_results(main, data):
|
def write_results(main, data, first=False):
|
||||||
with open(os.path.join('results', main.netloc, 'texts.csv'), 'w') as f:
|
with open(os.path.join('results', main.netloc, 'texts.csv'), 'w' if first else 'a') as f:
|
||||||
w = DictWriter(f, fieldnames=['page', 'tag', 'text', 'link', 'image'])
|
w = DictWriter(f, fieldnames=['page', 'tag', 'text', 'link', 'image'])
|
||||||
|
|
||||||
w.writeheader()
|
if first:
|
||||||
|
w.writeheader()
|
||||||
|
|
||||||
w.writerows(data)
|
w.writerows(data)
|
||||||
|
|
||||||
|
4
index.py
4
index.py
@ -23,7 +23,6 @@ with open(path) as f:
|
|||||||
sites = [a.replace('\n', '') for a in f.readlines()]
|
sites = [a.replace('\n', '') for a in f.readlines()]
|
||||||
|
|
||||||
for host in sites:
|
for host in sites:
|
||||||
data = []
|
|
||||||
visited = []
|
visited = []
|
||||||
queue = []
|
queue = []
|
||||||
|
|
||||||
@ -37,6 +36,7 @@ for host in sites:
|
|||||||
os.makedirs(images_dir)
|
os.makedirs(images_dir)
|
||||||
|
|
||||||
def scrape(url, depth=0):
|
def scrape(url, depth=0):
|
||||||
|
data = []
|
||||||
if args.depth is not None and depth > args.depth: return
|
if args.depth is not None and depth > args.depth: return
|
||||||
|
|
||||||
t = url.geturl()
|
t = url.geturl()
|
||||||
@ -83,7 +83,7 @@ for host in sites:
|
|||||||
queue.insert(0, p)
|
queue.insert(0, p)
|
||||||
|
|
||||||
|
|
||||||
write_results(main, data)
|
write_results(main, data, first=depth == 0)
|
||||||
|
|
||||||
for link in queue:
|
for link in queue:
|
||||||
queue.remove(link)
|
queue.remove(link)
|
||||||
|
Loading…
Reference in New Issue
Block a user