fix: write CSV progressively

This commit is contained in:
Mahdi Dibaiee 2017-04-20 07:50:04 +04:30
parent b0273c0cc3
commit a2564fb602
2 changed files with 12 additions and 9 deletions

View File

@ -1,4 +1,6 @@
from urllib.parse import urlparse, urlunparse, urljoin from urllib.parse import urlparse, urlunparse, urljoin
import os
from csv import DictWriter
def to_absolute(url, host): def to_absolute(url, host):
if url == '/': return host if url == '/': return host
@ -14,4 +16,10 @@ def to_absolute(url, host):
return p return p
def write_results(main, data):
    """Write the scraped rows to ``results/<main.netloc>/texts.csv``.

    The file is rewritten from scratch on every call (mode ``'w'``), so the
    caller can invoke this repeatedly with a growing ``data`` collection to
    keep the file on disk up to date.

    Args:
        main: Object exposing a ``netloc`` attribute (e.g. the result of
            ``urllib.parse.urlparse``); it names the per-site output folder.
        data: Iterable of dicts whose keys are a subset of
            ``page``, ``tag``, ``text``, ``link``, ``image``.

    Raises:
        ValueError: If a row contains a key that is not one of the field
            names (``DictWriter`` default ``extrasaction='raise'``).
        OSError: If the output directory or file cannot be created/written.
    """
    out_dir = os.path.join('results', main.netloc)
    # Make the helper usable on its own; harmless when the caller has
    # already created the directory.
    os.makedirs(out_dir, exist_ok=True)

    # newline='' is required by the csv module: without it, the writer's
    # '\r\n' row terminator is translated again on Windows ('\r\r\n').
    # An explicit UTF-8 encoding keeps non-ASCII scraped text from failing
    # under a non-UTF-8 locale default.
    with open(os.path.join(out_dir, 'texts.csv'), 'w',
              newline='', encoding='utf-8') as f:
        w = DictWriter(f, fieldnames=['page', 'tag', 'text', 'link', 'image'])
        w.writeheader()
        w.writerows(data)

View File

@ -5,8 +5,7 @@ import os
import shutil import shutil
import argparse import argparse
from urllib.parse import urlparse, urljoin from urllib.parse import urlparse, urljoin
from csv import DictWriter from helpers import to_absolute, write_results
from helpers import to_absolute
import time import time
tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'li', 'span', 'a', 'img'] tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'li', 'span', 'a', 'img']
@ -83,6 +82,9 @@ for host in sites:
if p and p.netloc == main.netloc: if p and p.netloc == main.netloc:
queue.insert(0, p) queue.insert(0, p)
write_results(main, data)
for link in queue: for link in queue:
queue.remove(link) queue.remove(link)
if args.delay is not None: if args.delay is not None:
@ -90,10 +92,3 @@ for host in sites:
scrape(link, depth=depth + 1) scrape(link, depth=depth + 1)
scrape(main) scrape(main)
with open(os.path.join('results', main.netloc, 'texts.csv'), 'w') as f:
w = DictWriter(f, fieldnames=['page', 'tag', 'text', 'link', 'image'])
w.writeheader()
w.writerows(data)